diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9957 @@ +{ + "best_metric": 0.833830714225769, + "best_model_checkpoint": "/home/nlplab12/Desktop/roberta-pretrain/ckpt/roberta/pretrain/medium/512/checkpoint-12420", + "epoch": 10.559424425487993, + "eval_steps": 90, + "global_step": 12420, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008500045535958229, + "grad_norm": 514775.65625, + "learning_rate": 4.166666666666667e-06, + "loss": 7.2556, + "step": 10 + }, + { + "epoch": 0.017000091071916457, + "grad_norm": 399649.5, + "learning_rate": 8.333333333333334e-06, + "loss": 6.7984, + "step": 20 + }, + { + "epoch": 0.025500136607874684, + "grad_norm": 137177.0, + "learning_rate": 1.25e-05, + "loss": 6.2165, + "step": 30 + }, + { + "epoch": 0.034000182143832915, + "grad_norm": 151481.765625, + "learning_rate": 1.6666666666666667e-05, + "loss": 5.9275, + "step": 40 + }, + { + "epoch": 0.04250022767979114, + "grad_norm": 92555.8359375, + "learning_rate": 2.0833333333333333e-05, + "loss": 5.7498, + "step": 50 + }, + { + "epoch": 0.05100027321574937, + "grad_norm": 86679.46875, + "learning_rate": 2.5e-05, + "loss": 5.6096, + "step": 60 + }, + { + "epoch": 0.059500318751707595, + "grad_norm": 66003.234375, + "learning_rate": 2.9166666666666666e-05, + "loss": 5.4786, + "step": 70 + }, + { + "epoch": 0.06800036428766583, + "grad_norm": 57082.796875, + "learning_rate": 3.3333333333333335e-05, + "loss": 5.3565, + "step": 80 + }, + { + "epoch": 0.07650040982362405, + "grad_norm": 46905.7734375, + "learning_rate": 3.75e-05, + "loss": 5.2382, + "step": 90 + }, + { + "epoch": 0.07650040982362405, + "eval_accuracy": 0.2240983525108359, + "eval_loss": 5.1665496826171875, + "eval_runtime": 1330.3439, + "eval_samples_per_second": 375.375, + "eval_steps_per_second": 1.303, + "step": 90 + }, + { + "epoch": 0.08500045535958228, + "grad_norm": 36661.265625, + "learning_rate": 4.1666666666666665e-05, + "loss": 5.1301, + "step": 100 + }, + { + "epoch": 0.09350050089554052, + "grad_norm": 28157.064453125, + "learning_rate": 4.5833333333333334e-05, + "loss": 5.0237, + "step": 110 + }, + { + "epoch": 0.10200054643149874, + "grad_norm": 31035.537109375, + "learning_rate": 5e-05, + "loss": 4.9291, + "step": 120 + }, + { + "epoch": 0.11050059196745697, + "grad_norm": 41497.1640625, + "learning_rate": 5.416666666666667e-05, + "loss": 4.8501, + "step": 130 + }, + { + "epoch": 0.11900063750341519, + "grad_norm": 44447.87890625, + "learning_rate": 5.833333333333333e-05, + "loss": 4.7938, + "step": 140 + }, + { + "epoch": 0.12750068303937342, + "grad_norm": 36388.1015625, + "learning_rate": 6.25e-05, + "loss": 4.7491, + "step": 150 + }, + { + "epoch": 0.13600072857533166, + "grad_norm": 24360.1875, + "learning_rate": 6.666666666666667e-05, + "loss": 4.7147, + "step": 160 + }, + { + "epoch": 0.1445007741112899, + "grad_norm": 19699.2734375, + "learning_rate": 7.083333333333334e-05, + "loss": 4.6829, + "step": 170 + }, + { + "epoch": 0.1530008196472481, + "grad_norm": 11622.908203125, + "learning_rate": 7.5e-05, + "loss": 4.6545, + "step": 180 + }, + { + "epoch": 0.1530008196472481, + "eval_accuracy": 0.284967956348924, + "eval_loss": 4.628236293792725, + "eval_runtime": 1276.531, + "eval_samples_per_second": 391.199, + "eval_steps_per_second": 1.358, + "step": 180 + }, + { + "epoch": 0.16150086518320633, + "grad_norm": 16312.888671875, + "learning_rate": 7.916666666666666e-05, + "loss": 4.6377, + "step": 190 + }, + { + "epoch": 0.17000091071916457, + "grad_norm": 37823.2890625, + "learning_rate": 8.333333333333333e-05, + "loss": 4.6244, + "step": 200 + }, + { + "epoch": 0.1785009562551228, + "grad_norm": 18776.1796875, + "learning_rate": 8.75e-05, + "loss": 4.6141, + "step": 210 + }, + { + "epoch": 0.18700100179108103, + "grad_norm": 34368.3359375, + "learning_rate": 9.166666666666667e-05, + "loss": 4.6043, + "step": 220 + }, + { + "epoch": 0.19550104732703924, + "grad_norm": 38324.95703125, + "learning_rate": 9.583333333333334e-05, + "loss": 4.5989, + "step": 230 + }, + { + "epoch": 0.20400109286299747, + "grad_norm": 35916.953125, + "learning_rate": 0.0001, + "loss": 4.5924, + "step": 240 + }, + { + "epoch": 0.2125011383989557, + "grad_norm": 34779.6328125, + "learning_rate": 0.00010416666666666667, + "loss": 4.5855, + "step": 250 + }, + { + "epoch": 0.22100118393491394, + "grad_norm": 38605.4921875, + "learning_rate": 0.00010833333333333334, + "loss": 4.5771, + "step": 260 + }, + { + "epoch": 0.22950122947087218, + "grad_norm": 17166.369140625, + "learning_rate": 0.00011250000000000001, + "loss": 4.5704, + "step": 270 + }, + { + "epoch": 0.22950122947087218, + "eval_accuracy": 0.2895338322091062, + "eval_loss": 4.5623779296875, + "eval_runtime": 1273.631, + "eval_samples_per_second": 392.09, + "eval_steps_per_second": 1.361, + "step": 270 + }, + { + "epoch": 0.23800127500683038, + "grad_norm": 42357.046875, + "learning_rate": 0.00011666666666666667, + "loss": 4.5682, + "step": 280 + }, + { + "epoch": 0.24650132054278862, + "grad_norm": 20787.40625, + "learning_rate": 0.00012083333333333333, + "loss": 4.5646, + "step": 290 + }, + { + "epoch": 0.25500136607874685, + "grad_norm": 49082.19921875, + "learning_rate": 0.000125, + "loss": 4.559, + "step": 300 + }, + { + "epoch": 0.2635014116147051, + "grad_norm": 18796.533203125, + "learning_rate": 0.00012916666666666667, + "loss": 4.5534, + "step": 310 + }, + { + "epoch": 0.2720014571506633, + "grad_norm": 45862.125, + "learning_rate": 0.00013333333333333334, + "loss": 4.5513, + "step": 320 + }, + { + "epoch": 0.28050150268662155, + "grad_norm": 27371.748046875, + "learning_rate": 0.0001375, + "loss": 4.5489, + "step": 330 + }, + { + "epoch": 0.2890015482225798, + "grad_norm": 26154.533203125, + "learning_rate": 0.00014166666666666668, + "loss": 4.5471, + "step": 340 + }, + { + "epoch": 0.297501593758538, + "grad_norm": 46019.3671875, + "learning_rate": 0.00014583333333333335, + "loss": 4.5453, + "step": 350 + }, + { + "epoch": 0.3060016392944962, + "grad_norm": 54414.8828125, + "learning_rate": 0.00015, + "loss": 4.5435, + "step": 360 + }, + { + "epoch": 0.3060016392944962, + "eval_accuracy": 0.28996644531895915, + "eval_loss": 4.538370609283447, + "eval_runtime": 1274.2161, + "eval_samples_per_second": 391.91, + "eval_steps_per_second": 1.361, + "step": 360 + }, + { + "epoch": 0.31450168483045443, + "grad_norm": 61712.6640625, + "learning_rate": 0.00015416666666666668, + "loss": 4.5387, + "step": 370 + }, + { + "epoch": 0.32300173036641266, + "grad_norm": 35200.015625, + "learning_rate": 0.00015833333333333332, + "loss": 4.537, + "step": 380 + }, + { + "epoch": 0.3315017759023709, + "grad_norm": 40758.90234375, + "learning_rate": 0.00016250000000000002, + "loss": 4.5345, + "step": 390 + }, + { + "epoch": 0.34000182143832913, + "grad_norm": 38277.3515625, + "learning_rate": 0.00016666666666666666, + "loss": 4.5351, + "step": 400 + }, + { + "epoch": 0.34850186697428737, + "grad_norm": 28720.861328125, + "learning_rate": 0.00017083333333333333, + "loss": 4.534, + "step": 410 + }, + { + "epoch": 0.3570019125102456, + "grad_norm": 21332.2109375, + "learning_rate": 0.000175, + "loss": 4.5329, + "step": 420 + }, + { + "epoch": 0.36550195804620383, + "grad_norm": 46977.23828125, + "learning_rate": 0.00017916666666666667, + "loss": 4.5282, + "step": 430 + }, + { + "epoch": 0.37400200358216207, + "grad_norm": 54403.328125, + "learning_rate": 0.00018333333333333334, + "loss": 4.5273, + "step": 440 + }, + { + "epoch": 0.3825020491181203, + "grad_norm": 33038.54296875, + "learning_rate": 0.0001875, + "loss": 4.5272, + "step": 450 + }, + { + "epoch": 0.3825020491181203, + "eval_accuracy": 0.2900167539077763, + "eval_loss": 4.523510932922363, + "eval_runtime": 1270.6276, + "eval_samples_per_second": 393.017, + "eval_steps_per_second": 1.365, + "step": 450 + }, + { + "epoch": 0.3910020946540785, + "grad_norm": 38754.59765625, + "learning_rate": 0.00019166666666666667, + "loss": 4.5216, + "step": 460 + }, + { + "epoch": 0.3995021401900367, + "grad_norm": 36281.0078125, + "learning_rate": 0.00019583333333333334, + "loss": 4.5237, + "step": 470 + }, + { + "epoch": 0.40800218572599495, + "grad_norm": 35634.79296875, + "learning_rate": 0.0002, + "loss": 4.5186, + "step": 480 + }, + { + "epoch": 0.4165022312619532, + "grad_norm": 15257.716796875, + "learning_rate": 0.00020416666666666668, + "loss": 4.5173, + "step": 490 + }, + { + "epoch": 0.4250022767979114, + "grad_norm": 36190.26171875, + "learning_rate": 0.00020833333333333335, + "loss": 4.5174, + "step": 500 + }, + { + "epoch": 0.43350232233386965, + "grad_norm": 57666.21875, + "learning_rate": 0.0002125, + "loss": 4.5144, + "step": 510 + }, + { + "epoch": 0.4420023678698279, + "grad_norm": 33545.0078125, + "learning_rate": 0.00021666666666666668, + "loss": 4.5131, + "step": 520 + }, + { + "epoch": 0.4505024134057861, + "grad_norm": 75054.1328125, + "learning_rate": 0.00022083333333333333, + "loss": 4.5116, + "step": 530 + }, + { + "epoch": 0.45900245894174435, + "grad_norm": 27162.548828125, + "learning_rate": 0.00022500000000000002, + "loss": 4.5104, + "step": 540 + }, + { + "epoch": 0.45900245894174435, + "eval_accuracy": 0.2903866574093662, + "eval_loss": 4.506768703460693, + "eval_runtime": 1274.9615, + "eval_samples_per_second": 391.681, + "eval_steps_per_second": 1.36, + "step": 540 + }, + { + "epoch": 0.4675025044777026, + "grad_norm": 31226.064453125, + "learning_rate": 0.00022916666666666666, + "loss": 4.5064, + "step": 550 + }, + { + "epoch": 0.47600255001366076, + "grad_norm": 34113.46484375, + "learning_rate": 0.00023333333333333333, + "loss": 4.5085, + "step": 560 + }, + { + "epoch": 0.484502595549619, + "grad_norm": 26731.498046875, + "learning_rate": 0.0002375, + "loss": 4.5079, + "step": 570 + }, + { + "epoch": 0.49300264108557723, + "grad_norm": 18876.388671875, + "learning_rate": 0.00024166666666666667, + "loss": 4.5041, + "step": 580 + }, + { + "epoch": 0.5015026866215355, + "grad_norm": 30968.685546875, + "learning_rate": 0.0002458333333333333, + "loss": 4.5044, + "step": 590 + }, + { + "epoch": 0.5100027321574937, + "grad_norm": 33946.8125, + "learning_rate": 0.00025, + "loss": 4.5036, + "step": 600 + }, + { + "epoch": 0.5185027776934519, + "grad_norm": 27673.900390625, + "learning_rate": 0.00025416666666666665, + "loss": 4.5007, + "step": 610 + }, + { + "epoch": 0.5270028232294102, + "grad_norm": 17412.22265625, + "learning_rate": 0.00025833333333333334, + "loss": 4.5003, + "step": 620 + }, + { + "epoch": 0.5355028687653683, + "grad_norm": 38210.4140625, + "learning_rate": 0.00026250000000000004, + "loss": 4.4977, + "step": 630 + }, + { + "epoch": 0.5355028687653683, + "eval_accuracy": 0.2904019042861853, + "eval_loss": 4.49587869644165, + "eval_runtime": 1274.4439, + "eval_samples_per_second": 391.84, + "eval_steps_per_second": 1.361, + "step": 630 + }, + { + "epoch": 0.5440029143013266, + "grad_norm": 22160.921875, + "learning_rate": 0.0002666666666666667, + "loss": 4.498, + "step": 640 + }, + { + "epoch": 0.5525029598372848, + "grad_norm": 40694.95703125, + "learning_rate": 0.0002708333333333333, + "loss": 4.4947, + "step": 650 + }, + { + "epoch": 0.5610030053732431, + "grad_norm": 22046.97265625, + "learning_rate": 0.000275, + "loss": 4.495, + "step": 660 + }, + { + "epoch": 0.5695030509092013, + "grad_norm": 45871.296875, + "learning_rate": 0.00027916666666666666, + "loss": 4.4946, + "step": 670 + }, + { + "epoch": 0.5780030964451596, + "grad_norm": 32230.837890625, + "learning_rate": 0.00028333333333333335, + "loss": 4.4941, + "step": 680 + }, + { + "epoch": 0.5865031419811177, + "grad_norm": 31339.076171875, + "learning_rate": 0.0002875, + "loss": 4.4921, + "step": 690 + }, + { + "epoch": 0.595003187517076, + "grad_norm": 21844.26171875, + "learning_rate": 0.0002916666666666667, + "loss": 4.4912, + "step": 700 + }, + { + "epoch": 0.6035032330530342, + "grad_norm": 20966.12109375, + "learning_rate": 0.00029583333333333333, + "loss": 4.489, + "step": 710 + }, + { + "epoch": 0.6120032785889924, + "grad_norm": 19211.65625, + "learning_rate": 0.0003, + "loss": 4.4903, + "step": 720 + }, + { + "epoch": 0.6120032785889924, + "eval_accuracy": 0.2904662633238704, + "eval_loss": 4.487661361694336, + "eval_runtime": 1273.4031, + "eval_samples_per_second": 392.16, + "eval_steps_per_second": 1.362, + "step": 720 + }, + { + "epoch": 0.6205033241249507, + "grad_norm": 47674.4140625, + "learning_rate": 0.00030416666666666667, + "loss": 4.4886, + "step": 730 + }, + { + "epoch": 0.6290033696609089, + "grad_norm": 44354.7890625, + "learning_rate": 0.00030833333333333337, + "loss": 4.4865, + "step": 740 + }, + { + "epoch": 0.6375034151968672, + "grad_norm": 27939.216796875, + "learning_rate": 0.0003125, + "loss": 4.4868, + "step": 750 + }, + { + "epoch": 0.6460034607328253, + "grad_norm": 29380.662109375, + "learning_rate": 0.00031666666666666665, + "loss": 4.4858, + "step": 760 + }, + { + "epoch": 0.6545035062687836, + "grad_norm": 38158.73046875, + "learning_rate": 0.00032083333333333334, + "loss": 4.4834, + "step": 770 + }, + { + "epoch": 0.6630035518047418, + "grad_norm": 40741.23828125, + "learning_rate": 0.00032500000000000004, + "loss": 4.4826, + "step": 780 + }, + { + "epoch": 0.6715035973407001, + "grad_norm": 25295.03515625, + "learning_rate": 0.0003291666666666667, + "loss": 4.4845, + "step": 790 + }, + { + "epoch": 0.6800036428766583, + "grad_norm": 49988.22265625, + "learning_rate": 0.0003333333333333333, + "loss": 4.4814, + "step": 800 + }, + { + "epoch": 0.6885036884126164, + "grad_norm": 32892.44921875, + "learning_rate": 0.0003375, + "loss": 4.4826, + "step": 810 + }, + { + "epoch": 0.6885036884126164, + "eval_accuracy": 0.29046815360567907, + "eval_loss": 4.479104518890381, + "eval_runtime": 1276.8746, + "eval_samples_per_second": 391.094, + "eval_steps_per_second": 1.358, + "step": 810 + }, + { + "epoch": 0.6970037339485747, + "grad_norm": 32854.1015625, + "learning_rate": 0.00034166666666666666, + "loss": 4.4804, + "step": 820 + }, + { + "epoch": 0.7055037794845329, + "grad_norm": 31755.115234375, + "learning_rate": 0.00034583333333333335, + "loss": 4.4803, + "step": 830 + }, + { + "epoch": 0.7140038250204912, + "grad_norm": 26255.55078125, + "learning_rate": 0.00035, + "loss": 4.4767, + "step": 840 + }, + { + "epoch": 0.7225038705564494, + "grad_norm": 23372.48046875, + "learning_rate": 0.0003541666666666667, + "loss": 4.477, + "step": 850 + }, + { + "epoch": 0.7310039160924077, + "grad_norm": 37866.93359375, + "learning_rate": 0.00035833333333333333, + "loss": 4.4765, + "step": 860 + }, + { + "epoch": 0.7395039616283658, + "grad_norm": 26253.822265625, + "learning_rate": 0.0003625, + "loss": 4.4738, + "step": 870 + }, + { + "epoch": 0.7480040071643241, + "grad_norm": 25544.201171875, + "learning_rate": 0.00036666666666666667, + "loss": 4.4745, + "step": 880 + }, + { + "epoch": 0.7565040527002823, + "grad_norm": 25811.166015625, + "learning_rate": 0.00037083333333333337, + "loss": 4.4734, + "step": 890 + }, + { + "epoch": 0.7650040982362406, + "grad_norm": 27087.75390625, + "learning_rate": 0.000375, + "loss": 4.4737, + "step": 900 + }, + { + "epoch": 0.7650040982362406, + "eval_accuracy": 0.2906699089310573, + "eval_loss": 4.468409538269043, + "eval_runtime": 1272.8084, + "eval_samples_per_second": 392.343, + "eval_steps_per_second": 1.362, + "step": 900 + }, + { + "epoch": 0.7735041437721988, + "grad_norm": 34065.42578125, + "learning_rate": 0.00037916666666666665, + "loss": 4.4712, + "step": 910 + }, + { + "epoch": 0.782004189308157, + "grad_norm": 20391.37109375, + "learning_rate": 0.00038333333333333334, + "loss": 4.4627, + "step": 920 + }, + { + "epoch": 0.7905042348441152, + "grad_norm": 22009.44140625, + "learning_rate": 0.00038750000000000004, + "loss": 4.4501, + "step": 930 + }, + { + "epoch": 0.7990042803800734, + "grad_norm": 37206.39453125, + "learning_rate": 0.0003916666666666667, + "loss": 4.4388, + "step": 940 + }, + { + "epoch": 0.8075043259160317, + "grad_norm": 51573.41015625, + "learning_rate": 0.0003958333333333333, + "loss": 4.4256, + "step": 950 + }, + { + "epoch": 0.8160043714519899, + "grad_norm": 47200.921875, + "learning_rate": 0.0004, + "loss": 4.4164, + "step": 960 + }, + { + "epoch": 0.8245044169879482, + "grad_norm": 34512.5234375, + "learning_rate": 0.00040416666666666666, + "loss": 4.4093, + "step": 970 + }, + { + "epoch": 0.8330044625239064, + "grad_norm": 38512.90234375, + "learning_rate": 0.00040833333333333336, + "loss": 4.4011, + "step": 980 + }, + { + "epoch": 0.8415045080598647, + "grad_norm": 44749.2109375, + "learning_rate": 0.0004125, + "loss": 4.3852, + "step": 990 + }, + { + "epoch": 0.8415045080598647, + "eval_accuracy": 0.2906419834101783, + "eval_loss": 4.365363597869873, + "eval_runtime": 1274.1773, + "eval_samples_per_second": 391.922, + "eval_steps_per_second": 1.361, + "step": 990 + }, + { + "epoch": 0.8500045535958228, + "grad_norm": 43667.578125, + "learning_rate": 0.0004166666666666667, + "loss": 4.3701, + "step": 1000 + }, + { + "epoch": 0.858504599131781, + "grad_norm": 55209.21875, + "learning_rate": 0.00042083333333333333, + "loss": 4.3609, + "step": 1010 + }, + { + "epoch": 0.8670046446677393, + "grad_norm": 60726.68359375, + "learning_rate": 0.000425, + "loss": 4.3476, + "step": 1020 + }, + { + "epoch": 0.8755046902036975, + "grad_norm": 49279.09765625, + "learning_rate": 0.00042916666666666667, + "loss": 4.3382, + "step": 1030 + }, + { + "epoch": 0.8840047357396558, + "grad_norm": 44572.08203125, + "learning_rate": 0.00043333333333333337, + "loss": 4.3271, + "step": 1040 + }, + { + "epoch": 0.892504781275614, + "grad_norm": 70982.9140625, + "learning_rate": 0.0004375, + "loss": 4.3161, + "step": 1050 + }, + { + "epoch": 0.9010048268115722, + "grad_norm": 49678.1796875, + "learning_rate": 0.00044166666666666665, + "loss": 4.3032, + "step": 1060 + }, + { + "epoch": 0.9095048723475304, + "grad_norm": 42499.19140625, + "learning_rate": 0.00044583333333333335, + "loss": 4.2959, + "step": 1070 + }, + { + "epoch": 0.9180049178834887, + "grad_norm": 61131.90625, + "learning_rate": 0.00045000000000000004, + "loss": 4.2901, + "step": 1080 + }, + { + "epoch": 0.9180049178834887, + "eval_accuracy": 0.2915224046693052, + "eval_loss": 4.254197597503662, + "eval_runtime": 1275.3488, + "eval_samples_per_second": 391.562, + "eval_steps_per_second": 1.36, + "step": 1080 + }, + { + "epoch": 0.9265049634194469, + "grad_norm": 61474.8046875, + "learning_rate": 0.0004541666666666667, + "loss": 4.2782, + "step": 1090 + }, + { + "epoch": 0.9350050089554052, + "grad_norm": 81287.625, + "learning_rate": 0.0004583333333333333, + "loss": 4.2578, + "step": 1100 + }, + { + "epoch": 0.9435050544913633, + "grad_norm": 71441.078125, + "learning_rate": 0.0004625, + "loss": 4.2455, + "step": 1110 + }, + { + "epoch": 0.9520051000273215, + "grad_norm": 52111.4921875, + "learning_rate": 0.00046666666666666666, + "loss": 4.237, + "step": 1120 + }, + { + "epoch": 0.9605051455632798, + "grad_norm": 79908.65625, + "learning_rate": 0.00047083333333333336, + "loss": 4.2271, + "step": 1130 + }, + { + "epoch": 0.969005191099238, + "grad_norm": 57955.83203125, + "learning_rate": 0.000475, + "loss": 4.2194, + "step": 1140 + }, + { + "epoch": 0.9775052366351963, + "grad_norm": 52858.40234375, + "learning_rate": 0.0004791666666666667, + "loss": 4.2118, + "step": 1150 + }, + { + "epoch": 0.9860052821711545, + "grad_norm": 78292.59375, + "learning_rate": 0.00048333333333333334, + "loss": 4.2114, + "step": 1160 + }, + { + "epoch": 0.9945053277071128, + "grad_norm": 69718.5859375, + "learning_rate": 0.0004875, + "loss": 4.1945, + "step": 1170 + }, + { + "epoch": 0.9945053277071128, + "eval_accuracy": 0.29390537675129147, + "eval_loss": 4.130845069885254, + "eval_runtime": 1275.9587, + "eval_samples_per_second": 391.375, + "eval_steps_per_second": 1.359, + "step": 1170 + }, + { + "epoch": 1.003005373243071, + "grad_norm": 75537.1875, + "learning_rate": 0.0004916666666666666, + "loss": 4.1849, + "step": 1180 + }, + { + "epoch": 1.011505418779029, + "grad_norm": 81501.7734375, + "learning_rate": 0.0004958333333333334, + "loss": 4.1814, + "step": 1190 + }, + { + "epoch": 1.0200054643149874, + "grad_norm": 148886.71875, + "learning_rate": 0.0005, + "loss": 4.1642, + "step": 1200 + }, + { + "epoch": 1.0285055098509457, + "grad_norm": 78217.7109375, + "learning_rate": 0.0005041666666666667, + "loss": 4.1625, + "step": 1210 + }, + { + "epoch": 1.0370055553869038, + "grad_norm": 74189.0234375, + "learning_rate": 0.0005083333333333333, + "loss": 4.141, + "step": 1220 + }, + { + "epoch": 1.045505600922862, + "grad_norm": 105369.5546875, + "learning_rate": 0.0005124999999999999, + "loss": 4.1294, + "step": 1230 + }, + { + "epoch": 1.0540056464588203, + "grad_norm": 105365.2109375, + "learning_rate": 0.0005166666666666667, + "loss": 4.123, + "step": 1240 + }, + { + "epoch": 1.0625056919947786, + "grad_norm": 81112.6328125, + "learning_rate": 0.0005208333333333334, + "loss": 4.1089, + "step": 1250 + }, + { + "epoch": 1.0710057375307367, + "grad_norm": 115335.5859375, + "learning_rate": 0.0005250000000000001, + "loss": 4.1027, + "step": 1260 + }, + { + "epoch": 1.0710057375307367, + "eval_accuracy": 0.3011365978433179, + "eval_loss": 4.004420757293701, + "eval_runtime": 1276.3722, + "eval_samples_per_second": 391.248, + "eval_steps_per_second": 1.359, + "step": 1260 + }, + { + "epoch": 1.079505783066695, + "grad_norm": 108373.8359375, + "learning_rate": 0.0005291666666666667, + "loss": 4.0735, + "step": 1270 + }, + { + "epoch": 1.0880058286026533, + "grad_norm": 112014.7890625, + "learning_rate": 0.0005333333333333334, + "loss": 4.0363, + "step": 1280 + }, + { + "epoch": 1.0965058741386116, + "grad_norm": 153617.40625, + "learning_rate": 0.0005375, + "loss": 4.0039, + "step": 1290 + }, + { + "epoch": 1.1050059196745696, + "grad_norm": 124114.390625, + "learning_rate": 0.0005416666666666666, + "loss": 3.9605, + "step": 1300 + }, + { + "epoch": 1.113505965210528, + "grad_norm": 146727.78125, + "learning_rate": 0.0005458333333333333, + "loss": 3.8962, + "step": 1310 + }, + { + "epoch": 1.1220060107464862, + "grad_norm": 178636.21875, + "learning_rate": 0.00055, + "loss": 3.8446, + "step": 1320 + }, + { + "epoch": 1.1305060562824445, + "grad_norm": 177699.4375, + "learning_rate": 0.0005541666666666667, + "loss": 3.8045, + "step": 1330 + }, + { + "epoch": 1.1390061018184026, + "grad_norm": 121470.7890625, + "learning_rate": 0.0005583333333333333, + "loss": 3.7662, + "step": 1340 + }, + { + "epoch": 1.1475061473543609, + "grad_norm": 185655.078125, + "learning_rate": 0.0005625000000000001, + "loss": 3.7529, + "step": 1350 + }, + { + "epoch": 1.1475061473543609, + "eval_accuracy": 0.34186952633889467, + "eval_loss": 3.526193380355835, + "eval_runtime": 1274.9146, + "eval_samples_per_second": 391.695, + "eval_steps_per_second": 1.36, + "step": 1350 + }, + { + "epoch": 1.1560061928903191, + "grad_norm": 194355.203125, + "learning_rate": 0.0005666666666666667, + "loss": 3.6921, + "step": 1360 + }, + { + "epoch": 1.1645062384262772, + "grad_norm": 102708.9375, + "learning_rate": 0.0005708333333333333, + "loss": 3.6505, + "step": 1370 + }, + { + "epoch": 1.1730062839622355, + "grad_norm": 153301.15625, + "learning_rate": 0.000575, + "loss": 3.6163, + "step": 1380 + }, + { + "epoch": 1.1815063294981938, + "grad_norm": 158013.796875, + "learning_rate": 0.0005791666666666667, + "loss": 3.5602, + "step": 1390 + }, + { + "epoch": 1.190006375034152, + "grad_norm": 113413.78125, + "learning_rate": 0.0005833333333333334, + "loss": 3.5185, + "step": 1400 + }, + { + "epoch": 1.1985064205701101, + "grad_norm": 157948.4375, + "learning_rate": 0.0005875, + "loss": 3.4583, + "step": 1410 + }, + { + "epoch": 1.2070064661060684, + "grad_norm": 157980.484375, + "learning_rate": 0.0005916666666666667, + "loss": 3.3973, + "step": 1420 + }, + { + "epoch": 1.2155065116420267, + "grad_norm": 100785.5234375, + "learning_rate": 0.0005958333333333333, + "loss": 3.3448, + "step": 1430 + }, + { + "epoch": 1.2240065571779848, + "grad_norm": 146644.875, + "learning_rate": 0.0006, + "loss": 3.3135, + "step": 1440 + }, + { + "epoch": 1.2240065571779848, + "eval_accuracy": 0.40612111173593607, + "eval_loss": 3.1175479888916016, + "eval_runtime": 1275.6913, + "eval_samples_per_second": 391.457, + "eval_steps_per_second": 1.359, + "step": 1440 + }, + { + "epoch": 1.232506602713943, + "grad_norm": 169930.046875, + "learning_rate": 0.0006041666666666666, + "loss": 3.31, + "step": 1450 + }, + { + "epoch": 1.2410066482499014, + "grad_norm": 166900.796875, + "learning_rate": 0.0006083333333333333, + "loss": 3.2813, + "step": 1460 + }, + { + "epoch": 1.2495066937858597, + "grad_norm": 142726.859375, + "learning_rate": 0.0006125000000000001, + "loss": 3.2236, + "step": 1470 + }, + { + "epoch": 1.2580067393218177, + "grad_norm": 113215.296875, + "learning_rate": 0.0006166666666666667, + "loss": 3.1767, + "step": 1480 + }, + { + "epoch": 1.266506784857776, + "grad_norm": 150677.703125, + "learning_rate": 0.0006208333333333334, + "loss": 3.1548, + "step": 1490 + }, + { + "epoch": 1.2750068303937343, + "grad_norm": 134445.515625, + "learning_rate": 0.000625, + "loss": 3.0924, + "step": 1500 + }, + { + "epoch": 1.2835068759296924, + "grad_norm": 116583.6796875, + "learning_rate": 0.0006291666666666667, + "loss": 3.023, + "step": 1510 + }, + { + "epoch": 1.2920069214656507, + "grad_norm": 99715.9765625, + "learning_rate": 0.0006333333333333333, + "loss": 2.9354, + "step": 1520 + }, + { + "epoch": 1.300506967001609, + "grad_norm": 97103.8125, + "learning_rate": 0.0006374999999999999, + "loss": 2.8965, + "step": 1530 + }, + { + "epoch": 1.300506967001609, + "eval_accuracy": 0.46788244925851397, + "eval_loss": 2.698025941848755, + "eval_runtime": 1277.5104, + "eval_samples_per_second": 390.899, + "eval_steps_per_second": 1.357, + "step": 1530 + }, + { + "epoch": 1.3090070125375672, + "grad_norm": 87332.8515625, + "learning_rate": 0.0006416666666666667, + "loss": 2.8339, + "step": 1540 + }, + { + "epoch": 1.3175070580735255, + "grad_norm": 89923.7890625, + "learning_rate": 0.0006458333333333334, + "loss": 2.7669, + "step": 1550 + }, + { + "epoch": 1.3260071036094836, + "grad_norm": 75226.8515625, + "learning_rate": 0.0006500000000000001, + "loss": 2.7303, + "step": 1560 + }, + { + "epoch": 1.3345071491454419, + "grad_norm": 66864.1484375, + "learning_rate": 0.0006541666666666667, + "loss": 2.6759, + "step": 1570 + }, + { + "epoch": 1.3430071946814, + "grad_norm": 61924.27734375, + "learning_rate": 0.0006583333333333334, + "loss": 2.6349, + "step": 1580 + }, + { + "epoch": 1.3515072402173582, + "grad_norm": 54619.734375, + "learning_rate": 0.0006625, + "loss": 2.5945, + "step": 1590 + }, + { + "epoch": 1.3600072857533165, + "grad_norm": 74138.375, + "learning_rate": 0.0006666666666666666, + "loss": 2.5633, + "step": 1600 + }, + { + "epoch": 1.3685073312892748, + "grad_norm": 58250.90625, + "learning_rate": 0.0006708333333333333, + "loss": 2.5349, + "step": 1610 + }, + { + "epoch": 1.377007376825233, + "grad_norm": 76097.328125, + "learning_rate": 0.000675, + "loss": 2.4952, + "step": 1620 + }, + { + "epoch": 1.377007376825233, + "eval_accuracy": 0.5196907758785413, + "eval_loss": 2.3480491638183594, + "eval_runtime": 1280.8752, + "eval_samples_per_second": 389.872, + "eval_steps_per_second": 1.354, + "step": 1620 + }, + { + "epoch": 1.3855074223611912, + "grad_norm": 77774.96875, + "learning_rate": 0.0006791666666666667, + "loss": 2.4686, + "step": 1630 + }, + { + "epoch": 1.3940074678971495, + "grad_norm": 65427.421875, + "learning_rate": 0.0006833333333333333, + "loss": 2.4615, + "step": 1640 + }, + { + "epoch": 1.4025075134331078, + "grad_norm": 56960.09765625, + "learning_rate": 0.0006875, + "loss": 2.4228, + "step": 1650 + }, + { + "epoch": 1.4110075589690658, + "grad_norm": 74808.4296875, + "learning_rate": 0.0006916666666666667, + "loss": 2.391, + "step": 1660 + }, + { + "epoch": 1.4195076045050241, + "grad_norm": 47526.44140625, + "learning_rate": 0.0006958333333333334, + "loss": 2.3693, + "step": 1670 + }, + { + "epoch": 1.4280076500409824, + "grad_norm": 54673.7578125, + "learning_rate": 0.0007, + "loss": 2.3505, + "step": 1680 + }, + { + "epoch": 1.4365076955769407, + "grad_norm": 33447.546875, + "learning_rate": 0.0007041666666666667, + "loss": 2.3205, + "step": 1690 + }, + { + "epoch": 1.4450077411128988, + "grad_norm": 56293.46484375, + "learning_rate": 0.0007083333333333334, + "loss": 2.318, + "step": 1700 + }, + { + "epoch": 1.453507786648857, + "grad_norm": 41077.09375, + "learning_rate": 0.0007125, + "loss": 2.2838, + "step": 1710 + }, + { + "epoch": 1.453507786648857, + "eval_accuracy": 0.5489981207105539, + "eval_loss": 2.1566824913024902, + "eval_runtime": 1279.9042, + "eval_samples_per_second": 390.168, + "eval_steps_per_second": 1.355, + "step": 1710 + }, + { + "epoch": 1.4620078321848153, + "grad_norm": 51759.4296875, + "learning_rate": 0.0007166666666666667, + "loss": 2.264, + "step": 1720 + }, + { + "epoch": 1.4705078777207734, + "grad_norm": 50884.80078125, + "learning_rate": 0.0007208333333333333, + "loss": 2.2408, + "step": 1730 + }, + { + "epoch": 1.4790079232567317, + "grad_norm": 38072.73046875, + "learning_rate": 0.000725, + "loss": 2.2221, + "step": 1740 + }, + { + "epoch": 1.48750796879269, + "grad_norm": 45036.5, + "learning_rate": 0.0007291666666666666, + "loss": 2.2046, + "step": 1750 + }, + { + "epoch": 1.4960080143286483, + "grad_norm": 30498.197265625, + "learning_rate": 0.0007333333333333333, + "loss": 2.1925, + "step": 1760 + }, + { + "epoch": 1.5045080598646066, + "grad_norm": 52624.26953125, + "learning_rate": 0.0007375000000000001, + "loss": 2.1845, + "step": 1770 + }, + { + "epoch": 1.5130081054005646, + "grad_norm": 39472.8125, + "learning_rate": 0.0007416666666666667, + "loss": 2.1579, + "step": 1780 + }, + { + "epoch": 1.521508150936523, + "grad_norm": 43868.4375, + "learning_rate": 0.0007458333333333334, + "loss": 2.1517, + "step": 1790 + }, + { + "epoch": 1.530008196472481, + "grad_norm": 39694.20703125, + "learning_rate": 0.00075, + "loss": 2.1324, + "step": 1800 + }, + { + "epoch": 1.530008196472481, + "eval_accuracy": 0.571443558749772, + "eval_loss": 2.0166752338409424, + "eval_runtime": 1279.8418, + "eval_samples_per_second": 390.187, + "eval_steps_per_second": 1.355, + "step": 1800 + }, + { + "epoch": 1.5385082420084393, + "grad_norm": 41399.078125, + "learning_rate": 0.0007541666666666667, + "loss": 2.1129, + "step": 1810 + }, + { + "epoch": 1.5470082875443976, + "grad_norm": 39988.51171875, + "learning_rate": 0.0007583333333333333, + "loss": 2.0979, + "step": 1820 + }, + { + "epoch": 1.5555083330803559, + "grad_norm": 40106.96875, + "learning_rate": 0.0007624999999999999, + "loss": 2.0861, + "step": 1830 + }, + { + "epoch": 1.5640083786163141, + "grad_norm": 37416.859375, + "learning_rate": 0.0007666666666666667, + "loss": 2.0691, + "step": 1840 + }, + { + "epoch": 1.5725084241522722, + "grad_norm": 45897.46484375, + "learning_rate": 0.0007708333333333334, + "loss": 2.0541, + "step": 1850 + }, + { + "epoch": 1.5810084696882305, + "grad_norm": 43155.91796875, + "learning_rate": 0.0007750000000000001, + "loss": 2.0409, + "step": 1860 + }, + { + "epoch": 1.5895085152241886, + "grad_norm": 44230.0703125, + "learning_rate": 0.0007791666666666667, + "loss": 2.026, + "step": 1870 + }, + { + "epoch": 1.5980085607601469, + "grad_norm": 41950.99609375, + "learning_rate": 0.0007833333333333334, + "loss": 2.007, + "step": 1880 + }, + { + "epoch": 1.6065086062961051, + "grad_norm": 50098.26171875, + "learning_rate": 0.0007875, + "loss": 2.004, + "step": 1890 + }, + { + "epoch": 1.6065086062961051, + "eval_accuracy": 0.5934511080462407, + "eval_loss": 1.89019775390625, + "eval_runtime": 1279.3367, + "eval_samples_per_second": 390.341, + "eval_steps_per_second": 1.355, + "step": 1890 + }, + { + "epoch": 1.6150086518320634, + "grad_norm": 38339.48046875, + "learning_rate": 0.0007916666666666666, + "loss": 1.992, + "step": 1900 + }, + { + "epoch": 1.6235086973680217, + "grad_norm": 45457.52734375, + "learning_rate": 0.0007958333333333333, + "loss": 1.9782, + "step": 1910 + }, + { + "epoch": 1.6320087429039798, + "grad_norm": 44519.359375, + "learning_rate": 0.0008, + "loss": 1.9713, + "step": 1920 + }, + { + "epoch": 1.640508788439938, + "grad_norm": 32601.9921875, + "learning_rate": 0.0008041666666666667, + "loss": 1.9564, + "step": 1930 + }, + { + "epoch": 1.6490088339758961, + "grad_norm": 38710.42578125, + "learning_rate": 0.0008083333333333333, + "loss": 1.9461, + "step": 1940 + }, + { + "epoch": 1.6575088795118544, + "grad_norm": 36938.29296875, + "learning_rate": 0.0008125000000000001, + "loss": 1.9386, + "step": 1950 + }, + { + "epoch": 1.6660089250478127, + "grad_norm": 39862.9609375, + "learning_rate": 0.0008166666666666667, + "loss": 1.9229, + "step": 1960 + }, + { + "epoch": 1.674508970583771, + "grad_norm": 29285.794921875, + "learning_rate": 0.0008208333333333334, + "loss": 1.9096, + "step": 1970 + }, + { + "epoch": 1.6830090161197293, + "grad_norm": 40694.77734375, + "learning_rate": 0.000825, + "loss": 1.8992, + "step": 1980 + }, + { + "epoch": 1.6830090161197293, + "eval_accuracy": 0.6102293333175925, + "eval_loss": 1.7952998876571655, + "eval_runtime": 1281.484, + "eval_samples_per_second": 389.687, + "eval_steps_per_second": 1.353, + "step": 1980 + }, + { + "epoch": 1.6915090616556876, + "grad_norm": 52670.69140625, + "learning_rate": 0.0008291666666666667, + "loss": 1.8874, + "step": 1990 + }, + { + "epoch": 1.7000091071916457, + "grad_norm": 41263.34375, + "learning_rate": 0.0008333333333333334, + "loss": 1.8832, + "step": 2000 + }, + { + "epoch": 1.708509152727604, + "grad_norm": 48928.69921875, + "learning_rate": 0.0008375, + "loss": 1.8709, + "step": 2010 + }, + { + "epoch": 1.717009198263562, + "grad_norm": 34242.77734375, + "learning_rate": 0.0008416666666666667, + "loss": 1.8672, + "step": 2020 + }, + { + "epoch": 1.7255092437995203, + "grad_norm": 43822.64453125, + "learning_rate": 0.0008458333333333333, + "loss": 1.8542, + "step": 2030 + }, + { + "epoch": 1.7340092893354786, + "grad_norm": 30140.544921875, + "learning_rate": 0.00085, + "loss": 1.8418, + "step": 2040 + }, + { + "epoch": 1.7425093348714369, + "grad_norm": 32354.177734375, + "learning_rate": 0.0008541666666666666, + "loss": 1.833, + "step": 2050 + }, + { + "epoch": 1.7510093804073952, + "grad_norm": 37106.171875, + "learning_rate": 0.0008583333333333333, + "loss": 1.8285, + "step": 2060 + }, + { + "epoch": 1.7595094259433532, + "grad_norm": 35038.6875, + "learning_rate": 0.0008625000000000001, + "loss": 1.815, + "step": 2070 + }, + { + "epoch": 1.7595094259433532, + "eval_accuracy": 0.6255571282197189, + "eval_loss": 1.7101428508758545, + "eval_runtime": 1279.0851, + "eval_samples_per_second": 390.418, + "eval_steps_per_second": 1.356, + "step": 2070 + }, + { + "epoch": 1.7680094714793115, + "grad_norm": 33263.3046875, + "learning_rate": 0.0008666666666666667, + "loss": 1.8021, + "step": 2080 + }, + { + "epoch": 1.7765095170152696, + "grad_norm": 29019.279296875, + "learning_rate": 0.0008708333333333334, + "loss": 1.7913, + "step": 2090 + }, + { + "epoch": 1.785009562551228, + "grad_norm": 40720.78125, + "learning_rate": 0.000875, + "loss": 1.7823, + "step": 2100 + }, + { + "epoch": 1.7935096080871862, + "grad_norm": 40485.59765625, + "learning_rate": 0.0008791666666666667, + "loss": 1.7738, + "step": 2110 + }, + { + "epoch": 1.8020096536231445, + "grad_norm": 34836.8984375, + "learning_rate": 0.0008833333333333333, + "loss": 1.7628, + "step": 2120 + }, + { + "epoch": 1.8105096991591028, + "grad_norm": 37340.25, + "learning_rate": 0.0008874999999999999, + "loss": 1.7564, + "step": 2130 + }, + { + "epoch": 1.8190097446950608, + "grad_norm": 34712.98828125, + "learning_rate": 0.0008916666666666667, + "loss": 1.7454, + "step": 2140 + }, + { + "epoch": 1.8275097902310191, + "grad_norm": 36124.44921875, + "learning_rate": 0.0008958333333333334, + "loss": 1.7375, + "step": 2150 + }, + { + "epoch": 1.8360098357669772, + "grad_norm": 26800.90234375, + "learning_rate": 0.0009000000000000001, + "loss": 1.724, + "step": 2160 + }, + { + "epoch": 1.8360098357669772, + "eval_accuracy": 0.6417359150220479, + "eval_loss": 1.619744062423706, + "eval_runtime": 1279.7377, + "eval_samples_per_second": 390.219, + "eval_steps_per_second": 1.355, + "step": 2160 + }, + { + "epoch": 1.8445098813029355, + "grad_norm": 29157.935546875, + "learning_rate": 0.0009041666666666667, + "loss": 1.714, + "step": 2170 + }, + { + "epoch": 1.8530099268388938, + "grad_norm": 32622.201171875, + "learning_rate": 0.0009083333333333334, + "loss": 1.6972, + "step": 2180 + }, + { + "epoch": 1.861509972374852, + "grad_norm": 27361.466796875, + "learning_rate": 0.0009125, + "loss": 1.6807, + "step": 2190 + }, + { + "epoch": 1.8700100179108103, + "grad_norm": 30800.376953125, + "learning_rate": 0.0009166666666666666, + "loss": 1.6644, + "step": 2200 + }, + { + "epoch": 1.8785100634467686, + "grad_norm": 28547.041015625, + "learning_rate": 0.0009208333333333333, + "loss": 1.6546, + "step": 2210 + }, + { + "epoch": 1.8870101089827267, + "grad_norm": 32673.380859375, + "learning_rate": 0.000925, + "loss": 1.6407, + "step": 2220 + }, + { + "epoch": 1.8955101545186848, + "grad_norm": 38690.15625, + "learning_rate": 0.0009291666666666667, + "loss": 1.6327, + "step": 2230 + }, + { + "epoch": 1.904010200054643, + "grad_norm": 24017.017578125, + "learning_rate": 0.0009333333333333333, + "loss": 1.6185, + "step": 2240 + }, + { + "epoch": 1.9125102455906013, + "grad_norm": 33731.96484375, + "learning_rate": 0.0009375, + "loss": 1.6115, + "step": 2250 + }, + { + "epoch": 1.9125102455906013, + "eval_accuracy": 0.6670528637301408, + "eval_loss": 1.5023800134658813, + "eval_runtime": 1278.3406, + "eval_samples_per_second": 390.646, + "eval_steps_per_second": 1.356, + "step": 2250 + }, + { + "epoch": 1.9210102911265596, + "grad_norm": 35314.43359375, + "learning_rate": 0.0009416666666666667, + "loss": 1.5993, + "step": 2260 + }, + { + "epoch": 1.929510336662518, + "grad_norm": 36152.0078125, + "learning_rate": 0.0009458333333333334, + "loss": 1.5944, + "step": 2270 + }, + { + "epoch": 1.9380103821984762, + "grad_norm": 31576.48046875, + "learning_rate": 0.00095, + "loss": 1.5867, + "step": 2280 + }, + { + "epoch": 1.9465104277344343, + "grad_norm": 31250.708984375, + "learning_rate": 0.0009541666666666667, + "loss": 1.5779, + "step": 2290 + }, + { + "epoch": 1.9550104732703926, + "grad_norm": 39243.34375, + "learning_rate": 0.0009583333333333334, + "loss": 1.5693, + "step": 2300 + }, + { + "epoch": 1.9635105188063506, + "grad_norm": 26391.3046875, + "learning_rate": 0.0009625, + "loss": 1.5671, + "step": 2310 + }, + { + "epoch": 1.972010564342309, + "grad_norm": 34276.82421875, + "learning_rate": 0.0009666666666666667, + "loss": 1.56, + "step": 2320 + }, + { + "epoch": 1.9805106098782672, + "grad_norm": 28257.998046875, + "learning_rate": 0.0009708333333333333, + "loss": 1.5482, + "step": 2330 + }, + { + "epoch": 1.9890106554142255, + "grad_norm": 38320.5234375, + "learning_rate": 0.000975, + "loss": 1.5435, + "step": 2340 + }, + { + "epoch": 1.9890106554142255, + "eval_accuracy": 0.679645168221919, + "eval_loss": 1.4393101930618286, + "eval_runtime": 1277.7056, + "eval_samples_per_second": 390.84, + "eval_steps_per_second": 1.357, + "step": 2340 + }, + { + "epoch": 1.9975107009501838, + "grad_norm": 30497.1171875, + "learning_rate": 0.0009791666666666666, + "loss": 1.5364, + "step": 2350 + }, + { + "epoch": 2.006010746486142, + "grad_norm": 30113.25, + "learning_rate": 0.0009833333333333332, + "loss": 1.5333, + "step": 2360 + }, + { + "epoch": 2.0145107920221, + "grad_norm": 23521.29296875, + "learning_rate": 0.0009875, + "loss": 1.528, + "step": 2370 + }, + { + "epoch": 2.023010837558058, + "grad_norm": 33135.26953125, + "learning_rate": 0.0009916666666666667, + "loss": 1.5211, + "step": 2380 + }, + { + "epoch": 2.0315108830940165, + "grad_norm": 22258.33984375, + "learning_rate": 0.0009958333333333334, + "loss": 1.5173, + "step": 2390 + }, + { + "epoch": 2.040010928629975, + "grad_norm": 30941.345703125, + "learning_rate": 0.001, + "loss": 1.5097, + "step": 2400 + }, + { + "epoch": 2.048510974165933, + "grad_norm": 27459.111328125, + "learning_rate": 0.000999009900990099, + "loss": 1.5022, + "step": 2410 + }, + { + "epoch": 2.0570110197018914, + "grad_norm": 23237.234375, + "learning_rate": 0.0009980198019801981, + "loss": 1.4956, + "step": 2420 + }, + { + "epoch": 2.0655110652378497, + "grad_norm": 24305.986328125, + "learning_rate": 0.000997029702970297, + "loss": 1.4921, + "step": 2430 + }, + { + "epoch": 2.0655110652378497, + "eval_accuracy": 0.6886431014072092, + "eval_loss": 1.3913538455963135, + "eval_runtime": 1278.454, + "eval_samples_per_second": 390.611, + "eval_steps_per_second": 1.356, + "step": 2430 + }, + { + "epoch": 2.0740111107738075, + "grad_norm": 26497.21484375, + "learning_rate": 0.000996039603960396, + "loss": 1.4843, + "step": 2440 + }, + { + "epoch": 2.082511156309766, + "grad_norm": 24107.4375, + "learning_rate": 0.000995049504950495, + "loss": 1.4824, + "step": 2450 + }, + { + "epoch": 2.091011201845724, + "grad_norm": 26006.498046875, + "learning_rate": 0.0009940594059405941, + "loss": 1.4763, + "step": 2460 + }, + { + "epoch": 2.0995112473816824, + "grad_norm": 27216.076171875, + "learning_rate": 0.0009930693069306932, + "loss": 1.4704, + "step": 2470 + }, + { + "epoch": 2.1080112929176407, + "grad_norm": 21669.0546875, + "learning_rate": 0.000992079207920792, + "loss": 1.4636, + "step": 2480 + }, + { + "epoch": 2.116511338453599, + "grad_norm": 23455.484375, + "learning_rate": 0.000991089108910891, + "loss": 1.4566, + "step": 2490 + }, + { + "epoch": 2.1250113839895572, + "grad_norm": 20500.41015625, + "learning_rate": 0.0009900990099009901, + "loss": 1.4554, + "step": 2500 + }, + { + "epoch": 2.133511429525515, + "grad_norm": 26141.59765625, + "learning_rate": 0.0009891089108910892, + "loss": 1.4484, + "step": 2510 + }, + { + "epoch": 2.1420114750614734, + "grad_norm": 23275.765625, + "learning_rate": 0.0009881188118811882, + "loss": 1.4427, + "step": 2520 + }, + { + "epoch": 2.1420114750614734, + "eval_accuracy": 0.6980444623562028, + "eval_loss": 1.3418840169906616, + "eval_runtime": 1280.6296, + "eval_samples_per_second": 389.947, + "eval_steps_per_second": 1.354, + "step": 2520 + }, + { + "epoch": 2.1505115205974317, + "grad_norm": 26020.708984375, + "learning_rate": 0.000987128712871287, + "loss": 1.4374, + "step": 2530 + }, + { + "epoch": 2.15901156613339, + "grad_norm": 20408.48828125, + "learning_rate": 0.000986138613861386, + "loss": 1.4302, + "step": 2540 + }, + { + "epoch": 2.1675116116693482, + "grad_norm": 23640.2265625, + "learning_rate": 0.0009851485148514852, + "loss": 1.4277, + "step": 2550 + }, + { + "epoch": 2.1760116572053065, + "grad_norm": 22099.396484375, + "learning_rate": 0.0009841584158415842, + "loss": 1.4217, + "step": 2560 + }, + { + "epoch": 2.184511702741265, + "grad_norm": 23317.130859375, + "learning_rate": 0.0009831683168316833, + "loss": 1.4188, + "step": 2570 + }, + { + "epoch": 2.193011748277223, + "grad_norm": 24106.27734375, + "learning_rate": 0.000982178217821782, + "loss": 1.4147, + "step": 2580 + }, + { + "epoch": 2.201511793813181, + "grad_norm": 18410.43359375, + "learning_rate": 0.0009811881188118811, + "loss": 1.4107, + "step": 2590 + }, + { + "epoch": 2.2100118393491393, + "grad_norm": 22841.390625, + "learning_rate": 0.0009801980198019802, + "loss": 1.4075, + "step": 2600 + }, + { + "epoch": 2.2185118848850975, + "grad_norm": 24554.787109375, + "learning_rate": 0.0009792079207920793, + "loss": 1.404, + "step": 2610 + }, + { + "epoch": 2.2185118848850975, + "eval_accuracy": 0.7051629840505044, + "eval_loss": 1.3090746402740479, + "eval_runtime": 1279.6345, + "eval_samples_per_second": 390.25, + "eval_steps_per_second": 1.355, + "step": 2610 + }, + { + "epoch": 2.227011930421056, + "grad_norm": 25836.71484375, + "learning_rate": 0.0009782178217821783, + "loss": 1.4039, + "step": 2620 + }, + { + "epoch": 2.235511975957014, + "grad_norm": 21653.619140625, + "learning_rate": 0.0009772277227722771, + "loss": 1.3946, + "step": 2630 + }, + { + "epoch": 2.2440120214929724, + "grad_norm": 23728.3046875, + "learning_rate": 0.0009762376237623762, + "loss": 1.3929, + "step": 2640 + }, + { + "epoch": 2.2525120670289307, + "grad_norm": 21733.001953125, + "learning_rate": 0.0009752475247524752, + "loss": 1.3884, + "step": 2650 + }, + { + "epoch": 2.261012112564889, + "grad_norm": 21566.19921875, + "learning_rate": 0.0009742574257425743, + "loss": 1.3825, + "step": 2660 + }, + { + "epoch": 2.269512158100847, + "grad_norm": 19529.41015625, + "learning_rate": 0.0009732673267326732, + "loss": 1.3786, + "step": 2670 + }, + { + "epoch": 2.278012203636805, + "grad_norm": 20555.123046875, + "learning_rate": 0.0009722772277227723, + "loss": 1.3755, + "step": 2680 + }, + { + "epoch": 2.2865122491727634, + "grad_norm": 19776.90234375, + "learning_rate": 0.0009712871287128712, + "loss": 1.3706, + "step": 2690 + }, + { + "epoch": 2.2950122947087217, + "grad_norm": 18725.46875, + "learning_rate": 0.0009702970297029703, + "loss": 1.3703, + "step": 2700 + }, + { + "epoch": 2.2950122947087217, + "eval_accuracy": 0.7118258434856092, + "eval_loss": 1.2756892442703247, + "eval_runtime": 1278.3225, + "eval_samples_per_second": 390.651, + "eval_steps_per_second": 1.356, + "step": 2700 + }, + { + "epoch": 2.30351234024468, + "grad_norm": 21781.615234375, + "learning_rate": 0.0009693069306930693, + "loss": 1.369, + "step": 2710 + }, + { + "epoch": 2.3120123857806383, + "grad_norm": 21352.599609375, + "learning_rate": 0.0009683168316831683, + "loss": 1.3609, + "step": 2720 + }, + { + "epoch": 2.3205124313165966, + "grad_norm": 19621.919921875, + "learning_rate": 0.0009673267326732673, + "loss": 1.3579, + "step": 2730 + }, + { + "epoch": 2.3290124768525544, + "grad_norm": 15898.189453125, + "learning_rate": 0.0009663366336633663, + "loss": 1.3585, + "step": 2740 + }, + { + "epoch": 2.3375125223885127, + "grad_norm": 24640.2734375, + "learning_rate": 0.0009653465346534653, + "loss": 1.3521, + "step": 2750 + }, + { + "epoch": 2.346012567924471, + "grad_norm": 21775.046875, + "learning_rate": 0.0009643564356435644, + "loss": 1.3496, + "step": 2760 + }, + { + "epoch": 2.3545126134604293, + "grad_norm": 20199.751953125, + "learning_rate": 0.0009633663366336633, + "loss": 1.3477, + "step": 2770 + }, + { + "epoch": 2.3630126589963876, + "grad_norm": 23324.25, + "learning_rate": 0.0009623762376237624, + "loss": 1.344, + "step": 2780 + }, + { + "epoch": 2.371512704532346, + "grad_norm": 23789.677734375, + "learning_rate": 0.0009613861386138613, + "loss": 1.3423, + "step": 2790 + }, + { + "epoch": 2.371512704532346, + "eval_accuracy": 0.7178257874607675, + "eval_loss": 1.2434872388839722, + "eval_runtime": 1276.5226, + "eval_samples_per_second": 391.202, + "eval_steps_per_second": 1.358, + "step": 2790 + }, + { + "epoch": 2.380012750068304, + "grad_norm": 22512.796875, + "learning_rate": 0.0009603960396039604, + "loss": 1.3394, + "step": 2800 + }, + { + "epoch": 2.388512795604262, + "grad_norm": 24620.826171875, + "learning_rate": 0.0009594059405940594, + "loss": 1.3331, + "step": 2810 + }, + { + "epoch": 2.3970128411402203, + "grad_norm": 20756.77734375, + "learning_rate": 0.0009584158415841584, + "loss": 1.3293, + "step": 2820 + }, + { + "epoch": 2.4055128866761786, + "grad_norm": 17418.865234375, + "learning_rate": 0.0009574257425742574, + "loss": 1.3268, + "step": 2830 + }, + { + "epoch": 2.414012932212137, + "grad_norm": 19701.8984375, + "learning_rate": 0.0009564356435643564, + "loss": 1.3256, + "step": 2840 + }, + { + "epoch": 2.422512977748095, + "grad_norm": 22757.66015625, + "learning_rate": 0.0009554455445544554, + "loss": 1.3229, + "step": 2850 + }, + { + "epoch": 2.4310130232840534, + "grad_norm": 24491.03515625, + "learning_rate": 0.0009544554455445545, + "loss": 1.3187, + "step": 2860 + }, + { + "epoch": 2.4395130688200117, + "grad_norm": 19789.939453125, + "learning_rate": 0.0009534653465346534, + "loss": 1.3176, + "step": 2870 + }, + { + "epoch": 2.4480131143559696, + "grad_norm": 19331.78515625, + "learning_rate": 0.0009524752475247525, + "loss": 1.3165, + "step": 2880 + }, + { + "epoch": 2.4480131143559696, + "eval_accuracy": 0.7224858086201374, + "eval_loss": 1.2204521894454956, + "eval_runtime": 1278.1811, + "eval_samples_per_second": 390.694, + "eval_steps_per_second": 1.357, + "step": 2880 + }, + { + "epoch": 2.456513159891928, + "grad_norm": 19115.720703125, + "learning_rate": 0.0009514851485148514, + "loss": 1.3118, + "step": 2890 + }, + { + "epoch": 2.465013205427886, + "grad_norm": 18051.21875, + "learning_rate": 0.0009504950495049505, + "loss": 1.3087, + "step": 2900 + }, + { + "epoch": 2.4735132509638444, + "grad_norm": 17369.791015625, + "learning_rate": 0.0009495049504950495, + "loss": 1.3088, + "step": 2910 + }, + { + "epoch": 2.4820132964998027, + "grad_norm": 19926.080078125, + "learning_rate": 0.0009485148514851485, + "loss": 1.3067, + "step": 2920 + }, + { + "epoch": 2.490513342035761, + "grad_norm": 21108.0078125, + "learning_rate": 0.0009475247524752475, + "loss": 1.3034, + "step": 2930 + }, + { + "epoch": 2.4990133875717193, + "grad_norm": 20779.73828125, + "learning_rate": 0.0009465346534653465, + "loss": 1.2954, + "step": 2940 + }, + { + "epoch": 2.507513433107677, + "grad_norm": 19840.251953125, + "learning_rate": 0.0009455445544554455, + "loss": 1.2957, + "step": 2950 + }, + { + "epoch": 2.5160134786436354, + "grad_norm": 21993.7734375, + "learning_rate": 0.0009445544554455446, + "loss": 1.2948, + "step": 2960 + }, + { + "epoch": 2.5245135241795937, + "grad_norm": 23669.6171875, + "learning_rate": 0.0009435643564356435, + "loss": 1.2932, + "step": 2970 + }, + { + "epoch": 2.5245135241795937, + "eval_accuracy": 0.7274937619747395, + "eval_loss": 1.196637511253357, + "eval_runtime": 1278.0042, + "eval_samples_per_second": 390.748, + "eval_steps_per_second": 1.357, + "step": 2970 + }, + { + "epoch": 2.533013569715552, + "grad_norm": 16925.79296875, + "learning_rate": 0.0009425742574257426, + "loss": 1.2884, + "step": 2980 + }, + { + "epoch": 2.5415136152515103, + "grad_norm": 16753.69921875, + "learning_rate": 0.0009415841584158415, + "loss": 1.2862, + "step": 2990 + }, + { + "epoch": 2.5500136607874686, + "grad_norm": 19538.154296875, + "learning_rate": 0.0009405940594059406, + "loss": 1.2843, + "step": 3000 + }, + { + "epoch": 2.558513706323427, + "grad_norm": 18874.037109375, + "learning_rate": 0.0009396039603960396, + "loss": 1.2821, + "step": 3010 + }, + { + "epoch": 2.5670137518593847, + "grad_norm": 19626.3671875, + "learning_rate": 0.0009386138613861386, + "loss": 1.2793, + "step": 3020 + }, + { + "epoch": 2.575513797395343, + "grad_norm": 21052.349609375, + "learning_rate": 0.0009376237623762376, + "loss": 1.2789, + "step": 3030 + }, + { + "epoch": 2.5840138429313013, + "grad_norm": 19245.396484375, + "learning_rate": 0.0009366336633663367, + "loss": 1.2744, + "step": 3040 + }, + { + "epoch": 2.5925138884672596, + "grad_norm": 16579.640625, + "learning_rate": 0.0009356435643564357, + "loss": 1.2726, + "step": 3050 + }, + { + "epoch": 2.601013934003218, + "grad_norm": 17490.4609375, + "learning_rate": 0.0009346534653465348, + "loss": 1.2726, + "step": 3060 + }, + { + "epoch": 2.601013934003218, + "eval_accuracy": 0.7312536295087113, + "eval_loss": 1.1788941621780396, + "eval_runtime": 1282.3919, + "eval_samples_per_second": 389.411, + "eval_steps_per_second": 1.352, + "step": 3060 + }, + { + "epoch": 2.609513979539176, + "grad_norm": 21261.8125, + "learning_rate": 0.0009336633663366337, + "loss": 1.2683, + "step": 3070 + }, + { + "epoch": 2.6180140250751345, + "grad_norm": 21293.064453125, + "learning_rate": 0.0009326732673267328, + "loss": 1.2695, + "step": 3080 + }, + { + "epoch": 2.6265140706110923, + "grad_norm": 18312.087890625, + "learning_rate": 0.0009316831683168317, + "loss": 1.2651, + "step": 3090 + }, + { + "epoch": 2.635014116147051, + "grad_norm": 18225.01953125, + "learning_rate": 0.0009306930693069308, + "loss": 1.2608, + "step": 3100 + }, + { + "epoch": 2.643514161683009, + "grad_norm": 19320.2890625, + "learning_rate": 0.0009297029702970298, + "loss": 1.2628, + "step": 3110 + }, + { + "epoch": 2.652014207218967, + "grad_norm": 19210.060546875, + "learning_rate": 0.0009287128712871288, + "loss": 1.2602, + "step": 3120 + }, + { + "epoch": 2.6605142527549255, + "grad_norm": 20046.904296875, + "learning_rate": 0.0009277227722772278, + "loss": 1.2569, + "step": 3130 + }, + { + "epoch": 2.6690142982908838, + "grad_norm": 20291.744140625, + "learning_rate": 0.0009267326732673268, + "loss": 1.2546, + "step": 3140 + }, + { + "epoch": 2.677514343826842, + "grad_norm": 17438.1796875, + "learning_rate": 0.0009257425742574258, + "loss": 1.2553, + "step": 3150 + }, + { + "epoch": 2.677514343826842, + "eval_accuracy": 0.7340778616085779, + "eval_loss": 1.1635637283325195, + "eval_runtime": 1283.8261, + "eval_samples_per_second": 388.976, + "eval_steps_per_second": 1.351, + "step": 3150 + }, + { + "epoch": 2.6860143893628, + "grad_norm": 16703.447265625, + "learning_rate": 0.0009247524752475249, + "loss": 1.2526, + "step": 3160 + }, + { + "epoch": 2.6945144348987586, + "grad_norm": 16225.796875, + "learning_rate": 0.0009237623762376238, + "loss": 1.252, + "step": 3170 + }, + { + "epoch": 2.7030144804347165, + "grad_norm": 19865.849609375, + "learning_rate": 0.0009227722772277229, + "loss": 1.2472, + "step": 3180 + }, + { + "epoch": 2.7115145259706748, + "grad_norm": 18682.638671875, + "learning_rate": 0.0009217821782178218, + "loss": 1.2485, + "step": 3190 + }, + { + "epoch": 2.720014571506633, + "grad_norm": 24501.96875, + "learning_rate": 0.0009207920792079209, + "loss": 1.2447, + "step": 3200 + }, + { + "epoch": 2.7285146170425914, + "grad_norm": 16843.01953125, + "learning_rate": 0.0009198019801980199, + "loss": 1.2431, + "step": 3210 + }, + { + "epoch": 2.7370146625785496, + "grad_norm": 19249.625, + "learning_rate": 0.0009188118811881188, + "loss": 1.2404, + "step": 3220 + }, + { + "epoch": 2.7455147081145075, + "grad_norm": 16381.576171875, + "learning_rate": 0.0009178217821782179, + "loss": 1.2402, + "step": 3230 + }, + { + "epoch": 2.754014753650466, + "grad_norm": 17400.1015625, + "learning_rate": 0.0009168316831683168, + "loss": 1.2352, + "step": 3240 + }, + { + "epoch": 2.754014753650466, + "eval_accuracy": 0.738359822040415, + "eval_loss": 1.1412700414657593, + "eval_runtime": 1278.9235, + "eval_samples_per_second": 390.467, + "eval_steps_per_second": 1.356, + "step": 3240 + }, + { + "epoch": 2.762514799186424, + "grad_norm": 20167.6328125, + "learning_rate": 0.0009158415841584159, + "loss": 1.2357, + "step": 3250 + }, + { + "epoch": 2.7710148447223824, + "grad_norm": 17094.001953125, + "learning_rate": 0.000914851485148515, + "loss": 1.2327, + "step": 3260 + }, + { + "epoch": 2.7795148902583406, + "grad_norm": 19824.177734375, + "learning_rate": 0.0009138613861386139, + "loss": 1.2323, + "step": 3270 + }, + { + "epoch": 2.788014935794299, + "grad_norm": 20459.54296875, + "learning_rate": 0.0009128712871287129, + "loss": 1.2317, + "step": 3280 + }, + { + "epoch": 2.796514981330257, + "grad_norm": 19844.548828125, + "learning_rate": 0.0009118811881188119, + "loss": 1.2303, + "step": 3290 + }, + { + "epoch": 2.8050150268662155, + "grad_norm": 15563.7314453125, + "learning_rate": 0.0009108910891089109, + "loss": 1.2254, + "step": 3300 + }, + { + "epoch": 2.813515072402174, + "grad_norm": 19343.908203125, + "learning_rate": 0.00090990099009901, + "loss": 1.2253, + "step": 3310 + }, + { + "epoch": 2.8220151179381316, + "grad_norm": 15785.529296875, + "learning_rate": 0.0009089108910891089, + "loss": 1.2263, + "step": 3320 + }, + { + "epoch": 2.83051516347409, + "grad_norm": 21025.06640625, + "learning_rate": 0.000907920792079208, + "loss": 1.224, + "step": 3330 + }, + { + "epoch": 2.83051516347409, + "eval_accuracy": 0.7409290871538374, + "eval_loss": 1.1298062801361084, + "eval_runtime": 1281.4127, + "eval_samples_per_second": 389.709, + "eval_steps_per_second": 1.353, + "step": 3330 + }, + { + "epoch": 2.8390152090100482, + "grad_norm": 16538.94140625, + "learning_rate": 0.0009069306930693069, + "loss": 1.223, + "step": 3340 + }, + { + "epoch": 2.8475152545460065, + "grad_norm": 18982.095703125, + "learning_rate": 0.000905940594059406, + "loss": 1.2217, + "step": 3350 + }, + { + "epoch": 2.856015300081965, + "grad_norm": 18279.283203125, + "learning_rate": 0.000904950495049505, + "loss": 1.2178, + "step": 3360 + }, + { + "epoch": 2.864515345617923, + "grad_norm": 19073.97265625, + "learning_rate": 0.000903960396039604, + "loss": 1.2142, + "step": 3370 + }, + { + "epoch": 2.8730153911538814, + "grad_norm": 18313.646484375, + "learning_rate": 0.000902970297029703, + "loss": 1.2164, + "step": 3380 + }, + { + "epoch": 2.8815154366898392, + "grad_norm": 18153.08203125, + "learning_rate": 0.000901980198019802, + "loss": 1.2132, + "step": 3390 + }, + { + "epoch": 2.8900154822257975, + "grad_norm": 17545.95703125, + "learning_rate": 0.000900990099009901, + "loss": 1.211, + "step": 3400 + }, + { + "epoch": 2.898515527761756, + "grad_norm": 17355.42578125, + "learning_rate": 0.0009000000000000001, + "loss": 1.2137, + "step": 3410 + }, + { + "epoch": 2.907015573297714, + "grad_norm": 16659.796875, + "learning_rate": 0.000899009900990099, + "loss": 1.2094, + "step": 3420 + }, + { + "epoch": 2.907015573297714, + "eval_accuracy": 0.7434043563853827, + "eval_loss": 1.1177165508270264, + "eval_runtime": 1280.5435, + "eval_samples_per_second": 389.973, + "eval_steps_per_second": 1.354, + "step": 3420 + }, + { + "epoch": 2.9155156188336724, + "grad_norm": 20371.392578125, + "learning_rate": 0.0008980198019801981, + "loss": 1.2082, + "step": 3430 + }, + { + "epoch": 2.9240156643696307, + "grad_norm": 20884.041015625, + "learning_rate": 0.000897029702970297, + "loss": 1.2049, + "step": 3440 + }, + { + "epoch": 2.932515709905589, + "grad_norm": 18348.6328125, + "learning_rate": 0.0008960396039603961, + "loss": 1.2043, + "step": 3450 + }, + { + "epoch": 2.941015755441547, + "grad_norm": 17201.962890625, + "learning_rate": 0.0008950495049504951, + "loss": 1.2023, + "step": 3460 + }, + { + "epoch": 2.949515800977505, + "grad_norm": 18133.63671875, + "learning_rate": 0.0008940594059405941, + "loss": 1.2026, + "step": 3470 + }, + { + "epoch": 2.9580158465134634, + "grad_norm": 16873.494140625, + "learning_rate": 0.0008930693069306931, + "loss": 1.1983, + "step": 3480 + }, + { + "epoch": 2.9665158920494217, + "grad_norm": 18968.74609375, + "learning_rate": 0.0008920792079207921, + "loss": 1.2007, + "step": 3490 + }, + { + "epoch": 2.97501593758538, + "grad_norm": 16066.7578125, + "learning_rate": 0.0008910891089108911, + "loss": 1.1992, + "step": 3500 + }, + { + "epoch": 2.9835159831213383, + "grad_norm": 18147.33203125, + "learning_rate": 0.0008900990099009902, + "loss": 1.1955, + "step": 3510 + }, + { + "epoch": 2.9835159831213383, + "eval_accuracy": 0.7461886735137164, + "eval_loss": 1.101768970489502, + "eval_runtime": 1279.5411, + "eval_samples_per_second": 390.279, + "eval_steps_per_second": 1.355, + "step": 3510 + }, + { + "epoch": 2.9920160286572965, + "grad_norm": 16867.4296875, + "learning_rate": 0.0008891089108910891, + "loss": 1.1933, + "step": 3520 + }, + { + "epoch": 3.0005160741932544, + "grad_norm": 17814.7109375, + "learning_rate": 0.0008881188118811882, + "loss": 1.1949, + "step": 3530 + }, + { + "epoch": 3.0090161197292127, + "grad_norm": 17680.919921875, + "learning_rate": 0.0008871287128712871, + "loss": 1.1922, + "step": 3540 + }, + { + "epoch": 3.017516165265171, + "grad_norm": 20543.033203125, + "learning_rate": 0.0008861386138613862, + "loss": 1.1924, + "step": 3550 + }, + { + "epoch": 3.0260162108011293, + "grad_norm": 22293.501953125, + "learning_rate": 0.0008851485148514852, + "loss": 1.1918, + "step": 3560 + }, + { + "epoch": 3.0345162563370875, + "grad_norm": 18199.220703125, + "learning_rate": 0.0008841584158415842, + "loss": 1.1886, + "step": 3570 + }, + { + "epoch": 3.043016301873046, + "grad_norm": 17805.478515625, + "learning_rate": 0.0008831683168316832, + "loss": 1.1878, + "step": 3580 + }, + { + "epoch": 3.051516347409004, + "grad_norm": 17259.236328125, + "learning_rate": 0.0008821782178217822, + "loss": 1.1862, + "step": 3590 + }, + { + "epoch": 3.0600163929449624, + "grad_norm": 15020.2236328125, + "learning_rate": 0.0008811881188118812, + "loss": 1.1859, + "step": 3600 + }, + { + "epoch": 3.0600163929449624, + "eval_accuracy": 0.7483353625113318, + "eval_loss": 1.0916943550109863, + "eval_runtime": 1279.6266, + "eval_samples_per_second": 390.253, + "eval_steps_per_second": 1.355, + "step": 3600 + }, + { + "epoch": 3.0685164384809203, + "grad_norm": 17711.4765625, + "learning_rate": 0.0008801980198019803, + "loss": 1.1849, + "step": 3610 + }, + { + "epoch": 3.0770164840168786, + "grad_norm": 21087.078125, + "learning_rate": 0.0008792079207920792, + "loss": 1.1835, + "step": 3620 + }, + { + "epoch": 3.085516529552837, + "grad_norm": 15549.55859375, + "learning_rate": 0.0008782178217821783, + "loss": 1.1821, + "step": 3630 + }, + { + "epoch": 3.094016575088795, + "grad_norm": 17686.115234375, + "learning_rate": 0.0008772277227722772, + "loss": 1.1793, + "step": 3640 + }, + { + "epoch": 3.1025166206247534, + "grad_norm": 18682.033203125, + "learning_rate": 0.0008762376237623763, + "loss": 1.1787, + "step": 3650 + }, + { + "epoch": 3.1110166661607117, + "grad_norm": 18502.626953125, + "learning_rate": 0.0008752475247524753, + "loss": 1.1776, + "step": 3660 + }, + { + "epoch": 3.11951671169667, + "grad_norm": 14834.3359375, + "learning_rate": 0.0008742574257425743, + "loss": 1.1772, + "step": 3670 + }, + { + "epoch": 3.128016757232628, + "grad_norm": 17454.373046875, + "learning_rate": 0.0008732673267326733, + "loss": 1.1766, + "step": 3680 + }, + { + "epoch": 3.136516802768586, + "grad_norm": 17697.75, + "learning_rate": 0.0008722772277227722, + "loss": 1.1764, + "step": 3690 + }, + { + "epoch": 3.136516802768586, + "eval_accuracy": 0.7508950932738141, + "eval_loss": 1.0795581340789795, + "eval_runtime": 1281.2745, + "eval_samples_per_second": 389.751, + "eval_steps_per_second": 1.353, + "step": 3690 + }, + { + "epoch": 3.1450168483045444, + "grad_norm": 20430.919921875, + "learning_rate": 0.0008712871287128713, + "loss": 1.1738, + "step": 3700 + }, + { + "epoch": 3.1535168938405027, + "grad_norm": 16903.4453125, + "learning_rate": 0.0008702970297029704, + "loss": 1.1709, + "step": 3710 + }, + { + "epoch": 3.162016939376461, + "grad_norm": 17143.486328125, + "learning_rate": 0.0008693069306930693, + "loss": 1.1698, + "step": 3720 + }, + { + "epoch": 3.1705169849124193, + "grad_norm": 15930.9501953125, + "learning_rate": 0.0008683168316831684, + "loss": 1.1696, + "step": 3730 + }, + { + "epoch": 3.1790170304483776, + "grad_norm": 16211.982421875, + "learning_rate": 0.0008673267326732673, + "loss": 1.169, + "step": 3740 + }, + { + "epoch": 3.1875170759843354, + "grad_norm": 19266.4140625, + "learning_rate": 0.0008663366336633663, + "loss": 1.1673, + "step": 3750 + }, + { + "epoch": 3.1960171215202937, + "grad_norm": 18455.83203125, + "learning_rate": 0.0008653465346534654, + "loss": 1.1682, + "step": 3760 + }, + { + "epoch": 3.204517167056252, + "grad_norm": 17270.205078125, + "learning_rate": 0.0008643564356435643, + "loss": 1.1662, + "step": 3770 + }, + { + "epoch": 3.2130172125922103, + "grad_norm": 15437.96484375, + "learning_rate": 0.0008633663366336634, + "loss": 1.1654, + "step": 3780 + }, + { + "epoch": 3.2130172125922103, + "eval_accuracy": 0.7525794036416251, + "eval_loss": 1.0734323263168335, + "eval_runtime": 1278.3867, + "eval_samples_per_second": 390.631, + "eval_steps_per_second": 1.356, + "step": 3780 + }, + { + "epoch": 3.2215172581281686, + "grad_norm": 16193.529296875, + "learning_rate": 0.0008623762376237623, + "loss": 1.1628, + "step": 3790 + }, + { + "epoch": 3.230017303664127, + "grad_norm": 17204.466796875, + "learning_rate": 0.0008613861386138614, + "loss": 1.1631, + "step": 3800 + }, + { + "epoch": 3.238517349200085, + "grad_norm": 17406.2265625, + "learning_rate": 0.0008603960396039604, + "loss": 1.1621, + "step": 3810 + }, + { + "epoch": 3.2470173947360434, + "grad_norm": 17054.076171875, + "learning_rate": 0.0008594059405940594, + "loss": 1.1614, + "step": 3820 + }, + { + "epoch": 3.2555174402720013, + "grad_norm": 18476.68359375, + "learning_rate": 0.0008584158415841584, + "loss": 1.1606, + "step": 3830 + }, + { + "epoch": 3.2640174858079596, + "grad_norm": 19170.802734375, + "learning_rate": 0.0008574257425742574, + "loss": 1.1581, + "step": 3840 + }, + { + "epoch": 3.272517531343918, + "grad_norm": 17178.326171875, + "learning_rate": 0.0008564356435643564, + "loss": 1.1562, + "step": 3850 + }, + { + "epoch": 3.281017576879876, + "grad_norm": 17949.396484375, + "learning_rate": 0.0008554455445544555, + "loss": 1.1557, + "step": 3860 + }, + { + "epoch": 3.2895176224158345, + "grad_norm": 15490.787109375, + "learning_rate": 0.0008544554455445544, + "loss": 1.1571, + "step": 3870 + }, + { + "epoch": 3.2895176224158345, + "eval_accuracy": 0.754430037196134, + "eval_loss": 1.0631576776504517, + "eval_runtime": 1278.1739, + "eval_samples_per_second": 390.696, + "eval_steps_per_second": 1.357, + "step": 3870 + }, + { + "epoch": 3.2980176679517927, + "grad_norm": 17647.640625, + "learning_rate": 0.0008534653465346535, + "loss": 1.1552, + "step": 3880 + }, + { + "epoch": 3.306517713487751, + "grad_norm": 16109.29296875, + "learning_rate": 0.0008524752475247524, + "loss": 1.153, + "step": 3890 + }, + { + "epoch": 3.315017759023709, + "grad_norm": 19355.884765625, + "learning_rate": 0.0008514851485148515, + "loss": 1.1543, + "step": 3900 + }, + { + "epoch": 3.323517804559667, + "grad_norm": 15843.697265625, + "learning_rate": 0.0008504950495049505, + "loss": 1.1533, + "step": 3910 + }, + { + "epoch": 3.3320178500956255, + "grad_norm": 20818.416015625, + "learning_rate": 0.0008495049504950495, + "loss": 1.1497, + "step": 3920 + }, + { + "epoch": 3.3405178956315837, + "grad_norm": 17527.841796875, + "learning_rate": 0.0008485148514851485, + "loss": 1.1508, + "step": 3930 + }, + { + "epoch": 3.349017941167542, + "grad_norm": 20557.205078125, + "learning_rate": 0.0008475247524752475, + "loss": 1.1508, + "step": 3940 + }, + { + "epoch": 3.3575179867035003, + "grad_norm": 17984.611328125, + "learning_rate": 0.0008465346534653465, + "loss": 1.1462, + "step": 3950 + }, + { + "epoch": 3.3660180322394586, + "grad_norm": 15756.77734375, + "learning_rate": 0.0008455445544554456, + "loss": 1.1456, + "step": 3960 + }, + { + "epoch": 3.3660180322394586, + "eval_accuracy": 0.7564186600937407, + "eval_loss": 1.0549671649932861, + "eval_runtime": 1283.4177, + "eval_samples_per_second": 389.1, + "eval_steps_per_second": 1.351, + "step": 3960 + }, + { + "epoch": 3.3745180777754165, + "grad_norm": 15412.625, + "learning_rate": 0.0008445544554455445, + "loss": 1.1451, + "step": 3970 + }, + { + "epoch": 3.3830181233113747, + "grad_norm": 16771.29296875, + "learning_rate": 0.0008435643564356436, + "loss": 1.1465, + "step": 3980 + }, + { + "epoch": 3.391518168847333, + "grad_norm": 18703.853515625, + "learning_rate": 0.0008425742574257425, + "loss": 1.1454, + "step": 3990 + }, + { + "epoch": 3.4000182143832913, + "grad_norm": 15575.3798828125, + "learning_rate": 0.0008415841584158416, + "loss": 1.1453, + "step": 4000 + }, + { + "epoch": 3.4085182599192496, + "grad_norm": 18316.923828125, + "learning_rate": 0.0008405940594059406, + "loss": 1.1408, + "step": 4010 + }, + { + "epoch": 3.417018305455208, + "grad_norm": 18408.486328125, + "learning_rate": 0.0008396039603960396, + "loss": 1.1432, + "step": 4020 + }, + { + "epoch": 3.425518350991166, + "grad_norm": 18312.7734375, + "learning_rate": 0.0008386138613861386, + "loss": 1.1404, + "step": 4030 + }, + { + "epoch": 3.434018396527124, + "grad_norm": 15673.939453125, + "learning_rate": 0.0008376237623762376, + "loss": 1.1406, + "step": 4040 + }, + { + "epoch": 3.4425184420630823, + "grad_norm": 18445.818359375, + "learning_rate": 0.0008366336633663366, + "loss": 1.1391, + "step": 4050 + }, + { + "epoch": 3.4425184420630823, + "eval_accuracy": 0.7579978835836059, + "eval_loss": 1.0442180633544922, + "eval_runtime": 1278.59, + "eval_samples_per_second": 390.569, + "eval_steps_per_second": 1.356, + "step": 4050 + }, + { + "epoch": 3.4510184875990406, + "grad_norm": 21039.810546875, + "learning_rate": 0.0008356435643564357, + "loss": 1.1395, + "step": 4060 + }, + { + "epoch": 3.459518533134999, + "grad_norm": 15481.697265625, + "learning_rate": 0.0008346534653465346, + "loss": 1.1379, + "step": 4070 + }, + { + "epoch": 3.468018578670957, + "grad_norm": 17034.869140625, + "learning_rate": 0.0008336633663366337, + "loss": 1.1365, + "step": 4080 + }, + { + "epoch": 3.4765186242069155, + "grad_norm": 17020.0703125, + "learning_rate": 0.0008326732673267326, + "loss": 1.1331, + "step": 4090 + }, + { + "epoch": 3.4850186697428738, + "grad_norm": 19914.296875, + "learning_rate": 0.0008316831683168317, + "loss": 1.1352, + "step": 4100 + }, + { + "epoch": 3.4935187152788316, + "grad_norm": 19380.546875, + "learning_rate": 0.0008306930693069307, + "loss": 1.1334, + "step": 4110 + }, + { + "epoch": 3.5020187608147904, + "grad_norm": 16126.1982421875, + "learning_rate": 0.0008297029702970297, + "loss": 1.133, + "step": 4120 + }, + { + "epoch": 3.510518806350748, + "grad_norm": 16914.3046875, + "learning_rate": 0.0008287128712871287, + "loss": 1.1328, + "step": 4130 + }, + { + "epoch": 3.5190188518867065, + "grad_norm": 15453.5771484375, + "learning_rate": 0.0008277227722772277, + "loss": 1.1302, + "step": 4140 + }, + { + "epoch": 3.5190188518867065, + "eval_accuracy": 0.7592944501997413, + "eval_loss": 1.0387505292892456, + "eval_runtime": 1280.9395, + "eval_samples_per_second": 389.853, + "eval_steps_per_second": 1.354, + "step": 4140 + }, + { + "epoch": 3.527518897422665, + "grad_norm": 17636.76171875, + "learning_rate": 0.0008267326732673267, + "loss": 1.1313, + "step": 4150 + }, + { + "epoch": 3.536018942958623, + "grad_norm": 16709.236328125, + "learning_rate": 0.0008257425742574258, + "loss": 1.1307, + "step": 4160 + }, + { + "epoch": 3.5445189884945814, + "grad_norm": 18499.865234375, + "learning_rate": 0.0008247524752475247, + "loss": 1.1294, + "step": 4170 + }, + { + "epoch": 3.553019034030539, + "grad_norm": 16481.85546875, + "learning_rate": 0.0008237623762376238, + "loss": 1.1276, + "step": 4180 + }, + { + "epoch": 3.561519079566498, + "grad_norm": 16035.626953125, + "learning_rate": 0.0008227722772277227, + "loss": 1.1259, + "step": 4190 + }, + { + "epoch": 3.570019125102456, + "grad_norm": 17429.3046875, + "learning_rate": 0.0008217821782178218, + "loss": 1.1281, + "step": 4200 + }, + { + "epoch": 3.578519170638414, + "grad_norm": 17563.748046875, + "learning_rate": 0.0008207920792079208, + "loss": 1.1244, + "step": 4210 + }, + { + "epoch": 3.5870192161743724, + "grad_norm": 17140.78125, + "learning_rate": 0.0008198019801980197, + "loss": 1.1264, + "step": 4220 + }, + { + "epoch": 3.5955192617103306, + "grad_norm": 18145.185546875, + "learning_rate": 0.0008188118811881188, + "loss": 1.1229, + "step": 4230 + }, + { + "epoch": 3.5955192617103306, + "eval_accuracy": 0.7607957295247235, + "eval_loss": 1.0331238508224487, + "eval_runtime": 1281.4596, + "eval_samples_per_second": 389.695, + "eval_steps_per_second": 1.353, + "step": 4230 + }, + { + "epoch": 3.604019307246289, + "grad_norm": 14324.3125, + "learning_rate": 0.0008178217821782177, + "loss": 1.124, + "step": 4240 + }, + { + "epoch": 3.612519352782247, + "grad_norm": 19569.59375, + "learning_rate": 0.0008168316831683168, + "loss": 1.124, + "step": 4250 + }, + { + "epoch": 3.6210193983182055, + "grad_norm": 18639.96484375, + "learning_rate": 0.0008158415841584159, + "loss": 1.1221, + "step": 4260 + }, + { + "epoch": 3.6295194438541634, + "grad_norm": 17077.609375, + "learning_rate": 0.0008148514851485148, + "loss": 1.1177, + "step": 4270 + }, + { + "epoch": 3.6380194893901217, + "grad_norm": 19580.677734375, + "learning_rate": 0.0008138613861386138, + "loss": 1.1208, + "step": 4280 + }, + { + "epoch": 3.64651953492608, + "grad_norm": 14986.6796875, + "learning_rate": 0.0008128712871287128, + "loss": 1.1192, + "step": 4290 + }, + { + "epoch": 3.6550195804620382, + "grad_norm": 16046.3740234375, + "learning_rate": 0.000811881188118812, + "loss": 1.1193, + "step": 4300 + }, + { + "epoch": 3.6635196259979965, + "grad_norm": 14974.083984375, + "learning_rate": 0.000810891089108911, + "loss": 1.1162, + "step": 4310 + }, + { + "epoch": 3.672019671533955, + "grad_norm": 17986.744140625, + "learning_rate": 0.00080990099009901, + "loss": 1.1183, + "step": 4320 + }, + { + "epoch": 3.672019671533955, + "eval_accuracy": 0.7619108635382325, + "eval_loss": 1.0262423753738403, + "eval_runtime": 1281.4568, + "eval_samples_per_second": 389.696, + "eval_steps_per_second": 1.353, + "step": 4320 + }, + { + "epoch": 3.680519717069913, + "grad_norm": 19318.87109375, + "learning_rate": 0.000808910891089109, + "loss": 1.1153, + "step": 4330 + }, + { + "epoch": 3.689019762605871, + "grad_norm": 18132.083984375, + "learning_rate": 0.0008079207920792079, + "loss": 1.1155, + "step": 4340 + }, + { + "epoch": 3.6975198081418292, + "grad_norm": 15848.3271484375, + "learning_rate": 0.000806930693069307, + "loss": 1.1172, + "step": 4350 + }, + { + "epoch": 3.7060198536777875, + "grad_norm": 16213.193359375, + "learning_rate": 0.000805940594059406, + "loss": 1.1136, + "step": 4360 + }, + { + "epoch": 3.714519899213746, + "grad_norm": 18759.615234375, + "learning_rate": 0.000804950495049505, + "loss": 1.1129, + "step": 4370 + }, + { + "epoch": 3.723019944749704, + "grad_norm": 19659.138671875, + "learning_rate": 0.000803960396039604, + "loss": 1.115, + "step": 4380 + }, + { + "epoch": 3.7315199902856624, + "grad_norm": 19299.97265625, + "learning_rate": 0.000802970297029703, + "loss": 1.1124, + "step": 4390 + }, + { + "epoch": 3.7400200358216207, + "grad_norm": 15511.5537109375, + "learning_rate": 0.000801980198019802, + "loss": 1.1121, + "step": 4400 + }, + { + "epoch": 3.7485200813575785, + "grad_norm": 18630.70703125, + "learning_rate": 0.0008009900990099011, + "loss": 1.1106, + "step": 4410 + }, + { + "epoch": 3.7485200813575785, + "eval_accuracy": 0.763328105961178, + "eval_loss": 1.0201424360275269, + "eval_runtime": 1277.6344, + "eval_samples_per_second": 390.861, + "eval_steps_per_second": 1.357, + "step": 4410 + }, + { + "epoch": 3.757020126893537, + "grad_norm": 17692.544921875, + "learning_rate": 0.0008, + "loss": 1.1116, + "step": 4420 + }, + { + "epoch": 3.765520172429495, + "grad_norm": 14010.234375, + "learning_rate": 0.0007990099009900991, + "loss": 1.1089, + "step": 4430 + }, + { + "epoch": 3.7740202179654534, + "grad_norm": 18022.48828125, + "learning_rate": 0.000798019801980198, + "loss": 1.1082, + "step": 4440 + }, + { + "epoch": 3.7825202635014117, + "grad_norm": 16178.5849609375, + "learning_rate": 0.0007970297029702971, + "loss": 1.108, + "step": 4450 + }, + { + "epoch": 3.79102030903737, + "grad_norm": 16874.7734375, + "learning_rate": 0.0007960396039603961, + "loss": 1.1071, + "step": 4460 + }, + { + "epoch": 3.7995203545733283, + "grad_norm": 16226.3564453125, + "learning_rate": 0.0007950495049504951, + "loss": 1.1052, + "step": 4470 + }, + { + "epoch": 3.808020400109286, + "grad_norm": 21136.931640625, + "learning_rate": 0.0007940594059405941, + "loss": 1.1071, + "step": 4480 + }, + { + "epoch": 3.8165204456452444, + "grad_norm": 19177.98828125, + "learning_rate": 0.0007930693069306931, + "loss": 1.1048, + "step": 4490 + }, + { + "epoch": 3.8250204911812027, + "grad_norm": 16423.876953125, + "learning_rate": 0.0007920792079207921, + "loss": 1.1021, + "step": 4500 + }, + { + "epoch": 3.8250204911812027, + "eval_accuracy": 0.7649368216993604, + "eval_loss": 1.0112674236297607, + "eval_runtime": 1282.4987, + "eval_samples_per_second": 389.379, + "eval_steps_per_second": 1.352, + "step": 4500 + }, + { + "epoch": 3.833520536717161, + "grad_norm": 15636.1162109375, + "learning_rate": 0.0007910891089108912, + "loss": 1.1007, + "step": 4510 + }, + { + "epoch": 3.8420205822531193, + "grad_norm": 16542.201171875, + "learning_rate": 0.0007900990099009901, + "loss": 1.1032, + "step": 4520 + }, + { + "epoch": 3.8505206277890776, + "grad_norm": 17437.408203125, + "learning_rate": 0.0007891089108910892, + "loss": 1.1015, + "step": 4530 + }, + { + "epoch": 3.859020673325036, + "grad_norm": 19502.89453125, + "learning_rate": 0.0007881188118811881, + "loss": 1.1025, + "step": 4540 + }, + { + "epoch": 3.8675207188609937, + "grad_norm": 18670.546875, + "learning_rate": 0.0007871287128712872, + "loss": 1.0993, + "step": 4550 + }, + { + "epoch": 3.876020764396952, + "grad_norm": 18770.947265625, + "learning_rate": 0.0007861386138613862, + "loss": 1.1026, + "step": 4560 + }, + { + "epoch": 3.8845208099329103, + "grad_norm": 17557.958984375, + "learning_rate": 0.0007851485148514852, + "loss": 1.0983, + "step": 4570 + }, + { + "epoch": 3.8930208554688686, + "grad_norm": 18551.2265625, + "learning_rate": 0.0007841584158415842, + "loss": 1.1001, + "step": 4580 + }, + { + "epoch": 3.901520901004827, + "grad_norm": 17471.43359375, + "learning_rate": 0.0007831683168316832, + "loss": 1.097, + "step": 4590 + }, + { + "epoch": 3.901520901004827, + "eval_accuracy": 0.7659643920785019, + "eval_loss": 1.0095082521438599, + "eval_runtime": 1281.981, + "eval_samples_per_second": 389.536, + "eval_steps_per_second": 1.353, + "step": 4590 + }, + { + "epoch": 3.910020946540785, + "grad_norm": 17272.111328125, + "learning_rate": 0.0007821782178217822, + "loss": 1.0978, + "step": 4600 + }, + { + "epoch": 3.9185209920767434, + "grad_norm": 13404.1982421875, + "learning_rate": 0.0007811881188118813, + "loss": 1.0995, + "step": 4610 + }, + { + "epoch": 3.9270210376127013, + "grad_norm": 14501.0966796875, + "learning_rate": 0.0007801980198019802, + "loss": 1.0989, + "step": 4620 + }, + { + "epoch": 3.9355210831486596, + "grad_norm": 19314.373046875, + "learning_rate": 0.0007792079207920793, + "loss": 1.0949, + "step": 4630 + }, + { + "epoch": 3.944021128684618, + "grad_norm": 17886.853515625, + "learning_rate": 0.0007782178217821782, + "loss": 1.097, + "step": 4640 + }, + { + "epoch": 3.952521174220576, + "grad_norm": 16101.6513671875, + "learning_rate": 0.0007772277227722773, + "loss": 1.0922, + "step": 4650 + }, + { + "epoch": 3.9610212197565344, + "grad_norm": 16483.103515625, + "learning_rate": 0.0007762376237623763, + "loss": 1.094, + "step": 4660 + }, + { + "epoch": 3.9695212652924927, + "grad_norm": 17740.6640625, + "learning_rate": 0.0007752475247524753, + "loss": 1.0923, + "step": 4670 + }, + { + "epoch": 3.978021310828451, + "grad_norm": 18332.16015625, + "learning_rate": 0.0007742574257425743, + "loss": 1.0932, + "step": 4680 + }, + { + "epoch": 3.978021310828451, + "eval_accuracy": 0.7671802734913508, + "eval_loss": 1.0010571479797363, + "eval_runtime": 1282.4979, + "eval_samples_per_second": 389.379, + "eval_steps_per_second": 1.352, + "step": 4680 + }, + { + "epoch": 3.986521356364409, + "grad_norm": 15639.021484375, + "learning_rate": 0.0007732673267326733, + "loss": 1.0928, + "step": 4690 + }, + { + "epoch": 3.9950214019003676, + "grad_norm": 15639.115234375, + "learning_rate": 0.0007722772277227723, + "loss": 1.0929, + "step": 4700 + }, + { + "epoch": 4.003521447436325, + "grad_norm": 16150.44140625, + "learning_rate": 0.0007712871287128714, + "loss": 1.0907, + "step": 4710 + }, + { + "epoch": 4.012021492972284, + "grad_norm": 20656.818359375, + "learning_rate": 0.0007702970297029703, + "loss": 1.0894, + "step": 4720 + }, + { + "epoch": 4.020521538508242, + "grad_norm": 18745.37890625, + "learning_rate": 0.0007693069306930694, + "loss": 1.0919, + "step": 4730 + }, + { + "epoch": 4.0290215840442, + "grad_norm": 16387.470703125, + "learning_rate": 0.0007683168316831683, + "loss": 1.09, + "step": 4740 + }, + { + "epoch": 4.037521629580159, + "grad_norm": 18658.521484375, + "learning_rate": 0.0007673267326732674, + "loss": 1.0873, + "step": 4750 + }, + { + "epoch": 4.046021675116116, + "grad_norm": 19641.572265625, + "learning_rate": 0.0007663366336633664, + "loss": 1.0886, + "step": 4760 + }, + { + "epoch": 4.054521720652075, + "grad_norm": 16653.2578125, + "learning_rate": 0.0007653465346534654, + "loss": 1.0873, + "step": 4770 + }, + { + "epoch": 4.054521720652075, + "eval_accuracy": 0.7683563302945906, + "eval_loss": 0.9970803260803223, + "eval_runtime": 1282.3032, + "eval_samples_per_second": 389.438, + "eval_steps_per_second": 1.352, + "step": 4770 + }, + { + "epoch": 4.063021766188033, + "grad_norm": 17606.6015625, + "learning_rate": 0.0007643564356435644, + "loss": 1.0889, + "step": 4780 + }, + { + "epoch": 4.071521811723992, + "grad_norm": 17704.697265625, + "learning_rate": 0.0007633663366336634, + "loss": 1.0869, + "step": 4790 + }, + { + "epoch": 4.08002185725995, + "grad_norm": 20424.306640625, + "learning_rate": 0.0007623762376237624, + "loss": 1.0861, + "step": 4800 + }, + { + "epoch": 4.088521902795907, + "grad_norm": 15488.392578125, + "learning_rate": 0.0007613861386138615, + "loss": 1.0858, + "step": 4810 + }, + { + "epoch": 4.097021948331866, + "grad_norm": 18205.36328125, + "learning_rate": 0.0007603960396039604, + "loss": 1.0847, + "step": 4820 + }, + { + "epoch": 4.105521993867824, + "grad_norm": 15557.935546875, + "learning_rate": 0.0007594059405940595, + "loss": 1.0845, + "step": 4830 + }, + { + "epoch": 4.114022039403783, + "grad_norm": 17741.306640625, + "learning_rate": 0.0007584158415841584, + "loss": 1.0837, + "step": 4840 + }, + { + "epoch": 4.122522084939741, + "grad_norm": 21305.34765625, + "learning_rate": 0.0007574257425742574, + "loss": 1.0841, + "step": 4850 + }, + { + "epoch": 4.131022130475699, + "grad_norm": 20442.0390625, + "learning_rate": 0.0007564356435643565, + "loss": 1.0813, + "step": 4860 + }, + { + "epoch": 4.131022130475699, + "eval_accuracy": 0.769627324769287, + "eval_loss": 0.9902046322822571, + "eval_runtime": 1278.8399, + "eval_samples_per_second": 390.493, + "eval_steps_per_second": 1.356, + "step": 4860 + }, + { + "epoch": 4.139522176011657, + "grad_norm": 19991.08203125, + "learning_rate": 0.0007554455445544554, + "loss": 1.0809, + "step": 4870 + }, + { + "epoch": 4.148022221547615, + "grad_norm": 21327.169921875, + "learning_rate": 0.0007544554455445545, + "loss": 1.0822, + "step": 4880 + }, + { + "epoch": 4.156522267083574, + "grad_norm": 20165.03515625, + "learning_rate": 0.0007534653465346534, + "loss": 1.0824, + "step": 4890 + }, + { + "epoch": 4.165022312619532, + "grad_norm": 15693.8251953125, + "learning_rate": 0.0007524752475247525, + "loss": 1.0786, + "step": 4900 + }, + { + "epoch": 4.17352235815549, + "grad_norm": 18861.962890625, + "learning_rate": 0.0007514851485148515, + "loss": 1.0802, + "step": 4910 + }, + { + "epoch": 4.182022403691448, + "grad_norm": 18349.630859375, + "learning_rate": 0.0007504950495049505, + "loss": 1.0811, + "step": 4920 + }, + { + "epoch": 4.190522449227407, + "grad_norm": 16894.869140625, + "learning_rate": 0.0007495049504950495, + "loss": 1.0784, + "step": 4930 + }, + { + "epoch": 4.199022494763365, + "grad_norm": 15264.0703125, + "learning_rate": 0.0007485148514851485, + "loss": 1.0779, + "step": 4940 + }, + { + "epoch": 4.207522540299323, + "grad_norm": 18500.328125, + "learning_rate": 0.0007475247524752475, + "loss": 1.0769, + "step": 4950 + }, + { + "epoch": 4.207522540299323, + "eval_accuracy": 0.7703979549467554, + "eval_loss": 0.9842762351036072, + "eval_runtime": 1282.9458, + "eval_samples_per_second": 389.243, + "eval_steps_per_second": 1.352, + "step": 4950 + }, + { + "epoch": 4.216022585835281, + "grad_norm": 16738.04296875, + "learning_rate": 0.0007465346534653466, + "loss": 1.0763, + "step": 4960 + }, + { + "epoch": 4.224522631371239, + "grad_norm": 15980.9921875, + "learning_rate": 0.0007455445544554455, + "loss": 1.0779, + "step": 4970 + }, + { + "epoch": 4.233022676907198, + "grad_norm": 17264.802734375, + "learning_rate": 0.0007445544554455446, + "loss": 1.0752, + "step": 4980 + }, + { + "epoch": 4.241522722443156, + "grad_norm": 19698.16796875, + "learning_rate": 0.0007435643564356435, + "loss": 1.0726, + "step": 4990 + }, + { + "epoch": 4.2500227679791145, + "grad_norm": 16336.4052734375, + "learning_rate": 0.0007425742574257426, + "loss": 1.0749, + "step": 5000 + }, + { + "epoch": 4.258522813515072, + "grad_norm": 17774.734375, + "learning_rate": 0.0007415841584158416, + "loss": 1.073, + "step": 5010 + }, + { + "epoch": 4.26702285905103, + "grad_norm": 19482.98046875, + "learning_rate": 0.0007405940594059406, + "loss": 1.0754, + "step": 5020 + }, + { + "epoch": 4.275522904586989, + "grad_norm": 17985.568359375, + "learning_rate": 0.0007396039603960396, + "loss": 1.072, + "step": 5030 + }, + { + "epoch": 4.284022950122947, + "grad_norm": 16432.27734375, + "learning_rate": 0.0007386138613861386, + "loss": 1.0737, + "step": 5040 + }, + { + "epoch": 4.284022950122947, + "eval_accuracy": 0.7713623846481911, + "eval_loss": 0.982131838798523, + "eval_runtime": 1279.4531, + "eval_samples_per_second": 390.306, + "eval_steps_per_second": 1.355, + "step": 5040 + }, + { + "epoch": 4.2925229956589055, + "grad_norm": 18672.43359375, + "learning_rate": 0.0007376237623762376, + "loss": 1.0719, + "step": 5050 + }, + { + "epoch": 4.301023041194863, + "grad_norm": 15502.23828125, + "learning_rate": 0.0007366336633663367, + "loss": 1.0719, + "step": 5060 + }, + { + "epoch": 4.309523086730822, + "grad_norm": 17056.31640625, + "learning_rate": 0.0007356435643564356, + "loss": 1.0708, + "step": 5070 + }, + { + "epoch": 4.31802313226678, + "grad_norm": 17404.642578125, + "learning_rate": 0.0007346534653465347, + "loss": 1.0711, + "step": 5080 + }, + { + "epoch": 4.326523177802738, + "grad_norm": 16851.5390625, + "learning_rate": 0.0007336633663366336, + "loss": 1.0711, + "step": 5090 + }, + { + "epoch": 4.3350232233386965, + "grad_norm": 16218.2109375, + "learning_rate": 0.0007326732673267327, + "loss": 1.0705, + "step": 5100 + }, + { + "epoch": 4.343523268874654, + "grad_norm": 15998.1015625, + "learning_rate": 0.0007316831683168317, + "loss": 1.0706, + "step": 5110 + }, + { + "epoch": 4.352023314410613, + "grad_norm": 17217.677734375, + "learning_rate": 0.0007306930693069307, + "loss": 1.0688, + "step": 5120 + }, + { + "epoch": 4.360523359946571, + "grad_norm": 20126.892578125, + "learning_rate": 0.0007297029702970297, + "loss": 1.0687, + "step": 5130 + }, + { + "epoch": 4.360523359946571, + "eval_accuracy": 0.7720324660360576, + "eval_loss": 0.9775763154029846, + "eval_runtime": 1279.545, + "eval_samples_per_second": 390.278, + "eval_steps_per_second": 1.355, + "step": 5130 + }, + { + "epoch": 4.36902340548253, + "grad_norm": 18316.3203125, + "learning_rate": 0.0007287128712871287, + "loss": 1.0677, + "step": 5140 + }, + { + "epoch": 4.3775234510184875, + "grad_norm": 19130.66796875, + "learning_rate": 0.0007277227722772277, + "loss": 1.0673, + "step": 5150 + }, + { + "epoch": 4.386023496554446, + "grad_norm": 16458.451171875, + "learning_rate": 0.0007267326732673268, + "loss": 1.0662, + "step": 5160 + }, + { + "epoch": 4.394523542090404, + "grad_norm": 15268.9111328125, + "learning_rate": 0.0007257425742574257, + "loss": 1.0674, + "step": 5170 + }, + { + "epoch": 4.403023587626362, + "grad_norm": 15841.1025390625, + "learning_rate": 0.0007247524752475248, + "loss": 1.0661, + "step": 5180 + }, + { + "epoch": 4.411523633162321, + "grad_norm": 15076.5078125, + "learning_rate": 0.0007237623762376237, + "loss": 1.0656, + "step": 5190 + }, + { + "epoch": 4.4200236786982785, + "grad_norm": 17972.560546875, + "learning_rate": 0.0007227722772277228, + "loss": 1.0653, + "step": 5200 + }, + { + "epoch": 4.428523724234237, + "grad_norm": 17051.736328125, + "learning_rate": 0.0007217821782178218, + "loss": 1.0642, + "step": 5210 + }, + { + "epoch": 4.437023769770195, + "grad_norm": 18644.025390625, + "learning_rate": 0.0007207920792079208, + "loss": 1.0635, + "step": 5220 + }, + { + "epoch": 4.437023769770195, + "eval_accuracy": 0.7729909297402872, + "eval_loss": 0.9726957678794861, + "eval_runtime": 1279.9366, + "eval_samples_per_second": 390.158, + "eval_steps_per_second": 1.355, + "step": 5220 + }, + { + "epoch": 4.445523815306154, + "grad_norm": 17667.255859375, + "learning_rate": 0.0007198019801980198, + "loss": 1.0627, + "step": 5230 + }, + { + "epoch": 4.454023860842112, + "grad_norm": 16890.02734375, + "learning_rate": 0.0007188118811881188, + "loss": 1.0621, + "step": 5240 + }, + { + "epoch": 4.4625239063780695, + "grad_norm": 17795.29296875, + "learning_rate": 0.0007178217821782178, + "loss": 1.0609, + "step": 5250 + }, + { + "epoch": 4.471023951914028, + "grad_norm": 17139.46875, + "learning_rate": 0.0007168316831683169, + "loss": 1.0626, + "step": 5260 + }, + { + "epoch": 4.479523997449986, + "grad_norm": 16425.12109375, + "learning_rate": 0.0007158415841584158, + "loss": 1.0611, + "step": 5270 + }, + { + "epoch": 4.488024042985945, + "grad_norm": 20427.578125, + "learning_rate": 0.0007148514851485149, + "loss": 1.0575, + "step": 5280 + }, + { + "epoch": 4.496524088521903, + "grad_norm": 21385.232421875, + "learning_rate": 0.0007138613861386138, + "loss": 1.0609, + "step": 5290 + }, + { + "epoch": 4.505024134057861, + "grad_norm": 16601.791015625, + "learning_rate": 0.0007128712871287129, + "loss": 1.0594, + "step": 5300 + }, + { + "epoch": 4.513524179593819, + "grad_norm": 17113.41015625, + "learning_rate": 0.0007118811881188119, + "loss": 1.0583, + "step": 5310 + }, + { + "epoch": 4.513524179593819, + "eval_accuracy": 0.7740536654886605, + "eval_loss": 0.9671040773391724, + "eval_runtime": 1277.3232, + "eval_samples_per_second": 390.957, + "eval_steps_per_second": 1.358, + "step": 5310 + }, + { + "epoch": 4.522024225129778, + "grad_norm": 18315.3984375, + "learning_rate": 0.0007108910891089109, + "loss": 1.0581, + "step": 5320 + }, + { + "epoch": 4.530524270665736, + "grad_norm": 17246.2890625, + "learning_rate": 0.0007099009900990099, + "loss": 1.0594, + "step": 5330 + }, + { + "epoch": 4.539024316201694, + "grad_norm": 19144.822265625, + "learning_rate": 0.0007089108910891088, + "loss": 1.0577, + "step": 5340 + }, + { + "epoch": 4.547524361737652, + "grad_norm": 17691.115234375, + "learning_rate": 0.0007079207920792079, + "loss": 1.0581, + "step": 5350 + }, + { + "epoch": 4.55602440727361, + "grad_norm": 17498.75, + "learning_rate": 0.000706930693069307, + "loss": 1.0567, + "step": 5360 + }, + { + "epoch": 4.564524452809569, + "grad_norm": 19553.53515625, + "learning_rate": 0.0007059405940594059, + "loss": 1.0565, + "step": 5370 + }, + { + "epoch": 4.573024498345527, + "grad_norm": 17747.8359375, + "learning_rate": 0.000704950495049505, + "loss": 1.0603, + "step": 5380 + }, + { + "epoch": 4.581524543881486, + "grad_norm": 17742.814453125, + "learning_rate": 0.0007039603960396039, + "loss": 1.0567, + "step": 5390 + }, + { + "epoch": 4.590024589417443, + "grad_norm": 18410.59765625, + "learning_rate": 0.0007029702970297029, + "loss": 1.0561, + "step": 5400 + }, + { + "epoch": 4.590024589417443, + "eval_accuracy": 0.7741942335122652, + "eval_loss": 0.968053936958313, + "eval_runtime": 1278.1291, + "eval_samples_per_second": 390.71, + "eval_steps_per_second": 1.357, + "step": 5400 + }, + { + "epoch": 4.598524634953401, + "grad_norm": 14357.056640625, + "learning_rate": 0.000701980198019802, + "loss": 1.0544, + "step": 5410 + }, + { + "epoch": 4.60702468048936, + "grad_norm": 18278.533203125, + "learning_rate": 0.0007009900990099009, + "loss": 1.0531, + "step": 5420 + }, + { + "epoch": 4.615524726025318, + "grad_norm": 18012.599609375, + "learning_rate": 0.0007, + "loss": 1.0567, + "step": 5430 + }, + { + "epoch": 4.624024771561277, + "grad_norm": 18881.583984375, + "learning_rate": 0.0006990099009900989, + "loss": 1.0548, + "step": 5440 + }, + { + "epoch": 4.632524817097234, + "grad_norm": 19899.90234375, + "learning_rate": 0.000698019801980198, + "loss": 1.0541, + "step": 5450 + }, + { + "epoch": 4.641024862633193, + "grad_norm": 15934.2138671875, + "learning_rate": 0.000697029702970297, + "loss": 1.0536, + "step": 5460 + }, + { + "epoch": 4.649524908169151, + "grad_norm": 15930.8544921875, + "learning_rate": 0.000696039603960396, + "loss": 1.0529, + "step": 5470 + }, + { + "epoch": 4.658024953705109, + "grad_norm": 18346.009765625, + "learning_rate": 0.000695049504950495, + "loss": 1.0507, + "step": 5480 + }, + { + "epoch": 4.666524999241068, + "grad_norm": 17350.013671875, + "learning_rate": 0.000694059405940594, + "loss": 1.0528, + "step": 5490 + }, + { + "epoch": 4.666524999241068, + "eval_accuracy": 0.7752448834285515, + "eval_loss": 0.9624823927879333, + "eval_runtime": 1279.2263, + "eval_samples_per_second": 390.375, + "eval_steps_per_second": 1.356, + "step": 5490 + }, + { + "epoch": 4.675025044777025, + "grad_norm": 17104.44921875, + "learning_rate": 0.000693069306930693, + "loss": 1.0502, + "step": 5500 + }, + { + "epoch": 4.683525090312984, + "grad_norm": 16279.3994140625, + "learning_rate": 0.0006920792079207921, + "loss": 1.0525, + "step": 5510 + }, + { + "epoch": 4.692025135848942, + "grad_norm": 17741.423828125, + "learning_rate": 0.000691089108910891, + "loss": 1.0495, + "step": 5520 + }, + { + "epoch": 4.700525181384901, + "grad_norm": 18454.111328125, + "learning_rate": 0.0006900990099009901, + "loss": 1.0501, + "step": 5530 + }, + { + "epoch": 4.709025226920859, + "grad_norm": 20272.701171875, + "learning_rate": 0.000689108910891089, + "loss": 1.0518, + "step": 5540 + }, + { + "epoch": 4.717525272456816, + "grad_norm": 17187.880859375, + "learning_rate": 0.0006881188118811881, + "loss": 1.0498, + "step": 5550 + }, + { + "epoch": 4.726025317992775, + "grad_norm": 16848.552734375, + "learning_rate": 0.0006871287128712872, + "loss": 1.0475, + "step": 5560 + }, + { + "epoch": 4.734525363528733, + "grad_norm": 18908.748046875, + "learning_rate": 0.0006861386138613862, + "loss": 1.0477, + "step": 5570 + }, + { + "epoch": 4.743025409064692, + "grad_norm": 17731.822265625, + "learning_rate": 0.0006851485148514852, + "loss": 1.0496, + "step": 5580 + }, + { + "epoch": 4.743025409064692, + "eval_accuracy": 0.7761857833832465, + "eval_loss": 0.9594100117683411, + "eval_runtime": 1281.0251, + "eval_samples_per_second": 389.827, + "eval_steps_per_second": 1.354, + "step": 5580 + }, + { + "epoch": 4.75152545460065, + "grad_norm": 16594.486328125, + "learning_rate": 0.0006841584158415842, + "loss": 1.0479, + "step": 5590 + }, + { + "epoch": 4.760025500136608, + "grad_norm": 17834.193359375, + "learning_rate": 0.0006831683168316832, + "loss": 1.0465, + "step": 5600 + }, + { + "epoch": 4.768525545672566, + "grad_norm": 16431.404296875, + "learning_rate": 0.0006821782178217823, + "loss": 1.047, + "step": 5610 + }, + { + "epoch": 4.777025591208524, + "grad_norm": 18339.9921875, + "learning_rate": 0.0006811881188118812, + "loss": 1.0456, + "step": 5620 + }, + { + "epoch": 4.785525636744483, + "grad_norm": 15955.2890625, + "learning_rate": 0.0006801980198019803, + "loss": 1.0436, + "step": 5630 + }, + { + "epoch": 4.794025682280441, + "grad_norm": 18920.619140625, + "learning_rate": 0.0006792079207920792, + "loss": 1.0458, + "step": 5640 + }, + { + "epoch": 4.802525727816399, + "grad_norm": 16657.296875, + "learning_rate": 0.0006782178217821783, + "loss": 1.0452, + "step": 5650 + }, + { + "epoch": 4.811025773352357, + "grad_norm": 19700.88671875, + "learning_rate": 0.0006772277227722773, + "loss": 1.0473, + "step": 5660 + }, + { + "epoch": 4.819525818888316, + "grad_norm": 19387.7578125, + "learning_rate": 0.0006762376237623763, + "loss": 1.0458, + "step": 5670 + }, + { + "epoch": 4.819525818888316, + "eval_accuracy": 0.7771768462948865, + "eval_loss": 0.9521399736404419, + "eval_runtime": 1279.986, + "eval_samples_per_second": 390.143, + "eval_steps_per_second": 1.355, + "step": 5670 + }, + { + "epoch": 4.828025864424274, + "grad_norm": 18805.466796875, + "learning_rate": 0.0006752475247524753, + "loss": 1.0425, + "step": 5680 + }, + { + "epoch": 4.836525909960232, + "grad_norm": 17308.564453125, + "learning_rate": 0.0006742574257425743, + "loss": 1.043, + "step": 5690 + }, + { + "epoch": 4.84502595549619, + "grad_norm": 19598.796875, + "learning_rate": 0.0006732673267326733, + "loss": 1.0425, + "step": 5700 + }, + { + "epoch": 4.853526001032148, + "grad_norm": 17387.70703125, + "learning_rate": 0.0006722772277227724, + "loss": 1.0435, + "step": 5710 + }, + { + "epoch": 4.862026046568107, + "grad_norm": 18542.671875, + "learning_rate": 0.0006712871287128713, + "loss": 1.0429, + "step": 5720 + }, + { + "epoch": 4.870526092104065, + "grad_norm": 15447.505859375, + "learning_rate": 0.0006702970297029704, + "loss": 1.0428, + "step": 5730 + }, + { + "epoch": 4.8790261376400235, + "grad_norm": 19042.740234375, + "learning_rate": 0.0006693069306930693, + "loss": 1.0428, + "step": 5740 + }, + { + "epoch": 4.887526183175981, + "grad_norm": 19183.861328125, + "learning_rate": 0.0006683168316831684, + "loss": 1.042, + "step": 5750 + }, + { + "epoch": 4.896026228711939, + "grad_norm": 16694.583984375, + "learning_rate": 0.0006673267326732674, + "loss": 1.0407, + "step": 5760 + }, + { + "epoch": 4.896026228711939, + "eval_accuracy": 0.7776950393076092, + "eval_loss": 0.950414776802063, + "eval_runtime": 1280.3997, + "eval_samples_per_second": 390.017, + "eval_steps_per_second": 1.354, + "step": 5760 + }, + { + "epoch": 4.904526274247898, + "grad_norm": 16852.962890625, + "learning_rate": 0.0006663366336633664, + "loss": 1.0404, + "step": 5770 + }, + { + "epoch": 4.913026319783856, + "grad_norm": 20231.42578125, + "learning_rate": 0.0006653465346534654, + "loss": 1.0399, + "step": 5780 + }, + { + "epoch": 4.9215263653198145, + "grad_norm": 15834.216796875, + "learning_rate": 0.0006643564356435644, + "loss": 1.042, + "step": 5790 + }, + { + "epoch": 4.930026410855772, + "grad_norm": 15273.1630859375, + "learning_rate": 0.0006633663366336634, + "loss": 1.0397, + "step": 5800 + }, + { + "epoch": 4.938526456391731, + "grad_norm": 20834.515625, + "learning_rate": 0.0006623762376237625, + "loss": 1.0386, + "step": 5810 + }, + { + "epoch": 4.947026501927689, + "grad_norm": 21407.017578125, + "learning_rate": 0.0006613861386138614, + "loss": 1.0389, + "step": 5820 + }, + { + "epoch": 4.955526547463647, + "grad_norm": 19523.822265625, + "learning_rate": 0.0006603960396039605, + "loss": 1.038, + "step": 5830 + }, + { + "epoch": 4.9640265929996055, + "grad_norm": 18045.078125, + "learning_rate": 0.0006594059405940594, + "loss": 1.0359, + "step": 5840 + }, + { + "epoch": 4.972526638535563, + "grad_norm": 16661.830078125, + "learning_rate": 0.0006584158415841585, + "loss": 1.037, + "step": 5850 + }, + { + "epoch": 4.972526638535563, + "eval_accuracy": 0.7784280857816438, + "eval_loss": 0.9486609101295471, + "eval_runtime": 1279.799, + "eval_samples_per_second": 390.2, + "eval_steps_per_second": 1.355, + "step": 5850 + }, + { + "epoch": 4.981026684071522, + "grad_norm": 19275.583984375, + "learning_rate": 0.0006574257425742575, + "loss": 1.0376, + "step": 5860 + }, + { + "epoch": 4.98952672960748, + "grad_norm": 20361.98828125, + "learning_rate": 0.0006564356435643565, + "loss": 1.0386, + "step": 5870 + }, + { + "epoch": 4.998026775143439, + "grad_norm": 18276.357421875, + "learning_rate": 0.0006554455445544555, + "loss": 1.0364, + "step": 5880 + }, + { + "epoch": 5.0065268206793965, + "grad_norm": 18657.6484375, + "learning_rate": 0.0006544554455445545, + "loss": 1.0364, + "step": 5890 + }, + { + "epoch": 5.015026866215354, + "grad_norm": 17588.00390625, + "learning_rate": 0.0006534653465346535, + "loss": 1.0346, + "step": 5900 + }, + { + "epoch": 5.023526911751313, + "grad_norm": 20156.283203125, + "learning_rate": 0.0006524752475247526, + "loss": 1.0341, + "step": 5910 + }, + { + "epoch": 5.032026957287271, + "grad_norm": 18811.03125, + "learning_rate": 0.0006514851485148515, + "loss": 1.034, + "step": 5920 + }, + { + "epoch": 5.04052700282323, + "grad_norm": 18864.615234375, + "learning_rate": 0.0006504950495049506, + "loss": 1.0356, + "step": 5930 + }, + { + "epoch": 5.0490270483591875, + "grad_norm": 18199.67578125, + "learning_rate": 0.0006495049504950495, + "loss": 1.0339, + "step": 5940 + }, + { + "epoch": 5.0490270483591875, + "eval_accuracy": 0.7792297436274894, + "eval_loss": 0.9439004063606262, + "eval_runtime": 1279.6853, + "eval_samples_per_second": 390.235, + "eval_steps_per_second": 1.355, + "step": 5940 + }, + { + "epoch": 5.057527093895146, + "grad_norm": 19274.73828125, + "learning_rate": 0.0006485148514851485, + "loss": 1.0334, + "step": 5950 + }, + { + "epoch": 5.066027139431104, + "grad_norm": 20778.62109375, + "learning_rate": 0.0006475247524752476, + "loss": 1.0337, + "step": 5960 + }, + { + "epoch": 5.074527184967062, + "grad_norm": 19509.306640625, + "learning_rate": 0.0006465346534653465, + "loss": 1.0355, + "step": 5970 + }, + { + "epoch": 5.083027230503021, + "grad_norm": 18669.24609375, + "learning_rate": 0.0006455445544554456, + "loss": 1.0329, + "step": 5980 + }, + { + "epoch": 5.0915272760389785, + "grad_norm": 15668.9755859375, + "learning_rate": 0.0006445544554455445, + "loss": 1.0294, + "step": 5990 + }, + { + "epoch": 5.100027321574937, + "grad_norm": 18858.783203125, + "learning_rate": 0.0006435643564356436, + "loss": 1.0324, + "step": 6000 + }, + { + "epoch": 5.108527367110895, + "grad_norm": 16453.763671875, + "learning_rate": 0.0006425742574257426, + "loss": 1.033, + "step": 6010 + }, + { + "epoch": 5.117027412646854, + "grad_norm": 19821.6875, + "learning_rate": 0.0006415841584158416, + "loss": 1.0324, + "step": 6020 + }, + { + "epoch": 5.125527458182812, + "grad_norm": 18548.5078125, + "learning_rate": 0.0006405940594059406, + "loss": 1.0284, + "step": 6030 + }, + { + "epoch": 5.125527458182812, + "eval_accuracy": 0.7796817210719912, + "eval_loss": 0.9404940605163574, + "eval_runtime": 1281.8344, + "eval_samples_per_second": 389.581, + "eval_steps_per_second": 1.353, + "step": 6030 + }, + { + "epoch": 5.1340275037187695, + "grad_norm": 18221.74609375, + "learning_rate": 0.0006396039603960396, + "loss": 1.0309, + "step": 6040 + }, + { + "epoch": 5.142527549254728, + "grad_norm": 19257.5078125, + "learning_rate": 0.0006386138613861386, + "loss": 1.0316, + "step": 6050 + }, + { + "epoch": 5.151027594790686, + "grad_norm": 20660.0859375, + "learning_rate": 0.0006376237623762377, + "loss": 1.0281, + "step": 6060 + }, + { + "epoch": 5.159527640326645, + "grad_norm": 17646.05859375, + "learning_rate": 0.0006366336633663366, + "loss": 1.0298, + "step": 6070 + }, + { + "epoch": 5.168027685862603, + "grad_norm": 15755.4951171875, + "learning_rate": 0.0006356435643564357, + "loss": 1.0296, + "step": 6080 + }, + { + "epoch": 5.176527731398561, + "grad_norm": 19384.6484375, + "learning_rate": 0.0006346534653465346, + "loss": 1.0277, + "step": 6090 + }, + { + "epoch": 5.185027776934519, + "grad_norm": 16928.974609375, + "learning_rate": 0.0006336633663366337, + "loss": 1.0295, + "step": 6100 + }, + { + "epoch": 5.193527822470478, + "grad_norm": 15206.5625, + "learning_rate": 0.0006326732673267327, + "loss": 1.0288, + "step": 6110 + }, + { + "epoch": 5.202027868006436, + "grad_norm": 18075.7421875, + "learning_rate": 0.0006316831683168317, + "loss": 1.0288, + "step": 6120 + }, + { + "epoch": 5.202027868006436, + "eval_accuracy": 0.7805738833087461, + "eval_loss": 0.9371596574783325, + "eval_runtime": 1279.2071, + "eval_samples_per_second": 390.381, + "eval_steps_per_second": 1.356, + "step": 6120 + }, + { + "epoch": 5.210527913542394, + "grad_norm": 17331.333984375, + "learning_rate": 0.0006306930693069307, + "loss": 1.0245, + "step": 6130 + }, + { + "epoch": 5.219027959078352, + "grad_norm": 19757.05078125, + "learning_rate": 0.0006297029702970297, + "loss": 1.0266, + "step": 6140 + }, + { + "epoch": 5.22752800461431, + "grad_norm": 16021.1396484375, + "learning_rate": 0.0006287128712871287, + "loss": 1.0256, + "step": 6150 + }, + { + "epoch": 5.236028050150269, + "grad_norm": 16731.166015625, + "learning_rate": 0.0006277227722772278, + "loss": 1.0257, + "step": 6160 + }, + { + "epoch": 5.244528095686227, + "grad_norm": 17451.5390625, + "learning_rate": 0.0006267326732673267, + "loss": 1.0271, + "step": 6170 + }, + { + "epoch": 5.2530281412221855, + "grad_norm": 19212.619140625, + "learning_rate": 0.0006257425742574258, + "loss": 1.0255, + "step": 6180 + }, + { + "epoch": 5.261528186758143, + "grad_norm": 17397.1015625, + "learning_rate": 0.0006247524752475247, + "loss": 1.026, + "step": 6190 + }, + { + "epoch": 5.270028232294101, + "grad_norm": 18388.25, + "learning_rate": 0.0006237623762376238, + "loss": 1.0253, + "step": 6200 + }, + { + "epoch": 5.27852827783006, + "grad_norm": 18145.59765625, + "learning_rate": 0.0006227722772277228, + "loss": 1.0251, + "step": 6210 + }, + { + "epoch": 5.27852827783006, + "eval_accuracy": 0.7809789625861284, + "eval_loss": 0.9343125820159912, + "eval_runtime": 1280.9852, + "eval_samples_per_second": 389.839, + "eval_steps_per_second": 1.354, + "step": 6210 + }, + { + "epoch": 5.287028323366018, + "grad_norm": 18143.001953125, + "learning_rate": 0.0006217821782178218, + "loss": 1.0251, + "step": 6220 + }, + { + "epoch": 5.2955283689019765, + "grad_norm": 23030.259765625, + "learning_rate": 0.0006207920792079208, + "loss": 1.0248, + "step": 6230 + }, + { + "epoch": 5.304028414437934, + "grad_norm": 20499.400390625, + "learning_rate": 0.0006198019801980198, + "loss": 1.0239, + "step": 6240 + }, + { + "epoch": 5.312528459973893, + "grad_norm": 16773.736328125, + "learning_rate": 0.0006188118811881188, + "loss": 1.0251, + "step": 6250 + }, + { + "epoch": 5.321028505509851, + "grad_norm": 15036.5712890625, + "learning_rate": 0.0006178217821782179, + "loss": 1.0248, + "step": 6260 + }, + { + "epoch": 5.329528551045809, + "grad_norm": 18476.0703125, + "learning_rate": 0.0006168316831683168, + "loss": 1.0241, + "step": 6270 + }, + { + "epoch": 5.3380285965817675, + "grad_norm": 18741.55859375, + "learning_rate": 0.0006158415841584159, + "loss": 1.0238, + "step": 6280 + }, + { + "epoch": 5.346528642117725, + "grad_norm": 17602.275390625, + "learning_rate": 0.0006148514851485148, + "loss": 1.0222, + "step": 6290 + }, + { + "epoch": 5.355028687653684, + "grad_norm": 15750.1806640625, + "learning_rate": 0.0006138613861386139, + "loss": 1.0218, + "step": 6300 + }, + { + "epoch": 5.355028687653684, + "eval_accuracy": 0.7811827125583685, + "eval_loss": 0.9342101216316223, + "eval_runtime": 1278.6765, + "eval_samples_per_second": 390.543, + "eval_steps_per_second": 1.356, + "step": 6300 + }, + { + "epoch": 5.363528733189642, + "grad_norm": 15361.734375, + "learning_rate": 0.0006128712871287129, + "loss": 1.0212, + "step": 6310 + }, + { + "epoch": 5.372028778725601, + "grad_norm": 17438.470703125, + "learning_rate": 0.0006118811881188119, + "loss": 1.0217, + "step": 6320 + }, + { + "epoch": 5.3805288242615585, + "grad_norm": 16182.955078125, + "learning_rate": 0.0006108910891089109, + "loss": 1.0207, + "step": 6330 + }, + { + "epoch": 5.389028869797516, + "grad_norm": 15098.9970703125, + "learning_rate": 0.0006099009900990099, + "loss": 1.021, + "step": 6340 + }, + { + "epoch": 5.397528915333475, + "grad_norm": 16916.810546875, + "learning_rate": 0.0006089108910891089, + "loss": 1.0187, + "step": 6350 + }, + { + "epoch": 5.406028960869433, + "grad_norm": 15510.921875, + "learning_rate": 0.000607920792079208, + "loss": 1.0203, + "step": 6360 + }, + { + "epoch": 5.414529006405392, + "grad_norm": 18232.8671875, + "learning_rate": 0.0006069306930693069, + "loss": 1.0209, + "step": 6370 + }, + { + "epoch": 5.4230290519413495, + "grad_norm": 15943.5810546875, + "learning_rate": 0.000605940594059406, + "loss": 1.0189, + "step": 6380 + }, + { + "epoch": 5.431529097477308, + "grad_norm": 17515.39453125, + "learning_rate": 0.0006049504950495049, + "loss": 1.0185, + "step": 6390 + }, + { + "epoch": 5.431529097477308, + "eval_accuracy": 0.7821448982293225, + "eval_loss": 0.9290784597396851, + "eval_runtime": 1274.8156, + "eval_samples_per_second": 391.726, + "eval_steps_per_second": 1.36, + "step": 6390 + }, + { + "epoch": 5.4420023678698275, + "grad_norm": 17196.8828125, + "learning_rate": 0.000603960396039604, + "loss": 1.0209, + "step": 6400 + }, + { + "epoch": 5.450502413405786, + "grad_norm": 19976.57421875, + "learning_rate": 0.000602970297029703, + "loss": 1.0202, + "step": 6410 + }, + { + "epoch": 5.459002458941744, + "grad_norm": 16986.595703125, + "learning_rate": 0.000601980198019802, + "loss": 1.0194, + "step": 6420 + }, + { + "epoch": 5.467502504477703, + "grad_norm": 16322.2490234375, + "learning_rate": 0.000600990099009901, + "loss": 1.018, + "step": 6430 + }, + { + "epoch": 5.476002550013661, + "grad_norm": 16943.923828125, + "learning_rate": 0.0006, + "loss": 1.0173, + "step": 6440 + }, + { + "epoch": 5.484502595549619, + "grad_norm": 17515.416015625, + "learning_rate": 0.000599009900990099, + "loss": 1.0171, + "step": 6450 + }, + { + "epoch": 5.493002641085577, + "grad_norm": 20824.935546875, + "learning_rate": 0.000598019801980198, + "loss": 1.0167, + "step": 6460 + }, + { + "epoch": 5.501502686621535, + "grad_norm": 19812.29296875, + "learning_rate": 0.000597029702970297, + "loss": 1.017, + "step": 6470 + }, + { + "epoch": 5.510002732157494, + "grad_norm": 20733.58984375, + "learning_rate": 0.000596039603960396, + "loss": 1.0143, + "step": 6480 + }, + { + "epoch": 5.510002732157494, + "eval_accuracy": 0.7827784542349862, + "eval_loss": 0.926770031452179, + "eval_runtime": 1034.8114, + "eval_samples_per_second": 482.579, + "eval_steps_per_second": 0.821, + "step": 6480 + }, + { + "epoch": 5.518502777693452, + "grad_norm": 19389.359375, + "learning_rate": 0.000595049504950495, + "loss": 1.0156, + "step": 6490 + }, + { + "epoch": 5.52700282322941, + "grad_norm": 18111.1796875, + "learning_rate": 0.000594059405940594, + "loss": 1.015, + "step": 6500 + }, + { + "epoch": 5.535502868765368, + "grad_norm": 17627.548828125, + "learning_rate": 0.0005930693069306931, + "loss": 1.0149, + "step": 6510 + }, + { + "epoch": 5.544002914301327, + "grad_norm": 19444.2421875, + "learning_rate": 0.000592079207920792, + "loss": 1.0161, + "step": 6520 + }, + { + "epoch": 5.552502959837285, + "grad_norm": 20379.31640625, + "learning_rate": 0.0005910891089108911, + "loss": 1.0151, + "step": 6530 + }, + { + "epoch": 5.561003005373243, + "grad_norm": 17187.162109375, + "learning_rate": 0.00059009900990099, + "loss": 1.0164, + "step": 6540 + }, + { + "epoch": 5.569503050909201, + "grad_norm": 20134.74609375, + "learning_rate": 0.0005891089108910891, + "loss": 1.0125, + "step": 6550 + }, + { + "epoch": 5.578003096445159, + "grad_norm": 16345.7314453125, + "learning_rate": 0.0005881188118811881, + "loss": 1.0143, + "step": 6560 + }, + { + "epoch": 5.586503141981118, + "grad_norm": 16676.40625, + "learning_rate": 0.0005871287128712871, + "loss": 1.0146, + "step": 6570 + }, + { + "epoch": 5.586503141981118, + "eval_accuracy": 0.7830355090944983, + "eval_loss": 0.925011157989502, + "eval_runtime": 1032.366, + "eval_samples_per_second": 483.722, + "eval_steps_per_second": 0.823, + "step": 6570 + }, + { + "epoch": 5.595003187517076, + "grad_norm": 18004.888671875, + "learning_rate": 0.0005861386138613861, + "loss": 1.0134, + "step": 6580 + }, + { + "epoch": 5.6035032330530345, + "grad_norm": 17843.01171875, + "learning_rate": 0.0005851485148514851, + "loss": 1.0121, + "step": 6590 + }, + { + "epoch": 5.612003278588992, + "grad_norm": 19261.90625, + "learning_rate": 0.0005841584158415841, + "loss": 1.014, + "step": 6600 + }, + { + "epoch": 5.62050332412495, + "grad_norm": 19034.001953125, + "learning_rate": 0.0005831683168316832, + "loss": 1.014, + "step": 6610 + }, + { + "epoch": 5.629003369660909, + "grad_norm": 15280.501953125, + "learning_rate": 0.0005821782178217821, + "loss": 1.0127, + "step": 6620 + }, + { + "epoch": 5.637503415196867, + "grad_norm": 16361.3017578125, + "learning_rate": 0.0005811881188118812, + "loss": 1.0135, + "step": 6630 + }, + { + "epoch": 5.6460034607328256, + "grad_norm": 18691.2578125, + "learning_rate": 0.0005801980198019801, + "loss": 1.0116, + "step": 6640 + }, + { + "epoch": 5.654503506268783, + "grad_norm": 17126.857421875, + "learning_rate": 0.0005792079207920792, + "loss": 1.0123, + "step": 6650 + }, + { + "epoch": 5.663003551804742, + "grad_norm": 18155.701171875, + "learning_rate": 0.0005782178217821782, + "loss": 1.01, + "step": 6660 + }, + { + "epoch": 5.663003551804742, + "eval_accuracy": 0.7837371016336877, + "eval_loss": 0.9207693338394165, + "eval_runtime": 1032.8984, + "eval_samples_per_second": 483.473, + "eval_steps_per_second": 0.823, + "step": 6660 + }, + { + "epoch": 5.6715035973407, + "grad_norm": 19733.861328125, + "learning_rate": 0.0005772277227722772, + "loss": 1.0122, + "step": 6670 + }, + { + "epoch": 5.680003642876658, + "grad_norm": 16721.662109375, + "learning_rate": 0.0005762376237623762, + "loss": 1.013, + "step": 6680 + }, + { + "epoch": 5.6885036884126166, + "grad_norm": 19389.369140625, + "learning_rate": 0.0005752475247524752, + "loss": 1.012, + "step": 6690 + }, + { + "epoch": 5.697003733948574, + "grad_norm": 20522.015625, + "learning_rate": 0.0005742574257425742, + "loss": 1.0087, + "step": 6700 + }, + { + "epoch": 5.705503779484533, + "grad_norm": 19054.39453125, + "learning_rate": 0.0005732673267326733, + "loss": 1.0092, + "step": 6710 + }, + { + "epoch": 5.714003825020491, + "grad_norm": 16219.0595703125, + "learning_rate": 0.0005722772277227722, + "loss": 1.0105, + "step": 6720 + }, + { + "epoch": 5.72250387055645, + "grad_norm": 19937.716796875, + "learning_rate": 0.0005712871287128713, + "loss": 1.01, + "step": 6730 + }, + { + "epoch": 5.731003916092408, + "grad_norm": 18509.220703125, + "learning_rate": 0.0005702970297029702, + "loss": 1.0084, + "step": 6740 + }, + { + "epoch": 5.739503961628365, + "grad_norm": 18168.08203125, + "learning_rate": 0.0005693069306930693, + "loss": 1.0081, + "step": 6750 + }, + { + "epoch": 5.739503961628365, + "eval_accuracy": 0.7840970171619854, + "eval_loss": 0.9189032316207886, + "eval_runtime": 1032.6648, + "eval_samples_per_second": 483.582, + "eval_steps_per_second": 0.823, + "step": 6750 + }, + { + "epoch": 5.748004007164324, + "grad_norm": 17099.634765625, + "learning_rate": 0.0005683168316831683, + "loss": 1.0085, + "step": 6760 + }, + { + "epoch": 5.756504052700282, + "grad_norm": 19988.625, + "learning_rate": 0.0005673267326732673, + "loss": 1.0079, + "step": 6770 + }, + { + "epoch": 5.765004098236241, + "grad_norm": 17239.306640625, + "learning_rate": 0.0005663366336633663, + "loss": 1.0075, + "step": 6780 + }, + { + "epoch": 5.773504143772199, + "grad_norm": 16728.1171875, + "learning_rate": 0.0005653465346534653, + "loss": 1.0062, + "step": 6790 + }, + { + "epoch": 5.782004189308157, + "grad_norm": 15729.82421875, + "learning_rate": 0.0005643564356435643, + "loss": 1.0074, + "step": 6800 + }, + { + "epoch": 5.790504234844115, + "grad_norm": 19562.201171875, + "learning_rate": 0.0005633663366336634, + "loss": 1.0075, + "step": 6810 + }, + { + "epoch": 5.799004280380073, + "grad_norm": 20575.86328125, + "learning_rate": 0.0005623762376237624, + "loss": 1.0056, + "step": 6820 + }, + { + "epoch": 5.807504325916032, + "grad_norm": 19013.0625, + "learning_rate": 0.0005613861386138615, + "loss": 1.0058, + "step": 6830 + }, + { + "epoch": 5.81600437145199, + "grad_norm": 18319.556640625, + "learning_rate": 0.0005603960396039604, + "loss": 1.0055, + "step": 6840 + }, + { + "epoch": 5.81600437145199, + "eval_accuracy": 0.7846662939402871, + "eval_loss": 0.9171181321144104, + "eval_runtime": 1033.9438, + "eval_samples_per_second": 482.984, + "eval_steps_per_second": 0.822, + "step": 6840 + }, + { + "epoch": 5.824504416987948, + "grad_norm": 19040.82421875, + "learning_rate": 0.0005594059405940595, + "loss": 1.0077, + "step": 6850 + }, + { + "epoch": 5.833004462523906, + "grad_norm": 19041.265625, + "learning_rate": 0.0005584158415841585, + "loss": 1.0064, + "step": 6860 + }, + { + "epoch": 5.841504508059865, + "grad_norm": 15583.31640625, + "learning_rate": 0.0005574257425742575, + "loss": 1.0052, + "step": 6870 + }, + { + "epoch": 5.850004553595823, + "grad_norm": 15519.5576171875, + "learning_rate": 0.0005564356435643565, + "loss": 1.0066, + "step": 6880 + }, + { + "epoch": 5.858504599131781, + "grad_norm": 19509.908203125, + "learning_rate": 0.0005554455445544555, + "loss": 1.005, + "step": 6890 + }, + { + "epoch": 5.867004644667739, + "grad_norm": 16785.40625, + "learning_rate": 0.0005544554455445545, + "loss": 1.0045, + "step": 6900 + }, + { + "epoch": 5.875504690203697, + "grad_norm": 20995.373046875, + "learning_rate": 0.0005534653465346536, + "loss": 1.0028, + "step": 6910 + }, + { + "epoch": 5.884004735739656, + "grad_norm": 16980.791015625, + "learning_rate": 0.0005524752475247525, + "loss": 1.0058, + "step": 6920 + }, + { + "epoch": 5.892504781275614, + "grad_norm": 17058.552734375, + "learning_rate": 0.0005514851485148516, + "loss": 1.0032, + "step": 6930 + }, + { + "epoch": 5.892504781275614, + "eval_accuracy": 0.7852809138194954, + "eval_loss": 0.9138230085372925, + "eval_runtime": 1038.0576, + "eval_samples_per_second": 481.07, + "eval_steps_per_second": 0.819, + "step": 6930 + }, + { + "epoch": 5.9010048268115725, + "grad_norm": 15984.5615234375, + "learning_rate": 0.0005504950495049505, + "loss": 1.002, + "step": 6940 + }, + { + "epoch": 5.90950487234753, + "grad_norm": 18076.875, + "learning_rate": 0.0005495049504950496, + "loss": 1.0029, + "step": 6950 + }, + { + "epoch": 5.918004917883489, + "grad_norm": 17178.27734375, + "learning_rate": 0.0005485148514851486, + "loss": 1.0024, + "step": 6960 + }, + { + "epoch": 5.926504963419447, + "grad_norm": 17936.349609375, + "learning_rate": 0.0005475247524752476, + "loss": 0.9992, + "step": 6970 + }, + { + "epoch": 5.935005008955406, + "grad_norm": 16773.853515625, + "learning_rate": 0.0005465346534653466, + "loss": 1.002, + "step": 6980 + }, + { + "epoch": 5.9435050544913635, + "grad_norm": 18567.22265625, + "learning_rate": 0.0005455445544554456, + "loss": 1.0019, + "step": 6990 + }, + { + "epoch": 5.952005100027321, + "grad_norm": 20730.1484375, + "learning_rate": 0.0005445544554455446, + "loss": 1.0032, + "step": 7000 + }, + { + "epoch": 5.96050514556328, + "grad_norm": 18468.900390625, + "learning_rate": 0.0005435643564356437, + "loss": 1.0029, + "step": 7010 + }, + { + "epoch": 5.969005191099238, + "grad_norm": 21202.24609375, + "learning_rate": 0.0005425742574257426, + "loss": 1.0019, + "step": 7020 + }, + { + "epoch": 5.969005191099238, + "eval_accuracy": 0.7857404893569327, + "eval_loss": 0.9110856652259827, + "eval_runtime": 1036.1907, + "eval_samples_per_second": 481.936, + "eval_steps_per_second": 0.82, + "step": 7020 + }, + { + "epoch": 5.977505236635197, + "grad_norm": 17446.140625, + "learning_rate": 0.0005415841584158417, + "loss": 1.0013, + "step": 7030 + }, + { + "epoch": 5.9860052821711545, + "grad_norm": 20787.7109375, + "learning_rate": 0.0005405940594059406, + "loss": 1.0006, + "step": 7040 + }, + { + "epoch": 5.994505327707113, + "grad_norm": 15513.68359375, + "learning_rate": 0.0005396039603960396, + "loss": 1.001, + "step": 7050 + }, + { + "epoch": 6.003005373243071, + "grad_norm": 17996.607421875, + "learning_rate": 0.0005386138613861387, + "loss": 1.0003, + "step": 7060 + }, + { + "epoch": 6.011505418779029, + "grad_norm": 17095.095703125, + "learning_rate": 0.0005376237623762376, + "loss": 1.0007, + "step": 7070 + }, + { + "epoch": 6.020005464314988, + "grad_norm": 17025.11328125, + "learning_rate": 0.0005366336633663367, + "loss": 1.0003, + "step": 7080 + }, + { + "epoch": 6.0285055098509455, + "grad_norm": 15303.3544921875, + "learning_rate": 0.0005356435643564356, + "loss": 1.0, + "step": 7090 + }, + { + "epoch": 6.037005555386904, + "grad_norm": 18501.576171875, + "learning_rate": 0.0005346534653465347, + "loss": 0.9981, + "step": 7100 + }, + { + "epoch": 6.045505600922862, + "grad_norm": 19782.1171875, + "learning_rate": 0.0005336633663366337, + "loss": 0.9989, + "step": 7110 + }, + { + "epoch": 6.045505600922862, + "eval_accuracy": 0.7861505064886588, + "eval_loss": 0.9094851613044739, + "eval_runtime": 1034.5248, + "eval_samples_per_second": 482.712, + "eval_steps_per_second": 0.822, + "step": 7110 + }, + { + "epoch": 6.05400564645882, + "grad_norm": 19169.7890625, + "learning_rate": 0.0005326732673267327, + "loss": 0.9998, + "step": 7120 + }, + { + "epoch": 6.062505691994779, + "grad_norm": 19047.958984375, + "learning_rate": 0.0005316831683168317, + "loss": 0.9973, + "step": 7130 + }, + { + "epoch": 6.0710057375307365, + "grad_norm": 20607.556640625, + "learning_rate": 0.0005306930693069307, + "loss": 0.9962, + "step": 7140 + }, + { + "epoch": 6.079505783066695, + "grad_norm": 16297.8662109375, + "learning_rate": 0.0005297029702970297, + "loss": 0.9968, + "step": 7150 + }, + { + "epoch": 6.088005828602653, + "grad_norm": 16640.685546875, + "learning_rate": 0.0005287128712871288, + "loss": 0.9987, + "step": 7160 + }, + { + "epoch": 6.096505874138612, + "grad_norm": 16723.65625, + "learning_rate": 0.0005277227722772277, + "loss": 0.9981, + "step": 7170 + }, + { + "epoch": 6.10500591967457, + "grad_norm": 18703.06640625, + "learning_rate": 0.0005267326732673268, + "loss": 0.9959, + "step": 7180 + }, + { + "epoch": 6.113505965210528, + "grad_norm": 16973.158203125, + "learning_rate": 0.0005257425742574257, + "loss": 0.9988, + "step": 7190 + }, + { + "epoch": 6.122006010746486, + "grad_norm": 16989.619140625, + "learning_rate": 0.0005247524752475248, + "loss": 0.9955, + "step": 7200 + }, + { + "epoch": 6.122006010746486, + "eval_accuracy": 0.7866073008018492, + "eval_loss": 0.9084503650665283, + "eval_runtime": 1037.0895, + "eval_samples_per_second": 481.519, + "eval_steps_per_second": 0.82, + "step": 7200 + }, + { + "epoch": 6.130506056282444, + "grad_norm": 16469.03515625, + "learning_rate": 0.0005237623762376238, + "loss": 0.9962, + "step": 7210 + }, + { + "epoch": 6.139006101818403, + "grad_norm": 16646.052734375, + "learning_rate": 0.0005227722772277228, + "loss": 0.9978, + "step": 7220 + }, + { + "epoch": 6.147506147354361, + "grad_norm": 18823.22265625, + "learning_rate": 0.0005217821782178218, + "loss": 0.9967, + "step": 7230 + }, + { + "epoch": 6.156006192890319, + "grad_norm": 16628.587890625, + "learning_rate": 0.0005207920792079208, + "loss": 0.9968, + "step": 7240 + }, + { + "epoch": 6.164506238426277, + "grad_norm": 17544.67578125, + "learning_rate": 0.0005198019801980198, + "loss": 0.9958, + "step": 7250 + }, + { + "epoch": 6.173006283962236, + "grad_norm": 16480.125, + "learning_rate": 0.0005188118811881189, + "loss": 0.997, + "step": 7260 + }, + { + "epoch": 6.181506329498194, + "grad_norm": 18358.3984375, + "learning_rate": 0.0005178217821782178, + "loss": 0.9956, + "step": 7270 + }, + { + "epoch": 6.190006375034152, + "grad_norm": 19029.322265625, + "learning_rate": 0.0005168316831683169, + "loss": 0.9977, + "step": 7280 + }, + { + "epoch": 6.19850642057011, + "grad_norm": 16023.6572265625, + "learning_rate": 0.0005158415841584158, + "loss": 0.9944, + "step": 7290 + }, + { + "epoch": 6.19850642057011, + "eval_accuracy": 0.7871460102262176, + "eval_loss": 0.9043031930923462, + "eval_runtime": 1032.4584, + "eval_samples_per_second": 483.679, + "eval_steps_per_second": 0.823, + "step": 7290 + }, + { + "epoch": 6.207006466106068, + "grad_norm": 17711.34765625, + "learning_rate": 0.0005148514851485149, + "loss": 0.9931, + "step": 7300 + }, + { + "epoch": 6.215506511642027, + "grad_norm": 15834.974609375, + "learning_rate": 0.0005138613861386139, + "loss": 0.9935, + "step": 7310 + }, + { + "epoch": 6.224006557177985, + "grad_norm": 15997.443359375, + "learning_rate": 0.0005128712871287129, + "loss": 0.9943, + "step": 7320 + }, + { + "epoch": 6.2325066027139435, + "grad_norm": 19241.9921875, + "learning_rate": 0.0005118811881188119, + "loss": 0.9942, + "step": 7330 + }, + { + "epoch": 6.241006648249901, + "grad_norm": 14933.837890625, + "learning_rate": 0.0005108910891089109, + "loss": 0.9949, + "step": 7340 + }, + { + "epoch": 6.249506693785859, + "grad_norm": 16723.673828125, + "learning_rate": 0.0005099009900990099, + "loss": 0.9937, + "step": 7350 + }, + { + "epoch": 6.258006739321818, + "grad_norm": 19108.876953125, + "learning_rate": 0.000508910891089109, + "loss": 0.9915, + "step": 7360 + }, + { + "epoch": 6.266506784857776, + "grad_norm": 14807.8232421875, + "learning_rate": 0.0005079207920792079, + "loss": 0.9912, + "step": 7370 + }, + { + "epoch": 6.2750068303937345, + "grad_norm": 16829.048828125, + "learning_rate": 0.000506930693069307, + "loss": 0.9917, + "step": 7380 + }, + { + "epoch": 6.2750068303937345, + "eval_accuracy": 0.7875770735366631, + "eval_loss": 0.9019953608512878, + "eval_runtime": 1035.5292, + "eval_samples_per_second": 482.244, + "eval_steps_per_second": 0.821, + "step": 7380 + }, + { + "epoch": 6.283506875929692, + "grad_norm": 17801.87109375, + "learning_rate": 0.0005059405940594059, + "loss": 0.9928, + "step": 7390 + }, + { + "epoch": 6.292006921465651, + "grad_norm": 18446.126953125, + "learning_rate": 0.000504950495049505, + "loss": 0.9915, + "step": 7400 + }, + { + "epoch": 6.300506967001609, + "grad_norm": 16055.3349609375, + "learning_rate": 0.000503960396039604, + "loss": 0.9903, + "step": 7410 + }, + { + "epoch": 6.309007012537567, + "grad_norm": 16897.765625, + "learning_rate": 0.000502970297029703, + "loss": 0.9908, + "step": 7420 + }, + { + "epoch": 6.3175070580735255, + "grad_norm": 19176.193359375, + "learning_rate": 0.000501980198019802, + "loss": 0.9898, + "step": 7430 + }, + { + "epoch": 6.326007103609483, + "grad_norm": 14390.6298828125, + "learning_rate": 0.000500990099009901, + "loss": 0.9904, + "step": 7440 + }, + { + "epoch": 6.334507149145442, + "grad_norm": 21656.654296875, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 7450 + }, + { + "epoch": 6.3430071946814, + "grad_norm": 18097.005859375, + "learning_rate": 0.0004990099009900991, + "loss": 0.9902, + "step": 7460 + }, + { + "epoch": 6.351507240217359, + "grad_norm": 17372.796875, + "learning_rate": 0.000498019801980198, + "loss": 0.9896, + "step": 7470 + }, + { + "epoch": 6.351507240217359, + "eval_accuracy": 0.7878735288789341, + "eval_loss": 0.9011977314949036, + "eval_runtime": 1035.7365, + "eval_samples_per_second": 482.148, + "eval_steps_per_second": 0.821, + "step": 7470 + }, + { + "epoch": 6.3600072857533165, + "grad_norm": 16490.4375, + "learning_rate": 0.0004970297029702971, + "loss": 0.989, + "step": 7480 + }, + { + "epoch": 6.368507331289274, + "grad_norm": 18962.072265625, + "learning_rate": 0.000496039603960396, + "loss": 0.9887, + "step": 7490 + }, + { + "epoch": 6.377007376825233, + "grad_norm": 19760.28125, + "learning_rate": 0.0004950495049504951, + "loss": 0.9891, + "step": 7500 + }, + { + "epoch": 6.385507422361191, + "grad_norm": 16014.2646484375, + "learning_rate": 0.0004940594059405941, + "loss": 0.9921, + "step": 7510 + }, + { + "epoch": 6.39400746789715, + "grad_norm": 18045.798828125, + "learning_rate": 0.000493069306930693, + "loss": 0.9886, + "step": 7520 + }, + { + "epoch": 6.4025075134331075, + "grad_norm": 18082.34375, + "learning_rate": 0.0004920792079207921, + "loss": 0.987, + "step": 7530 + }, + { + "epoch": 6.411007558969066, + "grad_norm": 17991.447265625, + "learning_rate": 0.000491089108910891, + "loss": 0.99, + "step": 7540 + }, + { + "epoch": 6.419507604505024, + "grad_norm": 15685.39453125, + "learning_rate": 0.0004900990099009901, + "loss": 0.9884, + "step": 7550 + }, + { + "epoch": 6.428007650040982, + "grad_norm": 16992.146484375, + "learning_rate": 0.0004891089108910892, + "loss": 0.9878, + "step": 7560 + }, + { + "epoch": 6.428007650040982, + "eval_accuracy": 0.7882446558820121, + "eval_loss": 0.8991919755935669, + "eval_runtime": 1034.5241, + "eval_samples_per_second": 482.713, + "eval_steps_per_second": 0.822, + "step": 7560 + }, + { + "epoch": 6.436507695576941, + "grad_norm": 18808.25390625, + "learning_rate": 0.0004881188118811881, + "loss": 0.9872, + "step": 7570 + }, + { + "epoch": 6.4450077411128985, + "grad_norm": 18018.94921875, + "learning_rate": 0.00048712871287128715, + "loss": 0.9882, + "step": 7580 + }, + { + "epoch": 6.453507786648857, + "grad_norm": 16361.677734375, + "learning_rate": 0.00048613861386138615, + "loss": 0.9867, + "step": 7590 + }, + { + "epoch": 6.462007832184815, + "grad_norm": 16750.744140625, + "learning_rate": 0.00048514851485148515, + "loss": 0.9878, + "step": 7600 + }, + { + "epoch": 6.470507877720774, + "grad_norm": 19413.93359375, + "learning_rate": 0.00048415841584158414, + "loss": 0.9868, + "step": 7610 + }, + { + "epoch": 6.479007923256732, + "grad_norm": 17738.328125, + "learning_rate": 0.00048316831683168314, + "loss": 0.9869, + "step": 7620 + }, + { + "epoch": 6.4875079687926895, + "grad_norm": 16600.763671875, + "learning_rate": 0.0004821782178217822, + "loss": 0.9868, + "step": 7630 + }, + { + "epoch": 6.496008014328648, + "grad_norm": 19458.306640625, + "learning_rate": 0.0004811881188118812, + "loss": 0.9842, + "step": 7640 + }, + { + "epoch": 6.504508059864606, + "grad_norm": 15481.515625, + "learning_rate": 0.0004801980198019802, + "loss": 0.9886, + "step": 7650 + }, + { + "epoch": 6.504508059864606, + "eval_accuracy": 0.7887328723155168, + "eval_loss": 0.8970336318016052, + "eval_runtime": 1037.2132, + "eval_samples_per_second": 481.461, + "eval_steps_per_second": 0.82, + "step": 7650 + }, + { + "epoch": 6.513402750371877, + "grad_norm": 20331.080078125, + "learning_rate": 0.0004792079207920792, + "loss": 0.9857, + "step": 7660 + }, + { + "epoch": 6.5219027959078355, + "grad_norm": 17513.203125, + "learning_rate": 0.0004782178217821782, + "loss": 0.9846, + "step": 7670 + }, + { + "epoch": 6.530402841443793, + "grad_norm": 18388.54296875, + "learning_rate": 0.00047722772277227724, + "loss": 0.9853, + "step": 7680 + }, + { + "epoch": 6.538902886979752, + "grad_norm": 18303.26953125, + "learning_rate": 0.00047623762376237624, + "loss": 0.9853, + "step": 7690 + }, + { + "epoch": 6.54740293251571, + "grad_norm": 16617.505859375, + "learning_rate": 0.00047524752475247524, + "loss": 0.9859, + "step": 7700 + }, + { + "epoch": 6.555902978051668, + "grad_norm": 17269.935546875, + "learning_rate": 0.00047425742574257423, + "loss": 0.9855, + "step": 7710 + }, + { + "epoch": 6.5644030235876265, + "grad_norm": 16519.927734375, + "learning_rate": 0.00047326732673267323, + "loss": 0.9843, + "step": 7720 + }, + { + "epoch": 6.572903069123584, + "grad_norm": 15807.8505859375, + "learning_rate": 0.0004722772277227723, + "loss": 0.983, + "step": 7730 + }, + { + "epoch": 6.581403114659543, + "grad_norm": 15810.3408203125, + "learning_rate": 0.0004712871287128713, + "loss": 0.9861, + "step": 7740 + }, + { + "epoch": 6.581403114659543, + "eval_accuracy": 0.7891655969340131, + "eval_loss": 0.8951780200004578, + "eval_runtime": 1035.4986, + "eval_samples_per_second": 482.259, + "eval_steps_per_second": 0.821, + "step": 7740 + }, + { + "epoch": 6.589903160195501, + "grad_norm": 16766.341796875, + "learning_rate": 0.0004702970297029703, + "loss": 0.9851, + "step": 7750 + }, + { + "epoch": 6.59840320573146, + "grad_norm": 20359.9140625, + "learning_rate": 0.0004693069306930693, + "loss": 0.9844, + "step": 7760 + }, + { + "epoch": 6.6069032512674175, + "grad_norm": 16457.33984375, + "learning_rate": 0.00046831683168316833, + "loss": 0.9822, + "step": 7770 + }, + { + "epoch": 6.615403296803375, + "grad_norm": 18674.6015625, + "learning_rate": 0.0004673267326732674, + "loss": 0.983, + "step": 7780 + }, + { + "epoch": 6.623903342339334, + "grad_norm": 18476.642578125, + "learning_rate": 0.0004663366336633664, + "loss": 0.9826, + "step": 7790 + }, + { + "epoch": 6.632403387875292, + "grad_norm": 15693.6640625, + "learning_rate": 0.0004653465346534654, + "loss": 0.9806, + "step": 7800 + }, + { + "epoch": 6.640903433411251, + "grad_norm": 18678.359375, + "learning_rate": 0.0004643564356435644, + "loss": 0.9826, + "step": 7810 + }, + { + "epoch": 6.6494034789472085, + "grad_norm": 18860.5390625, + "learning_rate": 0.0004633663366336634, + "loss": 0.9843, + "step": 7820 + }, + { + "epoch": 6.657903524483167, + "grad_norm": 18948.015625, + "learning_rate": 0.00046237623762376243, + "loss": 0.9814, + "step": 7830 + }, + { + "epoch": 6.657903524483167, + "eval_accuracy": 0.789608308121163, + "eval_loss": 0.8939031362533569, + "eval_runtime": 1028.1352, + "eval_samples_per_second": 485.712, + "eval_steps_per_second": 0.827, + "step": 7830 + }, + { + "epoch": 6.666403570019125, + "grad_norm": 17197.728515625, + "learning_rate": 0.00046138613861386143, + "loss": 0.9816, + "step": 7840 + }, + { + "epoch": 6.674903615555083, + "grad_norm": 21051.08984375, + "learning_rate": 0.0004603960396039604, + "loss": 0.9812, + "step": 7850 + }, + { + "epoch": 6.683403661091042, + "grad_norm": 19255.662109375, + "learning_rate": 0.0004594059405940594, + "loss": 0.9836, + "step": 7860 + }, + { + "epoch": 6.6919037066269995, + "grad_norm": 16365.4599609375, + "learning_rate": 0.0004584158415841584, + "loss": 0.9824, + "step": 7870 + }, + { + "epoch": 6.700403752162958, + "grad_norm": 17249.7109375, + "learning_rate": 0.0004574257425742575, + "loss": 0.9793, + "step": 7880 + }, + { + "epoch": 6.708903797698916, + "grad_norm": 15945.6630859375, + "learning_rate": 0.00045643564356435647, + "loss": 0.9819, + "step": 7890 + }, + { + "epoch": 6.717403843234875, + "grad_norm": 16529.767578125, + "learning_rate": 0.00045544554455445547, + "loss": 0.9801, + "step": 7900 + }, + { + "epoch": 6.725903888770833, + "grad_norm": 16305.484375, + "learning_rate": 0.00045445544554455447, + "loss": 0.9823, + "step": 7910 + }, + { + "epoch": 6.7344039343067905, + "grad_norm": 17304.328125, + "learning_rate": 0.00045346534653465347, + "loss": 0.9782, + "step": 7920 + }, + { + "epoch": 6.7344039343067905, + "eval_accuracy": 0.7898104897932849, + "eval_loss": 0.8923720121383667, + "eval_runtime": 1032.3531, + "eval_samples_per_second": 483.728, + "eval_steps_per_second": 0.823, + "step": 7920 + }, + { + "epoch": 6.742903979842749, + "grad_norm": 16057.078125, + "learning_rate": 0.0004524752475247525, + "loss": 0.9804, + "step": 7930 + }, + { + "epoch": 6.751404025378707, + "grad_norm": 16539.8046875, + "learning_rate": 0.0004514851485148515, + "loss": 0.9816, + "step": 7940 + }, + { + "epoch": 6.759904070914666, + "grad_norm": 18756.751953125, + "learning_rate": 0.0004504950495049505, + "loss": 0.9803, + "step": 7950 + }, + { + "epoch": 6.768404116450624, + "grad_norm": 16140.8369140625, + "learning_rate": 0.0004495049504950495, + "loss": 0.9807, + "step": 7960 + }, + { + "epoch": 6.776904161986582, + "grad_norm": 16101.984375, + "learning_rate": 0.0004485148514851485, + "loss": 0.9795, + "step": 7970 + }, + { + "epoch": 6.78540420752254, + "grad_norm": 18084.240234375, + "learning_rate": 0.00044752475247524756, + "loss": 0.9798, + "step": 7980 + }, + { + "epoch": 6.793904253058498, + "grad_norm": 17187.412109375, + "learning_rate": 0.00044653465346534656, + "loss": 0.9801, + "step": 7990 + }, + { + "epoch": 6.802404298594457, + "grad_norm": 16965.23046875, + "learning_rate": 0.00044554455445544556, + "loss": 0.9784, + "step": 8000 + }, + { + "epoch": 6.810904344130415, + "grad_norm": 16516.47265625, + "learning_rate": 0.00044455445544554456, + "loss": 0.9786, + "step": 8010 + }, + { + "epoch": 6.810904344130415, + "eval_accuracy": 0.7902325926967286, + "eval_loss": 0.8902812004089355, + "eval_runtime": 1032.3456, + "eval_samples_per_second": 483.731, + "eval_steps_per_second": 0.823, + "step": 8010 + }, + { + "epoch": 6.819404389666373, + "grad_norm": 18656.958984375, + "learning_rate": 0.00044356435643564356, + "loss": 0.9794, + "step": 8020 + }, + { + "epoch": 6.827904435202331, + "grad_norm": 18381.744140625, + "learning_rate": 0.0004425742574257426, + "loss": 0.978, + "step": 8030 + }, + { + "epoch": 6.83640448073829, + "grad_norm": 16134.7392578125, + "learning_rate": 0.0004415841584158416, + "loss": 0.9783, + "step": 8040 + }, + { + "epoch": 6.844904526274248, + "grad_norm": 15535.76953125, + "learning_rate": 0.0004405940594059406, + "loss": 0.9771, + "step": 8050 + }, + { + "epoch": 6.853404571810206, + "grad_norm": 17276.91015625, + "learning_rate": 0.0004396039603960396, + "loss": 0.9778, + "step": 8060 + }, + { + "epoch": 6.861904617346164, + "grad_norm": 16477.451171875, + "learning_rate": 0.0004386138613861386, + "loss": 0.9762, + "step": 8070 + }, + { + "epoch": 6.870404662882122, + "grad_norm": 17235.421875, + "learning_rate": 0.00043762376237623765, + "loss": 0.9758, + "step": 8080 + }, + { + "epoch": 6.878904708418081, + "grad_norm": 23922.306640625, + "learning_rate": 0.00043663366336633665, + "loss": 0.9771, + "step": 8090 + }, + { + "epoch": 6.887404753954039, + "grad_norm": 20163.564453125, + "learning_rate": 0.00043564356435643565, + "loss": 0.9786, + "step": 8100 + }, + { + "epoch": 6.887404753954039, + "eval_accuracy": 0.7905720984224686, + "eval_loss": 0.8882827162742615, + "eval_runtime": 1035.2187, + "eval_samples_per_second": 482.389, + "eval_steps_per_second": 0.821, + "step": 8100 + }, + { + "epoch": 6.895904799489998, + "grad_norm": 22567.283203125, + "learning_rate": 0.00043465346534653465, + "loss": 0.9767, + "step": 8110 + }, + { + "epoch": 6.904404845025955, + "grad_norm": 18677.17578125, + "learning_rate": 0.00043366336633663365, + "loss": 0.9768, + "step": 8120 + }, + { + "epoch": 6.912904890561913, + "grad_norm": 18427.8046875, + "learning_rate": 0.0004326732673267327, + "loss": 0.9765, + "step": 8130 + }, + { + "epoch": 6.921404936097872, + "grad_norm": 16562.19140625, + "learning_rate": 0.0004316831683168317, + "loss": 0.9778, + "step": 8140 + }, + { + "epoch": 6.92990498163383, + "grad_norm": 18019.8046875, + "learning_rate": 0.0004306930693069307, + "loss": 0.9765, + "step": 8150 + }, + { + "epoch": 6.938405027169789, + "grad_norm": 15273.8798828125, + "learning_rate": 0.0004297029702970297, + "loss": 0.9756, + "step": 8160 + }, + { + "epoch": 6.946905072705746, + "grad_norm": 16736.18359375, + "learning_rate": 0.0004287128712871287, + "loss": 0.9739, + "step": 8170 + }, + { + "epoch": 6.955405118241705, + "grad_norm": 17612.673828125, + "learning_rate": 0.00042772277227722774, + "loss": 0.9772, + "step": 8180 + }, + { + "epoch": 6.963905163777663, + "grad_norm": 15665.3857421875, + "learning_rate": 0.00042673267326732674, + "loss": 0.9742, + "step": 8190 + }, + { + "epoch": 6.963905163777663, + "eval_accuracy": 0.7910029077910385, + "eval_loss": 0.8868052363395691, + "eval_runtime": 1033.6186, + "eval_samples_per_second": 483.136, + "eval_steps_per_second": 0.822, + "step": 8190 + }, + { + "epoch": 6.972405209313621, + "grad_norm": 16346.849609375, + "learning_rate": 0.00042574257425742574, + "loss": 0.9773, + "step": 8200 + }, + { + "epoch": 6.98090525484958, + "grad_norm": 15904.78515625, + "learning_rate": 0.00042475247524752474, + "loss": 0.9756, + "step": 8210 + }, + { + "epoch": 6.989405300385537, + "grad_norm": 16064.7041015625, + "learning_rate": 0.00042376237623762374, + "loss": 0.974, + "step": 8220 + }, + { + "epoch": 6.997905345921496, + "grad_norm": 17884.13671875, + "learning_rate": 0.0004227722772277228, + "loss": 0.9747, + "step": 8230 + }, + { + "epoch": 7.006405391457454, + "grad_norm": 14780.166015625, + "learning_rate": 0.0004217821782178218, + "loss": 0.9743, + "step": 8240 + }, + { + "epoch": 7.014905436993413, + "grad_norm": 17861.595703125, + "learning_rate": 0.0004207920792079208, + "loss": 0.9744, + "step": 8250 + }, + { + "epoch": 7.023405482529371, + "grad_norm": 16821.708984375, + "learning_rate": 0.0004198019801980198, + "loss": 0.9732, + "step": 8260 + }, + { + "epoch": 7.031905528065329, + "grad_norm": 18202.1796875, + "learning_rate": 0.0004188118811881188, + "loss": 0.9746, + "step": 8270 + }, + { + "epoch": 7.040405573601287, + "grad_norm": 15645.1865234375, + "learning_rate": 0.00041782178217821784, + "loss": 0.9735, + "step": 8280 + }, + { + "epoch": 7.040405573601287, + "eval_accuracy": 0.7912855873371129, + "eval_loss": 0.8865892887115479, + "eval_runtime": 1039.2279, + "eval_samples_per_second": 480.528, + "eval_steps_per_second": 0.818, + "step": 8280 + }, + { + "epoch": 7.048905619137245, + "grad_norm": 16949.755859375, + "learning_rate": 0.00041683168316831683, + "loss": 0.9736, + "step": 8290 + }, + { + "epoch": 7.057405664673204, + "grad_norm": 18461.337890625, + "learning_rate": 0.00041584158415841583, + "loss": 0.9728, + "step": 8300 + }, + { + "epoch": 7.065905710209162, + "grad_norm": 17668.466796875, + "learning_rate": 0.00041485148514851483, + "loss": 0.9721, + "step": 8310 + }, + { + "epoch": 7.07440575574512, + "grad_norm": 15126.234375, + "learning_rate": 0.00041386138613861383, + "loss": 0.9718, + "step": 8320 + }, + { + "epoch": 7.082905801281078, + "grad_norm": 16266.1083984375, + "learning_rate": 0.0004128712871287129, + "loss": 0.9733, + "step": 8330 + }, + { + "epoch": 7.091405846817037, + "grad_norm": 19422.59375, + "learning_rate": 0.0004118811881188119, + "loss": 0.9733, + "step": 8340 + }, + { + "epoch": 7.099905892352995, + "grad_norm": 17168.087890625, + "learning_rate": 0.0004108910891089109, + "loss": 0.9723, + "step": 8350 + }, + { + "epoch": 7.108405937888953, + "grad_norm": 16709.5625, + "learning_rate": 0.0004099009900990099, + "loss": 0.9712, + "step": 8360 + }, + { + "epoch": 7.116905983424911, + "grad_norm": 15946.3232421875, + "learning_rate": 0.0004089108910891089, + "loss": 0.9726, + "step": 8370 + }, + { + "epoch": 7.116905983424911, + "eval_accuracy": 0.7916597760806168, + "eval_loss": 0.8836008310317993, + "eval_runtime": 1033.0801, + "eval_samples_per_second": 483.388, + "eval_steps_per_second": 0.823, + "step": 8370 + }, + { + "epoch": 7.125406028960869, + "grad_norm": 13808.0322265625, + "learning_rate": 0.0004079207920792079, + "loss": 0.9693, + "step": 8380 + }, + { + "epoch": 7.133906074496828, + "grad_norm": 19899.408203125, + "learning_rate": 0.0004069306930693069, + "loss": 0.9695, + "step": 8390 + }, + { + "epoch": 7.142406120032786, + "grad_norm": 14987.80078125, + "learning_rate": 0.000405940594059406, + "loss": 0.9698, + "step": 8400 + }, + { + "epoch": 7.1509061655687445, + "grad_norm": 16658.63671875, + "learning_rate": 0.000404950495049505, + "loss": 0.9711, + "step": 8410 + }, + { + "epoch": 7.159406211104702, + "grad_norm": 16382.1494140625, + "learning_rate": 0.00040396039603960397, + "loss": 0.9717, + "step": 8420 + }, + { + "epoch": 7.16790625664066, + "grad_norm": 16640.423828125, + "learning_rate": 0.000402970297029703, + "loss": 0.9706, + "step": 8430 + }, + { + "epoch": 7.176406302176619, + "grad_norm": 17918.15625, + "learning_rate": 0.000401980198019802, + "loss": 0.9723, + "step": 8440 + }, + { + "epoch": 7.184906347712577, + "grad_norm": 16958.197265625, + "learning_rate": 0.000400990099009901, + "loss": 0.9704, + "step": 8450 + }, + { + "epoch": 7.1934063932485355, + "grad_norm": 15377.521484375, + "learning_rate": 0.0004, + "loss": 0.97, + "step": 8460 + }, + { + "epoch": 7.1934063932485355, + "eval_accuracy": 0.7920878335859806, + "eval_loss": 0.8810552954673767, + "eval_runtime": 1033.6339, + "eval_samples_per_second": 483.129, + "eval_steps_per_second": 0.822, + "step": 8460 + }, + { + "epoch": 7.201906438784493, + "grad_norm": 15135.03515625, + "learning_rate": 0.000399009900990099, + "loss": 0.9684, + "step": 8470 + }, + { + "epoch": 7.210406484320452, + "grad_norm": 16375.2431640625, + "learning_rate": 0.00039801980198019807, + "loss": 0.9691, + "step": 8480 + }, + { + "epoch": 7.21890652985641, + "grad_norm": 16684.763671875, + "learning_rate": 0.00039702970297029707, + "loss": 0.9678, + "step": 8490 + }, + { + "epoch": 7.227406575392368, + "grad_norm": 17551.935546875, + "learning_rate": 0.00039603960396039607, + "loss": 0.9688, + "step": 8500 + }, + { + "epoch": 7.2359066209283265, + "grad_norm": 17371.619140625, + "learning_rate": 0.00039504950495049506, + "loss": 0.9717, + "step": 8510 + }, + { + "epoch": 7.244406666464284, + "grad_norm": 20004.443359375, + "learning_rate": 0.00039405940594059406, + "loss": 0.9672, + "step": 8520 + }, + { + "epoch": 7.252906712000243, + "grad_norm": 18529.32421875, + "learning_rate": 0.0003930693069306931, + "loss": 0.9693, + "step": 8530 + }, + { + "epoch": 7.261406757536201, + "grad_norm": 16488.56640625, + "learning_rate": 0.0003920792079207921, + "loss": 0.9678, + "step": 8540 + }, + { + "epoch": 7.26990680307216, + "grad_norm": 22428.466796875, + "learning_rate": 0.0003910891089108911, + "loss": 0.9693, + "step": 8550 + }, + { + "epoch": 7.26990680307216, + "eval_accuracy": 0.792256339480986, + "eval_loss": 0.8810757398605347, + "eval_runtime": 1033.8252, + "eval_samples_per_second": 483.039, + "eval_steps_per_second": 0.822, + "step": 8550 + }, + { + "epoch": 7.2784068486081175, + "grad_norm": 17588.4453125, + "learning_rate": 0.0003900990099009901, + "loss": 0.9679, + "step": 8560 + }, + { + "epoch": 7.286906894144076, + "grad_norm": 18152.78515625, + "learning_rate": 0.0003891089108910891, + "loss": 0.9668, + "step": 8570 + }, + { + "epoch": 7.295406939680034, + "grad_norm": 14463.046875, + "learning_rate": 0.00038811881188118816, + "loss": 0.9679, + "step": 8580 + }, + { + "epoch": 7.303906985215992, + "grad_norm": 19075.265625, + "learning_rate": 0.00038712871287128716, + "loss": 0.9681, + "step": 8590 + }, + { + "epoch": 7.312407030751951, + "grad_norm": 21981.548828125, + "learning_rate": 0.00038613861386138616, + "loss": 0.9682, + "step": 8600 + }, + { + "epoch": 7.3209070762879085, + "grad_norm": 20439.08984375, + "learning_rate": 0.00038514851485148515, + "loss": 0.968, + "step": 8610 + }, + { + "epoch": 7.329407121823867, + "grad_norm": 17120.98046875, + "learning_rate": 0.00038415841584158415, + "loss": 0.9664, + "step": 8620 + }, + { + "epoch": 7.337907167359825, + "grad_norm": 18788.953125, + "learning_rate": 0.0003831683168316832, + "loss": 0.965, + "step": 8630 + }, + { + "epoch": 7.346407212895784, + "grad_norm": 14774.0830078125, + "learning_rate": 0.0003821782178217822, + "loss": 0.9666, + "step": 8640 + }, + { + "epoch": 7.346407212895784, + "eval_accuracy": 0.7926509573029413, + "eval_loss": 0.8783407807350159, + "eval_runtime": 1035.0192, + "eval_samples_per_second": 482.482, + "eval_steps_per_second": 0.821, + "step": 8640 + }, + { + "epoch": 7.354907258431742, + "grad_norm": 17107.38671875, + "learning_rate": 0.0003811881188118812, + "loss": 0.9666, + "step": 8650 + }, + { + "epoch": 7.3634073039676995, + "grad_norm": 15051.875, + "learning_rate": 0.0003801980198019802, + "loss": 0.9668, + "step": 8660 + }, + { + "epoch": 7.371907349503658, + "grad_norm": 16625.01171875, + "learning_rate": 0.0003792079207920792, + "loss": 0.9665, + "step": 8670 + }, + { + "epoch": 7.380407395039616, + "grad_norm": 15725.7158203125, + "learning_rate": 0.00037821782178217825, + "loss": 0.9665, + "step": 8680 + }, + { + "epoch": 7.388907440575575, + "grad_norm": 17703.29296875, + "learning_rate": 0.00037722772277227725, + "loss": 0.9653, + "step": 8690 + }, + { + "epoch": 7.397407486111533, + "grad_norm": 18342.701171875, + "learning_rate": 0.00037623762376237625, + "loss": 0.968, + "step": 8700 + }, + { + "epoch": 7.405907531647491, + "grad_norm": 16660.326171875, + "learning_rate": 0.00037524752475247524, + "loss": 0.9633, + "step": 8710 + }, + { + "epoch": 7.414407577183449, + "grad_norm": 14380.01171875, + "learning_rate": 0.00037425742574257424, + "loss": 0.9652, + "step": 8720 + }, + { + "epoch": 7.422907622719407, + "grad_norm": 15314.05859375, + "learning_rate": 0.0003732673267326733, + "loss": 0.9645, + "step": 8730 + }, + { + "epoch": 7.422907622719407, + "eval_accuracy": 0.7929085777897834, + "eval_loss": 0.8774629235267639, + "eval_runtime": 1036.1479, + "eval_samples_per_second": 481.956, + "eval_steps_per_second": 0.82, + "step": 8730 + }, + { + "epoch": 7.431407668255366, + "grad_norm": 15377.4306640625, + "learning_rate": 0.0003722772277227723, + "loss": 0.9661, + "step": 8740 + }, + { + "epoch": 7.439907713791324, + "grad_norm": 20652.443359375, + "learning_rate": 0.0003712871287128713, + "loss": 0.9638, + "step": 8750 + }, + { + "epoch": 7.448407759327282, + "grad_norm": 15570.021484375, + "learning_rate": 0.0003702970297029703, + "loss": 0.9647, + "step": 8760 + }, + { + "epoch": 7.45690780486324, + "grad_norm": 17774.888671875, + "learning_rate": 0.0003693069306930693, + "loss": 0.9651, + "step": 8770 + }, + { + "epoch": 7.465407850399199, + "grad_norm": 19272.056640625, + "learning_rate": 0.00036831683168316834, + "loss": 0.964, + "step": 8780 + }, + { + "epoch": 7.473907895935157, + "grad_norm": 17114.37109375, + "learning_rate": 0.00036732673267326734, + "loss": 0.963, + "step": 8790 + }, + { + "epoch": 7.482407941471115, + "grad_norm": 18956.96484375, + "learning_rate": 0.00036633663366336634, + "loss": 0.9648, + "step": 8800 + }, + { + "epoch": 7.490907987007073, + "grad_norm": 16027.392578125, + "learning_rate": 0.00036534653465346533, + "loss": 0.9646, + "step": 8810 + }, + { + "epoch": 7.499408032543031, + "grad_norm": 16447.48046875, + "learning_rate": 0.00036435643564356433, + "loss": 0.963, + "step": 8820 + }, + { + "epoch": 7.499408032543031, + "eval_accuracy": 0.7934318103664766, + "eval_loss": 0.8747227191925049, + "eval_runtime": 1031.384, + "eval_samples_per_second": 484.182, + "eval_steps_per_second": 0.824, + "step": 8820 + }, + { + "epoch": 7.50790807807899, + "grad_norm": 16898.626953125, + "learning_rate": 0.0003633663366336634, + "loss": 0.9651, + "step": 8830 + }, + { + "epoch": 7.516408123614948, + "grad_norm": 16217.8544921875, + "learning_rate": 0.0003623762376237624, + "loss": 0.9618, + "step": 8840 + }, + { + "epoch": 7.5249081691509065, + "grad_norm": 20356.615234375, + "learning_rate": 0.0003613861386138614, + "loss": 0.9601, + "step": 8850 + }, + { + "epoch": 7.533408214686864, + "grad_norm": 17453.779296875, + "learning_rate": 0.0003603960396039604, + "loss": 0.9627, + "step": 8860 + }, + { + "epoch": 7.541908260222822, + "grad_norm": 14681.1591796875, + "learning_rate": 0.0003594059405940594, + "loss": 0.9616, + "step": 8870 + }, + { + "epoch": 7.550408305758781, + "grad_norm": 16191.0478515625, + "learning_rate": 0.00035841584158415843, + "loss": 0.9623, + "step": 8880 + }, + { + "epoch": 7.558908351294739, + "grad_norm": 15307.25, + "learning_rate": 0.00035742574257425743, + "loss": 0.9604, + "step": 8890 + }, + { + "epoch": 7.5674083968306975, + "grad_norm": 18245.2265625, + "learning_rate": 0.0003564356435643564, + "loss": 0.9619, + "step": 8900 + }, + { + "epoch": 7.575908442366655, + "grad_norm": 16105.3232421875, + "learning_rate": 0.0003554455445544554, + "loss": 0.9643, + "step": 8910 + }, + { + "epoch": 7.575908442366655, + "eval_accuracy": 0.793696329436229, + "eval_loss": 0.8743059039115906, + "eval_runtime": 1035.7771, + "eval_samples_per_second": 482.129, + "eval_steps_per_second": 0.821, + "step": 8910 + }, + { + "epoch": 7.584408487902614, + "grad_norm": 16268.6376953125, + "learning_rate": 0.0003544554455445544, + "loss": 0.9632, + "step": 8920 + }, + { + "epoch": 7.592908533438572, + "grad_norm": 16207.1279296875, + "learning_rate": 0.0003534653465346535, + "loss": 0.9614, + "step": 8930 + }, + { + "epoch": 7.60140857897453, + "grad_norm": 15891.35546875, + "learning_rate": 0.0003524752475247525, + "loss": 0.9623, + "step": 8940 + }, + { + "epoch": 7.6099086245104886, + "grad_norm": 15914.4921875, + "learning_rate": 0.00035148514851485147, + "loss": 0.9609, + "step": 8950 + }, + { + "epoch": 7.618408670046446, + "grad_norm": 15182.478515625, + "learning_rate": 0.00035049504950495047, + "loss": 0.9601, + "step": 8960 + }, + { + "epoch": 7.626908715582405, + "grad_norm": 15736.513671875, + "learning_rate": 0.00034950495049504947, + "loss": 0.9613, + "step": 8970 + }, + { + "epoch": 7.635408761118363, + "grad_norm": 17880.693359375, + "learning_rate": 0.0003485148514851485, + "loss": 0.9606, + "step": 8980 + }, + { + "epoch": 7.643908806654322, + "grad_norm": 15555.0341796875, + "learning_rate": 0.0003475247524752475, + "loss": 0.9618, + "step": 8990 + }, + { + "epoch": 7.6524088521902796, + "grad_norm": 17536.287109375, + "learning_rate": 0.0003465346534653465, + "loss": 0.9606, + "step": 9000 + }, + { + "epoch": 7.6524088521902796, + "eval_accuracy": 0.7939732586317862, + "eval_loss": 0.8728435039520264, + "eval_runtime": 1035.2446, + "eval_samples_per_second": 482.377, + "eval_steps_per_second": 0.821, + "step": 9000 + }, + { + "epoch": 7.660908897726237, + "grad_norm": 16182.7158203125, + "learning_rate": 0.0003455445544554455, + "loss": 0.9605, + "step": 9010 + }, + { + "epoch": 7.669408943262196, + "grad_norm": 17641.724609375, + "learning_rate": 0.0003445544554455445, + "loss": 0.9611, + "step": 9020 + }, + { + "epoch": 7.677908988798154, + "grad_norm": 16540.5625, + "learning_rate": 0.0003435643564356436, + "loss": 0.9596, + "step": 9030 + }, + { + "epoch": 7.686409034334113, + "grad_norm": 15668.515625, + "learning_rate": 0.0003425742574257426, + "loss": 0.9592, + "step": 9040 + }, + { + "epoch": 7.694909079870071, + "grad_norm": 16943.318359375, + "learning_rate": 0.0003415841584158416, + "loss": 0.9619, + "step": 9050 + }, + { + "epoch": 7.703409125406029, + "grad_norm": 15736.9775390625, + "learning_rate": 0.0003405940594059406, + "loss": 0.959, + "step": 9060 + }, + { + "epoch": 7.711909170941987, + "grad_norm": 17073.3203125, + "learning_rate": 0.0003396039603960396, + "loss": 0.9609, + "step": 9070 + }, + { + "epoch": 7.720409216477945, + "grad_norm": 14390.5009765625, + "learning_rate": 0.00033861386138613867, + "loss": 0.9576, + "step": 9080 + }, + { + "epoch": 7.728909262013904, + "grad_norm": 17994.197265625, + "learning_rate": 0.00033762376237623766, + "loss": 0.9576, + "step": 9090 + }, + { + "epoch": 7.728909262013904, + "eval_accuracy": 0.7942283085944538, + "eval_loss": 0.8715931177139282, + "eval_runtime": 1034.9441, + "eval_samples_per_second": 482.517, + "eval_steps_per_second": 0.821, + "step": 9090 + }, + { + "epoch": 7.737409307549862, + "grad_norm": 14687.2646484375, + "learning_rate": 0.00033663366336633666, + "loss": 0.957, + "step": 9100 + }, + { + "epoch": 7.74590935308582, + "grad_norm": 16330.0478515625, + "learning_rate": 0.00033564356435643566, + "loss": 0.9595, + "step": 9110 + }, + { + "epoch": 7.754409398621778, + "grad_norm": 14873.998046875, + "learning_rate": 0.00033465346534653466, + "loss": 0.9596, + "step": 9120 + }, + { + "epoch": 7.762909444157737, + "grad_norm": 14278.1953125, + "learning_rate": 0.0003336633663366337, + "loss": 0.959, + "step": 9130 + }, + { + "epoch": 7.771409489693695, + "grad_norm": 17600.927734375, + "learning_rate": 0.0003326732673267327, + "loss": 0.9573, + "step": 9140 + }, + { + "epoch": 7.779909535229653, + "grad_norm": 15872.1806640625, + "learning_rate": 0.0003316831683168317, + "loss": 0.9573, + "step": 9150 + }, + { + "epoch": 7.788409580765611, + "grad_norm": 19368.9765625, + "learning_rate": 0.0003306930693069307, + "loss": 0.957, + "step": 9160 + }, + { + "epoch": 7.796909626301569, + "grad_norm": 19541.958984375, + "learning_rate": 0.0003297029702970297, + "loss": 0.9581, + "step": 9170 + }, + { + "epoch": 7.805409671837528, + "grad_norm": 16817.328125, + "learning_rate": 0.00032871287128712876, + "loss": 0.957, + "step": 9180 + }, + { + "epoch": 7.805409671837528, + "eval_accuracy": 0.7946937469682237, + "eval_loss": 0.8684272766113281, + "eval_runtime": 1033.8439, + "eval_samples_per_second": 483.03, + "eval_steps_per_second": 0.822, + "step": 9180 + }, + { + "epoch": 7.813909717373486, + "grad_norm": 18241.623046875, + "learning_rate": 0.00032772277227722775, + "loss": 0.9567, + "step": 9190 + }, + { + "epoch": 7.8224097629094445, + "grad_norm": 15321.79296875, + "learning_rate": 0.00032673267326732675, + "loss": 0.9574, + "step": 9200 + }, + { + "epoch": 7.830909808445402, + "grad_norm": 14790.1884765625, + "learning_rate": 0.00032574257425742575, + "loss": 0.9558, + "step": 9210 + }, + { + "epoch": 7.83940985398136, + "grad_norm": 13895.65234375, + "learning_rate": 0.00032475247524752475, + "loss": 0.9567, + "step": 9220 + }, + { + "epoch": 7.847909899517319, + "grad_norm": 17413.087890625, + "learning_rate": 0.0003237623762376238, + "loss": 0.9582, + "step": 9230 + }, + { + "epoch": 7.856409945053277, + "grad_norm": 15125.3447265625, + "learning_rate": 0.0003227722772277228, + "loss": 0.9555, + "step": 9240 + }, + { + "epoch": 7.8649099905892355, + "grad_norm": 15147.716796875, + "learning_rate": 0.0003217821782178218, + "loss": 0.9552, + "step": 9250 + }, + { + "epoch": 7.873410036125193, + "grad_norm": 16691.9765625, + "learning_rate": 0.0003207920792079208, + "loss": 0.9569, + "step": 9260 + }, + { + "epoch": 7.881910081661152, + "grad_norm": 15234.0234375, + "learning_rate": 0.0003198019801980198, + "loss": 0.957, + "step": 9270 + }, + { + "epoch": 7.881910081661152, + "eval_accuracy": 0.7946698774649099, + "eval_loss": 0.8689009547233582, + "eval_runtime": 1037.4044, + "eval_samples_per_second": 481.373, + "eval_steps_per_second": 0.819, + "step": 9270 + }, + { + "epoch": 7.89041012719711, + "grad_norm": 20217.794921875, + "learning_rate": 0.00031881188118811885, + "loss": 0.9558, + "step": 9280 + }, + { + "epoch": 7.898910172733068, + "grad_norm": 16890.482421875, + "learning_rate": 0.00031782178217821784, + "loss": 0.957, + "step": 9290 + }, + { + "epoch": 7.9074102182690265, + "grad_norm": 16310.05078125, + "learning_rate": 0.00031683168316831684, + "loss": 0.9557, + "step": 9300 + }, + { + "epoch": 7.915910263804984, + "grad_norm": 15330.0146484375, + "learning_rate": 0.00031584158415841584, + "loss": 0.9554, + "step": 9310 + }, + { + "epoch": 7.924410309340943, + "grad_norm": 14532.234375, + "learning_rate": 0.00031485148514851484, + "loss": 0.9542, + "step": 9320 + }, + { + "epoch": 7.932910354876901, + "grad_norm": 16035.0615234375, + "learning_rate": 0.0003138613861386139, + "loss": 0.9544, + "step": 9330 + }, + { + "epoch": 7.94141040041286, + "grad_norm": 16156.765625, + "learning_rate": 0.0003128712871287129, + "loss": 0.9549, + "step": 9340 + }, + { + "epoch": 7.9499104459488175, + "grad_norm": 16913.37109375, + "learning_rate": 0.0003118811881188119, + "loss": 0.9537, + "step": 9350 + }, + { + "epoch": 7.958410491484776, + "grad_norm": 16376.814453125, + "learning_rate": 0.0003108910891089109, + "loss": 0.9556, + "step": 9360 + }, + { + "epoch": 7.958410491484776, + "eval_accuracy": 0.7950512170789835, + "eval_loss": 0.8671656847000122, + "eval_runtime": 1035.1375, + "eval_samples_per_second": 482.427, + "eval_steps_per_second": 0.821, + "step": 9360 + }, + { + "epoch": 7.966910537020734, + "grad_norm": 14731.126953125, + "learning_rate": 0.0003099009900990099, + "loss": 0.9529, + "step": 9370 + }, + { + "epoch": 7.975410582556692, + "grad_norm": 16993.232421875, + "learning_rate": 0.00030891089108910894, + "loss": 0.9548, + "step": 9380 + }, + { + "epoch": 7.983910628092651, + "grad_norm": 16157.2880859375, + "learning_rate": 0.00030792079207920793, + "loss": 0.9543, + "step": 9390 + }, + { + "epoch": 7.9924106736286085, + "grad_norm": 17463.8984375, + "learning_rate": 0.00030693069306930693, + "loss": 0.9537, + "step": 9400 + }, + { + "epoch": 8.000910719164567, + "grad_norm": 18573.9765625, + "learning_rate": 0.00030594059405940593, + "loss": 0.9543, + "step": 9410 + }, + { + "epoch": 8.009410764700526, + "grad_norm": 18137.865234375, + "learning_rate": 0.00030495049504950493, + "loss": 0.9529, + "step": 9420 + }, + { + "epoch": 8.017910810236483, + "grad_norm": 17822.484375, + "learning_rate": 0.000303960396039604, + "loss": 0.9528, + "step": 9430 + }, + { + "epoch": 8.026410855772442, + "grad_norm": 16745.134765625, + "learning_rate": 0.000302970297029703, + "loss": 0.9529, + "step": 9440 + }, + { + "epoch": 8.0349109013084, + "grad_norm": 17063.685546875, + "learning_rate": 0.000301980198019802, + "loss": 0.9525, + "step": 9450 + }, + { + "epoch": 8.0349109013084, + "eval_accuracy": 0.7955066441462213, + "eval_loss": 0.8656001687049866, + "eval_runtime": 1034.9789, + "eval_samples_per_second": 482.501, + "eval_steps_per_second": 0.821, + "step": 9450 + }, + { + "epoch": 8.043410946844357, + "grad_norm": 14951.6552734375, + "learning_rate": 0.000300990099009901, + "loss": 0.9539, + "step": 9460 + }, + { + "epoch": 8.051910992380316, + "grad_norm": 18525.021484375, + "learning_rate": 0.0003, + "loss": 0.9529, + "step": 9470 + }, + { + "epoch": 8.060411037916275, + "grad_norm": 16683.45703125, + "learning_rate": 0.000299009900990099, + "loss": 0.9535, + "step": 9480 + }, + { + "epoch": 8.068911083452234, + "grad_norm": 17987.796875, + "learning_rate": 0.000298019801980198, + "loss": 0.9544, + "step": 9490 + }, + { + "epoch": 8.07741112898819, + "grad_norm": 14249.4453125, + "learning_rate": 0.000297029702970297, + "loss": 0.953, + "step": 9500 + }, + { + "epoch": 8.08591117452415, + "grad_norm": 15678.244140625, + "learning_rate": 0.000296039603960396, + "loss": 0.9517, + "step": 9510 + }, + { + "epoch": 8.094411220060108, + "grad_norm": 16945.619140625, + "learning_rate": 0.000295049504950495, + "loss": 0.9518, + "step": 9520 + }, + { + "epoch": 8.102911265596065, + "grad_norm": 16090.876953125, + "learning_rate": 0.00029405940594059407, + "loss": 0.9515, + "step": 9530 + }, + { + "epoch": 8.111411311132024, + "grad_norm": 15359.59375, + "learning_rate": 0.00029306930693069307, + "loss": 0.9509, + "step": 9540 + }, + { + "epoch": 8.111411311132024, + "eval_accuracy": 0.7956073067738585, + "eval_loss": 0.8639572858810425, + "eval_runtime": 1036.0688, + "eval_samples_per_second": 481.993, + "eval_steps_per_second": 0.82, + "step": 9540 + }, + { + "epoch": 8.119911356667982, + "grad_norm": 14432.5576171875, + "learning_rate": 0.00029207920792079207, + "loss": 0.9498, + "step": 9550 + }, + { + "epoch": 8.128411402203941, + "grad_norm": 15058.017578125, + "learning_rate": 0.00029108910891089107, + "loss": 0.9512, + "step": 9560 + }, + { + "epoch": 8.136911447739898, + "grad_norm": 14191.0791015625, + "learning_rate": 0.00029009900990099006, + "loss": 0.9517, + "step": 9570 + }, + { + "epoch": 8.145411493275857, + "grad_norm": 15950.939453125, + "learning_rate": 0.0002891089108910891, + "loss": 0.9506, + "step": 9580 + }, + { + "epoch": 8.153911538811816, + "grad_norm": 17082.619140625, + "learning_rate": 0.0002881188118811881, + "loss": 0.9513, + "step": 9590 + }, + { + "epoch": 8.162411584347772, + "grad_norm": 14874.2890625, + "learning_rate": 0.0002871287128712871, + "loss": 0.9501, + "step": 9600 + }, + { + "epoch": 8.170911629883731, + "grad_norm": 14615.6669921875, + "learning_rate": 0.0002861386138613861, + "loss": 0.951, + "step": 9610 + }, + { + "epoch": 8.17941167541969, + "grad_norm": 15949.583984375, + "learning_rate": 0.0002851485148514851, + "loss": 0.9511, + "step": 9620 + }, + { + "epoch": 8.187911720955649, + "grad_norm": 15501.064453125, + "learning_rate": 0.00028415841584158416, + "loss": 0.9503, + "step": 9630 + }, + { + "epoch": 8.187911720955649, + "eval_accuracy": 0.7959485569423415, + "eval_loss": 0.8628361821174622, + "eval_runtime": 1038.2269, + "eval_samples_per_second": 480.991, + "eval_steps_per_second": 0.819, + "step": 9630 + }, + { + "epoch": 8.196411766491606, + "grad_norm": 14815.34375, + "learning_rate": 0.00028316831683168316, + "loss": 0.9516, + "step": 9640 + }, + { + "epoch": 8.204911812027564, + "grad_norm": 19284.322265625, + "learning_rate": 0.00028217821782178216, + "loss": 0.9508, + "step": 9650 + }, + { + "epoch": 8.213411857563523, + "grad_norm": 15244.177734375, + "learning_rate": 0.0002811881188118812, + "loss": 0.9494, + "step": 9660 + }, + { + "epoch": 8.22191190309948, + "grad_norm": 14576.9482421875, + "learning_rate": 0.0002801980198019802, + "loss": 0.9485, + "step": 9670 + }, + { + "epoch": 8.230411948635439, + "grad_norm": 15386.0654296875, + "learning_rate": 0.00027920792079207926, + "loss": 0.9501, + "step": 9680 + }, + { + "epoch": 8.238911994171398, + "grad_norm": 15885.4248046875, + "learning_rate": 0.00027821782178217826, + "loss": 0.9493, + "step": 9690 + }, + { + "epoch": 8.247412039707356, + "grad_norm": 16190.021484375, + "learning_rate": 0.00027722772277227726, + "loss": 0.9498, + "step": 9700 + }, + { + "epoch": 8.255912085243313, + "grad_norm": 14406.8857421875, + "learning_rate": 0.00027623762376237626, + "loss": 0.9473, + "step": 9710 + }, + { + "epoch": 8.264412130779272, + "grad_norm": 14419.498046875, + "learning_rate": 0.00027524752475247525, + "loss": 0.9499, + "step": 9720 + }, + { + "epoch": 8.264412130779272, + "eval_accuracy": 0.7962577759360605, + "eval_loss": 0.8620018362998962, + "eval_runtime": 1037.7206, + "eval_samples_per_second": 481.226, + "eval_steps_per_second": 0.819, + "step": 9720 + }, + { + "epoch": 8.27291217631523, + "grad_norm": 16101.1533203125, + "learning_rate": 0.0002742574257425743, + "loss": 0.9502, + "step": 9730 + }, + { + "epoch": 8.281412221851188, + "grad_norm": 15671.9033203125, + "learning_rate": 0.0002732673267326733, + "loss": 0.9498, + "step": 9740 + }, + { + "epoch": 8.289912267387146, + "grad_norm": 15215.625, + "learning_rate": 0.0002722772277227723, + "loss": 0.9482, + "step": 9750 + }, + { + "epoch": 8.298412312923105, + "grad_norm": 16016.99609375, + "learning_rate": 0.0002712871287128713, + "loss": 0.9494, + "step": 9760 + }, + { + "epoch": 8.306912358459064, + "grad_norm": 17474.0859375, + "learning_rate": 0.0002702970297029703, + "loss": 0.9497, + "step": 9770 + }, + { + "epoch": 8.31541240399502, + "grad_norm": 13742.4052734375, + "learning_rate": 0.00026930693069306935, + "loss": 0.947, + "step": 9780 + }, + { + "epoch": 8.32391244953098, + "grad_norm": 15790.54296875, + "learning_rate": 0.00026831683168316835, + "loss": 0.9471, + "step": 9790 + }, + { + "epoch": 8.332412495066938, + "grad_norm": 16040.060546875, + "learning_rate": 0.00026732673267326735, + "loss": 0.9489, + "step": 9800 + }, + { + "epoch": 8.340912540602897, + "grad_norm": 14828.8095703125, + "learning_rate": 0.00026633663366336635, + "loss": 0.9482, + "step": 9810 + }, + { + "epoch": 8.340912540602897, + "eval_accuracy": 0.7962955020928217, + "eval_loss": 0.8608699440956116, + "eval_runtime": 1034.0211, + "eval_samples_per_second": 482.948, + "eval_steps_per_second": 0.822, + "step": 9810 + }, + { + "epoch": 8.349412586138854, + "grad_norm": 16481.876953125, + "learning_rate": 0.00026534653465346534, + "loss": 0.9478, + "step": 9820 + }, + { + "epoch": 8.357912631674813, + "grad_norm": 16586.9609375, + "learning_rate": 0.0002643564356435644, + "loss": 0.9491, + "step": 9830 + }, + { + "epoch": 8.366412677210771, + "grad_norm": 16209.767578125, + "learning_rate": 0.0002633663366336634, + "loss": 0.9484, + "step": 9840 + }, + { + "epoch": 8.374912722746728, + "grad_norm": 15675.52734375, + "learning_rate": 0.0002623762376237624, + "loss": 0.9483, + "step": 9850 + }, + { + "epoch": 8.383412768282687, + "grad_norm": 16780.919921875, + "learning_rate": 0.0002613861386138614, + "loss": 0.9466, + "step": 9860 + }, + { + "epoch": 8.391912813818646, + "grad_norm": 14785.990234375, + "learning_rate": 0.0002603960396039604, + "loss": 0.9459, + "step": 9870 + }, + { + "epoch": 8.400412859354603, + "grad_norm": 14482.4794921875, + "learning_rate": 0.00025940594059405944, + "loss": 0.9464, + "step": 9880 + }, + { + "epoch": 8.408912904890562, + "grad_norm": 17260.060546875, + "learning_rate": 0.00025841584158415844, + "loss": 0.947, + "step": 9890 + }, + { + "epoch": 8.41741295042652, + "grad_norm": 15805.9501953125, + "learning_rate": 0.00025742574257425744, + "loss": 0.9454, + "step": 9900 + }, + { + "epoch": 8.41741295042652, + "eval_accuracy": 0.7969292599781366, + "eval_loss": 0.8581969738006592, + "eval_runtime": 1037.2614, + "eval_samples_per_second": 481.439, + "eval_steps_per_second": 0.819, + "step": 9900 + }, + { + "epoch": 8.425912995962479, + "grad_norm": 19391.345703125, + "learning_rate": 0.00025643564356435644, + "loss": 0.9447, + "step": 9910 + }, + { + "epoch": 8.434413041498436, + "grad_norm": 15079.96875, + "learning_rate": 0.00025544554455445543, + "loss": 0.9472, + "step": 9920 + }, + { + "epoch": 8.442913087034395, + "grad_norm": 16290.3056640625, + "learning_rate": 0.0002544554455445545, + "loss": 0.9455, + "step": 9930 + }, + { + "epoch": 8.451413132570353, + "grad_norm": 14815.7783203125, + "learning_rate": 0.0002534653465346535, + "loss": 0.9453, + "step": 9940 + }, + { + "epoch": 8.459913178106312, + "grad_norm": 14300.2734375, + "learning_rate": 0.0002524752475247525, + "loss": 0.9448, + "step": 9950 + }, + { + "epoch": 8.46841322364227, + "grad_norm": 16484.212890625, + "learning_rate": 0.0002514851485148515, + "loss": 0.9471, + "step": 9960 + }, + { + "epoch": 8.476913269178228, + "grad_norm": 14792.8935546875, + "learning_rate": 0.0002504950495049505, + "loss": 0.9459, + "step": 9970 + }, + { + "epoch": 8.485413314714187, + "grad_norm": 15095.5869140625, + "learning_rate": 0.00024950495049504953, + "loss": 0.9439, + "step": 9980 + }, + { + "epoch": 8.493913360250144, + "grad_norm": 15246.3203125, + "learning_rate": 0.00024851485148514853, + "loss": 0.9459, + "step": 9990 + }, + { + "epoch": 8.493913360250144, + "eval_accuracy": 0.7969879777309228, + "eval_loss": 0.8577154278755188, + "eval_runtime": 1035.8486, + "eval_samples_per_second": 482.096, + "eval_steps_per_second": 0.821, + "step": 9990 + }, + { + "epoch": 8.502413405786102, + "grad_norm": 15397.66796875, + "learning_rate": 0.00024752475247524753, + "loss": 0.9443, + "step": 10000 + }, + { + "epoch": 8.510913451322061, + "grad_norm": 15283.72265625, + "learning_rate": 0.0002465346534653465, + "loss": 0.9446, + "step": 10010 + }, + { + "epoch": 8.519413496858018, + "grad_norm": 15016.5234375, + "learning_rate": 0.0002455445544554455, + "loss": 0.9446, + "step": 10020 + }, + { + "epoch": 8.527913542393977, + "grad_norm": 14347.6044921875, + "learning_rate": 0.0002445544554455446, + "loss": 0.945, + "step": 10030 + }, + { + "epoch": 8.536413587929935, + "grad_norm": 17782.130859375, + "learning_rate": 0.00024356435643564357, + "loss": 0.9445, + "step": 10040 + }, + { + "epoch": 8.544913633465894, + "grad_norm": 14884.66796875, + "learning_rate": 0.00024257425742574257, + "loss": 0.9429, + "step": 10050 + }, + { + "epoch": 8.553413679001851, + "grad_norm": 17597.05859375, + "learning_rate": 0.00024158415841584157, + "loss": 0.9443, + "step": 10060 + }, + { + "epoch": 8.56191372453781, + "grad_norm": 17000.009765625, + "learning_rate": 0.0002405940594059406, + "loss": 0.9449, + "step": 10070 + }, + { + "epoch": 8.570413770073769, + "grad_norm": 17303.015625, + "learning_rate": 0.0002396039603960396, + "loss": 0.9444, + "step": 10080 + }, + { + "epoch": 8.570413770073769, + "eval_accuracy": 0.7970984014477712, + "eval_loss": 0.8577408194541931, + "eval_runtime": 1033.5768, + "eval_samples_per_second": 483.155, + "eval_steps_per_second": 0.822, + "step": 10080 + }, + { + "epoch": 8.578913815609727, + "grad_norm": 14883.8759765625, + "learning_rate": 0.00023861386138613862, + "loss": 0.9429, + "step": 10090 + }, + { + "epoch": 8.587413861145684, + "grad_norm": 16271.083984375, + "learning_rate": 0.00023762376237623762, + "loss": 0.9441, + "step": 10100 + }, + { + "epoch": 8.595913906681643, + "grad_norm": 18421.546875, + "learning_rate": 0.00023663366336633662, + "loss": 0.9444, + "step": 10110 + }, + { + "epoch": 8.604413952217602, + "grad_norm": 19686.576171875, + "learning_rate": 0.00023564356435643564, + "loss": 0.9434, + "step": 10120 + }, + { + "epoch": 8.612913997753559, + "grad_norm": 14919.0126953125, + "learning_rate": 0.00023465346534653464, + "loss": 0.9427, + "step": 10130 + }, + { + "epoch": 8.621414043289517, + "grad_norm": 16443.49609375, + "learning_rate": 0.0002336633663366337, + "loss": 0.9412, + "step": 10140 + }, + { + "epoch": 8.629914088825476, + "grad_norm": 14300.4599609375, + "learning_rate": 0.0002326732673267327, + "loss": 0.9436, + "step": 10150 + }, + { + "epoch": 8.638414134361435, + "grad_norm": 17048.576171875, + "learning_rate": 0.0002316831683168317, + "loss": 0.9438, + "step": 10160 + }, + { + "epoch": 8.646914179897392, + "grad_norm": 14095.205078125, + "learning_rate": 0.00023069306930693071, + "loss": 0.9419, + "step": 10170 + }, + { + "epoch": 8.646914179897392, + "eval_accuracy": 0.797700358706027, + "eval_loss": 0.854520857334137, + "eval_runtime": 1038.4284, + "eval_samples_per_second": 480.898, + "eval_steps_per_second": 0.819, + "step": 10170 + }, + { + "epoch": 8.65541422543335, + "grad_norm": 16067.9052734375, + "learning_rate": 0.0002297029702970297, + "loss": 0.9409, + "step": 10180 + }, + { + "epoch": 8.66391427096931, + "grad_norm": 14993.923828125, + "learning_rate": 0.00022871287128712874, + "loss": 0.9421, + "step": 10190 + }, + { + "epoch": 8.672414316505266, + "grad_norm": 14386.7099609375, + "learning_rate": 0.00022772277227722774, + "loss": 0.9428, + "step": 10200 + }, + { + "epoch": 8.680914362041225, + "grad_norm": 15530.26953125, + "learning_rate": 0.00022673267326732673, + "loss": 0.9425, + "step": 10210 + }, + { + "epoch": 8.689414407577184, + "grad_norm": 15528.4541015625, + "learning_rate": 0.00022574257425742576, + "loss": 0.941, + "step": 10220 + }, + { + "epoch": 8.697914453113142, + "grad_norm": 14304.1318359375, + "learning_rate": 0.00022475247524752476, + "loss": 0.9402, + "step": 10230 + }, + { + "epoch": 8.7064144986491, + "grad_norm": 15193.4375, + "learning_rate": 0.00022376237623762378, + "loss": 0.9415, + "step": 10240 + }, + { + "epoch": 8.714914544185058, + "grad_norm": 14937.7109375, + "learning_rate": 0.00022277227722772278, + "loss": 0.9421, + "step": 10250 + }, + { + "epoch": 8.723414589721017, + "grad_norm": 15369.6669921875, + "learning_rate": 0.00022178217821782178, + "loss": 0.9415, + "step": 10260 + }, + { + "epoch": 8.723414589721017, + "eval_accuracy": 0.797684357783119, + "eval_loss": 0.8541524410247803, + "eval_runtime": 1037.1311, + "eval_samples_per_second": 481.499, + "eval_steps_per_second": 0.82, + "step": 10260 + }, + { + "epoch": 8.731914635256974, + "grad_norm": 17404.70703125, + "learning_rate": 0.0002207920792079208, + "loss": 0.9417, + "step": 10270 + }, + { + "epoch": 8.740414680792933, + "grad_norm": 16622.16796875, + "learning_rate": 0.0002198019801980198, + "loss": 0.9409, + "step": 10280 + }, + { + "epoch": 8.748914726328891, + "grad_norm": 13510.8125, + "learning_rate": 0.00021881188118811883, + "loss": 0.9409, + "step": 10290 + }, + { + "epoch": 8.75741477186485, + "grad_norm": 14584.3173828125, + "learning_rate": 0.00021782178217821783, + "loss": 0.9411, + "step": 10300 + }, + { + "epoch": 8.765914817400807, + "grad_norm": 13928.373046875, + "learning_rate": 0.00021683168316831682, + "loss": 0.9408, + "step": 10310 + }, + { + "epoch": 8.774414862936766, + "grad_norm": 13927.751953125, + "learning_rate": 0.00021584158415841585, + "loss": 0.9404, + "step": 10320 + }, + { + "epoch": 8.782914908472724, + "grad_norm": 15996.322265625, + "learning_rate": 0.00021485148514851485, + "loss": 0.9415, + "step": 10330 + }, + { + "epoch": 8.791414954008681, + "grad_norm": 14078.8876953125, + "learning_rate": 0.00021386138613861387, + "loss": 0.9392, + "step": 10340 + }, + { + "epoch": 8.79991499954464, + "grad_norm": 14735.3623046875, + "learning_rate": 0.00021287128712871287, + "loss": 0.9395, + "step": 10350 + }, + { + "epoch": 8.79991499954464, + "eval_accuracy": 0.7978062402609887, + "eval_loss": 0.8535209894180298, + "eval_runtime": 1036.7203, + "eval_samples_per_second": 481.69, + "eval_steps_per_second": 0.82, + "step": 10350 + }, + { + "epoch": 8.808415045080599, + "grad_norm": 14420.7861328125, + "learning_rate": 0.00021188118811881187, + "loss": 0.9412, + "step": 10360 + }, + { + "epoch": 8.816915090616558, + "grad_norm": 14367.3955078125, + "learning_rate": 0.0002108910891089109, + "loss": 0.9382, + "step": 10370 + }, + { + "epoch": 8.825415136152515, + "grad_norm": 14271.1162109375, + "learning_rate": 0.0002099009900990099, + "loss": 0.9406, + "step": 10380 + }, + { + "epoch": 8.833915181688473, + "grad_norm": 14588.7724609375, + "learning_rate": 0.00020891089108910892, + "loss": 0.9397, + "step": 10390 + }, + { + "epoch": 8.842415227224432, + "grad_norm": 13847.20703125, + "learning_rate": 0.00020792079207920792, + "loss": 0.9394, + "step": 10400 + }, + { + "epoch": 8.850915272760389, + "grad_norm": 14702.3779296875, + "learning_rate": 0.00020693069306930691, + "loss": 0.9386, + "step": 10410 + }, + { + "epoch": 8.859415318296348, + "grad_norm": 13664.92578125, + "learning_rate": 0.00020594059405940594, + "loss": 0.939, + "step": 10420 + }, + { + "epoch": 8.867915363832306, + "grad_norm": 14917.166015625, + "learning_rate": 0.00020495049504950494, + "loss": 0.9392, + "step": 10430 + }, + { + "epoch": 8.876415409368265, + "grad_norm": 15233.123046875, + "learning_rate": 0.00020396039603960396, + "loss": 0.9411, + "step": 10440 + }, + { + "epoch": 8.876415409368265, + "eval_accuracy": 0.7982283642493349, + "eval_loss": 0.8512039184570312, + "eval_runtime": 1037.2463, + "eval_samples_per_second": 481.446, + "eval_steps_per_second": 0.819, + "step": 10440 + }, + { + "epoch": 8.884915454904222, + "grad_norm": 14748.51953125, + "learning_rate": 0.000202970297029703, + "loss": 0.9379, + "step": 10450 + }, + { + "epoch": 8.893415500440181, + "grad_norm": 14725.625, + "learning_rate": 0.00020198019801980199, + "loss": 0.9394, + "step": 10460 + }, + { + "epoch": 8.90191554597614, + "grad_norm": 14719.560546875, + "learning_rate": 0.000200990099009901, + "loss": 0.9396, + "step": 10470 + }, + { + "epoch": 8.910415591512097, + "grad_norm": 15252.0546875, + "learning_rate": 0.0002, + "loss": 0.9386, + "step": 10480 + }, + { + "epoch": 8.918915637048055, + "grad_norm": 14756.244140625, + "learning_rate": 0.00019900990099009903, + "loss": 0.9378, + "step": 10490 + }, + { + "epoch": 8.927415682584014, + "grad_norm": 14705.61328125, + "learning_rate": 0.00019801980198019803, + "loss": 0.9377, + "step": 10500 + }, + { + "epoch": 8.935915728119973, + "grad_norm": 14071.0009765625, + "learning_rate": 0.00019702970297029703, + "loss": 0.9394, + "step": 10510 + }, + { + "epoch": 8.94441577365593, + "grad_norm": 13337.140625, + "learning_rate": 0.00019603960396039606, + "loss": 0.9384, + "step": 10520 + }, + { + "epoch": 8.952915819191889, + "grad_norm": 14109.1650390625, + "learning_rate": 0.00019504950495049505, + "loss": 0.9362, + "step": 10530 + }, + { + "epoch": 8.952915819191889, + "eval_accuracy": 0.7982979521046931, + "eval_loss": 0.8517733812332153, + "eval_runtime": 1038.3162, + "eval_samples_per_second": 480.95, + "eval_steps_per_second": 0.819, + "step": 10530 + }, + { + "epoch": 8.961415864727847, + "grad_norm": 17169.69140625, + "learning_rate": 0.00019405940594059408, + "loss": 0.9373, + "step": 10540 + }, + { + "epoch": 8.969915910263804, + "grad_norm": 14149.8935546875, + "learning_rate": 0.00019306930693069308, + "loss": 0.9365, + "step": 10550 + }, + { + "epoch": 8.978415955799763, + "grad_norm": 16067.255859375, + "learning_rate": 0.00019207920792079208, + "loss": 0.9371, + "step": 10560 + }, + { + "epoch": 8.986916001335722, + "grad_norm": 14375.869140625, + "learning_rate": 0.0001910891089108911, + "loss": 0.9375, + "step": 10570 + }, + { + "epoch": 8.99541604687168, + "grad_norm": 14484.7421875, + "learning_rate": 0.0001900990099009901, + "loss": 0.9372, + "step": 10580 + }, + { + "epoch": 9.003916092407637, + "grad_norm": 13759.2275390625, + "learning_rate": 0.00018910891089108913, + "loss": 0.9361, + "step": 10590 + }, + { + "epoch": 9.012416137943596, + "grad_norm": 14114.7119140625, + "learning_rate": 0.00018811881188118812, + "loss": 0.9369, + "step": 10600 + }, + { + "epoch": 9.020916183479555, + "grad_norm": 13548.9287109375, + "learning_rate": 0.00018712871287128712, + "loss": 0.9363, + "step": 10610 + }, + { + "epoch": 9.029416229015512, + "grad_norm": 13845.412109375, + "learning_rate": 0.00018613861386138615, + "loss": 0.9369, + "step": 10620 + }, + { + "epoch": 9.029416229015512, + "eval_accuracy": 0.7986381309887046, + "eval_loss": 0.8504059314727783, + "eval_runtime": 1037.8808, + "eval_samples_per_second": 481.152, + "eval_steps_per_second": 0.819, + "step": 10620 + }, + { + "epoch": 9.03791627455147, + "grad_norm": 14264.1455078125, + "learning_rate": 0.00018514851485148514, + "loss": 0.9371, + "step": 10630 + }, + { + "epoch": 9.04641632008743, + "grad_norm": 12720.0791015625, + "learning_rate": 0.00018415841584158417, + "loss": 0.9351, + "step": 10640 + }, + { + "epoch": 9.054916365623388, + "grad_norm": 14962.2216796875, + "learning_rate": 0.00018316831683168317, + "loss": 0.9347, + "step": 10650 + }, + { + "epoch": 9.063416411159345, + "grad_norm": 14343.6259765625, + "learning_rate": 0.00018217821782178217, + "loss": 0.9366, + "step": 10660 + }, + { + "epoch": 9.071916456695304, + "grad_norm": 15166.927734375, + "learning_rate": 0.0001811881188118812, + "loss": 0.9379, + "step": 10670 + }, + { + "epoch": 9.080416502231262, + "grad_norm": 13523.9873046875, + "learning_rate": 0.0001801980198019802, + "loss": 0.9369, + "step": 10680 + }, + { + "epoch": 9.08891654776722, + "grad_norm": 15057.4638671875, + "learning_rate": 0.00017920792079207922, + "loss": 0.9368, + "step": 10690 + }, + { + "epoch": 9.097416593303178, + "grad_norm": 13766.353515625, + "learning_rate": 0.0001782178217821782, + "loss": 0.935, + "step": 10700 + }, + { + "epoch": 9.105916638839137, + "grad_norm": 15783.3056640625, + "learning_rate": 0.0001772277227722772, + "loss": 0.9365, + "step": 10710 + }, + { + "epoch": 9.105916638839137, + "eval_accuracy": 0.7986280323340061, + "eval_loss": 0.8495949506759644, + "eval_runtime": 1036.1574, + "eval_samples_per_second": 481.952, + "eval_steps_per_second": 0.82, + "step": 10710 + }, + { + "epoch": 9.114416684375096, + "grad_norm": 14088.314453125, + "learning_rate": 0.00017623762376237624, + "loss": 0.9363, + "step": 10720 + }, + { + "epoch": 9.122916729911053, + "grad_norm": 14255.0244140625, + "learning_rate": 0.00017524752475247524, + "loss": 0.9351, + "step": 10730 + }, + { + "epoch": 9.131416775447011, + "grad_norm": 15012.841796875, + "learning_rate": 0.00017425742574257426, + "loss": 0.9341, + "step": 10740 + }, + { + "epoch": 9.13991682098297, + "grad_norm": 15262.3740234375, + "learning_rate": 0.00017326732673267326, + "loss": 0.9342, + "step": 10750 + }, + { + "epoch": 9.148416866518927, + "grad_norm": 14390.744140625, + "learning_rate": 0.00017227722772277226, + "loss": 0.9359, + "step": 10760 + }, + { + "epoch": 9.156916912054886, + "grad_norm": 14124.2373046875, + "learning_rate": 0.0001712871287128713, + "loss": 0.9353, + "step": 10770 + }, + { + "epoch": 9.165416957590844, + "grad_norm": 13361.82421875, + "learning_rate": 0.0001702970297029703, + "loss": 0.9352, + "step": 10780 + }, + { + "epoch": 9.173917003126803, + "grad_norm": 13731.3984375, + "learning_rate": 0.00016930693069306933, + "loss": 0.9345, + "step": 10790 + }, + { + "epoch": 9.18241704866276, + "grad_norm": 13207.9716796875, + "learning_rate": 0.00016831683168316833, + "loss": 0.9336, + "step": 10800 + }, + { + "epoch": 9.18241704866276, + "eval_accuracy": 0.7991505659503615, + "eval_loss": 0.8472453355789185, + "eval_runtime": 1035.1715, + "eval_samples_per_second": 482.411, + "eval_steps_per_second": 0.821, + "step": 10800 + }, + { + "epoch": 9.190917094198719, + "grad_norm": 14880.1533203125, + "learning_rate": 0.00016732673267326733, + "loss": 0.9333, + "step": 10810 + }, + { + "epoch": 9.199417139734678, + "grad_norm": 12751.9345703125, + "learning_rate": 0.00016633663366336635, + "loss": 0.935, + "step": 10820 + }, + { + "epoch": 9.207917185270635, + "grad_norm": 13587.4658203125, + "learning_rate": 0.00016534653465346535, + "loss": 0.9336, + "step": 10830 + }, + { + "epoch": 9.216417230806593, + "grad_norm": 13916.77734375, + "learning_rate": 0.00016435643564356438, + "loss": 0.9348, + "step": 10840 + }, + { + "epoch": 9.224917276342552, + "grad_norm": 14346.2119140625, + "learning_rate": 0.00016336633663366338, + "loss": 0.9328, + "step": 10850 + }, + { + "epoch": 9.23341732187851, + "grad_norm": 11983.17578125, + "learning_rate": 0.00016237623762376237, + "loss": 0.9341, + "step": 10860 + }, + { + "epoch": 9.241917367414468, + "grad_norm": 13424.4794921875, + "learning_rate": 0.0001613861386138614, + "loss": 0.9341, + "step": 10870 + }, + { + "epoch": 9.250417412950426, + "grad_norm": 13093.9267578125, + "learning_rate": 0.0001603960396039604, + "loss": 0.9336, + "step": 10880 + }, + { + "epoch": 9.258917458486385, + "grad_norm": 14450.0888671875, + "learning_rate": 0.00015940594059405942, + "loss": 0.9326, + "step": 10890 + }, + { + "epoch": 9.258917458486385, + "eval_accuracy": 0.7993350474335167, + "eval_loss": 0.846928596496582, + "eval_runtime": 1034.8058, + "eval_samples_per_second": 482.581, + "eval_steps_per_second": 0.821, + "step": 10890 + }, + { + "epoch": 9.267417504022344, + "grad_norm": 12639.7099609375, + "learning_rate": 0.00015841584158415842, + "loss": 0.9331, + "step": 10900 + }, + { + "epoch": 9.2759175495583, + "grad_norm": 14105.6357421875, + "learning_rate": 0.00015742574257425742, + "loss": 0.9323, + "step": 10910 + }, + { + "epoch": 9.28441759509426, + "grad_norm": 13878.3876953125, + "learning_rate": 0.00015643564356435644, + "loss": 0.9319, + "step": 10920 + }, + { + "epoch": 9.292917640630218, + "grad_norm": 14202.0380859375, + "learning_rate": 0.00015544554455445544, + "loss": 0.9309, + "step": 10930 + }, + { + "epoch": 9.301417686166175, + "grad_norm": 12203.2275390625, + "learning_rate": 0.00015445544554455447, + "loss": 0.9323, + "step": 10940 + }, + { + "epoch": 9.309917731702134, + "grad_norm": 12799.16015625, + "learning_rate": 0.00015346534653465347, + "loss": 0.933, + "step": 10950 + }, + { + "epoch": 9.318417777238093, + "grad_norm": 14222.4921875, + "learning_rate": 0.00015247524752475246, + "loss": 0.9335, + "step": 10960 + }, + { + "epoch": 9.32691782277405, + "grad_norm": 13150.4560546875, + "learning_rate": 0.0001514851485148515, + "loss": 0.9334, + "step": 10970 + }, + { + "epoch": 9.335417868310008, + "grad_norm": 13035.662109375, + "learning_rate": 0.0001504950495049505, + "loss": 0.9336, + "step": 10980 + }, + { + "epoch": 9.335417868310008, + "eval_accuracy": 0.7995125837200675, + "eval_loss": 0.8457638621330261, + "eval_runtime": 1036.0457, + "eval_samples_per_second": 482.004, + "eval_steps_per_second": 0.82, + "step": 10980 + }, + { + "epoch": 9.343917913845967, + "grad_norm": 13063.5947265625, + "learning_rate": 0.0001495049504950495, + "loss": 0.9358, + "step": 10990 + }, + { + "epoch": 9.352417959381926, + "grad_norm": 13189.0205078125, + "learning_rate": 0.0001485148514851485, + "loss": 0.9314, + "step": 11000 + }, + { + "epoch": 9.360918004917883, + "grad_norm": 14116.0537109375, + "learning_rate": 0.0001475247524752475, + "loss": 0.9322, + "step": 11010 + }, + { + "epoch": 9.369418050453842, + "grad_norm": 13098.5810546875, + "learning_rate": 0.00014653465346534653, + "loss": 0.9306, + "step": 11020 + }, + { + "epoch": 9.3779180959898, + "grad_norm": 12630.921875, + "learning_rate": 0.00014554455445544553, + "loss": 0.9294, + "step": 11030 + }, + { + "epoch": 9.386418141525759, + "grad_norm": 12936.2509765625, + "learning_rate": 0.00014455445544554456, + "loss": 0.9323, + "step": 11040 + }, + { + "epoch": 9.394918187061716, + "grad_norm": 13526.9853515625, + "learning_rate": 0.00014356435643564356, + "loss": 0.9311, + "step": 11050 + }, + { + "epoch": 9.403418232597675, + "grad_norm": 13259.3603515625, + "learning_rate": 0.00014257425742574255, + "loss": 0.9332, + "step": 11060 + }, + { + "epoch": 9.411918278133633, + "grad_norm": 14287.5361328125, + "learning_rate": 0.00014158415841584158, + "loss": 0.9321, + "step": 11070 + }, + { + "epoch": 9.411918278133633, + "eval_accuracy": 0.7996804125615878, + "eval_loss": 0.8447943329811096, + "eval_runtime": 1035.7879, + "eval_samples_per_second": 482.124, + "eval_steps_per_second": 0.821, + "step": 11070 + }, + { + "epoch": 9.42041832366959, + "grad_norm": 12443.166015625, + "learning_rate": 0.0001405940594059406, + "loss": 0.9303, + "step": 11080 + }, + { + "epoch": 9.42891836920555, + "grad_norm": 13510.2119140625, + "learning_rate": 0.00013960396039603963, + "loss": 0.9298, + "step": 11090 + }, + { + "epoch": 9.437418414741508, + "grad_norm": 11732.4970703125, + "learning_rate": 0.00013861386138613863, + "loss": 0.9314, + "step": 11100 + }, + { + "epoch": 9.445918460277467, + "grad_norm": 13648.421875, + "learning_rate": 0.00013762376237623763, + "loss": 0.9318, + "step": 11110 + }, + { + "epoch": 9.454418505813424, + "grad_norm": 12602.56640625, + "learning_rate": 0.00013663366336633665, + "loss": 0.9318, + "step": 11120 + }, + { + "epoch": 9.462918551349382, + "grad_norm": 13816.3720703125, + "learning_rate": 0.00013564356435643565, + "loss": 0.9319, + "step": 11130 + }, + { + "epoch": 9.471418596885341, + "grad_norm": 14125.748046875, + "learning_rate": 0.00013465346534653468, + "loss": 0.9297, + "step": 11140 + }, + { + "epoch": 9.479918642421298, + "grad_norm": 13495.037109375, + "learning_rate": 0.00013366336633663367, + "loss": 0.9314, + "step": 11150 + }, + { + "epoch": 9.488418687957257, + "grad_norm": 12772.404296875, + "learning_rate": 0.00013267326732673267, + "loss": 0.9283, + "step": 11160 + }, + { + "epoch": 9.488418687957257, + "eval_accuracy": 0.799732345307766, + "eval_loss": 0.8448570370674133, + "eval_runtime": 1038.7893, + "eval_samples_per_second": 480.731, + "eval_steps_per_second": 0.818, + "step": 11160 + }, + { + "epoch": 9.496918733493215, + "grad_norm": 13424.62890625, + "learning_rate": 0.0001316831683168317, + "loss": 0.9297, + "step": 11170 + }, + { + "epoch": 9.505418779029174, + "grad_norm": 14624.0283203125, + "learning_rate": 0.0001306930693069307, + "loss": 0.9295, + "step": 11180 + }, + { + "epoch": 9.513918824565131, + "grad_norm": 12779.9169921875, + "learning_rate": 0.00012970297029702972, + "loss": 0.9322, + "step": 11190 + }, + { + "epoch": 9.52241887010109, + "grad_norm": 13789.7421875, + "learning_rate": 0.00012871287128712872, + "loss": 0.9312, + "step": 11200 + }, + { + "epoch": 9.530918915637049, + "grad_norm": 12679.0703125, + "learning_rate": 0.00012772277227722772, + "loss": 0.9289, + "step": 11210 + }, + { + "epoch": 9.539418961173006, + "grad_norm": 14479.2919921875, + "learning_rate": 0.00012673267326732674, + "loss": 0.931, + "step": 11220 + }, + { + "epoch": 9.547919006708964, + "grad_norm": 12816.5322265625, + "learning_rate": 0.00012574257425742574, + "loss": 0.9304, + "step": 11230 + }, + { + "epoch": 9.556419052244923, + "grad_norm": 13485.4990234375, + "learning_rate": 0.00012475247524752477, + "loss": 0.928, + "step": 11240 + }, + { + "epoch": 9.564919097780882, + "grad_norm": 13575.083984375, + "learning_rate": 0.00012376237623762376, + "loss": 0.929, + "step": 11250 + }, + { + "epoch": 9.564919097780882, + "eval_accuracy": 0.7999083549113648, + "eval_loss": 0.8442253470420837, + "eval_runtime": 1035.3058, + "eval_samples_per_second": 482.348, + "eval_steps_per_second": 0.821, + "step": 11250 + }, + { + "epoch": 9.573419143316839, + "grad_norm": 13030.95703125, + "learning_rate": 0.00012277227722772276, + "loss": 0.9303, + "step": 11260 + }, + { + "epoch": 9.581919188852797, + "grad_norm": 12883.271484375, + "learning_rate": 0.00012178217821782179, + "loss": 0.9304, + "step": 11270 + }, + { + "epoch": 9.590419234388756, + "grad_norm": 12350.5322265625, + "learning_rate": 0.00012079207920792079, + "loss": 0.9298, + "step": 11280 + }, + { + "epoch": 9.598919279924713, + "grad_norm": 12253.1220703125, + "learning_rate": 0.0001198019801980198, + "loss": 0.9301, + "step": 11290 + }, + { + "epoch": 9.607419325460672, + "grad_norm": 13372.912109375, + "learning_rate": 0.00011881188118811881, + "loss": 0.9298, + "step": 11300 + }, + { + "epoch": 9.61591937099663, + "grad_norm": 12150.8623046875, + "learning_rate": 0.00011782178217821782, + "loss": 0.9276, + "step": 11310 + }, + { + "epoch": 9.62441941653259, + "grad_norm": 12663.35546875, + "learning_rate": 0.00011683168316831685, + "loss": 0.9305, + "step": 11320 + }, + { + "epoch": 9.632919462068546, + "grad_norm": 12773.25, + "learning_rate": 0.00011584158415841584, + "loss": 0.929, + "step": 11330 + }, + { + "epoch": 9.641419507604505, + "grad_norm": 12251.90234375, + "learning_rate": 0.00011485148514851486, + "loss": 0.9282, + "step": 11340 + }, + { + "epoch": 9.641419507604505, + "eval_accuracy": 0.8002882889018181, + "eval_loss": 0.842271089553833, + "eval_runtime": 1038.9797, + "eval_samples_per_second": 480.643, + "eval_steps_per_second": 0.818, + "step": 11340 + }, + { + "epoch": 9.649919553140464, + "grad_norm": 12380.44140625, + "learning_rate": 0.00011386138613861387, + "loss": 0.9275, + "step": 11350 + }, + { + "epoch": 9.65841959867642, + "grad_norm": 12090.271484375, + "learning_rate": 0.00011287128712871288, + "loss": 0.9275, + "step": 11360 + }, + { + "epoch": 9.66691964421238, + "grad_norm": 13393.486328125, + "learning_rate": 0.00011188118811881189, + "loss": 0.9293, + "step": 11370 + }, + { + "epoch": 9.675419689748338, + "grad_norm": 11574.048828125, + "learning_rate": 0.00011089108910891089, + "loss": 0.9269, + "step": 11380 + }, + { + "epoch": 9.683919735284297, + "grad_norm": 13117.8916015625, + "learning_rate": 0.0001099009900990099, + "loss": 0.928, + "step": 11390 + }, + { + "epoch": 9.692419780820254, + "grad_norm": 13132.0810546875, + "learning_rate": 0.00010891089108910891, + "loss": 0.9272, + "step": 11400 + }, + { + "epoch": 9.700919826356213, + "grad_norm": 13526.349609375, + "learning_rate": 0.00010792079207920792, + "loss": 0.9271, + "step": 11410 + }, + { + "epoch": 9.709419871892171, + "grad_norm": 13662.8740234375, + "learning_rate": 0.00010693069306930694, + "loss": 0.9261, + "step": 11420 + }, + { + "epoch": 9.717919917428128, + "grad_norm": 12285.236328125, + "learning_rate": 0.00010594059405940593, + "loss": 0.9271, + "step": 11430 + }, + { + "epoch": 9.717919917428128, + "eval_accuracy": 0.8004033551877789, + "eval_loss": 0.8413528800010681, + "eval_runtime": 1034.3691, + "eval_samples_per_second": 482.785, + "eval_steps_per_second": 0.822, + "step": 11430 + }, + { + "epoch": 9.726419962964087, + "grad_norm": 12984.609375, + "learning_rate": 0.00010495049504950495, + "loss": 0.9272, + "step": 11440 + }, + { + "epoch": 9.734920008500046, + "grad_norm": 13035.4326171875, + "learning_rate": 0.00010396039603960396, + "loss": 0.927, + "step": 11450 + }, + { + "epoch": 9.743420054036005, + "grad_norm": 12569.7998046875, + "learning_rate": 0.00010297029702970297, + "loss": 0.9276, + "step": 11460 + }, + { + "epoch": 9.751920099571961, + "grad_norm": 13077.8876953125, + "learning_rate": 0.00010198019801980198, + "loss": 0.9271, + "step": 11470 + }, + { + "epoch": 9.76042014510792, + "grad_norm": 12659.830078125, + "learning_rate": 0.00010099009900990099, + "loss": 0.9276, + "step": 11480 + }, + { + "epoch": 9.768920190643879, + "grad_norm": 13212.763671875, + "learning_rate": 0.0001, + "loss": 0.9268, + "step": 11490 + }, + { + "epoch": 9.777420236179836, + "grad_norm": 12120.390625, + "learning_rate": 9.900990099009902e-05, + "loss": 0.9267, + "step": 11500 + }, + { + "epoch": 9.785920281715795, + "grad_norm": 12504.2646484375, + "learning_rate": 9.801980198019803e-05, + "loss": 0.9281, + "step": 11510 + }, + { + "epoch": 9.794420327251753, + "grad_norm": 12107.5166015625, + "learning_rate": 9.702970297029704e-05, + "loss": 0.9264, + "step": 11520 + }, + { + "epoch": 9.794420327251753, + "eval_accuracy": 0.800677157520577, + "eval_loss": 0.840539276599884, + "eval_runtime": 1035.1246, + "eval_samples_per_second": 482.433, + "eval_steps_per_second": 0.821, + "step": 11520 + }, + { + "epoch": 9.802920372787712, + "grad_norm": 13302.0888671875, + "learning_rate": 9.603960396039604e-05, + "loss": 0.9249, + "step": 11530 + }, + { + "epoch": 9.811420418323669, + "grad_norm": 11464.931640625, + "learning_rate": 9.504950495049505e-05, + "loss": 0.928, + "step": 11540 + }, + { + "epoch": 9.819920463859628, + "grad_norm": 11620.611328125, + "learning_rate": 9.405940594059406e-05, + "loss": 0.9269, + "step": 11550 + }, + { + "epoch": 9.828420509395587, + "grad_norm": 12684.673828125, + "learning_rate": 9.306930693069307e-05, + "loss": 0.9264, + "step": 11560 + }, + { + "epoch": 9.836920554931543, + "grad_norm": 12113.302734375, + "learning_rate": 9.207920792079209e-05, + "loss": 0.9239, + "step": 11570 + }, + { + "epoch": 9.845420600467502, + "grad_norm": 12095.48828125, + "learning_rate": 9.108910891089108e-05, + "loss": 0.9262, + "step": 11580 + }, + { + "epoch": 9.853920646003461, + "grad_norm": 12388.09765625, + "learning_rate": 9.00990099009901e-05, + "loss": 0.9241, + "step": 11590 + }, + { + "epoch": 9.86242069153942, + "grad_norm": 12502.1171875, + "learning_rate": 8.91089108910891e-05, + "loss": 0.9256, + "step": 11600 + }, + { + "epoch": 9.870920737075377, + "grad_norm": 12518.240234375, + "learning_rate": 8.811881188118812e-05, + "loss": 0.9268, + "step": 11610 + }, + { + "epoch": 9.870920737075377, + "eval_accuracy": 0.800816669053509, + "eval_loss": 0.8393772840499878, + "eval_runtime": 1035.7736, + "eval_samples_per_second": 482.13, + "eval_steps_per_second": 0.821, + "step": 11610 + }, + { + "epoch": 9.879420782611335, + "grad_norm": 11850.7548828125, + "learning_rate": 8.712871287128713e-05, + "loss": 0.9257, + "step": 11620 + }, + { + "epoch": 9.887920828147294, + "grad_norm": 12279.2197265625, + "learning_rate": 8.613861386138613e-05, + "loss": 0.9251, + "step": 11630 + }, + { + "epoch": 9.896420873683251, + "grad_norm": 12372.4765625, + "learning_rate": 8.514851485148515e-05, + "loss": 0.9241, + "step": 11640 + }, + { + "epoch": 9.90492091921921, + "grad_norm": 11712.88671875, + "learning_rate": 8.415841584158417e-05, + "loss": 0.9233, + "step": 11650 + }, + { + "epoch": 9.913420964755169, + "grad_norm": 12502.9453125, + "learning_rate": 8.316831683168318e-05, + "loss": 0.9252, + "step": 11660 + }, + { + "epoch": 9.921921010291127, + "grad_norm": 13177.66796875, + "learning_rate": 8.217821782178219e-05, + "loss": 0.9237, + "step": 11670 + }, + { + "epoch": 9.930421055827084, + "grad_norm": 12558.2802734375, + "learning_rate": 8.118811881188119e-05, + "loss": 0.9251, + "step": 11680 + }, + { + "epoch": 9.938921101363043, + "grad_norm": 11745.330078125, + "learning_rate": 8.01980198019802e-05, + "loss": 0.9264, + "step": 11690 + }, + { + "epoch": 9.947421146899002, + "grad_norm": 11350.08203125, + "learning_rate": 7.920792079207921e-05, + "loss": 0.924, + "step": 11700 + }, + { + "epoch": 9.947421146899002, + "eval_accuracy": 0.8009528506484064, + "eval_loss": 0.8385128974914551, + "eval_runtime": 1035.7833, + "eval_samples_per_second": 482.126, + "eval_steps_per_second": 0.821, + "step": 11700 + }, + { + "epoch": 9.955921192434959, + "grad_norm": 11514.369140625, + "learning_rate": 7.821782178217822e-05, + "loss": 0.9236, + "step": 11710 + }, + { + "epoch": 9.964421237970917, + "grad_norm": 11468.5146484375, + "learning_rate": 7.722772277227723e-05, + "loss": 0.9226, + "step": 11720 + }, + { + "epoch": 9.972921283506876, + "grad_norm": 11347.8369140625, + "learning_rate": 7.623762376237623e-05, + "loss": 0.9244, + "step": 11730 + }, + { + "epoch": 9.981421329042835, + "grad_norm": 11171.818359375, + "learning_rate": 7.524752475247524e-05, + "loss": 0.9247, + "step": 11740 + }, + { + "epoch": 9.989921374578792, + "grad_norm": 11165.7451171875, + "learning_rate": 7.425742574257426e-05, + "loss": 0.926, + "step": 11750 + }, + { + "epoch": 9.99842142011475, + "grad_norm": 11604.375, + "learning_rate": 7.326732673267327e-05, + "loss": 0.9237, + "step": 11760 + }, + { + "epoch": 10.00692146565071, + "grad_norm": 11761.7099609375, + "learning_rate": 7.227722772277228e-05, + "loss": 0.9245, + "step": 11770 + }, + { + "epoch": 10.015421511186666, + "grad_norm": 11363.318359375, + "learning_rate": 7.128712871287128e-05, + "loss": 0.9242, + "step": 11780 + }, + { + "epoch": 10.023921556722625, + "grad_norm": 11492.59765625, + "learning_rate": 7.02970297029703e-05, + "loss": 0.9243, + "step": 11790 + }, + { + "epoch": 10.023921556722625, + "eval_accuracy": 0.8013337325505296, + "eval_loss": 0.8371462225914001, + "eval_runtime": 1036.3502, + "eval_samples_per_second": 481.862, + "eval_steps_per_second": 0.82, + "step": 11790 + }, + { + "epoch": 10.032421602258584, + "grad_norm": 11367.05859375, + "learning_rate": 6.930693069306931e-05, + "loss": 0.9241, + "step": 11800 + }, + { + "epoch": 10.040921647794542, + "grad_norm": 11473.013671875, + "learning_rate": 6.831683168316833e-05, + "loss": 0.9223, + "step": 11810 + }, + { + "epoch": 10.0494216933305, + "grad_norm": 11349.96484375, + "learning_rate": 6.732673267326734e-05, + "loss": 0.9225, + "step": 11820 + }, + { + "epoch": 10.057921738866458, + "grad_norm": 11236.7431640625, + "learning_rate": 6.633663366336634e-05, + "loss": 0.9236, + "step": 11830 + }, + { + "epoch": 10.066421784402417, + "grad_norm": 11882.5078125, + "learning_rate": 6.534653465346535e-05, + "loss": 0.9227, + "step": 11840 + }, + { + "epoch": 10.074921829938374, + "grad_norm": 11027.5966796875, + "learning_rate": 6.435643564356436e-05, + "loss": 0.9212, + "step": 11850 + }, + { + "epoch": 10.083421875474333, + "grad_norm": 11009.87890625, + "learning_rate": 6.336633663366337e-05, + "loss": 0.9235, + "step": 11860 + }, + { + "epoch": 10.091921921010291, + "grad_norm": 10943.912109375, + "learning_rate": 6.237623762376238e-05, + "loss": 0.9236, + "step": 11870 + }, + { + "epoch": 10.10042196654625, + "grad_norm": 11583.521484375, + "learning_rate": 6.138613861386138e-05, + "loss": 0.9251, + "step": 11880 + }, + { + "epoch": 10.10042196654625, + "eval_accuracy": 0.8014076660668283, + "eval_loss": 0.8366426825523376, + "eval_runtime": 1035.5282, + "eval_samples_per_second": 482.245, + "eval_steps_per_second": 0.821, + "step": 11880 + }, + { + "epoch": 10.108922012082207, + "grad_norm": 10507.802734375, + "learning_rate": 6.039603960396039e-05, + "loss": 0.9218, + "step": 11890 + }, + { + "epoch": 10.117422057618166, + "grad_norm": 12043.7763671875, + "learning_rate": 5.9405940594059404e-05, + "loss": 0.9213, + "step": 11900 + }, + { + "epoch": 10.125922103154124, + "grad_norm": 10901.1728515625, + "learning_rate": 5.841584158415842e-05, + "loss": 0.92, + "step": 11910 + }, + { + "epoch": 10.134422148690081, + "grad_norm": 11496.796875, + "learning_rate": 5.742574257425743e-05, + "loss": 0.922, + "step": 11920 + }, + { + "epoch": 10.14292219422604, + "grad_norm": 11207.1455078125, + "learning_rate": 5.643564356435644e-05, + "loss": 0.9207, + "step": 11930 + }, + { + "epoch": 10.151422239761999, + "grad_norm": 10568.88671875, + "learning_rate": 5.5445544554455445e-05, + "loss": 0.923, + "step": 11940 + }, + { + "epoch": 10.159922285297958, + "grad_norm": 11236.5009765625, + "learning_rate": 5.4455445544554456e-05, + "loss": 0.9226, + "step": 11950 + }, + { + "epoch": 10.168422330833915, + "grad_norm": 10793.302734375, + "learning_rate": 5.346534653465347e-05, + "loss": 0.921, + "step": 11960 + }, + { + "epoch": 10.176922376369873, + "grad_norm": 10562.205078125, + "learning_rate": 5.247524752475247e-05, + "loss": 0.9211, + "step": 11970 + }, + { + "epoch": 10.176922376369873, + "eval_accuracy": 0.8014045174418934, + "eval_loss": 0.836881697177887, + "eval_runtime": 1035.287, + "eval_samples_per_second": 482.357, + "eval_steps_per_second": 0.821, + "step": 11970 + }, + { + "epoch": 10.185422421905832, + "grad_norm": 10615.4091796875, + "learning_rate": 5.1485148514851485e-05, + "loss": 0.9216, + "step": 11980 + }, + { + "epoch": 10.19392246744179, + "grad_norm": 11039.455078125, + "learning_rate": 5.0495049504950497e-05, + "loss": 0.9193, + "step": 11990 + }, + { + "epoch": 10.202422512977748, + "grad_norm": 11875.810546875, + "learning_rate": 4.950495049504951e-05, + "loss": 0.9221, + "step": 12000 + }, + { + "epoch": 10.210922558513706, + "grad_norm": 10764.8984375, + "learning_rate": 4.851485148514852e-05, + "loss": 0.922, + "step": 12010 + }, + { + "epoch": 10.219422604049665, + "grad_norm": 10610.0224609375, + "learning_rate": 4.7524752475247525e-05, + "loss": 0.9225, + "step": 12020 + }, + { + "epoch": 10.227922649585622, + "grad_norm": 10679.6396484375, + "learning_rate": 4.653465346534654e-05, + "loss": 0.921, + "step": 12030 + }, + { + "epoch": 10.23642269512158, + "grad_norm": 10098.7451171875, + "learning_rate": 4.554455445544554e-05, + "loss": 0.9227, + "step": 12040 + }, + { + "epoch": 10.24492274065754, + "grad_norm": 11032.216796875, + "learning_rate": 4.455445544554455e-05, + "loss": 0.9223, + "step": 12050 + }, + { + "epoch": 10.253422786193497, + "grad_norm": 11303.1201171875, + "learning_rate": 4.3564356435643565e-05, + "loss": 0.9223, + "step": 12060 + }, + { + "epoch": 10.253422786193497, + "eval_accuracy": 0.8015698536327949, + "eval_loss": 0.8357640504837036, + "eval_runtime": 1032.412, + "eval_samples_per_second": 483.7, + "eval_steps_per_second": 0.823, + "step": 12060 + }, + { + "epoch": 10.261922831729455, + "grad_norm": 10582.2373046875, + "learning_rate": 4.257425742574258e-05, + "loss": 0.9192, + "step": 12070 + }, + { + "epoch": 10.270422877265414, + "grad_norm": 10299.9345703125, + "learning_rate": 4.158415841584159e-05, + "loss": 0.9209, + "step": 12080 + }, + { + "epoch": 10.278922922801373, + "grad_norm": 10322.740234375, + "learning_rate": 4.0594059405940594e-05, + "loss": 0.922, + "step": 12090 + }, + { + "epoch": 10.28742296833733, + "grad_norm": 10159.0244140625, + "learning_rate": 3.9603960396039605e-05, + "loss": 0.9189, + "step": 12100 + }, + { + "epoch": 10.295923013873288, + "grad_norm": 10132.728515625, + "learning_rate": 3.861386138613862e-05, + "loss": 0.9209, + "step": 12110 + }, + { + "epoch": 10.304423059409247, + "grad_norm": 10718.8837890625, + "learning_rate": 3.762376237623762e-05, + "loss": 0.9208, + "step": 12120 + }, + { + "epoch": 10.312923104945206, + "grad_norm": 10195.5869140625, + "learning_rate": 3.6633663366336634e-05, + "loss": 0.9225, + "step": 12130 + }, + { + "epoch": 10.321423150481163, + "grad_norm": 10577.4375, + "learning_rate": 3.564356435643564e-05, + "loss": 0.9204, + "step": 12140 + }, + { + "epoch": 10.329923196017122, + "grad_norm": 10045.123046875, + "learning_rate": 3.465346534653466e-05, + "loss": 0.9182, + "step": 12150 + }, + { + "epoch": 10.329923196017122, + "eval_accuracy": 0.8016586056285943, + "eval_loss": 0.835513710975647, + "eval_runtime": 1039.9403, + "eval_samples_per_second": 480.199, + "eval_steps_per_second": 0.817, + "step": 12150 + }, + { + "epoch": 10.33842324155308, + "grad_norm": 10554.9501953125, + "learning_rate": 3.366336633663367e-05, + "loss": 0.9219, + "step": 12160 + }, + { + "epoch": 10.346923287089037, + "grad_norm": 10395.685546875, + "learning_rate": 3.2673267326732674e-05, + "loss": 0.9189, + "step": 12170 + }, + { + "epoch": 10.355423332624996, + "grad_norm": 10223.9580078125, + "learning_rate": 3.1683168316831686e-05, + "loss": 0.9205, + "step": 12180 + }, + { + "epoch": 10.363923378160955, + "grad_norm": 10570.7958984375, + "learning_rate": 3.069306930693069e-05, + "loss": 0.9207, + "step": 12190 + }, + { + "epoch": 10.372423423696914, + "grad_norm": 9892.7470703125, + "learning_rate": 2.9702970297029702e-05, + "loss": 0.918, + "step": 12200 + }, + { + "epoch": 10.38092346923287, + "grad_norm": 9921.88671875, + "learning_rate": 2.8712871287128714e-05, + "loss": 0.9196, + "step": 12210 + }, + { + "epoch": 10.38942351476883, + "grad_norm": 9997.732421875, + "learning_rate": 2.7722772277227722e-05, + "loss": 0.9217, + "step": 12220 + }, + { + "epoch": 10.397923560304788, + "grad_norm": 10292.541015625, + "learning_rate": 2.6732673267326734e-05, + "loss": 0.9195, + "step": 12230 + }, + { + "epoch": 10.406423605840745, + "grad_norm": 10017.322265625, + "learning_rate": 2.5742574257425742e-05, + "loss": 0.9215, + "step": 12240 + }, + { + "epoch": 10.406423605840745, + "eval_accuracy": 0.8019068191825514, + "eval_loss": 0.83404940366745, + "eval_runtime": 1036.3785, + "eval_samples_per_second": 481.849, + "eval_steps_per_second": 0.82, + "step": 12240 + }, + { + "epoch": 10.414923651376704, + "grad_norm": 10193.8349609375, + "learning_rate": 2.4752475247524754e-05, + "loss": 0.9206, + "step": 12250 + }, + { + "epoch": 10.423423696912662, + "grad_norm": 10791.517578125, + "learning_rate": 2.3762376237623762e-05, + "loss": 0.9192, + "step": 12260 + }, + { + "epoch": 10.431923742448621, + "grad_norm": 9776.1796875, + "learning_rate": 2.277227722772277e-05, + "loss": 0.9183, + "step": 12270 + }, + { + "epoch": 10.440423787984578, + "grad_norm": 9862.8271484375, + "learning_rate": 2.1782178217821783e-05, + "loss": 0.9194, + "step": 12280 + }, + { + "epoch": 10.448923833520537, + "grad_norm": 10056.2607421875, + "learning_rate": 2.0792079207920794e-05, + "loss": 0.9207, + "step": 12290 + }, + { + "epoch": 10.457423879056496, + "grad_norm": 9770.7578125, + "learning_rate": 1.9801980198019803e-05, + "loss": 0.9198, + "step": 12300 + }, + { + "epoch": 10.465923924592452, + "grad_norm": 10069.5634765625, + "learning_rate": 1.881188118811881e-05, + "loss": 0.9201, + "step": 12310 + }, + { + "epoch": 10.474423970128411, + "grad_norm": 9465.1181640625, + "learning_rate": 1.782178217821782e-05, + "loss": 0.9197, + "step": 12320 + }, + { + "epoch": 10.48292401566437, + "grad_norm": 9807.8330078125, + "learning_rate": 1.6831683168316834e-05, + "loss": 0.9179, + "step": 12330 + }, + { + "epoch": 10.48292401566437, + "eval_accuracy": 0.8019292754647317, + "eval_loss": 0.8340857625007629, + "eval_runtime": 1037.0091, + "eval_samples_per_second": 481.556, + "eval_steps_per_second": 0.82, + "step": 12330 + }, + { + "epoch": 10.491424061200329, + "grad_norm": 9650.10546875, + "learning_rate": 1.5841584158415843e-05, + "loss": 0.9205, + "step": 12340 + }, + { + "epoch": 10.499924106736286, + "grad_norm": 9682.900390625, + "learning_rate": 1.4851485148514851e-05, + "loss": 0.9195, + "step": 12350 + }, + { + "epoch": 10.508424152272244, + "grad_norm": 9976.9375, + "learning_rate": 1.3861386138613861e-05, + "loss": 0.9191, + "step": 12360 + }, + { + "epoch": 10.516924197808203, + "grad_norm": 9428.4052734375, + "learning_rate": 1.2871287128712871e-05, + "loss": 0.9189, + "step": 12370 + }, + { + "epoch": 10.52542424334416, + "grad_norm": 10025.7158203125, + "learning_rate": 1.1881188118811881e-05, + "loss": 0.918, + "step": 12380 + }, + { + "epoch": 10.533924288880119, + "grad_norm": 9750.935546875, + "learning_rate": 1.0891089108910891e-05, + "loss": 0.9187, + "step": 12390 + }, + { + "epoch": 10.542424334416078, + "grad_norm": 9010.52734375, + "learning_rate": 9.900990099009901e-06, + "loss": 0.9189, + "step": 12400 + }, + { + "epoch": 10.550924379952036, + "grad_norm": 9163.4169921875, + "learning_rate": 8.91089108910891e-06, + "loss": 0.9191, + "step": 12410 + }, + { + "epoch": 10.559424425487993, + "grad_norm": 9015.5771484375, + "learning_rate": 7.920792079207921e-06, + "loss": 0.9195, + "step": 12420 + }, + { + "epoch": 10.559424425487993, + "eval_accuracy": 0.802042445850802, + "eval_loss": 0.833830714225769, + "eval_runtime": 1033.7642, + "eval_samples_per_second": 483.068, + "eval_steps_per_second": 0.822, + "step": 12420 + } + ], + "logging_steps": 10, + "max_steps": 12500, + "num_input_tokens_seen": 0, + "num_train_epochs": 11, + "save_steps": 90, + "total_flos": 7.8411669992781e+18, + "train_batch_size": 288, + "trial_name": null, + "trial_params": null +}