{ "best_metric": 0.8174300254452926, "best_model_checkpoint": "training_sentiment_analysis/checkpoint-8600", "epoch": 20.0, "eval_steps": 200, "global_step": 18680, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.21, "grad_norm": 1.3381836414337158, "learning_rate": 3.2119914346895075e-05, "loss": 0.9299, "step": 200 }, { "epoch": 0.21, "eval_accuracy": 0.638676844783715, "eval_loss": 0.827367901802063, "eval_runtime": 3.055, "eval_samples_per_second": 514.569, "eval_steps_per_second": 16.367, "step": 200 }, { "epoch": 0.43, "grad_norm": 1.0220164060592651, "learning_rate": 6.423982869379015e-05, "loss": 0.7793, "step": 400 }, { "epoch": 0.43, "eval_accuracy": 0.7188295165394402, "eval_loss": 0.6643335223197937, "eval_runtime": 3.0013, "eval_samples_per_second": 523.77, "eval_steps_per_second": 16.659, "step": 400 }, { "epoch": 0.64, "grad_norm": 1.7421491146087646, "learning_rate": 9.635974304068522e-05, "loss": 0.6574, "step": 600 }, { "epoch": 0.64, "eval_accuracy": 0.7659033078880407, "eval_loss": 0.5868020057678223, "eval_runtime": 2.9749, "eval_samples_per_second": 528.422, "eval_steps_per_second": 16.807, "step": 600 }, { "epoch": 0.86, "grad_norm": 1.8133894205093384, "learning_rate": 0.0001284796573875803, "loss": 0.6132, "step": 800 }, { "epoch": 0.86, "eval_accuracy": 0.772264631043257, "eval_loss": 0.5582301616668701, "eval_runtime": 2.9908, "eval_samples_per_second": 525.617, "eval_steps_per_second": 16.718, "step": 800 }, { "epoch": 1.07, "grad_norm": 1.3071078062057495, "learning_rate": 0.00016059957173447537, "loss": 0.5791, "step": 1000 }, { "epoch": 1.07, "eval_accuracy": 0.7830788804071247, "eval_loss": 0.5515692234039307, "eval_runtime": 2.9665, "eval_samples_per_second": 529.915, "eval_steps_per_second": 16.855, "step": 1000 }, { "epoch": 1.28, "grad_norm": 1.0445743799209595, "learning_rate": 0.00019271948608137044, "loss": 0.554, "step": 1200 }, { "epoch": 1.28, "eval_accuracy": 0.7964376590330788, "eval_loss": 0.5187413692474365, "eval_runtime": 2.9846, "eval_samples_per_second": 526.705, "eval_steps_per_second": 16.753, "step": 1200 }, { "epoch": 1.5, "grad_norm": 1.0763362646102905, "learning_rate": 0.0002248394004282655, "loss": 0.5258, "step": 1400 }, { "epoch": 1.5, "eval_accuracy": 0.8034351145038168, "eval_loss": 0.5125576257705688, "eval_runtime": 2.9831, "eval_samples_per_second": 526.967, "eval_steps_per_second": 16.761, "step": 1400 }, { "epoch": 1.71, "grad_norm": 0.8554897308349609, "learning_rate": 0.0002569593147751606, "loss": 0.5373, "step": 1600 }, { "epoch": 1.71, "eval_accuracy": 0.8002544529262087, "eval_loss": 0.51680988073349, "eval_runtime": 2.9726, "eval_samples_per_second": 528.823, "eval_steps_per_second": 16.82, "step": 1600 }, { "epoch": 1.93, "grad_norm": 1.538806438446045, "learning_rate": 0.0002890792291220556, "loss": 0.5266, "step": 1800 }, { "epoch": 1.93, "eval_accuracy": 0.8027989821882952, "eval_loss": 0.5283887982368469, "eval_runtime": 2.9766, "eval_samples_per_second": 528.12, "eval_steps_per_second": 16.798, "step": 1800 }, { "epoch": 2.14, "grad_norm": 1.1234441995620728, "learning_rate": 0.000297644539614561, "loss": 0.5076, "step": 2000 }, { "epoch": 2.14, "eval_accuracy": 0.7977099236641222, "eval_loss": 0.5178301334381104, "eval_runtime": 2.9829, "eval_samples_per_second": 526.996, "eval_steps_per_second": 16.762, "step": 2000 }, { "epoch": 2.36, "grad_norm": 1.6212774515151978, "learning_rate": 0.0002940756602426838, "loss": 0.5094, "step": 2200 }, { "epoch": 2.36, "eval_accuracy": 0.8027989821882952, "eval_loss": 0.5134572982788086, "eval_runtime": 2.981, "eval_samples_per_second": 527.334, "eval_steps_per_second": 16.773, "step": 2200 }, { "epoch": 2.57, "grad_norm": 1.4514294862747192, "learning_rate": 0.00029050678087080655, "loss": 0.5032, "step": 2400 }, { "epoch": 2.57, "eval_accuracy": 0.8104325699745547, "eval_loss": 0.5022692084312439, "eval_runtime": 2.963, "eval_samples_per_second": 530.535, "eval_steps_per_second": 16.875, "step": 2400 }, { "epoch": 2.78, "grad_norm": 1.826932668685913, "learning_rate": 0.0002869379014989293, "loss": 0.5034, "step": 2600 }, { "epoch": 2.78, "eval_accuracy": 0.80470737913486, "eval_loss": 0.5088226199150085, "eval_runtime": 2.9831, "eval_samples_per_second": 526.969, "eval_steps_per_second": 16.761, "step": 2600 }, { "epoch": 3.0, "grad_norm": 1.4404336214065552, "learning_rate": 0.0002833690221270521, "loss": 0.4923, "step": 2800 }, { "epoch": 3.0, "eval_accuracy": 0.799618320610687, "eval_loss": 0.5219257473945618, "eval_runtime": 2.9722, "eval_samples_per_second": 528.9, "eval_steps_per_second": 16.823, "step": 2800 }, { "epoch": 3.21, "grad_norm": 0.8795878291130066, "learning_rate": 0.00027980014275517484, "loss": 0.4934, "step": 3000 }, { "epoch": 3.21, "eval_accuracy": 0.8129770992366412, "eval_loss": 0.4905295968055725, "eval_runtime": 2.9734, "eval_samples_per_second": 528.696, "eval_steps_per_second": 16.816, "step": 3000 }, { "epoch": 3.43, "grad_norm": 1.6092537641525269, "learning_rate": 0.0002762312633832976, "loss": 0.4798, "step": 3200 }, { "epoch": 3.43, "eval_accuracy": 0.8097964376590331, "eval_loss": 0.4907812178134918, "eval_runtime": 2.9897, "eval_samples_per_second": 525.803, "eval_steps_per_second": 16.724, "step": 3200 }, { "epoch": 3.64, "grad_norm": 1.6475110054016113, "learning_rate": 0.0002726623840114204, "loss": 0.4831, "step": 3400 }, { "epoch": 3.64, "eval_accuracy": 0.8072519083969466, "eval_loss": 0.48748457431793213, "eval_runtime": 2.9694, "eval_samples_per_second": 529.396, "eval_steps_per_second": 16.838, "step": 3400 }, { "epoch": 3.85, "grad_norm": 1.1669467687606812, "learning_rate": 0.00026909350463954313, "loss": 0.4707, "step": 3600 }, { "epoch": 3.85, "eval_accuracy": 0.8072519083969466, "eval_loss": 0.4985896944999695, "eval_runtime": 2.991, "eval_samples_per_second": 525.579, "eval_steps_per_second": 16.717, "step": 3600 }, { "epoch": 4.07, "grad_norm": 0.9440352320671082, "learning_rate": 0.00026552462526766593, "loss": 0.4674, "step": 3800 }, { "epoch": 4.07, "eval_accuracy": 0.8104325699745547, "eval_loss": 0.5195557475090027, "eval_runtime": 2.9789, "eval_samples_per_second": 527.711, "eval_steps_per_second": 16.785, "step": 3800 }, { "epoch": 4.28, "grad_norm": 1.8151628971099854, "learning_rate": 0.0002619557458957887, "loss": 0.4535, "step": 4000 }, { "epoch": 4.28, "eval_accuracy": 0.8097964376590331, "eval_loss": 0.4896373152732849, "eval_runtime": 2.9869, "eval_samples_per_second": 526.295, "eval_steps_per_second": 16.74, "step": 4000 }, { "epoch": 4.5, "grad_norm": 3.0790090560913086, "learning_rate": 0.0002583868665239115, "loss": 0.464, "step": 4200 }, { "epoch": 4.5, "eval_accuracy": 0.8078880407124682, "eval_loss": 0.517495334148407, "eval_runtime": 2.9986, "eval_samples_per_second": 524.246, "eval_steps_per_second": 16.674, "step": 4200 }, { "epoch": 4.71, "grad_norm": 1.1520639657974243, "learning_rate": 0.0002548179871520343, "loss": 0.4715, "step": 4400 }, { "epoch": 4.71, "eval_accuracy": 0.8027989821882952, "eval_loss": 0.5001667737960815, "eval_runtime": 2.9723, "eval_samples_per_second": 528.885, "eval_steps_per_second": 16.822, "step": 4400 }, { "epoch": 4.93, "grad_norm": 0.8184943795204163, "learning_rate": 0.000251249107780157, "loss": 0.468, "step": 4600 }, { "epoch": 4.93, "eval_accuracy": 0.8110687022900763, "eval_loss": 0.4883332848548889, "eval_runtime": 2.9769, "eval_samples_per_second": 528.068, "eval_steps_per_second": 16.796, "step": 4600 }, { "epoch": 5.14, "grad_norm": 1.155013084411621, "learning_rate": 0.00024768022840827977, "loss": 0.4645, "step": 4800 }, { "epoch": 5.14, "eval_accuracy": 0.8040712468193384, "eval_loss": 0.5186554789543152, "eval_runtime": 2.9698, "eval_samples_per_second": 529.333, "eval_steps_per_second": 16.836, "step": 4800 }, { "epoch": 5.35, "grad_norm": 1.6959339380264282, "learning_rate": 0.00024411134903640257, "loss": 0.445, "step": 5000 }, { "epoch": 5.35, "eval_accuracy": 0.806615776081425, "eval_loss": 0.4928103983402252, "eval_runtime": 2.9782, "eval_samples_per_second": 527.83, "eval_steps_per_second": 16.789, "step": 5000 }, { "epoch": 5.57, "grad_norm": 1.0461735725402832, "learning_rate": 0.00024054246966452532, "loss": 0.4558, "step": 5200 }, { "epoch": 5.57, "eval_accuracy": 0.8078880407124682, "eval_loss": 0.48704999685287476, "eval_runtime": 2.9838, "eval_samples_per_second": 526.839, "eval_steps_per_second": 16.757, "step": 5200 }, { "epoch": 5.78, "grad_norm": 0.9599233269691467, "learning_rate": 0.00023697359029264806, "loss": 0.4405, "step": 5400 }, { "epoch": 5.78, "eval_accuracy": 0.8104325699745547, "eval_loss": 0.4985482692718506, "eval_runtime": 3.0065, "eval_samples_per_second": 522.862, "eval_steps_per_second": 16.63, "step": 5400 }, { "epoch": 6.0, "grad_norm": 1.4131615161895752, "learning_rate": 0.00023340471092077086, "loss": 0.4648, "step": 5600 }, { "epoch": 6.0, "eval_accuracy": 0.8059796437659033, "eval_loss": 0.48415422439575195, "eval_runtime": 2.9786, "eval_samples_per_second": 527.759, "eval_steps_per_second": 16.786, "step": 5600 }, { "epoch": 6.21, "grad_norm": 1.189572811126709, "learning_rate": 0.0002298358315488936, "loss": 0.435, "step": 5800 }, { "epoch": 6.21, "eval_accuracy": 0.811704834605598, "eval_loss": 0.4911487102508545, "eval_runtime": 2.9997, "eval_samples_per_second": 524.044, "eval_steps_per_second": 16.668, "step": 5800 }, { "epoch": 6.42, "grad_norm": 1.5198345184326172, "learning_rate": 0.00022626695217701638, "loss": 0.437, "step": 6000 }, { "epoch": 6.42, "eval_accuracy": 0.8085241730279898, "eval_loss": 0.48542749881744385, "eval_runtime": 3.0042, "eval_samples_per_second": 523.274, "eval_steps_per_second": 16.644, "step": 6000 }, { "epoch": 6.64, "grad_norm": 1.1990240812301636, "learning_rate": 0.00022269807280513918, "loss": 0.4588, "step": 6200 }, { "epoch": 6.64, "eval_accuracy": 0.8085241730279898, "eval_loss": 0.48791924118995667, "eval_runtime": 3.0014, "eval_samples_per_second": 523.758, "eval_steps_per_second": 16.659, "step": 6200 }, { "epoch": 6.85, "grad_norm": 1.346658706665039, "learning_rate": 0.00021912919343326193, "loss": 0.4342, "step": 6400 }, { "epoch": 6.85, "eval_accuracy": 0.8104325699745547, "eval_loss": 0.49220582842826843, "eval_runtime": 3.0046, "eval_samples_per_second": 523.193, "eval_steps_per_second": 16.641, "step": 6400 }, { "epoch": 7.07, "grad_norm": 1.8644700050354004, "learning_rate": 0.00021556031406138473, "loss": 0.4347, "step": 6600 }, { "epoch": 7.07, "eval_accuracy": 0.8142493638676844, "eval_loss": 0.49111655354499817, "eval_runtime": 2.985, "eval_samples_per_second": 526.634, "eval_steps_per_second": 16.75, "step": 6600 }, { "epoch": 7.28, "grad_norm": 1.9364045858383179, "learning_rate": 0.00021199143468950748, "loss": 0.4326, "step": 6800 }, { "epoch": 7.28, "eval_accuracy": 0.8078880407124682, "eval_loss": 0.491384893655777, "eval_runtime": 2.9723, "eval_samples_per_second": 528.882, "eval_steps_per_second": 16.822, "step": 6800 }, { "epoch": 7.49, "grad_norm": 0.9911957383155823, "learning_rate": 0.00020842255531763022, "loss": 0.4267, "step": 7000 }, { "epoch": 7.49, "eval_accuracy": 0.8104325699745547, "eval_loss": 0.4917159080505371, "eval_runtime": 2.9808, "eval_samples_per_second": 527.373, "eval_steps_per_second": 16.774, "step": 7000 }, { "epoch": 7.71, "grad_norm": 1.2186638116836548, "learning_rate": 0.00020485367594575302, "loss": 0.4241, "step": 7200 }, { "epoch": 7.71, "eval_accuracy": 0.8136132315521628, "eval_loss": 0.4887010455131531, "eval_runtime": 2.9872, "eval_samples_per_second": 526.253, "eval_steps_per_second": 16.738, "step": 7200 }, { "epoch": 7.92, "grad_norm": 1.1467108726501465, "learning_rate": 0.0002012847965738758, "loss": 0.4376, "step": 7400 }, { "epoch": 7.92, "eval_accuracy": 0.8078880407124682, "eval_loss": 0.5122085213661194, "eval_runtime": 2.9829, "eval_samples_per_second": 527.007, "eval_steps_per_second": 16.762, "step": 7400 }, { "epoch": 8.14, "grad_norm": 0.8427834510803223, "learning_rate": 0.00019771591720199854, "loss": 0.4323, "step": 7600 }, { "epoch": 8.14, "eval_accuracy": 0.8097964376590331, "eval_loss": 0.49093857407569885, "eval_runtime": 2.9738, "eval_samples_per_second": 528.625, "eval_steps_per_second": 16.814, "step": 7600 }, { "epoch": 8.35, "grad_norm": 1.2060902118682861, "learning_rate": 0.00019414703783012134, "loss": 0.4264, "step": 7800 }, { "epoch": 8.35, "eval_accuracy": 0.8142493638676844, "eval_loss": 0.48821595311164856, "eval_runtime": 2.9836, "eval_samples_per_second": 526.88, "eval_steps_per_second": 16.758, "step": 7800 }, { "epoch": 8.57, "grad_norm": 1.7033394575119019, "learning_rate": 0.0001905781584582441, "loss": 0.4175, "step": 8000 }, { "epoch": 8.57, "eval_accuracy": 0.8053435114503816, "eval_loss": 0.5090692043304443, "eval_runtime": 2.9978, "eval_samples_per_second": 524.393, "eval_steps_per_second": 16.679, "step": 8000 }, { "epoch": 8.78, "grad_norm": 1.3033976554870605, "learning_rate": 0.0001870092790863669, "loss": 0.4228, "step": 8200 }, { "epoch": 8.78, "eval_accuracy": 0.8097964376590331, "eval_loss": 0.5060204863548279, "eval_runtime": 2.9975, "eval_samples_per_second": 524.436, "eval_steps_per_second": 16.681, "step": 8200 }, { "epoch": 8.99, "grad_norm": 1.2635438442230225, "learning_rate": 0.00018344039971448964, "loss": 0.4189, "step": 8400 }, { "epoch": 8.99, "eval_accuracy": 0.8091603053435115, "eval_loss": 0.4940575361251831, "eval_runtime": 2.9634, "eval_samples_per_second": 530.468, "eval_steps_per_second": 16.872, "step": 8400 }, { "epoch": 9.21, "grad_norm": 1.496982455253601, "learning_rate": 0.0001798715203426124, "loss": 0.4161, "step": 8600 }, { "epoch": 9.21, "eval_accuracy": 0.8174300254452926, "eval_loss": 0.5010442137718201, "eval_runtime": 2.973, "eval_samples_per_second": 528.758, "eval_steps_per_second": 16.818, "step": 8600 }, { "epoch": 9.42, "grad_norm": 1.355362892150879, "learning_rate": 0.00017630264097073518, "loss": 0.4078, "step": 8800 }, { "epoch": 9.42, "eval_accuracy": 0.8078880407124682, "eval_loss": 0.4949406683444977, "eval_runtime": 2.9901, "eval_samples_per_second": 525.736, "eval_steps_per_second": 16.722, "step": 8800 }, { "epoch": 9.64, "grad_norm": 1.180076241493225, "learning_rate": 0.00017273376159885795, "loss": 0.4201, "step": 9000 }, { "epoch": 9.64, "eval_accuracy": 0.8072519083969466, "eval_loss": 0.5017107129096985, "eval_runtime": 2.952, "eval_samples_per_second": 532.525, "eval_steps_per_second": 16.938, "step": 9000 }, { "epoch": 9.85, "grad_norm": 1.1020286083221436, "learning_rate": 0.0001691648822269807, "loss": 0.4141, "step": 9200 }, { "epoch": 9.85, "eval_accuracy": 0.8091603053435115, "eval_loss": 0.4984731078147888, "eval_runtime": 2.9633, "eval_samples_per_second": 530.497, "eval_steps_per_second": 16.873, "step": 9200 }, { "epoch": 10.06, "grad_norm": 1.2666047811508179, "learning_rate": 0.0001655960028551035, "loss": 0.4132, "step": 9400 }, { "epoch": 10.06, "eval_accuracy": 0.8053435114503816, "eval_loss": 0.5031649470329285, "eval_runtime": 2.9822, "eval_samples_per_second": 527.133, "eval_steps_per_second": 16.766, "step": 9400 }, { "epoch": 10.28, "grad_norm": 0.6767197251319885, "learning_rate": 0.00016202712348322625, "loss": 0.4043, "step": 9600 }, { "epoch": 10.28, "eval_accuracy": 0.8129770992366412, "eval_loss": 0.5038406848907471, "eval_runtime": 2.9816, "eval_samples_per_second": 527.24, "eval_steps_per_second": 16.77, "step": 9600 }, { "epoch": 10.49, "grad_norm": 1.147275447845459, "learning_rate": 0.00015845824411134902, "loss": 0.4187, "step": 9800 }, { "epoch": 10.49, "eval_accuracy": 0.8104325699745547, "eval_loss": 0.4981047213077545, "eval_runtime": 2.9858, "eval_samples_per_second": 526.485, "eval_steps_per_second": 16.746, "step": 9800 }, { "epoch": 10.71, "grad_norm": 1.6172677278518677, "learning_rate": 0.0001548893647394718, "loss": 0.3827, "step": 10000 }, { "epoch": 10.71, "eval_accuracy": 0.8072519083969466, "eval_loss": 0.5126467943191528, "eval_runtime": 2.9825, "eval_samples_per_second": 527.072, "eval_steps_per_second": 16.764, "step": 10000 }, { "epoch": 10.92, "grad_norm": 1.8639923334121704, "learning_rate": 0.00015132048536759457, "loss": 0.4074, "step": 10200 }, { "epoch": 10.92, "eval_accuracy": 0.8072519083969466, "eval_loss": 0.5088323950767517, "eval_runtime": 2.9816, "eval_samples_per_second": 527.237, "eval_steps_per_second": 16.77, "step": 10200 }, { "epoch": 11.13, "grad_norm": 1.2519667148590088, "learning_rate": 0.00014775160599571734, "loss": 0.4013, "step": 10400 }, { "epoch": 11.13, "eval_accuracy": 0.8072519083969466, "eval_loss": 0.5061373114585876, "eval_runtime": 2.9811, "eval_samples_per_second": 527.316, "eval_steps_per_second": 16.772, "step": 10400 }, { "epoch": 11.35, "grad_norm": 1.1711052656173706, "learning_rate": 0.0001441827266238401, "loss": 0.3888, "step": 10600 }, { "epoch": 11.35, "eval_accuracy": 0.8085241730279898, "eval_loss": 0.5013065338134766, "eval_runtime": 2.9847, "eval_samples_per_second": 526.681, "eval_steps_per_second": 16.752, "step": 10600 }, { "epoch": 11.56, "grad_norm": 1.8078001737594604, "learning_rate": 0.00014061384725196286, "loss": 0.3855, "step": 10800 }, { "epoch": 11.56, "eval_accuracy": 0.8059796437659033, "eval_loss": 0.4992610514163971, "eval_runtime": 2.9927, "eval_samples_per_second": 525.27, "eval_steps_per_second": 16.707, "step": 10800 }, { "epoch": 11.78, "grad_norm": 1.1071592569351196, "learning_rate": 0.00013704496788008563, "loss": 0.3924, "step": 11000 }, { "epoch": 11.78, "eval_accuracy": 0.8085241730279898, "eval_loss": 0.5075262188911438, "eval_runtime": 3.0066, "eval_samples_per_second": 522.844, "eval_steps_per_second": 16.63, "step": 11000 }, { "epoch": 11.99, "grad_norm": 1.3704427480697632, "learning_rate": 0.0001334760885082084, "loss": 0.4046, "step": 11200 }, { "epoch": 11.99, "eval_accuracy": 0.8027989821882952, "eval_loss": 0.49990707635879517, "eval_runtime": 3.0049, "eval_samples_per_second": 523.149, "eval_steps_per_second": 16.64, "step": 11200 }, { "epoch": 12.21, "grad_norm": 1.40170419216156, "learning_rate": 0.00012990720913633118, "loss": 0.3957, "step": 11400 }, { "epoch": 12.21, "eval_accuracy": 0.8034351145038168, "eval_loss": 0.5089264512062073, "eval_runtime": 2.9942, "eval_samples_per_second": 525.011, "eval_steps_per_second": 16.699, "step": 11400 }, { "epoch": 12.42, "grad_norm": 1.1885521411895752, "learning_rate": 0.00012633832976445395, "loss": 0.381, "step": 11600 }, { "epoch": 12.42, "eval_accuracy": 0.8072519083969466, "eval_loss": 0.5207549929618835, "eval_runtime": 2.9746, "eval_samples_per_second": 528.479, "eval_steps_per_second": 16.809, "step": 11600 }, { "epoch": 12.63, "grad_norm": 0.8873888254165649, "learning_rate": 0.00012276945039257673, "loss": 0.3906, "step": 11800 }, { "epoch": 12.63, "eval_accuracy": 0.806615776081425, "eval_loss": 0.513671875, "eval_runtime": 2.961, "eval_samples_per_second": 530.901, "eval_steps_per_second": 16.886, "step": 11800 }, { "epoch": 12.85, "grad_norm": 1.6491570472717285, "learning_rate": 0.0001192005710206995, "loss": 0.3734, "step": 12000 }, { "epoch": 12.85, "eval_accuracy": 0.8040712468193384, "eval_loss": 0.5183374881744385, "eval_runtime": 2.9533, "eval_samples_per_second": 532.292, "eval_steps_per_second": 16.93, "step": 12000 }, { "epoch": 13.06, "grad_norm": 2.042646884918213, "learning_rate": 0.00011563169164882227, "loss": 0.3928, "step": 12200 }, { "epoch": 13.06, "eval_accuracy": 0.806615776081425, "eval_loss": 0.5069447159767151, "eval_runtime": 2.959, "eval_samples_per_second": 531.259, "eval_steps_per_second": 16.898, "step": 12200 }, { "epoch": 13.28, "grad_norm": 0.817425549030304, "learning_rate": 0.00011206281227694502, "loss": 0.3774, "step": 12400 }, { "epoch": 13.28, "eval_accuracy": 0.8008905852417303, "eval_loss": 0.5086419582366943, "eval_runtime": 2.9547, "eval_samples_per_second": 532.04, "eval_steps_per_second": 16.922, "step": 12400 }, { "epoch": 13.49, "grad_norm": 1.0988578796386719, "learning_rate": 0.0001084939329050678, "loss": 0.3892, "step": 12600 }, { "epoch": 13.49, "eval_accuracy": 0.8059796437659033, "eval_loss": 0.4966925382614136, "eval_runtime": 2.9538, "eval_samples_per_second": 532.194, "eval_steps_per_second": 16.927, "step": 12600 }, { "epoch": 13.7, "grad_norm": 1.312321662902832, "learning_rate": 0.00010492505353319058, "loss": 0.372, "step": 12800 }, { "epoch": 13.7, "eval_accuracy": 0.8040712468193384, "eval_loss": 0.5042534470558167, "eval_runtime": 2.9651, "eval_samples_per_second": 530.16, "eval_steps_per_second": 16.863, "step": 12800 }, { "epoch": 13.92, "grad_norm": 1.642741322517395, "learning_rate": 0.00010135617416131332, "loss": 0.388, "step": 13000 }, { "epoch": 13.92, "eval_accuracy": 0.8072519083969466, "eval_loss": 0.5095480680465698, "eval_runtime": 2.9526, "eval_samples_per_second": 532.404, "eval_steps_per_second": 16.934, "step": 13000 }, { "epoch": 14.13, "grad_norm": 1.10377836227417, "learning_rate": 9.778729478943611e-05, "loss": 0.3754, "step": 13200 }, { "epoch": 14.13, "eval_accuracy": 0.8021628498727735, "eval_loss": 0.5103972554206848, "eval_runtime": 2.9663, "eval_samples_per_second": 529.954, "eval_steps_per_second": 16.856, "step": 13200 }, { "epoch": 14.35, "grad_norm": 1.1614229679107666, "learning_rate": 9.421841541755888e-05, "loss": 0.3639, "step": 13400 }, { "epoch": 14.35, "eval_accuracy": 0.7983460559796438, "eval_loss": 0.5263165235519409, "eval_runtime": 2.9391, "eval_samples_per_second": 534.858, "eval_steps_per_second": 17.012, "step": 13400 }, { "epoch": 14.56, "grad_norm": 1.6049692630767822, "learning_rate": 9.064953604568166e-05, "loss": 0.3795, "step": 13600 }, { "epoch": 14.56, "eval_accuracy": 0.8015267175572519, "eval_loss": 0.5145931839942932, "eval_runtime": 2.9465, "eval_samples_per_second": 533.506, "eval_steps_per_second": 16.969, "step": 13600 }, { "epoch": 14.78, "grad_norm": 2.813002347946167, "learning_rate": 8.708065667380442e-05, "loss": 0.3792, "step": 13800 }, { "epoch": 14.78, "eval_accuracy": 0.8040712468193384, "eval_loss": 0.5066380500793457, "eval_runtime": 2.9409, "eval_samples_per_second": 534.523, "eval_steps_per_second": 17.001, "step": 13800 }, { "epoch": 14.99, "grad_norm": 1.2670201063156128, "learning_rate": 8.351177730192719e-05, "loss": 0.3589, "step": 14000 }, { "epoch": 14.99, "eval_accuracy": 0.8078880407124682, "eval_loss": 0.5135853886604309, "eval_runtime": 2.962, "eval_samples_per_second": 530.717, "eval_steps_per_second": 16.88, "step": 14000 }, { "epoch": 15.2, "grad_norm": 1.9681557416915894, "learning_rate": 7.994289793004996e-05, "loss": 0.3624, "step": 14200 }, { "epoch": 15.2, "eval_accuracy": 0.8021628498727735, "eval_loss": 0.5237164497375488, "eval_runtime": 2.9535, "eval_samples_per_second": 532.245, "eval_steps_per_second": 16.929, "step": 14200 }, { "epoch": 15.42, "grad_norm": 1.8548041582107544, "learning_rate": 7.637401855817274e-05, "loss": 0.3659, "step": 14400 }, { "epoch": 15.42, "eval_accuracy": 0.8059796437659033, "eval_loss": 0.5165674090385437, "eval_runtime": 2.9482, "eval_samples_per_second": 533.2, "eval_steps_per_second": 16.959, "step": 14400 }, { "epoch": 15.63, "grad_norm": 1.3727173805236816, "learning_rate": 7.28051391862955e-05, "loss": 0.3657, "step": 14600 }, { "epoch": 15.63, "eval_accuracy": 0.8002544529262087, "eval_loss": 0.5177738070487976, "eval_runtime": 2.9451, "eval_samples_per_second": 533.764, "eval_steps_per_second": 16.977, "step": 14600 }, { "epoch": 15.85, "grad_norm": 2.10198974609375, "learning_rate": 6.923625981441827e-05, "loss": 0.359, "step": 14800 }, { "epoch": 15.85, "eval_accuracy": 0.7983460559796438, "eval_loss": 0.5152426362037659, "eval_runtime": 2.9473, "eval_samples_per_second": 533.372, "eval_steps_per_second": 16.965, "step": 14800 }, { "epoch": 16.06, "grad_norm": 1.0453667640686035, "learning_rate": 6.566738044254104e-05, "loss": 0.3677, "step": 15000 }, { "epoch": 16.06, "eval_accuracy": 0.8034351145038168, "eval_loss": 0.5211815237998962, "eval_runtime": 2.9478, "eval_samples_per_second": 533.274, "eval_steps_per_second": 16.962, "step": 15000 }, { "epoch": 16.27, "grad_norm": 1.0645538568496704, "learning_rate": 6.20985010706638e-05, "loss": 0.3521, "step": 15200 }, { "epoch": 16.27, "eval_accuracy": 0.8002544529262087, "eval_loss": 0.5323696732521057, "eval_runtime": 2.9594, "eval_samples_per_second": 531.197, "eval_steps_per_second": 16.896, "step": 15200 }, { "epoch": 16.49, "grad_norm": 3.849015951156616, "learning_rate": 5.852962169878657e-05, "loss": 0.3589, "step": 15400 }, { "epoch": 16.49, "eval_accuracy": 0.8040712468193384, "eval_loss": 0.5237988829612732, "eval_runtime": 2.9364, "eval_samples_per_second": 535.357, "eval_steps_per_second": 17.028, "step": 15400 }, { "epoch": 16.7, "grad_norm": 1.3231987953186035, "learning_rate": 5.496074232690935e-05, "loss": 0.3695, "step": 15600 }, { "epoch": 16.7, "eval_accuracy": 0.7977099236641222, "eval_loss": 0.511340320110321, "eval_runtime": 2.969, "eval_samples_per_second": 529.468, "eval_steps_per_second": 16.841, "step": 15600 }, { "epoch": 16.92, "grad_norm": 1.7709985971450806, "learning_rate": 5.139186295503211e-05, "loss": 0.3606, "step": 15800 }, { "epoch": 16.92, "eval_accuracy": 0.7983460559796438, "eval_loss": 0.5136662721633911, "eval_runtime": 2.9594, "eval_samples_per_second": 531.193, "eval_steps_per_second": 16.895, "step": 15800 }, { "epoch": 17.13, "grad_norm": 1.5108495950698853, "learning_rate": 4.782298358315489e-05, "loss": 0.3581, "step": 16000 }, { "epoch": 17.13, "eval_accuracy": 0.799618320610687, "eval_loss": 0.5130853056907654, "eval_runtime": 2.9611, "eval_samples_per_second": 530.882, "eval_steps_per_second": 16.886, "step": 16000 }, { "epoch": 17.34, "grad_norm": 1.3634617328643799, "learning_rate": 4.4254104211277655e-05, "loss": 0.3488, "step": 16200 }, { "epoch": 17.34, "eval_accuracy": 0.7989821882951654, "eval_loss": 0.5270070433616638, "eval_runtime": 2.9953, "eval_samples_per_second": 524.824, "eval_steps_per_second": 16.693, "step": 16200 }, { "epoch": 17.56, "grad_norm": 1.0239213705062866, "learning_rate": 4.068522483940043e-05, "loss": 0.3499, "step": 16400 }, { "epoch": 17.56, "eval_accuracy": 0.7964376590330788, "eval_loss": 0.523576021194458, "eval_runtime": 2.9356, "eval_samples_per_second": 535.502, "eval_steps_per_second": 17.033, "step": 16400 }, { "epoch": 17.77, "grad_norm": 1.108484148979187, "learning_rate": 3.7116345467523195e-05, "loss": 0.3603, "step": 16600 }, { "epoch": 17.77, "eval_accuracy": 0.8002544529262087, "eval_loss": 0.5186541080474854, "eval_runtime": 2.9666, "eval_samples_per_second": 529.891, "eval_steps_per_second": 16.854, "step": 16600 }, { "epoch": 17.99, "grad_norm": 2.816092014312744, "learning_rate": 3.354746609564596e-05, "loss": 0.3578, "step": 16800 }, { "epoch": 17.99, "eval_accuracy": 0.8021628498727735, "eval_loss": 0.5223926901817322, "eval_runtime": 2.9355, "eval_samples_per_second": 535.521, "eval_steps_per_second": 17.033, "step": 16800 }, { "epoch": 18.2, "grad_norm": 1.5831489562988281, "learning_rate": 2.997858672376873e-05, "loss": 0.3449, "step": 17000 }, { "epoch": 18.2, "eval_accuracy": 0.7989821882951654, "eval_loss": 0.5227622389793396, "eval_runtime": 2.9602, "eval_samples_per_second": 531.048, "eval_steps_per_second": 16.891, "step": 17000 }, { "epoch": 18.42, "grad_norm": 1.0060327053070068, "learning_rate": 2.64097073518915e-05, "loss": 0.3418, "step": 17200 }, { "epoch": 18.42, "eval_accuracy": 0.8008905852417303, "eval_loss": 0.5287216901779175, "eval_runtime": 2.9537, "eval_samples_per_second": 532.21, "eval_steps_per_second": 16.928, "step": 17200 }, { "epoch": 18.63, "grad_norm": 1.8092093467712402, "learning_rate": 2.2840827980014274e-05, "loss": 0.3334, "step": 17400 }, { "epoch": 18.63, "eval_accuracy": 0.799618320610687, "eval_loss": 0.5322315096855164, "eval_runtime": 2.9745, "eval_samples_per_second": 528.484, "eval_steps_per_second": 16.809, "step": 17400 }, { "epoch": 18.84, "grad_norm": 1.4800430536270142, "learning_rate": 1.9271948608137044e-05, "loss": 0.3567, "step": 17600 }, { "epoch": 18.84, "eval_accuracy": 0.7983460559796438, "eval_loss": 0.5293812155723572, "eval_runtime": 2.9485, "eval_samples_per_second": 533.161, "eval_steps_per_second": 16.958, "step": 17600 }, { "epoch": 19.06, "grad_norm": 1.6271811723709106, "learning_rate": 1.5703069236259814e-05, "loss": 0.3541, "step": 17800 }, { "epoch": 19.06, "eval_accuracy": 0.8002544529262087, "eval_loss": 0.5250320434570312, "eval_runtime": 2.9479, "eval_samples_per_second": 533.268, "eval_steps_per_second": 16.961, "step": 17800 }, { "epoch": 19.27, "grad_norm": 0.7758527994155884, "learning_rate": 1.2134189864382584e-05, "loss": 0.365, "step": 18000 }, { "epoch": 19.27, "eval_accuracy": 0.7983460559796438, "eval_loss": 0.5246437788009644, "eval_runtime": 2.9363, "eval_samples_per_second": 535.369, "eval_steps_per_second": 17.028, "step": 18000 }, { "epoch": 19.49, "grad_norm": 0.9722337126731873, "learning_rate": 8.565310492505352e-06, "loss": 0.337, "step": 18200 }, { "epoch": 19.49, "eval_accuracy": 0.7977099236641222, "eval_loss": 0.527810275554657, "eval_runtime": 2.9383, "eval_samples_per_second": 535.006, "eval_steps_per_second": 17.017, "step": 18200 }, { "epoch": 19.7, "grad_norm": 1.5007203817367554, "learning_rate": 4.996431120628123e-06, "loss": 0.3301, "step": 18400 }, { "epoch": 19.7, "eval_accuracy": 0.7989821882951654, "eval_loss": 0.5283259153366089, "eval_runtime": 2.9603, "eval_samples_per_second": 531.035, "eval_steps_per_second": 16.89, "step": 18400 }, { "epoch": 19.91, "grad_norm": 1.1220752000808716, "learning_rate": 1.4275517487508921e-06, "loss": 0.3421, "step": 18600 }, { "epoch": 19.91, "eval_accuracy": 0.7977099236641222, "eval_loss": 0.5287136435508728, "eval_runtime": 2.9398, "eval_samples_per_second": 534.737, "eval_steps_per_second": 17.008, "step": 18600 }, { "epoch": 20.0, "step": 18680, "total_flos": 1.44512252251488e+16, "train_loss": 0.42864556159401346, "train_runtime": 2680.553, "train_samples_per_second": 222.82, "train_steps_per_second": 6.969 } ], "logging_steps": 200, "max_steps": 18680, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 200, "total_flos": 1.44512252251488e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }