{ "best_metric": 0.7937062937062938, "best_model_checkpoint": "wav2vec2-5Class-train-test-finetune/checkpoint-4122", "epoch": 224.0, "eval_steps": 500, "global_step": 5432, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.99, "eval_accuracy": 0.34265734265734266, "eval_loss": 1.5984586477279663, "eval_runtime": 5.3437, "eval_samples_per_second": 53.521, "eval_steps_per_second": 3.368, "step": 24 }, { "epoch": 1.98, "eval_accuracy": 0.33916083916083917, "eval_loss": 1.5969289541244507, "eval_runtime": 3.8653, "eval_samples_per_second": 73.992, "eval_steps_per_second": 4.657, "step": 48 }, { "epoch": 2.06, "grad_norm": 1.0544973611831665, "learning_rate": 2.4999999999999998e-06, "loss": 1.5969, "step": 50 }, { "epoch": 2.97, "eval_accuracy": 0.32867132867132864, "eval_loss": 1.5943816900253296, "eval_runtime": 6.1748, "eval_samples_per_second": 46.317, "eval_steps_per_second": 2.915, "step": 72 }, { "epoch": 4.0, "eval_accuracy": 0.3146853146853147, "eval_loss": 1.5906767845153809, "eval_runtime": 5.1678, "eval_samples_per_second": 55.343, "eval_steps_per_second": 3.483, "step": 97 }, { "epoch": 4.12, "grad_norm": 0.8443157076835632, "learning_rate": 4.9999999999999996e-06, "loss": 1.5896, "step": 100 }, { "epoch": 4.99, "eval_accuracy": 0.2972027972027972, "eval_loss": 1.5860023498535156, "eval_runtime": 4.9416, "eval_samples_per_second": 57.876, "eval_steps_per_second": 3.643, "step": 121 }, { "epoch": 5.98, "eval_accuracy": 0.2692307692307692, "eval_loss": 1.5806005001068115, "eval_runtime": 4.1837, "eval_samples_per_second": 68.36, "eval_steps_per_second": 4.302, "step": 145 }, { "epoch": 6.19, "grad_norm": 1.0938074588775635, "learning_rate": 7.5e-06, "loss": 1.5743, "step": 150 }, { "epoch": 6.97, "eval_accuracy": 0.25874125874125875, "eval_loss": 1.5742768049240112, "eval_runtime": 7.1914, "eval_samples_per_second": 39.77, "eval_steps_per_second": 2.503, "step": 169 }, { "epoch": 8.0, "eval_accuracy": 0.23426573426573427, "eval_loss": 1.5664165019989014, "eval_runtime": 5.6489, "eval_samples_per_second": 50.629, "eval_steps_per_second": 3.186, "step": 194 }, { "epoch": 8.25, "grad_norm": 0.9692079424858093, "learning_rate": 9.999999999999999e-06, "loss": 1.5508, "step": 200 }, { "epoch": 8.99, "eval_accuracy": 0.22727272727272727, "eval_loss": 1.557572841644287, "eval_runtime": 5.5182, "eval_samples_per_second": 51.828, "eval_steps_per_second": 3.262, "step": 218 }, { "epoch": 9.98, "eval_accuracy": 0.22727272727272727, "eval_loss": 1.5482373237609863, "eval_runtime": 5.3205, "eval_samples_per_second": 53.754, "eval_steps_per_second": 3.383, "step": 242 }, { "epoch": 10.31, "grad_norm": 1.02046799659729, "learning_rate": 1.25e-05, "loss": 1.5157, "step": 250 }, { "epoch": 10.97, "eval_accuracy": 0.22727272727272727, "eval_loss": 1.539355993270874, "eval_runtime": 6.3116, "eval_samples_per_second": 45.313, "eval_steps_per_second": 2.852, "step": 266 }, { "epoch": 12.0, "eval_accuracy": 0.22727272727272727, "eval_loss": 1.5350520610809326, "eval_runtime": 4.3422, "eval_samples_per_second": 65.865, "eval_steps_per_second": 4.145, "step": 291 }, { "epoch": 12.37, "grad_norm": 1.6058833599090576, "learning_rate": 1.5e-05, "loss": 1.4534, "step": 300 }, { "epoch": 12.99, "eval_accuracy": 0.22727272727272727, "eval_loss": 1.5525730848312378, "eval_runtime": 5.245, "eval_samples_per_second": 54.528, "eval_steps_per_second": 3.432, "step": 315 }, { "epoch": 13.98, "eval_accuracy": 0.22727272727272727, "eval_loss": 1.599926471710205, "eval_runtime": 6.0088, "eval_samples_per_second": 47.597, "eval_steps_per_second": 2.996, "step": 339 }, { "epoch": 14.43, "grad_norm": 0.8243080377578735, "learning_rate": 1.7500000000000002e-05, "loss": 1.3638, "step": 350 }, { "epoch": 14.97, "eval_accuracy": 0.22727272727272727, "eval_loss": 1.5896875858306885, "eval_runtime": 4.8752, "eval_samples_per_second": 58.664, "eval_steps_per_second": 3.692, "step": 363 }, { "epoch": 16.0, "eval_accuracy": 0.26573426573426573, "eval_loss": 1.560091495513916, "eval_runtime": 5.5082, "eval_samples_per_second": 51.922, "eval_steps_per_second": 3.268, "step": 388 }, { "epoch": 16.49, "grad_norm": 0.7977257370948792, "learning_rate": 1.9999999999999998e-05, "loss": 1.2951, "step": 400 }, { "epoch": 16.99, "eval_accuracy": 0.2937062937062937, "eval_loss": 1.5349317789077759, "eval_runtime": 4.7526, "eval_samples_per_second": 60.178, "eval_steps_per_second": 3.787, "step": 412 }, { "epoch": 17.98, "eval_accuracy": 0.34265734265734266, "eval_loss": 1.5053907632827759, "eval_runtime": 4.8638, "eval_samples_per_second": 58.801, "eval_steps_per_second": 3.701, "step": 436 }, { "epoch": 18.56, "grad_norm": 0.7064552903175354, "learning_rate": 2.25e-05, "loss": 1.2369, "step": 450 }, { "epoch": 18.97, "eval_accuracy": 0.3741258741258741, "eval_loss": 1.4689087867736816, "eval_runtime": 4.3712, "eval_samples_per_second": 65.428, "eval_steps_per_second": 4.118, "step": 460 }, { "epoch": 20.0, "eval_accuracy": 0.4370629370629371, "eval_loss": 1.404613971710205, "eval_runtime": 4.7203, "eval_samples_per_second": 60.59, "eval_steps_per_second": 3.813, "step": 485 }, { "epoch": 20.62, "grad_norm": 0.598238468170166, "learning_rate": 2.5e-05, "loss": 1.1566, "step": 500 }, { "epoch": 20.99, "eval_accuracy": 0.4405594405594406, "eval_loss": 1.3691043853759766, "eval_runtime": 6.6443, "eval_samples_per_second": 43.044, "eval_steps_per_second": 2.709, "step": 509 }, { "epoch": 21.98, "eval_accuracy": 0.4825174825174825, "eval_loss": 1.3120107650756836, "eval_runtime": 4.9585, "eval_samples_per_second": 57.679, "eval_steps_per_second": 3.63, "step": 533 }, { "epoch": 22.68, "grad_norm": 0.682925820350647, "learning_rate": 2.75e-05, "loss": 1.0676, "step": 550 }, { "epoch": 22.97, "eval_accuracy": 0.486013986013986, "eval_loss": 1.2839338779449463, "eval_runtime": 4.0382, "eval_samples_per_second": 70.824, "eval_steps_per_second": 4.457, "step": 557 }, { "epoch": 24.0, "eval_accuracy": 0.5104895104895105, "eval_loss": 1.2549891471862793, "eval_runtime": 5.1896, "eval_samples_per_second": 55.11, "eval_steps_per_second": 3.468, "step": 582 }, { "epoch": 24.74, "grad_norm": 1.1368101835250854, "learning_rate": 3e-05, "loss": 0.992, "step": 600 }, { "epoch": 24.99, "eval_accuracy": 0.5209790209790209, "eval_loss": 1.2106566429138184, "eval_runtime": 6.8941, "eval_samples_per_second": 41.485, "eval_steps_per_second": 2.611, "step": 606 }, { "epoch": 25.98, "eval_accuracy": 0.5384615384615384, "eval_loss": 1.1711338758468628, "eval_runtime": 4.9707, "eval_samples_per_second": 57.537, "eval_steps_per_second": 3.621, "step": 630 }, { "epoch": 26.8, "grad_norm": 0.9649831056594849, "learning_rate": 2.9722222222222223e-05, "loss": 0.9272, "step": 650 }, { "epoch": 26.97, "eval_accuracy": 0.5594405594405595, "eval_loss": 1.1318116188049316, "eval_runtime": 5.5564, "eval_samples_per_second": 51.472, "eval_steps_per_second": 3.24, "step": 654 }, { "epoch": 28.0, "eval_accuracy": 0.6153846153846154, "eval_loss": 1.0594333410263062, "eval_runtime": 4.6773, "eval_samples_per_second": 61.147, "eval_steps_per_second": 3.848, "step": 679 }, { "epoch": 28.87, "grad_norm": 0.883937418460846, "learning_rate": 2.9444444444444445e-05, "loss": 0.8478, "step": 700 }, { "epoch": 28.99, "eval_accuracy": 0.6013986013986014, "eval_loss": 1.054669737815857, "eval_runtime": 4.9219, "eval_samples_per_second": 58.108, "eval_steps_per_second": 3.657, "step": 703 }, { "epoch": 29.98, "eval_accuracy": 0.6363636363636364, "eval_loss": 0.9822685122489929, "eval_runtime": 6.3133, "eval_samples_per_second": 45.302, "eval_steps_per_second": 2.851, "step": 727 }, { "epoch": 30.93, "grad_norm": 1.3742878437042236, "learning_rate": 2.9166666666666666e-05, "loss": 0.7627, "step": 750 }, { "epoch": 30.97, "eval_accuracy": 0.6398601398601399, "eval_loss": 1.00295090675354, "eval_runtime": 6.154, "eval_samples_per_second": 46.473, "eval_steps_per_second": 2.925, "step": 751 }, { "epoch": 32.0, "eval_accuracy": 0.6608391608391608, "eval_loss": 0.930969774723053, "eval_runtime": 5.6747, "eval_samples_per_second": 50.399, "eval_steps_per_second": 3.172, "step": 776 }, { "epoch": 32.99, "grad_norm": 1.329268217086792, "learning_rate": 2.8888888888888888e-05, "loss": 0.7266, "step": 800 }, { "epoch": 32.99, "eval_accuracy": 0.6678321678321678, "eval_loss": 0.9228739738464355, "eval_runtime": 5.382, "eval_samples_per_second": 53.14, "eval_steps_per_second": 3.344, "step": 800 }, { "epoch": 33.98, "eval_accuracy": 0.6958041958041958, "eval_loss": 0.8684509992599487, "eval_runtime": 4.8497, "eval_samples_per_second": 58.973, "eval_steps_per_second": 3.712, "step": 824 }, { "epoch": 34.97, "eval_accuracy": 0.6643356643356644, "eval_loss": 0.8954732418060303, "eval_runtime": 5.2083, "eval_samples_per_second": 54.912, "eval_steps_per_second": 3.456, "step": 848 }, { "epoch": 35.05, "grad_norm": 1.3892701864242554, "learning_rate": 2.8611111111111113e-05, "loss": 0.6906, "step": 850 }, { "epoch": 36.0, "eval_accuracy": 0.6713286713286714, "eval_loss": 0.9125654101371765, "eval_runtime": 5.3068, "eval_samples_per_second": 53.894, "eval_steps_per_second": 3.392, "step": 873 }, { "epoch": 36.99, "eval_accuracy": 0.6923076923076923, "eval_loss": 0.8543534874916077, "eval_runtime": 4.3351, "eval_samples_per_second": 65.974, "eval_steps_per_second": 4.152, "step": 897 }, { "epoch": 37.11, "grad_norm": 0.836291491985321, "learning_rate": 2.8333333333333332e-05, "loss": 0.6721, "step": 900 }, { "epoch": 37.98, "eval_accuracy": 0.6923076923076923, "eval_loss": 0.8480322957038879, "eval_runtime": 5.1861, "eval_samples_per_second": 55.147, "eval_steps_per_second": 3.471, "step": 921 }, { "epoch": 38.97, "eval_accuracy": 0.7097902097902098, "eval_loss": 0.8354606628417969, "eval_runtime": 6.3247, "eval_samples_per_second": 45.22, "eval_steps_per_second": 2.846, "step": 945 }, { "epoch": 39.18, "grad_norm": 1.6499431133270264, "learning_rate": 2.8055555555555557e-05, "loss": 0.6442, "step": 950 }, { "epoch": 40.0, "eval_accuracy": 0.6958041958041958, "eval_loss": 0.8412452340126038, "eval_runtime": 5.2281, "eval_samples_per_second": 54.704, "eval_steps_per_second": 3.443, "step": 970 }, { "epoch": 40.99, "eval_accuracy": 0.6888111888111889, "eval_loss": 0.8356389999389648, "eval_runtime": 4.8326, "eval_samples_per_second": 59.181, "eval_steps_per_second": 3.725, "step": 994 }, { "epoch": 41.24, "grad_norm": 1.1766818761825562, "learning_rate": 2.777777777777778e-05, "loss": 0.6465, "step": 1000 }, { "epoch": 41.98, "eval_accuracy": 0.7062937062937062, "eval_loss": 0.8180016875267029, "eval_runtime": 5.7926, "eval_samples_per_second": 49.374, "eval_steps_per_second": 3.107, "step": 1018 }, { "epoch": 42.97, "eval_accuracy": 0.7027972027972028, "eval_loss": 0.8103991150856018, "eval_runtime": 5.5185, "eval_samples_per_second": 51.825, "eval_steps_per_second": 3.262, "step": 1042 }, { "epoch": 43.3, "grad_norm": 0.9722403287887573, "learning_rate": 2.75e-05, "loss": 0.6086, "step": 1050 }, { "epoch": 44.0, "eval_accuracy": 0.6958041958041958, "eval_loss": 0.8162235617637634, "eval_runtime": 4.9174, "eval_samples_per_second": 58.161, "eval_steps_per_second": 3.66, "step": 1067 }, { "epoch": 44.99, "eval_accuracy": 0.7027972027972028, "eval_loss": 0.7957289218902588, "eval_runtime": 4.6891, "eval_samples_per_second": 60.992, "eval_steps_per_second": 3.839, "step": 1091 }, { "epoch": 45.36, "grad_norm": 1.269113302230835, "learning_rate": 2.7222222222222223e-05, "loss": 0.5863, "step": 1100 }, { "epoch": 45.98, "eval_accuracy": 0.6958041958041958, "eval_loss": 0.8143528699874878, "eval_runtime": 6.6805, "eval_samples_per_second": 42.811, "eval_steps_per_second": 2.694, "step": 1115 }, { "epoch": 46.97, "eval_accuracy": 0.7027972027972028, "eval_loss": 0.78568434715271, "eval_runtime": 4.7422, "eval_samples_per_second": 60.31, "eval_steps_per_second": 3.796, "step": 1139 }, { "epoch": 47.42, "grad_norm": 0.9775255918502808, "learning_rate": 2.6944444444444445e-05, "loss": 0.5877, "step": 1150 }, { "epoch": 48.0, "eval_accuracy": 0.7132867132867133, "eval_loss": 0.7764595150947571, "eval_runtime": 5.76, "eval_samples_per_second": 49.653, "eval_steps_per_second": 3.125, "step": 1164 }, { "epoch": 48.99, "eval_accuracy": 0.6993006993006993, "eval_loss": 0.7881478071212769, "eval_runtime": 5.4965, "eval_samples_per_second": 52.033, "eval_steps_per_second": 3.275, "step": 1188 }, { "epoch": 49.48, "grad_norm": 1.540124773979187, "learning_rate": 2.6666666666666667e-05, "loss": 0.5629, "step": 1200 }, { "epoch": 49.98, "eval_accuracy": 0.7097902097902098, "eval_loss": 0.7658265829086304, "eval_runtime": 4.731, "eval_samples_per_second": 60.452, "eval_steps_per_second": 3.805, "step": 1212 }, { "epoch": 50.97, "eval_accuracy": 0.7132867132867133, "eval_loss": 0.7723098397254944, "eval_runtime": 5.8352, "eval_samples_per_second": 49.013, "eval_steps_per_second": 3.085, "step": 1236 }, { "epoch": 51.55, "grad_norm": 1.2498500347137451, "learning_rate": 2.6388888888888892e-05, "loss": 0.5476, "step": 1250 }, { "epoch": 52.0, "eval_accuracy": 0.7097902097902098, "eval_loss": 0.7603952884674072, "eval_runtime": 4.448, "eval_samples_per_second": 64.299, "eval_steps_per_second": 4.047, "step": 1261 }, { "epoch": 52.99, "eval_accuracy": 0.7202797202797203, "eval_loss": 0.7554137706756592, "eval_runtime": 6.4218, "eval_samples_per_second": 44.536, "eval_steps_per_second": 2.803, "step": 1285 }, { "epoch": 53.61, "grad_norm": 0.9919388890266418, "learning_rate": 2.6116666666666667e-05, "loss": 0.5357, "step": 1300 }, { "epoch": 53.98, "eval_accuracy": 0.7307692307692307, "eval_loss": 0.7458928227424622, "eval_runtime": 5.3791, "eval_samples_per_second": 53.168, "eval_steps_per_second": 3.346, "step": 1309 }, { "epoch": 54.97, "eval_accuracy": 0.7132867132867133, "eval_loss": 0.7632877230644226, "eval_runtime": 5.278, "eval_samples_per_second": 54.187, "eval_steps_per_second": 3.41, "step": 1333 }, { "epoch": 55.67, "grad_norm": 1.688183307647705, "learning_rate": 2.5838888888888892e-05, "loss": 0.5335, "step": 1350 }, { "epoch": 56.0, "eval_accuracy": 0.7167832167832168, "eval_loss": 0.768308162689209, "eval_runtime": 5.7022, "eval_samples_per_second": 50.156, "eval_steps_per_second": 3.157, "step": 1358 }, { "epoch": 56.99, "eval_accuracy": 0.7307692307692307, "eval_loss": 0.7380541563034058, "eval_runtime": 4.522, "eval_samples_per_second": 63.247, "eval_steps_per_second": 3.981, "step": 1382 }, { "epoch": 57.73, "grad_norm": 1.4895784854888916, "learning_rate": 2.556111111111111e-05, "loss": 0.5107, "step": 1400 }, { "epoch": 57.98, "eval_accuracy": 0.7377622377622378, "eval_loss": 0.7308338284492493, "eval_runtime": 4.4787, "eval_samples_per_second": 63.857, "eval_steps_per_second": 4.019, "step": 1406 }, { "epoch": 58.97, "eval_accuracy": 0.7237762237762237, "eval_loss": 0.7441032528877258, "eval_runtime": 5.8744, "eval_samples_per_second": 48.685, "eval_steps_per_second": 3.064, "step": 1430 }, { "epoch": 59.79, "grad_norm": 1.4925004243850708, "learning_rate": 2.5283333333333332e-05, "loss": 0.5105, "step": 1450 }, { "epoch": 60.0, "eval_accuracy": 0.7307692307692307, "eval_loss": 0.7481815218925476, "eval_runtime": 7.272, "eval_samples_per_second": 39.329, "eval_steps_per_second": 2.475, "step": 1455 }, { "epoch": 60.99, "eval_accuracy": 0.7342657342657343, "eval_loss": 0.733482301235199, "eval_runtime": 4.6235, "eval_samples_per_second": 61.858, "eval_steps_per_second": 3.893, "step": 1479 }, { "epoch": 61.86, "grad_norm": 1.3200663328170776, "learning_rate": 2.5005555555555558e-05, "loss": 0.4914, "step": 1500 }, { "epoch": 61.98, "eval_accuracy": 0.7447552447552448, "eval_loss": 0.7241908311843872, "eval_runtime": 4.8198, "eval_samples_per_second": 59.338, "eval_steps_per_second": 3.735, "step": 1503 }, { "epoch": 62.97, "eval_accuracy": 0.7377622377622378, "eval_loss": 0.7321043014526367, "eval_runtime": 5.8929, "eval_samples_per_second": 48.533, "eval_steps_per_second": 3.055, "step": 1527 }, { "epoch": 63.92, "grad_norm": 1.1309747695922852, "learning_rate": 2.472777777777778e-05, "loss": 0.4839, "step": 1550 }, { "epoch": 64.0, "eval_accuracy": 0.7342657342657343, "eval_loss": 0.7220665216445923, "eval_runtime": 5.8635, "eval_samples_per_second": 48.776, "eval_steps_per_second": 3.07, "step": 1552 }, { "epoch": 64.99, "eval_accuracy": 0.7412587412587412, "eval_loss": 0.7136482000350952, "eval_runtime": 4.3102, "eval_samples_per_second": 66.354, "eval_steps_per_second": 4.176, "step": 1576 }, { "epoch": 65.98, "grad_norm": 1.1314157247543335, "learning_rate": 2.4449999999999998e-05, "loss": 0.4751, "step": 1600 }, { "epoch": 65.98, "eval_accuracy": 0.7412587412587412, "eval_loss": 0.7198111414909363, "eval_runtime": 4.7841, "eval_samples_per_second": 59.781, "eval_steps_per_second": 3.762, "step": 1600 }, { "epoch": 66.97, "eval_accuracy": 0.7377622377622378, "eval_loss": 0.7145721912384033, "eval_runtime": 6.347, "eval_samples_per_second": 45.061, "eval_steps_per_second": 2.836, "step": 1624 }, { "epoch": 68.0, "eval_accuracy": 0.7447552447552448, "eval_loss": 0.6970916390419006, "eval_runtime": 5.6871, "eval_samples_per_second": 50.289, "eval_steps_per_second": 3.165, "step": 1649 }, { "epoch": 68.04, "grad_norm": 2.397585153579712, "learning_rate": 2.4172222222222223e-05, "loss": 0.4639, "step": 1650 }, { "epoch": 68.99, "eval_accuracy": 0.7272727272727273, "eval_loss": 0.7201464176177979, "eval_runtime": 4.4157, "eval_samples_per_second": 64.769, "eval_steps_per_second": 4.076, "step": 1673 }, { "epoch": 69.98, "eval_accuracy": 0.7307692307692307, "eval_loss": 0.7244682312011719, "eval_runtime": 5.4392, "eval_samples_per_second": 52.581, "eval_steps_per_second": 3.309, "step": 1697 }, { "epoch": 70.1, "grad_norm": 2.062610387802124, "learning_rate": 2.3894444444444445e-05, "loss": 0.4581, "step": 1700 }, { "epoch": 70.97, "eval_accuracy": 0.7447552447552448, "eval_loss": 0.7077587842941284, "eval_runtime": 5.1002, "eval_samples_per_second": 56.076, "eval_steps_per_second": 3.529, "step": 1721 }, { "epoch": 72.0, "eval_accuracy": 0.7517482517482518, "eval_loss": 0.6957913637161255, "eval_runtime": 4.4485, "eval_samples_per_second": 64.291, "eval_steps_per_second": 4.046, "step": 1746 }, { "epoch": 72.16, "grad_norm": 2.7808456420898438, "learning_rate": 2.3616666666666667e-05, "loss": 0.4643, "step": 1750 }, { "epoch": 72.99, "eval_accuracy": 0.7447552447552448, "eval_loss": 0.7036928534507751, "eval_runtime": 5.9101, "eval_samples_per_second": 48.392, "eval_steps_per_second": 3.046, "step": 1770 }, { "epoch": 73.98, "eval_accuracy": 0.7482517482517482, "eval_loss": 0.71629399061203, "eval_runtime": 6.0211, "eval_samples_per_second": 47.5, "eval_steps_per_second": 2.989, "step": 1794 }, { "epoch": 74.23, "grad_norm": 1.78495192527771, "learning_rate": 2.333888888888889e-05, "loss": 0.442, "step": 1800 }, { "epoch": 74.97, "eval_accuracy": 0.7377622377622378, "eval_loss": 0.6997957229614258, "eval_runtime": 4.4212, "eval_samples_per_second": 64.688, "eval_steps_per_second": 4.071, "step": 1818 }, { "epoch": 76.0, "eval_accuracy": 0.7447552447552448, "eval_loss": 0.6946483850479126, "eval_runtime": 4.0507, "eval_samples_per_second": 70.605, "eval_steps_per_second": 4.444, "step": 1843 }, { "epoch": 76.29, "grad_norm": 1.7383118867874146, "learning_rate": 2.306111111111111e-05, "loss": 0.4305, "step": 1850 }, { "epoch": 76.99, "eval_accuracy": 0.7552447552447552, "eval_loss": 0.6857091784477234, "eval_runtime": 4.1718, "eval_samples_per_second": 68.556, "eval_steps_per_second": 4.315, "step": 1867 }, { "epoch": 77.98, "eval_accuracy": 0.7447552447552448, "eval_loss": 0.6936307549476624, "eval_runtime": 3.8781, "eval_samples_per_second": 73.747, "eval_steps_per_second": 4.641, "step": 1891 }, { "epoch": 78.35, "grad_norm": 1.047067403793335, "learning_rate": 2.2783333333333336e-05, "loss": 0.4416, "step": 1900 }, { "epoch": 78.97, "eval_accuracy": 0.7517482517482518, "eval_loss": 0.6965110301971436, "eval_runtime": 5.1318, "eval_samples_per_second": 55.731, "eval_steps_per_second": 3.508, "step": 1915 }, { "epoch": 80.0, "eval_accuracy": 0.7482517482517482, "eval_loss": 0.7017127871513367, "eval_runtime": 4.3418, "eval_samples_per_second": 65.871, "eval_steps_per_second": 4.146, "step": 1940 }, { "epoch": 80.41, "grad_norm": 1.5354928970336914, "learning_rate": 2.2505555555555554e-05, "loss": 0.428, "step": 1950 }, { "epoch": 80.99, "eval_accuracy": 0.7552447552447552, "eval_loss": 0.6970596313476562, "eval_runtime": 5.973, "eval_samples_per_second": 47.882, "eval_steps_per_second": 3.014, "step": 1964 }, { "epoch": 81.98, "eval_accuracy": 0.7552447552447552, "eval_loss": 0.6897542476654053, "eval_runtime": 5.0481, "eval_samples_per_second": 56.655, "eval_steps_per_second": 3.566, "step": 1988 }, { "epoch": 82.47, "grad_norm": 1.7141317129135132, "learning_rate": 2.2227777777777776e-05, "loss": 0.4093, "step": 2000 }, { "epoch": 82.97, "eval_accuracy": 0.7482517482517482, "eval_loss": 0.7004020810127258, "eval_runtime": 4.1986, "eval_samples_per_second": 68.118, "eval_steps_per_second": 4.287, "step": 2012 }, { "epoch": 84.0, "eval_accuracy": 0.7552447552447552, "eval_loss": 0.6867479681968689, "eval_runtime": 4.6871, "eval_samples_per_second": 61.018, "eval_steps_per_second": 3.84, "step": 2037 }, { "epoch": 84.54, "grad_norm": 2.0219666957855225, "learning_rate": 2.195e-05, "loss": 0.4148, "step": 2050 }, { "epoch": 84.99, "eval_accuracy": 0.7377622377622378, "eval_loss": 0.7070020437240601, "eval_runtime": 5.9326, "eval_samples_per_second": 48.208, "eval_steps_per_second": 3.034, "step": 2061 }, { "epoch": 85.98, "eval_accuracy": 0.7447552447552448, "eval_loss": 0.7030305862426758, "eval_runtime": 5.3564, "eval_samples_per_second": 53.394, "eval_steps_per_second": 3.36, "step": 2085 }, { "epoch": 86.6, "grad_norm": 1.4678714275360107, "learning_rate": 2.1672222222222223e-05, "loss": 0.3923, "step": 2100 }, { "epoch": 86.97, "eval_accuracy": 0.7587412587412588, "eval_loss": 0.678174614906311, "eval_runtime": 3.9745, "eval_samples_per_second": 71.96, "eval_steps_per_second": 4.529, "step": 2109 }, { "epoch": 88.0, "eval_accuracy": 0.7412587412587412, "eval_loss": 0.7166118621826172, "eval_runtime": 4.0358, "eval_samples_per_second": 70.866, "eval_steps_per_second": 4.46, "step": 2134 }, { "epoch": 88.66, "grad_norm": 1.589543342590332, "learning_rate": 2.1394444444444445e-05, "loss": 0.3964, "step": 2150 }, { "epoch": 88.99, "eval_accuracy": 0.7482517482517482, "eval_loss": 0.7075912952423096, "eval_runtime": 5.0331, "eval_samples_per_second": 56.823, "eval_steps_per_second": 3.576, "step": 2158 }, { "epoch": 89.98, "eval_accuracy": 0.7657342657342657, "eval_loss": 0.6867172122001648, "eval_runtime": 5.386, "eval_samples_per_second": 53.101, "eval_steps_per_second": 3.342, "step": 2182 }, { "epoch": 90.72, "grad_norm": 1.3886605501174927, "learning_rate": 2.1116666666666667e-05, "loss": 0.3846, "step": 2200 }, { "epoch": 90.97, "eval_accuracy": 0.7517482517482518, "eval_loss": 0.6913285851478577, "eval_runtime": 5.5324, "eval_samples_per_second": 51.696, "eval_steps_per_second": 3.254, "step": 2206 }, { "epoch": 92.0, "eval_accuracy": 0.7482517482517482, "eval_loss": 0.7160294651985168, "eval_runtime": 5.2753, "eval_samples_per_second": 54.215, "eval_steps_per_second": 3.412, "step": 2231 }, { "epoch": 92.78, "grad_norm": 2.4106783866882324, "learning_rate": 2.083888888888889e-05, "loss": 0.3654, "step": 2250 }, { "epoch": 92.99, "eval_accuracy": 0.7517482517482518, "eval_loss": 0.6765207052230835, "eval_runtime": 5.5671, "eval_samples_per_second": 51.373, "eval_steps_per_second": 3.233, "step": 2255 }, { "epoch": 93.98, "eval_accuracy": 0.7657342657342657, "eval_loss": 0.6881967186927795, "eval_runtime": 3.8228, "eval_samples_per_second": 74.814, "eval_steps_per_second": 4.709, "step": 2279 }, { "epoch": 94.85, "grad_norm": 0.8871183395385742, "learning_rate": 2.0561111111111114e-05, "loss": 0.3577, "step": 2300 }, { "epoch": 94.97, "eval_accuracy": 0.7552447552447552, "eval_loss": 0.6852585673332214, "eval_runtime": 4.7228, "eval_samples_per_second": 60.557, "eval_steps_per_second": 3.811, "step": 2303 }, { "epoch": 96.0, "eval_accuracy": 0.7552447552447552, "eval_loss": 0.7158808708190918, "eval_runtime": 5.6504, "eval_samples_per_second": 50.616, "eval_steps_per_second": 3.186, "step": 2328 }, { "epoch": 96.91, "grad_norm": 1.0019863843917847, "learning_rate": 2.0283333333333333e-05, "loss": 0.37, "step": 2350 }, { "epoch": 96.99, "eval_accuracy": 0.7657342657342657, "eval_loss": 0.6943120360374451, "eval_runtime": 4.8337, "eval_samples_per_second": 59.168, "eval_steps_per_second": 3.724, "step": 2352 }, { "epoch": 97.98, "eval_accuracy": 0.7587412587412588, "eval_loss": 0.7010317444801331, "eval_runtime": 4.6874, "eval_samples_per_second": 61.015, "eval_steps_per_second": 3.84, "step": 2376 }, { "epoch": 98.97, "grad_norm": 1.2908928394317627, "learning_rate": 2.0005555555555555e-05, "loss": 0.3473, "step": 2400 }, { "epoch": 98.97, "eval_accuracy": 0.7727272727272727, "eval_loss": 0.693758487701416, "eval_runtime": 4.7585, "eval_samples_per_second": 60.103, "eval_steps_per_second": 3.783, "step": 2400 }, { "epoch": 100.0, "eval_accuracy": 0.7587412587412588, "eval_loss": 0.6918778419494629, "eval_runtime": 6.6891, "eval_samples_per_second": 42.756, "eval_steps_per_second": 2.691, "step": 2425 }, { "epoch": 100.99, "eval_accuracy": 0.7552447552447552, "eval_loss": 0.6849302053451538, "eval_runtime": 4.4685, "eval_samples_per_second": 64.003, "eval_steps_per_second": 4.028, "step": 2449 }, { "epoch": 101.03, "grad_norm": 1.1730871200561523, "learning_rate": 1.972777777777778e-05, "loss": 0.3587, "step": 2450 }, { "epoch": 101.98, "eval_accuracy": 0.7587412587412588, "eval_loss": 0.6855939030647278, "eval_runtime": 4.3434, "eval_samples_per_second": 65.847, "eval_steps_per_second": 4.144, "step": 2473 }, { "epoch": 102.97, "eval_accuracy": 0.7517482517482518, "eval_loss": 0.7046144604682922, "eval_runtime": 4.7166, "eval_samples_per_second": 60.637, "eval_steps_per_second": 3.816, "step": 2497 }, { "epoch": 103.09, "grad_norm": 1.3693217039108276, "learning_rate": 1.945e-05, "loss": 0.3429, "step": 2500 }, { "epoch": 104.0, "eval_accuracy": 0.7727272727272727, "eval_loss": 0.6892997622489929, "eval_runtime": 5.3868, "eval_samples_per_second": 53.092, "eval_steps_per_second": 3.341, "step": 2522 }, { "epoch": 104.99, "eval_accuracy": 0.7622377622377622, "eval_loss": 0.6913393139839172, "eval_runtime": 5.09, "eval_samples_per_second": 56.188, "eval_steps_per_second": 3.536, "step": 2546 }, { "epoch": 105.15, "grad_norm": 1.923829436302185, "learning_rate": 1.9172222222222224e-05, "loss": 0.3549, "step": 2550 }, { "epoch": 105.98, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.6880810856819153, "eval_runtime": 4.6668, "eval_samples_per_second": 61.283, "eval_steps_per_second": 3.857, "step": 2570 }, { "epoch": 106.97, "eval_accuracy": 0.7692307692307693, "eval_loss": 0.7097887396812439, "eval_runtime": 6.4652, "eval_samples_per_second": 44.237, "eval_steps_per_second": 2.784, "step": 2594 }, { "epoch": 107.22, "grad_norm": 2.702012062072754, "learning_rate": 1.8894444444444446e-05, "loss": 0.3403, "step": 2600 }, { "epoch": 108.0, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.6878336668014526, "eval_runtime": 4.6923, "eval_samples_per_second": 60.951, "eval_steps_per_second": 3.836, "step": 2619 }, { "epoch": 108.99, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.695954442024231, "eval_runtime": 4.4809, "eval_samples_per_second": 63.827, "eval_steps_per_second": 4.017, "step": 2643 }, { "epoch": 109.28, "grad_norm": 2.3427536487579346, "learning_rate": 1.8616666666666667e-05, "loss": 0.3253, "step": 2650 }, { "epoch": 109.98, "eval_accuracy": 0.7727272727272727, "eval_loss": 0.7005948424339294, "eval_runtime": 4.8882, "eval_samples_per_second": 58.508, "eval_steps_per_second": 3.682, "step": 2667 }, { "epoch": 110.97, "eval_accuracy": 0.7692307692307693, "eval_loss": 0.6916196346282959, "eval_runtime": 5.2891, "eval_samples_per_second": 54.073, "eval_steps_per_second": 3.403, "step": 2691 }, { "epoch": 111.34, "grad_norm": 2.178089141845703, "learning_rate": 1.833888888888889e-05, "loss": 0.3332, "step": 2700 }, { "epoch": 112.0, "eval_accuracy": 0.7657342657342657, "eval_loss": 0.7059447765350342, "eval_runtime": 4.7437, "eval_samples_per_second": 60.291, "eval_steps_per_second": 3.795, "step": 2716 }, { "epoch": 112.99, "eval_accuracy": 0.7867132867132867, "eval_loss": 0.6904045939445496, "eval_runtime": 4.9942, "eval_samples_per_second": 57.267, "eval_steps_per_second": 3.604, "step": 2740 }, { "epoch": 113.4, "grad_norm": 1.1625444889068604, "learning_rate": 1.806111111111111e-05, "loss": 0.3188, "step": 2750 }, { "epoch": 113.98, "eval_accuracy": 0.7727272727272727, "eval_loss": 0.6970774531364441, "eval_runtime": 6.4809, "eval_samples_per_second": 44.13, "eval_steps_per_second": 2.777, "step": 2764 }, { "epoch": 114.97, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.700820803642273, "eval_runtime": 5.2617, "eval_samples_per_second": 54.355, "eval_steps_per_second": 3.421, "step": 2788 }, { "epoch": 115.46, "grad_norm": 1.2394715547561646, "learning_rate": 1.7783333333333333e-05, "loss": 0.3112, "step": 2800 }, { "epoch": 116.0, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7002130150794983, "eval_runtime": 5.0937, "eval_samples_per_second": 56.147, "eval_steps_per_second": 3.534, "step": 2813 }, { "epoch": 116.99, "eval_accuracy": 0.7692307692307693, "eval_loss": 0.6909505724906921, "eval_runtime": 4.7575, "eval_samples_per_second": 60.116, "eval_steps_per_second": 3.784, "step": 2837 }, { "epoch": 117.53, "grad_norm": 2.4334964752197266, "learning_rate": 1.7505555555555558e-05, "loss": 0.3153, "step": 2850 }, { "epoch": 117.98, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.6957750916481018, "eval_runtime": 4.8105, "eval_samples_per_second": 59.453, "eval_steps_per_second": 3.742, "step": 2861 }, { "epoch": 118.97, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.6867520213127136, "eval_runtime": 4.5411, "eval_samples_per_second": 62.98, "eval_steps_per_second": 3.964, "step": 2885 }, { "epoch": 119.59, "grad_norm": 0.769097089767456, "learning_rate": 1.7227777777777777e-05, "loss": 0.3006, "step": 2900 }, { "epoch": 120.0, "eval_accuracy": 0.7727272727272727, "eval_loss": 0.6890790462493896, "eval_runtime": 4.5864, "eval_samples_per_second": 62.358, "eval_steps_per_second": 3.925, "step": 2910 }, { "epoch": 120.99, "eval_accuracy": 0.7657342657342657, "eval_loss": 0.6889089941978455, "eval_runtime": 6.5804, "eval_samples_per_second": 43.462, "eval_steps_per_second": 2.735, "step": 2934 }, { "epoch": 121.65, "grad_norm": 1.8714542388916016, "learning_rate": 1.695e-05, "loss": 0.2967, "step": 2950 }, { "epoch": 121.98, "eval_accuracy": 0.7657342657342657, "eval_loss": 0.6935350894927979, "eval_runtime": 4.7491, "eval_samples_per_second": 60.223, "eval_steps_per_second": 3.79, "step": 2958 }, { "epoch": 122.97, "eval_accuracy": 0.7692307692307693, "eval_loss": 0.7058219909667969, "eval_runtime": 4.8941, "eval_samples_per_second": 58.438, "eval_steps_per_second": 3.678, "step": 2982 }, { "epoch": 123.71, "grad_norm": 2.062924385070801, "learning_rate": 1.6672222222222224e-05, "loss": 0.2939, "step": 3000 }, { "epoch": 124.0, "eval_accuracy": 0.7657342657342657, "eval_loss": 0.7220865488052368, "eval_runtime": 5.0487, "eval_samples_per_second": 56.648, "eval_steps_per_second": 3.565, "step": 3007 }, { "epoch": 124.99, "eval_accuracy": 0.7727272727272727, "eval_loss": 0.6857044696807861, "eval_runtime": 5.6134, "eval_samples_per_second": 50.95, "eval_steps_per_second": 3.207, "step": 3031 }, { "epoch": 125.77, "grad_norm": 1.7039302587509155, "learning_rate": 1.6394444444444446e-05, "loss": 0.3101, "step": 3050 }, { "epoch": 125.98, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.6742061972618103, "eval_runtime": 5.3609, "eval_samples_per_second": 53.349, "eval_steps_per_second": 3.358, "step": 3055 }, { "epoch": 126.97, "eval_accuracy": 0.7727272727272727, "eval_loss": 0.7029407620429993, "eval_runtime": 5.8891, "eval_samples_per_second": 48.564, "eval_steps_per_second": 3.056, "step": 3079 }, { "epoch": 127.84, "grad_norm": 1.434970736503601, "learning_rate": 1.6116666666666668e-05, "loss": 0.284, "step": 3100 }, { "epoch": 128.0, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.682050347328186, "eval_runtime": 5.1437, "eval_samples_per_second": 55.602, "eval_steps_per_second": 3.499, "step": 3104 }, { "epoch": 128.99, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.68370121717453, "eval_runtime": 4.2733, "eval_samples_per_second": 66.927, "eval_steps_per_second": 4.212, "step": 3128 }, { "epoch": 129.9, "grad_norm": 1.320789098739624, "learning_rate": 1.583888888888889e-05, "loss": 0.2902, "step": 3150 }, { "epoch": 129.98, "eval_accuracy": 0.7727272727272727, "eval_loss": 0.6823462843894958, "eval_runtime": 5.7566, "eval_samples_per_second": 49.682, "eval_steps_per_second": 3.127, "step": 3152 }, { "epoch": 130.97, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.6950440406799316, "eval_runtime": 4.9248, "eval_samples_per_second": 58.074, "eval_steps_per_second": 3.655, "step": 3176 }, { "epoch": 131.96, "grad_norm": 2.1280930042266846, "learning_rate": 1.556111111111111e-05, "loss": 0.301, "step": 3200 }, { "epoch": 132.0, "eval_accuracy": 0.7727272727272727, "eval_loss": 0.6800761818885803, "eval_runtime": 8.1328, "eval_samples_per_second": 35.166, "eval_steps_per_second": 2.213, "step": 3201 }, { "epoch": 132.99, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.6867505311965942, "eval_runtime": 4.2532, "eval_samples_per_second": 67.244, "eval_steps_per_second": 4.232, "step": 3225 }, { "epoch": 133.98, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7061284184455872, "eval_runtime": 5.3031, "eval_samples_per_second": 53.93, "eval_steps_per_second": 3.394, "step": 3249 }, { "epoch": 134.02, "grad_norm": 1.532638669013977, "learning_rate": 1.5283333333333333e-05, "loss": 0.2736, "step": 3250 }, { "epoch": 134.97, "eval_accuracy": 0.7727272727272727, "eval_loss": 0.7114368677139282, "eval_runtime": 4.6536, "eval_samples_per_second": 61.458, "eval_steps_per_second": 3.868, "step": 3273 }, { "epoch": 136.0, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.6914551854133606, "eval_runtime": 4.5505, "eval_samples_per_second": 62.851, "eval_steps_per_second": 3.956, "step": 3298 }, { "epoch": 136.08, "grad_norm": 2.0108492374420166, "learning_rate": 1.5005555555555555e-05, "loss": 0.2931, "step": 3300 }, { "epoch": 136.99, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7055917978286743, "eval_runtime": 5.3067, "eval_samples_per_second": 53.894, "eval_steps_per_second": 3.392, "step": 3322 }, { "epoch": 137.98, "eval_accuracy": 0.7727272727272727, "eval_loss": 0.7026935815811157, "eval_runtime": 5.186, "eval_samples_per_second": 55.149, "eval_steps_per_second": 3.471, "step": 3346 }, { "epoch": 138.14, "grad_norm": 1.0804469585418701, "learning_rate": 1.4727777777777779e-05, "loss": 0.2864, "step": 3350 }, { "epoch": 138.97, "eval_accuracy": 0.7657342657342657, "eval_loss": 0.6983500719070435, "eval_runtime": 6.955, "eval_samples_per_second": 41.122, "eval_steps_per_second": 2.588, "step": 3370 }, { "epoch": 140.0, "eval_accuracy": 0.7657342657342657, "eval_loss": 0.7168787121772766, "eval_runtime": 4.234, "eval_samples_per_second": 67.548, "eval_steps_per_second": 4.251, "step": 3395 }, { "epoch": 140.21, "grad_norm": 2.370694637298584, "learning_rate": 1.445e-05, "loss": 0.2765, "step": 3400 }, { "epoch": 140.99, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.6960318088531494, "eval_runtime": 5.0294, "eval_samples_per_second": 56.865, "eval_steps_per_second": 3.579, "step": 3419 }, { "epoch": 141.98, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.6990492343902588, "eval_runtime": 5.2727, "eval_samples_per_second": 54.242, "eval_steps_per_second": 3.414, "step": 3443 }, { "epoch": 142.27, "grad_norm": 1.6676194667816162, "learning_rate": 1.4172222222222222e-05, "loss": 0.2808, "step": 3450 }, { "epoch": 142.97, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.706200897693634, "eval_runtime": 4.5273, "eval_samples_per_second": 63.173, "eval_steps_per_second": 3.976, "step": 3467 }, { "epoch": 144.0, "eval_accuracy": 0.7657342657342657, "eval_loss": 0.6821764707565308, "eval_runtime": 5.3614, "eval_samples_per_second": 53.344, "eval_steps_per_second": 3.357, "step": 3492 }, { "epoch": 144.33, "grad_norm": 1.9151145219802856, "learning_rate": 1.3894444444444444e-05, "loss": 0.2712, "step": 3500 }, { "epoch": 144.99, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.7063603401184082, "eval_runtime": 4.9088, "eval_samples_per_second": 58.263, "eval_steps_per_second": 3.667, "step": 3516 }, { "epoch": 145.98, "eval_accuracy": 0.7692307692307693, "eval_loss": 0.7150112390518188, "eval_runtime": 7.2044, "eval_samples_per_second": 39.698, "eval_steps_per_second": 2.498, "step": 3540 }, { "epoch": 146.39, "grad_norm": 1.5093848705291748, "learning_rate": 1.3622222222222223e-05, "loss": 0.2726, "step": 3550 }, { "epoch": 146.97, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.696849524974823, "eval_runtime": 4.9386, "eval_samples_per_second": 57.911, "eval_steps_per_second": 3.645, "step": 3564 }, { "epoch": 148.0, "eval_accuracy": 0.7727272727272727, "eval_loss": 0.7086759209632874, "eval_runtime": 4.4363, "eval_samples_per_second": 64.468, "eval_steps_per_second": 4.057, "step": 3589 }, { "epoch": 148.45, "grad_norm": 1.4403679370880127, "learning_rate": 1.3344444444444444e-05, "loss": 0.2607, "step": 3600 }, { "epoch": 148.99, "eval_accuracy": 0.7692307692307693, "eval_loss": 0.7129560112953186, "eval_runtime": 5.3809, "eval_samples_per_second": 53.15, "eval_steps_per_second": 3.345, "step": 3613 }, { "epoch": 149.98, "eval_accuracy": 0.7902097902097902, "eval_loss": 0.7080287933349609, "eval_runtime": 5.8187, "eval_samples_per_second": 49.152, "eval_steps_per_second": 3.093, "step": 3637 }, { "epoch": 150.52, "grad_norm": 2.036515235900879, "learning_rate": 1.3066666666666666e-05, "loss": 0.2546, "step": 3650 }, { "epoch": 150.97, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.7088435888290405, "eval_runtime": 4.8742, "eval_samples_per_second": 58.677, "eval_steps_per_second": 3.693, "step": 3661 }, { "epoch": 152.0, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7030193209648132, "eval_runtime": 4.9492, "eval_samples_per_second": 57.787, "eval_steps_per_second": 3.637, "step": 3686 }, { "epoch": 152.58, "grad_norm": 1.200052261352539, "learning_rate": 1.2788888888888888e-05, "loss": 0.2563, "step": 3700 }, { "epoch": 152.99, "eval_accuracy": 0.7692307692307693, "eval_loss": 0.7077969908714294, "eval_runtime": 4.614, "eval_samples_per_second": 61.985, "eval_steps_per_second": 3.901, "step": 3710 }, { "epoch": 153.98, "eval_accuracy": 0.7727272727272727, "eval_loss": 0.700455904006958, "eval_runtime": 5.7657, "eval_samples_per_second": 49.604, "eval_steps_per_second": 3.122, "step": 3734 }, { "epoch": 154.64, "grad_norm": 2.2751214504241943, "learning_rate": 1.2511111111111112e-05, "loss": 0.2531, "step": 3750 }, { "epoch": 154.97, "eval_accuracy": 0.7727272727272727, "eval_loss": 0.7160292267799377, "eval_runtime": 5.1079, "eval_samples_per_second": 55.992, "eval_steps_per_second": 3.524, "step": 3758 }, { "epoch": 156.0, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7175909876823425, "eval_runtime": 5.4035, "eval_samples_per_second": 52.929, "eval_steps_per_second": 3.331, "step": 3783 }, { "epoch": 156.7, "grad_norm": 1.9024412631988525, "learning_rate": 1.2233333333333334e-05, "loss": 0.2446, "step": 3800 }, { "epoch": 156.99, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.7190600037574768, "eval_runtime": 4.3633, "eval_samples_per_second": 65.546, "eval_steps_per_second": 4.125, "step": 3807 }, { "epoch": 157.98, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.719641387462616, "eval_runtime": 5.0426, "eval_samples_per_second": 56.717, "eval_steps_per_second": 3.57, "step": 3831 }, { "epoch": 158.76, "grad_norm": 3.471806287765503, "learning_rate": 1.1955555555555556e-05, "loss": 0.2479, "step": 3850 }, { "epoch": 158.97, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7073430418968201, "eval_runtime": 3.6336, "eval_samples_per_second": 78.711, "eval_steps_per_second": 4.954, "step": 3855 }, { "epoch": 160.0, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7328661680221558, "eval_runtime": 5.2625, "eval_samples_per_second": 54.347, "eval_steps_per_second": 3.42, "step": 3880 }, { "epoch": 160.82, "grad_norm": 2.1171793937683105, "learning_rate": 1.1677777777777777e-05, "loss": 0.2523, "step": 3900 }, { "epoch": 160.99, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7158821821212769, "eval_runtime": 6.5877, "eval_samples_per_second": 43.414, "eval_steps_per_second": 2.732, "step": 3904 }, { "epoch": 161.98, "eval_accuracy": 0.7692307692307693, "eval_loss": 0.719171404838562, "eval_runtime": 4.5674, "eval_samples_per_second": 62.618, "eval_steps_per_second": 3.941, "step": 3928 }, { "epoch": 162.89, "grad_norm": 1.7515395879745483, "learning_rate": 1.1400000000000001e-05, "loss": 0.2523, "step": 3950 }, { "epoch": 162.97, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.7281435132026672, "eval_runtime": 4.4866, "eval_samples_per_second": 63.746, "eval_steps_per_second": 4.012, "step": 3952 }, { "epoch": 164.0, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7078841328620911, "eval_runtime": 4.4241, "eval_samples_per_second": 64.645, "eval_steps_per_second": 4.069, "step": 3977 }, { "epoch": 164.95, "grad_norm": 1.456335186958313, "learning_rate": 1.1122222222222223e-05, "loss": 0.2422, "step": 4000 }, { "epoch": 164.99, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.7161521911621094, "eval_runtime": 5.1239, "eval_samples_per_second": 55.817, "eval_steps_per_second": 3.513, "step": 4001 }, { "epoch": 165.98, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7190020084381104, "eval_runtime": 3.4488, "eval_samples_per_second": 82.926, "eval_steps_per_second": 5.219, "step": 4025 }, { "epoch": 166.97, "eval_accuracy": 0.7762237762237763, "eval_loss": 0.7311248779296875, "eval_runtime": 5.0389, "eval_samples_per_second": 56.759, "eval_steps_per_second": 3.572, "step": 4049 }, { "epoch": 167.01, "grad_norm": 1.2554075717926025, "learning_rate": 1.0844444444444445e-05, "loss": 0.242, "step": 4050 }, { "epoch": 168.0, "eval_accuracy": 0.7902097902097902, "eval_loss": 0.7110462188720703, "eval_runtime": 4.4612, "eval_samples_per_second": 64.108, "eval_steps_per_second": 4.035, "step": 4074 }, { "epoch": 168.99, "eval_accuracy": 0.7867132867132867, "eval_loss": 0.7028501629829407, "eval_runtime": 6.955, "eval_samples_per_second": 41.122, "eval_steps_per_second": 2.588, "step": 4098 }, { "epoch": 169.07, "grad_norm": 2.8003265857696533, "learning_rate": 1.0566666666666667e-05, "loss": 0.2392, "step": 4100 }, { "epoch": 169.98, "eval_accuracy": 0.7937062937062938, "eval_loss": 0.7108554840087891, "eval_runtime": 5.0033, "eval_samples_per_second": 57.162, "eval_steps_per_second": 3.598, "step": 4122 }, { "epoch": 170.97, "eval_accuracy": 0.7902097902097902, "eval_loss": 0.7106384634971619, "eval_runtime": 5.1984, "eval_samples_per_second": 55.017, "eval_steps_per_second": 3.463, "step": 4146 }, { "epoch": 171.13, "grad_norm": 2.1897969245910645, "learning_rate": 1.028888888888889e-05, "loss": 0.247, "step": 4150 }, { "epoch": 172.0, "eval_accuracy": 0.7867132867132867, "eval_loss": 0.7151694297790527, "eval_runtime": 5.1963, "eval_samples_per_second": 55.039, "eval_steps_per_second": 3.464, "step": 4171 }, { "epoch": 172.99, "eval_accuracy": 0.7657342657342657, "eval_loss": 0.7254167795181274, "eval_runtime": 4.4466, "eval_samples_per_second": 64.319, "eval_steps_per_second": 4.048, "step": 4195 }, { "epoch": 173.2, "grad_norm": 2.769357681274414, "learning_rate": 1.0011111111111112e-05, "loss": 0.2341, "step": 4200 }, { "epoch": 173.98, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7290962338447571, "eval_runtime": 6.2221, "eval_samples_per_second": 45.965, "eval_steps_per_second": 2.893, "step": 4219 }, { "epoch": 174.97, "eval_accuracy": 0.7867132867132867, "eval_loss": 0.7088623046875, "eval_runtime": 4.3709, "eval_samples_per_second": 65.433, "eval_steps_per_second": 4.118, "step": 4243 }, { "epoch": 175.26, "grad_norm": 2.044703483581543, "learning_rate": 9.733333333333332e-06, "loss": 0.2317, "step": 4250 }, { "epoch": 176.0, "eval_accuracy": 0.7902097902097902, "eval_loss": 0.7185826897621155, "eval_runtime": 5.4095, "eval_samples_per_second": 52.87, "eval_steps_per_second": 3.327, "step": 4268 }, { "epoch": 176.99, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7167823314666748, "eval_runtime": 4.9506, "eval_samples_per_second": 57.77, "eval_steps_per_second": 3.636, "step": 4292 }, { "epoch": 177.32, "grad_norm": 1.078834056854248, "learning_rate": 9.455555555555556e-06, "loss": 0.2269, "step": 4300 }, { "epoch": 177.98, "eval_accuracy": 0.7902097902097902, "eval_loss": 0.7237738966941833, "eval_runtime": 4.781, "eval_samples_per_second": 59.82, "eval_steps_per_second": 3.765, "step": 4316 }, { "epoch": 178.97, "eval_accuracy": 0.7867132867132867, "eval_loss": 0.7131801247596741, "eval_runtime": 4.6869, "eval_samples_per_second": 61.022, "eval_steps_per_second": 3.841, "step": 4340 }, { "epoch": 179.38, "grad_norm": 2.008120536804199, "learning_rate": 9.177777777777778e-06, "loss": 0.2283, "step": 4350 }, { "epoch": 180.0, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7384253144264221, "eval_runtime": 4.5879, "eval_samples_per_second": 62.338, "eval_steps_per_second": 3.923, "step": 4365 }, { "epoch": 180.99, "eval_accuracy": 0.7902097902097902, "eval_loss": 0.7002861499786377, "eval_runtime": 5.3238, "eval_samples_per_second": 53.721, "eval_steps_per_second": 3.381, "step": 4389 }, { "epoch": 181.44, "grad_norm": 1.9518792629241943, "learning_rate": 8.900000000000001e-06, "loss": 0.2303, "step": 4400 }, { "epoch": 181.98, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7278482913970947, "eval_runtime": 5.8358, "eval_samples_per_second": 49.008, "eval_steps_per_second": 3.084, "step": 4413 }, { "epoch": 182.97, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7143127918243408, "eval_runtime": 6.1229, "eval_samples_per_second": 46.71, "eval_steps_per_second": 2.94, "step": 4437 }, { "epoch": 183.51, "grad_norm": 1.0936890840530396, "learning_rate": 8.622222222222221e-06, "loss": 0.2109, "step": 4450 }, { "epoch": 184.0, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7406834363937378, "eval_runtime": 5.0467, "eval_samples_per_second": 56.671, "eval_steps_per_second": 3.567, "step": 4462 }, { "epoch": 184.99, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7053534388542175, "eval_runtime": 5.279, "eval_samples_per_second": 54.177, "eval_steps_per_second": 3.41, "step": 4486 }, { "epoch": 185.57, "grad_norm": 2.9350059032440186, "learning_rate": 8.344444444444445e-06, "loss": 0.2261, "step": 4500 }, { "epoch": 185.98, "eval_accuracy": 0.7727272727272727, "eval_loss": 0.7260809540748596, "eval_runtime": 5.4165, "eval_samples_per_second": 52.802, "eval_steps_per_second": 3.323, "step": 4510 }, { "epoch": 186.97, "eval_accuracy": 0.7902097902097902, "eval_loss": 0.7240064144134521, "eval_runtime": 5.4866, "eval_samples_per_second": 52.127, "eval_steps_per_second": 3.281, "step": 4534 }, { "epoch": 187.63, "grad_norm": 1.8322782516479492, "learning_rate": 8.066666666666667e-06, "loss": 0.2282, "step": 4550 }, { "epoch": 188.0, "eval_accuracy": 0.7867132867132867, "eval_loss": 0.7199599146842957, "eval_runtime": 4.6736, "eval_samples_per_second": 61.195, "eval_steps_per_second": 3.851, "step": 4559 }, { "epoch": 188.99, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7102844715118408, "eval_runtime": 5.4219, "eval_samples_per_second": 52.749, "eval_steps_per_second": 3.32, "step": 4583 }, { "epoch": 189.69, "grad_norm": 1.8777916431427002, "learning_rate": 7.78888888888889e-06, "loss": 0.2321, "step": 4600 }, { "epoch": 189.98, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7083376049995422, "eval_runtime": 5.9634, "eval_samples_per_second": 47.959, "eval_steps_per_second": 3.018, "step": 4607 }, { "epoch": 190.97, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7244677543640137, "eval_runtime": 5.2078, "eval_samples_per_second": 54.918, "eval_steps_per_second": 3.456, "step": 4631 }, { "epoch": 191.75, "grad_norm": 1.5277408361434937, "learning_rate": 7.5111111111111105e-06, "loss": 0.2261, "step": 4650 }, { "epoch": 192.0, "eval_accuracy": 0.7867132867132867, "eval_loss": 0.7124583721160889, "eval_runtime": 5.7079, "eval_samples_per_second": 50.106, "eval_steps_per_second": 3.154, "step": 4656 }, { "epoch": 192.99, "eval_accuracy": 0.7867132867132867, "eval_loss": 0.7308976054191589, "eval_runtime": 5.3404, "eval_samples_per_second": 53.554, "eval_steps_per_second": 3.371, "step": 4680 }, { "epoch": 193.81, "grad_norm": 2.095749616622925, "learning_rate": 7.233333333333333e-06, "loss": 0.2231, "step": 4700 }, { "epoch": 193.98, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7237818837165833, "eval_runtime": 4.6666, "eval_samples_per_second": 61.286, "eval_steps_per_second": 3.857, "step": 4704 }, { "epoch": 194.97, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7253320217132568, "eval_runtime": 5.8059, "eval_samples_per_second": 49.261, "eval_steps_per_second": 3.1, "step": 4728 }, { "epoch": 195.88, "grad_norm": 1.6955636739730835, "learning_rate": 6.955555555555556e-06, "loss": 0.2083, "step": 4750 }, { "epoch": 196.0, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7240011692047119, "eval_runtime": 6.0767, "eval_samples_per_second": 47.065, "eval_steps_per_second": 2.962, "step": 4753 }, { "epoch": 196.99, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7131750583648682, "eval_runtime": 5.3063, "eval_samples_per_second": 53.898, "eval_steps_per_second": 3.392, "step": 4777 }, { "epoch": 197.94, "grad_norm": 0.8933289051055908, "learning_rate": 6.677777777777778e-06, "loss": 0.2116, "step": 4800 }, { "epoch": 197.98, "eval_accuracy": 0.7867132867132867, "eval_loss": 0.7169559597969055, "eval_runtime": 5.5713, "eval_samples_per_second": 51.335, "eval_steps_per_second": 3.231, "step": 4801 }, { "epoch": 198.97, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7265609502792358, "eval_runtime": 4.1397, "eval_samples_per_second": 69.087, "eval_steps_per_second": 4.348, "step": 4825 }, { "epoch": 200.0, "grad_norm": 2.175414562225342, "learning_rate": 6.4000000000000006e-06, "loss": 0.2219, "step": 4850 }, { "epoch": 200.0, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7162622213363647, "eval_runtime": 5.2016, "eval_samples_per_second": 54.984, "eval_steps_per_second": 3.461, "step": 4850 }, { "epoch": 200.99, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7302048802375793, "eval_runtime": 4.9222, "eval_samples_per_second": 58.104, "eval_steps_per_second": 3.657, "step": 4874 }, { "epoch": 201.98, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7223746180534363, "eval_runtime": 4.6884, "eval_samples_per_second": 61.002, "eval_steps_per_second": 3.839, "step": 4898 }, { "epoch": 202.06, "grad_norm": 2.053739309310913, "learning_rate": 6.1222222222222224e-06, "loss": 0.2183, "step": 4900 }, { "epoch": 202.97, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7179226279258728, "eval_runtime": 4.5556, "eval_samples_per_second": 62.78, "eval_steps_per_second": 3.951, "step": 4922 }, { "epoch": 204.0, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7245286107063293, "eval_runtime": 5.7474, "eval_samples_per_second": 49.762, "eval_steps_per_second": 3.132, "step": 4947 }, { "epoch": 204.12, "grad_norm": 1.1081063747406006, "learning_rate": 5.844444444444444e-06, "loss": 0.2053, "step": 4950 }, { "epoch": 204.99, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7344977259635925, "eval_runtime": 5.4178, "eval_samples_per_second": 52.789, "eval_steps_per_second": 3.322, "step": 4971 }, { "epoch": 205.98, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7249557971954346, "eval_runtime": 5.6352, "eval_samples_per_second": 50.753, "eval_steps_per_second": 3.194, "step": 4995 }, { "epoch": 206.19, "grad_norm": 1.09213125705719, "learning_rate": 5.566666666666667e-06, "loss": 0.2113, "step": 5000 }, { "epoch": 206.97, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7246001958847046, "eval_runtime": 4.9071, "eval_samples_per_second": 58.283, "eval_steps_per_second": 3.668, "step": 5019 }, { "epoch": 208.0, "eval_accuracy": 0.7867132867132867, "eval_loss": 0.7270117998123169, "eval_runtime": 5.8385, "eval_samples_per_second": 48.985, "eval_steps_per_second": 3.083, "step": 5044 }, { "epoch": 208.25, "grad_norm": 1.6693130731582642, "learning_rate": 5.288888888888889e-06, "loss": 0.2152, "step": 5050 }, { "epoch": 208.99, "eval_accuracy": 0.7867132867132867, "eval_loss": 0.7285901308059692, "eval_runtime": 5.489, "eval_samples_per_second": 52.104, "eval_steps_per_second": 3.279, "step": 5068 }, { "epoch": 209.98, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7332947254180908, "eval_runtime": 5.3017, "eval_samples_per_second": 53.945, "eval_steps_per_second": 3.395, "step": 5092 }, { "epoch": 210.31, "grad_norm": 2.0511515140533447, "learning_rate": 5.011111111111112e-06, "loss": 0.2129, "step": 5100 }, { "epoch": 210.97, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7307863831520081, "eval_runtime": 5.2991, "eval_samples_per_second": 53.971, "eval_steps_per_second": 3.397, "step": 5116 }, { "epoch": 212.0, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7176437973976135, "eval_runtime": 4.9452, "eval_samples_per_second": 57.834, "eval_steps_per_second": 3.64, "step": 5141 }, { "epoch": 212.37, "grad_norm": 1.8491023778915405, "learning_rate": 4.7333333333333335e-06, "loss": 0.2173, "step": 5150 }, { "epoch": 212.99, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7334882020950317, "eval_runtime": 4.9602, "eval_samples_per_second": 57.659, "eval_steps_per_second": 3.629, "step": 5165 }, { "epoch": 213.98, "eval_accuracy": 0.7797202797202797, "eval_loss": 0.7268483638763428, "eval_runtime": 5.885, "eval_samples_per_second": 48.598, "eval_steps_per_second": 3.059, "step": 5189 }, { "epoch": 214.43, "grad_norm": 1.2067769765853882, "learning_rate": 4.455555555555556e-06, "loss": 0.2042, "step": 5200 }, { "epoch": 214.97, "eval_accuracy": 0.7902097902097902, "eval_loss": 0.7299237847328186, "eval_runtime": 5.7645, "eval_samples_per_second": 49.614, "eval_steps_per_second": 3.123, "step": 5213 }, { "epoch": 216.0, "eval_accuracy": 0.7902097902097902, "eval_loss": 0.7360625863075256, "eval_runtime": 4.7143, "eval_samples_per_second": 60.667, "eval_steps_per_second": 3.818, "step": 5238 }, { "epoch": 216.49, "grad_norm": 1.3863427639007568, "learning_rate": 4.177777777777777e-06, "loss": 0.2112, "step": 5250 }, { "epoch": 216.99, "eval_accuracy": 0.7902097902097902, "eval_loss": 0.723866879940033, "eval_runtime": 5.3445, "eval_samples_per_second": 53.513, "eval_steps_per_second": 3.368, "step": 5262 }, { "epoch": 217.98, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.7252445220947266, "eval_runtime": 4.6314, "eval_samples_per_second": 61.753, "eval_steps_per_second": 3.887, "step": 5286 }, { "epoch": 218.56, "grad_norm": 1.1177924871444702, "learning_rate": 3.9e-06, "loss": 0.2007, "step": 5300 }, { "epoch": 218.97, "eval_accuracy": 0.7867132867132867, "eval_loss": 0.719983696937561, "eval_runtime": 4.865, "eval_samples_per_second": 58.787, "eval_steps_per_second": 3.7, "step": 5310 }, { "epoch": 220.0, "eval_accuracy": 0.7867132867132867, "eval_loss": 0.7195786237716675, "eval_runtime": 5.5422, "eval_samples_per_second": 51.604, "eval_steps_per_second": 3.248, "step": 5335 }, { "epoch": 220.62, "grad_norm": 1.413304090499878, "learning_rate": 3.6222222222222226e-06, "loss": 0.2163, "step": 5350 }, { "epoch": 220.99, "eval_accuracy": 0.7902097902097902, "eval_loss": 0.7309580445289612, "eval_runtime": 5.2512, "eval_samples_per_second": 54.463, "eval_steps_per_second": 3.428, "step": 5359 }, { "epoch": 221.98, "eval_accuracy": 0.7867132867132867, "eval_loss": 0.7313971519470215, "eval_runtime": 5.1151, "eval_samples_per_second": 55.913, "eval_steps_per_second": 3.519, "step": 5383 }, { "epoch": 222.68, "grad_norm": 3.0471901893615723, "learning_rate": 3.3444444444444445e-06, "loss": 0.2141, "step": 5400 }, { "epoch": 222.97, "eval_accuracy": 0.7832167832167832, "eval_loss": 0.727938175201416, "eval_runtime": 4.6405, "eval_samples_per_second": 61.631, "eval_steps_per_second": 3.879, "step": 5407 }, { "epoch": 224.0, "eval_accuracy": 0.7902097902097902, "eval_loss": 0.725923478603363, "eval_runtime": 5.0906, "eval_samples_per_second": 56.182, "eval_steps_per_second": 3.536, "step": 5432 } ], "logging_steps": 50, "max_steps": 6000, "num_input_tokens_seen": 0, "num_train_epochs": 250, "save_steps": 500, "total_flos": 3.037085846065152e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }