{ "best_metric": 0.2531912922859192, "best_model_checkpoint": "./ryan_model314/checkpoint-600", "epoch": 4.0, "eval_steps": 100, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 1.1103402376174927, "learning_rate": 0.00019920000000000002, "loss": 0.5731, "step": 10 }, { "epoch": 0.03, "grad_norm": 0.8193413019180298, "learning_rate": 0.0001984, "loss": 0.4217, "step": 20 }, { "epoch": 0.05, "grad_norm": 1.3536686897277832, "learning_rate": 0.0001976, "loss": 0.3709, "step": 30 }, { "epoch": 0.06, "grad_norm": 0.9998810887336731, "learning_rate": 0.0001968, "loss": 0.3398, "step": 40 }, { "epoch": 0.08, "grad_norm": 1.5689244270324707, "learning_rate": 0.000196, "loss": 0.3346, "step": 50 }, { "epoch": 0.1, "grad_norm": 1.1778826713562012, "learning_rate": 0.0001952, "loss": 0.3406, "step": 60 }, { "epoch": 0.11, "grad_norm": 1.3193926811218262, "learning_rate": 0.0001944, "loss": 0.2755, "step": 70 }, { "epoch": 0.13, "grad_norm": 1.1302804946899414, "learning_rate": 0.00019360000000000002, "loss": 0.3944, "step": 80 }, { "epoch": 0.14, "grad_norm": 0.8255844712257385, "learning_rate": 0.0001928, "loss": 0.3473, "step": 90 }, { "epoch": 0.16, "grad_norm": 1.0871790647506714, "learning_rate": 0.000192, "loss": 0.3042, "step": 100 }, { "epoch": 0.16, "eval_loss": 0.3673088252544403, "eval_na_accuracy": 0.928, "eval_ordinal_accuracy": 0.4671280276816609, "eval_runtime": 110.8646, "eval_samples_per_second": 9.02, "eval_steps_per_second": 1.128, "step": 100 }, { "epoch": 0.18, "grad_norm": 1.5816177129745483, "learning_rate": 0.0001912, "loss": 0.4058, "step": 110 }, { "epoch": 0.19, "grad_norm": 0.8431822061538696, "learning_rate": 0.0001904, "loss": 0.2781, "step": 120 }, { "epoch": 0.21, "grad_norm": 1.0826754570007324, "learning_rate": 0.0001896, "loss": 0.2587, "step": 130 }, { "epoch": 0.22, "grad_norm": 3.7366294860839844, "learning_rate": 0.0001888, 
"loss": 0.3432, "step": 140 }, { "epoch": 0.24, "grad_norm": 0.5233088731765747, "learning_rate": 0.000188, "loss": 0.3484, "step": 150 }, { "epoch": 0.26, "grad_norm": 1.8766111135482788, "learning_rate": 0.00018720000000000002, "loss": 0.3597, "step": 160 }, { "epoch": 0.27, "grad_norm": 1.0037935972213745, "learning_rate": 0.00018640000000000003, "loss": 0.288, "step": 170 }, { "epoch": 0.29, "grad_norm": 1.3281046152114868, "learning_rate": 0.0001856, "loss": 0.3207, "step": 180 }, { "epoch": 0.3, "grad_norm": 1.4793013334274292, "learning_rate": 0.00018480000000000002, "loss": 0.3372, "step": 190 }, { "epoch": 0.32, "grad_norm": 0.8796727657318115, "learning_rate": 0.00018400000000000003, "loss": 0.2904, "step": 200 }, { "epoch": 0.32, "eval_loss": 0.29769936203956604, "eval_na_accuracy": 0.933, "eval_ordinal_accuracy": 0.5790080738177624, "eval_runtime": 39.6284, "eval_samples_per_second": 25.234, "eval_steps_per_second": 3.154, "step": 200 }, { "epoch": 0.34, "grad_norm": 1.3502057790756226, "learning_rate": 0.0001832, "loss": 0.3519, "step": 210 }, { "epoch": 0.35, "grad_norm": 1.5546174049377441, "learning_rate": 0.00018240000000000002, "loss": 0.3243, "step": 220 }, { "epoch": 0.37, "grad_norm": 0.7677227854728699, "learning_rate": 0.00018160000000000002, "loss": 0.2914, "step": 230 }, { "epoch": 0.38, "grad_norm": 1.1754639148712158, "learning_rate": 0.0001808, "loss": 0.3539, "step": 240 }, { "epoch": 0.4, "grad_norm": 0.8472470641136169, "learning_rate": 0.00018, "loss": 0.2395, "step": 250 }, { "epoch": 0.42, "grad_norm": 1.1917964220046997, "learning_rate": 0.00017920000000000002, "loss": 0.2295, "step": 260 }, { "epoch": 0.43, "grad_norm": 0.7398644685745239, "learning_rate": 0.0001784, "loss": 0.2398, "step": 270 }, { "epoch": 0.45, "grad_norm": 0.5953208804130554, "learning_rate": 0.0001776, "loss": 0.2786, "step": 280 }, { "epoch": 0.46, "grad_norm": 2.0648913383483887, "learning_rate": 0.00017680000000000001, "loss": 0.3661, "step": 290 }, { 
"epoch": 0.48, "grad_norm": 1.4048805236816406, "learning_rate": 0.00017600000000000002, "loss": 0.2648, "step": 300 }, { "epoch": 0.48, "eval_loss": 0.2830840051174164, "eval_na_accuracy": 0.944, "eval_ordinal_accuracy": 0.5940023068050749, "eval_runtime": 39.7255, "eval_samples_per_second": 25.173, "eval_steps_per_second": 3.147, "step": 300 }, { "epoch": 0.5, "grad_norm": 0.8102580904960632, "learning_rate": 0.0001752, "loss": 0.2359, "step": 310 }, { "epoch": 0.51, "grad_norm": 2.0220913887023926, "learning_rate": 0.0001744, "loss": 0.2557, "step": 320 }, { "epoch": 0.53, "grad_norm": 1.2111886739730835, "learning_rate": 0.00017360000000000002, "loss": 0.3025, "step": 330 }, { "epoch": 0.54, "grad_norm": 1.788378119468689, "learning_rate": 0.0001728, "loss": 0.3067, "step": 340 }, { "epoch": 0.56, "grad_norm": 0.7332974076271057, "learning_rate": 0.000172, "loss": 0.2612, "step": 350 }, { "epoch": 0.58, "grad_norm": 0.5220205783843994, "learning_rate": 0.00017120000000000001, "loss": 0.2924, "step": 360 }, { "epoch": 0.59, "grad_norm": 0.8991191387176514, "learning_rate": 0.0001704, "loss": 0.2379, "step": 370 }, { "epoch": 0.61, "grad_norm": 1.6633837223052979, "learning_rate": 0.0001696, "loss": 0.2792, "step": 380 }, { "epoch": 0.62, "grad_norm": 0.9553330540657043, "learning_rate": 0.0001688, "loss": 0.2512, "step": 390 }, { "epoch": 0.64, "grad_norm": 0.4544942080974579, "learning_rate": 0.000168, "loss": 0.3036, "step": 400 }, { "epoch": 0.64, "eval_loss": 0.27759096026420593, "eval_na_accuracy": 0.949, "eval_ordinal_accuracy": 0.5870818915801614, "eval_runtime": 39.7611, "eval_samples_per_second": 25.15, "eval_steps_per_second": 3.144, "step": 400 }, { "epoch": 0.66, "grad_norm": 1.2751814126968384, "learning_rate": 0.0001672, "loss": 0.3042, "step": 410 }, { "epoch": 0.67, "grad_norm": 1.791074514389038, "learning_rate": 0.0001664, "loss": 0.3341, "step": 420 }, { "epoch": 0.69, "grad_norm": 0.9887642860412598, "learning_rate": 0.0001656, "loss": 
0.2868, "step": 430 }, { "epoch": 0.7, "grad_norm": 1.3511923551559448, "learning_rate": 0.0001648, "loss": 0.3763, "step": 440 }, { "epoch": 0.72, "grad_norm": 1.7992609739303589, "learning_rate": 0.000164, "loss": 0.2264, "step": 450 }, { "epoch": 0.74, "grad_norm": 1.0241813659667969, "learning_rate": 0.0001632, "loss": 0.3018, "step": 460 }, { "epoch": 0.75, "grad_norm": 0.628193736076355, "learning_rate": 0.00016240000000000002, "loss": 0.3323, "step": 470 }, { "epoch": 0.77, "grad_norm": 0.8471026420593262, "learning_rate": 0.00016160000000000002, "loss": 0.2005, "step": 480 }, { "epoch": 0.78, "grad_norm": 1.1799852848052979, "learning_rate": 0.0001608, "loss": 0.2984, "step": 490 }, { "epoch": 0.8, "grad_norm": 1.595058560371399, "learning_rate": 0.00016, "loss": 0.2656, "step": 500 }, { "epoch": 0.8, "eval_loss": 0.2846027612686157, "eval_na_accuracy": 0.931, "eval_ordinal_accuracy": 0.6101499423298731, "eval_runtime": 39.9154, "eval_samples_per_second": 25.053, "eval_steps_per_second": 3.132, "step": 500 }, { "epoch": 0.82, "grad_norm": 0.9246352910995483, "learning_rate": 0.00015920000000000002, "loss": 0.3591, "step": 510 }, { "epoch": 0.83, "grad_norm": 0.9456105828285217, "learning_rate": 0.00015840000000000003, "loss": 0.3569, "step": 520 }, { "epoch": 0.85, "grad_norm": 1.111274003982544, "learning_rate": 0.0001576, "loss": 0.3243, "step": 530 }, { "epoch": 0.86, "grad_norm": null, "learning_rate": 0.00015688, "loss": 0.2911, "step": 540 }, { "epoch": 0.88, "grad_norm": 0.8232502341270447, "learning_rate": 0.00015616000000000002, "loss": 0.3236, "step": 550 }, { "epoch": 0.9, "grad_norm": 0.6359846591949463, "learning_rate": 0.00015536, "loss": 0.3211, "step": 560 }, { "epoch": 0.91, "grad_norm": 0.545005738735199, "learning_rate": 0.00015456, "loss": 0.205, "step": 570 }, { "epoch": 0.93, "grad_norm": 0.6029797196388245, "learning_rate": 0.00015376000000000002, "loss": 0.1928, "step": 580 }, { "epoch": 0.94, "grad_norm": 0.7442355155944824, 
"learning_rate": 0.00015296000000000003, "loss": 0.3273, "step": 590 }, { "epoch": 0.96, "grad_norm": 0.6751519441604614, "learning_rate": 0.00015216, "loss": 0.2954, "step": 600 }, { "epoch": 0.96, "eval_loss": 0.2531912922859192, "eval_na_accuracy": 0.947, "eval_ordinal_accuracy": 0.5951557093425606, "eval_runtime": 39.4037, "eval_samples_per_second": 25.378, "eval_steps_per_second": 3.172, "step": 600 }, { "epoch": 0.98, "grad_norm": 1.6347012519836426, "learning_rate": 0.00015136000000000001, "loss": 0.2256, "step": 610 }, { "epoch": 0.99, "grad_norm": 6.180319309234619, "learning_rate": 0.00015056000000000002, "loss": 0.2003, "step": 620 }, { "epoch": 1.01, "grad_norm": 0.8919633626937866, "learning_rate": 0.00014976, "loss": 0.198, "step": 630 }, { "epoch": 1.02, "grad_norm": 0.9197341203689575, "learning_rate": 0.00014896, "loss": 0.1685, "step": 640 }, { "epoch": 1.04, "grad_norm": 0.37014976143836975, "learning_rate": 0.00014816000000000002, "loss": 0.1729, "step": 650 }, { "epoch": 1.06, "grad_norm": 0.8919755220413208, "learning_rate": 0.00014736, "loss": 0.1993, "step": 660 }, { "epoch": 1.07, "grad_norm": 0.7291600704193115, "learning_rate": 0.00014656, "loss": 0.1893, "step": 670 }, { "epoch": 1.09, "grad_norm": 2.347400665283203, "learning_rate": 0.00014576000000000001, "loss": 0.1799, "step": 680 }, { "epoch": 1.1, "grad_norm": 0.3188568949699402, "learning_rate": 0.00014496, "loss": 0.167, "step": 690 }, { "epoch": 1.12, "grad_norm": 1.342278242111206, "learning_rate": 0.00014416, "loss": 0.1991, "step": 700 }, { "epoch": 1.12, "eval_loss": 0.26034072041511536, "eval_na_accuracy": 0.942, "eval_ordinal_accuracy": 0.6078431372549019, "eval_runtime": 39.5088, "eval_samples_per_second": 25.311, "eval_steps_per_second": 3.164, "step": 700 }, { "epoch": 1.14, "grad_norm": 0.48655757308006287, "learning_rate": 0.00014336, "loss": 0.1885, "step": 710 }, { "epoch": 1.15, "grad_norm": 0.556333065032959, "learning_rate": 0.00014256000000000002, "loss": 
0.1449, "step": 720 }, { "epoch": 1.17, "grad_norm": 0.4880894422531128, "learning_rate": 0.00014176, "loss": 0.1164, "step": 730 }, { "epoch": 1.18, "grad_norm": 0.599926233291626, "learning_rate": 0.00014096, "loss": 0.2113, "step": 740 }, { "epoch": 1.2, "grad_norm": 0.6070149540901184, "learning_rate": 0.00014016, "loss": 0.1534, "step": 750 }, { "epoch": 1.22, "grad_norm": 0.7789746522903442, "learning_rate": 0.00013936, "loss": 0.1655, "step": 760 }, { "epoch": 1.23, "grad_norm": 0.5523375868797302, "learning_rate": 0.00013856, "loss": 0.298, "step": 770 }, { "epoch": 1.25, "grad_norm": 2.4257819652557373, "learning_rate": 0.00013776, "loss": 0.2101, "step": 780 }, { "epoch": 1.26, "grad_norm": 0.5729731321334839, "learning_rate": 0.00013696, "loss": 0.133, "step": 790 }, { "epoch": 1.28, "grad_norm": 0.4050444960594177, "learning_rate": 0.00013616, "loss": 0.1678, "step": 800 }, { "epoch": 1.28, "eval_loss": 0.2904650568962097, "eval_na_accuracy": 0.942, "eval_ordinal_accuracy": 0.6332179930795848, "eval_runtime": 39.7139, "eval_samples_per_second": 25.18, "eval_steps_per_second": 3.148, "step": 800 }, { "epoch": 1.3, "grad_norm": 4.782747745513916, "learning_rate": 0.00013536, "loss": 0.207, "step": 810 }, { "epoch": 1.31, "grad_norm": 2.577669143676758, "learning_rate": 0.00013455999999999999, "loss": 0.1818, "step": 820 }, { "epoch": 1.33, "grad_norm": 2.8163273334503174, "learning_rate": 0.00013376, "loss": 0.1761, "step": 830 }, { "epoch": 1.34, "grad_norm": 2.213799238204956, "learning_rate": 0.00013296, "loss": 0.2966, "step": 840 }, { "epoch": 1.36, "grad_norm": 0.8946444988250732, "learning_rate": 0.00013216, "loss": 0.1569, "step": 850 }, { "epoch": 1.38, "grad_norm": 0.6494708061218262, "learning_rate": 0.00013136000000000002, "loss": 0.1746, "step": 860 }, { "epoch": 1.39, "grad_norm": 1.0058079957962036, "learning_rate": 0.00013056000000000002, "loss": 0.1204, "step": 870 }, { "epoch": 1.41, "grad_norm": 1.1752161979675293, "learning_rate": 
0.00012976, "loss": 0.2082, "step": 880 }, { "epoch": 1.42, "grad_norm": 0.5655858516693115, "learning_rate": 0.00012896, "loss": 0.1971, "step": 890 }, { "epoch": 1.44, "grad_norm": 2.5486743450164795, "learning_rate": 0.00012816000000000002, "loss": 0.2514, "step": 900 }, { "epoch": 1.44, "eval_loss": 0.25656750798225403, "eval_na_accuracy": 0.94, "eval_ordinal_accuracy": 0.6089965397923875, "eval_runtime": 39.7194, "eval_samples_per_second": 25.177, "eval_steps_per_second": 3.147, "step": 900 }, { "epoch": 1.46, "grad_norm": 0.878511369228363, "learning_rate": 0.00012736, "loss": 0.162, "step": 910 }, { "epoch": 1.47, "grad_norm": 1.1985282897949219, "learning_rate": 0.00012656, "loss": 0.2268, "step": 920 }, { "epoch": 1.49, "grad_norm": 0.521425187587738, "learning_rate": 0.00012576000000000002, "loss": 0.1556, "step": 930 }, { "epoch": 1.5, "grad_norm": 0.9773241877555847, "learning_rate": 0.00012496000000000002, "loss": 0.1457, "step": 940 }, { "epoch": 1.52, "grad_norm": 1.6476322412490845, "learning_rate": 0.00012416, "loss": 0.1913, "step": 950 }, { "epoch": 1.54, "grad_norm": 1.7127236127853394, "learning_rate": 0.00012336, "loss": 0.1961, "step": 960 }, { "epoch": 1.55, "grad_norm": 4.41243314743042, "learning_rate": 0.00012256000000000002, "loss": 0.2061, "step": 970 }, { "epoch": 1.57, "grad_norm": 1.5907992124557495, "learning_rate": 0.00012176000000000001, "loss": 0.1299, "step": 980 }, { "epoch": 1.58, "grad_norm": 0.5711427927017212, "learning_rate": 0.00012096000000000001, "loss": 0.1755, "step": 990 }, { "epoch": 1.6, "grad_norm": 2.925363302230835, "learning_rate": 0.00012016, "loss": 0.2328, "step": 1000 }, { "epoch": 1.6, "eval_loss": 0.2884255647659302, "eval_na_accuracy": 0.94, "eval_ordinal_accuracy": 0.5617070357554786, "eval_runtime": 39.4898, "eval_samples_per_second": 25.323, "eval_steps_per_second": 3.165, "step": 1000 }, { "epoch": 1.62, "grad_norm": 1.1306260824203491, "learning_rate": 0.00011936000000000001, "loss": 0.1595, "step": 
1010 }, { "epoch": 1.63, "grad_norm": 1.8953267335891724, "learning_rate": 0.00011856, "loss": 0.2489, "step": 1020 }, { "epoch": 1.65, "grad_norm": 0.7074128985404968, "learning_rate": 0.00011776, "loss": 0.2485, "step": 1030 }, { "epoch": 1.66, "grad_norm": 0.7052355408668518, "learning_rate": 0.00011696, "loss": 0.2075, "step": 1040 }, { "epoch": 1.68, "grad_norm": 0.7830259203910828, "learning_rate": 0.00011616, "loss": 0.2346, "step": 1050 }, { "epoch": 1.7, "grad_norm": 0.5882430672645569, "learning_rate": 0.00011536000000000001, "loss": 0.2136, "step": 1060 }, { "epoch": 1.71, "grad_norm": 1.0235962867736816, "learning_rate": 0.00011456, "loss": 0.1753, "step": 1070 }, { "epoch": 1.73, "grad_norm": 0.9401603937149048, "learning_rate": 0.00011376, "loss": 0.173, "step": 1080 }, { "epoch": 1.74, "grad_norm": 1.0735399723052979, "learning_rate": 0.00011296, "loss": 0.1993, "step": 1090 }, { "epoch": 1.76, "grad_norm": 0.6592912673950195, "learning_rate": 0.00011216, "loss": 0.1826, "step": 1100 }, { "epoch": 1.76, "eval_loss": 0.2869604229927063, "eval_na_accuracy": 0.943, "eval_ordinal_accuracy": 0.6043829296424452, "eval_runtime": 39.3069, "eval_samples_per_second": 25.441, "eval_steps_per_second": 3.18, "step": 1100 }, { "epoch": 1.78, "grad_norm": 2.1649601459503174, "learning_rate": 0.00011135999999999999, "loss": 0.221, "step": 1110 }, { "epoch": 1.79, "grad_norm": 1.7881801128387451, "learning_rate": 0.00011056, "loss": 0.1765, "step": 1120 }, { "epoch": 1.81, "grad_norm": 1.3527191877365112, "learning_rate": 0.00010975999999999999, "loss": 0.1325, "step": 1130 }, { "epoch": 1.82, "grad_norm": 0.7212499976158142, "learning_rate": 0.00010896, "loss": 0.2445, "step": 1140 }, { "epoch": 1.84, "grad_norm": 0.9492518901824951, "learning_rate": 0.00010816, "loss": 0.2704, "step": 1150 }, { "epoch": 1.86, "grad_norm": 0.4344118535518646, "learning_rate": 0.00010736000000000002, "loss": 0.1624, "step": 1160 }, { "epoch": 1.87, "grad_norm": 0.4115823209285736, 
"learning_rate": 0.00010656000000000001, "loss": 0.2069, "step": 1170 }, { "epoch": 1.89, "grad_norm": 0.6738015413284302, "learning_rate": 0.00010576000000000002, "loss": 0.208, "step": 1180 }, { "epoch": 1.9, "grad_norm": 0.9090007543563843, "learning_rate": 0.00010496000000000001, "loss": 0.1793, "step": 1190 }, { "epoch": 1.92, "grad_norm": 1.1480025053024292, "learning_rate": 0.00010416000000000002, "loss": 0.2013, "step": 1200 }, { "epoch": 1.92, "eval_loss": 0.29365527629852295, "eval_na_accuracy": 0.941, "eval_ordinal_accuracy": 0.5905420991926182, "eval_runtime": 39.6842, "eval_samples_per_second": 25.199, "eval_steps_per_second": 3.15, "step": 1200 }, { "epoch": 1.94, "grad_norm": 0.62380051612854, "learning_rate": 0.00010336000000000001, "loss": 0.192, "step": 1210 }, { "epoch": 1.95, "grad_norm": 0.9949710965156555, "learning_rate": 0.00010256000000000001, "loss": 0.226, "step": 1220 }, { "epoch": 1.97, "grad_norm": 1.0634446144104004, "learning_rate": 0.00010176000000000002, "loss": 0.142, "step": 1230 }, { "epoch": 1.98, "grad_norm": 0.8875225782394409, "learning_rate": 0.00010096000000000001, "loss": 0.1729, "step": 1240 }, { "epoch": 2.0, "grad_norm": 0.6193259358406067, "learning_rate": 0.00010016, "loss": 0.102, "step": 1250 }, { "epoch": 2.02, "grad_norm": 0.4042517840862274, "learning_rate": 9.936000000000001e-05, "loss": 0.0976, "step": 1260 }, { "epoch": 2.03, "grad_norm": 0.4051195979118347, "learning_rate": 9.856e-05, "loss": 0.1448, "step": 1270 }, { "epoch": 2.05, "grad_norm": 0.46061789989471436, "learning_rate": 9.776000000000001e-05, "loss": 0.0768, "step": 1280 }, { "epoch": 2.06, "grad_norm": 0.5934004783630371, "learning_rate": 9.696000000000001e-05, "loss": 0.1404, "step": 1290 }, { "epoch": 2.08, "grad_norm": 0.6819984316825867, "learning_rate": 9.616e-05, "loss": 0.0663, "step": 1300 }, { "epoch": 2.08, "eval_loss": 0.2954486608505249, "eval_na_accuracy": 0.938, "eval_ordinal_accuracy": 0.6251441753171857, "eval_runtime": 39.658, 
"eval_samples_per_second": 25.216, "eval_steps_per_second": 3.152, "step": 1300 }, { "epoch": 2.1, "grad_norm": 0.5849266052246094, "learning_rate": 9.536000000000001e-05, "loss": 0.1574, "step": 1310 }, { "epoch": 2.11, "grad_norm": 0.9393780827522278, "learning_rate": 9.456e-05, "loss": 0.0979, "step": 1320 }, { "epoch": 2.13, "grad_norm": 0.47529059648513794, "learning_rate": 9.376e-05, "loss": 0.1073, "step": 1330 }, { "epoch": 2.14, "grad_norm": 0.4079722464084625, "learning_rate": 9.296e-05, "loss": 0.0868, "step": 1340 }, { "epoch": 2.16, "grad_norm": 0.7292589545249939, "learning_rate": 9.216e-05, "loss": 0.1446, "step": 1350 }, { "epoch": 2.18, "grad_norm": 0.9205511212348938, "learning_rate": 9.136e-05, "loss": 0.0907, "step": 1360 }, { "epoch": 2.19, "grad_norm": 0.9218105673789978, "learning_rate": 9.056e-05, "loss": 0.1387, "step": 1370 }, { "epoch": 2.21, "grad_norm": 0.5730422139167786, "learning_rate": 8.976e-05, "loss": 0.0882, "step": 1380 }, { "epoch": 2.22, "grad_norm": 0.6922823190689087, "learning_rate": 8.896e-05, "loss": 0.0741, "step": 1390 }, { "epoch": 2.24, "grad_norm": 1.1872971057891846, "learning_rate": 8.816000000000001e-05, "loss": 0.1503, "step": 1400 }, { "epoch": 2.24, "eval_loss": 0.3187769651412964, "eval_na_accuracy": 0.937, "eval_ordinal_accuracy": 0.5986159169550173, "eval_runtime": 39.7243, "eval_samples_per_second": 25.173, "eval_steps_per_second": 3.147, "step": 1400 }, { "epoch": 2.26, "grad_norm": 2.3350443840026855, "learning_rate": 8.736e-05, "loss": 0.1083, "step": 1410 }, { "epoch": 2.27, "grad_norm": 0.8266046643257141, "learning_rate": 8.656000000000001e-05, "loss": 0.0684, "step": 1420 }, { "epoch": 2.29, "grad_norm": 0.41480687260627747, "learning_rate": 8.576e-05, "loss": 0.0809, "step": 1430 }, { "epoch": 2.3, "grad_norm": 0.4657377600669861, "learning_rate": 8.496e-05, "loss": 0.0718, "step": 1440 }, { "epoch": 2.32, "grad_norm": 0.5419800877571106, "learning_rate": 8.416000000000001e-05, "loss": 0.1322, 
"step": 1450 }, { "epoch": 2.34, "grad_norm": 1.167611837387085, "learning_rate": 8.336e-05, "loss": 0.1017, "step": 1460 }, { "epoch": 2.35, "grad_norm": 0.449034184217453, "learning_rate": 8.256000000000001e-05, "loss": 0.0636, "step": 1470 }, { "epoch": 2.37, "grad_norm": 0.6716451048851013, "learning_rate": 8.176e-05, "loss": 0.1109, "step": 1480 }, { "epoch": 2.38, "grad_norm": 4.306596755981445, "learning_rate": 8.096e-05, "loss": 0.0898, "step": 1490 }, { "epoch": 2.4, "grad_norm": 0.41288742423057556, "learning_rate": 8.016e-05, "loss": 0.0611, "step": 1500 }, { "epoch": 2.4, "eval_loss": 0.33932703733444214, "eval_na_accuracy": 0.945, "eval_ordinal_accuracy": 0.5997693194925029, "eval_runtime": 39.236, "eval_samples_per_second": 25.487, "eval_steps_per_second": 3.186, "step": 1500 }, { "epoch": 2.42, "grad_norm": 0.7951626181602478, "learning_rate": 7.936e-05, "loss": 0.0799, "step": 1510 }, { "epoch": 2.43, "grad_norm": 1.0197049379348755, "learning_rate": 7.856000000000001e-05, "loss": 0.0928, "step": 1520 }, { "epoch": 2.45, "grad_norm": 0.6486759185791016, "learning_rate": 7.776e-05, "loss": 0.0964, "step": 1530 }, { "epoch": 2.46, "grad_norm": 1.0220657587051392, "learning_rate": 7.696e-05, "loss": 0.1736, "step": 1540 }, { "epoch": 2.48, "grad_norm": 2.3006441593170166, "learning_rate": 7.616e-05, "loss": 0.0993, "step": 1550 }, { "epoch": 2.5, "grad_norm": 0.4701670706272125, "learning_rate": 7.536000000000001e-05, "loss": 0.0779, "step": 1560 }, { "epoch": 2.51, "grad_norm": 0.641832172870636, "learning_rate": 7.456e-05, "loss": 0.0586, "step": 1570 }, { "epoch": 2.53, "grad_norm": 0.5836305618286133, "learning_rate": 7.376000000000001e-05, "loss": 0.053, "step": 1580 }, { "epoch": 2.54, "grad_norm": 0.6500815153121948, "learning_rate": 7.296e-05, "loss": 0.0779, "step": 1590 }, { "epoch": 2.56, "grad_norm": 0.5682386755943298, "learning_rate": 7.216e-05, "loss": 0.0743, "step": 1600 }, { "epoch": 2.56, "eval_loss": 0.3182476758956909, 
"eval_na_accuracy": 0.942, "eval_ordinal_accuracy": 0.6482122260668973, "eval_runtime": 40.5507, "eval_samples_per_second": 24.66, "eval_steps_per_second": 3.083, "step": 1600 }, { "epoch": 2.58, "grad_norm": 0.5527540445327759, "learning_rate": 7.136000000000001e-05, "loss": 0.094, "step": 1610 }, { "epoch": 2.59, "grad_norm": 0.8710426092147827, "learning_rate": 7.056e-05, "loss": 0.0839, "step": 1620 }, { "epoch": 2.61, "grad_norm": 0.9312260746955872, "learning_rate": 6.976000000000001e-05, "loss": 0.155, "step": 1630 }, { "epoch": 2.62, "grad_norm": 0.48695412278175354, "learning_rate": 6.896e-05, "loss": 0.0606, "step": 1640 }, { "epoch": 2.64, "grad_norm": 0.525652289390564, "learning_rate": 6.816e-05, "loss": 0.0715, "step": 1650 }, { "epoch": 2.66, "grad_norm": 0.7670960426330566, "learning_rate": 6.736e-05, "loss": 0.1108, "step": 1660 }, { "epoch": 2.67, "grad_norm": 1.0041375160217285, "learning_rate": 6.656e-05, "loss": 0.1257, "step": 1670 }, { "epoch": 2.69, "grad_norm": 0.3819805681705475, "learning_rate": 6.576e-05, "loss": 0.1004, "step": 1680 }, { "epoch": 2.7, "grad_norm": 0.5372006893157959, "learning_rate": 6.496e-05, "loss": 0.0825, "step": 1690 }, { "epoch": 2.72, "grad_norm": 0.5835949182510376, "learning_rate": 6.416e-05, "loss": 0.0908, "step": 1700 }, { "epoch": 2.72, "eval_loss": 0.3332485854625702, "eval_na_accuracy": 0.942, "eval_ordinal_accuracy": 0.6482122260668973, "eval_runtime": 39.9642, "eval_samples_per_second": 25.022, "eval_steps_per_second": 3.128, "step": 1700 }, { "epoch": 2.74, "grad_norm": 0.677947461605072, "learning_rate": 6.336e-05, "loss": 0.1086, "step": 1710 }, { "epoch": 2.75, "grad_norm": 0.7373325228691101, "learning_rate": 6.256000000000001e-05, "loss": 0.0698, "step": 1720 }, { "epoch": 2.77, "grad_norm": 0.7738047242164612, "learning_rate": 6.176e-05, "loss": 0.1118, "step": 1730 }, { "epoch": 2.78, "grad_norm": 2.052891254425049, "learning_rate": 6.0960000000000006e-05, "loss": 0.1002, "step": 1740 }, { 
"epoch": 2.8, "grad_norm": 0.26311352849006653, "learning_rate": 6.016000000000001e-05, "loss": 0.0944, "step": 1750 }, { "epoch": 2.82, "grad_norm": 0.8190409541130066, "learning_rate": 5.936000000000001e-05, "loss": 0.0717, "step": 1760 }, { "epoch": 2.83, "grad_norm": 0.5824436545372009, "learning_rate": 5.856e-05, "loss": 0.0746, "step": 1770 }, { "epoch": 2.85, "grad_norm": 0.5489352941513062, "learning_rate": 5.776e-05, "loss": 0.1063, "step": 1780 }, { "epoch": 2.86, "grad_norm": 0.656225323677063, "learning_rate": 5.6960000000000004e-05, "loss": 0.0763, "step": 1790 }, { "epoch": 2.88, "grad_norm": 0.8495000600814819, "learning_rate": 5.6160000000000004e-05, "loss": 0.1108, "step": 1800 }, { "epoch": 2.88, "eval_loss": 0.32561835646629333, "eval_na_accuracy": 0.943, "eval_ordinal_accuracy": 0.6459054209919262, "eval_runtime": 39.4673, "eval_samples_per_second": 25.337, "eval_steps_per_second": 3.167, "step": 1800 }, { "epoch": 2.9, "grad_norm": 0.617258608341217, "learning_rate": 5.536e-05, "loss": 0.1203, "step": 1810 }, { "epoch": 2.91, "grad_norm": 0.4484919011592865, "learning_rate": 5.456e-05, "loss": 0.0573, "step": 1820 }, { "epoch": 2.93, "grad_norm": 0.533388614654541, "learning_rate": 5.376e-05, "loss": 0.0762, "step": 1830 }, { "epoch": 2.94, "grad_norm": 0.4078121483325958, "learning_rate": 5.296e-05, "loss": 0.0643, "step": 1840 }, { "epoch": 2.96, "grad_norm": 0.5678732395172119, "learning_rate": 5.2159999999999995e-05, "loss": 0.1126, "step": 1850 }, { "epoch": 2.98, "grad_norm": 0.6543716192245483, "learning_rate": 5.1359999999999996e-05, "loss": 0.0763, "step": 1860 }, { "epoch": 2.99, "grad_norm": 0.6005885601043701, "learning_rate": 5.056000000000001e-05, "loss": 0.0949, "step": 1870 }, { "epoch": 3.01, "grad_norm": 0.4467845559120178, "learning_rate": 4.976e-05, "loss": 0.0481, "step": 1880 }, { "epoch": 3.02, "grad_norm": 0.48746606707572937, "learning_rate": 4.896e-05, "loss": 0.0415, "step": 1890 }, { "epoch": 3.04, "grad_norm": 
0.7011713981628418, "learning_rate": 4.816e-05, "loss": 0.0786, "step": 1900 }, { "epoch": 3.04, "eval_loss": 0.3222349286079407, "eval_na_accuracy": 0.944, "eval_ordinal_accuracy": 0.6539792387543253, "eval_runtime": 39.2271, "eval_samples_per_second": 25.493, "eval_steps_per_second": 3.187, "step": 1900 }, { "epoch": 3.06, "grad_norm": 0.5153183341026306, "learning_rate": 4.736000000000001e-05, "loss": 0.0387, "step": 1910 }, { "epoch": 3.07, "grad_norm": 0.43751129508018494, "learning_rate": 4.656e-05, "loss": 0.0364, "step": 1920 }, { "epoch": 3.09, "grad_norm": 0.7584701776504517, "learning_rate": 4.576e-05, "loss": 0.0397, "step": 1930 }, { "epoch": 3.1, "grad_norm": 0.20170661807060242, "learning_rate": 4.496e-05, "loss": 0.0288, "step": 1940 }, { "epoch": 3.12, "grad_norm": 0.2583639919757843, "learning_rate": 4.4160000000000004e-05, "loss": 0.0302, "step": 1950 }, { "epoch": 3.14, "grad_norm": 3.9720704555511475, "learning_rate": 4.336e-05, "loss": 0.0484, "step": 1960 }, { "epoch": 3.15, "grad_norm": 0.3367606997489929, "learning_rate": 4.256e-05, "loss": 0.0387, "step": 1970 }, { "epoch": 3.17, "grad_norm": 0.7610962986946106, "learning_rate": 4.176000000000001e-05, "loss": 0.0424, "step": 1980 }, { "epoch": 3.18, "grad_norm": 0.6901140213012695, "learning_rate": 4.096e-05, "loss": 0.035, "step": 1990 }, { "epoch": 3.2, "grad_norm": 0.2873363196849823, "learning_rate": 4.016e-05, "loss": 0.043, "step": 2000 }, { "epoch": 3.2, "eval_loss": 0.35012441873550415, "eval_na_accuracy": 0.941, "eval_ordinal_accuracy": 0.6482122260668973, "eval_runtime": 39.427, "eval_samples_per_second": 25.363, "eval_steps_per_second": 3.17, "step": 2000 }, { "epoch": 3.22, "grad_norm": 0.5277103781700134, "learning_rate": 3.936e-05, "loss": 0.0423, "step": 2010 }, { "epoch": 3.23, "grad_norm": 0.3088182508945465, "learning_rate": 3.8560000000000004e-05, "loss": 0.0305, "step": 2020 }, { "epoch": 3.25, "grad_norm": 0.3621159791946411, "learning_rate": 3.776e-05, "loss": 0.0398, 
"step": 2030 }, { "epoch": 3.26, "grad_norm": 0.6761226654052734, "learning_rate": 3.696e-05, "loss": 0.0606, "step": 2040 }, { "epoch": 3.28, "grad_norm": 0.9860779047012329, "learning_rate": 3.616e-05, "loss": 0.0437, "step": 2050 }, { "epoch": 3.3, "grad_norm": 0.6743874549865723, "learning_rate": 3.536000000000001e-05, "loss": 0.0328, "step": 2060 }, { "epoch": 3.31, "grad_norm": 1.5928354263305664, "learning_rate": 3.456e-05, "loss": 0.0261, "step": 2070 }, { "epoch": 3.33, "grad_norm": 0.9067389965057373, "learning_rate": 3.376e-05, "loss": 0.0268, "step": 2080 }, { "epoch": 3.34, "grad_norm": 0.5733221173286438, "learning_rate": 3.296e-05, "loss": 0.0335, "step": 2090 }, { "epoch": 3.36, "grad_norm": 0.7042862772941589, "learning_rate": 3.2160000000000004e-05, "loss": 0.0472, "step": 2100 }, { "epoch": 3.36, "eval_loss": 0.34554365277290344, "eval_na_accuracy": 0.943, "eval_ordinal_accuracy": 0.6608996539792388, "eval_runtime": 40.1655, "eval_samples_per_second": 24.897, "eval_steps_per_second": 3.112, "step": 2100 }, { "epoch": 3.38, "grad_norm": 0.8036783933639526, "learning_rate": 3.136e-05, "loss": 0.0402, "step": 2110 }, { "epoch": 3.39, "grad_norm": 0.4863825738430023, "learning_rate": 3.056e-05, "loss": 0.0507, "step": 2120 }, { "epoch": 3.41, "grad_norm": 0.5171158313751221, "learning_rate": 2.976e-05, "loss": 0.0333, "step": 2130 }, { "epoch": 3.42, "grad_norm": 0.21965381503105164, "learning_rate": 2.8960000000000004e-05, "loss": 0.0277, "step": 2140 }, { "epoch": 3.44, "grad_norm": 0.20841450989246368, "learning_rate": 2.816e-05, "loss": 0.0259, "step": 2150 }, { "epoch": 3.46, "grad_norm": 0.5015869736671448, "learning_rate": 2.7360000000000002e-05, "loss": 0.0316, "step": 2160 }, { "epoch": 3.47, "grad_norm": 0.7938678860664368, "learning_rate": 2.6560000000000003e-05, "loss": 0.0301, "step": 2170 }, { "epoch": 3.49, "grad_norm": 0.44840845465660095, "learning_rate": 2.576e-05, "loss": 0.0406, "step": 2180 }, { "epoch": 3.5, "grad_norm": 
0.35510167479515076, "learning_rate": 2.496e-05, "loss": 0.0268, "step": 2190 }, { "epoch": 3.52, "grad_norm": 0.37328681349754333, "learning_rate": 2.4160000000000002e-05, "loss": 0.032, "step": 2200 }, { "epoch": 3.52, "eval_loss": 0.35616353154182434, "eval_na_accuracy": 0.94, "eval_ordinal_accuracy": 0.6516724336793541, "eval_runtime": 40.3825, "eval_samples_per_second": 24.763, "eval_steps_per_second": 3.095, "step": 2200 }, { "epoch": 3.54, "grad_norm": 0.24070143699645996, "learning_rate": 2.336e-05, "loss": 0.0262, "step": 2210 }, { "epoch": 3.55, "grad_norm": 1.0428861379623413, "learning_rate": 2.256e-05, "loss": 0.04, "step": 2220 }, { "epoch": 3.57, "grad_norm": 0.626348614692688, "learning_rate": 2.176e-05, "loss": 0.0275, "step": 2230 }, { "epoch": 3.58, "grad_norm": 0.47826460003852844, "learning_rate": 2.0960000000000003e-05, "loss": 0.0379, "step": 2240 }, { "epoch": 3.6, "grad_norm": 2.685340166091919, "learning_rate": 2.016e-05, "loss": 0.047, "step": 2250 }, { "epoch": 3.62, "grad_norm": 0.1495877057313919, "learning_rate": 1.936e-05, "loss": 0.0389, "step": 2260 }, { "epoch": 3.63, "grad_norm": 0.5789759755134583, "learning_rate": 1.856e-05, "loss": 0.0337, "step": 2270 }, { "epoch": 3.65, "grad_norm": 0.4255303144454956, "learning_rate": 1.7760000000000003e-05, "loss": 0.0282, "step": 2280 }, { "epoch": 3.66, "grad_norm": 0.7483348846435547, "learning_rate": 1.696e-05, "loss": 0.0668, "step": 2290 }, { "epoch": 3.68, "grad_norm": 0.23885439336299896, "learning_rate": 1.616e-05, "loss": 0.0434, "step": 2300 }, { "epoch": 3.68, "eval_loss": 0.34990155696868896, "eval_na_accuracy": 0.94, "eval_ordinal_accuracy": 0.6597462514417531, "eval_runtime": 40.407, "eval_samples_per_second": 24.748, "eval_steps_per_second": 3.094, "step": 2300 }, { "epoch": 3.7, "grad_norm": 0.3341818153858185, "learning_rate": 1.536e-05, "loss": 0.0358, "step": 2310 }, { "epoch": 3.71, "grad_norm": 0.6008884310722351, "learning_rate": 1.4560000000000001e-05, "loss": 
0.0394, "step": 2320 }, { "epoch": 3.73, "grad_norm": 0.3966546654701233, "learning_rate": 1.376e-05, "loss": 0.0346, "step": 2330 }, { "epoch": 3.74, "grad_norm": 0.46933791041374207, "learning_rate": 1.296e-05, "loss": 0.0227, "step": 2340 }, { "epoch": 3.76, "grad_norm": 0.6652282476425171, "learning_rate": 1.216e-05, "loss": 0.0393, "step": 2350 }, { "epoch": 3.78, "grad_norm": 0.23938482999801636, "learning_rate": 1.1360000000000001e-05, "loss": 0.0267, "step": 2360 }, { "epoch": 3.79, "grad_norm": 0.6050881147384644, "learning_rate": 1.056e-05, "loss": 0.0287, "step": 2370 }, { "epoch": 3.81, "grad_norm": 0.22671189904212952, "learning_rate": 9.760000000000001e-06, "loss": 0.0491, "step": 2380 }, { "epoch": 3.82, "grad_norm": 0.5296955704689026, "learning_rate": 8.96e-06, "loss": 0.0266, "step": 2390 }, { "epoch": 3.84, "grad_norm": 0.5424560308456421, "learning_rate": 8.160000000000001e-06, "loss": 0.0341, "step": 2400 }, { "epoch": 3.84, "eval_loss": 0.3610887825489044, "eval_na_accuracy": 0.94, "eval_ordinal_accuracy": 0.6482122260668973, "eval_runtime": 40.741, "eval_samples_per_second": 24.545, "eval_steps_per_second": 3.068, "step": 2400 }, { "epoch": 3.86, "grad_norm": 0.4790421724319458, "learning_rate": 7.36e-06, "loss": 0.0319, "step": 2410 }, { "epoch": 3.87, "grad_norm": 0.4021483063697815, "learning_rate": 6.560000000000001e-06, "loss": 0.0551, "step": 2420 }, { "epoch": 3.89, "grad_norm": 0.43051794171333313, "learning_rate": 5.76e-06, "loss": 0.0281, "step": 2430 }, { "epoch": 3.9, "grad_norm": 0.23781944811344147, "learning_rate": 4.96e-06, "loss": 0.0306, "step": 2440 }, { "epoch": 3.92, "grad_norm": 0.6060004234313965, "learning_rate": 4.16e-06, "loss": 0.0326, "step": 2450 }, { "epoch": 3.94, "grad_norm": 0.5149852633476257, "learning_rate": 3.36e-06, "loss": 0.0266, "step": 2460 }, { "epoch": 3.95, "grad_norm": 0.579931914806366, "learning_rate": 2.56e-06, "loss": 0.0236, "step": 2470 }, { "epoch": 3.97, "grad_norm": 0.14379101991653442, 
"learning_rate": 1.76e-06, "loss": 0.0221, "step": 2480 }, { "epoch": 3.98, "grad_norm": 0.5184658765792847, "learning_rate": 9.6e-07, "loss": 0.0281, "step": 2490 }, { "epoch": 4.0, "grad_norm": 0.5299363732337952, "learning_rate": 1.6e-07, "loss": 0.0305, "step": 2500 }, { "epoch": 4.0, "eval_loss": 0.36354970932006836, "eval_na_accuracy": 0.939, "eval_ordinal_accuracy": 0.6608996539792388, "eval_runtime": 40.233, "eval_samples_per_second": 24.855, "eval_steps_per_second": 3.107, "step": 2500 }, { "epoch": 4.0, "step": 2500, "total_flos": 3.0997907103744e+18, "train_loss": 0.15650403581261635, "train_runtime": 3981.9696, "train_samples_per_second": 10.045, "train_steps_per_second": 0.628 } ], "logging_steps": 10, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 3.0997907103744e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }