{ "best_metric": 0.19156721234321594, "best_model_checkpoint": "./ryan03312024_lr_2e-5_wd_001/checkpoint-3100", "epoch": 1.5001803101334295, "eval_steps": 100, "global_step": 4160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 0.6417293548583984, "learning_rate": 1.9879807692307693e-05, "loss": 0.5741, "step": 25 }, { "epoch": 0.02, "grad_norm": 1.4069948196411133, "learning_rate": 1.975961538461539e-05, "loss": 0.4638, "step": 50 }, { "epoch": 0.03, "grad_norm": 1.5593534708023071, "learning_rate": 1.963942307692308e-05, "loss": 0.4114, "step": 75 }, { "epoch": 0.04, "grad_norm": 0.6675819158554077, "learning_rate": 1.9519230769230772e-05, "loss": 0.4436, "step": 100 }, { "epoch": 0.04, "eval_loss": 0.3697698712348938, "eval_na_accuracy": 0.7989690899848938, "eval_ordinal_accuracy": 0.3331620991230011, "eval_ordinal_mae": 0.8705630302429199, "eval_runtime": 346.2992, "eval_samples_per_second": 12.922, "eval_steps_per_second": 1.617, "step": 100 }, { "epoch": 0.05, "grad_norm": 0.6941640377044678, "learning_rate": 1.9399038461538464e-05, "loss": 0.3901, "step": 125 }, { "epoch": 0.05, "grad_norm": 0.7081687450408936, "learning_rate": 1.9278846153846155e-05, "loss": 0.366, "step": 150 }, { "epoch": 0.06, "grad_norm": 0.9151293635368347, "learning_rate": 1.9158653846153847e-05, "loss": 0.3773, "step": 175 }, { "epoch": 0.07, "grad_norm": 0.5204830169677734, "learning_rate": 1.903846153846154e-05, "loss": 0.3143, "step": 200 }, { "epoch": 0.07, "eval_loss": 0.3215162754058838, "eval_na_accuracy": 0.8092783689498901, "eval_ordinal_accuracy": 0.4017467200756073, "eval_ordinal_mae": 0.855476975440979, "eval_runtime": 209.7588, "eval_samples_per_second": 21.334, "eval_steps_per_second": 2.67, "step": 200 }, { "epoch": 0.08, "grad_norm": 1.357649803161621, "learning_rate": 1.8918269230769234e-05, "loss": 0.3559, "step": 225 }, { "epoch": 0.09, "grad_norm": 0.7635074257850647, "learning_rate": 1.8798076923076926e-05, "loss": 0.3616, "step": 250 }, { "epoch": 0.1, "grad_norm": 2.4480478763580322, "learning_rate": 1.8677884615384617e-05, "loss": 0.3209, "step": 275 }, { "epoch": 0.11, "grad_norm": 0.8215653300285339, "learning_rate": 1.855769230769231e-05, "loss": 0.3385, "step": 300 }, { "epoch": 0.11, "eval_loss": 0.2996984124183655, "eval_na_accuracy": 0.8591065406799316, "eval_ordinal_accuracy": 0.44849729537963867, "eval_ordinal_mae": 0.8302922248840332, "eval_runtime": 193.7095, "eval_samples_per_second": 23.102, "eval_steps_per_second": 2.891, "step": 300 }, { "epoch": 0.12, "grad_norm": 0.48835399746894836, "learning_rate": 1.84375e-05, "loss": 0.2976, "step": 325 }, { "epoch": 0.13, "grad_norm": 0.6756967306137085, "learning_rate": 1.8317307692307693e-05, "loss": 0.3043, "step": 350 }, { "epoch": 0.14, "grad_norm": 0.20951023697853088, "learning_rate": 1.8197115384615388e-05, "loss": 0.3046, "step": 375 }, { "epoch": 0.14, "grad_norm": 0.9027990102767944, "learning_rate": 1.807692307692308e-05, "loss": 0.3127, "step": 400 }, { "epoch": 0.14, "eval_loss": 0.28894639015197754, "eval_na_accuracy": 0.8745704293251038, "eval_ordinal_accuracy": 0.4880554974079132, "eval_ordinal_mae": 0.8012504577636719, "eval_runtime": 195.2283, "eval_samples_per_second": 22.922, "eval_steps_per_second": 2.868, "step": 400 }, { "epoch": 0.15, "grad_norm": 1.5711051225662231, "learning_rate": 1.795673076923077e-05, "loss": 0.3094, "step": 425 }, { "epoch": 0.16, "grad_norm": 2.6442267894744873, "learning_rate": 1.7836538461538463e-05, "loss": 0.2979, "step": 450 }, { "epoch": 0.17, "grad_norm": 1.7498008012771606, "learning_rate": 1.7716346153846155e-05, "loss": 0.3193, "step": 475 }, { "epoch": 0.18, "grad_norm": 1.4141407012939453, "learning_rate": 1.7596153846153846e-05, "loss": 0.3054, "step": 500 }, { "epoch": 0.18, "eval_loss": 0.28038087487220764, "eval_na_accuracy": 0.8780068755149841, "eval_ordinal_accuracy": 0.5324942469596863, "eval_ordinal_mae": 0.7619425058364868, "eval_runtime": 189.7487, "eval_samples_per_second": 23.584, "eval_steps_per_second": 2.951, "step": 500 }, { "epoch": 0.19, "grad_norm": 1.3073471784591675, "learning_rate": 1.7475961538461538e-05, "loss": 0.302, "step": 525 }, { "epoch": 0.2, "grad_norm": 1.1622358560562134, "learning_rate": 1.7355769230769233e-05, "loss": 0.2844, "step": 550 }, { "epoch": 0.21, "grad_norm": 0.3688335716724396, "learning_rate": 1.7235576923076925e-05, "loss": 0.2745, "step": 575 }, { "epoch": 0.22, "grad_norm": 1.1387437582015991, "learning_rate": 1.7115384615384617e-05, "loss": 0.3051, "step": 600 }, { "epoch": 0.22, "eval_loss": 0.27521631121635437, "eval_na_accuracy": 0.9158075451850891, "eval_ordinal_accuracy": 0.5235037207603455, "eval_ordinal_mae": 0.7215057015419006, "eval_runtime": 192.1403, "eval_samples_per_second": 23.29, "eval_steps_per_second": 2.915, "step": 600 }, { "epoch": 0.23, "grad_norm": 0.613787055015564, "learning_rate": 1.699519230769231e-05, "loss": 0.3017, "step": 625 }, { "epoch": 0.23, "grad_norm": 0.595897376537323, "learning_rate": 1.6875e-05, "loss": 0.2806, "step": 650 }, { "epoch": 0.24, "grad_norm": 1.4324086904525757, "learning_rate": 1.6754807692307692e-05, "loss": 0.256, "step": 675 }, { "epoch": 0.25, "grad_norm": 3.7929930686950684, "learning_rate": 1.6634615384615387e-05, "loss": 0.2833, "step": 700 }, { "epoch": 0.25, "eval_loss": 0.26531103253364563, "eval_na_accuracy": 0.8969072103500366, "eval_ordinal_accuracy": 0.5486770868301392, "eval_ordinal_mae": 0.6806999444961548, "eval_runtime": 191.9358, "eval_samples_per_second": 23.315, "eval_steps_per_second": 2.918, "step": 700 }, { "epoch": 0.26, "grad_norm": 0.27854958176612854, "learning_rate": 1.651442307692308e-05, "loss": 0.2805, "step": 725 }, { "epoch": 0.27, "grad_norm": 1.8652201890945435, "learning_rate": 1.6399038461538462e-05, "loss": 0.2737, "step": 750 }, { "epoch": 0.28, "grad_norm": 2.1118507385253906, "learning_rate": 1.6278846153846154e-05, "loss": 0.2611, "step": 775 }, { "epoch": 0.29, "grad_norm": 2.6029460430145264, "learning_rate": 1.6158653846153845e-05, "loss": 0.2907, "step": 800 }, { "epoch": 0.29, "eval_loss": 0.255000501871109, "eval_na_accuracy": 0.8350515365600586, "eval_ordinal_accuracy": 0.5617775321006775, "eval_ordinal_mae": 0.6431577205657959, "eval_runtime": 190.5444, "eval_samples_per_second": 23.485, "eval_steps_per_second": 2.939, "step": 800 }, { "epoch": 0.3, "grad_norm": 1.63986337184906, "learning_rate": 1.603846153846154e-05, "loss": 0.2661, "step": 825 }, { "epoch": 0.31, "grad_norm": 0.8340407013893127, "learning_rate": 1.5918269230769232e-05, "loss": 0.2513, "step": 850 }, { "epoch": 0.32, "grad_norm": 1.321059226989746, "learning_rate": 1.5798076923076924e-05, "loss": 0.2676, "step": 875 }, { "epoch": 0.32, "grad_norm": 2.41912841796875, "learning_rate": 1.567788461538462e-05, "loss": 0.2468, "step": 900 }, { "epoch": 0.32, "eval_loss": 0.2521895170211792, "eval_na_accuracy": 0.8058419227600098, "eval_ordinal_accuracy": 0.5972257852554321, "eval_ordinal_mae": 0.6118690967559814, "eval_runtime": 197.239, "eval_samples_per_second": 22.688, "eval_steps_per_second": 2.839, "step": 900 }, { "epoch": 0.33, "grad_norm": 0.9854594469070435, "learning_rate": 1.555769230769231e-05, "loss": 0.2727, "step": 925 }, { "epoch": 0.34, "grad_norm": 5.904122352600098, "learning_rate": 1.54375e-05, "loss": 0.3027, "step": 950 }, { "epoch": 0.35, "grad_norm": 1.8180320262908936, "learning_rate": 1.5317307692307694e-05, "loss": 0.2465, "step": 975 }, { "epoch": 0.36, "grad_norm": 0.5673078894615173, "learning_rate": 1.5197115384615386e-05, "loss": 0.2199, "step": 1000 }, { "epoch": 0.36, "eval_loss": 0.24374203383922577, "eval_na_accuracy": 0.8127147555351257, "eval_ordinal_accuracy": 0.606216311454773, "eval_ordinal_mae": 0.6023499965667725, "eval_runtime": 192.8009, "eval_samples_per_second": 23.21, "eval_steps_per_second": 2.905, "step": 1000 }, { "epoch": 0.37, "grad_norm": 0.8171074390411377, "learning_rate": 1.5076923076923078e-05, "loss": 0.221, "step": 1025 }, { "epoch": 0.38, "grad_norm": 0.881005048751831, "learning_rate": 1.495673076923077e-05, "loss": 0.235, "step": 1050 }, { "epoch": 0.39, "grad_norm": 2.021958351135254, "learning_rate": 1.4836538461538463e-05, "loss": 0.2841, "step": 1075 }, { "epoch": 0.4, "grad_norm": 1.8785498142242432, "learning_rate": 1.4716346153846155e-05, "loss": 0.2219, "step": 1100 }, { "epoch": 0.4, "eval_loss": 0.23609140515327454, "eval_na_accuracy": 0.9037800431251526, "eval_ordinal_accuracy": 0.595941424369812, "eval_ordinal_mae": 0.5573533177375793, "eval_runtime": 193.7176, "eval_samples_per_second": 23.101, "eval_steps_per_second": 2.891, "step": 1100 }, { "epoch": 0.41, "grad_norm": 0.6871089935302734, "learning_rate": 1.4596153846153846e-05, "loss": 0.201, "step": 1125 }, { "epoch": 0.41, "grad_norm": 0.3621855080127716, "learning_rate": 1.447596153846154e-05, "loss": 0.2119, "step": 1150 }, { "epoch": 0.42, "grad_norm": 1.404956340789795, "learning_rate": 1.4355769230769232e-05, "loss": 0.2566, "step": 1175 }, { "epoch": 0.43, "grad_norm": 0.8949152231216431, "learning_rate": 1.4235576923076923e-05, "loss": 0.2071, "step": 1200 }, { "epoch": 0.43, "eval_loss": 0.23867186903953552, "eval_na_accuracy": 0.7714776396751404, "eval_ordinal_accuracy": 0.6175186038017273, "eval_ordinal_mae": 0.5438615679740906, "eval_runtime": 196.3391, "eval_samples_per_second": 22.792, "eval_steps_per_second": 2.852, "step": 1200 }, { "epoch": 0.44, "grad_norm": 1.7730196714401245, "learning_rate": 1.4115384615384617e-05, "loss": 0.2756, "step": 1225 }, { "epoch": 0.45, "grad_norm": 0.7472477555274963, "learning_rate": 1.3995192307692308e-05, "loss": 0.2318, "step": 1250 }, { "epoch": 0.46, "grad_norm": 2.942986249923706, "learning_rate": 1.3875e-05, "loss": 0.2568, "step": 1275 }, { "epoch": 0.47, "grad_norm": 0.5726996064186096, "learning_rate": 1.3754807692307695e-05, "loss": 0.2214, "step": 1300 }, { "epoch": 0.47, "eval_loss": 0.2340591698884964, "eval_na_accuracy": 0.7955326437950134, "eval_ordinal_accuracy": 0.6231697797775269, "eval_ordinal_mae": 0.5256503224372864, "eval_runtime": 193.9402, "eval_samples_per_second": 23.074, "eval_steps_per_second": 2.887, "step": 1300 }, { "epoch": 0.48, "grad_norm": 0.94366455078125, "learning_rate": 1.3634615384615385e-05, "loss": 0.2146, "step": 1325 }, { "epoch": 0.49, "grad_norm": 3.612720251083374, "learning_rate": 1.3514423076923077e-05, "loss": 0.1979, "step": 1350 }, { "epoch": 0.5, "grad_norm": 0.878446638584137, "learning_rate": 1.3394230769230769e-05, "loss": 0.2233, "step": 1375 }, { "epoch": 0.5, "grad_norm": 1.1531257629394531, "learning_rate": 1.3274038461538464e-05, "loss": 0.2627, "step": 1400 }, { "epoch": 0.5, "eval_loss": 0.2315448820590973, "eval_na_accuracy": 0.7989690899848938, "eval_ordinal_accuracy": 0.6123812198638916, "eval_ordinal_mae": 0.5152010917663574, "eval_runtime": 196.8075, "eval_samples_per_second": 22.738, "eval_steps_per_second": 2.845, "step": 1400 }, { "epoch": 0.51, "grad_norm": 2.2066402435302734, "learning_rate": 1.3153846153846156e-05, "loss": 0.2415, "step": 1425 }, { "epoch": 0.52, "grad_norm": 1.4936281442642212, "learning_rate": 1.3033653846153846e-05, "loss": 0.2536, "step": 1450 }, { "epoch": 0.53, "grad_norm": 2.112527847290039, "learning_rate": 1.291346153846154e-05, "loss": 0.2105, "step": 1475 }, { "epoch": 0.54, "grad_norm": 1.8110361099243164, "learning_rate": 1.2793269230769233e-05, "loss": 0.2067, "step": 1500 }, { "epoch": 0.54, "eval_loss": 0.22465108335018158, "eval_na_accuracy": 0.8109965920448303, "eval_ordinal_accuracy": 0.6396095752716064, "eval_ordinal_mae": 0.5025707483291626, "eval_runtime": 194.1538, "eval_samples_per_second": 23.049, "eval_steps_per_second": 2.884, "step": 1500 }, { "epoch": 0.55, "grad_norm": 0.8933520913124084, "learning_rate": 1.2673076923076924e-05, "loss": 0.1957, "step": 1525 }, { "epoch": 0.56, "grad_norm": 1.3568251132965088, "learning_rate": 1.2557692307692309e-05, "loss": 0.2286, "step": 1550 }, { "epoch": 0.57, "grad_norm": 2.600196123123169, "learning_rate": 1.24375e-05, "loss": 0.2292, "step": 1575 }, { "epoch": 0.58, "grad_norm": 1.2017379999160767, "learning_rate": 1.2317307692307694e-05, "loss": 0.2086, "step": 1600 }, { "epoch": 0.58, "eval_loss": 0.21920213103294373, "eval_na_accuracy": 0.8041236996650696, "eval_ordinal_accuracy": 0.6588749289512634, "eval_ordinal_mae": 0.49550649523735046, "eval_runtime": 189.1959, "eval_samples_per_second": 23.653, "eval_steps_per_second": 2.96, "step": 1600 }, { "epoch": 0.59, "grad_norm": 2.4263405799865723, "learning_rate": 1.2197115384615386e-05, "loss": 0.2384, "step": 1625 }, { "epoch": 0.6, "grad_norm": 2.1560556888580322, "learning_rate": 1.2076923076923078e-05, "loss": 0.2557, "step": 1650 }, { "epoch": 0.6, "grad_norm": 0.8151688575744629, "learning_rate": 1.1956730769230771e-05, "loss": 0.2623, "step": 1675 }, { "epoch": 0.61, "grad_norm": 2.8995933532714844, "learning_rate": 1.1836538461538463e-05, "loss": 0.1993, "step": 1700 }, { "epoch": 0.61, "eval_loss": 0.21818678081035614, "eval_na_accuracy": 0.8127147555351257, "eval_ordinal_accuracy": 0.6521962285041809, "eval_ordinal_mae": 0.47375088930130005, "eval_runtime": 196.7859, "eval_samples_per_second": 22.74, "eval_steps_per_second": 2.846, "step": 1700 }, { "epoch": 0.62, "grad_norm": 0.6673493385314941, "learning_rate": 1.1716346153846155e-05, "loss": 0.2627, "step": 1725 }, { "epoch": 0.63, "grad_norm": 4.143210411071777, "learning_rate": 1.1596153846153848e-05, "loss": 0.2294, "step": 1750 }, { "epoch": 0.64, "grad_norm": 0.9360339641571045, "learning_rate": 1.147596153846154e-05, "loss": 0.1854, "step": 1775 }, { "epoch": 0.65, "grad_norm": 4.869482040405273, "learning_rate": 1.1355769230769231e-05, "loss": 0.1962, "step": 1800 }, { "epoch": 0.65, "eval_loss": 0.22110989689826965, "eval_na_accuracy": 0.9140893220901489, "eval_ordinal_accuracy": 0.6231697797775269, "eval_ordinal_mae": 0.4857858419418335, "eval_runtime": 187.6432, "eval_samples_per_second": 23.848, "eval_steps_per_second": 2.984, "step": 1800 }, { "epoch": 0.66, "grad_norm": 5.728977203369141, "learning_rate": 1.1235576923076923e-05, "loss": 0.2294, "step": 1825 }, { "epoch": 0.67, "grad_norm": 0.5772213339805603, "learning_rate": 1.1115384615384617e-05, "loss": 0.1981, "step": 1850 }, { "epoch": 0.68, "grad_norm": 2.899949550628662, "learning_rate": 1.0995192307692308e-05, "loss": 0.1885, "step": 1875 }, { "epoch": 0.69, "grad_norm": 1.060448408126831, "learning_rate": 1.0875e-05, "loss": 0.1882, "step": 1900 }, { "epoch": 0.69, "eval_loss": 0.20448338985443115, "eval_na_accuracy": 0.8625429272651672, "eval_ordinal_accuracy": 0.6632417440414429, "eval_ordinal_mae": 0.4668627381324768, "eval_runtime": 187.7227, "eval_samples_per_second": 23.838, "eval_steps_per_second": 2.983, "step": 1900 }, { "epoch": 0.69, "grad_norm": 8.596738815307617, "learning_rate": 1.0754807692307693e-05, "loss": 0.2388, "step": 1925 }, { "epoch": 0.7, "grad_norm": 2.922056198120117, "learning_rate": 1.0634615384615385e-05, "loss": 0.2172, "step": 1950 }, { "epoch": 0.71, "grad_norm": 2.6090290546417236, "learning_rate": 1.0514423076923077e-05, "loss": 0.2432, "step": 1975 }, { "epoch": 0.72, "grad_norm": 0.9129126667976379, "learning_rate": 1.039423076923077e-05, "loss": 0.1895, "step": 2000 }, { "epoch": 0.72, "eval_loss": 0.2081986963748932, "eval_na_accuracy": 0.8608247637748718, "eval_ordinal_accuracy": 0.6316465735435486, "eval_ordinal_mae": 0.46963009238243103, "eval_runtime": 177.2649, "eval_samples_per_second": 25.245, "eval_steps_per_second": 3.159, "step": 2000 }, { "epoch": 0.73, "grad_norm": 14.650406837463379, "learning_rate": 1.0274038461538462e-05, "loss": 0.2168, "step": 2025 }, { "epoch": 0.74, "grad_norm": 2.405910015106201, "learning_rate": 1.0153846153846154e-05, "loss": 0.224, "step": 2050 }, { "epoch": 0.75, "grad_norm": 1.4129964113235474, "learning_rate": 1.0033653846153847e-05, "loss": 0.1908, "step": 2075 }, { "epoch": 0.76, "grad_norm": 2.485114812850952, "learning_rate": 9.913461538461539e-06, "loss": 0.1979, "step": 2100 }, { "epoch": 0.76, "eval_loss": 0.22696280479431152, "eval_na_accuracy": 0.900343656539917, "eval_ordinal_accuracy": 0.6372976899147034, "eval_ordinal_mae": 0.4791434407234192, "eval_runtime": 187.006, "eval_samples_per_second": 23.93, "eval_steps_per_second": 2.995, "step": 2100 }, { "epoch": 0.77, "grad_norm": 1.7261921167373657, "learning_rate": 9.79326923076923e-06, "loss": 0.2169, "step": 2125 }, { "epoch": 0.78, "grad_norm": 5.685389995574951, "learning_rate": 9.673076923076924e-06, "loss": 0.2446, "step": 2150 }, { "epoch": 0.78, "grad_norm": 1.2241212129592896, "learning_rate": 9.552884615384616e-06, "loss": 0.2202, "step": 2175 }, { "epoch": 0.79, "grad_norm": 3.1511054039001465, "learning_rate": 9.432692307692308e-06, "loss": 0.2643, "step": 2200 }, { "epoch": 0.79, "eval_loss": 0.20690996944904327, "eval_na_accuracy": 0.8556700944900513, "eval_ordinal_accuracy": 0.6414076685905457, "eval_ordinal_mae": 0.46626007556915283, "eval_runtime": 194.378, "eval_samples_per_second": 23.022, "eval_steps_per_second": 2.881, "step": 2200 }, { "epoch": 0.8, "grad_norm": 2.382234811782837, "learning_rate": 9.312500000000001e-06, "loss": 0.1782, "step": 2225 }, { "epoch": 0.81, "grad_norm": 3.646544933319092, "learning_rate": 9.192307692307693e-06, "loss": 0.1901, "step": 2250 }, { "epoch": 0.82, "grad_norm": 3.5835981369018555, "learning_rate": 9.072115384615385e-06, "loss": 0.179, "step": 2275 }, { "epoch": 0.83, "grad_norm": 0.6391886472702026, "learning_rate": 8.951923076923078e-06, "loss": 0.2279, "step": 2300 }, { "epoch": 0.83, "eval_loss": 0.2029835283756256, "eval_na_accuracy": 0.869415819644928, "eval_ordinal_accuracy": 0.654251217842102, "eval_ordinal_mae": 0.4581436216831207, "eval_runtime": 190.8034, "eval_samples_per_second": 23.453, "eval_steps_per_second": 2.935, "step": 2300 }, { "epoch": 0.84, "grad_norm": 1.8583753108978271, "learning_rate": 8.83173076923077e-06, "loss": 0.2403, "step": 2325 }, { "epoch": 0.85, "grad_norm": 7.337312698364258, "learning_rate": 8.711538461538463e-06, "loss": 0.2262, "step": 2350 }, { "epoch": 0.86, "grad_norm": 4.292835712432861, "learning_rate": 8.591346153846155e-06, "loss": 0.2321, "step": 2375 }, { "epoch": 0.87, "grad_norm": 3.3267788887023926, "learning_rate": 8.471153846153847e-06, "loss": 0.1965, "step": 2400 }, { "epoch": 0.87, "eval_loss": 0.21094879508018494, "eval_na_accuracy": 0.800687313079834, "eval_ordinal_accuracy": 0.681993305683136, "eval_ordinal_mae": 0.44459667801856995, "eval_runtime": 184.1529, "eval_samples_per_second": 24.3, "eval_steps_per_second": 3.041, "step": 2400 }, { "epoch": 0.87, "grad_norm": 2.07079815864563, "learning_rate": 8.35096153846154e-06, "loss": 0.2122, "step": 2425 }, { "epoch": 0.88, "grad_norm": 3.6883444786071777, "learning_rate": 8.230769230769232e-06, "loss": 0.1876, "step": 2450 }, { "epoch": 0.89, "grad_norm": 0.5956806540489197, "learning_rate": 8.110576923076923e-06, "loss": 0.1804, "step": 2475 }, { "epoch": 0.9, "grad_norm": 0.6182098984718323, "learning_rate": 7.990384615384617e-06, "loss": 0.1637, "step": 2500 }, { "epoch": 0.9, "eval_loss": 0.20052286982536316, "eval_na_accuracy": 0.8556700944900513, "eval_ordinal_accuracy": 0.6763421297073364, "eval_ordinal_mae": 0.4438597857952118, "eval_runtime": 183.167, "eval_samples_per_second": 24.431, "eval_steps_per_second": 3.057, "step": 2500 }, { "epoch": 0.91, "grad_norm": 2.0849900245666504, "learning_rate": 7.875e-06, "loss": 0.2287, "step": 2525 }, { "epoch": 0.92, "grad_norm": 2.9747681617736816, "learning_rate": 7.754807692307693e-06, "loss": 0.202, "step": 2550 }, { "epoch": 0.93, "grad_norm": 0.7342644333839417, "learning_rate": 7.634615384615385e-06, "loss": 0.1887, "step": 2575 }, { "epoch": 0.94, "grad_norm": 2.5988609790802, "learning_rate": 7.514423076923078e-06, "loss": 0.1705, "step": 2600 }, { "epoch": 0.94, "eval_loss": 0.19641266763210297, "eval_na_accuracy": 0.8539518713951111, "eval_ordinal_accuracy": 0.6748009324073792, "eval_ordinal_mae": 0.43212634325027466, "eval_runtime": 189.9978, "eval_samples_per_second": 23.553, "eval_steps_per_second": 2.947, "step": 2600 }, { "epoch": 0.95, "grad_norm": 3.347687244415283, "learning_rate": 7.39423076923077e-06, "loss": 0.1947, "step": 2625 }, { "epoch": 0.96, "grad_norm": 1.516992449760437, "learning_rate": 7.274038461538462e-06, "loss": 0.2137, "step": 2650 }, { "epoch": 0.96, "grad_norm": 1.79114830493927, "learning_rate": 7.153846153846155e-06, "loss": 0.2398, "step": 2675 }, { "epoch": 0.97, "grad_norm": 2.8799991607666016, "learning_rate": 7.033653846153847e-06, "loss": 0.2412, "step": 2700 }, { "epoch": 0.97, "eval_loss": 0.19578155875205994, "eval_na_accuracy": 0.8780068755149841, "eval_ordinal_accuracy": 0.6730028390884399, "eval_ordinal_mae": 0.4344838559627533, "eval_runtime": 198.8509, "eval_samples_per_second": 22.504, "eval_steps_per_second": 2.816, "step": 2700 }, { "epoch": 0.98, "grad_norm": 1.0458216667175293, "learning_rate": 6.913461538461539e-06, "loss": 0.1697, "step": 2725 }, { "epoch": 0.99, "grad_norm": 2.2299771308898926, "learning_rate": 6.7932692307692315e-06, "loss": 0.1623, "step": 2750 }, { "epoch": 1.0, "grad_norm": 1.825832724571228, "learning_rate": 6.673076923076923e-06, "loss": 0.2052, "step": 2775 }, { "epoch": 1.01, "grad_norm": 0.6158725619316101, "learning_rate": 6.552884615384616e-06, "loss": 0.1438, "step": 2800 }, { "epoch": 1.01, "eval_loss": 0.19719891250133514, "eval_na_accuracy": 0.8470790386199951, "eval_ordinal_accuracy": 0.6783971190452576, "eval_ordinal_mae": 0.43012040853500366, "eval_runtime": 190.7061, "eval_samples_per_second": 23.465, "eval_steps_per_second": 2.936, "step": 2800 }, { "epoch": 1.02, "grad_norm": 1.041905164718628, "learning_rate": 6.432692307692308e-06, "loss": 0.1584, "step": 2825 }, { "epoch": 1.03, "grad_norm": 0.6891164779663086, "learning_rate": 6.3125e-06, "loss": 0.1488, "step": 2850 }, { "epoch": 1.04, "grad_norm": 4.814165115356445, "learning_rate": 6.192307692307693e-06, "loss": 0.1565, "step": 2875 }, { "epoch": 1.05, "grad_norm": 1.0739597082138062, "learning_rate": 6.0721153846153844e-06, "loss": 0.123, "step": 2900 }, { "epoch": 1.05, "eval_loss": 0.19954617321491241, "eval_na_accuracy": 0.8419243693351746, "eval_ordinal_accuracy": 0.6753146648406982, "eval_ordinal_mae": 0.42309799790382385, "eval_runtime": 185.744, "eval_samples_per_second": 24.092, "eval_steps_per_second": 3.015, "step": 2900 }, { "epoch": 1.05, "grad_norm": 3.492755651473999, "learning_rate": 5.951923076923077e-06, "loss": 0.1547, "step": 2925 }, { "epoch": 1.06, "grad_norm": 0.9093284010887146, "learning_rate": 5.8317307692307704e-06, "loss": 0.1258, "step": 2950 }, { "epoch": 1.07, "grad_norm": 0.7456061244010925, "learning_rate": 5.711538461538461e-06, "loss": 0.1648, "step": 2975 }, { "epoch": 1.08, "grad_norm": 0.8997055888175964, "learning_rate": 5.591346153846155e-06, "loss": 0.1411, "step": 3000 }, { "epoch": 1.08, "eval_loss": 0.19463004171848297, "eval_na_accuracy": 0.8453608155250549, "eval_ordinal_accuracy": 0.6817364692687988, "eval_ordinal_mae": 0.42202073335647583, "eval_runtime": 176.571, "eval_samples_per_second": 25.344, "eval_steps_per_second": 3.172, "step": 3000 }, { "epoch": 1.09, "grad_norm": 0.5158917903900146, "learning_rate": 5.471153846153847e-06, "loss": 0.168, "step": 3025 }, { "epoch": 1.1, "grad_norm": 1.5927814245224, "learning_rate": 5.350961538461539e-06, "loss": 0.1113, "step": 3050 }, { "epoch": 1.11, "grad_norm": 0.7580274343490601, "learning_rate": 5.230769230769232e-06, "loss": 0.1489, "step": 3075 }, { "epoch": 1.12, "grad_norm": 0.6561002135276794, "learning_rate": 5.110576923076923e-06, "loss": 0.1443, "step": 3100 }, { "epoch": 1.12, "eval_loss": 0.19156721234321594, "eval_na_accuracy": 0.8591065406799316, "eval_ordinal_accuracy": 0.682763934135437, "eval_ordinal_mae": 0.42213648557662964, "eval_runtime": 180.8137, "eval_samples_per_second": 24.749, "eval_steps_per_second": 3.097, "step": 3100 }, { "epoch": 1.13, "grad_norm": 0.7529481053352356, "learning_rate": 4.990384615384616e-06, "loss": 0.1546, "step": 3125 }, { "epoch": 1.14, "grad_norm": 3.369683027267456, "learning_rate": 4.870192307692308e-06, "loss": 0.136, "step": 3150 }, { "epoch": 1.14, "grad_norm": 0.9865265488624573, "learning_rate": 4.75e-06, "loss": 0.1498, "step": 3175 }, { "epoch": 1.15, "grad_norm": 1.8516024351119995, "learning_rate": 4.629807692307693e-06, "loss": 0.208, "step": 3200 }, { "epoch": 1.15, "eval_loss": 0.19419582188129425, "eval_na_accuracy": 0.8676975965499878, "eval_ordinal_accuracy": 0.6740303039550781, "eval_ordinal_mae": 0.4163132607936859, "eval_runtime": 188.6406, "eval_samples_per_second": 23.722, "eval_steps_per_second": 2.969, "step": 3200 }, { "epoch": 1.16, "grad_norm": 1.1575658321380615, "learning_rate": 4.509615384615385e-06, "loss": 0.1316, "step": 3225 }, { "epoch": 1.17, "grad_norm": 0.9484291076660156, "learning_rate": 4.389423076923077e-06, "loss": 0.1204, "step": 3250 }, { "epoch": 1.18, "grad_norm": 2.565762519836426, "learning_rate": 4.26923076923077e-06, "loss": 0.1262, "step": 3275 }, { "epoch": 1.19, "grad_norm": 2.2757420539855957, "learning_rate": 4.149038461538462e-06, "loss": 0.1343, "step": 3300 }, { "epoch": 1.19, "eval_loss": 0.19619733095169067, "eval_na_accuracy": 0.8470790386199951, "eval_ordinal_accuracy": 0.6889288425445557, "eval_ordinal_mae": 0.4182307720184326, "eval_runtime": 187.8854, "eval_samples_per_second": 23.818, "eval_steps_per_second": 2.981, "step": 3300 }, { "epoch": 1.2, "grad_norm": 3.402385711669922, "learning_rate": 4.028846153846154e-06, "loss": 0.1397, "step": 3325 }, { "epoch": 1.21, "grad_norm": 0.881535530090332, "learning_rate": 3.908653846153847e-06, "loss": 0.1176, "step": 3350 }, { "epoch": 1.22, "grad_norm": 0.948428213596344, "learning_rate": 3.7884615384615388e-06, "loss": 0.1767, "step": 3375 }, { "epoch": 1.23, "grad_norm": 1.2142385244369507, "learning_rate": 3.668269230769231e-06, "loss": 0.1347, "step": 3400 }, { "epoch": 1.23, "eval_loss": 0.1938188225030899, "eval_na_accuracy": 0.8659793734550476, "eval_ordinal_accuracy": 0.6899563074111938, "eval_ordinal_mae": 0.4161255955696106, "eval_runtime": 185.9367, "eval_samples_per_second": 24.067, "eval_steps_per_second": 3.012, "step": 3400 }, { "epoch": 1.24, "grad_norm": 3.2418200969696045, "learning_rate": 3.5480769230769235e-06, "loss": 0.1585, "step": 3425 }, { "epoch": 1.24, "grad_norm": 1.845564842224121, "learning_rate": 3.4278846153846157e-06, "loss": 0.1499, "step": 3450 }, { "epoch": 1.25, "grad_norm": 1.7326797246932983, "learning_rate": 3.307692307692308e-06, "loss": 0.1286, "step": 3475 }, { "epoch": 1.26, "grad_norm": 4.901269912719727, "learning_rate": 3.1875e-06, "loss": 0.1076, "step": 3500 }, { "epoch": 1.26, "eval_loss": 0.19698283076286316, "eval_na_accuracy": 0.8470790386199951, "eval_ordinal_accuracy": 0.6943231225013733, "eval_ordinal_mae": 0.41806870698928833, "eval_runtime": 187.7833, "eval_samples_per_second": 23.831, "eval_steps_per_second": 2.982, "step": 3500 }, { "epoch": 1.27, "grad_norm": 1.3833277225494385, "learning_rate": 3.0673076923076926e-06, "loss": 0.1246, "step": 3525 }, { "epoch": 1.28, "grad_norm": 0.9357690215110779, "learning_rate": 2.947115384615385e-06, "loss": 0.1113, "step": 3550 }, { "epoch": 1.29, "grad_norm": 7.975602149963379, "learning_rate": 2.8269230769230773e-06, "loss": 0.1872, "step": 3575 }, { "epoch": 1.3, "grad_norm": 0.5384923815727234, "learning_rate": 2.7067307692307694e-06, "loss": 0.1248, "step": 3600 }, { "epoch": 1.3, "eval_loss": 0.19512778520584106, "eval_na_accuracy": 0.8470790386199951, "eval_ordinal_accuracy": 0.6958643794059753, "eval_ordinal_mae": 0.4150661826133728, "eval_runtime": 190.3088, "eval_samples_per_second": 23.514, "eval_steps_per_second": 2.943, "step": 3600 }, { "epoch": 1.31, "grad_norm": 0.7750712633132935, "learning_rate": 2.586538461538462e-06, "loss": 0.1249, "step": 3625 }, { "epoch": 1.32, "grad_norm": 0.7317385077476501, "learning_rate": 2.466346153846154e-06, "loss": 0.124, "step": 3650 }, { "epoch": 1.33, "grad_norm": 0.79685378074646, "learning_rate": 2.3461538461538463e-06, "loss": 0.1524, "step": 3675 }, { "epoch": 1.33, "grad_norm": 0.9845700263977051, "learning_rate": 2.2259615384615385e-06, "loss": 0.1455, "step": 3700 }, { "epoch": 1.33, "eval_loss": 0.1951962560415268, "eval_na_accuracy": 0.8814433217048645, "eval_ordinal_accuracy": 0.6850757598876953, "eval_ordinal_mae": 0.4146950840950012, "eval_runtime": 188.1776, "eval_samples_per_second": 23.781, "eval_steps_per_second": 2.976, "step": 3700 }, { "epoch": 1.34, "grad_norm": 2.780651330947876, "learning_rate": 2.105769230769231e-06, "loss": 0.174, "step": 3725 }, { "epoch": 1.35, "grad_norm": 1.0597649812698364, "learning_rate": 1.9855769230769232e-06, "loss": 0.1211, "step": 3750 }, { "epoch": 1.36, "grad_norm": 2.3485703468322754, "learning_rate": 1.8653846153846156e-06, "loss": 0.1537, "step": 3775 }, { "epoch": 1.37, "grad_norm": 6.283831596374512, "learning_rate": 1.7451923076923077e-06, "loss": 0.131, "step": 3800 }, { "epoch": 1.37, "eval_loss": 0.19528667628765106, "eval_na_accuracy": 0.8453608155250549, "eval_ordinal_accuracy": 0.6948369145393372, "eval_ordinal_mae": 0.4172358810901642, "eval_runtime": 191.9622, "eval_samples_per_second": 23.312, "eval_steps_per_second": 2.917, "step": 3800 }, { "epoch": 1.38, "grad_norm": 0.6940748691558838, "learning_rate": 1.6250000000000001e-06, "loss": 0.1321, "step": 3825 }, { "epoch": 1.39, "grad_norm": 2.970365285873413, "learning_rate": 1.5048076923076923e-06, "loss": 0.1578, "step": 3850 }, { "epoch": 1.4, "grad_norm": 3.247617483139038, "learning_rate": 1.3846153846153848e-06, "loss": 0.1484, "step": 3875 }, { "epoch": 1.41, "grad_norm": 0.7711169719696045, "learning_rate": 1.264423076923077e-06, "loss": 0.1307, "step": 3900 }, { "epoch": 1.41, "eval_loss": 0.19318200647830963, "eval_na_accuracy": 0.8642611503601074, "eval_ordinal_accuracy": 0.692781925201416, "eval_ordinal_mae": 0.41271111369132996, "eval_runtime": 192.6809, "eval_samples_per_second": 23.225, "eval_steps_per_second": 2.906, "step": 3900 }, { "epoch": 1.42, "grad_norm": 0.6948511004447937, "learning_rate": 1.1442307692307694e-06, "loss": 0.1566, "step": 3925 }, { "epoch": 1.42, "grad_norm": 0.9776670336723328, "learning_rate": 1.0240384615384615e-06, "loss": 0.153, "step": 3950 }, { "epoch": 1.43, "grad_norm": 1.5596826076507568, "learning_rate": 9.038461538461539e-07, "loss": 0.1316, "step": 3975 }, { "epoch": 1.44, "grad_norm": 0.5882940888404846, "learning_rate": 7.836538461538463e-07, "loss": 0.1198, "step": 4000 }, { "epoch": 1.44, "eval_loss": 0.19471855461597443, "eval_na_accuracy": 0.8573883175849915, "eval_ordinal_accuracy": 0.6940662860870361, "eval_ordinal_mae": 0.4110487997531891, "eval_runtime": 192.5912, "eval_samples_per_second": 23.236, "eval_steps_per_second": 2.908, "step": 4000 }, { "epoch": 1.45, "grad_norm": 1.4058364629745483, "learning_rate": 6.634615384615385e-07, "loss": 0.1154, "step": 4025 }, { "epoch": 1.46, "grad_norm": 2.0241715908050537, "learning_rate": 5.432692307692308e-07, "loss": 0.1359, "step": 4050 }, { "epoch": 1.47, "grad_norm": 2.556328773498535, "learning_rate": 4.2307692307692315e-07, "loss": 0.1374, "step": 4075 }, { "epoch": 1.48, "grad_norm": 0.89422607421875, "learning_rate": 3.028846153846154e-07, "loss": 0.1363, "step": 4100 }, { "epoch": 1.48, "eval_loss": 0.19520333409309387, "eval_na_accuracy": 0.8573883175849915, "eval_ordinal_accuracy": 0.6886719465255737, "eval_ordinal_mae": 0.4086832106113434, "eval_runtime": 191.6181, "eval_samples_per_second": 23.354, "eval_steps_per_second": 2.922, "step": 4100 }, { "epoch": 1.49, "grad_norm": 1.4230666160583496, "learning_rate": 1.8269230769230772e-07, "loss": 0.1273, "step": 4125 }, { "epoch": 1.5, "grad_norm": 2.7363412380218506, "learning_rate": 6.250000000000001e-08, "loss": 0.1068, "step": 4150 }, { "epoch": 1.5, "step": 4160, "total_flos": 5.158051742063002e+18, "train_loss": 0.21532289660893955, "train_runtime": 13643.9574, "train_samples_per_second": 4.878, "train_steps_per_second": 0.305 } ], "logging_steps": 25, "max_steps": 4160, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 5.158051742063002e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }