{ "best_metric": 0.26356959342956543, "best_model_checkpoint": "./ryan_model3272024/checkpoint-1000", "epoch": 0.6496519721577726, "eval_steps": 100, "global_step": 1400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 0.38699468970298767, "learning_rate": 0.0001994199535962877, "loss": 0.4038, "step": 25 }, { "epoch": 0.02, "grad_norm": 0.6787680387496948, "learning_rate": 0.00019883990719257543, "loss": 0.4003, "step": 50 }, { "epoch": 0.03, "grad_norm": 0.5743306279182434, "learning_rate": 0.00019825986078886312, "loss": 0.3591, "step": 75 }, { "epoch": 0.05, "grad_norm": 0.41705068945884705, "learning_rate": 0.00019767981438515082, "loss": 0.3524, "step": 100 }, { "epoch": 0.05, "eval_loss": 0.339992493391037, "eval_na_accuracy": 0.7586872577667236, "eval_ordinal_accuracy": 0.38746026158332825, "eval_ordinal_mae": 0.8904515504837036, "eval_runtime": 335.205, "eval_samples_per_second": 11.87, "eval_steps_per_second": 1.486, "step": 100 }, { "epoch": 0.06, "grad_norm": 0.36200761795043945, "learning_rate": 0.0001970997679814385, "loss": 0.3071, "step": 125 }, { "epoch": 0.07, "grad_norm": 0.24589791893959045, "learning_rate": 0.00019651972157772623, "loss": 0.3475, "step": 150 }, { "epoch": 0.08, "grad_norm": 0.6089735627174377, "learning_rate": 0.00019593967517401393, "loss": 0.3072, "step": 175 }, { "epoch": 0.09, "grad_norm": 0.5671761631965637, "learning_rate": 0.00019535962877030162, "loss": 0.2683, "step": 200 }, { "epoch": 0.09, "eval_loss": 0.36712726950645447, "eval_na_accuracy": 0.623552143573761, "eval_ordinal_accuracy": 0.48916497826576233, "eval_ordinal_mae": 0.7306416630744934, "eval_runtime": 155.9343, "eval_samples_per_second": 25.517, "eval_steps_per_second": 3.194, "step": 200 }, { "epoch": 0.1, "grad_norm": 1.2764167785644531, "learning_rate": 0.00019477958236658932, "loss": 0.2953, "step": 225 }, { "epoch": 0.12, "grad_norm": 1.9076497554779053, "learning_rate": 0.00019419953596287704, "loss": 0.3382, "step": 250 }, { "epoch": 0.13, "grad_norm": 0.2747127115726471, "learning_rate": 0.00019361948955916474, "loss": 0.2752, "step": 275 }, { "epoch": 0.14, "grad_norm": 0.9448749423027039, "learning_rate": 0.00019303944315545243, "loss": 0.3314, "step": 300 }, { "epoch": 0.14, "eval_loss": 0.3450469672679901, "eval_na_accuracy": 0.6969112157821655, "eval_ordinal_accuracy": 0.4013291001319885, "eval_ordinal_mae": 0.8077224493026733, "eval_runtime": 156.2328, "eval_samples_per_second": 25.468, "eval_steps_per_second": 3.188, "step": 300 }, { "epoch": 0.15, "grad_norm": 0.2589721083641052, "learning_rate": 0.00019245939675174015, "loss": 0.3486, "step": 325 }, { "epoch": 0.16, "grad_norm": 0.44286003708839417, "learning_rate": 0.00019187935034802785, "loss": 0.3386, "step": 350 }, { "epoch": 0.17, "grad_norm": 0.3215602934360504, "learning_rate": 0.00019129930394431554, "loss": 0.3056, "step": 375 }, { "epoch": 0.19, "grad_norm": 0.9510051012039185, "learning_rate": 0.00019071925754060324, "loss": 0.2747, "step": 400 }, { "epoch": 0.19, "eval_loss": 0.28132036328315735, "eval_na_accuracy": 0.7895752787590027, "eval_ordinal_accuracy": 0.5423288345336914, "eval_ordinal_mae": 0.6105712056159973, "eval_runtime": 155.1965, "eval_samples_per_second": 25.638, "eval_steps_per_second": 3.209, "step": 400 }, { "epoch": 0.2, "grad_norm": 0.5417093634605408, "learning_rate": 0.00019013921113689096, "loss": 0.2522, "step": 425 }, { "epoch": 0.21, "grad_norm": 1.405881643295288, "learning_rate": 0.00018955916473317868, "loss": 0.3589, "step": 450 }, { "epoch": 0.22, "grad_norm": 0.8319898843765259, "learning_rate": 0.00018897911832946638, "loss": 0.2991, "step": 475 }, { "epoch": 0.23, "grad_norm": 1.9455621242523193, "learning_rate": 0.00018839907192575407, "loss": 0.3247, "step": 500 }, { "epoch": 0.23, "eval_loss": 0.3143959045410156, "eval_na_accuracy": 0.7104247212409973, "eval_ordinal_accuracy": 0.4524703919887543, "eval_ordinal_mae": 0.7256373763084412, "eval_runtime": 157.1141, "eval_samples_per_second": 25.326, "eval_steps_per_second": 3.17, "step": 500 }, { "epoch": 0.24, "grad_norm": 0.6339251399040222, "learning_rate": 0.00018781902552204177, "loss": 0.303, "step": 525 }, { "epoch": 0.26, "grad_norm": 0.3713740408420563, "learning_rate": 0.0001872389791183295, "loss": 0.3035, "step": 550 }, { "epoch": 0.27, "grad_norm": 0.7050974369049072, "learning_rate": 0.00018665893271461718, "loss": 0.2609, "step": 575 }, { "epoch": 0.28, "grad_norm": 0.791477620601654, "learning_rate": 0.00018607888631090488, "loss": 0.3612, "step": 600 }, { "epoch": 0.28, "eval_loss": 0.3074879050254822, "eval_na_accuracy": 0.7586872577667236, "eval_ordinal_accuracy": 0.4984108507633209, "eval_ordinal_mae": 0.6415887475013733, "eval_runtime": 154.2538, "eval_samples_per_second": 25.795, "eval_steps_per_second": 3.228, "step": 600 }, { "epoch": 0.29, "grad_norm": 0.39196524024009705, "learning_rate": 0.0001854988399071926, "loss": 0.31, "step": 625 }, { "epoch": 0.3, "grad_norm": 1.0753191709518433, "learning_rate": 0.0001849187935034803, "loss": 0.2722, "step": 650 }, { "epoch": 0.31, "grad_norm": 0.8922611474990845, "learning_rate": 0.000184338747099768, "loss": 0.3132, "step": 675 }, { "epoch": 0.32, "grad_norm": 0.6866246461868286, "learning_rate": 0.0001837587006960557, "loss": 0.3031, "step": 700 }, { "epoch": 0.32, "eval_loss": 0.2784635126590729, "eval_na_accuracy": 0.7895752787590027, "eval_ordinal_accuracy": 0.5556197762489319, "eval_ordinal_mae": 0.5720168352127075, "eval_runtime": 154.421, "eval_samples_per_second": 25.767, "eval_steps_per_second": 3.225, "step": 700 }, { "epoch": 0.34, "grad_norm": 1.713051676750183, "learning_rate": 0.0001831786542923434, "loss": 0.337, "step": 725 }, { "epoch": 0.35, "grad_norm": 1.0872548818588257, "learning_rate": 0.0001825986078886311, "loss": 0.2918, "step": 750 }, { "epoch": 0.36, "grad_norm": 1.5099256038665771, "learning_rate": 0.0001820185614849188, "loss": 0.2509, "step": 775 }, { "epoch": 0.37, "grad_norm": 0.5774210691452026, "learning_rate": 0.0001814385150812065, "loss": 0.2866, "step": 800 }, { "epoch": 0.37, "eval_loss": 0.28780511021614075, "eval_na_accuracy": 0.7335907220840454, "eval_ordinal_accuracy": 0.5775787234306335, "eval_ordinal_mae": 0.5347856879234314, "eval_runtime": 154.6062, "eval_samples_per_second": 25.736, "eval_steps_per_second": 3.221, "step": 800 }, { "epoch": 0.38, "grad_norm": 0.33059367537498474, "learning_rate": 0.00018085846867749422, "loss": 0.2626, "step": 825 }, { "epoch": 0.39, "grad_norm": 1.45087730884552, "learning_rate": 0.0001802784222737819, "loss": 0.3485, "step": 850 }, { "epoch": 0.41, "grad_norm": 1.195901870727539, "learning_rate": 0.0001796983758700696, "loss": 0.3007, "step": 875 }, { "epoch": 0.42, "grad_norm": 0.26779890060424805, "learning_rate": 0.00017911832946635733, "loss": 0.2927, "step": 900 }, { "epoch": 0.42, "eval_loss": 0.2688673734664917, "eval_na_accuracy": 0.7972972989082336, "eval_ordinal_accuracy": 0.5573533773422241, "eval_ordinal_mae": 0.5855077505111694, "eval_runtime": 154.5178, "eval_samples_per_second": 25.751, "eval_steps_per_second": 3.223, "step": 900 }, { "epoch": 0.43, "grad_norm": 0.5635965466499329, "learning_rate": 0.00017853828306264502, "loss": 0.269, "step": 925 }, { "epoch": 0.44, "grad_norm": 2.8135786056518555, "learning_rate": 0.00017795823665893272, "loss": 0.2677, "step": 950 }, { "epoch": 0.45, "grad_norm": 0.49396631121635437, "learning_rate": 0.0001773781902552204, "loss": 0.3069, "step": 975 }, { "epoch": 0.46, "grad_norm": 1.3267723321914673, "learning_rate": 0.00017679814385150814, "loss": 0.3003, "step": 1000 }, { "epoch": 0.46, "eval_loss": 0.26356959342956543, "eval_na_accuracy": 0.7915058135986328, "eval_ordinal_accuracy": 0.581045925617218, "eval_ordinal_mae": 0.5543876886367798, "eval_runtime": 157.946, "eval_samples_per_second": 25.192, "eval_steps_per_second": 3.153, "step": 1000 }, { "epoch": 0.48, "grad_norm": 0.9938157200813293, "learning_rate": 0.00017621809744779583, "loss": 0.2521, "step": 1025 }, { "epoch": 0.49, "grad_norm": 0.45715010166168213, "learning_rate": 0.00017563805104408353, "loss": 0.2926, "step": 1050 }, { "epoch": 0.5, "grad_norm": 2.9666409492492676, "learning_rate": 0.00017505800464037122, "loss": 0.2581, "step": 1075 }, { "epoch": 0.51, "grad_norm": 2.5301055908203125, "learning_rate": 0.00017447795823665894, "loss": 0.2522, "step": 1100 }, { "epoch": 0.51, "eval_loss": 0.3009192943572998, "eval_na_accuracy": 0.8571428656578064, "eval_ordinal_accuracy": 0.54435133934021, "eval_ordinal_mae": 0.5650931596755981, "eval_runtime": 159.1216, "eval_samples_per_second": 25.006, "eval_steps_per_second": 3.13, "step": 1100 }, { "epoch": 0.52, "grad_norm": 0.8192782998085022, "learning_rate": 0.00017389791183294664, "loss": 0.3584, "step": 1125 }, { "epoch": 0.53, "grad_norm": 2.0657265186309814, "learning_rate": 0.00017331786542923433, "loss": 0.2547, "step": 1150 }, { "epoch": 0.55, "grad_norm": 0.5887840390205383, "learning_rate": 0.00017273781902552203, "loss": 0.2335, "step": 1175 }, { "epoch": 0.56, "grad_norm": 0.8169906735420227, "learning_rate": 0.00017215777262180975, "loss": 0.262, "step": 1200 }, { "epoch": 0.56, "eval_loss": 0.279022216796875, "eval_na_accuracy": 0.8301158547401428, "eval_ordinal_accuracy": 0.5801791548728943, "eval_ordinal_mae": 0.5203233361244202, "eval_runtime": 159.9167, "eval_samples_per_second": 24.882, "eval_steps_per_second": 3.114, "step": 1200 }, { "epoch": 0.57, "grad_norm": 2.5461835861206055, "learning_rate": 0.00017157772621809744, "loss": 0.2387, "step": 1225 }, { "epoch": 0.58, "grad_norm": 0.7304142117500305, "learning_rate": 0.00017099767981438517, "loss": 0.2366, "step": 1250 }, { "epoch": 0.59, "grad_norm": 1.3845186233520508, "learning_rate": 0.00017041763341067286, "loss": 0.2309, "step": 1275 }, { "epoch": 0.6, "grad_norm": 0.5202885270118713, "learning_rate": 0.00016983758700696058, "loss": 0.2139, "step": 1300 }, { "epoch": 0.6, "eval_loss": 0.2653418481349945, "eval_na_accuracy": 0.7509652376174927, "eval_ordinal_accuracy": 0.5492632389068604, "eval_ordinal_mae": 0.562603771686554, "eval_runtime": 158.9921, "eval_samples_per_second": 25.026, "eval_steps_per_second": 3.132, "step": 1300 }, { "epoch": 0.61, "grad_norm": 1.6506483554840088, "learning_rate": 0.00016925754060324828, "loss": 0.3071, "step": 1325 }, { "epoch": 0.63, "grad_norm": 0.5789369940757751, "learning_rate": 0.00016867749419953597, "loss": 0.2689, "step": 1350 }, { "epoch": 0.64, "grad_norm": 0.5665389895439148, "learning_rate": 0.00016809744779582367, "loss": 0.2598, "step": 1375 }, { "epoch": 0.65, "grad_norm": 0.6937847137451172, "learning_rate": 0.0001675174013921114, "loss": 0.2655, "step": 1400 }, { "epoch": 0.65, "eval_loss": 0.2760397493839264, "eval_na_accuracy": 0.7123551964759827, "eval_ordinal_accuracy": 0.5426177382469177, "eval_ordinal_mae": 0.6106911897659302, "eval_runtime": 160.1635, "eval_samples_per_second": 24.843, "eval_steps_per_second": 3.109, "step": 1400 }, { "epoch": 0.65, "step": 1400, "total_flos": 1.735882797809664e+18, "train_loss": 0.29669314997536794, "train_runtime": 4786.838, "train_samples_per_second": 28.807, "train_steps_per_second": 1.801 } ], "logging_steps": 25, "max_steps": 8620, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 1.735882797809664e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }