{ "best_metric": 0.757429718875502, "best_model_checkpoint": "./results_small_seed2CasedUnstripped/checkpoint-19000", "epoch": 0.7741199478487614, "global_step": 19000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 1.0183299389002038e-05, "loss": 1.0885, "step": 250 }, { "epoch": 0.02, "learning_rate": 2.0366598778004075e-05, "loss": 0.9616, "step": 500 }, { "epoch": 0.02, "eval_accuracy": 0.6345381526104418, "eval_loss": 0.8497252464294434, "eval_runtime": 4.8371, "eval_samples_per_second": 514.77, "eval_steps_per_second": 16.125, "step": 500 }, { "epoch": 0.03, "learning_rate": 3.0549898167006117e-05, "loss": 0.8224, "step": 750 }, { "epoch": 0.04, "learning_rate": 4.073319755600815e-05, "loss": 0.7802, "step": 1000 }, { "epoch": 0.04, "eval_accuracy": 0.6522088353413654, "eval_loss": 0.8134140968322754, "eval_runtime": 4.8807, "eval_samples_per_second": 510.173, "eval_steps_per_second": 15.981, "step": 1000 }, { "epoch": 0.05, "learning_rate": 5.0916496945010185e-05, "loss": 0.7951, "step": 1250 }, { "epoch": 0.06, "learning_rate": 6.109979633401223e-05, "loss": 0.7652, "step": 1500 }, { "epoch": 0.06, "eval_accuracy": 0.6859437751004016, "eval_loss": 0.762012243270874, "eval_runtime": 4.8074, "eval_samples_per_second": 517.952, "eval_steps_per_second": 16.225, "step": 1500 }, { "epoch": 0.07, "learning_rate": 7.128309572301426e-05, "loss": 0.758, "step": 1750 }, { "epoch": 0.08, "learning_rate": 8.14663951120163e-05, "loss": 0.7811, "step": 2000 }, { "epoch": 0.08, "eval_accuracy": 0.6746987951807228, "eval_loss": 0.7685323357582092, "eval_runtime": 4.7928, "eval_samples_per_second": 519.528, "eval_steps_per_second": 16.274, "step": 2000 }, { "epoch": 0.09, "learning_rate": 9.164969450101833e-05, "loss": 0.7824, "step": 2250 }, { "epoch": 0.1, "learning_rate": 9.979627869075105e-05, "loss": 0.81, "step": 2500 }, { "epoch": 0.1, "eval_accuracy": 0.6481927710843374, "eval_loss": 0.7982615232467651, "eval_runtime": 4.802, "eval_samples_per_second": 518.539, "eval_steps_per_second": 16.243, "step": 2500 }, { "epoch": 0.11, "learning_rate": 9.866449363936802e-05, "loss": 0.791, "step": 2750 }, { "epoch": 0.12, "learning_rate": 9.753270858798497e-05, "loss": 0.793, "step": 3000 }, { "epoch": 0.12, "eval_accuracy": 0.6365461847389559, "eval_loss": 0.8263759016990662, "eval_runtime": 4.8663, "eval_samples_per_second": 511.678, "eval_steps_per_second": 16.028, "step": 3000 }, { "epoch": 0.13, "learning_rate": 9.640092353660193e-05, "loss": 0.785, "step": 3250 }, { "epoch": 0.14, "learning_rate": 9.526913848521888e-05, "loss": 0.7876, "step": 3500 }, { "epoch": 0.14, "eval_accuracy": 0.6598393574297189, "eval_loss": 0.7989997267723083, "eval_runtime": 4.7903, "eval_samples_per_second": 519.799, "eval_steps_per_second": 16.283, "step": 3500 }, { "epoch": 0.15, "learning_rate": 9.413735343383585e-05, "loss": 0.7866, "step": 3750 }, { "epoch": 0.16, "learning_rate": 9.30055683824528e-05, "loss": 0.7799, "step": 4000 }, { "epoch": 0.16, "eval_accuracy": 0.6718875502008033, "eval_loss": 0.7617995142936707, "eval_runtime": 4.8098, "eval_samples_per_second": 517.691, "eval_steps_per_second": 16.217, "step": 4000 }, { "epoch": 0.17, "learning_rate": 9.187378333106976e-05, "loss": 0.7699, "step": 4250 }, { "epoch": 0.18, "learning_rate": 9.074199827968673e-05, "loss": 0.7487, "step": 4500 }, { "epoch": 0.18, "eval_accuracy": 0.6767068273092369, "eval_loss": 0.7875193357467651, "eval_runtime": 4.8759, "eval_samples_per_second": 510.674, "eval_steps_per_second": 15.997, "step": 4500 }, { "epoch": 0.19, "learning_rate": 8.961021322830368e-05, "loss": 0.7532, "step": 4750 }, { "epoch": 0.2, "learning_rate": 8.847842817692065e-05, "loss": 0.7375, "step": 5000 }, { "epoch": 0.2, "eval_accuracy": 0.6726907630522089, "eval_loss": 0.7819647789001465, "eval_runtime": 4.7823, "eval_samples_per_second": 520.672, "eval_steps_per_second": 16.31, "step": 5000 }, { "epoch": 0.21, "learning_rate": 8.73466431255376e-05, "loss": 0.7374, "step": 5250 }, { "epoch": 0.22, "learning_rate": 8.621485807415456e-05, "loss": 0.7674, "step": 5500 }, { "epoch": 0.22, "eval_accuracy": 0.6751004016064257, "eval_loss": 0.7469549179077148, "eval_runtime": 4.7908, "eval_samples_per_second": 519.751, "eval_steps_per_second": 16.281, "step": 5500 }, { "epoch": 0.23, "learning_rate": 8.508307302277152e-05, "loss": 0.7531, "step": 5750 }, { "epoch": 0.24, "learning_rate": 8.395128797138848e-05, "loss": 0.74, "step": 6000 }, { "epoch": 0.24, "eval_accuracy": 0.6763052208835342, "eval_loss": 0.7755768895149231, "eval_runtime": 4.7885, "eval_samples_per_second": 519.996, "eval_steps_per_second": 16.289, "step": 6000 }, { "epoch": 0.25, "learning_rate": 8.281950292000543e-05, "loss": 0.7448, "step": 6250 }, { "epoch": 0.26, "learning_rate": 8.16877178686224e-05, "loss": 0.7444, "step": 6500 }, { "epoch": 0.26, "eval_accuracy": 0.6775100401606425, "eval_loss": 0.7682979702949524, "eval_runtime": 4.7945, "eval_samples_per_second": 519.348, "eval_steps_per_second": 16.269, "step": 6500 }, { "epoch": 0.28, "learning_rate": 8.055593281723935e-05, "loss": 0.7326, "step": 6750 }, { "epoch": 0.29, "learning_rate": 7.942414776585632e-05, "loss": 0.7431, "step": 7000 }, { "epoch": 0.29, "eval_accuracy": 0.6783132530120481, "eval_loss": 0.7539064288139343, "eval_runtime": 4.7996, "eval_samples_per_second": 518.797, "eval_steps_per_second": 16.251, "step": 7000 }, { "epoch": 0.3, "learning_rate": 7.829236271447327e-05, "loss": 0.7284, "step": 7250 }, { "epoch": 0.31, "learning_rate": 7.716057766309023e-05, "loss": 0.7155, "step": 7500 }, { "epoch": 0.31, "eval_accuracy": 0.6827309236947792, "eval_loss": 0.7247231602668762, "eval_runtime": 4.7845, "eval_samples_per_second": 520.426, "eval_steps_per_second": 16.302, "step": 7500 }, { "epoch": 0.32, "learning_rate": 7.60287926117072e-05, "loss": 0.7499, "step": 7750 }, { "epoch": 0.33, "learning_rate": 7.489700756032415e-05, "loss": 0.7087, "step": 8000 }, { "epoch": 0.33, "eval_accuracy": 0.7116465863453815, "eval_loss": 0.7027077674865723, "eval_runtime": 4.7959, "eval_samples_per_second": 519.195, "eval_steps_per_second": 16.264, "step": 8000 }, { "epoch": 0.34, "learning_rate": 7.37652225089411e-05, "loss": 0.7144, "step": 8250 }, { "epoch": 0.35, "learning_rate": 7.263343745755806e-05, "loss": 0.719, "step": 8500 }, { "epoch": 0.35, "eval_accuracy": 0.7004016064257028, "eval_loss": 0.73233962059021, "eval_runtime": 4.8318, "eval_samples_per_second": 515.339, "eval_steps_per_second": 16.143, "step": 8500 }, { "epoch": 0.36, "learning_rate": 7.150165240617503e-05, "loss": 0.7301, "step": 8750 }, { "epoch": 0.37, "learning_rate": 7.036986735479198e-05, "loss": 0.7225, "step": 9000 }, { "epoch": 0.37, "eval_accuracy": 0.7156626506024096, "eval_loss": 0.7080035209655762, "eval_runtime": 4.8777, "eval_samples_per_second": 510.486, "eval_steps_per_second": 15.991, "step": 9000 }, { "epoch": 0.38, "learning_rate": 6.923808230340894e-05, "loss": 0.7216, "step": 9250 }, { "epoch": 0.39, "learning_rate": 6.810629725202589e-05, "loss": 0.707, "step": 9500 }, { "epoch": 0.39, "eval_accuracy": 0.7004016064257028, "eval_loss": 0.7323043942451477, "eval_runtime": 4.9132, "eval_samples_per_second": 506.797, "eval_steps_per_second": 15.876, "step": 9500 }, { "epoch": 0.4, "learning_rate": 6.697451220064286e-05, "loss": 0.7133, "step": 9750 }, { "epoch": 0.41, "learning_rate": 6.584272714925982e-05, "loss": 0.6908, "step": 10000 }, { "epoch": 0.41, "eval_accuracy": 0.706425702811245, "eval_loss": 0.7354196310043335, "eval_runtime": 4.8275, "eval_samples_per_second": 515.794, "eval_steps_per_second": 16.157, "step": 10000 }, { "epoch": 0.42, "learning_rate": 6.471094209787677e-05, "loss": 0.7053, "step": 10250 }, { "epoch": 0.43, "learning_rate": 6.357915704649372e-05, "loss": 0.705, "step": 10500 }, { "epoch": 0.43, "eval_accuracy": 0.7261044176706827, "eval_loss": 0.6736636161804199, "eval_runtime": 4.8326, "eval_samples_per_second": 515.256, "eval_steps_per_second": 16.141, "step": 10500 }, { "epoch": 0.44, "learning_rate": 6.24473719951107e-05, "loss": 0.7213, "step": 10750 }, { "epoch": 0.45, "learning_rate": 6.131558694372765e-05, "loss": 0.6961, "step": 11000 }, { "epoch": 0.45, "eval_accuracy": 0.7208835341365462, "eval_loss": 0.6973076462745667, "eval_runtime": 4.901, "eval_samples_per_second": 508.056, "eval_steps_per_second": 15.915, "step": 11000 }, { "epoch": 0.46, "learning_rate": 6.01838018923446e-05, "loss": 0.7029, "step": 11250 }, { "epoch": 0.47, "learning_rate": 5.9052016840961564e-05, "loss": 0.6842, "step": 11500 }, { "epoch": 0.47, "eval_accuracy": 0.6855421686746987, "eval_loss": 0.7288179993629456, "eval_runtime": 4.8248, "eval_samples_per_second": 516.079, "eval_steps_per_second": 16.166, "step": 11500 }, { "epoch": 0.48, "learning_rate": 5.7920231789578526e-05, "loss": 0.6894, "step": 11750 }, { "epoch": 0.49, "learning_rate": 5.678844673819549e-05, "loss": 0.6879, "step": 12000 }, { "epoch": 0.49, "eval_accuracy": 0.7204819277108434, "eval_loss": 0.6706427931785583, "eval_runtime": 4.816, "eval_samples_per_second": 517.027, "eval_steps_per_second": 16.196, "step": 12000 }, { "epoch": 0.5, "learning_rate": 5.565666168681244e-05, "loss": 0.6962, "step": 12250 }, { "epoch": 0.51, "learning_rate": 5.4524876635429404e-05, "loss": 0.6782, "step": 12500 }, { "epoch": 0.51, "eval_accuracy": 0.7172690763052209, "eval_loss": 0.6858745813369751, "eval_runtime": 4.8781, "eval_samples_per_second": 510.448, "eval_steps_per_second": 15.99, "step": 12500 }, { "epoch": 0.52, "learning_rate": 5.339309158404636e-05, "loss": 0.6871, "step": 12750 }, { "epoch": 0.53, "learning_rate": 5.226130653266332e-05, "loss": 0.6606, "step": 13000 }, { "epoch": 0.53, "eval_accuracy": 0.7244979919678715, "eval_loss": 0.6812068819999695, "eval_runtime": 4.8174, "eval_samples_per_second": 516.876, "eval_steps_per_second": 16.191, "step": 13000 }, { "epoch": 0.54, "learning_rate": 5.1129521481280275e-05, "loss": 0.6948, "step": 13250 }, { "epoch": 0.55, "learning_rate": 4.9997736429897236e-05, "loss": 0.6752, "step": 13500 }, { "epoch": 0.55, "eval_accuracy": 0.7321285140562249, "eval_loss": 0.6839885115623474, "eval_runtime": 4.8713, "eval_samples_per_second": 511.155, "eval_steps_per_second": 16.012, "step": 13500 }, { "epoch": 0.56, "learning_rate": 4.88659513785142e-05, "loss": 0.6692, "step": 13750 }, { "epoch": 0.57, "learning_rate": 4.773416632713115e-05, "loss": 0.6502, "step": 14000 }, { "epoch": 0.57, "eval_accuracy": 0.7240963855421687, "eval_loss": 0.6590266823768616, "eval_runtime": 4.9174, "eval_samples_per_second": 506.361, "eval_steps_per_second": 15.862, "step": 14000 }, { "epoch": 0.58, "learning_rate": 4.6602381275748114e-05, "loss": 0.6626, "step": 14250 }, { "epoch": 0.59, "learning_rate": 4.547059622436507e-05, "loss": 0.6677, "step": 14500 }, { "epoch": 0.59, "eval_accuracy": 0.7289156626506024, "eval_loss": 0.6861540079116821, "eval_runtime": 4.8286, "eval_samples_per_second": 515.672, "eval_steps_per_second": 16.154, "step": 14500 }, { "epoch": 0.6, "learning_rate": 4.433881117298203e-05, "loss": 0.6559, "step": 14750 }, { "epoch": 0.61, "learning_rate": 4.3207026121598985e-05, "loss": 0.6694, "step": 15000 }, { "epoch": 0.61, "eval_accuracy": 0.7261044176706827, "eval_loss": 0.6638771891593933, "eval_runtime": 4.8259, "eval_samples_per_second": 515.971, "eval_steps_per_second": 16.163, "step": 15000 }, { "epoch": 0.62, "learning_rate": 4.207524107021595e-05, "loss": 0.6623, "step": 15250 }, { "epoch": 0.63, "learning_rate": 4.09434560188329e-05, "loss": 0.6653, "step": 15500 }, { "epoch": 0.63, "eval_accuracy": 0.7128514056224899, "eval_loss": 0.6940041184425354, "eval_runtime": 4.9313, "eval_samples_per_second": 504.94, "eval_steps_per_second": 15.817, "step": 15500 }, { "epoch": 0.64, "learning_rate": 3.981167096744986e-05, "loss": 0.6343, "step": 15750 }, { "epoch": 0.65, "learning_rate": 3.8679885916066825e-05, "loss": 0.6636, "step": 16000 }, { "epoch": 0.65, "eval_accuracy": 0.7397590361445783, "eval_loss": 0.6664681434631348, "eval_runtime": 4.8293, "eval_samples_per_second": 515.605, "eval_steps_per_second": 16.151, "step": 16000 }, { "epoch": 0.66, "learning_rate": 3.754810086468378e-05, "loss": 0.6591, "step": 16250 }, { "epoch": 0.67, "learning_rate": 3.641631581330074e-05, "loss": 0.6394, "step": 16500 }, { "epoch": 0.67, "eval_accuracy": 0.7441767068273092, "eval_loss": 0.6528923511505127, "eval_runtime": 4.8454, "eval_samples_per_second": 513.891, "eval_steps_per_second": 16.098, "step": 16500 }, { "epoch": 0.68, "learning_rate": 3.52845307619177e-05, "loss": 0.6483, "step": 16750 }, { "epoch": 0.69, "learning_rate": 3.415274571053466e-05, "loss": 0.6545, "step": 17000 }, { "epoch": 0.69, "eval_accuracy": 0.7365461847389558, "eval_loss": 0.6435971260070801, "eval_runtime": 4.8617, "eval_samples_per_second": 512.163, "eval_steps_per_second": 16.044, "step": 17000 }, { "epoch": 0.7, "learning_rate": 3.302096065915162e-05, "loss": 0.6298, "step": 17250 }, { "epoch": 0.71, "learning_rate": 3.1889175607768574e-05, "loss": 0.6377, "step": 17500 }, { "epoch": 0.71, "eval_accuracy": 0.738152610441767, "eval_loss": 0.6470540165901184, "eval_runtime": 4.8301, "eval_samples_per_second": 515.517, "eval_steps_per_second": 16.149, "step": 17500 }, { "epoch": 0.72, "learning_rate": 3.0757390556385535e-05, "loss": 0.6441, "step": 17750 }, { "epoch": 0.73, "learning_rate": 2.962560550500249e-05, "loss": 0.6305, "step": 18000 }, { "epoch": 0.73, "eval_accuracy": 0.7248995983935743, "eval_loss": 0.6817350387573242, "eval_runtime": 4.833, "eval_samples_per_second": 515.206, "eval_steps_per_second": 16.139, "step": 18000 }, { "epoch": 0.74, "learning_rate": 2.849382045361945e-05, "loss": 0.62, "step": 18250 }, { "epoch": 0.75, "learning_rate": 2.7362035402236406e-05, "loss": 0.6345, "step": 18500 }, { "epoch": 0.75, "eval_accuracy": 0.7493975903614458, "eval_loss": 0.6382141709327698, "eval_runtime": 4.8844, "eval_samples_per_second": 509.786, "eval_steps_per_second": 15.969, "step": 18500 }, { "epoch": 0.76, "learning_rate": 2.6230250350853368e-05, "loss": 0.6236, "step": 18750 }, { "epoch": 0.77, "learning_rate": 2.5098465299470326e-05, "loss": 0.62, "step": 19000 }, { "epoch": 0.77, "eval_accuracy": 0.757429718875502, "eval_loss": 0.6400189399719238, "eval_runtime": 4.8308, "eval_samples_per_second": 515.438, "eval_steps_per_second": 16.146, "step": 19000 } ], "max_steps": 24544, "num_train_epochs": 1, "total_flos": 1.1363112060596352e+16, "trial_name": null, "trial_params": null }