{ "best_metric": 0.10501641035079956, "best_model_checkpoint": "./vit-base-lcdoctypev1_session3/checkpoint-335", "epoch": 10.0, "eval_steps": 5, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "eval_accuracy": 0.9090909090909091, "eval_loss": 0.3158922493457794, "eval_runtime": 9.7781, "eval_samples_per_second": 12.375, "eval_steps_per_second": 1.636, "step": 5 }, { "epoch": 0.17, "grad_norm": 6.890113353729248, "learning_rate": 0.00019666666666666666, "loss": 0.1798, "step": 10 }, { "epoch": 0.17, "eval_accuracy": 0.9338842975206612, "eval_loss": 0.22616656124591827, "eval_runtime": 8.5759, "eval_samples_per_second": 14.109, "eval_steps_per_second": 1.866, "step": 10 }, { "epoch": 0.25, "eval_accuracy": 0.7768595041322314, "eval_loss": 0.9910252690315247, "eval_runtime": 9.2477, "eval_samples_per_second": 13.084, "eval_steps_per_second": 1.73, "step": 15 }, { "epoch": 0.33, "grad_norm": 5.934413909912109, "learning_rate": 0.0001936666666666667, "loss": 0.3815, "step": 20 }, { "epoch": 0.33, "eval_accuracy": 0.9008264462809917, "eval_loss": 0.3035498857498169, "eval_runtime": 8.7721, "eval_samples_per_second": 13.794, "eval_steps_per_second": 1.824, "step": 20 }, { "epoch": 0.42, "eval_accuracy": 0.9338842975206612, "eval_loss": 0.21773552894592285, "eval_runtime": 9.0833, "eval_samples_per_second": 13.321, "eval_steps_per_second": 1.761, "step": 25 }, { "epoch": 0.5, "grad_norm": 0.3238607347011566, "learning_rate": 0.00019033333333333334, "loss": 0.1429, "step": 30 }, { "epoch": 0.5, "eval_accuracy": 0.8842975206611571, "eval_loss": 0.4908874034881592, "eval_runtime": 8.456, "eval_samples_per_second": 14.309, "eval_steps_per_second": 1.892, "step": 30 }, { "epoch": 0.58, "eval_accuracy": 0.9256198347107438, "eval_loss": 0.3096350133419037, "eval_runtime": 10.0423, "eval_samples_per_second": 12.049, "eval_steps_per_second": 1.593, "step": 35 }, { "epoch": 0.67, "grad_norm": 5.55199670791626, "learning_rate": 0.00018700000000000002, "loss": 0.2424, "step": 40 }, { "epoch": 0.67, "eval_accuracy": 0.9173553719008265, "eval_loss": 0.32702526450157166, "eval_runtime": 8.357, "eval_samples_per_second": 14.479, "eval_steps_per_second": 1.915, "step": 40 }, { "epoch": 0.75, "eval_accuracy": 0.9173553719008265, "eval_loss": 0.2554916441440582, "eval_runtime": 9.2174, "eval_samples_per_second": 13.127, "eval_steps_per_second": 1.736, "step": 45 }, { "epoch": 0.83, "grad_norm": 14.463412284851074, "learning_rate": 0.00018366666666666667, "loss": 0.1172, "step": 50 }, { "epoch": 0.83, "eval_accuracy": 0.9173553719008265, "eval_loss": 0.2309066504240036, "eval_runtime": 9.1271, "eval_samples_per_second": 13.257, "eval_steps_per_second": 1.753, "step": 50 }, { "epoch": 0.92, "eval_accuracy": 0.9173553719008265, "eval_loss": 0.2952496409416199, "eval_runtime": 9.6618, "eval_samples_per_second": 12.523, "eval_steps_per_second": 1.656, "step": 55 }, { "epoch": 1.0, "grad_norm": 0.9041020274162292, "learning_rate": 0.00018033333333333334, "loss": 0.1185, "step": 60 }, { "epoch": 1.0, "eval_accuracy": 0.9173553719008265, "eval_loss": 0.295705646276474, "eval_runtime": 8.6323, "eval_samples_per_second": 14.017, "eval_steps_per_second": 1.854, "step": 60 }, { "epoch": 1.08, "eval_accuracy": 0.8925619834710744, "eval_loss": 0.3724129796028137, "eval_runtime": 8.5924, "eval_samples_per_second": 14.082, "eval_steps_per_second": 1.862, "step": 65 }, { "epoch": 1.17, "grad_norm": 24.987598419189453, "learning_rate": 0.00017700000000000002, "loss": 0.1594, "step": 70 }, { "epoch": 1.17, "eval_accuracy": 0.8842975206611571, "eval_loss": 0.4216250777244568, "eval_runtime": 9.0834, "eval_samples_per_second": 13.321, "eval_steps_per_second": 1.761, "step": 70 }, { "epoch": 1.25, "eval_accuracy": 0.9173553719008265, "eval_loss": 0.347516268491745, "eval_runtime": 8.9447, "eval_samples_per_second": 13.528, "eval_steps_per_second": 1.789, "step": 75 }, { "epoch": 1.33, "grad_norm": 0.3783435523509979, "learning_rate": 0.00017366666666666667, "loss": 0.1231, "step": 80 }, { "epoch": 1.33, "eval_accuracy": 0.8925619834710744, "eval_loss": 0.323406845331192, "eval_runtime": 9.122, "eval_samples_per_second": 13.265, "eval_steps_per_second": 1.754, "step": 80 }, { "epoch": 1.42, "eval_accuracy": 0.8842975206611571, "eval_loss": 0.4309641718864441, "eval_runtime": 8.8847, "eval_samples_per_second": 13.619, "eval_steps_per_second": 1.801, "step": 85 }, { "epoch": 1.5, "grad_norm": 5.4656147956848145, "learning_rate": 0.00017033333333333334, "loss": 0.0875, "step": 90 }, { "epoch": 1.5, "eval_accuracy": 0.9256198347107438, "eval_loss": 0.3598105013370514, "eval_runtime": 9.086, "eval_samples_per_second": 13.317, "eval_steps_per_second": 1.761, "step": 90 }, { "epoch": 1.58, "eval_accuracy": 0.9256198347107438, "eval_loss": 0.3038037419319153, "eval_runtime": 8.8538, "eval_samples_per_second": 13.666, "eval_steps_per_second": 1.807, "step": 95 }, { "epoch": 1.67, "grad_norm": 0.0762423649430275, "learning_rate": 0.000167, "loss": 0.0897, "step": 100 }, { "epoch": 1.67, "eval_accuracy": 0.9338842975206612, "eval_loss": 0.25987809896469116, "eval_runtime": 8.7997, "eval_samples_per_second": 13.751, "eval_steps_per_second": 1.818, "step": 100 }, { "epoch": 1.75, "eval_accuracy": 0.9586776859504132, "eval_loss": 0.1683536171913147, "eval_runtime": 8.0037, "eval_samples_per_second": 15.118, "eval_steps_per_second": 1.999, "step": 105 }, { "epoch": 1.83, "grad_norm": 1.4948017597198486, "learning_rate": 0.00016366666666666667, "loss": 0.1797, "step": 110 }, { "epoch": 1.83, "eval_accuracy": 0.9504132231404959, "eval_loss": 0.1412244588136673, "eval_runtime": 8.3997, "eval_samples_per_second": 14.405, "eval_steps_per_second": 1.905, "step": 110 }, { "epoch": 1.92, "eval_accuracy": 0.9586776859504132, "eval_loss": 0.14531350135803223, "eval_runtime": 8.6407, "eval_samples_per_second": 14.003, "eval_steps_per_second": 1.852, "step": 115 }, { "epoch": 2.0, "grad_norm": 6.408501148223877, "learning_rate": 0.00016033333333333335, "loss": 0.1178, "step": 120 }, { "epoch": 2.0, "eval_accuracy": 0.8925619834710744, "eval_loss": 0.3830728530883789, "eval_runtime": 9.0064, "eval_samples_per_second": 13.435, "eval_steps_per_second": 1.777, "step": 120 }, { "epoch": 2.08, "eval_accuracy": 0.9090909090909091, "eval_loss": 0.3321413993835449, "eval_runtime": 8.8462, "eval_samples_per_second": 13.678, "eval_steps_per_second": 1.809, "step": 125 }, { "epoch": 2.17, "grad_norm": 0.7101590633392334, "learning_rate": 0.00015700000000000002, "loss": 0.1969, "step": 130 }, { "epoch": 2.17, "eval_accuracy": 0.9090909090909091, "eval_loss": 0.25461918115615845, "eval_runtime": 8.969, "eval_samples_per_second": 13.491, "eval_steps_per_second": 1.784, "step": 130 }, { "epoch": 2.25, "eval_accuracy": 0.9504132231404959, "eval_loss": 0.18391890823841095, "eval_runtime": 8.644, "eval_samples_per_second": 13.998, "eval_steps_per_second": 1.851, "step": 135 }, { "epoch": 2.33, "grad_norm": 0.9350224733352661, "learning_rate": 0.00015366666666666667, "loss": 0.0362, "step": 140 }, { "epoch": 2.33, "eval_accuracy": 0.9586776859504132, "eval_loss": 0.20266053080558777, "eval_runtime": 8.5546, "eval_samples_per_second": 14.144, "eval_steps_per_second": 1.87, "step": 140 }, { "epoch": 2.42, "eval_accuracy": 0.9090909090909091, "eval_loss": 0.28766530752182007, "eval_runtime": 8.0124, "eval_samples_per_second": 15.102, "eval_steps_per_second": 1.997, "step": 145 }, { "epoch": 2.5, "grad_norm": 8.35146427154541, "learning_rate": 0.00015033333333333335, "loss": 0.1047, "step": 150 }, { "epoch": 2.5, "eval_accuracy": 0.8925619834710744, "eval_loss": 0.4503512978553772, "eval_runtime": 8.454, "eval_samples_per_second": 14.313, "eval_steps_per_second": 1.893, "step": 150 }, { "epoch": 2.58, "eval_accuracy": 0.9504132231404959, "eval_loss": 0.1810603141784668, "eval_runtime": 9.2694, "eval_samples_per_second": 13.054, "eval_steps_per_second": 1.726, "step": 155 }, { "epoch": 2.67, "grad_norm": 0.6059785485267639, "learning_rate": 0.000147, "loss": 0.1232, "step": 160 }, { "epoch": 2.67, "eval_accuracy": 0.9421487603305785, "eval_loss": 0.21074515581130981, "eval_runtime": 8.8489, "eval_samples_per_second": 13.674, "eval_steps_per_second": 1.808, "step": 160 }, { "epoch": 2.75, "eval_accuracy": 0.9504132231404959, "eval_loss": 0.20863419771194458, "eval_runtime": 8.9684, "eval_samples_per_second": 13.492, "eval_steps_per_second": 1.784, "step": 165 }, { "epoch": 2.83, "grad_norm": 0.04269757494330406, "learning_rate": 0.00014366666666666667, "loss": 0.0611, "step": 170 }, { "epoch": 2.83, "eval_accuracy": 0.9338842975206612, "eval_loss": 0.2971450686454773, "eval_runtime": 9.1231, "eval_samples_per_second": 13.263, "eval_steps_per_second": 1.754, "step": 170 }, { "epoch": 2.92, "eval_accuracy": 0.9338842975206612, "eval_loss": 0.2731765806674957, "eval_runtime": 8.9974, "eval_samples_per_second": 13.448, "eval_steps_per_second": 1.778, "step": 175 }, { "epoch": 3.0, "grad_norm": 0.9442762732505798, "learning_rate": 0.00014033333333333335, "loss": 0.0815, "step": 180 }, { "epoch": 3.0, "eval_accuracy": 0.9586776859504132, "eval_loss": 0.16794723272323608, "eval_runtime": 8.9455, "eval_samples_per_second": 13.526, "eval_steps_per_second": 1.789, "step": 180 }, { "epoch": 3.08, "eval_accuracy": 0.9338842975206612, "eval_loss": 0.24155646562576294, "eval_runtime": 8.3613, "eval_samples_per_second": 14.471, "eval_steps_per_second": 1.914, "step": 185 }, { "epoch": 3.17, "grad_norm": 2.08608078956604, "learning_rate": 0.00013700000000000002, "loss": 0.0469, "step": 190 }, { "epoch": 3.17, "eval_accuracy": 0.9256198347107438, "eval_loss": 0.29269692301750183, "eval_runtime": 8.9407, "eval_samples_per_second": 13.534, "eval_steps_per_second": 1.79, "step": 190 }, { "epoch": 3.25, "eval_accuracy": 0.9338842975206612, "eval_loss": 0.28314581513404846, "eval_runtime": 8.7918, "eval_samples_per_second": 13.763, "eval_steps_per_second": 1.82, "step": 195 }, { "epoch": 3.33, "grad_norm": 0.8798501491546631, "learning_rate": 0.00013366666666666667, "loss": 0.0443, "step": 200 }, { "epoch": 3.33, "eval_accuracy": 0.9421487603305785, "eval_loss": 0.2744951546192169, "eval_runtime": 8.612, "eval_samples_per_second": 14.05, "eval_steps_per_second": 1.858, "step": 200 }, { "epoch": 3.42, "eval_accuracy": 0.8925619834710744, "eval_loss": 0.4193201959133148, "eval_runtime": 8.9147, "eval_samples_per_second": 13.573, "eval_steps_per_second": 1.795, "step": 205 }, { "epoch": 3.5, "grad_norm": 0.03738969564437866, "learning_rate": 0.00013033333333333332, "loss": 0.0823, "step": 210 }, { "epoch": 3.5, "eval_accuracy": 0.9173553719008265, "eval_loss": 0.3746081292629242, "eval_runtime": 8.5854, "eval_samples_per_second": 14.094, "eval_steps_per_second": 1.864, "step": 210 }, { "epoch": 3.58, "eval_accuracy": 0.9421487603305785, "eval_loss": 0.30296453833580017, "eval_runtime": 8.8651, "eval_samples_per_second": 13.649, "eval_steps_per_second": 1.805, "step": 215 }, { "epoch": 3.67, "grad_norm": 0.03203318268060684, "learning_rate": 0.000127, "loss": 0.0101, "step": 220 }, { "epoch": 3.67, "eval_accuracy": 0.9504132231404959, "eval_loss": 0.21464580297470093, "eval_runtime": 8.8029, "eval_samples_per_second": 13.745, "eval_steps_per_second": 1.818, "step": 220 }, { "epoch": 3.75, "eval_accuracy": 0.9421487603305785, "eval_loss": 0.2514008581638336, "eval_runtime": 9.2073, "eval_samples_per_second": 13.142, "eval_steps_per_second": 1.738, "step": 225 }, { "epoch": 3.83, "grad_norm": 0.04610053077340126, "learning_rate": 0.00012366666666666667, "loss": 0.16, "step": 230 }, { "epoch": 3.83, "eval_accuracy": 0.9421487603305785, "eval_loss": 0.25517505407333374, "eval_runtime": 8.8885, "eval_samples_per_second": 13.613, "eval_steps_per_second": 1.8, "step": 230 }, { "epoch": 3.92, "eval_accuracy": 0.9421487603305785, "eval_loss": 0.22389596700668335, "eval_runtime": 8.5203, "eval_samples_per_second": 14.201, "eval_steps_per_second": 1.878, "step": 235 }, { "epoch": 4.0, "grad_norm": 3.5699214935302734, "learning_rate": 0.00012033333333333335, "loss": 0.1687, "step": 240 }, { "epoch": 4.0, "eval_accuracy": 0.9256198347107438, "eval_loss": 0.25712552666664124, "eval_runtime": 8.7329, "eval_samples_per_second": 13.856, "eval_steps_per_second": 1.832, "step": 240 }, { "epoch": 4.08, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.13568438589572906, "eval_runtime": 8.642, "eval_samples_per_second": 14.001, "eval_steps_per_second": 1.851, "step": 245 }, { "epoch": 4.17, "grad_norm": 2.446256637573242, "learning_rate": 0.000117, "loss": 0.0758, "step": 250 }, { "epoch": 4.17, "eval_accuracy": 0.9504132231404959, "eval_loss": 0.17341962456703186, "eval_runtime": 8.0872, "eval_samples_per_second": 14.962, "eval_steps_per_second": 1.978, "step": 250 }, { "epoch": 4.25, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.11965644359588623, "eval_runtime": 9.0168, "eval_samples_per_second": 13.419, "eval_steps_per_second": 1.774, "step": 255 }, { "epoch": 4.33, "grad_norm": 0.048665851354599, "learning_rate": 0.00011366666666666667, "loss": 0.042, "step": 260 }, { "epoch": 4.33, "eval_accuracy": 0.9421487603305785, "eval_loss": 0.23387375473976135, "eval_runtime": 8.7974, "eval_samples_per_second": 13.754, "eval_steps_per_second": 1.819, "step": 260 }, { "epoch": 4.42, "eval_accuracy": 0.9173553719008265, "eval_loss": 0.2923980951309204, "eval_runtime": 8.7236, "eval_samples_per_second": 13.87, "eval_steps_per_second": 1.834, "step": 265 }, { "epoch": 4.5, "grad_norm": 0.07859846204519272, "learning_rate": 0.00011033333333333334, "loss": 0.0114, "step": 270 }, { "epoch": 4.5, "eval_accuracy": 0.9504132231404959, "eval_loss": 0.23183898627758026, "eval_runtime": 8.7284, "eval_samples_per_second": 13.863, "eval_steps_per_second": 1.833, "step": 270 }, { "epoch": 4.58, "eval_accuracy": 0.9586776859504132, "eval_loss": 0.17654092609882355, "eval_runtime": 8.8357, "eval_samples_per_second": 13.694, "eval_steps_per_second": 1.811, "step": 275 }, { "epoch": 4.67, "grad_norm": 0.266812801361084, "learning_rate": 0.00010700000000000001, "loss": 0.0197, "step": 280 }, { "epoch": 4.67, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.12631382048130035, "eval_runtime": 7.9448, "eval_samples_per_second": 15.23, "eval_steps_per_second": 2.014, "step": 280 }, { "epoch": 4.75, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.12528358399868011, "eval_runtime": 8.1841, "eval_samples_per_second": 14.785, "eval_steps_per_second": 1.955, "step": 285 }, { "epoch": 4.83, "grad_norm": 0.8625770807266235, "learning_rate": 0.00010366666666666666, "loss": 0.0283, "step": 290 }, { "epoch": 4.83, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.1239379420876503, "eval_runtime": 9.1766, "eval_samples_per_second": 13.186, "eval_steps_per_second": 1.744, "step": 290 }, { "epoch": 4.92, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.12782499194145203, "eval_runtime": 8.9807, "eval_samples_per_second": 13.473, "eval_steps_per_second": 1.782, "step": 295 }, { "epoch": 5.0, "grad_norm": 0.08663175255060196, "learning_rate": 0.00010033333333333335, "loss": 0.1115, "step": 300 }, { "epoch": 5.0, "eval_accuracy": 0.9338842975206612, "eval_loss": 0.2527827322483063, "eval_runtime": 8.5733, "eval_samples_per_second": 14.114, "eval_steps_per_second": 1.866, "step": 300 }, { "epoch": 5.08, "eval_accuracy": 0.9338842975206612, "eval_loss": 0.3164093792438507, "eval_runtime": 8.5647, "eval_samples_per_second": 14.128, "eval_steps_per_second": 1.868, "step": 305 }, { "epoch": 5.17, "grad_norm": 0.11408742517232895, "learning_rate": 9.7e-05, "loss": 0.0404, "step": 310 }, { "epoch": 5.17, "eval_accuracy": 0.9338842975206612, "eval_loss": 0.2841833829879761, "eval_runtime": 8.7395, "eval_samples_per_second": 13.845, "eval_steps_per_second": 1.831, "step": 310 }, { "epoch": 5.25, "eval_accuracy": 0.9504132231404959, "eval_loss": 0.17133790254592896, "eval_runtime": 8.8895, "eval_samples_per_second": 13.612, "eval_steps_per_second": 1.8, "step": 315 }, { "epoch": 5.33, "grad_norm": 0.08482904732227325, "learning_rate": 9.366666666666668e-05, "loss": 0.0719, "step": 320 }, { "epoch": 5.33, "eval_accuracy": 0.9338842975206612, "eval_loss": 0.18959270417690277, "eval_runtime": 8.641, "eval_samples_per_second": 14.003, "eval_steps_per_second": 1.852, "step": 320 }, { "epoch": 5.42, "eval_accuracy": 0.9256198347107438, "eval_loss": 0.18550463020801544, "eval_runtime": 8.219, "eval_samples_per_second": 14.722, "eval_steps_per_second": 1.947, "step": 325 }, { "epoch": 5.5, "grad_norm": 4.354619026184082, "learning_rate": 9.033333333333334e-05, "loss": 0.0435, "step": 330 }, { "epoch": 5.5, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.15409986674785614, "eval_runtime": 8.5474, "eval_samples_per_second": 14.156, "eval_steps_per_second": 1.872, "step": 330 }, { "epoch": 5.58, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.10501641035079956, "eval_runtime": 8.6962, "eval_samples_per_second": 13.914, "eval_steps_per_second": 1.84, "step": 335 }, { "epoch": 5.67, "grad_norm": 0.05306649208068848, "learning_rate": 8.7e-05, "loss": 0.0129, "step": 340 }, { "epoch": 5.67, "eval_accuracy": 0.9586776859504132, "eval_loss": 0.10632016509771347, "eval_runtime": 8.7849, "eval_samples_per_second": 13.774, "eval_steps_per_second": 1.821, "step": 340 }, { "epoch": 5.75, "eval_accuracy": 0.9586776859504132, "eval_loss": 0.11378511786460876, "eval_runtime": 8.0973, "eval_samples_per_second": 14.943, "eval_steps_per_second": 1.976, "step": 345 }, { "epoch": 5.83, "grad_norm": 0.02987060882151127, "learning_rate": 8.366666666666668e-05, "loss": 0.0222, "step": 350 }, { "epoch": 5.83, "eval_accuracy": 0.9586776859504132, "eval_loss": 0.11444854736328125, "eval_runtime": 8.7513, "eval_samples_per_second": 13.827, "eval_steps_per_second": 1.828, "step": 350 }, { "epoch": 5.92, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.12378235161304474, "eval_runtime": 9.2818, "eval_samples_per_second": 13.036, "eval_steps_per_second": 1.724, "step": 355 }, { "epoch": 6.0, "grad_norm": 0.02307078428566456, "learning_rate": 8.033333333333334e-05, "loss": 0.0431, "step": 360 }, { "epoch": 6.0, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.1342514306306839, "eval_runtime": 7.9783, "eval_samples_per_second": 15.166, "eval_steps_per_second": 2.005, "step": 360 }, { "epoch": 6.08, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.144140362739563, "eval_runtime": 8.763, "eval_samples_per_second": 13.808, "eval_steps_per_second": 1.826, "step": 365 }, { "epoch": 6.17, "grad_norm": 0.016979066655039787, "learning_rate": 7.7e-05, "loss": 0.0064, "step": 370 }, { "epoch": 6.17, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.1470753401517868, "eval_runtime": 8.9895, "eval_samples_per_second": 13.46, "eval_steps_per_second": 1.78, "step": 370 }, { "epoch": 6.25, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.1360587477684021, "eval_runtime": 8.9513, "eval_samples_per_second": 13.518, "eval_steps_per_second": 1.787, "step": 375 }, { "epoch": 6.33, "grad_norm": 0.022434255108237267, "learning_rate": 7.366666666666668e-05, "loss": 0.0576, "step": 380 }, { "epoch": 6.33, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.13161548972129822, "eval_runtime": 8.7711, "eval_samples_per_second": 13.795, "eval_steps_per_second": 1.824, "step": 380 }, { "epoch": 6.42, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.12319940328598022, "eval_runtime": 8.3275, "eval_samples_per_second": 14.53, "eval_steps_per_second": 1.921, "step": 385 }, { "epoch": 6.5, "grad_norm": 0.2556796371936798, "learning_rate": 7.033333333333334e-05, "loss": 0.0298, "step": 390 }, { "epoch": 6.5, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.1254538893699646, "eval_runtime": 8.6034, "eval_samples_per_second": 14.064, "eval_steps_per_second": 1.86, "step": 390 }, { "epoch": 6.58, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.13591305911540985, "eval_runtime": 8.8244, "eval_samples_per_second": 13.712, "eval_steps_per_second": 1.813, "step": 395 }, { "epoch": 6.67, "grad_norm": 0.03436155617237091, "learning_rate": 6.7e-05, "loss": 0.0097, "step": 400 }, { "epoch": 6.67, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.1434980034828186, "eval_runtime": 9.1677, "eval_samples_per_second": 13.199, "eval_steps_per_second": 1.745, "step": 400 }, { "epoch": 6.75, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.14506025612354279, "eval_runtime": 8.7551, "eval_samples_per_second": 13.82, "eval_steps_per_second": 1.827, "step": 405 }, { "epoch": 6.83, "grad_norm": 0.019292179495096207, "learning_rate": 6.366666666666668e-05, "loss": 0.0153, "step": 410 }, { "epoch": 6.83, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.14391547441482544, "eval_runtime": 8.6401, "eval_samples_per_second": 14.004, "eval_steps_per_second": 1.852, "step": 410 }, { "epoch": 6.92, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.1352916657924652, "eval_runtime": 8.9781, "eval_samples_per_second": 13.477, "eval_steps_per_second": 1.782, "step": 415 }, { "epoch": 7.0, "grad_norm": 0.1764516532421112, "learning_rate": 6.033333333333334e-05, "loss": 0.0406, "step": 420 }, { "epoch": 7.0, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.13157208263874054, "eval_runtime": 8.4013, "eval_samples_per_second": 14.402, "eval_steps_per_second": 1.904, "step": 420 }, { "epoch": 7.08, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.13093091547489166, "eval_runtime": 9.2645, "eval_samples_per_second": 13.061, "eval_steps_per_second": 1.727, "step": 425 }, { "epoch": 7.17, "grad_norm": 0.025451194494962692, "learning_rate": 5.6999999999999996e-05, "loss": 0.0154, "step": 430 }, { "epoch": 7.17, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.13050581514835358, "eval_runtime": 8.9669, "eval_samples_per_second": 13.494, "eval_steps_per_second": 1.784, "step": 430 }, { "epoch": 7.25, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.13096679747104645, "eval_runtime": 8.8412, "eval_samples_per_second": 13.686, "eval_steps_per_second": 1.81, "step": 435 }, { "epoch": 7.33, "grad_norm": 0.01770775578916073, "learning_rate": 5.3666666666666666e-05, "loss": 0.0209, "step": 440 }, { "epoch": 7.33, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.13012637197971344, "eval_runtime": 8.8578, "eval_samples_per_second": 13.66, "eval_steps_per_second": 1.806, "step": 440 }, { "epoch": 7.42, "eval_accuracy": 0.9586776859504132, "eval_loss": 0.145931214094162, "eval_runtime": 8.6371, "eval_samples_per_second": 14.009, "eval_steps_per_second": 1.852, "step": 445 }, { "epoch": 7.5, "grad_norm": 17.443572998046875, "learning_rate": 5.0333333333333335e-05, "loss": 0.0298, "step": 450 }, { "epoch": 7.5, "eval_accuracy": 0.9586776859504132, "eval_loss": 0.16629938781261444, "eval_runtime": 8.2752, "eval_samples_per_second": 14.622, "eval_steps_per_second": 1.933, "step": 450 }, { "epoch": 7.58, "eval_accuracy": 0.9586776859504132, "eval_loss": 0.15594066679477692, "eval_runtime": 8.806, "eval_samples_per_second": 13.741, "eval_steps_per_second": 1.817, "step": 455 }, { "epoch": 7.67, "grad_norm": 0.030463455244898796, "learning_rate": 4.7e-05, "loss": 0.0052, "step": 460 }, { "epoch": 7.67, "eval_accuracy": 0.9586776859504132, "eval_loss": 0.15159618854522705, "eval_runtime": 8.9863, "eval_samples_per_second": 13.465, "eval_steps_per_second": 1.78, "step": 460 }, { "epoch": 7.75, "eval_accuracy": 0.9586776859504132, "eval_loss": 0.13964863121509552, "eval_runtime": 8.4469, "eval_samples_per_second": 14.325, "eval_steps_per_second": 1.894, "step": 465 }, { "epoch": 7.83, "grad_norm": 0.11391649395227432, "learning_rate": 4.3666666666666666e-05, "loss": 0.0172, "step": 470 }, { "epoch": 7.83, "eval_accuracy": 0.9586776859504132, "eval_loss": 0.13303633034229279, "eval_runtime": 8.9949, "eval_samples_per_second": 13.452, "eval_steps_per_second": 1.779, "step": 470 }, { "epoch": 7.92, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.12355250120162964, "eval_runtime": 8.1109, "eval_samples_per_second": 14.918, "eval_steps_per_second": 1.973, "step": 475 }, { "epoch": 8.0, "grad_norm": 0.059335533529520035, "learning_rate": 4.0333333333333336e-05, "loss": 0.0348, "step": 480 }, { "epoch": 8.0, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.12098132073879242, "eval_runtime": 8.8837, "eval_samples_per_second": 13.621, "eval_steps_per_second": 1.801, "step": 480 }, { "epoch": 8.08, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.11751802265644073, "eval_runtime": 9.8045, "eval_samples_per_second": 12.341, "eval_steps_per_second": 1.632, "step": 485 }, { "epoch": 8.17, "grad_norm": 0.019469719380140305, "learning_rate": 3.7e-05, "loss": 0.0068, "step": 490 }, { "epoch": 8.17, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.1185019463300705, "eval_runtime": 8.8757, "eval_samples_per_second": 13.633, "eval_steps_per_second": 1.803, "step": 490 }, { "epoch": 8.25, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.12291049212217331, "eval_runtime": 8.9999, "eval_samples_per_second": 13.445, "eval_steps_per_second": 1.778, "step": 495 }, { "epoch": 8.33, "grad_norm": 0.029128719121217728, "learning_rate": 3.366666666666667e-05, "loss": 0.0305, "step": 500 }, { "epoch": 8.33, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.12297818809747696, "eval_runtime": 9.0782, "eval_samples_per_second": 13.329, "eval_steps_per_second": 1.762, "step": 500 }, { "epoch": 8.42, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.12048203498125076, "eval_runtime": 8.2377, "eval_samples_per_second": 14.688, "eval_steps_per_second": 1.942, "step": 505 }, { "epoch": 8.5, "grad_norm": 0.01591232791543007, "learning_rate": 3.0333333333333337e-05, "loss": 0.0154, "step": 510 }, { "epoch": 8.5, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.11965296417474747, "eval_runtime": 9.3646, "eval_samples_per_second": 12.921, "eval_steps_per_second": 1.709, "step": 510 }, { "epoch": 8.58, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.12173377722501755, "eval_runtime": 9.3963, "eval_samples_per_second": 12.877, "eval_steps_per_second": 1.703, "step": 515 }, { "epoch": 8.67, "grad_norm": 0.02258380502462387, "learning_rate": 2.7000000000000002e-05, "loss": 0.0177, "step": 520 }, { "epoch": 8.67, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.12387024611234665, "eval_runtime": 9.1961, "eval_samples_per_second": 13.158, "eval_steps_per_second": 1.74, "step": 520 }, { "epoch": 8.75, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.12437601387500763, "eval_runtime": 9.196, "eval_samples_per_second": 13.158, "eval_steps_per_second": 1.74, "step": 525 }, { "epoch": 8.83, "grad_norm": 0.020360412076115608, "learning_rate": 2.3666666666666668e-05, "loss": 0.0123, "step": 530 }, { "epoch": 8.83, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.12708893418312073, "eval_runtime": 8.8063, "eval_samples_per_second": 13.74, "eval_steps_per_second": 1.817, "step": 530 }, { "epoch": 8.92, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.1300380975008011, "eval_runtime": 8.9605, "eval_samples_per_second": 13.504, "eval_steps_per_second": 1.786, "step": 535 }, { "epoch": 9.0, "grad_norm": 0.6857224702835083, "learning_rate": 2.0333333333333334e-05, "loss": 0.0154, "step": 540 }, { "epoch": 9.0, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.13137011229991913, "eval_runtime": 8.8274, "eval_samples_per_second": 13.707, "eval_steps_per_second": 1.813, "step": 540 }, { "epoch": 9.08, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.1295723170042038, "eval_runtime": 8.8666, "eval_samples_per_second": 13.647, "eval_steps_per_second": 1.805, "step": 545 }, { "epoch": 9.17, "grad_norm": 0.4675326943397522, "learning_rate": 1.7000000000000003e-05, "loss": 0.0331, "step": 550 }, { "epoch": 9.17, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.12511087954044342, "eval_runtime": 9.3211, "eval_samples_per_second": 12.981, "eval_steps_per_second": 1.717, "step": 550 }, { "epoch": 9.25, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.12691423296928406, "eval_runtime": 9.1389, "eval_samples_per_second": 13.24, "eval_steps_per_second": 1.751, "step": 555 }, { "epoch": 9.33, "grad_norm": 0.02136796899139881, "learning_rate": 1.3666666666666666e-05, "loss": 0.0196, "step": 560 }, { "epoch": 9.33, "eval_accuracy": 0.9752066115702479, "eval_loss": 0.12836582958698273, "eval_runtime": 8.6274, "eval_samples_per_second": 14.025, "eval_steps_per_second": 1.855, "step": 560 }, { "epoch": 9.42, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.1298026293516159, "eval_runtime": 8.3157, "eval_samples_per_second": 14.551, "eval_steps_per_second": 1.924, "step": 565 }, { "epoch": 9.5, "grad_norm": 0.014311849139630795, "learning_rate": 1.0333333333333333e-05, "loss": 0.0058, "step": 570 }, { "epoch": 9.5, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.13134212791919708, "eval_runtime": 9.1247, "eval_samples_per_second": 13.261, "eval_steps_per_second": 1.753, "step": 570 }, { "epoch": 9.58, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.13212385773658752, "eval_runtime": 8.6468, "eval_samples_per_second": 13.994, "eval_steps_per_second": 1.85, "step": 575 }, { "epoch": 9.67, "grad_norm": 0.4894683361053467, "learning_rate": 7.000000000000001e-06, "loss": 0.012, "step": 580 }, { "epoch": 9.67, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.1326775997877121, "eval_runtime": 9.2181, "eval_samples_per_second": 13.126, "eval_steps_per_second": 1.736, "step": 580 }, { "epoch": 9.75, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.13264916837215424, "eval_runtime": 8.9299, "eval_samples_per_second": 13.55, "eval_steps_per_second": 1.792, "step": 585 }, { "epoch": 9.83, "grad_norm": 0.015242637135088444, "learning_rate": 3.666666666666667e-06, "loss": 0.0081, "step": 590 }, { "epoch": 9.83, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.13293592631816864, "eval_runtime": 9.2123, "eval_samples_per_second": 13.135, "eval_steps_per_second": 1.737, "step": 590 }, { "epoch": 9.92, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.13364511728286743, "eval_runtime": 8.9976, "eval_samples_per_second": 13.448, "eval_steps_per_second": 1.778, "step": 595 }, { "epoch": 10.0, "grad_norm": 0.014774391427636147, "learning_rate": 3.3333333333333335e-07, "loss": 0.0083, "step": 600 }, { "epoch": 10.0, "eval_accuracy": 0.9669421487603306, "eval_loss": 0.1337580680847168, "eval_runtime": 8.6721, "eval_samples_per_second": 13.953, "eval_steps_per_second": 1.845, "step": 600 }, { "epoch": 10.0, "step": 600, "total_flos": 7.39286832673751e+17, "train_loss": 0.06743512197087208, "train_runtime": 3546.8991, "train_samples_per_second": 2.69, "train_steps_per_second": 0.169 } ], "logging_steps": 10, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5, "total_flos": 7.39286832673751e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }