{ "best_metric": 0.4315283000469208, "best_model_checkpoint": "xblock-large-patch2-224/checkpoint-5181", "epoch": 3.0, "eval_steps": 500, "global_step": 5181, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 19.092004776000977, "learning_rate": 2.2157996146435453e-06, "loss": 2.5421, "step": 25 }, { "epoch": 0.03, "grad_norm": 11.274575233459473, "learning_rate": 4.624277456647399e-06, "loss": 1.9399, "step": 50 }, { "epoch": 0.04, "grad_norm": 10.260701179504395, "learning_rate": 7.032755298651253e-06, "loss": 1.5011, "step": 75 }, { "epoch": 0.06, "grad_norm": 9.719381332397461, "learning_rate": 9.441233140655107e-06, "loss": 1.3912, "step": 100 }, { "epoch": 0.07, "grad_norm": 10.234914779663086, "learning_rate": 1.184971098265896e-05, "loss": 1.2166, "step": 125 }, { "epoch": 0.09, "grad_norm": 13.722230911254883, "learning_rate": 1.4258188824662813e-05, "loss": 1.1751, "step": 150 }, { "epoch": 0.1, "grad_norm": 17.890995025634766, "learning_rate": 1.6666666666666667e-05, "loss": 1.1862, "step": 175 }, { "epoch": 0.12, "grad_norm": 13.72986125946045, "learning_rate": 1.907514450867052e-05, "loss": 0.8828, "step": 200 }, { "epoch": 0.13, "grad_norm": 12.903642654418945, "learning_rate": 2.1483622350674377e-05, "loss": 0.9541, "step": 225 }, { "epoch": 0.14, "grad_norm": 8.937955856323242, "learning_rate": 2.3892100192678228e-05, "loss": 0.9032, "step": 250 }, { "epoch": 0.16, "grad_norm": 10.743949890136719, "learning_rate": 2.6300578034682083e-05, "loss": 1.0891, "step": 275 }, { "epoch": 0.17, "grad_norm": 9.766569137573242, "learning_rate": 2.8709055876685937e-05, "loss": 0.8951, "step": 300 }, { "epoch": 0.19, "grad_norm": 10.973127365112305, "learning_rate": 3.111753371868979e-05, "loss": 1.0709, "step": 325 }, { "epoch": 0.2, "grad_norm": 13.805391311645508, "learning_rate": 3.352601156069364e-05, "loss": 1.1141, "step": 350 }, { "epoch": 0.22, "grad_norm": 7.355842113494873, "learning_rate": 3.59344894026975e-05, "loss": 1.0031, "step": 375 }, { "epoch": 0.23, "grad_norm": 10.688377380371094, "learning_rate": 3.834296724470135e-05, "loss": 1.0344, "step": 400 }, { "epoch": 0.25, "grad_norm": 8.896137237548828, "learning_rate": 4.07514450867052e-05, "loss": 1.0991, "step": 425 }, { "epoch": 0.26, "grad_norm": 11.027874946594238, "learning_rate": 4.3159922928709055e-05, "loss": 1.0221, "step": 450 }, { "epoch": 0.28, "grad_norm": 27.792613983154297, "learning_rate": 4.556840077071291e-05, "loss": 0.8256, "step": 475 }, { "epoch": 0.29, "grad_norm": 8.005478858947754, "learning_rate": 4.7976878612716764e-05, "loss": 0.9862, "step": 500 }, { "epoch": 0.3, "grad_norm": 11.962843894958496, "learning_rate": 4.995709995709996e-05, "loss": 1.0462, "step": 525 }, { "epoch": 0.32, "grad_norm": 7.288177013397217, "learning_rate": 4.9688974688974696e-05, "loss": 0.9999, "step": 550 }, { "epoch": 0.33, "grad_norm": 3.769800901412964, "learning_rate": 4.9420849420849425e-05, "loss": 1.1262, "step": 575 }, { "epoch": 0.35, "grad_norm": 8.510008811950684, "learning_rate": 4.9152724152724154e-05, "loss": 0.944, "step": 600 }, { "epoch": 0.36, "grad_norm": 8.264263153076172, "learning_rate": 4.888459888459889e-05, "loss": 0.9652, "step": 625 }, { "epoch": 0.38, "grad_norm": 10.604584693908691, "learning_rate": 4.861647361647362e-05, "loss": 0.9246, "step": 650 }, { "epoch": 0.39, "grad_norm": 5.483927249908447, "learning_rate": 4.834834834834835e-05, "loss": 1.0557, "step": 675 }, { "epoch": 0.41, "grad_norm": 10.221104621887207, "learning_rate": 4.808022308022308e-05, "loss": 0.7906, "step": 700 }, { "epoch": 0.42, "grad_norm": 5.725340843200684, "learning_rate": 4.781209781209782e-05, "loss": 0.9, "step": 725 }, { "epoch": 0.43, "grad_norm": 7.057939052581787, "learning_rate": 4.754397254397255e-05, "loss": 0.8744, "step": 750 }, { "epoch": 0.45, "grad_norm": 9.354517936706543, "learning_rate": 4.727584727584728e-05, "loss": 1.0611, "step": 775 }, { "epoch": 0.46, "grad_norm": 4.2830491065979, "learning_rate": 4.700772200772201e-05, "loss": 0.8894, "step": 800 }, { "epoch": 0.48, "grad_norm": 10.702705383300781, "learning_rate": 4.673959673959674e-05, "loss": 1.0246, "step": 825 }, { "epoch": 0.49, "grad_norm": 4.7863359451293945, "learning_rate": 4.647147147147147e-05, "loss": 0.7967, "step": 850 }, { "epoch": 0.51, "grad_norm": 7.841278553009033, "learning_rate": 4.62033462033462e-05, "loss": 0.8616, "step": 875 }, { "epoch": 0.52, "grad_norm": 6.256266117095947, "learning_rate": 4.593522093522094e-05, "loss": 0.9735, "step": 900 }, { "epoch": 0.54, "grad_norm": 6.177362442016602, "learning_rate": 4.566709566709567e-05, "loss": 0.8287, "step": 925 }, { "epoch": 0.55, "grad_norm": 6.448288917541504, "learning_rate": 4.53989703989704e-05, "loss": 1.0062, "step": 950 }, { "epoch": 0.56, "grad_norm": 7.912018299102783, "learning_rate": 4.513084513084513e-05, "loss": 1.0174, "step": 975 }, { "epoch": 0.58, "grad_norm": 9.055561065673828, "learning_rate": 4.486271986271987e-05, "loss": 0.9616, "step": 1000 }, { "epoch": 0.59, "grad_norm": 7.433628559112549, "learning_rate": 4.4594594594594596e-05, "loss": 0.9309, "step": 1025 }, { "epoch": 0.61, "grad_norm": 3.5334954261779785, "learning_rate": 4.4326469326469325e-05, "loss": 0.807, "step": 1050 }, { "epoch": 0.62, "grad_norm": 5.620259761810303, "learning_rate": 4.405834405834406e-05, "loss": 0.8042, "step": 1075 }, { "epoch": 0.64, "grad_norm": 7.67726469039917, "learning_rate": 4.379021879021879e-05, "loss": 0.7394, "step": 1100 }, { "epoch": 0.65, "grad_norm": 9.430630683898926, "learning_rate": 4.3522093522093526e-05, "loss": 0.7895, "step": 1125 }, { "epoch": 0.67, "grad_norm": 9.241034507751465, "learning_rate": 4.3253968253968256e-05, "loss": 0.8032, "step": 1150 }, { "epoch": 0.68, "grad_norm": 7.471988201141357, "learning_rate": 4.298584298584299e-05, "loss": 0.7669, "step": 1175 }, { "epoch": 0.69, "grad_norm": 3.161353588104248, "learning_rate": 4.271771771771772e-05, "loss": 0.8795, "step": 1200 }, { "epoch": 0.71, "grad_norm": 6.6813201904296875, "learning_rate": 4.244959244959245e-05, "loss": 0.819, "step": 1225 }, { "epoch": 0.72, "grad_norm": 16.08786392211914, "learning_rate": 4.2181467181467186e-05, "loss": 0.8779, "step": 1250 }, { "epoch": 0.74, "grad_norm": 3.753849506378174, "learning_rate": 4.1913341913341915e-05, "loss": 0.9255, "step": 1275 }, { "epoch": 0.75, "grad_norm": 5.4661431312561035, "learning_rate": 4.1645216645216644e-05, "loss": 0.8028, "step": 1300 }, { "epoch": 0.77, "grad_norm": 6.559650897979736, "learning_rate": 4.137709137709138e-05, "loss": 0.7556, "step": 1325 }, { "epoch": 0.78, "grad_norm": 5.7179341316223145, "learning_rate": 4.1108966108966116e-05, "loss": 0.7076, "step": 1350 }, { "epoch": 0.8, "grad_norm": 12.687734603881836, "learning_rate": 4.0840840840840845e-05, "loss": 0.8583, "step": 1375 }, { "epoch": 0.81, "grad_norm": 5.1677398681640625, "learning_rate": 4.0572715572715574e-05, "loss": 0.8944, "step": 1400 }, { "epoch": 0.83, "grad_norm": 11.766656875610352, "learning_rate": 4.03045903045903e-05, "loss": 0.5638, "step": 1425 }, { "epoch": 0.84, "grad_norm": 4.12522554397583, "learning_rate": 4.003646503646504e-05, "loss": 0.6883, "step": 1450 }, { "epoch": 0.85, "grad_norm": 2.586186408996582, "learning_rate": 3.976833976833977e-05, "loss": 0.8784, "step": 1475 }, { "epoch": 0.87, "grad_norm": 4.075995445251465, "learning_rate": 3.95002145002145e-05, "loss": 0.7596, "step": 1500 }, { "epoch": 0.88, "grad_norm": 12.722098350524902, "learning_rate": 3.923208923208923e-05, "loss": 0.6316, "step": 1525 }, { "epoch": 0.9, "grad_norm": 12.86962890625, "learning_rate": 3.896396396396397e-05, "loss": 0.7721, "step": 1550 }, { "epoch": 0.91, "grad_norm": 10.640520095825195, "learning_rate": 3.86958386958387e-05, "loss": 0.711, "step": 1575 }, { "epoch": 0.93, "grad_norm": 7.080173015594482, "learning_rate": 3.842771342771343e-05, "loss": 0.8145, "step": 1600 }, { "epoch": 0.94, "grad_norm": 2.814232587814331, "learning_rate": 3.815958815958816e-05, "loss": 0.8509, "step": 1625 }, { "epoch": 0.96, "grad_norm": 12.209416389465332, "learning_rate": 3.789146289146289e-05, "loss": 0.8037, "step": 1650 }, { "epoch": 0.97, "grad_norm": 3.5646421909332275, "learning_rate": 3.762333762333762e-05, "loss": 0.8347, "step": 1675 }, { "epoch": 0.98, "grad_norm": 8.00243854522705, "learning_rate": 3.735521235521236e-05, "loss": 0.7534, "step": 1700 }, { "epoch": 1.0, "grad_norm": 5.955112934112549, "learning_rate": 3.708708708708709e-05, "loss": 0.9062, "step": 1725 }, { "epoch": 1.0, "eval_accuracy": 0.7815393518518519, "eval_f1_macro": 0.36763187534206965, "eval_f1_micro": 0.7815393518518519, "eval_f1_weighted": 0.7649698649865229, "eval_loss": 0.6848556399345398, "eval_precision_macro": 0.4527657259795588, "eval_precision_micro": 0.7815393518518519, "eval_precision_weighted": 0.7691072277887989, "eval_recall_macro": 0.3815255183458625, "eval_recall_micro": 0.7815393518518519, "eval_recall_weighted": 0.7815393518518519, "eval_runtime": 3330.8434, "eval_samples_per_second": 1.038, "eval_steps_per_second": 0.065, "step": 1727 }, { "epoch": 1.01, "grad_norm": 3.4409847259521484, "learning_rate": 3.681896181896182e-05, "loss": 0.8201, "step": 1750 }, { "epoch": 1.03, "grad_norm": 7.564268589019775, "learning_rate": 3.655083655083655e-05, "loss": 0.548, "step": 1775 }, { "epoch": 1.04, "grad_norm": 10.243828773498535, "learning_rate": 3.628271128271129e-05, "loss": 0.6348, "step": 1800 }, { "epoch": 1.06, "grad_norm": 14.57152271270752, "learning_rate": 3.6014586014586017e-05, "loss": 0.66, "step": 1825 }, { "epoch": 1.07, "grad_norm": 4.4091973304748535, "learning_rate": 3.5746460746460746e-05, "loss": 0.8169, "step": 1850 }, { "epoch": 1.09, "grad_norm": 5.423861503601074, "learning_rate": 3.547833547833548e-05, "loss": 0.7353, "step": 1875 }, { "epoch": 1.1, "grad_norm": 6.510718822479248, "learning_rate": 3.521021021021021e-05, "loss": 0.7008, "step": 1900 }, { "epoch": 1.11, "grad_norm": 2.5035436153411865, "learning_rate": 3.4942084942084947e-05, "loss": 0.5975, "step": 1925 }, { "epoch": 1.13, "grad_norm": 15.364286422729492, "learning_rate": 3.4673959673959676e-05, "loss": 0.8198, "step": 1950 }, { "epoch": 1.14, "grad_norm": 16.03240966796875, "learning_rate": 3.440583440583441e-05, "loss": 0.7089, "step": 1975 }, { "epoch": 1.16, "grad_norm": 8.521039962768555, "learning_rate": 3.413770913770914e-05, "loss": 0.6514, "step": 2000 }, { "epoch": 1.17, "grad_norm": 5.206024169921875, "learning_rate": 3.386958386958387e-05, "loss": 0.7545, "step": 2025 }, { "epoch": 1.19, "grad_norm": 7.756472110748291, "learning_rate": 3.36014586014586e-05, "loss": 0.8055, "step": 2050 }, { "epoch": 1.2, "grad_norm": 17.274944305419922, "learning_rate": 3.3333333333333335e-05, "loss": 0.7212, "step": 2075 }, { "epoch": 1.22, "grad_norm": 5.303420543670654, "learning_rate": 3.3065208065208064e-05, "loss": 0.7299, "step": 2100 }, { "epoch": 1.23, "grad_norm": 5.07370138168335, "learning_rate": 3.27970827970828e-05, "loss": 0.5858, "step": 2125 }, { "epoch": 1.24, "grad_norm": 6.2755937576293945, "learning_rate": 3.252895752895753e-05, "loss": 0.7687, "step": 2150 }, { "epoch": 1.26, "grad_norm": 12.160276412963867, "learning_rate": 3.227155727155727e-05, "loss": 0.727, "step": 2175 }, { "epoch": 1.27, "grad_norm": 3.9984447956085205, "learning_rate": 3.2003432003432e-05, "loss": 0.6697, "step": 2200 }, { "epoch": 1.29, "grad_norm": 5.756568908691406, "learning_rate": 3.173530673530674e-05, "loss": 0.7912, "step": 2225 }, { "epoch": 1.3, "grad_norm": 8.783411026000977, "learning_rate": 3.1467181467181466e-05, "loss": 0.7035, "step": 2250 }, { "epoch": 1.32, "grad_norm": 5.451704502105713, "learning_rate": 3.1199056199056196e-05, "loss": 0.7144, "step": 2275 }, { "epoch": 1.33, "grad_norm": 5.039503574371338, "learning_rate": 3.093093093093093e-05, "loss": 0.7893, "step": 2300 }, { "epoch": 1.35, "grad_norm": 12.280179023742676, "learning_rate": 3.066280566280567e-05, "loss": 0.5903, "step": 2325 }, { "epoch": 1.36, "grad_norm": 6.999240398406982, "learning_rate": 3.0394680394680397e-05, "loss": 0.651, "step": 2350 }, { "epoch": 1.38, "grad_norm": 7.355953216552734, "learning_rate": 3.012655512655513e-05, "loss": 0.878, "step": 2375 }, { "epoch": 1.39, "grad_norm": 12.737029075622559, "learning_rate": 2.9858429858429858e-05, "loss": 0.731, "step": 2400 }, { "epoch": 1.4, "grad_norm": 4.184784889221191, "learning_rate": 2.959030459030459e-05, "loss": 0.8775, "step": 2425 }, { "epoch": 1.42, "grad_norm": 9.188583374023438, "learning_rate": 2.9322179322179323e-05, "loss": 0.7259, "step": 2450 }, { "epoch": 1.43, "grad_norm": 12.98018741607666, "learning_rate": 2.906477906477907e-05, "loss": 0.6219, "step": 2475 }, { "epoch": 1.45, "grad_norm": 12.084989547729492, "learning_rate": 2.87966537966538e-05, "loss": 0.5074, "step": 2500 }, { "epoch": 1.46, "grad_norm": 5.283312797546387, "learning_rate": 2.852852852852853e-05, "loss": 0.6606, "step": 2525 }, { "epoch": 1.48, "grad_norm": 19.32860565185547, "learning_rate": 2.826040326040326e-05, "loss": 0.7651, "step": 2550 }, { "epoch": 1.49, "grad_norm": 4.0794901847839355, "learning_rate": 2.7992277992277993e-05, "loss": 0.6737, "step": 2575 }, { "epoch": 1.51, "grad_norm": 33.1405029296875, "learning_rate": 2.7724152724152726e-05, "loss": 0.654, "step": 2600 }, { "epoch": 1.52, "grad_norm": 4.921344757080078, "learning_rate": 2.7456027456027455e-05, "loss": 0.7194, "step": 2625 }, { "epoch": 1.53, "grad_norm": 5.453707695007324, "learning_rate": 2.7187902187902187e-05, "loss": 0.6851, "step": 2650 }, { "epoch": 1.55, "grad_norm": 8.766169548034668, "learning_rate": 2.6919776919776923e-05, "loss": 0.8228, "step": 2675 }, { "epoch": 1.56, "grad_norm": 8.957389831542969, "learning_rate": 2.6651651651651656e-05, "loss": 0.6645, "step": 2700 }, { "epoch": 1.58, "grad_norm": 5.715158939361572, "learning_rate": 2.6383526383526385e-05, "loss": 0.6905, "step": 2725 }, { "epoch": 1.59, "grad_norm": 6.306962490081787, "learning_rate": 2.6115401115401117e-05, "loss": 0.6924, "step": 2750 }, { "epoch": 1.61, "grad_norm": 8.548517227172852, "learning_rate": 2.5847275847275846e-05, "loss": 0.8402, "step": 2775 }, { "epoch": 1.62, "grad_norm": 7.5719895362854, "learning_rate": 2.557915057915058e-05, "loss": 0.7758, "step": 2800 }, { "epoch": 1.64, "grad_norm": 8.910326957702637, "learning_rate": 2.531102531102531e-05, "loss": 0.5645, "step": 2825 }, { "epoch": 1.65, "grad_norm": 8.304277420043945, "learning_rate": 2.504290004290004e-05, "loss": 0.7066, "step": 2850 }, { "epoch": 1.66, "grad_norm": 13.99254035949707, "learning_rate": 2.4774774774774777e-05, "loss": 0.7396, "step": 2875 }, { "epoch": 1.68, "grad_norm": 3.806931257247925, "learning_rate": 2.4506649506649506e-05, "loss": 0.6334, "step": 2900 }, { "epoch": 1.69, "grad_norm": 6.550988674163818, "learning_rate": 2.423852423852424e-05, "loss": 0.9251, "step": 2925 }, { "epoch": 1.71, "grad_norm": 3.1442198753356934, "learning_rate": 2.397039897039897e-05, "loss": 0.546, "step": 2950 }, { "epoch": 1.72, "grad_norm": 5.249305248260498, "learning_rate": 2.3702273702273703e-05, "loss": 0.6419, "step": 2975 }, { "epoch": 1.74, "grad_norm": 5.300810813903809, "learning_rate": 2.3434148434148436e-05, "loss": 0.535, "step": 3000 }, { "epoch": 1.75, "grad_norm": 8.079426765441895, "learning_rate": 2.3166023166023168e-05, "loss": 0.8142, "step": 3025 }, { "epoch": 1.77, "grad_norm": 6.737719535827637, "learning_rate": 2.28978978978979e-05, "loss": 0.5974, "step": 3050 }, { "epoch": 1.78, "grad_norm": 5.037626266479492, "learning_rate": 2.262977262977263e-05, "loss": 0.7068, "step": 3075 }, { "epoch": 1.8, "grad_norm": 3.4523470401763916, "learning_rate": 2.2361647361647362e-05, "loss": 0.5756, "step": 3100 }, { "epoch": 1.81, "grad_norm": 2.2966675758361816, "learning_rate": 2.2093522093522095e-05, "loss": 0.4941, "step": 3125 }, { "epoch": 1.82, "grad_norm": 11.497820854187012, "learning_rate": 2.1825396825396827e-05, "loss": 0.8353, "step": 3150 }, { "epoch": 1.84, "grad_norm": 11.813599586486816, "learning_rate": 2.1557271557271557e-05, "loss": 0.8303, "step": 3175 }, { "epoch": 1.85, "grad_norm": 2.504293203353882, "learning_rate": 2.128914628914629e-05, "loss": 0.5574, "step": 3200 }, { "epoch": 1.87, "grad_norm": 8.983193397521973, "learning_rate": 2.102102102102102e-05, "loss": 0.6033, "step": 3225 }, { "epoch": 1.88, "grad_norm": 7.629824161529541, "learning_rate": 2.0752895752895754e-05, "loss": 0.6305, "step": 3250 }, { "epoch": 1.9, "grad_norm": 10.86919116973877, "learning_rate": 2.0484770484770487e-05, "loss": 0.6045, "step": 3275 }, { "epoch": 1.91, "grad_norm": 6.0854387283325195, "learning_rate": 2.0216645216645216e-05, "loss": 0.6208, "step": 3300 }, { "epoch": 1.93, "grad_norm": 6.228011131286621, "learning_rate": 1.994851994851995e-05, "loss": 0.6249, "step": 3325 }, { "epoch": 1.94, "grad_norm": 7.076812744140625, "learning_rate": 1.968039468039468e-05, "loss": 0.6176, "step": 3350 }, { "epoch": 1.95, "grad_norm": 7.893978595733643, "learning_rate": 1.9412269412269413e-05, "loss": 0.7779, "step": 3375 }, { "epoch": 1.97, "grad_norm": 7.72683048248291, "learning_rate": 1.9144144144144142e-05, "loss": 0.669, "step": 3400 }, { "epoch": 1.98, "grad_norm": 3.820025682449341, "learning_rate": 1.887601887601888e-05, "loss": 0.6182, "step": 3425 }, { "epoch": 2.0, "grad_norm": 5.642152309417725, "learning_rate": 1.8607893607893607e-05, "loss": 0.6453, "step": 3450 }, { "epoch": 2.0, "eval_accuracy": 0.8454861111111112, "eval_f1_macro": 0.5038698715266201, "eval_f1_micro": 0.845486111111111, "eval_f1_weighted": 0.830657390454042, "eval_loss": 0.4813511371612549, "eval_precision_macro": 0.7578927995388053, "eval_precision_micro": 0.8454861111111112, "eval_precision_weighted": 0.8347843050246918, "eval_recall_macro": 0.456096431265928, "eval_recall_micro": 0.8454861111111112, "eval_recall_weighted": 0.8454861111111112, "eval_runtime": 3281.605, "eval_samples_per_second": 1.053, "eval_steps_per_second": 0.066, "step": 3454 }, { "epoch": 2.01, "grad_norm": 8.240702629089355, "learning_rate": 1.833976833976834e-05, "loss": 0.4854, "step": 3475 }, { "epoch": 2.03, "grad_norm": 7.087810039520264, "learning_rate": 1.8071643071643072e-05, "loss": 0.5459, "step": 3500 }, { "epoch": 2.04, "grad_norm": 0.7334815859794617, "learning_rate": 1.7803517803517805e-05, "loss": 0.5212, "step": 3525 }, { "epoch": 2.06, "grad_norm": 6.095980167388916, "learning_rate": 1.7535392535392538e-05, "loss": 0.5191, "step": 3550 }, { "epoch": 2.07, "grad_norm": 10.246546745300293, "learning_rate": 1.7267267267267267e-05, "loss": 0.6645, "step": 3575 }, { "epoch": 2.08, "grad_norm": 3.6809566020965576, "learning_rate": 1.6999141999142e-05, "loss": 0.5629, "step": 3600 }, { "epoch": 2.1, "grad_norm": 13.609752655029297, "learning_rate": 1.673101673101673e-05, "loss": 0.6598, "step": 3625 }, { "epoch": 2.11, "grad_norm": 5.554472923278809, "learning_rate": 1.6462891462891464e-05, "loss": 0.4935, "step": 3650 }, { "epoch": 2.13, "grad_norm": 16.72881317138672, "learning_rate": 1.6205491205491204e-05, "loss": 0.379, "step": 3675 }, { "epoch": 2.14, "grad_norm": 9.573266983032227, "learning_rate": 1.593736593736594e-05, "loss": 0.7141, "step": 3700 }, { "epoch": 2.16, "grad_norm": 5.245655059814453, "learning_rate": 1.566924066924067e-05, "loss": 0.7518, "step": 3725 }, { "epoch": 2.17, "grad_norm": 13.11945915222168, "learning_rate": 1.54011154011154e-05, "loss": 0.6588, "step": 3750 }, { "epoch": 2.19, "grad_norm": 6.219137191772461, "learning_rate": 1.5132990132990132e-05, "loss": 0.5884, "step": 3775 }, { "epoch": 2.2, "grad_norm": 6.516097545623779, "learning_rate": 1.4864864864864867e-05, "loss": 0.4572, "step": 3800 }, { "epoch": 2.21, "grad_norm": 4.23282527923584, "learning_rate": 1.4596739596739597e-05, "loss": 0.6028, "step": 3825 }, { "epoch": 2.23, "grad_norm": 2.6169815063476562, "learning_rate": 1.4328614328614328e-05, "loss": 0.6198, "step": 3850 }, { "epoch": 2.24, "grad_norm": 4.720090389251709, "learning_rate": 1.4060489060489059e-05, "loss": 0.556, "step": 3875 }, { "epoch": 2.26, "grad_norm": 7.367048263549805, "learning_rate": 1.3792363792363793e-05, "loss": 0.5812, "step": 3900 }, { "epoch": 2.27, "grad_norm": 7.3934173583984375, "learning_rate": 1.3524238524238526e-05, "loss": 0.6141, "step": 3925 }, { "epoch": 2.29, "grad_norm": 10.528743743896484, "learning_rate": 1.3256113256113257e-05, "loss": 0.6047, "step": 3950 }, { "epoch": 2.3, "grad_norm": 7.282771110534668, "learning_rate": 1.2987987987987987e-05, "loss": 0.6281, "step": 3975 }, { "epoch": 2.32, "grad_norm": 11.970826148986816, "learning_rate": 1.2719862719862722e-05, "loss": 0.4103, "step": 4000 }, { "epoch": 2.33, "grad_norm": 6.620480537414551, "learning_rate": 1.2451737451737452e-05, "loss": 0.5386, "step": 4025 }, { "epoch": 2.35, "grad_norm": 5.699476718902588, "learning_rate": 1.2183612183612183e-05, "loss": 0.6507, "step": 4050 }, { "epoch": 2.36, "grad_norm": 2.477766275405884, "learning_rate": 1.1915486915486916e-05, "loss": 0.524, "step": 4075 }, { "epoch": 2.37, "grad_norm": 6.517852306365967, "learning_rate": 1.1647361647361647e-05, "loss": 0.6979, "step": 4100 }, { "epoch": 2.39, "grad_norm": 4.506691932678223, "learning_rate": 1.1379236379236379e-05, "loss": 0.4651, "step": 4125 }, { "epoch": 2.4, "grad_norm": 6.522432804107666, "learning_rate": 1.1111111111111112e-05, "loss": 0.6845, "step": 4150 }, { "epoch": 2.42, "grad_norm": 12.015291213989258, "learning_rate": 1.0842985842985844e-05, "loss": 0.5348, "step": 4175 }, { "epoch": 2.43, "grad_norm": 7.297937393188477, "learning_rate": 1.0574860574860575e-05, "loss": 0.5412, "step": 4200 }, { "epoch": 2.45, "grad_norm": 13.665657997131348, "learning_rate": 1.0306735306735307e-05, "loss": 0.5137, "step": 4225 }, { "epoch": 2.46, "grad_norm": 11.743260383605957, "learning_rate": 1.0038610038610038e-05, "loss": 0.5738, "step": 4250 }, { "epoch": 2.48, "grad_norm": 10.24691104888916, "learning_rate": 9.77048477048477e-06, "loss": 0.5134, "step": 4275 }, { "epoch": 2.49, "grad_norm": 4.543239116668701, "learning_rate": 9.502359502359502e-06, "loss": 0.6055, "step": 4300 }, { "epoch": 2.5, "grad_norm": 7.915064811706543, "learning_rate": 9.234234234234234e-06, "loss": 0.5153, "step": 4325 }, { "epoch": 2.52, "grad_norm": 8.37210750579834, "learning_rate": 8.966108966108967e-06, "loss": 0.4754, "step": 4350 }, { "epoch": 2.53, "grad_norm": 7.3417487144470215, "learning_rate": 8.6979836979837e-06, "loss": 0.6285, "step": 4375 }, { "epoch": 2.55, "grad_norm": 9.027023315429688, "learning_rate": 8.42985842985843e-06, "loss": 0.4925, "step": 4400 }, { "epoch": 2.56, "grad_norm": 7.813179016113281, "learning_rate": 8.161733161733163e-06, "loss": 0.5089, "step": 4425 }, { "epoch": 2.58, "grad_norm": 13.45531940460205, "learning_rate": 7.893607893607893e-06, "loss": 0.4717, "step": 4450 }, { "epoch": 2.59, "grad_norm": 7.09887170791626, "learning_rate": 7.625482625482626e-06, "loss": 0.6506, "step": 4475 }, { "epoch": 2.61, "grad_norm": 4.6297383308410645, "learning_rate": 7.357357357357357e-06, "loss": 0.4828, "step": 4500 }, { "epoch": 2.62, "grad_norm": 8.980986595153809, "learning_rate": 7.089232089232089e-06, "loss": 0.4233, "step": 4525 }, { "epoch": 2.63, "grad_norm": 26.76249885559082, "learning_rate": 6.821106821106821e-06, "loss": 0.5748, "step": 4550 }, { "epoch": 2.65, "grad_norm": 8.72842788696289, "learning_rate": 6.552981552981553e-06, "loss": 0.6565, "step": 4575 }, { "epoch": 2.66, "grad_norm": 9.191315650939941, "learning_rate": 6.284856284856284e-06, "loss": 0.5332, "step": 4600 }, { "epoch": 2.68, "grad_norm": 7.631181240081787, "learning_rate": 6.016731016731017e-06, "loss": 0.4692, "step": 4625 }, { "epoch": 2.69, "grad_norm": 7.811351299285889, "learning_rate": 5.748605748605749e-06, "loss": 0.6485, "step": 4650 }, { "epoch": 2.71, "grad_norm": 8.997116088867188, "learning_rate": 5.480480480480481e-06, "loss": 0.4207, "step": 4675 }, { "epoch": 2.72, "grad_norm": 9.758033752441406, "learning_rate": 5.212355212355213e-06, "loss": 0.5205, "step": 4700 }, { "epoch": 2.74, "grad_norm": 5.98590612411499, "learning_rate": 4.944229944229944e-06, "loss": 0.6115, "step": 4725 }, { "epoch": 2.75, "grad_norm": 11.506319046020508, "learning_rate": 4.676104676104676e-06, "loss": 0.4449, "step": 4750 }, { "epoch": 2.76, "grad_norm": 7.969517230987549, "learning_rate": 4.4079794079794084e-06, "loss": 0.5384, "step": 4775 }, { "epoch": 2.78, "grad_norm": 0.8463253974914551, "learning_rate": 4.13985413985414e-06, "loss": 0.4981, "step": 4800 }, { "epoch": 2.79, "grad_norm": 12.771890640258789, "learning_rate": 3.871728871728872e-06, "loss": 0.4786, "step": 4825 }, { "epoch": 2.81, "grad_norm": 0.6047688126564026, "learning_rate": 3.603603603603604e-06, "loss": 0.4913, "step": 4850 }, { "epoch": 2.82, "grad_norm": 7.117040157318115, "learning_rate": 3.3354783354783355e-06, "loss": 0.5386, "step": 4875 }, { "epoch": 2.84, "grad_norm": 5.262890338897705, "learning_rate": 3.0673530673530676e-06, "loss": 0.6115, "step": 4900 }, { "epoch": 2.85, "grad_norm": 0.5500399470329285, "learning_rate": 2.7992277992277993e-06, "loss": 0.5285, "step": 4925 }, { "epoch": 2.87, "grad_norm": 2.5653154850006104, "learning_rate": 2.531102531102531e-06, "loss": 0.3621, "step": 4950 }, { "epoch": 2.88, "grad_norm": 5.71751594543457, "learning_rate": 2.262977262977263e-06, "loss": 0.6007, "step": 4975 }, { "epoch": 2.9, "grad_norm": 12.138904571533203, "learning_rate": 1.9948519948519947e-06, "loss": 0.5797, "step": 5000 }, { "epoch": 2.91, "grad_norm": 8.795024871826172, "learning_rate": 1.7267267267267268e-06, "loss": 0.5585, "step": 5025 }, { "epoch": 2.92, "grad_norm": 3.7619569301605225, "learning_rate": 1.4586014586014587e-06, "loss": 0.3951, "step": 5050 }, { "epoch": 2.94, "grad_norm": 2.763073205947876, "learning_rate": 1.1904761904761904e-06, "loss": 0.287, "step": 5075 }, { "epoch": 2.95, "grad_norm": 7.337412357330322, "learning_rate": 9.223509223509224e-07, "loss": 0.4416, "step": 5100 }, { "epoch": 2.97, "grad_norm": 4.266438961029053, "learning_rate": 6.542256542256542e-07, "loss": 0.5275, "step": 5125 }, { "epoch": 2.98, "grad_norm": 8.12879753112793, "learning_rate": 3.8610038610038613e-07, "loss": 0.4297, "step": 5150 }, { "epoch": 3.0, "grad_norm": 6.199108123779297, "learning_rate": 1.1797511797511798e-07, "loss": 0.4389, "step": 5175 }, { "epoch": 3.0, "eval_accuracy": 0.8602430555555556, "eval_f1_macro": 0.6149830093941424, "eval_f1_micro": 0.8602430555555556, "eval_f1_weighted": 0.8515059109185544, "eval_loss": 0.4315283000469208, "eval_precision_macro": 0.7610988679415244, "eval_precision_micro": 0.8602430555555556, "eval_precision_weighted": 0.8532444856848228, "eval_recall_macro": 0.5527145295483504, "eval_recall_micro": 0.8602430555555556, "eval_recall_weighted": 0.8602430555555556, "eval_runtime": 3373.7409, "eval_samples_per_second": 1.024, "eval_steps_per_second": 0.064, "step": 5181 } ], "logging_steps": 25, "max_steps": 5181, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1.135272556528692e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }