{ "best_metric": 0.5765425562858582, "best_model_checkpoint": "v23/checkpoint-358", "epoch": 2.0, "eval_steps": 500, "global_step": 358, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03910614525139665, "grad_norm": Infinity, "learning_rate": 1.6666666666666667e-06, "loss": 2.8181, "step": 7 }, { "epoch": 0.0782122905027933, "grad_norm": 48.7786750793457, "learning_rate": 5.555555555555556e-06, "loss": 2.4279, "step": 14 }, { "epoch": 0.11731843575418995, "grad_norm": 16.421829223632812, "learning_rate": 9.444444444444445e-06, "loss": 1.9836, "step": 21 }, { "epoch": 0.1564245810055866, "grad_norm": 37.26057052612305, "learning_rate": 1.3333333333333333e-05, "loss": 1.637, "step": 28 }, { "epoch": 0.19553072625698323, "grad_norm": 24.32502555847168, "learning_rate": 1.7222222222222224e-05, "loss": 1.4235, "step": 35 }, { "epoch": 0.2346368715083799, "grad_norm": 4.514742374420166, "learning_rate": 2.111111111111111e-05, "loss": 0.8819, "step": 42 }, { "epoch": 0.2737430167597765, "grad_norm": 17.023542404174805, "learning_rate": 2.5e-05, "loss": 0.847, "step": 49 }, { "epoch": 0.3128491620111732, "grad_norm": 18.13689422607422, "learning_rate": 2.8888888888888888e-05, "loss": 0.7611, "step": 56 }, { "epoch": 0.35195530726256985, "grad_norm": 21.782562255859375, "learning_rate": 3.277777777777778e-05, "loss": 0.7189, "step": 63 }, { "epoch": 0.39106145251396646, "grad_norm": 13.720254898071289, "learning_rate": 3.6666666666666666e-05, "loss": 0.7178, "step": 70 }, { "epoch": 0.4301675977653631, "grad_norm": 4.693215847015381, "learning_rate": 4.055555555555556e-05, "loss": 0.7092, "step": 77 }, { "epoch": 0.4692737430167598, "grad_norm": 14.818086624145508, "learning_rate": 4.4444444444444447e-05, "loss": 0.682, "step": 84 }, { "epoch": 0.5083798882681564, "grad_norm": 26.538074493408203, "learning_rate": 4.8333333333333334e-05, "loss": 0.6824, "step": 91 }, { "epoch": 0.547486033519553, "grad_norm": 29.219240188598633, "learning_rate": 4.975155279503106e-05, "loss": 0.681, "step": 98 }, { "epoch": 0.5865921787709497, "grad_norm": 8.444730758666992, "learning_rate": 4.93167701863354e-05, "loss": 0.7867, "step": 105 }, { "epoch": 0.6256983240223464, "grad_norm": 8.663554191589355, "learning_rate": 4.888198757763975e-05, "loss": 0.7689, "step": 112 }, { "epoch": 0.664804469273743, "grad_norm": 15.8271484375, "learning_rate": 4.8447204968944106e-05, "loss": 0.8417, "step": 119 }, { "epoch": 0.7039106145251397, "grad_norm": 6.195271968841553, "learning_rate": 4.801242236024845e-05, "loss": 0.629, "step": 126 }, { "epoch": 0.7430167597765364, "grad_norm": 4.593315601348877, "learning_rate": 4.75776397515528e-05, "loss": 0.6039, "step": 133 }, { "epoch": 0.7821229050279329, "grad_norm": 22.80195426940918, "learning_rate": 4.714285714285714e-05, "loss": 0.5824, "step": 140 }, { "epoch": 0.8212290502793296, "grad_norm": 13.558725357055664, "learning_rate": 4.6770186335403726e-05, "loss": 0.876, "step": 147 }, { "epoch": 0.8603351955307262, "grad_norm": 4.1830668449401855, "learning_rate": 4.633540372670807e-05, "loss": 0.7458, "step": 154 }, { "epoch": 0.8994413407821229, "grad_norm": 14.1422119140625, "learning_rate": 4.590062111801243e-05, "loss": 0.6289, "step": 161 }, { "epoch": 0.9385474860335196, "grad_norm": 15.986943244934082, "learning_rate": 4.546583850931677e-05, "loss": 0.8139, "step": 168 }, { "epoch": 0.9776536312849162, "grad_norm": 10.396794319152832, "learning_rate": 4.5031055900621124e-05, "loss": 0.7859, "step": 175 }, { "epoch": 1.0, "eval_accuracy": 0.6, "eval_f1_macro": 0.4025140193447718, "eval_f1_micro": 0.6, "eval_f1_weighted": 0.4689184747817452, "eval_loss": 0.7313841581344604, "eval_precision_macro": 0.5837690631808279, "eval_precision_micro": 0.6, "eval_precision_weighted": 0.5306172839506172, "eval_recall_macro": 0.43363545726457, "eval_recall_micro": 0.6, "eval_recall_weighted": 0.6, "eval_runtime": 0.8236, "eval_samples_per_second": 382.454, "eval_steps_per_second": 24.283, "step": 179 }, { "epoch": 1.0167597765363128, "grad_norm": 15.96173095703125, "learning_rate": 4.4596273291925465e-05, "loss": 0.5898, "step": 182 }, { "epoch": 1.0558659217877095, "grad_norm": 21.0263729095459, "learning_rate": 4.416149068322982e-05, "loss": 0.6724, "step": 189 }, { "epoch": 1.094972067039106, "grad_norm": 8.833733558654785, "learning_rate": 4.372670807453416e-05, "loss": 0.5225, "step": 196 }, { "epoch": 1.1340782122905029, "grad_norm": 13.83945083618164, "learning_rate": 4.3291925465838515e-05, "loss": 0.6694, "step": 203 }, { "epoch": 1.1731843575418994, "grad_norm": 13.772929191589355, "learning_rate": 4.2857142857142856e-05, "loss": 0.5541, "step": 210 }, { "epoch": 1.2122905027932962, "grad_norm": 8.899124145507812, "learning_rate": 4.2422360248447204e-05, "loss": 0.713, "step": 217 }, { "epoch": 1.2513966480446927, "grad_norm": 8.794002532958984, "learning_rate": 4.198757763975156e-05, "loss": 0.5879, "step": 224 }, { "epoch": 1.2905027932960893, "grad_norm": 4.091240882873535, "learning_rate": 4.15527950310559e-05, "loss": 0.5424, "step": 231 }, { "epoch": 1.329608938547486, "grad_norm": 16.86524772644043, "learning_rate": 4.1118012422360255e-05, "loss": 0.6323, "step": 238 }, { "epoch": 1.3687150837988826, "grad_norm": 10.920906066894531, "learning_rate": 4.0683229813664596e-05, "loss": 0.5727, "step": 245 }, { "epoch": 1.4078212290502794, "grad_norm": 17.164987564086914, "learning_rate": 4.024844720496895e-05, "loss": 0.5847, "step": 252 }, { "epoch": 1.446927374301676, "grad_norm": 12.27508544921875, "learning_rate": 3.981366459627329e-05, "loss": 0.4845, "step": 259 }, { "epoch": 1.4860335195530725, "grad_norm": 12.798267364501953, "learning_rate": 3.9378881987577646e-05, "loss": 0.4762, "step": 266 }, { "epoch": 1.5251396648044693, "grad_norm": 3.783871650695801, "learning_rate": 3.894409937888199e-05, "loss": 0.4889, "step": 273 }, { "epoch": 1.564245810055866, "grad_norm": 17.070810317993164, "learning_rate": 3.8509316770186335e-05, "loss": 0.5382, "step": 280 }, { "epoch": 1.6033519553072626, "grad_norm": 27.77778434753418, "learning_rate": 3.807453416149068e-05, "loss": 0.5336, "step": 287 }, { "epoch": 1.6424581005586592, "grad_norm": 52.91617202758789, "learning_rate": 3.763975155279503e-05, "loss": 0.6025, "step": 294 }, { "epoch": 1.6815642458100557, "grad_norm": 17.698348999023438, "learning_rate": 3.7204968944099385e-05, "loss": 0.3589, "step": 301 }, { "epoch": 1.7206703910614525, "grad_norm": 14.64693832397461, "learning_rate": 3.6770186335403726e-05, "loss": 0.3654, "step": 308 }, { "epoch": 1.7597765363128492, "grad_norm": 13.599970817565918, "learning_rate": 3.633540372670808e-05, "loss": 0.6611, "step": 315 }, { "epoch": 1.7988826815642458, "grad_norm": 6.364068984985352, "learning_rate": 3.590062111801242e-05, "loss": 0.5289, "step": 322 }, { "epoch": 1.8379888268156424, "grad_norm": 10.628365516662598, "learning_rate": 3.546583850931677e-05, "loss": 0.6834, "step": 329 }, { "epoch": 1.8770949720670391, "grad_norm": 7.663080215454102, "learning_rate": 3.503105590062112e-05, "loss": 0.4576, "step": 336 }, { "epoch": 1.916201117318436, "grad_norm": 9.863435745239258, "learning_rate": 3.4596273291925466e-05, "loss": 0.4385, "step": 343 }, { "epoch": 1.9553072625698324, "grad_norm": 7.04995059967041, "learning_rate": 3.4161490683229814e-05, "loss": 0.3589, "step": 350 }, { "epoch": 1.994413407821229, "grad_norm": 12.553130149841309, "learning_rate": 3.372670807453416e-05, "loss": 0.6421, "step": 357 }, { "epoch": 2.0, "eval_accuracy": 0.7492063492063492, "eval_f1_macro": 0.7349736157447978, "eval_f1_micro": 0.7492063492063492, "eval_f1_weighted": 0.7506575340838828, "eval_loss": 0.5765425562858582, "eval_precision_macro": 0.7150911360799, "eval_precision_micro": 0.7492063492063492, "eval_precision_weighted": 0.7534924677486475, "eval_recall_macro": 0.7595969666757304, "eval_recall_micro": 0.7492063492063492, "eval_recall_weighted": 0.7492063492063492, "eval_runtime": 0.8337, "eval_samples_per_second": 377.837, "eval_steps_per_second": 23.99, "step": 358 } ], "logging_steps": 7, "max_steps": 895, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 188389207093248.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }