{ "best_metric": 0.5103544592857361, "best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/microsoft/phi_2_twitter/checkpoint-750", "epoch": 3.0, "eval_steps": 50, "global_step": 816, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 164.05126953125, "learning_rate": 4.938725490196079e-06, "loss": 1.2318, "step": 10 }, { "epoch": 0.07, "grad_norm": 104.96498107910156, "learning_rate": 4.8774509803921576e-06, "loss": 1.0923, "step": 20 }, { "epoch": 0.11, "grad_norm": 83.26679229736328, "learning_rate": 4.816176470588236e-06, "loss": 0.7558, "step": 30 }, { "epoch": 0.15, "grad_norm": 51.119388580322266, "learning_rate": 4.754901960784314e-06, "loss": 0.6983, "step": 40 }, { "epoch": 0.18, "grad_norm": 38.16554260253906, "learning_rate": 4.693627450980393e-06, "loss": 0.5921, "step": 50 }, { "epoch": 0.18, "eval_accuracy": 0.6957720588235294, "eval_f1_macro": 0.5053995294725175, "eval_f1_micro": 0.6957720588235294, "eval_loss": 0.6037454009056091, "eval_runtime": 5.1072, "eval_samples_per_second": 213.033, "eval_steps_per_second": 6.657, "step": 50 }, { "epoch": 0.22, "grad_norm": 168.71607971191406, "learning_rate": 4.632352941176471e-06, "loss": 0.6117, "step": 60 }, { "epoch": 0.26, "grad_norm": 39.83993148803711, "learning_rate": 4.571078431372549e-06, "loss": 0.6139, "step": 70 }, { "epoch": 0.29, "grad_norm": 51.992225646972656, "learning_rate": 4.509803921568628e-06, "loss": 0.554, "step": 80 }, { "epoch": 0.33, "grad_norm": 40.58168029785156, "learning_rate": 4.448529411764706e-06, "loss": 0.5429, "step": 90 }, { "epoch": 0.37, "grad_norm": 77.49382019042969, "learning_rate": 4.3872549019607845e-06, "loss": 0.5904, "step": 100 }, { "epoch": 0.37, "eval_accuracy": 0.71875, "eval_f1_macro": 0.6277222408587723, "eval_f1_micro": 0.71875, "eval_loss": 0.5537109375, "eval_runtime": 5.1256, "eval_samples_per_second": 212.27, "eval_steps_per_second": 6.633, "step": 100 }, { "epoch": 0.4, "grad_norm": 55.94441604614258, "learning_rate": 4.3259803921568635e-06, "loss": 0.5684, "step": 110 }, { "epoch": 0.44, "grad_norm": 53.36739730834961, "learning_rate": 4.264705882352942e-06, "loss": 0.5263, "step": 120 }, { "epoch": 0.48, "grad_norm": 44.8289680480957, "learning_rate": 4.20343137254902e-06, "loss": 0.5239, "step": 130 }, { "epoch": 0.51, "grad_norm": 43.29486083984375, "learning_rate": 4.142156862745099e-06, "loss": 0.6123, "step": 140 }, { "epoch": 0.55, "grad_norm": 114.31592559814453, "learning_rate": 4.080882352941177e-06, "loss": 0.5021, "step": 150 }, { "epoch": 0.55, "eval_accuracy": 0.7159926470588235, "eval_f1_macro": 0.5774438641805396, "eval_f1_micro": 0.7159926470588235, "eval_loss": 0.6040900945663452, "eval_runtime": 5.1289, "eval_samples_per_second": 212.13, "eval_steps_per_second": 6.629, "step": 150 }, { "epoch": 0.59, "grad_norm": 99.46711730957031, "learning_rate": 4.019607843137255e-06, "loss": 0.5639, "step": 160 }, { "epoch": 0.62, "grad_norm": 90.892578125, "learning_rate": 3.958333333333333e-06, "loss": 0.5736, "step": 170 }, { "epoch": 0.66, "grad_norm": 70.59735107421875, "learning_rate": 3.897058823529412e-06, "loss": 0.5759, "step": 180 }, { "epoch": 0.7, "grad_norm": 61.23074722290039, "learning_rate": 3.8357843137254904e-06, "loss": 0.5252, "step": 190 }, { "epoch": 0.74, "grad_norm": 48.55831527709961, "learning_rate": 3.774509803921569e-06, "loss": 0.5266, "step": 200 }, { "epoch": 0.74, "eval_accuracy": 0.7095588235294118, "eval_f1_macro": 0.6496202779329097, "eval_f1_micro": 0.7095588235294118, "eval_loss": 0.5544002652168274, "eval_runtime": 5.139, "eval_samples_per_second": 211.713, "eval_steps_per_second": 6.616, "step": 200 }, { "epoch": 0.77, "grad_norm": 40.87614440917969, "learning_rate": 3.7132352941176476e-06, "loss": 0.5181, "step": 210 }, { "epoch": 0.81, "grad_norm": 42.869300842285156, "learning_rate": 3.6519607843137257e-06, "loss": 0.5433, "step": 220 }, { "epoch": 0.85, "grad_norm": 37.196022033691406, "learning_rate": 3.5906862745098043e-06, "loss": 0.5088, "step": 230 }, { "epoch": 0.88, "grad_norm": 26.8739013671875, "learning_rate": 3.529411764705883e-06, "loss": 0.515, "step": 240 }, { "epoch": 0.92, "grad_norm": 74.13105010986328, "learning_rate": 3.468137254901961e-06, "loss": 0.5427, "step": 250 }, { "epoch": 0.92, "eval_accuracy": 0.7398897058823529, "eval_f1_macro": 0.6914737024578574, "eval_f1_micro": 0.7398897058823529, "eval_loss": 0.533088207244873, "eval_runtime": 5.1312, "eval_samples_per_second": 212.038, "eval_steps_per_second": 6.626, "step": 250 }, { "epoch": 0.96, "grad_norm": 59.509281158447266, "learning_rate": 3.4068627450980396e-06, "loss": 0.4859, "step": 260 }, { "epoch": 0.99, "grad_norm": 26.682819366455078, "learning_rate": 3.3455882352941178e-06, "loss": 0.5441, "step": 270 }, { "epoch": 1.03, "grad_norm": 60.43280792236328, "learning_rate": 3.2843137254901964e-06, "loss": 0.487, "step": 280 }, { "epoch": 1.07, "grad_norm": 33.994571685791016, "learning_rate": 3.223039215686275e-06, "loss": 0.4848, "step": 290 }, { "epoch": 1.1, "grad_norm": 43.1735954284668, "learning_rate": 3.161764705882353e-06, "loss": 0.4715, "step": 300 }, { "epoch": 1.1, "eval_accuracy": 0.7398897058823529, "eval_f1_macro": 0.6361259916590146, "eval_f1_micro": 0.7398897058823529, "eval_loss": 0.5436149835586548, "eval_runtime": 5.1329, "eval_samples_per_second": 211.966, "eval_steps_per_second": 6.624, "step": 300 }, { "epoch": 1.14, "grad_norm": 43.79911804199219, "learning_rate": 3.1004901960784317e-06, "loss": 0.4838, "step": 310 }, { "epoch": 1.18, "grad_norm": 107.44807434082031, "learning_rate": 3.03921568627451e-06, "loss": 0.47, "step": 320 }, { "epoch": 1.21, "grad_norm": 33.9593620300293, "learning_rate": 2.9779411764705884e-06, "loss": 0.5337, "step": 330 }, { "epoch": 1.25, "grad_norm": 49.85417556762695, "learning_rate": 2.916666666666667e-06, "loss": 0.4914, "step": 340 }, { "epoch": 1.29, "grad_norm": 27.120018005371094, "learning_rate": 2.855392156862745e-06, "loss": 0.4829, "step": 350 }, { "epoch": 1.29, "eval_accuracy": 0.7564338235294118, "eval_f1_macro": 0.7135428309702008, "eval_f1_micro": 0.7564338235294118, "eval_loss": 0.5216710567474365, "eval_runtime": 5.1418, "eval_samples_per_second": 211.6, "eval_steps_per_second": 6.613, "step": 350 }, { "epoch": 1.32, "grad_norm": 42.45214080810547, "learning_rate": 2.7941176470588237e-06, "loss": 0.4851, "step": 360 }, { "epoch": 1.36, "grad_norm": 68.97784423828125, "learning_rate": 2.732843137254902e-06, "loss": 0.4166, "step": 370 }, { "epoch": 1.4, "grad_norm": 39.31782913208008, "learning_rate": 2.6715686274509804e-06, "loss": 0.5542, "step": 380 }, { "epoch": 1.43, "grad_norm": 46.57756805419922, "learning_rate": 2.610294117647059e-06, "loss": 0.4718, "step": 390 }, { "epoch": 1.47, "grad_norm": 70.58038330078125, "learning_rate": 2.549019607843137e-06, "loss": 0.4676, "step": 400 }, { "epoch": 1.47, "eval_accuracy": 0.7536764705882353, "eval_f1_macro": 0.6829050794520786, "eval_f1_micro": 0.7536764705882353, "eval_loss": 0.5224753022193909, "eval_runtime": 5.1324, "eval_samples_per_second": 211.985, "eval_steps_per_second": 6.625, "step": 400 }, { "epoch": 1.51, "grad_norm": 50.325191497802734, "learning_rate": 2.4877450980392158e-06, "loss": 0.5109, "step": 410 }, { "epoch": 1.54, "grad_norm": 74.2918701171875, "learning_rate": 2.4264705882352943e-06, "loss": 0.4521, "step": 420 }, { "epoch": 1.58, "grad_norm": 36.432159423828125, "learning_rate": 2.3651960784313725e-06, "loss": 0.44, "step": 430 }, { "epoch": 1.62, "grad_norm": 45.3189811706543, "learning_rate": 2.303921568627451e-06, "loss": 0.434, "step": 440 }, { "epoch": 1.65, "grad_norm": 38.40291213989258, "learning_rate": 2.2426470588235296e-06, "loss": 0.5196, "step": 450 }, { "epoch": 1.65, "eval_accuracy": 0.7628676470588235, "eval_f1_macro": 0.709592918596134, "eval_f1_micro": 0.7628676470588235, "eval_loss": 0.5163430571556091, "eval_runtime": 5.141, "eval_samples_per_second": 211.631, "eval_steps_per_second": 6.613, "step": 450 }, { "epoch": 1.69, "grad_norm": 34.863059997558594, "learning_rate": 2.1813725490196082e-06, "loss": 0.4809, "step": 460 }, { "epoch": 1.73, "grad_norm": 42.53833770751953, "learning_rate": 2.1200980392156864e-06, "loss": 0.4883, "step": 470 }, { "epoch": 1.76, "grad_norm": 34.529541015625, "learning_rate": 2.058823529411765e-06, "loss": 0.4098, "step": 480 }, { "epoch": 1.8, "grad_norm": 30.016141891479492, "learning_rate": 1.9975490196078435e-06, "loss": 0.4292, "step": 490 }, { "epoch": 1.84, "grad_norm": 39.967533111572266, "learning_rate": 1.9362745098039217e-06, "loss": 0.4815, "step": 500 }, { "epoch": 1.84, "eval_accuracy": 0.765625, "eval_f1_macro": 0.7215170814701055, "eval_f1_micro": 0.765625, "eval_loss": 0.5213120579719543, "eval_runtime": 5.1359, "eval_samples_per_second": 211.843, "eval_steps_per_second": 6.62, "step": 500 }, { "epoch": 1.88, "grad_norm": 43.9462776184082, "learning_rate": 1.8750000000000003e-06, "loss": 0.5156, "step": 510 }, { "epoch": 1.91, "grad_norm": 26.374074935913086, "learning_rate": 1.8137254901960786e-06, "loss": 0.5271, "step": 520 }, { "epoch": 1.95, "grad_norm": 58.75647735595703, "learning_rate": 1.752450980392157e-06, "loss": 0.5166, "step": 530 }, { "epoch": 1.99, "grad_norm": 32.93243408203125, "learning_rate": 1.6911764705882356e-06, "loss": 0.4624, "step": 540 }, { "epoch": 2.02, "grad_norm": 39.54354476928711, "learning_rate": 1.629901960784314e-06, "loss": 0.4836, "step": 550 }, { "epoch": 2.02, "eval_accuracy": 0.7619485294117647, "eval_f1_macro": 0.7190841764536411, "eval_f1_micro": 0.7619485294117647, "eval_loss": 0.522087574005127, "eval_runtime": 5.1446, "eval_samples_per_second": 211.482, "eval_steps_per_second": 6.609, "step": 550 }, { "epoch": 2.06, "grad_norm": 27.993928909301758, "learning_rate": 1.5686274509803923e-06, "loss": 0.4759, "step": 560 }, { "epoch": 2.1, "grad_norm": 24.092933654785156, "learning_rate": 1.5073529411764707e-06, "loss": 0.4096, "step": 570 }, { "epoch": 2.13, "grad_norm": 26.48084831237793, "learning_rate": 1.4460784313725492e-06, "loss": 0.4636, "step": 580 }, { "epoch": 2.17, "grad_norm": 23.85141372680664, "learning_rate": 1.3848039215686276e-06, "loss": 0.4349, "step": 590 }, { "epoch": 2.21, "grad_norm": 28.8481502532959, "learning_rate": 1.323529411764706e-06, "loss": 0.4945, "step": 600 }, { "epoch": 2.21, "eval_accuracy": 0.7637867647058824, "eval_f1_macro": 0.7173491781255339, "eval_f1_micro": 0.7637867647058824, "eval_loss": 0.5133702754974365, "eval_runtime": 5.1414, "eval_samples_per_second": 211.615, "eval_steps_per_second": 6.613, "step": 600 }, { "epoch": 2.24, "grad_norm": 20.64706039428711, "learning_rate": 1.2622549019607843e-06, "loss": 0.4625, "step": 610 }, { "epoch": 2.28, "grad_norm": 29.8828067779541, "learning_rate": 1.200980392156863e-06, "loss": 0.4622, "step": 620 }, { "epoch": 2.32, "grad_norm": 30.85140037536621, "learning_rate": 1.1397058823529413e-06, "loss": 0.4505, "step": 630 }, { "epoch": 2.35, "grad_norm": 58.30624008178711, "learning_rate": 1.0784313725490197e-06, "loss": 0.4761, "step": 640 }, { "epoch": 2.39, "grad_norm": 32.4758186340332, "learning_rate": 1.017156862745098e-06, "loss": 0.4103, "step": 650 }, { "epoch": 2.39, "eval_accuracy": 0.7683823529411765, "eval_f1_macro": 0.72109375, "eval_f1_micro": 0.7683823529411765, "eval_loss": 0.5125373601913452, "eval_runtime": 5.1343, "eval_samples_per_second": 211.91, "eval_steps_per_second": 6.622, "step": 650 }, { "epoch": 2.43, "grad_norm": 20.597454071044922, "learning_rate": 9.558823529411764e-07, "loss": 0.4986, "step": 660 }, { "epoch": 2.46, "grad_norm": 41.76366424560547, "learning_rate": 8.94607843137255e-07, "loss": 0.4589, "step": 670 }, { "epoch": 2.5, "grad_norm": 18.32976531982422, "learning_rate": 8.333333333333333e-07, "loss": 0.4294, "step": 680 }, { "epoch": 2.54, "grad_norm": 25.26702117919922, "learning_rate": 7.720588235294119e-07, "loss": 0.4579, "step": 690 }, { "epoch": 2.57, "grad_norm": 30.386232376098633, "learning_rate": 7.107843137254903e-07, "loss": 0.4191, "step": 700 }, { "epoch": 2.57, "eval_accuracy": 0.7683823529411765, "eval_f1_macro": 0.7226006191950465, "eval_f1_micro": 0.7683823529411765, "eval_loss": 0.5108427405357361, "eval_runtime": 5.1417, "eval_samples_per_second": 211.604, "eval_steps_per_second": 6.613, "step": 700 }, { "epoch": 2.61, "grad_norm": 21.484479904174805, "learning_rate": 6.495098039215687e-07, "loss": 0.4528, "step": 710 }, { "epoch": 2.65, "grad_norm": 29.71825408935547, "learning_rate": 5.882352941176471e-07, "loss": 0.4237, "step": 720 }, { "epoch": 2.68, "grad_norm": 29.686370849609375, "learning_rate": 5.269607843137256e-07, "loss": 0.4671, "step": 730 }, { "epoch": 2.72, "grad_norm": 29.55408477783203, "learning_rate": 4.6568627450980395e-07, "loss": 0.484, "step": 740 }, { "epoch": 2.76, "grad_norm": 43.04338836669922, "learning_rate": 4.044117647058824e-07, "loss": 0.5004, "step": 750 }, { "epoch": 2.76, "eval_accuracy": 0.7647058823529411, "eval_f1_macro": 0.717181071212435, "eval_f1_micro": 0.7647058823529411, "eval_loss": 0.5103544592857361, "eval_runtime": 5.1424, "eval_samples_per_second": 211.576, "eval_steps_per_second": 6.612, "step": 750 }, { "epoch": 2.79, "grad_norm": 37.452056884765625, "learning_rate": 3.4313725490196084e-07, "loss": 0.3917, "step": 760 }, { "epoch": 2.83, "grad_norm": 31.18016815185547, "learning_rate": 2.8186274509803926e-07, "loss": 0.4226, "step": 770 }, { "epoch": 2.87, "grad_norm": 41.16299819946289, "learning_rate": 2.2058823529411768e-07, "loss": 0.4242, "step": 780 }, { "epoch": 2.9, "grad_norm": 18.450733184814453, "learning_rate": 1.5931372549019607e-07, "loss": 0.4482, "step": 790 }, { "epoch": 2.94, "grad_norm": 37.13800811767578, "learning_rate": 9.803921568627452e-08, "loss": 0.4398, "step": 800 }, { "epoch": 2.94, "eval_accuracy": 0.7674632352941176, "eval_f1_macro": 0.7138339312799731, "eval_f1_micro": 0.7674632352941176, "eval_loss": 0.511359691619873, "eval_runtime": 5.1453, "eval_samples_per_second": 211.454, "eval_steps_per_second": 6.608, "step": 800 }, { "epoch": 2.98, "grad_norm": 28.77461814880371, "learning_rate": 3.6764705882352945e-08, "loss": 0.4237, "step": 810 }, { "epoch": 3.0, "step": 816, "total_flos": 5.048586286084915e+16, "train_loss": 0.5124810910692402, "train_runtime": 1553.1677, "train_samples_per_second": 16.804, "train_steps_per_second": 0.525 } ], "logging_steps": 10, "max_steps": 816, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "total_flos": 5.048586286084915e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }