{ "best_metric": 0.5038631558418274, "best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/Qwen/Qwen1.5_1.8B_twitter/checkpoint-250", "epoch": 3.0, "eval_steps": 50, "global_step": 816, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 109.95573425292969, "learning_rate": 4.938725490196079e-06, "loss": 1.1177, "step": 10 }, { "epoch": 0.07, "grad_norm": 117.97087860107422, "learning_rate": 4.8774509803921576e-06, "loss": 0.8448, "step": 20 }, { "epoch": 0.11, "grad_norm": 46.05353546142578, "learning_rate": 4.816176470588236e-06, "loss": 0.8301, "step": 30 }, { "epoch": 0.15, "grad_norm": 41.749080657958984, "learning_rate": 4.754901960784314e-06, "loss": 0.7069, "step": 40 }, { "epoch": 0.18, "grad_norm": 26.182971954345703, "learning_rate": 4.693627450980393e-06, "loss": 0.6585, "step": 50 }, { "epoch": 0.18, "eval_accuracy": 0.7123161764705882, "eval_f1_macro": 0.5810560943233977, "eval_f1_micro": 0.7123161764705882, "eval_loss": 0.6434972286224365, "eval_runtime": 2.8155, "eval_samples_per_second": 386.429, "eval_steps_per_second": 12.076, "step": 50 }, { "epoch": 0.22, "grad_norm": 118.7502212524414, "learning_rate": 4.632352941176471e-06, "loss": 0.6179, "step": 60 }, { "epoch": 0.26, "grad_norm": 34.24118423461914, "learning_rate": 4.571078431372549e-06, "loss": 0.6324, "step": 70 }, { "epoch": 0.29, "grad_norm": 92.65385437011719, "learning_rate": 4.509803921568628e-06, "loss": 0.5743, "step": 80 }, { "epoch": 0.33, "grad_norm": 44.45353317260742, "learning_rate": 4.448529411764706e-06, "loss": 0.4798, "step": 90 }, { "epoch": 0.37, "grad_norm": 35.91661071777344, "learning_rate": 4.3872549019607845e-06, "loss": 0.6396, "step": 100 }, { "epoch": 0.37, "eval_accuracy": 0.7297794117647058, "eval_f1_macro": 0.6997657235537935, "eval_f1_micro": 0.7297794117647058, "eval_loss": 0.6015912294387817, "eval_runtime": 2.832, "eval_samples_per_second": 384.18, "eval_steps_per_second": 12.006, "step": 100 }, { "epoch": 0.4, "grad_norm": 56.78458786010742, "learning_rate": 4.3259803921568635e-06, "loss": 0.5814, "step": 110 }, { "epoch": 0.44, "grad_norm": 53.924827575683594, "learning_rate": 4.264705882352942e-06, "loss": 0.5131, "step": 120 }, { "epoch": 0.48, "grad_norm": 74.93573760986328, "learning_rate": 4.20343137254902e-06, "loss": 0.5242, "step": 130 }, { "epoch": 0.51, "grad_norm": 26.097169876098633, "learning_rate": 4.142156862745099e-06, "loss": 0.5529, "step": 140 }, { "epoch": 0.55, "grad_norm": 78.16165161132812, "learning_rate": 4.080882352941177e-06, "loss": 0.5108, "step": 150 }, { "epoch": 0.55, "eval_accuracy": 0.7527573529411765, "eval_f1_macro": 0.6963299829940972, "eval_f1_micro": 0.7527573529411765, "eval_loss": 0.5226907134056091, "eval_runtime": 2.826, "eval_samples_per_second": 384.993, "eval_steps_per_second": 12.031, "step": 150 }, { "epoch": 0.59, "grad_norm": 19.04606056213379, "learning_rate": 4.019607843137255e-06, "loss": 0.5012, "step": 160 }, { "epoch": 0.62, "grad_norm": 127.43460845947266, "learning_rate": 3.958333333333333e-06, "loss": 0.5601, "step": 170 }, { "epoch": 0.66, "grad_norm": 73.45539093017578, "learning_rate": 3.897058823529412e-06, "loss": 0.6134, "step": 180 }, { "epoch": 0.7, "grad_norm": 59.42979049682617, "learning_rate": 3.8357843137254904e-06, "loss": 0.5447, "step": 190 }, { "epoch": 0.74, "grad_norm": 22.45537757873535, "learning_rate": 3.774509803921569e-06, "loss": 0.5065, "step": 200 }, { "epoch": 0.74, "eval_accuracy": 0.7417279411764706, "eval_f1_macro": 0.6346563132227484, "eval_f1_micro": 0.7417279411764706, "eval_loss": 0.5502642393112183, "eval_runtime": 2.8852, "eval_samples_per_second": 377.094, "eval_steps_per_second": 11.784, "step": 200 }, { "epoch": 0.77, "grad_norm": 50.938880920410156, "learning_rate": 3.7132352941176476e-06, "loss": 0.4589, "step": 210 }, { "epoch": 0.81, "grad_norm": 84.68132781982422, "learning_rate": 3.6519607843137257e-06, "loss": 0.5403, "step": 220 }, { "epoch": 0.85, "grad_norm": 27.052024841308594, "learning_rate": 3.5906862745098043e-06, "loss": 0.4618, "step": 230 }, { "epoch": 0.88, "grad_norm": 23.497787475585938, "learning_rate": 3.529411764705883e-06, "loss": 0.452, "step": 240 }, { "epoch": 0.92, "grad_norm": 15.185086250305176, "learning_rate": 3.468137254901961e-06, "loss": 0.4883, "step": 250 }, { "epoch": 0.92, "eval_accuracy": 0.7775735294117647, "eval_f1_macro": 0.7420002194942226, "eval_f1_micro": 0.7775735294117647, "eval_loss": 0.5038631558418274, "eval_runtime": 2.8371, "eval_samples_per_second": 383.494, "eval_steps_per_second": 11.984, "step": 250 }, { "epoch": 0.96, "grad_norm": 24.239835739135742, "learning_rate": 3.4068627450980396e-06, "loss": 0.4325, "step": 260 }, { "epoch": 0.99, "grad_norm": 13.703453063964844, "learning_rate": 3.3455882352941178e-06, "loss": 0.5193, "step": 270 }, { "epoch": 1.03, "grad_norm": 92.7187728881836, "learning_rate": 3.2843137254901964e-06, "loss": 0.4147, "step": 280 }, { "epoch": 1.07, "grad_norm": 38.43824768066406, "learning_rate": 3.223039215686275e-06, "loss": 0.3275, "step": 290 }, { "epoch": 1.1, "grad_norm": 11.464463233947754, "learning_rate": 3.161764705882353e-06, "loss": 0.3296, "step": 300 }, { "epoch": 1.1, "eval_accuracy": 0.7729779411764706, "eval_f1_macro": 0.730720863982653, "eval_f1_micro": 0.7729779411764706, "eval_loss": 0.5249598026275635, "eval_runtime": 2.8316, "eval_samples_per_second": 384.236, "eval_steps_per_second": 12.007, "step": 300 }, { "epoch": 1.14, "grad_norm": 18.103742599487305, "learning_rate": 3.1004901960784317e-06, "loss": 0.32, "step": 310 }, { "epoch": 1.18, "grad_norm": 48.70182800292969, "learning_rate": 3.03921568627451e-06, "loss": 0.3033, "step": 320 }, { "epoch": 1.21, "grad_norm": 19.211103439331055, "learning_rate": 2.9779411764705884e-06, "loss": 0.371, "step": 330 }, { "epoch": 1.25, "grad_norm": 30.80494499206543, "learning_rate": 2.916666666666667e-06, "loss": 0.3158, "step": 340 }, { "epoch": 1.29, "grad_norm": 22.932558059692383, "learning_rate": 2.855392156862745e-06, "loss": 0.322, "step": 350 }, { "epoch": 1.29, "eval_accuracy": 0.7720588235294118, "eval_f1_macro": 0.7422500391653388, "eval_f1_micro": 0.7720588235294118, "eval_loss": 0.5509535670280457, "eval_runtime": 2.8424, "eval_samples_per_second": 382.778, "eval_steps_per_second": 11.962, "step": 350 }, { "epoch": 1.32, "grad_norm": 12.943411827087402, "learning_rate": 2.7941176470588237e-06, "loss": 0.3103, "step": 360 }, { "epoch": 1.36, "grad_norm": 35.10004425048828, "learning_rate": 2.732843137254902e-06, "loss": 0.2967, "step": 370 }, { "epoch": 1.4, "grad_norm": 21.274372100830078, "learning_rate": 2.6715686274509804e-06, "loss": 0.3739, "step": 380 }, { "epoch": 1.43, "grad_norm": 33.91293716430664, "learning_rate": 2.610294117647059e-06, "loss": 0.3583, "step": 390 }, { "epoch": 1.47, "grad_norm": 33.361366271972656, "learning_rate": 2.549019607843137e-06, "loss": 0.3287, "step": 400 }, { "epoch": 1.47, "eval_accuracy": 0.7582720588235294, "eval_f1_macro": 0.6932231064719864, "eval_f1_micro": 0.7582720588235294, "eval_loss": 0.539191722869873, "eval_runtime": 2.8418, "eval_samples_per_second": 382.862, "eval_steps_per_second": 11.964, "step": 400 }, { "epoch": 1.51, "grad_norm": 41.59482192993164, "learning_rate": 2.4877450980392158e-06, "loss": 0.2896, "step": 410 }, { "epoch": 1.54, "grad_norm": 57.865882873535156, "learning_rate": 2.4264705882352943e-06, "loss": 0.3092, "step": 420 }, { "epoch": 1.58, "grad_norm": 37.957183837890625, "learning_rate": 2.3651960784313725e-06, "loss": 0.3175, "step": 430 }, { "epoch": 1.62, "grad_norm": 30.82972526550293, "learning_rate": 2.303921568627451e-06, "loss": 0.3005, "step": 440 }, { "epoch": 1.65, "grad_norm": 34.46055603027344, "learning_rate": 2.2426470588235296e-06, "loss": 0.3097, "step": 450 }, { "epoch": 1.65, "eval_accuracy": 0.7628676470588235, "eval_f1_macro": 0.7222644376899696, "eval_f1_micro": 0.7628676470588235, "eval_loss": 0.5631462335586548, "eval_runtime": 2.8377, "eval_samples_per_second": 383.405, "eval_steps_per_second": 11.981, "step": 450 }, { "epoch": 1.69, "grad_norm": 25.395221710205078, "learning_rate": 2.1813725490196082e-06, "loss": 0.3284, "step": 460 }, { "epoch": 1.73, "grad_norm": 13.454970359802246, "learning_rate": 2.1200980392156864e-06, "loss": 0.3399, "step": 470 }, { "epoch": 1.76, "grad_norm": 14.682934761047363, "learning_rate": 2.058823529411765e-06, "loss": 0.2575, "step": 480 }, { "epoch": 1.8, "grad_norm": 8.564711570739746, "learning_rate": 1.9975490196078435e-06, "loss": 0.3048, "step": 490 }, { "epoch": 1.84, "grad_norm": 30.56090545654297, "learning_rate": 1.9362745098039217e-06, "loss": 0.3397, "step": 500 }, { "epoch": 1.84, "eval_accuracy": 0.7674632352941176, "eval_f1_macro": 0.7333806007808814, "eval_f1_micro": 0.7674632352941176, "eval_loss": 0.5669376254081726, "eval_runtime": 2.8353, "eval_samples_per_second": 383.737, "eval_steps_per_second": 11.992, "step": 500 }, { "epoch": 1.88, "grad_norm": 50.637630462646484, "learning_rate": 1.8750000000000003e-06, "loss": 0.3389, "step": 510 }, { "epoch": 1.91, "grad_norm": 69.52845001220703, "learning_rate": 1.8137254901960786e-06, "loss": 0.365, "step": 520 }, { "epoch": 1.95, "grad_norm": 19.709930419921875, "learning_rate": 1.752450980392157e-06, "loss": 0.3561, "step": 530 }, { "epoch": 1.99, "grad_norm": 22.103239059448242, "learning_rate": 1.6911764705882356e-06, "loss": 0.3147, "step": 540 }, { "epoch": 2.02, "grad_norm": 18.630016326904297, "learning_rate": 1.629901960784314e-06, "loss": 0.2618, "step": 550 }, { "epoch": 2.02, "eval_accuracy": 0.75, "eval_f1_macro": 0.6870389170896785, "eval_f1_micro": 0.75, "eval_loss": 0.5891400575637817, "eval_runtime": 3.0028, "eval_samples_per_second": 362.323, "eval_steps_per_second": 11.323, "step": 550 }, { "epoch": 2.06, "grad_norm": 20.225698471069336, "learning_rate": 1.5686274509803923e-06, "loss": 0.2155, "step": 560 }, { "epoch": 2.1, "grad_norm": 20.72734832763672, "learning_rate": 1.5073529411764707e-06, "loss": 0.1837, "step": 570 }, { "epoch": 2.13, "grad_norm": 12.758101463317871, "learning_rate": 1.4460784313725492e-06, "loss": 0.1936, "step": 580 }, { "epoch": 2.17, "grad_norm": 20.31243896484375, "learning_rate": 1.3848039215686276e-06, "loss": 0.1668, "step": 590 }, { "epoch": 2.21, "grad_norm": 6.187927722930908, "learning_rate": 1.323529411764706e-06, "loss": 0.1745, "step": 600 }, { "epoch": 2.21, "eval_accuracy": 0.7582720588235294, "eval_f1_macro": 0.7122790012056079, "eval_f1_micro": 0.7582720588235294, "eval_loss": 0.6399500370025635, "eval_runtime": 2.8351, "eval_samples_per_second": 383.754, "eval_steps_per_second": 11.992, "step": 600 }, { "epoch": 2.24, "grad_norm": 16.426952362060547, "learning_rate": 1.2622549019607843e-06, "loss": 0.1873, "step": 610 }, { "epoch": 2.28, "grad_norm": 14.441506385803223, "learning_rate": 1.200980392156863e-06, "loss": 0.1318, "step": 620 }, { "epoch": 2.32, "grad_norm": 25.300006866455078, "learning_rate": 1.1397058823529413e-06, "loss": 0.1468, "step": 630 }, { "epoch": 2.35, "grad_norm": 5.833555221557617, "learning_rate": 1.0784313725490197e-06, "loss": 0.1969, "step": 640 }, { "epoch": 2.39, "grad_norm": 14.262299537658691, "learning_rate": 1.017156862745098e-06, "loss": 0.1572, "step": 650 }, { "epoch": 2.39, "eval_accuracy": 0.7518382352941176, "eval_f1_macro": 0.6966693100713719, "eval_f1_micro": 0.7518382352941176, "eval_loss": 0.6694048643112183, "eval_runtime": 2.8384, "eval_samples_per_second": 383.315, "eval_steps_per_second": 11.979, "step": 650 }, { "epoch": 2.43, "grad_norm": 9.928728103637695, "learning_rate": 9.558823529411764e-07, "loss": 0.1654, "step": 660 }, { "epoch": 2.46, "grad_norm": 36.1452522277832, "learning_rate": 8.94607843137255e-07, "loss": 0.181, "step": 670 }, { "epoch": 2.5, "grad_norm": 14.285146713256836, "learning_rate": 8.333333333333333e-07, "loss": 0.1702, "step": 680 }, { "epoch": 2.54, "grad_norm": 22.802335739135742, "learning_rate": 7.720588235294119e-07, "loss": 0.1375, "step": 690 }, { "epoch": 2.57, "grad_norm": 25.91615867614746, "learning_rate": 7.107843137254903e-07, "loss": 0.1804, "step": 700 }, { "epoch": 2.57, "eval_accuracy": 0.7610294117647058, "eval_f1_macro": 0.7172783241384825, "eval_f1_micro": 0.7610294117647058, "eval_loss": 0.6869542598724365, "eval_runtime": 2.8367, "eval_samples_per_second": 383.549, "eval_steps_per_second": 11.986, "step": 700 }, { "epoch": 2.61, "grad_norm": 21.840639114379883, "learning_rate": 6.495098039215687e-07, "loss": 0.1811, "step": 710 }, { "epoch": 2.65, "grad_norm": 12.820947647094727, "learning_rate": 5.882352941176471e-07, "loss": 0.1636, "step": 720 }, { "epoch": 2.68, "grad_norm": 14.430222511291504, "learning_rate": 5.269607843137256e-07, "loss": 0.1478, "step": 730 }, { "epoch": 2.72, "grad_norm": 15.748844146728516, "learning_rate": 4.6568627450980395e-07, "loss": 0.1738, "step": 740 }, { "epoch": 2.76, "grad_norm": 21.588491439819336, "learning_rate": 4.044117647058824e-07, "loss": 0.1817, "step": 750 }, { "epoch": 2.76, "eval_accuracy": 0.7536764705882353, "eval_f1_macro": 0.7044581029142274, "eval_f1_micro": 0.7536764705882353, "eval_loss": 0.6656494140625, "eval_runtime": 2.8877, "eval_samples_per_second": 376.77, "eval_steps_per_second": 11.774, "step": 750 }, { "epoch": 2.79, "grad_norm": 34.946441650390625, "learning_rate": 3.4313725490196084e-07, "loss": 0.1558, "step": 760 }, { "epoch": 2.83, "grad_norm": 14.45042610168457, "learning_rate": 2.8186274509803926e-07, "loss": 0.1671, "step": 770 }, { "epoch": 2.87, "grad_norm": 22.017681121826172, "learning_rate": 2.2058823529411768e-07, "loss": 0.16, "step": 780 }, { "epoch": 2.9, "grad_norm": 40.37516403198242, "learning_rate": 1.5931372549019607e-07, "loss": 0.2156, "step": 790 }, { "epoch": 2.94, "grad_norm": 21.41912841796875, "learning_rate": 9.803921568627452e-08, "loss": 0.1984, "step": 800 }, { "epoch": 2.94, "eval_accuracy": 0.7518382352941176, "eval_f1_macro": 0.6949038473522191, "eval_f1_micro": 0.7518382352941176, "eval_loss": 0.6783303618431091, "eval_runtime": 2.8354, "eval_samples_per_second": 383.725, "eval_steps_per_second": 11.991, "step": 800 }, { "epoch": 2.98, "grad_norm": 22.50458335876465, "learning_rate": 3.6764705882352945e-08, "loss": 0.1742, "step": 810 }, { "epoch": 3.0, "step": 816, "total_flos": 2.435566082929459e+16, "train_loss": 0.3625207788803998, "train_runtime": 885.2091, "train_samples_per_second": 29.485, "train_steps_per_second": 0.922 } ], "logging_steps": 10, "max_steps": 816, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "total_flos": 2.435566082929459e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }