{ "best_metric": 0.5101504325866699, "best_model_checkpoint": "./vit-beta1-0.85/checkpoint-5778", "epoch": 28.0, "eval_steps": 500, "global_step": 8988, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 23.742420196533203, "learning_rate": 1.8291979226774382e-05, "loss": 1.763, "step": 321 }, { "epoch": 1.0, "eval_accuracy": 0.6952149791955617, "eval_f1": 0.6201672138002952, "eval_loss": 0.9504804015159607, "eval_precision": 0.6345772764957709, "eval_recall": 0.6952149791955617, "eval_runtime": 23.0549, "eval_samples_per_second": 125.093, "eval_steps_per_second": 15.658, "step": 321 }, { "epoch": 2.0, "grad_norm": 20.97748374938965, "learning_rate": 3.681477207155222e-05, "loss": 1.149, "step": 642 }, { "epoch": 2.0, "eval_accuracy": 0.7444521497919556, "eval_f1": 0.7229775407881395, "eval_loss": 0.714677631855011, "eval_precision": 0.74566856205471, "eval_recall": 0.7444521497919556, "eval_runtime": 23.0018, "eval_samples_per_second": 125.381, "eval_steps_per_second": 15.694, "step": 642 }, { "epoch": 3.0, "grad_norm": 8.279248237609863, "learning_rate": 5.5337564916330066e-05, "loss": 1.0452, "step": 963 }, { "epoch": 3.0, "eval_accuracy": 0.7572815533980582, "eval_f1": 0.7320657787389255, "eval_loss": 0.6250461935997009, "eval_precision": 0.7591347407203577, "eval_recall": 0.7572815533980582, "eval_runtime": 23.1015, "eval_samples_per_second": 124.841, "eval_steps_per_second": 15.627, "step": 963 }, { "epoch": 4.0, "grad_norm": 41.02663803100586, "learning_rate": 7.386035776110792e-05, "loss": 1.0048, "step": 1284 }, { "epoch": 4.0, "eval_accuracy": 0.7784327323162274, "eval_f1": 0.7737493702586475, "eval_loss": 0.5614432692527771, "eval_precision": 0.779177150378666, "eval_recall": 0.7784327323162274, "eval_runtime": 23.126, "eval_samples_per_second": 124.708, "eval_steps_per_second": 15.61, "step": 1284 }, { "epoch": 5.0, "grad_norm": 7.507669448852539, "learning_rate": 9.238315060588575e-05, "loss": 0.931, "step": 1605 }, { "epoch": 5.0, "eval_accuracy": 0.7739251040221914, "eval_f1": 0.7822626472987246, "eval_loss": 0.6081557869911194, "eval_precision": 0.8019901444431083, "eval_recall": 0.7739251040221914, "eval_runtime": 22.3445, "eval_samples_per_second": 129.07, "eval_steps_per_second": 16.156, "step": 1605 }, { "epoch": 6.0, "grad_norm": 8.154308319091797, "learning_rate": 9.984715255878176e-05, "loss": 0.9808, "step": 1926 }, { "epoch": 6.0, "eval_accuracy": 0.7981969486823856, "eval_f1": 0.7950945949242283, "eval_loss": 0.554189145565033, "eval_precision": 0.7983952215842433, "eval_recall": 0.7981969486823856, "eval_runtime": 22.8586, "eval_samples_per_second": 126.167, "eval_steps_per_second": 15.793, "step": 1926 }, { "epoch": 7.0, "grad_norm": 11.704543113708496, "learning_rate": 9.889061131437471e-05, "loss": 0.8908, "step": 2247 }, { "epoch": 7.0, "eval_accuracy": 0.7545076282940361, "eval_f1": 0.7708612112738275, "eval_loss": 0.5956947207450867, "eval_precision": 0.820205289282276, "eval_recall": 0.7545076282940361, "eval_runtime": 23.7447, "eval_samples_per_second": 121.459, "eval_steps_per_second": 15.203, "step": 2247 }, { "epoch": 8.0, "grad_norm": 3.4443271160125732, "learning_rate": 9.707265436104638e-05, "loss": 0.7747, "step": 2568 }, { "epoch": 8.0, "eval_accuracy": 0.7694174757281553, "eval_f1": 0.7836405012298657, "eval_loss": 0.5766288638114929, "eval_precision": 0.8155227380188221, "eval_recall": 0.7694174757281553, "eval_runtime": 23.1168, "eval_samples_per_second": 124.758, "eval_steps_per_second": 15.616, "step": 2568 }, { "epoch": 9.0, "grad_norm": 7.433899879455566, "learning_rate": 9.44253127296151e-05, "loss": 0.741, "step": 2889 }, { "epoch": 9.0, "eval_accuracy": 0.7995839112343966, "eval_f1": 0.804684275131234, "eval_loss": 0.5430790185928345, "eval_precision": 0.8189836827516774, "eval_recall": 0.7995839112343966, "eval_runtime": 23.7114, "eval_samples_per_second": 121.629, "eval_steps_per_second": 15.225, "step": 2889 }, { "epoch": 10.0, "grad_norm": 12.33841609954834, "learning_rate": 9.099523058358976e-05, "loss": 0.7179, "step": 3210 }, { "epoch": 10.0, "eval_accuracy": 0.7773925104022191, "eval_f1": 0.7904480536083208, "eval_loss": 0.5864874720573425, "eval_precision": 0.831331916729736, "eval_recall": 0.7773925104022191, "eval_runtime": 23.035, "eval_samples_per_second": 125.201, "eval_steps_per_second": 15.672, "step": 3210 }, { "epoch": 11.0, "grad_norm": 21.430646896362305, "learning_rate": 8.684284338417735e-05, "loss": 0.6102, "step": 3531 }, { "epoch": 11.0, "eval_accuracy": 0.8096393897364771, "eval_f1": 0.8180305563212311, "eval_loss": 0.5288417935371399, "eval_precision": 0.8361295244369513, "eval_recall": 0.8096393897364771, "eval_runtime": 23.7358, "eval_samples_per_second": 121.504, "eval_steps_per_second": 15.209, "step": 3531 }, { "epoch": 12.0, "grad_norm": 75.45726013183594, "learning_rate": 8.204131306302357e-05, "loss": 0.574, "step": 3852 }, { "epoch": 12.0, "eval_accuracy": 0.7995839112343966, "eval_f1": 0.8096217027832384, "eval_loss": 0.599109947681427, "eval_precision": 0.8332319256993149, "eval_recall": 0.7995839112343966, "eval_runtime": 23.369, "eval_samples_per_second": 123.411, "eval_steps_per_second": 15.448, "step": 3852 }, { "epoch": 13.0, "grad_norm": 3.8113694190979004, "learning_rate": 7.667523896413962e-05, "loss": 0.4515, "step": 4173 }, { "epoch": 13.0, "eval_accuracy": 0.8370319001386962, "eval_f1": 0.8292572001875775, "eval_loss": 0.5889743566513062, "eval_precision": 0.8333558129412982, "eval_recall": 0.8370319001386962, "eval_runtime": 23.6944, "eval_samples_per_second": 121.717, "eval_steps_per_second": 15.236, "step": 4173 }, { "epoch": 14.0, "grad_norm": 5.727066516876221, "learning_rate": 7.083916726724684e-05, "loss": 0.4629, "step": 4494 }, { "epoch": 14.0, "eval_accuracy": 0.8120665742024965, "eval_f1": 0.8205347890470436, "eval_loss": 0.5573027729988098, "eval_precision": 0.8462670503631797, "eval_recall": 0.8120665742024965, "eval_runtime": 24.8143, "eval_samples_per_second": 116.223, "eval_steps_per_second": 14.548, "step": 4494 }, { "epoch": 15.0, "grad_norm": 4.605546474456787, "learning_rate": 6.463592515537568e-05, "loss": 0.3927, "step": 4815 }, { "epoch": 15.0, "eval_accuracy": 0.8332177531206657, "eval_f1": 0.835722282090202, "eval_loss": 0.5279321670532227, "eval_precision": 0.8506448765217454, "eval_recall": 0.8332177531206657, "eval_runtime": 25.6733, "eval_samples_per_second": 112.335, "eval_steps_per_second": 14.061, "step": 4815 }, { "epoch": 16.0, "grad_norm": 5.051961898803711, "learning_rate": 5.8174809077430184e-05, "loss": 0.3535, "step": 5136 }, { "epoch": 16.0, "eval_accuracy": 0.8356449375866851, "eval_f1": 0.8404866875551775, "eval_loss": 0.5363761782646179, "eval_precision": 0.8494425873765888, "eval_recall": 0.8356449375866851, "eval_runtime": 23.6271, "eval_samples_per_second": 122.063, "eval_steps_per_second": 15.279, "step": 5136 }, { "epoch": 17.0, "grad_norm": 28.548500061035156, "learning_rate": 5.156965902716534e-05, "loss": 0.2635, "step": 5457 }, { "epoch": 17.0, "eval_accuracy": 0.8547156726768377, "eval_f1": 0.853209116290087, "eval_loss": 0.5475490093231201, "eval_precision": 0.8625841938606562, "eval_recall": 0.8547156726768377, "eval_runtime": 25.2986, "eval_samples_per_second": 113.999, "eval_steps_per_second": 14.27, "step": 5457 }, { "epoch": 18.0, "grad_norm": 16.80175018310547, "learning_rate": 4.493685276832998e-05, "loss": 0.2493, "step": 5778 }, { "epoch": 18.0, "eval_accuracy": 0.855755894590846, "eval_f1": 0.8553441136144744, "eval_loss": 0.5101504325866699, "eval_precision": 0.8568218210572909, "eval_recall": 0.855755894590846, "eval_runtime": 23.831, "eval_samples_per_second": 121.019, "eval_steps_per_second": 15.148, "step": 5778 }, { "epoch": 19.0, "grad_norm": 0.11270666122436523, "learning_rate": 3.839325534621579e-05, "loss": 0.2125, "step": 6099 }, { "epoch": 19.0, "eval_accuracy": 0.8328710124826629, "eval_f1": 0.8418273768641612, "eval_loss": 0.6120281219482422, "eval_precision": 0.8622992078235902, "eval_recall": 0.8328710124826629, "eval_runtime": 23.1564, "eval_samples_per_second": 124.544, "eval_steps_per_second": 15.59, "step": 6099 }, { "epoch": 20.0, "grad_norm": 0.06091728433966637, "learning_rate": 3.205416001367289e-05, "loss": 0.2179, "step": 6420 }, { "epoch": 20.0, "eval_accuracy": 0.8567961165048543, "eval_f1": 0.85627072801028, "eval_loss": 0.5720838904380798, "eval_precision": 0.8563063857199564, "eval_recall": 0.8567961165048543, "eval_runtime": 23.124, "eval_samples_per_second": 124.719, "eval_steps_per_second": 15.611, "step": 6420 }, { "epoch": 21.0, "grad_norm": 2.7066216468811035, "learning_rate": 2.6067582030088143e-05, "loss": 0.1598, "step": 6741 }, { "epoch": 21.0, "eval_accuracy": 0.8651178918169209, "eval_f1": 0.8632510833215378, "eval_loss": 0.5503374338150024, "eval_precision": 0.8623213572654986, "eval_recall": 0.8651178918169209, "eval_runtime": 23.2584, "eval_samples_per_second": 123.998, "eval_steps_per_second": 15.521, "step": 6741 }, { "epoch": 22.0, "grad_norm": 37.88532257080078, "learning_rate": 2.0464044677195966e-05, "loss": 0.1194, "step": 7062 }, { "epoch": 22.0, "eval_accuracy": 0.8678918169209431, "eval_f1": 0.8668972179399497, "eval_loss": 0.5829476714134216, "eval_precision": 0.8672496533540811, "eval_recall": 0.8678918169209431, "eval_runtime": 23.2515, "eval_samples_per_second": 124.035, "eval_steps_per_second": 15.526, "step": 7062 }, { "epoch": 23.0, "grad_norm": 0.11761299520730972, "learning_rate": 1.538090858763548e-05, "loss": 0.1245, "step": 7383 }, { "epoch": 23.0, "eval_accuracy": 0.8682385575589459, "eval_f1": 0.8629371653200789, "eval_loss": 0.6137722134590149, "eval_precision": 0.8631764321501026, "eval_recall": 0.8682385575589459, "eval_runtime": 24.6364, "eval_samples_per_second": 117.063, "eval_steps_per_second": 14.653, "step": 7383 }, { "epoch": 24.0, "grad_norm": 0.2003249078989029, "learning_rate": 1.0907734786732799e-05, "loss": 0.1239, "step": 7704 }, { "epoch": 24.0, "eval_accuracy": 0.8730929264909847, "eval_f1": 0.8694766569722234, "eval_loss": 0.6135957837104797, "eval_precision": 0.8702946096941098, "eval_recall": 0.8730929264909847, "eval_runtime": 23.1398, "eval_samples_per_second": 124.633, "eval_steps_per_second": 15.601, "step": 7704 }, { "epoch": 25.0, "grad_norm": 0.018545789644122124, "learning_rate": 7.123337224185811e-06, "loss": 0.1159, "step": 8025 }, { "epoch": 25.0, "eval_accuracy": 0.8751733703190014, "eval_f1": 0.8726121524469985, "eval_loss": 0.5931476354598999, "eval_precision": 0.8724487380217695, "eval_recall": 0.8751733703190014, "eval_runtime": 23.8052, "eval_samples_per_second": 121.15, "eval_steps_per_second": 15.165, "step": 8025 }, { "epoch": 26.0, "grad_norm": 13.244468688964844, "learning_rate": 4.094394131694684e-06, "loss": 0.089, "step": 8346 }, { "epoch": 26.0, "eval_accuracy": 0.8776005547850209, "eval_f1": 0.8749752761914638, "eval_loss": 0.5846670866012573, "eval_precision": 0.8742768994280256, "eval_recall": 0.8776005547850209, "eval_runtime": 23.0803, "eval_samples_per_second": 124.955, "eval_steps_per_second": 15.641, "step": 8346 }, { "epoch": 27.0, "grad_norm": 0.4510207176208496, "learning_rate": 1.8742732027931087e-06, "loss": 0.1123, "step": 8667 }, { "epoch": 27.0, "eval_accuracy": 0.8751733703190014, "eval_f1": 0.8718954079920697, "eval_loss": 0.594070553779602, "eval_precision": 0.8710238593528318, "eval_recall": 0.8751733703190014, "eval_runtime": 22.9979, "eval_samples_per_second": 125.403, "eval_steps_per_second": 15.697, "step": 8667 }, { "epoch": 28.0, "grad_norm": 0.05787363648414612, "learning_rate": 5.020912943263345e-07, "loss": 0.0779, "step": 8988 }, { "epoch": 28.0, "eval_accuracy": 0.8765603328710125, "eval_f1": 0.8729435556323918, "eval_loss": 0.6037870049476624, "eval_precision": 0.872156471246016, "eval_recall": 0.8765603328710125, "eval_runtime": 24.3339, "eval_samples_per_second": 118.518, "eval_steps_per_second": 14.835, "step": 8988 }, { "epoch": 28.0, "step": 8988, "total_flos": 1.1127108458244538e+19, "train_loss": 0.5253127112513815, "train_runtime": 3374.2787, "train_samples_per_second": 151.973, "train_steps_per_second": 9.513 } ], "logging_steps": 500, "max_steps": 32100, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "total_flos": 1.1127108458244538e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }