{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.029848363926576218, "eval_steps": 17, "global_step": 187, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001596169193934557, "eval_loss": 4.4704790115356445, "eval_runtime": 251.6298, "eval_samples_per_second": 41.935, "eval_steps_per_second": 5.242, "step": 1 }, { "epoch": 0.00047885075818036713, "grad_norm": 15.91522216796875, "learning_rate": 3e-05, "loss": 17.5681, "step": 3 }, { "epoch": 0.0009577015163607343, "grad_norm": 15.344284057617188, "learning_rate": 6e-05, "loss": 17.9003, "step": 6 }, { "epoch": 0.0014365522745411015, "grad_norm": 15.565232276916504, "learning_rate": 9e-05, "loss": 16.1959, "step": 9 }, { "epoch": 0.0019154030327214685, "grad_norm": 11.516205787658691, "learning_rate": 9.997266286704631e-05, "loss": 13.8765, "step": 12 }, { "epoch": 0.0023942537909018356, "grad_norm": 10.986926078796387, "learning_rate": 9.98292246503335e-05, "loss": 13.7128, "step": 15 }, { "epoch": 0.002713487629688747, "eval_loss": 3.3633768558502197, "eval_runtime": 251.6503, "eval_samples_per_second": 41.931, "eval_steps_per_second": 5.241, "step": 17 }, { "epoch": 0.002873104549082203, "grad_norm": 14.424678802490234, "learning_rate": 9.956320346634876e-05, "loss": 13.9318, "step": 18 }, { "epoch": 0.0033519553072625698, "grad_norm": 10.257959365844727, "learning_rate": 9.917525374361912e-05, "loss": 12.9712, "step": 21 }, { "epoch": 0.003830806065442937, "grad_norm": 9.352620124816895, "learning_rate": 9.86663298624003e-05, "loss": 13.191, "step": 24 }, { "epoch": 0.004309656823623304, "grad_norm": 9.2288236618042, "learning_rate": 9.803768380684242e-05, "loss": 12.3269, "step": 27 }, { "epoch": 0.004788507581803671, "grad_norm": 8.8008394241333, "learning_rate": 9.729086208503174e-05, "loss": 11.092, "step": 30 }, { "epoch": 0.005267358339984038, "grad_norm": 8.954124450683594, "learning_rate": 9.642770192448536e-05, "loss": 12.0332, "step": 33 }, { "epoch": 0.005426975259377494, "eval_loss": 2.9837427139282227, "eval_runtime": 251.8703, "eval_samples_per_second": 41.895, "eval_steps_per_second": 5.237, "step": 34 }, { "epoch": 0.005746209098164406, "grad_norm": 8.329032897949219, "learning_rate": 9.545032675245813e-05, "loss": 11.7558, "step": 36 }, { "epoch": 0.006225059856344773, "grad_norm": 8.299912452697754, "learning_rate": 9.43611409721806e-05, "loss": 11.596, "step": 39 }, { "epoch": 0.0067039106145251395, "grad_norm": 7.866604328155518, "learning_rate": 9.316282404787871e-05, "loss": 11.5052, "step": 42 }, { "epoch": 0.007182761372705506, "grad_norm": 9.483307838439941, "learning_rate": 9.185832391312644e-05, "loss": 11.6852, "step": 45 }, { "epoch": 0.007661612130885874, "grad_norm": 8.992164611816406, "learning_rate": 9.045084971874738e-05, "loss": 11.2905, "step": 48 }, { "epoch": 0.008140462889066242, "grad_norm": 9.365238189697266, "learning_rate": 8.894386393810563e-05, "loss": 11.6957, "step": 51 }, { "epoch": 0.008140462889066242, "eval_loss": 2.849066734313965, "eval_runtime": 251.7718, "eval_samples_per_second": 41.911, "eval_steps_per_second": 5.239, "step": 51 }, { "epoch": 0.008619313647246609, "grad_norm": 8.924206733703613, "learning_rate": 8.73410738492077e-05, "loss": 11.8827, "step": 54 }, { "epoch": 0.009098164405426976, "grad_norm": 8.531525611877441, "learning_rate": 8.564642241456986e-05, "loss": 11.1439, "step": 57 }, { "epoch": 0.009577015163607342, "grad_norm": 8.676018714904785, "learning_rate": 
8.386407858128706e-05, "loss": 10.6807, "step": 60 }, { "epoch": 0.01005586592178771, "grad_norm": 9.389708518981934, "learning_rate": 8.199842702516583e-05, "loss": 10.6687, "step": 63 }, { "epoch": 0.010534716679968076, "grad_norm": 8.772107124328613, "learning_rate": 8.005405736415126e-05, "loss": 11.4398, "step": 66 }, { "epoch": 0.010853950518754988, "eval_loss": 2.7779133319854736, "eval_runtime": 251.7336, "eval_samples_per_second": 41.917, "eval_steps_per_second": 5.24, "step": 68 }, { "epoch": 0.011013567438148443, "grad_norm": 8.75489616394043, "learning_rate": 7.803575286758364e-05, "loss": 11.2171, "step": 69 }, { "epoch": 0.011492418196328812, "grad_norm": 10.025832176208496, "learning_rate": 7.594847868906076e-05, "loss": 11.3749, "step": 72 }, { "epoch": 0.011971268954509178, "grad_norm": 10.174299240112305, "learning_rate": 7.379736965185368e-05, "loss": 10.8346, "step": 75 }, { "epoch": 0.012450119712689545, "grad_norm": 9.833930015563965, "learning_rate": 7.158771761692464e-05, "loss": 11.1372, "step": 78 }, { "epoch": 0.012928970470869912, "grad_norm": 10.182357788085938, "learning_rate": 6.932495846462261e-05, "loss": 10.8957, "step": 81 }, { "epoch": 0.013407821229050279, "grad_norm": 11.759332656860352, "learning_rate": 6.701465872208216e-05, "loss": 11.0684, "step": 84 }, { "epoch": 0.013567438148443736, "eval_loss": 2.715846300125122, "eval_runtime": 251.8705, "eval_samples_per_second": 41.895, "eval_steps_per_second": 5.237, "step": 85 }, { "epoch": 0.013886671987230646, "grad_norm": 10.804072380065918, "learning_rate": 6.466250186922325e-05, "loss": 10.9534, "step": 87 }, { "epoch": 0.014365522745411013, "grad_norm": 9.92498779296875, "learning_rate": 6.227427435703997e-05, "loss": 10.2133, "step": 90 }, { "epoch": 0.014844373503591381, "grad_norm": 11.183781623840332, "learning_rate": 5.985585137257401e-05, "loss": 10.8727, "step": 93 }, { "epoch": 0.015323224261771748, "grad_norm": 9.482535362243652, "learning_rate": 5.74131823855921e-05, "loss": 10.364, "step": 96 }, { "epoch": 0.015802075019952115, "grad_norm": 10.478203773498535, "learning_rate": 5.495227651252315e-05, "loss": 10.487, "step": 99 }, { "epoch": 0.016280925778132484, "grad_norm": 9.559563636779785, "learning_rate": 5.247918773366112e-05, "loss": 11.0371, "step": 102 }, { "epoch": 0.016280925778132484, "eval_loss": 2.6764745712280273, "eval_runtime": 251.6353, "eval_samples_per_second": 41.934, "eval_steps_per_second": 5.242, "step": 102 }, { "epoch": 0.01675977653631285, "grad_norm": 9.738425254821777, "learning_rate": 5e-05, "loss": 11.066, "step": 105 }, { "epoch": 0.017238627294493217, "grad_norm": 9.256367683410645, "learning_rate": 4.7520812266338885e-05, "loss": 10.5407, "step": 108 }, { "epoch": 0.017717478052673583, "grad_norm": 10.132784843444824, "learning_rate": 4.504772348747687e-05, "loss": 11.1067, "step": 111 }, { "epoch": 0.01819632881085395, "grad_norm": 11.10916519165039, "learning_rate": 4.2586817614407895e-05, "loss": 10.6358, "step": 114 }, { "epoch": 0.018675179569034316, "grad_norm": 10.140100479125977, "learning_rate": 4.0144148627425993e-05, "loss": 10.6957, "step": 117 }, { "epoch": 0.01899441340782123, "eval_loss": 2.6511905193328857, "eval_runtime": 251.7606, "eval_samples_per_second": 41.913, "eval_steps_per_second": 5.239, "step": 119 }, { "epoch": 0.019154030327214685, "grad_norm": 10.377765655517578, "learning_rate": 3.772572564296005e-05, "loss": 11.2824, "step": 120 }, { "epoch": 0.019632881085395053, "grad_norm": 10.346396446228027, "learning_rate": 
3.533749813077677e-05, "loss": 10.3451, "step": 123 }, { "epoch": 0.02011173184357542, "grad_norm": 10.072776794433594, "learning_rate": 3.298534127791785e-05, "loss": 10.2213, "step": 126 }, { "epoch": 0.020590582601755787, "grad_norm": 10.647153854370117, "learning_rate": 3.0675041535377405e-05, "loss": 11.1756, "step": 129 }, { "epoch": 0.021069433359936152, "grad_norm": 10.575749397277832, "learning_rate": 2.8412282383075363e-05, "loss": 11.3389, "step": 132 }, { "epoch": 0.02154828411811652, "grad_norm": 10.153754234313965, "learning_rate": 2.6202630348146324e-05, "loss": 10.759, "step": 135 }, { "epoch": 0.021707901037509976, "eval_loss": 2.62726092338562, "eval_runtime": 251.8498, "eval_samples_per_second": 41.898, "eval_steps_per_second": 5.237, "step": 136 }, { "epoch": 0.022027134876296886, "grad_norm": 10.582724571228027, "learning_rate": 2.405152131093926e-05, "loss": 10.553, "step": 138 }, { "epoch": 0.022505985634477255, "grad_norm": 10.754630088806152, "learning_rate": 2.196424713241637e-05, "loss": 10.2039, "step": 141 }, { "epoch": 0.022984836392657623, "grad_norm": 11.024446487426758, "learning_rate": 1.9945942635848748e-05, "loss": 10.5787, "step": 144 }, { "epoch": 0.02346368715083799, "grad_norm": 10.963229179382324, "learning_rate": 1.800157297483417e-05, "loss": 10.6943, "step": 147 }, { "epoch": 0.023942537909018357, "grad_norm": 9.20400619506836, "learning_rate": 1.6135921418712956e-05, "loss": 10.2282, "step": 150 }, { "epoch": 0.024421388667198722, "grad_norm": 10.465228080749512, "learning_rate": 1.435357758543015e-05, "loss": 10.7362, "step": 153 }, { "epoch": 0.024421388667198722, "eval_loss": 2.6127984523773193, "eval_runtime": 251.6915, "eval_samples_per_second": 41.924, "eval_steps_per_second": 5.241, "step": 153 }, { "epoch": 0.02490023942537909, "grad_norm": 10.978548049926758, "learning_rate": 1.2658926150792322e-05, "loss": 10.6595, "step": 156 }, { "epoch": 0.025379090183559456, "grad_norm": 10.339092254638672, "learning_rate": 1.1056136061894384e-05, "loss": 10.4906, "step": 159 }, { "epoch": 0.025857940941739824, "grad_norm": 11.028854370117188, "learning_rate": 9.549150281252633e-06, "loss": 10.4017, "step": 162 }, { "epoch": 0.026336791699920193, "grad_norm": 10.076519966125488, "learning_rate": 8.141676086873572e-06, "loss": 10.1381, "step": 165 }, { "epoch": 0.026815642458100558, "grad_norm": 10.517831802368164, "learning_rate": 6.837175952121306e-06, "loss": 10.6209, "step": 168 }, { "epoch": 0.02713487629688747, "eval_loss": 2.6026711463928223, "eval_runtime": 251.8204, "eval_samples_per_second": 41.903, "eval_steps_per_second": 5.238, "step": 170 }, { "epoch": 0.027294493216280927, "grad_norm": 10.131866455078125, "learning_rate": 5.6388590278194096e-06, "loss": 10.2782, "step": 171 }, { "epoch": 0.027773343974461292, "grad_norm": 10.199562072753906, "learning_rate": 4.549673247541875e-06, "loss": 10.2035, "step": 174 }, { "epoch": 0.02825219473264166, "grad_norm": 10.666658401489258, "learning_rate": 3.5722980755146517e-06, "loss": 10.5107, "step": 177 }, { "epoch": 0.028731045490822026, "grad_norm": 10.063220024108887, "learning_rate": 2.7091379149682685e-06, "loss": 10.3803, "step": 180 }, { "epoch": 0.029209896249002394, "grad_norm": 11.067845344543457, "learning_rate": 1.962316193157593e-06, "loss": 10.4799, "step": 183 }, { "epoch": 0.029688747007182763, "grad_norm": 10.027233123779297, "learning_rate": 1.333670137599713e-06, "loss": 10.9872, "step": 186 }, { "epoch": 0.029848363926576218, "eval_loss": 2.6014537811279297, "eval_runtime": 
251.853, "eval_samples_per_second": 41.897, "eval_steps_per_second": 5.237, "step": 187 } ], "logging_steps": 3, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 17, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1346074966949888e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }