{ "best_metric": 11.921051025390625, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 0.4672897196261682, "eval_steps": 25, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004672897196261682, "grad_norm": 0.012287290766835213, "learning_rate": 2.9999999999999997e-05, "loss": 11.9336, "step": 1 }, { "epoch": 0.004672897196261682, "eval_loss": 11.933741569519043, "eval_runtime": 0.3116, "eval_samples_per_second": 160.464, "eval_steps_per_second": 22.465, "step": 1 }, { "epoch": 0.009345794392523364, "grad_norm": 0.007687127683311701, "learning_rate": 5.9999999999999995e-05, "loss": 11.9349, "step": 2 }, { "epoch": 0.014018691588785047, "grad_norm": 0.011557426303625107, "learning_rate": 8.999999999999999e-05, "loss": 11.9336, "step": 3 }, { "epoch": 0.018691588785046728, "grad_norm": 0.0096749821677804, "learning_rate": 0.00011999999999999999, "loss": 11.9343, "step": 4 }, { "epoch": 0.02336448598130841, "grad_norm": 0.011569755151867867, "learning_rate": 0.00015, "loss": 11.9332, "step": 5 }, { "epoch": 0.028037383177570093, "grad_norm": 0.009253166615962982, "learning_rate": 0.00017999999999999998, "loss": 11.9324, "step": 6 }, { "epoch": 0.03271028037383177, "grad_norm": 0.014671793207526207, "learning_rate": 0.00020999999999999998, "loss": 11.9335, "step": 7 }, { "epoch": 0.037383177570093455, "grad_norm": 0.009677017107605934, "learning_rate": 0.00023999999999999998, "loss": 11.9347, "step": 8 }, { "epoch": 0.04205607476635514, "grad_norm": 0.010931613855063915, "learning_rate": 0.00027, "loss": 11.9333, "step": 9 }, { "epoch": 0.04672897196261682, "grad_norm": 0.007789121475070715, "learning_rate": 0.0003, "loss": 11.9356, "step": 10 }, { "epoch": 0.0514018691588785, "grad_norm": 0.01007977407425642, "learning_rate": 0.0002999794957488703, "loss": 11.933, "step": 11 }, { "epoch": 0.056074766355140186, "grad_norm": 0.011012083850800991, "learning_rate": 0.0002999179886011389, "loss": 11.9328, "step": 12 }, { "epoch": 0.06074766355140187, "grad_norm": 0.011678499169647694, "learning_rate": 0.0002998154953722457, "loss": 11.934, "step": 13 }, { "epoch": 0.06542056074766354, "grad_norm": 0.00791614968329668, "learning_rate": 0.00029967204408281613, "loss": 11.9359, "step": 14 }, { "epoch": 0.07009345794392523, "grad_norm": 0.011806285940110683, "learning_rate": 0.00029948767395100045, "loss": 11.9335, "step": 15 }, { "epoch": 0.07476635514018691, "grad_norm": 0.013282974250614643, "learning_rate": 0.0002992624353817517, "loss": 11.9321, "step": 16 }, { "epoch": 0.0794392523364486, "grad_norm": 0.010643936693668365, "learning_rate": 0.0002989963899530457, "loss": 11.9329, "step": 17 }, { "epoch": 0.08411214953271028, "grad_norm": 0.009988749399781227, "learning_rate": 0.00029868961039904624, "loss": 11.9339, "step": 18 }, { "epoch": 0.08878504672897196, "grad_norm": 0.012497678399085999, "learning_rate": 0.00029834218059022024, "loss": 11.933, "step": 19 }, { "epoch": 0.09345794392523364, "grad_norm": 0.01283989381045103, "learning_rate": 0.00029795419551040833, "loss": 11.9325, "step": 20 }, { "epoch": 0.09813084112149532, "grad_norm": 0.011552334763109684, "learning_rate": 0.00029752576123085736, "loss": 11.9321, "step": 21 }, { "epoch": 0.102803738317757, "grad_norm": 0.012469599023461342, "learning_rate": 0.0002970569948812214, "loss": 11.9342, "step": 22 }, { "epoch": 0.10747663551401869, "grad_norm": 0.013510530814528465, "learning_rate": 0.0002965480246175399, "loss": 11.9341, "step": 23 }, { "epoch": 0.11214953271028037, "grad_norm": 0.015959424898028374, "learning_rate": 0.0002959989895872009, "loss": 11.9338, "step": 24 }, { "epoch": 0.11682242990654206, "grad_norm": 0.017688067629933357, "learning_rate": 0.0002954100398908995, "loss": 11.9333, "step": 25 }, { "epoch": 0.11682242990654206, "eval_loss": 11.932760238647461, "eval_runtime": 0.3111, "eval_samples_per_second": 160.696, "eval_steps_per_second": 22.497, "step": 25 }, { "epoch": 0.12149532710280374, "grad_norm": 0.018132586032152176, "learning_rate": 0.0002947813365416023, "loss": 11.9355, "step": 26 }, { "epoch": 0.1261682242990654, "grad_norm": 0.01555582694709301, "learning_rate": 0.0002941130514205272, "loss": 11.9327, "step": 27 }, { "epoch": 0.1308411214953271, "grad_norm": 0.010587092489004135, "learning_rate": 0.0002934053672301536, "loss": 11.9306, "step": 28 }, { "epoch": 0.13551401869158877, "grad_norm": 0.016130341216921806, "learning_rate": 0.00029265847744427303, "loss": 11.934, "step": 29 }, { "epoch": 0.14018691588785046, "grad_norm": 0.016748711466789246, "learning_rate": 0.00029187258625509513, "loss": 11.9327, "step": 30 }, { "epoch": 0.14485981308411214, "grad_norm": 0.02337314747273922, "learning_rate": 0.00029104790851742417, "loss": 11.9312, "step": 31 }, { "epoch": 0.14953271028037382, "grad_norm": 0.031070569530129433, "learning_rate": 0.0002901846696899191, "loss": 11.931, "step": 32 }, { "epoch": 0.1542056074766355, "grad_norm": 0.030112041160464287, "learning_rate": 0.00028928310577345606, "loss": 11.931, "step": 33 }, { "epoch": 0.1588785046728972, "grad_norm": 0.025763792917132378, "learning_rate": 0.0002883434632466077, "loss": 11.9341, "step": 34 }, { "epoch": 0.16355140186915887, "grad_norm": 0.020702293142676353, "learning_rate": 0.00028736599899825856, "loss": 11.9302, "step": 35 }, { "epoch": 0.16822429906542055, "grad_norm": 0.032018788158893585, "learning_rate": 0.00028635098025737434, "loss": 11.9305, "step": 36 }, { "epoch": 0.17289719626168223, "grad_norm": 0.03408855199813843, "learning_rate": 0.00028529868451994384, "loss": 11.9297, "step": 37 }, { "epoch": 0.17757009345794392, "grad_norm": 0.03538135811686516, "learning_rate": 0.0002842093994731145, "loss": 11.9297, "step": 38 }, { "epoch": 0.1822429906542056, "grad_norm": 0.042890459299087524, "learning_rate": 0.00028308342291654174, "loss": 11.931, "step": 39 }, { "epoch": 0.18691588785046728, "grad_norm": 0.04696235433220863, "learning_rate": 0.00028192106268097334, "loss": 11.9326, "step": 40 }, { "epoch": 0.19158878504672897, "grad_norm": 0.04607512429356575, "learning_rate": 0.00028072263654409154, "loss": 11.9304, "step": 41 }, { "epoch": 0.19626168224299065, "grad_norm": 0.059835415333509445, "learning_rate": 0.0002794884721436361, "loss": 11.9319, "step": 42 }, { "epoch": 0.20093457943925233, "grad_norm": 0.041718047112226486, "learning_rate": 0.00027821890688783083, "loss": 11.9292, "step": 43 }, { "epoch": 0.205607476635514, "grad_norm": 0.06262337416410446, "learning_rate": 0.0002769142878631403, "loss": 11.9337, "step": 44 }, { "epoch": 0.2102803738317757, "grad_norm": 0.054845791310071945, "learning_rate": 0.00027557497173937923, "loss": 11.9268, "step": 45 }, { "epoch": 0.21495327102803738, "grad_norm": 0.0817423015832901, "learning_rate": 0.000274201324672203, "loss": 11.9276, "step": 46 }, { "epoch": 0.21962616822429906, "grad_norm": 0.0744045227766037, "learning_rate": 0.00027279372220300385, "loss": 11.9311, "step": 47 }, { "epoch": 0.22429906542056074, "grad_norm": 0.092995785176754, "learning_rate": 0.0002713525491562421, "loss": 11.9277, "step": 48 }, { "epoch": 0.22897196261682243, "grad_norm": 0.07758913189172745, "learning_rate": 0.00026987819953423867, "loss": 11.929, "step": 49 }, { "epoch": 0.2336448598130841, "grad_norm": 0.0959334447979927, "learning_rate": 0.00026837107640945905, "loss": 11.932, "step": 50 }, { "epoch": 0.2336448598130841, "eval_loss": 11.926326751708984, "eval_runtime": 0.309, "eval_samples_per_second": 161.815, "eval_steps_per_second": 22.654, "step": 50 }, { "epoch": 0.2383177570093458, "grad_norm": 0.0555480420589447, "learning_rate": 0.0002668315918143169, "loss": 11.9239, "step": 51 }, { "epoch": 0.24299065420560748, "grad_norm": 0.03427752107381821, "learning_rate": 0.00026526016662852886, "loss": 11.9273, "step": 52 }, { "epoch": 0.24766355140186916, "grad_norm": 0.051935601979494095, "learning_rate": 0.00026365723046405023, "loss": 11.9254, "step": 53 }, { "epoch": 0.2523364485981308, "grad_norm": 0.05068502947688103, "learning_rate": 0.0002620232215476231, "loss": 11.9243, "step": 54 }, { "epoch": 0.2570093457943925, "grad_norm": 0.042215969413518906, "learning_rate": 0.0002603585866009697, "loss": 11.9256, "step": 55 }, { "epoch": 0.2616822429906542, "grad_norm": 0.03927940875291824, "learning_rate": 0.00025866378071866334, "loss": 11.9255, "step": 56 }, { "epoch": 0.26635514018691586, "grad_norm": 0.041530344635248184, "learning_rate": 0.00025693926724370956, "loss": 11.9267, "step": 57 }, { "epoch": 0.27102803738317754, "grad_norm": 0.05640103295445442, "learning_rate": 0.00025518551764087326, "loss": 11.9244, "step": 58 }, { "epoch": 0.2757009345794392, "grad_norm": 0.039908554404973984, "learning_rate": 0.00025340301136778483, "loss": 11.9261, "step": 59 }, { "epoch": 0.2803738317757009, "grad_norm": 0.03271481394767761, "learning_rate": 0.00025159223574386114, "loss": 11.9258, "step": 60 }, { "epoch": 0.2850467289719626, "grad_norm": 0.0314725823700428, "learning_rate": 0.0002497536858170772, "loss": 11.9223, "step": 61 }, { "epoch": 0.2897196261682243, "grad_norm": 0.03497527167201042, "learning_rate": 0.00024788786422862526, "loss": 11.9226, "step": 62 }, { "epoch": 0.29439252336448596, "grad_norm": 0.03343958035111427, "learning_rate": 0.00024599528107549745, "loss": 11.9246, "step": 63 }, { "epoch": 0.29906542056074764, "grad_norm": 0.03223850205540657, "learning_rate": 0.00024407645377103054, "loss": 11.9208, "step": 64 }, { "epoch": 0.3037383177570093, "grad_norm": 0.03138720244169235, "learning_rate": 0.00024213190690345018, "loss": 11.9224, "step": 65 }, { "epoch": 0.308411214953271, "grad_norm": 0.02842772752046585, "learning_rate": 0.00024016217209245374, "loss": 11.9213, "step": 66 }, { "epoch": 0.3130841121495327, "grad_norm": 0.033133890479803085, "learning_rate": 0.00023816778784387094, "loss": 11.9225, "step": 67 }, { "epoch": 0.3177570093457944, "grad_norm": 0.032245416194200516, "learning_rate": 0.0002361492994024415, "loss": 11.9209, "step": 68 }, { "epoch": 0.32242990654205606, "grad_norm": 0.026734715327620506, "learning_rate": 0.0002341072586027509, "loss": 11.925, "step": 69 }, { "epoch": 0.32710280373831774, "grad_norm": 0.025514936074614525, "learning_rate": 0.00023204222371836405, "loss": 11.9217, "step": 70 }, { "epoch": 0.3317757009345794, "grad_norm": 0.018397009000182152, "learning_rate": 0.00022995475930919905, "loss": 11.9264, "step": 71 }, { "epoch": 0.3364485981308411, "grad_norm": 0.017135081812739372, "learning_rate": 0.00022784543606718227, "loss": 11.9244, "step": 72 }, { "epoch": 0.3411214953271028, "grad_norm": 0.02699720673263073, "learning_rate": 0.00022571483066022657, "loss": 11.9216, "step": 73 }, { "epoch": 0.34579439252336447, "grad_norm": 0.024376867339015007, "learning_rate": 0.0002235635255745762, "loss": 11.9232, "step": 74 }, { "epoch": 0.35046728971962615, "grad_norm": 0.023777619004249573, "learning_rate": 0.00022139210895556104, "loss": 11.9243, "step": 75 }, { "epoch": 0.35046728971962615, "eval_loss": 11.921453475952148, "eval_runtime": 0.3108, "eval_samples_per_second": 160.851, "eval_steps_per_second": 22.519, "step": 75 }, { "epoch": 0.35514018691588783, "grad_norm": 0.02467428147792816, "learning_rate": 0.00021920117444680317, "loss": 11.9218, "step": 76 }, { "epoch": 0.3598130841121495, "grad_norm": 0.014774040319025517, "learning_rate": 0.00021699132102792097, "loss": 11.9219, "step": 77 }, { "epoch": 0.3644859813084112, "grad_norm": 0.01568908616900444, "learning_rate": 0.0002147631528507739, "loss": 11.9206, "step": 78 }, { "epoch": 0.3691588785046729, "grad_norm": 0.021154990419745445, "learning_rate": 0.00021251727907429355, "loss": 11.9222, "step": 79 }, { "epoch": 0.37383177570093457, "grad_norm": 0.01931612752377987, "learning_rate": 0.0002102543136979454, "loss": 11.9229, "step": 80 }, { "epoch": 0.37850467289719625, "grad_norm": 0.02147780731320381, "learning_rate": 0.0002079748753938678, "loss": 11.923, "step": 81 }, { "epoch": 0.38317757009345793, "grad_norm": 0.01698346436023712, "learning_rate": 0.0002056795873377331, "loss": 11.9215, "step": 82 }, { "epoch": 0.3878504672897196, "grad_norm": 0.021959029138088226, "learning_rate": 0.00020336907703837748, "loss": 11.9202, "step": 83 }, { "epoch": 0.3925233644859813, "grad_norm": 0.018221363425254822, "learning_rate": 0.00020104397616624645, "loss": 11.9213, "step": 84 }, { "epoch": 0.397196261682243, "grad_norm": 0.014806934632360935, "learning_rate": 0.00019870492038070252, "loss": 11.921, "step": 85 }, { "epoch": 0.40186915887850466, "grad_norm": 0.019513826817274094, "learning_rate": 0.0001963525491562421, "loss": 11.9189, "step": 86 }, { "epoch": 0.40654205607476634, "grad_norm": 0.01412623468786478, "learning_rate": 0.0001939875056076697, "loss": 11.9225, "step": 87 }, { "epoch": 0.411214953271028, "grad_norm": 0.01801372691988945, "learning_rate": 0.00019161043631427666, "loss": 11.9201, "step": 88 }, { "epoch": 0.4158878504672897, "grad_norm": 0.019024653360247612, "learning_rate": 0.00018922199114307294, "loss": 11.9205, "step": 89 }, { "epoch": 0.4205607476635514, "grad_norm": 0.021895864978432655, "learning_rate": 0.00018682282307111987, "loss": 11.9216, "step": 90 }, { "epoch": 0.4252336448598131, "grad_norm": 0.025304431095719337, "learning_rate": 0.00018441358800701273, "loss": 11.9214, "step": 91 }, { "epoch": 0.42990654205607476, "grad_norm": 0.0259839054197073, "learning_rate": 0.00018199494461156203, "loss": 11.9253, "step": 92 }, { "epoch": 0.43457943925233644, "grad_norm": 0.04013689234852791, "learning_rate": 0.000179567554117722, "loss": 11.9211, "step": 93 }, { "epoch": 0.4392523364485981, "grad_norm": 0.032571107149124146, "learning_rate": 0.00017713208014981648, "loss": 11.9246, "step": 94 }, { "epoch": 0.4439252336448598, "grad_norm": 0.04223785176873207, "learning_rate": 0.00017468918854211007, "loss": 11.9277, "step": 95 }, { "epoch": 0.4485981308411215, "grad_norm": 0.01614546775817871, "learning_rate": 0.00017223954715677627, "loss": 11.9202, "step": 96 }, { "epoch": 0.4532710280373832, "grad_norm": 0.05689317733049393, "learning_rate": 0.00016978382570131034, "loss": 11.9215, "step": 97 }, { "epoch": 0.45794392523364486, "grad_norm": 0.04419248178601265, "learning_rate": 0.00016732269554543794, "loss": 11.9244, "step": 98 }, { "epoch": 0.46261682242990654, "grad_norm": 0.05944395065307617, "learning_rate": 0.00016485682953756942, "loss": 11.9218, "step": 99 }, { "epoch": 0.4672897196261682, "grad_norm": 0.05680546537041664, "learning_rate": 0.00016238690182084986, "loss": 11.9257, "step": 100 }, { "epoch": 0.4672897196261682, "eval_loss": 11.921051025390625, "eval_runtime": 0.3127, "eval_samples_per_second": 159.909, "eval_steps_per_second": 22.387, "step": 100 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 159262310400.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }