{ "best_metric": 1.8198388814926147, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 1.0633303808680248, "eval_steps": 100, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001771479185119575, "eval_loss": 4.6529459953308105, "eval_runtime": 16.7798, "eval_samples_per_second": 56.675, "eval_steps_per_second": 14.184, "step": 1 }, { "epoch": 0.01771479185119575, "grad_norm": 6.935274124145508, "learning_rate": 0.0002, "loss": 3.5332, "step": 10 }, { "epoch": 0.0354295837023915, "grad_norm": 3.5101845264434814, "learning_rate": 0.0001998582695676762, "loss": 2.1451, "step": 20 }, { "epoch": 0.053144375553587246, "grad_norm": 2.828544855117798, "learning_rate": 0.00019943348002101371, "loss": 2.034, "step": 30 }, { "epoch": 0.070859167404783, "grad_norm": 2.610250234603882, "learning_rate": 0.00019872683547213446, "loss": 1.9515, "step": 40 }, { "epoch": 0.08857395925597875, "grad_norm": 2.671312093734741, "learning_rate": 0.00019774033898178667, "loss": 1.9134, "step": 50 }, { "epoch": 0.10628875110717449, "grad_norm": 2.4686455726623535, "learning_rate": 0.0001964767868814516, "loss": 2.0368, "step": 60 }, { "epoch": 0.12400354295837024, "grad_norm": 2.524057388305664, "learning_rate": 0.00019493976084683813, "loss": 2.0019, "step": 70 }, { "epoch": 0.141718334809566, "grad_norm": 2.3468284606933594, "learning_rate": 0.00019313361774523385, "loss": 1.9573, "step": 80 }, { "epoch": 0.15943312666076173, "grad_norm": 2.1679277420043945, "learning_rate": 0.00019106347728549135, "loss": 1.8797, "step": 90 }, { "epoch": 0.1771479185119575, "grad_norm": 2.4932808876037598, "learning_rate": 0.00018873520750565718, "loss": 1.897, "step": 100 }, { "epoch": 0.1771479185119575, "eval_loss": 1.960958480834961, "eval_runtime": 16.7821, "eval_samples_per_second": 56.667, "eval_steps_per_second": 14.182, "step": 100 }, { "epoch": 0.19486271036315322, "grad_norm": 2.1805710792541504, "learning_rate": 0.0001861554081393806, "loss": 2.0712, "step": 110 }, { "epoch": 0.21257750221434898, "grad_norm": 2.5120718479156494, "learning_rate": 0.0001833313919082515, "loss": 1.9858, "step": 120 }, { "epoch": 0.23029229406554472, "grad_norm": 2.521531343460083, "learning_rate": 0.00018027116379309638, "loss": 1.8546, "step": 130 }, { "epoch": 0.24800708591674048, "grad_norm": 2.148980140686035, "learning_rate": 0.00017698339834299061, "loss": 1.8414, "step": 140 }, { "epoch": 0.2657218777679362, "grad_norm": 2.597273111343384, "learning_rate": 0.00017347741508630672, "loss": 1.8544, "step": 150 }, { "epoch": 0.283436669619132, "grad_norm": 2.2320265769958496, "learning_rate": 0.0001697631521134985, "loss": 1.9321, "step": 160 }, { "epoch": 0.30115146147032773, "grad_norm": 2.263096570968628, "learning_rate": 0.00016585113790650388, "loss": 1.9933, "step": 170 }, { "epoch": 0.31886625332152346, "grad_norm": 3.2061798572540283, "learning_rate": 0.0001617524614946192, "loss": 1.9608, "step": 180 }, { "epoch": 0.3365810451727192, "grad_norm": 2.643886089324951, "learning_rate": 0.0001574787410214407, "loss": 1.8959, "step": 190 }, { "epoch": 0.354295837023915, "grad_norm": 2.0563578605651855, "learning_rate": 0.00015304209081197425, "loss": 1.8635, "step": 200 }, { "epoch": 0.354295837023915, "eval_loss": 1.9041333198547363, "eval_runtime": 16.8326, "eval_samples_per_second": 56.498, "eval_steps_per_second": 14.139, "step": 200 }, { "epoch": 0.3720106288751107, "grad_norm": 2.665921926498413, "learning_rate": 
0.00014845508703326504, "loss": 1.9161, "step": 210 }, { "epoch": 0.38972542072630645, "grad_norm": 2.691920518875122, "learning_rate": 0.00014373073204588556, "loss": 1.9337, "step": 220 }, { "epoch": 0.40744021257750224, "grad_norm": 2.107774257659912, "learning_rate": 0.00013888241754733208, "loss": 1.8728, "step": 230 }, { "epoch": 0.42515500442869797, "grad_norm": 2.2913401126861572, "learning_rate": 0.00013392388661180303, "loss": 1.9039, "step": 240 }, { "epoch": 0.4428697962798937, "grad_norm": 2.163093328475952, "learning_rate": 0.0001288691947339621, "loss": 1.7768, "step": 250 }, { "epoch": 0.46058458813108943, "grad_norm": 2.8082563877105713, "learning_rate": 0.0001237326699871115, "loss": 1.932, "step": 260 }, { "epoch": 0.4782993799822852, "grad_norm": 2.1630780696868896, "learning_rate": 0.00011852887240871145, "loss": 1.8789, "step": 270 }, { "epoch": 0.49601417183348095, "grad_norm": 2.2900702953338623, "learning_rate": 0.00011327255272837221, "loss": 1.8839, "step": 280 }, { "epoch": 0.5137289636846767, "grad_norm": 2.1130764484405518, "learning_rate": 0.00010797861055530831, "loss": 1.8992, "step": 290 }, { "epoch": 0.5314437555358724, "grad_norm": 2.1182191371917725, "learning_rate": 0.00010266205214377748, "loss": 1.8728, "step": 300 }, { "epoch": 0.5314437555358724, "eval_loss": 1.8754034042358398, "eval_runtime": 16.7499, "eval_samples_per_second": 56.776, "eval_steps_per_second": 14.209, "step": 300 }, { "epoch": 0.5491585473870682, "grad_norm": 2.364851951599121, "learning_rate": 9.733794785622253e-05, "loss": 1.9063, "step": 310 }, { "epoch": 0.566873339238264, "grad_norm": 2.248103380203247, "learning_rate": 9.202138944469168e-05, "loss": 1.8951, "step": 320 }, { "epoch": 0.5845881310894597, "grad_norm": 2.0723376274108887, "learning_rate": 8.672744727162781e-05, "loss": 1.8764, "step": 330 }, { "epoch": 0.6023029229406555, "grad_norm": 2.1202616691589355, "learning_rate": 8.147112759128859e-05, "loss": 1.8705, "step": 340 }, { "epoch": 0.6200177147918512, "grad_norm": 2.483820915222168, "learning_rate": 7.626733001288851e-05, "loss": 1.8853, "step": 350 }, { "epoch": 0.6377325066430469, "grad_norm": 2.218118906021118, "learning_rate": 7.113080526603792e-05, "loss": 1.8008, "step": 360 }, { "epoch": 0.6554472984942427, "grad_norm": 2.155637502670288, "learning_rate": 6.607611338819697e-05, "loss": 1.896, "step": 370 }, { "epoch": 0.6731620903454384, "grad_norm": 2.509552001953125, "learning_rate": 6.111758245266794e-05, "loss": 1.828, "step": 380 }, { "epoch": 0.6908768821966342, "grad_norm": 1.92943274974823, "learning_rate": 5.626926795411447e-05, "loss": 1.8035, "step": 390 }, { "epoch": 0.70859167404783, "grad_norm": 2.5901525020599365, "learning_rate": 5.1544912966734994e-05, "loss": 1.8291, "step": 400 }, { "epoch": 0.70859167404783, "eval_loss": 1.8369861841201782, "eval_runtime": 16.8518, "eval_samples_per_second": 56.433, "eval_steps_per_second": 14.123, "step": 400 }, { "epoch": 0.7263064658990257, "grad_norm": 2.4062066078186035, "learning_rate": 4.695790918802576e-05, "loss": 1.7516, "step": 410 }, { "epoch": 0.7440212577502214, "grad_norm": 2.545196294784546, "learning_rate": 4.252125897855932e-05, "loss": 1.8523, "step": 420 }, { "epoch": 0.7617360496014172, "grad_norm": 2.172696352005005, "learning_rate": 3.824753850538082e-05, "loss": 1.8045, "step": 430 }, { "epoch": 0.7794508414526129, "grad_norm": 2.218444347381592, "learning_rate": 3.414886209349615e-05, "loss": 1.7085, "step": 440 }, { "epoch": 0.7971656333038086, "grad_norm": 
1.9435523748397827, "learning_rate": 3.0236847886501542e-05, "loss": 1.776, "step": 450 }, { "epoch": 0.8148804251550045, "grad_norm": 1.9952267408370972, "learning_rate": 2.6522584913693294e-05, "loss": 1.7882, "step": 460 }, { "epoch": 0.8325952170062002, "grad_norm": 2.478018283843994, "learning_rate": 2.301660165700936e-05, "loss": 1.867, "step": 470 }, { "epoch": 0.8503100088573959, "grad_norm": 2.3368334770202637, "learning_rate": 1.9728836206903656e-05, "loss": 1.6999, "step": 480 }, { "epoch": 0.8680248007085917, "grad_norm": 2.051180362701416, "learning_rate": 1.6668608091748495e-05, "loss": 1.8281, "step": 490 }, { "epoch": 0.8857395925597874, "grad_norm": 2.087390422821045, "learning_rate": 1.3844591860619383e-05, "loss": 1.7554, "step": 500 }, { "epoch": 0.8857395925597874, "eval_loss": 1.8198388814926147, "eval_runtime": 16.7054, "eval_samples_per_second": 56.928, "eval_steps_per_second": 14.247, "step": 500 }, { "epoch": 0.9034543844109831, "grad_norm": 2.305307388305664, "learning_rate": 1.1264792494342857e-05, "loss": 1.8523, "step": 510 }, { "epoch": 0.9211691762621789, "grad_norm": 3.0362179279327393, "learning_rate": 8.936522714508678e-06, "loss": 1.8562, "step": 520 }, { "epoch": 0.9388839681133747, "grad_norm": 2.4378557205200195, "learning_rate": 6.866382254766157e-06, "loss": 1.8373, "step": 530 }, { "epoch": 0.9565987599645704, "grad_norm": 3.080037832260132, "learning_rate": 5.060239153161872e-06, "loss": 1.7831, "step": 540 }, { "epoch": 0.9743135518157662, "grad_norm": 2.2773683071136475, "learning_rate": 3.5232131185484076e-06, "loss": 1.7235, "step": 550 }, { "epoch": 0.9920283436669619, "grad_norm": 1.843621850013733, "learning_rate": 2.259661018213333e-06, "loss": 1.7343, "step": 560 }, { "epoch": 1.0101860053144376, "grad_norm": 2.412452220916748, "learning_rate": 1.2731645278655445e-06, "loss": 1.7645, "step": 570 }, { "epoch": 1.0279007971656333, "grad_norm": 1.848340392112732, "learning_rate": 5.665199789862907e-07, "loss": 1.5955, "step": 580 }, { "epoch": 1.045615589016829, "grad_norm": 1.5862144231796265, "learning_rate": 1.4173043232380557e-07, "loss": 1.4961, "step": 590 }, { "epoch": 1.0633303808680248, "grad_norm": 1.902883768081665, "learning_rate": 0.0, "loss": 1.517, "step": 600 }, { "epoch": 1.0633303808680248, "eval_loss": 1.8240644931793213, "eval_runtime": 16.7359, "eval_samples_per_second": 56.824, "eval_steps_per_second": 14.221, "step": 600 } ], "logging_steps": 10, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.42951368966144e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }