{ "best_metric": 1.5615293979644775, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.009793072380597964, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.958614476119593e-05, "eval_loss": 2.541055679321289, "eval_runtime": 808.5156, "eval_samples_per_second": 26.589, "eval_steps_per_second": 6.648, "step": 1 }, { "epoch": 0.0001958614476119593, "grad_norm": 0.42259156703948975, "learning_rate": 4.12e-05, "loss": 2.4447, "step": 10 }, { "epoch": 0.0003917228952239186, "grad_norm": 0.4474017918109894, "learning_rate": 8.24e-05, "loss": 2.3596, "step": 20 }, { "epoch": 0.0005875843428358779, "grad_norm": 0.5387372970581055, "learning_rate": 0.0001236, "loss": 2.2655, "step": 30 }, { "epoch": 0.0007834457904478372, "grad_norm": 0.6558287739753723, "learning_rate": 0.0001648, "loss": 2.1072, "step": 40 }, { "epoch": 0.0009793072380597965, "grad_norm": 0.790547788143158, "learning_rate": 0.000206, "loss": 2.031, "step": 50 }, { "epoch": 0.0009793072380597965, "eval_loss": 2.0144009590148926, "eval_runtime": 811.9056, "eval_samples_per_second": 26.478, "eval_steps_per_second": 6.62, "step": 50 }, { "epoch": 0.0011751686856717558, "grad_norm": 0.474514901638031, "learning_rate": 0.0002057490971767619, "loss": 1.9995, "step": 60 }, { "epoch": 0.001371030133283715, "grad_norm": 0.48066890239715576, "learning_rate": 0.00020499761108038175, "loss": 1.9568, "step": 70 }, { "epoch": 0.0015668915808956744, "grad_norm": 0.5162322521209717, "learning_rate": 0.00020374920287558198, "loss": 1.8879, "step": 80 }, { "epoch": 0.0017627530285076337, "grad_norm": 0.5203802585601807, "learning_rate": 0.00020200995468164684, "loss": 1.8786, "step": 90 }, { "epoch": 0.001958614476119593, "grad_norm": 0.6834505796432495, "learning_rate": 0.00019978833994094855, "loss": 1.8963, "step": 100 }, { "epoch": 0.001958614476119593, "eval_loss": 1.8420443534851074, "eval_runtime": 802.6412, "eval_samples_per_second": 26.784, "eval_steps_per_second": 6.697, "step": 100 }, { "epoch": 0.0021544759237315525, "grad_norm": 0.4236699044704437, "learning_rate": 0.00019709518213718787, "loss": 1.8609, "step": 110 }, { "epoch": 0.0023503373713435116, "grad_norm": 0.47379371523857117, "learning_rate": 0.00019394360206446948, "loss": 1.8112, "step": 120 }, { "epoch": 0.002546198818955471, "grad_norm": 0.48835086822509766, "learning_rate": 0.00019034895390411186, "loss": 1.7648, "step": 130 }, { "epoch": 0.00274206026656743, "grad_norm": 0.5578092336654663, "learning_rate": 0.0001863287504206196, "loss": 1.8491, "step": 140 }, { "epoch": 0.0029379217141793897, "grad_norm": 0.7225767374038696, "learning_rate": 0.00018190257764125471, "loss": 1.7379, "step": 150 }, { "epoch": 0.0029379217141793897, "eval_loss": 1.766533374786377, "eval_runtime": 808.4925, "eval_samples_per_second": 26.59, "eval_steps_per_second": 6.648, "step": 150 }, { "epoch": 0.0031337831617913488, "grad_norm": 0.4122462868690491, "learning_rate": 0.00017709199943488106, "loss": 1.8591, "step": 160 }, { "epoch": 0.0033296446094033083, "grad_norm": 0.45456036925315857, "learning_rate": 0.00017192045245496238, "loss": 1.7107, "step": 170 }, { "epoch": 0.0035255060570152674, "grad_norm": 0.4639202356338501, "learning_rate": 0.00016641313195854277, "loss": 1.6736, "step": 180 }, { "epoch": 0.003721367504627227, "grad_norm": 0.551369309425354, "learning_rate": 0.0001605968690574869, "loss": 1.7199, "step": 190 }, { "epoch": 0.003917228952239186, 
"grad_norm": 0.6994920969009399, "learning_rate": 0.0001545, "loss": 1.6386, "step": 200 }, { "epoch": 0.003917228952239186, "eval_loss": 1.7132164239883423, "eval_runtime": 815.4936, "eval_samples_per_second": 26.362, "eval_steps_per_second": 6.591, "step": 200 }, { "epoch": 0.004113090399851145, "grad_norm": 0.43010979890823364, "learning_rate": 0.00014815222811927496, "loss": 1.7888, "step": 210 }, { "epoch": 0.004308951847463105, "grad_norm": 0.4258266091346741, "learning_rate": 0.00014158447912183896, "loss": 1.6298, "step": 220 }, { "epoch": 0.004504813295075064, "grad_norm": 0.47796645760536194, "learning_rate": 0.00013482875042061958, "loss": 1.6301, "step": 230 }, { "epoch": 0.004700674742687023, "grad_norm": 0.5105708241462708, "learning_rate": 0.00012791795524676576, "loss": 1.6413, "step": 240 }, { "epoch": 0.004896536190298982, "grad_norm": 0.6835204362869263, "learning_rate": 0.00012088576229969385, "loss": 1.6857, "step": 250 }, { "epoch": 0.004896536190298982, "eval_loss": 1.6581625938415527, "eval_runtime": 808.8687, "eval_samples_per_second": 26.578, "eval_steps_per_second": 6.645, "step": 250 }, { "epoch": 0.005092397637910942, "grad_norm": 0.4199804961681366, "learning_rate": 0.0001137664317165683, "loss": 1.786, "step": 260 }, { "epoch": 0.005288259085522901, "grad_norm": 0.46537598967552185, "learning_rate": 0.00010659464816035761, "loss": 1.6461, "step": 270 }, { "epoch": 0.00548412053313486, "grad_norm": 0.5051584243774414, "learning_rate": 9.940535183964242e-05, "loss": 1.5999, "step": 280 }, { "epoch": 0.0056799819807468194, "grad_norm": 0.5738089680671692, "learning_rate": 9.22335682834317e-05, "loss": 1.6069, "step": 290 }, { "epoch": 0.005875843428358779, "grad_norm": 0.6677771210670471, "learning_rate": 8.511423770030617e-05, "loss": 1.6102, "step": 300 }, { "epoch": 0.005875843428358779, "eval_loss": 1.621539831161499, "eval_runtime": 849.3992, "eval_samples_per_second": 25.31, "eval_steps_per_second": 6.328, "step": 300 }, { "epoch": 0.0060717048759707385, "grad_norm": 0.42615145444869995, "learning_rate": 7.808204475323423e-05, "loss": 1.682, "step": 310 }, { "epoch": 0.0062675663235826976, "grad_norm": 0.4762561321258545, "learning_rate": 7.117124957938042e-05, "loss": 1.6215, "step": 320 }, { "epoch": 0.006463427771194657, "grad_norm": 0.5104791522026062, "learning_rate": 6.441552087816105e-05, "loss": 1.5619, "step": 330 }, { "epoch": 0.006659289218806617, "grad_norm": 0.575524091720581, "learning_rate": 5.784777188072502e-05, "loss": 1.6038, "step": 340 }, { "epoch": 0.006855150666418576, "grad_norm": 0.6795148849487305, "learning_rate": 5.150000000000002e-05, "loss": 1.6331, "step": 350 }, { "epoch": 0.006855150666418576, "eval_loss": 1.5960947275161743, "eval_runtime": 807.8208, "eval_samples_per_second": 26.612, "eval_steps_per_second": 6.654, "step": 350 }, { "epoch": 0.007051012114030535, "grad_norm": 0.4098166823387146, "learning_rate": 4.540313094251309e-05, "loss": 1.6492, "step": 360 }, { "epoch": 0.007246873561642494, "grad_norm": 0.47631141543388367, "learning_rate": 3.958686804145719e-05, "loss": 1.5605, "step": 370 }, { "epoch": 0.007442735009254454, "grad_norm": 0.5455722808837891, "learning_rate": 3.4079547545037634e-05, "loss": 1.5975, "step": 380 }, { "epoch": 0.007638596456866413, "grad_norm": 0.5428374409675598, "learning_rate": 2.8908000565118947e-05, "loss": 1.5256, "step": 390 }, { "epoch": 0.007834457904478372, "grad_norm": 0.7749019265174866, "learning_rate": 2.4097422358745275e-05, "loss": 1.5731, "step": 400 }, { "epoch": 
0.007834457904478372, "eval_loss": 1.5720521211624146, "eval_runtime": 805.3716, "eval_samples_per_second": 26.693, "eval_steps_per_second": 6.674, "step": 400 }, { "epoch": 0.008030319352090332, "grad_norm": 0.4348071813583374, "learning_rate": 1.9671249579380422e-05, "loss": 1.6855, "step": 410 }, { "epoch": 0.00822618079970229, "grad_norm": 0.5242032408714294, "learning_rate": 1.5651046095888127e-05, "loss": 1.6195, "step": 420 }, { "epoch": 0.00842204224731425, "grad_norm": 0.5005368590354919, "learning_rate": 1.205639793553052e-05, "loss": 1.569, "step": 430 }, { "epoch": 0.00861790369492621, "grad_norm": 0.5892754793167114, "learning_rate": 8.904817862812098e-06, "loss": 1.5471, "step": 440 }, { "epoch": 0.008813765142538168, "grad_norm": 0.715459406375885, "learning_rate": 6.211660059051443e-06, "loss": 1.5927, "step": 450 }, { "epoch": 0.008813765142538168, "eval_loss": 1.5629593133926392, "eval_runtime": 809.2741, "eval_samples_per_second": 26.565, "eval_steps_per_second": 6.642, "step": 450 }, { "epoch": 0.009009626590150128, "grad_norm": 0.4325626492500305, "learning_rate": 3.990045318353154e-06, "loss": 1.6155, "step": 460 }, { "epoch": 0.009205488037762086, "grad_norm": 0.4722021222114563, "learning_rate": 2.250797124418014e-06, "loss": 1.5226, "step": 470 }, { "epoch": 0.009401349485374046, "grad_norm": 0.49364137649536133, "learning_rate": 1.0023889196182526e-06, "loss": 1.5093, "step": 480 }, { "epoch": 0.009597210932986006, "grad_norm": 0.5476292967796326, "learning_rate": 2.5090282323810766e-07, "loss": 1.5315, "step": 490 }, { "epoch": 0.009793072380597964, "grad_norm": 0.8033114075660706, "learning_rate": 0.0, "loss": 1.574, "step": 500 }, { "epoch": 0.009793072380597964, "eval_loss": 1.5615293979644775, "eval_runtime": 809.085, "eval_samples_per_second": 26.571, "eval_steps_per_second": 6.643, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.2862904541773824e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }