{ "best_metric": 2.7940969467163086, "best_model_checkpoint": "miner_id_24/checkpoint-450", "epoch": 0.026761819803746655, "eval_steps": 50, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.947071067499257e-05, "eval_loss": 3.2442233562469482, "eval_runtime": 149.0831, "eval_samples_per_second": 47.49, "eval_steps_per_second": 11.873, "step": 1 }, { "epoch": 0.0005947071067499256, "grad_norm": 0.7875794172286987, "learning_rate": 4.1400000000000003e-05, "loss": 3.0668, "step": 10 }, { "epoch": 0.0011894142134998512, "grad_norm": 0.8943319916725159, "learning_rate": 8.280000000000001e-05, "loss": 3.1298, "step": 20 }, { "epoch": 0.001784121320249777, "grad_norm": 1.10039222240448, "learning_rate": 0.00012419999999999998, "loss": 3.0946, "step": 30 }, { "epoch": 0.0023788284269997025, "grad_norm": 1.2827677726745605, "learning_rate": 0.00016560000000000001, "loss": 2.9561, "step": 40 }, { "epoch": 0.0029735355337496285, "grad_norm": 2.5582001209259033, "learning_rate": 0.000207, "loss": 2.998, "step": 50 }, { "epoch": 0.0029735355337496285, "eval_loss": 2.9912269115448, "eval_runtime": 148.1587, "eval_samples_per_second": 47.787, "eval_steps_per_second": 11.947, "step": 50 }, { "epoch": 0.003568242640499554, "grad_norm": 0.7799744009971619, "learning_rate": 0.00020674787920189178, "loss": 3.003, "step": 60 }, { "epoch": 0.004162949747249479, "grad_norm": 0.8374800682067871, "learning_rate": 0.00020599274511475253, "loss": 3.0247, "step": 70 }, { "epoch": 0.004757656853999405, "grad_norm": 0.9920192956924438, "learning_rate": 0.00020473827667594888, "loss": 2.9491, "step": 80 }, { "epoch": 0.0053523639607493305, "grad_norm": 1.210517168045044, "learning_rate": 0.00020299058552961598, "loss": 2.9129, "step": 90 }, { "epoch": 0.005947071067499257, "grad_norm": 2.20344614982605, "learning_rate": 0.00020075818625134152, "loss": 2.9891, "step": 100 }, { "epoch": 0.005947071067499257, "eval_loss": 2.934997797012329, "eval_runtime": 149.0913, "eval_samples_per_second": 47.488, "eval_steps_per_second": 11.872, "step": 100 }, { "epoch": 0.006541778174249183, "grad_norm": 0.7027397155761719, "learning_rate": 0.00019805195486600916, "loss": 2.9015, "step": 110 }, { "epoch": 0.007136485280999108, "grad_norm": 0.7384776473045349, "learning_rate": 0.00019488507586089894, "loss": 2.9129, "step": 120 }, { "epoch": 0.007731192387749034, "grad_norm": 0.8933420777320862, "learning_rate": 0.00019127297795219008, "loss": 2.8789, "step": 130 }, { "epoch": 0.008325899494498959, "grad_norm": 1.1296390295028687, "learning_rate": 0.00018723325891780706, "loss": 2.8091, "step": 140 }, { "epoch": 0.008920606601248885, "grad_norm": 1.680759310722351, "learning_rate": 0.0001827855998628142, "loss": 2.8615, "step": 150 }, { "epoch": 0.008920606601248885, "eval_loss": 2.9041831493377686, "eval_runtime": 149.8181, "eval_samples_per_second": 47.257, "eval_steps_per_second": 11.814, "step": 150 }, { "epoch": 0.00951531370799881, "grad_norm": 0.7546591758728027, "learning_rate": 0.0001779516693350504, "loss": 2.8554, "step": 160 }, { "epoch": 0.010110020814748736, "grad_norm": 0.7574900388717651, "learning_rate": 0.00017275501775814182, "loss": 2.7957, "step": 170 }, { "epoch": 0.010704727921498661, "grad_norm": 0.8689918518066406, "learning_rate": 0.00016722096269620562, "loss": 2.8529, "step": 180 }, { "epoch": 0.011299435028248588, "grad_norm": 1.0633153915405273, "learning_rate": 0.00016137646550922228, "loss": 2.8309, "step": 190 }, { "epoch": 0.011894142134998514, "grad_norm": 2.1989660263061523, "learning_rate": 0.00015525, "loss": 2.9186, "step": 200 }, { "epoch": 0.011894142134998514, "eval_loss": 2.8749983310699463, "eval_runtime": 149.067, "eval_samples_per_second": 47.495, "eval_steps_per_second": 11.874, "step": 200 }, { "epoch": 0.012488849241748439, "grad_norm": 0.6521171927452087, "learning_rate": 0.0001488714136926695, "loss": 2.9137, "step": 210 }, { "epoch": 0.013083556348498365, "grad_norm": 0.7320899963378906, "learning_rate": 0.0001422717824185469, "loss": 2.8405, "step": 220 }, { "epoch": 0.01367826345524829, "grad_norm": 0.8370326161384583, "learning_rate": 0.00013548325891780705, "loss": 2.8327, "step": 230 }, { "epoch": 0.014272970561998216, "grad_norm": 1.010695219039917, "learning_rate": 0.0001285389161945656, "loss": 2.8228, "step": 240 }, { "epoch": 0.014867677668748141, "grad_norm": 1.9595260620117188, "learning_rate": 0.0001214725863885273, "loss": 2.772, "step": 250 }, { "epoch": 0.014867677668748141, "eval_loss": 2.844726085662842, "eval_runtime": 150.3953, "eval_samples_per_second": 47.076, "eval_steps_per_second": 11.769, "step": 250 }, { "epoch": 0.015462384775498068, "grad_norm": 0.6232745051383972, "learning_rate": 0.00011431869594820213, "loss": 2.8667, "step": 260 }, { "epoch": 0.016057091882247992, "grad_norm": 0.7752615213394165, "learning_rate": 0.00010711209790870886, "loss": 2.8185, "step": 270 }, { "epoch": 0.016651798988997917, "grad_norm": 0.8857811093330383, "learning_rate": 9.988790209129117e-05, "loss": 2.7915, "step": 280 }, { "epoch": 0.017246506095747845, "grad_norm": 1.0391193628311157, "learning_rate": 9.268130405179787e-05, "loss": 2.795, "step": 290 }, { "epoch": 0.01784121320249777, "grad_norm": 1.5660728216171265, "learning_rate": 8.55274136114727e-05, "loss": 2.771, "step": 300 }, { "epoch": 0.01784121320249777, "eval_loss": 2.820711851119995, "eval_runtime": 149.8338, "eval_samples_per_second": 47.252, "eval_steps_per_second": 11.813, "step": 300 }, { "epoch": 0.018435920309247695, "grad_norm": 0.658405065536499, "learning_rate": 7.84610838054344e-05, "loss": 2.8653, "step": 310 }, { "epoch": 0.01903062741599762, "grad_norm": 0.7782045602798462, "learning_rate": 7.151674108219295e-05, "loss": 2.8074, "step": 320 }, { "epoch": 0.019625334522747548, "grad_norm": 0.9159675240516663, "learning_rate": 6.472821758145309e-05, "loss": 2.814, "step": 330 }, { "epoch": 0.020220041629497473, "grad_norm": 1.0867841243743896, "learning_rate": 5.8128586307330475e-05, "loss": 2.7224, "step": 340 }, { "epoch": 0.020814748736247397, "grad_norm": 1.6991764307022095, "learning_rate": 5.175000000000002e-05, "loss": 2.7216, "step": 350 }, { "epoch": 0.020814748736247397, "eval_loss": 2.8091001510620117, "eval_runtime": 150.0782, "eval_samples_per_second": 47.175, "eval_steps_per_second": 11.794, "step": 350 }, { "epoch": 0.021409455842997322, "grad_norm": 0.6833013296127319, "learning_rate": 4.5623534490777714e-05, "loss": 2.8396, "step": 360 }, { "epoch": 0.02200416294974725, "grad_norm": 0.7645152807235718, "learning_rate": 3.9779037303794365e-05, "loss": 2.7683, "step": 370 }, { "epoch": 0.022598870056497175, "grad_norm": 0.88221675157547, "learning_rate": 3.42449822418582e-05, "loss": 2.7769, "step": 380 }, { "epoch": 0.0231935771632471, "grad_norm": 1.0650640726089478, "learning_rate": 2.9048330664949622e-05, "loss": 2.782, "step": 390 }, { "epoch": 0.023788284269997028, "grad_norm": 1.8103938102722168, "learning_rate": 2.4214400137185785e-05, "loss": 2.8704, "step": 400 }, { "epoch": 0.023788284269997028, "eval_loss": 2.798283100128174, "eval_runtime": 149.8594, "eval_samples_per_second": 47.244, "eval_steps_per_second": 11.811, "step": 400 }, { "epoch": 0.024382991376746953, "grad_norm": 0.6215288639068604, "learning_rate": 1.976674108219295e-05, "loss": 2.8352, "step": 410 }, { "epoch": 0.024977698483496878, "grad_norm": 0.7338668704032898, "learning_rate": 1.572702204780991e-05, "loss": 2.801, "step": 420 }, { "epoch": 0.025572405590246802, "grad_norm": 0.9176461100578308, "learning_rate": 1.2114924139101056e-05, "loss": 2.7443, "step": 430 }, { "epoch": 0.02616711269699673, "grad_norm": 1.041648507118225, "learning_rate": 8.948045133990798e-06, "loss": 2.7708, "step": 440 }, { "epoch": 0.026761819803746655, "grad_norm": 1.8905234336853027, "learning_rate": 6.241813748658489e-06, "loss": 2.7077, "step": 450 }, { "epoch": 0.026761819803746655, "eval_loss": 2.7940969467163086, "eval_runtime": 149.9687, "eval_samples_per_second": 47.21, "eval_steps_per_second": 11.802, "step": 450 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4749286456688640.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }