{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9921259842519685, "eval_steps": 16, "global_step": 63, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015748031496062992, "grad_norm": 0.034481361508369446, "learning_rate": 4e-05, "loss": 0.1412, "step": 1 }, { "epoch": 0.015748031496062992, "eval_loss": 0.1612786203622818, "eval_runtime": 64.5157, "eval_samples_per_second": 7.812, "eval_steps_per_second": 0.977, "step": 1 }, { "epoch": 0.031496062992125984, "grad_norm": 0.029317770153284073, "learning_rate": 8e-05, "loss": 0.1191, "step": 2 }, { "epoch": 0.047244094488188976, "grad_norm": 0.036621659994125366, "learning_rate": 0.00012, "loss": 0.1369, "step": 3 }, { "epoch": 0.06299212598425197, "grad_norm": 0.04425783455371857, "learning_rate": 0.00016, "loss": 0.1321, "step": 4 }, { "epoch": 0.07874015748031496, "grad_norm": 0.05247063934803009, "learning_rate": 0.0002, "loss": 0.1285, "step": 5 }, { "epoch": 0.09448818897637795, "grad_norm": 0.03902214765548706, "learning_rate": 0.00019996629653035126, "loss": 0.1004, "step": 6 }, { "epoch": 0.11023622047244094, "grad_norm": 0.03752463683485985, "learning_rate": 0.00019986520883988232, "loss": 0.0985, "step": 7 }, { "epoch": 0.12598425196850394, "grad_norm": 0.03061060793697834, "learning_rate": 0.00019969680506871137, "loss": 0.0912, "step": 8 }, { "epoch": 0.14173228346456693, "grad_norm": 0.034427180886268616, "learning_rate": 0.00019946119873266613, "loss": 0.0836, "step": 9 }, { "epoch": 0.15748031496062992, "grad_norm": 0.03106631338596344, "learning_rate": 0.00019915854864676664, "loss": 0.0734, "step": 10 }, { "epoch": 0.1732283464566929, "grad_norm": 0.02498232200741768, "learning_rate": 0.00019878905881817252, "loss": 0.0729, "step": 11 }, { "epoch": 0.1889763779527559, "grad_norm": 0.03798564895987511, "learning_rate": 0.00019835297830866826, "loss": 0.0694, "step": 12 }, { "epoch": 0.2047244094488189, "grad_norm": 0.046124912798404694, "learning_rate": 0.00019785060106677818, "loss": 0.0833, "step": 13 }, { "epoch": 0.2204724409448819, "grad_norm": 0.02981509082019329, "learning_rate": 0.00019728226572962473, "loss": 0.0713, "step": 14 }, { "epoch": 0.23622047244094488, "grad_norm": 0.02461801841855049, "learning_rate": 0.0001966483553946637, "loss": 0.0657, "step": 15 }, { "epoch": 0.25196850393700787, "grad_norm": 0.04344266653060913, "learning_rate": 0.00019594929736144976, "loss": 0.0635, "step": 16 }, { "epoch": 0.25196850393700787, "eval_loss": 0.06579381227493286, "eval_runtime": 64.6166, "eval_samples_per_second": 7.8, "eval_steps_per_second": 0.975, "step": 16 }, { "epoch": 0.2677165354330709, "grad_norm": 0.0320642925798893, "learning_rate": 0.00019518556284360696, "loss": 0.0656, "step": 17 }, { "epoch": 0.28346456692913385, "grad_norm": 0.028899891301989555, "learning_rate": 0.0001943576666511982, "loss": 0.0462, "step": 18 }, { "epoch": 0.2992125984251969, "grad_norm": 0.02383616380393505, "learning_rate": 0.0001934661668437073, "loss": 0.0649, "step": 19 }, { "epoch": 0.31496062992125984, "grad_norm": 0.03346535563468933, "learning_rate": 0.0001925116643538684, "loss": 0.0546, "step": 20 }, { "epoch": 0.33070866141732286, "grad_norm": 0.020454615354537964, "learning_rate": 0.00019149480258259533, "loss": 0.0538, "step": 21 }, { "epoch": 0.3464566929133858, "grad_norm": 0.02081696316599846, "learning_rate": 0.00019041626696528503, "loss": 0.0526, "step": 22 }, { "epoch": 0.36220472440944884, "grad_norm": 
0.028128350153565407, "learning_rate": 0.0001892767845097864, "loss": 0.0593, "step": 23 }, { "epoch": 0.3779527559055118, "grad_norm": 0.015519126318395138, "learning_rate": 0.00018807712330634642, "loss": 0.0528, "step": 24 }, { "epoch": 0.3937007874015748, "grad_norm": 0.03593792766332626, "learning_rate": 0.0001868180920098644, "loss": 0.0481, "step": 25 }, { "epoch": 0.4094488188976378, "grad_norm": 0.015408644452691078, "learning_rate": 0.00018550053929480202, "loss": 0.0479, "step": 26 }, { "epoch": 0.4251968503937008, "grad_norm": 0.021226301789283752, "learning_rate": 0.00018412535328311814, "loss": 0.054, "step": 27 }, { "epoch": 0.4409448818897638, "grad_norm": 0.01717953570187092, "learning_rate": 0.0001826934609456129, "loss": 0.0523, "step": 28 }, { "epoch": 0.4566929133858268, "grad_norm": 0.019626960158348083, "learning_rate": 0.00018120582747708502, "loss": 0.0512, "step": 29 }, { "epoch": 0.47244094488188976, "grad_norm": 0.019186396151781082, "learning_rate": 0.0001796634556457236, "loss": 0.05, "step": 30 }, { "epoch": 0.4881889763779528, "grad_norm": 0.014989328570663929, "learning_rate": 0.0001780673851171728, "loss": 0.0441, "step": 31 }, { "epoch": 0.5039370078740157, "grad_norm": 0.012519012205302715, "learning_rate": 0.00017641869175372493, "loss": 0.0459, "step": 32 }, { "epoch": 0.5039370078740157, "eval_loss": 0.056131936609745026, "eval_runtime": 64.4985, "eval_samples_per_second": 7.814, "eval_steps_per_second": 0.977, "step": 32 }, { "epoch": 0.5196850393700787, "grad_norm": 0.01598576456308365, "learning_rate": 0.00017471848688911464, "loss": 0.0496, "step": 33 }, { "epoch": 0.5354330708661418, "grad_norm": 0.017361309379339218, "learning_rate": 0.000172967916579403, "loss": 0.0534, "step": 34 }, { "epoch": 0.5511811023622047, "grad_norm": 0.021230200305581093, "learning_rate": 0.00017116816083045602, "loss": 0.0505, "step": 35 }, { "epoch": 0.5669291338582677, "grad_norm": 0.01624094881117344, "learning_rate": 0.0001693204328025389, "loss": 0.0568, "step": 36 }, { "epoch": 0.5826771653543307, "grad_norm": 0.014916475862264633, "learning_rate": 0.00016742597799256182, "loss": 0.0542, "step": 37 }, { "epoch": 0.5984251968503937, "grad_norm": 0.013211382552981377, "learning_rate": 0.00016548607339452853, "loss": 0.0507, "step": 38 }, { "epoch": 0.6141732283464567, "grad_norm": 0.01305565144866705, "learning_rate": 0.00016350202663875386, "loss": 0.0387, "step": 39 }, { "epoch": 0.6299212598425197, "grad_norm": 0.011459614150226116, "learning_rate": 0.0001614751751104301, "loss": 0.0433, "step": 40 }, { "epoch": 0.6456692913385826, "grad_norm": 0.014712609350681305, "learning_rate": 0.00015940688504813662, "loss": 0.0571, "step": 41 }, { "epoch": 0.6614173228346457, "grad_norm": 0.015662657096982002, "learning_rate": 0.00015729855062290022, "loss": 0.0504, "step": 42 }, { "epoch": 0.6771653543307087, "grad_norm": 0.011235736310482025, "learning_rate": 0.00015515159299842707, "loss": 0.0453, "step": 43 }, { "epoch": 0.6929133858267716, "grad_norm": 0.011984420008957386, "learning_rate": 0.00015296745937313987, "loss": 0.0402, "step": 44 }, { "epoch": 0.7086614173228346, "grad_norm": 0.010523953475058079, "learning_rate": 0.00015074762200466556, "loss": 0.036, "step": 45 }, { "epoch": 0.7244094488188977, "grad_norm": 0.013540665619075298, "learning_rate": 0.00014849357721743168, "loss": 0.0346, "step": 46 }, { "epoch": 0.7401574803149606, "grad_norm": 0.012998640537261963, "learning_rate": 0.00014620684439403962, "loss": 0.0468, "step": 47 }, { "epoch": 
0.7559055118110236, "grad_norm": 0.01443515345454216, "learning_rate": 0.0001438889649510956, "loss": 0.0453, "step": 48 }, { "epoch": 0.7559055118110236, "eval_loss": 0.05216333642601967, "eval_runtime": 64.5137, "eval_samples_per_second": 7.812, "eval_steps_per_second": 0.977, "step": 48 }, { "epoch": 0.7716535433070866, "grad_norm": 0.01463907677680254, "learning_rate": 0.00014154150130018866, "loss": 0.0526, "step": 49 }, { "epoch": 0.7874015748031497, "grad_norm": 0.01614455319941044, "learning_rate": 0.00013916603579471705, "loss": 0.0484, "step": 50 }, { "epoch": 0.8031496062992126, "grad_norm": 0.014042153023183346, "learning_rate": 0.000136764169663272, "loss": 0.0419, "step": 51 }, { "epoch": 0.8188976377952756, "grad_norm": 0.015309924259781837, "learning_rate": 0.00013433752193029886, "loss": 0.0425, "step": 52 }, { "epoch": 0.8346456692913385, "grad_norm": 0.018054217100143433, "learning_rate": 0.00013188772832476188, "loss": 0.0426, "step": 53 }, { "epoch": 0.8503937007874016, "grad_norm": 0.012343033216893673, "learning_rate": 0.00012941644017754964, "loss": 0.0448, "step": 54 }, { "epoch": 0.8661417322834646, "grad_norm": 0.012457596138119698, "learning_rate": 0.00012692532330836346, "loss": 0.0451, "step": 55 }, { "epoch": 0.8818897637795275, "grad_norm": 0.013512413017451763, "learning_rate": 0.00012441605690283915, "loss": 0.0413, "step": 56 }, { "epoch": 0.8976377952755905, "grad_norm": 0.013424846343696117, "learning_rate": 0.0001218903323806595, "loss": 0.0441, "step": 57 }, { "epoch": 0.9133858267716536, "grad_norm": 0.014157367870211601, "learning_rate": 0.00011934985225541998, "loss": 0.0443, "step": 58 }, { "epoch": 0.9291338582677166, "grad_norm": 0.0130110839381814, "learning_rate": 0.00011679632898701649, "loss": 0.0478, "step": 59 }, { "epoch": 0.9448818897637795, "grad_norm": 0.012677576392889023, "learning_rate": 0.00011423148382732853, "loss": 0.0399, "step": 60 }, { "epoch": 0.9606299212598425, "grad_norm": 0.01409006118774414, "learning_rate": 0.00011165704565997593, "loss": 0.0481, "step": 61 }, { "epoch": 0.9763779527559056, "grad_norm": 0.013535700738430023, "learning_rate": 0.00010907474983493144, "loss": 0.0406, "step": 62 }, { "epoch": 0.9921259842519685, "grad_norm": 0.014210895635187626, "learning_rate": 0.0001064863369987743, "loss": 0.0425, "step": 63 } ], "logging_steps": 1, "max_steps": 126, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 63, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.3132684787266355e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }