{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.04435573297848747, "eval_steps": 13, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008871146595697493, "grad_norm": 2.552351713180542, "learning_rate": 5e-06, "loss": 2.0461, "step": 1 }, { "epoch": 0.0008871146595697493, "eval_loss": 2.357318878173828, "eval_runtime": 22.7138, "eval_samples_per_second": 20.912, "eval_steps_per_second": 10.478, "step": 1 }, { "epoch": 0.0017742293191394987, "grad_norm": 1.873948097229004, "learning_rate": 1e-05, "loss": 1.5926, "step": 2 }, { "epoch": 0.0026613439787092482, "grad_norm": 3.1655471324920654, "learning_rate": 1.5e-05, "loss": 2.3712, "step": 3 }, { "epoch": 0.0035484586382789974, "grad_norm": 2.2303149700164795, "learning_rate": 2e-05, "loss": 1.7694, "step": 4 }, { "epoch": 0.004435573297848747, "grad_norm": 2.6942412853240967, "learning_rate": 2.5e-05, "loss": 2.3172, "step": 5 }, { "epoch": 0.0053226879574184965, "grad_norm": 2.11533260345459, "learning_rate": 3e-05, "loss": 1.7689, "step": 6 }, { "epoch": 0.006209802616988246, "grad_norm": 2.6214680671691895, "learning_rate": 3.5e-05, "loss": 1.8771, "step": 7 }, { "epoch": 0.007096917276557995, "grad_norm": 2.7126352787017822, "learning_rate": 4e-05, "loss": 1.7355, "step": 8 }, { "epoch": 0.007984031936127744, "grad_norm": 2.914896011352539, "learning_rate": 4.5e-05, "loss": 1.7516, "step": 9 }, { "epoch": 0.008871146595697495, "grad_norm": 3.065276861190796, "learning_rate": 5e-05, "loss": 1.7267, "step": 10 }, { "epoch": 0.009758261255267244, "grad_norm": 3.1432533264160156, "learning_rate": 4.99229333433282e-05, "loss": 2.071, "step": 11 }, { "epoch": 0.010645375914836993, "grad_norm": 2.5506999492645264, "learning_rate": 4.9692208514878444e-05, "loss": 1.4475, "step": 12 }, { "epoch": 0.011532490574406742, "grad_norm": 2.507850170135498, "learning_rate": 4.9309248009941914e-05, "loss": 1.3582, "step": 13 }, { "epoch": 0.011532490574406742, "eval_loss": 1.3111391067504883, "eval_runtime": 22.7388, "eval_samples_per_second": 20.889, "eval_steps_per_second": 10.467, "step": 13 }, { "epoch": 0.012419605233976491, "grad_norm": 2.5213797092437744, "learning_rate": 4.877641290737884e-05, "loss": 1.3541, "step": 14 }, { "epoch": 0.01330671989354624, "grad_norm": 2.1459462642669678, "learning_rate": 4.8096988312782174e-05, "loss": 0.9014, "step": 15 }, { "epoch": 0.01419383455311599, "grad_norm": 2.892573833465576, "learning_rate": 4.72751631047092e-05, "loss": 1.0082, "step": 16 }, { "epoch": 0.01508094921268574, "grad_norm": 2.568525791168213, "learning_rate": 4.6316004108852305e-05, "loss": 0.8953, "step": 17 }, { "epoch": 0.015968063872255488, "grad_norm": 2.7149441242218018, "learning_rate": 4.522542485937369e-05, "loss": 0.9834, "step": 18 }, { "epoch": 0.01685517853182524, "grad_norm": 1.9368983507156372, "learning_rate": 4.401014914000078e-05, "loss": 0.8917, "step": 19 }, { "epoch": 0.01774229319139499, "grad_norm": 2.554739236831665, "learning_rate": 4.267766952966369e-05, "loss": 0.7854, "step": 20 }, { "epoch": 0.018629407850964737, "grad_norm": 2.128541946411133, "learning_rate": 4.123620120825459e-05, "loss": 0.7922, "step": 21 }, { "epoch": 0.019516522510534488, "grad_norm": 1.9166874885559082, "learning_rate": 3.969463130731183e-05, "loss": 0.6496, "step": 22 }, { "epoch": 0.020403637170104235, "grad_norm": 1.746896743774414, "learning_rate": 3.8062464117898724e-05, "loss": 0.6681, "step": 23 }, { "epoch": 
0.021290751829673986, "grad_norm": 2.7109410762786865, "learning_rate": 3.634976249348867e-05, "loss": 0.8622, "step": 24 }, { "epoch": 0.022177866489243733, "grad_norm": 2.082806348800659, "learning_rate": 3.456708580912725e-05, "loss": 0.6218, "step": 25 }, { "epoch": 0.023064981148813484, "grad_norm": 1.8518857955932617, "learning_rate": 3.272542485937369e-05, "loss": 0.5705, "step": 26 }, { "epoch": 0.023064981148813484, "eval_loss": 0.6528695225715637, "eval_runtime": 22.7482, "eval_samples_per_second": 20.881, "eval_steps_per_second": 10.462, "step": 26 }, { "epoch": 0.023952095808383235, "grad_norm": 2.120460271835327, "learning_rate": 3.083613409639764e-05, "loss": 0.4645, "step": 27 }, { "epoch": 0.024839210467952982, "grad_norm": 1.6701194047927856, "learning_rate": 2.8910861626005776e-05, "loss": 0.6054, "step": 28 }, { "epoch": 0.025726325127522733, "grad_norm": 1.8231170177459717, "learning_rate": 2.6961477393196126e-05, "loss": 0.4044, "step": 29 }, { "epoch": 0.02661343978709248, "grad_norm": 1.5324640274047852, "learning_rate": 2.5e-05, "loss": 0.3928, "step": 30 }, { "epoch": 0.02750055444666223, "grad_norm": 3.182884693145752, "learning_rate": 2.303852260680388e-05, "loss": 0.6412, "step": 31 }, { "epoch": 0.02838766910623198, "grad_norm": 1.9924638271331787, "learning_rate": 2.1089138373994223e-05, "loss": 0.5593, "step": 32 }, { "epoch": 0.02927478376580173, "grad_norm": 1.4829778671264648, "learning_rate": 1.9163865903602374e-05, "loss": 0.471, "step": 33 }, { "epoch": 0.03016189842537148, "grad_norm": 1.884485125541687, "learning_rate": 1.7274575140626318e-05, "loss": 0.5697, "step": 34 }, { "epoch": 0.031049013084941228, "grad_norm": 1.922612190246582, "learning_rate": 1.5432914190872757e-05, "loss": 0.3865, "step": 35 }, { "epoch": 0.031936127744510975, "grad_norm": 1.9512248039245605, "learning_rate": 1.3650237506511331e-05, "loss": 0.4733, "step": 36 }, { "epoch": 0.03282324240408073, "grad_norm": 1.9390307664871216, "learning_rate": 1.1937535882101281e-05, "loss": 0.5349, "step": 37 }, { "epoch": 0.03371035706365048, "grad_norm": 1.8721624612808228, "learning_rate": 1.0305368692688174e-05, "loss": 0.7239, "step": 38 }, { "epoch": 0.034597471723220224, "grad_norm": 1.7561758756637573, "learning_rate": 8.763798791745411e-06, "loss": 0.5401, "step": 39 }, { "epoch": 0.034597471723220224, "eval_loss": 0.583450198173523, "eval_runtime": 22.7264, "eval_samples_per_second": 20.901, "eval_steps_per_second": 10.472, "step": 39 }, { "epoch": 0.03548458638278998, "grad_norm": 1.6044561862945557, "learning_rate": 7.3223304703363135e-06, "loss": 0.525, "step": 40 }, { "epoch": 0.036371701042359726, "grad_norm": 2.315598726272583, "learning_rate": 5.989850859999227e-06, "loss": 0.6067, "step": 41 }, { "epoch": 0.037258815701929474, "grad_norm": 1.8639142513275146, "learning_rate": 4.7745751406263165e-06, "loss": 0.6147, "step": 42 }, { "epoch": 0.03814593036149922, "grad_norm": 2.1164658069610596, "learning_rate": 3.6839958911476957e-06, "loss": 0.5504, "step": 43 }, { "epoch": 0.039033045021068975, "grad_norm": 2.0981204509735107, "learning_rate": 2.7248368952908053e-06, "loss": 0.6325, "step": 44 }, { "epoch": 0.03992015968063872, "grad_norm": 1.7437758445739746, "learning_rate": 1.9030116872178316e-06, "loss": 0.4499, "step": 45 }, { "epoch": 0.04080727434020847, "grad_norm": 2.0806751251220703, "learning_rate": 1.2235870926211619e-06, "loss": 0.5006, "step": 46 }, { "epoch": 0.041694388999778224, "grad_norm": 2.024292469024658, "learning_rate": 6.907519900580861e-07, 
"loss": 0.5585, "step": 47 }, { "epoch": 0.04258150365934797, "grad_norm": 1.8988577127456665, "learning_rate": 3.077914851215585e-07, "loss": 0.4586, "step": 48 }, { "epoch": 0.04346861831891772, "grad_norm": 2.1858489513397217, "learning_rate": 7.706665667180091e-08, "loss": 0.5561, "step": 49 }, { "epoch": 0.04435573297848747, "grad_norm": 2.0317609310150146, "learning_rate": 0.0, "loss": 0.5758, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.02619680866304e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }