{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.20116655865197666, "eval_steps": 50, "global_step": 97, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020738820479585226, "grad_norm": NaN, "learning_rate": 2e-05, "loss": 0.0, "step": 1 }, { "epoch": 0.0020738820479585226, "eval_loss": NaN, "eval_runtime": 232.4311, "eval_samples_per_second": 6.991, "eval_steps_per_second": 1.751, "step": 1 }, { "epoch": 0.004147764095917045, "grad_norm": NaN, "learning_rate": 4e-05, "loss": 0.0, "step": 2 }, { "epoch": 0.006221646143875567, "grad_norm": NaN, "learning_rate": 6e-05, "loss": 0.0, "step": 3 }, { "epoch": 0.00829552819183409, "grad_norm": NaN, "learning_rate": 8e-05, "loss": 0.0, "step": 4 }, { "epoch": 0.010369410239792612, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 0.0, "step": 5 }, { "epoch": 0.012443292287751134, "grad_norm": NaN, "learning_rate": 0.00012, "loss": 0.0, "step": 6 }, { "epoch": 0.014517174335709657, "grad_norm": NaN, "learning_rate": 0.00014, "loss": 0.0, "step": 7 }, { "epoch": 0.01659105638366818, "grad_norm": NaN, "learning_rate": 0.00016, "loss": 0.0, "step": 8 }, { "epoch": 0.018664938431626702, "grad_norm": NaN, "learning_rate": 0.00018, "loss": 0.0, "step": 9 }, { "epoch": 0.020738820479585224, "grad_norm": NaN, "learning_rate": 0.0002, "loss": 0.0, "step": 10 }, { "epoch": 0.022812702527543745, "grad_norm": NaN, "learning_rate": 0.0001999348095389677, "loss": 0.0, "step": 11 }, { "epoch": 0.024886584575502267, "grad_norm": NaN, "learning_rate": 0.000199739323151795, "loss": 0.0, "step": 12 }, { "epoch": 0.026960466623460792, "grad_norm": NaN, "learning_rate": 0.00019941379571543596, "loss": 0.0, "step": 13 }, { "epoch": 0.029034348671419314, "grad_norm": NaN, "learning_rate": 0.00019895865165556377, "loss": 0.0, "step": 14 }, { "epoch": 0.031108230719377836, "grad_norm": NaN, "learning_rate": 0.00019837448439320027, "loss": 0.0, "step": 15 }, { "epoch": 0.03318211276733636, "grad_norm": NaN, "learning_rate": 0.00019766205557100868, "loss": 0.0, "step": 16 }, { "epoch": 0.03525599481529488, "grad_norm": NaN, "learning_rate": 0.00019682229406025635, "loss": 0.0, "step": 17 }, { "epoch": 0.037329876863253404, "grad_norm": NaN, "learning_rate": 0.00019585629474974415, "loss": 0.0, "step": 18 }, { "epoch": 0.03940375891121192, "grad_norm": NaN, "learning_rate": 0.00019476531711828027, "loss": 0.0, "step": 19 }, { "epoch": 0.04147764095917045, "grad_norm": NaN, "learning_rate": 0.0001935507835925601, "loss": 0.0, "step": 20 }, { "epoch": 0.04355152300712897, "grad_norm": NaN, "learning_rate": 0.00019221427769259333, "loss": 0.0, "step": 21 }, { "epoch": 0.04562540505508749, "grad_norm": NaN, "learning_rate": 0.00019075754196709572, "loss": 0.0, "step": 22 }, { "epoch": 0.047699287103046016, "grad_norm": NaN, "learning_rate": 0.00018918247572153823, "loss": 0.0, "step": 23 }, { "epoch": 0.049773169151004534, "grad_norm": NaN, "learning_rate": 0.00018749113254181498, "loss": 0.0, "step": 24 }, { "epoch": 0.05184705119896306, "grad_norm": NaN, "learning_rate": 0.00018568571761675893, "loss": 0.0, "step": 25 }, { "epoch": 0.053920933246921585, "grad_norm": NaN, "learning_rate": 0.00018376858486299647, "loss": 0.0, "step": 26 }, { "epoch": 0.0559948152948801, "grad_norm": NaN, "learning_rate": 0.00018174223385588917, "loss": 0.0, "step": 27 }, { "epoch": 0.05806869734283863, "grad_norm": NaN, "learning_rate": 0.00017960930657056438, "loss": 0.0, "step": 28 }, { "epoch": 0.060142579390797146, "grad_norm": NaN, "learning_rate": 0.00017737258393728364, "loss": 0.0, "step": 29 }, { "epoch": 0.06221646143875567, "grad_norm": NaN, "learning_rate": 0.00017503498221564025, "loss": 0.0, "step": 30 }, { "epoch": 0.0642903434867142, "grad_norm": NaN, "learning_rate": 0.0001725995491923131, "loss": 0.0, "step": 31 }, { "epoch": 0.06636422553467272, "grad_norm": NaN, "learning_rate": 0.00017006946020733425, "loss": 0.0, "step": 32 }, { "epoch": 0.06843810758263123, "grad_norm": NaN, "learning_rate": 0.0001674480140140514, "loss": 0.0, "step": 33 }, { "epoch": 0.07051198963058976, "grad_norm": NaN, "learning_rate": 0.00016473862847818277, "loss": 0.0, "step": 34 }, { "epoch": 0.07258587167854828, "grad_norm": NaN, "learning_rate": 0.0001619448361215723, "loss": 0.0, "step": 35 }, { "epoch": 0.07465975372650681, "grad_norm": NaN, "learning_rate": 0.0001590702795164551, "loss": 0.0, "step": 36 }, { "epoch": 0.07673363577446533, "grad_norm": NaN, "learning_rate": 0.00015611870653623825, "loss": 0.0, "step": 37 }, { "epoch": 0.07880751782242384, "grad_norm": NaN, "learning_rate": 0.0001530939654689887, "loss": 0.0, "step": 38 }, { "epoch": 0.08088139987038237, "grad_norm": NaN, "learning_rate": 0.00015000000000000001, "loss": 0.0, "step": 39 }, { "epoch": 0.0829552819183409, "grad_norm": NaN, "learning_rate": 0.00014684084406997903, "loss": 0.0, "step": 40 }, { "epoch": 0.08502916396629942, "grad_norm": NaN, "learning_rate": 0.00014362061661555675, "loss": 0.0, "step": 41 }, { "epoch": 0.08710304601425795, "grad_norm": NaN, "learning_rate": 0.00014034351619898088, "loss": 0.0, "step": 42 }, { "epoch": 0.08917692806221646, "grad_norm": NaN, "learning_rate": 0.00013701381553399145, "loss": 0.0, "step": 43 }, { "epoch": 0.09125081011017498, "grad_norm": NaN, "learning_rate": 0.0001336358559150175, "loss": 0.0, "step": 44 }, { "epoch": 0.09332469215813351, "grad_norm": NaN, "learning_rate": 0.00013021404155695725, "loss": 0.0, "step": 45 }, { "epoch": 0.09539857420609203, "grad_norm": NaN, "learning_rate": 0.00012675283385292212, "loss": 0.0, "step": 46 }, { "epoch": 0.09747245625405056, "grad_norm": NaN, "learning_rate": 0.00012325674555743106, "loss": 0.0, "step": 47 }, { "epoch": 0.09954633830200907, "grad_norm": NaN, "learning_rate": 0.00011973033490264001, "loss": 0.0, "step": 48 }, { "epoch": 0.1016202203499676, "grad_norm": NaN, "learning_rate": 0.0001161781996552765, "loss": 0.0, "step": 49 }, { "epoch": 0.10369410239792612, "grad_norm": NaN, "learning_rate": 0.00011260497112202895, "loss": 0.0, "step": 50 }, { "epoch": 0.10369410239792612, "eval_loss": NaN, "eval_runtime": 233.0986, "eval_samples_per_second": 6.971, "eval_steps_per_second": 1.746, "step": 50 }, { "epoch": 0.10576798444588464, "grad_norm": NaN, "learning_rate": 0.00010901530811120655, "loss": 0.0, "step": 51 }, { "epoch": 0.10784186649384317, "grad_norm": NaN, "learning_rate": 0.00010541389085854176, "loss": 0.0, "step": 52 }, { "epoch": 0.10991574854180168, "grad_norm": NaN, "learning_rate": 0.00010180541492505604, "loss": 0.0, "step": 53 }, { "epoch": 0.1119896305897602, "grad_norm": NaN, "learning_rate": 9.819458507494394e-05, "loss": 0.0, "step": 54 }, { "epoch": 0.11406351263771873, "grad_norm": NaN, "learning_rate": 9.458610914145826e-05, "loss": 0.0, "step": 55 }, { "epoch": 0.11613739468567726, "grad_norm": NaN, "learning_rate": 9.098469188879349e-05, "loss": 0.0, "step": 56 }, { "epoch": 0.11821127673363578, "grad_norm": NaN, "learning_rate": 8.739502887797107e-05, "loss": 0.0, "step": 57 }, { "epoch": 0.12028515878159429, "grad_norm": NaN, "learning_rate": 8.382180034472353e-05, "loss": 0.0, "step": 58 }, { "epoch": 0.12235904082955282, "grad_norm": NaN, "learning_rate": 8.026966509736001e-05, "loss": 0.0, "step": 59 }, { "epoch": 0.12443292287751134, "grad_norm": NaN, "learning_rate": 7.674325444256899e-05, "loss": 0.0, "step": 60 }, { "epoch": 0.12650680492546987, "grad_norm": NaN, "learning_rate": 7.324716614707793e-05, "loss": 0.0, "step": 61 }, { "epoch": 0.1285806869734284, "grad_norm": NaN, "learning_rate": 6.978595844304271e-05, "loss": 0.0, "step": 62 }, { "epoch": 0.13065456902138692, "grad_norm": NaN, "learning_rate": 6.636414408498249e-05, "loss": 0.0, "step": 63 }, { "epoch": 0.13272845106934544, "grad_norm": NaN, "learning_rate": 6.298618446600856e-05, "loss": 0.0, "step": 64 }, { "epoch": 0.13480233311730394, "grad_norm": NaN, "learning_rate": 5.965648380101916e-05, "loss": 0.0, "step": 65 }, { "epoch": 0.13687621516526247, "grad_norm": NaN, "learning_rate": 5.6379383384443255e-05, "loss": 0.0, "step": 66 }, { "epoch": 0.138950097213221, "grad_norm": NaN, "learning_rate": 5.3159155930021e-05, "loss": 0.0, "step": 67 }, { "epoch": 0.14102397926117952, "grad_norm": NaN, "learning_rate": 5.000000000000002e-05, "loss": 0.0, "step": 68 }, { "epoch": 0.14309786130913804, "grad_norm": NaN, "learning_rate": 4.6906034531011346e-05, "loss": 0.0, "step": 69 }, { "epoch": 0.14517174335709657, "grad_norm": NaN, "learning_rate": 4.388129346376178e-05, "loss": 0.0, "step": 70 }, { "epoch": 0.1472456254050551, "grad_norm": NaN, "learning_rate": 4.092972048354491e-05, "loss": 0.0, "step": 71 }, { "epoch": 0.14931950745301362, "grad_norm": NaN, "learning_rate": 3.80551638784277e-05, "loss": 0.0, "step": 72 }, { "epoch": 0.15139338950097214, "grad_norm": NaN, "learning_rate": 3.5261371521817244e-05, "loss": 0.0, "step": 73 }, { "epoch": 0.15346727154893067, "grad_norm": NaN, "learning_rate": 3.2551985985948616e-05, "loss": 0.0, "step": 74 }, { "epoch": 0.15554115359688916, "grad_norm": NaN, "learning_rate": 2.993053979266577e-05, "loss": 0.0, "step": 75 }, { "epoch": 0.1576150356448477, "grad_norm": NaN, "learning_rate": 2.7400450807686938e-05, "loss": 0.0, "step": 76 }, { "epoch": 0.15968891769280621, "grad_norm": NaN, "learning_rate": 2.496501778435977e-05, "loss": 0.0, "step": 77 }, { "epoch": 0.16176279974076474, "grad_norm": NaN, "learning_rate": 2.2627416062716366e-05, "loss": 0.0, "step": 78 }, { "epoch": 0.16383668178872327, "grad_norm": NaN, "learning_rate": 2.0390693429435627e-05, "loss": 0.0, "step": 79 }, { "epoch": 0.1659105638366818, "grad_norm": NaN, "learning_rate": 1.825776614411082e-05, "loss": 0.0, "step": 80 }, { "epoch": 0.16798444588464032, "grad_norm": NaN, "learning_rate": 1.6231415137003537e-05, "loss": 0.0, "step": 81 }, { "epoch": 0.17005832793259884, "grad_norm": NaN, "learning_rate": 1.4314282383241096e-05, "loss": 0.0, "step": 82 }, { "epoch": 0.17213220998055737, "grad_norm": NaN, "learning_rate": 1.2508867458185037e-05, "loss": 0.0, "step": 83 }, { "epoch": 0.1742060920285159, "grad_norm": NaN, "learning_rate": 1.0817524278461776e-05, "loss": 0.0, "step": 84 }, { "epoch": 0.1762799740764744, "grad_norm": NaN, "learning_rate": 9.242458032904311e-06, "loss": 0.0, "step": 85 }, { "epoch": 0.1783538561244329, "grad_norm": NaN, "learning_rate": 7.785722307406684e-06, "loss": 0.0, "step": 86 }, { "epoch": 0.18042773817239144, "grad_norm": NaN, "learning_rate": 6.4492164074399065e-06, "loss": 0.0, "step": 87 }, { "epoch": 0.18250162022034996, "grad_norm": NaN, "learning_rate": 5.2346828817197655e-06, "loss": 0.0, "step": 88 }, { "epoch": 0.1845755022683085, "grad_norm": NaN, "learning_rate": 4.143705250255869e-06, "loss": 0.0, "step": 89 }, { "epoch": 0.18664938431626701, "grad_norm": NaN, "learning_rate": 3.1777059397436692e-06, "loss": 0.0, "step": 90 }, { "epoch": 0.18872326636422554, "grad_norm": NaN, "learning_rate": 2.3379444289913342e-06, "loss": 0.0, "step": 91 }, { "epoch": 0.19079714841218406, "grad_norm": NaN, "learning_rate": 1.6255156067997323e-06, "loss": 0.0, "step": 92 }, { "epoch": 0.1928710304601426, "grad_norm": NaN, "learning_rate": 1.0413483444362771e-06, "loss": 0.0, "step": 93 }, { "epoch": 0.19494491250810111, "grad_norm": NaN, "learning_rate": 5.862042845640403e-07, "loss": 0.0, "step": 94 }, { "epoch": 0.1970187945560596, "grad_norm": NaN, "learning_rate": 2.606768482050215e-07, "loss": 0.0, "step": 95 }, { "epoch": 0.19909267660401814, "grad_norm": NaN, "learning_rate": 6.519046103230508e-08, "loss": 0.0, "step": 96 }, { "epoch": 0.20116655865197666, "grad_norm": NaN, "learning_rate": 0.0, "loss": 0.0, "step": 97 } ], "logging_steps": 1, "max_steps": 97, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.0708856401874125e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }