{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4678362573099415, "eval_steps": 500, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5e-05, "loss": 1.0506, "step": 1 }, { "epoch": 0.01, "learning_rate": 0.0001, "loss": 0.9988, "step": 2 }, { "epoch": 0.01, "learning_rate": 0.00015000000000000001, "loss": 0.9783, "step": 3 }, { "epoch": 0.01, "learning_rate": 0.0002, "loss": 0.9849, "step": 4 }, { "epoch": 0.02, "learning_rate": 0.00025, "loss": 1.0159, "step": 5 }, { "epoch": 0.02, "learning_rate": 0.00030000000000000003, "loss": 0.9847, "step": 6 }, { "epoch": 0.02, "learning_rate": 0.00034999999999999994, "loss": 0.9101, "step": 7 }, { "epoch": 0.02, "learning_rate": 0.0004, "loss": 0.9445, "step": 8 }, { "epoch": 0.03, "learning_rate": 0.00045, "loss": 0.8578, "step": 9 }, { "epoch": 0.03, "learning_rate": 0.0005, "loss": 0.9356, "step": 10 }, { "epoch": 0.03, "learning_rate": 0.0005499999999999999, "loss": 0.8395, "step": 11 }, { "epoch": 0.04, "learning_rate": 0.0006000000000000001, "loss": 0.9002, "step": 12 }, { "epoch": 0.04, "learning_rate": 0.00065, "loss": 0.8955, "step": 13 }, { "epoch": 0.04, "learning_rate": 0.0006499959204043461, "loss": 0.902, "step": 14 }, { "epoch": 0.05, "learning_rate": 0.0006499836817198032, "loss": 0.8578, "step": 15 }, { "epoch": 0.05, "learning_rate": 0.0006499632842536263, "loss": 0.9005, "step": 16 }, { "epoch": 0.05, "learning_rate": 0.0006499347285178979, "loss": 0.8539, "step": 17 }, { "epoch": 0.06, "learning_rate": 0.0006498980152295153, "loss": 0.8595, "step": 18 }, { "epoch": 0.06, "learning_rate": 0.0006498531453101735, "loss": 0.8845, "step": 19 }, { "epoch": 0.06, "learning_rate": 0.0006498001198863406, "loss": 0.8924, "step": 20 }, { "epoch": 0.07, "learning_rate": 0.000649738940289231, "loss": 0.8365, "step": 21 }, { "epoch": 0.07, "learning_rate": 0.0006496696080547707, "loss": 0.8462, "step": 22 }, { "epoch": 0.07, "learning_rate": 0.0006495921249235596, "loss": 0.8528, "step": 23 }, { "epoch": 0.07, "learning_rate": 0.0006495064928408277, "loss": 0.8159, "step": 24 }, { "epoch": 0.08, "learning_rate": 0.0006494127139563859, "loss": 0.8245, "step": 25 }, { "epoch": 0.08, "learning_rate": 0.000649310790624572, "loss": 0.8081, "step": 26 }, { "epoch": 0.08, "learning_rate": 0.0006492007254041924, "loss": 0.8535, "step": 27 }, { "epoch": 0.09, "learning_rate": 0.0006490825210584566, "loss": 0.8162, "step": 28 }, { "epoch": 0.09, "learning_rate": 0.0006489561805549089, "loss": 0.8456, "step": 29 }, { "epoch": 0.09, "learning_rate": 0.0006488217070653535, "loss": 0.7799, "step": 30 }, { "epoch": 0.1, "learning_rate": 0.0006486791039657748, "loss": 0.8088, "step": 31 }, { "epoch": 0.1, "learning_rate": 0.0006485283748362524, "loss": 0.8683, "step": 32 }, { "epoch": 0.1, "learning_rate": 0.0006483695234608723, "loss": 0.8871, "step": 33 }, { "epoch": 0.11, "learning_rate": 0.0006482025538276304, "loss": 0.7711, "step": 34 }, { "epoch": 0.11, "learning_rate": 0.0006480274701283335, "loss": 0.7621, "step": 35 }, { "epoch": 0.11, "learning_rate": 0.0006478442767584937, "loss": 0.8243, "step": 36 }, { "epoch": 0.12, "learning_rate": 0.0006476529783172177, "loss": 0.8257, "step": 37 }, { "epoch": 0.12, "learning_rate": 0.0006474535796070919, "loss": 0.8141, "step": 38 }, { "epoch": 0.12, "learning_rate": 0.0006472460856340619, "loss": 0.8109, "step": 39 }, { "epoch": 0.12, "learning_rate": 0.000647030501607306, "loss": 0.7873, "step": 40 }, { "epoch": 0.13, "learning_rate": 0.000646806832939105, "loss": 0.7386, "step": 41 }, { "epoch": 0.13, "learning_rate": 0.0006465750852447068, "loss": 0.8636, "step": 42 }, { "epoch": 0.13, "learning_rate": 0.0006463352643421846, "loss": 0.7357, "step": 43 }, { "epoch": 0.14, "learning_rate": 0.0006460873762522906, "loss": 0.8142, "step": 44 }, { "epoch": 0.14, "learning_rate": 0.0006458314271983063, "loss": 0.7275, "step": 45 }, { "epoch": 0.14, "learning_rate": 0.0006455674236058847, "loss": 0.8029, "step": 46 }, { "epoch": 0.15, "learning_rate": 0.00064529537210289, "loss": 0.7901, "step": 47 }, { "epoch": 0.15, "learning_rate": 0.0006450152795192307, "loss": 0.7788, "step": 48 }, { "epoch": 0.15, "learning_rate": 0.0006447271528866881, "loss": 0.7621, "step": 49 }, { "epoch": 0.16, "learning_rate": 0.0006444309994387402, "loss": 0.7537, "step": 50 }, { "epoch": 0.16, "learning_rate": 0.0006441268266103796, "loss": 0.7917, "step": 51 }, { "epoch": 0.16, "learning_rate": 0.0006438146420379274, "loss": 0.8451, "step": 52 }, { "epoch": 0.17, "learning_rate": 0.0006434944535588411, "loss": 0.8369, "step": 53 }, { "epoch": 0.17, "learning_rate": 0.0006431662692115173, "loss": 0.7637, "step": 54 }, { "epoch": 0.17, "learning_rate": 0.0006428300972350914, "loss": 0.8365, "step": 55 }, { "epoch": 0.17, "learning_rate": 0.0006424859460692295, "loss": 0.7633, "step": 56 }, { "epoch": 0.18, "learning_rate": 0.0006421338243539165, "loss": 0.7718, "step": 57 }, { "epoch": 0.18, "learning_rate": 0.0006417737409292403, "loss": 0.7672, "step": 58 }, { "epoch": 0.18, "learning_rate": 0.0006414057048351684, "loss": 0.8107, "step": 59 }, { "epoch": 0.19, "learning_rate": 0.0006410297253113221, "loss": 0.7979, "step": 60 }, { "epoch": 0.19, "learning_rate": 0.0006406458117967443, "loss": 0.7634, "step": 61 }, { "epoch": 0.19, "learning_rate": 0.0006402539739296618, "loss": 0.7504, "step": 62 }, { "epoch": 0.2, "learning_rate": 0.0006398542215472443, "loss": 0.8082, "step": 63 }, { "epoch": 0.2, "learning_rate": 0.0006394465646853571, "loss": 0.8355, "step": 64 }, { "epoch": 0.2, "learning_rate": 0.0006390310135783086, "loss": 0.7458, "step": 65 }, { "epoch": 0.21, "learning_rate": 0.0006386075786585944, "loss": 0.7525, "step": 66 }, { "epoch": 0.21, "learning_rate": 0.0006381762705566343, "loss": 0.7464, "step": 67 }, { "epoch": 0.21, "learning_rate": 0.0006377371001005063, "loss": 0.78, "step": 68 }, { "epoch": 0.22, "learning_rate": 0.0006372900783156745, "loss": 0.7752, "step": 69 }, { "epoch": 0.22, "learning_rate": 0.0006368352164247117, "loss": 0.7299, "step": 70 }, { "epoch": 0.22, "learning_rate": 0.0006363725258470184, "loss": 0.7722, "step": 71 }, { "epoch": 0.22, "learning_rate": 0.0006359020181985365, "loss": 0.8236, "step": 72 }, { "epoch": 0.23, "learning_rate": 0.0006354237052914561, "loss": 0.7589, "step": 73 }, { "epoch": 0.23, "learning_rate": 0.0006349375991339202, "loss": 0.7948, "step": 74 }, { "epoch": 0.23, "learning_rate": 0.0006344437119297233, "loss": 0.7528, "step": 75 }, { "epoch": 0.24, "learning_rate": 0.0006339420560780045, "loss": 0.7842, "step": 76 }, { "epoch": 0.24, "learning_rate": 0.0006334326441729361, "loss": 0.7541, "step": 77 }, { "epoch": 0.24, "learning_rate": 0.000632915489003408, "loss": 0.7425, "step": 78 }, { "epoch": 0.25, "learning_rate": 0.0006323906035527062, "loss": 0.8168, "step": 79 }, { "epoch": 0.25, "learning_rate": 0.0006318580009981871, "loss": 0.8074, "step": 80 }, { "epoch": 0.25, "learning_rate": 0.0006313176947109465, "loss": 0.7679, "step": 81 }, { "epoch": 0.26, "learning_rate": 0.0006307696982554838, "loss": 0.7465, "step": 82 }, { "epoch": 0.26, "learning_rate": 0.0006302140253893622, "loss": 0.7073, "step": 83 }, { "epoch": 0.26, "learning_rate": 0.0006296506900628619, "loss": 0.7687, "step": 84 }, { "epoch": 0.27, "learning_rate": 0.0006290797064186315, "loss": 0.7578, "step": 85 }, { "epoch": 0.27, "learning_rate": 0.0006285010887913319, "loss": 0.7494, "step": 86 }, { "epoch": 0.27, "learning_rate": 0.0006279148517072765, "loss": 0.7326, "step": 87 }, { "epoch": 0.27, "learning_rate": 0.000627321009884067, "loss": 0.7603, "step": 88 }, { "epoch": 0.28, "learning_rate": 0.0006267195782302236, "loss": 0.8141, "step": 89 }, { "epoch": 0.28, "learning_rate": 0.0006261105718448105, "loss": 0.7542, "step": 90 }, { "epoch": 0.28, "learning_rate": 0.0006254940060170575, "loss": 0.7597, "step": 91 }, { "epoch": 0.29, "learning_rate": 0.0006248698962259753, "loss": 0.7332, "step": 92 }, { "epoch": 0.29, "learning_rate": 0.0006242382581399676, "loss": 0.7031, "step": 93 }, { "epoch": 0.29, "learning_rate": 0.0006235991076164375, "loss": 0.7258, "step": 94 }, { "epoch": 0.3, "learning_rate": 0.0006229524607013892, "loss": 0.7634, "step": 95 }, { "epoch": 0.3, "learning_rate": 0.0006222983336290254, "loss": 0.765, "step": 96 }, { "epoch": 0.3, "learning_rate": 0.0006216367428213398, "loss": 0.7246, "step": 97 }, { "epoch": 0.31, "learning_rate": 0.0006209677048877046, "loss": 0.7115, "step": 98 }, { "epoch": 0.31, "learning_rate": 0.0006202912366244535, "loss": 0.6748, "step": 99 }, { "epoch": 0.31, "learning_rate": 0.0006196073550144604, "loss": 0.6995, "step": 100 }, { "epoch": 0.32, "learning_rate": 0.0006189160772267127, "loss": 0.7764, "step": 101 }, { "epoch": 0.32, "learning_rate": 0.00061821742061588, "loss": 0.8628, "step": 102 }, { "epoch": 0.32, "learning_rate": 0.0006175114027218794, "loss": 0.7266, "step": 103 }, { "epoch": 0.32, "learning_rate": 0.0006167980412694342, "loss": 0.7557, "step": 104 }, { "epoch": 0.33, "learning_rate": 0.0006160773541676288, "loss": 0.7518, "step": 105 }, { "epoch": 0.33, "learning_rate": 0.0006153493595094602, "loss": 0.7589, "step": 106 }, { "epoch": 0.33, "learning_rate": 0.000614614075571383, "loss": 0.7506, "step": 107 }, { "epoch": 0.34, "learning_rate": 0.0006138715208128501, "loss": 0.6617, "step": 108 }, { "epoch": 0.34, "learning_rate": 0.0006131217138758505, "loss": 0.7396, "step": 109 }, { "epoch": 0.34, "learning_rate": 0.0006123646735844401, "loss": 0.7666, "step": 110 }, { "epoch": 0.35, "learning_rate": 0.00061160041894427, "loss": 0.7555, "step": 111 }, { "epoch": 0.35, "learning_rate": 0.0006108289691421089, "loss": 0.7301, "step": 112 }, { "epoch": 0.35, "learning_rate": 0.0006100503435453614, "loss": 0.7364, "step": 113 }, { "epoch": 0.36, "learning_rate": 0.0006092645617015822, "loss": 0.7461, "step": 114 }, { "epoch": 0.36, "learning_rate": 0.0006084716433379844, "loss": 0.8086, "step": 115 }, { "epoch": 0.36, "learning_rate": 0.0006076716083609456, "loss": 0.7577, "step": 116 }, { "epoch": 0.36, "learning_rate": 0.0006068644768555068, "loss": 0.7094, "step": 117 }, { "epoch": 0.37, "learning_rate": 0.0006060502690848696, "loss": 0.726, "step": 118 }, { "epoch": 0.37, "learning_rate": 0.0006052290054898859, "loss": 0.7243, "step": 119 }, { "epoch": 0.37, "learning_rate": 0.0006044007066885458, "loss": 0.7119, "step": 120 }, { "epoch": 0.38, "learning_rate": 0.0006035653934754598, "loss": 0.7049, "step": 121 }, { "epoch": 0.38, "learning_rate": 0.0006027230868213366, "loss": 0.7424, "step": 122 }, { "epoch": 0.38, "learning_rate": 0.0006018738078724563, "loss": 0.7271, "step": 123 }, { "epoch": 0.39, "learning_rate": 0.0006010175779501405, "loss": 0.7996, "step": 124 }, { "epoch": 0.39, "learning_rate": 0.0006001544185502158, "loss": 0.7468, "step": 125 }, { "epoch": 0.39, "learning_rate": 0.0005992843513424754, "loss": 0.7513, "step": 126 }, { "epoch": 0.4, "learning_rate": 0.0005984073981701338, "loss": 0.7461, "step": 127 }, { "epoch": 0.4, "learning_rate": 0.0005975235810492794, "loss": 0.6821, "step": 128 }, { "epoch": 0.4, "learning_rate": 0.0005966329221683215, "loss": 0.7314, "step": 129 }, { "epoch": 0.41, "learning_rate": 0.0005957354438874327, "loss": 0.714, "step": 130 }, { "epoch": 0.41, "learning_rate": 0.0005948311687379884, "loss": 0.7339, "step": 131 }, { "epoch": 0.41, "learning_rate": 0.000593920119422001, "loss": 0.7021, "step": 132 }, { "epoch": 0.41, "learning_rate": 0.0005930023188115492, "loss": 0.7228, "step": 133 }, { "epoch": 0.42, "learning_rate": 0.0005920777899482046, "loss": 0.7107, "step": 134 }, { "epoch": 0.42, "learning_rate": 0.0005911465560424532, "loss": 0.659, "step": 135 }, { "epoch": 0.42, "learning_rate": 0.0005902086404731118, "loss": 0.7028, "step": 136 }, { "epoch": 0.43, "learning_rate": 0.0005892640667867423, "loss": 0.7275, "step": 137 }, { "epoch": 0.43, "learning_rate": 0.00058831285869706, "loss": 0.6889, "step": 138 }, { "epoch": 0.43, "learning_rate": 0.0005873550400843378, "loss": 0.7891, "step": 139 }, { "epoch": 0.44, "learning_rate": 0.0005863906349948074, "loss": 0.7904, "step": 140 }, { "epoch": 0.44, "learning_rate": 0.0005854196676400555, "loss": 0.6674, "step": 141 }, { "epoch": 0.44, "learning_rate": 0.0005844421623964157, "loss": 0.7352, "step": 142 }, { "epoch": 0.45, "learning_rate": 0.0005834581438043563, "loss": 0.6965, "step": 143 }, { "epoch": 0.45, "learning_rate": 0.000582467636567865, "loss": 0.7238, "step": 144 }, { "epoch": 0.45, "learning_rate": 0.0005814706655538279, "loss": 0.7064, "step": 145 }, { "epoch": 0.46, "learning_rate": 0.0005804672557914059, "loss": 0.6984, "step": 146 }, { "epoch": 0.46, "learning_rate": 0.0005794574324714057, "loss": 0.7594, "step": 147 }, { "epoch": 0.46, "learning_rate": 0.0005784412209456479, "loss": 0.6884, "step": 148 }, { "epoch": 0.46, "learning_rate": 0.00057741864672633, "loss": 0.7141, "step": 149 }, { "epoch": 0.47, "learning_rate": 0.0005763897354853866, "loss": 0.705, "step": 150 } ], "logging_steps": 1, "max_steps": 640, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "total_flos": 3.889192088892211e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }