{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 10580, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1890359168241966, "grad_norm": 0.2485148161649704, "learning_rate": 0.0005, "loss": 1.9, "step": 100 }, { "epoch": 0.3780718336483932, "grad_norm": 0.2885477840900421, "learning_rate": 0.0005, "loss": 1.8505, "step": 200 }, { "epoch": 0.5671077504725898, "grad_norm": 0.3204280436038971, "learning_rate": 0.0005, "loss": 1.8005, "step": 300 }, { "epoch": 0.7561436672967864, "grad_norm": 0.3903968632221222, "learning_rate": 0.0005, "loss": 1.7786, "step": 400 }, { "epoch": 0.945179584120983, "grad_norm": 0.37639909982681274, "learning_rate": 0.0005, "loss": 1.7331, "step": 500 }, { "epoch": 1.0, "eval_accuracy": 0.6365112107623319, "eval_loss": 1.427091121673584, "eval_runtime": 6.6532, "eval_samples_per_second": 75.152, "eval_steps_per_second": 9.469, "step": 529 }, { "epoch": 1.1342155009451795, "grad_norm": 0.5027719736099243, "learning_rate": 0.0005, "loss": 1.4811, "step": 600 }, { "epoch": 1.3232514177693762, "grad_norm": 0.6268895268440247, "learning_rate": 0.0005, "loss": 1.3494, "step": 700 }, { "epoch": 1.5122873345935728, "grad_norm": 0.5936668515205383, "learning_rate": 0.0005, "loss": 1.3268, "step": 800 }, { "epoch": 1.7013232514177694, "grad_norm": 0.61969393491745, "learning_rate": 0.0005, "loss": 1.3109, "step": 900 }, { "epoch": 1.8903591682419658, "grad_norm": 0.6521849036216736, "learning_rate": 0.0005, "loss": 1.3037, "step": 1000 }, { "epoch": 2.0, "eval_accuracy": 0.6845739910313902, "eval_loss": 1.0687369108200073, "eval_runtime": 6.6004, "eval_samples_per_second": 75.753, "eval_steps_per_second": 9.545, "step": 1058 }, { "epoch": 2.0793950850661624, "grad_norm": 0.6594504117965698, "learning_rate": 0.0005, "loss": 1.0942, "step": 1100 }, { "epoch": 2.268431001890359, "grad_norm": 0.7692387104034424, "learning_rate": 0.0005, "loss": 0.8554, "step": 1200 }, { "epoch": 2.4574669187145557, "grad_norm": 0.7720690965652466, "learning_rate": 0.0005, "loss": 0.8748, "step": 1300 }, { "epoch": 2.6465028355387523, "grad_norm": 0.8158366680145264, "learning_rate": 0.0005, "loss": 0.8853, "step": 1400 }, { "epoch": 2.835538752362949, "grad_norm": 0.8173924684524536, "learning_rate": 0.0005, "loss": 0.8818, "step": 1500 }, { "epoch": 3.0, "eval_accuracy": 0.7216233183856502, "eval_loss": 0.8142201900482178, "eval_runtime": 7.0748, "eval_samples_per_second": 70.673, "eval_steps_per_second": 8.905, "step": 1587 }, { "epoch": 3.0245746691871456, "grad_norm": 0.8073566555976868, "learning_rate": 0.0005, "loss": 0.8192, "step": 1600 }, { "epoch": 3.213610586011342, "grad_norm": 0.7835751175880432, "learning_rate": 0.0005, "loss": 0.5222, "step": 1700 }, { "epoch": 3.402646502835539, "grad_norm": 0.8399431109428406, "learning_rate": 0.0005, "loss": 0.5699, "step": 1800 }, { "epoch": 3.5916824196597354, "grad_norm": 0.8880508542060852, "learning_rate": 0.0005, "loss": 0.6016, "step": 1900 }, { "epoch": 3.780718336483932, "grad_norm": 1.0154162645339966, "learning_rate": 0.0005, "loss": 0.6219, "step": 2000 }, { "epoch": 3.9697542533081287, "grad_norm": 0.8903167843818665, "learning_rate": 0.0005, "loss": 0.6397, "step": 2100 }, { "epoch": 4.0, "eval_accuracy": 0.7470134529147983, "eval_loss": 0.6636369824409485, "eval_runtime": 6.2123, "eval_samples_per_second": 80.485, "eval_steps_per_second": 10.141, "step": 2116 }, { "epoch": 4.158790170132325, "grad_norm": 0.7903006672859192, "learning_rate": 0.0005, "loss": 0.4242, "step": 2200 }, { "epoch": 4.3478260869565215, "grad_norm": 0.8569557666778564, "learning_rate": 0.0005, "loss": 0.4102, "step": 2300 }, { "epoch": 4.536862003780718, "grad_norm": 0.8884899616241455, "learning_rate": 0.0005, "loss": 0.4266, "step": 2400 }, { "epoch": 4.725897920604915, "grad_norm": 0.9051417708396912, "learning_rate": 0.0005, "loss": 0.456, "step": 2500 }, { "epoch": 4.914933837429111, "grad_norm": 0.8909860849380493, "learning_rate": 0.0005, "loss": 0.4735, "step": 2600 }, { "epoch": 5.0, "eval_accuracy": 0.7667174887892376, "eval_loss": 0.5546546578407288, "eval_runtime": 6.1339, "eval_samples_per_second": 81.514, "eval_steps_per_second": 10.271, "step": 2645 }, { "epoch": 5.103969754253308, "grad_norm": 0.8669963479042053, "learning_rate": 0.0005, "loss": 0.3835, "step": 2700 }, { "epoch": 5.293005671077505, "grad_norm": 0.7596257925033569, "learning_rate": 0.0005, "loss": 0.3091, "step": 2800 }, { "epoch": 5.482041587901701, "grad_norm": 0.8810439705848694, "learning_rate": 0.0005, "loss": 0.3356, "step": 2900 }, { "epoch": 5.671077504725898, "grad_norm": 0.866624653339386, "learning_rate": 0.0005, "loss": 0.3585, "step": 3000 }, { "epoch": 5.8601134215500945, "grad_norm": 1.0449254512786865, "learning_rate": 0.0005, "loss": 0.3798, "step": 3100 }, { "epoch": 6.0, "eval_accuracy": 0.7763946188340807, "eval_loss": 0.5001726746559143, "eval_runtime": 7.0927, "eval_samples_per_second": 70.495, "eval_steps_per_second": 8.882, "step": 3174 }, { "epoch": 6.049149338374291, "grad_norm": 0.7229480147361755, "learning_rate": 0.0005, "loss": 0.3581, "step": 3200 }, { "epoch": 6.238185255198488, "grad_norm": 0.8279250264167786, "learning_rate": 0.0005, "loss": 0.2561, "step": 3300 }, { "epoch": 6.427221172022684, "grad_norm": 0.7875299453735352, "learning_rate": 0.0005, "loss": 0.2796, "step": 3400 }, { "epoch": 6.616257088846881, "grad_norm": 0.9155901074409485, "learning_rate": 0.0005, "loss": 0.3019, "step": 3500 }, { "epoch": 6.805293005671078, "grad_norm": 0.7736573815345764, "learning_rate": 0.0005, "loss": 0.3212, "step": 3600 }, { "epoch": 6.994328922495274, "grad_norm": 1.005894422531128, "learning_rate": 0.0005, "loss": 0.3409, "step": 3700 }, { "epoch": 7.0, "eval_accuracy": 0.7801165919282511, "eval_loss": 0.48504143953323364, "eval_runtime": 6.6015, "eval_samples_per_second": 75.74, "eval_steps_per_second": 9.543, "step": 3703 }, { "epoch": 7.183364839319471, "grad_norm": 0.7929958701133728, "learning_rate": 0.0005, "loss": 0.2328, "step": 3800 }, { "epoch": 7.3724007561436675, "grad_norm": 0.8098764419555664, "learning_rate": 0.0005, "loss": 0.2474, "step": 3900 }, { "epoch": 7.561436672967864, "grad_norm": 0.7900761365890503, "learning_rate": 0.0005, "loss": 0.2645, "step": 4000 }, { "epoch": 7.750472589792061, "grad_norm": 0.8573282957077026, "learning_rate": 0.0005, "loss": 0.2817, "step": 4100 }, { "epoch": 7.939508506616257, "grad_norm": 0.9526512622833252, "learning_rate": 0.0005, "loss": 0.3054, "step": 4200 }, { "epoch": 8.0, "eval_accuracy": 0.7835156950672646, "eval_loss": 0.46914222836494446, "eval_runtime": 6.6138, "eval_samples_per_second": 75.599, "eval_steps_per_second": 9.526, "step": 4232 }, { "epoch": 8.128544423440454, "grad_norm": 0.7902705669403076, "learning_rate": 0.0005, "loss": 0.2491, "step": 4300 }, { "epoch": 8.31758034026465, "grad_norm": 0.7070533633232117, "learning_rate": 0.0005, "loss": 0.2221, "step": 4400 }, { "epoch": 8.506616257088847, "grad_norm": 0.8409714102745056, "learning_rate": 0.0005, "loss": 0.2471, "step": 4500 }, { "epoch": 8.695652173913043, "grad_norm": 0.8545922636985779, "learning_rate": 0.0005, "loss": 0.2634, "step": 4600 }, { "epoch": 8.88468809073724, "grad_norm": 0.9577164053916931, "learning_rate": 0.0005, "loss": 0.2803, "step": 4700 }, { "epoch": 9.0, "eval_accuracy": 0.785865470852018, "eval_loss": 0.4636962115764618, "eval_runtime": 6.6143, "eval_samples_per_second": 75.594, "eval_steps_per_second": 9.525, "step": 4761 }, { "epoch": 9.073724007561436, "grad_norm": 0.7526606917381287, "learning_rate": 0.0005, "loss": 0.2587, "step": 4800 }, { "epoch": 9.262759924385634, "grad_norm": 0.881596565246582, "learning_rate": 0.0005, "loss": 0.2101, "step": 4900 }, { "epoch": 9.45179584120983, "grad_norm": 0.8427467942237854, "learning_rate": 0.0005, "loss": 0.2218, "step": 5000 }, { "epoch": 9.640831758034027, "grad_norm": 0.8922020792961121, "learning_rate": 0.0005, "loss": 0.2471, "step": 5100 }, { "epoch": 9.829867674858223, "grad_norm": 0.9664803147315979, "learning_rate": 0.0005, "loss": 0.2637, "step": 5200 }, { "epoch": 10.0, "eval_accuracy": 0.7877040358744395, "eval_loss": 0.45324718952178955, "eval_runtime": 6.6378, "eval_samples_per_second": 75.326, "eval_steps_per_second": 9.491, "step": 5290 }, { "epoch": 10.01890359168242, "grad_norm": 0.7240998148918152, "learning_rate": 0.0005, "loss": 0.2689, "step": 5300 }, { "epoch": 10.207939508506616, "grad_norm": 0.8262352347373962, "learning_rate": 0.0005, "loss": 0.2022, "step": 5400 }, { "epoch": 10.396975425330814, "grad_norm": 0.8235085010528564, "learning_rate": 0.0005, "loss": 0.2113, "step": 5500 }, { "epoch": 10.58601134215501, "grad_norm": 0.9357341527938843, "learning_rate": 0.0005, "loss": 0.2282, "step": 5600 }, { "epoch": 10.775047258979207, "grad_norm": 0.8699725866317749, "learning_rate": 0.0005, "loss": 0.2499, "step": 5700 }, { "epoch": 10.964083175803403, "grad_norm": 0.9279125928878784, "learning_rate": 0.0005, "loss": 0.2661, "step": 5800 }, { "epoch": 11.0, "eval_accuracy": 0.7878834080717488, "eval_loss": 0.4667767286300659, "eval_runtime": 6.6569, "eval_samples_per_second": 75.11, "eval_steps_per_second": 9.464, "step": 5819 }, { "epoch": 11.1531190926276, "grad_norm": 0.7377325892448425, "learning_rate": 0.0005, "loss": 0.2085, "step": 5900 }, { "epoch": 11.342155009451796, "grad_norm": 0.8089397549629211, "learning_rate": 0.0005, "loss": 0.2027, "step": 6000 }, { "epoch": 11.531190926275993, "grad_norm": 0.8074979782104492, "learning_rate": 0.0005, "loss": 0.2183, "step": 6100 }, { "epoch": 11.720226843100189, "grad_norm": 0.8173251152038574, "learning_rate": 0.0005, "loss": 0.2373, "step": 6200 }, { "epoch": 11.909262759924385, "grad_norm": 0.8041278123855591, "learning_rate": 0.0005, "loss": 0.2513, "step": 6300 }, { "epoch": 12.0, "eval_accuracy": 0.7892825112107623, "eval_loss": 0.4647226631641388, "eval_runtime": 6.1139, "eval_samples_per_second": 81.78, "eval_steps_per_second": 10.304, "step": 6348 }, { "epoch": 12.098298676748582, "grad_norm": 0.7303513288497925, "learning_rate": 0.0005, "loss": 0.221, "step": 6400 }, { "epoch": 12.287334593572778, "grad_norm": 0.8052433133125305, "learning_rate": 0.0005, "loss": 0.1969, "step": 6500 }, { "epoch": 12.476370510396976, "grad_norm": 0.9060468077659607, "learning_rate": 0.0005, "loss": 0.2078, "step": 6600 }, { "epoch": 12.665406427221171, "grad_norm": 0.8090572357177734, "learning_rate": 0.0005, "loss": 0.228, "step": 6700 }, { "epoch": 12.854442344045369, "grad_norm": 0.8910106420516968, "learning_rate": 0.0005, "loss": 0.2424, "step": 6800 }, { "epoch": 13.0, "eval_accuracy": 0.7896681614349775, "eval_loss": 0.46145549416542053, "eval_runtime": 7.2577, "eval_samples_per_second": 68.892, "eval_steps_per_second": 8.68, "step": 6877 }, { "epoch": 13.043478260869565, "grad_norm": 0.7991313934326172, "learning_rate": 0.0005, "loss": 0.2395, "step": 6900 }, { "epoch": 13.232514177693762, "grad_norm": 0.6423863768577576, "learning_rate": 0.0005, "loss": 0.1903, "step": 7000 }, { "epoch": 13.421550094517958, "grad_norm": 0.8329731225967407, "learning_rate": 0.0005, "loss": 0.2021, "step": 7100 }, { "epoch": 13.610586011342155, "grad_norm": 0.8105093240737915, "learning_rate": 0.0005, "loss": 0.2198, "step": 7200 }, { "epoch": 13.799621928166351, "grad_norm": 0.8448224067687988, "learning_rate": 0.0005, "loss": 0.2317, "step": 7300 }, { "epoch": 13.988657844990549, "grad_norm": 0.8879454731941223, "learning_rate": 0.0005, "loss": 0.2499, "step": 7400 }, { "epoch": 14.0, "eval_accuracy": 0.7894170403587444, "eval_loss": 0.4546208679676056, "eval_runtime": 6.6163, "eval_samples_per_second": 75.571, "eval_steps_per_second": 9.522, "step": 7406 }, { "epoch": 14.177693761814744, "grad_norm": 0.7737810611724854, "learning_rate": 0.0005, "loss": 0.1916, "step": 7500 }, { "epoch": 14.366729678638942, "grad_norm": 0.8142408132553101, "learning_rate": 0.0005, "loss": 0.1968, "step": 7600 }, { "epoch": 14.555765595463138, "grad_norm": 0.8974208831787109, "learning_rate": 0.0005, "loss": 0.2089, "step": 7700 }, { "epoch": 14.744801512287335, "grad_norm": 0.8206737637519836, "learning_rate": 0.0005, "loss": 0.2268, "step": 7800 }, { "epoch": 14.93383742911153, "grad_norm": 0.8444597721099854, "learning_rate": 0.0005, "loss": 0.235, "step": 7900 }, { "epoch": 15.0, "eval_accuracy": 0.7896233183856503, "eval_loss": 0.4667895436286926, "eval_runtime": 6.086, "eval_samples_per_second": 82.156, "eval_steps_per_second": 10.352, "step": 7935 }, { "epoch": 15.122873345935728, "grad_norm": 0.8915912508964539, "learning_rate": 0.0005, "loss": 0.2052, "step": 8000 }, { "epoch": 15.311909262759924, "grad_norm": 0.8511343598365784, "learning_rate": 0.0005, "loss": 0.1887, "step": 8100 }, { "epoch": 15.500945179584122, "grad_norm": 0.8454872965812683, "learning_rate": 0.0005, "loss": 0.1967, "step": 8200 }, { "epoch": 15.689981096408317, "grad_norm": 0.7670953869819641, "learning_rate": 0.0005, "loss": 0.2177, "step": 8300 }, { "epoch": 15.879017013232515, "grad_norm": 0.8440077900886536, "learning_rate": 0.0005, "loss": 0.2317, "step": 8400 }, { "epoch": 16.0, "eval_accuracy": 0.7913183856502242, "eval_loss": 0.4509560465812683, "eval_runtime": 6.1751, "eval_samples_per_second": 80.97, "eval_steps_per_second": 10.202, "step": 8464 }, { "epoch": 16.068052930056712, "grad_norm": 0.7304275035858154, "learning_rate": 0.0005, "loss": 0.2194, "step": 8500 }, { "epoch": 16.257088846880908, "grad_norm": 0.6791017651557922, "learning_rate": 0.0005, "loss": 0.1821, "step": 8600 }, { "epoch": 16.446124763705104, "grad_norm": 0.7813127040863037, "learning_rate": 0.0005, "loss": 0.1952, "step": 8700 }, { "epoch": 16.6351606805293, "grad_norm": 0.8328565955162048, "learning_rate": 0.0005, "loss": 0.2107, "step": 8800 }, { "epoch": 16.8241965973535, "grad_norm": 0.7615934014320374, "learning_rate": 0.0005, "loss": 0.2225, "step": 8900 }, { "epoch": 17.0, "eval_accuracy": 0.7914618834080718, "eval_loss": 0.44965559244155884, "eval_runtime": 6.0983, "eval_samples_per_second": 81.99, "eval_steps_per_second": 10.331, "step": 8993 }, { "epoch": 17.013232514177695, "grad_norm": 0.6526479125022888, "learning_rate": 0.0005, "loss": 0.2357, "step": 9000 }, { "epoch": 17.20226843100189, "grad_norm": 0.8103135228157043, "learning_rate": 0.0005, "loss": 0.1796, "step": 9100 }, { "epoch": 17.391304347826086, "grad_norm": 0.7900732159614563, "learning_rate": 0.0005, "loss": 0.1894, "step": 9200 }, { "epoch": 17.58034026465028, "grad_norm": 0.8373563885688782, "learning_rate": 0.0005, "loss": 0.2048, "step": 9300 }, { "epoch": 17.76937618147448, "grad_norm": 0.8260546922683716, "learning_rate": 0.0005, "loss": 0.2213, "step": 9400 }, { "epoch": 17.958412098298677, "grad_norm": 0.8767030835151672, "learning_rate": 0.0005, "loss": 0.2358, "step": 9500 }, { "epoch": 18.0, "eval_accuracy": 0.7915695067264574, "eval_loss": 0.4474881589412689, "eval_runtime": 6.1364, "eval_samples_per_second": 81.481, "eval_steps_per_second": 10.267, "step": 9522 }, { "epoch": 18.147448015122873, "grad_norm": 0.7694826722145081, "learning_rate": 0.0005, "loss": 0.1931, "step": 9600 }, { "epoch": 18.33648393194707, "grad_norm": 0.7357311248779297, "learning_rate": 0.0005, "loss": 0.187, "step": 9700 }, { "epoch": 18.525519848771268, "grad_norm": 0.7614474892616272, "learning_rate": 0.0005, "loss": 0.1971, "step": 9800 }, { "epoch": 18.714555765595463, "grad_norm": 0.8685888648033142, "learning_rate": 0.0005, "loss": 0.2127, "step": 9900 }, { "epoch": 18.90359168241966, "grad_norm": 0.8261685371398926, "learning_rate": 0.0005, "loss": 0.2253, "step": 10000 }, { "epoch": 19.0, "eval_accuracy": 0.7918116591928251, "eval_loss": 0.45286619663238525, "eval_runtime": 7.1709, "eval_samples_per_second": 69.726, "eval_steps_per_second": 8.785, "step": 10051 }, { "epoch": 19.092627599243855, "grad_norm": 0.6555222868919373, "learning_rate": 0.0005, "loss": 0.2055, "step": 10100 }, { "epoch": 19.281663516068054, "grad_norm": 0.6589702367782593, "learning_rate": 0.0005, "loss": 0.1828, "step": 10200 }, { "epoch": 19.47069943289225, "grad_norm": 0.8058968782424927, "learning_rate": 0.0005, "loss": 0.1932, "step": 10300 }, { "epoch": 19.659735349716446, "grad_norm": 0.7728559374809265, "learning_rate": 0.0005, "loss": 0.2066, "step": 10400 }, { "epoch": 19.84877126654064, "grad_norm": 0.7878997325897217, "learning_rate": 0.0005, "loss": 0.2172, "step": 10500 }, { "epoch": 20.0, "eval_accuracy": 0.7918744394618834, "eval_loss": 0.45650961995124817, "eval_runtime": 6.6035, "eval_samples_per_second": 75.718, "eval_steps_per_second": 9.54, "step": 10580 }, { "epoch": 20.0, "step": 10580, "total_flos": 9.64242245391745e+17, "train_loss": 0.42894544871858475, "train_runtime": 22582.9978, "train_samples_per_second": 14.989, "train_steps_per_second": 0.468 } ], "logging_steps": 100, "max_steps": 10580, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 9.64242245391745e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }