{ "best_metric": 1.8915482759475708, "best_model_checkpoint": "gpt2-large-open-assistant-guanaco/checkpoint-2710", "epoch": 1.0, "eval_steps": 500, "global_step": 2710, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 2.1778407096862793, "learning_rate": 1.845018450184502e-05, "loss": 2.4702, "step": 25 }, { "epoch": 0.02, "grad_norm": 1.888936996459961, "learning_rate": 3.690036900369004e-05, "loss": 2.379, "step": 50 }, { "epoch": 0.03, "grad_norm": 1.8794102668762207, "learning_rate": 5.535055350553506e-05, "loss": 2.4576, "step": 75 }, { "epoch": 0.04, "grad_norm": 1.9237180948257446, "learning_rate": 7.380073800738008e-05, "loss": 2.431, "step": 100 }, { "epoch": 0.05, "grad_norm": 2.3720157146453857, "learning_rate": 9.22509225092251e-05, "loss": 2.5555, "step": 125 }, { "epoch": 0.06, "grad_norm": 1.955418348312378, "learning_rate": 0.00011070110701107013, "loss": 2.405, "step": 150 }, { "epoch": 0.06, "grad_norm": 2.923417091369629, "learning_rate": 0.00012915129151291514, "loss": 2.4429, "step": 175 }, { "epoch": 0.07, "grad_norm": 1.6975607872009277, "learning_rate": 0.00014760147601476016, "loss": 2.2604, "step": 200 }, { "epoch": 0.08, "grad_norm": 6.038263320922852, "learning_rate": 0.00016605166051660516, "loss": 2.5249, "step": 225 }, { "epoch": 0.09, "grad_norm": 1.835350513458252, "learning_rate": 0.0001845018450184502, "loss": 2.7203, "step": 250 }, { "epoch": 0.1, "grad_norm": 1.5644915103912354, "learning_rate": 0.0001996719967199672, "loss": 2.6386, "step": 275 }, { "epoch": 0.11, "grad_norm": 2.2700071334838867, "learning_rate": 0.0001976219762197622, "loss": 2.4893, "step": 300 }, { "epoch": 0.12, "grad_norm": 2.0536715984344482, "learning_rate": 0.0001955719557195572, "loss": 2.5939, "step": 325 }, { "epoch": 0.13, "grad_norm": 1.561672329902649, "learning_rate": 0.0001935219352193522, "loss": 2.7238, "step": 350 }, { "epoch": 0.14, "grad_norm": 1.4834322929382324, "learning_rate": 0.0001914719147191472, "loss": 2.4672, "step": 375 }, { "epoch": 0.15, "grad_norm": 1.6992969512939453, "learning_rate": 0.00018942189421894222, "loss": 2.5833, "step": 400 }, { "epoch": 0.16, "grad_norm": 1.595070719718933, "learning_rate": 0.00018737187371873718, "loss": 2.5067, "step": 425 }, { "epoch": 0.17, "grad_norm": 2.07468318939209, "learning_rate": 0.00018532185321853219, "loss": 2.483, "step": 450 }, { "epoch": 0.18, "grad_norm": 2.283381462097168, "learning_rate": 0.0001832718327183272, "loss": 2.6612, "step": 475 }, { "epoch": 0.18, "grad_norm": 1.5583351850509644, "learning_rate": 0.0001812218122181222, "loss": 2.3414, "step": 500 }, { "epoch": 0.19, "grad_norm": 1.2439337968826294, "learning_rate": 0.0001791717917179172, "loss": 2.4414, "step": 525 }, { "epoch": 0.2, "grad_norm": 1.4025323390960693, "learning_rate": 0.0001771217712177122, "loss": 2.3685, "step": 550 }, { "epoch": 0.21, "grad_norm": 1.5767055749893188, "learning_rate": 0.00017507175071750718, "loss": 2.3423, "step": 575 }, { "epoch": 0.22, "grad_norm": 1.2068151235580444, "learning_rate": 0.0001730217302173022, "loss": 2.465, "step": 600 }, { "epoch": 0.23, "grad_norm": 2.0946199893951416, "learning_rate": 0.00017097170971709717, "loss": 2.288, "step": 625 }, { "epoch": 0.24, "grad_norm": 1.315215826034546, "learning_rate": 0.00016892168921689218, "loss": 2.4756, "step": 650 }, { "epoch": 0.25, "grad_norm": 1.3037053346633911, "learning_rate": 0.0001668716687166872, "loss": 2.6019, "step": 675 }, { "epoch": 0.26, "grad_norm": 1.2501168251037598, "learning_rate": 0.00016482164821648217, "loss": 2.3108, "step": 700 }, { "epoch": 0.27, "grad_norm": 1.4587308168411255, "learning_rate": 0.00016277162771627715, "loss": 2.4512, "step": 725 }, { "epoch": 0.28, "grad_norm": 1.4319279193878174, "learning_rate": 0.00016072160721607216, "loss": 2.5111, "step": 750 }, { "epoch": 0.29, "grad_norm": 1.6068148612976074, "learning_rate": 0.00015867158671586717, "loss": 2.4429, "step": 775 }, { "epoch": 0.3, "grad_norm": 1.2128015756607056, "learning_rate": 0.00015662156621566218, "loss": 2.4457, "step": 800 }, { "epoch": 0.3, "grad_norm": 1.2929550409317017, "learning_rate": 0.00015457154571545717, "loss": 2.2653, "step": 825 }, { "epoch": 0.31, "grad_norm": 1.0177528858184814, "learning_rate": 0.00015252152521525215, "loss": 2.2931, "step": 850 }, { "epoch": 0.32, "grad_norm": 1.533103346824646, "learning_rate": 0.00015047150471504716, "loss": 2.2601, "step": 875 }, { "epoch": 0.33, "grad_norm": 1.1346125602722168, "learning_rate": 0.00014842148421484217, "loss": 2.3318, "step": 900 }, { "epoch": 0.34, "grad_norm": 1.0939571857452393, "learning_rate": 0.00014637146371463715, "loss": 2.249, "step": 925 }, { "epoch": 0.35, "grad_norm": 1.225317120552063, "learning_rate": 0.00014432144321443216, "loss": 2.4674, "step": 950 }, { "epoch": 0.36, "grad_norm": 1.3087507486343384, "learning_rate": 0.00014227142271422714, "loss": 2.298, "step": 975 }, { "epoch": 0.37, "grad_norm": 1.26546049118042, "learning_rate": 0.00014022140221402215, "loss": 2.3119, "step": 1000 }, { "epoch": 0.38, "grad_norm": 1.2002321481704712, "learning_rate": 0.00013817138171381713, "loss": 2.1619, "step": 1025 }, { "epoch": 0.39, "grad_norm": 1.589195728302002, "learning_rate": 0.00013612136121361214, "loss": 2.2499, "step": 1050 }, { "epoch": 0.4, "grad_norm": 1.1953868865966797, "learning_rate": 0.00013407134071340715, "loss": 2.1875, "step": 1075 }, { "epoch": 0.41, "grad_norm": 1.0502057075500488, "learning_rate": 0.00013202132021320216, "loss": 2.259, "step": 1100 }, { "epoch": 0.42, "grad_norm": 1.4274191856384277, "learning_rate": 0.00012997129971299714, "loss": 2.2288, "step": 1125 }, { "epoch": 0.42, "grad_norm": 1.33050537109375, "learning_rate": 0.00012792127921279213, "loss": 2.1854, "step": 1150 }, { "epoch": 0.43, "grad_norm": 1.1274808645248413, "learning_rate": 0.00012587125871258714, "loss": 2.0351, "step": 1175 }, { "epoch": 0.44, "grad_norm": 1.2254371643066406, "learning_rate": 0.00012382123821238214, "loss": 2.2282, "step": 1200 }, { "epoch": 0.45, "grad_norm": 1.305560827255249, "learning_rate": 0.00012177121771217713, "loss": 2.1982, "step": 1225 }, { "epoch": 0.46, "grad_norm": 1.0613969564437866, "learning_rate": 0.00011972119721197212, "loss": 2.1863, "step": 1250 }, { "epoch": 0.47, "grad_norm": 1.2470930814743042, "learning_rate": 0.00011767117671176713, "loss": 2.0909, "step": 1275 }, { "epoch": 0.48, "grad_norm": 1.4893149137496948, "learning_rate": 0.00011562115621156213, "loss": 2.1255, "step": 1300 }, { "epoch": 0.49, "grad_norm": 1.1325526237487793, "learning_rate": 0.00011357113571135711, "loss": 2.1233, "step": 1325 }, { "epoch": 0.5, "grad_norm": 1.153321385383606, "learning_rate": 0.00011152111521115212, "loss": 2.2209, "step": 1350 }, { "epoch": 0.51, "grad_norm": 1.1444792747497559, "learning_rate": 0.00010947109471094712, "loss": 2.1678, "step": 1375 }, { "epoch": 0.52, "grad_norm": 0.9731519222259521, "learning_rate": 0.00010742107421074213, "loss": 2.186, "step": 1400 }, { "epoch": 0.53, "grad_norm": 1.3069313764572144, "learning_rate": 0.00010537105371053711, "loss": 2.0522, "step": 1425 }, { "epoch": 0.54, "grad_norm": 1.3358750343322754, "learning_rate": 0.0001033210332103321, "loss": 1.9899, "step": 1450 }, { "epoch": 0.54, "grad_norm": 1.0077296495437622, "learning_rate": 0.00010127101271012711, "loss": 2.1628, "step": 1475 }, { "epoch": 0.55, "grad_norm": 1.1938546895980835, "learning_rate": 9.922099220992211e-05, "loss": 2.1514, "step": 1500 }, { "epoch": 0.56, "grad_norm": 1.1697758436203003, "learning_rate": 9.71709717097171e-05, "loss": 2.1637, "step": 1525 }, { "epoch": 0.57, "grad_norm": 1.0194048881530762, "learning_rate": 9.51209512095121e-05, "loss": 2.2062, "step": 1550 }, { "epoch": 0.58, "grad_norm": 0.9525765180587769, "learning_rate": 9.30709307093071e-05, "loss": 2.157, "step": 1575 }, { "epoch": 0.59, "grad_norm": 1.3280248641967773, "learning_rate": 9.102091020910209e-05, "loss": 2.0966, "step": 1600 }, { "epoch": 0.6, "grad_norm": 1.2190978527069092, "learning_rate": 8.89708897088971e-05, "loss": 1.9294, "step": 1625 }, { "epoch": 0.61, "grad_norm": 0.9637733697891235, "learning_rate": 8.692086920869208e-05, "loss": 2.0513, "step": 1650 }, { "epoch": 0.62, "grad_norm": 1.0733612775802612, "learning_rate": 8.48708487084871e-05, "loss": 2.0574, "step": 1675 }, { "epoch": 0.63, "grad_norm": 1.1949636936187744, "learning_rate": 8.282082820828209e-05, "loss": 2.1535, "step": 1700 }, { "epoch": 0.64, "grad_norm": 1.285975694656372, "learning_rate": 8.077080770807709e-05, "loss": 2.0739, "step": 1725 }, { "epoch": 0.65, "grad_norm": 1.0896881818771362, "learning_rate": 7.872078720787208e-05, "loss": 2.0279, "step": 1750 }, { "epoch": 0.65, "grad_norm": 1.0036427974700928, "learning_rate": 7.667076670766709e-05, "loss": 1.9592, "step": 1775 }, { "epoch": 0.66, "grad_norm": 1.148842453956604, "learning_rate": 7.462074620746207e-05, "loss": 1.9549, "step": 1800 }, { "epoch": 0.67, "grad_norm": 1.4055249691009521, "learning_rate": 7.257072570725708e-05, "loss": 2.1193, "step": 1825 }, { "epoch": 0.68, "grad_norm": 0.7423416972160339, "learning_rate": 7.052070520705208e-05, "loss": 2.0627, "step": 1850 }, { "epoch": 0.69, "grad_norm": 0.8528961539268494, "learning_rate": 6.847068470684707e-05, "loss": 1.9821, "step": 1875 }, { "epoch": 0.7, "grad_norm": 1.220268726348877, "learning_rate": 6.642066420664207e-05, "loss": 2.0314, "step": 1900 }, { "epoch": 0.71, "grad_norm": 0.8227468132972717, "learning_rate": 6.437064370643707e-05, "loss": 1.924, "step": 1925 }, { "epoch": 0.72, "grad_norm": 1.149190068244934, "learning_rate": 6.232062320623206e-05, "loss": 1.8974, "step": 1950 }, { "epoch": 0.73, "grad_norm": 1.2299059629440308, "learning_rate": 6.027060270602707e-05, "loss": 2.0567, "step": 1975 }, { "epoch": 0.74, "grad_norm": 1.226804256439209, "learning_rate": 5.822058220582206e-05, "loss": 2.0881, "step": 2000 }, { "epoch": 0.75, "grad_norm": 1.1045171022415161, "learning_rate": 5.6170561705617064e-05, "loss": 1.9007, "step": 2025 }, { "epoch": 0.76, "grad_norm": 0.82816481590271, "learning_rate": 5.412054120541206e-05, "loss": 2.0032, "step": 2050 }, { "epoch": 0.77, "grad_norm": 0.9694753289222717, "learning_rate": 5.207052070520706e-05, "loss": 2.0872, "step": 2075 }, { "epoch": 0.77, "grad_norm": 0.9572305679321289, "learning_rate": 5.002050020500205e-05, "loss": 2.112, "step": 2100 }, { "epoch": 0.78, "grad_norm": 1.0792776346206665, "learning_rate": 4.797047970479705e-05, "loss": 2.0145, "step": 2125 }, { "epoch": 0.79, "grad_norm": 1.3833109140396118, "learning_rate": 4.592045920459205e-05, "loss": 2.1392, "step": 2150 }, { "epoch": 0.8, "grad_norm": 0.8226168155670166, "learning_rate": 4.3870438704387046e-05, "loss": 2.0964, "step": 2175 }, { "epoch": 0.81, "grad_norm": 1.5212799310684204, "learning_rate": 4.182041820418204e-05, "loss": 2.0599, "step": 2200 }, { "epoch": 0.82, "grad_norm": 1.015734314918518, "learning_rate": 3.9770397703977044e-05, "loss": 2.0029, "step": 2225 }, { "epoch": 0.83, "grad_norm": 0.9501891732215881, "learning_rate": 3.772037720377204e-05, "loss": 2.0123, "step": 2250 }, { "epoch": 0.84, "grad_norm": 0.9258189797401428, "learning_rate": 3.5670356703567036e-05, "loss": 1.9087, "step": 2275 }, { "epoch": 0.85, "grad_norm": 0.998928964138031, "learning_rate": 3.362033620336203e-05, "loss": 2.0364, "step": 2300 }, { "epoch": 0.86, "grad_norm": 1.24564790725708, "learning_rate": 3.1570315703157035e-05, "loss": 1.8917, "step": 2325 }, { "epoch": 0.87, "grad_norm": 1.0782575607299805, "learning_rate": 2.952029520295203e-05, "loss": 1.9744, "step": 2350 }, { "epoch": 0.88, "grad_norm": 1.2584232091903687, "learning_rate": 2.747027470274703e-05, "loss": 1.9452, "step": 2375 }, { "epoch": 0.89, "grad_norm": 1.038710355758667, "learning_rate": 2.5420254202542026e-05, "loss": 1.8455, "step": 2400 }, { "epoch": 0.89, "grad_norm": 1.0375022888183594, "learning_rate": 2.3370233702337025e-05, "loss": 1.9034, "step": 2425 }, { "epoch": 0.9, "grad_norm": 1.314627766609192, "learning_rate": 2.132021320213202e-05, "loss": 2.0162, "step": 2450 }, { "epoch": 0.91, "grad_norm": 1.096312165260315, "learning_rate": 1.927019270192702e-05, "loss": 2.0927, "step": 2475 }, { "epoch": 0.92, "grad_norm": 1.0984846353530884, "learning_rate": 1.722017220172202e-05, "loss": 1.8629, "step": 2500 }, { "epoch": 0.93, "grad_norm": 1.209221363067627, "learning_rate": 1.5170151701517015e-05, "loss": 1.9085, "step": 2525 }, { "epoch": 0.94, "grad_norm": 1.2675678730010986, "learning_rate": 1.3120131201312013e-05, "loss": 1.9813, "step": 2550 }, { "epoch": 0.95, "grad_norm": 1.2204258441925049, "learning_rate": 1.1070110701107012e-05, "loss": 1.8332, "step": 2575 }, { "epoch": 0.96, "grad_norm": 1.0811994075775146, "learning_rate": 9.02009020090201e-06, "loss": 1.7946, "step": 2600 }, { "epoch": 0.97, "grad_norm": 1.219846248626709, "learning_rate": 6.9700697006970075e-06, "loss": 1.8714, "step": 2625 }, { "epoch": 0.98, "grad_norm": 1.1557291746139526, "learning_rate": 4.920049200492005e-06, "loss": 1.7837, "step": 2650 }, { "epoch": 0.99, "grad_norm": 0.845923125743866, "learning_rate": 2.870028700287003e-06, "loss": 1.8496, "step": 2675 }, { "epoch": 1.0, "grad_norm": 0.9930481314659119, "learning_rate": 8.200082000820008e-07, "loss": 1.8581, "step": 2700 }, { "epoch": 1.0, "eval_loss": 1.8915482759475708, "eval_runtime": 97.9153, "eval_samples_per_second": 2.911, "eval_steps_per_second": 1.46, "step": 2710 } ], "logging_steps": 25, "max_steps": 2710, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 2.35853879967744e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }