{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9959278650378127, "eval_steps": 500, "global_step": 107, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 437.97529539347306, "learning_rate": 0.0, "loss": 5.3209, "step": 1 }, { "epoch": 0.02, "grad_norm": 244.20895893680887, "learning_rate": 4.6275642631951835e-06, "loss": 4.7061, "step": 2 }, { "epoch": 0.03, "grad_norm": 936.3785292284313, "learning_rate": 7.3345158268416935e-06, "loss": 4.7486, "step": 3 }, { "epoch": 0.04, "grad_norm": 304.95868060279906, "learning_rate": 9.255128526390367e-06, "loss": 4.396, "step": 4 }, { "epoch": 0.05, "grad_norm": 543.1494672279446, "learning_rate": 1.0744871473609633e-05, "loss": 4.3105, "step": 5 }, { "epoch": 0.06, "grad_norm": 103.08129755804396, "learning_rate": 1.1962080090036879e-05, "loss": 4.1115, "step": 6 }, { "epoch": 0.07, "grad_norm": 715.6237445403834, "learning_rate": 1.299121531141887e-05, "loss": 3.8146, "step": 7 }, { "epoch": 0.07, "grad_norm": 1298.9641065094909, "learning_rate": 1.388269278958555e-05, "loss": 3.7038, "step": 8 }, { "epoch": 0.08, "grad_norm": 521.7548214918575, "learning_rate": 1.4669031653683387e-05, "loss": 3.7118, "step": 9 }, { "epoch": 0.09, "grad_norm": 1104.9428152624007, "learning_rate": 1.537243573680482e-05, "loss": 3.5664, "step": 10 }, { "epoch": 0.1, "grad_norm": 209.65231634780474, "learning_rate": 1.600874212937343e-05, "loss": 3.6595, "step": 11 }, { "epoch": 0.11, "grad_norm": 461.55614839908696, "learning_rate": 1.6589644353232063e-05, "loss": 3.568, "step": 12 }, { "epoch": 0.12, "grad_norm": 352.0604307146715, "learning_rate": 1.712402259777778e-05, "loss": 3.5551, "step": 13 }, { "epoch": 0.13, "grad_norm": 329.37656546117546, "learning_rate": 1.7618779574614054e-05, "loss": 3.3926, "step": 14 }, { "epoch": 0.14, "grad_norm": 271.35324935865424, "learning_rate": 1.8079387300451327e-05, "loss": 3.3927, "step": 15 }, { "epoch": 0.15, "grad_norm": 419.20503262126994, "learning_rate": 1.8510257052780734e-05, "loss": 3.1879, "step": 16 }, { "epoch": 0.16, "grad_norm": 166.53060201131203, "learning_rate": 1.891499697130832e-05, "loss": 3.2592, "step": 17 }, { "epoch": 0.17, "grad_norm": 81.46046059669071, "learning_rate": 1.929659591687857e-05, "loss": 3.1923, "step": 18 }, { "epoch": 0.18, "grad_norm": 135.34804671660393, "learning_rate": 1.9657557553855117e-05, "loss": 2.9865, "step": 19 }, { "epoch": 0.19, "grad_norm": 106.0927558863587, "learning_rate": 2e-05, "loss": 3.0512, "step": 20 }, { "epoch": 0.2, "grad_norm": 107.90656675308273, "learning_rate": 2e-05, "loss": 2.9979, "step": 21 }, { "epoch": 0.2, "grad_norm": 435.76043447059305, "learning_rate": 1.9770114942528737e-05, "loss": 2.9034, "step": 22 }, { "epoch": 0.21, "grad_norm": 343.47043176960665, "learning_rate": 1.9540229885057475e-05, "loss": 3.0163, "step": 23 }, { "epoch": 0.22, "grad_norm": 123.78174308788432, "learning_rate": 1.931034482758621e-05, "loss": 2.8995, "step": 24 }, { "epoch": 0.23, "grad_norm": 2.546022424579643, "learning_rate": 1.908045977011494e-05, "loss": 2.9121, "step": 25 }, { "epoch": 0.24, "grad_norm": 413.99873565628195, "learning_rate": 1.885057471264368e-05, "loss": 2.8147, "step": 26 }, { "epoch": 0.25, "grad_norm": 116.6388741077562, "learning_rate": 1.8620689655172415e-05, "loss": 2.8241, "step": 27 }, { "epoch": 0.26, "grad_norm": 125.14499403192282, "learning_rate": 1.839080459770115e-05, "loss": 2.9184, "step": 28 }, { "epoch": 0.27, "grad_norm": 142.0060930691004, "learning_rate": 1.8160919540229885e-05, "loss": 2.999, "step": 29 }, { "epoch": 0.28, "grad_norm": 134.9248416364644, "learning_rate": 1.7931034482758623e-05, "loss": 3.0598, "step": 30 }, { "epoch": 0.29, "grad_norm": 197.02733193963218, "learning_rate": 1.770114942528736e-05, "loss": 2.9538, "step": 31 }, { "epoch": 0.3, "grad_norm": 150.99011764019525, "learning_rate": 1.7471264367816093e-05, "loss": 3.0267, "step": 32 }, { "epoch": 0.31, "grad_norm": 50.47512295289012, "learning_rate": 1.7241379310344828e-05, "loss": 3.0653, "step": 33 }, { "epoch": 0.32, "grad_norm": 131.75852379922142, "learning_rate": 1.7011494252873563e-05, "loss": 2.9688, "step": 34 }, { "epoch": 0.33, "grad_norm": 78.08826636409596, "learning_rate": 1.6781609195402298e-05, "loss": 2.9607, "step": 35 }, { "epoch": 0.34, "grad_norm": 302.1715934283036, "learning_rate": 1.6551724137931037e-05, "loss": 2.9878, "step": 36 }, { "epoch": 0.34, "grad_norm": 189.53413280989787, "learning_rate": 1.632183908045977e-05, "loss": 2.8227, "step": 37 }, { "epoch": 0.35, "grad_norm": 121.19186750768387, "learning_rate": 1.6091954022988507e-05, "loss": 2.9387, "step": 38 }, { "epoch": 0.36, "grad_norm": 141.40503107209446, "learning_rate": 1.586206896551724e-05, "loss": 3.018, "step": 39 }, { "epoch": 0.37, "grad_norm": 74.06501512079606, "learning_rate": 1.563218390804598e-05, "loss": 2.8574, "step": 40 }, { "epoch": 0.38, "grad_norm": 73.55407967420128, "learning_rate": 1.540229885057471e-05, "loss": 2.8177, "step": 41 }, { "epoch": 0.39, "grad_norm": 126.67257944327375, "learning_rate": 1.5172413793103448e-05, "loss": 2.8549, "step": 42 }, { "epoch": 0.4, "grad_norm": 237.06832480320944, "learning_rate": 1.4942528735632185e-05, "loss": 2.8987, "step": 43 }, { "epoch": 0.41, "grad_norm": 185.5836022861395, "learning_rate": 1.471264367816092e-05, "loss": 2.8245, "step": 44 }, { "epoch": 0.42, "grad_norm": 74.770384187608, "learning_rate": 1.4482758620689657e-05, "loss": 2.7259, "step": 45 }, { "epoch": 0.43, "grad_norm": 70.2045629860517, "learning_rate": 1.4252873563218392e-05, "loss": 2.7883, "step": 46 }, { "epoch": 0.44, "grad_norm": 57.69543680166028, "learning_rate": 1.4022988505747128e-05, "loss": 2.7784, "step": 47 }, { "epoch": 0.45, "grad_norm": 195.69040440195323, "learning_rate": 1.3793103448275863e-05, "loss": 2.7917, "step": 48 }, { "epoch": 0.46, "grad_norm": 301.2361381665553, "learning_rate": 1.3563218390804598e-05, "loss": 2.8006, "step": 49 }, { "epoch": 0.47, "grad_norm": 131.0915710486376, "learning_rate": 1.3333333333333333e-05, "loss": 2.7235, "step": 50 }, { "epoch": 0.47, "grad_norm": 329.8230349369189, "learning_rate": 1.310344827586207e-05, "loss": 2.7674, "step": 51 }, { "epoch": 0.48, "grad_norm": 399.2921666837455, "learning_rate": 1.2873563218390805e-05, "loss": 2.759, "step": 52 }, { "epoch": 0.49, "grad_norm": 29.69761238134042, "learning_rate": 1.2643678160919542e-05, "loss": 2.7786, "step": 53 }, { "epoch": 0.5, "grad_norm": 176.00736845370992, "learning_rate": 1.2413793103448277e-05, "loss": 2.8321, "step": 54 }, { "epoch": 0.51, "grad_norm": 153.2054061852094, "learning_rate": 1.2183908045977013e-05, "loss": 2.8115, "step": 55 }, { "epoch": 0.52, "grad_norm": 126.2155777640482, "learning_rate": 1.1954022988505748e-05, "loss": 2.6866, "step": 56 }, { "epoch": 0.53, "grad_norm": 103.57218384115014, "learning_rate": 1.1724137931034483e-05, "loss": 2.7394, "step": 57 }, { "epoch": 0.54, "grad_norm": 50.05498929302652, "learning_rate": 1.1494252873563218e-05, "loss": 2.7124, "step": 58 }, { "epoch": 0.55, "grad_norm": 165.3252062697186, "learning_rate": 1.1264367816091955e-05, "loss": 2.7174, "step": 59 }, { "epoch": 0.56, "grad_norm": 52.16714610569089, "learning_rate": 1.103448275862069e-05, "loss": 2.8025, "step": 60 }, { "epoch": 0.57, "grad_norm": 302.1963481124482, "learning_rate": 1.0804597701149427e-05, "loss": 2.7509, "step": 61 }, { "epoch": 0.58, "grad_norm": 196.17868927332552, "learning_rate": 1.0574712643678162e-05, "loss": 2.694, "step": 62 }, { "epoch": 0.59, "grad_norm": 79.62041600325415, "learning_rate": 1.0344827586206898e-05, "loss": 2.7608, "step": 63 }, { "epoch": 0.6, "grad_norm": 123.97504395416644, "learning_rate": 1.0114942528735633e-05, "loss": 2.6213, "step": 64 }, { "epoch": 0.61, "grad_norm": 1.165291463877395, "learning_rate": 9.885057471264368e-06, "loss": 2.7338, "step": 65 }, { "epoch": 0.61, "grad_norm": 1.091132765565006, "learning_rate": 9.655172413793105e-06, "loss": 2.6933, "step": 66 }, { "epoch": 0.62, "grad_norm": 448.63071186110295, "learning_rate": 9.42528735632184e-06, "loss": 2.7006, "step": 67 }, { "epoch": 0.63, "grad_norm": 115.13354168909684, "learning_rate": 9.195402298850575e-06, "loss": 2.6215, "step": 68 }, { "epoch": 0.64, "grad_norm": 57.16458087522924, "learning_rate": 8.965517241379312e-06, "loss": 2.6976, "step": 69 }, { "epoch": 0.65, "grad_norm": 292.92454499836646, "learning_rate": 8.735632183908047e-06, "loss": 2.7857, "step": 70 }, { "epoch": 0.66, "grad_norm": 178.63614708920196, "learning_rate": 8.505747126436782e-06, "loss": 2.7472, "step": 71 }, { "epoch": 0.67, "grad_norm": 100.69834768536423, "learning_rate": 8.275862068965518e-06, "loss": 2.657, "step": 72 }, { "epoch": 0.68, "grad_norm": 153.57918611013017, "learning_rate": 8.045977011494253e-06, "loss": 2.7704, "step": 73 }, { "epoch": 0.69, "grad_norm": 65.39478593392404, "learning_rate": 7.81609195402299e-06, "loss": 2.7119, "step": 74 }, { "epoch": 0.7, "grad_norm": 31.429940924593502, "learning_rate": 7.586206896551724e-06, "loss": 2.7455, "step": 75 }, { "epoch": 0.71, "grad_norm": 58.56657911687454, "learning_rate": 7.35632183908046e-06, "loss": 2.8129, "step": 76 }, { "epoch": 0.72, "grad_norm": 61.468567297644086, "learning_rate": 7.126436781609196e-06, "loss": 2.6448, "step": 77 }, { "epoch": 0.73, "grad_norm": 60.14776320004552, "learning_rate": 6.896551724137932e-06, "loss": 2.7367, "step": 78 }, { "epoch": 0.74, "grad_norm": 51.31183150828252, "learning_rate": 6.666666666666667e-06, "loss": 2.7281, "step": 79 }, { "epoch": 0.74, "grad_norm": 95.73532239004055, "learning_rate": 6.4367816091954025e-06, "loss": 2.6449, "step": 80 }, { "epoch": 0.75, "grad_norm": 31.12077480577434, "learning_rate": 6.206896551724138e-06, "loss": 2.7176, "step": 81 }, { "epoch": 0.76, "grad_norm": 34.291115689553976, "learning_rate": 5.977011494252874e-06, "loss": 2.6747, "step": 82 }, { "epoch": 0.77, "grad_norm": 236.6452711866434, "learning_rate": 5.747126436781609e-06, "loss": 2.7139, "step": 83 }, { "epoch": 0.78, "grad_norm": 83.59601048345549, "learning_rate": 5.517241379310345e-06, "loss": 2.6815, "step": 84 }, { "epoch": 0.79, "grad_norm": 124.43972875954849, "learning_rate": 5.287356321839081e-06, "loss": 2.6672, "step": 85 }, { "epoch": 0.8, "grad_norm": 254.94322284579366, "learning_rate": 5.057471264367817e-06, "loss": 2.6687, "step": 86 }, { "epoch": 0.81, "grad_norm": 18.73181409727846, "learning_rate": 4.8275862068965525e-06, "loss": 2.695, "step": 87 }, { "epoch": 0.82, "grad_norm": 23.17924117998015, "learning_rate": 4.5977011494252875e-06, "loss": 2.7384, "step": 88 }, { "epoch": 0.83, "grad_norm": 77.75708054073758, "learning_rate": 4.367816091954023e-06, "loss": 2.7446, "step": 89 }, { "epoch": 0.84, "grad_norm": 144.37852403707763, "learning_rate": 4.137931034482759e-06, "loss": 2.6592, "step": 90 }, { "epoch": 0.85, "grad_norm": 241.76402492926857, "learning_rate": 3.908045977011495e-06, "loss": 2.6332, "step": 91 }, { "epoch": 0.86, "grad_norm": 93.21581142703205, "learning_rate": 3.67816091954023e-06, "loss": 2.7159, "step": 92 }, { "epoch": 0.87, "grad_norm": 90.82646249338626, "learning_rate": 3.448275862068966e-06, "loss": 2.6985, "step": 93 }, { "epoch": 0.87, "grad_norm": 0.867778542293462, "learning_rate": 3.2183908045977012e-06, "loss": 2.6932, "step": 94 }, { "epoch": 0.88, "grad_norm": 31.050236512786825, "learning_rate": 2.988505747126437e-06, "loss": 2.6976, "step": 95 }, { "epoch": 0.89, "grad_norm": 204.7615801059857, "learning_rate": 2.7586206896551725e-06, "loss": 2.6847, "step": 96 }, { "epoch": 0.9, "grad_norm": 73.48986829616379, "learning_rate": 2.5287356321839083e-06, "loss": 2.691, "step": 97 }, { "epoch": 0.91, "grad_norm": 28.74608392996503, "learning_rate": 2.2988505747126437e-06, "loss": 2.5344, "step": 98 }, { "epoch": 0.92, "grad_norm": 61.404625853203235, "learning_rate": 2.0689655172413796e-06, "loss": 2.6812, "step": 99 }, { "epoch": 0.93, "grad_norm": 249.76608588086975, "learning_rate": 1.839080459770115e-06, "loss": 2.6055, "step": 100 }, { "epoch": 0.94, "grad_norm": 205.75138593093365, "learning_rate": 1.6091954022988506e-06, "loss": 2.6435, "step": 101 }, { "epoch": 0.95, "grad_norm": 81.41982852206519, "learning_rate": 1.3793103448275862e-06, "loss": 2.6747, "step": 102 }, { "epoch": 0.96, "grad_norm": 139.53060260208153, "learning_rate": 1.1494252873563219e-06, "loss": 2.606, "step": 103 }, { "epoch": 0.97, "grad_norm": 83.28651777429977, "learning_rate": 9.195402298850575e-07, "loss": 2.6382, "step": 104 }, { "epoch": 0.98, "grad_norm": 170.369153421328, "learning_rate": 6.896551724137931e-07, "loss": 2.5977, "step": 105 }, { "epoch": 0.99, "grad_norm": 106.50088486281464, "learning_rate": 4.5977011494252875e-07, "loss": 2.6409, "step": 106 }, { "epoch": 1.0, "grad_norm": 51.96983283974103, "learning_rate": 2.2988505747126437e-07, "loss": 2.5618, "step": 107 }, { "epoch": 1.0, "step": 107, "total_flos": 202218871455744.0, "train_loss": 2.9570909036654176, "train_runtime": 109970.5724, "train_samples_per_second": 0.25, "train_steps_per_second": 0.001 } ], "logging_steps": 1.0, "max_steps": 107, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 202218871455744.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }