{ "best_metric": null, "best_model_checkpoint": null, "epoch": 15.238095238095237, "eval_steps": 500, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.19047619047619047, "grad_norm": 0.9942206144332886, "learning_rate": 0.00019750000000000003, "loss": 9.5613, "step": 2 }, { "epoch": 0.38095238095238093, "grad_norm": 1.2790788412094116, "learning_rate": 0.000195, "loss": 9.2339, "step": 4 }, { "epoch": 0.5714285714285714, "grad_norm": 1.9939367771148682, "learning_rate": 0.00019250000000000002, "loss": 8.7953, "step": 6 }, { "epoch": 0.7619047619047619, "grad_norm": 2.010485887527466, "learning_rate": 0.00019, "loss": 8.2168, "step": 8 }, { "epoch": 0.9523809523809523, "grad_norm": 1.352328896522522, "learning_rate": 0.0001875, "loss": 7.8941, "step": 10 }, { "epoch": 1.1428571428571428, "grad_norm": 0.9626594185829163, "learning_rate": 0.00018500000000000002, "loss": 7.6817, "step": 12 }, { "epoch": 1.3333333333333333, "grad_norm": 1.1568268537521362, "learning_rate": 0.0001825, "loss": 7.5131, "step": 14 }, { "epoch": 1.5238095238095237, "grad_norm": 1.0264520645141602, "learning_rate": 0.00018, "loss": 7.4247, "step": 16 }, { "epoch": 1.7142857142857144, "grad_norm": 0.9865540862083435, "learning_rate": 0.0001775, "loss": 7.4369, "step": 18 }, { "epoch": 1.9047619047619047, "grad_norm": 1.0182702541351318, "learning_rate": 0.000175, "loss": 7.3787, "step": 20 }, { "epoch": 2.0952380952380953, "grad_norm": 0.7922359108924866, "learning_rate": 0.00017250000000000002, "loss": 7.373, "step": 22 }, { "epoch": 2.2857142857142856, "grad_norm": 0.7033187747001648, "learning_rate": 0.00017, "loss": 7.3096, "step": 24 }, { "epoch": 2.4761904761904763, "grad_norm": 2.9758119583129883, "learning_rate": 0.0001675, "loss": 7.1991, "step": 26 }, { "epoch": 2.6666666666666665, "grad_norm": 0.7531760931015015, "learning_rate": 0.000165, "loss": 7.2661, "step": 28 }, { "epoch": 2.857142857142857, "grad_norm": 1.3790533542633057, "learning_rate": 0.00016250000000000002, "loss": 7.2782, "step": 30 }, { "epoch": 3.0476190476190474, "grad_norm": 0.6538093686103821, "learning_rate": 0.00016, "loss": 7.2109, "step": 32 }, { "epoch": 3.238095238095238, "grad_norm": 0.6145215630531311, "learning_rate": 0.0001575, "loss": 7.2192, "step": 34 }, { "epoch": 3.4285714285714284, "grad_norm": 0.4128475785255432, "learning_rate": 0.000155, "loss": 7.2892, "step": 36 }, { "epoch": 3.619047619047619, "grad_norm": 1.0160013437271118, "learning_rate": 0.0001525, "loss": 7.2049, "step": 38 }, { "epoch": 3.8095238095238093, "grad_norm": 0.5834835171699524, "learning_rate": 0.00015000000000000001, "loss": 7.1672, "step": 40 }, { "epoch": 4.0, "grad_norm": 0.4894554615020752, "learning_rate": 0.0001475, "loss": 7.1269, "step": 42 }, { "epoch": 4.190476190476191, "grad_norm": 0.593618631362915, "learning_rate": 0.000145, "loss": 7.0175, "step": 44 }, { "epoch": 4.380952380952381, "grad_norm": 1.6190487146377563, "learning_rate": 0.00014250000000000002, "loss": 7.2919, "step": 46 }, { "epoch": 4.571428571428571, "grad_norm": 0.755859911441803, "learning_rate": 0.00014, "loss": 7.1624, "step": 48 }, { "epoch": 4.761904761904762, "grad_norm": 0.46613645553588867, "learning_rate": 0.0001375, "loss": 7.2233, "step": 50 }, { "epoch": 4.9523809523809526, "grad_norm": 0.5973020792007446, "learning_rate": 0.00013500000000000003, "loss": 7.1642, "step": 52 }, { "epoch": 5.142857142857143, "grad_norm": 0.97837233543396, "learning_rate": 0.0001325, "loss": 7.1172, "step": 54 }, { "epoch": 5.333333333333333, "grad_norm": 0.9348046183586121, "learning_rate": 0.00013000000000000002, "loss": 7.1564, "step": 56 }, { "epoch": 5.523809523809524, "grad_norm": 0.6632198691368103, "learning_rate": 0.0001275, "loss": 7.0821, "step": 58 }, { "epoch": 5.714285714285714, "grad_norm": 0.7776179909706116, "learning_rate": 0.000125, "loss": 7.2272, "step": 60 }, { "epoch": 5.904761904761905, "grad_norm": 0.6282438039779663, "learning_rate": 0.00012250000000000002, "loss": 7.0926, "step": 62 }, { "epoch": 6.095238095238095, "grad_norm": 0.6008353233337402, "learning_rate": 0.00012, "loss": 7.1073, "step": 64 }, { "epoch": 6.285714285714286, "grad_norm": 0.8796420097351074, "learning_rate": 0.00011750000000000001, "loss": 7.1737, "step": 66 }, { "epoch": 6.476190476190476, "grad_norm": 0.6400454640388489, "learning_rate": 0.00011499999999999999, "loss": 7.0924, "step": 68 }, { "epoch": 6.666666666666667, "grad_norm": 0.5479526519775391, "learning_rate": 0.00011250000000000001, "loss": 7.1275, "step": 70 }, { "epoch": 6.857142857142857, "grad_norm": 0.5992618203163147, "learning_rate": 0.00011000000000000002, "loss": 7.0599, "step": 72 }, { "epoch": 7.0476190476190474, "grad_norm": 0.5336684584617615, "learning_rate": 0.0001075, "loss": 7.0206, "step": 74 }, { "epoch": 7.238095238095238, "grad_norm": 0.3991040289402008, "learning_rate": 0.000105, "loss": 7.0123, "step": 76 }, { "epoch": 7.428571428571429, "grad_norm": 1.032917857170105, "learning_rate": 0.0001025, "loss": 7.0267, "step": 78 }, { "epoch": 7.619047619047619, "grad_norm": 0.5554404854774475, "learning_rate": 0.0001, "loss": 7.0203, "step": 80 }, { "epoch": 7.809523809523809, "grad_norm": 0.7755109667778015, "learning_rate": 9.75e-05, "loss": 7.1445, "step": 82 }, { "epoch": 8.0, "grad_norm": 1.8295842409133911, "learning_rate": 9.5e-05, "loss": 7.0002, "step": 84 }, { "epoch": 8.19047619047619, "grad_norm": 1.4985620975494385, "learning_rate": 9.250000000000001e-05, "loss": 7.0613, "step": 86 }, { "epoch": 8.380952380952381, "grad_norm": 1.0733778476715088, "learning_rate": 9e-05, "loss": 7.0594, "step": 88 }, { "epoch": 8.571428571428571, "grad_norm": 0.7009026408195496, "learning_rate": 8.75e-05, "loss": 6.9432, "step": 90 }, { "epoch": 8.761904761904763, "grad_norm": 1.195196509361267, "learning_rate": 8.5e-05, "loss": 6.9266, "step": 92 }, { "epoch": 8.952380952380953, "grad_norm": 2.6835684776306152, "learning_rate": 8.25e-05, "loss": 6.9855, "step": 94 }, { "epoch": 9.142857142857142, "grad_norm": 0.7434377670288086, "learning_rate": 8e-05, "loss": 6.7975, "step": 96 }, { "epoch": 9.333333333333334, "grad_norm": 0.5993837118148804, "learning_rate": 7.75e-05, "loss": 7.0476, "step": 98 }, { "epoch": 9.523809523809524, "grad_norm": 0.4656153619289398, "learning_rate": 7.500000000000001e-05, "loss": 6.9894, "step": 100 }, { "epoch": 9.714285714285714, "grad_norm": 0.7926774621009827, "learning_rate": 7.25e-05, "loss": 6.9854, "step": 102 }, { "epoch": 9.904761904761905, "grad_norm": 1.0828678607940674, "learning_rate": 7e-05, "loss": 6.9185, "step": 104 }, { "epoch": 10.095238095238095, "grad_norm": 0.6923830509185791, "learning_rate": 6.750000000000001e-05, "loss": 6.9804, "step": 106 }, { "epoch": 10.285714285714286, "grad_norm": 0.5546735525131226, "learning_rate": 6.500000000000001e-05, "loss": 6.9273, "step": 108 }, { "epoch": 10.476190476190476, "grad_norm": 0.8265076875686646, "learning_rate": 6.25e-05, "loss": 6.9087, "step": 110 }, { "epoch": 10.666666666666666, "grad_norm": 0.3945198655128479, "learning_rate": 6e-05, "loss": 6.9375, "step": 112 }, { "epoch": 10.857142857142858, "grad_norm": 0.5948878526687622, "learning_rate": 5.7499999999999995e-05, "loss": 6.8764, "step": 114 }, { "epoch": 11.047619047619047, "grad_norm": 0.7741471529006958, "learning_rate": 5.500000000000001e-05, "loss": 6.7551, "step": 116 }, { "epoch": 11.238095238095237, "grad_norm": 0.32554784417152405, "learning_rate": 5.25e-05, "loss": 6.8862, "step": 118 }, { "epoch": 11.428571428571429, "grad_norm": 0.5033702850341797, "learning_rate": 5e-05, "loss": 6.7297, "step": 120 }, { "epoch": 11.619047619047619, "grad_norm": 0.5291158556938171, "learning_rate": 4.75e-05, "loss": 6.9826, "step": 122 }, { "epoch": 11.80952380952381, "grad_norm": 0.39498385787010193, "learning_rate": 4.5e-05, "loss": 6.837, "step": 124 }, { "epoch": 12.0, "grad_norm": 0.4402136206626892, "learning_rate": 4.25e-05, "loss": 7.0434, "step": 126 }, { "epoch": 12.19047619047619, "grad_norm": 0.6476764678955078, "learning_rate": 4e-05, "loss": 6.8524, "step": 128 }, { "epoch": 12.380952380952381, "grad_norm": 0.330609530210495, "learning_rate": 3.7500000000000003e-05, "loss": 6.8742, "step": 130 }, { "epoch": 12.571428571428571, "grad_norm": 0.5420040488243103, "learning_rate": 3.5e-05, "loss": 6.7931, "step": 132 }, { "epoch": 12.761904761904763, "grad_norm": 0.3482373356819153, "learning_rate": 3.2500000000000004e-05, "loss": 6.883, "step": 134 }, { "epoch": 12.952380952380953, "grad_norm": 0.3476051092147827, "learning_rate": 3e-05, "loss": 6.9857, "step": 136 }, { "epoch": 13.142857142857142, "grad_norm": 0.43590274453163147, "learning_rate": 2.8749999999999997e-05, "loss": 8.116, "step": 138 }, { "epoch": 13.333333333333334, "grad_norm": 0.2993098497390747, "learning_rate": 2.625e-05, "loss": 6.657, "step": 140 }, { "epoch": 13.523809523809524, "grad_norm": 0.3477262556552887, "learning_rate": 2.375e-05, "loss": 6.9781, "step": 142 }, { "epoch": 13.714285714285714, "grad_norm": 0.47370073199272156, "learning_rate": 2.125e-05, "loss": 6.9277, "step": 144 }, { "epoch": 13.904761904761905, "grad_norm": 0.3924289345741272, "learning_rate": 1.8750000000000002e-05, "loss": 6.8967, "step": 146 }, { "epoch": 14.095238095238095, "grad_norm": 0.5621922612190247, "learning_rate": 1.6250000000000002e-05, "loss": 6.7197, "step": 148 }, { "epoch": 14.285714285714286, "grad_norm": 0.3454875349998474, "learning_rate": 1.3750000000000002e-05, "loss": 6.9314, "step": 150 }, { "epoch": 14.476190476190476, "grad_norm": 0.3146642744541168, "learning_rate": 1.125e-05, "loss": 6.9142, "step": 152 }, { "epoch": 14.666666666666666, "grad_norm": 0.3762160837650299, "learning_rate": 8.75e-06, "loss": 6.8759, "step": 154 }, { "epoch": 14.857142857142858, "grad_norm": 0.33906954526901245, "learning_rate": 6.25e-06, "loss": 6.8712, "step": 156 }, { "epoch": 15.047619047619047, "grad_norm": 0.3414846360683441, "learning_rate": 3.75e-06, "loss": 6.737, "step": 158 }, { "epoch": 15.238095238095237, "grad_norm": 0.4463809132575989, "learning_rate": 1.25e-06, "loss": 6.9144, "step": 160 }, { "epoch": 15.238095238095237, "step": 160, "total_flos": 800861569170024.0, "train_loss": 7.170098584890366, "train_runtime": 677.1666, "train_samples_per_second": 3.969, "train_steps_per_second": 0.236 } ], "logging_steps": 2, "max_steps": 160, "num_input_tokens_seen": 0, "num_train_epochs": 16, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 800861569170024.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }