{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.306122448979592, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0326530612244898, "grad_norm": 0.3159657120704651, "learning_rate": 4.9985361990992455e-05, "loss": 0.1654, "step": 5 }, { "epoch": 0.0653061224489796, "grad_norm": 0.32706642150878906, "learning_rate": 4.9941465105674435e-05, "loss": 0.1369, "step": 10 }, { "epoch": 0.09795918367346938, "grad_norm": 0.33407843112945557, "learning_rate": 4.986836074908616e-05, "loss": 0.1259, "step": 15 }, { "epoch": 0.1306122448979592, "grad_norm": 0.3189881443977356, "learning_rate": 4.976613452940604e-05, "loss": 0.1041, "step": 20 }, { "epoch": 0.16326530612244897, "grad_norm": 0.3424989581108093, "learning_rate": 4.9634906157700036e-05, "loss": 0.1004, "step": 25 }, { "epoch": 0.19591836734693877, "grad_norm": 0.3253389298915863, "learning_rate": 4.9474829307735115e-05, "loss": 0.0941, "step": 30 }, { "epoch": 0.22857142857142856, "grad_norm": 0.26078635454177856, "learning_rate": 4.9286091436021015e-05, "loss": 0.0867, "step": 35 }, { "epoch": 0.2612244897959184, "grad_norm": 0.252139687538147, "learning_rate": 4.906891356229103e-05, "loss": 0.0853, "step": 40 }, { "epoch": 0.2938775510204082, "grad_norm": 0.3403972387313843, "learning_rate": 4.882355001067892e-05, "loss": 0.0863, "step": 45 }, { "epoch": 0.32653061224489793, "grad_norm": 0.4710679352283478, "learning_rate": 4.855028811189496e-05, "loss": 0.0874, "step": 50 }, { "epoch": 0.35918367346938773, "grad_norm": 0.3147217929363251, "learning_rate": 4.8249447866750025e-05, "loss": 0.0733, "step": 55 }, { "epoch": 0.39183673469387753, "grad_norm": 0.3265310823917389, "learning_rate": 4.792138157142158e-05, "loss": 0.0719, "step": 60 }, { "epoch": 0.42448979591836733, "grad_norm": 0.35432252287864685, "learning_rate": 4.75664734049005e-05, "loss": 0.0824, "step": 65 }, { "epoch": 0.45714285714285713, "grad_norm": 0.3701626658439636, "learning_rate": 4.7185138979101864e-05, "loss": 0.0731, "step": 70 }, { "epoch": 0.4897959183673469, "grad_norm": 0.35868266224861145, "learning_rate": 4.677782485216644e-05, "loss": 0.0725, "step": 75 }, { "epoch": 0.5224489795918368, "grad_norm": 0.32440632581710815, "learning_rate": 4.6345008005522966e-05, "loss": 0.0694, "step": 80 }, { "epoch": 0.5551020408163265, "grad_norm": 0.3003002405166626, "learning_rate": 4.588719528532342e-05, "loss": 0.072, "step": 85 }, { "epoch": 0.5877551020408164, "grad_norm": 0.34989920258522034, "learning_rate": 4.540492280890555e-05, "loss": 0.0646, "step": 90 }, { "epoch": 0.6204081632653061, "grad_norm": 0.473254919052124, "learning_rate": 4.4898755336977673e-05, "loss": 0.0732, "step": 95 }, { "epoch": 0.6530612244897959, "grad_norm": 0.30768489837646484, "learning_rate": 4.436928561226087e-05, "loss": 0.068, "step": 100 }, { "epoch": 0.6857142857142857, "grad_norm": 0.31777673959732056, "learning_rate": 4.381713366536311e-05, "loss": 0.0749, "step": 105 }, { "epoch": 0.7183673469387755, "grad_norm": 0.40202295780181885, "learning_rate": 4.324294608869817e-05, "loss": 0.0652, "step": 110 }, { "epoch": 0.7510204081632653, "grad_norm": 0.4353463053703308, "learning_rate": 4.264739527929959e-05, "loss": 0.0562, "step": 115 }, { "epoch": 0.7836734693877551, "grad_norm": 0.4079282879829407, "learning_rate": 4.203117865141635e-05, "loss": 0.0602, "step": 120 }, { "epoch": 0.8163265306122449, "grad_norm": 0.40490931272506714, "learning_rate": 4.1395017819812445e-05, "loss": 0.0639, "step": 125 }, { "epoch": 0.8489795918367347, "grad_norm": 0.423981636762619, "learning_rate": 4.07396577547265e-05, "loss": 0.0651, "step": 130 }, { "epoch": 0.8816326530612245, "grad_norm": 0.3315489590167999, "learning_rate": 4.0065865909481417e-05, "loss": 0.0676, "step": 135 }, { "epoch": 0.9142857142857143, "grad_norm": 0.560667097568512, "learning_rate": 3.937443132176517e-05, "loss": 0.0669, "step": 140 }, { "epoch": 0.9469387755102041, "grad_norm": 0.4202517569065094, "learning_rate": 3.8666163689635616e-05, "loss": 0.0631, "step": 145 }, { "epoch": 0.9795918367346939, "grad_norm": 0.4563729465007782, "learning_rate": 3.794189242333106e-05, "loss": 0.0649, "step": 150 }, { "epoch": 1.0122448979591836, "grad_norm": 0.3522864580154419, "learning_rate": 3.720246567399712e-05, "loss": 0.059, "step": 155 }, { "epoch": 1.0448979591836736, "grad_norm": 0.38459908962249756, "learning_rate": 3.644874934046716e-05, "loss": 0.062, "step": 160 }, { "epoch": 1.0775510204081633, "grad_norm": 0.37406954169273376, "learning_rate": 3.568162605525953e-05, "loss": 0.0565, "step": 165 }, { "epoch": 1.110204081632653, "grad_norm": 0.3395706117153168, "learning_rate": 3.490199415097892e-05, "loss": 0.0575, "step": 170 }, { "epoch": 1.1428571428571428, "grad_norm": 0.31273558735847473, "learning_rate": 3.4110766608332347e-05, "loss": 0.0589, "step": 175 }, { "epoch": 1.1755102040816325, "grad_norm": 0.44675061106681824, "learning_rate": 3.330886998699149e-05, "loss": 0.0611, "step": 180 }, { "epoch": 1.2081632653061225, "grad_norm": 0.34769827127456665, "learning_rate": 3.249724334055367e-05, "loss": 0.062, "step": 185 }, { "epoch": 1.2408163265306122, "grad_norm": 0.31832146644592285, "learning_rate": 3.167683711687179e-05, "loss": 0.0616, "step": 190 }, { "epoch": 1.273469387755102, "grad_norm": 0.4153653085231781, "learning_rate": 3.084861204504122e-05, "loss": 0.0586, "step": 195 }, { "epoch": 1.306122448979592, "grad_norm": 0.36249643564224243, "learning_rate": 3.001353801034688e-05, "loss": 0.0578, "step": 200 } ], "logging_steps": 5, "max_steps": 459, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 1.2409791545887949e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }