{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 11140, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18, "grad_norm": 6.054772853851318, "learning_rate": 1.9820466786355476e-05, "loss": 0.5721, "step": 100 }, { "epoch": 0.36, "grad_norm": 9.810357093811035, "learning_rate": 1.9640933572710953e-05, "loss": 0.3846, "step": 200 }, { "epoch": 0.54, "grad_norm": 9.026122093200684, "learning_rate": 1.9461400359066428e-05, "loss": 0.3019, "step": 300 }, { "epoch": 0.72, "grad_norm": 11.95788288116455, "learning_rate": 1.9281867145421905e-05, "loss": 0.3071, "step": 400 }, { "epoch": 0.9, "grad_norm": 15.329608917236328, "learning_rate": 1.910233393177738e-05, "loss": 0.2907, "step": 500 }, { "epoch": 1.08, "grad_norm": 12.364314079284668, "learning_rate": 1.8922800718132857e-05, "loss": 0.2646, "step": 600 }, { "epoch": 1.26, "grad_norm": 14.555986404418945, "learning_rate": 1.874326750448833e-05, "loss": 0.2097, "step": 700 }, { "epoch": 1.44, "grad_norm": 4.199421405792236, "learning_rate": 1.8563734290843805e-05, "loss": 0.2389, "step": 800 }, { "epoch": 1.62, "grad_norm": 22.485984802246094, "learning_rate": 1.8384201077199283e-05, "loss": 0.2019, "step": 900 }, { "epoch": 1.8, "grad_norm": 9.688520431518555, "learning_rate": 1.820466786355476e-05, "loss": 0.2594, "step": 1000 }, { "epoch": 1.97, "grad_norm": 18.052719116210938, "learning_rate": 1.8025134649910235e-05, "loss": 0.2044, "step": 1100 }, { "epoch": 2.15, "grad_norm": 0.7371789216995239, "learning_rate": 1.7845601436265712e-05, "loss": 0.1551, "step": 1200 }, { "epoch": 2.33, "grad_norm": 20.938648223876953, "learning_rate": 1.7666068222621186e-05, "loss": 0.1463, "step": 1300 }, { "epoch": 2.51, "grad_norm": 0.25227147340774536, "learning_rate": 1.748653500897666e-05, "loss": 0.1493, "step": 1400 }, { "epoch": 2.69, "grad_norm": 0.27634137868881226, "learning_rate": 1.7307001795332138e-05, "loss": 0.1649, "step": 1500 }, { "epoch": 2.87, "grad_norm": 0.2588340938091278, "learning_rate": 1.7127468581687616e-05, "loss": 0.1521, "step": 1600 }, { "epoch": 3.05, "grad_norm": 0.05350634083151817, "learning_rate": 1.694793536804309e-05, "loss": 0.1343, "step": 1700 }, { "epoch": 3.23, "grad_norm": 0.02972230687737465, "learning_rate": 1.6768402154398564e-05, "loss": 0.1068, "step": 1800 }, { "epoch": 3.41, "grad_norm": 0.09572970867156982, "learning_rate": 1.658886894075404e-05, "loss": 0.1151, "step": 1900 }, { "epoch": 3.59, "grad_norm": 21.431325912475586, "learning_rate": 1.6409335727109516e-05, "loss": 0.1073, "step": 2000 }, { "epoch": 3.77, "grad_norm": 1.4688669443130493, "learning_rate": 1.6229802513464993e-05, "loss": 0.1098, "step": 2100 }, { "epoch": 3.95, "grad_norm": 19.461355209350586, "learning_rate": 1.6050269299820467e-05, "loss": 0.1238, "step": 2200 }, { "epoch": 4.13, "grad_norm": 6.33543586730957, "learning_rate": 1.5870736086175945e-05, "loss": 0.0934, "step": 2300 }, { "epoch": 4.31, "grad_norm": 10.25698184967041, "learning_rate": 1.569120287253142e-05, "loss": 0.068, "step": 2400 }, { "epoch": 4.49, "grad_norm": 0.05421575903892517, "learning_rate": 1.5511669658886893e-05, "loss": 0.0767, "step": 2500 }, { "epoch": 4.67, "grad_norm": 0.04917303845286369, "learning_rate": 1.533213644524237e-05, "loss": 0.1053, "step": 2600 }, { "epoch": 4.85, "grad_norm": 1.722901463508606, "learning_rate": 1.5152603231597847e-05, "loss": 0.0513, "step": 2700 }, { "epoch": 5.03, "grad_norm": 28.548158645629883, "learning_rate": 1.4973070017953321e-05, "loss": 0.0611, "step": 2800 }, { "epoch": 5.21, "grad_norm": 1.0562294721603394, "learning_rate": 1.4793536804308799e-05, "loss": 0.0543, "step": 2900 }, { "epoch": 5.39, "grad_norm": 0.011326675303280354, "learning_rate": 1.4614003590664274e-05, "loss": 0.0491, "step": 3000 }, { "epoch": 5.57, "grad_norm": 0.009987740777432919, "learning_rate": 1.4434470377019749e-05, "loss": 0.0567, "step": 3100 }, { "epoch": 5.75, "grad_norm": 26.81354331970215, "learning_rate": 1.4254937163375226e-05, "loss": 0.0641, "step": 3200 }, { "epoch": 5.92, "grad_norm": 0.042816389352083206, "learning_rate": 1.4075403949730702e-05, "loss": 0.0488, "step": 3300 }, { "epoch": 6.1, "grad_norm": 0.014200630597770214, "learning_rate": 1.3895870736086176e-05, "loss": 0.0374, "step": 3400 }, { "epoch": 6.28, "grad_norm": 0.004939161241054535, "learning_rate": 1.3716337522441652e-05, "loss": 0.0393, "step": 3500 }, { "epoch": 6.46, "grad_norm": 39.326534271240234, "learning_rate": 1.353680430879713e-05, "loss": 0.0404, "step": 3600 }, { "epoch": 6.64, "grad_norm": 1.0669581890106201, "learning_rate": 1.3357271095152604e-05, "loss": 0.03, "step": 3700 }, { "epoch": 6.82, "grad_norm": 0.11205600947141647, "learning_rate": 1.317773788150808e-05, "loss": 0.0416, "step": 3800 }, { "epoch": 7.0, "grad_norm": 0.005933618638664484, "learning_rate": 1.2998204667863557e-05, "loss": 0.0498, "step": 3900 }, { "epoch": 7.18, "grad_norm": 0.0025591508019715548, "learning_rate": 1.2818671454219031e-05, "loss": 0.0214, "step": 4000 }, { "epoch": 7.36, "grad_norm": 0.0021508955396711826, "learning_rate": 1.2639138240574507e-05, "loss": 0.0346, "step": 4100 }, { "epoch": 7.54, "grad_norm": 0.002263248898088932, "learning_rate": 1.2459605026929983e-05, "loss": 0.0315, "step": 4200 }, { "epoch": 7.72, "grad_norm": 0.0016491829883307219, "learning_rate": 1.2280071813285459e-05, "loss": 0.0238, "step": 4300 }, { "epoch": 7.9, "grad_norm": 0.0034240155946463346, "learning_rate": 1.2100538599640935e-05, "loss": 0.0319, "step": 4400 }, { "epoch": 8.08, "grad_norm": 0.002759799361228943, "learning_rate": 1.1921005385996409e-05, "loss": 0.0487, "step": 4500 }, { "epoch": 8.26, "grad_norm": 34.639888763427734, "learning_rate": 1.1741472172351887e-05, "loss": 0.053, "step": 4600 }, { "epoch": 8.44, "grad_norm": 0.001209100242704153, "learning_rate": 1.1561938958707361e-05, "loss": 0.0143, "step": 4700 }, { "epoch": 8.62, "grad_norm": 0.0022134396713227034, "learning_rate": 1.1382405745062837e-05, "loss": 0.0291, "step": 4800 }, { "epoch": 8.8, "grad_norm": 0.06826081871986389, "learning_rate": 1.1202872531418314e-05, "loss": 0.0301, "step": 4900 }, { "epoch": 8.98, "grad_norm": 0.0025578713975846767, "learning_rate": 1.1023339317773789e-05, "loss": 0.0194, "step": 5000 }, { "epoch": 9.16, "grad_norm": 0.01461367029696703, "learning_rate": 1.0843806104129264e-05, "loss": 0.0152, "step": 5100 }, { "epoch": 9.34, "grad_norm": 0.0008313595899380744, "learning_rate": 1.0664272890484742e-05, "loss": 0.0163, "step": 5200 }, { "epoch": 9.52, "grad_norm": 1.5908000469207764, "learning_rate": 1.0484739676840216e-05, "loss": 0.0352, "step": 5300 }, { "epoch": 9.69, "grad_norm": 0.0016398399602621794, "learning_rate": 1.0305206463195692e-05, "loss": 0.0268, "step": 5400 }, { "epoch": 9.87, "grad_norm": 0.005051742307841778, "learning_rate": 1.0125673249551166e-05, "loss": 0.0244, "step": 5500 }, { "epoch": 10.05, "grad_norm": 0.11044131964445114, "learning_rate": 9.946140035906644e-06, "loss": 0.0318, "step": 5600 }, { "epoch": 10.23, "grad_norm": 0.02652016654610634, "learning_rate": 9.76660682226212e-06, "loss": 0.0224, "step": 5700 }, { "epoch": 10.41, "grad_norm": 0.0051875789649784565, "learning_rate": 9.587073608617595e-06, "loss": 0.0086, "step": 5800 }, { "epoch": 10.59, "grad_norm": 0.0023671940434724092, "learning_rate": 9.40754039497307e-06, "loss": 0.0061, "step": 5900 }, { "epoch": 10.77, "grad_norm": 0.14054147899150848, "learning_rate": 9.228007181328547e-06, "loss": 0.0222, "step": 6000 }, { "epoch": 10.95, "grad_norm": 0.002135949907824397, "learning_rate": 9.048473967684023e-06, "loss": 0.0205, "step": 6100 }, { "epoch": 11.13, "grad_norm": 0.5219959616661072, "learning_rate": 8.868940754039497e-06, "loss": 0.0243, "step": 6200 }, { "epoch": 11.31, "grad_norm": 0.004496410954743624, "learning_rate": 8.689407540394975e-06, "loss": 0.0232, "step": 6300 }, { "epoch": 11.49, "grad_norm": 0.0012191541027277708, "learning_rate": 8.509874326750449e-06, "loss": 0.0174, "step": 6400 }, { "epoch": 11.67, "grad_norm": 0.0010406944202259183, "learning_rate": 8.330341113105925e-06, "loss": 0.0002, "step": 6500 }, { "epoch": 11.85, "grad_norm": 0.0007079216302372515, "learning_rate": 8.1508078994614e-06, "loss": 0.0181, "step": 6600 }, { "epoch": 12.03, "grad_norm": 0.0031879213638603687, "learning_rate": 7.971274685816877e-06, "loss": 0.0029, "step": 6700 }, { "epoch": 12.21, "grad_norm": 0.07363840192556381, "learning_rate": 7.791741472172353e-06, "loss": 0.012, "step": 6800 }, { "epoch": 12.39, "grad_norm": 0.0074967676773667336, "learning_rate": 7.6122082585278276e-06, "loss": 0.0297, "step": 6900 }, { "epoch": 12.57, "grad_norm": 0.03205731883645058, "learning_rate": 7.432675044883304e-06, "loss": 0.0172, "step": 7000 }, { "epoch": 12.75, "grad_norm": 0.0017446995479986072, "learning_rate": 7.25314183123878e-06, "loss": 0.0095, "step": 7100 }, { "epoch": 12.93, "grad_norm": 62.21705627441406, "learning_rate": 7.073608617594255e-06, "loss": 0.0216, "step": 7200 }, { "epoch": 13.11, "grad_norm": 0.0004949498688802123, "learning_rate": 6.894075403949732e-06, "loss": 0.0025, "step": 7300 }, { "epoch": 13.29, "grad_norm": 1.0483726263046265, "learning_rate": 6.714542190305207e-06, "loss": 0.0124, "step": 7400 }, { "epoch": 13.46, "grad_norm": 0.0011325060622766614, "learning_rate": 6.535008976660683e-06, "loss": 0.0176, "step": 7500 }, { "epoch": 13.64, "grad_norm": 0.0004886816022917628, "learning_rate": 6.355475763016159e-06, "loss": 0.0005, "step": 7600 }, { "epoch": 13.82, "grad_norm": 0.0002447882143314928, "learning_rate": 6.175942549371634e-06, "loss": 0.0, "step": 7700 }, { "epoch": 14.0, "grad_norm": 0.0033621052280068398, "learning_rate": 5.99640933572711e-06, "loss": 0.0151, "step": 7800 }, { "epoch": 14.18, "grad_norm": 0.37218067049980164, "learning_rate": 5.8168761220825854e-06, "loss": 0.0062, "step": 7900 }, { "epoch": 14.36, "grad_norm": 0.00017587828915566206, "learning_rate": 5.637342908438061e-06, "loss": 0.0004, "step": 8000 }, { "epoch": 14.54, "grad_norm": 0.00030748211429454386, "learning_rate": 5.457809694793538e-06, "loss": 0.0, "step": 8100 }, { "epoch": 14.72, "grad_norm": 0.0006639692583121359, "learning_rate": 5.278276481149013e-06, "loss": 0.0301, "step": 8200 }, { "epoch": 14.9, "grad_norm": 0.0006772920023649931, "learning_rate": 5.098743267504489e-06, "loss": 0.015, "step": 8300 }, { "epoch": 15.08, "grad_norm": 0.0017782676732167602, "learning_rate": 4.919210053859965e-06, "loss": 0.0184, "step": 8400 }, { "epoch": 15.26, "grad_norm": 0.0006229325663298368, "learning_rate": 4.739676840215441e-06, "loss": 0.0053, "step": 8500 }, { "epoch": 15.44, "grad_norm": 0.0011186526389792562, "learning_rate": 4.560143626570916e-06, "loss": 0.0076, "step": 8600 }, { "epoch": 15.62, "grad_norm": 0.0009836511453613639, "learning_rate": 4.380610412926392e-06, "loss": 0.0054, "step": 8700 }, { "epoch": 15.8, "grad_norm": 0.002876699436455965, "learning_rate": 4.2010771992818675e-06, "loss": 0.0011, "step": 8800 }, { "epoch": 15.98, "grad_norm": 0.0004506981058511883, "learning_rate": 4.021543985637343e-06, "loss": 0.0142, "step": 8900 }, { "epoch": 16.16, "grad_norm": 143.88809204101562, "learning_rate": 3.842010771992819e-06, "loss": 0.0136, "step": 9000 }, { "epoch": 16.34, "grad_norm": 0.0012065657647326589, "learning_rate": 3.6624775583482946e-06, "loss": 0.0025, "step": 9100 }, { "epoch": 16.52, "grad_norm": 0.0008388046990148723, "learning_rate": 3.48294434470377e-06, "loss": 0.0054, "step": 9200 }, { "epoch": 16.7, "grad_norm": 0.0005097670364193618, "learning_rate": 3.3034111310592464e-06, "loss": 0.0, "step": 9300 }, { "epoch": 16.88, "grad_norm": 0.00020445660629775375, "learning_rate": 3.1238779174147223e-06, "loss": 0.0019, "step": 9400 }, { "epoch": 17.06, "grad_norm": 0.00033040347625501454, "learning_rate": 2.9443447037701977e-06, "loss": 0.002, "step": 9500 }, { "epoch": 17.24, "grad_norm": 0.00017785438103601336, "learning_rate": 2.7648114901256736e-06, "loss": 0.0011, "step": 9600 }, { "epoch": 17.41, "grad_norm": 0.0003500489692669362, "learning_rate": 2.585278276481149e-06, "loss": 0.023, "step": 9700 }, { "epoch": 17.59, "grad_norm": 0.0002393827890045941, "learning_rate": 2.405745062836625e-06, "loss": 0.0001, "step": 9800 }, { "epoch": 17.77, "grad_norm": 0.00016358286666218191, "learning_rate": 2.2262118491921008e-06, "loss": 0.0017, "step": 9900 }, { "epoch": 17.95, "grad_norm": 0.0005156854167580605, "learning_rate": 2.0466786355475767e-06, "loss": 0.0022, "step": 10000 }, { "epoch": 18.13, "grad_norm": 0.0001992402976611629, "learning_rate": 1.8671454219030521e-06, "loss": 0.006, "step": 10100 }, { "epoch": 18.31, "grad_norm": 0.00023926289577502757, "learning_rate": 1.687612208258528e-06, "loss": 0.0026, "step": 10200 }, { "epoch": 18.49, "grad_norm": 0.00018842410645447671, "learning_rate": 1.5080789946140036e-06, "loss": 0.0063, "step": 10300 }, { "epoch": 18.67, "grad_norm": 0.00017712044063955545, "learning_rate": 1.3285457809694793e-06, "loss": 0.0, "step": 10400 }, { "epoch": 18.85, "grad_norm": 0.00030714995227754116, "learning_rate": 1.1490125673249552e-06, "loss": 0.0, "step": 10500 }, { "epoch": 19.03, "grad_norm": 0.0004120106459595263, "learning_rate": 9.694793536804308e-07, "loss": 0.0041, "step": 10600 }, { "epoch": 19.21, "grad_norm": 0.00010405273496871814, "learning_rate": 7.899461400359067e-07, "loss": 0.0, "step": 10700 }, { "epoch": 19.39, "grad_norm": 9.223635424859822e-05, "learning_rate": 6.104129263913825e-07, "loss": 0.0001, "step": 10800 }, { "epoch": 19.57, "grad_norm": 0.00015765101124998182, "learning_rate": 4.3087971274685824e-07, "loss": 0.0042, "step": 10900 }, { "epoch": 19.75, "grad_norm": 0.00016278728435281664, "learning_rate": 2.5134649910233396e-07, "loss": 0.0003, "step": 11000 }, { "epoch": 19.93, "grad_norm": 0.00011305743100820109, "learning_rate": 7.18132854578097e-08, "loss": 0.0, "step": 11100 } ], "logging_steps": 100, "max_steps": 11140, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 1.1371892211152928e+17, "train_batch_size": 14, "trial_name": null, "trial_params": null }