diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,181123 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 22635, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001325381047051027, + "grad_norm": 250.43743896484375, + "learning_rate": 5.000000000000001e-07, + "loss": 1.0252, + "num_input_tokens_seen": 2128, + "step": 1 + }, + { + "epoch": 0.0002650762094102054, + "grad_norm": 174.06369018554688, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.0459, + "num_input_tokens_seen": 4048, + "step": 2 + }, + { + "epoch": 0.00039761431411530816, + "grad_norm": 273.9053955078125, + "learning_rate": 1.5e-06, + "loss": 1.4319, + "num_input_tokens_seen": 6248, + "step": 3 + }, + { + "epoch": 0.0005301524188204108, + "grad_norm": 34.939170837402344, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.1974, + "num_input_tokens_seen": 7776, + "step": 4 + }, + { + "epoch": 0.0006626905235255136, + "grad_norm": 82.0633544921875, + "learning_rate": 2.5e-06, + "loss": 0.5028, + "num_input_tokens_seen": 9320, + "step": 5 + }, + { + "epoch": 0.0007952286282306163, + "grad_norm": 96.41106414794922, + "learning_rate": 3e-06, + "loss": 1.3664, + "num_input_tokens_seen": 11160, + "step": 6 + }, + { + "epoch": 0.0009277667329357191, + "grad_norm": 112.13668060302734, + "learning_rate": 3.5e-06, + "loss": 1.5008, + "num_input_tokens_seen": 12952, + "step": 7 + }, + { + "epoch": 0.0010603048376408217, + "grad_norm": 66.35326385498047, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3062, + "num_input_tokens_seen": 15400, + "step": 8 + }, + { + "epoch": 0.0011928429423459245, + "grad_norm": 19.309053421020508, + "learning_rate": 4.5e-06, + "loss": 0.2608, + "num_input_tokens_seen": 17040, + "step": 9 + }, + { + "epoch": 0.0013253810470510272, + "grad_norm": 33.11587905883789, + "learning_rate": 5e-06, + "loss": 0.2822, + "num_input_tokens_seen": 19048, + "step": 10 + }, + { + "epoch": 0.0014579191517561298, + "grad_norm": 32.90431213378906, + "learning_rate": 4.999999975899138e-06, + "loss": 0.0864, + "num_input_tokens_seen": 20840, + "step": 11 + }, + { + "epoch": 0.0015904572564612327, + "grad_norm": 0.9036729335784912, + "learning_rate": 4.9999999035965515e-06, + "loss": 0.0012, + "num_input_tokens_seen": 23096, + "step": 12 + }, + { + "epoch": 0.0017229953611663353, + "grad_norm": 0.010966886766254902, + "learning_rate": 4.999999783092242e-06, + "loss": 0.0, + "num_input_tokens_seen": 24496, + "step": 13 + }, + { + "epoch": 0.0018555334658714381, + "grad_norm": 62.49030685424805, + "learning_rate": 4.999999614386212e-06, + "loss": 1.451, + "num_input_tokens_seen": 26248, + "step": 14 + }, + { + "epoch": 0.0019880715705765406, + "grad_norm": 30.907695770263672, + "learning_rate": 4.999999397478466e-06, + "loss": 0.7342, + "num_input_tokens_seen": 29496, + "step": 15 + }, + { + "epoch": 0.0021206096752816434, + "grad_norm": 0.008592139929533005, + "learning_rate": 4.999999132369006e-06, + "loss": 0.0, + "num_input_tokens_seen": 31216, + "step": 16 + }, + { + "epoch": 0.0022531477799867462, + "grad_norm": 40.367591857910156, + "learning_rate": 4.9999988190578376e-06, + "loss": 1.1096, + "num_input_tokens_seen": 34480, + "step": 17 + }, + { + "epoch": 0.002385685884691849, + "grad_norm": 121.25069427490234, + "learning_rate": 4.999998457544968e-06, + "loss": 0.761, + "num_input_tokens_seen": 35928, + "step": 18 + }, + { + "epoch": 0.0025182239893969515, + "grad_norm": 0.20059402287006378, + "learning_rate": 4.999998047830402e-06, + "loss": 0.0005, + "num_input_tokens_seen": 37392, + "step": 19 + }, + { + "epoch": 0.0026507620941020544, + "grad_norm": 30.98487663269043, + "learning_rate": 4.999997589914151e-06, + "loss": 0.6026, + "num_input_tokens_seen": 39128, + "step": 20 + }, + { + "epoch": 0.002783300198807157, + "grad_norm": 33.51759338378906, + "learning_rate": 4.999997083796221e-06, + "loss": 0.4553, + "num_input_tokens_seen": 41656, + "step": 21 + }, + { + "epoch": 0.0029158383035122596, + "grad_norm": 19.42667007446289, + "learning_rate": 4.999996529476622e-06, + "loss": 0.0542, + "num_input_tokens_seen": 42992, + "step": 22 + }, + { + "epoch": 0.0030483764082173625, + "grad_norm": 17.662841796875, + "learning_rate": 4.999995926955366e-06, + "loss": 0.2276, + "num_input_tokens_seen": 45224, + "step": 23 + }, + { + "epoch": 0.0031809145129224653, + "grad_norm": 6.467265605926514, + "learning_rate": 4.999995276232463e-06, + "loss": 0.151, + "num_input_tokens_seen": 47048, + "step": 24 + }, + { + "epoch": 0.0033134526176275677, + "grad_norm": 41.72969436645508, + "learning_rate": 4.9999945773079275e-06, + "loss": 0.841, + "num_input_tokens_seen": 49592, + "step": 25 + }, + { + "epoch": 0.0034459907223326706, + "grad_norm": 4.11884880065918, + "learning_rate": 4.999993830181772e-06, + "loss": 0.0121, + "num_input_tokens_seen": 51344, + "step": 26 + }, + { + "epoch": 0.0035785288270377734, + "grad_norm": 1.7135484218597412, + "learning_rate": 4.99999303485401e-06, + "loss": 0.005, + "num_input_tokens_seen": 53536, + "step": 27 + }, + { + "epoch": 0.0037110669317428763, + "grad_norm": 23.234941482543945, + "learning_rate": 4.999992191324657e-06, + "loss": 0.5664, + "num_input_tokens_seen": 55264, + "step": 28 + }, + { + "epoch": 0.0038436050364479787, + "grad_norm": 0.08144200593233109, + "learning_rate": 4.9999912995937314e-06, + "loss": 0.0003, + "num_input_tokens_seen": 56656, + "step": 29 + }, + { + "epoch": 0.003976143141153081, + "grad_norm": 31.365327835083008, + "learning_rate": 4.999990359661249e-06, + "loss": 1.0078, + "num_input_tokens_seen": 58944, + "step": 30 + }, + { + "epoch": 0.004108681245858184, + "grad_norm": 52.97278594970703, + "learning_rate": 4.999989371527226e-06, + "loss": 0.9884, + "num_input_tokens_seen": 60536, + "step": 31 + }, + { + "epoch": 0.004241219350563287, + "grad_norm": 0.017972925677895546, + "learning_rate": 4.9999883351916845e-06, + "loss": 0.0001, + "num_input_tokens_seen": 63072, + "step": 32 + }, + { + "epoch": 0.00437375745526839, + "grad_norm": 19.20778465270996, + "learning_rate": 4.9999872506546435e-06, + "loss": 0.4417, + "num_input_tokens_seen": 64648, + "step": 33 + }, + { + "epoch": 0.0045062955599734925, + "grad_norm": 19.955821990966797, + "learning_rate": 4.999986117916123e-06, + "loss": 0.5826, + "num_input_tokens_seen": 67136, + "step": 34 + }, + { + "epoch": 0.004638833664678595, + "grad_norm": 0.09148523211479187, + "learning_rate": 4.999984936976145e-06, + "loss": 0.0003, + "num_input_tokens_seen": 69128, + "step": 35 + }, + { + "epoch": 0.004771371769383698, + "grad_norm": 5.537275314331055, + "learning_rate": 4.999983707834735e-06, + "loss": 0.1306, + "num_input_tokens_seen": 70528, + "step": 36 + }, + { + "epoch": 0.0049039098740888, + "grad_norm": 17.37434959411621, + "learning_rate": 4.999982430491912e-06, + "loss": 0.479, + "num_input_tokens_seen": 73696, + "step": 37 + }, + { + "epoch": 0.005036447978793903, + "grad_norm": 29.15047264099121, + "learning_rate": 4.999981104947704e-06, + "loss": 0.4218, + "num_input_tokens_seen": 75456, + "step": 38 + }, + { + "epoch": 0.005168986083499006, + "grad_norm": 6.122926235198975, + "learning_rate": 4.999979731202136e-06, + "loss": 0.0178, + "num_input_tokens_seen": 77288, + "step": 39 + }, + { + "epoch": 0.005301524188204109, + "grad_norm": 28.305675506591797, + "learning_rate": 4.9999783092552334e-06, + "loss": 0.5839, + "num_input_tokens_seen": 79128, + "step": 40 + }, + { + "epoch": 0.0054340622929092116, + "grad_norm": 28.465259552001953, + "learning_rate": 4.999976839107025e-06, + "loss": 0.3275, + "num_input_tokens_seen": 80504, + "step": 41 + }, + { + "epoch": 0.005566600397614314, + "grad_norm": 12.967247009277344, + "learning_rate": 4.999975320757539e-06, + "loss": 0.1255, + "num_input_tokens_seen": 82496, + "step": 42 + }, + { + "epoch": 0.005699138502319417, + "grad_norm": 13.58895206451416, + "learning_rate": 4.999973754206803e-06, + "loss": 0.0443, + "num_input_tokens_seen": 84048, + "step": 43 + }, + { + "epoch": 0.005831676607024519, + "grad_norm": 8.829529762268066, + "learning_rate": 4.9999721394548485e-06, + "loss": 0.1663, + "num_input_tokens_seen": 86064, + "step": 44 + }, + { + "epoch": 0.005964214711729622, + "grad_norm": 16.731121063232422, + "learning_rate": 4.999970476501707e-06, + "loss": 0.173, + "num_input_tokens_seen": 88632, + "step": 45 + }, + { + "epoch": 0.006096752816434725, + "grad_norm": 6.858058452606201, + "learning_rate": 4.99996876534741e-06, + "loss": 0.0222, + "num_input_tokens_seen": 91328, + "step": 46 + }, + { + "epoch": 0.006229290921139828, + "grad_norm": 26.11011505126953, + "learning_rate": 4.99996700599199e-06, + "loss": 0.2834, + "num_input_tokens_seen": 93624, + "step": 47 + }, + { + "epoch": 0.006361829025844931, + "grad_norm": 1.8604565858840942, + "learning_rate": 4.999965198435482e-06, + "loss": 0.0058, + "num_input_tokens_seen": 95360, + "step": 48 + }, + { + "epoch": 0.0064943671305500335, + "grad_norm": 52.78355407714844, + "learning_rate": 4.99996334267792e-06, + "loss": 0.8988, + "num_input_tokens_seen": 97032, + "step": 49 + }, + { + "epoch": 0.0066269052352551355, + "grad_norm": 17.50345230102539, + "learning_rate": 4.999961438719341e-06, + "loss": 0.3259, + "num_input_tokens_seen": 98816, + "step": 50 + }, + { + "epoch": 0.006759443339960238, + "grad_norm": 21.572919845581055, + "learning_rate": 4.99995948655978e-06, + "loss": 0.2656, + "num_input_tokens_seen": 100792, + "step": 51 + }, + { + "epoch": 0.006891981444665341, + "grad_norm": 0.284165620803833, + "learning_rate": 4.999957486199276e-06, + "loss": 0.001, + "num_input_tokens_seen": 103272, + "step": 52 + }, + { + "epoch": 0.007024519549370444, + "grad_norm": 7.135979175567627, + "learning_rate": 4.999955437637867e-06, + "loss": 0.136, + "num_input_tokens_seen": 104672, + "step": 53 + }, + { + "epoch": 0.007157057654075547, + "grad_norm": 27.35825538635254, + "learning_rate": 4.999953340875592e-06, + "loss": 0.6583, + "num_input_tokens_seen": 106728, + "step": 54 + }, + { + "epoch": 0.00728959575878065, + "grad_norm": 17.128623962402344, + "learning_rate": 4.999951195912492e-06, + "loss": 0.3751, + "num_input_tokens_seen": 108408, + "step": 55 + }, + { + "epoch": 0.0074221338634857525, + "grad_norm": 1.3629236221313477, + "learning_rate": 4.999949002748608e-06, + "loss": 0.0046, + "num_input_tokens_seen": 111088, + "step": 56 + }, + { + "epoch": 0.0075546719681908545, + "grad_norm": 1.013257622718811, + "learning_rate": 4.999946761383984e-06, + "loss": 0.0035, + "num_input_tokens_seen": 112792, + "step": 57 + }, + { + "epoch": 0.007687210072895957, + "grad_norm": 12.58083438873291, + "learning_rate": 4.999944471818661e-06, + "loss": 0.2518, + "num_input_tokens_seen": 114872, + "step": 58 + }, + { + "epoch": 0.00781974817760106, + "grad_norm": 0.324655145406723, + "learning_rate": 4.999942134052684e-06, + "loss": 0.0011, + "num_input_tokens_seen": 116808, + "step": 59 + }, + { + "epoch": 0.007952286282306162, + "grad_norm": 0.15079373121261597, + "learning_rate": 4.9999397480860975e-06, + "loss": 0.0005, + "num_input_tokens_seen": 118824, + "step": 60 + }, + { + "epoch": 0.008084824387011266, + "grad_norm": 0.07261475175619125, + "learning_rate": 4.999937313918949e-06, + "loss": 0.0002, + "num_input_tokens_seen": 119800, + "step": 61 + }, + { + "epoch": 0.008217362491716368, + "grad_norm": 24.96176528930664, + "learning_rate": 4.999934831551284e-06, + "loss": 0.6587, + "num_input_tokens_seen": 122024, + "step": 62 + }, + { + "epoch": 0.008349900596421472, + "grad_norm": 0.033747754991054535, + "learning_rate": 4.99993230098315e-06, + "loss": 0.0001, + "num_input_tokens_seen": 123976, + "step": 63 + }, + { + "epoch": 0.008482438701126574, + "grad_norm": 31.08684730529785, + "learning_rate": 4.9999297222145986e-06, + "loss": 0.5857, + "num_input_tokens_seen": 125992, + "step": 64 + }, + { + "epoch": 0.008614976805831677, + "grad_norm": 35.08548355102539, + "learning_rate": 4.999927095245676e-06, + "loss": 0.7041, + "num_input_tokens_seen": 127976, + "step": 65 + }, + { + "epoch": 0.00874751491053678, + "grad_norm": 6.023736953735352, + "learning_rate": 4.999924420076434e-06, + "loss": 0.1227, + "num_input_tokens_seen": 130272, + "step": 66 + }, + { + "epoch": 0.008880053015241881, + "grad_norm": 0.04503978043794632, + "learning_rate": 4.999921696706925e-06, + "loss": 0.0001, + "num_input_tokens_seen": 131408, + "step": 67 + }, + { + "epoch": 0.009012591119946985, + "grad_norm": 49.56775665283203, + "learning_rate": 4.9999189251372025e-06, + "loss": 0.727, + "num_input_tokens_seen": 133760, + "step": 68 + }, + { + "epoch": 0.009145129224652087, + "grad_norm": 31.22989273071289, + "learning_rate": 4.999916105367317e-06, + "loss": 0.5476, + "num_input_tokens_seen": 135416, + "step": 69 + }, + { + "epoch": 0.00927766732935719, + "grad_norm": 0.1987680345773697, + "learning_rate": 4.999913237397324e-06, + "loss": 0.0007, + "num_input_tokens_seen": 137448, + "step": 70 + }, + { + "epoch": 0.009410205434062293, + "grad_norm": 25.454242706298828, + "learning_rate": 4.999910321227281e-06, + "loss": 0.6887, + "num_input_tokens_seen": 138704, + "step": 71 + }, + { + "epoch": 0.009542743538767396, + "grad_norm": 24.782148361206055, + "learning_rate": 4.999907356857241e-06, + "loss": 0.3743, + "num_input_tokens_seen": 140512, + "step": 72 + }, + { + "epoch": 0.009675281643472498, + "grad_norm": 29.465654373168945, + "learning_rate": 4.999904344287263e-06, + "loss": 1.2092, + "num_input_tokens_seen": 143656, + "step": 73 + }, + { + "epoch": 0.0098078197481776, + "grad_norm": 64.79688262939453, + "learning_rate": 4.999901283517404e-06, + "loss": 0.7532, + "num_input_tokens_seen": 146464, + "step": 74 + }, + { + "epoch": 0.009940357852882704, + "grad_norm": 15.676737785339355, + "learning_rate": 4.999898174547725e-06, + "loss": 0.4197, + "num_input_tokens_seen": 148800, + "step": 75 + }, + { + "epoch": 0.010072895957587806, + "grad_norm": 63.59064865112305, + "learning_rate": 4.9998950173782834e-06, + "loss": 1.5147, + "num_input_tokens_seen": 152160, + "step": 76 + }, + { + "epoch": 0.01020543406229291, + "grad_norm": 4.737969875335693, + "learning_rate": 4.999891812009143e-06, + "loss": 0.0752, + "num_input_tokens_seen": 153832, + "step": 77 + }, + { + "epoch": 0.010337972166998012, + "grad_norm": 6.161489009857178, + "learning_rate": 4.999888558440362e-06, + "loss": 0.0232, + "num_input_tokens_seen": 154888, + "step": 78 + }, + { + "epoch": 0.010470510271703115, + "grad_norm": 8.917037010192871, + "learning_rate": 4.9998852566720055e-06, + "loss": 0.1196, + "num_input_tokens_seen": 156528, + "step": 79 + }, + { + "epoch": 0.010603048376408217, + "grad_norm": 11.00399112701416, + "learning_rate": 4.999881906704137e-06, + "loss": 0.2367, + "num_input_tokens_seen": 158136, + "step": 80 + }, + { + "epoch": 0.01073558648111332, + "grad_norm": 5.292150020599365, + "learning_rate": 4.99987850853682e-06, + "loss": 0.0198, + "num_input_tokens_seen": 159440, + "step": 81 + }, + { + "epoch": 0.010868124585818423, + "grad_norm": 4.829105854034424, + "learning_rate": 4.999875062170121e-06, + "loss": 0.0183, + "num_input_tokens_seen": 160840, + "step": 82 + }, + { + "epoch": 0.011000662690523525, + "grad_norm": 3.056729793548584, + "learning_rate": 4.999871567604107e-06, + "loss": 0.0109, + "num_input_tokens_seen": 162240, + "step": 83 + }, + { + "epoch": 0.011133200795228629, + "grad_norm": 19.688913345336914, + "learning_rate": 4.999868024838844e-06, + "loss": 0.2948, + "num_input_tokens_seen": 164328, + "step": 84 + }, + { + "epoch": 0.01126573889993373, + "grad_norm": 33.16144561767578, + "learning_rate": 4.999864433874401e-06, + "loss": 0.584, + "num_input_tokens_seen": 166216, + "step": 85 + }, + { + "epoch": 0.011398277004638835, + "grad_norm": 10.677643775939941, + "learning_rate": 4.999860794710847e-06, + "loss": 0.1286, + "num_input_tokens_seen": 168280, + "step": 86 + }, + { + "epoch": 0.011530815109343936, + "grad_norm": 0.293518990278244, + "learning_rate": 4.999857107348253e-06, + "loss": 0.0011, + "num_input_tokens_seen": 170024, + "step": 87 + }, + { + "epoch": 0.011663353214049038, + "grad_norm": 23.888822555541992, + "learning_rate": 4.999853371786689e-06, + "loss": 0.74, + "num_input_tokens_seen": 172424, + "step": 88 + }, + { + "epoch": 0.011795891318754142, + "grad_norm": 0.42366430163383484, + "learning_rate": 4.999849588026227e-06, + "loss": 0.0016, + "num_input_tokens_seen": 175000, + "step": 89 + }, + { + "epoch": 0.011928429423459244, + "grad_norm": 33.93714904785156, + "learning_rate": 4.999845756066941e-06, + "loss": 0.9005, + "num_input_tokens_seen": 177248, + "step": 90 + }, + { + "epoch": 0.012060967528164348, + "grad_norm": 33.67679214477539, + "learning_rate": 4.999841875908904e-06, + "loss": 0.5072, + "num_input_tokens_seen": 178768, + "step": 91 + }, + { + "epoch": 0.01219350563286945, + "grad_norm": 21.549104690551758, + "learning_rate": 4.9998379475521915e-06, + "loss": 0.4235, + "num_input_tokens_seen": 180760, + "step": 92 + }, + { + "epoch": 0.012326043737574552, + "grad_norm": 26.029510498046875, + "learning_rate": 4.999833970996878e-06, + "loss": 0.6335, + "num_input_tokens_seen": 182864, + "step": 93 + }, + { + "epoch": 0.012458581842279656, + "grad_norm": 0.6470825672149658, + "learning_rate": 4.999829946243042e-06, + "loss": 0.0024, + "num_input_tokens_seen": 184424, + "step": 94 + }, + { + "epoch": 0.012591119946984758, + "grad_norm": 13.286639213562012, + "learning_rate": 4.999825873290759e-06, + "loss": 0.2366, + "num_input_tokens_seen": 186560, + "step": 95 + }, + { + "epoch": 0.012723658051689861, + "grad_norm": 22.230331420898438, + "learning_rate": 4.9998217521401095e-06, + "loss": 0.3539, + "num_input_tokens_seen": 188384, + "step": 96 + }, + { + "epoch": 0.012856196156394963, + "grad_norm": 17.20349884033203, + "learning_rate": 4.999817582791171e-06, + "loss": 0.4303, + "num_input_tokens_seen": 190808, + "step": 97 + }, + { + "epoch": 0.012988734261100067, + "grad_norm": 1.6089740991592407, + "learning_rate": 4.999813365244026e-06, + "loss": 0.0059, + "num_input_tokens_seen": 192480, + "step": 98 + }, + { + "epoch": 0.013121272365805169, + "grad_norm": 1.8107287883758545, + "learning_rate": 4.999809099498755e-06, + "loss": 0.0067, + "num_input_tokens_seen": 194128, + "step": 99 + }, + { + "epoch": 0.013253810470510271, + "grad_norm": 13.169525146484375, + "learning_rate": 4.999804785555439e-06, + "loss": 0.2379, + "num_input_tokens_seen": 195728, + "step": 100 + }, + { + "epoch": 0.013386348575215375, + "grad_norm": 0.6430429220199585, + "learning_rate": 4.999800423414163e-06, + "loss": 0.0023, + "num_input_tokens_seen": 197120, + "step": 101 + }, + { + "epoch": 0.013518886679920477, + "grad_norm": 20.317203521728516, + "learning_rate": 4.99979601307501e-06, + "loss": 0.4052, + "num_input_tokens_seen": 199064, + "step": 102 + }, + { + "epoch": 0.01365142478462558, + "grad_norm": 14.788737297058105, + "learning_rate": 4.999791554538065e-06, + "loss": 0.3322, + "num_input_tokens_seen": 200560, + "step": 103 + }, + { + "epoch": 0.013783962889330682, + "grad_norm": 0.3029716908931732, + "learning_rate": 4.999787047803415e-06, + "loss": 0.0011, + "num_input_tokens_seen": 203784, + "step": 104 + }, + { + "epoch": 0.013916500994035786, + "grad_norm": 1.024711012840271, + "learning_rate": 4.999782492871146e-06, + "loss": 0.0034, + "num_input_tokens_seen": 205776, + "step": 105 + }, + { + "epoch": 0.014049039098740888, + "grad_norm": 18.154651641845703, + "learning_rate": 4.999777889741345e-06, + "loss": 0.2853, + "num_input_tokens_seen": 208688, + "step": 106 + }, + { + "epoch": 0.01418157720344599, + "grad_norm": 9.336213111877441, + "learning_rate": 4.999773238414103e-06, + "loss": 0.2062, + "num_input_tokens_seen": 210408, + "step": 107 + }, + { + "epoch": 0.014314115308151094, + "grad_norm": 111.06527709960938, + "learning_rate": 4.999768538889508e-06, + "loss": 2.0096, + "num_input_tokens_seen": 213224, + "step": 108 + }, + { + "epoch": 0.014446653412856196, + "grad_norm": 0.13847197592258453, + "learning_rate": 4.999763791167651e-06, + "loss": 0.0004, + "num_input_tokens_seen": 214760, + "step": 109 + }, + { + "epoch": 0.0145791915175613, + "grad_norm": 10.839756965637207, + "learning_rate": 4.999758995248624e-06, + "loss": 0.2364, + "num_input_tokens_seen": 216176, + "step": 110 + }, + { + "epoch": 0.014711729622266401, + "grad_norm": 6.870694160461426, + "learning_rate": 4.999754151132519e-06, + "loss": 0.1528, + "num_input_tokens_seen": 217600, + "step": 111 + }, + { + "epoch": 0.014844267726971505, + "grad_norm": 20.930335998535156, + "learning_rate": 4.999749258819429e-06, + "loss": 0.663, + "num_input_tokens_seen": 220640, + "step": 112 + }, + { + "epoch": 0.014976805831676607, + "grad_norm": 24.177223205566406, + "learning_rate": 4.99974431830945e-06, + "loss": 0.4515, + "num_input_tokens_seen": 222616, + "step": 113 + }, + { + "epoch": 0.015109343936381709, + "grad_norm": 18.11847686767578, + "learning_rate": 4.999739329602675e-06, + "loss": 0.278, + "num_input_tokens_seen": 225032, + "step": 114 + }, + { + "epoch": 0.015241882041086813, + "grad_norm": 14.463335990905762, + "learning_rate": 4.999734292699201e-06, + "loss": 0.3191, + "num_input_tokens_seen": 226712, + "step": 115 + }, + { + "epoch": 0.015374420145791915, + "grad_norm": 0.5931599140167236, + "learning_rate": 4.999729207599126e-06, + "loss": 0.002, + "num_input_tokens_seen": 228776, + "step": 116 + }, + { + "epoch": 0.015506958250497018, + "grad_norm": 20.722591400146484, + "learning_rate": 4.999724074302548e-06, + "loss": 0.3065, + "num_input_tokens_seen": 230560, + "step": 117 + }, + { + "epoch": 0.01563949635520212, + "grad_norm": 2.0402841567993164, + "learning_rate": 4.9997188928095645e-06, + "loss": 0.0073, + "num_input_tokens_seen": 232688, + "step": 118 + }, + { + "epoch": 0.015772034459907224, + "grad_norm": 22.575775146484375, + "learning_rate": 4.999713663120276e-06, + "loss": 0.6893, + "num_input_tokens_seen": 234456, + "step": 119 + }, + { + "epoch": 0.015904572564612324, + "grad_norm": 1.2449815273284912, + "learning_rate": 4.999708385234784e-06, + "loss": 0.0044, + "num_input_tokens_seen": 236176, + "step": 120 + }, + { + "epoch": 0.016037110669317428, + "grad_norm": 0.7653734087944031, + "learning_rate": 4.99970305915319e-06, + "loss": 0.0027, + "num_input_tokens_seen": 237824, + "step": 121 + }, + { + "epoch": 0.016169648774022532, + "grad_norm": 0.6011762022972107, + "learning_rate": 4.999697684875596e-06, + "loss": 0.0021, + "num_input_tokens_seen": 240552, + "step": 122 + }, + { + "epoch": 0.016302186878727636, + "grad_norm": 0.4856562912464142, + "learning_rate": 4.9996922624021076e-06, + "loss": 0.0017, + "num_input_tokens_seen": 242560, + "step": 123 + }, + { + "epoch": 0.016434724983432736, + "grad_norm": 0.09074526280164719, + "learning_rate": 4.9996867917328264e-06, + "loss": 0.0003, + "num_input_tokens_seen": 245040, + "step": 124 + }, + { + "epoch": 0.01656726308813784, + "grad_norm": 0.14254754781723022, + "learning_rate": 4.99968127286786e-06, + "loss": 0.0005, + "num_input_tokens_seen": 247464, + "step": 125 + }, + { + "epoch": 0.016699801192842943, + "grad_norm": 21.311914443969727, + "learning_rate": 4.999675705807314e-06, + "loss": 0.54, + "num_input_tokens_seen": 249168, + "step": 126 + }, + { + "epoch": 0.016832339297548043, + "grad_norm": 20.326438903808594, + "learning_rate": 4.999670090551296e-06, + "loss": 0.4356, + "num_input_tokens_seen": 251232, + "step": 127 + }, + { + "epoch": 0.016964877402253147, + "grad_norm": 0.01491043996065855, + "learning_rate": 4.999664427099914e-06, + "loss": 0.0001, + "num_input_tokens_seen": 253296, + "step": 128 + }, + { + "epoch": 0.01709741550695825, + "grad_norm": 23.955692291259766, + "learning_rate": 4.999658715453279e-06, + "loss": 0.5625, + "num_input_tokens_seen": 255376, + "step": 129 + }, + { + "epoch": 0.017229953611663355, + "grad_norm": 0.004617680795490742, + "learning_rate": 4.999652955611497e-06, + "loss": 0.0, + "num_input_tokens_seen": 257456, + "step": 130 + }, + { + "epoch": 0.017362491716368455, + "grad_norm": 18.199174880981445, + "learning_rate": 4.9996471475746835e-06, + "loss": 0.4444, + "num_input_tokens_seen": 259528, + "step": 131 + }, + { + "epoch": 0.01749502982107356, + "grad_norm": 7.164508819580078, + "learning_rate": 4.999641291342948e-06, + "loss": 0.1856, + "num_input_tokens_seen": 261320, + "step": 132 + }, + { + "epoch": 0.017627567925778662, + "grad_norm": 28.76552963256836, + "learning_rate": 4.999635386916404e-06, + "loss": 1.0287, + "num_input_tokens_seen": 263936, + "step": 133 + }, + { + "epoch": 0.017760106030483763, + "grad_norm": 0.014427479356527328, + "learning_rate": 4.999629434295165e-06, + "loss": 0.0001, + "num_input_tokens_seen": 266784, + "step": 134 + }, + { + "epoch": 0.017892644135188866, + "grad_norm": 15.610347747802734, + "learning_rate": 4.999623433479346e-06, + "loss": 0.4054, + "num_input_tokens_seen": 268744, + "step": 135 + }, + { + "epoch": 0.01802518223989397, + "grad_norm": 0.03668922930955887, + "learning_rate": 4.999617384469063e-06, + "loss": 0.0001, + "num_input_tokens_seen": 270464, + "step": 136 + }, + { + "epoch": 0.018157720344599074, + "grad_norm": 45.97502517700195, + "learning_rate": 4.999611287264433e-06, + "loss": 0.566, + "num_input_tokens_seen": 271928, + "step": 137 + }, + { + "epoch": 0.018290258449304174, + "grad_norm": 5.405684471130371, + "learning_rate": 4.999605141865573e-06, + "loss": 0.0962, + "num_input_tokens_seen": 273864, + "step": 138 + }, + { + "epoch": 0.018422796554009278, + "grad_norm": 20.897493362426758, + "learning_rate": 4.999598948272601e-06, + "loss": 0.3346, + "num_input_tokens_seen": 275368, + "step": 139 + }, + { + "epoch": 0.01855533465871438, + "grad_norm": 11.520383834838867, + "learning_rate": 4.999592706485636e-06, + "loss": 0.2364, + "num_input_tokens_seen": 277304, + "step": 140 + }, + { + "epoch": 0.01868787276341948, + "grad_norm": 5.902313709259033, + "learning_rate": 4.999586416504801e-06, + "loss": 0.0918, + "num_input_tokens_seen": 278904, + "step": 141 + }, + { + "epoch": 0.018820410868124585, + "grad_norm": 16.073192596435547, + "learning_rate": 4.999580078330215e-06, + "loss": 0.1683, + "num_input_tokens_seen": 280552, + "step": 142 + }, + { + "epoch": 0.01895294897282969, + "grad_norm": 1.160386085510254, + "learning_rate": 4.999573691962001e-06, + "loss": 0.0047, + "num_input_tokens_seen": 282752, + "step": 143 + }, + { + "epoch": 0.019085487077534793, + "grad_norm": 2.3690803050994873, + "learning_rate": 4.99956725740028e-06, + "loss": 0.0096, + "num_input_tokens_seen": 285584, + "step": 144 + }, + { + "epoch": 0.019218025182239893, + "grad_norm": 6.115748882293701, + "learning_rate": 4.99956077464518e-06, + "loss": 0.0868, + "num_input_tokens_seen": 286832, + "step": 145 + }, + { + "epoch": 0.019350563286944997, + "grad_norm": 1.332751750946045, + "learning_rate": 4.999554243696822e-06, + "loss": 0.0054, + "num_input_tokens_seen": 288816, + "step": 146 + }, + { + "epoch": 0.0194831013916501, + "grad_norm": 18.80128288269043, + "learning_rate": 4.999547664555335e-06, + "loss": 0.4129, + "num_input_tokens_seen": 290592, + "step": 147 + }, + { + "epoch": 0.0196156394963552, + "grad_norm": 16.31783103942871, + "learning_rate": 4.999541037220845e-06, + "loss": 0.879, + "num_input_tokens_seen": 293464, + "step": 148 + }, + { + "epoch": 0.019748177601060304, + "grad_norm": 14.406692504882812, + "learning_rate": 4.999534361693479e-06, + "loss": 0.3031, + "num_input_tokens_seen": 295664, + "step": 149 + }, + { + "epoch": 0.019880715705765408, + "grad_norm": 3.517805337905884, + "learning_rate": 4.999527637973367e-06, + "loss": 0.0144, + "num_input_tokens_seen": 298008, + "step": 150 + }, + { + "epoch": 0.020013253810470512, + "grad_norm": 23.954038619995117, + "learning_rate": 4.999520866060637e-06, + "loss": 0.6077, + "num_input_tokens_seen": 301040, + "step": 151 + }, + { + "epoch": 0.020145791915175612, + "grad_norm": 17.75224494934082, + "learning_rate": 4.99951404595542e-06, + "loss": 0.4108, + "num_input_tokens_seen": 303840, + "step": 152 + }, + { + "epoch": 0.020278330019880716, + "grad_norm": 18.609392166137695, + "learning_rate": 4.999507177657848e-06, + "loss": 0.1972, + "num_input_tokens_seen": 306024, + "step": 153 + }, + { + "epoch": 0.02041086812458582, + "grad_norm": 20.8830509185791, + "learning_rate": 4.999500261168054e-06, + "loss": 0.6959, + "num_input_tokens_seen": 308224, + "step": 154 + }, + { + "epoch": 0.02054340622929092, + "grad_norm": 14.19822883605957, + "learning_rate": 4.99949329648617e-06, + "loss": 0.2804, + "num_input_tokens_seen": 309752, + "step": 155 + }, + { + "epoch": 0.020675944333996023, + "grad_norm": 9.410264015197754, + "learning_rate": 4.999486283612332e-06, + "loss": 0.15, + "num_input_tokens_seen": 311368, + "step": 156 + }, + { + "epoch": 0.020808482438701127, + "grad_norm": 26.67640495300293, + "learning_rate": 4.999479222546673e-06, + "loss": 0.544, + "num_input_tokens_seen": 313456, + "step": 157 + }, + { + "epoch": 0.02094102054340623, + "grad_norm": 20.946008682250977, + "learning_rate": 4.9994721132893305e-06, + "loss": 0.1774, + "num_input_tokens_seen": 315808, + "step": 158 + }, + { + "epoch": 0.02107355864811133, + "grad_norm": 12.023552894592285, + "learning_rate": 4.9994649558404415e-06, + "loss": 0.1994, + "num_input_tokens_seen": 317824, + "step": 159 + }, + { + "epoch": 0.021206096752816435, + "grad_norm": 3.0416977405548096, + "learning_rate": 4.999457750200143e-06, + "loss": 0.0128, + "num_input_tokens_seen": 319336, + "step": 160 + }, + { + "epoch": 0.02133863485752154, + "grad_norm": 18.2331600189209, + "learning_rate": 4.999450496368576e-06, + "loss": 0.4826, + "num_input_tokens_seen": 321272, + "step": 161 + }, + { + "epoch": 0.02147117296222664, + "grad_norm": 8.425516128540039, + "learning_rate": 4.999443194345879e-06, + "loss": 0.0416, + "num_input_tokens_seen": 323240, + "step": 162 + }, + { + "epoch": 0.021603711066931743, + "grad_norm": 5.87241268157959, + "learning_rate": 4.999435844132193e-06, + "loss": 0.0265, + "num_input_tokens_seen": 325416, + "step": 163 + }, + { + "epoch": 0.021736249171636846, + "grad_norm": 4.545175075531006, + "learning_rate": 4.999428445727659e-06, + "loss": 0.019, + "num_input_tokens_seen": 327384, + "step": 164 + }, + { + "epoch": 0.02186878727634195, + "grad_norm": 17.245952606201172, + "learning_rate": 4.9994209991324205e-06, + "loss": 0.4342, + "num_input_tokens_seen": 328744, + "step": 165 + }, + { + "epoch": 0.02200132538104705, + "grad_norm": 2.1576149463653564, + "learning_rate": 4.999413504346621e-06, + "loss": 0.0087, + "num_input_tokens_seen": 330616, + "step": 166 + }, + { + "epoch": 0.022133863485752154, + "grad_norm": 0.9837048649787903, + "learning_rate": 4.999405961370405e-06, + "loss": 0.0039, + "num_input_tokens_seen": 331952, + "step": 167 + }, + { + "epoch": 0.022266401590457258, + "grad_norm": 15.896281242370605, + "learning_rate": 4.999398370203917e-06, + "loss": 0.3722, + "num_input_tokens_seen": 333400, + "step": 168 + }, + { + "epoch": 0.022398939695162358, + "grad_norm": 21.467859268188477, + "learning_rate": 4.999390730847306e-06, + "loss": 0.7088, + "num_input_tokens_seen": 335704, + "step": 169 + }, + { + "epoch": 0.02253147779986746, + "grad_norm": 30.230329513549805, + "learning_rate": 4.9993830433007165e-06, + "loss": 0.7724, + "num_input_tokens_seen": 337736, + "step": 170 + }, + { + "epoch": 0.022664015904572565, + "grad_norm": 0.07943902164697647, + "learning_rate": 4.999375307564297e-06, + "loss": 0.0004, + "num_input_tokens_seen": 338928, + "step": 171 + }, + { + "epoch": 0.02279655400927767, + "grad_norm": 16.193147659301758, + "learning_rate": 4.999367523638198e-06, + "loss": 0.4945, + "num_input_tokens_seen": 341384, + "step": 172 + }, + { + "epoch": 0.02292909211398277, + "grad_norm": 26.579015731811523, + "learning_rate": 4.999359691522568e-06, + "loss": 0.7879, + "num_input_tokens_seen": 343792, + "step": 173 + }, + { + "epoch": 0.023061630218687873, + "grad_norm": 15.10954761505127, + "learning_rate": 4.99935181121756e-06, + "loss": 0.37, + "num_input_tokens_seen": 345416, + "step": 174 + }, + { + "epoch": 0.023194168323392977, + "grad_norm": 19.339305877685547, + "learning_rate": 4.999343882723325e-06, + "loss": 0.6132, + "num_input_tokens_seen": 346728, + "step": 175 + }, + { + "epoch": 0.023326706428098077, + "grad_norm": 20.464967727661133, + "learning_rate": 4.999335906040015e-06, + "loss": 0.6838, + "num_input_tokens_seen": 348536, + "step": 176 + }, + { + "epoch": 0.02345924453280318, + "grad_norm": 24.76003646850586, + "learning_rate": 4.999327881167785e-06, + "loss": 0.8473, + "num_input_tokens_seen": 350632, + "step": 177 + }, + { + "epoch": 0.023591782637508284, + "grad_norm": 29.685523986816406, + "learning_rate": 4.999319808106789e-06, + "loss": 0.4731, + "num_input_tokens_seen": 352032, + "step": 178 + }, + { + "epoch": 0.023724320742213385, + "grad_norm": 0.12374715507030487, + "learning_rate": 4.999311686857184e-06, + "loss": 0.0006, + "num_input_tokens_seen": 353936, + "step": 179 + }, + { + "epoch": 0.02385685884691849, + "grad_norm": 15.263150215148926, + "learning_rate": 4.999303517419124e-06, + "loss": 0.2656, + "num_input_tokens_seen": 355856, + "step": 180 + }, + { + "epoch": 0.023989396951623592, + "grad_norm": 21.625612258911133, + "learning_rate": 4.999295299792768e-06, + "loss": 0.4828, + "num_input_tokens_seen": 357560, + "step": 181 + }, + { + "epoch": 0.024121935056328696, + "grad_norm": 15.519153594970703, + "learning_rate": 4.999287033978275e-06, + "loss": 0.3393, + "num_input_tokens_seen": 359240, + "step": 182 + }, + { + "epoch": 0.024254473161033796, + "grad_norm": 11.988371849060059, + "learning_rate": 4.999278719975804e-06, + "loss": 0.2141, + "num_input_tokens_seen": 360992, + "step": 183 + }, + { + "epoch": 0.0243870112657389, + "grad_norm": 18.441619873046875, + "learning_rate": 4.999270357785515e-06, + "loss": 0.507, + "num_input_tokens_seen": 363120, + "step": 184 + }, + { + "epoch": 0.024519549370444003, + "grad_norm": 12.764714241027832, + "learning_rate": 4.999261947407568e-06, + "loss": 0.2737, + "num_input_tokens_seen": 365032, + "step": 185 + }, + { + "epoch": 0.024652087475149104, + "grad_norm": 22.18332862854004, + "learning_rate": 4.999253488842128e-06, + "loss": 0.463, + "num_input_tokens_seen": 366448, + "step": 186 + }, + { + "epoch": 0.024784625579854207, + "grad_norm": 2.5686256885528564, + "learning_rate": 4.999244982089357e-06, + "loss": 0.0141, + "num_input_tokens_seen": 367880, + "step": 187 + }, + { + "epoch": 0.02491716368455931, + "grad_norm": 1.7752444744110107, + "learning_rate": 4.999236427149418e-06, + "loss": 0.0084, + "num_input_tokens_seen": 369128, + "step": 188 + }, + { + "epoch": 0.025049701789264415, + "grad_norm": 9.059718132019043, + "learning_rate": 4.9992278240224765e-06, + "loss": 0.2373, + "num_input_tokens_seen": 371376, + "step": 189 + }, + { + "epoch": 0.025182239893969515, + "grad_norm": 14.930595397949219, + "learning_rate": 4.999219172708698e-06, + "loss": 0.2159, + "num_input_tokens_seen": 373576, + "step": 190 + }, + { + "epoch": 0.02531477799867462, + "grad_norm": 31.155925750732422, + "learning_rate": 4.99921047320825e-06, + "loss": 0.5783, + "num_input_tokens_seen": 375728, + "step": 191 + }, + { + "epoch": 0.025447316103379723, + "grad_norm": 2.1444640159606934, + "learning_rate": 4.9992017255213e-06, + "loss": 0.0099, + "num_input_tokens_seen": 377176, + "step": 192 + }, + { + "epoch": 0.025579854208084823, + "grad_norm": 12.785717010498047, + "learning_rate": 4.999192929648017e-06, + "loss": 0.1593, + "num_input_tokens_seen": 379112, + "step": 193 + }, + { + "epoch": 0.025712392312789926, + "grad_norm": 2.521740198135376, + "learning_rate": 4.999184085588569e-06, + "loss": 0.0115, + "num_input_tokens_seen": 380800, + "step": 194 + }, + { + "epoch": 0.02584493041749503, + "grad_norm": 15.717253684997559, + "learning_rate": 4.99917519334313e-06, + "loss": 0.2697, + "num_input_tokens_seen": 383360, + "step": 195 + }, + { + "epoch": 0.025977468522200134, + "grad_norm": 22.165664672851562, + "learning_rate": 4.999166252911868e-06, + "loss": 0.4282, + "num_input_tokens_seen": 385080, + "step": 196 + }, + { + "epoch": 0.026110006626905234, + "grad_norm": 7.5503644943237305, + "learning_rate": 4.999157264294956e-06, + "loss": 0.1602, + "num_input_tokens_seen": 387392, + "step": 197 + }, + { + "epoch": 0.026242544731610338, + "grad_norm": 17.232053756713867, + "learning_rate": 4.999148227492568e-06, + "loss": 0.6237, + "num_input_tokens_seen": 389056, + "step": 198 + }, + { + "epoch": 0.02637508283631544, + "grad_norm": 19.79399299621582, + "learning_rate": 4.999139142504878e-06, + "loss": 0.4356, + "num_input_tokens_seen": 392336, + "step": 199 + }, + { + "epoch": 0.026507620941020542, + "grad_norm": 4.224052906036377, + "learning_rate": 4.999130009332062e-06, + "loss": 0.0712, + "num_input_tokens_seen": 394640, + "step": 200 + }, + { + "epoch": 0.026640159045725646, + "grad_norm": 17.576534271240234, + "learning_rate": 4.999120827974295e-06, + "loss": 0.2182, + "num_input_tokens_seen": 396496, + "step": 201 + }, + { + "epoch": 0.02677269715043075, + "grad_norm": 14.022331237792969, + "learning_rate": 4.999111598431754e-06, + "loss": 0.3605, + "num_input_tokens_seen": 398736, + "step": 202 + }, + { + "epoch": 0.026905235255135853, + "grad_norm": 21.757204055786133, + "learning_rate": 4.999102320704618e-06, + "loss": 0.3021, + "num_input_tokens_seen": 400872, + "step": 203 + }, + { + "epoch": 0.027037773359840953, + "grad_norm": 21.306703567504883, + "learning_rate": 4.999092994793065e-06, + "loss": 0.6083, + "num_input_tokens_seen": 403008, + "step": 204 + }, + { + "epoch": 0.027170311464546057, + "grad_norm": 4.1254754066467285, + "learning_rate": 4.9990836206972746e-06, + "loss": 0.0184, + "num_input_tokens_seen": 405536, + "step": 205 + }, + { + "epoch": 0.02730284956925116, + "grad_norm": 11.163147926330566, + "learning_rate": 4.999074198417429e-06, + "loss": 0.2964, + "num_input_tokens_seen": 407504, + "step": 206 + }, + { + "epoch": 0.02743538767395626, + "grad_norm": 22.429800033569336, + "learning_rate": 4.999064727953707e-06, + "loss": 0.446, + "num_input_tokens_seen": 409536, + "step": 207 + }, + { + "epoch": 0.027567925778661365, + "grad_norm": 4.595419883728027, + "learning_rate": 4.999055209306295e-06, + "loss": 0.0204, + "num_input_tokens_seen": 411592, + "step": 208 + }, + { + "epoch": 0.02770046388336647, + "grad_norm": 20.951740264892578, + "learning_rate": 4.999045642475374e-06, + "loss": 0.3642, + "num_input_tokens_seen": 413496, + "step": 209 + }, + { + "epoch": 0.027833001988071572, + "grad_norm": 18.991111755371094, + "learning_rate": 4.999036027461128e-06, + "loss": 0.4246, + "num_input_tokens_seen": 415864, + "step": 210 + }, + { + "epoch": 0.027965540092776672, + "grad_norm": 23.66268539428711, + "learning_rate": 4.999026364263744e-06, + "loss": 0.5465, + "num_input_tokens_seen": 418560, + "step": 211 + }, + { + "epoch": 0.028098078197481776, + "grad_norm": 10.489510536193848, + "learning_rate": 4.9990166528834085e-06, + "loss": 0.3397, + "num_input_tokens_seen": 421304, + "step": 212 + }, + { + "epoch": 0.02823061630218688, + "grad_norm": 4.345235347747803, + "learning_rate": 4.999006893320307e-06, + "loss": 0.0193, + "num_input_tokens_seen": 423144, + "step": 213 + }, + { + "epoch": 0.02836315440689198, + "grad_norm": 18.24567413330078, + "learning_rate": 4.998997085574628e-06, + "loss": 0.2422, + "num_input_tokens_seen": 425352, + "step": 214 + }, + { + "epoch": 0.028495692511597084, + "grad_norm": 1.7618850469589233, + "learning_rate": 4.998987229646562e-06, + "loss": 0.0073, + "num_input_tokens_seen": 426672, + "step": 215 + }, + { + "epoch": 0.028628230616302187, + "grad_norm": 10.86171817779541, + "learning_rate": 4.9989773255362995e-06, + "loss": 0.13, + "num_input_tokens_seen": 428568, + "step": 216 + }, + { + "epoch": 0.02876076872100729, + "grad_norm": 31.913516998291016, + "learning_rate": 4.9989673732440285e-06, + "loss": 0.6857, + "num_input_tokens_seen": 430936, + "step": 217 + }, + { + "epoch": 0.02889330682571239, + "grad_norm": 14.94822883605957, + "learning_rate": 4.9989573727699435e-06, + "loss": 0.2922, + "num_input_tokens_seen": 432392, + "step": 218 + }, + { + "epoch": 0.029025844930417495, + "grad_norm": 0.7612848281860352, + "learning_rate": 4.998947324114236e-06, + "loss": 0.0031, + "num_input_tokens_seen": 433984, + "step": 219 + }, + { + "epoch": 0.0291583830351226, + "grad_norm": 17.241859436035156, + "learning_rate": 4.998937227277099e-06, + "loss": 0.3538, + "num_input_tokens_seen": 436368, + "step": 220 + }, + { + "epoch": 0.0292909211398277, + "grad_norm": 4.021599769592285, + "learning_rate": 4.998927082258731e-06, + "loss": 0.044, + "num_input_tokens_seen": 438072, + "step": 221 + }, + { + "epoch": 0.029423459244532803, + "grad_norm": 11.155754089355469, + "learning_rate": 4.998916889059323e-06, + "loss": 0.2909, + "num_input_tokens_seen": 439488, + "step": 222 + }, + { + "epoch": 0.029555997349237906, + "grad_norm": 1.1311284303665161, + "learning_rate": 4.998906647679074e-06, + "loss": 0.0045, + "num_input_tokens_seen": 441304, + "step": 223 + }, + { + "epoch": 0.02968853545394301, + "grad_norm": 16.937152862548828, + "learning_rate": 4.998896358118181e-06, + "loss": 0.2745, + "num_input_tokens_seen": 442928, + "step": 224 + }, + { + "epoch": 0.02982107355864811, + "grad_norm": 32.88721466064453, + "learning_rate": 4.998886020376842e-06, + "loss": 0.6268, + "num_input_tokens_seen": 446216, + "step": 225 + }, + { + "epoch": 0.029953611663353214, + "grad_norm": 27.79301643371582, + "learning_rate": 4.9988756344552565e-06, + "loss": 1.0521, + "num_input_tokens_seen": 449200, + "step": 226 + }, + { + "epoch": 0.030086149768058318, + "grad_norm": 15.246744155883789, + "learning_rate": 4.998865200353625e-06, + "loss": 0.1614, + "num_input_tokens_seen": 451320, + "step": 227 + }, + { + "epoch": 0.030218687872763418, + "grad_norm": 14.964058876037598, + "learning_rate": 4.998854718072149e-06, + "loss": 0.3347, + "num_input_tokens_seen": 453064, + "step": 228 + }, + { + "epoch": 0.030351225977468522, + "grad_norm": 38.46656036376953, + "learning_rate": 4.998844187611031e-06, + "loss": 0.8131, + "num_input_tokens_seen": 455048, + "step": 229 + }, + { + "epoch": 0.030483764082173626, + "grad_norm": 16.31789207458496, + "learning_rate": 4.998833608970471e-06, + "loss": 0.3002, + "num_input_tokens_seen": 456976, + "step": 230 + }, + { + "epoch": 0.03061630218687873, + "grad_norm": 16.649085998535156, + "learning_rate": 4.998822982150677e-06, + "loss": 0.3018, + "num_input_tokens_seen": 459376, + "step": 231 + }, + { + "epoch": 0.03074884029158383, + "grad_norm": 28.501062393188477, + "learning_rate": 4.998812307151851e-06, + "loss": 0.8573, + "num_input_tokens_seen": 461160, + "step": 232 + }, + { + "epoch": 0.030881378396288933, + "grad_norm": 9.457627296447754, + "learning_rate": 4.9988015839741995e-06, + "loss": 0.098, + "num_input_tokens_seen": 463480, + "step": 233 + }, + { + "epoch": 0.031013916500994037, + "grad_norm": 1.945236325263977, + "learning_rate": 4.9987908126179305e-06, + "loss": 0.0075, + "num_input_tokens_seen": 465048, + "step": 234 + }, + { + "epoch": 0.031146454605699137, + "grad_norm": 16.86723518371582, + "learning_rate": 4.9987799930832515e-06, + "loss": 0.4702, + "num_input_tokens_seen": 467840, + "step": 235 + }, + { + "epoch": 0.03127899271040424, + "grad_norm": 10.56873607635498, + "learning_rate": 4.998769125370369e-06, + "loss": 0.0711, + "num_input_tokens_seen": 469648, + "step": 236 + }, + { + "epoch": 0.03141153081510934, + "grad_norm": 22.688934326171875, + "learning_rate": 4.998758209479495e-06, + "loss": 0.471, + "num_input_tokens_seen": 471440, + "step": 237 + }, + { + "epoch": 0.03154406891981445, + "grad_norm": 9.229374885559082, + "learning_rate": 4.998747245410838e-06, + "loss": 0.0545, + "num_input_tokens_seen": 474248, + "step": 238 + }, + { + "epoch": 0.03167660702451955, + "grad_norm": 43.888267517089844, + "learning_rate": 4.998736233164611e-06, + "loss": 0.7092, + "num_input_tokens_seen": 477184, + "step": 239 + }, + { + "epoch": 0.03180914512922465, + "grad_norm": 6.8438897132873535, + "learning_rate": 4.9987251727410254e-06, + "loss": 0.0346, + "num_input_tokens_seen": 480040, + "step": 240 + }, + { + "epoch": 0.031941683233929756, + "grad_norm": 2.162801742553711, + "learning_rate": 4.998714064140294e-06, + "loss": 0.0083, + "num_input_tokens_seen": 482648, + "step": 241 + }, + { + "epoch": 0.032074221338634856, + "grad_norm": 11.149718284606934, + "learning_rate": 4.998702907362633e-06, + "loss": 0.2339, + "num_input_tokens_seen": 485136, + "step": 242 + }, + { + "epoch": 0.03220675944333996, + "grad_norm": 0.5103450417518616, + "learning_rate": 4.998691702408255e-06, + "loss": 0.0021, + "num_input_tokens_seen": 486680, + "step": 243 + }, + { + "epoch": 0.032339297548045064, + "grad_norm": 13.526078224182129, + "learning_rate": 4.9986804492773785e-06, + "loss": 0.1776, + "num_input_tokens_seen": 488784, + "step": 244 + }, + { + "epoch": 0.032471835652750164, + "grad_norm": 0.023342560976743698, + "learning_rate": 4.998669147970218e-06, + "loss": 0.0001, + "num_input_tokens_seen": 490448, + "step": 245 + }, + { + "epoch": 0.03260437375745527, + "grad_norm": 14.449139595031738, + "learning_rate": 4.998657798486994e-06, + "loss": 0.2465, + "num_input_tokens_seen": 492584, + "step": 246 + }, + { + "epoch": 0.03273691186216037, + "grad_norm": 0.13257715106010437, + "learning_rate": 4.998646400827923e-06, + "loss": 0.0005, + "num_input_tokens_seen": 494648, + "step": 247 + }, + { + "epoch": 0.03286944996686547, + "grad_norm": 0.009109255857765675, + "learning_rate": 4.998634954993224e-06, + "loss": 0.0, + "num_input_tokens_seen": 496912, + "step": 248 + }, + { + "epoch": 0.03300198807157058, + "grad_norm": 9.28011703491211, + "learning_rate": 4.998623460983121e-06, + "loss": 0.2192, + "num_input_tokens_seen": 498320, + "step": 249 + }, + { + "epoch": 0.03313452617627568, + "grad_norm": 35.02414321899414, + "learning_rate": 4.998611918797834e-06, + "loss": 0.3857, + "num_input_tokens_seen": 499840, + "step": 250 + }, + { + "epoch": 0.03326706428098078, + "grad_norm": 9.046163558959961, + "learning_rate": 4.998600328437586e-06, + "loss": 0.2055, + "num_input_tokens_seen": 501984, + "step": 251 + }, + { + "epoch": 0.033399602385685886, + "grad_norm": 0.29357361793518066, + "learning_rate": 4.998588689902599e-06, + "loss": 0.0013, + "num_input_tokens_seen": 503952, + "step": 252 + }, + { + "epoch": 0.03353214049039099, + "grad_norm": 12.435687065124512, + "learning_rate": 4.998577003193099e-06, + "loss": 0.2083, + "num_input_tokens_seen": 505832, + "step": 253 + }, + { + "epoch": 0.03366467859509609, + "grad_norm": 62.12875747680664, + "learning_rate": 4.99856526830931e-06, + "loss": 0.4879, + "num_input_tokens_seen": 507304, + "step": 254 + }, + { + "epoch": 0.033797216699801194, + "grad_norm": 0.09826428443193436, + "learning_rate": 4.998553485251459e-06, + "loss": 0.0004, + "num_input_tokens_seen": 508800, + "step": 255 + }, + { + "epoch": 0.033929754804506294, + "grad_norm": 15.336609840393066, + "learning_rate": 4.998541654019774e-06, + "loss": 0.4523, + "num_input_tokens_seen": 510440, + "step": 256 + }, + { + "epoch": 0.0340622929092114, + "grad_norm": 17.225994110107422, + "learning_rate": 4.998529774614482e-06, + "loss": 0.3167, + "num_input_tokens_seen": 512544, + "step": 257 + }, + { + "epoch": 0.0341948310139165, + "grad_norm": 20.611408233642578, + "learning_rate": 4.998517847035812e-06, + "loss": 0.4972, + "num_input_tokens_seen": 514160, + "step": 258 + }, + { + "epoch": 0.0343273691186216, + "grad_norm": 0.1505860835313797, + "learning_rate": 4.998505871283994e-06, + "loss": 0.0007, + "num_input_tokens_seen": 516064, + "step": 259 + }, + { + "epoch": 0.03445990722332671, + "grad_norm": 33.17475128173828, + "learning_rate": 4.99849384735926e-06, + "loss": 0.4428, + "num_input_tokens_seen": 518064, + "step": 260 + }, + { + "epoch": 0.03459244532803181, + "grad_norm": 28.57742691040039, + "learning_rate": 4.99848177526184e-06, + "loss": 0.9032, + "num_input_tokens_seen": 519992, + "step": 261 + }, + { + "epoch": 0.03472498343273691, + "grad_norm": 0.5577319860458374, + "learning_rate": 4.998469654991969e-06, + "loss": 0.0027, + "num_input_tokens_seen": 521592, + "step": 262 + }, + { + "epoch": 0.03485752153744202, + "grad_norm": 6.979521751403809, + "learning_rate": 4.998457486549878e-06, + "loss": 0.2413, + "num_input_tokens_seen": 524088, + "step": 263 + }, + { + "epoch": 0.03499005964214712, + "grad_norm": 14.174946784973145, + "learning_rate": 4.998445269935805e-06, + "loss": 0.2806, + "num_input_tokens_seen": 525904, + "step": 264 + }, + { + "epoch": 0.03512259774685222, + "grad_norm": 13.45556354522705, + "learning_rate": 4.998433005149981e-06, + "loss": 0.4349, + "num_input_tokens_seen": 528288, + "step": 265 + }, + { + "epoch": 0.035255135851557325, + "grad_norm": 24.512451171875, + "learning_rate": 4.998420692192647e-06, + "loss": 0.4747, + "num_input_tokens_seen": 530176, + "step": 266 + }, + { + "epoch": 0.035387673956262425, + "grad_norm": 14.442388534545898, + "learning_rate": 4.9984083310640375e-06, + "loss": 0.3086, + "num_input_tokens_seen": 532744, + "step": 267 + }, + { + "epoch": 0.035520212060967525, + "grad_norm": 13.616148948669434, + "learning_rate": 4.998395921764393e-06, + "loss": 0.2474, + "num_input_tokens_seen": 535144, + "step": 268 + }, + { + "epoch": 0.03565275016567263, + "grad_norm": 12.278056144714355, + "learning_rate": 4.9983834642939505e-06, + "loss": 0.3656, + "num_input_tokens_seen": 537872, + "step": 269 + }, + { + "epoch": 0.03578528827037773, + "grad_norm": 35.60847091674805, + "learning_rate": 4.998370958652952e-06, + "loss": 0.5403, + "num_input_tokens_seen": 540856, + "step": 270 + }, + { + "epoch": 0.03591782637508284, + "grad_norm": 13.47176742553711, + "learning_rate": 4.998358404841637e-06, + "loss": 0.3135, + "num_input_tokens_seen": 542320, + "step": 271 + }, + { + "epoch": 0.03605036447978794, + "grad_norm": 25.42990493774414, + "learning_rate": 4.998345802860249e-06, + "loss": 0.5438, + "num_input_tokens_seen": 545176, + "step": 272 + }, + { + "epoch": 0.03618290258449304, + "grad_norm": 20.656879425048828, + "learning_rate": 4.998333152709031e-06, + "loss": 0.3382, + "num_input_tokens_seen": 547128, + "step": 273 + }, + { + "epoch": 0.03631544068919815, + "grad_norm": 33.70890426635742, + "learning_rate": 4.998320454388225e-06, + "loss": 0.5682, + "num_input_tokens_seen": 548944, + "step": 274 + }, + { + "epoch": 0.03644797879390325, + "grad_norm": 6.59552001953125, + "learning_rate": 4.998307707898078e-06, + "loss": 0.039, + "num_input_tokens_seen": 551064, + "step": 275 + }, + { + "epoch": 0.03658051689860835, + "grad_norm": 12.120160102844238, + "learning_rate": 4.998294913238834e-06, + "loss": 0.2049, + "num_input_tokens_seen": 552736, + "step": 276 + }, + { + "epoch": 0.036713055003313455, + "grad_norm": 13.951089859008789, + "learning_rate": 4.998282070410741e-06, + "loss": 0.2062, + "num_input_tokens_seen": 556584, + "step": 277 + }, + { + "epoch": 0.036845593108018555, + "grad_norm": 25.15782928466797, + "learning_rate": 4.998269179414047e-06, + "loss": 0.306, + "num_input_tokens_seen": 557960, + "step": 278 + }, + { + "epoch": 0.036978131212723656, + "grad_norm": 21.26768684387207, + "learning_rate": 4.998256240248999e-06, + "loss": 0.3163, + "num_input_tokens_seen": 561984, + "step": 279 + }, + { + "epoch": 0.03711066931742876, + "grad_norm": 21.928241729736328, + "learning_rate": 4.998243252915847e-06, + "loss": 0.5134, + "num_input_tokens_seen": 564784, + "step": 280 + }, + { + "epoch": 0.03724320742213386, + "grad_norm": 16.106935501098633, + "learning_rate": 4.9982302174148425e-06, + "loss": 0.3639, + "num_input_tokens_seen": 566848, + "step": 281 + }, + { + "epoch": 0.03737574552683896, + "grad_norm": 8.305567741394043, + "learning_rate": 4.998217133746235e-06, + "loss": 0.095, + "num_input_tokens_seen": 568584, + "step": 282 + }, + { + "epoch": 0.03750828363154407, + "grad_norm": 2.3119444847106934, + "learning_rate": 4.998204001910279e-06, + "loss": 0.0111, + "num_input_tokens_seen": 569592, + "step": 283 + }, + { + "epoch": 0.03764082173624917, + "grad_norm": 17.673871994018555, + "learning_rate": 4.998190821907225e-06, + "loss": 0.2595, + "num_input_tokens_seen": 571448, + "step": 284 + }, + { + "epoch": 0.03777335984095427, + "grad_norm": 6.476834774017334, + "learning_rate": 4.99817759373733e-06, + "loss": 0.0407, + "num_input_tokens_seen": 573352, + "step": 285 + }, + { + "epoch": 0.03790589794565938, + "grad_norm": 10.310851097106934, + "learning_rate": 4.998164317400846e-06, + "loss": 0.1975, + "num_input_tokens_seen": 574784, + "step": 286 + }, + { + "epoch": 0.03803843605036448, + "grad_norm": 7.472101211547852, + "learning_rate": 4.998150992898032e-06, + "loss": 0.1528, + "num_input_tokens_seen": 576360, + "step": 287 + }, + { + "epoch": 0.038170974155069586, + "grad_norm": 7.403510570526123, + "learning_rate": 4.998137620229143e-06, + "loss": 0.1364, + "num_input_tokens_seen": 578160, + "step": 288 + }, + { + "epoch": 0.038303512259774686, + "grad_norm": 2.993300199508667, + "learning_rate": 4.998124199394437e-06, + "loss": 0.0148, + "num_input_tokens_seen": 580080, + "step": 289 + }, + { + "epoch": 0.038436050364479786, + "grad_norm": 13.170620918273926, + "learning_rate": 4.998110730394174e-06, + "loss": 0.3559, + "num_input_tokens_seen": 582336, + "step": 290 + }, + { + "epoch": 0.03856858846918489, + "grad_norm": 14.553942680358887, + "learning_rate": 4.998097213228611e-06, + "loss": 0.2029, + "num_input_tokens_seen": 583928, + "step": 291 + }, + { + "epoch": 0.03870112657388999, + "grad_norm": 0.8202224969863892, + "learning_rate": 4.998083647898011e-06, + "loss": 0.0029, + "num_input_tokens_seen": 586016, + "step": 292 + }, + { + "epoch": 0.038833664678595094, + "grad_norm": 25.899988174438477, + "learning_rate": 4.998070034402635e-06, + "loss": 0.5873, + "num_input_tokens_seen": 588280, + "step": 293 + }, + { + "epoch": 0.0389662027833002, + "grad_norm": 11.520463943481445, + "learning_rate": 4.9980563727427464e-06, + "loss": 0.1645, + "num_input_tokens_seen": 590600, + "step": 294 + }, + { + "epoch": 0.0390987408880053, + "grad_norm": 18.768911361694336, + "learning_rate": 4.998042662918607e-06, + "loss": 0.6236, + "num_input_tokens_seen": 592976, + "step": 295 + }, + { + "epoch": 0.0392312789927104, + "grad_norm": 0.867225170135498, + "learning_rate": 4.998028904930481e-06, + "loss": 0.0038, + "num_input_tokens_seen": 594848, + "step": 296 + }, + { + "epoch": 0.03936381709741551, + "grad_norm": 25.768062591552734, + "learning_rate": 4.998015098778636e-06, + "loss": 0.4069, + "num_input_tokens_seen": 596640, + "step": 297 + }, + { + "epoch": 0.03949635520212061, + "grad_norm": 12.203022956848145, + "learning_rate": 4.998001244463336e-06, + "loss": 0.2393, + "num_input_tokens_seen": 598832, + "step": 298 + }, + { + "epoch": 0.03962889330682571, + "grad_norm": 25.078598022460938, + "learning_rate": 4.997987341984848e-06, + "loss": 0.4055, + "num_input_tokens_seen": 600400, + "step": 299 + }, + { + "epoch": 0.039761431411530816, + "grad_norm": 2.1533451080322266, + "learning_rate": 4.9979733913434405e-06, + "loss": 0.0306, + "num_input_tokens_seen": 602224, + "step": 300 + }, + { + "epoch": 0.039893969516235916, + "grad_norm": 14.567193031311035, + "learning_rate": 4.997959392539383e-06, + "loss": 0.2431, + "num_input_tokens_seen": 604472, + "step": 301 + }, + { + "epoch": 0.040026507620941024, + "grad_norm": 12.722371101379395, + "learning_rate": 4.997945345572946e-06, + "loss": 0.4987, + "num_input_tokens_seen": 606416, + "step": 302 + }, + { + "epoch": 0.040159045725646124, + "grad_norm": 25.712350845336914, + "learning_rate": 4.997931250444399e-06, + "loss": 0.5941, + "num_input_tokens_seen": 608848, + "step": 303 + }, + { + "epoch": 0.040291583830351224, + "grad_norm": 7.8166184425354, + "learning_rate": 4.997917107154015e-06, + "loss": 0.2187, + "num_input_tokens_seen": 611056, + "step": 304 + }, + { + "epoch": 0.04042412193505633, + "grad_norm": 0.5850040912628174, + "learning_rate": 4.997902915702065e-06, + "loss": 0.0022, + "num_input_tokens_seen": 613552, + "step": 305 + }, + { + "epoch": 0.04055666003976143, + "grad_norm": 23.503761291503906, + "learning_rate": 4.997888676088823e-06, + "loss": 0.7279, + "num_input_tokens_seen": 615648, + "step": 306 + }, + { + "epoch": 0.04068919814446653, + "grad_norm": 19.09429168701172, + "learning_rate": 4.997874388314565e-06, + "loss": 0.3783, + "num_input_tokens_seen": 617648, + "step": 307 + }, + { + "epoch": 0.04082173624917164, + "grad_norm": 16.295747756958008, + "learning_rate": 4.997860052379566e-06, + "loss": 0.4785, + "num_input_tokens_seen": 620128, + "step": 308 + }, + { + "epoch": 0.04095427435387674, + "grad_norm": 7.503650665283203, + "learning_rate": 4.9978456682841015e-06, + "loss": 0.0362, + "num_input_tokens_seen": 622376, + "step": 309 + }, + { + "epoch": 0.04108681245858184, + "grad_norm": 15.009029388427734, + "learning_rate": 4.99783123602845e-06, + "loss": 0.1055, + "num_input_tokens_seen": 624088, + "step": 310 + }, + { + "epoch": 0.04121935056328695, + "grad_norm": 16.56051254272461, + "learning_rate": 4.997816755612887e-06, + "loss": 0.304, + "num_input_tokens_seen": 625752, + "step": 311 + }, + { + "epoch": 0.04135188866799205, + "grad_norm": 10.829145431518555, + "learning_rate": 4.997802227037697e-06, + "loss": 0.1062, + "num_input_tokens_seen": 627472, + "step": 312 + }, + { + "epoch": 0.04148442677269715, + "grad_norm": 20.199914932250977, + "learning_rate": 4.9977876503031545e-06, + "loss": 0.2104, + "num_input_tokens_seen": 629176, + "step": 313 + }, + { + "epoch": 0.041616964877402254, + "grad_norm": 6.902258396148682, + "learning_rate": 4.9977730254095435e-06, + "loss": 0.0944, + "num_input_tokens_seen": 630648, + "step": 314 + }, + { + "epoch": 0.041749502982107355, + "grad_norm": 12.835172653198242, + "learning_rate": 4.997758352357146e-06, + "loss": 0.2762, + "num_input_tokens_seen": 632816, + "step": 315 + }, + { + "epoch": 0.04188204108681246, + "grad_norm": 11.454261779785156, + "learning_rate": 4.997743631146244e-06, + "loss": 0.0646, + "num_input_tokens_seen": 634504, + "step": 316 + }, + { + "epoch": 0.04201457919151756, + "grad_norm": 29.429641723632812, + "learning_rate": 4.997728861777121e-06, + "loss": 0.8085, + "num_input_tokens_seen": 638080, + "step": 317 + }, + { + "epoch": 0.04214711729622266, + "grad_norm": 9.454316139221191, + "learning_rate": 4.997714044250064e-06, + "loss": 0.2269, + "num_input_tokens_seen": 640616, + "step": 318 + }, + { + "epoch": 0.04227965540092777, + "grad_norm": 26.891950607299805, + "learning_rate": 4.997699178565356e-06, + "loss": 0.7373, + "num_input_tokens_seen": 642592, + "step": 319 + }, + { + "epoch": 0.04241219350563287, + "grad_norm": 16.68791961669922, + "learning_rate": 4.997684264723285e-06, + "loss": 0.2951, + "num_input_tokens_seen": 644672, + "step": 320 + }, + { + "epoch": 0.04254473161033797, + "grad_norm": 21.331117630004883, + "learning_rate": 4.997669302724139e-06, + "loss": 0.6098, + "num_input_tokens_seen": 646912, + "step": 321 + }, + { + "epoch": 0.04267726971504308, + "grad_norm": 20.414836883544922, + "learning_rate": 4.997654292568204e-06, + "loss": 0.3962, + "num_input_tokens_seen": 649656, + "step": 322 + }, + { + "epoch": 0.04280980781974818, + "grad_norm": 7.269143104553223, + "learning_rate": 4.997639234255773e-06, + "loss": 0.1993, + "num_input_tokens_seen": 651112, + "step": 323 + }, + { + "epoch": 0.04294234592445328, + "grad_norm": 20.107419967651367, + "learning_rate": 4.9976241277871335e-06, + "loss": 0.3358, + "num_input_tokens_seen": 653312, + "step": 324 + }, + { + "epoch": 0.043074884029158385, + "grad_norm": 27.0461368560791, + "learning_rate": 4.997608973162578e-06, + "loss": 0.5486, + "num_input_tokens_seen": 656072, + "step": 325 + }, + { + "epoch": 0.043207422133863485, + "grad_norm": 17.05916976928711, + "learning_rate": 4.997593770382398e-06, + "loss": 0.4689, + "num_input_tokens_seen": 658840, + "step": 326 + }, + { + "epoch": 0.043339960238568585, + "grad_norm": 1.6389083862304688, + "learning_rate": 4.9975785194468886e-06, + "loss": 0.0072, + "num_input_tokens_seen": 661824, + "step": 327 + }, + { + "epoch": 0.04347249834327369, + "grad_norm": 3.850003242492676, + "learning_rate": 4.997563220356341e-06, + "loss": 0.0198, + "num_input_tokens_seen": 664928, + "step": 328 + }, + { + "epoch": 0.04360503644797879, + "grad_norm": 23.54313087463379, + "learning_rate": 4.997547873111053e-06, + "loss": 0.3895, + "num_input_tokens_seen": 668112, + "step": 329 + }, + { + "epoch": 0.0437375745526839, + "grad_norm": 9.03012466430664, + "learning_rate": 4.997532477711318e-06, + "loss": 0.3714, + "num_input_tokens_seen": 671248, + "step": 330 + }, + { + "epoch": 0.043870112657389, + "grad_norm": 14.154772758483887, + "learning_rate": 4.997517034157433e-06, + "loss": 0.3441, + "num_input_tokens_seen": 672536, + "step": 331 + }, + { + "epoch": 0.0440026507620941, + "grad_norm": 0.9552240967750549, + "learning_rate": 4.997501542449697e-06, + "loss": 0.0043, + "num_input_tokens_seen": 675576, + "step": 332 + }, + { + "epoch": 0.04413518886679921, + "grad_norm": 11.67136001586914, + "learning_rate": 4.99748600258841e-06, + "loss": 0.1134, + "num_input_tokens_seen": 677632, + "step": 333 + }, + { + "epoch": 0.04426772697150431, + "grad_norm": 20.03084945678711, + "learning_rate": 4.997470414573869e-06, + "loss": 0.216, + "num_input_tokens_seen": 681088, + "step": 334 + }, + { + "epoch": 0.04440026507620941, + "grad_norm": 18.869739532470703, + "learning_rate": 4.997454778406375e-06, + "loss": 0.3727, + "num_input_tokens_seen": 683368, + "step": 335 + }, + { + "epoch": 0.044532803180914515, + "grad_norm": 15.195161819458008, + "learning_rate": 4.997439094086231e-06, + "loss": 0.3932, + "num_input_tokens_seen": 685528, + "step": 336 + }, + { + "epoch": 0.044665341285619616, + "grad_norm": 5.1058478355407715, + "learning_rate": 4.9974233616137365e-06, + "loss": 0.0612, + "num_input_tokens_seen": 687896, + "step": 337 + }, + { + "epoch": 0.044797879390324716, + "grad_norm": 8.459250450134277, + "learning_rate": 4.997407580989198e-06, + "loss": 0.0422, + "num_input_tokens_seen": 689520, + "step": 338 + }, + { + "epoch": 0.04493041749502982, + "grad_norm": 29.686885833740234, + "learning_rate": 4.997391752212918e-06, + "loss": 0.421, + "num_input_tokens_seen": 691944, + "step": 339 + }, + { + "epoch": 0.04506295559973492, + "grad_norm": 7.44094181060791, + "learning_rate": 4.997375875285203e-06, + "loss": 0.1209, + "num_input_tokens_seen": 695152, + "step": 340 + }, + { + "epoch": 0.04519549370444002, + "grad_norm": 21.71242332458496, + "learning_rate": 4.997359950206357e-06, + "loss": 0.5102, + "num_input_tokens_seen": 697656, + "step": 341 + }, + { + "epoch": 0.04532803180914513, + "grad_norm": 0.6252732872962952, + "learning_rate": 4.99734397697669e-06, + "loss": 0.0021, + "num_input_tokens_seen": 698952, + "step": 342 + }, + { + "epoch": 0.04546056991385023, + "grad_norm": 10.156767845153809, + "learning_rate": 4.997327955596507e-06, + "loss": 0.2217, + "num_input_tokens_seen": 700608, + "step": 343 + }, + { + "epoch": 0.04559310801855534, + "grad_norm": 7.860878944396973, + "learning_rate": 4.997311886066118e-06, + "loss": 0.1274, + "num_input_tokens_seen": 702120, + "step": 344 + }, + { + "epoch": 0.04572564612326044, + "grad_norm": 11.383088111877441, + "learning_rate": 4.997295768385834e-06, + "loss": 0.2046, + "num_input_tokens_seen": 703528, + "step": 345 + }, + { + "epoch": 0.04585818422796554, + "grad_norm": 7.847553730010986, + "learning_rate": 4.997279602555964e-06, + "loss": 0.0318, + "num_input_tokens_seen": 704848, + "step": 346 + }, + { + "epoch": 0.045990722332670646, + "grad_norm": 19.786420822143555, + "learning_rate": 4.997263388576822e-06, + "loss": 0.3434, + "num_input_tokens_seen": 706392, + "step": 347 + }, + { + "epoch": 0.046123260437375746, + "grad_norm": 2.1406922340393066, + "learning_rate": 4.997247126448718e-06, + "loss": 0.043, + "num_input_tokens_seen": 708000, + "step": 348 + }, + { + "epoch": 0.046255798542080846, + "grad_norm": 4.831175327301025, + "learning_rate": 4.9972308161719665e-06, + "loss": 0.0204, + "num_input_tokens_seen": 709728, + "step": 349 + }, + { + "epoch": 0.04638833664678595, + "grad_norm": 18.041276931762695, + "learning_rate": 4.997214457746882e-06, + "loss": 0.4672, + "num_input_tokens_seen": 711960, + "step": 350 + }, + { + "epoch": 0.046520874751491054, + "grad_norm": 5.1598029136657715, + "learning_rate": 4.99719805117378e-06, + "loss": 0.1205, + "num_input_tokens_seen": 713208, + "step": 351 + }, + { + "epoch": 0.046653412856196154, + "grad_norm": 3.019613265991211, + "learning_rate": 4.997181596452977e-06, + "loss": 0.0065, + "num_input_tokens_seen": 715120, + "step": 352 + }, + { + "epoch": 0.04678595096090126, + "grad_norm": 12.380135536193848, + "learning_rate": 4.99716509358479e-06, + "loss": 0.1354, + "num_input_tokens_seen": 716648, + "step": 353 + }, + { + "epoch": 0.04691848906560636, + "grad_norm": 13.253052711486816, + "learning_rate": 4.9971485425695374e-06, + "loss": 0.126, + "num_input_tokens_seen": 718024, + "step": 354 + }, + { + "epoch": 0.04705102717031146, + "grad_norm": 0.013280677609145641, + "learning_rate": 4.997131943407538e-06, + "loss": 0.0, + "num_input_tokens_seen": 719576, + "step": 355 + }, + { + "epoch": 0.04718356527501657, + "grad_norm": 0.12724220752716064, + "learning_rate": 4.9971152960991134e-06, + "loss": 0.0002, + "num_input_tokens_seen": 722600, + "step": 356 + }, + { + "epoch": 0.04731610337972167, + "grad_norm": 10.176298141479492, + "learning_rate": 4.997098600644582e-06, + "loss": 0.2611, + "num_input_tokens_seen": 724280, + "step": 357 + }, + { + "epoch": 0.04744864148442677, + "grad_norm": 21.48967742919922, + "learning_rate": 4.997081857044267e-06, + "loss": 0.5354, + "num_input_tokens_seen": 726312, + "step": 358 + }, + { + "epoch": 0.047581179589131876, + "grad_norm": 0.03790980949997902, + "learning_rate": 4.997065065298491e-06, + "loss": 0.0001, + "num_input_tokens_seen": 727712, + "step": 359 + }, + { + "epoch": 0.04771371769383698, + "grad_norm": 20.249364852905273, + "learning_rate": 4.997048225407578e-06, + "loss": 0.4925, + "num_input_tokens_seen": 729752, + "step": 360 + }, + { + "epoch": 0.047846255798542084, + "grad_norm": 7.8548078536987305, + "learning_rate": 4.997031337371853e-06, + "loss": 0.1501, + "num_input_tokens_seen": 731480, + "step": 361 + }, + { + "epoch": 0.047978793903247184, + "grad_norm": 13.437089920043945, + "learning_rate": 4.99701440119164e-06, + "loss": 0.188, + "num_input_tokens_seen": 734528, + "step": 362 + }, + { + "epoch": 0.048111332007952284, + "grad_norm": 12.443678855895996, + "learning_rate": 4.996997416867268e-06, + "loss": 0.2899, + "num_input_tokens_seen": 736376, + "step": 363 + }, + { + "epoch": 0.04824387011265739, + "grad_norm": 19.146350860595703, + "learning_rate": 4.996980384399062e-06, + "loss": 0.3962, + "num_input_tokens_seen": 738712, + "step": 364 + }, + { + "epoch": 0.04837640821736249, + "grad_norm": 18.645601272583008, + "learning_rate": 4.996963303787354e-06, + "loss": 0.421, + "num_input_tokens_seen": 741656, + "step": 365 + }, + { + "epoch": 0.04850894632206759, + "grad_norm": 23.880483627319336, + "learning_rate": 4.996946175032469e-06, + "loss": 0.3565, + "num_input_tokens_seen": 743160, + "step": 366 + }, + { + "epoch": 0.0486414844267727, + "grad_norm": 26.378475189208984, + "learning_rate": 4.996928998134739e-06, + "loss": 0.6684, + "num_input_tokens_seen": 745936, + "step": 367 + }, + { + "epoch": 0.0487740225314778, + "grad_norm": 16.886404037475586, + "learning_rate": 4.996911773094496e-06, + "loss": 0.3134, + "num_input_tokens_seen": 747736, + "step": 368 + }, + { + "epoch": 0.0489065606361829, + "grad_norm": 15.819960594177246, + "learning_rate": 4.9968944999120715e-06, + "loss": 0.2254, + "num_input_tokens_seen": 749528, + "step": 369 + }, + { + "epoch": 0.04903909874088801, + "grad_norm": 2.205227851867676, + "learning_rate": 4.996877178587799e-06, + "loss": 0.0097, + "num_input_tokens_seen": 750640, + "step": 370 + }, + { + "epoch": 0.04917163684559311, + "grad_norm": 19.295799255371094, + "learning_rate": 4.9968598091220115e-06, + "loss": 0.4687, + "num_input_tokens_seen": 752648, + "step": 371 + }, + { + "epoch": 0.04930417495029821, + "grad_norm": 2.245725154876709, + "learning_rate": 4.996842391515045e-06, + "loss": 0.01, + "num_input_tokens_seen": 754448, + "step": 372 + }, + { + "epoch": 0.049436713055003315, + "grad_norm": 17.21232032775879, + "learning_rate": 4.996824925767233e-06, + "loss": 0.1311, + "num_input_tokens_seen": 757344, + "step": 373 + }, + { + "epoch": 0.049569251159708415, + "grad_norm": 29.35503578186035, + "learning_rate": 4.996807411878915e-06, + "loss": 0.4628, + "num_input_tokens_seen": 758560, + "step": 374 + }, + { + "epoch": 0.04970178926441352, + "grad_norm": 11.46652889251709, + "learning_rate": 4.996789849850427e-06, + "loss": 0.0821, + "num_input_tokens_seen": 760800, + "step": 375 + }, + { + "epoch": 0.04983432736911862, + "grad_norm": 0.5824503898620605, + "learning_rate": 4.99677223968211e-06, + "loss": 0.0026, + "num_input_tokens_seen": 761840, + "step": 376 + }, + { + "epoch": 0.04996686547382372, + "grad_norm": 8.417191505432129, + "learning_rate": 4.9967545813743e-06, + "loss": 0.1175, + "num_input_tokens_seen": 763424, + "step": 377 + }, + { + "epoch": 0.05009940357852883, + "grad_norm": 6.865485668182373, + "learning_rate": 4.996736874927341e-06, + "loss": 0.0421, + "num_input_tokens_seen": 765096, + "step": 378 + }, + { + "epoch": 0.05023194168323393, + "grad_norm": 6.654351234436035, + "learning_rate": 4.996719120341571e-06, + "loss": 0.0766, + "num_input_tokens_seen": 766416, + "step": 379 + }, + { + "epoch": 0.05036447978793903, + "grad_norm": 12.908503532409668, + "learning_rate": 4.996701317617335e-06, + "loss": 0.3849, + "num_input_tokens_seen": 768728, + "step": 380 + }, + { + "epoch": 0.05049701789264414, + "grad_norm": 0.8765175342559814, + "learning_rate": 4.996683466754974e-06, + "loss": 0.0036, + "num_input_tokens_seen": 770104, + "step": 381 + }, + { + "epoch": 0.05062955599734924, + "grad_norm": 23.151649475097656, + "learning_rate": 4.996665567754834e-06, + "loss": 0.5085, + "num_input_tokens_seen": 771960, + "step": 382 + }, + { + "epoch": 0.05076209410205434, + "grad_norm": 17.043270111083984, + "learning_rate": 4.99664762061726e-06, + "loss": 0.3232, + "num_input_tokens_seen": 774312, + "step": 383 + }, + { + "epoch": 0.050894632206759445, + "grad_norm": 23.318056106567383, + "learning_rate": 4.996629625342597e-06, + "loss": 0.4619, + "num_input_tokens_seen": 776112, + "step": 384 + }, + { + "epoch": 0.051027170311464545, + "grad_norm": 15.63559341430664, + "learning_rate": 4.9966115819311926e-06, + "loss": 0.2928, + "num_input_tokens_seen": 777552, + "step": 385 + }, + { + "epoch": 0.051159708416169646, + "grad_norm": 0.42104920744895935, + "learning_rate": 4.996593490383395e-06, + "loss": 0.0015, + "num_input_tokens_seen": 779176, + "step": 386 + }, + { + "epoch": 0.05129224652087475, + "grad_norm": 17.924867630004883, + "learning_rate": 4.996575350699552e-06, + "loss": 0.4286, + "num_input_tokens_seen": 781216, + "step": 387 + }, + { + "epoch": 0.05142478462557985, + "grad_norm": 27.782072067260742, + "learning_rate": 4.996557162880014e-06, + "loss": 0.8455, + "num_input_tokens_seen": 782984, + "step": 388 + }, + { + "epoch": 0.05155732273028496, + "grad_norm": 26.4074649810791, + "learning_rate": 4.996538926925132e-06, + "loss": 0.4452, + "num_input_tokens_seen": 784928, + "step": 389 + }, + { + "epoch": 0.05168986083499006, + "grad_norm": 0.5493453741073608, + "learning_rate": 4.996520642835257e-06, + "loss": 0.0024, + "num_input_tokens_seen": 788072, + "step": 390 + }, + { + "epoch": 0.05182239893969516, + "grad_norm": 17.478471755981445, + "learning_rate": 4.996502310610742e-06, + "loss": 0.4274, + "num_input_tokens_seen": 790008, + "step": 391 + }, + { + "epoch": 0.05195493704440027, + "grad_norm": 6.221721172332764, + "learning_rate": 4.99648393025194e-06, + "loss": 0.1173, + "num_input_tokens_seen": 791736, + "step": 392 + }, + { + "epoch": 0.05208747514910537, + "grad_norm": 14.853644371032715, + "learning_rate": 4.996465501759205e-06, + "loss": 0.3939, + "num_input_tokens_seen": 793584, + "step": 393 + }, + { + "epoch": 0.05222001325381047, + "grad_norm": 0.30814510583877563, + "learning_rate": 4.9964470251328935e-06, + "loss": 0.0014, + "num_input_tokens_seen": 794944, + "step": 394 + }, + { + "epoch": 0.052352551358515576, + "grad_norm": 22.30123519897461, + "learning_rate": 4.996428500373361e-06, + "loss": 0.4323, + "num_input_tokens_seen": 796856, + "step": 395 + }, + { + "epoch": 0.052485089463220676, + "grad_norm": 13.975912094116211, + "learning_rate": 4.996409927480965e-06, + "loss": 0.4534, + "num_input_tokens_seen": 799224, + "step": 396 + }, + { + "epoch": 0.052617627567925776, + "grad_norm": 10.820759773254395, + "learning_rate": 4.9963913064560634e-06, + "loss": 0.2199, + "num_input_tokens_seen": 801496, + "step": 397 + }, + { + "epoch": 0.05275016567263088, + "grad_norm": 1.8996949195861816, + "learning_rate": 4.996372637299015e-06, + "loss": 0.0085, + "num_input_tokens_seen": 802768, + "step": 398 + }, + { + "epoch": 0.05288270377733598, + "grad_norm": 13.372843742370605, + "learning_rate": 4.9963539200101795e-06, + "loss": 0.2026, + "num_input_tokens_seen": 804376, + "step": 399 + }, + { + "epoch": 0.053015241882041084, + "grad_norm": 21.658615112304688, + "learning_rate": 4.99633515458992e-06, + "loss": 0.6691, + "num_input_tokens_seen": 806400, + "step": 400 + }, + { + "epoch": 0.05314777998674619, + "grad_norm": 13.015047073364258, + "learning_rate": 4.996316341038595e-06, + "loss": 0.2655, + "num_input_tokens_seen": 808336, + "step": 401 + }, + { + "epoch": 0.05328031809145129, + "grad_norm": 8.32094669342041, + "learning_rate": 4.9962974793565685e-06, + "loss": 0.0826, + "num_input_tokens_seen": 809912, + "step": 402 + }, + { + "epoch": 0.0534128561961564, + "grad_norm": 20.510452270507812, + "learning_rate": 4.996278569544206e-06, + "loss": 0.4597, + "num_input_tokens_seen": 812640, + "step": 403 + }, + { + "epoch": 0.0535453943008615, + "grad_norm": 5.006075382232666, + "learning_rate": 4.9962596116018695e-06, + "loss": 0.024, + "num_input_tokens_seen": 813848, + "step": 404 + }, + { + "epoch": 0.0536779324055666, + "grad_norm": 5.897892475128174, + "learning_rate": 4.996240605529926e-06, + "loss": 0.0971, + "num_input_tokens_seen": 815392, + "step": 405 + }, + { + "epoch": 0.053810470510271706, + "grad_norm": 22.914487838745117, + "learning_rate": 4.996221551328742e-06, + "loss": 0.5509, + "num_input_tokens_seen": 816928, + "step": 406 + }, + { + "epoch": 0.053943008614976806, + "grad_norm": 27.326887130737305, + "learning_rate": 4.996202448998684e-06, + "loss": 0.485, + "num_input_tokens_seen": 818344, + "step": 407 + }, + { + "epoch": 0.054075546719681906, + "grad_norm": 8.313642501831055, + "learning_rate": 4.996183298540121e-06, + "loss": 0.1001, + "num_input_tokens_seen": 820040, + "step": 408 + }, + { + "epoch": 0.054208084824387014, + "grad_norm": 4.386748790740967, + "learning_rate": 4.996164099953421e-06, + "loss": 0.0218, + "num_input_tokens_seen": 821696, + "step": 409 + }, + { + "epoch": 0.054340622929092114, + "grad_norm": 14.14405632019043, + "learning_rate": 4.996144853238956e-06, + "loss": 0.2684, + "num_input_tokens_seen": 824176, + "step": 410 + }, + { + "epoch": 0.054473161033797214, + "grad_norm": 16.349185943603516, + "learning_rate": 4.996125558397096e-06, + "loss": 0.3261, + "num_input_tokens_seen": 826576, + "step": 411 + }, + { + "epoch": 0.05460569913850232, + "grad_norm": 14.40383529663086, + "learning_rate": 4.996106215428214e-06, + "loss": 0.4198, + "num_input_tokens_seen": 828056, + "step": 412 + }, + { + "epoch": 0.05473823724320742, + "grad_norm": 12.038331031799316, + "learning_rate": 4.996086824332681e-06, + "loss": 0.26, + "num_input_tokens_seen": 830184, + "step": 413 + }, + { + "epoch": 0.05487077534791252, + "grad_norm": 1.7930399179458618, + "learning_rate": 4.9960673851108724e-06, + "loss": 0.0085, + "num_input_tokens_seen": 831600, + "step": 414 + }, + { + "epoch": 0.05500331345261763, + "grad_norm": 16.52307891845703, + "learning_rate": 4.996047897763163e-06, + "loss": 0.2179, + "num_input_tokens_seen": 833840, + "step": 415 + }, + { + "epoch": 0.05513585155732273, + "grad_norm": 2.702738046646118, + "learning_rate": 4.996028362289928e-06, + "loss": 0.0113, + "num_input_tokens_seen": 835520, + "step": 416 + }, + { + "epoch": 0.055268389662027836, + "grad_norm": 28.42337417602539, + "learning_rate": 4.996008778691544e-06, + "loss": 1.5106, + "num_input_tokens_seen": 838536, + "step": 417 + }, + { + "epoch": 0.05540092776673294, + "grad_norm": 8.068453788757324, + "learning_rate": 4.995989146968389e-06, + "loss": 0.1233, + "num_input_tokens_seen": 839880, + "step": 418 + }, + { + "epoch": 0.05553346587143804, + "grad_norm": 10.71106243133545, + "learning_rate": 4.9959694671208415e-06, + "loss": 0.3155, + "num_input_tokens_seen": 841736, + "step": 419 + }, + { + "epoch": 0.055666003976143144, + "grad_norm": 15.732540130615234, + "learning_rate": 4.995949739149281e-06, + "loss": 0.4902, + "num_input_tokens_seen": 843216, + "step": 420 + }, + { + "epoch": 0.055798542080848244, + "grad_norm": 12.263782501220703, + "learning_rate": 4.9959299630540876e-06, + "loss": 0.0779, + "num_input_tokens_seen": 844912, + "step": 421 + }, + { + "epoch": 0.055931080185553345, + "grad_norm": 0.03717665746808052, + "learning_rate": 4.995910138835642e-06, + "loss": 0.0002, + "num_input_tokens_seen": 846536, + "step": 422 + }, + { + "epoch": 0.05606361829025845, + "grad_norm": 18.78084945678711, + "learning_rate": 4.9958902664943275e-06, + "loss": 0.1599, + "num_input_tokens_seen": 848712, + "step": 423 + }, + { + "epoch": 0.05619615639496355, + "grad_norm": 3.03532338142395, + "learning_rate": 4.995870346030528e-06, + "loss": 0.0133, + "num_input_tokens_seen": 850872, + "step": 424 + }, + { + "epoch": 0.05632869449966865, + "grad_norm": 1.760057806968689, + "learning_rate": 4.995850377444624e-06, + "loss": 0.0009, + "num_input_tokens_seen": 852224, + "step": 425 + }, + { + "epoch": 0.05646123260437376, + "grad_norm": 21.569618225097656, + "learning_rate": 4.995830360737005e-06, + "loss": 0.5746, + "num_input_tokens_seen": 854456, + "step": 426 + }, + { + "epoch": 0.05659377070907886, + "grad_norm": 11.767157554626465, + "learning_rate": 4.995810295908054e-06, + "loss": 0.2346, + "num_input_tokens_seen": 856800, + "step": 427 + }, + { + "epoch": 0.05672630881378396, + "grad_norm": 17.919403076171875, + "learning_rate": 4.995790182958159e-06, + "loss": 0.2724, + "num_input_tokens_seen": 858352, + "step": 428 + }, + { + "epoch": 0.05685884691848907, + "grad_norm": 12.62038803100586, + "learning_rate": 4.995770021887707e-06, + "loss": 0.149, + "num_input_tokens_seen": 859712, + "step": 429 + }, + { + "epoch": 0.05699138502319417, + "grad_norm": 13.71484661102295, + "learning_rate": 4.995749812697088e-06, + "loss": 0.1083, + "num_input_tokens_seen": 862248, + "step": 430 + }, + { + "epoch": 0.05712392312789927, + "grad_norm": 0.01438061147928238, + "learning_rate": 4.99572955538669e-06, + "loss": 0.0001, + "num_input_tokens_seen": 863664, + "step": 431 + }, + { + "epoch": 0.057256461232604375, + "grad_norm": 0.35884493589401245, + "learning_rate": 4.995709249956905e-06, + "loss": 0.0014, + "num_input_tokens_seen": 865600, + "step": 432 + }, + { + "epoch": 0.057388999337309475, + "grad_norm": 0.010414387099444866, + "learning_rate": 4.995688896408124e-06, + "loss": 0.0001, + "num_input_tokens_seen": 867256, + "step": 433 + }, + { + "epoch": 0.05752153744201458, + "grad_norm": 15.658315658569336, + "learning_rate": 4.995668494740739e-06, + "loss": 0.3547, + "num_input_tokens_seen": 869240, + "step": 434 + }, + { + "epoch": 0.05765407554671968, + "grad_norm": 0.9342787265777588, + "learning_rate": 4.995648044955144e-06, + "loss": 0.0025, + "num_input_tokens_seen": 871888, + "step": 435 + }, + { + "epoch": 0.05778661365142478, + "grad_norm": 2.8957719802856445, + "learning_rate": 4.9956275470517335e-06, + "loss": 0.0064, + "num_input_tokens_seen": 873752, + "step": 436 + }, + { + "epoch": 0.05791915175612989, + "grad_norm": 0.04046275466680527, + "learning_rate": 4.995607001030902e-06, + "loss": 0.0002, + "num_input_tokens_seen": 875584, + "step": 437 + }, + { + "epoch": 0.05805168986083499, + "grad_norm": 8.409808158874512, + "learning_rate": 4.995586406893045e-06, + "loss": 0.2931, + "num_input_tokens_seen": 878072, + "step": 438 + }, + { + "epoch": 0.05818422796554009, + "grad_norm": 9.53891658782959, + "learning_rate": 4.995565764638561e-06, + "loss": 0.2245, + "num_input_tokens_seen": 880688, + "step": 439 + }, + { + "epoch": 0.0583167660702452, + "grad_norm": 16.10770034790039, + "learning_rate": 4.995545074267848e-06, + "loss": 0.4526, + "num_input_tokens_seen": 882336, + "step": 440 + }, + { + "epoch": 0.0584493041749503, + "grad_norm": 0.003209405578672886, + "learning_rate": 4.995524335781305e-06, + "loss": 0.0, + "num_input_tokens_seen": 883672, + "step": 441 + }, + { + "epoch": 0.0585818422796554, + "grad_norm": 37.67921447753906, + "learning_rate": 4.9955035491793295e-06, + "loss": 1.0989, + "num_input_tokens_seen": 886512, + "step": 442 + }, + { + "epoch": 0.058714380384360505, + "grad_norm": 5.709592819213867, + "learning_rate": 4.995482714462324e-06, + "loss": 0.0741, + "num_input_tokens_seen": 888104, + "step": 443 + }, + { + "epoch": 0.058846918489065606, + "grad_norm": 38.78124237060547, + "learning_rate": 4.995461831630691e-06, + "loss": 1.0351, + "num_input_tokens_seen": 889864, + "step": 444 + }, + { + "epoch": 0.058979456593770706, + "grad_norm": 15.407828330993652, + "learning_rate": 4.995440900684832e-06, + "loss": 0.4081, + "num_input_tokens_seen": 892560, + "step": 445 + }, + { + "epoch": 0.05911199469847581, + "grad_norm": 0.17919433116912842, + "learning_rate": 4.995419921625152e-06, + "loss": 0.0008, + "num_input_tokens_seen": 894792, + "step": 446 + }, + { + "epoch": 0.05924453280318091, + "grad_norm": 0.6332680583000183, + "learning_rate": 4.995398894452054e-06, + "loss": 0.0019, + "num_input_tokens_seen": 896840, + "step": 447 + }, + { + "epoch": 0.05937707090788602, + "grad_norm": 46.322914123535156, + "learning_rate": 4.995377819165943e-06, + "loss": 0.3411, + "num_input_tokens_seen": 898520, + "step": 448 + }, + { + "epoch": 0.05950960901259112, + "grad_norm": 94.10094451904297, + "learning_rate": 4.995356695767226e-06, + "loss": 0.3775, + "num_input_tokens_seen": 900272, + "step": 449 + }, + { + "epoch": 0.05964214711729622, + "grad_norm": 18.68901252746582, + "learning_rate": 4.995335524256312e-06, + "loss": 0.4001, + "num_input_tokens_seen": 901736, + "step": 450 + }, + { + "epoch": 0.05977468522200133, + "grad_norm": 16.15734100341797, + "learning_rate": 4.995314304633606e-06, + "loss": 0.3467, + "num_input_tokens_seen": 903400, + "step": 451 + }, + { + "epoch": 0.05990722332670643, + "grad_norm": 22.254375457763672, + "learning_rate": 4.99529303689952e-06, + "loss": 0.3432, + "num_input_tokens_seen": 905168, + "step": 452 + }, + { + "epoch": 0.06003976143141153, + "grad_norm": 18.157529830932617, + "learning_rate": 4.995271721054462e-06, + "loss": 0.2681, + "num_input_tokens_seen": 906808, + "step": 453 + }, + { + "epoch": 0.060172299536116636, + "grad_norm": 21.01182746887207, + "learning_rate": 4.995250357098844e-06, + "loss": 0.049, + "num_input_tokens_seen": 909552, + "step": 454 + }, + { + "epoch": 0.060304837640821736, + "grad_norm": 9.753447532653809, + "learning_rate": 4.995228945033078e-06, + "loss": 0.2303, + "num_input_tokens_seen": 912344, + "step": 455 + }, + { + "epoch": 0.060437375745526836, + "grad_norm": 21.69582748413086, + "learning_rate": 4.9952074848575765e-06, + "loss": 0.3012, + "num_input_tokens_seen": 914864, + "step": 456 + }, + { + "epoch": 0.06056991385023194, + "grad_norm": 20.239931106567383, + "learning_rate": 4.9951859765727535e-06, + "loss": 0.3241, + "num_input_tokens_seen": 916880, + "step": 457 + }, + { + "epoch": 0.060702451954937044, + "grad_norm": 1.746264934539795, + "learning_rate": 4.995164420179023e-06, + "loss": 0.0079, + "num_input_tokens_seen": 918272, + "step": 458 + }, + { + "epoch": 0.060834990059642144, + "grad_norm": 24.057586669921875, + "learning_rate": 4.995142815676802e-06, + "loss": 0.6932, + "num_input_tokens_seen": 920336, + "step": 459 + }, + { + "epoch": 0.06096752816434725, + "grad_norm": 9.76949691772461, + "learning_rate": 4.995121163066505e-06, + "loss": 0.0648, + "num_input_tokens_seen": 921872, + "step": 460 + }, + { + "epoch": 0.06110006626905235, + "grad_norm": 2.4379332065582275, + "learning_rate": 4.995099462348551e-06, + "loss": 0.0113, + "num_input_tokens_seen": 923888, + "step": 461 + }, + { + "epoch": 0.06123260437375746, + "grad_norm": 0.6686123609542847, + "learning_rate": 4.995077713523359e-06, + "loss": 0.0028, + "num_input_tokens_seen": 926408, + "step": 462 + }, + { + "epoch": 0.06136514247846256, + "grad_norm": 8.666668891906738, + "learning_rate": 4.995055916591347e-06, + "loss": 0.3313, + "num_input_tokens_seen": 928328, + "step": 463 + }, + { + "epoch": 0.06149768058316766, + "grad_norm": 16.044036865234375, + "learning_rate": 4.9950340715529355e-06, + "loss": 0.171, + "num_input_tokens_seen": 931568, + "step": 464 + }, + { + "epoch": 0.061630218687872766, + "grad_norm": 15.3733491897583, + "learning_rate": 4.9950121784085455e-06, + "loss": 0.2286, + "num_input_tokens_seen": 933800, + "step": 465 + }, + { + "epoch": 0.061762756792577866, + "grad_norm": 0.15300540626049042, + "learning_rate": 4.9949902371586e-06, + "loss": 0.0007, + "num_input_tokens_seen": 935504, + "step": 466 + }, + { + "epoch": 0.06189529489728297, + "grad_norm": 9.183958053588867, + "learning_rate": 4.994968247803522e-06, + "loss": 0.1304, + "num_input_tokens_seen": 937728, + "step": 467 + }, + { + "epoch": 0.062027833001988074, + "grad_norm": 0.26844802498817444, + "learning_rate": 4.994946210343734e-06, + "loss": 0.0009, + "num_input_tokens_seen": 938752, + "step": 468 + }, + { + "epoch": 0.062160371106693174, + "grad_norm": 0.05436732992529869, + "learning_rate": 4.994924124779662e-06, + "loss": 0.0003, + "num_input_tokens_seen": 940520, + "step": 469 + }, + { + "epoch": 0.062292909211398274, + "grad_norm": 13.469395637512207, + "learning_rate": 4.9949019911117325e-06, + "loss": 0.1647, + "num_input_tokens_seen": 942560, + "step": 470 + }, + { + "epoch": 0.06242544731610338, + "grad_norm": 0.03584084287285805, + "learning_rate": 4.99487980934037e-06, + "loss": 0.0002, + "num_input_tokens_seen": 944648, + "step": 471 + }, + { + "epoch": 0.06255798542080848, + "grad_norm": 8.76506233215332, + "learning_rate": 4.994857579466005e-06, + "loss": 0.0416, + "num_input_tokens_seen": 946304, + "step": 472 + }, + { + "epoch": 0.06269052352551359, + "grad_norm": 0.027090368792414665, + "learning_rate": 4.994835301489065e-06, + "loss": 0.0001, + "num_input_tokens_seen": 947696, + "step": 473 + }, + { + "epoch": 0.06282306163021868, + "grad_norm": 8.105457305908203, + "learning_rate": 4.9948129754099785e-06, + "loss": 0.1449, + "num_input_tokens_seen": 949296, + "step": 474 + }, + { + "epoch": 0.06295559973492379, + "grad_norm": 7.156956672668457, + "learning_rate": 4.994790601229177e-06, + "loss": 0.0745, + "num_input_tokens_seen": 951536, + "step": 475 + }, + { + "epoch": 0.0630881378396289, + "grad_norm": 0.016795145347714424, + "learning_rate": 4.994768178947092e-06, + "loss": 0.0001, + "num_input_tokens_seen": 953008, + "step": 476 + }, + { + "epoch": 0.06322067594433399, + "grad_norm": 24.05290985107422, + "learning_rate": 4.9947457085641544e-06, + "loss": 0.3832, + "num_input_tokens_seen": 954480, + "step": 477 + }, + { + "epoch": 0.0633532140490391, + "grad_norm": 12.219489097595215, + "learning_rate": 4.994723190080799e-06, + "loss": 0.2212, + "num_input_tokens_seen": 956120, + "step": 478 + }, + { + "epoch": 0.0634857521537442, + "grad_norm": 39.07728958129883, + "learning_rate": 4.994700623497459e-06, + "loss": 0.8499, + "num_input_tokens_seen": 958200, + "step": 479 + }, + { + "epoch": 0.0636182902584493, + "grad_norm": 0.4574955701828003, + "learning_rate": 4.994678008814571e-06, + "loss": 0.0013, + "num_input_tokens_seen": 960472, + "step": 480 + }, + { + "epoch": 0.0637508283631544, + "grad_norm": 14.811234474182129, + "learning_rate": 4.9946553460325695e-06, + "loss": 0.3978, + "num_input_tokens_seen": 961832, + "step": 481 + }, + { + "epoch": 0.06388336646785951, + "grad_norm": 31.710086822509766, + "learning_rate": 4.994632635151893e-06, + "loss": 0.2641, + "num_input_tokens_seen": 964024, + "step": 482 + }, + { + "epoch": 0.06401590457256462, + "grad_norm": 24.19526481628418, + "learning_rate": 4.994609876172976e-06, + "loss": 0.4087, + "num_input_tokens_seen": 966136, + "step": 483 + }, + { + "epoch": 0.06414844267726971, + "grad_norm": 31.750226974487305, + "learning_rate": 4.994587069096262e-06, + "loss": 0.9694, + "num_input_tokens_seen": 967848, + "step": 484 + }, + { + "epoch": 0.06428098078197482, + "grad_norm": 3.483440399169922, + "learning_rate": 4.994564213922187e-06, + "loss": 0.0053, + "num_input_tokens_seen": 969576, + "step": 485 + }, + { + "epoch": 0.06441351888667993, + "grad_norm": 32.30971145629883, + "learning_rate": 4.994541310651193e-06, + "loss": 0.2429, + "num_input_tokens_seen": 971744, + "step": 486 + }, + { + "epoch": 0.06454605699138502, + "grad_norm": 29.977540969848633, + "learning_rate": 4.994518359283722e-06, + "loss": 0.7457, + "num_input_tokens_seen": 973856, + "step": 487 + }, + { + "epoch": 0.06467859509609013, + "grad_norm": 2187.822509765625, + "learning_rate": 4.994495359820217e-06, + "loss": 1.7051, + "num_input_tokens_seen": 975752, + "step": 488 + }, + { + "epoch": 0.06481113320079523, + "grad_norm": 11.483095169067383, + "learning_rate": 4.99447231226112e-06, + "loss": 0.1525, + "num_input_tokens_seen": 977488, + "step": 489 + }, + { + "epoch": 0.06494367130550033, + "grad_norm": 38.371864318847656, + "learning_rate": 4.994449216606875e-06, + "loss": 0.7257, + "num_input_tokens_seen": 979576, + "step": 490 + }, + { + "epoch": 0.06507620941020544, + "grad_norm": 6.387148380279541, + "learning_rate": 4.994426072857929e-06, + "loss": 0.1932, + "num_input_tokens_seen": 981728, + "step": 491 + }, + { + "epoch": 0.06520874751491054, + "grad_norm": 1.466477394104004, + "learning_rate": 4.994402881014728e-06, + "loss": 0.0054, + "num_input_tokens_seen": 983904, + "step": 492 + }, + { + "epoch": 0.06534128561961564, + "grad_norm": 10.523158073425293, + "learning_rate": 4.994379641077718e-06, + "loss": 0.0993, + "num_input_tokens_seen": 987464, + "step": 493 + }, + { + "epoch": 0.06547382372432074, + "grad_norm": 16.072219848632812, + "learning_rate": 4.994356353047348e-06, + "loss": 0.0805, + "num_input_tokens_seen": 989544, + "step": 494 + }, + { + "epoch": 0.06560636182902585, + "grad_norm": 0.09814932197332382, + "learning_rate": 4.9943330169240665e-06, + "loss": 0.0004, + "num_input_tokens_seen": 990872, + "step": 495 + }, + { + "epoch": 0.06573889993373094, + "grad_norm": 9.3329496383667, + "learning_rate": 4.994309632708324e-06, + "loss": 0.0247, + "num_input_tokens_seen": 992896, + "step": 496 + }, + { + "epoch": 0.06587143803843605, + "grad_norm": 20.09090805053711, + "learning_rate": 4.994286200400571e-06, + "loss": 0.1765, + "num_input_tokens_seen": 995992, + "step": 497 + }, + { + "epoch": 0.06600397614314116, + "grad_norm": 10.68331527709961, + "learning_rate": 4.99426272000126e-06, + "loss": 0.2593, + "num_input_tokens_seen": 997656, + "step": 498 + }, + { + "epoch": 0.06613651424784625, + "grad_norm": 0.11378294974565506, + "learning_rate": 4.994239191510841e-06, + "loss": 0.0005, + "num_input_tokens_seen": 999768, + "step": 499 + }, + { + "epoch": 0.06626905235255136, + "grad_norm": 20.251298904418945, + "learning_rate": 4.994215614929772e-06, + "loss": 0.4918, + "num_input_tokens_seen": 1002136, + "step": 500 + }, + { + "epoch": 0.06640159045725647, + "grad_norm": 14.02566146850586, + "learning_rate": 4.994191990258504e-06, + "loss": 0.2084, + "num_input_tokens_seen": 1003272, + "step": 501 + }, + { + "epoch": 0.06653412856196156, + "grad_norm": 17.264339447021484, + "learning_rate": 4.9941683174974935e-06, + "loss": 0.1955, + "num_input_tokens_seen": 1005232, + "step": 502 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.1551053524017334, + "learning_rate": 4.994144596647198e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1006424, + "step": 503 + }, + { + "epoch": 0.06679920477137177, + "grad_norm": 29.754657745361328, + "learning_rate": 4.994120827708073e-06, + "loss": 0.5756, + "num_input_tokens_seen": 1008272, + "step": 504 + }, + { + "epoch": 0.06693174287607687, + "grad_norm": 10.792243957519531, + "learning_rate": 4.994097010680579e-06, + "loss": 0.2684, + "num_input_tokens_seen": 1010184, + "step": 505 + }, + { + "epoch": 0.06706428098078197, + "grad_norm": 6.34636926651001, + "learning_rate": 4.994073145565174e-06, + "loss": 0.1127, + "num_input_tokens_seen": 1012784, + "step": 506 + }, + { + "epoch": 0.06719681908548708, + "grad_norm": 8.283468246459961, + "learning_rate": 4.994049232362318e-06, + "loss": 0.0891, + "num_input_tokens_seen": 1014768, + "step": 507 + }, + { + "epoch": 0.06732935719019217, + "grad_norm": 13.287981033325195, + "learning_rate": 4.994025271072472e-06, + "loss": 0.0825, + "num_input_tokens_seen": 1016376, + "step": 508 + }, + { + "epoch": 0.06746189529489728, + "grad_norm": 16.882177352905273, + "learning_rate": 4.994001261696098e-06, + "loss": 0.4124, + "num_input_tokens_seen": 1018504, + "step": 509 + }, + { + "epoch": 0.06759443339960239, + "grad_norm": 2.0997307300567627, + "learning_rate": 4.99397720423366e-06, + "loss": 0.0055, + "num_input_tokens_seen": 1020080, + "step": 510 + }, + { + "epoch": 0.06772697150430748, + "grad_norm": 14.057239532470703, + "learning_rate": 4.99395309868562e-06, + "loss": 0.3045, + "num_input_tokens_seen": 1021728, + "step": 511 + }, + { + "epoch": 0.06785950960901259, + "grad_norm": 12.672103881835938, + "learning_rate": 4.993928945052444e-06, + "loss": 0.268, + "num_input_tokens_seen": 1024688, + "step": 512 + }, + { + "epoch": 0.0679920477137177, + "grad_norm": 0.49435117840766907, + "learning_rate": 4.993904743334598e-06, + "loss": 0.0017, + "num_input_tokens_seen": 1026544, + "step": 513 + }, + { + "epoch": 0.0681245858184228, + "grad_norm": 0.19059982895851135, + "learning_rate": 4.993880493532548e-06, + "loss": 0.0008, + "num_input_tokens_seen": 1028224, + "step": 514 + }, + { + "epoch": 0.0682571239231279, + "grad_norm": 12.96335506439209, + "learning_rate": 4.9938561956467615e-06, + "loss": 0.2523, + "num_input_tokens_seen": 1030200, + "step": 515 + }, + { + "epoch": 0.068389662027833, + "grad_norm": 2.7128167152404785, + "learning_rate": 4.993831849677707e-06, + "loss": 0.0115, + "num_input_tokens_seen": 1031920, + "step": 516 + }, + { + "epoch": 0.06852220013253811, + "grad_norm": 6.8787455558776855, + "learning_rate": 4.993807455625854e-06, + "loss": 0.1451, + "num_input_tokens_seen": 1033472, + "step": 517 + }, + { + "epoch": 0.0686547382372432, + "grad_norm": 10.646443367004395, + "learning_rate": 4.9937830134916735e-06, + "loss": 0.2795, + "num_input_tokens_seen": 1035400, + "step": 518 + }, + { + "epoch": 0.06878727634194831, + "grad_norm": 28.678573608398438, + "learning_rate": 4.9937585232756355e-06, + "loss": 0.6005, + "num_input_tokens_seen": 1037488, + "step": 519 + }, + { + "epoch": 0.06891981444665342, + "grad_norm": 26.15842628479004, + "learning_rate": 4.993733984978213e-06, + "loss": 0.7782, + "num_input_tokens_seen": 1039928, + "step": 520 + }, + { + "epoch": 0.06905235255135851, + "grad_norm": 16.04706573486328, + "learning_rate": 4.993709398599879e-06, + "loss": 0.237, + "num_input_tokens_seen": 1041584, + "step": 521 + }, + { + "epoch": 0.06918489065606362, + "grad_norm": 4.769724369049072, + "learning_rate": 4.993684764141107e-06, + "loss": 0.074, + "num_input_tokens_seen": 1043624, + "step": 522 + }, + { + "epoch": 0.06931742876076873, + "grad_norm": 16.5262393951416, + "learning_rate": 4.993660081602373e-06, + "loss": 0.3854, + "num_input_tokens_seen": 1045808, + "step": 523 + }, + { + "epoch": 0.06944996686547382, + "grad_norm": 13.904562950134277, + "learning_rate": 4.993635350984152e-06, + "loss": 0.3698, + "num_input_tokens_seen": 1047520, + "step": 524 + }, + { + "epoch": 0.06958250497017893, + "grad_norm": 0.04643460735678673, + "learning_rate": 4.993610572286921e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1049560, + "step": 525 + }, + { + "epoch": 0.06971504307488403, + "grad_norm": 0.23561903834342957, + "learning_rate": 4.993585745511158e-06, + "loss": 0.001, + "num_input_tokens_seen": 1051608, + "step": 526 + }, + { + "epoch": 0.06984758117958913, + "grad_norm": 0.0867864117026329, + "learning_rate": 4.993560870657342e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1052984, + "step": 527 + }, + { + "epoch": 0.06998011928429423, + "grad_norm": 6.684571743011475, + "learning_rate": 4.993535947725952e-06, + "loss": 0.118, + "num_input_tokens_seen": 1054744, + "step": 528 + }, + { + "epoch": 0.07011265738899934, + "grad_norm": 9.315130233764648, + "learning_rate": 4.993510976717468e-06, + "loss": 0.1495, + "num_input_tokens_seen": 1056144, + "step": 529 + }, + { + "epoch": 0.07024519549370443, + "grad_norm": 5.07112979888916, + "learning_rate": 4.993485957632374e-06, + "loss": 0.0532, + "num_input_tokens_seen": 1058824, + "step": 530 + }, + { + "epoch": 0.07037773359840954, + "grad_norm": 29.896940231323242, + "learning_rate": 4.993460890471149e-06, + "loss": 0.6898, + "num_input_tokens_seen": 1061040, + "step": 531 + }, + { + "epoch": 0.07051027170311465, + "grad_norm": 0.9054104685783386, + "learning_rate": 4.993435775234278e-06, + "loss": 0.0036, + "num_input_tokens_seen": 1062480, + "step": 532 + }, + { + "epoch": 0.07064280980781974, + "grad_norm": 10.092836380004883, + "learning_rate": 4.993410611922246e-06, + "loss": 0.1754, + "num_input_tokens_seen": 1064640, + "step": 533 + }, + { + "epoch": 0.07077534791252485, + "grad_norm": 12.96221923828125, + "learning_rate": 4.9933854005355365e-06, + "loss": 0.327, + "num_input_tokens_seen": 1066608, + "step": 534 + }, + { + "epoch": 0.07090788601722996, + "grad_norm": 0.35928207635879517, + "learning_rate": 4.993360141074636e-06, + "loss": 0.0017, + "num_input_tokens_seen": 1068816, + "step": 535 + }, + { + "epoch": 0.07104042412193505, + "grad_norm": 21.966604232788086, + "learning_rate": 4.993334833540032e-06, + "loss": 0.558, + "num_input_tokens_seen": 1071216, + "step": 536 + }, + { + "epoch": 0.07117296222664016, + "grad_norm": 10.543190956115723, + "learning_rate": 4.993309477932214e-06, + "loss": 0.1464, + "num_input_tokens_seen": 1073104, + "step": 537 + }, + { + "epoch": 0.07130550033134526, + "grad_norm": 21.538822174072266, + "learning_rate": 4.993284074251668e-06, + "loss": 0.891, + "num_input_tokens_seen": 1075600, + "step": 538 + }, + { + "epoch": 0.07143803843605036, + "grad_norm": 0.27097639441490173, + "learning_rate": 4.993258622498885e-06, + "loss": 0.0013, + "num_input_tokens_seen": 1077048, + "step": 539 + }, + { + "epoch": 0.07157057654075547, + "grad_norm": 16.11741065979004, + "learning_rate": 4.9932331226743555e-06, + "loss": 0.697, + "num_input_tokens_seen": 1079584, + "step": 540 + }, + { + "epoch": 0.07170311464546057, + "grad_norm": 0.2181357592344284, + "learning_rate": 4.993207574778572e-06, + "loss": 0.001, + "num_input_tokens_seen": 1081056, + "step": 541 + }, + { + "epoch": 0.07183565275016568, + "grad_norm": 0.1433018445968628, + "learning_rate": 4.993181978812026e-06, + "loss": 0.0007, + "num_input_tokens_seen": 1082448, + "step": 542 + }, + { + "epoch": 0.07196819085487077, + "grad_norm": 8.875802040100098, + "learning_rate": 4.993156334775212e-06, + "loss": 0.0938, + "num_input_tokens_seen": 1084832, + "step": 543 + }, + { + "epoch": 0.07210072895957588, + "grad_norm": 13.357874870300293, + "learning_rate": 4.993130642668623e-06, + "loss": 0.397, + "num_input_tokens_seen": 1087112, + "step": 544 + }, + { + "epoch": 0.07223326706428099, + "grad_norm": 25.850324630737305, + "learning_rate": 4.993104902492756e-06, + "loss": 0.4894, + "num_input_tokens_seen": 1088864, + "step": 545 + }, + { + "epoch": 0.07236580516898608, + "grad_norm": 0.18744581937789917, + "learning_rate": 4.993079114248107e-06, + "loss": 0.0009, + "num_input_tokens_seen": 1090264, + "step": 546 + }, + { + "epoch": 0.07249834327369119, + "grad_norm": 6.02025032043457, + "learning_rate": 4.993053277935172e-06, + "loss": 0.1977, + "num_input_tokens_seen": 1092008, + "step": 547 + }, + { + "epoch": 0.0726308813783963, + "grad_norm": 0.32402750849723816, + "learning_rate": 4.99302739355445e-06, + "loss": 0.0014, + "num_input_tokens_seen": 1094192, + "step": 548 + }, + { + "epoch": 0.07276341948310139, + "grad_norm": 3.706557273864746, + "learning_rate": 4.99300146110644e-06, + "loss": 0.0165, + "num_input_tokens_seen": 1095504, + "step": 549 + }, + { + "epoch": 0.0728959575878065, + "grad_norm": 14.554426193237305, + "learning_rate": 4.992975480591643e-06, + "loss": 0.2236, + "num_input_tokens_seen": 1097344, + "step": 550 + }, + { + "epoch": 0.0730284956925116, + "grad_norm": 4.300492763519287, + "learning_rate": 4.992949452010558e-06, + "loss": 0.0587, + "num_input_tokens_seen": 1098792, + "step": 551 + }, + { + "epoch": 0.0731610337972167, + "grad_norm": 0.206004798412323, + "learning_rate": 4.992923375363687e-06, + "loss": 0.001, + "num_input_tokens_seen": 1101152, + "step": 552 + }, + { + "epoch": 0.0732935719019218, + "grad_norm": 0.8021019101142883, + "learning_rate": 4.992897250651535e-06, + "loss": 0.0035, + "num_input_tokens_seen": 1103448, + "step": 553 + }, + { + "epoch": 0.07342611000662691, + "grad_norm": 6.172055244445801, + "learning_rate": 4.992871077874604e-06, + "loss": 0.0726, + "num_input_tokens_seen": 1105240, + "step": 554 + }, + { + "epoch": 0.073558648111332, + "grad_norm": 32.844390869140625, + "learning_rate": 4.992844857033399e-06, + "loss": 0.7557, + "num_input_tokens_seen": 1107952, + "step": 555 + }, + { + "epoch": 0.07369118621603711, + "grad_norm": 16.420387268066406, + "learning_rate": 4.992818588128424e-06, + "loss": 0.2981, + "num_input_tokens_seen": 1109504, + "step": 556 + }, + { + "epoch": 0.07382372432074222, + "grad_norm": 8.786429405212402, + "learning_rate": 4.992792271160189e-06, + "loss": 0.0788, + "num_input_tokens_seen": 1112600, + "step": 557 + }, + { + "epoch": 0.07395626242544731, + "grad_norm": 2.0075364112854004, + "learning_rate": 4.992765906129198e-06, + "loss": 0.0365, + "num_input_tokens_seen": 1114192, + "step": 558 + }, + { + "epoch": 0.07408880053015242, + "grad_norm": 12.34540843963623, + "learning_rate": 4.9927394930359604e-06, + "loss": 0.2806, + "num_input_tokens_seen": 1116328, + "step": 559 + }, + { + "epoch": 0.07422133863485753, + "grad_norm": 1.716313362121582, + "learning_rate": 4.992713031880986e-06, + "loss": 0.0061, + "num_input_tokens_seen": 1117928, + "step": 560 + }, + { + "epoch": 0.07435387673956262, + "grad_norm": 8.74285888671875, + "learning_rate": 4.992686522664785e-06, + "loss": 0.032, + "num_input_tokens_seen": 1119712, + "step": 561 + }, + { + "epoch": 0.07448641484426773, + "grad_norm": 0.0721452385187149, + "learning_rate": 4.992659965387868e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1121688, + "step": 562 + }, + { + "epoch": 0.07461895294897283, + "grad_norm": 12.754493713378906, + "learning_rate": 4.9926333600507484e-06, + "loss": 0.2298, + "num_input_tokens_seen": 1123272, + "step": 563 + }, + { + "epoch": 0.07475149105367793, + "grad_norm": 15.049842834472656, + "learning_rate": 4.992606706653936e-06, + "loss": 0.2616, + "num_input_tokens_seen": 1125256, + "step": 564 + }, + { + "epoch": 0.07488402915838303, + "grad_norm": 18.730566024780273, + "learning_rate": 4.992580005197949e-06, + "loss": 0.4459, + "num_input_tokens_seen": 1127576, + "step": 565 + }, + { + "epoch": 0.07501656726308814, + "grad_norm": 10.277911186218262, + "learning_rate": 4.992553255683298e-06, + "loss": 0.1509, + "num_input_tokens_seen": 1130264, + "step": 566 + }, + { + "epoch": 0.07514910536779323, + "grad_norm": 0.07839662581682205, + "learning_rate": 4.9925264581105015e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1131752, + "step": 567 + }, + { + "epoch": 0.07528164347249834, + "grad_norm": 15.85098648071289, + "learning_rate": 4.992499612480075e-06, + "loss": 0.2329, + "num_input_tokens_seen": 1133176, + "step": 568 + }, + { + "epoch": 0.07541418157720345, + "grad_norm": 0.09915667772293091, + "learning_rate": 4.992472718792537e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1134096, + "step": 569 + }, + { + "epoch": 0.07554671968190854, + "grad_norm": 12.51353931427002, + "learning_rate": 4.992445777048406e-06, + "loss": 0.1673, + "num_input_tokens_seen": 1136064, + "step": 570 + }, + { + "epoch": 0.07567925778661365, + "grad_norm": 0.14763173460960388, + "learning_rate": 4.9924187872482e-06, + "loss": 0.0007, + "num_input_tokens_seen": 1137432, + "step": 571 + }, + { + "epoch": 0.07581179589131876, + "grad_norm": 0.5407018065452576, + "learning_rate": 4.99239174939244e-06, + "loss": 0.0025, + "num_input_tokens_seen": 1139384, + "step": 572 + }, + { + "epoch": 0.07594433399602386, + "grad_norm": 17.991897583007812, + "learning_rate": 4.992364663481649e-06, + "loss": 0.2767, + "num_input_tokens_seen": 1141512, + "step": 573 + }, + { + "epoch": 0.07607687210072896, + "grad_norm": 0.1771698296070099, + "learning_rate": 4.992337529516347e-06, + "loss": 0.0008, + "num_input_tokens_seen": 1143440, + "step": 574 + }, + { + "epoch": 0.07620941020543406, + "grad_norm": 18.083797454833984, + "learning_rate": 4.992310347497058e-06, + "loss": 0.4597, + "num_input_tokens_seen": 1145896, + "step": 575 + }, + { + "epoch": 0.07634194831013917, + "grad_norm": 31.536720275878906, + "learning_rate": 4.992283117424307e-06, + "loss": 1.1379, + "num_input_tokens_seen": 1147712, + "step": 576 + }, + { + "epoch": 0.07647448641484426, + "grad_norm": 13.542040824890137, + "learning_rate": 4.992255839298617e-06, + "loss": 0.3027, + "num_input_tokens_seen": 1150688, + "step": 577 + }, + { + "epoch": 0.07660702451954937, + "grad_norm": 1.3251203298568726, + "learning_rate": 4.9922285131205165e-06, + "loss": 0.0061, + "num_input_tokens_seen": 1153064, + "step": 578 + }, + { + "epoch": 0.07673956262425448, + "grad_norm": 0.8419384360313416, + "learning_rate": 4.99220113889053e-06, + "loss": 0.0106, + "num_input_tokens_seen": 1154960, + "step": 579 + }, + { + "epoch": 0.07687210072895957, + "grad_norm": 21.277334213256836, + "learning_rate": 4.9921737166091854e-06, + "loss": 0.6207, + "num_input_tokens_seen": 1156976, + "step": 580 + }, + { + "epoch": 0.07700463883366468, + "grad_norm": 11.78198528289795, + "learning_rate": 4.992146246277014e-06, + "loss": 0.32, + "num_input_tokens_seen": 1158328, + "step": 581 + }, + { + "epoch": 0.07713717693836979, + "grad_norm": 0.7457808256149292, + "learning_rate": 4.992118727894543e-06, + "loss": 0.0034, + "num_input_tokens_seen": 1159808, + "step": 582 + }, + { + "epoch": 0.07726971504307488, + "grad_norm": 14.351053237915039, + "learning_rate": 4.9920911614623034e-06, + "loss": 0.5199, + "num_input_tokens_seen": 1161808, + "step": 583 + }, + { + "epoch": 0.07740225314777999, + "grad_norm": 17.17698097229004, + "learning_rate": 4.9920635469808274e-06, + "loss": 0.3101, + "num_input_tokens_seen": 1163712, + "step": 584 + }, + { + "epoch": 0.0775347912524851, + "grad_norm": 14.19390869140625, + "learning_rate": 4.992035884450647e-06, + "loss": 0.296, + "num_input_tokens_seen": 1165336, + "step": 585 + }, + { + "epoch": 0.07766732935719019, + "grad_norm": 10.093198776245117, + "learning_rate": 4.992008173872295e-06, + "loss": 0.1378, + "num_input_tokens_seen": 1167368, + "step": 586 + }, + { + "epoch": 0.0777998674618953, + "grad_norm": 19.43902587890625, + "learning_rate": 4.991980415246307e-06, + "loss": 0.2828, + "num_input_tokens_seen": 1169792, + "step": 587 + }, + { + "epoch": 0.0779324055666004, + "grad_norm": 1.6475398540496826, + "learning_rate": 4.9919526085732175e-06, + "loss": 0.0075, + "num_input_tokens_seen": 1171288, + "step": 588 + }, + { + "epoch": 0.0780649436713055, + "grad_norm": 9.919628143310547, + "learning_rate": 4.991924753853562e-06, + "loss": 0.2253, + "num_input_tokens_seen": 1172728, + "step": 589 + }, + { + "epoch": 0.0781974817760106, + "grad_norm": 9.461600303649902, + "learning_rate": 4.991896851087878e-06, + "loss": 0.2042, + "num_input_tokens_seen": 1174664, + "step": 590 + }, + { + "epoch": 0.07833001988071571, + "grad_norm": 0.33462563157081604, + "learning_rate": 4.991868900276705e-06, + "loss": 0.0015, + "num_input_tokens_seen": 1176024, + "step": 591 + }, + { + "epoch": 0.0784625579854208, + "grad_norm": 16.593881607055664, + "learning_rate": 4.991840901420579e-06, + "loss": 0.5138, + "num_input_tokens_seen": 1177640, + "step": 592 + }, + { + "epoch": 0.07859509609012591, + "grad_norm": 3.8829195499420166, + "learning_rate": 4.991812854520043e-06, + "loss": 0.0181, + "num_input_tokens_seen": 1180352, + "step": 593 + }, + { + "epoch": 0.07872763419483102, + "grad_norm": 8.810951232910156, + "learning_rate": 4.991784759575635e-06, + "loss": 0.2146, + "num_input_tokens_seen": 1182048, + "step": 594 + }, + { + "epoch": 0.07886017229953611, + "grad_norm": 13.477286338806152, + "learning_rate": 4.991756616587898e-06, + "loss": 0.1102, + "num_input_tokens_seen": 1183664, + "step": 595 + }, + { + "epoch": 0.07899271040424122, + "grad_norm": 25.641202926635742, + "learning_rate": 4.991728425557375e-06, + "loss": 0.9096, + "num_input_tokens_seen": 1186840, + "step": 596 + }, + { + "epoch": 0.07912524850894632, + "grad_norm": 16.363447189331055, + "learning_rate": 4.991700186484608e-06, + "loss": 0.4999, + "num_input_tokens_seen": 1188680, + "step": 597 + }, + { + "epoch": 0.07925778661365142, + "grad_norm": 0.2997564971446991, + "learning_rate": 4.991671899370143e-06, + "loss": 0.0014, + "num_input_tokens_seen": 1190120, + "step": 598 + }, + { + "epoch": 0.07939032471835653, + "grad_norm": 0.28962504863739014, + "learning_rate": 4.9916435642145245e-06, + "loss": 0.0013, + "num_input_tokens_seen": 1191216, + "step": 599 + }, + { + "epoch": 0.07952286282306163, + "grad_norm": 0.30223163962364197, + "learning_rate": 4.9916151810183e-06, + "loss": 0.0014, + "num_input_tokens_seen": 1192968, + "step": 600 + }, + { + "epoch": 0.07965540092776674, + "grad_norm": 21.49254035949707, + "learning_rate": 4.9915867497820154e-06, + "loss": 0.3617, + "num_input_tokens_seen": 1195656, + "step": 601 + }, + { + "epoch": 0.07978793903247183, + "grad_norm": 13.202689170837402, + "learning_rate": 4.991558270506219e-06, + "loss": 0.0293, + "num_input_tokens_seen": 1197160, + "step": 602 + }, + { + "epoch": 0.07992047713717694, + "grad_norm": 14.31188678741455, + "learning_rate": 4.991529743191461e-06, + "loss": 0.3444, + "num_input_tokens_seen": 1199736, + "step": 603 + }, + { + "epoch": 0.08005301524188205, + "grad_norm": 23.098224639892578, + "learning_rate": 4.991501167838291e-06, + "loss": 0.5607, + "num_input_tokens_seen": 1201960, + "step": 604 + }, + { + "epoch": 0.08018555334658714, + "grad_norm": 13.40975570678711, + "learning_rate": 4.991472544447259e-06, + "loss": 0.1489, + "num_input_tokens_seen": 1203832, + "step": 605 + }, + { + "epoch": 0.08031809145129225, + "grad_norm": 8.271739959716797, + "learning_rate": 4.991443873018917e-06, + "loss": 0.137, + "num_input_tokens_seen": 1207128, + "step": 606 + }, + { + "epoch": 0.08045062955599735, + "grad_norm": 25.32298469543457, + "learning_rate": 4.991415153553819e-06, + "loss": 0.5161, + "num_input_tokens_seen": 1208880, + "step": 607 + }, + { + "epoch": 0.08058316766070245, + "grad_norm": 6.173625469207764, + "learning_rate": 4.991386386052519e-06, + "loss": 0.1023, + "num_input_tokens_seen": 1210880, + "step": 608 + }, + { + "epoch": 0.08071570576540756, + "grad_norm": 4.077023506164551, + "learning_rate": 4.99135757051557e-06, + "loss": 0.0334, + "num_input_tokens_seen": 1212360, + "step": 609 + }, + { + "epoch": 0.08084824387011266, + "grad_norm": 11.888177871704102, + "learning_rate": 4.9913287069435275e-06, + "loss": 0.2108, + "num_input_tokens_seen": 1214440, + "step": 610 + }, + { + "epoch": 0.08098078197481776, + "grad_norm": 0.2590247690677643, + "learning_rate": 4.99129979533695e-06, + "loss": 0.001, + "num_input_tokens_seen": 1216784, + "step": 611 + }, + { + "epoch": 0.08111332007952286, + "grad_norm": 0.09978921711444855, + "learning_rate": 4.991270835696393e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1218104, + "step": 612 + }, + { + "epoch": 0.08124585818422797, + "grad_norm": 28.489778518676758, + "learning_rate": 4.991241828022417e-06, + "loss": 0.5733, + "num_input_tokens_seen": 1219880, + "step": 613 + }, + { + "epoch": 0.08137839628893306, + "grad_norm": 6.184957027435303, + "learning_rate": 4.991212772315579e-06, + "loss": 0.1045, + "num_input_tokens_seen": 1221544, + "step": 614 + }, + { + "epoch": 0.08151093439363817, + "grad_norm": 0.11869768053293228, + "learning_rate": 4.99118366857644e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1223912, + "step": 615 + }, + { + "epoch": 0.08164347249834328, + "grad_norm": 10.143977165222168, + "learning_rate": 4.991154516805562e-06, + "loss": 0.317, + "num_input_tokens_seen": 1226928, + "step": 616 + }, + { + "epoch": 0.08177601060304837, + "grad_norm": 18.1310977935791, + "learning_rate": 4.991125317003506e-06, + "loss": 0.5772, + "num_input_tokens_seen": 1229456, + "step": 617 + }, + { + "epoch": 0.08190854870775348, + "grad_norm": 0.15265947580337524, + "learning_rate": 4.991096069170835e-06, + "loss": 0.0007, + "num_input_tokens_seen": 1230944, + "step": 618 + }, + { + "epoch": 0.08204108681245859, + "grad_norm": 15.075489044189453, + "learning_rate": 4.991066773308113e-06, + "loss": 0.2118, + "num_input_tokens_seen": 1232712, + "step": 619 + }, + { + "epoch": 0.08217362491716368, + "grad_norm": 0.2138608992099762, + "learning_rate": 4.991037429415906e-06, + "loss": 0.001, + "num_input_tokens_seen": 1233984, + "step": 620 + }, + { + "epoch": 0.08230616302186879, + "grad_norm": 9.908183097839355, + "learning_rate": 4.991008037494779e-06, + "loss": 0.2053, + "num_input_tokens_seen": 1235280, + "step": 621 + }, + { + "epoch": 0.0824387011265739, + "grad_norm": 15.478682518005371, + "learning_rate": 4.990978597545297e-06, + "loss": 0.2799, + "num_input_tokens_seen": 1237448, + "step": 622 + }, + { + "epoch": 0.08257123923127899, + "grad_norm": 19.739458084106445, + "learning_rate": 4.9909491095680306e-06, + "loss": 0.4469, + "num_input_tokens_seen": 1239672, + "step": 623 + }, + { + "epoch": 0.0827037773359841, + "grad_norm": 8.118733406066895, + "learning_rate": 4.990919573563546e-06, + "loss": 0.1296, + "num_input_tokens_seen": 1240872, + "step": 624 + }, + { + "epoch": 0.0828363154406892, + "grad_norm": 9.600885391235352, + "learning_rate": 4.990889989532414e-06, + "loss": 0.2823, + "num_input_tokens_seen": 1243056, + "step": 625 + }, + { + "epoch": 0.0829688535453943, + "grad_norm": 21.85724639892578, + "learning_rate": 4.990860357475206e-06, + "loss": 0.4024, + "num_input_tokens_seen": 1245072, + "step": 626 + }, + { + "epoch": 0.0831013916500994, + "grad_norm": 1.1288819313049316, + "learning_rate": 4.9908306773924904e-06, + "loss": 0.0052, + "num_input_tokens_seen": 1246392, + "step": 627 + }, + { + "epoch": 0.08323392975480451, + "grad_norm": 13.654850006103516, + "learning_rate": 4.990800949284842e-06, + "loss": 0.2984, + "num_input_tokens_seen": 1248224, + "step": 628 + }, + { + "epoch": 0.08336646785950962, + "grad_norm": 3.5629279613494873, + "learning_rate": 4.990771173152832e-06, + "loss": 0.0182, + "num_input_tokens_seen": 1249912, + "step": 629 + }, + { + "epoch": 0.08349900596421471, + "grad_norm": 0.7646605968475342, + "learning_rate": 4.990741348997036e-06, + "loss": 0.0035, + "num_input_tokens_seen": 1251288, + "step": 630 + }, + { + "epoch": 0.08363154406891982, + "grad_norm": 27.763885498046875, + "learning_rate": 4.990711476818027e-06, + "loss": 0.5743, + "num_input_tokens_seen": 1253304, + "step": 631 + }, + { + "epoch": 0.08376408217362492, + "grad_norm": 26.014076232910156, + "learning_rate": 4.990681556616385e-06, + "loss": 0.5963, + "num_input_tokens_seen": 1255944, + "step": 632 + }, + { + "epoch": 0.08389662027833002, + "grad_norm": 17.14945411682129, + "learning_rate": 4.990651588392683e-06, + "loss": 0.46, + "num_input_tokens_seen": 1257712, + "step": 633 + }, + { + "epoch": 0.08402915838303512, + "grad_norm": 8.238128662109375, + "learning_rate": 4.990621572147501e-06, + "loss": 0.2309, + "num_input_tokens_seen": 1259400, + "step": 634 + }, + { + "epoch": 0.08416169648774023, + "grad_norm": 17.069917678833008, + "learning_rate": 4.990591507881416e-06, + "loss": 0.5602, + "num_input_tokens_seen": 1261352, + "step": 635 + }, + { + "epoch": 0.08429423459244532, + "grad_norm": 16.171161651611328, + "learning_rate": 4.990561395595008e-06, + "loss": 0.3492, + "num_input_tokens_seen": 1264176, + "step": 636 + }, + { + "epoch": 0.08442677269715043, + "grad_norm": 0.11699037998914719, + "learning_rate": 4.990531235288859e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1266072, + "step": 637 + }, + { + "epoch": 0.08455931080185554, + "grad_norm": 14.559609413146973, + "learning_rate": 4.99050102696355e-06, + "loss": 0.4229, + "num_input_tokens_seen": 1267752, + "step": 638 + }, + { + "epoch": 0.08469184890656063, + "grad_norm": 6.065984725952148, + "learning_rate": 4.9904707706196634e-06, + "loss": 0.1386, + "num_input_tokens_seen": 1269160, + "step": 639 + }, + { + "epoch": 0.08482438701126574, + "grad_norm": 11.977204322814941, + "learning_rate": 4.990440466257781e-06, + "loss": 0.2277, + "num_input_tokens_seen": 1271328, + "step": 640 + }, + { + "epoch": 0.08495692511597085, + "grad_norm": 0.07289767265319824, + "learning_rate": 4.990410113878489e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1272608, + "step": 641 + }, + { + "epoch": 0.08508946322067594, + "grad_norm": 7.814647197723389, + "learning_rate": 4.990379713482372e-06, + "loss": 0.1684, + "num_input_tokens_seen": 1275256, + "step": 642 + }, + { + "epoch": 0.08522200132538105, + "grad_norm": 0.3539038300514221, + "learning_rate": 4.9903492650700155e-06, + "loss": 0.0016, + "num_input_tokens_seen": 1276824, + "step": 643 + }, + { + "epoch": 0.08535453943008615, + "grad_norm": 0.5345784425735474, + "learning_rate": 4.9903187686420076e-06, + "loss": 0.0021, + "num_input_tokens_seen": 1277992, + "step": 644 + }, + { + "epoch": 0.08548707753479125, + "grad_norm": 2.2201688289642334, + "learning_rate": 4.990288224198936e-06, + "loss": 0.0041, + "num_input_tokens_seen": 1279720, + "step": 645 + }, + { + "epoch": 0.08561961563949635, + "grad_norm": 9.425178527832031, + "learning_rate": 4.99025763174139e-06, + "loss": 0.1398, + "num_input_tokens_seen": 1281968, + "step": 646 + }, + { + "epoch": 0.08575215374420146, + "grad_norm": 0.05981895327568054, + "learning_rate": 4.990226991269957e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1283296, + "step": 647 + }, + { + "epoch": 0.08588469184890656, + "grad_norm": 0.1609344780445099, + "learning_rate": 4.990196302785232e-06, + "loss": 0.0007, + "num_input_tokens_seen": 1284600, + "step": 648 + }, + { + "epoch": 0.08601722995361166, + "grad_norm": 0.07702692598104477, + "learning_rate": 4.990165566287803e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1286848, + "step": 649 + }, + { + "epoch": 0.08614976805831677, + "grad_norm": 12.37551498413086, + "learning_rate": 4.990134781778264e-06, + "loss": 0.1459, + "num_input_tokens_seen": 1288568, + "step": 650 + }, + { + "epoch": 0.08628230616302186, + "grad_norm": 19.112503051757812, + "learning_rate": 4.99010394925721e-06, + "loss": 0.2366, + "num_input_tokens_seen": 1290192, + "step": 651 + }, + { + "epoch": 0.08641484426772697, + "grad_norm": 12.018131256103516, + "learning_rate": 4.990073068725232e-06, + "loss": 0.0659, + "num_input_tokens_seen": 1291920, + "step": 652 + }, + { + "epoch": 0.08654738237243208, + "grad_norm": 20.139245986938477, + "learning_rate": 4.990042140182929e-06, + "loss": 0.3819, + "num_input_tokens_seen": 1294288, + "step": 653 + }, + { + "epoch": 0.08667992047713717, + "grad_norm": 0.020606258884072304, + "learning_rate": 4.990011163630894e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1295936, + "step": 654 + }, + { + "epoch": 0.08681245858184228, + "grad_norm": 18.525423049926758, + "learning_rate": 4.9899801390697275e-06, + "loss": 0.5304, + "num_input_tokens_seen": 1298160, + "step": 655 + }, + { + "epoch": 0.08694499668654739, + "grad_norm": 12.277786254882812, + "learning_rate": 4.989949066500026e-06, + "loss": 0.2501, + "num_input_tokens_seen": 1301304, + "step": 656 + }, + { + "epoch": 0.08707753479125248, + "grad_norm": 20.86549949645996, + "learning_rate": 4.989917945922389e-06, + "loss": 0.3786, + "num_input_tokens_seen": 1303488, + "step": 657 + }, + { + "epoch": 0.08721007289595759, + "grad_norm": 16.710758209228516, + "learning_rate": 4.989886777337416e-06, + "loss": 0.2996, + "num_input_tokens_seen": 1306000, + "step": 658 + }, + { + "epoch": 0.08734261100066269, + "grad_norm": 14.10604190826416, + "learning_rate": 4.9898555607457075e-06, + "loss": 0.1954, + "num_input_tokens_seen": 1308600, + "step": 659 + }, + { + "epoch": 0.0874751491053678, + "grad_norm": 6.774168968200684, + "learning_rate": 4.9898242961478675e-06, + "loss": 0.13, + "num_input_tokens_seen": 1310120, + "step": 660 + }, + { + "epoch": 0.0876076872100729, + "grad_norm": 20.237262725830078, + "learning_rate": 4.989792983544496e-06, + "loss": 0.087, + "num_input_tokens_seen": 1312000, + "step": 661 + }, + { + "epoch": 0.087740225314778, + "grad_norm": 9.918155670166016, + "learning_rate": 4.989761622936199e-06, + "loss": 0.0958, + "num_input_tokens_seen": 1314680, + "step": 662 + }, + { + "epoch": 0.08787276341948311, + "grad_norm": 0.10521414130926132, + "learning_rate": 4.9897302143235805e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1316064, + "step": 663 + }, + { + "epoch": 0.0880053015241882, + "grad_norm": 0.02062806487083435, + "learning_rate": 4.989698757707246e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1317280, + "step": 664 + }, + { + "epoch": 0.08813783962889331, + "grad_norm": 7.562278747558594, + "learning_rate": 4.9896672530878004e-06, + "loss": 0.0489, + "num_input_tokens_seen": 1318816, + "step": 665 + }, + { + "epoch": 0.08827037773359842, + "grad_norm": 8.728992462158203, + "learning_rate": 4.989635700465854e-06, + "loss": 0.0293, + "num_input_tokens_seen": 1320984, + "step": 666 + }, + { + "epoch": 0.08840291583830351, + "grad_norm": 19.341419219970703, + "learning_rate": 4.989604099842013e-06, + "loss": 0.535, + "num_input_tokens_seen": 1323600, + "step": 667 + }, + { + "epoch": 0.08853545394300862, + "grad_norm": 3.3183066844940186, + "learning_rate": 4.989572451216888e-06, + "loss": 0.0097, + "num_input_tokens_seen": 1326272, + "step": 668 + }, + { + "epoch": 0.08866799204771372, + "grad_norm": 0.23016738891601562, + "learning_rate": 4.989540754591088e-06, + "loss": 0.0006, + "num_input_tokens_seen": 1327888, + "step": 669 + }, + { + "epoch": 0.08880053015241882, + "grad_norm": 2.326965808868408, + "learning_rate": 4.989509009965226e-06, + "loss": 0.0098, + "num_input_tokens_seen": 1329992, + "step": 670 + }, + { + "epoch": 0.08893306825712392, + "grad_norm": 10.426582336425781, + "learning_rate": 4.989477217339912e-06, + "loss": 0.2614, + "num_input_tokens_seen": 1331000, + "step": 671 + }, + { + "epoch": 0.08906560636182903, + "grad_norm": 14.278281211853027, + "learning_rate": 4.98944537671576e-06, + "loss": 0.488, + "num_input_tokens_seen": 1333088, + "step": 672 + }, + { + "epoch": 0.08919814446653412, + "grad_norm": 12.702383041381836, + "learning_rate": 4.989413488093384e-06, + "loss": 0.2876, + "num_input_tokens_seen": 1334856, + "step": 673 + }, + { + "epoch": 0.08933068257123923, + "grad_norm": 19.054916381835938, + "learning_rate": 4.989381551473399e-06, + "loss": 0.307, + "num_input_tokens_seen": 1336176, + "step": 674 + }, + { + "epoch": 0.08946322067594434, + "grad_norm": 30.507787704467773, + "learning_rate": 4.989349566856419e-06, + "loss": 1.1632, + "num_input_tokens_seen": 1339584, + "step": 675 + }, + { + "epoch": 0.08959575878064943, + "grad_norm": 0.01704728789627552, + "learning_rate": 4.989317534243063e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1342520, + "step": 676 + }, + { + "epoch": 0.08972829688535454, + "grad_norm": 16.835655212402344, + "learning_rate": 4.989285453633948e-06, + "loss": 0.2461, + "num_input_tokens_seen": 1344832, + "step": 677 + }, + { + "epoch": 0.08986083499005965, + "grad_norm": 8.752105712890625, + "learning_rate": 4.989253325029691e-06, + "loss": 0.1929, + "num_input_tokens_seen": 1346968, + "step": 678 + }, + { + "epoch": 0.08999337309476474, + "grad_norm": 9.870306015014648, + "learning_rate": 4.989221148430913e-06, + "loss": 0.1515, + "num_input_tokens_seen": 1349128, + "step": 679 + }, + { + "epoch": 0.09012591119946985, + "grad_norm": 26.429723739624023, + "learning_rate": 4.989188923838235e-06, + "loss": 1.103, + "num_input_tokens_seen": 1351552, + "step": 680 + }, + { + "epoch": 0.09025844930417495, + "grad_norm": 27.696903228759766, + "learning_rate": 4.989156651252276e-06, + "loss": 0.4898, + "num_input_tokens_seen": 1352760, + "step": 681 + }, + { + "epoch": 0.09039098740888005, + "grad_norm": 13.558822631835938, + "learning_rate": 4.989124330673661e-06, + "loss": 0.2836, + "num_input_tokens_seen": 1354720, + "step": 682 + }, + { + "epoch": 0.09052352551358515, + "grad_norm": 9.587841033935547, + "learning_rate": 4.989091962103011e-06, + "loss": 0.1231, + "num_input_tokens_seen": 1356160, + "step": 683 + }, + { + "epoch": 0.09065606361829026, + "grad_norm": 15.322427749633789, + "learning_rate": 4.98905954554095e-06, + "loss": 0.4212, + "num_input_tokens_seen": 1358464, + "step": 684 + }, + { + "epoch": 0.09078860172299535, + "grad_norm": 20.8477783203125, + "learning_rate": 4.9890270809881046e-06, + "loss": 0.2775, + "num_input_tokens_seen": 1360008, + "step": 685 + }, + { + "epoch": 0.09092113982770046, + "grad_norm": 0.48768192529678345, + "learning_rate": 4.9889945684451e-06, + "loss": 0.002, + "num_input_tokens_seen": 1362760, + "step": 686 + }, + { + "epoch": 0.09105367793240557, + "grad_norm": 13.189520835876465, + "learning_rate": 4.988962007912564e-06, + "loss": 0.1808, + "num_input_tokens_seen": 1364992, + "step": 687 + }, + { + "epoch": 0.09118621603711068, + "grad_norm": 9.489797592163086, + "learning_rate": 4.988929399391122e-06, + "loss": 0.175, + "num_input_tokens_seen": 1366232, + "step": 688 + }, + { + "epoch": 0.09131875414181577, + "grad_norm": 12.343358039855957, + "learning_rate": 4.988896742881405e-06, + "loss": 0.1203, + "num_input_tokens_seen": 1367944, + "step": 689 + }, + { + "epoch": 0.09145129224652088, + "grad_norm": 342.6282043457031, + "learning_rate": 4.988864038384041e-06, + "loss": 1.5204, + "num_input_tokens_seen": 1369784, + "step": 690 + }, + { + "epoch": 0.09158383035122598, + "grad_norm": 15.843669891357422, + "learning_rate": 4.988831285899663e-06, + "loss": 0.0256, + "num_input_tokens_seen": 1371432, + "step": 691 + }, + { + "epoch": 0.09171636845593108, + "grad_norm": 2.5344300270080566, + "learning_rate": 4.988798485428898e-06, + "loss": 0.0108, + "num_input_tokens_seen": 1372928, + "step": 692 + }, + { + "epoch": 0.09184890656063618, + "grad_norm": 7.959270000457764, + "learning_rate": 4.9887656369723835e-06, + "loss": 0.297, + "num_input_tokens_seen": 1375088, + "step": 693 + }, + { + "epoch": 0.09198144466534129, + "grad_norm": 15.053533554077148, + "learning_rate": 4.988732740530751e-06, + "loss": 0.3799, + "num_input_tokens_seen": 1376720, + "step": 694 + }, + { + "epoch": 0.09211398277004638, + "grad_norm": 15.338154792785645, + "learning_rate": 4.988699796104632e-06, + "loss": 0.4782, + "num_input_tokens_seen": 1378656, + "step": 695 + }, + { + "epoch": 0.09224652087475149, + "grad_norm": 24.729204177856445, + "learning_rate": 4.988666803694666e-06, + "loss": 0.2049, + "num_input_tokens_seen": 1380128, + "step": 696 + }, + { + "epoch": 0.0923790589794566, + "grad_norm": 9.287247657775879, + "learning_rate": 4.988633763301487e-06, + "loss": 0.1624, + "num_input_tokens_seen": 1381952, + "step": 697 + }, + { + "epoch": 0.09251159708416169, + "grad_norm": 0.44355037808418274, + "learning_rate": 4.988600674925732e-06, + "loss": 0.002, + "num_input_tokens_seen": 1383208, + "step": 698 + }, + { + "epoch": 0.0926441351888668, + "grad_norm": 1.12647545337677, + "learning_rate": 4.988567538568039e-06, + "loss": 0.0045, + "num_input_tokens_seen": 1384728, + "step": 699 + }, + { + "epoch": 0.0927766732935719, + "grad_norm": 13.151799201965332, + "learning_rate": 4.988534354229047e-06, + "loss": 0.0929, + "num_input_tokens_seen": 1386360, + "step": 700 + }, + { + "epoch": 0.092909211398277, + "grad_norm": 53.58653259277344, + "learning_rate": 4.988501121909396e-06, + "loss": 0.2849, + "num_input_tokens_seen": 1389464, + "step": 701 + }, + { + "epoch": 0.09304174950298211, + "grad_norm": 15.320070266723633, + "learning_rate": 4.988467841609727e-06, + "loss": 0.1257, + "num_input_tokens_seen": 1391072, + "step": 702 + }, + { + "epoch": 0.09317428760768721, + "grad_norm": 23.33917999267578, + "learning_rate": 4.988434513330681e-06, + "loss": 0.2425, + "num_input_tokens_seen": 1393744, + "step": 703 + }, + { + "epoch": 0.09330682571239231, + "grad_norm": 20.63372230529785, + "learning_rate": 4.988401137072901e-06, + "loss": 0.39, + "num_input_tokens_seen": 1395936, + "step": 704 + }, + { + "epoch": 0.09343936381709742, + "grad_norm": 15.035074234008789, + "learning_rate": 4.98836771283703e-06, + "loss": 0.5303, + "num_input_tokens_seen": 1397912, + "step": 705 + }, + { + "epoch": 0.09357190192180252, + "grad_norm": 14.882782936096191, + "learning_rate": 4.988334240623713e-06, + "loss": 0.2208, + "num_input_tokens_seen": 1399960, + "step": 706 + }, + { + "epoch": 0.09370444002650762, + "grad_norm": 0.6375334858894348, + "learning_rate": 4.9883007204335956e-06, + "loss": 0.0026, + "num_input_tokens_seen": 1402368, + "step": 707 + }, + { + "epoch": 0.09383697813121272, + "grad_norm": 14.368058204650879, + "learning_rate": 4.988267152267324e-06, + "loss": 0.21, + "num_input_tokens_seen": 1403632, + "step": 708 + }, + { + "epoch": 0.09396951623591783, + "grad_norm": 0.14596626162528992, + "learning_rate": 4.988233536125544e-06, + "loss": 0.0007, + "num_input_tokens_seen": 1405800, + "step": 709 + }, + { + "epoch": 0.09410205434062292, + "grad_norm": 13.29217529296875, + "learning_rate": 4.988199872008906e-06, + "loss": 0.434, + "num_input_tokens_seen": 1408168, + "step": 710 + }, + { + "epoch": 0.09423459244532803, + "grad_norm": 0.041853271424770355, + "learning_rate": 4.988166159918057e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1410112, + "step": 711 + }, + { + "epoch": 0.09436713055003314, + "grad_norm": 14.671149253845215, + "learning_rate": 4.988132399853649e-06, + "loss": 0.4687, + "num_input_tokens_seen": 1412880, + "step": 712 + }, + { + "epoch": 0.09449966865473823, + "grad_norm": 19.114940643310547, + "learning_rate": 4.988098591816332e-06, + "loss": 0.3795, + "num_input_tokens_seen": 1414648, + "step": 713 + }, + { + "epoch": 0.09463220675944334, + "grad_norm": 18.239574432373047, + "learning_rate": 4.988064735806757e-06, + "loss": 0.2361, + "num_input_tokens_seen": 1416168, + "step": 714 + }, + { + "epoch": 0.09476474486414845, + "grad_norm": 28.464008331298828, + "learning_rate": 4.988030831825577e-06, + "loss": 0.4571, + "num_input_tokens_seen": 1418120, + "step": 715 + }, + { + "epoch": 0.09489728296885354, + "grad_norm": 0.10835893452167511, + "learning_rate": 4.9879968798734465e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1420168, + "step": 716 + }, + { + "epoch": 0.09502982107355865, + "grad_norm": 15.603118896484375, + "learning_rate": 4.987962879951021e-06, + "loss": 0.3054, + "num_input_tokens_seen": 1422728, + "step": 717 + }, + { + "epoch": 0.09516235917826375, + "grad_norm": 12.232925415039062, + "learning_rate": 4.987928832058952e-06, + "loss": 0.292, + "num_input_tokens_seen": 1424600, + "step": 718 + }, + { + "epoch": 0.09529489728296886, + "grad_norm": 19.580366134643555, + "learning_rate": 4.987894736197901e-06, + "loss": 0.4345, + "num_input_tokens_seen": 1426968, + "step": 719 + }, + { + "epoch": 0.09542743538767395, + "grad_norm": 546.7924194335938, + "learning_rate": 4.987860592368523e-06, + "loss": 0.6036, + "num_input_tokens_seen": 1430096, + "step": 720 + }, + { + "epoch": 0.09555997349237906, + "grad_norm": 96.1352310180664, + "learning_rate": 4.987826400571476e-06, + "loss": 1.4265, + "num_input_tokens_seen": 1434192, + "step": 721 + }, + { + "epoch": 0.09569251159708417, + "grad_norm": 22.91558265686035, + "learning_rate": 4.98779216080742e-06, + "loss": 0.2517, + "num_input_tokens_seen": 1436480, + "step": 722 + }, + { + "epoch": 0.09582504970178926, + "grad_norm": 56.499263763427734, + "learning_rate": 4.987757873077015e-06, + "loss": 0.1272, + "num_input_tokens_seen": 1437936, + "step": 723 + }, + { + "epoch": 0.09595758780649437, + "grad_norm": 23.61193084716797, + "learning_rate": 4.9877235373809216e-06, + "loss": 0.2669, + "num_input_tokens_seen": 1439720, + "step": 724 + }, + { + "epoch": 0.09609012591119948, + "grad_norm": 23.29979133605957, + "learning_rate": 4.987689153719802e-06, + "loss": 0.5032, + "num_input_tokens_seen": 1442256, + "step": 725 + }, + { + "epoch": 0.09622266401590457, + "grad_norm": 5.512513160705566, + "learning_rate": 4.987654722094321e-06, + "loss": 0.0438, + "num_input_tokens_seen": 1443728, + "step": 726 + }, + { + "epoch": 0.09635520212060968, + "grad_norm": 24.879680633544922, + "learning_rate": 4.987620242505139e-06, + "loss": 0.4635, + "num_input_tokens_seen": 1445640, + "step": 727 + }, + { + "epoch": 0.09648774022531478, + "grad_norm": 11.99130630493164, + "learning_rate": 4.987585714952923e-06, + "loss": 0.1229, + "num_input_tokens_seen": 1447824, + "step": 728 + }, + { + "epoch": 0.09662027833001988, + "grad_norm": 1.627767562866211, + "learning_rate": 4.987551139438339e-06, + "loss": 0.0078, + "num_input_tokens_seen": 1450440, + "step": 729 + }, + { + "epoch": 0.09675281643472498, + "grad_norm": 1.4350247383117676, + "learning_rate": 4.987516515962052e-06, + "loss": 0.0072, + "num_input_tokens_seen": 1452416, + "step": 730 + }, + { + "epoch": 0.09688535453943009, + "grad_norm": 11.842101097106934, + "learning_rate": 4.987481844524731e-06, + "loss": 0.1608, + "num_input_tokens_seen": 1454432, + "step": 731 + }, + { + "epoch": 0.09701789264413518, + "grad_norm": 36.77568054199219, + "learning_rate": 4.987447125127044e-06, + "loss": 1.6341, + "num_input_tokens_seen": 1457456, + "step": 732 + }, + { + "epoch": 0.09715043074884029, + "grad_norm": 16.38068962097168, + "learning_rate": 4.987412357769661e-06, + "loss": 0.1922, + "num_input_tokens_seen": 1459752, + "step": 733 + }, + { + "epoch": 0.0972829688535454, + "grad_norm": 19.159032821655273, + "learning_rate": 4.9873775424532515e-06, + "loss": 0.3839, + "num_input_tokens_seen": 1462136, + "step": 734 + }, + { + "epoch": 0.09741550695825049, + "grad_norm": 15.491409301757812, + "learning_rate": 4.987342679178486e-06, + "loss": 0.366, + "num_input_tokens_seen": 1464608, + "step": 735 + }, + { + "epoch": 0.0975480450629556, + "grad_norm": 12.078960418701172, + "learning_rate": 4.987307767946039e-06, + "loss": 0.3364, + "num_input_tokens_seen": 1467048, + "step": 736 + }, + { + "epoch": 0.0976805831676607, + "grad_norm": 9.48757553100586, + "learning_rate": 4.987272808756581e-06, + "loss": 0.1777, + "num_input_tokens_seen": 1468840, + "step": 737 + }, + { + "epoch": 0.0978131212723658, + "grad_norm": 745.883056640625, + "learning_rate": 4.987237801610788e-06, + "loss": 0.2046, + "num_input_tokens_seen": 1470272, + "step": 738 + }, + { + "epoch": 0.0979456593770709, + "grad_norm": 1.5754197835922241, + "learning_rate": 4.987202746509335e-06, + "loss": 0.0029, + "num_input_tokens_seen": 1471808, + "step": 739 + }, + { + "epoch": 0.09807819748177601, + "grad_norm": 0.10619448125362396, + "learning_rate": 4.9871676434528966e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1473296, + "step": 740 + }, + { + "epoch": 0.09821073558648111, + "grad_norm": 9.894831657409668, + "learning_rate": 4.98713249244215e-06, + "loss": 0.1444, + "num_input_tokens_seen": 1475344, + "step": 741 + }, + { + "epoch": 0.09834327369118621, + "grad_norm": 5.439673900604248, + "learning_rate": 4.987097293477774e-06, + "loss": 0.0632, + "num_input_tokens_seen": 1477456, + "step": 742 + }, + { + "epoch": 0.09847581179589132, + "grad_norm": 20.135883331298828, + "learning_rate": 4.987062046560446e-06, + "loss": 0.5811, + "num_input_tokens_seen": 1479344, + "step": 743 + }, + { + "epoch": 0.09860834990059641, + "grad_norm": 32.169254302978516, + "learning_rate": 4.987026751690845e-06, + "loss": 0.5984, + "num_input_tokens_seen": 1481352, + "step": 744 + }, + { + "epoch": 0.09874088800530152, + "grad_norm": 0.05301506444811821, + "learning_rate": 4.986991408869654e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1484000, + "step": 745 + }, + { + "epoch": 0.09887342611000663, + "grad_norm": 25.11466407775879, + "learning_rate": 4.986956018097552e-06, + "loss": 0.2294, + "num_input_tokens_seen": 1486248, + "step": 746 + }, + { + "epoch": 0.09900596421471174, + "grad_norm": 12.702735900878906, + "learning_rate": 4.986920579375223e-06, + "loss": 0.2259, + "num_input_tokens_seen": 1487488, + "step": 747 + }, + { + "epoch": 0.09913850231941683, + "grad_norm": 9.419434547424316, + "learning_rate": 4.986885092703349e-06, + "loss": 0.1453, + "num_input_tokens_seen": 1490616, + "step": 748 + }, + { + "epoch": 0.09927104042412194, + "grad_norm": 5.48451042175293, + "learning_rate": 4.986849558082615e-06, + "loss": 0.1653, + "num_input_tokens_seen": 1492584, + "step": 749 + }, + { + "epoch": 0.09940357852882704, + "grad_norm": 6.994708061218262, + "learning_rate": 4.986813975513706e-06, + "loss": 0.0853, + "num_input_tokens_seen": 1495160, + "step": 750 + }, + { + "epoch": 0.09953611663353214, + "grad_norm": 14.051322937011719, + "learning_rate": 4.986778344997308e-06, + "loss": 0.3267, + "num_input_tokens_seen": 1497128, + "step": 751 + }, + { + "epoch": 0.09966865473823724, + "grad_norm": 13.732600212097168, + "learning_rate": 4.986742666534108e-06, + "loss": 0.1131, + "num_input_tokens_seen": 1498752, + "step": 752 + }, + { + "epoch": 0.09980119284294235, + "grad_norm": 25.390104293823242, + "learning_rate": 4.986706940124794e-06, + "loss": 0.5778, + "num_input_tokens_seen": 1500480, + "step": 753 + }, + { + "epoch": 0.09993373094764745, + "grad_norm": 14.873355865478516, + "learning_rate": 4.986671165770054e-06, + "loss": 0.252, + "num_input_tokens_seen": 1502464, + "step": 754 + }, + { + "epoch": 0.10006626905235255, + "grad_norm": 16.085914611816406, + "learning_rate": 4.986635343470579e-06, + "loss": 0.2568, + "num_input_tokens_seen": 1504296, + "step": 755 + }, + { + "epoch": 0.10019880715705766, + "grad_norm": 26.34661102294922, + "learning_rate": 4.98659947322706e-06, + "loss": 0.4456, + "num_input_tokens_seen": 1506384, + "step": 756 + }, + { + "epoch": 0.10033134526176275, + "grad_norm": 1.040360927581787, + "learning_rate": 4.986563555040186e-06, + "loss": 0.0054, + "num_input_tokens_seen": 1507712, + "step": 757 + }, + { + "epoch": 0.10046388336646786, + "grad_norm": 42.569305419921875, + "learning_rate": 4.986527588910652e-06, + "loss": 0.5666, + "num_input_tokens_seen": 1511024, + "step": 758 + }, + { + "epoch": 0.10059642147117297, + "grad_norm": 17.265544891357422, + "learning_rate": 4.986491574839151e-06, + "loss": 0.5337, + "num_input_tokens_seen": 1513336, + "step": 759 + }, + { + "epoch": 0.10072895957587806, + "grad_norm": 11.115621566772461, + "learning_rate": 4.986455512826377e-06, + "loss": 0.078, + "num_input_tokens_seen": 1515248, + "step": 760 + }, + { + "epoch": 0.10086149768058317, + "grad_norm": 14.317075729370117, + "learning_rate": 4.986419402873025e-06, + "loss": 0.4713, + "num_input_tokens_seen": 1517360, + "step": 761 + }, + { + "epoch": 0.10099403578528827, + "grad_norm": 10.884073257446289, + "learning_rate": 4.986383244979792e-06, + "loss": 0.0761, + "num_input_tokens_seen": 1519016, + "step": 762 + }, + { + "epoch": 0.10112657388999337, + "grad_norm": 18.775115966796875, + "learning_rate": 4.9863470391473745e-06, + "loss": 0.4286, + "num_input_tokens_seen": 1520384, + "step": 763 + }, + { + "epoch": 0.10125911199469848, + "grad_norm": 4.701003551483154, + "learning_rate": 4.986310785376471e-06, + "loss": 0.0202, + "num_input_tokens_seen": 1521808, + "step": 764 + }, + { + "epoch": 0.10139165009940358, + "grad_norm": 14.515257835388184, + "learning_rate": 4.98627448366778e-06, + "loss": 0.3837, + "num_input_tokens_seen": 1523400, + "step": 765 + }, + { + "epoch": 0.10152418820410868, + "grad_norm": 3.0291593074798584, + "learning_rate": 4.986238134022003e-06, + "loss": 0.0159, + "num_input_tokens_seen": 1525736, + "step": 766 + }, + { + "epoch": 0.10165672630881378, + "grad_norm": 0.7638363242149353, + "learning_rate": 4.986201736439838e-06, + "loss": 0.0034, + "num_input_tokens_seen": 1527536, + "step": 767 + }, + { + "epoch": 0.10178926441351889, + "grad_norm": 1.6707223653793335, + "learning_rate": 4.98616529092199e-06, + "loss": 0.0085, + "num_input_tokens_seen": 1529320, + "step": 768 + }, + { + "epoch": 0.10192180251822398, + "grad_norm": 0.04035111889243126, + "learning_rate": 4.9861287974691594e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1530912, + "step": 769 + }, + { + "epoch": 0.10205434062292909, + "grad_norm": 17.018436431884766, + "learning_rate": 4.98609225608205e-06, + "loss": 0.6156, + "num_input_tokens_seen": 1533232, + "step": 770 + }, + { + "epoch": 0.1021868787276342, + "grad_norm": 20.329517364501953, + "learning_rate": 4.986055666761367e-06, + "loss": 0.6314, + "num_input_tokens_seen": 1534720, + "step": 771 + }, + { + "epoch": 0.10231941683233929, + "grad_norm": 0.01515394076704979, + "learning_rate": 4.986019029507816e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1536128, + "step": 772 + }, + { + "epoch": 0.1024519549370444, + "grad_norm": 0.06699056178331375, + "learning_rate": 4.985982344322103e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1537712, + "step": 773 + }, + { + "epoch": 0.1025844930417495, + "grad_norm": 18.549163818359375, + "learning_rate": 4.985945611204936e-06, + "loss": 0.4909, + "num_input_tokens_seen": 1539952, + "step": 774 + }, + { + "epoch": 0.10271703114645461, + "grad_norm": 15.423291206359863, + "learning_rate": 4.985908830157021e-06, + "loss": 0.4519, + "num_input_tokens_seen": 1542168, + "step": 775 + }, + { + "epoch": 0.1028495692511597, + "grad_norm": 11.36609172821045, + "learning_rate": 4.98587200117907e-06, + "loss": 0.2412, + "num_input_tokens_seen": 1544152, + "step": 776 + }, + { + "epoch": 0.10298210735586481, + "grad_norm": 30.567285537719727, + "learning_rate": 4.985835124271791e-06, + "loss": 0.8359, + "num_input_tokens_seen": 1546608, + "step": 777 + }, + { + "epoch": 0.10311464546056992, + "grad_norm": 15.79310131072998, + "learning_rate": 4.985798199435897e-06, + "loss": 0.4973, + "num_input_tokens_seen": 1549064, + "step": 778 + }, + { + "epoch": 0.10324718356527501, + "grad_norm": 19.374916076660156, + "learning_rate": 4.985761226672099e-06, + "loss": 0.4881, + "num_input_tokens_seen": 1551144, + "step": 779 + }, + { + "epoch": 0.10337972166998012, + "grad_norm": 11.14467716217041, + "learning_rate": 4.985724205981109e-06, + "loss": 0.239, + "num_input_tokens_seen": 1552944, + "step": 780 + }, + { + "epoch": 0.10351225977468523, + "grad_norm": 0.022233571857213974, + "learning_rate": 4.985687137363642e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1554288, + "step": 781 + }, + { + "epoch": 0.10364479787939032, + "grad_norm": 7.771042346954346, + "learning_rate": 4.9856500208204115e-06, + "loss": 0.1592, + "num_input_tokens_seen": 1555704, + "step": 782 + }, + { + "epoch": 0.10377733598409543, + "grad_norm": 19.567974090576172, + "learning_rate": 4.985612856352135e-06, + "loss": 0.4304, + "num_input_tokens_seen": 1557880, + "step": 783 + }, + { + "epoch": 0.10390987408880054, + "grad_norm": 21.69396209716797, + "learning_rate": 4.985575643959526e-06, + "loss": 0.8001, + "num_input_tokens_seen": 1559864, + "step": 784 + }, + { + "epoch": 0.10404241219350563, + "grad_norm": 20.777292251586914, + "learning_rate": 4.985538383643305e-06, + "loss": 0.5572, + "num_input_tokens_seen": 1562088, + "step": 785 + }, + { + "epoch": 0.10417495029821074, + "grad_norm": 0.2171250283718109, + "learning_rate": 4.98550107540419e-06, + "loss": 0.0011, + "num_input_tokens_seen": 1564152, + "step": 786 + }, + { + "epoch": 0.10430748840291584, + "grad_norm": 9.17831039428711, + "learning_rate": 4.985463719242899e-06, + "loss": 0.1282, + "num_input_tokens_seen": 1565840, + "step": 787 + }, + { + "epoch": 0.10444002650762094, + "grad_norm": 19.05247688293457, + "learning_rate": 4.985426315160152e-06, + "loss": 0.2431, + "num_input_tokens_seen": 1568152, + "step": 788 + }, + { + "epoch": 0.10457256461232604, + "grad_norm": 8.950368881225586, + "learning_rate": 4.985388863156672e-06, + "loss": 0.2835, + "num_input_tokens_seen": 1570136, + "step": 789 + }, + { + "epoch": 0.10470510271703115, + "grad_norm": 0.6779446601867676, + "learning_rate": 4.985351363233179e-06, + "loss": 0.0033, + "num_input_tokens_seen": 1571696, + "step": 790 + }, + { + "epoch": 0.10483764082173624, + "grad_norm": 17.873313903808594, + "learning_rate": 4.985313815390399e-06, + "loss": 0.5607, + "num_input_tokens_seen": 1573920, + "step": 791 + }, + { + "epoch": 0.10497017892644135, + "grad_norm": 0.3745076656341553, + "learning_rate": 4.985276219629053e-06, + "loss": 0.002, + "num_input_tokens_seen": 1575376, + "step": 792 + }, + { + "epoch": 0.10510271703114646, + "grad_norm": 21.35468101501465, + "learning_rate": 4.985238575949867e-06, + "loss": 0.4964, + "num_input_tokens_seen": 1576864, + "step": 793 + }, + { + "epoch": 0.10523525513585155, + "grad_norm": 5.856512546539307, + "learning_rate": 4.985200884353567e-06, + "loss": 0.0708, + "num_input_tokens_seen": 1578128, + "step": 794 + }, + { + "epoch": 0.10536779324055666, + "grad_norm": 1.709293007850647, + "learning_rate": 4.9851631448408795e-06, + "loss": 0.0405, + "num_input_tokens_seen": 1580688, + "step": 795 + }, + { + "epoch": 0.10550033134526177, + "grad_norm": 15.964642524719238, + "learning_rate": 4.9851253574125325e-06, + "loss": 0.3801, + "num_input_tokens_seen": 1582144, + "step": 796 + }, + { + "epoch": 0.10563286944996686, + "grad_norm": 11.121831893920898, + "learning_rate": 4.985087522069254e-06, + "loss": 0.2094, + "num_input_tokens_seen": 1584328, + "step": 797 + }, + { + "epoch": 0.10576540755467197, + "grad_norm": 15.89108657836914, + "learning_rate": 4.985049638811773e-06, + "loss": 0.4751, + "num_input_tokens_seen": 1587168, + "step": 798 + }, + { + "epoch": 0.10589794565937707, + "grad_norm": 5.668000221252441, + "learning_rate": 4.985011707640821e-06, + "loss": 0.1316, + "num_input_tokens_seen": 1588960, + "step": 799 + }, + { + "epoch": 0.10603048376408217, + "grad_norm": 11.672808647155762, + "learning_rate": 4.984973728557129e-06, + "loss": 0.1876, + "num_input_tokens_seen": 1590808, + "step": 800 + }, + { + "epoch": 0.10616302186878727, + "grad_norm": 12.616355895996094, + "learning_rate": 4.98493570156143e-06, + "loss": 0.187, + "num_input_tokens_seen": 1593128, + "step": 801 + }, + { + "epoch": 0.10629555997349238, + "grad_norm": 11.747628211975098, + "learning_rate": 4.984897626654455e-06, + "loss": 0.1139, + "num_input_tokens_seen": 1595336, + "step": 802 + }, + { + "epoch": 0.10642809807819748, + "grad_norm": 6.239638805389404, + "learning_rate": 4.98485950383694e-06, + "loss": 0.0484, + "num_input_tokens_seen": 1597000, + "step": 803 + }, + { + "epoch": 0.10656063618290258, + "grad_norm": 14.411937713623047, + "learning_rate": 4.98482133310962e-06, + "loss": 0.2067, + "num_input_tokens_seen": 1599784, + "step": 804 + }, + { + "epoch": 0.10669317428760769, + "grad_norm": 6.464604377746582, + "learning_rate": 4.984783114473231e-06, + "loss": 0.0765, + "num_input_tokens_seen": 1601336, + "step": 805 + }, + { + "epoch": 0.1068257123923128, + "grad_norm": 1.3901069164276123, + "learning_rate": 4.984744847928508e-06, + "loss": 0.0067, + "num_input_tokens_seen": 1602672, + "step": 806 + }, + { + "epoch": 0.10695825049701789, + "grad_norm": 0.6944620013237, + "learning_rate": 4.984706533476191e-06, + "loss": 0.0035, + "num_input_tokens_seen": 1604024, + "step": 807 + }, + { + "epoch": 0.107090788601723, + "grad_norm": 11.79548168182373, + "learning_rate": 4.984668171117017e-06, + "loss": 0.0872, + "num_input_tokens_seen": 1606096, + "step": 808 + }, + { + "epoch": 0.1072233267064281, + "grad_norm": 10.3992919921875, + "learning_rate": 4.984629760851728e-06, + "loss": 0.1345, + "num_input_tokens_seen": 1607272, + "step": 809 + }, + { + "epoch": 0.1073558648111332, + "grad_norm": 7.927424907684326, + "learning_rate": 4.9845913026810624e-06, + "loss": 0.284, + "num_input_tokens_seen": 1609176, + "step": 810 + }, + { + "epoch": 0.1074884029158383, + "grad_norm": 11.217361450195312, + "learning_rate": 4.984552796605762e-06, + "loss": 0.3618, + "num_input_tokens_seen": 1611440, + "step": 811 + }, + { + "epoch": 0.10762094102054341, + "grad_norm": 6.751386642456055, + "learning_rate": 4.98451424262657e-06, + "loss": 0.1044, + "num_input_tokens_seen": 1613256, + "step": 812 + }, + { + "epoch": 0.1077534791252485, + "grad_norm": 7.366720199584961, + "learning_rate": 4.98447564074423e-06, + "loss": 0.1946, + "num_input_tokens_seen": 1615016, + "step": 813 + }, + { + "epoch": 0.10788601722995361, + "grad_norm": 0.077422134578228, + "learning_rate": 4.984436990959486e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1616104, + "step": 814 + }, + { + "epoch": 0.10801855533465872, + "grad_norm": 17.3349666595459, + "learning_rate": 4.9843982932730814e-06, + "loss": 0.507, + "num_input_tokens_seen": 1618336, + "step": 815 + }, + { + "epoch": 0.10815109343936381, + "grad_norm": 12.384060859680176, + "learning_rate": 4.9843595476857655e-06, + "loss": 0.2621, + "num_input_tokens_seen": 1621104, + "step": 816 + }, + { + "epoch": 0.10828363154406892, + "grad_norm": 17.825319290161133, + "learning_rate": 4.984320754198282e-06, + "loss": 0.3285, + "num_input_tokens_seen": 1623584, + "step": 817 + }, + { + "epoch": 0.10841616964877403, + "grad_norm": 2.3461272716522217, + "learning_rate": 4.984281912811382e-06, + "loss": 0.0497, + "num_input_tokens_seen": 1625264, + "step": 818 + }, + { + "epoch": 0.10854870775347912, + "grad_norm": 0.4677661061286926, + "learning_rate": 4.984243023525812e-06, + "loss": 0.0024, + "num_input_tokens_seen": 1627456, + "step": 819 + }, + { + "epoch": 0.10868124585818423, + "grad_norm": 0.0713978111743927, + "learning_rate": 4.984204086342323e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1629088, + "step": 820 + }, + { + "epoch": 0.10881378396288933, + "grad_norm": 38.562618255615234, + "learning_rate": 4.984165101261665e-06, + "loss": 0.5041, + "num_input_tokens_seen": 1631544, + "step": 821 + }, + { + "epoch": 0.10894632206759443, + "grad_norm": 0.06658313423395157, + "learning_rate": 4.984126068284591e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1633848, + "step": 822 + }, + { + "epoch": 0.10907886017229954, + "grad_norm": 0.18257781863212585, + "learning_rate": 4.984086987411852e-06, + "loss": 0.0008, + "num_input_tokens_seen": 1636048, + "step": 823 + }, + { + "epoch": 0.10921139827700464, + "grad_norm": 2.978452205657959, + "learning_rate": 4.984047858644202e-06, + "loss": 0.0551, + "num_input_tokens_seen": 1638800, + "step": 824 + }, + { + "epoch": 0.10934393638170974, + "grad_norm": 0.11380869895219803, + "learning_rate": 4.984008681982396e-06, + "loss": 0.0006, + "num_input_tokens_seen": 1640872, + "step": 825 + }, + { + "epoch": 0.10947647448641484, + "grad_norm": 0.5317994356155396, + "learning_rate": 4.983969457427189e-06, + "loss": 0.0026, + "num_input_tokens_seen": 1642744, + "step": 826 + }, + { + "epoch": 0.10960901259111995, + "grad_norm": 13.86351490020752, + "learning_rate": 4.983930184979338e-06, + "loss": 0.4566, + "num_input_tokens_seen": 1644416, + "step": 827 + }, + { + "epoch": 0.10974155069582504, + "grad_norm": 15.852274894714355, + "learning_rate": 4.983890864639598e-06, + "loss": 0.295, + "num_input_tokens_seen": 1647160, + "step": 828 + }, + { + "epoch": 0.10987408880053015, + "grad_norm": 0.10582073777914047, + "learning_rate": 4.983851496408729e-06, + "loss": 0.0006, + "num_input_tokens_seen": 1648984, + "step": 829 + }, + { + "epoch": 0.11000662690523526, + "grad_norm": 13.638051986694336, + "learning_rate": 4.98381208028749e-06, + "loss": 0.2271, + "num_input_tokens_seen": 1650856, + "step": 830 + }, + { + "epoch": 0.11013916500994035, + "grad_norm": 14.870671272277832, + "learning_rate": 4.983772616276641e-06, + "loss": 0.3975, + "num_input_tokens_seen": 1652480, + "step": 831 + }, + { + "epoch": 0.11027170311464546, + "grad_norm": 0.1878441721200943, + "learning_rate": 4.9837331043769416e-06, + "loss": 0.0008, + "num_input_tokens_seen": 1653672, + "step": 832 + }, + { + "epoch": 0.11040424121935057, + "grad_norm": 0.2929903566837311, + "learning_rate": 4.983693544589155e-06, + "loss": 0.0013, + "num_input_tokens_seen": 1655256, + "step": 833 + }, + { + "epoch": 0.11053677932405567, + "grad_norm": 1.0403456687927246, + "learning_rate": 4.9836539369140434e-06, + "loss": 0.003, + "num_input_tokens_seen": 1658256, + "step": 834 + }, + { + "epoch": 0.11066931742876077, + "grad_norm": 15.563006401062012, + "learning_rate": 4.98361428135237e-06, + "loss": 0.405, + "num_input_tokens_seen": 1661384, + "step": 835 + }, + { + "epoch": 0.11080185553346587, + "grad_norm": 13.409303665161133, + "learning_rate": 4.9835745779049e-06, + "loss": 0.2076, + "num_input_tokens_seen": 1663480, + "step": 836 + }, + { + "epoch": 0.11093439363817098, + "grad_norm": 0.06287132948637009, + "learning_rate": 4.9835348265724e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1665440, + "step": 837 + }, + { + "epoch": 0.11106693174287607, + "grad_norm": 7.390381813049316, + "learning_rate": 4.983495027355634e-06, + "loss": 0.2333, + "num_input_tokens_seen": 1667560, + "step": 838 + }, + { + "epoch": 0.11119946984758118, + "grad_norm": 13.304295539855957, + "learning_rate": 4.983455180255371e-06, + "loss": 0.562, + "num_input_tokens_seen": 1669904, + "step": 839 + }, + { + "epoch": 0.11133200795228629, + "grad_norm": 15.280116081237793, + "learning_rate": 4.983415285272379e-06, + "loss": 0.3232, + "num_input_tokens_seen": 1671944, + "step": 840 + }, + { + "epoch": 0.11146454605699138, + "grad_norm": 21.811206817626953, + "learning_rate": 4.983375342407427e-06, + "loss": 0.8501, + "num_input_tokens_seen": 1674224, + "step": 841 + }, + { + "epoch": 0.11159708416169649, + "grad_norm": 0.05672546476125717, + "learning_rate": 4.983335351661285e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1675736, + "step": 842 + }, + { + "epoch": 0.1117296222664016, + "grad_norm": 0.6553467512130737, + "learning_rate": 4.983295313034725e-06, + "loss": 0.0032, + "num_input_tokens_seen": 1677808, + "step": 843 + }, + { + "epoch": 0.11186216037110669, + "grad_norm": 0.864575982093811, + "learning_rate": 4.9832552265285174e-06, + "loss": 0.0037, + "num_input_tokens_seen": 1679848, + "step": 844 + }, + { + "epoch": 0.1119946984758118, + "grad_norm": 3.7665722370147705, + "learning_rate": 4.983215092143437e-06, + "loss": 0.0919, + "num_input_tokens_seen": 1681832, + "step": 845 + }, + { + "epoch": 0.1121272365805169, + "grad_norm": 7.560515880584717, + "learning_rate": 4.983174909880255e-06, + "loss": 0.1615, + "num_input_tokens_seen": 1683824, + "step": 846 + }, + { + "epoch": 0.112259774685222, + "grad_norm": 9.117321968078613, + "learning_rate": 4.9831346797397485e-06, + "loss": 0.1764, + "num_input_tokens_seen": 1685800, + "step": 847 + }, + { + "epoch": 0.1123923127899271, + "grad_norm": 5.653853416442871, + "learning_rate": 4.983094401722692e-06, + "loss": 0.1085, + "num_input_tokens_seen": 1687336, + "step": 848 + }, + { + "epoch": 0.11252485089463221, + "grad_norm": 0.38787755370140076, + "learning_rate": 4.983054075829863e-06, + "loss": 0.0019, + "num_input_tokens_seen": 1689088, + "step": 849 + }, + { + "epoch": 0.1126573889993373, + "grad_norm": 29.667003631591797, + "learning_rate": 4.983013702062037e-06, + "loss": 0.4131, + "num_input_tokens_seen": 1691560, + "step": 850 + }, + { + "epoch": 0.11278992710404241, + "grad_norm": 23.675508499145508, + "learning_rate": 4.982973280419996e-06, + "loss": 0.4548, + "num_input_tokens_seen": 1692944, + "step": 851 + }, + { + "epoch": 0.11292246520874752, + "grad_norm": 18.625350952148438, + "learning_rate": 4.982932810904516e-06, + "loss": 0.3063, + "num_input_tokens_seen": 1694728, + "step": 852 + }, + { + "epoch": 0.11305500331345261, + "grad_norm": 11.815914154052734, + "learning_rate": 4.9828922935163785e-06, + "loss": 0.3517, + "num_input_tokens_seen": 1697208, + "step": 853 + }, + { + "epoch": 0.11318754141815772, + "grad_norm": 12.923126220703125, + "learning_rate": 4.982851728256365e-06, + "loss": 0.3375, + "num_input_tokens_seen": 1699128, + "step": 854 + }, + { + "epoch": 0.11332007952286283, + "grad_norm": 22.642520904541016, + "learning_rate": 4.982811115125257e-06, + "loss": 0.6115, + "num_input_tokens_seen": 1701200, + "step": 855 + }, + { + "epoch": 0.11345261762756792, + "grad_norm": 23.831607818603516, + "learning_rate": 4.9827704541238385e-06, + "loss": 0.8099, + "num_input_tokens_seen": 1703208, + "step": 856 + }, + { + "epoch": 0.11358515573227303, + "grad_norm": 5.9392547607421875, + "learning_rate": 4.982729745252893e-06, + "loss": 0.0386, + "num_input_tokens_seen": 1704664, + "step": 857 + }, + { + "epoch": 0.11371769383697813, + "grad_norm": 8.692997932434082, + "learning_rate": 4.982688988513205e-06, + "loss": 0.0659, + "num_input_tokens_seen": 1706640, + "step": 858 + }, + { + "epoch": 0.11385023194168323, + "grad_norm": 12.953629493713379, + "learning_rate": 4.98264818390556e-06, + "loss": 0.4667, + "num_input_tokens_seen": 1708560, + "step": 859 + }, + { + "epoch": 0.11398277004638833, + "grad_norm": 1.076991319656372, + "learning_rate": 4.982607331430746e-06, + "loss": 0.0059, + "num_input_tokens_seen": 1709928, + "step": 860 + }, + { + "epoch": 0.11411530815109344, + "grad_norm": 15.592238426208496, + "learning_rate": 4.9825664310895494e-06, + "loss": 0.298, + "num_input_tokens_seen": 1712344, + "step": 861 + }, + { + "epoch": 0.11424784625579854, + "grad_norm": 9.876296043395996, + "learning_rate": 4.98252548288276e-06, + "loss": 0.3059, + "num_input_tokens_seen": 1713992, + "step": 862 + }, + { + "epoch": 0.11438038436050364, + "grad_norm": 0.7367914319038391, + "learning_rate": 4.982484486811166e-06, + "loss": 0.004, + "num_input_tokens_seen": 1715384, + "step": 863 + }, + { + "epoch": 0.11451292246520875, + "grad_norm": 14.256082534790039, + "learning_rate": 4.982443442875559e-06, + "loss": 0.302, + "num_input_tokens_seen": 1717360, + "step": 864 + }, + { + "epoch": 0.11464546056991386, + "grad_norm": 18.457731246948242, + "learning_rate": 4.98240235107673e-06, + "loss": 0.5164, + "num_input_tokens_seen": 1719080, + "step": 865 + }, + { + "epoch": 0.11477799867461895, + "grad_norm": 16.755495071411133, + "learning_rate": 4.9823612114154704e-06, + "loss": 0.1996, + "num_input_tokens_seen": 1721680, + "step": 866 + }, + { + "epoch": 0.11491053677932406, + "grad_norm": 11.11315631866455, + "learning_rate": 4.982320023892575e-06, + "loss": 0.2141, + "num_input_tokens_seen": 1723256, + "step": 867 + }, + { + "epoch": 0.11504307488402916, + "grad_norm": 16.627334594726562, + "learning_rate": 4.982278788508837e-06, + "loss": 0.4186, + "num_input_tokens_seen": 1725144, + "step": 868 + }, + { + "epoch": 0.11517561298873426, + "grad_norm": 0.2547544836997986, + "learning_rate": 4.982237505265051e-06, + "loss": 0.0013, + "num_input_tokens_seen": 1726704, + "step": 869 + }, + { + "epoch": 0.11530815109343936, + "grad_norm": 16.413021087646484, + "learning_rate": 4.982196174162014e-06, + "loss": 0.456, + "num_input_tokens_seen": 1727976, + "step": 870 + }, + { + "epoch": 0.11544068919814447, + "grad_norm": 11.950100898742676, + "learning_rate": 4.982154795200522e-06, + "loss": 0.178, + "num_input_tokens_seen": 1730312, + "step": 871 + }, + { + "epoch": 0.11557322730284957, + "grad_norm": 11.07381534576416, + "learning_rate": 4.9821133683813735e-06, + "loss": 0.191, + "num_input_tokens_seen": 1731576, + "step": 872 + }, + { + "epoch": 0.11570576540755467, + "grad_norm": 12.142051696777344, + "learning_rate": 4.982071893705367e-06, + "loss": 0.4028, + "num_input_tokens_seen": 1733168, + "step": 873 + }, + { + "epoch": 0.11583830351225978, + "grad_norm": 21.12917137145996, + "learning_rate": 4.982030371173302e-06, + "loss": 0.8856, + "num_input_tokens_seen": 1735040, + "step": 874 + }, + { + "epoch": 0.11597084161696487, + "grad_norm": 2.745619773864746, + "learning_rate": 4.981988800785979e-06, + "loss": 0.0097, + "num_input_tokens_seen": 1737032, + "step": 875 + }, + { + "epoch": 0.11610337972166998, + "grad_norm": 16.126264572143555, + "learning_rate": 4.981947182544199e-06, + "loss": 0.3656, + "num_input_tokens_seen": 1738504, + "step": 876 + }, + { + "epoch": 0.11623591782637509, + "grad_norm": 1.384109377861023, + "learning_rate": 4.981905516448766e-06, + "loss": 0.0181, + "num_input_tokens_seen": 1740544, + "step": 877 + }, + { + "epoch": 0.11636845593108018, + "grad_norm": 0.529175341129303, + "learning_rate": 4.981863802500483e-06, + "loss": 0.0027, + "num_input_tokens_seen": 1743152, + "step": 878 + }, + { + "epoch": 0.11650099403578529, + "grad_norm": 22.3206729888916, + "learning_rate": 4.981822040700153e-06, + "loss": 0.6353, + "num_input_tokens_seen": 1745192, + "step": 879 + }, + { + "epoch": 0.1166335321404904, + "grad_norm": 11.981969833374023, + "learning_rate": 4.981780231048582e-06, + "loss": 0.183, + "num_input_tokens_seen": 1746936, + "step": 880 + }, + { + "epoch": 0.11676607024519549, + "grad_norm": 8.657827377319336, + "learning_rate": 4.981738373546575e-06, + "loss": 0.1326, + "num_input_tokens_seen": 1748576, + "step": 881 + }, + { + "epoch": 0.1168986083499006, + "grad_norm": 16.50551414489746, + "learning_rate": 4.981696468194942e-06, + "loss": 0.3783, + "num_input_tokens_seen": 1751376, + "step": 882 + }, + { + "epoch": 0.1170311464546057, + "grad_norm": 31.99459457397461, + "learning_rate": 4.981654514994489e-06, + "loss": 0.7492, + "num_input_tokens_seen": 1753328, + "step": 883 + }, + { + "epoch": 0.1171636845593108, + "grad_norm": 17.236486434936523, + "learning_rate": 4.981612513946024e-06, + "loss": 0.5204, + "num_input_tokens_seen": 1755368, + "step": 884 + }, + { + "epoch": 0.1172962226640159, + "grad_norm": 0.35969194769859314, + "learning_rate": 4.981570465050357e-06, + "loss": 0.0019, + "num_input_tokens_seen": 1757712, + "step": 885 + }, + { + "epoch": 0.11742876076872101, + "grad_norm": 3.8617584705352783, + "learning_rate": 4.981528368308301e-06, + "loss": 0.0712, + "num_input_tokens_seen": 1760288, + "step": 886 + }, + { + "epoch": 0.1175612988734261, + "grad_norm": 19.82040786743164, + "learning_rate": 4.981486223720665e-06, + "loss": 0.2419, + "num_input_tokens_seen": 1762048, + "step": 887 + }, + { + "epoch": 0.11769383697813121, + "grad_norm": 20.818172454833984, + "learning_rate": 4.9814440312882635e-06, + "loss": 0.4932, + "num_input_tokens_seen": 1763880, + "step": 888 + }, + { + "epoch": 0.11782637508283632, + "grad_norm": 9.143793106079102, + "learning_rate": 4.981401791011909e-06, + "loss": 0.0884, + "num_input_tokens_seen": 1766216, + "step": 889 + }, + { + "epoch": 0.11795891318754141, + "grad_norm": 16.558807373046875, + "learning_rate": 4.981359502892416e-06, + "loss": 0.4182, + "num_input_tokens_seen": 1768304, + "step": 890 + }, + { + "epoch": 0.11809145129224652, + "grad_norm": 19.93306541442871, + "learning_rate": 4.9813171669306e-06, + "loss": 0.6198, + "num_input_tokens_seen": 1770912, + "step": 891 + }, + { + "epoch": 0.11822398939695163, + "grad_norm": 16.844219207763672, + "learning_rate": 4.9812747831272775e-06, + "loss": 0.5516, + "num_input_tokens_seen": 1773040, + "step": 892 + }, + { + "epoch": 0.11835652750165673, + "grad_norm": 5.556632041931152, + "learning_rate": 4.981232351483265e-06, + "loss": 0.1735, + "num_input_tokens_seen": 1775560, + "step": 893 + }, + { + "epoch": 0.11848906560636183, + "grad_norm": 12.321590423583984, + "learning_rate": 4.981189871999381e-06, + "loss": 0.231, + "num_input_tokens_seen": 1777696, + "step": 894 + }, + { + "epoch": 0.11862160371106693, + "grad_norm": 8.620399475097656, + "learning_rate": 4.981147344676446e-06, + "loss": 0.2044, + "num_input_tokens_seen": 1778856, + "step": 895 + }, + { + "epoch": 0.11875414181577204, + "grad_norm": 9.742249488830566, + "learning_rate": 4.981104769515277e-06, + "loss": 0.2322, + "num_input_tokens_seen": 1780392, + "step": 896 + }, + { + "epoch": 0.11888667992047713, + "grad_norm": 10.554191589355469, + "learning_rate": 4.981062146516696e-06, + "loss": 0.1007, + "num_input_tokens_seen": 1783336, + "step": 897 + }, + { + "epoch": 0.11901921802518224, + "grad_norm": 5.274407386779785, + "learning_rate": 4.981019475681527e-06, + "loss": 0.0428, + "num_input_tokens_seen": 1785160, + "step": 898 + }, + { + "epoch": 0.11915175612988735, + "grad_norm": 15.795299530029297, + "learning_rate": 4.98097675701059e-06, + "loss": 0.3509, + "num_input_tokens_seen": 1787056, + "step": 899 + }, + { + "epoch": 0.11928429423459244, + "grad_norm": 28.578710556030273, + "learning_rate": 4.9809339905047095e-06, + "loss": 0.5014, + "num_input_tokens_seen": 1788920, + "step": 900 + }, + { + "epoch": 0.11941683233929755, + "grad_norm": 6.704741477966309, + "learning_rate": 4.980891176164711e-06, + "loss": 0.1481, + "num_input_tokens_seen": 1790480, + "step": 901 + }, + { + "epoch": 0.11954937044400266, + "grad_norm": 6.743203163146973, + "learning_rate": 4.980848313991417e-06, + "loss": 0.1238, + "num_input_tokens_seen": 1792296, + "step": 902 + }, + { + "epoch": 0.11968190854870775, + "grad_norm": 8.272367477416992, + "learning_rate": 4.9808054039856575e-06, + "loss": 0.1913, + "num_input_tokens_seen": 1794040, + "step": 903 + }, + { + "epoch": 0.11981444665341286, + "grad_norm": 1.7441437244415283, + "learning_rate": 4.980762446148259e-06, + "loss": 0.0095, + "num_input_tokens_seen": 1795320, + "step": 904 + }, + { + "epoch": 0.11994698475811796, + "grad_norm": 12.581842422485352, + "learning_rate": 4.980719440480048e-06, + "loss": 0.1067, + "num_input_tokens_seen": 1796400, + "step": 905 + }, + { + "epoch": 0.12007952286282306, + "grad_norm": 1.9570139646530151, + "learning_rate": 4.980676386981856e-06, + "loss": 0.0104, + "num_input_tokens_seen": 1798608, + "step": 906 + }, + { + "epoch": 0.12021206096752816, + "grad_norm": 2.0020437240600586, + "learning_rate": 4.980633285654511e-06, + "loss": 0.0088, + "num_input_tokens_seen": 1801216, + "step": 907 + }, + { + "epoch": 0.12034459907223327, + "grad_norm": 0.8369412422180176, + "learning_rate": 4.980590136498845e-06, + "loss": 0.0043, + "num_input_tokens_seen": 1803376, + "step": 908 + }, + { + "epoch": 0.12047713717693836, + "grad_norm": 0.9324873685836792, + "learning_rate": 4.98054693951569e-06, + "loss": 0.0046, + "num_input_tokens_seen": 1805384, + "step": 909 + }, + { + "epoch": 0.12060967528164347, + "grad_norm": 16.33839988708496, + "learning_rate": 4.980503694705879e-06, + "loss": 0.3225, + "num_input_tokens_seen": 1807768, + "step": 910 + }, + { + "epoch": 0.12074221338634858, + "grad_norm": 15.873828887939453, + "learning_rate": 4.9804604020702454e-06, + "loss": 0.4233, + "num_input_tokens_seen": 1809320, + "step": 911 + }, + { + "epoch": 0.12087475149105367, + "grad_norm": 9.458096504211426, + "learning_rate": 4.980417061609625e-06, + "loss": 0.0929, + "num_input_tokens_seen": 1811576, + "step": 912 + }, + { + "epoch": 0.12100728959575878, + "grad_norm": 13.18581771850586, + "learning_rate": 4.980373673324851e-06, + "loss": 0.2054, + "num_input_tokens_seen": 1813992, + "step": 913 + }, + { + "epoch": 0.12113982770046389, + "grad_norm": 0.03155744820833206, + "learning_rate": 4.9803302372167615e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1816104, + "step": 914 + }, + { + "epoch": 0.12127236580516898, + "grad_norm": 0.042727030813694, + "learning_rate": 4.980286753286196e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1818056, + "step": 915 + }, + { + "epoch": 0.12140490390987409, + "grad_norm": 2.7886788845062256, + "learning_rate": 4.980243221533988e-06, + "loss": 0.0989, + "num_input_tokens_seen": 1819312, + "step": 916 + }, + { + "epoch": 0.1215374420145792, + "grad_norm": 0.07936962693929672, + "learning_rate": 4.980199641960981e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1820712, + "step": 917 + }, + { + "epoch": 0.12166998011928429, + "grad_norm": 11.05478572845459, + "learning_rate": 4.980156014568014e-06, + "loss": 0.2817, + "num_input_tokens_seen": 1822568, + "step": 918 + }, + { + "epoch": 0.1218025182239894, + "grad_norm": 13.562288284301758, + "learning_rate": 4.980112339355928e-06, + "loss": 0.189, + "num_input_tokens_seen": 1825920, + "step": 919 + }, + { + "epoch": 0.1219350563286945, + "grad_norm": 19.410114288330078, + "learning_rate": 4.980068616325565e-06, + "loss": 0.2246, + "num_input_tokens_seen": 1828048, + "step": 920 + }, + { + "epoch": 0.1220675944333996, + "grad_norm": 20.214387893676758, + "learning_rate": 4.980024845477769e-06, + "loss": 0.8152, + "num_input_tokens_seen": 1830496, + "step": 921 + }, + { + "epoch": 0.1222001325381047, + "grad_norm": 19.170421600341797, + "learning_rate": 4.979981026813382e-06, + "loss": 0.5857, + "num_input_tokens_seen": 1833752, + "step": 922 + }, + { + "epoch": 0.12233267064280981, + "grad_norm": 20.561349868774414, + "learning_rate": 4.97993716033325e-06, + "loss": 0.3535, + "num_input_tokens_seen": 1836312, + "step": 923 + }, + { + "epoch": 0.12246520874751492, + "grad_norm": 16.416982650756836, + "learning_rate": 4.979893246038219e-06, + "loss": 0.2581, + "num_input_tokens_seen": 1839752, + "step": 924 + }, + { + "epoch": 0.12259774685222001, + "grad_norm": 0.01307649165391922, + "learning_rate": 4.979849283929135e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1840872, + "step": 925 + }, + { + "epoch": 0.12273028495692512, + "grad_norm": 9.902410507202148, + "learning_rate": 4.979805274006847e-06, + "loss": 0.2249, + "num_input_tokens_seen": 1843648, + "step": 926 + }, + { + "epoch": 0.12286282306163022, + "grad_norm": 17.429828643798828, + "learning_rate": 4.979761216272202e-06, + "loss": 0.1367, + "num_input_tokens_seen": 1845304, + "step": 927 + }, + { + "epoch": 0.12299536116633532, + "grad_norm": 19.342809677124023, + "learning_rate": 4.97971711072605e-06, + "loss": 0.3836, + "num_input_tokens_seen": 1847072, + "step": 928 + }, + { + "epoch": 0.12312789927104043, + "grad_norm": 6.875079154968262, + "learning_rate": 4.979672957369242e-06, + "loss": 0.0226, + "num_input_tokens_seen": 1848416, + "step": 929 + }, + { + "epoch": 0.12326043737574553, + "grad_norm": 28.68491554260254, + "learning_rate": 4.979628756202628e-06, + "loss": 0.4527, + "num_input_tokens_seen": 1850944, + "step": 930 + }, + { + "epoch": 0.12339297548045063, + "grad_norm": 19.466541290283203, + "learning_rate": 4.979584507227062e-06, + "loss": 0.4791, + "num_input_tokens_seen": 1852864, + "step": 931 + }, + { + "epoch": 0.12352551358515573, + "grad_norm": 0.02839178964495659, + "learning_rate": 4.979540210443396e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1854928, + "step": 932 + }, + { + "epoch": 0.12365805168986084, + "grad_norm": 11.732213973999023, + "learning_rate": 4.979495865852483e-06, + "loss": 0.0857, + "num_input_tokens_seen": 1856592, + "step": 933 + }, + { + "epoch": 0.12379058979456593, + "grad_norm": 8.966989517211914, + "learning_rate": 4.979451473455179e-06, + "loss": 0.1056, + "num_input_tokens_seen": 1858192, + "step": 934 + }, + { + "epoch": 0.12392312789927104, + "grad_norm": 29.38459014892578, + "learning_rate": 4.979407033252342e-06, + "loss": 0.7006, + "num_input_tokens_seen": 1861032, + "step": 935 + }, + { + "epoch": 0.12405566600397615, + "grad_norm": 14.765287399291992, + "learning_rate": 4.979362545244825e-06, + "loss": 0.2784, + "num_input_tokens_seen": 1862640, + "step": 936 + }, + { + "epoch": 0.12418820410868124, + "grad_norm": 16.24656105041504, + "learning_rate": 4.979318009433489e-06, + "loss": 0.2565, + "num_input_tokens_seen": 1864392, + "step": 937 + }, + { + "epoch": 0.12432074221338635, + "grad_norm": 7.147797584533691, + "learning_rate": 4.979273425819191e-06, + "loss": 0.1088, + "num_input_tokens_seen": 1866104, + "step": 938 + }, + { + "epoch": 0.12445328031809146, + "grad_norm": 19.232332229614258, + "learning_rate": 4.979228794402791e-06, + "loss": 0.4301, + "num_input_tokens_seen": 1868016, + "step": 939 + }, + { + "epoch": 0.12458581842279655, + "grad_norm": 10.900642395019531, + "learning_rate": 4.979184115185149e-06, + "loss": 0.0946, + "num_input_tokens_seen": 1870328, + "step": 940 + }, + { + "epoch": 0.12471835652750166, + "grad_norm": 12.264572143554688, + "learning_rate": 4.979139388167128e-06, + "loss": 0.0838, + "num_input_tokens_seen": 1871640, + "step": 941 + }, + { + "epoch": 0.12485089463220676, + "grad_norm": 5.650241851806641, + "learning_rate": 4.9790946133495884e-06, + "loss": 0.0336, + "num_input_tokens_seen": 1873160, + "step": 942 + }, + { + "epoch": 0.12498343273691186, + "grad_norm": 22.28797149658203, + "learning_rate": 4.979049790733395e-06, + "loss": 0.3869, + "num_input_tokens_seen": 1874480, + "step": 943 + }, + { + "epoch": 0.12511597084161696, + "grad_norm": 15.541543006896973, + "learning_rate": 4.97900492031941e-06, + "loss": 0.2555, + "num_input_tokens_seen": 1876344, + "step": 944 + }, + { + "epoch": 0.12524850894632206, + "grad_norm": 32.305545806884766, + "learning_rate": 4.978960002108502e-06, + "loss": 0.3528, + "num_input_tokens_seen": 1879096, + "step": 945 + }, + { + "epoch": 0.12538104705102718, + "grad_norm": 15.09549617767334, + "learning_rate": 4.978915036101534e-06, + "loss": 0.327, + "num_input_tokens_seen": 1880960, + "step": 946 + }, + { + "epoch": 0.12551358515573227, + "grad_norm": 18.13591957092285, + "learning_rate": 4.978870022299374e-06, + "loss": 0.6034, + "num_input_tokens_seen": 1883112, + "step": 947 + }, + { + "epoch": 0.12564612326043736, + "grad_norm": 11.852300643920898, + "learning_rate": 4.97882496070289e-06, + "loss": 0.2273, + "num_input_tokens_seen": 1884448, + "step": 948 + }, + { + "epoch": 0.12577866136514249, + "grad_norm": 20.35089874267578, + "learning_rate": 4.978779851312951e-06, + "loss": 0.2672, + "num_input_tokens_seen": 1886368, + "step": 949 + }, + { + "epoch": 0.12591119946984758, + "grad_norm": 8.971364974975586, + "learning_rate": 4.978734694130426e-06, + "loss": 0.196, + "num_input_tokens_seen": 1889032, + "step": 950 + }, + { + "epoch": 0.12604373757455267, + "grad_norm": 3.7292568683624268, + "learning_rate": 4.978689489156187e-06, + "loss": 0.0767, + "num_input_tokens_seen": 1891576, + "step": 951 + }, + { + "epoch": 0.1261762756792578, + "grad_norm": 0.7282701134681702, + "learning_rate": 4.978644236391104e-06, + "loss": 0.0035, + "num_input_tokens_seen": 1892776, + "step": 952 + }, + { + "epoch": 0.1263088137839629, + "grad_norm": 1.57236647605896, + "learning_rate": 4.97859893583605e-06, + "loss": 0.0073, + "num_input_tokens_seen": 1894352, + "step": 953 + }, + { + "epoch": 0.12644135188866798, + "grad_norm": 16.642669677734375, + "learning_rate": 4.978553587491899e-06, + "loss": 0.4573, + "num_input_tokens_seen": 1896568, + "step": 954 + }, + { + "epoch": 0.1265738899933731, + "grad_norm": 1.8103134632110596, + "learning_rate": 4.9785081913595244e-06, + "loss": 0.0096, + "num_input_tokens_seen": 1898272, + "step": 955 + }, + { + "epoch": 0.1267064280980782, + "grad_norm": 1.0839422941207886, + "learning_rate": 4.978462747439802e-06, + "loss": 0.0058, + "num_input_tokens_seen": 1901432, + "step": 956 + }, + { + "epoch": 0.1268389662027833, + "grad_norm": 0.35691606998443604, + "learning_rate": 4.9784172557336085e-06, + "loss": 0.0018, + "num_input_tokens_seen": 1903728, + "step": 957 + }, + { + "epoch": 0.1269715043074884, + "grad_norm": 0.2396673709154129, + "learning_rate": 4.97837171624182e-06, + "loss": 0.0012, + "num_input_tokens_seen": 1905568, + "step": 958 + }, + { + "epoch": 0.1271040424121935, + "grad_norm": 14.472853660583496, + "learning_rate": 4.978326128965316e-06, + "loss": 0.2768, + "num_input_tokens_seen": 1907928, + "step": 959 + }, + { + "epoch": 0.1272365805168986, + "grad_norm": 0.017154300585389137, + "learning_rate": 4.978280493904974e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1909240, + "step": 960 + }, + { + "epoch": 0.12736911862160372, + "grad_norm": 26.194080352783203, + "learning_rate": 4.978234811061674e-06, + "loss": 0.9594, + "num_input_tokens_seen": 1910848, + "step": 961 + }, + { + "epoch": 0.1275016567263088, + "grad_norm": 0.018787138164043427, + "learning_rate": 4.978189080436298e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1912712, + "step": 962 + }, + { + "epoch": 0.1276341948310139, + "grad_norm": 17.589214324951172, + "learning_rate": 4.978143302029726e-06, + "loss": 0.5851, + "num_input_tokens_seen": 1915472, + "step": 963 + }, + { + "epoch": 0.12776673293571902, + "grad_norm": 0.18589110672473907, + "learning_rate": 4.978097475842843e-06, + "loss": 0.0007, + "num_input_tokens_seen": 1918640, + "step": 964 + }, + { + "epoch": 0.12789927104042412, + "grad_norm": 6.767877578735352, + "learning_rate": 4.97805160187653e-06, + "loss": 0.2474, + "num_input_tokens_seen": 1921496, + "step": 965 + }, + { + "epoch": 0.12803180914512924, + "grad_norm": 15.221022605895996, + "learning_rate": 4.978005680131672e-06, + "loss": 0.435, + "num_input_tokens_seen": 1923384, + "step": 966 + }, + { + "epoch": 0.12816434724983433, + "grad_norm": 7.466868877410889, + "learning_rate": 4.977959710609156e-06, + "loss": 0.1878, + "num_input_tokens_seen": 1926368, + "step": 967 + }, + { + "epoch": 0.12829688535453942, + "grad_norm": 21.889080047607422, + "learning_rate": 4.977913693309867e-06, + "loss": 0.8215, + "num_input_tokens_seen": 1928536, + "step": 968 + }, + { + "epoch": 0.12842942345924455, + "grad_norm": 0.02951613999903202, + "learning_rate": 4.9778676282346935e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1930192, + "step": 969 + }, + { + "epoch": 0.12856196156394964, + "grad_norm": 17.272552490234375, + "learning_rate": 4.977821515384522e-06, + "loss": 0.4763, + "num_input_tokens_seen": 1932088, + "step": 970 + }, + { + "epoch": 0.12869449966865473, + "grad_norm": 4.174117565155029, + "learning_rate": 4.977775354760241e-06, + "loss": 0.1649, + "num_input_tokens_seen": 1935352, + "step": 971 + }, + { + "epoch": 0.12882703777335985, + "grad_norm": 11.728879928588867, + "learning_rate": 4.977729146362744e-06, + "loss": 0.2765, + "num_input_tokens_seen": 1936928, + "step": 972 + }, + { + "epoch": 0.12895957587806495, + "grad_norm": 10.122467041015625, + "learning_rate": 4.977682890192919e-06, + "loss": 0.2682, + "num_input_tokens_seen": 1939032, + "step": 973 + }, + { + "epoch": 0.12909211398277004, + "grad_norm": 20.24359130859375, + "learning_rate": 4.9776365862516585e-06, + "loss": 0.3167, + "num_input_tokens_seen": 1940704, + "step": 974 + }, + { + "epoch": 0.12922465208747516, + "grad_norm": 15.037644386291504, + "learning_rate": 4.977590234539855e-06, + "loss": 0.3133, + "num_input_tokens_seen": 1942472, + "step": 975 + }, + { + "epoch": 0.12935719019218025, + "grad_norm": 0.871199369430542, + "learning_rate": 4.977543835058403e-06, + "loss": 0.0047, + "num_input_tokens_seen": 1943720, + "step": 976 + }, + { + "epoch": 0.12948972829688535, + "grad_norm": 10.860332489013672, + "learning_rate": 4.977497387808197e-06, + "loss": 0.3642, + "num_input_tokens_seen": 1945920, + "step": 977 + }, + { + "epoch": 0.12962226640159047, + "grad_norm": 12.183349609375, + "learning_rate": 4.977450892790131e-06, + "loss": 0.3583, + "num_input_tokens_seen": 1947984, + "step": 978 + }, + { + "epoch": 0.12975480450629556, + "grad_norm": 11.729555130004883, + "learning_rate": 4.977404350005104e-06, + "loss": 0.159, + "num_input_tokens_seen": 1950904, + "step": 979 + }, + { + "epoch": 0.12988734261100066, + "grad_norm": 0.8721858263015747, + "learning_rate": 4.977357759454011e-06, + "loss": 0.0049, + "num_input_tokens_seen": 1951912, + "step": 980 + }, + { + "epoch": 0.13001988071570578, + "grad_norm": 16.194427490234375, + "learning_rate": 4.977311121137752e-06, + "loss": 0.4528, + "num_input_tokens_seen": 1953840, + "step": 981 + }, + { + "epoch": 0.13015241882041087, + "grad_norm": 23.044219970703125, + "learning_rate": 4.977264435057226e-06, + "loss": 0.6483, + "num_input_tokens_seen": 1956032, + "step": 982 + }, + { + "epoch": 0.13028495692511596, + "grad_norm": 5.177922248840332, + "learning_rate": 4.977217701213332e-06, + "loss": 0.0912, + "num_input_tokens_seen": 1958464, + "step": 983 + }, + { + "epoch": 0.13041749502982108, + "grad_norm": 9.338175773620605, + "learning_rate": 4.9771709196069716e-06, + "loss": 0.166, + "num_input_tokens_seen": 1961120, + "step": 984 + }, + { + "epoch": 0.13055003313452618, + "grad_norm": 1.4829189777374268, + "learning_rate": 4.977124090239048e-06, + "loss": 0.008, + "num_input_tokens_seen": 1963200, + "step": 985 + }, + { + "epoch": 0.13068257123923127, + "grad_norm": 5.258603096008301, + "learning_rate": 4.977077213110462e-06, + "loss": 0.0914, + "num_input_tokens_seen": 1964400, + "step": 986 + }, + { + "epoch": 0.1308151093439364, + "grad_norm": 16.36634635925293, + "learning_rate": 4.977030288222119e-06, + "loss": 0.3669, + "num_input_tokens_seen": 1966760, + "step": 987 + }, + { + "epoch": 0.13094764744864149, + "grad_norm": 9.772537231445312, + "learning_rate": 4.976983315574923e-06, + "loss": 0.3577, + "num_input_tokens_seen": 1968784, + "step": 988 + }, + { + "epoch": 0.13108018555334658, + "grad_norm": 14.790483474731445, + "learning_rate": 4.97693629516978e-06, + "loss": 0.4724, + "num_input_tokens_seen": 1971256, + "step": 989 + }, + { + "epoch": 0.1312127236580517, + "grad_norm": 6.397546291351318, + "learning_rate": 4.976889227007597e-06, + "loss": 0.0896, + "num_input_tokens_seen": 1973680, + "step": 990 + }, + { + "epoch": 0.1313452617627568, + "grad_norm": 10.71539306640625, + "learning_rate": 4.97684211108928e-06, + "loss": 0.3719, + "num_input_tokens_seen": 1976496, + "step": 991 + }, + { + "epoch": 0.1314777998674619, + "grad_norm": 17.441057205200195, + "learning_rate": 4.976794947415739e-06, + "loss": 0.2989, + "num_input_tokens_seen": 1978144, + "step": 992 + }, + { + "epoch": 0.131610337972167, + "grad_norm": 7.96646785736084, + "learning_rate": 4.976747735987882e-06, + "loss": 0.2642, + "num_input_tokens_seen": 1980192, + "step": 993 + }, + { + "epoch": 0.1317428760768721, + "grad_norm": 0.7479077577590942, + "learning_rate": 4.976700476806622e-06, + "loss": 0.0041, + "num_input_tokens_seen": 1981616, + "step": 994 + }, + { + "epoch": 0.1318754141815772, + "grad_norm": 6.590865135192871, + "learning_rate": 4.976653169872866e-06, + "loss": 0.0968, + "num_input_tokens_seen": 1983648, + "step": 995 + }, + { + "epoch": 0.13200795228628232, + "grad_norm": 4.6522135734558105, + "learning_rate": 4.976605815187529e-06, + "loss": 0.0313, + "num_input_tokens_seen": 1985560, + "step": 996 + }, + { + "epoch": 0.1321404903909874, + "grad_norm": 0.715999186038971, + "learning_rate": 4.976558412751524e-06, + "loss": 0.0039, + "num_input_tokens_seen": 1987096, + "step": 997 + }, + { + "epoch": 0.1322730284956925, + "grad_norm": 0.8746582269668579, + "learning_rate": 4.976510962565764e-06, + "loss": 0.0045, + "num_input_tokens_seen": 1989448, + "step": 998 + }, + { + "epoch": 0.13240556660039762, + "grad_norm": 1.1189825534820557, + "learning_rate": 4.976463464631164e-06, + "loss": 0.0059, + "num_input_tokens_seen": 1991608, + "step": 999 + }, + { + "epoch": 0.13253810470510272, + "grad_norm": 9.288385391235352, + "learning_rate": 4.97641591894864e-06, + "loss": 0.1396, + "num_input_tokens_seen": 1993440, + "step": 1000 + }, + { + "epoch": 0.1326706428098078, + "grad_norm": 0.09776676446199417, + "learning_rate": 4.976368325519109e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1994544, + "step": 1001 + }, + { + "epoch": 0.13280318091451293, + "grad_norm": 3.845468044281006, + "learning_rate": 4.976320684343488e-06, + "loss": 0.0441, + "num_input_tokens_seen": 1995920, + "step": 1002 + }, + { + "epoch": 0.13293571901921802, + "grad_norm": 0.11025694757699966, + "learning_rate": 4.976272995422696e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1996984, + "step": 1003 + }, + { + "epoch": 0.13306825712392312, + "grad_norm": 12.400800704956055, + "learning_rate": 4.976225258757652e-06, + "loss": 0.3215, + "num_input_tokens_seen": 1999408, + "step": 1004 + }, + { + "epoch": 0.13320079522862824, + "grad_norm": 0.1854807734489441, + "learning_rate": 4.9761774743492776e-06, + "loss": 0.0009, + "num_input_tokens_seen": 2000664, + "step": 1005 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.0483243502676487, + "learning_rate": 4.976129642198492e-06, + "loss": 0.0003, + "num_input_tokens_seen": 2002536, + "step": 1006 + }, + { + "epoch": 0.13346587143803842, + "grad_norm": 14.460874557495117, + "learning_rate": 4.9760817623062196e-06, + "loss": 0.2247, + "num_input_tokens_seen": 2004744, + "step": 1007 + }, + { + "epoch": 0.13359840954274355, + "grad_norm": 34.594844818115234, + "learning_rate": 4.976033834673383e-06, + "loss": 0.8309, + "num_input_tokens_seen": 2006888, + "step": 1008 + }, + { + "epoch": 0.13373094764744864, + "grad_norm": 14.689656257629395, + "learning_rate": 4.975985859300905e-06, + "loss": 0.3425, + "num_input_tokens_seen": 2008976, + "step": 1009 + }, + { + "epoch": 0.13386348575215373, + "grad_norm": 17.404752731323242, + "learning_rate": 4.975937836189712e-06, + "loss": 0.6832, + "num_input_tokens_seen": 2010720, + "step": 1010 + }, + { + "epoch": 0.13399602385685885, + "grad_norm": 11.40569019317627, + "learning_rate": 4.975889765340729e-06, + "loss": 0.2868, + "num_input_tokens_seen": 2012408, + "step": 1011 + }, + { + "epoch": 0.13412856196156395, + "grad_norm": 11.377372741699219, + "learning_rate": 4.975841646754885e-06, + "loss": 0.3469, + "num_input_tokens_seen": 2014008, + "step": 1012 + }, + { + "epoch": 0.13426110006626904, + "grad_norm": 14.436957359313965, + "learning_rate": 4.975793480433104e-06, + "loss": 0.3232, + "num_input_tokens_seen": 2016552, + "step": 1013 + }, + { + "epoch": 0.13439363817097416, + "grad_norm": 0.0989784449338913, + "learning_rate": 4.975745266376317e-06, + "loss": 0.0005, + "num_input_tokens_seen": 2018104, + "step": 1014 + }, + { + "epoch": 0.13452617627567925, + "grad_norm": 0.02434745617210865, + "learning_rate": 4.975697004585455e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2019352, + "step": 1015 + }, + { + "epoch": 0.13465871438038435, + "grad_norm": 0.17050907015800476, + "learning_rate": 4.975648695061445e-06, + "loss": 0.0008, + "num_input_tokens_seen": 2021000, + "step": 1016 + }, + { + "epoch": 0.13479125248508947, + "grad_norm": 15.06557559967041, + "learning_rate": 4.975600337805221e-06, + "loss": 0.3678, + "num_input_tokens_seen": 2023184, + "step": 1017 + }, + { + "epoch": 0.13492379058979456, + "grad_norm": 5.201245307922363, + "learning_rate": 4.9755519328177146e-06, + "loss": 0.0711, + "num_input_tokens_seen": 2024728, + "step": 1018 + }, + { + "epoch": 0.13505632869449966, + "grad_norm": 8.9242582321167, + "learning_rate": 4.975503480099859e-06, + "loss": 0.208, + "num_input_tokens_seen": 2027088, + "step": 1019 + }, + { + "epoch": 0.13518886679920478, + "grad_norm": 20.803049087524414, + "learning_rate": 4.975454979652589e-06, + "loss": 0.6699, + "num_input_tokens_seen": 2028744, + "step": 1020 + }, + { + "epoch": 0.13532140490390987, + "grad_norm": 6.726044654846191, + "learning_rate": 4.975406431476839e-06, + "loss": 0.1754, + "num_input_tokens_seen": 2030448, + "step": 1021 + }, + { + "epoch": 0.13545394300861496, + "grad_norm": 15.010723114013672, + "learning_rate": 4.975357835573546e-06, + "loss": 0.257, + "num_input_tokens_seen": 2032208, + "step": 1022 + }, + { + "epoch": 0.13558648111332008, + "grad_norm": 1.7161058187484741, + "learning_rate": 4.975309191943644e-06, + "loss": 0.0084, + "num_input_tokens_seen": 2034176, + "step": 1023 + }, + { + "epoch": 0.13571901921802518, + "grad_norm": 8.106315612792969, + "learning_rate": 4.975260500588076e-06, + "loss": 0.1332, + "num_input_tokens_seen": 2036864, + "step": 1024 + }, + { + "epoch": 0.1358515573227303, + "grad_norm": 8.195710182189941, + "learning_rate": 4.975211761507777e-06, + "loss": 0.0749, + "num_input_tokens_seen": 2038344, + "step": 1025 + }, + { + "epoch": 0.1359840954274354, + "grad_norm": 11.96745777130127, + "learning_rate": 4.975162974703687e-06, + "loss": 0.1417, + "num_input_tokens_seen": 2040008, + "step": 1026 + }, + { + "epoch": 0.13611663353214049, + "grad_norm": 0.40713274478912354, + "learning_rate": 4.975114140176748e-06, + "loss": 0.0016, + "num_input_tokens_seen": 2041480, + "step": 1027 + }, + { + "epoch": 0.1362491716368456, + "grad_norm": 5.220546722412109, + "learning_rate": 4.9750652579279005e-06, + "loss": 0.0322, + "num_input_tokens_seen": 2043112, + "step": 1028 + }, + { + "epoch": 0.1363817097415507, + "grad_norm": 11.272992134094238, + "learning_rate": 4.975016327958087e-06, + "loss": 0.1386, + "num_input_tokens_seen": 2044688, + "step": 1029 + }, + { + "epoch": 0.1365142478462558, + "grad_norm": 3.4789772033691406, + "learning_rate": 4.974967350268251e-06, + "loss": 0.0805, + "num_input_tokens_seen": 2046976, + "step": 1030 + }, + { + "epoch": 0.13664678595096091, + "grad_norm": 14.42776870727539, + "learning_rate": 4.974918324859338e-06, + "loss": 0.4598, + "num_input_tokens_seen": 2048696, + "step": 1031 + }, + { + "epoch": 0.136779324055666, + "grad_norm": 1.0267363786697388, + "learning_rate": 4.974869251732293e-06, + "loss": 0.0052, + "num_input_tokens_seen": 2050664, + "step": 1032 + }, + { + "epoch": 0.1369118621603711, + "grad_norm": 22.682056427001953, + "learning_rate": 4.974820130888061e-06, + "loss": 0.7385, + "num_input_tokens_seen": 2051824, + "step": 1033 + }, + { + "epoch": 0.13704440026507622, + "grad_norm": 0.19740206003189087, + "learning_rate": 4.974770962327589e-06, + "loss": 0.001, + "num_input_tokens_seen": 2053608, + "step": 1034 + }, + { + "epoch": 0.13717693836978131, + "grad_norm": 12.586018562316895, + "learning_rate": 4.974721746051825e-06, + "loss": 0.285, + "num_input_tokens_seen": 2055192, + "step": 1035 + }, + { + "epoch": 0.1373094764744864, + "grad_norm": 0.3698250353336334, + "learning_rate": 4.97467248206172e-06, + "loss": 0.0017, + "num_input_tokens_seen": 2057896, + "step": 1036 + }, + { + "epoch": 0.13744201457919153, + "grad_norm": 8.431306838989258, + "learning_rate": 4.974623170358222e-06, + "loss": 0.2271, + "num_input_tokens_seen": 2060752, + "step": 1037 + }, + { + "epoch": 0.13757455268389662, + "grad_norm": 23.15184211730957, + "learning_rate": 4.974573810942281e-06, + "loss": 0.8263, + "num_input_tokens_seen": 2063032, + "step": 1038 + }, + { + "epoch": 0.13770709078860172, + "grad_norm": 1.7490357160568237, + "learning_rate": 4.974524403814851e-06, + "loss": 0.0093, + "num_input_tokens_seen": 2064680, + "step": 1039 + }, + { + "epoch": 0.13783962889330684, + "grad_norm": 0.0850224569439888, + "learning_rate": 4.974474948976883e-06, + "loss": 0.0005, + "num_input_tokens_seen": 2066064, + "step": 1040 + }, + { + "epoch": 0.13797216699801193, + "grad_norm": 10.165837287902832, + "learning_rate": 4.974425446429331e-06, + "loss": 0.1956, + "num_input_tokens_seen": 2067368, + "step": 1041 + }, + { + "epoch": 0.13810470510271702, + "grad_norm": 0.10979548841714859, + "learning_rate": 4.97437589617315e-06, + "loss": 0.0006, + "num_input_tokens_seen": 2068352, + "step": 1042 + }, + { + "epoch": 0.13823724320742214, + "grad_norm": 23.999309539794922, + "learning_rate": 4.974326298209294e-06, + "loss": 0.678, + "num_input_tokens_seen": 2070472, + "step": 1043 + }, + { + "epoch": 0.13836978131212724, + "grad_norm": 1.0572566986083984, + "learning_rate": 4.9742766525387184e-06, + "loss": 0.0036, + "num_input_tokens_seen": 2072760, + "step": 1044 + }, + { + "epoch": 0.13850231941683233, + "grad_norm": 3.4263973236083984, + "learning_rate": 4.9742269591623845e-06, + "loss": 0.0669, + "num_input_tokens_seen": 2074584, + "step": 1045 + }, + { + "epoch": 0.13863485752153745, + "grad_norm": 10.278995513916016, + "learning_rate": 4.974177218081247e-06, + "loss": 0.2531, + "num_input_tokens_seen": 2076200, + "step": 1046 + }, + { + "epoch": 0.13876739562624255, + "grad_norm": 25.111663818359375, + "learning_rate": 4.974127429296266e-06, + "loss": 0.3791, + "num_input_tokens_seen": 2078328, + "step": 1047 + }, + { + "epoch": 0.13889993373094764, + "grad_norm": 7.208645343780518, + "learning_rate": 4.974077592808402e-06, + "loss": 0.1693, + "num_input_tokens_seen": 2080400, + "step": 1048 + }, + { + "epoch": 0.13903247183565276, + "grad_norm": 9.183088302612305, + "learning_rate": 4.974027708618614e-06, + "loss": 0.1283, + "num_input_tokens_seen": 2082184, + "step": 1049 + }, + { + "epoch": 0.13916500994035785, + "grad_norm": 0.16749460995197296, + "learning_rate": 4.9739777767278656e-06, + "loss": 0.0009, + "num_input_tokens_seen": 2084512, + "step": 1050 + }, + { + "epoch": 0.13929754804506295, + "grad_norm": 0.14608798921108246, + "learning_rate": 4.973927797137119e-06, + "loss": 0.0008, + "num_input_tokens_seen": 2085888, + "step": 1051 + }, + { + "epoch": 0.13943008614976807, + "grad_norm": 24.015453338623047, + "learning_rate": 4.973877769847338e-06, + "loss": 0.6617, + "num_input_tokens_seen": 2088232, + "step": 1052 + }, + { + "epoch": 0.13956262425447316, + "grad_norm": 8.42868709564209, + "learning_rate": 4.9738276948594865e-06, + "loss": 0.2422, + "num_input_tokens_seen": 2090456, + "step": 1053 + }, + { + "epoch": 0.13969516235917825, + "grad_norm": 18.085811614990234, + "learning_rate": 4.97377757217453e-06, + "loss": 0.3224, + "num_input_tokens_seen": 2092672, + "step": 1054 + }, + { + "epoch": 0.13982770046388338, + "grad_norm": 19.4833984375, + "learning_rate": 4.973727401793437e-06, + "loss": 0.3896, + "num_input_tokens_seen": 2095208, + "step": 1055 + }, + { + "epoch": 0.13996023856858847, + "grad_norm": 9.022984504699707, + "learning_rate": 4.973677183717172e-06, + "loss": 0.1654, + "num_input_tokens_seen": 2097840, + "step": 1056 + }, + { + "epoch": 0.14009277667329356, + "grad_norm": 8.475109100341797, + "learning_rate": 4.973626917946704e-06, + "loss": 0.2702, + "num_input_tokens_seen": 2099688, + "step": 1057 + }, + { + "epoch": 0.14022531477799868, + "grad_norm": 25.786333084106445, + "learning_rate": 4.973576604483004e-06, + "loss": 0.8779, + "num_input_tokens_seen": 2102472, + "step": 1058 + }, + { + "epoch": 0.14035785288270378, + "grad_norm": 0.3217809200286865, + "learning_rate": 4.973526243327039e-06, + "loss": 0.0018, + "num_input_tokens_seen": 2103656, + "step": 1059 + }, + { + "epoch": 0.14049039098740887, + "grad_norm": 14.359037399291992, + "learning_rate": 4.973475834479783e-06, + "loss": 0.2964, + "num_input_tokens_seen": 2104656, + "step": 1060 + }, + { + "epoch": 0.140622929092114, + "grad_norm": 15.119754791259766, + "learning_rate": 4.973425377942206e-06, + "loss": 0.1965, + "num_input_tokens_seen": 2106952, + "step": 1061 + }, + { + "epoch": 0.14075546719681908, + "grad_norm": 15.015596389770508, + "learning_rate": 4.973374873715281e-06, + "loss": 0.666, + "num_input_tokens_seen": 2109152, + "step": 1062 + }, + { + "epoch": 0.14088800530152418, + "grad_norm": 6.950316429138184, + "learning_rate": 4.973324321799983e-06, + "loss": 0.172, + "num_input_tokens_seen": 2112112, + "step": 1063 + }, + { + "epoch": 0.1410205434062293, + "grad_norm": 12.297924041748047, + "learning_rate": 4.973273722197285e-06, + "loss": 0.1637, + "num_input_tokens_seen": 2114272, + "step": 1064 + }, + { + "epoch": 0.1411530815109344, + "grad_norm": 10.396502494812012, + "learning_rate": 4.973223074908163e-06, + "loss": 0.2256, + "num_input_tokens_seen": 2116368, + "step": 1065 + }, + { + "epoch": 0.14128561961563949, + "grad_norm": 2.5695409774780273, + "learning_rate": 4.9731723799335954e-06, + "loss": 0.014, + "num_input_tokens_seen": 2118088, + "step": 1066 + }, + { + "epoch": 0.1414181577203446, + "grad_norm": 0.9237880706787109, + "learning_rate": 4.973121637274558e-06, + "loss": 0.0052, + "num_input_tokens_seen": 2119440, + "step": 1067 + }, + { + "epoch": 0.1415506958250497, + "grad_norm": 14.194740295410156, + "learning_rate": 4.973070846932028e-06, + "loss": 0.3794, + "num_input_tokens_seen": 2121888, + "step": 1068 + }, + { + "epoch": 0.1416832339297548, + "grad_norm": 0.8454501628875732, + "learning_rate": 4.973020008906987e-06, + "loss": 0.0045, + "num_input_tokens_seen": 2123736, + "step": 1069 + }, + { + "epoch": 0.1418157720344599, + "grad_norm": 7.1999993324279785, + "learning_rate": 4.9729691232004135e-06, + "loss": 0.0613, + "num_input_tokens_seen": 2125096, + "step": 1070 + }, + { + "epoch": 0.141948310139165, + "grad_norm": 16.522409439086914, + "learning_rate": 4.97291818981329e-06, + "loss": 0.7136, + "num_input_tokens_seen": 2127216, + "step": 1071 + }, + { + "epoch": 0.1420808482438701, + "grad_norm": 7.099532604217529, + "learning_rate": 4.972867208746597e-06, + "loss": 0.2218, + "num_input_tokens_seen": 2129040, + "step": 1072 + }, + { + "epoch": 0.14221338634857522, + "grad_norm": 10.5108060836792, + "learning_rate": 4.972816180001319e-06, + "loss": 0.1802, + "num_input_tokens_seen": 2131160, + "step": 1073 + }, + { + "epoch": 0.14234592445328031, + "grad_norm": 13.357348442077637, + "learning_rate": 4.972765103578438e-06, + "loss": 0.2124, + "num_input_tokens_seen": 2133136, + "step": 1074 + }, + { + "epoch": 0.1424784625579854, + "grad_norm": 0.17172189056873322, + "learning_rate": 4.972713979478941e-06, + "loss": 0.0009, + "num_input_tokens_seen": 2135176, + "step": 1075 + }, + { + "epoch": 0.14261100066269053, + "grad_norm": 7.505782604217529, + "learning_rate": 4.972662807703813e-06, + "loss": 0.0844, + "num_input_tokens_seen": 2137488, + "step": 1076 + }, + { + "epoch": 0.14274353876739562, + "grad_norm": 2.9465689659118652, + "learning_rate": 4.97261158825404e-06, + "loss": 0.0498, + "num_input_tokens_seen": 2139320, + "step": 1077 + }, + { + "epoch": 0.14287607687210072, + "grad_norm": 0.1261802464723587, + "learning_rate": 4.972560321130609e-06, + "loss": 0.0007, + "num_input_tokens_seen": 2140672, + "step": 1078 + }, + { + "epoch": 0.14300861497680584, + "grad_norm": 7.9510498046875, + "learning_rate": 4.97250900633451e-06, + "loss": 0.1369, + "num_input_tokens_seen": 2142224, + "step": 1079 + }, + { + "epoch": 0.14314115308151093, + "grad_norm": 8.301384925842285, + "learning_rate": 4.9724576438667316e-06, + "loss": 0.2094, + "num_input_tokens_seen": 2144048, + "step": 1080 + }, + { + "epoch": 0.14327369118621602, + "grad_norm": 16.20797348022461, + "learning_rate": 4.972406233728264e-06, + "loss": 0.3451, + "num_input_tokens_seen": 2145504, + "step": 1081 + }, + { + "epoch": 0.14340622929092114, + "grad_norm": 11.305400848388672, + "learning_rate": 4.972354775920099e-06, + "loss": 0.2041, + "num_input_tokens_seen": 2147824, + "step": 1082 + }, + { + "epoch": 0.14353876739562624, + "grad_norm": 17.06035041809082, + "learning_rate": 4.972303270443227e-06, + "loss": 0.3535, + "num_input_tokens_seen": 2149680, + "step": 1083 + }, + { + "epoch": 0.14367130550033136, + "grad_norm": 8.827367782592773, + "learning_rate": 4.972251717298644e-06, + "loss": 0.154, + "num_input_tokens_seen": 2151256, + "step": 1084 + }, + { + "epoch": 0.14380384360503645, + "grad_norm": 15.790962219238281, + "learning_rate": 4.972200116487342e-06, + "loss": 0.3193, + "num_input_tokens_seen": 2153144, + "step": 1085 + }, + { + "epoch": 0.14393638170974155, + "grad_norm": 11.527240753173828, + "learning_rate": 4.972148468010316e-06, + "loss": 0.126, + "num_input_tokens_seen": 2155544, + "step": 1086 + }, + { + "epoch": 0.14406891981444667, + "grad_norm": 0.07238878309726715, + "learning_rate": 4.972096771868562e-06, + "loss": 0.0004, + "num_input_tokens_seen": 2156976, + "step": 1087 + }, + { + "epoch": 0.14420145791915176, + "grad_norm": 16.891948699951172, + "learning_rate": 4.972045028063076e-06, + "loss": 0.404, + "num_input_tokens_seen": 2159440, + "step": 1088 + }, + { + "epoch": 0.14433399602385685, + "grad_norm": 9.059277534484863, + "learning_rate": 4.971993236594858e-06, + "loss": 0.1847, + "num_input_tokens_seen": 2160560, + "step": 1089 + }, + { + "epoch": 0.14446653412856197, + "grad_norm": 16.980607986450195, + "learning_rate": 4.971941397464904e-06, + "loss": 0.2502, + "num_input_tokens_seen": 2162656, + "step": 1090 + }, + { + "epoch": 0.14459907223326707, + "grad_norm": 18.538114547729492, + "learning_rate": 4.971889510674215e-06, + "loss": 0.5705, + "num_input_tokens_seen": 2164232, + "step": 1091 + }, + { + "epoch": 0.14473161033797216, + "grad_norm": 0.23751325905323029, + "learning_rate": 4.9718375762237915e-06, + "loss": 0.0013, + "num_input_tokens_seen": 2166360, + "step": 1092 + }, + { + "epoch": 0.14486414844267728, + "grad_norm": 0.9280669689178467, + "learning_rate": 4.971785594114634e-06, + "loss": 0.0045, + "num_input_tokens_seen": 2168392, + "step": 1093 + }, + { + "epoch": 0.14499668654738238, + "grad_norm": 9.240777015686035, + "learning_rate": 4.971733564347745e-06, + "loss": 0.1102, + "num_input_tokens_seen": 2171184, + "step": 1094 + }, + { + "epoch": 0.14512922465208747, + "grad_norm": 0.21600164473056793, + "learning_rate": 4.971681486924127e-06, + "loss": 0.0011, + "num_input_tokens_seen": 2172256, + "step": 1095 + }, + { + "epoch": 0.1452617627567926, + "grad_norm": 7.418125629425049, + "learning_rate": 4.971629361844785e-06, + "loss": 0.2352, + "num_input_tokens_seen": 2174832, + "step": 1096 + }, + { + "epoch": 0.14539430086149768, + "grad_norm": 6.598450660705566, + "learning_rate": 4.971577189110724e-06, + "loss": 0.0517, + "num_input_tokens_seen": 2176176, + "step": 1097 + }, + { + "epoch": 0.14552683896620278, + "grad_norm": 17.72537612915039, + "learning_rate": 4.971524968722951e-06, + "loss": 0.445, + "num_input_tokens_seen": 2177968, + "step": 1098 + }, + { + "epoch": 0.1456593770709079, + "grad_norm": 0.332596093416214, + "learning_rate": 4.97147270068247e-06, + "loss": 0.0014, + "num_input_tokens_seen": 2179224, + "step": 1099 + }, + { + "epoch": 0.145791915175613, + "grad_norm": 11.393199920654297, + "learning_rate": 4.9714203849902906e-06, + "loss": 0.2588, + "num_input_tokens_seen": 2180720, + "step": 1100 + }, + { + "epoch": 0.14592445328031808, + "grad_norm": 0.12381167709827423, + "learning_rate": 4.9713680216474215e-06, + "loss": 0.0007, + "num_input_tokens_seen": 2183600, + "step": 1101 + }, + { + "epoch": 0.1460569913850232, + "grad_norm": 0.0911279022693634, + "learning_rate": 4.971315610654872e-06, + "loss": 0.0005, + "num_input_tokens_seen": 2185096, + "step": 1102 + }, + { + "epoch": 0.1461895294897283, + "grad_norm": 0.09448344260454178, + "learning_rate": 4.971263152013653e-06, + "loss": 0.0005, + "num_input_tokens_seen": 2186376, + "step": 1103 + }, + { + "epoch": 0.1463220675944334, + "grad_norm": 14.062426567077637, + "learning_rate": 4.971210645724775e-06, + "loss": 0.3228, + "num_input_tokens_seen": 2189072, + "step": 1104 + }, + { + "epoch": 0.1464546056991385, + "grad_norm": 16.506311416625977, + "learning_rate": 4.971158091789251e-06, + "loss": 0.3634, + "num_input_tokens_seen": 2190944, + "step": 1105 + }, + { + "epoch": 0.1465871438038436, + "grad_norm": 0.06774196773767471, + "learning_rate": 4.971105490208095e-06, + "loss": 0.0004, + "num_input_tokens_seen": 2192232, + "step": 1106 + }, + { + "epoch": 0.1467196819085487, + "grad_norm": 13.49811840057373, + "learning_rate": 4.971052840982319e-06, + "loss": 0.44, + "num_input_tokens_seen": 2193848, + "step": 1107 + }, + { + "epoch": 0.14685222001325382, + "grad_norm": 16.973705291748047, + "learning_rate": 4.971000144112941e-06, + "loss": 0.4162, + "num_input_tokens_seen": 2196192, + "step": 1108 + }, + { + "epoch": 0.1469847581179589, + "grad_norm": 7.563966274261475, + "learning_rate": 4.970947399600975e-06, + "loss": 0.0709, + "num_input_tokens_seen": 2198112, + "step": 1109 + }, + { + "epoch": 0.147117296222664, + "grad_norm": 0.5764822959899902, + "learning_rate": 4.970894607447438e-06, + "loss": 0.0054, + "num_input_tokens_seen": 2199712, + "step": 1110 + }, + { + "epoch": 0.14724983432736913, + "grad_norm": 3.911957025527954, + "learning_rate": 4.970841767653349e-06, + "loss": 0.109, + "num_input_tokens_seen": 2201200, + "step": 1111 + }, + { + "epoch": 0.14738237243207422, + "grad_norm": 16.377504348754883, + "learning_rate": 4.970788880219726e-06, + "loss": 0.4557, + "num_input_tokens_seen": 2203248, + "step": 1112 + }, + { + "epoch": 0.14751491053677931, + "grad_norm": 9.744097709655762, + "learning_rate": 4.970735945147589e-06, + "loss": 0.1216, + "num_input_tokens_seen": 2204600, + "step": 1113 + }, + { + "epoch": 0.14764744864148444, + "grad_norm": 3.640883684158325, + "learning_rate": 4.970682962437958e-06, + "loss": 0.0379, + "num_input_tokens_seen": 2206720, + "step": 1114 + }, + { + "epoch": 0.14777998674618953, + "grad_norm": 15.960216522216797, + "learning_rate": 4.970629932091856e-06, + "loss": 0.3399, + "num_input_tokens_seen": 2208168, + "step": 1115 + }, + { + "epoch": 0.14791252485089462, + "grad_norm": 14.752596855163574, + "learning_rate": 4.970576854110304e-06, + "loss": 0.2745, + "num_input_tokens_seen": 2209904, + "step": 1116 + }, + { + "epoch": 0.14804506295559974, + "grad_norm": 0.04901452735066414, + "learning_rate": 4.970523728494326e-06, + "loss": 0.0003, + "num_input_tokens_seen": 2211384, + "step": 1117 + }, + { + "epoch": 0.14817760106030484, + "grad_norm": 1.8523015975952148, + "learning_rate": 4.970470555244946e-06, + "loss": 0.0055, + "num_input_tokens_seen": 2213496, + "step": 1118 + }, + { + "epoch": 0.14831013916500993, + "grad_norm": 15.264734268188477, + "learning_rate": 4.970417334363189e-06, + "loss": 0.2557, + "num_input_tokens_seen": 2214912, + "step": 1119 + }, + { + "epoch": 0.14844267726971505, + "grad_norm": 0.6389595866203308, + "learning_rate": 4.970364065850082e-06, + "loss": 0.0012, + "num_input_tokens_seen": 2216496, + "step": 1120 + }, + { + "epoch": 0.14857521537442014, + "grad_norm": 17.97623634338379, + "learning_rate": 4.970310749706651e-06, + "loss": 0.4766, + "num_input_tokens_seen": 2217760, + "step": 1121 + }, + { + "epoch": 0.14870775347912524, + "grad_norm": 8.3178129196167, + "learning_rate": 4.970257385933925e-06, + "loss": 0.0791, + "num_input_tokens_seen": 2219928, + "step": 1122 + }, + { + "epoch": 0.14884029158383036, + "grad_norm": 0.09905339032411575, + "learning_rate": 4.970203974532933e-06, + "loss": 0.0005, + "num_input_tokens_seen": 2221032, + "step": 1123 + }, + { + "epoch": 0.14897282968853545, + "grad_norm": 20.534059524536133, + "learning_rate": 4.970150515504703e-06, + "loss": 0.3386, + "num_input_tokens_seen": 2223080, + "step": 1124 + }, + { + "epoch": 0.14910536779324055, + "grad_norm": 2.9343061447143555, + "learning_rate": 4.970097008850269e-06, + "loss": 0.0115, + "num_input_tokens_seen": 2225104, + "step": 1125 + }, + { + "epoch": 0.14923790589794567, + "grad_norm": 21.946889877319336, + "learning_rate": 4.97004345457066e-06, + "loss": 0.5937, + "num_input_tokens_seen": 2227400, + "step": 1126 + }, + { + "epoch": 0.14937044400265076, + "grad_norm": 0.10060641914606094, + "learning_rate": 4.969989852666908e-06, + "loss": 0.0005, + "num_input_tokens_seen": 2229248, + "step": 1127 + }, + { + "epoch": 0.14950298210735585, + "grad_norm": 13.128434181213379, + "learning_rate": 4.969936203140048e-06, + "loss": 0.275, + "num_input_tokens_seen": 2230928, + "step": 1128 + }, + { + "epoch": 0.14963552021206097, + "grad_norm": 7.371717929840088, + "learning_rate": 4.969882505991115e-06, + "loss": 0.0981, + "num_input_tokens_seen": 2233280, + "step": 1129 + }, + { + "epoch": 0.14976805831676607, + "grad_norm": 0.08068453520536423, + "learning_rate": 4.969828761221142e-06, + "loss": 0.0004, + "num_input_tokens_seen": 2235200, + "step": 1130 + }, + { + "epoch": 0.14990059642147116, + "grad_norm": 10.685983657836914, + "learning_rate": 4.969774968831168e-06, + "loss": 0.1411, + "num_input_tokens_seen": 2236888, + "step": 1131 + }, + { + "epoch": 0.15003313452617628, + "grad_norm": 0.26747697591781616, + "learning_rate": 4.969721128822228e-06, + "loss": 0.0012, + "num_input_tokens_seen": 2238376, + "step": 1132 + }, + { + "epoch": 0.15016567263088137, + "grad_norm": 1.9737640619277954, + "learning_rate": 4.9696672411953615e-06, + "loss": 0.0044, + "num_input_tokens_seen": 2240352, + "step": 1133 + }, + { + "epoch": 0.15029821073558647, + "grad_norm": 11.947619438171387, + "learning_rate": 4.969613305951607e-06, + "loss": 0.4761, + "num_input_tokens_seen": 2242592, + "step": 1134 + }, + { + "epoch": 0.1504307488402916, + "grad_norm": 0.06477290391921997, + "learning_rate": 4.969559323092004e-06, + "loss": 0.0003, + "num_input_tokens_seen": 2243648, + "step": 1135 + }, + { + "epoch": 0.15056328694499668, + "grad_norm": 0.28587570786476135, + "learning_rate": 4.969505292617593e-06, + "loss": 0.0011, + "num_input_tokens_seen": 2245264, + "step": 1136 + }, + { + "epoch": 0.15069582504970178, + "grad_norm": 0.09513625502586365, + "learning_rate": 4.969451214529416e-06, + "loss": 0.0004, + "num_input_tokens_seen": 2246488, + "step": 1137 + }, + { + "epoch": 0.1508283631544069, + "grad_norm": 0.06177070364356041, + "learning_rate": 4.969397088828517e-06, + "loss": 0.0003, + "num_input_tokens_seen": 2248832, + "step": 1138 + }, + { + "epoch": 0.150960901259112, + "grad_norm": 14.32563304901123, + "learning_rate": 4.969342915515938e-06, + "loss": 0.3412, + "num_input_tokens_seen": 2251040, + "step": 1139 + }, + { + "epoch": 0.15109343936381708, + "grad_norm": 34.86860656738281, + "learning_rate": 4.969288694592725e-06, + "loss": 0.5281, + "num_input_tokens_seen": 2253832, + "step": 1140 + }, + { + "epoch": 0.1512259774685222, + "grad_norm": 9.496525764465332, + "learning_rate": 4.969234426059922e-06, + "loss": 0.3175, + "num_input_tokens_seen": 2255632, + "step": 1141 + }, + { + "epoch": 0.1513585155732273, + "grad_norm": 0.34487175941467285, + "learning_rate": 4.9691801099185746e-06, + "loss": 0.0012, + "num_input_tokens_seen": 2257104, + "step": 1142 + }, + { + "epoch": 0.15149105367793242, + "grad_norm": 8.842801094055176, + "learning_rate": 4.969125746169733e-06, + "loss": 0.1957, + "num_input_tokens_seen": 2259288, + "step": 1143 + }, + { + "epoch": 0.1516235917826375, + "grad_norm": 8.908940315246582, + "learning_rate": 4.9690713348144425e-06, + "loss": 0.3226, + "num_input_tokens_seen": 2262024, + "step": 1144 + }, + { + "epoch": 0.1517561298873426, + "grad_norm": 7.990653038024902, + "learning_rate": 4.969016875853754e-06, + "loss": 0.1555, + "num_input_tokens_seen": 2263808, + "step": 1145 + }, + { + "epoch": 0.15188866799204773, + "grad_norm": 14.654000282287598, + "learning_rate": 4.968962369288717e-06, + "loss": 0.2299, + "num_input_tokens_seen": 2265416, + "step": 1146 + }, + { + "epoch": 0.15202120609675282, + "grad_norm": 41.310523986816406, + "learning_rate": 4.968907815120382e-06, + "loss": 0.8631, + "num_input_tokens_seen": 2268504, + "step": 1147 + }, + { + "epoch": 0.1521537442014579, + "grad_norm": 7.033841133117676, + "learning_rate": 4.9688532133498004e-06, + "loss": 0.1642, + "num_input_tokens_seen": 2270696, + "step": 1148 + }, + { + "epoch": 0.15228628230616303, + "grad_norm": 10.48091983795166, + "learning_rate": 4.968798563978026e-06, + "loss": 0.2556, + "num_input_tokens_seen": 2272960, + "step": 1149 + }, + { + "epoch": 0.15241882041086813, + "grad_norm": 18.73812484741211, + "learning_rate": 4.9687438670061126e-06, + "loss": 0.3744, + "num_input_tokens_seen": 2274712, + "step": 1150 + }, + { + "epoch": 0.15255135851557322, + "grad_norm": 2.2381982803344727, + "learning_rate": 4.968689122435114e-06, + "loss": 0.0052, + "num_input_tokens_seen": 2276112, + "step": 1151 + }, + { + "epoch": 0.15268389662027834, + "grad_norm": 0.17634576559066772, + "learning_rate": 4.968634330266087e-06, + "loss": 0.0009, + "num_input_tokens_seen": 2277688, + "step": 1152 + }, + { + "epoch": 0.15281643472498344, + "grad_norm": 14.243127822875977, + "learning_rate": 4.968579490500086e-06, + "loss": 0.4828, + "num_input_tokens_seen": 2279936, + "step": 1153 + }, + { + "epoch": 0.15294897282968853, + "grad_norm": 12.21242618560791, + "learning_rate": 4.96852460313817e-06, + "loss": 0.0618, + "num_input_tokens_seen": 2280984, + "step": 1154 + }, + { + "epoch": 0.15308151093439365, + "grad_norm": 19.897294998168945, + "learning_rate": 4.968469668181397e-06, + "loss": 0.592, + "num_input_tokens_seen": 2282200, + "step": 1155 + }, + { + "epoch": 0.15321404903909874, + "grad_norm": 14.368499755859375, + "learning_rate": 4.968414685630825e-06, + "loss": 0.0857, + "num_input_tokens_seen": 2283496, + "step": 1156 + }, + { + "epoch": 0.15334658714380384, + "grad_norm": 21.154964447021484, + "learning_rate": 4.968359655487515e-06, + "loss": 0.5014, + "num_input_tokens_seen": 2285608, + "step": 1157 + }, + { + "epoch": 0.15347912524850896, + "grad_norm": 0.20786318182945251, + "learning_rate": 4.968304577752528e-06, + "loss": 0.0011, + "num_input_tokens_seen": 2287040, + "step": 1158 + }, + { + "epoch": 0.15361166335321405, + "grad_norm": 5.691685199737549, + "learning_rate": 4.968249452426927e-06, + "loss": 0.1654, + "num_input_tokens_seen": 2288952, + "step": 1159 + }, + { + "epoch": 0.15374420145791914, + "grad_norm": 0.19987598061561584, + "learning_rate": 4.968194279511773e-06, + "loss": 0.001, + "num_input_tokens_seen": 2291320, + "step": 1160 + }, + { + "epoch": 0.15387673956262427, + "grad_norm": 0.4299350380897522, + "learning_rate": 4.968139059008131e-06, + "loss": 0.0021, + "num_input_tokens_seen": 2293296, + "step": 1161 + }, + { + "epoch": 0.15400927766732936, + "grad_norm": 8.03731632232666, + "learning_rate": 4.9680837909170646e-06, + "loss": 0.1455, + "num_input_tokens_seen": 2295360, + "step": 1162 + }, + { + "epoch": 0.15414181577203445, + "grad_norm": 12.259587287902832, + "learning_rate": 4.96802847523964e-06, + "loss": 0.1423, + "num_input_tokens_seen": 2298368, + "step": 1163 + }, + { + "epoch": 0.15427435387673957, + "grad_norm": 23.08724594116211, + "learning_rate": 4.967973111976925e-06, + "loss": 0.6332, + "num_input_tokens_seen": 2300048, + "step": 1164 + }, + { + "epoch": 0.15440689198144467, + "grad_norm": 14.329208374023438, + "learning_rate": 4.967917701129985e-06, + "loss": 0.1032, + "num_input_tokens_seen": 2302168, + "step": 1165 + }, + { + "epoch": 0.15453943008614976, + "grad_norm": 15.72828197479248, + "learning_rate": 4.967862242699889e-06, + "loss": 0.3043, + "num_input_tokens_seen": 2303696, + "step": 1166 + }, + { + "epoch": 0.15467196819085488, + "grad_norm": 10.578027725219727, + "learning_rate": 4.967806736687707e-06, + "loss": 0.0761, + "num_input_tokens_seen": 2305760, + "step": 1167 + }, + { + "epoch": 0.15480450629555997, + "grad_norm": 12.880636215209961, + "learning_rate": 4.967751183094508e-06, + "loss": 0.097, + "num_input_tokens_seen": 2307368, + "step": 1168 + }, + { + "epoch": 0.15493704440026507, + "grad_norm": 0.1368607133626938, + "learning_rate": 4.967695581921364e-06, + "loss": 0.0007, + "num_input_tokens_seen": 2309000, + "step": 1169 + }, + { + "epoch": 0.1550695825049702, + "grad_norm": 0.13404937088489532, + "learning_rate": 4.967639933169347e-06, + "loss": 0.0007, + "num_input_tokens_seen": 2310144, + "step": 1170 + }, + { + "epoch": 0.15520212060967528, + "grad_norm": 0.13037435710430145, + "learning_rate": 4.96758423683953e-06, + "loss": 0.0007, + "num_input_tokens_seen": 2311640, + "step": 1171 + }, + { + "epoch": 0.15533465871438037, + "grad_norm": 18.210630416870117, + "learning_rate": 4.9675284929329855e-06, + "loss": 0.3665, + "num_input_tokens_seen": 2314128, + "step": 1172 + }, + { + "epoch": 0.1554671968190855, + "grad_norm": 1.6704750061035156, + "learning_rate": 4.96747270145079e-06, + "loss": 0.012, + "num_input_tokens_seen": 2315808, + "step": 1173 + }, + { + "epoch": 0.1555997349237906, + "grad_norm": 9.904519081115723, + "learning_rate": 4.96741686239402e-06, + "loss": 0.2422, + "num_input_tokens_seen": 2318288, + "step": 1174 + }, + { + "epoch": 0.15573227302849568, + "grad_norm": 0.13234105706214905, + "learning_rate": 4.96736097576375e-06, + "loss": 0.0007, + "num_input_tokens_seen": 2321216, + "step": 1175 + }, + { + "epoch": 0.1558648111332008, + "grad_norm": 29.408897399902344, + "learning_rate": 4.967305041561057e-06, + "loss": 0.7555, + "num_input_tokens_seen": 2323696, + "step": 1176 + }, + { + "epoch": 0.1559973492379059, + "grad_norm": 0.12690770626068115, + "learning_rate": 4.967249059787021e-06, + "loss": 0.0006, + "num_input_tokens_seen": 2325592, + "step": 1177 + }, + { + "epoch": 0.156129887342611, + "grad_norm": 6.844043254852295, + "learning_rate": 4.967193030442722e-06, + "loss": 0.136, + "num_input_tokens_seen": 2326656, + "step": 1178 + }, + { + "epoch": 0.1562624254473161, + "grad_norm": 20.070098876953125, + "learning_rate": 4.967136953529239e-06, + "loss": 0.3159, + "num_input_tokens_seen": 2328128, + "step": 1179 + }, + { + "epoch": 0.1563949635520212, + "grad_norm": 0.23279854655265808, + "learning_rate": 4.967080829047653e-06, + "loss": 0.0011, + "num_input_tokens_seen": 2329656, + "step": 1180 + }, + { + "epoch": 0.1565275016567263, + "grad_norm": 9.846593856811523, + "learning_rate": 4.967024656999048e-06, + "loss": 0.2366, + "num_input_tokens_seen": 2331160, + "step": 1181 + }, + { + "epoch": 0.15666003976143142, + "grad_norm": 21.642732620239258, + "learning_rate": 4.966968437384504e-06, + "loss": 0.4724, + "num_input_tokens_seen": 2333192, + "step": 1182 + }, + { + "epoch": 0.1567925778661365, + "grad_norm": 6.204029560089111, + "learning_rate": 4.966912170205108e-06, + "loss": 0.142, + "num_input_tokens_seen": 2335760, + "step": 1183 + }, + { + "epoch": 0.1569251159708416, + "grad_norm": 11.157078742980957, + "learning_rate": 4.9668558554619424e-06, + "loss": 0.3597, + "num_input_tokens_seen": 2337720, + "step": 1184 + }, + { + "epoch": 0.15705765407554673, + "grad_norm": 0.18780739605426788, + "learning_rate": 4.966799493156095e-06, + "loss": 0.0009, + "num_input_tokens_seen": 2339768, + "step": 1185 + }, + { + "epoch": 0.15719019218025182, + "grad_norm": 0.20116011798381805, + "learning_rate": 4.966743083288651e-06, + "loss": 0.001, + "num_input_tokens_seen": 2341296, + "step": 1186 + }, + { + "epoch": 0.1573227302849569, + "grad_norm": 9.951813697814941, + "learning_rate": 4.966686625860699e-06, + "loss": 0.2009, + "num_input_tokens_seen": 2343016, + "step": 1187 + }, + { + "epoch": 0.15745526838966203, + "grad_norm": 13.271669387817383, + "learning_rate": 4.9666301208733266e-06, + "loss": 0.2422, + "num_input_tokens_seen": 2344848, + "step": 1188 + }, + { + "epoch": 0.15758780649436713, + "grad_norm": 14.315110206604004, + "learning_rate": 4.966573568327625e-06, + "loss": 0.3175, + "num_input_tokens_seen": 2346528, + "step": 1189 + }, + { + "epoch": 0.15772034459907222, + "grad_norm": 13.240228652954102, + "learning_rate": 4.966516968224683e-06, + "loss": 0.1778, + "num_input_tokens_seen": 2348136, + "step": 1190 + }, + { + "epoch": 0.15785288270377734, + "grad_norm": 18.752595901489258, + "learning_rate": 4.966460320565592e-06, + "loss": 0.5405, + "num_input_tokens_seen": 2350872, + "step": 1191 + }, + { + "epoch": 0.15798542080848244, + "grad_norm": 0.40500640869140625, + "learning_rate": 4.966403625351444e-06, + "loss": 0.002, + "num_input_tokens_seen": 2352456, + "step": 1192 + }, + { + "epoch": 0.15811795891318753, + "grad_norm": 10.899077415466309, + "learning_rate": 4.966346882583333e-06, + "loss": 0.1399, + "num_input_tokens_seen": 2354200, + "step": 1193 + }, + { + "epoch": 0.15825049701789265, + "grad_norm": 14.641097068786621, + "learning_rate": 4.966290092262353e-06, + "loss": 0.5644, + "num_input_tokens_seen": 2355840, + "step": 1194 + }, + { + "epoch": 0.15838303512259774, + "grad_norm": 9.716653823852539, + "learning_rate": 4.9662332543895984e-06, + "loss": 0.1583, + "num_input_tokens_seen": 2358192, + "step": 1195 + }, + { + "epoch": 0.15851557322730284, + "grad_norm": 16.018234252929688, + "learning_rate": 4.966176368966165e-06, + "loss": 0.6166, + "num_input_tokens_seen": 2360216, + "step": 1196 + }, + { + "epoch": 0.15864811133200796, + "grad_norm": 11.404841423034668, + "learning_rate": 4.9661194359931504e-06, + "loss": 0.4163, + "num_input_tokens_seen": 2362416, + "step": 1197 + }, + { + "epoch": 0.15878064943671305, + "grad_norm": 0.8917785882949829, + "learning_rate": 4.966062455471651e-06, + "loss": 0.0045, + "num_input_tokens_seen": 2365168, + "step": 1198 + }, + { + "epoch": 0.15891318754141814, + "grad_norm": 10.884905815124512, + "learning_rate": 4.9660054274027666e-06, + "loss": 0.2707, + "num_input_tokens_seen": 2368040, + "step": 1199 + }, + { + "epoch": 0.15904572564612326, + "grad_norm": 0.4378180503845215, + "learning_rate": 4.965948351787597e-06, + "loss": 0.0022, + "num_input_tokens_seen": 2369048, + "step": 1200 + }, + { + "epoch": 0.15917826375082836, + "grad_norm": 0.7108259797096252, + "learning_rate": 4.9658912286272416e-06, + "loss": 0.0036, + "num_input_tokens_seen": 2371280, + "step": 1201 + }, + { + "epoch": 0.15931080185553348, + "grad_norm": 23.162755966186523, + "learning_rate": 4.965834057922802e-06, + "loss": 1.1308, + "num_input_tokens_seen": 2373632, + "step": 1202 + }, + { + "epoch": 0.15944333996023857, + "grad_norm": 0.20963579416275024, + "learning_rate": 4.965776839675381e-06, + "loss": 0.001, + "num_input_tokens_seen": 2374704, + "step": 1203 + }, + { + "epoch": 0.15957587806494367, + "grad_norm": 10.45650577545166, + "learning_rate": 4.965719573886081e-06, + "loss": 0.2593, + "num_input_tokens_seen": 2377064, + "step": 1204 + }, + { + "epoch": 0.1597084161696488, + "grad_norm": 10.756378173828125, + "learning_rate": 4.9656622605560065e-06, + "loss": 0.1955, + "num_input_tokens_seen": 2378792, + "step": 1205 + }, + { + "epoch": 0.15984095427435388, + "grad_norm": 0.3480214774608612, + "learning_rate": 4.965604899686264e-06, + "loss": 0.0016, + "num_input_tokens_seen": 2380192, + "step": 1206 + }, + { + "epoch": 0.15997349237905897, + "grad_norm": 7.867863655090332, + "learning_rate": 4.965547491277957e-06, + "loss": 0.0445, + "num_input_tokens_seen": 2382344, + "step": 1207 + }, + { + "epoch": 0.1601060304837641, + "grad_norm": 21.34485626220703, + "learning_rate": 4.965490035332193e-06, + "loss": 0.7536, + "num_input_tokens_seen": 2384232, + "step": 1208 + }, + { + "epoch": 0.1602385685884692, + "grad_norm": 12.051791191101074, + "learning_rate": 4.965432531850081e-06, + "loss": 0.2002, + "num_input_tokens_seen": 2386752, + "step": 1209 + }, + { + "epoch": 0.16037110669317428, + "grad_norm": 24.40317726135254, + "learning_rate": 4.965374980832729e-06, + "loss": 0.4844, + "num_input_tokens_seen": 2388560, + "step": 1210 + }, + { + "epoch": 0.1605036447978794, + "grad_norm": 10.289485931396484, + "learning_rate": 4.9653173822812466e-06, + "loss": 0.1146, + "num_input_tokens_seen": 2390520, + "step": 1211 + }, + { + "epoch": 0.1606361829025845, + "grad_norm": 9.223539352416992, + "learning_rate": 4.965259736196744e-06, + "loss": 0.2771, + "num_input_tokens_seen": 2392504, + "step": 1212 + }, + { + "epoch": 0.1607687210072896, + "grad_norm": 18.188947677612305, + "learning_rate": 4.965202042580333e-06, + "loss": 0.5398, + "num_input_tokens_seen": 2396208, + "step": 1213 + }, + { + "epoch": 0.1609012591119947, + "grad_norm": 8.66128158569336, + "learning_rate": 4.965144301433126e-06, + "loss": 0.122, + "num_input_tokens_seen": 2397920, + "step": 1214 + }, + { + "epoch": 0.1610337972166998, + "grad_norm": 9.544986724853516, + "learning_rate": 4.965086512756236e-06, + "loss": 0.1422, + "num_input_tokens_seen": 2399576, + "step": 1215 + }, + { + "epoch": 0.1611663353214049, + "grad_norm": 10.718111991882324, + "learning_rate": 4.965028676550778e-06, + "loss": 0.2169, + "num_input_tokens_seen": 2401296, + "step": 1216 + }, + { + "epoch": 0.16129887342611002, + "grad_norm": 9.164186477661133, + "learning_rate": 4.964970792817866e-06, + "loss": 0.3463, + "num_input_tokens_seen": 2403816, + "step": 1217 + }, + { + "epoch": 0.1614314115308151, + "grad_norm": 14.77768611907959, + "learning_rate": 4.964912861558617e-06, + "loss": 0.4384, + "num_input_tokens_seen": 2405872, + "step": 1218 + }, + { + "epoch": 0.1615639496355202, + "grad_norm": 9.14603328704834, + "learning_rate": 4.964854882774148e-06, + "loss": 0.2088, + "num_input_tokens_seen": 2407776, + "step": 1219 + }, + { + "epoch": 0.16169648774022533, + "grad_norm": 7.468289852142334, + "learning_rate": 4.964796856465576e-06, + "loss": 0.1911, + "num_input_tokens_seen": 2409488, + "step": 1220 + }, + { + "epoch": 0.16182902584493042, + "grad_norm": 2.329026460647583, + "learning_rate": 4.96473878263402e-06, + "loss": 0.0119, + "num_input_tokens_seen": 2411224, + "step": 1221 + }, + { + "epoch": 0.1619615639496355, + "grad_norm": 5.656033992767334, + "learning_rate": 4.9646806612806e-06, + "loss": 0.0471, + "num_input_tokens_seen": 2412808, + "step": 1222 + }, + { + "epoch": 0.16209410205434063, + "grad_norm": 8.645514488220215, + "learning_rate": 4.964622492406437e-06, + "loss": 0.1249, + "num_input_tokens_seen": 2414240, + "step": 1223 + }, + { + "epoch": 0.16222664015904573, + "grad_norm": 0.2108611762523651, + "learning_rate": 4.964564276012652e-06, + "loss": 0.0008, + "num_input_tokens_seen": 2416544, + "step": 1224 + }, + { + "epoch": 0.16235917826375082, + "grad_norm": 13.370351791381836, + "learning_rate": 4.964506012100367e-06, + "loss": 0.1463, + "num_input_tokens_seen": 2418144, + "step": 1225 + }, + { + "epoch": 0.16249171636845594, + "grad_norm": 10.555953025817871, + "learning_rate": 4.964447700670706e-06, + "loss": 0.2306, + "num_input_tokens_seen": 2420040, + "step": 1226 + }, + { + "epoch": 0.16262425447316103, + "grad_norm": 12.894680976867676, + "learning_rate": 4.964389341724794e-06, + "loss": 0.2132, + "num_input_tokens_seen": 2422504, + "step": 1227 + }, + { + "epoch": 0.16275679257786613, + "grad_norm": 18.176006317138672, + "learning_rate": 4.964330935263754e-06, + "loss": 0.4596, + "num_input_tokens_seen": 2424376, + "step": 1228 + }, + { + "epoch": 0.16288933068257125, + "grad_norm": 56.68040084838867, + "learning_rate": 4.964272481288715e-06, + "loss": 0.1415, + "num_input_tokens_seen": 2427168, + "step": 1229 + }, + { + "epoch": 0.16302186878727634, + "grad_norm": 22.071115493774414, + "learning_rate": 4.964213979800802e-06, + "loss": 0.5743, + "num_input_tokens_seen": 2429760, + "step": 1230 + }, + { + "epoch": 0.16315440689198143, + "grad_norm": 5.792419910430908, + "learning_rate": 4.964155430801143e-06, + "loss": 0.0925, + "num_input_tokens_seen": 2431880, + "step": 1231 + }, + { + "epoch": 0.16328694499668656, + "grad_norm": 13.725963592529297, + "learning_rate": 4.964096834290868e-06, + "loss": 0.3464, + "num_input_tokens_seen": 2433800, + "step": 1232 + }, + { + "epoch": 0.16341948310139165, + "grad_norm": 11.299811363220215, + "learning_rate": 4.964038190271106e-06, + "loss": 0.1396, + "num_input_tokens_seen": 2435296, + "step": 1233 + }, + { + "epoch": 0.16355202120609674, + "grad_norm": 6.829468727111816, + "learning_rate": 4.963979498742988e-06, + "loss": 0.1915, + "num_input_tokens_seen": 2437008, + "step": 1234 + }, + { + "epoch": 0.16368455931080186, + "grad_norm": 15.892531394958496, + "learning_rate": 4.963920759707645e-06, + "loss": 0.3817, + "num_input_tokens_seen": 2440152, + "step": 1235 + }, + { + "epoch": 0.16381709741550696, + "grad_norm": 9.568033218383789, + "learning_rate": 4.96386197316621e-06, + "loss": 0.0495, + "num_input_tokens_seen": 2442048, + "step": 1236 + }, + { + "epoch": 0.16394963552021205, + "grad_norm": 7.006852149963379, + "learning_rate": 4.963803139119817e-06, + "loss": 0.2443, + "num_input_tokens_seen": 2444840, + "step": 1237 + }, + { + "epoch": 0.16408217362491717, + "grad_norm": 13.384697914123535, + "learning_rate": 4.9637442575695995e-06, + "loss": 0.3175, + "num_input_tokens_seen": 2446568, + "step": 1238 + }, + { + "epoch": 0.16421471172962226, + "grad_norm": 14.999347686767578, + "learning_rate": 4.963685328516693e-06, + "loss": 0.1554, + "num_input_tokens_seen": 2449384, + "step": 1239 + }, + { + "epoch": 0.16434724983432736, + "grad_norm": 0.9093858599662781, + "learning_rate": 4.9636263519622334e-06, + "loss": 0.0047, + "num_input_tokens_seen": 2451536, + "step": 1240 + }, + { + "epoch": 0.16447978793903248, + "grad_norm": 12.794568061828613, + "learning_rate": 4.963567327907359e-06, + "loss": 0.2892, + "num_input_tokens_seen": 2453088, + "step": 1241 + }, + { + "epoch": 0.16461232604373757, + "grad_norm": 28.85283660888672, + "learning_rate": 4.963508256353206e-06, + "loss": 0.1888, + "num_input_tokens_seen": 2455704, + "step": 1242 + }, + { + "epoch": 0.16474486414844267, + "grad_norm": 10.746437072753906, + "learning_rate": 4.963449137300915e-06, + "loss": 0.1503, + "num_input_tokens_seen": 2458200, + "step": 1243 + }, + { + "epoch": 0.1648774022531478, + "grad_norm": 2.46858811378479, + "learning_rate": 4.9633899707516245e-06, + "loss": 0.0713, + "num_input_tokens_seen": 2459384, + "step": 1244 + }, + { + "epoch": 0.16500994035785288, + "grad_norm": 1.8052623271942139, + "learning_rate": 4.963330756706477e-06, + "loss": 0.0084, + "num_input_tokens_seen": 2460800, + "step": 1245 + }, + { + "epoch": 0.16514247846255797, + "grad_norm": 7.2112603187561035, + "learning_rate": 4.963271495166612e-06, + "loss": 0.2209, + "num_input_tokens_seen": 2463552, + "step": 1246 + }, + { + "epoch": 0.1652750165672631, + "grad_norm": 0.8979879021644592, + "learning_rate": 4.963212186133174e-06, + "loss": 0.0048, + "num_input_tokens_seen": 2465504, + "step": 1247 + }, + { + "epoch": 0.1654075546719682, + "grad_norm": 23.280481338500977, + "learning_rate": 4.963152829607306e-06, + "loss": 0.4791, + "num_input_tokens_seen": 2467528, + "step": 1248 + }, + { + "epoch": 0.16554009277667328, + "grad_norm": 15.63931941986084, + "learning_rate": 4.963093425590152e-06, + "loss": 0.2506, + "num_input_tokens_seen": 2470128, + "step": 1249 + }, + { + "epoch": 0.1656726308813784, + "grad_norm": 7.766134262084961, + "learning_rate": 4.963033974082857e-06, + "loss": 0.1795, + "num_input_tokens_seen": 2472808, + "step": 1250 + }, + { + "epoch": 0.1658051689860835, + "grad_norm": 16.949462890625, + "learning_rate": 4.962974475086568e-06, + "loss": 0.598, + "num_input_tokens_seen": 2474696, + "step": 1251 + }, + { + "epoch": 0.1659377070907886, + "grad_norm": 16.239791870117188, + "learning_rate": 4.962914928602432e-06, + "loss": 0.6068, + "num_input_tokens_seen": 2476944, + "step": 1252 + }, + { + "epoch": 0.1660702451954937, + "grad_norm": 18.533939361572266, + "learning_rate": 4.962855334631598e-06, + "loss": 0.2969, + "num_input_tokens_seen": 2478856, + "step": 1253 + }, + { + "epoch": 0.1662027833001988, + "grad_norm": 12.568775177001953, + "learning_rate": 4.962795693175213e-06, + "loss": 0.1615, + "num_input_tokens_seen": 2480576, + "step": 1254 + }, + { + "epoch": 0.1663353214049039, + "grad_norm": 0.4570324718952179, + "learning_rate": 4.962736004234429e-06, + "loss": 0.002, + "num_input_tokens_seen": 2482216, + "step": 1255 + }, + { + "epoch": 0.16646785950960902, + "grad_norm": 10.312764167785645, + "learning_rate": 4.962676267810395e-06, + "loss": 0.2164, + "num_input_tokens_seen": 2484192, + "step": 1256 + }, + { + "epoch": 0.1666003976143141, + "grad_norm": 14.856953620910645, + "learning_rate": 4.962616483904263e-06, + "loss": 0.2962, + "num_input_tokens_seen": 2485928, + "step": 1257 + }, + { + "epoch": 0.16673293571901923, + "grad_norm": 20.90708351135254, + "learning_rate": 4.962556652517188e-06, + "loss": 0.458, + "num_input_tokens_seen": 2487856, + "step": 1258 + }, + { + "epoch": 0.16686547382372433, + "grad_norm": 0.3233267664909363, + "learning_rate": 4.962496773650322e-06, + "loss": 0.0018, + "num_input_tokens_seen": 2489248, + "step": 1259 + }, + { + "epoch": 0.16699801192842942, + "grad_norm": 20.04923439025879, + "learning_rate": 4.962436847304818e-06, + "loss": 0.3699, + "num_input_tokens_seen": 2491816, + "step": 1260 + }, + { + "epoch": 0.16713055003313454, + "grad_norm": 23.092763900756836, + "learning_rate": 4.962376873481833e-06, + "loss": 0.7277, + "num_input_tokens_seen": 2494112, + "step": 1261 + }, + { + "epoch": 0.16726308813783963, + "grad_norm": 14.853482246398926, + "learning_rate": 4.962316852182523e-06, + "loss": 0.4203, + "num_input_tokens_seen": 2495800, + "step": 1262 + }, + { + "epoch": 0.16739562624254473, + "grad_norm": 8.015201568603516, + "learning_rate": 4.9622567834080475e-06, + "loss": 0.1154, + "num_input_tokens_seen": 2497704, + "step": 1263 + }, + { + "epoch": 0.16752816434724985, + "grad_norm": 11.065986633300781, + "learning_rate": 4.96219666715956e-06, + "loss": 0.396, + "num_input_tokens_seen": 2500376, + "step": 1264 + }, + { + "epoch": 0.16766070245195494, + "grad_norm": 7.045234680175781, + "learning_rate": 4.962136503438224e-06, + "loss": 0.2278, + "num_input_tokens_seen": 2501608, + "step": 1265 + }, + { + "epoch": 0.16779324055666003, + "grad_norm": 18.814783096313477, + "learning_rate": 4.962076292245197e-06, + "loss": 0.6398, + "num_input_tokens_seen": 2504376, + "step": 1266 + }, + { + "epoch": 0.16792577866136515, + "grad_norm": 8.410832405090332, + "learning_rate": 4.962016033581641e-06, + "loss": 0.0826, + "num_input_tokens_seen": 2506360, + "step": 1267 + }, + { + "epoch": 0.16805831676607025, + "grad_norm": 9.72460651397705, + "learning_rate": 4.961955727448716e-06, + "loss": 0.0395, + "num_input_tokens_seen": 2507784, + "step": 1268 + }, + { + "epoch": 0.16819085487077534, + "grad_norm": 6.11878776550293, + "learning_rate": 4.961895373847588e-06, + "loss": 0.1862, + "num_input_tokens_seen": 2509576, + "step": 1269 + }, + { + "epoch": 0.16832339297548046, + "grad_norm": 14.088763236999512, + "learning_rate": 4.961834972779418e-06, + "loss": 0.253, + "num_input_tokens_seen": 2511480, + "step": 1270 + }, + { + "epoch": 0.16845593108018556, + "grad_norm": 12.575016021728516, + "learning_rate": 4.961774524245371e-06, + "loss": 0.1721, + "num_input_tokens_seen": 2513424, + "step": 1271 + }, + { + "epoch": 0.16858846918489065, + "grad_norm": 7.279139518737793, + "learning_rate": 4.9617140282466136e-06, + "loss": 0.2121, + "num_input_tokens_seen": 2515968, + "step": 1272 + }, + { + "epoch": 0.16872100728959577, + "grad_norm": 14.74572467803955, + "learning_rate": 4.9616534847843115e-06, + "loss": 0.1213, + "num_input_tokens_seen": 2517824, + "step": 1273 + }, + { + "epoch": 0.16885354539430086, + "grad_norm": 0.7207683324813843, + "learning_rate": 4.9615928938596305e-06, + "loss": 0.0039, + "num_input_tokens_seen": 2519168, + "step": 1274 + }, + { + "epoch": 0.16898608349900596, + "grad_norm": 0.6745265126228333, + "learning_rate": 4.961532255473742e-06, + "loss": 0.0037, + "num_input_tokens_seen": 2522008, + "step": 1275 + }, + { + "epoch": 0.16911862160371108, + "grad_norm": 19.08082389831543, + "learning_rate": 4.961471569627813e-06, + "loss": 0.3813, + "num_input_tokens_seen": 2523960, + "step": 1276 + }, + { + "epoch": 0.16925115970841617, + "grad_norm": 14.044391632080078, + "learning_rate": 4.961410836323014e-06, + "loss": 0.3901, + "num_input_tokens_seen": 2526504, + "step": 1277 + }, + { + "epoch": 0.16938369781312126, + "grad_norm": 6.240870952606201, + "learning_rate": 4.961350055560516e-06, + "loss": 0.1013, + "num_input_tokens_seen": 2528200, + "step": 1278 + }, + { + "epoch": 0.16951623591782639, + "grad_norm": 0.1938682645559311, + "learning_rate": 4.961289227341491e-06, + "loss": 0.0011, + "num_input_tokens_seen": 2529608, + "step": 1279 + }, + { + "epoch": 0.16964877402253148, + "grad_norm": 70.04135131835938, + "learning_rate": 4.961228351667111e-06, + "loss": 0.9064, + "num_input_tokens_seen": 2532488, + "step": 1280 + }, + { + "epoch": 0.16978131212723657, + "grad_norm": 0.11638358235359192, + "learning_rate": 4.961167428538552e-06, + "loss": 0.0006, + "num_input_tokens_seen": 2533640, + "step": 1281 + }, + { + "epoch": 0.1699138502319417, + "grad_norm": 13.382452964782715, + "learning_rate": 4.961106457956986e-06, + "loss": 0.3132, + "num_input_tokens_seen": 2535384, + "step": 1282 + }, + { + "epoch": 0.1700463883366468, + "grad_norm": 12.490396499633789, + "learning_rate": 4.961045439923589e-06, + "loss": 0.2024, + "num_input_tokens_seen": 2537776, + "step": 1283 + }, + { + "epoch": 0.17017892644135188, + "grad_norm": 8.545411109924316, + "learning_rate": 4.9609843744395405e-06, + "loss": 0.272, + "num_input_tokens_seen": 2540064, + "step": 1284 + }, + { + "epoch": 0.170311464546057, + "grad_norm": 0.04541824758052826, + "learning_rate": 4.960923261506014e-06, + "loss": 0.0003, + "num_input_tokens_seen": 2541312, + "step": 1285 + }, + { + "epoch": 0.1704440026507621, + "grad_norm": 11.121861457824707, + "learning_rate": 4.960862101124189e-06, + "loss": 0.2936, + "num_input_tokens_seen": 2543048, + "step": 1286 + }, + { + "epoch": 0.1705765407554672, + "grad_norm": 15.224230766296387, + "learning_rate": 4.960800893295246e-06, + "loss": 0.3807, + "num_input_tokens_seen": 2544344, + "step": 1287 + }, + { + "epoch": 0.1707090788601723, + "grad_norm": 6.467791557312012, + "learning_rate": 4.960739638020363e-06, + "loss": 0.0913, + "num_input_tokens_seen": 2546464, + "step": 1288 + }, + { + "epoch": 0.1708416169648774, + "grad_norm": 14.586759567260742, + "learning_rate": 4.960678335300723e-06, + "loss": 0.4684, + "num_input_tokens_seen": 2548840, + "step": 1289 + }, + { + "epoch": 0.1709741550695825, + "grad_norm": 0.07136260718107224, + "learning_rate": 4.960616985137507e-06, + "loss": 0.0004, + "num_input_tokens_seen": 2550432, + "step": 1290 + }, + { + "epoch": 0.17110669317428762, + "grad_norm": 0.47577252984046936, + "learning_rate": 4.960555587531898e-06, + "loss": 0.0023, + "num_input_tokens_seen": 2552160, + "step": 1291 + }, + { + "epoch": 0.1712392312789927, + "grad_norm": 13.335426330566406, + "learning_rate": 4.960494142485079e-06, + "loss": 0.256, + "num_input_tokens_seen": 2553880, + "step": 1292 + }, + { + "epoch": 0.1713717693836978, + "grad_norm": 13.659242630004883, + "learning_rate": 4.960432649998237e-06, + "loss": 0.2444, + "num_input_tokens_seen": 2555496, + "step": 1293 + }, + { + "epoch": 0.17150430748840292, + "grad_norm": 17.66350555419922, + "learning_rate": 4.960371110072556e-06, + "loss": 0.4865, + "num_input_tokens_seen": 2557240, + "step": 1294 + }, + { + "epoch": 0.17163684559310802, + "grad_norm": 0.0468173585832119, + "learning_rate": 4.960309522709222e-06, + "loss": 0.0003, + "num_input_tokens_seen": 2558608, + "step": 1295 + }, + { + "epoch": 0.1717693836978131, + "grad_norm": 12.314399719238281, + "learning_rate": 4.960247887909423e-06, + "loss": 0.5134, + "num_input_tokens_seen": 2560296, + "step": 1296 + }, + { + "epoch": 0.17190192180251823, + "grad_norm": 7.6465373039245605, + "learning_rate": 4.960186205674348e-06, + "loss": 0.2015, + "num_input_tokens_seen": 2563136, + "step": 1297 + }, + { + "epoch": 0.17203445990722332, + "grad_norm": 7.99665641784668, + "learning_rate": 4.960124476005185e-06, + "loss": 0.1211, + "num_input_tokens_seen": 2564800, + "step": 1298 + }, + { + "epoch": 0.17216699801192842, + "grad_norm": 5.727914810180664, + "learning_rate": 4.960062698903126e-06, + "loss": 0.0384, + "num_input_tokens_seen": 2566536, + "step": 1299 + }, + { + "epoch": 0.17229953611663354, + "grad_norm": 9.787583351135254, + "learning_rate": 4.960000874369361e-06, + "loss": 0.2331, + "num_input_tokens_seen": 2568440, + "step": 1300 + }, + { + "epoch": 0.17243207422133863, + "grad_norm": 29.9659481048584, + "learning_rate": 4.959939002405081e-06, + "loss": 0.7485, + "num_input_tokens_seen": 2570344, + "step": 1301 + }, + { + "epoch": 0.17256461232604373, + "grad_norm": 4.79191255569458, + "learning_rate": 4.95987708301148e-06, + "loss": 0.1534, + "num_input_tokens_seen": 2572056, + "step": 1302 + }, + { + "epoch": 0.17269715043074885, + "grad_norm": 0.05787365138530731, + "learning_rate": 4.959815116189753e-06, + "loss": 0.0003, + "num_input_tokens_seen": 2573704, + "step": 1303 + }, + { + "epoch": 0.17282968853545394, + "grad_norm": 0.05866735056042671, + "learning_rate": 4.959753101941093e-06, + "loss": 0.0003, + "num_input_tokens_seen": 2575800, + "step": 1304 + }, + { + "epoch": 0.17296222664015903, + "grad_norm": 22.51832389831543, + "learning_rate": 4.959691040266697e-06, + "loss": 0.6, + "num_input_tokens_seen": 2577216, + "step": 1305 + }, + { + "epoch": 0.17309476474486415, + "grad_norm": 6.810016632080078, + "learning_rate": 4.95962893116776e-06, + "loss": 0.1055, + "num_input_tokens_seen": 2579216, + "step": 1306 + }, + { + "epoch": 0.17322730284956925, + "grad_norm": 10.866792678833008, + "learning_rate": 4.9595667746454805e-06, + "loss": 0.1586, + "num_input_tokens_seen": 2580720, + "step": 1307 + }, + { + "epoch": 0.17335984095427434, + "grad_norm": 4.257212162017822, + "learning_rate": 4.959504570701057e-06, + "loss": 0.0613, + "num_input_tokens_seen": 2584008, + "step": 1308 + }, + { + "epoch": 0.17349237905897946, + "grad_norm": 7.145122528076172, + "learning_rate": 4.959442319335689e-06, + "loss": 0.133, + "num_input_tokens_seen": 2586456, + "step": 1309 + }, + { + "epoch": 0.17362491716368456, + "grad_norm": 12.153865814208984, + "learning_rate": 4.959380020550576e-06, + "loss": 0.3231, + "num_input_tokens_seen": 2587840, + "step": 1310 + }, + { + "epoch": 0.17375745526838965, + "grad_norm": 33.809593200683594, + "learning_rate": 4.95931767434692e-06, + "loss": 0.7073, + "num_input_tokens_seen": 2589544, + "step": 1311 + }, + { + "epoch": 0.17388999337309477, + "grad_norm": 22.8640079498291, + "learning_rate": 4.959255280725921e-06, + "loss": 0.4314, + "num_input_tokens_seen": 2591984, + "step": 1312 + }, + { + "epoch": 0.17402253147779986, + "grad_norm": 0.2230100929737091, + "learning_rate": 4.959192839688786e-06, + "loss": 0.0009, + "num_input_tokens_seen": 2593728, + "step": 1313 + }, + { + "epoch": 0.17415506958250496, + "grad_norm": 27.39137077331543, + "learning_rate": 4.959130351236715e-06, + "loss": 1.2096, + "num_input_tokens_seen": 2596560, + "step": 1314 + }, + { + "epoch": 0.17428760768721008, + "grad_norm": 7.179905414581299, + "learning_rate": 4.959067815370915e-06, + "loss": 0.0992, + "num_input_tokens_seen": 2598832, + "step": 1315 + }, + { + "epoch": 0.17442014579191517, + "grad_norm": 12.255276679992676, + "learning_rate": 4.959005232092591e-06, + "loss": 0.3726, + "num_input_tokens_seen": 2600504, + "step": 1316 + }, + { + "epoch": 0.1745526838966203, + "grad_norm": 0.24817180633544922, + "learning_rate": 4.958942601402949e-06, + "loss": 0.0013, + "num_input_tokens_seen": 2602184, + "step": 1317 + }, + { + "epoch": 0.17468522200132539, + "grad_norm": 17.49175262451172, + "learning_rate": 4.958879923303198e-06, + "loss": 0.4214, + "num_input_tokens_seen": 2603912, + "step": 1318 + }, + { + "epoch": 0.17481776010603048, + "grad_norm": 15.520642280578613, + "learning_rate": 4.958817197794545e-06, + "loss": 0.5072, + "num_input_tokens_seen": 2605640, + "step": 1319 + }, + { + "epoch": 0.1749502982107356, + "grad_norm": 7.988204479217529, + "learning_rate": 4.958754424878202e-06, + "loss": 0.1501, + "num_input_tokens_seen": 2608952, + "step": 1320 + }, + { + "epoch": 0.1750828363154407, + "grad_norm": 14.617895126342773, + "learning_rate": 4.958691604555376e-06, + "loss": 0.4428, + "num_input_tokens_seen": 2610320, + "step": 1321 + }, + { + "epoch": 0.1752153744201458, + "grad_norm": 8.387760162353516, + "learning_rate": 4.9586287368272805e-06, + "loss": 0.2063, + "num_input_tokens_seen": 2612480, + "step": 1322 + }, + { + "epoch": 0.1753479125248509, + "grad_norm": 0.3168196380138397, + "learning_rate": 4.958565821695126e-06, + "loss": 0.0017, + "num_input_tokens_seen": 2614824, + "step": 1323 + }, + { + "epoch": 0.175480450629556, + "grad_norm": 5.0695576667785645, + "learning_rate": 4.958502859160127e-06, + "loss": 0.1005, + "num_input_tokens_seen": 2618512, + "step": 1324 + }, + { + "epoch": 0.1756129887342611, + "grad_norm": 21.439069747924805, + "learning_rate": 4.958439849223496e-06, + "loss": 0.6034, + "num_input_tokens_seen": 2621664, + "step": 1325 + }, + { + "epoch": 0.17574552683896622, + "grad_norm": 10.268298149108887, + "learning_rate": 4.95837679188645e-06, + "loss": 0.0392, + "num_input_tokens_seen": 2623624, + "step": 1326 + }, + { + "epoch": 0.1758780649436713, + "grad_norm": 15.390063285827637, + "learning_rate": 4.9583136871502025e-06, + "loss": 0.3062, + "num_input_tokens_seen": 2625328, + "step": 1327 + }, + { + "epoch": 0.1760106030483764, + "grad_norm": 13.129558563232422, + "learning_rate": 4.958250535015972e-06, + "loss": 0.4096, + "num_input_tokens_seen": 2627464, + "step": 1328 + }, + { + "epoch": 0.17614314115308152, + "grad_norm": 16.459684371948242, + "learning_rate": 4.958187335484974e-06, + "loss": 0.5073, + "num_input_tokens_seen": 2630224, + "step": 1329 + }, + { + "epoch": 0.17627567925778662, + "grad_norm": 6.999892234802246, + "learning_rate": 4.9581240885584295e-06, + "loss": 0.2283, + "num_input_tokens_seen": 2632296, + "step": 1330 + }, + { + "epoch": 0.1764082173624917, + "grad_norm": 4.824463367462158, + "learning_rate": 4.958060794237557e-06, + "loss": 0.1857, + "num_input_tokens_seen": 2633848, + "step": 1331 + }, + { + "epoch": 0.17654075546719683, + "grad_norm": 0.5638998746871948, + "learning_rate": 4.957997452523576e-06, + "loss": 0.0031, + "num_input_tokens_seen": 2635456, + "step": 1332 + }, + { + "epoch": 0.17667329357190192, + "grad_norm": 8.772820472717285, + "learning_rate": 4.957934063417708e-06, + "loss": 0.1095, + "num_input_tokens_seen": 2638112, + "step": 1333 + }, + { + "epoch": 0.17680583167660702, + "grad_norm": 18.29918670654297, + "learning_rate": 4.957870626921177e-06, + "loss": 0.7832, + "num_input_tokens_seen": 2639760, + "step": 1334 + }, + { + "epoch": 0.17693836978131214, + "grad_norm": 23.334012985229492, + "learning_rate": 4.957807143035204e-06, + "loss": 0.2197, + "num_input_tokens_seen": 2641640, + "step": 1335 + }, + { + "epoch": 0.17707090788601723, + "grad_norm": 4.936501979827881, + "learning_rate": 4.957743611761013e-06, + "loss": 0.0911, + "num_input_tokens_seen": 2642944, + "step": 1336 + }, + { + "epoch": 0.17720344599072232, + "grad_norm": 11.561747550964355, + "learning_rate": 4.957680033099831e-06, + "loss": 0.2882, + "num_input_tokens_seen": 2644568, + "step": 1337 + }, + { + "epoch": 0.17733598409542745, + "grad_norm": 0.17479321360588074, + "learning_rate": 4.957616407052882e-06, + "loss": 0.0009, + "num_input_tokens_seen": 2645736, + "step": 1338 + }, + { + "epoch": 0.17746852220013254, + "grad_norm": 7.629298686981201, + "learning_rate": 4.957552733621393e-06, + "loss": 0.1594, + "num_input_tokens_seen": 2647336, + "step": 1339 + }, + { + "epoch": 0.17760106030483763, + "grad_norm": 11.04284954071045, + "learning_rate": 4.957489012806591e-06, + "loss": 0.1328, + "num_input_tokens_seen": 2649664, + "step": 1340 + }, + { + "epoch": 0.17773359840954275, + "grad_norm": 6.219261169433594, + "learning_rate": 4.957425244609707e-06, + "loss": 0.1289, + "num_input_tokens_seen": 2651160, + "step": 1341 + }, + { + "epoch": 0.17786613651424785, + "grad_norm": 5.224959850311279, + "learning_rate": 4.957361429031969e-06, + "loss": 0.1162, + "num_input_tokens_seen": 2652888, + "step": 1342 + }, + { + "epoch": 0.17799867461895294, + "grad_norm": 18.43150520324707, + "learning_rate": 4.957297566074607e-06, + "loss": 0.3498, + "num_input_tokens_seen": 2654760, + "step": 1343 + }, + { + "epoch": 0.17813121272365806, + "grad_norm": 0.18989135324954987, + "learning_rate": 4.957233655738852e-06, + "loss": 0.001, + "num_input_tokens_seen": 2655968, + "step": 1344 + }, + { + "epoch": 0.17826375082836315, + "grad_norm": 0.4958026111125946, + "learning_rate": 4.9571696980259376e-06, + "loss": 0.0026, + "num_input_tokens_seen": 2657848, + "step": 1345 + }, + { + "epoch": 0.17839628893306825, + "grad_norm": 10.45175552368164, + "learning_rate": 4.9571056929370965e-06, + "loss": 0.3346, + "num_input_tokens_seen": 2659592, + "step": 1346 + }, + { + "epoch": 0.17852882703777337, + "grad_norm": 1.3028470277786255, + "learning_rate": 4.957041640473562e-06, + "loss": 0.0056, + "num_input_tokens_seen": 2661432, + "step": 1347 + }, + { + "epoch": 0.17866136514247846, + "grad_norm": 0.16366267204284668, + "learning_rate": 4.95697754063657e-06, + "loss": 0.0009, + "num_input_tokens_seen": 2662904, + "step": 1348 + }, + { + "epoch": 0.17879390324718356, + "grad_norm": 8.679288864135742, + "learning_rate": 4.956913393427356e-06, + "loss": 0.0858, + "num_input_tokens_seen": 2664200, + "step": 1349 + }, + { + "epoch": 0.17892644135188868, + "grad_norm": 7.6977925300598145, + "learning_rate": 4.956849198847157e-06, + "loss": 0.155, + "num_input_tokens_seen": 2666392, + "step": 1350 + }, + { + "epoch": 0.17905897945659377, + "grad_norm": 4.024433612823486, + "learning_rate": 4.95678495689721e-06, + "loss": 0.0219, + "num_input_tokens_seen": 2668096, + "step": 1351 + }, + { + "epoch": 0.17919151756129886, + "grad_norm": 16.507389068603516, + "learning_rate": 4.956720667578755e-06, + "loss": 0.3301, + "num_input_tokens_seen": 2669976, + "step": 1352 + }, + { + "epoch": 0.17932405566600398, + "grad_norm": 12.342925071716309, + "learning_rate": 4.95665633089303e-06, + "loss": 0.2227, + "num_input_tokens_seen": 2671896, + "step": 1353 + }, + { + "epoch": 0.17945659377070908, + "grad_norm": 16.24755859375, + "learning_rate": 4.956591946841277e-06, + "loss": 0.2726, + "num_input_tokens_seen": 2674160, + "step": 1354 + }, + { + "epoch": 0.17958913187541417, + "grad_norm": 15.209321975708008, + "learning_rate": 4.956527515424736e-06, + "loss": 0.6729, + "num_input_tokens_seen": 2677000, + "step": 1355 + }, + { + "epoch": 0.1797216699801193, + "grad_norm": 15.547550201416016, + "learning_rate": 4.95646303664465e-06, + "loss": 0.1294, + "num_input_tokens_seen": 2679720, + "step": 1356 + }, + { + "epoch": 0.17985420808482439, + "grad_norm": 0.04554541036486626, + "learning_rate": 4.956398510502261e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2680728, + "step": 1357 + }, + { + "epoch": 0.17998674618952948, + "grad_norm": 0.07686033844947815, + "learning_rate": 4.956333936998816e-06, + "loss": 0.0004, + "num_input_tokens_seen": 2683136, + "step": 1358 + }, + { + "epoch": 0.1801192842942346, + "grad_norm": 13.693772315979004, + "learning_rate": 4.956269316135557e-06, + "loss": 0.5034, + "num_input_tokens_seen": 2684736, + "step": 1359 + }, + { + "epoch": 0.1802518223989397, + "grad_norm": 10.692391395568848, + "learning_rate": 4.956204647913731e-06, + "loss": 0.333, + "num_input_tokens_seen": 2686952, + "step": 1360 + }, + { + "epoch": 0.1803843605036448, + "grad_norm": 0.030914820730686188, + "learning_rate": 4.956139932334586e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2689360, + "step": 1361 + }, + { + "epoch": 0.1805168986083499, + "grad_norm": 7.450273036956787, + "learning_rate": 4.956075169399368e-06, + "loss": 0.1463, + "num_input_tokens_seen": 2691240, + "step": 1362 + }, + { + "epoch": 0.180649436713055, + "grad_norm": 22.607311248779297, + "learning_rate": 4.956010359109326e-06, + "loss": 0.3636, + "num_input_tokens_seen": 2692696, + "step": 1363 + }, + { + "epoch": 0.1807819748177601, + "grad_norm": 9.848207473754883, + "learning_rate": 4.955945501465711e-06, + "loss": 0.1439, + "num_input_tokens_seen": 2695152, + "step": 1364 + }, + { + "epoch": 0.18091451292246521, + "grad_norm": 16.704357147216797, + "learning_rate": 4.955880596469771e-06, + "loss": 0.3333, + "num_input_tokens_seen": 2697160, + "step": 1365 + }, + { + "epoch": 0.1810470510271703, + "grad_norm": 0.12614494562149048, + "learning_rate": 4.95581564412276e-06, + "loss": 0.0007, + "num_input_tokens_seen": 2698680, + "step": 1366 + }, + { + "epoch": 0.1811795891318754, + "grad_norm": 3.5355217456817627, + "learning_rate": 4.955750644425928e-06, + "loss": 0.0901, + "num_input_tokens_seen": 2700512, + "step": 1367 + }, + { + "epoch": 0.18131212723658052, + "grad_norm": 9.856828689575195, + "learning_rate": 4.95568559738053e-06, + "loss": 0.2626, + "num_input_tokens_seen": 2702360, + "step": 1368 + }, + { + "epoch": 0.18144466534128562, + "grad_norm": 5.182863712310791, + "learning_rate": 4.9556205029878205e-06, + "loss": 0.1628, + "num_input_tokens_seen": 2704136, + "step": 1369 + }, + { + "epoch": 0.1815772034459907, + "grad_norm": 3.907506227493286, + "learning_rate": 4.955555361249053e-06, + "loss": 0.0814, + "num_input_tokens_seen": 2706224, + "step": 1370 + }, + { + "epoch": 0.18170974155069583, + "grad_norm": 9.75102710723877, + "learning_rate": 4.9554901721654846e-06, + "loss": 0.0714, + "num_input_tokens_seen": 2708560, + "step": 1371 + }, + { + "epoch": 0.18184227965540092, + "grad_norm": 11.964812278747559, + "learning_rate": 4.955424935738371e-06, + "loss": 0.3374, + "num_input_tokens_seen": 2710344, + "step": 1372 + }, + { + "epoch": 0.18197481776010602, + "grad_norm": 6.92396354675293, + "learning_rate": 4.95535965196897e-06, + "loss": 0.0887, + "num_input_tokens_seen": 2712272, + "step": 1373 + }, + { + "epoch": 0.18210735586481114, + "grad_norm": 22.524127960205078, + "learning_rate": 4.955294320858543e-06, + "loss": 0.5707, + "num_input_tokens_seen": 2713728, + "step": 1374 + }, + { + "epoch": 0.18223989396951623, + "grad_norm": 0.24901993572711945, + "learning_rate": 4.955228942408347e-06, + "loss": 0.0013, + "num_input_tokens_seen": 2715912, + "step": 1375 + }, + { + "epoch": 0.18237243207422135, + "grad_norm": 12.116771697998047, + "learning_rate": 4.9551635166196425e-06, + "loss": 0.3255, + "num_input_tokens_seen": 2718016, + "step": 1376 + }, + { + "epoch": 0.18250497017892645, + "grad_norm": 7.756706714630127, + "learning_rate": 4.955098043493692e-06, + "loss": 0.2238, + "num_input_tokens_seen": 2720272, + "step": 1377 + }, + { + "epoch": 0.18263750828363154, + "grad_norm": 3.5088140964508057, + "learning_rate": 4.955032523031757e-06, + "loss": 0.0205, + "num_input_tokens_seen": 2721824, + "step": 1378 + }, + { + "epoch": 0.18277004638833666, + "grad_norm": 15.379654884338379, + "learning_rate": 4.954966955235103e-06, + "loss": 0.4006, + "num_input_tokens_seen": 2724432, + "step": 1379 + }, + { + "epoch": 0.18290258449304175, + "grad_norm": 10.215106964111328, + "learning_rate": 4.954901340104991e-06, + "loss": 0.0775, + "num_input_tokens_seen": 2726392, + "step": 1380 + }, + { + "epoch": 0.18303512259774685, + "grad_norm": 2.4512383937835693, + "learning_rate": 4.954835677642688e-06, + "loss": 0.013, + "num_input_tokens_seen": 2727536, + "step": 1381 + }, + { + "epoch": 0.18316766070245197, + "grad_norm": 3.808758497238159, + "learning_rate": 4.95476996784946e-06, + "loss": 0.0248, + "num_input_tokens_seen": 2729104, + "step": 1382 + }, + { + "epoch": 0.18330019880715706, + "grad_norm": 17.573938369750977, + "learning_rate": 4.954704210726573e-06, + "loss": 0.3097, + "num_input_tokens_seen": 2731288, + "step": 1383 + }, + { + "epoch": 0.18343273691186215, + "grad_norm": 0.7857549786567688, + "learning_rate": 4.9546384062752965e-06, + "loss": 0.004, + "num_input_tokens_seen": 2733352, + "step": 1384 + }, + { + "epoch": 0.18356527501656728, + "grad_norm": 0.9988700151443481, + "learning_rate": 4.954572554496897e-06, + "loss": 0.0051, + "num_input_tokens_seen": 2735472, + "step": 1385 + }, + { + "epoch": 0.18369781312127237, + "grad_norm": 15.075550079345703, + "learning_rate": 4.954506655392645e-06, + "loss": 0.237, + "num_input_tokens_seen": 2737840, + "step": 1386 + }, + { + "epoch": 0.18383035122597746, + "grad_norm": 17.40070915222168, + "learning_rate": 4.954440708963813e-06, + "loss": 0.3638, + "num_input_tokens_seen": 2739824, + "step": 1387 + }, + { + "epoch": 0.18396288933068258, + "grad_norm": 14.727279663085938, + "learning_rate": 4.9543747152116695e-06, + "loss": 0.3716, + "num_input_tokens_seen": 2741912, + "step": 1388 + }, + { + "epoch": 0.18409542743538768, + "grad_norm": 0.0239323228597641, + "learning_rate": 4.954308674137489e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2743536, + "step": 1389 + }, + { + "epoch": 0.18422796554009277, + "grad_norm": 5.457317352294922, + "learning_rate": 4.9542425857425435e-06, + "loss": 0.1929, + "num_input_tokens_seen": 2745488, + "step": 1390 + }, + { + "epoch": 0.1843605036447979, + "grad_norm": 14.15846061706543, + "learning_rate": 4.954176450028108e-06, + "loss": 0.1807, + "num_input_tokens_seen": 2747352, + "step": 1391 + }, + { + "epoch": 0.18449304174950298, + "grad_norm": 10.494392395019531, + "learning_rate": 4.954110266995458e-06, + "loss": 0.1434, + "num_input_tokens_seen": 2749960, + "step": 1392 + }, + { + "epoch": 0.18462557985420808, + "grad_norm": 0.0084645701572299, + "learning_rate": 4.954044036645869e-06, + "loss": 0.0, + "num_input_tokens_seen": 2751328, + "step": 1393 + }, + { + "epoch": 0.1847581179589132, + "grad_norm": 8.417326927185059, + "learning_rate": 4.953977758980618e-06, + "loss": 0.2025, + "num_input_tokens_seen": 2753480, + "step": 1394 + }, + { + "epoch": 0.1848906560636183, + "grad_norm": 2.6627230644226074, + "learning_rate": 4.953911434000982e-06, + "loss": 0.0387, + "num_input_tokens_seen": 2755088, + "step": 1395 + }, + { + "epoch": 0.18502319416832338, + "grad_norm": 4.448336124420166, + "learning_rate": 4.953845061708241e-06, + "loss": 0.0609, + "num_input_tokens_seen": 2756048, + "step": 1396 + }, + { + "epoch": 0.1851557322730285, + "grad_norm": 0.013358982279896736, + "learning_rate": 4.953778642103675e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2757576, + "step": 1397 + }, + { + "epoch": 0.1852882703777336, + "grad_norm": 0.014393574558198452, + "learning_rate": 4.953712175188564e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2759968, + "step": 1398 + }, + { + "epoch": 0.1854208084824387, + "grad_norm": 0.01428682915866375, + "learning_rate": 4.953645660964189e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2761224, + "step": 1399 + }, + { + "epoch": 0.1855533465871438, + "grad_norm": 5.122958183288574, + "learning_rate": 4.9535790994318335e-06, + "loss": 0.1171, + "num_input_tokens_seen": 2762648, + "step": 1400 + }, + { + "epoch": 0.1856858846918489, + "grad_norm": 4.081135272979736, + "learning_rate": 4.95351249059278e-06, + "loss": 0.1286, + "num_input_tokens_seen": 2764832, + "step": 1401 + }, + { + "epoch": 0.185818422796554, + "grad_norm": 0.06594186276197433, + "learning_rate": 4.953445834448314e-06, + "loss": 0.0003, + "num_input_tokens_seen": 2766224, + "step": 1402 + }, + { + "epoch": 0.18595096090125912, + "grad_norm": 12.58669376373291, + "learning_rate": 4.9533791309997184e-06, + "loss": 0.202, + "num_input_tokens_seen": 2768408, + "step": 1403 + }, + { + "epoch": 0.18608349900596421, + "grad_norm": 11.852744102478027, + "learning_rate": 4.9533123802482815e-06, + "loss": 0.2999, + "num_input_tokens_seen": 2770896, + "step": 1404 + }, + { + "epoch": 0.1862160371106693, + "grad_norm": 17.630538940429688, + "learning_rate": 4.953245582195288e-06, + "loss": 0.4739, + "num_input_tokens_seen": 2773616, + "step": 1405 + }, + { + "epoch": 0.18634857521537443, + "grad_norm": 7.816521644592285, + "learning_rate": 4.953178736842029e-06, + "loss": 0.1411, + "num_input_tokens_seen": 2775952, + "step": 1406 + }, + { + "epoch": 0.18648111332007952, + "grad_norm": 15.505335807800293, + "learning_rate": 4.9531118441897915e-06, + "loss": 0.2839, + "num_input_tokens_seen": 2778184, + "step": 1407 + }, + { + "epoch": 0.18661365142478462, + "grad_norm": 29.322540283203125, + "learning_rate": 4.953044904239865e-06, + "loss": 0.3927, + "num_input_tokens_seen": 2780704, + "step": 1408 + }, + { + "epoch": 0.18674618952948974, + "grad_norm": 3.9190611839294434, + "learning_rate": 4.95297791699354e-06, + "loss": 0.143, + "num_input_tokens_seen": 2782800, + "step": 1409 + }, + { + "epoch": 0.18687872763419483, + "grad_norm": 16.224727630615234, + "learning_rate": 4.952910882452109e-06, + "loss": 0.2726, + "num_input_tokens_seen": 2785696, + "step": 1410 + }, + { + "epoch": 0.18701126573889992, + "grad_norm": 5.798166275024414, + "learning_rate": 4.9528438006168635e-06, + "loss": 0.176, + "num_input_tokens_seen": 2786904, + "step": 1411 + }, + { + "epoch": 0.18714380384360504, + "grad_norm": 11.637779235839844, + "learning_rate": 4.952776671489098e-06, + "loss": 0.2707, + "num_input_tokens_seen": 2788656, + "step": 1412 + }, + { + "epoch": 0.18727634194831014, + "grad_norm": 12.561522483825684, + "learning_rate": 4.952709495070106e-06, + "loss": 0.1834, + "num_input_tokens_seen": 2791176, + "step": 1413 + }, + { + "epoch": 0.18740888005301523, + "grad_norm": 20.759782791137695, + "learning_rate": 4.952642271361183e-06, + "loss": 0.4771, + "num_input_tokens_seen": 2792496, + "step": 1414 + }, + { + "epoch": 0.18754141815772035, + "grad_norm": 19.70482635498047, + "learning_rate": 4.952575000363625e-06, + "loss": 0.5119, + "num_input_tokens_seen": 2794128, + "step": 1415 + }, + { + "epoch": 0.18767395626242545, + "grad_norm": 4.772583961486816, + "learning_rate": 4.9525076820787285e-06, + "loss": 0.0587, + "num_input_tokens_seen": 2795328, + "step": 1416 + }, + { + "epoch": 0.18780649436713054, + "grad_norm": 10.307486534118652, + "learning_rate": 4.952440316507792e-06, + "loss": 0.157, + "num_input_tokens_seen": 2796784, + "step": 1417 + }, + { + "epoch": 0.18793903247183566, + "grad_norm": 13.984895706176758, + "learning_rate": 4.952372903652115e-06, + "loss": 0.4324, + "num_input_tokens_seen": 2798888, + "step": 1418 + }, + { + "epoch": 0.18807157057654075, + "grad_norm": 24.499588012695312, + "learning_rate": 4.9523054435129966e-06, + "loss": 0.7377, + "num_input_tokens_seen": 2801576, + "step": 1419 + }, + { + "epoch": 0.18820410868124585, + "grad_norm": 8.95081615447998, + "learning_rate": 4.952237936091737e-06, + "loss": 0.2374, + "num_input_tokens_seen": 2803160, + "step": 1420 + }, + { + "epoch": 0.18833664678595097, + "grad_norm": 3.977041721343994, + "learning_rate": 4.952170381389639e-06, + "loss": 0.056, + "num_input_tokens_seen": 2804536, + "step": 1421 + }, + { + "epoch": 0.18846918489065606, + "grad_norm": 11.694116592407227, + "learning_rate": 4.9521027794080035e-06, + "loss": 0.1135, + "num_input_tokens_seen": 2806728, + "step": 1422 + }, + { + "epoch": 0.18860172299536115, + "grad_norm": 13.62118148803711, + "learning_rate": 4.952035130148135e-06, + "loss": 0.3857, + "num_input_tokens_seen": 2809208, + "step": 1423 + }, + { + "epoch": 0.18873426110006628, + "grad_norm": 14.459735870361328, + "learning_rate": 4.9519674336113385e-06, + "loss": 0.2551, + "num_input_tokens_seen": 2810736, + "step": 1424 + }, + { + "epoch": 0.18886679920477137, + "grad_norm": 14.811407089233398, + "learning_rate": 4.951899689798917e-06, + "loss": 0.5976, + "num_input_tokens_seen": 2812088, + "step": 1425 + }, + { + "epoch": 0.18899933730947646, + "grad_norm": 9.987671852111816, + "learning_rate": 4.9518318987121794e-06, + "loss": 0.0825, + "num_input_tokens_seen": 2813752, + "step": 1426 + }, + { + "epoch": 0.18913187541418158, + "grad_norm": 11.869303703308105, + "learning_rate": 4.951764060352431e-06, + "loss": 0.2615, + "num_input_tokens_seen": 2816480, + "step": 1427 + }, + { + "epoch": 0.18926441351888668, + "grad_norm": 12.198988914489746, + "learning_rate": 4.9516961747209795e-06, + "loss": 0.1687, + "num_input_tokens_seen": 2818464, + "step": 1428 + }, + { + "epoch": 0.18939695162359177, + "grad_norm": 12.543244361877441, + "learning_rate": 4.951628241819134e-06, + "loss": 0.1593, + "num_input_tokens_seen": 2819760, + "step": 1429 + }, + { + "epoch": 0.1895294897282969, + "grad_norm": 8.581977844238281, + "learning_rate": 4.951560261648206e-06, + "loss": 0.1176, + "num_input_tokens_seen": 2822816, + "step": 1430 + }, + { + "epoch": 0.18966202783300198, + "grad_norm": 4.672897815704346, + "learning_rate": 4.951492234209504e-06, + "loss": 0.0523, + "num_input_tokens_seen": 2825256, + "step": 1431 + }, + { + "epoch": 0.18979456593770708, + "grad_norm": 8.391756057739258, + "learning_rate": 4.951424159504341e-06, + "loss": 0.128, + "num_input_tokens_seen": 2826968, + "step": 1432 + }, + { + "epoch": 0.1899271040424122, + "grad_norm": 4.2906599044799805, + "learning_rate": 4.951356037534028e-06, + "loss": 0.0305, + "num_input_tokens_seen": 2829144, + "step": 1433 + }, + { + "epoch": 0.1900596421471173, + "grad_norm": 0.6365682482719421, + "learning_rate": 4.951287868299881e-06, + "loss": 0.0033, + "num_input_tokens_seen": 2830688, + "step": 1434 + }, + { + "epoch": 0.1901921802518224, + "grad_norm": 24.61792755126953, + "learning_rate": 4.951219651803212e-06, + "loss": 0.4558, + "num_input_tokens_seen": 2833344, + "step": 1435 + }, + { + "epoch": 0.1903247183565275, + "grad_norm": 6.3319854736328125, + "learning_rate": 4.951151388045338e-06, + "loss": 0.1775, + "num_input_tokens_seen": 2835456, + "step": 1436 + }, + { + "epoch": 0.1904572564612326, + "grad_norm": 0.12330157309770584, + "learning_rate": 4.951083077027573e-06, + "loss": 0.0007, + "num_input_tokens_seen": 2837296, + "step": 1437 + }, + { + "epoch": 0.19058979456593772, + "grad_norm": 2.628593683242798, + "learning_rate": 4.951014718751236e-06, + "loss": 0.0064, + "num_input_tokens_seen": 2840048, + "step": 1438 + }, + { + "epoch": 0.1907223326706428, + "grad_norm": 12.96270751953125, + "learning_rate": 4.950946313217645e-06, + "loss": 0.3344, + "num_input_tokens_seen": 2842872, + "step": 1439 + }, + { + "epoch": 0.1908548707753479, + "grad_norm": 0.4206317067146301, + "learning_rate": 4.950877860428117e-06, + "loss": 0.0025, + "num_input_tokens_seen": 2844112, + "step": 1440 + }, + { + "epoch": 0.19098740888005303, + "grad_norm": 7.179927825927734, + "learning_rate": 4.950809360383974e-06, + "loss": 0.1734, + "num_input_tokens_seen": 2846024, + "step": 1441 + }, + { + "epoch": 0.19111994698475812, + "grad_norm": 13.49063777923584, + "learning_rate": 4.950740813086535e-06, + "loss": 0.3759, + "num_input_tokens_seen": 2848992, + "step": 1442 + }, + { + "epoch": 0.19125248508946321, + "grad_norm": 16.809833526611328, + "learning_rate": 4.950672218537124e-06, + "loss": 0.6711, + "num_input_tokens_seen": 2852144, + "step": 1443 + }, + { + "epoch": 0.19138502319416834, + "grad_norm": 15.014547348022461, + "learning_rate": 4.95060357673706e-06, + "loss": 0.401, + "num_input_tokens_seen": 2853408, + "step": 1444 + }, + { + "epoch": 0.19151756129887343, + "grad_norm": 0.03233711048960686, + "learning_rate": 4.95053488768767e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2854752, + "step": 1445 + }, + { + "epoch": 0.19165009940357852, + "grad_norm": 14.743435859680176, + "learning_rate": 4.9504661513902755e-06, + "loss": 0.5222, + "num_input_tokens_seen": 2856832, + "step": 1446 + }, + { + "epoch": 0.19178263750828364, + "grad_norm": 16.61276626586914, + "learning_rate": 4.950397367846204e-06, + "loss": 0.3905, + "num_input_tokens_seen": 2858616, + "step": 1447 + }, + { + "epoch": 0.19191517561298874, + "grad_norm": 10.526078224182129, + "learning_rate": 4.950328537056781e-06, + "loss": 0.3363, + "num_input_tokens_seen": 2860696, + "step": 1448 + }, + { + "epoch": 0.19204771371769383, + "grad_norm": 10.344022750854492, + "learning_rate": 4.950259659023333e-06, + "loss": 0.2589, + "num_input_tokens_seen": 2862856, + "step": 1449 + }, + { + "epoch": 0.19218025182239895, + "grad_norm": 10.606470108032227, + "learning_rate": 4.950190733747188e-06, + "loss": 0.2366, + "num_input_tokens_seen": 2865224, + "step": 1450 + }, + { + "epoch": 0.19231278992710404, + "grad_norm": 9.286369323730469, + "learning_rate": 4.950121761229676e-06, + "loss": 0.164, + "num_input_tokens_seen": 2866832, + "step": 1451 + }, + { + "epoch": 0.19244532803180914, + "grad_norm": 0.17270997166633606, + "learning_rate": 4.950052741472126e-06, + "loss": 0.0011, + "num_input_tokens_seen": 2868712, + "step": 1452 + }, + { + "epoch": 0.19257786613651426, + "grad_norm": 0.11371482163667679, + "learning_rate": 4.949983674475869e-06, + "loss": 0.0007, + "num_input_tokens_seen": 2870200, + "step": 1453 + }, + { + "epoch": 0.19271040424121935, + "grad_norm": 2.0509612560272217, + "learning_rate": 4.949914560242236e-06, + "loss": 0.0401, + "num_input_tokens_seen": 2872040, + "step": 1454 + }, + { + "epoch": 0.19284294234592445, + "grad_norm": 11.695093154907227, + "learning_rate": 4.949845398772561e-06, + "loss": 0.2868, + "num_input_tokens_seen": 2873384, + "step": 1455 + }, + { + "epoch": 0.19297548045062957, + "grad_norm": 3.2590997219085693, + "learning_rate": 4.949776190068176e-06, + "loss": 0.0603, + "num_input_tokens_seen": 2875848, + "step": 1456 + }, + { + "epoch": 0.19310801855533466, + "grad_norm": 0.27745962142944336, + "learning_rate": 4.949706934130417e-06, + "loss": 0.0017, + "num_input_tokens_seen": 2877216, + "step": 1457 + }, + { + "epoch": 0.19324055666003975, + "grad_norm": 17.104896545410156, + "learning_rate": 4.949637630960618e-06, + "loss": 0.7062, + "num_input_tokens_seen": 2879584, + "step": 1458 + }, + { + "epoch": 0.19337309476474487, + "grad_norm": 11.767041206359863, + "learning_rate": 4.949568280560115e-06, + "loss": 0.3207, + "num_input_tokens_seen": 2881168, + "step": 1459 + }, + { + "epoch": 0.19350563286944997, + "grad_norm": 13.05449390411377, + "learning_rate": 4.949498882930246e-06, + "loss": 0.1772, + "num_input_tokens_seen": 2883136, + "step": 1460 + }, + { + "epoch": 0.19363817097415506, + "grad_norm": 2.6173017024993896, + "learning_rate": 4.949429438072348e-06, + "loss": 0.0118, + "num_input_tokens_seen": 2884720, + "step": 1461 + }, + { + "epoch": 0.19377070907886018, + "grad_norm": 0.624408483505249, + "learning_rate": 4.949359945987762e-06, + "loss": 0.0039, + "num_input_tokens_seen": 2886000, + "step": 1462 + }, + { + "epoch": 0.19390324718356527, + "grad_norm": 0.6071314811706543, + "learning_rate": 4.949290406677826e-06, + "loss": 0.0037, + "num_input_tokens_seen": 2887856, + "step": 1463 + }, + { + "epoch": 0.19403578528827037, + "grad_norm": 2.3573100566864014, + "learning_rate": 4.949220820143881e-06, + "loss": 0.0096, + "num_input_tokens_seen": 2889656, + "step": 1464 + }, + { + "epoch": 0.1941683233929755, + "grad_norm": 9.5718994140625, + "learning_rate": 4.949151186387269e-06, + "loss": 0.2236, + "num_input_tokens_seen": 2891584, + "step": 1465 + }, + { + "epoch": 0.19430086149768058, + "grad_norm": 9.640631675720215, + "learning_rate": 4.949081505409332e-06, + "loss": 0.1823, + "num_input_tokens_seen": 2893536, + "step": 1466 + }, + { + "epoch": 0.19443339960238568, + "grad_norm": 17.514127731323242, + "learning_rate": 4.949011777211415e-06, + "loss": 0.1424, + "num_input_tokens_seen": 2895704, + "step": 1467 + }, + { + "epoch": 0.1945659377070908, + "grad_norm": 0.2816193699836731, + "learning_rate": 4.948942001794861e-06, + "loss": 0.0017, + "num_input_tokens_seen": 2896928, + "step": 1468 + }, + { + "epoch": 0.1946984758117959, + "grad_norm": 4.933318138122559, + "learning_rate": 4.948872179161016e-06, + "loss": 0.0433, + "num_input_tokens_seen": 2898712, + "step": 1469 + }, + { + "epoch": 0.19483101391650098, + "grad_norm": 0.1647079885005951, + "learning_rate": 4.948802309311226e-06, + "loss": 0.001, + "num_input_tokens_seen": 2900152, + "step": 1470 + }, + { + "epoch": 0.1949635520212061, + "grad_norm": 8.096887588500977, + "learning_rate": 4.948732392246838e-06, + "loss": 0.134, + "num_input_tokens_seen": 2902544, + "step": 1471 + }, + { + "epoch": 0.1950960901259112, + "grad_norm": 20.626953125, + "learning_rate": 4.9486624279692e-06, + "loss": 0.2312, + "num_input_tokens_seen": 2905952, + "step": 1472 + }, + { + "epoch": 0.1952286282306163, + "grad_norm": 0.09424714744091034, + "learning_rate": 4.948592416479662e-06, + "loss": 0.0005, + "num_input_tokens_seen": 2908976, + "step": 1473 + }, + { + "epoch": 0.1953611663353214, + "grad_norm": 3.65250825881958, + "learning_rate": 4.9485223577795725e-06, + "loss": 0.0529, + "num_input_tokens_seen": 2911104, + "step": 1474 + }, + { + "epoch": 0.1954937044400265, + "grad_norm": 0.26873332262039185, + "learning_rate": 4.948452251870282e-06, + "loss": 0.0015, + "num_input_tokens_seen": 2913480, + "step": 1475 + }, + { + "epoch": 0.1956262425447316, + "grad_norm": 0.09667015820741653, + "learning_rate": 4.9483820987531436e-06, + "loss": 0.0005, + "num_input_tokens_seen": 2915200, + "step": 1476 + }, + { + "epoch": 0.19575878064943672, + "grad_norm": 0.05408121645450592, + "learning_rate": 4.9483118984295095e-06, + "loss": 0.0003, + "num_input_tokens_seen": 2916952, + "step": 1477 + }, + { + "epoch": 0.1958913187541418, + "grad_norm": 6.450794219970703, + "learning_rate": 4.948241650900733e-06, + "loss": 0.0441, + "num_input_tokens_seen": 2919096, + "step": 1478 + }, + { + "epoch": 0.1960238568588469, + "grad_norm": 0.02533072419464588, + "learning_rate": 4.948171356168168e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2920384, + "step": 1479 + }, + { + "epoch": 0.19615639496355203, + "grad_norm": 5.56210994720459, + "learning_rate": 4.948101014233171e-06, + "loss": 0.1898, + "num_input_tokens_seen": 2922064, + "step": 1480 + }, + { + "epoch": 0.19628893306825712, + "grad_norm": 24.328418731689453, + "learning_rate": 4.948030625097097e-06, + "loss": 0.2696, + "num_input_tokens_seen": 2925008, + "step": 1481 + }, + { + "epoch": 0.19642147117296221, + "grad_norm": 15.620478630065918, + "learning_rate": 4.947960188761304e-06, + "loss": 0.402, + "num_input_tokens_seen": 2927296, + "step": 1482 + }, + { + "epoch": 0.19655400927766734, + "grad_norm": 19.02410125732422, + "learning_rate": 4.9478897052271504e-06, + "loss": 0.2184, + "num_input_tokens_seen": 2929920, + "step": 1483 + }, + { + "epoch": 0.19668654738237243, + "grad_norm": 17.353513717651367, + "learning_rate": 4.9478191744959935e-06, + "loss": 0.4118, + "num_input_tokens_seen": 2931448, + "step": 1484 + }, + { + "epoch": 0.19681908548707752, + "grad_norm": 14.65807819366455, + "learning_rate": 4.947748596569195e-06, + "loss": 0.4383, + "num_input_tokens_seen": 2933464, + "step": 1485 + }, + { + "epoch": 0.19695162359178264, + "grad_norm": 0.013869226910173893, + "learning_rate": 4.947677971448114e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2934832, + "step": 1486 + }, + { + "epoch": 0.19708416169648774, + "grad_norm": 4.815378189086914, + "learning_rate": 4.947607299134114e-06, + "loss": 0.122, + "num_input_tokens_seen": 2937264, + "step": 1487 + }, + { + "epoch": 0.19721669980119283, + "grad_norm": 8.090982437133789, + "learning_rate": 4.947536579628557e-06, + "loss": 0.1378, + "num_input_tokens_seen": 2938688, + "step": 1488 + }, + { + "epoch": 0.19734923790589795, + "grad_norm": 30.305927276611328, + "learning_rate": 4.947465812932806e-06, + "loss": 1.0197, + "num_input_tokens_seen": 2941152, + "step": 1489 + }, + { + "epoch": 0.19748177601060304, + "grad_norm": 0.3082437813282013, + "learning_rate": 4.947394999048225e-06, + "loss": 0.0016, + "num_input_tokens_seen": 2942936, + "step": 1490 + }, + { + "epoch": 0.19761431411530814, + "grad_norm": 14.401032447814941, + "learning_rate": 4.947324137976181e-06, + "loss": 0.3299, + "num_input_tokens_seen": 2944352, + "step": 1491 + }, + { + "epoch": 0.19774685222001326, + "grad_norm": 6.82422399520874, + "learning_rate": 4.947253229718039e-06, + "loss": 0.1891, + "num_input_tokens_seen": 2946280, + "step": 1492 + }, + { + "epoch": 0.19787939032471835, + "grad_norm": 9.745396614074707, + "learning_rate": 4.947182274275166e-06, + "loss": 0.229, + "num_input_tokens_seen": 2948488, + "step": 1493 + }, + { + "epoch": 0.19801192842942347, + "grad_norm": 6.9834065437316895, + "learning_rate": 4.947111271648931e-06, + "loss": 0.1487, + "num_input_tokens_seen": 2950624, + "step": 1494 + }, + { + "epoch": 0.19814446653412857, + "grad_norm": 19.150836944580078, + "learning_rate": 4.947040221840702e-06, + "loss": 0.4586, + "num_input_tokens_seen": 2951888, + "step": 1495 + }, + { + "epoch": 0.19827700463883366, + "grad_norm": 1.4499837160110474, + "learning_rate": 4.9469691248518505e-06, + "loss": 0.0183, + "num_input_tokens_seen": 2953528, + "step": 1496 + }, + { + "epoch": 0.19840954274353878, + "grad_norm": 15.1196928024292, + "learning_rate": 4.946897980683745e-06, + "loss": 0.2454, + "num_input_tokens_seen": 2955144, + "step": 1497 + }, + { + "epoch": 0.19854208084824387, + "grad_norm": 16.50326156616211, + "learning_rate": 4.946826789337759e-06, + "loss": 0.2717, + "num_input_tokens_seen": 2956568, + "step": 1498 + }, + { + "epoch": 0.19867461895294897, + "grad_norm": 0.046950824558734894, + "learning_rate": 4.946755550815265e-06, + "loss": 0.0003, + "num_input_tokens_seen": 2957848, + "step": 1499 + }, + { + "epoch": 0.1988071570576541, + "grad_norm": 3.3023948669433594, + "learning_rate": 4.946684265117635e-06, + "loss": 0.0375, + "num_input_tokens_seen": 2961072, + "step": 1500 + }, + { + "epoch": 0.19893969516235918, + "grad_norm": 35.056427001953125, + "learning_rate": 4.9466129322462444e-06, + "loss": 0.5977, + "num_input_tokens_seen": 2963560, + "step": 1501 + }, + { + "epoch": 0.19907223326706427, + "grad_norm": 14.703974723815918, + "learning_rate": 4.94654155220247e-06, + "loss": 0.2402, + "num_input_tokens_seen": 2965576, + "step": 1502 + }, + { + "epoch": 0.1992047713717694, + "grad_norm": 11.942296028137207, + "learning_rate": 4.946470124987686e-06, + "loss": 0.1351, + "num_input_tokens_seen": 2968232, + "step": 1503 + }, + { + "epoch": 0.1993373094764745, + "grad_norm": 14.916484832763672, + "learning_rate": 4.946398650603269e-06, + "loss": 0.5421, + "num_input_tokens_seen": 2971448, + "step": 1504 + }, + { + "epoch": 0.19946984758117958, + "grad_norm": 17.552587509155273, + "learning_rate": 4.9463271290506e-06, + "loss": 0.2842, + "num_input_tokens_seen": 2973656, + "step": 1505 + }, + { + "epoch": 0.1996023856858847, + "grad_norm": 13.145828247070312, + "learning_rate": 4.946255560331056e-06, + "loss": 0.2213, + "num_input_tokens_seen": 2975304, + "step": 1506 + }, + { + "epoch": 0.1997349237905898, + "grad_norm": 8.712215423583984, + "learning_rate": 4.946183944446017e-06, + "loss": 0.2274, + "num_input_tokens_seen": 2976936, + "step": 1507 + }, + { + "epoch": 0.1998674618952949, + "grad_norm": 2.575416088104248, + "learning_rate": 4.946112281396864e-06, + "loss": 0.0125, + "num_input_tokens_seen": 2979312, + "step": 1508 + }, + { + "epoch": 0.2, + "grad_norm": 2.6841654777526855, + "learning_rate": 4.946040571184978e-06, + "loss": 0.0117, + "num_input_tokens_seen": 2981024, + "step": 1509 + }, + { + "epoch": 0.2001325381047051, + "grad_norm": 9.58149528503418, + "learning_rate": 4.945968813811743e-06, + "loss": 0.1947, + "num_input_tokens_seen": 2983936, + "step": 1510 + }, + { + "epoch": 0.2002650762094102, + "grad_norm": 6.357746124267578, + "learning_rate": 4.945897009278543e-06, + "loss": 0.0832, + "num_input_tokens_seen": 2985328, + "step": 1511 + }, + { + "epoch": 0.20039761431411532, + "grad_norm": 12.998671531677246, + "learning_rate": 4.94582515758676e-06, + "loss": 0.1627, + "num_input_tokens_seen": 2988088, + "step": 1512 + }, + { + "epoch": 0.2005301524188204, + "grad_norm": 15.709349632263184, + "learning_rate": 4.945753258737781e-06, + "loss": 0.2284, + "num_input_tokens_seen": 2989576, + "step": 1513 + }, + { + "epoch": 0.2006626905235255, + "grad_norm": 7.763367176055908, + "learning_rate": 4.945681312732992e-06, + "loss": 0.2367, + "num_input_tokens_seen": 2991384, + "step": 1514 + }, + { + "epoch": 0.20079522862823063, + "grad_norm": 6.251837730407715, + "learning_rate": 4.9456093195737794e-06, + "loss": 0.1219, + "num_input_tokens_seen": 2993384, + "step": 1515 + }, + { + "epoch": 0.20092776673293572, + "grad_norm": 0.11836721748113632, + "learning_rate": 4.945537279261533e-06, + "loss": 0.0005, + "num_input_tokens_seen": 2995904, + "step": 1516 + }, + { + "epoch": 0.2010603048376408, + "grad_norm": 4.0873870849609375, + "learning_rate": 4.945465191797641e-06, + "loss": 0.0481, + "num_input_tokens_seen": 2997504, + "step": 1517 + }, + { + "epoch": 0.20119284294234593, + "grad_norm": 0.6148648858070374, + "learning_rate": 4.945393057183492e-06, + "loss": 0.0036, + "num_input_tokens_seen": 2999336, + "step": 1518 + }, + { + "epoch": 0.20132538104705103, + "grad_norm": 12.814029693603516, + "learning_rate": 4.945320875420478e-06, + "loss": 0.301, + "num_input_tokens_seen": 3003144, + "step": 1519 + }, + { + "epoch": 0.20145791915175612, + "grad_norm": 10.5188627243042, + "learning_rate": 4.945248646509991e-06, + "loss": 0.1691, + "num_input_tokens_seen": 3004824, + "step": 1520 + }, + { + "epoch": 0.20159045725646124, + "grad_norm": 1.2420895099639893, + "learning_rate": 4.945176370453423e-06, + "loss": 0.0068, + "num_input_tokens_seen": 3005840, + "step": 1521 + }, + { + "epoch": 0.20172299536116634, + "grad_norm": 27.47368049621582, + "learning_rate": 4.945104047252168e-06, + "loss": 1.0257, + "num_input_tokens_seen": 3008392, + "step": 1522 + }, + { + "epoch": 0.20185553346587143, + "grad_norm": 10.00438404083252, + "learning_rate": 4.94503167690762e-06, + "loss": 0.2241, + "num_input_tokens_seen": 3010304, + "step": 1523 + }, + { + "epoch": 0.20198807157057655, + "grad_norm": 8.860278129577637, + "learning_rate": 4.944959259421173e-06, + "loss": 0.1447, + "num_input_tokens_seen": 3013168, + "step": 1524 + }, + { + "epoch": 0.20212060967528164, + "grad_norm": 2.469446897506714, + "learning_rate": 4.944886794794227e-06, + "loss": 0.0137, + "num_input_tokens_seen": 3015488, + "step": 1525 + }, + { + "epoch": 0.20225314777998674, + "grad_norm": 6.759047031402588, + "learning_rate": 4.9448142830281764e-06, + "loss": 0.1058, + "num_input_tokens_seen": 3018448, + "step": 1526 + }, + { + "epoch": 0.20238568588469186, + "grad_norm": 1.4119298458099365, + "learning_rate": 4.94474172412442e-06, + "loss": 0.0073, + "num_input_tokens_seen": 3020000, + "step": 1527 + }, + { + "epoch": 0.20251822398939695, + "grad_norm": 21.874143600463867, + "learning_rate": 4.944669118084356e-06, + "loss": 0.4905, + "num_input_tokens_seen": 3024096, + "step": 1528 + }, + { + "epoch": 0.20265076209410204, + "grad_norm": 12.965889930725098, + "learning_rate": 4.9445964649093855e-06, + "loss": 0.319, + "num_input_tokens_seen": 3026120, + "step": 1529 + }, + { + "epoch": 0.20278330019880716, + "grad_norm": 0.36019694805145264, + "learning_rate": 4.944523764600908e-06, + "loss": 0.0018, + "num_input_tokens_seen": 3027472, + "step": 1530 + }, + { + "epoch": 0.20291583830351226, + "grad_norm": 11.79969596862793, + "learning_rate": 4.944451017160327e-06, + "loss": 0.2418, + "num_input_tokens_seen": 3029120, + "step": 1531 + }, + { + "epoch": 0.20304837640821735, + "grad_norm": 1.4309542179107666, + "learning_rate": 4.944378222589043e-06, + "loss": 0.0076, + "num_input_tokens_seen": 3030760, + "step": 1532 + }, + { + "epoch": 0.20318091451292247, + "grad_norm": 13.358979225158691, + "learning_rate": 4.944305380888461e-06, + "loss": 0.2395, + "num_input_tokens_seen": 3032400, + "step": 1533 + }, + { + "epoch": 0.20331345261762757, + "grad_norm": 6.730549335479736, + "learning_rate": 4.944232492059985e-06, + "loss": 0.0747, + "num_input_tokens_seen": 3034504, + "step": 1534 + }, + { + "epoch": 0.20344599072233266, + "grad_norm": 4.821010589599609, + "learning_rate": 4.9441595561050205e-06, + "loss": 0.0539, + "num_input_tokens_seen": 3036160, + "step": 1535 + }, + { + "epoch": 0.20357852882703778, + "grad_norm": 10.793621063232422, + "learning_rate": 4.944086573024973e-06, + "loss": 0.4039, + "num_input_tokens_seen": 3038216, + "step": 1536 + }, + { + "epoch": 0.20371106693174287, + "grad_norm": 11.968805313110352, + "learning_rate": 4.944013542821251e-06, + "loss": 0.3655, + "num_input_tokens_seen": 3039976, + "step": 1537 + }, + { + "epoch": 0.20384360503644797, + "grad_norm": 5.364990234375, + "learning_rate": 4.943940465495262e-06, + "loss": 0.0959, + "num_input_tokens_seen": 3042400, + "step": 1538 + }, + { + "epoch": 0.2039761431411531, + "grad_norm": 2.2847580909729004, + "learning_rate": 4.943867341048414e-06, + "loss": 0.0124, + "num_input_tokens_seen": 3044120, + "step": 1539 + }, + { + "epoch": 0.20410868124585818, + "grad_norm": 0.06408200412988663, + "learning_rate": 4.943794169482119e-06, + "loss": 0.0003, + "num_input_tokens_seen": 3045472, + "step": 1540 + }, + { + "epoch": 0.20424121935056327, + "grad_norm": 0.10299701243638992, + "learning_rate": 4.943720950797785e-06, + "loss": 0.0005, + "num_input_tokens_seen": 3046680, + "step": 1541 + }, + { + "epoch": 0.2043737574552684, + "grad_norm": 13.509176254272461, + "learning_rate": 4.943647684996827e-06, + "loss": 0.2526, + "num_input_tokens_seen": 3048352, + "step": 1542 + }, + { + "epoch": 0.2045062955599735, + "grad_norm": 9.015371322631836, + "learning_rate": 4.943574372080654e-06, + "loss": 0.2274, + "num_input_tokens_seen": 3050040, + "step": 1543 + }, + { + "epoch": 0.20463883366467858, + "grad_norm": 21.445486068725586, + "learning_rate": 4.943501012050682e-06, + "loss": 0.5459, + "num_input_tokens_seen": 3051944, + "step": 1544 + }, + { + "epoch": 0.2047713717693837, + "grad_norm": 0.25789403915405273, + "learning_rate": 4.943427604908325e-06, + "loss": 0.0013, + "num_input_tokens_seen": 3053248, + "step": 1545 + }, + { + "epoch": 0.2049039098740888, + "grad_norm": 8.787060737609863, + "learning_rate": 4.943354150654998e-06, + "loss": 0.2267, + "num_input_tokens_seen": 3055080, + "step": 1546 + }, + { + "epoch": 0.2050364479787939, + "grad_norm": 10.775382041931152, + "learning_rate": 4.943280649292117e-06, + "loss": 0.1584, + "num_input_tokens_seen": 3056552, + "step": 1547 + }, + { + "epoch": 0.205168986083499, + "grad_norm": 0.07496646046638489, + "learning_rate": 4.9432071008211005e-06, + "loss": 0.0004, + "num_input_tokens_seen": 3057536, + "step": 1548 + }, + { + "epoch": 0.2053015241882041, + "grad_norm": 19.620237350463867, + "learning_rate": 4.9431335052433645e-06, + "loss": 0.8056, + "num_input_tokens_seen": 3060488, + "step": 1549 + }, + { + "epoch": 0.20543406229290923, + "grad_norm": 8.260318756103516, + "learning_rate": 4.9430598625603305e-06, + "loss": 0.1048, + "num_input_tokens_seen": 3062600, + "step": 1550 + }, + { + "epoch": 0.20556660039761432, + "grad_norm": 11.072790145874023, + "learning_rate": 4.942986172773417e-06, + "loss": 0.2021, + "num_input_tokens_seen": 3064744, + "step": 1551 + }, + { + "epoch": 0.2056991385023194, + "grad_norm": 16.27667236328125, + "learning_rate": 4.942912435884043e-06, + "loss": 0.2106, + "num_input_tokens_seen": 3066600, + "step": 1552 + }, + { + "epoch": 0.20583167660702453, + "grad_norm": 12.170722961425781, + "learning_rate": 4.942838651893633e-06, + "loss": 0.3647, + "num_input_tokens_seen": 3068416, + "step": 1553 + }, + { + "epoch": 0.20596421471172963, + "grad_norm": 17.116518020629883, + "learning_rate": 4.942764820803608e-06, + "loss": 0.2935, + "num_input_tokens_seen": 3070192, + "step": 1554 + }, + { + "epoch": 0.20609675281643472, + "grad_norm": 0.45862770080566406, + "learning_rate": 4.942690942615394e-06, + "loss": 0.0024, + "num_input_tokens_seen": 3072152, + "step": 1555 + }, + { + "epoch": 0.20622929092113984, + "grad_norm": 13.14889907836914, + "learning_rate": 4.942617017330411e-06, + "loss": 0.4201, + "num_input_tokens_seen": 3073984, + "step": 1556 + }, + { + "epoch": 0.20636182902584493, + "grad_norm": 2.4476521015167236, + "learning_rate": 4.942543044950088e-06, + "loss": 0.0104, + "num_input_tokens_seen": 3076208, + "step": 1557 + }, + { + "epoch": 0.20649436713055003, + "grad_norm": 0.07111723721027374, + "learning_rate": 4.94246902547585e-06, + "loss": 0.0004, + "num_input_tokens_seen": 3077336, + "step": 1558 + }, + { + "epoch": 0.20662690523525515, + "grad_norm": 6.37446928024292, + "learning_rate": 4.942394958909125e-06, + "loss": 0.0601, + "num_input_tokens_seen": 3079224, + "step": 1559 + }, + { + "epoch": 0.20675944333996024, + "grad_norm": 0.0938825011253357, + "learning_rate": 4.942320845251339e-06, + "loss": 0.0005, + "num_input_tokens_seen": 3080592, + "step": 1560 + }, + { + "epoch": 0.20689198144466533, + "grad_norm": 17.8950252532959, + "learning_rate": 4.942246684503922e-06, + "loss": 0.3943, + "num_input_tokens_seen": 3081928, + "step": 1561 + }, + { + "epoch": 0.20702451954937046, + "grad_norm": 14.652807235717773, + "learning_rate": 4.942172476668305e-06, + "loss": 0.4855, + "num_input_tokens_seen": 3083736, + "step": 1562 + }, + { + "epoch": 0.20715705765407555, + "grad_norm": 6.170396327972412, + "learning_rate": 4.942098221745918e-06, + "loss": 0.1458, + "num_input_tokens_seen": 3085376, + "step": 1563 + }, + { + "epoch": 0.20728959575878064, + "grad_norm": 14.20685863494873, + "learning_rate": 4.942023919738192e-06, + "loss": 0.4159, + "num_input_tokens_seen": 3087440, + "step": 1564 + }, + { + "epoch": 0.20742213386348576, + "grad_norm": 2.665311574935913, + "learning_rate": 4.941949570646561e-06, + "loss": 0.0072, + "num_input_tokens_seen": 3088920, + "step": 1565 + }, + { + "epoch": 0.20755467196819086, + "grad_norm": 0.9632099866867065, + "learning_rate": 4.941875174472457e-06, + "loss": 0.0049, + "num_input_tokens_seen": 3090632, + "step": 1566 + }, + { + "epoch": 0.20768721007289595, + "grad_norm": 3.530364990234375, + "learning_rate": 4.941800731217314e-06, + "loss": 0.0706, + "num_input_tokens_seen": 3092464, + "step": 1567 + }, + { + "epoch": 0.20781974817760107, + "grad_norm": 15.34891414642334, + "learning_rate": 4.94172624088257e-06, + "loss": 0.2871, + "num_input_tokens_seen": 3094400, + "step": 1568 + }, + { + "epoch": 0.20795228628230616, + "grad_norm": 19.512920379638672, + "learning_rate": 4.94165170346966e-06, + "loss": 0.6074, + "num_input_tokens_seen": 3096752, + "step": 1569 + }, + { + "epoch": 0.20808482438701126, + "grad_norm": 12.263246536254883, + "learning_rate": 4.941577118980019e-06, + "loss": 0.298, + "num_input_tokens_seen": 3098760, + "step": 1570 + }, + { + "epoch": 0.20821736249171638, + "grad_norm": 10.468993186950684, + "learning_rate": 4.941502487415088e-06, + "loss": 0.2145, + "num_input_tokens_seen": 3100576, + "step": 1571 + }, + { + "epoch": 0.20834990059642147, + "grad_norm": 13.77182388305664, + "learning_rate": 4.941427808776305e-06, + "loss": 0.3044, + "num_input_tokens_seen": 3102912, + "step": 1572 + }, + { + "epoch": 0.20848243870112657, + "grad_norm": 7.96223783493042, + "learning_rate": 4.941353083065109e-06, + "loss": 0.0549, + "num_input_tokens_seen": 3105368, + "step": 1573 + }, + { + "epoch": 0.2086149768058317, + "grad_norm": 10.390724182128906, + "learning_rate": 4.941278310282941e-06, + "loss": 0.2599, + "num_input_tokens_seen": 3106896, + "step": 1574 + }, + { + "epoch": 0.20874751491053678, + "grad_norm": 9.694853782653809, + "learning_rate": 4.941203490431243e-06, + "loss": 0.125, + "num_input_tokens_seen": 3109000, + "step": 1575 + }, + { + "epoch": 0.20888005301524187, + "grad_norm": 17.4279842376709, + "learning_rate": 4.941128623511459e-06, + "loss": 0.2793, + "num_input_tokens_seen": 3110552, + "step": 1576 + }, + { + "epoch": 0.209012591119947, + "grad_norm": 0.37305963039398193, + "learning_rate": 4.94105370952503e-06, + "loss": 0.002, + "num_input_tokens_seen": 3112368, + "step": 1577 + }, + { + "epoch": 0.2091451292246521, + "grad_norm": 0.5690521001815796, + "learning_rate": 4.940978748473401e-06, + "loss": 0.0031, + "num_input_tokens_seen": 3114496, + "step": 1578 + }, + { + "epoch": 0.20927766732935718, + "grad_norm": 9.067329406738281, + "learning_rate": 4.940903740358019e-06, + "loss": 0.3023, + "num_input_tokens_seen": 3116720, + "step": 1579 + }, + { + "epoch": 0.2094102054340623, + "grad_norm": 21.493934631347656, + "learning_rate": 4.940828685180327e-06, + "loss": 0.3311, + "num_input_tokens_seen": 3118664, + "step": 1580 + }, + { + "epoch": 0.2095427435387674, + "grad_norm": 1.2281928062438965, + "learning_rate": 4.940753582941777e-06, + "loss": 0.0064, + "num_input_tokens_seen": 3121488, + "step": 1581 + }, + { + "epoch": 0.2096752816434725, + "grad_norm": 4.252953052520752, + "learning_rate": 4.940678433643812e-06, + "loss": 0.0211, + "num_input_tokens_seen": 3122944, + "step": 1582 + }, + { + "epoch": 0.2098078197481776, + "grad_norm": 11.897409439086914, + "learning_rate": 4.940603237287883e-06, + "loss": 0.3087, + "num_input_tokens_seen": 3124856, + "step": 1583 + }, + { + "epoch": 0.2099403578528827, + "grad_norm": 0.12686584889888763, + "learning_rate": 4.9405279938754405e-06, + "loss": 0.0007, + "num_input_tokens_seen": 3125944, + "step": 1584 + }, + { + "epoch": 0.2100728959575878, + "grad_norm": 7.8734283447265625, + "learning_rate": 4.940452703407935e-06, + "loss": 0.2241, + "num_input_tokens_seen": 3127864, + "step": 1585 + }, + { + "epoch": 0.21020543406229292, + "grad_norm": 8.402398109436035, + "learning_rate": 4.940377365886817e-06, + "loss": 0.3412, + "num_input_tokens_seen": 3129728, + "step": 1586 + }, + { + "epoch": 0.210337972166998, + "grad_norm": 10.226612091064453, + "learning_rate": 4.940301981313541e-06, + "loss": 0.1393, + "num_input_tokens_seen": 3131392, + "step": 1587 + }, + { + "epoch": 0.2104705102717031, + "grad_norm": 6.102915287017822, + "learning_rate": 4.940226549689559e-06, + "loss": 0.0552, + "num_input_tokens_seen": 3133200, + "step": 1588 + }, + { + "epoch": 0.21060304837640823, + "grad_norm": 19.175670623779297, + "learning_rate": 4.940151071016325e-06, + "loss": 1.0989, + "num_input_tokens_seen": 3135624, + "step": 1589 + }, + { + "epoch": 0.21073558648111332, + "grad_norm": 9.163742065429688, + "learning_rate": 4.940075545295295e-06, + "loss": 0.2058, + "num_input_tokens_seen": 3137600, + "step": 1590 + }, + { + "epoch": 0.2108681245858184, + "grad_norm": 0.07821919023990631, + "learning_rate": 4.939999972527926e-06, + "loss": 0.0004, + "num_input_tokens_seen": 3139120, + "step": 1591 + }, + { + "epoch": 0.21100066269052353, + "grad_norm": 19.046098709106445, + "learning_rate": 4.9399243527156735e-06, + "loss": 0.6531, + "num_input_tokens_seen": 3142000, + "step": 1592 + }, + { + "epoch": 0.21113320079522863, + "grad_norm": 22.99635887145996, + "learning_rate": 4.939848685859997e-06, + "loss": 0.486, + "num_input_tokens_seen": 3144184, + "step": 1593 + }, + { + "epoch": 0.21126573889993372, + "grad_norm": 0.11313114315271378, + "learning_rate": 4.939772971962354e-06, + "loss": 0.0006, + "num_input_tokens_seen": 3145512, + "step": 1594 + }, + { + "epoch": 0.21139827700463884, + "grad_norm": 18.431657791137695, + "learning_rate": 4.9396972110242044e-06, + "loss": 0.6959, + "num_input_tokens_seen": 3147264, + "step": 1595 + }, + { + "epoch": 0.21153081510934393, + "grad_norm": 8.416237831115723, + "learning_rate": 4.939621403047011e-06, + "loss": 0.1401, + "num_input_tokens_seen": 3149016, + "step": 1596 + }, + { + "epoch": 0.21166335321404903, + "grad_norm": 0.46213310956954956, + "learning_rate": 4.939545548032233e-06, + "loss": 0.0022, + "num_input_tokens_seen": 3150600, + "step": 1597 + }, + { + "epoch": 0.21179589131875415, + "grad_norm": 12.233981132507324, + "learning_rate": 4.939469645981334e-06, + "loss": 0.4133, + "num_input_tokens_seen": 3152624, + "step": 1598 + }, + { + "epoch": 0.21192842942345924, + "grad_norm": 0.9122524857521057, + "learning_rate": 4.939393696895778e-06, + "loss": 0.0036, + "num_input_tokens_seen": 3153960, + "step": 1599 + }, + { + "epoch": 0.21206096752816433, + "grad_norm": 32.98881530761719, + "learning_rate": 4.939317700777028e-06, + "loss": 1.1184, + "num_input_tokens_seen": 3155568, + "step": 1600 + }, + { + "epoch": 0.21219350563286946, + "grad_norm": 18.546852111816406, + "learning_rate": 4.93924165762655e-06, + "loss": 0.43, + "num_input_tokens_seen": 3158056, + "step": 1601 + }, + { + "epoch": 0.21232604373757455, + "grad_norm": 0.20501665771007538, + "learning_rate": 4.93916556744581e-06, + "loss": 0.0011, + "num_input_tokens_seen": 3159976, + "step": 1602 + }, + { + "epoch": 0.21245858184227964, + "grad_norm": 7.36048698425293, + "learning_rate": 4.9390894302362754e-06, + "loss": 0.1961, + "num_input_tokens_seen": 3161792, + "step": 1603 + }, + { + "epoch": 0.21259111994698476, + "grad_norm": 0.11844947934150696, + "learning_rate": 4.9390132459994135e-06, + "loss": 0.0006, + "num_input_tokens_seen": 3163160, + "step": 1604 + }, + { + "epoch": 0.21272365805168986, + "grad_norm": 8.891694068908691, + "learning_rate": 4.938937014736694e-06, + "loss": 0.2378, + "num_input_tokens_seen": 3165232, + "step": 1605 + }, + { + "epoch": 0.21285619615639495, + "grad_norm": 24.88727378845215, + "learning_rate": 4.938860736449587e-06, + "loss": 0.658, + "num_input_tokens_seen": 3168088, + "step": 1606 + }, + { + "epoch": 0.21298873426110007, + "grad_norm": 9.448894500732422, + "learning_rate": 4.938784411139561e-06, + "loss": 0.2552, + "num_input_tokens_seen": 3169344, + "step": 1607 + }, + { + "epoch": 0.21312127236580516, + "grad_norm": 10.28486156463623, + "learning_rate": 4.93870803880809e-06, + "loss": 0.2629, + "num_input_tokens_seen": 3171160, + "step": 1608 + }, + { + "epoch": 0.21325381047051029, + "grad_norm": 10.416315078735352, + "learning_rate": 4.938631619456646e-06, + "loss": 0.1774, + "num_input_tokens_seen": 3173096, + "step": 1609 + }, + { + "epoch": 0.21338634857521538, + "grad_norm": 10.50994873046875, + "learning_rate": 4.938555153086701e-06, + "loss": 0.225, + "num_input_tokens_seen": 3174120, + "step": 1610 + }, + { + "epoch": 0.21351888667992047, + "grad_norm": 10.50826644897461, + "learning_rate": 4.938478639699732e-06, + "loss": 0.3346, + "num_input_tokens_seen": 3175488, + "step": 1611 + }, + { + "epoch": 0.2136514247846256, + "grad_norm": 15.99755859375, + "learning_rate": 4.938402079297211e-06, + "loss": 0.4693, + "num_input_tokens_seen": 3177616, + "step": 1612 + }, + { + "epoch": 0.2137839628893307, + "grad_norm": 9.410768508911133, + "learning_rate": 4.9383254718806165e-06, + "loss": 0.2641, + "num_input_tokens_seen": 3179304, + "step": 1613 + }, + { + "epoch": 0.21391650099403578, + "grad_norm": 8.813210487365723, + "learning_rate": 4.9382488174514245e-06, + "loss": 0.1131, + "num_input_tokens_seen": 3181504, + "step": 1614 + }, + { + "epoch": 0.2140490390987409, + "grad_norm": 11.054888725280762, + "learning_rate": 4.938172116011113e-06, + "loss": 0.2597, + "num_input_tokens_seen": 3183568, + "step": 1615 + }, + { + "epoch": 0.214181577203446, + "grad_norm": 1.068084478378296, + "learning_rate": 4.938095367561162e-06, + "loss": 0.0047, + "num_input_tokens_seen": 3185480, + "step": 1616 + }, + { + "epoch": 0.2143141153081511, + "grad_norm": 9.036762237548828, + "learning_rate": 4.938018572103049e-06, + "loss": 0.1213, + "num_input_tokens_seen": 3187128, + "step": 1617 + }, + { + "epoch": 0.2144466534128562, + "grad_norm": 13.213265419006348, + "learning_rate": 4.937941729638257e-06, + "loss": 0.5154, + "num_input_tokens_seen": 3189720, + "step": 1618 + }, + { + "epoch": 0.2145791915175613, + "grad_norm": 0.3549255132675171, + "learning_rate": 4.9378648401682675e-06, + "loss": 0.0021, + "num_input_tokens_seen": 3192192, + "step": 1619 + }, + { + "epoch": 0.2147117296222664, + "grad_norm": 11.899450302124023, + "learning_rate": 4.9377879036945605e-06, + "loss": 0.2224, + "num_input_tokens_seen": 3194432, + "step": 1620 + }, + { + "epoch": 0.21484426772697152, + "grad_norm": 10.466017723083496, + "learning_rate": 4.937710920218622e-06, + "loss": 0.1578, + "num_input_tokens_seen": 3196496, + "step": 1621 + }, + { + "epoch": 0.2149768058316766, + "grad_norm": 23.57000732421875, + "learning_rate": 4.937633889741934e-06, + "loss": 0.2188, + "num_input_tokens_seen": 3199192, + "step": 1622 + }, + { + "epoch": 0.2151093439363817, + "grad_norm": 21.964622497558594, + "learning_rate": 4.937556812265984e-06, + "loss": 0.6021, + "num_input_tokens_seen": 3201088, + "step": 1623 + }, + { + "epoch": 0.21524188204108682, + "grad_norm": 24.73978042602539, + "learning_rate": 4.937479687792257e-06, + "loss": 0.3227, + "num_input_tokens_seen": 3203208, + "step": 1624 + }, + { + "epoch": 0.21537442014579192, + "grad_norm": 9.107304573059082, + "learning_rate": 4.937402516322239e-06, + "loss": 0.2177, + "num_input_tokens_seen": 3204936, + "step": 1625 + }, + { + "epoch": 0.215506958250497, + "grad_norm": 6.08217191696167, + "learning_rate": 4.93732529785742e-06, + "loss": 0.1215, + "num_input_tokens_seen": 3207608, + "step": 1626 + }, + { + "epoch": 0.21563949635520213, + "grad_norm": 15.261053085327148, + "learning_rate": 4.9372480323992885e-06, + "loss": 0.3029, + "num_input_tokens_seen": 3209472, + "step": 1627 + }, + { + "epoch": 0.21577203445990722, + "grad_norm": 6.329408168792725, + "learning_rate": 4.937170719949333e-06, + "loss": 0.0522, + "num_input_tokens_seen": 3210880, + "step": 1628 + }, + { + "epoch": 0.21590457256461232, + "grad_norm": 11.83716106414795, + "learning_rate": 4.937093360509044e-06, + "loss": 0.0852, + "num_input_tokens_seen": 3212544, + "step": 1629 + }, + { + "epoch": 0.21603711066931744, + "grad_norm": 11.321566581726074, + "learning_rate": 4.9370159540799146e-06, + "loss": 0.3831, + "num_input_tokens_seen": 3214944, + "step": 1630 + }, + { + "epoch": 0.21616964877402253, + "grad_norm": 21.89409065246582, + "learning_rate": 4.936938500663436e-06, + "loss": 0.524, + "num_input_tokens_seen": 3216840, + "step": 1631 + }, + { + "epoch": 0.21630218687872763, + "grad_norm": 14.414962768554688, + "learning_rate": 4.936861000261101e-06, + "loss": 0.2561, + "num_input_tokens_seen": 3219256, + "step": 1632 + }, + { + "epoch": 0.21643472498343275, + "grad_norm": 8.296018600463867, + "learning_rate": 4.936783452874406e-06, + "loss": 0.1726, + "num_input_tokens_seen": 3222136, + "step": 1633 + }, + { + "epoch": 0.21656726308813784, + "grad_norm": 7.5279316902160645, + "learning_rate": 4.9367058585048456e-06, + "loss": 0.2471, + "num_input_tokens_seen": 3223968, + "step": 1634 + }, + { + "epoch": 0.21669980119284293, + "grad_norm": 0.36171770095825195, + "learning_rate": 4.936628217153914e-06, + "loss": 0.0019, + "num_input_tokens_seen": 3225528, + "step": 1635 + }, + { + "epoch": 0.21683233929754805, + "grad_norm": 0.8328701257705688, + "learning_rate": 4.93655052882311e-06, + "loss": 0.003, + "num_input_tokens_seen": 3227984, + "step": 1636 + }, + { + "epoch": 0.21696487740225315, + "grad_norm": 4.806658744812012, + "learning_rate": 4.936472793513931e-06, + "loss": 0.0702, + "num_input_tokens_seen": 3229856, + "step": 1637 + }, + { + "epoch": 0.21709741550695824, + "grad_norm": 4.383067607879639, + "learning_rate": 4.936395011227876e-06, + "loss": 0.114, + "num_input_tokens_seen": 3231792, + "step": 1638 + }, + { + "epoch": 0.21722995361166336, + "grad_norm": 0.8216667175292969, + "learning_rate": 4.9363171819664434e-06, + "loss": 0.0024, + "num_input_tokens_seen": 3234576, + "step": 1639 + }, + { + "epoch": 0.21736249171636846, + "grad_norm": 9.61504077911377, + "learning_rate": 4.936239305731135e-06, + "loss": 0.1803, + "num_input_tokens_seen": 3237240, + "step": 1640 + }, + { + "epoch": 0.21749502982107355, + "grad_norm": 12.264850616455078, + "learning_rate": 4.936161382523452e-06, + "loss": 0.3079, + "num_input_tokens_seen": 3239808, + "step": 1641 + }, + { + "epoch": 0.21762756792577867, + "grad_norm": 15.373514175415039, + "learning_rate": 4.936083412344898e-06, + "loss": 0.5683, + "num_input_tokens_seen": 3241840, + "step": 1642 + }, + { + "epoch": 0.21776010603048376, + "grad_norm": 0.06137187033891678, + "learning_rate": 4.936005395196974e-06, + "loss": 0.0003, + "num_input_tokens_seen": 3242800, + "step": 1643 + }, + { + "epoch": 0.21789264413518886, + "grad_norm": 1.243264079093933, + "learning_rate": 4.935927331081186e-06, + "loss": 0.0124, + "num_input_tokens_seen": 3244584, + "step": 1644 + }, + { + "epoch": 0.21802518223989398, + "grad_norm": 0.052737534046173096, + "learning_rate": 4.935849219999039e-06, + "loss": 0.0003, + "num_input_tokens_seen": 3245944, + "step": 1645 + }, + { + "epoch": 0.21815772034459907, + "grad_norm": 2.980928897857666, + "learning_rate": 4.935771061952038e-06, + "loss": 0.0097, + "num_input_tokens_seen": 3248096, + "step": 1646 + }, + { + "epoch": 0.21829025844930416, + "grad_norm": 6.593144416809082, + "learning_rate": 4.935692856941691e-06, + "loss": 0.1378, + "num_input_tokens_seen": 3249680, + "step": 1647 + }, + { + "epoch": 0.21842279655400929, + "grad_norm": 7.3935346603393555, + "learning_rate": 4.935614604969505e-06, + "loss": 0.0455, + "num_input_tokens_seen": 3251336, + "step": 1648 + }, + { + "epoch": 0.21855533465871438, + "grad_norm": 12.530481338500977, + "learning_rate": 4.93553630603699e-06, + "loss": 0.3653, + "num_input_tokens_seen": 3253880, + "step": 1649 + }, + { + "epoch": 0.21868787276341947, + "grad_norm": 12.22199535369873, + "learning_rate": 4.9354579601456545e-06, + "loss": 0.4126, + "num_input_tokens_seen": 3256184, + "step": 1650 + }, + { + "epoch": 0.2188204108681246, + "grad_norm": 0.7065559029579163, + "learning_rate": 4.935379567297009e-06, + "loss": 0.0032, + "num_input_tokens_seen": 3258952, + "step": 1651 + }, + { + "epoch": 0.2189529489728297, + "grad_norm": 9.172409057617188, + "learning_rate": 4.935301127492565e-06, + "loss": 0.2361, + "num_input_tokens_seen": 3260904, + "step": 1652 + }, + { + "epoch": 0.21908548707753478, + "grad_norm": 0.7518073916435242, + "learning_rate": 4.935222640733836e-06, + "loss": 0.004, + "num_input_tokens_seen": 3261976, + "step": 1653 + }, + { + "epoch": 0.2192180251822399, + "grad_norm": 16.84255027770996, + "learning_rate": 4.935144107022334e-06, + "loss": 0.3524, + "num_input_tokens_seen": 3264208, + "step": 1654 + }, + { + "epoch": 0.219350563286945, + "grad_norm": 8.219185829162598, + "learning_rate": 4.935065526359574e-06, + "loss": 0.3323, + "num_input_tokens_seen": 3265776, + "step": 1655 + }, + { + "epoch": 0.2194831013916501, + "grad_norm": 0.07503607869148254, + "learning_rate": 4.934986898747071e-06, + "loss": 0.0004, + "num_input_tokens_seen": 3267272, + "step": 1656 + }, + { + "epoch": 0.2196156394963552, + "grad_norm": 7.613670349121094, + "learning_rate": 4.934908224186341e-06, + "loss": 0.0419, + "num_input_tokens_seen": 3269200, + "step": 1657 + }, + { + "epoch": 0.2197481776010603, + "grad_norm": 13.691352844238281, + "learning_rate": 4.9348295026789e-06, + "loss": 0.3911, + "num_input_tokens_seen": 3271288, + "step": 1658 + }, + { + "epoch": 0.2198807157057654, + "grad_norm": 14.167842864990234, + "learning_rate": 4.934750734226267e-06, + "loss": 0.2875, + "num_input_tokens_seen": 3273136, + "step": 1659 + }, + { + "epoch": 0.22001325381047052, + "grad_norm": 21.928781509399414, + "learning_rate": 4.93467191882996e-06, + "loss": 0.3142, + "num_input_tokens_seen": 3275736, + "step": 1660 + }, + { + "epoch": 0.2201457919151756, + "grad_norm": 9.621772766113281, + "learning_rate": 4.934593056491498e-06, + "loss": 0.1505, + "num_input_tokens_seen": 3278352, + "step": 1661 + }, + { + "epoch": 0.2202783300198807, + "grad_norm": 0.05005446448922157, + "learning_rate": 4.934514147212403e-06, + "loss": 0.0002, + "num_input_tokens_seen": 3279752, + "step": 1662 + }, + { + "epoch": 0.22041086812458582, + "grad_norm": 8.94521713256836, + "learning_rate": 4.934435190994195e-06, + "loss": 0.1141, + "num_input_tokens_seen": 3281664, + "step": 1663 + }, + { + "epoch": 0.22054340622929092, + "grad_norm": 0.062009260058403015, + "learning_rate": 4.9343561878383984e-06, + "loss": 0.0003, + "num_input_tokens_seen": 3283960, + "step": 1664 + }, + { + "epoch": 0.220675944333996, + "grad_norm": 0.11217901110649109, + "learning_rate": 4.934277137746535e-06, + "loss": 0.0004, + "num_input_tokens_seen": 3285912, + "step": 1665 + }, + { + "epoch": 0.22080848243870113, + "grad_norm": 5.784594535827637, + "learning_rate": 4.9341980407201286e-06, + "loss": 0.1341, + "num_input_tokens_seen": 3287576, + "step": 1666 + }, + { + "epoch": 0.22094102054340622, + "grad_norm": 7.565398216247559, + "learning_rate": 4.934118896760705e-06, + "loss": 0.1398, + "num_input_tokens_seen": 3288936, + "step": 1667 + }, + { + "epoch": 0.22107355864811135, + "grad_norm": 24.51899528503418, + "learning_rate": 4.934039705869789e-06, + "loss": 0.8829, + "num_input_tokens_seen": 3290376, + "step": 1668 + }, + { + "epoch": 0.22120609675281644, + "grad_norm": 13.75235366821289, + "learning_rate": 4.933960468048909e-06, + "loss": 0.3791, + "num_input_tokens_seen": 3293080, + "step": 1669 + }, + { + "epoch": 0.22133863485752153, + "grad_norm": 0.8770029544830322, + "learning_rate": 4.9338811832995925e-06, + "loss": 0.0036, + "num_input_tokens_seen": 3295056, + "step": 1670 + }, + { + "epoch": 0.22147117296222665, + "grad_norm": 9.46823787689209, + "learning_rate": 4.933801851623367e-06, + "loss": 0.1653, + "num_input_tokens_seen": 3296984, + "step": 1671 + }, + { + "epoch": 0.22160371106693175, + "grad_norm": 14.412089347839355, + "learning_rate": 4.933722473021763e-06, + "loss": 0.2312, + "num_input_tokens_seen": 3298976, + "step": 1672 + }, + { + "epoch": 0.22173624917163684, + "grad_norm": 16.31475257873535, + "learning_rate": 4.933643047496311e-06, + "loss": 0.3257, + "num_input_tokens_seen": 3300032, + "step": 1673 + }, + { + "epoch": 0.22186878727634196, + "grad_norm": 0.08501935750246048, + "learning_rate": 4.933563575048542e-06, + "loss": 0.0005, + "num_input_tokens_seen": 3302080, + "step": 1674 + }, + { + "epoch": 0.22200132538104705, + "grad_norm": 17.939287185668945, + "learning_rate": 4.9334840556799885e-06, + "loss": 0.5308, + "num_input_tokens_seen": 3303696, + "step": 1675 + }, + { + "epoch": 0.22213386348575215, + "grad_norm": 7.127903461456299, + "learning_rate": 4.933404489392184e-06, + "loss": 0.0889, + "num_input_tokens_seen": 3305528, + "step": 1676 + }, + { + "epoch": 0.22226640159045727, + "grad_norm": 4.072912216186523, + "learning_rate": 4.933324876186661e-06, + "loss": 0.0957, + "num_input_tokens_seen": 3307336, + "step": 1677 + }, + { + "epoch": 0.22239893969516236, + "grad_norm": 11.142993927001953, + "learning_rate": 4.933245216064957e-06, + "loss": 0.1511, + "num_input_tokens_seen": 3308656, + "step": 1678 + }, + { + "epoch": 0.22253147779986746, + "grad_norm": 0.2088087946176529, + "learning_rate": 4.933165509028606e-06, + "loss": 0.0011, + "num_input_tokens_seen": 3310184, + "step": 1679 + }, + { + "epoch": 0.22266401590457258, + "grad_norm": 19.459409713745117, + "learning_rate": 4.933085755079145e-06, + "loss": 0.4961, + "num_input_tokens_seen": 3312464, + "step": 1680 + }, + { + "epoch": 0.22279655400927767, + "grad_norm": 7.557456970214844, + "learning_rate": 4.933005954218113e-06, + "loss": 0.0977, + "num_input_tokens_seen": 3314424, + "step": 1681 + }, + { + "epoch": 0.22292909211398276, + "grad_norm": 14.821629524230957, + "learning_rate": 4.932926106447048e-06, + "loss": 0.4703, + "num_input_tokens_seen": 3316864, + "step": 1682 + }, + { + "epoch": 0.22306163021868788, + "grad_norm": 0.24442431330680847, + "learning_rate": 4.932846211767488e-06, + "loss": 0.0014, + "num_input_tokens_seen": 3318560, + "step": 1683 + }, + { + "epoch": 0.22319416832339298, + "grad_norm": 5.608514308929443, + "learning_rate": 4.932766270180976e-06, + "loss": 0.1271, + "num_input_tokens_seen": 3321832, + "step": 1684 + }, + { + "epoch": 0.22332670642809807, + "grad_norm": 13.253571510314941, + "learning_rate": 4.932686281689051e-06, + "loss": 0.3954, + "num_input_tokens_seen": 3324280, + "step": 1685 + }, + { + "epoch": 0.2234592445328032, + "grad_norm": 17.71905517578125, + "learning_rate": 4.932606246293256e-06, + "loss": 0.4995, + "num_input_tokens_seen": 3326456, + "step": 1686 + }, + { + "epoch": 0.22359178263750829, + "grad_norm": 5.279988765716553, + "learning_rate": 4.932526163995136e-06, + "loss": 0.1175, + "num_input_tokens_seen": 3328128, + "step": 1687 + }, + { + "epoch": 0.22372432074221338, + "grad_norm": 15.228689193725586, + "learning_rate": 4.9324460347962325e-06, + "loss": 0.4672, + "num_input_tokens_seen": 3329872, + "step": 1688 + }, + { + "epoch": 0.2238568588469185, + "grad_norm": 11.764204025268555, + "learning_rate": 4.9323658586980915e-06, + "loss": 0.2224, + "num_input_tokens_seen": 3332776, + "step": 1689 + }, + { + "epoch": 0.2239893969516236, + "grad_norm": 12.289379119873047, + "learning_rate": 4.932285635702259e-06, + "loss": 0.2397, + "num_input_tokens_seen": 3334720, + "step": 1690 + }, + { + "epoch": 0.22412193505632869, + "grad_norm": 30.804235458374023, + "learning_rate": 4.932205365810281e-06, + "loss": 0.6516, + "num_input_tokens_seen": 3336616, + "step": 1691 + }, + { + "epoch": 0.2242544731610338, + "grad_norm": 3.714463710784912, + "learning_rate": 4.932125049023706e-06, + "loss": 0.0245, + "num_input_tokens_seen": 3338296, + "step": 1692 + }, + { + "epoch": 0.2243870112657389, + "grad_norm": 2.002056837081909, + "learning_rate": 4.9320446853440815e-06, + "loss": 0.0113, + "num_input_tokens_seen": 3340400, + "step": 1693 + }, + { + "epoch": 0.224519549370444, + "grad_norm": 2.714550733566284, + "learning_rate": 4.931964274772959e-06, + "loss": 0.0157, + "num_input_tokens_seen": 3341728, + "step": 1694 + }, + { + "epoch": 0.22465208747514911, + "grad_norm": 6.166360378265381, + "learning_rate": 4.931883817311886e-06, + "loss": 0.2204, + "num_input_tokens_seen": 3343792, + "step": 1695 + }, + { + "epoch": 0.2247846255798542, + "grad_norm": 0.5058887004852295, + "learning_rate": 4.931803312962417e-06, + "loss": 0.0029, + "num_input_tokens_seen": 3345528, + "step": 1696 + }, + { + "epoch": 0.2249171636845593, + "grad_norm": 1.7587809562683105, + "learning_rate": 4.931722761726102e-06, + "loss": 0.0108, + "num_input_tokens_seen": 3348384, + "step": 1697 + }, + { + "epoch": 0.22504970178926442, + "grad_norm": 17.575016021728516, + "learning_rate": 4.9316421636044945e-06, + "loss": 0.316, + "num_input_tokens_seen": 3350696, + "step": 1698 + }, + { + "epoch": 0.22518223989396952, + "grad_norm": 1.106317400932312, + "learning_rate": 4.931561518599148e-06, + "loss": 0.0056, + "num_input_tokens_seen": 3352168, + "step": 1699 + }, + { + "epoch": 0.2253147779986746, + "grad_norm": 0.4117383062839508, + "learning_rate": 4.9314808267116185e-06, + "loss": 0.0019, + "num_input_tokens_seen": 3354080, + "step": 1700 + }, + { + "epoch": 0.22544731610337973, + "grad_norm": 13.850700378417969, + "learning_rate": 4.931400087943461e-06, + "loss": 0.2391, + "num_input_tokens_seen": 3355808, + "step": 1701 + }, + { + "epoch": 0.22557985420808482, + "grad_norm": 17.980010986328125, + "learning_rate": 4.9313193022962325e-06, + "loss": 0.265, + "num_input_tokens_seen": 3357592, + "step": 1702 + }, + { + "epoch": 0.22571239231278992, + "grad_norm": 7.735844612121582, + "learning_rate": 4.93123846977149e-06, + "loss": 0.4048, + "num_input_tokens_seen": 3359136, + "step": 1703 + }, + { + "epoch": 0.22584493041749504, + "grad_norm": 5.844985485076904, + "learning_rate": 4.931157590370794e-06, + "loss": 0.112, + "num_input_tokens_seen": 3360624, + "step": 1704 + }, + { + "epoch": 0.22597746852220013, + "grad_norm": 0.04979546740651131, + "learning_rate": 4.931076664095701e-06, + "loss": 0.0003, + "num_input_tokens_seen": 3362272, + "step": 1705 + }, + { + "epoch": 0.22611000662690522, + "grad_norm": 9.528762817382812, + "learning_rate": 4.930995690947774e-06, + "loss": 0.2703, + "num_input_tokens_seen": 3364216, + "step": 1706 + }, + { + "epoch": 0.22624254473161035, + "grad_norm": 18.23267936706543, + "learning_rate": 4.930914670928572e-06, + "loss": 0.4144, + "num_input_tokens_seen": 3366896, + "step": 1707 + }, + { + "epoch": 0.22637508283631544, + "grad_norm": 6.709630966186523, + "learning_rate": 4.930833604039659e-06, + "loss": 0.1561, + "num_input_tokens_seen": 3368736, + "step": 1708 + }, + { + "epoch": 0.22650762094102053, + "grad_norm": 16.42906379699707, + "learning_rate": 4.930752490282596e-06, + "loss": 0.6671, + "num_input_tokens_seen": 3371056, + "step": 1709 + }, + { + "epoch": 0.22664015904572565, + "grad_norm": 30.568758010864258, + "learning_rate": 4.93067132965895e-06, + "loss": 0.7228, + "num_input_tokens_seen": 3373176, + "step": 1710 + }, + { + "epoch": 0.22677269715043075, + "grad_norm": 13.62087345123291, + "learning_rate": 4.930590122170283e-06, + "loss": 0.4284, + "num_input_tokens_seen": 3374608, + "step": 1711 + }, + { + "epoch": 0.22690523525513584, + "grad_norm": 8.012083053588867, + "learning_rate": 4.9305088678181615e-06, + "loss": 0.2542, + "num_input_tokens_seen": 3376512, + "step": 1712 + }, + { + "epoch": 0.22703777335984096, + "grad_norm": 0.43502819538116455, + "learning_rate": 4.930427566604153e-06, + "loss": 0.0025, + "num_input_tokens_seen": 3378304, + "step": 1713 + }, + { + "epoch": 0.22717031146454605, + "grad_norm": 4.435062885284424, + "learning_rate": 4.930346218529822e-06, + "loss": 0.0863, + "num_input_tokens_seen": 3380104, + "step": 1714 + }, + { + "epoch": 0.22730284956925115, + "grad_norm": 14.251241683959961, + "learning_rate": 4.9302648235967415e-06, + "loss": 0.4994, + "num_input_tokens_seen": 3382256, + "step": 1715 + }, + { + "epoch": 0.22743538767395627, + "grad_norm": 0.2695981562137604, + "learning_rate": 4.930183381806478e-06, + "loss": 0.0016, + "num_input_tokens_seen": 3383976, + "step": 1716 + }, + { + "epoch": 0.22756792577866136, + "grad_norm": 23.366899490356445, + "learning_rate": 4.930101893160603e-06, + "loss": 0.8223, + "num_input_tokens_seen": 3385472, + "step": 1717 + }, + { + "epoch": 0.22770046388336646, + "grad_norm": 15.77640438079834, + "learning_rate": 4.930020357660687e-06, + "loss": 0.3659, + "num_input_tokens_seen": 3387008, + "step": 1718 + }, + { + "epoch": 0.22783300198807158, + "grad_norm": 0.5171908140182495, + "learning_rate": 4.929938775308301e-06, + "loss": 0.003, + "num_input_tokens_seen": 3388656, + "step": 1719 + }, + { + "epoch": 0.22796554009277667, + "grad_norm": 16.954925537109375, + "learning_rate": 4.92985714610502e-06, + "loss": 0.3829, + "num_input_tokens_seen": 3390632, + "step": 1720 + }, + { + "epoch": 0.22809807819748176, + "grad_norm": 13.440217018127441, + "learning_rate": 4.929775470052417e-06, + "loss": 0.4645, + "num_input_tokens_seen": 3392376, + "step": 1721 + }, + { + "epoch": 0.22823061630218688, + "grad_norm": 4.9072651863098145, + "learning_rate": 4.929693747152066e-06, + "loss": 0.1009, + "num_input_tokens_seen": 3394760, + "step": 1722 + }, + { + "epoch": 0.22836315440689198, + "grad_norm": 0.882038950920105, + "learning_rate": 4.929611977405544e-06, + "loss": 0.0052, + "num_input_tokens_seen": 3396632, + "step": 1723 + }, + { + "epoch": 0.22849569251159707, + "grad_norm": 5.4153032302856445, + "learning_rate": 4.9295301608144265e-06, + "loss": 0.1757, + "num_input_tokens_seen": 3398176, + "step": 1724 + }, + { + "epoch": 0.2286282306163022, + "grad_norm": 11.803627967834473, + "learning_rate": 4.929448297380292e-06, + "loss": 0.3344, + "num_input_tokens_seen": 3400112, + "step": 1725 + }, + { + "epoch": 0.22876076872100728, + "grad_norm": 16.858858108520508, + "learning_rate": 4.929366387104718e-06, + "loss": 0.3613, + "num_input_tokens_seen": 3402208, + "step": 1726 + }, + { + "epoch": 0.2288933068257124, + "grad_norm": 7.683289527893066, + "learning_rate": 4.929284429989284e-06, + "loss": 0.147, + "num_input_tokens_seen": 3404248, + "step": 1727 + }, + { + "epoch": 0.2290258449304175, + "grad_norm": 10.316078186035156, + "learning_rate": 4.929202426035571e-06, + "loss": 0.404, + "num_input_tokens_seen": 3406632, + "step": 1728 + }, + { + "epoch": 0.2291583830351226, + "grad_norm": 21.18628692626953, + "learning_rate": 4.929120375245159e-06, + "loss": 0.431, + "num_input_tokens_seen": 3407800, + "step": 1729 + }, + { + "epoch": 0.2292909211398277, + "grad_norm": 4.152615070343018, + "learning_rate": 4.9290382776196296e-06, + "loss": 0.0726, + "num_input_tokens_seen": 3409560, + "step": 1730 + }, + { + "epoch": 0.2294234592445328, + "grad_norm": 9.900524139404297, + "learning_rate": 4.9289561331605675e-06, + "loss": 0.3087, + "num_input_tokens_seen": 3411576, + "step": 1731 + }, + { + "epoch": 0.2295559973492379, + "grad_norm": 3.816608190536499, + "learning_rate": 4.928873941869555e-06, + "loss": 0.0217, + "num_input_tokens_seen": 3414272, + "step": 1732 + }, + { + "epoch": 0.22968853545394302, + "grad_norm": 2.1002542972564697, + "learning_rate": 4.928791703748178e-06, + "loss": 0.0123, + "num_input_tokens_seen": 3416040, + "step": 1733 + }, + { + "epoch": 0.22982107355864811, + "grad_norm": 10.384066581726074, + "learning_rate": 4.928709418798021e-06, + "loss": 0.2213, + "num_input_tokens_seen": 3417784, + "step": 1734 + }, + { + "epoch": 0.2299536116633532, + "grad_norm": 10.045069694519043, + "learning_rate": 4.928627087020671e-06, + "loss": 0.0981, + "num_input_tokens_seen": 3419456, + "step": 1735 + }, + { + "epoch": 0.23008614976805833, + "grad_norm": 13.13155460357666, + "learning_rate": 4.9285447084177154e-06, + "loss": 0.3302, + "num_input_tokens_seen": 3420992, + "step": 1736 + }, + { + "epoch": 0.23021868787276342, + "grad_norm": 6.180624008178711, + "learning_rate": 4.928462282990743e-06, + "loss": 0.1132, + "num_input_tokens_seen": 3422904, + "step": 1737 + }, + { + "epoch": 0.23035122597746852, + "grad_norm": 11.870166778564453, + "learning_rate": 4.928379810741341e-06, + "loss": 0.3498, + "num_input_tokens_seen": 3425440, + "step": 1738 + }, + { + "epoch": 0.23048376408217364, + "grad_norm": 17.951276779174805, + "learning_rate": 4.928297291671102e-06, + "loss": 0.4949, + "num_input_tokens_seen": 3427680, + "step": 1739 + }, + { + "epoch": 0.23061630218687873, + "grad_norm": 1.7743738889694214, + "learning_rate": 4.928214725781616e-06, + "loss": 0.01, + "num_input_tokens_seen": 3429736, + "step": 1740 + }, + { + "epoch": 0.23074884029158382, + "grad_norm": 15.868306159973145, + "learning_rate": 4.928132113074474e-06, + "loss": 0.461, + "num_input_tokens_seen": 3432912, + "step": 1741 + }, + { + "epoch": 0.23088137839628894, + "grad_norm": 20.072362899780273, + "learning_rate": 4.928049453551271e-06, + "loss": 0.6744, + "num_input_tokens_seen": 3435504, + "step": 1742 + }, + { + "epoch": 0.23101391650099404, + "grad_norm": 3.188154697418213, + "learning_rate": 4.927966747213599e-06, + "loss": 0.0202, + "num_input_tokens_seen": 3436688, + "step": 1743 + }, + { + "epoch": 0.23114645460569913, + "grad_norm": 6.160930156707764, + "learning_rate": 4.927883994063053e-06, + "loss": 0.1286, + "num_input_tokens_seen": 3438584, + "step": 1744 + }, + { + "epoch": 0.23127899271040425, + "grad_norm": 1.9092910289764404, + "learning_rate": 4.9278011941012284e-06, + "loss": 0.0086, + "num_input_tokens_seen": 3440608, + "step": 1745 + }, + { + "epoch": 0.23141153081510935, + "grad_norm": 1.3322649002075195, + "learning_rate": 4.927718347329723e-06, + "loss": 0.0072, + "num_input_tokens_seen": 3442448, + "step": 1746 + }, + { + "epoch": 0.23154406891981444, + "grad_norm": 6.96110200881958, + "learning_rate": 4.927635453750132e-06, + "loss": 0.2768, + "num_input_tokens_seen": 3444360, + "step": 1747 + }, + { + "epoch": 0.23167660702451956, + "grad_norm": 8.553160667419434, + "learning_rate": 4.927552513364056e-06, + "loss": 0.1197, + "num_input_tokens_seen": 3446264, + "step": 1748 + }, + { + "epoch": 0.23180914512922465, + "grad_norm": 0.1770925670862198, + "learning_rate": 4.927469526173092e-06, + "loss": 0.001, + "num_input_tokens_seen": 3448256, + "step": 1749 + }, + { + "epoch": 0.23194168323392975, + "grad_norm": 8.112807273864746, + "learning_rate": 4.92738649217884e-06, + "loss": 0.0655, + "num_input_tokens_seen": 3450152, + "step": 1750 + }, + { + "epoch": 0.23207422133863487, + "grad_norm": 20.28702735900879, + "learning_rate": 4.927303411382903e-06, + "loss": 0.4507, + "num_input_tokens_seen": 3452488, + "step": 1751 + }, + { + "epoch": 0.23220675944333996, + "grad_norm": 15.00679874420166, + "learning_rate": 4.927220283786882e-06, + "loss": 0.3254, + "num_input_tokens_seen": 3453624, + "step": 1752 + }, + { + "epoch": 0.23233929754804505, + "grad_norm": 5.46200704574585, + "learning_rate": 4.927137109392378e-06, + "loss": 0.0466, + "num_input_tokens_seen": 3457080, + "step": 1753 + }, + { + "epoch": 0.23247183565275018, + "grad_norm": 0.13290804624557495, + "learning_rate": 4.927053888200998e-06, + "loss": 0.0007, + "num_input_tokens_seen": 3459176, + "step": 1754 + }, + { + "epoch": 0.23260437375745527, + "grad_norm": 8.002230644226074, + "learning_rate": 4.9269706202143435e-06, + "loss": 0.1621, + "num_input_tokens_seen": 3460944, + "step": 1755 + }, + { + "epoch": 0.23273691186216036, + "grad_norm": 30.684972763061523, + "learning_rate": 4.926887305434022e-06, + "loss": 0.303, + "num_input_tokens_seen": 3463560, + "step": 1756 + }, + { + "epoch": 0.23286944996686548, + "grad_norm": 4.7676005363464355, + "learning_rate": 4.926803943861638e-06, + "loss": 0.0179, + "num_input_tokens_seen": 3465872, + "step": 1757 + }, + { + "epoch": 0.23300198807157058, + "grad_norm": 0.5567274689674377, + "learning_rate": 4.9267205354988e-06, + "loss": 0.0023, + "num_input_tokens_seen": 3468184, + "step": 1758 + }, + { + "epoch": 0.23313452617627567, + "grad_norm": 7.913361549377441, + "learning_rate": 4.926637080347116e-06, + "loss": 0.3205, + "num_input_tokens_seen": 3470720, + "step": 1759 + }, + { + "epoch": 0.2332670642809808, + "grad_norm": 0.11509038507938385, + "learning_rate": 4.9265535784081965e-06, + "loss": 0.0006, + "num_input_tokens_seen": 3472824, + "step": 1760 + }, + { + "epoch": 0.23339960238568588, + "grad_norm": 7.033679962158203, + "learning_rate": 4.9264700296836485e-06, + "loss": 0.0979, + "num_input_tokens_seen": 3474704, + "step": 1761 + }, + { + "epoch": 0.23353214049039098, + "grad_norm": 18.40431022644043, + "learning_rate": 4.926386434175085e-06, + "loss": 0.461, + "num_input_tokens_seen": 3476000, + "step": 1762 + }, + { + "epoch": 0.2336646785950961, + "grad_norm": 21.795299530029297, + "learning_rate": 4.926302791884116e-06, + "loss": 0.7161, + "num_input_tokens_seen": 3479176, + "step": 1763 + }, + { + "epoch": 0.2337972166998012, + "grad_norm": 15.102032661437988, + "learning_rate": 4.926219102812357e-06, + "loss": 0.3513, + "num_input_tokens_seen": 3482088, + "step": 1764 + }, + { + "epoch": 0.23392975480450628, + "grad_norm": 3.790903091430664, + "learning_rate": 4.926135366961419e-06, + "loss": 0.0399, + "num_input_tokens_seen": 3484120, + "step": 1765 + }, + { + "epoch": 0.2340622929092114, + "grad_norm": 19.174198150634766, + "learning_rate": 4.926051584332917e-06, + "loss": 0.4966, + "num_input_tokens_seen": 3486000, + "step": 1766 + }, + { + "epoch": 0.2341948310139165, + "grad_norm": 0.1310090869665146, + "learning_rate": 4.925967754928468e-06, + "loss": 0.0007, + "num_input_tokens_seen": 3487392, + "step": 1767 + }, + { + "epoch": 0.2343273691186216, + "grad_norm": 9.813427925109863, + "learning_rate": 4.925883878749687e-06, + "loss": 0.0898, + "num_input_tokens_seen": 3489256, + "step": 1768 + }, + { + "epoch": 0.2344599072233267, + "grad_norm": 5.372792720794678, + "learning_rate": 4.92579995579819e-06, + "loss": 0.2185, + "num_input_tokens_seen": 3491352, + "step": 1769 + }, + { + "epoch": 0.2345924453280318, + "grad_norm": 8.608305931091309, + "learning_rate": 4.925715986075597e-06, + "loss": 0.2153, + "num_input_tokens_seen": 3492592, + "step": 1770 + }, + { + "epoch": 0.2347249834327369, + "grad_norm": 14.550671577453613, + "learning_rate": 4.925631969583526e-06, + "loss": 0.5905, + "num_input_tokens_seen": 3495672, + "step": 1771 + }, + { + "epoch": 0.23485752153744202, + "grad_norm": 18.621952056884766, + "learning_rate": 4.925547906323598e-06, + "loss": 0.3413, + "num_input_tokens_seen": 3497848, + "step": 1772 + }, + { + "epoch": 0.23499005964214711, + "grad_norm": 0.3151490390300751, + "learning_rate": 4.925463796297433e-06, + "loss": 0.0017, + "num_input_tokens_seen": 3499384, + "step": 1773 + }, + { + "epoch": 0.2351225977468522, + "grad_norm": 10.724369049072266, + "learning_rate": 4.9253796395066534e-06, + "loss": 0.0497, + "num_input_tokens_seen": 3501088, + "step": 1774 + }, + { + "epoch": 0.23525513585155733, + "grad_norm": 13.683547973632812, + "learning_rate": 4.92529543595288e-06, + "loss": 0.3988, + "num_input_tokens_seen": 3504152, + "step": 1775 + }, + { + "epoch": 0.23538767395626242, + "grad_norm": 0.2387828826904297, + "learning_rate": 4.925211185637738e-06, + "loss": 0.0013, + "num_input_tokens_seen": 3505560, + "step": 1776 + }, + { + "epoch": 0.23552021206096752, + "grad_norm": 12.554303169250488, + "learning_rate": 4.9251268885628515e-06, + "loss": 0.1244, + "num_input_tokens_seen": 3507160, + "step": 1777 + }, + { + "epoch": 0.23565275016567264, + "grad_norm": 4.6954450607299805, + "learning_rate": 4.925042544729845e-06, + "loss": 0.0607, + "num_input_tokens_seen": 3509880, + "step": 1778 + }, + { + "epoch": 0.23578528827037773, + "grad_norm": 13.988066673278809, + "learning_rate": 4.924958154140345e-06, + "loss": 0.3042, + "num_input_tokens_seen": 3511384, + "step": 1779 + }, + { + "epoch": 0.23591782637508282, + "grad_norm": 0.21485891938209534, + "learning_rate": 4.924873716795979e-06, + "loss": 0.0012, + "num_input_tokens_seen": 3513256, + "step": 1780 + }, + { + "epoch": 0.23605036447978794, + "grad_norm": 8.725808143615723, + "learning_rate": 4.924789232698376e-06, + "loss": 0.1306, + "num_input_tokens_seen": 3514912, + "step": 1781 + }, + { + "epoch": 0.23618290258449304, + "grad_norm": 9.861848831176758, + "learning_rate": 4.924704701849164e-06, + "loss": 0.1018, + "num_input_tokens_seen": 3516800, + "step": 1782 + }, + { + "epoch": 0.23631544068919813, + "grad_norm": 9.570998191833496, + "learning_rate": 4.92462012424997e-06, + "loss": 0.129, + "num_input_tokens_seen": 3518384, + "step": 1783 + }, + { + "epoch": 0.23644797879390325, + "grad_norm": 18.63361930847168, + "learning_rate": 4.92453549990243e-06, + "loss": 0.3637, + "num_input_tokens_seen": 3520240, + "step": 1784 + }, + { + "epoch": 0.23658051689860835, + "grad_norm": 11.900542259216309, + "learning_rate": 4.924450828808172e-06, + "loss": 0.2691, + "num_input_tokens_seen": 3522072, + "step": 1785 + }, + { + "epoch": 0.23671305500331347, + "grad_norm": 0.7060779929161072, + "learning_rate": 4.924366110968829e-06, + "loss": 0.0038, + "num_input_tokens_seen": 3523424, + "step": 1786 + }, + { + "epoch": 0.23684559310801856, + "grad_norm": 22.841415405273438, + "learning_rate": 4.9242813463860364e-06, + "loss": 0.2523, + "num_input_tokens_seen": 3525408, + "step": 1787 + }, + { + "epoch": 0.23697813121272365, + "grad_norm": 16.0593318939209, + "learning_rate": 4.924196535061426e-06, + "loss": 0.4969, + "num_input_tokens_seen": 3527560, + "step": 1788 + }, + { + "epoch": 0.23711066931742877, + "grad_norm": 6.943134784698486, + "learning_rate": 4.924111676996634e-06, + "loss": 0.0958, + "num_input_tokens_seen": 3529648, + "step": 1789 + }, + { + "epoch": 0.23724320742213387, + "grad_norm": 0.562179446220398, + "learning_rate": 4.924026772193296e-06, + "loss": 0.0027, + "num_input_tokens_seen": 3532208, + "step": 1790 + }, + { + "epoch": 0.23737574552683896, + "grad_norm": 9.025856971740723, + "learning_rate": 4.923941820653051e-06, + "loss": 0.18, + "num_input_tokens_seen": 3533776, + "step": 1791 + }, + { + "epoch": 0.23750828363154408, + "grad_norm": 0.2391253411769867, + "learning_rate": 4.923856822377534e-06, + "loss": 0.0013, + "num_input_tokens_seen": 3535928, + "step": 1792 + }, + { + "epoch": 0.23764082173624917, + "grad_norm": 8.657227516174316, + "learning_rate": 4.923771777368386e-06, + "loss": 0.0707, + "num_input_tokens_seen": 3537896, + "step": 1793 + }, + { + "epoch": 0.23777335984095427, + "grad_norm": 4.383547306060791, + "learning_rate": 4.923686685627246e-06, + "loss": 0.0313, + "num_input_tokens_seen": 3539320, + "step": 1794 + }, + { + "epoch": 0.2379058979456594, + "grad_norm": 1.466487169265747, + "learning_rate": 4.923601547155755e-06, + "loss": 0.0066, + "num_input_tokens_seen": 3540680, + "step": 1795 + }, + { + "epoch": 0.23803843605036448, + "grad_norm": 18.77025604248047, + "learning_rate": 4.923516361955554e-06, + "loss": 0.5952, + "num_input_tokens_seen": 3542800, + "step": 1796 + }, + { + "epoch": 0.23817097415506958, + "grad_norm": 14.198873519897461, + "learning_rate": 4.923431130028285e-06, + "loss": 0.5729, + "num_input_tokens_seen": 3545096, + "step": 1797 + }, + { + "epoch": 0.2383035122597747, + "grad_norm": 0.12466581910848618, + "learning_rate": 4.923345851375594e-06, + "loss": 0.0006, + "num_input_tokens_seen": 3546304, + "step": 1798 + }, + { + "epoch": 0.2384360503644798, + "grad_norm": 0.03186096251010895, + "learning_rate": 4.923260525999122e-06, + "loss": 0.0002, + "num_input_tokens_seen": 3548768, + "step": 1799 + }, + { + "epoch": 0.23856858846918488, + "grad_norm": 3.608673095703125, + "learning_rate": 4.923175153900514e-06, + "loss": 0.1196, + "num_input_tokens_seen": 3550696, + "step": 1800 + }, + { + "epoch": 0.23870112657389, + "grad_norm": 13.959561347961426, + "learning_rate": 4.9230897350814186e-06, + "loss": 0.3268, + "num_input_tokens_seen": 3552624, + "step": 1801 + }, + { + "epoch": 0.2388336646785951, + "grad_norm": 0.07867728918790817, + "learning_rate": 4.923004269543482e-06, + "loss": 0.0004, + "num_input_tokens_seen": 3554024, + "step": 1802 + }, + { + "epoch": 0.2389662027833002, + "grad_norm": 0.03175168111920357, + "learning_rate": 4.9229187572883504e-06, + "loss": 0.0002, + "num_input_tokens_seen": 3556216, + "step": 1803 + }, + { + "epoch": 0.2390987408880053, + "grad_norm": 7.483557224273682, + "learning_rate": 4.922833198317675e-06, + "loss": 0.1845, + "num_input_tokens_seen": 3558192, + "step": 1804 + }, + { + "epoch": 0.2392312789927104, + "grad_norm": 3.9855382442474365, + "learning_rate": 4.922747592633103e-06, + "loss": 0.1151, + "num_input_tokens_seen": 3559648, + "step": 1805 + }, + { + "epoch": 0.2393638170974155, + "grad_norm": 0.047177959233522415, + "learning_rate": 4.9226619402362865e-06, + "loss": 0.0003, + "num_input_tokens_seen": 3560928, + "step": 1806 + }, + { + "epoch": 0.23949635520212062, + "grad_norm": 11.509875297546387, + "learning_rate": 4.922576241128877e-06, + "loss": 0.1308, + "num_input_tokens_seen": 3562592, + "step": 1807 + }, + { + "epoch": 0.2396288933068257, + "grad_norm": 16.34734344482422, + "learning_rate": 4.922490495312525e-06, + "loss": 0.2195, + "num_input_tokens_seen": 3564512, + "step": 1808 + }, + { + "epoch": 0.2397614314115308, + "grad_norm": 12.467219352722168, + "learning_rate": 4.922404702788886e-06, + "loss": 0.1308, + "num_input_tokens_seen": 3566200, + "step": 1809 + }, + { + "epoch": 0.23989396951623593, + "grad_norm": 10.48387622833252, + "learning_rate": 4.922318863559613e-06, + "loss": 0.2352, + "num_input_tokens_seen": 3567808, + "step": 1810 + }, + { + "epoch": 0.24002650762094102, + "grad_norm": 3.375532865524292, + "learning_rate": 4.922232977626361e-06, + "loss": 0.0192, + "num_input_tokens_seen": 3569400, + "step": 1811 + }, + { + "epoch": 0.24015904572564611, + "grad_norm": 0.07941216230392456, + "learning_rate": 4.922147044990787e-06, + "loss": 0.0005, + "num_input_tokens_seen": 3570800, + "step": 1812 + }, + { + "epoch": 0.24029158383035124, + "grad_norm": 30.34292984008789, + "learning_rate": 4.922061065654547e-06, + "loss": 0.6347, + "num_input_tokens_seen": 3573368, + "step": 1813 + }, + { + "epoch": 0.24042412193505633, + "grad_norm": 3.300945997238159, + "learning_rate": 4.921975039619297e-06, + "loss": 0.0396, + "num_input_tokens_seen": 3574680, + "step": 1814 + }, + { + "epoch": 0.24055666003976142, + "grad_norm": 0.23001621663570404, + "learning_rate": 4.921888966886699e-06, + "loss": 0.0012, + "num_input_tokens_seen": 3575992, + "step": 1815 + }, + { + "epoch": 0.24068919814446654, + "grad_norm": 9.388580322265625, + "learning_rate": 4.92180284745841e-06, + "loss": 0.0612, + "num_input_tokens_seen": 3578264, + "step": 1816 + }, + { + "epoch": 0.24082173624917164, + "grad_norm": 10.528152465820312, + "learning_rate": 4.921716681336092e-06, + "loss": 0.2871, + "num_input_tokens_seen": 3580032, + "step": 1817 + }, + { + "epoch": 0.24095427435387673, + "grad_norm": 34.9313850402832, + "learning_rate": 4.921630468521406e-06, + "loss": 0.243, + "num_input_tokens_seen": 3583576, + "step": 1818 + }, + { + "epoch": 0.24108681245858185, + "grad_norm": 11.806851387023926, + "learning_rate": 4.921544209016014e-06, + "loss": 0.3089, + "num_input_tokens_seen": 3585168, + "step": 1819 + }, + { + "epoch": 0.24121935056328694, + "grad_norm": 14.904276847839355, + "learning_rate": 4.921457902821578e-06, + "loss": 0.3052, + "num_input_tokens_seen": 3587368, + "step": 1820 + }, + { + "epoch": 0.24135188866799204, + "grad_norm": 15.053776741027832, + "learning_rate": 4.9213715499397645e-06, + "loss": 0.262, + "num_input_tokens_seen": 3590160, + "step": 1821 + }, + { + "epoch": 0.24148442677269716, + "grad_norm": 18.164548873901367, + "learning_rate": 4.921285150372236e-06, + "loss": 0.591, + "num_input_tokens_seen": 3592048, + "step": 1822 + }, + { + "epoch": 0.24161696487740225, + "grad_norm": 13.959782600402832, + "learning_rate": 4.92119870412066e-06, + "loss": 0.2746, + "num_input_tokens_seen": 3594208, + "step": 1823 + }, + { + "epoch": 0.24174950298210734, + "grad_norm": 17.967731475830078, + "learning_rate": 4.921112211186703e-06, + "loss": 0.3412, + "num_input_tokens_seen": 3596640, + "step": 1824 + }, + { + "epoch": 0.24188204108681247, + "grad_norm": 0.15441210567951202, + "learning_rate": 4.9210256715720315e-06, + "loss": 0.0009, + "num_input_tokens_seen": 3598488, + "step": 1825 + }, + { + "epoch": 0.24201457919151756, + "grad_norm": 0.9628341197967529, + "learning_rate": 4.9209390852783155e-06, + "loss": 0.0046, + "num_input_tokens_seen": 3599624, + "step": 1826 + }, + { + "epoch": 0.24214711729622265, + "grad_norm": 14.651203155517578, + "learning_rate": 4.920852452307224e-06, + "loss": 0.2891, + "num_input_tokens_seen": 3602008, + "step": 1827 + }, + { + "epoch": 0.24227965540092777, + "grad_norm": 4.754458427429199, + "learning_rate": 4.920765772660426e-06, + "loss": 0.0363, + "num_input_tokens_seen": 3603712, + "step": 1828 + }, + { + "epoch": 0.24241219350563287, + "grad_norm": 14.206003189086914, + "learning_rate": 4.920679046339595e-06, + "loss": 0.4401, + "num_input_tokens_seen": 3606744, + "step": 1829 + }, + { + "epoch": 0.24254473161033796, + "grad_norm": 1.5548858642578125, + "learning_rate": 4.920592273346402e-06, + "loss": 0.0068, + "num_input_tokens_seen": 3609376, + "step": 1830 + }, + { + "epoch": 0.24267726971504308, + "grad_norm": 12.981927871704102, + "learning_rate": 4.92050545368252e-06, + "loss": 0.3457, + "num_input_tokens_seen": 3611648, + "step": 1831 + }, + { + "epoch": 0.24280980781974817, + "grad_norm": 0.07560261338949203, + "learning_rate": 4.920418587349622e-06, + "loss": 0.0005, + "num_input_tokens_seen": 3613488, + "step": 1832 + }, + { + "epoch": 0.24294234592445327, + "grad_norm": 1.855639100074768, + "learning_rate": 4.9203316743493855e-06, + "loss": 0.0124, + "num_input_tokens_seen": 3615760, + "step": 1833 + }, + { + "epoch": 0.2430748840291584, + "grad_norm": 37.005455017089844, + "learning_rate": 4.920244714683483e-06, + "loss": 0.1748, + "num_input_tokens_seen": 3617584, + "step": 1834 + }, + { + "epoch": 0.24320742213386348, + "grad_norm": 0.07631472498178482, + "learning_rate": 4.920157708353594e-06, + "loss": 0.0004, + "num_input_tokens_seen": 3619216, + "step": 1835 + }, + { + "epoch": 0.24333996023856858, + "grad_norm": 32.240447998046875, + "learning_rate": 4.920070655361394e-06, + "loss": 0.6, + "num_input_tokens_seen": 3621344, + "step": 1836 + }, + { + "epoch": 0.2434724983432737, + "grad_norm": 4.75414514541626, + "learning_rate": 4.919983555708563e-06, + "loss": 0.0764, + "num_input_tokens_seen": 3623064, + "step": 1837 + }, + { + "epoch": 0.2436050364479788, + "grad_norm": 0.039452098309993744, + "learning_rate": 4.919896409396779e-06, + "loss": 0.0002, + "num_input_tokens_seen": 3625264, + "step": 1838 + }, + { + "epoch": 0.24373757455268388, + "grad_norm": 2.0670645236968994, + "learning_rate": 4.919809216427723e-06, + "loss": 0.013, + "num_input_tokens_seen": 3626880, + "step": 1839 + }, + { + "epoch": 0.243870112657389, + "grad_norm": 0.031204301863908768, + "learning_rate": 4.919721976803076e-06, + "loss": 0.0002, + "num_input_tokens_seen": 3628840, + "step": 1840 + }, + { + "epoch": 0.2440026507620941, + "grad_norm": 0.0632593184709549, + "learning_rate": 4.91963469052452e-06, + "loss": 0.0003, + "num_input_tokens_seen": 3631488, + "step": 1841 + }, + { + "epoch": 0.2441351888667992, + "grad_norm": 7.157535076141357, + "learning_rate": 4.919547357593738e-06, + "loss": 0.2211, + "num_input_tokens_seen": 3633320, + "step": 1842 + }, + { + "epoch": 0.2442677269715043, + "grad_norm": 5.95433235168457, + "learning_rate": 4.9194599780124145e-06, + "loss": 0.1283, + "num_input_tokens_seen": 3635992, + "step": 1843 + }, + { + "epoch": 0.2444002650762094, + "grad_norm": 28.20613670349121, + "learning_rate": 4.919372551782232e-06, + "loss": 0.9456, + "num_input_tokens_seen": 3638056, + "step": 1844 + }, + { + "epoch": 0.24453280318091453, + "grad_norm": 22.270618438720703, + "learning_rate": 4.9192850789048785e-06, + "loss": 0.6525, + "num_input_tokens_seen": 3640472, + "step": 1845 + }, + { + "epoch": 0.24466534128561962, + "grad_norm": 9.557283401489258, + "learning_rate": 4.919197559382039e-06, + "loss": 0.1512, + "num_input_tokens_seen": 3642552, + "step": 1846 + }, + { + "epoch": 0.2447978793903247, + "grad_norm": 0.05236904323101044, + "learning_rate": 4.919109993215403e-06, + "loss": 0.0003, + "num_input_tokens_seen": 3644048, + "step": 1847 + }, + { + "epoch": 0.24493041749502983, + "grad_norm": 14.896482467651367, + "learning_rate": 4.919022380406657e-06, + "loss": 0.2717, + "num_input_tokens_seen": 3645704, + "step": 1848 + }, + { + "epoch": 0.24506295559973493, + "grad_norm": 0.045757632702589035, + "learning_rate": 4.91893472095749e-06, + "loss": 0.0003, + "num_input_tokens_seen": 3648448, + "step": 1849 + }, + { + "epoch": 0.24519549370444002, + "grad_norm": 7.438886642456055, + "learning_rate": 4.9188470148695936e-06, + "loss": 0.1148, + "num_input_tokens_seen": 3650144, + "step": 1850 + }, + { + "epoch": 0.24532803180914514, + "grad_norm": 14.617999076843262, + "learning_rate": 4.918759262144658e-06, + "loss": 0.1507, + "num_input_tokens_seen": 3653088, + "step": 1851 + }, + { + "epoch": 0.24546056991385024, + "grad_norm": 0.09754247963428497, + "learning_rate": 4.918671462784375e-06, + "loss": 0.0006, + "num_input_tokens_seen": 3654768, + "step": 1852 + }, + { + "epoch": 0.24559310801855533, + "grad_norm": 0.06411875039339066, + "learning_rate": 4.918583616790438e-06, + "loss": 0.0004, + "num_input_tokens_seen": 3655920, + "step": 1853 + }, + { + "epoch": 0.24572564612326045, + "grad_norm": 17.208988189697266, + "learning_rate": 4.918495724164539e-06, + "loss": 0.6348, + "num_input_tokens_seen": 3658016, + "step": 1854 + }, + { + "epoch": 0.24585818422796554, + "grad_norm": 12.79676628112793, + "learning_rate": 4.9184077849083756e-06, + "loss": 0.4025, + "num_input_tokens_seen": 3659200, + "step": 1855 + }, + { + "epoch": 0.24599072233267064, + "grad_norm": 15.47212028503418, + "learning_rate": 4.918319799023641e-06, + "loss": 0.3718, + "num_input_tokens_seen": 3662184, + "step": 1856 + }, + { + "epoch": 0.24612326043737576, + "grad_norm": 7.295891284942627, + "learning_rate": 4.918231766512033e-06, + "loss": 0.4086, + "num_input_tokens_seen": 3664936, + "step": 1857 + }, + { + "epoch": 0.24625579854208085, + "grad_norm": 6.603776931762695, + "learning_rate": 4.918143687375248e-06, + "loss": 0.1973, + "num_input_tokens_seen": 3666792, + "step": 1858 + }, + { + "epoch": 0.24638833664678594, + "grad_norm": 12.235093116760254, + "learning_rate": 4.918055561614984e-06, + "loss": 0.3252, + "num_input_tokens_seen": 3669520, + "step": 1859 + }, + { + "epoch": 0.24652087475149106, + "grad_norm": 5.89321231842041, + "learning_rate": 4.91796738923294e-06, + "loss": 0.0646, + "num_input_tokens_seen": 3671104, + "step": 1860 + }, + { + "epoch": 0.24665341285619616, + "grad_norm": 26.585529327392578, + "learning_rate": 4.917879170230819e-06, + "loss": 1.3579, + "num_input_tokens_seen": 3672744, + "step": 1861 + }, + { + "epoch": 0.24678595096090125, + "grad_norm": 8.526714324951172, + "learning_rate": 4.9177909046103175e-06, + "loss": 0.2967, + "num_input_tokens_seen": 3674832, + "step": 1862 + }, + { + "epoch": 0.24691848906560637, + "grad_norm": 9.289690017700195, + "learning_rate": 4.9177025923731405e-06, + "loss": 0.284, + "num_input_tokens_seen": 3677432, + "step": 1863 + }, + { + "epoch": 0.24705102717031147, + "grad_norm": 1.7415971755981445, + "learning_rate": 4.917614233520988e-06, + "loss": 0.0105, + "num_input_tokens_seen": 3678936, + "step": 1864 + }, + { + "epoch": 0.24718356527501656, + "grad_norm": 11.623764991760254, + "learning_rate": 4.917525828055567e-06, + "loss": 0.2367, + "num_input_tokens_seen": 3681576, + "step": 1865 + }, + { + "epoch": 0.24731610337972168, + "grad_norm": 10.674825668334961, + "learning_rate": 4.91743737597858e-06, + "loss": 0.3292, + "num_input_tokens_seen": 3684800, + "step": 1866 + }, + { + "epoch": 0.24744864148442677, + "grad_norm": 25.170217514038086, + "learning_rate": 4.917348877291732e-06, + "loss": 0.6979, + "num_input_tokens_seen": 3686672, + "step": 1867 + }, + { + "epoch": 0.24758117958913187, + "grad_norm": 6.641921043395996, + "learning_rate": 4.9172603319967305e-06, + "loss": 0.0466, + "num_input_tokens_seen": 3689880, + "step": 1868 + }, + { + "epoch": 0.247713717693837, + "grad_norm": 11.338716506958008, + "learning_rate": 4.917171740095282e-06, + "loss": 0.168, + "num_input_tokens_seen": 3691768, + "step": 1869 + }, + { + "epoch": 0.24784625579854208, + "grad_norm": 6.881922721862793, + "learning_rate": 4.917083101589095e-06, + "loss": 0.1734, + "num_input_tokens_seen": 3693824, + "step": 1870 + }, + { + "epoch": 0.24797879390324717, + "grad_norm": 3.25223445892334, + "learning_rate": 4.916994416479879e-06, + "loss": 0.0699, + "num_input_tokens_seen": 3695640, + "step": 1871 + }, + { + "epoch": 0.2481113320079523, + "grad_norm": 7.841526985168457, + "learning_rate": 4.916905684769342e-06, + "loss": 0.0838, + "num_input_tokens_seen": 3697688, + "step": 1872 + }, + { + "epoch": 0.2482438701126574, + "grad_norm": 13.312283515930176, + "learning_rate": 4.916816906459197e-06, + "loss": 0.5383, + "num_input_tokens_seen": 3700360, + "step": 1873 + }, + { + "epoch": 0.24837640821736248, + "grad_norm": 11.762429237365723, + "learning_rate": 4.916728081551155e-06, + "loss": 0.3084, + "num_input_tokens_seen": 3702608, + "step": 1874 + }, + { + "epoch": 0.2485089463220676, + "grad_norm": 15.119577407836914, + "learning_rate": 4.9166392100469275e-06, + "loss": 0.4866, + "num_input_tokens_seen": 3704632, + "step": 1875 + }, + { + "epoch": 0.2486414844267727, + "grad_norm": 1.252411961555481, + "learning_rate": 4.916550291948229e-06, + "loss": 0.0078, + "num_input_tokens_seen": 3706016, + "step": 1876 + }, + { + "epoch": 0.2487740225314778, + "grad_norm": 28.945934295654297, + "learning_rate": 4.916461327256774e-06, + "loss": 0.9813, + "num_input_tokens_seen": 3708080, + "step": 1877 + }, + { + "epoch": 0.2489065606361829, + "grad_norm": 10.158907890319824, + "learning_rate": 4.916372315974277e-06, + "loss": 0.3053, + "num_input_tokens_seen": 3711008, + "step": 1878 + }, + { + "epoch": 0.249039098740888, + "grad_norm": 9.339791297912598, + "learning_rate": 4.916283258102455e-06, + "loss": 0.181, + "num_input_tokens_seen": 3712848, + "step": 1879 + }, + { + "epoch": 0.2491716368455931, + "grad_norm": 9.664850234985352, + "learning_rate": 4.916194153643026e-06, + "loss": 0.1581, + "num_input_tokens_seen": 3714368, + "step": 1880 + }, + { + "epoch": 0.24930417495029822, + "grad_norm": 2.597472667694092, + "learning_rate": 4.916105002597706e-06, + "loss": 0.0853, + "num_input_tokens_seen": 3716024, + "step": 1881 + }, + { + "epoch": 0.2494367130550033, + "grad_norm": 13.91226863861084, + "learning_rate": 4.916015804968215e-06, + "loss": 0.2494, + "num_input_tokens_seen": 3718872, + "step": 1882 + }, + { + "epoch": 0.2495692511597084, + "grad_norm": 11.05497932434082, + "learning_rate": 4.915926560756272e-06, + "loss": 0.2445, + "num_input_tokens_seen": 3720472, + "step": 1883 + }, + { + "epoch": 0.24970178926441353, + "grad_norm": 3.511528730392456, + "learning_rate": 4.915837269963599e-06, + "loss": 0.0094, + "num_input_tokens_seen": 3721960, + "step": 1884 + }, + { + "epoch": 0.24983432736911862, + "grad_norm": 8.397645950317383, + "learning_rate": 4.915747932591916e-06, + "loss": 0.0718, + "num_input_tokens_seen": 3724280, + "step": 1885 + }, + { + "epoch": 0.2499668654738237, + "grad_norm": 12.69995403289795, + "learning_rate": 4.915658548642947e-06, + "loss": 0.2206, + "num_input_tokens_seen": 3726112, + "step": 1886 + }, + { + "epoch": 0.25009940357852883, + "grad_norm": 3.728536605834961, + "learning_rate": 4.915569118118414e-06, + "loss": 0.0807, + "num_input_tokens_seen": 3727584, + "step": 1887 + }, + { + "epoch": 0.2502319416832339, + "grad_norm": 8.45195484161377, + "learning_rate": 4.9154796410200424e-06, + "loss": 0.247, + "num_input_tokens_seen": 3729936, + "step": 1888 + }, + { + "epoch": 0.250364479787939, + "grad_norm": 8.986550331115723, + "learning_rate": 4.915390117349558e-06, + "loss": 0.1091, + "num_input_tokens_seen": 3731496, + "step": 1889 + }, + { + "epoch": 0.2504970178926441, + "grad_norm": 17.828346252441406, + "learning_rate": 4.915300547108684e-06, + "loss": 0.5773, + "num_input_tokens_seen": 3734072, + "step": 1890 + }, + { + "epoch": 0.25062955599734926, + "grad_norm": 5.765731334686279, + "learning_rate": 4.9152109302991505e-06, + "loss": 0.0485, + "num_input_tokens_seen": 3736088, + "step": 1891 + }, + { + "epoch": 0.25076209410205436, + "grad_norm": 14.23884391784668, + "learning_rate": 4.915121266922684e-06, + "loss": 0.3383, + "num_input_tokens_seen": 3738456, + "step": 1892 + }, + { + "epoch": 0.25089463220675945, + "grad_norm": 0.36799290776252747, + "learning_rate": 4.9150315569810125e-06, + "loss": 0.0019, + "num_input_tokens_seen": 3740016, + "step": 1893 + }, + { + "epoch": 0.25102717031146454, + "grad_norm": 12.369023323059082, + "learning_rate": 4.9149418004758675e-06, + "loss": 0.3953, + "num_input_tokens_seen": 3741648, + "step": 1894 + }, + { + "epoch": 0.25115970841616964, + "grad_norm": 0.11495542526245117, + "learning_rate": 4.914851997408978e-06, + "loss": 0.0007, + "num_input_tokens_seen": 3742952, + "step": 1895 + }, + { + "epoch": 0.25129224652087473, + "grad_norm": 12.457237243652344, + "learning_rate": 4.9147621477820765e-06, + "loss": 0.385, + "num_input_tokens_seen": 3744632, + "step": 1896 + }, + { + "epoch": 0.2514247846255799, + "grad_norm": 7.956409931182861, + "learning_rate": 4.914672251596895e-06, + "loss": 0.2395, + "num_input_tokens_seen": 3746336, + "step": 1897 + }, + { + "epoch": 0.25155732273028497, + "grad_norm": 21.85320472717285, + "learning_rate": 4.9145823088551654e-06, + "loss": 0.4798, + "num_input_tokens_seen": 3748816, + "step": 1898 + }, + { + "epoch": 0.25168986083499006, + "grad_norm": 0.10949128121137619, + "learning_rate": 4.914492319558625e-06, + "loss": 0.0007, + "num_input_tokens_seen": 3751584, + "step": 1899 + }, + { + "epoch": 0.25182239893969516, + "grad_norm": 9.546878814697266, + "learning_rate": 4.914402283709005e-06, + "loss": 0.2836, + "num_input_tokens_seen": 3753312, + "step": 1900 + }, + { + "epoch": 0.25195493704440025, + "grad_norm": 4.9800124168396, + "learning_rate": 4.914312201308045e-06, + "loss": 0.1131, + "num_input_tokens_seen": 3754576, + "step": 1901 + }, + { + "epoch": 0.25208747514910534, + "grad_norm": 7.190089702606201, + "learning_rate": 4.91422207235748e-06, + "loss": 0.2296, + "num_input_tokens_seen": 3756480, + "step": 1902 + }, + { + "epoch": 0.2522200132538105, + "grad_norm": 2.693955421447754, + "learning_rate": 4.914131896859048e-06, + "loss": 0.0183, + "num_input_tokens_seen": 3758536, + "step": 1903 + }, + { + "epoch": 0.2523525513585156, + "grad_norm": 6.982316493988037, + "learning_rate": 4.914041674814487e-06, + "loss": 0.2071, + "num_input_tokens_seen": 3760376, + "step": 1904 + }, + { + "epoch": 0.2524850894632207, + "grad_norm": 17.749452590942383, + "learning_rate": 4.913951406225538e-06, + "loss": 0.3403, + "num_input_tokens_seen": 3761912, + "step": 1905 + }, + { + "epoch": 0.2526176275679258, + "grad_norm": 5.351406574249268, + "learning_rate": 4.913861091093939e-06, + "loss": 0.1225, + "num_input_tokens_seen": 3763664, + "step": 1906 + }, + { + "epoch": 0.25275016567263087, + "grad_norm": 1.302483081817627, + "learning_rate": 4.913770729421434e-06, + "loss": 0.0154, + "num_input_tokens_seen": 3765632, + "step": 1907 + }, + { + "epoch": 0.25288270377733596, + "grad_norm": 17.06321144104004, + "learning_rate": 4.9136803212097645e-06, + "loss": 0.4955, + "num_input_tokens_seen": 3767904, + "step": 1908 + }, + { + "epoch": 0.2530152418820411, + "grad_norm": 9.149087905883789, + "learning_rate": 4.9135898664606726e-06, + "loss": 0.2951, + "num_input_tokens_seen": 3769480, + "step": 1909 + }, + { + "epoch": 0.2531477799867462, + "grad_norm": 10.772250175476074, + "learning_rate": 4.913499365175903e-06, + "loss": 0.1489, + "num_input_tokens_seen": 3771184, + "step": 1910 + }, + { + "epoch": 0.2532803180914513, + "grad_norm": 8.179113388061523, + "learning_rate": 4.913408817357202e-06, + "loss": 0.2732, + "num_input_tokens_seen": 3772912, + "step": 1911 + }, + { + "epoch": 0.2534128561961564, + "grad_norm": 12.885571479797363, + "learning_rate": 4.913318223006313e-06, + "loss": 0.4088, + "num_input_tokens_seen": 3775336, + "step": 1912 + }, + { + "epoch": 0.2535453943008615, + "grad_norm": 4.166633129119873, + "learning_rate": 4.913227582124983e-06, + "loss": 0.0586, + "num_input_tokens_seen": 3777760, + "step": 1913 + }, + { + "epoch": 0.2536779324055666, + "grad_norm": 15.097594261169434, + "learning_rate": 4.913136894714962e-06, + "loss": 0.3176, + "num_input_tokens_seen": 3779112, + "step": 1914 + }, + { + "epoch": 0.2538104705102717, + "grad_norm": 40.72220230102539, + "learning_rate": 4.9130461607779955e-06, + "loss": 0.4155, + "num_input_tokens_seen": 3781096, + "step": 1915 + }, + { + "epoch": 0.2539430086149768, + "grad_norm": 7.796159744262695, + "learning_rate": 4.9129553803158354e-06, + "loss": 0.0957, + "num_input_tokens_seen": 3782632, + "step": 1916 + }, + { + "epoch": 0.2540755467196819, + "grad_norm": 12.889358520507812, + "learning_rate": 4.912864553330231e-06, + "loss": 0.2891, + "num_input_tokens_seen": 3785016, + "step": 1917 + }, + { + "epoch": 0.254208084824387, + "grad_norm": 18.439678192138672, + "learning_rate": 4.912773679822933e-06, + "loss": 0.623, + "num_input_tokens_seen": 3786720, + "step": 1918 + }, + { + "epoch": 0.2543406229290921, + "grad_norm": 7.719338893890381, + "learning_rate": 4.912682759795694e-06, + "loss": 0.0307, + "num_input_tokens_seen": 3788488, + "step": 1919 + }, + { + "epoch": 0.2544731610337972, + "grad_norm": 2.7555367946624756, + "learning_rate": 4.9125917932502664e-06, + "loss": 0.0279, + "num_input_tokens_seen": 3790992, + "step": 1920 + }, + { + "epoch": 0.25460569913850234, + "grad_norm": 0.22711621224880219, + "learning_rate": 4.912500780188405e-06, + "loss": 0.0013, + "num_input_tokens_seen": 3792992, + "step": 1921 + }, + { + "epoch": 0.25473823724320743, + "grad_norm": 16.960533142089844, + "learning_rate": 4.912409720611865e-06, + "loss": 0.7691, + "num_input_tokens_seen": 3795288, + "step": 1922 + }, + { + "epoch": 0.2548707753479125, + "grad_norm": 8.18647575378418, + "learning_rate": 4.912318614522401e-06, + "loss": 0.2349, + "num_input_tokens_seen": 3797328, + "step": 1923 + }, + { + "epoch": 0.2550033134526176, + "grad_norm": 9.723200798034668, + "learning_rate": 4.912227461921771e-06, + "loss": 0.1876, + "num_input_tokens_seen": 3799688, + "step": 1924 + }, + { + "epoch": 0.2551358515573227, + "grad_norm": 14.271493911743164, + "learning_rate": 4.912136262811729e-06, + "loss": 0.3895, + "num_input_tokens_seen": 3801528, + "step": 1925 + }, + { + "epoch": 0.2552683896620278, + "grad_norm": 13.908578872680664, + "learning_rate": 4.912045017194037e-06, + "loss": 0.3838, + "num_input_tokens_seen": 3803216, + "step": 1926 + }, + { + "epoch": 0.25540092776673295, + "grad_norm": 12.45673656463623, + "learning_rate": 4.911953725070453e-06, + "loss": 0.4295, + "num_input_tokens_seen": 3804952, + "step": 1927 + }, + { + "epoch": 0.25553346587143805, + "grad_norm": 1.1298795938491821, + "learning_rate": 4.911862386442737e-06, + "loss": 0.0058, + "num_input_tokens_seen": 3806496, + "step": 1928 + }, + { + "epoch": 0.25566600397614314, + "grad_norm": 6.069340229034424, + "learning_rate": 4.911771001312651e-06, + "loss": 0.1139, + "num_input_tokens_seen": 3807616, + "step": 1929 + }, + { + "epoch": 0.25579854208084823, + "grad_norm": 6.994503498077393, + "learning_rate": 4.911679569681956e-06, + "loss": 0.09, + "num_input_tokens_seen": 3809480, + "step": 1930 + }, + { + "epoch": 0.25593108018555333, + "grad_norm": 10.260053634643555, + "learning_rate": 4.911588091552415e-06, + "loss": 0.1346, + "num_input_tokens_seen": 3811984, + "step": 1931 + }, + { + "epoch": 0.2560636182902585, + "grad_norm": 12.260746002197266, + "learning_rate": 4.9114965669257925e-06, + "loss": 0.3195, + "num_input_tokens_seen": 3814368, + "step": 1932 + }, + { + "epoch": 0.25619615639496357, + "grad_norm": 9.225576400756836, + "learning_rate": 4.911404995803853e-06, + "loss": 0.0865, + "num_input_tokens_seen": 3815864, + "step": 1933 + }, + { + "epoch": 0.25632869449966866, + "grad_norm": 12.727394104003906, + "learning_rate": 4.9113133781883606e-06, + "loss": 0.4504, + "num_input_tokens_seen": 3817632, + "step": 1934 + }, + { + "epoch": 0.25646123260437376, + "grad_norm": 2.2808163166046143, + "learning_rate": 4.9112217140810825e-06, + "loss": 0.0604, + "num_input_tokens_seen": 3819656, + "step": 1935 + }, + { + "epoch": 0.25659377070907885, + "grad_norm": 0.3402206301689148, + "learning_rate": 4.911130003483787e-06, + "loss": 0.002, + "num_input_tokens_seen": 3822264, + "step": 1936 + }, + { + "epoch": 0.25672630881378394, + "grad_norm": 10.273092269897461, + "learning_rate": 4.911038246398241e-06, + "loss": 0.4197, + "num_input_tokens_seen": 3824408, + "step": 1937 + }, + { + "epoch": 0.2568588469184891, + "grad_norm": 0.31295084953308105, + "learning_rate": 4.910946442826216e-06, + "loss": 0.0018, + "num_input_tokens_seen": 3825968, + "step": 1938 + }, + { + "epoch": 0.2569913850231942, + "grad_norm": 6.330104827880859, + "learning_rate": 4.910854592769478e-06, + "loss": 0.2035, + "num_input_tokens_seen": 3827640, + "step": 1939 + }, + { + "epoch": 0.2571239231278993, + "grad_norm": 11.184147834777832, + "learning_rate": 4.9107626962298016e-06, + "loss": 0.1846, + "num_input_tokens_seen": 3829360, + "step": 1940 + }, + { + "epoch": 0.25725646123260437, + "grad_norm": 9.858233451843262, + "learning_rate": 4.9106707532089575e-06, + "loss": 0.2018, + "num_input_tokens_seen": 3831088, + "step": 1941 + }, + { + "epoch": 0.25738899933730947, + "grad_norm": 11.623895645141602, + "learning_rate": 4.910578763708718e-06, + "loss": 0.2234, + "num_input_tokens_seen": 3832816, + "step": 1942 + }, + { + "epoch": 0.25752153744201456, + "grad_norm": 12.907204627990723, + "learning_rate": 4.910486727730857e-06, + "loss": 0.4126, + "num_input_tokens_seen": 3834784, + "step": 1943 + }, + { + "epoch": 0.2576540755467197, + "grad_norm": 5.1612677574157715, + "learning_rate": 4.910394645277149e-06, + "loss": 0.0667, + "num_input_tokens_seen": 3836376, + "step": 1944 + }, + { + "epoch": 0.2577866136514248, + "grad_norm": 0.10249636322259903, + "learning_rate": 4.9103025163493685e-06, + "loss": 0.0006, + "num_input_tokens_seen": 3838704, + "step": 1945 + }, + { + "epoch": 0.2579191517561299, + "grad_norm": 14.6361083984375, + "learning_rate": 4.910210340949294e-06, + "loss": 0.4059, + "num_input_tokens_seen": 3840832, + "step": 1946 + }, + { + "epoch": 0.258051689860835, + "grad_norm": 0.15117675065994263, + "learning_rate": 4.9101181190787e-06, + "loss": 0.0009, + "num_input_tokens_seen": 3842512, + "step": 1947 + }, + { + "epoch": 0.2581842279655401, + "grad_norm": 0.2026304006576538, + "learning_rate": 4.910025850739367e-06, + "loss": 0.0011, + "num_input_tokens_seen": 3846128, + "step": 1948 + }, + { + "epoch": 0.2583167660702452, + "grad_norm": 2.1146724224090576, + "learning_rate": 4.909933535933073e-06, + "loss": 0.0112, + "num_input_tokens_seen": 3847960, + "step": 1949 + }, + { + "epoch": 0.2584493041749503, + "grad_norm": 9.959967613220215, + "learning_rate": 4.9098411746615984e-06, + "loss": 0.2864, + "num_input_tokens_seen": 3849072, + "step": 1950 + }, + { + "epoch": 0.2585818422796554, + "grad_norm": 24.567041397094727, + "learning_rate": 4.909748766926723e-06, + "loss": 0.6826, + "num_input_tokens_seen": 3851248, + "step": 1951 + }, + { + "epoch": 0.2587143803843605, + "grad_norm": 17.398883819580078, + "learning_rate": 4.9096563127302285e-06, + "loss": 0.5278, + "num_input_tokens_seen": 3853160, + "step": 1952 + }, + { + "epoch": 0.2588469184890656, + "grad_norm": 6.31428861618042, + "learning_rate": 4.9095638120738985e-06, + "loss": 0.1567, + "num_input_tokens_seen": 3855008, + "step": 1953 + }, + { + "epoch": 0.2589794565937707, + "grad_norm": 15.87155532836914, + "learning_rate": 4.909471264959516e-06, + "loss": 0.5319, + "num_input_tokens_seen": 3858176, + "step": 1954 + }, + { + "epoch": 0.2591119946984758, + "grad_norm": 9.704957008361816, + "learning_rate": 4.9093786713888645e-06, + "loss": 0.2225, + "num_input_tokens_seen": 3860592, + "step": 1955 + }, + { + "epoch": 0.25924453280318094, + "grad_norm": 0.11763802915811539, + "learning_rate": 4.909286031363732e-06, + "loss": 0.0007, + "num_input_tokens_seen": 3862344, + "step": 1956 + }, + { + "epoch": 0.25937707090788603, + "grad_norm": 10.972003936767578, + "learning_rate": 4.909193344885901e-06, + "loss": 0.4619, + "num_input_tokens_seen": 3864912, + "step": 1957 + }, + { + "epoch": 0.2595096090125911, + "grad_norm": 9.863266944885254, + "learning_rate": 4.909100611957161e-06, + "loss": 0.3023, + "num_input_tokens_seen": 3867264, + "step": 1958 + }, + { + "epoch": 0.2596421471172962, + "grad_norm": 0.11831197142601013, + "learning_rate": 4.909007832579298e-06, + "loss": 0.0007, + "num_input_tokens_seen": 3869680, + "step": 1959 + }, + { + "epoch": 0.2597746852220013, + "grad_norm": 12.972223281860352, + "learning_rate": 4.908915006754103e-06, + "loss": 0.1185, + "num_input_tokens_seen": 3871672, + "step": 1960 + }, + { + "epoch": 0.2599072233267064, + "grad_norm": 13.743368148803711, + "learning_rate": 4.908822134483365e-06, + "loss": 0.3367, + "num_input_tokens_seen": 3873832, + "step": 1961 + }, + { + "epoch": 0.26003976143141155, + "grad_norm": 11.24937915802002, + "learning_rate": 4.9087292157688746e-06, + "loss": 0.3837, + "num_input_tokens_seen": 3875624, + "step": 1962 + }, + { + "epoch": 0.26017229953611665, + "grad_norm": 0.17184431850910187, + "learning_rate": 4.9086362506124225e-06, + "loss": 0.001, + "num_input_tokens_seen": 3877032, + "step": 1963 + }, + { + "epoch": 0.26030483764082174, + "grad_norm": 7.578049182891846, + "learning_rate": 4.908543239015803e-06, + "loss": 0.2223, + "num_input_tokens_seen": 3878856, + "step": 1964 + }, + { + "epoch": 0.26043737574552683, + "grad_norm": 0.7672553062438965, + "learning_rate": 4.908450180980807e-06, + "loss": 0.004, + "num_input_tokens_seen": 3880856, + "step": 1965 + }, + { + "epoch": 0.2605699138502319, + "grad_norm": 1.161653757095337, + "learning_rate": 4.90835707650923e-06, + "loss": 0.0063, + "num_input_tokens_seen": 3882016, + "step": 1966 + }, + { + "epoch": 0.260702451954937, + "grad_norm": 4.956658363342285, + "learning_rate": 4.9082639256028685e-06, + "loss": 0.0973, + "num_input_tokens_seen": 3883800, + "step": 1967 + }, + { + "epoch": 0.26083499005964217, + "grad_norm": 15.792479515075684, + "learning_rate": 4.908170728263516e-06, + "loss": 0.4463, + "num_input_tokens_seen": 3885608, + "step": 1968 + }, + { + "epoch": 0.26096752816434726, + "grad_norm": 19.68912696838379, + "learning_rate": 4.908077484492971e-06, + "loss": 0.8024, + "num_input_tokens_seen": 3888384, + "step": 1969 + }, + { + "epoch": 0.26110006626905236, + "grad_norm": 3.058600902557373, + "learning_rate": 4.9079841942930305e-06, + "loss": 0.0376, + "num_input_tokens_seen": 3890992, + "step": 1970 + }, + { + "epoch": 0.26123260437375745, + "grad_norm": 12.606821060180664, + "learning_rate": 4.907890857665494e-06, + "loss": 0.2474, + "num_input_tokens_seen": 3893296, + "step": 1971 + }, + { + "epoch": 0.26136514247846254, + "grad_norm": 5.281127452850342, + "learning_rate": 4.907797474612161e-06, + "loss": 0.0709, + "num_input_tokens_seen": 3895184, + "step": 1972 + }, + { + "epoch": 0.26149768058316764, + "grad_norm": 5.009737491607666, + "learning_rate": 4.9077040451348305e-06, + "loss": 0.1491, + "num_input_tokens_seen": 3897360, + "step": 1973 + }, + { + "epoch": 0.2616302186878728, + "grad_norm": 12.964507102966309, + "learning_rate": 4.907610569235306e-06, + "loss": 0.3012, + "num_input_tokens_seen": 3899448, + "step": 1974 + }, + { + "epoch": 0.2617627567925779, + "grad_norm": 8.432779312133789, + "learning_rate": 4.907517046915388e-06, + "loss": 0.2229, + "num_input_tokens_seen": 3900984, + "step": 1975 + }, + { + "epoch": 0.26189529489728297, + "grad_norm": 0.20178042352199554, + "learning_rate": 4.907423478176881e-06, + "loss": 0.0012, + "num_input_tokens_seen": 3902744, + "step": 1976 + }, + { + "epoch": 0.26202783300198806, + "grad_norm": 0.16021206974983215, + "learning_rate": 4.907329863021588e-06, + "loss": 0.001, + "num_input_tokens_seen": 3904720, + "step": 1977 + }, + { + "epoch": 0.26216037110669316, + "grad_norm": 12.543532371520996, + "learning_rate": 4.907236201451315e-06, + "loss": 0.339, + "num_input_tokens_seen": 3906160, + "step": 1978 + }, + { + "epoch": 0.26229290921139825, + "grad_norm": 8.385379791259766, + "learning_rate": 4.907142493467868e-06, + "loss": 0.1689, + "num_input_tokens_seen": 3908896, + "step": 1979 + }, + { + "epoch": 0.2624254473161034, + "grad_norm": 18.749778747558594, + "learning_rate": 4.907048739073052e-06, + "loss": 0.4735, + "num_input_tokens_seen": 3911168, + "step": 1980 + }, + { + "epoch": 0.2625579854208085, + "grad_norm": 21.68638038635254, + "learning_rate": 4.906954938268677e-06, + "loss": 0.612, + "num_input_tokens_seen": 3913688, + "step": 1981 + }, + { + "epoch": 0.2626905235255136, + "grad_norm": 0.528471052646637, + "learning_rate": 4.906861091056549e-06, + "loss": 0.0033, + "num_input_tokens_seen": 3914864, + "step": 1982 + }, + { + "epoch": 0.2628230616302187, + "grad_norm": 10.204917907714844, + "learning_rate": 4.906767197438479e-06, + "loss": 0.2666, + "num_input_tokens_seen": 3916760, + "step": 1983 + }, + { + "epoch": 0.2629555997349238, + "grad_norm": 11.734451293945312, + "learning_rate": 4.9066732574162776e-06, + "loss": 0.2295, + "num_input_tokens_seen": 3918712, + "step": 1984 + }, + { + "epoch": 0.26308813783962887, + "grad_norm": 0.8257302641868591, + "learning_rate": 4.906579270991755e-06, + "loss": 0.0034, + "num_input_tokens_seen": 3920976, + "step": 1985 + }, + { + "epoch": 0.263220675944334, + "grad_norm": 0.7100668549537659, + "learning_rate": 4.9064852381667244e-06, + "loss": 0.0044, + "num_input_tokens_seen": 3922544, + "step": 1986 + }, + { + "epoch": 0.2633532140490391, + "grad_norm": 7.989956378936768, + "learning_rate": 4.906391158942998e-06, + "loss": 0.1863, + "num_input_tokens_seen": 3924408, + "step": 1987 + }, + { + "epoch": 0.2634857521537442, + "grad_norm": 0.2879713475704193, + "learning_rate": 4.90629703332239e-06, + "loss": 0.0016, + "num_input_tokens_seen": 3926432, + "step": 1988 + }, + { + "epoch": 0.2636182902584493, + "grad_norm": 5.889092445373535, + "learning_rate": 4.906202861306715e-06, + "loss": 0.1722, + "num_input_tokens_seen": 3928160, + "step": 1989 + }, + { + "epoch": 0.2637508283631544, + "grad_norm": 9.483233451843262, + "learning_rate": 4.906108642897789e-06, + "loss": 0.1704, + "num_input_tokens_seen": 3929600, + "step": 1990 + }, + { + "epoch": 0.26388336646785954, + "grad_norm": 21.586238861083984, + "learning_rate": 4.906014378097429e-06, + "loss": 0.5517, + "num_input_tokens_seen": 3932624, + "step": 1991 + }, + { + "epoch": 0.26401590457256463, + "grad_norm": 12.82308578491211, + "learning_rate": 4.905920066907451e-06, + "loss": 0.2326, + "num_input_tokens_seen": 3934672, + "step": 1992 + }, + { + "epoch": 0.2641484426772697, + "grad_norm": 11.597251892089844, + "learning_rate": 4.905825709329674e-06, + "loss": 0.2367, + "num_input_tokens_seen": 3936400, + "step": 1993 + }, + { + "epoch": 0.2642809807819748, + "grad_norm": 15.270519256591797, + "learning_rate": 4.90573130536592e-06, + "loss": 0.2457, + "num_input_tokens_seen": 3938048, + "step": 1994 + }, + { + "epoch": 0.2644135188866799, + "grad_norm": 0.1481652706861496, + "learning_rate": 4.905636855018004e-06, + "loss": 0.0009, + "num_input_tokens_seen": 3939608, + "step": 1995 + }, + { + "epoch": 0.264546056991385, + "grad_norm": 11.909150123596191, + "learning_rate": 4.905542358287751e-06, + "loss": 0.376, + "num_input_tokens_seen": 3941472, + "step": 1996 + }, + { + "epoch": 0.26467859509609015, + "grad_norm": 0.2123836874961853, + "learning_rate": 4.9054478151769824e-06, + "loss": 0.0013, + "num_input_tokens_seen": 3943160, + "step": 1997 + }, + { + "epoch": 0.26481113320079525, + "grad_norm": 16.624813079833984, + "learning_rate": 4.90535322568752e-06, + "loss": 0.2058, + "num_input_tokens_seen": 3945496, + "step": 1998 + }, + { + "epoch": 0.26494367130550034, + "grad_norm": 7.588615417480469, + "learning_rate": 4.905258589821188e-06, + "loss": 0.0763, + "num_input_tokens_seen": 3949272, + "step": 1999 + }, + { + "epoch": 0.26507620941020543, + "grad_norm": 8.724717140197754, + "learning_rate": 4.9051639075798115e-06, + "loss": 0.2393, + "num_input_tokens_seen": 3951376, + "step": 2000 + }, + { + "epoch": 0.2652087475149105, + "grad_norm": 2.962209701538086, + "learning_rate": 4.905069178965215e-06, + "loss": 0.0286, + "num_input_tokens_seen": 3953696, + "step": 2001 + }, + { + "epoch": 0.2653412856196156, + "grad_norm": 9.51966667175293, + "learning_rate": 4.904974403979226e-06, + "loss": 0.1367, + "num_input_tokens_seen": 3956088, + "step": 2002 + }, + { + "epoch": 0.26547382372432077, + "grad_norm": 14.170860290527344, + "learning_rate": 4.904879582623671e-06, + "loss": 0.2694, + "num_input_tokens_seen": 3958344, + "step": 2003 + }, + { + "epoch": 0.26560636182902586, + "grad_norm": 14.364133834838867, + "learning_rate": 4.904784714900379e-06, + "loss": 0.4111, + "num_input_tokens_seen": 3961544, + "step": 2004 + }, + { + "epoch": 0.26573889993373095, + "grad_norm": 11.268760681152344, + "learning_rate": 4.904689800811179e-06, + "loss": 0.1794, + "num_input_tokens_seen": 3963936, + "step": 2005 + }, + { + "epoch": 0.26587143803843605, + "grad_norm": 5.095663070678711, + "learning_rate": 4.904594840357901e-06, + "loss": 0.0296, + "num_input_tokens_seen": 3965264, + "step": 2006 + }, + { + "epoch": 0.26600397614314114, + "grad_norm": 0.5969743132591248, + "learning_rate": 4.904499833542374e-06, + "loss": 0.0031, + "num_input_tokens_seen": 3967416, + "step": 2007 + }, + { + "epoch": 0.26613651424784623, + "grad_norm": 6.687586307525635, + "learning_rate": 4.9044047803664315e-06, + "loss": 0.1079, + "num_input_tokens_seen": 3969912, + "step": 2008 + }, + { + "epoch": 0.2662690523525514, + "grad_norm": 16.669143676757812, + "learning_rate": 4.904309680831908e-06, + "loss": 0.1944, + "num_input_tokens_seen": 3972248, + "step": 2009 + }, + { + "epoch": 0.2664015904572565, + "grad_norm": 15.311866760253906, + "learning_rate": 4.9042145349406335e-06, + "loss": 0.4541, + "num_input_tokens_seen": 3973952, + "step": 2010 + }, + { + "epoch": 0.26653412856196157, + "grad_norm": 0.25193893909454346, + "learning_rate": 4.904119342694445e-06, + "loss": 0.0015, + "num_input_tokens_seen": 3975472, + "step": 2011 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 21.174386978149414, + "learning_rate": 4.904024104095178e-06, + "loss": 0.6547, + "num_input_tokens_seen": 3977736, + "step": 2012 + }, + { + "epoch": 0.26679920477137176, + "grad_norm": 0.1642005443572998, + "learning_rate": 4.903928819144666e-06, + "loss": 0.001, + "num_input_tokens_seen": 3979552, + "step": 2013 + }, + { + "epoch": 0.26693174287607685, + "grad_norm": 0.40174320340156555, + "learning_rate": 4.903833487844749e-06, + "loss": 0.0016, + "num_input_tokens_seen": 3980760, + "step": 2014 + }, + { + "epoch": 0.267064280980782, + "grad_norm": 0.2936193645000458, + "learning_rate": 4.9037381101972635e-06, + "loss": 0.0014, + "num_input_tokens_seen": 3983176, + "step": 2015 + }, + { + "epoch": 0.2671968190854871, + "grad_norm": 9.621676445007324, + "learning_rate": 4.9036426862040485e-06, + "loss": 0.3425, + "num_input_tokens_seen": 3984360, + "step": 2016 + }, + { + "epoch": 0.2673293571901922, + "grad_norm": 5.5126051902771, + "learning_rate": 4.903547215866945e-06, + "loss": 0.1968, + "num_input_tokens_seen": 3986088, + "step": 2017 + }, + { + "epoch": 0.2674618952948973, + "grad_norm": 18.312286376953125, + "learning_rate": 4.903451699187793e-06, + "loss": 0.5361, + "num_input_tokens_seen": 3988456, + "step": 2018 + }, + { + "epoch": 0.26759443339960237, + "grad_norm": 3.170588970184326, + "learning_rate": 4.903356136168435e-06, + "loss": 0.0197, + "num_input_tokens_seen": 3990216, + "step": 2019 + }, + { + "epoch": 0.26772697150430746, + "grad_norm": 15.552123069763184, + "learning_rate": 4.903260526810712e-06, + "loss": 0.4036, + "num_input_tokens_seen": 3992712, + "step": 2020 + }, + { + "epoch": 0.2678595096090126, + "grad_norm": 11.448805809020996, + "learning_rate": 4.903164871116468e-06, + "loss": 0.553, + "num_input_tokens_seen": 3996408, + "step": 2021 + }, + { + "epoch": 0.2679920477137177, + "grad_norm": 5.961345195770264, + "learning_rate": 4.903069169087548e-06, + "loss": 0.0658, + "num_input_tokens_seen": 3999040, + "step": 2022 + }, + { + "epoch": 0.2681245858184228, + "grad_norm": 8.351921081542969, + "learning_rate": 4.902973420725796e-06, + "loss": 0.0991, + "num_input_tokens_seen": 4000824, + "step": 2023 + }, + { + "epoch": 0.2682571239231279, + "grad_norm": 11.373915672302246, + "learning_rate": 4.902877626033059e-06, + "loss": 0.1192, + "num_input_tokens_seen": 4003128, + "step": 2024 + }, + { + "epoch": 0.268389662027833, + "grad_norm": 6.572281360626221, + "learning_rate": 4.9027817850111836e-06, + "loss": 0.0607, + "num_input_tokens_seen": 4004656, + "step": 2025 + }, + { + "epoch": 0.2685222001325381, + "grad_norm": 16.788888931274414, + "learning_rate": 4.902685897662017e-06, + "loss": 0.4096, + "num_input_tokens_seen": 4006128, + "step": 2026 + }, + { + "epoch": 0.26865473823724323, + "grad_norm": 7.543140411376953, + "learning_rate": 4.90258996398741e-06, + "loss": 0.193, + "num_input_tokens_seen": 4008552, + "step": 2027 + }, + { + "epoch": 0.2687872763419483, + "grad_norm": 16.228511810302734, + "learning_rate": 4.902493983989211e-06, + "loss": 0.3381, + "num_input_tokens_seen": 4010688, + "step": 2028 + }, + { + "epoch": 0.2689198144466534, + "grad_norm": 9.006682395935059, + "learning_rate": 4.9023979576692695e-06, + "loss": 0.2154, + "num_input_tokens_seen": 4012280, + "step": 2029 + }, + { + "epoch": 0.2690523525513585, + "grad_norm": 8.267720222473145, + "learning_rate": 4.902301885029439e-06, + "loss": 0.154, + "num_input_tokens_seen": 4014384, + "step": 2030 + }, + { + "epoch": 0.2691848906560636, + "grad_norm": 5.125559329986572, + "learning_rate": 4.90220576607157e-06, + "loss": 0.1735, + "num_input_tokens_seen": 4015936, + "step": 2031 + }, + { + "epoch": 0.2693174287607687, + "grad_norm": 0.15806277096271515, + "learning_rate": 4.902109600797518e-06, + "loss": 0.0009, + "num_input_tokens_seen": 4018200, + "step": 2032 + }, + { + "epoch": 0.26944996686547384, + "grad_norm": 0.2320534735918045, + "learning_rate": 4.902013389209135e-06, + "loss": 0.0011, + "num_input_tokens_seen": 4020664, + "step": 2033 + }, + { + "epoch": 0.26958250497017894, + "grad_norm": 0.22386534512043, + "learning_rate": 4.901917131308277e-06, + "loss": 0.0011, + "num_input_tokens_seen": 4022096, + "step": 2034 + }, + { + "epoch": 0.26971504307488403, + "grad_norm": 8.560279846191406, + "learning_rate": 4.9018208270967994e-06, + "loss": 0.2377, + "num_input_tokens_seen": 4023784, + "step": 2035 + }, + { + "epoch": 0.2698475811795891, + "grad_norm": 16.491777420043945, + "learning_rate": 4.9017244765765595e-06, + "loss": 0.1779, + "num_input_tokens_seen": 4025784, + "step": 2036 + }, + { + "epoch": 0.2699801192842942, + "grad_norm": 10.388203620910645, + "learning_rate": 4.901628079749415e-06, + "loss": 0.1628, + "num_input_tokens_seen": 4028192, + "step": 2037 + }, + { + "epoch": 0.2701126573889993, + "grad_norm": 0.5208950638771057, + "learning_rate": 4.901531636617224e-06, + "loss": 0.0028, + "num_input_tokens_seen": 4029360, + "step": 2038 + }, + { + "epoch": 0.27024519549370446, + "grad_norm": 0.661808967590332, + "learning_rate": 4.901435147181846e-06, + "loss": 0.0032, + "num_input_tokens_seen": 4031104, + "step": 2039 + }, + { + "epoch": 0.27037773359840955, + "grad_norm": 19.85924530029297, + "learning_rate": 4.9013386114451425e-06, + "loss": 0.2914, + "num_input_tokens_seen": 4033824, + "step": 2040 + }, + { + "epoch": 0.27051027170311465, + "grad_norm": 12.537938117980957, + "learning_rate": 4.901242029408973e-06, + "loss": 0.2449, + "num_input_tokens_seen": 4035280, + "step": 2041 + }, + { + "epoch": 0.27064280980781974, + "grad_norm": 13.15445327758789, + "learning_rate": 4.9011454010752015e-06, + "loss": 0.2494, + "num_input_tokens_seen": 4037048, + "step": 2042 + }, + { + "epoch": 0.27077534791252483, + "grad_norm": 1.1507568359375, + "learning_rate": 4.901048726445689e-06, + "loss": 0.0026, + "num_input_tokens_seen": 4038760, + "step": 2043 + }, + { + "epoch": 0.2709078860172299, + "grad_norm": 0.09718719869852066, + "learning_rate": 4.900952005522302e-06, + "loss": 0.0005, + "num_input_tokens_seen": 4040496, + "step": 2044 + }, + { + "epoch": 0.2710404241219351, + "grad_norm": 21.779573440551758, + "learning_rate": 4.900855238306904e-06, + "loss": 0.6145, + "num_input_tokens_seen": 4042480, + "step": 2045 + }, + { + "epoch": 0.27117296222664017, + "grad_norm": 15.950552940368652, + "learning_rate": 4.900758424801361e-06, + "loss": 0.3946, + "num_input_tokens_seen": 4044376, + "step": 2046 + }, + { + "epoch": 0.27130550033134526, + "grad_norm": 4.705066204071045, + "learning_rate": 4.900661565007538e-06, + "loss": 0.0815, + "num_input_tokens_seen": 4045880, + "step": 2047 + }, + { + "epoch": 0.27143803843605036, + "grad_norm": 5.091487884521484, + "learning_rate": 4.900564658927305e-06, + "loss": 0.1412, + "num_input_tokens_seen": 4047608, + "step": 2048 + }, + { + "epoch": 0.27157057654075545, + "grad_norm": 0.09139948338270187, + "learning_rate": 4.9004677065625284e-06, + "loss": 0.0005, + "num_input_tokens_seen": 4049224, + "step": 2049 + }, + { + "epoch": 0.2717031146454606, + "grad_norm": 11.605480194091797, + "learning_rate": 4.900370707915079e-06, + "loss": 0.1594, + "num_input_tokens_seen": 4051216, + "step": 2050 + }, + { + "epoch": 0.2718356527501657, + "grad_norm": 16.284513473510742, + "learning_rate": 4.900273662986826e-06, + "loss": 0.2195, + "num_input_tokens_seen": 4052832, + "step": 2051 + }, + { + "epoch": 0.2719681908548708, + "grad_norm": 0.07736451923847198, + "learning_rate": 4.900176571779642e-06, + "loss": 0.0005, + "num_input_tokens_seen": 4054304, + "step": 2052 + }, + { + "epoch": 0.2721007289595759, + "grad_norm": 0.070550337433815, + "learning_rate": 4.900079434295397e-06, + "loss": 0.0004, + "num_input_tokens_seen": 4055520, + "step": 2053 + }, + { + "epoch": 0.27223326706428097, + "grad_norm": 20.032869338989258, + "learning_rate": 4.8999822505359655e-06, + "loss": 0.4176, + "num_input_tokens_seen": 4056880, + "step": 2054 + }, + { + "epoch": 0.27236580516898606, + "grad_norm": 8.376030921936035, + "learning_rate": 4.8998850205032195e-06, + "loss": 0.1885, + "num_input_tokens_seen": 4059520, + "step": 2055 + }, + { + "epoch": 0.2724983432736912, + "grad_norm": 10.103906631469727, + "learning_rate": 4.899787744199035e-06, + "loss": 0.2957, + "num_input_tokens_seen": 4062776, + "step": 2056 + }, + { + "epoch": 0.2726308813783963, + "grad_norm": 12.452119827270508, + "learning_rate": 4.899690421625287e-06, + "loss": 0.2127, + "num_input_tokens_seen": 4065752, + "step": 2057 + }, + { + "epoch": 0.2727634194831014, + "grad_norm": 14.237703323364258, + "learning_rate": 4.899593052783853e-06, + "loss": 0.1875, + "num_input_tokens_seen": 4067280, + "step": 2058 + }, + { + "epoch": 0.2728959575878065, + "grad_norm": 14.333855628967285, + "learning_rate": 4.899495637676609e-06, + "loss": 0.4797, + "num_input_tokens_seen": 4069456, + "step": 2059 + }, + { + "epoch": 0.2730284956925116, + "grad_norm": 6.316786289215088, + "learning_rate": 4.899398176305434e-06, + "loss": 0.0643, + "num_input_tokens_seen": 4071408, + "step": 2060 + }, + { + "epoch": 0.2731610337972167, + "grad_norm": 7.7867631912231445, + "learning_rate": 4.899300668672207e-06, + "loss": 0.0099, + "num_input_tokens_seen": 4073280, + "step": 2061 + }, + { + "epoch": 0.27329357190192183, + "grad_norm": 10.514313697814941, + "learning_rate": 4.8992031147788075e-06, + "loss": 0.2162, + "num_input_tokens_seen": 4075208, + "step": 2062 + }, + { + "epoch": 0.2734261100066269, + "grad_norm": 17.147254943847656, + "learning_rate": 4.8991055146271175e-06, + "loss": 0.3135, + "num_input_tokens_seen": 4077040, + "step": 2063 + }, + { + "epoch": 0.273558648111332, + "grad_norm": 10.662821769714355, + "learning_rate": 4.8990078682190175e-06, + "loss": 0.1895, + "num_input_tokens_seen": 4078960, + "step": 2064 + }, + { + "epoch": 0.2736911862160371, + "grad_norm": 11.364448547363281, + "learning_rate": 4.898910175556391e-06, + "loss": 0.3366, + "num_input_tokens_seen": 4081136, + "step": 2065 + }, + { + "epoch": 0.2738237243207422, + "grad_norm": 26.864459991455078, + "learning_rate": 4.898812436641122e-06, + "loss": 0.6015, + "num_input_tokens_seen": 4082864, + "step": 2066 + }, + { + "epoch": 0.2739562624254473, + "grad_norm": 5.448799133300781, + "learning_rate": 4.898714651475094e-06, + "loss": 0.0489, + "num_input_tokens_seen": 4084024, + "step": 2067 + }, + { + "epoch": 0.27408880053015244, + "grad_norm": 14.595294952392578, + "learning_rate": 4.898616820060192e-06, + "loss": 0.2594, + "num_input_tokens_seen": 4086136, + "step": 2068 + }, + { + "epoch": 0.27422133863485754, + "grad_norm": 16.47277069091797, + "learning_rate": 4.898518942398304e-06, + "loss": 0.5557, + "num_input_tokens_seen": 4087624, + "step": 2069 + }, + { + "epoch": 0.27435387673956263, + "grad_norm": 15.175814628601074, + "learning_rate": 4.898421018491316e-06, + "loss": 0.2742, + "num_input_tokens_seen": 4089128, + "step": 2070 + }, + { + "epoch": 0.2744864148442677, + "grad_norm": 12.268685340881348, + "learning_rate": 4.898323048341115e-06, + "loss": 0.2884, + "num_input_tokens_seen": 4090280, + "step": 2071 + }, + { + "epoch": 0.2746189529489728, + "grad_norm": 10.0732421875, + "learning_rate": 4.898225031949592e-06, + "loss": 0.2714, + "num_input_tokens_seen": 4092288, + "step": 2072 + }, + { + "epoch": 0.2747514910536779, + "grad_norm": 20.11159896850586, + "learning_rate": 4.8981269693186365e-06, + "loss": 0.6043, + "num_input_tokens_seen": 4094184, + "step": 2073 + }, + { + "epoch": 0.27488402915838306, + "grad_norm": 10.031855583190918, + "learning_rate": 4.898028860450138e-06, + "loss": 0.3076, + "num_input_tokens_seen": 4096584, + "step": 2074 + }, + { + "epoch": 0.27501656726308815, + "grad_norm": 11.20979118347168, + "learning_rate": 4.8979307053459884e-06, + "loss": 0.3234, + "num_input_tokens_seen": 4098720, + "step": 2075 + }, + { + "epoch": 0.27514910536779325, + "grad_norm": 4.827417373657227, + "learning_rate": 4.897832504008081e-06, + "loss": 0.1674, + "num_input_tokens_seen": 4100944, + "step": 2076 + }, + { + "epoch": 0.27528164347249834, + "grad_norm": 9.533052444458008, + "learning_rate": 4.897734256438309e-06, + "loss": 0.2884, + "num_input_tokens_seen": 4102928, + "step": 2077 + }, + { + "epoch": 0.27541418157720343, + "grad_norm": 4.671139240264893, + "learning_rate": 4.897635962638566e-06, + "loss": 0.0754, + "num_input_tokens_seen": 4105280, + "step": 2078 + }, + { + "epoch": 0.2755467196819085, + "grad_norm": 12.15795612335205, + "learning_rate": 4.897537622610748e-06, + "loss": 0.2198, + "num_input_tokens_seen": 4108352, + "step": 2079 + }, + { + "epoch": 0.2756792577866137, + "grad_norm": 0.42824143171310425, + "learning_rate": 4.89743923635675e-06, + "loss": 0.0025, + "num_input_tokens_seen": 4109648, + "step": 2080 + }, + { + "epoch": 0.27581179589131877, + "grad_norm": 17.460590362548828, + "learning_rate": 4.89734080387847e-06, + "loss": 0.2317, + "num_input_tokens_seen": 4111184, + "step": 2081 + }, + { + "epoch": 0.27594433399602386, + "grad_norm": 3.146728754043579, + "learning_rate": 4.897242325177805e-06, + "loss": 0.0138, + "num_input_tokens_seen": 4112624, + "step": 2082 + }, + { + "epoch": 0.27607687210072895, + "grad_norm": 0.4596201777458191, + "learning_rate": 4.897143800256655e-06, + "loss": 0.0029, + "num_input_tokens_seen": 4114336, + "step": 2083 + }, + { + "epoch": 0.27620941020543405, + "grad_norm": 13.028621673583984, + "learning_rate": 4.897045229116919e-06, + "loss": 0.3446, + "num_input_tokens_seen": 4117016, + "step": 2084 + }, + { + "epoch": 0.27634194831013914, + "grad_norm": 0.43419769406318665, + "learning_rate": 4.896946611760496e-06, + "loss": 0.0026, + "num_input_tokens_seen": 4119848, + "step": 2085 + }, + { + "epoch": 0.2764744864148443, + "grad_norm": 1.7718678712844849, + "learning_rate": 4.89684794818929e-06, + "loss": 0.0068, + "num_input_tokens_seen": 4121592, + "step": 2086 + }, + { + "epoch": 0.2766070245195494, + "grad_norm": 10.505563735961914, + "learning_rate": 4.896749238405202e-06, + "loss": 0.2272, + "num_input_tokens_seen": 4123720, + "step": 2087 + }, + { + "epoch": 0.2767395626242545, + "grad_norm": 14.227324485778809, + "learning_rate": 4.896650482410135e-06, + "loss": 0.252, + "num_input_tokens_seen": 4125096, + "step": 2088 + }, + { + "epoch": 0.27687210072895957, + "grad_norm": 0.12318974733352661, + "learning_rate": 4.896551680205992e-06, + "loss": 0.0008, + "num_input_tokens_seen": 4126384, + "step": 2089 + }, + { + "epoch": 0.27700463883366466, + "grad_norm": 9.384637832641602, + "learning_rate": 4.896452831794681e-06, + "loss": 0.4158, + "num_input_tokens_seen": 4128424, + "step": 2090 + }, + { + "epoch": 0.27713717693836976, + "grad_norm": 3.53444766998291, + "learning_rate": 4.896353937178106e-06, + "loss": 0.0519, + "num_input_tokens_seen": 4130104, + "step": 2091 + }, + { + "epoch": 0.2772697150430749, + "grad_norm": 0.919614851474762, + "learning_rate": 4.896254996358174e-06, + "loss": 0.0045, + "num_input_tokens_seen": 4131184, + "step": 2092 + }, + { + "epoch": 0.27740225314778, + "grad_norm": 18.927024841308594, + "learning_rate": 4.896156009336792e-06, + "loss": 0.6818, + "num_input_tokens_seen": 4133368, + "step": 2093 + }, + { + "epoch": 0.2775347912524851, + "grad_norm": 8.679544448852539, + "learning_rate": 4.896056976115869e-06, + "loss": 0.1331, + "num_input_tokens_seen": 4136256, + "step": 2094 + }, + { + "epoch": 0.2776673293571902, + "grad_norm": 15.54692268371582, + "learning_rate": 4.895957896697315e-06, + "loss": 0.4987, + "num_input_tokens_seen": 4137520, + "step": 2095 + }, + { + "epoch": 0.2777998674618953, + "grad_norm": 3.3338866233825684, + "learning_rate": 4.895858771083039e-06, + "loss": 0.0848, + "num_input_tokens_seen": 4139032, + "step": 2096 + }, + { + "epoch": 0.27793240556660037, + "grad_norm": 12.034407615661621, + "learning_rate": 4.895759599274955e-06, + "loss": 0.3229, + "num_input_tokens_seen": 4141664, + "step": 2097 + }, + { + "epoch": 0.2780649436713055, + "grad_norm": 10.661338806152344, + "learning_rate": 4.895660381274971e-06, + "loss": 0.265, + "num_input_tokens_seen": 4143560, + "step": 2098 + }, + { + "epoch": 0.2781974817760106, + "grad_norm": 4.307747840881348, + "learning_rate": 4.895561117085004e-06, + "loss": 0.2225, + "num_input_tokens_seen": 4147128, + "step": 2099 + }, + { + "epoch": 0.2783300198807157, + "grad_norm": 10.567222595214844, + "learning_rate": 4.895461806706965e-06, + "loss": 0.2437, + "num_input_tokens_seen": 4149080, + "step": 2100 + }, + { + "epoch": 0.2784625579854208, + "grad_norm": 16.266071319580078, + "learning_rate": 4.89536245014277e-06, + "loss": 0.2531, + "num_input_tokens_seen": 4150656, + "step": 2101 + }, + { + "epoch": 0.2785950960901259, + "grad_norm": 11.213003158569336, + "learning_rate": 4.895263047394334e-06, + "loss": 0.2957, + "num_input_tokens_seen": 4152424, + "step": 2102 + }, + { + "epoch": 0.278727634194831, + "grad_norm": 16.003807067871094, + "learning_rate": 4.895163598463575e-06, + "loss": 0.4159, + "num_input_tokens_seen": 4155384, + "step": 2103 + }, + { + "epoch": 0.27886017229953614, + "grad_norm": 0.06696567684412003, + "learning_rate": 4.89506410335241e-06, + "loss": 0.0004, + "num_input_tokens_seen": 4156712, + "step": 2104 + }, + { + "epoch": 0.27899271040424123, + "grad_norm": 1.272228717803955, + "learning_rate": 4.894964562062755e-06, + "loss": 0.0162, + "num_input_tokens_seen": 4158504, + "step": 2105 + }, + { + "epoch": 0.2791252485089463, + "grad_norm": 4.4508843421936035, + "learning_rate": 4.894864974596531e-06, + "loss": 0.0528, + "num_input_tokens_seen": 4160480, + "step": 2106 + }, + { + "epoch": 0.2792577866136514, + "grad_norm": 1.2055717706680298, + "learning_rate": 4.894765340955659e-06, + "loss": 0.0041, + "num_input_tokens_seen": 4162632, + "step": 2107 + }, + { + "epoch": 0.2793903247183565, + "grad_norm": 0.13629478216171265, + "learning_rate": 4.89466566114206e-06, + "loss": 0.0008, + "num_input_tokens_seen": 4164136, + "step": 2108 + }, + { + "epoch": 0.27952286282306166, + "grad_norm": 6.652531623840332, + "learning_rate": 4.894565935157654e-06, + "loss": 0.0524, + "num_input_tokens_seen": 4166208, + "step": 2109 + }, + { + "epoch": 0.27965540092776675, + "grad_norm": 13.997855186462402, + "learning_rate": 4.894466163004365e-06, + "loss": 0.2631, + "num_input_tokens_seen": 4168528, + "step": 2110 + }, + { + "epoch": 0.27978793903247184, + "grad_norm": 3.0898513793945312, + "learning_rate": 4.894366344684117e-06, + "loss": 0.0576, + "num_input_tokens_seen": 4171128, + "step": 2111 + }, + { + "epoch": 0.27992047713717694, + "grad_norm": 0.09713403880596161, + "learning_rate": 4.894266480198834e-06, + "loss": 0.0006, + "num_input_tokens_seen": 4173600, + "step": 2112 + }, + { + "epoch": 0.28005301524188203, + "grad_norm": 14.981772422790527, + "learning_rate": 4.894166569550442e-06, + "loss": 0.5667, + "num_input_tokens_seen": 4175976, + "step": 2113 + }, + { + "epoch": 0.2801855533465871, + "grad_norm": 13.724303245544434, + "learning_rate": 4.894066612740866e-06, + "loss": 0.4238, + "num_input_tokens_seen": 4178056, + "step": 2114 + }, + { + "epoch": 0.2803180914512923, + "grad_norm": 0.1035027801990509, + "learning_rate": 4.893966609772034e-06, + "loss": 0.0006, + "num_input_tokens_seen": 4180032, + "step": 2115 + }, + { + "epoch": 0.28045062955599737, + "grad_norm": 5.932621002197266, + "learning_rate": 4.893866560645874e-06, + "loss": 0.1781, + "num_input_tokens_seen": 4181456, + "step": 2116 + }, + { + "epoch": 0.28058316766070246, + "grad_norm": 3.967175245285034, + "learning_rate": 4.893766465364316e-06, + "loss": 0.0745, + "num_input_tokens_seen": 4183264, + "step": 2117 + }, + { + "epoch": 0.28071570576540755, + "grad_norm": 6.677824974060059, + "learning_rate": 4.8936663239292885e-06, + "loss": 0.0485, + "num_input_tokens_seen": 4185616, + "step": 2118 + }, + { + "epoch": 0.28084824387011265, + "grad_norm": 0.20160391926765442, + "learning_rate": 4.893566136342723e-06, + "loss": 0.0012, + "num_input_tokens_seen": 4187440, + "step": 2119 + }, + { + "epoch": 0.28098078197481774, + "grad_norm": 10.150309562683105, + "learning_rate": 4.893465902606553e-06, + "loss": 0.2062, + "num_input_tokens_seen": 4189088, + "step": 2120 + }, + { + "epoch": 0.2811133200795229, + "grad_norm": 7.110853672027588, + "learning_rate": 4.893365622722708e-06, + "loss": 0.1879, + "num_input_tokens_seen": 4191512, + "step": 2121 + }, + { + "epoch": 0.281245858184228, + "grad_norm": 11.135810852050781, + "learning_rate": 4.893265296693122e-06, + "loss": 0.1136, + "num_input_tokens_seen": 4193784, + "step": 2122 + }, + { + "epoch": 0.2813783962889331, + "grad_norm": 10.362106323242188, + "learning_rate": 4.8931649245197295e-06, + "loss": 0.2628, + "num_input_tokens_seen": 4195360, + "step": 2123 + }, + { + "epoch": 0.28151093439363817, + "grad_norm": 7.839051723480225, + "learning_rate": 4.893064506204468e-06, + "loss": 0.2214, + "num_input_tokens_seen": 4197504, + "step": 2124 + }, + { + "epoch": 0.28164347249834326, + "grad_norm": 9.837350845336914, + "learning_rate": 4.892964041749272e-06, + "loss": 0.1783, + "num_input_tokens_seen": 4198800, + "step": 2125 + }, + { + "epoch": 0.28177601060304835, + "grad_norm": 20.550384521484375, + "learning_rate": 4.892863531156077e-06, + "loss": 0.7532, + "num_input_tokens_seen": 4201304, + "step": 2126 + }, + { + "epoch": 0.2819085487077535, + "grad_norm": 6.567972660064697, + "learning_rate": 4.892762974426824e-06, + "loss": 0.1319, + "num_input_tokens_seen": 4202584, + "step": 2127 + }, + { + "epoch": 0.2820410868124586, + "grad_norm": 0.40053123235702515, + "learning_rate": 4.892662371563449e-06, + "loss": 0.0024, + "num_input_tokens_seen": 4205288, + "step": 2128 + }, + { + "epoch": 0.2821736249171637, + "grad_norm": 11.262001991271973, + "learning_rate": 4.892561722567893e-06, + "loss": 0.0842, + "num_input_tokens_seen": 4206400, + "step": 2129 + }, + { + "epoch": 0.2823061630218688, + "grad_norm": 13.66666316986084, + "learning_rate": 4.8924610274420965e-06, + "loss": 0.3291, + "num_input_tokens_seen": 4209208, + "step": 2130 + }, + { + "epoch": 0.2824387011265739, + "grad_norm": 0.42933377623558044, + "learning_rate": 4.892360286188001e-06, + "loss": 0.0026, + "num_input_tokens_seen": 4210536, + "step": 2131 + }, + { + "epoch": 0.28257123923127897, + "grad_norm": 3.2867815494537354, + "learning_rate": 4.892259498807549e-06, + "loss": 0.0461, + "num_input_tokens_seen": 4212336, + "step": 2132 + }, + { + "epoch": 0.2827037773359841, + "grad_norm": 1.2013334035873413, + "learning_rate": 4.892158665302683e-06, + "loss": 0.0073, + "num_input_tokens_seen": 4215064, + "step": 2133 + }, + { + "epoch": 0.2828363154406892, + "grad_norm": 6.951056957244873, + "learning_rate": 4.8920577856753485e-06, + "loss": 0.1215, + "num_input_tokens_seen": 4216496, + "step": 2134 + }, + { + "epoch": 0.2829688535453943, + "grad_norm": 9.356734275817871, + "learning_rate": 4.891956859927489e-06, + "loss": 0.2638, + "num_input_tokens_seen": 4218424, + "step": 2135 + }, + { + "epoch": 0.2831013916500994, + "grad_norm": 5.535884857177734, + "learning_rate": 4.891855888061051e-06, + "loss": 0.1378, + "num_input_tokens_seen": 4220232, + "step": 2136 + }, + { + "epoch": 0.2832339297548045, + "grad_norm": 0.24731677770614624, + "learning_rate": 4.8917548700779815e-06, + "loss": 0.0015, + "num_input_tokens_seen": 4221688, + "step": 2137 + }, + { + "epoch": 0.2833664678595096, + "grad_norm": 9.523365020751953, + "learning_rate": 4.891653805980229e-06, + "loss": 0.2774, + "num_input_tokens_seen": 4224160, + "step": 2138 + }, + { + "epoch": 0.28349900596421473, + "grad_norm": 4.411974906921387, + "learning_rate": 4.89155269576974e-06, + "loss": 0.1228, + "num_input_tokens_seen": 4225664, + "step": 2139 + }, + { + "epoch": 0.2836315440689198, + "grad_norm": 7.0857062339782715, + "learning_rate": 4.891451539448466e-06, + "loss": 0.0419, + "num_input_tokens_seen": 4227248, + "step": 2140 + }, + { + "epoch": 0.2837640821736249, + "grad_norm": 1.3636587858200073, + "learning_rate": 4.891350337018356e-06, + "loss": 0.0053, + "num_input_tokens_seen": 4228600, + "step": 2141 + }, + { + "epoch": 0.28389662027833, + "grad_norm": 17.46363639831543, + "learning_rate": 4.891249088481362e-06, + "loss": 0.5138, + "num_input_tokens_seen": 4230584, + "step": 2142 + }, + { + "epoch": 0.2840291583830351, + "grad_norm": 3.9148268699645996, + "learning_rate": 4.891147793839436e-06, + "loss": 0.0845, + "num_input_tokens_seen": 4232488, + "step": 2143 + }, + { + "epoch": 0.2841616964877402, + "grad_norm": 8.393854141235352, + "learning_rate": 4.8910464530945315e-06, + "loss": 0.3156, + "num_input_tokens_seen": 4235264, + "step": 2144 + }, + { + "epoch": 0.28429423459244535, + "grad_norm": 0.19208067655563354, + "learning_rate": 4.890945066248601e-06, + "loss": 0.0011, + "num_input_tokens_seen": 4236536, + "step": 2145 + }, + { + "epoch": 0.28442677269715044, + "grad_norm": 12.819842338562012, + "learning_rate": 4.8908436333036e-06, + "loss": 0.2872, + "num_input_tokens_seen": 4238552, + "step": 2146 + }, + { + "epoch": 0.28455931080185554, + "grad_norm": 10.956326484680176, + "learning_rate": 4.890742154261485e-06, + "loss": 0.3317, + "num_input_tokens_seen": 4239920, + "step": 2147 + }, + { + "epoch": 0.28469184890656063, + "grad_norm": 10.47569465637207, + "learning_rate": 4.8906406291242124e-06, + "loss": 0.3484, + "num_input_tokens_seen": 4241688, + "step": 2148 + }, + { + "epoch": 0.2848243870112657, + "grad_norm": 7.644991874694824, + "learning_rate": 4.890539057893738e-06, + "loss": 0.1377, + "num_input_tokens_seen": 4243464, + "step": 2149 + }, + { + "epoch": 0.2849569251159708, + "grad_norm": 0.26079681515693665, + "learning_rate": 4.890437440572023e-06, + "loss": 0.0014, + "num_input_tokens_seen": 4245144, + "step": 2150 + }, + { + "epoch": 0.28508946322067596, + "grad_norm": 0.09864098578691483, + "learning_rate": 4.890335777161024e-06, + "loss": 0.0006, + "num_input_tokens_seen": 4246512, + "step": 2151 + }, + { + "epoch": 0.28522200132538106, + "grad_norm": 11.880043983459473, + "learning_rate": 4.890234067662703e-06, + "loss": 0.3895, + "num_input_tokens_seen": 4249016, + "step": 2152 + }, + { + "epoch": 0.28535453943008615, + "grad_norm": 19.725011825561523, + "learning_rate": 4.890132312079019e-06, + "loss": 0.416, + "num_input_tokens_seen": 4251416, + "step": 2153 + }, + { + "epoch": 0.28548707753479124, + "grad_norm": 0.11647134274244308, + "learning_rate": 4.890030510411936e-06, + "loss": 0.0007, + "num_input_tokens_seen": 4252536, + "step": 2154 + }, + { + "epoch": 0.28561961563949634, + "grad_norm": 4.854540824890137, + "learning_rate": 4.889928662663416e-06, + "loss": 0.1292, + "num_input_tokens_seen": 4254872, + "step": 2155 + }, + { + "epoch": 0.28575215374420143, + "grad_norm": 4.824460506439209, + "learning_rate": 4.889826768835422e-06, + "loss": 0.1208, + "num_input_tokens_seen": 4257088, + "step": 2156 + }, + { + "epoch": 0.2858846918489066, + "grad_norm": 6.68923282623291, + "learning_rate": 4.88972482892992e-06, + "loss": 0.1716, + "num_input_tokens_seen": 4258632, + "step": 2157 + }, + { + "epoch": 0.2860172299536117, + "grad_norm": 7.124477863311768, + "learning_rate": 4.8896228429488745e-06, + "loss": 0.1301, + "num_input_tokens_seen": 4261320, + "step": 2158 + }, + { + "epoch": 0.28614976805831677, + "grad_norm": 0.330934077501297, + "learning_rate": 4.889520810894252e-06, + "loss": 0.002, + "num_input_tokens_seen": 4263792, + "step": 2159 + }, + { + "epoch": 0.28628230616302186, + "grad_norm": 11.706367492675781, + "learning_rate": 4.88941873276802e-06, + "loss": 0.4521, + "num_input_tokens_seen": 4266248, + "step": 2160 + }, + { + "epoch": 0.28641484426772695, + "grad_norm": 5.327991485595703, + "learning_rate": 4.889316608572145e-06, + "loss": 0.098, + "num_input_tokens_seen": 4267912, + "step": 2161 + }, + { + "epoch": 0.28654738237243205, + "grad_norm": 7.563729286193848, + "learning_rate": 4.889214438308599e-06, + "loss": 0.1869, + "num_input_tokens_seen": 4269632, + "step": 2162 + }, + { + "epoch": 0.2866799204771372, + "grad_norm": 9.429889678955078, + "learning_rate": 4.889112221979351e-06, + "loss": 0.111, + "num_input_tokens_seen": 4271432, + "step": 2163 + }, + { + "epoch": 0.2868124585818423, + "grad_norm": 11.942160606384277, + "learning_rate": 4.88900995958637e-06, + "loss": 0.296, + "num_input_tokens_seen": 4274160, + "step": 2164 + }, + { + "epoch": 0.2869449966865474, + "grad_norm": 9.047669410705566, + "learning_rate": 4.888907651131629e-06, + "loss": 0.2054, + "num_input_tokens_seen": 4277056, + "step": 2165 + }, + { + "epoch": 0.2870775347912525, + "grad_norm": 0.7073408365249634, + "learning_rate": 4.888805296617101e-06, + "loss": 0.0044, + "num_input_tokens_seen": 4278632, + "step": 2166 + }, + { + "epoch": 0.28721007289595757, + "grad_norm": 6.935251712799072, + "learning_rate": 4.888702896044759e-06, + "loss": 0.1116, + "num_input_tokens_seen": 4280664, + "step": 2167 + }, + { + "epoch": 0.2873426110006627, + "grad_norm": 6.126161575317383, + "learning_rate": 4.888600449416576e-06, + "loss": 0.0927, + "num_input_tokens_seen": 4282560, + "step": 2168 + }, + { + "epoch": 0.2874751491053678, + "grad_norm": 16.38372230529785, + "learning_rate": 4.88849795673453e-06, + "loss": 0.2978, + "num_input_tokens_seen": 4284872, + "step": 2169 + }, + { + "epoch": 0.2876076872100729, + "grad_norm": 18.123579025268555, + "learning_rate": 4.888395418000595e-06, + "loss": 0.4355, + "num_input_tokens_seen": 4286896, + "step": 2170 + }, + { + "epoch": 0.287740225314778, + "grad_norm": 10.993791580200195, + "learning_rate": 4.888292833216749e-06, + "loss": 0.218, + "num_input_tokens_seen": 4288672, + "step": 2171 + }, + { + "epoch": 0.2878727634194831, + "grad_norm": 19.185699462890625, + "learning_rate": 4.8881902023849696e-06, + "loss": 0.1878, + "num_input_tokens_seen": 4290640, + "step": 2172 + }, + { + "epoch": 0.2880053015241882, + "grad_norm": 16.663862228393555, + "learning_rate": 4.888087525507235e-06, + "loss": 0.3048, + "num_input_tokens_seen": 4293048, + "step": 2173 + }, + { + "epoch": 0.28813783962889333, + "grad_norm": 15.162696838378906, + "learning_rate": 4.887984802585525e-06, + "loss": 0.1771, + "num_input_tokens_seen": 4295712, + "step": 2174 + }, + { + "epoch": 0.2882703777335984, + "grad_norm": 8.397686004638672, + "learning_rate": 4.887882033621821e-06, + "loss": 0.2193, + "num_input_tokens_seen": 4297968, + "step": 2175 + }, + { + "epoch": 0.2884029158383035, + "grad_norm": 2.960301160812378, + "learning_rate": 4.887779218618105e-06, + "loss": 0.0206, + "num_input_tokens_seen": 4299832, + "step": 2176 + }, + { + "epoch": 0.2885354539430086, + "grad_norm": 16.112192153930664, + "learning_rate": 4.887676357576358e-06, + "loss": 0.3614, + "num_input_tokens_seen": 4301440, + "step": 2177 + }, + { + "epoch": 0.2886679920477137, + "grad_norm": 12.998985290527344, + "learning_rate": 4.887573450498564e-06, + "loss": 0.2919, + "num_input_tokens_seen": 4303680, + "step": 2178 + }, + { + "epoch": 0.2888005301524188, + "grad_norm": 18.08317756652832, + "learning_rate": 4.887470497386706e-06, + "loss": 0.314, + "num_input_tokens_seen": 4306304, + "step": 2179 + }, + { + "epoch": 0.28893306825712395, + "grad_norm": 17.149301528930664, + "learning_rate": 4.88736749824277e-06, + "loss": 0.32, + "num_input_tokens_seen": 4307808, + "step": 2180 + }, + { + "epoch": 0.28906560636182904, + "grad_norm": 11.350018501281738, + "learning_rate": 4.887264453068742e-06, + "loss": 0.3471, + "num_input_tokens_seen": 4309816, + "step": 2181 + }, + { + "epoch": 0.28919814446653413, + "grad_norm": 15.040003776550293, + "learning_rate": 4.887161361866608e-06, + "loss": 0.1677, + "num_input_tokens_seen": 4311736, + "step": 2182 + }, + { + "epoch": 0.28933068257123923, + "grad_norm": 11.911293983459473, + "learning_rate": 4.887058224638356e-06, + "loss": 0.3556, + "num_input_tokens_seen": 4314048, + "step": 2183 + }, + { + "epoch": 0.2894632206759443, + "grad_norm": 10.788071632385254, + "learning_rate": 4.886955041385975e-06, + "loss": 0.1982, + "num_input_tokens_seen": 4316024, + "step": 2184 + }, + { + "epoch": 0.2895957587806494, + "grad_norm": 9.0302734375, + "learning_rate": 4.886851812111454e-06, + "loss": 0.2302, + "num_input_tokens_seen": 4317664, + "step": 2185 + }, + { + "epoch": 0.28972829688535456, + "grad_norm": 4.5644989013671875, + "learning_rate": 4.886748536816784e-06, + "loss": 0.123, + "num_input_tokens_seen": 4318688, + "step": 2186 + }, + { + "epoch": 0.28986083499005966, + "grad_norm": 3.457049608230591, + "learning_rate": 4.886645215503955e-06, + "loss": 0.0247, + "num_input_tokens_seen": 4320896, + "step": 2187 + }, + { + "epoch": 0.28999337309476475, + "grad_norm": 1.8337422609329224, + "learning_rate": 4.8865418481749605e-06, + "loss": 0.0121, + "num_input_tokens_seen": 4322448, + "step": 2188 + }, + { + "epoch": 0.29012591119946984, + "grad_norm": 1.623603343963623, + "learning_rate": 4.886438434831792e-06, + "loss": 0.0108, + "num_input_tokens_seen": 4324248, + "step": 2189 + }, + { + "epoch": 0.29025844930417494, + "grad_norm": 10.071128845214844, + "learning_rate": 4.886334975476445e-06, + "loss": 0.3087, + "num_input_tokens_seen": 4326088, + "step": 2190 + }, + { + "epoch": 0.29039098740888003, + "grad_norm": 6.025654315948486, + "learning_rate": 4.886231470110913e-06, + "loss": 0.0756, + "num_input_tokens_seen": 4327744, + "step": 2191 + }, + { + "epoch": 0.2905235255135852, + "grad_norm": 11.878471374511719, + "learning_rate": 4.886127918737191e-06, + "loss": 0.3052, + "num_input_tokens_seen": 4329432, + "step": 2192 + }, + { + "epoch": 0.2906560636182903, + "grad_norm": 6.4588303565979, + "learning_rate": 4.8860243213572776e-06, + "loss": 0.0933, + "num_input_tokens_seen": 4331320, + "step": 2193 + }, + { + "epoch": 0.29078860172299537, + "grad_norm": 10.340522766113281, + "learning_rate": 4.885920677973169e-06, + "loss": 0.392, + "num_input_tokens_seen": 4333216, + "step": 2194 + }, + { + "epoch": 0.29092113982770046, + "grad_norm": 10.025800704956055, + "learning_rate": 4.885816988586864e-06, + "loss": 0.1623, + "num_input_tokens_seen": 4335312, + "step": 2195 + }, + { + "epoch": 0.29105367793240555, + "grad_norm": 14.015799522399902, + "learning_rate": 4.8857132532003624e-06, + "loss": 0.2503, + "num_input_tokens_seen": 4338424, + "step": 2196 + }, + { + "epoch": 0.29118621603711065, + "grad_norm": 0.4355078339576721, + "learning_rate": 4.885609471815662e-06, + "loss": 0.0027, + "num_input_tokens_seen": 4340248, + "step": 2197 + }, + { + "epoch": 0.2913187541418158, + "grad_norm": 0.33607354760169983, + "learning_rate": 4.885505644434766e-06, + "loss": 0.0021, + "num_input_tokens_seen": 4342096, + "step": 2198 + }, + { + "epoch": 0.2914512922465209, + "grad_norm": 7.259146690368652, + "learning_rate": 4.885401771059676e-06, + "loss": 0.0773, + "num_input_tokens_seen": 4343840, + "step": 2199 + }, + { + "epoch": 0.291583830351226, + "grad_norm": 5.765463352203369, + "learning_rate": 4.885297851692393e-06, + "loss": 0.1881, + "num_input_tokens_seen": 4345552, + "step": 2200 + }, + { + "epoch": 0.2917163684559311, + "grad_norm": 7.386293888092041, + "learning_rate": 4.885193886334922e-06, + "loss": 0.1949, + "num_input_tokens_seen": 4348552, + "step": 2201 + }, + { + "epoch": 0.29184890656063617, + "grad_norm": 6.986801624298096, + "learning_rate": 4.885089874989268e-06, + "loss": 0.2179, + "num_input_tokens_seen": 4350448, + "step": 2202 + }, + { + "epoch": 0.29198144466534126, + "grad_norm": 4.420465469360352, + "learning_rate": 4.884985817657436e-06, + "loss": 0.0385, + "num_input_tokens_seen": 4352016, + "step": 2203 + }, + { + "epoch": 0.2921139827700464, + "grad_norm": 9.250154495239258, + "learning_rate": 4.884881714341432e-06, + "loss": 0.1252, + "num_input_tokens_seen": 4353368, + "step": 2204 + }, + { + "epoch": 0.2922465208747515, + "grad_norm": 0.14360158145427704, + "learning_rate": 4.884777565043264e-06, + "loss": 0.0006, + "num_input_tokens_seen": 4355584, + "step": 2205 + }, + { + "epoch": 0.2923790589794566, + "grad_norm": 13.098666191101074, + "learning_rate": 4.884673369764939e-06, + "loss": 0.4706, + "num_input_tokens_seen": 4357728, + "step": 2206 + }, + { + "epoch": 0.2925115970841617, + "grad_norm": 6.148409843444824, + "learning_rate": 4.884569128508465e-06, + "loss": 0.202, + "num_input_tokens_seen": 4358888, + "step": 2207 + }, + { + "epoch": 0.2926441351888668, + "grad_norm": 0.04250134155154228, + "learning_rate": 4.8844648412758535e-06, + "loss": 0.0003, + "num_input_tokens_seen": 4360296, + "step": 2208 + }, + { + "epoch": 0.2927766732935719, + "grad_norm": 7.456624984741211, + "learning_rate": 4.884360508069116e-06, + "loss": 0.1137, + "num_input_tokens_seen": 4361720, + "step": 2209 + }, + { + "epoch": 0.292909211398277, + "grad_norm": 5.744297981262207, + "learning_rate": 4.884256128890262e-06, + "loss": 0.1297, + "num_input_tokens_seen": 4363712, + "step": 2210 + }, + { + "epoch": 0.2930417495029821, + "grad_norm": 6.307443618774414, + "learning_rate": 4.884151703741306e-06, + "loss": 0.185, + "num_input_tokens_seen": 4365064, + "step": 2211 + }, + { + "epoch": 0.2931742876076872, + "grad_norm": 0.07225403189659119, + "learning_rate": 4.8840472326242594e-06, + "loss": 0.0005, + "num_input_tokens_seen": 4366224, + "step": 2212 + }, + { + "epoch": 0.2933068257123923, + "grad_norm": 0.04703948274254799, + "learning_rate": 4.883942715541138e-06, + "loss": 0.0003, + "num_input_tokens_seen": 4368440, + "step": 2213 + }, + { + "epoch": 0.2934393638170974, + "grad_norm": 8.693592071533203, + "learning_rate": 4.8838381524939575e-06, + "loss": 0.1653, + "num_input_tokens_seen": 4370184, + "step": 2214 + }, + { + "epoch": 0.2935719019218025, + "grad_norm": 1.8131909370422363, + "learning_rate": 4.883733543484731e-06, + "loss": 0.0117, + "num_input_tokens_seen": 4371552, + "step": 2215 + }, + { + "epoch": 0.29370444002650764, + "grad_norm": 0.3738787770271301, + "learning_rate": 4.883628888515478e-06, + "loss": 0.0014, + "num_input_tokens_seen": 4373160, + "step": 2216 + }, + { + "epoch": 0.29383697813121273, + "grad_norm": 9.417610168457031, + "learning_rate": 4.883524187588216e-06, + "loss": 0.2664, + "num_input_tokens_seen": 4374904, + "step": 2217 + }, + { + "epoch": 0.2939695162359178, + "grad_norm": 0.08255329728126526, + "learning_rate": 4.8834194407049626e-06, + "loss": 0.0005, + "num_input_tokens_seen": 4376640, + "step": 2218 + }, + { + "epoch": 0.2941020543406229, + "grad_norm": 10.780593872070312, + "learning_rate": 4.883314647867739e-06, + "loss": 0.5253, + "num_input_tokens_seen": 4379472, + "step": 2219 + }, + { + "epoch": 0.294234592445328, + "grad_norm": 0.0779641643166542, + "learning_rate": 4.883209809078565e-06, + "loss": 0.0005, + "num_input_tokens_seen": 4381792, + "step": 2220 + }, + { + "epoch": 0.2943671305500331, + "grad_norm": 0.04254482313990593, + "learning_rate": 4.8831049243394615e-06, + "loss": 0.0003, + "num_input_tokens_seen": 4383512, + "step": 2221 + }, + { + "epoch": 0.29449966865473826, + "grad_norm": 16.96940040588379, + "learning_rate": 4.882999993652451e-06, + "loss": 0.3445, + "num_input_tokens_seen": 4385136, + "step": 2222 + }, + { + "epoch": 0.29463220675944335, + "grad_norm": 5.812951564788818, + "learning_rate": 4.882895017019556e-06, + "loss": 0.1333, + "num_input_tokens_seen": 4386760, + "step": 2223 + }, + { + "epoch": 0.29476474486414844, + "grad_norm": 12.121362686157227, + "learning_rate": 4.882789994442802e-06, + "loss": 0.4926, + "num_input_tokens_seen": 4388792, + "step": 2224 + }, + { + "epoch": 0.29489728296885354, + "grad_norm": 0.09935160726308823, + "learning_rate": 4.882684925924213e-06, + "loss": 0.0006, + "num_input_tokens_seen": 4390712, + "step": 2225 + }, + { + "epoch": 0.29502982107355863, + "grad_norm": 6.53181791305542, + "learning_rate": 4.882579811465814e-06, + "loss": 0.0886, + "num_input_tokens_seen": 4392672, + "step": 2226 + }, + { + "epoch": 0.2951623591782638, + "grad_norm": 16.24932861328125, + "learning_rate": 4.882474651069634e-06, + "loss": 0.6107, + "num_input_tokens_seen": 4395744, + "step": 2227 + }, + { + "epoch": 0.29529489728296887, + "grad_norm": 4.402467727661133, + "learning_rate": 4.882369444737698e-06, + "loss": 0.1179, + "num_input_tokens_seen": 4397352, + "step": 2228 + }, + { + "epoch": 0.29542743538767396, + "grad_norm": 0.09423938393592834, + "learning_rate": 4.882264192472036e-06, + "loss": 0.0006, + "num_input_tokens_seen": 4398928, + "step": 2229 + }, + { + "epoch": 0.29555997349237906, + "grad_norm": 10.287824630737305, + "learning_rate": 4.8821588942746765e-06, + "loss": 0.2996, + "num_input_tokens_seen": 4400352, + "step": 2230 + }, + { + "epoch": 0.29569251159708415, + "grad_norm": 13.64413833618164, + "learning_rate": 4.882053550147651e-06, + "loss": 0.5281, + "num_input_tokens_seen": 4402224, + "step": 2231 + }, + { + "epoch": 0.29582504970178924, + "grad_norm": 5.7602620124816895, + "learning_rate": 4.88194816009299e-06, + "loss": 0.1773, + "num_input_tokens_seen": 4403488, + "step": 2232 + }, + { + "epoch": 0.2959575878064944, + "grad_norm": 0.15154927968978882, + "learning_rate": 4.881842724112724e-06, + "loss": 0.0009, + "num_input_tokens_seen": 4404592, + "step": 2233 + }, + { + "epoch": 0.2960901259111995, + "grad_norm": 1.4744921922683716, + "learning_rate": 4.881737242208888e-06, + "loss": 0.0067, + "num_input_tokens_seen": 4406728, + "step": 2234 + }, + { + "epoch": 0.2962226640159046, + "grad_norm": 12.635961532592773, + "learning_rate": 4.881631714383515e-06, + "loss": 0.3641, + "num_input_tokens_seen": 4408848, + "step": 2235 + }, + { + "epoch": 0.2963552021206097, + "grad_norm": 12.352937698364258, + "learning_rate": 4.881526140638638e-06, + "loss": 0.3232, + "num_input_tokens_seen": 4410728, + "step": 2236 + }, + { + "epoch": 0.29648774022531477, + "grad_norm": 0.786177933216095, + "learning_rate": 4.881420520976296e-06, + "loss": 0.0046, + "num_input_tokens_seen": 4411888, + "step": 2237 + }, + { + "epoch": 0.29662027833001986, + "grad_norm": 10.772207260131836, + "learning_rate": 4.881314855398523e-06, + "loss": 0.0888, + "num_input_tokens_seen": 4413816, + "step": 2238 + }, + { + "epoch": 0.296752816434725, + "grad_norm": 5.836910724639893, + "learning_rate": 4.881209143907356e-06, + "loss": 0.2267, + "num_input_tokens_seen": 4415296, + "step": 2239 + }, + { + "epoch": 0.2968853545394301, + "grad_norm": 0.4418144226074219, + "learning_rate": 4.881103386504835e-06, + "loss": 0.0023, + "num_input_tokens_seen": 4416888, + "step": 2240 + }, + { + "epoch": 0.2970178926441352, + "grad_norm": 9.291844367980957, + "learning_rate": 4.880997583192998e-06, + "loss": 0.1473, + "num_input_tokens_seen": 4419672, + "step": 2241 + }, + { + "epoch": 0.2971504307488403, + "grad_norm": 11.143998146057129, + "learning_rate": 4.880891733973884e-06, + "loss": 0.3925, + "num_input_tokens_seen": 4421528, + "step": 2242 + }, + { + "epoch": 0.2972829688535454, + "grad_norm": 14.662081718444824, + "learning_rate": 4.8807858388495355e-06, + "loss": 0.3738, + "num_input_tokens_seen": 4423000, + "step": 2243 + }, + { + "epoch": 0.2974155069582505, + "grad_norm": 21.81793212890625, + "learning_rate": 4.880679897821994e-06, + "loss": 0.6423, + "num_input_tokens_seen": 4425072, + "step": 2244 + }, + { + "epoch": 0.2975480450629556, + "grad_norm": 0.2173934429883957, + "learning_rate": 4.880573910893301e-06, + "loss": 0.0013, + "num_input_tokens_seen": 4426600, + "step": 2245 + }, + { + "epoch": 0.2976805831676607, + "grad_norm": 0.7134649753570557, + "learning_rate": 4.880467878065501e-06, + "loss": 0.004, + "num_input_tokens_seen": 4429312, + "step": 2246 + }, + { + "epoch": 0.2978131212723658, + "grad_norm": 0.6743342876434326, + "learning_rate": 4.8803617993406385e-06, + "loss": 0.0037, + "num_input_tokens_seen": 4430568, + "step": 2247 + }, + { + "epoch": 0.2979456593770709, + "grad_norm": 12.765079498291016, + "learning_rate": 4.8802556747207586e-06, + "loss": 0.2069, + "num_input_tokens_seen": 4432896, + "step": 2248 + }, + { + "epoch": 0.298078197481776, + "grad_norm": 4.122313499450684, + "learning_rate": 4.880149504207906e-06, + "loss": 0.0313, + "num_input_tokens_seen": 4434992, + "step": 2249 + }, + { + "epoch": 0.2982107355864811, + "grad_norm": 0.18297256529331207, + "learning_rate": 4.880043287804131e-06, + "loss": 0.0011, + "num_input_tokens_seen": 4436520, + "step": 2250 + }, + { + "epoch": 0.29834327369118624, + "grad_norm": 10.148646354675293, + "learning_rate": 4.879937025511478e-06, + "loss": 0.1464, + "num_input_tokens_seen": 4438840, + "step": 2251 + }, + { + "epoch": 0.29847581179589133, + "grad_norm": 14.868453025817871, + "learning_rate": 4.879830717331997e-06, + "loss": 0.2112, + "num_input_tokens_seen": 4440992, + "step": 2252 + }, + { + "epoch": 0.2986083499005964, + "grad_norm": 25.890005111694336, + "learning_rate": 4.879724363267738e-06, + "loss": 1.05, + "num_input_tokens_seen": 4443704, + "step": 2253 + }, + { + "epoch": 0.2987408880053015, + "grad_norm": 0.14283990859985352, + "learning_rate": 4.879617963320753e-06, + "loss": 0.0008, + "num_input_tokens_seen": 4445104, + "step": 2254 + }, + { + "epoch": 0.2988734261100066, + "grad_norm": 0.28805074095726013, + "learning_rate": 4.879511517493091e-06, + "loss": 0.0015, + "num_input_tokens_seen": 4446616, + "step": 2255 + }, + { + "epoch": 0.2990059642147117, + "grad_norm": 11.149937629699707, + "learning_rate": 4.879405025786806e-06, + "loss": 0.2234, + "num_input_tokens_seen": 4448592, + "step": 2256 + }, + { + "epoch": 0.29913850231941685, + "grad_norm": 10.31842041015625, + "learning_rate": 4.87929848820395e-06, + "loss": 0.3823, + "num_input_tokens_seen": 4451056, + "step": 2257 + }, + { + "epoch": 0.29927104042412195, + "grad_norm": 8.116284370422363, + "learning_rate": 4.8791919047465775e-06, + "loss": 0.1482, + "num_input_tokens_seen": 4453928, + "step": 2258 + }, + { + "epoch": 0.29940357852882704, + "grad_norm": 1.1918914318084717, + "learning_rate": 4.879085275416744e-06, + "loss": 0.0058, + "num_input_tokens_seen": 4455720, + "step": 2259 + }, + { + "epoch": 0.29953611663353213, + "grad_norm": 6.680629730224609, + "learning_rate": 4.8789786002165055e-06, + "loss": 0.0591, + "num_input_tokens_seen": 4457744, + "step": 2260 + }, + { + "epoch": 0.29966865473823723, + "grad_norm": 15.128524780273438, + "learning_rate": 4.878871879147918e-06, + "loss": 0.5397, + "num_input_tokens_seen": 4459336, + "step": 2261 + }, + { + "epoch": 0.2998011928429423, + "grad_norm": 19.39220428466797, + "learning_rate": 4.87876511221304e-06, + "loss": 0.5077, + "num_input_tokens_seen": 4461936, + "step": 2262 + }, + { + "epoch": 0.29993373094764747, + "grad_norm": 9.773737907409668, + "learning_rate": 4.87865829941393e-06, + "loss": 0.4246, + "num_input_tokens_seen": 4464008, + "step": 2263 + }, + { + "epoch": 0.30006626905235256, + "grad_norm": 0.05762796103954315, + "learning_rate": 4.878551440752646e-06, + "loss": 0.0003, + "num_input_tokens_seen": 4465480, + "step": 2264 + }, + { + "epoch": 0.30019880715705766, + "grad_norm": 12.822464942932129, + "learning_rate": 4.87844453623125e-06, + "loss": 0.4152, + "num_input_tokens_seen": 4467320, + "step": 2265 + }, + { + "epoch": 0.30033134526176275, + "grad_norm": 8.42055606842041, + "learning_rate": 4.878337585851802e-06, + "loss": 0.2294, + "num_input_tokens_seen": 4469704, + "step": 2266 + }, + { + "epoch": 0.30046388336646784, + "grad_norm": 14.45976734161377, + "learning_rate": 4.878230589616365e-06, + "loss": 0.5654, + "num_input_tokens_seen": 4471744, + "step": 2267 + }, + { + "epoch": 0.30059642147117294, + "grad_norm": 0.07128151506185532, + "learning_rate": 4.878123547527002e-06, + "loss": 0.0004, + "num_input_tokens_seen": 4473280, + "step": 2268 + }, + { + "epoch": 0.3007289595758781, + "grad_norm": 9.115377426147461, + "learning_rate": 4.878016459585776e-06, + "loss": 0.1438, + "num_input_tokens_seen": 4475480, + "step": 2269 + }, + { + "epoch": 0.3008614976805832, + "grad_norm": 0.0583374947309494, + "learning_rate": 4.877909325794752e-06, + "loss": 0.0004, + "num_input_tokens_seen": 4477688, + "step": 2270 + }, + { + "epoch": 0.30099403578528827, + "grad_norm": 6.256191253662109, + "learning_rate": 4.877802146155996e-06, + "loss": 0.1179, + "num_input_tokens_seen": 4479624, + "step": 2271 + }, + { + "epoch": 0.30112657388999337, + "grad_norm": 15.40497875213623, + "learning_rate": 4.877694920671574e-06, + "loss": 0.4721, + "num_input_tokens_seen": 4481784, + "step": 2272 + }, + { + "epoch": 0.30125911199469846, + "grad_norm": 11.886549949645996, + "learning_rate": 4.877587649343553e-06, + "loss": 0.3917, + "num_input_tokens_seen": 4483216, + "step": 2273 + }, + { + "epoch": 0.30139165009940355, + "grad_norm": 9.00411319732666, + "learning_rate": 4.877480332174003e-06, + "loss": 0.2911, + "num_input_tokens_seen": 4484856, + "step": 2274 + }, + { + "epoch": 0.3015241882041087, + "grad_norm": 4.763082027435303, + "learning_rate": 4.877372969164992e-06, + "loss": 0.1393, + "num_input_tokens_seen": 4486672, + "step": 2275 + }, + { + "epoch": 0.3016567263088138, + "grad_norm": 11.841622352600098, + "learning_rate": 4.8772655603185906e-06, + "loss": 0.528, + "num_input_tokens_seen": 4489416, + "step": 2276 + }, + { + "epoch": 0.3017892644135189, + "grad_norm": 0.10015904158353806, + "learning_rate": 4.877158105636868e-06, + "loss": 0.0006, + "num_input_tokens_seen": 4490768, + "step": 2277 + }, + { + "epoch": 0.301921802518224, + "grad_norm": 15.050132751464844, + "learning_rate": 4.877050605121897e-06, + "loss": 0.3823, + "num_input_tokens_seen": 4492936, + "step": 2278 + }, + { + "epoch": 0.3020543406229291, + "grad_norm": 7.292393684387207, + "learning_rate": 4.876943058775752e-06, + "loss": 0.2537, + "num_input_tokens_seen": 4495056, + "step": 2279 + }, + { + "epoch": 0.30218687872763417, + "grad_norm": 18.64552116394043, + "learning_rate": 4.876835466600503e-06, + "loss": 0.3755, + "num_input_tokens_seen": 4497936, + "step": 2280 + }, + { + "epoch": 0.3023194168323393, + "grad_norm": 1.5576262474060059, + "learning_rate": 4.876727828598229e-06, + "loss": 0.0085, + "num_input_tokens_seen": 4499640, + "step": 2281 + }, + { + "epoch": 0.3024519549370444, + "grad_norm": 0.12575441598892212, + "learning_rate": 4.876620144771002e-06, + "loss": 0.0008, + "num_input_tokens_seen": 4501776, + "step": 2282 + }, + { + "epoch": 0.3025844930417495, + "grad_norm": 6.489303112030029, + "learning_rate": 4.8765124151208975e-06, + "loss": 0.2004, + "num_input_tokens_seen": 4503840, + "step": 2283 + }, + { + "epoch": 0.3027170311464546, + "grad_norm": 14.678549766540527, + "learning_rate": 4.876404639649995e-06, + "loss": 0.5339, + "num_input_tokens_seen": 4506336, + "step": 2284 + }, + { + "epoch": 0.3028495692511597, + "grad_norm": 9.66697883605957, + "learning_rate": 4.876296818360372e-06, + "loss": 0.0448, + "num_input_tokens_seen": 4508112, + "step": 2285 + }, + { + "epoch": 0.30298210735586484, + "grad_norm": 0.17515404522418976, + "learning_rate": 4.876188951254108e-06, + "loss": 0.0011, + "num_input_tokens_seen": 4510712, + "step": 2286 + }, + { + "epoch": 0.30311464546056993, + "grad_norm": 0.1487409472465515, + "learning_rate": 4.87608103833328e-06, + "loss": 0.0009, + "num_input_tokens_seen": 4511920, + "step": 2287 + }, + { + "epoch": 0.303247183565275, + "grad_norm": 0.1465209275484085, + "learning_rate": 4.875973079599972e-06, + "loss": 0.0009, + "num_input_tokens_seen": 4513424, + "step": 2288 + }, + { + "epoch": 0.3033797216699801, + "grad_norm": 10.462250709533691, + "learning_rate": 4.875865075056262e-06, + "loss": 0.2312, + "num_input_tokens_seen": 4515256, + "step": 2289 + }, + { + "epoch": 0.3035122597746852, + "grad_norm": 0.10487841069698334, + "learning_rate": 4.875757024704236e-06, + "loss": 0.0007, + "num_input_tokens_seen": 4516736, + "step": 2290 + }, + { + "epoch": 0.3036447978793903, + "grad_norm": 0.15438759326934814, + "learning_rate": 4.875648928545975e-06, + "loss": 0.001, + "num_input_tokens_seen": 4518360, + "step": 2291 + }, + { + "epoch": 0.30377733598409545, + "grad_norm": 4.229559421539307, + "learning_rate": 4.875540786583564e-06, + "loss": 0.0435, + "num_input_tokens_seen": 4519856, + "step": 2292 + }, + { + "epoch": 0.30390987408880055, + "grad_norm": 8.338935852050781, + "learning_rate": 4.8754325988190875e-06, + "loss": 0.1618, + "num_input_tokens_seen": 4521480, + "step": 2293 + }, + { + "epoch": 0.30404241219350564, + "grad_norm": 12.536456108093262, + "learning_rate": 4.8753243652546316e-06, + "loss": 0.3658, + "num_input_tokens_seen": 4523224, + "step": 2294 + }, + { + "epoch": 0.30417495029821073, + "grad_norm": 5.160500526428223, + "learning_rate": 4.875216085892283e-06, + "loss": 0.0404, + "num_input_tokens_seen": 4524680, + "step": 2295 + }, + { + "epoch": 0.3043074884029158, + "grad_norm": 0.34268054366111755, + "learning_rate": 4.8751077607341306e-06, + "loss": 0.002, + "num_input_tokens_seen": 4526056, + "step": 2296 + }, + { + "epoch": 0.3044400265076209, + "grad_norm": 6.561368942260742, + "learning_rate": 4.874999389782262e-06, + "loss": 0.1351, + "num_input_tokens_seen": 4528216, + "step": 2297 + }, + { + "epoch": 0.30457256461232607, + "grad_norm": 11.347625732421875, + "learning_rate": 4.874890973038766e-06, + "loss": 0.2648, + "num_input_tokens_seen": 4529848, + "step": 2298 + }, + { + "epoch": 0.30470510271703116, + "grad_norm": 10.4386625289917, + "learning_rate": 4.874782510505734e-06, + "loss": 0.3965, + "num_input_tokens_seen": 4531392, + "step": 2299 + }, + { + "epoch": 0.30483764082173626, + "grad_norm": 22.0225830078125, + "learning_rate": 4.8746740021852576e-06, + "loss": 0.6215, + "num_input_tokens_seen": 4533120, + "step": 2300 + }, + { + "epoch": 0.30497017892644135, + "grad_norm": 18.458955764770508, + "learning_rate": 4.874565448079428e-06, + "loss": 0.796, + "num_input_tokens_seen": 4535440, + "step": 2301 + }, + { + "epoch": 0.30510271703114644, + "grad_norm": 20.216047286987305, + "learning_rate": 4.8744568481903375e-06, + "loss": 0.8113, + "num_input_tokens_seen": 4536712, + "step": 2302 + }, + { + "epoch": 0.30523525513585154, + "grad_norm": 11.850532531738281, + "learning_rate": 4.874348202520082e-06, + "loss": 0.1842, + "num_input_tokens_seen": 4538520, + "step": 2303 + }, + { + "epoch": 0.3053677932405567, + "grad_norm": 13.13093090057373, + "learning_rate": 4.874239511070755e-06, + "loss": 0.2179, + "num_input_tokens_seen": 4540448, + "step": 2304 + }, + { + "epoch": 0.3055003313452618, + "grad_norm": 2.783926486968994, + "learning_rate": 4.874130773844452e-06, + "loss": 0.0095, + "num_input_tokens_seen": 4542416, + "step": 2305 + }, + { + "epoch": 0.30563286944996687, + "grad_norm": 0.04120277240872383, + "learning_rate": 4.874021990843269e-06, + "loss": 0.0003, + "num_input_tokens_seen": 4543792, + "step": 2306 + }, + { + "epoch": 0.30576540755467196, + "grad_norm": 0.12249967455863953, + "learning_rate": 4.873913162069306e-06, + "loss": 0.0008, + "num_input_tokens_seen": 4546320, + "step": 2307 + }, + { + "epoch": 0.30589794565937706, + "grad_norm": 5.431410789489746, + "learning_rate": 4.873804287524658e-06, + "loss": 0.0457, + "num_input_tokens_seen": 4548248, + "step": 2308 + }, + { + "epoch": 0.30603048376408215, + "grad_norm": 0.05795201286673546, + "learning_rate": 4.873695367211427e-06, + "loss": 0.0004, + "num_input_tokens_seen": 4549488, + "step": 2309 + }, + { + "epoch": 0.3061630218687873, + "grad_norm": 3.3292765617370605, + "learning_rate": 4.8735864011317114e-06, + "loss": 0.0182, + "num_input_tokens_seen": 4552800, + "step": 2310 + }, + { + "epoch": 0.3062955599734924, + "grad_norm": 13.742362976074219, + "learning_rate": 4.873477389287612e-06, + "loss": 0.351, + "num_input_tokens_seen": 4554888, + "step": 2311 + }, + { + "epoch": 0.3064280980781975, + "grad_norm": 14.86807918548584, + "learning_rate": 4.8733683316812315e-06, + "loss": 0.4338, + "num_input_tokens_seen": 4557512, + "step": 2312 + }, + { + "epoch": 0.3065606361829026, + "grad_norm": 17.22825813293457, + "learning_rate": 4.873259228314673e-06, + "loss": 0.265, + "num_input_tokens_seen": 4559376, + "step": 2313 + }, + { + "epoch": 0.3066931742876077, + "grad_norm": 0.07642994076013565, + "learning_rate": 4.873150079190038e-06, + "loss": 0.0005, + "num_input_tokens_seen": 4561856, + "step": 2314 + }, + { + "epoch": 0.30682571239231277, + "grad_norm": 0.0376090444624424, + "learning_rate": 4.873040884309433e-06, + "loss": 0.0002, + "num_input_tokens_seen": 4563408, + "step": 2315 + }, + { + "epoch": 0.3069582504970179, + "grad_norm": 6.630155563354492, + "learning_rate": 4.872931643674963e-06, + "loss": 0.2238, + "num_input_tokens_seen": 4565824, + "step": 2316 + }, + { + "epoch": 0.307090788601723, + "grad_norm": 0.08366625756025314, + "learning_rate": 4.872822357288733e-06, + "loss": 0.0005, + "num_input_tokens_seen": 4567328, + "step": 2317 + }, + { + "epoch": 0.3072233267064281, + "grad_norm": 11.427505493164062, + "learning_rate": 4.872713025152852e-06, + "loss": 0.3075, + "num_input_tokens_seen": 4568840, + "step": 2318 + }, + { + "epoch": 0.3073558648111332, + "grad_norm": 0.1984274536371231, + "learning_rate": 4.872603647269427e-06, + "loss": 0.0011, + "num_input_tokens_seen": 4570848, + "step": 2319 + }, + { + "epoch": 0.3074884029158383, + "grad_norm": 0.01841103844344616, + "learning_rate": 4.8724942236405655e-06, + "loss": 0.0001, + "num_input_tokens_seen": 4572240, + "step": 2320 + }, + { + "epoch": 0.3076209410205434, + "grad_norm": 0.048301417380571365, + "learning_rate": 4.87238475426838e-06, + "loss": 0.0003, + "num_input_tokens_seen": 4574360, + "step": 2321 + }, + { + "epoch": 0.30775347912524853, + "grad_norm": 14.42236614227295, + "learning_rate": 4.8722752391549795e-06, + "loss": 0.2713, + "num_input_tokens_seen": 4576256, + "step": 2322 + }, + { + "epoch": 0.3078860172299536, + "grad_norm": 6.023249626159668, + "learning_rate": 4.872165678302476e-06, + "loss": 0.1606, + "num_input_tokens_seen": 4577744, + "step": 2323 + }, + { + "epoch": 0.3080185553346587, + "grad_norm": 0.03531588613986969, + "learning_rate": 4.872056071712981e-06, + "loss": 0.0002, + "num_input_tokens_seen": 4579304, + "step": 2324 + }, + { + "epoch": 0.3081510934393638, + "grad_norm": 12.995361328125, + "learning_rate": 4.871946419388609e-06, + "loss": 0.3878, + "num_input_tokens_seen": 4580760, + "step": 2325 + }, + { + "epoch": 0.3082836315440689, + "grad_norm": 8.4299898147583, + "learning_rate": 4.871836721331475e-06, + "loss": 0.3143, + "num_input_tokens_seen": 4582384, + "step": 2326 + }, + { + "epoch": 0.308416169648774, + "grad_norm": 6.2899675369262695, + "learning_rate": 4.871726977543691e-06, + "loss": 0.1073, + "num_input_tokens_seen": 4584472, + "step": 2327 + }, + { + "epoch": 0.30854870775347915, + "grad_norm": 0.15512315928936005, + "learning_rate": 4.8716171880273754e-06, + "loss": 0.0009, + "num_input_tokens_seen": 4585776, + "step": 2328 + }, + { + "epoch": 0.30868124585818424, + "grad_norm": 0.06477442383766174, + "learning_rate": 4.871507352784645e-06, + "loss": 0.0004, + "num_input_tokens_seen": 4588344, + "step": 2329 + }, + { + "epoch": 0.30881378396288933, + "grad_norm": 8.844597816467285, + "learning_rate": 4.871397471817615e-06, + "loss": 0.2402, + "num_input_tokens_seen": 4590496, + "step": 2330 + }, + { + "epoch": 0.3089463220675944, + "grad_norm": 8.777436256408691, + "learning_rate": 4.871287545128407e-06, + "loss": 0.2058, + "num_input_tokens_seen": 4592056, + "step": 2331 + }, + { + "epoch": 0.3090788601722995, + "grad_norm": 2.4256484508514404, + "learning_rate": 4.871177572719139e-06, + "loss": 0.015, + "num_input_tokens_seen": 4593672, + "step": 2332 + }, + { + "epoch": 0.3092113982770046, + "grad_norm": 0.1641574203968048, + "learning_rate": 4.871067554591932e-06, + "loss": 0.0009, + "num_input_tokens_seen": 4595240, + "step": 2333 + }, + { + "epoch": 0.30934393638170976, + "grad_norm": 0.10168379545211792, + "learning_rate": 4.870957490748907e-06, + "loss": 0.0006, + "num_input_tokens_seen": 4596616, + "step": 2334 + }, + { + "epoch": 0.30947647448641485, + "grad_norm": 0.2703472077846527, + "learning_rate": 4.870847381192185e-06, + "loss": 0.0011, + "num_input_tokens_seen": 4598280, + "step": 2335 + }, + { + "epoch": 0.30960901259111995, + "grad_norm": 0.0431048721075058, + "learning_rate": 4.870737225923891e-06, + "loss": 0.0003, + "num_input_tokens_seen": 4599424, + "step": 2336 + }, + { + "epoch": 0.30974155069582504, + "grad_norm": 0.21082842350006104, + "learning_rate": 4.870627024946146e-06, + "loss": 0.0012, + "num_input_tokens_seen": 4601512, + "step": 2337 + }, + { + "epoch": 0.30987408880053013, + "grad_norm": 27.151540756225586, + "learning_rate": 4.870516778261078e-06, + "loss": 0.9187, + "num_input_tokens_seen": 4604264, + "step": 2338 + }, + { + "epoch": 0.3100066269052352, + "grad_norm": 16.894166946411133, + "learning_rate": 4.870406485870811e-06, + "loss": 0.3856, + "num_input_tokens_seen": 4605808, + "step": 2339 + }, + { + "epoch": 0.3101391650099404, + "grad_norm": 6.911101818084717, + "learning_rate": 4.8702961477774715e-06, + "loss": 0.2188, + "num_input_tokens_seen": 4607600, + "step": 2340 + }, + { + "epoch": 0.31027170311464547, + "grad_norm": 21.6949462890625, + "learning_rate": 4.870185763983187e-06, + "loss": 0.5444, + "num_input_tokens_seen": 4609160, + "step": 2341 + }, + { + "epoch": 0.31040424121935056, + "grad_norm": 16.570449829101562, + "learning_rate": 4.870075334490086e-06, + "loss": 0.6775, + "num_input_tokens_seen": 4611192, + "step": 2342 + }, + { + "epoch": 0.31053677932405566, + "grad_norm": 9.404993057250977, + "learning_rate": 4.869964859300298e-06, + "loss": 0.314, + "num_input_tokens_seen": 4613360, + "step": 2343 + }, + { + "epoch": 0.31066931742876075, + "grad_norm": 42.66393280029297, + "learning_rate": 4.869854338415952e-06, + "loss": 0.174, + "num_input_tokens_seen": 4615856, + "step": 2344 + }, + { + "epoch": 0.3108018555334659, + "grad_norm": 0.05939304456114769, + "learning_rate": 4.86974377183918e-06, + "loss": 0.0004, + "num_input_tokens_seen": 4616984, + "step": 2345 + }, + { + "epoch": 0.310934393638171, + "grad_norm": 5.048226833343506, + "learning_rate": 4.869633159572113e-06, + "loss": 0.0804, + "num_input_tokens_seen": 4619464, + "step": 2346 + }, + { + "epoch": 0.3110669317428761, + "grad_norm": 0.09742506593465805, + "learning_rate": 4.869522501616884e-06, + "loss": 0.0006, + "num_input_tokens_seen": 4620880, + "step": 2347 + }, + { + "epoch": 0.3111994698475812, + "grad_norm": 10.257518768310547, + "learning_rate": 4.869411797975626e-06, + "loss": 0.2994, + "num_input_tokens_seen": 4622664, + "step": 2348 + }, + { + "epoch": 0.31133200795228627, + "grad_norm": 10.37560749053955, + "learning_rate": 4.869301048650474e-06, + "loss": 0.2133, + "num_input_tokens_seen": 4625760, + "step": 2349 + }, + { + "epoch": 0.31146454605699136, + "grad_norm": 3.146977663040161, + "learning_rate": 4.869190253643565e-06, + "loss": 0.082, + "num_input_tokens_seen": 4627384, + "step": 2350 + }, + { + "epoch": 0.3115970841616965, + "grad_norm": 17.42863655090332, + "learning_rate": 4.869079412957031e-06, + "loss": 0.0101, + "num_input_tokens_seen": 4629448, + "step": 2351 + }, + { + "epoch": 0.3117296222664016, + "grad_norm": 15.892781257629395, + "learning_rate": 4.868968526593013e-06, + "loss": 0.1414, + "num_input_tokens_seen": 4632032, + "step": 2352 + }, + { + "epoch": 0.3118621603711067, + "grad_norm": 0.18932265043258667, + "learning_rate": 4.868857594553647e-06, + "loss": 0.0012, + "num_input_tokens_seen": 4633640, + "step": 2353 + }, + { + "epoch": 0.3119946984758118, + "grad_norm": 0.05859370157122612, + "learning_rate": 4.8687466168410726e-06, + "loss": 0.0004, + "num_input_tokens_seen": 4635016, + "step": 2354 + }, + { + "epoch": 0.3121272365805169, + "grad_norm": 0.13451075553894043, + "learning_rate": 4.868635593457429e-06, + "loss": 0.0008, + "num_input_tokens_seen": 4637168, + "step": 2355 + }, + { + "epoch": 0.312259774685222, + "grad_norm": 7.182871341705322, + "learning_rate": 4.868524524404858e-06, + "loss": 0.14, + "num_input_tokens_seen": 4638720, + "step": 2356 + }, + { + "epoch": 0.31239231278992713, + "grad_norm": 0.08676090836524963, + "learning_rate": 4.868413409685499e-06, + "loss": 0.0006, + "num_input_tokens_seen": 4640088, + "step": 2357 + }, + { + "epoch": 0.3125248508946322, + "grad_norm": 8.121641159057617, + "learning_rate": 4.868302249301497e-06, + "loss": 0.2428, + "num_input_tokens_seen": 4641840, + "step": 2358 + }, + { + "epoch": 0.3126573889993373, + "grad_norm": 8.843557357788086, + "learning_rate": 4.8681910432549925e-06, + "loss": 0.2957, + "num_input_tokens_seen": 4643264, + "step": 2359 + }, + { + "epoch": 0.3127899271040424, + "grad_norm": 9.292862892150879, + "learning_rate": 4.868079791548131e-06, + "loss": 0.2335, + "num_input_tokens_seen": 4644536, + "step": 2360 + }, + { + "epoch": 0.3129224652087475, + "grad_norm": 11.492375373840332, + "learning_rate": 4.867968494183058e-06, + "loss": 0.3478, + "num_input_tokens_seen": 4646544, + "step": 2361 + }, + { + "epoch": 0.3130550033134526, + "grad_norm": 5.405326843261719, + "learning_rate": 4.86785715116192e-06, + "loss": 0.1007, + "num_input_tokens_seen": 4648752, + "step": 2362 + }, + { + "epoch": 0.31318754141815774, + "grad_norm": 11.212135314941406, + "learning_rate": 4.867745762486862e-06, + "loss": 0.4053, + "num_input_tokens_seen": 4651064, + "step": 2363 + }, + { + "epoch": 0.31332007952286284, + "grad_norm": 0.39075979590415955, + "learning_rate": 4.8676343281600306e-06, + "loss": 0.0025, + "num_input_tokens_seen": 4653416, + "step": 2364 + }, + { + "epoch": 0.31345261762756793, + "grad_norm": 6.183170795440674, + "learning_rate": 4.867522848183578e-06, + "loss": 0.2085, + "num_input_tokens_seen": 4655432, + "step": 2365 + }, + { + "epoch": 0.313585155732273, + "grad_norm": 9.377362251281738, + "learning_rate": 4.8674113225596495e-06, + "loss": 0.3691, + "num_input_tokens_seen": 4657448, + "step": 2366 + }, + { + "epoch": 0.3137176938369781, + "grad_norm": 16.857759475708008, + "learning_rate": 4.8672997512904e-06, + "loss": 0.346, + "num_input_tokens_seen": 4659840, + "step": 2367 + }, + { + "epoch": 0.3138502319416832, + "grad_norm": 27.054405212402344, + "learning_rate": 4.867188134377977e-06, + "loss": 0.6092, + "num_input_tokens_seen": 4661112, + "step": 2368 + }, + { + "epoch": 0.31398277004638836, + "grad_norm": 12.904631614685059, + "learning_rate": 4.8670764718245335e-06, + "loss": 0.2831, + "num_input_tokens_seen": 4662472, + "step": 2369 + }, + { + "epoch": 0.31411530815109345, + "grad_norm": 1.072129249572754, + "learning_rate": 4.866964763632222e-06, + "loss": 0.0068, + "num_input_tokens_seen": 4664096, + "step": 2370 + }, + { + "epoch": 0.31424784625579855, + "grad_norm": 6.47117280960083, + "learning_rate": 4.866853009803198e-06, + "loss": 0.1221, + "num_input_tokens_seen": 4665968, + "step": 2371 + }, + { + "epoch": 0.31438038436050364, + "grad_norm": 8.04733943939209, + "learning_rate": 4.866741210339615e-06, + "loss": 0.3006, + "num_input_tokens_seen": 4668752, + "step": 2372 + }, + { + "epoch": 0.31451292246520873, + "grad_norm": 8.533254623413086, + "learning_rate": 4.866629365243629e-06, + "loss": 0.1317, + "num_input_tokens_seen": 4670632, + "step": 2373 + }, + { + "epoch": 0.3146454605699138, + "grad_norm": 6.609121799468994, + "learning_rate": 4.866517474517396e-06, + "loss": 0.1295, + "num_input_tokens_seen": 4672288, + "step": 2374 + }, + { + "epoch": 0.314777998674619, + "grad_norm": 9.03375244140625, + "learning_rate": 4.866405538163073e-06, + "loss": 0.1818, + "num_input_tokens_seen": 4674160, + "step": 2375 + }, + { + "epoch": 0.31491053677932407, + "grad_norm": 0.3066478669643402, + "learning_rate": 4.86629355618282e-06, + "loss": 0.002, + "num_input_tokens_seen": 4675792, + "step": 2376 + }, + { + "epoch": 0.31504307488402916, + "grad_norm": 0.27516910433769226, + "learning_rate": 4.866181528578793e-06, + "loss": 0.0018, + "num_input_tokens_seen": 4677432, + "step": 2377 + }, + { + "epoch": 0.31517561298873426, + "grad_norm": 6.214582920074463, + "learning_rate": 4.866069455353155e-06, + "loss": 0.1704, + "num_input_tokens_seen": 4679904, + "step": 2378 + }, + { + "epoch": 0.31530815109343935, + "grad_norm": 0.33407238125801086, + "learning_rate": 4.8659573365080655e-06, + "loss": 0.0021, + "num_input_tokens_seen": 4681536, + "step": 2379 + }, + { + "epoch": 0.31544068919814444, + "grad_norm": 18.06239128112793, + "learning_rate": 4.865845172045686e-06, + "loss": 0.6775, + "num_input_tokens_seen": 4683944, + "step": 2380 + }, + { + "epoch": 0.3155732273028496, + "grad_norm": 11.272034645080566, + "learning_rate": 4.865732961968179e-06, + "loss": 0.2196, + "num_input_tokens_seen": 4686272, + "step": 2381 + }, + { + "epoch": 0.3157057654075547, + "grad_norm": 5.787203788757324, + "learning_rate": 4.865620706277709e-06, + "loss": 0.1253, + "num_input_tokens_seen": 4687424, + "step": 2382 + }, + { + "epoch": 0.3158383035122598, + "grad_norm": 13.923933029174805, + "learning_rate": 4.865508404976441e-06, + "loss": 0.4374, + "num_input_tokens_seen": 4689960, + "step": 2383 + }, + { + "epoch": 0.31597084161696487, + "grad_norm": 11.860907554626465, + "learning_rate": 4.865396058066538e-06, + "loss": 0.4108, + "num_input_tokens_seen": 4692296, + "step": 2384 + }, + { + "epoch": 0.31610337972166996, + "grad_norm": 11.430327415466309, + "learning_rate": 4.865283665550167e-06, + "loss": 0.4003, + "num_input_tokens_seen": 4694272, + "step": 2385 + }, + { + "epoch": 0.31623591782637506, + "grad_norm": 7.905951976776123, + "learning_rate": 4.865171227429495e-06, + "loss": 0.095, + "num_input_tokens_seen": 4696800, + "step": 2386 + }, + { + "epoch": 0.3163684559310802, + "grad_norm": 7.172763824462891, + "learning_rate": 4.865058743706691e-06, + "loss": 0.1535, + "num_input_tokens_seen": 4698528, + "step": 2387 + }, + { + "epoch": 0.3165009940357853, + "grad_norm": 9.360395431518555, + "learning_rate": 4.864946214383922e-06, + "loss": 0.2372, + "num_input_tokens_seen": 4700200, + "step": 2388 + }, + { + "epoch": 0.3166335321404904, + "grad_norm": 8.127171516418457, + "learning_rate": 4.8648336394633585e-06, + "loss": 0.1542, + "num_input_tokens_seen": 4702768, + "step": 2389 + }, + { + "epoch": 0.3167660702451955, + "grad_norm": 0.3152252733707428, + "learning_rate": 4.8647210189471726e-06, + "loss": 0.0019, + "num_input_tokens_seen": 4704712, + "step": 2390 + }, + { + "epoch": 0.3168986083499006, + "grad_norm": 2.2181358337402344, + "learning_rate": 4.864608352837532e-06, + "loss": 0.0039, + "num_input_tokens_seen": 4707312, + "step": 2391 + }, + { + "epoch": 0.3170311464546057, + "grad_norm": 0.11500225961208344, + "learning_rate": 4.864495641136612e-06, + "loss": 0.0007, + "num_input_tokens_seen": 4708512, + "step": 2392 + }, + { + "epoch": 0.3171636845593108, + "grad_norm": 11.490994453430176, + "learning_rate": 4.864382883846586e-06, + "loss": 0.2637, + "num_input_tokens_seen": 4710568, + "step": 2393 + }, + { + "epoch": 0.3172962226640159, + "grad_norm": 12.361715316772461, + "learning_rate": 4.864270080969625e-06, + "loss": 0.1677, + "num_input_tokens_seen": 4713128, + "step": 2394 + }, + { + "epoch": 0.317428760768721, + "grad_norm": 7.869872093200684, + "learning_rate": 4.864157232507907e-06, + "loss": 0.1173, + "num_input_tokens_seen": 4715064, + "step": 2395 + }, + { + "epoch": 0.3175612988734261, + "grad_norm": 4.561809062957764, + "learning_rate": 4.864044338463606e-06, + "loss": 0.0363, + "num_input_tokens_seen": 4717384, + "step": 2396 + }, + { + "epoch": 0.3176938369781312, + "grad_norm": 5.06044864654541, + "learning_rate": 4.863931398838899e-06, + "loss": 0.109, + "num_input_tokens_seen": 4719560, + "step": 2397 + }, + { + "epoch": 0.3178263750828363, + "grad_norm": 13.439164161682129, + "learning_rate": 4.863818413635965e-06, + "loss": 0.4624, + "num_input_tokens_seen": 4722008, + "step": 2398 + }, + { + "epoch": 0.31795891318754144, + "grad_norm": 10.076236724853516, + "learning_rate": 4.8637053828569805e-06, + "loss": 0.3145, + "num_input_tokens_seen": 4724496, + "step": 2399 + }, + { + "epoch": 0.31809145129224653, + "grad_norm": 0.09990496933460236, + "learning_rate": 4.863592306504125e-06, + "loss": 0.0006, + "num_input_tokens_seen": 4726496, + "step": 2400 + }, + { + "epoch": 0.3182239893969516, + "grad_norm": 0.03365609422326088, + "learning_rate": 4.863479184579581e-06, + "loss": 0.0002, + "num_input_tokens_seen": 4727584, + "step": 2401 + }, + { + "epoch": 0.3183565275016567, + "grad_norm": 9.044075012207031, + "learning_rate": 4.8633660170855255e-06, + "loss": 0.4158, + "num_input_tokens_seen": 4729672, + "step": 2402 + }, + { + "epoch": 0.3184890656063618, + "grad_norm": 0.04071224108338356, + "learning_rate": 4.863252804024145e-06, + "loss": 0.0002, + "num_input_tokens_seen": 4730992, + "step": 2403 + }, + { + "epoch": 0.31862160371106696, + "grad_norm": 29.465429306030273, + "learning_rate": 4.863139545397619e-06, + "loss": 1.0326, + "num_input_tokens_seen": 4732744, + "step": 2404 + }, + { + "epoch": 0.31875414181577205, + "grad_norm": 9.384528160095215, + "learning_rate": 4.863026241208132e-06, + "loss": 0.1982, + "num_input_tokens_seen": 4734560, + "step": 2405 + }, + { + "epoch": 0.31888667992047715, + "grad_norm": 15.243741989135742, + "learning_rate": 4.8629128914578695e-06, + "loss": 0.4678, + "num_input_tokens_seen": 4736760, + "step": 2406 + }, + { + "epoch": 0.31901921802518224, + "grad_norm": 7.035901069641113, + "learning_rate": 4.862799496149017e-06, + "loss": 0.2825, + "num_input_tokens_seen": 4738344, + "step": 2407 + }, + { + "epoch": 0.31915175612988733, + "grad_norm": 10.949930191040039, + "learning_rate": 4.8626860552837585e-06, + "loss": 0.1726, + "num_input_tokens_seen": 4739928, + "step": 2408 + }, + { + "epoch": 0.3192842942345924, + "grad_norm": 24.149385452270508, + "learning_rate": 4.862572568864285e-06, + "loss": 0.6593, + "num_input_tokens_seen": 4742408, + "step": 2409 + }, + { + "epoch": 0.3194168323392976, + "grad_norm": 14.280720710754395, + "learning_rate": 4.86245903689278e-06, + "loss": 0.539, + "num_input_tokens_seen": 4744752, + "step": 2410 + }, + { + "epoch": 0.31954937044400267, + "grad_norm": 15.06515884399414, + "learning_rate": 4.862345459371437e-06, + "loss": 0.3699, + "num_input_tokens_seen": 4746504, + "step": 2411 + }, + { + "epoch": 0.31968190854870776, + "grad_norm": 0.07665840536355972, + "learning_rate": 4.862231836302444e-06, + "loss": 0.0005, + "num_input_tokens_seen": 4747736, + "step": 2412 + }, + { + "epoch": 0.31981444665341285, + "grad_norm": 11.211732864379883, + "learning_rate": 4.862118167687991e-06, + "loss": 0.2193, + "num_input_tokens_seen": 4749712, + "step": 2413 + }, + { + "epoch": 0.31994698475811795, + "grad_norm": 15.756593704223633, + "learning_rate": 4.862004453530269e-06, + "loss": 0.45, + "num_input_tokens_seen": 4751960, + "step": 2414 + }, + { + "epoch": 0.32007952286282304, + "grad_norm": 8.564964294433594, + "learning_rate": 4.861890693831472e-06, + "loss": 0.1738, + "num_input_tokens_seen": 4754992, + "step": 2415 + }, + { + "epoch": 0.3202120609675282, + "grad_norm": 5.211246967315674, + "learning_rate": 4.8617768885937944e-06, + "loss": 0.1933, + "num_input_tokens_seen": 4757832, + "step": 2416 + }, + { + "epoch": 0.3203445990722333, + "grad_norm": 5.289076328277588, + "learning_rate": 4.861663037819428e-06, + "loss": 0.0431, + "num_input_tokens_seen": 4759272, + "step": 2417 + }, + { + "epoch": 0.3204771371769384, + "grad_norm": 12.89932632446289, + "learning_rate": 4.86154914151057e-06, + "loss": 0.4088, + "num_input_tokens_seen": 4760960, + "step": 2418 + }, + { + "epoch": 0.32060967528164347, + "grad_norm": 0.10930877923965454, + "learning_rate": 4.861435199669414e-06, + "loss": 0.0007, + "num_input_tokens_seen": 4762392, + "step": 2419 + }, + { + "epoch": 0.32074221338634856, + "grad_norm": 12.59751033782959, + "learning_rate": 4.86132121229816e-06, + "loss": 0.5084, + "num_input_tokens_seen": 4764512, + "step": 2420 + }, + { + "epoch": 0.32087475149105366, + "grad_norm": 0.10619465261697769, + "learning_rate": 4.8612071793990025e-06, + "loss": 0.0006, + "num_input_tokens_seen": 4765744, + "step": 2421 + }, + { + "epoch": 0.3210072895957588, + "grad_norm": 11.316045761108398, + "learning_rate": 4.861093100974143e-06, + "loss": 0.1684, + "num_input_tokens_seen": 4767816, + "step": 2422 + }, + { + "epoch": 0.3211398277004639, + "grad_norm": 20.91433334350586, + "learning_rate": 4.860978977025779e-06, + "loss": 0.9307, + "num_input_tokens_seen": 4770264, + "step": 2423 + }, + { + "epoch": 0.321272365805169, + "grad_norm": 0.16921111941337585, + "learning_rate": 4.86086480755611e-06, + "loss": 0.001, + "num_input_tokens_seen": 4771952, + "step": 2424 + }, + { + "epoch": 0.3214049039098741, + "grad_norm": 6.8373894691467285, + "learning_rate": 4.8607505925673414e-06, + "loss": 0.062, + "num_input_tokens_seen": 4773352, + "step": 2425 + }, + { + "epoch": 0.3215374420145792, + "grad_norm": 0.16040894389152527, + "learning_rate": 4.860636332061671e-06, + "loss": 0.0009, + "num_input_tokens_seen": 4775432, + "step": 2426 + }, + { + "epoch": 0.32166998011928427, + "grad_norm": 14.172039031982422, + "learning_rate": 4.860522026041304e-06, + "loss": 0.3022, + "num_input_tokens_seen": 4777392, + "step": 2427 + }, + { + "epoch": 0.3218025182239894, + "grad_norm": 10.313090324401855, + "learning_rate": 4.860407674508445e-06, + "loss": 0.3031, + "num_input_tokens_seen": 4779728, + "step": 2428 + }, + { + "epoch": 0.3219350563286945, + "grad_norm": 0.8662859797477722, + "learning_rate": 4.860293277465296e-06, + "loss": 0.0022, + "num_input_tokens_seen": 4781424, + "step": 2429 + }, + { + "epoch": 0.3220675944333996, + "grad_norm": 0.09188725054264069, + "learning_rate": 4.860178834914066e-06, + "loss": 0.0005, + "num_input_tokens_seen": 4782480, + "step": 2430 + }, + { + "epoch": 0.3222001325381047, + "grad_norm": 5.912001609802246, + "learning_rate": 4.860064346856959e-06, + "loss": 0.1457, + "num_input_tokens_seen": 4784264, + "step": 2431 + }, + { + "epoch": 0.3223326706428098, + "grad_norm": 0.1483450084924698, + "learning_rate": 4.859949813296184e-06, + "loss": 0.0008, + "num_input_tokens_seen": 4786752, + "step": 2432 + }, + { + "epoch": 0.3224652087475149, + "grad_norm": 7.647861957550049, + "learning_rate": 4.859835234233947e-06, + "loss": 0.1633, + "num_input_tokens_seen": 4788856, + "step": 2433 + }, + { + "epoch": 0.32259774685222004, + "grad_norm": 0.1386691778898239, + "learning_rate": 4.85972060967246e-06, + "loss": 0.0008, + "num_input_tokens_seen": 4789936, + "step": 2434 + }, + { + "epoch": 0.32273028495692513, + "grad_norm": 2.059891939163208, + "learning_rate": 4.859605939613932e-06, + "loss": 0.0393, + "num_input_tokens_seen": 4792120, + "step": 2435 + }, + { + "epoch": 0.3228628230616302, + "grad_norm": 0.15478159487247467, + "learning_rate": 4.859491224060572e-06, + "loss": 0.0009, + "num_input_tokens_seen": 4793984, + "step": 2436 + }, + { + "epoch": 0.3229953611663353, + "grad_norm": 8.184882164001465, + "learning_rate": 4.8593764630145955e-06, + "loss": 0.1682, + "num_input_tokens_seen": 4795624, + "step": 2437 + }, + { + "epoch": 0.3231278992710404, + "grad_norm": 14.103765487670898, + "learning_rate": 4.859261656478211e-06, + "loss": 0.4096, + "num_input_tokens_seen": 4797152, + "step": 2438 + }, + { + "epoch": 0.3232604373757455, + "grad_norm": 11.606586456298828, + "learning_rate": 4.859146804453636e-06, + "loss": 0.3912, + "num_input_tokens_seen": 4798984, + "step": 2439 + }, + { + "epoch": 0.32339297548045065, + "grad_norm": 3.270960569381714, + "learning_rate": 4.859031906943082e-06, + "loss": 0.0972, + "num_input_tokens_seen": 4800688, + "step": 2440 + }, + { + "epoch": 0.32352551358515574, + "grad_norm": 6.662178039550781, + "learning_rate": 4.858916963948767e-06, + "loss": 0.0694, + "num_input_tokens_seen": 4802464, + "step": 2441 + }, + { + "epoch": 0.32365805168986084, + "grad_norm": 3.5934388637542725, + "learning_rate": 4.858801975472904e-06, + "loss": 0.0596, + "num_input_tokens_seen": 4804184, + "step": 2442 + }, + { + "epoch": 0.32379058979456593, + "grad_norm": 7.985162258148193, + "learning_rate": 4.858686941517712e-06, + "loss": 0.2226, + "num_input_tokens_seen": 4806176, + "step": 2443 + }, + { + "epoch": 0.323923127899271, + "grad_norm": 13.521080017089844, + "learning_rate": 4.858571862085409e-06, + "loss": 0.235, + "num_input_tokens_seen": 4807752, + "step": 2444 + }, + { + "epoch": 0.3240556660039761, + "grad_norm": 6.998464584350586, + "learning_rate": 4.8584567371782134e-06, + "loss": 0.2099, + "num_input_tokens_seen": 4810544, + "step": 2445 + }, + { + "epoch": 0.32418820410868127, + "grad_norm": 0.3152054250240326, + "learning_rate": 4.858341566798344e-06, + "loss": 0.0016, + "num_input_tokens_seen": 4812168, + "step": 2446 + }, + { + "epoch": 0.32432074221338636, + "grad_norm": 1.6591954231262207, + "learning_rate": 4.858226350948023e-06, + "loss": 0.0071, + "num_input_tokens_seen": 4813648, + "step": 2447 + }, + { + "epoch": 0.32445328031809145, + "grad_norm": 6.460522651672363, + "learning_rate": 4.858111089629472e-06, + "loss": 0.0952, + "num_input_tokens_seen": 4815288, + "step": 2448 + }, + { + "epoch": 0.32458581842279655, + "grad_norm": 0.23468884825706482, + "learning_rate": 4.857995782844911e-06, + "loss": 0.0008, + "num_input_tokens_seen": 4817048, + "step": 2449 + }, + { + "epoch": 0.32471835652750164, + "grad_norm": 0.535457968711853, + "learning_rate": 4.8578804305965645e-06, + "loss": 0.0029, + "num_input_tokens_seen": 4819232, + "step": 2450 + }, + { + "epoch": 0.32485089463220673, + "grad_norm": 18.374446868896484, + "learning_rate": 4.857765032886657e-06, + "loss": 0.8662, + "num_input_tokens_seen": 4822720, + "step": 2451 + }, + { + "epoch": 0.3249834327369119, + "grad_norm": 2.967433214187622, + "learning_rate": 4.857649589717413e-06, + "loss": 0.0131, + "num_input_tokens_seen": 4824576, + "step": 2452 + }, + { + "epoch": 0.325115970841617, + "grad_norm": 0.044765494763851166, + "learning_rate": 4.857534101091059e-06, + "loss": 0.0003, + "num_input_tokens_seen": 4827352, + "step": 2453 + }, + { + "epoch": 0.32524850894632207, + "grad_norm": 19.87103271484375, + "learning_rate": 4.857418567009821e-06, + "loss": 0.8718, + "num_input_tokens_seen": 4829512, + "step": 2454 + }, + { + "epoch": 0.32538104705102716, + "grad_norm": 17.599321365356445, + "learning_rate": 4.857302987475927e-06, + "loss": 0.6143, + "num_input_tokens_seen": 4831112, + "step": 2455 + }, + { + "epoch": 0.32551358515573225, + "grad_norm": 8.495377540588379, + "learning_rate": 4.857187362491605e-06, + "loss": 0.1442, + "num_input_tokens_seen": 4833416, + "step": 2456 + }, + { + "epoch": 0.3256461232604374, + "grad_norm": 0.08904121071100235, + "learning_rate": 4.857071692059085e-06, + "loss": 0.0005, + "num_input_tokens_seen": 4834496, + "step": 2457 + }, + { + "epoch": 0.3257786613651425, + "grad_norm": 21.11811637878418, + "learning_rate": 4.856955976180595e-06, + "loss": 0.7141, + "num_input_tokens_seen": 4836240, + "step": 2458 + }, + { + "epoch": 0.3259111994698476, + "grad_norm": 7.011678218841553, + "learning_rate": 4.856840214858369e-06, + "loss": 0.1682, + "num_input_tokens_seen": 4838648, + "step": 2459 + }, + { + "epoch": 0.3260437375745527, + "grad_norm": 9.919988632202148, + "learning_rate": 4.856724408094637e-06, + "loss": 0.2142, + "num_input_tokens_seen": 4841000, + "step": 2460 + }, + { + "epoch": 0.3261762756792578, + "grad_norm": 12.991177558898926, + "learning_rate": 4.856608555891634e-06, + "loss": 0.21, + "num_input_tokens_seen": 4842776, + "step": 2461 + }, + { + "epoch": 0.32630881378396287, + "grad_norm": 20.477828979492188, + "learning_rate": 4.856492658251591e-06, + "loss": 0.6444, + "num_input_tokens_seen": 4844944, + "step": 2462 + }, + { + "epoch": 0.326441351888668, + "grad_norm": 17.48164176940918, + "learning_rate": 4.8563767151767446e-06, + "loss": 0.5143, + "num_input_tokens_seen": 4846152, + "step": 2463 + }, + { + "epoch": 0.3265738899933731, + "grad_norm": 0.2186077982187271, + "learning_rate": 4.856260726669329e-06, + "loss": 0.0013, + "num_input_tokens_seen": 4847488, + "step": 2464 + }, + { + "epoch": 0.3267064280980782, + "grad_norm": 15.246328353881836, + "learning_rate": 4.856144692731582e-06, + "loss": 1.0336, + "num_input_tokens_seen": 4850600, + "step": 2465 + }, + { + "epoch": 0.3268389662027833, + "grad_norm": 11.29224681854248, + "learning_rate": 4.85602861336574e-06, + "loss": 0.3545, + "num_input_tokens_seen": 4852560, + "step": 2466 + }, + { + "epoch": 0.3269715043074884, + "grad_norm": 20.979074478149414, + "learning_rate": 4.855912488574039e-06, + "loss": 0.5786, + "num_input_tokens_seen": 4854432, + "step": 2467 + }, + { + "epoch": 0.3271040424121935, + "grad_norm": 0.33819472789764404, + "learning_rate": 4.855796318358722e-06, + "loss": 0.002, + "num_input_tokens_seen": 4856384, + "step": 2468 + }, + { + "epoch": 0.32723658051689863, + "grad_norm": 6.447828769683838, + "learning_rate": 4.855680102722026e-06, + "loss": 0.1995, + "num_input_tokens_seen": 4858552, + "step": 2469 + }, + { + "epoch": 0.3273691186216037, + "grad_norm": 2.3483965396881104, + "learning_rate": 4.8555638416661924e-06, + "loss": 0.013, + "num_input_tokens_seen": 4859896, + "step": 2470 + }, + { + "epoch": 0.3275016567263088, + "grad_norm": 11.520812034606934, + "learning_rate": 4.8554475351934635e-06, + "loss": 0.1894, + "num_input_tokens_seen": 4861528, + "step": 2471 + }, + { + "epoch": 0.3276341948310139, + "grad_norm": 0.5696682929992676, + "learning_rate": 4.8553311833060805e-06, + "loss": 0.0035, + "num_input_tokens_seen": 4862648, + "step": 2472 + }, + { + "epoch": 0.327766732935719, + "grad_norm": 6.993871688842773, + "learning_rate": 4.855214786006288e-06, + "loss": 0.2435, + "num_input_tokens_seen": 4865056, + "step": 2473 + }, + { + "epoch": 0.3278992710404241, + "grad_norm": 0.48319047689437866, + "learning_rate": 4.855098343296329e-06, + "loss": 0.003, + "num_input_tokens_seen": 4866872, + "step": 2474 + }, + { + "epoch": 0.32803180914512925, + "grad_norm": 10.132923126220703, + "learning_rate": 4.85498185517845e-06, + "loss": 0.138, + "num_input_tokens_seen": 4868936, + "step": 2475 + }, + { + "epoch": 0.32816434724983434, + "grad_norm": 1.9582284688949585, + "learning_rate": 4.854865321654896e-06, + "loss": 0.0121, + "num_input_tokens_seen": 4870592, + "step": 2476 + }, + { + "epoch": 0.32829688535453944, + "grad_norm": 14.191243171691895, + "learning_rate": 4.854748742727914e-06, + "loss": 0.4173, + "num_input_tokens_seen": 4872192, + "step": 2477 + }, + { + "epoch": 0.32842942345924453, + "grad_norm": 8.466787338256836, + "learning_rate": 4.854632118399751e-06, + "loss": 0.0963, + "num_input_tokens_seen": 4874112, + "step": 2478 + }, + { + "epoch": 0.3285619615639496, + "grad_norm": 7.038083076477051, + "learning_rate": 4.854515448672658e-06, + "loss": 0.1798, + "num_input_tokens_seen": 4875176, + "step": 2479 + }, + { + "epoch": 0.3286944996686547, + "grad_norm": 13.709606170654297, + "learning_rate": 4.854398733548882e-06, + "loss": 0.3756, + "num_input_tokens_seen": 4877640, + "step": 2480 + }, + { + "epoch": 0.32882703777335986, + "grad_norm": 7.141223907470703, + "learning_rate": 4.854281973030674e-06, + "loss": 0.1695, + "num_input_tokens_seen": 4879976, + "step": 2481 + }, + { + "epoch": 0.32895957587806496, + "grad_norm": 1.346604585647583, + "learning_rate": 4.854165167120285e-06, + "loss": 0.0083, + "num_input_tokens_seen": 4881848, + "step": 2482 + }, + { + "epoch": 0.32909211398277005, + "grad_norm": 6.044533729553223, + "learning_rate": 4.854048315819968e-06, + "loss": 0.1629, + "num_input_tokens_seen": 4883520, + "step": 2483 + }, + { + "epoch": 0.32922465208747514, + "grad_norm": 5.787064075469971, + "learning_rate": 4.853931419131976e-06, + "loss": 0.2168, + "num_input_tokens_seen": 4884640, + "step": 2484 + }, + { + "epoch": 0.32935719019218024, + "grad_norm": 14.233854293823242, + "learning_rate": 4.853814477058561e-06, + "loss": 0.49, + "num_input_tokens_seen": 4886864, + "step": 2485 + }, + { + "epoch": 0.32948972829688533, + "grad_norm": 14.842486381530762, + "learning_rate": 4.853697489601981e-06, + "loss": 0.5716, + "num_input_tokens_seen": 4889024, + "step": 2486 + }, + { + "epoch": 0.3296222664015905, + "grad_norm": 5.033909320831299, + "learning_rate": 4.8535804567644886e-06, + "loss": 0.0394, + "num_input_tokens_seen": 4891248, + "step": 2487 + }, + { + "epoch": 0.3297548045062956, + "grad_norm": 0.6181367039680481, + "learning_rate": 4.853463378548341e-06, + "loss": 0.0037, + "num_input_tokens_seen": 4893656, + "step": 2488 + }, + { + "epoch": 0.32988734261100067, + "grad_norm": 21.318761825561523, + "learning_rate": 4.853346254955796e-06, + "loss": 0.7032, + "num_input_tokens_seen": 4895816, + "step": 2489 + }, + { + "epoch": 0.33001988071570576, + "grad_norm": 2.9122464656829834, + "learning_rate": 4.8532290859891125e-06, + "loss": 0.0414, + "num_input_tokens_seen": 4897624, + "step": 2490 + }, + { + "epoch": 0.33015241882041085, + "grad_norm": 0.5618943572044373, + "learning_rate": 4.853111871650548e-06, + "loss": 0.0034, + "num_input_tokens_seen": 4898888, + "step": 2491 + }, + { + "epoch": 0.33028495692511595, + "grad_norm": 14.67484188079834, + "learning_rate": 4.852994611942364e-06, + "loss": 0.8535, + "num_input_tokens_seen": 4901672, + "step": 2492 + }, + { + "epoch": 0.3304174950298211, + "grad_norm": 3.7947425842285156, + "learning_rate": 4.85287730686682e-06, + "loss": 0.0248, + "num_input_tokens_seen": 4903704, + "step": 2493 + }, + { + "epoch": 0.3305500331345262, + "grad_norm": 9.250006675720215, + "learning_rate": 4.852759956426178e-06, + "loss": 0.2065, + "num_input_tokens_seen": 4905544, + "step": 2494 + }, + { + "epoch": 0.3306825712392313, + "grad_norm": 14.868971824645996, + "learning_rate": 4.852642560622702e-06, + "loss": 0.4638, + "num_input_tokens_seen": 4907304, + "step": 2495 + }, + { + "epoch": 0.3308151093439364, + "grad_norm": 16.35065460205078, + "learning_rate": 4.852525119458654e-06, + "loss": 0.5164, + "num_input_tokens_seen": 4909720, + "step": 2496 + }, + { + "epoch": 0.33094764744864147, + "grad_norm": 0.8608124852180481, + "learning_rate": 4.852407632936299e-06, + "loss": 0.0047, + "num_input_tokens_seen": 4911344, + "step": 2497 + }, + { + "epoch": 0.33108018555334656, + "grad_norm": 6.037642002105713, + "learning_rate": 4.852290101057901e-06, + "loss": 0.0996, + "num_input_tokens_seen": 4914296, + "step": 2498 + }, + { + "epoch": 0.3312127236580517, + "grad_norm": 5.970702171325684, + "learning_rate": 4.852172523825729e-06, + "loss": 0.18, + "num_input_tokens_seen": 4916032, + "step": 2499 + }, + { + "epoch": 0.3313452617627568, + "grad_norm": 1.6504173278808594, + "learning_rate": 4.852054901242046e-06, + "loss": 0.0085, + "num_input_tokens_seen": 4917976, + "step": 2500 + }, + { + "epoch": 0.3314777998674619, + "grad_norm": 1.5778703689575195, + "learning_rate": 4.851937233309123e-06, + "loss": 0.0071, + "num_input_tokens_seen": 4920024, + "step": 2501 + }, + { + "epoch": 0.331610337972167, + "grad_norm": 2.2431654930114746, + "learning_rate": 4.851819520029227e-06, + "loss": 0.0355, + "num_input_tokens_seen": 4921448, + "step": 2502 + }, + { + "epoch": 0.3317428760768721, + "grad_norm": 13.440147399902344, + "learning_rate": 4.851701761404629e-06, + "loss": 0.4951, + "num_input_tokens_seen": 4923336, + "step": 2503 + }, + { + "epoch": 0.3318754141815772, + "grad_norm": 0.7404280304908752, + "learning_rate": 4.851583957437598e-06, + "loss": 0.0037, + "num_input_tokens_seen": 4925056, + "step": 2504 + }, + { + "epoch": 0.3320079522862823, + "grad_norm": 11.032755851745605, + "learning_rate": 4.851466108130406e-06, + "loss": 0.438, + "num_input_tokens_seen": 4926792, + "step": 2505 + }, + { + "epoch": 0.3321404903909874, + "grad_norm": 10.95981216430664, + "learning_rate": 4.851348213485326e-06, + "loss": 0.491, + "num_input_tokens_seen": 4928664, + "step": 2506 + }, + { + "epoch": 0.3322730284956925, + "grad_norm": 17.006113052368164, + "learning_rate": 4.85123027350463e-06, + "loss": 0.5775, + "num_input_tokens_seen": 4931424, + "step": 2507 + }, + { + "epoch": 0.3324055666003976, + "grad_norm": 0.47183868288993835, + "learning_rate": 4.851112288190592e-06, + "loss": 0.0023, + "num_input_tokens_seen": 4932824, + "step": 2508 + }, + { + "epoch": 0.3325381047051027, + "grad_norm": 8.050056457519531, + "learning_rate": 4.850994257545488e-06, + "loss": 0.339, + "num_input_tokens_seen": 4935664, + "step": 2509 + }, + { + "epoch": 0.3326706428098078, + "grad_norm": 0.3868926167488098, + "learning_rate": 4.850876181571592e-06, + "loss": 0.0016, + "num_input_tokens_seen": 4938328, + "step": 2510 + }, + { + "epoch": 0.33280318091451294, + "grad_norm": 14.420541763305664, + "learning_rate": 4.850758060271181e-06, + "loss": 0.6009, + "num_input_tokens_seen": 4940472, + "step": 2511 + }, + { + "epoch": 0.33293571901921803, + "grad_norm": 0.22953245043754578, + "learning_rate": 4.850639893646534e-06, + "loss": 0.0012, + "num_input_tokens_seen": 4941968, + "step": 2512 + }, + { + "epoch": 0.33306825712392313, + "grad_norm": 13.4358549118042, + "learning_rate": 4.850521681699928e-06, + "loss": 0.5495, + "num_input_tokens_seen": 4943256, + "step": 2513 + }, + { + "epoch": 0.3332007952286282, + "grad_norm": 1.4146995544433594, + "learning_rate": 4.850403424433642e-06, + "loss": 0.0062, + "num_input_tokens_seen": 4945560, + "step": 2514 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 6.421215057373047, + "learning_rate": 4.8502851218499566e-06, + "loss": 0.1505, + "num_input_tokens_seen": 4947256, + "step": 2515 + }, + { + "epoch": 0.33346587143803846, + "grad_norm": 13.972963333129883, + "learning_rate": 4.850166773951153e-06, + "loss": 0.4566, + "num_input_tokens_seen": 4949064, + "step": 2516 + }, + { + "epoch": 0.33359840954274356, + "grad_norm": 12.054634094238281, + "learning_rate": 4.850048380739512e-06, + "loss": 0.5551, + "num_input_tokens_seen": 4951520, + "step": 2517 + }, + { + "epoch": 0.33373094764744865, + "grad_norm": 0.5132588148117065, + "learning_rate": 4.849929942217318e-06, + "loss": 0.0028, + "num_input_tokens_seen": 4953472, + "step": 2518 + }, + { + "epoch": 0.33386348575215374, + "grad_norm": 12.718517303466797, + "learning_rate": 4.849811458386853e-06, + "loss": 0.3673, + "num_input_tokens_seen": 4955392, + "step": 2519 + }, + { + "epoch": 0.33399602385685884, + "grad_norm": 0.06976090371608734, + "learning_rate": 4.849692929250401e-06, + "loss": 0.0004, + "num_input_tokens_seen": 4956336, + "step": 2520 + }, + { + "epoch": 0.33412856196156393, + "grad_norm": 13.524518013000488, + "learning_rate": 4.849574354810251e-06, + "loss": 0.4958, + "num_input_tokens_seen": 4958872, + "step": 2521 + }, + { + "epoch": 0.3342611000662691, + "grad_norm": 13.198885917663574, + "learning_rate": 4.849455735068684e-06, + "loss": 0.5385, + "num_input_tokens_seen": 4960768, + "step": 2522 + }, + { + "epoch": 0.3343936381709742, + "grad_norm": 0.14774581789970398, + "learning_rate": 4.8493370700279905e-06, + "loss": 0.0009, + "num_input_tokens_seen": 4963336, + "step": 2523 + }, + { + "epoch": 0.33452617627567927, + "grad_norm": 10.848809242248535, + "learning_rate": 4.849218359690458e-06, + "loss": 0.2331, + "num_input_tokens_seen": 4965240, + "step": 2524 + }, + { + "epoch": 0.33465871438038436, + "grad_norm": 0.13256627321243286, + "learning_rate": 4.849099604058375e-06, + "loss": 0.0008, + "num_input_tokens_seen": 4967208, + "step": 2525 + }, + { + "epoch": 0.33479125248508945, + "grad_norm": 5.851517677307129, + "learning_rate": 4.84898080313403e-06, + "loss": 0.0832, + "num_input_tokens_seen": 4969416, + "step": 2526 + }, + { + "epoch": 0.33492379058979455, + "grad_norm": 16.57158088684082, + "learning_rate": 4.848861956919715e-06, + "loss": 0.206, + "num_input_tokens_seen": 4971528, + "step": 2527 + }, + { + "epoch": 0.3350563286944997, + "grad_norm": 2.150195837020874, + "learning_rate": 4.848743065417722e-06, + "loss": 0.0522, + "num_input_tokens_seen": 4973120, + "step": 2528 + }, + { + "epoch": 0.3351888667992048, + "grad_norm": 0.17715318500995636, + "learning_rate": 4.8486241286303415e-06, + "loss": 0.0011, + "num_input_tokens_seen": 4974960, + "step": 2529 + }, + { + "epoch": 0.3353214049039099, + "grad_norm": 11.661783218383789, + "learning_rate": 4.848505146559868e-06, + "loss": 0.2968, + "num_input_tokens_seen": 4976984, + "step": 2530 + }, + { + "epoch": 0.335453943008615, + "grad_norm": 15.448066711425781, + "learning_rate": 4.848386119208595e-06, + "loss": 0.4958, + "num_input_tokens_seen": 4980304, + "step": 2531 + }, + { + "epoch": 0.33558648111332007, + "grad_norm": 6.0017828941345215, + "learning_rate": 4.848267046578817e-06, + "loss": 0.1532, + "num_input_tokens_seen": 4982344, + "step": 2532 + }, + { + "epoch": 0.33571901921802516, + "grad_norm": 3.1475830078125, + "learning_rate": 4.848147928672831e-06, + "loss": 0.0737, + "num_input_tokens_seen": 4984304, + "step": 2533 + }, + { + "epoch": 0.3358515573227303, + "grad_norm": 9.871624946594238, + "learning_rate": 4.8480287654929326e-06, + "loss": 0.2759, + "num_input_tokens_seen": 4986216, + "step": 2534 + }, + { + "epoch": 0.3359840954274354, + "grad_norm": 8.20520305633545, + "learning_rate": 4.84790955704142e-06, + "loss": 0.2514, + "num_input_tokens_seen": 4987912, + "step": 2535 + }, + { + "epoch": 0.3361166335321405, + "grad_norm": 14.068840980529785, + "learning_rate": 4.847790303320592e-06, + "loss": 0.2199, + "num_input_tokens_seen": 4989104, + "step": 2536 + }, + { + "epoch": 0.3362491716368456, + "grad_norm": 13.081754684448242, + "learning_rate": 4.847671004332746e-06, + "loss": 0.4257, + "num_input_tokens_seen": 4990960, + "step": 2537 + }, + { + "epoch": 0.3363817097415507, + "grad_norm": 1.3506258726119995, + "learning_rate": 4.847551660080184e-06, + "loss": 0.0086, + "num_input_tokens_seen": 4993200, + "step": 2538 + }, + { + "epoch": 0.3365142478462558, + "grad_norm": 1.0557507276535034, + "learning_rate": 4.847432270565207e-06, + "loss": 0.0068, + "num_input_tokens_seen": 4994464, + "step": 2539 + }, + { + "epoch": 0.3366467859509609, + "grad_norm": 12.210657119750977, + "learning_rate": 4.847312835790117e-06, + "loss": 0.1723, + "num_input_tokens_seen": 4996960, + "step": 2540 + }, + { + "epoch": 0.336779324055666, + "grad_norm": 1.9731026887893677, + "learning_rate": 4.847193355757215e-06, + "loss": 0.0121, + "num_input_tokens_seen": 4999848, + "step": 2541 + }, + { + "epoch": 0.3369118621603711, + "grad_norm": 8.374910354614258, + "learning_rate": 4.8470738304688055e-06, + "loss": 0.2966, + "num_input_tokens_seen": 5002256, + "step": 2542 + }, + { + "epoch": 0.3370444002650762, + "grad_norm": 0.3551548719406128, + "learning_rate": 4.846954259927195e-06, + "loss": 0.0022, + "num_input_tokens_seen": 5004672, + "step": 2543 + }, + { + "epoch": 0.3371769383697813, + "grad_norm": 14.602219581604004, + "learning_rate": 4.846834644134686e-06, + "loss": 0.507, + "num_input_tokens_seen": 5006896, + "step": 2544 + }, + { + "epoch": 0.3373094764744864, + "grad_norm": 17.20040512084961, + "learning_rate": 4.8467149830935865e-06, + "loss": 0.6817, + "num_input_tokens_seen": 5007968, + "step": 2545 + }, + { + "epoch": 0.33744201457919154, + "grad_norm": 9.876198768615723, + "learning_rate": 4.846595276806203e-06, + "loss": 0.3634, + "num_input_tokens_seen": 5009816, + "step": 2546 + }, + { + "epoch": 0.33757455268389663, + "grad_norm": 0.33512306213378906, + "learning_rate": 4.846475525274844e-06, + "loss": 0.0021, + "num_input_tokens_seen": 5011016, + "step": 2547 + }, + { + "epoch": 0.3377070907886017, + "grad_norm": 20.322330474853516, + "learning_rate": 4.846355728501817e-06, + "loss": 0.4798, + "num_input_tokens_seen": 5012856, + "step": 2548 + }, + { + "epoch": 0.3378396288933068, + "grad_norm": 8.270444869995117, + "learning_rate": 4.8462358864894345e-06, + "loss": 0.1673, + "num_input_tokens_seen": 5014592, + "step": 2549 + }, + { + "epoch": 0.3379721669980119, + "grad_norm": 15.18359661102295, + "learning_rate": 4.8461159992400044e-06, + "loss": 0.547, + "num_input_tokens_seen": 5016760, + "step": 2550 + }, + { + "epoch": 0.338104705102717, + "grad_norm": 4.201569557189941, + "learning_rate": 4.84599606675584e-06, + "loss": 0.0104, + "num_input_tokens_seen": 5019664, + "step": 2551 + }, + { + "epoch": 0.33823724320742216, + "grad_norm": 10.736529350280762, + "learning_rate": 4.845876089039253e-06, + "loss": 0.2526, + "num_input_tokens_seen": 5021336, + "step": 2552 + }, + { + "epoch": 0.33836978131212725, + "grad_norm": 4.422069549560547, + "learning_rate": 4.845756066092556e-06, + "loss": 0.0624, + "num_input_tokens_seen": 5022448, + "step": 2553 + }, + { + "epoch": 0.33850231941683234, + "grad_norm": 0.2259720116853714, + "learning_rate": 4.845635997918064e-06, + "loss": 0.0014, + "num_input_tokens_seen": 5024408, + "step": 2554 + }, + { + "epoch": 0.33863485752153744, + "grad_norm": 8.32859992980957, + "learning_rate": 4.845515884518093e-06, + "loss": 0.1041, + "num_input_tokens_seen": 5026944, + "step": 2555 + }, + { + "epoch": 0.33876739562624253, + "grad_norm": 9.94010066986084, + "learning_rate": 4.845395725894956e-06, + "loss": 0.1135, + "num_input_tokens_seen": 5028832, + "step": 2556 + }, + { + "epoch": 0.3388999337309476, + "grad_norm": 10.362662315368652, + "learning_rate": 4.845275522050973e-06, + "loss": 0.1865, + "num_input_tokens_seen": 5031224, + "step": 2557 + }, + { + "epoch": 0.33903247183565277, + "grad_norm": 8.959379196166992, + "learning_rate": 4.845155272988459e-06, + "loss": 0.3073, + "num_input_tokens_seen": 5032296, + "step": 2558 + }, + { + "epoch": 0.33916500994035786, + "grad_norm": 0.3402162194252014, + "learning_rate": 4.845034978709733e-06, + "loss": 0.0021, + "num_input_tokens_seen": 5033824, + "step": 2559 + }, + { + "epoch": 0.33929754804506296, + "grad_norm": 5.115143299102783, + "learning_rate": 4.844914639217115e-06, + "loss": 0.1712, + "num_input_tokens_seen": 5036352, + "step": 2560 + }, + { + "epoch": 0.33943008614976805, + "grad_norm": 14.035609245300293, + "learning_rate": 4.844794254512926e-06, + "loss": 0.3628, + "num_input_tokens_seen": 5038480, + "step": 2561 + }, + { + "epoch": 0.33956262425447314, + "grad_norm": 6.534146785736084, + "learning_rate": 4.844673824599486e-06, + "loss": 0.0682, + "num_input_tokens_seen": 5040264, + "step": 2562 + }, + { + "epoch": 0.33969516235917824, + "grad_norm": 0.33945485949516296, + "learning_rate": 4.844553349479117e-06, + "loss": 0.0021, + "num_input_tokens_seen": 5042176, + "step": 2563 + }, + { + "epoch": 0.3398277004638834, + "grad_norm": 11.172015190124512, + "learning_rate": 4.844432829154142e-06, + "loss": 0.5188, + "num_input_tokens_seen": 5044112, + "step": 2564 + }, + { + "epoch": 0.3399602385685885, + "grad_norm": 10.872260093688965, + "learning_rate": 4.8443122636268845e-06, + "loss": 0.194, + "num_input_tokens_seen": 5045808, + "step": 2565 + }, + { + "epoch": 0.3400927766732936, + "grad_norm": 11.375405311584473, + "learning_rate": 4.844191652899669e-06, + "loss": 0.3708, + "num_input_tokens_seen": 5047744, + "step": 2566 + }, + { + "epoch": 0.34022531477799867, + "grad_norm": 0.38756489753723145, + "learning_rate": 4.844070996974822e-06, + "loss": 0.0025, + "num_input_tokens_seen": 5050336, + "step": 2567 + }, + { + "epoch": 0.34035785288270376, + "grad_norm": 0.30488312244415283, + "learning_rate": 4.8439502958546684e-06, + "loss": 0.0018, + "num_input_tokens_seen": 5052408, + "step": 2568 + }, + { + "epoch": 0.34049039098740885, + "grad_norm": 0.1934095025062561, + "learning_rate": 4.843829549541537e-06, + "loss": 0.0012, + "num_input_tokens_seen": 5054376, + "step": 2569 + }, + { + "epoch": 0.340622929092114, + "grad_norm": 16.13597297668457, + "learning_rate": 4.843708758037754e-06, + "loss": 0.7639, + "num_input_tokens_seen": 5056688, + "step": 2570 + }, + { + "epoch": 0.3407554671968191, + "grad_norm": 0.2038782835006714, + "learning_rate": 4.843587921345649e-06, + "loss": 0.0012, + "num_input_tokens_seen": 5057808, + "step": 2571 + }, + { + "epoch": 0.3408880053015242, + "grad_norm": 18.087759017944336, + "learning_rate": 4.8434670394675536e-06, + "loss": 0.5012, + "num_input_tokens_seen": 5059872, + "step": 2572 + }, + { + "epoch": 0.3410205434062293, + "grad_norm": 0.5027891993522644, + "learning_rate": 4.843346112405796e-06, + "loss": 0.0027, + "num_input_tokens_seen": 5061496, + "step": 2573 + }, + { + "epoch": 0.3411530815109344, + "grad_norm": 0.2382429987192154, + "learning_rate": 4.84322514016271e-06, + "loss": 0.0014, + "num_input_tokens_seen": 5063712, + "step": 2574 + }, + { + "epoch": 0.3412856196156395, + "grad_norm": 3.5534324645996094, + "learning_rate": 4.843104122740625e-06, + "loss": 0.0369, + "num_input_tokens_seen": 5065600, + "step": 2575 + }, + { + "epoch": 0.3414181577203446, + "grad_norm": 13.352022171020508, + "learning_rate": 4.842983060141877e-06, + "loss": 0.4957, + "num_input_tokens_seen": 5067280, + "step": 2576 + }, + { + "epoch": 0.3415506958250497, + "grad_norm": 14.412768363952637, + "learning_rate": 4.842861952368799e-06, + "loss": 0.4341, + "num_input_tokens_seen": 5069552, + "step": 2577 + }, + { + "epoch": 0.3416832339297548, + "grad_norm": 8.399524688720703, + "learning_rate": 4.842740799423726e-06, + "loss": 0.0724, + "num_input_tokens_seen": 5071856, + "step": 2578 + }, + { + "epoch": 0.3418157720344599, + "grad_norm": 0.09015511721372604, + "learning_rate": 4.8426196013089945e-06, + "loss": 0.0005, + "num_input_tokens_seen": 5073472, + "step": 2579 + }, + { + "epoch": 0.341948310139165, + "grad_norm": 13.617608070373535, + "learning_rate": 4.8424983580269416e-06, + "loss": 0.3859, + "num_input_tokens_seen": 5075080, + "step": 2580 + }, + { + "epoch": 0.34208084824387014, + "grad_norm": 1.20191490650177, + "learning_rate": 4.8423770695799035e-06, + "loss": 0.0047, + "num_input_tokens_seen": 5077224, + "step": 2581 + }, + { + "epoch": 0.34221338634857523, + "grad_norm": 5.682173728942871, + "learning_rate": 4.842255735970221e-06, + "loss": 0.0736, + "num_input_tokens_seen": 5078576, + "step": 2582 + }, + { + "epoch": 0.3423459244532803, + "grad_norm": 9.435686111450195, + "learning_rate": 4.842134357200231e-06, + "loss": 0.106, + "num_input_tokens_seen": 5079936, + "step": 2583 + }, + { + "epoch": 0.3424784625579854, + "grad_norm": 7.388195514678955, + "learning_rate": 4.842012933272274e-06, + "loss": 0.0934, + "num_input_tokens_seen": 5081912, + "step": 2584 + }, + { + "epoch": 0.3426110006626905, + "grad_norm": 13.4119873046875, + "learning_rate": 4.841891464188692e-06, + "loss": 0.3727, + "num_input_tokens_seen": 5083584, + "step": 2585 + }, + { + "epoch": 0.3427435387673956, + "grad_norm": 15.916617393493652, + "learning_rate": 4.841769949951829e-06, + "loss": 0.3822, + "num_input_tokens_seen": 5085688, + "step": 2586 + }, + { + "epoch": 0.34287607687210075, + "grad_norm": 0.062166765332221985, + "learning_rate": 4.841648390564025e-06, + "loss": 0.0004, + "num_input_tokens_seen": 5086848, + "step": 2587 + }, + { + "epoch": 0.34300861497680585, + "grad_norm": 0.06384774297475815, + "learning_rate": 4.841526786027624e-06, + "loss": 0.0004, + "num_input_tokens_seen": 5088200, + "step": 2588 + }, + { + "epoch": 0.34314115308151094, + "grad_norm": 0.0751710832118988, + "learning_rate": 4.841405136344971e-06, + "loss": 0.0004, + "num_input_tokens_seen": 5089440, + "step": 2589 + }, + { + "epoch": 0.34327369118621603, + "grad_norm": 8.333683013916016, + "learning_rate": 4.841283441518413e-06, + "loss": 0.2896, + "num_input_tokens_seen": 5091008, + "step": 2590 + }, + { + "epoch": 0.34340622929092113, + "grad_norm": 0.11106595396995544, + "learning_rate": 4.841161701550295e-06, + "loss": 0.0007, + "num_input_tokens_seen": 5092968, + "step": 2591 + }, + { + "epoch": 0.3435387673956262, + "grad_norm": 8.654097557067871, + "learning_rate": 4.841039916442962e-06, + "loss": 0.2042, + "num_input_tokens_seen": 5094864, + "step": 2592 + }, + { + "epoch": 0.34367130550033137, + "grad_norm": 5.409858226776123, + "learning_rate": 4.840918086198767e-06, + "loss": 0.1349, + "num_input_tokens_seen": 5096288, + "step": 2593 + }, + { + "epoch": 0.34380384360503646, + "grad_norm": 6.716861248016357, + "learning_rate": 4.840796210820055e-06, + "loss": 0.1318, + "num_input_tokens_seen": 5098192, + "step": 2594 + }, + { + "epoch": 0.34393638170974156, + "grad_norm": 0.07945011556148529, + "learning_rate": 4.840674290309178e-06, + "loss": 0.0005, + "num_input_tokens_seen": 5100760, + "step": 2595 + }, + { + "epoch": 0.34406891981444665, + "grad_norm": 13.218320846557617, + "learning_rate": 4.840552324668485e-06, + "loss": 0.2058, + "num_input_tokens_seen": 5102936, + "step": 2596 + }, + { + "epoch": 0.34420145791915174, + "grad_norm": 7.486458778381348, + "learning_rate": 4.8404303139003295e-06, + "loss": 0.1768, + "num_input_tokens_seen": 5105112, + "step": 2597 + }, + { + "epoch": 0.34433399602385684, + "grad_norm": 19.353336334228516, + "learning_rate": 4.840308258007063e-06, + "loss": 0.5371, + "num_input_tokens_seen": 5107368, + "step": 2598 + }, + { + "epoch": 0.344466534128562, + "grad_norm": 31.570310592651367, + "learning_rate": 4.840186156991038e-06, + "loss": 0.5615, + "num_input_tokens_seen": 5109696, + "step": 2599 + }, + { + "epoch": 0.3445990722332671, + "grad_norm": 0.14075443148612976, + "learning_rate": 4.840064010854609e-06, + "loss": 0.0008, + "num_input_tokens_seen": 5111008, + "step": 2600 + }, + { + "epoch": 0.34473161033797217, + "grad_norm": 0.15091027319431305, + "learning_rate": 4.839941819600133e-06, + "loss": 0.0009, + "num_input_tokens_seen": 5112848, + "step": 2601 + }, + { + "epoch": 0.34486414844267727, + "grad_norm": 20.39800453186035, + "learning_rate": 4.839819583229964e-06, + "loss": 0.7999, + "num_input_tokens_seen": 5114928, + "step": 2602 + }, + { + "epoch": 0.34499668654738236, + "grad_norm": 0.3418734073638916, + "learning_rate": 4.839697301746459e-06, + "loss": 0.002, + "num_input_tokens_seen": 5116216, + "step": 2603 + }, + { + "epoch": 0.34512922465208745, + "grad_norm": 11.38237476348877, + "learning_rate": 4.839574975151976e-06, + "loss": 0.4725, + "num_input_tokens_seen": 5118312, + "step": 2604 + }, + { + "epoch": 0.3452617627567926, + "grad_norm": 8.359557151794434, + "learning_rate": 4.839452603448873e-06, + "loss": 0.2628, + "num_input_tokens_seen": 5120232, + "step": 2605 + }, + { + "epoch": 0.3453943008614977, + "grad_norm": 7.7645649909973145, + "learning_rate": 4.839330186639511e-06, + "loss": 0.2457, + "num_input_tokens_seen": 5122272, + "step": 2606 + }, + { + "epoch": 0.3455268389662028, + "grad_norm": 0.20248530805110931, + "learning_rate": 4.839207724726248e-06, + "loss": 0.0012, + "num_input_tokens_seen": 5125072, + "step": 2607 + }, + { + "epoch": 0.3456593770709079, + "grad_norm": 0.6183853149414062, + "learning_rate": 4.8390852177114476e-06, + "loss": 0.0084, + "num_input_tokens_seen": 5126776, + "step": 2608 + }, + { + "epoch": 0.345791915175613, + "grad_norm": 2.827495813369751, + "learning_rate": 4.83896266559747e-06, + "loss": 0.0221, + "num_input_tokens_seen": 5128088, + "step": 2609 + }, + { + "epoch": 0.34592445328031807, + "grad_norm": 10.523560523986816, + "learning_rate": 4.838840068386678e-06, + "loss": 0.1884, + "num_input_tokens_seen": 5130200, + "step": 2610 + }, + { + "epoch": 0.3460569913850232, + "grad_norm": 15.118446350097656, + "learning_rate": 4.838717426081437e-06, + "loss": 0.4927, + "num_input_tokens_seen": 5131984, + "step": 2611 + }, + { + "epoch": 0.3461895294897283, + "grad_norm": 6.286466121673584, + "learning_rate": 4.83859473868411e-06, + "loss": 0.168, + "num_input_tokens_seen": 5134160, + "step": 2612 + }, + { + "epoch": 0.3463220675944334, + "grad_norm": 8.771166801452637, + "learning_rate": 4.838472006197064e-06, + "loss": 0.1892, + "num_input_tokens_seen": 5135832, + "step": 2613 + }, + { + "epoch": 0.3464546056991385, + "grad_norm": 7.78450345993042, + "learning_rate": 4.838349228622663e-06, + "loss": 0.2389, + "num_input_tokens_seen": 5137688, + "step": 2614 + }, + { + "epoch": 0.3465871438038436, + "grad_norm": 7.166824817657471, + "learning_rate": 4.838226405963277e-06, + "loss": 0.1208, + "num_input_tokens_seen": 5139384, + "step": 2615 + }, + { + "epoch": 0.3467196819085487, + "grad_norm": 11.340084075927734, + "learning_rate": 4.838103538221272e-06, + "loss": 0.4306, + "num_input_tokens_seen": 5141208, + "step": 2616 + }, + { + "epoch": 0.34685222001325383, + "grad_norm": 5.661216735839844, + "learning_rate": 4.837980625399018e-06, + "loss": 0.0798, + "num_input_tokens_seen": 5143376, + "step": 2617 + }, + { + "epoch": 0.3469847581179589, + "grad_norm": 13.118907928466797, + "learning_rate": 4.837857667498885e-06, + "loss": 0.3629, + "num_input_tokens_seen": 5146192, + "step": 2618 + }, + { + "epoch": 0.347117296222664, + "grad_norm": 5.755819797515869, + "learning_rate": 4.837734664523243e-06, + "loss": 0.1382, + "num_input_tokens_seen": 5148608, + "step": 2619 + }, + { + "epoch": 0.3472498343273691, + "grad_norm": 11.884995460510254, + "learning_rate": 4.8376116164744645e-06, + "loss": 0.0692, + "num_input_tokens_seen": 5151888, + "step": 2620 + }, + { + "epoch": 0.3473823724320742, + "grad_norm": 5.193696975708008, + "learning_rate": 4.837488523354921e-06, + "loss": 0.2207, + "num_input_tokens_seen": 5153544, + "step": 2621 + }, + { + "epoch": 0.3475149105367793, + "grad_norm": 1.6338624954223633, + "learning_rate": 4.837365385166986e-06, + "loss": 0.0221, + "num_input_tokens_seen": 5154784, + "step": 2622 + }, + { + "epoch": 0.34764744864148445, + "grad_norm": 8.607190132141113, + "learning_rate": 4.837242201913035e-06, + "loss": 0.1012, + "num_input_tokens_seen": 5156640, + "step": 2623 + }, + { + "epoch": 0.34777998674618954, + "grad_norm": 12.23025131225586, + "learning_rate": 4.83711897359544e-06, + "loss": 0.219, + "num_input_tokens_seen": 5158408, + "step": 2624 + }, + { + "epoch": 0.34791252485089463, + "grad_norm": 0.14775116741657257, + "learning_rate": 4.836995700216581e-06, + "loss": 0.0009, + "num_input_tokens_seen": 5160480, + "step": 2625 + }, + { + "epoch": 0.3480450629555997, + "grad_norm": 12.328269958496094, + "learning_rate": 4.8368723817788314e-06, + "loss": 0.3127, + "num_input_tokens_seen": 5162112, + "step": 2626 + }, + { + "epoch": 0.3481776010603048, + "grad_norm": 6.5969319343566895, + "learning_rate": 4.83674901828457e-06, + "loss": 0.0767, + "num_input_tokens_seen": 5163856, + "step": 2627 + }, + { + "epoch": 0.3483101391650099, + "grad_norm": 15.464762687683105, + "learning_rate": 4.836625609736176e-06, + "loss": 0.4079, + "num_input_tokens_seen": 5165776, + "step": 2628 + }, + { + "epoch": 0.34844267726971506, + "grad_norm": 2.8966989517211914, + "learning_rate": 4.836502156136028e-06, + "loss": 0.0183, + "num_input_tokens_seen": 5167984, + "step": 2629 + }, + { + "epoch": 0.34857521537442016, + "grad_norm": 21.495792388916016, + "learning_rate": 4.836378657486508e-06, + "loss": 0.6785, + "num_input_tokens_seen": 5170472, + "step": 2630 + }, + { + "epoch": 0.34870775347912525, + "grad_norm": 0.4247101843357086, + "learning_rate": 4.836255113789994e-06, + "loss": 0.0026, + "num_input_tokens_seen": 5171832, + "step": 2631 + }, + { + "epoch": 0.34884029158383034, + "grad_norm": 11.294836044311523, + "learning_rate": 4.8361315250488705e-06, + "loss": 0.3737, + "num_input_tokens_seen": 5173560, + "step": 2632 + }, + { + "epoch": 0.34897282968853544, + "grad_norm": 1.940533995628357, + "learning_rate": 4.836007891265519e-06, + "loss": 0.0116, + "num_input_tokens_seen": 5174920, + "step": 2633 + }, + { + "epoch": 0.3491053677932406, + "grad_norm": 0.193169504404068, + "learning_rate": 4.8358842124423245e-06, + "loss": 0.0012, + "num_input_tokens_seen": 5176512, + "step": 2634 + }, + { + "epoch": 0.3492379058979457, + "grad_norm": 18.383394241333008, + "learning_rate": 4.83576048858167e-06, + "loss": 0.3589, + "num_input_tokens_seen": 5177888, + "step": 2635 + }, + { + "epoch": 0.34937044400265077, + "grad_norm": 0.08755943924188614, + "learning_rate": 4.835636719685942e-06, + "loss": 0.0005, + "num_input_tokens_seen": 5179104, + "step": 2636 + }, + { + "epoch": 0.34950298210735586, + "grad_norm": 1.7547752857208252, + "learning_rate": 4.835512905757528e-06, + "loss": 0.0068, + "num_input_tokens_seen": 5181416, + "step": 2637 + }, + { + "epoch": 0.34963552021206096, + "grad_norm": 0.1409478783607483, + "learning_rate": 4.835389046798813e-06, + "loss": 0.0008, + "num_input_tokens_seen": 5183224, + "step": 2638 + }, + { + "epoch": 0.34976805831676605, + "grad_norm": 8.044390678405762, + "learning_rate": 4.835265142812187e-06, + "loss": 0.2563, + "num_input_tokens_seen": 5185472, + "step": 2639 + }, + { + "epoch": 0.3499005964214712, + "grad_norm": 11.293478965759277, + "learning_rate": 4.835141193800037e-06, + "loss": 0.2075, + "num_input_tokens_seen": 5187528, + "step": 2640 + }, + { + "epoch": 0.3500331345261763, + "grad_norm": 11.678450584411621, + "learning_rate": 4.835017199764754e-06, + "loss": 0.3128, + "num_input_tokens_seen": 5189648, + "step": 2641 + }, + { + "epoch": 0.3501656726308814, + "grad_norm": 7.599231243133545, + "learning_rate": 4.834893160708729e-06, + "loss": 0.1659, + "num_input_tokens_seen": 5192112, + "step": 2642 + }, + { + "epoch": 0.3502982107355865, + "grad_norm": 8.560397148132324, + "learning_rate": 4.834769076634352e-06, + "loss": 0.2134, + "num_input_tokens_seen": 5193912, + "step": 2643 + }, + { + "epoch": 0.3504307488402916, + "grad_norm": 9.08426284790039, + "learning_rate": 4.834644947544018e-06, + "loss": 0.0999, + "num_input_tokens_seen": 5196600, + "step": 2644 + }, + { + "epoch": 0.35056328694499667, + "grad_norm": 0.066383957862854, + "learning_rate": 4.834520773440118e-06, + "loss": 0.0004, + "num_input_tokens_seen": 5198632, + "step": 2645 + }, + { + "epoch": 0.3506958250497018, + "grad_norm": 12.781527519226074, + "learning_rate": 4.834396554325047e-06, + "loss": 0.3929, + "num_input_tokens_seen": 5199968, + "step": 2646 + }, + { + "epoch": 0.3508283631544069, + "grad_norm": 0.10992295295000076, + "learning_rate": 4.8342722902012005e-06, + "loss": 0.0006, + "num_input_tokens_seen": 5201168, + "step": 2647 + }, + { + "epoch": 0.350960901259112, + "grad_norm": 2.396319627761841, + "learning_rate": 4.834147981070973e-06, + "loss": 0.0607, + "num_input_tokens_seen": 5202952, + "step": 2648 + }, + { + "epoch": 0.3510934393638171, + "grad_norm": 12.338872909545898, + "learning_rate": 4.834023626936763e-06, + "loss": 0.3525, + "num_input_tokens_seen": 5204624, + "step": 2649 + }, + { + "epoch": 0.3512259774685222, + "grad_norm": 3.2395901679992676, + "learning_rate": 4.833899227800968e-06, + "loss": 0.0208, + "num_input_tokens_seen": 5206496, + "step": 2650 + }, + { + "epoch": 0.3513585155732273, + "grad_norm": 5.437209606170654, + "learning_rate": 4.833774783665985e-06, + "loss": 0.0834, + "num_input_tokens_seen": 5207776, + "step": 2651 + }, + { + "epoch": 0.35149105367793243, + "grad_norm": 0.08772001415491104, + "learning_rate": 4.833650294534214e-06, + "loss": 0.0005, + "num_input_tokens_seen": 5209368, + "step": 2652 + }, + { + "epoch": 0.3516235917826375, + "grad_norm": 7.437930107116699, + "learning_rate": 4.833525760408055e-06, + "loss": 0.0702, + "num_input_tokens_seen": 5211928, + "step": 2653 + }, + { + "epoch": 0.3517561298873426, + "grad_norm": 4.432439804077148, + "learning_rate": 4.83340118128991e-06, + "loss": 0.0404, + "num_input_tokens_seen": 5213920, + "step": 2654 + }, + { + "epoch": 0.3518886679920477, + "grad_norm": 10.84237289428711, + "learning_rate": 4.83327655718218e-06, + "loss": 0.2531, + "num_input_tokens_seen": 5215992, + "step": 2655 + }, + { + "epoch": 0.3520212060967528, + "grad_norm": 19.16095733642578, + "learning_rate": 4.8331518880872695e-06, + "loss": 0.3891, + "num_input_tokens_seen": 5217576, + "step": 2656 + }, + { + "epoch": 0.3521537442014579, + "grad_norm": 12.686888694763184, + "learning_rate": 4.833027174007581e-06, + "loss": 0.4381, + "num_input_tokens_seen": 5219624, + "step": 2657 + }, + { + "epoch": 0.35228628230616305, + "grad_norm": 0.21289145946502686, + "learning_rate": 4.832902414945519e-06, + "loss": 0.0011, + "num_input_tokens_seen": 5221376, + "step": 2658 + }, + { + "epoch": 0.35241882041086814, + "grad_norm": 1.2822811603546143, + "learning_rate": 4.832777610903487e-06, + "loss": 0.0078, + "num_input_tokens_seen": 5223720, + "step": 2659 + }, + { + "epoch": 0.35255135851557323, + "grad_norm": 14.713430404663086, + "learning_rate": 4.832652761883896e-06, + "loss": 0.5127, + "num_input_tokens_seen": 5226280, + "step": 2660 + }, + { + "epoch": 0.3526838966202783, + "grad_norm": 11.527045249938965, + "learning_rate": 4.8325278678891486e-06, + "loss": 0.2571, + "num_input_tokens_seen": 5228496, + "step": 2661 + }, + { + "epoch": 0.3528164347249834, + "grad_norm": 7.736532211303711, + "learning_rate": 4.832402928921656e-06, + "loss": 0.2654, + "num_input_tokens_seen": 5230616, + "step": 2662 + }, + { + "epoch": 0.3529489728296885, + "grad_norm": 10.425844192504883, + "learning_rate": 4.832277944983825e-06, + "loss": 0.1387, + "num_input_tokens_seen": 5231896, + "step": 2663 + }, + { + "epoch": 0.35308151093439366, + "grad_norm": 8.610852241516113, + "learning_rate": 4.832152916078068e-06, + "loss": 0.3806, + "num_input_tokens_seen": 5234064, + "step": 2664 + }, + { + "epoch": 0.35321404903909875, + "grad_norm": 19.013717651367188, + "learning_rate": 4.8320278422067926e-06, + "loss": 0.4992, + "num_input_tokens_seen": 5236192, + "step": 2665 + }, + { + "epoch": 0.35334658714380385, + "grad_norm": 6.491237163543701, + "learning_rate": 4.831902723372412e-06, + "loss": 0.1254, + "num_input_tokens_seen": 5238784, + "step": 2666 + }, + { + "epoch": 0.35347912524850894, + "grad_norm": 10.515507698059082, + "learning_rate": 4.831777559577337e-06, + "loss": 0.2655, + "num_input_tokens_seen": 5240368, + "step": 2667 + }, + { + "epoch": 0.35361166335321403, + "grad_norm": 16.409242630004883, + "learning_rate": 4.8316523508239835e-06, + "loss": 0.4999, + "num_input_tokens_seen": 5242544, + "step": 2668 + }, + { + "epoch": 0.3537442014579191, + "grad_norm": 19.251745223999023, + "learning_rate": 4.831527097114763e-06, + "loss": 0.5179, + "num_input_tokens_seen": 5246024, + "step": 2669 + }, + { + "epoch": 0.3538767395626243, + "grad_norm": 2.0515401363372803, + "learning_rate": 4.831401798452092e-06, + "loss": 0.0117, + "num_input_tokens_seen": 5247656, + "step": 2670 + }, + { + "epoch": 0.35400927766732937, + "grad_norm": 0.660833477973938, + "learning_rate": 4.831276454838386e-06, + "loss": 0.0036, + "num_input_tokens_seen": 5249328, + "step": 2671 + }, + { + "epoch": 0.35414181577203446, + "grad_norm": 8.581456184387207, + "learning_rate": 4.831151066276062e-06, + "loss": 0.2808, + "num_input_tokens_seen": 5251224, + "step": 2672 + }, + { + "epoch": 0.35427435387673956, + "grad_norm": 0.07275553047657013, + "learning_rate": 4.831025632767538e-06, + "loss": 0.0004, + "num_input_tokens_seen": 5252616, + "step": 2673 + }, + { + "epoch": 0.35440689198144465, + "grad_norm": 12.886369705200195, + "learning_rate": 4.83090015431523e-06, + "loss": 0.23, + "num_input_tokens_seen": 5254888, + "step": 2674 + }, + { + "epoch": 0.35453943008614974, + "grad_norm": 0.17593100666999817, + "learning_rate": 4.83077463092156e-06, + "loss": 0.001, + "num_input_tokens_seen": 5256848, + "step": 2675 + }, + { + "epoch": 0.3546719681908549, + "grad_norm": 0.550105094909668, + "learning_rate": 4.830649062588947e-06, + "loss": 0.0027, + "num_input_tokens_seen": 5258152, + "step": 2676 + }, + { + "epoch": 0.35480450629556, + "grad_norm": 0.18115903437137604, + "learning_rate": 4.830523449319813e-06, + "loss": 0.0008, + "num_input_tokens_seen": 5259792, + "step": 2677 + }, + { + "epoch": 0.3549370444002651, + "grad_norm": 0.1869560182094574, + "learning_rate": 4.8303977911165785e-06, + "loss": 0.0008, + "num_input_tokens_seen": 5261400, + "step": 2678 + }, + { + "epoch": 0.35506958250497017, + "grad_norm": 7.449179649353027, + "learning_rate": 4.8302720879816665e-06, + "loss": 0.1592, + "num_input_tokens_seen": 5263936, + "step": 2679 + }, + { + "epoch": 0.35520212060967526, + "grad_norm": 0.06532639265060425, + "learning_rate": 4.830146339917502e-06, + "loss": 0.0004, + "num_input_tokens_seen": 5265440, + "step": 2680 + }, + { + "epoch": 0.35533465871438036, + "grad_norm": 0.03875555470585823, + "learning_rate": 4.830020546926508e-06, + "loss": 0.0002, + "num_input_tokens_seen": 5266936, + "step": 2681 + }, + { + "epoch": 0.3554671968190855, + "grad_norm": 15.970491409301758, + "learning_rate": 4.82989470901111e-06, + "loss": 0.5837, + "num_input_tokens_seen": 5268656, + "step": 2682 + }, + { + "epoch": 0.3555997349237906, + "grad_norm": 11.810888290405273, + "learning_rate": 4.829768826173735e-06, + "loss": 0.1878, + "num_input_tokens_seen": 5271552, + "step": 2683 + }, + { + "epoch": 0.3557322730284957, + "grad_norm": 0.03669019415974617, + "learning_rate": 4.82964289841681e-06, + "loss": 0.0002, + "num_input_tokens_seen": 5273280, + "step": 2684 + }, + { + "epoch": 0.3558648111332008, + "grad_norm": 0.9188379645347595, + "learning_rate": 4.829516925742764e-06, + "loss": 0.0041, + "num_input_tokens_seen": 5274528, + "step": 2685 + }, + { + "epoch": 0.3559973492379059, + "grad_norm": 6.313194274902344, + "learning_rate": 4.829390908154022e-06, + "loss": 0.2197, + "num_input_tokens_seen": 5275928, + "step": 2686 + }, + { + "epoch": 0.356129887342611, + "grad_norm": 18.509252548217773, + "learning_rate": 4.829264845653018e-06, + "loss": 0.5059, + "num_input_tokens_seen": 5277256, + "step": 2687 + }, + { + "epoch": 0.3562624254473161, + "grad_norm": 3.853984832763672, + "learning_rate": 4.829138738242181e-06, + "loss": 0.0307, + "num_input_tokens_seen": 5278624, + "step": 2688 + }, + { + "epoch": 0.3563949635520212, + "grad_norm": 13.607367515563965, + "learning_rate": 4.829012585923942e-06, + "loss": 0.5285, + "num_input_tokens_seen": 5280688, + "step": 2689 + }, + { + "epoch": 0.3565275016567263, + "grad_norm": 0.07905127853155136, + "learning_rate": 4.828886388700734e-06, + "loss": 0.0005, + "num_input_tokens_seen": 5282096, + "step": 2690 + }, + { + "epoch": 0.3566600397614314, + "grad_norm": 6.90317440032959, + "learning_rate": 4.828760146574989e-06, + "loss": 0.1898, + "num_input_tokens_seen": 5283584, + "step": 2691 + }, + { + "epoch": 0.3567925778661365, + "grad_norm": 0.04748100787401199, + "learning_rate": 4.828633859549142e-06, + "loss": 0.0003, + "num_input_tokens_seen": 5285384, + "step": 2692 + }, + { + "epoch": 0.35692511597084164, + "grad_norm": 31.073902130126953, + "learning_rate": 4.828507527625628e-06, + "loss": 1.0915, + "num_input_tokens_seen": 5287624, + "step": 2693 + }, + { + "epoch": 0.35705765407554674, + "grad_norm": 0.05502196028828621, + "learning_rate": 4.8283811508068826e-06, + "loss": 0.0003, + "num_input_tokens_seen": 5289448, + "step": 2694 + }, + { + "epoch": 0.35719019218025183, + "grad_norm": 8.48267936706543, + "learning_rate": 4.828254729095343e-06, + "loss": 0.1088, + "num_input_tokens_seen": 5291328, + "step": 2695 + }, + { + "epoch": 0.3573227302849569, + "grad_norm": 12.937058448791504, + "learning_rate": 4.828128262493445e-06, + "loss": 0.3588, + "num_input_tokens_seen": 5293952, + "step": 2696 + }, + { + "epoch": 0.357455268389662, + "grad_norm": 0.617555558681488, + "learning_rate": 4.828001751003628e-06, + "loss": 0.0032, + "num_input_tokens_seen": 5295536, + "step": 2697 + }, + { + "epoch": 0.3575878064943671, + "grad_norm": 0.9359810948371887, + "learning_rate": 4.827875194628332e-06, + "loss": 0.0052, + "num_input_tokens_seen": 5297944, + "step": 2698 + }, + { + "epoch": 0.35772034459907226, + "grad_norm": 0.20049409568309784, + "learning_rate": 4.827748593369996e-06, + "loss": 0.0009, + "num_input_tokens_seen": 5299568, + "step": 2699 + }, + { + "epoch": 0.35785288270377735, + "grad_norm": 11.105497360229492, + "learning_rate": 4.827621947231061e-06, + "loss": 0.2461, + "num_input_tokens_seen": 5301800, + "step": 2700 + }, + { + "epoch": 0.35798542080848245, + "grad_norm": 8.143413543701172, + "learning_rate": 4.8274952562139695e-06, + "loss": 0.2674, + "num_input_tokens_seen": 5303992, + "step": 2701 + }, + { + "epoch": 0.35811795891318754, + "grad_norm": 0.12301398813724518, + "learning_rate": 4.827368520321163e-06, + "loss": 0.0007, + "num_input_tokens_seen": 5306568, + "step": 2702 + }, + { + "epoch": 0.35825049701789263, + "grad_norm": 10.06353759765625, + "learning_rate": 4.827241739555087e-06, + "loss": 0.1921, + "num_input_tokens_seen": 5308576, + "step": 2703 + }, + { + "epoch": 0.3583830351225977, + "grad_norm": 0.07130112498998642, + "learning_rate": 4.827114913918184e-06, + "loss": 0.0004, + "num_input_tokens_seen": 5309672, + "step": 2704 + }, + { + "epoch": 0.3585155732273029, + "grad_norm": 0.060565609484910965, + "learning_rate": 4.8269880434129e-06, + "loss": 0.0004, + "num_input_tokens_seen": 5311544, + "step": 2705 + }, + { + "epoch": 0.35864811133200797, + "grad_norm": 0.10120686143636703, + "learning_rate": 4.826861128041682e-06, + "loss": 0.0006, + "num_input_tokens_seen": 5313584, + "step": 2706 + }, + { + "epoch": 0.35878064943671306, + "grad_norm": 3.437826156616211, + "learning_rate": 4.826734167806976e-06, + "loss": 0.0766, + "num_input_tokens_seen": 5315568, + "step": 2707 + }, + { + "epoch": 0.35891318754141815, + "grad_norm": 8.354029655456543, + "learning_rate": 4.826607162711229e-06, + "loss": 0.2113, + "num_input_tokens_seen": 5317024, + "step": 2708 + }, + { + "epoch": 0.35904572564612325, + "grad_norm": 18.631977081298828, + "learning_rate": 4.826480112756892e-06, + "loss": 0.4829, + "num_input_tokens_seen": 5318752, + "step": 2709 + }, + { + "epoch": 0.35917826375082834, + "grad_norm": 20.836320877075195, + "learning_rate": 4.826353017946414e-06, + "loss": 0.4978, + "num_input_tokens_seen": 5320952, + "step": 2710 + }, + { + "epoch": 0.3593108018555335, + "grad_norm": 11.428426742553711, + "learning_rate": 4.826225878282244e-06, + "loss": 0.333, + "num_input_tokens_seen": 5322808, + "step": 2711 + }, + { + "epoch": 0.3594433399602386, + "grad_norm": 9.816218376159668, + "learning_rate": 4.826098693766834e-06, + "loss": 0.2347, + "num_input_tokens_seen": 5324424, + "step": 2712 + }, + { + "epoch": 0.3595758780649437, + "grad_norm": 0.17480172216892242, + "learning_rate": 4.825971464402637e-06, + "loss": 0.001, + "num_input_tokens_seen": 5326728, + "step": 2713 + }, + { + "epoch": 0.35970841616964877, + "grad_norm": 0.10027627646923065, + "learning_rate": 4.8258441901921046e-06, + "loss": 0.0006, + "num_input_tokens_seen": 5328160, + "step": 2714 + }, + { + "epoch": 0.35984095427435386, + "grad_norm": 7.035010814666748, + "learning_rate": 4.825716871137694e-06, + "loss": 0.0669, + "num_input_tokens_seen": 5329976, + "step": 2715 + }, + { + "epoch": 0.35997349237905896, + "grad_norm": 0.33973458409309387, + "learning_rate": 4.825589507241856e-06, + "loss": 0.002, + "num_input_tokens_seen": 5332320, + "step": 2716 + }, + { + "epoch": 0.3601060304837641, + "grad_norm": 8.755828857421875, + "learning_rate": 4.825462098507048e-06, + "loss": 0.2259, + "num_input_tokens_seen": 5334408, + "step": 2717 + }, + { + "epoch": 0.3602385685884692, + "grad_norm": 8.906939506530762, + "learning_rate": 4.825334644935727e-06, + "loss": 0.2249, + "num_input_tokens_seen": 5336368, + "step": 2718 + }, + { + "epoch": 0.3603711066931743, + "grad_norm": 0.49791663885116577, + "learning_rate": 4.825207146530349e-06, + "loss": 0.0026, + "num_input_tokens_seen": 5338160, + "step": 2719 + }, + { + "epoch": 0.3605036447978794, + "grad_norm": 5.675534248352051, + "learning_rate": 4.825079603293374e-06, + "loss": 0.0879, + "num_input_tokens_seen": 5339912, + "step": 2720 + }, + { + "epoch": 0.3606361829025845, + "grad_norm": 8.904741287231445, + "learning_rate": 4.82495201522726e-06, + "loss": 0.1763, + "num_input_tokens_seen": 5342760, + "step": 2721 + }, + { + "epoch": 0.3607687210072896, + "grad_norm": 10.000617980957031, + "learning_rate": 4.824824382334467e-06, + "loss": 0.2787, + "num_input_tokens_seen": 5344496, + "step": 2722 + }, + { + "epoch": 0.3609012591119947, + "grad_norm": 4.359716892242432, + "learning_rate": 4.824696704617456e-06, + "loss": 0.0459, + "num_input_tokens_seen": 5346032, + "step": 2723 + }, + { + "epoch": 0.3610337972166998, + "grad_norm": 20.22006607055664, + "learning_rate": 4.824568982078689e-06, + "loss": 0.4951, + "num_input_tokens_seen": 5348568, + "step": 2724 + }, + { + "epoch": 0.3611663353214049, + "grad_norm": 4.95950984954834, + "learning_rate": 4.824441214720629e-06, + "loss": 0.0419, + "num_input_tokens_seen": 5350288, + "step": 2725 + }, + { + "epoch": 0.36129887342611, + "grad_norm": 6.043578624725342, + "learning_rate": 4.824313402545738e-06, + "loss": 0.0915, + "num_input_tokens_seen": 5353072, + "step": 2726 + }, + { + "epoch": 0.3614314115308151, + "grad_norm": 13.984691619873047, + "learning_rate": 4.824185545556482e-06, + "loss": 0.7146, + "num_input_tokens_seen": 5355256, + "step": 2727 + }, + { + "epoch": 0.3615639496355202, + "grad_norm": 9.403741836547852, + "learning_rate": 4.8240576437553246e-06, + "loss": 0.1839, + "num_input_tokens_seen": 5358120, + "step": 2728 + }, + { + "epoch": 0.36169648774022534, + "grad_norm": 9.493645668029785, + "learning_rate": 4.823929697144733e-06, + "loss": 0.2058, + "num_input_tokens_seen": 5359616, + "step": 2729 + }, + { + "epoch": 0.36182902584493043, + "grad_norm": 14.25259780883789, + "learning_rate": 4.823801705727173e-06, + "loss": 0.481, + "num_input_tokens_seen": 5362224, + "step": 2730 + }, + { + "epoch": 0.3619615639496355, + "grad_norm": 3.7409017086029053, + "learning_rate": 4.823673669505113e-06, + "loss": 0.1407, + "num_input_tokens_seen": 5365328, + "step": 2731 + }, + { + "epoch": 0.3620941020543406, + "grad_norm": 19.91145133972168, + "learning_rate": 4.823545588481023e-06, + "loss": 0.6865, + "num_input_tokens_seen": 5367784, + "step": 2732 + }, + { + "epoch": 0.3622266401590457, + "grad_norm": 0.24601709842681885, + "learning_rate": 4.823417462657369e-06, + "loss": 0.0015, + "num_input_tokens_seen": 5370008, + "step": 2733 + }, + { + "epoch": 0.3623591782637508, + "grad_norm": 0.6090838313102722, + "learning_rate": 4.8232892920366255e-06, + "loss": 0.0031, + "num_input_tokens_seen": 5371432, + "step": 2734 + }, + { + "epoch": 0.36249171636845595, + "grad_norm": 10.948972702026367, + "learning_rate": 4.82316107662126e-06, + "loss": 0.2235, + "num_input_tokens_seen": 5372880, + "step": 2735 + }, + { + "epoch": 0.36262425447316105, + "grad_norm": 7.48264741897583, + "learning_rate": 4.8230328164137475e-06, + "loss": 0.1722, + "num_input_tokens_seen": 5374648, + "step": 2736 + }, + { + "epoch": 0.36275679257786614, + "grad_norm": 4.644361972808838, + "learning_rate": 4.82290451141656e-06, + "loss": 0.0631, + "num_input_tokens_seen": 5376232, + "step": 2737 + }, + { + "epoch": 0.36288933068257123, + "grad_norm": 13.291915893554688, + "learning_rate": 4.82277616163217e-06, + "loss": 0.2009, + "num_input_tokens_seen": 5377832, + "step": 2738 + }, + { + "epoch": 0.3630218687872763, + "grad_norm": 5.830990314483643, + "learning_rate": 4.8226477670630535e-06, + "loss": 0.0711, + "num_input_tokens_seen": 5379144, + "step": 2739 + }, + { + "epoch": 0.3631544068919814, + "grad_norm": 7.66276741027832, + "learning_rate": 4.822519327711686e-06, + "loss": 0.1609, + "num_input_tokens_seen": 5380808, + "step": 2740 + }, + { + "epoch": 0.36328694499668657, + "grad_norm": 3.8986198902130127, + "learning_rate": 4.822390843580543e-06, + "loss": 0.0428, + "num_input_tokens_seen": 5382920, + "step": 2741 + }, + { + "epoch": 0.36341948310139166, + "grad_norm": 3.8060407638549805, + "learning_rate": 4.822262314672103e-06, + "loss": 0.0531, + "num_input_tokens_seen": 5384920, + "step": 2742 + }, + { + "epoch": 0.36355202120609675, + "grad_norm": 14.809745788574219, + "learning_rate": 4.822133740988843e-06, + "loss": 0.5055, + "num_input_tokens_seen": 5388112, + "step": 2743 + }, + { + "epoch": 0.36368455931080185, + "grad_norm": 8.85603141784668, + "learning_rate": 4.822005122533243e-06, + "loss": 0.1833, + "num_input_tokens_seen": 5389984, + "step": 2744 + }, + { + "epoch": 0.36381709741550694, + "grad_norm": 14.71507453918457, + "learning_rate": 4.821876459307782e-06, + "loss": 0.6708, + "num_input_tokens_seen": 5392304, + "step": 2745 + }, + { + "epoch": 0.36394963552021203, + "grad_norm": 7.249777793884277, + "learning_rate": 4.821747751314941e-06, + "loss": 0.0601, + "num_input_tokens_seen": 5393952, + "step": 2746 + }, + { + "epoch": 0.3640821736249172, + "grad_norm": 8.879473686218262, + "learning_rate": 4.821618998557202e-06, + "loss": 0.2415, + "num_input_tokens_seen": 5396136, + "step": 2747 + }, + { + "epoch": 0.3642147117296223, + "grad_norm": 1.3180749416351318, + "learning_rate": 4.821490201037048e-06, + "loss": 0.0076, + "num_input_tokens_seen": 5398072, + "step": 2748 + }, + { + "epoch": 0.36434724983432737, + "grad_norm": 13.902233123779297, + "learning_rate": 4.82136135875696e-06, + "loss": 0.4783, + "num_input_tokens_seen": 5400696, + "step": 2749 + }, + { + "epoch": 0.36447978793903246, + "grad_norm": 6.44667911529541, + "learning_rate": 4.821232471719424e-06, + "loss": 0.1601, + "num_input_tokens_seen": 5402016, + "step": 2750 + }, + { + "epoch": 0.36461232604373756, + "grad_norm": 11.358306884765625, + "learning_rate": 4.821103539926925e-06, + "loss": 0.3299, + "num_input_tokens_seen": 5403728, + "step": 2751 + }, + { + "epoch": 0.3647448641484427, + "grad_norm": 15.400067329406738, + "learning_rate": 4.820974563381949e-06, + "loss": 0.5389, + "num_input_tokens_seen": 5405504, + "step": 2752 + }, + { + "epoch": 0.3648774022531478, + "grad_norm": 1.9128706455230713, + "learning_rate": 4.82084554208698e-06, + "loss": 0.0092, + "num_input_tokens_seen": 5407424, + "step": 2753 + }, + { + "epoch": 0.3650099403578529, + "grad_norm": 6.558671474456787, + "learning_rate": 4.82071647604451e-06, + "loss": 0.0386, + "num_input_tokens_seen": 5409048, + "step": 2754 + }, + { + "epoch": 0.365142478462558, + "grad_norm": 8.509815216064453, + "learning_rate": 4.820587365257024e-06, + "loss": 0.1868, + "num_input_tokens_seen": 5410984, + "step": 2755 + }, + { + "epoch": 0.3652750165672631, + "grad_norm": 6.833467483520508, + "learning_rate": 4.820458209727014e-06, + "loss": 0.1614, + "num_input_tokens_seen": 5413344, + "step": 2756 + }, + { + "epoch": 0.36540755467196817, + "grad_norm": 8.644022941589355, + "learning_rate": 4.820329009456968e-06, + "loss": 0.1508, + "num_input_tokens_seen": 5415096, + "step": 2757 + }, + { + "epoch": 0.3655400927766733, + "grad_norm": 10.98110580444336, + "learning_rate": 4.820199764449378e-06, + "loss": 0.3432, + "num_input_tokens_seen": 5418040, + "step": 2758 + }, + { + "epoch": 0.3656726308813784, + "grad_norm": 2.9276976585388184, + "learning_rate": 4.820070474706736e-06, + "loss": 0.0239, + "num_input_tokens_seen": 5420464, + "step": 2759 + }, + { + "epoch": 0.3658051689860835, + "grad_norm": 9.445584297180176, + "learning_rate": 4.8199411402315356e-06, + "loss": 0.3231, + "num_input_tokens_seen": 5423080, + "step": 2760 + }, + { + "epoch": 0.3659377070907886, + "grad_norm": 2.3052515983581543, + "learning_rate": 4.819811761026269e-06, + "loss": 0.0099, + "num_input_tokens_seen": 5424672, + "step": 2761 + }, + { + "epoch": 0.3660702451954937, + "grad_norm": 0.09113645553588867, + "learning_rate": 4.819682337093431e-06, + "loss": 0.0005, + "num_input_tokens_seen": 5425928, + "step": 2762 + }, + { + "epoch": 0.3662027833001988, + "grad_norm": 16.88588523864746, + "learning_rate": 4.819552868435517e-06, + "loss": 0.796, + "num_input_tokens_seen": 5428768, + "step": 2763 + }, + { + "epoch": 0.36633532140490394, + "grad_norm": 10.21859073638916, + "learning_rate": 4.819423355055024e-06, + "loss": 0.4537, + "num_input_tokens_seen": 5430832, + "step": 2764 + }, + { + "epoch": 0.36646785950960903, + "grad_norm": 10.199539184570312, + "learning_rate": 4.819293796954449e-06, + "loss": 0.2956, + "num_input_tokens_seen": 5432944, + "step": 2765 + }, + { + "epoch": 0.3666003976143141, + "grad_norm": 7.365966796875, + "learning_rate": 4.819164194136289e-06, + "loss": 0.1657, + "num_input_tokens_seen": 5434728, + "step": 2766 + }, + { + "epoch": 0.3667329357190192, + "grad_norm": 0.46876177191734314, + "learning_rate": 4.819034546603044e-06, + "loss": 0.0025, + "num_input_tokens_seen": 5436160, + "step": 2767 + }, + { + "epoch": 0.3668654738237243, + "grad_norm": 0.09722945094108582, + "learning_rate": 4.818904854357212e-06, + "loss": 0.0006, + "num_input_tokens_seen": 5437600, + "step": 2768 + }, + { + "epoch": 0.3669980119284294, + "grad_norm": 14.744268417358398, + "learning_rate": 4.818775117401296e-06, + "loss": 0.5286, + "num_input_tokens_seen": 5439176, + "step": 2769 + }, + { + "epoch": 0.36713055003313455, + "grad_norm": 5.0414838790893555, + "learning_rate": 4.818645335737795e-06, + "loss": 0.1518, + "num_input_tokens_seen": 5440624, + "step": 2770 + }, + { + "epoch": 0.36726308813783964, + "grad_norm": 3.2627439498901367, + "learning_rate": 4.818515509369214e-06, + "loss": 0.062, + "num_input_tokens_seen": 5441944, + "step": 2771 + }, + { + "epoch": 0.36739562624254474, + "grad_norm": 11.379758834838867, + "learning_rate": 4.818385638298053e-06, + "loss": 0.289, + "num_input_tokens_seen": 5443248, + "step": 2772 + }, + { + "epoch": 0.36752816434724983, + "grad_norm": 0.3330937623977661, + "learning_rate": 4.8182557225268174e-06, + "loss": 0.002, + "num_input_tokens_seen": 5445728, + "step": 2773 + }, + { + "epoch": 0.3676607024519549, + "grad_norm": 0.2368849664926529, + "learning_rate": 4.818125762058014e-06, + "loss": 0.0014, + "num_input_tokens_seen": 5446968, + "step": 2774 + }, + { + "epoch": 0.36779324055666, + "grad_norm": 8.730948448181152, + "learning_rate": 4.817995756894145e-06, + "loss": 0.2473, + "num_input_tokens_seen": 5449256, + "step": 2775 + }, + { + "epoch": 0.36792577866136517, + "grad_norm": 0.31364089250564575, + "learning_rate": 4.817865707037719e-06, + "loss": 0.0019, + "num_input_tokens_seen": 5451048, + "step": 2776 + }, + { + "epoch": 0.36805831676607026, + "grad_norm": 8.201196670532227, + "learning_rate": 4.817735612491243e-06, + "loss": 0.2061, + "num_input_tokens_seen": 5452672, + "step": 2777 + }, + { + "epoch": 0.36819085487077535, + "grad_norm": 20.229724884033203, + "learning_rate": 4.817605473257226e-06, + "loss": 0.499, + "num_input_tokens_seen": 5454800, + "step": 2778 + }, + { + "epoch": 0.36832339297548045, + "grad_norm": 0.7167686223983765, + "learning_rate": 4.817475289338176e-06, + "loss": 0.0043, + "num_input_tokens_seen": 5456760, + "step": 2779 + }, + { + "epoch": 0.36845593108018554, + "grad_norm": 10.311163902282715, + "learning_rate": 4.817345060736604e-06, + "loss": 0.1983, + "num_input_tokens_seen": 5458384, + "step": 2780 + }, + { + "epoch": 0.36858846918489063, + "grad_norm": 8.68519115447998, + "learning_rate": 4.817214787455022e-06, + "loss": 0.0979, + "num_input_tokens_seen": 5460296, + "step": 2781 + }, + { + "epoch": 0.3687210072895958, + "grad_norm": 13.810454368591309, + "learning_rate": 4.817084469495938e-06, + "loss": 0.3559, + "num_input_tokens_seen": 5462976, + "step": 2782 + }, + { + "epoch": 0.3688535453943009, + "grad_norm": 0.3250599503517151, + "learning_rate": 4.816954106861868e-06, + "loss": 0.0019, + "num_input_tokens_seen": 5464328, + "step": 2783 + }, + { + "epoch": 0.36898608349900597, + "grad_norm": 9.174680709838867, + "learning_rate": 4.816823699555324e-06, + "loss": 0.135, + "num_input_tokens_seen": 5466440, + "step": 2784 + }, + { + "epoch": 0.36911862160371106, + "grad_norm": 0.591949999332428, + "learning_rate": 4.816693247578821e-06, + "loss": 0.0035, + "num_input_tokens_seen": 5468224, + "step": 2785 + }, + { + "epoch": 0.36925115970841615, + "grad_norm": 4.287399768829346, + "learning_rate": 4.816562750934875e-06, + "loss": 0.0842, + "num_input_tokens_seen": 5470344, + "step": 2786 + }, + { + "epoch": 0.36938369781312125, + "grad_norm": 6.258661270141602, + "learning_rate": 4.816432209625999e-06, + "loss": 0.0692, + "num_input_tokens_seen": 5472040, + "step": 2787 + }, + { + "epoch": 0.3695162359178264, + "grad_norm": 14.845474243164062, + "learning_rate": 4.816301623654714e-06, + "loss": 0.4915, + "num_input_tokens_seen": 5474176, + "step": 2788 + }, + { + "epoch": 0.3696487740225315, + "grad_norm": 11.282391548156738, + "learning_rate": 4.816170993023535e-06, + "loss": 0.4019, + "num_input_tokens_seen": 5477208, + "step": 2789 + }, + { + "epoch": 0.3697813121272366, + "grad_norm": 1.8792104721069336, + "learning_rate": 4.816040317734981e-06, + "loss": 0.0447, + "num_input_tokens_seen": 5478848, + "step": 2790 + }, + { + "epoch": 0.3699138502319417, + "grad_norm": 6.433264255523682, + "learning_rate": 4.8159095977915725e-06, + "loss": 0.1937, + "num_input_tokens_seen": 5480672, + "step": 2791 + }, + { + "epoch": 0.37004638833664677, + "grad_norm": 5.4285197257995605, + "learning_rate": 4.815778833195829e-06, + "loss": 0.0357, + "num_input_tokens_seen": 5481888, + "step": 2792 + }, + { + "epoch": 0.37017892644135186, + "grad_norm": 0.45382043719291687, + "learning_rate": 4.815648023950272e-06, + "loss": 0.0026, + "num_input_tokens_seen": 5483120, + "step": 2793 + }, + { + "epoch": 0.370311464546057, + "grad_norm": 7.863617897033691, + "learning_rate": 4.815517170057423e-06, + "loss": 0.0689, + "num_input_tokens_seen": 5485024, + "step": 2794 + }, + { + "epoch": 0.3704440026507621, + "grad_norm": 3.0515055656433105, + "learning_rate": 4.815386271519807e-06, + "loss": 0.0346, + "num_input_tokens_seen": 5486544, + "step": 2795 + }, + { + "epoch": 0.3705765407554672, + "grad_norm": 0.24760496616363525, + "learning_rate": 4.815255328339946e-06, + "loss": 0.0015, + "num_input_tokens_seen": 5488048, + "step": 2796 + }, + { + "epoch": 0.3707090788601723, + "grad_norm": 12.127198219299316, + "learning_rate": 4.815124340520364e-06, + "loss": 0.422, + "num_input_tokens_seen": 5490208, + "step": 2797 + }, + { + "epoch": 0.3708416169648774, + "grad_norm": 9.043184280395508, + "learning_rate": 4.814993308063589e-06, + "loss": 0.1885, + "num_input_tokens_seen": 5491784, + "step": 2798 + }, + { + "epoch": 0.3709741550695825, + "grad_norm": 22.74708366394043, + "learning_rate": 4.814862230972145e-06, + "loss": 0.9328, + "num_input_tokens_seen": 5495136, + "step": 2799 + }, + { + "epoch": 0.3711066931742876, + "grad_norm": 5.827566146850586, + "learning_rate": 4.814731109248561e-06, + "loss": 0.0988, + "num_input_tokens_seen": 5496808, + "step": 2800 + }, + { + "epoch": 0.3712392312789927, + "grad_norm": 6.5667877197265625, + "learning_rate": 4.8145999428953626e-06, + "loss": 0.1433, + "num_input_tokens_seen": 5498200, + "step": 2801 + }, + { + "epoch": 0.3713717693836978, + "grad_norm": 8.760632514953613, + "learning_rate": 4.814468731915082e-06, + "loss": 0.2566, + "num_input_tokens_seen": 5500392, + "step": 2802 + }, + { + "epoch": 0.3715043074884029, + "grad_norm": 5.427152633666992, + "learning_rate": 4.814337476310248e-06, + "loss": 0.0563, + "num_input_tokens_seen": 5502400, + "step": 2803 + }, + { + "epoch": 0.371636845593108, + "grad_norm": 3.9160783290863037, + "learning_rate": 4.81420617608339e-06, + "loss": 0.0797, + "num_input_tokens_seen": 5504104, + "step": 2804 + }, + { + "epoch": 0.3717693836978131, + "grad_norm": 6.1304450035095215, + "learning_rate": 4.8140748312370405e-06, + "loss": 0.2059, + "num_input_tokens_seen": 5506640, + "step": 2805 + }, + { + "epoch": 0.37190192180251824, + "grad_norm": 7.111385822296143, + "learning_rate": 4.813943441773732e-06, + "loss": 0.2043, + "num_input_tokens_seen": 5508448, + "step": 2806 + }, + { + "epoch": 0.37203445990722334, + "grad_norm": 0.954653799533844, + "learning_rate": 4.813812007695998e-06, + "loss": 0.0043, + "num_input_tokens_seen": 5509584, + "step": 2807 + }, + { + "epoch": 0.37216699801192843, + "grad_norm": 13.62472152709961, + "learning_rate": 4.813680529006372e-06, + "loss": 0.4609, + "num_input_tokens_seen": 5511128, + "step": 2808 + }, + { + "epoch": 0.3722995361166335, + "grad_norm": 20.73649787902832, + "learning_rate": 4.813549005707389e-06, + "loss": 0.7493, + "num_input_tokens_seen": 5513744, + "step": 2809 + }, + { + "epoch": 0.3724320742213386, + "grad_norm": 12.14520263671875, + "learning_rate": 4.813417437801585e-06, + "loss": 0.312, + "num_input_tokens_seen": 5515200, + "step": 2810 + }, + { + "epoch": 0.37256461232604376, + "grad_norm": 12.121313095092773, + "learning_rate": 4.813285825291498e-06, + "loss": 0.3774, + "num_input_tokens_seen": 5516784, + "step": 2811 + }, + { + "epoch": 0.37269715043074886, + "grad_norm": 4.442098140716553, + "learning_rate": 4.813154168179663e-06, + "loss": 0.021, + "num_input_tokens_seen": 5518528, + "step": 2812 + }, + { + "epoch": 0.37282968853545395, + "grad_norm": 7.283833026885986, + "learning_rate": 4.813022466468621e-06, + "loss": 0.1261, + "num_input_tokens_seen": 5520152, + "step": 2813 + }, + { + "epoch": 0.37296222664015904, + "grad_norm": 12.564074516296387, + "learning_rate": 4.8128907201609095e-06, + "loss": 0.6129, + "num_input_tokens_seen": 5521936, + "step": 2814 + }, + { + "epoch": 0.37309476474486414, + "grad_norm": 0.37329480051994324, + "learning_rate": 4.8127589292590696e-06, + "loss": 0.0022, + "num_input_tokens_seen": 5523064, + "step": 2815 + }, + { + "epoch": 0.37322730284956923, + "grad_norm": 4.183423042297363, + "learning_rate": 4.812627093765642e-06, + "loss": 0.1098, + "num_input_tokens_seen": 5526136, + "step": 2816 + }, + { + "epoch": 0.3733598409542744, + "grad_norm": 17.65333366394043, + "learning_rate": 4.812495213683169e-06, + "loss": 0.5372, + "num_input_tokens_seen": 5528840, + "step": 2817 + }, + { + "epoch": 0.3734923790589795, + "grad_norm": 9.254899024963379, + "learning_rate": 4.812363289014193e-06, + "loss": 0.1286, + "num_input_tokens_seen": 5530856, + "step": 2818 + }, + { + "epoch": 0.37362491716368457, + "grad_norm": 9.81226921081543, + "learning_rate": 4.8122313197612565e-06, + "loss": 0.2343, + "num_input_tokens_seen": 5533832, + "step": 2819 + }, + { + "epoch": 0.37375745526838966, + "grad_norm": 0.40088605880737305, + "learning_rate": 4.812099305926906e-06, + "loss": 0.0023, + "num_input_tokens_seen": 5535400, + "step": 2820 + }, + { + "epoch": 0.37388999337309475, + "grad_norm": 11.985273361206055, + "learning_rate": 4.811967247513685e-06, + "loss": 0.4819, + "num_input_tokens_seen": 5538072, + "step": 2821 + }, + { + "epoch": 0.37402253147779985, + "grad_norm": 0.19371116161346436, + "learning_rate": 4.811835144524141e-06, + "loss": 0.0012, + "num_input_tokens_seen": 5539520, + "step": 2822 + }, + { + "epoch": 0.374155069582505, + "grad_norm": 0.41432109475135803, + "learning_rate": 4.811702996960821e-06, + "loss": 0.0024, + "num_input_tokens_seen": 5541600, + "step": 2823 + }, + { + "epoch": 0.3742876076872101, + "grad_norm": 0.28967615962028503, + "learning_rate": 4.811570804826272e-06, + "loss": 0.0017, + "num_input_tokens_seen": 5543200, + "step": 2824 + }, + { + "epoch": 0.3744201457919152, + "grad_norm": 14.24317455291748, + "learning_rate": 4.811438568123044e-06, + "loss": 0.6332, + "num_input_tokens_seen": 5545768, + "step": 2825 + }, + { + "epoch": 0.3745526838966203, + "grad_norm": 0.18354202806949615, + "learning_rate": 4.811306286853684e-06, + "loss": 0.0011, + "num_input_tokens_seen": 5546848, + "step": 2826 + }, + { + "epoch": 0.37468522200132537, + "grad_norm": 1.3571782112121582, + "learning_rate": 4.811173961020746e-06, + "loss": 0.0146, + "num_input_tokens_seen": 5548816, + "step": 2827 + }, + { + "epoch": 0.37481776010603046, + "grad_norm": 1.520582914352417, + "learning_rate": 4.8110415906267795e-06, + "loss": 0.0086, + "num_input_tokens_seen": 5551176, + "step": 2828 + }, + { + "epoch": 0.3749502982107356, + "grad_norm": 0.15347182750701904, + "learning_rate": 4.8109091756743355e-06, + "loss": 0.0009, + "num_input_tokens_seen": 5552304, + "step": 2829 + }, + { + "epoch": 0.3750828363154407, + "grad_norm": 0.23789644241333008, + "learning_rate": 4.81077671616597e-06, + "loss": 0.0014, + "num_input_tokens_seen": 5554680, + "step": 2830 + }, + { + "epoch": 0.3752153744201458, + "grad_norm": 6.411083221435547, + "learning_rate": 4.810644212104234e-06, + "loss": 0.076, + "num_input_tokens_seen": 5556408, + "step": 2831 + }, + { + "epoch": 0.3753479125248509, + "grad_norm": 8.77319049835205, + "learning_rate": 4.810511663491684e-06, + "loss": 0.1447, + "num_input_tokens_seen": 5559512, + "step": 2832 + }, + { + "epoch": 0.375480450629556, + "grad_norm": 1.6859214305877686, + "learning_rate": 4.810379070330876e-06, + "loss": 0.0319, + "num_input_tokens_seen": 5560816, + "step": 2833 + }, + { + "epoch": 0.3756129887342611, + "grad_norm": 12.44225788116455, + "learning_rate": 4.810246432624365e-06, + "loss": 0.2305, + "num_input_tokens_seen": 5562592, + "step": 2834 + }, + { + "epoch": 0.3757455268389662, + "grad_norm": 2.298737049102783, + "learning_rate": 4.810113750374709e-06, + "loss": 0.047, + "num_input_tokens_seen": 5564824, + "step": 2835 + }, + { + "epoch": 0.3758780649436713, + "grad_norm": 0.037678878754377365, + "learning_rate": 4.8099810235844654e-06, + "loss": 0.0002, + "num_input_tokens_seen": 5566176, + "step": 2836 + }, + { + "epoch": 0.3760106030483764, + "grad_norm": 1.4424196481704712, + "learning_rate": 4.8098482522561955e-06, + "loss": 0.0094, + "num_input_tokens_seen": 5567848, + "step": 2837 + }, + { + "epoch": 0.3761431411530815, + "grad_norm": 10.544405937194824, + "learning_rate": 4.809715436392457e-06, + "loss": 0.3774, + "num_input_tokens_seen": 5570768, + "step": 2838 + }, + { + "epoch": 0.3762756792577866, + "grad_norm": 6.835763454437256, + "learning_rate": 4.809582575995812e-06, + "loss": 0.1068, + "num_input_tokens_seen": 5573232, + "step": 2839 + }, + { + "epoch": 0.3764082173624917, + "grad_norm": 12.55688762664795, + "learning_rate": 4.809449671068821e-06, + "loss": 0.4922, + "num_input_tokens_seen": 5575880, + "step": 2840 + }, + { + "epoch": 0.37654075546719684, + "grad_norm": 0.06836347281932831, + "learning_rate": 4.809316721614048e-06, + "loss": 0.0004, + "num_input_tokens_seen": 5577328, + "step": 2841 + }, + { + "epoch": 0.37667329357190193, + "grad_norm": 3.1467015743255615, + "learning_rate": 4.8091837276340546e-06, + "loss": 0.0451, + "num_input_tokens_seen": 5578560, + "step": 2842 + }, + { + "epoch": 0.37680583167660703, + "grad_norm": 8.121678352355957, + "learning_rate": 4.8090506891314066e-06, + "loss": 0.2487, + "num_input_tokens_seen": 5581072, + "step": 2843 + }, + { + "epoch": 0.3769383697813121, + "grad_norm": 0.016818249598145485, + "learning_rate": 4.808917606108668e-06, + "loss": 0.0001, + "num_input_tokens_seen": 5582392, + "step": 2844 + }, + { + "epoch": 0.3770709078860172, + "grad_norm": 13.593183517456055, + "learning_rate": 4.808784478568405e-06, + "loss": 0.4682, + "num_input_tokens_seen": 5584264, + "step": 2845 + }, + { + "epoch": 0.3772034459907223, + "grad_norm": 10.936602592468262, + "learning_rate": 4.808651306513185e-06, + "loss": 0.3243, + "num_input_tokens_seen": 5585880, + "step": 2846 + }, + { + "epoch": 0.37733598409542746, + "grad_norm": 5.6197285652160645, + "learning_rate": 4.808518089945575e-06, + "loss": 0.1943, + "num_input_tokens_seen": 5587632, + "step": 2847 + }, + { + "epoch": 0.37746852220013255, + "grad_norm": 19.004112243652344, + "learning_rate": 4.8083848288681436e-06, + "loss": 0.4383, + "num_input_tokens_seen": 5590496, + "step": 2848 + }, + { + "epoch": 0.37760106030483764, + "grad_norm": 0.42464756965637207, + "learning_rate": 4.80825152328346e-06, + "loss": 0.0025, + "num_input_tokens_seen": 5591488, + "step": 2849 + }, + { + "epoch": 0.37773359840954274, + "grad_norm": 0.18224036693572998, + "learning_rate": 4.8081181731940946e-06, + "loss": 0.001, + "num_input_tokens_seen": 5593576, + "step": 2850 + }, + { + "epoch": 0.37786613651424783, + "grad_norm": 9.077179908752441, + "learning_rate": 4.807984778602619e-06, + "loss": 0.3147, + "num_input_tokens_seen": 5596528, + "step": 2851 + }, + { + "epoch": 0.3779986746189529, + "grad_norm": 0.10246941447257996, + "learning_rate": 4.807851339511604e-06, + "loss": 0.0006, + "num_input_tokens_seen": 5597752, + "step": 2852 + }, + { + "epoch": 0.37813121272365807, + "grad_norm": 14.062154769897461, + "learning_rate": 4.807717855923624e-06, + "loss": 0.5376, + "num_input_tokens_seen": 5599992, + "step": 2853 + }, + { + "epoch": 0.37826375082836317, + "grad_norm": 4.889924049377441, + "learning_rate": 4.80758432784125e-06, + "loss": 0.0989, + "num_input_tokens_seen": 5601664, + "step": 2854 + }, + { + "epoch": 0.37839628893306826, + "grad_norm": 8.847596168518066, + "learning_rate": 4.8074507552670605e-06, + "loss": 0.2732, + "num_input_tokens_seen": 5603184, + "step": 2855 + }, + { + "epoch": 0.37852882703777335, + "grad_norm": 11.587574005126953, + "learning_rate": 4.807317138203627e-06, + "loss": 0.2452, + "num_input_tokens_seen": 5605024, + "step": 2856 + }, + { + "epoch": 0.37866136514247845, + "grad_norm": 7.685604572296143, + "learning_rate": 4.807183476653528e-06, + "loss": 0.1439, + "num_input_tokens_seen": 5606808, + "step": 2857 + }, + { + "epoch": 0.37879390324718354, + "grad_norm": 4.846345901489258, + "learning_rate": 4.80704977061934e-06, + "loss": 0.0587, + "num_input_tokens_seen": 5609456, + "step": 2858 + }, + { + "epoch": 0.3789264413518887, + "grad_norm": 68.86653137207031, + "learning_rate": 4.80691602010364e-06, + "loss": 0.3999, + "num_input_tokens_seen": 5612816, + "step": 2859 + }, + { + "epoch": 0.3790589794565938, + "grad_norm": 13.789451599121094, + "learning_rate": 4.806782225109009e-06, + "loss": 0.5113, + "num_input_tokens_seen": 5614600, + "step": 2860 + }, + { + "epoch": 0.3791915175612989, + "grad_norm": 9.264317512512207, + "learning_rate": 4.8066483856380245e-06, + "loss": 0.223, + "num_input_tokens_seen": 5616464, + "step": 2861 + }, + { + "epoch": 0.37932405566600397, + "grad_norm": 11.028992652893066, + "learning_rate": 4.806514501693268e-06, + "loss": 0.2552, + "num_input_tokens_seen": 5618728, + "step": 2862 + }, + { + "epoch": 0.37945659377070906, + "grad_norm": 3.7008163928985596, + "learning_rate": 4.80638057327732e-06, + "loss": 0.0042, + "num_input_tokens_seen": 5620056, + "step": 2863 + }, + { + "epoch": 0.37958913187541415, + "grad_norm": 7.706905841827393, + "learning_rate": 4.806246600392765e-06, + "loss": 0.0894, + "num_input_tokens_seen": 5621768, + "step": 2864 + }, + { + "epoch": 0.3797216699801193, + "grad_norm": 7.523062705993652, + "learning_rate": 4.806112583042184e-06, + "loss": 0.0873, + "num_input_tokens_seen": 5623664, + "step": 2865 + }, + { + "epoch": 0.3798542080848244, + "grad_norm": 0.7280088067054749, + "learning_rate": 4.805978521228161e-06, + "loss": 0.0044, + "num_input_tokens_seen": 5626256, + "step": 2866 + }, + { + "epoch": 0.3799867461895295, + "grad_norm": 0.5145332217216492, + "learning_rate": 4.8058444149532816e-06, + "loss": 0.0031, + "num_input_tokens_seen": 5627560, + "step": 2867 + }, + { + "epoch": 0.3801192842942346, + "grad_norm": 8.557425498962402, + "learning_rate": 4.805710264220131e-06, + "loss": 0.2467, + "num_input_tokens_seen": 5629568, + "step": 2868 + }, + { + "epoch": 0.3802518223989397, + "grad_norm": 5.689026832580566, + "learning_rate": 4.805576069031296e-06, + "loss": 0.1245, + "num_input_tokens_seen": 5631968, + "step": 2869 + }, + { + "epoch": 0.3803843605036448, + "grad_norm": 14.75581169128418, + "learning_rate": 4.805441829389363e-06, + "loss": 0.2518, + "num_input_tokens_seen": 5634272, + "step": 2870 + }, + { + "epoch": 0.3805168986083499, + "grad_norm": 7.1187849044799805, + "learning_rate": 4.8053075452969224e-06, + "loss": 0.2179, + "num_input_tokens_seen": 5636104, + "step": 2871 + }, + { + "epoch": 0.380649436713055, + "grad_norm": 6.471792697906494, + "learning_rate": 4.805173216756562e-06, + "loss": 0.128, + "num_input_tokens_seen": 5637792, + "step": 2872 + }, + { + "epoch": 0.3807819748177601, + "grad_norm": 7.564325332641602, + "learning_rate": 4.805038843770871e-06, + "loss": 0.0914, + "num_input_tokens_seen": 5639584, + "step": 2873 + }, + { + "epoch": 0.3809145129224652, + "grad_norm": 18.496036529541016, + "learning_rate": 4.804904426342442e-06, + "loss": 0.4562, + "num_input_tokens_seen": 5642800, + "step": 2874 + }, + { + "epoch": 0.3810470510271703, + "grad_norm": 0.5235787630081177, + "learning_rate": 4.804769964473864e-06, + "loss": 0.0031, + "num_input_tokens_seen": 5645504, + "step": 2875 + }, + { + "epoch": 0.38117958913187544, + "grad_norm": 1.3340500593185425, + "learning_rate": 4.804635458167732e-06, + "loss": 0.0084, + "num_input_tokens_seen": 5647376, + "step": 2876 + }, + { + "epoch": 0.38131212723658053, + "grad_norm": 14.965535163879395, + "learning_rate": 4.804500907426638e-06, + "loss": 0.5277, + "num_input_tokens_seen": 5649184, + "step": 2877 + }, + { + "epoch": 0.3814446653412856, + "grad_norm": 0.6908347606658936, + "learning_rate": 4.804366312253177e-06, + "loss": 0.0041, + "num_input_tokens_seen": 5651112, + "step": 2878 + }, + { + "epoch": 0.3815772034459907, + "grad_norm": 3.9593496322631836, + "learning_rate": 4.804231672649944e-06, + "loss": 0.0878, + "num_input_tokens_seen": 5653192, + "step": 2879 + }, + { + "epoch": 0.3817097415506958, + "grad_norm": 20.214065551757812, + "learning_rate": 4.804096988619534e-06, + "loss": 0.502, + "num_input_tokens_seen": 5654960, + "step": 2880 + }, + { + "epoch": 0.3818422796554009, + "grad_norm": 4.521189212799072, + "learning_rate": 4.8039622601645454e-06, + "loss": 0.1067, + "num_input_tokens_seen": 5656592, + "step": 2881 + }, + { + "epoch": 0.38197481776010606, + "grad_norm": 10.256596565246582, + "learning_rate": 4.803827487287575e-06, + "loss": 0.1613, + "num_input_tokens_seen": 5658440, + "step": 2882 + }, + { + "epoch": 0.38210735586481115, + "grad_norm": 12.114916801452637, + "learning_rate": 4.8036926699912215e-06, + "loss": 0.3, + "num_input_tokens_seen": 5660496, + "step": 2883 + }, + { + "epoch": 0.38223989396951624, + "grad_norm": 5.946622848510742, + "learning_rate": 4.803557808278083e-06, + "loss": 0.1662, + "num_input_tokens_seen": 5662336, + "step": 2884 + }, + { + "epoch": 0.38237243207422134, + "grad_norm": 14.305303573608398, + "learning_rate": 4.803422902150762e-06, + "loss": 0.4726, + "num_input_tokens_seen": 5664616, + "step": 2885 + }, + { + "epoch": 0.38250497017892643, + "grad_norm": 0.1547321379184723, + "learning_rate": 4.803287951611857e-06, + "loss": 0.0009, + "num_input_tokens_seen": 5665768, + "step": 2886 + }, + { + "epoch": 0.3826375082836315, + "grad_norm": 0.7804080843925476, + "learning_rate": 4.803152956663972e-06, + "loss": 0.0044, + "num_input_tokens_seen": 5668416, + "step": 2887 + }, + { + "epoch": 0.38277004638833667, + "grad_norm": 4.414644241333008, + "learning_rate": 4.80301791730971e-06, + "loss": 0.0199, + "num_input_tokens_seen": 5670344, + "step": 2888 + }, + { + "epoch": 0.38290258449304176, + "grad_norm": 3.5182840824127197, + "learning_rate": 4.802882833551673e-06, + "loss": 0.0506, + "num_input_tokens_seen": 5672656, + "step": 2889 + }, + { + "epoch": 0.38303512259774686, + "grad_norm": 13.864169120788574, + "learning_rate": 4.802747705392466e-06, + "loss": 0.1403, + "num_input_tokens_seen": 5674440, + "step": 2890 + }, + { + "epoch": 0.38316766070245195, + "grad_norm": 0.16644710302352905, + "learning_rate": 4.802612532834695e-06, + "loss": 0.001, + "num_input_tokens_seen": 5676568, + "step": 2891 + }, + { + "epoch": 0.38330019880715704, + "grad_norm": 11.119260787963867, + "learning_rate": 4.802477315880966e-06, + "loss": 0.1751, + "num_input_tokens_seen": 5678072, + "step": 2892 + }, + { + "epoch": 0.38343273691186214, + "grad_norm": 14.251937866210938, + "learning_rate": 4.802342054533886e-06, + "loss": 0.1929, + "num_input_tokens_seen": 5680152, + "step": 2893 + }, + { + "epoch": 0.3835652750165673, + "grad_norm": 9.28195571899414, + "learning_rate": 4.8022067487960634e-06, + "loss": 0.1583, + "num_input_tokens_seen": 5681984, + "step": 2894 + }, + { + "epoch": 0.3836978131212724, + "grad_norm": 9.147839546203613, + "learning_rate": 4.8020713986701055e-06, + "loss": 0.0787, + "num_input_tokens_seen": 5683440, + "step": 2895 + }, + { + "epoch": 0.3838303512259775, + "grad_norm": 7.044008731842041, + "learning_rate": 4.8019360041586225e-06, + "loss": 0.0642, + "num_input_tokens_seen": 5685448, + "step": 2896 + }, + { + "epoch": 0.38396288933068257, + "grad_norm": 0.08466002345085144, + "learning_rate": 4.801800565264227e-06, + "loss": 0.0005, + "num_input_tokens_seen": 5688000, + "step": 2897 + }, + { + "epoch": 0.38409542743538766, + "grad_norm": 0.060855597257614136, + "learning_rate": 4.801665081989528e-06, + "loss": 0.0003, + "num_input_tokens_seen": 5689392, + "step": 2898 + }, + { + "epoch": 0.38422796554009275, + "grad_norm": 10.958772659301758, + "learning_rate": 4.801529554337138e-06, + "loss": 0.167, + "num_input_tokens_seen": 5691984, + "step": 2899 + }, + { + "epoch": 0.3843605036447979, + "grad_norm": 0.08043933659791946, + "learning_rate": 4.80139398230967e-06, + "loss": 0.0005, + "num_input_tokens_seen": 5693376, + "step": 2900 + }, + { + "epoch": 0.384493041749503, + "grad_norm": 8.444103240966797, + "learning_rate": 4.8012583659097385e-06, + "loss": 0.126, + "num_input_tokens_seen": 5695584, + "step": 2901 + }, + { + "epoch": 0.3846255798542081, + "grad_norm": 6.222550392150879, + "learning_rate": 4.801122705139959e-06, + "loss": 0.1006, + "num_input_tokens_seen": 5697864, + "step": 2902 + }, + { + "epoch": 0.3847581179589132, + "grad_norm": 4.725201606750488, + "learning_rate": 4.800987000002946e-06, + "loss": 0.1507, + "num_input_tokens_seen": 5699408, + "step": 2903 + }, + { + "epoch": 0.3848906560636183, + "grad_norm": 8.471427917480469, + "learning_rate": 4.800851250501316e-06, + "loss": 0.1996, + "num_input_tokens_seen": 5701096, + "step": 2904 + }, + { + "epoch": 0.38502319416832337, + "grad_norm": 0.05548432469367981, + "learning_rate": 4.800715456637687e-06, + "loss": 0.0003, + "num_input_tokens_seen": 5702960, + "step": 2905 + }, + { + "epoch": 0.3851557322730285, + "grad_norm": 8.121257781982422, + "learning_rate": 4.800579618414677e-06, + "loss": 0.2707, + "num_input_tokens_seen": 5704960, + "step": 2906 + }, + { + "epoch": 0.3852882703777336, + "grad_norm": 5.4546403884887695, + "learning_rate": 4.800443735834904e-06, + "loss": 0.1191, + "num_input_tokens_seen": 5707312, + "step": 2907 + }, + { + "epoch": 0.3854208084824387, + "grad_norm": 15.769217491149902, + "learning_rate": 4.800307808900989e-06, + "loss": 0.6465, + "num_input_tokens_seen": 5708472, + "step": 2908 + }, + { + "epoch": 0.3855533465871438, + "grad_norm": 9.134364128112793, + "learning_rate": 4.800171837615553e-06, + "loss": 0.0937, + "num_input_tokens_seen": 5710800, + "step": 2909 + }, + { + "epoch": 0.3856858846918489, + "grad_norm": 6.90806245803833, + "learning_rate": 4.8000358219812165e-06, + "loss": 0.2421, + "num_input_tokens_seen": 5712696, + "step": 2910 + }, + { + "epoch": 0.385818422796554, + "grad_norm": 13.220758438110352, + "learning_rate": 4.799899762000602e-06, + "loss": 0.3505, + "num_input_tokens_seen": 5715064, + "step": 2911 + }, + { + "epoch": 0.38595096090125913, + "grad_norm": 4.890446662902832, + "learning_rate": 4.7997636576763354e-06, + "loss": 0.0742, + "num_input_tokens_seen": 5716560, + "step": 2912 + }, + { + "epoch": 0.3860834990059642, + "grad_norm": 8.918073654174805, + "learning_rate": 4.799627509011038e-06, + "loss": 0.0987, + "num_input_tokens_seen": 5718576, + "step": 2913 + }, + { + "epoch": 0.3862160371106693, + "grad_norm": 5.0368146896362305, + "learning_rate": 4.799491316007337e-06, + "loss": 0.1193, + "num_input_tokens_seen": 5720688, + "step": 2914 + }, + { + "epoch": 0.3863485752153744, + "grad_norm": 15.296892166137695, + "learning_rate": 4.799355078667855e-06, + "loss": 0.5037, + "num_input_tokens_seen": 5722736, + "step": 2915 + }, + { + "epoch": 0.3864811133200795, + "grad_norm": 0.11789968609809875, + "learning_rate": 4.799218796995222e-06, + "loss": 0.0007, + "num_input_tokens_seen": 5724136, + "step": 2916 + }, + { + "epoch": 0.3866136514247846, + "grad_norm": 15.619475364685059, + "learning_rate": 4.799082470992064e-06, + "loss": 0.2294, + "num_input_tokens_seen": 5727096, + "step": 2917 + }, + { + "epoch": 0.38674618952948975, + "grad_norm": 11.507015228271484, + "learning_rate": 4.798946100661011e-06, + "loss": 0.1717, + "num_input_tokens_seen": 5730048, + "step": 2918 + }, + { + "epoch": 0.38687872763419484, + "grad_norm": 20.806148529052734, + "learning_rate": 4.7988096860046895e-06, + "loss": 1.1562, + "num_input_tokens_seen": 5732920, + "step": 2919 + }, + { + "epoch": 0.38701126573889993, + "grad_norm": 5.2293901443481445, + "learning_rate": 4.798673227025732e-06, + "loss": 0.0461, + "num_input_tokens_seen": 5734536, + "step": 2920 + }, + { + "epoch": 0.387143803843605, + "grad_norm": 7.92720890045166, + "learning_rate": 4.79853672372677e-06, + "loss": 0.1044, + "num_input_tokens_seen": 5736104, + "step": 2921 + }, + { + "epoch": 0.3872763419483101, + "grad_norm": 12.38214111328125, + "learning_rate": 4.798400176110432e-06, + "loss": 0.3783, + "num_input_tokens_seen": 5738768, + "step": 2922 + }, + { + "epoch": 0.3874088800530152, + "grad_norm": 4.390325546264648, + "learning_rate": 4.798263584179355e-06, + "loss": 0.0359, + "num_input_tokens_seen": 5740872, + "step": 2923 + }, + { + "epoch": 0.38754141815772036, + "grad_norm": 16.519081115722656, + "learning_rate": 4.79812694793617e-06, + "loss": 0.5263, + "num_input_tokens_seen": 5742752, + "step": 2924 + }, + { + "epoch": 0.38767395626242546, + "grad_norm": 12.951534271240234, + "learning_rate": 4.797990267383512e-06, + "loss": 0.1945, + "num_input_tokens_seen": 5745080, + "step": 2925 + }, + { + "epoch": 0.38780649436713055, + "grad_norm": 9.392237663269043, + "learning_rate": 4.797853542524017e-06, + "loss": 0.2681, + "num_input_tokens_seen": 5747512, + "step": 2926 + }, + { + "epoch": 0.38793903247183564, + "grad_norm": 7.611248970031738, + "learning_rate": 4.797716773360321e-06, + "loss": 0.1502, + "num_input_tokens_seen": 5750064, + "step": 2927 + }, + { + "epoch": 0.38807157057654074, + "grad_norm": 3.736629009246826, + "learning_rate": 4.797579959895058e-06, + "loss": 0.0156, + "num_input_tokens_seen": 5751672, + "step": 2928 + }, + { + "epoch": 0.3882041086812459, + "grad_norm": 23.248306274414062, + "learning_rate": 4.79744310213087e-06, + "loss": 0.5916, + "num_input_tokens_seen": 5753760, + "step": 2929 + }, + { + "epoch": 0.388336646785951, + "grad_norm": 13.426590919494629, + "learning_rate": 4.797306200070394e-06, + "loss": 0.4148, + "num_input_tokens_seen": 5755784, + "step": 2930 + }, + { + "epoch": 0.38846918489065607, + "grad_norm": 6.554684162139893, + "learning_rate": 4.7971692537162694e-06, + "loss": 0.12, + "num_input_tokens_seen": 5758264, + "step": 2931 + }, + { + "epoch": 0.38860172299536117, + "grad_norm": 14.359375953674316, + "learning_rate": 4.797032263071138e-06, + "loss": 0.3622, + "num_input_tokens_seen": 5759864, + "step": 2932 + }, + { + "epoch": 0.38873426110006626, + "grad_norm": 14.0858736038208, + "learning_rate": 4.796895228137638e-06, + "loss": 0.194, + "num_input_tokens_seen": 5761584, + "step": 2933 + }, + { + "epoch": 0.38886679920477135, + "grad_norm": 13.78635311126709, + "learning_rate": 4.796758148918415e-06, + "loss": 0.2577, + "num_input_tokens_seen": 5763592, + "step": 2934 + }, + { + "epoch": 0.3889993373094765, + "grad_norm": 7.855655193328857, + "learning_rate": 4.7966210254161095e-06, + "loss": 0.1948, + "num_input_tokens_seen": 5765192, + "step": 2935 + }, + { + "epoch": 0.3891318754141816, + "grad_norm": 6.836536407470703, + "learning_rate": 4.796483857633366e-06, + "loss": 0.0662, + "num_input_tokens_seen": 5767896, + "step": 2936 + }, + { + "epoch": 0.3892644135188867, + "grad_norm": 8.234159469604492, + "learning_rate": 4.7963466455728304e-06, + "loss": 0.2592, + "num_input_tokens_seen": 5770280, + "step": 2937 + }, + { + "epoch": 0.3893969516235918, + "grad_norm": 12.620244979858398, + "learning_rate": 4.796209389237146e-06, + "loss": 0.1193, + "num_input_tokens_seen": 5772136, + "step": 2938 + }, + { + "epoch": 0.3895294897282969, + "grad_norm": 1.0394253730773926, + "learning_rate": 4.796072088628963e-06, + "loss": 0.0063, + "num_input_tokens_seen": 5775072, + "step": 2939 + }, + { + "epoch": 0.38966202783300197, + "grad_norm": 9.655213356018066, + "learning_rate": 4.795934743750923e-06, + "loss": 0.3118, + "num_input_tokens_seen": 5777440, + "step": 2940 + }, + { + "epoch": 0.3897945659377071, + "grad_norm": 7.174744606018066, + "learning_rate": 4.795797354605679e-06, + "loss": 0.1946, + "num_input_tokens_seen": 5778976, + "step": 2941 + }, + { + "epoch": 0.3899271040424122, + "grad_norm": 0.5289475917816162, + "learning_rate": 4.795659921195878e-06, + "loss": 0.0032, + "num_input_tokens_seen": 5780792, + "step": 2942 + }, + { + "epoch": 0.3900596421471173, + "grad_norm": 10.455952644348145, + "learning_rate": 4.795522443524171e-06, + "loss": 0.2786, + "num_input_tokens_seen": 5782704, + "step": 2943 + }, + { + "epoch": 0.3901921802518224, + "grad_norm": 13.964348793029785, + "learning_rate": 4.7953849215932065e-06, + "loss": 0.4183, + "num_input_tokens_seen": 5784920, + "step": 2944 + }, + { + "epoch": 0.3903247183565275, + "grad_norm": 0.43512532114982605, + "learning_rate": 4.7952473554056375e-06, + "loss": 0.0026, + "num_input_tokens_seen": 5786232, + "step": 2945 + }, + { + "epoch": 0.3904572564612326, + "grad_norm": 11.050591468811035, + "learning_rate": 4.795109744964116e-06, + "loss": 0.2845, + "num_input_tokens_seen": 5787816, + "step": 2946 + }, + { + "epoch": 0.39058979456593773, + "grad_norm": 22.758481979370117, + "learning_rate": 4.7949720902712945e-06, + "loss": 0.2809, + "num_input_tokens_seen": 5789432, + "step": 2947 + }, + { + "epoch": 0.3907223326706428, + "grad_norm": 12.670291900634766, + "learning_rate": 4.794834391329829e-06, + "loss": 0.2404, + "num_input_tokens_seen": 5791648, + "step": 2948 + }, + { + "epoch": 0.3908548707753479, + "grad_norm": 7.334057331085205, + "learning_rate": 4.794696648142373e-06, + "loss": 0.0867, + "num_input_tokens_seen": 5792832, + "step": 2949 + }, + { + "epoch": 0.390987408880053, + "grad_norm": 1.034836769104004, + "learning_rate": 4.794558860711583e-06, + "loss": 0.0059, + "num_input_tokens_seen": 5794640, + "step": 2950 + }, + { + "epoch": 0.3911199469847581, + "grad_norm": 10.421894073486328, + "learning_rate": 4.794421029040115e-06, + "loss": 0.3422, + "num_input_tokens_seen": 5796680, + "step": 2951 + }, + { + "epoch": 0.3912524850894632, + "grad_norm": 11.266824722290039, + "learning_rate": 4.794283153130627e-06, + "loss": 0.1729, + "num_input_tokens_seen": 5798240, + "step": 2952 + }, + { + "epoch": 0.39138502319416835, + "grad_norm": 7.401568412780762, + "learning_rate": 4.794145232985776e-06, + "loss": 0.1613, + "num_input_tokens_seen": 5799624, + "step": 2953 + }, + { + "epoch": 0.39151756129887344, + "grad_norm": 5.495708465576172, + "learning_rate": 4.794007268608223e-06, + "loss": 0.0847, + "num_input_tokens_seen": 5801184, + "step": 2954 + }, + { + "epoch": 0.39165009940357853, + "grad_norm": 15.047117233276367, + "learning_rate": 4.793869260000627e-06, + "loss": 0.3358, + "num_input_tokens_seen": 5802960, + "step": 2955 + }, + { + "epoch": 0.3917826375082836, + "grad_norm": 3.915280342102051, + "learning_rate": 4.79373120716565e-06, + "loss": 0.0741, + "num_input_tokens_seen": 5804808, + "step": 2956 + }, + { + "epoch": 0.3919151756129887, + "grad_norm": 6.27174186706543, + "learning_rate": 4.793593110105952e-06, + "loss": 0.0618, + "num_input_tokens_seen": 5806272, + "step": 2957 + }, + { + "epoch": 0.3920477137176938, + "grad_norm": 26.54920196533203, + "learning_rate": 4.793454968824197e-06, + "loss": 0.8077, + "num_input_tokens_seen": 5808136, + "step": 2958 + }, + { + "epoch": 0.39218025182239896, + "grad_norm": 9.607259750366211, + "learning_rate": 4.793316783323047e-06, + "loss": 0.1445, + "num_input_tokens_seen": 5809640, + "step": 2959 + }, + { + "epoch": 0.39231278992710406, + "grad_norm": 12.036971092224121, + "learning_rate": 4.793178553605169e-06, + "loss": 0.1451, + "num_input_tokens_seen": 5811200, + "step": 2960 + }, + { + "epoch": 0.39244532803180915, + "grad_norm": 15.08952522277832, + "learning_rate": 4.793040279673226e-06, + "loss": 0.4523, + "num_input_tokens_seen": 5814120, + "step": 2961 + }, + { + "epoch": 0.39257786613651424, + "grad_norm": 9.915499687194824, + "learning_rate": 4.7929019615298844e-06, + "loss": 0.3195, + "num_input_tokens_seen": 5816384, + "step": 2962 + }, + { + "epoch": 0.39271040424121934, + "grad_norm": 18.3887939453125, + "learning_rate": 4.792763599177811e-06, + "loss": 0.5786, + "num_input_tokens_seen": 5818384, + "step": 2963 + }, + { + "epoch": 0.39284294234592443, + "grad_norm": 14.735946655273438, + "learning_rate": 4.792625192619674e-06, + "loss": 0.3675, + "num_input_tokens_seen": 5820328, + "step": 2964 + }, + { + "epoch": 0.3929754804506296, + "grad_norm": 13.089232444763184, + "learning_rate": 4.792486741858141e-06, + "loss": 0.3246, + "num_input_tokens_seen": 5822632, + "step": 2965 + }, + { + "epoch": 0.39310801855533467, + "grad_norm": 11.526205062866211, + "learning_rate": 4.792348246895883e-06, + "loss": 0.2, + "num_input_tokens_seen": 5824544, + "step": 2966 + }, + { + "epoch": 0.39324055666003976, + "grad_norm": 6.57134485244751, + "learning_rate": 4.792209707735569e-06, + "loss": 0.081, + "num_input_tokens_seen": 5826824, + "step": 2967 + }, + { + "epoch": 0.39337309476474486, + "grad_norm": 11.693963050842285, + "learning_rate": 4.79207112437987e-06, + "loss": 0.234, + "num_input_tokens_seen": 5828832, + "step": 2968 + }, + { + "epoch": 0.39350563286944995, + "grad_norm": 8.574296951293945, + "learning_rate": 4.791932496831459e-06, + "loss": 0.2149, + "num_input_tokens_seen": 5830304, + "step": 2969 + }, + { + "epoch": 0.39363817097415504, + "grad_norm": 13.792306900024414, + "learning_rate": 4.791793825093009e-06, + "loss": 0.6277, + "num_input_tokens_seen": 5833152, + "step": 2970 + }, + { + "epoch": 0.3937707090788602, + "grad_norm": 3.509612560272217, + "learning_rate": 4.791655109167192e-06, + "loss": 0.0278, + "num_input_tokens_seen": 5835208, + "step": 2971 + }, + { + "epoch": 0.3939032471835653, + "grad_norm": 0.9070621728897095, + "learning_rate": 4.791516349056684e-06, + "loss": 0.005, + "num_input_tokens_seen": 5836376, + "step": 2972 + }, + { + "epoch": 0.3940357852882704, + "grad_norm": 0.6295693516731262, + "learning_rate": 4.7913775447641595e-06, + "loss": 0.0038, + "num_input_tokens_seen": 5838968, + "step": 2973 + }, + { + "epoch": 0.3941683233929755, + "grad_norm": 0.42120882868766785, + "learning_rate": 4.791238696292296e-06, + "loss": 0.0027, + "num_input_tokens_seen": 5840256, + "step": 2974 + }, + { + "epoch": 0.39430086149768057, + "grad_norm": 0.146803081035614, + "learning_rate": 4.79109980364377e-06, + "loss": 0.0009, + "num_input_tokens_seen": 5841520, + "step": 2975 + }, + { + "epoch": 0.39443339960238566, + "grad_norm": 8.67635726928711, + "learning_rate": 4.790960866821258e-06, + "loss": 0.3369, + "num_input_tokens_seen": 5843336, + "step": 2976 + }, + { + "epoch": 0.3945659377070908, + "grad_norm": 0.11394452303647995, + "learning_rate": 4.790821885827441e-06, + "loss": 0.0007, + "num_input_tokens_seen": 5845024, + "step": 2977 + }, + { + "epoch": 0.3946984758117959, + "grad_norm": 11.386565208435059, + "learning_rate": 4.790682860664998e-06, + "loss": 0.1871, + "num_input_tokens_seen": 5847568, + "step": 2978 + }, + { + "epoch": 0.394831013916501, + "grad_norm": 0.14270855486392975, + "learning_rate": 4.790543791336608e-06, + "loss": 0.0009, + "num_input_tokens_seen": 5849144, + "step": 2979 + }, + { + "epoch": 0.3949635520212061, + "grad_norm": 0.1783614605665207, + "learning_rate": 4.790404677844954e-06, + "loss": 0.0011, + "num_input_tokens_seen": 5850872, + "step": 2980 + }, + { + "epoch": 0.3950960901259112, + "grad_norm": 9.506954193115234, + "learning_rate": 4.790265520192717e-06, + "loss": 0.2391, + "num_input_tokens_seen": 5853744, + "step": 2981 + }, + { + "epoch": 0.3952286282306163, + "grad_norm": 4.984894275665283, + "learning_rate": 4.790126318382582e-06, + "loss": 0.1252, + "num_input_tokens_seen": 5855008, + "step": 2982 + }, + { + "epoch": 0.3953611663353214, + "grad_norm": 12.45118522644043, + "learning_rate": 4.789987072417232e-06, + "loss": 0.3526, + "num_input_tokens_seen": 5856352, + "step": 2983 + }, + { + "epoch": 0.3954937044400265, + "grad_norm": 10.527811050415039, + "learning_rate": 4.789847782299351e-06, + "loss": 0.3427, + "num_input_tokens_seen": 5858112, + "step": 2984 + }, + { + "epoch": 0.3956262425447316, + "grad_norm": 0.030683059245347977, + "learning_rate": 4.789708448031625e-06, + "loss": 0.0002, + "num_input_tokens_seen": 5859624, + "step": 2985 + }, + { + "epoch": 0.3957587806494367, + "grad_norm": 5.548108100891113, + "learning_rate": 4.789569069616739e-06, + "loss": 0.1473, + "num_input_tokens_seen": 5861584, + "step": 2986 + }, + { + "epoch": 0.3958913187541418, + "grad_norm": 16.987546920776367, + "learning_rate": 4.789429647057384e-06, + "loss": 0.6618, + "num_input_tokens_seen": 5863168, + "step": 2987 + }, + { + "epoch": 0.39602385685884695, + "grad_norm": 0.10859762132167816, + "learning_rate": 4.7892901803562455e-06, + "loss": 0.0007, + "num_input_tokens_seen": 5864880, + "step": 2988 + }, + { + "epoch": 0.39615639496355204, + "grad_norm": 11.099987030029297, + "learning_rate": 4.789150669516013e-06, + "loss": 0.1943, + "num_input_tokens_seen": 5866904, + "step": 2989 + }, + { + "epoch": 0.39628893306825713, + "grad_norm": 18.681299209594727, + "learning_rate": 4.789011114539376e-06, + "loss": 0.5396, + "num_input_tokens_seen": 5869288, + "step": 2990 + }, + { + "epoch": 0.3964214711729622, + "grad_norm": 5.464449405670166, + "learning_rate": 4.788871515429026e-06, + "loss": 0.1703, + "num_input_tokens_seen": 5870968, + "step": 2991 + }, + { + "epoch": 0.3965540092776673, + "grad_norm": 10.166231155395508, + "learning_rate": 4.788731872187653e-06, + "loss": 0.2362, + "num_input_tokens_seen": 5872624, + "step": 2992 + }, + { + "epoch": 0.3966865473823724, + "grad_norm": 8.971012115478516, + "learning_rate": 4.7885921848179516e-06, + "loss": 0.2499, + "num_input_tokens_seen": 5874176, + "step": 2993 + }, + { + "epoch": 0.39681908548707756, + "grad_norm": 16.86927604675293, + "learning_rate": 4.788452453322614e-06, + "loss": 0.2378, + "num_input_tokens_seen": 5876920, + "step": 2994 + }, + { + "epoch": 0.39695162359178265, + "grad_norm": 7.404328346252441, + "learning_rate": 4.788312677704335e-06, + "loss": 0.0978, + "num_input_tokens_seen": 5879176, + "step": 2995 + }, + { + "epoch": 0.39708416169648775, + "grad_norm": 6.144297122955322, + "learning_rate": 4.788172857965808e-06, + "loss": 0.1248, + "num_input_tokens_seen": 5881256, + "step": 2996 + }, + { + "epoch": 0.39721669980119284, + "grad_norm": 0.10653188079595566, + "learning_rate": 4.78803299410973e-06, + "loss": 0.0007, + "num_input_tokens_seen": 5882792, + "step": 2997 + }, + { + "epoch": 0.39734923790589793, + "grad_norm": 17.77987289428711, + "learning_rate": 4.787893086138797e-06, + "loss": 0.513, + "num_input_tokens_seen": 5884456, + "step": 2998 + }, + { + "epoch": 0.397481776010603, + "grad_norm": 12.142847061157227, + "learning_rate": 4.787753134055708e-06, + "loss": 0.1951, + "num_input_tokens_seen": 5885952, + "step": 2999 + }, + { + "epoch": 0.3976143141153082, + "grad_norm": 3.825782537460327, + "learning_rate": 4.78761313786316e-06, + "loss": 0.0265, + "num_input_tokens_seen": 5887648, + "step": 3000 + }, + { + "epoch": 0.39774685222001327, + "grad_norm": 12.713396072387695, + "learning_rate": 4.7874730975638525e-06, + "loss": 0.4991, + "num_input_tokens_seen": 5889720, + "step": 3001 + }, + { + "epoch": 0.39787939032471836, + "grad_norm": 14.01033878326416, + "learning_rate": 4.787333013160485e-06, + "loss": 0.4214, + "num_input_tokens_seen": 5891472, + "step": 3002 + }, + { + "epoch": 0.39801192842942346, + "grad_norm": 11.056123733520508, + "learning_rate": 4.78719288465576e-06, + "loss": 0.2324, + "num_input_tokens_seen": 5893512, + "step": 3003 + }, + { + "epoch": 0.39814446653412855, + "grad_norm": 4.753828048706055, + "learning_rate": 4.787052712052378e-06, + "loss": 0.0504, + "num_input_tokens_seen": 5895000, + "step": 3004 + }, + { + "epoch": 0.39827700463883364, + "grad_norm": 2.477724075317383, + "learning_rate": 4.786912495353042e-06, + "loss": 0.0144, + "num_input_tokens_seen": 5897896, + "step": 3005 + }, + { + "epoch": 0.3984095427435388, + "grad_norm": 4.012485980987549, + "learning_rate": 4.786772234560455e-06, + "loss": 0.1019, + "num_input_tokens_seen": 5899648, + "step": 3006 + }, + { + "epoch": 0.3985420808482439, + "grad_norm": 1.1006004810333252, + "learning_rate": 4.7866319296773216e-06, + "loss": 0.0069, + "num_input_tokens_seen": 5901408, + "step": 3007 + }, + { + "epoch": 0.398674618952949, + "grad_norm": 0.5684716105461121, + "learning_rate": 4.786491580706348e-06, + "loss": 0.0036, + "num_input_tokens_seen": 5902712, + "step": 3008 + }, + { + "epoch": 0.39880715705765407, + "grad_norm": 0.44975587725639343, + "learning_rate": 4.786351187650239e-06, + "loss": 0.0028, + "num_input_tokens_seen": 5904200, + "step": 3009 + }, + { + "epoch": 0.39893969516235916, + "grad_norm": 9.59408950805664, + "learning_rate": 4.786210750511701e-06, + "loss": 0.1614, + "num_input_tokens_seen": 5905768, + "step": 3010 + }, + { + "epoch": 0.39907223326706426, + "grad_norm": 11.989697456359863, + "learning_rate": 4.786070269293444e-06, + "loss": 0.3817, + "num_input_tokens_seen": 5909056, + "step": 3011 + }, + { + "epoch": 0.3992047713717694, + "grad_norm": 11.94229793548584, + "learning_rate": 4.785929743998174e-06, + "loss": 0.4448, + "num_input_tokens_seen": 5910592, + "step": 3012 + }, + { + "epoch": 0.3993373094764745, + "grad_norm": 17.604032516479492, + "learning_rate": 4.7857891746286025e-06, + "loss": 0.738, + "num_input_tokens_seen": 5912872, + "step": 3013 + }, + { + "epoch": 0.3994698475811796, + "grad_norm": 5.24974250793457, + "learning_rate": 4.785648561187438e-06, + "loss": 0.0739, + "num_input_tokens_seen": 5914752, + "step": 3014 + }, + { + "epoch": 0.3996023856858847, + "grad_norm": 2.9489407539367676, + "learning_rate": 4.785507903677392e-06, + "loss": 0.0689, + "num_input_tokens_seen": 5916720, + "step": 3015 + }, + { + "epoch": 0.3997349237905898, + "grad_norm": 5.593491554260254, + "learning_rate": 4.785367202101178e-06, + "loss": 0.1008, + "num_input_tokens_seen": 5918432, + "step": 3016 + }, + { + "epoch": 0.3998674618952949, + "grad_norm": 13.952553749084473, + "learning_rate": 4.785226456461508e-06, + "loss": 0.5344, + "num_input_tokens_seen": 5920480, + "step": 3017 + }, + { + "epoch": 0.4, + "grad_norm": 0.10456594079732895, + "learning_rate": 4.785085666761094e-06, + "loss": 0.0007, + "num_input_tokens_seen": 5921888, + "step": 3018 + }, + { + "epoch": 0.4001325381047051, + "grad_norm": 13.457680702209473, + "learning_rate": 4.784944833002654e-06, + "loss": 0.397, + "num_input_tokens_seen": 5923736, + "step": 3019 + }, + { + "epoch": 0.4002650762094102, + "grad_norm": 5.41466760635376, + "learning_rate": 4.784803955188899e-06, + "loss": 0.0587, + "num_input_tokens_seen": 5926984, + "step": 3020 + }, + { + "epoch": 0.4003976143141153, + "grad_norm": 11.419010162353516, + "learning_rate": 4.7846630333225495e-06, + "loss": 0.2443, + "num_input_tokens_seen": 5928304, + "step": 3021 + }, + { + "epoch": 0.4005301524188204, + "grad_norm": 8.327176094055176, + "learning_rate": 4.784522067406319e-06, + "loss": 0.1608, + "num_input_tokens_seen": 5930296, + "step": 3022 + }, + { + "epoch": 0.4006626905235255, + "grad_norm": 0.17967109382152557, + "learning_rate": 4.7843810574429284e-06, + "loss": 0.0011, + "num_input_tokens_seen": 5932520, + "step": 3023 + }, + { + "epoch": 0.40079522862823064, + "grad_norm": 0.6572452783584595, + "learning_rate": 4.784240003435095e-06, + "loss": 0.0035, + "num_input_tokens_seen": 5934288, + "step": 3024 + }, + { + "epoch": 0.40092776673293573, + "grad_norm": 0.7890099883079529, + "learning_rate": 4.7840989053855374e-06, + "loss": 0.0032, + "num_input_tokens_seen": 5936080, + "step": 3025 + }, + { + "epoch": 0.4010603048376408, + "grad_norm": 0.1663220375776291, + "learning_rate": 4.783957763296978e-06, + "loss": 0.001, + "num_input_tokens_seen": 5937344, + "step": 3026 + }, + { + "epoch": 0.4011928429423459, + "grad_norm": 23.41261100769043, + "learning_rate": 4.783816577172137e-06, + "loss": 0.4825, + "num_input_tokens_seen": 5939352, + "step": 3027 + }, + { + "epoch": 0.401325381047051, + "grad_norm": 4.292874813079834, + "learning_rate": 4.783675347013736e-06, + "loss": 0.1444, + "num_input_tokens_seen": 5941512, + "step": 3028 + }, + { + "epoch": 0.4014579191517561, + "grad_norm": 2.5654399394989014, + "learning_rate": 4.7835340728245e-06, + "loss": 0.0162, + "num_input_tokens_seen": 5943120, + "step": 3029 + }, + { + "epoch": 0.40159045725646125, + "grad_norm": 10.964597702026367, + "learning_rate": 4.783392754607152e-06, + "loss": 0.2558, + "num_input_tokens_seen": 5944800, + "step": 3030 + }, + { + "epoch": 0.40172299536116635, + "grad_norm": 8.546445846557617, + "learning_rate": 4.783251392364417e-06, + "loss": 0.2801, + "num_input_tokens_seen": 5946080, + "step": 3031 + }, + { + "epoch": 0.40185553346587144, + "grad_norm": 10.508057594299316, + "learning_rate": 4.783109986099019e-06, + "loss": 0.2873, + "num_input_tokens_seen": 5947752, + "step": 3032 + }, + { + "epoch": 0.40198807157057653, + "grad_norm": 0.0683145523071289, + "learning_rate": 4.782968535813685e-06, + "loss": 0.0004, + "num_input_tokens_seen": 5949728, + "step": 3033 + }, + { + "epoch": 0.4021206096752816, + "grad_norm": 13.5866117477417, + "learning_rate": 4.782827041511143e-06, + "loss": 0.2349, + "num_input_tokens_seen": 5950968, + "step": 3034 + }, + { + "epoch": 0.4022531477799867, + "grad_norm": 5.996464252471924, + "learning_rate": 4.782685503194121e-06, + "loss": 0.2151, + "num_input_tokens_seen": 5953112, + "step": 3035 + }, + { + "epoch": 0.40238568588469187, + "grad_norm": 8.38286304473877, + "learning_rate": 4.782543920865348e-06, + "loss": 0.2292, + "num_input_tokens_seen": 5955192, + "step": 3036 + }, + { + "epoch": 0.40251822398939696, + "grad_norm": 1.311390995979309, + "learning_rate": 4.7824022945275535e-06, + "loss": 0.0204, + "num_input_tokens_seen": 5956824, + "step": 3037 + }, + { + "epoch": 0.40265076209410205, + "grad_norm": 5.924940586090088, + "learning_rate": 4.782260624183468e-06, + "loss": 0.0334, + "num_input_tokens_seen": 5959088, + "step": 3038 + }, + { + "epoch": 0.40278330019880715, + "grad_norm": 8.918008804321289, + "learning_rate": 4.782118909835823e-06, + "loss": 0.2929, + "num_input_tokens_seen": 5960680, + "step": 3039 + }, + { + "epoch": 0.40291583830351224, + "grad_norm": 0.106726735830307, + "learning_rate": 4.781977151487351e-06, + "loss": 0.0007, + "num_input_tokens_seen": 5962168, + "step": 3040 + }, + { + "epoch": 0.40304837640821733, + "grad_norm": 15.09712028503418, + "learning_rate": 4.781835349140786e-06, + "loss": 0.4329, + "num_input_tokens_seen": 5964096, + "step": 3041 + }, + { + "epoch": 0.4031809145129225, + "grad_norm": 9.392486572265625, + "learning_rate": 4.781693502798861e-06, + "loss": 0.1517, + "num_input_tokens_seen": 5965760, + "step": 3042 + }, + { + "epoch": 0.4033134526176276, + "grad_norm": 8.407020568847656, + "learning_rate": 4.781551612464311e-06, + "loss": 0.2123, + "num_input_tokens_seen": 5968096, + "step": 3043 + }, + { + "epoch": 0.40344599072233267, + "grad_norm": 10.140686988830566, + "learning_rate": 4.7814096781398725e-06, + "loss": 0.2907, + "num_input_tokens_seen": 5969616, + "step": 3044 + }, + { + "epoch": 0.40357852882703776, + "grad_norm": 0.2493980973958969, + "learning_rate": 4.781267699828281e-06, + "loss": 0.0015, + "num_input_tokens_seen": 5971352, + "step": 3045 + }, + { + "epoch": 0.40371106693174286, + "grad_norm": 10.33617877960205, + "learning_rate": 4.781125677532274e-06, + "loss": 0.143, + "num_input_tokens_seen": 5973440, + "step": 3046 + }, + { + "epoch": 0.403843605036448, + "grad_norm": 6.5604963302612305, + "learning_rate": 4.7809836112545906e-06, + "loss": 0.0494, + "num_input_tokens_seen": 5975112, + "step": 3047 + }, + { + "epoch": 0.4039761431411531, + "grad_norm": 18.71568489074707, + "learning_rate": 4.78084150099797e-06, + "loss": 0.5231, + "num_input_tokens_seen": 5976680, + "step": 3048 + }, + { + "epoch": 0.4041086812458582, + "grad_norm": 11.909926414489746, + "learning_rate": 4.780699346765152e-06, + "loss": 0.2672, + "num_input_tokens_seen": 5978664, + "step": 3049 + }, + { + "epoch": 0.4042412193505633, + "grad_norm": 11.537005424499512, + "learning_rate": 4.780557148558876e-06, + "loss": 0.1979, + "num_input_tokens_seen": 5980384, + "step": 3050 + }, + { + "epoch": 0.4043737574552684, + "grad_norm": 12.15526294708252, + "learning_rate": 4.780414906381886e-06, + "loss": 0.5005, + "num_input_tokens_seen": 5982792, + "step": 3051 + }, + { + "epoch": 0.40450629555997347, + "grad_norm": 5.510542392730713, + "learning_rate": 4.7802726202369236e-06, + "loss": 0.1017, + "num_input_tokens_seen": 5984328, + "step": 3052 + }, + { + "epoch": 0.4046388336646786, + "grad_norm": 7.926218032836914, + "learning_rate": 4.780130290126731e-06, + "loss": 0.1155, + "num_input_tokens_seen": 5986304, + "step": 3053 + }, + { + "epoch": 0.4047713717693837, + "grad_norm": 10.571142196655273, + "learning_rate": 4.779987916054054e-06, + "loss": 0.1468, + "num_input_tokens_seen": 5988336, + "step": 3054 + }, + { + "epoch": 0.4049039098740888, + "grad_norm": 0.49342310428619385, + "learning_rate": 4.7798454980216375e-06, + "loss": 0.0031, + "num_input_tokens_seen": 5989840, + "step": 3055 + }, + { + "epoch": 0.4050364479787939, + "grad_norm": 9.98480224609375, + "learning_rate": 4.779703036032227e-06, + "loss": 0.197, + "num_input_tokens_seen": 5991936, + "step": 3056 + }, + { + "epoch": 0.405168986083499, + "grad_norm": 9.786097526550293, + "learning_rate": 4.779560530088568e-06, + "loss": 0.3541, + "num_input_tokens_seen": 5993920, + "step": 3057 + }, + { + "epoch": 0.4053015241882041, + "grad_norm": 11.278203010559082, + "learning_rate": 4.779417980193411e-06, + "loss": 0.1997, + "num_input_tokens_seen": 5995688, + "step": 3058 + }, + { + "epoch": 0.40543406229290924, + "grad_norm": 14.312846183776855, + "learning_rate": 4.779275386349502e-06, + "loss": 0.5092, + "num_input_tokens_seen": 5997272, + "step": 3059 + }, + { + "epoch": 0.40556660039761433, + "grad_norm": 11.433096885681152, + "learning_rate": 4.779132748559591e-06, + "loss": 0.277, + "num_input_tokens_seen": 5999984, + "step": 3060 + }, + { + "epoch": 0.4056991385023194, + "grad_norm": 13.709260940551758, + "learning_rate": 4.7789900668264285e-06, + "loss": 0.285, + "num_input_tokens_seen": 6002960, + "step": 3061 + }, + { + "epoch": 0.4058316766070245, + "grad_norm": 1.373302936553955, + "learning_rate": 4.778847341152766e-06, + "loss": 0.0089, + "num_input_tokens_seen": 6005040, + "step": 3062 + }, + { + "epoch": 0.4059642147117296, + "grad_norm": 8.988983154296875, + "learning_rate": 4.778704571541353e-06, + "loss": 0.211, + "num_input_tokens_seen": 6006856, + "step": 3063 + }, + { + "epoch": 0.4060967528164347, + "grad_norm": 2.7159955501556396, + "learning_rate": 4.7785617579949464e-06, + "loss": 0.0525, + "num_input_tokens_seen": 6008328, + "step": 3064 + }, + { + "epoch": 0.40622929092113985, + "grad_norm": 5.499091625213623, + "learning_rate": 4.778418900516295e-06, + "loss": 0.2052, + "num_input_tokens_seen": 6009720, + "step": 3065 + }, + { + "epoch": 0.40636182902584495, + "grad_norm": 8.766337394714355, + "learning_rate": 4.778275999108156e-06, + "loss": 0.2336, + "num_input_tokens_seen": 6011488, + "step": 3066 + }, + { + "epoch": 0.40649436713055004, + "grad_norm": 6.744692802429199, + "learning_rate": 4.778133053773285e-06, + "loss": 0.1526, + "num_input_tokens_seen": 6013040, + "step": 3067 + }, + { + "epoch": 0.40662690523525513, + "grad_norm": 12.018231391906738, + "learning_rate": 4.777990064514436e-06, + "loss": 0.5898, + "num_input_tokens_seen": 6015712, + "step": 3068 + }, + { + "epoch": 0.4067594433399602, + "grad_norm": 11.734688758850098, + "learning_rate": 4.777847031334367e-06, + "loss": 0.4626, + "num_input_tokens_seen": 6018200, + "step": 3069 + }, + { + "epoch": 0.4068919814446653, + "grad_norm": 0.4630076289176941, + "learning_rate": 4.777703954235836e-06, + "loss": 0.003, + "num_input_tokens_seen": 6020296, + "step": 3070 + }, + { + "epoch": 0.40702451954937047, + "grad_norm": 3.5980210304260254, + "learning_rate": 4.777560833221602e-06, + "loss": 0.0933, + "num_input_tokens_seen": 6022024, + "step": 3071 + }, + { + "epoch": 0.40715705765407556, + "grad_norm": 5.336532115936279, + "learning_rate": 4.777417668294423e-06, + "loss": 0.1168, + "num_input_tokens_seen": 6023808, + "step": 3072 + }, + { + "epoch": 0.40728959575878065, + "grad_norm": 6.206470012664795, + "learning_rate": 4.777274459457061e-06, + "loss": 0.1349, + "num_input_tokens_seen": 6025672, + "step": 3073 + }, + { + "epoch": 0.40742213386348575, + "grad_norm": 9.960881233215332, + "learning_rate": 4.777131206712276e-06, + "loss": 0.2055, + "num_input_tokens_seen": 6028920, + "step": 3074 + }, + { + "epoch": 0.40755467196819084, + "grad_norm": 11.088637351989746, + "learning_rate": 4.776987910062831e-06, + "loss": 0.2431, + "num_input_tokens_seen": 6031552, + "step": 3075 + }, + { + "epoch": 0.40768721007289593, + "grad_norm": 8.290035247802734, + "learning_rate": 4.776844569511488e-06, + "loss": 0.3315, + "num_input_tokens_seen": 6034800, + "step": 3076 + }, + { + "epoch": 0.4078197481776011, + "grad_norm": 0.3557060956954956, + "learning_rate": 4.776701185061011e-06, + "loss": 0.0023, + "num_input_tokens_seen": 6036192, + "step": 3077 + }, + { + "epoch": 0.4079522862823062, + "grad_norm": 10.25466537475586, + "learning_rate": 4.776557756714165e-06, + "loss": 0.3523, + "num_input_tokens_seen": 6038112, + "step": 3078 + }, + { + "epoch": 0.40808482438701127, + "grad_norm": 0.794212281703949, + "learning_rate": 4.776414284473714e-06, + "loss": 0.0051, + "num_input_tokens_seen": 6040296, + "step": 3079 + }, + { + "epoch": 0.40821736249171636, + "grad_norm": 2.279489517211914, + "learning_rate": 4.776270768342426e-06, + "loss": 0.0324, + "num_input_tokens_seen": 6043056, + "step": 3080 + }, + { + "epoch": 0.40834990059642146, + "grad_norm": 14.591126441955566, + "learning_rate": 4.776127208323066e-06, + "loss": 0.261, + "num_input_tokens_seen": 6045160, + "step": 3081 + }, + { + "epoch": 0.40848243870112655, + "grad_norm": 8.44650650024414, + "learning_rate": 4.7759836044184045e-06, + "loss": 0.2093, + "num_input_tokens_seen": 6046888, + "step": 3082 + }, + { + "epoch": 0.4086149768058317, + "grad_norm": 0.26111704111099243, + "learning_rate": 4.775839956631209e-06, + "loss": 0.0016, + "num_input_tokens_seen": 6048240, + "step": 3083 + }, + { + "epoch": 0.4087475149105368, + "grad_norm": 12.474830627441406, + "learning_rate": 4.775696264964248e-06, + "loss": 0.258, + "num_input_tokens_seen": 6050160, + "step": 3084 + }, + { + "epoch": 0.4088800530152419, + "grad_norm": 8.354405403137207, + "learning_rate": 4.775552529420294e-06, + "loss": 0.1556, + "num_input_tokens_seen": 6052160, + "step": 3085 + }, + { + "epoch": 0.409012591119947, + "grad_norm": 11.345044136047363, + "learning_rate": 4.7754087500021166e-06, + "loss": 0.222, + "num_input_tokens_seen": 6053904, + "step": 3086 + }, + { + "epoch": 0.40914512922465207, + "grad_norm": 19.17113494873047, + "learning_rate": 4.775264926712489e-06, + "loss": 0.3486, + "num_input_tokens_seen": 6055320, + "step": 3087 + }, + { + "epoch": 0.40927766732935716, + "grad_norm": 10.271292686462402, + "learning_rate": 4.775121059554185e-06, + "loss": 0.2964, + "num_input_tokens_seen": 6057752, + "step": 3088 + }, + { + "epoch": 0.4094102054340623, + "grad_norm": 13.502019882202148, + "learning_rate": 4.774977148529976e-06, + "loss": 0.6909, + "num_input_tokens_seen": 6059664, + "step": 3089 + }, + { + "epoch": 0.4095427435387674, + "grad_norm": 0.2012919783592224, + "learning_rate": 4.77483319364264e-06, + "loss": 0.0012, + "num_input_tokens_seen": 6060904, + "step": 3090 + }, + { + "epoch": 0.4096752816434725, + "grad_norm": 5.689934730529785, + "learning_rate": 4.774689194894949e-06, + "loss": 0.0643, + "num_input_tokens_seen": 6062440, + "step": 3091 + }, + { + "epoch": 0.4098078197481776, + "grad_norm": 0.16804854571819305, + "learning_rate": 4.774545152289682e-06, + "loss": 0.001, + "num_input_tokens_seen": 6063496, + "step": 3092 + }, + { + "epoch": 0.4099403578528827, + "grad_norm": 0.13473553955554962, + "learning_rate": 4.774401065829615e-06, + "loss": 0.0008, + "num_input_tokens_seen": 6064816, + "step": 3093 + }, + { + "epoch": 0.4100728959575878, + "grad_norm": 17.00417137145996, + "learning_rate": 4.7742569355175265e-06, + "loss": 0.4453, + "num_input_tokens_seen": 6066272, + "step": 3094 + }, + { + "epoch": 0.41020543406229293, + "grad_norm": 0.4533868432044983, + "learning_rate": 4.774112761356196e-06, + "loss": 0.0027, + "num_input_tokens_seen": 6068736, + "step": 3095 + }, + { + "epoch": 0.410337972166998, + "grad_norm": 12.019583702087402, + "learning_rate": 4.773968543348401e-06, + "loss": 0.2849, + "num_input_tokens_seen": 6071496, + "step": 3096 + }, + { + "epoch": 0.4104705102717031, + "grad_norm": 42.181907653808594, + "learning_rate": 4.773824281496925e-06, + "loss": 0.3721, + "num_input_tokens_seen": 6073360, + "step": 3097 + }, + { + "epoch": 0.4106030483764082, + "grad_norm": 17.163623809814453, + "learning_rate": 4.773679975804548e-06, + "loss": 0.6141, + "num_input_tokens_seen": 6076352, + "step": 3098 + }, + { + "epoch": 0.4107355864811133, + "grad_norm": 13.250836372375488, + "learning_rate": 4.773535626274052e-06, + "loss": 0.1539, + "num_input_tokens_seen": 6078464, + "step": 3099 + }, + { + "epoch": 0.41086812458581845, + "grad_norm": 0.06211230531334877, + "learning_rate": 4.773391232908221e-06, + "loss": 0.0004, + "num_input_tokens_seen": 6080024, + "step": 3100 + }, + { + "epoch": 0.41100066269052354, + "grad_norm": 10.412588119506836, + "learning_rate": 4.77324679570984e-06, + "loss": 0.4491, + "num_input_tokens_seen": 6081584, + "step": 3101 + }, + { + "epoch": 0.41113320079522864, + "grad_norm": 10.901458740234375, + "learning_rate": 4.77310231468169e-06, + "loss": 0.1699, + "num_input_tokens_seen": 6083480, + "step": 3102 + }, + { + "epoch": 0.41126573889993373, + "grad_norm": 14.307418823242188, + "learning_rate": 4.772957789826561e-06, + "loss": 0.4568, + "num_input_tokens_seen": 6085696, + "step": 3103 + }, + { + "epoch": 0.4113982770046388, + "grad_norm": 0.22862078249454498, + "learning_rate": 4.772813221147238e-06, + "loss": 0.0013, + "num_input_tokens_seen": 6087016, + "step": 3104 + }, + { + "epoch": 0.4115308151093439, + "grad_norm": 9.769891738891602, + "learning_rate": 4.772668608646507e-06, + "loss": 0.2239, + "num_input_tokens_seen": 6088616, + "step": 3105 + }, + { + "epoch": 0.41166335321404907, + "grad_norm": 15.975464820861816, + "learning_rate": 4.772523952327158e-06, + "loss": 0.4469, + "num_input_tokens_seen": 6090528, + "step": 3106 + }, + { + "epoch": 0.41179589131875416, + "grad_norm": 0.19305944442749023, + "learning_rate": 4.77237925219198e-06, + "loss": 0.001, + "num_input_tokens_seen": 6092024, + "step": 3107 + }, + { + "epoch": 0.41192842942345925, + "grad_norm": 17.83104705810547, + "learning_rate": 4.772234508243762e-06, + "loss": 0.8094, + "num_input_tokens_seen": 6094184, + "step": 3108 + }, + { + "epoch": 0.41206096752816435, + "grad_norm": 5.032706260681152, + "learning_rate": 4.772089720485294e-06, + "loss": 0.0383, + "num_input_tokens_seen": 6096752, + "step": 3109 + }, + { + "epoch": 0.41219350563286944, + "grad_norm": 14.342052459716797, + "learning_rate": 4.77194488891937e-06, + "loss": 0.6316, + "num_input_tokens_seen": 6099168, + "step": 3110 + }, + { + "epoch": 0.41232604373757453, + "grad_norm": 0.8267076015472412, + "learning_rate": 4.7718000135487805e-06, + "loss": 0.0047, + "num_input_tokens_seen": 6102288, + "step": 3111 + }, + { + "epoch": 0.4124585818422797, + "grad_norm": 17.27442169189453, + "learning_rate": 4.77165509437632e-06, + "loss": 0.5241, + "num_input_tokens_seen": 6105184, + "step": 3112 + }, + { + "epoch": 0.4125911199469848, + "grad_norm": 4.625985622406006, + "learning_rate": 4.771510131404782e-06, + "loss": 0.1199, + "num_input_tokens_seen": 6107976, + "step": 3113 + }, + { + "epoch": 0.41272365805168987, + "grad_norm": 19.51150131225586, + "learning_rate": 4.771365124636962e-06, + "loss": 0.7241, + "num_input_tokens_seen": 6109464, + "step": 3114 + }, + { + "epoch": 0.41285619615639496, + "grad_norm": 5.553974628448486, + "learning_rate": 4.771220074075655e-06, + "loss": 0.0661, + "num_input_tokens_seen": 6110952, + "step": 3115 + }, + { + "epoch": 0.41298873426110005, + "grad_norm": 11.857381820678711, + "learning_rate": 4.771074979723658e-06, + "loss": 0.3017, + "num_input_tokens_seen": 6112760, + "step": 3116 + }, + { + "epoch": 0.41312127236580515, + "grad_norm": 6.518974781036377, + "learning_rate": 4.770929841583769e-06, + "loss": 0.2368, + "num_input_tokens_seen": 6114952, + "step": 3117 + }, + { + "epoch": 0.4132538104705103, + "grad_norm": 12.872612953186035, + "learning_rate": 4.770784659658786e-06, + "loss": 0.2508, + "num_input_tokens_seen": 6116928, + "step": 3118 + }, + { + "epoch": 0.4133863485752154, + "grad_norm": 21.21937370300293, + "learning_rate": 4.7706394339515084e-06, + "loss": 0.5766, + "num_input_tokens_seen": 6118968, + "step": 3119 + }, + { + "epoch": 0.4135188866799205, + "grad_norm": 11.144512176513672, + "learning_rate": 4.770494164464736e-06, + "loss": 0.1176, + "num_input_tokens_seen": 6120664, + "step": 3120 + }, + { + "epoch": 0.4136514247846256, + "grad_norm": 0.5427361130714417, + "learning_rate": 4.77034885120127e-06, + "loss": 0.0032, + "num_input_tokens_seen": 6122608, + "step": 3121 + }, + { + "epoch": 0.41378396288933067, + "grad_norm": 14.200712203979492, + "learning_rate": 4.770203494163911e-06, + "loss": 0.4424, + "num_input_tokens_seen": 6124632, + "step": 3122 + }, + { + "epoch": 0.41391650099403576, + "grad_norm": 17.00480079650879, + "learning_rate": 4.770058093355463e-06, + "loss": 0.52, + "num_input_tokens_seen": 6127632, + "step": 3123 + }, + { + "epoch": 0.4140490390987409, + "grad_norm": 10.266498565673828, + "learning_rate": 4.76991264877873e-06, + "loss": 0.1349, + "num_input_tokens_seen": 6130328, + "step": 3124 + }, + { + "epoch": 0.414181577203446, + "grad_norm": 16.510438919067383, + "learning_rate": 4.769767160436513e-06, + "loss": 0.3694, + "num_input_tokens_seen": 6132504, + "step": 3125 + }, + { + "epoch": 0.4143141153081511, + "grad_norm": 9.144237518310547, + "learning_rate": 4.769621628331621e-06, + "loss": 0.1583, + "num_input_tokens_seen": 6134448, + "step": 3126 + }, + { + "epoch": 0.4144466534128562, + "grad_norm": 15.152335166931152, + "learning_rate": 4.769476052466858e-06, + "loss": 0.2271, + "num_input_tokens_seen": 6136088, + "step": 3127 + }, + { + "epoch": 0.4145791915175613, + "grad_norm": 11.477108001708984, + "learning_rate": 4.769330432845031e-06, + "loss": 0.2565, + "num_input_tokens_seen": 6137712, + "step": 3128 + }, + { + "epoch": 0.4147117296222664, + "grad_norm": 7.583346843719482, + "learning_rate": 4.769184769468947e-06, + "loss": 0.1339, + "num_input_tokens_seen": 6139192, + "step": 3129 + }, + { + "epoch": 0.4148442677269715, + "grad_norm": 12.235583305358887, + "learning_rate": 4.769039062341416e-06, + "loss": 0.4335, + "num_input_tokens_seen": 6140768, + "step": 3130 + }, + { + "epoch": 0.4149768058316766, + "grad_norm": 11.85566520690918, + "learning_rate": 4.768893311465247e-06, + "loss": 0.3706, + "num_input_tokens_seen": 6143088, + "step": 3131 + }, + { + "epoch": 0.4151093439363817, + "grad_norm": 11.401771545410156, + "learning_rate": 4.768747516843248e-06, + "loss": 0.3559, + "num_input_tokens_seen": 6144656, + "step": 3132 + }, + { + "epoch": 0.4152418820410868, + "grad_norm": 3.7596797943115234, + "learning_rate": 4.768601678478233e-06, + "loss": 0.0218, + "num_input_tokens_seen": 6146072, + "step": 3133 + }, + { + "epoch": 0.4153744201457919, + "grad_norm": 7.428145885467529, + "learning_rate": 4.7684557963730125e-06, + "loss": 0.1476, + "num_input_tokens_seen": 6147888, + "step": 3134 + }, + { + "epoch": 0.415506958250497, + "grad_norm": 10.638260841369629, + "learning_rate": 4.7683098705303995e-06, + "loss": 0.3115, + "num_input_tokens_seen": 6149528, + "step": 3135 + }, + { + "epoch": 0.41563949635520214, + "grad_norm": 8.499889373779297, + "learning_rate": 4.768163900953207e-06, + "loss": 0.1436, + "num_input_tokens_seen": 6151400, + "step": 3136 + }, + { + "epoch": 0.41577203445990724, + "grad_norm": 6.4343976974487305, + "learning_rate": 4.76801788764425e-06, + "loss": 0.2123, + "num_input_tokens_seen": 6153928, + "step": 3137 + }, + { + "epoch": 0.41590457256461233, + "grad_norm": 2.0699076652526855, + "learning_rate": 4.767871830606343e-06, + "loss": 0.0123, + "num_input_tokens_seen": 6156352, + "step": 3138 + }, + { + "epoch": 0.4160371106693174, + "grad_norm": 10.167271614074707, + "learning_rate": 4.767725729842303e-06, + "loss": 0.2444, + "num_input_tokens_seen": 6158368, + "step": 3139 + }, + { + "epoch": 0.4161696487740225, + "grad_norm": 9.541232109069824, + "learning_rate": 4.7675795853549465e-06, + "loss": 0.2729, + "num_input_tokens_seen": 6160592, + "step": 3140 + }, + { + "epoch": 0.4163021868787276, + "grad_norm": 0.28433868288993835, + "learning_rate": 4.767433397147092e-06, + "loss": 0.0017, + "num_input_tokens_seen": 6162072, + "step": 3141 + }, + { + "epoch": 0.41643472498343276, + "grad_norm": 6.1566243171691895, + "learning_rate": 4.767287165221557e-06, + "loss": 0.0757, + "num_input_tokens_seen": 6163680, + "step": 3142 + }, + { + "epoch": 0.41656726308813785, + "grad_norm": 0.060589615255594254, + "learning_rate": 4.76714088958116e-06, + "loss": 0.0004, + "num_input_tokens_seen": 6164992, + "step": 3143 + }, + { + "epoch": 0.41669980119284294, + "grad_norm": 3.048267126083374, + "learning_rate": 4.766994570228723e-06, + "loss": 0.0199, + "num_input_tokens_seen": 6167136, + "step": 3144 + }, + { + "epoch": 0.41683233929754804, + "grad_norm": 0.09219857305288315, + "learning_rate": 4.766848207167069e-06, + "loss": 0.0006, + "num_input_tokens_seen": 6169176, + "step": 3145 + }, + { + "epoch": 0.41696487740225313, + "grad_norm": 19.610143661499023, + "learning_rate": 4.766701800399015e-06, + "loss": 0.2388, + "num_input_tokens_seen": 6171008, + "step": 3146 + }, + { + "epoch": 0.4170974155069582, + "grad_norm": 0.07013262808322906, + "learning_rate": 4.7665553499273875e-06, + "loss": 0.0004, + "num_input_tokens_seen": 6173600, + "step": 3147 + }, + { + "epoch": 0.4172299536116634, + "grad_norm": 9.332684516906738, + "learning_rate": 4.76640885575501e-06, + "loss": 0.3414, + "num_input_tokens_seen": 6174880, + "step": 3148 + }, + { + "epoch": 0.41736249171636847, + "grad_norm": 11.6048583984375, + "learning_rate": 4.766262317884706e-06, + "loss": 0.1967, + "num_input_tokens_seen": 6177560, + "step": 3149 + }, + { + "epoch": 0.41749502982107356, + "grad_norm": 12.333416938781738, + "learning_rate": 4.7661157363193e-06, + "loss": 0.2254, + "num_input_tokens_seen": 6180184, + "step": 3150 + }, + { + "epoch": 0.41762756792577865, + "grad_norm": 0.060673609375953674, + "learning_rate": 4.76596911106162e-06, + "loss": 0.0004, + "num_input_tokens_seen": 6181848, + "step": 3151 + }, + { + "epoch": 0.41776010603048375, + "grad_norm": 0.2722627520561218, + "learning_rate": 4.765822442114493e-06, + "loss": 0.0013, + "num_input_tokens_seen": 6183592, + "step": 3152 + }, + { + "epoch": 0.41789264413518884, + "grad_norm": 25.811012268066406, + "learning_rate": 4.765675729480746e-06, + "loss": 0.9896, + "num_input_tokens_seen": 6185856, + "step": 3153 + }, + { + "epoch": 0.418025182239894, + "grad_norm": 13.918129920959473, + "learning_rate": 4.7655289731632075e-06, + "loss": 0.5266, + "num_input_tokens_seen": 6187336, + "step": 3154 + }, + { + "epoch": 0.4181577203445991, + "grad_norm": 10.792093276977539, + "learning_rate": 4.765382173164706e-06, + "loss": 0.1677, + "num_input_tokens_seen": 6189296, + "step": 3155 + }, + { + "epoch": 0.4182902584493042, + "grad_norm": 3.2576427459716797, + "learning_rate": 4.765235329488075e-06, + "loss": 0.0277, + "num_input_tokens_seen": 6191600, + "step": 3156 + }, + { + "epoch": 0.41842279655400927, + "grad_norm": 5.458369255065918, + "learning_rate": 4.765088442136144e-06, + "loss": 0.105, + "num_input_tokens_seen": 6193864, + "step": 3157 + }, + { + "epoch": 0.41855533465871436, + "grad_norm": 0.035416051745414734, + "learning_rate": 4.764941511111746e-06, + "loss": 0.0002, + "num_input_tokens_seen": 6195304, + "step": 3158 + }, + { + "epoch": 0.4186878727634195, + "grad_norm": 0.0703611969947815, + "learning_rate": 4.764794536417712e-06, + "loss": 0.0004, + "num_input_tokens_seen": 6197496, + "step": 3159 + }, + { + "epoch": 0.4188204108681246, + "grad_norm": 9.709760665893555, + "learning_rate": 4.764647518056877e-06, + "loss": 0.2388, + "num_input_tokens_seen": 6199016, + "step": 3160 + }, + { + "epoch": 0.4189529489728297, + "grad_norm": 9.391764640808105, + "learning_rate": 4.764500456032076e-06, + "loss": 0.3105, + "num_input_tokens_seen": 6200776, + "step": 3161 + }, + { + "epoch": 0.4190854870775348, + "grad_norm": 18.629287719726562, + "learning_rate": 4.764353350346144e-06, + "loss": 0.6329, + "num_input_tokens_seen": 6202984, + "step": 3162 + }, + { + "epoch": 0.4192180251822399, + "grad_norm": 7.715208530426025, + "learning_rate": 4.764206201001918e-06, + "loss": 0.2561, + "num_input_tokens_seen": 6204880, + "step": 3163 + }, + { + "epoch": 0.419350563286945, + "grad_norm": 13.882681846618652, + "learning_rate": 4.764059008002233e-06, + "loss": 0.5021, + "num_input_tokens_seen": 6206272, + "step": 3164 + }, + { + "epoch": 0.4194831013916501, + "grad_norm": 10.552160263061523, + "learning_rate": 4.7639117713499295e-06, + "loss": 0.2802, + "num_input_tokens_seen": 6208128, + "step": 3165 + }, + { + "epoch": 0.4196156394963552, + "grad_norm": 14.564270973205566, + "learning_rate": 4.763764491047845e-06, + "loss": 0.5994, + "num_input_tokens_seen": 6209688, + "step": 3166 + }, + { + "epoch": 0.4197481776010603, + "grad_norm": 6.692846775054932, + "learning_rate": 4.763617167098819e-06, + "loss": 0.168, + "num_input_tokens_seen": 6210992, + "step": 3167 + }, + { + "epoch": 0.4198807157057654, + "grad_norm": 16.475135803222656, + "learning_rate": 4.763469799505693e-06, + "loss": 0.5053, + "num_input_tokens_seen": 6213088, + "step": 3168 + }, + { + "epoch": 0.4200132538104705, + "grad_norm": 3.4039344787597656, + "learning_rate": 4.763322388271307e-06, + "loss": 0.0951, + "num_input_tokens_seen": 6214816, + "step": 3169 + }, + { + "epoch": 0.4201457919151756, + "grad_norm": 1.5329331159591675, + "learning_rate": 4.7631749333985044e-06, + "loss": 0.0099, + "num_input_tokens_seen": 6216768, + "step": 3170 + }, + { + "epoch": 0.42027833001988074, + "grad_norm": 0.1647091805934906, + "learning_rate": 4.7630274348901285e-06, + "loss": 0.0011, + "num_input_tokens_seen": 6218296, + "step": 3171 + }, + { + "epoch": 0.42041086812458583, + "grad_norm": 2.2457377910614014, + "learning_rate": 4.762879892749022e-06, + "loss": 0.0123, + "num_input_tokens_seen": 6220776, + "step": 3172 + }, + { + "epoch": 0.42054340622929093, + "grad_norm": 8.632234573364258, + "learning_rate": 4.762732306978029e-06, + "loss": 0.263, + "num_input_tokens_seen": 6223248, + "step": 3173 + }, + { + "epoch": 0.420675944333996, + "grad_norm": 12.974858283996582, + "learning_rate": 4.7625846775799975e-06, + "loss": 0.5419, + "num_input_tokens_seen": 6224992, + "step": 3174 + }, + { + "epoch": 0.4208084824387011, + "grad_norm": 0.2980729937553406, + "learning_rate": 4.7624370045577715e-06, + "loss": 0.002, + "num_input_tokens_seen": 6226560, + "step": 3175 + }, + { + "epoch": 0.4209410205434062, + "grad_norm": 0.8574631214141846, + "learning_rate": 4.7622892879142e-06, + "loss": 0.0055, + "num_input_tokens_seen": 6228216, + "step": 3176 + }, + { + "epoch": 0.42107355864811136, + "grad_norm": 10.34976577758789, + "learning_rate": 4.762141527652131e-06, + "loss": 0.2832, + "num_input_tokens_seen": 6230296, + "step": 3177 + }, + { + "epoch": 0.42120609675281645, + "grad_norm": 8.182145118713379, + "learning_rate": 4.761993723774411e-06, + "loss": 0.1288, + "num_input_tokens_seen": 6232512, + "step": 3178 + }, + { + "epoch": 0.42133863485752154, + "grad_norm": 0.34891024231910706, + "learning_rate": 4.761845876283893e-06, + "loss": 0.0023, + "num_input_tokens_seen": 6234168, + "step": 3179 + }, + { + "epoch": 0.42147117296222664, + "grad_norm": 2.2028005123138428, + "learning_rate": 4.761697985183426e-06, + "loss": 0.0143, + "num_input_tokens_seen": 6236160, + "step": 3180 + }, + { + "epoch": 0.42160371106693173, + "grad_norm": 0.16459161043167114, + "learning_rate": 4.761550050475861e-06, + "loss": 0.0011, + "num_input_tokens_seen": 6238520, + "step": 3181 + }, + { + "epoch": 0.4217362491716368, + "grad_norm": 6.952666282653809, + "learning_rate": 4.761402072164051e-06, + "loss": 0.0601, + "num_input_tokens_seen": 6240056, + "step": 3182 + }, + { + "epoch": 0.42186878727634197, + "grad_norm": 12.082425117492676, + "learning_rate": 4.76125405025085e-06, + "loss": 0.3815, + "num_input_tokens_seen": 6242728, + "step": 3183 + }, + { + "epoch": 0.42200132538104707, + "grad_norm": 9.790754318237305, + "learning_rate": 4.76110598473911e-06, + "loss": 0.1503, + "num_input_tokens_seen": 6245088, + "step": 3184 + }, + { + "epoch": 0.42213386348575216, + "grad_norm": 13.3594970703125, + "learning_rate": 4.760957875631686e-06, + "loss": 0.4013, + "num_input_tokens_seen": 6247008, + "step": 3185 + }, + { + "epoch": 0.42226640159045725, + "grad_norm": 12.206310272216797, + "learning_rate": 4.760809722931435e-06, + "loss": 0.3708, + "num_input_tokens_seen": 6248904, + "step": 3186 + }, + { + "epoch": 0.42239893969516235, + "grad_norm": 11.866032600402832, + "learning_rate": 4.760661526641214e-06, + "loss": 0.2464, + "num_input_tokens_seen": 6251328, + "step": 3187 + }, + { + "epoch": 0.42253147779986744, + "grad_norm": 16.07721519470215, + "learning_rate": 4.760513286763878e-06, + "loss": 0.4798, + "num_input_tokens_seen": 6252536, + "step": 3188 + }, + { + "epoch": 0.4226640159045726, + "grad_norm": 6.7011399269104, + "learning_rate": 4.760365003302286e-06, + "loss": 0.1763, + "num_input_tokens_seen": 6254176, + "step": 3189 + }, + { + "epoch": 0.4227965540092777, + "grad_norm": 10.597217559814453, + "learning_rate": 4.7602166762592985e-06, + "loss": 0.2457, + "num_input_tokens_seen": 6256208, + "step": 3190 + }, + { + "epoch": 0.4229290921139828, + "grad_norm": 8.765196800231934, + "learning_rate": 4.7600683056377735e-06, + "loss": 0.1289, + "num_input_tokens_seen": 6257488, + "step": 3191 + }, + { + "epoch": 0.42306163021868787, + "grad_norm": 10.604379653930664, + "learning_rate": 4.759919891440573e-06, + "loss": 0.213, + "num_input_tokens_seen": 6258736, + "step": 3192 + }, + { + "epoch": 0.42319416832339296, + "grad_norm": 5.606478691101074, + "learning_rate": 4.7597714336705585e-06, + "loss": 0.1711, + "num_input_tokens_seen": 6260632, + "step": 3193 + }, + { + "epoch": 0.42332670642809805, + "grad_norm": 24.88968276977539, + "learning_rate": 4.759622932330591e-06, + "loss": 0.5038, + "num_input_tokens_seen": 6262416, + "step": 3194 + }, + { + "epoch": 0.4234592445328032, + "grad_norm": 0.5721713304519653, + "learning_rate": 4.759474387423534e-06, + "loss": 0.0038, + "num_input_tokens_seen": 6264400, + "step": 3195 + }, + { + "epoch": 0.4235917826375083, + "grad_norm": 6.471712589263916, + "learning_rate": 4.759325798952253e-06, + "loss": 0.0827, + "num_input_tokens_seen": 6266072, + "step": 3196 + }, + { + "epoch": 0.4237243207422134, + "grad_norm": 4.871187686920166, + "learning_rate": 4.7591771669196115e-06, + "loss": 0.0416, + "num_input_tokens_seen": 6268096, + "step": 3197 + }, + { + "epoch": 0.4238568588469185, + "grad_norm": 8.076518058776855, + "learning_rate": 4.759028491328476e-06, + "loss": 0.3966, + "num_input_tokens_seen": 6269864, + "step": 3198 + }, + { + "epoch": 0.4239893969516236, + "grad_norm": 12.052094459533691, + "learning_rate": 4.7588797721817125e-06, + "loss": 0.2255, + "num_input_tokens_seen": 6273008, + "step": 3199 + }, + { + "epoch": 0.42412193505632867, + "grad_norm": 8.013740539550781, + "learning_rate": 4.758731009482189e-06, + "loss": 0.2477, + "num_input_tokens_seen": 6274832, + "step": 3200 + }, + { + "epoch": 0.4242544731610338, + "grad_norm": 1.2955780029296875, + "learning_rate": 4.758582203232773e-06, + "loss": 0.0079, + "num_input_tokens_seen": 6277168, + "step": 3201 + }, + { + "epoch": 0.4243870112657389, + "grad_norm": 6.372220039367676, + "learning_rate": 4.758433353436334e-06, + "loss": 0.1013, + "num_input_tokens_seen": 6278952, + "step": 3202 + }, + { + "epoch": 0.424519549370444, + "grad_norm": 7.447582721710205, + "learning_rate": 4.7582844600957425e-06, + "loss": 0.3166, + "num_input_tokens_seen": 6280856, + "step": 3203 + }, + { + "epoch": 0.4246520874751491, + "grad_norm": 3.587099552154541, + "learning_rate": 4.758135523213868e-06, + "loss": 0.0586, + "num_input_tokens_seen": 6284152, + "step": 3204 + }, + { + "epoch": 0.4247846255798542, + "grad_norm": 9.551051139831543, + "learning_rate": 4.757986542793583e-06, + "loss": 0.3202, + "num_input_tokens_seen": 6286176, + "step": 3205 + }, + { + "epoch": 0.4249171636845593, + "grad_norm": 10.828388214111328, + "learning_rate": 4.75783751883776e-06, + "loss": 0.2285, + "num_input_tokens_seen": 6288416, + "step": 3206 + }, + { + "epoch": 0.42504970178926443, + "grad_norm": 8.946134567260742, + "learning_rate": 4.757688451349271e-06, + "loss": 0.0646, + "num_input_tokens_seen": 6290424, + "step": 3207 + }, + { + "epoch": 0.4251822398939695, + "grad_norm": 10.437426567077637, + "learning_rate": 4.757539340330993e-06, + "loss": 0.313, + "num_input_tokens_seen": 6293568, + "step": 3208 + }, + { + "epoch": 0.4253147779986746, + "grad_norm": 14.829366683959961, + "learning_rate": 4.757390185785798e-06, + "loss": 0.3653, + "num_input_tokens_seen": 6295504, + "step": 3209 + }, + { + "epoch": 0.4254473161033797, + "grad_norm": 20.425113677978516, + "learning_rate": 4.7572409877165624e-06, + "loss": 0.6285, + "num_input_tokens_seen": 6298096, + "step": 3210 + }, + { + "epoch": 0.4255798542080848, + "grad_norm": 12.301413536071777, + "learning_rate": 4.757091746126165e-06, + "loss": 0.2366, + "num_input_tokens_seen": 6299744, + "step": 3211 + }, + { + "epoch": 0.4257123923127899, + "grad_norm": 9.431246757507324, + "learning_rate": 4.756942461017481e-06, + "loss": 0.1655, + "num_input_tokens_seen": 6300888, + "step": 3212 + }, + { + "epoch": 0.42584493041749505, + "grad_norm": 7.4078450202941895, + "learning_rate": 4.756793132393388e-06, + "loss": 0.131, + "num_input_tokens_seen": 6303872, + "step": 3213 + }, + { + "epoch": 0.42597746852220014, + "grad_norm": 0.44162753224372864, + "learning_rate": 4.7566437602567686e-06, + "loss": 0.0028, + "num_input_tokens_seen": 6305544, + "step": 3214 + }, + { + "epoch": 0.42611000662690524, + "grad_norm": 15.503522872924805, + "learning_rate": 4.7564943446105e-06, + "loss": 0.4489, + "num_input_tokens_seen": 6308016, + "step": 3215 + }, + { + "epoch": 0.42624254473161033, + "grad_norm": 8.623398780822754, + "learning_rate": 4.756344885457462e-06, + "loss": 0.1578, + "num_input_tokens_seen": 6310632, + "step": 3216 + }, + { + "epoch": 0.4263750828363154, + "grad_norm": 6.237339019775391, + "learning_rate": 4.75619538280054e-06, + "loss": 0.0663, + "num_input_tokens_seen": 6312368, + "step": 3217 + }, + { + "epoch": 0.42650762094102057, + "grad_norm": 25.49467658996582, + "learning_rate": 4.756045836642614e-06, + "loss": 0.6745, + "num_input_tokens_seen": 6314936, + "step": 3218 + }, + { + "epoch": 0.42664015904572566, + "grad_norm": 6.262709617614746, + "learning_rate": 4.755896246986568e-06, + "loss": 0.3051, + "num_input_tokens_seen": 6316848, + "step": 3219 + }, + { + "epoch": 0.42677269715043076, + "grad_norm": 9.886835098266602, + "learning_rate": 4.755746613835287e-06, + "loss": 0.4395, + "num_input_tokens_seen": 6318280, + "step": 3220 + }, + { + "epoch": 0.42690523525513585, + "grad_norm": 12.256113052368164, + "learning_rate": 4.755596937191653e-06, + "loss": 0.1916, + "num_input_tokens_seen": 6320352, + "step": 3221 + }, + { + "epoch": 0.42703777335984094, + "grad_norm": 16.476892471313477, + "learning_rate": 4.755447217058556e-06, + "loss": 0.4537, + "num_input_tokens_seen": 6322320, + "step": 3222 + }, + { + "epoch": 0.42717031146454604, + "grad_norm": 0.5555036067962646, + "learning_rate": 4.75529745343888e-06, + "loss": 0.0037, + "num_input_tokens_seen": 6323872, + "step": 3223 + }, + { + "epoch": 0.4273028495692512, + "grad_norm": 16.511619567871094, + "learning_rate": 4.755147646335513e-06, + "loss": 0.4184, + "num_input_tokens_seen": 6326064, + "step": 3224 + }, + { + "epoch": 0.4274353876739563, + "grad_norm": 3.4608192443847656, + "learning_rate": 4.754997795751344e-06, + "loss": 0.0298, + "num_input_tokens_seen": 6327024, + "step": 3225 + }, + { + "epoch": 0.4275679257786614, + "grad_norm": 15.203058242797852, + "learning_rate": 4.754847901689261e-06, + "loss": 0.3173, + "num_input_tokens_seen": 6328904, + "step": 3226 + }, + { + "epoch": 0.42770046388336647, + "grad_norm": 6.806552410125732, + "learning_rate": 4.754697964152155e-06, + "loss": 0.2833, + "num_input_tokens_seen": 6330512, + "step": 3227 + }, + { + "epoch": 0.42783300198807156, + "grad_norm": 0.29349565505981445, + "learning_rate": 4.754547983142918e-06, + "loss": 0.0019, + "num_input_tokens_seen": 6331664, + "step": 3228 + }, + { + "epoch": 0.42796554009277665, + "grad_norm": 12.76695728302002, + "learning_rate": 4.75439795866444e-06, + "loss": 0.4526, + "num_input_tokens_seen": 6333216, + "step": 3229 + }, + { + "epoch": 0.4280980781974818, + "grad_norm": 0.6151604056358337, + "learning_rate": 4.754247890719615e-06, + "loss": 0.0041, + "num_input_tokens_seen": 6334448, + "step": 3230 + }, + { + "epoch": 0.4282306163021869, + "grad_norm": 0.5471124649047852, + "learning_rate": 4.754097779311333e-06, + "loss": 0.0036, + "num_input_tokens_seen": 6335672, + "step": 3231 + }, + { + "epoch": 0.428363154406892, + "grad_norm": 8.136335372924805, + "learning_rate": 4.753947624442493e-06, + "loss": 0.1509, + "num_input_tokens_seen": 6338552, + "step": 3232 + }, + { + "epoch": 0.4284956925115971, + "grad_norm": 0.37853461503982544, + "learning_rate": 4.753797426115987e-06, + "loss": 0.0025, + "num_input_tokens_seen": 6341144, + "step": 3233 + }, + { + "epoch": 0.4286282306163022, + "grad_norm": 11.111507415771484, + "learning_rate": 4.7536471843347124e-06, + "loss": 0.4224, + "num_input_tokens_seen": 6344120, + "step": 3234 + }, + { + "epoch": 0.42876076872100727, + "grad_norm": 1.112655520439148, + "learning_rate": 4.753496899101565e-06, + "loss": 0.0015, + "num_input_tokens_seen": 6345384, + "step": 3235 + }, + { + "epoch": 0.4288933068257124, + "grad_norm": 6.592243671417236, + "learning_rate": 4.753346570419442e-06, + "loss": 0.1619, + "num_input_tokens_seen": 6346680, + "step": 3236 + }, + { + "epoch": 0.4290258449304175, + "grad_norm": 0.24551311135292053, + "learning_rate": 4.753196198291244e-06, + "loss": 0.0015, + "num_input_tokens_seen": 6348336, + "step": 3237 + }, + { + "epoch": 0.4291583830351226, + "grad_norm": 3.84431791305542, + "learning_rate": 4.753045782719868e-06, + "loss": 0.1027, + "num_input_tokens_seen": 6351560, + "step": 3238 + }, + { + "epoch": 0.4292909211398277, + "grad_norm": 1.7530347108840942, + "learning_rate": 4.752895323708216e-06, + "loss": 0.0085, + "num_input_tokens_seen": 6353784, + "step": 3239 + }, + { + "epoch": 0.4294234592445328, + "grad_norm": 1.575809121131897, + "learning_rate": 4.752744821259188e-06, + "loss": 0.0139, + "num_input_tokens_seen": 6356016, + "step": 3240 + }, + { + "epoch": 0.4295559973492379, + "grad_norm": 10.949199676513672, + "learning_rate": 4.752594275375684e-06, + "loss": 0.3173, + "num_input_tokens_seen": 6359160, + "step": 3241 + }, + { + "epoch": 0.42968853545394303, + "grad_norm": 0.23050843179225922, + "learning_rate": 4.752443686060609e-06, + "loss": 0.0014, + "num_input_tokens_seen": 6360584, + "step": 3242 + }, + { + "epoch": 0.4298210735586481, + "grad_norm": 12.696118354797363, + "learning_rate": 4.752293053316867e-06, + "loss": 0.2938, + "num_input_tokens_seen": 6363496, + "step": 3243 + }, + { + "epoch": 0.4299536116633532, + "grad_norm": 7.891383171081543, + "learning_rate": 4.7521423771473605e-06, + "loss": 0.3196, + "num_input_tokens_seen": 6365776, + "step": 3244 + }, + { + "epoch": 0.4300861497680583, + "grad_norm": 6.171553134918213, + "learning_rate": 4.751991657554995e-06, + "loss": 0.0778, + "num_input_tokens_seen": 6367384, + "step": 3245 + }, + { + "epoch": 0.4302186878727634, + "grad_norm": 0.1137521043419838, + "learning_rate": 4.751840894542677e-06, + "loss": 0.0007, + "num_input_tokens_seen": 6368664, + "step": 3246 + }, + { + "epoch": 0.4303512259774685, + "grad_norm": 0.07953488826751709, + "learning_rate": 4.751690088113313e-06, + "loss": 0.0005, + "num_input_tokens_seen": 6369592, + "step": 3247 + }, + { + "epoch": 0.43048376408217365, + "grad_norm": 6.827754497528076, + "learning_rate": 4.751539238269811e-06, + "loss": 0.1433, + "num_input_tokens_seen": 6371728, + "step": 3248 + }, + { + "epoch": 0.43061630218687874, + "grad_norm": 9.533121109008789, + "learning_rate": 4.751388345015079e-06, + "loss": 0.3224, + "num_input_tokens_seen": 6373808, + "step": 3249 + }, + { + "epoch": 0.43074884029158383, + "grad_norm": 0.2911543548107147, + "learning_rate": 4.751237408352027e-06, + "loss": 0.0018, + "num_input_tokens_seen": 6375160, + "step": 3250 + }, + { + "epoch": 0.4308813783962889, + "grad_norm": 6.170498847961426, + "learning_rate": 4.751086428283564e-06, + "loss": 0.218, + "num_input_tokens_seen": 6376944, + "step": 3251 + }, + { + "epoch": 0.431013916500994, + "grad_norm": 5.136447906494141, + "learning_rate": 4.7509354048126025e-06, + "loss": 0.1243, + "num_input_tokens_seen": 6378808, + "step": 3252 + }, + { + "epoch": 0.4311464546056991, + "grad_norm": 0.12435410171747208, + "learning_rate": 4.750784337942053e-06, + "loss": 0.0007, + "num_input_tokens_seen": 6380480, + "step": 3253 + }, + { + "epoch": 0.43127899271040426, + "grad_norm": 10.81765365600586, + "learning_rate": 4.750633227674829e-06, + "loss": 0.2345, + "num_input_tokens_seen": 6382608, + "step": 3254 + }, + { + "epoch": 0.43141153081510936, + "grad_norm": 9.513566017150879, + "learning_rate": 4.7504820740138436e-06, + "loss": 0.1001, + "num_input_tokens_seen": 6385504, + "step": 3255 + }, + { + "epoch": 0.43154406891981445, + "grad_norm": 11.01030445098877, + "learning_rate": 4.750330876962012e-06, + "loss": 0.3387, + "num_input_tokens_seen": 6387080, + "step": 3256 + }, + { + "epoch": 0.43167660702451954, + "grad_norm": 0.14989186823368073, + "learning_rate": 4.750179636522248e-06, + "loss": 0.0009, + "num_input_tokens_seen": 6388440, + "step": 3257 + }, + { + "epoch": 0.43180914512922464, + "grad_norm": 0.23726676404476166, + "learning_rate": 4.750028352697467e-06, + "loss": 0.0014, + "num_input_tokens_seen": 6390432, + "step": 3258 + }, + { + "epoch": 0.43194168323392973, + "grad_norm": 2.9729433059692383, + "learning_rate": 4.749877025490589e-06, + "loss": 0.0439, + "num_input_tokens_seen": 6392192, + "step": 3259 + }, + { + "epoch": 0.4320742213386349, + "grad_norm": 2.461897850036621, + "learning_rate": 4.749725654904529e-06, + "loss": 0.0165, + "num_input_tokens_seen": 6393368, + "step": 3260 + }, + { + "epoch": 0.43220675944333997, + "grad_norm": 8.683965682983398, + "learning_rate": 4.749574240942206e-06, + "loss": 0.2531, + "num_input_tokens_seen": 6395120, + "step": 3261 + }, + { + "epoch": 0.43233929754804507, + "grad_norm": 6.3044538497924805, + "learning_rate": 4.749422783606541e-06, + "loss": 0.1591, + "num_input_tokens_seen": 6397496, + "step": 3262 + }, + { + "epoch": 0.43247183565275016, + "grad_norm": 16.04700469970703, + "learning_rate": 4.749271282900451e-06, + "loss": 0.4915, + "num_input_tokens_seen": 6399848, + "step": 3263 + }, + { + "epoch": 0.43260437375745525, + "grad_norm": 15.745956420898438, + "learning_rate": 4.749119738826862e-06, + "loss": 0.5069, + "num_input_tokens_seen": 6402104, + "step": 3264 + }, + { + "epoch": 0.43273691186216034, + "grad_norm": 0.0707186609506607, + "learning_rate": 4.748968151388691e-06, + "loss": 0.0004, + "num_input_tokens_seen": 6403656, + "step": 3265 + }, + { + "epoch": 0.4328694499668655, + "grad_norm": 0.04840201884508133, + "learning_rate": 4.748816520588862e-06, + "loss": 0.0003, + "num_input_tokens_seen": 6404848, + "step": 3266 + }, + { + "epoch": 0.4330019880715706, + "grad_norm": 0.05223044753074646, + "learning_rate": 4.7486648464303e-06, + "loss": 0.0003, + "num_input_tokens_seen": 6406208, + "step": 3267 + }, + { + "epoch": 0.4331345261762757, + "grad_norm": 3.870861530303955, + "learning_rate": 4.748513128915928e-06, + "loss": 0.0494, + "num_input_tokens_seen": 6409528, + "step": 3268 + }, + { + "epoch": 0.4332670642809808, + "grad_norm": 0.05873791500926018, + "learning_rate": 4.748361368048672e-06, + "loss": 0.0004, + "num_input_tokens_seen": 6410960, + "step": 3269 + }, + { + "epoch": 0.43339960238568587, + "grad_norm": 6.463876247406006, + "learning_rate": 4.7482095638314575e-06, + "loss": 0.079, + "num_input_tokens_seen": 6412696, + "step": 3270 + }, + { + "epoch": 0.43353214049039096, + "grad_norm": 8.980402946472168, + "learning_rate": 4.748057716267212e-06, + "loss": 0.3456, + "num_input_tokens_seen": 6414448, + "step": 3271 + }, + { + "epoch": 0.4336646785950961, + "grad_norm": 0.3042178452014923, + "learning_rate": 4.747905825358863e-06, + "loss": 0.0017, + "num_input_tokens_seen": 6416512, + "step": 3272 + }, + { + "epoch": 0.4337972166998012, + "grad_norm": 0.4682639539241791, + "learning_rate": 4.747753891109338e-06, + "loss": 0.0026, + "num_input_tokens_seen": 6417952, + "step": 3273 + }, + { + "epoch": 0.4339297548045063, + "grad_norm": 11.877985000610352, + "learning_rate": 4.747601913521569e-06, + "loss": 0.1636, + "num_input_tokens_seen": 6419600, + "step": 3274 + }, + { + "epoch": 0.4340622929092114, + "grad_norm": 14.691099166870117, + "learning_rate": 4.747449892598483e-06, + "loss": 0.3116, + "num_input_tokens_seen": 6421400, + "step": 3275 + }, + { + "epoch": 0.4341948310139165, + "grad_norm": 18.678707122802734, + "learning_rate": 4.747297828343012e-06, + "loss": 0.4132, + "num_input_tokens_seen": 6422864, + "step": 3276 + }, + { + "epoch": 0.43432736911862163, + "grad_norm": 0.23068086802959442, + "learning_rate": 4.747145720758091e-06, + "loss": 0.0012, + "num_input_tokens_seen": 6425232, + "step": 3277 + }, + { + "epoch": 0.4344599072233267, + "grad_norm": 9.23740291595459, + "learning_rate": 4.746993569846649e-06, + "loss": 0.4506, + "num_input_tokens_seen": 6427296, + "step": 3278 + }, + { + "epoch": 0.4345924453280318, + "grad_norm": 10.200162887573242, + "learning_rate": 4.74684137561162e-06, + "loss": 0.3438, + "num_input_tokens_seen": 6429952, + "step": 3279 + }, + { + "epoch": 0.4347249834327369, + "grad_norm": 0.0796721950173378, + "learning_rate": 4.746689138055941e-06, + "loss": 0.0005, + "num_input_tokens_seen": 6431920, + "step": 3280 + }, + { + "epoch": 0.434857521537442, + "grad_norm": 0.1400357186794281, + "learning_rate": 4.7465368571825445e-06, + "loss": 0.0008, + "num_input_tokens_seen": 6433528, + "step": 3281 + }, + { + "epoch": 0.4349900596421471, + "grad_norm": 6.827568531036377, + "learning_rate": 4.746384532994368e-06, + "loss": 0.2183, + "num_input_tokens_seen": 6435192, + "step": 3282 + }, + { + "epoch": 0.43512259774685225, + "grad_norm": 18.18243980407715, + "learning_rate": 4.746232165494347e-06, + "loss": 0.6414, + "num_input_tokens_seen": 6437272, + "step": 3283 + }, + { + "epoch": 0.43525513585155734, + "grad_norm": 15.806026458740234, + "learning_rate": 4.746079754685421e-06, + "loss": 0.4384, + "num_input_tokens_seen": 6439384, + "step": 3284 + }, + { + "epoch": 0.43538767395626243, + "grad_norm": 8.984960556030273, + "learning_rate": 4.7459273005705285e-06, + "loss": 0.5182, + "num_input_tokens_seen": 6442192, + "step": 3285 + }, + { + "epoch": 0.4355202120609675, + "grad_norm": 11.54957103729248, + "learning_rate": 4.745774803152607e-06, + "loss": 0.4304, + "num_input_tokens_seen": 6444664, + "step": 3286 + }, + { + "epoch": 0.4356527501656726, + "grad_norm": 11.394903182983398, + "learning_rate": 4.745622262434599e-06, + "loss": 0.1527, + "num_input_tokens_seen": 6446528, + "step": 3287 + }, + { + "epoch": 0.4357852882703777, + "grad_norm": 16.90025520324707, + "learning_rate": 4.745469678419444e-06, + "loss": 0.4675, + "num_input_tokens_seen": 6449080, + "step": 3288 + }, + { + "epoch": 0.43591782637508286, + "grad_norm": 0.28165432810783386, + "learning_rate": 4.745317051110084e-06, + "loss": 0.0017, + "num_input_tokens_seen": 6451240, + "step": 3289 + }, + { + "epoch": 0.43605036447978796, + "grad_norm": 10.608962059020996, + "learning_rate": 4.745164380509464e-06, + "loss": 0.3, + "num_input_tokens_seen": 6453480, + "step": 3290 + }, + { + "epoch": 0.43618290258449305, + "grad_norm": 13.799406051635742, + "learning_rate": 4.7450116666205245e-06, + "loss": 0.499, + "num_input_tokens_seen": 6454968, + "step": 3291 + }, + { + "epoch": 0.43631544068919814, + "grad_norm": 10.929240226745605, + "learning_rate": 4.744858909446212e-06, + "loss": 0.3673, + "num_input_tokens_seen": 6456592, + "step": 3292 + }, + { + "epoch": 0.43644797879390324, + "grad_norm": 15.704523086547852, + "learning_rate": 4.74470610898947e-06, + "loss": 0.3869, + "num_input_tokens_seen": 6458568, + "step": 3293 + }, + { + "epoch": 0.43658051689860833, + "grad_norm": 0.1996748149394989, + "learning_rate": 4.744553265253246e-06, + "loss": 0.0012, + "num_input_tokens_seen": 6460048, + "step": 3294 + }, + { + "epoch": 0.4367130550033135, + "grad_norm": 16.50066566467285, + "learning_rate": 4.744400378240488e-06, + "loss": 0.223, + "num_input_tokens_seen": 6461576, + "step": 3295 + }, + { + "epoch": 0.43684559310801857, + "grad_norm": 10.287540435791016, + "learning_rate": 4.744247447954141e-06, + "loss": 0.1951, + "num_input_tokens_seen": 6463384, + "step": 3296 + }, + { + "epoch": 0.43697813121272366, + "grad_norm": 9.577747344970703, + "learning_rate": 4.744094474397156e-06, + "loss": 0.2353, + "num_input_tokens_seen": 6465168, + "step": 3297 + }, + { + "epoch": 0.43711066931742876, + "grad_norm": 3.065082550048828, + "learning_rate": 4.74394145757248e-06, + "loss": 0.0347, + "num_input_tokens_seen": 6466912, + "step": 3298 + }, + { + "epoch": 0.43724320742213385, + "grad_norm": 0.2064424306154251, + "learning_rate": 4.743788397483066e-06, + "loss": 0.0013, + "num_input_tokens_seen": 6468672, + "step": 3299 + }, + { + "epoch": 0.43737574552683894, + "grad_norm": 5.417873382568359, + "learning_rate": 4.743635294131863e-06, + "loss": 0.0564, + "num_input_tokens_seen": 6470880, + "step": 3300 + }, + { + "epoch": 0.4375082836315441, + "grad_norm": 14.123854637145996, + "learning_rate": 4.743482147521824e-06, + "loss": 0.494, + "num_input_tokens_seen": 6472888, + "step": 3301 + }, + { + "epoch": 0.4376408217362492, + "grad_norm": 10.059908866882324, + "learning_rate": 4.7433289576559015e-06, + "loss": 0.3102, + "num_input_tokens_seen": 6475144, + "step": 3302 + }, + { + "epoch": 0.4377733598409543, + "grad_norm": 12.827664375305176, + "learning_rate": 4.743175724537049e-06, + "loss": 0.1994, + "num_input_tokens_seen": 6476920, + "step": 3303 + }, + { + "epoch": 0.4379058979456594, + "grad_norm": 4.525793075561523, + "learning_rate": 4.743022448168222e-06, + "loss": 0.0608, + "num_input_tokens_seen": 6478640, + "step": 3304 + }, + { + "epoch": 0.43803843605036447, + "grad_norm": 0.5189858675003052, + "learning_rate": 4.742869128552374e-06, + "loss": 0.0034, + "num_input_tokens_seen": 6480472, + "step": 3305 + }, + { + "epoch": 0.43817097415506956, + "grad_norm": 5.578190326690674, + "learning_rate": 4.742715765692462e-06, + "loss": 0.0746, + "num_input_tokens_seen": 6482112, + "step": 3306 + }, + { + "epoch": 0.4383035122597747, + "grad_norm": 10.450170516967773, + "learning_rate": 4.742562359591442e-06, + "loss": 0.28, + "num_input_tokens_seen": 6483976, + "step": 3307 + }, + { + "epoch": 0.4384360503644798, + "grad_norm": 10.942258834838867, + "learning_rate": 4.742408910252274e-06, + "loss": 0.3709, + "num_input_tokens_seen": 6486552, + "step": 3308 + }, + { + "epoch": 0.4385685884691849, + "grad_norm": 1.9542473554611206, + "learning_rate": 4.742255417677914e-06, + "loss": 0.0199, + "num_input_tokens_seen": 6487920, + "step": 3309 + }, + { + "epoch": 0.43870112657389, + "grad_norm": 15.771306991577148, + "learning_rate": 4.7421018818713236e-06, + "loss": 0.4687, + "num_input_tokens_seen": 6490128, + "step": 3310 + }, + { + "epoch": 0.4388336646785951, + "grad_norm": 23.088443756103516, + "learning_rate": 4.741948302835462e-06, + "loss": 0.4987, + "num_input_tokens_seen": 6492408, + "step": 3311 + }, + { + "epoch": 0.4389662027833002, + "grad_norm": 0.2232021540403366, + "learning_rate": 4.741794680573291e-06, + "loss": 0.0014, + "num_input_tokens_seen": 6493456, + "step": 3312 + }, + { + "epoch": 0.4390987408880053, + "grad_norm": 0.5259813070297241, + "learning_rate": 4.741641015087771e-06, + "loss": 0.0033, + "num_input_tokens_seen": 6495680, + "step": 3313 + }, + { + "epoch": 0.4392312789927104, + "grad_norm": 6.458506107330322, + "learning_rate": 4.741487306381866e-06, + "loss": 0.1363, + "num_input_tokens_seen": 6498016, + "step": 3314 + }, + { + "epoch": 0.4393638170974155, + "grad_norm": 0.5402525067329407, + "learning_rate": 4.74133355445854e-06, + "loss": 0.0032, + "num_input_tokens_seen": 6499560, + "step": 3315 + }, + { + "epoch": 0.4394963552021206, + "grad_norm": 0.25143033266067505, + "learning_rate": 4.7411797593207565e-06, + "loss": 0.0016, + "num_input_tokens_seen": 6500880, + "step": 3316 + }, + { + "epoch": 0.4396288933068257, + "grad_norm": 23.83981704711914, + "learning_rate": 4.741025920971481e-06, + "loss": 0.3849, + "num_input_tokens_seen": 6502696, + "step": 3317 + }, + { + "epoch": 0.4397614314115308, + "grad_norm": 13.371020317077637, + "learning_rate": 4.7408720394136796e-06, + "loss": 0.3305, + "num_input_tokens_seen": 6504280, + "step": 3318 + }, + { + "epoch": 0.43989396951623594, + "grad_norm": 20.570331573486328, + "learning_rate": 4.74071811465032e-06, + "loss": 0.6006, + "num_input_tokens_seen": 6505568, + "step": 3319 + }, + { + "epoch": 0.44002650762094103, + "grad_norm": 19.411767959594727, + "learning_rate": 4.740564146684369e-06, + "loss": 0.6488, + "num_input_tokens_seen": 6507576, + "step": 3320 + }, + { + "epoch": 0.4401590457256461, + "grad_norm": 6.28535270690918, + "learning_rate": 4.740410135518795e-06, + "loss": 0.0459, + "num_input_tokens_seen": 6510264, + "step": 3321 + }, + { + "epoch": 0.4402915838303512, + "grad_norm": 13.772862434387207, + "learning_rate": 4.740256081156569e-06, + "loss": 0.4754, + "num_input_tokens_seen": 6512160, + "step": 3322 + }, + { + "epoch": 0.4404241219350563, + "grad_norm": 9.0480318069458, + "learning_rate": 4.74010198360066e-06, + "loss": 0.0763, + "num_input_tokens_seen": 6513736, + "step": 3323 + }, + { + "epoch": 0.4405566600397614, + "grad_norm": 9.289855003356934, + "learning_rate": 4.7399478428540394e-06, + "loss": 0.13, + "num_input_tokens_seen": 6515904, + "step": 3324 + }, + { + "epoch": 0.44068919814446655, + "grad_norm": 13.83438491821289, + "learning_rate": 4.739793658919678e-06, + "loss": 0.2975, + "num_input_tokens_seen": 6518424, + "step": 3325 + }, + { + "epoch": 0.44082173624917165, + "grad_norm": 0.20669658482074738, + "learning_rate": 4.739639431800551e-06, + "loss": 0.0013, + "num_input_tokens_seen": 6520480, + "step": 3326 + }, + { + "epoch": 0.44095427435387674, + "grad_norm": 14.202149391174316, + "learning_rate": 4.739485161499631e-06, + "loss": 0.4193, + "num_input_tokens_seen": 6522944, + "step": 3327 + }, + { + "epoch": 0.44108681245858183, + "grad_norm": 7.478730201721191, + "learning_rate": 4.7393308480198915e-06, + "loss": 0.0923, + "num_input_tokens_seen": 6525256, + "step": 3328 + }, + { + "epoch": 0.4412193505632869, + "grad_norm": 0.19464318454265594, + "learning_rate": 4.739176491364308e-06, + "loss": 0.0012, + "num_input_tokens_seen": 6526712, + "step": 3329 + }, + { + "epoch": 0.441351888667992, + "grad_norm": 20.973407745361328, + "learning_rate": 4.739022091535857e-06, + "loss": 0.6609, + "num_input_tokens_seen": 6528440, + "step": 3330 + }, + { + "epoch": 0.44148442677269717, + "grad_norm": 3.9169130325317383, + "learning_rate": 4.738867648537516e-06, + "loss": 0.0256, + "num_input_tokens_seen": 6529848, + "step": 3331 + }, + { + "epoch": 0.44161696487740226, + "grad_norm": 0.2776479125022888, + "learning_rate": 4.738713162372263e-06, + "loss": 0.0017, + "num_input_tokens_seen": 6531760, + "step": 3332 + }, + { + "epoch": 0.44174950298210736, + "grad_norm": 6.610439300537109, + "learning_rate": 4.738558633043074e-06, + "loss": 0.1544, + "num_input_tokens_seen": 6533928, + "step": 3333 + }, + { + "epoch": 0.44188204108681245, + "grad_norm": 0.1773267686367035, + "learning_rate": 4.738404060552932e-06, + "loss": 0.0011, + "num_input_tokens_seen": 6535904, + "step": 3334 + }, + { + "epoch": 0.44201457919151754, + "grad_norm": 0.15859174728393555, + "learning_rate": 4.738249444904814e-06, + "loss": 0.001, + "num_input_tokens_seen": 6538584, + "step": 3335 + }, + { + "epoch": 0.4421471172962227, + "grad_norm": 5.876689434051514, + "learning_rate": 4.738094786101704e-06, + "loss": 0.1528, + "num_input_tokens_seen": 6540688, + "step": 3336 + }, + { + "epoch": 0.4422796554009278, + "grad_norm": 10.820859909057617, + "learning_rate": 4.737940084146582e-06, + "loss": 0.1789, + "num_input_tokens_seen": 6541992, + "step": 3337 + }, + { + "epoch": 0.4424121935056329, + "grad_norm": 12.894454002380371, + "learning_rate": 4.737785339042431e-06, + "loss": 0.7197, + "num_input_tokens_seen": 6543920, + "step": 3338 + }, + { + "epoch": 0.44254473161033797, + "grad_norm": 9.09660816192627, + "learning_rate": 4.737630550792235e-06, + "loss": 0.1178, + "num_input_tokens_seen": 6545672, + "step": 3339 + }, + { + "epoch": 0.44267726971504306, + "grad_norm": 1.8006818294525146, + "learning_rate": 4.7374757193989784e-06, + "loss": 0.0264, + "num_input_tokens_seen": 6547632, + "step": 3340 + }, + { + "epoch": 0.44280980781974816, + "grad_norm": 20.160762786865234, + "learning_rate": 4.737320844865646e-06, + "loss": 0.4105, + "num_input_tokens_seen": 6549688, + "step": 3341 + }, + { + "epoch": 0.4429423459244533, + "grad_norm": 9.02538013458252, + "learning_rate": 4.737165927195225e-06, + "loss": 0.2323, + "num_input_tokens_seen": 6551856, + "step": 3342 + }, + { + "epoch": 0.4430748840291584, + "grad_norm": 7.381674289703369, + "learning_rate": 4.7370109663907015e-06, + "loss": 0.262, + "num_input_tokens_seen": 6553832, + "step": 3343 + }, + { + "epoch": 0.4432074221338635, + "grad_norm": 18.117494583129883, + "learning_rate": 4.736855962455062e-06, + "loss": 0.2754, + "num_input_tokens_seen": 6556032, + "step": 3344 + }, + { + "epoch": 0.4433399602385686, + "grad_norm": 10.242547988891602, + "learning_rate": 4.736700915391298e-06, + "loss": 0.373, + "num_input_tokens_seen": 6558088, + "step": 3345 + }, + { + "epoch": 0.4434724983432737, + "grad_norm": 15.688780784606934, + "learning_rate": 4.736545825202397e-06, + "loss": 0.422, + "num_input_tokens_seen": 6560072, + "step": 3346 + }, + { + "epoch": 0.4436050364479788, + "grad_norm": 6.993330955505371, + "learning_rate": 4.736390691891349e-06, + "loss": 0.3007, + "num_input_tokens_seen": 6562432, + "step": 3347 + }, + { + "epoch": 0.4437375745526839, + "grad_norm": 0.6917096376419067, + "learning_rate": 4.736235515461146e-06, + "loss": 0.0044, + "num_input_tokens_seen": 6565064, + "step": 3348 + }, + { + "epoch": 0.443870112657389, + "grad_norm": 1.7357581853866577, + "learning_rate": 4.73608029591478e-06, + "loss": 0.0101, + "num_input_tokens_seen": 6566400, + "step": 3349 + }, + { + "epoch": 0.4440026507620941, + "grad_norm": 0.24806712567806244, + "learning_rate": 4.735925033255243e-06, + "loss": 0.0016, + "num_input_tokens_seen": 6567648, + "step": 3350 + }, + { + "epoch": 0.4441351888667992, + "grad_norm": 0.3699946999549866, + "learning_rate": 4.735769727485528e-06, + "loss": 0.0024, + "num_input_tokens_seen": 6570320, + "step": 3351 + }, + { + "epoch": 0.4442677269715043, + "grad_norm": 14.791948318481445, + "learning_rate": 4.735614378608632e-06, + "loss": 0.2891, + "num_input_tokens_seen": 6572320, + "step": 3352 + }, + { + "epoch": 0.4444002650762094, + "grad_norm": 0.1550486981868744, + "learning_rate": 4.735458986627547e-06, + "loss": 0.001, + "num_input_tokens_seen": 6573680, + "step": 3353 + }, + { + "epoch": 0.44453280318091454, + "grad_norm": 10.05436897277832, + "learning_rate": 4.735303551545272e-06, + "loss": 0.4996, + "num_input_tokens_seen": 6575504, + "step": 3354 + }, + { + "epoch": 0.44466534128561963, + "grad_norm": 0.18503151834011078, + "learning_rate": 4.735148073364801e-06, + "loss": 0.0012, + "num_input_tokens_seen": 6576600, + "step": 3355 + }, + { + "epoch": 0.4447978793903247, + "grad_norm": 13.025964736938477, + "learning_rate": 4.734992552089134e-06, + "loss": 0.2231, + "num_input_tokens_seen": 6577968, + "step": 3356 + }, + { + "epoch": 0.4449304174950298, + "grad_norm": 0.11442652344703674, + "learning_rate": 4.7348369877212684e-06, + "loss": 0.0007, + "num_input_tokens_seen": 6578992, + "step": 3357 + }, + { + "epoch": 0.4450629555997349, + "grad_norm": 9.061229705810547, + "learning_rate": 4.734681380264204e-06, + "loss": 0.2224, + "num_input_tokens_seen": 6580544, + "step": 3358 + }, + { + "epoch": 0.44519549370444, + "grad_norm": 2.76716685295105, + "learning_rate": 4.734525729720941e-06, + "loss": 0.007, + "num_input_tokens_seen": 6582680, + "step": 3359 + }, + { + "epoch": 0.44532803180914515, + "grad_norm": 6.341799259185791, + "learning_rate": 4.734370036094481e-06, + "loss": 0.1583, + "num_input_tokens_seen": 6585376, + "step": 3360 + }, + { + "epoch": 0.44546056991385025, + "grad_norm": 14.088835716247559, + "learning_rate": 4.734214299387824e-06, + "loss": 0.4938, + "num_input_tokens_seen": 6587448, + "step": 3361 + }, + { + "epoch": 0.44559310801855534, + "grad_norm": 11.763378143310547, + "learning_rate": 4.734058519603975e-06, + "loss": 0.413, + "num_input_tokens_seen": 6588904, + "step": 3362 + }, + { + "epoch": 0.44572564612326043, + "grad_norm": 0.09960298240184784, + "learning_rate": 4.733902696745936e-06, + "loss": 0.0006, + "num_input_tokens_seen": 6590640, + "step": 3363 + }, + { + "epoch": 0.4458581842279655, + "grad_norm": 9.904974937438965, + "learning_rate": 4.7337468308167125e-06, + "loss": 0.1519, + "num_input_tokens_seen": 6591872, + "step": 3364 + }, + { + "epoch": 0.4459907223326706, + "grad_norm": 0.8243458271026611, + "learning_rate": 4.733590921819309e-06, + "loss": 0.0043, + "num_input_tokens_seen": 6593768, + "step": 3365 + }, + { + "epoch": 0.44612326043737577, + "grad_norm": 14.89431095123291, + "learning_rate": 4.733434969756732e-06, + "loss": 0.2022, + "num_input_tokens_seen": 6595896, + "step": 3366 + }, + { + "epoch": 0.44625579854208086, + "grad_norm": 0.22572316229343414, + "learning_rate": 4.733278974631987e-06, + "loss": 0.0014, + "num_input_tokens_seen": 6597424, + "step": 3367 + }, + { + "epoch": 0.44638833664678595, + "grad_norm": 8.510120391845703, + "learning_rate": 4.733122936448083e-06, + "loss": 0.3085, + "num_input_tokens_seen": 6599168, + "step": 3368 + }, + { + "epoch": 0.44652087475149105, + "grad_norm": 13.875428199768066, + "learning_rate": 4.732966855208029e-06, + "loss": 0.4491, + "num_input_tokens_seen": 6600944, + "step": 3369 + }, + { + "epoch": 0.44665341285619614, + "grad_norm": 21.13619041442871, + "learning_rate": 4.732810730914832e-06, + "loss": 0.7085, + "num_input_tokens_seen": 6602488, + "step": 3370 + }, + { + "epoch": 0.44678595096090123, + "grad_norm": 0.1032075509428978, + "learning_rate": 4.732654563571505e-06, + "loss": 0.0006, + "num_input_tokens_seen": 6603560, + "step": 3371 + }, + { + "epoch": 0.4469184890656064, + "grad_norm": 0.12376049906015396, + "learning_rate": 4.732498353181058e-06, + "loss": 0.0008, + "num_input_tokens_seen": 6604840, + "step": 3372 + }, + { + "epoch": 0.4470510271703115, + "grad_norm": 15.353861808776855, + "learning_rate": 4.732342099746502e-06, + "loss": 0.5511, + "num_input_tokens_seen": 6607008, + "step": 3373 + }, + { + "epoch": 0.44718356527501657, + "grad_norm": 0.4640714228153229, + "learning_rate": 4.73218580327085e-06, + "loss": 0.0027, + "num_input_tokens_seen": 6608536, + "step": 3374 + }, + { + "epoch": 0.44731610337972166, + "grad_norm": 11.818785667419434, + "learning_rate": 4.732029463757117e-06, + "loss": 0.3105, + "num_input_tokens_seen": 6610776, + "step": 3375 + }, + { + "epoch": 0.44744864148442676, + "grad_norm": 0.12626540660858154, + "learning_rate": 4.7318730812083155e-06, + "loss": 0.0008, + "num_input_tokens_seen": 6612176, + "step": 3376 + }, + { + "epoch": 0.44758117958913185, + "grad_norm": 5.480452537536621, + "learning_rate": 4.731716655627461e-06, + "loss": 0.1269, + "num_input_tokens_seen": 6613912, + "step": 3377 + }, + { + "epoch": 0.447713717693837, + "grad_norm": 2.331057548522949, + "learning_rate": 4.731560187017569e-06, + "loss": 0.0196, + "num_input_tokens_seen": 6615616, + "step": 3378 + }, + { + "epoch": 0.4478462557985421, + "grad_norm": 2.400275230407715, + "learning_rate": 4.7314036753816595e-06, + "loss": 0.016, + "num_input_tokens_seen": 6617736, + "step": 3379 + }, + { + "epoch": 0.4479787939032472, + "grad_norm": 10.343480110168457, + "learning_rate": 4.7312471207227465e-06, + "loss": 0.347, + "num_input_tokens_seen": 6619416, + "step": 3380 + }, + { + "epoch": 0.4481113320079523, + "grad_norm": 8.320096015930176, + "learning_rate": 4.73109052304385e-06, + "loss": 0.2576, + "num_input_tokens_seen": 6621416, + "step": 3381 + }, + { + "epoch": 0.44824387011265737, + "grad_norm": 0.044903963804244995, + "learning_rate": 4.730933882347988e-06, + "loss": 0.0003, + "num_input_tokens_seen": 6622648, + "step": 3382 + }, + { + "epoch": 0.44837640821736247, + "grad_norm": 0.10990428179502487, + "learning_rate": 4.730777198638183e-06, + "loss": 0.0007, + "num_input_tokens_seen": 6624392, + "step": 3383 + }, + { + "epoch": 0.4485089463220676, + "grad_norm": 9.115584373474121, + "learning_rate": 4.730620471917454e-06, + "loss": 0.3811, + "num_input_tokens_seen": 6627184, + "step": 3384 + }, + { + "epoch": 0.4486414844267727, + "grad_norm": 33.583404541015625, + "learning_rate": 4.730463702188824e-06, + "loss": 0.2169, + "num_input_tokens_seen": 6630296, + "step": 3385 + }, + { + "epoch": 0.4487740225314778, + "grad_norm": 6.667294979095459, + "learning_rate": 4.730306889455314e-06, + "loss": 0.07, + "num_input_tokens_seen": 6631376, + "step": 3386 + }, + { + "epoch": 0.4489065606361829, + "grad_norm": 10.882554054260254, + "learning_rate": 4.73015003371995e-06, + "loss": 0.3074, + "num_input_tokens_seen": 6633088, + "step": 3387 + }, + { + "epoch": 0.449039098740888, + "grad_norm": 3.9572091102600098, + "learning_rate": 4.7299931349857535e-06, + "loss": 0.1124, + "num_input_tokens_seen": 6636376, + "step": 3388 + }, + { + "epoch": 0.4491716368455931, + "grad_norm": 1.989731788635254, + "learning_rate": 4.729836193255752e-06, + "loss": 0.0118, + "num_input_tokens_seen": 6639200, + "step": 3389 + }, + { + "epoch": 0.44930417495029823, + "grad_norm": 0.5093509554862976, + "learning_rate": 4.72967920853297e-06, + "loss": 0.0031, + "num_input_tokens_seen": 6641392, + "step": 3390 + }, + { + "epoch": 0.4494367130550033, + "grad_norm": 13.902198791503906, + "learning_rate": 4.729522180820434e-06, + "loss": 0.3645, + "num_input_tokens_seen": 6643864, + "step": 3391 + }, + { + "epoch": 0.4495692511597084, + "grad_norm": 0.09832777827978134, + "learning_rate": 4.729365110121174e-06, + "loss": 0.0006, + "num_input_tokens_seen": 6645280, + "step": 3392 + }, + { + "epoch": 0.4497017892644135, + "grad_norm": 7.182304382324219, + "learning_rate": 4.729207996438215e-06, + "loss": 0.3736, + "num_input_tokens_seen": 6647464, + "step": 3393 + }, + { + "epoch": 0.4498343273691186, + "grad_norm": 7.930952548980713, + "learning_rate": 4.729050839774589e-06, + "loss": 0.2603, + "num_input_tokens_seen": 6649688, + "step": 3394 + }, + { + "epoch": 0.44996686547382375, + "grad_norm": 0.14620444178581238, + "learning_rate": 4.728893640133324e-06, + "loss": 0.0009, + "num_input_tokens_seen": 6651208, + "step": 3395 + }, + { + "epoch": 0.45009940357852884, + "grad_norm": 5.989103317260742, + "learning_rate": 4.728736397517453e-06, + "loss": 0.1895, + "num_input_tokens_seen": 6653856, + "step": 3396 + }, + { + "epoch": 0.45023194168323394, + "grad_norm": 0.21796511113643646, + "learning_rate": 4.728579111930007e-06, + "loss": 0.0013, + "num_input_tokens_seen": 6656272, + "step": 3397 + }, + { + "epoch": 0.45036447978793903, + "grad_norm": 11.188365936279297, + "learning_rate": 4.728421783374018e-06, + "loss": 0.3376, + "num_input_tokens_seen": 6657936, + "step": 3398 + }, + { + "epoch": 0.4504970178926441, + "grad_norm": 9.467327117919922, + "learning_rate": 4.728264411852519e-06, + "loss": 0.509, + "num_input_tokens_seen": 6660248, + "step": 3399 + }, + { + "epoch": 0.4506295559973492, + "grad_norm": 0.20046353340148926, + "learning_rate": 4.728106997368545e-06, + "loss": 0.0012, + "num_input_tokens_seen": 6662232, + "step": 3400 + }, + { + "epoch": 0.45076209410205437, + "grad_norm": 0.1963941603899002, + "learning_rate": 4.727949539925132e-06, + "loss": 0.0012, + "num_input_tokens_seen": 6663776, + "step": 3401 + }, + { + "epoch": 0.45089463220675946, + "grad_norm": 10.46510124206543, + "learning_rate": 4.727792039525314e-06, + "loss": 0.4066, + "num_input_tokens_seen": 6665856, + "step": 3402 + }, + { + "epoch": 0.45102717031146455, + "grad_norm": 0.18379952013492584, + "learning_rate": 4.727634496172129e-06, + "loss": 0.0011, + "num_input_tokens_seen": 6667424, + "step": 3403 + }, + { + "epoch": 0.45115970841616965, + "grad_norm": 0.0947766825556755, + "learning_rate": 4.727476909868614e-06, + "loss": 0.0006, + "num_input_tokens_seen": 6668752, + "step": 3404 + }, + { + "epoch": 0.45129224652087474, + "grad_norm": 0.3160610795021057, + "learning_rate": 4.727319280617807e-06, + "loss": 0.0018, + "num_input_tokens_seen": 6670200, + "step": 3405 + }, + { + "epoch": 0.45142478462557983, + "grad_norm": 16.8338623046875, + "learning_rate": 4.727161608422749e-06, + "loss": 0.5058, + "num_input_tokens_seen": 6672296, + "step": 3406 + }, + { + "epoch": 0.451557322730285, + "grad_norm": 6.496436595916748, + "learning_rate": 4.727003893286478e-06, + "loss": 0.0974, + "num_input_tokens_seen": 6674352, + "step": 3407 + }, + { + "epoch": 0.4516898608349901, + "grad_norm": 12.64709186553955, + "learning_rate": 4.726846135212036e-06, + "loss": 0.2691, + "num_input_tokens_seen": 6676176, + "step": 3408 + }, + { + "epoch": 0.45182239893969517, + "grad_norm": 7.829931259155273, + "learning_rate": 4.726688334202464e-06, + "loss": 0.1546, + "num_input_tokens_seen": 6678048, + "step": 3409 + }, + { + "epoch": 0.45195493704440026, + "grad_norm": 0.054195426404476166, + "learning_rate": 4.726530490260805e-06, + "loss": 0.0003, + "num_input_tokens_seen": 6679552, + "step": 3410 + }, + { + "epoch": 0.45208747514910536, + "grad_norm": 0.2684761583805084, + "learning_rate": 4.726372603390101e-06, + "loss": 0.0016, + "num_input_tokens_seen": 6680880, + "step": 3411 + }, + { + "epoch": 0.45222001325381045, + "grad_norm": 0.3117387890815735, + "learning_rate": 4.726214673593398e-06, + "loss": 0.0019, + "num_input_tokens_seen": 6683064, + "step": 3412 + }, + { + "epoch": 0.4523525513585156, + "grad_norm": 8.266839027404785, + "learning_rate": 4.72605670087374e-06, + "loss": 0.1384, + "num_input_tokens_seen": 6685968, + "step": 3413 + }, + { + "epoch": 0.4524850894632207, + "grad_norm": 11.656111717224121, + "learning_rate": 4.725898685234174e-06, + "loss": 0.4296, + "num_input_tokens_seen": 6687864, + "step": 3414 + }, + { + "epoch": 0.4526176275679258, + "grad_norm": 0.42431381344795227, + "learning_rate": 4.725740626677746e-06, + "loss": 0.0022, + "num_input_tokens_seen": 6690376, + "step": 3415 + }, + { + "epoch": 0.4527501656726309, + "grad_norm": 11.26159381866455, + "learning_rate": 4.725582525207503e-06, + "loss": 0.3102, + "num_input_tokens_seen": 6692848, + "step": 3416 + }, + { + "epoch": 0.45288270377733597, + "grad_norm": 0.14348818361759186, + "learning_rate": 4.725424380826493e-06, + "loss": 0.0009, + "num_input_tokens_seen": 6695080, + "step": 3417 + }, + { + "epoch": 0.45301524188204106, + "grad_norm": 7.584483623504639, + "learning_rate": 4.725266193537765e-06, + "loss": 0.1943, + "num_input_tokens_seen": 6697000, + "step": 3418 + }, + { + "epoch": 0.4531477799867462, + "grad_norm": 5.411701679229736, + "learning_rate": 4.725107963344371e-06, + "loss": 0.2663, + "num_input_tokens_seen": 6699304, + "step": 3419 + }, + { + "epoch": 0.4532803180914513, + "grad_norm": 2.625098943710327, + "learning_rate": 4.7249496902493605e-06, + "loss": 0.0136, + "num_input_tokens_seen": 6700920, + "step": 3420 + }, + { + "epoch": 0.4534128561961564, + "grad_norm": 5.04707670211792, + "learning_rate": 4.724791374255784e-06, + "loss": 0.2234, + "num_input_tokens_seen": 6702464, + "step": 3421 + }, + { + "epoch": 0.4535453943008615, + "grad_norm": 7.7730631828308105, + "learning_rate": 4.7246330153666954e-06, + "loss": 0.2966, + "num_input_tokens_seen": 6704272, + "step": 3422 + }, + { + "epoch": 0.4536779324055666, + "grad_norm": 7.220533847808838, + "learning_rate": 4.7244746135851485e-06, + "loss": 0.2061, + "num_input_tokens_seen": 6706080, + "step": 3423 + }, + { + "epoch": 0.4538104705102717, + "grad_norm": 11.883634567260742, + "learning_rate": 4.724316168914195e-06, + "loss": 0.3974, + "num_input_tokens_seen": 6707336, + "step": 3424 + }, + { + "epoch": 0.45394300861497683, + "grad_norm": 9.447149276733398, + "learning_rate": 4.724157681356892e-06, + "loss": 0.251, + "num_input_tokens_seen": 6708808, + "step": 3425 + }, + { + "epoch": 0.4540755467196819, + "grad_norm": 16.157278060913086, + "learning_rate": 4.723999150916295e-06, + "loss": 0.6322, + "num_input_tokens_seen": 6710416, + "step": 3426 + }, + { + "epoch": 0.454208084824387, + "grad_norm": 0.042269252240657806, + "learning_rate": 4.72384057759546e-06, + "loss": 0.0003, + "num_input_tokens_seen": 6711768, + "step": 3427 + }, + { + "epoch": 0.4543406229290921, + "grad_norm": 0.23470933735370636, + "learning_rate": 4.723681961397444e-06, + "loss": 0.0014, + "num_input_tokens_seen": 6713616, + "step": 3428 + }, + { + "epoch": 0.4544731610337972, + "grad_norm": 6.778000831604004, + "learning_rate": 4.723523302325306e-06, + "loss": 0.1814, + "num_input_tokens_seen": 6715744, + "step": 3429 + }, + { + "epoch": 0.4546056991385023, + "grad_norm": 2.0614142417907715, + "learning_rate": 4.723364600382104e-06, + "loss": 0.0669, + "num_input_tokens_seen": 6717704, + "step": 3430 + }, + { + "epoch": 0.45473823724320744, + "grad_norm": 9.870048522949219, + "learning_rate": 4.723205855570899e-06, + "loss": 0.22, + "num_input_tokens_seen": 6719568, + "step": 3431 + }, + { + "epoch": 0.45487077534791254, + "grad_norm": 5.1438727378845215, + "learning_rate": 4.723047067894752e-06, + "loss": 0.1886, + "num_input_tokens_seen": 6721792, + "step": 3432 + }, + { + "epoch": 0.45500331345261763, + "grad_norm": 8.39296817779541, + "learning_rate": 4.722888237356724e-06, + "loss": 0.2978, + "num_input_tokens_seen": 6724208, + "step": 3433 + }, + { + "epoch": 0.4551358515573227, + "grad_norm": 0.15995049476623535, + "learning_rate": 4.722729363959877e-06, + "loss": 0.001, + "num_input_tokens_seen": 6726080, + "step": 3434 + }, + { + "epoch": 0.4552683896620278, + "grad_norm": 8.57268238067627, + "learning_rate": 4.722570447707275e-06, + "loss": 0.1489, + "num_input_tokens_seen": 6727696, + "step": 3435 + }, + { + "epoch": 0.4554009277667329, + "grad_norm": 15.05240249633789, + "learning_rate": 4.7224114886019804e-06, + "loss": 0.4957, + "num_input_tokens_seen": 6730616, + "step": 3436 + }, + { + "epoch": 0.45553346587143806, + "grad_norm": 8.12574291229248, + "learning_rate": 4.72225248664706e-06, + "loss": 0.1275, + "num_input_tokens_seen": 6732296, + "step": 3437 + }, + { + "epoch": 0.45566600397614315, + "grad_norm": 8.275718688964844, + "learning_rate": 4.722093441845578e-06, + "loss": 0.1578, + "num_input_tokens_seen": 6734792, + "step": 3438 + }, + { + "epoch": 0.45579854208084825, + "grad_norm": 13.090919494628906, + "learning_rate": 4.721934354200602e-06, + "loss": 0.4731, + "num_input_tokens_seen": 6736824, + "step": 3439 + }, + { + "epoch": 0.45593108018555334, + "grad_norm": 13.868168830871582, + "learning_rate": 4.721775223715199e-06, + "loss": 0.3414, + "num_input_tokens_seen": 6739064, + "step": 3440 + }, + { + "epoch": 0.45606361829025843, + "grad_norm": 7.289348602294922, + "learning_rate": 4.7216160503924365e-06, + "loss": 0.1797, + "num_input_tokens_seen": 6740752, + "step": 3441 + }, + { + "epoch": 0.4561961563949635, + "grad_norm": 13.353787422180176, + "learning_rate": 4.721456834235384e-06, + "loss": 0.5553, + "num_input_tokens_seen": 6743040, + "step": 3442 + }, + { + "epoch": 0.4563286944996687, + "grad_norm": 10.058000564575195, + "learning_rate": 4.721297575247111e-06, + "loss": 0.2204, + "num_input_tokens_seen": 6745160, + "step": 3443 + }, + { + "epoch": 0.45646123260437377, + "grad_norm": 13.75790023803711, + "learning_rate": 4.721138273430689e-06, + "loss": 0.3976, + "num_input_tokens_seen": 6746896, + "step": 3444 + }, + { + "epoch": 0.45659377070907886, + "grad_norm": 4.942676544189453, + "learning_rate": 4.720978928789188e-06, + "loss": 0.1012, + "num_input_tokens_seen": 6748432, + "step": 3445 + }, + { + "epoch": 0.45672630881378395, + "grad_norm": 14.997764587402344, + "learning_rate": 4.720819541325682e-06, + "loss": 0.2681, + "num_input_tokens_seen": 6750080, + "step": 3446 + }, + { + "epoch": 0.45685884691848905, + "grad_norm": 10.357013702392578, + "learning_rate": 4.720660111043243e-06, + "loss": 0.39, + "num_input_tokens_seen": 6751792, + "step": 3447 + }, + { + "epoch": 0.45699138502319414, + "grad_norm": 4.007915019989014, + "learning_rate": 4.7205006379449445e-06, + "loss": 0.0216, + "num_input_tokens_seen": 6753768, + "step": 3448 + }, + { + "epoch": 0.4571239231278993, + "grad_norm": 12.639501571655273, + "learning_rate": 4.720341122033862e-06, + "loss": 0.4189, + "num_input_tokens_seen": 6755808, + "step": 3449 + }, + { + "epoch": 0.4572564612326044, + "grad_norm": 7.345523357391357, + "learning_rate": 4.720181563313071e-06, + "loss": 0.0468, + "num_input_tokens_seen": 6757448, + "step": 3450 + }, + { + "epoch": 0.4573889993373095, + "grad_norm": 9.172658920288086, + "learning_rate": 4.720021961785648e-06, + "loss": 0.2125, + "num_input_tokens_seen": 6758880, + "step": 3451 + }, + { + "epoch": 0.45752153744201457, + "grad_norm": 7.229430675506592, + "learning_rate": 4.71986231745467e-06, + "loss": 0.2521, + "num_input_tokens_seen": 6760896, + "step": 3452 + }, + { + "epoch": 0.45765407554671966, + "grad_norm": 20.520578384399414, + "learning_rate": 4.719702630323215e-06, + "loss": 0.2354, + "num_input_tokens_seen": 6762520, + "step": 3453 + }, + { + "epoch": 0.4577866136514248, + "grad_norm": 5.293634414672852, + "learning_rate": 4.719542900394361e-06, + "loss": 0.1087, + "num_input_tokens_seen": 6764888, + "step": 3454 + }, + { + "epoch": 0.4579191517561299, + "grad_norm": 0.2889997065067291, + "learning_rate": 4.71938312767119e-06, + "loss": 0.0018, + "num_input_tokens_seen": 6765880, + "step": 3455 + }, + { + "epoch": 0.458051689860835, + "grad_norm": 0.7488850951194763, + "learning_rate": 4.719223312156782e-06, + "loss": 0.0043, + "num_input_tokens_seen": 6767672, + "step": 3456 + }, + { + "epoch": 0.4581842279655401, + "grad_norm": 7.767181396484375, + "learning_rate": 4.719063453854216e-06, + "loss": 0.1899, + "num_input_tokens_seen": 6769968, + "step": 3457 + }, + { + "epoch": 0.4583167660702452, + "grad_norm": 11.677338600158691, + "learning_rate": 4.718903552766576e-06, + "loss": 0.3151, + "num_input_tokens_seen": 6772112, + "step": 3458 + }, + { + "epoch": 0.4584493041749503, + "grad_norm": 13.982976913452148, + "learning_rate": 4.718743608896945e-06, + "loss": 0.5062, + "num_input_tokens_seen": 6775208, + "step": 3459 + }, + { + "epoch": 0.4585818422796554, + "grad_norm": 15.643267631530762, + "learning_rate": 4.718583622248407e-06, + "loss": 0.3352, + "num_input_tokens_seen": 6776976, + "step": 3460 + }, + { + "epoch": 0.4587143803843605, + "grad_norm": 3.260188102722168, + "learning_rate": 4.718423592824047e-06, + "loss": 0.0345, + "num_input_tokens_seen": 6778400, + "step": 3461 + }, + { + "epoch": 0.4588469184890656, + "grad_norm": 2.7483954429626465, + "learning_rate": 4.7182635206269475e-06, + "loss": 0.0162, + "num_input_tokens_seen": 6780192, + "step": 3462 + }, + { + "epoch": 0.4589794565937707, + "grad_norm": 7.3294677734375, + "learning_rate": 4.718103405660198e-06, + "loss": 0.2597, + "num_input_tokens_seen": 6782616, + "step": 3463 + }, + { + "epoch": 0.4591119946984758, + "grad_norm": 0.37416163086891174, + "learning_rate": 4.717943247926885e-06, + "loss": 0.0023, + "num_input_tokens_seen": 6784224, + "step": 3464 + }, + { + "epoch": 0.4592445328031809, + "grad_norm": 6.967800140380859, + "learning_rate": 4.717783047430096e-06, + "loss": 0.1431, + "num_input_tokens_seen": 6786368, + "step": 3465 + }, + { + "epoch": 0.45937707090788604, + "grad_norm": 0.6319146156311035, + "learning_rate": 4.71762280417292e-06, + "loss": 0.0039, + "num_input_tokens_seen": 6788584, + "step": 3466 + }, + { + "epoch": 0.45950960901259114, + "grad_norm": 3.8938984870910645, + "learning_rate": 4.717462518158447e-06, + "loss": 0.0177, + "num_input_tokens_seen": 6791368, + "step": 3467 + }, + { + "epoch": 0.45964214711729623, + "grad_norm": 2.0523159503936768, + "learning_rate": 4.7173021893897655e-06, + "loss": 0.0118, + "num_input_tokens_seen": 6793016, + "step": 3468 + }, + { + "epoch": 0.4597746852220013, + "grad_norm": 16.6965389251709, + "learning_rate": 4.7171418178699686e-06, + "loss": 0.722, + "num_input_tokens_seen": 6795400, + "step": 3469 + }, + { + "epoch": 0.4599072233267064, + "grad_norm": 0.1304982751607895, + "learning_rate": 4.716981403602148e-06, + "loss": 0.0008, + "num_input_tokens_seen": 6797016, + "step": 3470 + }, + { + "epoch": 0.4600397614314115, + "grad_norm": 5.9426589012146, + "learning_rate": 4.716820946589397e-06, + "loss": 0.0769, + "num_input_tokens_seen": 6798408, + "step": 3471 + }, + { + "epoch": 0.46017229953611666, + "grad_norm": 6.750141143798828, + "learning_rate": 4.7166604468348085e-06, + "loss": 0.1026, + "num_input_tokens_seen": 6799680, + "step": 3472 + }, + { + "epoch": 0.46030483764082175, + "grad_norm": 0.027093039825558662, + "learning_rate": 4.716499904341477e-06, + "loss": 0.0002, + "num_input_tokens_seen": 6802248, + "step": 3473 + }, + { + "epoch": 0.46043737574552684, + "grad_norm": 0.033254474401474, + "learning_rate": 4.716339319112499e-06, + "loss": 0.0002, + "num_input_tokens_seen": 6803600, + "step": 3474 + }, + { + "epoch": 0.46056991385023194, + "grad_norm": 0.05802615359425545, + "learning_rate": 4.71617869115097e-06, + "loss": 0.0004, + "num_input_tokens_seen": 6805064, + "step": 3475 + }, + { + "epoch": 0.46070245195493703, + "grad_norm": 2.059119939804077, + "learning_rate": 4.716018020459986e-06, + "loss": 0.034, + "num_input_tokens_seen": 6806400, + "step": 3476 + }, + { + "epoch": 0.4608349900596421, + "grad_norm": 3.234501838684082, + "learning_rate": 4.715857307042646e-06, + "loss": 0.0698, + "num_input_tokens_seen": 6808112, + "step": 3477 + }, + { + "epoch": 0.4609675281643473, + "grad_norm": 0.052756454795598984, + "learning_rate": 4.7156965509020494e-06, + "loss": 0.0003, + "num_input_tokens_seen": 6810096, + "step": 3478 + }, + { + "epoch": 0.46110006626905237, + "grad_norm": 10.914143562316895, + "learning_rate": 4.715535752041294e-06, + "loss": 0.2037, + "num_input_tokens_seen": 6812536, + "step": 3479 + }, + { + "epoch": 0.46123260437375746, + "grad_norm": 9.169696807861328, + "learning_rate": 4.715374910463481e-06, + "loss": 0.2529, + "num_input_tokens_seen": 6813840, + "step": 3480 + }, + { + "epoch": 0.46136514247846255, + "grad_norm": 18.554460525512695, + "learning_rate": 4.715214026171711e-06, + "loss": 0.5491, + "num_input_tokens_seen": 6816752, + "step": 3481 + }, + { + "epoch": 0.46149768058316765, + "grad_norm": 8.606828689575195, + "learning_rate": 4.715053099169088e-06, + "loss": 0.2324, + "num_input_tokens_seen": 6819320, + "step": 3482 + }, + { + "epoch": 0.46163021868787274, + "grad_norm": 9.947463989257812, + "learning_rate": 4.714892129458712e-06, + "loss": 0.3311, + "num_input_tokens_seen": 6822088, + "step": 3483 + }, + { + "epoch": 0.4617627567925779, + "grad_norm": 0.040584735572338104, + "learning_rate": 4.7147311170436875e-06, + "loss": 0.0002, + "num_input_tokens_seen": 6823832, + "step": 3484 + }, + { + "epoch": 0.461895294897283, + "grad_norm": 7.335174560546875, + "learning_rate": 4.714570061927119e-06, + "loss": 0.3384, + "num_input_tokens_seen": 6825440, + "step": 3485 + }, + { + "epoch": 0.4620278330019881, + "grad_norm": 14.993861198425293, + "learning_rate": 4.7144089641121125e-06, + "loss": 0.5626, + "num_input_tokens_seen": 6827152, + "step": 3486 + }, + { + "epoch": 0.46216037110669317, + "grad_norm": 9.26425552368164, + "learning_rate": 4.714247823601774e-06, + "loss": 0.1909, + "num_input_tokens_seen": 6829056, + "step": 3487 + }, + { + "epoch": 0.46229290921139826, + "grad_norm": 4.233144283294678, + "learning_rate": 4.714086640399209e-06, + "loss": 0.1163, + "num_input_tokens_seen": 6830808, + "step": 3488 + }, + { + "epoch": 0.46242544731610336, + "grad_norm": 0.08130092173814774, + "learning_rate": 4.713925414507527e-06, + "loss": 0.0005, + "num_input_tokens_seen": 6833496, + "step": 3489 + }, + { + "epoch": 0.4625579854208085, + "grad_norm": 6.8416852951049805, + "learning_rate": 4.713764145929836e-06, + "loss": 0.2026, + "num_input_tokens_seen": 6835072, + "step": 3490 + }, + { + "epoch": 0.4626905235255136, + "grad_norm": 6.314442157745361, + "learning_rate": 4.713602834669243e-06, + "loss": 0.2061, + "num_input_tokens_seen": 6837080, + "step": 3491 + }, + { + "epoch": 0.4628230616302187, + "grad_norm": 0.07984998822212219, + "learning_rate": 4.713441480728862e-06, + "loss": 0.0005, + "num_input_tokens_seen": 6838568, + "step": 3492 + }, + { + "epoch": 0.4629555997349238, + "grad_norm": 0.32196831703186035, + "learning_rate": 4.713280084111802e-06, + "loss": 0.002, + "num_input_tokens_seen": 6841576, + "step": 3493 + }, + { + "epoch": 0.4630881378396289, + "grad_norm": 9.388946533203125, + "learning_rate": 4.713118644821175e-06, + "loss": 0.236, + "num_input_tokens_seen": 6843304, + "step": 3494 + }, + { + "epoch": 0.46322067594433397, + "grad_norm": 10.134395599365234, + "learning_rate": 4.712957162860094e-06, + "loss": 0.1961, + "num_input_tokens_seen": 6845392, + "step": 3495 + }, + { + "epoch": 0.4633532140490391, + "grad_norm": 4.014077186584473, + "learning_rate": 4.712795638231672e-06, + "loss": 0.0446, + "num_input_tokens_seen": 6846624, + "step": 3496 + }, + { + "epoch": 0.4634857521537442, + "grad_norm": 0.2507886588573456, + "learning_rate": 4.712634070939024e-06, + "loss": 0.0016, + "num_input_tokens_seen": 6848864, + "step": 3497 + }, + { + "epoch": 0.4636182902584493, + "grad_norm": 0.16358473896980286, + "learning_rate": 4.712472460985264e-06, + "loss": 0.001, + "num_input_tokens_seen": 6850744, + "step": 3498 + }, + { + "epoch": 0.4637508283631544, + "grad_norm": 10.368104934692383, + "learning_rate": 4.712310808373509e-06, + "loss": 0.1925, + "num_input_tokens_seen": 6852496, + "step": 3499 + }, + { + "epoch": 0.4638833664678595, + "grad_norm": 5.731873035430908, + "learning_rate": 4.712149113106875e-06, + "loss": 0.067, + "num_input_tokens_seen": 6854272, + "step": 3500 + }, + { + "epoch": 0.4640159045725646, + "grad_norm": 9.510446548461914, + "learning_rate": 4.711987375188481e-06, + "loss": 0.1777, + "num_input_tokens_seen": 6856712, + "step": 3501 + }, + { + "epoch": 0.46414844267726973, + "grad_norm": 0.17497515678405762, + "learning_rate": 4.711825594621443e-06, + "loss": 0.0011, + "num_input_tokens_seen": 6859264, + "step": 3502 + }, + { + "epoch": 0.46428098078197483, + "grad_norm": 12.0756254196167, + "learning_rate": 4.711663771408883e-06, + "loss": 0.2428, + "num_input_tokens_seen": 6861144, + "step": 3503 + }, + { + "epoch": 0.4644135188866799, + "grad_norm": 13.558756828308105, + "learning_rate": 4.711501905553919e-06, + "loss": 0.1976, + "num_input_tokens_seen": 6862928, + "step": 3504 + }, + { + "epoch": 0.464546056991385, + "grad_norm": 13.208483695983887, + "learning_rate": 4.711339997059673e-06, + "loss": 0.429, + "num_input_tokens_seen": 6864824, + "step": 3505 + }, + { + "epoch": 0.4646785950960901, + "grad_norm": 18.7616024017334, + "learning_rate": 4.711178045929267e-06, + "loss": 0.801, + "num_input_tokens_seen": 6866920, + "step": 3506 + }, + { + "epoch": 0.4648111332007952, + "grad_norm": 9.190680503845215, + "learning_rate": 4.711016052165822e-06, + "loss": 0.1913, + "num_input_tokens_seen": 6869160, + "step": 3507 + }, + { + "epoch": 0.46494367130550035, + "grad_norm": 0.18456867337226868, + "learning_rate": 4.710854015772462e-06, + "loss": 0.0012, + "num_input_tokens_seen": 6871440, + "step": 3508 + }, + { + "epoch": 0.46507620941020544, + "grad_norm": 9.639766693115234, + "learning_rate": 4.710691936752312e-06, + "loss": 0.2014, + "num_input_tokens_seen": 6872920, + "step": 3509 + }, + { + "epoch": 0.46520874751491054, + "grad_norm": 0.17921799421310425, + "learning_rate": 4.710529815108496e-06, + "loss": 0.0011, + "num_input_tokens_seen": 6874312, + "step": 3510 + }, + { + "epoch": 0.46534128561961563, + "grad_norm": 0.13441920280456543, + "learning_rate": 4.7103676508441395e-06, + "loss": 0.0009, + "num_input_tokens_seen": 6875872, + "step": 3511 + }, + { + "epoch": 0.4654738237243207, + "grad_norm": 3.1142585277557373, + "learning_rate": 4.710205443962371e-06, + "loss": 0.0397, + "num_input_tokens_seen": 6878568, + "step": 3512 + }, + { + "epoch": 0.46560636182902587, + "grad_norm": 1.7977932691574097, + "learning_rate": 4.710043194466316e-06, + "loss": 0.0148, + "num_input_tokens_seen": 6881128, + "step": 3513 + }, + { + "epoch": 0.46573889993373097, + "grad_norm": 8.77234172821045, + "learning_rate": 4.709880902359104e-06, + "loss": 0.1691, + "num_input_tokens_seen": 6883496, + "step": 3514 + }, + { + "epoch": 0.46587143803843606, + "grad_norm": 7.098576545715332, + "learning_rate": 4.709718567643864e-06, + "loss": 0.1133, + "num_input_tokens_seen": 6885968, + "step": 3515 + }, + { + "epoch": 0.46600397614314115, + "grad_norm": 6.075428485870361, + "learning_rate": 4.709556190323725e-06, + "loss": 0.1374, + "num_input_tokens_seen": 6887632, + "step": 3516 + }, + { + "epoch": 0.46613651424784625, + "grad_norm": 9.650872230529785, + "learning_rate": 4.709393770401818e-06, + "loss": 0.1179, + "num_input_tokens_seen": 6889816, + "step": 3517 + }, + { + "epoch": 0.46626905235255134, + "grad_norm": 18.97589874267578, + "learning_rate": 4.709231307881276e-06, + "loss": 0.5897, + "num_input_tokens_seen": 6891944, + "step": 3518 + }, + { + "epoch": 0.4664015904572565, + "grad_norm": 19.381183624267578, + "learning_rate": 4.7090688027652295e-06, + "loss": 0.181, + "num_input_tokens_seen": 6893888, + "step": 3519 + }, + { + "epoch": 0.4665341285619616, + "grad_norm": 15.839090347290039, + "learning_rate": 4.708906255056813e-06, + "loss": 0.6315, + "num_input_tokens_seen": 6896096, + "step": 3520 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 27.02651596069336, + "learning_rate": 4.70874366475916e-06, + "loss": 1.0679, + "num_input_tokens_seen": 6899136, + "step": 3521 + }, + { + "epoch": 0.46679920477137177, + "grad_norm": 1.7673804759979248, + "learning_rate": 4.708581031875406e-06, + "loss": 0.0171, + "num_input_tokens_seen": 6900768, + "step": 3522 + }, + { + "epoch": 0.46693174287607686, + "grad_norm": 10.952544212341309, + "learning_rate": 4.708418356408685e-06, + "loss": 0.3071, + "num_input_tokens_seen": 6902488, + "step": 3523 + }, + { + "epoch": 0.46706428098078195, + "grad_norm": 3.474372148513794, + "learning_rate": 4.708255638362135e-06, + "loss": 0.0484, + "num_input_tokens_seen": 6904192, + "step": 3524 + }, + { + "epoch": 0.4671968190854871, + "grad_norm": 4.23491907119751, + "learning_rate": 4.708092877738893e-06, + "loss": 0.1421, + "num_input_tokens_seen": 6906360, + "step": 3525 + }, + { + "epoch": 0.4673293571901922, + "grad_norm": 6.1367878913879395, + "learning_rate": 4.707930074542097e-06, + "loss": 0.1427, + "num_input_tokens_seen": 6908184, + "step": 3526 + }, + { + "epoch": 0.4674618952948973, + "grad_norm": 8.948915481567383, + "learning_rate": 4.707767228774885e-06, + "loss": 0.1737, + "num_input_tokens_seen": 6910384, + "step": 3527 + }, + { + "epoch": 0.4675944333996024, + "grad_norm": 0.1639985293149948, + "learning_rate": 4.707604340440399e-06, + "loss": 0.0009, + "num_input_tokens_seen": 6911848, + "step": 3528 + }, + { + "epoch": 0.4677269715043075, + "grad_norm": 12.816913604736328, + "learning_rate": 4.707441409541779e-06, + "loss": 0.2061, + "num_input_tokens_seen": 6913872, + "step": 3529 + }, + { + "epoch": 0.46785950960901257, + "grad_norm": 0.48404574394226074, + "learning_rate": 4.707278436082165e-06, + "loss": 0.0025, + "num_input_tokens_seen": 6915688, + "step": 3530 + }, + { + "epoch": 0.4679920477137177, + "grad_norm": 28.354053497314453, + "learning_rate": 4.707115420064699e-06, + "loss": 1.0691, + "num_input_tokens_seen": 6918248, + "step": 3531 + }, + { + "epoch": 0.4681245858184228, + "grad_norm": 6.964322090148926, + "learning_rate": 4.706952361492526e-06, + "loss": 0.2007, + "num_input_tokens_seen": 6920344, + "step": 3532 + }, + { + "epoch": 0.4682571239231279, + "grad_norm": 0.20571963489055634, + "learning_rate": 4.70678926036879e-06, + "loss": 0.0012, + "num_input_tokens_seen": 6921904, + "step": 3533 + }, + { + "epoch": 0.468389662027833, + "grad_norm": 0.07439305633306503, + "learning_rate": 4.706626116696632e-06, + "loss": 0.0005, + "num_input_tokens_seen": 6924288, + "step": 3534 + }, + { + "epoch": 0.4685222001325381, + "grad_norm": 10.463114738464355, + "learning_rate": 4.706462930479202e-06, + "loss": 0.2889, + "num_input_tokens_seen": 6925864, + "step": 3535 + }, + { + "epoch": 0.4686547382372432, + "grad_norm": 0.07231903821229935, + "learning_rate": 4.706299701719644e-06, + "loss": 0.0005, + "num_input_tokens_seen": 6927344, + "step": 3536 + }, + { + "epoch": 0.46878727634194833, + "grad_norm": 13.263286590576172, + "learning_rate": 4.706136430421107e-06, + "loss": 0.2207, + "num_input_tokens_seen": 6930144, + "step": 3537 + }, + { + "epoch": 0.4689198144466534, + "grad_norm": 10.516202926635742, + "learning_rate": 4.705973116586736e-06, + "loss": 0.1775, + "num_input_tokens_seen": 6932816, + "step": 3538 + }, + { + "epoch": 0.4690523525513585, + "grad_norm": 5.35650110244751, + "learning_rate": 4.705809760219683e-06, + "loss": 0.0998, + "num_input_tokens_seen": 6935120, + "step": 3539 + }, + { + "epoch": 0.4691848906560636, + "grad_norm": 4.04359769821167, + "learning_rate": 4.705646361323095e-06, + "loss": 0.0354, + "num_input_tokens_seen": 6937264, + "step": 3540 + }, + { + "epoch": 0.4693174287607687, + "grad_norm": 8.984070777893066, + "learning_rate": 4.705482919900124e-06, + "loss": 0.3484, + "num_input_tokens_seen": 6939464, + "step": 3541 + }, + { + "epoch": 0.4694499668654738, + "grad_norm": 9.976149559020996, + "learning_rate": 4.705319435953921e-06, + "loss": 0.2858, + "num_input_tokens_seen": 6941152, + "step": 3542 + }, + { + "epoch": 0.46958250497017895, + "grad_norm": 7.658107280731201, + "learning_rate": 4.705155909487637e-06, + "loss": 0.2591, + "num_input_tokens_seen": 6943064, + "step": 3543 + }, + { + "epoch": 0.46971504307488404, + "grad_norm": 6.953422546386719, + "learning_rate": 4.704992340504426e-06, + "loss": 0.2118, + "num_input_tokens_seen": 6944496, + "step": 3544 + }, + { + "epoch": 0.46984758117958914, + "grad_norm": 6.0076141357421875, + "learning_rate": 4.704828729007443e-06, + "loss": 0.1274, + "num_input_tokens_seen": 6946560, + "step": 3545 + }, + { + "epoch": 0.46998011928429423, + "grad_norm": 2.6947245597839355, + "learning_rate": 4.7046650749998396e-06, + "loss": 0.032, + "num_input_tokens_seen": 6947640, + "step": 3546 + }, + { + "epoch": 0.4701126573889993, + "grad_norm": 6.5871381759643555, + "learning_rate": 4.704501378484772e-06, + "loss": 0.1343, + "num_input_tokens_seen": 6949400, + "step": 3547 + }, + { + "epoch": 0.4702451954937044, + "grad_norm": 21.81719207763672, + "learning_rate": 4.704337639465399e-06, + "loss": 0.4704, + "num_input_tokens_seen": 6951224, + "step": 3548 + }, + { + "epoch": 0.47037773359840956, + "grad_norm": 6.1173810958862305, + "learning_rate": 4.704173857944875e-06, + "loss": 0.0759, + "num_input_tokens_seen": 6953560, + "step": 3549 + }, + { + "epoch": 0.47051027170311466, + "grad_norm": 19.397836685180664, + "learning_rate": 4.704010033926358e-06, + "loss": 0.6053, + "num_input_tokens_seen": 6956720, + "step": 3550 + }, + { + "epoch": 0.47064280980781975, + "grad_norm": 0.04566048085689545, + "learning_rate": 4.703846167413007e-06, + "loss": 0.0003, + "num_input_tokens_seen": 6958128, + "step": 3551 + }, + { + "epoch": 0.47077534791252484, + "grad_norm": 6.820549011230469, + "learning_rate": 4.703682258407981e-06, + "loss": 0.0614, + "num_input_tokens_seen": 6960984, + "step": 3552 + }, + { + "epoch": 0.47090788601722994, + "grad_norm": 5.300433158874512, + "learning_rate": 4.703518306914442e-06, + "loss": 0.0461, + "num_input_tokens_seen": 6964080, + "step": 3553 + }, + { + "epoch": 0.47104042412193503, + "grad_norm": 11.61038589477539, + "learning_rate": 4.70335431293555e-06, + "loss": 0.4296, + "num_input_tokens_seen": 6965624, + "step": 3554 + }, + { + "epoch": 0.4711729622266402, + "grad_norm": 9.014015197753906, + "learning_rate": 4.703190276474466e-06, + "loss": 0.2061, + "num_input_tokens_seen": 6967456, + "step": 3555 + }, + { + "epoch": 0.4713055003313453, + "grad_norm": 9.886773109436035, + "learning_rate": 4.703026197534354e-06, + "loss": 0.3265, + "num_input_tokens_seen": 6969440, + "step": 3556 + }, + { + "epoch": 0.47143803843605037, + "grad_norm": 6.1779046058654785, + "learning_rate": 4.702862076118377e-06, + "loss": 0.0765, + "num_input_tokens_seen": 6971352, + "step": 3557 + }, + { + "epoch": 0.47157057654075546, + "grad_norm": 4.020450115203857, + "learning_rate": 4.702697912229699e-06, + "loss": 0.0255, + "num_input_tokens_seen": 6973592, + "step": 3558 + }, + { + "epoch": 0.47170311464546055, + "grad_norm": 0.1025056540966034, + "learning_rate": 4.702533705871487e-06, + "loss": 0.0006, + "num_input_tokens_seen": 6975464, + "step": 3559 + }, + { + "epoch": 0.47183565275016565, + "grad_norm": 8.514978408813477, + "learning_rate": 4.702369457046904e-06, + "loss": 0.3335, + "num_input_tokens_seen": 6977192, + "step": 3560 + }, + { + "epoch": 0.4719681908548708, + "grad_norm": 4.87443733215332, + "learning_rate": 4.702205165759119e-06, + "loss": 0.1765, + "num_input_tokens_seen": 6979528, + "step": 3561 + }, + { + "epoch": 0.4721007289595759, + "grad_norm": 5.829202651977539, + "learning_rate": 4.7020408320113e-06, + "loss": 0.1468, + "num_input_tokens_seen": 6982144, + "step": 3562 + }, + { + "epoch": 0.472233267064281, + "grad_norm": 5.061383247375488, + "learning_rate": 4.701876455806615e-06, + "loss": 0.1379, + "num_input_tokens_seen": 6983752, + "step": 3563 + }, + { + "epoch": 0.4723658051689861, + "grad_norm": 7.918785572052002, + "learning_rate": 4.701712037148231e-06, + "loss": 0.1314, + "num_input_tokens_seen": 6986208, + "step": 3564 + }, + { + "epoch": 0.47249834327369117, + "grad_norm": 6.342104911804199, + "learning_rate": 4.701547576039321e-06, + "loss": 0.2634, + "num_input_tokens_seen": 6988792, + "step": 3565 + }, + { + "epoch": 0.47263088137839626, + "grad_norm": 10.905147552490234, + "learning_rate": 4.701383072483056e-06, + "loss": 0.3358, + "num_input_tokens_seen": 6991376, + "step": 3566 + }, + { + "epoch": 0.4727634194831014, + "grad_norm": 0.16422037780284882, + "learning_rate": 4.701218526482606e-06, + "loss": 0.001, + "num_input_tokens_seen": 6993824, + "step": 3567 + }, + { + "epoch": 0.4728959575878065, + "grad_norm": 1.7487876415252686, + "learning_rate": 4.701053938041144e-06, + "loss": 0.0097, + "num_input_tokens_seen": 6994960, + "step": 3568 + }, + { + "epoch": 0.4730284956925116, + "grad_norm": 0.07559742778539658, + "learning_rate": 4.700889307161843e-06, + "loss": 0.0005, + "num_input_tokens_seen": 6996656, + "step": 3569 + }, + { + "epoch": 0.4731610337972167, + "grad_norm": 14.890816688537598, + "learning_rate": 4.70072463384788e-06, + "loss": 0.4581, + "num_input_tokens_seen": 6999152, + "step": 3570 + }, + { + "epoch": 0.4732935719019218, + "grad_norm": 5.607250213623047, + "learning_rate": 4.700559918102426e-06, + "loss": 0.2113, + "num_input_tokens_seen": 7001648, + "step": 3571 + }, + { + "epoch": 0.47342611000662693, + "grad_norm": 0.10200473666191101, + "learning_rate": 4.700395159928659e-06, + "loss": 0.0006, + "num_input_tokens_seen": 7004200, + "step": 3572 + }, + { + "epoch": 0.473558648111332, + "grad_norm": 12.87032699584961, + "learning_rate": 4.700230359329756e-06, + "loss": 0.2462, + "num_input_tokens_seen": 7006320, + "step": 3573 + }, + { + "epoch": 0.4736911862160371, + "grad_norm": 11.754016876220703, + "learning_rate": 4.7000655163088925e-06, + "loss": 0.3285, + "num_input_tokens_seen": 7007864, + "step": 3574 + }, + { + "epoch": 0.4738237243207422, + "grad_norm": 5.008886814117432, + "learning_rate": 4.699900630869249e-06, + "loss": 0.1819, + "num_input_tokens_seen": 7009920, + "step": 3575 + }, + { + "epoch": 0.4739562624254473, + "grad_norm": 0.058161620050668716, + "learning_rate": 4.699735703014004e-06, + "loss": 0.0004, + "num_input_tokens_seen": 7011640, + "step": 3576 + }, + { + "epoch": 0.4740888005301524, + "grad_norm": 9.170276641845703, + "learning_rate": 4.699570732746337e-06, + "loss": 0.4069, + "num_input_tokens_seen": 7013392, + "step": 3577 + }, + { + "epoch": 0.47422133863485755, + "grad_norm": 8.129196166992188, + "learning_rate": 4.699405720069429e-06, + "loss": 0.1891, + "num_input_tokens_seen": 7015272, + "step": 3578 + }, + { + "epoch": 0.47435387673956264, + "grad_norm": 8.739459037780762, + "learning_rate": 4.6992406649864605e-06, + "loss": 0.2107, + "num_input_tokens_seen": 7017376, + "step": 3579 + }, + { + "epoch": 0.47448641484426773, + "grad_norm": 0.16730321943759918, + "learning_rate": 4.699075567500615e-06, + "loss": 0.0009, + "num_input_tokens_seen": 7020160, + "step": 3580 + }, + { + "epoch": 0.4746189529489728, + "grad_norm": 9.901205062866211, + "learning_rate": 4.698910427615076e-06, + "loss": 0.1999, + "num_input_tokens_seen": 7021360, + "step": 3581 + }, + { + "epoch": 0.4747514910536779, + "grad_norm": 10.716650009155273, + "learning_rate": 4.698745245333026e-06, + "loss": 0.3243, + "num_input_tokens_seen": 7023736, + "step": 3582 + }, + { + "epoch": 0.474884029158383, + "grad_norm": 13.758613586425781, + "learning_rate": 4.698580020657653e-06, + "loss": 0.3305, + "num_input_tokens_seen": 7025888, + "step": 3583 + }, + { + "epoch": 0.47501656726308816, + "grad_norm": 2.665637254714966, + "learning_rate": 4.698414753592139e-06, + "loss": 0.0605, + "num_input_tokens_seen": 7028576, + "step": 3584 + }, + { + "epoch": 0.47514910536779326, + "grad_norm": 18.0197696685791, + "learning_rate": 4.698249444139672e-06, + "loss": 0.4927, + "num_input_tokens_seen": 7032600, + "step": 3585 + }, + { + "epoch": 0.47528164347249835, + "grad_norm": 4.9588823318481445, + "learning_rate": 4.698084092303439e-06, + "loss": 0.1225, + "num_input_tokens_seen": 7034088, + "step": 3586 + }, + { + "epoch": 0.47541418157720344, + "grad_norm": 1.3933740854263306, + "learning_rate": 4.697918698086629e-06, + "loss": 0.0075, + "num_input_tokens_seen": 7035624, + "step": 3587 + }, + { + "epoch": 0.47554671968190854, + "grad_norm": 6.276792049407959, + "learning_rate": 4.697753261492429e-06, + "loss": 0.0486, + "num_input_tokens_seen": 7037768, + "step": 3588 + }, + { + "epoch": 0.47567925778661363, + "grad_norm": 9.799899101257324, + "learning_rate": 4.697587782524031e-06, + "loss": 0.3571, + "num_input_tokens_seen": 7039176, + "step": 3589 + }, + { + "epoch": 0.4758117958913188, + "grad_norm": 15.974863052368164, + "learning_rate": 4.697422261184625e-06, + "loss": 0.4476, + "num_input_tokens_seen": 7041008, + "step": 3590 + }, + { + "epoch": 0.47594433399602387, + "grad_norm": 0.06836438179016113, + "learning_rate": 4.697256697477401e-06, + "loss": 0.0004, + "num_input_tokens_seen": 7042888, + "step": 3591 + }, + { + "epoch": 0.47607687210072896, + "grad_norm": 0.23637758195400238, + "learning_rate": 4.697091091405552e-06, + "loss": 0.0014, + "num_input_tokens_seen": 7045520, + "step": 3592 + }, + { + "epoch": 0.47620941020543406, + "grad_norm": 0.11919800192117691, + "learning_rate": 4.6969254429722715e-06, + "loss": 0.0007, + "num_input_tokens_seen": 7046760, + "step": 3593 + }, + { + "epoch": 0.47634194831013915, + "grad_norm": 2.406327962875366, + "learning_rate": 4.696759752180753e-06, + "loss": 0.024, + "num_input_tokens_seen": 7047928, + "step": 3594 + }, + { + "epoch": 0.47647448641484424, + "grad_norm": 10.8333158493042, + "learning_rate": 4.696594019034191e-06, + "loss": 0.2878, + "num_input_tokens_seen": 7049872, + "step": 3595 + }, + { + "epoch": 0.4766070245195494, + "grad_norm": 11.24345588684082, + "learning_rate": 4.69642824353578e-06, + "loss": 0.3361, + "num_input_tokens_seen": 7052248, + "step": 3596 + }, + { + "epoch": 0.4767395626242545, + "grad_norm": 4.304723262786865, + "learning_rate": 4.696262425688719e-06, + "loss": 0.0769, + "num_input_tokens_seen": 7054560, + "step": 3597 + }, + { + "epoch": 0.4768721007289596, + "grad_norm": 0.1264643669128418, + "learning_rate": 4.696096565496201e-06, + "loss": 0.0008, + "num_input_tokens_seen": 7055744, + "step": 3598 + }, + { + "epoch": 0.4770046388336647, + "grad_norm": 0.31086790561676025, + "learning_rate": 4.695930662961428e-06, + "loss": 0.0019, + "num_input_tokens_seen": 7057648, + "step": 3599 + }, + { + "epoch": 0.47713717693836977, + "grad_norm": 15.648187637329102, + "learning_rate": 4.695764718087597e-06, + "loss": 0.6259, + "num_input_tokens_seen": 7059672, + "step": 3600 + }, + { + "epoch": 0.47726971504307486, + "grad_norm": 8.93014144897461, + "learning_rate": 4.695598730877906e-06, + "loss": 0.3532, + "num_input_tokens_seen": 7061440, + "step": 3601 + }, + { + "epoch": 0.47740225314778, + "grad_norm": 17.751750946044922, + "learning_rate": 4.695432701335558e-06, + "loss": 0.5704, + "num_input_tokens_seen": 7063408, + "step": 3602 + }, + { + "epoch": 0.4775347912524851, + "grad_norm": 0.16362851858139038, + "learning_rate": 4.695266629463753e-06, + "loss": 0.001, + "num_input_tokens_seen": 7065576, + "step": 3603 + }, + { + "epoch": 0.4776673293571902, + "grad_norm": 12.111469268798828, + "learning_rate": 4.695100515265692e-06, + "loss": 0.5356, + "num_input_tokens_seen": 7068104, + "step": 3604 + }, + { + "epoch": 0.4777998674618953, + "grad_norm": 0.3510127067565918, + "learning_rate": 4.694934358744579e-06, + "loss": 0.002, + "num_input_tokens_seen": 7070400, + "step": 3605 + }, + { + "epoch": 0.4779324055666004, + "grad_norm": 10.041983604431152, + "learning_rate": 4.694768159903618e-06, + "loss": 0.2129, + "num_input_tokens_seen": 7071744, + "step": 3606 + }, + { + "epoch": 0.4780649436713055, + "grad_norm": 15.56740665435791, + "learning_rate": 4.694601918746013e-06, + "loss": 0.2034, + "num_input_tokens_seen": 7073392, + "step": 3607 + }, + { + "epoch": 0.4781974817760106, + "grad_norm": 3.900268077850342, + "learning_rate": 4.694435635274967e-06, + "loss": 0.0868, + "num_input_tokens_seen": 7075048, + "step": 3608 + }, + { + "epoch": 0.4783300198807157, + "grad_norm": 15.370348930358887, + "learning_rate": 4.69426930949369e-06, + "loss": 0.3446, + "num_input_tokens_seen": 7076640, + "step": 3609 + }, + { + "epoch": 0.4784625579854208, + "grad_norm": 5.116485595703125, + "learning_rate": 4.694102941405387e-06, + "loss": 0.1836, + "num_input_tokens_seen": 7079040, + "step": 3610 + }, + { + "epoch": 0.4785950960901259, + "grad_norm": 0.5230085253715515, + "learning_rate": 4.693936531013265e-06, + "loss": 0.0032, + "num_input_tokens_seen": 7080864, + "step": 3611 + }, + { + "epoch": 0.478727634194831, + "grad_norm": 11.05142593383789, + "learning_rate": 4.693770078320533e-06, + "loss": 0.1962, + "num_input_tokens_seen": 7082592, + "step": 3612 + }, + { + "epoch": 0.4788601722995361, + "grad_norm": 0.23189933598041534, + "learning_rate": 4.6936035833304015e-06, + "loss": 0.0015, + "num_input_tokens_seen": 7084104, + "step": 3613 + }, + { + "epoch": 0.47899271040424124, + "grad_norm": 0.8146563172340393, + "learning_rate": 4.693437046046078e-06, + "loss": 0.005, + "num_input_tokens_seen": 7086000, + "step": 3614 + }, + { + "epoch": 0.47912524850894633, + "grad_norm": 0.21613956987857819, + "learning_rate": 4.6932704664707765e-06, + "loss": 0.0014, + "num_input_tokens_seen": 7087240, + "step": 3615 + }, + { + "epoch": 0.4792577866136514, + "grad_norm": 0.2218838632106781, + "learning_rate": 4.693103844607707e-06, + "loss": 0.0014, + "num_input_tokens_seen": 7088864, + "step": 3616 + }, + { + "epoch": 0.4793903247183565, + "grad_norm": 9.091875076293945, + "learning_rate": 4.692937180460082e-06, + "loss": 0.2301, + "num_input_tokens_seen": 7091320, + "step": 3617 + }, + { + "epoch": 0.4795228628230616, + "grad_norm": 4.170630931854248, + "learning_rate": 4.692770474031116e-06, + "loss": 0.0962, + "num_input_tokens_seen": 7092744, + "step": 3618 + }, + { + "epoch": 0.4796554009277667, + "grad_norm": 14.123799324035645, + "learning_rate": 4.692603725324022e-06, + "loss": 0.5683, + "num_input_tokens_seen": 7095448, + "step": 3619 + }, + { + "epoch": 0.47978793903247186, + "grad_norm": 6.59906005859375, + "learning_rate": 4.692436934342016e-06, + "loss": 0.2922, + "num_input_tokens_seen": 7097192, + "step": 3620 + }, + { + "epoch": 0.47992047713717695, + "grad_norm": 0.40969640016555786, + "learning_rate": 4.692270101088313e-06, + "loss": 0.0023, + "num_input_tokens_seen": 7098976, + "step": 3621 + }, + { + "epoch": 0.48005301524188204, + "grad_norm": 2.175377607345581, + "learning_rate": 4.69210322556613e-06, + "loss": 0.0131, + "num_input_tokens_seen": 7101016, + "step": 3622 + }, + { + "epoch": 0.48018555334658714, + "grad_norm": 8.167495727539062, + "learning_rate": 4.6919363077786855e-06, + "loss": 0.2313, + "num_input_tokens_seen": 7102424, + "step": 3623 + }, + { + "epoch": 0.48031809145129223, + "grad_norm": 14.146867752075195, + "learning_rate": 4.691769347729196e-06, + "loss": 0.5304, + "num_input_tokens_seen": 7105504, + "step": 3624 + }, + { + "epoch": 0.4804506295559973, + "grad_norm": 6.243712425231934, + "learning_rate": 4.691602345420882e-06, + "loss": 0.1511, + "num_input_tokens_seen": 7107416, + "step": 3625 + }, + { + "epoch": 0.48058316766070247, + "grad_norm": 0.191379114985466, + "learning_rate": 4.691435300856962e-06, + "loss": 0.0011, + "num_input_tokens_seen": 7109104, + "step": 3626 + }, + { + "epoch": 0.48071570576540756, + "grad_norm": 10.162073135375977, + "learning_rate": 4.691268214040658e-06, + "loss": 0.3553, + "num_input_tokens_seen": 7111032, + "step": 3627 + }, + { + "epoch": 0.48084824387011266, + "grad_norm": 4.809532165527344, + "learning_rate": 4.6911010849751916e-06, + "loss": 0.0741, + "num_input_tokens_seen": 7113368, + "step": 3628 + }, + { + "epoch": 0.48098078197481775, + "grad_norm": 4.5478620529174805, + "learning_rate": 4.690933913663784e-06, + "loss": 0.048, + "num_input_tokens_seen": 7114856, + "step": 3629 + }, + { + "epoch": 0.48111332007952284, + "grad_norm": 0.18467599153518677, + "learning_rate": 4.690766700109659e-06, + "loss": 0.0011, + "num_input_tokens_seen": 7116696, + "step": 3630 + }, + { + "epoch": 0.481245858184228, + "grad_norm": 17.60898208618164, + "learning_rate": 4.690599444316042e-06, + "loss": 0.449, + "num_input_tokens_seen": 7118576, + "step": 3631 + }, + { + "epoch": 0.4813783962889331, + "grad_norm": 7.66585111618042, + "learning_rate": 4.690432146286155e-06, + "loss": 0.3184, + "num_input_tokens_seen": 7120536, + "step": 3632 + }, + { + "epoch": 0.4815109343936382, + "grad_norm": 18.95626449584961, + "learning_rate": 4.690264806023226e-06, + "loss": 0.4574, + "num_input_tokens_seen": 7121872, + "step": 3633 + }, + { + "epoch": 0.4816434724983433, + "grad_norm": 0.11527787148952484, + "learning_rate": 4.6900974235304805e-06, + "loss": 0.0007, + "num_input_tokens_seen": 7123720, + "step": 3634 + }, + { + "epoch": 0.48177601060304837, + "grad_norm": 6.762696743011475, + "learning_rate": 4.689929998811145e-06, + "loss": 0.1712, + "num_input_tokens_seen": 7125784, + "step": 3635 + }, + { + "epoch": 0.48190854870775346, + "grad_norm": 0.4892951548099518, + "learning_rate": 4.6897625318684494e-06, + "loss": 0.0025, + "num_input_tokens_seen": 7127448, + "step": 3636 + }, + { + "epoch": 0.4820410868124586, + "grad_norm": 2.992539405822754, + "learning_rate": 4.689595022705621e-06, + "loss": 0.0942, + "num_input_tokens_seen": 7128640, + "step": 3637 + }, + { + "epoch": 0.4821736249171637, + "grad_norm": 0.06269744038581848, + "learning_rate": 4.689427471325891e-06, + "loss": 0.0004, + "num_input_tokens_seen": 7130616, + "step": 3638 + }, + { + "epoch": 0.4823061630218688, + "grad_norm": 16.022083282470703, + "learning_rate": 4.689259877732487e-06, + "loss": 0.4353, + "num_input_tokens_seen": 7132576, + "step": 3639 + }, + { + "epoch": 0.4824387011265739, + "grad_norm": 0.09941118955612183, + "learning_rate": 4.6890922419286436e-06, + "loss": 0.0006, + "num_input_tokens_seen": 7134048, + "step": 3640 + }, + { + "epoch": 0.482571239231279, + "grad_norm": 9.228554725646973, + "learning_rate": 4.688924563917592e-06, + "loss": 0.1197, + "num_input_tokens_seen": 7135704, + "step": 3641 + }, + { + "epoch": 0.4827037773359841, + "grad_norm": 0.06086016818881035, + "learning_rate": 4.688756843702563e-06, + "loss": 0.0004, + "num_input_tokens_seen": 7137080, + "step": 3642 + }, + { + "epoch": 0.4828363154406892, + "grad_norm": 9.835884094238281, + "learning_rate": 4.688589081286794e-06, + "loss": 0.2404, + "num_input_tokens_seen": 7138160, + "step": 3643 + }, + { + "epoch": 0.4829688535453943, + "grad_norm": 10.266068458557129, + "learning_rate": 4.688421276673517e-06, + "loss": 0.3296, + "num_input_tokens_seen": 7140376, + "step": 3644 + }, + { + "epoch": 0.4831013916500994, + "grad_norm": 2.3182859420776367, + "learning_rate": 4.6882534298659675e-06, + "loss": 0.015, + "num_input_tokens_seen": 7141512, + "step": 3645 + }, + { + "epoch": 0.4832339297548045, + "grad_norm": 10.558012008666992, + "learning_rate": 4.6880855408673835e-06, + "loss": 0.2838, + "num_input_tokens_seen": 7143248, + "step": 3646 + }, + { + "epoch": 0.4833664678595096, + "grad_norm": 7.685230731964111, + "learning_rate": 4.6879176096809995e-06, + "loss": 0.2251, + "num_input_tokens_seen": 7144664, + "step": 3647 + }, + { + "epoch": 0.4834990059642147, + "grad_norm": 19.196279525756836, + "learning_rate": 4.687749636310055e-06, + "loss": 0.6622, + "num_input_tokens_seen": 7147528, + "step": 3648 + }, + { + "epoch": 0.48363154406891984, + "grad_norm": 15.792891502380371, + "learning_rate": 4.687581620757788e-06, + "loss": 0.4496, + "num_input_tokens_seen": 7149704, + "step": 3649 + }, + { + "epoch": 0.48376408217362493, + "grad_norm": 10.962190628051758, + "learning_rate": 4.687413563027439e-06, + "loss": 0.1057, + "num_input_tokens_seen": 7151368, + "step": 3650 + }, + { + "epoch": 0.48389662027833, + "grad_norm": 0.7263032793998718, + "learning_rate": 4.687245463122247e-06, + "loss": 0.0045, + "num_input_tokens_seen": 7153376, + "step": 3651 + }, + { + "epoch": 0.4840291583830351, + "grad_norm": 6.699631690979004, + "learning_rate": 4.687077321045454e-06, + "loss": 0.1838, + "num_input_tokens_seen": 7156232, + "step": 3652 + }, + { + "epoch": 0.4841616964877402, + "grad_norm": 5.326754093170166, + "learning_rate": 4.686909136800301e-06, + "loss": 0.1323, + "num_input_tokens_seen": 7158464, + "step": 3653 + }, + { + "epoch": 0.4842942345924453, + "grad_norm": 13.424384117126465, + "learning_rate": 4.6867409103900315e-06, + "loss": 0.1772, + "num_input_tokens_seen": 7160520, + "step": 3654 + }, + { + "epoch": 0.48442677269715045, + "grad_norm": 18.61404800415039, + "learning_rate": 4.6865726418178886e-06, + "loss": 0.8723, + "num_input_tokens_seen": 7162840, + "step": 3655 + }, + { + "epoch": 0.48455931080185555, + "grad_norm": 12.540332794189453, + "learning_rate": 4.686404331087117e-06, + "loss": 0.4924, + "num_input_tokens_seen": 7165080, + "step": 3656 + }, + { + "epoch": 0.48469184890656064, + "grad_norm": 8.716713905334473, + "learning_rate": 4.686235978200961e-06, + "loss": 0.1661, + "num_input_tokens_seen": 7167288, + "step": 3657 + }, + { + "epoch": 0.48482438701126573, + "grad_norm": 7.3954362869262695, + "learning_rate": 4.686067583162667e-06, + "loss": 0.2172, + "num_input_tokens_seen": 7169136, + "step": 3658 + }, + { + "epoch": 0.4849569251159708, + "grad_norm": 6.320158958435059, + "learning_rate": 4.685899145975482e-06, + "loss": 0.1723, + "num_input_tokens_seen": 7170960, + "step": 3659 + }, + { + "epoch": 0.4850894632206759, + "grad_norm": 16.36895179748535, + "learning_rate": 4.685730666642654e-06, + "loss": 0.5165, + "num_input_tokens_seen": 7172496, + "step": 3660 + }, + { + "epoch": 0.48522200132538107, + "grad_norm": 2.5520060062408447, + "learning_rate": 4.68556214516743e-06, + "loss": 0.0302, + "num_input_tokens_seen": 7174440, + "step": 3661 + }, + { + "epoch": 0.48535453943008616, + "grad_norm": 9.569836616516113, + "learning_rate": 4.685393581553061e-06, + "loss": 0.1684, + "num_input_tokens_seen": 7176104, + "step": 3662 + }, + { + "epoch": 0.48548707753479126, + "grad_norm": 9.7288236618042, + "learning_rate": 4.685224975802797e-06, + "loss": 0.3023, + "num_input_tokens_seen": 7178240, + "step": 3663 + }, + { + "epoch": 0.48561961563949635, + "grad_norm": 2.0769824981689453, + "learning_rate": 4.685056327919886e-06, + "loss": 0.0359, + "num_input_tokens_seen": 7180384, + "step": 3664 + }, + { + "epoch": 0.48575215374420144, + "grad_norm": 1.1037579774856567, + "learning_rate": 4.684887637907583e-06, + "loss": 0.0073, + "num_input_tokens_seen": 7181400, + "step": 3665 + }, + { + "epoch": 0.48588469184890654, + "grad_norm": 6.982590675354004, + "learning_rate": 4.684718905769138e-06, + "loss": 0.1672, + "num_input_tokens_seen": 7184032, + "step": 3666 + }, + { + "epoch": 0.4860172299536117, + "grad_norm": 1.2424237728118896, + "learning_rate": 4.684550131507806e-06, + "loss": 0.0082, + "num_input_tokens_seen": 7185672, + "step": 3667 + }, + { + "epoch": 0.4861497680583168, + "grad_norm": 13.110060691833496, + "learning_rate": 4.68438131512684e-06, + "loss": 0.2624, + "num_input_tokens_seen": 7187144, + "step": 3668 + }, + { + "epoch": 0.48628230616302187, + "grad_norm": 2.2205982208251953, + "learning_rate": 4.684212456629496e-06, + "loss": 0.0247, + "num_input_tokens_seen": 7189056, + "step": 3669 + }, + { + "epoch": 0.48641484426772696, + "grad_norm": 6.307052135467529, + "learning_rate": 4.684043556019028e-06, + "loss": 0.1582, + "num_input_tokens_seen": 7190768, + "step": 3670 + }, + { + "epoch": 0.48654738237243206, + "grad_norm": 12.494579315185547, + "learning_rate": 4.683874613298695e-06, + "loss": 0.2223, + "num_input_tokens_seen": 7192648, + "step": 3671 + }, + { + "epoch": 0.48667992047713715, + "grad_norm": 12.971120834350586, + "learning_rate": 4.683705628471752e-06, + "loss": 0.4699, + "num_input_tokens_seen": 7194312, + "step": 3672 + }, + { + "epoch": 0.4868124585818423, + "grad_norm": 11.755553245544434, + "learning_rate": 4.683536601541458e-06, + "loss": 0.322, + "num_input_tokens_seen": 7196048, + "step": 3673 + }, + { + "epoch": 0.4869449966865474, + "grad_norm": 12.68346881866455, + "learning_rate": 4.683367532511072e-06, + "loss": 0.634, + "num_input_tokens_seen": 7197824, + "step": 3674 + }, + { + "epoch": 0.4870775347912525, + "grad_norm": 21.495649337768555, + "learning_rate": 4.683198421383853e-06, + "loss": 0.7325, + "num_input_tokens_seen": 7199904, + "step": 3675 + }, + { + "epoch": 0.4872100728959576, + "grad_norm": 10.929720878601074, + "learning_rate": 4.683029268163064e-06, + "loss": 0.2668, + "num_input_tokens_seen": 7202168, + "step": 3676 + }, + { + "epoch": 0.4873426110006627, + "grad_norm": 13.11765193939209, + "learning_rate": 4.682860072851963e-06, + "loss": 0.356, + "num_input_tokens_seen": 7204336, + "step": 3677 + }, + { + "epoch": 0.48747514910536777, + "grad_norm": 1.0770907402038574, + "learning_rate": 4.682690835453815e-06, + "loss": 0.0067, + "num_input_tokens_seen": 7206120, + "step": 3678 + }, + { + "epoch": 0.4876076872100729, + "grad_norm": 7.749923229217529, + "learning_rate": 4.682521555971881e-06, + "loss": 0.1145, + "num_input_tokens_seen": 7208184, + "step": 3679 + }, + { + "epoch": 0.487740225314778, + "grad_norm": 17.261608123779297, + "learning_rate": 4.6823522344094265e-06, + "loss": 0.2908, + "num_input_tokens_seen": 7210440, + "step": 3680 + }, + { + "epoch": 0.4878727634194831, + "grad_norm": 0.5793535113334656, + "learning_rate": 4.6821828707697155e-06, + "loss": 0.0036, + "num_input_tokens_seen": 7212328, + "step": 3681 + }, + { + "epoch": 0.4880053015241882, + "grad_norm": 12.21238899230957, + "learning_rate": 4.682013465056014e-06, + "loss": 0.4645, + "num_input_tokens_seen": 7214736, + "step": 3682 + }, + { + "epoch": 0.4881378396288933, + "grad_norm": 7.762907981872559, + "learning_rate": 4.681844017271586e-06, + "loss": 0.1034, + "num_input_tokens_seen": 7216504, + "step": 3683 + }, + { + "epoch": 0.4882703777335984, + "grad_norm": 8.921805381774902, + "learning_rate": 4.681674527419701e-06, + "loss": 0.2196, + "num_input_tokens_seen": 7218408, + "step": 3684 + }, + { + "epoch": 0.48840291583830353, + "grad_norm": 10.31627082824707, + "learning_rate": 4.681504995503626e-06, + "loss": 0.2883, + "num_input_tokens_seen": 7219936, + "step": 3685 + }, + { + "epoch": 0.4885354539430086, + "grad_norm": 8.42447566986084, + "learning_rate": 4.681335421526629e-06, + "loss": 0.0715, + "num_input_tokens_seen": 7221728, + "step": 3686 + }, + { + "epoch": 0.4886679920477137, + "grad_norm": 0.3916245698928833, + "learning_rate": 4.681165805491982e-06, + "loss": 0.0025, + "num_input_tokens_seen": 7223328, + "step": 3687 + }, + { + "epoch": 0.4888005301524188, + "grad_norm": 0.4920238256454468, + "learning_rate": 4.680996147402952e-06, + "loss": 0.0029, + "num_input_tokens_seen": 7225000, + "step": 3688 + }, + { + "epoch": 0.4889330682571239, + "grad_norm": 0.1166110411286354, + "learning_rate": 4.680826447262812e-06, + "loss": 0.0007, + "num_input_tokens_seen": 7226336, + "step": 3689 + }, + { + "epoch": 0.48906560636182905, + "grad_norm": 3.457275629043579, + "learning_rate": 4.680656705074834e-06, + "loss": 0.0543, + "num_input_tokens_seen": 7227960, + "step": 3690 + }, + { + "epoch": 0.48919814446653415, + "grad_norm": 2.1174094676971436, + "learning_rate": 4.680486920842291e-06, + "loss": 0.0171, + "num_input_tokens_seen": 7229608, + "step": 3691 + }, + { + "epoch": 0.48933068257123924, + "grad_norm": 0.5574467778205872, + "learning_rate": 4.680317094568454e-06, + "loss": 0.0033, + "num_input_tokens_seen": 7231480, + "step": 3692 + }, + { + "epoch": 0.48946322067594433, + "grad_norm": 3.1361982822418213, + "learning_rate": 4.6801472262566005e-06, + "loss": 0.0794, + "num_input_tokens_seen": 7232968, + "step": 3693 + }, + { + "epoch": 0.4895957587806494, + "grad_norm": 17.396100997924805, + "learning_rate": 4.679977315910005e-06, + "loss": 0.392, + "num_input_tokens_seen": 7234304, + "step": 3694 + }, + { + "epoch": 0.4897282968853545, + "grad_norm": 9.992633819580078, + "learning_rate": 4.679807363531942e-06, + "loss": 0.3459, + "num_input_tokens_seen": 7236272, + "step": 3695 + }, + { + "epoch": 0.48986083499005967, + "grad_norm": 0.1709693968296051, + "learning_rate": 4.67963736912569e-06, + "loss": 0.001, + "num_input_tokens_seen": 7238008, + "step": 3696 + }, + { + "epoch": 0.48999337309476476, + "grad_norm": 5.947240829467773, + "learning_rate": 4.679467332694525e-06, + "loss": 0.1294, + "num_input_tokens_seen": 7240952, + "step": 3697 + }, + { + "epoch": 0.49012591119946985, + "grad_norm": 8.366475105285645, + "learning_rate": 4.679297254241727e-06, + "loss": 0.1231, + "num_input_tokens_seen": 7242744, + "step": 3698 + }, + { + "epoch": 0.49025844930417495, + "grad_norm": 0.06506709009408951, + "learning_rate": 4.679127133770574e-06, + "loss": 0.0004, + "num_input_tokens_seen": 7244048, + "step": 3699 + }, + { + "epoch": 0.49039098740888004, + "grad_norm": 8.427557945251465, + "learning_rate": 4.678956971284347e-06, + "loss": 0.1014, + "num_input_tokens_seen": 7247080, + "step": 3700 + }, + { + "epoch": 0.49052352551358513, + "grad_norm": 2.152414083480835, + "learning_rate": 4.678786766786327e-06, + "loss": 0.028, + "num_input_tokens_seen": 7248792, + "step": 3701 + }, + { + "epoch": 0.4906560636182903, + "grad_norm": 10.751141548156738, + "learning_rate": 4.678616520279794e-06, + "loss": 0.0706, + "num_input_tokens_seen": 7250360, + "step": 3702 + }, + { + "epoch": 0.4907886017229954, + "grad_norm": 5.024112701416016, + "learning_rate": 4.6784462317680315e-06, + "loss": 0.0787, + "num_input_tokens_seen": 7253496, + "step": 3703 + }, + { + "epoch": 0.49092113982770047, + "grad_norm": 2.8715667724609375, + "learning_rate": 4.678275901254324e-06, + "loss": 0.0717, + "num_input_tokens_seen": 7254664, + "step": 3704 + }, + { + "epoch": 0.49105367793240556, + "grad_norm": 8.858973503112793, + "learning_rate": 4.6781055287419525e-06, + "loss": 0.1985, + "num_input_tokens_seen": 7255896, + "step": 3705 + }, + { + "epoch": 0.49118621603711066, + "grad_norm": 6.407101631164551, + "learning_rate": 4.6779351142342044e-06, + "loss": 0.2617, + "num_input_tokens_seen": 7257672, + "step": 3706 + }, + { + "epoch": 0.49131875414181575, + "grad_norm": 0.18223018944263458, + "learning_rate": 4.677764657734366e-06, + "loss": 0.0011, + "num_input_tokens_seen": 7259376, + "step": 3707 + }, + { + "epoch": 0.4914512922465209, + "grad_norm": 0.09545232355594635, + "learning_rate": 4.677594159245722e-06, + "loss": 0.0006, + "num_input_tokens_seen": 7262984, + "step": 3708 + }, + { + "epoch": 0.491583830351226, + "grad_norm": 7.851983070373535, + "learning_rate": 4.67742361877156e-06, + "loss": 0.0725, + "num_input_tokens_seen": 7264488, + "step": 3709 + }, + { + "epoch": 0.4917163684559311, + "grad_norm": 0.1380966156721115, + "learning_rate": 4.677253036315169e-06, + "loss": 0.0009, + "num_input_tokens_seen": 7265952, + "step": 3710 + }, + { + "epoch": 0.4918489065606362, + "grad_norm": 4.944606304168701, + "learning_rate": 4.677082411879837e-06, + "loss": 0.1532, + "num_input_tokens_seen": 7268056, + "step": 3711 + }, + { + "epoch": 0.49198144466534127, + "grad_norm": 9.678457260131836, + "learning_rate": 4.6769117454688545e-06, + "loss": 0.4166, + "num_input_tokens_seen": 7270344, + "step": 3712 + }, + { + "epoch": 0.49211398277004637, + "grad_norm": 11.303293228149414, + "learning_rate": 4.6767410370855115e-06, + "loss": 0.344, + "num_input_tokens_seen": 7271936, + "step": 3713 + }, + { + "epoch": 0.4922465208747515, + "grad_norm": 1.7822917699813843, + "learning_rate": 4.6765702867331e-06, + "loss": 0.0101, + "num_input_tokens_seen": 7274352, + "step": 3714 + }, + { + "epoch": 0.4923790589794566, + "grad_norm": 0.12557177245616913, + "learning_rate": 4.676399494414911e-06, + "loss": 0.0008, + "num_input_tokens_seen": 7275560, + "step": 3715 + }, + { + "epoch": 0.4925115970841617, + "grad_norm": 1.5916081666946411, + "learning_rate": 4.67622866013424e-06, + "loss": 0.0084, + "num_input_tokens_seen": 7277688, + "step": 3716 + }, + { + "epoch": 0.4926441351888668, + "grad_norm": 12.983985900878906, + "learning_rate": 4.676057783894378e-06, + "loss": 0.4425, + "num_input_tokens_seen": 7279928, + "step": 3717 + }, + { + "epoch": 0.4927766732935719, + "grad_norm": 13.755720138549805, + "learning_rate": 4.6758868656986214e-06, + "loss": 0.1449, + "num_input_tokens_seen": 7281128, + "step": 3718 + }, + { + "epoch": 0.492909211398277, + "grad_norm": 14.462699890136719, + "learning_rate": 4.6757159055502635e-06, + "loss": 0.2867, + "num_input_tokens_seen": 7282992, + "step": 3719 + }, + { + "epoch": 0.49304174950298213, + "grad_norm": 0.09824782609939575, + "learning_rate": 4.675544903452603e-06, + "loss": 0.0006, + "num_input_tokens_seen": 7284320, + "step": 3720 + }, + { + "epoch": 0.4931742876076872, + "grad_norm": 0.17122478783130646, + "learning_rate": 4.675373859408936e-06, + "loss": 0.0011, + "num_input_tokens_seen": 7286048, + "step": 3721 + }, + { + "epoch": 0.4933068257123923, + "grad_norm": 0.25496169924736023, + "learning_rate": 4.6752027734225605e-06, + "loss": 0.0015, + "num_input_tokens_seen": 7287432, + "step": 3722 + }, + { + "epoch": 0.4934393638170974, + "grad_norm": 0.10535811632871628, + "learning_rate": 4.675031645496775e-06, + "loss": 0.0007, + "num_input_tokens_seen": 7288840, + "step": 3723 + }, + { + "epoch": 0.4935719019218025, + "grad_norm": 0.15338915586471558, + "learning_rate": 4.674860475634878e-06, + "loss": 0.001, + "num_input_tokens_seen": 7290376, + "step": 3724 + }, + { + "epoch": 0.4937044400265076, + "grad_norm": 0.8637729287147522, + "learning_rate": 4.674689263840171e-06, + "loss": 0.0047, + "num_input_tokens_seen": 7291904, + "step": 3725 + }, + { + "epoch": 0.49383697813121274, + "grad_norm": 8.982322692871094, + "learning_rate": 4.674518010115955e-06, + "loss": 0.2171, + "num_input_tokens_seen": 7293712, + "step": 3726 + }, + { + "epoch": 0.49396951623591784, + "grad_norm": 7.300834655761719, + "learning_rate": 4.674346714465532e-06, + "loss": 0.2726, + "num_input_tokens_seen": 7295584, + "step": 3727 + }, + { + "epoch": 0.49410205434062293, + "grad_norm": 4.2447509765625, + "learning_rate": 4.674175376892204e-06, + "loss": 0.159, + "num_input_tokens_seen": 7297392, + "step": 3728 + }, + { + "epoch": 0.494234592445328, + "grad_norm": 7.436861991882324, + "learning_rate": 4.674003997399275e-06, + "loss": 0.0729, + "num_input_tokens_seen": 7299048, + "step": 3729 + }, + { + "epoch": 0.4943671305500331, + "grad_norm": 0.9530767798423767, + "learning_rate": 4.673832575990048e-06, + "loss": 0.0105, + "num_input_tokens_seen": 7300360, + "step": 3730 + }, + { + "epoch": 0.4944996686547382, + "grad_norm": 0.05369829013943672, + "learning_rate": 4.673661112667831e-06, + "loss": 0.0003, + "num_input_tokens_seen": 7302296, + "step": 3731 + }, + { + "epoch": 0.49463220675944336, + "grad_norm": 7.32320499420166, + "learning_rate": 4.673489607435927e-06, + "loss": 0.2798, + "num_input_tokens_seen": 7304504, + "step": 3732 + }, + { + "epoch": 0.49476474486414845, + "grad_norm": 15.532498359680176, + "learning_rate": 4.6733180602976445e-06, + "loss": 0.6279, + "num_input_tokens_seen": 7307064, + "step": 3733 + }, + { + "epoch": 0.49489728296885355, + "grad_norm": 0.08652970939874649, + "learning_rate": 4.67314647125629e-06, + "loss": 0.0005, + "num_input_tokens_seen": 7309048, + "step": 3734 + }, + { + "epoch": 0.49502982107355864, + "grad_norm": 7.71146297454834, + "learning_rate": 4.6729748403151735e-06, + "loss": 0.2324, + "num_input_tokens_seen": 7311168, + "step": 3735 + }, + { + "epoch": 0.49516235917826373, + "grad_norm": 0.08946602046489716, + "learning_rate": 4.672803167477602e-06, + "loss": 0.0005, + "num_input_tokens_seen": 7313592, + "step": 3736 + }, + { + "epoch": 0.4952948972829688, + "grad_norm": 12.425528526306152, + "learning_rate": 4.672631452746887e-06, + "loss": 0.3299, + "num_input_tokens_seen": 7315648, + "step": 3737 + }, + { + "epoch": 0.495427435387674, + "grad_norm": 4.399319648742676, + "learning_rate": 4.6724596961263376e-06, + "loss": 0.1433, + "num_input_tokens_seen": 7317776, + "step": 3738 + }, + { + "epoch": 0.49555997349237907, + "grad_norm": 15.491249084472656, + "learning_rate": 4.672287897619267e-06, + "loss": 0.4529, + "num_input_tokens_seen": 7320488, + "step": 3739 + }, + { + "epoch": 0.49569251159708416, + "grad_norm": 3.01819109916687, + "learning_rate": 4.672116057228988e-06, + "loss": 0.0315, + "num_input_tokens_seen": 7322640, + "step": 3740 + }, + { + "epoch": 0.49582504970178926, + "grad_norm": 19.011850357055664, + "learning_rate": 4.671944174958812e-06, + "loss": 1.0043, + "num_input_tokens_seen": 7325680, + "step": 3741 + }, + { + "epoch": 0.49595758780649435, + "grad_norm": 8.788366317749023, + "learning_rate": 4.671772250812055e-06, + "loss": 0.2475, + "num_input_tokens_seen": 7327896, + "step": 3742 + }, + { + "epoch": 0.49609012591119944, + "grad_norm": 10.707992553710938, + "learning_rate": 4.671600284792031e-06, + "loss": 0.3097, + "num_input_tokens_seen": 7330056, + "step": 3743 + }, + { + "epoch": 0.4962226640159046, + "grad_norm": 23.457717895507812, + "learning_rate": 4.671428276902054e-06, + "loss": 0.6047, + "num_input_tokens_seen": 7332120, + "step": 3744 + }, + { + "epoch": 0.4963552021206097, + "grad_norm": 15.116650581359863, + "learning_rate": 4.671256227145443e-06, + "loss": 0.4513, + "num_input_tokens_seen": 7333624, + "step": 3745 + }, + { + "epoch": 0.4964877402253148, + "grad_norm": 7.731261253356934, + "learning_rate": 4.671084135525513e-06, + "loss": 0.2663, + "num_input_tokens_seen": 7335616, + "step": 3746 + }, + { + "epoch": 0.49662027833001987, + "grad_norm": 9.077445030212402, + "learning_rate": 4.6709120020455845e-06, + "loss": 0.1923, + "num_input_tokens_seen": 7337552, + "step": 3747 + }, + { + "epoch": 0.49675281643472496, + "grad_norm": 4.799147129058838, + "learning_rate": 4.670739826708974e-06, + "loss": 0.0242, + "num_input_tokens_seen": 7339088, + "step": 3748 + }, + { + "epoch": 0.4968853545394301, + "grad_norm": 6.35343074798584, + "learning_rate": 4.670567609519002e-06, + "loss": 0.0519, + "num_input_tokens_seen": 7340704, + "step": 3749 + }, + { + "epoch": 0.4970178926441352, + "grad_norm": 18.491138458251953, + "learning_rate": 4.670395350478989e-06, + "loss": 0.6177, + "num_input_tokens_seen": 7342768, + "step": 3750 + }, + { + "epoch": 0.4971504307488403, + "grad_norm": 7.562197685241699, + "learning_rate": 4.670223049592257e-06, + "loss": 0.3762, + "num_input_tokens_seen": 7344696, + "step": 3751 + }, + { + "epoch": 0.4972829688535454, + "grad_norm": 0.670927107334137, + "learning_rate": 4.670050706862127e-06, + "loss": 0.0042, + "num_input_tokens_seen": 7347024, + "step": 3752 + }, + { + "epoch": 0.4974155069582505, + "grad_norm": 10.624455451965332, + "learning_rate": 4.669878322291923e-06, + "loss": 0.2179, + "num_input_tokens_seen": 7349416, + "step": 3753 + }, + { + "epoch": 0.4975480450629556, + "grad_norm": 14.380655288696289, + "learning_rate": 4.669705895884968e-06, + "loss": 0.3537, + "num_input_tokens_seen": 7351880, + "step": 3754 + }, + { + "epoch": 0.49768058316766073, + "grad_norm": 7.3378682136535645, + "learning_rate": 4.6695334276445855e-06, + "loss": 0.1797, + "num_input_tokens_seen": 7353928, + "step": 3755 + }, + { + "epoch": 0.4978131212723658, + "grad_norm": 9.116175651550293, + "learning_rate": 4.669360917574103e-06, + "loss": 0.2611, + "num_input_tokens_seen": 7355776, + "step": 3756 + }, + { + "epoch": 0.4979456593770709, + "grad_norm": 10.5614652633667, + "learning_rate": 4.669188365676845e-06, + "loss": 0.1879, + "num_input_tokens_seen": 7357616, + "step": 3757 + }, + { + "epoch": 0.498078197481776, + "grad_norm": 6.729059219360352, + "learning_rate": 4.669015771956138e-06, + "loss": 0.1513, + "num_input_tokens_seen": 7359032, + "step": 3758 + }, + { + "epoch": 0.4982107355864811, + "grad_norm": 12.209918022155762, + "learning_rate": 4.668843136415311e-06, + "loss": 0.3582, + "num_input_tokens_seen": 7360840, + "step": 3759 + }, + { + "epoch": 0.4983432736911862, + "grad_norm": 9.738348007202148, + "learning_rate": 4.668670459057693e-06, + "loss": 0.2889, + "num_input_tokens_seen": 7362184, + "step": 3760 + }, + { + "epoch": 0.49847581179589134, + "grad_norm": 13.182435989379883, + "learning_rate": 4.668497739886611e-06, + "loss": 0.3929, + "num_input_tokens_seen": 7363624, + "step": 3761 + }, + { + "epoch": 0.49860834990059644, + "grad_norm": 11.195540428161621, + "learning_rate": 4.668324978905398e-06, + "loss": 0.2136, + "num_input_tokens_seen": 7366344, + "step": 3762 + }, + { + "epoch": 0.49874088800530153, + "grad_norm": 7.256974220275879, + "learning_rate": 4.668152176117383e-06, + "loss": 0.1638, + "num_input_tokens_seen": 7367728, + "step": 3763 + }, + { + "epoch": 0.4988734261100066, + "grad_norm": 14.844018936157227, + "learning_rate": 4.667979331525899e-06, + "loss": 0.476, + "num_input_tokens_seen": 7369720, + "step": 3764 + }, + { + "epoch": 0.4990059642147117, + "grad_norm": 11.778115272521973, + "learning_rate": 4.667806445134277e-06, + "loss": 0.2855, + "num_input_tokens_seen": 7371064, + "step": 3765 + }, + { + "epoch": 0.4991385023194168, + "grad_norm": 0.8699835538864136, + "learning_rate": 4.667633516945851e-06, + "loss": 0.0058, + "num_input_tokens_seen": 7372592, + "step": 3766 + }, + { + "epoch": 0.49927104042412196, + "grad_norm": 11.455622673034668, + "learning_rate": 4.667460546963956e-06, + "loss": 0.2738, + "num_input_tokens_seen": 7374368, + "step": 3767 + }, + { + "epoch": 0.49940357852882705, + "grad_norm": 6.262709140777588, + "learning_rate": 4.667287535191927e-06, + "loss": 0.1314, + "num_input_tokens_seen": 7377296, + "step": 3768 + }, + { + "epoch": 0.49953611663353215, + "grad_norm": 21.019472122192383, + "learning_rate": 4.667114481633098e-06, + "loss": 0.4648, + "num_input_tokens_seen": 7380024, + "step": 3769 + }, + { + "epoch": 0.49966865473823724, + "grad_norm": 7.639338970184326, + "learning_rate": 4.666941386290808e-06, + "loss": 0.0945, + "num_input_tokens_seen": 7381584, + "step": 3770 + }, + { + "epoch": 0.49980119284294233, + "grad_norm": 8.748551368713379, + "learning_rate": 4.666768249168391e-06, + "loss": 0.2389, + "num_input_tokens_seen": 7384024, + "step": 3771 + }, + { + "epoch": 0.4999337309476474, + "grad_norm": 0.5426409840583801, + "learning_rate": 4.666595070269189e-06, + "loss": 0.0035, + "num_input_tokens_seen": 7385288, + "step": 3772 + }, + { + "epoch": 0.5000662690523525, + "grad_norm": 21.49462890625, + "learning_rate": 4.66642184959654e-06, + "loss": 0.2756, + "num_input_tokens_seen": 7387680, + "step": 3773 + }, + { + "epoch": 0.5001988071570577, + "grad_norm": 8.051777839660645, + "learning_rate": 4.666248587153782e-06, + "loss": 0.0853, + "num_input_tokens_seen": 7389424, + "step": 3774 + }, + { + "epoch": 0.5003313452617627, + "grad_norm": 9.731081008911133, + "learning_rate": 4.666075282944257e-06, + "loss": 0.0663, + "num_input_tokens_seen": 7391152, + "step": 3775 + }, + { + "epoch": 0.5004638833664679, + "grad_norm": 11.829185485839844, + "learning_rate": 4.665901936971306e-06, + "loss": 0.4676, + "num_input_tokens_seen": 7393520, + "step": 3776 + }, + { + "epoch": 0.500596421471173, + "grad_norm": 10.346894264221191, + "learning_rate": 4.665728549238272e-06, + "loss": 0.1606, + "num_input_tokens_seen": 7397072, + "step": 3777 + }, + { + "epoch": 0.500728959575878, + "grad_norm": 0.5274060368537903, + "learning_rate": 4.665555119748499e-06, + "loss": 0.0034, + "num_input_tokens_seen": 7399472, + "step": 3778 + }, + { + "epoch": 0.5008614976805832, + "grad_norm": 2.381251573562622, + "learning_rate": 4.665381648505327e-06, + "loss": 0.0131, + "num_input_tokens_seen": 7401672, + "step": 3779 + }, + { + "epoch": 0.5009940357852882, + "grad_norm": 10.102008819580078, + "learning_rate": 4.665208135512104e-06, + "loss": 0.3348, + "num_input_tokens_seen": 7403624, + "step": 3780 + }, + { + "epoch": 0.5011265738899934, + "grad_norm": 1.107379674911499, + "learning_rate": 4.665034580772175e-06, + "loss": 0.0067, + "num_input_tokens_seen": 7406488, + "step": 3781 + }, + { + "epoch": 0.5012591119946985, + "grad_norm": 1.9707496166229248, + "learning_rate": 4.664860984288884e-06, + "loss": 0.0118, + "num_input_tokens_seen": 7408384, + "step": 3782 + }, + { + "epoch": 0.5013916500994036, + "grad_norm": 5.604523181915283, + "learning_rate": 4.664687346065581e-06, + "loss": 0.1872, + "num_input_tokens_seen": 7410400, + "step": 3783 + }, + { + "epoch": 0.5015241882041087, + "grad_norm": 17.170034408569336, + "learning_rate": 4.664513666105612e-06, + "loss": 0.5413, + "num_input_tokens_seen": 7413112, + "step": 3784 + }, + { + "epoch": 0.5016567263088137, + "grad_norm": 3.253962755203247, + "learning_rate": 4.664339944412327e-06, + "loss": 0.0122, + "num_input_tokens_seen": 7414536, + "step": 3785 + }, + { + "epoch": 0.5017892644135189, + "grad_norm": 13.5678071975708, + "learning_rate": 4.6641661809890735e-06, + "loss": 0.8068, + "num_input_tokens_seen": 7416888, + "step": 3786 + }, + { + "epoch": 0.5019218025182239, + "grad_norm": 11.761780738830566, + "learning_rate": 4.663992375839204e-06, + "loss": 0.3547, + "num_input_tokens_seen": 7418192, + "step": 3787 + }, + { + "epoch": 0.5020543406229291, + "grad_norm": 4.3910298347473145, + "learning_rate": 4.663818528966069e-06, + "loss": 0.1387, + "num_input_tokens_seen": 7419208, + "step": 3788 + }, + { + "epoch": 0.5021868787276342, + "grad_norm": 0.035544801503419876, + "learning_rate": 4.66364464037302e-06, + "loss": 0.0002, + "num_input_tokens_seen": 7421048, + "step": 3789 + }, + { + "epoch": 0.5023194168323393, + "grad_norm": 6.48258638381958, + "learning_rate": 4.663470710063409e-06, + "loss": 0.1329, + "num_input_tokens_seen": 7422528, + "step": 3790 + }, + { + "epoch": 0.5024519549370444, + "grad_norm": 16.436424255371094, + "learning_rate": 4.66329673804059e-06, + "loss": 0.6685, + "num_input_tokens_seen": 7424744, + "step": 3791 + }, + { + "epoch": 0.5025844930417495, + "grad_norm": 0.03394150361418724, + "learning_rate": 4.663122724307919e-06, + "loss": 0.0002, + "num_input_tokens_seen": 7426112, + "step": 3792 + }, + { + "epoch": 0.5027170311464546, + "grad_norm": 8.756386756896973, + "learning_rate": 4.662948668868748e-06, + "loss": 0.2209, + "num_input_tokens_seen": 7429416, + "step": 3793 + }, + { + "epoch": 0.5028495692511598, + "grad_norm": 9.006821632385254, + "learning_rate": 4.662774571726435e-06, + "loss": 0.1185, + "num_input_tokens_seen": 7431536, + "step": 3794 + }, + { + "epoch": 0.5029821073558648, + "grad_norm": 23.868593215942383, + "learning_rate": 4.662600432884336e-06, + "loss": 0.8272, + "num_input_tokens_seen": 7433408, + "step": 3795 + }, + { + "epoch": 0.5031146454605699, + "grad_norm": 6.650596618652344, + "learning_rate": 4.66242625234581e-06, + "loss": 0.4601, + "num_input_tokens_seen": 7435184, + "step": 3796 + }, + { + "epoch": 0.503247183565275, + "grad_norm": 0.04085260629653931, + "learning_rate": 4.662252030114212e-06, + "loss": 0.0003, + "num_input_tokens_seen": 7436440, + "step": 3797 + }, + { + "epoch": 0.5033797216699801, + "grad_norm": 0.039970021694898605, + "learning_rate": 4.662077766192906e-06, + "loss": 0.0003, + "num_input_tokens_seen": 7438960, + "step": 3798 + }, + { + "epoch": 0.5035122597746852, + "grad_norm": 0.028971130028367043, + "learning_rate": 4.661903460585248e-06, + "loss": 0.0002, + "num_input_tokens_seen": 7440552, + "step": 3799 + }, + { + "epoch": 0.5036447978793903, + "grad_norm": 4.454335689544678, + "learning_rate": 4.661729113294599e-06, + "loss": 0.2197, + "num_input_tokens_seen": 7442240, + "step": 3800 + }, + { + "epoch": 0.5037773359840955, + "grad_norm": 4.77355432510376, + "learning_rate": 4.661554724324321e-06, + "loss": 0.1512, + "num_input_tokens_seen": 7445344, + "step": 3801 + }, + { + "epoch": 0.5039098740888005, + "grad_norm": 0.6460253000259399, + "learning_rate": 4.661380293677778e-06, + "loss": 0.006, + "num_input_tokens_seen": 7447232, + "step": 3802 + }, + { + "epoch": 0.5040424121935057, + "grad_norm": 7.980596542358398, + "learning_rate": 4.661205821358331e-06, + "loss": 0.1835, + "num_input_tokens_seen": 7449616, + "step": 3803 + }, + { + "epoch": 0.5041749502982107, + "grad_norm": 8.587661743164062, + "learning_rate": 4.6610313073693454e-06, + "loss": 0.4085, + "num_input_tokens_seen": 7451872, + "step": 3804 + }, + { + "epoch": 0.5043074884029158, + "grad_norm": 0.027200492098927498, + "learning_rate": 4.660856751714185e-06, + "loss": 0.0002, + "num_input_tokens_seen": 7454192, + "step": 3805 + }, + { + "epoch": 0.504440026507621, + "grad_norm": 3.177253007888794, + "learning_rate": 4.660682154396217e-06, + "loss": 0.0595, + "num_input_tokens_seen": 7455952, + "step": 3806 + }, + { + "epoch": 0.504572564612326, + "grad_norm": 0.030226562172174454, + "learning_rate": 4.660507515418805e-06, + "loss": 0.0002, + "num_input_tokens_seen": 7457376, + "step": 3807 + }, + { + "epoch": 0.5047051027170312, + "grad_norm": 18.29974365234375, + "learning_rate": 4.6603328347853185e-06, + "loss": 0.6711, + "num_input_tokens_seen": 7459120, + "step": 3808 + }, + { + "epoch": 0.5048376408217362, + "grad_norm": 10.222456932067871, + "learning_rate": 4.660158112499124e-06, + "loss": 0.394, + "num_input_tokens_seen": 7460688, + "step": 3809 + }, + { + "epoch": 0.5049701789264414, + "grad_norm": 0.04791877418756485, + "learning_rate": 4.659983348563591e-06, + "loss": 0.0003, + "num_input_tokens_seen": 7462720, + "step": 3810 + }, + { + "epoch": 0.5051027170311465, + "grad_norm": 15.199453353881836, + "learning_rate": 4.659808542982089e-06, + "loss": 0.4378, + "num_input_tokens_seen": 7464128, + "step": 3811 + }, + { + "epoch": 0.5052352551358515, + "grad_norm": 19.845563888549805, + "learning_rate": 4.659633695757988e-06, + "loss": 0.2716, + "num_input_tokens_seen": 7466616, + "step": 3812 + }, + { + "epoch": 0.5053677932405567, + "grad_norm": 8.899648666381836, + "learning_rate": 4.65945880689466e-06, + "loss": 0.2255, + "num_input_tokens_seen": 7467920, + "step": 3813 + }, + { + "epoch": 0.5055003313452617, + "grad_norm": 10.421370506286621, + "learning_rate": 4.659283876395476e-06, + "loss": 0.2526, + "num_input_tokens_seen": 7469944, + "step": 3814 + }, + { + "epoch": 0.5056328694499669, + "grad_norm": 14.412315368652344, + "learning_rate": 4.659108904263809e-06, + "loss": 0.3013, + "num_input_tokens_seen": 7473008, + "step": 3815 + }, + { + "epoch": 0.5057654075546719, + "grad_norm": 0.41000670194625854, + "learning_rate": 4.658933890503034e-06, + "loss": 0.0026, + "num_input_tokens_seen": 7474592, + "step": 3816 + }, + { + "epoch": 0.5058979456593771, + "grad_norm": 0.8977817893028259, + "learning_rate": 4.658758835116524e-06, + "loss": 0.0074, + "num_input_tokens_seen": 7476104, + "step": 3817 + }, + { + "epoch": 0.5060304837640822, + "grad_norm": 12.335478782653809, + "learning_rate": 4.6585837381076535e-06, + "loss": 0.3348, + "num_input_tokens_seen": 7478328, + "step": 3818 + }, + { + "epoch": 0.5061630218687873, + "grad_norm": 0.15439845621585846, + "learning_rate": 4.6584085994798e-06, + "loss": 0.0011, + "num_input_tokens_seen": 7479448, + "step": 3819 + }, + { + "epoch": 0.5062955599734924, + "grad_norm": 5.305031776428223, + "learning_rate": 4.65823341923634e-06, + "loss": 0.1122, + "num_input_tokens_seen": 7480520, + "step": 3820 + }, + { + "epoch": 0.5064280980781974, + "grad_norm": 7.51107931137085, + "learning_rate": 4.65805819738065e-06, + "loss": 0.1945, + "num_input_tokens_seen": 7482144, + "step": 3821 + }, + { + "epoch": 0.5065606361829026, + "grad_norm": 16.621143341064453, + "learning_rate": 4.657882933916109e-06, + "loss": 0.4277, + "num_input_tokens_seen": 7484248, + "step": 3822 + }, + { + "epoch": 0.5066931742876077, + "grad_norm": 7.599722385406494, + "learning_rate": 4.6577076288460974e-06, + "loss": 0.1359, + "num_input_tokens_seen": 7485384, + "step": 3823 + }, + { + "epoch": 0.5068257123923128, + "grad_norm": 22.24402618408203, + "learning_rate": 4.657532282173994e-06, + "loss": 0.5525, + "num_input_tokens_seen": 7487480, + "step": 3824 + }, + { + "epoch": 0.5069582504970179, + "grad_norm": 0.5006080865859985, + "learning_rate": 4.657356893903179e-06, + "loss": 0.0034, + "num_input_tokens_seen": 7489432, + "step": 3825 + }, + { + "epoch": 0.507090788601723, + "grad_norm": 9.591044425964355, + "learning_rate": 4.657181464037036e-06, + "loss": 0.3522, + "num_input_tokens_seen": 7491032, + "step": 3826 + }, + { + "epoch": 0.5072233267064281, + "grad_norm": 16.038673400878906, + "learning_rate": 4.6570059925789455e-06, + "loss": 0.2749, + "num_input_tokens_seen": 7493008, + "step": 3827 + }, + { + "epoch": 0.5073558648111332, + "grad_norm": 15.959938049316406, + "learning_rate": 4.656830479532292e-06, + "loss": 0.6408, + "num_input_tokens_seen": 7494936, + "step": 3828 + }, + { + "epoch": 0.5074884029158383, + "grad_norm": 9.982244491577148, + "learning_rate": 4.656654924900458e-06, + "loss": 0.3631, + "num_input_tokens_seen": 7497328, + "step": 3829 + }, + { + "epoch": 0.5076209410205434, + "grad_norm": 0.7568904757499695, + "learning_rate": 4.656479328686829e-06, + "loss": 0.0053, + "num_input_tokens_seen": 7499552, + "step": 3830 + }, + { + "epoch": 0.5077534791252485, + "grad_norm": 13.756217956542969, + "learning_rate": 4.6563036908947915e-06, + "loss": 0.4165, + "num_input_tokens_seen": 7501232, + "step": 3831 + }, + { + "epoch": 0.5078860172299536, + "grad_norm": 14.193361282348633, + "learning_rate": 4.6561280115277314e-06, + "loss": 0.3105, + "num_input_tokens_seen": 7503072, + "step": 3832 + }, + { + "epoch": 0.5080185553346587, + "grad_norm": 8.322296142578125, + "learning_rate": 4.6559522905890355e-06, + "loss": 0.3782, + "num_input_tokens_seen": 7504336, + "step": 3833 + }, + { + "epoch": 0.5081510934393638, + "grad_norm": 3.3163723945617676, + "learning_rate": 4.655776528082093e-06, + "loss": 0.0182, + "num_input_tokens_seen": 7506056, + "step": 3834 + }, + { + "epoch": 0.508283631544069, + "grad_norm": 8.745096206665039, + "learning_rate": 4.655600724010291e-06, + "loss": 0.2784, + "num_input_tokens_seen": 7508264, + "step": 3835 + }, + { + "epoch": 0.508416169648774, + "grad_norm": 5.671663284301758, + "learning_rate": 4.6554248783770205e-06, + "loss": 0.2096, + "num_input_tokens_seen": 7509816, + "step": 3836 + }, + { + "epoch": 0.5085487077534792, + "grad_norm": 17.65005874633789, + "learning_rate": 4.65524899118567e-06, + "loss": 0.3535, + "num_input_tokens_seen": 7511928, + "step": 3837 + }, + { + "epoch": 0.5086812458581842, + "grad_norm": 5.636123180389404, + "learning_rate": 4.655073062439633e-06, + "loss": 0.1573, + "num_input_tokens_seen": 7513576, + "step": 3838 + }, + { + "epoch": 0.5088137839628893, + "grad_norm": 5.241680145263672, + "learning_rate": 4.654897092142302e-06, + "loss": 0.0653, + "num_input_tokens_seen": 7515280, + "step": 3839 + }, + { + "epoch": 0.5089463220675944, + "grad_norm": 0.5257935523986816, + "learning_rate": 4.654721080297066e-06, + "loss": 0.0037, + "num_input_tokens_seen": 7517032, + "step": 3840 + }, + { + "epoch": 0.5090788601722995, + "grad_norm": 5.624911785125732, + "learning_rate": 4.654545026907324e-06, + "loss": 0.1783, + "num_input_tokens_seen": 7518256, + "step": 3841 + }, + { + "epoch": 0.5092113982770047, + "grad_norm": 0.8047042489051819, + "learning_rate": 4.6543689319764654e-06, + "loss": 0.0056, + "num_input_tokens_seen": 7519848, + "step": 3842 + }, + { + "epoch": 0.5093439363817097, + "grad_norm": 9.142449378967285, + "learning_rate": 4.654192795507888e-06, + "loss": 0.253, + "num_input_tokens_seen": 7521488, + "step": 3843 + }, + { + "epoch": 0.5094764744864149, + "grad_norm": 7.155490398406982, + "learning_rate": 4.654016617504987e-06, + "loss": 0.1163, + "num_input_tokens_seen": 7523168, + "step": 3844 + }, + { + "epoch": 0.5096090125911199, + "grad_norm": 12.159253120422363, + "learning_rate": 4.65384039797116e-06, + "loss": 0.286, + "num_input_tokens_seen": 7524728, + "step": 3845 + }, + { + "epoch": 0.509741550695825, + "grad_norm": 6.172644138336182, + "learning_rate": 4.653664136909804e-06, + "loss": 0.1558, + "num_input_tokens_seen": 7525976, + "step": 3846 + }, + { + "epoch": 0.5098740888005302, + "grad_norm": 0.17605362832546234, + "learning_rate": 4.653487834324319e-06, + "loss": 0.0012, + "num_input_tokens_seen": 7527096, + "step": 3847 + }, + { + "epoch": 0.5100066269052352, + "grad_norm": 6.352392196655273, + "learning_rate": 4.653311490218102e-06, + "loss": 0.0706, + "num_input_tokens_seen": 7529400, + "step": 3848 + }, + { + "epoch": 0.5101391650099404, + "grad_norm": 9.853144645690918, + "learning_rate": 4.653135104594554e-06, + "loss": 0.1792, + "num_input_tokens_seen": 7530904, + "step": 3849 + }, + { + "epoch": 0.5102717031146454, + "grad_norm": 6.2242279052734375, + "learning_rate": 4.652958677457076e-06, + "loss": 0.214, + "num_input_tokens_seen": 7532960, + "step": 3850 + }, + { + "epoch": 0.5104042412193506, + "grad_norm": 0.18780875205993652, + "learning_rate": 4.652782208809069e-06, + "loss": 0.0013, + "num_input_tokens_seen": 7534448, + "step": 3851 + }, + { + "epoch": 0.5105367793240556, + "grad_norm": 0.15818049013614655, + "learning_rate": 4.652605698653936e-06, + "loss": 0.0011, + "num_input_tokens_seen": 7536040, + "step": 3852 + }, + { + "epoch": 0.5106693174287608, + "grad_norm": 8.0579252243042, + "learning_rate": 4.65242914699508e-06, + "loss": 0.0802, + "num_input_tokens_seen": 7537776, + "step": 3853 + }, + { + "epoch": 0.5108018555334659, + "grad_norm": 13.291154861450195, + "learning_rate": 4.652252553835905e-06, + "loss": 0.1999, + "num_input_tokens_seen": 7539152, + "step": 3854 + }, + { + "epoch": 0.510934393638171, + "grad_norm": 4.071899890899658, + "learning_rate": 4.652075919179817e-06, + "loss": 0.0427, + "num_input_tokens_seen": 7541344, + "step": 3855 + }, + { + "epoch": 0.5110669317428761, + "grad_norm": 7.976846218109131, + "learning_rate": 4.65189924303022e-06, + "loss": 0.1686, + "num_input_tokens_seen": 7543176, + "step": 3856 + }, + { + "epoch": 0.5111994698475811, + "grad_norm": 0.07538077235221863, + "learning_rate": 4.651722525390522e-06, + "loss": 0.0005, + "num_input_tokens_seen": 7544848, + "step": 3857 + }, + { + "epoch": 0.5113320079522863, + "grad_norm": 3.565011739730835, + "learning_rate": 4.651545766264127e-06, + "loss": 0.0414, + "num_input_tokens_seen": 7546448, + "step": 3858 + }, + { + "epoch": 0.5114645460569914, + "grad_norm": 5.377864837646484, + "learning_rate": 4.651368965654448e-06, + "loss": 0.232, + "num_input_tokens_seen": 7547888, + "step": 3859 + }, + { + "epoch": 0.5115970841616965, + "grad_norm": 9.728271484375, + "learning_rate": 4.65119212356489e-06, + "loss": 0.2512, + "num_input_tokens_seen": 7549600, + "step": 3860 + }, + { + "epoch": 0.5117296222664016, + "grad_norm": 0.27952712774276733, + "learning_rate": 4.6510152399988635e-06, + "loss": 0.0017, + "num_input_tokens_seen": 7551848, + "step": 3861 + }, + { + "epoch": 0.5118621603711067, + "grad_norm": 0.12081749737262726, + "learning_rate": 4.650838314959779e-06, + "loss": 0.0008, + "num_input_tokens_seen": 7553504, + "step": 3862 + }, + { + "epoch": 0.5119946984758118, + "grad_norm": 8.880135536193848, + "learning_rate": 4.650661348451049e-06, + "loss": 0.1328, + "num_input_tokens_seen": 7555176, + "step": 3863 + }, + { + "epoch": 0.512127236580517, + "grad_norm": 0.5720003247261047, + "learning_rate": 4.650484340476084e-06, + "loss": 0.0031, + "num_input_tokens_seen": 7557024, + "step": 3864 + }, + { + "epoch": 0.512259774685222, + "grad_norm": 0.1851201355457306, + "learning_rate": 4.650307291038297e-06, + "loss": 0.0012, + "num_input_tokens_seen": 7558320, + "step": 3865 + }, + { + "epoch": 0.5123923127899271, + "grad_norm": 11.492060661315918, + "learning_rate": 4.650130200141103e-06, + "loss": 0.4219, + "num_input_tokens_seen": 7560928, + "step": 3866 + }, + { + "epoch": 0.5125248508946322, + "grad_norm": 5.517479419708252, + "learning_rate": 4.649953067787915e-06, + "loss": 0.1236, + "num_input_tokens_seen": 7563128, + "step": 3867 + }, + { + "epoch": 0.5126573889993373, + "grad_norm": 5.839290618896484, + "learning_rate": 4.649775893982149e-06, + "loss": 0.2432, + "num_input_tokens_seen": 7565072, + "step": 3868 + }, + { + "epoch": 0.5127899271040424, + "grad_norm": 11.97633171081543, + "learning_rate": 4.64959867872722e-06, + "loss": 0.2892, + "num_input_tokens_seen": 7568264, + "step": 3869 + }, + { + "epoch": 0.5129224652087475, + "grad_norm": 0.09630770236253738, + "learning_rate": 4.649421422026546e-06, + "loss": 0.0006, + "num_input_tokens_seen": 7570816, + "step": 3870 + }, + { + "epoch": 0.5130550033134527, + "grad_norm": 1.6739745140075684, + "learning_rate": 4.649244123883544e-06, + "loss": 0.0218, + "num_input_tokens_seen": 7572264, + "step": 3871 + }, + { + "epoch": 0.5131875414181577, + "grad_norm": 11.977849006652832, + "learning_rate": 4.6490667843016325e-06, + "loss": 0.503, + "num_input_tokens_seen": 7573896, + "step": 3872 + }, + { + "epoch": 0.5133200795228628, + "grad_norm": 19.54895782470703, + "learning_rate": 4.6488894032842315e-06, + "loss": 0.5571, + "num_input_tokens_seen": 7576776, + "step": 3873 + }, + { + "epoch": 0.5134526176275679, + "grad_norm": 0.060662273317575455, + "learning_rate": 4.648711980834759e-06, + "loss": 0.0004, + "num_input_tokens_seen": 7577864, + "step": 3874 + }, + { + "epoch": 0.513585155732273, + "grad_norm": 14.085258483886719, + "learning_rate": 4.648534516956639e-06, + "loss": 0.6941, + "num_input_tokens_seen": 7579432, + "step": 3875 + }, + { + "epoch": 0.5137176938369782, + "grad_norm": 6.37233829498291, + "learning_rate": 4.64835701165329e-06, + "loss": 0.0766, + "num_input_tokens_seen": 7581880, + "step": 3876 + }, + { + "epoch": 0.5138502319416832, + "grad_norm": 12.37532901763916, + "learning_rate": 4.648179464928136e-06, + "loss": 0.1853, + "num_input_tokens_seen": 7584632, + "step": 3877 + }, + { + "epoch": 0.5139827700463884, + "grad_norm": 1.3831883668899536, + "learning_rate": 4.6480018767846005e-06, + "loss": 0.0153, + "num_input_tokens_seen": 7586056, + "step": 3878 + }, + { + "epoch": 0.5141153081510934, + "grad_norm": 10.08962631225586, + "learning_rate": 4.647824247226106e-06, + "loss": 0.1379, + "num_input_tokens_seen": 7587640, + "step": 3879 + }, + { + "epoch": 0.5142478462557986, + "grad_norm": 13.777557373046875, + "learning_rate": 4.647646576256079e-06, + "loss": 0.5273, + "num_input_tokens_seen": 7590256, + "step": 3880 + }, + { + "epoch": 0.5143803843605036, + "grad_norm": 6.842599391937256, + "learning_rate": 4.647468863877944e-06, + "loss": 0.1601, + "num_input_tokens_seen": 7591864, + "step": 3881 + }, + { + "epoch": 0.5145129224652087, + "grad_norm": 0.06761663407087326, + "learning_rate": 4.647291110095128e-06, + "loss": 0.0004, + "num_input_tokens_seen": 7594496, + "step": 3882 + }, + { + "epoch": 0.5146454605699139, + "grad_norm": 0.09105087071657181, + "learning_rate": 4.647113314911059e-06, + "loss": 0.0006, + "num_input_tokens_seen": 7596360, + "step": 3883 + }, + { + "epoch": 0.5147779986746189, + "grad_norm": 0.05476690083742142, + "learning_rate": 4.6469354783291634e-06, + "loss": 0.0004, + "num_input_tokens_seen": 7597720, + "step": 3884 + }, + { + "epoch": 0.5149105367793241, + "grad_norm": 4.954082012176514, + "learning_rate": 4.64675760035287e-06, + "loss": 0.1412, + "num_input_tokens_seen": 7600312, + "step": 3885 + }, + { + "epoch": 0.5150430748840291, + "grad_norm": 0.20849671959877014, + "learning_rate": 4.64657968098561e-06, + "loss": 0.0013, + "num_input_tokens_seen": 7602072, + "step": 3886 + }, + { + "epoch": 0.5151756129887343, + "grad_norm": 0.10920123010873795, + "learning_rate": 4.646401720230812e-06, + "loss": 0.0007, + "num_input_tokens_seen": 7604032, + "step": 3887 + }, + { + "epoch": 0.5153081510934394, + "grad_norm": 7.964940071105957, + "learning_rate": 4.646223718091909e-06, + "loss": 0.2723, + "num_input_tokens_seen": 7606360, + "step": 3888 + }, + { + "epoch": 0.5154406891981445, + "grad_norm": 5.736359596252441, + "learning_rate": 4.646045674572331e-06, + "loss": 0.1512, + "num_input_tokens_seen": 7608232, + "step": 3889 + }, + { + "epoch": 0.5155732273028496, + "grad_norm": 9.374197006225586, + "learning_rate": 4.645867589675514e-06, + "loss": 0.2189, + "num_input_tokens_seen": 7609696, + "step": 3890 + }, + { + "epoch": 0.5157057654075546, + "grad_norm": 14.919758796691895, + "learning_rate": 4.6456894634048875e-06, + "loss": 0.3773, + "num_input_tokens_seen": 7611264, + "step": 3891 + }, + { + "epoch": 0.5158383035122598, + "grad_norm": 6.105409622192383, + "learning_rate": 4.645511295763888e-06, + "loss": 0.1598, + "num_input_tokens_seen": 7613704, + "step": 3892 + }, + { + "epoch": 0.5159708416169648, + "grad_norm": 6.688642978668213, + "learning_rate": 4.645333086755952e-06, + "loss": 0.0816, + "num_input_tokens_seen": 7615824, + "step": 3893 + }, + { + "epoch": 0.51610337972167, + "grad_norm": 11.338894844055176, + "learning_rate": 4.645154836384513e-06, + "loss": 0.3308, + "num_input_tokens_seen": 7617808, + "step": 3894 + }, + { + "epoch": 0.5162359178263751, + "grad_norm": 0.34471264481544495, + "learning_rate": 4.644976544653009e-06, + "loss": 0.0024, + "num_input_tokens_seen": 7620472, + "step": 3895 + }, + { + "epoch": 0.5163684559310802, + "grad_norm": 1.1194355487823486, + "learning_rate": 4.644798211564877e-06, + "loss": 0.0095, + "num_input_tokens_seen": 7621784, + "step": 3896 + }, + { + "epoch": 0.5165009940357853, + "grad_norm": 14.840388298034668, + "learning_rate": 4.644619837123557e-06, + "loss": 0.1561, + "num_input_tokens_seen": 7624232, + "step": 3897 + }, + { + "epoch": 0.5166335321404903, + "grad_norm": 0.6106372475624084, + "learning_rate": 4.644441421332486e-06, + "loss": 0.004, + "num_input_tokens_seen": 7625512, + "step": 3898 + }, + { + "epoch": 0.5167660702451955, + "grad_norm": 4.217182636260986, + "learning_rate": 4.644262964195106e-06, + "loss": 0.0534, + "num_input_tokens_seen": 7628064, + "step": 3899 + }, + { + "epoch": 0.5168986083499006, + "grad_norm": 7.864591121673584, + "learning_rate": 4.644084465714857e-06, + "loss": 0.2397, + "num_input_tokens_seen": 7630032, + "step": 3900 + }, + { + "epoch": 0.5170311464546057, + "grad_norm": 0.7796896696090698, + "learning_rate": 4.643905925895179e-06, + "loss": 0.005, + "num_input_tokens_seen": 7632440, + "step": 3901 + }, + { + "epoch": 0.5171636845593108, + "grad_norm": 2.427598476409912, + "learning_rate": 4.643727344739517e-06, + "loss": 0.0273, + "num_input_tokens_seen": 7633696, + "step": 3902 + }, + { + "epoch": 0.5172962226640159, + "grad_norm": 7.205782890319824, + "learning_rate": 4.643548722251312e-06, + "loss": 0.2571, + "num_input_tokens_seen": 7636576, + "step": 3903 + }, + { + "epoch": 0.517428760768721, + "grad_norm": 6.383040428161621, + "learning_rate": 4.643370058434009e-06, + "loss": 0.1253, + "num_input_tokens_seen": 7638976, + "step": 3904 + }, + { + "epoch": 0.5175612988734261, + "grad_norm": 2.788740634918213, + "learning_rate": 4.643191353291053e-06, + "loss": 0.0112, + "num_input_tokens_seen": 7640888, + "step": 3905 + }, + { + "epoch": 0.5176938369781312, + "grad_norm": 19.716602325439453, + "learning_rate": 4.64301260682589e-06, + "loss": 0.7377, + "num_input_tokens_seen": 7644080, + "step": 3906 + }, + { + "epoch": 0.5178263750828364, + "grad_norm": 0.3081749975681305, + "learning_rate": 4.642833819041964e-06, + "loss": 0.0021, + "num_input_tokens_seen": 7645712, + "step": 3907 + }, + { + "epoch": 0.5179589131875414, + "grad_norm": 7.278192520141602, + "learning_rate": 4.6426549899427245e-06, + "loss": 0.2684, + "num_input_tokens_seen": 7647776, + "step": 3908 + }, + { + "epoch": 0.5180914512922465, + "grad_norm": 12.155147552490234, + "learning_rate": 4.6424761195316185e-06, + "loss": 0.2869, + "num_input_tokens_seen": 7650616, + "step": 3909 + }, + { + "epoch": 0.5182239893969516, + "grad_norm": 7.674764633178711, + "learning_rate": 4.642297207812095e-06, + "loss": 0.0756, + "num_input_tokens_seen": 7652408, + "step": 3910 + }, + { + "epoch": 0.5183565275016567, + "grad_norm": 12.263961791992188, + "learning_rate": 4.642118254787603e-06, + "loss": 0.3006, + "num_input_tokens_seen": 7654280, + "step": 3911 + }, + { + "epoch": 0.5184890656063619, + "grad_norm": 6.193371772766113, + "learning_rate": 4.641939260461594e-06, + "loss": 0.2269, + "num_input_tokens_seen": 7656056, + "step": 3912 + }, + { + "epoch": 0.5186216037110669, + "grad_norm": 0.19670720398426056, + "learning_rate": 4.641760224837518e-06, + "loss": 0.0013, + "num_input_tokens_seen": 7658352, + "step": 3913 + }, + { + "epoch": 0.5187541418157721, + "grad_norm": 10.243383407592773, + "learning_rate": 4.6415811479188275e-06, + "loss": 0.1787, + "num_input_tokens_seen": 7659936, + "step": 3914 + }, + { + "epoch": 0.5188866799204771, + "grad_norm": 0.09814871847629547, + "learning_rate": 4.641402029708975e-06, + "loss": 0.0007, + "num_input_tokens_seen": 7661648, + "step": 3915 + }, + { + "epoch": 0.5190192180251822, + "grad_norm": 0.1302480399608612, + "learning_rate": 4.641222870211413e-06, + "loss": 0.0009, + "num_input_tokens_seen": 7664008, + "step": 3916 + }, + { + "epoch": 0.5191517561298873, + "grad_norm": 0.07020407915115356, + "learning_rate": 4.641043669429599e-06, + "loss": 0.0005, + "num_input_tokens_seen": 7665088, + "step": 3917 + }, + { + "epoch": 0.5192842942345924, + "grad_norm": 12.221670150756836, + "learning_rate": 4.640864427366984e-06, + "loss": 0.2453, + "num_input_tokens_seen": 7667208, + "step": 3918 + }, + { + "epoch": 0.5194168323392976, + "grad_norm": 9.392801284790039, + "learning_rate": 4.640685144027027e-06, + "loss": 0.3726, + "num_input_tokens_seen": 7669064, + "step": 3919 + }, + { + "epoch": 0.5195493704440026, + "grad_norm": 0.14656253159046173, + "learning_rate": 4.640505819413184e-06, + "loss": 0.001, + "num_input_tokens_seen": 7671688, + "step": 3920 + }, + { + "epoch": 0.5196819085487078, + "grad_norm": 0.10497324913740158, + "learning_rate": 4.6403264535289115e-06, + "loss": 0.0007, + "num_input_tokens_seen": 7672952, + "step": 3921 + }, + { + "epoch": 0.5198144466534128, + "grad_norm": 11.629120826721191, + "learning_rate": 4.640147046377669e-06, + "loss": 0.3155, + "num_input_tokens_seen": 7675208, + "step": 3922 + }, + { + "epoch": 0.519946984758118, + "grad_norm": 0.35601159930229187, + "learning_rate": 4.639967597962914e-06, + "loss": 0.0021, + "num_input_tokens_seen": 7676880, + "step": 3923 + }, + { + "epoch": 0.5200795228628231, + "grad_norm": 12.927213668823242, + "learning_rate": 4.6397881082881095e-06, + "loss": 0.4533, + "num_input_tokens_seen": 7679456, + "step": 3924 + }, + { + "epoch": 0.5202120609675281, + "grad_norm": 15.095540046691895, + "learning_rate": 4.6396085773567115e-06, + "loss": 0.3754, + "num_input_tokens_seen": 7680712, + "step": 3925 + }, + { + "epoch": 0.5203445990722333, + "grad_norm": 10.376604080200195, + "learning_rate": 4.639429005172186e-06, + "loss": 0.2242, + "num_input_tokens_seen": 7682504, + "step": 3926 + }, + { + "epoch": 0.5204771371769383, + "grad_norm": 3.0157835483551025, + "learning_rate": 4.639249391737992e-06, + "loss": 0.0317, + "num_input_tokens_seen": 7685080, + "step": 3927 + }, + { + "epoch": 0.5206096752816435, + "grad_norm": 12.498023986816406, + "learning_rate": 4.639069737057595e-06, + "loss": 0.1583, + "num_input_tokens_seen": 7687184, + "step": 3928 + }, + { + "epoch": 0.5207422133863486, + "grad_norm": 12.41493034362793, + "learning_rate": 4.6388900411344575e-06, + "loss": 0.3695, + "num_input_tokens_seen": 7689944, + "step": 3929 + }, + { + "epoch": 0.5208747514910537, + "grad_norm": 0.14885379374027252, + "learning_rate": 4.638710303972044e-06, + "loss": 0.001, + "num_input_tokens_seen": 7691456, + "step": 3930 + }, + { + "epoch": 0.5210072895957588, + "grad_norm": 5.978907585144043, + "learning_rate": 4.638530525573821e-06, + "loss": 0.1282, + "num_input_tokens_seen": 7693696, + "step": 3931 + }, + { + "epoch": 0.5211398277004639, + "grad_norm": 12.325837135314941, + "learning_rate": 4.638350705943254e-06, + "loss": 0.2525, + "num_input_tokens_seen": 7695912, + "step": 3932 + }, + { + "epoch": 0.521272365805169, + "grad_norm": 4.714094161987305, + "learning_rate": 4.6381708450838104e-06, + "loss": 0.0282, + "num_input_tokens_seen": 7697336, + "step": 3933 + }, + { + "epoch": 0.521404903909874, + "grad_norm": 0.17871876060962677, + "learning_rate": 4.637990942998957e-06, + "loss": 0.0012, + "num_input_tokens_seen": 7698472, + "step": 3934 + }, + { + "epoch": 0.5215374420145792, + "grad_norm": 17.90882682800293, + "learning_rate": 4.637810999692164e-06, + "loss": 0.6474, + "num_input_tokens_seen": 7700456, + "step": 3935 + }, + { + "epoch": 0.5216699801192843, + "grad_norm": 9.274698257446289, + "learning_rate": 4.6376310151669e-06, + "loss": 0.1898, + "num_input_tokens_seen": 7701808, + "step": 3936 + }, + { + "epoch": 0.5218025182239894, + "grad_norm": 0.7546862959861755, + "learning_rate": 4.637450989426635e-06, + "loss": 0.0044, + "num_input_tokens_seen": 7703616, + "step": 3937 + }, + { + "epoch": 0.5219350563286945, + "grad_norm": 0.16327273845672607, + "learning_rate": 4.637270922474841e-06, + "loss": 0.0011, + "num_input_tokens_seen": 7705496, + "step": 3938 + }, + { + "epoch": 0.5220675944333996, + "grad_norm": 10.371885299682617, + "learning_rate": 4.637090814314989e-06, + "loss": 0.2251, + "num_input_tokens_seen": 7707480, + "step": 3939 + }, + { + "epoch": 0.5222001325381047, + "grad_norm": 4.093565464019775, + "learning_rate": 4.6369106649505516e-06, + "loss": 0.027, + "num_input_tokens_seen": 7709056, + "step": 3940 + }, + { + "epoch": 0.5223326706428099, + "grad_norm": 15.884981155395508, + "learning_rate": 4.636730474385003e-06, + "loss": 0.4056, + "num_input_tokens_seen": 7711008, + "step": 3941 + }, + { + "epoch": 0.5224652087475149, + "grad_norm": 4.421368598937988, + "learning_rate": 4.636550242621816e-06, + "loss": 0.0491, + "num_input_tokens_seen": 7713144, + "step": 3942 + }, + { + "epoch": 0.52259774685222, + "grad_norm": 2.412248373031616, + "learning_rate": 4.636369969664467e-06, + "loss": 0.086, + "num_input_tokens_seen": 7714824, + "step": 3943 + }, + { + "epoch": 0.5227302849569251, + "grad_norm": 7.426116466522217, + "learning_rate": 4.636189655516431e-06, + "loss": 0.1391, + "num_input_tokens_seen": 7716696, + "step": 3944 + }, + { + "epoch": 0.5228628230616302, + "grad_norm": 8.407407760620117, + "learning_rate": 4.636009300181184e-06, + "loss": 0.0965, + "num_input_tokens_seen": 7719072, + "step": 3945 + }, + { + "epoch": 0.5229953611663353, + "grad_norm": 5.500792503356934, + "learning_rate": 4.635828903662205e-06, + "loss": 0.075, + "num_input_tokens_seen": 7722384, + "step": 3946 + }, + { + "epoch": 0.5231278992710404, + "grad_norm": 0.19300252199172974, + "learning_rate": 4.635648465962971e-06, + "loss": 0.0013, + "num_input_tokens_seen": 7725160, + "step": 3947 + }, + { + "epoch": 0.5232604373757456, + "grad_norm": 0.1379784792661667, + "learning_rate": 4.6354679870869614e-06, + "loss": 0.001, + "num_input_tokens_seen": 7726800, + "step": 3948 + }, + { + "epoch": 0.5233929754804506, + "grad_norm": 11.754271507263184, + "learning_rate": 4.6352874670376556e-06, + "loss": 0.18, + "num_input_tokens_seen": 7729720, + "step": 3949 + }, + { + "epoch": 0.5235255135851558, + "grad_norm": 0.4266282021999359, + "learning_rate": 4.635106905818535e-06, + "loss": 0.0029, + "num_input_tokens_seen": 7731304, + "step": 3950 + }, + { + "epoch": 0.5236580516898608, + "grad_norm": 6.5703959465026855, + "learning_rate": 4.634926303433079e-06, + "loss": 0.2175, + "num_input_tokens_seen": 7733208, + "step": 3951 + }, + { + "epoch": 0.5237905897945659, + "grad_norm": 18.028095245361328, + "learning_rate": 4.634745659884773e-06, + "loss": 0.4297, + "num_input_tokens_seen": 7734824, + "step": 3952 + }, + { + "epoch": 0.5239231278992711, + "grad_norm": 0.12948773801326752, + "learning_rate": 4.634564975177097e-06, + "loss": 0.0009, + "num_input_tokens_seen": 7735928, + "step": 3953 + }, + { + "epoch": 0.5240556660039761, + "grad_norm": 0.12280768901109695, + "learning_rate": 4.634384249313535e-06, + "loss": 0.0008, + "num_input_tokens_seen": 7737344, + "step": 3954 + }, + { + "epoch": 0.5241882041086813, + "grad_norm": 13.939336776733398, + "learning_rate": 4.634203482297574e-06, + "loss": 0.292, + "num_input_tokens_seen": 7738936, + "step": 3955 + }, + { + "epoch": 0.5243207422133863, + "grad_norm": 0.2067951261997223, + "learning_rate": 4.634022674132695e-06, + "loss": 0.0014, + "num_input_tokens_seen": 7741632, + "step": 3956 + }, + { + "epoch": 0.5244532803180915, + "grad_norm": 12.44519329071045, + "learning_rate": 4.633841824822389e-06, + "loss": 0.0828, + "num_input_tokens_seen": 7743816, + "step": 3957 + }, + { + "epoch": 0.5245858184227965, + "grad_norm": 0.13119351863861084, + "learning_rate": 4.63366093437014e-06, + "loss": 0.0009, + "num_input_tokens_seen": 7745608, + "step": 3958 + }, + { + "epoch": 0.5247183565275017, + "grad_norm": 9.463254928588867, + "learning_rate": 4.633480002779436e-06, + "loss": 0.3272, + "num_input_tokens_seen": 7748152, + "step": 3959 + }, + { + "epoch": 0.5248508946322068, + "grad_norm": 6.2471466064453125, + "learning_rate": 4.633299030053765e-06, + "loss": 0.1687, + "num_input_tokens_seen": 7750712, + "step": 3960 + }, + { + "epoch": 0.5249834327369118, + "grad_norm": 0.4079556167125702, + "learning_rate": 4.633118016196617e-06, + "loss": 0.0027, + "num_input_tokens_seen": 7753208, + "step": 3961 + }, + { + "epoch": 0.525115970841617, + "grad_norm": 15.901931762695312, + "learning_rate": 4.632936961211484e-06, + "loss": 0.2196, + "num_input_tokens_seen": 7754768, + "step": 3962 + }, + { + "epoch": 0.525248508946322, + "grad_norm": 8.711216926574707, + "learning_rate": 4.632755865101853e-06, + "loss": 0.1868, + "num_input_tokens_seen": 7756144, + "step": 3963 + }, + { + "epoch": 0.5253810470510272, + "grad_norm": 0.7210275530815125, + "learning_rate": 4.632574727871219e-06, + "loss": 0.0043, + "num_input_tokens_seen": 7759640, + "step": 3964 + }, + { + "epoch": 0.5255135851557323, + "grad_norm": 6.543086051940918, + "learning_rate": 4.632393549523072e-06, + "loss": 0.2831, + "num_input_tokens_seen": 7761920, + "step": 3965 + }, + { + "epoch": 0.5256461232604374, + "grad_norm": 15.134282112121582, + "learning_rate": 4.632212330060907e-06, + "loss": 0.4894, + "num_input_tokens_seen": 7764048, + "step": 3966 + }, + { + "epoch": 0.5257786613651425, + "grad_norm": 4.576977252960205, + "learning_rate": 4.632031069488216e-06, + "loss": 0.066, + "num_input_tokens_seen": 7765840, + "step": 3967 + }, + { + "epoch": 0.5259111994698475, + "grad_norm": 15.82397747039795, + "learning_rate": 4.631849767808496e-06, + "loss": 0.4433, + "num_input_tokens_seen": 7768016, + "step": 3968 + }, + { + "epoch": 0.5260437375745527, + "grad_norm": 9.389792442321777, + "learning_rate": 4.631668425025243e-06, + "loss": 0.0811, + "num_input_tokens_seen": 7770328, + "step": 3969 + }, + { + "epoch": 0.5261762756792577, + "grad_norm": 9.84054946899414, + "learning_rate": 4.631487041141951e-06, + "loss": 0.1585, + "num_input_tokens_seen": 7773024, + "step": 3970 + }, + { + "epoch": 0.5263088137839629, + "grad_norm": 8.409414291381836, + "learning_rate": 4.631305616162119e-06, + "loss": 0.1792, + "num_input_tokens_seen": 7775336, + "step": 3971 + }, + { + "epoch": 0.526441351888668, + "grad_norm": 7.878690242767334, + "learning_rate": 4.6311241500892435e-06, + "loss": 0.3553, + "num_input_tokens_seen": 7777760, + "step": 3972 + }, + { + "epoch": 0.5265738899933731, + "grad_norm": 15.82659912109375, + "learning_rate": 4.630942642926825e-06, + "loss": 0.4995, + "num_input_tokens_seen": 7779704, + "step": 3973 + }, + { + "epoch": 0.5267064280980782, + "grad_norm": 3.158552885055542, + "learning_rate": 4.630761094678362e-06, + "loss": 0.0179, + "num_input_tokens_seen": 7781696, + "step": 3974 + }, + { + "epoch": 0.5268389662027833, + "grad_norm": 0.24967467784881592, + "learning_rate": 4.6305795053473545e-06, + "loss": 0.0016, + "num_input_tokens_seen": 7783424, + "step": 3975 + }, + { + "epoch": 0.5269715043074884, + "grad_norm": 0.07183726876974106, + "learning_rate": 4.630397874937305e-06, + "loss": 0.0005, + "num_input_tokens_seen": 7785368, + "step": 3976 + }, + { + "epoch": 0.5271040424121936, + "grad_norm": 5.644267559051514, + "learning_rate": 4.630216203451715e-06, + "loss": 0.0696, + "num_input_tokens_seen": 7786784, + "step": 3977 + }, + { + "epoch": 0.5272365805168986, + "grad_norm": 0.0759982317686081, + "learning_rate": 4.630034490894087e-06, + "loss": 0.0005, + "num_input_tokens_seen": 7788264, + "step": 3978 + }, + { + "epoch": 0.5273691186216037, + "grad_norm": 0.08297236263751984, + "learning_rate": 4.629852737267924e-06, + "loss": 0.0006, + "num_input_tokens_seen": 7791128, + "step": 3979 + }, + { + "epoch": 0.5275016567263088, + "grad_norm": 7.059423923492432, + "learning_rate": 4.629670942576731e-06, + "loss": 0.1124, + "num_input_tokens_seen": 7793232, + "step": 3980 + }, + { + "epoch": 0.5276341948310139, + "grad_norm": 1.4923434257507324, + "learning_rate": 4.6294891068240135e-06, + "loss": 0.0219, + "num_input_tokens_seen": 7794696, + "step": 3981 + }, + { + "epoch": 0.5277667329357191, + "grad_norm": 13.473164558410645, + "learning_rate": 4.6293072300132765e-06, + "loss": 0.4393, + "num_input_tokens_seen": 7797960, + "step": 3982 + }, + { + "epoch": 0.5278992710404241, + "grad_norm": 9.597066879272461, + "learning_rate": 4.629125312148027e-06, + "loss": 0.1025, + "num_input_tokens_seen": 7800696, + "step": 3983 + }, + { + "epoch": 0.5280318091451293, + "grad_norm": 4.252924919128418, + "learning_rate": 4.628943353231774e-06, + "loss": 0.1214, + "num_input_tokens_seen": 7803560, + "step": 3984 + }, + { + "epoch": 0.5281643472498343, + "grad_norm": 0.10358887910842896, + "learning_rate": 4.628761353268023e-06, + "loss": 0.0007, + "num_input_tokens_seen": 7805696, + "step": 3985 + }, + { + "epoch": 0.5282968853545394, + "grad_norm": 4.281703948974609, + "learning_rate": 4.628579312260285e-06, + "loss": 0.045, + "num_input_tokens_seen": 7807376, + "step": 3986 + }, + { + "epoch": 0.5284294234592445, + "grad_norm": 16.330989837646484, + "learning_rate": 4.628397230212069e-06, + "loss": 0.6654, + "num_input_tokens_seen": 7809408, + "step": 3987 + }, + { + "epoch": 0.5285619615639496, + "grad_norm": 13.92752742767334, + "learning_rate": 4.628215107126888e-06, + "loss": 0.2344, + "num_input_tokens_seen": 7811520, + "step": 3988 + }, + { + "epoch": 0.5286944996686548, + "grad_norm": 14.059699058532715, + "learning_rate": 4.628032943008249e-06, + "loss": 0.3923, + "num_input_tokens_seen": 7814144, + "step": 3989 + }, + { + "epoch": 0.5288270377733598, + "grad_norm": 0.12259407341480255, + "learning_rate": 4.627850737859669e-06, + "loss": 0.0008, + "num_input_tokens_seen": 7815872, + "step": 3990 + }, + { + "epoch": 0.528959575878065, + "grad_norm": 4.583789348602295, + "learning_rate": 4.627668491684657e-06, + "loss": 0.0252, + "num_input_tokens_seen": 7817560, + "step": 3991 + }, + { + "epoch": 0.52909211398277, + "grad_norm": 0.10020911693572998, + "learning_rate": 4.62748620448673e-06, + "loss": 0.0007, + "num_input_tokens_seen": 7819016, + "step": 3992 + }, + { + "epoch": 0.5292246520874752, + "grad_norm": 13.534164428710938, + "learning_rate": 4.627303876269401e-06, + "loss": 0.3385, + "num_input_tokens_seen": 7821072, + "step": 3993 + }, + { + "epoch": 0.5293571901921803, + "grad_norm": 8.080652236938477, + "learning_rate": 4.627121507036186e-06, + "loss": 0.2756, + "num_input_tokens_seen": 7822512, + "step": 3994 + }, + { + "epoch": 0.5294897282968853, + "grad_norm": 0.2605687975883484, + "learning_rate": 4.6269390967906e-06, + "loss": 0.0017, + "num_input_tokens_seen": 7824320, + "step": 3995 + }, + { + "epoch": 0.5296222664015905, + "grad_norm": 3.1092402935028076, + "learning_rate": 4.626756645536161e-06, + "loss": 0.0898, + "num_input_tokens_seen": 7826688, + "step": 3996 + }, + { + "epoch": 0.5297548045062955, + "grad_norm": 11.595963478088379, + "learning_rate": 4.626574153276387e-06, + "loss": 0.2987, + "num_input_tokens_seen": 7828608, + "step": 3997 + }, + { + "epoch": 0.5298873426110007, + "grad_norm": 0.10772614181041718, + "learning_rate": 4.626391620014797e-06, + "loss": 0.0007, + "num_input_tokens_seen": 7829928, + "step": 3998 + }, + { + "epoch": 0.5300198807157057, + "grad_norm": 10.73455810546875, + "learning_rate": 4.626209045754908e-06, + "loss": 0.4074, + "num_input_tokens_seen": 7831552, + "step": 3999 + }, + { + "epoch": 0.5301524188204109, + "grad_norm": 26.661184310913086, + "learning_rate": 4.626026430500243e-06, + "loss": 0.8685, + "num_input_tokens_seen": 7834056, + "step": 4000 + }, + { + "epoch": 0.530284956925116, + "grad_norm": 10.731963157653809, + "learning_rate": 4.625843774254321e-06, + "loss": 0.3024, + "num_input_tokens_seen": 7836088, + "step": 4001 + }, + { + "epoch": 0.530417495029821, + "grad_norm": 13.635927200317383, + "learning_rate": 4.6256610770206665e-06, + "loss": 0.3658, + "num_input_tokens_seen": 7838120, + "step": 4002 + }, + { + "epoch": 0.5305500331345262, + "grad_norm": 8.361394882202148, + "learning_rate": 4.625478338802798e-06, + "loss": 0.2745, + "num_input_tokens_seen": 7840168, + "step": 4003 + }, + { + "epoch": 0.5306825712392312, + "grad_norm": 0.30881306529045105, + "learning_rate": 4.625295559604241e-06, + "loss": 0.0021, + "num_input_tokens_seen": 7841704, + "step": 4004 + }, + { + "epoch": 0.5308151093439364, + "grad_norm": 7.118462085723877, + "learning_rate": 4.62511273942852e-06, + "loss": 0.1064, + "num_input_tokens_seen": 7844088, + "step": 4005 + }, + { + "epoch": 0.5309476474486415, + "grad_norm": 1.277662992477417, + "learning_rate": 4.62492987827916e-06, + "loss": 0.0084, + "num_input_tokens_seen": 7845904, + "step": 4006 + }, + { + "epoch": 0.5310801855533466, + "grad_norm": 0.5338249206542969, + "learning_rate": 4.6247469761596845e-06, + "loss": 0.0037, + "num_input_tokens_seen": 7847424, + "step": 4007 + }, + { + "epoch": 0.5312127236580517, + "grad_norm": 1.4985874891281128, + "learning_rate": 4.6245640330736224e-06, + "loss": 0.0185, + "num_input_tokens_seen": 7849128, + "step": 4008 + }, + { + "epoch": 0.5313452617627568, + "grad_norm": 9.822949409484863, + "learning_rate": 4.6243810490245e-06, + "loss": 0.1705, + "num_input_tokens_seen": 7851120, + "step": 4009 + }, + { + "epoch": 0.5314777998674619, + "grad_norm": 0.2604467570781708, + "learning_rate": 4.624198024015845e-06, + "loss": 0.0017, + "num_input_tokens_seen": 7852408, + "step": 4010 + }, + { + "epoch": 0.531610337972167, + "grad_norm": 400.1668701171875, + "learning_rate": 4.624014958051187e-06, + "loss": 0.4129, + "num_input_tokens_seen": 7854528, + "step": 4011 + }, + { + "epoch": 0.5317428760768721, + "grad_norm": 0.43131300806999207, + "learning_rate": 4.623831851134055e-06, + "loss": 0.0029, + "num_input_tokens_seen": 7857200, + "step": 4012 + }, + { + "epoch": 0.5318754141815772, + "grad_norm": 0.3718176484107971, + "learning_rate": 4.623648703267981e-06, + "loss": 0.0025, + "num_input_tokens_seen": 7858792, + "step": 4013 + }, + { + "epoch": 0.5320079522862823, + "grad_norm": 13.121262550354004, + "learning_rate": 4.623465514456494e-06, + "loss": 0.5068, + "num_input_tokens_seen": 7861376, + "step": 4014 + }, + { + "epoch": 0.5321404903909874, + "grad_norm": 0.25891396403312683, + "learning_rate": 4.623282284703127e-06, + "loss": 0.0017, + "num_input_tokens_seen": 7863664, + "step": 4015 + }, + { + "epoch": 0.5322730284956925, + "grad_norm": 6.230240821838379, + "learning_rate": 4.623099014011413e-06, + "loss": 0.1452, + "num_input_tokens_seen": 7865456, + "step": 4016 + }, + { + "epoch": 0.5324055666003976, + "grad_norm": 4.335967063903809, + "learning_rate": 4.622915702384885e-06, + "loss": 0.0973, + "num_input_tokens_seen": 7867016, + "step": 4017 + }, + { + "epoch": 0.5325381047051028, + "grad_norm": 8.595365524291992, + "learning_rate": 4.622732349827078e-06, + "loss": 0.1016, + "num_input_tokens_seen": 7868872, + "step": 4018 + }, + { + "epoch": 0.5326706428098078, + "grad_norm": 0.1359635293483734, + "learning_rate": 4.6225489563415266e-06, + "loss": 0.0009, + "num_input_tokens_seen": 7870264, + "step": 4019 + }, + { + "epoch": 0.532803180914513, + "grad_norm": 0.12251783907413483, + "learning_rate": 4.622365521931767e-06, + "loss": 0.0008, + "num_input_tokens_seen": 7871944, + "step": 4020 + }, + { + "epoch": 0.532935719019218, + "grad_norm": 11.952160835266113, + "learning_rate": 4.6221820466013365e-06, + "loss": 0.4018, + "num_input_tokens_seen": 7874064, + "step": 4021 + }, + { + "epoch": 0.5330682571239231, + "grad_norm": 1.6966038942337036, + "learning_rate": 4.621998530353772e-06, + "loss": 0.0308, + "num_input_tokens_seen": 7875840, + "step": 4022 + }, + { + "epoch": 0.5332007952286282, + "grad_norm": 15.353293418884277, + "learning_rate": 4.621814973192612e-06, + "loss": 0.4655, + "num_input_tokens_seen": 7878144, + "step": 4023 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 15.296635627746582, + "learning_rate": 4.621631375121395e-06, + "loss": 0.2782, + "num_input_tokens_seen": 7880832, + "step": 4024 + }, + { + "epoch": 0.5334658714380385, + "grad_norm": 12.293479919433594, + "learning_rate": 4.621447736143662e-06, + "loss": 0.3628, + "num_input_tokens_seen": 7882800, + "step": 4025 + }, + { + "epoch": 0.5335984095427435, + "grad_norm": 2.8818576335906982, + "learning_rate": 4.621264056262953e-06, + "loss": 0.1255, + "num_input_tokens_seen": 7884528, + "step": 4026 + }, + { + "epoch": 0.5337309476474487, + "grad_norm": 10.513691902160645, + "learning_rate": 4.62108033548281e-06, + "loss": 0.3848, + "num_input_tokens_seen": 7886272, + "step": 4027 + }, + { + "epoch": 0.5338634857521537, + "grad_norm": 0.14609111845493317, + "learning_rate": 4.620896573806774e-06, + "loss": 0.001, + "num_input_tokens_seen": 7887560, + "step": 4028 + }, + { + "epoch": 0.5339960238568588, + "grad_norm": 0.5605419278144836, + "learning_rate": 4.620712771238389e-06, + "loss": 0.0022, + "num_input_tokens_seen": 7889336, + "step": 4029 + }, + { + "epoch": 0.534128561961564, + "grad_norm": 17.75749969482422, + "learning_rate": 4.6205289277811995e-06, + "loss": 0.0951, + "num_input_tokens_seen": 7890912, + "step": 4030 + }, + { + "epoch": 0.534261100066269, + "grad_norm": 12.841143608093262, + "learning_rate": 4.620345043438748e-06, + "loss": 0.1, + "num_input_tokens_seen": 7892672, + "step": 4031 + }, + { + "epoch": 0.5343936381709742, + "grad_norm": 7.515597343444824, + "learning_rate": 4.620161118214583e-06, + "loss": 0.067, + "num_input_tokens_seen": 7894280, + "step": 4032 + }, + { + "epoch": 0.5345261762756792, + "grad_norm": 0.0845167338848114, + "learning_rate": 4.619977152112248e-06, + "loss": 0.0006, + "num_input_tokens_seen": 7895368, + "step": 4033 + }, + { + "epoch": 0.5346587143803844, + "grad_norm": 13.487278938293457, + "learning_rate": 4.619793145135292e-06, + "loss": 0.6303, + "num_input_tokens_seen": 7898272, + "step": 4034 + }, + { + "epoch": 0.5347912524850894, + "grad_norm": 4.179569244384766, + "learning_rate": 4.619609097287261e-06, + "loss": 0.0298, + "num_input_tokens_seen": 7900632, + "step": 4035 + }, + { + "epoch": 0.5349237905897946, + "grad_norm": 0.19195279479026794, + "learning_rate": 4.619425008571705e-06, + "loss": 0.0012, + "num_input_tokens_seen": 7903104, + "step": 4036 + }, + { + "epoch": 0.5350563286944997, + "grad_norm": 0.1777542531490326, + "learning_rate": 4.619240878992172e-06, + "loss": 0.0012, + "num_input_tokens_seen": 7906272, + "step": 4037 + }, + { + "epoch": 0.5351888667992047, + "grad_norm": 0.2937557101249695, + "learning_rate": 4.6190567085522134e-06, + "loss": 0.0018, + "num_input_tokens_seen": 7908792, + "step": 4038 + }, + { + "epoch": 0.5353214049039099, + "grad_norm": 1.8593034744262695, + "learning_rate": 4.618872497255379e-06, + "loss": 0.0385, + "num_input_tokens_seen": 7910448, + "step": 4039 + }, + { + "epoch": 0.5354539430086149, + "grad_norm": 0.1838109791278839, + "learning_rate": 4.618688245105222e-06, + "loss": 0.0012, + "num_input_tokens_seen": 7912440, + "step": 4040 + }, + { + "epoch": 0.5355864811133201, + "grad_norm": 16.996774673461914, + "learning_rate": 4.618503952105293e-06, + "loss": 0.3133, + "num_input_tokens_seen": 7914224, + "step": 4041 + }, + { + "epoch": 0.5357190192180252, + "grad_norm": 37.801334381103516, + "learning_rate": 4.6183196182591465e-06, + "loss": 0.4917, + "num_input_tokens_seen": 7915528, + "step": 4042 + }, + { + "epoch": 0.5358515573227303, + "grad_norm": 5.75848388671875, + "learning_rate": 4.618135243570336e-06, + "loss": 0.2948, + "num_input_tokens_seen": 7917048, + "step": 4043 + }, + { + "epoch": 0.5359840954274354, + "grad_norm": 11.034120559692383, + "learning_rate": 4.617950828042418e-06, + "loss": 0.3539, + "num_input_tokens_seen": 7918848, + "step": 4044 + }, + { + "epoch": 0.5361166335321405, + "grad_norm": 0.11634110659360886, + "learning_rate": 4.617766371678947e-06, + "loss": 0.0007, + "num_input_tokens_seen": 7920152, + "step": 4045 + }, + { + "epoch": 0.5362491716368456, + "grad_norm": 13.197975158691406, + "learning_rate": 4.6175818744834785e-06, + "loss": 0.4056, + "num_input_tokens_seen": 7921784, + "step": 4046 + }, + { + "epoch": 0.5363817097415507, + "grad_norm": 0.3431829512119293, + "learning_rate": 4.61739733645957e-06, + "loss": 0.002, + "num_input_tokens_seen": 7924152, + "step": 4047 + }, + { + "epoch": 0.5365142478462558, + "grad_norm": 14.108015060424805, + "learning_rate": 4.617212757610782e-06, + "loss": 0.5635, + "num_input_tokens_seen": 7926256, + "step": 4048 + }, + { + "epoch": 0.5366467859509609, + "grad_norm": 2.9647858142852783, + "learning_rate": 4.61702813794067e-06, + "loss": 0.0937, + "num_input_tokens_seen": 7927576, + "step": 4049 + }, + { + "epoch": 0.536779324055666, + "grad_norm": 12.520955085754395, + "learning_rate": 4.616843477452796e-06, + "loss": 0.3035, + "num_input_tokens_seen": 7929760, + "step": 4050 + }, + { + "epoch": 0.5369118621603711, + "grad_norm": 9.80131721496582, + "learning_rate": 4.616658776150719e-06, + "loss": 0.1621, + "num_input_tokens_seen": 7931136, + "step": 4051 + }, + { + "epoch": 0.5370444002650762, + "grad_norm": 12.030084609985352, + "learning_rate": 4.616474034038e-06, + "loss": 0.2362, + "num_input_tokens_seen": 7933512, + "step": 4052 + }, + { + "epoch": 0.5371769383697813, + "grad_norm": 10.376489639282227, + "learning_rate": 4.616289251118202e-06, + "loss": 0.3434, + "num_input_tokens_seen": 7936040, + "step": 4053 + }, + { + "epoch": 0.5373094764744865, + "grad_norm": 8.85754108428955, + "learning_rate": 4.616104427394887e-06, + "loss": 0.0862, + "num_input_tokens_seen": 7937936, + "step": 4054 + }, + { + "epoch": 0.5374420145791915, + "grad_norm": 0.46287301182746887, + "learning_rate": 4.6159195628716195e-06, + "loss": 0.0027, + "num_input_tokens_seen": 7939512, + "step": 4055 + }, + { + "epoch": 0.5375745526838966, + "grad_norm": 0.19545811414718628, + "learning_rate": 4.615734657551963e-06, + "loss": 0.0012, + "num_input_tokens_seen": 7940936, + "step": 4056 + }, + { + "epoch": 0.5377070907886017, + "grad_norm": 7.342900276184082, + "learning_rate": 4.615549711439483e-06, + "loss": 0.2312, + "num_input_tokens_seen": 7943168, + "step": 4057 + }, + { + "epoch": 0.5378396288933068, + "grad_norm": 6.551323890686035, + "learning_rate": 4.615364724537744e-06, + "loss": 0.171, + "num_input_tokens_seen": 7944704, + "step": 4058 + }, + { + "epoch": 0.537972166998012, + "grad_norm": 11.921992301940918, + "learning_rate": 4.615179696850315e-06, + "loss": 0.2878, + "num_input_tokens_seen": 7946048, + "step": 4059 + }, + { + "epoch": 0.538104705102717, + "grad_norm": 10.918295860290527, + "learning_rate": 4.614994628380761e-06, + "loss": 0.1427, + "num_input_tokens_seen": 7947616, + "step": 4060 + }, + { + "epoch": 0.5382372432074222, + "grad_norm": 11.038885116577148, + "learning_rate": 4.614809519132653e-06, + "loss": 0.3313, + "num_input_tokens_seen": 7949792, + "step": 4061 + }, + { + "epoch": 0.5383697813121272, + "grad_norm": 6.8225932121276855, + "learning_rate": 4.614624369109557e-06, + "loss": 0.0921, + "num_input_tokens_seen": 7951304, + "step": 4062 + }, + { + "epoch": 0.5385023194168324, + "grad_norm": 5.132601261138916, + "learning_rate": 4.614439178315045e-06, + "loss": 0.0411, + "num_input_tokens_seen": 7953088, + "step": 4063 + }, + { + "epoch": 0.5386348575215374, + "grad_norm": 0.7638110518455505, + "learning_rate": 4.614253946752687e-06, + "loss": 0.0035, + "num_input_tokens_seen": 7955376, + "step": 4064 + }, + { + "epoch": 0.5387673956262425, + "grad_norm": 6.3938117027282715, + "learning_rate": 4.614068674426054e-06, + "loss": 0.0994, + "num_input_tokens_seen": 7956944, + "step": 4065 + }, + { + "epoch": 0.5388999337309477, + "grad_norm": 8.795907020568848, + "learning_rate": 4.613883361338719e-06, + "loss": 0.087, + "num_input_tokens_seen": 7959152, + "step": 4066 + }, + { + "epoch": 0.5390324718356527, + "grad_norm": 36.54825210571289, + "learning_rate": 4.613698007494255e-06, + "loss": 0.5746, + "num_input_tokens_seen": 7961368, + "step": 4067 + }, + { + "epoch": 0.5391650099403579, + "grad_norm": 14.218330383300781, + "learning_rate": 4.613512612896235e-06, + "loss": 0.2931, + "num_input_tokens_seen": 7963544, + "step": 4068 + }, + { + "epoch": 0.5392975480450629, + "grad_norm": 0.17901967465877533, + "learning_rate": 4.613327177548234e-06, + "loss": 0.0012, + "num_input_tokens_seen": 7965288, + "step": 4069 + }, + { + "epoch": 0.5394300861497681, + "grad_norm": 0.1499279886484146, + "learning_rate": 4.613141701453826e-06, + "loss": 0.001, + "num_input_tokens_seen": 7966680, + "step": 4070 + }, + { + "epoch": 0.5395626242544732, + "grad_norm": 9.754743576049805, + "learning_rate": 4.612956184616589e-06, + "loss": 0.3682, + "num_input_tokens_seen": 7968784, + "step": 4071 + }, + { + "epoch": 0.5396951623591782, + "grad_norm": 20.259737014770508, + "learning_rate": 4.612770627040098e-06, + "loss": 0.4256, + "num_input_tokens_seen": 7971256, + "step": 4072 + }, + { + "epoch": 0.5398277004638834, + "grad_norm": 0.09793662279844284, + "learning_rate": 4.612585028727933e-06, + "loss": 0.0006, + "num_input_tokens_seen": 7972480, + "step": 4073 + }, + { + "epoch": 0.5399602385685884, + "grad_norm": 10.20346450805664, + "learning_rate": 4.612399389683671e-06, + "loss": 0.1662, + "num_input_tokens_seen": 7974248, + "step": 4074 + }, + { + "epoch": 0.5400927766732936, + "grad_norm": 0.11752693355083466, + "learning_rate": 4.612213709910892e-06, + "loss": 0.0007, + "num_input_tokens_seen": 7975808, + "step": 4075 + }, + { + "epoch": 0.5402253147779986, + "grad_norm": 12.99082088470459, + "learning_rate": 4.612027989413175e-06, + "loss": 0.3927, + "num_input_tokens_seen": 7977928, + "step": 4076 + }, + { + "epoch": 0.5403578528827038, + "grad_norm": 3.2054526805877686, + "learning_rate": 4.611842228194101e-06, + "loss": 0.0354, + "num_input_tokens_seen": 7979800, + "step": 4077 + }, + { + "epoch": 0.5404903909874089, + "grad_norm": 8.888463973999023, + "learning_rate": 4.611656426257253e-06, + "loss": 0.3038, + "num_input_tokens_seen": 7981688, + "step": 4078 + }, + { + "epoch": 0.540622929092114, + "grad_norm": 2.5232126712799072, + "learning_rate": 4.611470583606212e-06, + "loss": 0.1064, + "num_input_tokens_seen": 7983792, + "step": 4079 + }, + { + "epoch": 0.5407554671968191, + "grad_norm": 0.02956196665763855, + "learning_rate": 4.611284700244561e-06, + "loss": 0.0002, + "num_input_tokens_seen": 7984920, + "step": 4080 + }, + { + "epoch": 0.5408880053015241, + "grad_norm": 14.2171630859375, + "learning_rate": 4.6110987761758834e-06, + "loss": 0.3936, + "num_input_tokens_seen": 7986904, + "step": 4081 + }, + { + "epoch": 0.5410205434062293, + "grad_norm": 3.6032845973968506, + "learning_rate": 4.610912811403766e-06, + "loss": 0.0433, + "num_input_tokens_seen": 7988760, + "step": 4082 + }, + { + "epoch": 0.5411530815109344, + "grad_norm": 1.8011516332626343, + "learning_rate": 4.610726805931794e-06, + "loss": 0.0368, + "num_input_tokens_seen": 7990608, + "step": 4083 + }, + { + "epoch": 0.5412856196156395, + "grad_norm": 8.52591323852539, + "learning_rate": 4.610540759763552e-06, + "loss": 0.0661, + "num_input_tokens_seen": 7993152, + "step": 4084 + }, + { + "epoch": 0.5414181577203446, + "grad_norm": 6.99245548248291, + "learning_rate": 4.6103546729026295e-06, + "loss": 0.2279, + "num_input_tokens_seen": 7995624, + "step": 4085 + }, + { + "epoch": 0.5415506958250497, + "grad_norm": 10.101627349853516, + "learning_rate": 4.610168545352612e-06, + "loss": 0.1242, + "num_input_tokens_seen": 7998024, + "step": 4086 + }, + { + "epoch": 0.5416832339297548, + "grad_norm": 0.054514020681381226, + "learning_rate": 4.6099823771170894e-06, + "loss": 0.0003, + "num_input_tokens_seen": 7999984, + "step": 4087 + }, + { + "epoch": 0.5418157720344599, + "grad_norm": 7.615383625030518, + "learning_rate": 4.6097961681996516e-06, + "loss": 0.2287, + "num_input_tokens_seen": 8001928, + "step": 4088 + }, + { + "epoch": 0.541948310139165, + "grad_norm": 0.05668923258781433, + "learning_rate": 4.6096099186038875e-06, + "loss": 0.0003, + "num_input_tokens_seen": 8004424, + "step": 4089 + }, + { + "epoch": 0.5420808482438702, + "grad_norm": 0.05965054780244827, + "learning_rate": 4.60942362833339e-06, + "loss": 0.0004, + "num_input_tokens_seen": 8005832, + "step": 4090 + }, + { + "epoch": 0.5422133863485752, + "grad_norm": 1.9437217712402344, + "learning_rate": 4.6092372973917484e-06, + "loss": 0.0404, + "num_input_tokens_seen": 8007856, + "step": 4091 + }, + { + "epoch": 0.5423459244532803, + "grad_norm": 0.5218856930732727, + "learning_rate": 4.609050925782557e-06, + "loss": 0.0025, + "num_input_tokens_seen": 8009712, + "step": 4092 + }, + { + "epoch": 0.5424784625579854, + "grad_norm": 15.88712215423584, + "learning_rate": 4.608864513509409e-06, + "loss": 0.6892, + "num_input_tokens_seen": 8011904, + "step": 4093 + }, + { + "epoch": 0.5426110006626905, + "grad_norm": 0.11739930510520935, + "learning_rate": 4.608678060575899e-06, + "loss": 0.0007, + "num_input_tokens_seen": 8013544, + "step": 4094 + }, + { + "epoch": 0.5427435387673957, + "grad_norm": 4.469979286193848, + "learning_rate": 4.6084915669856195e-06, + "loss": 0.1359, + "num_input_tokens_seen": 8015592, + "step": 4095 + }, + { + "epoch": 0.5428760768721007, + "grad_norm": 10.837701797485352, + "learning_rate": 4.608305032742169e-06, + "loss": 0.3234, + "num_input_tokens_seen": 8017472, + "step": 4096 + }, + { + "epoch": 0.5430086149768059, + "grad_norm": 11.945828437805176, + "learning_rate": 4.608118457849143e-06, + "loss": 0.2767, + "num_input_tokens_seen": 8019800, + "step": 4097 + }, + { + "epoch": 0.5431411530815109, + "grad_norm": 3.062751054763794, + "learning_rate": 4.607931842310139e-06, + "loss": 0.1484, + "num_input_tokens_seen": 8021320, + "step": 4098 + }, + { + "epoch": 0.543273691186216, + "grad_norm": 10.515359878540039, + "learning_rate": 4.607745186128755e-06, + "loss": 0.2404, + "num_input_tokens_seen": 8023832, + "step": 4099 + }, + { + "epoch": 0.5434062292909212, + "grad_norm": 5.204316139221191, + "learning_rate": 4.607558489308589e-06, + "loss": 0.0868, + "num_input_tokens_seen": 8025768, + "step": 4100 + }, + { + "epoch": 0.5435387673956262, + "grad_norm": 15.432523727416992, + "learning_rate": 4.6073717518532415e-06, + "loss": 0.2798, + "num_input_tokens_seen": 8027904, + "step": 4101 + }, + { + "epoch": 0.5436713055003314, + "grad_norm": 4.542093753814697, + "learning_rate": 4.607184973766313e-06, + "loss": 0.0831, + "num_input_tokens_seen": 8030400, + "step": 4102 + }, + { + "epoch": 0.5438038436050364, + "grad_norm": 4.5041728019714355, + "learning_rate": 4.606998155051405e-06, + "loss": 0.1123, + "num_input_tokens_seen": 8032952, + "step": 4103 + }, + { + "epoch": 0.5439363817097416, + "grad_norm": 0.08445163071155548, + "learning_rate": 4.606811295712119e-06, + "loss": 0.0005, + "num_input_tokens_seen": 8034408, + "step": 4104 + }, + { + "epoch": 0.5440689198144466, + "grad_norm": 21.89432716369629, + "learning_rate": 4.606624395752057e-06, + "loss": 0.6877, + "num_input_tokens_seen": 8038504, + "step": 4105 + }, + { + "epoch": 0.5442014579191518, + "grad_norm": 0.18032686412334442, + "learning_rate": 4.606437455174825e-06, + "loss": 0.0012, + "num_input_tokens_seen": 8040240, + "step": 4106 + }, + { + "epoch": 0.5443339960238569, + "grad_norm": 10.134572982788086, + "learning_rate": 4.606250473984024e-06, + "loss": 0.3267, + "num_input_tokens_seen": 8042480, + "step": 4107 + }, + { + "epoch": 0.5444665341285619, + "grad_norm": 9.405549049377441, + "learning_rate": 4.606063452183261e-06, + "loss": 0.2923, + "num_input_tokens_seen": 8044736, + "step": 4108 + }, + { + "epoch": 0.5445990722332671, + "grad_norm": 0.4269721508026123, + "learning_rate": 4.605876389776143e-06, + "loss": 0.0022, + "num_input_tokens_seen": 8046000, + "step": 4109 + }, + { + "epoch": 0.5447316103379721, + "grad_norm": 5.793757438659668, + "learning_rate": 4.605689286766274e-06, + "loss": 0.2216, + "num_input_tokens_seen": 8047744, + "step": 4110 + }, + { + "epoch": 0.5448641484426773, + "grad_norm": 5.584360122680664, + "learning_rate": 4.6055021431572636e-06, + "loss": 0.1198, + "num_input_tokens_seen": 8049960, + "step": 4111 + }, + { + "epoch": 0.5449966865473824, + "grad_norm": 4.206791877746582, + "learning_rate": 4.605314958952719e-06, + "loss": 0.0128, + "num_input_tokens_seen": 8052208, + "step": 4112 + }, + { + "epoch": 0.5451292246520875, + "grad_norm": 3.2708704471588135, + "learning_rate": 4.605127734156251e-06, + "loss": 0.1091, + "num_input_tokens_seen": 8054232, + "step": 4113 + }, + { + "epoch": 0.5452617627567926, + "grad_norm": 0.17672087252140045, + "learning_rate": 4.604940468771466e-06, + "loss": 0.0012, + "num_input_tokens_seen": 8055912, + "step": 4114 + }, + { + "epoch": 0.5453943008614976, + "grad_norm": 6.984878063201904, + "learning_rate": 4.604753162801978e-06, + "loss": 0.1857, + "num_input_tokens_seen": 8057800, + "step": 4115 + }, + { + "epoch": 0.5455268389662028, + "grad_norm": 0.8402068614959717, + "learning_rate": 4.6045658162513964e-06, + "loss": 0.0121, + "num_input_tokens_seen": 8059480, + "step": 4116 + }, + { + "epoch": 0.5456593770709078, + "grad_norm": 0.26560404896736145, + "learning_rate": 4.604378429123334e-06, + "loss": 0.0016, + "num_input_tokens_seen": 8060968, + "step": 4117 + }, + { + "epoch": 0.545791915175613, + "grad_norm": 0.13185304403305054, + "learning_rate": 4.604191001421405e-06, + "loss": 0.0009, + "num_input_tokens_seen": 8063056, + "step": 4118 + }, + { + "epoch": 0.5459244532803181, + "grad_norm": 12.191259384155273, + "learning_rate": 4.60400353314922e-06, + "loss": 0.2488, + "num_input_tokens_seen": 8064904, + "step": 4119 + }, + { + "epoch": 0.5460569913850232, + "grad_norm": 0.08191758394241333, + "learning_rate": 4.6038160243103965e-06, + "loss": 0.0005, + "num_input_tokens_seen": 8066024, + "step": 4120 + }, + { + "epoch": 0.5461895294897283, + "grad_norm": 12.817622184753418, + "learning_rate": 4.603628474908548e-06, + "loss": 0.3704, + "num_input_tokens_seen": 8067936, + "step": 4121 + }, + { + "epoch": 0.5463220675944334, + "grad_norm": 5.399591445922852, + "learning_rate": 4.603440884947292e-06, + "loss": 0.0886, + "num_input_tokens_seen": 8069768, + "step": 4122 + }, + { + "epoch": 0.5464546056991385, + "grad_norm": 5.079294681549072, + "learning_rate": 4.603253254430244e-06, + "loss": 0.1308, + "num_input_tokens_seen": 8072032, + "step": 4123 + }, + { + "epoch": 0.5465871438038437, + "grad_norm": 11.848227500915527, + "learning_rate": 4.6030655833610235e-06, + "loss": 0.407, + "num_input_tokens_seen": 8074464, + "step": 4124 + }, + { + "epoch": 0.5467196819085487, + "grad_norm": 8.840668678283691, + "learning_rate": 4.602877871743246e-06, + "loss": 0.3576, + "num_input_tokens_seen": 8076752, + "step": 4125 + }, + { + "epoch": 0.5468522200132538, + "grad_norm": 11.436777114868164, + "learning_rate": 4.602690119580534e-06, + "loss": 0.3846, + "num_input_tokens_seen": 8078160, + "step": 4126 + }, + { + "epoch": 0.5469847581179589, + "grad_norm": 16.899656295776367, + "learning_rate": 4.602502326876505e-06, + "loss": 0.4454, + "num_input_tokens_seen": 8081056, + "step": 4127 + }, + { + "epoch": 0.547117296222664, + "grad_norm": 11.110140800476074, + "learning_rate": 4.602314493634782e-06, + "loss": 0.3647, + "num_input_tokens_seen": 8082840, + "step": 4128 + }, + { + "epoch": 0.5472498343273691, + "grad_norm": 12.675568580627441, + "learning_rate": 4.602126619858983e-06, + "loss": 0.4172, + "num_input_tokens_seen": 8084344, + "step": 4129 + }, + { + "epoch": 0.5473823724320742, + "grad_norm": 16.5428466796875, + "learning_rate": 4.601938705552734e-06, + "loss": 0.5982, + "num_input_tokens_seen": 8085648, + "step": 4130 + }, + { + "epoch": 0.5475149105367794, + "grad_norm": 11.334456443786621, + "learning_rate": 4.601750750719657e-06, + "loss": 0.0772, + "num_input_tokens_seen": 8087776, + "step": 4131 + }, + { + "epoch": 0.5476474486414844, + "grad_norm": 10.161344528198242, + "learning_rate": 4.601562755363374e-06, + "loss": 0.1594, + "num_input_tokens_seen": 8090448, + "step": 4132 + }, + { + "epoch": 0.5477799867461896, + "grad_norm": 12.087780952453613, + "learning_rate": 4.601374719487513e-06, + "loss": 0.1989, + "num_input_tokens_seen": 8092848, + "step": 4133 + }, + { + "epoch": 0.5479125248508946, + "grad_norm": 6.118009567260742, + "learning_rate": 4.601186643095698e-06, + "loss": 0.1896, + "num_input_tokens_seen": 8095456, + "step": 4134 + }, + { + "epoch": 0.5480450629555997, + "grad_norm": 6.1144256591796875, + "learning_rate": 4.6009985261915536e-06, + "loss": 0.1067, + "num_input_tokens_seen": 8097120, + "step": 4135 + }, + { + "epoch": 0.5481776010603049, + "grad_norm": 7.965542316436768, + "learning_rate": 4.600810368778708e-06, + "loss": 0.1284, + "num_input_tokens_seen": 8098760, + "step": 4136 + }, + { + "epoch": 0.5483101391650099, + "grad_norm": 6.611481666564941, + "learning_rate": 4.600622170860791e-06, + "loss": 0.1039, + "num_input_tokens_seen": 8101072, + "step": 4137 + }, + { + "epoch": 0.5484426772697151, + "grad_norm": 4.470078468322754, + "learning_rate": 4.6004339324414286e-06, + "loss": 0.0796, + "num_input_tokens_seen": 8103208, + "step": 4138 + }, + { + "epoch": 0.5485752153744201, + "grad_norm": 8.41801929473877, + "learning_rate": 4.60024565352425e-06, + "loss": 0.2553, + "num_input_tokens_seen": 8104968, + "step": 4139 + }, + { + "epoch": 0.5487077534791253, + "grad_norm": 0.7607479691505432, + "learning_rate": 4.600057334112887e-06, + "loss": 0.0036, + "num_input_tokens_seen": 8108192, + "step": 4140 + }, + { + "epoch": 0.5488402915838303, + "grad_norm": 6.524007797241211, + "learning_rate": 4.59986897421097e-06, + "loss": 0.1332, + "num_input_tokens_seen": 8110184, + "step": 4141 + }, + { + "epoch": 0.5489728296885354, + "grad_norm": 2.2956554889678955, + "learning_rate": 4.599680573822131e-06, + "loss": 0.0141, + "num_input_tokens_seen": 8111800, + "step": 4142 + }, + { + "epoch": 0.5491053677932406, + "grad_norm": 0.26627495884895325, + "learning_rate": 4.599492132950001e-06, + "loss": 0.0017, + "num_input_tokens_seen": 8113560, + "step": 4143 + }, + { + "epoch": 0.5492379058979456, + "grad_norm": 11.378226280212402, + "learning_rate": 4.5993036515982144e-06, + "loss": 0.2734, + "num_input_tokens_seen": 8116200, + "step": 4144 + }, + { + "epoch": 0.5493704440026508, + "grad_norm": 10.30105972290039, + "learning_rate": 4.599115129770406e-06, + "loss": 0.1435, + "num_input_tokens_seen": 8117912, + "step": 4145 + }, + { + "epoch": 0.5495029821073558, + "grad_norm": 5.610369682312012, + "learning_rate": 4.598926567470209e-06, + "loss": 0.029, + "num_input_tokens_seen": 8120696, + "step": 4146 + }, + { + "epoch": 0.549635520212061, + "grad_norm": 9.871245384216309, + "learning_rate": 4.59873796470126e-06, + "loss": 0.2819, + "num_input_tokens_seen": 8124000, + "step": 4147 + }, + { + "epoch": 0.5497680583167661, + "grad_norm": 0.10486636310815811, + "learning_rate": 4.598549321467195e-06, + "loss": 0.0007, + "num_input_tokens_seen": 8125792, + "step": 4148 + }, + { + "epoch": 0.5499005964214712, + "grad_norm": 0.2691498398780823, + "learning_rate": 4.598360637771651e-06, + "loss": 0.0017, + "num_input_tokens_seen": 8128056, + "step": 4149 + }, + { + "epoch": 0.5500331345261763, + "grad_norm": 0.14830152690410614, + "learning_rate": 4.5981719136182675e-06, + "loss": 0.001, + "num_input_tokens_seen": 8129736, + "step": 4150 + }, + { + "epoch": 0.5501656726308813, + "grad_norm": 0.10912959277629852, + "learning_rate": 4.5979831490106815e-06, + "loss": 0.0007, + "num_input_tokens_seen": 8131384, + "step": 4151 + }, + { + "epoch": 0.5502982107355865, + "grad_norm": 3.2231178283691406, + "learning_rate": 4.597794343952533e-06, + "loss": 0.0273, + "num_input_tokens_seen": 8133368, + "step": 4152 + }, + { + "epoch": 0.5504307488402916, + "grad_norm": 3.418339967727661, + "learning_rate": 4.597605498447463e-06, + "loss": 0.0837, + "num_input_tokens_seen": 8135504, + "step": 4153 + }, + { + "epoch": 0.5505632869449967, + "grad_norm": 9.763542175292969, + "learning_rate": 4.5974166124991116e-06, + "loss": 0.3183, + "num_input_tokens_seen": 8137800, + "step": 4154 + }, + { + "epoch": 0.5506958250497018, + "grad_norm": 3.4293720722198486, + "learning_rate": 4.597227686111121e-06, + "loss": 0.1011, + "num_input_tokens_seen": 8139472, + "step": 4155 + }, + { + "epoch": 0.5508283631544069, + "grad_norm": 14.084505081176758, + "learning_rate": 4.597038719287134e-06, + "loss": 0.3703, + "num_input_tokens_seen": 8141304, + "step": 4156 + }, + { + "epoch": 0.550960901259112, + "grad_norm": 11.821980476379395, + "learning_rate": 4.596849712030793e-06, + "loss": 0.118, + "num_input_tokens_seen": 8143496, + "step": 4157 + }, + { + "epoch": 0.551093439363817, + "grad_norm": 6.229593276977539, + "learning_rate": 4.596660664345744e-06, + "loss": 0.1725, + "num_input_tokens_seen": 8144952, + "step": 4158 + }, + { + "epoch": 0.5512259774685222, + "grad_norm": 8.892006874084473, + "learning_rate": 4.5964715762356295e-06, + "loss": 0.286, + "num_input_tokens_seen": 8146776, + "step": 4159 + }, + { + "epoch": 0.5513585155732273, + "grad_norm": 0.23390333354473114, + "learning_rate": 4.596282447704098e-06, + "loss": 0.0013, + "num_input_tokens_seen": 8149152, + "step": 4160 + }, + { + "epoch": 0.5514910536779324, + "grad_norm": 1.6519416570663452, + "learning_rate": 4.596093278754794e-06, + "loss": 0.0455, + "num_input_tokens_seen": 8150800, + "step": 4161 + }, + { + "epoch": 0.5516235917826375, + "grad_norm": 0.04528777673840523, + "learning_rate": 4.595904069391366e-06, + "loss": 0.0003, + "num_input_tokens_seen": 8153088, + "step": 4162 + }, + { + "epoch": 0.5517561298873426, + "grad_norm": 4.183203220367432, + "learning_rate": 4.595714819617462e-06, + "loss": 0.0444, + "num_input_tokens_seen": 8155552, + "step": 4163 + }, + { + "epoch": 0.5518886679920477, + "grad_norm": 11.055419921875, + "learning_rate": 4.595525529436729e-06, + "loss": 0.1988, + "num_input_tokens_seen": 8156976, + "step": 4164 + }, + { + "epoch": 0.5520212060967529, + "grad_norm": 0.04133330658078194, + "learning_rate": 4.59533619885282e-06, + "loss": 0.0003, + "num_input_tokens_seen": 8158224, + "step": 4165 + }, + { + "epoch": 0.5521537442014579, + "grad_norm": 9.351491928100586, + "learning_rate": 4.595146827869383e-06, + "loss": 0.2428, + "num_input_tokens_seen": 8159792, + "step": 4166 + }, + { + "epoch": 0.5522862823061631, + "grad_norm": 15.061875343322754, + "learning_rate": 4.594957416490069e-06, + "loss": 0.4489, + "num_input_tokens_seen": 8162024, + "step": 4167 + }, + { + "epoch": 0.5524188204108681, + "grad_norm": 10.789448738098145, + "learning_rate": 4.594767964718532e-06, + "loss": 0.4409, + "num_input_tokens_seen": 8164104, + "step": 4168 + }, + { + "epoch": 0.5525513585155732, + "grad_norm": 9.472795486450195, + "learning_rate": 4.594578472558423e-06, + "loss": 0.3299, + "num_input_tokens_seen": 8165760, + "step": 4169 + }, + { + "epoch": 0.5526838966202783, + "grad_norm": 11.541179656982422, + "learning_rate": 4.5943889400133955e-06, + "loss": 0.4529, + "num_input_tokens_seen": 8167392, + "step": 4170 + }, + { + "epoch": 0.5528164347249834, + "grad_norm": 6.17018461227417, + "learning_rate": 4.594199367087105e-06, + "loss": 0.0775, + "num_input_tokens_seen": 8169200, + "step": 4171 + }, + { + "epoch": 0.5529489728296886, + "grad_norm": 11.9037446975708, + "learning_rate": 4.594009753783206e-06, + "loss": 0.2757, + "num_input_tokens_seen": 8171088, + "step": 4172 + }, + { + "epoch": 0.5530815109343936, + "grad_norm": 4.553421497344971, + "learning_rate": 4.593820100105355e-06, + "loss": 0.0451, + "num_input_tokens_seen": 8172576, + "step": 4173 + }, + { + "epoch": 0.5532140490390988, + "grad_norm": 8.459004402160645, + "learning_rate": 4.593630406057207e-06, + "loss": 0.3134, + "num_input_tokens_seen": 8175888, + "step": 4174 + }, + { + "epoch": 0.5533465871438038, + "grad_norm": 15.0360689163208, + "learning_rate": 4.593440671642421e-06, + "loss": 0.2604, + "num_input_tokens_seen": 8177768, + "step": 4175 + }, + { + "epoch": 0.553479125248509, + "grad_norm": 12.067193984985352, + "learning_rate": 4.593250896864654e-06, + "loss": 0.212, + "num_input_tokens_seen": 8180112, + "step": 4176 + }, + { + "epoch": 0.5536116633532141, + "grad_norm": 5.24225378036499, + "learning_rate": 4.5930610817275664e-06, + "loss": 0.1322, + "num_input_tokens_seen": 8181944, + "step": 4177 + }, + { + "epoch": 0.5537442014579191, + "grad_norm": 1.855466604232788, + "learning_rate": 4.592871226234816e-06, + "loss": 0.0104, + "num_input_tokens_seen": 8183728, + "step": 4178 + }, + { + "epoch": 0.5538767395626243, + "grad_norm": 10.864151954650879, + "learning_rate": 4.592681330390066e-06, + "loss": 0.2209, + "num_input_tokens_seen": 8185720, + "step": 4179 + }, + { + "epoch": 0.5540092776673293, + "grad_norm": 8.799050331115723, + "learning_rate": 4.592491394196976e-06, + "loss": 0.17, + "num_input_tokens_seen": 8189224, + "step": 4180 + }, + { + "epoch": 0.5541418157720345, + "grad_norm": 9.319714546203613, + "learning_rate": 4.592301417659208e-06, + "loss": 0.2978, + "num_input_tokens_seen": 8191560, + "step": 4181 + }, + { + "epoch": 0.5542743538767395, + "grad_norm": 5.550588607788086, + "learning_rate": 4.592111400780425e-06, + "loss": 0.0502, + "num_input_tokens_seen": 8192968, + "step": 4182 + }, + { + "epoch": 0.5544068919814447, + "grad_norm": 0.8211635947227478, + "learning_rate": 4.591921343564292e-06, + "loss": 0.0044, + "num_input_tokens_seen": 8194904, + "step": 4183 + }, + { + "epoch": 0.5545394300861498, + "grad_norm": 8.733579635620117, + "learning_rate": 4.591731246014471e-06, + "loss": 0.1681, + "num_input_tokens_seen": 8196936, + "step": 4184 + }, + { + "epoch": 0.5546719681908548, + "grad_norm": 11.438721656799316, + "learning_rate": 4.5915411081346296e-06, + "loss": 0.3361, + "num_input_tokens_seen": 8198632, + "step": 4185 + }, + { + "epoch": 0.55480450629556, + "grad_norm": 4.61662483215332, + "learning_rate": 4.591350929928432e-06, + "loss": 0.0978, + "num_input_tokens_seen": 8199856, + "step": 4186 + }, + { + "epoch": 0.554937044400265, + "grad_norm": 8.586577415466309, + "learning_rate": 4.591160711399546e-06, + "loss": 0.3735, + "num_input_tokens_seen": 8203136, + "step": 4187 + }, + { + "epoch": 0.5550695825049702, + "grad_norm": 1.3087859153747559, + "learning_rate": 4.590970452551639e-06, + "loss": 0.0266, + "num_input_tokens_seen": 8204672, + "step": 4188 + }, + { + "epoch": 0.5552021206096753, + "grad_norm": 7.745833396911621, + "learning_rate": 4.5907801533883795e-06, + "loss": 0.2347, + "num_input_tokens_seen": 8206472, + "step": 4189 + }, + { + "epoch": 0.5553346587143804, + "grad_norm": 18.175031661987305, + "learning_rate": 4.590589813913436e-06, + "loss": 0.4127, + "num_input_tokens_seen": 8207968, + "step": 4190 + }, + { + "epoch": 0.5554671968190855, + "grad_norm": 0.12182530760765076, + "learning_rate": 4.590399434130478e-06, + "loss": 0.0008, + "num_input_tokens_seen": 8209560, + "step": 4191 + }, + { + "epoch": 0.5555997349237906, + "grad_norm": 0.27077168226242065, + "learning_rate": 4.590209014043178e-06, + "loss": 0.0018, + "num_input_tokens_seen": 8212488, + "step": 4192 + }, + { + "epoch": 0.5557322730284957, + "grad_norm": 11.877026557922363, + "learning_rate": 4.590018553655205e-06, + "loss": 0.2348, + "num_input_tokens_seen": 8214528, + "step": 4193 + }, + { + "epoch": 0.5558648111332007, + "grad_norm": 1.5523983240127563, + "learning_rate": 4.589828052970233e-06, + "loss": 0.009, + "num_input_tokens_seen": 8216608, + "step": 4194 + }, + { + "epoch": 0.5559973492379059, + "grad_norm": 0.17122042179107666, + "learning_rate": 4.589637511991935e-06, + "loss": 0.0012, + "num_input_tokens_seen": 8218344, + "step": 4195 + }, + { + "epoch": 0.556129887342611, + "grad_norm": 5.337427616119385, + "learning_rate": 4.589446930723984e-06, + "loss": 0.0544, + "num_input_tokens_seen": 8220536, + "step": 4196 + }, + { + "epoch": 0.5562624254473161, + "grad_norm": 0.1055503562092781, + "learning_rate": 4.589256309170055e-06, + "loss": 0.0007, + "num_input_tokens_seen": 8222096, + "step": 4197 + }, + { + "epoch": 0.5563949635520212, + "grad_norm": 5.509279251098633, + "learning_rate": 4.589065647333822e-06, + "loss": 0.1598, + "num_input_tokens_seen": 8223880, + "step": 4198 + }, + { + "epoch": 0.5565275016567263, + "grad_norm": 13.789811134338379, + "learning_rate": 4.5888749452189625e-06, + "loss": 0.7158, + "num_input_tokens_seen": 8226600, + "step": 4199 + }, + { + "epoch": 0.5566600397614314, + "grad_norm": 6.495257377624512, + "learning_rate": 4.588684202829154e-06, + "loss": 0.124, + "num_input_tokens_seen": 8228928, + "step": 4200 + }, + { + "epoch": 0.5567925778661366, + "grad_norm": 6.73385763168335, + "learning_rate": 4.588493420168072e-06, + "loss": 0.1272, + "num_input_tokens_seen": 8230824, + "step": 4201 + }, + { + "epoch": 0.5569251159708416, + "grad_norm": 0.5748089551925659, + "learning_rate": 4.588302597239396e-06, + "loss": 0.0039, + "num_input_tokens_seen": 8233384, + "step": 4202 + }, + { + "epoch": 0.5570576540755467, + "grad_norm": 12.322843551635742, + "learning_rate": 4.588111734046806e-06, + "loss": 0.2668, + "num_input_tokens_seen": 8235344, + "step": 4203 + }, + { + "epoch": 0.5571901921802518, + "grad_norm": 7.579004287719727, + "learning_rate": 4.58792083059398e-06, + "loss": 0.172, + "num_input_tokens_seen": 8237648, + "step": 4204 + }, + { + "epoch": 0.5573227302849569, + "grad_norm": 0.1485251784324646, + "learning_rate": 4.587729886884602e-06, + "loss": 0.001, + "num_input_tokens_seen": 8238984, + "step": 4205 + }, + { + "epoch": 0.557455268389662, + "grad_norm": 3.3117733001708984, + "learning_rate": 4.587538902922351e-06, + "loss": 0.0244, + "num_input_tokens_seen": 8241192, + "step": 4206 + }, + { + "epoch": 0.5575878064943671, + "grad_norm": 9.784858703613281, + "learning_rate": 4.587347878710909e-06, + "loss": 0.2539, + "num_input_tokens_seen": 8243336, + "step": 4207 + }, + { + "epoch": 0.5577203445990723, + "grad_norm": 0.15626606345176697, + "learning_rate": 4.58715681425396e-06, + "loss": 0.0011, + "num_input_tokens_seen": 8245200, + "step": 4208 + }, + { + "epoch": 0.5578528827037773, + "grad_norm": 0.08361523598432541, + "learning_rate": 4.586965709555188e-06, + "loss": 0.0006, + "num_input_tokens_seen": 8246272, + "step": 4209 + }, + { + "epoch": 0.5579854208084825, + "grad_norm": 6.800239562988281, + "learning_rate": 4.586774564618277e-06, + "loss": 0.2555, + "num_input_tokens_seen": 8247912, + "step": 4210 + }, + { + "epoch": 0.5581179589131875, + "grad_norm": 16.345766067504883, + "learning_rate": 4.586583379446914e-06, + "loss": 0.6866, + "num_input_tokens_seen": 8249888, + "step": 4211 + }, + { + "epoch": 0.5582504970178926, + "grad_norm": 10.406549453735352, + "learning_rate": 4.5863921540447834e-06, + "loss": 0.3264, + "num_input_tokens_seen": 8251680, + "step": 4212 + }, + { + "epoch": 0.5583830351225978, + "grad_norm": 0.08881685882806778, + "learning_rate": 4.586200888415571e-06, + "loss": 0.0006, + "num_input_tokens_seen": 8252728, + "step": 4213 + }, + { + "epoch": 0.5585155732273028, + "grad_norm": 12.554142951965332, + "learning_rate": 4.5860095825629695e-06, + "loss": 0.5008, + "num_input_tokens_seen": 8255472, + "step": 4214 + }, + { + "epoch": 0.558648111332008, + "grad_norm": 0.32159581780433655, + "learning_rate": 4.5858182364906615e-06, + "loss": 0.0022, + "num_input_tokens_seen": 8256808, + "step": 4215 + }, + { + "epoch": 0.558780649436713, + "grad_norm": 15.256695747375488, + "learning_rate": 4.5856268502023405e-06, + "loss": 0.673, + "num_input_tokens_seen": 8258736, + "step": 4216 + }, + { + "epoch": 0.5589131875414182, + "grad_norm": 0.2954338788986206, + "learning_rate": 4.585435423701694e-06, + "loss": 0.002, + "num_input_tokens_seen": 8260808, + "step": 4217 + }, + { + "epoch": 0.5590457256461233, + "grad_norm": 14.656773567199707, + "learning_rate": 4.585243956992415e-06, + "loss": 0.5055, + "num_input_tokens_seen": 8262504, + "step": 4218 + }, + { + "epoch": 0.5591782637508284, + "grad_norm": 0.32726091146469116, + "learning_rate": 4.585052450078193e-06, + "loss": 0.0022, + "num_input_tokens_seen": 8265000, + "step": 4219 + }, + { + "epoch": 0.5593108018555335, + "grad_norm": 0.20523113012313843, + "learning_rate": 4.584860902962721e-06, + "loss": 0.0014, + "num_input_tokens_seen": 8266240, + "step": 4220 + }, + { + "epoch": 0.5594433399602385, + "grad_norm": 9.635098457336426, + "learning_rate": 4.584669315649694e-06, + "loss": 0.4355, + "num_input_tokens_seen": 8269000, + "step": 4221 + }, + { + "epoch": 0.5595758780649437, + "grad_norm": 6.79335355758667, + "learning_rate": 4.584477688142803e-06, + "loss": 0.23, + "num_input_tokens_seen": 8271160, + "step": 4222 + }, + { + "epoch": 0.5597084161696487, + "grad_norm": 7.971199035644531, + "learning_rate": 4.584286020445745e-06, + "loss": 0.0878, + "num_input_tokens_seen": 8273000, + "step": 4223 + }, + { + "epoch": 0.5598409542743539, + "grad_norm": 5.691652774810791, + "learning_rate": 4.584094312562215e-06, + "loss": 0.055, + "num_input_tokens_seen": 8274744, + "step": 4224 + }, + { + "epoch": 0.559973492379059, + "grad_norm": 15.90192985534668, + "learning_rate": 4.583902564495909e-06, + "loss": 0.486, + "num_input_tokens_seen": 8276072, + "step": 4225 + }, + { + "epoch": 0.5601060304837641, + "grad_norm": 7.757584095001221, + "learning_rate": 4.583710776250523e-06, + "loss": 0.1326, + "num_input_tokens_seen": 8278080, + "step": 4226 + }, + { + "epoch": 0.5602385685884692, + "grad_norm": 0.4073067307472229, + "learning_rate": 4.583518947829756e-06, + "loss": 0.0027, + "num_input_tokens_seen": 8279440, + "step": 4227 + }, + { + "epoch": 0.5603711066931742, + "grad_norm": 8.02692699432373, + "learning_rate": 4.583327079237307e-06, + "loss": 0.1566, + "num_input_tokens_seen": 8281704, + "step": 4228 + }, + { + "epoch": 0.5605036447978794, + "grad_norm": 0.3048041760921478, + "learning_rate": 4.583135170476873e-06, + "loss": 0.0021, + "num_input_tokens_seen": 8283112, + "step": 4229 + }, + { + "epoch": 0.5606361829025845, + "grad_norm": 18.203075408935547, + "learning_rate": 4.582943221552158e-06, + "loss": 0.7575, + "num_input_tokens_seen": 8284632, + "step": 4230 + }, + { + "epoch": 0.5607687210072896, + "grad_norm": 11.17457103729248, + "learning_rate": 4.582751232466859e-06, + "loss": 0.191, + "num_input_tokens_seen": 8286376, + "step": 4231 + }, + { + "epoch": 0.5609012591119947, + "grad_norm": 13.156503677368164, + "learning_rate": 4.58255920322468e-06, + "loss": 0.4593, + "num_input_tokens_seen": 8288440, + "step": 4232 + }, + { + "epoch": 0.5610337972166998, + "grad_norm": 10.41724967956543, + "learning_rate": 4.582367133829323e-06, + "loss": 0.1182, + "num_input_tokens_seen": 8290960, + "step": 4233 + }, + { + "epoch": 0.5611663353214049, + "grad_norm": 2.760406970977783, + "learning_rate": 4.582175024284491e-06, + "loss": 0.0155, + "num_input_tokens_seen": 8292904, + "step": 4234 + }, + { + "epoch": 0.56129887342611, + "grad_norm": 0.3311265707015991, + "learning_rate": 4.5819828745938885e-06, + "loss": 0.0022, + "num_input_tokens_seen": 8294400, + "step": 4235 + }, + { + "epoch": 0.5614314115308151, + "grad_norm": 22.410964965820312, + "learning_rate": 4.58179068476122e-06, + "loss": 0.5681, + "num_input_tokens_seen": 8297344, + "step": 4236 + }, + { + "epoch": 0.5615639496355203, + "grad_norm": 10.7067289352417, + "learning_rate": 4.58159845479019e-06, + "loss": 0.2121, + "num_input_tokens_seen": 8299584, + "step": 4237 + }, + { + "epoch": 0.5616964877402253, + "grad_norm": 24.17529296875, + "learning_rate": 4.581406184684506e-06, + "loss": 0.9069, + "num_input_tokens_seen": 8301872, + "step": 4238 + }, + { + "epoch": 0.5618290258449304, + "grad_norm": 10.71629524230957, + "learning_rate": 4.581213874447876e-06, + "loss": 0.2664, + "num_input_tokens_seen": 8303592, + "step": 4239 + }, + { + "epoch": 0.5619615639496355, + "grad_norm": 12.087874412536621, + "learning_rate": 4.581021524084006e-06, + "loss": 0.212, + "num_input_tokens_seen": 8305704, + "step": 4240 + }, + { + "epoch": 0.5620941020543406, + "grad_norm": 14.72028636932373, + "learning_rate": 4.580829133596605e-06, + "loss": 0.5102, + "num_input_tokens_seen": 8308336, + "step": 4241 + }, + { + "epoch": 0.5622266401590458, + "grad_norm": 0.7996182441711426, + "learning_rate": 4.580636702989383e-06, + "loss": 0.0044, + "num_input_tokens_seen": 8310800, + "step": 4242 + }, + { + "epoch": 0.5623591782637508, + "grad_norm": 10.016292572021484, + "learning_rate": 4.58044423226605e-06, + "loss": 0.2175, + "num_input_tokens_seen": 8312832, + "step": 4243 + }, + { + "epoch": 0.562491716368456, + "grad_norm": 0.4290071427822113, + "learning_rate": 4.580251721430318e-06, + "loss": 0.003, + "num_input_tokens_seen": 8314344, + "step": 4244 + }, + { + "epoch": 0.562624254473161, + "grad_norm": 10.686200141906738, + "learning_rate": 4.5800591704858965e-06, + "loss": 0.5142, + "num_input_tokens_seen": 8316984, + "step": 4245 + }, + { + "epoch": 0.5627567925778661, + "grad_norm": 8.642796516418457, + "learning_rate": 4.5798665794365e-06, + "loss": 0.2419, + "num_input_tokens_seen": 8319048, + "step": 4246 + }, + { + "epoch": 0.5628893306825712, + "grad_norm": 0.6038889288902283, + "learning_rate": 4.57967394828584e-06, + "loss": 0.0036, + "num_input_tokens_seen": 8320864, + "step": 4247 + }, + { + "epoch": 0.5630218687872763, + "grad_norm": 7.067734241485596, + "learning_rate": 4.5794812770376325e-06, + "loss": 0.1742, + "num_input_tokens_seen": 8322992, + "step": 4248 + }, + { + "epoch": 0.5631544068919815, + "grad_norm": 0.40288838744163513, + "learning_rate": 4.579288565695591e-06, + "loss": 0.0028, + "num_input_tokens_seen": 8324736, + "step": 4249 + }, + { + "epoch": 0.5632869449966865, + "grad_norm": 0.2552488446235657, + "learning_rate": 4.579095814263431e-06, + "loss": 0.0017, + "num_input_tokens_seen": 8326280, + "step": 4250 + }, + { + "epoch": 0.5634194831013917, + "grad_norm": 18.879011154174805, + "learning_rate": 4.57890302274487e-06, + "loss": 0.5748, + "num_input_tokens_seen": 8327696, + "step": 4251 + }, + { + "epoch": 0.5635520212060967, + "grad_norm": 5.923543930053711, + "learning_rate": 4.5787101911436256e-06, + "loss": 0.0616, + "num_input_tokens_seen": 8329384, + "step": 4252 + }, + { + "epoch": 0.5636845593108019, + "grad_norm": 3.0735182762145996, + "learning_rate": 4.5785173194634135e-06, + "loss": 0.0423, + "num_input_tokens_seen": 8331184, + "step": 4253 + }, + { + "epoch": 0.563817097415507, + "grad_norm": 5.253058910369873, + "learning_rate": 4.5783244077079535e-06, + "loss": 0.297, + "num_input_tokens_seen": 8332760, + "step": 4254 + }, + { + "epoch": 0.563949635520212, + "grad_norm": 13.91555404663086, + "learning_rate": 4.578131455880966e-06, + "loss": 0.5648, + "num_input_tokens_seen": 8335680, + "step": 4255 + }, + { + "epoch": 0.5640821736249172, + "grad_norm": 7.740660667419434, + "learning_rate": 4.577938463986169e-06, + "loss": 0.0815, + "num_input_tokens_seen": 8337760, + "step": 4256 + }, + { + "epoch": 0.5642147117296222, + "grad_norm": 5.496605396270752, + "learning_rate": 4.577745432027286e-06, + "loss": 0.1804, + "num_input_tokens_seen": 8339568, + "step": 4257 + }, + { + "epoch": 0.5643472498343274, + "grad_norm": 17.28211212158203, + "learning_rate": 4.5775523600080375e-06, + "loss": 0.5457, + "num_input_tokens_seen": 8341600, + "step": 4258 + }, + { + "epoch": 0.5644797879390324, + "grad_norm": 9.130685806274414, + "learning_rate": 4.577359247932146e-06, + "loss": 0.4271, + "num_input_tokens_seen": 8343432, + "step": 4259 + }, + { + "epoch": 0.5646123260437376, + "grad_norm": 2.593804121017456, + "learning_rate": 4.577166095803336e-06, + "loss": 0.0241, + "num_input_tokens_seen": 8344992, + "step": 4260 + }, + { + "epoch": 0.5647448641484427, + "grad_norm": 3.4626922607421875, + "learning_rate": 4.57697290362533e-06, + "loss": 0.0908, + "num_input_tokens_seen": 8347104, + "step": 4261 + }, + { + "epoch": 0.5648774022531478, + "grad_norm": 10.954967498779297, + "learning_rate": 4.576779671401853e-06, + "loss": 0.156, + "num_input_tokens_seen": 8349208, + "step": 4262 + }, + { + "epoch": 0.5650099403578529, + "grad_norm": 7.874474048614502, + "learning_rate": 4.576586399136632e-06, + "loss": 0.3566, + "num_input_tokens_seen": 8351176, + "step": 4263 + }, + { + "epoch": 0.5651424784625579, + "grad_norm": 3.974134683609009, + "learning_rate": 4.576393086833393e-06, + "loss": 0.0725, + "num_input_tokens_seen": 8353752, + "step": 4264 + }, + { + "epoch": 0.5652750165672631, + "grad_norm": 8.1998929977417, + "learning_rate": 4.576199734495862e-06, + "loss": 0.2603, + "num_input_tokens_seen": 8355472, + "step": 4265 + }, + { + "epoch": 0.5654075546719682, + "grad_norm": 15.944494247436523, + "learning_rate": 4.576006342127768e-06, + "loss": 0.5336, + "num_input_tokens_seen": 8357832, + "step": 4266 + }, + { + "epoch": 0.5655400927766733, + "grad_norm": 9.196968078613281, + "learning_rate": 4.57581290973284e-06, + "loss": 0.2245, + "num_input_tokens_seen": 8359512, + "step": 4267 + }, + { + "epoch": 0.5656726308813784, + "grad_norm": 6.578267574310303, + "learning_rate": 4.575619437314807e-06, + "loss": 0.2115, + "num_input_tokens_seen": 8361384, + "step": 4268 + }, + { + "epoch": 0.5658051689860835, + "grad_norm": 9.462727546691895, + "learning_rate": 4.575425924877399e-06, + "loss": 0.23, + "num_input_tokens_seen": 8363208, + "step": 4269 + }, + { + "epoch": 0.5659377070907886, + "grad_norm": 4.790237903594971, + "learning_rate": 4.575232372424348e-06, + "loss": 0.1118, + "num_input_tokens_seen": 8365320, + "step": 4270 + }, + { + "epoch": 0.5660702451954938, + "grad_norm": 5.361525058746338, + "learning_rate": 4.575038779959385e-06, + "loss": 0.0307, + "num_input_tokens_seen": 8366856, + "step": 4271 + }, + { + "epoch": 0.5662027833001988, + "grad_norm": 8.946442604064941, + "learning_rate": 4.574845147486242e-06, + "loss": 0.2441, + "num_input_tokens_seen": 8369752, + "step": 4272 + }, + { + "epoch": 0.566335321404904, + "grad_norm": 12.714821815490723, + "learning_rate": 4.574651475008655e-06, + "loss": 0.434, + "num_input_tokens_seen": 8372000, + "step": 4273 + }, + { + "epoch": 0.566467859509609, + "grad_norm": 20.241958618164062, + "learning_rate": 4.574457762530354e-06, + "loss": 0.8547, + "num_input_tokens_seen": 8374528, + "step": 4274 + }, + { + "epoch": 0.5666003976143141, + "grad_norm": 9.682726860046387, + "learning_rate": 4.5742640100550776e-06, + "loss": 0.3227, + "num_input_tokens_seen": 8376704, + "step": 4275 + }, + { + "epoch": 0.5667329357190192, + "grad_norm": 11.85971736907959, + "learning_rate": 4.57407021758656e-06, + "loss": 0.4482, + "num_input_tokens_seen": 8378152, + "step": 4276 + }, + { + "epoch": 0.5668654738237243, + "grad_norm": 22.23664665222168, + "learning_rate": 4.573876385128539e-06, + "loss": 0.7954, + "num_input_tokens_seen": 8379952, + "step": 4277 + }, + { + "epoch": 0.5669980119284295, + "grad_norm": 10.538466453552246, + "learning_rate": 4.573682512684748e-06, + "loss": 0.263, + "num_input_tokens_seen": 8381672, + "step": 4278 + }, + { + "epoch": 0.5671305500331345, + "grad_norm": 7.241318225860596, + "learning_rate": 4.57348860025893e-06, + "loss": 0.2313, + "num_input_tokens_seen": 8383464, + "step": 4279 + }, + { + "epoch": 0.5672630881378397, + "grad_norm": 7.920835018157959, + "learning_rate": 4.573294647854819e-06, + "loss": 0.1121, + "num_input_tokens_seen": 8385248, + "step": 4280 + }, + { + "epoch": 0.5673956262425447, + "grad_norm": 0.19351337850093842, + "learning_rate": 4.573100655476159e-06, + "loss": 0.0013, + "num_input_tokens_seen": 8386680, + "step": 4281 + }, + { + "epoch": 0.5675281643472498, + "grad_norm": 8.171653747558594, + "learning_rate": 4.572906623126687e-06, + "loss": 0.1455, + "num_input_tokens_seen": 8388480, + "step": 4282 + }, + { + "epoch": 0.567660702451955, + "grad_norm": 0.22796307504177094, + "learning_rate": 4.572712550810146e-06, + "loss": 0.0015, + "num_input_tokens_seen": 8389648, + "step": 4283 + }, + { + "epoch": 0.56779324055666, + "grad_norm": 11.489493370056152, + "learning_rate": 4.5725184385302765e-06, + "loss": 0.3254, + "num_input_tokens_seen": 8391736, + "step": 4284 + }, + { + "epoch": 0.5679257786613652, + "grad_norm": 3.537522315979004, + "learning_rate": 4.5723242862908225e-06, + "loss": 0.0748, + "num_input_tokens_seen": 8393464, + "step": 4285 + }, + { + "epoch": 0.5680583167660702, + "grad_norm": 0.6127605438232422, + "learning_rate": 4.572130094095526e-06, + "loss": 0.0041, + "num_input_tokens_seen": 8394808, + "step": 4286 + }, + { + "epoch": 0.5681908548707754, + "grad_norm": 0.7648752331733704, + "learning_rate": 4.571935861948132e-06, + "loss": 0.0114, + "num_input_tokens_seen": 8396464, + "step": 4287 + }, + { + "epoch": 0.5683233929754804, + "grad_norm": 12.284891128540039, + "learning_rate": 4.571741589852385e-06, + "loss": 0.278, + "num_input_tokens_seen": 8398248, + "step": 4288 + }, + { + "epoch": 0.5684559310801856, + "grad_norm": 7.874042987823486, + "learning_rate": 4.5715472778120315e-06, + "loss": 0.2357, + "num_input_tokens_seen": 8400104, + "step": 4289 + }, + { + "epoch": 0.5685884691848907, + "grad_norm": 11.329712867736816, + "learning_rate": 4.571352925830817e-06, + "loss": 0.2304, + "num_input_tokens_seen": 8402904, + "step": 4290 + }, + { + "epoch": 0.5687210072895957, + "grad_norm": 9.141334533691406, + "learning_rate": 4.571158533912489e-06, + "loss": 0.346, + "num_input_tokens_seen": 8405056, + "step": 4291 + }, + { + "epoch": 0.5688535453943009, + "grad_norm": 0.2751242518424988, + "learning_rate": 4.570964102060796e-06, + "loss": 0.0018, + "num_input_tokens_seen": 8406328, + "step": 4292 + }, + { + "epoch": 0.5689860834990059, + "grad_norm": 9.666380882263184, + "learning_rate": 4.570769630279486e-06, + "loss": 0.1686, + "num_input_tokens_seen": 8408600, + "step": 4293 + }, + { + "epoch": 0.5691186216037111, + "grad_norm": 10.13422679901123, + "learning_rate": 4.570575118572309e-06, + "loss": 0.0319, + "num_input_tokens_seen": 8410472, + "step": 4294 + }, + { + "epoch": 0.5692511597084162, + "grad_norm": 4.587724685668945, + "learning_rate": 4.570380566943016e-06, + "loss": 0.1246, + "num_input_tokens_seen": 8413200, + "step": 4295 + }, + { + "epoch": 0.5693836978131213, + "grad_norm": 7.932286262512207, + "learning_rate": 4.570185975395357e-06, + "loss": 0.2141, + "num_input_tokens_seen": 8415800, + "step": 4296 + }, + { + "epoch": 0.5695162359178264, + "grad_norm": 20.300233840942383, + "learning_rate": 4.569991343933084e-06, + "loss": 0.4416, + "num_input_tokens_seen": 8418248, + "step": 4297 + }, + { + "epoch": 0.5696487740225314, + "grad_norm": 0.07834914326667786, + "learning_rate": 4.569796672559949e-06, + "loss": 0.0005, + "num_input_tokens_seen": 8419520, + "step": 4298 + }, + { + "epoch": 0.5697813121272366, + "grad_norm": 13.601746559143066, + "learning_rate": 4.569601961279708e-06, + "loss": 0.676, + "num_input_tokens_seen": 8421120, + "step": 4299 + }, + { + "epoch": 0.5699138502319416, + "grad_norm": 16.05999183654785, + "learning_rate": 4.569407210096113e-06, + "loss": 0.4325, + "num_input_tokens_seen": 8423008, + "step": 4300 + }, + { + "epoch": 0.5700463883366468, + "grad_norm": 12.571924209594727, + "learning_rate": 4.569212419012919e-06, + "loss": 0.1696, + "num_input_tokens_seen": 8424736, + "step": 4301 + }, + { + "epoch": 0.5701789264413519, + "grad_norm": 5.0280609130859375, + "learning_rate": 4.569017588033882e-06, + "loss": 0.1832, + "num_input_tokens_seen": 8426232, + "step": 4302 + }, + { + "epoch": 0.570311464546057, + "grad_norm": 16.32701301574707, + "learning_rate": 4.5688227171627586e-06, + "loss": 1.074, + "num_input_tokens_seen": 8429400, + "step": 4303 + }, + { + "epoch": 0.5704440026507621, + "grad_norm": 5.640312671661377, + "learning_rate": 4.568627806403306e-06, + "loss": 0.0368, + "num_input_tokens_seen": 8431176, + "step": 4304 + }, + { + "epoch": 0.5705765407554672, + "grad_norm": 14.657163619995117, + "learning_rate": 4.5684328557592825e-06, + "loss": 0.4167, + "num_input_tokens_seen": 8432800, + "step": 4305 + }, + { + "epoch": 0.5707090788601723, + "grad_norm": 10.338888168334961, + "learning_rate": 4.568237865234447e-06, + "loss": 0.2876, + "num_input_tokens_seen": 8434976, + "step": 4306 + }, + { + "epoch": 0.5708416169648775, + "grad_norm": 5.957551956176758, + "learning_rate": 4.5680428348325575e-06, + "loss": 0.1107, + "num_input_tokens_seen": 8437704, + "step": 4307 + }, + { + "epoch": 0.5709741550695825, + "grad_norm": 16.47465705871582, + "learning_rate": 4.567847764557377e-06, + "loss": 0.546, + "num_input_tokens_seen": 8439872, + "step": 4308 + }, + { + "epoch": 0.5711066931742876, + "grad_norm": 0.11251896619796753, + "learning_rate": 4.567652654412664e-06, + "loss": 0.0008, + "num_input_tokens_seen": 8441104, + "step": 4309 + }, + { + "epoch": 0.5712392312789927, + "grad_norm": 4.6540069580078125, + "learning_rate": 4.567457504402182e-06, + "loss": 0.0521, + "num_input_tokens_seen": 8442536, + "step": 4310 + }, + { + "epoch": 0.5713717693836978, + "grad_norm": 5.070131301879883, + "learning_rate": 4.567262314529693e-06, + "loss": 0.1366, + "num_input_tokens_seen": 8443816, + "step": 4311 + }, + { + "epoch": 0.5715043074884029, + "grad_norm": 8.351707458496094, + "learning_rate": 4.56706708479896e-06, + "loss": 0.135, + "num_input_tokens_seen": 8446944, + "step": 4312 + }, + { + "epoch": 0.571636845593108, + "grad_norm": 0.9232221841812134, + "learning_rate": 4.566871815213747e-06, + "loss": 0.0053, + "num_input_tokens_seen": 8448752, + "step": 4313 + }, + { + "epoch": 0.5717693836978132, + "grad_norm": 3.869992971420288, + "learning_rate": 4.566676505777821e-06, + "loss": 0.0342, + "num_input_tokens_seen": 8452096, + "step": 4314 + }, + { + "epoch": 0.5719019218025182, + "grad_norm": 15.801178932189941, + "learning_rate": 4.566481156494944e-06, + "loss": 0.1767, + "num_input_tokens_seen": 8453816, + "step": 4315 + }, + { + "epoch": 0.5720344599072233, + "grad_norm": 7.275855541229248, + "learning_rate": 4.5662857673688875e-06, + "loss": 0.3267, + "num_input_tokens_seen": 8455376, + "step": 4316 + }, + { + "epoch": 0.5721669980119284, + "grad_norm": 8.781572341918945, + "learning_rate": 4.5660903384034135e-06, + "loss": 0.156, + "num_input_tokens_seen": 8457160, + "step": 4317 + }, + { + "epoch": 0.5722995361166335, + "grad_norm": 0.7213758230209351, + "learning_rate": 4.565894869602294e-06, + "loss": 0.0048, + "num_input_tokens_seen": 8459784, + "step": 4318 + }, + { + "epoch": 0.5724320742213387, + "grad_norm": 14.654979705810547, + "learning_rate": 4.565699360969297e-06, + "loss": 0.5773, + "num_input_tokens_seen": 8462112, + "step": 4319 + }, + { + "epoch": 0.5725646123260437, + "grad_norm": 9.059070587158203, + "learning_rate": 4.565503812508189e-06, + "loss": 0.4075, + "num_input_tokens_seen": 8464104, + "step": 4320 + }, + { + "epoch": 0.5726971504307489, + "grad_norm": 7.977756023406982, + "learning_rate": 4.565308224222745e-06, + "loss": 0.348, + "num_input_tokens_seen": 8466048, + "step": 4321 + }, + { + "epoch": 0.5728296885354539, + "grad_norm": 0.11737184226512909, + "learning_rate": 4.5651125961167305e-06, + "loss": 0.0008, + "num_input_tokens_seen": 8467152, + "step": 4322 + }, + { + "epoch": 0.5729622266401591, + "grad_norm": 23.347951889038086, + "learning_rate": 4.564916928193923e-06, + "loss": 0.5547, + "num_input_tokens_seen": 8468760, + "step": 4323 + }, + { + "epoch": 0.5730947647448641, + "grad_norm": 7.015285968780518, + "learning_rate": 4.5647212204580915e-06, + "loss": 0.2062, + "num_input_tokens_seen": 8471704, + "step": 4324 + }, + { + "epoch": 0.5732273028495692, + "grad_norm": 8.148268699645996, + "learning_rate": 4.564525472913012e-06, + "loss": 0.149, + "num_input_tokens_seen": 8473632, + "step": 4325 + }, + { + "epoch": 0.5733598409542744, + "grad_norm": 0.31451258063316345, + "learning_rate": 4.564329685562455e-06, + "loss": 0.002, + "num_input_tokens_seen": 8475016, + "step": 4326 + }, + { + "epoch": 0.5734923790589794, + "grad_norm": 12.027788162231445, + "learning_rate": 4.564133858410199e-06, + "loss": 0.2643, + "num_input_tokens_seen": 8476536, + "step": 4327 + }, + { + "epoch": 0.5736249171636846, + "grad_norm": 6.506690979003906, + "learning_rate": 4.5639379914600176e-06, + "loss": 0.1771, + "num_input_tokens_seen": 8478056, + "step": 4328 + }, + { + "epoch": 0.5737574552683896, + "grad_norm": 5.623752593994141, + "learning_rate": 4.5637420847156884e-06, + "loss": 0.1983, + "num_input_tokens_seen": 8479592, + "step": 4329 + }, + { + "epoch": 0.5738899933730948, + "grad_norm": 0.7790703773498535, + "learning_rate": 4.563546138180988e-06, + "loss": 0.0149, + "num_input_tokens_seen": 8481032, + "step": 4330 + }, + { + "epoch": 0.5740225314777999, + "grad_norm": 10.818570137023926, + "learning_rate": 4.563350151859695e-06, + "loss": 0.2969, + "num_input_tokens_seen": 8482928, + "step": 4331 + }, + { + "epoch": 0.574155069582505, + "grad_norm": 6.5961103439331055, + "learning_rate": 4.5631541257555866e-06, + "loss": 0.145, + "num_input_tokens_seen": 8484904, + "step": 4332 + }, + { + "epoch": 0.5742876076872101, + "grad_norm": 6.804736614227295, + "learning_rate": 4.562958059872443e-06, + "loss": 0.1083, + "num_input_tokens_seen": 8486656, + "step": 4333 + }, + { + "epoch": 0.5744201457919151, + "grad_norm": 8.181356430053711, + "learning_rate": 4.5627619542140465e-06, + "loss": 0.1237, + "num_input_tokens_seen": 8488424, + "step": 4334 + }, + { + "epoch": 0.5745526838966203, + "grad_norm": 7.599025726318359, + "learning_rate": 4.562565808784175e-06, + "loss": 0.3012, + "num_input_tokens_seen": 8490584, + "step": 4335 + }, + { + "epoch": 0.5746852220013254, + "grad_norm": 4.574509143829346, + "learning_rate": 4.562369623586612e-06, + "loss": 0.0307, + "num_input_tokens_seen": 8491832, + "step": 4336 + }, + { + "epoch": 0.5748177601060305, + "grad_norm": 0.25657889246940613, + "learning_rate": 4.56217339862514e-06, + "loss": 0.0015, + "num_input_tokens_seen": 8493168, + "step": 4337 + }, + { + "epoch": 0.5749502982107356, + "grad_norm": 0.08141116797924042, + "learning_rate": 4.561977133903543e-06, + "loss": 0.0005, + "num_input_tokens_seen": 8494368, + "step": 4338 + }, + { + "epoch": 0.5750828363154407, + "grad_norm": 6.296396255493164, + "learning_rate": 4.561780829425604e-06, + "loss": 0.0884, + "num_input_tokens_seen": 8495824, + "step": 4339 + }, + { + "epoch": 0.5752153744201458, + "grad_norm": 7.0741095542907715, + "learning_rate": 4.561584485195108e-06, + "loss": 0.1277, + "num_input_tokens_seen": 8496928, + "step": 4340 + }, + { + "epoch": 0.5753479125248508, + "grad_norm": 5.731409072875977, + "learning_rate": 4.561388101215841e-06, + "loss": 0.1947, + "num_input_tokens_seen": 8499016, + "step": 4341 + }, + { + "epoch": 0.575480450629556, + "grad_norm": 6.399181365966797, + "learning_rate": 4.56119167749159e-06, + "loss": 0.2332, + "num_input_tokens_seen": 8501080, + "step": 4342 + }, + { + "epoch": 0.5756129887342611, + "grad_norm": 12.875848770141602, + "learning_rate": 4.560995214026141e-06, + "loss": 0.3918, + "num_input_tokens_seen": 8503432, + "step": 4343 + }, + { + "epoch": 0.5757455268389662, + "grad_norm": 9.309961318969727, + "learning_rate": 4.560798710823282e-06, + "loss": 0.2479, + "num_input_tokens_seen": 8505816, + "step": 4344 + }, + { + "epoch": 0.5758780649436713, + "grad_norm": 1.1099779605865479, + "learning_rate": 4.560602167886803e-06, + "loss": 0.0299, + "num_input_tokens_seen": 8507664, + "step": 4345 + }, + { + "epoch": 0.5760106030483764, + "grad_norm": 8.282926559448242, + "learning_rate": 4.560405585220492e-06, + "loss": 0.2038, + "num_input_tokens_seen": 8509616, + "step": 4346 + }, + { + "epoch": 0.5761431411530815, + "grad_norm": 19.77098274230957, + "learning_rate": 4.560208962828141e-06, + "loss": 0.724, + "num_input_tokens_seen": 8511640, + "step": 4347 + }, + { + "epoch": 0.5762756792577867, + "grad_norm": 0.3607216477394104, + "learning_rate": 4.5600123007135385e-06, + "loss": 0.0022, + "num_input_tokens_seen": 8513408, + "step": 4348 + }, + { + "epoch": 0.5764082173624917, + "grad_norm": 0.2982603907585144, + "learning_rate": 4.559815598880478e-06, + "loss": 0.0017, + "num_input_tokens_seen": 8514632, + "step": 4349 + }, + { + "epoch": 0.5765407554671969, + "grad_norm": 3.2125802040100098, + "learning_rate": 4.559618857332752e-06, + "loss": 0.036, + "num_input_tokens_seen": 8516264, + "step": 4350 + }, + { + "epoch": 0.5766732935719019, + "grad_norm": 9.185589790344238, + "learning_rate": 4.5594220760741535e-06, + "loss": 0.312, + "num_input_tokens_seen": 8518496, + "step": 4351 + }, + { + "epoch": 0.576805831676607, + "grad_norm": 8.0495023727417, + "learning_rate": 4.559225255108477e-06, + "loss": 0.2852, + "num_input_tokens_seen": 8520200, + "step": 4352 + }, + { + "epoch": 0.5769383697813121, + "grad_norm": 8.494548797607422, + "learning_rate": 4.559028394439517e-06, + "loss": 0.151, + "num_input_tokens_seen": 8522440, + "step": 4353 + }, + { + "epoch": 0.5770709078860172, + "grad_norm": 0.25275135040283203, + "learning_rate": 4.558831494071069e-06, + "loss": 0.0017, + "num_input_tokens_seen": 8524136, + "step": 4354 + }, + { + "epoch": 0.5772034459907224, + "grad_norm": 1.3453714847564697, + "learning_rate": 4.558634554006929e-06, + "loss": 0.009, + "num_input_tokens_seen": 8526880, + "step": 4355 + }, + { + "epoch": 0.5773359840954274, + "grad_norm": 10.653584480285645, + "learning_rate": 4.558437574250896e-06, + "loss": 0.3991, + "num_input_tokens_seen": 8528576, + "step": 4356 + }, + { + "epoch": 0.5774685222001326, + "grad_norm": 14.852418899536133, + "learning_rate": 4.558240554806765e-06, + "loss": 0.646, + "num_input_tokens_seen": 8530120, + "step": 4357 + }, + { + "epoch": 0.5776010603048376, + "grad_norm": 9.925727844238281, + "learning_rate": 4.558043495678336e-06, + "loss": 0.144, + "num_input_tokens_seen": 8532168, + "step": 4358 + }, + { + "epoch": 0.5777335984095427, + "grad_norm": 0.045869458466768265, + "learning_rate": 4.55784639686941e-06, + "loss": 0.0003, + "num_input_tokens_seen": 8534032, + "step": 4359 + }, + { + "epoch": 0.5778661365142479, + "grad_norm": 0.17023147642612457, + "learning_rate": 4.557649258383785e-06, + "loss": 0.001, + "num_input_tokens_seen": 8536472, + "step": 4360 + }, + { + "epoch": 0.5779986746189529, + "grad_norm": 10.310873031616211, + "learning_rate": 4.557452080225263e-06, + "loss": 0.1882, + "num_input_tokens_seen": 8537984, + "step": 4361 + }, + { + "epoch": 0.5781312127236581, + "grad_norm": 0.03095160238444805, + "learning_rate": 4.557254862397645e-06, + "loss": 0.0002, + "num_input_tokens_seen": 8539568, + "step": 4362 + }, + { + "epoch": 0.5782637508283631, + "grad_norm": 0.031688202172517776, + "learning_rate": 4.557057604904735e-06, + "loss": 0.0002, + "num_input_tokens_seen": 8541096, + "step": 4363 + }, + { + "epoch": 0.5783962889330683, + "grad_norm": 7.196596622467041, + "learning_rate": 4.5568603077503345e-06, + "loss": 0.1348, + "num_input_tokens_seen": 8543416, + "step": 4364 + }, + { + "epoch": 0.5785288270377733, + "grad_norm": 0.11335533112287521, + "learning_rate": 4.556662970938249e-06, + "loss": 0.0007, + "num_input_tokens_seen": 8544944, + "step": 4365 + }, + { + "epoch": 0.5786613651424785, + "grad_norm": 10.476348876953125, + "learning_rate": 4.5564655944722825e-06, + "loss": 0.2256, + "num_input_tokens_seen": 8547376, + "step": 4366 + }, + { + "epoch": 0.5787939032471836, + "grad_norm": 0.0516941137611866, + "learning_rate": 4.55626817835624e-06, + "loss": 0.0003, + "num_input_tokens_seen": 8549688, + "step": 4367 + }, + { + "epoch": 0.5789264413518886, + "grad_norm": 0.0313602089881897, + "learning_rate": 4.556070722593929e-06, + "loss": 0.0002, + "num_input_tokens_seen": 8550936, + "step": 4368 + }, + { + "epoch": 0.5790589794565938, + "grad_norm": 8.793875694274902, + "learning_rate": 4.555873227189156e-06, + "loss": 0.15, + "num_input_tokens_seen": 8554000, + "step": 4369 + }, + { + "epoch": 0.5791915175612988, + "grad_norm": 0.0633508712053299, + "learning_rate": 4.555675692145729e-06, + "loss": 0.0004, + "num_input_tokens_seen": 8555296, + "step": 4370 + }, + { + "epoch": 0.579324055666004, + "grad_norm": 11.965086936950684, + "learning_rate": 4.555478117467456e-06, + "loss": 0.1877, + "num_input_tokens_seen": 8557440, + "step": 4371 + }, + { + "epoch": 0.5794565937707091, + "grad_norm": 0.013543857261538506, + "learning_rate": 4.555280503158147e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8559000, + "step": 4372 + }, + { + "epoch": 0.5795891318754142, + "grad_norm": 10.043455123901367, + "learning_rate": 4.555082849221612e-06, + "loss": 0.2861, + "num_input_tokens_seen": 8560912, + "step": 4373 + }, + { + "epoch": 0.5797216699801193, + "grad_norm": 0.12319021672010422, + "learning_rate": 4.554885155661663e-06, + "loss": 0.0008, + "num_input_tokens_seen": 8562816, + "step": 4374 + }, + { + "epoch": 0.5798542080848244, + "grad_norm": 33.75932693481445, + "learning_rate": 4.55468742248211e-06, + "loss": 0.5537, + "num_input_tokens_seen": 8564280, + "step": 4375 + }, + { + "epoch": 0.5799867461895295, + "grad_norm": 10.492067337036133, + "learning_rate": 4.554489649686765e-06, + "loss": 0.2457, + "num_input_tokens_seen": 8566760, + "step": 4376 + }, + { + "epoch": 0.5801192842942345, + "grad_norm": 6.253518104553223, + "learning_rate": 4.5542918372794435e-06, + "loss": 0.2509, + "num_input_tokens_seen": 8568872, + "step": 4377 + }, + { + "epoch": 0.5802518223989397, + "grad_norm": 0.3660961091518402, + "learning_rate": 4.554093985263958e-06, + "loss": 0.0024, + "num_input_tokens_seen": 8570312, + "step": 4378 + }, + { + "epoch": 0.5803843605036448, + "grad_norm": 13.249551773071289, + "learning_rate": 4.5538960936441235e-06, + "loss": 0.4213, + "num_input_tokens_seen": 8573352, + "step": 4379 + }, + { + "epoch": 0.5805168986083499, + "grad_norm": 0.07100484520196915, + "learning_rate": 4.5536981624237545e-06, + "loss": 0.0004, + "num_input_tokens_seen": 8574800, + "step": 4380 + }, + { + "epoch": 0.580649436713055, + "grad_norm": 0.037556443363428116, + "learning_rate": 4.553500191606669e-06, + "loss": 0.0002, + "num_input_tokens_seen": 8575936, + "step": 4381 + }, + { + "epoch": 0.5807819748177601, + "grad_norm": 5.215287208557129, + "learning_rate": 4.553302181196683e-06, + "loss": 0.1371, + "num_input_tokens_seen": 8577416, + "step": 4382 + }, + { + "epoch": 0.5809145129224652, + "grad_norm": 10.468539237976074, + "learning_rate": 4.553104131197614e-06, + "loss": 0.2726, + "num_input_tokens_seen": 8580040, + "step": 4383 + }, + { + "epoch": 0.5810470510271704, + "grad_norm": 5.80501651763916, + "learning_rate": 4.552906041613282e-06, + "loss": 0.0992, + "num_input_tokens_seen": 8581936, + "step": 4384 + }, + { + "epoch": 0.5811795891318754, + "grad_norm": 0.5218558311462402, + "learning_rate": 4.552707912447504e-06, + "loss": 0.0029, + "num_input_tokens_seen": 8584240, + "step": 4385 + }, + { + "epoch": 0.5813121272365805, + "grad_norm": 12.1973876953125, + "learning_rate": 4.552509743704102e-06, + "loss": 0.4238, + "num_input_tokens_seen": 8585976, + "step": 4386 + }, + { + "epoch": 0.5814446653412856, + "grad_norm": 8.577536582946777, + "learning_rate": 4.552311535386897e-06, + "loss": 0.3621, + "num_input_tokens_seen": 8587688, + "step": 4387 + }, + { + "epoch": 0.5815772034459907, + "grad_norm": 0.029581217095255852, + "learning_rate": 4.552113287499708e-06, + "loss": 0.0002, + "num_input_tokens_seen": 8588912, + "step": 4388 + }, + { + "epoch": 0.5817097415506959, + "grad_norm": 9.509265899658203, + "learning_rate": 4.55191500004636e-06, + "loss": 0.2605, + "num_input_tokens_seen": 8591208, + "step": 4389 + }, + { + "epoch": 0.5818422796554009, + "grad_norm": 10.583097457885742, + "learning_rate": 4.551716673030676e-06, + "loss": 0.1594, + "num_input_tokens_seen": 8592896, + "step": 4390 + }, + { + "epoch": 0.5819748177601061, + "grad_norm": 0.052210304886102676, + "learning_rate": 4.551518306456478e-06, + "loss": 0.0003, + "num_input_tokens_seen": 8594344, + "step": 4391 + }, + { + "epoch": 0.5821073558648111, + "grad_norm": 10.652533531188965, + "learning_rate": 4.551319900327593e-06, + "loss": 0.3092, + "num_input_tokens_seen": 8596352, + "step": 4392 + }, + { + "epoch": 0.5822398939695163, + "grad_norm": 0.12843726575374603, + "learning_rate": 4.551121454647844e-06, + "loss": 0.0008, + "num_input_tokens_seen": 8598272, + "step": 4393 + }, + { + "epoch": 0.5823724320742213, + "grad_norm": 8.99263858795166, + "learning_rate": 4.550922969421059e-06, + "loss": 0.1758, + "num_input_tokens_seen": 8600304, + "step": 4394 + }, + { + "epoch": 0.5825049701789264, + "grad_norm": 22.130340576171875, + "learning_rate": 4.550724444651065e-06, + "loss": 0.8582, + "num_input_tokens_seen": 8603496, + "step": 4395 + }, + { + "epoch": 0.5826375082836316, + "grad_norm": 4.380096912384033, + "learning_rate": 4.550525880341688e-06, + "loss": 0.0439, + "num_input_tokens_seen": 8605112, + "step": 4396 + }, + { + "epoch": 0.5827700463883366, + "grad_norm": 5.272134780883789, + "learning_rate": 4.5503272764967574e-06, + "loss": 0.2539, + "num_input_tokens_seen": 8606856, + "step": 4397 + }, + { + "epoch": 0.5829025844930418, + "grad_norm": 0.05595359206199646, + "learning_rate": 4.550128633120102e-06, + "loss": 0.0003, + "num_input_tokens_seen": 8608216, + "step": 4398 + }, + { + "epoch": 0.5830351225977468, + "grad_norm": 7.434205532073975, + "learning_rate": 4.549929950215553e-06, + "loss": 0.3007, + "num_input_tokens_seen": 8610416, + "step": 4399 + }, + { + "epoch": 0.583167660702452, + "grad_norm": 7.668612480163574, + "learning_rate": 4.549731227786941e-06, + "loss": 0.2227, + "num_input_tokens_seen": 8612080, + "step": 4400 + }, + { + "epoch": 0.5833001988071571, + "grad_norm": 7.521057605743408, + "learning_rate": 4.549532465838096e-06, + "loss": 0.1927, + "num_input_tokens_seen": 8614176, + "step": 4401 + }, + { + "epoch": 0.5834327369118621, + "grad_norm": 3.920696258544922, + "learning_rate": 4.549333664372851e-06, + "loss": 0.1192, + "num_input_tokens_seen": 8615608, + "step": 4402 + }, + { + "epoch": 0.5835652750165673, + "grad_norm": 4.160447597503662, + "learning_rate": 4.549134823395039e-06, + "loss": 0.0099, + "num_input_tokens_seen": 8617088, + "step": 4403 + }, + { + "epoch": 0.5836978131212723, + "grad_norm": 2.0484983921051025, + "learning_rate": 4.548935942908494e-06, + "loss": 0.0098, + "num_input_tokens_seen": 8618808, + "step": 4404 + }, + { + "epoch": 0.5838303512259775, + "grad_norm": 19.578227996826172, + "learning_rate": 4.548737022917051e-06, + "loss": 0.7331, + "num_input_tokens_seen": 8620136, + "step": 4405 + }, + { + "epoch": 0.5839628893306825, + "grad_norm": 13.342570304870605, + "learning_rate": 4.548538063424546e-06, + "loss": 0.5457, + "num_input_tokens_seen": 8622456, + "step": 4406 + }, + { + "epoch": 0.5840954274353877, + "grad_norm": 5.851290702819824, + "learning_rate": 4.548339064434812e-06, + "loss": 0.0959, + "num_input_tokens_seen": 8624408, + "step": 4407 + }, + { + "epoch": 0.5842279655400928, + "grad_norm": 17.008792877197266, + "learning_rate": 4.548140025951689e-06, + "loss": 0.3667, + "num_input_tokens_seen": 8626032, + "step": 4408 + }, + { + "epoch": 0.5843605036447979, + "grad_norm": 0.07513957470655441, + "learning_rate": 4.547940947979013e-06, + "loss": 0.0005, + "num_input_tokens_seen": 8627728, + "step": 4409 + }, + { + "epoch": 0.584493041749503, + "grad_norm": 5.069014549255371, + "learning_rate": 4.547741830520622e-06, + "loss": 0.0654, + "num_input_tokens_seen": 8629384, + "step": 4410 + }, + { + "epoch": 0.584625579854208, + "grad_norm": 18.559001922607422, + "learning_rate": 4.547542673580357e-06, + "loss": 0.2714, + "num_input_tokens_seen": 8630912, + "step": 4411 + }, + { + "epoch": 0.5847581179589132, + "grad_norm": 9.031882286071777, + "learning_rate": 4.547343477162056e-06, + "loss": 0.2967, + "num_input_tokens_seen": 8633400, + "step": 4412 + }, + { + "epoch": 0.5848906560636183, + "grad_norm": 7.995486736297607, + "learning_rate": 4.5471442412695605e-06, + "loss": 0.2286, + "num_input_tokens_seen": 8635624, + "step": 4413 + }, + { + "epoch": 0.5850231941683234, + "grad_norm": 3.0888314247131348, + "learning_rate": 4.546944965906712e-06, + "loss": 0.0513, + "num_input_tokens_seen": 8637912, + "step": 4414 + }, + { + "epoch": 0.5851557322730285, + "grad_norm": 9.285609245300293, + "learning_rate": 4.5467456510773525e-06, + "loss": 0.2707, + "num_input_tokens_seen": 8640008, + "step": 4415 + }, + { + "epoch": 0.5852882703777336, + "grad_norm": 10.080692291259766, + "learning_rate": 4.546546296785324e-06, + "loss": 0.2375, + "num_input_tokens_seen": 8641656, + "step": 4416 + }, + { + "epoch": 0.5854208084824387, + "grad_norm": 1.5417126417160034, + "learning_rate": 4.546346903034472e-06, + "loss": 0.058, + "num_input_tokens_seen": 8644032, + "step": 4417 + }, + { + "epoch": 0.5855533465871438, + "grad_norm": 0.24453695118427277, + "learning_rate": 4.54614746982864e-06, + "loss": 0.0016, + "num_input_tokens_seen": 8645680, + "step": 4418 + }, + { + "epoch": 0.5856858846918489, + "grad_norm": 0.48124921321868896, + "learning_rate": 4.545947997171673e-06, + "loss": 0.003, + "num_input_tokens_seen": 8647000, + "step": 4419 + }, + { + "epoch": 0.585818422796554, + "grad_norm": 5.7744879722595215, + "learning_rate": 4.545748485067417e-06, + "loss": 0.1108, + "num_input_tokens_seen": 8648512, + "step": 4420 + }, + { + "epoch": 0.5859509609012591, + "grad_norm": 8.144845008850098, + "learning_rate": 4.5455489335197185e-06, + "loss": 0.1027, + "num_input_tokens_seen": 8650936, + "step": 4421 + }, + { + "epoch": 0.5860834990059642, + "grad_norm": 1.0975395441055298, + "learning_rate": 4.545349342532425e-06, + "loss": 0.0051, + "num_input_tokens_seen": 8652440, + "step": 4422 + }, + { + "epoch": 0.5862160371106693, + "grad_norm": 0.21946904063224792, + "learning_rate": 4.5451497121093865e-06, + "loss": 0.0014, + "num_input_tokens_seen": 8654072, + "step": 4423 + }, + { + "epoch": 0.5863485752153744, + "grad_norm": 0.3981848657131195, + "learning_rate": 4.544950042254452e-06, + "loss": 0.0026, + "num_input_tokens_seen": 8655736, + "step": 4424 + }, + { + "epoch": 0.5864811133200796, + "grad_norm": 8.976275444030762, + "learning_rate": 4.544750332971467e-06, + "loss": 0.1844, + "num_input_tokens_seen": 8657624, + "step": 4425 + }, + { + "epoch": 0.5866136514247846, + "grad_norm": 11.50351619720459, + "learning_rate": 4.544550584264287e-06, + "loss": 0.2203, + "num_input_tokens_seen": 8660256, + "step": 4426 + }, + { + "epoch": 0.5867461895294898, + "grad_norm": 4.574235439300537, + "learning_rate": 4.54435079613676e-06, + "loss": 0.2324, + "num_input_tokens_seen": 8661944, + "step": 4427 + }, + { + "epoch": 0.5868787276341948, + "grad_norm": 3.243374824523926, + "learning_rate": 4.54415096859274e-06, + "loss": 0.0387, + "num_input_tokens_seen": 8665048, + "step": 4428 + }, + { + "epoch": 0.5870112657389, + "grad_norm": 9.473231315612793, + "learning_rate": 4.543951101636078e-06, + "loss": 0.3565, + "num_input_tokens_seen": 8667080, + "step": 4429 + }, + { + "epoch": 0.587143803843605, + "grad_norm": 1.1654329299926758, + "learning_rate": 4.54375119527063e-06, + "loss": 0.0066, + "num_input_tokens_seen": 8669736, + "step": 4430 + }, + { + "epoch": 0.5872763419483101, + "grad_norm": 7.744290351867676, + "learning_rate": 4.543551249500249e-06, + "loss": 0.095, + "num_input_tokens_seen": 8671632, + "step": 4431 + }, + { + "epoch": 0.5874088800530153, + "grad_norm": 7.371616363525391, + "learning_rate": 4.54335126432879e-06, + "loss": 0.3018, + "num_input_tokens_seen": 8673048, + "step": 4432 + }, + { + "epoch": 0.5875414181577203, + "grad_norm": 0.552163302898407, + "learning_rate": 4.5431512397601085e-06, + "loss": 0.0023, + "num_input_tokens_seen": 8674832, + "step": 4433 + }, + { + "epoch": 0.5876739562624255, + "grad_norm": 15.323720932006836, + "learning_rate": 4.542951175798062e-06, + "loss": 0.4853, + "num_input_tokens_seen": 8676504, + "step": 4434 + }, + { + "epoch": 0.5878064943671305, + "grad_norm": 0.031158868223428726, + "learning_rate": 4.542751072446507e-06, + "loss": 0.0002, + "num_input_tokens_seen": 8679080, + "step": 4435 + }, + { + "epoch": 0.5879390324718357, + "grad_norm": 8.181913375854492, + "learning_rate": 4.5425509297093025e-06, + "loss": 0.1164, + "num_input_tokens_seen": 8680480, + "step": 4436 + }, + { + "epoch": 0.5880715705765408, + "grad_norm": 1.8267797231674194, + "learning_rate": 4.5423507475903075e-06, + "loss": 0.0271, + "num_input_tokens_seen": 8681728, + "step": 4437 + }, + { + "epoch": 0.5882041086812458, + "grad_norm": 9.63296127319336, + "learning_rate": 4.542150526093381e-06, + "loss": 0.2124, + "num_input_tokens_seen": 8682928, + "step": 4438 + }, + { + "epoch": 0.588336646785951, + "grad_norm": 0.08651191741228104, + "learning_rate": 4.541950265222383e-06, + "loss": 0.0005, + "num_input_tokens_seen": 8684968, + "step": 4439 + }, + { + "epoch": 0.588469184890656, + "grad_norm": 14.389850616455078, + "learning_rate": 4.541749964981175e-06, + "loss": 0.3405, + "num_input_tokens_seen": 8687480, + "step": 4440 + }, + { + "epoch": 0.5886017229953612, + "grad_norm": 4.722055912017822, + "learning_rate": 4.541549625373619e-06, + "loss": 0.1108, + "num_input_tokens_seen": 8689704, + "step": 4441 + }, + { + "epoch": 0.5887342611000662, + "grad_norm": 10.62497329711914, + "learning_rate": 4.541349246403579e-06, + "loss": 0.0537, + "num_input_tokens_seen": 8692496, + "step": 4442 + }, + { + "epoch": 0.5888667992047714, + "grad_norm": 16.432832717895508, + "learning_rate": 4.541148828074916e-06, + "loss": 0.3868, + "num_input_tokens_seen": 8694720, + "step": 4443 + }, + { + "epoch": 0.5889993373094765, + "grad_norm": 3.7078092098236084, + "learning_rate": 4.540948370391497e-06, + "loss": 0.0538, + "num_input_tokens_seen": 8696128, + "step": 4444 + }, + { + "epoch": 0.5891318754141815, + "grad_norm": 4.969757556915283, + "learning_rate": 4.540747873357184e-06, + "loss": 0.1678, + "num_input_tokens_seen": 8697800, + "step": 4445 + }, + { + "epoch": 0.5892644135188867, + "grad_norm": 3.952193260192871, + "learning_rate": 4.540547336975844e-06, + "loss": 0.1317, + "num_input_tokens_seen": 8700176, + "step": 4446 + }, + { + "epoch": 0.5893969516235917, + "grad_norm": 0.025973495095968246, + "learning_rate": 4.540346761251345e-06, + "loss": 0.0002, + "num_input_tokens_seen": 8702272, + "step": 4447 + }, + { + "epoch": 0.5895294897282969, + "grad_norm": 5.541677474975586, + "learning_rate": 4.540146146187552e-06, + "loss": 0.0322, + "num_input_tokens_seen": 8704536, + "step": 4448 + }, + { + "epoch": 0.589662027833002, + "grad_norm": 0.07036970555782318, + "learning_rate": 4.539945491788335e-06, + "loss": 0.0004, + "num_input_tokens_seen": 8705872, + "step": 4449 + }, + { + "epoch": 0.5897945659377071, + "grad_norm": 3.2239980697631836, + "learning_rate": 4.53974479805756e-06, + "loss": 0.0613, + "num_input_tokens_seen": 8707560, + "step": 4450 + }, + { + "epoch": 0.5899271040424122, + "grad_norm": 0.02359906956553459, + "learning_rate": 4.539544064999099e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8709928, + "step": 4451 + }, + { + "epoch": 0.5900596421471173, + "grad_norm": 0.19118675589561462, + "learning_rate": 4.539343292616821e-06, + "loss": 0.001, + "num_input_tokens_seen": 8711480, + "step": 4452 + }, + { + "epoch": 0.5901921802518224, + "grad_norm": 5.0489091873168945, + "learning_rate": 4.539142480914598e-06, + "loss": 0.069, + "num_input_tokens_seen": 8714120, + "step": 4453 + }, + { + "epoch": 0.5903247183565276, + "grad_norm": 0.019733447581529617, + "learning_rate": 4.538941629896301e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8716048, + "step": 4454 + }, + { + "epoch": 0.5904572564612326, + "grad_norm": 0.05661563575267792, + "learning_rate": 4.538740739565803e-06, + "loss": 0.0004, + "num_input_tokens_seen": 8717264, + "step": 4455 + }, + { + "epoch": 0.5905897945659377, + "grad_norm": 8.199716567993164, + "learning_rate": 4.5385398099269765e-06, + "loss": 0.1131, + "num_input_tokens_seen": 8718688, + "step": 4456 + }, + { + "epoch": 0.5907223326706428, + "grad_norm": 11.063639640808105, + "learning_rate": 4.538338840983697e-06, + "loss": 0.3495, + "num_input_tokens_seen": 8721072, + "step": 4457 + }, + { + "epoch": 0.5908548707753479, + "grad_norm": 14.782572746276855, + "learning_rate": 4.538137832739838e-06, + "loss": 0.2652, + "num_input_tokens_seen": 8724072, + "step": 4458 + }, + { + "epoch": 0.590987408880053, + "grad_norm": 0.017578277736902237, + "learning_rate": 4.537936785199275e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8727120, + "step": 4459 + }, + { + "epoch": 0.5911199469847581, + "grad_norm": 7.24366569519043, + "learning_rate": 4.5377356983658855e-06, + "loss": 0.1343, + "num_input_tokens_seen": 8728704, + "step": 4460 + }, + { + "epoch": 0.5912524850894633, + "grad_norm": 11.244036674499512, + "learning_rate": 4.537534572243546e-06, + "loss": 0.6403, + "num_input_tokens_seen": 8730872, + "step": 4461 + }, + { + "epoch": 0.5913850231941683, + "grad_norm": 2.816316604614258, + "learning_rate": 4.537333406836134e-06, + "loss": 0.0932, + "num_input_tokens_seen": 8732880, + "step": 4462 + }, + { + "epoch": 0.5915175612988735, + "grad_norm": 18.80072593688965, + "learning_rate": 4.537132202147529e-06, + "loss": 0.5628, + "num_input_tokens_seen": 8735424, + "step": 4463 + }, + { + "epoch": 0.5916500994035785, + "grad_norm": 0.037906862795352936, + "learning_rate": 4.536930958181609e-06, + "loss": 0.0002, + "num_input_tokens_seen": 8737424, + "step": 4464 + }, + { + "epoch": 0.5917826375082836, + "grad_norm": 12.490594863891602, + "learning_rate": 4.5367296749422555e-06, + "loss": 0.3221, + "num_input_tokens_seen": 8738936, + "step": 4465 + }, + { + "epoch": 0.5919151756129888, + "grad_norm": 0.016223672777414322, + "learning_rate": 4.536528352433349e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8740320, + "step": 4466 + }, + { + "epoch": 0.5920477137176938, + "grad_norm": 7.457622528076172, + "learning_rate": 4.5363269906587705e-06, + "loss": 0.2888, + "num_input_tokens_seen": 8741840, + "step": 4467 + }, + { + "epoch": 0.592180251822399, + "grad_norm": 6.541828155517578, + "learning_rate": 4.536125589622402e-06, + "loss": 0.343, + "num_input_tokens_seen": 8743888, + "step": 4468 + }, + { + "epoch": 0.592312789927104, + "grad_norm": 10.480128288269043, + "learning_rate": 4.535924149328129e-06, + "loss": 0.4572, + "num_input_tokens_seen": 8745832, + "step": 4469 + }, + { + "epoch": 0.5924453280318092, + "grad_norm": 0.04811680316925049, + "learning_rate": 4.535722669779833e-06, + "loss": 0.0003, + "num_input_tokens_seen": 8748208, + "step": 4470 + }, + { + "epoch": 0.5925778661365142, + "grad_norm": 4.689108371734619, + "learning_rate": 4.5355211509814e-06, + "loss": 0.2352, + "num_input_tokens_seen": 8750256, + "step": 4471 + }, + { + "epoch": 0.5927104042412193, + "grad_norm": 20.923831939697266, + "learning_rate": 4.535319592936715e-06, + "loss": 1.1591, + "num_input_tokens_seen": 8752504, + "step": 4472 + }, + { + "epoch": 0.5928429423459245, + "grad_norm": 6.1349263191223145, + "learning_rate": 4.535117995649663e-06, + "loss": 0.1117, + "num_input_tokens_seen": 8754600, + "step": 4473 + }, + { + "epoch": 0.5929754804506295, + "grad_norm": 18.565927505493164, + "learning_rate": 4.534916359124134e-06, + "loss": 0.5349, + "num_input_tokens_seen": 8755968, + "step": 4474 + }, + { + "epoch": 0.5931080185553347, + "grad_norm": 0.1351647824048996, + "learning_rate": 4.534714683364013e-06, + "loss": 0.0009, + "num_input_tokens_seen": 8757272, + "step": 4475 + }, + { + "epoch": 0.5932405566600397, + "grad_norm": 11.154069900512695, + "learning_rate": 4.534512968373189e-06, + "loss": 0.3176, + "num_input_tokens_seen": 8759288, + "step": 4476 + }, + { + "epoch": 0.5933730947647449, + "grad_norm": 5.009145259857178, + "learning_rate": 4.534311214155551e-06, + "loss": 0.0778, + "num_input_tokens_seen": 8761368, + "step": 4477 + }, + { + "epoch": 0.59350563286945, + "grad_norm": 0.1187877282500267, + "learning_rate": 4.534109420714991e-06, + "loss": 0.0008, + "num_input_tokens_seen": 8762472, + "step": 4478 + }, + { + "epoch": 0.593638170974155, + "grad_norm": 36.71934127807617, + "learning_rate": 4.533907588055396e-06, + "loss": 0.2383, + "num_input_tokens_seen": 8765464, + "step": 4479 + }, + { + "epoch": 0.5937707090788602, + "grad_norm": 9.284875869750977, + "learning_rate": 4.5337057161806606e-06, + "loss": 0.3986, + "num_input_tokens_seen": 8767664, + "step": 4480 + }, + { + "epoch": 0.5939032471835652, + "grad_norm": 11.058073997497559, + "learning_rate": 4.533503805094676e-06, + "loss": 0.4033, + "num_input_tokens_seen": 8769880, + "step": 4481 + }, + { + "epoch": 0.5940357852882704, + "grad_norm": 12.482577323913574, + "learning_rate": 4.533301854801335e-06, + "loss": 0.2505, + "num_input_tokens_seen": 8771512, + "step": 4482 + }, + { + "epoch": 0.5941683233929754, + "grad_norm": 5.154951095581055, + "learning_rate": 4.533099865304533e-06, + "loss": 0.1267, + "num_input_tokens_seen": 8774640, + "step": 4483 + }, + { + "epoch": 0.5943008614976806, + "grad_norm": 10.228135108947754, + "learning_rate": 4.532897836608161e-06, + "loss": 0.3263, + "num_input_tokens_seen": 8776296, + "step": 4484 + }, + { + "epoch": 0.5944333996023857, + "grad_norm": 7.892743110656738, + "learning_rate": 4.532695768716117e-06, + "loss": 0.1586, + "num_input_tokens_seen": 8778192, + "step": 4485 + }, + { + "epoch": 0.5945659377070908, + "grad_norm": 4.75492525100708, + "learning_rate": 4.5324936616322965e-06, + "loss": 0.1479, + "num_input_tokens_seen": 8780176, + "step": 4486 + }, + { + "epoch": 0.5946984758117959, + "grad_norm": 0.5556977987289429, + "learning_rate": 4.532291515360596e-06, + "loss": 0.0038, + "num_input_tokens_seen": 8781976, + "step": 4487 + }, + { + "epoch": 0.594831013916501, + "grad_norm": 10.598833084106445, + "learning_rate": 4.532089329904913e-06, + "loss": 0.4334, + "num_input_tokens_seen": 8783344, + "step": 4488 + }, + { + "epoch": 0.5949635520212061, + "grad_norm": 7.626993179321289, + "learning_rate": 4.5318871052691455e-06, + "loss": 0.3437, + "num_input_tokens_seen": 8786920, + "step": 4489 + }, + { + "epoch": 0.5950960901259112, + "grad_norm": 6.43360710144043, + "learning_rate": 4.531684841457193e-06, + "loss": 0.151, + "num_input_tokens_seen": 8788864, + "step": 4490 + }, + { + "epoch": 0.5952286282306163, + "grad_norm": 22.759531021118164, + "learning_rate": 4.531482538472956e-06, + "loss": 0.5418, + "num_input_tokens_seen": 8791512, + "step": 4491 + }, + { + "epoch": 0.5953611663353214, + "grad_norm": 0.7809638977050781, + "learning_rate": 4.531280196320333e-06, + "loss": 0.0056, + "num_input_tokens_seen": 8793384, + "step": 4492 + }, + { + "epoch": 0.5954937044400265, + "grad_norm": 0.9846444725990295, + "learning_rate": 4.531077815003227e-06, + "loss": 0.0072, + "num_input_tokens_seen": 8794584, + "step": 4493 + }, + { + "epoch": 0.5956262425447316, + "grad_norm": 4.842322826385498, + "learning_rate": 4.5308753945255395e-06, + "loss": 0.1149, + "num_input_tokens_seen": 8797016, + "step": 4494 + }, + { + "epoch": 0.5957587806494367, + "grad_norm": 1.3778271675109863, + "learning_rate": 4.5306729348911735e-06, + "loss": 0.0097, + "num_input_tokens_seen": 8799024, + "step": 4495 + }, + { + "epoch": 0.5958913187541418, + "grad_norm": 0.6451683640480042, + "learning_rate": 4.530470436104033e-06, + "loss": 0.0047, + "num_input_tokens_seen": 8800184, + "step": 4496 + }, + { + "epoch": 0.596023856858847, + "grad_norm": 0.5446711182594299, + "learning_rate": 4.530267898168021e-06, + "loss": 0.0039, + "num_input_tokens_seen": 8802016, + "step": 4497 + }, + { + "epoch": 0.596156394963552, + "grad_norm": 7.1697797775268555, + "learning_rate": 4.5300653210870436e-06, + "loss": 0.1197, + "num_input_tokens_seen": 8804016, + "step": 4498 + }, + { + "epoch": 0.5962889330682571, + "grad_norm": 6.376197338104248, + "learning_rate": 4.529862704865006e-06, + "loss": 0.0903, + "num_input_tokens_seen": 8805752, + "step": 4499 + }, + { + "epoch": 0.5964214711729622, + "grad_norm": 10.263676643371582, + "learning_rate": 4.529660049505815e-06, + "loss": 0.604, + "num_input_tokens_seen": 8808208, + "step": 4500 + }, + { + "epoch": 0.5965540092776673, + "grad_norm": 0.5609136819839478, + "learning_rate": 4.529457355013379e-06, + "loss": 0.0027, + "num_input_tokens_seen": 8809904, + "step": 4501 + }, + { + "epoch": 0.5966865473823725, + "grad_norm": 0.05879468098282814, + "learning_rate": 4.529254621391604e-06, + "loss": 0.0004, + "num_input_tokens_seen": 8811944, + "step": 4502 + }, + { + "epoch": 0.5968190854870775, + "grad_norm": 0.12315049022436142, + "learning_rate": 4.529051848644401e-06, + "loss": 0.0009, + "num_input_tokens_seen": 8813856, + "step": 4503 + }, + { + "epoch": 0.5969516235917827, + "grad_norm": 6.565806865692139, + "learning_rate": 4.528849036775677e-06, + "loss": 0.131, + "num_input_tokens_seen": 8816008, + "step": 4504 + }, + { + "epoch": 0.5970841616964877, + "grad_norm": 8.05581283569336, + "learning_rate": 4.528646185789346e-06, + "loss": 0.1606, + "num_input_tokens_seen": 8817888, + "step": 4505 + }, + { + "epoch": 0.5972166998011929, + "grad_norm": 7.169454097747803, + "learning_rate": 4.528443295689317e-06, + "loss": 0.2224, + "num_input_tokens_seen": 8820128, + "step": 4506 + }, + { + "epoch": 0.597349237905898, + "grad_norm": 7.315868377685547, + "learning_rate": 4.5282403664795005e-06, + "loss": 0.313, + "num_input_tokens_seen": 8822536, + "step": 4507 + }, + { + "epoch": 0.597481776010603, + "grad_norm": 4.2231974601745605, + "learning_rate": 4.52803739816381e-06, + "loss": 0.0968, + "num_input_tokens_seen": 8825240, + "step": 4508 + }, + { + "epoch": 0.5976143141153082, + "grad_norm": 0.021635306999087334, + "learning_rate": 4.527834390746161e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8827992, + "step": 4509 + }, + { + "epoch": 0.5977468522200132, + "grad_norm": 10.768719673156738, + "learning_rate": 4.527631344230466e-06, + "loss": 0.302, + "num_input_tokens_seen": 8829688, + "step": 4510 + }, + { + "epoch": 0.5978793903247184, + "grad_norm": 0.04123631492257118, + "learning_rate": 4.527428258620639e-06, + "loss": 0.0002, + "num_input_tokens_seen": 8832256, + "step": 4511 + }, + { + "epoch": 0.5980119284294234, + "grad_norm": 10.023737907409668, + "learning_rate": 4.527225133920596e-06, + "loss": 0.1675, + "num_input_tokens_seen": 8833904, + "step": 4512 + }, + { + "epoch": 0.5981444665341286, + "grad_norm": 13.836169242858887, + "learning_rate": 4.5270219701342555e-06, + "loss": 0.6522, + "num_input_tokens_seen": 8835416, + "step": 4513 + }, + { + "epoch": 0.5982770046388337, + "grad_norm": 8.968985557556152, + "learning_rate": 4.5268187672655326e-06, + "loss": 0.4208, + "num_input_tokens_seen": 8837408, + "step": 4514 + }, + { + "epoch": 0.5984095427435387, + "grad_norm": 18.943763732910156, + "learning_rate": 4.526615525318345e-06, + "loss": 0.582, + "num_input_tokens_seen": 8839072, + "step": 4515 + }, + { + "epoch": 0.5985420808482439, + "grad_norm": 25.836790084838867, + "learning_rate": 4.526412244296613e-06, + "loss": 0.6857, + "num_input_tokens_seen": 8841808, + "step": 4516 + }, + { + "epoch": 0.5986746189529489, + "grad_norm": 14.204140663146973, + "learning_rate": 4.526208924204254e-06, + "loss": 0.2076, + "num_input_tokens_seen": 8843472, + "step": 4517 + }, + { + "epoch": 0.5988071570576541, + "grad_norm": 0.07145500928163528, + "learning_rate": 4.526005565045189e-06, + "loss": 0.0003, + "num_input_tokens_seen": 8845472, + "step": 4518 + }, + { + "epoch": 0.5989396951623592, + "grad_norm": 10.216188430786133, + "learning_rate": 4.52580216682334e-06, + "loss": 0.3633, + "num_input_tokens_seen": 8847200, + "step": 4519 + }, + { + "epoch": 0.5990722332670643, + "grad_norm": 0.04807635769248009, + "learning_rate": 4.5255987295426265e-06, + "loss": 0.0003, + "num_input_tokens_seen": 8849120, + "step": 4520 + }, + { + "epoch": 0.5992047713717694, + "grad_norm": 0.016722597181797028, + "learning_rate": 4.525395253206974e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8850944, + "step": 4521 + }, + { + "epoch": 0.5993373094764745, + "grad_norm": 7.352213382720947, + "learning_rate": 4.525191737820302e-06, + "loss": 0.2549, + "num_input_tokens_seen": 8853400, + "step": 4522 + }, + { + "epoch": 0.5994698475811796, + "grad_norm": 15.775116920471191, + "learning_rate": 4.524988183386536e-06, + "loss": 0.4617, + "num_input_tokens_seen": 8855240, + "step": 4523 + }, + { + "epoch": 0.5996023856858846, + "grad_norm": 0.0293857641518116, + "learning_rate": 4.524784589909602e-06, + "loss": 0.0002, + "num_input_tokens_seen": 8857008, + "step": 4524 + }, + { + "epoch": 0.5997349237905898, + "grad_norm": 0.024235641583800316, + "learning_rate": 4.524580957393424e-06, + "loss": 0.0002, + "num_input_tokens_seen": 8859904, + "step": 4525 + }, + { + "epoch": 0.5998674618952949, + "grad_norm": 11.712533950805664, + "learning_rate": 4.524377285841929e-06, + "loss": 0.2427, + "num_input_tokens_seen": 8861664, + "step": 4526 + }, + { + "epoch": 0.6, + "grad_norm": 0.02020755037665367, + "learning_rate": 4.524173575259042e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8863568, + "step": 4527 + }, + { + "epoch": 0.6001325381047051, + "grad_norm": 0.07037772238254547, + "learning_rate": 4.523969825648692e-06, + "loss": 0.0005, + "num_input_tokens_seen": 8865656, + "step": 4528 + }, + { + "epoch": 0.6002650762094102, + "grad_norm": 0.01822470687329769, + "learning_rate": 4.523766037014809e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8866960, + "step": 4529 + }, + { + "epoch": 0.6003976143141153, + "grad_norm": 6.157485485076904, + "learning_rate": 4.5235622093613195e-06, + "loss": 0.0622, + "num_input_tokens_seen": 8868400, + "step": 4530 + }, + { + "epoch": 0.6005301524188205, + "grad_norm": 8.748900413513184, + "learning_rate": 4.523358342692156e-06, + "loss": 0.3731, + "num_input_tokens_seen": 8870480, + "step": 4531 + }, + { + "epoch": 0.6006626905235255, + "grad_norm": 8.124286651611328, + "learning_rate": 4.523154437011247e-06, + "loss": 0.1777, + "num_input_tokens_seen": 8872776, + "step": 4532 + }, + { + "epoch": 0.6007952286282306, + "grad_norm": 4.451871395111084, + "learning_rate": 4.522950492322524e-06, + "loss": 0.0731, + "num_input_tokens_seen": 8874416, + "step": 4533 + }, + { + "epoch": 0.6009277667329357, + "grad_norm": 7.3767619132995605, + "learning_rate": 4.52274650862992e-06, + "loss": 0.1955, + "num_input_tokens_seen": 8876696, + "step": 4534 + }, + { + "epoch": 0.6010603048376408, + "grad_norm": 0.07187385857105255, + "learning_rate": 4.522542485937369e-06, + "loss": 0.0004, + "num_input_tokens_seen": 8877792, + "step": 4535 + }, + { + "epoch": 0.6011928429423459, + "grad_norm": 5.0182952880859375, + "learning_rate": 4.5223384242488036e-06, + "loss": 0.0838, + "num_input_tokens_seen": 8879240, + "step": 4536 + }, + { + "epoch": 0.601325381047051, + "grad_norm": 14.679693222045898, + "learning_rate": 4.522134323568157e-06, + "loss": 0.5397, + "num_input_tokens_seen": 8881304, + "step": 4537 + }, + { + "epoch": 0.6014579191517562, + "grad_norm": 5.694046497344971, + "learning_rate": 4.521930183899367e-06, + "loss": 0.1809, + "num_input_tokens_seen": 8883568, + "step": 4538 + }, + { + "epoch": 0.6015904572564612, + "grad_norm": 6.009949207305908, + "learning_rate": 4.521726005246367e-06, + "loss": 0.1011, + "num_input_tokens_seen": 8885168, + "step": 4539 + }, + { + "epoch": 0.6017229953611664, + "grad_norm": 6.791545391082764, + "learning_rate": 4.521521787613095e-06, + "loss": 0.2922, + "num_input_tokens_seen": 8887360, + "step": 4540 + }, + { + "epoch": 0.6018555334658714, + "grad_norm": 8.365147590637207, + "learning_rate": 4.521317531003489e-06, + "loss": 0.3158, + "num_input_tokens_seen": 8889592, + "step": 4541 + }, + { + "epoch": 0.6019880715705765, + "grad_norm": 0.9974453449249268, + "learning_rate": 4.521113235421487e-06, + "loss": 0.0037, + "num_input_tokens_seen": 8891600, + "step": 4542 + }, + { + "epoch": 0.6021206096752817, + "grad_norm": 0.3686714172363281, + "learning_rate": 4.520908900871026e-06, + "loss": 0.0023, + "num_input_tokens_seen": 8893024, + "step": 4543 + }, + { + "epoch": 0.6022531477799867, + "grad_norm": 7.982909202575684, + "learning_rate": 4.520704527356049e-06, + "loss": 0.2266, + "num_input_tokens_seen": 8894960, + "step": 4544 + }, + { + "epoch": 0.6023856858846919, + "grad_norm": 3.595158338546753, + "learning_rate": 4.520500114880494e-06, + "loss": 0.0648, + "num_input_tokens_seen": 8897016, + "step": 4545 + }, + { + "epoch": 0.6025182239893969, + "grad_norm": 7.6053595542907715, + "learning_rate": 4.520295663448302e-06, + "loss": 0.3825, + "num_input_tokens_seen": 8899176, + "step": 4546 + }, + { + "epoch": 0.6026507620941021, + "grad_norm": 3.278895139694214, + "learning_rate": 4.520091173063417e-06, + "loss": 0.1652, + "num_input_tokens_seen": 8900552, + "step": 4547 + }, + { + "epoch": 0.6027833001988071, + "grad_norm": 8.445926666259766, + "learning_rate": 4.51988664372978e-06, + "loss": 0.2359, + "num_input_tokens_seen": 8902752, + "step": 4548 + }, + { + "epoch": 0.6029158383035123, + "grad_norm": 11.667129516601562, + "learning_rate": 4.519682075451335e-06, + "loss": 0.3455, + "num_input_tokens_seen": 8904544, + "step": 4549 + }, + { + "epoch": 0.6030483764082174, + "grad_norm": 7.595519542694092, + "learning_rate": 4.5194774682320266e-06, + "loss": 0.1127, + "num_input_tokens_seen": 8906816, + "step": 4550 + }, + { + "epoch": 0.6031809145129224, + "grad_norm": 7.491942405700684, + "learning_rate": 4.5192728220758e-06, + "loss": 0.1301, + "num_input_tokens_seen": 8909136, + "step": 4551 + }, + { + "epoch": 0.6033134526176276, + "grad_norm": 0.23396670818328857, + "learning_rate": 4.5190681369865995e-06, + "loss": 0.0011, + "num_input_tokens_seen": 8911320, + "step": 4552 + }, + { + "epoch": 0.6034459907223326, + "grad_norm": 4.209238529205322, + "learning_rate": 4.518863412968373e-06, + "loss": 0.0349, + "num_input_tokens_seen": 8912736, + "step": 4553 + }, + { + "epoch": 0.6035785288270378, + "grad_norm": 6.982316493988037, + "learning_rate": 4.518658650025066e-06, + "loss": 0.2646, + "num_input_tokens_seen": 8913896, + "step": 4554 + }, + { + "epoch": 0.6037110669317429, + "grad_norm": 10.744330406188965, + "learning_rate": 4.518453848160629e-06, + "loss": 0.3957, + "num_input_tokens_seen": 8915752, + "step": 4555 + }, + { + "epoch": 0.603843605036448, + "grad_norm": 0.09913662821054459, + "learning_rate": 4.518249007379009e-06, + "loss": 0.0007, + "num_input_tokens_seen": 8918944, + "step": 4556 + }, + { + "epoch": 0.6039761431411531, + "grad_norm": 11.746828079223633, + "learning_rate": 4.5180441276841556e-06, + "loss": 0.4279, + "num_input_tokens_seen": 8920432, + "step": 4557 + }, + { + "epoch": 0.6041086812458581, + "grad_norm": 5.90751314163208, + "learning_rate": 4.51783920908002e-06, + "loss": 0.1511, + "num_input_tokens_seen": 8921856, + "step": 4558 + }, + { + "epoch": 0.6042412193505633, + "grad_norm": 0.10062120109796524, + "learning_rate": 4.517634251570551e-06, + "loss": 0.0007, + "num_input_tokens_seen": 8923072, + "step": 4559 + }, + { + "epoch": 0.6043737574552683, + "grad_norm": 6.651098251342773, + "learning_rate": 4.517429255159702e-06, + "loss": 0.2661, + "num_input_tokens_seen": 8925184, + "step": 4560 + }, + { + "epoch": 0.6045062955599735, + "grad_norm": 14.558205604553223, + "learning_rate": 4.517224219851426e-06, + "loss": 0.3519, + "num_input_tokens_seen": 8927152, + "step": 4561 + }, + { + "epoch": 0.6046388336646786, + "grad_norm": 9.759568214416504, + "learning_rate": 4.517019145649675e-06, + "loss": 0.4091, + "num_input_tokens_seen": 8929168, + "step": 4562 + }, + { + "epoch": 0.6047713717693837, + "grad_norm": 12.05634593963623, + "learning_rate": 4.516814032558404e-06, + "loss": 0.4341, + "num_input_tokens_seen": 8930848, + "step": 4563 + }, + { + "epoch": 0.6049039098740888, + "grad_norm": 18.662532806396484, + "learning_rate": 4.516608880581566e-06, + "loss": 0.4086, + "num_input_tokens_seen": 8932944, + "step": 4564 + }, + { + "epoch": 0.6050364479787939, + "grad_norm": 0.2509464621543884, + "learning_rate": 4.516403689723118e-06, + "loss": 0.0018, + "num_input_tokens_seen": 8934608, + "step": 4565 + }, + { + "epoch": 0.605168986083499, + "grad_norm": 11.71917724609375, + "learning_rate": 4.516198459987016e-06, + "loss": 0.3193, + "num_input_tokens_seen": 8936632, + "step": 4566 + }, + { + "epoch": 0.6053015241882042, + "grad_norm": 10.383745193481445, + "learning_rate": 4.515993191377217e-06, + "loss": 0.3079, + "num_input_tokens_seen": 8939072, + "step": 4567 + }, + { + "epoch": 0.6054340622929092, + "grad_norm": 0.43665623664855957, + "learning_rate": 4.515787883897679e-06, + "loss": 0.003, + "num_input_tokens_seen": 8940216, + "step": 4568 + }, + { + "epoch": 0.6055666003976143, + "grad_norm": 5.400319576263428, + "learning_rate": 4.515582537552359e-06, + "loss": 0.0571, + "num_input_tokens_seen": 8943520, + "step": 4569 + }, + { + "epoch": 0.6056991385023194, + "grad_norm": 9.727317810058594, + "learning_rate": 4.515377152345217e-06, + "loss": 0.1748, + "num_input_tokens_seen": 8945680, + "step": 4570 + }, + { + "epoch": 0.6058316766070245, + "grad_norm": 7.340142250061035, + "learning_rate": 4.515171728280214e-06, + "loss": 0.3424, + "num_input_tokens_seen": 8947920, + "step": 4571 + }, + { + "epoch": 0.6059642147117297, + "grad_norm": 2.4260149002075195, + "learning_rate": 4.514966265361309e-06, + "loss": 0.0258, + "num_input_tokens_seen": 8949368, + "step": 4572 + }, + { + "epoch": 0.6060967528164347, + "grad_norm": 4.754249572753906, + "learning_rate": 4.514760763592465e-06, + "loss": 0.0486, + "num_input_tokens_seen": 8951776, + "step": 4573 + }, + { + "epoch": 0.6062292909211399, + "grad_norm": 10.615996360778809, + "learning_rate": 4.514555222977643e-06, + "loss": 0.3899, + "num_input_tokens_seen": 8953488, + "step": 4574 + }, + { + "epoch": 0.6063618290258449, + "grad_norm": 17.5757999420166, + "learning_rate": 4.514349643520807e-06, + "loss": 0.5614, + "num_input_tokens_seen": 8955952, + "step": 4575 + }, + { + "epoch": 0.60649436713055, + "grad_norm": 11.869431495666504, + "learning_rate": 4.514144025225921e-06, + "loss": 0.464, + "num_input_tokens_seen": 8958344, + "step": 4576 + }, + { + "epoch": 0.6066269052352551, + "grad_norm": 7.819013595581055, + "learning_rate": 4.5139383680969475e-06, + "loss": 0.183, + "num_input_tokens_seen": 8960696, + "step": 4577 + }, + { + "epoch": 0.6067594433399602, + "grad_norm": 6.039429664611816, + "learning_rate": 4.513732672137854e-06, + "loss": 0.0695, + "num_input_tokens_seen": 8961944, + "step": 4578 + }, + { + "epoch": 0.6068919814446654, + "grad_norm": 6.966115951538086, + "learning_rate": 4.513526937352604e-06, + "loss": 0.2923, + "num_input_tokens_seen": 8964208, + "step": 4579 + }, + { + "epoch": 0.6070245195493704, + "grad_norm": 6.5732011795043945, + "learning_rate": 4.5133211637451665e-06, + "loss": 0.1039, + "num_input_tokens_seen": 8966616, + "step": 4580 + }, + { + "epoch": 0.6071570576540756, + "grad_norm": 0.7121011018753052, + "learning_rate": 4.513115351319508e-06, + "loss": 0.0052, + "num_input_tokens_seen": 8968776, + "step": 4581 + }, + { + "epoch": 0.6072895957587806, + "grad_norm": 14.751883506774902, + "learning_rate": 4.512909500079596e-06, + "loss": 0.4606, + "num_input_tokens_seen": 8971584, + "step": 4582 + }, + { + "epoch": 0.6074221338634858, + "grad_norm": 34.59712219238281, + "learning_rate": 4.5127036100294015e-06, + "loss": 0.3538, + "num_input_tokens_seen": 8973280, + "step": 4583 + }, + { + "epoch": 0.6075546719681909, + "grad_norm": 1.0584779977798462, + "learning_rate": 4.512497681172892e-06, + "loss": 0.0074, + "num_input_tokens_seen": 8975152, + "step": 4584 + }, + { + "epoch": 0.607687210072896, + "grad_norm": 20.662830352783203, + "learning_rate": 4.51229171351404e-06, + "loss": 0.2321, + "num_input_tokens_seen": 8977360, + "step": 4585 + }, + { + "epoch": 0.6078197481776011, + "grad_norm": 8.430717468261719, + "learning_rate": 4.512085707056813e-06, + "loss": 0.2758, + "num_input_tokens_seen": 8979096, + "step": 4586 + }, + { + "epoch": 0.6079522862823061, + "grad_norm": 0.2120072841644287, + "learning_rate": 4.511879661805188e-06, + "loss": 0.0015, + "num_input_tokens_seen": 8980808, + "step": 4587 + }, + { + "epoch": 0.6080848243870113, + "grad_norm": 9.471118927001953, + "learning_rate": 4.511673577763134e-06, + "loss": 0.4393, + "num_input_tokens_seen": 8982432, + "step": 4588 + }, + { + "epoch": 0.6082173624917163, + "grad_norm": 12.794418334960938, + "learning_rate": 4.511467454934626e-06, + "loss": 0.4066, + "num_input_tokens_seen": 8983584, + "step": 4589 + }, + { + "epoch": 0.6083499005964215, + "grad_norm": 14.948718070983887, + "learning_rate": 4.511261293323637e-06, + "loss": 0.3341, + "num_input_tokens_seen": 8985544, + "step": 4590 + }, + { + "epoch": 0.6084824387011266, + "grad_norm": 14.16418170928955, + "learning_rate": 4.511055092934144e-06, + "loss": 0.2328, + "num_input_tokens_seen": 8987360, + "step": 4591 + }, + { + "epoch": 0.6086149768058317, + "grad_norm": 11.421998977661133, + "learning_rate": 4.510848853770122e-06, + "loss": 0.2661, + "num_input_tokens_seen": 8988944, + "step": 4592 + }, + { + "epoch": 0.6087475149105368, + "grad_norm": 11.948812484741211, + "learning_rate": 4.510642575835545e-06, + "loss": 0.4508, + "num_input_tokens_seen": 8990792, + "step": 4593 + }, + { + "epoch": 0.6088800530152418, + "grad_norm": 0.09583049267530441, + "learning_rate": 4.510436259134394e-06, + "loss": 0.0007, + "num_input_tokens_seen": 8991808, + "step": 4594 + }, + { + "epoch": 0.609012591119947, + "grad_norm": 1.7254836559295654, + "learning_rate": 4.510229903670643e-06, + "loss": 0.0102, + "num_input_tokens_seen": 8993784, + "step": 4595 + }, + { + "epoch": 0.6091451292246521, + "grad_norm": 9.337934494018555, + "learning_rate": 4.510023509448273e-06, + "loss": 0.2054, + "num_input_tokens_seen": 8995760, + "step": 4596 + }, + { + "epoch": 0.6092776673293572, + "grad_norm": 15.228385925292969, + "learning_rate": 4.509817076471265e-06, + "loss": 0.6889, + "num_input_tokens_seen": 8997864, + "step": 4597 + }, + { + "epoch": 0.6094102054340623, + "grad_norm": 12.639373779296875, + "learning_rate": 4.509610604743596e-06, + "loss": 0.367, + "num_input_tokens_seen": 8999664, + "step": 4598 + }, + { + "epoch": 0.6095427435387674, + "grad_norm": 12.03857421875, + "learning_rate": 4.509404094269248e-06, + "loss": 0.3081, + "num_input_tokens_seen": 9001928, + "step": 4599 + }, + { + "epoch": 0.6096752816434725, + "grad_norm": 3.5542972087860107, + "learning_rate": 4.509197545052203e-06, + "loss": 0.0285, + "num_input_tokens_seen": 9003808, + "step": 4600 + }, + { + "epoch": 0.6098078197481775, + "grad_norm": 0.13124549388885498, + "learning_rate": 4.508990957096443e-06, + "loss": 0.0009, + "num_input_tokens_seen": 9005184, + "step": 4601 + }, + { + "epoch": 0.6099403578528827, + "grad_norm": 3.2197208404541016, + "learning_rate": 4.508784330405952e-06, + "loss": 0.0284, + "num_input_tokens_seen": 9006552, + "step": 4602 + }, + { + "epoch": 0.6100728959575878, + "grad_norm": 1.1755419969558716, + "learning_rate": 4.508577664984713e-06, + "loss": 0.0048, + "num_input_tokens_seen": 9008816, + "step": 4603 + }, + { + "epoch": 0.6102054340622929, + "grad_norm": 0.0717979297041893, + "learning_rate": 4.508370960836711e-06, + "loss": 0.0005, + "num_input_tokens_seen": 9009864, + "step": 4604 + }, + { + "epoch": 0.610337972166998, + "grad_norm": 15.239445686340332, + "learning_rate": 4.508164217965931e-06, + "loss": 0.3906, + "num_input_tokens_seen": 9011568, + "step": 4605 + }, + { + "epoch": 0.6104705102717031, + "grad_norm": 12.738359451293945, + "learning_rate": 4.50795743637636e-06, + "loss": 0.4596, + "num_input_tokens_seen": 9013928, + "step": 4606 + }, + { + "epoch": 0.6106030483764082, + "grad_norm": 10.088764190673828, + "learning_rate": 4.507750616071984e-06, + "loss": 0.2146, + "num_input_tokens_seen": 9015048, + "step": 4607 + }, + { + "epoch": 0.6107355864811134, + "grad_norm": 0.1670483946800232, + "learning_rate": 4.507543757056792e-06, + "loss": 0.0012, + "num_input_tokens_seen": 9017656, + "step": 4608 + }, + { + "epoch": 0.6108681245858184, + "grad_norm": 12.729340553283691, + "learning_rate": 4.50733685933477e-06, + "loss": 0.2315, + "num_input_tokens_seen": 9019296, + "step": 4609 + }, + { + "epoch": 0.6110006626905236, + "grad_norm": 0.15931633114814758, + "learning_rate": 4.507129922909909e-06, + "loss": 0.0011, + "num_input_tokens_seen": 9020536, + "step": 4610 + }, + { + "epoch": 0.6111332007952286, + "grad_norm": 7.022318363189697, + "learning_rate": 4.506922947786199e-06, + "loss": 0.3223, + "num_input_tokens_seen": 9021912, + "step": 4611 + }, + { + "epoch": 0.6112657388999337, + "grad_norm": 1.6859134435653687, + "learning_rate": 4.506715933967631e-06, + "loss": 0.0127, + "num_input_tokens_seen": 9023864, + "step": 4612 + }, + { + "epoch": 0.6113982770046388, + "grad_norm": 0.11855582147836685, + "learning_rate": 4.5065088814581935e-06, + "loss": 0.0008, + "num_input_tokens_seen": 9024984, + "step": 4613 + }, + { + "epoch": 0.6115308151093439, + "grad_norm": 10.458747863769531, + "learning_rate": 4.506301790261881e-06, + "loss": 0.3945, + "num_input_tokens_seen": 9027080, + "step": 4614 + }, + { + "epoch": 0.6116633532140491, + "grad_norm": 5.482684135437012, + "learning_rate": 4.5060946603826866e-06, + "loss": 0.1763, + "num_input_tokens_seen": 9029760, + "step": 4615 + }, + { + "epoch": 0.6117958913187541, + "grad_norm": 7.692590236663818, + "learning_rate": 4.5058874918246035e-06, + "loss": 0.2638, + "num_input_tokens_seen": 9031400, + "step": 4616 + }, + { + "epoch": 0.6119284294234593, + "grad_norm": 9.06706714630127, + "learning_rate": 4.5056802845916245e-06, + "loss": 0.1305, + "num_input_tokens_seen": 9034216, + "step": 4617 + }, + { + "epoch": 0.6120609675281643, + "grad_norm": 4.921189785003662, + "learning_rate": 4.505473038687747e-06, + "loss": 0.1956, + "num_input_tokens_seen": 9036536, + "step": 4618 + }, + { + "epoch": 0.6121935056328695, + "grad_norm": 0.14490513503551483, + "learning_rate": 4.505265754116965e-06, + "loss": 0.001, + "num_input_tokens_seen": 9038688, + "step": 4619 + }, + { + "epoch": 0.6123260437375746, + "grad_norm": 9.038633346557617, + "learning_rate": 4.505058430883275e-06, + "loss": 0.2481, + "num_input_tokens_seen": 9040088, + "step": 4620 + }, + { + "epoch": 0.6124585818422796, + "grad_norm": 9.133187294006348, + "learning_rate": 4.504851068990678e-06, + "loss": 0.2461, + "num_input_tokens_seen": 9041848, + "step": 4621 + }, + { + "epoch": 0.6125911199469848, + "grad_norm": 1.131885290145874, + "learning_rate": 4.504643668443167e-06, + "loss": 0.008, + "num_input_tokens_seen": 9044344, + "step": 4622 + }, + { + "epoch": 0.6127236580516898, + "grad_norm": 8.648845672607422, + "learning_rate": 4.504436229244744e-06, + "loss": 0.2392, + "num_input_tokens_seen": 9046808, + "step": 4623 + }, + { + "epoch": 0.612856196156395, + "grad_norm": 8.154289245605469, + "learning_rate": 4.504228751399408e-06, + "loss": 0.1539, + "num_input_tokens_seen": 9048408, + "step": 4624 + }, + { + "epoch": 0.6129887342611001, + "grad_norm": 0.12523721158504486, + "learning_rate": 4.5040212349111586e-06, + "loss": 0.0009, + "num_input_tokens_seen": 9049880, + "step": 4625 + }, + { + "epoch": 0.6131212723658052, + "grad_norm": 0.35211700201034546, + "learning_rate": 4.503813679783997e-06, + "loss": 0.0025, + "num_input_tokens_seen": 9052352, + "step": 4626 + }, + { + "epoch": 0.6132538104705103, + "grad_norm": 7.847185134887695, + "learning_rate": 4.503606086021925e-06, + "loss": 0.113, + "num_input_tokens_seen": 9054200, + "step": 4627 + }, + { + "epoch": 0.6133863485752153, + "grad_norm": 0.22652332484722137, + "learning_rate": 4.5033984536289475e-06, + "loss": 0.0017, + "num_input_tokens_seen": 9056184, + "step": 4628 + }, + { + "epoch": 0.6135188866799205, + "grad_norm": 23.547622680664062, + "learning_rate": 4.5031907826090634e-06, + "loss": 0.7299, + "num_input_tokens_seen": 9057960, + "step": 4629 + }, + { + "epoch": 0.6136514247846255, + "grad_norm": 9.33605670928955, + "learning_rate": 4.50298307296628e-06, + "loss": 0.3337, + "num_input_tokens_seen": 9060576, + "step": 4630 + }, + { + "epoch": 0.6137839628893307, + "grad_norm": 9.735233306884766, + "learning_rate": 4.502775324704601e-06, + "loss": 0.1273, + "num_input_tokens_seen": 9062152, + "step": 4631 + }, + { + "epoch": 0.6139165009940358, + "grad_norm": 16.75956916809082, + "learning_rate": 4.502567537828033e-06, + "loss": 0.4844, + "num_input_tokens_seen": 9063888, + "step": 4632 + }, + { + "epoch": 0.6140490390987409, + "grad_norm": 0.08369563519954681, + "learning_rate": 4.502359712340581e-06, + "loss": 0.0006, + "num_input_tokens_seen": 9066032, + "step": 4633 + }, + { + "epoch": 0.614181577203446, + "grad_norm": 3.727271795272827, + "learning_rate": 4.502151848246252e-06, + "loss": 0.0293, + "num_input_tokens_seen": 9068296, + "step": 4634 + }, + { + "epoch": 0.614314115308151, + "grad_norm": 19.521671295166016, + "learning_rate": 4.501943945549054e-06, + "loss": 0.7203, + "num_input_tokens_seen": 9070856, + "step": 4635 + }, + { + "epoch": 0.6144466534128562, + "grad_norm": 7.795699119567871, + "learning_rate": 4.501736004252995e-06, + "loss": 0.2465, + "num_input_tokens_seen": 9072112, + "step": 4636 + }, + { + "epoch": 0.6145791915175614, + "grad_norm": 14.626348495483398, + "learning_rate": 4.501528024362086e-06, + "loss": 0.7532, + "num_input_tokens_seen": 9074160, + "step": 4637 + }, + { + "epoch": 0.6147117296222664, + "grad_norm": 17.480411529541016, + "learning_rate": 4.501320005880337e-06, + "loss": 0.3319, + "num_input_tokens_seen": 9076224, + "step": 4638 + }, + { + "epoch": 0.6148442677269715, + "grad_norm": 0.7760393023490906, + "learning_rate": 4.501111948811755e-06, + "loss": 0.0054, + "num_input_tokens_seen": 9077504, + "step": 4639 + }, + { + "epoch": 0.6149768058316766, + "grad_norm": 8.23949909210205, + "learning_rate": 4.500903853160356e-06, + "loss": 0.1324, + "num_input_tokens_seen": 9079064, + "step": 4640 + }, + { + "epoch": 0.6151093439363817, + "grad_norm": 7.4478960037231445, + "learning_rate": 4.50069571893015e-06, + "loss": 0.3362, + "num_input_tokens_seen": 9080992, + "step": 4641 + }, + { + "epoch": 0.6152418820410868, + "grad_norm": 0.12939141690731049, + "learning_rate": 4.50048754612515e-06, + "loss": 0.0009, + "num_input_tokens_seen": 9082480, + "step": 4642 + }, + { + "epoch": 0.6153744201457919, + "grad_norm": 4.620145320892334, + "learning_rate": 4.50027933474937e-06, + "loss": 0.2403, + "num_input_tokens_seen": 9085328, + "step": 4643 + }, + { + "epoch": 0.6155069582504971, + "grad_norm": 0.10231047868728638, + "learning_rate": 4.500071084806824e-06, + "loss": 0.0007, + "num_input_tokens_seen": 9086480, + "step": 4644 + }, + { + "epoch": 0.6156394963552021, + "grad_norm": 14.9724702835083, + "learning_rate": 4.499862796301529e-06, + "loss": 0.2381, + "num_input_tokens_seen": 9088568, + "step": 4645 + }, + { + "epoch": 0.6157720344599072, + "grad_norm": 0.11861508339643478, + "learning_rate": 4.4996544692375e-06, + "loss": 0.0008, + "num_input_tokens_seen": 9091232, + "step": 4646 + }, + { + "epoch": 0.6159045725646123, + "grad_norm": 0.21257458627223969, + "learning_rate": 4.499446103618752e-06, + "loss": 0.0015, + "num_input_tokens_seen": 9092344, + "step": 4647 + }, + { + "epoch": 0.6160371106693174, + "grad_norm": 10.051376342773438, + "learning_rate": 4.499237699449304e-06, + "loss": 0.309, + "num_input_tokens_seen": 9094752, + "step": 4648 + }, + { + "epoch": 0.6161696487740226, + "grad_norm": 3.0165855884552, + "learning_rate": 4.499029256733174e-06, + "loss": 0.0623, + "num_input_tokens_seen": 9096048, + "step": 4649 + }, + { + "epoch": 0.6163021868787276, + "grad_norm": 10.058028221130371, + "learning_rate": 4.4988207754743805e-06, + "loss": 0.4459, + "num_input_tokens_seen": 9098368, + "step": 4650 + }, + { + "epoch": 0.6164347249834328, + "grad_norm": 14.995697975158691, + "learning_rate": 4.498612255676944e-06, + "loss": 0.6362, + "num_input_tokens_seen": 9100104, + "step": 4651 + }, + { + "epoch": 0.6165672630881378, + "grad_norm": 10.024073600769043, + "learning_rate": 4.498403697344885e-06, + "loss": 0.3696, + "num_input_tokens_seen": 9102664, + "step": 4652 + }, + { + "epoch": 0.616699801192843, + "grad_norm": 2.230666399002075, + "learning_rate": 4.4981951004822235e-06, + "loss": 0.0492, + "num_input_tokens_seen": 9103816, + "step": 4653 + }, + { + "epoch": 0.616832339297548, + "grad_norm": 13.140382766723633, + "learning_rate": 4.497986465092982e-06, + "loss": 0.525, + "num_input_tokens_seen": 9105800, + "step": 4654 + }, + { + "epoch": 0.6169648774022531, + "grad_norm": 13.978934288024902, + "learning_rate": 4.497777791181183e-06, + "loss": 0.3803, + "num_input_tokens_seen": 9108296, + "step": 4655 + }, + { + "epoch": 0.6170974155069583, + "grad_norm": 8.822529792785645, + "learning_rate": 4.4975690787508504e-06, + "loss": 0.3266, + "num_input_tokens_seen": 9109512, + "step": 4656 + }, + { + "epoch": 0.6172299536116633, + "grad_norm": 3.7693190574645996, + "learning_rate": 4.497360327806007e-06, + "loss": 0.1337, + "num_input_tokens_seen": 9110984, + "step": 4657 + }, + { + "epoch": 0.6173624917163685, + "grad_norm": 0.3466157019138336, + "learning_rate": 4.497151538350679e-06, + "loss": 0.0025, + "num_input_tokens_seen": 9112592, + "step": 4658 + }, + { + "epoch": 0.6174950298210735, + "grad_norm": 2.6810734272003174, + "learning_rate": 4.4969427103888925e-06, + "loss": 0.021, + "num_input_tokens_seen": 9114360, + "step": 4659 + }, + { + "epoch": 0.6176275679257787, + "grad_norm": 26.58724594116211, + "learning_rate": 4.4967338439246725e-06, + "loss": 0.6333, + "num_input_tokens_seen": 9117104, + "step": 4660 + }, + { + "epoch": 0.6177601060304838, + "grad_norm": 1.0373183488845825, + "learning_rate": 4.496524938962046e-06, + "loss": 0.0072, + "num_input_tokens_seen": 9118536, + "step": 4661 + }, + { + "epoch": 0.6178926441351889, + "grad_norm": 13.612814903259277, + "learning_rate": 4.496315995505042e-06, + "loss": 0.2523, + "num_input_tokens_seen": 9121296, + "step": 4662 + }, + { + "epoch": 0.618025182239894, + "grad_norm": 16.107580184936523, + "learning_rate": 4.496107013557687e-06, + "loss": 0.5627, + "num_input_tokens_seen": 9123352, + "step": 4663 + }, + { + "epoch": 0.618157720344599, + "grad_norm": 4.370780944824219, + "learning_rate": 4.4958979931240135e-06, + "loss": 0.1225, + "num_input_tokens_seen": 9126016, + "step": 4664 + }, + { + "epoch": 0.6182902584493042, + "grad_norm": 8.912665367126465, + "learning_rate": 4.495688934208049e-06, + "loss": 0.2641, + "num_input_tokens_seen": 9128240, + "step": 4665 + }, + { + "epoch": 0.6184227965540092, + "grad_norm": 10.929624557495117, + "learning_rate": 4.495479836813825e-06, + "loss": 0.3224, + "num_input_tokens_seen": 9130128, + "step": 4666 + }, + { + "epoch": 0.6185553346587144, + "grad_norm": 0.5185471773147583, + "learning_rate": 4.495270700945373e-06, + "loss": 0.0038, + "num_input_tokens_seen": 9131744, + "step": 4667 + }, + { + "epoch": 0.6186878727634195, + "grad_norm": 2.056727647781372, + "learning_rate": 4.495061526606725e-06, + "loss": 0.0142, + "num_input_tokens_seen": 9133344, + "step": 4668 + }, + { + "epoch": 0.6188204108681246, + "grad_norm": 6.010608673095703, + "learning_rate": 4.494852313801916e-06, + "loss": 0.0611, + "num_input_tokens_seen": 9134944, + "step": 4669 + }, + { + "epoch": 0.6189529489728297, + "grad_norm": 0.33955755829811096, + "learning_rate": 4.4946430625349765e-06, + "loss": 0.0024, + "num_input_tokens_seen": 9135952, + "step": 4670 + }, + { + "epoch": 0.6190854870775347, + "grad_norm": 2.5804831981658936, + "learning_rate": 4.494433772809944e-06, + "loss": 0.0757, + "num_input_tokens_seen": 9138184, + "step": 4671 + }, + { + "epoch": 0.6192180251822399, + "grad_norm": 0.4750082790851593, + "learning_rate": 4.494224444630852e-06, + "loss": 0.0032, + "num_input_tokens_seen": 9139976, + "step": 4672 + }, + { + "epoch": 0.619350563286945, + "grad_norm": 0.6756501197814941, + "learning_rate": 4.494015078001737e-06, + "loss": 0.0035, + "num_input_tokens_seen": 9141576, + "step": 4673 + }, + { + "epoch": 0.6194831013916501, + "grad_norm": 11.099910736083984, + "learning_rate": 4.493805672926636e-06, + "loss": 0.3121, + "num_input_tokens_seen": 9143832, + "step": 4674 + }, + { + "epoch": 0.6196156394963552, + "grad_norm": 14.962285041809082, + "learning_rate": 4.4935962294095856e-06, + "loss": 0.717, + "num_input_tokens_seen": 9146200, + "step": 4675 + }, + { + "epoch": 0.6197481776010603, + "grad_norm": 0.07355666905641556, + "learning_rate": 4.493386747454624e-06, + "loss": 0.0005, + "num_input_tokens_seen": 9147720, + "step": 4676 + }, + { + "epoch": 0.6198807157057654, + "grad_norm": 11.331989288330078, + "learning_rate": 4.4931772270657925e-06, + "loss": 0.1293, + "num_input_tokens_seen": 9150272, + "step": 4677 + }, + { + "epoch": 0.6200132538104705, + "grad_norm": 9.6017484664917, + "learning_rate": 4.492967668247128e-06, + "loss": 0.3661, + "num_input_tokens_seen": 9152872, + "step": 4678 + }, + { + "epoch": 0.6201457919151756, + "grad_norm": 16.255386352539062, + "learning_rate": 4.492758071002672e-06, + "loss": 0.3591, + "num_input_tokens_seen": 9154104, + "step": 4679 + }, + { + "epoch": 0.6202783300198808, + "grad_norm": 13.214810371398926, + "learning_rate": 4.4925484353364654e-06, + "loss": 0.6361, + "num_input_tokens_seen": 9156224, + "step": 4680 + }, + { + "epoch": 0.6204108681245858, + "grad_norm": 1.5089373588562012, + "learning_rate": 4.492338761252551e-06, + "loss": 0.0171, + "num_input_tokens_seen": 9157512, + "step": 4681 + }, + { + "epoch": 0.6205434062292909, + "grad_norm": 10.229904174804688, + "learning_rate": 4.492129048754971e-06, + "loss": 0.4306, + "num_input_tokens_seen": 9159392, + "step": 4682 + }, + { + "epoch": 0.620675944333996, + "grad_norm": 0.1794777810573578, + "learning_rate": 4.4919192978477675e-06, + "loss": 0.0012, + "num_input_tokens_seen": 9162344, + "step": 4683 + }, + { + "epoch": 0.6208084824387011, + "grad_norm": 8.99548625946045, + "learning_rate": 4.491709508534987e-06, + "loss": 0.1933, + "num_input_tokens_seen": 9165552, + "step": 4684 + }, + { + "epoch": 0.6209410205434063, + "grad_norm": 4.505803108215332, + "learning_rate": 4.491499680820672e-06, + "loss": 0.0846, + "num_input_tokens_seen": 9167560, + "step": 4685 + }, + { + "epoch": 0.6210735586481113, + "grad_norm": 4.874803066253662, + "learning_rate": 4.49128981470887e-06, + "loss": 0.1084, + "num_input_tokens_seen": 9169160, + "step": 4686 + }, + { + "epoch": 0.6212060967528165, + "grad_norm": 4.848454475402832, + "learning_rate": 4.491079910203627e-06, + "loss": 0.227, + "num_input_tokens_seen": 9171248, + "step": 4687 + }, + { + "epoch": 0.6213386348575215, + "grad_norm": 6.063534259796143, + "learning_rate": 4.49086996730899e-06, + "loss": 0.1123, + "num_input_tokens_seen": 9173312, + "step": 4688 + }, + { + "epoch": 0.6214711729622266, + "grad_norm": 9.427091598510742, + "learning_rate": 4.490659986029005e-06, + "loss": 0.0885, + "num_input_tokens_seen": 9175872, + "step": 4689 + }, + { + "epoch": 0.6216037110669318, + "grad_norm": 13.28331470489502, + "learning_rate": 4.490449966367724e-06, + "loss": 0.1892, + "num_input_tokens_seen": 9177536, + "step": 4690 + }, + { + "epoch": 0.6217362491716368, + "grad_norm": 3.939404249191284, + "learning_rate": 4.490239908329193e-06, + "loss": 0.0915, + "num_input_tokens_seen": 9179024, + "step": 4691 + }, + { + "epoch": 0.621868787276342, + "grad_norm": 12.433578491210938, + "learning_rate": 4.490029811917465e-06, + "loss": 0.3202, + "num_input_tokens_seen": 9180600, + "step": 4692 + }, + { + "epoch": 0.622001325381047, + "grad_norm": 12.812872886657715, + "learning_rate": 4.489819677136587e-06, + "loss": 0.2954, + "num_input_tokens_seen": 9182512, + "step": 4693 + }, + { + "epoch": 0.6221338634857522, + "grad_norm": 0.3575594425201416, + "learning_rate": 4.489609503990615e-06, + "loss": 0.0022, + "num_input_tokens_seen": 9183576, + "step": 4694 + }, + { + "epoch": 0.6222664015904572, + "grad_norm": 8.992940902709961, + "learning_rate": 4.489399292483599e-06, + "loss": 0.3486, + "num_input_tokens_seen": 9185464, + "step": 4695 + }, + { + "epoch": 0.6223989396951624, + "grad_norm": 10.612434387207031, + "learning_rate": 4.489189042619592e-06, + "loss": 0.1858, + "num_input_tokens_seen": 9187336, + "step": 4696 + }, + { + "epoch": 0.6225314777998675, + "grad_norm": 5.957917213439941, + "learning_rate": 4.488978754402648e-06, + "loss": 0.1281, + "num_input_tokens_seen": 9189568, + "step": 4697 + }, + { + "epoch": 0.6226640159045725, + "grad_norm": 15.949165344238281, + "learning_rate": 4.488768427836821e-06, + "loss": 1.0414, + "num_input_tokens_seen": 9192304, + "step": 4698 + }, + { + "epoch": 0.6227965540092777, + "grad_norm": 0.714363157749176, + "learning_rate": 4.488558062926167e-06, + "loss": 0.0047, + "num_input_tokens_seen": 9195248, + "step": 4699 + }, + { + "epoch": 0.6229290921139827, + "grad_norm": 4.676016330718994, + "learning_rate": 4.488347659674742e-06, + "loss": 0.1134, + "num_input_tokens_seen": 9196888, + "step": 4700 + }, + { + "epoch": 0.6230616302186879, + "grad_norm": 8.10399341583252, + "learning_rate": 4.488137218086602e-06, + "loss": 0.1283, + "num_input_tokens_seen": 9198784, + "step": 4701 + }, + { + "epoch": 0.623194168323393, + "grad_norm": 18.103416442871094, + "learning_rate": 4.487926738165805e-06, + "loss": 0.4921, + "num_input_tokens_seen": 9200536, + "step": 4702 + }, + { + "epoch": 0.6233267064280981, + "grad_norm": 4.6310858726501465, + "learning_rate": 4.48771621991641e-06, + "loss": 0.061, + "num_input_tokens_seen": 9201952, + "step": 4703 + }, + { + "epoch": 0.6234592445328032, + "grad_norm": 0.174373060464859, + "learning_rate": 4.487505663342474e-06, + "loss": 0.0012, + "num_input_tokens_seen": 9205240, + "step": 4704 + }, + { + "epoch": 0.6235917826375083, + "grad_norm": 0.4662921130657196, + "learning_rate": 4.487295068448057e-06, + "loss": 0.0025, + "num_input_tokens_seen": 9206600, + "step": 4705 + }, + { + "epoch": 0.6237243207422134, + "grad_norm": 2.202655792236328, + "learning_rate": 4.487084435237222e-06, + "loss": 0.0561, + "num_input_tokens_seen": 9208184, + "step": 4706 + }, + { + "epoch": 0.6238568588469184, + "grad_norm": 7.26932430267334, + "learning_rate": 4.486873763714028e-06, + "loss": 0.1977, + "num_input_tokens_seen": 9210136, + "step": 4707 + }, + { + "epoch": 0.6239893969516236, + "grad_norm": 10.733011245727539, + "learning_rate": 4.486663053882536e-06, + "loss": 0.1926, + "num_input_tokens_seen": 9211392, + "step": 4708 + }, + { + "epoch": 0.6241219350563287, + "grad_norm": 14.1680908203125, + "learning_rate": 4.48645230574681e-06, + "loss": 0.2926, + "num_input_tokens_seen": 9213688, + "step": 4709 + }, + { + "epoch": 0.6242544731610338, + "grad_norm": 12.292638778686523, + "learning_rate": 4.4862415193109135e-06, + "loss": 0.1858, + "num_input_tokens_seen": 9215984, + "step": 4710 + }, + { + "epoch": 0.6243870112657389, + "grad_norm": 0.7757108807563782, + "learning_rate": 4.48603069457891e-06, + "loss": 0.0048, + "num_input_tokens_seen": 9217648, + "step": 4711 + }, + { + "epoch": 0.624519549370444, + "grad_norm": 5.925593376159668, + "learning_rate": 4.485819831554865e-06, + "loss": 0.1952, + "num_input_tokens_seen": 9220024, + "step": 4712 + }, + { + "epoch": 0.6246520874751491, + "grad_norm": 15.597627639770508, + "learning_rate": 4.485608930242844e-06, + "loss": 0.643, + "num_input_tokens_seen": 9222656, + "step": 4713 + }, + { + "epoch": 0.6247846255798543, + "grad_norm": 11.462278366088867, + "learning_rate": 4.485397990646912e-06, + "loss": 0.2415, + "num_input_tokens_seen": 9224288, + "step": 4714 + }, + { + "epoch": 0.6249171636845593, + "grad_norm": 9.234786033630371, + "learning_rate": 4.4851870127711385e-06, + "loss": 0.1598, + "num_input_tokens_seen": 9226040, + "step": 4715 + }, + { + "epoch": 0.6250497017892644, + "grad_norm": 0.14623677730560303, + "learning_rate": 4.4849759966195885e-06, + "loss": 0.0009, + "num_input_tokens_seen": 9227856, + "step": 4716 + }, + { + "epoch": 0.6251822398939695, + "grad_norm": 15.587583541870117, + "learning_rate": 4.484764942196332e-06, + "loss": 0.491, + "num_input_tokens_seen": 9229856, + "step": 4717 + }, + { + "epoch": 0.6253147779986746, + "grad_norm": 4.736959934234619, + "learning_rate": 4.48455384950544e-06, + "loss": 0.1619, + "num_input_tokens_seen": 9231400, + "step": 4718 + }, + { + "epoch": 0.6254473161033797, + "grad_norm": 6.97900390625, + "learning_rate": 4.4843427185509786e-06, + "loss": 0.1963, + "num_input_tokens_seen": 9234672, + "step": 4719 + }, + { + "epoch": 0.6255798542080848, + "grad_norm": 6.921455383300781, + "learning_rate": 4.484131549337022e-06, + "loss": 0.3091, + "num_input_tokens_seen": 9236456, + "step": 4720 + }, + { + "epoch": 0.62571239231279, + "grad_norm": 7.441236972808838, + "learning_rate": 4.483920341867639e-06, + "loss": 0.287, + "num_input_tokens_seen": 9238352, + "step": 4721 + }, + { + "epoch": 0.625844930417495, + "grad_norm": 11.993032455444336, + "learning_rate": 4.483709096146904e-06, + "loss": 0.2502, + "num_input_tokens_seen": 9240728, + "step": 4722 + }, + { + "epoch": 0.6259774685222002, + "grad_norm": 10.66808032989502, + "learning_rate": 4.48349781217889e-06, + "loss": 0.4643, + "num_input_tokens_seen": 9242936, + "step": 4723 + }, + { + "epoch": 0.6261100066269052, + "grad_norm": 4.909586429595947, + "learning_rate": 4.483286489967669e-06, + "loss": 0.077, + "num_input_tokens_seen": 9245000, + "step": 4724 + }, + { + "epoch": 0.6262425447316103, + "grad_norm": 3.0337061882019043, + "learning_rate": 4.483075129517315e-06, + "loss": 0.0338, + "num_input_tokens_seen": 9246624, + "step": 4725 + }, + { + "epoch": 0.6263750828363155, + "grad_norm": 11.160965919494629, + "learning_rate": 4.482863730831906e-06, + "loss": 0.3363, + "num_input_tokens_seen": 9248256, + "step": 4726 + }, + { + "epoch": 0.6265076209410205, + "grad_norm": 6.9632439613342285, + "learning_rate": 4.482652293915516e-06, + "loss": 0.1224, + "num_input_tokens_seen": 9250808, + "step": 4727 + }, + { + "epoch": 0.6266401590457257, + "grad_norm": 9.692371368408203, + "learning_rate": 4.482440818772222e-06, + "loss": 0.3217, + "num_input_tokens_seen": 9252448, + "step": 4728 + }, + { + "epoch": 0.6267726971504307, + "grad_norm": 15.692598342895508, + "learning_rate": 4.482229305406101e-06, + "loss": 0.5902, + "num_input_tokens_seen": 9255096, + "step": 4729 + }, + { + "epoch": 0.6269052352551359, + "grad_norm": 0.28097230195999146, + "learning_rate": 4.482017753821233e-06, + "loss": 0.0016, + "num_input_tokens_seen": 9256976, + "step": 4730 + }, + { + "epoch": 0.6270377733598409, + "grad_norm": 11.245853424072266, + "learning_rate": 4.481806164021693e-06, + "loss": 0.2608, + "num_input_tokens_seen": 9258592, + "step": 4731 + }, + { + "epoch": 0.627170311464546, + "grad_norm": 14.31205940246582, + "learning_rate": 4.4815945360115655e-06, + "loss": 0.4802, + "num_input_tokens_seen": 9260616, + "step": 4732 + }, + { + "epoch": 0.6273028495692512, + "grad_norm": 3.7629506587982178, + "learning_rate": 4.481382869794927e-06, + "loss": 0.1381, + "num_input_tokens_seen": 9262520, + "step": 4733 + }, + { + "epoch": 0.6274353876739562, + "grad_norm": 0.11953546106815338, + "learning_rate": 4.48117116537586e-06, + "loss": 0.0008, + "num_input_tokens_seen": 9263896, + "step": 4734 + }, + { + "epoch": 0.6275679257786614, + "grad_norm": 6.599833011627197, + "learning_rate": 4.480959422758446e-06, + "loss": 0.1279, + "num_input_tokens_seen": 9266504, + "step": 4735 + }, + { + "epoch": 0.6277004638833664, + "grad_norm": 0.339495986700058, + "learning_rate": 4.480747641946767e-06, + "loss": 0.0013, + "num_input_tokens_seen": 9268152, + "step": 4736 + }, + { + "epoch": 0.6278330019880716, + "grad_norm": 4.102339744567871, + "learning_rate": 4.4805358229449096e-06, + "loss": 0.0515, + "num_input_tokens_seen": 9270400, + "step": 4737 + }, + { + "epoch": 0.6279655400927767, + "grad_norm": 0.10254357755184174, + "learning_rate": 4.480323965756953e-06, + "loss": 0.0007, + "num_input_tokens_seen": 9272128, + "step": 4738 + }, + { + "epoch": 0.6280980781974818, + "grad_norm": 11.284097671508789, + "learning_rate": 4.480112070386985e-06, + "loss": 0.2533, + "num_input_tokens_seen": 9273944, + "step": 4739 + }, + { + "epoch": 0.6282306163021869, + "grad_norm": 9.663166046142578, + "learning_rate": 4.479900136839089e-06, + "loss": 0.2506, + "num_input_tokens_seen": 9275792, + "step": 4740 + }, + { + "epoch": 0.6283631544068919, + "grad_norm": 15.408649444580078, + "learning_rate": 4.479688165117354e-06, + "loss": 0.3845, + "num_input_tokens_seen": 9278304, + "step": 4741 + }, + { + "epoch": 0.6284956925115971, + "grad_norm": 4.305943965911865, + "learning_rate": 4.479476155225866e-06, + "loss": 0.0798, + "num_input_tokens_seen": 9279472, + "step": 4742 + }, + { + "epoch": 0.6286282306163022, + "grad_norm": 0.09096726775169373, + "learning_rate": 4.47926410716871e-06, + "loss": 0.0006, + "num_input_tokens_seen": 9281720, + "step": 4743 + }, + { + "epoch": 0.6287607687210073, + "grad_norm": 4.696253299713135, + "learning_rate": 4.479052020949978e-06, + "loss": 0.0515, + "num_input_tokens_seen": 9283864, + "step": 4744 + }, + { + "epoch": 0.6288933068257124, + "grad_norm": 9.44750690460205, + "learning_rate": 4.478839896573758e-06, + "loss": 0.0617, + "num_input_tokens_seen": 9285592, + "step": 4745 + }, + { + "epoch": 0.6290258449304175, + "grad_norm": 7.358789920806885, + "learning_rate": 4.478627734044139e-06, + "loss": 0.1826, + "num_input_tokens_seen": 9287640, + "step": 4746 + }, + { + "epoch": 0.6291583830351226, + "grad_norm": 12.37944221496582, + "learning_rate": 4.478415533365212e-06, + "loss": 0.2451, + "num_input_tokens_seen": 9289976, + "step": 4747 + }, + { + "epoch": 0.6292909211398277, + "grad_norm": 4.406274795532227, + "learning_rate": 4.4782032945410694e-06, + "loss": 0.1359, + "num_input_tokens_seen": 9291624, + "step": 4748 + }, + { + "epoch": 0.6294234592445328, + "grad_norm": 5.894015789031982, + "learning_rate": 4.477991017575802e-06, + "loss": 0.0871, + "num_input_tokens_seen": 9293136, + "step": 4749 + }, + { + "epoch": 0.629555997349238, + "grad_norm": 13.213053703308105, + "learning_rate": 4.477778702473504e-06, + "loss": 0.283, + "num_input_tokens_seen": 9295712, + "step": 4750 + }, + { + "epoch": 0.629688535453943, + "grad_norm": 14.832886695861816, + "learning_rate": 4.477566349238268e-06, + "loss": 0.8326, + "num_input_tokens_seen": 9298696, + "step": 4751 + }, + { + "epoch": 0.6298210735586481, + "grad_norm": 9.77347469329834, + "learning_rate": 4.477353957874188e-06, + "loss": 0.1554, + "num_input_tokens_seen": 9300616, + "step": 4752 + }, + { + "epoch": 0.6299536116633532, + "grad_norm": 11.360574722290039, + "learning_rate": 4.47714152838536e-06, + "loss": 0.2502, + "num_input_tokens_seen": 9302496, + "step": 4753 + }, + { + "epoch": 0.6300861497680583, + "grad_norm": 4.4659647941589355, + "learning_rate": 4.476929060775879e-06, + "loss": 0.1487, + "num_input_tokens_seen": 9305104, + "step": 4754 + }, + { + "epoch": 0.6302186878727635, + "grad_norm": 40.28339385986328, + "learning_rate": 4.476716555049843e-06, + "loss": 0.2617, + "num_input_tokens_seen": 9307184, + "step": 4755 + }, + { + "epoch": 0.6303512259774685, + "grad_norm": 8.567086219787598, + "learning_rate": 4.476504011211347e-06, + "loss": 0.1876, + "num_input_tokens_seen": 9309072, + "step": 4756 + }, + { + "epoch": 0.6304837640821737, + "grad_norm": 5.161421775817871, + "learning_rate": 4.47629142926449e-06, + "loss": 0.0208, + "num_input_tokens_seen": 9310864, + "step": 4757 + }, + { + "epoch": 0.6306163021868787, + "grad_norm": 5.451211929321289, + "learning_rate": 4.476078809213371e-06, + "loss": 0.0766, + "num_input_tokens_seen": 9313392, + "step": 4758 + }, + { + "epoch": 0.6307488402915838, + "grad_norm": 28.7636775970459, + "learning_rate": 4.47586615106209e-06, + "loss": 0.8131, + "num_input_tokens_seen": 9315280, + "step": 4759 + }, + { + "epoch": 0.6308813783962889, + "grad_norm": 8.143635749816895, + "learning_rate": 4.475653454814746e-06, + "loss": 0.1577, + "num_input_tokens_seen": 9317800, + "step": 4760 + }, + { + "epoch": 0.631013916500994, + "grad_norm": 0.39525511860847473, + "learning_rate": 4.475440720475441e-06, + "loss": 0.0028, + "num_input_tokens_seen": 9319472, + "step": 4761 + }, + { + "epoch": 0.6311464546056992, + "grad_norm": 4.914936542510986, + "learning_rate": 4.475227948048276e-06, + "loss": 0.1786, + "num_input_tokens_seen": 9321520, + "step": 4762 + }, + { + "epoch": 0.6312789927104042, + "grad_norm": 0.8538804054260254, + "learning_rate": 4.475015137537353e-06, + "loss": 0.0062, + "num_input_tokens_seen": 9323792, + "step": 4763 + }, + { + "epoch": 0.6314115308151094, + "grad_norm": 7.3070969581604, + "learning_rate": 4.474802288946776e-06, + "loss": 0.1408, + "num_input_tokens_seen": 9325552, + "step": 4764 + }, + { + "epoch": 0.6315440689198144, + "grad_norm": 0.39548006653785706, + "learning_rate": 4.474589402280649e-06, + "loss": 0.0026, + "num_input_tokens_seen": 9327208, + "step": 4765 + }, + { + "epoch": 0.6316766070245196, + "grad_norm": 19.981210708618164, + "learning_rate": 4.474376477543075e-06, + "loss": 0.5314, + "num_input_tokens_seen": 9329072, + "step": 4766 + }, + { + "epoch": 0.6318091451292247, + "grad_norm": 10.657853126525879, + "learning_rate": 4.474163514738161e-06, + "loss": 0.5323, + "num_input_tokens_seen": 9331056, + "step": 4767 + }, + { + "epoch": 0.6319416832339297, + "grad_norm": 7.317267417907715, + "learning_rate": 4.473950513870013e-06, + "loss": 0.1049, + "num_input_tokens_seen": 9332672, + "step": 4768 + }, + { + "epoch": 0.6320742213386349, + "grad_norm": 6.87615966796875, + "learning_rate": 4.473737474942736e-06, + "loss": 0.0881, + "num_input_tokens_seen": 9334776, + "step": 4769 + }, + { + "epoch": 0.6322067594433399, + "grad_norm": 6.279460430145264, + "learning_rate": 4.47352439796044e-06, + "loss": 0.2506, + "num_input_tokens_seen": 9337032, + "step": 4770 + }, + { + "epoch": 0.6323392975480451, + "grad_norm": 3.7162981033325195, + "learning_rate": 4.473311282927233e-06, + "loss": 0.0446, + "num_input_tokens_seen": 9338288, + "step": 4771 + }, + { + "epoch": 0.6324718356527501, + "grad_norm": 12.533162117004395, + "learning_rate": 4.473098129847222e-06, + "loss": 0.4662, + "num_input_tokens_seen": 9340944, + "step": 4772 + }, + { + "epoch": 0.6326043737574553, + "grad_norm": 3.261996269226074, + "learning_rate": 4.4728849387245174e-06, + "loss": 0.0265, + "num_input_tokens_seen": 9343072, + "step": 4773 + }, + { + "epoch": 0.6327369118621604, + "grad_norm": 15.451162338256836, + "learning_rate": 4.472671709563231e-06, + "loss": 0.5624, + "num_input_tokens_seen": 9345392, + "step": 4774 + }, + { + "epoch": 0.6328694499668654, + "grad_norm": 0.059484872967004776, + "learning_rate": 4.472458442367473e-06, + "loss": 0.0004, + "num_input_tokens_seen": 9347688, + "step": 4775 + }, + { + "epoch": 0.6330019880715706, + "grad_norm": 5.385174751281738, + "learning_rate": 4.472245137141355e-06, + "loss": 0.1769, + "num_input_tokens_seen": 9350256, + "step": 4776 + }, + { + "epoch": 0.6331345261762756, + "grad_norm": 12.318608283996582, + "learning_rate": 4.472031793888991e-06, + "loss": 0.3522, + "num_input_tokens_seen": 9352808, + "step": 4777 + }, + { + "epoch": 0.6332670642809808, + "grad_norm": 0.06902060657739639, + "learning_rate": 4.471818412614492e-06, + "loss": 0.0005, + "num_input_tokens_seen": 9354296, + "step": 4778 + }, + { + "epoch": 0.6333996023856859, + "grad_norm": 3.1061789989471436, + "learning_rate": 4.471604993321975e-06, + "loss": 0.0461, + "num_input_tokens_seen": 9355944, + "step": 4779 + }, + { + "epoch": 0.633532140490391, + "grad_norm": 0.0948604866862297, + "learning_rate": 4.471391536015554e-06, + "loss": 0.0006, + "num_input_tokens_seen": 9357336, + "step": 4780 + }, + { + "epoch": 0.6336646785950961, + "grad_norm": 0.05408541485667229, + "learning_rate": 4.471178040699343e-06, + "loss": 0.0004, + "num_input_tokens_seen": 9358976, + "step": 4781 + }, + { + "epoch": 0.6337972166998012, + "grad_norm": 0.09396762400865555, + "learning_rate": 4.470964507377461e-06, + "loss": 0.0006, + "num_input_tokens_seen": 9361392, + "step": 4782 + }, + { + "epoch": 0.6339297548045063, + "grad_norm": 0.05462309718132019, + "learning_rate": 4.4707509360540224e-06, + "loss": 0.0004, + "num_input_tokens_seen": 9362632, + "step": 4783 + }, + { + "epoch": 0.6340622929092113, + "grad_norm": 4.9388017654418945, + "learning_rate": 4.470537326733147e-06, + "loss": 0.0931, + "num_input_tokens_seen": 9365192, + "step": 4784 + }, + { + "epoch": 0.6341948310139165, + "grad_norm": 6.067946434020996, + "learning_rate": 4.470323679418952e-06, + "loss": 0.1887, + "num_input_tokens_seen": 9366632, + "step": 4785 + }, + { + "epoch": 0.6343273691186216, + "grad_norm": 0.05308328568935394, + "learning_rate": 4.4701099941155576e-06, + "loss": 0.0004, + "num_input_tokens_seen": 9368240, + "step": 4786 + }, + { + "epoch": 0.6344599072233267, + "grad_norm": 7.645464897155762, + "learning_rate": 4.469896270827083e-06, + "loss": 0.2742, + "num_input_tokens_seen": 9370840, + "step": 4787 + }, + { + "epoch": 0.6345924453280318, + "grad_norm": 6.440088748931885, + "learning_rate": 4.469682509557649e-06, + "loss": 0.1394, + "num_input_tokens_seen": 9373144, + "step": 4788 + }, + { + "epoch": 0.6347249834327369, + "grad_norm": 11.439305305480957, + "learning_rate": 4.469468710311378e-06, + "loss": 0.3313, + "num_input_tokens_seen": 9375048, + "step": 4789 + }, + { + "epoch": 0.634857521537442, + "grad_norm": 0.0310747642070055, + "learning_rate": 4.469254873092392e-06, + "loss": 0.0002, + "num_input_tokens_seen": 9377648, + "step": 4790 + }, + { + "epoch": 0.6349900596421472, + "grad_norm": 5.411471843719482, + "learning_rate": 4.469040997904813e-06, + "loss": 0.1519, + "num_input_tokens_seen": 9379352, + "step": 4791 + }, + { + "epoch": 0.6351225977468522, + "grad_norm": 0.04134681448340416, + "learning_rate": 4.468827084752765e-06, + "loss": 0.0003, + "num_input_tokens_seen": 9380552, + "step": 4792 + }, + { + "epoch": 0.6352551358515574, + "grad_norm": 15.2728271484375, + "learning_rate": 4.468613133640373e-06, + "loss": 0.4077, + "num_input_tokens_seen": 9382176, + "step": 4793 + }, + { + "epoch": 0.6353876739562624, + "grad_norm": 23.983142852783203, + "learning_rate": 4.468399144571761e-06, + "loss": 0.4982, + "num_input_tokens_seen": 9384000, + "step": 4794 + }, + { + "epoch": 0.6355202120609675, + "grad_norm": 13.765985488891602, + "learning_rate": 4.468185117551056e-06, + "loss": 0.5123, + "num_input_tokens_seen": 9386832, + "step": 4795 + }, + { + "epoch": 0.6356527501656726, + "grad_norm": 5.54903507232666, + "learning_rate": 4.467971052582384e-06, + "loss": 0.1848, + "num_input_tokens_seen": 9389208, + "step": 4796 + }, + { + "epoch": 0.6357852882703777, + "grad_norm": 5.594756603240967, + "learning_rate": 4.467756949669873e-06, + "loss": 0.2428, + "num_input_tokens_seen": 9391456, + "step": 4797 + }, + { + "epoch": 0.6359178263750829, + "grad_norm": 10.289932250976562, + "learning_rate": 4.46754280881765e-06, + "loss": 0.3626, + "num_input_tokens_seen": 9393488, + "step": 4798 + }, + { + "epoch": 0.6360503644797879, + "grad_norm": 8.716151237487793, + "learning_rate": 4.467328630029845e-06, + "loss": 0.2365, + "num_input_tokens_seen": 9395216, + "step": 4799 + }, + { + "epoch": 0.6361829025844931, + "grad_norm": 9.418380737304688, + "learning_rate": 4.467114413310586e-06, + "loss": 0.1986, + "num_input_tokens_seen": 9397504, + "step": 4800 + }, + { + "epoch": 0.6363154406891981, + "grad_norm": 10.990334510803223, + "learning_rate": 4.466900158664005e-06, + "loss": 0.3999, + "num_input_tokens_seen": 9400944, + "step": 4801 + }, + { + "epoch": 0.6364479787939032, + "grad_norm": 0.5313312411308289, + "learning_rate": 4.46668586609423e-06, + "loss": 0.0089, + "num_input_tokens_seen": 9402968, + "step": 4802 + }, + { + "epoch": 0.6365805168986084, + "grad_norm": 29.295711517333984, + "learning_rate": 4.466471535605397e-06, + "loss": 0.5898, + "num_input_tokens_seen": 9404832, + "step": 4803 + }, + { + "epoch": 0.6367130550033134, + "grad_norm": 8.569259643554688, + "learning_rate": 4.466257167201635e-06, + "loss": 0.2356, + "num_input_tokens_seen": 9407184, + "step": 4804 + }, + { + "epoch": 0.6368455931080186, + "grad_norm": 0.7735918164253235, + "learning_rate": 4.466042760887079e-06, + "loss": 0.0049, + "num_input_tokens_seen": 9408672, + "step": 4805 + }, + { + "epoch": 0.6369781312127236, + "grad_norm": 11.682205200195312, + "learning_rate": 4.465828316665861e-06, + "loss": 0.2697, + "num_input_tokens_seen": 9410400, + "step": 4806 + }, + { + "epoch": 0.6371106693174288, + "grad_norm": 6.96370792388916, + "learning_rate": 4.465613834542118e-06, + "loss": 0.1502, + "num_input_tokens_seen": 9411992, + "step": 4807 + }, + { + "epoch": 0.6372432074221339, + "grad_norm": 8.249862670898438, + "learning_rate": 4.4653993145199845e-06, + "loss": 0.2517, + "num_input_tokens_seen": 9414024, + "step": 4808 + }, + { + "epoch": 0.637375745526839, + "grad_norm": 0.2477010190486908, + "learning_rate": 4.465184756603597e-06, + "loss": 0.0018, + "num_input_tokens_seen": 9415672, + "step": 4809 + }, + { + "epoch": 0.6375082836315441, + "grad_norm": 0.4310399293899536, + "learning_rate": 4.46497016079709e-06, + "loss": 0.0031, + "num_input_tokens_seen": 9417808, + "step": 4810 + }, + { + "epoch": 0.6376408217362491, + "grad_norm": 24.789512634277344, + "learning_rate": 4.464755527104604e-06, + "loss": 1.0219, + "num_input_tokens_seen": 9420472, + "step": 4811 + }, + { + "epoch": 0.6377733598409543, + "grad_norm": 10.651152610778809, + "learning_rate": 4.464540855530275e-06, + "loss": 0.2242, + "num_input_tokens_seen": 9422264, + "step": 4812 + }, + { + "epoch": 0.6379058979456593, + "grad_norm": 3.9325499534606934, + "learning_rate": 4.464326146078244e-06, + "loss": 0.0673, + "num_input_tokens_seen": 9423856, + "step": 4813 + }, + { + "epoch": 0.6380384360503645, + "grad_norm": 7.080622673034668, + "learning_rate": 4.464111398752648e-06, + "loss": 0.1986, + "num_input_tokens_seen": 9426984, + "step": 4814 + }, + { + "epoch": 0.6381709741550696, + "grad_norm": 11.83078670501709, + "learning_rate": 4.4638966135576315e-06, + "loss": 0.3659, + "num_input_tokens_seen": 9428976, + "step": 4815 + }, + { + "epoch": 0.6383035122597747, + "grad_norm": 0.4345207214355469, + "learning_rate": 4.463681790497332e-06, + "loss": 0.0032, + "num_input_tokens_seen": 9430392, + "step": 4816 + }, + { + "epoch": 0.6384360503644798, + "grad_norm": 16.421342849731445, + "learning_rate": 4.463466929575894e-06, + "loss": 0.7735, + "num_input_tokens_seen": 9432120, + "step": 4817 + }, + { + "epoch": 0.6385685884691849, + "grad_norm": 8.015900611877441, + "learning_rate": 4.463252030797458e-06, + "loss": 0.2068, + "num_input_tokens_seen": 9435288, + "step": 4818 + }, + { + "epoch": 0.63870112657389, + "grad_norm": 6.320451736450195, + "learning_rate": 4.46303709416617e-06, + "loss": 0.1152, + "num_input_tokens_seen": 9436960, + "step": 4819 + }, + { + "epoch": 0.6388336646785951, + "grad_norm": 0.39286741614341736, + "learning_rate": 4.462822119686171e-06, + "loss": 0.0029, + "num_input_tokens_seen": 9441032, + "step": 4820 + }, + { + "epoch": 0.6389662027833002, + "grad_norm": 16.194061279296875, + "learning_rate": 4.462607107361608e-06, + "loss": 0.4237, + "num_input_tokens_seen": 9442896, + "step": 4821 + }, + { + "epoch": 0.6390987408880053, + "grad_norm": 9.960128784179688, + "learning_rate": 4.462392057196626e-06, + "loss": 0.1203, + "num_input_tokens_seen": 9444432, + "step": 4822 + }, + { + "epoch": 0.6392312789927104, + "grad_norm": 1.8447933197021484, + "learning_rate": 4.462176969195371e-06, + "loss": 0.013, + "num_input_tokens_seen": 9446992, + "step": 4823 + }, + { + "epoch": 0.6393638170974155, + "grad_norm": 5.806275844573975, + "learning_rate": 4.461961843361991e-06, + "loss": 0.1424, + "num_input_tokens_seen": 9448800, + "step": 4824 + }, + { + "epoch": 0.6394963552021206, + "grad_norm": 10.891443252563477, + "learning_rate": 4.461746679700632e-06, + "loss": 0.2417, + "num_input_tokens_seen": 9450432, + "step": 4825 + }, + { + "epoch": 0.6396288933068257, + "grad_norm": 0.7464866042137146, + "learning_rate": 4.461531478215446e-06, + "loss": 0.0053, + "num_input_tokens_seen": 9453024, + "step": 4826 + }, + { + "epoch": 0.6397614314115309, + "grad_norm": 10.647525787353516, + "learning_rate": 4.461316238910578e-06, + "loss": 0.2265, + "num_input_tokens_seen": 9455088, + "step": 4827 + }, + { + "epoch": 0.6398939695162359, + "grad_norm": 6.746596336364746, + "learning_rate": 4.461100961790181e-06, + "loss": 0.1839, + "num_input_tokens_seen": 9457248, + "step": 4828 + }, + { + "epoch": 0.640026507620941, + "grad_norm": 2.391592264175415, + "learning_rate": 4.460885646858404e-06, + "loss": 0.0584, + "num_input_tokens_seen": 9460176, + "step": 4829 + }, + { + "epoch": 0.6401590457256461, + "grad_norm": 8.733220100402832, + "learning_rate": 4.460670294119399e-06, + "loss": 0.3492, + "num_input_tokens_seen": 9462000, + "step": 4830 + }, + { + "epoch": 0.6402915838303512, + "grad_norm": 0.27454718947410583, + "learning_rate": 4.460454903577318e-06, + "loss": 0.002, + "num_input_tokens_seen": 9463200, + "step": 4831 + }, + { + "epoch": 0.6404241219350564, + "grad_norm": 7.840694427490234, + "learning_rate": 4.460239475236314e-06, + "loss": 0.2701, + "num_input_tokens_seen": 9465264, + "step": 4832 + }, + { + "epoch": 0.6405566600397614, + "grad_norm": 0.2896285355091095, + "learning_rate": 4.46002400910054e-06, + "loss": 0.0021, + "num_input_tokens_seen": 9467928, + "step": 4833 + }, + { + "epoch": 0.6406891981444666, + "grad_norm": 1.2033575773239136, + "learning_rate": 4.459808505174152e-06, + "loss": 0.0067, + "num_input_tokens_seen": 9470944, + "step": 4834 + }, + { + "epoch": 0.6408217362491716, + "grad_norm": 4.316341876983643, + "learning_rate": 4.459592963461304e-06, + "loss": 0.0564, + "num_input_tokens_seen": 9472536, + "step": 4835 + }, + { + "epoch": 0.6409542743538768, + "grad_norm": 1.8086624145507812, + "learning_rate": 4.459377383966151e-06, + "loss": 0.0118, + "num_input_tokens_seen": 9474176, + "step": 4836 + }, + { + "epoch": 0.6410868124585818, + "grad_norm": 6.824998378753662, + "learning_rate": 4.459161766692851e-06, + "loss": 0.2292, + "num_input_tokens_seen": 9476416, + "step": 4837 + }, + { + "epoch": 0.6412193505632869, + "grad_norm": 0.17999917268753052, + "learning_rate": 4.45894611164556e-06, + "loss": 0.0013, + "num_input_tokens_seen": 9478120, + "step": 4838 + }, + { + "epoch": 0.6413518886679921, + "grad_norm": 8.806440353393555, + "learning_rate": 4.458730418828437e-06, + "loss": 0.1975, + "num_input_tokens_seen": 9480152, + "step": 4839 + }, + { + "epoch": 0.6414844267726971, + "grad_norm": 3.3774189949035645, + "learning_rate": 4.458514688245641e-06, + "loss": 0.0375, + "num_input_tokens_seen": 9481616, + "step": 4840 + }, + { + "epoch": 0.6416169648774023, + "grad_norm": 0.07355403155088425, + "learning_rate": 4.45829891990133e-06, + "loss": 0.0005, + "num_input_tokens_seen": 9483616, + "step": 4841 + }, + { + "epoch": 0.6417495029821073, + "grad_norm": 10.695578575134277, + "learning_rate": 4.458083113799664e-06, + "loss": 0.1648, + "num_input_tokens_seen": 9485304, + "step": 4842 + }, + { + "epoch": 0.6418820410868125, + "grad_norm": 5.048794269561768, + "learning_rate": 4.457867269944806e-06, + "loss": 0.0775, + "num_input_tokens_seen": 9486936, + "step": 4843 + }, + { + "epoch": 0.6420145791915176, + "grad_norm": 4.7919440269470215, + "learning_rate": 4.457651388340916e-06, + "loss": 0.1346, + "num_input_tokens_seen": 9489424, + "step": 4844 + }, + { + "epoch": 0.6421471172962226, + "grad_norm": 6.147254943847656, + "learning_rate": 4.457435468992157e-06, + "loss": 0.1702, + "num_input_tokens_seen": 9490760, + "step": 4845 + }, + { + "epoch": 0.6422796554009278, + "grad_norm": 11.189906120300293, + "learning_rate": 4.45721951190269e-06, + "loss": 0.2798, + "num_input_tokens_seen": 9492672, + "step": 4846 + }, + { + "epoch": 0.6424121935056328, + "grad_norm": 7.041614055633545, + "learning_rate": 4.457003517076683e-06, + "loss": 0.2798, + "num_input_tokens_seen": 9494712, + "step": 4847 + }, + { + "epoch": 0.642544731610338, + "grad_norm": 0.045423202216625214, + "learning_rate": 4.456787484518297e-06, + "loss": 0.0003, + "num_input_tokens_seen": 9496208, + "step": 4848 + }, + { + "epoch": 0.642677269715043, + "grad_norm": 7.87891960144043, + "learning_rate": 4.456571414231699e-06, + "loss": 0.2755, + "num_input_tokens_seen": 9498424, + "step": 4849 + }, + { + "epoch": 0.6428098078197482, + "grad_norm": 1.3120954036712646, + "learning_rate": 4.456355306221054e-06, + "loss": 0.0176, + "num_input_tokens_seen": 9499936, + "step": 4850 + }, + { + "epoch": 0.6429423459244533, + "grad_norm": 0.09056670218706131, + "learning_rate": 4.456139160490529e-06, + "loss": 0.0006, + "num_input_tokens_seen": 9502672, + "step": 4851 + }, + { + "epoch": 0.6430748840291584, + "grad_norm": 0.05561875179409981, + "learning_rate": 4.455922977044291e-06, + "loss": 0.0004, + "num_input_tokens_seen": 9504008, + "step": 4852 + }, + { + "epoch": 0.6432074221338635, + "grad_norm": 11.432732582092285, + "learning_rate": 4.455706755886509e-06, + "loss": 0.263, + "num_input_tokens_seen": 9505792, + "step": 4853 + }, + { + "epoch": 0.6433399602385685, + "grad_norm": 6.313557147979736, + "learning_rate": 4.455490497021352e-06, + "loss": 0.1424, + "num_input_tokens_seen": 9508416, + "step": 4854 + }, + { + "epoch": 0.6434724983432737, + "grad_norm": 5.000476360321045, + "learning_rate": 4.455274200452989e-06, + "loss": 0.15, + "num_input_tokens_seen": 9510824, + "step": 4855 + }, + { + "epoch": 0.6436050364479788, + "grad_norm": 1.254848599433899, + "learning_rate": 4.45505786618559e-06, + "loss": 0.0067, + "num_input_tokens_seen": 9512056, + "step": 4856 + }, + { + "epoch": 0.6437375745526839, + "grad_norm": 7.127485275268555, + "learning_rate": 4.454841494223328e-06, + "loss": 0.2149, + "num_input_tokens_seen": 9513656, + "step": 4857 + }, + { + "epoch": 0.643870112657389, + "grad_norm": 0.7913540005683899, + "learning_rate": 4.454625084570372e-06, + "loss": 0.0051, + "num_input_tokens_seen": 9515448, + "step": 4858 + }, + { + "epoch": 0.6440026507620941, + "grad_norm": 0.26762810349464417, + "learning_rate": 4.454408637230897e-06, + "loss": 0.0019, + "num_input_tokens_seen": 9516952, + "step": 4859 + }, + { + "epoch": 0.6441351888667992, + "grad_norm": 8.086902618408203, + "learning_rate": 4.454192152209074e-06, + "loss": 0.2745, + "num_input_tokens_seen": 9518736, + "step": 4860 + }, + { + "epoch": 0.6442677269715044, + "grad_norm": 12.396095275878906, + "learning_rate": 4.453975629509079e-06, + "loss": 0.6375, + "num_input_tokens_seen": 9521848, + "step": 4861 + }, + { + "epoch": 0.6444002650762094, + "grad_norm": 15.481982231140137, + "learning_rate": 4.453759069135087e-06, + "loss": 0.578, + "num_input_tokens_seen": 9523696, + "step": 4862 + }, + { + "epoch": 0.6445328031809145, + "grad_norm": 5.175758361816406, + "learning_rate": 4.45354247109127e-06, + "loss": 0.0961, + "num_input_tokens_seen": 9526368, + "step": 4863 + }, + { + "epoch": 0.6446653412856196, + "grad_norm": 0.32571929693222046, + "learning_rate": 4.453325835381808e-06, + "loss": 0.0022, + "num_input_tokens_seen": 9528416, + "step": 4864 + }, + { + "epoch": 0.6447978793903247, + "grad_norm": 1.2239230871200562, + "learning_rate": 4.453109162010875e-06, + "loss": 0.0182, + "num_input_tokens_seen": 9529920, + "step": 4865 + }, + { + "epoch": 0.6449304174950298, + "grad_norm": 8.420411109924316, + "learning_rate": 4.452892450982651e-06, + "loss": 0.2873, + "num_input_tokens_seen": 9531680, + "step": 4866 + }, + { + "epoch": 0.6450629555997349, + "grad_norm": 0.10115247964859009, + "learning_rate": 4.452675702301313e-06, + "loss": 0.0007, + "num_input_tokens_seen": 9533184, + "step": 4867 + }, + { + "epoch": 0.6451954937044401, + "grad_norm": 5.8544487953186035, + "learning_rate": 4.4524589159710406e-06, + "loss": 0.1923, + "num_input_tokens_seen": 9534304, + "step": 4868 + }, + { + "epoch": 0.6453280318091451, + "grad_norm": 22.86847496032715, + "learning_rate": 4.452242091996013e-06, + "loss": 1.0328, + "num_input_tokens_seen": 9535920, + "step": 4869 + }, + { + "epoch": 0.6454605699138503, + "grad_norm": 8.39797306060791, + "learning_rate": 4.452025230380412e-06, + "loss": 0.1684, + "num_input_tokens_seen": 9537992, + "step": 4870 + }, + { + "epoch": 0.6455931080185553, + "grad_norm": 18.60875129699707, + "learning_rate": 4.451808331128417e-06, + "loss": 0.2908, + "num_input_tokens_seen": 9540536, + "step": 4871 + }, + { + "epoch": 0.6457256461232604, + "grad_norm": 0.30296850204467773, + "learning_rate": 4.451591394244211e-06, + "loss": 0.002, + "num_input_tokens_seen": 9543192, + "step": 4872 + }, + { + "epoch": 0.6458581842279656, + "grad_norm": 22.829776763916016, + "learning_rate": 4.451374419731978e-06, + "loss": 0.4296, + "num_input_tokens_seen": 9544896, + "step": 4873 + }, + { + "epoch": 0.6459907223326706, + "grad_norm": 8.951194763183594, + "learning_rate": 4.451157407595898e-06, + "loss": 0.2081, + "num_input_tokens_seen": 9546752, + "step": 4874 + }, + { + "epoch": 0.6461232604373758, + "grad_norm": 15.652210235595703, + "learning_rate": 4.450940357840157e-06, + "loss": 0.4454, + "num_input_tokens_seen": 9548808, + "step": 4875 + }, + { + "epoch": 0.6462557985420808, + "grad_norm": 0.3247293531894684, + "learning_rate": 4.4507232704689415e-06, + "loss": 0.0022, + "num_input_tokens_seen": 9550304, + "step": 4876 + }, + { + "epoch": 0.646388336646786, + "grad_norm": 6.515808582305908, + "learning_rate": 4.4505061454864355e-06, + "loss": 0.178, + "num_input_tokens_seen": 9553040, + "step": 4877 + }, + { + "epoch": 0.646520874751491, + "grad_norm": 6.426331043243408, + "learning_rate": 4.450288982896825e-06, + "loss": 0.19, + "num_input_tokens_seen": 9554720, + "step": 4878 + }, + { + "epoch": 0.6466534128561962, + "grad_norm": 0.37664979696273804, + "learning_rate": 4.450071782704297e-06, + "loss": 0.0026, + "num_input_tokens_seen": 9556088, + "step": 4879 + }, + { + "epoch": 0.6467859509609013, + "grad_norm": 12.391663551330566, + "learning_rate": 4.44985454491304e-06, + "loss": 0.5698, + "num_input_tokens_seen": 9558832, + "step": 4880 + }, + { + "epoch": 0.6469184890656063, + "grad_norm": 5.82336950302124, + "learning_rate": 4.4496372695272426e-06, + "loss": 0.1832, + "num_input_tokens_seen": 9560488, + "step": 4881 + }, + { + "epoch": 0.6470510271703115, + "grad_norm": 4.2001566886901855, + "learning_rate": 4.449419956551094e-06, + "loss": 0.0641, + "num_input_tokens_seen": 9562728, + "step": 4882 + }, + { + "epoch": 0.6471835652750165, + "grad_norm": 14.888204574584961, + "learning_rate": 4.449202605988784e-06, + "loss": 0.3792, + "num_input_tokens_seen": 9564760, + "step": 4883 + }, + { + "epoch": 0.6473161033797217, + "grad_norm": 10.145330429077148, + "learning_rate": 4.448985217844501e-06, + "loss": 0.2194, + "num_input_tokens_seen": 9567184, + "step": 4884 + }, + { + "epoch": 0.6474486414844268, + "grad_norm": 6.315364837646484, + "learning_rate": 4.44876779212244e-06, + "loss": 0.1209, + "num_input_tokens_seen": 9569832, + "step": 4885 + }, + { + "epoch": 0.6475811795891319, + "grad_norm": 6.940689563751221, + "learning_rate": 4.448550328826792e-06, + "loss": 0.0681, + "num_input_tokens_seen": 9571768, + "step": 4886 + }, + { + "epoch": 0.647713717693837, + "grad_norm": 7.291001319885254, + "learning_rate": 4.448332827961749e-06, + "loss": 0.0757, + "num_input_tokens_seen": 9574040, + "step": 4887 + }, + { + "epoch": 0.647846255798542, + "grad_norm": 5.533946514129639, + "learning_rate": 4.448115289531505e-06, + "loss": 0.1314, + "num_input_tokens_seen": 9576032, + "step": 4888 + }, + { + "epoch": 0.6479787939032472, + "grad_norm": 9.036831855773926, + "learning_rate": 4.4478977135402536e-06, + "loss": 0.2193, + "num_input_tokens_seen": 9578144, + "step": 4889 + }, + { + "epoch": 0.6481113320079522, + "grad_norm": 3.3029744625091553, + "learning_rate": 4.447680099992191e-06, + "loss": 0.0693, + "num_input_tokens_seen": 9580632, + "step": 4890 + }, + { + "epoch": 0.6482438701126574, + "grad_norm": 10.926652908325195, + "learning_rate": 4.447462448891513e-06, + "loss": 0.2621, + "num_input_tokens_seen": 9582544, + "step": 4891 + }, + { + "epoch": 0.6483764082173625, + "grad_norm": 11.979860305786133, + "learning_rate": 4.447244760242415e-06, + "loss": 0.3476, + "num_input_tokens_seen": 9585016, + "step": 4892 + }, + { + "epoch": 0.6485089463220676, + "grad_norm": 2.7239322662353516, + "learning_rate": 4.447027034049094e-06, + "loss": 0.0259, + "num_input_tokens_seen": 9587384, + "step": 4893 + }, + { + "epoch": 0.6486414844267727, + "grad_norm": 9.86445140838623, + "learning_rate": 4.44680927031575e-06, + "loss": 0.1054, + "num_input_tokens_seen": 9589312, + "step": 4894 + }, + { + "epoch": 0.6487740225314778, + "grad_norm": 0.4499588906764984, + "learning_rate": 4.4465914690465785e-06, + "loss": 0.0031, + "num_input_tokens_seen": 9590760, + "step": 4895 + }, + { + "epoch": 0.6489065606361829, + "grad_norm": 4.442226409912109, + "learning_rate": 4.4463736302457824e-06, + "loss": 0.0682, + "num_input_tokens_seen": 9592944, + "step": 4896 + }, + { + "epoch": 0.649039098740888, + "grad_norm": 9.19690990447998, + "learning_rate": 4.446155753917559e-06, + "loss": 0.3678, + "num_input_tokens_seen": 9595008, + "step": 4897 + }, + { + "epoch": 0.6491716368455931, + "grad_norm": 16.92936134338379, + "learning_rate": 4.44593784006611e-06, + "loss": 0.2864, + "num_input_tokens_seen": 9597088, + "step": 4898 + }, + { + "epoch": 0.6493041749502982, + "grad_norm": 3.8098597526550293, + "learning_rate": 4.4457198886956374e-06, + "loss": 0.1194, + "num_input_tokens_seen": 9599176, + "step": 4899 + }, + { + "epoch": 0.6494367130550033, + "grad_norm": 1.7245649099349976, + "learning_rate": 4.445501899810343e-06, + "loss": 0.0138, + "num_input_tokens_seen": 9600632, + "step": 4900 + }, + { + "epoch": 0.6495692511597084, + "grad_norm": 11.463774681091309, + "learning_rate": 4.44528387341443e-06, + "loss": 0.4453, + "num_input_tokens_seen": 9603160, + "step": 4901 + }, + { + "epoch": 0.6497017892644135, + "grad_norm": 12.178915977478027, + "learning_rate": 4.445065809512102e-06, + "loss": 0.4404, + "num_input_tokens_seen": 9605192, + "step": 4902 + }, + { + "epoch": 0.6498343273691186, + "grad_norm": 0.23685547709465027, + "learning_rate": 4.444847708107562e-06, + "loss": 0.0016, + "num_input_tokens_seen": 9606168, + "step": 4903 + }, + { + "epoch": 0.6499668654738238, + "grad_norm": 8.615063667297363, + "learning_rate": 4.444629569205018e-06, + "loss": 0.3142, + "num_input_tokens_seen": 9608024, + "step": 4904 + }, + { + "epoch": 0.6500994035785288, + "grad_norm": 10.248557090759277, + "learning_rate": 4.444411392808674e-06, + "loss": 0.1872, + "num_input_tokens_seen": 9609992, + "step": 4905 + }, + { + "epoch": 0.650231941683234, + "grad_norm": 9.712374687194824, + "learning_rate": 4.444193178922737e-06, + "loss": 0.1912, + "num_input_tokens_seen": 9611832, + "step": 4906 + }, + { + "epoch": 0.650364479787939, + "grad_norm": 14.79377555847168, + "learning_rate": 4.443974927551414e-06, + "loss": 0.3073, + "num_input_tokens_seen": 9613768, + "step": 4907 + }, + { + "epoch": 0.6504970178926441, + "grad_norm": 14.20767879486084, + "learning_rate": 4.443756638698913e-06, + "loss": 0.7216, + "num_input_tokens_seen": 9615856, + "step": 4908 + }, + { + "epoch": 0.6506295559973493, + "grad_norm": 11.211538314819336, + "learning_rate": 4.443538312369444e-06, + "loss": 0.5318, + "num_input_tokens_seen": 9618648, + "step": 4909 + }, + { + "epoch": 0.6507620941020543, + "grad_norm": 8.546815872192383, + "learning_rate": 4.443319948567215e-06, + "loss": 0.2733, + "num_input_tokens_seen": 9620232, + "step": 4910 + }, + { + "epoch": 0.6508946322067595, + "grad_norm": 6.848811149597168, + "learning_rate": 4.443101547296437e-06, + "loss": 0.2028, + "num_input_tokens_seen": 9622424, + "step": 4911 + }, + { + "epoch": 0.6510271703114645, + "grad_norm": 0.6491931080818176, + "learning_rate": 4.442883108561321e-06, + "loss": 0.0041, + "num_input_tokens_seen": 9624488, + "step": 4912 + }, + { + "epoch": 0.6511597084161697, + "grad_norm": 12.751710891723633, + "learning_rate": 4.442664632366077e-06, + "loss": 0.4509, + "num_input_tokens_seen": 9626704, + "step": 4913 + }, + { + "epoch": 0.6512922465208748, + "grad_norm": 1.621903419494629, + "learning_rate": 4.44244611871492e-06, + "loss": 0.0213, + "num_input_tokens_seen": 9629200, + "step": 4914 + }, + { + "epoch": 0.6514247846255798, + "grad_norm": 6.312143325805664, + "learning_rate": 4.442227567612062e-06, + "loss": 0.2643, + "num_input_tokens_seen": 9631728, + "step": 4915 + }, + { + "epoch": 0.651557322730285, + "grad_norm": 11.068902969360352, + "learning_rate": 4.442008979061715e-06, + "loss": 0.2523, + "num_input_tokens_seen": 9633224, + "step": 4916 + }, + { + "epoch": 0.65168986083499, + "grad_norm": 15.456021308898926, + "learning_rate": 4.441790353068098e-06, + "loss": 0.3187, + "num_input_tokens_seen": 9635936, + "step": 4917 + }, + { + "epoch": 0.6518223989396952, + "grad_norm": 14.133515357971191, + "learning_rate": 4.4415716896354205e-06, + "loss": 0.4168, + "num_input_tokens_seen": 9637712, + "step": 4918 + }, + { + "epoch": 0.6519549370444002, + "grad_norm": 3.9950807094573975, + "learning_rate": 4.441352988767903e-06, + "loss": 0.0693, + "num_input_tokens_seen": 9639592, + "step": 4919 + }, + { + "epoch": 0.6520874751491054, + "grad_norm": 10.165696144104004, + "learning_rate": 4.4411342504697595e-06, + "loss": 0.2075, + "num_input_tokens_seen": 9641832, + "step": 4920 + }, + { + "epoch": 0.6522200132538105, + "grad_norm": 9.085101127624512, + "learning_rate": 4.4409154747452096e-06, + "loss": 0.4002, + "num_input_tokens_seen": 9644000, + "step": 4921 + }, + { + "epoch": 0.6523525513585156, + "grad_norm": 4.153408050537109, + "learning_rate": 4.44069666159847e-06, + "loss": 0.0416, + "num_input_tokens_seen": 9646800, + "step": 4922 + }, + { + "epoch": 0.6524850894632207, + "grad_norm": 13.377498626708984, + "learning_rate": 4.4404778110337595e-06, + "loss": 0.3605, + "num_input_tokens_seen": 9648520, + "step": 4923 + }, + { + "epoch": 0.6526176275679257, + "grad_norm": 13.998909950256348, + "learning_rate": 4.440258923055298e-06, + "loss": 0.4652, + "num_input_tokens_seen": 9651128, + "step": 4924 + }, + { + "epoch": 0.6527501656726309, + "grad_norm": 0.3415073752403259, + "learning_rate": 4.440039997667307e-06, + "loss": 0.0022, + "num_input_tokens_seen": 9653432, + "step": 4925 + }, + { + "epoch": 0.652882703777336, + "grad_norm": 0.08827115595340729, + "learning_rate": 4.439821034874006e-06, + "loss": 0.0006, + "num_input_tokens_seen": 9654896, + "step": 4926 + }, + { + "epoch": 0.6530152418820411, + "grad_norm": 1.8026548624038696, + "learning_rate": 4.439602034679618e-06, + "loss": 0.0417, + "num_input_tokens_seen": 9656344, + "step": 4927 + }, + { + "epoch": 0.6531477799867462, + "grad_norm": 11.30551528930664, + "learning_rate": 4.439382997088363e-06, + "loss": 0.2153, + "num_input_tokens_seen": 9658512, + "step": 4928 + }, + { + "epoch": 0.6532803180914513, + "grad_norm": 3.9185690879821777, + "learning_rate": 4.439163922104467e-06, + "loss": 0.0273, + "num_input_tokens_seen": 9659952, + "step": 4929 + }, + { + "epoch": 0.6534128561961564, + "grad_norm": 11.925789833068848, + "learning_rate": 4.438944809732153e-06, + "loss": 0.2703, + "num_input_tokens_seen": 9661648, + "step": 4930 + }, + { + "epoch": 0.6535453943008614, + "grad_norm": 5.87583589553833, + "learning_rate": 4.438725659975644e-06, + "loss": 0.0812, + "num_input_tokens_seen": 9663776, + "step": 4931 + }, + { + "epoch": 0.6536779324055666, + "grad_norm": 13.513618469238281, + "learning_rate": 4.438506472839169e-06, + "loss": 0.4256, + "num_input_tokens_seen": 9665696, + "step": 4932 + }, + { + "epoch": 0.6538104705102717, + "grad_norm": 12.337636947631836, + "learning_rate": 4.43828724832695e-06, + "loss": 0.2735, + "num_input_tokens_seen": 9668368, + "step": 4933 + }, + { + "epoch": 0.6539430086149768, + "grad_norm": 4.707254886627197, + "learning_rate": 4.438067986443217e-06, + "loss": 0.1286, + "num_input_tokens_seen": 9669824, + "step": 4934 + }, + { + "epoch": 0.6540755467196819, + "grad_norm": 1.4064189195632935, + "learning_rate": 4.437848687192195e-06, + "loss": 0.0056, + "num_input_tokens_seen": 9672800, + "step": 4935 + }, + { + "epoch": 0.654208084824387, + "grad_norm": 1.4091341495513916, + "learning_rate": 4.437629350578115e-06, + "loss": 0.0091, + "num_input_tokens_seen": 9675272, + "step": 4936 + }, + { + "epoch": 0.6543406229290921, + "grad_norm": 0.22482064366340637, + "learning_rate": 4.437409976605204e-06, + "loss": 0.0013, + "num_input_tokens_seen": 9676744, + "step": 4937 + }, + { + "epoch": 0.6544731610337973, + "grad_norm": 4.379144668579102, + "learning_rate": 4.437190565277691e-06, + "loss": 0.1677, + "num_input_tokens_seen": 9678872, + "step": 4938 + }, + { + "epoch": 0.6546056991385023, + "grad_norm": 4.113053321838379, + "learning_rate": 4.4369711165998085e-06, + "loss": 0.1263, + "num_input_tokens_seen": 9680992, + "step": 4939 + }, + { + "epoch": 0.6547382372432075, + "grad_norm": 9.140275955200195, + "learning_rate": 4.4367516305757864e-06, + "loss": 0.1357, + "num_input_tokens_seen": 9682272, + "step": 4940 + }, + { + "epoch": 0.6548707753479125, + "grad_norm": 0.03027283400297165, + "learning_rate": 4.436532107209857e-06, + "loss": 0.0002, + "num_input_tokens_seen": 9683552, + "step": 4941 + }, + { + "epoch": 0.6550033134526176, + "grad_norm": 0.1308932900428772, + "learning_rate": 4.436312546506252e-06, + "loss": 0.0008, + "num_input_tokens_seen": 9685320, + "step": 4942 + }, + { + "epoch": 0.6551358515573227, + "grad_norm": 0.13611389696598053, + "learning_rate": 4.436092948469205e-06, + "loss": 0.0009, + "num_input_tokens_seen": 9687064, + "step": 4943 + }, + { + "epoch": 0.6552683896620278, + "grad_norm": 11.976266860961914, + "learning_rate": 4.435873313102951e-06, + "loss": 0.3159, + "num_input_tokens_seen": 9689208, + "step": 4944 + }, + { + "epoch": 0.655400927766733, + "grad_norm": 0.048062171787023544, + "learning_rate": 4.435653640411724e-06, + "loss": 0.0003, + "num_input_tokens_seen": 9690600, + "step": 4945 + }, + { + "epoch": 0.655533465871438, + "grad_norm": 9.773316383361816, + "learning_rate": 4.43543393039976e-06, + "loss": 0.3407, + "num_input_tokens_seen": 9694160, + "step": 4946 + }, + { + "epoch": 0.6556660039761432, + "grad_norm": 9.891671180725098, + "learning_rate": 4.435214183071294e-06, + "loss": 0.4888, + "num_input_tokens_seen": 9697208, + "step": 4947 + }, + { + "epoch": 0.6557985420808482, + "grad_norm": 1.2753030061721802, + "learning_rate": 4.434994398430563e-06, + "loss": 0.0082, + "num_input_tokens_seen": 9699944, + "step": 4948 + }, + { + "epoch": 0.6559310801855534, + "grad_norm": 7.925821304321289, + "learning_rate": 4.434774576481806e-06, + "loss": 0.2255, + "num_input_tokens_seen": 9702176, + "step": 4949 + }, + { + "epoch": 0.6560636182902585, + "grad_norm": 0.013318458572030067, + "learning_rate": 4.43455471722926e-06, + "loss": 0.0001, + "num_input_tokens_seen": 9703720, + "step": 4950 + }, + { + "epoch": 0.6561961563949635, + "grad_norm": 10.792168617248535, + "learning_rate": 4.4343348206771645e-06, + "loss": 0.403, + "num_input_tokens_seen": 9706440, + "step": 4951 + }, + { + "epoch": 0.6563286944996687, + "grad_norm": 9.374406814575195, + "learning_rate": 4.43411488682976e-06, + "loss": 0.1483, + "num_input_tokens_seen": 9707768, + "step": 4952 + }, + { + "epoch": 0.6564612326043737, + "grad_norm": 8.012020111083984, + "learning_rate": 4.433894915691285e-06, + "loss": 0.1881, + "num_input_tokens_seen": 9711160, + "step": 4953 + }, + { + "epoch": 0.6565937707090789, + "grad_norm": 9.518787384033203, + "learning_rate": 4.433674907265982e-06, + "loss": 0.3732, + "num_input_tokens_seen": 9713744, + "step": 4954 + }, + { + "epoch": 0.6567263088137839, + "grad_norm": 21.15972328186035, + "learning_rate": 4.433454861558094e-06, + "loss": 0.6788, + "num_input_tokens_seen": 9715776, + "step": 4955 + }, + { + "epoch": 0.6568588469184891, + "grad_norm": 0.03623078763484955, + "learning_rate": 4.4332347785718615e-06, + "loss": 0.0003, + "num_input_tokens_seen": 9717664, + "step": 4956 + }, + { + "epoch": 0.6569913850231942, + "grad_norm": 13.509406089782715, + "learning_rate": 4.433014658311529e-06, + "loss": 0.2496, + "num_input_tokens_seen": 9719592, + "step": 4957 + }, + { + "epoch": 0.6571239231278992, + "grad_norm": 9.939379692077637, + "learning_rate": 4.4327945007813406e-06, + "loss": 0.23, + "num_input_tokens_seen": 9721544, + "step": 4958 + }, + { + "epoch": 0.6572564612326044, + "grad_norm": 4.431924343109131, + "learning_rate": 4.432574305985541e-06, + "loss": 0.0807, + "num_input_tokens_seen": 9723856, + "step": 4959 + }, + { + "epoch": 0.6573889993373094, + "grad_norm": 9.369827270507812, + "learning_rate": 4.432354073928374e-06, + "loss": 0.1657, + "num_input_tokens_seen": 9726120, + "step": 4960 + }, + { + "epoch": 0.6575215374420146, + "grad_norm": 10.54124641418457, + "learning_rate": 4.432133804614089e-06, + "loss": 0.2355, + "num_input_tokens_seen": 9727688, + "step": 4961 + }, + { + "epoch": 0.6576540755467197, + "grad_norm": 0.13701234757900238, + "learning_rate": 4.431913498046931e-06, + "loss": 0.0009, + "num_input_tokens_seen": 9728968, + "step": 4962 + }, + { + "epoch": 0.6577866136514248, + "grad_norm": 2.403229236602783, + "learning_rate": 4.431693154231148e-06, + "loss": 0.0378, + "num_input_tokens_seen": 9730368, + "step": 4963 + }, + { + "epoch": 0.6579191517561299, + "grad_norm": 0.7506384253501892, + "learning_rate": 4.431472773170988e-06, + "loss": 0.0042, + "num_input_tokens_seen": 9732056, + "step": 4964 + }, + { + "epoch": 0.658051689860835, + "grad_norm": 20.07316780090332, + "learning_rate": 4.431252354870701e-06, + "loss": 0.6298, + "num_input_tokens_seen": 9733952, + "step": 4965 + }, + { + "epoch": 0.6581842279655401, + "grad_norm": 13.885061264038086, + "learning_rate": 4.431031899334537e-06, + "loss": 0.2005, + "num_input_tokens_seen": 9735216, + "step": 4966 + }, + { + "epoch": 0.6583167660702451, + "grad_norm": 4.1072678565979, + "learning_rate": 4.430811406566744e-06, + "loss": 0.029, + "num_input_tokens_seen": 9736824, + "step": 4967 + }, + { + "epoch": 0.6584493041749503, + "grad_norm": 0.10417488217353821, + "learning_rate": 4.430590876571577e-06, + "loss": 0.0007, + "num_input_tokens_seen": 9738592, + "step": 4968 + }, + { + "epoch": 0.6585818422796554, + "grad_norm": 0.12139590084552765, + "learning_rate": 4.4303703093532845e-06, + "loss": 0.0008, + "num_input_tokens_seen": 9740584, + "step": 4969 + }, + { + "epoch": 0.6587143803843605, + "grad_norm": 6.384578704833984, + "learning_rate": 4.430149704916121e-06, + "loss": 0.2513, + "num_input_tokens_seen": 9742400, + "step": 4970 + }, + { + "epoch": 0.6588469184890656, + "grad_norm": 7.288904666900635, + "learning_rate": 4.42992906326434e-06, + "loss": 0.1837, + "num_input_tokens_seen": 9744408, + "step": 4971 + }, + { + "epoch": 0.6589794565937707, + "grad_norm": 10.088225364685059, + "learning_rate": 4.429708384402195e-06, + "loss": 0.1502, + "num_input_tokens_seen": 9747288, + "step": 4972 + }, + { + "epoch": 0.6591119946984758, + "grad_norm": 12.420062065124512, + "learning_rate": 4.429487668333941e-06, + "loss": 0.4258, + "num_input_tokens_seen": 9749792, + "step": 4973 + }, + { + "epoch": 0.659244532803181, + "grad_norm": 10.40991497039795, + "learning_rate": 4.4292669150638346e-06, + "loss": 0.3383, + "num_input_tokens_seen": 9751632, + "step": 4974 + }, + { + "epoch": 0.659377070907886, + "grad_norm": 0.06113347411155701, + "learning_rate": 4.42904612459613e-06, + "loss": 0.0004, + "num_input_tokens_seen": 9753880, + "step": 4975 + }, + { + "epoch": 0.6595096090125911, + "grad_norm": 0.35043495893478394, + "learning_rate": 4.428825296935086e-06, + "loss": 0.0024, + "num_input_tokens_seen": 9755488, + "step": 4976 + }, + { + "epoch": 0.6596421471172962, + "grad_norm": 1.0674535036087036, + "learning_rate": 4.428604432084959e-06, + "loss": 0.0049, + "num_input_tokens_seen": 9757920, + "step": 4977 + }, + { + "epoch": 0.6597746852220013, + "grad_norm": 0.04963526502251625, + "learning_rate": 4.42838353005001e-06, + "loss": 0.0003, + "num_input_tokens_seen": 9759776, + "step": 4978 + }, + { + "epoch": 0.6599072233267065, + "grad_norm": 11.400837898254395, + "learning_rate": 4.428162590834495e-06, + "loss": 0.2831, + "num_input_tokens_seen": 9761504, + "step": 4979 + }, + { + "epoch": 0.6600397614314115, + "grad_norm": 2.0109293460845947, + "learning_rate": 4.427941614442675e-06, + "loss": 0.0124, + "num_input_tokens_seen": 9762768, + "step": 4980 + }, + { + "epoch": 0.6601722995361167, + "grad_norm": 9.907425880432129, + "learning_rate": 4.4277206008788096e-06, + "loss": 0.1858, + "num_input_tokens_seen": 9764432, + "step": 4981 + }, + { + "epoch": 0.6603048376408217, + "grad_norm": 9.269798278808594, + "learning_rate": 4.4274995501471625e-06, + "loss": 0.1293, + "num_input_tokens_seen": 9767344, + "step": 4982 + }, + { + "epoch": 0.6604373757455269, + "grad_norm": 9.2672758102417, + "learning_rate": 4.427278462251994e-06, + "loss": 0.24, + "num_input_tokens_seen": 9769200, + "step": 4983 + }, + { + "epoch": 0.6605699138502319, + "grad_norm": 4.715796947479248, + "learning_rate": 4.427057337197566e-06, + "loss": 0.0396, + "num_input_tokens_seen": 9771176, + "step": 4984 + }, + { + "epoch": 0.660702451954937, + "grad_norm": 7.958349227905273, + "learning_rate": 4.426836174988145e-06, + "loss": 0.2333, + "num_input_tokens_seen": 9772376, + "step": 4985 + }, + { + "epoch": 0.6608349900596422, + "grad_norm": 0.32258349657058716, + "learning_rate": 4.426614975627993e-06, + "loss": 0.0019, + "num_input_tokens_seen": 9774120, + "step": 4986 + }, + { + "epoch": 0.6609675281643472, + "grad_norm": 7.679451942443848, + "learning_rate": 4.426393739121374e-06, + "loss": 0.1117, + "num_input_tokens_seen": 9776280, + "step": 4987 + }, + { + "epoch": 0.6611000662690524, + "grad_norm": 5.332755088806152, + "learning_rate": 4.426172465472556e-06, + "loss": 0.2184, + "num_input_tokens_seen": 9777880, + "step": 4988 + }, + { + "epoch": 0.6612326043737574, + "grad_norm": 7.2600178718566895, + "learning_rate": 4.425951154685804e-06, + "loss": 0.1276, + "num_input_tokens_seen": 9780368, + "step": 4989 + }, + { + "epoch": 0.6613651424784626, + "grad_norm": 6.8875322341918945, + "learning_rate": 4.425729806765385e-06, + "loss": 0.1292, + "num_input_tokens_seen": 9782384, + "step": 4990 + }, + { + "epoch": 0.6614976805831677, + "grad_norm": 13.065975189208984, + "learning_rate": 4.425508421715565e-06, + "loss": 0.3635, + "num_input_tokens_seen": 9784872, + "step": 4991 + }, + { + "epoch": 0.6616302186878728, + "grad_norm": 0.07522980123758316, + "learning_rate": 4.425286999540617e-06, + "loss": 0.0005, + "num_input_tokens_seen": 9786944, + "step": 4992 + }, + { + "epoch": 0.6617627567925779, + "grad_norm": 0.042785629630088806, + "learning_rate": 4.425065540244805e-06, + "loss": 0.0003, + "num_input_tokens_seen": 9788144, + "step": 4993 + }, + { + "epoch": 0.6618952948972829, + "grad_norm": 0.03004760667681694, + "learning_rate": 4.424844043832403e-06, + "loss": 0.0002, + "num_input_tokens_seen": 9790128, + "step": 4994 + }, + { + "epoch": 0.6620278330019881, + "grad_norm": 13.8309907913208, + "learning_rate": 4.424622510307679e-06, + "loss": 0.2251, + "num_input_tokens_seen": 9792896, + "step": 4995 + }, + { + "epoch": 0.6621603711066931, + "grad_norm": 11.583640098571777, + "learning_rate": 4.424400939674906e-06, + "loss": 0.5219, + "num_input_tokens_seen": 9794944, + "step": 4996 + }, + { + "epoch": 0.6622929092113983, + "grad_norm": 12.569435119628906, + "learning_rate": 4.424179331938355e-06, + "loss": 0.3063, + "num_input_tokens_seen": 9797352, + "step": 4997 + }, + { + "epoch": 0.6624254473161034, + "grad_norm": 8.057336807250977, + "learning_rate": 4.423957687102299e-06, + "loss": 0.1473, + "num_input_tokens_seen": 9799264, + "step": 4998 + }, + { + "epoch": 0.6625579854208085, + "grad_norm": 20.758127212524414, + "learning_rate": 4.423736005171012e-06, + "loss": 0.7184, + "num_input_tokens_seen": 9801504, + "step": 4999 + }, + { + "epoch": 0.6626905235255136, + "grad_norm": 9.654420852661133, + "learning_rate": 4.423514286148768e-06, + "loss": 0.3853, + "num_input_tokens_seen": 9803672, + "step": 5000 + }, + { + "epoch": 0.6628230616302186, + "grad_norm": 0.25291866064071655, + "learning_rate": 4.423292530039841e-06, + "loss": 0.0013, + "num_input_tokens_seen": 9806840, + "step": 5001 + }, + { + "epoch": 0.6629555997349238, + "grad_norm": 5.552849292755127, + "learning_rate": 4.423070736848507e-06, + "loss": 0.0931, + "num_input_tokens_seen": 9808376, + "step": 5002 + }, + { + "epoch": 0.6630881378396289, + "grad_norm": 15.180678367614746, + "learning_rate": 4.422848906579042e-06, + "loss": 0.0912, + "num_input_tokens_seen": 9810656, + "step": 5003 + }, + { + "epoch": 0.663220675944334, + "grad_norm": 12.315427780151367, + "learning_rate": 4.422627039235725e-06, + "loss": 0.2831, + "num_input_tokens_seen": 9812360, + "step": 5004 + }, + { + "epoch": 0.6633532140490391, + "grad_norm": 0.5663869380950928, + "learning_rate": 4.422405134822831e-06, + "loss": 0.0031, + "num_input_tokens_seen": 9813896, + "step": 5005 + }, + { + "epoch": 0.6634857521537442, + "grad_norm": 4.510916709899902, + "learning_rate": 4.422183193344642e-06, + "loss": 0.0822, + "num_input_tokens_seen": 9815664, + "step": 5006 + }, + { + "epoch": 0.6636182902584493, + "grad_norm": 11.862811088562012, + "learning_rate": 4.421961214805432e-06, + "loss": 0.4808, + "num_input_tokens_seen": 9817176, + "step": 5007 + }, + { + "epoch": 0.6637508283631544, + "grad_norm": 11.01945686340332, + "learning_rate": 4.4217391992094864e-06, + "loss": 0.1436, + "num_input_tokens_seen": 9818744, + "step": 5008 + }, + { + "epoch": 0.6638833664678595, + "grad_norm": 3.3814823627471924, + "learning_rate": 4.4215171465610815e-06, + "loss": 0.0996, + "num_input_tokens_seen": 9820520, + "step": 5009 + }, + { + "epoch": 0.6640159045725647, + "grad_norm": 0.8859408497810364, + "learning_rate": 4.421295056864501e-06, + "loss": 0.0048, + "num_input_tokens_seen": 9822280, + "step": 5010 + }, + { + "epoch": 0.6641484426772697, + "grad_norm": 9.419044494628906, + "learning_rate": 4.4210729301240266e-06, + "loss": 0.3337, + "num_input_tokens_seen": 9823824, + "step": 5011 + }, + { + "epoch": 0.6642809807819748, + "grad_norm": 9.986599922180176, + "learning_rate": 4.42085076634394e-06, + "loss": 0.2552, + "num_input_tokens_seen": 9825968, + "step": 5012 + }, + { + "epoch": 0.6644135188866799, + "grad_norm": 8.335803985595703, + "learning_rate": 4.420628565528527e-06, + "loss": 0.2553, + "num_input_tokens_seen": 9828984, + "step": 5013 + }, + { + "epoch": 0.664546056991385, + "grad_norm": 15.004911422729492, + "learning_rate": 4.42040632768207e-06, + "loss": 0.4594, + "num_input_tokens_seen": 9831016, + "step": 5014 + }, + { + "epoch": 0.6646785950960902, + "grad_norm": 30.192182540893555, + "learning_rate": 4.420184052808854e-06, + "loss": 1.0335, + "num_input_tokens_seen": 9834088, + "step": 5015 + }, + { + "epoch": 0.6648111332007952, + "grad_norm": 15.952327728271484, + "learning_rate": 4.419961740913163e-06, + "loss": 0.163, + "num_input_tokens_seen": 9836912, + "step": 5016 + }, + { + "epoch": 0.6649436713055004, + "grad_norm": 4.233673572540283, + "learning_rate": 4.419739391999287e-06, + "loss": 0.0357, + "num_input_tokens_seen": 9838912, + "step": 5017 + }, + { + "epoch": 0.6650762094102054, + "grad_norm": 9.261635780334473, + "learning_rate": 4.419517006071511e-06, + "loss": 0.1837, + "num_input_tokens_seen": 9840848, + "step": 5018 + }, + { + "epoch": 0.6652087475149105, + "grad_norm": 0.715478777885437, + "learning_rate": 4.419294583134123e-06, + "loss": 0.0037, + "num_input_tokens_seen": 9842400, + "step": 5019 + }, + { + "epoch": 0.6653412856196156, + "grad_norm": 25.23809814453125, + "learning_rate": 4.419072123191411e-06, + "loss": 0.4748, + "num_input_tokens_seen": 9843704, + "step": 5020 + }, + { + "epoch": 0.6654738237243207, + "grad_norm": 2.126145839691162, + "learning_rate": 4.418849626247664e-06, + "loss": 0.0786, + "num_input_tokens_seen": 9845176, + "step": 5021 + }, + { + "epoch": 0.6656063618290259, + "grad_norm": 24.467252731323242, + "learning_rate": 4.418627092307174e-06, + "loss": 1.2944, + "num_input_tokens_seen": 9847992, + "step": 5022 + }, + { + "epoch": 0.6657388999337309, + "grad_norm": 0.4962744116783142, + "learning_rate": 4.418404521374229e-06, + "loss": 0.0032, + "num_input_tokens_seen": 9850216, + "step": 5023 + }, + { + "epoch": 0.6658714380384361, + "grad_norm": 4.327403545379639, + "learning_rate": 4.418181913453121e-06, + "loss": 0.0677, + "num_input_tokens_seen": 9852520, + "step": 5024 + }, + { + "epoch": 0.6660039761431411, + "grad_norm": 9.603227615356445, + "learning_rate": 4.417959268548143e-06, + "loss": 0.2486, + "num_input_tokens_seen": 9854600, + "step": 5025 + }, + { + "epoch": 0.6661365142478463, + "grad_norm": 9.059036254882812, + "learning_rate": 4.417736586663587e-06, + "loss": 0.2638, + "num_input_tokens_seen": 9856904, + "step": 5026 + }, + { + "epoch": 0.6662690523525514, + "grad_norm": 5.030508041381836, + "learning_rate": 4.417513867803747e-06, + "loss": 0.1136, + "num_input_tokens_seen": 9858408, + "step": 5027 + }, + { + "epoch": 0.6664015904572564, + "grad_norm": 14.200212478637695, + "learning_rate": 4.417291111972917e-06, + "loss": 0.6719, + "num_input_tokens_seen": 9860312, + "step": 5028 + }, + { + "epoch": 0.6665341285619616, + "grad_norm": 0.14212562143802643, + "learning_rate": 4.417068319175391e-06, + "loss": 0.001, + "num_input_tokens_seen": 9862080, + "step": 5029 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 7.112290859222412, + "learning_rate": 4.416845489415465e-06, + "loss": 0.1419, + "num_input_tokens_seen": 9863936, + "step": 5030 + }, + { + "epoch": 0.6667992047713718, + "grad_norm": 3.9986684322357178, + "learning_rate": 4.416622622697436e-06, + "loss": 0.0323, + "num_input_tokens_seen": 9865656, + "step": 5031 + }, + { + "epoch": 0.6669317428760769, + "grad_norm": 6.242071628570557, + "learning_rate": 4.416399719025602e-06, + "loss": 0.0924, + "num_input_tokens_seen": 9867240, + "step": 5032 + }, + { + "epoch": 0.667064280980782, + "grad_norm": 0.14766113460063934, + "learning_rate": 4.416176778404258e-06, + "loss": 0.001, + "num_input_tokens_seen": 9869616, + "step": 5033 + }, + { + "epoch": 0.6671968190854871, + "grad_norm": 20.075815200805664, + "learning_rate": 4.415953800837704e-06, + "loss": 0.6163, + "num_input_tokens_seen": 9871400, + "step": 5034 + }, + { + "epoch": 0.6673293571901922, + "grad_norm": 0.17864811420440674, + "learning_rate": 4.415730786330238e-06, + "loss": 0.0012, + "num_input_tokens_seen": 9873608, + "step": 5035 + }, + { + "epoch": 0.6674618952948973, + "grad_norm": 18.78772735595703, + "learning_rate": 4.415507734886162e-06, + "loss": 0.3052, + "num_input_tokens_seen": 9875672, + "step": 5036 + }, + { + "epoch": 0.6675944333996023, + "grad_norm": 10.10122299194336, + "learning_rate": 4.415284646509775e-06, + "loss": 0.3539, + "num_input_tokens_seen": 9877736, + "step": 5037 + }, + { + "epoch": 0.6677269715043075, + "grad_norm": 5.717212677001953, + "learning_rate": 4.415061521205379e-06, + "loss": 0.1252, + "num_input_tokens_seen": 9879688, + "step": 5038 + }, + { + "epoch": 0.6678595096090126, + "grad_norm": 7.765366077423096, + "learning_rate": 4.414838358977276e-06, + "loss": 0.1498, + "num_input_tokens_seen": 9881624, + "step": 5039 + }, + { + "epoch": 0.6679920477137177, + "grad_norm": 0.09273875504732132, + "learning_rate": 4.414615159829768e-06, + "loss": 0.0006, + "num_input_tokens_seen": 9883080, + "step": 5040 + }, + { + "epoch": 0.6681245858184228, + "grad_norm": 12.386722564697266, + "learning_rate": 4.4143919237671595e-06, + "loss": 0.3715, + "num_input_tokens_seen": 9885576, + "step": 5041 + }, + { + "epoch": 0.6682571239231279, + "grad_norm": 0.190579354763031, + "learning_rate": 4.414168650793753e-06, + "loss": 0.0013, + "num_input_tokens_seen": 9886888, + "step": 5042 + }, + { + "epoch": 0.668389662027833, + "grad_norm": 5.57904577255249, + "learning_rate": 4.413945340913855e-06, + "loss": 0.1759, + "num_input_tokens_seen": 9888640, + "step": 5043 + }, + { + "epoch": 0.6685222001325382, + "grad_norm": 0.106113962829113, + "learning_rate": 4.413721994131771e-06, + "loss": 0.0007, + "num_input_tokens_seen": 9890640, + "step": 5044 + }, + { + "epoch": 0.6686547382372432, + "grad_norm": 4.529018878936768, + "learning_rate": 4.4134986104518065e-06, + "loss": 0.028, + "num_input_tokens_seen": 9892568, + "step": 5045 + }, + { + "epoch": 0.6687872763419483, + "grad_norm": 11.19788646697998, + "learning_rate": 4.4132751898782676e-06, + "loss": 0.3302, + "num_input_tokens_seen": 9894744, + "step": 5046 + }, + { + "epoch": 0.6689198144466534, + "grad_norm": 2.2651402950286865, + "learning_rate": 4.4130517324154645e-06, + "loss": 0.0268, + "num_input_tokens_seen": 9896528, + "step": 5047 + }, + { + "epoch": 0.6690523525513585, + "grad_norm": 0.4463745951652527, + "learning_rate": 4.412828238067704e-06, + "loss": 0.0029, + "num_input_tokens_seen": 9897952, + "step": 5048 + }, + { + "epoch": 0.6691848906560636, + "grad_norm": 9.748815536499023, + "learning_rate": 4.412604706839294e-06, + "loss": 0.2879, + "num_input_tokens_seen": 9899840, + "step": 5049 + }, + { + "epoch": 0.6693174287607687, + "grad_norm": 0.17950616776943207, + "learning_rate": 4.412381138734547e-06, + "loss": 0.0012, + "num_input_tokens_seen": 9900984, + "step": 5050 + }, + { + "epoch": 0.6694499668654739, + "grad_norm": 18.12680435180664, + "learning_rate": 4.4121575337577725e-06, + "loss": 0.6558, + "num_input_tokens_seen": 9902728, + "step": 5051 + }, + { + "epoch": 0.6695825049701789, + "grad_norm": 10.23061466217041, + "learning_rate": 4.411933891913281e-06, + "loss": 0.4668, + "num_input_tokens_seen": 9904536, + "step": 5052 + }, + { + "epoch": 0.669715043074884, + "grad_norm": 0.5455088019371033, + "learning_rate": 4.411710213205386e-06, + "loss": 0.0036, + "num_input_tokens_seen": 9906256, + "step": 5053 + }, + { + "epoch": 0.6698475811795891, + "grad_norm": 4.464481830596924, + "learning_rate": 4.411486497638399e-06, + "loss": 0.0307, + "num_input_tokens_seen": 9908024, + "step": 5054 + }, + { + "epoch": 0.6699801192842942, + "grad_norm": 0.10425784438848495, + "learning_rate": 4.411262745216633e-06, + "loss": 0.0007, + "num_input_tokens_seen": 9910368, + "step": 5055 + }, + { + "epoch": 0.6701126573889994, + "grad_norm": 10.11318588256836, + "learning_rate": 4.411038955944403e-06, + "loss": 0.436, + "num_input_tokens_seen": 9912680, + "step": 5056 + }, + { + "epoch": 0.6702451954937044, + "grad_norm": 7.7582783699035645, + "learning_rate": 4.410815129826025e-06, + "loss": 0.2796, + "num_input_tokens_seen": 9914472, + "step": 5057 + }, + { + "epoch": 0.6703777335984096, + "grad_norm": 13.129932403564453, + "learning_rate": 4.410591266865812e-06, + "loss": 0.4313, + "num_input_tokens_seen": 9918120, + "step": 5058 + }, + { + "epoch": 0.6705102717031146, + "grad_norm": 3.296034097671509, + "learning_rate": 4.410367367068081e-06, + "loss": 0.0263, + "num_input_tokens_seen": 9920608, + "step": 5059 + }, + { + "epoch": 0.6706428098078198, + "grad_norm": 9.669759750366211, + "learning_rate": 4.410143430437149e-06, + "loss": 0.1508, + "num_input_tokens_seen": 9922712, + "step": 5060 + }, + { + "epoch": 0.6707753479125248, + "grad_norm": 0.13385877013206482, + "learning_rate": 4.409919456977335e-06, + "loss": 0.0009, + "num_input_tokens_seen": 9925560, + "step": 5061 + }, + { + "epoch": 0.67090788601723, + "grad_norm": 8.13770866394043, + "learning_rate": 4.409695446692955e-06, + "loss": 0.2427, + "num_input_tokens_seen": 9927880, + "step": 5062 + }, + { + "epoch": 0.6710404241219351, + "grad_norm": 0.09607189893722534, + "learning_rate": 4.4094713995883304e-06, + "loss": 0.0007, + "num_input_tokens_seen": 9929744, + "step": 5063 + }, + { + "epoch": 0.6711729622266401, + "grad_norm": 17.464380264282227, + "learning_rate": 4.4092473156677795e-06, + "loss": 0.8497, + "num_input_tokens_seen": 9932808, + "step": 5064 + }, + { + "epoch": 0.6713055003313453, + "grad_norm": 0.20174774527549744, + "learning_rate": 4.409023194935623e-06, + "loss": 0.0012, + "num_input_tokens_seen": 9935472, + "step": 5065 + }, + { + "epoch": 0.6714380384360503, + "grad_norm": 2.341762065887451, + "learning_rate": 4.408799037396183e-06, + "loss": 0.0109, + "num_input_tokens_seen": 9937104, + "step": 5066 + }, + { + "epoch": 0.6715705765407555, + "grad_norm": 1.1015726327896118, + "learning_rate": 4.408574843053781e-06, + "loss": 0.0052, + "num_input_tokens_seen": 9939040, + "step": 5067 + }, + { + "epoch": 0.6717031146454606, + "grad_norm": 8.07042121887207, + "learning_rate": 4.408350611912739e-06, + "loss": 0.278, + "num_input_tokens_seen": 9941528, + "step": 5068 + }, + { + "epoch": 0.6718356527501657, + "grad_norm": 2.494370460510254, + "learning_rate": 4.40812634397738e-06, + "loss": 0.019, + "num_input_tokens_seen": 9943656, + "step": 5069 + }, + { + "epoch": 0.6719681908548708, + "grad_norm": 5.115164756774902, + "learning_rate": 4.407902039252029e-06, + "loss": 0.1386, + "num_input_tokens_seen": 9945536, + "step": 5070 + }, + { + "epoch": 0.6721007289595758, + "grad_norm": 7.323765754699707, + "learning_rate": 4.407677697741011e-06, + "loss": 0.0923, + "num_input_tokens_seen": 9947696, + "step": 5071 + }, + { + "epoch": 0.672233267064281, + "grad_norm": 5.594784259796143, + "learning_rate": 4.40745331944865e-06, + "loss": 0.1361, + "num_input_tokens_seen": 9949160, + "step": 5072 + }, + { + "epoch": 0.672365805168986, + "grad_norm": 5.180838108062744, + "learning_rate": 4.407228904379274e-06, + "loss": 0.1081, + "num_input_tokens_seen": 9951296, + "step": 5073 + }, + { + "epoch": 0.6724983432736912, + "grad_norm": 0.03395497426390648, + "learning_rate": 4.407004452537209e-06, + "loss": 0.0002, + "num_input_tokens_seen": 9953032, + "step": 5074 + }, + { + "epoch": 0.6726308813783963, + "grad_norm": 5.230647563934326, + "learning_rate": 4.406779963926782e-06, + "loss": 0.1623, + "num_input_tokens_seen": 9955040, + "step": 5075 + }, + { + "epoch": 0.6727634194831014, + "grad_norm": 5.3114824295043945, + "learning_rate": 4.406555438552322e-06, + "loss": 0.1561, + "num_input_tokens_seen": 9956904, + "step": 5076 + }, + { + "epoch": 0.6728959575878065, + "grad_norm": 0.02076246775686741, + "learning_rate": 4.406330876418158e-06, + "loss": 0.0001, + "num_input_tokens_seen": 9959328, + "step": 5077 + }, + { + "epoch": 0.6730284956925116, + "grad_norm": 18.329252243041992, + "learning_rate": 4.40610627752862e-06, + "loss": 0.5498, + "num_input_tokens_seen": 9961584, + "step": 5078 + }, + { + "epoch": 0.6731610337972167, + "grad_norm": 0.0381946824491024, + "learning_rate": 4.405881641888038e-06, + "loss": 0.0003, + "num_input_tokens_seen": 9963408, + "step": 5079 + }, + { + "epoch": 0.6732935719019219, + "grad_norm": 7.628283500671387, + "learning_rate": 4.405656969500742e-06, + "loss": 0.2154, + "num_input_tokens_seen": 9966168, + "step": 5080 + }, + { + "epoch": 0.6734261100066269, + "grad_norm": 0.015847966074943542, + "learning_rate": 4.405432260371066e-06, + "loss": 0.0001, + "num_input_tokens_seen": 9967768, + "step": 5081 + }, + { + "epoch": 0.673558648111332, + "grad_norm": 15.920512199401855, + "learning_rate": 4.405207514503341e-06, + "loss": 0.714, + "num_input_tokens_seen": 9970320, + "step": 5082 + }, + { + "epoch": 0.6736911862160371, + "grad_norm": 0.029172945767641068, + "learning_rate": 4.4049827319019015e-06, + "loss": 0.0002, + "num_input_tokens_seen": 9971592, + "step": 5083 + }, + { + "epoch": 0.6738237243207422, + "grad_norm": 7.37698221206665, + "learning_rate": 4.40475791257108e-06, + "loss": 0.1768, + "num_input_tokens_seen": 9972952, + "step": 5084 + }, + { + "epoch": 0.6739562624254473, + "grad_norm": 0.1236468106508255, + "learning_rate": 4.404533056515213e-06, + "loss": 0.0007, + "num_input_tokens_seen": 9974296, + "step": 5085 + }, + { + "epoch": 0.6740888005301524, + "grad_norm": 4.536341667175293, + "learning_rate": 4.404308163738634e-06, + "loss": 0.1213, + "num_input_tokens_seen": 9975856, + "step": 5086 + }, + { + "epoch": 0.6742213386348576, + "grad_norm": 12.280192375183105, + "learning_rate": 4.40408323424568e-06, + "loss": 0.1738, + "num_input_tokens_seen": 9977280, + "step": 5087 + }, + { + "epoch": 0.6743538767395626, + "grad_norm": 9.334624290466309, + "learning_rate": 4.403858268040688e-06, + "loss": 0.1273, + "num_input_tokens_seen": 9979472, + "step": 5088 + }, + { + "epoch": 0.6744864148442677, + "grad_norm": 10.807537078857422, + "learning_rate": 4.403633265127995e-06, + "loss": 0.5099, + "num_input_tokens_seen": 9982840, + "step": 5089 + }, + { + "epoch": 0.6746189529489728, + "grad_norm": 11.606165885925293, + "learning_rate": 4.40340822551194e-06, + "loss": 0.3692, + "num_input_tokens_seen": 9985032, + "step": 5090 + }, + { + "epoch": 0.6747514910536779, + "grad_norm": 0.4771312177181244, + "learning_rate": 4.40318314919686e-06, + "loss": 0.0032, + "num_input_tokens_seen": 9987528, + "step": 5091 + }, + { + "epoch": 0.6748840291583831, + "grad_norm": 9.602693557739258, + "learning_rate": 4.402958036187096e-06, + "loss": 0.261, + "num_input_tokens_seen": 9989528, + "step": 5092 + }, + { + "epoch": 0.6750165672630881, + "grad_norm": 10.47360610961914, + "learning_rate": 4.402732886486989e-06, + "loss": 0.2707, + "num_input_tokens_seen": 9992080, + "step": 5093 + }, + { + "epoch": 0.6751491053677933, + "grad_norm": 17.313518524169922, + "learning_rate": 4.402507700100879e-06, + "loss": 0.501, + "num_input_tokens_seen": 9993304, + "step": 5094 + }, + { + "epoch": 0.6752816434724983, + "grad_norm": 7.180430889129639, + "learning_rate": 4.402282477033109e-06, + "loss": 0.2267, + "num_input_tokens_seen": 9995136, + "step": 5095 + }, + { + "epoch": 0.6754141815772035, + "grad_norm": 10.786030769348145, + "learning_rate": 4.402057217288019e-06, + "loss": 0.2575, + "num_input_tokens_seen": 9996752, + "step": 5096 + }, + { + "epoch": 0.6755467196819086, + "grad_norm": 0.11024980247020721, + "learning_rate": 4.401831920869954e-06, + "loss": 0.0007, + "num_input_tokens_seen": 9997696, + "step": 5097 + }, + { + "epoch": 0.6756792577866136, + "grad_norm": 9.799957275390625, + "learning_rate": 4.401606587783258e-06, + "loss": 0.0871, + "num_input_tokens_seen": 9999552, + "step": 5098 + }, + { + "epoch": 0.6758117958913188, + "grad_norm": 0.19720172882080078, + "learning_rate": 4.4013812180322755e-06, + "loss": 0.0013, + "num_input_tokens_seen": 10001096, + "step": 5099 + }, + { + "epoch": 0.6759443339960238, + "grad_norm": 0.14439494907855988, + "learning_rate": 4.4011558116213506e-06, + "loss": 0.001, + "num_input_tokens_seen": 10002952, + "step": 5100 + }, + { + "epoch": 0.676076872100729, + "grad_norm": 12.2544584274292, + "learning_rate": 4.400930368554831e-06, + "loss": 0.5235, + "num_input_tokens_seen": 10004976, + "step": 5101 + }, + { + "epoch": 0.676209410205434, + "grad_norm": 15.562100410461426, + "learning_rate": 4.400704888837062e-06, + "loss": 0.4753, + "num_input_tokens_seen": 10007512, + "step": 5102 + }, + { + "epoch": 0.6763419483101392, + "grad_norm": 0.2386159598827362, + "learning_rate": 4.400479372472392e-06, + "loss": 0.0017, + "num_input_tokens_seen": 10008632, + "step": 5103 + }, + { + "epoch": 0.6764744864148443, + "grad_norm": 8.07843017578125, + "learning_rate": 4.400253819465168e-06, + "loss": 0.2633, + "num_input_tokens_seen": 10009776, + "step": 5104 + }, + { + "epoch": 0.6766070245195493, + "grad_norm": 0.3921319246292114, + "learning_rate": 4.400028229819739e-06, + "loss": 0.0028, + "num_input_tokens_seen": 10011712, + "step": 5105 + }, + { + "epoch": 0.6767395626242545, + "grad_norm": 18.368431091308594, + "learning_rate": 4.399802603540456e-06, + "loss": 0.6072, + "num_input_tokens_seen": 10014448, + "step": 5106 + }, + { + "epoch": 0.6768721007289595, + "grad_norm": 14.085212707519531, + "learning_rate": 4.399576940631668e-06, + "loss": 0.2236, + "num_input_tokens_seen": 10015936, + "step": 5107 + }, + { + "epoch": 0.6770046388336647, + "grad_norm": 20.190231323242188, + "learning_rate": 4.399351241097726e-06, + "loss": 0.6187, + "num_input_tokens_seen": 10018496, + "step": 5108 + }, + { + "epoch": 0.6771371769383698, + "grad_norm": 12.832771301269531, + "learning_rate": 4.399125504942982e-06, + "loss": 0.535, + "num_input_tokens_seen": 10020096, + "step": 5109 + }, + { + "epoch": 0.6772697150430749, + "grad_norm": 17.57253646850586, + "learning_rate": 4.398899732171787e-06, + "loss": 0.3736, + "num_input_tokens_seen": 10021616, + "step": 5110 + }, + { + "epoch": 0.67740225314778, + "grad_norm": 12.094898223876953, + "learning_rate": 4.398673922788497e-06, + "loss": 0.2396, + "num_input_tokens_seen": 10023520, + "step": 5111 + }, + { + "epoch": 0.6775347912524851, + "grad_norm": 10.971561431884766, + "learning_rate": 4.3984480767974625e-06, + "loss": 0.3923, + "num_input_tokens_seen": 10025736, + "step": 5112 + }, + { + "epoch": 0.6776673293571902, + "grad_norm": 0.4917202293872833, + "learning_rate": 4.398222194203039e-06, + "loss": 0.0034, + "num_input_tokens_seen": 10027328, + "step": 5113 + }, + { + "epoch": 0.6777998674618952, + "grad_norm": 7.606281757354736, + "learning_rate": 4.3979962750095836e-06, + "loss": 0.1778, + "num_input_tokens_seen": 10029240, + "step": 5114 + }, + { + "epoch": 0.6779324055666004, + "grad_norm": 0.25366276502609253, + "learning_rate": 4.39777031922145e-06, + "loss": 0.0018, + "num_input_tokens_seen": 10031744, + "step": 5115 + }, + { + "epoch": 0.6780649436713055, + "grad_norm": 14.961990356445312, + "learning_rate": 4.397544326842995e-06, + "loss": 0.3837, + "num_input_tokens_seen": 10034144, + "step": 5116 + }, + { + "epoch": 0.6781974817760106, + "grad_norm": 8.941864967346191, + "learning_rate": 4.397318297878577e-06, + "loss": 0.2587, + "num_input_tokens_seen": 10035784, + "step": 5117 + }, + { + "epoch": 0.6783300198807157, + "grad_norm": 5.699052333831787, + "learning_rate": 4.397092232332553e-06, + "loss": 0.2017, + "num_input_tokens_seen": 10037672, + "step": 5118 + }, + { + "epoch": 0.6784625579854208, + "grad_norm": 12.88026237487793, + "learning_rate": 4.396866130209282e-06, + "loss": 0.4548, + "num_input_tokens_seen": 10039416, + "step": 5119 + }, + { + "epoch": 0.6785950960901259, + "grad_norm": 16.35521697998047, + "learning_rate": 4.396639991513124e-06, + "loss": 0.5328, + "num_input_tokens_seen": 10041800, + "step": 5120 + }, + { + "epoch": 0.6787276341948311, + "grad_norm": 6.698175430297852, + "learning_rate": 4.396413816248438e-06, + "loss": 0.1041, + "num_input_tokens_seen": 10044784, + "step": 5121 + }, + { + "epoch": 0.6788601722995361, + "grad_norm": 1.4897096157073975, + "learning_rate": 4.396187604419585e-06, + "loss": 0.0107, + "num_input_tokens_seen": 10046584, + "step": 5122 + }, + { + "epoch": 0.6789927104042413, + "grad_norm": 10.547821044921875, + "learning_rate": 4.395961356030928e-06, + "loss": 0.2656, + "num_input_tokens_seen": 10049032, + "step": 5123 + }, + { + "epoch": 0.6791252485089463, + "grad_norm": 0.6442855000495911, + "learning_rate": 4.395735071086828e-06, + "loss": 0.0044, + "num_input_tokens_seen": 10050616, + "step": 5124 + }, + { + "epoch": 0.6792577866136514, + "grad_norm": 4.081016540527344, + "learning_rate": 4.395508749591647e-06, + "loss": 0.0346, + "num_input_tokens_seen": 10051752, + "step": 5125 + }, + { + "epoch": 0.6793903247183565, + "grad_norm": 19.204418182373047, + "learning_rate": 4.39528239154975e-06, + "loss": 0.4233, + "num_input_tokens_seen": 10053840, + "step": 5126 + }, + { + "epoch": 0.6795228628230616, + "grad_norm": 0.38819560408592224, + "learning_rate": 4.395055996965501e-06, + "loss": 0.0027, + "num_input_tokens_seen": 10055600, + "step": 5127 + }, + { + "epoch": 0.6796554009277668, + "grad_norm": 5.8730268478393555, + "learning_rate": 4.394829565843265e-06, + "loss": 0.0957, + "num_input_tokens_seen": 10058528, + "step": 5128 + }, + { + "epoch": 0.6797879390324718, + "grad_norm": 7.633223533630371, + "learning_rate": 4.394603098187408e-06, + "loss": 0.1299, + "num_input_tokens_seen": 10060472, + "step": 5129 + }, + { + "epoch": 0.679920477137177, + "grad_norm": 11.324410438537598, + "learning_rate": 4.3943765940022965e-06, + "loss": 0.1356, + "num_input_tokens_seen": 10061800, + "step": 5130 + }, + { + "epoch": 0.680053015241882, + "grad_norm": 7.574645519256592, + "learning_rate": 4.394150053292297e-06, + "loss": 0.2034, + "num_input_tokens_seen": 10064072, + "step": 5131 + }, + { + "epoch": 0.6801855533465871, + "grad_norm": 8.12269401550293, + "learning_rate": 4.393923476061778e-06, + "loss": 0.161, + "num_input_tokens_seen": 10066216, + "step": 5132 + }, + { + "epoch": 0.6803180914512923, + "grad_norm": 1.8340046405792236, + "learning_rate": 4.393696862315107e-06, + "loss": 0.0121, + "num_input_tokens_seen": 10068064, + "step": 5133 + }, + { + "epoch": 0.6804506295559973, + "grad_norm": 10.71570873260498, + "learning_rate": 4.393470212056655e-06, + "loss": 0.3858, + "num_input_tokens_seen": 10069976, + "step": 5134 + }, + { + "epoch": 0.6805831676607025, + "grad_norm": 0.16110946238040924, + "learning_rate": 4.3932435252907914e-06, + "loss": 0.0011, + "num_input_tokens_seen": 10071336, + "step": 5135 + }, + { + "epoch": 0.6807157057654075, + "grad_norm": 12.69213581085205, + "learning_rate": 4.3930168020218855e-06, + "loss": 0.4975, + "num_input_tokens_seen": 10073208, + "step": 5136 + }, + { + "epoch": 0.6808482438701127, + "grad_norm": 4.869973182678223, + "learning_rate": 4.39279004225431e-06, + "loss": 0.0889, + "num_input_tokens_seen": 10075392, + "step": 5137 + }, + { + "epoch": 0.6809807819748177, + "grad_norm": 0.6059229373931885, + "learning_rate": 4.392563245992437e-06, + "loss": 0.004, + "num_input_tokens_seen": 10077256, + "step": 5138 + }, + { + "epoch": 0.6811133200795229, + "grad_norm": 10.862265586853027, + "learning_rate": 4.39233641324064e-06, + "loss": 0.231, + "num_input_tokens_seen": 10078920, + "step": 5139 + }, + { + "epoch": 0.681245858184228, + "grad_norm": 10.850610733032227, + "learning_rate": 4.3921095440032905e-06, + "loss": 0.3785, + "num_input_tokens_seen": 10081856, + "step": 5140 + }, + { + "epoch": 0.681378396288933, + "grad_norm": 5.917884349822998, + "learning_rate": 4.391882638284763e-06, + "loss": 0.2369, + "num_input_tokens_seen": 10083968, + "step": 5141 + }, + { + "epoch": 0.6815109343936382, + "grad_norm": 9.508611679077148, + "learning_rate": 4.391655696089433e-06, + "loss": 0.2836, + "num_input_tokens_seen": 10085824, + "step": 5142 + }, + { + "epoch": 0.6816434724983432, + "grad_norm": 5.912469863891602, + "learning_rate": 4.3914287174216774e-06, + "loss": 0.1479, + "num_input_tokens_seen": 10087856, + "step": 5143 + }, + { + "epoch": 0.6817760106030484, + "grad_norm": 6.553494453430176, + "learning_rate": 4.391201702285871e-06, + "loss": 0.1434, + "num_input_tokens_seen": 10089800, + "step": 5144 + }, + { + "epoch": 0.6819085487077535, + "grad_norm": 0.08189654350280762, + "learning_rate": 4.390974650686391e-06, + "loss": 0.0006, + "num_input_tokens_seen": 10091528, + "step": 5145 + }, + { + "epoch": 0.6820410868124586, + "grad_norm": 4.9979681968688965, + "learning_rate": 4.3907475626276155e-06, + "loss": 0.1786, + "num_input_tokens_seen": 10093536, + "step": 5146 + }, + { + "epoch": 0.6821736249171637, + "grad_norm": 0.08108515292406082, + "learning_rate": 4.390520438113922e-06, + "loss": 0.0006, + "num_input_tokens_seen": 10095920, + "step": 5147 + }, + { + "epoch": 0.6823061630218688, + "grad_norm": 9.103939056396484, + "learning_rate": 4.39029327714969e-06, + "loss": 0.3121, + "num_input_tokens_seen": 10097736, + "step": 5148 + }, + { + "epoch": 0.6824387011265739, + "grad_norm": 4.609950065612793, + "learning_rate": 4.3900660797393005e-06, + "loss": 0.1947, + "num_input_tokens_seen": 10099824, + "step": 5149 + }, + { + "epoch": 0.682571239231279, + "grad_norm": 13.706204414367676, + "learning_rate": 4.389838845887133e-06, + "loss": 0.4235, + "num_input_tokens_seen": 10101816, + "step": 5150 + }, + { + "epoch": 0.6827037773359841, + "grad_norm": 0.40879833698272705, + "learning_rate": 4.389611575597569e-06, + "loss": 0.0029, + "num_input_tokens_seen": 10103576, + "step": 5151 + }, + { + "epoch": 0.6828363154406892, + "grad_norm": 7.0339226722717285, + "learning_rate": 4.389384268874989e-06, + "loss": 0.2841, + "num_input_tokens_seen": 10104856, + "step": 5152 + }, + { + "epoch": 0.6829688535453943, + "grad_norm": 6.98808479309082, + "learning_rate": 4.3891569257237775e-06, + "loss": 0.0979, + "num_input_tokens_seen": 10106608, + "step": 5153 + }, + { + "epoch": 0.6831013916500994, + "grad_norm": 0.09090457856655121, + "learning_rate": 4.388929546148317e-06, + "loss": 0.0006, + "num_input_tokens_seen": 10107880, + "step": 5154 + }, + { + "epoch": 0.6832339297548045, + "grad_norm": 5.910809516906738, + "learning_rate": 4.388702130152993e-06, + "loss": 0.0953, + "num_input_tokens_seen": 10110072, + "step": 5155 + }, + { + "epoch": 0.6833664678595096, + "grad_norm": 0.07751896977424622, + "learning_rate": 4.388474677742188e-06, + "loss": 0.0005, + "num_input_tokens_seen": 10111472, + "step": 5156 + }, + { + "epoch": 0.6834990059642148, + "grad_norm": 14.025826454162598, + "learning_rate": 4.388247188920288e-06, + "loss": 0.4521, + "num_input_tokens_seen": 10113488, + "step": 5157 + }, + { + "epoch": 0.6836315440689198, + "grad_norm": 7.779784202575684, + "learning_rate": 4.38801966369168e-06, + "loss": 0.1851, + "num_input_tokens_seen": 10115000, + "step": 5158 + }, + { + "epoch": 0.6837640821736249, + "grad_norm": 2.8445193767547607, + "learning_rate": 4.387792102060751e-06, + "loss": 0.0779, + "num_input_tokens_seen": 10116488, + "step": 5159 + }, + { + "epoch": 0.68389662027833, + "grad_norm": 2.744990348815918, + "learning_rate": 4.387564504031887e-06, + "loss": 0.0209, + "num_input_tokens_seen": 10118576, + "step": 5160 + }, + { + "epoch": 0.6840291583830351, + "grad_norm": 0.25698724389076233, + "learning_rate": 4.3873368696094785e-06, + "loss": 0.0018, + "num_input_tokens_seen": 10119856, + "step": 5161 + }, + { + "epoch": 0.6841616964877403, + "grad_norm": 6.561476230621338, + "learning_rate": 4.387109198797912e-06, + "loss": 0.1156, + "num_input_tokens_seen": 10121688, + "step": 5162 + }, + { + "epoch": 0.6842942345924453, + "grad_norm": 11.146231651306152, + "learning_rate": 4.386881491601578e-06, + "loss": 0.162, + "num_input_tokens_seen": 10123096, + "step": 5163 + }, + { + "epoch": 0.6844267726971505, + "grad_norm": 0.15085971355438232, + "learning_rate": 4.386653748024868e-06, + "loss": 0.001, + "num_input_tokens_seen": 10124424, + "step": 5164 + }, + { + "epoch": 0.6845593108018555, + "grad_norm": 16.216583251953125, + "learning_rate": 4.3864259680721725e-06, + "loss": 0.5163, + "num_input_tokens_seen": 10126568, + "step": 5165 + }, + { + "epoch": 0.6846918489065607, + "grad_norm": 7.989547252655029, + "learning_rate": 4.386198151747882e-06, + "loss": 0.1866, + "num_input_tokens_seen": 10128008, + "step": 5166 + }, + { + "epoch": 0.6848243870112657, + "grad_norm": 0.11146971583366394, + "learning_rate": 4.38597029905639e-06, + "loss": 0.0007, + "num_input_tokens_seen": 10128992, + "step": 5167 + }, + { + "epoch": 0.6849569251159708, + "grad_norm": 7.317201614379883, + "learning_rate": 4.38574241000209e-06, + "loss": 0.2573, + "num_input_tokens_seen": 10130816, + "step": 5168 + }, + { + "epoch": 0.685089463220676, + "grad_norm": 0.07720663398504257, + "learning_rate": 4.385514484589375e-06, + "loss": 0.0005, + "num_input_tokens_seen": 10131856, + "step": 5169 + }, + { + "epoch": 0.685222001325381, + "grad_norm": 0.2312750518321991, + "learning_rate": 4.38528652282264e-06, + "loss": 0.0014, + "num_input_tokens_seen": 10133904, + "step": 5170 + }, + { + "epoch": 0.6853545394300862, + "grad_norm": 3.1167891025543213, + "learning_rate": 4.385058524706282e-06, + "loss": 0.1673, + "num_input_tokens_seen": 10136272, + "step": 5171 + }, + { + "epoch": 0.6854870775347912, + "grad_norm": 0.2546970546245575, + "learning_rate": 4.384830490244693e-06, + "loss": 0.0017, + "num_input_tokens_seen": 10137960, + "step": 5172 + }, + { + "epoch": 0.6856196156394964, + "grad_norm": 18.992210388183594, + "learning_rate": 4.384602419442272e-06, + "loss": 0.8135, + "num_input_tokens_seen": 10139320, + "step": 5173 + }, + { + "epoch": 0.6857521537442015, + "grad_norm": 7.735326766967773, + "learning_rate": 4.384374312303418e-06, + "loss": 0.2901, + "num_input_tokens_seen": 10140680, + "step": 5174 + }, + { + "epoch": 0.6858846918489065, + "grad_norm": 11.820390701293945, + "learning_rate": 4.384146168832526e-06, + "loss": 0.5068, + "num_input_tokens_seen": 10142384, + "step": 5175 + }, + { + "epoch": 0.6860172299536117, + "grad_norm": 8.235243797302246, + "learning_rate": 4.383917989033995e-06, + "loss": 0.169, + "num_input_tokens_seen": 10143904, + "step": 5176 + }, + { + "epoch": 0.6861497680583167, + "grad_norm": 6.0625762939453125, + "learning_rate": 4.383689772912227e-06, + "loss": 0.08, + "num_input_tokens_seen": 10145904, + "step": 5177 + }, + { + "epoch": 0.6862823061630219, + "grad_norm": 2.686798334121704, + "learning_rate": 4.383461520471621e-06, + "loss": 0.0439, + "num_input_tokens_seen": 10147448, + "step": 5178 + }, + { + "epoch": 0.6864148442677269, + "grad_norm": 5.788634300231934, + "learning_rate": 4.383233231716577e-06, + "loss": 0.1623, + "num_input_tokens_seen": 10149104, + "step": 5179 + }, + { + "epoch": 0.6865473823724321, + "grad_norm": 0.18411491811275482, + "learning_rate": 4.383004906651497e-06, + "loss": 0.0012, + "num_input_tokens_seen": 10151008, + "step": 5180 + }, + { + "epoch": 0.6866799204771372, + "grad_norm": 8.171854972839355, + "learning_rate": 4.3827765452807835e-06, + "loss": 0.3028, + "num_input_tokens_seen": 10153152, + "step": 5181 + }, + { + "epoch": 0.6868124585818423, + "grad_norm": 7.945657730102539, + "learning_rate": 4.382548147608839e-06, + "loss": 0.3053, + "num_input_tokens_seen": 10155136, + "step": 5182 + }, + { + "epoch": 0.6869449966865474, + "grad_norm": 11.356136322021484, + "learning_rate": 4.382319713640068e-06, + "loss": 0.1508, + "num_input_tokens_seen": 10156792, + "step": 5183 + }, + { + "epoch": 0.6870775347912524, + "grad_norm": 6.04031229019165, + "learning_rate": 4.382091243378874e-06, + "loss": 0.1286, + "num_input_tokens_seen": 10158656, + "step": 5184 + }, + { + "epoch": 0.6872100728959576, + "grad_norm": 1.3314054012298584, + "learning_rate": 4.381862736829663e-06, + "loss": 0.0085, + "num_input_tokens_seen": 10160544, + "step": 5185 + }, + { + "epoch": 0.6873426110006627, + "grad_norm": 5.13046932220459, + "learning_rate": 4.3816341939968396e-06, + "loss": 0.1243, + "num_input_tokens_seen": 10162320, + "step": 5186 + }, + { + "epoch": 0.6874751491053678, + "grad_norm": 1.3660556077957153, + "learning_rate": 4.381405614884811e-06, + "loss": 0.0083, + "num_input_tokens_seen": 10164680, + "step": 5187 + }, + { + "epoch": 0.6876076872100729, + "grad_norm": 0.46857357025146484, + "learning_rate": 4.381176999497985e-06, + "loss": 0.0034, + "num_input_tokens_seen": 10165872, + "step": 5188 + }, + { + "epoch": 0.687740225314778, + "grad_norm": 0.6251587271690369, + "learning_rate": 4.3809483478407674e-06, + "loss": 0.0037, + "num_input_tokens_seen": 10167888, + "step": 5189 + }, + { + "epoch": 0.6878727634194831, + "grad_norm": 0.1713877171278, + "learning_rate": 4.3807196599175684e-06, + "loss": 0.0012, + "num_input_tokens_seen": 10170304, + "step": 5190 + }, + { + "epoch": 0.6880053015241882, + "grad_norm": 0.060870639979839325, + "learning_rate": 4.3804909357327975e-06, + "loss": 0.0004, + "num_input_tokens_seen": 10171496, + "step": 5191 + }, + { + "epoch": 0.6881378396288933, + "grad_norm": 2.487144947052002, + "learning_rate": 4.380262175290863e-06, + "loss": 0.017, + "num_input_tokens_seen": 10174504, + "step": 5192 + }, + { + "epoch": 0.6882703777335984, + "grad_norm": 7.979701519012451, + "learning_rate": 4.3800333785961775e-06, + "loss": 0.2788, + "num_input_tokens_seen": 10176624, + "step": 5193 + }, + { + "epoch": 0.6884029158383035, + "grad_norm": 19.77692413330078, + "learning_rate": 4.379804545653152e-06, + "loss": 0.6383, + "num_input_tokens_seen": 10178824, + "step": 5194 + }, + { + "epoch": 0.6885354539430086, + "grad_norm": 32.988441467285156, + "learning_rate": 4.3795756764661974e-06, + "loss": 1.873, + "num_input_tokens_seen": 10181656, + "step": 5195 + }, + { + "epoch": 0.6886679920477137, + "grad_norm": 0.13673114776611328, + "learning_rate": 4.379346771039728e-06, + "loss": 0.0009, + "num_input_tokens_seen": 10183280, + "step": 5196 + }, + { + "epoch": 0.6888005301524188, + "grad_norm": 0.9877931475639343, + "learning_rate": 4.379117829378155e-06, + "loss": 0.0221, + "num_input_tokens_seen": 10184872, + "step": 5197 + }, + { + "epoch": 0.688933068257124, + "grad_norm": 4.092088222503662, + "learning_rate": 4.378888851485894e-06, + "loss": 0.1318, + "num_input_tokens_seen": 10186032, + "step": 5198 + }, + { + "epoch": 0.689065606361829, + "grad_norm": 1.1899350881576538, + "learning_rate": 4.378659837367361e-06, + "loss": 0.0348, + "num_input_tokens_seen": 10187616, + "step": 5199 + }, + { + "epoch": 0.6891981444665342, + "grad_norm": 13.512741088867188, + "learning_rate": 4.378430787026969e-06, + "loss": 0.5092, + "num_input_tokens_seen": 10189200, + "step": 5200 + }, + { + "epoch": 0.6893306825712392, + "grad_norm": 7.496296405792236, + "learning_rate": 4.378201700469136e-06, + "loss": 0.1868, + "num_input_tokens_seen": 10191072, + "step": 5201 + }, + { + "epoch": 0.6894632206759443, + "grad_norm": 5.2618608474731445, + "learning_rate": 4.377972577698278e-06, + "loss": 0.075, + "num_input_tokens_seen": 10192640, + "step": 5202 + }, + { + "epoch": 0.6895957587806494, + "grad_norm": 6.543499946594238, + "learning_rate": 4.3777434187188135e-06, + "loss": 0.2141, + "num_input_tokens_seen": 10194560, + "step": 5203 + }, + { + "epoch": 0.6897282968853545, + "grad_norm": 12.690526962280273, + "learning_rate": 4.377514223535162e-06, + "loss": 0.1983, + "num_input_tokens_seen": 10196232, + "step": 5204 + }, + { + "epoch": 0.6898608349900597, + "grad_norm": 6.952563762664795, + "learning_rate": 4.377284992151739e-06, + "loss": 0.2206, + "num_input_tokens_seen": 10198192, + "step": 5205 + }, + { + "epoch": 0.6899933730947647, + "grad_norm": 9.442009925842285, + "learning_rate": 4.377055724572967e-06, + "loss": 0.4371, + "num_input_tokens_seen": 10200568, + "step": 5206 + }, + { + "epoch": 0.6901259111994699, + "grad_norm": 2.073357582092285, + "learning_rate": 4.376826420803267e-06, + "loss": 0.0186, + "num_input_tokens_seen": 10202872, + "step": 5207 + }, + { + "epoch": 0.6902584493041749, + "grad_norm": 9.931365013122559, + "learning_rate": 4.376597080847057e-06, + "loss": 0.3161, + "num_input_tokens_seen": 10205208, + "step": 5208 + }, + { + "epoch": 0.69039098740888, + "grad_norm": 14.322099685668945, + "learning_rate": 4.376367704708761e-06, + "loss": 0.3462, + "num_input_tokens_seen": 10207496, + "step": 5209 + }, + { + "epoch": 0.6905235255135852, + "grad_norm": 9.058643341064453, + "learning_rate": 4.376138292392801e-06, + "loss": 0.2825, + "num_input_tokens_seen": 10208968, + "step": 5210 + }, + { + "epoch": 0.6906560636182902, + "grad_norm": 0.1128392368555069, + "learning_rate": 4.375908843903601e-06, + "loss": 0.0008, + "num_input_tokens_seen": 10210104, + "step": 5211 + }, + { + "epoch": 0.6907886017229954, + "grad_norm": 0.048101648688316345, + "learning_rate": 4.375679359245585e-06, + "loss": 0.0004, + "num_input_tokens_seen": 10211648, + "step": 5212 + }, + { + "epoch": 0.6909211398277004, + "grad_norm": 0.07231906056404114, + "learning_rate": 4.375449838423176e-06, + "loss": 0.0006, + "num_input_tokens_seen": 10213736, + "step": 5213 + }, + { + "epoch": 0.6910536779324056, + "grad_norm": 7.748062610626221, + "learning_rate": 4.3752202814408004e-06, + "loss": 0.363, + "num_input_tokens_seen": 10215168, + "step": 5214 + }, + { + "epoch": 0.6911862160371107, + "grad_norm": 0.14155329763889313, + "learning_rate": 4.374990688302885e-06, + "loss": 0.0011, + "num_input_tokens_seen": 10217056, + "step": 5215 + }, + { + "epoch": 0.6913187541418158, + "grad_norm": 10.102121353149414, + "learning_rate": 4.374761059013854e-06, + "loss": 0.5367, + "num_input_tokens_seen": 10219264, + "step": 5216 + }, + { + "epoch": 0.6914512922465209, + "grad_norm": 0.22931811213493347, + "learning_rate": 4.374531393578138e-06, + "loss": 0.0017, + "num_input_tokens_seen": 10220736, + "step": 5217 + }, + { + "epoch": 0.691583830351226, + "grad_norm": 8.334806442260742, + "learning_rate": 4.374301692000163e-06, + "loss": 0.1874, + "num_input_tokens_seen": 10222008, + "step": 5218 + }, + { + "epoch": 0.6917163684559311, + "grad_norm": 0.14932233095169067, + "learning_rate": 4.374071954284358e-06, + "loss": 0.001, + "num_input_tokens_seen": 10223568, + "step": 5219 + }, + { + "epoch": 0.6918489065606361, + "grad_norm": 0.08902860432863235, + "learning_rate": 4.3738421804351535e-06, + "loss": 0.0006, + "num_input_tokens_seen": 10225288, + "step": 5220 + }, + { + "epoch": 0.6919814446653413, + "grad_norm": 0.10684773325920105, + "learning_rate": 4.373612370456979e-06, + "loss": 0.0008, + "num_input_tokens_seen": 10226584, + "step": 5221 + }, + { + "epoch": 0.6921139827700464, + "grad_norm": 12.19336986541748, + "learning_rate": 4.373382524354265e-06, + "loss": 0.2667, + "num_input_tokens_seen": 10228096, + "step": 5222 + }, + { + "epoch": 0.6922465208747515, + "grad_norm": 11.567729949951172, + "learning_rate": 4.373152642131444e-06, + "loss": 0.167, + "num_input_tokens_seen": 10229880, + "step": 5223 + }, + { + "epoch": 0.6923790589794566, + "grad_norm": 0.17095962166786194, + "learning_rate": 4.372922723792947e-06, + "loss": 0.0012, + "num_input_tokens_seen": 10232152, + "step": 5224 + }, + { + "epoch": 0.6925115970841617, + "grad_norm": 0.8013207316398621, + "learning_rate": 4.3726927693432084e-06, + "loss": 0.0053, + "num_input_tokens_seen": 10233656, + "step": 5225 + }, + { + "epoch": 0.6926441351888668, + "grad_norm": 8.57664680480957, + "learning_rate": 4.372462778786662e-06, + "loss": 0.3064, + "num_input_tokens_seen": 10235344, + "step": 5226 + }, + { + "epoch": 0.692776673293572, + "grad_norm": 17.11249351501465, + "learning_rate": 4.372232752127741e-06, + "loss": 0.6689, + "num_input_tokens_seen": 10237288, + "step": 5227 + }, + { + "epoch": 0.692909211398277, + "grad_norm": 10.308618545532227, + "learning_rate": 4.37200268937088e-06, + "loss": 0.3646, + "num_input_tokens_seen": 10239576, + "step": 5228 + }, + { + "epoch": 0.6930417495029821, + "grad_norm": 11.843452453613281, + "learning_rate": 4.371772590520516e-06, + "loss": 0.2951, + "num_input_tokens_seen": 10241528, + "step": 5229 + }, + { + "epoch": 0.6931742876076872, + "grad_norm": 5.81355094909668, + "learning_rate": 4.371542455581087e-06, + "loss": 0.1348, + "num_input_tokens_seen": 10243496, + "step": 5230 + }, + { + "epoch": 0.6933068257123923, + "grad_norm": 6.560523986816406, + "learning_rate": 4.371312284557027e-06, + "loss": 0.1707, + "num_input_tokens_seen": 10245840, + "step": 5231 + }, + { + "epoch": 0.6934393638170974, + "grad_norm": 4.5499653816223145, + "learning_rate": 4.371082077452775e-06, + "loss": 0.1984, + "num_input_tokens_seen": 10247664, + "step": 5232 + }, + { + "epoch": 0.6935719019218025, + "grad_norm": 4.0428619384765625, + "learning_rate": 4.37085183427277e-06, + "loss": 0.0853, + "num_input_tokens_seen": 10250184, + "step": 5233 + }, + { + "epoch": 0.6937044400265077, + "grad_norm": 6.735337734222412, + "learning_rate": 4.370621555021451e-06, + "loss": 0.3618, + "num_input_tokens_seen": 10252392, + "step": 5234 + }, + { + "epoch": 0.6938369781312127, + "grad_norm": 5.220327377319336, + "learning_rate": 4.370391239703257e-06, + "loss": 0.1992, + "num_input_tokens_seen": 10253864, + "step": 5235 + }, + { + "epoch": 0.6939695162359178, + "grad_norm": 0.14506489038467407, + "learning_rate": 4.370160888322631e-06, + "loss": 0.001, + "num_input_tokens_seen": 10255376, + "step": 5236 + }, + { + "epoch": 0.6941020543406229, + "grad_norm": 22.455324172973633, + "learning_rate": 4.369930500884012e-06, + "loss": 0.6093, + "num_input_tokens_seen": 10257952, + "step": 5237 + }, + { + "epoch": 0.694234592445328, + "grad_norm": 0.7495766878128052, + "learning_rate": 4.369700077391844e-06, + "loss": 0.0036, + "num_input_tokens_seen": 10259584, + "step": 5238 + }, + { + "epoch": 0.6943671305500332, + "grad_norm": 3.2545533180236816, + "learning_rate": 4.369469617850568e-06, + "loss": 0.0358, + "num_input_tokens_seen": 10260912, + "step": 5239 + }, + { + "epoch": 0.6944996686547382, + "grad_norm": 19.410465240478516, + "learning_rate": 4.369239122264628e-06, + "loss": 0.2023, + "num_input_tokens_seen": 10262568, + "step": 5240 + }, + { + "epoch": 0.6946322067594434, + "grad_norm": 9.67562198638916, + "learning_rate": 4.369008590638468e-06, + "loss": 0.3296, + "num_input_tokens_seen": 10263920, + "step": 5241 + }, + { + "epoch": 0.6947647448641484, + "grad_norm": 12.899747848510742, + "learning_rate": 4.368778022976534e-06, + "loss": 0.5128, + "num_input_tokens_seen": 10265960, + "step": 5242 + }, + { + "epoch": 0.6948972829688536, + "grad_norm": 15.291996955871582, + "learning_rate": 4.36854741928327e-06, + "loss": 0.4482, + "num_input_tokens_seen": 10267760, + "step": 5243 + }, + { + "epoch": 0.6950298210735586, + "grad_norm": 0.22596414387226105, + "learning_rate": 4.368316779563122e-06, + "loss": 0.0016, + "num_input_tokens_seen": 10269640, + "step": 5244 + }, + { + "epoch": 0.6951623591782637, + "grad_norm": 8.064006805419922, + "learning_rate": 4.3680861038205384e-06, + "loss": 0.2055, + "num_input_tokens_seen": 10271576, + "step": 5245 + }, + { + "epoch": 0.6952948972829689, + "grad_norm": 13.91385555267334, + "learning_rate": 4.367855392059965e-06, + "loss": 0.7073, + "num_input_tokens_seen": 10274352, + "step": 5246 + }, + { + "epoch": 0.6954274353876739, + "grad_norm": 7.5989460945129395, + "learning_rate": 4.367624644285853e-06, + "loss": 0.1707, + "num_input_tokens_seen": 10276056, + "step": 5247 + }, + { + "epoch": 0.6955599734923791, + "grad_norm": 0.1431422382593155, + "learning_rate": 4.367393860502648e-06, + "loss": 0.001, + "num_input_tokens_seen": 10277112, + "step": 5248 + }, + { + "epoch": 0.6956925115970841, + "grad_norm": 17.388700485229492, + "learning_rate": 4.367163040714801e-06, + "loss": 0.7218, + "num_input_tokens_seen": 10279016, + "step": 5249 + }, + { + "epoch": 0.6958250497017893, + "grad_norm": 8.644015312194824, + "learning_rate": 4.366932184926763e-06, + "loss": 0.2846, + "num_input_tokens_seen": 10280624, + "step": 5250 + }, + { + "epoch": 0.6959575878064944, + "grad_norm": 3.996027946472168, + "learning_rate": 4.366701293142985e-06, + "loss": 0.0467, + "num_input_tokens_seen": 10283112, + "step": 5251 + }, + { + "epoch": 0.6960901259111995, + "grad_norm": 12.316780090332031, + "learning_rate": 4.366470365367918e-06, + "loss": 0.3997, + "num_input_tokens_seen": 10285248, + "step": 5252 + }, + { + "epoch": 0.6962226640159046, + "grad_norm": 5.466716289520264, + "learning_rate": 4.366239401606014e-06, + "loss": 0.0578, + "num_input_tokens_seen": 10287336, + "step": 5253 + }, + { + "epoch": 0.6963552021206096, + "grad_norm": 0.5244131684303284, + "learning_rate": 4.366008401861727e-06, + "loss": 0.0036, + "num_input_tokens_seen": 10290008, + "step": 5254 + }, + { + "epoch": 0.6964877402253148, + "grad_norm": 0.3388371169567108, + "learning_rate": 4.3657773661395115e-06, + "loss": 0.0023, + "num_input_tokens_seen": 10291552, + "step": 5255 + }, + { + "epoch": 0.6966202783300198, + "grad_norm": 11.980232238769531, + "learning_rate": 4.365546294443821e-06, + "loss": 0.2772, + "num_input_tokens_seen": 10293960, + "step": 5256 + }, + { + "epoch": 0.696752816434725, + "grad_norm": 11.34706974029541, + "learning_rate": 4.36531518677911e-06, + "loss": 0.2289, + "num_input_tokens_seen": 10295552, + "step": 5257 + }, + { + "epoch": 0.6968853545394301, + "grad_norm": 0.4876137673854828, + "learning_rate": 4.3650840431498365e-06, + "loss": 0.0033, + "num_input_tokens_seen": 10297024, + "step": 5258 + }, + { + "epoch": 0.6970178926441352, + "grad_norm": 0.10733532905578613, + "learning_rate": 4.364852863560456e-06, + "loss": 0.0007, + "num_input_tokens_seen": 10298296, + "step": 5259 + }, + { + "epoch": 0.6971504307488403, + "grad_norm": 10.179142951965332, + "learning_rate": 4.364621648015426e-06, + "loss": 0.2033, + "num_input_tokens_seen": 10300256, + "step": 5260 + }, + { + "epoch": 0.6972829688535453, + "grad_norm": 13.415271759033203, + "learning_rate": 4.364390396519203e-06, + "loss": 0.2783, + "num_input_tokens_seen": 10302560, + "step": 5261 + }, + { + "epoch": 0.6974155069582505, + "grad_norm": 0.10923636704683304, + "learning_rate": 4.364159109076248e-06, + "loss": 0.0007, + "num_input_tokens_seen": 10304168, + "step": 5262 + }, + { + "epoch": 0.6975480450629556, + "grad_norm": 2.047806739807129, + "learning_rate": 4.363927785691019e-06, + "loss": 0.0084, + "num_input_tokens_seen": 10305552, + "step": 5263 + }, + { + "epoch": 0.6976805831676607, + "grad_norm": 0.14924830198287964, + "learning_rate": 4.363696426367976e-06, + "loss": 0.001, + "num_input_tokens_seen": 10307032, + "step": 5264 + }, + { + "epoch": 0.6978131212723658, + "grad_norm": 10.935125350952148, + "learning_rate": 4.363465031111581e-06, + "loss": 0.2968, + "num_input_tokens_seen": 10308728, + "step": 5265 + }, + { + "epoch": 0.6979456593770709, + "grad_norm": 8.447747230529785, + "learning_rate": 4.3632335999262945e-06, + "loss": 0.3276, + "num_input_tokens_seen": 10311216, + "step": 5266 + }, + { + "epoch": 0.698078197481776, + "grad_norm": 7.594271659851074, + "learning_rate": 4.363002132816578e-06, + "loss": 0.2177, + "num_input_tokens_seen": 10313480, + "step": 5267 + }, + { + "epoch": 0.6982107355864812, + "grad_norm": 7.9201765060424805, + "learning_rate": 4.362770629786896e-06, + "loss": 0.2559, + "num_input_tokens_seen": 10315072, + "step": 5268 + }, + { + "epoch": 0.6983432736911862, + "grad_norm": 4.4513421058654785, + "learning_rate": 4.362539090841711e-06, + "loss": 0.0652, + "num_input_tokens_seen": 10316880, + "step": 5269 + }, + { + "epoch": 0.6984758117958914, + "grad_norm": 7.337923049926758, + "learning_rate": 4.362307515985488e-06, + "loss": 0.1356, + "num_input_tokens_seen": 10319328, + "step": 5270 + }, + { + "epoch": 0.6986083499005964, + "grad_norm": 13.303313255310059, + "learning_rate": 4.36207590522269e-06, + "loss": 0.3061, + "num_input_tokens_seen": 10320984, + "step": 5271 + }, + { + "epoch": 0.6987408880053015, + "grad_norm": 4.793094635009766, + "learning_rate": 4.361844258557786e-06, + "loss": 0.0435, + "num_input_tokens_seen": 10322872, + "step": 5272 + }, + { + "epoch": 0.6988734261100066, + "grad_norm": 1.6766669750213623, + "learning_rate": 4.361612575995239e-06, + "loss": 0.0115, + "num_input_tokens_seen": 10324520, + "step": 5273 + }, + { + "epoch": 0.6990059642147117, + "grad_norm": 0.7178148627281189, + "learning_rate": 4.361380857539517e-06, + "loss": 0.0048, + "num_input_tokens_seen": 10326032, + "step": 5274 + }, + { + "epoch": 0.6991385023194169, + "grad_norm": 5.723263740539551, + "learning_rate": 4.361149103195088e-06, + "loss": 0.2274, + "num_input_tokens_seen": 10327944, + "step": 5275 + }, + { + "epoch": 0.6992710404241219, + "grad_norm": 0.012143773958086967, + "learning_rate": 4.3609173129664216e-06, + "loss": 0.0001, + "num_input_tokens_seen": 10329096, + "step": 5276 + }, + { + "epoch": 0.6994035785288271, + "grad_norm": 0.014949504286050797, + "learning_rate": 4.360685486857984e-06, + "loss": 0.0001, + "num_input_tokens_seen": 10330840, + "step": 5277 + }, + { + "epoch": 0.6995361166335321, + "grad_norm": 0.05713149905204773, + "learning_rate": 4.360453624874248e-06, + "loss": 0.0004, + "num_input_tokens_seen": 10333288, + "step": 5278 + }, + { + "epoch": 0.6996686547382373, + "grad_norm": 9.647068977355957, + "learning_rate": 4.360221727019682e-06, + "loss": 0.2069, + "num_input_tokens_seen": 10334976, + "step": 5279 + }, + { + "epoch": 0.6998011928429424, + "grad_norm": 7.801065921783447, + "learning_rate": 4.359989793298757e-06, + "loss": 0.2666, + "num_input_tokens_seen": 10337008, + "step": 5280 + }, + { + "epoch": 0.6999337309476474, + "grad_norm": 5.655546188354492, + "learning_rate": 4.359757823715946e-06, + "loss": 0.1989, + "num_input_tokens_seen": 10339272, + "step": 5281 + }, + { + "epoch": 0.7000662690523526, + "grad_norm": 9.195932388305664, + "learning_rate": 4.3595258182757226e-06, + "loss": 0.3398, + "num_input_tokens_seen": 10340824, + "step": 5282 + }, + { + "epoch": 0.7001988071570576, + "grad_norm": 13.319113731384277, + "learning_rate": 4.359293776982558e-06, + "loss": 0.5047, + "num_input_tokens_seen": 10343080, + "step": 5283 + }, + { + "epoch": 0.7003313452617628, + "grad_norm": 12.953853607177734, + "learning_rate": 4.359061699840925e-06, + "loss": 0.4603, + "num_input_tokens_seen": 10345808, + "step": 5284 + }, + { + "epoch": 0.7004638833664678, + "grad_norm": 3.146357774734497, + "learning_rate": 4.358829586855302e-06, + "loss": 0.1006, + "num_input_tokens_seen": 10347248, + "step": 5285 + }, + { + "epoch": 0.700596421471173, + "grad_norm": 7.401439189910889, + "learning_rate": 4.358597438030161e-06, + "loss": 0.1483, + "num_input_tokens_seen": 10348872, + "step": 5286 + }, + { + "epoch": 0.7007289595758781, + "grad_norm": 9.841824531555176, + "learning_rate": 4.3583652533699795e-06, + "loss": 0.2142, + "num_input_tokens_seen": 10350800, + "step": 5287 + }, + { + "epoch": 0.7008614976805831, + "grad_norm": 13.105070114135742, + "learning_rate": 4.358133032879234e-06, + "loss": 0.3994, + "num_input_tokens_seen": 10352616, + "step": 5288 + }, + { + "epoch": 0.7009940357852883, + "grad_norm": 1.8817970752716064, + "learning_rate": 4.3579007765624024e-06, + "loss": 0.0144, + "num_input_tokens_seen": 10354424, + "step": 5289 + }, + { + "epoch": 0.7011265738899933, + "grad_norm": 4.865450382232666, + "learning_rate": 4.357668484423961e-06, + "loss": 0.1506, + "num_input_tokens_seen": 10356032, + "step": 5290 + }, + { + "epoch": 0.7012591119946985, + "grad_norm": 6.411260604858398, + "learning_rate": 4.357436156468391e-06, + "loss": 0.1294, + "num_input_tokens_seen": 10358024, + "step": 5291 + }, + { + "epoch": 0.7013916500994036, + "grad_norm": 12.339280128479004, + "learning_rate": 4.357203792700169e-06, + "loss": 0.4542, + "num_input_tokens_seen": 10360360, + "step": 5292 + }, + { + "epoch": 0.7015241882041087, + "grad_norm": 1.5538820028305054, + "learning_rate": 4.356971393123778e-06, + "loss": 0.0192, + "num_input_tokens_seen": 10361664, + "step": 5293 + }, + { + "epoch": 0.7016567263088138, + "grad_norm": 0.35731470584869385, + "learning_rate": 4.3567389577436965e-06, + "loss": 0.0024, + "num_input_tokens_seen": 10363840, + "step": 5294 + }, + { + "epoch": 0.7017892644135189, + "grad_norm": 8.202995300292969, + "learning_rate": 4.3565064865644074e-06, + "loss": 0.283, + "num_input_tokens_seen": 10365888, + "step": 5295 + }, + { + "epoch": 0.701921802518224, + "grad_norm": 8.994125366210938, + "learning_rate": 4.356273979590393e-06, + "loss": 0.2604, + "num_input_tokens_seen": 10367544, + "step": 5296 + }, + { + "epoch": 0.702054340622929, + "grad_norm": 4.477342128753662, + "learning_rate": 4.356041436826135e-06, + "loss": 0.1963, + "num_input_tokens_seen": 10369728, + "step": 5297 + }, + { + "epoch": 0.7021868787276342, + "grad_norm": 4.8990559577941895, + "learning_rate": 4.355808858276117e-06, + "loss": 0.1388, + "num_input_tokens_seen": 10371568, + "step": 5298 + }, + { + "epoch": 0.7023194168323393, + "grad_norm": 11.591240882873535, + "learning_rate": 4.355576243944825e-06, + "loss": 0.5024, + "num_input_tokens_seen": 10373920, + "step": 5299 + }, + { + "epoch": 0.7024519549370444, + "grad_norm": 9.065117835998535, + "learning_rate": 4.355343593836743e-06, + "loss": 0.2231, + "num_input_tokens_seen": 10376512, + "step": 5300 + }, + { + "epoch": 0.7025844930417495, + "grad_norm": 5.833688259124756, + "learning_rate": 4.355110907956356e-06, + "loss": 0.1275, + "num_input_tokens_seen": 10378368, + "step": 5301 + }, + { + "epoch": 0.7027170311464546, + "grad_norm": 6.7619147300720215, + "learning_rate": 4.354878186308151e-06, + "loss": 0.1738, + "num_input_tokens_seen": 10379832, + "step": 5302 + }, + { + "epoch": 0.7028495692511597, + "grad_norm": 8.367796897888184, + "learning_rate": 4.3546454288966155e-06, + "loss": 0.4947, + "num_input_tokens_seen": 10382240, + "step": 5303 + }, + { + "epoch": 0.7029821073558649, + "grad_norm": 12.617276191711426, + "learning_rate": 4.354412635726237e-06, + "loss": 0.1031, + "num_input_tokens_seen": 10384712, + "step": 5304 + }, + { + "epoch": 0.7031146454605699, + "grad_norm": 10.556106567382812, + "learning_rate": 4.354179806801502e-06, + "loss": 0.2775, + "num_input_tokens_seen": 10386536, + "step": 5305 + }, + { + "epoch": 0.703247183565275, + "grad_norm": 1.250957727432251, + "learning_rate": 4.353946942126903e-06, + "loss": 0.0145, + "num_input_tokens_seen": 10387944, + "step": 5306 + }, + { + "epoch": 0.7033797216699801, + "grad_norm": 4.309374809265137, + "learning_rate": 4.353714041706927e-06, + "loss": 0.0379, + "num_input_tokens_seen": 10390552, + "step": 5307 + }, + { + "epoch": 0.7035122597746852, + "grad_norm": 5.812258243560791, + "learning_rate": 4.3534811055460655e-06, + "loss": 0.1602, + "num_input_tokens_seen": 10391888, + "step": 5308 + }, + { + "epoch": 0.7036447978793903, + "grad_norm": 0.49550989270210266, + "learning_rate": 4.3532481336488096e-06, + "loss": 0.0034, + "num_input_tokens_seen": 10394320, + "step": 5309 + }, + { + "epoch": 0.7037773359840954, + "grad_norm": 11.787468910217285, + "learning_rate": 4.353015126019651e-06, + "loss": 0.6394, + "num_input_tokens_seen": 10396168, + "step": 5310 + }, + { + "epoch": 0.7039098740888006, + "grad_norm": 0.14753291010856628, + "learning_rate": 4.352782082663083e-06, + "loss": 0.001, + "num_input_tokens_seen": 10398056, + "step": 5311 + }, + { + "epoch": 0.7040424121935056, + "grad_norm": 34.56886672973633, + "learning_rate": 4.352549003583598e-06, + "loss": 1.3396, + "num_input_tokens_seen": 10400120, + "step": 5312 + }, + { + "epoch": 0.7041749502982108, + "grad_norm": 0.2277163416147232, + "learning_rate": 4.352315888785691e-06, + "loss": 0.0016, + "num_input_tokens_seen": 10402472, + "step": 5313 + }, + { + "epoch": 0.7043074884029158, + "grad_norm": 13.350781440734863, + "learning_rate": 4.352082738273854e-06, + "loss": 0.3459, + "num_input_tokens_seen": 10404416, + "step": 5314 + }, + { + "epoch": 0.7044400265076209, + "grad_norm": 7.1642279624938965, + "learning_rate": 4.351849552052585e-06, + "loss": 0.0972, + "num_input_tokens_seen": 10406424, + "step": 5315 + }, + { + "epoch": 0.7045725646123261, + "grad_norm": 6.712897777557373, + "learning_rate": 4.3516163301263796e-06, + "loss": 0.1395, + "num_input_tokens_seen": 10408432, + "step": 5316 + }, + { + "epoch": 0.7047051027170311, + "grad_norm": 9.911812782287598, + "learning_rate": 4.351383072499734e-06, + "loss": 0.2317, + "num_input_tokens_seen": 10410584, + "step": 5317 + }, + { + "epoch": 0.7048376408217363, + "grad_norm": 0.17060184478759766, + "learning_rate": 4.351149779177145e-06, + "loss": 0.0012, + "num_input_tokens_seen": 10412224, + "step": 5318 + }, + { + "epoch": 0.7049701789264413, + "grad_norm": 0.7141030430793762, + "learning_rate": 4.350916450163112e-06, + "loss": 0.0047, + "num_input_tokens_seen": 10413592, + "step": 5319 + }, + { + "epoch": 0.7051027170311465, + "grad_norm": 5.739474773406982, + "learning_rate": 4.350683085462132e-06, + "loss": 0.0564, + "num_input_tokens_seen": 10416256, + "step": 5320 + }, + { + "epoch": 0.7052352551358515, + "grad_norm": 4.152627468109131, + "learning_rate": 4.350449685078706e-06, + "loss": 0.1163, + "num_input_tokens_seen": 10418616, + "step": 5321 + }, + { + "epoch": 0.7053677932405567, + "grad_norm": 7.785134315490723, + "learning_rate": 4.350216249017333e-06, + "loss": 0.2863, + "num_input_tokens_seen": 10420712, + "step": 5322 + }, + { + "epoch": 0.7055003313452618, + "grad_norm": 15.75673770904541, + "learning_rate": 4.349982777282515e-06, + "loss": 0.5681, + "num_input_tokens_seen": 10422656, + "step": 5323 + }, + { + "epoch": 0.7056328694499668, + "grad_norm": 0.035148218274116516, + "learning_rate": 4.349749269878752e-06, + "loss": 0.0002, + "num_input_tokens_seen": 10424032, + "step": 5324 + }, + { + "epoch": 0.705765407554672, + "grad_norm": 15.061479568481445, + "learning_rate": 4.349515726810548e-06, + "loss": 0.3935, + "num_input_tokens_seen": 10426312, + "step": 5325 + }, + { + "epoch": 0.705897945659377, + "grad_norm": 7.007232189178467, + "learning_rate": 4.349282148082404e-06, + "loss": 0.2244, + "num_input_tokens_seen": 10427864, + "step": 5326 + }, + { + "epoch": 0.7060304837640822, + "grad_norm": 11.781513214111328, + "learning_rate": 4.349048533698824e-06, + "loss": 0.3764, + "num_input_tokens_seen": 10429880, + "step": 5327 + }, + { + "epoch": 0.7061630218687873, + "grad_norm": 3.62870454788208, + "learning_rate": 4.348814883664314e-06, + "loss": 0.09, + "num_input_tokens_seen": 10432048, + "step": 5328 + }, + { + "epoch": 0.7062955599734924, + "grad_norm": 0.13700160384178162, + "learning_rate": 4.348581197983377e-06, + "loss": 0.0009, + "num_input_tokens_seen": 10433480, + "step": 5329 + }, + { + "epoch": 0.7064280980781975, + "grad_norm": 11.195610046386719, + "learning_rate": 4.348347476660519e-06, + "loss": 0.2654, + "num_input_tokens_seen": 10434928, + "step": 5330 + }, + { + "epoch": 0.7065606361829025, + "grad_norm": 3.7285003662109375, + "learning_rate": 4.3481137197002475e-06, + "loss": 0.1234, + "num_input_tokens_seen": 10436616, + "step": 5331 + }, + { + "epoch": 0.7066931742876077, + "grad_norm": 3.434617757797241, + "learning_rate": 4.347879927107067e-06, + "loss": 0.0923, + "num_input_tokens_seen": 10438504, + "step": 5332 + }, + { + "epoch": 0.7068257123923128, + "grad_norm": 9.34477424621582, + "learning_rate": 4.347646098885488e-06, + "loss": 0.348, + "num_input_tokens_seen": 10440136, + "step": 5333 + }, + { + "epoch": 0.7069582504970179, + "grad_norm": 7.150413990020752, + "learning_rate": 4.347412235040017e-06, + "loss": 0.2217, + "num_input_tokens_seen": 10442840, + "step": 5334 + }, + { + "epoch": 0.707090788601723, + "grad_norm": 8.25818920135498, + "learning_rate": 4.347178335575164e-06, + "loss": 0.3178, + "num_input_tokens_seen": 10445592, + "step": 5335 + }, + { + "epoch": 0.7072233267064281, + "grad_norm": 8.304454803466797, + "learning_rate": 4.346944400495438e-06, + "loss": 0.2673, + "num_input_tokens_seen": 10447160, + "step": 5336 + }, + { + "epoch": 0.7073558648111332, + "grad_norm": 1.1421347856521606, + "learning_rate": 4.3467104298053495e-06, + "loss": 0.0069, + "num_input_tokens_seen": 10448424, + "step": 5337 + }, + { + "epoch": 0.7074884029158383, + "grad_norm": 0.12685857713222504, + "learning_rate": 4.346476423509411e-06, + "loss": 0.0009, + "num_input_tokens_seen": 10450600, + "step": 5338 + }, + { + "epoch": 0.7076209410205434, + "grad_norm": 3.092546224594116, + "learning_rate": 4.346242381612132e-06, + "loss": 0.0836, + "num_input_tokens_seen": 10452968, + "step": 5339 + }, + { + "epoch": 0.7077534791252486, + "grad_norm": 8.69534683227539, + "learning_rate": 4.346008304118028e-06, + "loss": 0.337, + "num_input_tokens_seen": 10455192, + "step": 5340 + }, + { + "epoch": 0.7078860172299536, + "grad_norm": 0.64402174949646, + "learning_rate": 4.345774191031609e-06, + "loss": 0.0041, + "num_input_tokens_seen": 10456808, + "step": 5341 + }, + { + "epoch": 0.7080185553346587, + "grad_norm": 13.261296272277832, + "learning_rate": 4.34554004235739e-06, + "loss": 0.251, + "num_input_tokens_seen": 10459152, + "step": 5342 + }, + { + "epoch": 0.7081510934393638, + "grad_norm": 4.151136875152588, + "learning_rate": 4.345305858099887e-06, + "loss": 0.0358, + "num_input_tokens_seen": 10461440, + "step": 5343 + }, + { + "epoch": 0.7082836315440689, + "grad_norm": 7.528071880340576, + "learning_rate": 4.3450716382636135e-06, + "loss": 0.2003, + "num_input_tokens_seen": 10463632, + "step": 5344 + }, + { + "epoch": 0.7084161696487741, + "grad_norm": 0.10861289501190186, + "learning_rate": 4.344837382853086e-06, + "loss": 0.0008, + "num_input_tokens_seen": 10465296, + "step": 5345 + }, + { + "epoch": 0.7085487077534791, + "grad_norm": 6.541368007659912, + "learning_rate": 4.34460309187282e-06, + "loss": 0.203, + "num_input_tokens_seen": 10467672, + "step": 5346 + }, + { + "epoch": 0.7086812458581843, + "grad_norm": 16.940521240234375, + "learning_rate": 4.344368765327335e-06, + "loss": 0.4411, + "num_input_tokens_seen": 10469392, + "step": 5347 + }, + { + "epoch": 0.7088137839628893, + "grad_norm": 6.4979071617126465, + "learning_rate": 4.344134403221148e-06, + "loss": 0.0985, + "num_input_tokens_seen": 10471928, + "step": 5348 + }, + { + "epoch": 0.7089463220675944, + "grad_norm": 9.174598693847656, + "learning_rate": 4.343900005558777e-06, + "loss": 0.1553, + "num_input_tokens_seen": 10474104, + "step": 5349 + }, + { + "epoch": 0.7090788601722995, + "grad_norm": 0.1891871839761734, + "learning_rate": 4.343665572344742e-06, + "loss": 0.0013, + "num_input_tokens_seen": 10477200, + "step": 5350 + }, + { + "epoch": 0.7092113982770046, + "grad_norm": 5.6008524894714355, + "learning_rate": 4.343431103583563e-06, + "loss": 0.0618, + "num_input_tokens_seen": 10479096, + "step": 5351 + }, + { + "epoch": 0.7093439363817098, + "grad_norm": 5.606502532958984, + "learning_rate": 4.343196599279761e-06, + "loss": 0.074, + "num_input_tokens_seen": 10480592, + "step": 5352 + }, + { + "epoch": 0.7094764744864148, + "grad_norm": 0.15002194046974182, + "learning_rate": 4.342962059437856e-06, + "loss": 0.001, + "num_input_tokens_seen": 10483512, + "step": 5353 + }, + { + "epoch": 0.70960901259112, + "grad_norm": 10.585344314575195, + "learning_rate": 4.342727484062371e-06, + "loss": 0.2651, + "num_input_tokens_seen": 10486464, + "step": 5354 + }, + { + "epoch": 0.709741550695825, + "grad_norm": 30.622039794921875, + "learning_rate": 4.3424928731578296e-06, + "loss": 0.6854, + "num_input_tokens_seen": 10489120, + "step": 5355 + }, + { + "epoch": 0.7098740888005302, + "grad_norm": 13.048038482666016, + "learning_rate": 4.342258226728755e-06, + "loss": 0.387, + "num_input_tokens_seen": 10490832, + "step": 5356 + }, + { + "epoch": 0.7100066269052353, + "grad_norm": 4.078963756561279, + "learning_rate": 4.34202354477967e-06, + "loss": 0.0576, + "num_input_tokens_seen": 10492144, + "step": 5357 + }, + { + "epoch": 0.7101391650099403, + "grad_norm": 14.519580841064453, + "learning_rate": 4.341788827315101e-06, + "loss": 0.3535, + "num_input_tokens_seen": 10494184, + "step": 5358 + }, + { + "epoch": 0.7102717031146455, + "grad_norm": 7.30565071105957, + "learning_rate": 4.341554074339572e-06, + "loss": 0.275, + "num_input_tokens_seen": 10496072, + "step": 5359 + }, + { + "epoch": 0.7104042412193505, + "grad_norm": 3.5833187103271484, + "learning_rate": 4.3413192858576105e-06, + "loss": 0.0819, + "num_input_tokens_seen": 10498744, + "step": 5360 + }, + { + "epoch": 0.7105367793240557, + "grad_norm": 6.386040210723877, + "learning_rate": 4.341084461873743e-06, + "loss": 0.0943, + "num_input_tokens_seen": 10500888, + "step": 5361 + }, + { + "epoch": 0.7106693174287607, + "grad_norm": 0.08746777474880219, + "learning_rate": 4.3408496023924974e-06, + "loss": 0.0006, + "num_input_tokens_seen": 10502136, + "step": 5362 + }, + { + "epoch": 0.7108018555334659, + "grad_norm": 11.39535903930664, + "learning_rate": 4.3406147074184e-06, + "loss": 0.1657, + "num_input_tokens_seen": 10504056, + "step": 5363 + }, + { + "epoch": 0.710934393638171, + "grad_norm": 5.37987756729126, + "learning_rate": 4.340379776955983e-06, + "loss": 0.1192, + "num_input_tokens_seen": 10506328, + "step": 5364 + }, + { + "epoch": 0.711066931742876, + "grad_norm": 0.2989250123500824, + "learning_rate": 4.340144811009773e-06, + "loss": 0.0022, + "num_input_tokens_seen": 10507848, + "step": 5365 + }, + { + "epoch": 0.7111994698475812, + "grad_norm": 0.557701826095581, + "learning_rate": 4.339909809584302e-06, + "loss": 0.0041, + "num_input_tokens_seen": 10510936, + "step": 5366 + }, + { + "epoch": 0.7113320079522862, + "grad_norm": 16.63957977294922, + "learning_rate": 4.339674772684101e-06, + "loss": 0.5224, + "num_input_tokens_seen": 10513096, + "step": 5367 + }, + { + "epoch": 0.7114645460569914, + "grad_norm": 9.339136123657227, + "learning_rate": 4.339439700313701e-06, + "loss": 0.3479, + "num_input_tokens_seen": 10514872, + "step": 5368 + }, + { + "epoch": 0.7115970841616965, + "grad_norm": 4.87075138092041, + "learning_rate": 4.339204592477635e-06, + "loss": 0.0768, + "num_input_tokens_seen": 10516544, + "step": 5369 + }, + { + "epoch": 0.7117296222664016, + "grad_norm": 2.8643717765808105, + "learning_rate": 4.338969449180435e-06, + "loss": 0.0486, + "num_input_tokens_seen": 10517648, + "step": 5370 + }, + { + "epoch": 0.7118621603711067, + "grad_norm": 15.83580207824707, + "learning_rate": 4.338734270426636e-06, + "loss": 0.4671, + "num_input_tokens_seen": 10519696, + "step": 5371 + }, + { + "epoch": 0.7119946984758118, + "grad_norm": 0.3790338933467865, + "learning_rate": 4.338499056220771e-06, + "loss": 0.0026, + "num_input_tokens_seen": 10521568, + "step": 5372 + }, + { + "epoch": 0.7121272365805169, + "grad_norm": 0.3970550000667572, + "learning_rate": 4.3382638065673755e-06, + "loss": 0.0028, + "num_input_tokens_seen": 10524128, + "step": 5373 + }, + { + "epoch": 0.712259774685222, + "grad_norm": 13.866540908813477, + "learning_rate": 4.338028521470987e-06, + "loss": 0.5854, + "num_input_tokens_seen": 10526368, + "step": 5374 + }, + { + "epoch": 0.7123923127899271, + "grad_norm": 0.11552596092224121, + "learning_rate": 4.33779320093614e-06, + "loss": 0.0008, + "num_input_tokens_seen": 10527960, + "step": 5375 + }, + { + "epoch": 0.7125248508946322, + "grad_norm": 7.586239337921143, + "learning_rate": 4.337557844967372e-06, + "loss": 0.0635, + "num_input_tokens_seen": 10529432, + "step": 5376 + }, + { + "epoch": 0.7126573889993373, + "grad_norm": 7.3875837326049805, + "learning_rate": 4.337322453569221e-06, + "loss": 0.1583, + "num_input_tokens_seen": 10531272, + "step": 5377 + }, + { + "epoch": 0.7127899271040424, + "grad_norm": 12.727602005004883, + "learning_rate": 4.337087026746226e-06, + "loss": 0.3294, + "num_input_tokens_seen": 10533552, + "step": 5378 + }, + { + "epoch": 0.7129224652087475, + "grad_norm": 2.3973748683929443, + "learning_rate": 4.336851564502927e-06, + "loss": 0.0077, + "num_input_tokens_seen": 10536488, + "step": 5379 + }, + { + "epoch": 0.7130550033134526, + "grad_norm": 10.029938697814941, + "learning_rate": 4.3366160668438606e-06, + "loss": 0.1877, + "num_input_tokens_seen": 10538456, + "step": 5380 + }, + { + "epoch": 0.7131875414181578, + "grad_norm": 18.643531799316406, + "learning_rate": 4.336380533773571e-06, + "loss": 0.3149, + "num_input_tokens_seen": 10540200, + "step": 5381 + }, + { + "epoch": 0.7133200795228628, + "grad_norm": 0.14181943237781525, + "learning_rate": 4.336144965296596e-06, + "loss": 0.0009, + "num_input_tokens_seen": 10541208, + "step": 5382 + }, + { + "epoch": 0.713452617627568, + "grad_norm": 5.419266700744629, + "learning_rate": 4.33590936141748e-06, + "loss": 0.0669, + "num_input_tokens_seen": 10543328, + "step": 5383 + }, + { + "epoch": 0.713585155732273, + "grad_norm": 1.700243592262268, + "learning_rate": 4.335673722140767e-06, + "loss": 0.0505, + "num_input_tokens_seen": 10544472, + "step": 5384 + }, + { + "epoch": 0.7137176938369781, + "grad_norm": 2.2277565002441406, + "learning_rate": 4.335438047470996e-06, + "loss": 0.0618, + "num_input_tokens_seen": 10545728, + "step": 5385 + }, + { + "epoch": 0.7138502319416833, + "grad_norm": 1.5551958084106445, + "learning_rate": 4.335202337412714e-06, + "loss": 0.0102, + "num_input_tokens_seen": 10547552, + "step": 5386 + }, + { + "epoch": 0.7139827700463883, + "grad_norm": 4.267632007598877, + "learning_rate": 4.334966591970465e-06, + "loss": 0.0627, + "num_input_tokens_seen": 10549232, + "step": 5387 + }, + { + "epoch": 0.7141153081510935, + "grad_norm": 1.5692423582077026, + "learning_rate": 4.334730811148794e-06, + "loss": 0.0358, + "num_input_tokens_seen": 10551208, + "step": 5388 + }, + { + "epoch": 0.7142478462557985, + "grad_norm": 3.1644279956817627, + "learning_rate": 4.334494994952247e-06, + "loss": 0.022, + "num_input_tokens_seen": 10553600, + "step": 5389 + }, + { + "epoch": 0.7143803843605037, + "grad_norm": 3.423093557357788, + "learning_rate": 4.334259143385371e-06, + "loss": 0.0475, + "num_input_tokens_seen": 10554968, + "step": 5390 + }, + { + "epoch": 0.7145129224652087, + "grad_norm": 7.736878871917725, + "learning_rate": 4.334023256452714e-06, + "loss": 0.0745, + "num_input_tokens_seen": 10557088, + "step": 5391 + }, + { + "epoch": 0.7146454605699138, + "grad_norm": 6.876927375793457, + "learning_rate": 4.333787334158823e-06, + "loss": 0.2055, + "num_input_tokens_seen": 10559416, + "step": 5392 + }, + { + "epoch": 0.714777998674619, + "grad_norm": 8.267438888549805, + "learning_rate": 4.333551376508247e-06, + "loss": 0.2574, + "num_input_tokens_seen": 10561336, + "step": 5393 + }, + { + "epoch": 0.714910536779324, + "grad_norm": 11.670564651489258, + "learning_rate": 4.333315383505536e-06, + "loss": 0.3131, + "num_input_tokens_seen": 10563480, + "step": 5394 + }, + { + "epoch": 0.7150430748840292, + "grad_norm": 1.9028925895690918, + "learning_rate": 4.333079355155239e-06, + "loss": 0.0234, + "num_input_tokens_seen": 10566328, + "step": 5395 + }, + { + "epoch": 0.7151756129887342, + "grad_norm": 0.20793849229812622, + "learning_rate": 4.332843291461908e-06, + "loss": 0.0014, + "num_input_tokens_seen": 10567632, + "step": 5396 + }, + { + "epoch": 0.7153081510934394, + "grad_norm": 11.558806419372559, + "learning_rate": 4.332607192430094e-06, + "loss": 0.2997, + "num_input_tokens_seen": 10569960, + "step": 5397 + }, + { + "epoch": 0.7154406891981445, + "grad_norm": 10.364891052246094, + "learning_rate": 4.332371058064348e-06, + "loss": 0.1178, + "num_input_tokens_seen": 10571784, + "step": 5398 + }, + { + "epoch": 0.7155732273028496, + "grad_norm": 7.065649509429932, + "learning_rate": 4.332134888369225e-06, + "loss": 0.1902, + "num_input_tokens_seen": 10573264, + "step": 5399 + }, + { + "epoch": 0.7157057654075547, + "grad_norm": 3.975583791732788, + "learning_rate": 4.331898683349277e-06, + "loss": 0.0243, + "num_input_tokens_seen": 10575072, + "step": 5400 + }, + { + "epoch": 0.7158383035122597, + "grad_norm": 0.5255308747291565, + "learning_rate": 4.33166244300906e-06, + "loss": 0.0035, + "num_input_tokens_seen": 10576704, + "step": 5401 + }, + { + "epoch": 0.7159708416169649, + "grad_norm": 9.464899063110352, + "learning_rate": 4.331426167353126e-06, + "loss": 0.1927, + "num_input_tokens_seen": 10579536, + "step": 5402 + }, + { + "epoch": 0.7161033797216699, + "grad_norm": 15.958910942077637, + "learning_rate": 4.331189856386031e-06, + "loss": 0.5436, + "num_input_tokens_seen": 10581512, + "step": 5403 + }, + { + "epoch": 0.7162359178263751, + "grad_norm": 9.428521156311035, + "learning_rate": 4.330953510112334e-06, + "loss": 0.4042, + "num_input_tokens_seen": 10583128, + "step": 5404 + }, + { + "epoch": 0.7163684559310802, + "grad_norm": 4.2583441734313965, + "learning_rate": 4.3307171285365905e-06, + "loss": 0.0778, + "num_input_tokens_seen": 10585040, + "step": 5405 + }, + { + "epoch": 0.7165009940357853, + "grad_norm": 10.576728820800781, + "learning_rate": 4.330480711663356e-06, + "loss": 0.2728, + "num_input_tokens_seen": 10587320, + "step": 5406 + }, + { + "epoch": 0.7166335321404904, + "grad_norm": 6.718013763427734, + "learning_rate": 4.3302442594971925e-06, + "loss": 0.1954, + "num_input_tokens_seen": 10590008, + "step": 5407 + }, + { + "epoch": 0.7167660702451955, + "grad_norm": 5.553983211517334, + "learning_rate": 4.330007772042656e-06, + "loss": 0.0641, + "num_input_tokens_seen": 10592216, + "step": 5408 + }, + { + "epoch": 0.7168986083499006, + "grad_norm": 1.5864758491516113, + "learning_rate": 4.329771249304307e-06, + "loss": 0.0224, + "num_input_tokens_seen": 10593608, + "step": 5409 + }, + { + "epoch": 0.7170311464546058, + "grad_norm": 7.581082344055176, + "learning_rate": 4.329534691286707e-06, + "loss": 0.1692, + "num_input_tokens_seen": 10595168, + "step": 5410 + }, + { + "epoch": 0.7171636845593108, + "grad_norm": 8.926233291625977, + "learning_rate": 4.329298097994415e-06, + "loss": 0.0996, + "num_input_tokens_seen": 10596624, + "step": 5411 + }, + { + "epoch": 0.7172962226640159, + "grad_norm": 3.1127870082855225, + "learning_rate": 4.329061469431995e-06, + "loss": 0.0999, + "num_input_tokens_seen": 10598856, + "step": 5412 + }, + { + "epoch": 0.717428760768721, + "grad_norm": 11.583666801452637, + "learning_rate": 4.328824805604007e-06, + "loss": 0.1968, + "num_input_tokens_seen": 10600640, + "step": 5413 + }, + { + "epoch": 0.7175612988734261, + "grad_norm": 7.029998779296875, + "learning_rate": 4.328588106515016e-06, + "loss": 0.1904, + "num_input_tokens_seen": 10602584, + "step": 5414 + }, + { + "epoch": 0.7176938369781312, + "grad_norm": 2.6394267082214355, + "learning_rate": 4.328351372169584e-06, + "loss": 0.0195, + "num_input_tokens_seen": 10604192, + "step": 5415 + }, + { + "epoch": 0.7178263750828363, + "grad_norm": 75.4927978515625, + "learning_rate": 4.328114602572276e-06, + "loss": 0.4094, + "num_input_tokens_seen": 10607632, + "step": 5416 + }, + { + "epoch": 0.7179589131875415, + "grad_norm": 0.11221078783273697, + "learning_rate": 4.3278777977276576e-06, + "loss": 0.0007, + "num_input_tokens_seen": 10608888, + "step": 5417 + }, + { + "epoch": 0.7180914512922465, + "grad_norm": 8.978494644165039, + "learning_rate": 4.327640957640295e-06, + "loss": 0.2859, + "num_input_tokens_seen": 10610808, + "step": 5418 + }, + { + "epoch": 0.7182239893969516, + "grad_norm": 0.056512702256441116, + "learning_rate": 4.327404082314754e-06, + "loss": 0.0004, + "num_input_tokens_seen": 10612144, + "step": 5419 + }, + { + "epoch": 0.7183565275016567, + "grad_norm": 0.36572855710983276, + "learning_rate": 4.327167171755599e-06, + "loss": 0.0025, + "num_input_tokens_seen": 10614080, + "step": 5420 + }, + { + "epoch": 0.7184890656063618, + "grad_norm": 11.710256576538086, + "learning_rate": 4.326930225967404e-06, + "loss": 0.5209, + "num_input_tokens_seen": 10616912, + "step": 5421 + }, + { + "epoch": 0.718621603711067, + "grad_norm": 0.14278718829154968, + "learning_rate": 4.3266932449547316e-06, + "loss": 0.001, + "num_input_tokens_seen": 10619664, + "step": 5422 + }, + { + "epoch": 0.718754141815772, + "grad_norm": 9.962724685668945, + "learning_rate": 4.326456228722155e-06, + "loss": 0.212, + "num_input_tokens_seen": 10621184, + "step": 5423 + }, + { + "epoch": 0.7188866799204772, + "grad_norm": 2.9257819652557373, + "learning_rate": 4.326219177274241e-06, + "loss": 0.07, + "num_input_tokens_seen": 10623632, + "step": 5424 + }, + { + "epoch": 0.7190192180251822, + "grad_norm": 0.731238603591919, + "learning_rate": 4.3259820906155615e-06, + "loss": 0.0046, + "num_input_tokens_seen": 10624976, + "step": 5425 + }, + { + "epoch": 0.7191517561298874, + "grad_norm": 11.588208198547363, + "learning_rate": 4.325744968750688e-06, + "loss": 0.3923, + "num_input_tokens_seen": 10627168, + "step": 5426 + }, + { + "epoch": 0.7192842942345924, + "grad_norm": 6.853522300720215, + "learning_rate": 4.325507811684192e-06, + "loss": 0.2317, + "num_input_tokens_seen": 10628976, + "step": 5427 + }, + { + "epoch": 0.7194168323392975, + "grad_norm": 7.076307773590088, + "learning_rate": 4.325270619420645e-06, + "loss": 0.1596, + "num_input_tokens_seen": 10631000, + "step": 5428 + }, + { + "epoch": 0.7195493704440027, + "grad_norm": 6.852641582489014, + "learning_rate": 4.325033391964623e-06, + "loss": 0.0981, + "num_input_tokens_seen": 10633528, + "step": 5429 + }, + { + "epoch": 0.7196819085487077, + "grad_norm": 9.873103141784668, + "learning_rate": 4.324796129320696e-06, + "loss": 0.2821, + "num_input_tokens_seen": 10635392, + "step": 5430 + }, + { + "epoch": 0.7198144466534129, + "grad_norm": 10.66834545135498, + "learning_rate": 4.324558831493443e-06, + "loss": 0.3592, + "num_input_tokens_seen": 10637032, + "step": 5431 + }, + { + "epoch": 0.7199469847581179, + "grad_norm": 6.377750873565674, + "learning_rate": 4.324321498487436e-06, + "loss": 0.1151, + "num_input_tokens_seen": 10639088, + "step": 5432 + }, + { + "epoch": 0.7200795228628231, + "grad_norm": 8.164138793945312, + "learning_rate": 4.324084130307252e-06, + "loss": 0.3514, + "num_input_tokens_seen": 10641048, + "step": 5433 + }, + { + "epoch": 0.7202120609675282, + "grad_norm": 0.09812411665916443, + "learning_rate": 4.323846726957468e-06, + "loss": 0.0006, + "num_input_tokens_seen": 10642280, + "step": 5434 + }, + { + "epoch": 0.7203445990722332, + "grad_norm": 12.22256088256836, + "learning_rate": 4.32360928844266e-06, + "loss": 0.3678, + "num_input_tokens_seen": 10645008, + "step": 5435 + }, + { + "epoch": 0.7204771371769384, + "grad_norm": 1.7772632837295532, + "learning_rate": 4.323371814767407e-06, + "loss": 0.025, + "num_input_tokens_seen": 10646920, + "step": 5436 + }, + { + "epoch": 0.7206096752816434, + "grad_norm": 7.865345001220703, + "learning_rate": 4.323134305936289e-06, + "loss": 0.2219, + "num_input_tokens_seen": 10649288, + "step": 5437 + }, + { + "epoch": 0.7207422133863486, + "grad_norm": 18.25265121459961, + "learning_rate": 4.322896761953882e-06, + "loss": 0.477, + "num_input_tokens_seen": 10651200, + "step": 5438 + }, + { + "epoch": 0.7208747514910536, + "grad_norm": 6.348635673522949, + "learning_rate": 4.322659182824769e-06, + "loss": 0.19, + "num_input_tokens_seen": 10653208, + "step": 5439 + }, + { + "epoch": 0.7210072895957588, + "grad_norm": 12.010926246643066, + "learning_rate": 4.322421568553529e-06, + "loss": 0.6441, + "num_input_tokens_seen": 10655312, + "step": 5440 + }, + { + "epoch": 0.7211398277004639, + "grad_norm": 2.6433982849121094, + "learning_rate": 4.322183919144745e-06, + "loss": 0.0169, + "num_input_tokens_seen": 10657112, + "step": 5441 + }, + { + "epoch": 0.721272365805169, + "grad_norm": 6.007479667663574, + "learning_rate": 4.321946234602997e-06, + "loss": 0.0486, + "num_input_tokens_seen": 10658768, + "step": 5442 + }, + { + "epoch": 0.7214049039098741, + "grad_norm": 6.667125225067139, + "learning_rate": 4.3217085149328685e-06, + "loss": 0.1054, + "num_input_tokens_seen": 10659760, + "step": 5443 + }, + { + "epoch": 0.7215374420145791, + "grad_norm": 3.3705594539642334, + "learning_rate": 4.321470760138944e-06, + "loss": 0.0446, + "num_input_tokens_seen": 10661000, + "step": 5444 + }, + { + "epoch": 0.7216699801192843, + "grad_norm": 0.22150379419326782, + "learning_rate": 4.321232970225806e-06, + "loss": 0.0015, + "num_input_tokens_seen": 10662568, + "step": 5445 + }, + { + "epoch": 0.7218025182239894, + "grad_norm": 7.884476661682129, + "learning_rate": 4.32099514519804e-06, + "loss": 0.1561, + "num_input_tokens_seen": 10663968, + "step": 5446 + }, + { + "epoch": 0.7219350563286945, + "grad_norm": 4.997055530548096, + "learning_rate": 4.320757285060231e-06, + "loss": 0.1245, + "num_input_tokens_seen": 10666568, + "step": 5447 + }, + { + "epoch": 0.7220675944333996, + "grad_norm": 0.8927567005157471, + "learning_rate": 4.320519389816966e-06, + "loss": 0.0041, + "num_input_tokens_seen": 10668344, + "step": 5448 + }, + { + "epoch": 0.7222001325381047, + "grad_norm": 0.24215669929981232, + "learning_rate": 4.320281459472831e-06, + "loss": 0.0015, + "num_input_tokens_seen": 10670288, + "step": 5449 + }, + { + "epoch": 0.7223326706428098, + "grad_norm": 1.2419615983963013, + "learning_rate": 4.320043494032414e-06, + "loss": 0.016, + "num_input_tokens_seen": 10672496, + "step": 5450 + }, + { + "epoch": 0.722465208747515, + "grad_norm": 0.5320871472358704, + "learning_rate": 4.3198054935003034e-06, + "loss": 0.0033, + "num_input_tokens_seen": 10673952, + "step": 5451 + }, + { + "epoch": 0.72259774685222, + "grad_norm": 10.658312797546387, + "learning_rate": 4.319567457881086e-06, + "loss": 0.3339, + "num_input_tokens_seen": 10676224, + "step": 5452 + }, + { + "epoch": 0.7227302849569252, + "grad_norm": 7.224648475646973, + "learning_rate": 4.3193293871793544e-06, + "loss": 0.2567, + "num_input_tokens_seen": 10678368, + "step": 5453 + }, + { + "epoch": 0.7228628230616302, + "grad_norm": 9.707650184631348, + "learning_rate": 4.319091281399695e-06, + "loss": 0.3174, + "num_input_tokens_seen": 10680320, + "step": 5454 + }, + { + "epoch": 0.7229953611663353, + "grad_norm": 11.136312484741211, + "learning_rate": 4.318853140546702e-06, + "loss": 0.4657, + "num_input_tokens_seen": 10682488, + "step": 5455 + }, + { + "epoch": 0.7231278992710404, + "grad_norm": 11.700010299682617, + "learning_rate": 4.3186149646249664e-06, + "loss": 0.2006, + "num_input_tokens_seen": 10684352, + "step": 5456 + }, + { + "epoch": 0.7232604373757455, + "grad_norm": 0.09427163749933243, + "learning_rate": 4.3183767536390795e-06, + "loss": 0.0006, + "num_input_tokens_seen": 10685504, + "step": 5457 + }, + { + "epoch": 0.7233929754804507, + "grad_norm": 0.11732390522956848, + "learning_rate": 4.318138507593633e-06, + "loss": 0.0008, + "num_input_tokens_seen": 10688448, + "step": 5458 + }, + { + "epoch": 0.7235255135851557, + "grad_norm": 8.149674415588379, + "learning_rate": 4.3179002264932234e-06, + "loss": 0.2207, + "num_input_tokens_seen": 10690416, + "step": 5459 + }, + { + "epoch": 0.7236580516898609, + "grad_norm": 1.1455308198928833, + "learning_rate": 4.317661910342442e-06, + "loss": 0.0076, + "num_input_tokens_seen": 10692392, + "step": 5460 + }, + { + "epoch": 0.7237905897945659, + "grad_norm": 8.14931869506836, + "learning_rate": 4.317423559145886e-06, + "loss": 0.2479, + "num_input_tokens_seen": 10693592, + "step": 5461 + }, + { + "epoch": 0.723923127899271, + "grad_norm": 12.34900951385498, + "learning_rate": 4.31718517290815e-06, + "loss": 0.3649, + "num_input_tokens_seen": 10695256, + "step": 5462 + }, + { + "epoch": 0.7240556660039762, + "grad_norm": 12.32076358795166, + "learning_rate": 4.31694675163383e-06, + "loss": 0.4123, + "num_input_tokens_seen": 10697216, + "step": 5463 + }, + { + "epoch": 0.7241882041086812, + "grad_norm": 4.103886127471924, + "learning_rate": 4.316708295327523e-06, + "loss": 0.0653, + "num_input_tokens_seen": 10698888, + "step": 5464 + }, + { + "epoch": 0.7243207422133864, + "grad_norm": 9.828661918640137, + "learning_rate": 4.316469803993826e-06, + "loss": 0.3418, + "num_input_tokens_seen": 10701624, + "step": 5465 + }, + { + "epoch": 0.7244532803180914, + "grad_norm": 16.441856384277344, + "learning_rate": 4.316231277637339e-06, + "loss": 0.4695, + "num_input_tokens_seen": 10703384, + "step": 5466 + }, + { + "epoch": 0.7245858184227966, + "grad_norm": 8.357653617858887, + "learning_rate": 4.31599271626266e-06, + "loss": 0.0804, + "num_input_tokens_seen": 10705000, + "step": 5467 + }, + { + "epoch": 0.7247183565275016, + "grad_norm": 8.830903053283691, + "learning_rate": 4.315754119874389e-06, + "loss": 0.0809, + "num_input_tokens_seen": 10706792, + "step": 5468 + }, + { + "epoch": 0.7248508946322068, + "grad_norm": 0.8405228853225708, + "learning_rate": 4.315515488477126e-06, + "loss": 0.0057, + "num_input_tokens_seen": 10708320, + "step": 5469 + }, + { + "epoch": 0.7249834327369119, + "grad_norm": 10.384105682373047, + "learning_rate": 4.315276822075471e-06, + "loss": 0.322, + "num_input_tokens_seen": 10710632, + "step": 5470 + }, + { + "epoch": 0.7251159708416169, + "grad_norm": 0.6235556602478027, + "learning_rate": 4.315038120674027e-06, + "loss": 0.0026, + "num_input_tokens_seen": 10712032, + "step": 5471 + }, + { + "epoch": 0.7252485089463221, + "grad_norm": 9.148031234741211, + "learning_rate": 4.314799384277396e-06, + "loss": 0.4222, + "num_input_tokens_seen": 10714000, + "step": 5472 + }, + { + "epoch": 0.7253810470510271, + "grad_norm": 0.5181266069412231, + "learning_rate": 4.314560612890181e-06, + "loss": 0.0034, + "num_input_tokens_seen": 10715584, + "step": 5473 + }, + { + "epoch": 0.7255135851557323, + "grad_norm": 6.813755512237549, + "learning_rate": 4.314321806516985e-06, + "loss": 0.1563, + "num_input_tokens_seen": 10717368, + "step": 5474 + }, + { + "epoch": 0.7256461232604374, + "grad_norm": 14.17917537689209, + "learning_rate": 4.314082965162413e-06, + "loss": 0.4966, + "num_input_tokens_seen": 10719248, + "step": 5475 + }, + { + "epoch": 0.7257786613651425, + "grad_norm": 11.906841278076172, + "learning_rate": 4.3138440888310706e-06, + "loss": 0.5422, + "num_input_tokens_seen": 10721016, + "step": 5476 + }, + { + "epoch": 0.7259111994698476, + "grad_norm": 0.41764453053474426, + "learning_rate": 4.313605177527563e-06, + "loss": 0.0025, + "num_input_tokens_seen": 10722616, + "step": 5477 + }, + { + "epoch": 0.7260437375745527, + "grad_norm": 10.652955055236816, + "learning_rate": 4.313366231256495e-06, + "loss": 0.4911, + "num_input_tokens_seen": 10724472, + "step": 5478 + }, + { + "epoch": 0.7261762756792578, + "grad_norm": 0.15038461983203888, + "learning_rate": 4.313127250022477e-06, + "loss": 0.001, + "num_input_tokens_seen": 10725760, + "step": 5479 + }, + { + "epoch": 0.7263088137839628, + "grad_norm": 0.08290032297372818, + "learning_rate": 4.312888233830113e-06, + "loss": 0.0005, + "num_input_tokens_seen": 10726848, + "step": 5480 + }, + { + "epoch": 0.726441351888668, + "grad_norm": 3.6962532997131348, + "learning_rate": 4.312649182684015e-06, + "loss": 0.0246, + "num_input_tokens_seen": 10728480, + "step": 5481 + }, + { + "epoch": 0.7265738899933731, + "grad_norm": 0.18984758853912354, + "learning_rate": 4.312410096588789e-06, + "loss": 0.0012, + "num_input_tokens_seen": 10730368, + "step": 5482 + }, + { + "epoch": 0.7267064280980782, + "grad_norm": 12.223434448242188, + "learning_rate": 4.312170975549046e-06, + "loss": 0.366, + "num_input_tokens_seen": 10733160, + "step": 5483 + }, + { + "epoch": 0.7268389662027833, + "grad_norm": 0.16764910519123077, + "learning_rate": 4.311931819569397e-06, + "loss": 0.001, + "num_input_tokens_seen": 10734464, + "step": 5484 + }, + { + "epoch": 0.7269715043074884, + "grad_norm": 6.567307472229004, + "learning_rate": 4.311692628654453e-06, + "loss": 0.1461, + "num_input_tokens_seen": 10736328, + "step": 5485 + }, + { + "epoch": 0.7271040424121935, + "grad_norm": 7.938125133514404, + "learning_rate": 4.311453402808824e-06, + "loss": 0.3783, + "num_input_tokens_seen": 10738520, + "step": 5486 + }, + { + "epoch": 0.7272365805168987, + "grad_norm": 1.6471339464187622, + "learning_rate": 4.311214142037125e-06, + "loss": 0.007, + "num_input_tokens_seen": 10740168, + "step": 5487 + }, + { + "epoch": 0.7273691186216037, + "grad_norm": 3.9337992668151855, + "learning_rate": 4.310974846343967e-06, + "loss": 0.1043, + "num_input_tokens_seen": 10741152, + "step": 5488 + }, + { + "epoch": 0.7275016567263088, + "grad_norm": 0.09851621836423874, + "learning_rate": 4.3107355157339645e-06, + "loss": 0.0006, + "num_input_tokens_seen": 10742736, + "step": 5489 + }, + { + "epoch": 0.7276341948310139, + "grad_norm": 10.458983421325684, + "learning_rate": 4.310496150211732e-06, + "loss": 0.2792, + "num_input_tokens_seen": 10744808, + "step": 5490 + }, + { + "epoch": 0.727766732935719, + "grad_norm": 14.809197425842285, + "learning_rate": 4.310256749781886e-06, + "loss": 0.5231, + "num_input_tokens_seen": 10747224, + "step": 5491 + }, + { + "epoch": 0.7278992710404241, + "grad_norm": 5.7847771644592285, + "learning_rate": 4.31001731444904e-06, + "loss": 0.0889, + "num_input_tokens_seen": 10749000, + "step": 5492 + }, + { + "epoch": 0.7280318091451292, + "grad_norm": 3.9566891193389893, + "learning_rate": 4.3097778442178115e-06, + "loss": 0.1133, + "num_input_tokens_seen": 10751232, + "step": 5493 + }, + { + "epoch": 0.7281643472498344, + "grad_norm": 9.527785301208496, + "learning_rate": 4.309538339092818e-06, + "loss": 0.1712, + "num_input_tokens_seen": 10752752, + "step": 5494 + }, + { + "epoch": 0.7282968853545394, + "grad_norm": 8.892221450805664, + "learning_rate": 4.3092987990786775e-06, + "loss": 0.3489, + "num_input_tokens_seen": 10754824, + "step": 5495 + }, + { + "epoch": 0.7284294234592446, + "grad_norm": 4.816215991973877, + "learning_rate": 4.309059224180007e-06, + "loss": 0.0811, + "num_input_tokens_seen": 10756528, + "step": 5496 + }, + { + "epoch": 0.7285619615639496, + "grad_norm": 5.372101306915283, + "learning_rate": 4.308819614401427e-06, + "loss": 0.0996, + "num_input_tokens_seen": 10757568, + "step": 5497 + }, + { + "epoch": 0.7286944996686547, + "grad_norm": 10.995583534240723, + "learning_rate": 4.308579969747558e-06, + "loss": 0.2673, + "num_input_tokens_seen": 10760072, + "step": 5498 + }, + { + "epoch": 0.7288270377733599, + "grad_norm": 18.099035263061523, + "learning_rate": 4.308340290223018e-06, + "loss": 0.6095, + "num_input_tokens_seen": 10763760, + "step": 5499 + }, + { + "epoch": 0.7289595758780649, + "grad_norm": 3.5018908977508545, + "learning_rate": 4.3081005758324305e-06, + "loss": 0.058, + "num_input_tokens_seen": 10766232, + "step": 5500 + }, + { + "epoch": 0.7290921139827701, + "grad_norm": 3.944511890411377, + "learning_rate": 4.307860826580416e-06, + "loss": 0.0606, + "num_input_tokens_seen": 10768000, + "step": 5501 + }, + { + "epoch": 0.7292246520874751, + "grad_norm": 5.916755676269531, + "learning_rate": 4.307621042471597e-06, + "loss": 0.1859, + "num_input_tokens_seen": 10769648, + "step": 5502 + }, + { + "epoch": 0.7293571901921803, + "grad_norm": 0.6834207773208618, + "learning_rate": 4.307381223510598e-06, + "loss": 0.0041, + "num_input_tokens_seen": 10771176, + "step": 5503 + }, + { + "epoch": 0.7294897282968854, + "grad_norm": 0.8466163873672485, + "learning_rate": 4.307141369702043e-06, + "loss": 0.0057, + "num_input_tokens_seen": 10772720, + "step": 5504 + }, + { + "epoch": 0.7296222664015904, + "grad_norm": 0.9664229154586792, + "learning_rate": 4.306901481050555e-06, + "loss": 0.0059, + "num_input_tokens_seen": 10774176, + "step": 5505 + }, + { + "epoch": 0.7297548045062956, + "grad_norm": 5.939652442932129, + "learning_rate": 4.3066615575607595e-06, + "loss": 0.0619, + "num_input_tokens_seen": 10775848, + "step": 5506 + }, + { + "epoch": 0.7298873426110006, + "grad_norm": 6.4808349609375, + "learning_rate": 4.306421599237284e-06, + "loss": 0.0418, + "num_input_tokens_seen": 10777576, + "step": 5507 + }, + { + "epoch": 0.7300198807157058, + "grad_norm": 8.337955474853516, + "learning_rate": 4.306181606084752e-06, + "loss": 0.5075, + "num_input_tokens_seen": 10779784, + "step": 5508 + }, + { + "epoch": 0.7301524188204108, + "grad_norm": 3.689265489578247, + "learning_rate": 4.305941578107795e-06, + "loss": 0.057, + "num_input_tokens_seen": 10781888, + "step": 5509 + }, + { + "epoch": 0.730284956925116, + "grad_norm": 0.28034961223602295, + "learning_rate": 4.305701515311037e-06, + "loss": 0.0019, + "num_input_tokens_seen": 10784144, + "step": 5510 + }, + { + "epoch": 0.7304174950298211, + "grad_norm": 12.424942970275879, + "learning_rate": 4.305461417699109e-06, + "loss": 0.6706, + "num_input_tokens_seen": 10786576, + "step": 5511 + }, + { + "epoch": 0.7305500331345262, + "grad_norm": 10.470807075500488, + "learning_rate": 4.3052212852766375e-06, + "loss": 0.1583, + "num_input_tokens_seen": 10788824, + "step": 5512 + }, + { + "epoch": 0.7306825712392313, + "grad_norm": 0.12171858549118042, + "learning_rate": 4.304981118048256e-06, + "loss": 0.0008, + "num_input_tokens_seen": 10789936, + "step": 5513 + }, + { + "epoch": 0.7308151093439363, + "grad_norm": 23.789995193481445, + "learning_rate": 4.304740916018593e-06, + "loss": 0.4018, + "num_input_tokens_seen": 10791656, + "step": 5514 + }, + { + "epoch": 0.7309476474486415, + "grad_norm": 0.07042837888002396, + "learning_rate": 4.3045006791922795e-06, + "loss": 0.0005, + "num_input_tokens_seen": 10793328, + "step": 5515 + }, + { + "epoch": 0.7310801855533466, + "grad_norm": 18.310781478881836, + "learning_rate": 4.304260407573949e-06, + "loss": 0.5433, + "num_input_tokens_seen": 10794792, + "step": 5516 + }, + { + "epoch": 0.7312127236580517, + "grad_norm": 9.13074779510498, + "learning_rate": 4.304020101168232e-06, + "loss": 0.2682, + "num_input_tokens_seen": 10796848, + "step": 5517 + }, + { + "epoch": 0.7313452617627568, + "grad_norm": 5.251986980438232, + "learning_rate": 4.303779759979764e-06, + "loss": 0.0808, + "num_input_tokens_seen": 10799248, + "step": 5518 + }, + { + "epoch": 0.7314777998674619, + "grad_norm": 16.71279525756836, + "learning_rate": 4.303539384013177e-06, + "loss": 0.5215, + "num_input_tokens_seen": 10800968, + "step": 5519 + }, + { + "epoch": 0.731610337972167, + "grad_norm": 11.054742813110352, + "learning_rate": 4.3032989732731075e-06, + "loss": 0.259, + "num_input_tokens_seen": 10802648, + "step": 5520 + }, + { + "epoch": 0.731742876076872, + "grad_norm": 0.18419474363327026, + "learning_rate": 4.303058527764189e-06, + "loss": 0.0012, + "num_input_tokens_seen": 10804792, + "step": 5521 + }, + { + "epoch": 0.7318754141815772, + "grad_norm": 10.891854286193848, + "learning_rate": 4.3028180474910585e-06, + "loss": 0.437, + "num_input_tokens_seen": 10807800, + "step": 5522 + }, + { + "epoch": 0.7320079522862823, + "grad_norm": 5.592297077178955, + "learning_rate": 4.302577532458352e-06, + "loss": 0.1746, + "num_input_tokens_seen": 10809704, + "step": 5523 + }, + { + "epoch": 0.7321404903909874, + "grad_norm": 0.04535595700144768, + "learning_rate": 4.302336982670707e-06, + "loss": 0.0003, + "num_input_tokens_seen": 10811080, + "step": 5524 + }, + { + "epoch": 0.7322730284956925, + "grad_norm": 0.05149759724736214, + "learning_rate": 4.302096398132762e-06, + "loss": 0.0004, + "num_input_tokens_seen": 10812104, + "step": 5525 + }, + { + "epoch": 0.7324055666003976, + "grad_norm": 11.92459487915039, + "learning_rate": 4.301855778849155e-06, + "loss": 0.4089, + "num_input_tokens_seen": 10813880, + "step": 5526 + }, + { + "epoch": 0.7325381047051027, + "grad_norm": 0.3102714419364929, + "learning_rate": 4.301615124824526e-06, + "loss": 0.0022, + "num_input_tokens_seen": 10816560, + "step": 5527 + }, + { + "epoch": 0.7326706428098079, + "grad_norm": 0.07011289894580841, + "learning_rate": 4.301374436063515e-06, + "loss": 0.0005, + "num_input_tokens_seen": 10817888, + "step": 5528 + }, + { + "epoch": 0.7328031809145129, + "grad_norm": 8.574570655822754, + "learning_rate": 4.301133712570761e-06, + "loss": 0.2513, + "num_input_tokens_seen": 10819448, + "step": 5529 + }, + { + "epoch": 0.7329357190192181, + "grad_norm": 0.12541843950748444, + "learning_rate": 4.300892954350907e-06, + "loss": 0.0009, + "num_input_tokens_seen": 10820912, + "step": 5530 + }, + { + "epoch": 0.7330682571239231, + "grad_norm": 1.4550225734710693, + "learning_rate": 4.300652161408594e-06, + "loss": 0.0099, + "num_input_tokens_seen": 10822064, + "step": 5531 + }, + { + "epoch": 0.7332007952286282, + "grad_norm": 5.956141948699951, + "learning_rate": 4.300411333748465e-06, + "loss": 0.1749, + "num_input_tokens_seen": 10823568, + "step": 5532 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 3.8698384761810303, + "learning_rate": 4.3001704713751645e-06, + "loss": 0.1231, + "num_input_tokens_seen": 10825160, + "step": 5533 + }, + { + "epoch": 0.7334658714380384, + "grad_norm": 4.995274066925049, + "learning_rate": 4.299929574293335e-06, + "loss": 0.0581, + "num_input_tokens_seen": 10826904, + "step": 5534 + }, + { + "epoch": 0.7335984095427436, + "grad_norm": 7.504029750823975, + "learning_rate": 4.2996886425076215e-06, + "loss": 0.223, + "num_input_tokens_seen": 10828736, + "step": 5535 + }, + { + "epoch": 0.7337309476474486, + "grad_norm": 15.170551300048828, + "learning_rate": 4.29944767602267e-06, + "loss": 0.4741, + "num_input_tokens_seen": 10830176, + "step": 5536 + }, + { + "epoch": 0.7338634857521538, + "grad_norm": 6.661860942840576, + "learning_rate": 4.299206674843125e-06, + "loss": 0.1722, + "num_input_tokens_seen": 10831864, + "step": 5537 + }, + { + "epoch": 0.7339960238568588, + "grad_norm": 18.007686614990234, + "learning_rate": 4.298965638973635e-06, + "loss": 0.2577, + "num_input_tokens_seen": 10833856, + "step": 5538 + }, + { + "epoch": 0.734128561961564, + "grad_norm": 0.1619449257850647, + "learning_rate": 4.298724568418845e-06, + "loss": 0.0012, + "num_input_tokens_seen": 10835640, + "step": 5539 + }, + { + "epoch": 0.7342611000662691, + "grad_norm": 6.918808937072754, + "learning_rate": 4.298483463183406e-06, + "loss": 0.258, + "num_input_tokens_seen": 10837328, + "step": 5540 + }, + { + "epoch": 0.7343936381709741, + "grad_norm": 9.625131607055664, + "learning_rate": 4.298242323271964e-06, + "loss": 0.2912, + "num_input_tokens_seen": 10838688, + "step": 5541 + }, + { + "epoch": 0.7345261762756793, + "grad_norm": 6.90231466293335, + "learning_rate": 4.29800114868917e-06, + "loss": 0.2205, + "num_input_tokens_seen": 10840536, + "step": 5542 + }, + { + "epoch": 0.7346587143803843, + "grad_norm": 9.644344329833984, + "learning_rate": 4.297759939439673e-06, + "loss": 0.2371, + "num_input_tokens_seen": 10842616, + "step": 5543 + }, + { + "epoch": 0.7347912524850895, + "grad_norm": 16.313232421875, + "learning_rate": 4.297518695528125e-06, + "loss": 0.4175, + "num_input_tokens_seen": 10844800, + "step": 5544 + }, + { + "epoch": 0.7349237905897945, + "grad_norm": 0.6032420992851257, + "learning_rate": 4.297277416959176e-06, + "loss": 0.0045, + "num_input_tokens_seen": 10846192, + "step": 5545 + }, + { + "epoch": 0.7350563286944997, + "grad_norm": 13.998650550842285, + "learning_rate": 4.2970361037374784e-06, + "loss": 0.2887, + "num_input_tokens_seen": 10847832, + "step": 5546 + }, + { + "epoch": 0.7351888667992048, + "grad_norm": 7.126074314117432, + "learning_rate": 4.296794755867685e-06, + "loss": 0.3418, + "num_input_tokens_seen": 10850720, + "step": 5547 + }, + { + "epoch": 0.7353214049039098, + "grad_norm": 5.029463768005371, + "learning_rate": 4.296553373354449e-06, + "loss": 0.1063, + "num_input_tokens_seen": 10852408, + "step": 5548 + }, + { + "epoch": 0.735453943008615, + "grad_norm": 1.4527138471603394, + "learning_rate": 4.296311956202426e-06, + "loss": 0.0243, + "num_input_tokens_seen": 10853960, + "step": 5549 + }, + { + "epoch": 0.73558648111332, + "grad_norm": 0.5472029447555542, + "learning_rate": 4.2960705044162675e-06, + "loss": 0.004, + "num_input_tokens_seen": 10856032, + "step": 5550 + }, + { + "epoch": 0.7357190192180252, + "grad_norm": 11.215896606445312, + "learning_rate": 4.2958290180006315e-06, + "loss": 0.296, + "num_input_tokens_seen": 10857640, + "step": 5551 + }, + { + "epoch": 0.7358515573227303, + "grad_norm": 3.4041786193847656, + "learning_rate": 4.295587496960173e-06, + "loss": 0.0657, + "num_input_tokens_seen": 10860624, + "step": 5552 + }, + { + "epoch": 0.7359840954274354, + "grad_norm": 4.3968586921691895, + "learning_rate": 4.295345941299549e-06, + "loss": 0.1082, + "num_input_tokens_seen": 10863712, + "step": 5553 + }, + { + "epoch": 0.7361166335321405, + "grad_norm": 1.3515654802322388, + "learning_rate": 4.295104351023417e-06, + "loss": 0.0103, + "num_input_tokens_seen": 10865024, + "step": 5554 + }, + { + "epoch": 0.7362491716368456, + "grad_norm": 7.054819583892822, + "learning_rate": 4.294862726136434e-06, + "loss": 0.1831, + "num_input_tokens_seen": 10866976, + "step": 5555 + }, + { + "epoch": 0.7363817097415507, + "grad_norm": 14.793862342834473, + "learning_rate": 4.29462106664326e-06, + "loss": 0.6049, + "num_input_tokens_seen": 10869472, + "step": 5556 + }, + { + "epoch": 0.7365142478462559, + "grad_norm": 9.630166053771973, + "learning_rate": 4.294379372548553e-06, + "loss": 0.1955, + "num_input_tokens_seen": 10872008, + "step": 5557 + }, + { + "epoch": 0.7366467859509609, + "grad_norm": 7.67263126373291, + "learning_rate": 4.294137643856974e-06, + "loss": 0.1745, + "num_input_tokens_seen": 10873608, + "step": 5558 + }, + { + "epoch": 0.736779324055666, + "grad_norm": 2.2542288303375244, + "learning_rate": 4.293895880573184e-06, + "loss": 0.0168, + "num_input_tokens_seen": 10875616, + "step": 5559 + }, + { + "epoch": 0.7369118621603711, + "grad_norm": 16.079044342041016, + "learning_rate": 4.293654082701843e-06, + "loss": 0.6604, + "num_input_tokens_seen": 10878384, + "step": 5560 + }, + { + "epoch": 0.7370444002650762, + "grad_norm": 9.172947883605957, + "learning_rate": 4.293412250247615e-06, + "loss": 0.239, + "num_input_tokens_seen": 10880568, + "step": 5561 + }, + { + "epoch": 0.7371769383697813, + "grad_norm": 6.037843227386475, + "learning_rate": 4.293170383215161e-06, + "loss": 0.1334, + "num_input_tokens_seen": 10882728, + "step": 5562 + }, + { + "epoch": 0.7373094764744864, + "grad_norm": 2.670527219772339, + "learning_rate": 4.292928481609146e-06, + "loss": 0.0729, + "num_input_tokens_seen": 10884168, + "step": 5563 + }, + { + "epoch": 0.7374420145791916, + "grad_norm": 5.2588114738464355, + "learning_rate": 4.292686545434232e-06, + "loss": 0.1216, + "num_input_tokens_seen": 10887440, + "step": 5564 + }, + { + "epoch": 0.7375745526838966, + "grad_norm": 1.534140706062317, + "learning_rate": 4.292444574695085e-06, + "loss": 0.0366, + "num_input_tokens_seen": 10888960, + "step": 5565 + }, + { + "epoch": 0.7377070907886017, + "grad_norm": 3.937309741973877, + "learning_rate": 4.2922025693963696e-06, + "loss": 0.0731, + "num_input_tokens_seen": 10890856, + "step": 5566 + }, + { + "epoch": 0.7378396288933068, + "grad_norm": 10.178291320800781, + "learning_rate": 4.291960529542753e-06, + "loss": 0.3296, + "num_input_tokens_seen": 10892952, + "step": 5567 + }, + { + "epoch": 0.7379721669980119, + "grad_norm": 9.69021224975586, + "learning_rate": 4.2917184551389e-06, + "loss": 0.4357, + "num_input_tokens_seen": 10895096, + "step": 5568 + }, + { + "epoch": 0.7381047051027171, + "grad_norm": 1.2098838090896606, + "learning_rate": 4.29147634618948e-06, + "loss": 0.0093, + "num_input_tokens_seen": 10896480, + "step": 5569 + }, + { + "epoch": 0.7382372432074221, + "grad_norm": 7.711188316345215, + "learning_rate": 4.29123420269916e-06, + "loss": 0.119, + "num_input_tokens_seen": 10899552, + "step": 5570 + }, + { + "epoch": 0.7383697813121273, + "grad_norm": 1.3528118133544922, + "learning_rate": 4.29099202467261e-06, + "loss": 0.0098, + "num_input_tokens_seen": 10901168, + "step": 5571 + }, + { + "epoch": 0.7385023194168323, + "grad_norm": 0.6277579069137573, + "learning_rate": 4.290749812114497e-06, + "loss": 0.0046, + "num_input_tokens_seen": 10902688, + "step": 5572 + }, + { + "epoch": 0.7386348575215375, + "grad_norm": 1.0253241062164307, + "learning_rate": 4.290507565029493e-06, + "loss": 0.0175, + "num_input_tokens_seen": 10904144, + "step": 5573 + }, + { + "epoch": 0.7387673956262425, + "grad_norm": 8.246904373168945, + "learning_rate": 4.290265283422267e-06, + "loss": 0.2873, + "num_input_tokens_seen": 10905616, + "step": 5574 + }, + { + "epoch": 0.7388999337309476, + "grad_norm": 4.6777729988098145, + "learning_rate": 4.290022967297491e-06, + "loss": 0.0829, + "num_input_tokens_seen": 10907160, + "step": 5575 + }, + { + "epoch": 0.7390324718356528, + "grad_norm": 7.547438621520996, + "learning_rate": 4.289780616659839e-06, + "loss": 0.089, + "num_input_tokens_seen": 10909120, + "step": 5576 + }, + { + "epoch": 0.7391650099403578, + "grad_norm": 16.122739791870117, + "learning_rate": 4.28953823151398e-06, + "loss": 0.6826, + "num_input_tokens_seen": 10911240, + "step": 5577 + }, + { + "epoch": 0.739297548045063, + "grad_norm": 9.46178913116455, + "learning_rate": 4.289295811864591e-06, + "loss": 0.2104, + "num_input_tokens_seen": 10913336, + "step": 5578 + }, + { + "epoch": 0.739430086149768, + "grad_norm": 6.104067325592041, + "learning_rate": 4.289053357716343e-06, + "loss": 0.2484, + "num_input_tokens_seen": 10915528, + "step": 5579 + }, + { + "epoch": 0.7395626242544732, + "grad_norm": 3.712414264678955, + "learning_rate": 4.288810869073914e-06, + "loss": 0.0274, + "num_input_tokens_seen": 10917528, + "step": 5580 + }, + { + "epoch": 0.7396951623591783, + "grad_norm": 6.806366443634033, + "learning_rate": 4.288568345941976e-06, + "loss": 0.0935, + "num_input_tokens_seen": 10919544, + "step": 5581 + }, + { + "epoch": 0.7398277004638834, + "grad_norm": 7.807260990142822, + "learning_rate": 4.2883257883252065e-06, + "loss": 0.1926, + "num_input_tokens_seen": 10921456, + "step": 5582 + }, + { + "epoch": 0.7399602385685885, + "grad_norm": 9.196476936340332, + "learning_rate": 4.288083196228281e-06, + "loss": 0.5939, + "num_input_tokens_seen": 10924424, + "step": 5583 + }, + { + "epoch": 0.7400927766732935, + "grad_norm": 3.339165210723877, + "learning_rate": 4.28784056965588e-06, + "loss": 0.0678, + "num_input_tokens_seen": 10925936, + "step": 5584 + }, + { + "epoch": 0.7402253147779987, + "grad_norm": 5.34340763092041, + "learning_rate": 4.287597908612678e-06, + "loss": 0.2142, + "num_input_tokens_seen": 10927816, + "step": 5585 + }, + { + "epoch": 0.7403578528827037, + "grad_norm": 10.882530212402344, + "learning_rate": 4.287355213103356e-06, + "loss": 0.3709, + "num_input_tokens_seen": 10929312, + "step": 5586 + }, + { + "epoch": 0.7404903909874089, + "grad_norm": 10.847646713256836, + "learning_rate": 4.2871124831325925e-06, + "loss": 0.5355, + "num_input_tokens_seen": 10930744, + "step": 5587 + }, + { + "epoch": 0.740622929092114, + "grad_norm": 8.853724479675293, + "learning_rate": 4.286869718705067e-06, + "loss": 0.3776, + "num_input_tokens_seen": 10932624, + "step": 5588 + }, + { + "epoch": 0.7407554671968191, + "grad_norm": 5.423361778259277, + "learning_rate": 4.28662691982546e-06, + "loss": 0.1039, + "num_input_tokens_seen": 10935464, + "step": 5589 + }, + { + "epoch": 0.7408880053015242, + "grad_norm": 9.560091018676758, + "learning_rate": 4.286384086498455e-06, + "loss": 0.2509, + "num_input_tokens_seen": 10937664, + "step": 5590 + }, + { + "epoch": 0.7410205434062292, + "grad_norm": 5.965892314910889, + "learning_rate": 4.286141218728732e-06, + "loss": 0.0945, + "num_input_tokens_seen": 10940104, + "step": 5591 + }, + { + "epoch": 0.7411530815109344, + "grad_norm": 0.16219490766525269, + "learning_rate": 4.285898316520974e-06, + "loss": 0.0011, + "num_input_tokens_seen": 10943152, + "step": 5592 + }, + { + "epoch": 0.7412856196156395, + "grad_norm": 3.0228981971740723, + "learning_rate": 4.285655379879864e-06, + "loss": 0.0504, + "num_input_tokens_seen": 10944808, + "step": 5593 + }, + { + "epoch": 0.7414181577203446, + "grad_norm": 6.613373279571533, + "learning_rate": 4.285412408810086e-06, + "loss": 0.112, + "num_input_tokens_seen": 10947032, + "step": 5594 + }, + { + "epoch": 0.7415506958250497, + "grad_norm": 5.082480430603027, + "learning_rate": 4.285169403316326e-06, + "loss": 0.1049, + "num_input_tokens_seen": 10949960, + "step": 5595 + }, + { + "epoch": 0.7416832339297548, + "grad_norm": 12.144387245178223, + "learning_rate": 4.284926363403268e-06, + "loss": 0.4337, + "num_input_tokens_seen": 10951400, + "step": 5596 + }, + { + "epoch": 0.7418157720344599, + "grad_norm": 7.6896586418151855, + "learning_rate": 4.284683289075598e-06, + "loss": 0.1994, + "num_input_tokens_seen": 10952928, + "step": 5597 + }, + { + "epoch": 0.741948310139165, + "grad_norm": 0.21499766409397125, + "learning_rate": 4.284440180338004e-06, + "loss": 0.0015, + "num_input_tokens_seen": 10954896, + "step": 5598 + }, + { + "epoch": 0.7420808482438701, + "grad_norm": 6.082207202911377, + "learning_rate": 4.284197037195171e-06, + "loss": 0.076, + "num_input_tokens_seen": 10957776, + "step": 5599 + }, + { + "epoch": 0.7422133863485753, + "grad_norm": 6.072389602661133, + "learning_rate": 4.28395385965179e-06, + "loss": 0.2148, + "num_input_tokens_seen": 10960280, + "step": 5600 + }, + { + "epoch": 0.7423459244532803, + "grad_norm": 15.996254920959473, + "learning_rate": 4.2837106477125465e-06, + "loss": 0.3645, + "num_input_tokens_seen": 10962328, + "step": 5601 + }, + { + "epoch": 0.7424784625579854, + "grad_norm": 11.350447654724121, + "learning_rate": 4.283467401382132e-06, + "loss": 0.3724, + "num_input_tokens_seen": 10964664, + "step": 5602 + }, + { + "epoch": 0.7426110006626905, + "grad_norm": 5.432244777679443, + "learning_rate": 4.2832241206652344e-06, + "loss": 0.0578, + "num_input_tokens_seen": 10967264, + "step": 5603 + }, + { + "epoch": 0.7427435387673956, + "grad_norm": 18.156660079956055, + "learning_rate": 4.2829808055665465e-06, + "loss": 0.6547, + "num_input_tokens_seen": 10969536, + "step": 5604 + }, + { + "epoch": 0.7428760768721008, + "grad_norm": 11.976000785827637, + "learning_rate": 4.282737456090759e-06, + "loss": 0.3157, + "num_input_tokens_seen": 10971584, + "step": 5605 + }, + { + "epoch": 0.7430086149768058, + "grad_norm": 6.209428787231445, + "learning_rate": 4.282494072242563e-06, + "loss": 0.0704, + "num_input_tokens_seen": 10973040, + "step": 5606 + }, + { + "epoch": 0.743141153081511, + "grad_norm": 4.18744421005249, + "learning_rate": 4.2822506540266515e-06, + "loss": 0.1342, + "num_input_tokens_seen": 10975288, + "step": 5607 + }, + { + "epoch": 0.743273691186216, + "grad_norm": 7.878844738006592, + "learning_rate": 4.282007201447718e-06, + "loss": 0.1245, + "num_input_tokens_seen": 10977248, + "step": 5608 + }, + { + "epoch": 0.7434062292909212, + "grad_norm": 1.692920207977295, + "learning_rate": 4.2817637145104565e-06, + "loss": 0.0291, + "num_input_tokens_seen": 10978888, + "step": 5609 + }, + { + "epoch": 0.7435387673956262, + "grad_norm": 2.3419227600097656, + "learning_rate": 4.281520193219561e-06, + "loss": 0.0203, + "num_input_tokens_seen": 10980592, + "step": 5610 + }, + { + "epoch": 0.7436713055003313, + "grad_norm": 1.2332572937011719, + "learning_rate": 4.281276637579728e-06, + "loss": 0.0087, + "num_input_tokens_seen": 10981920, + "step": 5611 + }, + { + "epoch": 0.7438038436050365, + "grad_norm": 5.501136302947998, + "learning_rate": 4.281033047595652e-06, + "loss": 0.0963, + "num_input_tokens_seen": 10984040, + "step": 5612 + }, + { + "epoch": 0.7439363817097415, + "grad_norm": 13.99736499786377, + "learning_rate": 4.28078942327203e-06, + "loss": 0.4102, + "num_input_tokens_seen": 10987416, + "step": 5613 + }, + { + "epoch": 0.7440689198144467, + "grad_norm": 24.510679244995117, + "learning_rate": 4.28054576461356e-06, + "loss": 0.7798, + "num_input_tokens_seen": 10990320, + "step": 5614 + }, + { + "epoch": 0.7442014579191517, + "grad_norm": 6.149999141693115, + "learning_rate": 4.280302071624939e-06, + "loss": 0.0897, + "num_input_tokens_seen": 10993664, + "step": 5615 + }, + { + "epoch": 0.7443339960238569, + "grad_norm": 8.941412925720215, + "learning_rate": 4.280058344310865e-06, + "loss": 0.1064, + "num_input_tokens_seen": 10995120, + "step": 5616 + }, + { + "epoch": 0.744466534128562, + "grad_norm": 11.968453407287598, + "learning_rate": 4.279814582676039e-06, + "loss": 0.4572, + "num_input_tokens_seen": 10996752, + "step": 5617 + }, + { + "epoch": 0.744599072233267, + "grad_norm": 9.384695053100586, + "learning_rate": 4.27957078672516e-06, + "loss": 0.2039, + "num_input_tokens_seen": 10999120, + "step": 5618 + }, + { + "epoch": 0.7447316103379722, + "grad_norm": 13.153914451599121, + "learning_rate": 4.279326956462928e-06, + "loss": 0.332, + "num_input_tokens_seen": 11000760, + "step": 5619 + }, + { + "epoch": 0.7448641484426772, + "grad_norm": 11.911666870117188, + "learning_rate": 4.279083091894046e-06, + "loss": 0.3316, + "num_input_tokens_seen": 11002224, + "step": 5620 + }, + { + "epoch": 0.7449966865473824, + "grad_norm": 1.1861519813537598, + "learning_rate": 4.278839193023214e-06, + "loss": 0.0086, + "num_input_tokens_seen": 11003656, + "step": 5621 + }, + { + "epoch": 0.7451292246520875, + "grad_norm": 4.774340629577637, + "learning_rate": 4.278595259855135e-06, + "loss": 0.0263, + "num_input_tokens_seen": 11004912, + "step": 5622 + }, + { + "epoch": 0.7452617627567926, + "grad_norm": 12.020133972167969, + "learning_rate": 4.278351292394513e-06, + "loss": 0.2986, + "num_input_tokens_seen": 11007104, + "step": 5623 + }, + { + "epoch": 0.7453943008614977, + "grad_norm": 0.41112396121025085, + "learning_rate": 4.27810729064605e-06, + "loss": 0.0029, + "num_input_tokens_seen": 11008808, + "step": 5624 + }, + { + "epoch": 0.7455268389662028, + "grad_norm": 6.758029937744141, + "learning_rate": 4.277863254614453e-06, + "loss": 0.0705, + "num_input_tokens_seen": 11011824, + "step": 5625 + }, + { + "epoch": 0.7456593770709079, + "grad_norm": 0.6719967126846313, + "learning_rate": 4.277619184304426e-06, + "loss": 0.0043, + "num_input_tokens_seen": 11014936, + "step": 5626 + }, + { + "epoch": 0.7457919151756129, + "grad_norm": 0.3601485788822174, + "learning_rate": 4.277375079720674e-06, + "loss": 0.0025, + "num_input_tokens_seen": 11016688, + "step": 5627 + }, + { + "epoch": 0.7459244532803181, + "grad_norm": 6.074212551116943, + "learning_rate": 4.277130940867905e-06, + "loss": 0.1615, + "num_input_tokens_seen": 11017800, + "step": 5628 + }, + { + "epoch": 0.7460569913850232, + "grad_norm": 13.346771240234375, + "learning_rate": 4.276886767750825e-06, + "loss": 0.34, + "num_input_tokens_seen": 11019760, + "step": 5629 + }, + { + "epoch": 0.7461895294897283, + "grad_norm": 16.044174194335938, + "learning_rate": 4.276642560374142e-06, + "loss": 0.3147, + "num_input_tokens_seen": 11021624, + "step": 5630 + }, + { + "epoch": 0.7463220675944334, + "grad_norm": 1.467759132385254, + "learning_rate": 4.276398318742565e-06, + "loss": 0.0513, + "num_input_tokens_seen": 11023552, + "step": 5631 + }, + { + "epoch": 0.7464546056991385, + "grad_norm": 0.23978865146636963, + "learning_rate": 4.276154042860804e-06, + "loss": 0.0017, + "num_input_tokens_seen": 11025520, + "step": 5632 + }, + { + "epoch": 0.7465871438038436, + "grad_norm": 14.448883056640625, + "learning_rate": 4.275909732733566e-06, + "loss": 0.4519, + "num_input_tokens_seen": 11026784, + "step": 5633 + }, + { + "epoch": 0.7467196819085488, + "grad_norm": 0.048455916345119476, + "learning_rate": 4.275665388365565e-06, + "loss": 0.0003, + "num_input_tokens_seen": 11027992, + "step": 5634 + }, + { + "epoch": 0.7468522200132538, + "grad_norm": 3.768547534942627, + "learning_rate": 4.27542100976151e-06, + "loss": 0.0426, + "num_input_tokens_seen": 11030160, + "step": 5635 + }, + { + "epoch": 0.746984758117959, + "grad_norm": 12.255239486694336, + "learning_rate": 4.275176596926112e-06, + "loss": 0.1494, + "num_input_tokens_seen": 11031408, + "step": 5636 + }, + { + "epoch": 0.747117296222664, + "grad_norm": 4.9672441482543945, + "learning_rate": 4.274932149864085e-06, + "loss": 0.0938, + "num_input_tokens_seen": 11033560, + "step": 5637 + }, + { + "epoch": 0.7472498343273691, + "grad_norm": 10.4606294631958, + "learning_rate": 4.274687668580142e-06, + "loss": 0.3691, + "num_input_tokens_seen": 11035520, + "step": 5638 + }, + { + "epoch": 0.7473823724320742, + "grad_norm": 5.626713752746582, + "learning_rate": 4.274443153078996e-06, + "loss": 0.05, + "num_input_tokens_seen": 11037704, + "step": 5639 + }, + { + "epoch": 0.7475149105367793, + "grad_norm": 10.198901176452637, + "learning_rate": 4.274198603365362e-06, + "loss": 0.1726, + "num_input_tokens_seen": 11039368, + "step": 5640 + }, + { + "epoch": 0.7476474486414845, + "grad_norm": 1.4593884944915771, + "learning_rate": 4.273954019443956e-06, + "loss": 0.0145, + "num_input_tokens_seen": 11041352, + "step": 5641 + }, + { + "epoch": 0.7477799867461895, + "grad_norm": 22.100982666015625, + "learning_rate": 4.273709401319492e-06, + "loss": 0.3743, + "num_input_tokens_seen": 11043328, + "step": 5642 + }, + { + "epoch": 0.7479125248508947, + "grad_norm": 9.474600791931152, + "learning_rate": 4.273464748996687e-06, + "loss": 0.3399, + "num_input_tokens_seen": 11046040, + "step": 5643 + }, + { + "epoch": 0.7480450629555997, + "grad_norm": 11.160884857177734, + "learning_rate": 4.2732200624802585e-06, + "loss": 0.1969, + "num_input_tokens_seen": 11047440, + "step": 5644 + }, + { + "epoch": 0.7481776010603048, + "grad_norm": 13.449312210083008, + "learning_rate": 4.272975341774923e-06, + "loss": 0.3297, + "num_input_tokens_seen": 11049072, + "step": 5645 + }, + { + "epoch": 0.74831013916501, + "grad_norm": 0.09367889910936356, + "learning_rate": 4.272730586885401e-06, + "loss": 0.0006, + "num_input_tokens_seen": 11051528, + "step": 5646 + }, + { + "epoch": 0.748442677269715, + "grad_norm": 11.46632194519043, + "learning_rate": 4.27248579781641e-06, + "loss": 0.4357, + "num_input_tokens_seen": 11053168, + "step": 5647 + }, + { + "epoch": 0.7485752153744202, + "grad_norm": 3.4057676792144775, + "learning_rate": 4.27224097457267e-06, + "loss": 0.1147, + "num_input_tokens_seen": 11055288, + "step": 5648 + }, + { + "epoch": 0.7487077534791252, + "grad_norm": 3.912752866744995, + "learning_rate": 4.271996117158901e-06, + "loss": 0.0206, + "num_input_tokens_seen": 11057128, + "step": 5649 + }, + { + "epoch": 0.7488402915838304, + "grad_norm": 0.3252025842666626, + "learning_rate": 4.271751225579825e-06, + "loss": 0.0018, + "num_input_tokens_seen": 11058624, + "step": 5650 + }, + { + "epoch": 0.7489728296885354, + "grad_norm": 9.784480094909668, + "learning_rate": 4.2715062998401624e-06, + "loss": 0.1798, + "num_input_tokens_seen": 11060184, + "step": 5651 + }, + { + "epoch": 0.7491053677932406, + "grad_norm": 3.8719005584716797, + "learning_rate": 4.271261339944637e-06, + "loss": 0.1088, + "num_input_tokens_seen": 11061800, + "step": 5652 + }, + { + "epoch": 0.7492379058979457, + "grad_norm": 10.998685836791992, + "learning_rate": 4.271016345897971e-06, + "loss": 0.1749, + "num_input_tokens_seen": 11063408, + "step": 5653 + }, + { + "epoch": 0.7493704440026507, + "grad_norm": 11.26396656036377, + "learning_rate": 4.270771317704887e-06, + "loss": 0.206, + "num_input_tokens_seen": 11065224, + "step": 5654 + }, + { + "epoch": 0.7495029821073559, + "grad_norm": 20.15559196472168, + "learning_rate": 4.270526255370112e-06, + "loss": 0.4393, + "num_input_tokens_seen": 11066448, + "step": 5655 + }, + { + "epoch": 0.7496355202120609, + "grad_norm": 19.359214782714844, + "learning_rate": 4.270281158898368e-06, + "loss": 0.3042, + "num_input_tokens_seen": 11068384, + "step": 5656 + }, + { + "epoch": 0.7497680583167661, + "grad_norm": 9.346695899963379, + "learning_rate": 4.270036028294382e-06, + "loss": 0.3392, + "num_input_tokens_seen": 11069896, + "step": 5657 + }, + { + "epoch": 0.7499005964214712, + "grad_norm": 0.2790091931819916, + "learning_rate": 4.269790863562882e-06, + "loss": 0.0018, + "num_input_tokens_seen": 11071400, + "step": 5658 + }, + { + "epoch": 0.7500331345261763, + "grad_norm": 14.131875991821289, + "learning_rate": 4.269545664708591e-06, + "loss": 0.5372, + "num_input_tokens_seen": 11072872, + "step": 5659 + }, + { + "epoch": 0.7501656726308814, + "grad_norm": 12.163456916809082, + "learning_rate": 4.2693004317362405e-06, + "loss": 0.1782, + "num_input_tokens_seen": 11074304, + "step": 5660 + }, + { + "epoch": 0.7502982107355864, + "grad_norm": 16.802536010742188, + "learning_rate": 4.269055164650556e-06, + "loss": 0.3635, + "num_input_tokens_seen": 11076160, + "step": 5661 + }, + { + "epoch": 0.7504307488402916, + "grad_norm": 3.50714373588562, + "learning_rate": 4.268809863456268e-06, + "loss": 0.0422, + "num_input_tokens_seen": 11078104, + "step": 5662 + }, + { + "epoch": 0.7505632869449966, + "grad_norm": 0.8152724504470825, + "learning_rate": 4.268564528158106e-06, + "loss": 0.0047, + "num_input_tokens_seen": 11079704, + "step": 5663 + }, + { + "epoch": 0.7506958250497018, + "grad_norm": 0.6605602502822876, + "learning_rate": 4.268319158760799e-06, + "loss": 0.0034, + "num_input_tokens_seen": 11081584, + "step": 5664 + }, + { + "epoch": 0.7508283631544069, + "grad_norm": 10.469990730285645, + "learning_rate": 4.26807375526908e-06, + "loss": 0.2902, + "num_input_tokens_seen": 11083392, + "step": 5665 + }, + { + "epoch": 0.750960901259112, + "grad_norm": 10.737537384033203, + "learning_rate": 4.267828317687677e-06, + "loss": 0.1021, + "num_input_tokens_seen": 11084656, + "step": 5666 + }, + { + "epoch": 0.7510934393638171, + "grad_norm": 13.463081359863281, + "learning_rate": 4.267582846021327e-06, + "loss": 0.2783, + "num_input_tokens_seen": 11086616, + "step": 5667 + }, + { + "epoch": 0.7512259774685222, + "grad_norm": 20.12310028076172, + "learning_rate": 4.267337340274759e-06, + "loss": 0.3446, + "num_input_tokens_seen": 11088696, + "step": 5668 + }, + { + "epoch": 0.7513585155732273, + "grad_norm": 10.932811737060547, + "learning_rate": 4.267091800452708e-06, + "loss": 0.3052, + "num_input_tokens_seen": 11090192, + "step": 5669 + }, + { + "epoch": 0.7514910536779325, + "grad_norm": 7.991088390350342, + "learning_rate": 4.266846226559908e-06, + "loss": 0.0304, + "num_input_tokens_seen": 11091640, + "step": 5670 + }, + { + "epoch": 0.7516235917826375, + "grad_norm": 13.237998962402344, + "learning_rate": 4.266600618601095e-06, + "loss": 0.3882, + "num_input_tokens_seen": 11094352, + "step": 5671 + }, + { + "epoch": 0.7517561298873426, + "grad_norm": 6.926783084869385, + "learning_rate": 4.266354976581002e-06, + "loss": 0.1678, + "num_input_tokens_seen": 11095632, + "step": 5672 + }, + { + "epoch": 0.7518886679920477, + "grad_norm": 11.170830726623535, + "learning_rate": 4.266109300504367e-06, + "loss": 0.2158, + "num_input_tokens_seen": 11097576, + "step": 5673 + }, + { + "epoch": 0.7520212060967528, + "grad_norm": 17.36977195739746, + "learning_rate": 4.265863590375926e-06, + "loss": 0.278, + "num_input_tokens_seen": 11099360, + "step": 5674 + }, + { + "epoch": 0.752153744201458, + "grad_norm": 12.258142471313477, + "learning_rate": 4.2656178462004165e-06, + "loss": 0.2109, + "num_input_tokens_seen": 11101256, + "step": 5675 + }, + { + "epoch": 0.752286282306163, + "grad_norm": 7.624269008636475, + "learning_rate": 4.265372067982577e-06, + "loss": 0.0828, + "num_input_tokens_seen": 11102656, + "step": 5676 + }, + { + "epoch": 0.7524188204108682, + "grad_norm": 10.854073524475098, + "learning_rate": 4.2651262557271465e-06, + "loss": 0.3755, + "num_input_tokens_seen": 11105160, + "step": 5677 + }, + { + "epoch": 0.7525513585155732, + "grad_norm": 0.296252578496933, + "learning_rate": 4.264880409438864e-06, + "loss": 0.002, + "num_input_tokens_seen": 11107112, + "step": 5678 + }, + { + "epoch": 0.7526838966202783, + "grad_norm": 8.36817741394043, + "learning_rate": 4.26463452912247e-06, + "loss": 0.223, + "num_input_tokens_seen": 11108840, + "step": 5679 + }, + { + "epoch": 0.7528164347249834, + "grad_norm": 5.196551322937012, + "learning_rate": 4.264388614782705e-06, + "loss": 0.1884, + "num_input_tokens_seen": 11110992, + "step": 5680 + }, + { + "epoch": 0.7529489728296885, + "grad_norm": 0.18713270127773285, + "learning_rate": 4.264142666424309e-06, + "loss": 0.0013, + "num_input_tokens_seen": 11113712, + "step": 5681 + }, + { + "epoch": 0.7530815109343937, + "grad_norm": 0.5625441074371338, + "learning_rate": 4.263896684052027e-06, + "loss": 0.0032, + "num_input_tokens_seen": 11116032, + "step": 5682 + }, + { + "epoch": 0.7532140490390987, + "grad_norm": 17.395601272583008, + "learning_rate": 4.263650667670599e-06, + "loss": 0.2589, + "num_input_tokens_seen": 11117776, + "step": 5683 + }, + { + "epoch": 0.7533465871438039, + "grad_norm": 95.3443374633789, + "learning_rate": 4.263404617284771e-06, + "loss": 0.685, + "num_input_tokens_seen": 11119696, + "step": 5684 + }, + { + "epoch": 0.7534791252485089, + "grad_norm": 7.4920454025268555, + "learning_rate": 4.263158532899284e-06, + "loss": 0.1321, + "num_input_tokens_seen": 11120904, + "step": 5685 + }, + { + "epoch": 0.7536116633532141, + "grad_norm": 8.150219917297363, + "learning_rate": 4.2629124145188846e-06, + "loss": 0.4258, + "num_input_tokens_seen": 11123392, + "step": 5686 + }, + { + "epoch": 0.7537442014579192, + "grad_norm": 14.679032325744629, + "learning_rate": 4.262666262148317e-06, + "loss": 0.4803, + "num_input_tokens_seen": 11125096, + "step": 5687 + }, + { + "epoch": 0.7538767395626242, + "grad_norm": 6.052586555480957, + "learning_rate": 4.262420075792328e-06, + "loss": 0.2023, + "num_input_tokens_seen": 11126904, + "step": 5688 + }, + { + "epoch": 0.7540092776673294, + "grad_norm": 2.7921276092529297, + "learning_rate": 4.262173855455665e-06, + "loss": 0.0174, + "num_input_tokens_seen": 11129232, + "step": 5689 + }, + { + "epoch": 0.7541418157720344, + "grad_norm": 11.46022891998291, + "learning_rate": 4.2619276011430735e-06, + "loss": 0.177, + "num_input_tokens_seen": 11130584, + "step": 5690 + }, + { + "epoch": 0.7542743538767396, + "grad_norm": 10.517122268676758, + "learning_rate": 4.261681312859303e-06, + "loss": 0.2271, + "num_input_tokens_seen": 11132832, + "step": 5691 + }, + { + "epoch": 0.7544068919814446, + "grad_norm": 12.965659141540527, + "learning_rate": 4.261434990609102e-06, + "loss": 0.3185, + "num_input_tokens_seen": 11134320, + "step": 5692 + }, + { + "epoch": 0.7545394300861498, + "grad_norm": 9.949737548828125, + "learning_rate": 4.261188634397218e-06, + "loss": 0.2886, + "num_input_tokens_seen": 11136016, + "step": 5693 + }, + { + "epoch": 0.7546719681908549, + "grad_norm": 0.2930799126625061, + "learning_rate": 4.260942244228403e-06, + "loss": 0.002, + "num_input_tokens_seen": 11138088, + "step": 5694 + }, + { + "epoch": 0.75480450629556, + "grad_norm": 0.10911756008863449, + "learning_rate": 4.2606958201074065e-06, + "loss": 0.0006, + "num_input_tokens_seen": 11139832, + "step": 5695 + }, + { + "epoch": 0.7549370444002651, + "grad_norm": 6.850417613983154, + "learning_rate": 4.26044936203898e-06, + "loss": 0.1509, + "num_input_tokens_seen": 11141696, + "step": 5696 + }, + { + "epoch": 0.7550695825049701, + "grad_norm": 10.386096954345703, + "learning_rate": 4.260202870027875e-06, + "loss": 0.2916, + "num_input_tokens_seen": 11144208, + "step": 5697 + }, + { + "epoch": 0.7552021206096753, + "grad_norm": 6.587453842163086, + "learning_rate": 4.2599563440788434e-06, + "loss": 0.1517, + "num_input_tokens_seen": 11145672, + "step": 5698 + }, + { + "epoch": 0.7553346587143804, + "grad_norm": 0.27563661336898804, + "learning_rate": 4.259709784196641e-06, + "loss": 0.0018, + "num_input_tokens_seen": 11147368, + "step": 5699 + }, + { + "epoch": 0.7554671968190855, + "grad_norm": 0.030336998403072357, + "learning_rate": 4.259463190386019e-06, + "loss": 0.0002, + "num_input_tokens_seen": 11148776, + "step": 5700 + }, + { + "epoch": 0.7555997349237906, + "grad_norm": 7.526874542236328, + "learning_rate": 4.259216562651734e-06, + "loss": 0.1515, + "num_input_tokens_seen": 11150616, + "step": 5701 + }, + { + "epoch": 0.7557322730284957, + "grad_norm": 0.04693884402513504, + "learning_rate": 4.25896990099854e-06, + "loss": 0.0003, + "num_input_tokens_seen": 11152736, + "step": 5702 + }, + { + "epoch": 0.7558648111332008, + "grad_norm": 11.271683692932129, + "learning_rate": 4.258723205431192e-06, + "loss": 0.4579, + "num_input_tokens_seen": 11155136, + "step": 5703 + }, + { + "epoch": 0.7559973492379058, + "grad_norm": 0.5262144207954407, + "learning_rate": 4.258476475954447e-06, + "loss": 0.0034, + "num_input_tokens_seen": 11157992, + "step": 5704 + }, + { + "epoch": 0.756129887342611, + "grad_norm": 4.162840843200684, + "learning_rate": 4.258229712573063e-06, + "loss": 0.0762, + "num_input_tokens_seen": 11159712, + "step": 5705 + }, + { + "epoch": 0.7562624254473161, + "grad_norm": 0.06144149601459503, + "learning_rate": 4.2579829152917975e-06, + "loss": 0.0004, + "num_input_tokens_seen": 11162296, + "step": 5706 + }, + { + "epoch": 0.7563949635520212, + "grad_norm": 2.904346466064453, + "learning_rate": 4.257736084115408e-06, + "loss": 0.0655, + "num_input_tokens_seen": 11164272, + "step": 5707 + }, + { + "epoch": 0.7565275016567263, + "grad_norm": 8.711077690124512, + "learning_rate": 4.257489219048655e-06, + "loss": 0.109, + "num_input_tokens_seen": 11166160, + "step": 5708 + }, + { + "epoch": 0.7566600397614314, + "grad_norm": 4.689111232757568, + "learning_rate": 4.2572423200962965e-06, + "loss": 0.14, + "num_input_tokens_seen": 11167848, + "step": 5709 + }, + { + "epoch": 0.7567925778661365, + "grad_norm": 10.667964935302734, + "learning_rate": 4.256995387263095e-06, + "loss": 0.3981, + "num_input_tokens_seen": 11169672, + "step": 5710 + }, + { + "epoch": 0.7569251159708417, + "grad_norm": 5.06772518157959, + "learning_rate": 4.256748420553809e-06, + "loss": 0.1634, + "num_input_tokens_seen": 11170880, + "step": 5711 + }, + { + "epoch": 0.7570576540755467, + "grad_norm": 23.774381637573242, + "learning_rate": 4.256501419973203e-06, + "loss": 0.8916, + "num_input_tokens_seen": 11173784, + "step": 5712 + }, + { + "epoch": 0.7571901921802519, + "grad_norm": 10.791685104370117, + "learning_rate": 4.2562543855260365e-06, + "loss": 0.389, + "num_input_tokens_seen": 11175808, + "step": 5713 + }, + { + "epoch": 0.7573227302849569, + "grad_norm": 12.001212120056152, + "learning_rate": 4.256007317217075e-06, + "loss": 0.4446, + "num_input_tokens_seen": 11177704, + "step": 5714 + }, + { + "epoch": 0.757455268389662, + "grad_norm": 6.769618511199951, + "learning_rate": 4.2557602150510805e-06, + "loss": 0.1965, + "num_input_tokens_seen": 11179096, + "step": 5715 + }, + { + "epoch": 0.7575878064943671, + "grad_norm": 1.5752532482147217, + "learning_rate": 4.255513079032817e-06, + "loss": 0.0329, + "num_input_tokens_seen": 11180368, + "step": 5716 + }, + { + "epoch": 0.7577203445990722, + "grad_norm": 19.579715728759766, + "learning_rate": 4.2552659091670515e-06, + "loss": 0.8356, + "num_input_tokens_seen": 11182816, + "step": 5717 + }, + { + "epoch": 0.7578528827037774, + "grad_norm": 0.31605860590934753, + "learning_rate": 4.2550187054585475e-06, + "loss": 0.0022, + "num_input_tokens_seen": 11185040, + "step": 5718 + }, + { + "epoch": 0.7579854208084824, + "grad_norm": 0.7418410181999207, + "learning_rate": 4.254771467912072e-06, + "loss": 0.0053, + "num_input_tokens_seen": 11186296, + "step": 5719 + }, + { + "epoch": 0.7581179589131876, + "grad_norm": 3.4883127212524414, + "learning_rate": 4.254524196532393e-06, + "loss": 0.0703, + "num_input_tokens_seen": 11188504, + "step": 5720 + }, + { + "epoch": 0.7582504970178926, + "grad_norm": 0.44677528738975525, + "learning_rate": 4.254276891324277e-06, + "loss": 0.0028, + "num_input_tokens_seen": 11190544, + "step": 5721 + }, + { + "epoch": 0.7583830351225977, + "grad_norm": 7.910586833953857, + "learning_rate": 4.254029552292491e-06, + "loss": 0.1835, + "num_input_tokens_seen": 11192256, + "step": 5722 + }, + { + "epoch": 0.7585155732273029, + "grad_norm": 0.13156349956989288, + "learning_rate": 4.253782179441807e-06, + "loss": 0.0009, + "num_input_tokens_seen": 11193736, + "step": 5723 + }, + { + "epoch": 0.7586481113320079, + "grad_norm": 0.3557851016521454, + "learning_rate": 4.253534772776991e-06, + "loss": 0.0026, + "num_input_tokens_seen": 11196472, + "step": 5724 + }, + { + "epoch": 0.7587806494367131, + "grad_norm": 0.40530019998550415, + "learning_rate": 4.253287332302817e-06, + "loss": 0.0029, + "num_input_tokens_seen": 11198272, + "step": 5725 + }, + { + "epoch": 0.7589131875414181, + "grad_norm": 0.18909013271331787, + "learning_rate": 4.253039858024052e-06, + "loss": 0.0013, + "num_input_tokens_seen": 11201264, + "step": 5726 + }, + { + "epoch": 0.7590457256461233, + "grad_norm": 4.389664649963379, + "learning_rate": 4.2527923499454695e-06, + "loss": 0.0313, + "num_input_tokens_seen": 11202792, + "step": 5727 + }, + { + "epoch": 0.7591782637508283, + "grad_norm": 0.36746108531951904, + "learning_rate": 4.252544808071841e-06, + "loss": 0.0023, + "num_input_tokens_seen": 11203968, + "step": 5728 + }, + { + "epoch": 0.7593108018555335, + "grad_norm": 3.4134554862976074, + "learning_rate": 4.2522972324079404e-06, + "loss": 0.0798, + "num_input_tokens_seen": 11206224, + "step": 5729 + }, + { + "epoch": 0.7594433399602386, + "grad_norm": 16.001447677612305, + "learning_rate": 4.25204962295854e-06, + "loss": 0.4242, + "num_input_tokens_seen": 11208312, + "step": 5730 + }, + { + "epoch": 0.7595758780649436, + "grad_norm": 1.2767168283462524, + "learning_rate": 4.251801979728415e-06, + "loss": 0.0085, + "num_input_tokens_seen": 11209776, + "step": 5731 + }, + { + "epoch": 0.7597084161696488, + "grad_norm": 3.2513034343719482, + "learning_rate": 4.2515543027223375e-06, + "loss": 0.1984, + "num_input_tokens_seen": 11211376, + "step": 5732 + }, + { + "epoch": 0.7598409542743538, + "grad_norm": 0.1131301000714302, + "learning_rate": 4.251306591945086e-06, + "loss": 0.0008, + "num_input_tokens_seen": 11213240, + "step": 5733 + }, + { + "epoch": 0.759973492379059, + "grad_norm": 2.8060240745544434, + "learning_rate": 4.2510588474014345e-06, + "loss": 0.034, + "num_input_tokens_seen": 11215328, + "step": 5734 + }, + { + "epoch": 0.7601060304837641, + "grad_norm": 10.863306999206543, + "learning_rate": 4.250811069096161e-06, + "loss": 0.245, + "num_input_tokens_seen": 11217816, + "step": 5735 + }, + { + "epoch": 0.7602385685884692, + "grad_norm": 2.7661736011505127, + "learning_rate": 4.250563257034043e-06, + "loss": 0.0474, + "num_input_tokens_seen": 11219320, + "step": 5736 + }, + { + "epoch": 0.7603711066931743, + "grad_norm": 16.974578857421875, + "learning_rate": 4.250315411219856e-06, + "loss": 0.6565, + "num_input_tokens_seen": 11221976, + "step": 5737 + }, + { + "epoch": 0.7605036447978794, + "grad_norm": 68.89404296875, + "learning_rate": 4.250067531658381e-06, + "loss": 0.593, + "num_input_tokens_seen": 11224280, + "step": 5738 + }, + { + "epoch": 0.7606361829025845, + "grad_norm": 8.063910484313965, + "learning_rate": 4.249819618354397e-06, + "loss": 0.2135, + "num_input_tokens_seen": 11225520, + "step": 5739 + }, + { + "epoch": 0.7607687210072896, + "grad_norm": 5.7894721031188965, + "learning_rate": 4.249571671312683e-06, + "loss": 0.1751, + "num_input_tokens_seen": 11227248, + "step": 5740 + }, + { + "epoch": 0.7609012591119947, + "grad_norm": 6.145232200622559, + "learning_rate": 4.249323690538021e-06, + "loss": 0.0193, + "num_input_tokens_seen": 11228504, + "step": 5741 + }, + { + "epoch": 0.7610337972166998, + "grad_norm": 25.760915756225586, + "learning_rate": 4.249075676035192e-06, + "loss": 0.631, + "num_input_tokens_seen": 11230136, + "step": 5742 + }, + { + "epoch": 0.7611663353214049, + "grad_norm": 10.716141700744629, + "learning_rate": 4.248827627808976e-06, + "loss": 0.2224, + "num_input_tokens_seen": 11231944, + "step": 5743 + }, + { + "epoch": 0.76129887342611, + "grad_norm": 3.7902352809906006, + "learning_rate": 4.2485795458641575e-06, + "loss": 0.0531, + "num_input_tokens_seen": 11233696, + "step": 5744 + }, + { + "epoch": 0.7614314115308151, + "grad_norm": 10.52564525604248, + "learning_rate": 4.248331430205519e-06, + "loss": 0.3176, + "num_input_tokens_seen": 11235800, + "step": 5745 + }, + { + "epoch": 0.7615639496355202, + "grad_norm": 0.18763208389282227, + "learning_rate": 4.248083280837845e-06, + "loss": 0.0012, + "num_input_tokens_seen": 11237424, + "step": 5746 + }, + { + "epoch": 0.7616964877402254, + "grad_norm": 7.817233562469482, + "learning_rate": 4.2478350977659185e-06, + "loss": 0.3156, + "num_input_tokens_seen": 11239072, + "step": 5747 + }, + { + "epoch": 0.7618290258449304, + "grad_norm": 6.89839506149292, + "learning_rate": 4.2475868809945256e-06, + "loss": 0.0743, + "num_input_tokens_seen": 11241128, + "step": 5748 + }, + { + "epoch": 0.7619615639496355, + "grad_norm": 5.536786079406738, + "learning_rate": 4.2473386305284526e-06, + "loss": 0.1883, + "num_input_tokens_seen": 11242688, + "step": 5749 + }, + { + "epoch": 0.7620941020543406, + "grad_norm": 6.102610111236572, + "learning_rate": 4.247090346372484e-06, + "loss": 0.1958, + "num_input_tokens_seen": 11245200, + "step": 5750 + }, + { + "epoch": 0.7622266401590457, + "grad_norm": 11.152328491210938, + "learning_rate": 4.246842028531411e-06, + "loss": 0.3336, + "num_input_tokens_seen": 11247352, + "step": 5751 + }, + { + "epoch": 0.7623591782637509, + "grad_norm": 11.340971946716309, + "learning_rate": 4.246593677010016e-06, + "loss": 0.357, + "num_input_tokens_seen": 11248640, + "step": 5752 + }, + { + "epoch": 0.7624917163684559, + "grad_norm": 10.115334510803223, + "learning_rate": 4.246345291813091e-06, + "loss": 0.3296, + "num_input_tokens_seen": 11251040, + "step": 5753 + }, + { + "epoch": 0.7626242544731611, + "grad_norm": 8.057600021362305, + "learning_rate": 4.246096872945425e-06, + "loss": 0.1341, + "num_input_tokens_seen": 11253256, + "step": 5754 + }, + { + "epoch": 0.7627567925778661, + "grad_norm": 6.15314245223999, + "learning_rate": 4.2458484204118045e-06, + "loss": 0.0862, + "num_input_tokens_seen": 11254720, + "step": 5755 + }, + { + "epoch": 0.7628893306825713, + "grad_norm": 0.5579625368118286, + "learning_rate": 4.245599934217024e-06, + "loss": 0.0036, + "num_input_tokens_seen": 11256616, + "step": 5756 + }, + { + "epoch": 0.7630218687872763, + "grad_norm": 2.6828835010528564, + "learning_rate": 4.245351414365871e-06, + "loss": 0.0532, + "num_input_tokens_seen": 11258128, + "step": 5757 + }, + { + "epoch": 0.7631544068919814, + "grad_norm": 5.420414924621582, + "learning_rate": 4.245102860863139e-06, + "loss": 0.0965, + "num_input_tokens_seen": 11259568, + "step": 5758 + }, + { + "epoch": 0.7632869449966866, + "grad_norm": 6.338709354400635, + "learning_rate": 4.244854273713621e-06, + "loss": 0.1135, + "num_input_tokens_seen": 11261024, + "step": 5759 + }, + { + "epoch": 0.7634194831013916, + "grad_norm": 8.973888397216797, + "learning_rate": 4.244605652922108e-06, + "loss": 0.3457, + "num_input_tokens_seen": 11263328, + "step": 5760 + }, + { + "epoch": 0.7635520212060968, + "grad_norm": 1.4278415441513062, + "learning_rate": 4.2443569984933955e-06, + "loss": 0.0091, + "num_input_tokens_seen": 11265048, + "step": 5761 + }, + { + "epoch": 0.7636845593108018, + "grad_norm": 4.837559223175049, + "learning_rate": 4.244108310432275e-06, + "loss": 0.131, + "num_input_tokens_seen": 11267448, + "step": 5762 + }, + { + "epoch": 0.763817097415507, + "grad_norm": 1.7924344539642334, + "learning_rate": 4.2438595887435455e-06, + "loss": 0.0108, + "num_input_tokens_seen": 11269192, + "step": 5763 + }, + { + "epoch": 0.7639496355202121, + "grad_norm": 20.689882278442383, + "learning_rate": 4.243610833431998e-06, + "loss": 0.5443, + "num_input_tokens_seen": 11271352, + "step": 5764 + }, + { + "epoch": 0.7640821736249171, + "grad_norm": 10.855278015136719, + "learning_rate": 4.243362044502432e-06, + "loss": 0.3385, + "num_input_tokens_seen": 11274176, + "step": 5765 + }, + { + "epoch": 0.7642147117296223, + "grad_norm": 16.532663345336914, + "learning_rate": 4.243113221959641e-06, + "loss": 0.5585, + "num_input_tokens_seen": 11276488, + "step": 5766 + }, + { + "epoch": 0.7643472498343273, + "grad_norm": 0.15407158434391022, + "learning_rate": 4.242864365808428e-06, + "loss": 0.0011, + "num_input_tokens_seen": 11277992, + "step": 5767 + }, + { + "epoch": 0.7644797879390325, + "grad_norm": 16.419313430786133, + "learning_rate": 4.242615476053585e-06, + "loss": 0.4792, + "num_input_tokens_seen": 11279448, + "step": 5768 + }, + { + "epoch": 0.7646123260437375, + "grad_norm": 0.0714157372713089, + "learning_rate": 4.2423665526999135e-06, + "loss": 0.0005, + "num_input_tokens_seen": 11280896, + "step": 5769 + }, + { + "epoch": 0.7647448641484427, + "grad_norm": 2.8343448638916016, + "learning_rate": 4.242117595752214e-06, + "loss": 0.057, + "num_input_tokens_seen": 11282488, + "step": 5770 + }, + { + "epoch": 0.7648774022531478, + "grad_norm": 17.782989501953125, + "learning_rate": 4.241868605215285e-06, + "loss": 0.4962, + "num_input_tokens_seen": 11284400, + "step": 5771 + }, + { + "epoch": 0.7650099403578529, + "grad_norm": 0.12318360805511475, + "learning_rate": 4.2416195810939274e-06, + "loss": 0.0009, + "num_input_tokens_seen": 11285848, + "step": 5772 + }, + { + "epoch": 0.765142478462558, + "grad_norm": 0.054391246289014816, + "learning_rate": 4.241370523392943e-06, + "loss": 0.0004, + "num_input_tokens_seen": 11287568, + "step": 5773 + }, + { + "epoch": 0.765275016567263, + "grad_norm": 7.675429821014404, + "learning_rate": 4.241121432117134e-06, + "loss": 0.3058, + "num_input_tokens_seen": 11289568, + "step": 5774 + }, + { + "epoch": 0.7654075546719682, + "grad_norm": 0.055851031094789505, + "learning_rate": 4.240872307271303e-06, + "loss": 0.0004, + "num_input_tokens_seen": 11290752, + "step": 5775 + }, + { + "epoch": 0.7655400927766733, + "grad_norm": 0.0902419164776802, + "learning_rate": 4.240623148860252e-06, + "loss": 0.0006, + "num_input_tokens_seen": 11291984, + "step": 5776 + }, + { + "epoch": 0.7656726308813784, + "grad_norm": 13.362424850463867, + "learning_rate": 4.240373956888786e-06, + "loss": 0.4817, + "num_input_tokens_seen": 11293688, + "step": 5777 + }, + { + "epoch": 0.7658051689860835, + "grad_norm": 7.325118064880371, + "learning_rate": 4.24012473136171e-06, + "loss": 0.2205, + "num_input_tokens_seen": 11295616, + "step": 5778 + }, + { + "epoch": 0.7659377070907886, + "grad_norm": 8.576746940612793, + "learning_rate": 4.239875472283828e-06, + "loss": 0.3081, + "num_input_tokens_seen": 11297944, + "step": 5779 + }, + { + "epoch": 0.7660702451954937, + "grad_norm": 5.8745551109313965, + "learning_rate": 4.239626179659947e-06, + "loss": 0.1959, + "num_input_tokens_seen": 11300520, + "step": 5780 + }, + { + "epoch": 0.7662027833001988, + "grad_norm": 3.4584062099456787, + "learning_rate": 4.239376853494873e-06, + "loss": 0.0785, + "num_input_tokens_seen": 11302048, + "step": 5781 + }, + { + "epoch": 0.7663353214049039, + "grad_norm": 9.096794128417969, + "learning_rate": 4.2391274937934135e-06, + "loss": 0.2595, + "num_input_tokens_seen": 11304960, + "step": 5782 + }, + { + "epoch": 0.766467859509609, + "grad_norm": 3.385596752166748, + "learning_rate": 4.238878100560377e-06, + "loss": 0.0762, + "num_input_tokens_seen": 11306832, + "step": 5783 + }, + { + "epoch": 0.7666003976143141, + "grad_norm": 6.691192626953125, + "learning_rate": 4.2386286738005695e-06, + "loss": 0.108, + "num_input_tokens_seen": 11308416, + "step": 5784 + }, + { + "epoch": 0.7667329357190192, + "grad_norm": 6.726831912994385, + "learning_rate": 4.238379213518803e-06, + "loss": 0.1552, + "num_input_tokens_seen": 11310168, + "step": 5785 + }, + { + "epoch": 0.7668654738237243, + "grad_norm": 8.928049087524414, + "learning_rate": 4.238129719719885e-06, + "loss": 0.4044, + "num_input_tokens_seen": 11312600, + "step": 5786 + }, + { + "epoch": 0.7669980119284294, + "grad_norm": 1.7198240756988525, + "learning_rate": 4.237880192408628e-06, + "loss": 0.0395, + "num_input_tokens_seen": 11314504, + "step": 5787 + }, + { + "epoch": 0.7671305500331346, + "grad_norm": 5.724818706512451, + "learning_rate": 4.2376306315898404e-06, + "loss": 0.2125, + "num_input_tokens_seen": 11317064, + "step": 5788 + }, + { + "epoch": 0.7672630881378396, + "grad_norm": 2.7203867435455322, + "learning_rate": 4.237381037268337e-06, + "loss": 0.0425, + "num_input_tokens_seen": 11319800, + "step": 5789 + }, + { + "epoch": 0.7673956262425448, + "grad_norm": 4.671877861022949, + "learning_rate": 4.237131409448928e-06, + "loss": 0.0863, + "num_input_tokens_seen": 11322184, + "step": 5790 + }, + { + "epoch": 0.7675281643472498, + "grad_norm": 11.119823455810547, + "learning_rate": 4.236881748136428e-06, + "loss": 0.3863, + "num_input_tokens_seen": 11325104, + "step": 5791 + }, + { + "epoch": 0.767660702451955, + "grad_norm": 8.732832908630371, + "learning_rate": 4.236632053335649e-06, + "loss": 0.3502, + "num_input_tokens_seen": 11326784, + "step": 5792 + }, + { + "epoch": 0.7677932405566601, + "grad_norm": 20.620769500732422, + "learning_rate": 4.2363823250514055e-06, + "loss": 0.7042, + "num_input_tokens_seen": 11328608, + "step": 5793 + }, + { + "epoch": 0.7679257786613651, + "grad_norm": 6.200304985046387, + "learning_rate": 4.2361325632885134e-06, + "loss": 0.1491, + "num_input_tokens_seen": 11329720, + "step": 5794 + }, + { + "epoch": 0.7680583167660703, + "grad_norm": 0.26988041400909424, + "learning_rate": 4.235882768051788e-06, + "loss": 0.0019, + "num_input_tokens_seen": 11331848, + "step": 5795 + }, + { + "epoch": 0.7681908548707753, + "grad_norm": 0.5760315656661987, + "learning_rate": 4.235632939346044e-06, + "loss": 0.0043, + "num_input_tokens_seen": 11333280, + "step": 5796 + }, + { + "epoch": 0.7683233929754805, + "grad_norm": 2.4314656257629395, + "learning_rate": 4.2353830771761e-06, + "loss": 0.0265, + "num_input_tokens_seen": 11334816, + "step": 5797 + }, + { + "epoch": 0.7684559310801855, + "grad_norm": 10.259264945983887, + "learning_rate": 4.2351331815467745e-06, + "loss": 0.3365, + "num_input_tokens_seen": 11336928, + "step": 5798 + }, + { + "epoch": 0.7685884691848907, + "grad_norm": 16.81617546081543, + "learning_rate": 4.234883252462884e-06, + "loss": 0.6191, + "num_input_tokens_seen": 11338192, + "step": 5799 + }, + { + "epoch": 0.7687210072895958, + "grad_norm": 15.31674575805664, + "learning_rate": 4.234633289929246e-06, + "loss": 0.5258, + "num_input_tokens_seen": 11339936, + "step": 5800 + }, + { + "epoch": 0.7688535453943008, + "grad_norm": 8.937020301818848, + "learning_rate": 4.234383293950683e-06, + "loss": 0.1926, + "num_input_tokens_seen": 11341960, + "step": 5801 + }, + { + "epoch": 0.768986083499006, + "grad_norm": 0.5101362466812134, + "learning_rate": 4.234133264532012e-06, + "loss": 0.0037, + "num_input_tokens_seen": 11344112, + "step": 5802 + }, + { + "epoch": 0.769118621603711, + "grad_norm": 0.22446228563785553, + "learning_rate": 4.233883201678057e-06, + "loss": 0.0016, + "num_input_tokens_seen": 11345184, + "step": 5803 + }, + { + "epoch": 0.7692511597084162, + "grad_norm": 0.22919607162475586, + "learning_rate": 4.233633105393637e-06, + "loss": 0.0016, + "num_input_tokens_seen": 11348776, + "step": 5804 + }, + { + "epoch": 0.7693836978131213, + "grad_norm": 0.34706634283065796, + "learning_rate": 4.233382975683576e-06, + "loss": 0.0025, + "num_input_tokens_seen": 11350696, + "step": 5805 + }, + { + "epoch": 0.7695162359178264, + "grad_norm": 1.915558934211731, + "learning_rate": 4.2331328125526934e-06, + "loss": 0.0091, + "num_input_tokens_seen": 11352888, + "step": 5806 + }, + { + "epoch": 0.7696487740225315, + "grad_norm": 7.9133124351501465, + "learning_rate": 4.232882616005816e-06, + "loss": 0.1214, + "num_input_tokens_seen": 11354000, + "step": 5807 + }, + { + "epoch": 0.7697813121272365, + "grad_norm": 7.735172271728516, + "learning_rate": 4.232632386047766e-06, + "loss": 0.1685, + "num_input_tokens_seen": 11355696, + "step": 5808 + }, + { + "epoch": 0.7699138502319417, + "grad_norm": 0.5019923448562622, + "learning_rate": 4.232382122683367e-06, + "loss": 0.0035, + "num_input_tokens_seen": 11357928, + "step": 5809 + }, + { + "epoch": 0.7700463883366467, + "grad_norm": 25.216238021850586, + "learning_rate": 4.232131825917447e-06, + "loss": 0.8289, + "num_input_tokens_seen": 11360544, + "step": 5810 + }, + { + "epoch": 0.7701789264413519, + "grad_norm": 0.34986865520477295, + "learning_rate": 4.2318814957548304e-06, + "loss": 0.0024, + "num_input_tokens_seen": 11361616, + "step": 5811 + }, + { + "epoch": 0.770311464546057, + "grad_norm": 7.062240123748779, + "learning_rate": 4.231631132200344e-06, + "loss": 0.1439, + "num_input_tokens_seen": 11363616, + "step": 5812 + }, + { + "epoch": 0.7704440026507621, + "grad_norm": 6.503780364990234, + "learning_rate": 4.231380735258813e-06, + "loss": 0.2644, + "num_input_tokens_seen": 11365592, + "step": 5813 + }, + { + "epoch": 0.7705765407554672, + "grad_norm": 6.799583911895752, + "learning_rate": 4.231130304935069e-06, + "loss": 0.2983, + "num_input_tokens_seen": 11367328, + "step": 5814 + }, + { + "epoch": 0.7707090788601723, + "grad_norm": 9.331462860107422, + "learning_rate": 4.230879841233938e-06, + "loss": 0.1279, + "num_input_tokens_seen": 11368976, + "step": 5815 + }, + { + "epoch": 0.7708416169648774, + "grad_norm": 0.8686022162437439, + "learning_rate": 4.230629344160249e-06, + "loss": 0.0047, + "num_input_tokens_seen": 11370640, + "step": 5816 + }, + { + "epoch": 0.7709741550695826, + "grad_norm": 13.30678939819336, + "learning_rate": 4.230378813718833e-06, + "loss": 0.3561, + "num_input_tokens_seen": 11372640, + "step": 5817 + }, + { + "epoch": 0.7711066931742876, + "grad_norm": 0.05788769572973251, + "learning_rate": 4.230128249914519e-06, + "loss": 0.0004, + "num_input_tokens_seen": 11373912, + "step": 5818 + }, + { + "epoch": 0.7712392312789927, + "grad_norm": 10.012808799743652, + "learning_rate": 4.2298776527521405e-06, + "loss": 0.3106, + "num_input_tokens_seen": 11375936, + "step": 5819 + }, + { + "epoch": 0.7713717693836978, + "grad_norm": 7.473123073577881, + "learning_rate": 4.229627022236526e-06, + "loss": 0.2177, + "num_input_tokens_seen": 11378088, + "step": 5820 + }, + { + "epoch": 0.7715043074884029, + "grad_norm": 7.057112693786621, + "learning_rate": 4.229376358372509e-06, + "loss": 0.2336, + "num_input_tokens_seen": 11381456, + "step": 5821 + }, + { + "epoch": 0.771636845593108, + "grad_norm": 5.974050045013428, + "learning_rate": 4.229125661164923e-06, + "loss": 0.103, + "num_input_tokens_seen": 11383832, + "step": 5822 + }, + { + "epoch": 0.7717693836978131, + "grad_norm": 15.61795711517334, + "learning_rate": 4.2288749306186015e-06, + "loss": 0.5149, + "num_input_tokens_seen": 11385368, + "step": 5823 + }, + { + "epoch": 0.7719019218025183, + "grad_norm": 9.622578620910645, + "learning_rate": 4.228624166738379e-06, + "loss": 0.3353, + "num_input_tokens_seen": 11387544, + "step": 5824 + }, + { + "epoch": 0.7720344599072233, + "grad_norm": 10.490093231201172, + "learning_rate": 4.228373369529089e-06, + "loss": 0.1436, + "num_input_tokens_seen": 11389496, + "step": 5825 + }, + { + "epoch": 0.7721669980119285, + "grad_norm": 0.08071985840797424, + "learning_rate": 4.22812253899557e-06, + "loss": 0.0005, + "num_input_tokens_seen": 11391080, + "step": 5826 + }, + { + "epoch": 0.7722995361166335, + "grad_norm": 23.108776092529297, + "learning_rate": 4.227871675142655e-06, + "loss": 0.316, + "num_input_tokens_seen": 11393320, + "step": 5827 + }, + { + "epoch": 0.7724320742213386, + "grad_norm": 13.226712226867676, + "learning_rate": 4.2276207779751825e-06, + "loss": 0.3802, + "num_input_tokens_seen": 11394968, + "step": 5828 + }, + { + "epoch": 0.7725646123260438, + "grad_norm": 9.378782272338867, + "learning_rate": 4.227369847497989e-06, + "loss": 0.1151, + "num_input_tokens_seen": 11395968, + "step": 5829 + }, + { + "epoch": 0.7726971504307488, + "grad_norm": 0.11491656303405762, + "learning_rate": 4.227118883715914e-06, + "loss": 0.0007, + "num_input_tokens_seen": 11397296, + "step": 5830 + }, + { + "epoch": 0.772829688535454, + "grad_norm": 8.225726127624512, + "learning_rate": 4.226867886633796e-06, + "loss": 0.1548, + "num_input_tokens_seen": 11398480, + "step": 5831 + }, + { + "epoch": 0.772962226640159, + "grad_norm": 2.115953207015991, + "learning_rate": 4.226616856256473e-06, + "loss": 0.0319, + "num_input_tokens_seen": 11399752, + "step": 5832 + }, + { + "epoch": 0.7730947647448642, + "grad_norm": 13.04377555847168, + "learning_rate": 4.226365792588785e-06, + "loss": 0.6515, + "num_input_tokens_seen": 11401496, + "step": 5833 + }, + { + "epoch": 0.7732273028495692, + "grad_norm": 0.051223721355199814, + "learning_rate": 4.226114695635575e-06, + "loss": 0.0003, + "num_input_tokens_seen": 11403016, + "step": 5834 + }, + { + "epoch": 0.7733598409542743, + "grad_norm": 3.2167584896087646, + "learning_rate": 4.225863565401681e-06, + "loss": 0.1053, + "num_input_tokens_seen": 11405424, + "step": 5835 + }, + { + "epoch": 0.7734923790589795, + "grad_norm": 0.976586103439331, + "learning_rate": 4.225612401891949e-06, + "loss": 0.0064, + "num_input_tokens_seen": 11407816, + "step": 5836 + }, + { + "epoch": 0.7736249171636845, + "grad_norm": 7.798348426818848, + "learning_rate": 4.225361205111218e-06, + "loss": 0.2249, + "num_input_tokens_seen": 11411080, + "step": 5837 + }, + { + "epoch": 0.7737574552683897, + "grad_norm": 6.437674045562744, + "learning_rate": 4.225109975064333e-06, + "loss": 0.1271, + "num_input_tokens_seen": 11412896, + "step": 5838 + }, + { + "epoch": 0.7738899933730947, + "grad_norm": 14.131633758544922, + "learning_rate": 4.224858711756138e-06, + "loss": 0.3775, + "num_input_tokens_seen": 11414928, + "step": 5839 + }, + { + "epoch": 0.7740225314777999, + "grad_norm": 7.073689937591553, + "learning_rate": 4.2246074151914765e-06, + "loss": 0.2157, + "num_input_tokens_seen": 11417576, + "step": 5840 + }, + { + "epoch": 0.774155069582505, + "grad_norm": 5.393301010131836, + "learning_rate": 4.224356085375195e-06, + "loss": 0.097, + "num_input_tokens_seen": 11419360, + "step": 5841 + }, + { + "epoch": 0.77428760768721, + "grad_norm": 9.1361722946167, + "learning_rate": 4.224104722312139e-06, + "loss": 0.2603, + "num_input_tokens_seen": 11421224, + "step": 5842 + }, + { + "epoch": 0.7744201457919152, + "grad_norm": 4.029488563537598, + "learning_rate": 4.223853326007154e-06, + "loss": 0.1309, + "num_input_tokens_seen": 11422952, + "step": 5843 + }, + { + "epoch": 0.7745526838966202, + "grad_norm": 9.863259315490723, + "learning_rate": 4.223601896465087e-06, + "loss": 0.3336, + "num_input_tokens_seen": 11424960, + "step": 5844 + }, + { + "epoch": 0.7746852220013254, + "grad_norm": 2.2270891666412354, + "learning_rate": 4.223350433690787e-06, + "loss": 0.0086, + "num_input_tokens_seen": 11426328, + "step": 5845 + }, + { + "epoch": 0.7748177601060304, + "grad_norm": 4.552037715911865, + "learning_rate": 4.223098937689102e-06, + "loss": 0.1021, + "num_input_tokens_seen": 11428016, + "step": 5846 + }, + { + "epoch": 0.7749502982107356, + "grad_norm": 3.9036269187927246, + "learning_rate": 4.22284740846488e-06, + "loss": 0.1306, + "num_input_tokens_seen": 11429576, + "step": 5847 + }, + { + "epoch": 0.7750828363154407, + "grad_norm": 1.1713247299194336, + "learning_rate": 4.222595846022972e-06, + "loss": 0.0066, + "num_input_tokens_seen": 11430832, + "step": 5848 + }, + { + "epoch": 0.7752153744201458, + "grad_norm": 0.15521301329135895, + "learning_rate": 4.222344250368228e-06, + "loss": 0.0011, + "num_input_tokens_seen": 11432456, + "step": 5849 + }, + { + "epoch": 0.7753479125248509, + "grad_norm": 6.758549690246582, + "learning_rate": 4.222092621505498e-06, + "loss": 0.2083, + "num_input_tokens_seen": 11434616, + "step": 5850 + }, + { + "epoch": 0.775480450629556, + "grad_norm": 17.421295166015625, + "learning_rate": 4.221840959439634e-06, + "loss": 0.6084, + "num_input_tokens_seen": 11436048, + "step": 5851 + }, + { + "epoch": 0.7756129887342611, + "grad_norm": 11.438631057739258, + "learning_rate": 4.221589264175489e-06, + "loss": 0.3721, + "num_input_tokens_seen": 11438424, + "step": 5852 + }, + { + "epoch": 0.7757455268389662, + "grad_norm": 0.06550151854753494, + "learning_rate": 4.2213375357179155e-06, + "loss": 0.0004, + "num_input_tokens_seen": 11439896, + "step": 5853 + }, + { + "epoch": 0.7758780649436713, + "grad_norm": 5.692852973937988, + "learning_rate": 4.221085774071767e-06, + "loss": 0.0998, + "num_input_tokens_seen": 11441744, + "step": 5854 + }, + { + "epoch": 0.7760106030483764, + "grad_norm": 13.590944290161133, + "learning_rate": 4.2208339792418965e-06, + "loss": 0.3326, + "num_input_tokens_seen": 11443328, + "step": 5855 + }, + { + "epoch": 0.7761431411530815, + "grad_norm": 14.802865982055664, + "learning_rate": 4.22058215123316e-06, + "loss": 0.4489, + "num_input_tokens_seen": 11444992, + "step": 5856 + }, + { + "epoch": 0.7762756792577866, + "grad_norm": 8.53990650177002, + "learning_rate": 4.220330290050414e-06, + "loss": 0.2612, + "num_input_tokens_seen": 11447440, + "step": 5857 + }, + { + "epoch": 0.7764082173624918, + "grad_norm": 5.915536880493164, + "learning_rate": 4.220078395698512e-06, + "loss": 0.0971, + "num_input_tokens_seen": 11448792, + "step": 5858 + }, + { + "epoch": 0.7765407554671968, + "grad_norm": 5.354061126708984, + "learning_rate": 4.219826468182312e-06, + "loss": 0.1051, + "num_input_tokens_seen": 11450800, + "step": 5859 + }, + { + "epoch": 0.776673293571902, + "grad_norm": 8.225664138793945, + "learning_rate": 4.2195745075066716e-06, + "loss": 0.3631, + "num_input_tokens_seen": 11453040, + "step": 5860 + }, + { + "epoch": 0.776805831676607, + "grad_norm": 7.352323532104492, + "learning_rate": 4.219322513676448e-06, + "loss": 0.2308, + "num_input_tokens_seen": 11454624, + "step": 5861 + }, + { + "epoch": 0.7769383697813121, + "grad_norm": 0.18368928134441376, + "learning_rate": 4.219070486696501e-06, + "loss": 0.0013, + "num_input_tokens_seen": 11455992, + "step": 5862 + }, + { + "epoch": 0.7770709078860172, + "grad_norm": 8.736626625061035, + "learning_rate": 4.218818426571688e-06, + "loss": 0.2097, + "num_input_tokens_seen": 11458496, + "step": 5863 + }, + { + "epoch": 0.7772034459907223, + "grad_norm": 7.095582485198975, + "learning_rate": 4.21856633330687e-06, + "loss": 0.1423, + "num_input_tokens_seen": 11460952, + "step": 5864 + }, + { + "epoch": 0.7773359840954275, + "grad_norm": 0.24318623542785645, + "learning_rate": 4.218314206906908e-06, + "loss": 0.0017, + "num_input_tokens_seen": 11462832, + "step": 5865 + }, + { + "epoch": 0.7774685222001325, + "grad_norm": 7.682394504547119, + "learning_rate": 4.218062047376663e-06, + "loss": 0.1031, + "num_input_tokens_seen": 11464304, + "step": 5866 + }, + { + "epoch": 0.7776010603048377, + "grad_norm": 6.468847751617432, + "learning_rate": 4.217809854720996e-06, + "loss": 0.1476, + "num_input_tokens_seen": 11465528, + "step": 5867 + }, + { + "epoch": 0.7777335984095427, + "grad_norm": 6.538835048675537, + "learning_rate": 4.2175576289447705e-06, + "loss": 0.1962, + "num_input_tokens_seen": 11467512, + "step": 5868 + }, + { + "epoch": 0.7778661365142479, + "grad_norm": 0.4626077711582184, + "learning_rate": 4.217305370052848e-06, + "loss": 0.0035, + "num_input_tokens_seen": 11470640, + "step": 5869 + }, + { + "epoch": 0.777998674618953, + "grad_norm": 0.42355719208717346, + "learning_rate": 4.217053078050094e-06, + "loss": 0.003, + "num_input_tokens_seen": 11471720, + "step": 5870 + }, + { + "epoch": 0.778131212723658, + "grad_norm": 0.2068232297897339, + "learning_rate": 4.216800752941372e-06, + "loss": 0.0014, + "num_input_tokens_seen": 11473392, + "step": 5871 + }, + { + "epoch": 0.7782637508283632, + "grad_norm": 6.194050312042236, + "learning_rate": 4.216548394731548e-06, + "loss": 0.0637, + "num_input_tokens_seen": 11475136, + "step": 5872 + }, + { + "epoch": 0.7783962889330682, + "grad_norm": 6.686072826385498, + "learning_rate": 4.216296003425486e-06, + "loss": 0.1335, + "num_input_tokens_seen": 11478072, + "step": 5873 + }, + { + "epoch": 0.7785288270377734, + "grad_norm": 0.1913270205259323, + "learning_rate": 4.216043579028053e-06, + "loss": 0.0014, + "num_input_tokens_seen": 11479656, + "step": 5874 + }, + { + "epoch": 0.7786613651424784, + "grad_norm": 7.532687664031982, + "learning_rate": 4.215791121544115e-06, + "loss": 0.0685, + "num_input_tokens_seen": 11482440, + "step": 5875 + }, + { + "epoch": 0.7787939032471836, + "grad_norm": 12.778433799743652, + "learning_rate": 4.215538630978542e-06, + "loss": 0.358, + "num_input_tokens_seen": 11484424, + "step": 5876 + }, + { + "epoch": 0.7789264413518887, + "grad_norm": 0.11313376575708389, + "learning_rate": 4.2152861073362e-06, + "loss": 0.0007, + "num_input_tokens_seen": 11485688, + "step": 5877 + }, + { + "epoch": 0.7790589794565937, + "grad_norm": 12.902118682861328, + "learning_rate": 4.215033550621959e-06, + "loss": 0.2716, + "num_input_tokens_seen": 11487792, + "step": 5878 + }, + { + "epoch": 0.7791915175612989, + "grad_norm": 8.817619323730469, + "learning_rate": 4.214780960840687e-06, + "loss": 0.1683, + "num_input_tokens_seen": 11489808, + "step": 5879 + }, + { + "epoch": 0.7793240556660039, + "grad_norm": 0.8432909846305847, + "learning_rate": 4.214528337997257e-06, + "loss": 0.0042, + "num_input_tokens_seen": 11491056, + "step": 5880 + }, + { + "epoch": 0.7794565937707091, + "grad_norm": 4.995293617248535, + "learning_rate": 4.214275682096536e-06, + "loss": 0.0623, + "num_input_tokens_seen": 11492392, + "step": 5881 + }, + { + "epoch": 0.7795891318754142, + "grad_norm": 5.417535781860352, + "learning_rate": 4.214022993143397e-06, + "loss": 0.1524, + "num_input_tokens_seen": 11493848, + "step": 5882 + }, + { + "epoch": 0.7797216699801193, + "grad_norm": 9.096508979797363, + "learning_rate": 4.2137702711427145e-06, + "loss": 0.2998, + "num_input_tokens_seen": 11496544, + "step": 5883 + }, + { + "epoch": 0.7798542080848244, + "grad_norm": 5.889286518096924, + "learning_rate": 4.213517516099357e-06, + "loss": 0.1171, + "num_input_tokens_seen": 11498584, + "step": 5884 + }, + { + "epoch": 0.7799867461895295, + "grad_norm": 0.04048706963658333, + "learning_rate": 4.2132647280182e-06, + "loss": 0.0003, + "num_input_tokens_seen": 11500720, + "step": 5885 + }, + { + "epoch": 0.7801192842942346, + "grad_norm": 0.0369567796587944, + "learning_rate": 4.213011906904118e-06, + "loss": 0.0003, + "num_input_tokens_seen": 11502392, + "step": 5886 + }, + { + "epoch": 0.7802518223989396, + "grad_norm": 7.6913557052612305, + "learning_rate": 4.212759052761983e-06, + "loss": 0.1934, + "num_input_tokens_seen": 11503720, + "step": 5887 + }, + { + "epoch": 0.7803843605036448, + "grad_norm": 14.853998184204102, + "learning_rate": 4.212506165596674e-06, + "loss": 0.147, + "num_input_tokens_seen": 11505592, + "step": 5888 + }, + { + "epoch": 0.7805168986083499, + "grad_norm": 1.0737090110778809, + "learning_rate": 4.212253245413063e-06, + "loss": 0.0056, + "num_input_tokens_seen": 11507272, + "step": 5889 + }, + { + "epoch": 0.780649436713055, + "grad_norm": 19.34444808959961, + "learning_rate": 4.212000292216028e-06, + "loss": 0.5448, + "num_input_tokens_seen": 11509952, + "step": 5890 + }, + { + "epoch": 0.7807819748177601, + "grad_norm": 0.04693669080734253, + "learning_rate": 4.211747306010448e-06, + "loss": 0.0003, + "num_input_tokens_seen": 11511456, + "step": 5891 + }, + { + "epoch": 0.7809145129224652, + "grad_norm": 0.016634413972496986, + "learning_rate": 4.211494286801198e-06, + "loss": 0.0001, + "num_input_tokens_seen": 11513096, + "step": 5892 + }, + { + "epoch": 0.7810470510271703, + "grad_norm": 7.608966827392578, + "learning_rate": 4.211241234593158e-06, + "loss": 0.2946, + "num_input_tokens_seen": 11514792, + "step": 5893 + }, + { + "epoch": 0.7811795891318755, + "grad_norm": 2.7981040477752686, + "learning_rate": 4.2109881493912065e-06, + "loss": 0.0349, + "num_input_tokens_seen": 11516728, + "step": 5894 + }, + { + "epoch": 0.7813121272365805, + "grad_norm": 7.430641174316406, + "learning_rate": 4.210735031200222e-06, + "loss": 0.1378, + "num_input_tokens_seen": 11518640, + "step": 5895 + }, + { + "epoch": 0.7814446653412856, + "grad_norm": 8.32115650177002, + "learning_rate": 4.2104818800250875e-06, + "loss": 0.4298, + "num_input_tokens_seen": 11520104, + "step": 5896 + }, + { + "epoch": 0.7815772034459907, + "grad_norm": 0.0429580956697464, + "learning_rate": 4.210228695870681e-06, + "loss": 0.0003, + "num_input_tokens_seen": 11521496, + "step": 5897 + }, + { + "epoch": 0.7817097415506958, + "grad_norm": 9.142416954040527, + "learning_rate": 4.209975478741886e-06, + "loss": 0.1891, + "num_input_tokens_seen": 11523920, + "step": 5898 + }, + { + "epoch": 0.7818422796554009, + "grad_norm": 9.95163631439209, + "learning_rate": 4.209722228643584e-06, + "loss": 0.3569, + "num_input_tokens_seen": 11527264, + "step": 5899 + }, + { + "epoch": 0.781974817760106, + "grad_norm": 9.261152267456055, + "learning_rate": 4.2094689455806584e-06, + "loss": 0.2835, + "num_input_tokens_seen": 11529040, + "step": 5900 + }, + { + "epoch": 0.7821073558648112, + "grad_norm": 0.5602549910545349, + "learning_rate": 4.209215629557992e-06, + "loss": 0.003, + "num_input_tokens_seen": 11531296, + "step": 5901 + }, + { + "epoch": 0.7822398939695162, + "grad_norm": 7.8317694664001465, + "learning_rate": 4.2089622805804685e-06, + "loss": 0.2295, + "num_input_tokens_seen": 11533128, + "step": 5902 + }, + { + "epoch": 0.7823724320742214, + "grad_norm": 0.6189653873443604, + "learning_rate": 4.208708898652974e-06, + "loss": 0.0032, + "num_input_tokens_seen": 11535688, + "step": 5903 + }, + { + "epoch": 0.7825049701789264, + "grad_norm": 5.0478081703186035, + "learning_rate": 4.2084554837803935e-06, + "loss": 0.0653, + "num_input_tokens_seen": 11537968, + "step": 5904 + }, + { + "epoch": 0.7826375082836315, + "grad_norm": 0.23618066310882568, + "learning_rate": 4.208202035967612e-06, + "loss": 0.0015, + "num_input_tokens_seen": 11539328, + "step": 5905 + }, + { + "epoch": 0.7827700463883367, + "grad_norm": 7.0716423988342285, + "learning_rate": 4.207948555219516e-06, + "loss": 0.1866, + "num_input_tokens_seen": 11541632, + "step": 5906 + }, + { + "epoch": 0.7829025844930417, + "grad_norm": 11.420003890991211, + "learning_rate": 4.207695041540995e-06, + "loss": 0.2858, + "num_input_tokens_seen": 11543456, + "step": 5907 + }, + { + "epoch": 0.7830351225977469, + "grad_norm": 0.08125365525484085, + "learning_rate": 4.207441494936936e-06, + "loss": 0.0006, + "num_input_tokens_seen": 11545272, + "step": 5908 + }, + { + "epoch": 0.7831676607024519, + "grad_norm": 0.5230322480201721, + "learning_rate": 4.207187915412226e-06, + "loss": 0.0033, + "num_input_tokens_seen": 11547088, + "step": 5909 + }, + { + "epoch": 0.7833001988071571, + "grad_norm": 0.35598552227020264, + "learning_rate": 4.206934302971755e-06, + "loss": 0.0018, + "num_input_tokens_seen": 11548872, + "step": 5910 + }, + { + "epoch": 0.7834327369118622, + "grad_norm": 0.1132628321647644, + "learning_rate": 4.206680657620413e-06, + "loss": 0.0008, + "num_input_tokens_seen": 11550752, + "step": 5911 + }, + { + "epoch": 0.7835652750165673, + "grad_norm": 6.93179988861084, + "learning_rate": 4.2064269793630915e-06, + "loss": 0.2319, + "num_input_tokens_seen": 11553712, + "step": 5912 + }, + { + "epoch": 0.7836978131212724, + "grad_norm": 10.415398597717285, + "learning_rate": 4.206173268204679e-06, + "loss": 0.3234, + "num_input_tokens_seen": 11556152, + "step": 5913 + }, + { + "epoch": 0.7838303512259774, + "grad_norm": 6.431523323059082, + "learning_rate": 4.205919524150071e-06, + "loss": 0.2563, + "num_input_tokens_seen": 11557928, + "step": 5914 + }, + { + "epoch": 0.7839628893306826, + "grad_norm": 4.781380653381348, + "learning_rate": 4.205665747204157e-06, + "loss": 0.1348, + "num_input_tokens_seen": 11559600, + "step": 5915 + }, + { + "epoch": 0.7840954274353876, + "grad_norm": 0.045080650597810745, + "learning_rate": 4.2054119373718305e-06, + "loss": 0.0003, + "num_input_tokens_seen": 11560656, + "step": 5916 + }, + { + "epoch": 0.7842279655400928, + "grad_norm": 10.3811674118042, + "learning_rate": 4.205158094657985e-06, + "loss": 0.3751, + "num_input_tokens_seen": 11563064, + "step": 5917 + }, + { + "epoch": 0.7843605036447979, + "grad_norm": 8.004494667053223, + "learning_rate": 4.204904219067516e-06, + "loss": 0.078, + "num_input_tokens_seen": 11565648, + "step": 5918 + }, + { + "epoch": 0.784493041749503, + "grad_norm": 4.974582672119141, + "learning_rate": 4.204650310605317e-06, + "loss": 0.0873, + "num_input_tokens_seen": 11567392, + "step": 5919 + }, + { + "epoch": 0.7846255798542081, + "grad_norm": 7.225044250488281, + "learning_rate": 4.204396369276283e-06, + "loss": 0.069, + "num_input_tokens_seen": 11568848, + "step": 5920 + }, + { + "epoch": 0.7847581179589131, + "grad_norm": 5.176958084106445, + "learning_rate": 4.204142395085313e-06, + "loss": 0.0967, + "num_input_tokens_seen": 11571040, + "step": 5921 + }, + { + "epoch": 0.7848906560636183, + "grad_norm": 14.297098159790039, + "learning_rate": 4.203888388037301e-06, + "loss": 0.4266, + "num_input_tokens_seen": 11573144, + "step": 5922 + }, + { + "epoch": 0.7850231941683234, + "grad_norm": 9.08741283416748, + "learning_rate": 4.203634348137145e-06, + "loss": 0.108, + "num_input_tokens_seen": 11574576, + "step": 5923 + }, + { + "epoch": 0.7851557322730285, + "grad_norm": 2.4092931747436523, + "learning_rate": 4.203380275389744e-06, + "loss": 0.0309, + "num_input_tokens_seen": 11576472, + "step": 5924 + }, + { + "epoch": 0.7852882703777336, + "grad_norm": 5.999264240264893, + "learning_rate": 4.203126169799997e-06, + "loss": 0.1914, + "num_input_tokens_seen": 11577904, + "step": 5925 + }, + { + "epoch": 0.7854208084824387, + "grad_norm": 12.166259765625, + "learning_rate": 4.202872031372802e-06, + "loss": 0.2646, + "num_input_tokens_seen": 11580008, + "step": 5926 + }, + { + "epoch": 0.7855533465871438, + "grad_norm": 18.700843811035156, + "learning_rate": 4.202617860113059e-06, + "loss": 0.6551, + "num_input_tokens_seen": 11582712, + "step": 5927 + }, + { + "epoch": 0.7856858846918489, + "grad_norm": 0.1118735820055008, + "learning_rate": 4.2023636560256686e-06, + "loss": 0.0008, + "num_input_tokens_seen": 11584072, + "step": 5928 + }, + { + "epoch": 0.785818422796554, + "grad_norm": 6.535750865936279, + "learning_rate": 4.202109419115534e-06, + "loss": 0.1895, + "num_input_tokens_seen": 11585872, + "step": 5929 + }, + { + "epoch": 0.7859509609012592, + "grad_norm": 6.2491350173950195, + "learning_rate": 4.201855149387555e-06, + "loss": 0.0853, + "num_input_tokens_seen": 11587592, + "step": 5930 + }, + { + "epoch": 0.7860834990059642, + "grad_norm": 0.4412160813808441, + "learning_rate": 4.2016008468466345e-06, + "loss": 0.0059, + "num_input_tokens_seen": 11589936, + "step": 5931 + }, + { + "epoch": 0.7862160371106693, + "grad_norm": 7.810582637786865, + "learning_rate": 4.201346511497676e-06, + "loss": 0.1806, + "num_input_tokens_seen": 11591968, + "step": 5932 + }, + { + "epoch": 0.7863485752153744, + "grad_norm": 8.169188499450684, + "learning_rate": 4.201092143345583e-06, + "loss": 0.2557, + "num_input_tokens_seen": 11593840, + "step": 5933 + }, + { + "epoch": 0.7864811133200795, + "grad_norm": 10.962566375732422, + "learning_rate": 4.200837742395259e-06, + "loss": 0.272, + "num_input_tokens_seen": 11596176, + "step": 5934 + }, + { + "epoch": 0.7866136514247847, + "grad_norm": 0.26985448598861694, + "learning_rate": 4.200583308651611e-06, + "loss": 0.0018, + "num_input_tokens_seen": 11597792, + "step": 5935 + }, + { + "epoch": 0.7867461895294897, + "grad_norm": 3.7500975131988525, + "learning_rate": 4.200328842119543e-06, + "loss": 0.1299, + "num_input_tokens_seen": 11599760, + "step": 5936 + }, + { + "epoch": 0.7868787276341949, + "grad_norm": 3.26781964302063, + "learning_rate": 4.200074342803962e-06, + "loss": 0.0204, + "num_input_tokens_seen": 11601904, + "step": 5937 + }, + { + "epoch": 0.7870112657388999, + "grad_norm": 7.098124980926514, + "learning_rate": 4.199819810709776e-06, + "loss": 0.0863, + "num_input_tokens_seen": 11603272, + "step": 5938 + }, + { + "epoch": 0.787143803843605, + "grad_norm": 6.3188934326171875, + "learning_rate": 4.19956524584189e-06, + "loss": 0.1421, + "num_input_tokens_seen": 11606056, + "step": 5939 + }, + { + "epoch": 0.7872763419483101, + "grad_norm": 11.25432300567627, + "learning_rate": 4.199310648205215e-06, + "loss": 0.3491, + "num_input_tokens_seen": 11608056, + "step": 5940 + }, + { + "epoch": 0.7874088800530152, + "grad_norm": 7.903420925140381, + "learning_rate": 4.199056017804657e-06, + "loss": 0.144, + "num_input_tokens_seen": 11609552, + "step": 5941 + }, + { + "epoch": 0.7875414181577204, + "grad_norm": 3.3785767555236816, + "learning_rate": 4.198801354645128e-06, + "loss": 0.0542, + "num_input_tokens_seen": 11610920, + "step": 5942 + }, + { + "epoch": 0.7876739562624254, + "grad_norm": 0.2599114179611206, + "learning_rate": 4.198546658731536e-06, + "loss": 0.0018, + "num_input_tokens_seen": 11612632, + "step": 5943 + }, + { + "epoch": 0.7878064943671306, + "grad_norm": 7.482265949249268, + "learning_rate": 4.198291930068793e-06, + "loss": 0.2419, + "num_input_tokens_seen": 11614072, + "step": 5944 + }, + { + "epoch": 0.7879390324718356, + "grad_norm": 8.974781036376953, + "learning_rate": 4.19803716866181e-06, + "loss": 0.3142, + "num_input_tokens_seen": 11616304, + "step": 5945 + }, + { + "epoch": 0.7880715705765408, + "grad_norm": 5.818236827850342, + "learning_rate": 4.1977823745155e-06, + "loss": 0.1383, + "num_input_tokens_seen": 11617528, + "step": 5946 + }, + { + "epoch": 0.7882041086812459, + "grad_norm": 0.21087734401226044, + "learning_rate": 4.1975275476347725e-06, + "loss": 0.0014, + "num_input_tokens_seen": 11619008, + "step": 5947 + }, + { + "epoch": 0.788336646785951, + "grad_norm": 0.294229656457901, + "learning_rate": 4.197272688024544e-06, + "loss": 0.0021, + "num_input_tokens_seen": 11620968, + "step": 5948 + }, + { + "epoch": 0.7884691848906561, + "grad_norm": 0.12691304087638855, + "learning_rate": 4.197017795689727e-06, + "loss": 0.0009, + "num_input_tokens_seen": 11623144, + "step": 5949 + }, + { + "epoch": 0.7886017229953611, + "grad_norm": 16.456361770629883, + "learning_rate": 4.196762870635236e-06, + "loss": 0.5651, + "num_input_tokens_seen": 11625296, + "step": 5950 + }, + { + "epoch": 0.7887342611000663, + "grad_norm": 0.14588159322738647, + "learning_rate": 4.196507912865987e-06, + "loss": 0.001, + "num_input_tokens_seen": 11627472, + "step": 5951 + }, + { + "epoch": 0.7888667992047713, + "grad_norm": 0.19442982971668243, + "learning_rate": 4.196252922386895e-06, + "loss": 0.0014, + "num_input_tokens_seen": 11628864, + "step": 5952 + }, + { + "epoch": 0.7889993373094765, + "grad_norm": 7.9510111808776855, + "learning_rate": 4.195997899202876e-06, + "loss": 0.1619, + "num_input_tokens_seen": 11631072, + "step": 5953 + }, + { + "epoch": 0.7891318754141816, + "grad_norm": 0.11786588281393051, + "learning_rate": 4.195742843318848e-06, + "loss": 0.0008, + "num_input_tokens_seen": 11632792, + "step": 5954 + }, + { + "epoch": 0.7892644135188867, + "grad_norm": 0.5430479645729065, + "learning_rate": 4.195487754739728e-06, + "loss": 0.0024, + "num_input_tokens_seen": 11634264, + "step": 5955 + }, + { + "epoch": 0.7893969516235918, + "grad_norm": 1.1728757619857788, + "learning_rate": 4.195232633470434e-06, + "loss": 0.0067, + "num_input_tokens_seen": 11636784, + "step": 5956 + }, + { + "epoch": 0.7895294897282968, + "grad_norm": 10.655708312988281, + "learning_rate": 4.194977479515885e-06, + "loss": 0.1973, + "num_input_tokens_seen": 11638376, + "step": 5957 + }, + { + "epoch": 0.789662027833002, + "grad_norm": 5.791609287261963, + "learning_rate": 4.194722292881001e-06, + "loss": 0.2065, + "num_input_tokens_seen": 11639720, + "step": 5958 + }, + { + "epoch": 0.7897945659377071, + "grad_norm": 9.388766288757324, + "learning_rate": 4.1944670735707026e-06, + "loss": 0.2796, + "num_input_tokens_seen": 11641360, + "step": 5959 + }, + { + "epoch": 0.7899271040424122, + "grad_norm": 11.944838523864746, + "learning_rate": 4.194211821589911e-06, + "loss": 0.3503, + "num_input_tokens_seen": 11643232, + "step": 5960 + }, + { + "epoch": 0.7900596421471173, + "grad_norm": 0.03752652183175087, + "learning_rate": 4.1939565369435455e-06, + "loss": 0.0003, + "num_input_tokens_seen": 11644496, + "step": 5961 + }, + { + "epoch": 0.7901921802518224, + "grad_norm": 3.6657378673553467, + "learning_rate": 4.193701219636529e-06, + "loss": 0.0393, + "num_input_tokens_seen": 11646160, + "step": 5962 + }, + { + "epoch": 0.7903247183565275, + "grad_norm": 13.018482208251953, + "learning_rate": 4.193445869673786e-06, + "loss": 0.3122, + "num_input_tokens_seen": 11648040, + "step": 5963 + }, + { + "epoch": 0.7904572564612325, + "grad_norm": 5.167471885681152, + "learning_rate": 4.193190487060237e-06, + "loss": 0.1695, + "num_input_tokens_seen": 11650376, + "step": 5964 + }, + { + "epoch": 0.7905897945659377, + "grad_norm": 5.533400058746338, + "learning_rate": 4.192935071800808e-06, + "loss": 0.0835, + "num_input_tokens_seen": 11653016, + "step": 5965 + }, + { + "epoch": 0.7907223326706428, + "grad_norm": 8.305009841918945, + "learning_rate": 4.192679623900423e-06, + "loss": 0.2808, + "num_input_tokens_seen": 11655296, + "step": 5966 + }, + { + "epoch": 0.7908548707753479, + "grad_norm": 0.14705567061901093, + "learning_rate": 4.192424143364006e-06, + "loss": 0.001, + "num_input_tokens_seen": 11656800, + "step": 5967 + }, + { + "epoch": 0.790987408880053, + "grad_norm": 16.64096450805664, + "learning_rate": 4.192168630196486e-06, + "loss": 0.574, + "num_input_tokens_seen": 11659272, + "step": 5968 + }, + { + "epoch": 0.7911199469847581, + "grad_norm": 9.419206619262695, + "learning_rate": 4.191913084402785e-06, + "loss": 0.2834, + "num_input_tokens_seen": 11661064, + "step": 5969 + }, + { + "epoch": 0.7912524850894632, + "grad_norm": 9.220453262329102, + "learning_rate": 4.191657505987835e-06, + "loss": 0.2722, + "num_input_tokens_seen": 11662704, + "step": 5970 + }, + { + "epoch": 0.7913850231941684, + "grad_norm": 10.419841766357422, + "learning_rate": 4.19140189495656e-06, + "loss": 0.3539, + "num_input_tokens_seen": 11664888, + "step": 5971 + }, + { + "epoch": 0.7915175612988734, + "grad_norm": 10.984697341918945, + "learning_rate": 4.19114625131389e-06, + "loss": 0.2693, + "num_input_tokens_seen": 11666520, + "step": 5972 + }, + { + "epoch": 0.7916500994035786, + "grad_norm": 13.330792427062988, + "learning_rate": 4.190890575064753e-06, + "loss": 0.3326, + "num_input_tokens_seen": 11668480, + "step": 5973 + }, + { + "epoch": 0.7917826375082836, + "grad_norm": 9.116632461547852, + "learning_rate": 4.190634866214079e-06, + "loss": 0.1985, + "num_input_tokens_seen": 11670728, + "step": 5974 + }, + { + "epoch": 0.7919151756129887, + "grad_norm": 5.533251762390137, + "learning_rate": 4.190379124766799e-06, + "loss": 0.154, + "num_input_tokens_seen": 11672496, + "step": 5975 + }, + { + "epoch": 0.7920477137176939, + "grad_norm": 11.745291709899902, + "learning_rate": 4.190123350727844e-06, + "loss": 0.3073, + "num_input_tokens_seen": 11675104, + "step": 5976 + }, + { + "epoch": 0.7921802518223989, + "grad_norm": 5.7864670753479, + "learning_rate": 4.189867544102144e-06, + "loss": 0.1776, + "num_input_tokens_seen": 11678040, + "step": 5977 + }, + { + "epoch": 0.7923127899271041, + "grad_norm": 0.23212522268295288, + "learning_rate": 4.189611704894633e-06, + "loss": 0.0017, + "num_input_tokens_seen": 11679704, + "step": 5978 + }, + { + "epoch": 0.7924453280318091, + "grad_norm": 14.259075164794922, + "learning_rate": 4.189355833110242e-06, + "loss": 0.2693, + "num_input_tokens_seen": 11682896, + "step": 5979 + }, + { + "epoch": 0.7925778661365143, + "grad_norm": 0.15822458267211914, + "learning_rate": 4.189099928753904e-06, + "loss": 0.0011, + "num_input_tokens_seen": 11683784, + "step": 5980 + }, + { + "epoch": 0.7927104042412193, + "grad_norm": 11.5624361038208, + "learning_rate": 4.188843991830556e-06, + "loss": 0.2294, + "num_input_tokens_seen": 11685336, + "step": 5981 + }, + { + "epoch": 0.7928429423459245, + "grad_norm": 3.4526143074035645, + "learning_rate": 4.1885880223451295e-06, + "loss": 0.0269, + "num_input_tokens_seen": 11687528, + "step": 5982 + }, + { + "epoch": 0.7929754804506296, + "grad_norm": 9.568391799926758, + "learning_rate": 4.188332020302561e-06, + "loss": 0.3418, + "num_input_tokens_seen": 11689592, + "step": 5983 + }, + { + "epoch": 0.7931080185553346, + "grad_norm": 10.51002311706543, + "learning_rate": 4.188075985707788e-06, + "loss": 0.256, + "num_input_tokens_seen": 11692544, + "step": 5984 + }, + { + "epoch": 0.7932405566600398, + "grad_norm": 0.6112151741981506, + "learning_rate": 4.187819918565744e-06, + "loss": 0.0044, + "num_input_tokens_seen": 11694136, + "step": 5985 + }, + { + "epoch": 0.7933730947647448, + "grad_norm": 1.2927066087722778, + "learning_rate": 4.187563818881368e-06, + "loss": 0.0092, + "num_input_tokens_seen": 11696384, + "step": 5986 + }, + { + "epoch": 0.79350563286945, + "grad_norm": 8.478625297546387, + "learning_rate": 4.187307686659597e-06, + "loss": 0.4016, + "num_input_tokens_seen": 11698424, + "step": 5987 + }, + { + "epoch": 0.7936381709741551, + "grad_norm": 15.124611854553223, + "learning_rate": 4.18705152190537e-06, + "loss": 0.7912, + "num_input_tokens_seen": 11700496, + "step": 5988 + }, + { + "epoch": 0.7937707090788602, + "grad_norm": 7.7354278564453125, + "learning_rate": 4.186795324623626e-06, + "loss": 0.2842, + "num_input_tokens_seen": 11702296, + "step": 5989 + }, + { + "epoch": 0.7939032471835653, + "grad_norm": 0.671261727809906, + "learning_rate": 4.186539094819305e-06, + "loss": 0.0046, + "num_input_tokens_seen": 11704096, + "step": 5990 + }, + { + "epoch": 0.7940357852882703, + "grad_norm": 0.2579774558544159, + "learning_rate": 4.186282832497346e-06, + "loss": 0.0019, + "num_input_tokens_seen": 11705848, + "step": 5991 + }, + { + "epoch": 0.7941683233929755, + "grad_norm": 2.8712058067321777, + "learning_rate": 4.186026537662691e-06, + "loss": 0.0806, + "num_input_tokens_seen": 11707656, + "step": 5992 + }, + { + "epoch": 0.7943008614976805, + "grad_norm": 7.225240230560303, + "learning_rate": 4.18577021032028e-06, + "loss": 0.2161, + "num_input_tokens_seen": 11709592, + "step": 5993 + }, + { + "epoch": 0.7944333996023857, + "grad_norm": 11.566388130187988, + "learning_rate": 4.1855138504750585e-06, + "loss": 0.3756, + "num_input_tokens_seen": 11711752, + "step": 5994 + }, + { + "epoch": 0.7945659377070908, + "grad_norm": 0.5353715419769287, + "learning_rate": 4.185257458131966e-06, + "loss": 0.0036, + "num_input_tokens_seen": 11713392, + "step": 5995 + }, + { + "epoch": 0.7946984758117959, + "grad_norm": 11.293478012084961, + "learning_rate": 4.185001033295947e-06, + "loss": 0.2453, + "num_input_tokens_seen": 11715112, + "step": 5996 + }, + { + "epoch": 0.794831013916501, + "grad_norm": 2.713548421859741, + "learning_rate": 4.184744575971946e-06, + "loss": 0.0405, + "num_input_tokens_seen": 11716896, + "step": 5997 + }, + { + "epoch": 0.794963552021206, + "grad_norm": 7.89310884475708, + "learning_rate": 4.184488086164908e-06, + "loss": 0.2823, + "num_input_tokens_seen": 11717976, + "step": 5998 + }, + { + "epoch": 0.7950960901259112, + "grad_norm": 10.849326133728027, + "learning_rate": 4.184231563879776e-06, + "loss": 0.4328, + "num_input_tokens_seen": 11719688, + "step": 5999 + }, + { + "epoch": 0.7952286282306164, + "grad_norm": 9.315733909606934, + "learning_rate": 4.1839750091215e-06, + "loss": 0.3607, + "num_input_tokens_seen": 11722424, + "step": 6000 + }, + { + "epoch": 0.7953611663353214, + "grad_norm": 0.05200628936290741, + "learning_rate": 4.183718421895021e-06, + "loss": 0.0004, + "num_input_tokens_seen": 11723720, + "step": 6001 + }, + { + "epoch": 0.7954937044400265, + "grad_norm": 10.4094877243042, + "learning_rate": 4.183461802205291e-06, + "loss": 0.4179, + "num_input_tokens_seen": 11726080, + "step": 6002 + }, + { + "epoch": 0.7956262425447316, + "grad_norm": 8.708499908447266, + "learning_rate": 4.183205150057256e-06, + "loss": 0.2146, + "num_input_tokens_seen": 11727608, + "step": 6003 + }, + { + "epoch": 0.7957587806494367, + "grad_norm": 0.26600104570388794, + "learning_rate": 4.182948465455865e-06, + "loss": 0.002, + "num_input_tokens_seen": 11729632, + "step": 6004 + }, + { + "epoch": 0.7958913187541418, + "grad_norm": 10.767897605895996, + "learning_rate": 4.182691748406066e-06, + "loss": 0.3271, + "num_input_tokens_seen": 11731952, + "step": 6005 + }, + { + "epoch": 0.7960238568588469, + "grad_norm": 2.3896379470825195, + "learning_rate": 4.1824349989128085e-06, + "loss": 0.0207, + "num_input_tokens_seen": 11733296, + "step": 6006 + }, + { + "epoch": 0.7961563949635521, + "grad_norm": 7.232451438903809, + "learning_rate": 4.182178216981045e-06, + "loss": 0.1567, + "num_input_tokens_seen": 11735264, + "step": 6007 + }, + { + "epoch": 0.7962889330682571, + "grad_norm": 24.213106155395508, + "learning_rate": 4.181921402615724e-06, + "loss": 0.4406, + "num_input_tokens_seen": 11736664, + "step": 6008 + }, + { + "epoch": 0.7964214711729622, + "grad_norm": 4.51219367980957, + "learning_rate": 4.1816645558217985e-06, + "loss": 0.0431, + "num_input_tokens_seen": 11738528, + "step": 6009 + }, + { + "epoch": 0.7965540092776673, + "grad_norm": 0.7979826927185059, + "learning_rate": 4.1814076766042206e-06, + "loss": 0.0052, + "num_input_tokens_seen": 11740288, + "step": 6010 + }, + { + "epoch": 0.7966865473823724, + "grad_norm": 5.296796798706055, + "learning_rate": 4.181150764967941e-06, + "loss": 0.1208, + "num_input_tokens_seen": 11741992, + "step": 6011 + }, + { + "epoch": 0.7968190854870776, + "grad_norm": 13.511838912963867, + "learning_rate": 4.180893820917917e-06, + "loss": 0.5607, + "num_input_tokens_seen": 11743752, + "step": 6012 + }, + { + "epoch": 0.7969516235917826, + "grad_norm": 10.28458023071289, + "learning_rate": 4.1806368444591e-06, + "loss": 0.4532, + "num_input_tokens_seen": 11746048, + "step": 6013 + }, + { + "epoch": 0.7970841616964878, + "grad_norm": 14.196305274963379, + "learning_rate": 4.1803798355964455e-06, + "loss": 0.8276, + "num_input_tokens_seen": 11748408, + "step": 6014 + }, + { + "epoch": 0.7972166998011928, + "grad_norm": 2.3924953937530518, + "learning_rate": 4.180122794334908e-06, + "loss": 0.0522, + "num_input_tokens_seen": 11750032, + "step": 6015 + }, + { + "epoch": 0.797349237905898, + "grad_norm": 13.982836723327637, + "learning_rate": 4.179865720679444e-06, + "loss": 0.6101, + "num_input_tokens_seen": 11752256, + "step": 6016 + }, + { + "epoch": 0.797481776010603, + "grad_norm": 0.32092785835266113, + "learning_rate": 4.1796086146350095e-06, + "loss": 0.0024, + "num_input_tokens_seen": 11754744, + "step": 6017 + }, + { + "epoch": 0.7976143141153081, + "grad_norm": 0.15612871944904327, + "learning_rate": 4.179351476206563e-06, + "loss": 0.0012, + "num_input_tokens_seen": 11755984, + "step": 6018 + }, + { + "epoch": 0.7977468522200133, + "grad_norm": 0.10572319477796555, + "learning_rate": 4.1790943053990604e-06, + "loss": 0.0008, + "num_input_tokens_seen": 11757152, + "step": 6019 + }, + { + "epoch": 0.7978793903247183, + "grad_norm": 0.21356245875358582, + "learning_rate": 4.178837102217463e-06, + "loss": 0.0016, + "num_input_tokens_seen": 11759664, + "step": 6020 + }, + { + "epoch": 0.7980119284294235, + "grad_norm": 2.1114044189453125, + "learning_rate": 4.178579866666727e-06, + "loss": 0.0146, + "num_input_tokens_seen": 11761760, + "step": 6021 + }, + { + "epoch": 0.7981444665341285, + "grad_norm": 11.381550788879395, + "learning_rate": 4.178322598751813e-06, + "loss": 0.2977, + "num_input_tokens_seen": 11764128, + "step": 6022 + }, + { + "epoch": 0.7982770046388337, + "grad_norm": 5.775282859802246, + "learning_rate": 4.1780652984776825e-06, + "loss": 0.1838, + "num_input_tokens_seen": 11766592, + "step": 6023 + }, + { + "epoch": 0.7984095427435388, + "grad_norm": 8.84891128540039, + "learning_rate": 4.1778079658492946e-06, + "loss": 0.2605, + "num_input_tokens_seen": 11769144, + "step": 6024 + }, + { + "epoch": 0.7985420808482439, + "grad_norm": 18.23410415649414, + "learning_rate": 4.177550600871611e-06, + "loss": 1.2102, + "num_input_tokens_seen": 11771776, + "step": 6025 + }, + { + "epoch": 0.798674618952949, + "grad_norm": 3.3001511096954346, + "learning_rate": 4.177293203549596e-06, + "loss": 0.0503, + "num_input_tokens_seen": 11773232, + "step": 6026 + }, + { + "epoch": 0.798807157057654, + "grad_norm": 13.878031730651855, + "learning_rate": 4.177035773888211e-06, + "loss": 0.224, + "num_input_tokens_seen": 11775200, + "step": 6027 + }, + { + "epoch": 0.7989396951623592, + "grad_norm": 4.134118556976318, + "learning_rate": 4.176778311892418e-06, + "loss": 0.067, + "num_input_tokens_seen": 11777096, + "step": 6028 + }, + { + "epoch": 0.7990722332670643, + "grad_norm": 12.027763366699219, + "learning_rate": 4.176520817567183e-06, + "loss": 0.5639, + "num_input_tokens_seen": 11778920, + "step": 6029 + }, + { + "epoch": 0.7992047713717694, + "grad_norm": 10.104657173156738, + "learning_rate": 4.17626329091747e-06, + "loss": 0.1619, + "num_input_tokens_seen": 11781112, + "step": 6030 + }, + { + "epoch": 0.7993373094764745, + "grad_norm": 7.181174278259277, + "learning_rate": 4.176005731948244e-06, + "loss": 0.18, + "num_input_tokens_seen": 11782560, + "step": 6031 + }, + { + "epoch": 0.7994698475811796, + "grad_norm": 0.1506657600402832, + "learning_rate": 4.175748140664472e-06, + "loss": 0.0011, + "num_input_tokens_seen": 11784304, + "step": 6032 + }, + { + "epoch": 0.7996023856858847, + "grad_norm": 13.986159324645996, + "learning_rate": 4.17549051707112e-06, + "loss": 0.5356, + "num_input_tokens_seen": 11787104, + "step": 6033 + }, + { + "epoch": 0.7997349237905897, + "grad_norm": 0.12098951637744904, + "learning_rate": 4.175232861173154e-06, + "loss": 0.0009, + "num_input_tokens_seen": 11788416, + "step": 6034 + }, + { + "epoch": 0.7998674618952949, + "grad_norm": 1.0018857717514038, + "learning_rate": 4.174975172975543e-06, + "loss": 0.0062, + "num_input_tokens_seen": 11790504, + "step": 6035 + }, + { + "epoch": 0.8, + "grad_norm": 5.0751166343688965, + "learning_rate": 4.174717452483255e-06, + "loss": 0.1208, + "num_input_tokens_seen": 11792568, + "step": 6036 + }, + { + "epoch": 0.8001325381047051, + "grad_norm": 0.24677395820617676, + "learning_rate": 4.17445969970126e-06, + "loss": 0.0016, + "num_input_tokens_seen": 11793968, + "step": 6037 + }, + { + "epoch": 0.8002650762094102, + "grad_norm": 0.1527145504951477, + "learning_rate": 4.174201914634527e-06, + "loss": 0.0011, + "num_input_tokens_seen": 11795792, + "step": 6038 + }, + { + "epoch": 0.8003976143141153, + "grad_norm": 4.625432014465332, + "learning_rate": 4.173944097288025e-06, + "loss": 0.0582, + "num_input_tokens_seen": 11798000, + "step": 6039 + }, + { + "epoch": 0.8005301524188204, + "grad_norm": 0.16194915771484375, + "learning_rate": 4.173686247666727e-06, + "loss": 0.0012, + "num_input_tokens_seen": 11800864, + "step": 6040 + }, + { + "epoch": 0.8006626905235256, + "grad_norm": 8.614712715148926, + "learning_rate": 4.173428365775603e-06, + "loss": 0.2911, + "num_input_tokens_seen": 11803632, + "step": 6041 + }, + { + "epoch": 0.8007952286282306, + "grad_norm": 10.721465110778809, + "learning_rate": 4.1731704516196255e-06, + "loss": 0.1964, + "num_input_tokens_seen": 11805512, + "step": 6042 + }, + { + "epoch": 0.8009277667329358, + "grad_norm": 0.0991855040192604, + "learning_rate": 4.172912505203768e-06, + "loss": 0.0007, + "num_input_tokens_seen": 11808568, + "step": 6043 + }, + { + "epoch": 0.8010603048376408, + "grad_norm": 1.4042294025421143, + "learning_rate": 4.172654526533003e-06, + "loss": 0.021, + "num_input_tokens_seen": 11810288, + "step": 6044 + }, + { + "epoch": 0.8011928429423459, + "grad_norm": 6.066646099090576, + "learning_rate": 4.172396515612305e-06, + "loss": 0.1738, + "num_input_tokens_seen": 11813128, + "step": 6045 + }, + { + "epoch": 0.801325381047051, + "grad_norm": 5.14725923538208, + "learning_rate": 4.172138472446648e-06, + "loss": 0.0769, + "num_input_tokens_seen": 11815720, + "step": 6046 + }, + { + "epoch": 0.8014579191517561, + "grad_norm": 0.03395279124379158, + "learning_rate": 4.171880397041008e-06, + "loss": 0.0002, + "num_input_tokens_seen": 11817032, + "step": 6047 + }, + { + "epoch": 0.8015904572564613, + "grad_norm": 5.20062780380249, + "learning_rate": 4.17162228940036e-06, + "loss": 0.0677, + "num_input_tokens_seen": 11818776, + "step": 6048 + }, + { + "epoch": 0.8017229953611663, + "grad_norm": 9.515907287597656, + "learning_rate": 4.171364149529682e-06, + "loss": 0.4526, + "num_input_tokens_seen": 11820904, + "step": 6049 + }, + { + "epoch": 0.8018555334658715, + "grad_norm": 0.1978934109210968, + "learning_rate": 4.17110597743395e-06, + "loss": 0.0014, + "num_input_tokens_seen": 11822576, + "step": 6050 + }, + { + "epoch": 0.8019880715705765, + "grad_norm": 10.959004402160645, + "learning_rate": 4.170847773118142e-06, + "loss": 0.3952, + "num_input_tokens_seen": 11824256, + "step": 6051 + }, + { + "epoch": 0.8021206096752816, + "grad_norm": 0.09633899480104446, + "learning_rate": 4.170589536587237e-06, + "loss": 0.0007, + "num_input_tokens_seen": 11825624, + "step": 6052 + }, + { + "epoch": 0.8022531477799868, + "grad_norm": 6.93980073928833, + "learning_rate": 4.170331267846211e-06, + "loss": 0.1995, + "num_input_tokens_seen": 11828056, + "step": 6053 + }, + { + "epoch": 0.8023856858846918, + "grad_norm": 0.42530685663223267, + "learning_rate": 4.1700729669000476e-06, + "loss": 0.0031, + "num_input_tokens_seen": 11830552, + "step": 6054 + }, + { + "epoch": 0.802518223989397, + "grad_norm": 16.74164390563965, + "learning_rate": 4.169814633753724e-06, + "loss": 0.6686, + "num_input_tokens_seen": 11833272, + "step": 6055 + }, + { + "epoch": 0.802650762094102, + "grad_norm": 7.9354705810546875, + "learning_rate": 4.169556268412224e-06, + "loss": 0.2462, + "num_input_tokens_seen": 11835312, + "step": 6056 + }, + { + "epoch": 0.8027833001988072, + "grad_norm": 8.194220542907715, + "learning_rate": 4.169297870880526e-06, + "loss": 0.1838, + "num_input_tokens_seen": 11837744, + "step": 6057 + }, + { + "epoch": 0.8029158383035122, + "grad_norm": 16.372188568115234, + "learning_rate": 4.169039441163613e-06, + "loss": 0.2398, + "num_input_tokens_seen": 11840816, + "step": 6058 + }, + { + "epoch": 0.8030483764082174, + "grad_norm": 6.946459770202637, + "learning_rate": 4.168780979266469e-06, + "loss": 0.3026, + "num_input_tokens_seen": 11843280, + "step": 6059 + }, + { + "epoch": 0.8031809145129225, + "grad_norm": 6.503567218780518, + "learning_rate": 4.168522485194077e-06, + "loss": 0.1942, + "num_input_tokens_seen": 11844704, + "step": 6060 + }, + { + "epoch": 0.8033134526176275, + "grad_norm": 0.33634835481643677, + "learning_rate": 4.168263958951421e-06, + "loss": 0.0022, + "num_input_tokens_seen": 11846656, + "step": 6061 + }, + { + "epoch": 0.8034459907223327, + "grad_norm": 0.16518740355968475, + "learning_rate": 4.168005400543483e-06, + "loss": 0.0011, + "num_input_tokens_seen": 11847512, + "step": 6062 + }, + { + "epoch": 0.8035785288270377, + "grad_norm": 5.616457939147949, + "learning_rate": 4.1677468099752515e-06, + "loss": 0.1044, + "num_input_tokens_seen": 11849664, + "step": 6063 + }, + { + "epoch": 0.8037110669317429, + "grad_norm": 12.65960693359375, + "learning_rate": 4.16748818725171e-06, + "loss": 0.2047, + "num_input_tokens_seen": 11852504, + "step": 6064 + }, + { + "epoch": 0.803843605036448, + "grad_norm": 12.545294761657715, + "learning_rate": 4.1672295323778475e-06, + "loss": 0.5445, + "num_input_tokens_seen": 11854752, + "step": 6065 + }, + { + "epoch": 0.8039761431411531, + "grad_norm": 0.33383458852767944, + "learning_rate": 4.166970845358648e-06, + "loss": 0.0022, + "num_input_tokens_seen": 11856280, + "step": 6066 + }, + { + "epoch": 0.8041086812458582, + "grad_norm": 5.8879241943359375, + "learning_rate": 4.1667121261991014e-06, + "loss": 0.1943, + "num_input_tokens_seen": 11858040, + "step": 6067 + }, + { + "epoch": 0.8042412193505633, + "grad_norm": 2.104642152786255, + "learning_rate": 4.1664533749041944e-06, + "loss": 0.0137, + "num_input_tokens_seen": 11860488, + "step": 6068 + }, + { + "epoch": 0.8043737574552684, + "grad_norm": 11.497167587280273, + "learning_rate": 4.166194591478918e-06, + "loss": 0.2467, + "num_input_tokens_seen": 11862416, + "step": 6069 + }, + { + "epoch": 0.8045062955599734, + "grad_norm": 0.09199334681034088, + "learning_rate": 4.165935775928259e-06, + "loss": 0.0006, + "num_input_tokens_seen": 11863728, + "step": 6070 + }, + { + "epoch": 0.8046388336646786, + "grad_norm": 11.039759635925293, + "learning_rate": 4.16567692825721e-06, + "loss": 0.3187, + "num_input_tokens_seen": 11865600, + "step": 6071 + }, + { + "epoch": 0.8047713717693837, + "grad_norm": 14.361676216125488, + "learning_rate": 4.165418048470761e-06, + "loss": 0.6379, + "num_input_tokens_seen": 11867840, + "step": 6072 + }, + { + "epoch": 0.8049039098740888, + "grad_norm": 0.2640438675880432, + "learning_rate": 4.165159136573902e-06, + "loss": 0.0018, + "num_input_tokens_seen": 11869312, + "step": 6073 + }, + { + "epoch": 0.8050364479787939, + "grad_norm": 7.360639572143555, + "learning_rate": 4.1649001925716265e-06, + "loss": 0.2877, + "num_input_tokens_seen": 11871640, + "step": 6074 + }, + { + "epoch": 0.805168986083499, + "grad_norm": 9.279560089111328, + "learning_rate": 4.164641216468927e-06, + "loss": 0.1389, + "num_input_tokens_seen": 11873400, + "step": 6075 + }, + { + "epoch": 0.8053015241882041, + "grad_norm": 0.19355402886867523, + "learning_rate": 4.164382208270797e-06, + "loss": 0.0014, + "num_input_tokens_seen": 11875088, + "step": 6076 + }, + { + "epoch": 0.8054340622929093, + "grad_norm": 6.217125415802002, + "learning_rate": 4.164123167982228e-06, + "loss": 0.0669, + "num_input_tokens_seen": 11877048, + "step": 6077 + }, + { + "epoch": 0.8055666003976143, + "grad_norm": 0.10231254249811172, + "learning_rate": 4.163864095608218e-06, + "loss": 0.0007, + "num_input_tokens_seen": 11878440, + "step": 6078 + }, + { + "epoch": 0.8056991385023194, + "grad_norm": 2.467869281768799, + "learning_rate": 4.16360499115376e-06, + "loss": 0.0254, + "num_input_tokens_seen": 11880456, + "step": 6079 + }, + { + "epoch": 0.8058316766070245, + "grad_norm": 1.3555772304534912, + "learning_rate": 4.16334585462385e-06, + "loss": 0.0084, + "num_input_tokens_seen": 11883360, + "step": 6080 + }, + { + "epoch": 0.8059642147117296, + "grad_norm": 0.061334118247032166, + "learning_rate": 4.163086686023484e-06, + "loss": 0.0004, + "num_input_tokens_seen": 11884656, + "step": 6081 + }, + { + "epoch": 0.8060967528164347, + "grad_norm": 0.1840270757675171, + "learning_rate": 4.16282748535766e-06, + "loss": 0.0013, + "num_input_tokens_seen": 11886208, + "step": 6082 + }, + { + "epoch": 0.8062292909211398, + "grad_norm": 7.886624336242676, + "learning_rate": 4.162568252631375e-06, + "loss": 0.25, + "num_input_tokens_seen": 11887816, + "step": 6083 + }, + { + "epoch": 0.806361829025845, + "grad_norm": 0.29648005962371826, + "learning_rate": 4.162308987849626e-06, + "loss": 0.0015, + "num_input_tokens_seen": 11890040, + "step": 6084 + }, + { + "epoch": 0.80649436713055, + "grad_norm": 4.816892147064209, + "learning_rate": 4.162049691017414e-06, + "loss": 0.117, + "num_input_tokens_seen": 11891992, + "step": 6085 + }, + { + "epoch": 0.8066269052352552, + "grad_norm": 13.150874137878418, + "learning_rate": 4.161790362139737e-06, + "loss": 0.3357, + "num_input_tokens_seen": 11893904, + "step": 6086 + }, + { + "epoch": 0.8067594433399602, + "grad_norm": 3.5175275802612305, + "learning_rate": 4.161531001221596e-06, + "loss": 0.0696, + "num_input_tokens_seen": 11895408, + "step": 6087 + }, + { + "epoch": 0.8068919814446653, + "grad_norm": 1.1539913415908813, + "learning_rate": 4.161271608267991e-06, + "loss": 0.0065, + "num_input_tokens_seen": 11896872, + "step": 6088 + }, + { + "epoch": 0.8070245195493705, + "grad_norm": 4.928594589233398, + "learning_rate": 4.161012183283922e-06, + "loss": 0.1082, + "num_input_tokens_seen": 11899640, + "step": 6089 + }, + { + "epoch": 0.8071570576540755, + "grad_norm": 5.266086101531982, + "learning_rate": 4.1607527262743945e-06, + "loss": 0.1278, + "num_input_tokens_seen": 11902232, + "step": 6090 + }, + { + "epoch": 0.8072895957587807, + "grad_norm": 13.122448921203613, + "learning_rate": 4.160493237244407e-06, + "loss": 0.5917, + "num_input_tokens_seen": 11905120, + "step": 6091 + }, + { + "epoch": 0.8074221338634857, + "grad_norm": 6.496933937072754, + "learning_rate": 4.160233716198965e-06, + "loss": 0.2639, + "num_input_tokens_seen": 11907368, + "step": 6092 + }, + { + "epoch": 0.8075546719681909, + "grad_norm": 8.79826545715332, + "learning_rate": 4.159974163143071e-06, + "loss": 0.203, + "num_input_tokens_seen": 11909240, + "step": 6093 + }, + { + "epoch": 0.807687210072896, + "grad_norm": 2.6826932430267334, + "learning_rate": 4.1597145780817305e-06, + "loss": 0.0179, + "num_input_tokens_seen": 11910624, + "step": 6094 + }, + { + "epoch": 0.807819748177601, + "grad_norm": 13.007667541503906, + "learning_rate": 4.159454961019948e-06, + "loss": 0.4548, + "num_input_tokens_seen": 11913536, + "step": 6095 + }, + { + "epoch": 0.8079522862823062, + "grad_norm": 0.010204214602708817, + "learning_rate": 4.159195311962729e-06, + "loss": 0.0001, + "num_input_tokens_seen": 11915048, + "step": 6096 + }, + { + "epoch": 0.8080848243870112, + "grad_norm": 2.3493385314941406, + "learning_rate": 4.158935630915079e-06, + "loss": 0.0396, + "num_input_tokens_seen": 11916928, + "step": 6097 + }, + { + "epoch": 0.8082173624917164, + "grad_norm": 0.04768723249435425, + "learning_rate": 4.158675917882007e-06, + "loss": 0.0003, + "num_input_tokens_seen": 11919176, + "step": 6098 + }, + { + "epoch": 0.8083499005964214, + "grad_norm": 0.18017545342445374, + "learning_rate": 4.158416172868518e-06, + "loss": 0.0013, + "num_input_tokens_seen": 11920520, + "step": 6099 + }, + { + "epoch": 0.8084824387011266, + "grad_norm": 6.926924228668213, + "learning_rate": 4.1581563958796214e-06, + "loss": 0.183, + "num_input_tokens_seen": 11922336, + "step": 6100 + }, + { + "epoch": 0.8086149768058317, + "grad_norm": 6.996441841125488, + "learning_rate": 4.157896586920326e-06, + "loss": 0.2006, + "num_input_tokens_seen": 11923912, + "step": 6101 + }, + { + "epoch": 0.8087475149105368, + "grad_norm": 0.028820473700761795, + "learning_rate": 4.15763674599564e-06, + "loss": 0.0002, + "num_input_tokens_seen": 11926768, + "step": 6102 + }, + { + "epoch": 0.8088800530152419, + "grad_norm": 4.1112494468688965, + "learning_rate": 4.157376873110575e-06, + "loss": 0.1544, + "num_input_tokens_seen": 11928672, + "step": 6103 + }, + { + "epoch": 0.8090125911199469, + "grad_norm": 9.680543899536133, + "learning_rate": 4.157116968270139e-06, + "loss": 0.4302, + "num_input_tokens_seen": 11930680, + "step": 6104 + }, + { + "epoch": 0.8091451292246521, + "grad_norm": 0.05377503111958504, + "learning_rate": 4.156857031479346e-06, + "loss": 0.0003, + "num_input_tokens_seen": 11932328, + "step": 6105 + }, + { + "epoch": 0.8092776673293572, + "grad_norm": 5.878805160522461, + "learning_rate": 4.156597062743206e-06, + "loss": 0.2099, + "num_input_tokens_seen": 11934520, + "step": 6106 + }, + { + "epoch": 0.8094102054340623, + "grad_norm": 9.819246292114258, + "learning_rate": 4.156337062066731e-06, + "loss": 0.196, + "num_input_tokens_seen": 11936528, + "step": 6107 + }, + { + "epoch": 0.8095427435387674, + "grad_norm": 13.29173469543457, + "learning_rate": 4.156077029454936e-06, + "loss": 0.6531, + "num_input_tokens_seen": 11939320, + "step": 6108 + }, + { + "epoch": 0.8096752816434725, + "grad_norm": 8.720046043395996, + "learning_rate": 4.155816964912833e-06, + "loss": 0.1242, + "num_input_tokens_seen": 11941464, + "step": 6109 + }, + { + "epoch": 0.8098078197481776, + "grad_norm": 7.567434787750244, + "learning_rate": 4.155556868445436e-06, + "loss": 0.3019, + "num_input_tokens_seen": 11944960, + "step": 6110 + }, + { + "epoch": 0.8099403578528827, + "grad_norm": 8.137965202331543, + "learning_rate": 4.155296740057761e-06, + "loss": 0.2041, + "num_input_tokens_seen": 11947040, + "step": 6111 + }, + { + "epoch": 0.8100728959575878, + "grad_norm": 5.043724536895752, + "learning_rate": 4.155036579754823e-06, + "loss": 0.1823, + "num_input_tokens_seen": 11949168, + "step": 6112 + }, + { + "epoch": 0.810205434062293, + "grad_norm": 0.033044882118701935, + "learning_rate": 4.154776387541638e-06, + "loss": 0.0002, + "num_input_tokens_seen": 11950448, + "step": 6113 + }, + { + "epoch": 0.810337972166998, + "grad_norm": 14.120488166809082, + "learning_rate": 4.154516163423223e-06, + "loss": 0.6416, + "num_input_tokens_seen": 11952840, + "step": 6114 + }, + { + "epoch": 0.8104705102717031, + "grad_norm": 10.699670791625977, + "learning_rate": 4.154255907404594e-06, + "loss": 0.3801, + "num_input_tokens_seen": 11955528, + "step": 6115 + }, + { + "epoch": 0.8106030483764082, + "grad_norm": 6.677476406097412, + "learning_rate": 4.153995619490771e-06, + "loss": 0.1422, + "num_input_tokens_seen": 11958104, + "step": 6116 + }, + { + "epoch": 0.8107355864811133, + "grad_norm": 0.1786443442106247, + "learning_rate": 4.153735299686771e-06, + "loss": 0.0012, + "num_input_tokens_seen": 11960128, + "step": 6117 + }, + { + "epoch": 0.8108681245858185, + "grad_norm": 0.21194416284561157, + "learning_rate": 4.153474947997613e-06, + "loss": 0.0015, + "num_input_tokens_seen": 11961648, + "step": 6118 + }, + { + "epoch": 0.8110006626905235, + "grad_norm": 0.0890662670135498, + "learning_rate": 4.1532145644283185e-06, + "loss": 0.0006, + "num_input_tokens_seen": 11963808, + "step": 6119 + }, + { + "epoch": 0.8111332007952287, + "grad_norm": 5.785924434661865, + "learning_rate": 4.152954148983905e-06, + "loss": 0.1664, + "num_input_tokens_seen": 11966352, + "step": 6120 + }, + { + "epoch": 0.8112657388999337, + "grad_norm": 13.983939170837402, + "learning_rate": 4.1526937016693966e-06, + "loss": 0.5277, + "num_input_tokens_seen": 11967856, + "step": 6121 + }, + { + "epoch": 0.8113982770046388, + "grad_norm": 0.5992397665977478, + "learning_rate": 4.152433222489814e-06, + "loss": 0.0086, + "num_input_tokens_seen": 11969632, + "step": 6122 + }, + { + "epoch": 0.8115308151093439, + "grad_norm": 8.290929794311523, + "learning_rate": 4.152172711450177e-06, + "loss": 0.2758, + "num_input_tokens_seen": 11972592, + "step": 6123 + }, + { + "epoch": 0.811663353214049, + "grad_norm": 5.982214450836182, + "learning_rate": 4.151912168555512e-06, + "loss": 0.1672, + "num_input_tokens_seen": 11974464, + "step": 6124 + }, + { + "epoch": 0.8117958913187542, + "grad_norm": 9.645614624023438, + "learning_rate": 4.151651593810839e-06, + "loss": 0.1947, + "num_input_tokens_seen": 11976792, + "step": 6125 + }, + { + "epoch": 0.8119284294234592, + "grad_norm": 8.89342975616455, + "learning_rate": 4.151390987221186e-06, + "loss": 0.1046, + "num_input_tokens_seen": 11978360, + "step": 6126 + }, + { + "epoch": 0.8120609675281644, + "grad_norm": 11.050488471984863, + "learning_rate": 4.151130348791575e-06, + "loss": 0.1653, + "num_input_tokens_seen": 11979904, + "step": 6127 + }, + { + "epoch": 0.8121935056328694, + "grad_norm": 0.1010405495762825, + "learning_rate": 4.1508696785270305e-06, + "loss": 0.0007, + "num_input_tokens_seen": 11981496, + "step": 6128 + }, + { + "epoch": 0.8123260437375746, + "grad_norm": 7.351529598236084, + "learning_rate": 4.150608976432581e-06, + "loss": 0.2574, + "num_input_tokens_seen": 11983512, + "step": 6129 + }, + { + "epoch": 0.8124585818422797, + "grad_norm": 14.337509155273438, + "learning_rate": 4.1503482425132505e-06, + "loss": 0.399, + "num_input_tokens_seen": 11984856, + "step": 6130 + }, + { + "epoch": 0.8125911199469847, + "grad_norm": 1.2829574346542358, + "learning_rate": 4.150087476774068e-06, + "loss": 0.0097, + "num_input_tokens_seen": 11986368, + "step": 6131 + }, + { + "epoch": 0.8127236580516899, + "grad_norm": 6.408889293670654, + "learning_rate": 4.149826679220061e-06, + "loss": 0.0725, + "num_input_tokens_seen": 11989712, + "step": 6132 + }, + { + "epoch": 0.8128561961563949, + "grad_norm": 13.214240074157715, + "learning_rate": 4.149565849856257e-06, + "loss": 0.644, + "num_input_tokens_seen": 11991832, + "step": 6133 + }, + { + "epoch": 0.8129887342611001, + "grad_norm": 11.497636795043945, + "learning_rate": 4.149304988687687e-06, + "loss": 0.5024, + "num_input_tokens_seen": 11994296, + "step": 6134 + }, + { + "epoch": 0.8131212723658051, + "grad_norm": 0.38540008664131165, + "learning_rate": 4.149044095719377e-06, + "loss": 0.0027, + "num_input_tokens_seen": 11995480, + "step": 6135 + }, + { + "epoch": 0.8132538104705103, + "grad_norm": 1.937770128250122, + "learning_rate": 4.14878317095636e-06, + "loss": 0.0332, + "num_input_tokens_seen": 11996904, + "step": 6136 + }, + { + "epoch": 0.8133863485752154, + "grad_norm": 21.665620803833008, + "learning_rate": 4.148522214403666e-06, + "loss": 0.923, + "num_input_tokens_seen": 11999968, + "step": 6137 + }, + { + "epoch": 0.8135188866799204, + "grad_norm": 14.573948860168457, + "learning_rate": 4.1482612260663255e-06, + "loss": 0.4182, + "num_input_tokens_seen": 12002184, + "step": 6138 + }, + { + "epoch": 0.8136514247846256, + "grad_norm": 0.11547131091356277, + "learning_rate": 4.148000205949373e-06, + "loss": 0.0007, + "num_input_tokens_seen": 12003376, + "step": 6139 + }, + { + "epoch": 0.8137839628893306, + "grad_norm": 9.716676712036133, + "learning_rate": 4.147739154057839e-06, + "loss": 0.4965, + "num_input_tokens_seen": 12006416, + "step": 6140 + }, + { + "epoch": 0.8139165009940358, + "grad_norm": 0.15690559148788452, + "learning_rate": 4.147478070396758e-06, + "loss": 0.0011, + "num_input_tokens_seen": 12007872, + "step": 6141 + }, + { + "epoch": 0.8140490390987409, + "grad_norm": 2.881154775619507, + "learning_rate": 4.147216954971162e-06, + "loss": 0.1374, + "num_input_tokens_seen": 12010432, + "step": 6142 + }, + { + "epoch": 0.814181577203446, + "grad_norm": 9.406018257141113, + "learning_rate": 4.146955807786087e-06, + "loss": 0.2384, + "num_input_tokens_seen": 12012256, + "step": 6143 + }, + { + "epoch": 0.8143141153081511, + "grad_norm": 7.63559627532959, + "learning_rate": 4.146694628846569e-06, + "loss": 0.3628, + "num_input_tokens_seen": 12014264, + "step": 6144 + }, + { + "epoch": 0.8144466534128562, + "grad_norm": 4.916349411010742, + "learning_rate": 4.146433418157642e-06, + "loss": 0.0664, + "num_input_tokens_seen": 12016056, + "step": 6145 + }, + { + "epoch": 0.8145791915175613, + "grad_norm": 0.33672434091567993, + "learning_rate": 4.1461721757243425e-06, + "loss": 0.0024, + "num_input_tokens_seen": 12019368, + "step": 6146 + }, + { + "epoch": 0.8147117296222665, + "grad_norm": 13.336397171020508, + "learning_rate": 4.145910901551708e-06, + "loss": 0.2492, + "num_input_tokens_seen": 12021984, + "step": 6147 + }, + { + "epoch": 0.8148442677269715, + "grad_norm": 0.3371427059173584, + "learning_rate": 4.145649595644776e-06, + "loss": 0.0023, + "num_input_tokens_seen": 12023856, + "step": 6148 + }, + { + "epoch": 0.8149768058316766, + "grad_norm": 4.515292167663574, + "learning_rate": 4.145388258008584e-06, + "loss": 0.0275, + "num_input_tokens_seen": 12026448, + "step": 6149 + }, + { + "epoch": 0.8151093439363817, + "grad_norm": 0.22956888377666473, + "learning_rate": 4.145126888648172e-06, + "loss": 0.0015, + "num_input_tokens_seen": 12027816, + "step": 6150 + }, + { + "epoch": 0.8152418820410868, + "grad_norm": 9.099516868591309, + "learning_rate": 4.144865487568578e-06, + "loss": 0.4176, + "num_input_tokens_seen": 12029328, + "step": 6151 + }, + { + "epoch": 0.8153744201457919, + "grad_norm": 5.425027847290039, + "learning_rate": 4.1446040547748435e-06, + "loss": 0.0546, + "num_input_tokens_seen": 12031128, + "step": 6152 + }, + { + "epoch": 0.815506958250497, + "grad_norm": 4.941423416137695, + "learning_rate": 4.144342590272008e-06, + "loss": 0.0538, + "num_input_tokens_seen": 12032576, + "step": 6153 + }, + { + "epoch": 0.8156394963552022, + "grad_norm": 8.356847763061523, + "learning_rate": 4.144081094065112e-06, + "loss": 0.154, + "num_input_tokens_seen": 12034648, + "step": 6154 + }, + { + "epoch": 0.8157720344599072, + "grad_norm": 6.338788986206055, + "learning_rate": 4.1438195661592e-06, + "loss": 0.319, + "num_input_tokens_seen": 12036168, + "step": 6155 + }, + { + "epoch": 0.8159045725646124, + "grad_norm": 9.516143798828125, + "learning_rate": 4.143558006559311e-06, + "loss": 0.1796, + "num_input_tokens_seen": 12039152, + "step": 6156 + }, + { + "epoch": 0.8160371106693174, + "grad_norm": 19.02674674987793, + "learning_rate": 4.1432964152704915e-06, + "loss": 0.7778, + "num_input_tokens_seen": 12041864, + "step": 6157 + }, + { + "epoch": 0.8161696487740225, + "grad_norm": 0.10304267704486847, + "learning_rate": 4.143034792297782e-06, + "loss": 0.0007, + "num_input_tokens_seen": 12043800, + "step": 6158 + }, + { + "epoch": 0.8163021868787277, + "grad_norm": 2.870968818664551, + "learning_rate": 4.142773137646229e-06, + "loss": 0.0965, + "num_input_tokens_seen": 12045144, + "step": 6159 + }, + { + "epoch": 0.8164347249834327, + "grad_norm": 10.343827247619629, + "learning_rate": 4.142511451320876e-06, + "loss": 0.2154, + "num_input_tokens_seen": 12046568, + "step": 6160 + }, + { + "epoch": 0.8165672630881379, + "grad_norm": 10.063755989074707, + "learning_rate": 4.14224973332677e-06, + "loss": 0.3229, + "num_input_tokens_seen": 12048384, + "step": 6161 + }, + { + "epoch": 0.8166998011928429, + "grad_norm": 15.415332794189453, + "learning_rate": 4.141987983668956e-06, + "loss": 0.4824, + "num_input_tokens_seen": 12050080, + "step": 6162 + }, + { + "epoch": 0.8168323392975481, + "grad_norm": 6.238321781158447, + "learning_rate": 4.1417262023524805e-06, + "loss": 0.2326, + "num_input_tokens_seen": 12051912, + "step": 6163 + }, + { + "epoch": 0.8169648774022531, + "grad_norm": 7.972569942474365, + "learning_rate": 4.141464389382392e-06, + "loss": 0.1352, + "num_input_tokens_seen": 12053912, + "step": 6164 + }, + { + "epoch": 0.8170974155069582, + "grad_norm": 0.061879269778728485, + "learning_rate": 4.141202544763737e-06, + "loss": 0.0004, + "num_input_tokens_seen": 12055336, + "step": 6165 + }, + { + "epoch": 0.8172299536116634, + "grad_norm": 0.14086776971817017, + "learning_rate": 4.140940668501565e-06, + "loss": 0.001, + "num_input_tokens_seen": 12057592, + "step": 6166 + }, + { + "epoch": 0.8173624917163684, + "grad_norm": 0.07263222336769104, + "learning_rate": 4.140678760600925e-06, + "loss": 0.0005, + "num_input_tokens_seen": 12059168, + "step": 6167 + }, + { + "epoch": 0.8174950298210736, + "grad_norm": 10.398987770080566, + "learning_rate": 4.140416821066866e-06, + "loss": 0.3898, + "num_input_tokens_seen": 12061072, + "step": 6168 + }, + { + "epoch": 0.8176275679257786, + "grad_norm": 2.268773317337036, + "learning_rate": 4.140154849904439e-06, + "loss": 0.0175, + "num_input_tokens_seen": 12062640, + "step": 6169 + }, + { + "epoch": 0.8177601060304838, + "grad_norm": 13.757884979248047, + "learning_rate": 4.1398928471186955e-06, + "loss": 0.4122, + "num_input_tokens_seen": 12064776, + "step": 6170 + }, + { + "epoch": 0.8178926441351889, + "grad_norm": 0.17335061728954315, + "learning_rate": 4.139630812714687e-06, + "loss": 0.0011, + "num_input_tokens_seen": 12066648, + "step": 6171 + }, + { + "epoch": 0.818025182239894, + "grad_norm": 0.4240673780441284, + "learning_rate": 4.139368746697464e-06, + "loss": 0.0029, + "num_input_tokens_seen": 12068488, + "step": 6172 + }, + { + "epoch": 0.8181577203445991, + "grad_norm": 0.35706931352615356, + "learning_rate": 4.139106649072082e-06, + "loss": 0.0024, + "num_input_tokens_seen": 12069968, + "step": 6173 + }, + { + "epoch": 0.8182902584493041, + "grad_norm": 0.09763240069150925, + "learning_rate": 4.138844519843593e-06, + "loss": 0.0007, + "num_input_tokens_seen": 12071504, + "step": 6174 + }, + { + "epoch": 0.8184227965540093, + "grad_norm": 8.46609878540039, + "learning_rate": 4.1385823590170505e-06, + "loss": 0.231, + "num_input_tokens_seen": 12072744, + "step": 6175 + }, + { + "epoch": 0.8185553346587143, + "grad_norm": 0.08261913061141968, + "learning_rate": 4.138320166597509e-06, + "loss": 0.0006, + "num_input_tokens_seen": 12074736, + "step": 6176 + }, + { + "epoch": 0.8186878727634195, + "grad_norm": 12.54770565032959, + "learning_rate": 4.138057942590026e-06, + "loss": 0.2753, + "num_input_tokens_seen": 12077296, + "step": 6177 + }, + { + "epoch": 0.8188204108681246, + "grad_norm": 0.351381778717041, + "learning_rate": 4.137795686999655e-06, + "loss": 0.0024, + "num_input_tokens_seen": 12079920, + "step": 6178 + }, + { + "epoch": 0.8189529489728297, + "grad_norm": 8.909554481506348, + "learning_rate": 4.137533399831453e-06, + "loss": 0.1311, + "num_input_tokens_seen": 12082016, + "step": 6179 + }, + { + "epoch": 0.8190854870775348, + "grad_norm": 10.467844009399414, + "learning_rate": 4.137271081090478e-06, + "loss": 0.3334, + "num_input_tokens_seen": 12084024, + "step": 6180 + }, + { + "epoch": 0.8192180251822399, + "grad_norm": 15.41341781616211, + "learning_rate": 4.1370087307817865e-06, + "loss": 0.4528, + "num_input_tokens_seen": 12086088, + "step": 6181 + }, + { + "epoch": 0.819350563286945, + "grad_norm": 4.557136058807373, + "learning_rate": 4.136746348910438e-06, + "loss": 0.058, + "num_input_tokens_seen": 12089128, + "step": 6182 + }, + { + "epoch": 0.8194831013916501, + "grad_norm": 13.68031120300293, + "learning_rate": 4.136483935481491e-06, + "loss": 0.2493, + "num_input_tokens_seen": 12091664, + "step": 6183 + }, + { + "epoch": 0.8196156394963552, + "grad_norm": 0.34771475195884705, + "learning_rate": 4.1362214905000045e-06, + "loss": 0.0024, + "num_input_tokens_seen": 12093784, + "step": 6184 + }, + { + "epoch": 0.8197481776010603, + "grad_norm": 0.03295198827981949, + "learning_rate": 4.135959013971038e-06, + "loss": 0.0002, + "num_input_tokens_seen": 12095192, + "step": 6185 + }, + { + "epoch": 0.8198807157057654, + "grad_norm": 7.492112636566162, + "learning_rate": 4.135696505899654e-06, + "loss": 0.2562, + "num_input_tokens_seen": 12097168, + "step": 6186 + }, + { + "epoch": 0.8200132538104705, + "grad_norm": 0.05433334782719612, + "learning_rate": 4.135433966290912e-06, + "loss": 0.0004, + "num_input_tokens_seen": 12098400, + "step": 6187 + }, + { + "epoch": 0.8201457919151756, + "grad_norm": 5.8571882247924805, + "learning_rate": 4.135171395149876e-06, + "loss": 0.1314, + "num_input_tokens_seen": 12100392, + "step": 6188 + }, + { + "epoch": 0.8202783300198807, + "grad_norm": 2.2193994522094727, + "learning_rate": 4.134908792481607e-06, + "loss": 0.0486, + "num_input_tokens_seen": 12103456, + "step": 6189 + }, + { + "epoch": 0.8204108681245859, + "grad_norm": 0.10799624770879745, + "learning_rate": 4.1346461582911686e-06, + "loss": 0.0008, + "num_input_tokens_seen": 12105256, + "step": 6190 + }, + { + "epoch": 0.8205434062292909, + "grad_norm": 8.882286071777344, + "learning_rate": 4.134383492583625e-06, + "loss": 0.4695, + "num_input_tokens_seen": 12107232, + "step": 6191 + }, + { + "epoch": 0.820675944333996, + "grad_norm": 0.07522015273571014, + "learning_rate": 4.13412079536404e-06, + "loss": 0.0005, + "num_input_tokens_seen": 12108976, + "step": 6192 + }, + { + "epoch": 0.8208084824387011, + "grad_norm": 3.9227962493896484, + "learning_rate": 4.133858066637478e-06, + "loss": 0.1063, + "num_input_tokens_seen": 12111720, + "step": 6193 + }, + { + "epoch": 0.8209410205434062, + "grad_norm": 9.348179817199707, + "learning_rate": 4.133595306409006e-06, + "loss": 0.3153, + "num_input_tokens_seen": 12113416, + "step": 6194 + }, + { + "epoch": 0.8210735586481114, + "grad_norm": 2.6346051692962646, + "learning_rate": 4.133332514683689e-06, + "loss": 0.0197, + "num_input_tokens_seen": 12115136, + "step": 6195 + }, + { + "epoch": 0.8212060967528164, + "grad_norm": 0.04772911220788956, + "learning_rate": 4.1330696914665954e-06, + "loss": 0.0003, + "num_input_tokens_seen": 12116416, + "step": 6196 + }, + { + "epoch": 0.8213386348575216, + "grad_norm": 3.1681723594665527, + "learning_rate": 4.132806836762791e-06, + "loss": 0.0919, + "num_input_tokens_seen": 12118000, + "step": 6197 + }, + { + "epoch": 0.8214711729622266, + "grad_norm": 12.629515647888184, + "learning_rate": 4.132543950577344e-06, + "loss": 0.2717, + "num_input_tokens_seen": 12119944, + "step": 6198 + }, + { + "epoch": 0.8216037110669318, + "grad_norm": 11.544231414794922, + "learning_rate": 4.132281032915324e-06, + "loss": 0.6901, + "num_input_tokens_seen": 12122712, + "step": 6199 + }, + { + "epoch": 0.8217362491716369, + "grad_norm": 9.117243766784668, + "learning_rate": 4.132018083781799e-06, + "loss": 0.2474, + "num_input_tokens_seen": 12124904, + "step": 6200 + }, + { + "epoch": 0.8218687872763419, + "grad_norm": 2.8727691173553467, + "learning_rate": 4.13175510318184e-06, + "loss": 0.0433, + "num_input_tokens_seen": 12126360, + "step": 6201 + }, + { + "epoch": 0.8220013253810471, + "grad_norm": 10.489371299743652, + "learning_rate": 4.131492091120517e-06, + "loss": 0.4187, + "num_input_tokens_seen": 12128312, + "step": 6202 + }, + { + "epoch": 0.8221338634857521, + "grad_norm": 5.456762790679932, + "learning_rate": 4.131229047602901e-06, + "loss": 0.165, + "num_input_tokens_seen": 12130192, + "step": 6203 + }, + { + "epoch": 0.8222664015904573, + "grad_norm": 0.09675624966621399, + "learning_rate": 4.130965972634063e-06, + "loss": 0.0007, + "num_input_tokens_seen": 12132224, + "step": 6204 + }, + { + "epoch": 0.8223989396951623, + "grad_norm": 6.89369535446167, + "learning_rate": 4.130702866219076e-06, + "loss": 0.1046, + "num_input_tokens_seen": 12134528, + "step": 6205 + }, + { + "epoch": 0.8225314777998675, + "grad_norm": 5.970261096954346, + "learning_rate": 4.1304397283630125e-06, + "loss": 0.1615, + "num_input_tokens_seen": 12136392, + "step": 6206 + }, + { + "epoch": 0.8226640159045726, + "grad_norm": 4.487571716308594, + "learning_rate": 4.1301765590709475e-06, + "loss": 0.0366, + "num_input_tokens_seen": 12137752, + "step": 6207 + }, + { + "epoch": 0.8227965540092776, + "grad_norm": 2.5095205307006836, + "learning_rate": 4.129913358347952e-06, + "loss": 0.0435, + "num_input_tokens_seen": 12138920, + "step": 6208 + }, + { + "epoch": 0.8229290921139828, + "grad_norm": 5.72824239730835, + "learning_rate": 4.129650126199104e-06, + "loss": 0.2694, + "num_input_tokens_seen": 12141088, + "step": 6209 + }, + { + "epoch": 0.8230616302186878, + "grad_norm": 14.83043098449707, + "learning_rate": 4.129386862629477e-06, + "loss": 0.4705, + "num_input_tokens_seen": 12143632, + "step": 6210 + }, + { + "epoch": 0.823194168323393, + "grad_norm": 11.735869407653809, + "learning_rate": 4.129123567644148e-06, + "loss": 0.3842, + "num_input_tokens_seen": 12145664, + "step": 6211 + }, + { + "epoch": 0.8233267064280981, + "grad_norm": 6.481836795806885, + "learning_rate": 4.128860241248191e-06, + "loss": 0.1997, + "num_input_tokens_seen": 12147920, + "step": 6212 + }, + { + "epoch": 0.8234592445328032, + "grad_norm": 7.761951923370361, + "learning_rate": 4.128596883446686e-06, + "loss": 0.445, + "num_input_tokens_seen": 12150056, + "step": 6213 + }, + { + "epoch": 0.8235917826375083, + "grad_norm": 6.926527976989746, + "learning_rate": 4.128333494244709e-06, + "loss": 0.2015, + "num_input_tokens_seen": 12151896, + "step": 6214 + }, + { + "epoch": 0.8237243207422134, + "grad_norm": 10.51961612701416, + "learning_rate": 4.12807007364734e-06, + "loss": 0.4099, + "num_input_tokens_seen": 12153432, + "step": 6215 + }, + { + "epoch": 0.8238568588469185, + "grad_norm": 7.494429111480713, + "learning_rate": 4.127806621659656e-06, + "loss": 0.169, + "num_input_tokens_seen": 12155384, + "step": 6216 + }, + { + "epoch": 0.8239893969516235, + "grad_norm": 0.5483980774879456, + "learning_rate": 4.127543138286738e-06, + "loss": 0.0039, + "num_input_tokens_seen": 12157496, + "step": 6217 + }, + { + "epoch": 0.8241219350563287, + "grad_norm": 13.005352020263672, + "learning_rate": 4.127279623533664e-06, + "loss": 0.3503, + "num_input_tokens_seen": 12159496, + "step": 6218 + }, + { + "epoch": 0.8242544731610338, + "grad_norm": 8.437920570373535, + "learning_rate": 4.127016077405517e-06, + "loss": 0.2606, + "num_input_tokens_seen": 12161224, + "step": 6219 + }, + { + "epoch": 0.8243870112657389, + "grad_norm": 4.138315200805664, + "learning_rate": 4.126752499907378e-06, + "loss": 0.0276, + "num_input_tokens_seen": 12163248, + "step": 6220 + }, + { + "epoch": 0.824519549370444, + "grad_norm": 8.378972053527832, + "learning_rate": 4.126488891044329e-06, + "loss": 0.1636, + "num_input_tokens_seen": 12165200, + "step": 6221 + }, + { + "epoch": 0.8246520874751491, + "grad_norm": 1.0396136045455933, + "learning_rate": 4.126225250821451e-06, + "loss": 0.0078, + "num_input_tokens_seen": 12166784, + "step": 6222 + }, + { + "epoch": 0.8247846255798542, + "grad_norm": 5.494361400604248, + "learning_rate": 4.125961579243829e-06, + "loss": 0.1194, + "num_input_tokens_seen": 12169216, + "step": 6223 + }, + { + "epoch": 0.8249171636845594, + "grad_norm": 0.8125260472297668, + "learning_rate": 4.125697876316546e-06, + "loss": 0.0061, + "num_input_tokens_seen": 12171288, + "step": 6224 + }, + { + "epoch": 0.8250497017892644, + "grad_norm": 9.584872245788574, + "learning_rate": 4.125434142044687e-06, + "loss": 0.3961, + "num_input_tokens_seen": 12173496, + "step": 6225 + }, + { + "epoch": 0.8251822398939695, + "grad_norm": 11.020746231079102, + "learning_rate": 4.125170376433335e-06, + "loss": 0.2975, + "num_input_tokens_seen": 12175976, + "step": 6226 + }, + { + "epoch": 0.8253147779986746, + "grad_norm": 0.5590848326683044, + "learning_rate": 4.124906579487578e-06, + "loss": 0.004, + "num_input_tokens_seen": 12177616, + "step": 6227 + }, + { + "epoch": 0.8254473161033797, + "grad_norm": 3.3342487812042236, + "learning_rate": 4.124642751212501e-06, + "loss": 0.0244, + "num_input_tokens_seen": 12178920, + "step": 6228 + }, + { + "epoch": 0.8255798542080848, + "grad_norm": 5.280827045440674, + "learning_rate": 4.1243788916131905e-06, + "loss": 0.2066, + "num_input_tokens_seen": 12180184, + "step": 6229 + }, + { + "epoch": 0.8257123923127899, + "grad_norm": 0.30848872661590576, + "learning_rate": 4.124115000694735e-06, + "loss": 0.0023, + "num_input_tokens_seen": 12182208, + "step": 6230 + }, + { + "epoch": 0.8258449304174951, + "grad_norm": 5.568369388580322, + "learning_rate": 4.123851078462221e-06, + "loss": 0.0861, + "num_input_tokens_seen": 12184800, + "step": 6231 + }, + { + "epoch": 0.8259774685222001, + "grad_norm": 2.4100353717803955, + "learning_rate": 4.123587124920739e-06, + "loss": 0.0175, + "num_input_tokens_seen": 12187248, + "step": 6232 + }, + { + "epoch": 0.8261100066269053, + "grad_norm": 9.171369552612305, + "learning_rate": 4.123323140075377e-06, + "loss": 0.1491, + "num_input_tokens_seen": 12189368, + "step": 6233 + }, + { + "epoch": 0.8262425447316103, + "grad_norm": 0.213578462600708, + "learning_rate": 4.123059123931224e-06, + "loss": 0.0015, + "num_input_tokens_seen": 12191008, + "step": 6234 + }, + { + "epoch": 0.8263750828363154, + "grad_norm": 5.748342037200928, + "learning_rate": 4.122795076493373e-06, + "loss": 0.2569, + "num_input_tokens_seen": 12193320, + "step": 6235 + }, + { + "epoch": 0.8265076209410206, + "grad_norm": 5.8942999839782715, + "learning_rate": 4.1225309977669125e-06, + "loss": 0.2039, + "num_input_tokens_seen": 12195136, + "step": 6236 + }, + { + "epoch": 0.8266401590457256, + "grad_norm": 3.219637155532837, + "learning_rate": 4.122266887756935e-06, + "loss": 0.0601, + "num_input_tokens_seen": 12196824, + "step": 6237 + }, + { + "epoch": 0.8267726971504308, + "grad_norm": 10.456040382385254, + "learning_rate": 4.122002746468532e-06, + "loss": 0.4008, + "num_input_tokens_seen": 12199280, + "step": 6238 + }, + { + "epoch": 0.8269052352551358, + "grad_norm": 4.665362358093262, + "learning_rate": 4.121738573906799e-06, + "loss": 0.1367, + "num_input_tokens_seen": 12201104, + "step": 6239 + }, + { + "epoch": 0.827037773359841, + "grad_norm": 0.05203327164053917, + "learning_rate": 4.121474370076825e-06, + "loss": 0.0004, + "num_input_tokens_seen": 12203400, + "step": 6240 + }, + { + "epoch": 0.827170311464546, + "grad_norm": 13.856741905212402, + "learning_rate": 4.121210134983709e-06, + "loss": 0.3902, + "num_input_tokens_seen": 12205216, + "step": 6241 + }, + { + "epoch": 0.8273028495692512, + "grad_norm": 11.38778018951416, + "learning_rate": 4.120945868632542e-06, + "loss": 0.3487, + "num_input_tokens_seen": 12207368, + "step": 6242 + }, + { + "epoch": 0.8274353876739563, + "grad_norm": 12.541460037231445, + "learning_rate": 4.120681571028421e-06, + "loss": 0.4703, + "num_input_tokens_seen": 12209104, + "step": 6243 + }, + { + "epoch": 0.8275679257786613, + "grad_norm": 6.679209232330322, + "learning_rate": 4.12041724217644e-06, + "loss": 0.1269, + "num_input_tokens_seen": 12211080, + "step": 6244 + }, + { + "epoch": 0.8277004638833665, + "grad_norm": 8.423904418945312, + "learning_rate": 4.120152882081698e-06, + "loss": 0.2557, + "num_input_tokens_seen": 12212848, + "step": 6245 + }, + { + "epoch": 0.8278330019880715, + "grad_norm": 7.195593357086182, + "learning_rate": 4.119888490749291e-06, + "loss": 0.284, + "num_input_tokens_seen": 12214840, + "step": 6246 + }, + { + "epoch": 0.8279655400927767, + "grad_norm": 0.030502120032906532, + "learning_rate": 4.119624068184314e-06, + "loss": 0.0002, + "num_input_tokens_seen": 12216216, + "step": 6247 + }, + { + "epoch": 0.8280980781974818, + "grad_norm": 5.737765789031982, + "learning_rate": 4.11935961439187e-06, + "loss": 0.0997, + "num_input_tokens_seen": 12218600, + "step": 6248 + }, + { + "epoch": 0.8282306163021869, + "grad_norm": 8.605488777160645, + "learning_rate": 4.1190951293770555e-06, + "loss": 0.1571, + "num_input_tokens_seen": 12220712, + "step": 6249 + }, + { + "epoch": 0.828363154406892, + "grad_norm": 13.504863739013672, + "learning_rate": 4.118830613144969e-06, + "loss": 0.385, + "num_input_tokens_seen": 12223048, + "step": 6250 + }, + { + "epoch": 0.828495692511597, + "grad_norm": 5.494770526885986, + "learning_rate": 4.118566065700712e-06, + "loss": 0.163, + "num_input_tokens_seen": 12225320, + "step": 6251 + }, + { + "epoch": 0.8286282306163022, + "grad_norm": 2.2683663368225098, + "learning_rate": 4.118301487049385e-06, + "loss": 0.0251, + "num_input_tokens_seen": 12228208, + "step": 6252 + }, + { + "epoch": 0.8287607687210072, + "grad_norm": 9.019432067871094, + "learning_rate": 4.118036877196087e-06, + "loss": 0.2643, + "num_input_tokens_seen": 12230168, + "step": 6253 + }, + { + "epoch": 0.8288933068257124, + "grad_norm": 13.964994430541992, + "learning_rate": 4.117772236145924e-06, + "loss": 0.4196, + "num_input_tokens_seen": 12233344, + "step": 6254 + }, + { + "epoch": 0.8290258449304175, + "grad_norm": 5.32543420791626, + "learning_rate": 4.1175075639039955e-06, + "loss": 0.1, + "num_input_tokens_seen": 12235376, + "step": 6255 + }, + { + "epoch": 0.8291583830351226, + "grad_norm": 0.0760633572936058, + "learning_rate": 4.117242860475405e-06, + "loss": 0.0005, + "num_input_tokens_seen": 12236968, + "step": 6256 + }, + { + "epoch": 0.8292909211398277, + "grad_norm": 4.2061944007873535, + "learning_rate": 4.116978125865257e-06, + "loss": 0.0341, + "num_input_tokens_seen": 12238080, + "step": 6257 + }, + { + "epoch": 0.8294234592445328, + "grad_norm": 1.437563419342041, + "learning_rate": 4.116713360078655e-06, + "loss": 0.0099, + "num_input_tokens_seen": 12239736, + "step": 6258 + }, + { + "epoch": 0.8295559973492379, + "grad_norm": 3.3553242683410645, + "learning_rate": 4.116448563120703e-06, + "loss": 0.0942, + "num_input_tokens_seen": 12241320, + "step": 6259 + }, + { + "epoch": 0.829688535453943, + "grad_norm": 12.096250534057617, + "learning_rate": 4.11618373499651e-06, + "loss": 0.4797, + "num_input_tokens_seen": 12243536, + "step": 6260 + }, + { + "epoch": 0.8298210735586481, + "grad_norm": 0.10807611048221588, + "learning_rate": 4.115918875711177e-06, + "loss": 0.0008, + "num_input_tokens_seen": 12245728, + "step": 6261 + }, + { + "epoch": 0.8299536116633532, + "grad_norm": 0.16108787059783936, + "learning_rate": 4.115653985269815e-06, + "loss": 0.0011, + "num_input_tokens_seen": 12248584, + "step": 6262 + }, + { + "epoch": 0.8300861497680583, + "grad_norm": 0.08902917802333832, + "learning_rate": 4.115389063677529e-06, + "loss": 0.0006, + "num_input_tokens_seen": 12250312, + "step": 6263 + }, + { + "epoch": 0.8302186878727634, + "grad_norm": 1.0257246494293213, + "learning_rate": 4.115124110939428e-06, + "loss": 0.0103, + "num_input_tokens_seen": 12252440, + "step": 6264 + }, + { + "epoch": 0.8303512259774686, + "grad_norm": 0.09052792936563492, + "learning_rate": 4.114859127060619e-06, + "loss": 0.0006, + "num_input_tokens_seen": 12254128, + "step": 6265 + }, + { + "epoch": 0.8304837640821736, + "grad_norm": 0.10972768813371658, + "learning_rate": 4.114594112046213e-06, + "loss": 0.0008, + "num_input_tokens_seen": 12256784, + "step": 6266 + }, + { + "epoch": 0.8306163021868788, + "grad_norm": 4.139855861663818, + "learning_rate": 4.114329065901318e-06, + "loss": 0.0647, + "num_input_tokens_seen": 12258640, + "step": 6267 + }, + { + "epoch": 0.8307488402915838, + "grad_norm": 10.520730018615723, + "learning_rate": 4.114063988631045e-06, + "loss": 0.3175, + "num_input_tokens_seen": 12260560, + "step": 6268 + }, + { + "epoch": 0.830881378396289, + "grad_norm": 12.92013168334961, + "learning_rate": 4.1137988802405045e-06, + "loss": 0.2762, + "num_input_tokens_seen": 12262840, + "step": 6269 + }, + { + "epoch": 0.831013916500994, + "grad_norm": 5.192386150360107, + "learning_rate": 4.113533740734808e-06, + "loss": 0.0964, + "num_input_tokens_seen": 12265520, + "step": 6270 + }, + { + "epoch": 0.8311464546056991, + "grad_norm": 0.05114308372139931, + "learning_rate": 4.113268570119069e-06, + "loss": 0.0004, + "num_input_tokens_seen": 12267640, + "step": 6271 + }, + { + "epoch": 0.8312789927104043, + "grad_norm": 3.6195547580718994, + "learning_rate": 4.113003368398399e-06, + "loss": 0.1584, + "num_input_tokens_seen": 12269296, + "step": 6272 + }, + { + "epoch": 0.8314115308151093, + "grad_norm": 11.716181755065918, + "learning_rate": 4.112738135577911e-06, + "loss": 0.5088, + "num_input_tokens_seen": 12271536, + "step": 6273 + }, + { + "epoch": 0.8315440689198145, + "grad_norm": 6.594929218292236, + "learning_rate": 4.112472871662719e-06, + "loss": 0.104, + "num_input_tokens_seen": 12272928, + "step": 6274 + }, + { + "epoch": 0.8316766070245195, + "grad_norm": 11.424931526184082, + "learning_rate": 4.1122075766579385e-06, + "loss": 0.5056, + "num_input_tokens_seen": 12275312, + "step": 6275 + }, + { + "epoch": 0.8318091451292247, + "grad_norm": 7.557201385498047, + "learning_rate": 4.111942250568682e-06, + "loss": 0.2129, + "num_input_tokens_seen": 12277224, + "step": 6276 + }, + { + "epoch": 0.8319416832339298, + "grad_norm": 10.103591918945312, + "learning_rate": 4.111676893400069e-06, + "loss": 0.4211, + "num_input_tokens_seen": 12278760, + "step": 6277 + }, + { + "epoch": 0.8320742213386348, + "grad_norm": 3.9535558223724365, + "learning_rate": 4.111411505157211e-06, + "loss": 0.0977, + "num_input_tokens_seen": 12280744, + "step": 6278 + }, + { + "epoch": 0.83220675944334, + "grad_norm": 8.658276557922363, + "learning_rate": 4.111146085845229e-06, + "loss": 0.3607, + "num_input_tokens_seen": 12282880, + "step": 6279 + }, + { + "epoch": 0.832339297548045, + "grad_norm": 4.8431901931762695, + "learning_rate": 4.11088063546924e-06, + "loss": 0.0953, + "num_input_tokens_seen": 12284408, + "step": 6280 + }, + { + "epoch": 0.8324718356527502, + "grad_norm": 17.915311813354492, + "learning_rate": 4.11061515403436e-06, + "loss": 0.5008, + "num_input_tokens_seen": 12286720, + "step": 6281 + }, + { + "epoch": 0.8326043737574552, + "grad_norm": 10.253244400024414, + "learning_rate": 4.110349641545709e-06, + "loss": 0.3892, + "num_input_tokens_seen": 12288848, + "step": 6282 + }, + { + "epoch": 0.8327369118621604, + "grad_norm": 0.2947631776332855, + "learning_rate": 4.110084098008405e-06, + "loss": 0.0021, + "num_input_tokens_seen": 12290680, + "step": 6283 + }, + { + "epoch": 0.8328694499668655, + "grad_norm": 3.5136497020721436, + "learning_rate": 4.10981852342757e-06, + "loss": 0.0234, + "num_input_tokens_seen": 12292488, + "step": 6284 + }, + { + "epoch": 0.8330019880715706, + "grad_norm": 0.11030790209770203, + "learning_rate": 4.1095529178083225e-06, + "loss": 0.0008, + "num_input_tokens_seen": 12293952, + "step": 6285 + }, + { + "epoch": 0.8331345261762757, + "grad_norm": 0.5135043859481812, + "learning_rate": 4.109287281155785e-06, + "loss": 0.0037, + "num_input_tokens_seen": 12295432, + "step": 6286 + }, + { + "epoch": 0.8332670642809807, + "grad_norm": 6.956450939178467, + "learning_rate": 4.109021613475078e-06, + "loss": 0.2062, + "num_input_tokens_seen": 12296872, + "step": 6287 + }, + { + "epoch": 0.8333996023856859, + "grad_norm": 0.48678722977638245, + "learning_rate": 4.1087559147713255e-06, + "loss": 0.0034, + "num_input_tokens_seen": 12298560, + "step": 6288 + }, + { + "epoch": 0.833532140490391, + "grad_norm": 11.695629119873047, + "learning_rate": 4.108490185049648e-06, + "loss": 0.2915, + "num_input_tokens_seen": 12300720, + "step": 6289 + }, + { + "epoch": 0.8336646785950961, + "grad_norm": 0.07623516768217087, + "learning_rate": 4.10822442431517e-06, + "loss": 0.0005, + "num_input_tokens_seen": 12301816, + "step": 6290 + }, + { + "epoch": 0.8337972166998012, + "grad_norm": 2.603740930557251, + "learning_rate": 4.107958632573016e-06, + "loss": 0.0209, + "num_input_tokens_seen": 12303424, + "step": 6291 + }, + { + "epoch": 0.8339297548045063, + "grad_norm": 3.211172103881836, + "learning_rate": 4.10769280982831e-06, + "loss": 0.0507, + "num_input_tokens_seen": 12304992, + "step": 6292 + }, + { + "epoch": 0.8340622929092114, + "grad_norm": 5.811161518096924, + "learning_rate": 4.107426956086178e-06, + "loss": 0.2022, + "num_input_tokens_seen": 12307400, + "step": 6293 + }, + { + "epoch": 0.8341948310139164, + "grad_norm": 7.342547416687012, + "learning_rate": 4.107161071351745e-06, + "loss": 0.299, + "num_input_tokens_seen": 12310224, + "step": 6294 + }, + { + "epoch": 0.8343273691186216, + "grad_norm": 8.435735702514648, + "learning_rate": 4.106895155630138e-06, + "loss": 0.1423, + "num_input_tokens_seen": 12312480, + "step": 6295 + }, + { + "epoch": 0.8344599072233267, + "grad_norm": 0.1092458963394165, + "learning_rate": 4.1066292089264845e-06, + "loss": 0.0008, + "num_input_tokens_seen": 12315384, + "step": 6296 + }, + { + "epoch": 0.8345924453280318, + "grad_norm": 5.305521488189697, + "learning_rate": 4.106363231245911e-06, + "loss": 0.0456, + "num_input_tokens_seen": 12317504, + "step": 6297 + }, + { + "epoch": 0.8347249834327369, + "grad_norm": 8.580910682678223, + "learning_rate": 4.1060972225935455e-06, + "loss": 0.155, + "num_input_tokens_seen": 12319744, + "step": 6298 + }, + { + "epoch": 0.834857521537442, + "grad_norm": 0.22852346301078796, + "learning_rate": 4.105831182974518e-06, + "loss": 0.0016, + "num_input_tokens_seen": 12321160, + "step": 6299 + }, + { + "epoch": 0.8349900596421471, + "grad_norm": 5.1165289878845215, + "learning_rate": 4.105565112393958e-06, + "loss": 0.1637, + "num_input_tokens_seen": 12323952, + "step": 6300 + }, + { + "epoch": 0.8351225977468523, + "grad_norm": 5.195356369018555, + "learning_rate": 4.105299010856995e-06, + "loss": 0.0787, + "num_input_tokens_seen": 12325240, + "step": 6301 + }, + { + "epoch": 0.8352551358515573, + "grad_norm": 3.7653090953826904, + "learning_rate": 4.105032878368759e-06, + "loss": 0.0738, + "num_input_tokens_seen": 12327568, + "step": 6302 + }, + { + "epoch": 0.8353876739562625, + "grad_norm": 0.12128431349992752, + "learning_rate": 4.104766714934383e-06, + "loss": 0.0008, + "num_input_tokens_seen": 12329216, + "step": 6303 + }, + { + "epoch": 0.8355202120609675, + "grad_norm": 5.701632022857666, + "learning_rate": 4.1045005205589965e-06, + "loss": 0.1953, + "num_input_tokens_seen": 12331056, + "step": 6304 + }, + { + "epoch": 0.8356527501656726, + "grad_norm": 0.14814801514148712, + "learning_rate": 4.1042342952477334e-06, + "loss": 0.001, + "num_input_tokens_seen": 12334528, + "step": 6305 + }, + { + "epoch": 0.8357852882703777, + "grad_norm": 2.8101770877838135, + "learning_rate": 4.103968039005726e-06, + "loss": 0.0961, + "num_input_tokens_seen": 12336864, + "step": 6306 + }, + { + "epoch": 0.8359178263750828, + "grad_norm": 4.956212043762207, + "learning_rate": 4.103701751838108e-06, + "loss": 0.1834, + "num_input_tokens_seen": 12338688, + "step": 6307 + }, + { + "epoch": 0.836050364479788, + "grad_norm": 0.09347277134656906, + "learning_rate": 4.103435433750015e-06, + "loss": 0.0006, + "num_input_tokens_seen": 12340000, + "step": 6308 + }, + { + "epoch": 0.836182902584493, + "grad_norm": 4.287031650543213, + "learning_rate": 4.10316908474658e-06, + "loss": 0.0396, + "num_input_tokens_seen": 12341976, + "step": 6309 + }, + { + "epoch": 0.8363154406891982, + "grad_norm": 11.563910484313965, + "learning_rate": 4.10290270483294e-06, + "loss": 0.3212, + "num_input_tokens_seen": 12344072, + "step": 6310 + }, + { + "epoch": 0.8364479787939032, + "grad_norm": 5.998438835144043, + "learning_rate": 4.1026362940142285e-06, + "loss": 0.1157, + "num_input_tokens_seen": 12346728, + "step": 6311 + }, + { + "epoch": 0.8365805168986084, + "grad_norm": 5.664106369018555, + "learning_rate": 4.102369852295584e-06, + "loss": 0.1563, + "num_input_tokens_seen": 12348424, + "step": 6312 + }, + { + "epoch": 0.8367130550033135, + "grad_norm": 11.986909866333008, + "learning_rate": 4.102103379682145e-06, + "loss": 0.2022, + "num_input_tokens_seen": 12350160, + "step": 6313 + }, + { + "epoch": 0.8368455931080185, + "grad_norm": 6.839475154876709, + "learning_rate": 4.1018368761790465e-06, + "loss": 0.1315, + "num_input_tokens_seen": 12351456, + "step": 6314 + }, + { + "epoch": 0.8369781312127237, + "grad_norm": 11.530745506286621, + "learning_rate": 4.1015703417914275e-06, + "loss": 0.2328, + "num_input_tokens_seen": 12353112, + "step": 6315 + }, + { + "epoch": 0.8371106693174287, + "grad_norm": 6.243917465209961, + "learning_rate": 4.101303776524428e-06, + "loss": 0.0768, + "num_input_tokens_seen": 12355976, + "step": 6316 + }, + { + "epoch": 0.8372432074221339, + "grad_norm": 0.14724506437778473, + "learning_rate": 4.101037180383187e-06, + "loss": 0.001, + "num_input_tokens_seen": 12357456, + "step": 6317 + }, + { + "epoch": 0.837375745526839, + "grad_norm": 0.04850737750530243, + "learning_rate": 4.100770553372844e-06, + "loss": 0.0003, + "num_input_tokens_seen": 12359176, + "step": 6318 + }, + { + "epoch": 0.8375082836315441, + "grad_norm": 1.7291908264160156, + "learning_rate": 4.100503895498542e-06, + "loss": 0.0293, + "num_input_tokens_seen": 12360840, + "step": 6319 + }, + { + "epoch": 0.8376408217362492, + "grad_norm": 0.3062196373939514, + "learning_rate": 4.100237206765419e-06, + "loss": 0.0022, + "num_input_tokens_seen": 12362696, + "step": 6320 + }, + { + "epoch": 0.8377733598409542, + "grad_norm": 12.664523124694824, + "learning_rate": 4.09997048717862e-06, + "loss": 0.543, + "num_input_tokens_seen": 12363968, + "step": 6321 + }, + { + "epoch": 0.8379058979456594, + "grad_norm": 0.22374042868614197, + "learning_rate": 4.099703736743285e-06, + "loss": 0.0015, + "num_input_tokens_seen": 12365408, + "step": 6322 + }, + { + "epoch": 0.8380384360503644, + "grad_norm": 0.6678643822669983, + "learning_rate": 4.099436955464559e-06, + "loss": 0.0142, + "num_input_tokens_seen": 12368328, + "step": 6323 + }, + { + "epoch": 0.8381709741550696, + "grad_norm": 5.0025858879089355, + "learning_rate": 4.099170143347585e-06, + "loss": 0.1092, + "num_input_tokens_seen": 12370520, + "step": 6324 + }, + { + "epoch": 0.8383035122597747, + "grad_norm": 7.469423770904541, + "learning_rate": 4.098903300397508e-06, + "loss": 0.0721, + "num_input_tokens_seen": 12372152, + "step": 6325 + }, + { + "epoch": 0.8384360503644798, + "grad_norm": 19.086153030395508, + "learning_rate": 4.0986364266194714e-06, + "loss": 0.2658, + "num_input_tokens_seen": 12373944, + "step": 6326 + }, + { + "epoch": 0.8385685884691849, + "grad_norm": 8.217903137207031, + "learning_rate": 4.098369522018623e-06, + "loss": 0.0947, + "num_input_tokens_seen": 12376040, + "step": 6327 + }, + { + "epoch": 0.83870112657389, + "grad_norm": 7.327276706695557, + "learning_rate": 4.098102586600106e-06, + "loss": 0.1314, + "num_input_tokens_seen": 12377768, + "step": 6328 + }, + { + "epoch": 0.8388336646785951, + "grad_norm": 6.97507381439209, + "learning_rate": 4.097835620369069e-06, + "loss": 0.1231, + "num_input_tokens_seen": 12379712, + "step": 6329 + }, + { + "epoch": 0.8389662027833003, + "grad_norm": 0.08070863783359528, + "learning_rate": 4.09756862333066e-06, + "loss": 0.0006, + "num_input_tokens_seen": 12381432, + "step": 6330 + }, + { + "epoch": 0.8390987408880053, + "grad_norm": 10.349992752075195, + "learning_rate": 4.097301595490025e-06, + "loss": 0.1958, + "num_input_tokens_seen": 12382848, + "step": 6331 + }, + { + "epoch": 0.8392312789927104, + "grad_norm": 1.6803542375564575, + "learning_rate": 4.097034536852313e-06, + "loss": 0.017, + "num_input_tokens_seen": 12384072, + "step": 6332 + }, + { + "epoch": 0.8393638170974155, + "grad_norm": 8.359085083007812, + "learning_rate": 4.096767447422674e-06, + "loss": 0.2623, + "num_input_tokens_seen": 12386240, + "step": 6333 + }, + { + "epoch": 0.8394963552021206, + "grad_norm": 6.7387213706970215, + "learning_rate": 4.096500327206257e-06, + "loss": 0.0492, + "num_input_tokens_seen": 12388184, + "step": 6334 + }, + { + "epoch": 0.8396288933068257, + "grad_norm": 10.887840270996094, + "learning_rate": 4.0962331762082125e-06, + "loss": 0.3014, + "num_input_tokens_seen": 12390080, + "step": 6335 + }, + { + "epoch": 0.8397614314115308, + "grad_norm": 0.38680610060691833, + "learning_rate": 4.095965994433691e-06, + "loss": 0.0028, + "num_input_tokens_seen": 12391488, + "step": 6336 + }, + { + "epoch": 0.839893969516236, + "grad_norm": 7.741716384887695, + "learning_rate": 4.095698781887844e-06, + "loss": 0.1921, + "num_input_tokens_seen": 12394640, + "step": 6337 + }, + { + "epoch": 0.840026507620941, + "grad_norm": 8.093084335327148, + "learning_rate": 4.095431538575824e-06, + "loss": 0.2131, + "num_input_tokens_seen": 12397216, + "step": 6338 + }, + { + "epoch": 0.8401590457256461, + "grad_norm": 0.13176777958869934, + "learning_rate": 4.095164264502783e-06, + "loss": 0.0009, + "num_input_tokens_seen": 12398896, + "step": 6339 + }, + { + "epoch": 0.8402915838303512, + "grad_norm": 17.86504364013672, + "learning_rate": 4.094896959673874e-06, + "loss": 0.6013, + "num_input_tokens_seen": 12401120, + "step": 6340 + }, + { + "epoch": 0.8404241219350563, + "grad_norm": 6.097297668457031, + "learning_rate": 4.094629624094252e-06, + "loss": 0.2176, + "num_input_tokens_seen": 12402920, + "step": 6341 + }, + { + "epoch": 0.8405566600397615, + "grad_norm": 15.754793167114258, + "learning_rate": 4.094362257769071e-06, + "loss": 0.5255, + "num_input_tokens_seen": 12405088, + "step": 6342 + }, + { + "epoch": 0.8406891981444665, + "grad_norm": 5.362997531890869, + "learning_rate": 4.094094860703485e-06, + "loss": 0.1026, + "num_input_tokens_seen": 12406496, + "step": 6343 + }, + { + "epoch": 0.8408217362491717, + "grad_norm": 7.831585884094238, + "learning_rate": 4.093827432902652e-06, + "loss": 0.1421, + "num_input_tokens_seen": 12408384, + "step": 6344 + }, + { + "epoch": 0.8409542743538767, + "grad_norm": 4.709921360015869, + "learning_rate": 4.093559974371725e-06, + "loss": 0.0376, + "num_input_tokens_seen": 12410120, + "step": 6345 + }, + { + "epoch": 0.8410868124585819, + "grad_norm": 5.881486415863037, + "learning_rate": 4.093292485115863e-06, + "loss": 0.3001, + "num_input_tokens_seen": 12411776, + "step": 6346 + }, + { + "epoch": 0.8412193505632869, + "grad_norm": 10.681548118591309, + "learning_rate": 4.093024965140222e-06, + "loss": 0.1459, + "num_input_tokens_seen": 12413504, + "step": 6347 + }, + { + "epoch": 0.841351888667992, + "grad_norm": 16.900562286376953, + "learning_rate": 4.092757414449961e-06, + "loss": 0.4993, + "num_input_tokens_seen": 12415088, + "step": 6348 + }, + { + "epoch": 0.8414844267726972, + "grad_norm": 3.6904962062835693, + "learning_rate": 4.0924898330502384e-06, + "loss": 0.0745, + "num_input_tokens_seen": 12418152, + "step": 6349 + }, + { + "epoch": 0.8416169648774022, + "grad_norm": 4.910851955413818, + "learning_rate": 4.092222220946214e-06, + "loss": 0.0575, + "num_input_tokens_seen": 12420728, + "step": 6350 + }, + { + "epoch": 0.8417495029821074, + "grad_norm": 9.716824531555176, + "learning_rate": 4.091954578143045e-06, + "loss": 0.4016, + "num_input_tokens_seen": 12423104, + "step": 6351 + }, + { + "epoch": 0.8418820410868124, + "grad_norm": 7.391814708709717, + "learning_rate": 4.091686904645895e-06, + "loss": 0.3463, + "num_input_tokens_seen": 12425408, + "step": 6352 + }, + { + "epoch": 0.8420145791915176, + "grad_norm": 6.142772674560547, + "learning_rate": 4.091419200459923e-06, + "loss": 0.1157, + "num_input_tokens_seen": 12427376, + "step": 6353 + }, + { + "epoch": 0.8421471172962227, + "grad_norm": 4.738673686981201, + "learning_rate": 4.091151465590291e-06, + "loss": 0.0355, + "num_input_tokens_seen": 12429464, + "step": 6354 + }, + { + "epoch": 0.8422796554009278, + "grad_norm": 4.7116875648498535, + "learning_rate": 4.090883700042161e-06, + "loss": 0.091, + "num_input_tokens_seen": 12431000, + "step": 6355 + }, + { + "epoch": 0.8424121935056329, + "grad_norm": 8.040979385375977, + "learning_rate": 4.090615903820695e-06, + "loss": 0.1254, + "num_input_tokens_seen": 12433184, + "step": 6356 + }, + { + "epoch": 0.8425447316103379, + "grad_norm": 6.648922920227051, + "learning_rate": 4.0903480769310586e-06, + "loss": 0.2189, + "num_input_tokens_seen": 12434864, + "step": 6357 + }, + { + "epoch": 0.8426772697150431, + "grad_norm": 6.222167015075684, + "learning_rate": 4.090080219378413e-06, + "loss": 0.1567, + "num_input_tokens_seen": 12437576, + "step": 6358 + }, + { + "epoch": 0.8428098078197481, + "grad_norm": 0.3987060785293579, + "learning_rate": 4.0898123311679244e-06, + "loss": 0.0028, + "num_input_tokens_seen": 12438880, + "step": 6359 + }, + { + "epoch": 0.8429423459244533, + "grad_norm": 22.511449813842773, + "learning_rate": 4.0895444123047575e-06, + "loss": 0.6172, + "num_input_tokens_seen": 12440880, + "step": 6360 + }, + { + "epoch": 0.8430748840291584, + "grad_norm": 0.560621440410614, + "learning_rate": 4.089276462794078e-06, + "loss": 0.004, + "num_input_tokens_seen": 12442280, + "step": 6361 + }, + { + "epoch": 0.8432074221338635, + "grad_norm": 4.715240478515625, + "learning_rate": 4.08900848264105e-06, + "loss": 0.0843, + "num_input_tokens_seen": 12444552, + "step": 6362 + }, + { + "epoch": 0.8433399602385686, + "grad_norm": 0.4060814678668976, + "learning_rate": 4.088740471850844e-06, + "loss": 0.0028, + "num_input_tokens_seen": 12445872, + "step": 6363 + }, + { + "epoch": 0.8434724983432736, + "grad_norm": 10.1954345703125, + "learning_rate": 4.088472430428625e-06, + "loss": 0.2563, + "num_input_tokens_seen": 12447544, + "step": 6364 + }, + { + "epoch": 0.8436050364479788, + "grad_norm": 8.777548789978027, + "learning_rate": 4.088204358379561e-06, + "loss": 0.33, + "num_input_tokens_seen": 12449584, + "step": 6365 + }, + { + "epoch": 0.8437375745526839, + "grad_norm": 0.23675388097763062, + "learning_rate": 4.087936255708823e-06, + "loss": 0.0016, + "num_input_tokens_seen": 12450784, + "step": 6366 + }, + { + "epoch": 0.843870112657389, + "grad_norm": 0.4394167959690094, + "learning_rate": 4.087668122421578e-06, + "loss": 0.003, + "num_input_tokens_seen": 12451864, + "step": 6367 + }, + { + "epoch": 0.8440026507620941, + "grad_norm": 11.21150016784668, + "learning_rate": 4.087399958522996e-06, + "loss": 0.5494, + "num_input_tokens_seen": 12453736, + "step": 6368 + }, + { + "epoch": 0.8441351888667992, + "grad_norm": 5.191708087921143, + "learning_rate": 4.0871317640182475e-06, + "loss": 0.1093, + "num_input_tokens_seen": 12455128, + "step": 6369 + }, + { + "epoch": 0.8442677269715043, + "grad_norm": 1.7173802852630615, + "learning_rate": 4.086863538912505e-06, + "loss": 0.0474, + "num_input_tokens_seen": 12456664, + "step": 6370 + }, + { + "epoch": 0.8444002650762094, + "grad_norm": 10.677254676818848, + "learning_rate": 4.0865952832109365e-06, + "loss": 0.2191, + "num_input_tokens_seen": 12458480, + "step": 6371 + }, + { + "epoch": 0.8445328031809145, + "grad_norm": 3.9512228965759277, + "learning_rate": 4.0863269969187184e-06, + "loss": 0.1005, + "num_input_tokens_seen": 12460112, + "step": 6372 + }, + { + "epoch": 0.8446653412856197, + "grad_norm": 0.15595698356628418, + "learning_rate": 4.086058680041021e-06, + "loss": 0.0011, + "num_input_tokens_seen": 12461208, + "step": 6373 + }, + { + "epoch": 0.8447978793903247, + "grad_norm": 15.256954193115234, + "learning_rate": 4.085790332583017e-06, + "loss": 0.4555, + "num_input_tokens_seen": 12463504, + "step": 6374 + }, + { + "epoch": 0.8449304174950298, + "grad_norm": 2.1838645935058594, + "learning_rate": 4.085521954549883e-06, + "loss": 0.0509, + "num_input_tokens_seen": 12465216, + "step": 6375 + }, + { + "epoch": 0.8450629555997349, + "grad_norm": 6.342325687408447, + "learning_rate": 4.085253545946791e-06, + "loss": 0.1477, + "num_input_tokens_seen": 12467056, + "step": 6376 + }, + { + "epoch": 0.84519549370444, + "grad_norm": 13.239951133728027, + "learning_rate": 4.084985106778916e-06, + "loss": 0.2052, + "num_input_tokens_seen": 12468456, + "step": 6377 + }, + { + "epoch": 0.8453280318091452, + "grad_norm": 0.18500353395938873, + "learning_rate": 4.084716637051437e-06, + "loss": 0.0013, + "num_input_tokens_seen": 12469712, + "step": 6378 + }, + { + "epoch": 0.8454605699138502, + "grad_norm": 11.335220336914062, + "learning_rate": 4.084448136769526e-06, + "loss": 0.4316, + "num_input_tokens_seen": 12472136, + "step": 6379 + }, + { + "epoch": 0.8455931080185554, + "grad_norm": 8.607227325439453, + "learning_rate": 4.084179605938363e-06, + "loss": 0.2958, + "num_input_tokens_seen": 12474144, + "step": 6380 + }, + { + "epoch": 0.8457256461232604, + "grad_norm": 12.615630149841309, + "learning_rate": 4.083911044563124e-06, + "loss": 0.3634, + "num_input_tokens_seen": 12476256, + "step": 6381 + }, + { + "epoch": 0.8458581842279655, + "grad_norm": 12.320110321044922, + "learning_rate": 4.083642452648987e-06, + "loss": 0.5625, + "num_input_tokens_seen": 12478512, + "step": 6382 + }, + { + "epoch": 0.8459907223326707, + "grad_norm": 14.719356536865234, + "learning_rate": 4.083373830201131e-06, + "loss": 0.4323, + "num_input_tokens_seen": 12480024, + "step": 6383 + }, + { + "epoch": 0.8461232604373757, + "grad_norm": 13.87115478515625, + "learning_rate": 4.083105177224737e-06, + "loss": 0.6382, + "num_input_tokens_seen": 12482072, + "step": 6384 + }, + { + "epoch": 0.8462557985420809, + "grad_norm": 9.822617530822754, + "learning_rate": 4.082836493724981e-06, + "loss": 0.1321, + "num_input_tokens_seen": 12484040, + "step": 6385 + }, + { + "epoch": 0.8463883366467859, + "grad_norm": 9.058950424194336, + "learning_rate": 4.082567779707046e-06, + "loss": 0.3669, + "num_input_tokens_seen": 12486720, + "step": 6386 + }, + { + "epoch": 0.8465208747514911, + "grad_norm": 0.40007105469703674, + "learning_rate": 4.082299035176112e-06, + "loss": 0.0026, + "num_input_tokens_seen": 12488408, + "step": 6387 + }, + { + "epoch": 0.8466534128561961, + "grad_norm": 4.104454040527344, + "learning_rate": 4.082030260137363e-06, + "loss": 0.0285, + "num_input_tokens_seen": 12490520, + "step": 6388 + }, + { + "epoch": 0.8467859509609013, + "grad_norm": 0.9374334216117859, + "learning_rate": 4.081761454595977e-06, + "loss": 0.0065, + "num_input_tokens_seen": 12492240, + "step": 6389 + }, + { + "epoch": 0.8469184890656064, + "grad_norm": 5.622342109680176, + "learning_rate": 4.08149261855714e-06, + "loss": 0.0694, + "num_input_tokens_seen": 12493920, + "step": 6390 + }, + { + "epoch": 0.8470510271703114, + "grad_norm": 0.704146683216095, + "learning_rate": 4.081223752026034e-06, + "loss": 0.0048, + "num_input_tokens_seen": 12495496, + "step": 6391 + }, + { + "epoch": 0.8471835652750166, + "grad_norm": 0.24329940974712372, + "learning_rate": 4.080954855007843e-06, + "loss": 0.0017, + "num_input_tokens_seen": 12497168, + "step": 6392 + }, + { + "epoch": 0.8473161033797216, + "grad_norm": 8.780317306518555, + "learning_rate": 4.080685927507752e-06, + "loss": 0.2158, + "num_input_tokens_seen": 12499088, + "step": 6393 + }, + { + "epoch": 0.8474486414844268, + "grad_norm": 11.403813362121582, + "learning_rate": 4.080416969530947e-06, + "loss": 0.2141, + "num_input_tokens_seen": 12501536, + "step": 6394 + }, + { + "epoch": 0.8475811795891319, + "grad_norm": 4.885720252990723, + "learning_rate": 4.0801479810826115e-06, + "loss": 0.1458, + "num_input_tokens_seen": 12502968, + "step": 6395 + }, + { + "epoch": 0.847713717693837, + "grad_norm": 6.20463752746582, + "learning_rate": 4.079878962167932e-06, + "loss": 0.2197, + "num_input_tokens_seen": 12505432, + "step": 6396 + }, + { + "epoch": 0.8478462557985421, + "grad_norm": 10.271278381347656, + "learning_rate": 4.079609912792097e-06, + "loss": 0.1384, + "num_input_tokens_seen": 12507392, + "step": 6397 + }, + { + "epoch": 0.8479787939032472, + "grad_norm": 10.539252281188965, + "learning_rate": 4.079340832960294e-06, + "loss": 0.3962, + "num_input_tokens_seen": 12509520, + "step": 6398 + }, + { + "epoch": 0.8481113320079523, + "grad_norm": 0.10395228862762451, + "learning_rate": 4.0790717226777095e-06, + "loss": 0.0007, + "num_input_tokens_seen": 12511248, + "step": 6399 + }, + { + "epoch": 0.8482438701126573, + "grad_norm": 7.647186756134033, + "learning_rate": 4.078802581949532e-06, + "loss": 0.1773, + "num_input_tokens_seen": 12514072, + "step": 6400 + }, + { + "epoch": 0.8483764082173625, + "grad_norm": 4.427883625030518, + "learning_rate": 4.078533410780952e-06, + "loss": 0.1244, + "num_input_tokens_seen": 12516136, + "step": 6401 + }, + { + "epoch": 0.8485089463220676, + "grad_norm": 1.6196510791778564, + "learning_rate": 4.07826420917716e-06, + "loss": 0.0092, + "num_input_tokens_seen": 12517880, + "step": 6402 + }, + { + "epoch": 0.8486414844267727, + "grad_norm": 13.98119068145752, + "learning_rate": 4.077994977143345e-06, + "loss": 0.3131, + "num_input_tokens_seen": 12519768, + "step": 6403 + }, + { + "epoch": 0.8487740225314778, + "grad_norm": 5.882109642028809, + "learning_rate": 4.077725714684697e-06, + "loss": 0.2947, + "num_input_tokens_seen": 12522096, + "step": 6404 + }, + { + "epoch": 0.8489065606361829, + "grad_norm": 0.4843659996986389, + "learning_rate": 4.07745642180641e-06, + "loss": 0.003, + "num_input_tokens_seen": 12523376, + "step": 6405 + }, + { + "epoch": 0.849039098740888, + "grad_norm": 3.825468063354492, + "learning_rate": 4.077187098513674e-06, + "loss": 0.0677, + "num_input_tokens_seen": 12525264, + "step": 6406 + }, + { + "epoch": 0.8491716368455932, + "grad_norm": 6.060602188110352, + "learning_rate": 4.076917744811684e-06, + "loss": 0.0626, + "num_input_tokens_seen": 12527192, + "step": 6407 + }, + { + "epoch": 0.8493041749502982, + "grad_norm": 3.4359869956970215, + "learning_rate": 4.07664836070563e-06, + "loss": 0.1188, + "num_input_tokens_seen": 12528416, + "step": 6408 + }, + { + "epoch": 0.8494367130550033, + "grad_norm": 7.520660877227783, + "learning_rate": 4.07637894620071e-06, + "loss": 0.2944, + "num_input_tokens_seen": 12530440, + "step": 6409 + }, + { + "epoch": 0.8495692511597084, + "grad_norm": 8.104143142700195, + "learning_rate": 4.076109501302115e-06, + "loss": 0.1677, + "num_input_tokens_seen": 12532776, + "step": 6410 + }, + { + "epoch": 0.8497017892644135, + "grad_norm": 10.299175262451172, + "learning_rate": 4.075840026015042e-06, + "loss": 0.317, + "num_input_tokens_seen": 12534784, + "step": 6411 + }, + { + "epoch": 0.8498343273691186, + "grad_norm": 0.05123569816350937, + "learning_rate": 4.075570520344686e-06, + "loss": 0.0003, + "num_input_tokens_seen": 12536840, + "step": 6412 + }, + { + "epoch": 0.8499668654738237, + "grad_norm": 0.03352626413106918, + "learning_rate": 4.075300984296244e-06, + "loss": 0.0002, + "num_input_tokens_seen": 12538624, + "step": 6413 + }, + { + "epoch": 0.8500994035785289, + "grad_norm": 6.965059757232666, + "learning_rate": 4.075031417874912e-06, + "loss": 0.189, + "num_input_tokens_seen": 12540312, + "step": 6414 + }, + { + "epoch": 0.8502319416832339, + "grad_norm": 0.02496710605919361, + "learning_rate": 4.074761821085887e-06, + "loss": 0.0002, + "num_input_tokens_seen": 12541680, + "step": 6415 + }, + { + "epoch": 0.850364479787939, + "grad_norm": 0.04955129697918892, + "learning_rate": 4.0744921939343685e-06, + "loss": 0.0003, + "num_input_tokens_seen": 12543032, + "step": 6416 + }, + { + "epoch": 0.8504970178926441, + "grad_norm": 0.01132159773260355, + "learning_rate": 4.074222536425555e-06, + "loss": 0.0001, + "num_input_tokens_seen": 12544128, + "step": 6417 + }, + { + "epoch": 0.8506295559973492, + "grad_norm": 0.05694584921002388, + "learning_rate": 4.0739528485646435e-06, + "loss": 0.0004, + "num_input_tokens_seen": 12545312, + "step": 6418 + }, + { + "epoch": 0.8507620941020544, + "grad_norm": 0.04980907216668129, + "learning_rate": 4.0736831303568356e-06, + "loss": 0.0003, + "num_input_tokens_seen": 12547192, + "step": 6419 + }, + { + "epoch": 0.8508946322067594, + "grad_norm": 7.295020580291748, + "learning_rate": 4.073413381807332e-06, + "loss": 0.0695, + "num_input_tokens_seen": 12548720, + "step": 6420 + }, + { + "epoch": 0.8510271703114646, + "grad_norm": 0.0273525919765234, + "learning_rate": 4.0731436029213326e-06, + "loss": 0.0002, + "num_input_tokens_seen": 12551576, + "step": 6421 + }, + { + "epoch": 0.8511597084161696, + "grad_norm": 0.09279648214578629, + "learning_rate": 4.07287379370404e-06, + "loss": 0.0007, + "num_input_tokens_seen": 12552752, + "step": 6422 + }, + { + "epoch": 0.8512922465208748, + "grad_norm": 0.04769066348671913, + "learning_rate": 4.0726039541606545e-06, + "loss": 0.0003, + "num_input_tokens_seen": 12554496, + "step": 6423 + }, + { + "epoch": 0.8514247846255798, + "grad_norm": 11.368703842163086, + "learning_rate": 4.07233408429638e-06, + "loss": 0.2633, + "num_input_tokens_seen": 12556616, + "step": 6424 + }, + { + "epoch": 0.851557322730285, + "grad_norm": 0.0345623642206192, + "learning_rate": 4.07206418411642e-06, + "loss": 0.0002, + "num_input_tokens_seen": 12558336, + "step": 6425 + }, + { + "epoch": 0.8516898608349901, + "grad_norm": 0.046168696135282516, + "learning_rate": 4.071794253625978e-06, + "loss": 0.0003, + "num_input_tokens_seen": 12559424, + "step": 6426 + }, + { + "epoch": 0.8518223989396951, + "grad_norm": 2.711806535720825, + "learning_rate": 4.071524292830259e-06, + "loss": 0.0732, + "num_input_tokens_seen": 12561656, + "step": 6427 + }, + { + "epoch": 0.8519549370444003, + "grad_norm": 2.3694021701812744, + "learning_rate": 4.071254301734466e-06, + "loss": 0.1042, + "num_input_tokens_seen": 12563560, + "step": 6428 + }, + { + "epoch": 0.8520874751491053, + "grad_norm": 17.226137161254883, + "learning_rate": 4.0709842803438075e-06, + "loss": 0.4848, + "num_input_tokens_seen": 12565064, + "step": 6429 + }, + { + "epoch": 0.8522200132538105, + "grad_norm": 15.364121437072754, + "learning_rate": 4.070714228663488e-06, + "loss": 0.6741, + "num_input_tokens_seen": 12567624, + "step": 6430 + }, + { + "epoch": 0.8523525513585156, + "grad_norm": 6.624318599700928, + "learning_rate": 4.070444146698713e-06, + "loss": 0.1217, + "num_input_tokens_seen": 12570056, + "step": 6431 + }, + { + "epoch": 0.8524850894632207, + "grad_norm": 0.11983751505613327, + "learning_rate": 4.070174034454693e-06, + "loss": 0.0008, + "num_input_tokens_seen": 12572816, + "step": 6432 + }, + { + "epoch": 0.8526176275679258, + "grad_norm": 0.2514071762561798, + "learning_rate": 4.069903891936635e-06, + "loss": 0.0018, + "num_input_tokens_seen": 12574296, + "step": 6433 + }, + { + "epoch": 0.8527501656726308, + "grad_norm": 11.703482627868652, + "learning_rate": 4.069633719149746e-06, + "loss": 0.395, + "num_input_tokens_seen": 12576384, + "step": 6434 + }, + { + "epoch": 0.852882703777336, + "grad_norm": 10.800896644592285, + "learning_rate": 4.0693635160992364e-06, + "loss": 0.2239, + "num_input_tokens_seen": 12579680, + "step": 6435 + }, + { + "epoch": 0.8530152418820411, + "grad_norm": 7.269272327423096, + "learning_rate": 4.069093282790315e-06, + "loss": 0.1074, + "num_input_tokens_seen": 12582664, + "step": 6436 + }, + { + "epoch": 0.8531477799867462, + "grad_norm": 2.0887575149536133, + "learning_rate": 4.068823019228193e-06, + "loss": 0.0044, + "num_input_tokens_seen": 12584632, + "step": 6437 + }, + { + "epoch": 0.8532803180914513, + "grad_norm": 7.457073211669922, + "learning_rate": 4.068552725418081e-06, + "loss": 0.1785, + "num_input_tokens_seen": 12587704, + "step": 6438 + }, + { + "epoch": 0.8534128561961564, + "grad_norm": 0.1505986452102661, + "learning_rate": 4.06828240136519e-06, + "loss": 0.001, + "num_input_tokens_seen": 12589728, + "step": 6439 + }, + { + "epoch": 0.8535453943008615, + "grad_norm": 0.6749204993247986, + "learning_rate": 4.068012047074734e-06, + "loss": 0.0045, + "num_input_tokens_seen": 12591256, + "step": 6440 + }, + { + "epoch": 0.8536779324055666, + "grad_norm": 14.419768333435059, + "learning_rate": 4.067741662551922e-06, + "loss": 0.5766, + "num_input_tokens_seen": 12593728, + "step": 6441 + }, + { + "epoch": 0.8538104705102717, + "grad_norm": 0.5851397514343262, + "learning_rate": 4.067471247801971e-06, + "loss": 0.0033, + "num_input_tokens_seen": 12595872, + "step": 6442 + }, + { + "epoch": 0.8539430086149769, + "grad_norm": 0.1268688440322876, + "learning_rate": 4.067200802830091e-06, + "loss": 0.0008, + "num_input_tokens_seen": 12598408, + "step": 6443 + }, + { + "epoch": 0.8540755467196819, + "grad_norm": 10.444578170776367, + "learning_rate": 4.066930327641499e-06, + "loss": 0.3122, + "num_input_tokens_seen": 12600592, + "step": 6444 + }, + { + "epoch": 0.854208084824387, + "grad_norm": 8.73383903503418, + "learning_rate": 4.06665982224141e-06, + "loss": 0.1723, + "num_input_tokens_seen": 12602832, + "step": 6445 + }, + { + "epoch": 0.8543406229290921, + "grad_norm": 6.050260066986084, + "learning_rate": 4.0663892866350384e-06, + "loss": 0.1812, + "num_input_tokens_seen": 12605264, + "step": 6446 + }, + { + "epoch": 0.8544731610337972, + "grad_norm": 11.662336349487305, + "learning_rate": 4.066118720827601e-06, + "loss": 0.4064, + "num_input_tokens_seen": 12607264, + "step": 6447 + }, + { + "epoch": 0.8546056991385024, + "grad_norm": 12.227848052978516, + "learning_rate": 4.0658481248243144e-06, + "loss": 0.1733, + "num_input_tokens_seen": 12609184, + "step": 6448 + }, + { + "epoch": 0.8547382372432074, + "grad_norm": 13.252177238464355, + "learning_rate": 4.065577498630395e-06, + "loss": 0.4484, + "num_input_tokens_seen": 12611472, + "step": 6449 + }, + { + "epoch": 0.8548707753479126, + "grad_norm": 4.079404830932617, + "learning_rate": 4.065306842251062e-06, + "loss": 0.1024, + "num_input_tokens_seen": 12613360, + "step": 6450 + }, + { + "epoch": 0.8550033134526176, + "grad_norm": 8.79055404663086, + "learning_rate": 4.065036155691533e-06, + "loss": 0.1455, + "num_input_tokens_seen": 12616840, + "step": 6451 + }, + { + "epoch": 0.8551358515573227, + "grad_norm": 7.854814052581787, + "learning_rate": 4.064765438957028e-06, + "loss": 0.279, + "num_input_tokens_seen": 12618656, + "step": 6452 + }, + { + "epoch": 0.8552683896620278, + "grad_norm": 5.607257843017578, + "learning_rate": 4.064494692052765e-06, + "loss": 0.0986, + "num_input_tokens_seen": 12620408, + "step": 6453 + }, + { + "epoch": 0.8554009277667329, + "grad_norm": 8.602025032043457, + "learning_rate": 4.064223914983964e-06, + "loss": 0.2128, + "num_input_tokens_seen": 12622432, + "step": 6454 + }, + { + "epoch": 0.8555334658714381, + "grad_norm": 4.314160346984863, + "learning_rate": 4.063953107755849e-06, + "loss": 0.1272, + "num_input_tokens_seen": 12625280, + "step": 6455 + }, + { + "epoch": 0.8556660039761431, + "grad_norm": 9.911271095275879, + "learning_rate": 4.063682270373638e-06, + "loss": 0.2331, + "num_input_tokens_seen": 12626776, + "step": 6456 + }, + { + "epoch": 0.8557985420808483, + "grad_norm": 0.09800702333450317, + "learning_rate": 4.063411402842554e-06, + "loss": 0.0007, + "num_input_tokens_seen": 12628328, + "step": 6457 + }, + { + "epoch": 0.8559310801855533, + "grad_norm": 4.810177326202393, + "learning_rate": 4.063140505167821e-06, + "loss": 0.0376, + "num_input_tokens_seen": 12629976, + "step": 6458 + }, + { + "epoch": 0.8560636182902585, + "grad_norm": 0.2726171910762787, + "learning_rate": 4.062869577354659e-06, + "loss": 0.002, + "num_input_tokens_seen": 12631536, + "step": 6459 + }, + { + "epoch": 0.8561961563949636, + "grad_norm": 4.750391483306885, + "learning_rate": 4.0625986194082945e-06, + "loss": 0.1273, + "num_input_tokens_seen": 12634160, + "step": 6460 + }, + { + "epoch": 0.8563286944996686, + "grad_norm": 7.9185099601745605, + "learning_rate": 4.0623276313339495e-06, + "loss": 0.0541, + "num_input_tokens_seen": 12636464, + "step": 6461 + }, + { + "epoch": 0.8564612326043738, + "grad_norm": 9.986881256103516, + "learning_rate": 4.0620566131368505e-06, + "loss": 0.1961, + "num_input_tokens_seen": 12638560, + "step": 6462 + }, + { + "epoch": 0.8565937707090788, + "grad_norm": 1.0187216997146606, + "learning_rate": 4.061785564822223e-06, + "loss": 0.0069, + "num_input_tokens_seen": 12640128, + "step": 6463 + }, + { + "epoch": 0.856726308813784, + "grad_norm": 0.28980928659439087, + "learning_rate": 4.061514486395292e-06, + "loss": 0.0018, + "num_input_tokens_seen": 12641176, + "step": 6464 + }, + { + "epoch": 0.856858846918489, + "grad_norm": 8.278011322021484, + "learning_rate": 4.061243377861284e-06, + "loss": 0.2433, + "num_input_tokens_seen": 12643224, + "step": 6465 + }, + { + "epoch": 0.8569913850231942, + "grad_norm": 7.07049036026001, + "learning_rate": 4.060972239225427e-06, + "loss": 0.23, + "num_input_tokens_seen": 12644888, + "step": 6466 + }, + { + "epoch": 0.8571239231278993, + "grad_norm": 8.95526123046875, + "learning_rate": 4.060701070492948e-06, + "loss": 0.1723, + "num_input_tokens_seen": 12647176, + "step": 6467 + }, + { + "epoch": 0.8572564612326043, + "grad_norm": 0.29301607608795166, + "learning_rate": 4.060429871669076e-06, + "loss": 0.0022, + "num_input_tokens_seen": 12648496, + "step": 6468 + }, + { + "epoch": 0.8573889993373095, + "grad_norm": 3.719393253326416, + "learning_rate": 4.06015864275904e-06, + "loss": 0.0853, + "num_input_tokens_seen": 12649920, + "step": 6469 + }, + { + "epoch": 0.8575215374420145, + "grad_norm": 4.546351432800293, + "learning_rate": 4.059887383768069e-06, + "loss": 0.0337, + "num_input_tokens_seen": 12652496, + "step": 6470 + }, + { + "epoch": 0.8576540755467197, + "grad_norm": 1.1764014959335327, + "learning_rate": 4.059616094701393e-06, + "loss": 0.0085, + "num_input_tokens_seen": 12654904, + "step": 6471 + }, + { + "epoch": 0.8577866136514248, + "grad_norm": 4.649127960205078, + "learning_rate": 4.059344775564242e-06, + "loss": 0.0318, + "num_input_tokens_seen": 12656872, + "step": 6472 + }, + { + "epoch": 0.8579191517561299, + "grad_norm": 0.17177623510360718, + "learning_rate": 4.059073426361849e-06, + "loss": 0.0012, + "num_input_tokens_seen": 12658072, + "step": 6473 + }, + { + "epoch": 0.858051689860835, + "grad_norm": 9.098956108093262, + "learning_rate": 4.0588020470994445e-06, + "loss": 0.232, + "num_input_tokens_seen": 12659592, + "step": 6474 + }, + { + "epoch": 0.8581842279655401, + "grad_norm": 7.675487518310547, + "learning_rate": 4.058530637782261e-06, + "loss": 0.2912, + "num_input_tokens_seen": 12661704, + "step": 6475 + }, + { + "epoch": 0.8583167660702452, + "grad_norm": 2.139901876449585, + "learning_rate": 4.058259198415532e-06, + "loss": 0.0469, + "num_input_tokens_seen": 12663120, + "step": 6476 + }, + { + "epoch": 0.8584493041749502, + "grad_norm": 9.362669944763184, + "learning_rate": 4.05798772900449e-06, + "loss": 0.3866, + "num_input_tokens_seen": 12665144, + "step": 6477 + }, + { + "epoch": 0.8585818422796554, + "grad_norm": 3.7443442344665527, + "learning_rate": 4.05771622955437e-06, + "loss": 0.1089, + "num_input_tokens_seen": 12667128, + "step": 6478 + }, + { + "epoch": 0.8587143803843605, + "grad_norm": 4.867796897888184, + "learning_rate": 4.0574447000704065e-06, + "loss": 0.1172, + "num_input_tokens_seen": 12669664, + "step": 6479 + }, + { + "epoch": 0.8588469184890656, + "grad_norm": 0.2032112181186676, + "learning_rate": 4.057173140557835e-06, + "loss": 0.0015, + "num_input_tokens_seen": 12672048, + "step": 6480 + }, + { + "epoch": 0.8589794565937707, + "grad_norm": 5.230216979980469, + "learning_rate": 4.056901551021891e-06, + "loss": 0.1137, + "num_input_tokens_seen": 12673984, + "step": 6481 + }, + { + "epoch": 0.8591119946984758, + "grad_norm": 0.036812663078308105, + "learning_rate": 4.056629931467811e-06, + "loss": 0.0003, + "num_input_tokens_seen": 12675304, + "step": 6482 + }, + { + "epoch": 0.8592445328031809, + "grad_norm": 1.739944338798523, + "learning_rate": 4.056358281900832e-06, + "loss": 0.0721, + "num_input_tokens_seen": 12677376, + "step": 6483 + }, + { + "epoch": 0.8593770709078861, + "grad_norm": 0.04181818291544914, + "learning_rate": 4.056086602326191e-06, + "loss": 0.0003, + "num_input_tokens_seen": 12678800, + "step": 6484 + }, + { + "epoch": 0.8595096090125911, + "grad_norm": 12.245060920715332, + "learning_rate": 4.055814892749128e-06, + "loss": 0.3995, + "num_input_tokens_seen": 12681800, + "step": 6485 + }, + { + "epoch": 0.8596421471172963, + "grad_norm": 0.06919483840465546, + "learning_rate": 4.05554315317488e-06, + "loss": 0.0005, + "num_input_tokens_seen": 12683048, + "step": 6486 + }, + { + "epoch": 0.8597746852220013, + "grad_norm": 17.151813507080078, + "learning_rate": 4.055271383608688e-06, + "loss": 0.5882, + "num_input_tokens_seen": 12684584, + "step": 6487 + }, + { + "epoch": 0.8599072233267064, + "grad_norm": 10.6739501953125, + "learning_rate": 4.0549995840557885e-06, + "loss": 0.1714, + "num_input_tokens_seen": 12686656, + "step": 6488 + }, + { + "epoch": 0.8600397614314115, + "grad_norm": 0.037264417856931686, + "learning_rate": 4.054727754521427e-06, + "loss": 0.0003, + "num_input_tokens_seen": 12687720, + "step": 6489 + }, + { + "epoch": 0.8601722995361166, + "grad_norm": 0.139362633228302, + "learning_rate": 4.05445589501084e-06, + "loss": 0.001, + "num_input_tokens_seen": 12689232, + "step": 6490 + }, + { + "epoch": 0.8603048376408218, + "grad_norm": 0.1307397335767746, + "learning_rate": 4.0541840055292715e-06, + "loss": 0.0009, + "num_input_tokens_seen": 12690880, + "step": 6491 + }, + { + "epoch": 0.8604373757455268, + "grad_norm": 0.31428685784339905, + "learning_rate": 4.0539120860819626e-06, + "loss": 0.0019, + "num_input_tokens_seen": 12693840, + "step": 6492 + }, + { + "epoch": 0.860569913850232, + "grad_norm": 13.333715438842773, + "learning_rate": 4.053640136674157e-06, + "loss": 0.3747, + "num_input_tokens_seen": 12696232, + "step": 6493 + }, + { + "epoch": 0.860702451954937, + "grad_norm": 4.701325416564941, + "learning_rate": 4.053368157311099e-06, + "loss": 0.1121, + "num_input_tokens_seen": 12697472, + "step": 6494 + }, + { + "epoch": 0.8608349900596421, + "grad_norm": 0.10589364171028137, + "learning_rate": 4.05309614799803e-06, + "loss": 0.0008, + "num_input_tokens_seen": 12699344, + "step": 6495 + }, + { + "epoch": 0.8609675281643473, + "grad_norm": 2.107407808303833, + "learning_rate": 4.052824108740195e-06, + "loss": 0.0385, + "num_input_tokens_seen": 12702536, + "step": 6496 + }, + { + "epoch": 0.8611000662690523, + "grad_norm": 4.547866344451904, + "learning_rate": 4.0525520395428415e-06, + "loss": 0.076, + "num_input_tokens_seen": 12704424, + "step": 6497 + }, + { + "epoch": 0.8612326043737575, + "grad_norm": 0.3751016855239868, + "learning_rate": 4.052279940411213e-06, + "loss": 0.0026, + "num_input_tokens_seen": 12706280, + "step": 6498 + }, + { + "epoch": 0.8613651424784625, + "grad_norm": 0.07610476762056351, + "learning_rate": 4.052007811350556e-06, + "loss": 0.0005, + "num_input_tokens_seen": 12708240, + "step": 6499 + }, + { + "epoch": 0.8614976805831677, + "grad_norm": 5.920442581176758, + "learning_rate": 4.05173565236612e-06, + "loss": 0.099, + "num_input_tokens_seen": 12710712, + "step": 6500 + }, + { + "epoch": 0.8616302186878728, + "grad_norm": 9.506125450134277, + "learning_rate": 4.051463463463148e-06, + "loss": 0.4154, + "num_input_tokens_seen": 12712440, + "step": 6501 + }, + { + "epoch": 0.8617627567925779, + "grad_norm": 9.298277854919434, + "learning_rate": 4.05119124464689e-06, + "loss": 0.2139, + "num_input_tokens_seen": 12714832, + "step": 6502 + }, + { + "epoch": 0.861895294897283, + "grad_norm": 6.611032485961914, + "learning_rate": 4.050918995922595e-06, + "loss": 0.2162, + "num_input_tokens_seen": 12718096, + "step": 6503 + }, + { + "epoch": 0.862027833001988, + "grad_norm": 3.853005886077881, + "learning_rate": 4.050646717295513e-06, + "loss": 0.0573, + "num_input_tokens_seen": 12719696, + "step": 6504 + }, + { + "epoch": 0.8621603711066932, + "grad_norm": 3.595113515853882, + "learning_rate": 4.050374408770891e-06, + "loss": 0.0939, + "num_input_tokens_seen": 12721144, + "step": 6505 + }, + { + "epoch": 0.8622929092113982, + "grad_norm": 7.952568531036377, + "learning_rate": 4.0501020703539815e-06, + "loss": 0.2061, + "num_input_tokens_seen": 12723568, + "step": 6506 + }, + { + "epoch": 0.8624254473161034, + "grad_norm": 7.826427936553955, + "learning_rate": 4.049829702050035e-06, + "loss": 0.1264, + "num_input_tokens_seen": 12725624, + "step": 6507 + }, + { + "epoch": 0.8625579854208085, + "grad_norm": 0.2814560830593109, + "learning_rate": 4.049557303864302e-06, + "loss": 0.002, + "num_input_tokens_seen": 12727352, + "step": 6508 + }, + { + "epoch": 0.8626905235255136, + "grad_norm": 11.72397518157959, + "learning_rate": 4.049284875802035e-06, + "loss": 0.3034, + "num_input_tokens_seen": 12729104, + "step": 6509 + }, + { + "epoch": 0.8628230616302187, + "grad_norm": 0.7104626893997192, + "learning_rate": 4.0490124178684884e-06, + "loss": 0.0044, + "num_input_tokens_seen": 12730872, + "step": 6510 + }, + { + "epoch": 0.8629555997349238, + "grad_norm": 4.97479248046875, + "learning_rate": 4.0487399300689126e-06, + "loss": 0.2197, + "num_input_tokens_seen": 12732256, + "step": 6511 + }, + { + "epoch": 0.8630881378396289, + "grad_norm": 15.474020957946777, + "learning_rate": 4.048467412408562e-06, + "loss": 0.9528, + "num_input_tokens_seen": 12735112, + "step": 6512 + }, + { + "epoch": 0.863220675944334, + "grad_norm": 26.950557708740234, + "learning_rate": 4.048194864892693e-06, + "loss": 0.5948, + "num_input_tokens_seen": 12737416, + "step": 6513 + }, + { + "epoch": 0.8633532140490391, + "grad_norm": 8.233342170715332, + "learning_rate": 4.047922287526558e-06, + "loss": 0.1663, + "num_input_tokens_seen": 12739520, + "step": 6514 + }, + { + "epoch": 0.8634857521537442, + "grad_norm": 13.284516334533691, + "learning_rate": 4.047649680315413e-06, + "loss": 0.4027, + "num_input_tokens_seen": 12741696, + "step": 6515 + }, + { + "epoch": 0.8636182902584493, + "grad_norm": 10.333155632019043, + "learning_rate": 4.047377043264515e-06, + "loss": 0.3278, + "num_input_tokens_seen": 12743552, + "step": 6516 + }, + { + "epoch": 0.8637508283631544, + "grad_norm": 4.250310897827148, + "learning_rate": 4.0471043763791205e-06, + "loss": 0.1546, + "num_input_tokens_seen": 12745320, + "step": 6517 + }, + { + "epoch": 0.8638833664678595, + "grad_norm": 4.928708553314209, + "learning_rate": 4.046831679664486e-06, + "loss": 0.1713, + "num_input_tokens_seen": 12747840, + "step": 6518 + }, + { + "epoch": 0.8640159045725646, + "grad_norm": 0.2027350515127182, + "learning_rate": 4.04655895312587e-06, + "loss": 0.0015, + "num_input_tokens_seen": 12749424, + "step": 6519 + }, + { + "epoch": 0.8641484426772698, + "grad_norm": 0.3864472806453705, + "learning_rate": 4.04628619676853e-06, + "loss": 0.0028, + "num_input_tokens_seen": 12750680, + "step": 6520 + }, + { + "epoch": 0.8642809807819748, + "grad_norm": 10.349750518798828, + "learning_rate": 4.046013410597726e-06, + "loss": 0.2658, + "num_input_tokens_seen": 12752376, + "step": 6521 + }, + { + "epoch": 0.8644135188866799, + "grad_norm": 0.39593321084976196, + "learning_rate": 4.045740594618716e-06, + "loss": 0.0029, + "num_input_tokens_seen": 12754912, + "step": 6522 + }, + { + "epoch": 0.864546056991385, + "grad_norm": 10.701123237609863, + "learning_rate": 4.045467748836761e-06, + "loss": 0.4084, + "num_input_tokens_seen": 12757840, + "step": 6523 + }, + { + "epoch": 0.8646785950960901, + "grad_norm": 3.4486422538757324, + "learning_rate": 4.045194873257121e-06, + "loss": 0.1268, + "num_input_tokens_seen": 12759944, + "step": 6524 + }, + { + "epoch": 0.8648111332007953, + "grad_norm": 11.5785493850708, + "learning_rate": 4.04492196788506e-06, + "loss": 0.5669, + "num_input_tokens_seen": 12761624, + "step": 6525 + }, + { + "epoch": 0.8649436713055003, + "grad_norm": 10.358160972595215, + "learning_rate": 4.044649032725836e-06, + "loss": 0.2981, + "num_input_tokens_seen": 12764416, + "step": 6526 + }, + { + "epoch": 0.8650762094102055, + "grad_norm": 12.329428672790527, + "learning_rate": 4.044376067784714e-06, + "loss": 0.0831, + "num_input_tokens_seen": 12766080, + "step": 6527 + }, + { + "epoch": 0.8652087475149105, + "grad_norm": 16.004451751708984, + "learning_rate": 4.044103073066955e-06, + "loss": 0.3266, + "num_input_tokens_seen": 12768784, + "step": 6528 + }, + { + "epoch": 0.8653412856196157, + "grad_norm": 4.590848445892334, + "learning_rate": 4.043830048577825e-06, + "loss": 0.1047, + "num_input_tokens_seen": 12770656, + "step": 6529 + }, + { + "epoch": 0.8654738237243207, + "grad_norm": 8.477762222290039, + "learning_rate": 4.043556994322585e-06, + "loss": 0.2034, + "num_input_tokens_seen": 12772920, + "step": 6530 + }, + { + "epoch": 0.8656063618290258, + "grad_norm": 24.9257755279541, + "learning_rate": 4.0432839103065024e-06, + "loss": 0.515, + "num_input_tokens_seen": 12774904, + "step": 6531 + }, + { + "epoch": 0.865738899933731, + "grad_norm": 14.30213451385498, + "learning_rate": 4.043010796534841e-06, + "loss": 0.3784, + "num_input_tokens_seen": 12776832, + "step": 6532 + }, + { + "epoch": 0.865871438038436, + "grad_norm": 0.14427445828914642, + "learning_rate": 4.042737653012867e-06, + "loss": 0.001, + "num_input_tokens_seen": 12778128, + "step": 6533 + }, + { + "epoch": 0.8660039761431412, + "grad_norm": 0.7226604223251343, + "learning_rate": 4.042464479745847e-06, + "loss": 0.005, + "num_input_tokens_seen": 12780848, + "step": 6534 + }, + { + "epoch": 0.8661365142478462, + "grad_norm": 0.15553000569343567, + "learning_rate": 4.042191276739047e-06, + "loss": 0.0011, + "num_input_tokens_seen": 12783256, + "step": 6535 + }, + { + "epoch": 0.8662690523525514, + "grad_norm": 7.058185577392578, + "learning_rate": 4.041918043997736e-06, + "loss": 0.2448, + "num_input_tokens_seen": 12785360, + "step": 6536 + }, + { + "epoch": 0.8664015904572565, + "grad_norm": 13.322836875915527, + "learning_rate": 4.041644781527181e-06, + "loss": 0.2507, + "num_input_tokens_seen": 12786912, + "step": 6537 + }, + { + "epoch": 0.8665341285619615, + "grad_norm": 0.24811577796936035, + "learning_rate": 4.041371489332652e-06, + "loss": 0.0017, + "num_input_tokens_seen": 12788608, + "step": 6538 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 13.515686988830566, + "learning_rate": 4.041098167419416e-06, + "loss": 0.3728, + "num_input_tokens_seen": 12790664, + "step": 6539 + }, + { + "epoch": 0.8667992047713717, + "grad_norm": 6.873810768127441, + "learning_rate": 4.040824815792744e-06, + "loss": 0.2274, + "num_input_tokens_seen": 12792784, + "step": 6540 + }, + { + "epoch": 0.8669317428760769, + "grad_norm": 0.2750130891799927, + "learning_rate": 4.040551434457907e-06, + "loss": 0.002, + "num_input_tokens_seen": 12795008, + "step": 6541 + }, + { + "epoch": 0.8670642809807819, + "grad_norm": 9.263015747070312, + "learning_rate": 4.040278023420176e-06, + "loss": 0.2562, + "num_input_tokens_seen": 12796984, + "step": 6542 + }, + { + "epoch": 0.8671968190854871, + "grad_norm": 7.518599510192871, + "learning_rate": 4.040004582684823e-06, + "loss": 0.1288, + "num_input_tokens_seen": 12799688, + "step": 6543 + }, + { + "epoch": 0.8673293571901922, + "grad_norm": 10.655508995056152, + "learning_rate": 4.039731112257118e-06, + "loss": 0.3279, + "num_input_tokens_seen": 12801584, + "step": 6544 + }, + { + "epoch": 0.8674618952948973, + "grad_norm": 0.12241854518651962, + "learning_rate": 4.0394576121423354e-06, + "loss": 0.0009, + "num_input_tokens_seen": 12803864, + "step": 6545 + }, + { + "epoch": 0.8675944333996024, + "grad_norm": 0.6830618381500244, + "learning_rate": 4.039184082345748e-06, + "loss": 0.004, + "num_input_tokens_seen": 12805392, + "step": 6546 + }, + { + "epoch": 0.8677269715043074, + "grad_norm": 12.72492790222168, + "learning_rate": 4.03891052287263e-06, + "loss": 0.5019, + "num_input_tokens_seen": 12806848, + "step": 6547 + }, + { + "epoch": 0.8678595096090126, + "grad_norm": 4.830325126647949, + "learning_rate": 4.038636933728255e-06, + "loss": 0.1417, + "num_input_tokens_seen": 12809000, + "step": 6548 + }, + { + "epoch": 0.8679920477137177, + "grad_norm": 12.862296104431152, + "learning_rate": 4.038363314917898e-06, + "loss": 0.2919, + "num_input_tokens_seen": 12811560, + "step": 6549 + }, + { + "epoch": 0.8681245858184228, + "grad_norm": 8.4625825881958, + "learning_rate": 4.038089666446835e-06, + "loss": 0.2311, + "num_input_tokens_seen": 12813648, + "step": 6550 + }, + { + "epoch": 0.8682571239231279, + "grad_norm": 8.512572288513184, + "learning_rate": 4.037815988320343e-06, + "loss": 0.2296, + "num_input_tokens_seen": 12815544, + "step": 6551 + }, + { + "epoch": 0.868389662027833, + "grad_norm": 0.9187560677528381, + "learning_rate": 4.0375422805436985e-06, + "loss": 0.0063, + "num_input_tokens_seen": 12817136, + "step": 6552 + }, + { + "epoch": 0.8685222001325381, + "grad_norm": 0.10755084455013275, + "learning_rate": 4.037268543122176e-06, + "loss": 0.0008, + "num_input_tokens_seen": 12820040, + "step": 6553 + }, + { + "epoch": 0.8686547382372433, + "grad_norm": 9.333830833435059, + "learning_rate": 4.036994776061058e-06, + "loss": 0.4326, + "num_input_tokens_seen": 12822800, + "step": 6554 + }, + { + "epoch": 0.8687872763419483, + "grad_norm": 6.232413291931152, + "learning_rate": 4.036720979365618e-06, + "loss": 0.1827, + "num_input_tokens_seen": 12824552, + "step": 6555 + }, + { + "epoch": 0.8689198144466534, + "grad_norm": 2.635136127471924, + "learning_rate": 4.036447153041139e-06, + "loss": 0.0235, + "num_input_tokens_seen": 12825960, + "step": 6556 + }, + { + "epoch": 0.8690523525513585, + "grad_norm": 0.15553709864616394, + "learning_rate": 4.036173297092898e-06, + "loss": 0.0011, + "num_input_tokens_seen": 12827768, + "step": 6557 + }, + { + "epoch": 0.8691848906560636, + "grad_norm": 0.10950968414545059, + "learning_rate": 4.035899411526176e-06, + "loss": 0.0008, + "num_input_tokens_seen": 12829232, + "step": 6558 + }, + { + "epoch": 0.8693174287607687, + "grad_norm": 12.491772651672363, + "learning_rate": 4.0356254963462545e-06, + "loss": 0.1866, + "num_input_tokens_seen": 12831400, + "step": 6559 + }, + { + "epoch": 0.8694499668654738, + "grad_norm": 0.16624446213245392, + "learning_rate": 4.035351551558414e-06, + "loss": 0.0012, + "num_input_tokens_seen": 12833024, + "step": 6560 + }, + { + "epoch": 0.869582504970179, + "grad_norm": 4.245083808898926, + "learning_rate": 4.035077577167936e-06, + "loss": 0.0732, + "num_input_tokens_seen": 12834984, + "step": 6561 + }, + { + "epoch": 0.869715043074884, + "grad_norm": 13.102285385131836, + "learning_rate": 4.034803573180104e-06, + "loss": 0.373, + "num_input_tokens_seen": 12836352, + "step": 6562 + }, + { + "epoch": 0.8698475811795892, + "grad_norm": 7.7806291580200195, + "learning_rate": 4.0345295396002e-06, + "loss": 0.1864, + "num_input_tokens_seen": 12837960, + "step": 6563 + }, + { + "epoch": 0.8699801192842942, + "grad_norm": 13.598511695861816, + "learning_rate": 4.0342554764335076e-06, + "loss": 0.2218, + "num_input_tokens_seen": 12840320, + "step": 6564 + }, + { + "epoch": 0.8701126573889993, + "grad_norm": 0.07740826904773712, + "learning_rate": 4.033981383685311e-06, + "loss": 0.0006, + "num_input_tokens_seen": 12841784, + "step": 6565 + }, + { + "epoch": 0.8702451954937045, + "grad_norm": 5.360774517059326, + "learning_rate": 4.033707261360895e-06, + "loss": 0.2601, + "num_input_tokens_seen": 12843264, + "step": 6566 + }, + { + "epoch": 0.8703777335984095, + "grad_norm": 10.747112274169922, + "learning_rate": 4.033433109465545e-06, + "loss": 0.4169, + "num_input_tokens_seen": 12845360, + "step": 6567 + }, + { + "epoch": 0.8705102717031147, + "grad_norm": 5.404027462005615, + "learning_rate": 4.033158928004548e-06, + "loss": 0.1806, + "num_input_tokens_seen": 12847184, + "step": 6568 + }, + { + "epoch": 0.8706428098078197, + "grad_norm": 5.204071998596191, + "learning_rate": 4.032884716983188e-06, + "loss": 0.1198, + "num_input_tokens_seen": 12849880, + "step": 6569 + }, + { + "epoch": 0.8707753479125249, + "grad_norm": 9.670557022094727, + "learning_rate": 4.032610476406753e-06, + "loss": 0.3377, + "num_input_tokens_seen": 12852136, + "step": 6570 + }, + { + "epoch": 0.8709078860172299, + "grad_norm": 7.172330379486084, + "learning_rate": 4.032336206280532e-06, + "loss": 0.227, + "num_input_tokens_seen": 12853704, + "step": 6571 + }, + { + "epoch": 0.871040424121935, + "grad_norm": 5.541779041290283, + "learning_rate": 4.032061906609811e-06, + "loss": 0.2366, + "num_input_tokens_seen": 12855792, + "step": 6572 + }, + { + "epoch": 0.8711729622266402, + "grad_norm": 0.06213773787021637, + "learning_rate": 4.031787577399879e-06, + "loss": 0.0004, + "num_input_tokens_seen": 12857336, + "step": 6573 + }, + { + "epoch": 0.8713055003313452, + "grad_norm": 3.156280755996704, + "learning_rate": 4.031513218656026e-06, + "loss": 0.1074, + "num_input_tokens_seen": 12859496, + "step": 6574 + }, + { + "epoch": 0.8714380384360504, + "grad_norm": 0.046763766556978226, + "learning_rate": 4.0312388303835425e-06, + "loss": 0.0003, + "num_input_tokens_seen": 12861056, + "step": 6575 + }, + { + "epoch": 0.8715705765407554, + "grad_norm": 4.981733322143555, + "learning_rate": 4.030964412587717e-06, + "loss": 0.0966, + "num_input_tokens_seen": 12863544, + "step": 6576 + }, + { + "epoch": 0.8717031146454606, + "grad_norm": 10.443001747131348, + "learning_rate": 4.0306899652738425e-06, + "loss": 0.0955, + "num_input_tokens_seen": 12865536, + "step": 6577 + }, + { + "epoch": 0.8718356527501657, + "grad_norm": 0.08453378081321716, + "learning_rate": 4.030415488447209e-06, + "loss": 0.0006, + "num_input_tokens_seen": 12867384, + "step": 6578 + }, + { + "epoch": 0.8719681908548708, + "grad_norm": 0.3494601845741272, + "learning_rate": 4.030140982113109e-06, + "loss": 0.0024, + "num_input_tokens_seen": 12870032, + "step": 6579 + }, + { + "epoch": 0.8721007289595759, + "grad_norm": 1.848763346672058, + "learning_rate": 4.029866446276835e-06, + "loss": 0.0222, + "num_input_tokens_seen": 12872280, + "step": 6580 + }, + { + "epoch": 0.872233267064281, + "grad_norm": 6.874385833740234, + "learning_rate": 4.029591880943681e-06, + "loss": 0.2263, + "num_input_tokens_seen": 12874144, + "step": 6581 + }, + { + "epoch": 0.8723658051689861, + "grad_norm": 10.52022933959961, + "learning_rate": 4.02931728611894e-06, + "loss": 0.2186, + "num_input_tokens_seen": 12875472, + "step": 6582 + }, + { + "epoch": 0.8724983432736911, + "grad_norm": 0.05349539965391159, + "learning_rate": 4.029042661807907e-06, + "loss": 0.0004, + "num_input_tokens_seen": 12876920, + "step": 6583 + }, + { + "epoch": 0.8726308813783963, + "grad_norm": 1.7303128242492676, + "learning_rate": 4.028768008015876e-06, + "loss": 0.0235, + "num_input_tokens_seen": 12878480, + "step": 6584 + }, + { + "epoch": 0.8727634194831014, + "grad_norm": 3.1480493545532227, + "learning_rate": 4.028493324748144e-06, + "loss": 0.0241, + "num_input_tokens_seen": 12879888, + "step": 6585 + }, + { + "epoch": 0.8728959575878065, + "grad_norm": 0.13807442784309387, + "learning_rate": 4.028218612010006e-06, + "loss": 0.001, + "num_input_tokens_seen": 12882544, + "step": 6586 + }, + { + "epoch": 0.8730284956925116, + "grad_norm": 4.553720474243164, + "learning_rate": 4.027943869806759e-06, + "loss": 0.1082, + "num_input_tokens_seen": 12884424, + "step": 6587 + }, + { + "epoch": 0.8731610337972167, + "grad_norm": 9.512887001037598, + "learning_rate": 4.027669098143699e-06, + "loss": 0.2467, + "num_input_tokens_seen": 12886200, + "step": 6588 + }, + { + "epoch": 0.8732935719019218, + "grad_norm": 5.4600114822387695, + "learning_rate": 4.027394297026126e-06, + "loss": 0.0931, + "num_input_tokens_seen": 12889040, + "step": 6589 + }, + { + "epoch": 0.873426110006627, + "grad_norm": 0.48244115710258484, + "learning_rate": 4.0271194664593375e-06, + "loss": 0.0033, + "num_input_tokens_seen": 12891152, + "step": 6590 + }, + { + "epoch": 0.873558648111332, + "grad_norm": 12.173998832702637, + "learning_rate": 4.026844606448632e-06, + "loss": 0.3115, + "num_input_tokens_seen": 12893264, + "step": 6591 + }, + { + "epoch": 0.8736911862160371, + "grad_norm": 3.688328742980957, + "learning_rate": 4.026569716999309e-06, + "loss": 0.0504, + "num_input_tokens_seen": 12894728, + "step": 6592 + }, + { + "epoch": 0.8738237243207422, + "grad_norm": 8.168661117553711, + "learning_rate": 4.0262947981166685e-06, + "loss": 0.1484, + "num_input_tokens_seen": 12896304, + "step": 6593 + }, + { + "epoch": 0.8739562624254473, + "grad_norm": 11.464158058166504, + "learning_rate": 4.026019849806011e-06, + "loss": 0.3719, + "num_input_tokens_seen": 12897872, + "step": 6594 + }, + { + "epoch": 0.8740888005301524, + "grad_norm": 11.796993255615234, + "learning_rate": 4.0257448720726386e-06, + "loss": 0.516, + "num_input_tokens_seen": 12899952, + "step": 6595 + }, + { + "epoch": 0.8742213386348575, + "grad_norm": 3.6393373012542725, + "learning_rate": 4.025469864921853e-06, + "loss": 0.181, + "num_input_tokens_seen": 12901696, + "step": 6596 + }, + { + "epoch": 0.8743538767395627, + "grad_norm": 0.2923526465892792, + "learning_rate": 4.025194828358955e-06, + "loss": 0.002, + "num_input_tokens_seen": 12903064, + "step": 6597 + }, + { + "epoch": 0.8744864148442677, + "grad_norm": 11.216588020324707, + "learning_rate": 4.024919762389249e-06, + "loss": 0.479, + "num_input_tokens_seen": 12905344, + "step": 6598 + }, + { + "epoch": 0.8746189529489728, + "grad_norm": 9.599145889282227, + "learning_rate": 4.024644667018038e-06, + "loss": 0.2199, + "num_input_tokens_seen": 12907096, + "step": 6599 + }, + { + "epoch": 0.8747514910536779, + "grad_norm": 0.379406213760376, + "learning_rate": 4.024369542250626e-06, + "loss": 0.0022, + "num_input_tokens_seen": 12908352, + "step": 6600 + }, + { + "epoch": 0.874884029158383, + "grad_norm": 10.412335395812988, + "learning_rate": 4.024094388092318e-06, + "loss": 0.4612, + "num_input_tokens_seen": 12910512, + "step": 6601 + }, + { + "epoch": 0.8750165672630882, + "grad_norm": 9.301907539367676, + "learning_rate": 4.023819204548418e-06, + "loss": 0.2108, + "num_input_tokens_seen": 12912184, + "step": 6602 + }, + { + "epoch": 0.8751491053677932, + "grad_norm": 0.5500617027282715, + "learning_rate": 4.023543991624233e-06, + "loss": 0.0019, + "num_input_tokens_seen": 12913640, + "step": 6603 + }, + { + "epoch": 0.8752816434724984, + "grad_norm": 0.10720466077327728, + "learning_rate": 4.0232687493250685e-06, + "loss": 0.0007, + "num_input_tokens_seen": 12915248, + "step": 6604 + }, + { + "epoch": 0.8754141815772034, + "grad_norm": 2.823162078857422, + "learning_rate": 4.022993477656232e-06, + "loss": 0.0283, + "num_input_tokens_seen": 12917208, + "step": 6605 + }, + { + "epoch": 0.8755467196819086, + "grad_norm": 11.075665473937988, + "learning_rate": 4.02271817662303e-06, + "loss": 0.3902, + "num_input_tokens_seen": 12920000, + "step": 6606 + }, + { + "epoch": 0.8756792577866136, + "grad_norm": 0.06861568987369537, + "learning_rate": 4.022442846230772e-06, + "loss": 0.0005, + "num_input_tokens_seen": 12921360, + "step": 6607 + }, + { + "epoch": 0.8758117958913187, + "grad_norm": 11.680721282958984, + "learning_rate": 4.022167486484766e-06, + "loss": 0.4556, + "num_input_tokens_seen": 12923136, + "step": 6608 + }, + { + "epoch": 0.8759443339960239, + "grad_norm": 7.1249823570251465, + "learning_rate": 4.0218920973903196e-06, + "loss": 0.3183, + "num_input_tokens_seen": 12925424, + "step": 6609 + }, + { + "epoch": 0.8760768721007289, + "grad_norm": 5.983315944671631, + "learning_rate": 4.021616678952744e-06, + "loss": 0.2933, + "num_input_tokens_seen": 12928032, + "step": 6610 + }, + { + "epoch": 0.8762094102054341, + "grad_norm": 14.065616607666016, + "learning_rate": 4.021341231177349e-06, + "loss": 0.5769, + "num_input_tokens_seen": 12930288, + "step": 6611 + }, + { + "epoch": 0.8763419483101391, + "grad_norm": 6.923241138458252, + "learning_rate": 4.021065754069446e-06, + "loss": 0.1123, + "num_input_tokens_seen": 12931904, + "step": 6612 + }, + { + "epoch": 0.8764744864148443, + "grad_norm": 0.06827846169471741, + "learning_rate": 4.020790247634345e-06, + "loss": 0.0005, + "num_input_tokens_seen": 12934416, + "step": 6613 + }, + { + "epoch": 0.8766070245195494, + "grad_norm": 10.93906021118164, + "learning_rate": 4.0205147118773605e-06, + "loss": 0.3382, + "num_input_tokens_seen": 12935680, + "step": 6614 + }, + { + "epoch": 0.8767395626242545, + "grad_norm": 6.033178806304932, + "learning_rate": 4.0202391468038025e-06, + "loss": 0.2365, + "num_input_tokens_seen": 12937344, + "step": 6615 + }, + { + "epoch": 0.8768721007289596, + "grad_norm": 3.401136875152588, + "learning_rate": 4.019963552418984e-06, + "loss": 0.0862, + "num_input_tokens_seen": 12938712, + "step": 6616 + }, + { + "epoch": 0.8770046388336646, + "grad_norm": 6.614410400390625, + "learning_rate": 4.0196879287282206e-06, + "loss": 0.1149, + "num_input_tokens_seen": 12940768, + "step": 6617 + }, + { + "epoch": 0.8771371769383698, + "grad_norm": 5.3681254386901855, + "learning_rate": 4.019412275736826e-06, + "loss": 0.1667, + "num_input_tokens_seen": 12942848, + "step": 6618 + }, + { + "epoch": 0.8772697150430749, + "grad_norm": 0.15372075140476227, + "learning_rate": 4.0191365934501144e-06, + "loss": 0.0009, + "num_input_tokens_seen": 12943992, + "step": 6619 + }, + { + "epoch": 0.87740225314778, + "grad_norm": 9.31204605102539, + "learning_rate": 4.018860881873401e-06, + "loss": 0.403, + "num_input_tokens_seen": 12946016, + "step": 6620 + }, + { + "epoch": 0.8775347912524851, + "grad_norm": 8.074703216552734, + "learning_rate": 4.018585141012002e-06, + "loss": 0.2962, + "num_input_tokens_seen": 12947144, + "step": 6621 + }, + { + "epoch": 0.8776673293571902, + "grad_norm": 6.778656959533691, + "learning_rate": 4.0183093708712336e-06, + "loss": 0.3136, + "num_input_tokens_seen": 12949592, + "step": 6622 + }, + { + "epoch": 0.8777998674618953, + "grad_norm": 8.215835571289062, + "learning_rate": 4.018033571456413e-06, + "loss": 0.2009, + "num_input_tokens_seen": 12951824, + "step": 6623 + }, + { + "epoch": 0.8779324055666003, + "grad_norm": 0.05925935506820679, + "learning_rate": 4.017757742772858e-06, + "loss": 0.0004, + "num_input_tokens_seen": 12952912, + "step": 6624 + }, + { + "epoch": 0.8780649436713055, + "grad_norm": 7.9847307205200195, + "learning_rate": 4.017481884825887e-06, + "loss": 0.175, + "num_input_tokens_seen": 12954736, + "step": 6625 + }, + { + "epoch": 0.8781974817760106, + "grad_norm": 13.987960815429688, + "learning_rate": 4.017205997620818e-06, + "loss": 0.3576, + "num_input_tokens_seen": 12956136, + "step": 6626 + }, + { + "epoch": 0.8783300198807157, + "grad_norm": 11.616796493530273, + "learning_rate": 4.0169300811629715e-06, + "loss": 0.2846, + "num_input_tokens_seen": 12958144, + "step": 6627 + }, + { + "epoch": 0.8784625579854208, + "grad_norm": 8.661978721618652, + "learning_rate": 4.016654135457666e-06, + "loss": 0.1598, + "num_input_tokens_seen": 12960272, + "step": 6628 + }, + { + "epoch": 0.8785950960901259, + "grad_norm": 4.389837265014648, + "learning_rate": 4.016378160510223e-06, + "loss": 0.1042, + "num_input_tokens_seen": 12961864, + "step": 6629 + }, + { + "epoch": 0.878727634194831, + "grad_norm": 7.631089210510254, + "learning_rate": 4.016102156325961e-06, + "loss": 0.2595, + "num_input_tokens_seen": 12963896, + "step": 6630 + }, + { + "epoch": 0.8788601722995362, + "grad_norm": 0.6388699412345886, + "learning_rate": 4.015826122910206e-06, + "loss": 0.0047, + "num_input_tokens_seen": 12965648, + "step": 6631 + }, + { + "epoch": 0.8789927104042412, + "grad_norm": 7.036970138549805, + "learning_rate": 4.015550060268276e-06, + "loss": 0.1062, + "num_input_tokens_seen": 12967120, + "step": 6632 + }, + { + "epoch": 0.8791252485089464, + "grad_norm": 0.9120568037033081, + "learning_rate": 4.015273968405497e-06, + "loss": 0.0065, + "num_input_tokens_seen": 12969776, + "step": 6633 + }, + { + "epoch": 0.8792577866136514, + "grad_norm": 2.7749216556549072, + "learning_rate": 4.0149978473271885e-06, + "loss": 0.03, + "num_input_tokens_seen": 12972888, + "step": 6634 + }, + { + "epoch": 0.8793903247183565, + "grad_norm": 4.196609020233154, + "learning_rate": 4.014721697038678e-06, + "loss": 0.1137, + "num_input_tokens_seen": 12975280, + "step": 6635 + }, + { + "epoch": 0.8795228628230616, + "grad_norm": 0.3837064802646637, + "learning_rate": 4.0144455175452875e-06, + "loss": 0.0028, + "num_input_tokens_seen": 12976952, + "step": 6636 + }, + { + "epoch": 0.8796554009277667, + "grad_norm": 5.483085632324219, + "learning_rate": 4.014169308852343e-06, + "loss": 0.1313, + "num_input_tokens_seen": 12978584, + "step": 6637 + }, + { + "epoch": 0.8797879390324719, + "grad_norm": 0.23507626354694366, + "learning_rate": 4.013893070965169e-06, + "loss": 0.0017, + "num_input_tokens_seen": 12979696, + "step": 6638 + }, + { + "epoch": 0.8799204771371769, + "grad_norm": 8.515172004699707, + "learning_rate": 4.013616803889093e-06, + "loss": 0.1387, + "num_input_tokens_seen": 12980888, + "step": 6639 + }, + { + "epoch": 0.8800530152418821, + "grad_norm": 1.915569543838501, + "learning_rate": 4.013340507629441e-06, + "loss": 0.0343, + "num_input_tokens_seen": 12982792, + "step": 6640 + }, + { + "epoch": 0.8801855533465871, + "grad_norm": 6.3353962898254395, + "learning_rate": 4.013064182191538e-06, + "loss": 0.1806, + "num_input_tokens_seen": 12984392, + "step": 6641 + }, + { + "epoch": 0.8803180914512923, + "grad_norm": 0.6934639811515808, + "learning_rate": 4.012787827580716e-06, + "loss": 0.0052, + "num_input_tokens_seen": 12986856, + "step": 6642 + }, + { + "epoch": 0.8804506295559974, + "grad_norm": 6.269089221954346, + "learning_rate": 4.0125114438023005e-06, + "loss": 0.06, + "num_input_tokens_seen": 12989112, + "step": 6643 + }, + { + "epoch": 0.8805831676607024, + "grad_norm": 1.2445381879806519, + "learning_rate": 4.012235030861621e-06, + "loss": 0.0276, + "num_input_tokens_seen": 12990464, + "step": 6644 + }, + { + "epoch": 0.8807157057654076, + "grad_norm": 15.877988815307617, + "learning_rate": 4.011958588764006e-06, + "loss": 0.2251, + "num_input_tokens_seen": 12991824, + "step": 6645 + }, + { + "epoch": 0.8808482438701126, + "grad_norm": 2.171858310699463, + "learning_rate": 4.011682117514787e-06, + "loss": 0.0285, + "num_input_tokens_seen": 12993880, + "step": 6646 + }, + { + "epoch": 0.8809807819748178, + "grad_norm": 1.752172827720642, + "learning_rate": 4.011405617119294e-06, + "loss": 0.011, + "num_input_tokens_seen": 12995216, + "step": 6647 + }, + { + "epoch": 0.8811133200795228, + "grad_norm": 10.062243461608887, + "learning_rate": 4.0111290875828575e-06, + "loss": 0.1441, + "num_input_tokens_seen": 12996736, + "step": 6648 + }, + { + "epoch": 0.881245858184228, + "grad_norm": 3.414544105529785, + "learning_rate": 4.01085252891081e-06, + "loss": 0.0154, + "num_input_tokens_seen": 12998872, + "step": 6649 + }, + { + "epoch": 0.8813783962889331, + "grad_norm": 11.534857749938965, + "learning_rate": 4.010575941108485e-06, + "loss": 0.5361, + "num_input_tokens_seen": 13001008, + "step": 6650 + }, + { + "epoch": 0.8815109343936381, + "grad_norm": 4.450883865356445, + "learning_rate": 4.010299324181212e-06, + "loss": 0.0311, + "num_input_tokens_seen": 13003184, + "step": 6651 + }, + { + "epoch": 0.8816434724983433, + "grad_norm": 14.355125427246094, + "learning_rate": 4.010022678134326e-06, + "loss": 0.6832, + "num_input_tokens_seen": 13005088, + "step": 6652 + }, + { + "epoch": 0.8817760106030483, + "grad_norm": 6.204155445098877, + "learning_rate": 4.009746002973162e-06, + "loss": 0.0783, + "num_input_tokens_seen": 13007872, + "step": 6653 + }, + { + "epoch": 0.8819085487077535, + "grad_norm": 0.11956506222486496, + "learning_rate": 4.009469298703052e-06, + "loss": 0.0009, + "num_input_tokens_seen": 13009120, + "step": 6654 + }, + { + "epoch": 0.8820410868124586, + "grad_norm": 8.148037910461426, + "learning_rate": 4.0091925653293346e-06, + "loss": 0.2221, + "num_input_tokens_seen": 13012168, + "step": 6655 + }, + { + "epoch": 0.8821736249171637, + "grad_norm": 5.589107513427734, + "learning_rate": 4.0089158028573415e-06, + "loss": 0.2554, + "num_input_tokens_seen": 13015592, + "step": 6656 + }, + { + "epoch": 0.8823061630218688, + "grad_norm": 16.427452087402344, + "learning_rate": 4.008639011292412e-06, + "loss": 0.6696, + "num_input_tokens_seen": 13017512, + "step": 6657 + }, + { + "epoch": 0.8824387011265739, + "grad_norm": 7.600020885467529, + "learning_rate": 4.008362190639882e-06, + "loss": 0.122, + "num_input_tokens_seen": 13019256, + "step": 6658 + }, + { + "epoch": 0.882571239231279, + "grad_norm": 0.22167982161045074, + "learning_rate": 4.008085340905087e-06, + "loss": 0.0011, + "num_input_tokens_seen": 13020336, + "step": 6659 + }, + { + "epoch": 0.882703777335984, + "grad_norm": 13.876978874206543, + "learning_rate": 4.007808462093368e-06, + "loss": 0.1771, + "num_input_tokens_seen": 13021976, + "step": 6660 + }, + { + "epoch": 0.8828363154406892, + "grad_norm": 0.05276597663760185, + "learning_rate": 4.00753155421006e-06, + "loss": 0.0004, + "num_input_tokens_seen": 13023816, + "step": 6661 + }, + { + "epoch": 0.8829688535453943, + "grad_norm": 0.12885721027851105, + "learning_rate": 4.007254617260504e-06, + "loss": 0.001, + "num_input_tokens_seen": 13027176, + "step": 6662 + }, + { + "epoch": 0.8831013916500994, + "grad_norm": 10.068879127502441, + "learning_rate": 4.00697765125004e-06, + "loss": 0.2176, + "num_input_tokens_seen": 13028504, + "step": 6663 + }, + { + "epoch": 0.8832339297548045, + "grad_norm": 10.87276554107666, + "learning_rate": 4.006700656184007e-06, + "loss": 0.3801, + "num_input_tokens_seen": 13030840, + "step": 6664 + }, + { + "epoch": 0.8833664678595096, + "grad_norm": 6.351883411407471, + "learning_rate": 4.006423632067745e-06, + "loss": 0.0649, + "num_input_tokens_seen": 13033400, + "step": 6665 + }, + { + "epoch": 0.8834990059642147, + "grad_norm": 0.27046671509742737, + "learning_rate": 4.006146578906597e-06, + "loss": 0.0017, + "num_input_tokens_seen": 13034912, + "step": 6666 + }, + { + "epoch": 0.8836315440689199, + "grad_norm": 4.643550872802734, + "learning_rate": 4.005869496705904e-06, + "loss": 0.1549, + "num_input_tokens_seen": 13038072, + "step": 6667 + }, + { + "epoch": 0.8837640821736249, + "grad_norm": 1.3105309009552002, + "learning_rate": 4.005592385471008e-06, + "loss": 0.0089, + "num_input_tokens_seen": 13040344, + "step": 6668 + }, + { + "epoch": 0.88389662027833, + "grad_norm": 8.70883846282959, + "learning_rate": 4.005315245207252e-06, + "loss": 0.1138, + "num_input_tokens_seen": 13042744, + "step": 6669 + }, + { + "epoch": 0.8840291583830351, + "grad_norm": 12.156509399414062, + "learning_rate": 4.00503807591998e-06, + "loss": 0.19, + "num_input_tokens_seen": 13045536, + "step": 6670 + }, + { + "epoch": 0.8841616964877402, + "grad_norm": 0.3896006941795349, + "learning_rate": 4.0047608776145356e-06, + "loss": 0.0028, + "num_input_tokens_seen": 13048128, + "step": 6671 + }, + { + "epoch": 0.8842942345924454, + "grad_norm": 10.521852493286133, + "learning_rate": 4.004483650296263e-06, + "loss": 0.3892, + "num_input_tokens_seen": 13049912, + "step": 6672 + }, + { + "epoch": 0.8844267726971504, + "grad_norm": 9.006622314453125, + "learning_rate": 4.004206393970508e-06, + "loss": 0.165, + "num_input_tokens_seen": 13051368, + "step": 6673 + }, + { + "epoch": 0.8845593108018556, + "grad_norm": 12.107438087463379, + "learning_rate": 4.003929108642616e-06, + "loss": 0.3605, + "num_input_tokens_seen": 13052912, + "step": 6674 + }, + { + "epoch": 0.8846918489065606, + "grad_norm": 7.598019599914551, + "learning_rate": 4.0036517943179335e-06, + "loss": 0.235, + "num_input_tokens_seen": 13055072, + "step": 6675 + }, + { + "epoch": 0.8848243870112658, + "grad_norm": 24.836118698120117, + "learning_rate": 4.003374451001806e-06, + "loss": 0.7976, + "num_input_tokens_seen": 13057560, + "step": 6676 + }, + { + "epoch": 0.8849569251159708, + "grad_norm": 7.8215765953063965, + "learning_rate": 4.003097078699584e-06, + "loss": 0.2053, + "num_input_tokens_seen": 13059128, + "step": 6677 + }, + { + "epoch": 0.8850894632206759, + "grad_norm": 11.228286743164062, + "learning_rate": 4.002819677416612e-06, + "loss": 0.4661, + "num_input_tokens_seen": 13060952, + "step": 6678 + }, + { + "epoch": 0.8852220013253811, + "grad_norm": 14.540953636169434, + "learning_rate": 4.00254224715824e-06, + "loss": 0.475, + "num_input_tokens_seen": 13062392, + "step": 6679 + }, + { + "epoch": 0.8853545394300861, + "grad_norm": 6.626898288726807, + "learning_rate": 4.002264787929816e-06, + "loss": 0.173, + "num_input_tokens_seen": 13064336, + "step": 6680 + }, + { + "epoch": 0.8854870775347913, + "grad_norm": 11.635744094848633, + "learning_rate": 4.001987299736692e-06, + "loss": 0.3065, + "num_input_tokens_seen": 13066504, + "step": 6681 + }, + { + "epoch": 0.8856196156394963, + "grad_norm": 10.648580551147461, + "learning_rate": 4.001709782584216e-06, + "loss": 0.3117, + "num_input_tokens_seen": 13068184, + "step": 6682 + }, + { + "epoch": 0.8857521537442015, + "grad_norm": 9.168694496154785, + "learning_rate": 4.00143223647774e-06, + "loss": 0.3817, + "num_input_tokens_seen": 13070440, + "step": 6683 + }, + { + "epoch": 0.8858846918489066, + "grad_norm": 11.912299156188965, + "learning_rate": 4.001154661422614e-06, + "loss": 0.502, + "num_input_tokens_seen": 13072424, + "step": 6684 + }, + { + "epoch": 0.8860172299536117, + "grad_norm": 8.324748039245605, + "learning_rate": 4.000877057424191e-06, + "loss": 0.1732, + "num_input_tokens_seen": 13073608, + "step": 6685 + }, + { + "epoch": 0.8861497680583168, + "grad_norm": 0.3402670621871948, + "learning_rate": 4.000599424487822e-06, + "loss": 0.0026, + "num_input_tokens_seen": 13074744, + "step": 6686 + }, + { + "epoch": 0.8862823061630218, + "grad_norm": 8.010757446289062, + "learning_rate": 4.000321762618861e-06, + "loss": 0.271, + "num_input_tokens_seen": 13076984, + "step": 6687 + }, + { + "epoch": 0.886414844267727, + "grad_norm": 8.477370262145996, + "learning_rate": 4.000044071822663e-06, + "loss": 0.2347, + "num_input_tokens_seen": 13078232, + "step": 6688 + }, + { + "epoch": 0.886547382372432, + "grad_norm": 13.76172924041748, + "learning_rate": 3.999766352104581e-06, + "loss": 0.5181, + "num_input_tokens_seen": 13079808, + "step": 6689 + }, + { + "epoch": 0.8866799204771372, + "grad_norm": 11.809602737426758, + "learning_rate": 3.999488603469967e-06, + "loss": 0.4689, + "num_input_tokens_seen": 13083120, + "step": 6690 + }, + { + "epoch": 0.8868124585818423, + "grad_norm": 0.4083312749862671, + "learning_rate": 3.999210825924178e-06, + "loss": 0.0031, + "num_input_tokens_seen": 13085400, + "step": 6691 + }, + { + "epoch": 0.8869449966865474, + "grad_norm": 4.873818874359131, + "learning_rate": 3.998933019472572e-06, + "loss": 0.0835, + "num_input_tokens_seen": 13088216, + "step": 6692 + }, + { + "epoch": 0.8870775347912525, + "grad_norm": 3.6133882999420166, + "learning_rate": 3.998655184120501e-06, + "loss": 0.0868, + "num_input_tokens_seen": 13091176, + "step": 6693 + }, + { + "epoch": 0.8872100728959575, + "grad_norm": 0.3113093376159668, + "learning_rate": 3.998377319873326e-06, + "loss": 0.0024, + "num_input_tokens_seen": 13093288, + "step": 6694 + }, + { + "epoch": 0.8873426110006627, + "grad_norm": 0.29036927223205566, + "learning_rate": 3.998099426736402e-06, + "loss": 0.0023, + "num_input_tokens_seen": 13094864, + "step": 6695 + }, + { + "epoch": 0.8874751491053678, + "grad_norm": 7.253854751586914, + "learning_rate": 3.997821504715087e-06, + "loss": 0.1559, + "num_input_tokens_seen": 13097560, + "step": 6696 + }, + { + "epoch": 0.8876076872100729, + "grad_norm": 3.300212860107422, + "learning_rate": 3.997543553814741e-06, + "loss": 0.0277, + "num_input_tokens_seen": 13099112, + "step": 6697 + }, + { + "epoch": 0.887740225314778, + "grad_norm": 9.389571189880371, + "learning_rate": 3.997265574040722e-06, + "loss": 0.1841, + "num_input_tokens_seen": 13101016, + "step": 6698 + }, + { + "epoch": 0.8878727634194831, + "grad_norm": 3.598320484161377, + "learning_rate": 3.996987565398391e-06, + "loss": 0.0691, + "num_input_tokens_seen": 13103208, + "step": 6699 + }, + { + "epoch": 0.8880053015241882, + "grad_norm": 2.46490216255188, + "learning_rate": 3.9967095278931045e-06, + "loss": 0.0178, + "num_input_tokens_seen": 13104664, + "step": 6700 + }, + { + "epoch": 0.8881378396288933, + "grad_norm": 0.32370656728744507, + "learning_rate": 3.996431461530227e-06, + "loss": 0.0025, + "num_input_tokens_seen": 13107584, + "step": 6701 + }, + { + "epoch": 0.8882703777335984, + "grad_norm": 0.3997730016708374, + "learning_rate": 3.996153366315119e-06, + "loss": 0.003, + "num_input_tokens_seen": 13108984, + "step": 6702 + }, + { + "epoch": 0.8884029158383036, + "grad_norm": 3.722670555114746, + "learning_rate": 3.995875242253141e-06, + "loss": 0.0491, + "num_input_tokens_seen": 13110576, + "step": 6703 + }, + { + "epoch": 0.8885354539430086, + "grad_norm": 8.67357063293457, + "learning_rate": 3.995597089349657e-06, + "loss": 0.2392, + "num_input_tokens_seen": 13112344, + "step": 6704 + }, + { + "epoch": 0.8886679920477137, + "grad_norm": 7.750943183898926, + "learning_rate": 3.9953189076100295e-06, + "loss": 0.173, + "num_input_tokens_seen": 13114192, + "step": 6705 + }, + { + "epoch": 0.8888005301524188, + "grad_norm": 8.844950675964355, + "learning_rate": 3.995040697039621e-06, + "loss": 0.2586, + "num_input_tokens_seen": 13115752, + "step": 6706 + }, + { + "epoch": 0.8889330682571239, + "grad_norm": 9.06618881225586, + "learning_rate": 3.9947624576437975e-06, + "loss": 0.1829, + "num_input_tokens_seen": 13117296, + "step": 6707 + }, + { + "epoch": 0.8890656063618291, + "grad_norm": 3.3334929943084717, + "learning_rate": 3.994484189427922e-06, + "loss": 0.0946, + "num_input_tokens_seen": 13119480, + "step": 6708 + }, + { + "epoch": 0.8891981444665341, + "grad_norm": 14.31494426727295, + "learning_rate": 3.9942058923973605e-06, + "loss": 0.552, + "num_input_tokens_seen": 13121840, + "step": 6709 + }, + { + "epoch": 0.8893306825712393, + "grad_norm": 10.512606620788574, + "learning_rate": 3.993927566557479e-06, + "loss": 0.3996, + "num_input_tokens_seen": 13123488, + "step": 6710 + }, + { + "epoch": 0.8894632206759443, + "grad_norm": 0.23473520576953888, + "learning_rate": 3.993649211913644e-06, + "loss": 0.0018, + "num_input_tokens_seen": 13125008, + "step": 6711 + }, + { + "epoch": 0.8895957587806494, + "grad_norm": 9.906105041503906, + "learning_rate": 3.99337082847122e-06, + "loss": 0.113, + "num_input_tokens_seen": 13126640, + "step": 6712 + }, + { + "epoch": 0.8897282968853545, + "grad_norm": 13.26136589050293, + "learning_rate": 3.993092416235577e-06, + "loss": 0.4776, + "num_input_tokens_seen": 13128528, + "step": 6713 + }, + { + "epoch": 0.8898608349900596, + "grad_norm": 12.092175483703613, + "learning_rate": 3.992813975212082e-06, + "loss": 0.4394, + "num_input_tokens_seen": 13130552, + "step": 6714 + }, + { + "epoch": 0.8899933730947648, + "grad_norm": 10.42903995513916, + "learning_rate": 3.992535505406105e-06, + "loss": 0.3662, + "num_input_tokens_seen": 13133088, + "step": 6715 + }, + { + "epoch": 0.8901259111994698, + "grad_norm": 0.21451830863952637, + "learning_rate": 3.992257006823013e-06, + "loss": 0.0015, + "num_input_tokens_seen": 13134248, + "step": 6716 + }, + { + "epoch": 0.890258449304175, + "grad_norm": 0.27231305837631226, + "learning_rate": 3.991978479468176e-06, + "loss": 0.0021, + "num_input_tokens_seen": 13135600, + "step": 6717 + }, + { + "epoch": 0.89039098740888, + "grad_norm": 1.050702452659607, + "learning_rate": 3.991699923346964e-06, + "loss": 0.0078, + "num_input_tokens_seen": 13137056, + "step": 6718 + }, + { + "epoch": 0.8905235255135852, + "grad_norm": 0.4525830149650574, + "learning_rate": 3.99142133846475e-06, + "loss": 0.0036, + "num_input_tokens_seen": 13138888, + "step": 6719 + }, + { + "epoch": 0.8906560636182903, + "grad_norm": 11.103044509887695, + "learning_rate": 3.991142724826903e-06, + "loss": 0.2176, + "num_input_tokens_seen": 13141448, + "step": 6720 + }, + { + "epoch": 0.8907886017229953, + "grad_norm": 5.946866989135742, + "learning_rate": 3.990864082438795e-06, + "loss": 0.054, + "num_input_tokens_seen": 13143008, + "step": 6721 + }, + { + "epoch": 0.8909211398277005, + "grad_norm": 4.217400074005127, + "learning_rate": 3.9905854113058e-06, + "loss": 0.1149, + "num_input_tokens_seen": 13144896, + "step": 6722 + }, + { + "epoch": 0.8910536779324055, + "grad_norm": 9.32218074798584, + "learning_rate": 3.9903067114332896e-06, + "loss": 0.2036, + "num_input_tokens_seen": 13146960, + "step": 6723 + }, + { + "epoch": 0.8911862160371107, + "grad_norm": 3.407961368560791, + "learning_rate": 3.990027982826637e-06, + "loss": 0.0809, + "num_input_tokens_seen": 13149016, + "step": 6724 + }, + { + "epoch": 0.8913187541418157, + "grad_norm": 0.24109815061092377, + "learning_rate": 3.989749225491218e-06, + "loss": 0.0018, + "num_input_tokens_seen": 13150328, + "step": 6725 + }, + { + "epoch": 0.8914512922465209, + "grad_norm": 11.359609603881836, + "learning_rate": 3.989470439432406e-06, + "loss": 0.2062, + "num_input_tokens_seen": 13152232, + "step": 6726 + }, + { + "epoch": 0.891583830351226, + "grad_norm": 0.488369345664978, + "learning_rate": 3.989191624655576e-06, + "loss": 0.0038, + "num_input_tokens_seen": 13154184, + "step": 6727 + }, + { + "epoch": 0.891716368455931, + "grad_norm": 4.248936653137207, + "learning_rate": 3.9889127811661046e-06, + "loss": 0.152, + "num_input_tokens_seen": 13155400, + "step": 6728 + }, + { + "epoch": 0.8918489065606362, + "grad_norm": 4.701811790466309, + "learning_rate": 3.988633908969367e-06, + "loss": 0.1707, + "num_input_tokens_seen": 13157432, + "step": 6729 + }, + { + "epoch": 0.8919814446653412, + "grad_norm": 13.551857948303223, + "learning_rate": 3.988355008070742e-06, + "loss": 0.2606, + "num_input_tokens_seen": 13160128, + "step": 6730 + }, + { + "epoch": 0.8921139827700464, + "grad_norm": 6.618265151977539, + "learning_rate": 3.988076078475605e-06, + "loss": 0.1961, + "num_input_tokens_seen": 13161600, + "step": 6731 + }, + { + "epoch": 0.8922465208747515, + "grad_norm": 0.18092544376850128, + "learning_rate": 3.987797120189334e-06, + "loss": 0.0014, + "num_input_tokens_seen": 13163040, + "step": 6732 + }, + { + "epoch": 0.8923790589794566, + "grad_norm": 0.19574877619743347, + "learning_rate": 3.987518133217309e-06, + "loss": 0.0015, + "num_input_tokens_seen": 13166640, + "step": 6733 + }, + { + "epoch": 0.8925115970841617, + "grad_norm": 8.873095512390137, + "learning_rate": 3.9872391175649075e-06, + "loss": 0.3073, + "num_input_tokens_seen": 13168536, + "step": 6734 + }, + { + "epoch": 0.8926441351888668, + "grad_norm": 0.14848877489566803, + "learning_rate": 3.98696007323751e-06, + "loss": 0.0011, + "num_input_tokens_seen": 13170784, + "step": 6735 + }, + { + "epoch": 0.8927766732935719, + "grad_norm": 15.606375694274902, + "learning_rate": 3.986681000240496e-06, + "loss": 0.4647, + "num_input_tokens_seen": 13172720, + "step": 6736 + }, + { + "epoch": 0.8929092113982771, + "grad_norm": 6.107414722442627, + "learning_rate": 3.986401898579247e-06, + "loss": 0.0759, + "num_input_tokens_seen": 13174576, + "step": 6737 + }, + { + "epoch": 0.8930417495029821, + "grad_norm": 11.266855239868164, + "learning_rate": 3.986122768259145e-06, + "loss": 0.3227, + "num_input_tokens_seen": 13176928, + "step": 6738 + }, + { + "epoch": 0.8931742876076872, + "grad_norm": 0.15079239010810852, + "learning_rate": 3.9858436092855705e-06, + "loss": 0.0011, + "num_input_tokens_seen": 13179088, + "step": 6739 + }, + { + "epoch": 0.8933068257123923, + "grad_norm": 5.368622779846191, + "learning_rate": 3.985564421663905e-06, + "loss": 0.1149, + "num_input_tokens_seen": 13181744, + "step": 6740 + }, + { + "epoch": 0.8934393638170974, + "grad_norm": 13.300801277160645, + "learning_rate": 3.985285205399534e-06, + "loss": 0.2036, + "num_input_tokens_seen": 13183760, + "step": 6741 + }, + { + "epoch": 0.8935719019218025, + "grad_norm": 0.9276784062385559, + "learning_rate": 3.985005960497839e-06, + "loss": 0.0215, + "num_input_tokens_seen": 13185584, + "step": 6742 + }, + { + "epoch": 0.8937044400265076, + "grad_norm": 0.07096778601408005, + "learning_rate": 3.9847266869642045e-06, + "loss": 0.0005, + "num_input_tokens_seen": 13187488, + "step": 6743 + }, + { + "epoch": 0.8938369781312128, + "grad_norm": 0.07736282795667648, + "learning_rate": 3.984447384804014e-06, + "loss": 0.0005, + "num_input_tokens_seen": 13188832, + "step": 6744 + }, + { + "epoch": 0.8939695162359178, + "grad_norm": 10.056239128112793, + "learning_rate": 3.984168054022656e-06, + "loss": 0.3127, + "num_input_tokens_seen": 13190704, + "step": 6745 + }, + { + "epoch": 0.894102054340623, + "grad_norm": 9.601117134094238, + "learning_rate": 3.983888694625513e-06, + "loss": 0.3174, + "num_input_tokens_seen": 13192784, + "step": 6746 + }, + { + "epoch": 0.894234592445328, + "grad_norm": 14.072978019714355, + "learning_rate": 3.983609306617972e-06, + "loss": 0.4327, + "num_input_tokens_seen": 13194176, + "step": 6747 + }, + { + "epoch": 0.8943671305500331, + "grad_norm": 5.683928489685059, + "learning_rate": 3.983329890005421e-06, + "loss": 0.1651, + "num_input_tokens_seen": 13195904, + "step": 6748 + }, + { + "epoch": 0.8944996686547383, + "grad_norm": 6.371161937713623, + "learning_rate": 3.983050444793246e-06, + "loss": 0.159, + "num_input_tokens_seen": 13197608, + "step": 6749 + }, + { + "epoch": 0.8946322067594433, + "grad_norm": 8.273656845092773, + "learning_rate": 3.9827709709868355e-06, + "loss": 0.2646, + "num_input_tokens_seen": 13199896, + "step": 6750 + }, + { + "epoch": 0.8947647448641485, + "grad_norm": 6.430032253265381, + "learning_rate": 3.9824914685915775e-06, + "loss": 0.1345, + "num_input_tokens_seen": 13202824, + "step": 6751 + }, + { + "epoch": 0.8948972829688535, + "grad_norm": 13.655915260314941, + "learning_rate": 3.982211937612861e-06, + "loss": 0.3215, + "num_input_tokens_seen": 13205168, + "step": 6752 + }, + { + "epoch": 0.8950298210735587, + "grad_norm": 10.387225151062012, + "learning_rate": 3.981932378056076e-06, + "loss": 0.2561, + "num_input_tokens_seen": 13206808, + "step": 6753 + }, + { + "epoch": 0.8951623591782637, + "grad_norm": 7.360228538513184, + "learning_rate": 3.9816527899266135e-06, + "loss": 0.265, + "num_input_tokens_seen": 13208960, + "step": 6754 + }, + { + "epoch": 0.8952948972829688, + "grad_norm": 7.233108997344971, + "learning_rate": 3.9813731732298616e-06, + "loss": 0.2099, + "num_input_tokens_seen": 13210776, + "step": 6755 + }, + { + "epoch": 0.895427435387674, + "grad_norm": 0.12063634395599365, + "learning_rate": 3.981093527971214e-06, + "loss": 0.0009, + "num_input_tokens_seen": 13212160, + "step": 6756 + }, + { + "epoch": 0.895559973492379, + "grad_norm": 13.123514175415039, + "learning_rate": 3.9808138541560605e-06, + "loss": 0.2448, + "num_input_tokens_seen": 13214728, + "step": 6757 + }, + { + "epoch": 0.8956925115970842, + "grad_norm": 9.949551582336426, + "learning_rate": 3.980534151789794e-06, + "loss": 0.2294, + "num_input_tokens_seen": 13215928, + "step": 6758 + }, + { + "epoch": 0.8958250497017892, + "grad_norm": 0.17229443788528442, + "learning_rate": 3.980254420877809e-06, + "loss": 0.0013, + "num_input_tokens_seen": 13217456, + "step": 6759 + }, + { + "epoch": 0.8959575878064944, + "grad_norm": 0.4262990951538086, + "learning_rate": 3.979974661425497e-06, + "loss": 0.0031, + "num_input_tokens_seen": 13219464, + "step": 6760 + }, + { + "epoch": 0.8960901259111995, + "grad_norm": 0.28240257501602173, + "learning_rate": 3.979694873438253e-06, + "loss": 0.0021, + "num_input_tokens_seen": 13221640, + "step": 6761 + }, + { + "epoch": 0.8962226640159046, + "grad_norm": 0.5996338129043579, + "learning_rate": 3.979415056921471e-06, + "loss": 0.0043, + "num_input_tokens_seen": 13223584, + "step": 6762 + }, + { + "epoch": 0.8963552021206097, + "grad_norm": 11.818794250488281, + "learning_rate": 3.979135211880545e-06, + "loss": 0.381, + "num_input_tokens_seen": 13225624, + "step": 6763 + }, + { + "epoch": 0.8964877402253147, + "grad_norm": 0.25988027453422546, + "learning_rate": 3.978855338320873e-06, + "loss": 0.002, + "num_input_tokens_seen": 13226936, + "step": 6764 + }, + { + "epoch": 0.8966202783300199, + "grad_norm": 1.5987876653671265, + "learning_rate": 3.978575436247849e-06, + "loss": 0.0238, + "num_input_tokens_seen": 13228432, + "step": 6765 + }, + { + "epoch": 0.8967528164347249, + "grad_norm": 4.49207067489624, + "learning_rate": 3.978295505666871e-06, + "loss": 0.165, + "num_input_tokens_seen": 13230624, + "step": 6766 + }, + { + "epoch": 0.8968853545394301, + "grad_norm": 6.721325874328613, + "learning_rate": 3.978015546583336e-06, + "loss": 0.1077, + "num_input_tokens_seen": 13232520, + "step": 6767 + }, + { + "epoch": 0.8970178926441352, + "grad_norm": 11.521770477294922, + "learning_rate": 3.977735559002641e-06, + "loss": 0.309, + "num_input_tokens_seen": 13234584, + "step": 6768 + }, + { + "epoch": 0.8971504307488403, + "grad_norm": 15.510161399841309, + "learning_rate": 3.977455542930186e-06, + "loss": 0.3792, + "num_input_tokens_seen": 13236152, + "step": 6769 + }, + { + "epoch": 0.8972829688535454, + "grad_norm": 0.4905068576335907, + "learning_rate": 3.977175498371368e-06, + "loss": 0.0035, + "num_input_tokens_seen": 13238232, + "step": 6770 + }, + { + "epoch": 0.8974155069582505, + "grad_norm": 7.077093124389648, + "learning_rate": 3.976895425331587e-06, + "loss": 0.1122, + "num_input_tokens_seen": 13240200, + "step": 6771 + }, + { + "epoch": 0.8975480450629556, + "grad_norm": 1.457783818244934, + "learning_rate": 3.976615323816244e-06, + "loss": 0.0095, + "num_input_tokens_seen": 13242040, + "step": 6772 + }, + { + "epoch": 0.8976805831676608, + "grad_norm": 6.866454601287842, + "learning_rate": 3.976335193830739e-06, + "loss": 0.1944, + "num_input_tokens_seen": 13244096, + "step": 6773 + }, + { + "epoch": 0.8978131212723658, + "grad_norm": 7.5314459800720215, + "learning_rate": 3.976055035380472e-06, + "loss": 0.2269, + "num_input_tokens_seen": 13245456, + "step": 6774 + }, + { + "epoch": 0.8979456593770709, + "grad_norm": 0.5146783590316772, + "learning_rate": 3.975774848470847e-06, + "loss": 0.0037, + "num_input_tokens_seen": 13246784, + "step": 6775 + }, + { + "epoch": 0.898078197481776, + "grad_norm": 13.591337203979492, + "learning_rate": 3.975494633107264e-06, + "loss": 0.4876, + "num_input_tokens_seen": 13249352, + "step": 6776 + }, + { + "epoch": 0.8982107355864811, + "grad_norm": 0.18662597239017487, + "learning_rate": 3.9752143892951275e-06, + "loss": 0.0013, + "num_input_tokens_seen": 13250936, + "step": 6777 + }, + { + "epoch": 0.8983432736911862, + "grad_norm": 5.383232593536377, + "learning_rate": 3.974934117039838e-06, + "loss": 0.1278, + "num_input_tokens_seen": 13253216, + "step": 6778 + }, + { + "epoch": 0.8984758117958913, + "grad_norm": 0.0787423774600029, + "learning_rate": 3.974653816346804e-06, + "loss": 0.0006, + "num_input_tokens_seen": 13254408, + "step": 6779 + }, + { + "epoch": 0.8986083499005965, + "grad_norm": 1.9264734983444214, + "learning_rate": 3.974373487221424e-06, + "loss": 0.0565, + "num_input_tokens_seen": 13256000, + "step": 6780 + }, + { + "epoch": 0.8987408880053015, + "grad_norm": 10.088286399841309, + "learning_rate": 3.974093129669109e-06, + "loss": 0.3258, + "num_input_tokens_seen": 13258168, + "step": 6781 + }, + { + "epoch": 0.8988734261100066, + "grad_norm": 13.328400611877441, + "learning_rate": 3.97381274369526e-06, + "loss": 0.3378, + "num_input_tokens_seen": 13260112, + "step": 6782 + }, + { + "epoch": 0.8990059642147117, + "grad_norm": 8.356839179992676, + "learning_rate": 3.973532329305284e-06, + "loss": 0.2847, + "num_input_tokens_seen": 13261568, + "step": 6783 + }, + { + "epoch": 0.8991385023194168, + "grad_norm": 1.082217812538147, + "learning_rate": 3.973251886504589e-06, + "loss": 0.0071, + "num_input_tokens_seen": 13263328, + "step": 6784 + }, + { + "epoch": 0.899271040424122, + "grad_norm": 9.59509563446045, + "learning_rate": 3.972971415298582e-06, + "loss": 0.2292, + "num_input_tokens_seen": 13264368, + "step": 6785 + }, + { + "epoch": 0.899403578528827, + "grad_norm": 0.48201510310173035, + "learning_rate": 3.972690915692668e-06, + "loss": 0.0026, + "num_input_tokens_seen": 13266992, + "step": 6786 + }, + { + "epoch": 0.8995361166335322, + "grad_norm": 0.16856423020362854, + "learning_rate": 3.972410387692259e-06, + "loss": 0.0012, + "num_input_tokens_seen": 13269528, + "step": 6787 + }, + { + "epoch": 0.8996686547382372, + "grad_norm": 11.04496955871582, + "learning_rate": 3.972129831302761e-06, + "loss": 0.4573, + "num_input_tokens_seen": 13271424, + "step": 6788 + }, + { + "epoch": 0.8998011928429424, + "grad_norm": 5.232909679412842, + "learning_rate": 3.9718492465295844e-06, + "loss": 0.0732, + "num_input_tokens_seen": 13273008, + "step": 6789 + }, + { + "epoch": 0.8999337309476475, + "grad_norm": 0.377306193113327, + "learning_rate": 3.971568633378139e-06, + "loss": 0.0026, + "num_input_tokens_seen": 13274256, + "step": 6790 + }, + { + "epoch": 0.9000662690523525, + "grad_norm": 1.9984349012374878, + "learning_rate": 3.971287991853835e-06, + "loss": 0.0171, + "num_input_tokens_seen": 13275648, + "step": 6791 + }, + { + "epoch": 0.9001988071570577, + "grad_norm": 7.085953712463379, + "learning_rate": 3.971007321962084e-06, + "loss": 0.1151, + "num_input_tokens_seen": 13277576, + "step": 6792 + }, + { + "epoch": 0.9003313452617627, + "grad_norm": 11.624064445495605, + "learning_rate": 3.970726623708298e-06, + "loss": 0.5022, + "num_input_tokens_seen": 13279792, + "step": 6793 + }, + { + "epoch": 0.9004638833664679, + "grad_norm": 0.25804612040519714, + "learning_rate": 3.970445897097886e-06, + "loss": 0.0017, + "num_input_tokens_seen": 13281320, + "step": 6794 + }, + { + "epoch": 0.9005964214711729, + "grad_norm": 3.893050193786621, + "learning_rate": 3.9701651421362636e-06, + "loss": 0.0833, + "num_input_tokens_seen": 13283232, + "step": 6795 + }, + { + "epoch": 0.9007289595758781, + "grad_norm": 5.530307292938232, + "learning_rate": 3.969884358828843e-06, + "loss": 0.1551, + "num_input_tokens_seen": 13285136, + "step": 6796 + }, + { + "epoch": 0.9008614976805832, + "grad_norm": 11.275203704833984, + "learning_rate": 3.9696035471810375e-06, + "loss": 0.3838, + "num_input_tokens_seen": 13287408, + "step": 6797 + }, + { + "epoch": 0.9009940357852882, + "grad_norm": 7.923050880432129, + "learning_rate": 3.969322707198263e-06, + "loss": 0.2763, + "num_input_tokens_seen": 13289064, + "step": 6798 + }, + { + "epoch": 0.9011265738899934, + "grad_norm": 0.04217950627207756, + "learning_rate": 3.969041838885932e-06, + "loss": 0.0003, + "num_input_tokens_seen": 13291064, + "step": 6799 + }, + { + "epoch": 0.9012591119946984, + "grad_norm": 10.22539234161377, + "learning_rate": 3.968760942249461e-06, + "loss": 0.2045, + "num_input_tokens_seen": 13293168, + "step": 6800 + }, + { + "epoch": 0.9013916500994036, + "grad_norm": 1.1670544147491455, + "learning_rate": 3.968480017294266e-06, + "loss": 0.0071, + "num_input_tokens_seen": 13295264, + "step": 6801 + }, + { + "epoch": 0.9015241882041087, + "grad_norm": 0.08044814318418503, + "learning_rate": 3.968199064025764e-06, + "loss": 0.0006, + "num_input_tokens_seen": 13297120, + "step": 6802 + }, + { + "epoch": 0.9016567263088138, + "grad_norm": 10.395515441894531, + "learning_rate": 3.96791808244937e-06, + "loss": 0.3847, + "num_input_tokens_seen": 13299504, + "step": 6803 + }, + { + "epoch": 0.9017892644135189, + "grad_norm": 0.12965846061706543, + "learning_rate": 3.967637072570503e-06, + "loss": 0.0007, + "num_input_tokens_seen": 13300712, + "step": 6804 + }, + { + "epoch": 0.901921802518224, + "grad_norm": 10.181883811950684, + "learning_rate": 3.967356034394581e-06, + "loss": 0.1699, + "num_input_tokens_seen": 13302600, + "step": 6805 + }, + { + "epoch": 0.9020543406229291, + "grad_norm": 4.366451740264893, + "learning_rate": 3.9670749679270225e-06, + "loss": 0.0689, + "num_input_tokens_seen": 13304176, + "step": 6806 + }, + { + "epoch": 0.9021868787276341, + "grad_norm": 12.554516792297363, + "learning_rate": 3.966793873173246e-06, + "loss": 0.2085, + "num_input_tokens_seen": 13305696, + "step": 6807 + }, + { + "epoch": 0.9023194168323393, + "grad_norm": 0.027440842241048813, + "learning_rate": 3.966512750138673e-06, + "loss": 0.0002, + "num_input_tokens_seen": 13307344, + "step": 6808 + }, + { + "epoch": 0.9024519549370444, + "grad_norm": 37.869083404541016, + "learning_rate": 3.966231598828721e-06, + "loss": 0.5207, + "num_input_tokens_seen": 13308968, + "step": 6809 + }, + { + "epoch": 0.9025844930417495, + "grad_norm": 13.978860855102539, + "learning_rate": 3.965950419248813e-06, + "loss": 0.4221, + "num_input_tokens_seen": 13310144, + "step": 6810 + }, + { + "epoch": 0.9027170311464546, + "grad_norm": 15.915724754333496, + "learning_rate": 3.9656692114043695e-06, + "loss": 0.6148, + "num_input_tokens_seen": 13312136, + "step": 6811 + }, + { + "epoch": 0.9028495692511597, + "grad_norm": 14.286364555358887, + "learning_rate": 3.965387975300812e-06, + "loss": 0.4406, + "num_input_tokens_seen": 13314528, + "step": 6812 + }, + { + "epoch": 0.9029821073558648, + "grad_norm": 6.257908344268799, + "learning_rate": 3.965106710943564e-06, + "loss": 0.2177, + "num_input_tokens_seen": 13316440, + "step": 6813 + }, + { + "epoch": 0.90311464546057, + "grad_norm": 0.10165877640247345, + "learning_rate": 3.964825418338048e-06, + "loss": 0.0007, + "num_input_tokens_seen": 13318552, + "step": 6814 + }, + { + "epoch": 0.903247183565275, + "grad_norm": 14.913152694702148, + "learning_rate": 3.964544097489687e-06, + "loss": 0.3458, + "num_input_tokens_seen": 13319680, + "step": 6815 + }, + { + "epoch": 0.9033797216699802, + "grad_norm": 0.5530948638916016, + "learning_rate": 3.964262748403905e-06, + "loss": 0.0041, + "num_input_tokens_seen": 13321632, + "step": 6816 + }, + { + "epoch": 0.9035122597746852, + "grad_norm": 6.750959396362305, + "learning_rate": 3.963981371086127e-06, + "loss": 0.2452, + "num_input_tokens_seen": 13323264, + "step": 6817 + }, + { + "epoch": 0.9036447978793903, + "grad_norm": 9.251420021057129, + "learning_rate": 3.96369996554178e-06, + "loss": 0.2868, + "num_input_tokens_seen": 13325216, + "step": 6818 + }, + { + "epoch": 0.9037773359840954, + "grad_norm": 15.157255172729492, + "learning_rate": 3.963418531776287e-06, + "loss": 0.4721, + "num_input_tokens_seen": 13327184, + "step": 6819 + }, + { + "epoch": 0.9039098740888005, + "grad_norm": 5.238453388214111, + "learning_rate": 3.963137069795074e-06, + "loss": 0.1627, + "num_input_tokens_seen": 13328816, + "step": 6820 + }, + { + "epoch": 0.9040424121935057, + "grad_norm": 7.305315017700195, + "learning_rate": 3.962855579603571e-06, + "loss": 0.088, + "num_input_tokens_seen": 13331336, + "step": 6821 + }, + { + "epoch": 0.9041749502982107, + "grad_norm": 10.264121055603027, + "learning_rate": 3.962574061207202e-06, + "loss": 0.3591, + "num_input_tokens_seen": 13333424, + "step": 6822 + }, + { + "epoch": 0.9043074884029159, + "grad_norm": 6.215490341186523, + "learning_rate": 3.962292514611396e-06, + "loss": 0.0509, + "num_input_tokens_seen": 13335336, + "step": 6823 + }, + { + "epoch": 0.9044400265076209, + "grad_norm": 0.1585277020931244, + "learning_rate": 3.962010939821582e-06, + "loss": 0.0011, + "num_input_tokens_seen": 13336544, + "step": 6824 + }, + { + "epoch": 0.904572564612326, + "grad_norm": 9.907018661499023, + "learning_rate": 3.961729336843188e-06, + "loss": 0.3397, + "num_input_tokens_seen": 13338464, + "step": 6825 + }, + { + "epoch": 0.9047051027170312, + "grad_norm": 11.353303909301758, + "learning_rate": 3.961447705681644e-06, + "loss": 0.1889, + "num_input_tokens_seen": 13340464, + "step": 6826 + }, + { + "epoch": 0.9048376408217362, + "grad_norm": 2.151334285736084, + "learning_rate": 3.961166046342381e-06, + "loss": 0.0514, + "num_input_tokens_seen": 13342872, + "step": 6827 + }, + { + "epoch": 0.9049701789264414, + "grad_norm": 0.2887546122074127, + "learning_rate": 3.960884358830827e-06, + "loss": 0.0021, + "num_input_tokens_seen": 13344576, + "step": 6828 + }, + { + "epoch": 0.9051027170311464, + "grad_norm": 7.460299015045166, + "learning_rate": 3.960602643152416e-06, + "loss": 0.105, + "num_input_tokens_seen": 13346312, + "step": 6829 + }, + { + "epoch": 0.9052352551358516, + "grad_norm": 15.81497573852539, + "learning_rate": 3.960320899312578e-06, + "loss": 0.6401, + "num_input_tokens_seen": 13348784, + "step": 6830 + }, + { + "epoch": 0.9053677932405566, + "grad_norm": 0.3608487546443939, + "learning_rate": 3.960039127316746e-06, + "loss": 0.0028, + "num_input_tokens_seen": 13350256, + "step": 6831 + }, + { + "epoch": 0.9055003313452618, + "grad_norm": 8.746213912963867, + "learning_rate": 3.9597573271703515e-06, + "loss": 0.3341, + "num_input_tokens_seen": 13352056, + "step": 6832 + }, + { + "epoch": 0.9056328694499669, + "grad_norm": 1.0589901208877563, + "learning_rate": 3.9594754988788295e-06, + "loss": 0.008, + "num_input_tokens_seen": 13353848, + "step": 6833 + }, + { + "epoch": 0.9057654075546719, + "grad_norm": 0.43171942234039307, + "learning_rate": 3.959193642447613e-06, + "loss": 0.0032, + "num_input_tokens_seen": 13356168, + "step": 6834 + }, + { + "epoch": 0.9058979456593771, + "grad_norm": 6.129023551940918, + "learning_rate": 3.958911757882136e-06, + "loss": 0.0888, + "num_input_tokens_seen": 13358016, + "step": 6835 + }, + { + "epoch": 0.9060304837640821, + "grad_norm": 15.092604637145996, + "learning_rate": 3.958629845187835e-06, + "loss": 0.7162, + "num_input_tokens_seen": 13359400, + "step": 6836 + }, + { + "epoch": 0.9061630218687873, + "grad_norm": 18.96518898010254, + "learning_rate": 3.958347904370143e-06, + "loss": 0.5034, + "num_input_tokens_seen": 13361528, + "step": 6837 + }, + { + "epoch": 0.9062955599734924, + "grad_norm": 6.916331768035889, + "learning_rate": 3.9580659354344975e-06, + "loss": 0.3403, + "num_input_tokens_seen": 13363520, + "step": 6838 + }, + { + "epoch": 0.9064280980781975, + "grad_norm": 0.282146692276001, + "learning_rate": 3.9577839383863355e-06, + "loss": 0.002, + "num_input_tokens_seen": 13365520, + "step": 6839 + }, + { + "epoch": 0.9065606361829026, + "grad_norm": 1.6812944412231445, + "learning_rate": 3.957501913231093e-06, + "loss": 0.012, + "num_input_tokens_seen": 13367424, + "step": 6840 + }, + { + "epoch": 0.9066931742876077, + "grad_norm": 9.485698699951172, + "learning_rate": 3.957219859974208e-06, + "loss": 0.2512, + "num_input_tokens_seen": 13369344, + "step": 6841 + }, + { + "epoch": 0.9068257123923128, + "grad_norm": 14.154409408569336, + "learning_rate": 3.95693777862112e-06, + "loss": 0.6911, + "num_input_tokens_seen": 13372200, + "step": 6842 + }, + { + "epoch": 0.9069582504970178, + "grad_norm": 12.965131759643555, + "learning_rate": 3.956655669177265e-06, + "loss": 0.611, + "num_input_tokens_seen": 13374192, + "step": 6843 + }, + { + "epoch": 0.907090788601723, + "grad_norm": 0.11968349665403366, + "learning_rate": 3.956373531648084e-06, + "loss": 0.0009, + "num_input_tokens_seen": 13376112, + "step": 6844 + }, + { + "epoch": 0.9072233267064281, + "grad_norm": 12.837465286254883, + "learning_rate": 3.956091366039018e-06, + "loss": 0.5493, + "num_input_tokens_seen": 13378632, + "step": 6845 + }, + { + "epoch": 0.9073558648111332, + "grad_norm": 6.205852508544922, + "learning_rate": 3.955809172355505e-06, + "loss": 0.1054, + "num_input_tokens_seen": 13382096, + "step": 6846 + }, + { + "epoch": 0.9074884029158383, + "grad_norm": 2.123368978500366, + "learning_rate": 3.955526950602987e-06, + "loss": 0.0392, + "num_input_tokens_seen": 13383552, + "step": 6847 + }, + { + "epoch": 0.9076209410205434, + "grad_norm": 0.06099429726600647, + "learning_rate": 3.955244700786904e-06, + "loss": 0.0005, + "num_input_tokens_seen": 13385584, + "step": 6848 + }, + { + "epoch": 0.9077534791252485, + "grad_norm": 4.104151248931885, + "learning_rate": 3.954962422912701e-06, + "loss": 0.1152, + "num_input_tokens_seen": 13388920, + "step": 6849 + }, + { + "epoch": 0.9078860172299537, + "grad_norm": 3.530329704284668, + "learning_rate": 3.954680116985818e-06, + "loss": 0.0984, + "num_input_tokens_seen": 13391056, + "step": 6850 + }, + { + "epoch": 0.9080185553346587, + "grad_norm": 0.13000766932964325, + "learning_rate": 3.954397783011699e-06, + "loss": 0.001, + "num_input_tokens_seen": 13393544, + "step": 6851 + }, + { + "epoch": 0.9081510934393638, + "grad_norm": 14.69619369506836, + "learning_rate": 3.954115420995788e-06, + "loss": 0.2713, + "num_input_tokens_seen": 13395320, + "step": 6852 + }, + { + "epoch": 0.9082836315440689, + "grad_norm": 2.2810709476470947, + "learning_rate": 3.953833030943528e-06, + "loss": 0.0845, + "num_input_tokens_seen": 13397320, + "step": 6853 + }, + { + "epoch": 0.908416169648774, + "grad_norm": 16.55765724182129, + "learning_rate": 3.953550612860364e-06, + "loss": 0.461, + "num_input_tokens_seen": 13398824, + "step": 6854 + }, + { + "epoch": 0.9085487077534792, + "grad_norm": 8.590668678283691, + "learning_rate": 3.953268166751742e-06, + "loss": 0.1012, + "num_input_tokens_seen": 13401296, + "step": 6855 + }, + { + "epoch": 0.9086812458581842, + "grad_norm": 4.806265830993652, + "learning_rate": 3.952985692623106e-06, + "loss": 0.0903, + "num_input_tokens_seen": 13402968, + "step": 6856 + }, + { + "epoch": 0.9088137839628894, + "grad_norm": 20.488168716430664, + "learning_rate": 3.9527031904799045e-06, + "loss": 0.6054, + "num_input_tokens_seen": 13404984, + "step": 6857 + }, + { + "epoch": 0.9089463220675944, + "grad_norm": 0.251567542552948, + "learning_rate": 3.952420660327583e-06, + "loss": 0.0019, + "num_input_tokens_seen": 13406424, + "step": 6858 + }, + { + "epoch": 0.9090788601722996, + "grad_norm": 0.10690252482891083, + "learning_rate": 3.9521381021715896e-06, + "loss": 0.0009, + "num_input_tokens_seen": 13409064, + "step": 6859 + }, + { + "epoch": 0.9092113982770046, + "grad_norm": 0.112008236348629, + "learning_rate": 3.951855516017371e-06, + "loss": 0.0008, + "num_input_tokens_seen": 13410960, + "step": 6860 + }, + { + "epoch": 0.9093439363817097, + "grad_norm": 0.07194206863641739, + "learning_rate": 3.9515729018703774e-06, + "loss": 0.0005, + "num_input_tokens_seen": 13411992, + "step": 6861 + }, + { + "epoch": 0.9094764744864149, + "grad_norm": 0.7300190329551697, + "learning_rate": 3.951290259736056e-06, + "loss": 0.0053, + "num_input_tokens_seen": 13414384, + "step": 6862 + }, + { + "epoch": 0.9096090125911199, + "grad_norm": 7.1128764152526855, + "learning_rate": 3.951007589619858e-06, + "loss": 0.1488, + "num_input_tokens_seen": 13417416, + "step": 6863 + }, + { + "epoch": 0.9097415506958251, + "grad_norm": 15.463702201843262, + "learning_rate": 3.950724891527232e-06, + "loss": 0.3597, + "num_input_tokens_seen": 13419376, + "step": 6864 + }, + { + "epoch": 0.9098740888005301, + "grad_norm": 1.5325649976730347, + "learning_rate": 3.95044216546363e-06, + "loss": 0.0094, + "num_input_tokens_seen": 13421984, + "step": 6865 + }, + { + "epoch": 0.9100066269052353, + "grad_norm": 2.666563034057617, + "learning_rate": 3.950159411434502e-06, + "loss": 0.1268, + "num_input_tokens_seen": 13423568, + "step": 6866 + }, + { + "epoch": 0.9101391650099404, + "grad_norm": 0.20841571688652039, + "learning_rate": 3.9498766294452995e-06, + "loss": 0.0014, + "num_input_tokens_seen": 13425472, + "step": 6867 + }, + { + "epoch": 0.9102717031146454, + "grad_norm": 11.133683204650879, + "learning_rate": 3.9495938195014765e-06, + "loss": 0.5669, + "num_input_tokens_seen": 13427712, + "step": 6868 + }, + { + "epoch": 0.9104042412193506, + "grad_norm": 6.95251989364624, + "learning_rate": 3.949310981608484e-06, + "loss": 0.1955, + "num_input_tokens_seen": 13430048, + "step": 6869 + }, + { + "epoch": 0.9105367793240556, + "grad_norm": 12.892457008361816, + "learning_rate": 3.949028115771776e-06, + "loss": 0.161, + "num_input_tokens_seen": 13431464, + "step": 6870 + }, + { + "epoch": 0.9106693174287608, + "grad_norm": 10.470257759094238, + "learning_rate": 3.948745221996806e-06, + "loss": 0.3872, + "num_input_tokens_seen": 13433720, + "step": 6871 + }, + { + "epoch": 0.9108018555334658, + "grad_norm": 0.09639782458543777, + "learning_rate": 3.948462300289029e-06, + "loss": 0.0007, + "num_input_tokens_seen": 13435560, + "step": 6872 + }, + { + "epoch": 0.910934393638171, + "grad_norm": 8.420403480529785, + "learning_rate": 3.9481793506539e-06, + "loss": 0.226, + "num_input_tokens_seen": 13437848, + "step": 6873 + }, + { + "epoch": 0.9110669317428761, + "grad_norm": 0.056494735181331635, + "learning_rate": 3.947896373096874e-06, + "loss": 0.0004, + "num_input_tokens_seen": 13439136, + "step": 6874 + }, + { + "epoch": 0.9111994698475812, + "grad_norm": 10.696561813354492, + "learning_rate": 3.947613367623407e-06, + "loss": 0.1749, + "num_input_tokens_seen": 13440920, + "step": 6875 + }, + { + "epoch": 0.9113320079522863, + "grad_norm": 10.43608283996582, + "learning_rate": 3.947330334238955e-06, + "loss": 0.2885, + "num_input_tokens_seen": 13443768, + "step": 6876 + }, + { + "epoch": 0.9114645460569913, + "grad_norm": 3.8761146068573, + "learning_rate": 3.947047272948976e-06, + "loss": 0.1595, + "num_input_tokens_seen": 13445800, + "step": 6877 + }, + { + "epoch": 0.9115970841616965, + "grad_norm": 4.681922435760498, + "learning_rate": 3.946764183758927e-06, + "loss": 0.161, + "num_input_tokens_seen": 13447312, + "step": 6878 + }, + { + "epoch": 0.9117296222664016, + "grad_norm": 0.1888013631105423, + "learning_rate": 3.946481066674267e-06, + "loss": 0.0013, + "num_input_tokens_seen": 13449192, + "step": 6879 + }, + { + "epoch": 0.9118621603711067, + "grad_norm": 12.019532203674316, + "learning_rate": 3.946197921700455e-06, + "loss": 0.1734, + "num_input_tokens_seen": 13451256, + "step": 6880 + }, + { + "epoch": 0.9119946984758118, + "grad_norm": 12.030641555786133, + "learning_rate": 3.945914748842948e-06, + "loss": 0.5239, + "num_input_tokens_seen": 13453288, + "step": 6881 + }, + { + "epoch": 0.9121272365805169, + "grad_norm": 20.368032455444336, + "learning_rate": 3.945631548107207e-06, + "loss": 0.5853, + "num_input_tokens_seen": 13455512, + "step": 6882 + }, + { + "epoch": 0.912259774685222, + "grad_norm": 10.199458122253418, + "learning_rate": 3.945348319498694e-06, + "loss": 0.1507, + "num_input_tokens_seen": 13457344, + "step": 6883 + }, + { + "epoch": 0.912392312789927, + "grad_norm": 5.051602840423584, + "learning_rate": 3.945065063022867e-06, + "loss": 0.0716, + "num_input_tokens_seen": 13459472, + "step": 6884 + }, + { + "epoch": 0.9125248508946322, + "grad_norm": 8.976778984069824, + "learning_rate": 3.944781778685189e-06, + "loss": 0.1547, + "num_input_tokens_seen": 13461280, + "step": 6885 + }, + { + "epoch": 0.9126573889993373, + "grad_norm": 1.6677252054214478, + "learning_rate": 3.944498466491122e-06, + "loss": 0.057, + "num_input_tokens_seen": 13462800, + "step": 6886 + }, + { + "epoch": 0.9127899271040424, + "grad_norm": 9.220848083496094, + "learning_rate": 3.9442151264461275e-06, + "loss": 0.1771, + "num_input_tokens_seen": 13464664, + "step": 6887 + }, + { + "epoch": 0.9129224652087475, + "grad_norm": 2.4703524112701416, + "learning_rate": 3.943931758555669e-06, + "loss": 0.032, + "num_input_tokens_seen": 13468016, + "step": 6888 + }, + { + "epoch": 0.9130550033134526, + "grad_norm": 0.13127319514751434, + "learning_rate": 3.9436483628252105e-06, + "loss": 0.001, + "num_input_tokens_seen": 13470072, + "step": 6889 + }, + { + "epoch": 0.9131875414181577, + "grad_norm": 6.142790794372559, + "learning_rate": 3.9433649392602155e-06, + "loss": 0.0951, + "num_input_tokens_seen": 13471848, + "step": 6890 + }, + { + "epoch": 0.9133200795228629, + "grad_norm": 0.3727642893791199, + "learning_rate": 3.9430814878661495e-06, + "loss": 0.0027, + "num_input_tokens_seen": 13473456, + "step": 6891 + }, + { + "epoch": 0.9134526176275679, + "grad_norm": 10.982667922973633, + "learning_rate": 3.942798008648475e-06, + "loss": 0.2959, + "num_input_tokens_seen": 13475152, + "step": 6892 + }, + { + "epoch": 0.9135851557322731, + "grad_norm": 5.460580825805664, + "learning_rate": 3.9425145016126606e-06, + "loss": 0.1934, + "num_input_tokens_seen": 13476824, + "step": 6893 + }, + { + "epoch": 0.9137176938369781, + "grad_norm": 12.394134521484375, + "learning_rate": 3.942230966764172e-06, + "loss": 0.4479, + "num_input_tokens_seen": 13479104, + "step": 6894 + }, + { + "epoch": 0.9138502319416832, + "grad_norm": 11.230950355529785, + "learning_rate": 3.941947404108475e-06, + "loss": 0.1138, + "num_input_tokens_seen": 13480784, + "step": 6895 + }, + { + "epoch": 0.9139827700463883, + "grad_norm": 0.35740038752555847, + "learning_rate": 3.941663813651038e-06, + "loss": 0.0026, + "num_input_tokens_seen": 13482640, + "step": 6896 + }, + { + "epoch": 0.9141153081510934, + "grad_norm": 0.2584132254123688, + "learning_rate": 3.9413801953973266e-06, + "loss": 0.0019, + "num_input_tokens_seen": 13484592, + "step": 6897 + }, + { + "epoch": 0.9142478462557986, + "grad_norm": 0.9678654074668884, + "learning_rate": 3.941096549352812e-06, + "loss": 0.0057, + "num_input_tokens_seen": 13486248, + "step": 6898 + }, + { + "epoch": 0.9143803843605036, + "grad_norm": 10.76855182647705, + "learning_rate": 3.940812875522961e-06, + "loss": 0.2149, + "num_input_tokens_seen": 13488088, + "step": 6899 + }, + { + "epoch": 0.9145129224652088, + "grad_norm": 4.404655933380127, + "learning_rate": 3.9405291739132435e-06, + "loss": 0.0673, + "num_input_tokens_seen": 13489968, + "step": 6900 + }, + { + "epoch": 0.9146454605699138, + "grad_norm": 0.7537723183631897, + "learning_rate": 3.9402454445291306e-06, + "loss": 0.0051, + "num_input_tokens_seen": 13491792, + "step": 6901 + }, + { + "epoch": 0.914777998674619, + "grad_norm": 7.359374046325684, + "learning_rate": 3.939961687376091e-06, + "loss": 0.0968, + "num_input_tokens_seen": 13493552, + "step": 6902 + }, + { + "epoch": 0.9149105367793241, + "grad_norm": 0.1832287460565567, + "learning_rate": 3.939677902459598e-06, + "loss": 0.0013, + "num_input_tokens_seen": 13495160, + "step": 6903 + }, + { + "epoch": 0.9150430748840291, + "grad_norm": 0.6329948902130127, + "learning_rate": 3.939394089785122e-06, + "loss": 0.0039, + "num_input_tokens_seen": 13496832, + "step": 6904 + }, + { + "epoch": 0.9151756129887343, + "grad_norm": 5.641445636749268, + "learning_rate": 3.9391102493581335e-06, + "loss": 0.0378, + "num_input_tokens_seen": 13499488, + "step": 6905 + }, + { + "epoch": 0.9153081510934393, + "grad_norm": 10.307908058166504, + "learning_rate": 3.938826381184107e-06, + "loss": 0.3068, + "num_input_tokens_seen": 13501912, + "step": 6906 + }, + { + "epoch": 0.9154406891981445, + "grad_norm": 12.66156005859375, + "learning_rate": 3.938542485268516e-06, + "loss": 0.342, + "num_input_tokens_seen": 13503888, + "step": 6907 + }, + { + "epoch": 0.9155732273028496, + "grad_norm": 3.969407320022583, + "learning_rate": 3.938258561616832e-06, + "loss": 0.1588, + "num_input_tokens_seen": 13506288, + "step": 6908 + }, + { + "epoch": 0.9157057654075547, + "grad_norm": 6.907105445861816, + "learning_rate": 3.937974610234533e-06, + "loss": 0.1965, + "num_input_tokens_seen": 13508448, + "step": 6909 + }, + { + "epoch": 0.9158383035122598, + "grad_norm": 0.08150976896286011, + "learning_rate": 3.93769063112709e-06, + "loss": 0.0006, + "num_input_tokens_seen": 13510184, + "step": 6910 + }, + { + "epoch": 0.9159708416169648, + "grad_norm": 0.11603689938783646, + "learning_rate": 3.93740662429998e-06, + "loss": 0.0008, + "num_input_tokens_seen": 13512536, + "step": 6911 + }, + { + "epoch": 0.91610337972167, + "grad_norm": 0.5147866010665894, + "learning_rate": 3.937122589758679e-06, + "loss": 0.006, + "num_input_tokens_seen": 13514928, + "step": 6912 + }, + { + "epoch": 0.916235917826375, + "grad_norm": 6.076064586639404, + "learning_rate": 3.936838527508662e-06, + "loss": 0.264, + "num_input_tokens_seen": 13516784, + "step": 6913 + }, + { + "epoch": 0.9163684559310802, + "grad_norm": 11.166181564331055, + "learning_rate": 3.9365544375554075e-06, + "loss": 0.2326, + "num_input_tokens_seen": 13519640, + "step": 6914 + }, + { + "epoch": 0.9165009940357853, + "grad_norm": 12.674071311950684, + "learning_rate": 3.936270319904393e-06, + "loss": 0.3049, + "num_input_tokens_seen": 13521488, + "step": 6915 + }, + { + "epoch": 0.9166335321404904, + "grad_norm": 0.14784374833106995, + "learning_rate": 3.935986174561096e-06, + "loss": 0.0011, + "num_input_tokens_seen": 13523072, + "step": 6916 + }, + { + "epoch": 0.9167660702451955, + "grad_norm": 4.376709461212158, + "learning_rate": 3.935702001530994e-06, + "loss": 0.1143, + "num_input_tokens_seen": 13525624, + "step": 6917 + }, + { + "epoch": 0.9168986083499006, + "grad_norm": 0.15594542026519775, + "learning_rate": 3.935417800819568e-06, + "loss": 0.001, + "num_input_tokens_seen": 13527112, + "step": 6918 + }, + { + "epoch": 0.9170311464546057, + "grad_norm": 10.183873176574707, + "learning_rate": 3.935133572432296e-06, + "loss": 0.2673, + "num_input_tokens_seen": 13528296, + "step": 6919 + }, + { + "epoch": 0.9171636845593109, + "grad_norm": 8.469533920288086, + "learning_rate": 3.9348493163746585e-06, + "loss": 0.152, + "num_input_tokens_seen": 13529712, + "step": 6920 + }, + { + "epoch": 0.9172962226640159, + "grad_norm": 8.414627075195312, + "learning_rate": 3.934565032652136e-06, + "loss": 0.1055, + "num_input_tokens_seen": 13531408, + "step": 6921 + }, + { + "epoch": 0.917428760768721, + "grad_norm": 0.11790445446968079, + "learning_rate": 3.934280721270211e-06, + "loss": 0.0008, + "num_input_tokens_seen": 13533120, + "step": 6922 + }, + { + "epoch": 0.9175612988734261, + "grad_norm": 0.12045390158891678, + "learning_rate": 3.933996382234364e-06, + "loss": 0.0009, + "num_input_tokens_seen": 13534584, + "step": 6923 + }, + { + "epoch": 0.9176938369781312, + "grad_norm": 0.4059077203273773, + "learning_rate": 3.933712015550077e-06, + "loss": 0.0024, + "num_input_tokens_seen": 13536184, + "step": 6924 + }, + { + "epoch": 0.9178263750828363, + "grad_norm": 0.11318500339984894, + "learning_rate": 3.933427621222834e-06, + "loss": 0.0006, + "num_input_tokens_seen": 13538376, + "step": 6925 + }, + { + "epoch": 0.9179589131875414, + "grad_norm": 5.965323448181152, + "learning_rate": 3.9331431992581165e-06, + "loss": 0.1785, + "num_input_tokens_seen": 13539856, + "step": 6926 + }, + { + "epoch": 0.9180914512922466, + "grad_norm": 4.110706329345703, + "learning_rate": 3.93285874966141e-06, + "loss": 0.1119, + "num_input_tokens_seen": 13541640, + "step": 6927 + }, + { + "epoch": 0.9182239893969516, + "grad_norm": 8.155120849609375, + "learning_rate": 3.932574272438197e-06, + "loss": 0.2811, + "num_input_tokens_seen": 13543800, + "step": 6928 + }, + { + "epoch": 0.9183565275016567, + "grad_norm": 7.560337543487549, + "learning_rate": 3.932289767593966e-06, + "loss": 0.3122, + "num_input_tokens_seen": 13546032, + "step": 6929 + }, + { + "epoch": 0.9184890656063618, + "grad_norm": 10.150988578796387, + "learning_rate": 3.9320052351342e-06, + "loss": 0.1944, + "num_input_tokens_seen": 13548608, + "step": 6930 + }, + { + "epoch": 0.9186216037110669, + "grad_norm": 0.23262743651866913, + "learning_rate": 3.931720675064384e-06, + "loss": 0.0016, + "num_input_tokens_seen": 13550128, + "step": 6931 + }, + { + "epoch": 0.9187541418157721, + "grad_norm": 0.06611327826976776, + "learning_rate": 3.931436087390006e-06, + "loss": 0.0004, + "num_input_tokens_seen": 13551856, + "step": 6932 + }, + { + "epoch": 0.9188866799204771, + "grad_norm": 5.257100582122803, + "learning_rate": 3.931151472116553e-06, + "loss": 0.0564, + "num_input_tokens_seen": 13553816, + "step": 6933 + }, + { + "epoch": 0.9190192180251823, + "grad_norm": 0.3416479825973511, + "learning_rate": 3.930866829249512e-06, + "loss": 0.0018, + "num_input_tokens_seen": 13555840, + "step": 6934 + }, + { + "epoch": 0.9191517561298873, + "grad_norm": 0.2147931158542633, + "learning_rate": 3.930582158794373e-06, + "loss": 0.0015, + "num_input_tokens_seen": 13557608, + "step": 6935 + }, + { + "epoch": 0.9192842942345925, + "grad_norm": 3.223468542098999, + "learning_rate": 3.930297460756621e-06, + "loss": 0.0465, + "num_input_tokens_seen": 13559480, + "step": 6936 + }, + { + "epoch": 0.9194168323392975, + "grad_norm": 8.972900390625, + "learning_rate": 3.930012735141748e-06, + "loss": 0.2574, + "num_input_tokens_seen": 13561544, + "step": 6937 + }, + { + "epoch": 0.9195493704440026, + "grad_norm": 8.675262451171875, + "learning_rate": 3.929727981955243e-06, + "loss": 0.1748, + "num_input_tokens_seen": 13563528, + "step": 6938 + }, + { + "epoch": 0.9196819085487078, + "grad_norm": 16.675857543945312, + "learning_rate": 3.929443201202596e-06, + "loss": 0.5545, + "num_input_tokens_seen": 13565784, + "step": 6939 + }, + { + "epoch": 0.9198144466534128, + "grad_norm": 11.325575828552246, + "learning_rate": 3.9291583928892985e-06, + "loss": 0.2299, + "num_input_tokens_seen": 13567680, + "step": 6940 + }, + { + "epoch": 0.919946984758118, + "grad_norm": 17.588489532470703, + "learning_rate": 3.92887355702084e-06, + "loss": 0.4056, + "num_input_tokens_seen": 13570112, + "step": 6941 + }, + { + "epoch": 0.920079522862823, + "grad_norm": 9.731961250305176, + "learning_rate": 3.928588693602715e-06, + "loss": 0.2847, + "num_input_tokens_seen": 13572832, + "step": 6942 + }, + { + "epoch": 0.9202120609675282, + "grad_norm": 5.086437225341797, + "learning_rate": 3.928303802640414e-06, + "loss": 0.1085, + "num_input_tokens_seen": 13575472, + "step": 6943 + }, + { + "epoch": 0.9203445990722333, + "grad_norm": 0.1162663921713829, + "learning_rate": 3.92801888413943e-06, + "loss": 0.0008, + "num_input_tokens_seen": 13577936, + "step": 6944 + }, + { + "epoch": 0.9204771371769384, + "grad_norm": 2.924497127532959, + "learning_rate": 3.927733938105257e-06, + "loss": 0.0496, + "num_input_tokens_seen": 13579352, + "step": 6945 + }, + { + "epoch": 0.9206096752816435, + "grad_norm": 5.916689872741699, + "learning_rate": 3.927448964543389e-06, + "loss": 0.2416, + "num_input_tokens_seen": 13581160, + "step": 6946 + }, + { + "epoch": 0.9207422133863485, + "grad_norm": 9.098535537719727, + "learning_rate": 3.92716396345932e-06, + "loss": 0.3417, + "num_input_tokens_seen": 13583592, + "step": 6947 + }, + { + "epoch": 0.9208747514910537, + "grad_norm": 11.006823539733887, + "learning_rate": 3.9268789348585455e-06, + "loss": 0.326, + "num_input_tokens_seen": 13585384, + "step": 6948 + }, + { + "epoch": 0.9210072895957587, + "grad_norm": 7.09971284866333, + "learning_rate": 3.9265938787465594e-06, + "loss": 0.1841, + "num_input_tokens_seen": 13587448, + "step": 6949 + }, + { + "epoch": 0.9211398277004639, + "grad_norm": 7.298976898193359, + "learning_rate": 3.926308795128861e-06, + "loss": 0.1785, + "num_input_tokens_seen": 13589576, + "step": 6950 + }, + { + "epoch": 0.921272365805169, + "grad_norm": 0.08833196759223938, + "learning_rate": 3.926023684010945e-06, + "loss": 0.0005, + "num_input_tokens_seen": 13590944, + "step": 6951 + }, + { + "epoch": 0.9214049039098741, + "grad_norm": 0.13017013669013977, + "learning_rate": 3.925738545398307e-06, + "loss": 0.0009, + "num_input_tokens_seen": 13593672, + "step": 6952 + }, + { + "epoch": 0.9215374420145792, + "grad_norm": 0.034440673887729645, + "learning_rate": 3.9254533792964475e-06, + "loss": 0.0002, + "num_input_tokens_seen": 13595304, + "step": 6953 + }, + { + "epoch": 0.9216699801192842, + "grad_norm": 0.629877507686615, + "learning_rate": 3.9251681857108625e-06, + "loss": 0.0037, + "num_input_tokens_seen": 13597096, + "step": 6954 + }, + { + "epoch": 0.9218025182239894, + "grad_norm": 6.357543468475342, + "learning_rate": 3.924882964647053e-06, + "loss": 0.1189, + "num_input_tokens_seen": 13598896, + "step": 6955 + }, + { + "epoch": 0.9219350563286945, + "grad_norm": 9.728635787963867, + "learning_rate": 3.924597716110516e-06, + "loss": 0.2992, + "num_input_tokens_seen": 13600880, + "step": 6956 + }, + { + "epoch": 0.9220675944333996, + "grad_norm": 1.617615818977356, + "learning_rate": 3.9243124401067525e-06, + "loss": 0.0066, + "num_input_tokens_seen": 13603784, + "step": 6957 + }, + { + "epoch": 0.9222001325381047, + "grad_norm": 10.881089210510254, + "learning_rate": 3.924027136641263e-06, + "loss": 0.2172, + "num_input_tokens_seen": 13605120, + "step": 6958 + }, + { + "epoch": 0.9223326706428098, + "grad_norm": 0.1734408736228943, + "learning_rate": 3.923741805719547e-06, + "loss": 0.0011, + "num_input_tokens_seen": 13607344, + "step": 6959 + }, + { + "epoch": 0.9224652087475149, + "grad_norm": 0.1797647774219513, + "learning_rate": 3.923456447347108e-06, + "loss": 0.0012, + "num_input_tokens_seen": 13609496, + "step": 6960 + }, + { + "epoch": 0.9225977468522201, + "grad_norm": 15.159379959106445, + "learning_rate": 3.923171061529446e-06, + "loss": 0.3704, + "num_input_tokens_seen": 13611760, + "step": 6961 + }, + { + "epoch": 0.9227302849569251, + "grad_norm": 10.4498291015625, + "learning_rate": 3.922885648272064e-06, + "loss": 0.4682, + "num_input_tokens_seen": 13614248, + "step": 6962 + }, + { + "epoch": 0.9228628230616303, + "grad_norm": 7.802560806274414, + "learning_rate": 3.922600207580466e-06, + "loss": 0.337, + "num_input_tokens_seen": 13616576, + "step": 6963 + }, + { + "epoch": 0.9229953611663353, + "grad_norm": 7.963098049163818, + "learning_rate": 3.922314739460153e-06, + "loss": 0.1244, + "num_input_tokens_seen": 13619224, + "step": 6964 + }, + { + "epoch": 0.9231278992710404, + "grad_norm": 0.27201947569847107, + "learning_rate": 3.922029243916632e-06, + "loss": 0.0017, + "num_input_tokens_seen": 13620696, + "step": 6965 + }, + { + "epoch": 0.9232604373757455, + "grad_norm": 3.4067540168762207, + "learning_rate": 3.921743720955405e-06, + "loss": 0.1671, + "num_input_tokens_seen": 13622952, + "step": 6966 + }, + { + "epoch": 0.9233929754804506, + "grad_norm": 0.026021990925073624, + "learning_rate": 3.9214581705819796e-06, + "loss": 0.0002, + "num_input_tokens_seen": 13624272, + "step": 6967 + }, + { + "epoch": 0.9235255135851558, + "grad_norm": 0.09226591140031815, + "learning_rate": 3.921172592801859e-06, + "loss": 0.0005, + "num_input_tokens_seen": 13626216, + "step": 6968 + }, + { + "epoch": 0.9236580516898608, + "grad_norm": 8.331116676330566, + "learning_rate": 3.920886987620551e-06, + "loss": 0.1847, + "num_input_tokens_seen": 13628296, + "step": 6969 + }, + { + "epoch": 0.923790589794566, + "grad_norm": 0.6970057487487793, + "learning_rate": 3.920601355043561e-06, + "loss": 0.0087, + "num_input_tokens_seen": 13630360, + "step": 6970 + }, + { + "epoch": 0.923923127899271, + "grad_norm": 10.433618545532227, + "learning_rate": 3.9203156950763976e-06, + "loss": 0.1811, + "num_input_tokens_seen": 13632560, + "step": 6971 + }, + { + "epoch": 0.9240556660039762, + "grad_norm": 8.163779258728027, + "learning_rate": 3.920030007724568e-06, + "loss": 0.0472, + "num_input_tokens_seen": 13634144, + "step": 6972 + }, + { + "epoch": 0.9241882041086813, + "grad_norm": 6.9013671875, + "learning_rate": 3.91974429299358e-06, + "loss": 0.1587, + "num_input_tokens_seen": 13636912, + "step": 6973 + }, + { + "epoch": 0.9243207422133863, + "grad_norm": 0.06709427386522293, + "learning_rate": 3.919458550888942e-06, + "loss": 0.0005, + "num_input_tokens_seen": 13638904, + "step": 6974 + }, + { + "epoch": 0.9244532803180915, + "grad_norm": 0.08862776309251785, + "learning_rate": 3.919172781416164e-06, + "loss": 0.0006, + "num_input_tokens_seen": 13641288, + "step": 6975 + }, + { + "epoch": 0.9245858184227965, + "grad_norm": 6.023418426513672, + "learning_rate": 3.918886984580756e-06, + "loss": 0.3011, + "num_input_tokens_seen": 13643320, + "step": 6976 + }, + { + "epoch": 0.9247183565275017, + "grad_norm": 9.24990463256836, + "learning_rate": 3.918601160388228e-06, + "loss": 0.092, + "num_input_tokens_seen": 13645392, + "step": 6977 + }, + { + "epoch": 0.9248508946322067, + "grad_norm": 13.778376579284668, + "learning_rate": 3.9183153088440915e-06, + "loss": 0.4526, + "num_input_tokens_seen": 13647064, + "step": 6978 + }, + { + "epoch": 0.9249834327369119, + "grad_norm": 7.930911540985107, + "learning_rate": 3.918029429953857e-06, + "loss": 0.4003, + "num_input_tokens_seen": 13648720, + "step": 6979 + }, + { + "epoch": 0.925115970841617, + "grad_norm": 0.056612592190504074, + "learning_rate": 3.917743523723037e-06, + "loss": 0.0004, + "num_input_tokens_seen": 13650040, + "step": 6980 + }, + { + "epoch": 0.925248508946322, + "grad_norm": 8.551873207092285, + "learning_rate": 3.917457590157143e-06, + "loss": 0.3409, + "num_input_tokens_seen": 13651656, + "step": 6981 + }, + { + "epoch": 0.9253810470510272, + "grad_norm": 8.11879825592041, + "learning_rate": 3.91717162926169e-06, + "loss": 0.2561, + "num_input_tokens_seen": 13654056, + "step": 6982 + }, + { + "epoch": 0.9255135851557322, + "grad_norm": 2.4395313262939453, + "learning_rate": 3.916885641042189e-06, + "loss": 0.0089, + "num_input_tokens_seen": 13657416, + "step": 6983 + }, + { + "epoch": 0.9256461232604374, + "grad_norm": 11.213218688964844, + "learning_rate": 3.916599625504156e-06, + "loss": 0.4637, + "num_input_tokens_seen": 13659624, + "step": 6984 + }, + { + "epoch": 0.9257786613651425, + "grad_norm": 0.7120232582092285, + "learning_rate": 3.916313582653105e-06, + "loss": 0.0042, + "num_input_tokens_seen": 13661208, + "step": 6985 + }, + { + "epoch": 0.9259111994698476, + "grad_norm": 4.323545455932617, + "learning_rate": 3.916027512494551e-06, + "loss": 0.1995, + "num_input_tokens_seen": 13662568, + "step": 6986 + }, + { + "epoch": 0.9260437375745527, + "grad_norm": 6.108663558959961, + "learning_rate": 3.91574141503401e-06, + "loss": 0.2203, + "num_input_tokens_seen": 13664360, + "step": 6987 + }, + { + "epoch": 0.9261762756792578, + "grad_norm": 9.732032775878906, + "learning_rate": 3.915455290276998e-06, + "loss": 0.4196, + "num_input_tokens_seen": 13666680, + "step": 6988 + }, + { + "epoch": 0.9263088137839629, + "grad_norm": 9.950480461120605, + "learning_rate": 3.915169138229032e-06, + "loss": 0.3896, + "num_input_tokens_seen": 13668816, + "step": 6989 + }, + { + "epoch": 0.9264413518886679, + "grad_norm": 0.8857958316802979, + "learning_rate": 3.914882958895627e-06, + "loss": 0.0054, + "num_input_tokens_seen": 13670576, + "step": 6990 + }, + { + "epoch": 0.9265738899933731, + "grad_norm": 4.835536003112793, + "learning_rate": 3.914596752282303e-06, + "loss": 0.2181, + "num_input_tokens_seen": 13672656, + "step": 6991 + }, + { + "epoch": 0.9267064280980782, + "grad_norm": 0.2453504353761673, + "learning_rate": 3.914310518394579e-06, + "loss": 0.0017, + "num_input_tokens_seen": 13674432, + "step": 6992 + }, + { + "epoch": 0.9268389662027833, + "grad_norm": 11.459491729736328, + "learning_rate": 3.914024257237972e-06, + "loss": 0.3185, + "num_input_tokens_seen": 13676408, + "step": 6993 + }, + { + "epoch": 0.9269715043074884, + "grad_norm": 2.2056283950805664, + "learning_rate": 3.913737968818001e-06, + "loss": 0.0123, + "num_input_tokens_seen": 13677792, + "step": 6994 + }, + { + "epoch": 0.9271040424121935, + "grad_norm": 4.765780925750732, + "learning_rate": 3.913451653140187e-06, + "loss": 0.1351, + "num_input_tokens_seen": 13680032, + "step": 6995 + }, + { + "epoch": 0.9272365805168986, + "grad_norm": 3.8953423500061035, + "learning_rate": 3.9131653102100504e-06, + "loss": 0.1622, + "num_input_tokens_seen": 13682776, + "step": 6996 + }, + { + "epoch": 0.9273691186216038, + "grad_norm": 7.251965045928955, + "learning_rate": 3.912878940033111e-06, + "loss": 0.168, + "num_input_tokens_seen": 13684512, + "step": 6997 + }, + { + "epoch": 0.9275016567263088, + "grad_norm": 9.148533821105957, + "learning_rate": 3.912592542614892e-06, + "loss": 0.1936, + "num_input_tokens_seen": 13686728, + "step": 6998 + }, + { + "epoch": 0.927634194831014, + "grad_norm": 0.0448852963745594, + "learning_rate": 3.912306117960914e-06, + "loss": 0.0003, + "num_input_tokens_seen": 13688800, + "step": 6999 + }, + { + "epoch": 0.927766732935719, + "grad_norm": 1.0870826244354248, + "learning_rate": 3.9120196660767e-06, + "loss": 0.0078, + "num_input_tokens_seen": 13690712, + "step": 7000 + }, + { + "epoch": 0.9278992710404241, + "grad_norm": 5.96731424331665, + "learning_rate": 3.911733186967772e-06, + "loss": 0.1266, + "num_input_tokens_seen": 13692216, + "step": 7001 + }, + { + "epoch": 0.9280318091451292, + "grad_norm": 7.900094985961914, + "learning_rate": 3.911446680639654e-06, + "loss": 0.4551, + "num_input_tokens_seen": 13694384, + "step": 7002 + }, + { + "epoch": 0.9281643472498343, + "grad_norm": 16.05607032775879, + "learning_rate": 3.911160147097871e-06, + "loss": 0.3751, + "num_input_tokens_seen": 13695944, + "step": 7003 + }, + { + "epoch": 0.9282968853545395, + "grad_norm": 8.916208267211914, + "learning_rate": 3.9108735863479465e-06, + "loss": 0.3866, + "num_input_tokens_seen": 13698784, + "step": 7004 + }, + { + "epoch": 0.9284294234592445, + "grad_norm": 12.39218521118164, + "learning_rate": 3.910586998395406e-06, + "loss": 0.5056, + "num_input_tokens_seen": 13700672, + "step": 7005 + }, + { + "epoch": 0.9285619615639497, + "grad_norm": 0.11652123928070068, + "learning_rate": 3.910300383245775e-06, + "loss": 0.0008, + "num_input_tokens_seen": 13702056, + "step": 7006 + }, + { + "epoch": 0.9286944996686547, + "grad_norm": 3.1840665340423584, + "learning_rate": 3.9100137409045805e-06, + "loss": 0.0698, + "num_input_tokens_seen": 13703640, + "step": 7007 + }, + { + "epoch": 0.9288270377733598, + "grad_norm": 14.369851112365723, + "learning_rate": 3.909727071377347e-06, + "loss": 0.4635, + "num_input_tokens_seen": 13706096, + "step": 7008 + }, + { + "epoch": 0.928959575878065, + "grad_norm": 2.8131906986236572, + "learning_rate": 3.9094403746696045e-06, + "loss": 0.0565, + "num_input_tokens_seen": 13707768, + "step": 7009 + }, + { + "epoch": 0.92909211398277, + "grad_norm": 4.054993629455566, + "learning_rate": 3.909153650786878e-06, + "loss": 0.1131, + "num_input_tokens_seen": 13710184, + "step": 7010 + }, + { + "epoch": 0.9292246520874752, + "grad_norm": 7.398223876953125, + "learning_rate": 3.908866899734697e-06, + "loss": 0.2667, + "num_input_tokens_seen": 13712560, + "step": 7011 + }, + { + "epoch": 0.9293571901921802, + "grad_norm": 3.997837543487549, + "learning_rate": 3.908580121518591e-06, + "loss": 0.0315, + "num_input_tokens_seen": 13714456, + "step": 7012 + }, + { + "epoch": 0.9294897282968854, + "grad_norm": 4.207622051239014, + "learning_rate": 3.908293316144089e-06, + "loss": 0.0598, + "num_input_tokens_seen": 13716440, + "step": 7013 + }, + { + "epoch": 0.9296222664015904, + "grad_norm": 0.13573262095451355, + "learning_rate": 3.908006483616719e-06, + "loss": 0.001, + "num_input_tokens_seen": 13717680, + "step": 7014 + }, + { + "epoch": 0.9297548045062956, + "grad_norm": 0.1665526032447815, + "learning_rate": 3.907719623942014e-06, + "loss": 0.0012, + "num_input_tokens_seen": 13718936, + "step": 7015 + }, + { + "epoch": 0.9298873426110007, + "grad_norm": 2.3353826999664307, + "learning_rate": 3.907432737125502e-06, + "loss": 0.0489, + "num_input_tokens_seen": 13720944, + "step": 7016 + }, + { + "epoch": 0.9300198807157057, + "grad_norm": 9.270376205444336, + "learning_rate": 3.907145823172717e-06, + "loss": 0.4772, + "num_input_tokens_seen": 13722992, + "step": 7017 + }, + { + "epoch": 0.9301524188204109, + "grad_norm": 6.651823997497559, + "learning_rate": 3.9068588820891896e-06, + "loss": 0.0963, + "num_input_tokens_seen": 13725520, + "step": 7018 + }, + { + "epoch": 0.9302849569251159, + "grad_norm": 4.772802829742432, + "learning_rate": 3.906571913880452e-06, + "loss": 0.1525, + "num_input_tokens_seen": 13727512, + "step": 7019 + }, + { + "epoch": 0.9304174950298211, + "grad_norm": 6.249307155609131, + "learning_rate": 3.9062849185520375e-06, + "loss": 0.0749, + "num_input_tokens_seen": 13729920, + "step": 7020 + }, + { + "epoch": 0.9305500331345262, + "grad_norm": 6.4778265953063965, + "learning_rate": 3.90599789610948e-06, + "loss": 0.205, + "num_input_tokens_seen": 13732936, + "step": 7021 + }, + { + "epoch": 0.9306825712392313, + "grad_norm": 7.849071025848389, + "learning_rate": 3.905710846558314e-06, + "loss": 0.111, + "num_input_tokens_seen": 13734232, + "step": 7022 + }, + { + "epoch": 0.9308151093439364, + "grad_norm": 6.195882797241211, + "learning_rate": 3.905423769904072e-06, + "loss": 0.1181, + "num_input_tokens_seen": 13736360, + "step": 7023 + }, + { + "epoch": 0.9309476474486414, + "grad_norm": 4.443562984466553, + "learning_rate": 3.90513666615229e-06, + "loss": 0.0656, + "num_input_tokens_seen": 13738496, + "step": 7024 + }, + { + "epoch": 0.9310801855533466, + "grad_norm": 0.2674237787723541, + "learning_rate": 3.904849535308504e-06, + "loss": 0.002, + "num_input_tokens_seen": 13739872, + "step": 7025 + }, + { + "epoch": 0.9312127236580517, + "grad_norm": 0.18563418090343475, + "learning_rate": 3.90456237737825e-06, + "loss": 0.0013, + "num_input_tokens_seen": 13740984, + "step": 7026 + }, + { + "epoch": 0.9313452617627568, + "grad_norm": 0.4591938853263855, + "learning_rate": 3.904275192367064e-06, + "loss": 0.003, + "num_input_tokens_seen": 13743832, + "step": 7027 + }, + { + "epoch": 0.9314777998674619, + "grad_norm": 8.741841316223145, + "learning_rate": 3.903987980280483e-06, + "loss": 0.1743, + "num_input_tokens_seen": 13746368, + "step": 7028 + }, + { + "epoch": 0.931610337972167, + "grad_norm": 7.380860328674316, + "learning_rate": 3.903700741124047e-06, + "loss": 0.4407, + "num_input_tokens_seen": 13748800, + "step": 7029 + }, + { + "epoch": 0.9317428760768721, + "grad_norm": 4.252569198608398, + "learning_rate": 3.903413474903291e-06, + "loss": 0.1617, + "num_input_tokens_seen": 13750712, + "step": 7030 + }, + { + "epoch": 0.9318754141815772, + "grad_norm": 0.3303375542163849, + "learning_rate": 3.903126181623755e-06, + "loss": 0.0024, + "num_input_tokens_seen": 13752424, + "step": 7031 + }, + { + "epoch": 0.9320079522862823, + "grad_norm": 0.12293339520692825, + "learning_rate": 3.902838861290979e-06, + "loss": 0.0009, + "num_input_tokens_seen": 13753944, + "step": 7032 + }, + { + "epoch": 0.9321404903909875, + "grad_norm": 3.9423863887786865, + "learning_rate": 3.902551513910502e-06, + "loss": 0.1561, + "num_input_tokens_seen": 13755752, + "step": 7033 + }, + { + "epoch": 0.9322730284956925, + "grad_norm": 0.35025668144226074, + "learning_rate": 3.902264139487863e-06, + "loss": 0.0024, + "num_input_tokens_seen": 13757216, + "step": 7034 + }, + { + "epoch": 0.9324055666003976, + "grad_norm": 12.962719917297363, + "learning_rate": 3.901976738028605e-06, + "loss": 0.2977, + "num_input_tokens_seen": 13759400, + "step": 7035 + }, + { + "epoch": 0.9325381047051027, + "grad_norm": 13.800041198730469, + "learning_rate": 3.9016893095382695e-06, + "loss": 0.3665, + "num_input_tokens_seen": 13760664, + "step": 7036 + }, + { + "epoch": 0.9326706428098078, + "grad_norm": 13.342743873596191, + "learning_rate": 3.901401854022395e-06, + "loss": 0.4706, + "num_input_tokens_seen": 13763200, + "step": 7037 + }, + { + "epoch": 0.932803180914513, + "grad_norm": 0.1290958970785141, + "learning_rate": 3.901114371486527e-06, + "loss": 0.0009, + "num_input_tokens_seen": 13765248, + "step": 7038 + }, + { + "epoch": 0.932935719019218, + "grad_norm": 17.582426071166992, + "learning_rate": 3.900826861936207e-06, + "loss": 0.5555, + "num_input_tokens_seen": 13767536, + "step": 7039 + }, + { + "epoch": 0.9330682571239232, + "grad_norm": 1.9207929372787476, + "learning_rate": 3.900539325376978e-06, + "loss": 0.028, + "num_input_tokens_seen": 13768904, + "step": 7040 + }, + { + "epoch": 0.9332007952286282, + "grad_norm": 10.582015037536621, + "learning_rate": 3.900251761814386e-06, + "loss": 0.2379, + "num_input_tokens_seen": 13769976, + "step": 7041 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 5.8943376541137695, + "learning_rate": 3.899964171253974e-06, + "loss": 0.1111, + "num_input_tokens_seen": 13772440, + "step": 7042 + }, + { + "epoch": 0.9334658714380384, + "grad_norm": 15.492154121398926, + "learning_rate": 3.899676553701286e-06, + "loss": 0.397, + "num_input_tokens_seen": 13774080, + "step": 7043 + }, + { + "epoch": 0.9335984095427435, + "grad_norm": 0.21643370389938354, + "learning_rate": 3.8993889091618695e-06, + "loss": 0.0015, + "num_input_tokens_seen": 13776240, + "step": 7044 + }, + { + "epoch": 0.9337309476474487, + "grad_norm": 13.893548011779785, + "learning_rate": 3.899101237641268e-06, + "loss": 0.4162, + "num_input_tokens_seen": 13777984, + "step": 7045 + }, + { + "epoch": 0.9338634857521537, + "grad_norm": 6.212873935699463, + "learning_rate": 3.898813539145031e-06, + "loss": 0.15, + "num_input_tokens_seen": 13779648, + "step": 7046 + }, + { + "epoch": 0.9339960238568589, + "grad_norm": 0.47066769003868103, + "learning_rate": 3.898525813678703e-06, + "loss": 0.0035, + "num_input_tokens_seen": 13781368, + "step": 7047 + }, + { + "epoch": 0.9341285619615639, + "grad_norm": 0.5407800078392029, + "learning_rate": 3.8982380612478335e-06, + "loss": 0.0039, + "num_input_tokens_seen": 13782520, + "step": 7048 + }, + { + "epoch": 0.9342611000662691, + "grad_norm": 0.5224636197090149, + "learning_rate": 3.897950281857969e-06, + "loss": 0.0035, + "num_input_tokens_seen": 13784160, + "step": 7049 + }, + { + "epoch": 0.9343936381709742, + "grad_norm": 0.786259651184082, + "learning_rate": 3.897662475514658e-06, + "loss": 0.0058, + "num_input_tokens_seen": 13785464, + "step": 7050 + }, + { + "epoch": 0.9345261762756792, + "grad_norm": 12.074559211730957, + "learning_rate": 3.897374642223452e-06, + "loss": 0.2928, + "num_input_tokens_seen": 13787080, + "step": 7051 + }, + { + "epoch": 0.9346587143803844, + "grad_norm": 7.477062225341797, + "learning_rate": 3.8970867819898975e-06, + "loss": 0.3001, + "num_input_tokens_seen": 13789264, + "step": 7052 + }, + { + "epoch": 0.9347912524850894, + "grad_norm": 6.711886405944824, + "learning_rate": 3.896798894819546e-06, + "loss": 0.053, + "num_input_tokens_seen": 13791128, + "step": 7053 + }, + { + "epoch": 0.9349237905897946, + "grad_norm": 13.31130599975586, + "learning_rate": 3.896510980717949e-06, + "loss": 0.3118, + "num_input_tokens_seen": 13793592, + "step": 7054 + }, + { + "epoch": 0.9350563286944996, + "grad_norm": 0.2863091230392456, + "learning_rate": 3.896223039690656e-06, + "loss": 0.002, + "num_input_tokens_seen": 13795264, + "step": 7055 + }, + { + "epoch": 0.9351888667992048, + "grad_norm": 0.13115935027599335, + "learning_rate": 3.89593507174322e-06, + "loss": 0.001, + "num_input_tokens_seen": 13797440, + "step": 7056 + }, + { + "epoch": 0.9353214049039099, + "grad_norm": 11.746980667114258, + "learning_rate": 3.895647076881193e-06, + "loss": 0.2542, + "num_input_tokens_seen": 13799016, + "step": 7057 + }, + { + "epoch": 0.935453943008615, + "grad_norm": 0.19306576251983643, + "learning_rate": 3.895359055110127e-06, + "loss": 0.0014, + "num_input_tokens_seen": 13800768, + "step": 7058 + }, + { + "epoch": 0.9355864811133201, + "grad_norm": 1.3046098947525024, + "learning_rate": 3.895071006435577e-06, + "loss": 0.0327, + "num_input_tokens_seen": 13801968, + "step": 7059 + }, + { + "epoch": 0.9357190192180251, + "grad_norm": 11.87197494506836, + "learning_rate": 3.8947829308630945e-06, + "loss": 0.3132, + "num_input_tokens_seen": 13804512, + "step": 7060 + }, + { + "epoch": 0.9358515573227303, + "grad_norm": 0.12728971242904663, + "learning_rate": 3.894494828398235e-06, + "loss": 0.0009, + "num_input_tokens_seen": 13806104, + "step": 7061 + }, + { + "epoch": 0.9359840954274354, + "grad_norm": 12.821535110473633, + "learning_rate": 3.8942066990465535e-06, + "loss": 0.2944, + "num_input_tokens_seen": 13808272, + "step": 7062 + }, + { + "epoch": 0.9361166335321405, + "grad_norm": 0.188517227768898, + "learning_rate": 3.893918542813605e-06, + "loss": 0.0012, + "num_input_tokens_seen": 13810824, + "step": 7063 + }, + { + "epoch": 0.9362491716368456, + "grad_norm": 18.274410247802734, + "learning_rate": 3.893630359704945e-06, + "loss": 0.5839, + "num_input_tokens_seen": 13813304, + "step": 7064 + }, + { + "epoch": 0.9363817097415507, + "grad_norm": 13.734169006347656, + "learning_rate": 3.89334214972613e-06, + "loss": 0.7593, + "num_input_tokens_seen": 13815264, + "step": 7065 + }, + { + "epoch": 0.9365142478462558, + "grad_norm": 1.042287826538086, + "learning_rate": 3.893053912882718e-06, + "loss": 0.0067, + "num_input_tokens_seen": 13817312, + "step": 7066 + }, + { + "epoch": 0.9366467859509608, + "grad_norm": 7.111483097076416, + "learning_rate": 3.892765649180265e-06, + "loss": 0.3247, + "num_input_tokens_seen": 13819280, + "step": 7067 + }, + { + "epoch": 0.936779324055666, + "grad_norm": 8.170369148254395, + "learning_rate": 3.89247735862433e-06, + "loss": 0.1589, + "num_input_tokens_seen": 13821488, + "step": 7068 + }, + { + "epoch": 0.9369118621603711, + "grad_norm": 10.619584083557129, + "learning_rate": 3.89218904122047e-06, + "loss": 0.2953, + "num_input_tokens_seen": 13823288, + "step": 7069 + }, + { + "epoch": 0.9370444002650762, + "grad_norm": 4.770723342895508, + "learning_rate": 3.891900696974245e-06, + "loss": 0.1214, + "num_input_tokens_seen": 13824808, + "step": 7070 + }, + { + "epoch": 0.9371769383697813, + "grad_norm": 18.4471435546875, + "learning_rate": 3.891612325891215e-06, + "loss": 0.8585, + "num_input_tokens_seen": 13828000, + "step": 7071 + }, + { + "epoch": 0.9373094764744864, + "grad_norm": 12.26531982421875, + "learning_rate": 3.89132392797694e-06, + "loss": 0.4587, + "num_input_tokens_seen": 13829896, + "step": 7072 + }, + { + "epoch": 0.9374420145791915, + "grad_norm": 0.14099828898906708, + "learning_rate": 3.891035503236978e-06, + "loss": 0.0009, + "num_input_tokens_seen": 13833472, + "step": 7073 + }, + { + "epoch": 0.9375745526838967, + "grad_norm": 10.49266529083252, + "learning_rate": 3.890747051676893e-06, + "loss": 0.3579, + "num_input_tokens_seen": 13835488, + "step": 7074 + }, + { + "epoch": 0.9377070907886017, + "grad_norm": 7.94146728515625, + "learning_rate": 3.890458573302246e-06, + "loss": 0.2631, + "num_input_tokens_seen": 13837600, + "step": 7075 + }, + { + "epoch": 0.9378396288933069, + "grad_norm": 5.770942211151123, + "learning_rate": 3.890170068118597e-06, + "loss": 0.042, + "num_input_tokens_seen": 13840240, + "step": 7076 + }, + { + "epoch": 0.9379721669980119, + "grad_norm": 3.6708078384399414, + "learning_rate": 3.889881536131511e-06, + "loss": 0.0825, + "num_input_tokens_seen": 13843984, + "step": 7077 + }, + { + "epoch": 0.938104705102717, + "grad_norm": 2.489748477935791, + "learning_rate": 3.88959297734655e-06, + "loss": 0.0811, + "num_input_tokens_seen": 13845864, + "step": 7078 + }, + { + "epoch": 0.9382372432074222, + "grad_norm": 11.491775512695312, + "learning_rate": 3.8893043917692776e-06, + "loss": 0.3878, + "num_input_tokens_seen": 13848720, + "step": 7079 + }, + { + "epoch": 0.9383697813121272, + "grad_norm": 9.523656845092773, + "learning_rate": 3.889015779405259e-06, + "loss": 0.3115, + "num_input_tokens_seen": 13850968, + "step": 7080 + }, + { + "epoch": 0.9385023194168324, + "grad_norm": 8.847926139831543, + "learning_rate": 3.888727140260056e-06, + "loss": 0.2503, + "num_input_tokens_seen": 13852792, + "step": 7081 + }, + { + "epoch": 0.9386348575215374, + "grad_norm": 5.771275997161865, + "learning_rate": 3.888438474339237e-06, + "loss": 0.2388, + "num_input_tokens_seen": 13854864, + "step": 7082 + }, + { + "epoch": 0.9387673956262426, + "grad_norm": 13.877067565917969, + "learning_rate": 3.888149781648367e-06, + "loss": 0.358, + "num_input_tokens_seen": 13858016, + "step": 7083 + }, + { + "epoch": 0.9388999337309476, + "grad_norm": 16.063514709472656, + "learning_rate": 3.88786106219301e-06, + "loss": 0.5331, + "num_input_tokens_seen": 13859632, + "step": 7084 + }, + { + "epoch": 0.9390324718356527, + "grad_norm": 0.07563447207212448, + "learning_rate": 3.887572315978735e-06, + "loss": 0.0005, + "num_input_tokens_seen": 13860856, + "step": 7085 + }, + { + "epoch": 0.9391650099403579, + "grad_norm": 0.11302629113197327, + "learning_rate": 3.887283543011109e-06, + "loss": 0.0008, + "num_input_tokens_seen": 13863552, + "step": 7086 + }, + { + "epoch": 0.9392975480450629, + "grad_norm": 2.5225846767425537, + "learning_rate": 3.886994743295699e-06, + "loss": 0.0342, + "num_input_tokens_seen": 13867112, + "step": 7087 + }, + { + "epoch": 0.9394300861497681, + "grad_norm": 12.152167320251465, + "learning_rate": 3.8867059168380725e-06, + "loss": 0.4418, + "num_input_tokens_seen": 13869416, + "step": 7088 + }, + { + "epoch": 0.9395626242544731, + "grad_norm": 7.963101387023926, + "learning_rate": 3.8864170636438004e-06, + "loss": 0.1687, + "num_input_tokens_seen": 13871400, + "step": 7089 + }, + { + "epoch": 0.9396951623591783, + "grad_norm": 7.331704616546631, + "learning_rate": 3.88612818371845e-06, + "loss": 0.2273, + "num_input_tokens_seen": 13872920, + "step": 7090 + }, + { + "epoch": 0.9398277004638834, + "grad_norm": 14.124824523925781, + "learning_rate": 3.885839277067593e-06, + "loss": 0.4864, + "num_input_tokens_seen": 13874760, + "step": 7091 + }, + { + "epoch": 0.9399602385685885, + "grad_norm": 6.634252071380615, + "learning_rate": 3.885550343696799e-06, + "loss": 0.1185, + "num_input_tokens_seen": 13877496, + "step": 7092 + }, + { + "epoch": 0.9400927766732936, + "grad_norm": 0.1869451105594635, + "learning_rate": 3.8852613836116375e-06, + "loss": 0.0012, + "num_input_tokens_seen": 13878856, + "step": 7093 + }, + { + "epoch": 0.9402253147779986, + "grad_norm": 8.83353328704834, + "learning_rate": 3.884972396817681e-06, + "loss": 0.1767, + "num_input_tokens_seen": 13880912, + "step": 7094 + }, + { + "epoch": 0.9403578528827038, + "grad_norm": 0.2119373083114624, + "learning_rate": 3.884683383320501e-06, + "loss": 0.0016, + "num_input_tokens_seen": 13882640, + "step": 7095 + }, + { + "epoch": 0.9404903909874088, + "grad_norm": 11.178598403930664, + "learning_rate": 3.884394343125671e-06, + "loss": 0.245, + "num_input_tokens_seen": 13885160, + "step": 7096 + }, + { + "epoch": 0.940622929092114, + "grad_norm": 10.791341781616211, + "learning_rate": 3.884105276238762e-06, + "loss": 0.2045, + "num_input_tokens_seen": 13887952, + "step": 7097 + }, + { + "epoch": 0.9407554671968191, + "grad_norm": 6.683265209197998, + "learning_rate": 3.88381618266535e-06, + "loss": 0.1451, + "num_input_tokens_seen": 13890352, + "step": 7098 + }, + { + "epoch": 0.9408880053015242, + "grad_norm": 11.57684326171875, + "learning_rate": 3.883527062411005e-06, + "loss": 0.2712, + "num_input_tokens_seen": 13892240, + "step": 7099 + }, + { + "epoch": 0.9410205434062293, + "grad_norm": 8.927364349365234, + "learning_rate": 3.883237915481306e-06, + "loss": 0.1335, + "num_input_tokens_seen": 13894328, + "step": 7100 + }, + { + "epoch": 0.9411530815109344, + "grad_norm": 0.3892424404621124, + "learning_rate": 3.882948741881824e-06, + "loss": 0.0029, + "num_input_tokens_seen": 13896472, + "step": 7101 + }, + { + "epoch": 0.9412856196156395, + "grad_norm": 7.853362083435059, + "learning_rate": 3.882659541618138e-06, + "loss": 0.1866, + "num_input_tokens_seen": 13898704, + "step": 7102 + }, + { + "epoch": 0.9414181577203447, + "grad_norm": 8.738165855407715, + "learning_rate": 3.8823703146958214e-06, + "loss": 0.3392, + "num_input_tokens_seen": 13900128, + "step": 7103 + }, + { + "epoch": 0.9415506958250497, + "grad_norm": 7.839698791503906, + "learning_rate": 3.882081061120451e-06, + "loss": 0.1573, + "num_input_tokens_seen": 13903448, + "step": 7104 + }, + { + "epoch": 0.9416832339297548, + "grad_norm": 0.2703794240951538, + "learning_rate": 3.881791780897604e-06, + "loss": 0.002, + "num_input_tokens_seen": 13905176, + "step": 7105 + }, + { + "epoch": 0.9418157720344599, + "grad_norm": 19.21526336669922, + "learning_rate": 3.88150247403286e-06, + "loss": 0.8292, + "num_input_tokens_seen": 13908224, + "step": 7106 + }, + { + "epoch": 0.941948310139165, + "grad_norm": 0.7847921848297119, + "learning_rate": 3.8812131405317936e-06, + "loss": 0.0057, + "num_input_tokens_seen": 13909400, + "step": 7107 + }, + { + "epoch": 0.9420808482438701, + "grad_norm": 17.949665069580078, + "learning_rate": 3.880923780399986e-06, + "loss": 0.8185, + "num_input_tokens_seen": 13911488, + "step": 7108 + }, + { + "epoch": 0.9422133863485752, + "grad_norm": 0.9618986248970032, + "learning_rate": 3.880634393643014e-06, + "loss": 0.0069, + "num_input_tokens_seen": 13913320, + "step": 7109 + }, + { + "epoch": 0.9423459244532804, + "grad_norm": 0.6985360383987427, + "learning_rate": 3.88034498026646e-06, + "loss": 0.0043, + "num_input_tokens_seen": 13916048, + "step": 7110 + }, + { + "epoch": 0.9424784625579854, + "grad_norm": 5.417811870574951, + "learning_rate": 3.880055540275902e-06, + "loss": 0.0629, + "num_input_tokens_seen": 13918696, + "step": 7111 + }, + { + "epoch": 0.9426110006626905, + "grad_norm": 0.7126796841621399, + "learning_rate": 3.8797660736769205e-06, + "loss": 0.005, + "num_input_tokens_seen": 13920704, + "step": 7112 + }, + { + "epoch": 0.9427435387673956, + "grad_norm": 10.946029663085938, + "learning_rate": 3.8794765804750974e-06, + "loss": 0.3829, + "num_input_tokens_seen": 13922800, + "step": 7113 + }, + { + "epoch": 0.9428760768721007, + "grad_norm": 0.6073399782180786, + "learning_rate": 3.879187060676015e-06, + "loss": 0.0042, + "num_input_tokens_seen": 13924992, + "step": 7114 + }, + { + "epoch": 0.9430086149768059, + "grad_norm": 6.484592437744141, + "learning_rate": 3.878897514285254e-06, + "loss": 0.083, + "num_input_tokens_seen": 13926256, + "step": 7115 + }, + { + "epoch": 0.9431411530815109, + "grad_norm": 6.664862155914307, + "learning_rate": 3.878607941308397e-06, + "loss": 0.1241, + "num_input_tokens_seen": 13928008, + "step": 7116 + }, + { + "epoch": 0.9432736911862161, + "grad_norm": 7.631915092468262, + "learning_rate": 3.878318341751029e-06, + "loss": 0.2026, + "num_input_tokens_seen": 13929680, + "step": 7117 + }, + { + "epoch": 0.9434062292909211, + "grad_norm": 8.696460723876953, + "learning_rate": 3.878028715618732e-06, + "loss": 0.066, + "num_input_tokens_seen": 13931512, + "step": 7118 + }, + { + "epoch": 0.9435387673956263, + "grad_norm": 0.03154376521706581, + "learning_rate": 3.877739062917091e-06, + "loss": 0.0002, + "num_input_tokens_seen": 13932608, + "step": 7119 + }, + { + "epoch": 0.9436713055003313, + "grad_norm": 0.05518821254372597, + "learning_rate": 3.87744938365169e-06, + "loss": 0.0004, + "num_input_tokens_seen": 13934160, + "step": 7120 + }, + { + "epoch": 0.9438038436050364, + "grad_norm": 0.021976392716169357, + "learning_rate": 3.8771596778281145e-06, + "loss": 0.0002, + "num_input_tokens_seen": 13935528, + "step": 7121 + }, + { + "epoch": 0.9439363817097416, + "grad_norm": 11.469886779785156, + "learning_rate": 3.876869945451951e-06, + "loss": 0.4154, + "num_input_tokens_seen": 13937656, + "step": 7122 + }, + { + "epoch": 0.9440689198144466, + "grad_norm": 4.259563446044922, + "learning_rate": 3.876580186528784e-06, + "loss": 0.0524, + "num_input_tokens_seen": 13939592, + "step": 7123 + }, + { + "epoch": 0.9442014579191518, + "grad_norm": 8.293495178222656, + "learning_rate": 3.876290401064202e-06, + "loss": 0.2282, + "num_input_tokens_seen": 13941464, + "step": 7124 + }, + { + "epoch": 0.9443339960238568, + "grad_norm": 0.025867924094200134, + "learning_rate": 3.876000589063792e-06, + "loss": 0.0002, + "num_input_tokens_seen": 13943432, + "step": 7125 + }, + { + "epoch": 0.944466534128562, + "grad_norm": 0.05256165564060211, + "learning_rate": 3.8757107505331414e-06, + "loss": 0.0004, + "num_input_tokens_seen": 13945576, + "step": 7126 + }, + { + "epoch": 0.9445990722332671, + "grad_norm": 0.03770623728632927, + "learning_rate": 3.875420885477839e-06, + "loss": 0.0003, + "num_input_tokens_seen": 13947552, + "step": 7127 + }, + { + "epoch": 0.9447316103379721, + "grad_norm": 2.8745107650756836, + "learning_rate": 3.875130993903471e-06, + "loss": 0.024, + "num_input_tokens_seen": 13950752, + "step": 7128 + }, + { + "epoch": 0.9448641484426773, + "grad_norm": 9.824553489685059, + "learning_rate": 3.874841075815631e-06, + "loss": 0.2826, + "num_input_tokens_seen": 13952560, + "step": 7129 + }, + { + "epoch": 0.9449966865473823, + "grad_norm": 4.166808128356934, + "learning_rate": 3.874551131219905e-06, + "loss": 0.0944, + "num_input_tokens_seen": 13954272, + "step": 7130 + }, + { + "epoch": 0.9451292246520875, + "grad_norm": 21.765663146972656, + "learning_rate": 3.874261160121886e-06, + "loss": 0.9576, + "num_input_tokens_seen": 13956280, + "step": 7131 + }, + { + "epoch": 0.9452617627567925, + "grad_norm": 16.836719512939453, + "learning_rate": 3.873971162527163e-06, + "loss": 0.4611, + "num_input_tokens_seen": 13958232, + "step": 7132 + }, + { + "epoch": 0.9453943008614977, + "grad_norm": 2.8087234497070312, + "learning_rate": 3.8736811384413286e-06, + "loss": 0.0304, + "num_input_tokens_seen": 13959728, + "step": 7133 + }, + { + "epoch": 0.9455268389662028, + "grad_norm": 4.472196102142334, + "learning_rate": 3.873391087869974e-06, + "loss": 0.1159, + "num_input_tokens_seen": 13961888, + "step": 7134 + }, + { + "epoch": 0.9456593770709079, + "grad_norm": 0.1440078169107437, + "learning_rate": 3.873101010818692e-06, + "loss": 0.001, + "num_input_tokens_seen": 13963512, + "step": 7135 + }, + { + "epoch": 0.945791915175613, + "grad_norm": 8.4107666015625, + "learning_rate": 3.872810907293075e-06, + "loss": 0.1642, + "num_input_tokens_seen": 13965048, + "step": 7136 + }, + { + "epoch": 0.945924453280318, + "grad_norm": 12.829103469848633, + "learning_rate": 3.872520777298717e-06, + "loss": 0.4506, + "num_input_tokens_seen": 13966944, + "step": 7137 + }, + { + "epoch": 0.9460569913850232, + "grad_norm": 12.858433723449707, + "learning_rate": 3.872230620841211e-06, + "loss": 0.4554, + "num_input_tokens_seen": 13968688, + "step": 7138 + }, + { + "epoch": 0.9461895294897283, + "grad_norm": 8.229705810546875, + "learning_rate": 3.871940437926153e-06, + "loss": 0.2522, + "num_input_tokens_seen": 13970744, + "step": 7139 + }, + { + "epoch": 0.9463220675944334, + "grad_norm": 6.4407525062561035, + "learning_rate": 3.871650228559136e-06, + "loss": 0.0445, + "num_input_tokens_seen": 13972032, + "step": 7140 + }, + { + "epoch": 0.9464546056991385, + "grad_norm": 11.054377555847168, + "learning_rate": 3.871359992745756e-06, + "loss": 0.4757, + "num_input_tokens_seen": 13973992, + "step": 7141 + }, + { + "epoch": 0.9465871438038436, + "grad_norm": 0.0927763506770134, + "learning_rate": 3.87106973049161e-06, + "loss": 0.0006, + "num_input_tokens_seen": 13974888, + "step": 7142 + }, + { + "epoch": 0.9467196819085487, + "grad_norm": 1.5007574558258057, + "learning_rate": 3.870779441802294e-06, + "loss": 0.0107, + "num_input_tokens_seen": 13976736, + "step": 7143 + }, + { + "epoch": 0.9468522200132539, + "grad_norm": 4.635882377624512, + "learning_rate": 3.870489126683404e-06, + "loss": 0.1144, + "num_input_tokens_seen": 13978448, + "step": 7144 + }, + { + "epoch": 0.9469847581179589, + "grad_norm": 2.254204511642456, + "learning_rate": 3.870198785140539e-06, + "loss": 0.0213, + "num_input_tokens_seen": 13980184, + "step": 7145 + }, + { + "epoch": 0.947117296222664, + "grad_norm": 0.20676346123218536, + "learning_rate": 3.869908417179295e-06, + "loss": 0.0015, + "num_input_tokens_seen": 13982160, + "step": 7146 + }, + { + "epoch": 0.9472498343273691, + "grad_norm": 0.3083372116088867, + "learning_rate": 3.869618022805272e-06, + "loss": 0.0022, + "num_input_tokens_seen": 13984376, + "step": 7147 + }, + { + "epoch": 0.9473823724320742, + "grad_norm": 4.793144702911377, + "learning_rate": 3.86932760202407e-06, + "loss": 0.0702, + "num_input_tokens_seen": 13986304, + "step": 7148 + }, + { + "epoch": 0.9475149105367793, + "grad_norm": 9.222419738769531, + "learning_rate": 3.8690371548412854e-06, + "loss": 0.0848, + "num_input_tokens_seen": 13988200, + "step": 7149 + }, + { + "epoch": 0.9476474486414844, + "grad_norm": 5.728933334350586, + "learning_rate": 3.868746681262521e-06, + "loss": 0.0799, + "num_input_tokens_seen": 13989800, + "step": 7150 + }, + { + "epoch": 0.9477799867461896, + "grad_norm": 0.1714426875114441, + "learning_rate": 3.868456181293376e-06, + "loss": 0.0012, + "num_input_tokens_seen": 13991240, + "step": 7151 + }, + { + "epoch": 0.9479125248508946, + "grad_norm": 10.58035945892334, + "learning_rate": 3.868165654939452e-06, + "loss": 0.3217, + "num_input_tokens_seen": 13992768, + "step": 7152 + }, + { + "epoch": 0.9480450629555998, + "grad_norm": 0.12699900567531586, + "learning_rate": 3.86787510220635e-06, + "loss": 0.0009, + "num_input_tokens_seen": 13994456, + "step": 7153 + }, + { + "epoch": 0.9481776010603048, + "grad_norm": 0.034609898924827576, + "learning_rate": 3.867584523099673e-06, + "loss": 0.0002, + "num_input_tokens_seen": 13995816, + "step": 7154 + }, + { + "epoch": 0.94831013916501, + "grad_norm": 0.3747157156467438, + "learning_rate": 3.867293917625022e-06, + "loss": 0.0041, + "num_input_tokens_seen": 13997728, + "step": 7155 + }, + { + "epoch": 0.9484426772697151, + "grad_norm": 7.815901279449463, + "learning_rate": 3.867003285788001e-06, + "loss": 0.173, + "num_input_tokens_seen": 13999840, + "step": 7156 + }, + { + "epoch": 0.9485752153744201, + "grad_norm": 0.10273309051990509, + "learning_rate": 3.866712627594215e-06, + "loss": 0.0007, + "num_input_tokens_seen": 14001848, + "step": 7157 + }, + { + "epoch": 0.9487077534791253, + "grad_norm": 0.4167424738407135, + "learning_rate": 3.8664219430492664e-06, + "loss": 0.0031, + "num_input_tokens_seen": 14003488, + "step": 7158 + }, + { + "epoch": 0.9488402915838303, + "grad_norm": 8.23000431060791, + "learning_rate": 3.866131232158759e-06, + "loss": 0.3109, + "num_input_tokens_seen": 14005800, + "step": 7159 + }, + { + "epoch": 0.9489728296885355, + "grad_norm": 8.696732521057129, + "learning_rate": 3.8658404949283e-06, + "loss": 0.2418, + "num_input_tokens_seen": 14007760, + "step": 7160 + }, + { + "epoch": 0.9491053677932405, + "grad_norm": 15.127118110656738, + "learning_rate": 3.865549731363493e-06, + "loss": 0.5056, + "num_input_tokens_seen": 14009344, + "step": 7161 + }, + { + "epoch": 0.9492379058979457, + "grad_norm": 15.005428314208984, + "learning_rate": 3.865258941469946e-06, + "loss": 0.5758, + "num_input_tokens_seen": 14011248, + "step": 7162 + }, + { + "epoch": 0.9493704440026508, + "grad_norm": 5.8355607986450195, + "learning_rate": 3.864968125253264e-06, + "loss": 0.1534, + "num_input_tokens_seen": 14013160, + "step": 7163 + }, + { + "epoch": 0.9495029821073558, + "grad_norm": 10.387696266174316, + "learning_rate": 3.864677282719056e-06, + "loss": 0.1443, + "num_input_tokens_seen": 14015896, + "step": 7164 + }, + { + "epoch": 0.949635520212061, + "grad_norm": 5.56615686416626, + "learning_rate": 3.864386413872928e-06, + "loss": 0.1462, + "num_input_tokens_seen": 14017856, + "step": 7165 + }, + { + "epoch": 0.949768058316766, + "grad_norm": 8.612417221069336, + "learning_rate": 3.864095518720489e-06, + "loss": 0.1314, + "num_input_tokens_seen": 14019808, + "step": 7166 + }, + { + "epoch": 0.9499005964214712, + "grad_norm": 9.724380493164062, + "learning_rate": 3.863804597267346e-06, + "loss": 0.2438, + "num_input_tokens_seen": 14022152, + "step": 7167 + }, + { + "epoch": 0.9500331345261763, + "grad_norm": 10.189743041992188, + "learning_rate": 3.86351364951911e-06, + "loss": 0.1932, + "num_input_tokens_seen": 14024544, + "step": 7168 + }, + { + "epoch": 0.9501656726308814, + "grad_norm": 10.015031814575195, + "learning_rate": 3.863222675481392e-06, + "loss": 0.2819, + "num_input_tokens_seen": 14026368, + "step": 7169 + }, + { + "epoch": 0.9502982107355865, + "grad_norm": 0.025956453755497932, + "learning_rate": 3.862931675159798e-06, + "loss": 0.0002, + "num_input_tokens_seen": 14027928, + "step": 7170 + }, + { + "epoch": 0.9504307488402916, + "grad_norm": 0.07927209883928299, + "learning_rate": 3.862640648559942e-06, + "loss": 0.0005, + "num_input_tokens_seen": 14030056, + "step": 7171 + }, + { + "epoch": 0.9505632869449967, + "grad_norm": 0.06405702978372574, + "learning_rate": 3.862349595687435e-06, + "loss": 0.0005, + "num_input_tokens_seen": 14031584, + "step": 7172 + }, + { + "epoch": 0.9506958250497017, + "grad_norm": 6.27801513671875, + "learning_rate": 3.862058516547888e-06, + "loss": 0.2297, + "num_input_tokens_seen": 14034512, + "step": 7173 + }, + { + "epoch": 0.9508283631544069, + "grad_norm": 6.723933696746826, + "learning_rate": 3.861767411146912e-06, + "loss": 0.1573, + "num_input_tokens_seen": 14037512, + "step": 7174 + }, + { + "epoch": 0.950960901259112, + "grad_norm": 10.136953353881836, + "learning_rate": 3.8614762794901215e-06, + "loss": 0.1258, + "num_input_tokens_seen": 14039568, + "step": 7175 + }, + { + "epoch": 0.9510934393638171, + "grad_norm": 4.971272945404053, + "learning_rate": 3.861185121583129e-06, + "loss": 0.1235, + "num_input_tokens_seen": 14041664, + "step": 7176 + }, + { + "epoch": 0.9512259774685222, + "grad_norm": 8.281096458435059, + "learning_rate": 3.860893937431548e-06, + "loss": 0.1439, + "num_input_tokens_seen": 14043328, + "step": 7177 + }, + { + "epoch": 0.9513585155732273, + "grad_norm": 7.257356643676758, + "learning_rate": 3.860602727040993e-06, + "loss": 0.3761, + "num_input_tokens_seen": 14045624, + "step": 7178 + }, + { + "epoch": 0.9514910536779324, + "grad_norm": 0.13173986971378326, + "learning_rate": 3.86031149041708e-06, + "loss": 0.0009, + "num_input_tokens_seen": 14048256, + "step": 7179 + }, + { + "epoch": 0.9516235917826376, + "grad_norm": 10.339430809020996, + "learning_rate": 3.860020227565422e-06, + "loss": 0.2926, + "num_input_tokens_seen": 14050416, + "step": 7180 + }, + { + "epoch": 0.9517561298873426, + "grad_norm": 8.4072265625, + "learning_rate": 3.859728938491636e-06, + "loss": 0.2412, + "num_input_tokens_seen": 14051616, + "step": 7181 + }, + { + "epoch": 0.9518886679920477, + "grad_norm": 0.313981294631958, + "learning_rate": 3.859437623201338e-06, + "loss": 0.0021, + "num_input_tokens_seen": 14055136, + "step": 7182 + }, + { + "epoch": 0.9520212060967528, + "grad_norm": 0.18514522910118103, + "learning_rate": 3.859146281700144e-06, + "loss": 0.0012, + "num_input_tokens_seen": 14056992, + "step": 7183 + }, + { + "epoch": 0.9521537442014579, + "grad_norm": 7.113615036010742, + "learning_rate": 3.858854913993674e-06, + "loss": 0.1219, + "num_input_tokens_seen": 14058632, + "step": 7184 + }, + { + "epoch": 0.952286282306163, + "grad_norm": 9.534204483032227, + "learning_rate": 3.858563520087542e-06, + "loss": 0.3481, + "num_input_tokens_seen": 14060832, + "step": 7185 + }, + { + "epoch": 0.9524188204108681, + "grad_norm": 3.2288169860839844, + "learning_rate": 3.858272099987369e-06, + "loss": 0.1545, + "num_input_tokens_seen": 14062560, + "step": 7186 + }, + { + "epoch": 0.9525513585155733, + "grad_norm": 4.816036224365234, + "learning_rate": 3.857980653698772e-06, + "loss": 0.1358, + "num_input_tokens_seen": 14064648, + "step": 7187 + }, + { + "epoch": 0.9526838966202783, + "grad_norm": 1.5882339477539062, + "learning_rate": 3.857689181227372e-06, + "loss": 0.0154, + "num_input_tokens_seen": 14067128, + "step": 7188 + }, + { + "epoch": 0.9528164347249835, + "grad_norm": 7.44096565246582, + "learning_rate": 3.857397682578788e-06, + "loss": 0.1958, + "num_input_tokens_seen": 14068568, + "step": 7189 + }, + { + "epoch": 0.9529489728296885, + "grad_norm": 11.390652656555176, + "learning_rate": 3.85710615775864e-06, + "loss": 0.1381, + "num_input_tokens_seen": 14070040, + "step": 7190 + }, + { + "epoch": 0.9530815109343936, + "grad_norm": 11.5276517868042, + "learning_rate": 3.856814606772549e-06, + "loss": 0.3754, + "num_input_tokens_seen": 14071792, + "step": 7191 + }, + { + "epoch": 0.9532140490390988, + "grad_norm": 7.8154120445251465, + "learning_rate": 3.856523029626137e-06, + "loss": 0.1743, + "num_input_tokens_seen": 14073584, + "step": 7192 + }, + { + "epoch": 0.9533465871438038, + "grad_norm": 5.855787754058838, + "learning_rate": 3.856231426325024e-06, + "loss": 0.0595, + "num_input_tokens_seen": 14076328, + "step": 7193 + }, + { + "epoch": 0.953479125248509, + "grad_norm": 14.576971054077148, + "learning_rate": 3.855939796874835e-06, + "loss": 0.3546, + "num_input_tokens_seen": 14078096, + "step": 7194 + }, + { + "epoch": 0.953611663353214, + "grad_norm": 8.141708374023438, + "learning_rate": 3.85564814128119e-06, + "loss": 0.3449, + "num_input_tokens_seen": 14080856, + "step": 7195 + }, + { + "epoch": 0.9537442014579192, + "grad_norm": 8.427746772766113, + "learning_rate": 3.855356459549714e-06, + "loss": 0.1124, + "num_input_tokens_seen": 14082824, + "step": 7196 + }, + { + "epoch": 0.9538767395626243, + "grad_norm": 5.8710198402404785, + "learning_rate": 3.855064751686031e-06, + "loss": 0.1003, + "num_input_tokens_seen": 14086392, + "step": 7197 + }, + { + "epoch": 0.9540092776673293, + "grad_norm": 2.7634494304656982, + "learning_rate": 3.8547730176957645e-06, + "loss": 0.0172, + "num_input_tokens_seen": 14089304, + "step": 7198 + }, + { + "epoch": 0.9541418157720345, + "grad_norm": 5.8507843017578125, + "learning_rate": 3.854481257584539e-06, + "loss": 0.0491, + "num_input_tokens_seen": 14091256, + "step": 7199 + }, + { + "epoch": 0.9542743538767395, + "grad_norm": 10.55928897857666, + "learning_rate": 3.854189471357981e-06, + "loss": 0.3291, + "num_input_tokens_seen": 14093640, + "step": 7200 + }, + { + "epoch": 0.9544068919814447, + "grad_norm": 4.255329608917236, + "learning_rate": 3.853897659021716e-06, + "loss": 0.0499, + "num_input_tokens_seen": 14095280, + "step": 7201 + }, + { + "epoch": 0.9545394300861497, + "grad_norm": 10.423710823059082, + "learning_rate": 3.85360582058137e-06, + "loss": 0.2372, + "num_input_tokens_seen": 14097216, + "step": 7202 + }, + { + "epoch": 0.9546719681908549, + "grad_norm": 0.06868020445108414, + "learning_rate": 3.85331395604257e-06, + "loss": 0.0004, + "num_input_tokens_seen": 14098624, + "step": 7203 + }, + { + "epoch": 0.95480450629556, + "grad_norm": 18.400344848632812, + "learning_rate": 3.853022065410942e-06, + "loss": 0.5321, + "num_input_tokens_seen": 14099712, + "step": 7204 + }, + { + "epoch": 0.9549370444002651, + "grad_norm": 3.9094605445861816, + "learning_rate": 3.852730148692117e-06, + "loss": 0.0928, + "num_input_tokens_seen": 14101624, + "step": 7205 + }, + { + "epoch": 0.9550695825049702, + "grad_norm": 10.600262641906738, + "learning_rate": 3.85243820589172e-06, + "loss": 0.3238, + "num_input_tokens_seen": 14103128, + "step": 7206 + }, + { + "epoch": 0.9552021206096752, + "grad_norm": 13.700072288513184, + "learning_rate": 3.852146237015383e-06, + "loss": 0.6478, + "num_input_tokens_seen": 14105504, + "step": 7207 + }, + { + "epoch": 0.9553346587143804, + "grad_norm": 8.97553825378418, + "learning_rate": 3.851854242068732e-06, + "loss": 0.1567, + "num_input_tokens_seen": 14107408, + "step": 7208 + }, + { + "epoch": 0.9554671968190855, + "grad_norm": 4.607557773590088, + "learning_rate": 3.8515622210574e-06, + "loss": 0.098, + "num_input_tokens_seen": 14108704, + "step": 7209 + }, + { + "epoch": 0.9555997349237906, + "grad_norm": 0.11022916436195374, + "learning_rate": 3.851270173987015e-06, + "loss": 0.0008, + "num_input_tokens_seen": 14111440, + "step": 7210 + }, + { + "epoch": 0.9557322730284957, + "grad_norm": 12.492790222167969, + "learning_rate": 3.850978100863209e-06, + "loss": 0.4499, + "num_input_tokens_seen": 14113368, + "step": 7211 + }, + { + "epoch": 0.9558648111332008, + "grad_norm": 0.06487315893173218, + "learning_rate": 3.850686001691615e-06, + "loss": 0.0005, + "num_input_tokens_seen": 14115248, + "step": 7212 + }, + { + "epoch": 0.9559973492379059, + "grad_norm": 0.6930650472640991, + "learning_rate": 3.850393876477861e-06, + "loss": 0.0048, + "num_input_tokens_seen": 14117512, + "step": 7213 + }, + { + "epoch": 0.956129887342611, + "grad_norm": 1.9123904705047607, + "learning_rate": 3.850101725227582e-06, + "loss": 0.0731, + "num_input_tokens_seen": 14120128, + "step": 7214 + }, + { + "epoch": 0.9562624254473161, + "grad_norm": 8.278051376342773, + "learning_rate": 3.849809547946411e-06, + "loss": 0.2676, + "num_input_tokens_seen": 14121816, + "step": 7215 + }, + { + "epoch": 0.9563949635520212, + "grad_norm": 0.09896618872880936, + "learning_rate": 3.84951734463998e-06, + "loss": 0.0007, + "num_input_tokens_seen": 14123920, + "step": 7216 + }, + { + "epoch": 0.9565275016567263, + "grad_norm": 13.187090873718262, + "learning_rate": 3.849225115313924e-06, + "loss": 0.171, + "num_input_tokens_seen": 14126104, + "step": 7217 + }, + { + "epoch": 0.9566600397614314, + "grad_norm": 12.555880546569824, + "learning_rate": 3.8489328599738765e-06, + "loss": 0.1654, + "num_input_tokens_seen": 14129184, + "step": 7218 + }, + { + "epoch": 0.9567925778661365, + "grad_norm": 6.290533542633057, + "learning_rate": 3.848640578625473e-06, + "loss": 0.1402, + "num_input_tokens_seen": 14130736, + "step": 7219 + }, + { + "epoch": 0.9569251159708416, + "grad_norm": 4.557801246643066, + "learning_rate": 3.848348271274349e-06, + "loss": 0.0534, + "num_input_tokens_seen": 14132032, + "step": 7220 + }, + { + "epoch": 0.9570576540755468, + "grad_norm": 8.2571439743042, + "learning_rate": 3.848055937926141e-06, + "loss": 0.1335, + "num_input_tokens_seen": 14133864, + "step": 7221 + }, + { + "epoch": 0.9571901921802518, + "grad_norm": 11.502821922302246, + "learning_rate": 3.8477635785864835e-06, + "loss": 0.2384, + "num_input_tokens_seen": 14135536, + "step": 7222 + }, + { + "epoch": 0.957322730284957, + "grad_norm": 2.774744987487793, + "learning_rate": 3.847471193261014e-06, + "loss": 0.0521, + "num_input_tokens_seen": 14138336, + "step": 7223 + }, + { + "epoch": 0.957455268389662, + "grad_norm": 9.686749458312988, + "learning_rate": 3.847178781955371e-06, + "loss": 0.4133, + "num_input_tokens_seen": 14139824, + "step": 7224 + }, + { + "epoch": 0.9575878064943671, + "grad_norm": 13.85748291015625, + "learning_rate": 3.846886344675193e-06, + "loss": 0.4314, + "num_input_tokens_seen": 14142256, + "step": 7225 + }, + { + "epoch": 0.9577203445990722, + "grad_norm": 0.21601247787475586, + "learning_rate": 3.846593881426114e-06, + "loss": 0.0015, + "num_input_tokens_seen": 14144232, + "step": 7226 + }, + { + "epoch": 0.9578528827037773, + "grad_norm": 4.852502346038818, + "learning_rate": 3.846301392213779e-06, + "loss": 0.0575, + "num_input_tokens_seen": 14145616, + "step": 7227 + }, + { + "epoch": 0.9579854208084825, + "grad_norm": 22.521774291992188, + "learning_rate": 3.846008877043823e-06, + "loss": 1.0353, + "num_input_tokens_seen": 14148632, + "step": 7228 + }, + { + "epoch": 0.9581179589131875, + "grad_norm": 4.47910737991333, + "learning_rate": 3.845716335921888e-06, + "loss": 0.0898, + "num_input_tokens_seen": 14150512, + "step": 7229 + }, + { + "epoch": 0.9582504970178927, + "grad_norm": 6.943549633026123, + "learning_rate": 3.845423768853612e-06, + "loss": 0.083, + "num_input_tokens_seen": 14151944, + "step": 7230 + }, + { + "epoch": 0.9583830351225977, + "grad_norm": 8.506834030151367, + "learning_rate": 3.845131175844639e-06, + "loss": 0.1692, + "num_input_tokens_seen": 14154272, + "step": 7231 + }, + { + "epoch": 0.9585155732273029, + "grad_norm": 12.816849708557129, + "learning_rate": 3.844838556900608e-06, + "loss": 0.6805, + "num_input_tokens_seen": 14156560, + "step": 7232 + }, + { + "epoch": 0.958648111332008, + "grad_norm": 0.6843388676643372, + "learning_rate": 3.844545912027163e-06, + "loss": 0.0053, + "num_input_tokens_seen": 14159176, + "step": 7233 + }, + { + "epoch": 0.958780649436713, + "grad_norm": 7.655886173248291, + "learning_rate": 3.844253241229945e-06, + "loss": 0.0878, + "num_input_tokens_seen": 14160856, + "step": 7234 + }, + { + "epoch": 0.9589131875414182, + "grad_norm": 3.3074445724487305, + "learning_rate": 3.843960544514597e-06, + "loss": 0.0677, + "num_input_tokens_seen": 14162048, + "step": 7235 + }, + { + "epoch": 0.9590457256461232, + "grad_norm": 16.33290672302246, + "learning_rate": 3.843667821886763e-06, + "loss": 0.2944, + "num_input_tokens_seen": 14163856, + "step": 7236 + }, + { + "epoch": 0.9591782637508284, + "grad_norm": 3.6990323066711426, + "learning_rate": 3.843375073352086e-06, + "loss": 0.0493, + "num_input_tokens_seen": 14165768, + "step": 7237 + }, + { + "epoch": 0.9593108018555334, + "grad_norm": 1.4711707830429077, + "learning_rate": 3.843082298916211e-06, + "loss": 0.0204, + "num_input_tokens_seen": 14167264, + "step": 7238 + }, + { + "epoch": 0.9594433399602386, + "grad_norm": 10.919734001159668, + "learning_rate": 3.842789498584781e-06, + "loss": 0.3468, + "num_input_tokens_seen": 14170096, + "step": 7239 + }, + { + "epoch": 0.9595758780649437, + "grad_norm": 7.053165912628174, + "learning_rate": 3.8424966723634464e-06, + "loss": 0.1099, + "num_input_tokens_seen": 14172832, + "step": 7240 + }, + { + "epoch": 0.9597084161696487, + "grad_norm": 0.6565023064613342, + "learning_rate": 3.842203820257847e-06, + "loss": 0.0051, + "num_input_tokens_seen": 14174088, + "step": 7241 + }, + { + "epoch": 0.9598409542743539, + "grad_norm": 3.8349311351776123, + "learning_rate": 3.841910942273634e-06, + "loss": 0.0972, + "num_input_tokens_seen": 14176016, + "step": 7242 + }, + { + "epoch": 0.9599734923790589, + "grad_norm": 4.940202713012695, + "learning_rate": 3.841618038416451e-06, + "loss": 0.1754, + "num_input_tokens_seen": 14177896, + "step": 7243 + }, + { + "epoch": 0.9601060304837641, + "grad_norm": 12.5956392288208, + "learning_rate": 3.841325108691946e-06, + "loss": 0.3572, + "num_input_tokens_seen": 14179768, + "step": 7244 + }, + { + "epoch": 0.9602385685884692, + "grad_norm": 11.306081771850586, + "learning_rate": 3.841032153105769e-06, + "loss": 0.5367, + "num_input_tokens_seen": 14181488, + "step": 7245 + }, + { + "epoch": 0.9603711066931743, + "grad_norm": 0.2793773114681244, + "learning_rate": 3.8407391716635675e-06, + "loss": 0.0021, + "num_input_tokens_seen": 14185048, + "step": 7246 + }, + { + "epoch": 0.9605036447978794, + "grad_norm": 0.32963821291923523, + "learning_rate": 3.840446164370988e-06, + "loss": 0.0025, + "num_input_tokens_seen": 14186680, + "step": 7247 + }, + { + "epoch": 0.9606361829025845, + "grad_norm": 6.8996686935424805, + "learning_rate": 3.8401531312336824e-06, + "loss": 0.2861, + "num_input_tokens_seen": 14189016, + "step": 7248 + }, + { + "epoch": 0.9607687210072896, + "grad_norm": 9.540470123291016, + "learning_rate": 3.839860072257301e-06, + "loss": 0.2321, + "num_input_tokens_seen": 14190848, + "step": 7249 + }, + { + "epoch": 0.9609012591119946, + "grad_norm": 9.375992774963379, + "learning_rate": 3.839566987447492e-06, + "loss": 0.2517, + "num_input_tokens_seen": 14192976, + "step": 7250 + }, + { + "epoch": 0.9610337972166998, + "grad_norm": 8.381304740905762, + "learning_rate": 3.839273876809908e-06, + "loss": 0.2747, + "num_input_tokens_seen": 14194728, + "step": 7251 + }, + { + "epoch": 0.9611663353214049, + "grad_norm": 8.329108238220215, + "learning_rate": 3.838980740350199e-06, + "loss": 0.1923, + "num_input_tokens_seen": 14196600, + "step": 7252 + }, + { + "epoch": 0.96129887342611, + "grad_norm": 5.750219821929932, + "learning_rate": 3.838687578074018e-06, + "loss": 0.0035, + "num_input_tokens_seen": 14198056, + "step": 7253 + }, + { + "epoch": 0.9614314115308151, + "grad_norm": 8.22239875793457, + "learning_rate": 3.838394389987017e-06, + "loss": 0.4236, + "num_input_tokens_seen": 14200488, + "step": 7254 + }, + { + "epoch": 0.9615639496355202, + "grad_norm": 1.7130067348480225, + "learning_rate": 3.838101176094848e-06, + "loss": 0.0187, + "num_input_tokens_seen": 14202368, + "step": 7255 + }, + { + "epoch": 0.9616964877402253, + "grad_norm": 13.184988021850586, + "learning_rate": 3.837807936403166e-06, + "loss": 0.298, + "num_input_tokens_seen": 14203896, + "step": 7256 + }, + { + "epoch": 0.9618290258449305, + "grad_norm": 0.5100119709968567, + "learning_rate": 3.837514670917625e-06, + "loss": 0.0038, + "num_input_tokens_seen": 14207040, + "step": 7257 + }, + { + "epoch": 0.9619615639496355, + "grad_norm": 5.201852321624756, + "learning_rate": 3.837221379643877e-06, + "loss": 0.0513, + "num_input_tokens_seen": 14208760, + "step": 7258 + }, + { + "epoch": 0.9620941020543406, + "grad_norm": 19.078611373901367, + "learning_rate": 3.836928062587579e-06, + "loss": 0.5577, + "num_input_tokens_seen": 14210944, + "step": 7259 + }, + { + "epoch": 0.9622266401590457, + "grad_norm": 4.421248435974121, + "learning_rate": 3.836634719754385e-06, + "loss": 0.0999, + "num_input_tokens_seen": 14213888, + "step": 7260 + }, + { + "epoch": 0.9623591782637508, + "grad_norm": 9.753243446350098, + "learning_rate": 3.8363413511499526e-06, + "loss": 0.3155, + "num_input_tokens_seen": 14216320, + "step": 7261 + }, + { + "epoch": 0.962491716368456, + "grad_norm": 1.1519651412963867, + "learning_rate": 3.836047956779936e-06, + "loss": 0.0079, + "num_input_tokens_seen": 14217528, + "step": 7262 + }, + { + "epoch": 0.962624254473161, + "grad_norm": 3.0933010578155518, + "learning_rate": 3.835754536649994e-06, + "loss": 0.0248, + "num_input_tokens_seen": 14219264, + "step": 7263 + }, + { + "epoch": 0.9627567925778662, + "grad_norm": 10.4746675491333, + "learning_rate": 3.835461090765782e-06, + "loss": 0.2494, + "num_input_tokens_seen": 14222136, + "step": 7264 + }, + { + "epoch": 0.9628893306825712, + "grad_norm": 2.1459710597991943, + "learning_rate": 3.835167619132959e-06, + "loss": 0.0172, + "num_input_tokens_seen": 14224016, + "step": 7265 + }, + { + "epoch": 0.9630218687872764, + "grad_norm": 0.9849420189857483, + "learning_rate": 3.834874121757185e-06, + "loss": 0.0134, + "num_input_tokens_seen": 14225752, + "step": 7266 + }, + { + "epoch": 0.9631544068919814, + "grad_norm": 13.74028205871582, + "learning_rate": 3.834580598644115e-06, + "loss": 0.6818, + "num_input_tokens_seen": 14227816, + "step": 7267 + }, + { + "epoch": 0.9632869449966865, + "grad_norm": 4.893702030181885, + "learning_rate": 3.834287049799411e-06, + "loss": 0.1002, + "num_input_tokens_seen": 14229400, + "step": 7268 + }, + { + "epoch": 0.9634194831013917, + "grad_norm": 0.13729555904865265, + "learning_rate": 3.833993475228733e-06, + "loss": 0.0009, + "num_input_tokens_seen": 14231136, + "step": 7269 + }, + { + "epoch": 0.9635520212060967, + "grad_norm": 0.38407355546951294, + "learning_rate": 3.83369987493774e-06, + "loss": 0.0028, + "num_input_tokens_seen": 14233112, + "step": 7270 + }, + { + "epoch": 0.9636845593108019, + "grad_norm": 9.019268989562988, + "learning_rate": 3.833406248932092e-06, + "loss": 0.0918, + "num_input_tokens_seen": 14234904, + "step": 7271 + }, + { + "epoch": 0.9638170974155069, + "grad_norm": 0.3099585175514221, + "learning_rate": 3.833112597217454e-06, + "loss": 0.0022, + "num_input_tokens_seen": 14237120, + "step": 7272 + }, + { + "epoch": 0.9639496355202121, + "grad_norm": 7.625340938568115, + "learning_rate": 3.8328189197994836e-06, + "loss": 0.171, + "num_input_tokens_seen": 14238608, + "step": 7273 + }, + { + "epoch": 0.9640821736249172, + "grad_norm": 0.03240329027175903, + "learning_rate": 3.832525216683845e-06, + "loss": 0.0002, + "num_input_tokens_seen": 14240336, + "step": 7274 + }, + { + "epoch": 0.9642147117296223, + "grad_norm": 5.497343063354492, + "learning_rate": 3.832231487876202e-06, + "loss": 0.0941, + "num_input_tokens_seen": 14242344, + "step": 7275 + }, + { + "epoch": 0.9643472498343274, + "grad_norm": 20.15484046936035, + "learning_rate": 3.831937733382217e-06, + "loss": 0.813, + "num_input_tokens_seen": 14244464, + "step": 7276 + }, + { + "epoch": 0.9644797879390324, + "grad_norm": 0.08887320756912231, + "learning_rate": 3.831643953207553e-06, + "loss": 0.0006, + "num_input_tokens_seen": 14246024, + "step": 7277 + }, + { + "epoch": 0.9646123260437376, + "grad_norm": 4.051753520965576, + "learning_rate": 3.831350147357875e-06, + "loss": 0.0361, + "num_input_tokens_seen": 14247792, + "step": 7278 + }, + { + "epoch": 0.9647448641484426, + "grad_norm": 18.706377029418945, + "learning_rate": 3.831056315838848e-06, + "loss": 0.7244, + "num_input_tokens_seen": 14250752, + "step": 7279 + }, + { + "epoch": 0.9648774022531478, + "grad_norm": 1.795831561088562, + "learning_rate": 3.830762458656136e-06, + "loss": 0.0446, + "num_input_tokens_seen": 14252912, + "step": 7280 + }, + { + "epoch": 0.9650099403578529, + "grad_norm": 9.70200252532959, + "learning_rate": 3.830468575815407e-06, + "loss": 0.1908, + "num_input_tokens_seen": 14255440, + "step": 7281 + }, + { + "epoch": 0.965142478462558, + "grad_norm": 7.659546852111816, + "learning_rate": 3.830174667322325e-06, + "loss": 0.2099, + "num_input_tokens_seen": 14257424, + "step": 7282 + }, + { + "epoch": 0.9652750165672631, + "grad_norm": 0.13579103350639343, + "learning_rate": 3.829880733182558e-06, + "loss": 0.0007, + "num_input_tokens_seen": 14261152, + "step": 7283 + }, + { + "epoch": 0.9654075546719681, + "grad_norm": 0.3113535940647125, + "learning_rate": 3.829586773401774e-06, + "loss": 0.0018, + "num_input_tokens_seen": 14262448, + "step": 7284 + }, + { + "epoch": 0.9655400927766733, + "grad_norm": 0.17341098189353943, + "learning_rate": 3.829292787985639e-06, + "loss": 0.0013, + "num_input_tokens_seen": 14263712, + "step": 7285 + }, + { + "epoch": 0.9656726308813784, + "grad_norm": 0.08654141426086426, + "learning_rate": 3.828998776939822e-06, + "loss": 0.0006, + "num_input_tokens_seen": 14267528, + "step": 7286 + }, + { + "epoch": 0.9658051689860835, + "grad_norm": 6.54250431060791, + "learning_rate": 3.828704740269992e-06, + "loss": 0.201, + "num_input_tokens_seen": 14269232, + "step": 7287 + }, + { + "epoch": 0.9659377070907886, + "grad_norm": 14.979636192321777, + "learning_rate": 3.828410677981817e-06, + "loss": 0.6083, + "num_input_tokens_seen": 14271720, + "step": 7288 + }, + { + "epoch": 0.9660702451954937, + "grad_norm": 1.4321199655532837, + "learning_rate": 3.828116590080969e-06, + "loss": 0.0367, + "num_input_tokens_seen": 14273512, + "step": 7289 + }, + { + "epoch": 0.9662027833001988, + "grad_norm": 8.749720573425293, + "learning_rate": 3.827822476573117e-06, + "loss": 0.259, + "num_input_tokens_seen": 14274920, + "step": 7290 + }, + { + "epoch": 0.9663353214049039, + "grad_norm": 2.8881542682647705, + "learning_rate": 3.82752833746393e-06, + "loss": 0.0504, + "num_input_tokens_seen": 14276464, + "step": 7291 + }, + { + "epoch": 0.966467859509609, + "grad_norm": 13.5059232711792, + "learning_rate": 3.827234172759082e-06, + "loss": 0.504, + "num_input_tokens_seen": 14278776, + "step": 7292 + }, + { + "epoch": 0.9666003976143142, + "grad_norm": 9.028846740722656, + "learning_rate": 3.826939982464244e-06, + "loss": 0.2612, + "num_input_tokens_seen": 14280712, + "step": 7293 + }, + { + "epoch": 0.9667329357190192, + "grad_norm": 0.21338699758052826, + "learning_rate": 3.8266457665850864e-06, + "loss": 0.0014, + "num_input_tokens_seen": 14283032, + "step": 7294 + }, + { + "epoch": 0.9668654738237243, + "grad_norm": 11.965989112854004, + "learning_rate": 3.826351525127285e-06, + "loss": 0.3068, + "num_input_tokens_seen": 14284904, + "step": 7295 + }, + { + "epoch": 0.9669980119284294, + "grad_norm": 13.963319778442383, + "learning_rate": 3.82605725809651e-06, + "loss": 0.3381, + "num_input_tokens_seen": 14286120, + "step": 7296 + }, + { + "epoch": 0.9671305500331345, + "grad_norm": 2.6831743717193604, + "learning_rate": 3.825762965498437e-06, + "loss": 0.0741, + "num_input_tokens_seen": 14288456, + "step": 7297 + }, + { + "epoch": 0.9672630881378397, + "grad_norm": 3.2664668560028076, + "learning_rate": 3.825468647338738e-06, + "loss": 0.059, + "num_input_tokens_seen": 14289768, + "step": 7298 + }, + { + "epoch": 0.9673956262425447, + "grad_norm": 20.401212692260742, + "learning_rate": 3.82517430362309e-06, + "loss": 0.969, + "num_input_tokens_seen": 14292456, + "step": 7299 + }, + { + "epoch": 0.9675281643472499, + "grad_norm": 8.999568939208984, + "learning_rate": 3.824879934357167e-06, + "loss": 0.3358, + "num_input_tokens_seen": 14294224, + "step": 7300 + }, + { + "epoch": 0.9676607024519549, + "grad_norm": 0.2338992953300476, + "learning_rate": 3.824585539546645e-06, + "loss": 0.0016, + "num_input_tokens_seen": 14296176, + "step": 7301 + }, + { + "epoch": 0.96779324055666, + "grad_norm": 0.9378090500831604, + "learning_rate": 3.824291119197201e-06, + "loss": 0.0067, + "num_input_tokens_seen": 14299256, + "step": 7302 + }, + { + "epoch": 0.9679257786613651, + "grad_norm": 0.47262099385261536, + "learning_rate": 3.8239966733145095e-06, + "loss": 0.0036, + "num_input_tokens_seen": 14301448, + "step": 7303 + }, + { + "epoch": 0.9680583167660702, + "grad_norm": 10.97904109954834, + "learning_rate": 3.8237022019042485e-06, + "loss": 0.4317, + "num_input_tokens_seen": 14303768, + "step": 7304 + }, + { + "epoch": 0.9681908548707754, + "grad_norm": 1.4852527379989624, + "learning_rate": 3.823407704972096e-06, + "loss": 0.0154, + "num_input_tokens_seen": 14305928, + "step": 7305 + }, + { + "epoch": 0.9683233929754804, + "grad_norm": 0.32150766253471375, + "learning_rate": 3.823113182523731e-06, + "loss": 0.0024, + "num_input_tokens_seen": 14307208, + "step": 7306 + }, + { + "epoch": 0.9684559310801856, + "grad_norm": 9.616583824157715, + "learning_rate": 3.82281863456483e-06, + "loss": 0.1175, + "num_input_tokens_seen": 14309680, + "step": 7307 + }, + { + "epoch": 0.9685884691848906, + "grad_norm": 10.362848281860352, + "learning_rate": 3.822524061101074e-06, + "loss": 0.4577, + "num_input_tokens_seen": 14312024, + "step": 7308 + }, + { + "epoch": 0.9687210072895958, + "grad_norm": 1.3299497365951538, + "learning_rate": 3.822229462138141e-06, + "loss": 0.0094, + "num_input_tokens_seen": 14313528, + "step": 7309 + }, + { + "epoch": 0.9688535453943009, + "grad_norm": 8.247082710266113, + "learning_rate": 3.821934837681712e-06, + "loss": 0.1152, + "num_input_tokens_seen": 14316408, + "step": 7310 + }, + { + "epoch": 0.968986083499006, + "grad_norm": 7.721426010131836, + "learning_rate": 3.8216401877374686e-06, + "loss": 0.1984, + "num_input_tokens_seen": 14318088, + "step": 7311 + }, + { + "epoch": 0.9691186216037111, + "grad_norm": 6.574674129486084, + "learning_rate": 3.821345512311089e-06, + "loss": 0.1403, + "num_input_tokens_seen": 14320288, + "step": 7312 + }, + { + "epoch": 0.9692511597084161, + "grad_norm": 2.0483012199401855, + "learning_rate": 3.8210508114082564e-06, + "loss": 0.0127, + "num_input_tokens_seen": 14323296, + "step": 7313 + }, + { + "epoch": 0.9693836978131213, + "grad_norm": 0.5963882207870483, + "learning_rate": 3.8207560850346536e-06, + "loss": 0.0042, + "num_input_tokens_seen": 14324824, + "step": 7314 + }, + { + "epoch": 0.9695162359178264, + "grad_norm": 16.435224533081055, + "learning_rate": 3.820461333195961e-06, + "loss": 0.3771, + "num_input_tokens_seen": 14327104, + "step": 7315 + }, + { + "epoch": 0.9696487740225315, + "grad_norm": 18.30119514465332, + "learning_rate": 3.820166555897865e-06, + "loss": 0.1952, + "num_input_tokens_seen": 14328632, + "step": 7316 + }, + { + "epoch": 0.9697813121272366, + "grad_norm": 11.88756275177002, + "learning_rate": 3.819871753146046e-06, + "loss": 0.319, + "num_input_tokens_seen": 14330288, + "step": 7317 + }, + { + "epoch": 0.9699138502319417, + "grad_norm": 7.416049480438232, + "learning_rate": 3.819576924946189e-06, + "loss": 0.1598, + "num_input_tokens_seen": 14333008, + "step": 7318 + }, + { + "epoch": 0.9700463883366468, + "grad_norm": 0.39120858907699585, + "learning_rate": 3.819282071303979e-06, + "loss": 0.0029, + "num_input_tokens_seen": 14336304, + "step": 7319 + }, + { + "epoch": 0.9701789264413518, + "grad_norm": 0.31496647000312805, + "learning_rate": 3.8189871922251e-06, + "loss": 0.002, + "num_input_tokens_seen": 14337560, + "step": 7320 + }, + { + "epoch": 0.970311464546057, + "grad_norm": 2.6085495948791504, + "learning_rate": 3.818692287715238e-06, + "loss": 0.0207, + "num_input_tokens_seen": 14339392, + "step": 7321 + }, + { + "epoch": 0.9704440026507621, + "grad_norm": 10.654458045959473, + "learning_rate": 3.8183973577800794e-06, + "loss": 0.2895, + "num_input_tokens_seen": 14341536, + "step": 7322 + }, + { + "epoch": 0.9705765407554672, + "grad_norm": 3.848426580429077, + "learning_rate": 3.81810240242531e-06, + "loss": 0.0374, + "num_input_tokens_seen": 14343000, + "step": 7323 + }, + { + "epoch": 0.9707090788601723, + "grad_norm": 5.351003170013428, + "learning_rate": 3.817807421656617e-06, + "loss": 0.103, + "num_input_tokens_seen": 14345112, + "step": 7324 + }, + { + "epoch": 0.9708416169648774, + "grad_norm": 1.600977897644043, + "learning_rate": 3.817512415479688e-06, + "loss": 0.023, + "num_input_tokens_seen": 14347200, + "step": 7325 + }, + { + "epoch": 0.9709741550695825, + "grad_norm": 5.760276794433594, + "learning_rate": 3.81721738390021e-06, + "loss": 0.0969, + "num_input_tokens_seen": 14349280, + "step": 7326 + }, + { + "epoch": 0.9711066931742877, + "grad_norm": 3.164106607437134, + "learning_rate": 3.816922326923873e-06, + "loss": 0.1272, + "num_input_tokens_seen": 14350904, + "step": 7327 + }, + { + "epoch": 0.9712392312789927, + "grad_norm": 15.033295631408691, + "learning_rate": 3.8166272445563644e-06, + "loss": 0.4148, + "num_input_tokens_seen": 14352176, + "step": 7328 + }, + { + "epoch": 0.9713717693836978, + "grad_norm": 0.14832700788974762, + "learning_rate": 3.816332136803375e-06, + "loss": 0.0011, + "num_input_tokens_seen": 14353672, + "step": 7329 + }, + { + "epoch": 0.9715043074884029, + "grad_norm": 9.556941986083984, + "learning_rate": 3.816037003670593e-06, + "loss": 0.2688, + "num_input_tokens_seen": 14355072, + "step": 7330 + }, + { + "epoch": 0.971636845593108, + "grad_norm": 0.10414943099021912, + "learning_rate": 3.815741845163711e-06, + "loss": 0.0007, + "num_input_tokens_seen": 14357544, + "step": 7331 + }, + { + "epoch": 0.9717693836978131, + "grad_norm": 12.795763969421387, + "learning_rate": 3.815446661288417e-06, + "loss": 0.6128, + "num_input_tokens_seen": 14359688, + "step": 7332 + }, + { + "epoch": 0.9719019218025182, + "grad_norm": 3.1570181846618652, + "learning_rate": 3.815151452050404e-06, + "loss": 0.0254, + "num_input_tokens_seen": 14361560, + "step": 7333 + }, + { + "epoch": 0.9720344599072234, + "grad_norm": 0.056894175708293915, + "learning_rate": 3.814856217455365e-06, + "loss": 0.0004, + "num_input_tokens_seen": 14363232, + "step": 7334 + }, + { + "epoch": 0.9721669980119284, + "grad_norm": 9.5023775100708, + "learning_rate": 3.81456095750899e-06, + "loss": 0.3386, + "num_input_tokens_seen": 14365400, + "step": 7335 + }, + { + "epoch": 0.9722995361166336, + "grad_norm": 25.07636070251465, + "learning_rate": 3.8142656722169735e-06, + "loss": 0.8565, + "num_input_tokens_seen": 14367488, + "step": 7336 + }, + { + "epoch": 0.9724320742213386, + "grad_norm": 14.887669563293457, + "learning_rate": 3.8139703615850077e-06, + "loss": 0.178, + "num_input_tokens_seen": 14370144, + "step": 7337 + }, + { + "epoch": 0.9725646123260437, + "grad_norm": 8.586089134216309, + "learning_rate": 3.813675025618787e-06, + "loss": 0.1007, + "num_input_tokens_seen": 14371416, + "step": 7338 + }, + { + "epoch": 0.9726971504307489, + "grad_norm": 9.457840919494629, + "learning_rate": 3.8133796643240055e-06, + "loss": 0.4164, + "num_input_tokens_seen": 14373264, + "step": 7339 + }, + { + "epoch": 0.9728296885354539, + "grad_norm": 0.08925791084766388, + "learning_rate": 3.813084277706358e-06, + "loss": 0.0006, + "num_input_tokens_seen": 14375040, + "step": 7340 + }, + { + "epoch": 0.9729622266401591, + "grad_norm": 0.2219213843345642, + "learning_rate": 3.8127888657715406e-06, + "loss": 0.0016, + "num_input_tokens_seen": 14376240, + "step": 7341 + }, + { + "epoch": 0.9730947647448641, + "grad_norm": 6.009913921356201, + "learning_rate": 3.812493428525247e-06, + "loss": 0.1745, + "num_input_tokens_seen": 14379432, + "step": 7342 + }, + { + "epoch": 0.9732273028495693, + "grad_norm": 0.638717770576477, + "learning_rate": 3.8121979659731755e-06, + "loss": 0.0087, + "num_input_tokens_seen": 14381576, + "step": 7343 + }, + { + "epoch": 0.9733598409542743, + "grad_norm": 4.583749771118164, + "learning_rate": 3.8119024781210213e-06, + "loss": 0.0977, + "num_input_tokens_seen": 14382816, + "step": 7344 + }, + { + "epoch": 0.9734923790589795, + "grad_norm": 19.053739547729492, + "learning_rate": 3.8116069649744825e-06, + "loss": 0.7657, + "num_input_tokens_seen": 14384528, + "step": 7345 + }, + { + "epoch": 0.9736249171636846, + "grad_norm": 6.605473518371582, + "learning_rate": 3.811311426539257e-06, + "loss": 0.1566, + "num_input_tokens_seen": 14385992, + "step": 7346 + }, + { + "epoch": 0.9737574552683896, + "grad_norm": 10.880682945251465, + "learning_rate": 3.8110158628210423e-06, + "loss": 0.2813, + "num_input_tokens_seen": 14388288, + "step": 7347 + }, + { + "epoch": 0.9738899933730948, + "grad_norm": 13.816142082214355, + "learning_rate": 3.8107202738255373e-06, + "loss": 0.4768, + "num_input_tokens_seen": 14390096, + "step": 7348 + }, + { + "epoch": 0.9740225314777998, + "grad_norm": 0.16052336990833282, + "learning_rate": 3.8104246595584414e-06, + "loss": 0.0011, + "num_input_tokens_seen": 14392256, + "step": 7349 + }, + { + "epoch": 0.974155069582505, + "grad_norm": 8.141950607299805, + "learning_rate": 3.8101290200254537e-06, + "loss": 0.3301, + "num_input_tokens_seen": 14394096, + "step": 7350 + }, + { + "epoch": 0.9742876076872101, + "grad_norm": 11.874165534973145, + "learning_rate": 3.8098333552322745e-06, + "loss": 0.2818, + "num_input_tokens_seen": 14397256, + "step": 7351 + }, + { + "epoch": 0.9744201457919152, + "grad_norm": 11.379547119140625, + "learning_rate": 3.8095376651846056e-06, + "loss": 0.3927, + "num_input_tokens_seen": 14399320, + "step": 7352 + }, + { + "epoch": 0.9745526838966203, + "grad_norm": 14.268540382385254, + "learning_rate": 3.809241949888146e-06, + "loss": 0.2681, + "num_input_tokens_seen": 14401528, + "step": 7353 + }, + { + "epoch": 0.9746852220013253, + "grad_norm": 0.4084547758102417, + "learning_rate": 3.8089462093485993e-06, + "loss": 0.003, + "num_input_tokens_seen": 14403448, + "step": 7354 + }, + { + "epoch": 0.9748177601060305, + "grad_norm": 3.1902153491973877, + "learning_rate": 3.8086504435716663e-06, + "loss": 0.0764, + "num_input_tokens_seen": 14407032, + "step": 7355 + }, + { + "epoch": 0.9749502982107355, + "grad_norm": 10.686497688293457, + "learning_rate": 3.80835465256305e-06, + "loss": 0.1938, + "num_input_tokens_seen": 14408896, + "step": 7356 + }, + { + "epoch": 0.9750828363154407, + "grad_norm": 16.394956588745117, + "learning_rate": 3.8080588363284533e-06, + "loss": 0.3917, + "num_input_tokens_seen": 14410048, + "step": 7357 + }, + { + "epoch": 0.9752153744201458, + "grad_norm": 0.6045948266983032, + "learning_rate": 3.8077629948735794e-06, + "loss": 0.0044, + "num_input_tokens_seen": 14411240, + "step": 7358 + }, + { + "epoch": 0.9753479125248509, + "grad_norm": 0.27397164702415466, + "learning_rate": 3.807467128204133e-06, + "loss": 0.002, + "num_input_tokens_seen": 14413640, + "step": 7359 + }, + { + "epoch": 0.975480450629556, + "grad_norm": 9.47497272491455, + "learning_rate": 3.8071712363258197e-06, + "loss": 0.1423, + "num_input_tokens_seen": 14415736, + "step": 7360 + }, + { + "epoch": 0.975612988734261, + "grad_norm": 0.6445384621620178, + "learning_rate": 3.8068753192443418e-06, + "loss": 0.0046, + "num_input_tokens_seen": 14417352, + "step": 7361 + }, + { + "epoch": 0.9757455268389662, + "grad_norm": 6.003385066986084, + "learning_rate": 3.8065793769654056e-06, + "loss": 0.1527, + "num_input_tokens_seen": 14419464, + "step": 7362 + }, + { + "epoch": 0.9758780649436714, + "grad_norm": 3.898773670196533, + "learning_rate": 3.8062834094947186e-06, + "loss": 0.0358, + "num_input_tokens_seen": 14421560, + "step": 7363 + }, + { + "epoch": 0.9760106030483764, + "grad_norm": 9.3087158203125, + "learning_rate": 3.8059874168379872e-06, + "loss": 0.0821, + "num_input_tokens_seen": 14423608, + "step": 7364 + }, + { + "epoch": 0.9761431411530815, + "grad_norm": 3.380053758621216, + "learning_rate": 3.8056913990009157e-06, + "loss": 0.0241, + "num_input_tokens_seen": 14424616, + "step": 7365 + }, + { + "epoch": 0.9762756792577866, + "grad_norm": 6.693706512451172, + "learning_rate": 3.8053953559892144e-06, + "loss": 0.1914, + "num_input_tokens_seen": 14426456, + "step": 7366 + }, + { + "epoch": 0.9764082173624917, + "grad_norm": 1.5852546691894531, + "learning_rate": 3.80509928780859e-06, + "loss": 0.0514, + "num_input_tokens_seen": 14427904, + "step": 7367 + }, + { + "epoch": 0.9765407554671968, + "grad_norm": 9.312158584594727, + "learning_rate": 3.8048031944647512e-06, + "loss": 0.2358, + "num_input_tokens_seen": 14430248, + "step": 7368 + }, + { + "epoch": 0.9766732935719019, + "grad_norm": 8.545063972473145, + "learning_rate": 3.804507075963406e-06, + "loss": 0.2748, + "num_input_tokens_seen": 14432296, + "step": 7369 + }, + { + "epoch": 0.9768058316766071, + "grad_norm": 10.22640609741211, + "learning_rate": 3.804210932310265e-06, + "loss": 0.4429, + "num_input_tokens_seen": 14434248, + "step": 7370 + }, + { + "epoch": 0.9769383697813121, + "grad_norm": 4.385878086090088, + "learning_rate": 3.803914763511036e-06, + "loss": 0.0854, + "num_input_tokens_seen": 14436160, + "step": 7371 + }, + { + "epoch": 0.9770709078860172, + "grad_norm": 4.706075668334961, + "learning_rate": 3.803618569571433e-06, + "loss": 0.1361, + "num_input_tokens_seen": 14438272, + "step": 7372 + }, + { + "epoch": 0.9772034459907223, + "grad_norm": 9.773750305175781, + "learning_rate": 3.803322350497164e-06, + "loss": 0.3596, + "num_input_tokens_seen": 14440760, + "step": 7373 + }, + { + "epoch": 0.9773359840954274, + "grad_norm": 7.5464091300964355, + "learning_rate": 3.80302610629394e-06, + "loss": 0.307, + "num_input_tokens_seen": 14442640, + "step": 7374 + }, + { + "epoch": 0.9774685222001326, + "grad_norm": 7.880706310272217, + "learning_rate": 3.802729836967475e-06, + "loss": 0.2558, + "num_input_tokens_seen": 14444176, + "step": 7375 + }, + { + "epoch": 0.9776010603048376, + "grad_norm": 3.486879825592041, + "learning_rate": 3.802433542523479e-06, + "loss": 0.0876, + "num_input_tokens_seen": 14445800, + "step": 7376 + }, + { + "epoch": 0.9777335984095428, + "grad_norm": 2.1389243602752686, + "learning_rate": 3.8021372229676662e-06, + "loss": 0.0708, + "num_input_tokens_seen": 14447368, + "step": 7377 + }, + { + "epoch": 0.9778661365142478, + "grad_norm": 0.04939216747879982, + "learning_rate": 3.8018408783057497e-06, + "loss": 0.0003, + "num_input_tokens_seen": 14449032, + "step": 7378 + }, + { + "epoch": 0.977998674618953, + "grad_norm": 6.989064693450928, + "learning_rate": 3.801544508543443e-06, + "loss": 0.1451, + "num_input_tokens_seen": 14450776, + "step": 7379 + }, + { + "epoch": 0.9781312127236581, + "grad_norm": 10.56523323059082, + "learning_rate": 3.8012481136864597e-06, + "loss": 0.281, + "num_input_tokens_seen": 14452696, + "step": 7380 + }, + { + "epoch": 0.9782637508283631, + "grad_norm": 4.1558918952941895, + "learning_rate": 3.8009516937405157e-06, + "loss": 0.0936, + "num_input_tokens_seen": 14454224, + "step": 7381 + }, + { + "epoch": 0.9783962889330683, + "grad_norm": 0.2487679421901703, + "learning_rate": 3.8006552487113253e-06, + "loss": 0.0017, + "num_input_tokens_seen": 14455928, + "step": 7382 + }, + { + "epoch": 0.9785288270377733, + "grad_norm": 0.08795146644115448, + "learning_rate": 3.8003587786046043e-06, + "loss": 0.0006, + "num_input_tokens_seen": 14457408, + "step": 7383 + }, + { + "epoch": 0.9786613651424785, + "grad_norm": 0.3246552050113678, + "learning_rate": 3.8000622834260694e-06, + "loss": 0.0021, + "num_input_tokens_seen": 14459144, + "step": 7384 + }, + { + "epoch": 0.9787939032471835, + "grad_norm": 0.20704562962055206, + "learning_rate": 3.7997657631814366e-06, + "loss": 0.0013, + "num_input_tokens_seen": 14462672, + "step": 7385 + }, + { + "epoch": 0.9789264413518887, + "grad_norm": 0.13691645860671997, + "learning_rate": 3.7994692178764227e-06, + "loss": 0.001, + "num_input_tokens_seen": 14464784, + "step": 7386 + }, + { + "epoch": 0.9790589794565938, + "grad_norm": 4.155706882476807, + "learning_rate": 3.7991726475167464e-06, + "loss": 0.0483, + "num_input_tokens_seen": 14466336, + "step": 7387 + }, + { + "epoch": 0.9791915175612989, + "grad_norm": 5.97236442565918, + "learning_rate": 3.798876052108125e-06, + "loss": 0.2341, + "num_input_tokens_seen": 14468024, + "step": 7388 + }, + { + "epoch": 0.979324055666004, + "grad_norm": 0.16712424159049988, + "learning_rate": 3.798579431656277e-06, + "loss": 0.0011, + "num_input_tokens_seen": 14470160, + "step": 7389 + }, + { + "epoch": 0.979456593770709, + "grad_norm": 13.663261413574219, + "learning_rate": 3.7982827861669224e-06, + "loss": 0.3484, + "num_input_tokens_seen": 14471888, + "step": 7390 + }, + { + "epoch": 0.9795891318754142, + "grad_norm": 8.83521842956543, + "learning_rate": 3.7979861156457796e-06, + "loss": 0.2455, + "num_input_tokens_seen": 14474400, + "step": 7391 + }, + { + "epoch": 0.9797216699801193, + "grad_norm": 11.258716583251953, + "learning_rate": 3.797689420098569e-06, + "loss": 0.5067, + "num_input_tokens_seen": 14476784, + "step": 7392 + }, + { + "epoch": 0.9798542080848244, + "grad_norm": 0.3901125490665436, + "learning_rate": 3.797392699531012e-06, + "loss": 0.0016, + "num_input_tokens_seen": 14479432, + "step": 7393 + }, + { + "epoch": 0.9799867461895295, + "grad_norm": 2.4827568531036377, + "learning_rate": 3.7970959539488277e-06, + "loss": 0.0177, + "num_input_tokens_seen": 14481024, + "step": 7394 + }, + { + "epoch": 0.9801192842942346, + "grad_norm": 0.08227360248565674, + "learning_rate": 3.7967991833577388e-06, + "loss": 0.0006, + "num_input_tokens_seen": 14482840, + "step": 7395 + }, + { + "epoch": 0.9802518223989397, + "grad_norm": 1.063278079032898, + "learning_rate": 3.796502387763467e-06, + "loss": 0.0107, + "num_input_tokens_seen": 14484928, + "step": 7396 + }, + { + "epoch": 0.9803843605036447, + "grad_norm": 2.9758307933807373, + "learning_rate": 3.796205567171735e-06, + "loss": 0.0854, + "num_input_tokens_seen": 14486632, + "step": 7397 + }, + { + "epoch": 0.9805168986083499, + "grad_norm": 6.8062872886657715, + "learning_rate": 3.795908721588265e-06, + "loss": 0.1775, + "num_input_tokens_seen": 14489448, + "step": 7398 + }, + { + "epoch": 0.980649436713055, + "grad_norm": 0.5578870177268982, + "learning_rate": 3.7956118510187817e-06, + "loss": 0.0035, + "num_input_tokens_seen": 14491408, + "step": 7399 + }, + { + "epoch": 0.9807819748177601, + "grad_norm": 6.043049335479736, + "learning_rate": 3.7953149554690073e-06, + "loss": 0.1417, + "num_input_tokens_seen": 14493288, + "step": 7400 + }, + { + "epoch": 0.9809145129224652, + "grad_norm": 0.15173298120498657, + "learning_rate": 3.7950180349446675e-06, + "loss": 0.001, + "num_input_tokens_seen": 14495920, + "step": 7401 + }, + { + "epoch": 0.9810470510271703, + "grad_norm": 0.04913967102766037, + "learning_rate": 3.7947210894514864e-06, + "loss": 0.0003, + "num_input_tokens_seen": 14497168, + "step": 7402 + }, + { + "epoch": 0.9811795891318754, + "grad_norm": 3.2894275188446045, + "learning_rate": 3.7944241189951892e-06, + "loss": 0.0861, + "num_input_tokens_seen": 14498952, + "step": 7403 + }, + { + "epoch": 0.9813121272365806, + "grad_norm": 7.083791255950928, + "learning_rate": 3.7941271235815026e-06, + "loss": 0.0963, + "num_input_tokens_seen": 14501000, + "step": 7404 + }, + { + "epoch": 0.9814446653412856, + "grad_norm": 0.01514469739049673, + "learning_rate": 3.793830103216152e-06, + "loss": 0.0001, + "num_input_tokens_seen": 14501888, + "step": 7405 + }, + { + "epoch": 0.9815772034459908, + "grad_norm": 11.904065132141113, + "learning_rate": 3.7935330579048647e-06, + "loss": 0.3611, + "num_input_tokens_seen": 14504536, + "step": 7406 + }, + { + "epoch": 0.9817097415506958, + "grad_norm": 0.03228849917650223, + "learning_rate": 3.793235987653367e-06, + "loss": 0.0002, + "num_input_tokens_seen": 14506696, + "step": 7407 + }, + { + "epoch": 0.9818422796554009, + "grad_norm": 6.021876335144043, + "learning_rate": 3.7929388924673875e-06, + "loss": 0.0158, + "num_input_tokens_seen": 14507912, + "step": 7408 + }, + { + "epoch": 0.981974817760106, + "grad_norm": 5.733320236206055, + "learning_rate": 3.7926417723526544e-06, + "loss": 0.2977, + "num_input_tokens_seen": 14509768, + "step": 7409 + }, + { + "epoch": 0.9821073558648111, + "grad_norm": 0.12485023587942123, + "learning_rate": 3.7923446273148962e-06, + "loss": 0.0008, + "num_input_tokens_seen": 14511088, + "step": 7410 + }, + { + "epoch": 0.9822398939695163, + "grad_norm": 7.824032783508301, + "learning_rate": 3.792047457359842e-06, + "loss": 0.2407, + "num_input_tokens_seen": 14513640, + "step": 7411 + }, + { + "epoch": 0.9823724320742213, + "grad_norm": 20.340831756591797, + "learning_rate": 3.7917502624932214e-06, + "loss": 0.6636, + "num_input_tokens_seen": 14516440, + "step": 7412 + }, + { + "epoch": 0.9825049701789265, + "grad_norm": 19.05531120300293, + "learning_rate": 3.791453042720765e-06, + "loss": 0.6905, + "num_input_tokens_seen": 14518304, + "step": 7413 + }, + { + "epoch": 0.9826375082836315, + "grad_norm": 5.129209518432617, + "learning_rate": 3.7911557980482027e-06, + "loss": 0.0272, + "num_input_tokens_seen": 14520064, + "step": 7414 + }, + { + "epoch": 0.9827700463883366, + "grad_norm": 9.143016815185547, + "learning_rate": 3.790858528481266e-06, + "loss": 0.1812, + "num_input_tokens_seen": 14522064, + "step": 7415 + }, + { + "epoch": 0.9829025844930418, + "grad_norm": 7.742749214172363, + "learning_rate": 3.790561234025686e-06, + "loss": 0.2871, + "num_input_tokens_seen": 14523704, + "step": 7416 + }, + { + "epoch": 0.9830351225977468, + "grad_norm": 8.041770935058594, + "learning_rate": 3.7902639146871954e-06, + "loss": 0.1239, + "num_input_tokens_seen": 14525936, + "step": 7417 + }, + { + "epoch": 0.983167660702452, + "grad_norm": 0.13700397312641144, + "learning_rate": 3.7899665704715265e-06, + "loss": 0.0009, + "num_input_tokens_seen": 14529752, + "step": 7418 + }, + { + "epoch": 0.983300198807157, + "grad_norm": 0.5118721127510071, + "learning_rate": 3.7896692013844122e-06, + "loss": 0.0034, + "num_input_tokens_seen": 14531792, + "step": 7419 + }, + { + "epoch": 0.9834327369118622, + "grad_norm": 12.241049766540527, + "learning_rate": 3.7893718074315853e-06, + "loss": 0.251, + "num_input_tokens_seen": 14533408, + "step": 7420 + }, + { + "epoch": 0.9835652750165672, + "grad_norm": 8.60028076171875, + "learning_rate": 3.789074388618782e-06, + "loss": 0.2729, + "num_input_tokens_seen": 14535224, + "step": 7421 + }, + { + "epoch": 0.9836978131212724, + "grad_norm": 0.05928985774517059, + "learning_rate": 3.788776944951734e-06, + "loss": 0.0004, + "num_input_tokens_seen": 14536688, + "step": 7422 + }, + { + "epoch": 0.9838303512259775, + "grad_norm": 11.478135108947754, + "learning_rate": 3.788479476436178e-06, + "loss": 0.1768, + "num_input_tokens_seen": 14538744, + "step": 7423 + }, + { + "epoch": 0.9839628893306825, + "grad_norm": 6.083920001983643, + "learning_rate": 3.788181983077848e-06, + "loss": 0.1063, + "num_input_tokens_seen": 14540584, + "step": 7424 + }, + { + "epoch": 0.9840954274353877, + "grad_norm": 14.518895149230957, + "learning_rate": 3.7878844648824814e-06, + "loss": 0.4472, + "num_input_tokens_seen": 14542464, + "step": 7425 + }, + { + "epoch": 0.9842279655400927, + "grad_norm": 8.088200569152832, + "learning_rate": 3.7875869218558136e-06, + "loss": 0.3527, + "num_input_tokens_seen": 14544000, + "step": 7426 + }, + { + "epoch": 0.9843605036447979, + "grad_norm": 10.399395942687988, + "learning_rate": 3.7872893540035815e-06, + "loss": 0.2184, + "num_input_tokens_seen": 14546808, + "step": 7427 + }, + { + "epoch": 0.984493041749503, + "grad_norm": 0.08218420296907425, + "learning_rate": 3.7869917613315228e-06, + "loss": 0.0005, + "num_input_tokens_seen": 14548744, + "step": 7428 + }, + { + "epoch": 0.9846255798542081, + "grad_norm": 0.10341787338256836, + "learning_rate": 3.7866941438453754e-06, + "loss": 0.0007, + "num_input_tokens_seen": 14550472, + "step": 7429 + }, + { + "epoch": 0.9847581179589132, + "grad_norm": 0.025562381371855736, + "learning_rate": 3.786396501550877e-06, + "loss": 0.0002, + "num_input_tokens_seen": 14551608, + "step": 7430 + }, + { + "epoch": 0.9848906560636183, + "grad_norm": 9.983819961547852, + "learning_rate": 3.7860988344537664e-06, + "loss": 0.2907, + "num_input_tokens_seen": 14553696, + "step": 7431 + }, + { + "epoch": 0.9850231941683234, + "grad_norm": 2.680049419403076, + "learning_rate": 3.7858011425597837e-06, + "loss": 0.0389, + "num_input_tokens_seen": 14555640, + "step": 7432 + }, + { + "epoch": 0.9851557322730286, + "grad_norm": 6.576312065124512, + "learning_rate": 3.785503425874667e-06, + "loss": 0.2179, + "num_input_tokens_seen": 14558208, + "step": 7433 + }, + { + "epoch": 0.9852882703777336, + "grad_norm": 0.42594072222709656, + "learning_rate": 3.785205684404158e-06, + "loss": 0.0024, + "num_input_tokens_seen": 14559512, + "step": 7434 + }, + { + "epoch": 0.9854208084824387, + "grad_norm": 5.457864284515381, + "learning_rate": 3.784907918153997e-06, + "loss": 0.1723, + "num_input_tokens_seen": 14561336, + "step": 7435 + }, + { + "epoch": 0.9855533465871438, + "grad_norm": 8.544934272766113, + "learning_rate": 3.784610127129924e-06, + "loss": 0.1595, + "num_input_tokens_seen": 14563224, + "step": 7436 + }, + { + "epoch": 0.9856858846918489, + "grad_norm": 4.888466835021973, + "learning_rate": 3.784312311337682e-06, + "loss": 0.0545, + "num_input_tokens_seen": 14565096, + "step": 7437 + }, + { + "epoch": 0.985818422796554, + "grad_norm": 0.1505114585161209, + "learning_rate": 3.784014470783014e-06, + "loss": 0.001, + "num_input_tokens_seen": 14567080, + "step": 7438 + }, + { + "epoch": 0.9859509609012591, + "grad_norm": 15.188621520996094, + "learning_rate": 3.783716605471659e-06, + "loss": 0.3375, + "num_input_tokens_seen": 14568688, + "step": 7439 + }, + { + "epoch": 0.9860834990059643, + "grad_norm": 6.947436332702637, + "learning_rate": 3.7834187154093637e-06, + "loss": 0.1894, + "num_input_tokens_seen": 14570584, + "step": 7440 + }, + { + "epoch": 0.9862160371106693, + "grad_norm": 0.024744799360632896, + "learning_rate": 3.7831208006018693e-06, + "loss": 0.0002, + "num_input_tokens_seen": 14572592, + "step": 7441 + }, + { + "epoch": 0.9863485752153744, + "grad_norm": 0.05840403214097023, + "learning_rate": 3.782822861054921e-06, + "loss": 0.0004, + "num_input_tokens_seen": 14574264, + "step": 7442 + }, + { + "epoch": 0.9864811133200795, + "grad_norm": 20.0048885345459, + "learning_rate": 3.7825248967742634e-06, + "loss": 0.5631, + "num_input_tokens_seen": 14575400, + "step": 7443 + }, + { + "epoch": 0.9866136514247846, + "grad_norm": 6.002795219421387, + "learning_rate": 3.782226907765641e-06, + "loss": 0.0682, + "num_input_tokens_seen": 14576472, + "step": 7444 + }, + { + "epoch": 0.9867461895294898, + "grad_norm": 7.828094005584717, + "learning_rate": 3.781928894034799e-06, + "loss": 0.2006, + "num_input_tokens_seen": 14577896, + "step": 7445 + }, + { + "epoch": 0.9868787276341948, + "grad_norm": 4.702576637268066, + "learning_rate": 3.781630855587483e-06, + "loss": 0.0451, + "num_input_tokens_seen": 14579408, + "step": 7446 + }, + { + "epoch": 0.9870112657389, + "grad_norm": 19.22153663635254, + "learning_rate": 3.7813327924294407e-06, + "loss": 0.5378, + "num_input_tokens_seen": 14581624, + "step": 7447 + }, + { + "epoch": 0.987143803843605, + "grad_norm": 8.9507474899292, + "learning_rate": 3.781034704566418e-06, + "loss": 0.1927, + "num_input_tokens_seen": 14584240, + "step": 7448 + }, + { + "epoch": 0.9872763419483102, + "grad_norm": 0.04042766988277435, + "learning_rate": 3.7807365920041626e-06, + "loss": 0.0003, + "num_input_tokens_seen": 14585568, + "step": 7449 + }, + { + "epoch": 0.9874088800530152, + "grad_norm": 0.08263298869132996, + "learning_rate": 3.780438454748422e-06, + "loss": 0.0006, + "num_input_tokens_seen": 14587136, + "step": 7450 + }, + { + "epoch": 0.9875414181577203, + "grad_norm": 10.530470848083496, + "learning_rate": 3.780140292804945e-06, + "loss": 0.3976, + "num_input_tokens_seen": 14588712, + "step": 7451 + }, + { + "epoch": 0.9876739562624255, + "grad_norm": 7.5321269035339355, + "learning_rate": 3.7798421061794794e-06, + "loss": 0.1258, + "num_input_tokens_seen": 14590736, + "step": 7452 + }, + { + "epoch": 0.9878064943671305, + "grad_norm": 5.495265960693359, + "learning_rate": 3.7795438948777758e-06, + "loss": 0.2552, + "num_input_tokens_seen": 14592536, + "step": 7453 + }, + { + "epoch": 0.9879390324718357, + "grad_norm": 0.05409146845340729, + "learning_rate": 3.779245658905582e-06, + "loss": 0.0004, + "num_input_tokens_seen": 14594312, + "step": 7454 + }, + { + "epoch": 0.9880715705765407, + "grad_norm": 0.059843823313713074, + "learning_rate": 3.778947398268651e-06, + "loss": 0.0004, + "num_input_tokens_seen": 14596576, + "step": 7455 + }, + { + "epoch": 0.9882041086812459, + "grad_norm": 13.64786148071289, + "learning_rate": 3.778649112972731e-06, + "loss": 0.5459, + "num_input_tokens_seen": 14598504, + "step": 7456 + }, + { + "epoch": 0.988336646785951, + "grad_norm": 7.324364185333252, + "learning_rate": 3.778350803023574e-06, + "loss": 0.1903, + "num_input_tokens_seen": 14600984, + "step": 7457 + }, + { + "epoch": 0.988469184890656, + "grad_norm": 10.663796424865723, + "learning_rate": 3.7780524684269315e-06, + "loss": 0.2283, + "num_input_tokens_seen": 14603144, + "step": 7458 + }, + { + "epoch": 0.9886017229953612, + "grad_norm": 0.2375771701335907, + "learning_rate": 3.7777541091885562e-06, + "loss": 0.0017, + "num_input_tokens_seen": 14605440, + "step": 7459 + }, + { + "epoch": 0.9887342611000662, + "grad_norm": 0.06719999015331268, + "learning_rate": 3.7774557253141997e-06, + "loss": 0.0005, + "num_input_tokens_seen": 14606984, + "step": 7460 + }, + { + "epoch": 0.9888667992047714, + "grad_norm": 1.729261040687561, + "learning_rate": 3.7771573168096155e-06, + "loss": 0.0402, + "num_input_tokens_seen": 14608856, + "step": 7461 + }, + { + "epoch": 0.9889993373094764, + "grad_norm": 7.544516086578369, + "learning_rate": 3.7768588836805573e-06, + "loss": 0.1204, + "num_input_tokens_seen": 14610928, + "step": 7462 + }, + { + "epoch": 0.9891318754141816, + "grad_norm": 12.71238899230957, + "learning_rate": 3.7765604259327792e-06, + "loss": 0.2894, + "num_input_tokens_seen": 14612816, + "step": 7463 + }, + { + "epoch": 0.9892644135188867, + "grad_norm": 5.414800643920898, + "learning_rate": 3.7762619435720355e-06, + "loss": 0.2036, + "num_input_tokens_seen": 14614784, + "step": 7464 + }, + { + "epoch": 0.9893969516235918, + "grad_norm": 0.1273748278617859, + "learning_rate": 3.775963436604081e-06, + "loss": 0.0009, + "num_input_tokens_seen": 14616256, + "step": 7465 + }, + { + "epoch": 0.9895294897282969, + "grad_norm": 11.797792434692383, + "learning_rate": 3.7756649050346705e-06, + "loss": 0.3017, + "num_input_tokens_seen": 14618832, + "step": 7466 + }, + { + "epoch": 0.989662027833002, + "grad_norm": 0.18892276287078857, + "learning_rate": 3.7753663488695613e-06, + "loss": 0.0012, + "num_input_tokens_seen": 14620072, + "step": 7467 + }, + { + "epoch": 0.9897945659377071, + "grad_norm": 4.78010368347168, + "learning_rate": 3.775067768114509e-06, + "loss": 0.1914, + "num_input_tokens_seen": 14622912, + "step": 7468 + }, + { + "epoch": 0.9899271040424122, + "grad_norm": 0.2012818455696106, + "learning_rate": 3.77476916277527e-06, + "loss": 0.0014, + "num_input_tokens_seen": 14624600, + "step": 7469 + }, + { + "epoch": 0.9900596421471173, + "grad_norm": 5.991137504577637, + "learning_rate": 3.774470532857603e-06, + "loss": 0.2401, + "num_input_tokens_seen": 14626392, + "step": 7470 + }, + { + "epoch": 0.9901921802518224, + "grad_norm": 0.36517441272735596, + "learning_rate": 3.7741718783672647e-06, + "loss": 0.0027, + "num_input_tokens_seen": 14629336, + "step": 7471 + }, + { + "epoch": 0.9903247183565275, + "grad_norm": 0.07761472463607788, + "learning_rate": 3.773873199310013e-06, + "loss": 0.0005, + "num_input_tokens_seen": 14630952, + "step": 7472 + }, + { + "epoch": 0.9904572564612326, + "grad_norm": 9.61211109161377, + "learning_rate": 3.7735744956916076e-06, + "loss": 0.2465, + "num_input_tokens_seen": 14634272, + "step": 7473 + }, + { + "epoch": 0.9905897945659377, + "grad_norm": 5.017262935638428, + "learning_rate": 3.7732757675178076e-06, + "loss": 0.0618, + "num_input_tokens_seen": 14636160, + "step": 7474 + }, + { + "epoch": 0.9907223326706428, + "grad_norm": 0.28648558259010315, + "learning_rate": 3.7729770147943725e-06, + "loss": 0.0021, + "num_input_tokens_seen": 14639392, + "step": 7475 + }, + { + "epoch": 0.990854870775348, + "grad_norm": 5.11911678314209, + "learning_rate": 3.772678237527062e-06, + "loss": 0.0593, + "num_input_tokens_seen": 14641456, + "step": 7476 + }, + { + "epoch": 0.990987408880053, + "grad_norm": 2.148871898651123, + "learning_rate": 3.772379435721637e-06, + "loss": 0.0628, + "num_input_tokens_seen": 14643472, + "step": 7477 + }, + { + "epoch": 0.9911199469847581, + "grad_norm": 6.179117679595947, + "learning_rate": 3.7720806093838585e-06, + "loss": 0.1541, + "num_input_tokens_seen": 14645392, + "step": 7478 + }, + { + "epoch": 0.9912524850894632, + "grad_norm": 2.1901626586914062, + "learning_rate": 3.771781758519489e-06, + "loss": 0.0166, + "num_input_tokens_seen": 14646696, + "step": 7479 + }, + { + "epoch": 0.9913850231941683, + "grad_norm": 17.649974822998047, + "learning_rate": 3.771482883134289e-06, + "loss": 0.4598, + "num_input_tokens_seen": 14648464, + "step": 7480 + }, + { + "epoch": 0.9915175612988735, + "grad_norm": 3.9150497913360596, + "learning_rate": 3.7711839832340225e-06, + "loss": 0.0746, + "num_input_tokens_seen": 14650584, + "step": 7481 + }, + { + "epoch": 0.9916500994035785, + "grad_norm": 0.09354069083929062, + "learning_rate": 3.7708850588244515e-06, + "loss": 0.0006, + "num_input_tokens_seen": 14652304, + "step": 7482 + }, + { + "epoch": 0.9917826375082837, + "grad_norm": 0.11603660136461258, + "learning_rate": 3.7705861099113404e-06, + "loss": 0.0008, + "num_input_tokens_seen": 14654304, + "step": 7483 + }, + { + "epoch": 0.9919151756129887, + "grad_norm": 0.11128681898117065, + "learning_rate": 3.770287136500452e-06, + "loss": 0.0008, + "num_input_tokens_seen": 14655640, + "step": 7484 + }, + { + "epoch": 0.9920477137176938, + "grad_norm": 0.0486735925078392, + "learning_rate": 3.769988138597551e-06, + "loss": 0.0003, + "num_input_tokens_seen": 14657024, + "step": 7485 + }, + { + "epoch": 0.9921802518223989, + "grad_norm": 15.03048038482666, + "learning_rate": 3.7696891162084036e-06, + "loss": 0.4605, + "num_input_tokens_seen": 14658672, + "step": 7486 + }, + { + "epoch": 0.992312789927104, + "grad_norm": 7.816737651824951, + "learning_rate": 3.769390069338773e-06, + "loss": 0.1599, + "num_input_tokens_seen": 14659920, + "step": 7487 + }, + { + "epoch": 0.9924453280318092, + "grad_norm": 0.13302229344844818, + "learning_rate": 3.7690909979944267e-06, + "loss": 0.0009, + "num_input_tokens_seen": 14661344, + "step": 7488 + }, + { + "epoch": 0.9925778661365142, + "grad_norm": 7.35618782043457, + "learning_rate": 3.7687919021811302e-06, + "loss": 0.1565, + "num_input_tokens_seen": 14663872, + "step": 7489 + }, + { + "epoch": 0.9927104042412194, + "grad_norm": 8.849679946899414, + "learning_rate": 3.7684927819046515e-06, + "loss": 0.1941, + "num_input_tokens_seen": 14665288, + "step": 7490 + }, + { + "epoch": 0.9928429423459244, + "grad_norm": 6.646866798400879, + "learning_rate": 3.768193637170756e-06, + "loss": 0.2763, + "num_input_tokens_seen": 14667384, + "step": 7491 + }, + { + "epoch": 0.9929754804506296, + "grad_norm": 0.05182486027479172, + "learning_rate": 3.7678944679852127e-06, + "loss": 0.0004, + "num_input_tokens_seen": 14668752, + "step": 7492 + }, + { + "epoch": 0.9931080185553347, + "grad_norm": 8.893109321594238, + "learning_rate": 3.7675952743537885e-06, + "loss": 0.2543, + "num_input_tokens_seen": 14670968, + "step": 7493 + }, + { + "epoch": 0.9932405566600397, + "grad_norm": 0.23408369719982147, + "learning_rate": 3.7672960562822543e-06, + "loss": 0.0014, + "num_input_tokens_seen": 14672560, + "step": 7494 + }, + { + "epoch": 0.9933730947647449, + "grad_norm": 2.355358362197876, + "learning_rate": 3.7669968137763767e-06, + "loss": 0.0284, + "num_input_tokens_seen": 14675672, + "step": 7495 + }, + { + "epoch": 0.9935056328694499, + "grad_norm": 0.159286230802536, + "learning_rate": 3.7666975468419263e-06, + "loss": 0.0011, + "num_input_tokens_seen": 14677336, + "step": 7496 + }, + { + "epoch": 0.9936381709741551, + "grad_norm": 5.72774600982666, + "learning_rate": 3.7663982554846746e-06, + "loss": 0.1427, + "num_input_tokens_seen": 14680616, + "step": 7497 + }, + { + "epoch": 0.9937707090788602, + "grad_norm": 0.07825324684381485, + "learning_rate": 3.7660989397103903e-06, + "loss": 0.0005, + "num_input_tokens_seen": 14681768, + "step": 7498 + }, + { + "epoch": 0.9939032471835653, + "grad_norm": 10.577095031738281, + "learning_rate": 3.7657995995248453e-06, + "loss": 0.3354, + "num_input_tokens_seen": 14683856, + "step": 7499 + }, + { + "epoch": 0.9940357852882704, + "grad_norm": 5.286223888397217, + "learning_rate": 3.7655002349338106e-06, + "loss": 0.2151, + "num_input_tokens_seen": 14686704, + "step": 7500 + }, + { + "epoch": 0.9941683233929755, + "grad_norm": 1.5178208351135254, + "learning_rate": 3.7652008459430576e-06, + "loss": 0.0082, + "num_input_tokens_seen": 14689808, + "step": 7501 + }, + { + "epoch": 0.9943008614976806, + "grad_norm": 8.504348754882812, + "learning_rate": 3.7649014325583597e-06, + "loss": 0.3122, + "num_input_tokens_seen": 14691472, + "step": 7502 + }, + { + "epoch": 0.9944333996023856, + "grad_norm": 3.3781087398529053, + "learning_rate": 3.7646019947854907e-06, + "loss": 0.0412, + "num_input_tokens_seen": 14693448, + "step": 7503 + }, + { + "epoch": 0.9945659377070908, + "grad_norm": 3.832814931869507, + "learning_rate": 3.7643025326302217e-06, + "loss": 0.042, + "num_input_tokens_seen": 14696440, + "step": 7504 + }, + { + "epoch": 0.9946984758117959, + "grad_norm": 7.151188373565674, + "learning_rate": 3.7640030460983284e-06, + "loss": 0.138, + "num_input_tokens_seen": 14697888, + "step": 7505 + }, + { + "epoch": 0.994831013916501, + "grad_norm": 0.24960653483867645, + "learning_rate": 3.763703535195584e-06, + "loss": 0.0017, + "num_input_tokens_seen": 14700832, + "step": 7506 + }, + { + "epoch": 0.9949635520212061, + "grad_norm": 13.948338508605957, + "learning_rate": 3.7634039999277633e-06, + "loss": 0.5592, + "num_input_tokens_seen": 14703040, + "step": 7507 + }, + { + "epoch": 0.9950960901259112, + "grad_norm": 2.88840651512146, + "learning_rate": 3.763104440300643e-06, + "loss": 0.0549, + "num_input_tokens_seen": 14704864, + "step": 7508 + }, + { + "epoch": 0.9952286282306163, + "grad_norm": 6.21791934967041, + "learning_rate": 3.7628048563199972e-06, + "loss": 0.1545, + "num_input_tokens_seen": 14707072, + "step": 7509 + }, + { + "epoch": 0.9953611663353215, + "grad_norm": 7.89548397064209, + "learning_rate": 3.762505247991601e-06, + "loss": 0.0699, + "num_input_tokens_seen": 14710048, + "step": 7510 + }, + { + "epoch": 0.9954937044400265, + "grad_norm": 4.993197917938232, + "learning_rate": 3.762205615321235e-06, + "loss": 0.0661, + "num_input_tokens_seen": 14712016, + "step": 7511 + }, + { + "epoch": 0.9956262425447316, + "grad_norm": 4.341444969177246, + "learning_rate": 3.7619059583146733e-06, + "loss": 0.0835, + "num_input_tokens_seen": 14714328, + "step": 7512 + }, + { + "epoch": 0.9957587806494367, + "grad_norm": 13.486825942993164, + "learning_rate": 3.761606276977694e-06, + "loss": 0.4369, + "num_input_tokens_seen": 14716664, + "step": 7513 + }, + { + "epoch": 0.9958913187541418, + "grad_norm": 7.296512126922607, + "learning_rate": 3.7613065713160747e-06, + "loss": 0.0684, + "num_input_tokens_seen": 14718216, + "step": 7514 + }, + { + "epoch": 0.9960238568588469, + "grad_norm": 0.5210379958152771, + "learning_rate": 3.7610068413355955e-06, + "loss": 0.0033, + "num_input_tokens_seen": 14719872, + "step": 7515 + }, + { + "epoch": 0.996156394963552, + "grad_norm": 11.074482917785645, + "learning_rate": 3.760707087042034e-06, + "loss": 0.32, + "num_input_tokens_seen": 14721784, + "step": 7516 + }, + { + "epoch": 0.9962889330682572, + "grad_norm": 10.787108421325684, + "learning_rate": 3.76040730844117e-06, + "loss": 0.3002, + "num_input_tokens_seen": 14723480, + "step": 7517 + }, + { + "epoch": 0.9964214711729622, + "grad_norm": 2.3107287883758545, + "learning_rate": 3.7601075055387848e-06, + "loss": 0.0382, + "num_input_tokens_seen": 14725608, + "step": 7518 + }, + { + "epoch": 0.9965540092776674, + "grad_norm": 0.15647807717323303, + "learning_rate": 3.7598076783406556e-06, + "loss": 0.0011, + "num_input_tokens_seen": 14727472, + "step": 7519 + }, + { + "epoch": 0.9966865473823724, + "grad_norm": 15.560263633728027, + "learning_rate": 3.7595078268525666e-06, + "loss": 0.2044, + "num_input_tokens_seen": 14728920, + "step": 7520 + }, + { + "epoch": 0.9968190854870775, + "grad_norm": 1.11007559299469, + "learning_rate": 3.759207951080298e-06, + "loss": 0.0074, + "num_input_tokens_seen": 14730816, + "step": 7521 + }, + { + "epoch": 0.9969516235917827, + "grad_norm": 10.53654956817627, + "learning_rate": 3.7589080510296307e-06, + "loss": 0.2754, + "num_input_tokens_seen": 14732432, + "step": 7522 + }, + { + "epoch": 0.9970841616964877, + "grad_norm": 0.5345222353935242, + "learning_rate": 3.758608126706348e-06, + "loss": 0.0108, + "num_input_tokens_seen": 14733976, + "step": 7523 + }, + { + "epoch": 0.9972166998011929, + "grad_norm": 3.964815139770508, + "learning_rate": 3.7583081781162323e-06, + "loss": 0.0365, + "num_input_tokens_seen": 14735856, + "step": 7524 + }, + { + "epoch": 0.9973492379058979, + "grad_norm": 4.957132816314697, + "learning_rate": 3.758008205265067e-06, + "loss": 0.1353, + "num_input_tokens_seen": 14737896, + "step": 7525 + }, + { + "epoch": 0.9974817760106031, + "grad_norm": 16.13467788696289, + "learning_rate": 3.757708208158636e-06, + "loss": 0.5086, + "num_input_tokens_seen": 14741200, + "step": 7526 + }, + { + "epoch": 0.9976143141153081, + "grad_norm": 5.856666088104248, + "learning_rate": 3.7574081868027218e-06, + "loss": 0.1209, + "num_input_tokens_seen": 14743688, + "step": 7527 + }, + { + "epoch": 0.9977468522200132, + "grad_norm": 20.14527130126953, + "learning_rate": 3.7571081412031106e-06, + "loss": 0.5185, + "num_input_tokens_seen": 14745240, + "step": 7528 + }, + { + "epoch": 0.9978793903247184, + "grad_norm": 14.533098220825195, + "learning_rate": 3.756808071365588e-06, + "loss": 0.3847, + "num_input_tokens_seen": 14746448, + "step": 7529 + }, + { + "epoch": 0.9980119284294234, + "grad_norm": 9.56916618347168, + "learning_rate": 3.756507977295939e-06, + "loss": 0.2758, + "num_input_tokens_seen": 14748504, + "step": 7530 + }, + { + "epoch": 0.9981444665341286, + "grad_norm": 5.491872787475586, + "learning_rate": 3.7562078589999483e-06, + "loss": 0.1232, + "num_input_tokens_seen": 14750664, + "step": 7531 + }, + { + "epoch": 0.9982770046388336, + "grad_norm": 0.10833562910556793, + "learning_rate": 3.7559077164834046e-06, + "loss": 0.0005, + "num_input_tokens_seen": 14752144, + "step": 7532 + }, + { + "epoch": 0.9984095427435388, + "grad_norm": 0.10356461256742477, + "learning_rate": 3.7556075497520927e-06, + "loss": 0.0007, + "num_input_tokens_seen": 14754128, + "step": 7533 + }, + { + "epoch": 0.9985420808482439, + "grad_norm": 0.4142891466617584, + "learning_rate": 3.7553073588118015e-06, + "loss": 0.0023, + "num_input_tokens_seen": 14755640, + "step": 7534 + }, + { + "epoch": 0.998674618952949, + "grad_norm": 7.838845252990723, + "learning_rate": 3.755007143668319e-06, + "loss": 0.0988, + "num_input_tokens_seen": 14757840, + "step": 7535 + }, + { + "epoch": 0.9988071570576541, + "grad_norm": 10.540555000305176, + "learning_rate": 3.754706904327432e-06, + "loss": 0.1365, + "num_input_tokens_seen": 14759464, + "step": 7536 + }, + { + "epoch": 0.9989396951623591, + "grad_norm": 0.06858278810977936, + "learning_rate": 3.75440664079493e-06, + "loss": 0.0005, + "num_input_tokens_seen": 14760736, + "step": 7537 + }, + { + "epoch": 0.9990722332670643, + "grad_norm": 0.12629128992557526, + "learning_rate": 3.7541063530766042e-06, + "loss": 0.0008, + "num_input_tokens_seen": 14762792, + "step": 7538 + }, + { + "epoch": 0.9992047713717693, + "grad_norm": 12.637260437011719, + "learning_rate": 3.7538060411782417e-06, + "loss": 0.3821, + "num_input_tokens_seen": 14764696, + "step": 7539 + }, + { + "epoch": 0.9993373094764745, + "grad_norm": 6.717950344085693, + "learning_rate": 3.7535057051056335e-06, + "loss": 0.1089, + "num_input_tokens_seen": 14766944, + "step": 7540 + }, + { + "epoch": 0.9994698475811796, + "grad_norm": 14.358124732971191, + "learning_rate": 3.753205344864571e-06, + "loss": 0.506, + "num_input_tokens_seen": 14768648, + "step": 7541 + }, + { + "epoch": 0.9996023856858847, + "grad_norm": 3.183262825012207, + "learning_rate": 3.752904960460845e-06, + "loss": 0.0535, + "num_input_tokens_seen": 14770392, + "step": 7542 + }, + { + "epoch": 0.9997349237905898, + "grad_norm": 7.213214874267578, + "learning_rate": 3.7526045519002473e-06, + "loss": 0.1266, + "num_input_tokens_seen": 14772192, + "step": 7543 + }, + { + "epoch": 0.9998674618952949, + "grad_norm": 1.4316110610961914, + "learning_rate": 3.7523041191885697e-06, + "loss": 0.0092, + "num_input_tokens_seen": 14773760, + "step": 7544 + }, + { + "epoch": 1.0, + "grad_norm": 12.919032096862793, + "learning_rate": 3.7520036623316034e-06, + "loss": 0.2122, + "num_input_tokens_seen": 14775328, + "step": 7545 + }, + { + "epoch": 1.000132538104705, + "grad_norm": 9.438655853271484, + "learning_rate": 3.7517031813351447e-06, + "loss": 0.1873, + "num_input_tokens_seen": 14777576, + "step": 7546 + }, + { + "epoch": 1.0002650762094103, + "grad_norm": 0.2385670244693756, + "learning_rate": 3.751402676204984e-06, + "loss": 0.0016, + "num_input_tokens_seen": 14779816, + "step": 7547 + }, + { + "epoch": 1.0003976143141153, + "grad_norm": 2.6697463989257812, + "learning_rate": 3.7511021469469167e-06, + "loss": 0.0116, + "num_input_tokens_seen": 14780760, + "step": 7548 + }, + { + "epoch": 1.0005301524188204, + "grad_norm": 13.829483985900879, + "learning_rate": 3.7508015935667365e-06, + "loss": 0.3487, + "num_input_tokens_seen": 14782984, + "step": 7549 + }, + { + "epoch": 1.0006626905235254, + "grad_norm": 1.0328798294067383, + "learning_rate": 3.750501016070239e-06, + "loss": 0.0099, + "num_input_tokens_seen": 14785544, + "step": 7550 + }, + { + "epoch": 1.0007952286282307, + "grad_norm": 14.574694633483887, + "learning_rate": 3.75020041446322e-06, + "loss": 0.5077, + "num_input_tokens_seen": 14788224, + "step": 7551 + }, + { + "epoch": 1.0009277667329357, + "grad_norm": 1.4466869831085205, + "learning_rate": 3.749899788751473e-06, + "loss": 0.0153, + "num_input_tokens_seen": 14789808, + "step": 7552 + }, + { + "epoch": 1.0010603048376407, + "grad_norm": 0.11413199454545975, + "learning_rate": 3.749599138940797e-06, + "loss": 0.0007, + "num_input_tokens_seen": 14791200, + "step": 7553 + }, + { + "epoch": 1.001192842942346, + "grad_norm": 1.0611664056777954, + "learning_rate": 3.7492984650369866e-06, + "loss": 0.0109, + "num_input_tokens_seen": 14792824, + "step": 7554 + }, + { + "epoch": 1.001325381047051, + "grad_norm": 14.517199516296387, + "learning_rate": 3.7489977670458406e-06, + "loss": 0.2872, + "num_input_tokens_seen": 14796568, + "step": 7555 + }, + { + "epoch": 1.001457919151756, + "grad_norm": 9.657989501953125, + "learning_rate": 3.7486970449731562e-06, + "loss": 0.1389, + "num_input_tokens_seen": 14798464, + "step": 7556 + }, + { + "epoch": 1.0015904572564613, + "grad_norm": 5.662469863891602, + "learning_rate": 3.7483962988247313e-06, + "loss": 0.0898, + "num_input_tokens_seen": 14800520, + "step": 7557 + }, + { + "epoch": 1.0017229953611664, + "grad_norm": 5.8557586669921875, + "learning_rate": 3.748095528606364e-06, + "loss": 0.0581, + "num_input_tokens_seen": 14802096, + "step": 7558 + }, + { + "epoch": 1.0018555334658714, + "grad_norm": 7.384589672088623, + "learning_rate": 3.7477947343238543e-06, + "loss": 0.1085, + "num_input_tokens_seen": 14804744, + "step": 7559 + }, + { + "epoch": 1.0019880715705765, + "grad_norm": 2.18312668800354, + "learning_rate": 3.7474939159830014e-06, + "loss": 0.0379, + "num_input_tokens_seen": 14806072, + "step": 7560 + }, + { + "epoch": 1.0021206096752817, + "grad_norm": 5.052554130554199, + "learning_rate": 3.7471930735896045e-06, + "loss": 0.0745, + "num_input_tokens_seen": 14808480, + "step": 7561 + }, + { + "epoch": 1.0022531477799868, + "grad_norm": 5.2918381690979, + "learning_rate": 3.7468922071494656e-06, + "loss": 0.0824, + "num_input_tokens_seen": 14810424, + "step": 7562 + }, + { + "epoch": 1.0023856858846918, + "grad_norm": 3.529203414916992, + "learning_rate": 3.746591316668384e-06, + "loss": 0.0737, + "num_input_tokens_seen": 14812000, + "step": 7563 + }, + { + "epoch": 1.002518223989397, + "grad_norm": 6.5588812828063965, + "learning_rate": 3.746290402152163e-06, + "loss": 0.089, + "num_input_tokens_seen": 14814024, + "step": 7564 + }, + { + "epoch": 1.002650762094102, + "grad_norm": 0.3529358208179474, + "learning_rate": 3.7459894636066024e-06, + "loss": 0.0022, + "num_input_tokens_seen": 14816024, + "step": 7565 + }, + { + "epoch": 1.0027833001988071, + "grad_norm": 1.5878340005874634, + "learning_rate": 3.7456885010375054e-06, + "loss": 0.0127, + "num_input_tokens_seen": 14817784, + "step": 7566 + }, + { + "epoch": 1.0029158383035122, + "grad_norm": 9.358026504516602, + "learning_rate": 3.7453875144506745e-06, + "loss": 0.1492, + "num_input_tokens_seen": 14820120, + "step": 7567 + }, + { + "epoch": 1.0030483764082174, + "grad_norm": 0.022215483710169792, + "learning_rate": 3.7450865038519133e-06, + "loss": 0.0001, + "num_input_tokens_seen": 14821744, + "step": 7568 + }, + { + "epoch": 1.0031809145129225, + "grad_norm": 9.642528533935547, + "learning_rate": 3.744785469247026e-06, + "loss": 0.1921, + "num_input_tokens_seen": 14823360, + "step": 7569 + }, + { + "epoch": 1.0033134526176275, + "grad_norm": 8.7027587890625, + "learning_rate": 3.744484410641816e-06, + "loss": 0.2407, + "num_input_tokens_seen": 14824888, + "step": 7570 + }, + { + "epoch": 1.0034459907223328, + "grad_norm": 6.996974468231201, + "learning_rate": 3.7441833280420875e-06, + "loss": 0.0911, + "num_input_tokens_seen": 14827016, + "step": 7571 + }, + { + "epoch": 1.0035785288270378, + "grad_norm": 1.7538390159606934, + "learning_rate": 3.7438822214536463e-06, + "loss": 0.0195, + "num_input_tokens_seen": 14828480, + "step": 7572 + }, + { + "epoch": 1.0037110669317428, + "grad_norm": 3.8951303958892822, + "learning_rate": 3.7435810908822977e-06, + "loss": 0.0134, + "num_input_tokens_seen": 14829848, + "step": 7573 + }, + { + "epoch": 1.0038436050364479, + "grad_norm": 10.267996788024902, + "learning_rate": 3.7432799363338477e-06, + "loss": 0.3677, + "num_input_tokens_seen": 14832168, + "step": 7574 + }, + { + "epoch": 1.0039761431411531, + "grad_norm": 3.218838930130005, + "learning_rate": 3.742978757814103e-06, + "loss": 0.0312, + "num_input_tokens_seen": 14833496, + "step": 7575 + }, + { + "epoch": 1.0041086812458582, + "grad_norm": 0.7289909720420837, + "learning_rate": 3.742677555328871e-06, + "loss": 0.0043, + "num_input_tokens_seen": 14834920, + "step": 7576 + }, + { + "epoch": 1.0042412193505632, + "grad_norm": 0.14638152718544006, + "learning_rate": 3.7423763288839577e-06, + "loss": 0.0009, + "num_input_tokens_seen": 14836416, + "step": 7577 + }, + { + "epoch": 1.0043737574552685, + "grad_norm": 11.662501335144043, + "learning_rate": 3.7420750784851717e-06, + "loss": 0.536, + "num_input_tokens_seen": 14839496, + "step": 7578 + }, + { + "epoch": 1.0045062955599735, + "grad_norm": 9.965180397033691, + "learning_rate": 3.741773804138321e-06, + "loss": 0.1082, + "num_input_tokens_seen": 14842344, + "step": 7579 + }, + { + "epoch": 1.0046388336646785, + "grad_norm": 3.89701771736145, + "learning_rate": 3.7414725058492155e-06, + "loss": 0.0635, + "num_input_tokens_seen": 14843720, + "step": 7580 + }, + { + "epoch": 1.0047713717693838, + "grad_norm": 0.7009601593017578, + "learning_rate": 3.741171183623663e-06, + "loss": 0.003, + "num_input_tokens_seen": 14846696, + "step": 7581 + }, + { + "epoch": 1.0049039098740888, + "grad_norm": 11.044825553894043, + "learning_rate": 3.7408698374674747e-06, + "loss": 0.1879, + "num_input_tokens_seen": 14848504, + "step": 7582 + }, + { + "epoch": 1.0050364479787939, + "grad_norm": 5.662332534790039, + "learning_rate": 3.7405684673864595e-06, + "loss": 0.1149, + "num_input_tokens_seen": 14850600, + "step": 7583 + }, + { + "epoch": 1.005168986083499, + "grad_norm": 0.02677895873785019, + "learning_rate": 3.7402670733864278e-06, + "loss": 0.0002, + "num_input_tokens_seen": 14851984, + "step": 7584 + }, + { + "epoch": 1.0053015241882042, + "grad_norm": 5.345312118530273, + "learning_rate": 3.7399656554731922e-06, + "loss": 0.0571, + "num_input_tokens_seen": 14854208, + "step": 7585 + }, + { + "epoch": 1.0054340622929092, + "grad_norm": 3.379507064819336, + "learning_rate": 3.7396642136525623e-06, + "loss": 0.0469, + "num_input_tokens_seen": 14856064, + "step": 7586 + }, + { + "epoch": 1.0055666003976143, + "grad_norm": 0.025146977975964546, + "learning_rate": 3.739362747930352e-06, + "loss": 0.0002, + "num_input_tokens_seen": 14857968, + "step": 7587 + }, + { + "epoch": 1.0056991385023195, + "grad_norm": 3.455627918243408, + "learning_rate": 3.7390612583123732e-06, + "loss": 0.0807, + "num_input_tokens_seen": 14860152, + "step": 7588 + }, + { + "epoch": 1.0058316766070245, + "grad_norm": 6.212761402130127, + "learning_rate": 3.7387597448044375e-06, + "loss": 0.1035, + "num_input_tokens_seen": 14861960, + "step": 7589 + }, + { + "epoch": 1.0059642147117296, + "grad_norm": 7.009047031402588, + "learning_rate": 3.7384582074123598e-06, + "loss": 0.1578, + "num_input_tokens_seen": 14864536, + "step": 7590 + }, + { + "epoch": 1.0060967528164346, + "grad_norm": 1.261563777923584, + "learning_rate": 3.738156646141954e-06, + "loss": 0.0304, + "num_input_tokens_seen": 14866160, + "step": 7591 + }, + { + "epoch": 1.0062292909211399, + "grad_norm": 0.06088895723223686, + "learning_rate": 3.7378550609990333e-06, + "loss": 0.0004, + "num_input_tokens_seen": 14868568, + "step": 7592 + }, + { + "epoch": 1.006361829025845, + "grad_norm": 9.468000411987305, + "learning_rate": 3.737553451989413e-06, + "loss": 0.0442, + "num_input_tokens_seen": 14869608, + "step": 7593 + }, + { + "epoch": 1.00649436713055, + "grad_norm": 3.9841530323028564, + "learning_rate": 3.737251819118909e-06, + "loss": 0.1112, + "num_input_tokens_seen": 14871552, + "step": 7594 + }, + { + "epoch": 1.0066269052352552, + "grad_norm": 0.03620518743991852, + "learning_rate": 3.7369501623933365e-06, + "loss": 0.0002, + "num_input_tokens_seen": 14873264, + "step": 7595 + }, + { + "epoch": 1.0067594433399603, + "grad_norm": 0.036536525934934616, + "learning_rate": 3.7366484818185107e-06, + "loss": 0.0002, + "num_input_tokens_seen": 14874568, + "step": 7596 + }, + { + "epoch": 1.0068919814446653, + "grad_norm": 6.898550510406494, + "learning_rate": 3.736346777400249e-06, + "loss": 0.1151, + "num_input_tokens_seen": 14876464, + "step": 7597 + }, + { + "epoch": 1.0070245195493703, + "grad_norm": 6.820957183837891, + "learning_rate": 3.73604504914437e-06, + "loss": 0.0808, + "num_input_tokens_seen": 14878056, + "step": 7598 + }, + { + "epoch": 1.0071570576540756, + "grad_norm": 0.1408001035451889, + "learning_rate": 3.7357432970566876e-06, + "loss": 0.0007, + "num_input_tokens_seen": 14880176, + "step": 7599 + }, + { + "epoch": 1.0072895957587806, + "grad_norm": 0.4174724519252777, + "learning_rate": 3.735441521143024e-06, + "loss": 0.0016, + "num_input_tokens_seen": 14882824, + "step": 7600 + }, + { + "epoch": 1.0074221338634857, + "grad_norm": 1.121027946472168, + "learning_rate": 3.7351397214091944e-06, + "loss": 0.0092, + "num_input_tokens_seen": 14884240, + "step": 7601 + }, + { + "epoch": 1.007554671968191, + "grad_norm": 8.860727310180664, + "learning_rate": 3.734837897861019e-06, + "loss": 0.1755, + "num_input_tokens_seen": 14885960, + "step": 7602 + }, + { + "epoch": 1.007687210072896, + "grad_norm": 2.6130781173706055, + "learning_rate": 3.734536050504317e-06, + "loss": 0.0841, + "num_input_tokens_seen": 14888584, + "step": 7603 + }, + { + "epoch": 1.007819748177601, + "grad_norm": 2.5570755004882812, + "learning_rate": 3.734234179344909e-06, + "loss": 0.0667, + "num_input_tokens_seen": 14890080, + "step": 7604 + }, + { + "epoch": 1.0079522862823063, + "grad_norm": 8.613471984863281, + "learning_rate": 3.733932284388614e-06, + "loss": 0.1941, + "num_input_tokens_seen": 14892016, + "step": 7605 + }, + { + "epoch": 1.0080848243870113, + "grad_norm": 9.126236915588379, + "learning_rate": 3.733630365641253e-06, + "loss": 0.1027, + "num_input_tokens_seen": 14894136, + "step": 7606 + }, + { + "epoch": 1.0082173624917163, + "grad_norm": 0.034967873245477676, + "learning_rate": 3.7333284231086483e-06, + "loss": 0.0002, + "num_input_tokens_seen": 14895464, + "step": 7607 + }, + { + "epoch": 1.0083499005964214, + "grad_norm": 0.4401995539665222, + "learning_rate": 3.73302645679662e-06, + "loss": 0.0028, + "num_input_tokens_seen": 14898384, + "step": 7608 + }, + { + "epoch": 1.0084824387011266, + "grad_norm": 0.10709373652935028, + "learning_rate": 3.7327244667109917e-06, + "loss": 0.0004, + "num_input_tokens_seen": 14899792, + "step": 7609 + }, + { + "epoch": 1.0086149768058317, + "grad_norm": 0.03993635252118111, + "learning_rate": 3.7324224528575846e-06, + "loss": 0.0002, + "num_input_tokens_seen": 14902144, + "step": 7610 + }, + { + "epoch": 1.0087475149105367, + "grad_norm": 3.902662515640259, + "learning_rate": 3.732120415242223e-06, + "loss": 0.0492, + "num_input_tokens_seen": 14904176, + "step": 7611 + }, + { + "epoch": 1.008880053015242, + "grad_norm": 8.799952507019043, + "learning_rate": 3.731818353870729e-06, + "loss": 0.3616, + "num_input_tokens_seen": 14907000, + "step": 7612 + }, + { + "epoch": 1.009012591119947, + "grad_norm": 0.5898030996322632, + "learning_rate": 3.7315162687489282e-06, + "loss": 0.0031, + "num_input_tokens_seen": 14908736, + "step": 7613 + }, + { + "epoch": 1.009145129224652, + "grad_norm": 1.6499249935150146, + "learning_rate": 3.731214159882644e-06, + "loss": 0.0172, + "num_input_tokens_seen": 14910712, + "step": 7614 + }, + { + "epoch": 1.009277667329357, + "grad_norm": 3.692089557647705, + "learning_rate": 3.7309120272777013e-06, + "loss": 0.1076, + "num_input_tokens_seen": 14912768, + "step": 7615 + }, + { + "epoch": 1.0094102054340623, + "grad_norm": 17.565773010253906, + "learning_rate": 3.7306098709399253e-06, + "loss": 0.2443, + "num_input_tokens_seen": 14914528, + "step": 7616 + }, + { + "epoch": 1.0095427435387674, + "grad_norm": 9.190149307250977, + "learning_rate": 3.7303076908751416e-06, + "loss": 0.2021, + "num_input_tokens_seen": 14916256, + "step": 7617 + }, + { + "epoch": 1.0096752816434724, + "grad_norm": 8.503251075744629, + "learning_rate": 3.730005487089178e-06, + "loss": 0.135, + "num_input_tokens_seen": 14917784, + "step": 7618 + }, + { + "epoch": 1.0098078197481777, + "grad_norm": 0.6125144362449646, + "learning_rate": 3.72970325958786e-06, + "loss": 0.0059, + "num_input_tokens_seen": 14918992, + "step": 7619 + }, + { + "epoch": 1.0099403578528827, + "grad_norm": 0.01596934162080288, + "learning_rate": 3.7294010083770136e-06, + "loss": 0.0001, + "num_input_tokens_seen": 14920360, + "step": 7620 + }, + { + "epoch": 1.0100728959575878, + "grad_norm": 3.808002233505249, + "learning_rate": 3.729098733462469e-06, + "loss": 0.1162, + "num_input_tokens_seen": 14922368, + "step": 7621 + }, + { + "epoch": 1.010205434062293, + "grad_norm": 3.6952250003814697, + "learning_rate": 3.728796434850052e-06, + "loss": 0.0635, + "num_input_tokens_seen": 14925104, + "step": 7622 + }, + { + "epoch": 1.010337972166998, + "grad_norm": 0.08713754266500473, + "learning_rate": 3.7284941125455928e-06, + "loss": 0.0005, + "num_input_tokens_seen": 14927376, + "step": 7623 + }, + { + "epoch": 1.010470510271703, + "grad_norm": 0.09904371201992035, + "learning_rate": 3.7281917665549194e-06, + "loss": 0.0003, + "num_input_tokens_seen": 14929336, + "step": 7624 + }, + { + "epoch": 1.0106030483764081, + "grad_norm": 9.295527458190918, + "learning_rate": 3.7278893968838607e-06, + "loss": 0.2358, + "num_input_tokens_seen": 14931888, + "step": 7625 + }, + { + "epoch": 1.0107355864811134, + "grad_norm": 14.037169456481934, + "learning_rate": 3.7275870035382476e-06, + "loss": 0.2825, + "num_input_tokens_seen": 14934144, + "step": 7626 + }, + { + "epoch": 1.0108681245858184, + "grad_norm": 8.949822425842285, + "learning_rate": 3.7272845865239116e-06, + "loss": 0.291, + "num_input_tokens_seen": 14936360, + "step": 7627 + }, + { + "epoch": 1.0110006626905235, + "grad_norm": 7.279786586761475, + "learning_rate": 3.7269821458466806e-06, + "loss": 0.1912, + "num_input_tokens_seen": 14938248, + "step": 7628 + }, + { + "epoch": 1.0111332007952287, + "grad_norm": 4.974860668182373, + "learning_rate": 3.7266796815123873e-06, + "loss": 0.1813, + "num_input_tokens_seen": 14941600, + "step": 7629 + }, + { + "epoch": 1.0112657388999338, + "grad_norm": 0.014519990421831608, + "learning_rate": 3.7263771935268644e-06, + "loss": 0.0001, + "num_input_tokens_seen": 14943520, + "step": 7630 + }, + { + "epoch": 1.0113982770046388, + "grad_norm": 8.698603630065918, + "learning_rate": 3.7260746818959433e-06, + "loss": 0.2853, + "num_input_tokens_seen": 14945936, + "step": 7631 + }, + { + "epoch": 1.0115308151093438, + "grad_norm": 1.3413982391357422, + "learning_rate": 3.7257721466254554e-06, + "loss": 0.013, + "num_input_tokens_seen": 14947696, + "step": 7632 + }, + { + "epoch": 1.011663353214049, + "grad_norm": 6.3997955322265625, + "learning_rate": 3.7254695877212355e-06, + "loss": 0.1012, + "num_input_tokens_seen": 14949544, + "step": 7633 + }, + { + "epoch": 1.0117958913187541, + "grad_norm": 0.04592238366603851, + "learning_rate": 3.7251670051891166e-06, + "loss": 0.0003, + "num_input_tokens_seen": 14951640, + "step": 7634 + }, + { + "epoch": 1.0119284294234592, + "grad_norm": 3.4023611545562744, + "learning_rate": 3.724864399034932e-06, + "loss": 0.043, + "num_input_tokens_seen": 14953720, + "step": 7635 + }, + { + "epoch": 1.0120609675281644, + "grad_norm": 5.359580993652344, + "learning_rate": 3.7245617692645174e-06, + "loss": 0.1993, + "num_input_tokens_seen": 14956408, + "step": 7636 + }, + { + "epoch": 1.0121935056328695, + "grad_norm": 6.454990863800049, + "learning_rate": 3.724259115883707e-06, + "loss": 0.0865, + "num_input_tokens_seen": 14958624, + "step": 7637 + }, + { + "epoch": 1.0123260437375745, + "grad_norm": 0.16809487342834473, + "learning_rate": 3.723956438898336e-06, + "loss": 0.0009, + "num_input_tokens_seen": 14960304, + "step": 7638 + }, + { + "epoch": 1.0124585818422795, + "grad_norm": 0.3803405463695526, + "learning_rate": 3.7236537383142417e-06, + "loss": 0.0019, + "num_input_tokens_seen": 14962224, + "step": 7639 + }, + { + "epoch": 1.0125911199469848, + "grad_norm": 6.1069207191467285, + "learning_rate": 3.723351014137258e-06, + "loss": 0.1377, + "num_input_tokens_seen": 14965000, + "step": 7640 + }, + { + "epoch": 1.0127236580516898, + "grad_norm": 16.45765495300293, + "learning_rate": 3.723048266373223e-06, + "loss": 0.3874, + "num_input_tokens_seen": 14966552, + "step": 7641 + }, + { + "epoch": 1.0128561961563949, + "grad_norm": 0.059850260615348816, + "learning_rate": 3.722745495027974e-06, + "loss": 0.0004, + "num_input_tokens_seen": 14969424, + "step": 7642 + }, + { + "epoch": 1.0129887342611001, + "grad_norm": 10.33269214630127, + "learning_rate": 3.7224427001073483e-06, + "loss": 0.1372, + "num_input_tokens_seen": 14971408, + "step": 7643 + }, + { + "epoch": 1.0131212723658052, + "grad_norm": 13.54920768737793, + "learning_rate": 3.722139881617184e-06, + "loss": 0.6083, + "num_input_tokens_seen": 14973544, + "step": 7644 + }, + { + "epoch": 1.0132538104705102, + "grad_norm": 0.17079365253448486, + "learning_rate": 3.7218370395633196e-06, + "loss": 0.001, + "num_input_tokens_seen": 14974904, + "step": 7645 + }, + { + "epoch": 1.0133863485752155, + "grad_norm": 5.43550968170166, + "learning_rate": 3.7215341739515937e-06, + "loss": 0.0836, + "num_input_tokens_seen": 14977312, + "step": 7646 + }, + { + "epoch": 1.0135188866799205, + "grad_norm": 2.850954055786133, + "learning_rate": 3.721231284787847e-06, + "loss": 0.0415, + "num_input_tokens_seen": 14979272, + "step": 7647 + }, + { + "epoch": 1.0136514247846256, + "grad_norm": 0.5837000012397766, + "learning_rate": 3.7209283720779175e-06, + "loss": 0.0037, + "num_input_tokens_seen": 14980456, + "step": 7648 + }, + { + "epoch": 1.0137839628893306, + "grad_norm": 3.5512311458587646, + "learning_rate": 3.7206254358276474e-06, + "loss": 0.0619, + "num_input_tokens_seen": 14982256, + "step": 7649 + }, + { + "epoch": 1.0139165009940359, + "grad_norm": 8.080368041992188, + "learning_rate": 3.7203224760428776e-06, + "loss": 0.1858, + "num_input_tokens_seen": 14985000, + "step": 7650 + }, + { + "epoch": 1.014049039098741, + "grad_norm": 7.836287975311279, + "learning_rate": 3.7200194927294473e-06, + "loss": 0.191, + "num_input_tokens_seen": 14987208, + "step": 7651 + }, + { + "epoch": 1.014181577203446, + "grad_norm": 1.292928695678711, + "learning_rate": 3.7197164858932e-06, + "loss": 0.0038, + "num_input_tokens_seen": 14988680, + "step": 7652 + }, + { + "epoch": 1.0143141153081512, + "grad_norm": 6.147712707519531, + "learning_rate": 3.719413455539978e-06, + "loss": 0.038, + "num_input_tokens_seen": 14990448, + "step": 7653 + }, + { + "epoch": 1.0144466534128562, + "grad_norm": 9.30418872833252, + "learning_rate": 3.719110401675623e-06, + "loss": 0.1737, + "num_input_tokens_seen": 14992576, + "step": 7654 + }, + { + "epoch": 1.0145791915175613, + "grad_norm": 0.08699267357587814, + "learning_rate": 3.718807324305978e-06, + "loss": 0.0006, + "num_input_tokens_seen": 14995440, + "step": 7655 + }, + { + "epoch": 1.0147117296222663, + "grad_norm": 6.959599018096924, + "learning_rate": 3.718504223436887e-06, + "loss": 0.2037, + "num_input_tokens_seen": 14997952, + "step": 7656 + }, + { + "epoch": 1.0148442677269716, + "grad_norm": 9.382429122924805, + "learning_rate": 3.7182010990741947e-06, + "loss": 0.1251, + "num_input_tokens_seen": 15000616, + "step": 7657 + }, + { + "epoch": 1.0149768058316766, + "grad_norm": 4.614858150482178, + "learning_rate": 3.7178979512237445e-06, + "loss": 0.2036, + "num_input_tokens_seen": 15003704, + "step": 7658 + }, + { + "epoch": 1.0151093439363816, + "grad_norm": 7.221208095550537, + "learning_rate": 3.7175947798913815e-06, + "loss": 0.1139, + "num_input_tokens_seen": 15005088, + "step": 7659 + }, + { + "epoch": 1.015241882041087, + "grad_norm": 6.505541801452637, + "learning_rate": 3.7172915850829515e-06, + "loss": 0.1367, + "num_input_tokens_seen": 15007376, + "step": 7660 + }, + { + "epoch": 1.015374420145792, + "grad_norm": 0.024377090856432915, + "learning_rate": 3.716988366804299e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15009008, + "step": 7661 + }, + { + "epoch": 1.015506958250497, + "grad_norm": 8.376495361328125, + "learning_rate": 3.7166851250612723e-06, + "loss": 0.1139, + "num_input_tokens_seen": 15010672, + "step": 7662 + }, + { + "epoch": 1.015639496355202, + "grad_norm": 0.04851066693663597, + "learning_rate": 3.716381859859717e-06, + "loss": 0.0003, + "num_input_tokens_seen": 15012160, + "step": 7663 + }, + { + "epoch": 1.0157720344599073, + "grad_norm": 10.797849655151367, + "learning_rate": 3.7160785712054794e-06, + "loss": 0.241, + "num_input_tokens_seen": 15014408, + "step": 7664 + }, + { + "epoch": 1.0159045725646123, + "grad_norm": 12.424819946289062, + "learning_rate": 3.7157752591044085e-06, + "loss": 0.1677, + "num_input_tokens_seen": 15016952, + "step": 7665 + }, + { + "epoch": 1.0160371106693173, + "grad_norm": 0.018478531390428543, + "learning_rate": 3.7154719235623525e-06, + "loss": 0.0001, + "num_input_tokens_seen": 15018256, + "step": 7666 + }, + { + "epoch": 1.0161696487740226, + "grad_norm": 13.343843460083008, + "learning_rate": 3.715168564585158e-06, + "loss": 0.3948, + "num_input_tokens_seen": 15020200, + "step": 7667 + }, + { + "epoch": 1.0163021868787276, + "grad_norm": 7.778841018676758, + "learning_rate": 3.7148651821786764e-06, + "loss": 0.1581, + "num_input_tokens_seen": 15022000, + "step": 7668 + }, + { + "epoch": 1.0164347249834327, + "grad_norm": 3.345562696456909, + "learning_rate": 3.714561776348755e-06, + "loss": 0.0336, + "num_input_tokens_seen": 15023736, + "step": 7669 + }, + { + "epoch": 1.016567263088138, + "grad_norm": 4.784765243530273, + "learning_rate": 3.7142583471012454e-06, + "loss": 0.179, + "num_input_tokens_seen": 15025416, + "step": 7670 + }, + { + "epoch": 1.016699801192843, + "grad_norm": 0.06177437677979469, + "learning_rate": 3.7139548944419975e-06, + "loss": 0.0004, + "num_input_tokens_seen": 15027592, + "step": 7671 + }, + { + "epoch": 1.016832339297548, + "grad_norm": 0.05669925734400749, + "learning_rate": 3.7136514183768613e-06, + "loss": 0.0004, + "num_input_tokens_seen": 15029416, + "step": 7672 + }, + { + "epoch": 1.016964877402253, + "grad_norm": 7.42257833480835, + "learning_rate": 3.7133479189116883e-06, + "loss": 0.1978, + "num_input_tokens_seen": 15031736, + "step": 7673 + }, + { + "epoch": 1.0170974155069583, + "grad_norm": 6.421863555908203, + "learning_rate": 3.7130443960523305e-06, + "loss": 0.1002, + "num_input_tokens_seen": 15033784, + "step": 7674 + }, + { + "epoch": 1.0172299536116634, + "grad_norm": 18.70168113708496, + "learning_rate": 3.71274084980464e-06, + "loss": 0.5854, + "num_input_tokens_seen": 15035952, + "step": 7675 + }, + { + "epoch": 1.0173624917163684, + "grad_norm": 16.408714294433594, + "learning_rate": 3.7124372801744697e-06, + "loss": 0.7264, + "num_input_tokens_seen": 15037504, + "step": 7676 + }, + { + "epoch": 1.0174950298210736, + "grad_norm": 3.3047120571136475, + "learning_rate": 3.7121336871676718e-06, + "loss": 0.0718, + "num_input_tokens_seen": 15039064, + "step": 7677 + }, + { + "epoch": 1.0176275679257787, + "grad_norm": 6.032345294952393, + "learning_rate": 3.7118300707901e-06, + "loss": 0.107, + "num_input_tokens_seen": 15041832, + "step": 7678 + }, + { + "epoch": 1.0177601060304837, + "grad_norm": 5.13366174697876, + "learning_rate": 3.7115264310476086e-06, + "loss": 0.1306, + "num_input_tokens_seen": 15043584, + "step": 7679 + }, + { + "epoch": 1.0178926441351888, + "grad_norm": 0.06975368410348892, + "learning_rate": 3.711222767946052e-06, + "loss": 0.0005, + "num_input_tokens_seen": 15045152, + "step": 7680 + }, + { + "epoch": 1.018025182239894, + "grad_norm": 0.1463736891746521, + "learning_rate": 3.710919081491285e-06, + "loss": 0.001, + "num_input_tokens_seen": 15047944, + "step": 7681 + }, + { + "epoch": 1.018157720344599, + "grad_norm": 0.37924399971961975, + "learning_rate": 3.7106153716891617e-06, + "loss": 0.0025, + "num_input_tokens_seen": 15050224, + "step": 7682 + }, + { + "epoch": 1.018290258449304, + "grad_norm": 3.9590206146240234, + "learning_rate": 3.71031163854554e-06, + "loss": 0.0555, + "num_input_tokens_seen": 15051944, + "step": 7683 + }, + { + "epoch": 1.0184227965540094, + "grad_norm": 5.259951591491699, + "learning_rate": 3.710007882066275e-06, + "loss": 0.0644, + "num_input_tokens_seen": 15053664, + "step": 7684 + }, + { + "epoch": 1.0185553346587144, + "grad_norm": 6.186981201171875, + "learning_rate": 3.7097041022572224e-06, + "loss": 0.0532, + "num_input_tokens_seen": 15054912, + "step": 7685 + }, + { + "epoch": 1.0186878727634194, + "grad_norm": 6.308382511138916, + "learning_rate": 3.709400299124241e-06, + "loss": 0.116, + "num_input_tokens_seen": 15056880, + "step": 7686 + }, + { + "epoch": 1.0188204108681247, + "grad_norm": 13.603167533874512, + "learning_rate": 3.709096472673187e-06, + "loss": 0.2289, + "num_input_tokens_seen": 15058520, + "step": 7687 + }, + { + "epoch": 1.0189529489728297, + "grad_norm": 2.428762197494507, + "learning_rate": 3.7087926229099193e-06, + "loss": 0.0225, + "num_input_tokens_seen": 15060288, + "step": 7688 + }, + { + "epoch": 1.0190854870775348, + "grad_norm": 0.05689212679862976, + "learning_rate": 3.7084887498402955e-06, + "loss": 0.0004, + "num_input_tokens_seen": 15061784, + "step": 7689 + }, + { + "epoch": 1.0192180251822398, + "grad_norm": 0.060718584805727005, + "learning_rate": 3.708184853470175e-06, + "loss": 0.0004, + "num_input_tokens_seen": 15062864, + "step": 7690 + }, + { + "epoch": 1.019350563286945, + "grad_norm": 0.5899678468704224, + "learning_rate": 3.7078809338054172e-06, + "loss": 0.0078, + "num_input_tokens_seen": 15065008, + "step": 7691 + }, + { + "epoch": 1.01948310139165, + "grad_norm": 4.334835529327393, + "learning_rate": 3.7075769908518814e-06, + "loss": 0.1255, + "num_input_tokens_seen": 15066560, + "step": 7692 + }, + { + "epoch": 1.0196156394963551, + "grad_norm": 9.271401405334473, + "learning_rate": 3.7072730246154288e-06, + "loss": 0.228, + "num_input_tokens_seen": 15068760, + "step": 7693 + }, + { + "epoch": 1.0197481776010604, + "grad_norm": 2.927886724472046, + "learning_rate": 3.7069690351019187e-06, + "loss": 0.0294, + "num_input_tokens_seen": 15070616, + "step": 7694 + }, + { + "epoch": 1.0198807157057654, + "grad_norm": 8.631467819213867, + "learning_rate": 3.7066650223172134e-06, + "loss": 0.118, + "num_input_tokens_seen": 15072304, + "step": 7695 + }, + { + "epoch": 1.0200132538104705, + "grad_norm": 1.3208357095718384, + "learning_rate": 3.7063609862671744e-06, + "loss": 0.0071, + "num_input_tokens_seen": 15073840, + "step": 7696 + }, + { + "epoch": 1.0201457919151755, + "grad_norm": 7.74658203125, + "learning_rate": 3.706056926957663e-06, + "loss": 0.2107, + "num_input_tokens_seen": 15075816, + "step": 7697 + }, + { + "epoch": 1.0202783300198808, + "grad_norm": 7.877039432525635, + "learning_rate": 3.705752844394542e-06, + "loss": 0.1971, + "num_input_tokens_seen": 15077488, + "step": 7698 + }, + { + "epoch": 1.0204108681245858, + "grad_norm": 9.886817932128906, + "learning_rate": 3.7054487385836736e-06, + "loss": 0.1633, + "num_input_tokens_seen": 15080376, + "step": 7699 + }, + { + "epoch": 1.0205434062292909, + "grad_norm": 13.04549503326416, + "learning_rate": 3.705144609530923e-06, + "loss": 0.3479, + "num_input_tokens_seen": 15082440, + "step": 7700 + }, + { + "epoch": 1.020675944333996, + "grad_norm": 5.335203170776367, + "learning_rate": 3.7048404572421525e-06, + "loss": 0.0918, + "num_input_tokens_seen": 15085184, + "step": 7701 + }, + { + "epoch": 1.0208084824387011, + "grad_norm": 12.4288969039917, + "learning_rate": 3.704536281723227e-06, + "loss": 0.6115, + "num_input_tokens_seen": 15088216, + "step": 7702 + }, + { + "epoch": 1.0209410205434062, + "grad_norm": 0.21038709580898285, + "learning_rate": 3.7042320829800105e-06, + "loss": 0.0012, + "num_input_tokens_seen": 15089560, + "step": 7703 + }, + { + "epoch": 1.0210735586481112, + "grad_norm": 0.1432301551103592, + "learning_rate": 3.703927861018369e-06, + "loss": 0.0009, + "num_input_tokens_seen": 15091152, + "step": 7704 + }, + { + "epoch": 1.0212060967528165, + "grad_norm": 2.8473587036132812, + "learning_rate": 3.7036236158441685e-06, + "loss": 0.0501, + "num_input_tokens_seen": 15092440, + "step": 7705 + }, + { + "epoch": 1.0213386348575215, + "grad_norm": 8.373361587524414, + "learning_rate": 3.7033193474632734e-06, + "loss": 0.0601, + "num_input_tokens_seen": 15094104, + "step": 7706 + }, + { + "epoch": 1.0214711729622266, + "grad_norm": 3.8365566730499268, + "learning_rate": 3.7030150558815513e-06, + "loss": 0.0711, + "num_input_tokens_seen": 15095584, + "step": 7707 + }, + { + "epoch": 1.0216037110669318, + "grad_norm": 4.58162784576416, + "learning_rate": 3.7027107411048695e-06, + "loss": 0.0872, + "num_input_tokens_seen": 15097368, + "step": 7708 + }, + { + "epoch": 1.0217362491716369, + "grad_norm": 0.2154024988412857, + "learning_rate": 3.702406403139094e-06, + "loss": 0.0024, + "num_input_tokens_seen": 15099104, + "step": 7709 + }, + { + "epoch": 1.021868787276342, + "grad_norm": 4.24046516418457, + "learning_rate": 3.702102041990095e-06, + "loss": 0.0661, + "num_input_tokens_seen": 15100944, + "step": 7710 + }, + { + "epoch": 1.0220013253810472, + "grad_norm": 0.07628628611564636, + "learning_rate": 3.701797657663738e-06, + "loss": 0.0005, + "num_input_tokens_seen": 15101944, + "step": 7711 + }, + { + "epoch": 1.0221338634857522, + "grad_norm": 0.9856442213058472, + "learning_rate": 3.7014932501658934e-06, + "loss": 0.0039, + "num_input_tokens_seen": 15104480, + "step": 7712 + }, + { + "epoch": 1.0222664015904572, + "grad_norm": 9.911195755004883, + "learning_rate": 3.7011888195024303e-06, + "loss": 0.1938, + "num_input_tokens_seen": 15106856, + "step": 7713 + }, + { + "epoch": 1.0223989396951623, + "grad_norm": 2.2839925289154053, + "learning_rate": 3.7008843656792183e-06, + "loss": 0.0186, + "num_input_tokens_seen": 15109144, + "step": 7714 + }, + { + "epoch": 1.0225314777998675, + "grad_norm": 10.681915283203125, + "learning_rate": 3.7005798887021273e-06, + "loss": 0.0804, + "num_input_tokens_seen": 15111920, + "step": 7715 + }, + { + "epoch": 1.0226640159045726, + "grad_norm": 2.6024203300476074, + "learning_rate": 3.7002753885770273e-06, + "loss": 0.0339, + "num_input_tokens_seen": 15113112, + "step": 7716 + }, + { + "epoch": 1.0227965540092776, + "grad_norm": 0.8200755715370178, + "learning_rate": 3.69997086530979e-06, + "loss": 0.0051, + "num_input_tokens_seen": 15114912, + "step": 7717 + }, + { + "epoch": 1.0229290921139829, + "grad_norm": 0.08860411494970322, + "learning_rate": 3.6996663189062858e-06, + "loss": 0.0006, + "num_input_tokens_seen": 15117232, + "step": 7718 + }, + { + "epoch": 1.023061630218688, + "grad_norm": 3.6393609046936035, + "learning_rate": 3.699361749372389e-06, + "loss": 0.1231, + "num_input_tokens_seen": 15119984, + "step": 7719 + }, + { + "epoch": 1.023194168323393, + "grad_norm": 7.931157112121582, + "learning_rate": 3.6990571567139686e-06, + "loss": 0.0334, + "num_input_tokens_seen": 15121392, + "step": 7720 + }, + { + "epoch": 1.023326706428098, + "grad_norm": 0.013317984528839588, + "learning_rate": 3.6987525409369e-06, + "loss": 0.0001, + "num_input_tokens_seen": 15123112, + "step": 7721 + }, + { + "epoch": 1.0234592445328032, + "grad_norm": 10.313695907592773, + "learning_rate": 3.698447902047056e-06, + "loss": 0.4243, + "num_input_tokens_seen": 15125008, + "step": 7722 + }, + { + "epoch": 1.0235917826375083, + "grad_norm": 5.078224182128906, + "learning_rate": 3.6981432400503086e-06, + "loss": 0.0544, + "num_input_tokens_seen": 15127064, + "step": 7723 + }, + { + "epoch": 1.0237243207422133, + "grad_norm": 0.01462921965867281, + "learning_rate": 3.697838554952533e-06, + "loss": 0.0001, + "num_input_tokens_seen": 15129192, + "step": 7724 + }, + { + "epoch": 1.0238568588469186, + "grad_norm": 9.532901763916016, + "learning_rate": 3.6975338467596047e-06, + "loss": 0.2543, + "num_input_tokens_seen": 15131008, + "step": 7725 + }, + { + "epoch": 1.0239893969516236, + "grad_norm": 0.012940621003508568, + "learning_rate": 3.6972291154773964e-06, + "loss": 0.0001, + "num_input_tokens_seen": 15133320, + "step": 7726 + }, + { + "epoch": 1.0241219350563286, + "grad_norm": 15.29073429107666, + "learning_rate": 3.696924361111786e-06, + "loss": 0.5838, + "num_input_tokens_seen": 15136488, + "step": 7727 + }, + { + "epoch": 1.024254473161034, + "grad_norm": 9.175992965698242, + "learning_rate": 3.696619583668648e-06, + "loss": 0.0709, + "num_input_tokens_seen": 15137744, + "step": 7728 + }, + { + "epoch": 1.024387011265739, + "grad_norm": 7.448381423950195, + "learning_rate": 3.696314783153858e-06, + "loss": 0.3783, + "num_input_tokens_seen": 15139760, + "step": 7729 + }, + { + "epoch": 1.024519549370444, + "grad_norm": 2.747375011444092, + "learning_rate": 3.6960099595732944e-06, + "loss": 0.0499, + "num_input_tokens_seen": 15141280, + "step": 7730 + }, + { + "epoch": 1.024652087475149, + "grad_norm": 11.375801086425781, + "learning_rate": 3.6957051129328335e-06, + "loss": 0.2844, + "num_input_tokens_seen": 15143064, + "step": 7731 + }, + { + "epoch": 1.0247846255798543, + "grad_norm": 0.008039495907723904, + "learning_rate": 3.6954002432383533e-06, + "loss": 0.0001, + "num_input_tokens_seen": 15144376, + "step": 7732 + }, + { + "epoch": 1.0249171636845593, + "grad_norm": 9.363496780395508, + "learning_rate": 3.6950953504957322e-06, + "loss": 0.086, + "num_input_tokens_seen": 15146568, + "step": 7733 + }, + { + "epoch": 1.0250497017892644, + "grad_norm": 8.235267639160156, + "learning_rate": 3.694790434710848e-06, + "loss": 0.3043, + "num_input_tokens_seen": 15148344, + "step": 7734 + }, + { + "epoch": 1.0251822398939696, + "grad_norm": 0.05013801530003548, + "learning_rate": 3.6944854958895794e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15149472, + "step": 7735 + }, + { + "epoch": 1.0253147779986747, + "grad_norm": 7.490808963775635, + "learning_rate": 3.694180534037807e-06, + "loss": 0.143, + "num_input_tokens_seen": 15151000, + "step": 7736 + }, + { + "epoch": 1.0254473161033797, + "grad_norm": 0.0684259831905365, + "learning_rate": 3.69387554916141e-06, + "loss": 0.0004, + "num_input_tokens_seen": 15152432, + "step": 7737 + }, + { + "epoch": 1.0255798542080847, + "grad_norm": 12.239639282226562, + "learning_rate": 3.693570541266268e-06, + "loss": 0.2533, + "num_input_tokens_seen": 15154344, + "step": 7738 + }, + { + "epoch": 1.02571239231279, + "grad_norm": 4.195292949676514, + "learning_rate": 3.693265510358263e-06, + "loss": 0.0807, + "num_input_tokens_seen": 15157328, + "step": 7739 + }, + { + "epoch": 1.025844930417495, + "grad_norm": 2.8659470081329346, + "learning_rate": 3.6929604564432762e-06, + "loss": 0.0649, + "num_input_tokens_seen": 15159600, + "step": 7740 + }, + { + "epoch": 1.0259774685222, + "grad_norm": 2.2537879943847656, + "learning_rate": 3.6926553795271885e-06, + "loss": 0.0229, + "num_input_tokens_seen": 15161312, + "step": 7741 + }, + { + "epoch": 1.0261100066269053, + "grad_norm": 6.324787616729736, + "learning_rate": 3.692350279615883e-06, + "loss": 0.1419, + "num_input_tokens_seen": 15163160, + "step": 7742 + }, + { + "epoch": 1.0262425447316104, + "grad_norm": 4.146732330322266, + "learning_rate": 3.6920451567152404e-06, + "loss": 0.0347, + "num_input_tokens_seen": 15165752, + "step": 7743 + }, + { + "epoch": 1.0263750828363154, + "grad_norm": 0.17877551913261414, + "learning_rate": 3.6917400108311454e-06, + "loss": 0.001, + "num_input_tokens_seen": 15167112, + "step": 7744 + }, + { + "epoch": 1.0265076209410204, + "grad_norm": 0.019409723579883575, + "learning_rate": 3.6914348419694813e-06, + "loss": 0.0001, + "num_input_tokens_seen": 15168208, + "step": 7745 + }, + { + "epoch": 1.0266401590457257, + "grad_norm": 5.205817699432373, + "learning_rate": 3.691129650136131e-06, + "loss": 0.1737, + "num_input_tokens_seen": 15171120, + "step": 7746 + }, + { + "epoch": 1.0267726971504307, + "grad_norm": 0.011662323959171772, + "learning_rate": 3.6908244353369786e-06, + "loss": 0.0001, + "num_input_tokens_seen": 15173704, + "step": 7747 + }, + { + "epoch": 1.0269052352551358, + "grad_norm": 5.509905815124512, + "learning_rate": 3.690519197577911e-06, + "loss": 0.0507, + "num_input_tokens_seen": 15175272, + "step": 7748 + }, + { + "epoch": 1.027037773359841, + "grad_norm": 0.01596641354262829, + "learning_rate": 3.690213936864811e-06, + "loss": 0.0001, + "num_input_tokens_seen": 15177136, + "step": 7749 + }, + { + "epoch": 1.027170311464546, + "grad_norm": 7.253083229064941, + "learning_rate": 3.6899086532035657e-06, + "loss": 0.1838, + "num_input_tokens_seen": 15179008, + "step": 7750 + }, + { + "epoch": 1.027302849569251, + "grad_norm": 0.03541019186377525, + "learning_rate": 3.689603346600061e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15180776, + "step": 7751 + }, + { + "epoch": 1.0274353876739564, + "grad_norm": 4.202393054962158, + "learning_rate": 3.689298017060182e-06, + "loss": 0.0701, + "num_input_tokens_seen": 15182680, + "step": 7752 + }, + { + "epoch": 1.0275679257786614, + "grad_norm": 4.130717754364014, + "learning_rate": 3.6889926645898177e-06, + "loss": 0.0694, + "num_input_tokens_seen": 15183920, + "step": 7753 + }, + { + "epoch": 1.0277004638833664, + "grad_norm": 4.633484363555908, + "learning_rate": 3.6886872891948543e-06, + "loss": 0.0829, + "num_input_tokens_seen": 15185976, + "step": 7754 + }, + { + "epoch": 1.0278330019880715, + "grad_norm": 10.510533332824707, + "learning_rate": 3.6883818908811797e-06, + "loss": 0.2194, + "num_input_tokens_seen": 15187680, + "step": 7755 + }, + { + "epoch": 1.0279655400927767, + "grad_norm": 7.969301223754883, + "learning_rate": 3.6880764696546828e-06, + "loss": 0.296, + "num_input_tokens_seen": 15189456, + "step": 7756 + }, + { + "epoch": 1.0280980781974818, + "grad_norm": 7.980248928070068, + "learning_rate": 3.6877710255212517e-06, + "loss": 0.204, + "num_input_tokens_seen": 15191408, + "step": 7757 + }, + { + "epoch": 1.0282306163021868, + "grad_norm": 10.49356460571289, + "learning_rate": 3.6874655584867763e-06, + "loss": 0.1157, + "num_input_tokens_seen": 15192768, + "step": 7758 + }, + { + "epoch": 1.028363154406892, + "grad_norm": 0.010349218733608723, + "learning_rate": 3.6871600685571453e-06, + "loss": 0.0001, + "num_input_tokens_seen": 15194488, + "step": 7759 + }, + { + "epoch": 1.0284956925115971, + "grad_norm": 11.422708511352539, + "learning_rate": 3.686854555738249e-06, + "loss": 0.2013, + "num_input_tokens_seen": 15196280, + "step": 7760 + }, + { + "epoch": 1.0286282306163022, + "grad_norm": 2.7241554260253906, + "learning_rate": 3.6865490200359787e-06, + "loss": 0.0293, + "num_input_tokens_seen": 15198216, + "step": 7761 + }, + { + "epoch": 1.0287607687210072, + "grad_norm": 2.9253151416778564, + "learning_rate": 3.6862434614562238e-06, + "loss": 0.033, + "num_input_tokens_seen": 15200072, + "step": 7762 + }, + { + "epoch": 1.0288933068257125, + "grad_norm": 10.132658958435059, + "learning_rate": 3.685937880004877e-06, + "loss": 0.336, + "num_input_tokens_seen": 15201848, + "step": 7763 + }, + { + "epoch": 1.0290258449304175, + "grad_norm": 9.825067520141602, + "learning_rate": 3.685632275687831e-06, + "loss": 0.2315, + "num_input_tokens_seen": 15204368, + "step": 7764 + }, + { + "epoch": 1.0291583830351225, + "grad_norm": 13.222792625427246, + "learning_rate": 3.685326648510975e-06, + "loss": 0.2946, + "num_input_tokens_seen": 15207088, + "step": 7765 + }, + { + "epoch": 1.0292909211398278, + "grad_norm": 9.702701568603516, + "learning_rate": 3.685020998480204e-06, + "loss": 0.1448, + "num_input_tokens_seen": 15209096, + "step": 7766 + }, + { + "epoch": 1.0294234592445328, + "grad_norm": 0.08773332089185715, + "learning_rate": 3.6847153256014112e-06, + "loss": 0.0005, + "num_input_tokens_seen": 15210072, + "step": 7767 + }, + { + "epoch": 1.0295559973492379, + "grad_norm": 8.301722526550293, + "learning_rate": 3.6844096298804894e-06, + "loss": 0.1234, + "num_input_tokens_seen": 15211384, + "step": 7768 + }, + { + "epoch": 1.0296885354539431, + "grad_norm": 0.09840217232704163, + "learning_rate": 3.6841039113233327e-06, + "loss": 0.0006, + "num_input_tokens_seen": 15214288, + "step": 7769 + }, + { + "epoch": 1.0298210735586482, + "grad_norm": 5.12624454498291, + "learning_rate": 3.683798169935836e-06, + "loss": 0.1075, + "num_input_tokens_seen": 15215968, + "step": 7770 + }, + { + "epoch": 1.0299536116633532, + "grad_norm": 3.5404484272003174, + "learning_rate": 3.6834924057238937e-06, + "loss": 0.0554, + "num_input_tokens_seen": 15219368, + "step": 7771 + }, + { + "epoch": 1.0300861497680582, + "grad_norm": 5.58198356628418, + "learning_rate": 3.6831866186934016e-06, + "loss": 0.1017, + "num_input_tokens_seen": 15222392, + "step": 7772 + }, + { + "epoch": 1.0302186878727635, + "grad_norm": 8.423585891723633, + "learning_rate": 3.682880808850255e-06, + "loss": 0.2263, + "num_input_tokens_seen": 15224584, + "step": 7773 + }, + { + "epoch": 1.0303512259774685, + "grad_norm": 0.8835532069206238, + "learning_rate": 3.6825749762003504e-06, + "loss": 0.0102, + "num_input_tokens_seen": 15227008, + "step": 7774 + }, + { + "epoch": 1.0304837640821736, + "grad_norm": 6.299657344818115, + "learning_rate": 3.682269120749584e-06, + "loss": 0.1994, + "num_input_tokens_seen": 15228592, + "step": 7775 + }, + { + "epoch": 1.0306163021868788, + "grad_norm": 2.78489351272583, + "learning_rate": 3.681963242503854e-06, + "loss": 0.0335, + "num_input_tokens_seen": 15232088, + "step": 7776 + }, + { + "epoch": 1.0307488402915839, + "grad_norm": 8.4452486038208, + "learning_rate": 3.681657341469057e-06, + "loss": 0.0832, + "num_input_tokens_seen": 15234320, + "step": 7777 + }, + { + "epoch": 1.030881378396289, + "grad_norm": 0.9621162414550781, + "learning_rate": 3.6813514176510916e-06, + "loss": 0.0058, + "num_input_tokens_seen": 15236512, + "step": 7778 + }, + { + "epoch": 1.031013916500994, + "grad_norm": 1.3048114776611328, + "learning_rate": 3.6810454710558558e-06, + "loss": 0.0088, + "num_input_tokens_seen": 15237928, + "step": 7779 + }, + { + "epoch": 1.0311464546056992, + "grad_norm": 9.818628311157227, + "learning_rate": 3.6807395016892487e-06, + "loss": 0.1096, + "num_input_tokens_seen": 15239952, + "step": 7780 + }, + { + "epoch": 1.0312789927104042, + "grad_norm": 10.57654094696045, + "learning_rate": 3.680433509557169e-06, + "loss": 0.4838, + "num_input_tokens_seen": 15242216, + "step": 7781 + }, + { + "epoch": 1.0314115308151093, + "grad_norm": 17.96974754333496, + "learning_rate": 3.680127494665517e-06, + "loss": 0.3304, + "num_input_tokens_seen": 15244312, + "step": 7782 + }, + { + "epoch": 1.0315440689198145, + "grad_norm": 6.091078281402588, + "learning_rate": 3.679821457020193e-06, + "loss": 0.1163, + "num_input_tokens_seen": 15246072, + "step": 7783 + }, + { + "epoch": 1.0316766070245196, + "grad_norm": 0.03685947507619858, + "learning_rate": 3.679515396627097e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15248552, + "step": 7784 + }, + { + "epoch": 1.0318091451292246, + "grad_norm": 11.809764862060547, + "learning_rate": 3.679209313492131e-06, + "loss": 0.1769, + "num_input_tokens_seen": 15250424, + "step": 7785 + }, + { + "epoch": 1.0319416832339297, + "grad_norm": 0.02438715659081936, + "learning_rate": 3.678903207621195e-06, + "loss": 0.0001, + "num_input_tokens_seen": 15251592, + "step": 7786 + }, + { + "epoch": 1.032074221338635, + "grad_norm": 0.6539574265480042, + "learning_rate": 3.678597079020193e-06, + "loss": 0.0038, + "num_input_tokens_seen": 15253144, + "step": 7787 + }, + { + "epoch": 1.03220675944334, + "grad_norm": 1.7858794927597046, + "learning_rate": 3.678290927695026e-06, + "loss": 0.0072, + "num_input_tokens_seen": 15254848, + "step": 7788 + }, + { + "epoch": 1.032339297548045, + "grad_norm": 0.033668048679828644, + "learning_rate": 3.677984753651597e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15256112, + "step": 7789 + }, + { + "epoch": 1.0324718356527502, + "grad_norm": 0.06988954544067383, + "learning_rate": 3.677678556895809e-06, + "loss": 0.0005, + "num_input_tokens_seen": 15258568, + "step": 7790 + }, + { + "epoch": 1.0326043737574553, + "grad_norm": 10.055173873901367, + "learning_rate": 3.6773723374335657e-06, + "loss": 0.1671, + "num_input_tokens_seen": 15260656, + "step": 7791 + }, + { + "epoch": 1.0327369118621603, + "grad_norm": 0.2110818773508072, + "learning_rate": 3.677066095270772e-06, + "loss": 0.0013, + "num_input_tokens_seen": 15262656, + "step": 7792 + }, + { + "epoch": 1.0328694499668654, + "grad_norm": 8.365522384643555, + "learning_rate": 3.6767598304133325e-06, + "loss": 0.156, + "num_input_tokens_seen": 15264384, + "step": 7793 + }, + { + "epoch": 1.0330019880715706, + "grad_norm": 0.04982191324234009, + "learning_rate": 3.67645354286715e-06, + "loss": 0.0003, + "num_input_tokens_seen": 15265560, + "step": 7794 + }, + { + "epoch": 1.0331345261762757, + "grad_norm": 0.9988853931427002, + "learning_rate": 3.676147232638133e-06, + "loss": 0.0042, + "num_input_tokens_seen": 15267864, + "step": 7795 + }, + { + "epoch": 1.0332670642809807, + "grad_norm": 0.12236367911100388, + "learning_rate": 3.6758408997321853e-06, + "loss": 0.0008, + "num_input_tokens_seen": 15269600, + "step": 7796 + }, + { + "epoch": 1.033399602385686, + "grad_norm": 4.066896438598633, + "learning_rate": 3.6755345441552143e-06, + "loss": 0.0933, + "num_input_tokens_seen": 15271624, + "step": 7797 + }, + { + "epoch": 1.033532140490391, + "grad_norm": 3.94903826713562, + "learning_rate": 3.675228165913127e-06, + "loss": 0.0766, + "num_input_tokens_seen": 15273952, + "step": 7798 + }, + { + "epoch": 1.033664678595096, + "grad_norm": 0.871780514717102, + "learning_rate": 3.6749217650118286e-06, + "loss": 0.004, + "num_input_tokens_seen": 15275392, + "step": 7799 + }, + { + "epoch": 1.0337972166998013, + "grad_norm": 9.41145133972168, + "learning_rate": 3.6746153414572284e-06, + "loss": 0.4313, + "num_input_tokens_seen": 15277896, + "step": 7800 + }, + { + "epoch": 1.0339297548045063, + "grad_norm": 0.09845166653394699, + "learning_rate": 3.674308895255234e-06, + "loss": 0.001, + "num_input_tokens_seen": 15280448, + "step": 7801 + }, + { + "epoch": 1.0340622929092114, + "grad_norm": 6.438985347747803, + "learning_rate": 3.6740024264117546e-06, + "loss": 0.143, + "num_input_tokens_seen": 15282208, + "step": 7802 + }, + { + "epoch": 1.0341948310139164, + "grad_norm": 6.802124977111816, + "learning_rate": 3.6736959349326984e-06, + "loss": 0.1381, + "num_input_tokens_seen": 15284080, + "step": 7803 + }, + { + "epoch": 1.0343273691186217, + "grad_norm": 0.004904947243630886, + "learning_rate": 3.673389420823974e-06, + "loss": 0.0, + "num_input_tokens_seen": 15285880, + "step": 7804 + }, + { + "epoch": 1.0344599072233267, + "grad_norm": 9.987713813781738, + "learning_rate": 3.6730828840914933e-06, + "loss": 0.1208, + "num_input_tokens_seen": 15288016, + "step": 7805 + }, + { + "epoch": 1.0345924453280317, + "grad_norm": 0.119804747402668, + "learning_rate": 3.672776324741164e-06, + "loss": 0.0003, + "num_input_tokens_seen": 15289632, + "step": 7806 + }, + { + "epoch": 1.034724983432737, + "grad_norm": 0.015266526490449905, + "learning_rate": 3.672469742778899e-06, + "loss": 0.0001, + "num_input_tokens_seen": 15291456, + "step": 7807 + }, + { + "epoch": 1.034857521537442, + "grad_norm": 8.613381385803223, + "learning_rate": 3.6721631382106093e-06, + "loss": 0.1196, + "num_input_tokens_seen": 15294416, + "step": 7808 + }, + { + "epoch": 1.034990059642147, + "grad_norm": 11.110567092895508, + "learning_rate": 3.6718565110422037e-06, + "loss": 0.158, + "num_input_tokens_seen": 15296200, + "step": 7809 + }, + { + "epoch": 1.0351225977468521, + "grad_norm": 1.3834730386734009, + "learning_rate": 3.6715498612795974e-06, + "loss": 0.008, + "num_input_tokens_seen": 15298520, + "step": 7810 + }, + { + "epoch": 1.0352551358515574, + "grad_norm": 17.030418395996094, + "learning_rate": 3.671243188928702e-06, + "loss": 0.6, + "num_input_tokens_seen": 15299952, + "step": 7811 + }, + { + "epoch": 1.0353876739562624, + "grad_norm": 0.0029156908858567476, + "learning_rate": 3.6709364939954285e-06, + "loss": 0.0, + "num_input_tokens_seen": 15301400, + "step": 7812 + }, + { + "epoch": 1.0355202120609674, + "grad_norm": 2.830509662628174, + "learning_rate": 3.670629776485693e-06, + "loss": 0.0253, + "num_input_tokens_seen": 15303720, + "step": 7813 + }, + { + "epoch": 1.0356527501656727, + "grad_norm": 0.003302226308733225, + "learning_rate": 3.6703230364054064e-06, + "loss": 0.0, + "num_input_tokens_seen": 15304896, + "step": 7814 + }, + { + "epoch": 1.0357852882703777, + "grad_norm": 10.746882438659668, + "learning_rate": 3.6700162737604857e-06, + "loss": 0.2075, + "num_input_tokens_seen": 15307064, + "step": 7815 + }, + { + "epoch": 1.0359178263750828, + "grad_norm": 0.01224896777421236, + "learning_rate": 3.6697094885568436e-06, + "loss": 0.0001, + "num_input_tokens_seen": 15309152, + "step": 7816 + }, + { + "epoch": 1.036050364479788, + "grad_norm": 2.5384762287139893, + "learning_rate": 3.6694026808003953e-06, + "loss": 0.0243, + "num_input_tokens_seen": 15312968, + "step": 7817 + }, + { + "epoch": 1.036182902584493, + "grad_norm": 0.03204203024506569, + "learning_rate": 3.6690958504970564e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15315040, + "step": 7818 + }, + { + "epoch": 1.0363154406891981, + "grad_norm": 0.6126527190208435, + "learning_rate": 3.6687889976527436e-06, + "loss": 0.0032, + "num_input_tokens_seen": 15316440, + "step": 7819 + }, + { + "epoch": 1.0364479787939032, + "grad_norm": 6.791605472564697, + "learning_rate": 3.6684821222733725e-06, + "loss": 0.0475, + "num_input_tokens_seen": 15318064, + "step": 7820 + }, + { + "epoch": 1.0365805168986084, + "grad_norm": 3.4890735149383545, + "learning_rate": 3.66817522436486e-06, + "loss": 0.0567, + "num_input_tokens_seen": 15319768, + "step": 7821 + }, + { + "epoch": 1.0367130550033135, + "grad_norm": 3.435305118560791, + "learning_rate": 3.6678683039331232e-06, + "loss": 0.0113, + "num_input_tokens_seen": 15321064, + "step": 7822 + }, + { + "epoch": 1.0368455931080185, + "grad_norm": 8.242443084716797, + "learning_rate": 3.6675613609840798e-06, + "loss": 0.215, + "num_input_tokens_seen": 15323216, + "step": 7823 + }, + { + "epoch": 1.0369781312127238, + "grad_norm": 4.11225700378418, + "learning_rate": 3.667254395523648e-06, + "loss": 0.0711, + "num_input_tokens_seen": 15325264, + "step": 7824 + }, + { + "epoch": 1.0371106693174288, + "grad_norm": 0.001202165149152279, + "learning_rate": 3.666947407557746e-06, + "loss": 0.0, + "num_input_tokens_seen": 15327472, + "step": 7825 + }, + { + "epoch": 1.0372432074221338, + "grad_norm": 5.19279670715332, + "learning_rate": 3.6666403970922932e-06, + "loss": 0.0862, + "num_input_tokens_seen": 15329064, + "step": 7826 + }, + { + "epoch": 1.0373757455268389, + "grad_norm": 0.0024823781568557024, + "learning_rate": 3.6663333641332076e-06, + "loss": 0.0, + "num_input_tokens_seen": 15330800, + "step": 7827 + }, + { + "epoch": 1.0375082836315441, + "grad_norm": 8.199209213256836, + "learning_rate": 3.6660263086864116e-06, + "loss": 0.2951, + "num_input_tokens_seen": 15334256, + "step": 7828 + }, + { + "epoch": 1.0376408217362492, + "grad_norm": 0.0018876849208027124, + "learning_rate": 3.665719230757824e-06, + "loss": 0.0, + "num_input_tokens_seen": 15336272, + "step": 7829 + }, + { + "epoch": 1.0377733598409542, + "grad_norm": 0.004428075160831213, + "learning_rate": 3.665412130353364e-06, + "loss": 0.0, + "num_input_tokens_seen": 15337440, + "step": 7830 + }, + { + "epoch": 1.0379058979456595, + "grad_norm": 3.082026243209839, + "learning_rate": 3.6651050074789555e-06, + "loss": 0.0718, + "num_input_tokens_seen": 15339184, + "step": 7831 + }, + { + "epoch": 1.0380384360503645, + "grad_norm": 0.00799545831978321, + "learning_rate": 3.6647978621405182e-06, + "loss": 0.0, + "num_input_tokens_seen": 15340984, + "step": 7832 + }, + { + "epoch": 1.0381709741550695, + "grad_norm": 5.365766525268555, + "learning_rate": 3.6644906943439747e-06, + "loss": 0.087, + "num_input_tokens_seen": 15343872, + "step": 7833 + }, + { + "epoch": 1.0383035122597746, + "grad_norm": 0.0016862810589373112, + "learning_rate": 3.6641835040952475e-06, + "loss": 0.0, + "num_input_tokens_seen": 15345128, + "step": 7834 + }, + { + "epoch": 1.0384360503644798, + "grad_norm": 0.003183546243235469, + "learning_rate": 3.663876291400259e-06, + "loss": 0.0, + "num_input_tokens_seen": 15347280, + "step": 7835 + }, + { + "epoch": 1.0385685884691849, + "grad_norm": 7.209559440612793, + "learning_rate": 3.6635690562649324e-06, + "loss": 0.1444, + "num_input_tokens_seen": 15348744, + "step": 7836 + }, + { + "epoch": 1.03870112657389, + "grad_norm": 0.6442499160766602, + "learning_rate": 3.663261798695192e-06, + "loss": 0.0063, + "num_input_tokens_seen": 15350472, + "step": 7837 + }, + { + "epoch": 1.0388336646785952, + "grad_norm": 7.633230686187744, + "learning_rate": 3.662954518696961e-06, + "loss": 0.2288, + "num_input_tokens_seen": 15353112, + "step": 7838 + }, + { + "epoch": 1.0389662027833002, + "grad_norm": 1.9601349830627441, + "learning_rate": 3.662647216276165e-06, + "loss": 0.0773, + "num_input_tokens_seen": 15354608, + "step": 7839 + }, + { + "epoch": 1.0390987408880052, + "grad_norm": 9.072651863098145, + "learning_rate": 3.662339891438729e-06, + "loss": 0.3534, + "num_input_tokens_seen": 15356392, + "step": 7840 + }, + { + "epoch": 1.0392312789927105, + "grad_norm": 7.431890964508057, + "learning_rate": 3.662032544190578e-06, + "loss": 0.2346, + "num_input_tokens_seen": 15358288, + "step": 7841 + }, + { + "epoch": 1.0393638170974155, + "grad_norm": 3.3793728351593018, + "learning_rate": 3.6617251745376377e-06, + "loss": 0.0584, + "num_input_tokens_seen": 15360272, + "step": 7842 + }, + { + "epoch": 1.0394963552021206, + "grad_norm": 13.772272109985352, + "learning_rate": 3.6614177824858345e-06, + "loss": 0.0919, + "num_input_tokens_seen": 15361880, + "step": 7843 + }, + { + "epoch": 1.0396288933068256, + "grad_norm": 5.662418365478516, + "learning_rate": 3.6611103680410954e-06, + "loss": 0.0528, + "num_input_tokens_seen": 15363392, + "step": 7844 + }, + { + "epoch": 1.0397614314115309, + "grad_norm": 5.790665626525879, + "learning_rate": 3.660802931209346e-06, + "loss": 0.0721, + "num_input_tokens_seen": 15365448, + "step": 7845 + }, + { + "epoch": 1.039893969516236, + "grad_norm": 8.851383209228516, + "learning_rate": 3.6604954719965175e-06, + "loss": 0.1901, + "num_input_tokens_seen": 15367360, + "step": 7846 + }, + { + "epoch": 1.040026507620941, + "grad_norm": 5.972159385681152, + "learning_rate": 3.6601879904085345e-06, + "loss": 0.1497, + "num_input_tokens_seen": 15368920, + "step": 7847 + }, + { + "epoch": 1.0401590457256462, + "grad_norm": 15.191608428955078, + "learning_rate": 3.659880486451327e-06, + "loss": 0.3309, + "num_input_tokens_seen": 15370648, + "step": 7848 + }, + { + "epoch": 1.0402915838303513, + "grad_norm": 0.921578586101532, + "learning_rate": 3.659572960130824e-06, + "loss": 0.0046, + "num_input_tokens_seen": 15372416, + "step": 7849 + }, + { + "epoch": 1.0404241219350563, + "grad_norm": 0.9026769399642944, + "learning_rate": 3.6592654114529537e-06, + "loss": 0.0034, + "num_input_tokens_seen": 15373616, + "step": 7850 + }, + { + "epoch": 1.0405566600397613, + "grad_norm": 4.914867401123047, + "learning_rate": 3.6589578404236465e-06, + "loss": 0.0838, + "num_input_tokens_seen": 15376760, + "step": 7851 + }, + { + "epoch": 1.0406891981444666, + "grad_norm": 0.14350825548171997, + "learning_rate": 3.6586502470488326e-06, + "loss": 0.001, + "num_input_tokens_seen": 15377952, + "step": 7852 + }, + { + "epoch": 1.0408217362491716, + "grad_norm": 12.301382064819336, + "learning_rate": 3.6583426313344433e-06, + "loss": 0.2435, + "num_input_tokens_seen": 15379760, + "step": 7853 + }, + { + "epoch": 1.0409542743538767, + "grad_norm": 6.860457420349121, + "learning_rate": 3.6580349932864083e-06, + "loss": 0.1237, + "num_input_tokens_seen": 15382264, + "step": 7854 + }, + { + "epoch": 1.041086812458582, + "grad_norm": 7.114510536193848, + "learning_rate": 3.65772733291066e-06, + "loss": 0.1774, + "num_input_tokens_seen": 15384160, + "step": 7855 + }, + { + "epoch": 1.041219350563287, + "grad_norm": 0.31468909978866577, + "learning_rate": 3.65741965021313e-06, + "loss": 0.0023, + "num_input_tokens_seen": 15385504, + "step": 7856 + }, + { + "epoch": 1.041351888667992, + "grad_norm": 8.943973541259766, + "learning_rate": 3.6571119451997504e-06, + "loss": 0.1581, + "num_input_tokens_seen": 15387880, + "step": 7857 + }, + { + "epoch": 1.0414844267726973, + "grad_norm": 5.880111217498779, + "learning_rate": 3.6568042178764545e-06, + "loss": 0.0464, + "num_input_tokens_seen": 15389504, + "step": 7858 + }, + { + "epoch": 1.0416169648774023, + "grad_norm": 0.21621862053871155, + "learning_rate": 3.6564964682491754e-06, + "loss": 0.0016, + "num_input_tokens_seen": 15390768, + "step": 7859 + }, + { + "epoch": 1.0417495029821073, + "grad_norm": 0.818871259689331, + "learning_rate": 3.656188696323847e-06, + "loss": 0.0061, + "num_input_tokens_seen": 15392544, + "step": 7860 + }, + { + "epoch": 1.0418820410868124, + "grad_norm": 7.558903694152832, + "learning_rate": 3.655880902106402e-06, + "loss": 0.0393, + "num_input_tokens_seen": 15394504, + "step": 7861 + }, + { + "epoch": 1.0420145791915176, + "grad_norm": 19.59562873840332, + "learning_rate": 3.655573085602776e-06, + "loss": 0.4424, + "num_input_tokens_seen": 15396184, + "step": 7862 + }, + { + "epoch": 1.0421471172962227, + "grad_norm": 12.088844299316406, + "learning_rate": 3.6552652468189037e-06, + "loss": 0.1839, + "num_input_tokens_seen": 15397880, + "step": 7863 + }, + { + "epoch": 1.0422796554009277, + "grad_norm": 0.26387423276901245, + "learning_rate": 3.654957385760721e-06, + "loss": 0.0019, + "num_input_tokens_seen": 15400272, + "step": 7864 + }, + { + "epoch": 1.042412193505633, + "grad_norm": 6.885824680328369, + "learning_rate": 3.6546495024341627e-06, + "loss": 0.1756, + "num_input_tokens_seen": 15402864, + "step": 7865 + }, + { + "epoch": 1.042544731610338, + "grad_norm": 1.6332216262817383, + "learning_rate": 3.6543415968451657e-06, + "loss": 0.0112, + "num_input_tokens_seen": 15405584, + "step": 7866 + }, + { + "epoch": 1.042677269715043, + "grad_norm": 2.2447736263275146, + "learning_rate": 3.6540336689996664e-06, + "loss": 0.0528, + "num_input_tokens_seen": 15407544, + "step": 7867 + }, + { + "epoch": 1.042809807819748, + "grad_norm": 6.372786045074463, + "learning_rate": 3.6537257189036014e-06, + "loss": 0.0328, + "num_input_tokens_seen": 15410808, + "step": 7868 + }, + { + "epoch": 1.0429423459244533, + "grad_norm": 7.790759086608887, + "learning_rate": 3.653417746562909e-06, + "loss": 0.2551, + "num_input_tokens_seen": 15413544, + "step": 7869 + }, + { + "epoch": 1.0430748840291584, + "grad_norm": 11.951497077941895, + "learning_rate": 3.653109751983527e-06, + "loss": 0.2086, + "num_input_tokens_seen": 15415768, + "step": 7870 + }, + { + "epoch": 1.0432074221338634, + "grad_norm": 0.10513343662023544, + "learning_rate": 3.652801735171393e-06, + "loss": 0.0008, + "num_input_tokens_seen": 15417112, + "step": 7871 + }, + { + "epoch": 1.0433399602385687, + "grad_norm": 10.91108226776123, + "learning_rate": 3.652493696132446e-06, + "loss": 0.1029, + "num_input_tokens_seen": 15418880, + "step": 7872 + }, + { + "epoch": 1.0434724983432737, + "grad_norm": 5.90530252456665, + "learning_rate": 3.6521856348726258e-06, + "loss": 0.1214, + "num_input_tokens_seen": 15420848, + "step": 7873 + }, + { + "epoch": 1.0436050364479788, + "grad_norm": 0.04488575458526611, + "learning_rate": 3.6518775513978714e-06, + "loss": 0.0003, + "num_input_tokens_seen": 15422184, + "step": 7874 + }, + { + "epoch": 1.0437375745526838, + "grad_norm": 9.93698501586914, + "learning_rate": 3.6515694457141237e-06, + "loss": 0.2576, + "num_input_tokens_seen": 15425016, + "step": 7875 + }, + { + "epoch": 1.043870112657389, + "grad_norm": 12.571539878845215, + "learning_rate": 3.6512613178273228e-06, + "loss": 0.1986, + "num_input_tokens_seen": 15426264, + "step": 7876 + }, + { + "epoch": 1.044002650762094, + "grad_norm": 9.868276596069336, + "learning_rate": 3.650953167743409e-06, + "loss": 0.2965, + "num_input_tokens_seen": 15428344, + "step": 7877 + }, + { + "epoch": 1.0441351888667991, + "grad_norm": 11.722805976867676, + "learning_rate": 3.650644995468324e-06, + "loss": 0.2161, + "num_input_tokens_seen": 15431048, + "step": 7878 + }, + { + "epoch": 1.0442677269715044, + "grad_norm": 4.383892059326172, + "learning_rate": 3.6503368010080104e-06, + "loss": 0.1594, + "num_input_tokens_seen": 15432968, + "step": 7879 + }, + { + "epoch": 1.0444002650762094, + "grad_norm": 0.02683247998356819, + "learning_rate": 3.650028584368409e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15434104, + "step": 7880 + }, + { + "epoch": 1.0445328031809145, + "grad_norm": 0.06939803808927536, + "learning_rate": 3.6497203455554643e-06, + "loss": 0.0005, + "num_input_tokens_seen": 15436528, + "step": 7881 + }, + { + "epoch": 1.0446653412856197, + "grad_norm": 6.24174690246582, + "learning_rate": 3.6494120845751167e-06, + "loss": 0.1593, + "num_input_tokens_seen": 15438312, + "step": 7882 + }, + { + "epoch": 1.0447978793903248, + "grad_norm": 0.052460283041000366, + "learning_rate": 3.649103801433312e-06, + "loss": 0.0004, + "num_input_tokens_seen": 15439888, + "step": 7883 + }, + { + "epoch": 1.0449304174950298, + "grad_norm": 0.09696125239133835, + "learning_rate": 3.6487954961359927e-06, + "loss": 0.0007, + "num_input_tokens_seen": 15443152, + "step": 7884 + }, + { + "epoch": 1.0450629555997348, + "grad_norm": 15.219600677490234, + "learning_rate": 3.6484871686891044e-06, + "loss": 0.2777, + "num_input_tokens_seen": 15446616, + "step": 7885 + }, + { + "epoch": 1.04519549370444, + "grad_norm": 0.45388805866241455, + "learning_rate": 3.648178819098591e-06, + "loss": 0.0017, + "num_input_tokens_seen": 15448936, + "step": 7886 + }, + { + "epoch": 1.0453280318091451, + "grad_norm": 0.14878511428833008, + "learning_rate": 3.6478704473703972e-06, + "loss": 0.0011, + "num_input_tokens_seen": 15451504, + "step": 7887 + }, + { + "epoch": 1.0454605699138502, + "grad_norm": 0.6884857416152954, + "learning_rate": 3.6475620535104707e-06, + "loss": 0.0067, + "num_input_tokens_seen": 15453616, + "step": 7888 + }, + { + "epoch": 1.0455931080185554, + "grad_norm": 0.5440292954444885, + "learning_rate": 3.6472536375247543e-06, + "loss": 0.0028, + "num_input_tokens_seen": 15456720, + "step": 7889 + }, + { + "epoch": 1.0457256461232605, + "grad_norm": 0.18380853533744812, + "learning_rate": 3.6469451994191974e-06, + "loss": 0.0013, + "num_input_tokens_seen": 15459184, + "step": 7890 + }, + { + "epoch": 1.0458581842279655, + "grad_norm": 7.74333381652832, + "learning_rate": 3.6466367391997464e-06, + "loss": 0.1093, + "num_input_tokens_seen": 15461648, + "step": 7891 + }, + { + "epoch": 1.0459907223326705, + "grad_norm": 10.005255699157715, + "learning_rate": 3.6463282568723467e-06, + "loss": 0.2064, + "num_input_tokens_seen": 15463312, + "step": 7892 + }, + { + "epoch": 1.0461232604373758, + "grad_norm": 0.027855021879076958, + "learning_rate": 3.6460197524429487e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15464528, + "step": 7893 + }, + { + "epoch": 1.0462557985420808, + "grad_norm": 11.140253067016602, + "learning_rate": 3.6457112259174985e-06, + "loss": 0.1429, + "num_input_tokens_seen": 15466120, + "step": 7894 + }, + { + "epoch": 1.0463883366467859, + "grad_norm": 3.5038602352142334, + "learning_rate": 3.6454026773019454e-06, + "loss": 0.1865, + "num_input_tokens_seen": 15468536, + "step": 7895 + }, + { + "epoch": 1.0465208747514911, + "grad_norm": 6.214282512664795, + "learning_rate": 3.6450941066022388e-06, + "loss": 0.1239, + "num_input_tokens_seen": 15470632, + "step": 7896 + }, + { + "epoch": 1.0466534128561962, + "grad_norm": 4.667426586151123, + "learning_rate": 3.6447855138243275e-06, + "loss": 0.0467, + "num_input_tokens_seen": 15472160, + "step": 7897 + }, + { + "epoch": 1.0467859509609012, + "grad_norm": 0.024939320981502533, + "learning_rate": 3.644476898974162e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15473328, + "step": 7898 + }, + { + "epoch": 1.0469184890656065, + "grad_norm": 0.029729114845395088, + "learning_rate": 3.6441682620576927e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15474688, + "step": 7899 + }, + { + "epoch": 1.0470510271703115, + "grad_norm": 13.749137878417969, + "learning_rate": 3.6438596030808697e-06, + "loss": 0.1394, + "num_input_tokens_seen": 15476872, + "step": 7900 + }, + { + "epoch": 1.0471835652750165, + "grad_norm": 1.9877148866653442, + "learning_rate": 3.6435509220496444e-06, + "loss": 0.0332, + "num_input_tokens_seen": 15478352, + "step": 7901 + }, + { + "epoch": 1.0473161033797216, + "grad_norm": 6.648719310760498, + "learning_rate": 3.643242218969968e-06, + "loss": 0.1306, + "num_input_tokens_seen": 15480536, + "step": 7902 + }, + { + "epoch": 1.0474486414844268, + "grad_norm": 0.04364347830414772, + "learning_rate": 3.6429334938477937e-06, + "loss": 0.0003, + "num_input_tokens_seen": 15482752, + "step": 7903 + }, + { + "epoch": 1.0475811795891319, + "grad_norm": 2.9016292095184326, + "learning_rate": 3.642624746689073e-06, + "loss": 0.0098, + "num_input_tokens_seen": 15484616, + "step": 7904 + }, + { + "epoch": 1.047713717693837, + "grad_norm": 0.14206485450267792, + "learning_rate": 3.6423159774997584e-06, + "loss": 0.0006, + "num_input_tokens_seen": 15486384, + "step": 7905 + }, + { + "epoch": 1.0478462557985422, + "grad_norm": 0.018472516909241676, + "learning_rate": 3.6420071862858046e-06, + "loss": 0.0001, + "num_input_tokens_seen": 15487664, + "step": 7906 + }, + { + "epoch": 1.0479787939032472, + "grad_norm": 0.9390114545822144, + "learning_rate": 3.6416983730531637e-06, + "loss": 0.0098, + "num_input_tokens_seen": 15489664, + "step": 7907 + }, + { + "epoch": 1.0481113320079523, + "grad_norm": 13.47948169708252, + "learning_rate": 3.641389537807791e-06, + "loss": 0.4652, + "num_input_tokens_seen": 15491568, + "step": 7908 + }, + { + "epoch": 1.0482438701126573, + "grad_norm": 3.4799435138702393, + "learning_rate": 3.6410806805556405e-06, + "loss": 0.0971, + "num_input_tokens_seen": 15494032, + "step": 7909 + }, + { + "epoch": 1.0483764082173626, + "grad_norm": 3.107759952545166, + "learning_rate": 3.6407718013026665e-06, + "loss": 0.0565, + "num_input_tokens_seen": 15496840, + "step": 7910 + }, + { + "epoch": 1.0485089463220676, + "grad_norm": 9.912053108215332, + "learning_rate": 3.6404629000548266e-06, + "loss": 0.2486, + "num_input_tokens_seen": 15499192, + "step": 7911 + }, + { + "epoch": 1.0486414844267726, + "grad_norm": 0.04165896028280258, + "learning_rate": 3.640153976818074e-06, + "loss": 0.0003, + "num_input_tokens_seen": 15501008, + "step": 7912 + }, + { + "epoch": 1.048774022531478, + "grad_norm": 0.024278365075588226, + "learning_rate": 3.6398450315983664e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15503136, + "step": 7913 + }, + { + "epoch": 1.048906560636183, + "grad_norm": 3.063241720199585, + "learning_rate": 3.639536064401661e-06, + "loss": 0.0209, + "num_input_tokens_seen": 15505256, + "step": 7914 + }, + { + "epoch": 1.049039098740888, + "grad_norm": 3.5645577907562256, + "learning_rate": 3.639227075233913e-06, + "loss": 0.0449, + "num_input_tokens_seen": 15507168, + "step": 7915 + }, + { + "epoch": 1.049171636845593, + "grad_norm": 0.09378673881292343, + "learning_rate": 3.638918064101082e-06, + "loss": 0.0005, + "num_input_tokens_seen": 15509504, + "step": 7916 + }, + { + "epoch": 1.0493041749502983, + "grad_norm": 13.254733085632324, + "learning_rate": 3.6386090310091247e-06, + "loss": 0.3593, + "num_input_tokens_seen": 15512064, + "step": 7917 + }, + { + "epoch": 1.0494367130550033, + "grad_norm": 0.22730045020580292, + "learning_rate": 3.638299975964e-06, + "loss": 0.0009, + "num_input_tokens_seen": 15514080, + "step": 7918 + }, + { + "epoch": 1.0495692511597083, + "grad_norm": 2.637681245803833, + "learning_rate": 3.637990898971666e-06, + "loss": 0.0719, + "num_input_tokens_seen": 15516128, + "step": 7919 + }, + { + "epoch": 1.0497017892644136, + "grad_norm": 5.010317802429199, + "learning_rate": 3.637681800038083e-06, + "loss": 0.0241, + "num_input_tokens_seen": 15517440, + "step": 7920 + }, + { + "epoch": 1.0498343273691186, + "grad_norm": 0.019499680027365685, + "learning_rate": 3.63737267916921e-06, + "loss": 0.0001, + "num_input_tokens_seen": 15518696, + "step": 7921 + }, + { + "epoch": 1.0499668654738237, + "grad_norm": 9.007317543029785, + "learning_rate": 3.6370635363710074e-06, + "loss": 0.2072, + "num_input_tokens_seen": 15520376, + "step": 7922 + }, + { + "epoch": 1.050099403578529, + "grad_norm": 22.122814178466797, + "learning_rate": 3.636754371649434e-06, + "loss": 0.7119, + "num_input_tokens_seen": 15522744, + "step": 7923 + }, + { + "epoch": 1.050231941683234, + "grad_norm": 8.595895767211914, + "learning_rate": 3.636445185010453e-06, + "loss": 0.2309, + "num_input_tokens_seen": 15525096, + "step": 7924 + }, + { + "epoch": 1.050364479787939, + "grad_norm": 0.032340340316295624, + "learning_rate": 3.6361359764600256e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15527232, + "step": 7925 + }, + { + "epoch": 1.050497017892644, + "grad_norm": 4.753627777099609, + "learning_rate": 3.6358267460041117e-06, + "loss": 0.0327, + "num_input_tokens_seen": 15529024, + "step": 7926 + }, + { + "epoch": 1.0506295559973493, + "grad_norm": 0.07969135046005249, + "learning_rate": 3.6355174936486748e-06, + "loss": 0.0006, + "num_input_tokens_seen": 15531984, + "step": 7927 + }, + { + "epoch": 1.0507620941020543, + "grad_norm": 5.616736888885498, + "learning_rate": 3.6352082193996774e-06, + "loss": 0.1005, + "num_input_tokens_seen": 15533624, + "step": 7928 + }, + { + "epoch": 1.0508946322067594, + "grad_norm": 6.749379634857178, + "learning_rate": 3.6348989232630826e-06, + "loss": 0.0987, + "num_input_tokens_seen": 15535848, + "step": 7929 + }, + { + "epoch": 1.0510271703114646, + "grad_norm": 4.371708393096924, + "learning_rate": 3.6345896052448533e-06, + "loss": 0.0512, + "num_input_tokens_seen": 15537336, + "step": 7930 + }, + { + "epoch": 1.0511597084161697, + "grad_norm": 8.944043159484863, + "learning_rate": 3.6342802653509537e-06, + "loss": 0.0922, + "num_input_tokens_seen": 15539216, + "step": 7931 + }, + { + "epoch": 1.0512922465208747, + "grad_norm": 1.5652776956558228, + "learning_rate": 3.633970903587348e-06, + "loss": 0.0117, + "num_input_tokens_seen": 15541216, + "step": 7932 + }, + { + "epoch": 1.0514247846255798, + "grad_norm": 9.27216911315918, + "learning_rate": 3.633661519960001e-06, + "loss": 0.3117, + "num_input_tokens_seen": 15543872, + "step": 7933 + }, + { + "epoch": 1.051557322730285, + "grad_norm": 0.022086873650550842, + "learning_rate": 3.6333521144748784e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15545624, + "step": 7934 + }, + { + "epoch": 1.05168986083499, + "grad_norm": 6.73577356338501, + "learning_rate": 3.6330426871379454e-06, + "loss": 0.1788, + "num_input_tokens_seen": 15548080, + "step": 7935 + }, + { + "epoch": 1.051822398939695, + "grad_norm": 10.533881187438965, + "learning_rate": 3.6327332379551666e-06, + "loss": 0.2538, + "num_input_tokens_seen": 15550280, + "step": 7936 + }, + { + "epoch": 1.0519549370444004, + "grad_norm": 3.3311026096343994, + "learning_rate": 3.63242376693251e-06, + "loss": 0.066, + "num_input_tokens_seen": 15551912, + "step": 7937 + }, + { + "epoch": 1.0520874751491054, + "grad_norm": 0.01083206757903099, + "learning_rate": 3.6321142740759423e-06, + "loss": 0.0001, + "num_input_tokens_seen": 15553224, + "step": 7938 + }, + { + "epoch": 1.0522200132538104, + "grad_norm": 10.024352073669434, + "learning_rate": 3.6318047593914297e-06, + "loss": 0.2063, + "num_input_tokens_seen": 15554680, + "step": 7939 + }, + { + "epoch": 1.0523525513585157, + "grad_norm": 1.554442286491394, + "learning_rate": 3.6314952228849408e-06, + "loss": 0.0148, + "num_input_tokens_seen": 15556912, + "step": 7940 + }, + { + "epoch": 1.0524850894632207, + "grad_norm": 2.78432035446167, + "learning_rate": 3.631185664562444e-06, + "loss": 0.0139, + "num_input_tokens_seen": 15558504, + "step": 7941 + }, + { + "epoch": 1.0526176275679258, + "grad_norm": 0.620375394821167, + "learning_rate": 3.6308760844299064e-06, + "loss": 0.0096, + "num_input_tokens_seen": 15560200, + "step": 7942 + }, + { + "epoch": 1.0527501656726308, + "grad_norm": 8.954679489135742, + "learning_rate": 3.6305664824932986e-06, + "loss": 0.3197, + "num_input_tokens_seen": 15562128, + "step": 7943 + }, + { + "epoch": 1.052882703777336, + "grad_norm": 6.514616966247559, + "learning_rate": 3.6302568587585885e-06, + "loss": 0.1718, + "num_input_tokens_seen": 15564592, + "step": 7944 + }, + { + "epoch": 1.053015241882041, + "grad_norm": 7.087714195251465, + "learning_rate": 3.6299472132317463e-06, + "loss": 0.0663, + "num_input_tokens_seen": 15567000, + "step": 7945 + }, + { + "epoch": 1.0531477799867461, + "grad_norm": 6.69550895690918, + "learning_rate": 3.629637545918743e-06, + "loss": 0.168, + "num_input_tokens_seen": 15568576, + "step": 7946 + }, + { + "epoch": 1.0532803180914514, + "grad_norm": 0.9348760843276978, + "learning_rate": 3.629327856825548e-06, + "loss": 0.0059, + "num_input_tokens_seen": 15571728, + "step": 7947 + }, + { + "epoch": 1.0534128561961564, + "grad_norm": 0.5714061856269836, + "learning_rate": 3.629018145958133e-06, + "loss": 0.0047, + "num_input_tokens_seen": 15573088, + "step": 7948 + }, + { + "epoch": 1.0535453943008615, + "grad_norm": 12.479203224182129, + "learning_rate": 3.6287084133224693e-06, + "loss": 0.1991, + "num_input_tokens_seen": 15576712, + "step": 7949 + }, + { + "epoch": 1.0536779324055665, + "grad_norm": 6.5978312492370605, + "learning_rate": 3.6283986589245283e-06, + "loss": 0.0816, + "num_input_tokens_seen": 15578864, + "step": 7950 + }, + { + "epoch": 1.0538104705102718, + "grad_norm": 17.88849449157715, + "learning_rate": 3.628088882770283e-06, + "loss": 0.7151, + "num_input_tokens_seen": 15580856, + "step": 7951 + }, + { + "epoch": 1.0539430086149768, + "grad_norm": 0.022077850997447968, + "learning_rate": 3.6277790848657055e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15581872, + "step": 7952 + }, + { + "epoch": 1.0540755467196818, + "grad_norm": 0.027225205674767494, + "learning_rate": 3.6274692652167703e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15583632, + "step": 7953 + }, + { + "epoch": 1.054208084824387, + "grad_norm": 4.710811614990234, + "learning_rate": 3.627159423829449e-06, + "loss": 0.0953, + "num_input_tokens_seen": 15585832, + "step": 7954 + }, + { + "epoch": 1.0543406229290921, + "grad_norm": 0.7662599682807922, + "learning_rate": 3.6268495607097166e-06, + "loss": 0.006, + "num_input_tokens_seen": 15587240, + "step": 7955 + }, + { + "epoch": 1.0544731610337972, + "grad_norm": 0.5820773243904114, + "learning_rate": 3.6265396758635475e-06, + "loss": 0.0071, + "num_input_tokens_seen": 15588728, + "step": 7956 + }, + { + "epoch": 1.0546056991385022, + "grad_norm": 9.566177368164062, + "learning_rate": 3.6262297692969155e-06, + "loss": 0.0658, + "num_input_tokens_seen": 15590408, + "step": 7957 + }, + { + "epoch": 1.0547382372432075, + "grad_norm": 0.06261811405420303, + "learning_rate": 3.6259198410157973e-06, + "loss": 0.0005, + "num_input_tokens_seen": 15592480, + "step": 7958 + }, + { + "epoch": 1.0548707753479125, + "grad_norm": 5.135053634643555, + "learning_rate": 3.625609891026168e-06, + "loss": 0.0409, + "num_input_tokens_seen": 15594088, + "step": 7959 + }, + { + "epoch": 1.0550033134526176, + "grad_norm": 9.567028045654297, + "learning_rate": 3.6252999193340034e-06, + "loss": 0.3779, + "num_input_tokens_seen": 15596904, + "step": 7960 + }, + { + "epoch": 1.0551358515573228, + "grad_norm": 3.100637197494507, + "learning_rate": 3.6249899259452797e-06, + "loss": 0.0555, + "num_input_tokens_seen": 15598208, + "step": 7961 + }, + { + "epoch": 1.0552683896620279, + "grad_norm": 5.322288990020752, + "learning_rate": 3.6246799108659743e-06, + "loss": 0.1527, + "num_input_tokens_seen": 15600064, + "step": 7962 + }, + { + "epoch": 1.0554009277667329, + "grad_norm": 9.8389253616333, + "learning_rate": 3.624369874102064e-06, + "loss": 0.2311, + "num_input_tokens_seen": 15602128, + "step": 7963 + }, + { + "epoch": 1.055533465871438, + "grad_norm": 10.773490905761719, + "learning_rate": 3.6240598156595276e-06, + "loss": 0.1671, + "num_input_tokens_seen": 15604200, + "step": 7964 + }, + { + "epoch": 1.0556660039761432, + "grad_norm": 1.9360040426254272, + "learning_rate": 3.6237497355443417e-06, + "loss": 0.0253, + "num_input_tokens_seen": 15605872, + "step": 7965 + }, + { + "epoch": 1.0557985420808482, + "grad_norm": 10.263525009155273, + "learning_rate": 3.623439633762486e-06, + "loss": 0.0645, + "num_input_tokens_seen": 15609072, + "step": 7966 + }, + { + "epoch": 1.0559310801855533, + "grad_norm": 8.068907737731934, + "learning_rate": 3.6231295103199394e-06, + "loss": 0.1277, + "num_input_tokens_seen": 15611088, + "step": 7967 + }, + { + "epoch": 1.0560636182902585, + "grad_norm": 3.7351903915405273, + "learning_rate": 3.6228193652226813e-06, + "loss": 0.0571, + "num_input_tokens_seen": 15612944, + "step": 7968 + }, + { + "epoch": 1.0561961563949636, + "grad_norm": 4.450117111206055, + "learning_rate": 3.6225091984766904e-06, + "loss": 0.071, + "num_input_tokens_seen": 15615088, + "step": 7969 + }, + { + "epoch": 1.0563286944996686, + "grad_norm": 2.517181396484375, + "learning_rate": 3.6221990100879477e-06, + "loss": 0.0363, + "num_input_tokens_seen": 15616944, + "step": 7970 + }, + { + "epoch": 1.0564612326043739, + "grad_norm": 6.846189975738525, + "learning_rate": 3.6218888000624353e-06, + "loss": 0.1158, + "num_input_tokens_seen": 15618920, + "step": 7971 + }, + { + "epoch": 1.056593770709079, + "grad_norm": 0.2078702747821808, + "learning_rate": 3.621578568406131e-06, + "loss": 0.0016, + "num_input_tokens_seen": 15621712, + "step": 7972 + }, + { + "epoch": 1.056726308813784, + "grad_norm": 0.3657509386539459, + "learning_rate": 3.6212683151250194e-06, + "loss": 0.0028, + "num_input_tokens_seen": 15623792, + "step": 7973 + }, + { + "epoch": 1.056858846918489, + "grad_norm": 6.588100433349609, + "learning_rate": 3.6209580402250816e-06, + "loss": 0.1765, + "num_input_tokens_seen": 15625336, + "step": 7974 + }, + { + "epoch": 1.0569913850231942, + "grad_norm": 0.3558519780635834, + "learning_rate": 3.6206477437122977e-06, + "loss": 0.0027, + "num_input_tokens_seen": 15626384, + "step": 7975 + }, + { + "epoch": 1.0571239231278993, + "grad_norm": 5.9306135177612305, + "learning_rate": 3.6203374255926537e-06, + "loss": 0.1036, + "num_input_tokens_seen": 15628432, + "step": 7976 + }, + { + "epoch": 1.0572564612326043, + "grad_norm": 13.191484451293945, + "learning_rate": 3.620027085872131e-06, + "loss": 0.287, + "num_input_tokens_seen": 15630152, + "step": 7977 + }, + { + "epoch": 1.0573889993373096, + "grad_norm": 0.3466063439846039, + "learning_rate": 3.6197167245567128e-06, + "loss": 0.0019, + "num_input_tokens_seen": 15632144, + "step": 7978 + }, + { + "epoch": 1.0575215374420146, + "grad_norm": 11.53245735168457, + "learning_rate": 3.6194063416523844e-06, + "loss": 0.324, + "num_input_tokens_seen": 15635000, + "step": 7979 + }, + { + "epoch": 1.0576540755467196, + "grad_norm": 11.756266593933105, + "learning_rate": 3.6190959371651293e-06, + "loss": 0.1556, + "num_input_tokens_seen": 15638216, + "step": 7980 + }, + { + "epoch": 1.0577866136514247, + "grad_norm": 3.1588351726531982, + "learning_rate": 3.6187855111009325e-06, + "loss": 0.0648, + "num_input_tokens_seen": 15641104, + "step": 7981 + }, + { + "epoch": 1.05791915175613, + "grad_norm": 6.496006488800049, + "learning_rate": 3.6184750634657796e-06, + "loss": 0.1822, + "num_input_tokens_seen": 15642960, + "step": 7982 + }, + { + "epoch": 1.058051689860835, + "grad_norm": 0.6988822817802429, + "learning_rate": 3.618164594265655e-06, + "loss": 0.0082, + "num_input_tokens_seen": 15645112, + "step": 7983 + }, + { + "epoch": 1.05818422796554, + "grad_norm": 6.729306221008301, + "learning_rate": 3.617854103506546e-06, + "loss": 0.1606, + "num_input_tokens_seen": 15646928, + "step": 7984 + }, + { + "epoch": 1.0583167660702453, + "grad_norm": 9.223274230957031, + "learning_rate": 3.6175435911944385e-06, + "loss": 0.1782, + "num_input_tokens_seen": 15648544, + "step": 7985 + }, + { + "epoch": 1.0584493041749503, + "grad_norm": 0.09302958101034164, + "learning_rate": 3.6172330573353203e-06, + "loss": 0.0007, + "num_input_tokens_seen": 15649840, + "step": 7986 + }, + { + "epoch": 1.0585818422796553, + "grad_norm": 12.603796005249023, + "learning_rate": 3.616922501935177e-06, + "loss": 0.2568, + "num_input_tokens_seen": 15651320, + "step": 7987 + }, + { + "epoch": 1.0587143803843606, + "grad_norm": 6.875571250915527, + "learning_rate": 3.6166119249999985e-06, + "loss": 0.1152, + "num_input_tokens_seen": 15653488, + "step": 7988 + }, + { + "epoch": 1.0588469184890656, + "grad_norm": 7.396050930023193, + "learning_rate": 3.6163013265357716e-06, + "loss": 0.188, + "num_input_tokens_seen": 15655680, + "step": 7989 + }, + { + "epoch": 1.0589794565937707, + "grad_norm": 9.612442016601562, + "learning_rate": 3.615990706548484e-06, + "loss": 0.2561, + "num_input_tokens_seen": 15657928, + "step": 7990 + }, + { + "epoch": 1.0591119946984757, + "grad_norm": 3.7665109634399414, + "learning_rate": 3.6156800650441266e-06, + "loss": 0.0395, + "num_input_tokens_seen": 15659472, + "step": 7991 + }, + { + "epoch": 1.059244532803181, + "grad_norm": 5.557758808135986, + "learning_rate": 3.6153694020286876e-06, + "loss": 0.0386, + "num_input_tokens_seen": 15662088, + "step": 7992 + }, + { + "epoch": 1.059377070907886, + "grad_norm": 0.1930140107870102, + "learning_rate": 3.615058717508157e-06, + "loss": 0.0015, + "num_input_tokens_seen": 15663968, + "step": 7993 + }, + { + "epoch": 1.059509609012591, + "grad_norm": 1.4421720504760742, + "learning_rate": 3.6147480114885255e-06, + "loss": 0.0095, + "num_input_tokens_seen": 15665056, + "step": 7994 + }, + { + "epoch": 1.0596421471172963, + "grad_norm": 0.11434435844421387, + "learning_rate": 3.614437283975783e-06, + "loss": 0.0009, + "num_input_tokens_seen": 15666216, + "step": 7995 + }, + { + "epoch": 1.0597746852220014, + "grad_norm": 5.47565221786499, + "learning_rate": 3.61412653497592e-06, + "loss": 0.164, + "num_input_tokens_seen": 15669144, + "step": 7996 + }, + { + "epoch": 1.0599072233267064, + "grad_norm": 9.914389610290527, + "learning_rate": 3.6138157644949307e-06, + "loss": 0.1557, + "num_input_tokens_seen": 15670800, + "step": 7997 + }, + { + "epoch": 1.0600397614314114, + "grad_norm": 12.099952697753906, + "learning_rate": 3.6135049725388036e-06, + "loss": 0.4654, + "num_input_tokens_seen": 15673584, + "step": 7998 + }, + { + "epoch": 1.0601722995361167, + "grad_norm": 0.09205961972475052, + "learning_rate": 3.613194159113533e-06, + "loss": 0.0007, + "num_input_tokens_seen": 15674960, + "step": 7999 + }, + { + "epoch": 1.0603048376408217, + "grad_norm": 0.14178790152072906, + "learning_rate": 3.6128833242251115e-06, + "loss": 0.0011, + "num_input_tokens_seen": 15677320, + "step": 8000 + }, + { + "epoch": 1.0604373757455268, + "grad_norm": 0.8104797005653381, + "learning_rate": 3.612572467879531e-06, + "loss": 0.0056, + "num_input_tokens_seen": 15678816, + "step": 8001 + }, + { + "epoch": 1.060569913850232, + "grad_norm": 3.1845006942749023, + "learning_rate": 3.6122615900827855e-06, + "loss": 0.0399, + "num_input_tokens_seen": 15681680, + "step": 8002 + }, + { + "epoch": 1.060702451954937, + "grad_norm": 6.671764850616455, + "learning_rate": 3.61195069084087e-06, + "loss": 0.2277, + "num_input_tokens_seen": 15683248, + "step": 8003 + }, + { + "epoch": 1.060834990059642, + "grad_norm": 6.917355060577393, + "learning_rate": 3.611639770159778e-06, + "loss": 0.1731, + "num_input_tokens_seen": 15685192, + "step": 8004 + }, + { + "epoch": 1.0609675281643471, + "grad_norm": 5.837344646453857, + "learning_rate": 3.611328828045504e-06, + "loss": 0.0849, + "num_input_tokens_seen": 15686936, + "step": 8005 + }, + { + "epoch": 1.0611000662690524, + "grad_norm": 8.548568725585938, + "learning_rate": 3.6110178645040437e-06, + "loss": 0.3469, + "num_input_tokens_seen": 15689216, + "step": 8006 + }, + { + "epoch": 1.0612326043737574, + "grad_norm": 13.973820686340332, + "learning_rate": 3.6107068795413926e-06, + "loss": 0.1346, + "num_input_tokens_seen": 15690704, + "step": 8007 + }, + { + "epoch": 1.0613651424784625, + "grad_norm": 0.3159739077091217, + "learning_rate": 3.6103958731635468e-06, + "loss": 0.0024, + "num_input_tokens_seen": 15692776, + "step": 8008 + }, + { + "epoch": 1.0614976805831677, + "grad_norm": 3.2810280323028564, + "learning_rate": 3.6100848453765027e-06, + "loss": 0.0429, + "num_input_tokens_seen": 15694208, + "step": 8009 + }, + { + "epoch": 1.0616302186878728, + "grad_norm": 0.11063665896654129, + "learning_rate": 3.609773796186256e-06, + "loss": 0.0008, + "num_input_tokens_seen": 15696032, + "step": 8010 + }, + { + "epoch": 1.0617627567925778, + "grad_norm": 10.046136856079102, + "learning_rate": 3.609462725598806e-06, + "loss": 0.1981, + "num_input_tokens_seen": 15698640, + "step": 8011 + }, + { + "epoch": 1.061895294897283, + "grad_norm": 9.504996299743652, + "learning_rate": 3.6091516336201483e-06, + "loss": 0.1076, + "num_input_tokens_seen": 15700184, + "step": 8012 + }, + { + "epoch": 1.062027833001988, + "grad_norm": 0.27269691228866577, + "learning_rate": 3.6088405202562826e-06, + "loss": 0.0021, + "num_input_tokens_seen": 15701592, + "step": 8013 + }, + { + "epoch": 1.0621603711066931, + "grad_norm": 1.5353727340698242, + "learning_rate": 3.6085293855132063e-06, + "loss": 0.0062, + "num_input_tokens_seen": 15703088, + "step": 8014 + }, + { + "epoch": 1.0622929092113982, + "grad_norm": 4.5996551513671875, + "learning_rate": 3.608218229396919e-06, + "loss": 0.1748, + "num_input_tokens_seen": 15704760, + "step": 8015 + }, + { + "epoch": 1.0624254473161034, + "grad_norm": 2.493993043899536, + "learning_rate": 3.6079070519134196e-06, + "loss": 0.0243, + "num_input_tokens_seen": 15706296, + "step": 8016 + }, + { + "epoch": 1.0625579854208085, + "grad_norm": 4.057400703430176, + "learning_rate": 3.6075958530687084e-06, + "loss": 0.0804, + "num_input_tokens_seen": 15707520, + "step": 8017 + }, + { + "epoch": 1.0626905235255135, + "grad_norm": 0.1196255311369896, + "learning_rate": 3.607284632868785e-06, + "loss": 0.0009, + "num_input_tokens_seen": 15709912, + "step": 8018 + }, + { + "epoch": 1.0628230616302188, + "grad_norm": 0.3267013430595398, + "learning_rate": 3.6069733913196487e-06, + "loss": 0.0025, + "num_input_tokens_seen": 15712176, + "step": 8019 + }, + { + "epoch": 1.0629555997349238, + "grad_norm": 5.814580917358398, + "learning_rate": 3.606662128427303e-06, + "loss": 0.0282, + "num_input_tokens_seen": 15714912, + "step": 8020 + }, + { + "epoch": 1.0630881378396289, + "grad_norm": 0.09737732261419296, + "learning_rate": 3.6063508441977478e-06, + "loss": 0.0007, + "num_input_tokens_seen": 15717888, + "step": 8021 + }, + { + "epoch": 1.063220675944334, + "grad_norm": 5.149318218231201, + "learning_rate": 3.6060395386369852e-06, + "loss": 0.1045, + "num_input_tokens_seen": 15720096, + "step": 8022 + }, + { + "epoch": 1.0633532140490392, + "grad_norm": 4.0814127922058105, + "learning_rate": 3.605728211751017e-06, + "loss": 0.0303, + "num_input_tokens_seen": 15722120, + "step": 8023 + }, + { + "epoch": 1.0634857521537442, + "grad_norm": 14.288308143615723, + "learning_rate": 3.6054168635458464e-06, + "loss": 0.3516, + "num_input_tokens_seen": 15724544, + "step": 8024 + }, + { + "epoch": 1.0636182902584492, + "grad_norm": 4.778275966644287, + "learning_rate": 3.6051054940274755e-06, + "loss": 0.094, + "num_input_tokens_seen": 15727016, + "step": 8025 + }, + { + "epoch": 1.0637508283631545, + "grad_norm": 0.04523913189768791, + "learning_rate": 3.604794103201909e-06, + "loss": 0.0003, + "num_input_tokens_seen": 15728160, + "step": 8026 + }, + { + "epoch": 1.0638833664678595, + "grad_norm": 14.25879955291748, + "learning_rate": 3.6044826910751496e-06, + "loss": 0.1672, + "num_input_tokens_seen": 15730944, + "step": 8027 + }, + { + "epoch": 1.0640159045725646, + "grad_norm": 6.4809370040893555, + "learning_rate": 3.6041712576532017e-06, + "loss": 0.0662, + "num_input_tokens_seen": 15732320, + "step": 8028 + }, + { + "epoch": 1.0641484426772698, + "grad_norm": 0.08630184084177017, + "learning_rate": 3.603859802942071e-06, + "loss": 0.0006, + "num_input_tokens_seen": 15735152, + "step": 8029 + }, + { + "epoch": 1.0642809807819749, + "grad_norm": 7.073913097381592, + "learning_rate": 3.6035483269477613e-06, + "loss": 0.1699, + "num_input_tokens_seen": 15737520, + "step": 8030 + }, + { + "epoch": 1.06441351888668, + "grad_norm": 0.057469792664051056, + "learning_rate": 3.603236829676279e-06, + "loss": 0.0004, + "num_input_tokens_seen": 15738752, + "step": 8031 + }, + { + "epoch": 1.064546056991385, + "grad_norm": 12.062638282775879, + "learning_rate": 3.6029253111336286e-06, + "loss": 0.3565, + "num_input_tokens_seen": 15740856, + "step": 8032 + }, + { + "epoch": 1.0646785950960902, + "grad_norm": 2.599116325378418, + "learning_rate": 3.602613771325818e-06, + "loss": 0.0466, + "num_input_tokens_seen": 15742520, + "step": 8033 + }, + { + "epoch": 1.0648111332007952, + "grad_norm": 5.615490436553955, + "learning_rate": 3.602302210258853e-06, + "loss": 0.0597, + "num_input_tokens_seen": 15745592, + "step": 8034 + }, + { + "epoch": 1.0649436713055003, + "grad_norm": 1.8426272869110107, + "learning_rate": 3.601990627938741e-06, + "loss": 0.0367, + "num_input_tokens_seen": 15748440, + "step": 8035 + }, + { + "epoch": 1.0650762094102055, + "grad_norm": 0.04948681220412254, + "learning_rate": 3.6016790243714903e-06, + "loss": 0.0004, + "num_input_tokens_seen": 15750080, + "step": 8036 + }, + { + "epoch": 1.0652087475149106, + "grad_norm": 9.407215118408203, + "learning_rate": 3.6013673995631066e-06, + "loss": 0.1188, + "num_input_tokens_seen": 15752208, + "step": 8037 + }, + { + "epoch": 1.0653412856196156, + "grad_norm": 7.872118949890137, + "learning_rate": 3.6010557535196005e-06, + "loss": 0.2509, + "num_input_tokens_seen": 15754288, + "step": 8038 + }, + { + "epoch": 1.0654738237243206, + "grad_norm": 6.255258083343506, + "learning_rate": 3.60074408624698e-06, + "loss": 0.2379, + "num_input_tokens_seen": 15755984, + "step": 8039 + }, + { + "epoch": 1.065606361829026, + "grad_norm": 12.880196571350098, + "learning_rate": 3.6004323977512534e-06, + "loss": 0.5036, + "num_input_tokens_seen": 15757888, + "step": 8040 + }, + { + "epoch": 1.065738899933731, + "grad_norm": 9.998342514038086, + "learning_rate": 3.600120688038432e-06, + "loss": 0.2828, + "num_input_tokens_seen": 15759824, + "step": 8041 + }, + { + "epoch": 1.065871438038436, + "grad_norm": 0.03212375566363335, + "learning_rate": 3.599808957114524e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15761160, + "step": 8042 + }, + { + "epoch": 1.0660039761431412, + "grad_norm": 4.342859745025635, + "learning_rate": 3.5994972049855402e-06, + "loss": 0.1486, + "num_input_tokens_seen": 15762816, + "step": 8043 + }, + { + "epoch": 1.0661365142478463, + "grad_norm": 0.11181376874446869, + "learning_rate": 3.599185431657493e-06, + "loss": 0.0008, + "num_input_tokens_seen": 15764528, + "step": 8044 + }, + { + "epoch": 1.0662690523525513, + "grad_norm": 0.03542919456958771, + "learning_rate": 3.5988736371363917e-06, + "loss": 0.0003, + "num_input_tokens_seen": 15765904, + "step": 8045 + }, + { + "epoch": 1.0664015904572564, + "grad_norm": 2.2654542922973633, + "learning_rate": 3.5985618214282493e-06, + "loss": 0.0318, + "num_input_tokens_seen": 15767144, + "step": 8046 + }, + { + "epoch": 1.0665341285619616, + "grad_norm": 0.371366411447525, + "learning_rate": 3.598249984539076e-06, + "loss": 0.0025, + "num_input_tokens_seen": 15768800, + "step": 8047 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 5.629929542541504, + "learning_rate": 3.5979381264748864e-06, + "loss": 0.1896, + "num_input_tokens_seen": 15770808, + "step": 8048 + }, + { + "epoch": 1.0667992047713717, + "grad_norm": 6.7023024559021, + "learning_rate": 3.5976262472416912e-06, + "loss": 0.1372, + "num_input_tokens_seen": 15773168, + "step": 8049 + }, + { + "epoch": 1.066931742876077, + "grad_norm": 8.313822746276855, + "learning_rate": 3.5973143468455053e-06, + "loss": 0.1465, + "num_input_tokens_seen": 15774792, + "step": 8050 + }, + { + "epoch": 1.067064280980782, + "grad_norm": 0.24633155763149261, + "learning_rate": 3.5970024252923422e-06, + "loss": 0.0018, + "num_input_tokens_seen": 15776912, + "step": 8051 + }, + { + "epoch": 1.067196819085487, + "grad_norm": 5.482618808746338, + "learning_rate": 3.5966904825882155e-06, + "loss": 0.1024, + "num_input_tokens_seen": 15778776, + "step": 8052 + }, + { + "epoch": 1.067329357190192, + "grad_norm": 0.06309965252876282, + "learning_rate": 3.5963785187391394e-06, + "loss": 0.0005, + "num_input_tokens_seen": 15780512, + "step": 8053 + }, + { + "epoch": 1.0674618952948973, + "grad_norm": 0.19172799587249756, + "learning_rate": 3.596066533751129e-06, + "loss": 0.0014, + "num_input_tokens_seen": 15783416, + "step": 8054 + }, + { + "epoch": 1.0675944333996024, + "grad_norm": 5.725820541381836, + "learning_rate": 3.595754527630199e-06, + "loss": 0.0403, + "num_input_tokens_seen": 15785312, + "step": 8055 + }, + { + "epoch": 1.0677269715043074, + "grad_norm": 2.857809066772461, + "learning_rate": 3.595442500382367e-06, + "loss": 0.0958, + "num_input_tokens_seen": 15787576, + "step": 8056 + }, + { + "epoch": 1.0678595096090127, + "grad_norm": 6.153066158294678, + "learning_rate": 3.5951304520136474e-06, + "loss": 0.1129, + "num_input_tokens_seen": 15789272, + "step": 8057 + }, + { + "epoch": 1.0679920477137177, + "grad_norm": 0.520875096321106, + "learning_rate": 3.5948183825300574e-06, + "loss": 0.0034, + "num_input_tokens_seen": 15791832, + "step": 8058 + }, + { + "epoch": 1.0681245858184227, + "grad_norm": 9.945199966430664, + "learning_rate": 3.594506291937614e-06, + "loss": 0.1561, + "num_input_tokens_seen": 15793528, + "step": 8059 + }, + { + "epoch": 1.068257123923128, + "grad_norm": 10.815088272094727, + "learning_rate": 3.5941941802423335e-06, + "loss": 0.4284, + "num_input_tokens_seen": 15796104, + "step": 8060 + }, + { + "epoch": 1.068389662027833, + "grad_norm": 0.019287072122097015, + "learning_rate": 3.5938820474502347e-06, + "loss": 0.0001, + "num_input_tokens_seen": 15797096, + "step": 8061 + }, + { + "epoch": 1.068522200132538, + "grad_norm": 5.974384784698486, + "learning_rate": 3.5935698935673356e-06, + "loss": 0.1707, + "num_input_tokens_seen": 15799752, + "step": 8062 + }, + { + "epoch": 1.068654738237243, + "grad_norm": 6.245574951171875, + "learning_rate": 3.593257718599654e-06, + "loss": 0.0765, + "num_input_tokens_seen": 15801504, + "step": 8063 + }, + { + "epoch": 1.0687872763419484, + "grad_norm": 7.039210796356201, + "learning_rate": 3.59294552255321e-06, + "loss": 0.0959, + "num_input_tokens_seen": 15803280, + "step": 8064 + }, + { + "epoch": 1.0689198144466534, + "grad_norm": 5.357276916503906, + "learning_rate": 3.5926333054340224e-06, + "loss": 0.0538, + "num_input_tokens_seen": 15805576, + "step": 8065 + }, + { + "epoch": 1.0690523525513584, + "grad_norm": 0.06459338217973709, + "learning_rate": 3.59232106724811e-06, + "loss": 0.0004, + "num_input_tokens_seen": 15807264, + "step": 8066 + }, + { + "epoch": 1.0691848906560637, + "grad_norm": 4.824656009674072, + "learning_rate": 3.592008808001495e-06, + "loss": 0.0854, + "num_input_tokens_seen": 15809000, + "step": 8067 + }, + { + "epoch": 1.0693174287607687, + "grad_norm": 12.060391426086426, + "learning_rate": 3.591696527700196e-06, + "loss": 0.2814, + "num_input_tokens_seen": 15810720, + "step": 8068 + }, + { + "epoch": 1.0694499668654738, + "grad_norm": 11.24808406829834, + "learning_rate": 3.5913842263502348e-06, + "loss": 0.0572, + "num_input_tokens_seen": 15812408, + "step": 8069 + }, + { + "epoch": 1.069582504970179, + "grad_norm": 8.627835273742676, + "learning_rate": 3.591071903957633e-06, + "loss": 0.1992, + "num_input_tokens_seen": 15814984, + "step": 8070 + }, + { + "epoch": 1.069715043074884, + "grad_norm": 4.567038536071777, + "learning_rate": 3.5907595605284124e-06, + "loss": 0.1277, + "num_input_tokens_seen": 15816320, + "step": 8071 + }, + { + "epoch": 1.0698475811795891, + "grad_norm": 10.388762474060059, + "learning_rate": 3.590447196068595e-06, + "loss": 0.1898, + "num_input_tokens_seen": 15818032, + "step": 8072 + }, + { + "epoch": 1.0699801192842942, + "grad_norm": 2.58420467376709, + "learning_rate": 3.590134810584203e-06, + "loss": 0.0339, + "num_input_tokens_seen": 15819904, + "step": 8073 + }, + { + "epoch": 1.0701126573889994, + "grad_norm": 0.4488075077533722, + "learning_rate": 3.58982240408126e-06, + "loss": 0.0014, + "num_input_tokens_seen": 15821576, + "step": 8074 + }, + { + "epoch": 1.0702451954937044, + "grad_norm": 10.076026916503906, + "learning_rate": 3.589509976565789e-06, + "loss": 0.1657, + "num_input_tokens_seen": 15823296, + "step": 8075 + }, + { + "epoch": 1.0703777335984095, + "grad_norm": 4.311559200286865, + "learning_rate": 3.5891975280438147e-06, + "loss": 0.1262, + "num_input_tokens_seen": 15824808, + "step": 8076 + }, + { + "epoch": 1.0705102717031147, + "grad_norm": 8.541370391845703, + "learning_rate": 3.588885058521361e-06, + "loss": 0.1557, + "num_input_tokens_seen": 15826528, + "step": 8077 + }, + { + "epoch": 1.0706428098078198, + "grad_norm": 10.6084566116333, + "learning_rate": 3.588572568004451e-06, + "loss": 0.187, + "num_input_tokens_seen": 15828160, + "step": 8078 + }, + { + "epoch": 1.0707753479125248, + "grad_norm": 3.0699830055236816, + "learning_rate": 3.588260056499111e-06, + "loss": 0.0206, + "num_input_tokens_seen": 15829976, + "step": 8079 + }, + { + "epoch": 1.0709078860172299, + "grad_norm": 22.24512481689453, + "learning_rate": 3.5879475240113676e-06, + "loss": 0.597, + "num_input_tokens_seen": 15832080, + "step": 8080 + }, + { + "epoch": 1.0710404241219351, + "grad_norm": 8.170434951782227, + "learning_rate": 3.5876349705472445e-06, + "loss": 0.1425, + "num_input_tokens_seen": 15833768, + "step": 8081 + }, + { + "epoch": 1.0711729622266402, + "grad_norm": 0.05929776653647423, + "learning_rate": 3.5873223961127683e-06, + "loss": 0.0004, + "num_input_tokens_seen": 15835664, + "step": 8082 + }, + { + "epoch": 1.0713055003313452, + "grad_norm": 6.4934892654418945, + "learning_rate": 3.5870098007139676e-06, + "loss": 0.2244, + "num_input_tokens_seen": 15837720, + "step": 8083 + }, + { + "epoch": 1.0714380384360505, + "grad_norm": 4.574771881103516, + "learning_rate": 3.5866971843568667e-06, + "loss": 0.1167, + "num_input_tokens_seen": 15839528, + "step": 8084 + }, + { + "epoch": 1.0715705765407555, + "grad_norm": 19.111574172973633, + "learning_rate": 3.586384547047495e-06, + "loss": 0.6393, + "num_input_tokens_seen": 15842224, + "step": 8085 + }, + { + "epoch": 1.0717031146454605, + "grad_norm": 9.524335861206055, + "learning_rate": 3.58607188879188e-06, + "loss": 0.2351, + "num_input_tokens_seen": 15843904, + "step": 8086 + }, + { + "epoch": 1.0718356527501656, + "grad_norm": 7.590734481811523, + "learning_rate": 3.5857592095960492e-06, + "loss": 0.1356, + "num_input_tokens_seen": 15845536, + "step": 8087 + }, + { + "epoch": 1.0719681908548708, + "grad_norm": 2.6261420249938965, + "learning_rate": 3.5854465094660325e-06, + "loss": 0.0414, + "num_input_tokens_seen": 15848136, + "step": 8088 + }, + { + "epoch": 1.0721007289595759, + "grad_norm": 0.08045609295368195, + "learning_rate": 3.5851337884078573e-06, + "loss": 0.0006, + "num_input_tokens_seen": 15850160, + "step": 8089 + }, + { + "epoch": 1.072233267064281, + "grad_norm": 0.09503606706857681, + "learning_rate": 3.5848210464275546e-06, + "loss": 0.0007, + "num_input_tokens_seen": 15852576, + "step": 8090 + }, + { + "epoch": 1.0723658051689862, + "grad_norm": 5.456388473510742, + "learning_rate": 3.584508283531154e-06, + "loss": 0.0669, + "num_input_tokens_seen": 15854080, + "step": 8091 + }, + { + "epoch": 1.0724983432736912, + "grad_norm": 0.08958735316991806, + "learning_rate": 3.584195499724686e-06, + "loss": 0.0007, + "num_input_tokens_seen": 15856544, + "step": 8092 + }, + { + "epoch": 1.0726308813783962, + "grad_norm": 7.314899921417236, + "learning_rate": 3.58388269501418e-06, + "loss": 0.1473, + "num_input_tokens_seen": 15858896, + "step": 8093 + }, + { + "epoch": 1.0727634194831013, + "grad_norm": 4.568482398986816, + "learning_rate": 3.5835698694056675e-06, + "loss": 0.0727, + "num_input_tokens_seen": 15861568, + "step": 8094 + }, + { + "epoch": 1.0728959575878065, + "grad_norm": 6.254690647125244, + "learning_rate": 3.583257022905181e-06, + "loss": 0.1294, + "num_input_tokens_seen": 15863192, + "step": 8095 + }, + { + "epoch": 1.0730284956925116, + "grad_norm": 7.0536112785339355, + "learning_rate": 3.582944155518752e-06, + "loss": 0.0943, + "num_input_tokens_seen": 15864968, + "step": 8096 + }, + { + "epoch": 1.0731610337972166, + "grad_norm": 10.413454055786133, + "learning_rate": 3.5826312672524123e-06, + "loss": 0.22, + "num_input_tokens_seen": 15867200, + "step": 8097 + }, + { + "epoch": 1.0732935719019219, + "grad_norm": 6.400625228881836, + "learning_rate": 3.582318358112195e-06, + "loss": 0.1419, + "num_input_tokens_seen": 15869024, + "step": 8098 + }, + { + "epoch": 1.073426110006627, + "grad_norm": 9.621027946472168, + "learning_rate": 3.582005428104133e-06, + "loss": 0.084, + "num_input_tokens_seen": 15871512, + "step": 8099 + }, + { + "epoch": 1.073558648111332, + "grad_norm": 5.116708755493164, + "learning_rate": 3.5816924772342597e-06, + "loss": 0.0327, + "num_input_tokens_seen": 15873256, + "step": 8100 + }, + { + "epoch": 1.0736911862160372, + "grad_norm": 0.0987025648355484, + "learning_rate": 3.5813795055086105e-06, + "loss": 0.0007, + "num_input_tokens_seen": 15875008, + "step": 8101 + }, + { + "epoch": 1.0738237243207422, + "grad_norm": 4.944706439971924, + "learning_rate": 3.581066512933217e-06, + "loss": 0.0325, + "num_input_tokens_seen": 15877008, + "step": 8102 + }, + { + "epoch": 1.0739562624254473, + "grad_norm": 5.2322678565979, + "learning_rate": 3.5807534995141156e-06, + "loss": 0.0574, + "num_input_tokens_seen": 15878952, + "step": 8103 + }, + { + "epoch": 1.0740888005301523, + "grad_norm": 8.959538459777832, + "learning_rate": 3.580440465257341e-06, + "loss": 0.1042, + "num_input_tokens_seen": 15881408, + "step": 8104 + }, + { + "epoch": 1.0742213386348576, + "grad_norm": 1.8725056648254395, + "learning_rate": 3.5801274101689292e-06, + "loss": 0.0081, + "num_input_tokens_seen": 15883824, + "step": 8105 + }, + { + "epoch": 1.0743538767395626, + "grad_norm": 2.592747449874878, + "learning_rate": 3.5798143342549164e-06, + "loss": 0.0238, + "num_input_tokens_seen": 15885720, + "step": 8106 + }, + { + "epoch": 1.0744864148442677, + "grad_norm": 6.40894889831543, + "learning_rate": 3.5795012375213373e-06, + "loss": 0.1735, + "num_input_tokens_seen": 15888392, + "step": 8107 + }, + { + "epoch": 1.074618952948973, + "grad_norm": 0.06675010919570923, + "learning_rate": 3.5791881199742295e-06, + "loss": 0.0005, + "num_input_tokens_seen": 15890144, + "step": 8108 + }, + { + "epoch": 1.074751491053678, + "grad_norm": 5.444103240966797, + "learning_rate": 3.5788749816196316e-06, + "loss": 0.1699, + "num_input_tokens_seen": 15892288, + "step": 8109 + }, + { + "epoch": 1.074884029158383, + "grad_norm": 11.893511772155762, + "learning_rate": 3.5785618224635787e-06, + "loss": 0.0667, + "num_input_tokens_seen": 15893912, + "step": 8110 + }, + { + "epoch": 1.0750165672630883, + "grad_norm": 8.7160005569458, + "learning_rate": 3.5782486425121097e-06, + "loss": 0.1929, + "num_input_tokens_seen": 15896080, + "step": 8111 + }, + { + "epoch": 1.0751491053677933, + "grad_norm": 8.394972801208496, + "learning_rate": 3.5779354417712635e-06, + "loss": 0.1596, + "num_input_tokens_seen": 15898248, + "step": 8112 + }, + { + "epoch": 1.0752816434724983, + "grad_norm": 1.3126581907272339, + "learning_rate": 3.5776222202470782e-06, + "loss": 0.0032, + "num_input_tokens_seen": 15901040, + "step": 8113 + }, + { + "epoch": 1.0754141815772034, + "grad_norm": 7.538087368011475, + "learning_rate": 3.577308977945593e-06, + "loss": 0.1423, + "num_input_tokens_seen": 15903304, + "step": 8114 + }, + { + "epoch": 1.0755467196819086, + "grad_norm": 0.28379806876182556, + "learning_rate": 3.576995714872848e-06, + "loss": 0.0039, + "num_input_tokens_seen": 15905280, + "step": 8115 + }, + { + "epoch": 1.0756792577866137, + "grad_norm": 0.036194901913404465, + "learning_rate": 3.5766824310348824e-06, + "loss": 0.0002, + "num_input_tokens_seen": 15906592, + "step": 8116 + }, + { + "epoch": 1.0758117958913187, + "grad_norm": 9.973142623901367, + "learning_rate": 3.576369126437736e-06, + "loss": 0.1383, + "num_input_tokens_seen": 15908592, + "step": 8117 + }, + { + "epoch": 1.075944333996024, + "grad_norm": 0.06339427083730698, + "learning_rate": 3.5760558010874515e-06, + "loss": 0.0005, + "num_input_tokens_seen": 15910848, + "step": 8118 + }, + { + "epoch": 1.076076872100729, + "grad_norm": 13.152695655822754, + "learning_rate": 3.5757424549900684e-06, + "loss": 0.4119, + "num_input_tokens_seen": 15912784, + "step": 8119 + }, + { + "epoch": 1.076209410205434, + "grad_norm": 1.4193695783615112, + "learning_rate": 3.575429088151628e-06, + "loss": 0.0279, + "num_input_tokens_seen": 15914432, + "step": 8120 + }, + { + "epoch": 1.076341948310139, + "grad_norm": 7.375639915466309, + "learning_rate": 3.5751157005781733e-06, + "loss": 0.0785, + "num_input_tokens_seen": 15917184, + "step": 8121 + }, + { + "epoch": 1.0764744864148443, + "grad_norm": 10.823859214782715, + "learning_rate": 3.5748022922757464e-06, + "loss": 0.4937, + "num_input_tokens_seen": 15919072, + "step": 8122 + }, + { + "epoch": 1.0766070245195494, + "grad_norm": 1.9543877840042114, + "learning_rate": 3.5744888632503895e-06, + "loss": 0.0392, + "num_input_tokens_seen": 15920616, + "step": 8123 + }, + { + "epoch": 1.0767395626242544, + "grad_norm": 5.766176223754883, + "learning_rate": 3.574175413508146e-06, + "loss": 0.1383, + "num_input_tokens_seen": 15922632, + "step": 8124 + }, + { + "epoch": 1.0768721007289597, + "grad_norm": 3.0857796669006348, + "learning_rate": 3.5738619430550604e-06, + "loss": 0.0551, + "num_input_tokens_seen": 15925224, + "step": 8125 + }, + { + "epoch": 1.0770046388336647, + "grad_norm": 0.8152457475662231, + "learning_rate": 3.5735484518971746e-06, + "loss": 0.0141, + "num_input_tokens_seen": 15927176, + "step": 8126 + }, + { + "epoch": 1.0771371769383697, + "grad_norm": 2.7200238704681396, + "learning_rate": 3.5732349400405352e-06, + "loss": 0.0507, + "num_input_tokens_seen": 15929656, + "step": 8127 + }, + { + "epoch": 1.0772697150430748, + "grad_norm": 5.794968128204346, + "learning_rate": 3.572921407491185e-06, + "loss": 0.1987, + "num_input_tokens_seen": 15931800, + "step": 8128 + }, + { + "epoch": 1.07740225314778, + "grad_norm": 7.812846660614014, + "learning_rate": 3.5726078542551697e-06, + "loss": 0.1798, + "num_input_tokens_seen": 15934648, + "step": 8129 + }, + { + "epoch": 1.077534791252485, + "grad_norm": 6.983575820922852, + "learning_rate": 3.572294280338535e-06, + "loss": 0.2329, + "num_input_tokens_seen": 15936544, + "step": 8130 + }, + { + "epoch": 1.0776673293571901, + "grad_norm": 7.1914753913879395, + "learning_rate": 3.571980685747328e-06, + "loss": 0.1489, + "num_input_tokens_seen": 15938160, + "step": 8131 + }, + { + "epoch": 1.0777998674618954, + "grad_norm": 1.1720621585845947, + "learning_rate": 3.571667070487592e-06, + "loss": 0.0187, + "num_input_tokens_seen": 15940000, + "step": 8132 + }, + { + "epoch": 1.0779324055666004, + "grad_norm": 7.3055195808410645, + "learning_rate": 3.5713534345653768e-06, + "loss": 0.1616, + "num_input_tokens_seen": 15941856, + "step": 8133 + }, + { + "epoch": 1.0780649436713055, + "grad_norm": 1.5841007232666016, + "learning_rate": 3.5710397779867286e-06, + "loss": 0.0049, + "num_input_tokens_seen": 15943352, + "step": 8134 + }, + { + "epoch": 1.0781974817760105, + "grad_norm": 4.9820942878723145, + "learning_rate": 3.570726100757693e-06, + "loss": 0.1462, + "num_input_tokens_seen": 15945064, + "step": 8135 + }, + { + "epoch": 1.0783300198807158, + "grad_norm": 8.949235916137695, + "learning_rate": 3.570412402884321e-06, + "loss": 0.2062, + "num_input_tokens_seen": 15946616, + "step": 8136 + }, + { + "epoch": 1.0784625579854208, + "grad_norm": 0.11764899641275406, + "learning_rate": 3.570098684372659e-06, + "loss": 0.0007, + "num_input_tokens_seen": 15947768, + "step": 8137 + }, + { + "epoch": 1.0785950960901258, + "grad_norm": 11.29038143157959, + "learning_rate": 3.5697849452287565e-06, + "loss": 0.3404, + "num_input_tokens_seen": 15950504, + "step": 8138 + }, + { + "epoch": 1.078727634194831, + "grad_norm": 11.140201568603516, + "learning_rate": 3.5694711854586623e-06, + "loss": 0.2069, + "num_input_tokens_seen": 15952096, + "step": 8139 + }, + { + "epoch": 1.0788601722995361, + "grad_norm": 0.0532853864133358, + "learning_rate": 3.569157405068425e-06, + "loss": 0.0004, + "num_input_tokens_seen": 15953768, + "step": 8140 + }, + { + "epoch": 1.0789927104042412, + "grad_norm": 0.04913414269685745, + "learning_rate": 3.5688436040640956e-06, + "loss": 0.0004, + "num_input_tokens_seen": 15955416, + "step": 8141 + }, + { + "epoch": 1.0791252485089464, + "grad_norm": 6.232329368591309, + "learning_rate": 3.5685297824517253e-06, + "loss": 0.0786, + "num_input_tokens_seen": 15957624, + "step": 8142 + }, + { + "epoch": 1.0792577866136515, + "grad_norm": 2.941006660461426, + "learning_rate": 3.568215940237362e-06, + "loss": 0.057, + "num_input_tokens_seen": 15959456, + "step": 8143 + }, + { + "epoch": 1.0793903247183565, + "grad_norm": 0.049006737768650055, + "learning_rate": 3.567902077427059e-06, + "loss": 0.0004, + "num_input_tokens_seen": 15960432, + "step": 8144 + }, + { + "epoch": 1.0795228628230615, + "grad_norm": 6.584778308868408, + "learning_rate": 3.567588194026868e-06, + "loss": 0.0863, + "num_input_tokens_seen": 15962376, + "step": 8145 + }, + { + "epoch": 1.0796554009277668, + "grad_norm": 3.96089768409729, + "learning_rate": 3.5672742900428394e-06, + "loss": 0.0207, + "num_input_tokens_seen": 15964544, + "step": 8146 + }, + { + "epoch": 1.0797879390324718, + "grad_norm": 10.333329200744629, + "learning_rate": 3.5669603654810262e-06, + "loss": 0.2505, + "num_input_tokens_seen": 15966904, + "step": 8147 + }, + { + "epoch": 1.0799204771371769, + "grad_norm": 4.460213661193848, + "learning_rate": 3.5666464203474814e-06, + "loss": 0.0738, + "num_input_tokens_seen": 15968640, + "step": 8148 + }, + { + "epoch": 1.0800530152418821, + "grad_norm": 8.36618709564209, + "learning_rate": 3.5663324546482574e-06, + "loss": 0.2368, + "num_input_tokens_seen": 15971320, + "step": 8149 + }, + { + "epoch": 1.0801855533465872, + "grad_norm": 4.934171676635742, + "learning_rate": 3.5660184683894085e-06, + "loss": 0.0418, + "num_input_tokens_seen": 15973416, + "step": 8150 + }, + { + "epoch": 1.0803180914512922, + "grad_norm": 10.742892265319824, + "learning_rate": 3.5657044615769874e-06, + "loss": 0.2656, + "num_input_tokens_seen": 15975560, + "step": 8151 + }, + { + "epoch": 1.0804506295559975, + "grad_norm": 0.1110231950879097, + "learning_rate": 3.5653904342170497e-06, + "loss": 0.0008, + "num_input_tokens_seen": 15976832, + "step": 8152 + }, + { + "epoch": 1.0805831676607025, + "grad_norm": 0.21634437143802643, + "learning_rate": 3.5650763863156503e-06, + "loss": 0.0014, + "num_input_tokens_seen": 15978912, + "step": 8153 + }, + { + "epoch": 1.0807157057654075, + "grad_norm": 4.954441547393799, + "learning_rate": 3.564762317878842e-06, + "loss": 0.0684, + "num_input_tokens_seen": 15980856, + "step": 8154 + }, + { + "epoch": 1.0808482438701126, + "grad_norm": 2.1595606803894043, + "learning_rate": 3.564448228912682e-06, + "loss": 0.0185, + "num_input_tokens_seen": 15982376, + "step": 8155 + }, + { + "epoch": 1.0809807819748178, + "grad_norm": 3.8298614025115967, + "learning_rate": 3.564134119423226e-06, + "loss": 0.0246, + "num_input_tokens_seen": 15984160, + "step": 8156 + }, + { + "epoch": 1.0811133200795229, + "grad_norm": 6.214467525482178, + "learning_rate": 3.56381998941653e-06, + "loss": 0.1867, + "num_input_tokens_seen": 15986176, + "step": 8157 + }, + { + "epoch": 1.081245858184228, + "grad_norm": 9.021778106689453, + "learning_rate": 3.5635058388986508e-06, + "loss": 0.04, + "num_input_tokens_seen": 15987672, + "step": 8158 + }, + { + "epoch": 1.0813783962889332, + "grad_norm": 11.224575996398926, + "learning_rate": 3.5631916678756453e-06, + "loss": 0.135, + "num_input_tokens_seen": 15990824, + "step": 8159 + }, + { + "epoch": 1.0815109343936382, + "grad_norm": 0.042119693011045456, + "learning_rate": 3.5628774763535713e-06, + "loss": 0.0003, + "num_input_tokens_seen": 15992128, + "step": 8160 + }, + { + "epoch": 1.0816434724983433, + "grad_norm": 0.1152798980474472, + "learning_rate": 3.5625632643384855e-06, + "loss": 0.0008, + "num_input_tokens_seen": 15993888, + "step": 8161 + }, + { + "epoch": 1.0817760106030483, + "grad_norm": 5.699147701263428, + "learning_rate": 3.5622490318364476e-06, + "loss": 0.145, + "num_input_tokens_seen": 15996088, + "step": 8162 + }, + { + "epoch": 1.0819085487077535, + "grad_norm": 8.137140274047852, + "learning_rate": 3.561934778853515e-06, + "loss": 0.1752, + "num_input_tokens_seen": 15997904, + "step": 8163 + }, + { + "epoch": 1.0820410868124586, + "grad_norm": 0.05584509298205376, + "learning_rate": 3.5616205053957474e-06, + "loss": 0.0004, + "num_input_tokens_seen": 15999824, + "step": 8164 + }, + { + "epoch": 1.0821736249171636, + "grad_norm": 0.4254057705402374, + "learning_rate": 3.561306211469205e-06, + "loss": 0.0025, + "num_input_tokens_seen": 16002496, + "step": 8165 + }, + { + "epoch": 1.0823061630218689, + "grad_norm": 0.2343982309103012, + "learning_rate": 3.5609918970799464e-06, + "loss": 0.0014, + "num_input_tokens_seen": 16004448, + "step": 8166 + }, + { + "epoch": 1.082438701126574, + "grad_norm": 4.10145378112793, + "learning_rate": 3.5606775622340307e-06, + "loss": 0.0292, + "num_input_tokens_seen": 16006984, + "step": 8167 + }, + { + "epoch": 1.082571239231279, + "grad_norm": 0.07509990781545639, + "learning_rate": 3.5603632069375215e-06, + "loss": 0.0005, + "num_input_tokens_seen": 16008464, + "step": 8168 + }, + { + "epoch": 1.082703777335984, + "grad_norm": 15.488178253173828, + "learning_rate": 3.5600488311964778e-06, + "loss": 0.38, + "num_input_tokens_seen": 16010672, + "step": 8169 + }, + { + "epoch": 1.0828363154406893, + "grad_norm": 0.6473097801208496, + "learning_rate": 3.5597344350169606e-06, + "loss": 0.0061, + "num_input_tokens_seen": 16012088, + "step": 8170 + }, + { + "epoch": 1.0829688535453943, + "grad_norm": 9.896974563598633, + "learning_rate": 3.559420018405033e-06, + "loss": 0.2312, + "num_input_tokens_seen": 16015128, + "step": 8171 + }, + { + "epoch": 1.0831013916500993, + "grad_norm": 11.054922103881836, + "learning_rate": 3.559105581366756e-06, + "loss": 0.2444, + "num_input_tokens_seen": 16017160, + "step": 8172 + }, + { + "epoch": 1.0832339297548046, + "grad_norm": 1.8386130332946777, + "learning_rate": 3.5587911239081936e-06, + "loss": 0.0254, + "num_input_tokens_seen": 16018464, + "step": 8173 + }, + { + "epoch": 1.0833664678595096, + "grad_norm": 0.040409430861473083, + "learning_rate": 3.5584766460354073e-06, + "loss": 0.0003, + "num_input_tokens_seen": 16019544, + "step": 8174 + }, + { + "epoch": 1.0834990059642147, + "grad_norm": 5.590936183929443, + "learning_rate": 3.558162147754462e-06, + "loss": 0.2231, + "num_input_tokens_seen": 16022496, + "step": 8175 + }, + { + "epoch": 1.0836315440689197, + "grad_norm": 0.08457775413990021, + "learning_rate": 3.557847629071419e-06, + "loss": 0.0006, + "num_input_tokens_seen": 16023560, + "step": 8176 + }, + { + "epoch": 1.083764082173625, + "grad_norm": 6.011713981628418, + "learning_rate": 3.5575330899923456e-06, + "loss": 0.1792, + "num_input_tokens_seen": 16024904, + "step": 8177 + }, + { + "epoch": 1.08389662027833, + "grad_norm": 0.1273772418498993, + "learning_rate": 3.557218530523304e-06, + "loss": 0.0009, + "num_input_tokens_seen": 16026408, + "step": 8178 + }, + { + "epoch": 1.084029158383035, + "grad_norm": 0.2513926923274994, + "learning_rate": 3.5569039506703583e-06, + "loss": 0.0018, + "num_input_tokens_seen": 16027736, + "step": 8179 + }, + { + "epoch": 1.0841616964877403, + "grad_norm": 2.227747917175293, + "learning_rate": 3.5565893504395776e-06, + "loss": 0.017, + "num_input_tokens_seen": 16029384, + "step": 8180 + }, + { + "epoch": 1.0842942345924453, + "grad_norm": 11.231149673461914, + "learning_rate": 3.5562747298370247e-06, + "loss": 0.0915, + "num_input_tokens_seen": 16031272, + "step": 8181 + }, + { + "epoch": 1.0844267726971504, + "grad_norm": 5.937643051147461, + "learning_rate": 3.5559600888687653e-06, + "loss": 0.1258, + "num_input_tokens_seen": 16032912, + "step": 8182 + }, + { + "epoch": 1.0845593108018556, + "grad_norm": 6.154516220092773, + "learning_rate": 3.5556454275408683e-06, + "loss": 0.096, + "num_input_tokens_seen": 16035176, + "step": 8183 + }, + { + "epoch": 1.0846918489065607, + "grad_norm": 0.14566108584403992, + "learning_rate": 3.5553307458593985e-06, + "loss": 0.0011, + "num_input_tokens_seen": 16036544, + "step": 8184 + }, + { + "epoch": 1.0848243870112657, + "grad_norm": 3.7918355464935303, + "learning_rate": 3.555016043830423e-06, + "loss": 0.116, + "num_input_tokens_seen": 16038952, + "step": 8185 + }, + { + "epoch": 1.0849569251159707, + "grad_norm": 0.853606104850769, + "learning_rate": 3.554701321460012e-06, + "loss": 0.0053, + "num_input_tokens_seen": 16040448, + "step": 8186 + }, + { + "epoch": 1.085089463220676, + "grad_norm": 1.653964877128601, + "learning_rate": 3.5543865787542308e-06, + "loss": 0.0193, + "num_input_tokens_seen": 16042320, + "step": 8187 + }, + { + "epoch": 1.085222001325381, + "grad_norm": 15.24460220336914, + "learning_rate": 3.554071815719149e-06, + "loss": 0.1534, + "num_input_tokens_seen": 16044096, + "step": 8188 + }, + { + "epoch": 1.085354539430086, + "grad_norm": 5.259398460388184, + "learning_rate": 3.553757032360836e-06, + "loss": 0.0402, + "num_input_tokens_seen": 16046064, + "step": 8189 + }, + { + "epoch": 1.0854870775347913, + "grad_norm": 0.33785828948020935, + "learning_rate": 3.55344222868536e-06, + "loss": 0.0024, + "num_input_tokens_seen": 16047432, + "step": 8190 + }, + { + "epoch": 1.0856196156394964, + "grad_norm": 0.049030616879463196, + "learning_rate": 3.553127404698792e-06, + "loss": 0.0003, + "num_input_tokens_seen": 16048880, + "step": 8191 + }, + { + "epoch": 1.0857521537442014, + "grad_norm": 0.11163589358329773, + "learning_rate": 3.5528125604072e-06, + "loss": 0.0008, + "num_input_tokens_seen": 16051056, + "step": 8192 + }, + { + "epoch": 1.0858846918489065, + "grad_norm": 0.04698791354894638, + "learning_rate": 3.5524976958166563e-06, + "loss": 0.0003, + "num_input_tokens_seen": 16052200, + "step": 8193 + }, + { + "epoch": 1.0860172299536117, + "grad_norm": 2.4435131549835205, + "learning_rate": 3.55218281093323e-06, + "loss": 0.0255, + "num_input_tokens_seen": 16053704, + "step": 8194 + }, + { + "epoch": 1.0861497680583168, + "grad_norm": 1.1539897918701172, + "learning_rate": 3.551867905762994e-06, + "loss": 0.0136, + "num_input_tokens_seen": 16055592, + "step": 8195 + }, + { + "epoch": 1.0862823061630218, + "grad_norm": 10.937845230102539, + "learning_rate": 3.5515529803120195e-06, + "loss": 0.0967, + "num_input_tokens_seen": 16057584, + "step": 8196 + }, + { + "epoch": 1.086414844267727, + "grad_norm": 0.058928750455379486, + "learning_rate": 3.5512380345863773e-06, + "loss": 0.0004, + "num_input_tokens_seen": 16059104, + "step": 8197 + }, + { + "epoch": 1.086547382372432, + "grad_norm": 2.5197365283966064, + "learning_rate": 3.550923068592141e-06, + "loss": 0.0428, + "num_input_tokens_seen": 16060224, + "step": 8198 + }, + { + "epoch": 1.0866799204771371, + "grad_norm": 8.676409721374512, + "learning_rate": 3.5506080823353834e-06, + "loss": 0.1122, + "num_input_tokens_seen": 16061776, + "step": 8199 + }, + { + "epoch": 1.0868124585818424, + "grad_norm": 0.09996600449085236, + "learning_rate": 3.5502930758221756e-06, + "loss": 0.0007, + "num_input_tokens_seen": 16062992, + "step": 8200 + }, + { + "epoch": 1.0869449966865474, + "grad_norm": 0.052888184785842896, + "learning_rate": 3.5499780490585946e-06, + "loss": 0.0004, + "num_input_tokens_seen": 16064352, + "step": 8201 + }, + { + "epoch": 1.0870775347912525, + "grad_norm": 1.610395073890686, + "learning_rate": 3.549663002050711e-06, + "loss": 0.0141, + "num_input_tokens_seen": 16066832, + "step": 8202 + }, + { + "epoch": 1.0872100728959575, + "grad_norm": 4.254324913024902, + "learning_rate": 3.5493479348046013e-06, + "loss": 0.1132, + "num_input_tokens_seen": 16068752, + "step": 8203 + }, + { + "epoch": 1.0873426110006628, + "grad_norm": 8.843944549560547, + "learning_rate": 3.54903284732634e-06, + "loss": 0.2142, + "num_input_tokens_seen": 16070552, + "step": 8204 + }, + { + "epoch": 1.0874751491053678, + "grad_norm": 18.701635360717773, + "learning_rate": 3.5487177396220005e-06, + "loss": 0.4981, + "num_input_tokens_seen": 16073208, + "step": 8205 + }, + { + "epoch": 1.0876076872100728, + "grad_norm": 5.607776165008545, + "learning_rate": 3.5484026116976604e-06, + "loss": 0.0783, + "num_input_tokens_seen": 16074728, + "step": 8206 + }, + { + "epoch": 1.087740225314778, + "grad_norm": 3.237525463104248, + "learning_rate": 3.548087463559395e-06, + "loss": 0.0489, + "num_input_tokens_seen": 16076472, + "step": 8207 + }, + { + "epoch": 1.0878727634194831, + "grad_norm": 9.666227340698242, + "learning_rate": 3.5477722952132792e-06, + "loss": 0.25, + "num_input_tokens_seen": 16078312, + "step": 8208 + }, + { + "epoch": 1.0880053015241882, + "grad_norm": 0.1694585084915161, + "learning_rate": 3.5474571066653914e-06, + "loss": 0.0012, + "num_input_tokens_seen": 16079808, + "step": 8209 + }, + { + "epoch": 1.0881378396288932, + "grad_norm": 0.09490477293729782, + "learning_rate": 3.547141897921808e-06, + "loss": 0.0006, + "num_input_tokens_seen": 16081360, + "step": 8210 + }, + { + "epoch": 1.0882703777335985, + "grad_norm": 6.817934036254883, + "learning_rate": 3.546826668988606e-06, + "loss": 0.0327, + "num_input_tokens_seen": 16083088, + "step": 8211 + }, + { + "epoch": 1.0884029158383035, + "grad_norm": 0.13362659513950348, + "learning_rate": 3.546511419871864e-06, + "loss": 0.0009, + "num_input_tokens_seen": 16085696, + "step": 8212 + }, + { + "epoch": 1.0885354539430085, + "grad_norm": 11.052128791809082, + "learning_rate": 3.5461961505776598e-06, + "loss": 0.3462, + "num_input_tokens_seen": 16087800, + "step": 8213 + }, + { + "epoch": 1.0886679920477138, + "grad_norm": 8.090506553649902, + "learning_rate": 3.5458808611120715e-06, + "loss": 0.1688, + "num_input_tokens_seen": 16089960, + "step": 8214 + }, + { + "epoch": 1.0888005301524188, + "grad_norm": 0.11798691004514694, + "learning_rate": 3.54556555148118e-06, + "loss": 0.0009, + "num_input_tokens_seen": 16092680, + "step": 8215 + }, + { + "epoch": 1.0889330682571239, + "grad_norm": 5.873249530792236, + "learning_rate": 3.5452502216910623e-06, + "loss": 0.0934, + "num_input_tokens_seen": 16094648, + "step": 8216 + }, + { + "epoch": 1.089065606361829, + "grad_norm": 0.47509628534317017, + "learning_rate": 3.5449348717477993e-06, + "loss": 0.0033, + "num_input_tokens_seen": 16096064, + "step": 8217 + }, + { + "epoch": 1.0891981444665342, + "grad_norm": 10.27983570098877, + "learning_rate": 3.5446195016574714e-06, + "loss": 0.2465, + "num_input_tokens_seen": 16099016, + "step": 8218 + }, + { + "epoch": 1.0893306825712392, + "grad_norm": 0.7238097786903381, + "learning_rate": 3.544304111426159e-06, + "loss": 0.005, + "num_input_tokens_seen": 16101480, + "step": 8219 + }, + { + "epoch": 1.0894632206759443, + "grad_norm": 0.3214225471019745, + "learning_rate": 3.5439887010599427e-06, + "loss": 0.0022, + "num_input_tokens_seen": 16103272, + "step": 8220 + }, + { + "epoch": 1.0895957587806495, + "grad_norm": 7.985569477081299, + "learning_rate": 3.543673270564904e-06, + "loss": 0.0627, + "num_input_tokens_seen": 16104576, + "step": 8221 + }, + { + "epoch": 1.0897282968853546, + "grad_norm": 4.76102876663208, + "learning_rate": 3.5433578199471254e-06, + "loss": 0.1557, + "num_input_tokens_seen": 16106936, + "step": 8222 + }, + { + "epoch": 1.0898608349900596, + "grad_norm": 2.316457509994507, + "learning_rate": 3.543042349212687e-06, + "loss": 0.0096, + "num_input_tokens_seen": 16108120, + "step": 8223 + }, + { + "epoch": 1.0899933730947646, + "grad_norm": 6.312108516693115, + "learning_rate": 3.542726858367673e-06, + "loss": 0.0819, + "num_input_tokens_seen": 16109664, + "step": 8224 + }, + { + "epoch": 1.0901259111994699, + "grad_norm": 1.4967623949050903, + "learning_rate": 3.5424113474181665e-06, + "loss": 0.0174, + "num_input_tokens_seen": 16111752, + "step": 8225 + }, + { + "epoch": 1.090258449304175, + "grad_norm": 0.6069185733795166, + "learning_rate": 3.542095816370249e-06, + "loss": 0.0035, + "num_input_tokens_seen": 16113936, + "step": 8226 + }, + { + "epoch": 1.09039098740888, + "grad_norm": 5.663053035736084, + "learning_rate": 3.541780265230006e-06, + "loss": 0.1568, + "num_input_tokens_seen": 16116304, + "step": 8227 + }, + { + "epoch": 1.0905235255135852, + "grad_norm": 1.1117454767227173, + "learning_rate": 3.5414646940035206e-06, + "loss": 0.0093, + "num_input_tokens_seen": 16117656, + "step": 8228 + }, + { + "epoch": 1.0906560636182903, + "grad_norm": 11.32906436920166, + "learning_rate": 3.5411491026968768e-06, + "loss": 0.521, + "num_input_tokens_seen": 16121344, + "step": 8229 + }, + { + "epoch": 1.0907886017229953, + "grad_norm": 1.6885398626327515, + "learning_rate": 3.540833491316161e-06, + "loss": 0.0139, + "num_input_tokens_seen": 16123056, + "step": 8230 + }, + { + "epoch": 1.0909211398277006, + "grad_norm": 0.07304983586072922, + "learning_rate": 3.5405178598674574e-06, + "loss": 0.0005, + "num_input_tokens_seen": 16125568, + "step": 8231 + }, + { + "epoch": 1.0910536779324056, + "grad_norm": 8.944059371948242, + "learning_rate": 3.540202208356851e-06, + "loss": 0.1175, + "num_input_tokens_seen": 16127856, + "step": 8232 + }, + { + "epoch": 1.0911862160371106, + "grad_norm": 0.015177439898252487, + "learning_rate": 3.5398865367904288e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16128880, + "step": 8233 + }, + { + "epoch": 1.0913187541418157, + "grad_norm": 0.01764591410756111, + "learning_rate": 3.539570845174277e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16130376, + "step": 8234 + }, + { + "epoch": 1.091451292246521, + "grad_norm": 1.2147263288497925, + "learning_rate": 3.5392551335144822e-06, + "loss": 0.0076, + "num_input_tokens_seen": 16132784, + "step": 8235 + }, + { + "epoch": 1.091583830351226, + "grad_norm": 6.77931022644043, + "learning_rate": 3.5389394018171308e-06, + "loss": 0.1096, + "num_input_tokens_seen": 16134688, + "step": 8236 + }, + { + "epoch": 1.091716368455931, + "grad_norm": 0.014793340116739273, + "learning_rate": 3.538623650088312e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16135752, + "step": 8237 + }, + { + "epoch": 1.0918489065606363, + "grad_norm": 2.4918525218963623, + "learning_rate": 3.5383078783341118e-06, + "loss": 0.051, + "num_input_tokens_seen": 16138432, + "step": 8238 + }, + { + "epoch": 1.0919814446653413, + "grad_norm": 4.628347396850586, + "learning_rate": 3.53799208656062e-06, + "loss": 0.0759, + "num_input_tokens_seen": 16140408, + "step": 8239 + }, + { + "epoch": 1.0921139827700463, + "grad_norm": 0.2055290788412094, + "learning_rate": 3.5376762747739244e-06, + "loss": 0.0012, + "num_input_tokens_seen": 16141896, + "step": 8240 + }, + { + "epoch": 1.0922465208747516, + "grad_norm": 9.050737380981445, + "learning_rate": 3.5373604429801146e-06, + "loss": 0.1521, + "num_input_tokens_seen": 16144272, + "step": 8241 + }, + { + "epoch": 1.0923790589794566, + "grad_norm": 0.31272488832473755, + "learning_rate": 3.537044591185279e-06, + "loss": 0.001, + "num_input_tokens_seen": 16145920, + "step": 8242 + }, + { + "epoch": 1.0925115970841617, + "grad_norm": 4.944389820098877, + "learning_rate": 3.536728719395509e-06, + "loss": 0.1488, + "num_input_tokens_seen": 16147392, + "step": 8243 + }, + { + "epoch": 1.0926441351888667, + "grad_norm": 0.00924631766974926, + "learning_rate": 3.5364128276168943e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16149376, + "step": 8244 + }, + { + "epoch": 1.092776673293572, + "grad_norm": 7.475938320159912, + "learning_rate": 3.5360969158555247e-06, + "loss": 0.1904, + "num_input_tokens_seen": 16151280, + "step": 8245 + }, + { + "epoch": 1.092909211398277, + "grad_norm": 0.08372288942337036, + "learning_rate": 3.535780984117493e-06, + "loss": 0.0007, + "num_input_tokens_seen": 16153240, + "step": 8246 + }, + { + "epoch": 1.093041749502982, + "grad_norm": 0.40021905303001404, + "learning_rate": 3.5354650324088874e-06, + "loss": 0.0015, + "num_input_tokens_seen": 16155856, + "step": 8247 + }, + { + "epoch": 1.0931742876076873, + "grad_norm": 0.009994975291192532, + "learning_rate": 3.535149060735803e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16158192, + "step": 8248 + }, + { + "epoch": 1.0933068257123923, + "grad_norm": 6.496945381164551, + "learning_rate": 3.5348330691043303e-06, + "loss": 0.1516, + "num_input_tokens_seen": 16159760, + "step": 8249 + }, + { + "epoch": 1.0934393638170974, + "grad_norm": 0.004689021036028862, + "learning_rate": 3.5345170575205616e-06, + "loss": 0.0, + "num_input_tokens_seen": 16160992, + "step": 8250 + }, + { + "epoch": 1.0935719019218024, + "grad_norm": 0.012878828682005405, + "learning_rate": 3.5342010259905913e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16162680, + "step": 8251 + }, + { + "epoch": 1.0937044400265077, + "grad_norm": 0.017912013456225395, + "learning_rate": 3.5338849745205116e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16164376, + "step": 8252 + }, + { + "epoch": 1.0938369781312127, + "grad_norm": 0.024705074727535248, + "learning_rate": 3.5335689031164157e-06, + "loss": 0.0002, + "num_input_tokens_seen": 16165576, + "step": 8253 + }, + { + "epoch": 1.0939695162359178, + "grad_norm": 8.418940544128418, + "learning_rate": 3.533252811784399e-06, + "loss": 0.0689, + "num_input_tokens_seen": 16167008, + "step": 8254 + }, + { + "epoch": 1.094102054340623, + "grad_norm": 2.623546838760376, + "learning_rate": 3.532936700530555e-06, + "loss": 0.0169, + "num_input_tokens_seen": 16168376, + "step": 8255 + }, + { + "epoch": 1.094234592445328, + "grad_norm": 12.64990520477295, + "learning_rate": 3.532620569360979e-06, + "loss": 0.3545, + "num_input_tokens_seen": 16169728, + "step": 8256 + }, + { + "epoch": 1.094367130550033, + "grad_norm": 0.006974699441343546, + "learning_rate": 3.5323044182817657e-06, + "loss": 0.0, + "num_input_tokens_seen": 16171224, + "step": 8257 + }, + { + "epoch": 1.0944996686547381, + "grad_norm": 0.28953295946121216, + "learning_rate": 3.531988247299011e-06, + "loss": 0.0019, + "num_input_tokens_seen": 16173008, + "step": 8258 + }, + { + "epoch": 1.0946322067594434, + "grad_norm": 0.00885310210287571, + "learning_rate": 3.531672056418811e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16175528, + "step": 8259 + }, + { + "epoch": 1.0947647448641484, + "grad_norm": 15.129582405090332, + "learning_rate": 3.5313558456472623e-06, + "loss": 0.0934, + "num_input_tokens_seen": 16177704, + "step": 8260 + }, + { + "epoch": 1.0948972829688535, + "grad_norm": 16.722957611083984, + "learning_rate": 3.531039614990462e-06, + "loss": 0.4819, + "num_input_tokens_seen": 16180488, + "step": 8261 + }, + { + "epoch": 1.0950298210735587, + "grad_norm": 0.0044142683036625385, + "learning_rate": 3.530723364454505e-06, + "loss": 0.0, + "num_input_tokens_seen": 16181728, + "step": 8262 + }, + { + "epoch": 1.0951623591782638, + "grad_norm": 4.102495193481445, + "learning_rate": 3.5304070940454922e-06, + "loss": 0.1115, + "num_input_tokens_seen": 16183368, + "step": 8263 + }, + { + "epoch": 1.0952948972829688, + "grad_norm": 9.280797004699707, + "learning_rate": 3.5300908037695193e-06, + "loss": 0.0819, + "num_input_tokens_seen": 16186592, + "step": 8264 + }, + { + "epoch": 1.0954274353876738, + "grad_norm": 15.228312492370605, + "learning_rate": 3.5297744936326842e-06, + "loss": 0.1209, + "num_input_tokens_seen": 16188424, + "step": 8265 + }, + { + "epoch": 1.095559973492379, + "grad_norm": 0.009741786867380142, + "learning_rate": 3.5294581636410873e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16189904, + "step": 8266 + }, + { + "epoch": 1.0956925115970841, + "grad_norm": 3.496364116668701, + "learning_rate": 3.5291418138008266e-06, + "loss": 0.1001, + "num_input_tokens_seen": 16191440, + "step": 8267 + }, + { + "epoch": 1.0958250497017892, + "grad_norm": 8.591773986816406, + "learning_rate": 3.5288254441180014e-06, + "loss": 0.2504, + "num_input_tokens_seen": 16194280, + "step": 8268 + }, + { + "epoch": 1.0959575878064944, + "grad_norm": 0.195570170879364, + "learning_rate": 3.528509054598713e-06, + "loss": 0.0015, + "num_input_tokens_seen": 16196424, + "step": 8269 + }, + { + "epoch": 1.0960901259111995, + "grad_norm": 0.03169688209891319, + "learning_rate": 3.5281926452490593e-06, + "loss": 0.0002, + "num_input_tokens_seen": 16198816, + "step": 8270 + }, + { + "epoch": 1.0962226640159045, + "grad_norm": 0.022248223423957825, + "learning_rate": 3.5278762160751427e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16200040, + "step": 8271 + }, + { + "epoch": 1.0963552021206098, + "grad_norm": 0.07524771243333817, + "learning_rate": 3.5275597670830636e-06, + "loss": 0.0005, + "num_input_tokens_seen": 16201832, + "step": 8272 + }, + { + "epoch": 1.0964877402253148, + "grad_norm": 28.33182716369629, + "learning_rate": 3.527243298278923e-06, + "loss": 0.0846, + "num_input_tokens_seen": 16204240, + "step": 8273 + }, + { + "epoch": 1.0966202783300198, + "grad_norm": 2.217759132385254, + "learning_rate": 3.526926809668823e-06, + "loss": 0.0236, + "num_input_tokens_seen": 16206720, + "step": 8274 + }, + { + "epoch": 1.0967528164347249, + "grad_norm": 11.358010292053223, + "learning_rate": 3.526610301258866e-06, + "loss": 0.3215, + "num_input_tokens_seen": 16208760, + "step": 8275 + }, + { + "epoch": 1.0968853545394301, + "grad_norm": 0.03283505514264107, + "learning_rate": 3.526293773055154e-06, + "loss": 0.0002, + "num_input_tokens_seen": 16211024, + "step": 8276 + }, + { + "epoch": 1.0970178926441352, + "grad_norm": 0.012253843247890472, + "learning_rate": 3.5259772250637897e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16212528, + "step": 8277 + }, + { + "epoch": 1.0971504307488402, + "grad_norm": 0.20125627517700195, + "learning_rate": 3.525660657290877e-06, + "loss": 0.0009, + "num_input_tokens_seen": 16213928, + "step": 8278 + }, + { + "epoch": 1.0972829688535455, + "grad_norm": 0.13756966590881348, + "learning_rate": 3.5253440697425197e-06, + "loss": 0.0008, + "num_input_tokens_seen": 16216928, + "step": 8279 + }, + { + "epoch": 1.0974155069582505, + "grad_norm": 8.94886302947998, + "learning_rate": 3.5250274624248205e-06, + "loss": 0.2819, + "num_input_tokens_seen": 16219200, + "step": 8280 + }, + { + "epoch": 1.0975480450629556, + "grad_norm": 18.615583419799805, + "learning_rate": 3.524710835343885e-06, + "loss": 0.1888, + "num_input_tokens_seen": 16221584, + "step": 8281 + }, + { + "epoch": 1.0976805831676608, + "grad_norm": 3.2483725547790527, + "learning_rate": 3.5243941885058173e-06, + "loss": 0.052, + "num_input_tokens_seen": 16223296, + "step": 8282 + }, + { + "epoch": 1.0978131212723659, + "grad_norm": 5.578486919403076, + "learning_rate": 3.5240775219167234e-06, + "loss": 0.1013, + "num_input_tokens_seen": 16224968, + "step": 8283 + }, + { + "epoch": 1.097945659377071, + "grad_norm": 0.1545022428035736, + "learning_rate": 3.523760835582709e-06, + "loss": 0.001, + "num_input_tokens_seen": 16226456, + "step": 8284 + }, + { + "epoch": 1.098078197481776, + "grad_norm": 3.727581739425659, + "learning_rate": 3.5234441295098786e-06, + "loss": 0.0175, + "num_input_tokens_seen": 16227736, + "step": 8285 + }, + { + "epoch": 1.0982107355864812, + "grad_norm": 15.797917366027832, + "learning_rate": 3.5231274037043394e-06, + "loss": 0.1476, + "num_input_tokens_seen": 16229800, + "step": 8286 + }, + { + "epoch": 1.0983432736911862, + "grad_norm": 1.437326192855835, + "learning_rate": 3.5228106581721987e-06, + "loss": 0.022, + "num_input_tokens_seen": 16231696, + "step": 8287 + }, + { + "epoch": 1.0984758117958913, + "grad_norm": 0.46936115622520447, + "learning_rate": 3.5224938929195623e-06, + "loss": 0.0041, + "num_input_tokens_seen": 16234256, + "step": 8288 + }, + { + "epoch": 1.0986083499005965, + "grad_norm": 6.105960845947266, + "learning_rate": 3.5221771079525385e-06, + "loss": 0.1333, + "num_input_tokens_seen": 16236560, + "step": 8289 + }, + { + "epoch": 1.0987408880053016, + "grad_norm": 5.267346382141113, + "learning_rate": 3.5218603032772354e-06, + "loss": 0.1177, + "num_input_tokens_seen": 16238912, + "step": 8290 + }, + { + "epoch": 1.0988734261100066, + "grad_norm": 9.997039794921875, + "learning_rate": 3.52154347889976e-06, + "loss": 0.1596, + "num_input_tokens_seen": 16241088, + "step": 8291 + }, + { + "epoch": 1.0990059642147116, + "grad_norm": 0.09027338027954102, + "learning_rate": 3.5212266348262215e-06, + "loss": 0.0005, + "num_input_tokens_seen": 16242912, + "step": 8292 + }, + { + "epoch": 1.099138502319417, + "grad_norm": 11.439615249633789, + "learning_rate": 3.5209097710627287e-06, + "loss": 0.2605, + "num_input_tokens_seen": 16245328, + "step": 8293 + }, + { + "epoch": 1.099271040424122, + "grad_norm": 10.554126739501953, + "learning_rate": 3.520592887615392e-06, + "loss": 0.2899, + "num_input_tokens_seen": 16247576, + "step": 8294 + }, + { + "epoch": 1.099403578528827, + "grad_norm": 18.85726547241211, + "learning_rate": 3.52027598449032e-06, + "loss": 0.3745, + "num_input_tokens_seen": 16249160, + "step": 8295 + }, + { + "epoch": 1.0995361166335322, + "grad_norm": 0.021806567907333374, + "learning_rate": 3.5199590616936236e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16251048, + "step": 8296 + }, + { + "epoch": 1.0996686547382373, + "grad_norm": 1.52481210231781, + "learning_rate": 3.519642119231412e-06, + "loss": 0.014, + "num_input_tokens_seen": 16252856, + "step": 8297 + }, + { + "epoch": 1.0998011928429423, + "grad_norm": 0.029399951919913292, + "learning_rate": 3.519325157109798e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16255128, + "step": 8298 + }, + { + "epoch": 1.0999337309476473, + "grad_norm": 4.998563766479492, + "learning_rate": 3.5190081753348908e-06, + "loss": 0.0931, + "num_input_tokens_seen": 16258136, + "step": 8299 + }, + { + "epoch": 1.1000662690523526, + "grad_norm": 0.5044589042663574, + "learning_rate": 3.5186911739128037e-06, + "loss": 0.0028, + "num_input_tokens_seen": 16260800, + "step": 8300 + }, + { + "epoch": 1.1001988071570576, + "grad_norm": 0.060424428433179855, + "learning_rate": 3.518374152849647e-06, + "loss": 0.0003, + "num_input_tokens_seen": 16263352, + "step": 8301 + }, + { + "epoch": 1.1003313452617627, + "grad_norm": 6.916269302368164, + "learning_rate": 3.5180571121515356e-06, + "loss": 0.1468, + "num_input_tokens_seen": 16265600, + "step": 8302 + }, + { + "epoch": 1.100463883366468, + "grad_norm": 5.286959648132324, + "learning_rate": 3.5177400518245797e-06, + "loss": 0.0845, + "num_input_tokens_seen": 16267112, + "step": 8303 + }, + { + "epoch": 1.100596421471173, + "grad_norm": 19.800045013427734, + "learning_rate": 3.5174229718748935e-06, + "loss": 0.6038, + "num_input_tokens_seen": 16269600, + "step": 8304 + }, + { + "epoch": 1.100728959575878, + "grad_norm": 8.920562744140625, + "learning_rate": 3.5171058723085918e-06, + "loss": 0.1026, + "num_input_tokens_seen": 16272136, + "step": 8305 + }, + { + "epoch": 1.100861497680583, + "grad_norm": 0.0033870558254420757, + "learning_rate": 3.5167887531317857e-06, + "loss": 0.0, + "num_input_tokens_seen": 16273960, + "step": 8306 + }, + { + "epoch": 1.1009940357852883, + "grad_norm": 2.870105504989624, + "learning_rate": 3.516471614350592e-06, + "loss": 0.0398, + "num_input_tokens_seen": 16275360, + "step": 8307 + }, + { + "epoch": 1.1011265738899934, + "grad_norm": 9.500845909118652, + "learning_rate": 3.5161544559711237e-06, + "loss": 0.3368, + "num_input_tokens_seen": 16277048, + "step": 8308 + }, + { + "epoch": 1.1012591119946984, + "grad_norm": 15.229972839355469, + "learning_rate": 3.515837277999496e-06, + "loss": 0.2557, + "num_input_tokens_seen": 16279056, + "step": 8309 + }, + { + "epoch": 1.1013916500994037, + "grad_norm": 8.924276351928711, + "learning_rate": 3.5155200804418256e-06, + "loss": 0.1926, + "num_input_tokens_seen": 16281328, + "step": 8310 + }, + { + "epoch": 1.1015241882041087, + "grad_norm": 0.02541017159819603, + "learning_rate": 3.5152028633042277e-06, + "loss": 0.0002, + "num_input_tokens_seen": 16282888, + "step": 8311 + }, + { + "epoch": 1.1016567263088137, + "grad_norm": 0.27714741230010986, + "learning_rate": 3.5148856265928174e-06, + "loss": 0.0015, + "num_input_tokens_seen": 16285416, + "step": 8312 + }, + { + "epoch": 1.101789264413519, + "grad_norm": 7.060849189758301, + "learning_rate": 3.5145683703137122e-06, + "loss": 0.1501, + "num_input_tokens_seen": 16286672, + "step": 8313 + }, + { + "epoch": 1.101921802518224, + "grad_norm": 5.63136100769043, + "learning_rate": 3.5142510944730296e-06, + "loss": 0.0717, + "num_input_tokens_seen": 16289168, + "step": 8314 + }, + { + "epoch": 1.102054340622929, + "grad_norm": 5.149138450622559, + "learning_rate": 3.513933799076886e-06, + "loss": 0.0589, + "num_input_tokens_seen": 16290784, + "step": 8315 + }, + { + "epoch": 1.102186878727634, + "grad_norm": 1.0878469944000244, + "learning_rate": 3.513616484131399e-06, + "loss": 0.0071, + "num_input_tokens_seen": 16292960, + "step": 8316 + }, + { + "epoch": 1.1023194168323394, + "grad_norm": 9.899919509887695, + "learning_rate": 3.513299149642687e-06, + "loss": 0.1153, + "num_input_tokens_seen": 16295424, + "step": 8317 + }, + { + "epoch": 1.1024519549370444, + "grad_norm": 1.6255052089691162, + "learning_rate": 3.512981795616868e-06, + "loss": 0.0148, + "num_input_tokens_seen": 16297008, + "step": 8318 + }, + { + "epoch": 1.1025844930417494, + "grad_norm": 10.011774063110352, + "learning_rate": 3.5126644220600616e-06, + "loss": 0.0705, + "num_input_tokens_seen": 16298104, + "step": 8319 + }, + { + "epoch": 1.1027170311464547, + "grad_norm": 4.621273040771484, + "learning_rate": 3.5123470289783867e-06, + "loss": 0.0452, + "num_input_tokens_seen": 16299624, + "step": 8320 + }, + { + "epoch": 1.1028495692511597, + "grad_norm": 18.461647033691406, + "learning_rate": 3.512029616377963e-06, + "loss": 0.4036, + "num_input_tokens_seen": 16301496, + "step": 8321 + }, + { + "epoch": 1.1029821073558648, + "grad_norm": 6.921319007873535, + "learning_rate": 3.5117121842649095e-06, + "loss": 0.1659, + "num_input_tokens_seen": 16304088, + "step": 8322 + }, + { + "epoch": 1.10311464546057, + "grad_norm": 0.08320663124322891, + "learning_rate": 3.5113947326453475e-06, + "loss": 0.0005, + "num_input_tokens_seen": 16305952, + "step": 8323 + }, + { + "epoch": 1.103247183565275, + "grad_norm": 14.583900451660156, + "learning_rate": 3.511077261525397e-06, + "loss": 0.2031, + "num_input_tokens_seen": 16307952, + "step": 8324 + }, + { + "epoch": 1.10337972166998, + "grad_norm": 0.207341268658638, + "learning_rate": 3.510759770911179e-06, + "loss": 0.0011, + "num_input_tokens_seen": 16309304, + "step": 8325 + }, + { + "epoch": 1.1035122597746851, + "grad_norm": 0.32227155566215515, + "learning_rate": 3.5104422608088164e-06, + "loss": 0.0018, + "num_input_tokens_seen": 16310800, + "step": 8326 + }, + { + "epoch": 1.1036447978793904, + "grad_norm": 0.743057131767273, + "learning_rate": 3.5101247312244287e-06, + "loss": 0.007, + "num_input_tokens_seen": 16312696, + "step": 8327 + }, + { + "epoch": 1.1037773359840954, + "grad_norm": 0.021749364212155342, + "learning_rate": 3.50980718216414e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16315160, + "step": 8328 + }, + { + "epoch": 1.1039098740888005, + "grad_norm": 2.9811768531799316, + "learning_rate": 3.5094896136340727e-06, + "loss": 0.0657, + "num_input_tokens_seen": 16317296, + "step": 8329 + }, + { + "epoch": 1.1040424121935057, + "grad_norm": 0.0035204675514250994, + "learning_rate": 3.5091720256403483e-06, + "loss": 0.0, + "num_input_tokens_seen": 16318656, + "step": 8330 + }, + { + "epoch": 1.1041749502982108, + "grad_norm": 0.16462518274784088, + "learning_rate": 3.5088544181890917e-06, + "loss": 0.0011, + "num_input_tokens_seen": 16321576, + "step": 8331 + }, + { + "epoch": 1.1043074884029158, + "grad_norm": 5.607799530029297, + "learning_rate": 3.5085367912864256e-06, + "loss": 0.1057, + "num_input_tokens_seen": 16322672, + "step": 8332 + }, + { + "epoch": 1.1044400265076209, + "grad_norm": 4.60710334777832, + "learning_rate": 3.5082191449384745e-06, + "loss": 0.0245, + "num_input_tokens_seen": 16324320, + "step": 8333 + }, + { + "epoch": 1.1045725646123261, + "grad_norm": 16.800397872924805, + "learning_rate": 3.5079014791513634e-06, + "loss": 0.3846, + "num_input_tokens_seen": 16326240, + "step": 8334 + }, + { + "epoch": 1.1047051027170312, + "grad_norm": 2.139453887939453, + "learning_rate": 3.5075837939312154e-06, + "loss": 0.0331, + "num_input_tokens_seen": 16327888, + "step": 8335 + }, + { + "epoch": 1.1048376408217362, + "grad_norm": 5.235513687133789, + "learning_rate": 3.507266089284157e-06, + "loss": 0.0461, + "num_input_tokens_seen": 16329272, + "step": 8336 + }, + { + "epoch": 1.1049701789264414, + "grad_norm": 0.15007513761520386, + "learning_rate": 3.506948365216314e-06, + "loss": 0.001, + "num_input_tokens_seen": 16330672, + "step": 8337 + }, + { + "epoch": 1.1051027170311465, + "grad_norm": 11.270011901855469, + "learning_rate": 3.5066306217338114e-06, + "loss": 0.2029, + "num_input_tokens_seen": 16332976, + "step": 8338 + }, + { + "epoch": 1.1052352551358515, + "grad_norm": 0.0049406662583351135, + "learning_rate": 3.5063128588427762e-06, + "loss": 0.0, + "num_input_tokens_seen": 16334224, + "step": 8339 + }, + { + "epoch": 1.1053677932405566, + "grad_norm": 6.272645473480225, + "learning_rate": 3.5059950765493346e-06, + "loss": 0.1344, + "num_input_tokens_seen": 16336168, + "step": 8340 + }, + { + "epoch": 1.1055003313452618, + "grad_norm": 9.624823570251465, + "learning_rate": 3.5056772748596145e-06, + "loss": 0.1995, + "num_input_tokens_seen": 16338184, + "step": 8341 + }, + { + "epoch": 1.1056328694499669, + "grad_norm": 7.304860591888428, + "learning_rate": 3.505359453779742e-06, + "loss": 0.1469, + "num_input_tokens_seen": 16340192, + "step": 8342 + }, + { + "epoch": 1.105765407554672, + "grad_norm": 6.836062431335449, + "learning_rate": 3.505041613315846e-06, + "loss": 0.2079, + "num_input_tokens_seen": 16342712, + "step": 8343 + }, + { + "epoch": 1.1058979456593772, + "grad_norm": 0.004496525973081589, + "learning_rate": 3.5047237534740553e-06, + "loss": 0.0, + "num_input_tokens_seen": 16344104, + "step": 8344 + }, + { + "epoch": 1.1060304837640822, + "grad_norm": 4.775545597076416, + "learning_rate": 3.504405874260496e-06, + "loss": 0.0775, + "num_input_tokens_seen": 16346784, + "step": 8345 + }, + { + "epoch": 1.1061630218687872, + "grad_norm": 8.949226379394531, + "learning_rate": 3.5040879756812997e-06, + "loss": 0.2499, + "num_input_tokens_seen": 16348592, + "step": 8346 + }, + { + "epoch": 1.1062955599734923, + "grad_norm": 12.20987319946289, + "learning_rate": 3.5037700577425937e-06, + "loss": 0.1819, + "num_input_tokens_seen": 16351496, + "step": 8347 + }, + { + "epoch": 1.1064280980781975, + "grad_norm": 0.34519150853157043, + "learning_rate": 3.5034521204505084e-06, + "loss": 0.0024, + "num_input_tokens_seen": 16353664, + "step": 8348 + }, + { + "epoch": 1.1065606361829026, + "grad_norm": 1.483240008354187, + "learning_rate": 3.503134163811175e-06, + "loss": 0.0094, + "num_input_tokens_seen": 16355272, + "step": 8349 + }, + { + "epoch": 1.1066931742876076, + "grad_norm": 0.5485265851020813, + "learning_rate": 3.502816187830722e-06, + "loss": 0.0034, + "num_input_tokens_seen": 16357672, + "step": 8350 + }, + { + "epoch": 1.1068257123923129, + "grad_norm": 5.789738178253174, + "learning_rate": 3.502498192515281e-06, + "loss": 0.1501, + "num_input_tokens_seen": 16359016, + "step": 8351 + }, + { + "epoch": 1.106958250497018, + "grad_norm": 0.06561057269573212, + "learning_rate": 3.502180177870984e-06, + "loss": 0.0004, + "num_input_tokens_seen": 16361152, + "step": 8352 + }, + { + "epoch": 1.107090788601723, + "grad_norm": 3.2476675510406494, + "learning_rate": 3.501862143903962e-06, + "loss": 0.0371, + "num_input_tokens_seen": 16363120, + "step": 8353 + }, + { + "epoch": 1.1072233267064282, + "grad_norm": 11.596503257751465, + "learning_rate": 3.5015440906203457e-06, + "loss": 0.3629, + "num_input_tokens_seen": 16365480, + "step": 8354 + }, + { + "epoch": 1.1073558648111332, + "grad_norm": 3.6839776039123535, + "learning_rate": 3.5012260180262692e-06, + "loss": 0.0832, + "num_input_tokens_seen": 16367968, + "step": 8355 + }, + { + "epoch": 1.1074884029158383, + "grad_norm": 6.694451332092285, + "learning_rate": 3.500907926127864e-06, + "loss": 0.094, + "num_input_tokens_seen": 16369512, + "step": 8356 + }, + { + "epoch": 1.1076209410205433, + "grad_norm": 12.352378845214844, + "learning_rate": 3.5005898149312633e-06, + "loss": 0.3779, + "num_input_tokens_seen": 16371720, + "step": 8357 + }, + { + "epoch": 1.1077534791252486, + "grad_norm": 7.384657859802246, + "learning_rate": 3.500271684442601e-06, + "loss": 0.2277, + "num_input_tokens_seen": 16373440, + "step": 8358 + }, + { + "epoch": 1.1078860172299536, + "grad_norm": 7.060083389282227, + "learning_rate": 3.499953534668011e-06, + "loss": 0.1622, + "num_input_tokens_seen": 16375272, + "step": 8359 + }, + { + "epoch": 1.1080185553346587, + "grad_norm": 1.209827184677124, + "learning_rate": 3.499635365613626e-06, + "loss": 0.0053, + "num_input_tokens_seen": 16377576, + "step": 8360 + }, + { + "epoch": 1.108151093439364, + "grad_norm": 0.09285591542720795, + "learning_rate": 3.4993171772855823e-06, + "loss": 0.0007, + "num_input_tokens_seen": 16378976, + "step": 8361 + }, + { + "epoch": 1.108283631544069, + "grad_norm": 9.731882095336914, + "learning_rate": 3.498998969690013e-06, + "loss": 0.1288, + "num_input_tokens_seen": 16380992, + "step": 8362 + }, + { + "epoch": 1.108416169648774, + "grad_norm": 6.474432945251465, + "learning_rate": 3.4986807428330555e-06, + "loss": 0.1106, + "num_input_tokens_seen": 16383352, + "step": 8363 + }, + { + "epoch": 1.108548707753479, + "grad_norm": 10.774589538574219, + "learning_rate": 3.4983624967208443e-06, + "loss": 0.3102, + "num_input_tokens_seen": 16385448, + "step": 8364 + }, + { + "epoch": 1.1086812458581843, + "grad_norm": 0.07440017908811569, + "learning_rate": 3.4980442313595153e-06, + "loss": 0.0005, + "num_input_tokens_seen": 16386824, + "step": 8365 + }, + { + "epoch": 1.1088137839628893, + "grad_norm": 9.913625717163086, + "learning_rate": 3.4977259467552043e-06, + "loss": 0.1835, + "num_input_tokens_seen": 16388952, + "step": 8366 + }, + { + "epoch": 1.1089463220675944, + "grad_norm": 2.193392276763916, + "learning_rate": 3.4974076429140495e-06, + "loss": 0.0163, + "num_input_tokens_seen": 16390384, + "step": 8367 + }, + { + "epoch": 1.1090788601722996, + "grad_norm": 19.282543182373047, + "learning_rate": 3.4970893198421873e-06, + "loss": 0.328, + "num_input_tokens_seen": 16392240, + "step": 8368 + }, + { + "epoch": 1.1092113982770047, + "grad_norm": 4.310641765594482, + "learning_rate": 3.496770977545755e-06, + "loss": 0.0382, + "num_input_tokens_seen": 16394080, + "step": 8369 + }, + { + "epoch": 1.1093439363817097, + "grad_norm": 0.09109468758106232, + "learning_rate": 3.496452616030891e-06, + "loss": 0.0007, + "num_input_tokens_seen": 16395736, + "step": 8370 + }, + { + "epoch": 1.109476474486415, + "grad_norm": 0.07450588792562485, + "learning_rate": 3.496134235303732e-06, + "loss": 0.0005, + "num_input_tokens_seen": 16397280, + "step": 8371 + }, + { + "epoch": 1.10960901259112, + "grad_norm": 8.375280380249023, + "learning_rate": 3.4958158353704185e-06, + "loss": 0.3642, + "num_input_tokens_seen": 16399672, + "step": 8372 + }, + { + "epoch": 1.109741550695825, + "grad_norm": 2.1880531311035156, + "learning_rate": 3.4954974162370894e-06, + "loss": 0.0186, + "num_input_tokens_seen": 16401672, + "step": 8373 + }, + { + "epoch": 1.10987408880053, + "grad_norm": 3.5415656566619873, + "learning_rate": 3.4951789779098814e-06, + "loss": 0.1101, + "num_input_tokens_seen": 16403656, + "step": 8374 + }, + { + "epoch": 1.1100066269052353, + "grad_norm": 15.858475685119629, + "learning_rate": 3.494860520394938e-06, + "loss": 0.2527, + "num_input_tokens_seen": 16405872, + "step": 8375 + }, + { + "epoch": 1.1101391650099404, + "grad_norm": 5.520722389221191, + "learning_rate": 3.4945420436983968e-06, + "loss": 0.1629, + "num_input_tokens_seen": 16408704, + "step": 8376 + }, + { + "epoch": 1.1102717031146454, + "grad_norm": 1.4546762704849243, + "learning_rate": 3.4942235478263987e-06, + "loss": 0.0084, + "num_input_tokens_seen": 16410472, + "step": 8377 + }, + { + "epoch": 1.1104042412193507, + "grad_norm": 0.5183747410774231, + "learning_rate": 3.493905032785085e-06, + "loss": 0.0053, + "num_input_tokens_seen": 16411496, + "step": 8378 + }, + { + "epoch": 1.1105367793240557, + "grad_norm": 7.558457374572754, + "learning_rate": 3.493586498580597e-06, + "loss": 0.2219, + "num_input_tokens_seen": 16413272, + "step": 8379 + }, + { + "epoch": 1.1106693174287607, + "grad_norm": 10.105422973632812, + "learning_rate": 3.493267945219075e-06, + "loss": 0.2057, + "num_input_tokens_seen": 16415680, + "step": 8380 + }, + { + "epoch": 1.1108018555334658, + "grad_norm": 0.1435491442680359, + "learning_rate": 3.4929493727066622e-06, + "loss": 0.0011, + "num_input_tokens_seen": 16416840, + "step": 8381 + }, + { + "epoch": 1.110934393638171, + "grad_norm": 5.828749179840088, + "learning_rate": 3.4926307810495e-06, + "loss": 0.1092, + "num_input_tokens_seen": 16419192, + "step": 8382 + }, + { + "epoch": 1.111066931742876, + "grad_norm": 3.2960901260375977, + "learning_rate": 3.492312170253732e-06, + "loss": 0.0371, + "num_input_tokens_seen": 16421608, + "step": 8383 + }, + { + "epoch": 1.1111994698475811, + "grad_norm": 4.1906023025512695, + "learning_rate": 3.4919935403255013e-06, + "loss": 0.0399, + "num_input_tokens_seen": 16423664, + "step": 8384 + }, + { + "epoch": 1.1113320079522864, + "grad_norm": 16.101318359375, + "learning_rate": 3.4916748912709506e-06, + "loss": 0.3938, + "num_input_tokens_seen": 16425440, + "step": 8385 + }, + { + "epoch": 1.1114645460569914, + "grad_norm": 0.15895967185497284, + "learning_rate": 3.4913562230962232e-06, + "loss": 0.0012, + "num_input_tokens_seen": 16427064, + "step": 8386 + }, + { + "epoch": 1.1115970841616964, + "grad_norm": 0.4451965093612671, + "learning_rate": 3.4910375358074642e-06, + "loss": 0.0033, + "num_input_tokens_seen": 16429416, + "step": 8387 + }, + { + "epoch": 1.1117296222664015, + "grad_norm": 0.11195900291204453, + "learning_rate": 3.4907188294108185e-06, + "loss": 0.0008, + "num_input_tokens_seen": 16430648, + "step": 8388 + }, + { + "epoch": 1.1118621603711067, + "grad_norm": 11.73038387298584, + "learning_rate": 3.490400103912429e-06, + "loss": 0.3006, + "num_input_tokens_seen": 16432744, + "step": 8389 + }, + { + "epoch": 1.1119946984758118, + "grad_norm": 0.1679728627204895, + "learning_rate": 3.4900813593184434e-06, + "loss": 0.0013, + "num_input_tokens_seen": 16433880, + "step": 8390 + }, + { + "epoch": 1.1121272365805168, + "grad_norm": 13.385002136230469, + "learning_rate": 3.489762595635006e-06, + "loss": 0.1437, + "num_input_tokens_seen": 16435864, + "step": 8391 + }, + { + "epoch": 1.112259774685222, + "grad_norm": 0.6505146026611328, + "learning_rate": 3.4894438128682622e-06, + "loss": 0.0047, + "num_input_tokens_seen": 16438080, + "step": 8392 + }, + { + "epoch": 1.1123923127899271, + "grad_norm": 7.733471870422363, + "learning_rate": 3.48912501102436e-06, + "loss": 0.1729, + "num_input_tokens_seen": 16440112, + "step": 8393 + }, + { + "epoch": 1.1125248508946322, + "grad_norm": 0.2137671262025833, + "learning_rate": 3.488806190109445e-06, + "loss": 0.0015, + "num_input_tokens_seen": 16442024, + "step": 8394 + }, + { + "epoch": 1.1126573889993372, + "grad_norm": 3.7220957279205322, + "learning_rate": 3.4884873501296644e-06, + "loss": 0.0829, + "num_input_tokens_seen": 16443760, + "step": 8395 + }, + { + "epoch": 1.1127899271040425, + "grad_norm": 2.425835371017456, + "learning_rate": 3.4881684910911662e-06, + "loss": 0.0315, + "num_input_tokens_seen": 16446440, + "step": 8396 + }, + { + "epoch": 1.1129224652087475, + "grad_norm": 0.22856350243091583, + "learning_rate": 3.487849613000097e-06, + "loss": 0.0017, + "num_input_tokens_seen": 16448448, + "step": 8397 + }, + { + "epoch": 1.1130550033134525, + "grad_norm": 8.065740585327148, + "learning_rate": 3.487530715862606e-06, + "loss": 0.1184, + "num_input_tokens_seen": 16450760, + "step": 8398 + }, + { + "epoch": 1.1131875414181578, + "grad_norm": 1.304933786392212, + "learning_rate": 3.487211799684842e-06, + "loss": 0.0088, + "num_input_tokens_seen": 16452184, + "step": 8399 + }, + { + "epoch": 1.1133200795228628, + "grad_norm": 3.2624142169952393, + "learning_rate": 3.4868928644729527e-06, + "loss": 0.0841, + "num_input_tokens_seen": 16455432, + "step": 8400 + }, + { + "epoch": 1.1134526176275679, + "grad_norm": 11.970552444458008, + "learning_rate": 3.486573910233089e-06, + "loss": 0.227, + "num_input_tokens_seen": 16457344, + "step": 8401 + }, + { + "epoch": 1.1135851557322731, + "grad_norm": 0.14939334988594055, + "learning_rate": 3.486254936971399e-06, + "loss": 0.0011, + "num_input_tokens_seen": 16459216, + "step": 8402 + }, + { + "epoch": 1.1137176938369782, + "grad_norm": 5.5158796310424805, + "learning_rate": 3.4859359446940337e-06, + "loss": 0.1117, + "num_input_tokens_seen": 16461344, + "step": 8403 + }, + { + "epoch": 1.1138502319416832, + "grad_norm": 0.07000122219324112, + "learning_rate": 3.4856169334071434e-06, + "loss": 0.0005, + "num_input_tokens_seen": 16462992, + "step": 8404 + }, + { + "epoch": 1.1139827700463882, + "grad_norm": 7.405033111572266, + "learning_rate": 3.4852979031168787e-06, + "loss": 0.0946, + "num_input_tokens_seen": 16464712, + "step": 8405 + }, + { + "epoch": 1.1141153081510935, + "grad_norm": 0.14460404217243195, + "learning_rate": 3.4849788538293905e-06, + "loss": 0.001, + "num_input_tokens_seen": 16466264, + "step": 8406 + }, + { + "epoch": 1.1142478462557985, + "grad_norm": 11.341758728027344, + "learning_rate": 3.48465978555083e-06, + "loss": 0.3165, + "num_input_tokens_seen": 16468648, + "step": 8407 + }, + { + "epoch": 1.1143803843605036, + "grad_norm": 7.661235809326172, + "learning_rate": 3.4843406982873507e-06, + "loss": 0.2533, + "num_input_tokens_seen": 16470936, + "step": 8408 + }, + { + "epoch": 1.1145129224652088, + "grad_norm": 6.7464189529418945, + "learning_rate": 3.4840215920451032e-06, + "loss": 0.041, + "num_input_tokens_seen": 16472816, + "step": 8409 + }, + { + "epoch": 1.1146454605699139, + "grad_norm": 0.5663466453552246, + "learning_rate": 3.48370246683024e-06, + "loss": 0.0045, + "num_input_tokens_seen": 16474992, + "step": 8410 + }, + { + "epoch": 1.114777998674619, + "grad_norm": 10.365991592407227, + "learning_rate": 3.483383322648915e-06, + "loss": 0.3239, + "num_input_tokens_seen": 16476632, + "step": 8411 + }, + { + "epoch": 1.1149105367793242, + "grad_norm": 0.10152385383844376, + "learning_rate": 3.483064159507281e-06, + "loss": 0.0014, + "num_input_tokens_seen": 16477992, + "step": 8412 + }, + { + "epoch": 1.1150430748840292, + "grad_norm": 0.3106798231601715, + "learning_rate": 3.4827449774114917e-06, + "loss": 0.0023, + "num_input_tokens_seen": 16481088, + "step": 8413 + }, + { + "epoch": 1.1151756129887342, + "grad_norm": 0.7371013760566711, + "learning_rate": 3.4824257763677023e-06, + "loss": 0.0041, + "num_input_tokens_seen": 16482568, + "step": 8414 + }, + { + "epoch": 1.1153081510934393, + "grad_norm": 1.9163726568222046, + "learning_rate": 3.4821065563820654e-06, + "loss": 0.0082, + "num_input_tokens_seen": 16484248, + "step": 8415 + }, + { + "epoch": 1.1154406891981445, + "grad_norm": 9.128623962402344, + "learning_rate": 3.4817873174607363e-06, + "loss": 0.2268, + "num_input_tokens_seen": 16486080, + "step": 8416 + }, + { + "epoch": 1.1155732273028496, + "grad_norm": 10.534138679504395, + "learning_rate": 3.4814680596098717e-06, + "loss": 0.3603, + "num_input_tokens_seen": 16487624, + "step": 8417 + }, + { + "epoch": 1.1157057654075546, + "grad_norm": 0.7412254214286804, + "learning_rate": 3.4811487828356244e-06, + "loss": 0.0055, + "num_input_tokens_seen": 16490632, + "step": 8418 + }, + { + "epoch": 1.1158383035122599, + "grad_norm": 0.31978484988212585, + "learning_rate": 3.4808294871441516e-06, + "loss": 0.0016, + "num_input_tokens_seen": 16492840, + "step": 8419 + }, + { + "epoch": 1.115970841616965, + "grad_norm": 10.115509986877441, + "learning_rate": 3.4805101725416102e-06, + "loss": 0.2746, + "num_input_tokens_seen": 16495344, + "step": 8420 + }, + { + "epoch": 1.11610337972167, + "grad_norm": 7.60206413269043, + "learning_rate": 3.480190839034157e-06, + "loss": 0.1715, + "num_input_tokens_seen": 16497464, + "step": 8421 + }, + { + "epoch": 1.116235917826375, + "grad_norm": 6.78001070022583, + "learning_rate": 3.479871486627947e-06, + "loss": 0.1248, + "num_input_tokens_seen": 16499688, + "step": 8422 + }, + { + "epoch": 1.1163684559310803, + "grad_norm": 9.612598419189453, + "learning_rate": 3.4795521153291394e-06, + "loss": 0.1158, + "num_input_tokens_seen": 16501600, + "step": 8423 + }, + { + "epoch": 1.1165009940357853, + "grad_norm": 11.636311531066895, + "learning_rate": 3.479232725143892e-06, + "loss": 0.2754, + "num_input_tokens_seen": 16503992, + "step": 8424 + }, + { + "epoch": 1.1166335321404903, + "grad_norm": 7.0306501388549805, + "learning_rate": 3.4789133160783606e-06, + "loss": 0.0267, + "num_input_tokens_seen": 16506560, + "step": 8425 + }, + { + "epoch": 1.1167660702451956, + "grad_norm": 1.5405538082122803, + "learning_rate": 3.478593888138706e-06, + "loss": 0.0265, + "num_input_tokens_seen": 16508432, + "step": 8426 + }, + { + "epoch": 1.1168986083499006, + "grad_norm": 11.093717575073242, + "learning_rate": 3.4782744413310854e-06, + "loss": 0.141, + "num_input_tokens_seen": 16510736, + "step": 8427 + }, + { + "epoch": 1.1170311464546057, + "grad_norm": 0.015495721250772476, + "learning_rate": 3.477954975661659e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16512256, + "step": 8428 + }, + { + "epoch": 1.1171636845593107, + "grad_norm": 1.2309634685516357, + "learning_rate": 3.4776354911365867e-06, + "loss": 0.0112, + "num_input_tokens_seen": 16513792, + "step": 8429 + }, + { + "epoch": 1.117296222664016, + "grad_norm": 7.763525485992432, + "learning_rate": 3.477315987762027e-06, + "loss": 0.1012, + "num_input_tokens_seen": 16516224, + "step": 8430 + }, + { + "epoch": 1.117428760768721, + "grad_norm": 0.026643717661499977, + "learning_rate": 3.4769964655441413e-06, + "loss": 0.0002, + "num_input_tokens_seen": 16517376, + "step": 8431 + }, + { + "epoch": 1.117561298873426, + "grad_norm": 0.05065273493528366, + "learning_rate": 3.4766769244890898e-06, + "loss": 0.0003, + "num_input_tokens_seen": 16518968, + "step": 8432 + }, + { + "epoch": 1.1176938369781313, + "grad_norm": 19.02833366394043, + "learning_rate": 3.476357364603033e-06, + "loss": 0.4334, + "num_input_tokens_seen": 16521504, + "step": 8433 + }, + { + "epoch": 1.1178263750828363, + "grad_norm": 7.889640808105469, + "learning_rate": 3.476037785892133e-06, + "loss": 0.2245, + "num_input_tokens_seen": 16523256, + "step": 8434 + }, + { + "epoch": 1.1179589131875414, + "grad_norm": 0.04523637145757675, + "learning_rate": 3.4757181883625512e-06, + "loss": 0.0003, + "num_input_tokens_seen": 16524632, + "step": 8435 + }, + { + "epoch": 1.1180914512922464, + "grad_norm": 0.01622576080262661, + "learning_rate": 3.475398572020449e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16526072, + "step": 8436 + }, + { + "epoch": 1.1182239893969517, + "grad_norm": 6.051847457885742, + "learning_rate": 3.4750789368719893e-06, + "loss": 0.1571, + "num_input_tokens_seen": 16528520, + "step": 8437 + }, + { + "epoch": 1.1183565275016567, + "grad_norm": 9.460832595825195, + "learning_rate": 3.4747592829233367e-06, + "loss": 0.2143, + "num_input_tokens_seen": 16530384, + "step": 8438 + }, + { + "epoch": 1.1184890656063617, + "grad_norm": 12.428264617919922, + "learning_rate": 3.4744396101806506e-06, + "loss": 0.2346, + "num_input_tokens_seen": 16532504, + "step": 8439 + }, + { + "epoch": 1.118621603711067, + "grad_norm": 0.06932102143764496, + "learning_rate": 3.4741199186500977e-06, + "loss": 0.0004, + "num_input_tokens_seen": 16534496, + "step": 8440 + }, + { + "epoch": 1.118754141815772, + "grad_norm": 7.941710472106934, + "learning_rate": 3.473800208337841e-06, + "loss": 0.1156, + "num_input_tokens_seen": 16536456, + "step": 8441 + }, + { + "epoch": 1.118886679920477, + "grad_norm": 5.026635646820068, + "learning_rate": 3.473480479250043e-06, + "loss": 0.0836, + "num_input_tokens_seen": 16538056, + "step": 8442 + }, + { + "epoch": 1.1190192180251823, + "grad_norm": 0.15467025339603424, + "learning_rate": 3.4731607313928717e-06, + "loss": 0.0011, + "num_input_tokens_seen": 16540344, + "step": 8443 + }, + { + "epoch": 1.1191517561298874, + "grad_norm": 10.816108703613281, + "learning_rate": 3.4728409647724894e-06, + "loss": 0.1771, + "num_input_tokens_seen": 16542744, + "step": 8444 + }, + { + "epoch": 1.1192842942345924, + "grad_norm": 3.546783208847046, + "learning_rate": 3.472521179395062e-06, + "loss": 0.0435, + "num_input_tokens_seen": 16544448, + "step": 8445 + }, + { + "epoch": 1.1194168323392975, + "grad_norm": 7.411040782928467, + "learning_rate": 3.472201375266755e-06, + "loss": 0.1766, + "num_input_tokens_seen": 16546568, + "step": 8446 + }, + { + "epoch": 1.1195493704440027, + "grad_norm": 9.078460693359375, + "learning_rate": 3.4718815523937354e-06, + "loss": 0.2169, + "num_input_tokens_seen": 16548224, + "step": 8447 + }, + { + "epoch": 1.1196819085487077, + "grad_norm": 0.03959347680211067, + "learning_rate": 3.4715617107821686e-06, + "loss": 0.0003, + "num_input_tokens_seen": 16549328, + "step": 8448 + }, + { + "epoch": 1.1198144466534128, + "grad_norm": 3.9412460327148438, + "learning_rate": 3.471241850438222e-06, + "loss": 0.1316, + "num_input_tokens_seen": 16551848, + "step": 8449 + }, + { + "epoch": 1.119946984758118, + "grad_norm": 0.05313378572463989, + "learning_rate": 3.470921971368063e-06, + "loss": 0.0003, + "num_input_tokens_seen": 16553328, + "step": 8450 + }, + { + "epoch": 1.120079522862823, + "grad_norm": 4.744962215423584, + "learning_rate": 3.470602073577858e-06, + "loss": 0.0196, + "num_input_tokens_seen": 16554856, + "step": 8451 + }, + { + "epoch": 1.1202120609675281, + "grad_norm": 38.21556854248047, + "learning_rate": 3.4702821570737753e-06, + "loss": 0.4533, + "num_input_tokens_seen": 16557752, + "step": 8452 + }, + { + "epoch": 1.1203445990722334, + "grad_norm": 1.7391879558563232, + "learning_rate": 3.469962221861984e-06, + "loss": 0.0314, + "num_input_tokens_seen": 16559616, + "step": 8453 + }, + { + "epoch": 1.1204771371769384, + "grad_norm": 9.767043113708496, + "learning_rate": 3.469642267948651e-06, + "loss": 0.2575, + "num_input_tokens_seen": 16561680, + "step": 8454 + }, + { + "epoch": 1.1206096752816435, + "grad_norm": 13.81735897064209, + "learning_rate": 3.469322295339946e-06, + "loss": 0.5153, + "num_input_tokens_seen": 16564296, + "step": 8455 + }, + { + "epoch": 1.1207422133863485, + "grad_norm": 0.16171440482139587, + "learning_rate": 3.4690023040420396e-06, + "loss": 0.0011, + "num_input_tokens_seen": 16565904, + "step": 8456 + }, + { + "epoch": 1.1208747514910538, + "grad_norm": 4.92385196685791, + "learning_rate": 3.4686822940610993e-06, + "loss": 0.028, + "num_input_tokens_seen": 16567712, + "step": 8457 + }, + { + "epoch": 1.1210072895957588, + "grad_norm": 2.7117927074432373, + "learning_rate": 3.468362265403297e-06, + "loss": 0.0174, + "num_input_tokens_seen": 16568744, + "step": 8458 + }, + { + "epoch": 1.1211398277004638, + "grad_norm": 6.28870153427124, + "learning_rate": 3.468042218074801e-06, + "loss": 0.1242, + "num_input_tokens_seen": 16571400, + "step": 8459 + }, + { + "epoch": 1.121272365805169, + "grad_norm": 7.26967716217041, + "learning_rate": 3.4677221520817837e-06, + "loss": 0.1595, + "num_input_tokens_seen": 16573488, + "step": 8460 + }, + { + "epoch": 1.1214049039098741, + "grad_norm": 3.3980298042297363, + "learning_rate": 3.4674020674304164e-06, + "loss": 0.0373, + "num_input_tokens_seen": 16575200, + "step": 8461 + }, + { + "epoch": 1.1215374420145792, + "grad_norm": 8.304523468017578, + "learning_rate": 3.4670819641268695e-06, + "loss": 0.2353, + "num_input_tokens_seen": 16576808, + "step": 8462 + }, + { + "epoch": 1.1216699801192842, + "grad_norm": 11.468254089355469, + "learning_rate": 3.4667618421773148e-06, + "loss": 0.1792, + "num_input_tokens_seen": 16578800, + "step": 8463 + }, + { + "epoch": 1.1218025182239895, + "grad_norm": 5.864408493041992, + "learning_rate": 3.4664417015879247e-06, + "loss": 0.0596, + "num_input_tokens_seen": 16580552, + "step": 8464 + }, + { + "epoch": 1.1219350563286945, + "grad_norm": 7.9062652587890625, + "learning_rate": 3.466121542364873e-06, + "loss": 0.1275, + "num_input_tokens_seen": 16582272, + "step": 8465 + }, + { + "epoch": 1.1220675944333995, + "grad_norm": 0.14119014143943787, + "learning_rate": 3.465801364514331e-06, + "loss": 0.0015, + "num_input_tokens_seen": 16584472, + "step": 8466 + }, + { + "epoch": 1.1222001325381048, + "grad_norm": 0.13928206264972687, + "learning_rate": 3.4654811680424715e-06, + "loss": 0.001, + "num_input_tokens_seen": 16586424, + "step": 8467 + }, + { + "epoch": 1.1223326706428098, + "grad_norm": 0.2411433607339859, + "learning_rate": 3.46516095295547e-06, + "loss": 0.0018, + "num_input_tokens_seen": 16588120, + "step": 8468 + }, + { + "epoch": 1.1224652087475149, + "grad_norm": 8.199978828430176, + "learning_rate": 3.464840719259499e-06, + "loss": 0.2624, + "num_input_tokens_seen": 16590160, + "step": 8469 + }, + { + "epoch": 1.12259774685222, + "grad_norm": 0.11893632262945175, + "learning_rate": 3.4645204669607335e-06, + "loss": 0.0008, + "num_input_tokens_seen": 16591824, + "step": 8470 + }, + { + "epoch": 1.1227302849569252, + "grad_norm": 1.7515157461166382, + "learning_rate": 3.4642001960653483e-06, + "loss": 0.0191, + "num_input_tokens_seen": 16593296, + "step": 8471 + }, + { + "epoch": 1.1228628230616302, + "grad_norm": 9.317205429077148, + "learning_rate": 3.463879906579517e-06, + "loss": 0.1698, + "num_input_tokens_seen": 16595104, + "step": 8472 + }, + { + "epoch": 1.1229953611663352, + "grad_norm": 9.877138137817383, + "learning_rate": 3.463559598509418e-06, + "loss": 0.1405, + "num_input_tokens_seen": 16597616, + "step": 8473 + }, + { + "epoch": 1.1231278992710405, + "grad_norm": 0.1762053370475769, + "learning_rate": 3.463239271861224e-06, + "loss": 0.0012, + "num_input_tokens_seen": 16599824, + "step": 8474 + }, + { + "epoch": 1.1232604373757455, + "grad_norm": 1.5470515489578247, + "learning_rate": 3.462918926641112e-06, + "loss": 0.0064, + "num_input_tokens_seen": 16601192, + "step": 8475 + }, + { + "epoch": 1.1233929754804506, + "grad_norm": 9.179183959960938, + "learning_rate": 3.46259856285526e-06, + "loss": 0.1462, + "num_input_tokens_seen": 16603152, + "step": 8476 + }, + { + "epoch": 1.1235255135851556, + "grad_norm": 13.946859359741211, + "learning_rate": 3.4622781805098426e-06, + "loss": 0.2277, + "num_input_tokens_seen": 16606088, + "step": 8477 + }, + { + "epoch": 1.1236580516898609, + "grad_norm": 1.3416038751602173, + "learning_rate": 3.461957779611039e-06, + "loss": 0.0091, + "num_input_tokens_seen": 16607552, + "step": 8478 + }, + { + "epoch": 1.123790589794566, + "grad_norm": 0.246473029255867, + "learning_rate": 3.4616373601650253e-06, + "loss": 0.0018, + "num_input_tokens_seen": 16610656, + "step": 8479 + }, + { + "epoch": 1.123923127899271, + "grad_norm": 12.998847961425781, + "learning_rate": 3.46131692217798e-06, + "loss": 0.4176, + "num_input_tokens_seen": 16613584, + "step": 8480 + }, + { + "epoch": 1.1240556660039762, + "grad_norm": 2.1694047451019287, + "learning_rate": 3.4609964656560806e-06, + "loss": 0.0181, + "num_input_tokens_seen": 16615752, + "step": 8481 + }, + { + "epoch": 1.1241882041086813, + "grad_norm": 12.08128833770752, + "learning_rate": 3.460675990605507e-06, + "loss": 0.2262, + "num_input_tokens_seen": 16617664, + "step": 8482 + }, + { + "epoch": 1.1243207422133863, + "grad_norm": 5.935091972351074, + "learning_rate": 3.4603554970324377e-06, + "loss": 0.0608, + "num_input_tokens_seen": 16619856, + "step": 8483 + }, + { + "epoch": 1.1244532803180916, + "grad_norm": 7.788889408111572, + "learning_rate": 3.4600349849430515e-06, + "loss": 0.1488, + "num_input_tokens_seen": 16621576, + "step": 8484 + }, + { + "epoch": 1.1245858184227966, + "grad_norm": 46.85171890258789, + "learning_rate": 3.4597144543435284e-06, + "loss": 0.1006, + "num_input_tokens_seen": 16624168, + "step": 8485 + }, + { + "epoch": 1.1247183565275016, + "grad_norm": 0.018745025619864464, + "learning_rate": 3.4593939052400492e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16625232, + "step": 8486 + }, + { + "epoch": 1.1248508946322067, + "grad_norm": 1.3916137218475342, + "learning_rate": 3.459073337638793e-06, + "loss": 0.0052, + "num_input_tokens_seen": 16626776, + "step": 8487 + }, + { + "epoch": 1.124983432736912, + "grad_norm": 7.29010534286499, + "learning_rate": 3.458752751545941e-06, + "loss": 0.1705, + "num_input_tokens_seen": 16628424, + "step": 8488 + }, + { + "epoch": 1.125115970841617, + "grad_norm": 0.05532471090555191, + "learning_rate": 3.4584321469676756e-06, + "loss": 0.0004, + "num_input_tokens_seen": 16630032, + "step": 8489 + }, + { + "epoch": 1.125248508946322, + "grad_norm": 12.74735164642334, + "learning_rate": 3.458111523910176e-06, + "loss": 0.2124, + "num_input_tokens_seen": 16632496, + "step": 8490 + }, + { + "epoch": 1.1253810470510273, + "grad_norm": 3.17748761177063, + "learning_rate": 3.4577908823796265e-06, + "loss": 0.0679, + "num_input_tokens_seen": 16634200, + "step": 8491 + }, + { + "epoch": 1.1255135851557323, + "grad_norm": 6.777058124542236, + "learning_rate": 3.4574702223822072e-06, + "loss": 0.169, + "num_input_tokens_seen": 16635744, + "step": 8492 + }, + { + "epoch": 1.1256461232604373, + "grad_norm": 0.5229358673095703, + "learning_rate": 3.4571495439241017e-06, + "loss": 0.0036, + "num_input_tokens_seen": 16637936, + "step": 8493 + }, + { + "epoch": 1.1257786613651426, + "grad_norm": 10.011480331420898, + "learning_rate": 3.456828847011493e-06, + "loss": 0.2385, + "num_input_tokens_seen": 16639416, + "step": 8494 + }, + { + "epoch": 1.1259111994698476, + "grad_norm": 8.405430793762207, + "learning_rate": 3.4565081316505635e-06, + "loss": 0.1069, + "num_input_tokens_seen": 16641040, + "step": 8495 + }, + { + "epoch": 1.1260437375745527, + "grad_norm": 0.043477486819028854, + "learning_rate": 3.4561873978474975e-06, + "loss": 0.0003, + "num_input_tokens_seen": 16642240, + "step": 8496 + }, + { + "epoch": 1.1261762756792577, + "grad_norm": 4.222424507141113, + "learning_rate": 3.4558666456084795e-06, + "loss": 0.0398, + "num_input_tokens_seen": 16643896, + "step": 8497 + }, + { + "epoch": 1.126308813783963, + "grad_norm": 8.825814247131348, + "learning_rate": 3.4555458749396924e-06, + "loss": 0.2694, + "num_input_tokens_seen": 16645616, + "step": 8498 + }, + { + "epoch": 1.126441351888668, + "grad_norm": 0.27611249685287476, + "learning_rate": 3.455225085847322e-06, + "loss": 0.0018, + "num_input_tokens_seen": 16647104, + "step": 8499 + }, + { + "epoch": 1.126573889993373, + "grad_norm": 0.23041993379592896, + "learning_rate": 3.454904278337553e-06, + "loss": 0.0009, + "num_input_tokens_seen": 16650024, + "step": 8500 + }, + { + "epoch": 1.1267064280980783, + "grad_norm": 0.1716599315404892, + "learning_rate": 3.45458345241657e-06, + "loss": 0.001, + "num_input_tokens_seen": 16651968, + "step": 8501 + }, + { + "epoch": 1.1268389662027833, + "grad_norm": 0.12479785829782486, + "learning_rate": 3.45426260809056e-06, + "loss": 0.0007, + "num_input_tokens_seen": 16654720, + "step": 8502 + }, + { + "epoch": 1.1269715043074884, + "grad_norm": 3.2692744731903076, + "learning_rate": 3.453941745365709e-06, + "loss": 0.0077, + "num_input_tokens_seen": 16656504, + "step": 8503 + }, + { + "epoch": 1.1271040424121934, + "grad_norm": 0.10245697945356369, + "learning_rate": 3.453620864248203e-06, + "loss": 0.0004, + "num_input_tokens_seen": 16657880, + "step": 8504 + }, + { + "epoch": 1.1272365805168987, + "grad_norm": 9.530024528503418, + "learning_rate": 3.4532999647442285e-06, + "loss": 0.1167, + "num_input_tokens_seen": 16660800, + "step": 8505 + }, + { + "epoch": 1.1273691186216037, + "grad_norm": 9.497403144836426, + "learning_rate": 3.452979046859973e-06, + "loss": 0.2009, + "num_input_tokens_seen": 16663472, + "step": 8506 + }, + { + "epoch": 1.1275016567263088, + "grad_norm": 5.696786403656006, + "learning_rate": 3.4526581106016237e-06, + "loss": 0.0647, + "num_input_tokens_seen": 16665576, + "step": 8507 + }, + { + "epoch": 1.127634194831014, + "grad_norm": 0.03766528517007828, + "learning_rate": 3.452337155975369e-06, + "loss": 0.0003, + "num_input_tokens_seen": 16667160, + "step": 8508 + }, + { + "epoch": 1.127766732935719, + "grad_norm": 1.616792917251587, + "learning_rate": 3.4520161829873975e-06, + "loss": 0.0358, + "num_input_tokens_seen": 16668728, + "step": 8509 + }, + { + "epoch": 1.127899271040424, + "grad_norm": 0.04752480983734131, + "learning_rate": 3.4516951916438974e-06, + "loss": 0.0003, + "num_input_tokens_seen": 16671016, + "step": 8510 + }, + { + "epoch": 1.1280318091451291, + "grad_norm": 10.47054386138916, + "learning_rate": 3.4513741819510566e-06, + "loss": 0.2314, + "num_input_tokens_seen": 16672880, + "step": 8511 + }, + { + "epoch": 1.1281643472498344, + "grad_norm": 6.560187816619873, + "learning_rate": 3.451053153915066e-06, + "loss": 0.0726, + "num_input_tokens_seen": 16675912, + "step": 8512 + }, + { + "epoch": 1.1282968853545394, + "grad_norm": 13.351219177246094, + "learning_rate": 3.450732107542114e-06, + "loss": 0.2376, + "num_input_tokens_seen": 16677672, + "step": 8513 + }, + { + "epoch": 1.1284294234592445, + "grad_norm": 0.15342412889003754, + "learning_rate": 3.450411042838391e-06, + "loss": 0.0009, + "num_input_tokens_seen": 16680328, + "step": 8514 + }, + { + "epoch": 1.1285619615639497, + "grad_norm": 8.456214904785156, + "learning_rate": 3.4500899598100882e-06, + "loss": 0.14, + "num_input_tokens_seen": 16682984, + "step": 8515 + }, + { + "epoch": 1.1286944996686548, + "grad_norm": 0.3954511880874634, + "learning_rate": 3.4497688584633955e-06, + "loss": 0.0018, + "num_input_tokens_seen": 16685408, + "step": 8516 + }, + { + "epoch": 1.1288270377733598, + "grad_norm": 0.25022488832473755, + "learning_rate": 3.4494477388045035e-06, + "loss": 0.002, + "num_input_tokens_seen": 16686792, + "step": 8517 + }, + { + "epoch": 1.1289595758780648, + "grad_norm": 0.18703708052635193, + "learning_rate": 3.449126600839604e-06, + "loss": 0.0009, + "num_input_tokens_seen": 16689128, + "step": 8518 + }, + { + "epoch": 1.12909211398277, + "grad_norm": 3.252548933029175, + "learning_rate": 3.4488054445748886e-06, + "loss": 0.1102, + "num_input_tokens_seen": 16690784, + "step": 8519 + }, + { + "epoch": 1.1292246520874751, + "grad_norm": 8.948058128356934, + "learning_rate": 3.4484842700165506e-06, + "loss": 0.2526, + "num_input_tokens_seen": 16692912, + "step": 8520 + }, + { + "epoch": 1.1293571901921802, + "grad_norm": 3.9407899379730225, + "learning_rate": 3.448163077170781e-06, + "loss": 0.0439, + "num_input_tokens_seen": 16694168, + "step": 8521 + }, + { + "epoch": 1.1294897282968854, + "grad_norm": 32.42314147949219, + "learning_rate": 3.447841866043773e-06, + "loss": 0.3299, + "num_input_tokens_seen": 16696544, + "step": 8522 + }, + { + "epoch": 1.1296222664015905, + "grad_norm": 3.7470414638519287, + "learning_rate": 3.44752063664172e-06, + "loss": 0.0594, + "num_input_tokens_seen": 16698656, + "step": 8523 + }, + { + "epoch": 1.1297548045062955, + "grad_norm": 0.2576407492160797, + "learning_rate": 3.4471993889708155e-06, + "loss": 0.0014, + "num_input_tokens_seen": 16700544, + "step": 8524 + }, + { + "epoch": 1.1298873426110005, + "grad_norm": 0.04765302315354347, + "learning_rate": 3.4468781230372537e-06, + "loss": 0.0003, + "num_input_tokens_seen": 16702096, + "step": 8525 + }, + { + "epoch": 1.1300198807157058, + "grad_norm": 2.5305490493774414, + "learning_rate": 3.4465568388472287e-06, + "loss": 0.0358, + "num_input_tokens_seen": 16704208, + "step": 8526 + }, + { + "epoch": 1.1301524188204108, + "grad_norm": 10.042616844177246, + "learning_rate": 3.4462355364069344e-06, + "loss": 0.248, + "num_input_tokens_seen": 16706976, + "step": 8527 + }, + { + "epoch": 1.1302849569251159, + "grad_norm": 3.648715019226074, + "learning_rate": 3.4459142157225656e-06, + "loss": 0.048, + "num_input_tokens_seen": 16708920, + "step": 8528 + }, + { + "epoch": 1.1304174950298211, + "grad_norm": 2.7969470024108887, + "learning_rate": 3.4455928768003187e-06, + "loss": 0.0433, + "num_input_tokens_seen": 16710816, + "step": 8529 + }, + { + "epoch": 1.1305500331345262, + "grad_norm": 8.435277938842773, + "learning_rate": 3.4452715196463895e-06, + "loss": 0.2165, + "num_input_tokens_seen": 16712688, + "step": 8530 + }, + { + "epoch": 1.1306825712392312, + "grad_norm": 4.815793037414551, + "learning_rate": 3.4449501442669725e-06, + "loss": 0.1311, + "num_input_tokens_seen": 16714792, + "step": 8531 + }, + { + "epoch": 1.1308151093439365, + "grad_norm": 0.04750956594944, + "learning_rate": 3.4446287506682644e-06, + "loss": 0.0003, + "num_input_tokens_seen": 16716024, + "step": 8532 + }, + { + "epoch": 1.1309476474486415, + "grad_norm": 11.87228775024414, + "learning_rate": 3.4443073388564633e-06, + "loss": 0.1807, + "num_input_tokens_seen": 16718840, + "step": 8533 + }, + { + "epoch": 1.1310801855533466, + "grad_norm": 9.133766174316406, + "learning_rate": 3.443985908837765e-06, + "loss": 0.2608, + "num_input_tokens_seen": 16721040, + "step": 8534 + }, + { + "epoch": 1.1312127236580518, + "grad_norm": 3.89784836769104, + "learning_rate": 3.4436644606183667e-06, + "loss": 0.0405, + "num_input_tokens_seen": 16722976, + "step": 8535 + }, + { + "epoch": 1.1313452617627568, + "grad_norm": 0.05301361531019211, + "learning_rate": 3.443342994204467e-06, + "loss": 0.0004, + "num_input_tokens_seen": 16724728, + "step": 8536 + }, + { + "epoch": 1.1314777998674619, + "grad_norm": 4.3112897872924805, + "learning_rate": 3.4430215096022636e-06, + "loss": 0.0473, + "num_input_tokens_seen": 16725984, + "step": 8537 + }, + { + "epoch": 1.131610337972167, + "grad_norm": 12.038641929626465, + "learning_rate": 3.442700006817955e-06, + "loss": 0.093, + "num_input_tokens_seen": 16728224, + "step": 8538 + }, + { + "epoch": 1.1317428760768722, + "grad_norm": 0.046350039541721344, + "learning_rate": 3.4423784858577403e-06, + "loss": 0.0003, + "num_input_tokens_seen": 16729864, + "step": 8539 + }, + { + "epoch": 1.1318754141815772, + "grad_norm": 0.14071060717105865, + "learning_rate": 3.442056946727817e-06, + "loss": 0.0014, + "num_input_tokens_seen": 16731040, + "step": 8540 + }, + { + "epoch": 1.1320079522862823, + "grad_norm": 12.13999080657959, + "learning_rate": 3.441735389434387e-06, + "loss": 0.1461, + "num_input_tokens_seen": 16733288, + "step": 8541 + }, + { + "epoch": 1.1321404903909875, + "grad_norm": 13.159934997558594, + "learning_rate": 3.4414138139836486e-06, + "loss": 0.1902, + "num_input_tokens_seen": 16735384, + "step": 8542 + }, + { + "epoch": 1.1322730284956926, + "grad_norm": 3.971379518508911, + "learning_rate": 3.441092220381802e-06, + "loss": 0.1109, + "num_input_tokens_seen": 16737888, + "step": 8543 + }, + { + "epoch": 1.1324055666003976, + "grad_norm": 6.648914337158203, + "learning_rate": 3.440770608635049e-06, + "loss": 0.1229, + "num_input_tokens_seen": 16739840, + "step": 8544 + }, + { + "epoch": 1.1325381047051026, + "grad_norm": 7.494126796722412, + "learning_rate": 3.440448978749589e-06, + "loss": 0.0438, + "num_input_tokens_seen": 16741576, + "step": 8545 + }, + { + "epoch": 1.132670642809808, + "grad_norm": 9.250385284423828, + "learning_rate": 3.440127330731624e-06, + "loss": 0.068, + "num_input_tokens_seen": 16743176, + "step": 8546 + }, + { + "epoch": 1.132803180914513, + "grad_norm": 0.14744509756565094, + "learning_rate": 3.439805664587356e-06, + "loss": 0.0009, + "num_input_tokens_seen": 16744960, + "step": 8547 + }, + { + "epoch": 1.132935719019218, + "grad_norm": 0.18512561917304993, + "learning_rate": 3.439483980322986e-06, + "loss": 0.0011, + "num_input_tokens_seen": 16747616, + "step": 8548 + }, + { + "epoch": 1.1330682571239232, + "grad_norm": 0.7280220985412598, + "learning_rate": 3.439162277944717e-06, + "loss": 0.0066, + "num_input_tokens_seen": 16749264, + "step": 8549 + }, + { + "epoch": 1.1332007952286283, + "grad_norm": 9.674351692199707, + "learning_rate": 3.4388405574587514e-06, + "loss": 0.0297, + "num_input_tokens_seen": 16751824, + "step": 8550 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 1.0557432174682617, + "learning_rate": 3.4385188188712925e-06, + "loss": 0.0431, + "num_input_tokens_seen": 16752856, + "step": 8551 + }, + { + "epoch": 1.1334658714380383, + "grad_norm": 0.1130351796746254, + "learning_rate": 3.4381970621885424e-06, + "loss": 0.0008, + "num_input_tokens_seen": 16754504, + "step": 8552 + }, + { + "epoch": 1.1335984095427436, + "grad_norm": 27.402082443237305, + "learning_rate": 3.4378752874167063e-06, + "loss": 0.1206, + "num_input_tokens_seen": 16755752, + "step": 8553 + }, + { + "epoch": 1.1337309476474486, + "grad_norm": 0.10342779010534286, + "learning_rate": 3.4375534945619877e-06, + "loss": 0.0006, + "num_input_tokens_seen": 16756888, + "step": 8554 + }, + { + "epoch": 1.1338634857521537, + "grad_norm": 7.233067512512207, + "learning_rate": 3.4372316836305903e-06, + "loss": 0.1272, + "num_input_tokens_seen": 16758592, + "step": 8555 + }, + { + "epoch": 1.133996023856859, + "grad_norm": 0.7266532778739929, + "learning_rate": 3.4369098546287204e-06, + "loss": 0.0036, + "num_input_tokens_seen": 16760264, + "step": 8556 + }, + { + "epoch": 1.134128561961564, + "grad_norm": 7.788826942443848, + "learning_rate": 3.436588007562581e-06, + "loss": 0.2243, + "num_input_tokens_seen": 16762048, + "step": 8557 + }, + { + "epoch": 1.134261100066269, + "grad_norm": 1.5146348476409912, + "learning_rate": 3.4362661424383796e-06, + "loss": 0.0169, + "num_input_tokens_seen": 16764680, + "step": 8558 + }, + { + "epoch": 1.134393638170974, + "grad_norm": 4.772076606750488, + "learning_rate": 3.4359442592623216e-06, + "loss": 0.0222, + "num_input_tokens_seen": 16766512, + "step": 8559 + }, + { + "epoch": 1.1345261762756793, + "grad_norm": 2.7049789428710938, + "learning_rate": 3.4356223580406116e-06, + "loss": 0.0284, + "num_input_tokens_seen": 16768720, + "step": 8560 + }, + { + "epoch": 1.1346587143803843, + "grad_norm": 0.32087281346321106, + "learning_rate": 3.4353004387794576e-06, + "loss": 0.0016, + "num_input_tokens_seen": 16770648, + "step": 8561 + }, + { + "epoch": 1.1347912524850894, + "grad_norm": 0.4021240472793579, + "learning_rate": 3.4349785014850657e-06, + "loss": 0.0038, + "num_input_tokens_seen": 16773168, + "step": 8562 + }, + { + "epoch": 1.1349237905897946, + "grad_norm": 12.813952445983887, + "learning_rate": 3.4346565461636434e-06, + "loss": 0.3589, + "num_input_tokens_seen": 16775128, + "step": 8563 + }, + { + "epoch": 1.1350563286944997, + "grad_norm": 3.23867130279541, + "learning_rate": 3.4343345728213974e-06, + "loss": 0.0269, + "num_input_tokens_seen": 16777664, + "step": 8564 + }, + { + "epoch": 1.1351888667992047, + "grad_norm": 10.377969741821289, + "learning_rate": 3.434012581464537e-06, + "loss": 0.3502, + "num_input_tokens_seen": 16781208, + "step": 8565 + }, + { + "epoch": 1.1353214049039098, + "grad_norm": 7.261528968811035, + "learning_rate": 3.4336905720992697e-06, + "loss": 0.128, + "num_input_tokens_seen": 16783448, + "step": 8566 + }, + { + "epoch": 1.135453943008615, + "grad_norm": 2.609156370162964, + "learning_rate": 3.4333685447318037e-06, + "loss": 0.085, + "num_input_tokens_seen": 16786232, + "step": 8567 + }, + { + "epoch": 1.13558648111332, + "grad_norm": 9.573345184326172, + "learning_rate": 3.4330464993683486e-06, + "loss": 0.2338, + "num_input_tokens_seen": 16788064, + "step": 8568 + }, + { + "epoch": 1.135719019218025, + "grad_norm": 0.07628241926431656, + "learning_rate": 3.4327244360151135e-06, + "loss": 0.0005, + "num_input_tokens_seen": 16789536, + "step": 8569 + }, + { + "epoch": 1.1358515573227304, + "grad_norm": 6.021028518676758, + "learning_rate": 3.4324023546783068e-06, + "loss": 0.1291, + "num_input_tokens_seen": 16791576, + "step": 8570 + }, + { + "epoch": 1.1359840954274354, + "grad_norm": 13.60378646850586, + "learning_rate": 3.432080255364141e-06, + "loss": 0.1604, + "num_input_tokens_seen": 16794792, + "step": 8571 + }, + { + "epoch": 1.1361166335321404, + "grad_norm": 1.469277024269104, + "learning_rate": 3.4317581380788235e-06, + "loss": 0.0116, + "num_input_tokens_seen": 16796424, + "step": 8572 + }, + { + "epoch": 1.1362491716368457, + "grad_norm": 0.05658235028386116, + "learning_rate": 3.4314360028285666e-06, + "loss": 0.0004, + "num_input_tokens_seen": 16798632, + "step": 8573 + }, + { + "epoch": 1.1363817097415507, + "grad_norm": 13.66796875, + "learning_rate": 3.4311138496195815e-06, + "loss": 0.5306, + "num_input_tokens_seen": 16800288, + "step": 8574 + }, + { + "epoch": 1.1365142478462558, + "grad_norm": 10.541521072387695, + "learning_rate": 3.4307916784580787e-06, + "loss": 0.2313, + "num_input_tokens_seen": 16802192, + "step": 8575 + }, + { + "epoch": 1.136646785950961, + "grad_norm": 3.4167089462280273, + "learning_rate": 3.43046948935027e-06, + "loss": 0.0074, + "num_input_tokens_seen": 16804920, + "step": 8576 + }, + { + "epoch": 1.136779324055666, + "grad_norm": 0.1870068907737732, + "learning_rate": 3.4301472823023685e-06, + "loss": 0.0013, + "num_input_tokens_seen": 16806616, + "step": 8577 + }, + { + "epoch": 1.136911862160371, + "grad_norm": 0.06414901465177536, + "learning_rate": 3.429825057320585e-06, + "loss": 0.0005, + "num_input_tokens_seen": 16808272, + "step": 8578 + }, + { + "epoch": 1.1370444002650761, + "grad_norm": 9.337677001953125, + "learning_rate": 3.429502814411133e-06, + "loss": 0.1495, + "num_input_tokens_seen": 16810728, + "step": 8579 + }, + { + "epoch": 1.1371769383697814, + "grad_norm": 28.958005905151367, + "learning_rate": 3.429180553580226e-06, + "loss": 1.0032, + "num_input_tokens_seen": 16812272, + "step": 8580 + }, + { + "epoch": 1.1373094764744864, + "grad_norm": 0.24584613740444183, + "learning_rate": 3.4288582748340767e-06, + "loss": 0.0015, + "num_input_tokens_seen": 16815112, + "step": 8581 + }, + { + "epoch": 1.1374420145791915, + "grad_norm": 0.05927819386124611, + "learning_rate": 3.4285359781788986e-06, + "loss": 0.0004, + "num_input_tokens_seen": 16816600, + "step": 8582 + }, + { + "epoch": 1.1375745526838967, + "grad_norm": 12.961067199707031, + "learning_rate": 3.4282136636209067e-06, + "loss": 0.1441, + "num_input_tokens_seen": 16818760, + "step": 8583 + }, + { + "epoch": 1.1377070907886018, + "grad_norm": 5.697150230407715, + "learning_rate": 3.4278913311663143e-06, + "loss": 0.0394, + "num_input_tokens_seen": 16821048, + "step": 8584 + }, + { + "epoch": 1.1378396288933068, + "grad_norm": 0.09280016273260117, + "learning_rate": 3.427568980821338e-06, + "loss": 0.0005, + "num_input_tokens_seen": 16823120, + "step": 8585 + }, + { + "epoch": 1.1379721669980118, + "grad_norm": 4.647305011749268, + "learning_rate": 3.427246612592191e-06, + "loss": 0.1233, + "num_input_tokens_seen": 16824600, + "step": 8586 + }, + { + "epoch": 1.138104705102717, + "grad_norm": 11.824877738952637, + "learning_rate": 3.42692422648509e-06, + "loss": 0.4268, + "num_input_tokens_seen": 16826944, + "step": 8587 + }, + { + "epoch": 1.1382372432074221, + "grad_norm": 8.499072074890137, + "learning_rate": 3.426601822506251e-06, + "loss": 0.0912, + "num_input_tokens_seen": 16828376, + "step": 8588 + }, + { + "epoch": 1.1383697813121272, + "grad_norm": 9.63304328918457, + "learning_rate": 3.426279400661889e-06, + "loss": 0.1826, + "num_input_tokens_seen": 16830352, + "step": 8589 + }, + { + "epoch": 1.1385023194168324, + "grad_norm": 0.0502646341919899, + "learning_rate": 3.425956960958221e-06, + "loss": 0.0004, + "num_input_tokens_seen": 16831528, + "step": 8590 + }, + { + "epoch": 1.1386348575215375, + "grad_norm": 1.0173084735870361, + "learning_rate": 3.4256345034014637e-06, + "loss": 0.0115, + "num_input_tokens_seen": 16833528, + "step": 8591 + }, + { + "epoch": 1.1387673956262425, + "grad_norm": 7.693487644195557, + "learning_rate": 3.425312027997836e-06, + "loss": 0.0833, + "num_input_tokens_seen": 16835576, + "step": 8592 + }, + { + "epoch": 1.1388999337309476, + "grad_norm": 9.145318984985352, + "learning_rate": 3.424989534753553e-06, + "loss": 0.2436, + "num_input_tokens_seen": 16837720, + "step": 8593 + }, + { + "epoch": 1.1390324718356528, + "grad_norm": 4.3246026039123535, + "learning_rate": 3.4246670236748336e-06, + "loss": 0.0666, + "num_input_tokens_seen": 16839288, + "step": 8594 + }, + { + "epoch": 1.1391650099403579, + "grad_norm": 0.0994713082909584, + "learning_rate": 3.4243444947678966e-06, + "loss": 0.0007, + "num_input_tokens_seen": 16841224, + "step": 8595 + }, + { + "epoch": 1.139297548045063, + "grad_norm": 9.752495765686035, + "learning_rate": 3.4240219480389603e-06, + "loss": 0.0352, + "num_input_tokens_seen": 16842736, + "step": 8596 + }, + { + "epoch": 1.1394300861497682, + "grad_norm": 0.5212600827217102, + "learning_rate": 3.4236993834942424e-06, + "loss": 0.0046, + "num_input_tokens_seen": 16844640, + "step": 8597 + }, + { + "epoch": 1.1395626242544732, + "grad_norm": 8.79934310913086, + "learning_rate": 3.423376801139964e-06, + "loss": 0.179, + "num_input_tokens_seen": 16846960, + "step": 8598 + }, + { + "epoch": 1.1396951623591782, + "grad_norm": 12.062625885009766, + "learning_rate": 3.4230542009823433e-06, + "loss": 0.4201, + "num_input_tokens_seen": 16849056, + "step": 8599 + }, + { + "epoch": 1.1398277004638833, + "grad_norm": 8.590723991394043, + "learning_rate": 3.4227315830276007e-06, + "loss": 0.1873, + "num_input_tokens_seen": 16851072, + "step": 8600 + }, + { + "epoch": 1.1399602385685885, + "grad_norm": 11.714760780334473, + "learning_rate": 3.4224089472819578e-06, + "loss": 0.3325, + "num_input_tokens_seen": 16852968, + "step": 8601 + }, + { + "epoch": 1.1400927766732936, + "grad_norm": 7.744892120361328, + "learning_rate": 3.4220862937516324e-06, + "loss": 0.0432, + "num_input_tokens_seen": 16854504, + "step": 8602 + }, + { + "epoch": 1.1402253147779986, + "grad_norm": 0.4743809103965759, + "learning_rate": 3.4217636224428487e-06, + "loss": 0.0047, + "num_input_tokens_seen": 16855760, + "step": 8603 + }, + { + "epoch": 1.1403578528827039, + "grad_norm": 0.1568261682987213, + "learning_rate": 3.4214409333618255e-06, + "loss": 0.001, + "num_input_tokens_seen": 16857648, + "step": 8604 + }, + { + "epoch": 1.140490390987409, + "grad_norm": 0.8237578272819519, + "learning_rate": 3.421118226514786e-06, + "loss": 0.007, + "num_input_tokens_seen": 16859656, + "step": 8605 + }, + { + "epoch": 1.140622929092114, + "grad_norm": 0.2042047083377838, + "learning_rate": 3.4207955019079516e-06, + "loss": 0.0015, + "num_input_tokens_seen": 16861256, + "step": 8606 + }, + { + "epoch": 1.140755467196819, + "grad_norm": 5.378801345825195, + "learning_rate": 3.4204727595475438e-06, + "loss": 0.0157, + "num_input_tokens_seen": 16862968, + "step": 8607 + }, + { + "epoch": 1.1408880053015242, + "grad_norm": 2.959404945373535, + "learning_rate": 3.420149999439787e-06, + "loss": 0.0851, + "num_input_tokens_seen": 16864488, + "step": 8608 + }, + { + "epoch": 1.1410205434062293, + "grad_norm": 9.210344314575195, + "learning_rate": 3.4198272215909035e-06, + "loss": 0.1908, + "num_input_tokens_seen": 16866600, + "step": 8609 + }, + { + "epoch": 1.1411530815109343, + "grad_norm": 4.894270420074463, + "learning_rate": 3.4195044260071166e-06, + "loss": 0.0978, + "num_input_tokens_seen": 16868360, + "step": 8610 + }, + { + "epoch": 1.1412856196156396, + "grad_norm": 12.243182182312012, + "learning_rate": 3.419181612694649e-06, + "loss": 0.346, + "num_input_tokens_seen": 16870248, + "step": 8611 + }, + { + "epoch": 1.1414181577203446, + "grad_norm": 5.7628865242004395, + "learning_rate": 3.4188587816597267e-06, + "loss": 0.139, + "num_input_tokens_seen": 16872056, + "step": 8612 + }, + { + "epoch": 1.1415506958250496, + "grad_norm": 8.342791557312012, + "learning_rate": 3.4185359329085736e-06, + "loss": 0.1827, + "num_input_tokens_seen": 16874344, + "step": 8613 + }, + { + "epoch": 1.1416832339297547, + "grad_norm": 6.55029296875, + "learning_rate": 3.4182130664474133e-06, + "loss": 0.1378, + "num_input_tokens_seen": 16876920, + "step": 8614 + }, + { + "epoch": 1.14181577203446, + "grad_norm": 1.6190155744552612, + "learning_rate": 3.4178901822824716e-06, + "loss": 0.0117, + "num_input_tokens_seen": 16879016, + "step": 8615 + }, + { + "epoch": 1.141948310139165, + "grad_norm": 1.389581561088562, + "learning_rate": 3.417567280419975e-06, + "loss": 0.0095, + "num_input_tokens_seen": 16880992, + "step": 8616 + }, + { + "epoch": 1.14208084824387, + "grad_norm": 0.3550684154033661, + "learning_rate": 3.4172443608661467e-06, + "loss": 0.0026, + "num_input_tokens_seen": 16883216, + "step": 8617 + }, + { + "epoch": 1.1422133863485753, + "grad_norm": 2.869988203048706, + "learning_rate": 3.4169214236272158e-06, + "loss": 0.032, + "num_input_tokens_seen": 16884464, + "step": 8618 + }, + { + "epoch": 1.1423459244532803, + "grad_norm": 0.21294815838336945, + "learning_rate": 3.4165984687094066e-06, + "loss": 0.0015, + "num_input_tokens_seen": 16886104, + "step": 8619 + }, + { + "epoch": 1.1424784625579854, + "grad_norm": 0.3610846996307373, + "learning_rate": 3.4162754961189457e-06, + "loss": 0.003, + "num_input_tokens_seen": 16888032, + "step": 8620 + }, + { + "epoch": 1.1426110006626906, + "grad_norm": 0.8468477129936218, + "learning_rate": 3.415952505862063e-06, + "loss": 0.0054, + "num_input_tokens_seen": 16890352, + "step": 8621 + }, + { + "epoch": 1.1427435387673957, + "grad_norm": 7.751956939697266, + "learning_rate": 3.4156294979449835e-06, + "loss": 0.0654, + "num_input_tokens_seen": 16892352, + "step": 8622 + }, + { + "epoch": 1.1428760768721007, + "grad_norm": 0.08117523789405823, + "learning_rate": 3.415306472373935e-06, + "loss": 0.0006, + "num_input_tokens_seen": 16893360, + "step": 8623 + }, + { + "epoch": 1.143008614976806, + "grad_norm": 1.9368408918380737, + "learning_rate": 3.4149834291551466e-06, + "loss": 0.0329, + "num_input_tokens_seen": 16894880, + "step": 8624 + }, + { + "epoch": 1.143141153081511, + "grad_norm": 5.319334030151367, + "learning_rate": 3.4146603682948466e-06, + "loss": 0.0462, + "num_input_tokens_seen": 16896920, + "step": 8625 + }, + { + "epoch": 1.143273691186216, + "grad_norm": 3.5588929653167725, + "learning_rate": 3.4143372897992634e-06, + "loss": 0.0222, + "num_input_tokens_seen": 16898832, + "step": 8626 + }, + { + "epoch": 1.143406229290921, + "grad_norm": 2.9614055156707764, + "learning_rate": 3.414014193674627e-06, + "loss": 0.0553, + "num_input_tokens_seen": 16900728, + "step": 8627 + }, + { + "epoch": 1.1435387673956263, + "grad_norm": 7.509366035461426, + "learning_rate": 3.4136910799271656e-06, + "loss": 0.1376, + "num_input_tokens_seen": 16902536, + "step": 8628 + }, + { + "epoch": 1.1436713055003314, + "grad_norm": 0.03411921486258507, + "learning_rate": 3.41336794856311e-06, + "loss": 0.0002, + "num_input_tokens_seen": 16903816, + "step": 8629 + }, + { + "epoch": 1.1438038436050364, + "grad_norm": 4.779444217681885, + "learning_rate": 3.4130447995886906e-06, + "loss": 0.1106, + "num_input_tokens_seen": 16907368, + "step": 8630 + }, + { + "epoch": 1.1439363817097417, + "grad_norm": 0.25097790360450745, + "learning_rate": 3.4127216330101377e-06, + "loss": 0.0015, + "num_input_tokens_seen": 16909584, + "step": 8631 + }, + { + "epoch": 1.1440689198144467, + "grad_norm": 3.9529929161071777, + "learning_rate": 3.412398448833682e-06, + "loss": 0.0902, + "num_input_tokens_seen": 16911576, + "step": 8632 + }, + { + "epoch": 1.1442014579191517, + "grad_norm": 9.972177505493164, + "learning_rate": 3.4120752470655544e-06, + "loss": 0.3206, + "num_input_tokens_seen": 16913736, + "step": 8633 + }, + { + "epoch": 1.1443339960238568, + "grad_norm": 2.711470365524292, + "learning_rate": 3.4117520277119873e-06, + "loss": 0.0659, + "num_input_tokens_seen": 16916368, + "step": 8634 + }, + { + "epoch": 1.144466534128562, + "grad_norm": 2.3460464477539062, + "learning_rate": 3.4114287907792115e-06, + "loss": 0.0141, + "num_input_tokens_seen": 16917920, + "step": 8635 + }, + { + "epoch": 1.144599072233267, + "grad_norm": 0.07205978035926819, + "learning_rate": 3.4111055362734608e-06, + "loss": 0.0005, + "num_input_tokens_seen": 16919792, + "step": 8636 + }, + { + "epoch": 1.144731610337972, + "grad_norm": 9.346211433410645, + "learning_rate": 3.410782264200966e-06, + "loss": 0.1025, + "num_input_tokens_seen": 16922320, + "step": 8637 + }, + { + "epoch": 1.1448641484426774, + "grad_norm": 24.722517013549805, + "learning_rate": 3.4104589745679604e-06, + "loss": 0.2614, + "num_input_tokens_seen": 16924920, + "step": 8638 + }, + { + "epoch": 1.1449966865473824, + "grad_norm": 1.5098464488983154, + "learning_rate": 3.4101356673806795e-06, + "loss": 0.0266, + "num_input_tokens_seen": 16926552, + "step": 8639 + }, + { + "epoch": 1.1451292246520874, + "grad_norm": 6.394235134124756, + "learning_rate": 3.409812342645353e-06, + "loss": 0.0603, + "num_input_tokens_seen": 16928760, + "step": 8640 + }, + { + "epoch": 1.1452617627567925, + "grad_norm": 7.629454612731934, + "learning_rate": 3.4094890003682184e-06, + "loss": 0.1967, + "num_input_tokens_seen": 16930536, + "step": 8641 + }, + { + "epoch": 1.1453943008614977, + "grad_norm": 12.822225570678711, + "learning_rate": 3.4091656405555085e-06, + "loss": 0.125, + "num_input_tokens_seen": 16931840, + "step": 8642 + }, + { + "epoch": 1.1455268389662028, + "grad_norm": 7.744934558868408, + "learning_rate": 3.4088422632134565e-06, + "loss": 0.0942, + "num_input_tokens_seen": 16933560, + "step": 8643 + }, + { + "epoch": 1.1456593770709078, + "grad_norm": 2.90161395072937, + "learning_rate": 3.4085188683482994e-06, + "loss": 0.0434, + "num_input_tokens_seen": 16936544, + "step": 8644 + }, + { + "epoch": 1.145791915175613, + "grad_norm": 0.41321036219596863, + "learning_rate": 3.4081954559662723e-06, + "loss": 0.0033, + "num_input_tokens_seen": 16938360, + "step": 8645 + }, + { + "epoch": 1.1459244532803181, + "grad_norm": 4.784876823425293, + "learning_rate": 3.4078720260736097e-06, + "loss": 0.0545, + "num_input_tokens_seen": 16939984, + "step": 8646 + }, + { + "epoch": 1.1460569913850231, + "grad_norm": 0.05184270441532135, + "learning_rate": 3.4075485786765483e-06, + "loss": 0.0004, + "num_input_tokens_seen": 16941576, + "step": 8647 + }, + { + "epoch": 1.1461895294897282, + "grad_norm": 0.04907209426164627, + "learning_rate": 3.407225113781324e-06, + "loss": 0.0003, + "num_input_tokens_seen": 16942856, + "step": 8648 + }, + { + "epoch": 1.1463220675944334, + "grad_norm": 10.976516723632812, + "learning_rate": 3.406901631394174e-06, + "loss": 0.1819, + "num_input_tokens_seen": 16944888, + "step": 8649 + }, + { + "epoch": 1.1464546056991385, + "grad_norm": 0.14874796569347382, + "learning_rate": 3.406578131521334e-06, + "loss": 0.0008, + "num_input_tokens_seen": 16946184, + "step": 8650 + }, + { + "epoch": 1.1465871438038435, + "grad_norm": 3.850764751434326, + "learning_rate": 3.4062546141690434e-06, + "loss": 0.0665, + "num_input_tokens_seen": 16948024, + "step": 8651 + }, + { + "epoch": 1.1467196819085488, + "grad_norm": 9.90334415435791, + "learning_rate": 3.4059310793435376e-06, + "loss": 0.0879, + "num_input_tokens_seen": 16950304, + "step": 8652 + }, + { + "epoch": 1.1468522200132538, + "grad_norm": 9.862438201904297, + "learning_rate": 3.4056075270510565e-06, + "loss": 0.2046, + "num_input_tokens_seen": 16951704, + "step": 8653 + }, + { + "epoch": 1.1469847581179589, + "grad_norm": 1.0565545558929443, + "learning_rate": 3.405283957297837e-06, + "loss": 0.004, + "num_input_tokens_seen": 16953216, + "step": 8654 + }, + { + "epoch": 1.147117296222664, + "grad_norm": 6.22489070892334, + "learning_rate": 3.4049603700901184e-06, + "loss": 0.1233, + "num_input_tokens_seen": 16954928, + "step": 8655 + }, + { + "epoch": 1.1472498343273692, + "grad_norm": 5.998757362365723, + "learning_rate": 3.4046367654341394e-06, + "loss": 0.1288, + "num_input_tokens_seen": 16956992, + "step": 8656 + }, + { + "epoch": 1.1473823724320742, + "grad_norm": 4.141188621520996, + "learning_rate": 3.40431314333614e-06, + "loss": 0.0637, + "num_input_tokens_seen": 16959792, + "step": 8657 + }, + { + "epoch": 1.1475149105367792, + "grad_norm": 2.5851380825042725, + "learning_rate": 3.4039895038023592e-06, + "loss": 0.0225, + "num_input_tokens_seen": 16961448, + "step": 8658 + }, + { + "epoch": 1.1476474486414845, + "grad_norm": 0.15643319487571716, + "learning_rate": 3.4036658468390366e-06, + "loss": 0.001, + "num_input_tokens_seen": 16963488, + "step": 8659 + }, + { + "epoch": 1.1477799867461895, + "grad_norm": 0.035852137953042984, + "learning_rate": 3.403342172452414e-06, + "loss": 0.0004, + "num_input_tokens_seen": 16964760, + "step": 8660 + }, + { + "epoch": 1.1479125248508946, + "grad_norm": 0.08998502045869827, + "learning_rate": 3.4030184806487302e-06, + "loss": 0.0006, + "num_input_tokens_seen": 16966968, + "step": 8661 + }, + { + "epoch": 1.1480450629555998, + "grad_norm": 0.19321848452091217, + "learning_rate": 3.4026947714342275e-06, + "loss": 0.001, + "num_input_tokens_seen": 16968520, + "step": 8662 + }, + { + "epoch": 1.1481776010603049, + "grad_norm": 14.807652473449707, + "learning_rate": 3.402371044815147e-06, + "loss": 0.5693, + "num_input_tokens_seen": 16970704, + "step": 8663 + }, + { + "epoch": 1.14831013916501, + "grad_norm": 2.8694050312042236, + "learning_rate": 3.4020473007977295e-06, + "loss": 0.0128, + "num_input_tokens_seen": 16972008, + "step": 8664 + }, + { + "epoch": 1.1484426772697152, + "grad_norm": 0.16470128297805786, + "learning_rate": 3.401723539388219e-06, + "loss": 0.0009, + "num_input_tokens_seen": 16973256, + "step": 8665 + }, + { + "epoch": 1.1485752153744202, + "grad_norm": 10.99104118347168, + "learning_rate": 3.4013997605928554e-06, + "loss": 0.2464, + "num_input_tokens_seen": 16976024, + "step": 8666 + }, + { + "epoch": 1.1487077534791252, + "grad_norm": 1.039171576499939, + "learning_rate": 3.4010759644178837e-06, + "loss": 0.0036, + "num_input_tokens_seen": 16977536, + "step": 8667 + }, + { + "epoch": 1.1488402915838303, + "grad_norm": 11.11103630065918, + "learning_rate": 3.400752150869545e-06, + "loss": 0.209, + "num_input_tokens_seen": 16979848, + "step": 8668 + }, + { + "epoch": 1.1489728296885355, + "grad_norm": 15.490960121154785, + "learning_rate": 3.400428319954084e-06, + "loss": 0.3856, + "num_input_tokens_seen": 16982056, + "step": 8669 + }, + { + "epoch": 1.1491053677932406, + "grad_norm": 6.847944736480713, + "learning_rate": 3.4001044716777433e-06, + "loss": 0.081, + "num_input_tokens_seen": 16984128, + "step": 8670 + }, + { + "epoch": 1.1492379058979456, + "grad_norm": 4.964559078216553, + "learning_rate": 3.3997806060467674e-06, + "loss": 0.0964, + "num_input_tokens_seen": 16986376, + "step": 8671 + }, + { + "epoch": 1.1493704440026509, + "grad_norm": 7.073760032653809, + "learning_rate": 3.3994567230674014e-06, + "loss": 0.2158, + "num_input_tokens_seen": 16988168, + "step": 8672 + }, + { + "epoch": 1.149502982107356, + "grad_norm": 7.601524829864502, + "learning_rate": 3.3991328227458888e-06, + "loss": 0.2519, + "num_input_tokens_seen": 16990656, + "step": 8673 + }, + { + "epoch": 1.149635520212061, + "grad_norm": 10.729582786560059, + "learning_rate": 3.3988089050884743e-06, + "loss": 0.2365, + "num_input_tokens_seen": 16992392, + "step": 8674 + }, + { + "epoch": 1.149768058316766, + "grad_norm": 0.0676494613289833, + "learning_rate": 3.3984849701014056e-06, + "loss": 0.0004, + "num_input_tokens_seen": 16993816, + "step": 8675 + }, + { + "epoch": 1.1499005964214712, + "grad_norm": 3.9291961193084717, + "learning_rate": 3.3981610177909258e-06, + "loss": 0.1051, + "num_input_tokens_seen": 16995152, + "step": 8676 + }, + { + "epoch": 1.1500331345261763, + "grad_norm": 13.542081832885742, + "learning_rate": 3.397837048163282e-06, + "loss": 0.2971, + "num_input_tokens_seen": 16996304, + "step": 8677 + }, + { + "epoch": 1.1501656726308813, + "grad_norm": 14.670005798339844, + "learning_rate": 3.3975130612247207e-06, + "loss": 0.2928, + "num_input_tokens_seen": 16998200, + "step": 8678 + }, + { + "epoch": 1.1502982107355866, + "grad_norm": 15.614364624023438, + "learning_rate": 3.397189056981488e-06, + "loss": 0.3072, + "num_input_tokens_seen": 17001128, + "step": 8679 + }, + { + "epoch": 1.1504307488402916, + "grad_norm": 9.3743257522583, + "learning_rate": 3.3968650354398324e-06, + "loss": 0.291, + "num_input_tokens_seen": 17004160, + "step": 8680 + }, + { + "epoch": 1.1505632869449967, + "grad_norm": 7.353440761566162, + "learning_rate": 3.3965409966060005e-06, + "loss": 0.0589, + "num_input_tokens_seen": 17006208, + "step": 8681 + }, + { + "epoch": 1.1506958250497017, + "grad_norm": 3.036517381668091, + "learning_rate": 3.396216940486238e-06, + "loss": 0.074, + "num_input_tokens_seen": 17007928, + "step": 8682 + }, + { + "epoch": 1.150828363154407, + "grad_norm": 4.513935089111328, + "learning_rate": 3.395892867086795e-06, + "loss": 0.0803, + "num_input_tokens_seen": 17010984, + "step": 8683 + }, + { + "epoch": 1.150960901259112, + "grad_norm": 0.15509624779224396, + "learning_rate": 3.3955687764139212e-06, + "loss": 0.001, + "num_input_tokens_seen": 17012576, + "step": 8684 + }, + { + "epoch": 1.151093439363817, + "grad_norm": 9.42901611328125, + "learning_rate": 3.3952446684738615e-06, + "loss": 0.2047, + "num_input_tokens_seen": 17014296, + "step": 8685 + }, + { + "epoch": 1.1512259774685223, + "grad_norm": 8.755342483520508, + "learning_rate": 3.3949205432728682e-06, + "loss": 0.1312, + "num_input_tokens_seen": 17016056, + "step": 8686 + }, + { + "epoch": 1.1513585155732273, + "grad_norm": 7.112166881561279, + "learning_rate": 3.3945964008171883e-06, + "loss": 0.2424, + "num_input_tokens_seen": 17018624, + "step": 8687 + }, + { + "epoch": 1.1514910536779324, + "grad_norm": 2.068117141723633, + "learning_rate": 3.3942722411130737e-06, + "loss": 0.0248, + "num_input_tokens_seen": 17020176, + "step": 8688 + }, + { + "epoch": 1.1516235917826374, + "grad_norm": 13.561476707458496, + "learning_rate": 3.3939480641667733e-06, + "loss": 0.1434, + "num_input_tokens_seen": 17021880, + "step": 8689 + }, + { + "epoch": 1.1517561298873427, + "grad_norm": 0.6590791940689087, + "learning_rate": 3.393623869984537e-06, + "loss": 0.0043, + "num_input_tokens_seen": 17023504, + "step": 8690 + }, + { + "epoch": 1.1518886679920477, + "grad_norm": 1.3271563053131104, + "learning_rate": 3.3932996585726163e-06, + "loss": 0.0085, + "num_input_tokens_seen": 17026128, + "step": 8691 + }, + { + "epoch": 1.1520212060967527, + "grad_norm": 3.6453139781951904, + "learning_rate": 3.392975429937262e-06, + "loss": 0.0445, + "num_input_tokens_seen": 17027896, + "step": 8692 + }, + { + "epoch": 1.152153744201458, + "grad_norm": 2.0067014694213867, + "learning_rate": 3.3926511840847254e-06, + "loss": 0.024, + "num_input_tokens_seen": 17029832, + "step": 8693 + }, + { + "epoch": 1.152286282306163, + "grad_norm": 5.881217002868652, + "learning_rate": 3.392326921021258e-06, + "loss": 0.2421, + "num_input_tokens_seen": 17031512, + "step": 8694 + }, + { + "epoch": 1.152418820410868, + "grad_norm": 3.375748634338379, + "learning_rate": 3.3920026407531115e-06, + "loss": 0.0451, + "num_input_tokens_seen": 17033120, + "step": 8695 + }, + { + "epoch": 1.152551358515573, + "grad_norm": 5.98948860168457, + "learning_rate": 3.3916783432865398e-06, + "loss": 0.102, + "num_input_tokens_seen": 17034784, + "step": 8696 + }, + { + "epoch": 1.1526838966202784, + "grad_norm": 0.6252455711364746, + "learning_rate": 3.3913540286277937e-06, + "loss": 0.0046, + "num_input_tokens_seen": 17037424, + "step": 8697 + }, + { + "epoch": 1.1528164347249834, + "grad_norm": 1.4617334604263306, + "learning_rate": 3.391029696783127e-06, + "loss": 0.0105, + "num_input_tokens_seen": 17039472, + "step": 8698 + }, + { + "epoch": 1.1529489728296884, + "grad_norm": 6.784725666046143, + "learning_rate": 3.3907053477587937e-06, + "loss": 0.0522, + "num_input_tokens_seen": 17041544, + "step": 8699 + }, + { + "epoch": 1.1530815109343937, + "grad_norm": 0.046742189675569534, + "learning_rate": 3.3903809815610455e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17043040, + "step": 8700 + }, + { + "epoch": 1.1532140490390987, + "grad_norm": 5.8381500244140625, + "learning_rate": 3.3900565981961396e-06, + "loss": 0.0453, + "num_input_tokens_seen": 17044584, + "step": 8701 + }, + { + "epoch": 1.1533465871438038, + "grad_norm": 4.4542951583862305, + "learning_rate": 3.389732197670327e-06, + "loss": 0.0298, + "num_input_tokens_seen": 17046776, + "step": 8702 + }, + { + "epoch": 1.153479125248509, + "grad_norm": 7.867623805999756, + "learning_rate": 3.3894077799898644e-06, + "loss": 0.1661, + "num_input_tokens_seen": 17048648, + "step": 8703 + }, + { + "epoch": 1.153611663353214, + "grad_norm": 0.014720787294209003, + "learning_rate": 3.3890833451610065e-06, + "loss": 0.0001, + "num_input_tokens_seen": 17049784, + "step": 8704 + }, + { + "epoch": 1.1537442014579191, + "grad_norm": 8.061545372009277, + "learning_rate": 3.388758893190008e-06, + "loss": 0.1444, + "num_input_tokens_seen": 17052280, + "step": 8705 + }, + { + "epoch": 1.1538767395626244, + "grad_norm": 11.435490608215332, + "learning_rate": 3.388434424083125e-06, + "loss": 0.3489, + "num_input_tokens_seen": 17054368, + "step": 8706 + }, + { + "epoch": 1.1540092776673294, + "grad_norm": 4.209782123565674, + "learning_rate": 3.388109937846614e-06, + "loss": 0.0387, + "num_input_tokens_seen": 17056072, + "step": 8707 + }, + { + "epoch": 1.1541418157720345, + "grad_norm": 1.5468027591705322, + "learning_rate": 3.3877854344867296e-06, + "loss": 0.008, + "num_input_tokens_seen": 17057560, + "step": 8708 + }, + { + "epoch": 1.1542743538767395, + "grad_norm": 4.870978832244873, + "learning_rate": 3.38746091400973e-06, + "loss": 0.0982, + "num_input_tokens_seen": 17060200, + "step": 8709 + }, + { + "epoch": 1.1544068919814447, + "grad_norm": 8.96373462677002, + "learning_rate": 3.3871363764218723e-06, + "loss": 0.213, + "num_input_tokens_seen": 17061968, + "step": 8710 + }, + { + "epoch": 1.1545394300861498, + "grad_norm": 9.577131271362305, + "learning_rate": 3.386811821729412e-06, + "loss": 0.4016, + "num_input_tokens_seen": 17064064, + "step": 8711 + }, + { + "epoch": 1.1546719681908548, + "grad_norm": 0.036699771881103516, + "learning_rate": 3.3864872499386094e-06, + "loss": 0.0002, + "num_input_tokens_seen": 17066600, + "step": 8712 + }, + { + "epoch": 1.15480450629556, + "grad_norm": 0.19082653522491455, + "learning_rate": 3.3861626610557198e-06, + "loss": 0.0012, + "num_input_tokens_seen": 17068344, + "step": 8713 + }, + { + "epoch": 1.1549370444002651, + "grad_norm": 0.11535575985908508, + "learning_rate": 3.385838055087004e-06, + "loss": 0.0008, + "num_input_tokens_seen": 17069984, + "step": 8714 + }, + { + "epoch": 1.1550695825049702, + "grad_norm": 5.328784942626953, + "learning_rate": 3.3855134320387183e-06, + "loss": 0.1432, + "num_input_tokens_seen": 17072096, + "step": 8715 + }, + { + "epoch": 1.1552021206096752, + "grad_norm": 2.4692676067352295, + "learning_rate": 3.3851887919171235e-06, + "loss": 0.0618, + "num_input_tokens_seen": 17073520, + "step": 8716 + }, + { + "epoch": 1.1553346587143805, + "grad_norm": 0.3779284954071045, + "learning_rate": 3.3848641347284777e-06, + "loss": 0.002, + "num_input_tokens_seen": 17075048, + "step": 8717 + }, + { + "epoch": 1.1554671968190855, + "grad_norm": 6.258880615234375, + "learning_rate": 3.384539460479041e-06, + "loss": 0.1501, + "num_input_tokens_seen": 17077032, + "step": 8718 + }, + { + "epoch": 1.1555997349237905, + "grad_norm": 9.282422065734863, + "learning_rate": 3.384214769175074e-06, + "loss": 0.0658, + "num_input_tokens_seen": 17078232, + "step": 8719 + }, + { + "epoch": 1.1557322730284958, + "grad_norm": 7.147819519042969, + "learning_rate": 3.383890060822835e-06, + "loss": 0.0946, + "num_input_tokens_seen": 17080504, + "step": 8720 + }, + { + "epoch": 1.1558648111332008, + "grad_norm": 0.02076355367898941, + "learning_rate": 3.383565335428587e-06, + "loss": 0.0001, + "num_input_tokens_seen": 17082704, + "step": 8721 + }, + { + "epoch": 1.1559973492379059, + "grad_norm": 1.0376105308532715, + "learning_rate": 3.3832405929985895e-06, + "loss": 0.0054, + "num_input_tokens_seen": 17084944, + "step": 8722 + }, + { + "epoch": 1.156129887342611, + "grad_norm": 0.7480855584144592, + "learning_rate": 3.382915833539104e-06, + "loss": 0.0014, + "num_input_tokens_seen": 17087728, + "step": 8723 + }, + { + "epoch": 1.1562624254473162, + "grad_norm": 5.616735458374023, + "learning_rate": 3.3825910570563913e-06, + "loss": 0.1466, + "num_input_tokens_seen": 17089920, + "step": 8724 + }, + { + "epoch": 1.1563949635520212, + "grad_norm": 3.6636204719543457, + "learning_rate": 3.3822662635567157e-06, + "loss": 0.016, + "num_input_tokens_seen": 17091616, + "step": 8725 + }, + { + "epoch": 1.1565275016567262, + "grad_norm": 0.12443515658378601, + "learning_rate": 3.3819414530463366e-06, + "loss": 0.0007, + "num_input_tokens_seen": 17093336, + "step": 8726 + }, + { + "epoch": 1.1566600397614315, + "grad_norm": 7.651934623718262, + "learning_rate": 3.381616625531518e-06, + "loss": 0.0351, + "num_input_tokens_seen": 17095576, + "step": 8727 + }, + { + "epoch": 1.1567925778661365, + "grad_norm": 1.422959327697754, + "learning_rate": 3.3812917810185236e-06, + "loss": 0.017, + "num_input_tokens_seen": 17097312, + "step": 8728 + }, + { + "epoch": 1.1569251159708416, + "grad_norm": 9.669575691223145, + "learning_rate": 3.380966919513614e-06, + "loss": 0.1732, + "num_input_tokens_seen": 17099424, + "step": 8729 + }, + { + "epoch": 1.1570576540755466, + "grad_norm": 0.3030741810798645, + "learning_rate": 3.380642041023056e-06, + "loss": 0.002, + "num_input_tokens_seen": 17101528, + "step": 8730 + }, + { + "epoch": 1.1571901921802519, + "grad_norm": 4.4121575355529785, + "learning_rate": 3.3803171455531116e-06, + "loss": 0.0713, + "num_input_tokens_seen": 17104272, + "step": 8731 + }, + { + "epoch": 1.157322730284957, + "grad_norm": 10.905190467834473, + "learning_rate": 3.3799922331100447e-06, + "loss": 0.2383, + "num_input_tokens_seen": 17106512, + "step": 8732 + }, + { + "epoch": 1.157455268389662, + "grad_norm": 6.185886383056641, + "learning_rate": 3.3796673037001215e-06, + "loss": 0.1465, + "num_input_tokens_seen": 17108264, + "step": 8733 + }, + { + "epoch": 1.1575878064943672, + "grad_norm": 1.5981717109680176, + "learning_rate": 3.3793423573296053e-06, + "loss": 0.0118, + "num_input_tokens_seen": 17110024, + "step": 8734 + }, + { + "epoch": 1.1577203445990722, + "grad_norm": 6.555273056030273, + "learning_rate": 3.3790173940047616e-06, + "loss": 0.1967, + "num_input_tokens_seen": 17111920, + "step": 8735 + }, + { + "epoch": 1.1578528827037773, + "grad_norm": 0.8882776498794556, + "learning_rate": 3.3786924137318562e-06, + "loss": 0.0076, + "num_input_tokens_seen": 17113904, + "step": 8736 + }, + { + "epoch": 1.1579854208084823, + "grad_norm": 2.360891103744507, + "learning_rate": 3.378367416517155e-06, + "loss": 0.0248, + "num_input_tokens_seen": 17115560, + "step": 8737 + }, + { + "epoch": 1.1581179589131876, + "grad_norm": 9.996360778808594, + "learning_rate": 3.3780424023669245e-06, + "loss": 0.317, + "num_input_tokens_seen": 17116888, + "step": 8738 + }, + { + "epoch": 1.1582504970178926, + "grad_norm": 7.274130344390869, + "learning_rate": 3.37771737128743e-06, + "loss": 0.2716, + "num_input_tokens_seen": 17119720, + "step": 8739 + }, + { + "epoch": 1.1583830351225977, + "grad_norm": 10.05730152130127, + "learning_rate": 3.37739232328494e-06, + "loss": 0.3369, + "num_input_tokens_seen": 17122016, + "step": 8740 + }, + { + "epoch": 1.158515573227303, + "grad_norm": 0.019153209403157234, + "learning_rate": 3.37706725836572e-06, + "loss": 0.0001, + "num_input_tokens_seen": 17123168, + "step": 8741 + }, + { + "epoch": 1.158648111332008, + "grad_norm": 0.03187284618616104, + "learning_rate": 3.376742176536038e-06, + "loss": 0.0002, + "num_input_tokens_seen": 17126472, + "step": 8742 + }, + { + "epoch": 1.158780649436713, + "grad_norm": 2.7827870845794678, + "learning_rate": 3.3764170778021626e-06, + "loss": 0.0272, + "num_input_tokens_seen": 17128152, + "step": 8743 + }, + { + "epoch": 1.158913187541418, + "grad_norm": 0.2974751889705658, + "learning_rate": 3.376091962170361e-06, + "loss": 0.0019, + "num_input_tokens_seen": 17130776, + "step": 8744 + }, + { + "epoch": 1.1590457256461233, + "grad_norm": 1.2216901779174805, + "learning_rate": 3.375766829646902e-06, + "loss": 0.0081, + "num_input_tokens_seen": 17132488, + "step": 8745 + }, + { + "epoch": 1.1591782637508283, + "grad_norm": 15.379068374633789, + "learning_rate": 3.375441680238055e-06, + "loss": 0.5392, + "num_input_tokens_seen": 17135232, + "step": 8746 + }, + { + "epoch": 1.1593108018555336, + "grad_norm": 1.8882536888122559, + "learning_rate": 3.3751165139500874e-06, + "loss": 0.018, + "num_input_tokens_seen": 17136664, + "step": 8747 + }, + { + "epoch": 1.1594433399602386, + "grad_norm": 7.539924621582031, + "learning_rate": 3.3747913307892703e-06, + "loss": 0.1419, + "num_input_tokens_seen": 17138984, + "step": 8748 + }, + { + "epoch": 1.1595758780649437, + "grad_norm": 7.482346534729004, + "learning_rate": 3.3744661307618725e-06, + "loss": 0.1318, + "num_input_tokens_seen": 17140888, + "step": 8749 + }, + { + "epoch": 1.1597084161696487, + "grad_norm": 0.3391251862049103, + "learning_rate": 3.3741409138741644e-06, + "loss": 0.0019, + "num_input_tokens_seen": 17142288, + "step": 8750 + }, + { + "epoch": 1.159840954274354, + "grad_norm": 9.31494426727295, + "learning_rate": 3.3738156801324174e-06, + "loss": 0.1202, + "num_input_tokens_seen": 17143912, + "step": 8751 + }, + { + "epoch": 1.159973492379059, + "grad_norm": 7.749035835266113, + "learning_rate": 3.3734904295429006e-06, + "loss": 0.2291, + "num_input_tokens_seen": 17145648, + "step": 8752 + }, + { + "epoch": 1.160106030483764, + "grad_norm": 5.612725257873535, + "learning_rate": 3.3731651621118856e-06, + "loss": 0.147, + "num_input_tokens_seen": 17146960, + "step": 8753 + }, + { + "epoch": 1.1602385685884693, + "grad_norm": 0.039717089384794235, + "learning_rate": 3.372839877845644e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17148472, + "step": 8754 + }, + { + "epoch": 1.1603711066931743, + "grad_norm": 9.674017906188965, + "learning_rate": 3.3725145767504473e-06, + "loss": 0.2206, + "num_input_tokens_seen": 17151040, + "step": 8755 + }, + { + "epoch": 1.1605036447978794, + "grad_norm": 9.146735191345215, + "learning_rate": 3.3721892588325676e-06, + "loss": 0.2608, + "num_input_tokens_seen": 17153680, + "step": 8756 + }, + { + "epoch": 1.1606361829025844, + "grad_norm": 0.1338479369878769, + "learning_rate": 3.3718639240982777e-06, + "loss": 0.0009, + "num_input_tokens_seen": 17155232, + "step": 8757 + }, + { + "epoch": 1.1607687210072897, + "grad_norm": 6.3811259269714355, + "learning_rate": 3.37153857255385e-06, + "loss": 0.0622, + "num_input_tokens_seen": 17156664, + "step": 8758 + }, + { + "epoch": 1.1609012591119947, + "grad_norm": 0.7668942213058472, + "learning_rate": 3.3712132042055567e-06, + "loss": 0.0148, + "num_input_tokens_seen": 17158448, + "step": 8759 + }, + { + "epoch": 1.1610337972166997, + "grad_norm": 7.476630210876465, + "learning_rate": 3.3708878190596724e-06, + "loss": 0.0352, + "num_input_tokens_seen": 17159800, + "step": 8760 + }, + { + "epoch": 1.161166335321405, + "grad_norm": 1.237681269645691, + "learning_rate": 3.37056241712247e-06, + "loss": 0.0139, + "num_input_tokens_seen": 17162024, + "step": 8761 + }, + { + "epoch": 1.16129887342611, + "grad_norm": 0.022176114842295647, + "learning_rate": 3.3702369984002227e-06, + "loss": 0.0002, + "num_input_tokens_seen": 17163368, + "step": 8762 + }, + { + "epoch": 1.161431411530815, + "grad_norm": 9.826338768005371, + "learning_rate": 3.3699115628992076e-06, + "loss": 0.205, + "num_input_tokens_seen": 17165672, + "step": 8763 + }, + { + "epoch": 1.1615639496355201, + "grad_norm": 10.378432273864746, + "learning_rate": 3.3695861106256966e-06, + "loss": 0.136, + "num_input_tokens_seen": 17167416, + "step": 8764 + }, + { + "epoch": 1.1616964877402254, + "grad_norm": 14.10642147064209, + "learning_rate": 3.369260641585964e-06, + "loss": 0.4722, + "num_input_tokens_seen": 17169800, + "step": 8765 + }, + { + "epoch": 1.1618290258449304, + "grad_norm": 13.667675018310547, + "learning_rate": 3.3689351557862886e-06, + "loss": 0.0734, + "num_input_tokens_seen": 17172424, + "step": 8766 + }, + { + "epoch": 1.1619615639496355, + "grad_norm": 3.496528387069702, + "learning_rate": 3.368609653232944e-06, + "loss": 0.0264, + "num_input_tokens_seen": 17173912, + "step": 8767 + }, + { + "epoch": 1.1620941020543407, + "grad_norm": 2.0171031951904297, + "learning_rate": 3.368284133932204e-06, + "loss": 0.0165, + "num_input_tokens_seen": 17175872, + "step": 8768 + }, + { + "epoch": 1.1622266401590458, + "grad_norm": 10.521379470825195, + "learning_rate": 3.3679585978903483e-06, + "loss": 0.1359, + "num_input_tokens_seen": 17178296, + "step": 8769 + }, + { + "epoch": 1.1623591782637508, + "grad_norm": 15.802945137023926, + "learning_rate": 3.367633045113652e-06, + "loss": 0.3126, + "num_input_tokens_seen": 17180440, + "step": 8770 + }, + { + "epoch": 1.1624917163684558, + "grad_norm": 13.81311321258545, + "learning_rate": 3.3673074756083916e-06, + "loss": 0.3181, + "num_input_tokens_seen": 17182888, + "step": 8771 + }, + { + "epoch": 1.162624254473161, + "grad_norm": 18.235857009887695, + "learning_rate": 3.366981889380846e-06, + "loss": 0.5816, + "num_input_tokens_seen": 17184704, + "step": 8772 + }, + { + "epoch": 1.1627567925778661, + "grad_norm": 1.5781246423721313, + "learning_rate": 3.3666562864372904e-06, + "loss": 0.0182, + "num_input_tokens_seen": 17186688, + "step": 8773 + }, + { + "epoch": 1.1628893306825712, + "grad_norm": 5.09915828704834, + "learning_rate": 3.3663306667840035e-06, + "loss": 0.0392, + "num_input_tokens_seen": 17188840, + "step": 8774 + }, + { + "epoch": 1.1630218687872764, + "grad_norm": 15.1967191696167, + "learning_rate": 3.366005030427264e-06, + "loss": 0.3, + "num_input_tokens_seen": 17191464, + "step": 8775 + }, + { + "epoch": 1.1631544068919815, + "grad_norm": 3.6834731101989746, + "learning_rate": 3.365679377373351e-06, + "loss": 0.023, + "num_input_tokens_seen": 17194064, + "step": 8776 + }, + { + "epoch": 1.1632869449966865, + "grad_norm": 4.609734058380127, + "learning_rate": 3.3653537076285407e-06, + "loss": 0.107, + "num_input_tokens_seen": 17195976, + "step": 8777 + }, + { + "epoch": 1.1634194831013915, + "grad_norm": 3.198232889175415, + "learning_rate": 3.365028021199115e-06, + "loss": 0.1231, + "num_input_tokens_seen": 17197968, + "step": 8778 + }, + { + "epoch": 1.1635520212060968, + "grad_norm": 0.03016057424247265, + "learning_rate": 3.3647023180913524e-06, + "loss": 0.0002, + "num_input_tokens_seen": 17199224, + "step": 8779 + }, + { + "epoch": 1.1636845593108018, + "grad_norm": 11.049237251281738, + "learning_rate": 3.364376598311532e-06, + "loss": 0.0792, + "num_input_tokens_seen": 17200960, + "step": 8780 + }, + { + "epoch": 1.1638170974155069, + "grad_norm": 0.27476415038108826, + "learning_rate": 3.364050861865935e-06, + "loss": 0.0019, + "num_input_tokens_seen": 17202816, + "step": 8781 + }, + { + "epoch": 1.1639496355202121, + "grad_norm": 0.16871066391468048, + "learning_rate": 3.3637251087608407e-06, + "loss": 0.001, + "num_input_tokens_seen": 17204056, + "step": 8782 + }, + { + "epoch": 1.1640821736249172, + "grad_norm": 0.01754230447113514, + "learning_rate": 3.363399339002531e-06, + "loss": 0.0001, + "num_input_tokens_seen": 17205152, + "step": 8783 + }, + { + "epoch": 1.1642147117296222, + "grad_norm": 0.09081870317459106, + "learning_rate": 3.3630735525972864e-06, + "loss": 0.0006, + "num_input_tokens_seen": 17207416, + "step": 8784 + }, + { + "epoch": 1.1643472498343272, + "grad_norm": 1.478420615196228, + "learning_rate": 3.3627477495513883e-06, + "loss": 0.0044, + "num_input_tokens_seen": 17208912, + "step": 8785 + }, + { + "epoch": 1.1644797879390325, + "grad_norm": 2.149013042449951, + "learning_rate": 3.362421929871118e-06, + "loss": 0.0108, + "num_input_tokens_seen": 17210200, + "step": 8786 + }, + { + "epoch": 1.1646123260437375, + "grad_norm": 1.4341086149215698, + "learning_rate": 3.362096093562758e-06, + "loss": 0.0344, + "num_input_tokens_seen": 17211704, + "step": 8787 + }, + { + "epoch": 1.1647448641484426, + "grad_norm": 11.264973640441895, + "learning_rate": 3.3617702406325915e-06, + "loss": 0.149, + "num_input_tokens_seen": 17214104, + "step": 8788 + }, + { + "epoch": 1.1648774022531478, + "grad_norm": 16.0450382232666, + "learning_rate": 3.3614443710868995e-06, + "loss": 0.3785, + "num_input_tokens_seen": 17216184, + "step": 8789 + }, + { + "epoch": 1.1650099403578529, + "grad_norm": 0.42266690731048584, + "learning_rate": 3.3611184849319654e-06, + "loss": 0.004, + "num_input_tokens_seen": 17218296, + "step": 8790 + }, + { + "epoch": 1.165142478462558, + "grad_norm": 0.05343859642744064, + "learning_rate": 3.3607925821740734e-06, + "loss": 0.0004, + "num_input_tokens_seen": 17219816, + "step": 8791 + }, + { + "epoch": 1.1652750165672632, + "grad_norm": 31.450143814086914, + "learning_rate": 3.360466662819506e-06, + "loss": 0.4873, + "num_input_tokens_seen": 17222224, + "step": 8792 + }, + { + "epoch": 1.1654075546719682, + "grad_norm": 2.264739513397217, + "learning_rate": 3.3601407268745483e-06, + "loss": 0.0114, + "num_input_tokens_seen": 17223744, + "step": 8793 + }, + { + "epoch": 1.1655400927766733, + "grad_norm": 10.350147247314453, + "learning_rate": 3.3598147743454844e-06, + "loss": 0.2334, + "num_input_tokens_seen": 17226088, + "step": 8794 + }, + { + "epoch": 1.1656726308813785, + "grad_norm": 6.459985733032227, + "learning_rate": 3.3594888052385975e-06, + "loss": 0.1633, + "num_input_tokens_seen": 17227808, + "step": 8795 + }, + { + "epoch": 1.1658051689860836, + "grad_norm": 0.018837304785847664, + "learning_rate": 3.3591628195601737e-06, + "loss": 0.0001, + "num_input_tokens_seen": 17229008, + "step": 8796 + }, + { + "epoch": 1.1659377070907886, + "grad_norm": 10.784920692443848, + "learning_rate": 3.3588368173164985e-06, + "loss": 0.1736, + "num_input_tokens_seen": 17230216, + "step": 8797 + }, + { + "epoch": 1.1660702451954936, + "grad_norm": 1.151878833770752, + "learning_rate": 3.358510798513857e-06, + "loss": 0.0108, + "num_input_tokens_seen": 17232856, + "step": 8798 + }, + { + "epoch": 1.1662027833001989, + "grad_norm": 2.6485376358032227, + "learning_rate": 3.358184763158535e-06, + "loss": 0.0494, + "num_input_tokens_seen": 17234728, + "step": 8799 + }, + { + "epoch": 1.166335321404904, + "grad_norm": 5.9347734451293945, + "learning_rate": 3.3578587112568186e-06, + "loss": 0.0848, + "num_input_tokens_seen": 17237536, + "step": 8800 + }, + { + "epoch": 1.166467859509609, + "grad_norm": 0.7881411910057068, + "learning_rate": 3.357532642814994e-06, + "loss": 0.0049, + "num_input_tokens_seen": 17239520, + "step": 8801 + }, + { + "epoch": 1.1666003976143142, + "grad_norm": 15.894207954406738, + "learning_rate": 3.3572065578393493e-06, + "loss": 0.1129, + "num_input_tokens_seen": 17241896, + "step": 8802 + }, + { + "epoch": 1.1667329357190193, + "grad_norm": 15.068872451782227, + "learning_rate": 3.3568804563361703e-06, + "loss": 0.366, + "num_input_tokens_seen": 17244616, + "step": 8803 + }, + { + "epoch": 1.1668654738237243, + "grad_norm": 3.778249979019165, + "learning_rate": 3.356554338311745e-06, + "loss": 0.1047, + "num_input_tokens_seen": 17246296, + "step": 8804 + }, + { + "epoch": 1.1669980119284293, + "grad_norm": 8.295060157775879, + "learning_rate": 3.3562282037723616e-06, + "loss": 0.3166, + "num_input_tokens_seen": 17248152, + "step": 8805 + }, + { + "epoch": 1.1671305500331346, + "grad_norm": 3.767232656478882, + "learning_rate": 3.355902052724307e-06, + "loss": 0.082, + "num_input_tokens_seen": 17250328, + "step": 8806 + }, + { + "epoch": 1.1672630881378396, + "grad_norm": 5.73391056060791, + "learning_rate": 3.3555758851738706e-06, + "loss": 0.0981, + "num_input_tokens_seen": 17253008, + "step": 8807 + }, + { + "epoch": 1.1673956262425447, + "grad_norm": 6.256034851074219, + "learning_rate": 3.355249701127341e-06, + "loss": 0.2291, + "num_input_tokens_seen": 17254648, + "step": 8808 + }, + { + "epoch": 1.16752816434725, + "grad_norm": 0.10122384130954742, + "learning_rate": 3.354923500591007e-06, + "loss": 0.0005, + "num_input_tokens_seen": 17256784, + "step": 8809 + }, + { + "epoch": 1.167660702451955, + "grad_norm": 3.7205545902252197, + "learning_rate": 3.354597283571158e-06, + "loss": 0.1055, + "num_input_tokens_seen": 17258280, + "step": 8810 + }, + { + "epoch": 1.16779324055666, + "grad_norm": 0.05275749787688255, + "learning_rate": 3.354271050074085e-06, + "loss": 0.0004, + "num_input_tokens_seen": 17260320, + "step": 8811 + }, + { + "epoch": 1.167925778661365, + "grad_norm": 11.595385551452637, + "learning_rate": 3.3539448001060753e-06, + "loss": 0.3486, + "num_input_tokens_seen": 17262832, + "step": 8812 + }, + { + "epoch": 1.1680583167660703, + "grad_norm": 11.180983543395996, + "learning_rate": 3.353618533673422e-06, + "loss": 0.0208, + "num_input_tokens_seen": 17265336, + "step": 8813 + }, + { + "epoch": 1.1681908548707753, + "grad_norm": 2.978334665298462, + "learning_rate": 3.3532922507824134e-06, + "loss": 0.0421, + "num_input_tokens_seen": 17267280, + "step": 8814 + }, + { + "epoch": 1.1683233929754804, + "grad_norm": 0.10361512750387192, + "learning_rate": 3.352965951439342e-06, + "loss": 0.0007, + "num_input_tokens_seen": 17268768, + "step": 8815 + }, + { + "epoch": 1.1684559310801856, + "grad_norm": 9.087681770324707, + "learning_rate": 3.3526396356504986e-06, + "loss": 0.0943, + "num_input_tokens_seen": 17271512, + "step": 8816 + }, + { + "epoch": 1.1685884691848907, + "grad_norm": 0.09535300731658936, + "learning_rate": 3.352313303422175e-06, + "loss": 0.0007, + "num_input_tokens_seen": 17273312, + "step": 8817 + }, + { + "epoch": 1.1687210072895957, + "grad_norm": 0.03143332898616791, + "learning_rate": 3.3519869547606624e-06, + "loss": 0.0002, + "num_input_tokens_seen": 17274432, + "step": 8818 + }, + { + "epoch": 1.1688535453943008, + "grad_norm": 4.9378132820129395, + "learning_rate": 3.351660589672254e-06, + "loss": 0.0831, + "num_input_tokens_seen": 17276328, + "step": 8819 + }, + { + "epoch": 1.168986083499006, + "grad_norm": 0.1342911422252655, + "learning_rate": 3.3513342081632423e-06, + "loss": 0.001, + "num_input_tokens_seen": 17279056, + "step": 8820 + }, + { + "epoch": 1.169118621603711, + "grad_norm": 9.325128555297852, + "learning_rate": 3.351007810239919e-06, + "loss": 0.1162, + "num_input_tokens_seen": 17281984, + "step": 8821 + }, + { + "epoch": 1.169251159708416, + "grad_norm": 3.1699886322021484, + "learning_rate": 3.3506813959085783e-06, + "loss": 0.0185, + "num_input_tokens_seen": 17283576, + "step": 8822 + }, + { + "epoch": 1.1693836978131213, + "grad_norm": 0.08768800646066666, + "learning_rate": 3.3503549651755143e-06, + "loss": 0.0006, + "num_input_tokens_seen": 17285328, + "step": 8823 + }, + { + "epoch": 1.1695162359178264, + "grad_norm": 0.173847958445549, + "learning_rate": 3.350028518047019e-06, + "loss": 0.0012, + "num_input_tokens_seen": 17288296, + "step": 8824 + }, + { + "epoch": 1.1696487740225314, + "grad_norm": 5.677957534790039, + "learning_rate": 3.3497020545293875e-06, + "loss": 0.1246, + "num_input_tokens_seen": 17290112, + "step": 8825 + }, + { + "epoch": 1.1697813121272365, + "grad_norm": 9.095964431762695, + "learning_rate": 3.349375574628914e-06, + "loss": 0.2168, + "num_input_tokens_seen": 17291688, + "step": 8826 + }, + { + "epoch": 1.1699138502319417, + "grad_norm": 6.570424556732178, + "learning_rate": 3.3490490783518935e-06, + "loss": 0.0264, + "num_input_tokens_seen": 17293312, + "step": 8827 + }, + { + "epoch": 1.1700463883366468, + "grad_norm": 0.30812981724739075, + "learning_rate": 3.348722565704622e-06, + "loss": 0.0019, + "num_input_tokens_seen": 17294536, + "step": 8828 + }, + { + "epoch": 1.1701789264413518, + "grad_norm": 0.6817254424095154, + "learning_rate": 3.348396036693393e-06, + "loss": 0.0065, + "num_input_tokens_seen": 17295944, + "step": 8829 + }, + { + "epoch": 1.170311464546057, + "grad_norm": 5.960328578948975, + "learning_rate": 3.348069491324503e-06, + "loss": 0.2425, + "num_input_tokens_seen": 17298048, + "step": 8830 + }, + { + "epoch": 1.170444002650762, + "grad_norm": 0.05711880698800087, + "learning_rate": 3.347742929604249e-06, + "loss": 0.0004, + "num_input_tokens_seen": 17300904, + "step": 8831 + }, + { + "epoch": 1.1705765407554671, + "grad_norm": 11.044378280639648, + "learning_rate": 3.3474163515389256e-06, + "loss": 0.2476, + "num_input_tokens_seen": 17302864, + "step": 8832 + }, + { + "epoch": 1.1707090788601724, + "grad_norm": 0.027814380824565887, + "learning_rate": 3.347089757134831e-06, + "loss": 0.0002, + "num_input_tokens_seen": 17304152, + "step": 8833 + }, + { + "epoch": 1.1708416169648774, + "grad_norm": 2.0247271060943604, + "learning_rate": 3.3467631463982616e-06, + "loss": 0.0129, + "num_input_tokens_seen": 17305984, + "step": 8834 + }, + { + "epoch": 1.1709741550695825, + "grad_norm": 7.8655571937561035, + "learning_rate": 3.3464365193355143e-06, + "loss": 0.0829, + "num_input_tokens_seen": 17307552, + "step": 8835 + }, + { + "epoch": 1.1711066931742877, + "grad_norm": 5.011282444000244, + "learning_rate": 3.3461098759528866e-06, + "loss": 0.0491, + "num_input_tokens_seen": 17309024, + "step": 8836 + }, + { + "epoch": 1.1712392312789928, + "grad_norm": 0.04960355535149574, + "learning_rate": 3.3457832162566767e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17312032, + "step": 8837 + }, + { + "epoch": 1.1713717693836978, + "grad_norm": 0.019741661846637726, + "learning_rate": 3.3454565402531835e-06, + "loss": 0.0001, + "num_input_tokens_seen": 17314304, + "step": 8838 + }, + { + "epoch": 1.1715043074884028, + "grad_norm": 0.04193928465247154, + "learning_rate": 3.3451298479487048e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17316408, + "step": 8839 + }, + { + "epoch": 1.171636845593108, + "grad_norm": 0.05488637834787369, + "learning_rate": 3.3448031393495394e-06, + "loss": 0.0004, + "num_input_tokens_seen": 17318072, + "step": 8840 + }, + { + "epoch": 1.1717693836978131, + "grad_norm": 8.657123565673828, + "learning_rate": 3.344476414461987e-06, + "loss": 0.0978, + "num_input_tokens_seen": 17320640, + "step": 8841 + }, + { + "epoch": 1.1719019218025182, + "grad_norm": 10.635114669799805, + "learning_rate": 3.3441496732923475e-06, + "loss": 0.279, + "num_input_tokens_seen": 17322512, + "step": 8842 + }, + { + "epoch": 1.1720344599072234, + "grad_norm": 2.2050373554229736, + "learning_rate": 3.343822915846918e-06, + "loss": 0.0095, + "num_input_tokens_seen": 17325416, + "step": 8843 + }, + { + "epoch": 1.1721669980119285, + "grad_norm": 10.27675724029541, + "learning_rate": 3.343496142132002e-06, + "loss": 0.1018, + "num_input_tokens_seen": 17327056, + "step": 8844 + }, + { + "epoch": 1.1722995361166335, + "grad_norm": 0.08411324769258499, + "learning_rate": 3.3431693521538977e-06, + "loss": 0.0005, + "num_input_tokens_seen": 17328424, + "step": 8845 + }, + { + "epoch": 1.1724320742213385, + "grad_norm": 4.106504440307617, + "learning_rate": 3.342842545918907e-06, + "loss": 0.0509, + "num_input_tokens_seen": 17330512, + "step": 8846 + }, + { + "epoch": 1.1725646123260438, + "grad_norm": 0.02667836658656597, + "learning_rate": 3.3425157234333316e-06, + "loss": 0.0002, + "num_input_tokens_seen": 17332856, + "step": 8847 + }, + { + "epoch": 1.1726971504307488, + "grad_norm": 15.952197074890137, + "learning_rate": 3.34218888470347e-06, + "loss": 0.0825, + "num_input_tokens_seen": 17335680, + "step": 8848 + }, + { + "epoch": 1.1728296885354539, + "grad_norm": 0.30259251594543457, + "learning_rate": 3.341862029735627e-06, + "loss": 0.001, + "num_input_tokens_seen": 17337416, + "step": 8849 + }, + { + "epoch": 1.1729622266401591, + "grad_norm": 7.76979923248291, + "learning_rate": 3.3415351585361028e-06, + "loss": 0.0993, + "num_input_tokens_seen": 17339088, + "step": 8850 + }, + { + "epoch": 1.1730947647448642, + "grad_norm": 4.826986789703369, + "learning_rate": 3.3412082711112005e-06, + "loss": 0.1121, + "num_input_tokens_seen": 17340936, + "step": 8851 + }, + { + "epoch": 1.1732273028495692, + "grad_norm": 0.32022175192832947, + "learning_rate": 3.340881367467222e-06, + "loss": 0.0033, + "num_input_tokens_seen": 17343000, + "step": 8852 + }, + { + "epoch": 1.1733598409542743, + "grad_norm": 5.01305627822876, + "learning_rate": 3.3405544476104705e-06, + "loss": 0.0399, + "num_input_tokens_seen": 17344472, + "step": 8853 + }, + { + "epoch": 1.1734923790589795, + "grad_norm": 10.551304817199707, + "learning_rate": 3.34022751154725e-06, + "loss": 0.3568, + "num_input_tokens_seen": 17346072, + "step": 8854 + }, + { + "epoch": 1.1736249171636846, + "grad_norm": 3.7864937782287598, + "learning_rate": 3.3399005592838628e-06, + "loss": 0.0418, + "num_input_tokens_seen": 17349048, + "step": 8855 + }, + { + "epoch": 1.1737574552683896, + "grad_norm": 0.05981745198369026, + "learning_rate": 3.3395735908266143e-06, + "loss": 0.0004, + "num_input_tokens_seen": 17350832, + "step": 8856 + }, + { + "epoch": 1.1738899933730949, + "grad_norm": 0.12674234807491302, + "learning_rate": 3.3392466061818067e-06, + "loss": 0.0014, + "num_input_tokens_seen": 17352168, + "step": 8857 + }, + { + "epoch": 1.1740225314778, + "grad_norm": 0.0324234776198864, + "learning_rate": 3.338919605355746e-06, + "loss": 0.0002, + "num_input_tokens_seen": 17354216, + "step": 8858 + }, + { + "epoch": 1.174155069582505, + "grad_norm": 10.12738037109375, + "learning_rate": 3.3385925883547373e-06, + "loss": 0.4872, + "num_input_tokens_seen": 17357776, + "step": 8859 + }, + { + "epoch": 1.17428760768721, + "grad_norm": 6.50280237197876, + "learning_rate": 3.338265555185084e-06, + "loss": 0.1522, + "num_input_tokens_seen": 17359816, + "step": 8860 + }, + { + "epoch": 1.1744201457919152, + "grad_norm": 6.599811553955078, + "learning_rate": 3.3379385058530923e-06, + "loss": 0.1842, + "num_input_tokens_seen": 17362576, + "step": 8861 + }, + { + "epoch": 1.1745526838966203, + "grad_norm": 4.615855693817139, + "learning_rate": 3.3376114403650683e-06, + "loss": 0.1078, + "num_input_tokens_seen": 17364960, + "step": 8862 + }, + { + "epoch": 1.1746852220013253, + "grad_norm": 16.18260383605957, + "learning_rate": 3.3372843587273184e-06, + "loss": 0.5028, + "num_input_tokens_seen": 17366992, + "step": 8863 + }, + { + "epoch": 1.1748177601060306, + "grad_norm": 1.8142237663269043, + "learning_rate": 3.336957260946149e-06, + "loss": 0.013, + "num_input_tokens_seen": 17368584, + "step": 8864 + }, + { + "epoch": 1.1749502982107356, + "grad_norm": 1.0917799472808838, + "learning_rate": 3.336630147027865e-06, + "loss": 0.0151, + "num_input_tokens_seen": 17371344, + "step": 8865 + }, + { + "epoch": 1.1750828363154406, + "grad_norm": 0.1811767965555191, + "learning_rate": 3.3363030169787754e-06, + "loss": 0.0012, + "num_input_tokens_seen": 17372640, + "step": 8866 + }, + { + "epoch": 1.1752153744201457, + "grad_norm": 3.082073450088501, + "learning_rate": 3.3359758708051864e-06, + "loss": 0.018, + "num_input_tokens_seen": 17374080, + "step": 8867 + }, + { + "epoch": 1.175347912524851, + "grad_norm": 8.657854080200195, + "learning_rate": 3.335648708513406e-06, + "loss": 0.2723, + "num_input_tokens_seen": 17376576, + "step": 8868 + }, + { + "epoch": 1.175480450629556, + "grad_norm": 0.4310837388038635, + "learning_rate": 3.335321530109741e-06, + "loss": 0.0015, + "num_input_tokens_seen": 17378976, + "step": 8869 + }, + { + "epoch": 1.175612988734261, + "grad_norm": 7.137519836425781, + "learning_rate": 3.334994335600503e-06, + "loss": 0.1222, + "num_input_tokens_seen": 17381272, + "step": 8870 + }, + { + "epoch": 1.1757455268389663, + "grad_norm": 9.532232284545898, + "learning_rate": 3.334667124991996e-06, + "loss": 0.1163, + "num_input_tokens_seen": 17384048, + "step": 8871 + }, + { + "epoch": 1.1758780649436713, + "grad_norm": 0.07253357023000717, + "learning_rate": 3.3343398982905315e-06, + "loss": 0.0008, + "num_input_tokens_seen": 17386832, + "step": 8872 + }, + { + "epoch": 1.1760106030483763, + "grad_norm": 0.41350844502449036, + "learning_rate": 3.334012655502419e-06, + "loss": 0.0046, + "num_input_tokens_seen": 17388352, + "step": 8873 + }, + { + "epoch": 1.1761431411530816, + "grad_norm": 0.04887591674923897, + "learning_rate": 3.333685396633967e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17389504, + "step": 8874 + }, + { + "epoch": 1.1762756792577866, + "grad_norm": 0.40415316820144653, + "learning_rate": 3.3333581216914844e-06, + "loss": 0.0023, + "num_input_tokens_seen": 17391528, + "step": 8875 + }, + { + "epoch": 1.1764082173624917, + "grad_norm": 0.25336959958076477, + "learning_rate": 3.333030830681283e-06, + "loss": 0.0018, + "num_input_tokens_seen": 17393816, + "step": 8876 + }, + { + "epoch": 1.176540755467197, + "grad_norm": 0.041031885892152786, + "learning_rate": 3.3327035236096723e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17394800, + "step": 8877 + }, + { + "epoch": 1.176673293571902, + "grad_norm": 0.12305854260921478, + "learning_rate": 3.3323762004829635e-06, + "loss": 0.0008, + "num_input_tokens_seen": 17395968, + "step": 8878 + }, + { + "epoch": 1.176805831676607, + "grad_norm": 5.9266357421875, + "learning_rate": 3.332048861307467e-06, + "loss": 0.1003, + "num_input_tokens_seen": 17398104, + "step": 8879 + }, + { + "epoch": 1.176938369781312, + "grad_norm": 0.023426292464137077, + "learning_rate": 3.3317215060894946e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17399640, + "step": 8880 + }, + { + "epoch": 1.1770709078860173, + "grad_norm": 6.088700771331787, + "learning_rate": 3.3313941348353575e-06, + "loss": 0.1498, + "num_input_tokens_seen": 17401912, + "step": 8881 + }, + { + "epoch": 1.1772034459907224, + "grad_norm": 6.036590576171875, + "learning_rate": 3.3310667475513684e-06, + "loss": 0.0984, + "num_input_tokens_seen": 17404048, + "step": 8882 + }, + { + "epoch": 1.1773359840954274, + "grad_norm": 11.896187782287598, + "learning_rate": 3.330739344243839e-06, + "loss": 0.0712, + "num_input_tokens_seen": 17406168, + "step": 8883 + }, + { + "epoch": 1.1774685222001327, + "grad_norm": 14.426502227783203, + "learning_rate": 3.330411924919082e-06, + "loss": 0.3913, + "num_input_tokens_seen": 17407968, + "step": 8884 + }, + { + "epoch": 1.1776010603048377, + "grad_norm": 4.15945291519165, + "learning_rate": 3.33008448958341e-06, + "loss": 0.0832, + "num_input_tokens_seen": 17409488, + "step": 8885 + }, + { + "epoch": 1.1777335984095427, + "grad_norm": 4.968869209289551, + "learning_rate": 3.3297570382431367e-06, + "loss": 0.1024, + "num_input_tokens_seen": 17411248, + "step": 8886 + }, + { + "epoch": 1.1778661365142478, + "grad_norm": 6.137375354766846, + "learning_rate": 3.329429570904575e-06, + "loss": 0.1137, + "num_input_tokens_seen": 17414112, + "step": 8887 + }, + { + "epoch": 1.177998674618953, + "grad_norm": 0.15902596712112427, + "learning_rate": 3.3291020875740394e-06, + "loss": 0.0012, + "num_input_tokens_seen": 17415952, + "step": 8888 + }, + { + "epoch": 1.178131212723658, + "grad_norm": 3.0795891284942627, + "learning_rate": 3.3287745882578427e-06, + "loss": 0.0955, + "num_input_tokens_seen": 17417864, + "step": 8889 + }, + { + "epoch": 1.178263750828363, + "grad_norm": 3.824113130569458, + "learning_rate": 3.328447072962301e-06, + "loss": 0.0154, + "num_input_tokens_seen": 17420168, + "step": 8890 + }, + { + "epoch": 1.1783962889330684, + "grad_norm": 7.940334796905518, + "learning_rate": 3.328119541693728e-06, + "loss": 0.1104, + "num_input_tokens_seen": 17421704, + "step": 8891 + }, + { + "epoch": 1.1785288270377734, + "grad_norm": 11.775419235229492, + "learning_rate": 3.327791994458438e-06, + "loss": 0.2967, + "num_input_tokens_seen": 17424280, + "step": 8892 + }, + { + "epoch": 1.1786613651424784, + "grad_norm": 0.05949070677161217, + "learning_rate": 3.327464431262749e-06, + "loss": 0.0004, + "num_input_tokens_seen": 17425848, + "step": 8893 + }, + { + "epoch": 1.1787939032471835, + "grad_norm": 10.573440551757812, + "learning_rate": 3.3271368521129736e-06, + "loss": 0.2735, + "num_input_tokens_seen": 17427880, + "step": 8894 + }, + { + "epoch": 1.1789264413518887, + "grad_norm": 9.537389755249023, + "learning_rate": 3.3268092570154296e-06, + "loss": 0.0932, + "num_input_tokens_seen": 17429624, + "step": 8895 + }, + { + "epoch": 1.1790589794565938, + "grad_norm": 3.9666707515716553, + "learning_rate": 3.3264816459764334e-06, + "loss": 0.0489, + "num_input_tokens_seen": 17430864, + "step": 8896 + }, + { + "epoch": 1.1791915175612988, + "grad_norm": 7.0137553215026855, + "learning_rate": 3.3261540190023e-06, + "loss": 0.0712, + "num_input_tokens_seen": 17433168, + "step": 8897 + }, + { + "epoch": 1.179324055666004, + "grad_norm": 5.60612154006958, + "learning_rate": 3.325826376099347e-06, + "loss": 0.047, + "num_input_tokens_seen": 17434920, + "step": 8898 + }, + { + "epoch": 1.179456593770709, + "grad_norm": 16.52873420715332, + "learning_rate": 3.325498717273893e-06, + "loss": 0.3325, + "num_input_tokens_seen": 17437592, + "step": 8899 + }, + { + "epoch": 1.1795891318754141, + "grad_norm": 11.053872108459473, + "learning_rate": 3.3251710425322524e-06, + "loss": 0.4957, + "num_input_tokens_seen": 17439800, + "step": 8900 + }, + { + "epoch": 1.1797216699801192, + "grad_norm": 11.831011772155762, + "learning_rate": 3.324843351880746e-06, + "loss": 0.2683, + "num_input_tokens_seen": 17442432, + "step": 8901 + }, + { + "epoch": 1.1798542080848244, + "grad_norm": 5.917088031768799, + "learning_rate": 3.3245156453256905e-06, + "loss": 0.0753, + "num_input_tokens_seen": 17443880, + "step": 8902 + }, + { + "epoch": 1.1799867461895295, + "grad_norm": 0.07549116015434265, + "learning_rate": 3.324187922873405e-06, + "loss": 0.0005, + "num_input_tokens_seen": 17445840, + "step": 8903 + }, + { + "epoch": 1.1801192842942345, + "grad_norm": 0.0452670156955719, + "learning_rate": 3.3238601845302075e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17448104, + "step": 8904 + }, + { + "epoch": 1.1802518223989398, + "grad_norm": 0.22549304366111755, + "learning_rate": 3.3235324303024167e-06, + "loss": 0.0024, + "num_input_tokens_seen": 17450056, + "step": 8905 + }, + { + "epoch": 1.1803843605036448, + "grad_norm": 0.2072581946849823, + "learning_rate": 3.3232046601963534e-06, + "loss": 0.0014, + "num_input_tokens_seen": 17451632, + "step": 8906 + }, + { + "epoch": 1.1805168986083499, + "grad_norm": 5.191887855529785, + "learning_rate": 3.322876874218335e-06, + "loss": 0.0398, + "num_input_tokens_seen": 17453624, + "step": 8907 + }, + { + "epoch": 1.180649436713055, + "grad_norm": 15.796998023986816, + "learning_rate": 3.3225490723746847e-06, + "loss": 0.5041, + "num_input_tokens_seen": 17456552, + "step": 8908 + }, + { + "epoch": 1.1807819748177601, + "grad_norm": 10.588598251342773, + "learning_rate": 3.3222212546717204e-06, + "loss": 0.4351, + "num_input_tokens_seen": 17458408, + "step": 8909 + }, + { + "epoch": 1.1809145129224652, + "grad_norm": 0.0450008325278759, + "learning_rate": 3.3218934211157615e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17461544, + "step": 8910 + }, + { + "epoch": 1.1810470510271702, + "grad_norm": 0.08888935297727585, + "learning_rate": 3.321565571713132e-06, + "loss": 0.0006, + "num_input_tokens_seen": 17463152, + "step": 8911 + }, + { + "epoch": 1.1811795891318755, + "grad_norm": 5.804290771484375, + "learning_rate": 3.321237706470152e-06, + "loss": 0.0827, + "num_input_tokens_seen": 17464728, + "step": 8912 + }, + { + "epoch": 1.1813121272365805, + "grad_norm": 9.533463478088379, + "learning_rate": 3.3209098253931404e-06, + "loss": 0.2408, + "num_input_tokens_seen": 17467024, + "step": 8913 + }, + { + "epoch": 1.1814446653412856, + "grad_norm": 0.9059574604034424, + "learning_rate": 3.3205819284884226e-06, + "loss": 0.0125, + "num_input_tokens_seen": 17468592, + "step": 8914 + }, + { + "epoch": 1.1815772034459906, + "grad_norm": 5.516317367553711, + "learning_rate": 3.3202540157623186e-06, + "loss": 0.1311, + "num_input_tokens_seen": 17470440, + "step": 8915 + }, + { + "epoch": 1.1817097415506959, + "grad_norm": 0.050684865564107895, + "learning_rate": 3.3199260872211515e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17473056, + "step": 8916 + }, + { + "epoch": 1.181842279655401, + "grad_norm": 2.446044445037842, + "learning_rate": 3.319598142871244e-06, + "loss": 0.0691, + "num_input_tokens_seen": 17475448, + "step": 8917 + }, + { + "epoch": 1.181974817760106, + "grad_norm": 0.014913138933479786, + "learning_rate": 3.319270182718918e-06, + "loss": 0.0001, + "num_input_tokens_seen": 17476592, + "step": 8918 + }, + { + "epoch": 1.1821073558648112, + "grad_norm": 10.886700630187988, + "learning_rate": 3.3189422067704985e-06, + "loss": 0.2426, + "num_input_tokens_seen": 17478104, + "step": 8919 + }, + { + "epoch": 1.1822398939695162, + "grad_norm": 12.689355850219727, + "learning_rate": 3.3186142150323083e-06, + "loss": 0.3231, + "num_input_tokens_seen": 17479472, + "step": 8920 + }, + { + "epoch": 1.1823724320742213, + "grad_norm": 9.062192916870117, + "learning_rate": 3.3182862075106708e-06, + "loss": 0.1892, + "num_input_tokens_seen": 17481520, + "step": 8921 + }, + { + "epoch": 1.1825049701789265, + "grad_norm": 3.434389591217041, + "learning_rate": 3.3179581842119113e-06, + "loss": 0.0506, + "num_input_tokens_seen": 17483160, + "step": 8922 + }, + { + "epoch": 1.1826375082836316, + "grad_norm": 3.0705623626708984, + "learning_rate": 3.317630145142353e-06, + "loss": 0.0422, + "num_input_tokens_seen": 17485024, + "step": 8923 + }, + { + "epoch": 1.1827700463883366, + "grad_norm": 0.038525160402059555, + "learning_rate": 3.3173020903083227e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17487056, + "step": 8924 + }, + { + "epoch": 1.1829025844930419, + "grad_norm": 5.803386211395264, + "learning_rate": 3.3169740197161427e-06, + "loss": 0.1272, + "num_input_tokens_seen": 17489888, + "step": 8925 + }, + { + "epoch": 1.183035122597747, + "grad_norm": 7.382401466369629, + "learning_rate": 3.3166459333721408e-06, + "loss": 0.2465, + "num_input_tokens_seen": 17491296, + "step": 8926 + }, + { + "epoch": 1.183167660702452, + "grad_norm": 6.785992622375488, + "learning_rate": 3.3163178312826417e-06, + "loss": 0.1687, + "num_input_tokens_seen": 17493992, + "step": 8927 + }, + { + "epoch": 1.183300198807157, + "grad_norm": 0.04319420084357262, + "learning_rate": 3.315989713453972e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17495848, + "step": 8928 + }, + { + "epoch": 1.1834327369118622, + "grad_norm": 10.298629760742188, + "learning_rate": 3.3156615798924576e-06, + "loss": 0.2509, + "num_input_tokens_seen": 17498544, + "step": 8929 + }, + { + "epoch": 1.1835652750165673, + "grad_norm": 5.441462516784668, + "learning_rate": 3.315333430604425e-06, + "loss": 0.1466, + "num_input_tokens_seen": 17501072, + "step": 8930 + }, + { + "epoch": 1.1836978131212723, + "grad_norm": 0.0792512595653534, + "learning_rate": 3.315005265596202e-06, + "loss": 0.0006, + "num_input_tokens_seen": 17502888, + "step": 8931 + }, + { + "epoch": 1.1838303512259776, + "grad_norm": 10.404994010925293, + "learning_rate": 3.314677084874115e-06, + "loss": 0.163, + "num_input_tokens_seen": 17504656, + "step": 8932 + }, + { + "epoch": 1.1839628893306826, + "grad_norm": 12.821009635925293, + "learning_rate": 3.314348888444491e-06, + "loss": 0.2046, + "num_input_tokens_seen": 17506536, + "step": 8933 + }, + { + "epoch": 1.1840954274353876, + "grad_norm": 5.730447769165039, + "learning_rate": 3.3140206763136597e-06, + "loss": 0.0515, + "num_input_tokens_seen": 17507912, + "step": 8934 + }, + { + "epoch": 1.1842279655400927, + "grad_norm": 3.5229740142822266, + "learning_rate": 3.313692448487948e-06, + "loss": 0.0161, + "num_input_tokens_seen": 17509968, + "step": 8935 + }, + { + "epoch": 1.184360503644798, + "grad_norm": 3.23940110206604, + "learning_rate": 3.3133642049736835e-06, + "loss": 0.0809, + "num_input_tokens_seen": 17512592, + "step": 8936 + }, + { + "epoch": 1.184493041749503, + "grad_norm": 4.539789199829102, + "learning_rate": 3.313035945777197e-06, + "loss": 0.1271, + "num_input_tokens_seen": 17514776, + "step": 8937 + }, + { + "epoch": 1.184625579854208, + "grad_norm": 5.83056640625, + "learning_rate": 3.312707670904816e-06, + "loss": 0.1138, + "num_input_tokens_seen": 17517440, + "step": 8938 + }, + { + "epoch": 1.1847581179589133, + "grad_norm": 7.625308036804199, + "learning_rate": 3.312379380362872e-06, + "loss": 0.2632, + "num_input_tokens_seen": 17519176, + "step": 8939 + }, + { + "epoch": 1.1848906560636183, + "grad_norm": 8.107145309448242, + "learning_rate": 3.3120510741576915e-06, + "loss": 0.0861, + "num_input_tokens_seen": 17521256, + "step": 8940 + }, + { + "epoch": 1.1850231941683234, + "grad_norm": 5.104151248931885, + "learning_rate": 3.3117227522956064e-06, + "loss": 0.1521, + "num_input_tokens_seen": 17522736, + "step": 8941 + }, + { + "epoch": 1.1851557322730284, + "grad_norm": 0.7377922534942627, + "learning_rate": 3.3113944147829465e-06, + "loss": 0.01, + "num_input_tokens_seen": 17524968, + "step": 8942 + }, + { + "epoch": 1.1852882703777337, + "grad_norm": 13.687117576599121, + "learning_rate": 3.3110660616260433e-06, + "loss": 0.2918, + "num_input_tokens_seen": 17526744, + "step": 8943 + }, + { + "epoch": 1.1854208084824387, + "grad_norm": 3.5081520080566406, + "learning_rate": 3.3107376928312263e-06, + "loss": 0.0506, + "num_input_tokens_seen": 17528856, + "step": 8944 + }, + { + "epoch": 1.1855533465871437, + "grad_norm": 0.7712841629981995, + "learning_rate": 3.310409308404827e-06, + "loss": 0.0054, + "num_input_tokens_seen": 17530576, + "step": 8945 + }, + { + "epoch": 1.185685884691849, + "grad_norm": 0.2213091105222702, + "learning_rate": 3.3100809083531777e-06, + "loss": 0.0016, + "num_input_tokens_seen": 17532448, + "step": 8946 + }, + { + "epoch": 1.185818422796554, + "grad_norm": 4.120362758636475, + "learning_rate": 3.3097524926826095e-06, + "loss": 0.0328, + "num_input_tokens_seen": 17535016, + "step": 8947 + }, + { + "epoch": 1.185950960901259, + "grad_norm": 3.156841278076172, + "learning_rate": 3.3094240613994548e-06, + "loss": 0.0521, + "num_input_tokens_seen": 17537448, + "step": 8948 + }, + { + "epoch": 1.186083499005964, + "grad_norm": 0.6524286866188049, + "learning_rate": 3.3090956145100457e-06, + "loss": 0.0068, + "num_input_tokens_seen": 17538616, + "step": 8949 + }, + { + "epoch": 1.1862160371106694, + "grad_norm": 0.1091860681772232, + "learning_rate": 3.3087671520207153e-06, + "loss": 0.0008, + "num_input_tokens_seen": 17540080, + "step": 8950 + }, + { + "epoch": 1.1863485752153744, + "grad_norm": 0.5010088086128235, + "learning_rate": 3.3084386739377954e-06, + "loss": 0.0046, + "num_input_tokens_seen": 17541696, + "step": 8951 + }, + { + "epoch": 1.1864811133200794, + "grad_norm": 0.1223435252904892, + "learning_rate": 3.3081101802676207e-06, + "loss": 0.0009, + "num_input_tokens_seen": 17543224, + "step": 8952 + }, + { + "epoch": 1.1866136514247847, + "grad_norm": 0.1239573135972023, + "learning_rate": 3.3077816710165244e-06, + "loss": 0.0008, + "num_input_tokens_seen": 17544544, + "step": 8953 + }, + { + "epoch": 1.1867461895294897, + "grad_norm": 0.04506201297044754, + "learning_rate": 3.3074531461908392e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17546208, + "step": 8954 + }, + { + "epoch": 1.1868787276341948, + "grad_norm": 7.987022876739502, + "learning_rate": 3.307124605796902e-06, + "loss": 0.1541, + "num_input_tokens_seen": 17547984, + "step": 8955 + }, + { + "epoch": 1.1870112657388998, + "grad_norm": 4.274661540985107, + "learning_rate": 3.3067960498410446e-06, + "loss": 0.0573, + "num_input_tokens_seen": 17550576, + "step": 8956 + }, + { + "epoch": 1.187143803843605, + "grad_norm": 7.053033828735352, + "learning_rate": 3.3064674783296025e-06, + "loss": 0.1211, + "num_input_tokens_seen": 17552424, + "step": 8957 + }, + { + "epoch": 1.18727634194831, + "grad_norm": 11.39521598815918, + "learning_rate": 3.306138891268912e-06, + "loss": 0.1456, + "num_input_tokens_seen": 17554576, + "step": 8958 + }, + { + "epoch": 1.1874088800530151, + "grad_norm": 4.583521366119385, + "learning_rate": 3.3058102886653065e-06, + "loss": 0.0686, + "num_input_tokens_seen": 17556816, + "step": 8959 + }, + { + "epoch": 1.1875414181577204, + "grad_norm": 0.13757112622261047, + "learning_rate": 3.3054816705251237e-06, + "loss": 0.001, + "num_input_tokens_seen": 17558776, + "step": 8960 + }, + { + "epoch": 1.1876739562624254, + "grad_norm": 8.311262130737305, + "learning_rate": 3.305153036854698e-06, + "loss": 0.1432, + "num_input_tokens_seen": 17560280, + "step": 8961 + }, + { + "epoch": 1.1878064943671305, + "grad_norm": 7.239976406097412, + "learning_rate": 3.3048243876603665e-06, + "loss": 0.186, + "num_input_tokens_seen": 17563192, + "step": 8962 + }, + { + "epoch": 1.1879390324718357, + "grad_norm": 7.996603965759277, + "learning_rate": 3.3044957229484654e-06, + "loss": 0.0561, + "num_input_tokens_seen": 17565912, + "step": 8963 + }, + { + "epoch": 1.1880715705765408, + "grad_norm": 9.780413627624512, + "learning_rate": 3.3041670427253314e-06, + "loss": 0.0855, + "num_input_tokens_seen": 17567664, + "step": 8964 + }, + { + "epoch": 1.1882041086812458, + "grad_norm": 2.266666889190674, + "learning_rate": 3.3038383469973034e-06, + "loss": 0.0257, + "num_input_tokens_seen": 17569464, + "step": 8965 + }, + { + "epoch": 1.188336646785951, + "grad_norm": 0.07029636949300766, + "learning_rate": 3.3035096357707163e-06, + "loss": 0.0005, + "num_input_tokens_seen": 17571040, + "step": 8966 + }, + { + "epoch": 1.1884691848906561, + "grad_norm": 5.848216533660889, + "learning_rate": 3.3031809090519097e-06, + "loss": 0.1223, + "num_input_tokens_seen": 17573928, + "step": 8967 + }, + { + "epoch": 1.1886017229953612, + "grad_norm": 5.0672607421875, + "learning_rate": 3.3028521668472215e-06, + "loss": 0.0934, + "num_input_tokens_seen": 17576328, + "step": 8968 + }, + { + "epoch": 1.1887342611000662, + "grad_norm": 0.12875960767269135, + "learning_rate": 3.302523409162989e-06, + "loss": 0.0009, + "num_input_tokens_seen": 17578080, + "step": 8969 + }, + { + "epoch": 1.1888667992047715, + "grad_norm": 5.097253799438477, + "learning_rate": 3.3021946360055505e-06, + "loss": 0.1417, + "num_input_tokens_seen": 17579896, + "step": 8970 + }, + { + "epoch": 1.1889993373094765, + "grad_norm": 3.7811038494110107, + "learning_rate": 3.3018658473812482e-06, + "loss": 0.0347, + "num_input_tokens_seen": 17581976, + "step": 8971 + }, + { + "epoch": 1.1891318754141815, + "grad_norm": 4.580356597900391, + "learning_rate": 3.3015370432964174e-06, + "loss": 0.2121, + "num_input_tokens_seen": 17583864, + "step": 8972 + }, + { + "epoch": 1.1892644135188868, + "grad_norm": 7.334189414978027, + "learning_rate": 3.3012082237574e-06, + "loss": 0.2186, + "num_input_tokens_seen": 17585600, + "step": 8973 + }, + { + "epoch": 1.1893969516235918, + "grad_norm": 6.757203578948975, + "learning_rate": 3.300879388770536e-06, + "loss": 0.0448, + "num_input_tokens_seen": 17587744, + "step": 8974 + }, + { + "epoch": 1.1895294897282969, + "grad_norm": 9.832472801208496, + "learning_rate": 3.3005505383421638e-06, + "loss": 0.2743, + "num_input_tokens_seen": 17589872, + "step": 8975 + }, + { + "epoch": 1.189662027833002, + "grad_norm": 0.3736446499824524, + "learning_rate": 3.3002216724786257e-06, + "loss": 0.0025, + "num_input_tokens_seen": 17591536, + "step": 8976 + }, + { + "epoch": 1.1897945659377072, + "grad_norm": 13.027419090270996, + "learning_rate": 3.2998927911862614e-06, + "loss": 0.1662, + "num_input_tokens_seen": 17593056, + "step": 8977 + }, + { + "epoch": 1.1899271040424122, + "grad_norm": 0.26161283254623413, + "learning_rate": 3.2995638944714126e-06, + "loss": 0.0019, + "num_input_tokens_seen": 17594824, + "step": 8978 + }, + { + "epoch": 1.1900596421471172, + "grad_norm": 0.8130900263786316, + "learning_rate": 3.29923498234042e-06, + "loss": 0.0053, + "num_input_tokens_seen": 17597312, + "step": 8979 + }, + { + "epoch": 1.1901921802518225, + "grad_norm": 2.9022505283355713, + "learning_rate": 3.2989060547996254e-06, + "loss": 0.0152, + "num_input_tokens_seen": 17599880, + "step": 8980 + }, + { + "epoch": 1.1903247183565275, + "grad_norm": 4.475684642791748, + "learning_rate": 3.2985771118553713e-06, + "loss": 0.0408, + "num_input_tokens_seen": 17601632, + "step": 8981 + }, + { + "epoch": 1.1904572564612326, + "grad_norm": 10.67611026763916, + "learning_rate": 3.298248153514e-06, + "loss": 0.1873, + "num_input_tokens_seen": 17604240, + "step": 8982 + }, + { + "epoch": 1.1905897945659376, + "grad_norm": 8.216914176940918, + "learning_rate": 3.2979191797818525e-06, + "loss": 0.2709, + "num_input_tokens_seen": 17606472, + "step": 8983 + }, + { + "epoch": 1.1907223326706429, + "grad_norm": 6.191561698913574, + "learning_rate": 3.2975901906652734e-06, + "loss": 0.071, + "num_input_tokens_seen": 17608080, + "step": 8984 + }, + { + "epoch": 1.190854870775348, + "grad_norm": 0.023402512073516846, + "learning_rate": 3.297261186170605e-06, + "loss": 0.0002, + "num_input_tokens_seen": 17609248, + "step": 8985 + }, + { + "epoch": 1.190987408880053, + "grad_norm": 2.9093871116638184, + "learning_rate": 3.296932166304192e-06, + "loss": 0.0505, + "num_input_tokens_seen": 17611336, + "step": 8986 + }, + { + "epoch": 1.1911199469847582, + "grad_norm": 4.128095626831055, + "learning_rate": 3.2966031310723755e-06, + "loss": 0.0721, + "num_input_tokens_seen": 17613320, + "step": 8987 + }, + { + "epoch": 1.1912524850894632, + "grad_norm": 9.696879386901855, + "learning_rate": 3.296274080481502e-06, + "loss": 0.1604, + "num_input_tokens_seen": 17615312, + "step": 8988 + }, + { + "epoch": 1.1913850231941683, + "grad_norm": 16.401153564453125, + "learning_rate": 3.2959450145379147e-06, + "loss": 0.5523, + "num_input_tokens_seen": 17618800, + "step": 8989 + }, + { + "epoch": 1.1915175612988733, + "grad_norm": 12.310830116271973, + "learning_rate": 3.2956159332479574e-06, + "loss": 0.1986, + "num_input_tokens_seen": 17620864, + "step": 8990 + }, + { + "epoch": 1.1916500994035786, + "grad_norm": 1.4701000452041626, + "learning_rate": 3.295286836617978e-06, + "loss": 0.0098, + "num_input_tokens_seen": 17623656, + "step": 8991 + }, + { + "epoch": 1.1917826375082836, + "grad_norm": 0.0612865649163723, + "learning_rate": 3.2949577246543183e-06, + "loss": 0.0004, + "num_input_tokens_seen": 17625256, + "step": 8992 + }, + { + "epoch": 1.1919151756129887, + "grad_norm": 12.265141487121582, + "learning_rate": 3.294628597363326e-06, + "loss": 0.1129, + "num_input_tokens_seen": 17627856, + "step": 8993 + }, + { + "epoch": 1.192047713717694, + "grad_norm": 12.065597534179688, + "learning_rate": 3.2942994547513456e-06, + "loss": 0.1572, + "num_input_tokens_seen": 17629656, + "step": 8994 + }, + { + "epoch": 1.192180251822399, + "grad_norm": 17.106163024902344, + "learning_rate": 3.2939702968247238e-06, + "loss": 0.4284, + "num_input_tokens_seen": 17631256, + "step": 8995 + }, + { + "epoch": 1.192312789927104, + "grad_norm": 7.499751567840576, + "learning_rate": 3.2936411235898067e-06, + "loss": 0.0815, + "num_input_tokens_seen": 17632888, + "step": 8996 + }, + { + "epoch": 1.192445328031809, + "grad_norm": 5.12352991104126, + "learning_rate": 3.2933119350529426e-06, + "loss": 0.0923, + "num_input_tokens_seen": 17634832, + "step": 8997 + }, + { + "epoch": 1.1925778661365143, + "grad_norm": 6.49252986907959, + "learning_rate": 3.292982731220476e-06, + "loss": 0.0536, + "num_input_tokens_seen": 17637480, + "step": 8998 + }, + { + "epoch": 1.1927104042412193, + "grad_norm": 8.960293769836426, + "learning_rate": 3.2926535120987558e-06, + "loss": 0.1325, + "num_input_tokens_seen": 17639104, + "step": 8999 + }, + { + "epoch": 1.1928429423459244, + "grad_norm": 0.037488870322704315, + "learning_rate": 3.2923242776941288e-06, + "loss": 0.0002, + "num_input_tokens_seen": 17640736, + "step": 9000 + }, + { + "epoch": 1.1929754804506296, + "grad_norm": 7.179965019226074, + "learning_rate": 3.2919950280129433e-06, + "loss": 0.1689, + "num_input_tokens_seen": 17643488, + "step": 9001 + }, + { + "epoch": 1.1931080185553347, + "grad_norm": 4.196895122528076, + "learning_rate": 3.291665763061547e-06, + "loss": 0.0837, + "num_input_tokens_seen": 17645608, + "step": 9002 + }, + { + "epoch": 1.1932405566600397, + "grad_norm": 5.421475887298584, + "learning_rate": 3.291336482846288e-06, + "loss": 0.1259, + "num_input_tokens_seen": 17647704, + "step": 9003 + }, + { + "epoch": 1.193373094764745, + "grad_norm": 7.672059059143066, + "learning_rate": 3.2910071873735172e-06, + "loss": 0.3219, + "num_input_tokens_seen": 17649664, + "step": 9004 + }, + { + "epoch": 1.19350563286945, + "grad_norm": 5.5291948318481445, + "learning_rate": 3.2906778766495806e-06, + "loss": 0.0851, + "num_input_tokens_seen": 17651896, + "step": 9005 + }, + { + "epoch": 1.193638170974155, + "grad_norm": 10.174932479858398, + "learning_rate": 3.2903485506808296e-06, + "loss": 0.1148, + "num_input_tokens_seen": 17653072, + "step": 9006 + }, + { + "epoch": 1.1937707090788603, + "grad_norm": 0.3604847490787506, + "learning_rate": 3.2900192094736138e-06, + "loss": 0.0031, + "num_input_tokens_seen": 17654368, + "step": 9007 + }, + { + "epoch": 1.1939032471835653, + "grad_norm": 10.02323055267334, + "learning_rate": 3.2896898530342823e-06, + "loss": 0.2756, + "num_input_tokens_seen": 17656320, + "step": 9008 + }, + { + "epoch": 1.1940357852882704, + "grad_norm": 0.30341312289237976, + "learning_rate": 3.289360481369186e-06, + "loss": 0.0022, + "num_input_tokens_seen": 17658056, + "step": 9009 + }, + { + "epoch": 1.1941683233929754, + "grad_norm": 4.6171369552612305, + "learning_rate": 3.289031094484675e-06, + "loss": 0.0931, + "num_input_tokens_seen": 17659696, + "step": 9010 + }, + { + "epoch": 1.1943008614976807, + "grad_norm": 0.09706432372331619, + "learning_rate": 3.2887016923870997e-06, + "loss": 0.0012, + "num_input_tokens_seen": 17661672, + "step": 9011 + }, + { + "epoch": 1.1944333996023857, + "grad_norm": 1.645926833152771, + "learning_rate": 3.2883722750828123e-06, + "loss": 0.0135, + "num_input_tokens_seen": 17663312, + "step": 9012 + }, + { + "epoch": 1.1945659377070907, + "grad_norm": 0.04872516915202141, + "learning_rate": 3.288042842578163e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17664896, + "step": 9013 + }, + { + "epoch": 1.194698475811796, + "grad_norm": 5.716052055358887, + "learning_rate": 3.2877133948795035e-06, + "loss": 0.2099, + "num_input_tokens_seen": 17666336, + "step": 9014 + }, + { + "epoch": 1.194831013916501, + "grad_norm": 4.82434606552124, + "learning_rate": 3.2873839319931875e-06, + "loss": 0.091, + "num_input_tokens_seen": 17668248, + "step": 9015 + }, + { + "epoch": 1.194963552021206, + "grad_norm": 4.683647155761719, + "learning_rate": 3.287054453925566e-06, + "loss": 0.0381, + "num_input_tokens_seen": 17669448, + "step": 9016 + }, + { + "epoch": 1.1950960901259111, + "grad_norm": 8.186884880065918, + "learning_rate": 3.2867249606829908e-06, + "loss": 0.169, + "num_input_tokens_seen": 17670992, + "step": 9017 + }, + { + "epoch": 1.1952286282306164, + "grad_norm": 0.03388563543558121, + "learning_rate": 3.286395452271817e-06, + "loss": 0.0002, + "num_input_tokens_seen": 17672632, + "step": 9018 + }, + { + "epoch": 1.1953611663353214, + "grad_norm": 16.840633392333984, + "learning_rate": 3.2860659286983954e-06, + "loss": 0.5339, + "num_input_tokens_seen": 17675920, + "step": 9019 + }, + { + "epoch": 1.1954937044400265, + "grad_norm": 5.57549524307251, + "learning_rate": 3.28573638996908e-06, + "loss": 0.1376, + "num_input_tokens_seen": 17677736, + "step": 9020 + }, + { + "epoch": 1.1956262425447317, + "grad_norm": 0.05066152289509773, + "learning_rate": 3.2854068360902257e-06, + "loss": 0.0004, + "num_input_tokens_seen": 17681104, + "step": 9021 + }, + { + "epoch": 1.1957587806494367, + "grad_norm": 7.6431884765625, + "learning_rate": 3.2850772670681856e-06, + "loss": 0.1038, + "num_input_tokens_seen": 17683144, + "step": 9022 + }, + { + "epoch": 1.1958913187541418, + "grad_norm": 6.904979705810547, + "learning_rate": 3.2847476829093145e-06, + "loss": 0.0702, + "num_input_tokens_seen": 17684984, + "step": 9023 + }, + { + "epoch": 1.1960238568588468, + "grad_norm": 3.5132839679718018, + "learning_rate": 3.284418083619966e-06, + "loss": 0.0155, + "num_input_tokens_seen": 17686376, + "step": 9024 + }, + { + "epoch": 1.196156394963552, + "grad_norm": 0.14713510870933533, + "learning_rate": 3.284088469206496e-06, + "loss": 0.0018, + "num_input_tokens_seen": 17687480, + "step": 9025 + }, + { + "epoch": 1.1962889330682571, + "grad_norm": 3.8775322437286377, + "learning_rate": 3.2837588396752603e-06, + "loss": 0.0194, + "num_input_tokens_seen": 17689552, + "step": 9026 + }, + { + "epoch": 1.1964214711729622, + "grad_norm": 7.628950595855713, + "learning_rate": 3.2834291950326124e-06, + "loss": 0.1294, + "num_input_tokens_seen": 17692072, + "step": 9027 + }, + { + "epoch": 1.1965540092776674, + "grad_norm": 0.08033449202775955, + "learning_rate": 3.2830995352849093e-06, + "loss": 0.001, + "num_input_tokens_seen": 17693856, + "step": 9028 + }, + { + "epoch": 1.1966865473823725, + "grad_norm": 7.240048885345459, + "learning_rate": 3.2827698604385073e-06, + "loss": 0.224, + "num_input_tokens_seen": 17695360, + "step": 9029 + }, + { + "epoch": 1.1968190854870775, + "grad_norm": 9.042844772338867, + "learning_rate": 3.282440170499762e-06, + "loss": 0.1262, + "num_input_tokens_seen": 17696696, + "step": 9030 + }, + { + "epoch": 1.1969516235917825, + "grad_norm": 9.930558204650879, + "learning_rate": 3.282110465475031e-06, + "loss": 0.3018, + "num_input_tokens_seen": 17698800, + "step": 9031 + }, + { + "epoch": 1.1970841616964878, + "grad_norm": 5.182363510131836, + "learning_rate": 3.2817807453706697e-06, + "loss": 0.0559, + "num_input_tokens_seen": 17701168, + "step": 9032 + }, + { + "epoch": 1.1972166998011928, + "grad_norm": 6.812538146972656, + "learning_rate": 3.2814510101930373e-06, + "loss": 0.1047, + "num_input_tokens_seen": 17703496, + "step": 9033 + }, + { + "epoch": 1.1973492379058979, + "grad_norm": 0.19804666936397552, + "learning_rate": 3.2811212599484892e-06, + "loss": 0.0014, + "num_input_tokens_seen": 17705792, + "step": 9034 + }, + { + "epoch": 1.1974817760106031, + "grad_norm": 0.279073029756546, + "learning_rate": 3.2807914946433857e-06, + "loss": 0.0019, + "num_input_tokens_seen": 17707704, + "step": 9035 + }, + { + "epoch": 1.1976143141153082, + "grad_norm": 0.7152688503265381, + "learning_rate": 3.2804617142840834e-06, + "loss": 0.007, + "num_input_tokens_seen": 17709504, + "step": 9036 + }, + { + "epoch": 1.1977468522200132, + "grad_norm": 0.12465457618236542, + "learning_rate": 3.2801319188769392e-06, + "loss": 0.0009, + "num_input_tokens_seen": 17710728, + "step": 9037 + }, + { + "epoch": 1.1978793903247182, + "grad_norm": 5.552314758300781, + "learning_rate": 3.279802108428315e-06, + "loss": 0.0662, + "num_input_tokens_seen": 17712688, + "step": 9038 + }, + { + "epoch": 1.1980119284294235, + "grad_norm": 3.6004574298858643, + "learning_rate": 3.279472282944568e-06, + "loss": 0.0351, + "num_input_tokens_seen": 17714632, + "step": 9039 + }, + { + "epoch": 1.1981444665341285, + "grad_norm": 0.7143848538398743, + "learning_rate": 3.2791424424320568e-06, + "loss": 0.0051, + "num_input_tokens_seen": 17716360, + "step": 9040 + }, + { + "epoch": 1.1982770046388336, + "grad_norm": 0.11256122589111328, + "learning_rate": 3.2788125868971434e-06, + "loss": 0.0007, + "num_input_tokens_seen": 17717808, + "step": 9041 + }, + { + "epoch": 1.1984095427435388, + "grad_norm": 3.1990604400634766, + "learning_rate": 3.278482716346184e-06, + "loss": 0.0394, + "num_input_tokens_seen": 17720432, + "step": 9042 + }, + { + "epoch": 1.1985420808482439, + "grad_norm": 5.13429069519043, + "learning_rate": 3.278152830785542e-06, + "loss": 0.1068, + "num_input_tokens_seen": 17722160, + "step": 9043 + }, + { + "epoch": 1.198674618952949, + "grad_norm": 0.10997794568538666, + "learning_rate": 3.2778229302215768e-06, + "loss": 0.0008, + "num_input_tokens_seen": 17723800, + "step": 9044 + }, + { + "epoch": 1.1988071570576542, + "grad_norm": 2.9213993549346924, + "learning_rate": 3.2774930146606487e-06, + "loss": 0.028, + "num_input_tokens_seen": 17725656, + "step": 9045 + }, + { + "epoch": 1.1989396951623592, + "grad_norm": 0.1360163390636444, + "learning_rate": 3.2771630841091182e-06, + "loss": 0.0009, + "num_input_tokens_seen": 17729240, + "step": 9046 + }, + { + "epoch": 1.1990722332670642, + "grad_norm": 10.870383262634277, + "learning_rate": 3.2768331385733475e-06, + "loss": 0.2757, + "num_input_tokens_seen": 17731104, + "step": 9047 + }, + { + "epoch": 1.1992047713717695, + "grad_norm": 1.0196704864501953, + "learning_rate": 3.276503178059699e-06, + "loss": 0.0093, + "num_input_tokens_seen": 17733072, + "step": 9048 + }, + { + "epoch": 1.1993373094764745, + "grad_norm": 6.064451694488525, + "learning_rate": 3.2761732025745328e-06, + "loss": 0.198, + "num_input_tokens_seen": 17734784, + "step": 9049 + }, + { + "epoch": 1.1994698475811796, + "grad_norm": 6.145493507385254, + "learning_rate": 3.2758432121242115e-06, + "loss": 0.1355, + "num_input_tokens_seen": 17737320, + "step": 9050 + }, + { + "epoch": 1.1996023856858846, + "grad_norm": 10.204673767089844, + "learning_rate": 3.2755132067150987e-06, + "loss": 0.1281, + "num_input_tokens_seen": 17739680, + "step": 9051 + }, + { + "epoch": 1.1997349237905899, + "grad_norm": 4.3429365158081055, + "learning_rate": 3.2751831863535556e-06, + "loss": 0.0799, + "num_input_tokens_seen": 17741168, + "step": 9052 + }, + { + "epoch": 1.199867461895295, + "grad_norm": 0.06358958780765533, + "learning_rate": 3.274853151045946e-06, + "loss": 0.0004, + "num_input_tokens_seen": 17742344, + "step": 9053 + }, + { + "epoch": 1.2, + "grad_norm": 0.05594007298350334, + "learning_rate": 3.274523100798634e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17744840, + "step": 9054 + }, + { + "epoch": 1.2001325381047052, + "grad_norm": 0.9304759502410889, + "learning_rate": 3.2741930356179806e-06, + "loss": 0.002, + "num_input_tokens_seen": 17746592, + "step": 9055 + }, + { + "epoch": 1.2002650762094103, + "grad_norm": 5.205002784729004, + "learning_rate": 3.273862955510353e-06, + "loss": 0.0965, + "num_input_tokens_seen": 17747792, + "step": 9056 + }, + { + "epoch": 1.2003976143141153, + "grad_norm": 1.2414475679397583, + "learning_rate": 3.2735328604821132e-06, + "loss": 0.0082, + "num_input_tokens_seen": 17749272, + "step": 9057 + }, + { + "epoch": 1.2005301524188203, + "grad_norm": 12.819445610046387, + "learning_rate": 3.273202750539625e-06, + "loss": 0.2514, + "num_input_tokens_seen": 17751280, + "step": 9058 + }, + { + "epoch": 1.2006626905235256, + "grad_norm": 0.3836911618709564, + "learning_rate": 3.2728726256892563e-06, + "loss": 0.002, + "num_input_tokens_seen": 17752736, + "step": 9059 + }, + { + "epoch": 1.2007952286282306, + "grad_norm": 7.996298789978027, + "learning_rate": 3.272542485937369e-06, + "loss": 0.1837, + "num_input_tokens_seen": 17754400, + "step": 9060 + }, + { + "epoch": 1.2009277667329357, + "grad_norm": 5.354411602020264, + "learning_rate": 3.2722123312903296e-06, + "loss": 0.1867, + "num_input_tokens_seen": 17756120, + "step": 9061 + }, + { + "epoch": 1.201060304837641, + "grad_norm": 0.02621486596763134, + "learning_rate": 3.2718821617545047e-06, + "loss": 0.0002, + "num_input_tokens_seen": 17757152, + "step": 9062 + }, + { + "epoch": 1.201192842942346, + "grad_norm": 3.123389959335327, + "learning_rate": 3.271551977336258e-06, + "loss": 0.0952, + "num_input_tokens_seen": 17758448, + "step": 9063 + }, + { + "epoch": 1.201325381047051, + "grad_norm": 4.6133880615234375, + "learning_rate": 3.2712217780419575e-06, + "loss": 0.0581, + "num_input_tokens_seen": 17760232, + "step": 9064 + }, + { + "epoch": 1.201457919151756, + "grad_norm": 2.612525463104248, + "learning_rate": 3.2708915638779694e-06, + "loss": 0.0239, + "num_input_tokens_seen": 17761992, + "step": 9065 + }, + { + "epoch": 1.2015904572564613, + "grad_norm": 4.150389671325684, + "learning_rate": 3.2705613348506597e-06, + "loss": 0.0844, + "num_input_tokens_seen": 17764400, + "step": 9066 + }, + { + "epoch": 1.2017229953611663, + "grad_norm": 0.6718852519989014, + "learning_rate": 3.270231090966396e-06, + "loss": 0.0032, + "num_input_tokens_seen": 17767248, + "step": 9067 + }, + { + "epoch": 1.2018555334658714, + "grad_norm": 1.9013322591781616, + "learning_rate": 3.2699008322315456e-06, + "loss": 0.0094, + "num_input_tokens_seen": 17768936, + "step": 9068 + }, + { + "epoch": 1.2019880715705766, + "grad_norm": 0.08963616192340851, + "learning_rate": 3.2695705586524764e-06, + "loss": 0.0006, + "num_input_tokens_seen": 17770288, + "step": 9069 + }, + { + "epoch": 1.2021206096752817, + "grad_norm": 9.684527397155762, + "learning_rate": 3.2692402702355554e-06, + "loss": 0.1807, + "num_input_tokens_seen": 17772736, + "step": 9070 + }, + { + "epoch": 1.2022531477799867, + "grad_norm": 11.210332870483398, + "learning_rate": 3.268909966987151e-06, + "loss": 0.3761, + "num_input_tokens_seen": 17774464, + "step": 9071 + }, + { + "epoch": 1.2023856858846917, + "grad_norm": 0.33393895626068115, + "learning_rate": 3.2685796489136324e-06, + "loss": 0.0022, + "num_input_tokens_seen": 17776136, + "step": 9072 + }, + { + "epoch": 1.202518223989397, + "grad_norm": 6.568004608154297, + "learning_rate": 3.268249316021368e-06, + "loss": 0.1201, + "num_input_tokens_seen": 17778136, + "step": 9073 + }, + { + "epoch": 1.202650762094102, + "grad_norm": 0.0875641405582428, + "learning_rate": 3.2679189683167267e-06, + "loss": 0.0006, + "num_input_tokens_seen": 17779536, + "step": 9074 + }, + { + "epoch": 1.202783300198807, + "grad_norm": 5.369405746459961, + "learning_rate": 3.267588605806078e-06, + "loss": 0.0314, + "num_input_tokens_seen": 17781408, + "step": 9075 + }, + { + "epoch": 1.2029158383035123, + "grad_norm": 8.198540687561035, + "learning_rate": 3.267258228495791e-06, + "loss": 0.1699, + "num_input_tokens_seen": 17783072, + "step": 9076 + }, + { + "epoch": 1.2030483764082174, + "grad_norm": 2.685887098312378, + "learning_rate": 3.266927836392237e-06, + "loss": 0.023, + "num_input_tokens_seen": 17784840, + "step": 9077 + }, + { + "epoch": 1.2031809145129224, + "grad_norm": 0.10177570581436157, + "learning_rate": 3.2665974295017845e-06, + "loss": 0.0007, + "num_input_tokens_seen": 17787088, + "step": 9078 + }, + { + "epoch": 1.2033134526176275, + "grad_norm": 4.595022201538086, + "learning_rate": 3.266267007830805e-06, + "loss": 0.0801, + "num_input_tokens_seen": 17788616, + "step": 9079 + }, + { + "epoch": 1.2034459907223327, + "grad_norm": 13.191292762756348, + "learning_rate": 3.2659365713856695e-06, + "loss": 0.2605, + "num_input_tokens_seen": 17791240, + "step": 9080 + }, + { + "epoch": 1.2035785288270378, + "grad_norm": 0.11333123594522476, + "learning_rate": 3.265606120172747e-06, + "loss": 0.0008, + "num_input_tokens_seen": 17794688, + "step": 9081 + }, + { + "epoch": 1.2037110669317428, + "grad_norm": 4.5614848136901855, + "learning_rate": 3.2652756541984114e-06, + "loss": 0.0577, + "num_input_tokens_seen": 17796664, + "step": 9082 + }, + { + "epoch": 1.203843605036448, + "grad_norm": 8.041604995727539, + "learning_rate": 3.2649451734690336e-06, + "loss": 0.2329, + "num_input_tokens_seen": 17798968, + "step": 9083 + }, + { + "epoch": 1.203976143141153, + "grad_norm": 0.04253728687763214, + "learning_rate": 3.2646146779909843e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17800368, + "step": 9084 + }, + { + "epoch": 1.2041086812458581, + "grad_norm": 11.104769706726074, + "learning_rate": 3.264284167770637e-06, + "loss": 0.1891, + "num_input_tokens_seen": 17801976, + "step": 9085 + }, + { + "epoch": 1.2042412193505632, + "grad_norm": 2.4175796508789062, + "learning_rate": 3.263953642814363e-06, + "loss": 0.0796, + "num_input_tokens_seen": 17803432, + "step": 9086 + }, + { + "epoch": 1.2043737574552684, + "grad_norm": 0.16529804468154907, + "learning_rate": 3.2636231031285364e-06, + "loss": 0.0012, + "num_input_tokens_seen": 17804968, + "step": 9087 + }, + { + "epoch": 1.2045062955599735, + "grad_norm": 0.6849271655082703, + "learning_rate": 3.2632925487195297e-06, + "loss": 0.004, + "num_input_tokens_seen": 17806272, + "step": 9088 + }, + { + "epoch": 1.2046388336646785, + "grad_norm": 12.577308654785156, + "learning_rate": 3.262961979593716e-06, + "loss": 0.2467, + "num_input_tokens_seen": 17807776, + "step": 9089 + }, + { + "epoch": 1.2047713717693838, + "grad_norm": 0.7359256744384766, + "learning_rate": 3.2626313957574685e-06, + "loss": 0.0051, + "num_input_tokens_seen": 17809488, + "step": 9090 + }, + { + "epoch": 1.2049039098740888, + "grad_norm": 7.815922260284424, + "learning_rate": 3.262300797217162e-06, + "loss": 0.1023, + "num_input_tokens_seen": 17811160, + "step": 9091 + }, + { + "epoch": 1.2050364479787938, + "grad_norm": 12.068278312683105, + "learning_rate": 3.2619701839791707e-06, + "loss": 0.1517, + "num_input_tokens_seen": 17813096, + "step": 9092 + }, + { + "epoch": 1.205168986083499, + "grad_norm": 0.09156709164381027, + "learning_rate": 3.2616395560498675e-06, + "loss": 0.0006, + "num_input_tokens_seen": 17815408, + "step": 9093 + }, + { + "epoch": 1.2053015241882041, + "grad_norm": 16.273731231689453, + "learning_rate": 3.261308913435629e-06, + "loss": 0.4094, + "num_input_tokens_seen": 17818096, + "step": 9094 + }, + { + "epoch": 1.2054340622929092, + "grad_norm": 2.3563897609710693, + "learning_rate": 3.2609782561428293e-06, + "loss": 0.0266, + "num_input_tokens_seen": 17819792, + "step": 9095 + }, + { + "epoch": 1.2055666003976144, + "grad_norm": 5.291491985321045, + "learning_rate": 3.260647584177844e-06, + "loss": 0.0359, + "num_input_tokens_seen": 17821184, + "step": 9096 + }, + { + "epoch": 1.2056991385023195, + "grad_norm": 3.2442591190338135, + "learning_rate": 3.260316897547048e-06, + "loss": 0.0523, + "num_input_tokens_seen": 17823208, + "step": 9097 + }, + { + "epoch": 1.2058316766070245, + "grad_norm": 9.191383361816406, + "learning_rate": 3.2599861962568187e-06, + "loss": 0.0982, + "num_input_tokens_seen": 17825672, + "step": 9098 + }, + { + "epoch": 1.2059642147117295, + "grad_norm": 0.047080304473638535, + "learning_rate": 3.2596554803135294e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17827248, + "step": 9099 + }, + { + "epoch": 1.2060967528164348, + "grad_norm": 9.187613487243652, + "learning_rate": 3.2593247497235603e-06, + "loss": 0.1071, + "num_input_tokens_seen": 17829632, + "step": 9100 + }, + { + "epoch": 1.2062292909211398, + "grad_norm": 4.77036190032959, + "learning_rate": 3.2589940044932856e-06, + "loss": 0.1297, + "num_input_tokens_seen": 17831104, + "step": 9101 + }, + { + "epoch": 1.2063618290258449, + "grad_norm": 0.4092884361743927, + "learning_rate": 3.2586632446290824e-06, + "loss": 0.0018, + "num_input_tokens_seen": 17832520, + "step": 9102 + }, + { + "epoch": 1.2064943671305501, + "grad_norm": 0.14627094566822052, + "learning_rate": 3.258332470137329e-06, + "loss": 0.0009, + "num_input_tokens_seen": 17834296, + "step": 9103 + }, + { + "epoch": 1.2066269052352552, + "grad_norm": 10.010648727416992, + "learning_rate": 3.258001681024402e-06, + "loss": 0.237, + "num_input_tokens_seen": 17836360, + "step": 9104 + }, + { + "epoch": 1.2067594433399602, + "grad_norm": 5.303126335144043, + "learning_rate": 3.25767087729668e-06, + "loss": 0.042, + "num_input_tokens_seen": 17837808, + "step": 9105 + }, + { + "epoch": 1.2068919814446653, + "grad_norm": 0.16575190424919128, + "learning_rate": 3.257340058960541e-06, + "loss": 0.0011, + "num_input_tokens_seen": 17839848, + "step": 9106 + }, + { + "epoch": 1.2070245195493705, + "grad_norm": 0.10358315706253052, + "learning_rate": 3.257009226022363e-06, + "loss": 0.0007, + "num_input_tokens_seen": 17841320, + "step": 9107 + }, + { + "epoch": 1.2071570576540755, + "grad_norm": 11.738689422607422, + "learning_rate": 3.2566783784885248e-06, + "loss": 0.2074, + "num_input_tokens_seen": 17843128, + "step": 9108 + }, + { + "epoch": 1.2072895957587806, + "grad_norm": 7.097743511199951, + "learning_rate": 3.256347516365405e-06, + "loss": 0.1716, + "num_input_tokens_seen": 17845136, + "step": 9109 + }, + { + "epoch": 1.2074221338634858, + "grad_norm": 1.5800695419311523, + "learning_rate": 3.256016639659384e-06, + "loss": 0.0137, + "num_input_tokens_seen": 17847464, + "step": 9110 + }, + { + "epoch": 1.2075546719681909, + "grad_norm": 3.827202320098877, + "learning_rate": 3.2556857483768407e-06, + "loss": 0.0334, + "num_input_tokens_seen": 17848656, + "step": 9111 + }, + { + "epoch": 1.207687210072896, + "grad_norm": 0.06750406324863434, + "learning_rate": 3.2553548425241542e-06, + "loss": 0.0005, + "num_input_tokens_seen": 17850016, + "step": 9112 + }, + { + "epoch": 1.207819748177601, + "grad_norm": 13.304123878479004, + "learning_rate": 3.255023922107706e-06, + "loss": 0.3205, + "num_input_tokens_seen": 17852712, + "step": 9113 + }, + { + "epoch": 1.2079522862823062, + "grad_norm": 10.418885231018066, + "learning_rate": 3.254692987133875e-06, + "loss": 0.1812, + "num_input_tokens_seen": 17854584, + "step": 9114 + }, + { + "epoch": 1.2080848243870113, + "grad_norm": 8.750944137573242, + "learning_rate": 3.254362037609043e-06, + "loss": 0.1287, + "num_input_tokens_seen": 17855984, + "step": 9115 + }, + { + "epoch": 1.2082173624917163, + "grad_norm": 0.18187783658504486, + "learning_rate": 3.254031073539591e-06, + "loss": 0.0013, + "num_input_tokens_seen": 17857952, + "step": 9116 + }, + { + "epoch": 1.2083499005964216, + "grad_norm": 4.154590606689453, + "learning_rate": 3.253700094931898e-06, + "loss": 0.0357, + "num_input_tokens_seen": 17859240, + "step": 9117 + }, + { + "epoch": 1.2084824387011266, + "grad_norm": 7.514345645904541, + "learning_rate": 3.2533691017923486e-06, + "loss": 0.2115, + "num_input_tokens_seen": 17861104, + "step": 9118 + }, + { + "epoch": 1.2086149768058316, + "grad_norm": 11.65219497680664, + "learning_rate": 3.2530380941273234e-06, + "loss": 0.2162, + "num_input_tokens_seen": 17862760, + "step": 9119 + }, + { + "epoch": 1.2087475149105367, + "grad_norm": 10.27974796295166, + "learning_rate": 3.252707071943203e-06, + "loss": 0.1028, + "num_input_tokens_seen": 17865272, + "step": 9120 + }, + { + "epoch": 1.208880053015242, + "grad_norm": 10.581058502197266, + "learning_rate": 3.2523760352463722e-06, + "loss": 0.1422, + "num_input_tokens_seen": 17867624, + "step": 9121 + }, + { + "epoch": 1.209012591119947, + "grad_norm": 5.052036762237549, + "learning_rate": 3.2520449840432123e-06, + "loss": 0.0524, + "num_input_tokens_seen": 17869272, + "step": 9122 + }, + { + "epoch": 1.209145129224652, + "grad_norm": 9.949357986450195, + "learning_rate": 3.2517139183401058e-06, + "loss": 0.2596, + "num_input_tokens_seen": 17871256, + "step": 9123 + }, + { + "epoch": 1.2092776673293573, + "grad_norm": 0.14825275540351868, + "learning_rate": 3.2513828381434367e-06, + "loss": 0.0009, + "num_input_tokens_seen": 17872840, + "step": 9124 + }, + { + "epoch": 1.2094102054340623, + "grad_norm": 0.0769953653216362, + "learning_rate": 3.251051743459588e-06, + "loss": 0.0005, + "num_input_tokens_seen": 17874736, + "step": 9125 + }, + { + "epoch": 1.2095427435387673, + "grad_norm": 5.3438401222229, + "learning_rate": 3.2507206342949434e-06, + "loss": 0.0853, + "num_input_tokens_seen": 17877424, + "step": 9126 + }, + { + "epoch": 1.2096752816434724, + "grad_norm": 2.3395495414733887, + "learning_rate": 3.2503895106558874e-06, + "loss": 0.0505, + "num_input_tokens_seen": 17879200, + "step": 9127 + }, + { + "epoch": 1.2098078197481776, + "grad_norm": 14.266555786132812, + "learning_rate": 3.2500583725488034e-06, + "loss": 0.2809, + "num_input_tokens_seen": 17880888, + "step": 9128 + }, + { + "epoch": 1.2099403578528827, + "grad_norm": 13.303766250610352, + "learning_rate": 3.2497272199800766e-06, + "loss": 0.4695, + "num_input_tokens_seen": 17883760, + "step": 9129 + }, + { + "epoch": 1.2100728959575877, + "grad_norm": 2.8186981678009033, + "learning_rate": 3.2493960529560925e-06, + "loss": 0.0267, + "num_input_tokens_seen": 17885624, + "step": 9130 + }, + { + "epoch": 1.210205434062293, + "grad_norm": 3.4796862602233887, + "learning_rate": 3.249064871483235e-06, + "loss": 0.0457, + "num_input_tokens_seen": 17887592, + "step": 9131 + }, + { + "epoch": 1.210337972166998, + "grad_norm": 4.06358003616333, + "learning_rate": 3.2487336755678905e-06, + "loss": 0.0551, + "num_input_tokens_seen": 17889600, + "step": 9132 + }, + { + "epoch": 1.210470510271703, + "grad_norm": 0.035727713257074356, + "learning_rate": 3.2484024652164436e-06, + "loss": 0.0002, + "num_input_tokens_seen": 17891656, + "step": 9133 + }, + { + "epoch": 1.2106030483764083, + "grad_norm": 7.055615425109863, + "learning_rate": 3.2480712404352815e-06, + "loss": 0.1159, + "num_input_tokens_seen": 17893344, + "step": 9134 + }, + { + "epoch": 1.2107355864811133, + "grad_norm": 5.3411736488342285, + "learning_rate": 3.2477400012307885e-06, + "loss": 0.0823, + "num_input_tokens_seen": 17894912, + "step": 9135 + }, + { + "epoch": 1.2108681245858184, + "grad_norm": 0.3413936495780945, + "learning_rate": 3.247408747609354e-06, + "loss": 0.0035, + "num_input_tokens_seen": 17897056, + "step": 9136 + }, + { + "epoch": 1.2110006626905236, + "grad_norm": 18.52158546447754, + "learning_rate": 3.247077479577363e-06, + "loss": 0.143, + "num_input_tokens_seen": 17899200, + "step": 9137 + }, + { + "epoch": 1.2111332007952287, + "grad_norm": 3.46530818939209, + "learning_rate": 3.2467461971412016e-06, + "loss": 0.0351, + "num_input_tokens_seen": 17901040, + "step": 9138 + }, + { + "epoch": 1.2112657388999337, + "grad_norm": 0.16881491243839264, + "learning_rate": 3.2464149003072597e-06, + "loss": 0.001, + "num_input_tokens_seen": 17902632, + "step": 9139 + }, + { + "epoch": 1.2113982770046388, + "grad_norm": 5.722073554992676, + "learning_rate": 3.246083589081923e-06, + "loss": 0.0456, + "num_input_tokens_seen": 17905208, + "step": 9140 + }, + { + "epoch": 1.211530815109344, + "grad_norm": 4.076426982879639, + "learning_rate": 3.2457522634715797e-06, + "loss": 0.0407, + "num_input_tokens_seen": 17906880, + "step": 9141 + }, + { + "epoch": 1.211663353214049, + "grad_norm": 0.6941167712211609, + "learning_rate": 3.2454209234826195e-06, + "loss": 0.0032, + "num_input_tokens_seen": 17908256, + "step": 9142 + }, + { + "epoch": 1.211795891318754, + "grad_norm": 10.72095775604248, + "learning_rate": 3.2450895691214283e-06, + "loss": 0.2082, + "num_input_tokens_seen": 17910224, + "step": 9143 + }, + { + "epoch": 1.2119284294234594, + "grad_norm": 0.7398728132247925, + "learning_rate": 3.2447582003943966e-06, + "loss": 0.009, + "num_input_tokens_seen": 17911912, + "step": 9144 + }, + { + "epoch": 1.2120609675281644, + "grad_norm": 2.5592658519744873, + "learning_rate": 3.2444268173079143e-06, + "loss": 0.0275, + "num_input_tokens_seen": 17914024, + "step": 9145 + }, + { + "epoch": 1.2121935056328694, + "grad_norm": 0.03525635600090027, + "learning_rate": 3.2440954198683676e-06, + "loss": 0.0002, + "num_input_tokens_seen": 17916064, + "step": 9146 + }, + { + "epoch": 1.2123260437375745, + "grad_norm": 6.539280891418457, + "learning_rate": 3.2437640080821487e-06, + "loss": 0.1487, + "num_input_tokens_seen": 17918920, + "step": 9147 + }, + { + "epoch": 1.2124585818422797, + "grad_norm": 0.02743186056613922, + "learning_rate": 3.243432581955646e-06, + "loss": 0.0002, + "num_input_tokens_seen": 17920616, + "step": 9148 + }, + { + "epoch": 1.2125911199469848, + "grad_norm": 28.531972885131836, + "learning_rate": 3.2431011414952514e-06, + "loss": 0.2476, + "num_input_tokens_seen": 17922696, + "step": 9149 + }, + { + "epoch": 1.2127236580516898, + "grad_norm": 4.468481063842773, + "learning_rate": 3.2427696867073534e-06, + "loss": 0.0675, + "num_input_tokens_seen": 17924424, + "step": 9150 + }, + { + "epoch": 1.212856196156395, + "grad_norm": 0.10224050283432007, + "learning_rate": 3.2424382175983433e-06, + "loss": 0.0006, + "num_input_tokens_seen": 17926432, + "step": 9151 + }, + { + "epoch": 1.2129887342611, + "grad_norm": 1.6860630512237549, + "learning_rate": 3.242106734174612e-06, + "loss": 0.014, + "num_input_tokens_seen": 17928288, + "step": 9152 + }, + { + "epoch": 1.2131212723658051, + "grad_norm": 13.53470230102539, + "learning_rate": 3.2417752364425514e-06, + "loss": 0.053, + "num_input_tokens_seen": 17930448, + "step": 9153 + }, + { + "epoch": 1.2132538104705102, + "grad_norm": 11.0657377243042, + "learning_rate": 3.241443724408553e-06, + "loss": 0.2652, + "num_input_tokens_seen": 17932048, + "step": 9154 + }, + { + "epoch": 1.2133863485752154, + "grad_norm": 0.10713815689086914, + "learning_rate": 3.241112198079007e-06, + "loss": 0.0007, + "num_input_tokens_seen": 17934696, + "step": 9155 + }, + { + "epoch": 1.2135188866799205, + "grad_norm": 4.125648021697998, + "learning_rate": 3.240780657460307e-06, + "loss": 0.0298, + "num_input_tokens_seen": 17936576, + "step": 9156 + }, + { + "epoch": 1.2136514247846255, + "grad_norm": 1.5447955131530762, + "learning_rate": 3.240449102558845e-06, + "loss": 0.0173, + "num_input_tokens_seen": 17938536, + "step": 9157 + }, + { + "epoch": 1.2137839628893308, + "grad_norm": 4.081322193145752, + "learning_rate": 3.2401175333810137e-06, + "loss": 0.0755, + "num_input_tokens_seen": 17940168, + "step": 9158 + }, + { + "epoch": 1.2139165009940358, + "grad_norm": 8.045662879943848, + "learning_rate": 3.239785949933205e-06, + "loss": 0.1928, + "num_input_tokens_seen": 17942256, + "step": 9159 + }, + { + "epoch": 1.2140490390987408, + "grad_norm": 9.230433464050293, + "learning_rate": 3.2394543522218143e-06, + "loss": 0.1386, + "num_input_tokens_seen": 17944184, + "step": 9160 + }, + { + "epoch": 1.2141815772034459, + "grad_norm": 10.852619171142578, + "learning_rate": 3.2391227402532323e-06, + "loss": 0.426, + "num_input_tokens_seen": 17945784, + "step": 9161 + }, + { + "epoch": 1.2143141153081511, + "grad_norm": 0.04514923319220543, + "learning_rate": 3.238791114033854e-06, + "loss": 0.0003, + "num_input_tokens_seen": 17947624, + "step": 9162 + }, + { + "epoch": 1.2144466534128562, + "grad_norm": 0.19006283581256866, + "learning_rate": 3.238459473570074e-06, + "loss": 0.0009, + "num_input_tokens_seen": 17949032, + "step": 9163 + }, + { + "epoch": 1.2145791915175612, + "grad_norm": 0.015030396170914173, + "learning_rate": 3.2381278188682853e-06, + "loss": 0.0001, + "num_input_tokens_seen": 17950424, + "step": 9164 + }, + { + "epoch": 1.2147117296222665, + "grad_norm": 6.36232328414917, + "learning_rate": 3.237796149934883e-06, + "loss": 0.1403, + "num_input_tokens_seen": 17952384, + "step": 9165 + }, + { + "epoch": 1.2148442677269715, + "grad_norm": 2.568963050842285, + "learning_rate": 3.237464466776262e-06, + "loss": 0.0836, + "num_input_tokens_seen": 17954280, + "step": 9166 + }, + { + "epoch": 1.2149768058316766, + "grad_norm": 7.111922740936279, + "learning_rate": 3.237132769398818e-06, + "loss": 0.1639, + "num_input_tokens_seen": 17956120, + "step": 9167 + }, + { + "epoch": 1.2151093439363816, + "grad_norm": 9.37205696105957, + "learning_rate": 3.2368010578089453e-06, + "loss": 0.1038, + "num_input_tokens_seen": 17957584, + "step": 9168 + }, + { + "epoch": 1.2152418820410869, + "grad_norm": 1.1963266134262085, + "learning_rate": 3.23646933201304e-06, + "loss": 0.0235, + "num_input_tokens_seen": 17959640, + "step": 9169 + }, + { + "epoch": 1.215374420145792, + "grad_norm": 6.404896259307861, + "learning_rate": 3.2361375920174974e-06, + "loss": 0.1163, + "num_input_tokens_seen": 17961616, + "step": 9170 + }, + { + "epoch": 1.215506958250497, + "grad_norm": 3.5480639934539795, + "learning_rate": 3.235805837828715e-06, + "loss": 0.0754, + "num_input_tokens_seen": 17963392, + "step": 9171 + }, + { + "epoch": 1.2156394963552022, + "grad_norm": 15.129243850708008, + "learning_rate": 3.2354740694530886e-06, + "loss": 0.4384, + "num_input_tokens_seen": 17965648, + "step": 9172 + }, + { + "epoch": 1.2157720344599072, + "grad_norm": 8.131954193115234, + "learning_rate": 3.235142286897014e-06, + "loss": 0.3006, + "num_input_tokens_seen": 17967328, + "step": 9173 + }, + { + "epoch": 1.2159045725646123, + "grad_norm": 0.08159670978784561, + "learning_rate": 3.2348104901668884e-06, + "loss": 0.0006, + "num_input_tokens_seen": 17969248, + "step": 9174 + }, + { + "epoch": 1.2160371106693175, + "grad_norm": 0.46669483184814453, + "learning_rate": 3.234478679269111e-06, + "loss": 0.0043, + "num_input_tokens_seen": 17971392, + "step": 9175 + }, + { + "epoch": 1.2161696487740226, + "grad_norm": 0.024687767028808594, + "learning_rate": 3.2341468542100773e-06, + "loss": 0.0002, + "num_input_tokens_seen": 17972960, + "step": 9176 + }, + { + "epoch": 1.2163021868787276, + "grad_norm": 5.344636917114258, + "learning_rate": 3.2338150149961856e-06, + "loss": 0.1337, + "num_input_tokens_seen": 17975432, + "step": 9177 + }, + { + "epoch": 1.2164347249834329, + "grad_norm": 1.641262412071228, + "learning_rate": 3.2334831616338346e-06, + "loss": 0.0121, + "num_input_tokens_seen": 17978824, + "step": 9178 + }, + { + "epoch": 1.216567263088138, + "grad_norm": 9.067586898803711, + "learning_rate": 3.233151294129422e-06, + "loss": 0.1388, + "num_input_tokens_seen": 17981536, + "step": 9179 + }, + { + "epoch": 1.216699801192843, + "grad_norm": 8.68569278717041, + "learning_rate": 3.2328194124893465e-06, + "loss": 0.2926, + "num_input_tokens_seen": 17984024, + "step": 9180 + }, + { + "epoch": 1.216832339297548, + "grad_norm": 0.09665299206972122, + "learning_rate": 3.232487516720008e-06, + "loss": 0.0006, + "num_input_tokens_seen": 17985704, + "step": 9181 + }, + { + "epoch": 1.2169648774022532, + "grad_norm": 8.126981735229492, + "learning_rate": 3.232155606827804e-06, + "loss": 0.1961, + "num_input_tokens_seen": 17987368, + "step": 9182 + }, + { + "epoch": 1.2170974155069583, + "grad_norm": 9.479541778564453, + "learning_rate": 3.2318236828191353e-06, + "loss": 0.367, + "num_input_tokens_seen": 17989272, + "step": 9183 + }, + { + "epoch": 1.2172299536116633, + "grad_norm": 8.423776626586914, + "learning_rate": 3.2314917447004014e-06, + "loss": 0.2195, + "num_input_tokens_seen": 17991160, + "step": 9184 + }, + { + "epoch": 1.2173624917163686, + "grad_norm": 3.2139229774475098, + "learning_rate": 3.231159792478e-06, + "loss": 0.0367, + "num_input_tokens_seen": 17992848, + "step": 9185 + }, + { + "epoch": 1.2174950298210736, + "grad_norm": 4.176015853881836, + "learning_rate": 3.230827826158336e-06, + "loss": 0.0506, + "num_input_tokens_seen": 17994520, + "step": 9186 + }, + { + "epoch": 1.2176275679257786, + "grad_norm": 9.809189796447754, + "learning_rate": 3.2304958457478063e-06, + "loss": 0.0787, + "num_input_tokens_seen": 17995808, + "step": 9187 + }, + { + "epoch": 1.2177601060304837, + "grad_norm": 0.12853042781352997, + "learning_rate": 3.230163851252812e-06, + "loss": 0.0009, + "num_input_tokens_seen": 17998128, + "step": 9188 + }, + { + "epoch": 1.217892644135189, + "grad_norm": 13.686176300048828, + "learning_rate": 3.2298318426797563e-06, + "loss": 0.3382, + "num_input_tokens_seen": 18000232, + "step": 9189 + }, + { + "epoch": 1.218025182239894, + "grad_norm": 9.637618064880371, + "learning_rate": 3.2294998200350387e-06, + "loss": 0.1385, + "num_input_tokens_seen": 18002024, + "step": 9190 + }, + { + "epoch": 1.218157720344599, + "grad_norm": 0.7034379839897156, + "learning_rate": 3.2291677833250612e-06, + "loss": 0.0066, + "num_input_tokens_seen": 18003224, + "step": 9191 + }, + { + "epoch": 1.2182902584493043, + "grad_norm": 3.8842196464538574, + "learning_rate": 3.2288357325562253e-06, + "loss": 0.1317, + "num_input_tokens_seen": 18005904, + "step": 9192 + }, + { + "epoch": 1.2184227965540093, + "grad_norm": 2.9960498809814453, + "learning_rate": 3.2285036677349344e-06, + "loss": 0.0441, + "num_input_tokens_seen": 18007600, + "step": 9193 + }, + { + "epoch": 1.2185553346587144, + "grad_norm": 3.8884594440460205, + "learning_rate": 3.22817158886759e-06, + "loss": 0.0321, + "num_input_tokens_seen": 18009344, + "step": 9194 + }, + { + "epoch": 1.2186878727634194, + "grad_norm": 1.028959035873413, + "learning_rate": 3.227839495960595e-06, + "loss": 0.0076, + "num_input_tokens_seen": 18011184, + "step": 9195 + }, + { + "epoch": 1.2188204108681246, + "grad_norm": 10.512124061584473, + "learning_rate": 3.227507389020353e-06, + "loss": 0.388, + "num_input_tokens_seen": 18012888, + "step": 9196 + }, + { + "epoch": 1.2189529489728297, + "grad_norm": 6.777951240539551, + "learning_rate": 3.227175268053266e-06, + "loss": 0.2184, + "num_input_tokens_seen": 18014328, + "step": 9197 + }, + { + "epoch": 1.2190854870775347, + "grad_norm": 6.231356620788574, + "learning_rate": 3.226843133065738e-06, + "loss": 0.1529, + "num_input_tokens_seen": 18016704, + "step": 9198 + }, + { + "epoch": 1.21921802518224, + "grad_norm": 0.8585266470909119, + "learning_rate": 3.2265109840641733e-06, + "loss": 0.0101, + "num_input_tokens_seen": 18019336, + "step": 9199 + }, + { + "epoch": 1.219350563286945, + "grad_norm": 9.952191352844238, + "learning_rate": 3.226178821054975e-06, + "loss": 0.205, + "num_input_tokens_seen": 18020832, + "step": 9200 + }, + { + "epoch": 1.21948310139165, + "grad_norm": 0.07330610603094101, + "learning_rate": 3.225846644044549e-06, + "loss": 0.0004, + "num_input_tokens_seen": 18021960, + "step": 9201 + }, + { + "epoch": 1.219615639496355, + "grad_norm": 3.2237322330474854, + "learning_rate": 3.225514453039299e-06, + "loss": 0.0638, + "num_input_tokens_seen": 18023512, + "step": 9202 + }, + { + "epoch": 1.2197481776010604, + "grad_norm": 3.106146812438965, + "learning_rate": 3.2251822480456284e-06, + "loss": 0.042, + "num_input_tokens_seen": 18025576, + "step": 9203 + }, + { + "epoch": 1.2198807157057654, + "grad_norm": 0.47439780831336975, + "learning_rate": 3.224850029069946e-06, + "loss": 0.0034, + "num_input_tokens_seen": 18027512, + "step": 9204 + }, + { + "epoch": 1.2200132538104704, + "grad_norm": 7.108116626739502, + "learning_rate": 3.2245177961186536e-06, + "loss": 0.0661, + "num_input_tokens_seen": 18029704, + "step": 9205 + }, + { + "epoch": 1.2201457919151757, + "grad_norm": 1.56760835647583, + "learning_rate": 3.224185549198158e-06, + "loss": 0.011, + "num_input_tokens_seen": 18033088, + "step": 9206 + }, + { + "epoch": 1.2202783300198807, + "grad_norm": 7.924262523651123, + "learning_rate": 3.2238532883148664e-06, + "loss": 0.1684, + "num_input_tokens_seen": 18035576, + "step": 9207 + }, + { + "epoch": 1.2204108681245858, + "grad_norm": 6.61377477645874, + "learning_rate": 3.2235210134751834e-06, + "loss": 0.2069, + "num_input_tokens_seen": 18037528, + "step": 9208 + }, + { + "epoch": 1.2205434062292908, + "grad_norm": 4.029158592224121, + "learning_rate": 3.2231887246855157e-06, + "loss": 0.1011, + "num_input_tokens_seen": 18038840, + "step": 9209 + }, + { + "epoch": 1.220675944333996, + "grad_norm": 0.368107408285141, + "learning_rate": 3.222856421952271e-06, + "loss": 0.0025, + "num_input_tokens_seen": 18041328, + "step": 9210 + }, + { + "epoch": 1.220808482438701, + "grad_norm": 7.702954292297363, + "learning_rate": 3.2225241052818567e-06, + "loss": 0.2969, + "num_input_tokens_seen": 18043392, + "step": 9211 + }, + { + "epoch": 1.2209410205434061, + "grad_norm": 4.439028263092041, + "learning_rate": 3.222191774680678e-06, + "loss": 0.1701, + "num_input_tokens_seen": 18045096, + "step": 9212 + }, + { + "epoch": 1.2210735586481114, + "grad_norm": 0.5507677793502808, + "learning_rate": 3.2218594301551443e-06, + "loss": 0.0025, + "num_input_tokens_seen": 18047848, + "step": 9213 + }, + { + "epoch": 1.2212060967528164, + "grad_norm": 8.306917190551758, + "learning_rate": 3.2215270717116627e-06, + "loss": 0.074, + "num_input_tokens_seen": 18049896, + "step": 9214 + }, + { + "epoch": 1.2213386348575215, + "grad_norm": 1.734559416770935, + "learning_rate": 3.2211946993566412e-06, + "loss": 0.0106, + "num_input_tokens_seen": 18052208, + "step": 9215 + }, + { + "epoch": 1.2214711729622267, + "grad_norm": 0.2658510208129883, + "learning_rate": 3.220862313096488e-06, + "loss": 0.0016, + "num_input_tokens_seen": 18053712, + "step": 9216 + }, + { + "epoch": 1.2216037110669318, + "grad_norm": 11.191339492797852, + "learning_rate": 3.220529912937612e-06, + "loss": 0.3079, + "num_input_tokens_seen": 18056488, + "step": 9217 + }, + { + "epoch": 1.2217362491716368, + "grad_norm": 0.03186461701989174, + "learning_rate": 3.220197498886423e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18058448, + "step": 9218 + }, + { + "epoch": 1.221868787276342, + "grad_norm": 10.536408424377441, + "learning_rate": 3.2198650709493295e-06, + "loss": 0.3974, + "num_input_tokens_seen": 18060384, + "step": 9219 + }, + { + "epoch": 1.222001325381047, + "grad_norm": 5.595023155212402, + "learning_rate": 3.21953262913274e-06, + "loss": 0.1325, + "num_input_tokens_seen": 18062744, + "step": 9220 + }, + { + "epoch": 1.2221338634857521, + "grad_norm": 10.682698249816895, + "learning_rate": 3.2192001734430657e-06, + "loss": 0.2467, + "num_input_tokens_seen": 18064976, + "step": 9221 + }, + { + "epoch": 1.2222664015904572, + "grad_norm": 5.00292444229126, + "learning_rate": 3.218867703886716e-06, + "loss": 0.1113, + "num_input_tokens_seen": 18067664, + "step": 9222 + }, + { + "epoch": 1.2223989396951624, + "grad_norm": 10.59686279296875, + "learning_rate": 3.2185352204701004e-06, + "loss": 0.1072, + "num_input_tokens_seen": 18070024, + "step": 9223 + }, + { + "epoch": 1.2225314777998675, + "grad_norm": 0.017353275790810585, + "learning_rate": 3.2182027231996306e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18071880, + "step": 9224 + }, + { + "epoch": 1.2226640159045725, + "grad_norm": 0.01517693605273962, + "learning_rate": 3.2178702120817167e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18073464, + "step": 9225 + }, + { + "epoch": 1.2227965540092778, + "grad_norm": 7.755061626434326, + "learning_rate": 3.21753768712277e-06, + "loss": 0.1229, + "num_input_tokens_seen": 18075376, + "step": 9226 + }, + { + "epoch": 1.2229290921139828, + "grad_norm": 7.077408790588379, + "learning_rate": 3.217205148329201e-06, + "loss": 0.2471, + "num_input_tokens_seen": 18077792, + "step": 9227 + }, + { + "epoch": 1.2230616302186879, + "grad_norm": 1.7125215530395508, + "learning_rate": 3.216872595707423e-06, + "loss": 0.0076, + "num_input_tokens_seen": 18079064, + "step": 9228 + }, + { + "epoch": 1.223194168323393, + "grad_norm": 0.08020414412021637, + "learning_rate": 3.216540029263846e-06, + "loss": 0.0005, + "num_input_tokens_seen": 18081280, + "step": 9229 + }, + { + "epoch": 1.2233267064280982, + "grad_norm": 14.944341659545898, + "learning_rate": 3.216207449004883e-06, + "loss": 0.3158, + "num_input_tokens_seen": 18083480, + "step": 9230 + }, + { + "epoch": 1.2234592445328032, + "grad_norm": 0.013583719730377197, + "learning_rate": 3.215874854936947e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18085264, + "step": 9231 + }, + { + "epoch": 1.2235917826375082, + "grad_norm": 0.02987409569323063, + "learning_rate": 3.215542247066449e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18086680, + "step": 9232 + }, + { + "epoch": 1.2237243207422135, + "grad_norm": 7.907426834106445, + "learning_rate": 3.215209625399803e-06, + "loss": 0.0413, + "num_input_tokens_seen": 18088712, + "step": 9233 + }, + { + "epoch": 1.2238568588469185, + "grad_norm": 0.00683439290151, + "learning_rate": 3.2148769899434225e-06, + "loss": 0.0, + "num_input_tokens_seen": 18090744, + "step": 9234 + }, + { + "epoch": 1.2239893969516236, + "grad_norm": 0.0033382922410964966, + "learning_rate": 3.2145443407037203e-06, + "loss": 0.0, + "num_input_tokens_seen": 18091776, + "step": 9235 + }, + { + "epoch": 1.2241219350563286, + "grad_norm": 0.10739707946777344, + "learning_rate": 3.21421167768711e-06, + "loss": 0.0003, + "num_input_tokens_seen": 18092792, + "step": 9236 + }, + { + "epoch": 1.2242544731610339, + "grad_norm": 11.95257568359375, + "learning_rate": 3.2138790009000063e-06, + "loss": 0.2878, + "num_input_tokens_seen": 18095688, + "step": 9237 + }, + { + "epoch": 1.224387011265739, + "grad_norm": 6.569685935974121, + "learning_rate": 3.213546310348823e-06, + "loss": 0.1772, + "num_input_tokens_seen": 18097280, + "step": 9238 + }, + { + "epoch": 1.224519549370444, + "grad_norm": 5.394773960113525, + "learning_rate": 3.2132136060399736e-06, + "loss": 0.0375, + "num_input_tokens_seen": 18099680, + "step": 9239 + }, + { + "epoch": 1.2246520874751492, + "grad_norm": 0.006969102658331394, + "learning_rate": 3.2128808879798752e-06, + "loss": 0.0, + "num_input_tokens_seen": 18101384, + "step": 9240 + }, + { + "epoch": 1.2247846255798542, + "grad_norm": 10.540257453918457, + "learning_rate": 3.2125481561749406e-06, + "loss": 0.2018, + "num_input_tokens_seen": 18103432, + "step": 9241 + }, + { + "epoch": 1.2249171636845593, + "grad_norm": 3.446161985397339, + "learning_rate": 3.212215410631586e-06, + "loss": 0.0998, + "num_input_tokens_seen": 18105160, + "step": 9242 + }, + { + "epoch": 1.2250497017892643, + "grad_norm": 10.671965599060059, + "learning_rate": 3.2118826513562277e-06, + "loss": 0.1067, + "num_input_tokens_seen": 18106536, + "step": 9243 + }, + { + "epoch": 1.2251822398939696, + "grad_norm": 18.58019256591797, + "learning_rate": 3.21154987835528e-06, + "loss": 0.2939, + "num_input_tokens_seen": 18108000, + "step": 9244 + }, + { + "epoch": 1.2253147779986746, + "grad_norm": 0.014874340035021305, + "learning_rate": 3.2112170916351605e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18109728, + "step": 9245 + }, + { + "epoch": 1.2254473161033796, + "grad_norm": 11.958312034606934, + "learning_rate": 3.210884291202285e-06, + "loss": 0.3032, + "num_input_tokens_seen": 18113040, + "step": 9246 + }, + { + "epoch": 1.225579854208085, + "grad_norm": 0.1405659317970276, + "learning_rate": 3.210551477063068e-06, + "loss": 0.0007, + "num_input_tokens_seen": 18114824, + "step": 9247 + }, + { + "epoch": 1.22571239231279, + "grad_norm": 9.155230522155762, + "learning_rate": 3.21021864922393e-06, + "loss": 0.1568, + "num_input_tokens_seen": 18116848, + "step": 9248 + }, + { + "epoch": 1.225844930417495, + "grad_norm": 0.054299380630254745, + "learning_rate": 3.2098858076912863e-06, + "loss": 0.0003, + "num_input_tokens_seen": 18118448, + "step": 9249 + }, + { + "epoch": 1.2259774685222, + "grad_norm": 18.31075668334961, + "learning_rate": 3.209552952471554e-06, + "loss": 0.4385, + "num_input_tokens_seen": 18120664, + "step": 9250 + }, + { + "epoch": 1.2261100066269053, + "grad_norm": 0.02250790223479271, + "learning_rate": 3.2092200835711524e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18123304, + "step": 9251 + }, + { + "epoch": 1.2262425447316103, + "grad_norm": 3.341330051422119, + "learning_rate": 3.208887200996497e-06, + "loss": 0.0334, + "num_input_tokens_seen": 18124584, + "step": 9252 + }, + { + "epoch": 1.2263750828363154, + "grad_norm": 8.040726661682129, + "learning_rate": 3.208554304754008e-06, + "loss": 0.2825, + "num_input_tokens_seen": 18127264, + "step": 9253 + }, + { + "epoch": 1.2265076209410206, + "grad_norm": 6.532689094543457, + "learning_rate": 3.2082213948501034e-06, + "loss": 0.1198, + "num_input_tokens_seen": 18130200, + "step": 9254 + }, + { + "epoch": 1.2266401590457257, + "grad_norm": 6.583267688751221, + "learning_rate": 3.207888471291202e-06, + "loss": 0.1209, + "num_input_tokens_seen": 18131704, + "step": 9255 + }, + { + "epoch": 1.2267726971504307, + "grad_norm": 19.91008758544922, + "learning_rate": 3.207555534083722e-06, + "loss": 0.6711, + "num_input_tokens_seen": 18134240, + "step": 9256 + }, + { + "epoch": 1.2269052352551357, + "grad_norm": 8.736006736755371, + "learning_rate": 3.2072225832340825e-06, + "loss": 0.1148, + "num_input_tokens_seen": 18135424, + "step": 9257 + }, + { + "epoch": 1.227037773359841, + "grad_norm": 2.818773031234741, + "learning_rate": 3.2068896187487047e-06, + "loss": 0.0305, + "num_input_tokens_seen": 18138280, + "step": 9258 + }, + { + "epoch": 1.227170311464546, + "grad_norm": 3.318325996398926, + "learning_rate": 3.2065566406340065e-06, + "loss": 0.1294, + "num_input_tokens_seen": 18139952, + "step": 9259 + }, + { + "epoch": 1.227302849569251, + "grad_norm": 10.270318984985352, + "learning_rate": 3.206223648896409e-06, + "loss": 0.1332, + "num_input_tokens_seen": 18142248, + "step": 9260 + }, + { + "epoch": 1.2274353876739563, + "grad_norm": 8.633078575134277, + "learning_rate": 3.2058906435423337e-06, + "loss": 0.1485, + "num_input_tokens_seen": 18145128, + "step": 9261 + }, + { + "epoch": 1.2275679257786614, + "grad_norm": 7.300957202911377, + "learning_rate": 3.205557624578198e-06, + "loss": 0.1416, + "num_input_tokens_seen": 18147048, + "step": 9262 + }, + { + "epoch": 1.2277004638833664, + "grad_norm": 6.557152271270752, + "learning_rate": 3.205224592010426e-06, + "loss": 0.08, + "num_input_tokens_seen": 18148928, + "step": 9263 + }, + { + "epoch": 1.2278330019880717, + "grad_norm": 25.99142837524414, + "learning_rate": 3.2048915458454373e-06, + "loss": 0.4206, + "num_input_tokens_seen": 18152368, + "step": 9264 + }, + { + "epoch": 1.2279655400927767, + "grad_norm": 6.490729331970215, + "learning_rate": 3.204558486089652e-06, + "loss": 0.0975, + "num_input_tokens_seen": 18154384, + "step": 9265 + }, + { + "epoch": 1.2280980781974817, + "grad_norm": 8.948105812072754, + "learning_rate": 3.204225412749494e-06, + "loss": 0.2656, + "num_input_tokens_seen": 18156256, + "step": 9266 + }, + { + "epoch": 1.228230616302187, + "grad_norm": 0.017085418105125427, + "learning_rate": 3.2038923258313844e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18157768, + "step": 9267 + }, + { + "epoch": 1.228363154406892, + "grad_norm": 0.03549474850296974, + "learning_rate": 3.2035592253417447e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18159008, + "step": 9268 + }, + { + "epoch": 1.228495692511597, + "grad_norm": 7.160843849182129, + "learning_rate": 3.2032261112869977e-06, + "loss": 0.1787, + "num_input_tokens_seen": 18161280, + "step": 9269 + }, + { + "epoch": 1.228628230616302, + "grad_norm": 2.5544095039367676, + "learning_rate": 3.202892983673567e-06, + "loss": 0.0452, + "num_input_tokens_seen": 18162536, + "step": 9270 + }, + { + "epoch": 1.2287607687210074, + "grad_norm": 8.32440185546875, + "learning_rate": 3.2025598425078736e-06, + "loss": 0.1916, + "num_input_tokens_seen": 18164736, + "step": 9271 + }, + { + "epoch": 1.2288933068257124, + "grad_norm": 2.2942121028900146, + "learning_rate": 3.2022266877963426e-06, + "loss": 0.0203, + "num_input_tokens_seen": 18166296, + "step": 9272 + }, + { + "epoch": 1.2290258449304174, + "grad_norm": 7.356770992279053, + "learning_rate": 3.201893519545396e-06, + "loss": 0.1383, + "num_input_tokens_seen": 18167736, + "step": 9273 + }, + { + "epoch": 1.2291583830351227, + "grad_norm": 0.17508713901042938, + "learning_rate": 3.201560337761458e-06, + "loss": 0.0016, + "num_input_tokens_seen": 18168968, + "step": 9274 + }, + { + "epoch": 1.2292909211398277, + "grad_norm": 0.05699050799012184, + "learning_rate": 3.2012271424509527e-06, + "loss": 0.0004, + "num_input_tokens_seen": 18170672, + "step": 9275 + }, + { + "epoch": 1.2294234592445328, + "grad_norm": 10.298219680786133, + "learning_rate": 3.2008939336203056e-06, + "loss": 0.3584, + "num_input_tokens_seen": 18173144, + "step": 9276 + }, + { + "epoch": 1.2295559973492378, + "grad_norm": 10.176429748535156, + "learning_rate": 3.2005607112759386e-06, + "loss": 0.2976, + "num_input_tokens_seen": 18175464, + "step": 9277 + }, + { + "epoch": 1.229688535453943, + "grad_norm": 6.7923583984375, + "learning_rate": 3.200227475424278e-06, + "loss": 0.0554, + "num_input_tokens_seen": 18177248, + "step": 9278 + }, + { + "epoch": 1.2298210735586481, + "grad_norm": 1.3180171251296997, + "learning_rate": 3.1998942260717493e-06, + "loss": 0.0023, + "num_input_tokens_seen": 18178392, + "step": 9279 + }, + { + "epoch": 1.2299536116633532, + "grad_norm": 7.796667098999023, + "learning_rate": 3.199560963224776e-06, + "loss": 0.1244, + "num_input_tokens_seen": 18180192, + "step": 9280 + }, + { + "epoch": 1.2300861497680584, + "grad_norm": 10.44566535949707, + "learning_rate": 3.1992276868897853e-06, + "loss": 0.1516, + "num_input_tokens_seen": 18181816, + "step": 9281 + }, + { + "epoch": 1.2302186878727635, + "grad_norm": 6.60996150970459, + "learning_rate": 3.1988943970732034e-06, + "loss": 0.1289, + "num_input_tokens_seen": 18184512, + "step": 9282 + }, + { + "epoch": 1.2303512259774685, + "grad_norm": 3.5900001525878906, + "learning_rate": 3.198561093781454e-06, + "loss": 0.0823, + "num_input_tokens_seen": 18186552, + "step": 9283 + }, + { + "epoch": 1.2304837640821735, + "grad_norm": 8.369278907775879, + "learning_rate": 3.1982277770209653e-06, + "loss": 0.1048, + "num_input_tokens_seen": 18188688, + "step": 9284 + }, + { + "epoch": 1.2306163021868788, + "grad_norm": 9.693196296691895, + "learning_rate": 3.1978944467981633e-06, + "loss": 0.1269, + "num_input_tokens_seen": 18191160, + "step": 9285 + }, + { + "epoch": 1.2307488402915838, + "grad_norm": 7.41039514541626, + "learning_rate": 3.197561103119474e-06, + "loss": 0.159, + "num_input_tokens_seen": 18193096, + "step": 9286 + }, + { + "epoch": 1.2308813783962889, + "grad_norm": 0.32638660073280334, + "learning_rate": 3.197227745991327e-06, + "loss": 0.0033, + "num_input_tokens_seen": 18195136, + "step": 9287 + }, + { + "epoch": 1.2310139165009941, + "grad_norm": 0.03574603423476219, + "learning_rate": 3.1968943754201477e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18197544, + "step": 9288 + }, + { + "epoch": 1.2311464546056992, + "grad_norm": 0.028221873566508293, + "learning_rate": 3.1965609914123637e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18199432, + "step": 9289 + }, + { + "epoch": 1.2312789927104042, + "grad_norm": 6.937863349914551, + "learning_rate": 3.1962275939744034e-06, + "loss": 0.045, + "num_input_tokens_seen": 18201160, + "step": 9290 + }, + { + "epoch": 1.2314115308151092, + "grad_norm": 17.245624542236328, + "learning_rate": 3.1958941831126943e-06, + "loss": 0.1696, + "num_input_tokens_seen": 18202592, + "step": 9291 + }, + { + "epoch": 1.2315440689198145, + "grad_norm": 0.7239933609962463, + "learning_rate": 3.1955607588336656e-06, + "loss": 0.0049, + "num_input_tokens_seen": 18204200, + "step": 9292 + }, + { + "epoch": 1.2316766070245195, + "grad_norm": 0.004897387698292732, + "learning_rate": 3.1952273211437455e-06, + "loss": 0.0, + "num_input_tokens_seen": 18205600, + "step": 9293 + }, + { + "epoch": 1.2318091451292246, + "grad_norm": 0.011759313754737377, + "learning_rate": 3.1948938700493635e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18207016, + "step": 9294 + }, + { + "epoch": 1.2319416832339298, + "grad_norm": 11.25745964050293, + "learning_rate": 3.1945604055569484e-06, + "loss": 0.2258, + "num_input_tokens_seen": 18209048, + "step": 9295 + }, + { + "epoch": 1.2320742213386349, + "grad_norm": 6.83518123626709, + "learning_rate": 3.1942269276729294e-06, + "loss": 0.1694, + "num_input_tokens_seen": 18210920, + "step": 9296 + }, + { + "epoch": 1.23220675944334, + "grad_norm": 0.055272430181503296, + "learning_rate": 3.1938934364037357e-06, + "loss": 0.0003, + "num_input_tokens_seen": 18213040, + "step": 9297 + }, + { + "epoch": 1.232339297548045, + "grad_norm": 1.4829679727554321, + "learning_rate": 3.1935599317557984e-06, + "loss": 0.0077, + "num_input_tokens_seen": 18214296, + "step": 9298 + }, + { + "epoch": 1.2324718356527502, + "grad_norm": 10.800658226013184, + "learning_rate": 3.1932264137355475e-06, + "loss": 0.2396, + "num_input_tokens_seen": 18216352, + "step": 9299 + }, + { + "epoch": 1.2326043737574552, + "grad_norm": 12.229174613952637, + "learning_rate": 3.1928928823494123e-06, + "loss": 0.2611, + "num_input_tokens_seen": 18218464, + "step": 9300 + }, + { + "epoch": 1.2327369118621603, + "grad_norm": 4.178027153015137, + "learning_rate": 3.1925593376038243e-06, + "loss": 0.0543, + "num_input_tokens_seen": 18220640, + "step": 9301 + }, + { + "epoch": 1.2328694499668655, + "grad_norm": 12.953418731689453, + "learning_rate": 3.1922257795052155e-06, + "loss": 0.5702, + "num_input_tokens_seen": 18223064, + "step": 9302 + }, + { + "epoch": 1.2330019880715706, + "grad_norm": 0.02145867608487606, + "learning_rate": 3.1918922080600152e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18225232, + "step": 9303 + }, + { + "epoch": 1.2331345261762756, + "grad_norm": 3.8966023921966553, + "learning_rate": 3.191558623274656e-06, + "loss": 0.0544, + "num_input_tokens_seen": 18226960, + "step": 9304 + }, + { + "epoch": 1.2332670642809809, + "grad_norm": 4.835464000701904, + "learning_rate": 3.19122502515557e-06, + "loss": 0.1506, + "num_input_tokens_seen": 18229376, + "step": 9305 + }, + { + "epoch": 1.233399602385686, + "grad_norm": 8.787933349609375, + "learning_rate": 3.190891413709188e-06, + "loss": 0.1053, + "num_input_tokens_seen": 18231352, + "step": 9306 + }, + { + "epoch": 1.233532140490391, + "grad_norm": 7.555550575256348, + "learning_rate": 3.1905577889419428e-06, + "loss": 0.142, + "num_input_tokens_seen": 18233088, + "step": 9307 + }, + { + "epoch": 1.2336646785950962, + "grad_norm": 6.24015998840332, + "learning_rate": 3.1902241508602675e-06, + "loss": 0.1152, + "num_input_tokens_seen": 18235336, + "step": 9308 + }, + { + "epoch": 1.2337972166998012, + "grad_norm": 9.500175476074219, + "learning_rate": 3.1898904994705942e-06, + "loss": 0.179, + "num_input_tokens_seen": 18237712, + "step": 9309 + }, + { + "epoch": 1.2339297548045063, + "grad_norm": 8.311349868774414, + "learning_rate": 3.1895568347793565e-06, + "loss": 0.171, + "num_input_tokens_seen": 18239608, + "step": 9310 + }, + { + "epoch": 1.2340622929092113, + "grad_norm": 0.027424868196249008, + "learning_rate": 3.189223156792987e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18241536, + "step": 9311 + }, + { + "epoch": 1.2341948310139166, + "grad_norm": 5.580315113067627, + "learning_rate": 3.188889465517919e-06, + "loss": 0.1465, + "num_input_tokens_seen": 18242952, + "step": 9312 + }, + { + "epoch": 1.2343273691186216, + "grad_norm": 5.297321319580078, + "learning_rate": 3.188555760960588e-06, + "loss": 0.2009, + "num_input_tokens_seen": 18245160, + "step": 9313 + }, + { + "epoch": 1.2344599072233267, + "grad_norm": 0.6485732197761536, + "learning_rate": 3.188222043127426e-06, + "loss": 0.0038, + "num_input_tokens_seen": 18247808, + "step": 9314 + }, + { + "epoch": 1.234592445328032, + "grad_norm": 4.620710849761963, + "learning_rate": 3.187888312024869e-06, + "loss": 0.1192, + "num_input_tokens_seen": 18250432, + "step": 9315 + }, + { + "epoch": 1.234724983432737, + "grad_norm": 0.052157919853925705, + "learning_rate": 3.1875545676593506e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18252344, + "step": 9316 + }, + { + "epoch": 1.234857521537442, + "grad_norm": 6.928371906280518, + "learning_rate": 3.187220810037306e-06, + "loss": 0.0552, + "num_input_tokens_seen": 18254288, + "step": 9317 + }, + { + "epoch": 1.234990059642147, + "grad_norm": 5.52483606338501, + "learning_rate": 3.186887039165169e-06, + "loss": 0.1268, + "num_input_tokens_seen": 18256880, + "step": 9318 + }, + { + "epoch": 1.2351225977468523, + "grad_norm": 11.089993476867676, + "learning_rate": 3.1865532550493766e-06, + "loss": 0.1547, + "num_input_tokens_seen": 18259400, + "step": 9319 + }, + { + "epoch": 1.2352551358515573, + "grad_norm": 14.036687850952148, + "learning_rate": 3.1862194576963647e-06, + "loss": 0.1868, + "num_input_tokens_seen": 18261624, + "step": 9320 + }, + { + "epoch": 1.2353876739562624, + "grad_norm": 11.554037094116211, + "learning_rate": 3.1858856471125673e-06, + "loss": 0.1097, + "num_input_tokens_seen": 18262704, + "step": 9321 + }, + { + "epoch": 1.2355202120609676, + "grad_norm": 5.136627197265625, + "learning_rate": 3.1855518233044214e-06, + "loss": 0.1061, + "num_input_tokens_seen": 18264424, + "step": 9322 + }, + { + "epoch": 1.2356527501656727, + "grad_norm": 9.465648651123047, + "learning_rate": 3.185217986278364e-06, + "loss": 0.1846, + "num_input_tokens_seen": 18266560, + "step": 9323 + }, + { + "epoch": 1.2357852882703777, + "grad_norm": 5.932622909545898, + "learning_rate": 3.18488413604083e-06, + "loss": 0.2484, + "num_input_tokens_seen": 18269352, + "step": 9324 + }, + { + "epoch": 1.2359178263750827, + "grad_norm": 0.005948694888502359, + "learning_rate": 3.184550272598259e-06, + "loss": 0.0, + "num_input_tokens_seen": 18270928, + "step": 9325 + }, + { + "epoch": 1.236050364479788, + "grad_norm": 9.615486145019531, + "learning_rate": 3.184216395957086e-06, + "loss": 0.0273, + "num_input_tokens_seen": 18272544, + "step": 9326 + }, + { + "epoch": 1.236182902584493, + "grad_norm": 7.075868606567383, + "learning_rate": 3.1838825061237477e-06, + "loss": 0.1495, + "num_input_tokens_seen": 18274600, + "step": 9327 + }, + { + "epoch": 1.236315440689198, + "grad_norm": 6.406460285186768, + "learning_rate": 3.1835486031046843e-06, + "loss": 0.0908, + "num_input_tokens_seen": 18276136, + "step": 9328 + }, + { + "epoch": 1.2364479787939033, + "grad_norm": 1.0897564888000488, + "learning_rate": 3.183214686906332e-06, + "loss": 0.0054, + "num_input_tokens_seen": 18278904, + "step": 9329 + }, + { + "epoch": 1.2365805168986084, + "grad_norm": 4.777773380279541, + "learning_rate": 3.1828807575351283e-06, + "loss": 0.0993, + "num_input_tokens_seen": 18281392, + "step": 9330 + }, + { + "epoch": 1.2367130550033134, + "grad_norm": 0.19937686622142792, + "learning_rate": 3.1825468149975137e-06, + "loss": 0.0017, + "num_input_tokens_seen": 18283104, + "step": 9331 + }, + { + "epoch": 1.2368455931080184, + "grad_norm": 6.602205753326416, + "learning_rate": 3.1822128592999246e-06, + "loss": 0.1102, + "num_input_tokens_seen": 18285312, + "step": 9332 + }, + { + "epoch": 1.2369781312127237, + "grad_norm": 3.541440725326538, + "learning_rate": 3.181878890448802e-06, + "loss": 0.024, + "num_input_tokens_seen": 18286960, + "step": 9333 + }, + { + "epoch": 1.2371106693174287, + "grad_norm": 0.04583694413304329, + "learning_rate": 3.181544908450583e-06, + "loss": 0.0003, + "num_input_tokens_seen": 18288136, + "step": 9334 + }, + { + "epoch": 1.2372432074221338, + "grad_norm": 0.014077777974307537, + "learning_rate": 3.1812109133117088e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18290440, + "step": 9335 + }, + { + "epoch": 1.237375745526839, + "grad_norm": 19.751127243041992, + "learning_rate": 3.1808769050386174e-06, + "loss": 0.4317, + "num_input_tokens_seen": 18292008, + "step": 9336 + }, + { + "epoch": 1.237508283631544, + "grad_norm": 10.469135284423828, + "learning_rate": 3.18054288363775e-06, + "loss": 0.2902, + "num_input_tokens_seen": 18294024, + "step": 9337 + }, + { + "epoch": 1.2376408217362491, + "grad_norm": 3.93719220161438, + "learning_rate": 3.180208849115547e-06, + "loss": 0.0448, + "num_input_tokens_seen": 18295720, + "step": 9338 + }, + { + "epoch": 1.2377733598409542, + "grad_norm": 0.4953845739364624, + "learning_rate": 3.1798748014784468e-06, + "loss": 0.0062, + "num_input_tokens_seen": 18296904, + "step": 9339 + }, + { + "epoch": 1.2379058979456594, + "grad_norm": 32.3203010559082, + "learning_rate": 3.179540740732892e-06, + "loss": 0.1986, + "num_input_tokens_seen": 18298840, + "step": 9340 + }, + { + "epoch": 1.2380384360503645, + "grad_norm": 0.007707961369305849, + "learning_rate": 3.1792066668853237e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18300688, + "step": 9341 + }, + { + "epoch": 1.2381709741550695, + "grad_norm": 6.5232625007629395, + "learning_rate": 3.1788725799421818e-06, + "loss": 0.1998, + "num_input_tokens_seen": 18302648, + "step": 9342 + }, + { + "epoch": 1.2383035122597748, + "grad_norm": 0.10024408996105194, + "learning_rate": 3.1785384799099074e-06, + "loss": 0.0003, + "num_input_tokens_seen": 18304368, + "step": 9343 + }, + { + "epoch": 1.2384360503644798, + "grad_norm": 5.403772354125977, + "learning_rate": 3.178204366794944e-06, + "loss": 0.0782, + "num_input_tokens_seen": 18306296, + "step": 9344 + }, + { + "epoch": 1.2385685884691848, + "grad_norm": 0.09861955791711807, + "learning_rate": 3.1778702406037313e-06, + "loss": 0.0005, + "num_input_tokens_seen": 18307552, + "step": 9345 + }, + { + "epoch": 1.23870112657389, + "grad_norm": 5.897062301635742, + "learning_rate": 3.1775361013427142e-06, + "loss": 0.0536, + "num_input_tokens_seen": 18309632, + "step": 9346 + }, + { + "epoch": 1.2388336646785951, + "grad_norm": 12.025014877319336, + "learning_rate": 3.1772019490183336e-06, + "loss": 0.1004, + "num_input_tokens_seen": 18312280, + "step": 9347 + }, + { + "epoch": 1.2389662027833002, + "grad_norm": 8.678224563598633, + "learning_rate": 3.1768677836370308e-06, + "loss": 0.2125, + "num_input_tokens_seen": 18314064, + "step": 9348 + }, + { + "epoch": 1.2390987408880054, + "grad_norm": 11.939963340759277, + "learning_rate": 3.1765336052052516e-06, + "loss": 0.2322, + "num_input_tokens_seen": 18316344, + "step": 9349 + }, + { + "epoch": 1.2392312789927105, + "grad_norm": 0.01730145886540413, + "learning_rate": 3.1761994137294373e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18318152, + "step": 9350 + }, + { + "epoch": 1.2393638170974155, + "grad_norm": 8.85667896270752, + "learning_rate": 3.1758652092160314e-06, + "loss": 0.2596, + "num_input_tokens_seen": 18319272, + "step": 9351 + }, + { + "epoch": 1.2394963552021205, + "grad_norm": 0.1291411817073822, + "learning_rate": 3.175530991671479e-06, + "loss": 0.0007, + "num_input_tokens_seen": 18321448, + "step": 9352 + }, + { + "epoch": 1.2396288933068258, + "grad_norm": 13.346277236938477, + "learning_rate": 3.175196761102222e-06, + "loss": 0.3455, + "num_input_tokens_seen": 18323872, + "step": 9353 + }, + { + "epoch": 1.2397614314115308, + "grad_norm": 2.7845067977905273, + "learning_rate": 3.1748625175147065e-06, + "loss": 0.0579, + "num_input_tokens_seen": 18325576, + "step": 9354 + }, + { + "epoch": 1.2398939695162359, + "grad_norm": 6.650882720947266, + "learning_rate": 3.1745282609153757e-06, + "loss": 0.1294, + "num_input_tokens_seen": 18327616, + "step": 9355 + }, + { + "epoch": 1.2400265076209411, + "grad_norm": 9.113703727722168, + "learning_rate": 3.1741939913106747e-06, + "loss": 0.1603, + "num_input_tokens_seen": 18330280, + "step": 9356 + }, + { + "epoch": 1.2401590457256462, + "grad_norm": 2.501316547393799, + "learning_rate": 3.173859708707048e-06, + "loss": 0.0282, + "num_input_tokens_seen": 18333336, + "step": 9357 + }, + { + "epoch": 1.2402915838303512, + "grad_norm": 7.7092695236206055, + "learning_rate": 3.1735254131109418e-06, + "loss": 0.1834, + "num_input_tokens_seen": 18335096, + "step": 9358 + }, + { + "epoch": 1.2404241219350562, + "grad_norm": 0.028295818716287613, + "learning_rate": 3.1731911045288013e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18337616, + "step": 9359 + }, + { + "epoch": 1.2405566600397615, + "grad_norm": 5.227165699005127, + "learning_rate": 3.172856782967071e-06, + "loss": 0.1034, + "num_input_tokens_seen": 18339560, + "step": 9360 + }, + { + "epoch": 1.2406891981444665, + "grad_norm": 1.2620757818222046, + "learning_rate": 3.172522448432198e-06, + "loss": 0.0085, + "num_input_tokens_seen": 18341832, + "step": 9361 + }, + { + "epoch": 1.2408217362491716, + "grad_norm": 0.07901065051555634, + "learning_rate": 3.1721881009306287e-06, + "loss": 0.0005, + "num_input_tokens_seen": 18344000, + "step": 9362 + }, + { + "epoch": 1.2409542743538768, + "grad_norm": 6.6687912940979, + "learning_rate": 3.1718537404688086e-06, + "loss": 0.2197, + "num_input_tokens_seen": 18346152, + "step": 9363 + }, + { + "epoch": 1.2410868124585819, + "grad_norm": 13.124435424804688, + "learning_rate": 3.1715193670531853e-06, + "loss": 0.1347, + "num_input_tokens_seen": 18348032, + "step": 9364 + }, + { + "epoch": 1.241219350563287, + "grad_norm": 0.04227553308010101, + "learning_rate": 3.171184980690205e-06, + "loss": 0.0003, + "num_input_tokens_seen": 18350216, + "step": 9365 + }, + { + "epoch": 1.241351888667992, + "grad_norm": 6.170018196105957, + "learning_rate": 3.1708505813863154e-06, + "loss": 0.194, + "num_input_tokens_seen": 18352312, + "step": 9366 + }, + { + "epoch": 1.2414844267726972, + "grad_norm": 0.025294190272688866, + "learning_rate": 3.170516169147964e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18353696, + "step": 9367 + }, + { + "epoch": 1.2416169648774023, + "grad_norm": 10.759210586547852, + "learning_rate": 3.1701817439815986e-06, + "loss": 0.2687, + "num_input_tokens_seen": 18355856, + "step": 9368 + }, + { + "epoch": 1.2417495029821073, + "grad_norm": 0.02228090912103653, + "learning_rate": 3.1698473058936656e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18357272, + "step": 9369 + }, + { + "epoch": 1.2418820410868125, + "grad_norm": 0.02004374749958515, + "learning_rate": 3.1695128548906163e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18359304, + "step": 9370 + }, + { + "epoch": 1.2420145791915176, + "grad_norm": 3.675518274307251, + "learning_rate": 3.1691783909788955e-06, + "loss": 0.0349, + "num_input_tokens_seen": 18360808, + "step": 9371 + }, + { + "epoch": 1.2421471172962226, + "grad_norm": 11.972142219543457, + "learning_rate": 3.168843914164954e-06, + "loss": 0.3635, + "num_input_tokens_seen": 18362808, + "step": 9372 + }, + { + "epoch": 1.2422796554009277, + "grad_norm": 13.614036560058594, + "learning_rate": 3.168509424455242e-06, + "loss": 0.3149, + "num_input_tokens_seen": 18365552, + "step": 9373 + }, + { + "epoch": 1.242412193505633, + "grad_norm": 0.016464266926050186, + "learning_rate": 3.168174921856205e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18366896, + "step": 9374 + }, + { + "epoch": 1.242544731610338, + "grad_norm": 1.4928807020187378, + "learning_rate": 3.1678404063742956e-06, + "loss": 0.0086, + "num_input_tokens_seen": 18368176, + "step": 9375 + }, + { + "epoch": 1.242677269715043, + "grad_norm": 0.2840084135532379, + "learning_rate": 3.1675058780159624e-06, + "loss": 0.0018, + "num_input_tokens_seen": 18371768, + "step": 9376 + }, + { + "epoch": 1.2428098078197483, + "grad_norm": 8.258296012878418, + "learning_rate": 3.167171336787655e-06, + "loss": 0.1305, + "num_input_tokens_seen": 18373680, + "step": 9377 + }, + { + "epoch": 1.2429423459244533, + "grad_norm": 3.565471649169922, + "learning_rate": 3.1668367826958246e-06, + "loss": 0.1101, + "num_input_tokens_seen": 18375096, + "step": 9378 + }, + { + "epoch": 1.2430748840291583, + "grad_norm": 8.388815879821777, + "learning_rate": 3.1665022157469206e-06, + "loss": 0.1278, + "num_input_tokens_seen": 18377984, + "step": 9379 + }, + { + "epoch": 1.2432074221338634, + "grad_norm": 2.3532800674438477, + "learning_rate": 3.1661676359473935e-06, + "loss": 0.0615, + "num_input_tokens_seen": 18379584, + "step": 9380 + }, + { + "epoch": 1.2433399602385686, + "grad_norm": 2.3064815998077393, + "learning_rate": 3.1658330433036956e-06, + "loss": 0.0683, + "num_input_tokens_seen": 18381568, + "step": 9381 + }, + { + "epoch": 1.2434724983432737, + "grad_norm": 0.034538425505161285, + "learning_rate": 3.1654984378222773e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18384024, + "step": 9382 + }, + { + "epoch": 1.2436050364479787, + "grad_norm": 5.336788177490234, + "learning_rate": 3.16516381950959e-06, + "loss": 0.0974, + "num_input_tokens_seen": 18385720, + "step": 9383 + }, + { + "epoch": 1.243737574552684, + "grad_norm": 5.130247116088867, + "learning_rate": 3.164829188372085e-06, + "loss": 0.089, + "num_input_tokens_seen": 18387928, + "step": 9384 + }, + { + "epoch": 1.243870112657389, + "grad_norm": 6.359325885772705, + "learning_rate": 3.1644945444162155e-06, + "loss": 0.1341, + "num_input_tokens_seen": 18390176, + "step": 9385 + }, + { + "epoch": 1.244002650762094, + "grad_norm": 3.77101469039917, + "learning_rate": 3.164159887648432e-06, + "loss": 0.087, + "num_input_tokens_seen": 18392424, + "step": 9386 + }, + { + "epoch": 1.244135188866799, + "grad_norm": 14.491848945617676, + "learning_rate": 3.1638252180751876e-06, + "loss": 0.0051, + "num_input_tokens_seen": 18394208, + "step": 9387 + }, + { + "epoch": 1.2442677269715043, + "grad_norm": 0.027656245976686478, + "learning_rate": 3.1634905357029356e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18396320, + "step": 9388 + }, + { + "epoch": 1.2444002650762094, + "grad_norm": 0.053529489785432816, + "learning_rate": 3.163155840538128e-06, + "loss": 0.0003, + "num_input_tokens_seen": 18398192, + "step": 9389 + }, + { + "epoch": 1.2445328031809146, + "grad_norm": 4.0329790115356445, + "learning_rate": 3.1628211325872183e-06, + "loss": 0.0413, + "num_input_tokens_seen": 18399872, + "step": 9390 + }, + { + "epoch": 1.2446653412856197, + "grad_norm": 0.021568113937973976, + "learning_rate": 3.16248641185666e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18401296, + "step": 9391 + }, + { + "epoch": 1.2447978793903247, + "grad_norm": 9.324602127075195, + "learning_rate": 3.162151678352906e-06, + "loss": 0.1673, + "num_input_tokens_seen": 18403544, + "step": 9392 + }, + { + "epoch": 1.2449304174950298, + "grad_norm": 7.7093634605407715, + "learning_rate": 3.161816932082412e-06, + "loss": 0.126, + "num_input_tokens_seen": 18405728, + "step": 9393 + }, + { + "epoch": 1.245062955599735, + "grad_norm": 10.101548194885254, + "learning_rate": 3.1614821730516298e-06, + "loss": 0.2218, + "num_input_tokens_seen": 18407680, + "step": 9394 + }, + { + "epoch": 1.24519549370444, + "grad_norm": 7.607382297515869, + "learning_rate": 3.1611474012670157e-06, + "loss": 0.1541, + "num_input_tokens_seen": 18409904, + "step": 9395 + }, + { + "epoch": 1.245328031809145, + "grad_norm": 9.067893028259277, + "learning_rate": 3.1608126167350238e-06, + "loss": 0.2329, + "num_input_tokens_seen": 18412576, + "step": 9396 + }, + { + "epoch": 1.2454605699138503, + "grad_norm": 0.01961863599717617, + "learning_rate": 3.1604778194621087e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18415336, + "step": 9397 + }, + { + "epoch": 1.2455931080185554, + "grad_norm": 0.037139642983675, + "learning_rate": 3.1601430094547246e-06, + "loss": 0.0003, + "num_input_tokens_seen": 18417024, + "step": 9398 + }, + { + "epoch": 1.2457256461232604, + "grad_norm": 1.0354135036468506, + "learning_rate": 3.1598081867193297e-06, + "loss": 0.0247, + "num_input_tokens_seen": 18419296, + "step": 9399 + }, + { + "epoch": 1.2458581842279655, + "grad_norm": 12.495909690856934, + "learning_rate": 3.1594733512623765e-06, + "loss": 0.3334, + "num_input_tokens_seen": 18422208, + "step": 9400 + }, + { + "epoch": 1.2459907223326707, + "grad_norm": 8.827960014343262, + "learning_rate": 3.159138503090322e-06, + "loss": 0.2592, + "num_input_tokens_seen": 18423776, + "step": 9401 + }, + { + "epoch": 1.2461232604373758, + "grad_norm": 0.052393652498722076, + "learning_rate": 3.1588036422096225e-06, + "loss": 0.0004, + "num_input_tokens_seen": 18425552, + "step": 9402 + }, + { + "epoch": 1.2462557985420808, + "grad_norm": 8.29236888885498, + "learning_rate": 3.158468768626735e-06, + "loss": 0.2568, + "num_input_tokens_seen": 18427648, + "step": 9403 + }, + { + "epoch": 1.246388336646786, + "grad_norm": 3.726759672164917, + "learning_rate": 3.158133882348115e-06, + "loss": 0.0938, + "num_input_tokens_seen": 18429936, + "step": 9404 + }, + { + "epoch": 1.246520874751491, + "grad_norm": 6.531547546386719, + "learning_rate": 3.15779898338022e-06, + "loss": 0.0882, + "num_input_tokens_seen": 18431680, + "step": 9405 + }, + { + "epoch": 1.2466534128561961, + "grad_norm": 2.9095585346221924, + "learning_rate": 3.1574640717295068e-06, + "loss": 0.053, + "num_input_tokens_seen": 18432864, + "step": 9406 + }, + { + "epoch": 1.2467859509609012, + "grad_norm": 10.680782318115234, + "learning_rate": 3.1571291474024315e-06, + "loss": 0.3253, + "num_input_tokens_seen": 18434712, + "step": 9407 + }, + { + "epoch": 1.2469184890656064, + "grad_norm": 12.099526405334473, + "learning_rate": 3.1567942104054543e-06, + "loss": 0.2051, + "num_input_tokens_seen": 18436728, + "step": 9408 + }, + { + "epoch": 1.2470510271703115, + "grad_norm": 6.636758327484131, + "learning_rate": 3.1564592607450318e-06, + "loss": 0.1144, + "num_input_tokens_seen": 18437872, + "step": 9409 + }, + { + "epoch": 1.2471835652750165, + "grad_norm": 6.598060131072998, + "learning_rate": 3.1561242984276203e-06, + "loss": 0.0421, + "num_input_tokens_seen": 18439480, + "step": 9410 + }, + { + "epoch": 1.2473161033797218, + "grad_norm": 14.159141540527344, + "learning_rate": 3.155789323459681e-06, + "loss": 0.3629, + "num_input_tokens_seen": 18441008, + "step": 9411 + }, + { + "epoch": 1.2474486414844268, + "grad_norm": 0.2686271071434021, + "learning_rate": 3.1554543358476713e-06, + "loss": 0.0013, + "num_input_tokens_seen": 18442568, + "step": 9412 + }, + { + "epoch": 1.2475811795891318, + "grad_norm": 0.9424227476119995, + "learning_rate": 3.1551193355980485e-06, + "loss": 0.0113, + "num_input_tokens_seen": 18444480, + "step": 9413 + }, + { + "epoch": 1.2477137176938369, + "grad_norm": 16.265222549438477, + "learning_rate": 3.1547843227172742e-06, + "loss": 0.4469, + "num_input_tokens_seen": 18446832, + "step": 9414 + }, + { + "epoch": 1.2478462557985421, + "grad_norm": 5.499216079711914, + "learning_rate": 3.154449297211806e-06, + "loss": 0.0324, + "num_input_tokens_seen": 18448416, + "step": 9415 + }, + { + "epoch": 1.2479787939032472, + "grad_norm": 0.10832680761814117, + "learning_rate": 3.1541142590881035e-06, + "loss": 0.0007, + "num_input_tokens_seen": 18449832, + "step": 9416 + }, + { + "epoch": 1.2481113320079522, + "grad_norm": 2.778978109359741, + "learning_rate": 3.153779208352628e-06, + "loss": 0.032, + "num_input_tokens_seen": 18452256, + "step": 9417 + }, + { + "epoch": 1.2482438701126575, + "grad_norm": 9.094318389892578, + "learning_rate": 3.1534441450118374e-06, + "loss": 0.2523, + "num_input_tokens_seen": 18454080, + "step": 9418 + }, + { + "epoch": 1.2483764082173625, + "grad_norm": 6.72135591506958, + "learning_rate": 3.1531090690721927e-06, + "loss": 0.0791, + "num_input_tokens_seen": 18455928, + "step": 9419 + }, + { + "epoch": 1.2485089463220675, + "grad_norm": 9.36076545715332, + "learning_rate": 3.1527739805401548e-06, + "loss": 0.2046, + "num_input_tokens_seen": 18458376, + "step": 9420 + }, + { + "epoch": 1.2486414844267726, + "grad_norm": 4.62542200088501, + "learning_rate": 3.152438879422185e-06, + "loss": 0.108, + "num_input_tokens_seen": 18459936, + "step": 9421 + }, + { + "epoch": 1.2487740225314778, + "grad_norm": 0.1427992582321167, + "learning_rate": 3.152103765724743e-06, + "loss": 0.0009, + "num_input_tokens_seen": 18462016, + "step": 9422 + }, + { + "epoch": 1.2489065606361829, + "grad_norm": 4.885402679443359, + "learning_rate": 3.15176863945429e-06, + "loss": 0.0882, + "num_input_tokens_seen": 18463904, + "step": 9423 + }, + { + "epoch": 1.249039098740888, + "grad_norm": 0.07357434183359146, + "learning_rate": 3.1514335006172892e-06, + "loss": 0.0005, + "num_input_tokens_seen": 18466440, + "step": 9424 + }, + { + "epoch": 1.2491716368455932, + "grad_norm": 3.786656141281128, + "learning_rate": 3.1510983492202e-06, + "loss": 0.0386, + "num_input_tokens_seen": 18468152, + "step": 9425 + }, + { + "epoch": 1.2493041749502982, + "grad_norm": 0.38764688372612, + "learning_rate": 3.1507631852694854e-06, + "loss": 0.0026, + "num_input_tokens_seen": 18470480, + "step": 9426 + }, + { + "epoch": 1.2494367130550033, + "grad_norm": 0.549560546875, + "learning_rate": 3.150428008771609e-06, + "loss": 0.0077, + "num_input_tokens_seen": 18472208, + "step": 9427 + }, + { + "epoch": 1.2495692511597083, + "grad_norm": 0.6289083361625671, + "learning_rate": 3.15009281973303e-06, + "loss": 0.0035, + "num_input_tokens_seen": 18473744, + "step": 9428 + }, + { + "epoch": 1.2497017892644136, + "grad_norm": 0.17262619733810425, + "learning_rate": 3.1497576181602143e-06, + "loss": 0.0012, + "num_input_tokens_seen": 18476088, + "step": 9429 + }, + { + "epoch": 1.2498343273691186, + "grad_norm": 1.0129958391189575, + "learning_rate": 3.149422404059623e-06, + "loss": 0.0166, + "num_input_tokens_seen": 18478120, + "step": 9430 + }, + { + "epoch": 1.2499668654738236, + "grad_norm": 0.05448863282799721, + "learning_rate": 3.1490871774377195e-06, + "loss": 0.0004, + "num_input_tokens_seen": 18480344, + "step": 9431 + }, + { + "epoch": 1.250099403578529, + "grad_norm": 6.407369613647461, + "learning_rate": 3.148751938300968e-06, + "loss": 0.0635, + "num_input_tokens_seen": 18483032, + "step": 9432 + }, + { + "epoch": 1.250231941683234, + "grad_norm": 10.402389526367188, + "learning_rate": 3.1484166866558313e-06, + "loss": 0.2615, + "num_input_tokens_seen": 18484976, + "step": 9433 + }, + { + "epoch": 1.250364479787939, + "grad_norm": 5.193000316619873, + "learning_rate": 3.1480814225087734e-06, + "loss": 0.1604, + "num_input_tokens_seen": 18486912, + "step": 9434 + }, + { + "epoch": 1.250497017892644, + "grad_norm": 0.06429009884595871, + "learning_rate": 3.1477461458662597e-06, + "loss": 0.0004, + "num_input_tokens_seen": 18488512, + "step": 9435 + }, + { + "epoch": 1.2506295559973493, + "grad_norm": 11.41175651550293, + "learning_rate": 3.1474108567347517e-06, + "loss": 0.1243, + "num_input_tokens_seen": 18490920, + "step": 9436 + }, + { + "epoch": 1.2507620941020543, + "grad_norm": 8.308806419372559, + "learning_rate": 3.1470755551207164e-06, + "loss": 0.1525, + "num_input_tokens_seen": 18492896, + "step": 9437 + }, + { + "epoch": 1.2508946322067596, + "grad_norm": 0.0789283812046051, + "learning_rate": 3.146740241030618e-06, + "loss": 0.0006, + "num_input_tokens_seen": 18495440, + "step": 9438 + }, + { + "epoch": 1.2510271703114646, + "grad_norm": 8.656133651733398, + "learning_rate": 3.1464049144709223e-06, + "loss": 0.1156, + "num_input_tokens_seen": 18497656, + "step": 9439 + }, + { + "epoch": 1.2511597084161696, + "grad_norm": 4.3566718101501465, + "learning_rate": 3.1460695754480937e-06, + "loss": 0.0455, + "num_input_tokens_seen": 18499240, + "step": 9440 + }, + { + "epoch": 1.2512922465208747, + "grad_norm": 9.78972339630127, + "learning_rate": 3.1457342239685974e-06, + "loss": 0.1711, + "num_input_tokens_seen": 18500920, + "step": 9441 + }, + { + "epoch": 1.25142478462558, + "grad_norm": 0.33329737186431885, + "learning_rate": 3.1453988600388997e-06, + "loss": 0.0016, + "num_input_tokens_seen": 18502384, + "step": 9442 + }, + { + "epoch": 1.251557322730285, + "grad_norm": 8.892279624938965, + "learning_rate": 3.1450634836654676e-06, + "loss": 0.0965, + "num_input_tokens_seen": 18503920, + "step": 9443 + }, + { + "epoch": 1.25168986083499, + "grad_norm": 2.3811089992523193, + "learning_rate": 3.144728094854766e-06, + "loss": 0.0466, + "num_input_tokens_seen": 18505416, + "step": 9444 + }, + { + "epoch": 1.2518223989396953, + "grad_norm": 0.09232152253389359, + "learning_rate": 3.144392693613262e-06, + "loss": 0.0006, + "num_input_tokens_seen": 18507912, + "step": 9445 + }, + { + "epoch": 1.2519549370444003, + "grad_norm": 0.42596831917762756, + "learning_rate": 3.144057279947422e-06, + "loss": 0.0051, + "num_input_tokens_seen": 18509976, + "step": 9446 + }, + { + "epoch": 1.2520874751491053, + "grad_norm": 4.679616928100586, + "learning_rate": 3.1437218538637136e-06, + "loss": 0.0622, + "num_input_tokens_seen": 18512392, + "step": 9447 + }, + { + "epoch": 1.2522200132538104, + "grad_norm": 9.054844856262207, + "learning_rate": 3.143386415368604e-06, + "loss": 0.0962, + "num_input_tokens_seen": 18514040, + "step": 9448 + }, + { + "epoch": 1.2523525513585156, + "grad_norm": 3.491696357727051, + "learning_rate": 3.1430509644685604e-06, + "loss": 0.1033, + "num_input_tokens_seen": 18515760, + "step": 9449 + }, + { + "epoch": 1.2524850894632207, + "grad_norm": 0.15452627837657928, + "learning_rate": 3.1427155011700506e-06, + "loss": 0.0007, + "num_input_tokens_seen": 18517384, + "step": 9450 + }, + { + "epoch": 1.2526176275679257, + "grad_norm": 12.096922874450684, + "learning_rate": 3.1423800254795422e-06, + "loss": 0.3494, + "num_input_tokens_seen": 18519848, + "step": 9451 + }, + { + "epoch": 1.252750165672631, + "grad_norm": 7.669510841369629, + "learning_rate": 3.1420445374035043e-06, + "loss": 0.3238, + "num_input_tokens_seen": 18522272, + "step": 9452 + }, + { + "epoch": 1.252882703777336, + "grad_norm": 6.504327297210693, + "learning_rate": 3.1417090369484042e-06, + "loss": 0.0619, + "num_input_tokens_seen": 18523968, + "step": 9453 + }, + { + "epoch": 1.253015241882041, + "grad_norm": 5.50931453704834, + "learning_rate": 3.1413735241207117e-06, + "loss": 0.1576, + "num_input_tokens_seen": 18525456, + "step": 9454 + }, + { + "epoch": 1.253147779986746, + "grad_norm": 2.3638157844543457, + "learning_rate": 3.1410379989268947e-06, + "loss": 0.0292, + "num_input_tokens_seen": 18527608, + "step": 9455 + }, + { + "epoch": 1.2532803180914514, + "grad_norm": 7.7129807472229, + "learning_rate": 3.140702461373424e-06, + "loss": 0.07, + "num_input_tokens_seen": 18529088, + "step": 9456 + }, + { + "epoch": 1.2534128561961564, + "grad_norm": 0.2745905816555023, + "learning_rate": 3.1403669114667655e-06, + "loss": 0.002, + "num_input_tokens_seen": 18531976, + "step": 9457 + }, + { + "epoch": 1.2535453943008614, + "grad_norm": 5.1964030265808105, + "learning_rate": 3.140031349213393e-06, + "loss": 0.0712, + "num_input_tokens_seen": 18534056, + "step": 9458 + }, + { + "epoch": 1.2536779324055667, + "grad_norm": 0.15483932197093964, + "learning_rate": 3.1396957746197743e-06, + "loss": 0.001, + "num_input_tokens_seen": 18535168, + "step": 9459 + }, + { + "epoch": 1.2538104705102717, + "grad_norm": 13.136423110961914, + "learning_rate": 3.1393601876923796e-06, + "loss": 0.1342, + "num_input_tokens_seen": 18536752, + "step": 9460 + }, + { + "epoch": 1.2539430086149768, + "grad_norm": 9.722463607788086, + "learning_rate": 3.1390245884376795e-06, + "loss": 0.134, + "num_input_tokens_seen": 18538720, + "step": 9461 + }, + { + "epoch": 1.2540755467196818, + "grad_norm": 0.6463195085525513, + "learning_rate": 3.1386889768621444e-06, + "loss": 0.0044, + "num_input_tokens_seen": 18540616, + "step": 9462 + }, + { + "epoch": 1.254208084824387, + "grad_norm": 4.802942752838135, + "learning_rate": 3.1383533529722452e-06, + "loss": 0.1939, + "num_input_tokens_seen": 18542688, + "step": 9463 + }, + { + "epoch": 1.254340622929092, + "grad_norm": 5.149003505706787, + "learning_rate": 3.138017716774453e-06, + "loss": 0.169, + "num_input_tokens_seen": 18544008, + "step": 9464 + }, + { + "epoch": 1.2544731610337971, + "grad_norm": 11.130813598632812, + "learning_rate": 3.1376820682752398e-06, + "loss": 0.1901, + "num_input_tokens_seen": 18545544, + "step": 9465 + }, + { + "epoch": 1.2546056991385024, + "grad_norm": 0.5692558288574219, + "learning_rate": 3.1373464074810757e-06, + "loss": 0.004, + "num_input_tokens_seen": 18547888, + "step": 9466 + }, + { + "epoch": 1.2547382372432074, + "grad_norm": 8.315135955810547, + "learning_rate": 3.1370107343984324e-06, + "loss": 0.3071, + "num_input_tokens_seen": 18550448, + "step": 9467 + }, + { + "epoch": 1.2548707753479125, + "grad_norm": 1.7729310989379883, + "learning_rate": 3.136675049033784e-06, + "loss": 0.0201, + "num_input_tokens_seen": 18551744, + "step": 9468 + }, + { + "epoch": 1.2550033134526175, + "grad_norm": 0.9985671043395996, + "learning_rate": 3.1363393513936006e-06, + "loss": 0.0099, + "num_input_tokens_seen": 18553328, + "step": 9469 + }, + { + "epoch": 1.2551358515573228, + "grad_norm": 0.1492919623851776, + "learning_rate": 3.136003641484356e-06, + "loss": 0.001, + "num_input_tokens_seen": 18554640, + "step": 9470 + }, + { + "epoch": 1.2552683896620278, + "grad_norm": 0.3378029465675354, + "learning_rate": 3.1356679193125226e-06, + "loss": 0.0024, + "num_input_tokens_seen": 18556344, + "step": 9471 + }, + { + "epoch": 1.255400927766733, + "grad_norm": 0.16051307320594788, + "learning_rate": 3.135332184884572e-06, + "loss": 0.0011, + "num_input_tokens_seen": 18557960, + "step": 9472 + }, + { + "epoch": 1.255533465871438, + "grad_norm": 4.7512288093566895, + "learning_rate": 3.1349964382069798e-06, + "loss": 0.0827, + "num_input_tokens_seen": 18559768, + "step": 9473 + }, + { + "epoch": 1.2556660039761431, + "grad_norm": 0.15483461320400238, + "learning_rate": 3.1346606792862184e-06, + "loss": 0.0011, + "num_input_tokens_seen": 18561192, + "step": 9474 + }, + { + "epoch": 1.2557985420808482, + "grad_norm": 0.08765651285648346, + "learning_rate": 3.1343249081287604e-06, + "loss": 0.0006, + "num_input_tokens_seen": 18564008, + "step": 9475 + }, + { + "epoch": 1.2559310801855532, + "grad_norm": 7.49232816696167, + "learning_rate": 3.133989124741081e-06, + "loss": 0.1776, + "num_input_tokens_seen": 18566016, + "step": 9476 + }, + { + "epoch": 1.2560636182902585, + "grad_norm": 6.845271110534668, + "learning_rate": 3.1336533291296534e-06, + "loss": 0.157, + "num_input_tokens_seen": 18567680, + "step": 9477 + }, + { + "epoch": 1.2561961563949635, + "grad_norm": 12.176644325256348, + "learning_rate": 3.133317521300953e-06, + "loss": 0.3223, + "num_input_tokens_seen": 18569832, + "step": 9478 + }, + { + "epoch": 1.2563286944996688, + "grad_norm": 2.2846949100494385, + "learning_rate": 3.1329817012614544e-06, + "loss": 0.008, + "num_input_tokens_seen": 18571328, + "step": 9479 + }, + { + "epoch": 1.2564612326043738, + "grad_norm": 6.750478267669678, + "learning_rate": 3.1326458690176307e-06, + "loss": 0.1171, + "num_input_tokens_seen": 18573064, + "step": 9480 + }, + { + "epoch": 1.2565937707090788, + "grad_norm": 3.2539079189300537, + "learning_rate": 3.132310024575959e-06, + "loss": 0.079, + "num_input_tokens_seen": 18574552, + "step": 9481 + }, + { + "epoch": 1.2567263088137839, + "grad_norm": 1.7233370542526245, + "learning_rate": 3.1319741679429135e-06, + "loss": 0.0174, + "num_input_tokens_seen": 18575824, + "step": 9482 + }, + { + "epoch": 1.2568588469184891, + "grad_norm": 8.986180305480957, + "learning_rate": 3.1316382991249707e-06, + "loss": 0.1727, + "num_input_tokens_seen": 18578440, + "step": 9483 + }, + { + "epoch": 1.2569913850231942, + "grad_norm": 7.015954971313477, + "learning_rate": 3.131302418128605e-06, + "loss": 0.1063, + "num_input_tokens_seen": 18579568, + "step": 9484 + }, + { + "epoch": 1.2571239231278992, + "grad_norm": 0.16736604273319244, + "learning_rate": 3.1309665249602934e-06, + "loss": 0.0008, + "num_input_tokens_seen": 18580976, + "step": 9485 + }, + { + "epoch": 1.2572564612326045, + "grad_norm": 13.920928001403809, + "learning_rate": 3.1306306196265125e-06, + "loss": 0.3723, + "num_input_tokens_seen": 18582792, + "step": 9486 + }, + { + "epoch": 1.2573889993373095, + "grad_norm": 0.45890504121780396, + "learning_rate": 3.130294702133737e-06, + "loss": 0.0029, + "num_input_tokens_seen": 18584504, + "step": 9487 + }, + { + "epoch": 1.2575215374420146, + "grad_norm": 0.0401802696287632, + "learning_rate": 3.1299587724884454e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18585824, + "step": 9488 + }, + { + "epoch": 1.2576540755467196, + "grad_norm": 8.223142623901367, + "learning_rate": 3.1296228306971146e-06, + "loss": 0.107, + "num_input_tokens_seen": 18587416, + "step": 9489 + }, + { + "epoch": 1.2577866136514249, + "grad_norm": 10.77361011505127, + "learning_rate": 3.1292868767662197e-06, + "loss": 0.204, + "num_input_tokens_seen": 18589000, + "step": 9490 + }, + { + "epoch": 1.25791915175613, + "grad_norm": 7.1792826652526855, + "learning_rate": 3.1289509107022416e-06, + "loss": 0.2093, + "num_input_tokens_seen": 18590664, + "step": 9491 + }, + { + "epoch": 1.258051689860835, + "grad_norm": 11.447093963623047, + "learning_rate": 3.1286149325116553e-06, + "loss": 0.0759, + "num_input_tokens_seen": 18592952, + "step": 9492 + }, + { + "epoch": 1.2581842279655402, + "grad_norm": 13.41461181640625, + "learning_rate": 3.128278942200939e-06, + "loss": 0.3191, + "num_input_tokens_seen": 18594432, + "step": 9493 + }, + { + "epoch": 1.2583167660702452, + "grad_norm": 7.305506229400635, + "learning_rate": 3.127942939776572e-06, + "loss": 0.0567, + "num_input_tokens_seen": 18596576, + "step": 9494 + }, + { + "epoch": 1.2584493041749503, + "grad_norm": 0.07337658107280731, + "learning_rate": 3.1276069252450313e-06, + "loss": 0.0005, + "num_input_tokens_seen": 18598376, + "step": 9495 + }, + { + "epoch": 1.2585818422796553, + "grad_norm": 5.728024005889893, + "learning_rate": 3.1272708986127964e-06, + "loss": 0.1101, + "num_input_tokens_seen": 18600424, + "step": 9496 + }, + { + "epoch": 1.2587143803843606, + "grad_norm": 11.810660362243652, + "learning_rate": 3.1269348598863463e-06, + "loss": 0.3545, + "num_input_tokens_seen": 18602688, + "step": 9497 + }, + { + "epoch": 1.2588469184890656, + "grad_norm": 0.04677664488554001, + "learning_rate": 3.126598809072159e-06, + "loss": 0.0003, + "num_input_tokens_seen": 18604552, + "step": 9498 + }, + { + "epoch": 1.2589794565937706, + "grad_norm": 7.689836502075195, + "learning_rate": 3.126262746176714e-06, + "loss": 0.1539, + "num_input_tokens_seen": 18607288, + "step": 9499 + }, + { + "epoch": 1.259111994698476, + "grad_norm": 3.314669609069824, + "learning_rate": 3.125926671206493e-06, + "loss": 0.0591, + "num_input_tokens_seen": 18608728, + "step": 9500 + }, + { + "epoch": 1.259244532803181, + "grad_norm": 0.008816717192530632, + "learning_rate": 3.1255905841679725e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18610136, + "step": 9501 + }, + { + "epoch": 1.259377070907886, + "grad_norm": 2.8269801139831543, + "learning_rate": 3.125254485067634e-06, + "loss": 0.0636, + "num_input_tokens_seen": 18613064, + "step": 9502 + }, + { + "epoch": 1.259509609012591, + "grad_norm": 7.846867084503174, + "learning_rate": 3.1249183739119576e-06, + "loss": 0.1682, + "num_input_tokens_seen": 18614816, + "step": 9503 + }, + { + "epoch": 1.2596421471172963, + "grad_norm": 0.03581458330154419, + "learning_rate": 3.124582250707424e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18616200, + "step": 9504 + }, + { + "epoch": 1.2597746852220013, + "grad_norm": 8.068528175354004, + "learning_rate": 3.1242461154605142e-06, + "loss": 0.2604, + "num_input_tokens_seen": 18618560, + "step": 9505 + }, + { + "epoch": 1.2599072233267063, + "grad_norm": 7.452920913696289, + "learning_rate": 3.123909968177708e-06, + "loss": 0.1024, + "num_input_tokens_seen": 18620112, + "step": 9506 + }, + { + "epoch": 1.2600397614314116, + "grad_norm": 6.352168560028076, + "learning_rate": 3.1235738088654876e-06, + "loss": 0.203, + "num_input_tokens_seen": 18622216, + "step": 9507 + }, + { + "epoch": 1.2601722995361166, + "grad_norm": 0.038655441254377365, + "learning_rate": 3.1232376375303337e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18623784, + "step": 9508 + }, + { + "epoch": 1.2603048376408217, + "grad_norm": 0.1970355063676834, + "learning_rate": 3.1229014541787284e-06, + "loss": 0.0012, + "num_input_tokens_seen": 18625528, + "step": 9509 + }, + { + "epoch": 1.2604373757455267, + "grad_norm": 9.269770622253418, + "learning_rate": 3.1225652588171534e-06, + "loss": 0.2189, + "num_input_tokens_seen": 18627320, + "step": 9510 + }, + { + "epoch": 1.260569913850232, + "grad_norm": 1.2548916339874268, + "learning_rate": 3.1222290514520903e-06, + "loss": 0.007, + "num_input_tokens_seen": 18629080, + "step": 9511 + }, + { + "epoch": 1.260702451954937, + "grad_norm": 4.4019694328308105, + "learning_rate": 3.1218928320900226e-06, + "loss": 0.0762, + "num_input_tokens_seen": 18630480, + "step": 9512 + }, + { + "epoch": 1.2608349900596423, + "grad_norm": 1.8467514514923096, + "learning_rate": 3.1215566007374316e-06, + "loss": 0.0132, + "num_input_tokens_seen": 18632144, + "step": 9513 + }, + { + "epoch": 1.2609675281643473, + "grad_norm": 5.521622657775879, + "learning_rate": 3.1212203574008002e-06, + "loss": 0.0693, + "num_input_tokens_seen": 18634304, + "step": 9514 + }, + { + "epoch": 1.2611000662690524, + "grad_norm": 10.560824394226074, + "learning_rate": 3.120884102086613e-06, + "loss": 0.1431, + "num_input_tokens_seen": 18636296, + "step": 9515 + }, + { + "epoch": 1.2612326043737574, + "grad_norm": 12.83585262298584, + "learning_rate": 3.120547834801351e-06, + "loss": 0.2935, + "num_input_tokens_seen": 18638936, + "step": 9516 + }, + { + "epoch": 1.2613651424784624, + "grad_norm": 9.468527793884277, + "learning_rate": 3.1202115555514984e-06, + "loss": 0.1141, + "num_input_tokens_seen": 18640664, + "step": 9517 + }, + { + "epoch": 1.2614976805831677, + "grad_norm": 13.555997848510742, + "learning_rate": 3.1198752643435405e-06, + "loss": 0.2374, + "num_input_tokens_seen": 18642712, + "step": 9518 + }, + { + "epoch": 1.2616302186878727, + "grad_norm": 0.10782702267169952, + "learning_rate": 3.1195389611839583e-06, + "loss": 0.0006, + "num_input_tokens_seen": 18644000, + "step": 9519 + }, + { + "epoch": 1.261762756792578, + "grad_norm": 0.2904266119003296, + "learning_rate": 3.119202646079238e-06, + "loss": 0.0018, + "num_input_tokens_seen": 18645680, + "step": 9520 + }, + { + "epoch": 1.261895294897283, + "grad_norm": 1.20993173122406, + "learning_rate": 3.1188663190358636e-06, + "loss": 0.0087, + "num_input_tokens_seen": 18647904, + "step": 9521 + }, + { + "epoch": 1.262027833001988, + "grad_norm": 15.421807289123535, + "learning_rate": 3.1185299800603195e-06, + "loss": 0.2938, + "num_input_tokens_seen": 18649472, + "step": 9522 + }, + { + "epoch": 1.262160371106693, + "grad_norm": 6.484807014465332, + "learning_rate": 3.1181936291590916e-06, + "loss": 0.1156, + "num_input_tokens_seen": 18651536, + "step": 9523 + }, + { + "epoch": 1.2622929092113981, + "grad_norm": 0.25301843881607056, + "learning_rate": 3.117857266338663e-06, + "loss": 0.0029, + "num_input_tokens_seen": 18652664, + "step": 9524 + }, + { + "epoch": 1.2624254473161034, + "grad_norm": 0.5421400666236877, + "learning_rate": 3.11752089160552e-06, + "loss": 0.0037, + "num_input_tokens_seen": 18654280, + "step": 9525 + }, + { + "epoch": 1.2625579854208084, + "grad_norm": 4.363363265991211, + "learning_rate": 3.117184504966149e-06, + "loss": 0.0774, + "num_input_tokens_seen": 18655696, + "step": 9526 + }, + { + "epoch": 1.2626905235255137, + "grad_norm": 0.01573200523853302, + "learning_rate": 3.1168481064270344e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18657320, + "step": 9527 + }, + { + "epoch": 1.2628230616302187, + "grad_norm": 3.245358467102051, + "learning_rate": 3.116511695994663e-06, + "loss": 0.0272, + "num_input_tokens_seen": 18659048, + "step": 9528 + }, + { + "epoch": 1.2629555997349238, + "grad_norm": 4.839794158935547, + "learning_rate": 3.116175273675521e-06, + "loss": 0.1093, + "num_input_tokens_seen": 18660408, + "step": 9529 + }, + { + "epoch": 1.2630881378396288, + "grad_norm": 2.515503168106079, + "learning_rate": 3.1158388394760946e-06, + "loss": 0.0256, + "num_input_tokens_seen": 18661792, + "step": 9530 + }, + { + "epoch": 1.263220675944334, + "grad_norm": 10.010395050048828, + "learning_rate": 3.11550239340287e-06, + "loss": 0.1196, + "num_input_tokens_seen": 18664184, + "step": 9531 + }, + { + "epoch": 1.263353214049039, + "grad_norm": 16.384199142456055, + "learning_rate": 3.1151659354623348e-06, + "loss": 0.2016, + "num_input_tokens_seen": 18666112, + "step": 9532 + }, + { + "epoch": 1.2634857521537441, + "grad_norm": 15.091303825378418, + "learning_rate": 3.114829465660977e-06, + "loss": 0.2366, + "num_input_tokens_seen": 18668592, + "step": 9533 + }, + { + "epoch": 1.2636182902584494, + "grad_norm": 0.546700119972229, + "learning_rate": 3.1144929840052816e-06, + "loss": 0.0022, + "num_input_tokens_seen": 18670608, + "step": 9534 + }, + { + "epoch": 1.2637508283631544, + "grad_norm": 10.77407455444336, + "learning_rate": 3.114156490501738e-06, + "loss": 0.2383, + "num_input_tokens_seen": 18673248, + "step": 9535 + }, + { + "epoch": 1.2638833664678595, + "grad_norm": 7.434164524078369, + "learning_rate": 3.113819985156834e-06, + "loss": 0.2916, + "num_input_tokens_seen": 18676008, + "step": 9536 + }, + { + "epoch": 1.2640159045725645, + "grad_norm": 14.079195022583008, + "learning_rate": 3.113483467977056e-06, + "loss": 0.2363, + "num_input_tokens_seen": 18677664, + "step": 9537 + }, + { + "epoch": 1.2641484426772698, + "grad_norm": 1.8467170000076294, + "learning_rate": 3.1131469389688953e-06, + "loss": 0.0096, + "num_input_tokens_seen": 18680304, + "step": 9538 + }, + { + "epoch": 1.2642809807819748, + "grad_norm": 7.341131687164307, + "learning_rate": 3.112810398138837e-06, + "loss": 0.1654, + "num_input_tokens_seen": 18681984, + "step": 9539 + }, + { + "epoch": 1.2644135188866799, + "grad_norm": 14.365467071533203, + "learning_rate": 3.1124738454933723e-06, + "loss": 0.6037, + "num_input_tokens_seen": 18683976, + "step": 9540 + }, + { + "epoch": 1.2645460569913851, + "grad_norm": 5.505293369293213, + "learning_rate": 3.112137281038989e-06, + "loss": 0.0492, + "num_input_tokens_seen": 18685232, + "step": 9541 + }, + { + "epoch": 1.2646785950960902, + "grad_norm": 0.0494539849460125, + "learning_rate": 3.111800704782177e-06, + "loss": 0.0003, + "num_input_tokens_seen": 18687168, + "step": 9542 + }, + { + "epoch": 1.2648111332007952, + "grad_norm": 0.21555307507514954, + "learning_rate": 3.111464116729425e-06, + "loss": 0.0026, + "num_input_tokens_seen": 18689048, + "step": 9543 + }, + { + "epoch": 1.2649436713055002, + "grad_norm": 0.05539043992757797, + "learning_rate": 3.1111275168872234e-06, + "loss": 0.0004, + "num_input_tokens_seen": 18690912, + "step": 9544 + }, + { + "epoch": 1.2650762094102055, + "grad_norm": 8.146784782409668, + "learning_rate": 3.1107909052620617e-06, + "loss": 0.1219, + "num_input_tokens_seen": 18693328, + "step": 9545 + }, + { + "epoch": 1.2652087475149105, + "grad_norm": 7.495118618011475, + "learning_rate": 3.1104542818604294e-06, + "loss": 0.2423, + "num_input_tokens_seen": 18695008, + "step": 9546 + }, + { + "epoch": 1.2653412856196156, + "grad_norm": 3.8518741130828857, + "learning_rate": 3.1101176466888177e-06, + "loss": 0.053, + "num_input_tokens_seen": 18697560, + "step": 9547 + }, + { + "epoch": 1.2654738237243208, + "grad_norm": 4.492794036865234, + "learning_rate": 3.1097809997537175e-06, + "loss": 0.1521, + "num_input_tokens_seen": 18699352, + "step": 9548 + }, + { + "epoch": 1.2656063618290259, + "grad_norm": 4.953414440155029, + "learning_rate": 3.1094443410616183e-06, + "loss": 0.061, + "num_input_tokens_seen": 18701736, + "step": 9549 + }, + { + "epoch": 1.265738899933731, + "grad_norm": 10.90267276763916, + "learning_rate": 3.1091076706190114e-06, + "loss": 0.3236, + "num_input_tokens_seen": 18704688, + "step": 9550 + }, + { + "epoch": 1.265871438038436, + "grad_norm": 8.966817855834961, + "learning_rate": 3.1087709884323896e-06, + "loss": 0.1224, + "num_input_tokens_seen": 18706512, + "step": 9551 + }, + { + "epoch": 1.2660039761431412, + "grad_norm": 1.2514560222625732, + "learning_rate": 3.108434294508242e-06, + "loss": 0.0141, + "num_input_tokens_seen": 18707928, + "step": 9552 + }, + { + "epoch": 1.2661365142478462, + "grad_norm": 1.3735851049423218, + "learning_rate": 3.1080975888530618e-06, + "loss": 0.0153, + "num_input_tokens_seen": 18709784, + "step": 9553 + }, + { + "epoch": 1.2662690523525515, + "grad_norm": 6.200412273406982, + "learning_rate": 3.1077608714733417e-06, + "loss": 0.1075, + "num_input_tokens_seen": 18712104, + "step": 9554 + }, + { + "epoch": 1.2664015904572565, + "grad_norm": 4.373355865478516, + "learning_rate": 3.107424142375571e-06, + "loss": 0.0557, + "num_input_tokens_seen": 18714056, + "step": 9555 + }, + { + "epoch": 1.2665341285619616, + "grad_norm": 0.972809910774231, + "learning_rate": 3.1070874015662443e-06, + "loss": 0.0047, + "num_input_tokens_seen": 18716824, + "step": 9556 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 4.625906944274902, + "learning_rate": 3.1067506490518546e-06, + "loss": 0.0343, + "num_input_tokens_seen": 18719264, + "step": 9557 + }, + { + "epoch": 1.2667992047713716, + "grad_norm": 4.805831432342529, + "learning_rate": 3.1064138848388925e-06, + "loss": 0.0519, + "num_input_tokens_seen": 18721984, + "step": 9558 + }, + { + "epoch": 1.266931742876077, + "grad_norm": 7.70939302444458, + "learning_rate": 3.106077108933853e-06, + "loss": 0.1364, + "num_input_tokens_seen": 18725352, + "step": 9559 + }, + { + "epoch": 1.267064280980782, + "grad_norm": 0.014057337306439877, + "learning_rate": 3.105740321343229e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18727040, + "step": 9560 + }, + { + "epoch": 1.2671968190854872, + "grad_norm": 0.13010846078395844, + "learning_rate": 3.105403522073513e-06, + "loss": 0.0008, + "num_input_tokens_seen": 18729440, + "step": 9561 + }, + { + "epoch": 1.2673293571901922, + "grad_norm": 10.101225852966309, + "learning_rate": 3.1050667111312005e-06, + "loss": 0.2033, + "num_input_tokens_seen": 18731560, + "step": 9562 + }, + { + "epoch": 1.2674618952948973, + "grad_norm": 4.382718086242676, + "learning_rate": 3.1047298885227833e-06, + "loss": 0.0261, + "num_input_tokens_seen": 18733080, + "step": 9563 + }, + { + "epoch": 1.2675944333996023, + "grad_norm": 0.7799378037452698, + "learning_rate": 3.1043930542547573e-06, + "loss": 0.0064, + "num_input_tokens_seen": 18735448, + "step": 9564 + }, + { + "epoch": 1.2677269715043074, + "grad_norm": 1.7454496622085571, + "learning_rate": 3.104056208333616e-06, + "loss": 0.0208, + "num_input_tokens_seen": 18737096, + "step": 9565 + }, + { + "epoch": 1.2678595096090126, + "grad_norm": 6.805015563964844, + "learning_rate": 3.1037193507658547e-06, + "loss": 0.1759, + "num_input_tokens_seen": 18738800, + "step": 9566 + }, + { + "epoch": 1.2679920477137177, + "grad_norm": 0.14510655403137207, + "learning_rate": 3.1033824815579674e-06, + "loss": 0.0008, + "num_input_tokens_seen": 18741728, + "step": 9567 + }, + { + "epoch": 1.268124585818423, + "grad_norm": 6.825650691986084, + "learning_rate": 3.1030456007164493e-06, + "loss": 0.1444, + "num_input_tokens_seen": 18743480, + "step": 9568 + }, + { + "epoch": 1.268257123923128, + "grad_norm": 8.057954788208008, + "learning_rate": 3.1027087082477967e-06, + "loss": 0.0707, + "num_input_tokens_seen": 18745136, + "step": 9569 + }, + { + "epoch": 1.268389662027833, + "grad_norm": 0.07268264889717102, + "learning_rate": 3.1023718041585044e-06, + "loss": 0.0012, + "num_input_tokens_seen": 18747800, + "step": 9570 + }, + { + "epoch": 1.268522200132538, + "grad_norm": 9.976461410522461, + "learning_rate": 3.1020348884550676e-06, + "loss": 0.0964, + "num_input_tokens_seen": 18749728, + "step": 9571 + }, + { + "epoch": 1.2686547382372433, + "grad_norm": 2.2272660732269287, + "learning_rate": 3.1016979611439833e-06, + "loss": 0.0233, + "num_input_tokens_seen": 18751648, + "step": 9572 + }, + { + "epoch": 1.2687872763419483, + "grad_norm": 15.046384811401367, + "learning_rate": 3.101361022231747e-06, + "loss": 0.4865, + "num_input_tokens_seen": 18754176, + "step": 9573 + }, + { + "epoch": 1.2689198144466534, + "grad_norm": 0.020708920434117317, + "learning_rate": 3.101024071724855e-06, + "loss": 0.0001, + "num_input_tokens_seen": 18755688, + "step": 9574 + }, + { + "epoch": 1.2690523525513586, + "grad_norm": 14.709152221679688, + "learning_rate": 3.1006871096298054e-06, + "loss": 0.2334, + "num_input_tokens_seen": 18758088, + "step": 9575 + }, + { + "epoch": 1.2691848906560637, + "grad_norm": 2.9605281352996826, + "learning_rate": 3.1003501359530925e-06, + "loss": 0.0966, + "num_input_tokens_seen": 18760608, + "step": 9576 + }, + { + "epoch": 1.2693174287607687, + "grad_norm": 6.462142467498779, + "learning_rate": 3.100013150701216e-06, + "loss": 0.1449, + "num_input_tokens_seen": 18762728, + "step": 9577 + }, + { + "epoch": 1.2694499668654737, + "grad_norm": 0.048527903854846954, + "learning_rate": 3.0996761538806715e-06, + "loss": 0.0003, + "num_input_tokens_seen": 18765560, + "step": 9578 + }, + { + "epoch": 1.269582504970179, + "grad_norm": 2.6319034099578857, + "learning_rate": 3.0993391454979573e-06, + "loss": 0.0055, + "num_input_tokens_seen": 18767392, + "step": 9579 + }, + { + "epoch": 1.269715043074884, + "grad_norm": 0.589040994644165, + "learning_rate": 3.099002125559571e-06, + "loss": 0.0027, + "num_input_tokens_seen": 18770080, + "step": 9580 + }, + { + "epoch": 1.269847581179589, + "grad_norm": 1.468569040298462, + "learning_rate": 3.09866509407201e-06, + "loss": 0.025, + "num_input_tokens_seen": 18771584, + "step": 9581 + }, + { + "epoch": 1.2699801192842943, + "grad_norm": 9.574628829956055, + "learning_rate": 3.098328051041773e-06, + "loss": 0.0561, + "num_input_tokens_seen": 18773672, + "step": 9582 + }, + { + "epoch": 1.2701126573889994, + "grad_norm": 9.829450607299805, + "learning_rate": 3.0979909964753585e-06, + "loss": 0.2955, + "num_input_tokens_seen": 18775504, + "step": 9583 + }, + { + "epoch": 1.2702451954937044, + "grad_norm": 0.05495566502213478, + "learning_rate": 3.0976539303792653e-06, + "loss": 0.0004, + "num_input_tokens_seen": 18778176, + "step": 9584 + }, + { + "epoch": 1.2703777335984094, + "grad_norm": 14.334283828735352, + "learning_rate": 3.0973168527599915e-06, + "loss": 0.5355, + "num_input_tokens_seen": 18779736, + "step": 9585 + }, + { + "epoch": 1.2705102717031147, + "grad_norm": 12.509750366210938, + "learning_rate": 3.0969797636240373e-06, + "loss": 0.2096, + "num_input_tokens_seen": 18780936, + "step": 9586 + }, + { + "epoch": 1.2706428098078197, + "grad_norm": 2.2883243560791016, + "learning_rate": 3.0966426629779013e-06, + "loss": 0.018, + "num_input_tokens_seen": 18783576, + "step": 9587 + }, + { + "epoch": 1.2707753479125248, + "grad_norm": 0.03417337313294411, + "learning_rate": 3.096305550828083e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18784840, + "step": 9588 + }, + { + "epoch": 1.27090788601723, + "grad_norm": 8.606836318969727, + "learning_rate": 3.0959684271810825e-06, + "loss": 0.2122, + "num_input_tokens_seen": 18786584, + "step": 9589 + }, + { + "epoch": 1.271040424121935, + "grad_norm": 0.030336497351527214, + "learning_rate": 3.095631292043399e-06, + "loss": 0.0002, + "num_input_tokens_seen": 18788088, + "step": 9590 + }, + { + "epoch": 1.2711729622266401, + "grad_norm": 10.598922729492188, + "learning_rate": 3.0952941454215336e-06, + "loss": 0.1836, + "num_input_tokens_seen": 18789960, + "step": 9591 + }, + { + "epoch": 1.2713055003313452, + "grad_norm": 7.983222007751465, + "learning_rate": 3.094956987321987e-06, + "loss": 0.1773, + "num_input_tokens_seen": 18791784, + "step": 9592 + }, + { + "epoch": 1.2714380384360504, + "grad_norm": 9.11338996887207, + "learning_rate": 3.0946198177512587e-06, + "loss": 0.2313, + "num_input_tokens_seen": 18794024, + "step": 9593 + }, + { + "epoch": 1.2715705765407554, + "grad_norm": 9.565092086791992, + "learning_rate": 3.0942826367158503e-06, + "loss": 0.215, + "num_input_tokens_seen": 18795552, + "step": 9594 + }, + { + "epoch": 1.2717031146454607, + "grad_norm": 2.4294536113739014, + "learning_rate": 3.0939454442222625e-06, + "loss": 0.0495, + "num_input_tokens_seen": 18797208, + "step": 9595 + }, + { + "epoch": 1.2718356527501657, + "grad_norm": 18.182601928710938, + "learning_rate": 3.093608240276997e-06, + "loss": 0.4393, + "num_input_tokens_seen": 18799544, + "step": 9596 + }, + { + "epoch": 1.2719681908548708, + "grad_norm": 9.620306968688965, + "learning_rate": 3.093271024886555e-06, + "loss": 0.134, + "num_input_tokens_seen": 18800816, + "step": 9597 + }, + { + "epoch": 1.2721007289595758, + "grad_norm": 4.487289905548096, + "learning_rate": 3.0929337980574386e-06, + "loss": 0.0895, + "num_input_tokens_seen": 18802712, + "step": 9598 + }, + { + "epoch": 1.2722332670642809, + "grad_norm": 10.956438064575195, + "learning_rate": 3.0925965597961493e-06, + "loss": 0.2121, + "num_input_tokens_seen": 18805312, + "step": 9599 + }, + { + "epoch": 1.2723658051689861, + "grad_norm": 3.5175516605377197, + "learning_rate": 3.092259310109189e-06, + "loss": 0.0314, + "num_input_tokens_seen": 18807120, + "step": 9600 + }, + { + "epoch": 1.2724983432736912, + "grad_norm": 2.5592570304870605, + "learning_rate": 3.0919220490030614e-06, + "loss": 0.0427, + "num_input_tokens_seen": 18809288, + "step": 9601 + }, + { + "epoch": 1.2726308813783964, + "grad_norm": 8.165724754333496, + "learning_rate": 3.091584776484268e-06, + "loss": 0.2291, + "num_input_tokens_seen": 18811040, + "step": 9602 + }, + { + "epoch": 1.2727634194831015, + "grad_norm": 0.14805783331394196, + "learning_rate": 3.0912474925593124e-06, + "loss": 0.0011, + "num_input_tokens_seen": 18813920, + "step": 9603 + }, + { + "epoch": 1.2728959575878065, + "grad_norm": 12.349943161010742, + "learning_rate": 3.0909101972346967e-06, + "loss": 0.1134, + "num_input_tokens_seen": 18815280, + "step": 9604 + }, + { + "epoch": 1.2730284956925115, + "grad_norm": 9.280753135681152, + "learning_rate": 3.0905728905169245e-06, + "loss": 0.1114, + "num_input_tokens_seen": 18816960, + "step": 9605 + }, + { + "epoch": 1.2731610337972166, + "grad_norm": 1.9063833951950073, + "learning_rate": 3.0902355724125003e-06, + "loss": 0.0383, + "num_input_tokens_seen": 18818592, + "step": 9606 + }, + { + "epoch": 1.2732935719019218, + "grad_norm": 7.783565521240234, + "learning_rate": 3.0898982429279267e-06, + "loss": 0.1839, + "num_input_tokens_seen": 18820576, + "step": 9607 + }, + { + "epoch": 1.2734261100066269, + "grad_norm": 0.11132501810789108, + "learning_rate": 3.089560902069708e-06, + "loss": 0.0008, + "num_input_tokens_seen": 18823120, + "step": 9608 + }, + { + "epoch": 1.2735586481113321, + "grad_norm": 0.48439648747444153, + "learning_rate": 3.0892235498443485e-06, + "loss": 0.0098, + "num_input_tokens_seen": 18824616, + "step": 9609 + }, + { + "epoch": 1.2736911862160372, + "grad_norm": 0.922950267791748, + "learning_rate": 3.088886186258353e-06, + "loss": 0.0082, + "num_input_tokens_seen": 18827056, + "step": 9610 + }, + { + "epoch": 1.2738237243207422, + "grad_norm": 0.3289572596549988, + "learning_rate": 3.088548811318225e-06, + "loss": 0.0008, + "num_input_tokens_seen": 18828640, + "step": 9611 + }, + { + "epoch": 1.2739562624254472, + "grad_norm": 9.76451587677002, + "learning_rate": 3.08821142503047e-06, + "loss": 0.2686, + "num_input_tokens_seen": 18831232, + "step": 9612 + }, + { + "epoch": 1.2740888005301525, + "grad_norm": 1.7446781396865845, + "learning_rate": 3.087874027401593e-06, + "loss": 0.0135, + "num_input_tokens_seen": 18833096, + "step": 9613 + }, + { + "epoch": 1.2742213386348575, + "grad_norm": 13.916878700256348, + "learning_rate": 3.0875366184380996e-06, + "loss": 0.1872, + "num_input_tokens_seen": 18834976, + "step": 9614 + }, + { + "epoch": 1.2743538767395626, + "grad_norm": 10.917094230651855, + "learning_rate": 3.0871991981464936e-06, + "loss": 0.1023, + "num_input_tokens_seen": 18837184, + "step": 9615 + }, + { + "epoch": 1.2744864148442678, + "grad_norm": 5.964156627655029, + "learning_rate": 3.0868617665332835e-06, + "loss": 0.092, + "num_input_tokens_seen": 18839152, + "step": 9616 + }, + { + "epoch": 1.2746189529489729, + "grad_norm": 0.44540196657180786, + "learning_rate": 3.0865243236049724e-06, + "loss": 0.0031, + "num_input_tokens_seen": 18841496, + "step": 9617 + }, + { + "epoch": 1.274751491053678, + "grad_norm": 2.5499613285064697, + "learning_rate": 3.0861868693680687e-06, + "loss": 0.0277, + "num_input_tokens_seen": 18843344, + "step": 9618 + }, + { + "epoch": 1.274884029158383, + "grad_norm": 8.558600425720215, + "learning_rate": 3.0858494038290775e-06, + "loss": 0.0896, + "num_input_tokens_seen": 18845368, + "step": 9619 + }, + { + "epoch": 1.2750165672630882, + "grad_norm": 7.310911655426025, + "learning_rate": 3.085511926994505e-06, + "loss": 0.0744, + "num_input_tokens_seen": 18846952, + "step": 9620 + }, + { + "epoch": 1.2751491053677932, + "grad_norm": 5.543603420257568, + "learning_rate": 3.0851744388708595e-06, + "loss": 0.236, + "num_input_tokens_seen": 18849016, + "step": 9621 + }, + { + "epoch": 1.2752816434724983, + "grad_norm": 1.869657278060913, + "learning_rate": 3.0848369394646466e-06, + "loss": 0.0282, + "num_input_tokens_seen": 18850952, + "step": 9622 + }, + { + "epoch": 1.2754141815772035, + "grad_norm": 12.220597267150879, + "learning_rate": 3.0844994287823737e-06, + "loss": 0.4002, + "num_input_tokens_seen": 18853304, + "step": 9623 + }, + { + "epoch": 1.2755467196819086, + "grad_norm": 7.594727993011475, + "learning_rate": 3.0841619068305494e-06, + "loss": 0.1406, + "num_input_tokens_seen": 18854800, + "step": 9624 + }, + { + "epoch": 1.2756792577866136, + "grad_norm": 0.8061110377311707, + "learning_rate": 3.0838243736156805e-06, + "loss": 0.0048, + "num_input_tokens_seen": 18856584, + "step": 9625 + }, + { + "epoch": 1.2758117958913187, + "grad_norm": 0.11500990390777588, + "learning_rate": 3.0834868291442745e-06, + "loss": 0.0007, + "num_input_tokens_seen": 18857720, + "step": 9626 + }, + { + "epoch": 1.275944333996024, + "grad_norm": 0.19176723062992096, + "learning_rate": 3.08314927342284e-06, + "loss": 0.0019, + "num_input_tokens_seen": 18859656, + "step": 9627 + }, + { + "epoch": 1.276076872100729, + "grad_norm": 0.2339429408311844, + "learning_rate": 3.0828117064578855e-06, + "loss": 0.0014, + "num_input_tokens_seen": 18861072, + "step": 9628 + }, + { + "epoch": 1.276209410205434, + "grad_norm": 47.69477844238281, + "learning_rate": 3.082474128255919e-06, + "loss": 0.2067, + "num_input_tokens_seen": 18863760, + "step": 9629 + }, + { + "epoch": 1.2763419483101393, + "grad_norm": 5.988940715789795, + "learning_rate": 3.0821365388234493e-06, + "loss": 0.0724, + "num_input_tokens_seen": 18865048, + "step": 9630 + }, + { + "epoch": 1.2764744864148443, + "grad_norm": 24.3988094329834, + "learning_rate": 3.0817989381669862e-06, + "loss": 0.0355, + "num_input_tokens_seen": 18866728, + "step": 9631 + }, + { + "epoch": 1.2766070245195493, + "grad_norm": 17.62729835510254, + "learning_rate": 3.081461326293037e-06, + "loss": 0.3655, + "num_input_tokens_seen": 18868808, + "step": 9632 + }, + { + "epoch": 1.2767395626242544, + "grad_norm": 2.246546983718872, + "learning_rate": 3.081123703208113e-06, + "loss": 0.0814, + "num_input_tokens_seen": 18870704, + "step": 9633 + }, + { + "epoch": 1.2768721007289596, + "grad_norm": 5.094381332397461, + "learning_rate": 3.0807860689187234e-06, + "loss": 0.053, + "num_input_tokens_seen": 18872320, + "step": 9634 + }, + { + "epoch": 1.2770046388336647, + "grad_norm": 15.825763702392578, + "learning_rate": 3.080448423431377e-06, + "loss": 0.3018, + "num_input_tokens_seen": 18873968, + "step": 9635 + }, + { + "epoch": 1.2771371769383697, + "grad_norm": 13.416014671325684, + "learning_rate": 3.0801107667525852e-06, + "loss": 0.1532, + "num_input_tokens_seen": 18875904, + "step": 9636 + }, + { + "epoch": 1.277269715043075, + "grad_norm": 10.295441627502441, + "learning_rate": 3.079773098888858e-06, + "loss": 0.2415, + "num_input_tokens_seen": 18877488, + "step": 9637 + }, + { + "epoch": 1.27740225314778, + "grad_norm": 10.214176177978516, + "learning_rate": 3.0794354198467042e-06, + "loss": 0.1805, + "num_input_tokens_seen": 18879208, + "step": 9638 + }, + { + "epoch": 1.277534791252485, + "grad_norm": 7.119900703430176, + "learning_rate": 3.079097729632637e-06, + "loss": 0.1439, + "num_input_tokens_seen": 18881232, + "step": 9639 + }, + { + "epoch": 1.27766732935719, + "grad_norm": 3.8956868648529053, + "learning_rate": 3.0787600282531654e-06, + "loss": 0.1306, + "num_input_tokens_seen": 18883240, + "step": 9640 + }, + { + "epoch": 1.2777998674618953, + "grad_norm": 0.06579560786485672, + "learning_rate": 3.078422315714801e-06, + "loss": 0.0004, + "num_input_tokens_seen": 18884744, + "step": 9641 + }, + { + "epoch": 1.2779324055666004, + "grad_norm": 10.839669227600098, + "learning_rate": 3.0780845920240564e-06, + "loss": 0.0438, + "num_input_tokens_seen": 18885856, + "step": 9642 + }, + { + "epoch": 1.2780649436713056, + "grad_norm": 9.453757286071777, + "learning_rate": 3.077746857187441e-06, + "loss": 0.2159, + "num_input_tokens_seen": 18887952, + "step": 9643 + }, + { + "epoch": 1.2781974817760107, + "grad_norm": 0.9112799763679504, + "learning_rate": 3.0774091112114678e-06, + "loss": 0.0061, + "num_input_tokens_seen": 18891152, + "step": 9644 + }, + { + "epoch": 1.2783300198807157, + "grad_norm": 6.867603778839111, + "learning_rate": 3.077071354102649e-06, + "loss": 0.0365, + "num_input_tokens_seen": 18892440, + "step": 9645 + }, + { + "epoch": 1.2784625579854207, + "grad_norm": 17.317108154296875, + "learning_rate": 3.0767335858674963e-06, + "loss": 0.2027, + "num_input_tokens_seen": 18894160, + "step": 9646 + }, + { + "epoch": 1.2785950960901258, + "grad_norm": 9.53577709197998, + "learning_rate": 3.076395806512522e-06, + "loss": 0.0504, + "num_input_tokens_seen": 18896944, + "step": 9647 + }, + { + "epoch": 1.278727634194831, + "grad_norm": 0.11195813119411469, + "learning_rate": 3.0760580160442387e-06, + "loss": 0.0007, + "num_input_tokens_seen": 18898280, + "step": 9648 + }, + { + "epoch": 1.278860172299536, + "grad_norm": 6.44428825378418, + "learning_rate": 3.07572021446916e-06, + "loss": 0.028, + "num_input_tokens_seen": 18899912, + "step": 9649 + }, + { + "epoch": 1.2789927104042413, + "grad_norm": 11.092840194702148, + "learning_rate": 3.0753824017937984e-06, + "loss": 0.4394, + "num_input_tokens_seen": 18902328, + "step": 9650 + }, + { + "epoch": 1.2791252485089464, + "grad_norm": 6.483431339263916, + "learning_rate": 3.0750445780246664e-06, + "loss": 0.0824, + "num_input_tokens_seen": 18904040, + "step": 9651 + }, + { + "epoch": 1.2792577866136514, + "grad_norm": 11.363965034484863, + "learning_rate": 3.074706743168279e-06, + "loss": 0.2859, + "num_input_tokens_seen": 18906032, + "step": 9652 + }, + { + "epoch": 1.2793903247183565, + "grad_norm": 4.041637897491455, + "learning_rate": 3.074368897231148e-06, + "loss": 0.0186, + "num_input_tokens_seen": 18908048, + "step": 9653 + }, + { + "epoch": 1.2795228628230617, + "grad_norm": 5.013463973999023, + "learning_rate": 3.07403104021979e-06, + "loss": 0.0966, + "num_input_tokens_seen": 18910224, + "step": 9654 + }, + { + "epoch": 1.2796554009277668, + "grad_norm": 7.778846740722656, + "learning_rate": 3.0736931721407164e-06, + "loss": 0.0704, + "num_input_tokens_seen": 18911608, + "step": 9655 + }, + { + "epoch": 1.2797879390324718, + "grad_norm": 11.31035327911377, + "learning_rate": 3.073355293000443e-06, + "loss": 0.1528, + "num_input_tokens_seen": 18913808, + "step": 9656 + }, + { + "epoch": 1.279920477137177, + "grad_norm": 5.761524200439453, + "learning_rate": 3.073017402805485e-06, + "loss": 0.217, + "num_input_tokens_seen": 18915048, + "step": 9657 + }, + { + "epoch": 1.280053015241882, + "grad_norm": 13.522019386291504, + "learning_rate": 3.0726795015623544e-06, + "loss": 0.0805, + "num_input_tokens_seen": 18916408, + "step": 9658 + }, + { + "epoch": 1.2801855533465871, + "grad_norm": 0.06115204840898514, + "learning_rate": 3.0723415892775687e-06, + "loss": 0.0004, + "num_input_tokens_seen": 18917928, + "step": 9659 + }, + { + "epoch": 1.2803180914512922, + "grad_norm": 7.510279178619385, + "learning_rate": 3.0720036659576428e-06, + "loss": 0.2667, + "num_input_tokens_seen": 18919816, + "step": 9660 + }, + { + "epoch": 1.2804506295559974, + "grad_norm": 3.488945484161377, + "learning_rate": 3.0716657316090913e-06, + "loss": 0.0189, + "num_input_tokens_seen": 18922064, + "step": 9661 + }, + { + "epoch": 1.2805831676607025, + "grad_norm": 7.147156715393066, + "learning_rate": 3.07132778623843e-06, + "loss": 0.219, + "num_input_tokens_seen": 18924496, + "step": 9662 + }, + { + "epoch": 1.2807157057654075, + "grad_norm": 0.39212489128112793, + "learning_rate": 3.070989829852175e-06, + "loss": 0.0022, + "num_input_tokens_seen": 18926592, + "step": 9663 + }, + { + "epoch": 1.2808482438701128, + "grad_norm": 0.3600044846534729, + "learning_rate": 3.0706518624568423e-06, + "loss": 0.0024, + "num_input_tokens_seen": 18928384, + "step": 9664 + }, + { + "epoch": 1.2809807819748178, + "grad_norm": 3.4699389934539795, + "learning_rate": 3.0703138840589476e-06, + "loss": 0.0615, + "num_input_tokens_seen": 18929736, + "step": 9665 + }, + { + "epoch": 1.2811133200795228, + "grad_norm": 1.422703742980957, + "learning_rate": 3.0699758946650075e-06, + "loss": 0.0283, + "num_input_tokens_seen": 18931824, + "step": 9666 + }, + { + "epoch": 1.2812458581842279, + "grad_norm": 9.015321731567383, + "learning_rate": 3.0696378942815396e-06, + "loss": 0.2072, + "num_input_tokens_seen": 18934072, + "step": 9667 + }, + { + "epoch": 1.2813783962889331, + "grad_norm": 1.2301030158996582, + "learning_rate": 3.06929988291506e-06, + "loss": 0.0071, + "num_input_tokens_seen": 18936136, + "step": 9668 + }, + { + "epoch": 1.2815109343936382, + "grad_norm": 2.421050548553467, + "learning_rate": 3.068961860572086e-06, + "loss": 0.1029, + "num_input_tokens_seen": 18937984, + "step": 9669 + }, + { + "epoch": 1.2816434724983432, + "grad_norm": 3.3356924057006836, + "learning_rate": 3.0686238272591343e-06, + "loss": 0.103, + "num_input_tokens_seen": 18939936, + "step": 9670 + }, + { + "epoch": 1.2817760106030485, + "grad_norm": 0.303770512342453, + "learning_rate": 3.068285782982724e-06, + "loss": 0.0032, + "num_input_tokens_seen": 18942640, + "step": 9671 + }, + { + "epoch": 1.2819085487077535, + "grad_norm": 0.05068698897957802, + "learning_rate": 3.067947727749371e-06, + "loss": 0.0003, + "num_input_tokens_seen": 18944192, + "step": 9672 + }, + { + "epoch": 1.2820410868124585, + "grad_norm": 1.897047996520996, + "learning_rate": 3.067609661565594e-06, + "loss": 0.014, + "num_input_tokens_seen": 18946448, + "step": 9673 + }, + { + "epoch": 1.2821736249171636, + "grad_norm": 4.392913341522217, + "learning_rate": 3.067271584437911e-06, + "loss": 0.0879, + "num_input_tokens_seen": 18949112, + "step": 9674 + }, + { + "epoch": 1.2823061630218688, + "grad_norm": 11.281965255737305, + "learning_rate": 3.066933496372841e-06, + "loss": 0.1542, + "num_input_tokens_seen": 18951128, + "step": 9675 + }, + { + "epoch": 1.2824387011265739, + "grad_norm": 8.32690715789795, + "learning_rate": 3.0665953973769014e-06, + "loss": 0.1778, + "num_input_tokens_seen": 18953696, + "step": 9676 + }, + { + "epoch": 1.282571239231279, + "grad_norm": 0.17295058071613312, + "learning_rate": 3.066257287456612e-06, + "loss": 0.0012, + "num_input_tokens_seen": 18955752, + "step": 9677 + }, + { + "epoch": 1.2827037773359842, + "grad_norm": 5.499164581298828, + "learning_rate": 3.0659191666184925e-06, + "loss": 0.1118, + "num_input_tokens_seen": 18957744, + "step": 9678 + }, + { + "epoch": 1.2828363154406892, + "grad_norm": 9.637052536010742, + "learning_rate": 3.0655810348690596e-06, + "loss": 0.1749, + "num_input_tokens_seen": 18959680, + "step": 9679 + }, + { + "epoch": 1.2829688535453942, + "grad_norm": 1.190090298652649, + "learning_rate": 3.0652428922148347e-06, + "loss": 0.0044, + "num_input_tokens_seen": 18961704, + "step": 9680 + }, + { + "epoch": 1.2831013916500993, + "grad_norm": 4.269988059997559, + "learning_rate": 3.064904738662337e-06, + "loss": 0.0298, + "num_input_tokens_seen": 18963216, + "step": 9681 + }, + { + "epoch": 1.2832339297548045, + "grad_norm": 14.536900520324707, + "learning_rate": 3.064566574218085e-06, + "loss": 0.2741, + "num_input_tokens_seen": 18965160, + "step": 9682 + }, + { + "epoch": 1.2833664678595096, + "grad_norm": 1.055992603302002, + "learning_rate": 3.0642283988886016e-06, + "loss": 0.0087, + "num_input_tokens_seen": 18966792, + "step": 9683 + }, + { + "epoch": 1.2834990059642148, + "grad_norm": 1.5844638347625732, + "learning_rate": 3.0638902126804055e-06, + "loss": 0.005, + "num_input_tokens_seen": 18968696, + "step": 9684 + }, + { + "epoch": 1.2836315440689199, + "grad_norm": 14.28619384765625, + "learning_rate": 3.063552015600016e-06, + "loss": 0.2204, + "num_input_tokens_seen": 18972160, + "step": 9685 + }, + { + "epoch": 1.283764082173625, + "grad_norm": 5.226593017578125, + "learning_rate": 3.063213807653956e-06, + "loss": 0.0535, + "num_input_tokens_seen": 18974488, + "step": 9686 + }, + { + "epoch": 1.28389662027833, + "grad_norm": 0.11245589703321457, + "learning_rate": 3.0628755888487443e-06, + "loss": 0.0008, + "num_input_tokens_seen": 18975800, + "step": 9687 + }, + { + "epoch": 1.284029158383035, + "grad_norm": 0.37782302498817444, + "learning_rate": 3.062537359190903e-06, + "loss": 0.0022, + "num_input_tokens_seen": 18976928, + "step": 9688 + }, + { + "epoch": 1.2841616964877403, + "grad_norm": 0.10434232652187347, + "learning_rate": 3.062199118686954e-06, + "loss": 0.0007, + "num_input_tokens_seen": 18978832, + "step": 9689 + }, + { + "epoch": 1.2842942345924453, + "grad_norm": 0.05019907280802727, + "learning_rate": 3.0618608673434175e-06, + "loss": 0.0003, + "num_input_tokens_seen": 18980024, + "step": 9690 + }, + { + "epoch": 1.2844267726971506, + "grad_norm": 6.764538288116455, + "learning_rate": 3.0615226051668163e-06, + "loss": 0.054, + "num_input_tokens_seen": 18981576, + "step": 9691 + }, + { + "epoch": 1.2845593108018556, + "grad_norm": 1.1425418853759766, + "learning_rate": 3.0611843321636713e-06, + "loss": 0.0084, + "num_input_tokens_seen": 18983880, + "step": 9692 + }, + { + "epoch": 1.2846918489065606, + "grad_norm": 9.006121635437012, + "learning_rate": 3.060846048340506e-06, + "loss": 0.1607, + "num_input_tokens_seen": 18985688, + "step": 9693 + }, + { + "epoch": 1.2848243870112657, + "grad_norm": 15.304471969604492, + "learning_rate": 3.0605077537038414e-06, + "loss": 0.3334, + "num_input_tokens_seen": 18987224, + "step": 9694 + }, + { + "epoch": 1.2849569251159707, + "grad_norm": 0.15343420207500458, + "learning_rate": 3.0601694482602005e-06, + "loss": 0.0011, + "num_input_tokens_seen": 18989016, + "step": 9695 + }, + { + "epoch": 1.285089463220676, + "grad_norm": 0.07735420763492584, + "learning_rate": 3.0598311320161074e-06, + "loss": 0.0005, + "num_input_tokens_seen": 18991920, + "step": 9696 + }, + { + "epoch": 1.285222001325381, + "grad_norm": 0.11463579535484314, + "learning_rate": 3.059492804978082e-06, + "loss": 0.0007, + "num_input_tokens_seen": 18993224, + "step": 9697 + }, + { + "epoch": 1.2853545394300863, + "grad_norm": 2.7782106399536133, + "learning_rate": 3.0591544671526506e-06, + "loss": 0.0543, + "num_input_tokens_seen": 18995952, + "step": 9698 + }, + { + "epoch": 1.2854870775347913, + "grad_norm": 24.03133201599121, + "learning_rate": 3.058816118546336e-06, + "loss": 0.0932, + "num_input_tokens_seen": 18997688, + "step": 9699 + }, + { + "epoch": 1.2856196156394963, + "grad_norm": 4.620479583740234, + "learning_rate": 3.0584777591656597e-06, + "loss": 0.1028, + "num_input_tokens_seen": 18999640, + "step": 9700 + }, + { + "epoch": 1.2857521537442014, + "grad_norm": 1.369361400604248, + "learning_rate": 3.0581393890171474e-06, + "loss": 0.0095, + "num_input_tokens_seen": 19001096, + "step": 9701 + }, + { + "epoch": 1.2858846918489066, + "grad_norm": 0.0573832243680954, + "learning_rate": 3.0578010081073233e-06, + "loss": 0.0003, + "num_input_tokens_seen": 19002376, + "step": 9702 + }, + { + "epoch": 1.2860172299536117, + "grad_norm": 0.1977907419204712, + "learning_rate": 3.0574626164427097e-06, + "loss": 0.0013, + "num_input_tokens_seen": 19004408, + "step": 9703 + }, + { + "epoch": 1.2861497680583167, + "grad_norm": 0.1418311893939972, + "learning_rate": 3.057124214029833e-06, + "loss": 0.0007, + "num_input_tokens_seen": 19006584, + "step": 9704 + }, + { + "epoch": 1.286282306163022, + "grad_norm": 8.344600677490234, + "learning_rate": 3.056785800875217e-06, + "loss": 0.2985, + "num_input_tokens_seen": 19008944, + "step": 9705 + }, + { + "epoch": 1.286414844267727, + "grad_norm": 9.698260307312012, + "learning_rate": 3.0564473769853864e-06, + "loss": 0.1165, + "num_input_tokens_seen": 19010496, + "step": 9706 + }, + { + "epoch": 1.286547382372432, + "grad_norm": 0.5961040258407593, + "learning_rate": 3.0561089423668672e-06, + "loss": 0.0058, + "num_input_tokens_seen": 19012824, + "step": 9707 + }, + { + "epoch": 1.286679920477137, + "grad_norm": 0.0620257668197155, + "learning_rate": 3.0557704970261835e-06, + "loss": 0.0004, + "num_input_tokens_seen": 19014912, + "step": 9708 + }, + { + "epoch": 1.2868124585818423, + "grad_norm": 6.489939212799072, + "learning_rate": 3.0554320409698614e-06, + "loss": 0.125, + "num_input_tokens_seen": 19017424, + "step": 9709 + }, + { + "epoch": 1.2869449966865474, + "grad_norm": 3.703885078430176, + "learning_rate": 3.055093574204426e-06, + "loss": 0.0591, + "num_input_tokens_seen": 19018928, + "step": 9710 + }, + { + "epoch": 1.2870775347912524, + "grad_norm": 8.148422241210938, + "learning_rate": 3.0547550967364042e-06, + "loss": 0.2213, + "num_input_tokens_seen": 19021288, + "step": 9711 + }, + { + "epoch": 1.2872100728959577, + "grad_norm": 5.139490604400635, + "learning_rate": 3.0544166085723207e-06, + "loss": 0.0953, + "num_input_tokens_seen": 19023688, + "step": 9712 + }, + { + "epoch": 1.2873426110006627, + "grad_norm": 7.265867233276367, + "learning_rate": 3.0540781097187026e-06, + "loss": 0.0757, + "num_input_tokens_seen": 19025512, + "step": 9713 + }, + { + "epoch": 1.2874751491053678, + "grad_norm": 6.572906970977783, + "learning_rate": 3.0537396001820767e-06, + "loss": 0.086, + "num_input_tokens_seen": 19027488, + "step": 9714 + }, + { + "epoch": 1.2876076872100728, + "grad_norm": 3.4964687824249268, + "learning_rate": 3.0534010799689693e-06, + "loss": 0.0346, + "num_input_tokens_seen": 19029256, + "step": 9715 + }, + { + "epoch": 1.287740225314778, + "grad_norm": 8.579503059387207, + "learning_rate": 3.0530625490859074e-06, + "loss": 0.1003, + "num_input_tokens_seen": 19030960, + "step": 9716 + }, + { + "epoch": 1.287872763419483, + "grad_norm": 11.344704627990723, + "learning_rate": 3.0527240075394176e-06, + "loss": 0.3999, + "num_input_tokens_seen": 19033696, + "step": 9717 + }, + { + "epoch": 1.2880053015241881, + "grad_norm": 2.4015750885009766, + "learning_rate": 3.0523854553360273e-06, + "loss": 0.0192, + "num_input_tokens_seen": 19036592, + "step": 9718 + }, + { + "epoch": 1.2881378396288934, + "grad_norm": 0.16041046380996704, + "learning_rate": 3.0520468924822654e-06, + "loss": 0.0008, + "num_input_tokens_seen": 19039136, + "step": 9719 + }, + { + "epoch": 1.2882703777335984, + "grad_norm": 10.441962242126465, + "learning_rate": 3.0517083189846574e-06, + "loss": 0.3448, + "num_input_tokens_seen": 19041216, + "step": 9720 + }, + { + "epoch": 1.2884029158383035, + "grad_norm": 6.029481410980225, + "learning_rate": 3.0513697348497327e-06, + "loss": 0.1117, + "num_input_tokens_seen": 19043088, + "step": 9721 + }, + { + "epoch": 1.2885354539430085, + "grad_norm": 5.383876800537109, + "learning_rate": 3.05103114008402e-06, + "loss": 0.2055, + "num_input_tokens_seen": 19045800, + "step": 9722 + }, + { + "epoch": 1.2886679920477138, + "grad_norm": 2.778184652328491, + "learning_rate": 3.050692534694046e-06, + "loss": 0.0404, + "num_input_tokens_seen": 19048640, + "step": 9723 + }, + { + "epoch": 1.2888005301524188, + "grad_norm": 4.202306747436523, + "learning_rate": 3.0503539186863406e-06, + "loss": 0.0215, + "num_input_tokens_seen": 19049832, + "step": 9724 + }, + { + "epoch": 1.288933068257124, + "grad_norm": 0.6517506241798401, + "learning_rate": 3.050015292067432e-06, + "loss": 0.0041, + "num_input_tokens_seen": 19052056, + "step": 9725 + }, + { + "epoch": 1.289065606361829, + "grad_norm": 5.3916497230529785, + "learning_rate": 3.0496766548438485e-06, + "loss": 0.098, + "num_input_tokens_seen": 19053472, + "step": 9726 + }, + { + "epoch": 1.2891981444665341, + "grad_norm": 11.986101150512695, + "learning_rate": 3.04933800702212e-06, + "loss": 0.2834, + "num_input_tokens_seen": 19055784, + "step": 9727 + }, + { + "epoch": 1.2893306825712392, + "grad_norm": 11.522847175598145, + "learning_rate": 3.0489993486087767e-06, + "loss": 0.3434, + "num_input_tokens_seen": 19058800, + "step": 9728 + }, + { + "epoch": 1.2894632206759442, + "grad_norm": 0.9851899147033691, + "learning_rate": 3.0486606796103464e-06, + "loss": 0.004, + "num_input_tokens_seen": 19060760, + "step": 9729 + }, + { + "epoch": 1.2895957587806495, + "grad_norm": 16.994403839111328, + "learning_rate": 3.048322000033359e-06, + "loss": 0.4069, + "num_input_tokens_seen": 19063320, + "step": 9730 + }, + { + "epoch": 1.2897282968853545, + "grad_norm": 8.685630798339844, + "learning_rate": 3.047983309884346e-06, + "loss": 0.3634, + "num_input_tokens_seen": 19066168, + "step": 9731 + }, + { + "epoch": 1.2898608349900598, + "grad_norm": 6.574770450592041, + "learning_rate": 3.0476446091698362e-06, + "loss": 0.1284, + "num_input_tokens_seen": 19068872, + "step": 9732 + }, + { + "epoch": 1.2899933730947648, + "grad_norm": 8.515045166015625, + "learning_rate": 3.0473058978963617e-06, + "loss": 0.0441, + "num_input_tokens_seen": 19070184, + "step": 9733 + }, + { + "epoch": 1.2901259111994698, + "grad_norm": 0.9776235818862915, + "learning_rate": 3.0469671760704513e-06, + "loss": 0.0093, + "num_input_tokens_seen": 19072912, + "step": 9734 + }, + { + "epoch": 1.2902584493041749, + "grad_norm": 0.057561054825782776, + "learning_rate": 3.046628443698636e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19075448, + "step": 9735 + }, + { + "epoch": 1.29039098740888, + "grad_norm": 1.3620918989181519, + "learning_rate": 3.0462897007874474e-06, + "loss": 0.004, + "num_input_tokens_seen": 19077144, + "step": 9736 + }, + { + "epoch": 1.2905235255135852, + "grad_norm": 0.027444053441286087, + "learning_rate": 3.045950947343417e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19078768, + "step": 9737 + }, + { + "epoch": 1.2906560636182902, + "grad_norm": 0.15424036979675293, + "learning_rate": 3.045612183373075e-06, + "loss": 0.0012, + "num_input_tokens_seen": 19080176, + "step": 9738 + }, + { + "epoch": 1.2907886017229955, + "grad_norm": 1.8967140913009644, + "learning_rate": 3.0452734088829543e-06, + "loss": 0.0221, + "num_input_tokens_seen": 19082416, + "step": 9739 + }, + { + "epoch": 1.2909211398277005, + "grad_norm": 0.4022458791732788, + "learning_rate": 3.0449346238795863e-06, + "loss": 0.001, + "num_input_tokens_seen": 19084432, + "step": 9740 + }, + { + "epoch": 1.2910536779324056, + "grad_norm": 9.513527870178223, + "learning_rate": 3.044595828369502e-06, + "loss": 0.2238, + "num_input_tokens_seen": 19086632, + "step": 9741 + }, + { + "epoch": 1.2911862160371106, + "grad_norm": 0.009346122853457928, + "learning_rate": 3.044257022359235e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19088568, + "step": 9742 + }, + { + "epoch": 1.2913187541418158, + "grad_norm": 8.654743194580078, + "learning_rate": 3.0439182058553174e-06, + "loss": 0.1487, + "num_input_tokens_seen": 19090016, + "step": 9743 + }, + { + "epoch": 1.2914512922465209, + "grad_norm": 0.032451413571834564, + "learning_rate": 3.043579378864281e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19091512, + "step": 9744 + }, + { + "epoch": 1.291583830351226, + "grad_norm": 0.6726590991020203, + "learning_rate": 3.043240541392659e-06, + "loss": 0.0038, + "num_input_tokens_seen": 19092704, + "step": 9745 + }, + { + "epoch": 1.2917163684559312, + "grad_norm": 9.572498321533203, + "learning_rate": 3.042901693446985e-06, + "loss": 0.2451, + "num_input_tokens_seen": 19094112, + "step": 9746 + }, + { + "epoch": 1.2918489065606362, + "grad_norm": 1.6226102113723755, + "learning_rate": 3.042562835033791e-06, + "loss": 0.0285, + "num_input_tokens_seen": 19095568, + "step": 9747 + }, + { + "epoch": 1.2919814446653413, + "grad_norm": 0.0014707483351230621, + "learning_rate": 3.0422239661596125e-06, + "loss": 0.0, + "num_input_tokens_seen": 19096880, + "step": 9748 + }, + { + "epoch": 1.2921139827700463, + "grad_norm": 5.7448410987854, + "learning_rate": 3.041885086830981e-06, + "loss": 0.0808, + "num_input_tokens_seen": 19099416, + "step": 9749 + }, + { + "epoch": 1.2922465208747516, + "grad_norm": 0.007264920510351658, + "learning_rate": 3.041546197054432e-06, + "loss": 0.0, + "num_input_tokens_seen": 19101208, + "step": 9750 + }, + { + "epoch": 1.2923790589794566, + "grad_norm": 8.610037803649902, + "learning_rate": 3.041207296836498e-06, + "loss": 0.1357, + "num_input_tokens_seen": 19102752, + "step": 9751 + }, + { + "epoch": 1.2925115970841616, + "grad_norm": 0.007513558492064476, + "learning_rate": 3.0408683861837145e-06, + "loss": 0.0, + "num_input_tokens_seen": 19104248, + "step": 9752 + }, + { + "epoch": 1.292644135188867, + "grad_norm": 6.709893703460693, + "learning_rate": 3.040529465102615e-06, + "loss": 0.1889, + "num_input_tokens_seen": 19106992, + "step": 9753 + }, + { + "epoch": 1.292776673293572, + "grad_norm": 3.686472177505493, + "learning_rate": 3.0401905335997345e-06, + "loss": 0.0405, + "num_input_tokens_seen": 19109024, + "step": 9754 + }, + { + "epoch": 1.292909211398277, + "grad_norm": 6.466920375823975, + "learning_rate": 3.0398515916816084e-06, + "loss": 0.1192, + "num_input_tokens_seen": 19110824, + "step": 9755 + }, + { + "epoch": 1.293041749502982, + "grad_norm": 0.8524161577224731, + "learning_rate": 3.03951263935477e-06, + "loss": 0.0075, + "num_input_tokens_seen": 19112392, + "step": 9756 + }, + { + "epoch": 1.2931742876076873, + "grad_norm": 7.447122573852539, + "learning_rate": 3.039173676625757e-06, + "loss": 0.1172, + "num_input_tokens_seen": 19114104, + "step": 9757 + }, + { + "epoch": 1.2933068257123923, + "grad_norm": 0.43963733315467834, + "learning_rate": 3.0388347035011032e-06, + "loss": 0.0033, + "num_input_tokens_seen": 19115864, + "step": 9758 + }, + { + "epoch": 1.2934393638170973, + "grad_norm": 11.866196632385254, + "learning_rate": 3.0384957199873444e-06, + "loss": 0.4718, + "num_input_tokens_seen": 19118376, + "step": 9759 + }, + { + "epoch": 1.2935719019218026, + "grad_norm": 8.702754974365234, + "learning_rate": 3.0381567260910166e-06, + "loss": 0.1476, + "num_input_tokens_seen": 19120184, + "step": 9760 + }, + { + "epoch": 1.2937044400265076, + "grad_norm": 0.42992159724235535, + "learning_rate": 3.0378177218186563e-06, + "loss": 0.0016, + "num_input_tokens_seen": 19122200, + "step": 9761 + }, + { + "epoch": 1.2938369781312127, + "grad_norm": 0.4262789487838745, + "learning_rate": 3.037478707176798e-06, + "loss": 0.0051, + "num_input_tokens_seen": 19124112, + "step": 9762 + }, + { + "epoch": 1.2939695162359177, + "grad_norm": 5.587210655212402, + "learning_rate": 3.037139682171981e-06, + "loss": 0.0763, + "num_input_tokens_seen": 19125496, + "step": 9763 + }, + { + "epoch": 1.294102054340623, + "grad_norm": 0.03021675907075405, + "learning_rate": 3.0368006468107404e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19127088, + "step": 9764 + }, + { + "epoch": 1.294234592445328, + "grad_norm": 0.5387074947357178, + "learning_rate": 3.036461601099612e-06, + "loss": 0.0043, + "num_input_tokens_seen": 19128800, + "step": 9765 + }, + { + "epoch": 1.294367130550033, + "grad_norm": 6.314044952392578, + "learning_rate": 3.036122545045134e-06, + "loss": 0.0503, + "num_input_tokens_seen": 19130304, + "step": 9766 + }, + { + "epoch": 1.2944996686547383, + "grad_norm": 0.009839704260230064, + "learning_rate": 3.0357834786538436e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19132304, + "step": 9767 + }, + { + "epoch": 1.2946322067594433, + "grad_norm": 5.681088924407959, + "learning_rate": 3.035444401932278e-06, + "loss": 0.0668, + "num_input_tokens_seen": 19134952, + "step": 9768 + }, + { + "epoch": 1.2947647448641484, + "grad_norm": 8.596113204956055, + "learning_rate": 3.0351053148869756e-06, + "loss": 0.3042, + "num_input_tokens_seen": 19137120, + "step": 9769 + }, + { + "epoch": 1.2948972829688534, + "grad_norm": 5.423805236816406, + "learning_rate": 3.034766217524473e-06, + "loss": 0.0459, + "num_input_tokens_seen": 19139752, + "step": 9770 + }, + { + "epoch": 1.2950298210735587, + "grad_norm": 0.07241814583539963, + "learning_rate": 3.034427109851309e-06, + "loss": 0.0004, + "num_input_tokens_seen": 19141200, + "step": 9771 + }, + { + "epoch": 1.2951623591782637, + "grad_norm": 6.751254081726074, + "learning_rate": 3.034087991874022e-06, + "loss": 0.0858, + "num_input_tokens_seen": 19142808, + "step": 9772 + }, + { + "epoch": 1.295294897282969, + "grad_norm": 0.022137442603707314, + "learning_rate": 3.033748863599149e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19144640, + "step": 9773 + }, + { + "epoch": 1.295427435387674, + "grad_norm": 4.193861961364746, + "learning_rate": 3.0334097250332304e-06, + "loss": 0.0648, + "num_input_tokens_seen": 19145880, + "step": 9774 + }, + { + "epoch": 1.295559973492379, + "grad_norm": 1.506089210510254, + "learning_rate": 3.0330705761828043e-06, + "loss": 0.015, + "num_input_tokens_seen": 19147416, + "step": 9775 + }, + { + "epoch": 1.295692511597084, + "grad_norm": 6.302011489868164, + "learning_rate": 3.0327314170544097e-06, + "loss": 0.1528, + "num_input_tokens_seen": 19149248, + "step": 9776 + }, + { + "epoch": 1.2958250497017891, + "grad_norm": 4.4645891189575195, + "learning_rate": 3.0323922476545857e-06, + "loss": 0.0173, + "num_input_tokens_seen": 19151176, + "step": 9777 + }, + { + "epoch": 1.2959575878064944, + "grad_norm": 4.286782741546631, + "learning_rate": 3.0320530679898726e-06, + "loss": 0.0584, + "num_input_tokens_seen": 19152776, + "step": 9778 + }, + { + "epoch": 1.2960901259111994, + "grad_norm": 0.015966463834047318, + "learning_rate": 3.0317138780668087e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19154480, + "step": 9779 + }, + { + "epoch": 1.2962226640159047, + "grad_norm": 0.7465761303901672, + "learning_rate": 3.031374677891934e-06, + "loss": 0.0067, + "num_input_tokens_seen": 19156432, + "step": 9780 + }, + { + "epoch": 1.2963552021206097, + "grad_norm": 0.001739933155477047, + "learning_rate": 3.0310354674717896e-06, + "loss": 0.0, + "num_input_tokens_seen": 19157576, + "step": 9781 + }, + { + "epoch": 1.2964877402253148, + "grad_norm": 8.151362419128418, + "learning_rate": 3.0306962468129155e-06, + "loss": 0.1103, + "num_input_tokens_seen": 19159896, + "step": 9782 + }, + { + "epoch": 1.2966202783300198, + "grad_norm": 2.6362977027893066, + "learning_rate": 3.0303570159218504e-06, + "loss": 0.0096, + "num_input_tokens_seen": 19161176, + "step": 9783 + }, + { + "epoch": 1.296752816434725, + "grad_norm": 6.76418399810791, + "learning_rate": 3.0300177748051375e-06, + "loss": 0.0477, + "num_input_tokens_seen": 19163296, + "step": 9784 + }, + { + "epoch": 1.29688535453943, + "grad_norm": 10.04101848602295, + "learning_rate": 3.029678523469315e-06, + "loss": 0.0837, + "num_input_tokens_seen": 19165160, + "step": 9785 + }, + { + "epoch": 1.2970178926441351, + "grad_norm": 5.181935787200928, + "learning_rate": 3.0293392619209255e-06, + "loss": 0.1267, + "num_input_tokens_seen": 19166928, + "step": 9786 + }, + { + "epoch": 1.2971504307488404, + "grad_norm": 5.8313517570495605, + "learning_rate": 3.028999990166511e-06, + "loss": 0.1437, + "num_input_tokens_seen": 19169112, + "step": 9787 + }, + { + "epoch": 1.2972829688535454, + "grad_norm": 1.919809103012085, + "learning_rate": 3.02866070821261e-06, + "loss": 0.0592, + "num_input_tokens_seen": 19170792, + "step": 9788 + }, + { + "epoch": 1.2974155069582505, + "grad_norm": 0.005080838687717915, + "learning_rate": 3.0283214160657667e-06, + "loss": 0.0, + "num_input_tokens_seen": 19172944, + "step": 9789 + }, + { + "epoch": 1.2975480450629555, + "grad_norm": 0.003783055581152439, + "learning_rate": 3.027982113732522e-06, + "loss": 0.0, + "num_input_tokens_seen": 19174096, + "step": 9790 + }, + { + "epoch": 1.2976805831676608, + "grad_norm": 0.07809297740459442, + "learning_rate": 3.027642801219418e-06, + "loss": 0.0005, + "num_input_tokens_seen": 19176376, + "step": 9791 + }, + { + "epoch": 1.2978131212723658, + "grad_norm": 10.713796615600586, + "learning_rate": 3.027303478532997e-06, + "loss": 0.2463, + "num_input_tokens_seen": 19177704, + "step": 9792 + }, + { + "epoch": 1.2979456593770708, + "grad_norm": 12.298617362976074, + "learning_rate": 3.0269641456798005e-06, + "loss": 0.2499, + "num_input_tokens_seen": 19179160, + "step": 9793 + }, + { + "epoch": 1.298078197481776, + "grad_norm": 5.390293121337891, + "learning_rate": 3.0266248026663726e-06, + "loss": 0.024, + "num_input_tokens_seen": 19181440, + "step": 9794 + }, + { + "epoch": 1.2982107355864811, + "grad_norm": 0.0240275077521801, + "learning_rate": 3.0262854494992545e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19183184, + "step": 9795 + }, + { + "epoch": 1.2983432736911862, + "grad_norm": 1.6278728246688843, + "learning_rate": 3.0259460861849897e-06, + "loss": 0.0143, + "num_input_tokens_seen": 19184512, + "step": 9796 + }, + { + "epoch": 1.2984758117958912, + "grad_norm": 0.0202358178794384, + "learning_rate": 3.0256067127301226e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19185992, + "step": 9797 + }, + { + "epoch": 1.2986083499005965, + "grad_norm": 19.848304748535156, + "learning_rate": 3.0252673291411945e-06, + "loss": 0.6341, + "num_input_tokens_seen": 19188224, + "step": 9798 + }, + { + "epoch": 1.2987408880053015, + "grad_norm": 8.365300178527832, + "learning_rate": 3.0249279354247496e-06, + "loss": 0.2167, + "num_input_tokens_seen": 19190544, + "step": 9799 + }, + { + "epoch": 1.2988734261100066, + "grad_norm": 9.824725151062012, + "learning_rate": 3.0245885315873325e-06, + "loss": 0.1355, + "num_input_tokens_seen": 19192104, + "step": 9800 + }, + { + "epoch": 1.2990059642147118, + "grad_norm": 0.034354858100414276, + "learning_rate": 3.0242491176354864e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19194248, + "step": 9801 + }, + { + "epoch": 1.2991385023194169, + "grad_norm": 9.92458438873291, + "learning_rate": 3.0239096935757565e-06, + "loss": 0.3343, + "num_input_tokens_seen": 19196240, + "step": 9802 + }, + { + "epoch": 1.299271040424122, + "grad_norm": 14.954906463623047, + "learning_rate": 3.0235702594146853e-06, + "loss": 0.3673, + "num_input_tokens_seen": 19198504, + "step": 9803 + }, + { + "epoch": 1.299403578528827, + "grad_norm": 14.225966453552246, + "learning_rate": 3.0232308151588193e-06, + "loss": 0.1318, + "num_input_tokens_seen": 19200576, + "step": 9804 + }, + { + "epoch": 1.2995361166335322, + "grad_norm": 1.6429986953735352, + "learning_rate": 3.0228913608147016e-06, + "loss": 0.0237, + "num_input_tokens_seen": 19202320, + "step": 9805 + }, + { + "epoch": 1.2996686547382372, + "grad_norm": 9.938350677490234, + "learning_rate": 3.022551896388878e-06, + "loss": 0.142, + "num_input_tokens_seen": 19204224, + "step": 9806 + }, + { + "epoch": 1.2998011928429423, + "grad_norm": 2.1648783683776855, + "learning_rate": 3.022212421887893e-06, + "loss": 0.0405, + "num_input_tokens_seen": 19205920, + "step": 9807 + }, + { + "epoch": 1.2999337309476475, + "grad_norm": 6.9877848625183105, + "learning_rate": 3.0218729373182928e-06, + "loss": 0.0493, + "num_input_tokens_seen": 19207520, + "step": 9808 + }, + { + "epoch": 1.3000662690523526, + "grad_norm": 3.8207414150238037, + "learning_rate": 3.0215334426866216e-06, + "loss": 0.0627, + "num_input_tokens_seen": 19210160, + "step": 9809 + }, + { + "epoch": 1.3001988071570576, + "grad_norm": 4.377755641937256, + "learning_rate": 3.021193937999426e-06, + "loss": 0.081, + "num_input_tokens_seen": 19212376, + "step": 9810 + }, + { + "epoch": 1.3003313452617626, + "grad_norm": 1.4650959968566895, + "learning_rate": 3.020854423263252e-06, + "loss": 0.0067, + "num_input_tokens_seen": 19213720, + "step": 9811 + }, + { + "epoch": 1.300463883366468, + "grad_norm": 0.027493711560964584, + "learning_rate": 3.020514898484645e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19215000, + "step": 9812 + }, + { + "epoch": 1.300596421471173, + "grad_norm": 5.769961357116699, + "learning_rate": 3.0201753636701526e-06, + "loss": 0.1003, + "num_input_tokens_seen": 19217016, + "step": 9813 + }, + { + "epoch": 1.3007289595758782, + "grad_norm": 0.008449739776551723, + "learning_rate": 3.019835818826319e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19218176, + "step": 9814 + }, + { + "epoch": 1.3008614976805832, + "grad_norm": 17.199377059936523, + "learning_rate": 3.019496263959693e-06, + "loss": 0.4646, + "num_input_tokens_seen": 19220096, + "step": 9815 + }, + { + "epoch": 1.3009940357852883, + "grad_norm": 2.3080921173095703, + "learning_rate": 3.0191566990768208e-06, + "loss": 0.0057, + "num_input_tokens_seen": 19221472, + "step": 9816 + }, + { + "epoch": 1.3011265738899933, + "grad_norm": 11.911713600158691, + "learning_rate": 3.018817124184249e-06, + "loss": 0.3617, + "num_input_tokens_seen": 19223864, + "step": 9817 + }, + { + "epoch": 1.3012591119946983, + "grad_norm": 0.09361158311367035, + "learning_rate": 3.018477539288525e-06, + "loss": 0.0006, + "num_input_tokens_seen": 19226104, + "step": 9818 + }, + { + "epoch": 1.3013916500994036, + "grad_norm": 6.93593692779541, + "learning_rate": 3.0181379443961966e-06, + "loss": 0.265, + "num_input_tokens_seen": 19227640, + "step": 9819 + }, + { + "epoch": 1.3015241882041086, + "grad_norm": 17.24040412902832, + "learning_rate": 3.017798339513811e-06, + "loss": 0.4593, + "num_input_tokens_seen": 19229544, + "step": 9820 + }, + { + "epoch": 1.301656726308814, + "grad_norm": 9.467525482177734, + "learning_rate": 3.0174587246479164e-06, + "loss": 0.09, + "num_input_tokens_seen": 19232400, + "step": 9821 + }, + { + "epoch": 1.301789264413519, + "grad_norm": 5.386353969573975, + "learning_rate": 3.01711909980506e-06, + "loss": 0.0509, + "num_input_tokens_seen": 19234456, + "step": 9822 + }, + { + "epoch": 1.301921802518224, + "grad_norm": 6.4422502517700195, + "learning_rate": 3.016779464991792e-06, + "loss": 0.034, + "num_input_tokens_seen": 19236600, + "step": 9823 + }, + { + "epoch": 1.302054340622929, + "grad_norm": 4.175759315490723, + "learning_rate": 3.016439820214658e-06, + "loss": 0.0688, + "num_input_tokens_seen": 19238312, + "step": 9824 + }, + { + "epoch": 1.302186878727634, + "grad_norm": 7.012655258178711, + "learning_rate": 3.016100165480209e-06, + "loss": 0.1636, + "num_input_tokens_seen": 19239888, + "step": 9825 + }, + { + "epoch": 1.3023194168323393, + "grad_norm": 5.514277458190918, + "learning_rate": 3.0157605007949924e-06, + "loss": 0.1272, + "num_input_tokens_seen": 19241992, + "step": 9826 + }, + { + "epoch": 1.3024519549370444, + "grad_norm": 0.4687544107437134, + "learning_rate": 3.015420826165557e-06, + "loss": 0.003, + "num_input_tokens_seen": 19244456, + "step": 9827 + }, + { + "epoch": 1.3025844930417496, + "grad_norm": 0.036308739334344864, + "learning_rate": 3.015081141598453e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19245712, + "step": 9828 + }, + { + "epoch": 1.3027170311464547, + "grad_norm": 2.2799830436706543, + "learning_rate": 3.01474144710023e-06, + "loss": 0.0127, + "num_input_tokens_seen": 19247600, + "step": 9829 + }, + { + "epoch": 1.3028495692511597, + "grad_norm": 9.877415657043457, + "learning_rate": 3.0144017426774354e-06, + "loss": 0.1175, + "num_input_tokens_seen": 19249248, + "step": 9830 + }, + { + "epoch": 1.3029821073558647, + "grad_norm": 7.919954299926758, + "learning_rate": 3.014062028336621e-06, + "loss": 0.1845, + "num_input_tokens_seen": 19251224, + "step": 9831 + }, + { + "epoch": 1.30311464546057, + "grad_norm": 0.11104398220777512, + "learning_rate": 3.013722304084336e-06, + "loss": 0.0006, + "num_input_tokens_seen": 19252952, + "step": 9832 + }, + { + "epoch": 1.303247183565275, + "grad_norm": 9.88637638092041, + "learning_rate": 3.013382569927131e-06, + "loss": 0.1996, + "num_input_tokens_seen": 19255336, + "step": 9833 + }, + { + "epoch": 1.30337972166998, + "grad_norm": 0.5427951812744141, + "learning_rate": 3.0130428258715554e-06, + "loss": 0.0029, + "num_input_tokens_seen": 19257272, + "step": 9834 + }, + { + "epoch": 1.3035122597746853, + "grad_norm": 5.192131042480469, + "learning_rate": 3.01270307192416e-06, + "loss": 0.0446, + "num_input_tokens_seen": 19258992, + "step": 9835 + }, + { + "epoch": 1.3036447978793904, + "grad_norm": 1.0832217931747437, + "learning_rate": 3.012363308091496e-06, + "loss": 0.013, + "num_input_tokens_seen": 19261192, + "step": 9836 + }, + { + "epoch": 1.3037773359840954, + "grad_norm": 6.979450225830078, + "learning_rate": 3.0120235343801134e-06, + "loss": 0.1944, + "num_input_tokens_seen": 19263304, + "step": 9837 + }, + { + "epoch": 1.3039098740888004, + "grad_norm": 10.192964553833008, + "learning_rate": 3.011683750796565e-06, + "loss": 0.0874, + "num_input_tokens_seen": 19265384, + "step": 9838 + }, + { + "epoch": 1.3040424121935057, + "grad_norm": 5.590973854064941, + "learning_rate": 3.0113439573474e-06, + "loss": 0.0855, + "num_input_tokens_seen": 19266952, + "step": 9839 + }, + { + "epoch": 1.3041749502982107, + "grad_norm": 0.09927178174257278, + "learning_rate": 3.011004154039171e-06, + "loss": 0.0007, + "num_input_tokens_seen": 19268936, + "step": 9840 + }, + { + "epoch": 1.3043074884029158, + "grad_norm": 0.2859851121902466, + "learning_rate": 3.0106643408784296e-06, + "loss": 0.0019, + "num_input_tokens_seen": 19270376, + "step": 9841 + }, + { + "epoch": 1.304440026507621, + "grad_norm": 0.03660707548260689, + "learning_rate": 3.010324517871727e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19272264, + "step": 9842 + }, + { + "epoch": 1.304572564612326, + "grad_norm": 5.459381103515625, + "learning_rate": 3.0099846850256158e-06, + "loss": 0.1462, + "num_input_tokens_seen": 19273776, + "step": 9843 + }, + { + "epoch": 1.304705102717031, + "grad_norm": 3.6920478343963623, + "learning_rate": 3.0096448423466484e-06, + "loss": 0.0593, + "num_input_tokens_seen": 19275632, + "step": 9844 + }, + { + "epoch": 1.3048376408217361, + "grad_norm": 6.117527484893799, + "learning_rate": 3.0093049898413764e-06, + "loss": 0.133, + "num_input_tokens_seen": 19277744, + "step": 9845 + }, + { + "epoch": 1.3049701789264414, + "grad_norm": 9.444272994995117, + "learning_rate": 3.0089651275163527e-06, + "loss": 0.2615, + "num_input_tokens_seen": 19280008, + "step": 9846 + }, + { + "epoch": 1.3051027170311464, + "grad_norm": 0.09730510413646698, + "learning_rate": 3.008625255378131e-06, + "loss": 0.0007, + "num_input_tokens_seen": 19283968, + "step": 9847 + }, + { + "epoch": 1.3052352551358515, + "grad_norm": 2.9203476905822754, + "learning_rate": 3.0082853734332623e-06, + "loss": 0.0292, + "num_input_tokens_seen": 19285960, + "step": 9848 + }, + { + "epoch": 1.3053677932405567, + "grad_norm": 8.503900527954102, + "learning_rate": 3.007945481688302e-06, + "loss": 0.0424, + "num_input_tokens_seen": 19287664, + "step": 9849 + }, + { + "epoch": 1.3055003313452618, + "grad_norm": 0.029379185289144516, + "learning_rate": 3.007605580149802e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19288784, + "step": 9850 + }, + { + "epoch": 1.3056328694499668, + "grad_norm": 0.14543849229812622, + "learning_rate": 3.0072656688243163e-06, + "loss": 0.001, + "num_input_tokens_seen": 19290376, + "step": 9851 + }, + { + "epoch": 1.3057654075546719, + "grad_norm": 10.279061317443848, + "learning_rate": 3.006925747718399e-06, + "loss": 0.2745, + "num_input_tokens_seen": 19292744, + "step": 9852 + }, + { + "epoch": 1.3058979456593771, + "grad_norm": 0.08319159597158432, + "learning_rate": 3.0065858168386024e-06, + "loss": 0.0006, + "num_input_tokens_seen": 19294872, + "step": 9853 + }, + { + "epoch": 1.3060304837640822, + "grad_norm": 0.12819544970989227, + "learning_rate": 3.0062458761914825e-06, + "loss": 0.0007, + "num_input_tokens_seen": 19296960, + "step": 9854 + }, + { + "epoch": 1.3061630218687874, + "grad_norm": 11.408480644226074, + "learning_rate": 3.0059059257835932e-06, + "loss": 0.1457, + "num_input_tokens_seen": 19299432, + "step": 9855 + }, + { + "epoch": 1.3062955599734924, + "grad_norm": 0.02459358051419258, + "learning_rate": 3.005565965621488e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19301840, + "step": 9856 + }, + { + "epoch": 1.3064280980781975, + "grad_norm": 1.7044261693954468, + "learning_rate": 3.0052259957117224e-06, + "loss": 0.0068, + "num_input_tokens_seen": 19303912, + "step": 9857 + }, + { + "epoch": 1.3065606361829025, + "grad_norm": 1.4778809547424316, + "learning_rate": 3.004886016060851e-06, + "loss": 0.0051, + "num_input_tokens_seen": 19306296, + "step": 9858 + }, + { + "epoch": 1.3066931742876076, + "grad_norm": 0.26052358746528625, + "learning_rate": 3.0045460266754297e-06, + "loss": 0.0018, + "num_input_tokens_seen": 19308760, + "step": 9859 + }, + { + "epoch": 1.3068257123923128, + "grad_norm": 1.9929078817367554, + "learning_rate": 3.004206027562012e-06, + "loss": 0.0193, + "num_input_tokens_seen": 19310160, + "step": 9860 + }, + { + "epoch": 1.3069582504970179, + "grad_norm": 11.843964576721191, + "learning_rate": 3.0038660187271544e-06, + "loss": 0.194, + "num_input_tokens_seen": 19312264, + "step": 9861 + }, + { + "epoch": 1.3070907886017231, + "grad_norm": 4.158558368682861, + "learning_rate": 3.003526000177413e-06, + "loss": 0.1654, + "num_input_tokens_seen": 19313984, + "step": 9862 + }, + { + "epoch": 1.3072233267064282, + "grad_norm": 8.47726821899414, + "learning_rate": 3.003185971919342e-06, + "loss": 0.0835, + "num_input_tokens_seen": 19316104, + "step": 9863 + }, + { + "epoch": 1.3073558648111332, + "grad_norm": 2.9485225677490234, + "learning_rate": 3.002845933959499e-06, + "loss": 0.0382, + "num_input_tokens_seen": 19318456, + "step": 9864 + }, + { + "epoch": 1.3074884029158382, + "grad_norm": 6.460711479187012, + "learning_rate": 3.0025058863044393e-06, + "loss": 0.1015, + "num_input_tokens_seen": 19320120, + "step": 9865 + }, + { + "epoch": 1.3076209410205433, + "grad_norm": 0.013843146152794361, + "learning_rate": 3.002165828960719e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19321560, + "step": 9866 + }, + { + "epoch": 1.3077534791252485, + "grad_norm": 0.05526039004325867, + "learning_rate": 3.001825761934896e-06, + "loss": 0.0003, + "num_input_tokens_seen": 19323168, + "step": 9867 + }, + { + "epoch": 1.3078860172299536, + "grad_norm": 12.874618530273438, + "learning_rate": 3.0014856852335257e-06, + "loss": 0.295, + "num_input_tokens_seen": 19324872, + "step": 9868 + }, + { + "epoch": 1.3080185553346588, + "grad_norm": 6.637728214263916, + "learning_rate": 3.0011455988631655e-06, + "loss": 0.1222, + "num_input_tokens_seen": 19326584, + "step": 9869 + }, + { + "epoch": 1.3081510934393639, + "grad_norm": 8.515054702758789, + "learning_rate": 3.000805502830373e-06, + "loss": 0.2351, + "num_input_tokens_seen": 19327912, + "step": 9870 + }, + { + "epoch": 1.308283631544069, + "grad_norm": 1.5021626949310303, + "learning_rate": 3.0004653971417036e-06, + "loss": 0.0204, + "num_input_tokens_seen": 19330776, + "step": 9871 + }, + { + "epoch": 1.308416169648774, + "grad_norm": 4.543746471405029, + "learning_rate": 3.000125281803717e-06, + "loss": 0.0919, + "num_input_tokens_seen": 19332592, + "step": 9872 + }, + { + "epoch": 1.3085487077534792, + "grad_norm": 0.030060553923249245, + "learning_rate": 2.9997851568229696e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19334168, + "step": 9873 + }, + { + "epoch": 1.3086812458581842, + "grad_norm": 0.027817172929644585, + "learning_rate": 2.9994450222060197e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19335672, + "step": 9874 + }, + { + "epoch": 1.3088137839628893, + "grad_norm": 7.033565998077393, + "learning_rate": 2.999104877959425e-06, + "loss": 0.0675, + "num_input_tokens_seen": 19338176, + "step": 9875 + }, + { + "epoch": 1.3089463220675945, + "grad_norm": 9.003929138183594, + "learning_rate": 2.998764724089744e-06, + "loss": 0.0813, + "num_input_tokens_seen": 19340744, + "step": 9876 + }, + { + "epoch": 1.3090788601722996, + "grad_norm": 6.206470966339111, + "learning_rate": 2.9984245606035356e-06, + "loss": 0.1508, + "num_input_tokens_seen": 19342720, + "step": 9877 + }, + { + "epoch": 1.3092113982770046, + "grad_norm": 12.467761993408203, + "learning_rate": 2.9980843875073574e-06, + "loss": 0.5515, + "num_input_tokens_seen": 19344968, + "step": 9878 + }, + { + "epoch": 1.3093439363817096, + "grad_norm": 1.3949103355407715, + "learning_rate": 2.9977442048077683e-06, + "loss": 0.0243, + "num_input_tokens_seen": 19346704, + "step": 9879 + }, + { + "epoch": 1.309476474486415, + "grad_norm": 3.690950393676758, + "learning_rate": 2.997404012511328e-06, + "loss": 0.0703, + "num_input_tokens_seen": 19349408, + "step": 9880 + }, + { + "epoch": 1.30960901259112, + "grad_norm": 0.09674520045518875, + "learning_rate": 2.9970638106245948e-06, + "loss": 0.0007, + "num_input_tokens_seen": 19351288, + "step": 9881 + }, + { + "epoch": 1.309741550695825, + "grad_norm": 0.07682705670595169, + "learning_rate": 2.996723599154129e-06, + "loss": 0.0005, + "num_input_tokens_seen": 19352824, + "step": 9882 + }, + { + "epoch": 1.3098740888005302, + "grad_norm": 5.513399124145508, + "learning_rate": 2.996383378106489e-06, + "loss": 0.0777, + "num_input_tokens_seen": 19354712, + "step": 9883 + }, + { + "epoch": 1.3100066269052353, + "grad_norm": 1.9033175706863403, + "learning_rate": 2.996043147488235e-06, + "loss": 0.0151, + "num_input_tokens_seen": 19356320, + "step": 9884 + }, + { + "epoch": 1.3101391650099403, + "grad_norm": 3.78482985496521, + "learning_rate": 2.9957029073059276e-06, + "loss": 0.0409, + "num_input_tokens_seen": 19357976, + "step": 9885 + }, + { + "epoch": 1.3102717031146454, + "grad_norm": 3.719390392303467, + "learning_rate": 2.9953626575661256e-06, + "loss": 0.0718, + "num_input_tokens_seen": 19359320, + "step": 9886 + }, + { + "epoch": 1.3104042412193506, + "grad_norm": 0.04501882568001747, + "learning_rate": 2.99502239827539e-06, + "loss": 0.0003, + "num_input_tokens_seen": 19360400, + "step": 9887 + }, + { + "epoch": 1.3105367793240557, + "grad_norm": 0.3203045427799225, + "learning_rate": 2.994682129440281e-06, + "loss": 0.0022, + "num_input_tokens_seen": 19362040, + "step": 9888 + }, + { + "epoch": 1.3106693174287607, + "grad_norm": 11.09875774383545, + "learning_rate": 2.9943418510673596e-06, + "loss": 0.0881, + "num_input_tokens_seen": 19363776, + "step": 9889 + }, + { + "epoch": 1.310801855533466, + "grad_norm": 6.155569076538086, + "learning_rate": 2.994001563163186e-06, + "loss": 0.1444, + "num_input_tokens_seen": 19366320, + "step": 9890 + }, + { + "epoch": 1.310934393638171, + "grad_norm": 10.54354476928711, + "learning_rate": 2.993661265734322e-06, + "loss": 0.2677, + "num_input_tokens_seen": 19369352, + "step": 9891 + }, + { + "epoch": 1.311066931742876, + "grad_norm": 0.06393119692802429, + "learning_rate": 2.9933209587873262e-06, + "loss": 0.0004, + "num_input_tokens_seen": 19371144, + "step": 9892 + }, + { + "epoch": 1.311199469847581, + "grad_norm": 5.1202497482299805, + "learning_rate": 2.9929806423287642e-06, + "loss": 0.0561, + "num_input_tokens_seen": 19373000, + "step": 9893 + }, + { + "epoch": 1.3113320079522863, + "grad_norm": 6.305868148803711, + "learning_rate": 2.9926403163651945e-06, + "loss": 0.088, + "num_input_tokens_seen": 19375560, + "step": 9894 + }, + { + "epoch": 1.3114645460569914, + "grad_norm": 1.9412078857421875, + "learning_rate": 2.9922999809031794e-06, + "loss": 0.0058, + "num_input_tokens_seen": 19377480, + "step": 9895 + }, + { + "epoch": 1.3115970841616966, + "grad_norm": 0.5885036587715149, + "learning_rate": 2.991959635949282e-06, + "loss": 0.003, + "num_input_tokens_seen": 19378896, + "step": 9896 + }, + { + "epoch": 1.3117296222664017, + "grad_norm": 1.7329483032226562, + "learning_rate": 2.9916192815100627e-06, + "loss": 0.0129, + "num_input_tokens_seen": 19380840, + "step": 9897 + }, + { + "epoch": 1.3118621603711067, + "grad_norm": 4.214051723480225, + "learning_rate": 2.991278917592084e-06, + "loss": 0.0342, + "num_input_tokens_seen": 19382736, + "step": 9898 + }, + { + "epoch": 1.3119946984758117, + "grad_norm": 5.996362686157227, + "learning_rate": 2.9909385442019103e-06, + "loss": 0.0285, + "num_input_tokens_seen": 19384088, + "step": 9899 + }, + { + "epoch": 1.3121272365805168, + "grad_norm": 0.09559997916221619, + "learning_rate": 2.990598161346102e-06, + "loss": 0.0006, + "num_input_tokens_seen": 19385512, + "step": 9900 + }, + { + "epoch": 1.312259774685222, + "grad_norm": 11.439251899719238, + "learning_rate": 2.9902577690312225e-06, + "loss": 0.1754, + "num_input_tokens_seen": 19387464, + "step": 9901 + }, + { + "epoch": 1.312392312789927, + "grad_norm": 2.9319746494293213, + "learning_rate": 2.989917367263835e-06, + "loss": 0.0303, + "num_input_tokens_seen": 19388720, + "step": 9902 + }, + { + "epoch": 1.3125248508946323, + "grad_norm": 0.18789204955101013, + "learning_rate": 2.9895769560505035e-06, + "loss": 0.001, + "num_input_tokens_seen": 19390000, + "step": 9903 + }, + { + "epoch": 1.3126573889993374, + "grad_norm": 0.12220821529626846, + "learning_rate": 2.9892365353977897e-06, + "loss": 0.0008, + "num_input_tokens_seen": 19392088, + "step": 9904 + }, + { + "epoch": 1.3127899271040424, + "grad_norm": 0.9044216275215149, + "learning_rate": 2.988896105312258e-06, + "loss": 0.0071, + "num_input_tokens_seen": 19393688, + "step": 9905 + }, + { + "epoch": 1.3129224652087474, + "grad_norm": 0.021063951775431633, + "learning_rate": 2.988555665800473e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19395160, + "step": 9906 + }, + { + "epoch": 1.3130550033134525, + "grad_norm": 3.0281379222869873, + "learning_rate": 2.988215216868996e-06, + "loss": 0.0902, + "num_input_tokens_seen": 19397376, + "step": 9907 + }, + { + "epoch": 1.3131875414181577, + "grad_norm": 9.167708396911621, + "learning_rate": 2.9878747585243946e-06, + "loss": 0.1647, + "num_input_tokens_seen": 19399832, + "step": 9908 + }, + { + "epoch": 1.3133200795228628, + "grad_norm": 13.564544677734375, + "learning_rate": 2.987534290773231e-06, + "loss": 0.3183, + "num_input_tokens_seen": 19402544, + "step": 9909 + }, + { + "epoch": 1.313452617627568, + "grad_norm": 7.019082546234131, + "learning_rate": 2.9871938136220685e-06, + "loss": 0.0335, + "num_input_tokens_seen": 19404488, + "step": 9910 + }, + { + "epoch": 1.313585155732273, + "grad_norm": 5.595988750457764, + "learning_rate": 2.9868533270774746e-06, + "loss": 0.058, + "num_input_tokens_seen": 19406480, + "step": 9911 + }, + { + "epoch": 1.3137176938369781, + "grad_norm": 4.800780773162842, + "learning_rate": 2.986512831146012e-06, + "loss": 0.065, + "num_input_tokens_seen": 19408312, + "step": 9912 + }, + { + "epoch": 1.3138502319416832, + "grad_norm": 14.699591636657715, + "learning_rate": 2.9861723258342467e-06, + "loss": 0.2819, + "num_input_tokens_seen": 19410192, + "step": 9913 + }, + { + "epoch": 1.3139827700463884, + "grad_norm": 1.0004544258117676, + "learning_rate": 2.985831811148744e-06, + "loss": 0.0099, + "num_input_tokens_seen": 19411896, + "step": 9914 + }, + { + "epoch": 1.3141153081510935, + "grad_norm": 0.03511684015393257, + "learning_rate": 2.985491287096068e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19413936, + "step": 9915 + }, + { + "epoch": 1.3142478462557985, + "grad_norm": 12.71423625946045, + "learning_rate": 2.985150753682785e-06, + "loss": 0.3427, + "num_input_tokens_seen": 19415792, + "step": 9916 + }, + { + "epoch": 1.3143803843605038, + "grad_norm": 7.258702278137207, + "learning_rate": 2.984810210915462e-06, + "loss": 0.1446, + "num_input_tokens_seen": 19418208, + "step": 9917 + }, + { + "epoch": 1.3145129224652088, + "grad_norm": 0.0269358828663826, + "learning_rate": 2.9844696588006628e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19419544, + "step": 9918 + }, + { + "epoch": 1.3146454605699138, + "grad_norm": 6.48532772064209, + "learning_rate": 2.9841290973449537e-06, + "loss": 0.1476, + "num_input_tokens_seen": 19421184, + "step": 9919 + }, + { + "epoch": 1.3147779986746189, + "grad_norm": 0.021829042583703995, + "learning_rate": 2.9837885265549025e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19422496, + "step": 9920 + }, + { + "epoch": 1.3149105367793241, + "grad_norm": 0.016613634303212166, + "learning_rate": 2.9834479464370747e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19423944, + "step": 9921 + }, + { + "epoch": 1.3150430748840292, + "grad_norm": 0.11919800937175751, + "learning_rate": 2.9831073569980368e-06, + "loss": 0.0008, + "num_input_tokens_seen": 19425616, + "step": 9922 + }, + { + "epoch": 1.3151756129887342, + "grad_norm": 0.034875400364398956, + "learning_rate": 2.982766758244356e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19426784, + "step": 9923 + }, + { + "epoch": 1.3153081510934395, + "grad_norm": 7.285836219787598, + "learning_rate": 2.982426150182599e-06, + "loss": 0.1458, + "num_input_tokens_seen": 19428256, + "step": 9924 + }, + { + "epoch": 1.3154406891981445, + "grad_norm": 1.549041748046875, + "learning_rate": 2.9820855328193327e-06, + "loss": 0.0093, + "num_input_tokens_seen": 19429824, + "step": 9925 + }, + { + "epoch": 1.3155732273028495, + "grad_norm": 0.04375510290265083, + "learning_rate": 2.9817449061611254e-06, + "loss": 0.0003, + "num_input_tokens_seen": 19431720, + "step": 9926 + }, + { + "epoch": 1.3157057654075546, + "grad_norm": 11.506336212158203, + "learning_rate": 2.9814042702145437e-06, + "loss": 0.2946, + "num_input_tokens_seen": 19434008, + "step": 9927 + }, + { + "epoch": 1.3158383035122598, + "grad_norm": 15.159743309020996, + "learning_rate": 2.9810636249861546e-06, + "loss": 0.5215, + "num_input_tokens_seen": 19436192, + "step": 9928 + }, + { + "epoch": 1.3159708416169649, + "grad_norm": 2.542710065841675, + "learning_rate": 2.980722970482528e-06, + "loss": 0.0068, + "num_input_tokens_seen": 19438424, + "step": 9929 + }, + { + "epoch": 1.31610337972167, + "grad_norm": 8.891913414001465, + "learning_rate": 2.9803823067102306e-06, + "loss": 0.2336, + "num_input_tokens_seen": 19441464, + "step": 9930 + }, + { + "epoch": 1.3162359178263752, + "grad_norm": 0.11982476711273193, + "learning_rate": 2.980041633675831e-06, + "loss": 0.0008, + "num_input_tokens_seen": 19443784, + "step": 9931 + }, + { + "epoch": 1.3163684559310802, + "grad_norm": 1.6848747730255127, + "learning_rate": 2.979700951385898e-06, + "loss": 0.0147, + "num_input_tokens_seen": 19445264, + "step": 9932 + }, + { + "epoch": 1.3165009940357852, + "grad_norm": 6.685860633850098, + "learning_rate": 2.979360259846999e-06, + "loss": 0.1324, + "num_input_tokens_seen": 19446952, + "step": 9933 + }, + { + "epoch": 1.3166335321404903, + "grad_norm": 0.07136420905590057, + "learning_rate": 2.9790195590657036e-06, + "loss": 0.0005, + "num_input_tokens_seen": 19449432, + "step": 9934 + }, + { + "epoch": 1.3167660702451955, + "grad_norm": 5.44427490234375, + "learning_rate": 2.9786788490485814e-06, + "loss": 0.0791, + "num_input_tokens_seen": 19452088, + "step": 9935 + }, + { + "epoch": 1.3168986083499006, + "grad_norm": 6.908329010009766, + "learning_rate": 2.9783381298021996e-06, + "loss": 0.1786, + "num_input_tokens_seen": 19454096, + "step": 9936 + }, + { + "epoch": 1.3170311464546056, + "grad_norm": 0.05879872292280197, + "learning_rate": 2.977997401333129e-06, + "loss": 0.0004, + "num_input_tokens_seen": 19456008, + "step": 9937 + }, + { + "epoch": 1.3171636845593109, + "grad_norm": 0.12926550209522247, + "learning_rate": 2.9776566636479395e-06, + "loss": 0.0009, + "num_input_tokens_seen": 19457896, + "step": 9938 + }, + { + "epoch": 1.317296222664016, + "grad_norm": 0.025347793474793434, + "learning_rate": 2.9773159167531995e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19459328, + "step": 9939 + }, + { + "epoch": 1.317428760768721, + "grad_norm": 1.2416092157363892, + "learning_rate": 2.976975160655479e-06, + "loss": 0.0073, + "num_input_tokens_seen": 19461448, + "step": 9940 + }, + { + "epoch": 1.317561298873426, + "grad_norm": 0.1330566257238388, + "learning_rate": 2.9766343953613484e-06, + "loss": 0.0007, + "num_input_tokens_seen": 19462632, + "step": 9941 + }, + { + "epoch": 1.3176938369781312, + "grad_norm": 0.4423275589942932, + "learning_rate": 2.976293620877379e-06, + "loss": 0.0036, + "num_input_tokens_seen": 19464312, + "step": 9942 + }, + { + "epoch": 1.3178263750828363, + "grad_norm": 6.362712383270264, + "learning_rate": 2.9759528372101386e-06, + "loss": 0.1667, + "num_input_tokens_seen": 19467368, + "step": 9943 + }, + { + "epoch": 1.3179589131875415, + "grad_norm": 7.883072376251221, + "learning_rate": 2.9756120443661995e-06, + "loss": 0.0455, + "num_input_tokens_seen": 19470088, + "step": 9944 + }, + { + "epoch": 1.3180914512922466, + "grad_norm": 3.5510778427124023, + "learning_rate": 2.9752712423521326e-06, + "loss": 0.0126, + "num_input_tokens_seen": 19471680, + "step": 9945 + }, + { + "epoch": 1.3182239893969516, + "grad_norm": 11.454949378967285, + "learning_rate": 2.9749304311745075e-06, + "loss": 0.1412, + "num_input_tokens_seen": 19473248, + "step": 9946 + }, + { + "epoch": 1.3183565275016567, + "grad_norm": 1.6351819038391113, + "learning_rate": 2.9745896108398968e-06, + "loss": 0.0112, + "num_input_tokens_seen": 19474752, + "step": 9947 + }, + { + "epoch": 1.3184890656063617, + "grad_norm": 0.4207864999771118, + "learning_rate": 2.974248781354871e-06, + "loss": 0.001, + "num_input_tokens_seen": 19476016, + "step": 9948 + }, + { + "epoch": 1.318621603711067, + "grad_norm": 0.05926761403679848, + "learning_rate": 2.973907942726001e-06, + "loss": 0.0004, + "num_input_tokens_seen": 19478592, + "step": 9949 + }, + { + "epoch": 1.318754141815772, + "grad_norm": 6.461844444274902, + "learning_rate": 2.9735670949598595e-06, + "loss": 0.0684, + "num_input_tokens_seen": 19480720, + "step": 9950 + }, + { + "epoch": 1.3188866799204773, + "grad_norm": 8.41862964630127, + "learning_rate": 2.9732262380630174e-06, + "loss": 0.2072, + "num_input_tokens_seen": 19482536, + "step": 9951 + }, + { + "epoch": 1.3190192180251823, + "grad_norm": 11.489011764526367, + "learning_rate": 2.9728853720420465e-06, + "loss": 0.1057, + "num_input_tokens_seen": 19484672, + "step": 9952 + }, + { + "epoch": 1.3191517561298873, + "grad_norm": 6.331583499908447, + "learning_rate": 2.97254449690352e-06, + "loss": 0.0323, + "num_input_tokens_seen": 19486408, + "step": 9953 + }, + { + "epoch": 1.3192842942345924, + "grad_norm": 2.09333872795105, + "learning_rate": 2.9722036126540095e-06, + "loss": 0.0801, + "num_input_tokens_seen": 19488224, + "step": 9954 + }, + { + "epoch": 1.3194168323392976, + "grad_norm": 0.019762258976697922, + "learning_rate": 2.971862719300087e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19490584, + "step": 9955 + }, + { + "epoch": 1.3195493704440027, + "grad_norm": 5.01272439956665, + "learning_rate": 2.9715218168483274e-06, + "loss": 0.066, + "num_input_tokens_seen": 19492744, + "step": 9956 + }, + { + "epoch": 1.3196819085487077, + "grad_norm": 1.7981114387512207, + "learning_rate": 2.9711809053052996e-06, + "loss": 0.0276, + "num_input_tokens_seen": 19494400, + "step": 9957 + }, + { + "epoch": 1.319814446653413, + "grad_norm": 2.380805015563965, + "learning_rate": 2.9708399846775804e-06, + "loss": 0.0113, + "num_input_tokens_seen": 19496576, + "step": 9958 + }, + { + "epoch": 1.319946984758118, + "grad_norm": 0.10329004377126694, + "learning_rate": 2.970499054971741e-06, + "loss": 0.0005, + "num_input_tokens_seen": 19499144, + "step": 9959 + }, + { + "epoch": 1.320079522862823, + "grad_norm": 9.753327369689941, + "learning_rate": 2.9701581161943556e-06, + "loss": 0.0705, + "num_input_tokens_seen": 19501400, + "step": 9960 + }, + { + "epoch": 1.320212060967528, + "grad_norm": 7.294651508331299, + "learning_rate": 2.969817168351997e-06, + "loss": 0.1248, + "num_input_tokens_seen": 19503392, + "step": 9961 + }, + { + "epoch": 1.3203445990722333, + "grad_norm": 0.029961518943309784, + "learning_rate": 2.9694762114512398e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19504720, + "step": 9962 + }, + { + "epoch": 1.3204771371769384, + "grad_norm": 9.028806686401367, + "learning_rate": 2.9691352454986567e-06, + "loss": 0.052, + "num_input_tokens_seen": 19506528, + "step": 9963 + }, + { + "epoch": 1.3206096752816434, + "grad_norm": 5.4462175369262695, + "learning_rate": 2.9687942705008227e-06, + "loss": 0.059, + "num_input_tokens_seen": 19509032, + "step": 9964 + }, + { + "epoch": 1.3207422133863487, + "grad_norm": 1.9304429292678833, + "learning_rate": 2.9684532864643123e-06, + "loss": 0.0123, + "num_input_tokens_seen": 19510368, + "step": 9965 + }, + { + "epoch": 1.3208747514910537, + "grad_norm": 25.729135513305664, + "learning_rate": 2.9681122933956987e-06, + "loss": 0.6048, + "num_input_tokens_seen": 19512232, + "step": 9966 + }, + { + "epoch": 1.3210072895957587, + "grad_norm": 9.176871299743652, + "learning_rate": 2.967771291301557e-06, + "loss": 0.171, + "num_input_tokens_seen": 19514072, + "step": 9967 + }, + { + "epoch": 1.3211398277004638, + "grad_norm": 0.008625632151961327, + "learning_rate": 2.9674302801884626e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19515440, + "step": 9968 + }, + { + "epoch": 1.321272365805169, + "grad_norm": 8.296027183532715, + "learning_rate": 2.9670892600629895e-06, + "loss": 0.203, + "num_input_tokens_seen": 19516960, + "step": 9969 + }, + { + "epoch": 1.321404903909874, + "grad_norm": 0.052140120416879654, + "learning_rate": 2.9667482309317135e-06, + "loss": 0.0004, + "num_input_tokens_seen": 19519280, + "step": 9970 + }, + { + "epoch": 1.3215374420145791, + "grad_norm": 0.05078285187482834, + "learning_rate": 2.96640719280121e-06, + "loss": 0.0003, + "num_input_tokens_seen": 19521144, + "step": 9971 + }, + { + "epoch": 1.3216699801192844, + "grad_norm": 5.552460193634033, + "learning_rate": 2.966066145678053e-06, + "loss": 0.1435, + "num_input_tokens_seen": 19524608, + "step": 9972 + }, + { + "epoch": 1.3218025182239894, + "grad_norm": 6.086935043334961, + "learning_rate": 2.9657250895688204e-06, + "loss": 0.2532, + "num_input_tokens_seen": 19526312, + "step": 9973 + }, + { + "epoch": 1.3219350563286945, + "grad_norm": 0.019320817664265633, + "learning_rate": 2.9653840244800865e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19527504, + "step": 9974 + }, + { + "epoch": 1.3220675944333995, + "grad_norm": 0.07383199781179428, + "learning_rate": 2.9650429504184264e-06, + "loss": 0.0004, + "num_input_tokens_seen": 19528744, + "step": 9975 + }, + { + "epoch": 1.3222001325381048, + "grad_norm": 6.378537178039551, + "learning_rate": 2.964701867390419e-06, + "loss": 0.1783, + "num_input_tokens_seen": 19531016, + "step": 9976 + }, + { + "epoch": 1.3223326706428098, + "grad_norm": 5.286518573760986, + "learning_rate": 2.964360775402638e-06, + "loss": 0.0663, + "num_input_tokens_seen": 19533568, + "step": 9977 + }, + { + "epoch": 1.3224652087475148, + "grad_norm": 0.11155138164758682, + "learning_rate": 2.9640196744616612e-06, + "loss": 0.0006, + "num_input_tokens_seen": 19535016, + "step": 9978 + }, + { + "epoch": 1.32259774685222, + "grad_norm": 0.047535307705402374, + "learning_rate": 2.963678564574065e-06, + "loss": 0.0003, + "num_input_tokens_seen": 19536744, + "step": 9979 + }, + { + "epoch": 1.3227302849569251, + "grad_norm": 3.061805248260498, + "learning_rate": 2.9633374457464265e-06, + "loss": 0.0196, + "num_input_tokens_seen": 19538512, + "step": 9980 + }, + { + "epoch": 1.3228628230616302, + "grad_norm": 6.893800258636475, + "learning_rate": 2.9629963179853216e-06, + "loss": 0.2231, + "num_input_tokens_seen": 19540352, + "step": 9981 + }, + { + "epoch": 1.3229953611663352, + "grad_norm": 8.546428680419922, + "learning_rate": 2.962655181297329e-06, + "loss": 0.2148, + "num_input_tokens_seen": 19542488, + "step": 9982 + }, + { + "epoch": 1.3231278992710405, + "grad_norm": 8.707723617553711, + "learning_rate": 2.9623140356890255e-06, + "loss": 0.1585, + "num_input_tokens_seen": 19544824, + "step": 9983 + }, + { + "epoch": 1.3232604373757455, + "grad_norm": 6.410830020904541, + "learning_rate": 2.961972881166988e-06, + "loss": 0.0704, + "num_input_tokens_seen": 19546696, + "step": 9984 + }, + { + "epoch": 1.3233929754804508, + "grad_norm": 9.56839370727539, + "learning_rate": 2.9616317177377944e-06, + "loss": 0.1832, + "num_input_tokens_seen": 19548200, + "step": 9985 + }, + { + "epoch": 1.3235255135851558, + "grad_norm": 2.904906988143921, + "learning_rate": 2.9612905454080232e-06, + "loss": 0.0503, + "num_input_tokens_seen": 19551312, + "step": 9986 + }, + { + "epoch": 1.3236580516898608, + "grad_norm": 3.848745346069336, + "learning_rate": 2.9609493641842523e-06, + "loss": 0.0151, + "num_input_tokens_seen": 19554264, + "step": 9987 + }, + { + "epoch": 1.3237905897945659, + "grad_norm": 1.7232786417007446, + "learning_rate": 2.960608174073059e-06, + "loss": 0.0114, + "num_input_tokens_seen": 19555504, + "step": 9988 + }, + { + "epoch": 1.323923127899271, + "grad_norm": 2.6616146564483643, + "learning_rate": 2.960266975081023e-06, + "loss": 0.0107, + "num_input_tokens_seen": 19556912, + "step": 9989 + }, + { + "epoch": 1.3240556660039762, + "grad_norm": 8.854670524597168, + "learning_rate": 2.9599257672147214e-06, + "loss": 0.1191, + "num_input_tokens_seen": 19558672, + "step": 9990 + }, + { + "epoch": 1.3241882041086812, + "grad_norm": 5.425792217254639, + "learning_rate": 2.9595845504807345e-06, + "loss": 0.0423, + "num_input_tokens_seen": 19561672, + "step": 9991 + }, + { + "epoch": 1.3243207422133865, + "grad_norm": 7.549142837524414, + "learning_rate": 2.9592433248856407e-06, + "loss": 0.1785, + "num_input_tokens_seen": 19563048, + "step": 9992 + }, + { + "epoch": 1.3244532803180915, + "grad_norm": 1.9474326372146606, + "learning_rate": 2.9589020904360177e-06, + "loss": 0.0072, + "num_input_tokens_seen": 19565184, + "step": 9993 + }, + { + "epoch": 1.3245858184227965, + "grad_norm": 3.898163318634033, + "learning_rate": 2.9585608471384466e-06, + "loss": 0.0404, + "num_input_tokens_seen": 19567120, + "step": 9994 + }, + { + "epoch": 1.3247183565275016, + "grad_norm": 0.66218501329422, + "learning_rate": 2.9582195949995057e-06, + "loss": 0.0046, + "num_input_tokens_seen": 19568544, + "step": 9995 + }, + { + "epoch": 1.3248508946322066, + "grad_norm": 4.629344463348389, + "learning_rate": 2.9578783340257744e-06, + "loss": 0.0794, + "num_input_tokens_seen": 19570368, + "step": 9996 + }, + { + "epoch": 1.3249834327369119, + "grad_norm": 2.912808656692505, + "learning_rate": 2.9575370642238342e-06, + "loss": 0.0565, + "num_input_tokens_seen": 19572608, + "step": 9997 + }, + { + "epoch": 1.325115970841617, + "grad_norm": 4.5502824783325195, + "learning_rate": 2.957195785600263e-06, + "loss": 0.1112, + "num_input_tokens_seen": 19575184, + "step": 9998 + }, + { + "epoch": 1.3252485089463222, + "grad_norm": 13.879768371582031, + "learning_rate": 2.9568544981616413e-06, + "loss": 0.2624, + "num_input_tokens_seen": 19577400, + "step": 9999 + }, + { + "epoch": 1.3253810470510272, + "grad_norm": 4.059289932250977, + "learning_rate": 2.9565132019145503e-06, + "loss": 0.0468, + "num_input_tokens_seen": 19579072, + "step": 10000 + }, + { + "epoch": 1.3255135851557323, + "grad_norm": 8.60438060760498, + "learning_rate": 2.95617189686557e-06, + "loss": 0.1337, + "num_input_tokens_seen": 19580984, + "step": 10001 + }, + { + "epoch": 1.3256461232604373, + "grad_norm": 6.7872724533081055, + "learning_rate": 2.9558305830212802e-06, + "loss": 0.0645, + "num_input_tokens_seen": 19582288, + "step": 10002 + }, + { + "epoch": 1.3257786613651426, + "grad_norm": 1.1699812412261963, + "learning_rate": 2.9554892603882625e-06, + "loss": 0.0045, + "num_input_tokens_seen": 19583800, + "step": 10003 + }, + { + "epoch": 1.3259111994698476, + "grad_norm": 0.13351790606975555, + "learning_rate": 2.955147928973098e-06, + "loss": 0.0009, + "num_input_tokens_seen": 19585488, + "step": 10004 + }, + { + "epoch": 1.3260437375745526, + "grad_norm": 0.215609610080719, + "learning_rate": 2.9548065887823673e-06, + "loss": 0.0015, + "num_input_tokens_seen": 19588976, + "step": 10005 + }, + { + "epoch": 1.3261762756792579, + "grad_norm": 3.2422540187835693, + "learning_rate": 2.9544652398226516e-06, + "loss": 0.0325, + "num_input_tokens_seen": 19590616, + "step": 10006 + }, + { + "epoch": 1.326308813783963, + "grad_norm": 12.906964302062988, + "learning_rate": 2.9541238821005325e-06, + "loss": 0.1524, + "num_input_tokens_seen": 19593336, + "step": 10007 + }, + { + "epoch": 1.326441351888668, + "grad_norm": 0.3269090950489044, + "learning_rate": 2.9537825156225914e-06, + "loss": 0.0022, + "num_input_tokens_seen": 19595352, + "step": 10008 + }, + { + "epoch": 1.326573889993373, + "grad_norm": 13.211973190307617, + "learning_rate": 2.953441140395411e-06, + "loss": 0.2347, + "num_input_tokens_seen": 19597824, + "step": 10009 + }, + { + "epoch": 1.3267064280980783, + "grad_norm": 4.517259120941162, + "learning_rate": 2.9530997564255728e-06, + "loss": 0.1121, + "num_input_tokens_seen": 19599496, + "step": 10010 + }, + { + "epoch": 1.3268389662027833, + "grad_norm": 1.8769259452819824, + "learning_rate": 2.9527583637196576e-06, + "loss": 0.0478, + "num_input_tokens_seen": 19601192, + "step": 10011 + }, + { + "epoch": 1.3269715043074883, + "grad_norm": 0.07857752591371536, + "learning_rate": 2.9524169622842497e-06, + "loss": 0.0005, + "num_input_tokens_seen": 19602712, + "step": 10012 + }, + { + "epoch": 1.3271040424121936, + "grad_norm": 1.4949793815612793, + "learning_rate": 2.9520755521259304e-06, + "loss": 0.0131, + "num_input_tokens_seen": 19604032, + "step": 10013 + }, + { + "epoch": 1.3272365805168986, + "grad_norm": 0.6585063934326172, + "learning_rate": 2.9517341332512826e-06, + "loss": 0.0052, + "num_input_tokens_seen": 19605480, + "step": 10014 + }, + { + "epoch": 1.3273691186216037, + "grad_norm": 23.149818420410156, + "learning_rate": 2.9513927056668897e-06, + "loss": 0.038, + "num_input_tokens_seen": 19606848, + "step": 10015 + }, + { + "epoch": 1.3275016567263087, + "grad_norm": 3.8701798915863037, + "learning_rate": 2.9510512693793333e-06, + "loss": 0.1129, + "num_input_tokens_seen": 19608456, + "step": 10016 + }, + { + "epoch": 1.327634194831014, + "grad_norm": 0.030370693653821945, + "learning_rate": 2.9507098243951972e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19609640, + "step": 10017 + }, + { + "epoch": 1.327766732935719, + "grad_norm": 4.822741508483887, + "learning_rate": 2.950368370721065e-06, + "loss": 0.1762, + "num_input_tokens_seen": 19612680, + "step": 10018 + }, + { + "epoch": 1.327899271040424, + "grad_norm": 6.039475440979004, + "learning_rate": 2.9500269083635198e-06, + "loss": 0.084, + "num_input_tokens_seen": 19614168, + "step": 10019 + }, + { + "epoch": 1.3280318091451293, + "grad_norm": 16.466712951660156, + "learning_rate": 2.949685437329145e-06, + "loss": 0.2928, + "num_input_tokens_seen": 19616664, + "step": 10020 + }, + { + "epoch": 1.3281643472498343, + "grad_norm": 10.803589820861816, + "learning_rate": 2.9493439576245254e-06, + "loss": 0.3449, + "num_input_tokens_seen": 19618904, + "step": 10021 + }, + { + "epoch": 1.3282968853545394, + "grad_norm": 0.15575425326824188, + "learning_rate": 2.949002469256245e-06, + "loss": 0.001, + "num_input_tokens_seen": 19621232, + "step": 10022 + }, + { + "epoch": 1.3284294234592444, + "grad_norm": 0.31767404079437256, + "learning_rate": 2.9486609722308856e-06, + "loss": 0.0015, + "num_input_tokens_seen": 19623088, + "step": 10023 + }, + { + "epoch": 1.3285619615639497, + "grad_norm": 13.803759574890137, + "learning_rate": 2.9483194665550335e-06, + "loss": 0.2117, + "num_input_tokens_seen": 19624720, + "step": 10024 + }, + { + "epoch": 1.3286944996686547, + "grad_norm": 0.6023021340370178, + "learning_rate": 2.947977952235273e-06, + "loss": 0.0027, + "num_input_tokens_seen": 19626184, + "step": 10025 + }, + { + "epoch": 1.32882703777336, + "grad_norm": 7.839282989501953, + "learning_rate": 2.9476364292781888e-06, + "loss": 0.2793, + "num_input_tokens_seen": 19628048, + "step": 10026 + }, + { + "epoch": 1.328959575878065, + "grad_norm": 4.595125675201416, + "learning_rate": 2.947294897690366e-06, + "loss": 0.0944, + "num_input_tokens_seen": 19630448, + "step": 10027 + }, + { + "epoch": 1.32909211398277, + "grad_norm": 5.367440700531006, + "learning_rate": 2.9469533574783877e-06, + "loss": 0.0545, + "num_input_tokens_seen": 19632424, + "step": 10028 + }, + { + "epoch": 1.329224652087475, + "grad_norm": 0.023499052971601486, + "learning_rate": 2.946611808648841e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19634000, + "step": 10029 + }, + { + "epoch": 1.3293571901921801, + "grad_norm": 2.7969186305999756, + "learning_rate": 2.9462702512083106e-06, + "loss": 0.0154, + "num_input_tokens_seen": 19635832, + "step": 10030 + }, + { + "epoch": 1.3294897282968854, + "grad_norm": 8.23483657836914, + "learning_rate": 2.945928685163382e-06, + "loss": 0.3252, + "num_input_tokens_seen": 19637472, + "step": 10031 + }, + { + "epoch": 1.3296222664015904, + "grad_norm": 0.21697549521923065, + "learning_rate": 2.94558711052064e-06, + "loss": 0.0011, + "num_input_tokens_seen": 19639920, + "step": 10032 + }, + { + "epoch": 1.3297548045062957, + "grad_norm": 7.126245021820068, + "learning_rate": 2.945245527286672e-06, + "loss": 0.1254, + "num_input_tokens_seen": 19641840, + "step": 10033 + }, + { + "epoch": 1.3298873426110007, + "grad_norm": 0.010683054104447365, + "learning_rate": 2.944903935468063e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19642976, + "step": 10034 + }, + { + "epoch": 1.3300198807157058, + "grad_norm": 11.981829643249512, + "learning_rate": 2.9445623350713985e-06, + "loss": 0.3104, + "num_input_tokens_seen": 19645552, + "step": 10035 + }, + { + "epoch": 1.3301524188204108, + "grad_norm": 3.877479314804077, + "learning_rate": 2.944220726103266e-06, + "loss": 0.0681, + "num_input_tokens_seen": 19648080, + "step": 10036 + }, + { + "epoch": 1.3302849569251158, + "grad_norm": 0.010967373847961426, + "learning_rate": 2.9438791085702513e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19649352, + "step": 10037 + }, + { + "epoch": 1.330417495029821, + "grad_norm": 19.11020278930664, + "learning_rate": 2.9435374824789413e-06, + "loss": 0.4749, + "num_input_tokens_seen": 19651456, + "step": 10038 + }, + { + "epoch": 1.3305500331345261, + "grad_norm": 0.005261686630547047, + "learning_rate": 2.9431958478359223e-06, + "loss": 0.0, + "num_input_tokens_seen": 19652432, + "step": 10039 + }, + { + "epoch": 1.3306825712392314, + "grad_norm": 6.606871128082275, + "learning_rate": 2.942854204647782e-06, + "loss": 0.0718, + "num_input_tokens_seen": 19654728, + "step": 10040 + }, + { + "epoch": 1.3308151093439364, + "grad_norm": 1.32657790184021, + "learning_rate": 2.9425125529211072e-06, + "loss": 0.0251, + "num_input_tokens_seen": 19657288, + "step": 10041 + }, + { + "epoch": 1.3309476474486415, + "grad_norm": 0.0718064159154892, + "learning_rate": 2.9421708926624846e-06, + "loss": 0.0004, + "num_input_tokens_seen": 19659704, + "step": 10042 + }, + { + "epoch": 1.3310801855533465, + "grad_norm": 0.0737922340631485, + "learning_rate": 2.9418292238785025e-06, + "loss": 0.0005, + "num_input_tokens_seen": 19661416, + "step": 10043 + }, + { + "epoch": 1.3312127236580518, + "grad_norm": 0.05091865360736847, + "learning_rate": 2.9414875465757487e-06, + "loss": 0.0003, + "num_input_tokens_seen": 19663768, + "step": 10044 + }, + { + "epoch": 1.3313452617627568, + "grad_norm": 10.098896026611328, + "learning_rate": 2.9411458607608097e-06, + "loss": 0.1686, + "num_input_tokens_seen": 19665296, + "step": 10045 + }, + { + "epoch": 1.3314777998674618, + "grad_norm": 0.037031542509794235, + "learning_rate": 2.940804166440274e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19666520, + "step": 10046 + }, + { + "epoch": 1.331610337972167, + "grad_norm": 5.026845932006836, + "learning_rate": 2.9404624636207302e-06, + "loss": 0.0582, + "num_input_tokens_seen": 19668176, + "step": 10047 + }, + { + "epoch": 1.3317428760768721, + "grad_norm": 6.104671001434326, + "learning_rate": 2.9401207523087667e-06, + "loss": 0.1335, + "num_input_tokens_seen": 19670512, + "step": 10048 + }, + { + "epoch": 1.3318754141815772, + "grad_norm": 5.615001678466797, + "learning_rate": 2.9397790325109705e-06, + "loss": 0.0249, + "num_input_tokens_seen": 19672744, + "step": 10049 + }, + { + "epoch": 1.3320079522862822, + "grad_norm": 5.745657920837402, + "learning_rate": 2.939437304233932e-06, + "loss": 0.0731, + "num_input_tokens_seen": 19674976, + "step": 10050 + }, + { + "epoch": 1.3321404903909875, + "grad_norm": 0.6104015707969666, + "learning_rate": 2.9390955674842386e-06, + "loss": 0.0015, + "num_input_tokens_seen": 19677152, + "step": 10051 + }, + { + "epoch": 1.3322730284956925, + "grad_norm": 0.02321402169764042, + "learning_rate": 2.93875382226848e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19679856, + "step": 10052 + }, + { + "epoch": 1.3324055666003976, + "grad_norm": 2.957949638366699, + "learning_rate": 2.9384120685932448e-06, + "loss": 0.0299, + "num_input_tokens_seen": 19681232, + "step": 10053 + }, + { + "epoch": 1.3325381047051028, + "grad_norm": 9.981496810913086, + "learning_rate": 2.9380703064651227e-06, + "loss": 0.1688, + "num_input_tokens_seen": 19683352, + "step": 10054 + }, + { + "epoch": 1.3326706428098078, + "grad_norm": 10.203073501586914, + "learning_rate": 2.9377285358907027e-06, + "loss": 0.3986, + "num_input_tokens_seen": 19684952, + "step": 10055 + }, + { + "epoch": 1.3328031809145129, + "grad_norm": 0.004375721327960491, + "learning_rate": 2.9373867568765748e-06, + "loss": 0.0, + "num_input_tokens_seen": 19686272, + "step": 10056 + }, + { + "epoch": 1.332935719019218, + "grad_norm": 1.7350661754608154, + "learning_rate": 2.937044969429329e-06, + "loss": 0.019, + "num_input_tokens_seen": 19688888, + "step": 10057 + }, + { + "epoch": 1.3330682571239232, + "grad_norm": 0.1596454530954361, + "learning_rate": 2.936703173555553e-06, + "loss": 0.0007, + "num_input_tokens_seen": 19691320, + "step": 10058 + }, + { + "epoch": 1.3332007952286282, + "grad_norm": 9.800030708312988, + "learning_rate": 2.9363613692618402e-06, + "loss": 0.116, + "num_input_tokens_seen": 19693000, + "step": 10059 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 11.671333312988281, + "learning_rate": 2.9360195565547783e-06, + "loss": 0.3986, + "num_input_tokens_seen": 19695560, + "step": 10060 + }, + { + "epoch": 1.3334658714380385, + "grad_norm": 8.852104187011719, + "learning_rate": 2.935677735440959e-06, + "loss": 0.2575, + "num_input_tokens_seen": 19697088, + "step": 10061 + }, + { + "epoch": 1.3335984095427436, + "grad_norm": 7.2411909103393555, + "learning_rate": 2.9353359059269727e-06, + "loss": 0.152, + "num_input_tokens_seen": 19699480, + "step": 10062 + }, + { + "epoch": 1.3337309476474486, + "grad_norm": 8.616983413696289, + "learning_rate": 2.934994068019409e-06, + "loss": 0.1954, + "num_input_tokens_seen": 19701256, + "step": 10063 + }, + { + "epoch": 1.3338634857521536, + "grad_norm": 3.4285824298858643, + "learning_rate": 2.9346522217248597e-06, + "loss": 0.0588, + "num_input_tokens_seen": 19703384, + "step": 10064 + }, + { + "epoch": 1.333996023856859, + "grad_norm": 9.62035846710205, + "learning_rate": 2.9343103670499157e-06, + "loss": 0.0674, + "num_input_tokens_seen": 19705128, + "step": 10065 + }, + { + "epoch": 1.334128561961564, + "grad_norm": 0.5120173692703247, + "learning_rate": 2.9339685040011688e-06, + "loss": 0.004, + "num_input_tokens_seen": 19707016, + "step": 10066 + }, + { + "epoch": 1.3342611000662692, + "grad_norm": 0.04398007690906525, + "learning_rate": 2.9336266325852093e-06, + "loss": 0.0003, + "num_input_tokens_seen": 19708488, + "step": 10067 + }, + { + "epoch": 1.3343936381709742, + "grad_norm": 6.221183776855469, + "learning_rate": 2.9332847528086294e-06, + "loss": 0.0852, + "num_input_tokens_seen": 19710408, + "step": 10068 + }, + { + "epoch": 1.3345261762756793, + "grad_norm": 4.612703323364258, + "learning_rate": 2.9329428646780206e-06, + "loss": 0.1367, + "num_input_tokens_seen": 19712712, + "step": 10069 + }, + { + "epoch": 1.3346587143803843, + "grad_norm": 1.6482594013214111, + "learning_rate": 2.9326009681999745e-06, + "loss": 0.0205, + "num_input_tokens_seen": 19714440, + "step": 10070 + }, + { + "epoch": 1.3347912524850893, + "grad_norm": 11.199882507324219, + "learning_rate": 2.932259063381083e-06, + "loss": 0.3282, + "num_input_tokens_seen": 19717528, + "step": 10071 + }, + { + "epoch": 1.3349237905897946, + "grad_norm": 0.02905864641070366, + "learning_rate": 2.931917150227939e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19719768, + "step": 10072 + }, + { + "epoch": 1.3350563286944996, + "grad_norm": 0.11134571582078934, + "learning_rate": 2.931575228747134e-06, + "loss": 0.0005, + "num_input_tokens_seen": 19721448, + "step": 10073 + }, + { + "epoch": 1.335188866799205, + "grad_norm": 8.956958770751953, + "learning_rate": 2.9312332989452615e-06, + "loss": 0.2012, + "num_input_tokens_seen": 19723656, + "step": 10074 + }, + { + "epoch": 1.33532140490391, + "grad_norm": 3.762080430984497, + "learning_rate": 2.930891360828913e-06, + "loss": 0.0362, + "num_input_tokens_seen": 19725224, + "step": 10075 + }, + { + "epoch": 1.335453943008615, + "grad_norm": 6.9835591316223145, + "learning_rate": 2.930549414404682e-06, + "loss": 0.1084, + "num_input_tokens_seen": 19726848, + "step": 10076 + }, + { + "epoch": 1.33558648111332, + "grad_norm": 5.485349178314209, + "learning_rate": 2.930207459679161e-06, + "loss": 0.0829, + "num_input_tokens_seen": 19729104, + "step": 10077 + }, + { + "epoch": 1.335719019218025, + "grad_norm": 0.029335543513298035, + "learning_rate": 2.929865496658944e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19731088, + "step": 10078 + }, + { + "epoch": 1.3358515573227303, + "grad_norm": 10.912320137023926, + "learning_rate": 2.929523525350623e-06, + "loss": 0.2798, + "num_input_tokens_seen": 19732920, + "step": 10079 + }, + { + "epoch": 1.3359840954274353, + "grad_norm": 4.1649699211120605, + "learning_rate": 2.9291815457607933e-06, + "loss": 0.0912, + "num_input_tokens_seen": 19734616, + "step": 10080 + }, + { + "epoch": 1.3361166335321406, + "grad_norm": 1.1570743322372437, + "learning_rate": 2.928839557896046e-06, + "loss": 0.0159, + "num_input_tokens_seen": 19736520, + "step": 10081 + }, + { + "epoch": 1.3362491716368456, + "grad_norm": 0.035161860287189484, + "learning_rate": 2.9284975617629763e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19738176, + "step": 10082 + }, + { + "epoch": 1.3363817097415507, + "grad_norm": 0.44542983174324036, + "learning_rate": 2.928155557368178e-06, + "loss": 0.0038, + "num_input_tokens_seen": 19740224, + "step": 10083 + }, + { + "epoch": 1.3365142478462557, + "grad_norm": 0.43377885222435, + "learning_rate": 2.927813544718246e-06, + "loss": 0.0016, + "num_input_tokens_seen": 19743000, + "step": 10084 + }, + { + "epoch": 1.336646785950961, + "grad_norm": 5.311315059661865, + "learning_rate": 2.9274715238197734e-06, + "loss": 0.0391, + "num_input_tokens_seen": 19744536, + "step": 10085 + }, + { + "epoch": 1.336779324055666, + "grad_norm": 8.02383041381836, + "learning_rate": 2.9271294946793542e-06, + "loss": 0.1608, + "num_input_tokens_seen": 19745496, + "step": 10086 + }, + { + "epoch": 1.336911862160371, + "grad_norm": 0.14343765377998352, + "learning_rate": 2.926787457303585e-06, + "loss": 0.0015, + "num_input_tokens_seen": 19747128, + "step": 10087 + }, + { + "epoch": 1.3370444002650763, + "grad_norm": 13.380282402038574, + "learning_rate": 2.926445411699058e-06, + "loss": 0.1181, + "num_input_tokens_seen": 19748472, + "step": 10088 + }, + { + "epoch": 1.3371769383697814, + "grad_norm": 0.01169082336127758, + "learning_rate": 2.92610335787237e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19751384, + "step": 10089 + }, + { + "epoch": 1.3373094764744864, + "grad_norm": 10.448746681213379, + "learning_rate": 2.9257612958301146e-06, + "loss": 0.2239, + "num_input_tokens_seen": 19755344, + "step": 10090 + }, + { + "epoch": 1.3374420145791914, + "grad_norm": 4.047427177429199, + "learning_rate": 2.925419225578888e-06, + "loss": 0.0438, + "num_input_tokens_seen": 19757376, + "step": 10091 + }, + { + "epoch": 1.3375745526838967, + "grad_norm": 7.419445037841797, + "learning_rate": 2.9250771471252854e-06, + "loss": 0.1977, + "num_input_tokens_seen": 19759832, + "step": 10092 + }, + { + "epoch": 1.3377070907886017, + "grad_norm": 0.05178523808717728, + "learning_rate": 2.924735060475902e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19760968, + "step": 10093 + }, + { + "epoch": 1.3378396288933068, + "grad_norm": 9.346990585327148, + "learning_rate": 2.924392965637334e-06, + "loss": 0.0569, + "num_input_tokens_seen": 19763960, + "step": 10094 + }, + { + "epoch": 1.337972166998012, + "grad_norm": 8.95223617553711, + "learning_rate": 2.9240508626161766e-06, + "loss": 0.3841, + "num_input_tokens_seen": 19765456, + "step": 10095 + }, + { + "epoch": 1.338104705102717, + "grad_norm": 19.677982330322266, + "learning_rate": 2.923708751419026e-06, + "loss": 0.1088, + "num_input_tokens_seen": 19768480, + "step": 10096 + }, + { + "epoch": 1.338237243207422, + "grad_norm": 0.1691170036792755, + "learning_rate": 2.923366632052478e-06, + "loss": 0.0005, + "num_input_tokens_seen": 19770968, + "step": 10097 + }, + { + "epoch": 1.3383697813121271, + "grad_norm": 13.489299774169922, + "learning_rate": 2.9230245045231303e-06, + "loss": 0.3041, + "num_input_tokens_seen": 19773904, + "step": 10098 + }, + { + "epoch": 1.3385023194168324, + "grad_norm": 8.371328353881836, + "learning_rate": 2.922682368837577e-06, + "loss": 0.2247, + "num_input_tokens_seen": 19775392, + "step": 10099 + }, + { + "epoch": 1.3386348575215374, + "grad_norm": 0.026870815083384514, + "learning_rate": 2.922340225002417e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19777424, + "step": 10100 + }, + { + "epoch": 1.3387673956262425, + "grad_norm": 0.27529075741767883, + "learning_rate": 2.9219980730242457e-06, + "loss": 0.0013, + "num_input_tokens_seen": 19779296, + "step": 10101 + }, + { + "epoch": 1.3388999337309477, + "grad_norm": 0.11955448985099792, + "learning_rate": 2.92165591290966e-06, + "loss": 0.0005, + "num_input_tokens_seen": 19780984, + "step": 10102 + }, + { + "epoch": 1.3390324718356528, + "grad_norm": 4.565912246704102, + "learning_rate": 2.9213137446652586e-06, + "loss": 0.0809, + "num_input_tokens_seen": 19783016, + "step": 10103 + }, + { + "epoch": 1.3391650099403578, + "grad_norm": 8.942695617675781, + "learning_rate": 2.920971568297637e-06, + "loss": 0.2034, + "num_input_tokens_seen": 19785504, + "step": 10104 + }, + { + "epoch": 1.3392975480450628, + "grad_norm": 4.521328926086426, + "learning_rate": 2.9206293838133926e-06, + "loss": 0.1611, + "num_input_tokens_seen": 19787608, + "step": 10105 + }, + { + "epoch": 1.339430086149768, + "grad_norm": 0.9563708901405334, + "learning_rate": 2.9202871912191245e-06, + "loss": 0.0162, + "num_input_tokens_seen": 19789584, + "step": 10106 + }, + { + "epoch": 1.3395626242544731, + "grad_norm": 13.546354293823242, + "learning_rate": 2.919944990521429e-06, + "loss": 0.3334, + "num_input_tokens_seen": 19791968, + "step": 10107 + }, + { + "epoch": 1.3396951623591782, + "grad_norm": 10.50308609008789, + "learning_rate": 2.9196027817269036e-06, + "loss": 0.3293, + "num_input_tokens_seen": 19794312, + "step": 10108 + }, + { + "epoch": 1.3398277004638834, + "grad_norm": 1.4749419689178467, + "learning_rate": 2.919260564842148e-06, + "loss": 0.0178, + "num_input_tokens_seen": 19796200, + "step": 10109 + }, + { + "epoch": 1.3399602385685885, + "grad_norm": 8.910107612609863, + "learning_rate": 2.9189183398737592e-06, + "loss": 0.2659, + "num_input_tokens_seen": 19799032, + "step": 10110 + }, + { + "epoch": 1.3400927766732935, + "grad_norm": 0.08865901827812195, + "learning_rate": 2.918576106828336e-06, + "loss": 0.0006, + "num_input_tokens_seen": 19800600, + "step": 10111 + }, + { + "epoch": 1.3402253147779986, + "grad_norm": 0.0036424901336431503, + "learning_rate": 2.9182338657124765e-06, + "loss": 0.0, + "num_input_tokens_seen": 19801768, + "step": 10112 + }, + { + "epoch": 1.3403578528827038, + "grad_norm": 5.268975257873535, + "learning_rate": 2.9178916165327796e-06, + "loss": 0.1854, + "num_input_tokens_seen": 19803720, + "step": 10113 + }, + { + "epoch": 1.3404903909874089, + "grad_norm": 4.819614410400391, + "learning_rate": 2.917549359295844e-06, + "loss": 0.1283, + "num_input_tokens_seen": 19805696, + "step": 10114 + }, + { + "epoch": 1.3406229290921141, + "grad_norm": 13.397098541259766, + "learning_rate": 2.9172070940082686e-06, + "loss": 0.3197, + "num_input_tokens_seen": 19807600, + "step": 10115 + }, + { + "epoch": 1.3407554671968192, + "grad_norm": 8.238554954528809, + "learning_rate": 2.916864820676653e-06, + "loss": 0.1971, + "num_input_tokens_seen": 19809272, + "step": 10116 + }, + { + "epoch": 1.3408880053015242, + "grad_norm": 0.016477268189191818, + "learning_rate": 2.916522539307595e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19810328, + "step": 10117 + }, + { + "epoch": 1.3410205434062292, + "grad_norm": 0.04328158497810364, + "learning_rate": 2.916180249907696e-06, + "loss": 0.0003, + "num_input_tokens_seen": 19813096, + "step": 10118 + }, + { + "epoch": 1.3411530815109343, + "grad_norm": 7.495695114135742, + "learning_rate": 2.9158379524835556e-06, + "loss": 0.0877, + "num_input_tokens_seen": 19815608, + "step": 10119 + }, + { + "epoch": 1.3412856196156395, + "grad_norm": 3.471045732498169, + "learning_rate": 2.9154956470417704e-06, + "loss": 0.0265, + "num_input_tokens_seen": 19817336, + "step": 10120 + }, + { + "epoch": 1.3414181577203446, + "grad_norm": 0.19808629155158997, + "learning_rate": 2.9151533335889446e-06, + "loss": 0.0029, + "num_input_tokens_seen": 19819824, + "step": 10121 + }, + { + "epoch": 1.3415506958250498, + "grad_norm": 0.04542338475584984, + "learning_rate": 2.9148110121316752e-06, + "loss": 0.0003, + "num_input_tokens_seen": 19822256, + "step": 10122 + }, + { + "epoch": 1.3416832339297549, + "grad_norm": 1.081764817237854, + "learning_rate": 2.914468682676563e-06, + "loss": 0.0061, + "num_input_tokens_seen": 19823960, + "step": 10123 + }, + { + "epoch": 1.34181577203446, + "grad_norm": 7.454776763916016, + "learning_rate": 2.9141263452302095e-06, + "loss": 0.1221, + "num_input_tokens_seen": 19825472, + "step": 10124 + }, + { + "epoch": 1.341948310139165, + "grad_norm": 12.951793670654297, + "learning_rate": 2.9137839997992133e-06, + "loss": 0.1706, + "num_input_tokens_seen": 19827232, + "step": 10125 + }, + { + "epoch": 1.3420808482438702, + "grad_norm": 0.0587170273065567, + "learning_rate": 2.9134416463901773e-06, + "loss": 0.0004, + "num_input_tokens_seen": 19828720, + "step": 10126 + }, + { + "epoch": 1.3422133863485752, + "grad_norm": 0.05062089115381241, + "learning_rate": 2.9130992850097006e-06, + "loss": 0.0004, + "num_input_tokens_seen": 19830448, + "step": 10127 + }, + { + "epoch": 1.3423459244532803, + "grad_norm": 0.9958415627479553, + "learning_rate": 2.9127569156643842e-06, + "loss": 0.0046, + "num_input_tokens_seen": 19831520, + "step": 10128 + }, + { + "epoch": 1.3424784625579855, + "grad_norm": 11.671673774719238, + "learning_rate": 2.9124145383608304e-06, + "loss": 0.2535, + "num_input_tokens_seen": 19833704, + "step": 10129 + }, + { + "epoch": 1.3426110006626906, + "grad_norm": 0.027266185730695724, + "learning_rate": 2.9120721531056395e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19835176, + "step": 10130 + }, + { + "epoch": 1.3427435387673956, + "grad_norm": 2.540097236633301, + "learning_rate": 2.9117297599054135e-06, + "loss": 0.0375, + "num_input_tokens_seen": 19837504, + "step": 10131 + }, + { + "epoch": 1.3428760768721006, + "grad_norm": 12.47049331665039, + "learning_rate": 2.911387358766753e-06, + "loss": 0.1394, + "num_input_tokens_seen": 19838960, + "step": 10132 + }, + { + "epoch": 1.343008614976806, + "grad_norm": 3.064147710800171, + "learning_rate": 2.911044949696261e-06, + "loss": 0.0131, + "num_input_tokens_seen": 19840536, + "step": 10133 + }, + { + "epoch": 1.343141153081511, + "grad_norm": 2.4661829471588135, + "learning_rate": 2.910702532700539e-06, + "loss": 0.0064, + "num_input_tokens_seen": 19843376, + "step": 10134 + }, + { + "epoch": 1.343273691186216, + "grad_norm": 0.021428564563393593, + "learning_rate": 2.9103601077861875e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19844784, + "step": 10135 + }, + { + "epoch": 1.3434062292909212, + "grad_norm": 1.7033615112304688, + "learning_rate": 2.910017674959812e-06, + "loss": 0.0123, + "num_input_tokens_seen": 19846440, + "step": 10136 + }, + { + "epoch": 1.3435387673956263, + "grad_norm": 5.182422637939453, + "learning_rate": 2.909675234228012e-06, + "loss": 0.0891, + "num_input_tokens_seen": 19848280, + "step": 10137 + }, + { + "epoch": 1.3436713055003313, + "grad_norm": 1.9017574787139893, + "learning_rate": 2.9093327855973904e-06, + "loss": 0.0117, + "num_input_tokens_seen": 19849760, + "step": 10138 + }, + { + "epoch": 1.3438038436050364, + "grad_norm": 7.488884925842285, + "learning_rate": 2.9089903290745507e-06, + "loss": 0.1768, + "num_input_tokens_seen": 19851240, + "step": 10139 + }, + { + "epoch": 1.3439363817097416, + "grad_norm": 0.7020218968391418, + "learning_rate": 2.9086478646660952e-06, + "loss": 0.0043, + "num_input_tokens_seen": 19853360, + "step": 10140 + }, + { + "epoch": 1.3440689198144466, + "grad_norm": 12.125798225402832, + "learning_rate": 2.908305392378627e-06, + "loss": 0.2483, + "num_input_tokens_seen": 19855520, + "step": 10141 + }, + { + "epoch": 1.3442014579191517, + "grad_norm": 3.5772783756256104, + "learning_rate": 2.9079629122187497e-06, + "loss": 0.031, + "num_input_tokens_seen": 19858712, + "step": 10142 + }, + { + "epoch": 1.344333996023857, + "grad_norm": 0.4007512331008911, + "learning_rate": 2.9076204241930654e-06, + "loss": 0.0053, + "num_input_tokens_seen": 19860320, + "step": 10143 + }, + { + "epoch": 1.344466534128562, + "grad_norm": 0.014961271546781063, + "learning_rate": 2.907277928308178e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19861792, + "step": 10144 + }, + { + "epoch": 1.344599072233267, + "grad_norm": 13.702409744262695, + "learning_rate": 2.906935424570692e-06, + "loss": 0.2687, + "num_input_tokens_seen": 19864440, + "step": 10145 + }, + { + "epoch": 1.344731610337972, + "grad_norm": 0.8807671070098877, + "learning_rate": 2.9065929129872097e-06, + "loss": 0.0076, + "num_input_tokens_seen": 19865968, + "step": 10146 + }, + { + "epoch": 1.3448641484426773, + "grad_norm": 8.62695598602295, + "learning_rate": 2.906250393564335e-06, + "loss": 0.1194, + "num_input_tokens_seen": 19867800, + "step": 10147 + }, + { + "epoch": 1.3449966865473824, + "grad_norm": 3.3124172687530518, + "learning_rate": 2.905907866308673e-06, + "loss": 0.0274, + "num_input_tokens_seen": 19869744, + "step": 10148 + }, + { + "epoch": 1.3451292246520874, + "grad_norm": 7.892882823944092, + "learning_rate": 2.9055653312268277e-06, + "loss": 0.1785, + "num_input_tokens_seen": 19871680, + "step": 10149 + }, + { + "epoch": 1.3452617627567927, + "grad_norm": 11.916269302368164, + "learning_rate": 2.9052227883254023e-06, + "loss": 0.2335, + "num_input_tokens_seen": 19873496, + "step": 10150 + }, + { + "epoch": 1.3453943008614977, + "grad_norm": 0.008640076033771038, + "learning_rate": 2.9048802376110023e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19876096, + "step": 10151 + }, + { + "epoch": 1.3455268389662027, + "grad_norm": 0.02837945520877838, + "learning_rate": 2.9045376790902326e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19878008, + "step": 10152 + }, + { + "epoch": 1.3456593770709078, + "grad_norm": 6.174884796142578, + "learning_rate": 2.904195112769696e-06, + "loss": 0.0219, + "num_input_tokens_seen": 19879440, + "step": 10153 + }, + { + "epoch": 1.345791915175613, + "grad_norm": 0.005460857413709164, + "learning_rate": 2.903852538656e-06, + "loss": 0.0, + "num_input_tokens_seen": 19880672, + "step": 10154 + }, + { + "epoch": 1.345924453280318, + "grad_norm": 9.355748176574707, + "learning_rate": 2.9035099567557477e-06, + "loss": 0.2427, + "num_input_tokens_seen": 19883192, + "step": 10155 + }, + { + "epoch": 1.3460569913850233, + "grad_norm": 8.21791934967041, + "learning_rate": 2.9031673670755455e-06, + "loss": 0.1599, + "num_input_tokens_seen": 19884784, + "step": 10156 + }, + { + "epoch": 1.3461895294897284, + "grad_norm": 2.5992867946624756, + "learning_rate": 2.902824769621999e-06, + "loss": 0.0696, + "num_input_tokens_seen": 19886672, + "step": 10157 + }, + { + "epoch": 1.3463220675944334, + "grad_norm": 10.464076042175293, + "learning_rate": 2.902482164401712e-06, + "loss": 0.2608, + "num_input_tokens_seen": 19887984, + "step": 10158 + }, + { + "epoch": 1.3464546056991384, + "grad_norm": 8.851065635681152, + "learning_rate": 2.9021395514212915e-06, + "loss": 0.217, + "num_input_tokens_seen": 19889952, + "step": 10159 + }, + { + "epoch": 1.3465871438038435, + "grad_norm": 10.956007957458496, + "learning_rate": 2.9017969306873435e-06, + "loss": 0.266, + "num_input_tokens_seen": 19892328, + "step": 10160 + }, + { + "epoch": 1.3467196819085487, + "grad_norm": 0.3187258541584015, + "learning_rate": 2.901454302206473e-06, + "loss": 0.0026, + "num_input_tokens_seen": 19893864, + "step": 10161 + }, + { + "epoch": 1.3468522200132538, + "grad_norm": 0.06205557659268379, + "learning_rate": 2.901111665985286e-06, + "loss": 0.0003, + "num_input_tokens_seen": 19895432, + "step": 10162 + }, + { + "epoch": 1.346984758117959, + "grad_norm": 0.0053013465367257595, + "learning_rate": 2.9007690220303906e-06, + "loss": 0.0, + "num_input_tokens_seen": 19897064, + "step": 10163 + }, + { + "epoch": 1.347117296222664, + "grad_norm": 0.017224103212356567, + "learning_rate": 2.9004263703483914e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19899688, + "step": 10164 + }, + { + "epoch": 1.3472498343273691, + "grad_norm": 2.4016005992889404, + "learning_rate": 2.9000837109458952e-06, + "loss": 0.0334, + "num_input_tokens_seen": 19901032, + "step": 10165 + }, + { + "epoch": 1.3473823724320741, + "grad_norm": 0.02619960345327854, + "learning_rate": 2.8997410438295094e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19902904, + "step": 10166 + }, + { + "epoch": 1.3475149105367792, + "grad_norm": 18.34744644165039, + "learning_rate": 2.8993983690058408e-06, + "loss": 0.4916, + "num_input_tokens_seen": 19905312, + "step": 10167 + }, + { + "epoch": 1.3476474486414844, + "grad_norm": 8.735126495361328, + "learning_rate": 2.8990556864814958e-06, + "loss": 0.2607, + "num_input_tokens_seen": 19907000, + "step": 10168 + }, + { + "epoch": 1.3477799867461895, + "grad_norm": 9.965794563293457, + "learning_rate": 2.8987129962630817e-06, + "loss": 0.2581, + "num_input_tokens_seen": 19909032, + "step": 10169 + }, + { + "epoch": 1.3479125248508947, + "grad_norm": 3.9884376525878906, + "learning_rate": 2.8983702983572058e-06, + "loss": 0.102, + "num_input_tokens_seen": 19910672, + "step": 10170 + }, + { + "epoch": 1.3480450629555998, + "grad_norm": 5.314563274383545, + "learning_rate": 2.8980275927704766e-06, + "loss": 0.0869, + "num_input_tokens_seen": 19912664, + "step": 10171 + }, + { + "epoch": 1.3481776010603048, + "grad_norm": 6.50699520111084, + "learning_rate": 2.8976848795095e-06, + "loss": 0.0239, + "num_input_tokens_seen": 19914544, + "step": 10172 + }, + { + "epoch": 1.3483101391650099, + "grad_norm": 4.813033103942871, + "learning_rate": 2.8973421585808846e-06, + "loss": 0.1234, + "num_input_tokens_seen": 19916416, + "step": 10173 + }, + { + "epoch": 1.3484426772697151, + "grad_norm": 0.07860854268074036, + "learning_rate": 2.8969994299912385e-06, + "loss": 0.0003, + "num_input_tokens_seen": 19917584, + "step": 10174 + }, + { + "epoch": 1.3485752153744202, + "grad_norm": 4.865730285644531, + "learning_rate": 2.8966566937471704e-06, + "loss": 0.0942, + "num_input_tokens_seen": 19920096, + "step": 10175 + }, + { + "epoch": 1.3487077534791252, + "grad_norm": 6.186446666717529, + "learning_rate": 2.8963139498552862e-06, + "loss": 0.0228, + "num_input_tokens_seen": 19921736, + "step": 10176 + }, + { + "epoch": 1.3488402915838305, + "grad_norm": 0.04173063859343529, + "learning_rate": 2.895971198322196e-06, + "loss": 0.0003, + "num_input_tokens_seen": 19924080, + "step": 10177 + }, + { + "epoch": 1.3489728296885355, + "grad_norm": 12.86430835723877, + "learning_rate": 2.8956284391545088e-06, + "loss": 0.1685, + "num_input_tokens_seen": 19925776, + "step": 10178 + }, + { + "epoch": 1.3491053677932405, + "grad_norm": 0.038513023406267166, + "learning_rate": 2.8952856723588313e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19928400, + "step": 10179 + }, + { + "epoch": 1.3492379058979456, + "grad_norm": 8.598003387451172, + "learning_rate": 2.8949428979417737e-06, + "loss": 0.2563, + "num_input_tokens_seen": 19930488, + "step": 10180 + }, + { + "epoch": 1.3493704440026508, + "grad_norm": 3.0397284030914307, + "learning_rate": 2.8946001159099454e-06, + "loss": 0.0348, + "num_input_tokens_seen": 19933040, + "step": 10181 + }, + { + "epoch": 1.3495029821073559, + "grad_norm": 8.168566703796387, + "learning_rate": 2.8942573262699523e-06, + "loss": 0.1854, + "num_input_tokens_seen": 19935176, + "step": 10182 + }, + { + "epoch": 1.349635520212061, + "grad_norm": 4.828032493591309, + "learning_rate": 2.8939145290284082e-06, + "loss": 0.0502, + "num_input_tokens_seen": 19936488, + "step": 10183 + }, + { + "epoch": 1.3497680583167662, + "grad_norm": 0.024451255798339844, + "learning_rate": 2.89357172419192e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19939440, + "step": 10184 + }, + { + "epoch": 1.3499005964214712, + "grad_norm": 0.03356282785534859, + "learning_rate": 2.8932289117670964e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19941448, + "step": 10185 + }, + { + "epoch": 1.3500331345261762, + "grad_norm": 14.649279594421387, + "learning_rate": 2.892886091760549e-06, + "loss": 0.1989, + "num_input_tokens_seen": 19943488, + "step": 10186 + }, + { + "epoch": 1.3501656726308813, + "grad_norm": 7.300917625427246, + "learning_rate": 2.8925432641788862e-06, + "loss": 0.1145, + "num_input_tokens_seen": 19945520, + "step": 10187 + }, + { + "epoch": 1.3502982107355865, + "grad_norm": 3.558278799057007, + "learning_rate": 2.892200429028719e-06, + "loss": 0.0836, + "num_input_tokens_seen": 19947064, + "step": 10188 + }, + { + "epoch": 1.3504307488402916, + "grad_norm": 0.13311751186847687, + "learning_rate": 2.8918575863166572e-06, + "loss": 0.0012, + "num_input_tokens_seen": 19949792, + "step": 10189 + }, + { + "epoch": 1.3505632869449966, + "grad_norm": 4.461389064788818, + "learning_rate": 2.8915147360493095e-06, + "loss": 0.1042, + "num_input_tokens_seen": 19951968, + "step": 10190 + }, + { + "epoch": 1.3506958250497019, + "grad_norm": 6.293132781982422, + "learning_rate": 2.8911718782332888e-06, + "loss": 0.0531, + "num_input_tokens_seen": 19953856, + "step": 10191 + }, + { + "epoch": 1.350828363154407, + "grad_norm": 2.249549388885498, + "learning_rate": 2.8908290128752037e-06, + "loss": 0.0292, + "num_input_tokens_seen": 19955664, + "step": 10192 + }, + { + "epoch": 1.350960901259112, + "grad_norm": 14.243449211120605, + "learning_rate": 2.8904861399816658e-06, + "loss": 0.1987, + "num_input_tokens_seen": 19957880, + "step": 10193 + }, + { + "epoch": 1.351093439363817, + "grad_norm": 5.069923400878906, + "learning_rate": 2.890143259559286e-06, + "loss": 0.0888, + "num_input_tokens_seen": 19960136, + "step": 10194 + }, + { + "epoch": 1.3512259774685222, + "grad_norm": 1.965651273727417, + "learning_rate": 2.8898003716146745e-06, + "loss": 0.0148, + "num_input_tokens_seen": 19962976, + "step": 10195 + }, + { + "epoch": 1.3513585155732273, + "grad_norm": 24.33405113220215, + "learning_rate": 2.8894574761544437e-06, + "loss": 0.1173, + "num_input_tokens_seen": 19965880, + "step": 10196 + }, + { + "epoch": 1.3514910536779325, + "grad_norm": 9.608214378356934, + "learning_rate": 2.889114573185203e-06, + "loss": 0.1648, + "num_input_tokens_seen": 19969280, + "step": 10197 + }, + { + "epoch": 1.3516235917826376, + "grad_norm": 0.01072927750647068, + "learning_rate": 2.888771662713566e-06, + "loss": 0.0001, + "num_input_tokens_seen": 19970960, + "step": 10198 + }, + { + "epoch": 1.3517561298873426, + "grad_norm": 5.591780662536621, + "learning_rate": 2.8884287447461427e-06, + "loss": 0.1571, + "num_input_tokens_seen": 19972808, + "step": 10199 + }, + { + "epoch": 1.3518886679920477, + "grad_norm": 5.871712684631348, + "learning_rate": 2.888085819289545e-06, + "loss": 0.0367, + "num_input_tokens_seen": 19974648, + "step": 10200 + }, + { + "epoch": 1.3520212060967527, + "grad_norm": 5.371670722961426, + "learning_rate": 2.887742886350385e-06, + "loss": 0.0086, + "num_input_tokens_seen": 19976400, + "step": 10201 + }, + { + "epoch": 1.352153744201458, + "grad_norm": 0.9329508543014526, + "learning_rate": 2.8873999459352758e-06, + "loss": 0.008, + "num_input_tokens_seen": 19978400, + "step": 10202 + }, + { + "epoch": 1.352286282306163, + "grad_norm": 9.446467399597168, + "learning_rate": 2.8870569980508266e-06, + "loss": 0.2293, + "num_input_tokens_seen": 19980968, + "step": 10203 + }, + { + "epoch": 1.3524188204108682, + "grad_norm": 7.590339660644531, + "learning_rate": 2.886714042703653e-06, + "loss": 0.2046, + "num_input_tokens_seen": 19983472, + "step": 10204 + }, + { + "epoch": 1.3525513585155733, + "grad_norm": 4.051093101501465, + "learning_rate": 2.8863710799003645e-06, + "loss": 0.0643, + "num_input_tokens_seen": 19985208, + "step": 10205 + }, + { + "epoch": 1.3526838966202783, + "grad_norm": 0.039376918226480484, + "learning_rate": 2.8860281096475764e-06, + "loss": 0.0002, + "num_input_tokens_seen": 19987256, + "step": 10206 + }, + { + "epoch": 1.3528164347249834, + "grad_norm": 12.279695510864258, + "learning_rate": 2.885685131951899e-06, + "loss": 0.4457, + "num_input_tokens_seen": 19989392, + "step": 10207 + }, + { + "epoch": 1.3529489728296884, + "grad_norm": 7.10658597946167, + "learning_rate": 2.8853421468199465e-06, + "loss": 0.0961, + "num_input_tokens_seen": 19991152, + "step": 10208 + }, + { + "epoch": 1.3530815109343937, + "grad_norm": 8.920280456542969, + "learning_rate": 2.8849991542583315e-06, + "loss": 0.2092, + "num_input_tokens_seen": 19992928, + "step": 10209 + }, + { + "epoch": 1.3532140490390987, + "grad_norm": 0.9285838603973389, + "learning_rate": 2.884656154273667e-06, + "loss": 0.0043, + "num_input_tokens_seen": 19994984, + "step": 10210 + }, + { + "epoch": 1.353346587143804, + "grad_norm": 8.971582412719727, + "learning_rate": 2.8843131468725677e-06, + "loss": 0.0741, + "num_input_tokens_seen": 19997408, + "step": 10211 + }, + { + "epoch": 1.353479125248509, + "grad_norm": 5.864439487457275, + "learning_rate": 2.883970132061645e-06, + "loss": 0.1143, + "num_input_tokens_seen": 20000552, + "step": 10212 + }, + { + "epoch": 1.353611663353214, + "grad_norm": 0.030447054654359818, + "learning_rate": 2.8836271098475126e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20003408, + "step": 10213 + }, + { + "epoch": 1.353744201457919, + "grad_norm": 3.24778151512146, + "learning_rate": 2.883284080236786e-06, + "loss": 0.0248, + "num_input_tokens_seen": 20006560, + "step": 10214 + }, + { + "epoch": 1.3538767395626243, + "grad_norm": 0.009181424044072628, + "learning_rate": 2.8829410432360773e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20007880, + "step": 10215 + }, + { + "epoch": 1.3540092776673294, + "grad_norm": 0.016043713316321373, + "learning_rate": 2.8825979988520007e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20009760, + "step": 10216 + }, + { + "epoch": 1.3541418157720344, + "grad_norm": 8.518325805664062, + "learning_rate": 2.882254947091172e-06, + "loss": 0.1072, + "num_input_tokens_seen": 20011560, + "step": 10217 + }, + { + "epoch": 1.3542743538767397, + "grad_norm": 5.284838676452637, + "learning_rate": 2.881911887960203e-06, + "loss": 0.0611, + "num_input_tokens_seen": 20013088, + "step": 10218 + }, + { + "epoch": 1.3544068919814447, + "grad_norm": 4.867649078369141, + "learning_rate": 2.88156882146571e-06, + "loss": 0.1284, + "num_input_tokens_seen": 20014920, + "step": 10219 + }, + { + "epoch": 1.3545394300861497, + "grad_norm": 13.477498054504395, + "learning_rate": 2.8812257476143064e-06, + "loss": 0.4959, + "num_input_tokens_seen": 20017280, + "step": 10220 + }, + { + "epoch": 1.3546719681908548, + "grad_norm": 10.210515975952148, + "learning_rate": 2.8808826664126078e-06, + "loss": 0.1693, + "num_input_tokens_seen": 20019120, + "step": 10221 + }, + { + "epoch": 1.35480450629556, + "grad_norm": 2.6398537158966064, + "learning_rate": 2.8805395778672284e-06, + "loss": 0.0227, + "num_input_tokens_seen": 20021016, + "step": 10222 + }, + { + "epoch": 1.354937044400265, + "grad_norm": 3.121579885482788, + "learning_rate": 2.880196481984784e-06, + "loss": 0.0243, + "num_input_tokens_seen": 20022512, + "step": 10223 + }, + { + "epoch": 1.3550695825049701, + "grad_norm": 3.8359482288360596, + "learning_rate": 2.879853378771888e-06, + "loss": 0.045, + "num_input_tokens_seen": 20024920, + "step": 10224 + }, + { + "epoch": 1.3552021206096754, + "grad_norm": 6.265467166900635, + "learning_rate": 2.8795102682351585e-06, + "loss": 0.2113, + "num_input_tokens_seen": 20027256, + "step": 10225 + }, + { + "epoch": 1.3553346587143804, + "grad_norm": 4.263208866119385, + "learning_rate": 2.8791671503812076e-06, + "loss": 0.0941, + "num_input_tokens_seen": 20029512, + "step": 10226 + }, + { + "epoch": 1.3554671968190855, + "grad_norm": 0.9884427785873413, + "learning_rate": 2.878824025216653e-06, + "loss": 0.008, + "num_input_tokens_seen": 20031040, + "step": 10227 + }, + { + "epoch": 1.3555997349237905, + "grad_norm": 1.610509991645813, + "learning_rate": 2.87848089274811e-06, + "loss": 0.0131, + "num_input_tokens_seen": 20032464, + "step": 10228 + }, + { + "epoch": 1.3557322730284957, + "grad_norm": 14.54312801361084, + "learning_rate": 2.878137752982194e-06, + "loss": 0.4424, + "num_input_tokens_seen": 20034872, + "step": 10229 + }, + { + "epoch": 1.3558648111332008, + "grad_norm": 5.1244049072265625, + "learning_rate": 2.877794605925521e-06, + "loss": 0.1085, + "num_input_tokens_seen": 20036952, + "step": 10230 + }, + { + "epoch": 1.3559973492379058, + "grad_norm": 14.133984565734863, + "learning_rate": 2.8774514515847087e-06, + "loss": 0.3746, + "num_input_tokens_seen": 20039408, + "step": 10231 + }, + { + "epoch": 1.356129887342611, + "grad_norm": 0.022026080638170242, + "learning_rate": 2.877108289966371e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20042696, + "step": 10232 + }, + { + "epoch": 1.3562624254473161, + "grad_norm": 0.014277014881372452, + "learning_rate": 2.8767651210771255e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20044264, + "step": 10233 + }, + { + "epoch": 1.3563949635520212, + "grad_norm": 12.550745010375977, + "learning_rate": 2.876421944923588e-06, + "loss": 0.2026, + "num_input_tokens_seen": 20046000, + "step": 10234 + }, + { + "epoch": 1.3565275016567262, + "grad_norm": 0.06669106334447861, + "learning_rate": 2.8760787615123764e-06, + "loss": 0.0004, + "num_input_tokens_seen": 20048168, + "step": 10235 + }, + { + "epoch": 1.3566600397614315, + "grad_norm": 6.550791263580322, + "learning_rate": 2.8757355708501067e-06, + "loss": 0.099, + "num_input_tokens_seen": 20050192, + "step": 10236 + }, + { + "epoch": 1.3567925778661365, + "grad_norm": 4.426619529724121, + "learning_rate": 2.8753923729433964e-06, + "loss": 0.1262, + "num_input_tokens_seen": 20051984, + "step": 10237 + }, + { + "epoch": 1.3569251159708418, + "grad_norm": 10.63064193725586, + "learning_rate": 2.875049167798861e-06, + "loss": 0.2175, + "num_input_tokens_seen": 20054096, + "step": 10238 + }, + { + "epoch": 1.3570576540755468, + "grad_norm": 13.080416679382324, + "learning_rate": 2.87470595542312e-06, + "loss": 0.3272, + "num_input_tokens_seen": 20056288, + "step": 10239 + }, + { + "epoch": 1.3571901921802518, + "grad_norm": 0.38462960720062256, + "learning_rate": 2.874362735822789e-06, + "loss": 0.0019, + "num_input_tokens_seen": 20058112, + "step": 10240 + }, + { + "epoch": 1.3573227302849569, + "grad_norm": 0.2194252461194992, + "learning_rate": 2.874019509004487e-06, + "loss": 0.0012, + "num_input_tokens_seen": 20059712, + "step": 10241 + }, + { + "epoch": 1.357455268389662, + "grad_norm": 4.273165225982666, + "learning_rate": 2.8736762749748305e-06, + "loss": 0.085, + "num_input_tokens_seen": 20061400, + "step": 10242 + }, + { + "epoch": 1.3575878064943672, + "grad_norm": 0.3740423023700714, + "learning_rate": 2.8733330337404376e-06, + "loss": 0.0017, + "num_input_tokens_seen": 20063776, + "step": 10243 + }, + { + "epoch": 1.3577203445990722, + "grad_norm": 1.5675824880599976, + "learning_rate": 2.8729897853079257e-06, + "loss": 0.016, + "num_input_tokens_seen": 20065576, + "step": 10244 + }, + { + "epoch": 1.3578528827037775, + "grad_norm": 6.7635016441345215, + "learning_rate": 2.872646529683914e-06, + "loss": 0.1116, + "num_input_tokens_seen": 20067072, + "step": 10245 + }, + { + "epoch": 1.3579854208084825, + "grad_norm": 4.161694049835205, + "learning_rate": 2.8723032668750204e-06, + "loss": 0.0461, + "num_input_tokens_seen": 20068392, + "step": 10246 + }, + { + "epoch": 1.3581179589131875, + "grad_norm": 0.04506103694438934, + "learning_rate": 2.871959996887862e-06, + "loss": 0.0003, + "num_input_tokens_seen": 20069672, + "step": 10247 + }, + { + "epoch": 1.3582504970178926, + "grad_norm": 4.090624809265137, + "learning_rate": 2.871616719729059e-06, + "loss": 0.0897, + "num_input_tokens_seen": 20071552, + "step": 10248 + }, + { + "epoch": 1.3583830351225976, + "grad_norm": 4.394562721252441, + "learning_rate": 2.8712734354052293e-06, + "loss": 0.0511, + "num_input_tokens_seen": 20073520, + "step": 10249 + }, + { + "epoch": 1.3585155732273029, + "grad_norm": 5.162529468536377, + "learning_rate": 2.8709301439229916e-06, + "loss": 0.0462, + "num_input_tokens_seen": 20075128, + "step": 10250 + }, + { + "epoch": 1.358648111332008, + "grad_norm": 2.5445590019226074, + "learning_rate": 2.8705868452889646e-06, + "loss": 0.085, + "num_input_tokens_seen": 20076856, + "step": 10251 + }, + { + "epoch": 1.3587806494367132, + "grad_norm": 3.473867654800415, + "learning_rate": 2.870243539509768e-06, + "loss": 0.0978, + "num_input_tokens_seen": 20078312, + "step": 10252 + }, + { + "epoch": 1.3589131875414182, + "grad_norm": 11.80223274230957, + "learning_rate": 2.8699002265920193e-06, + "loss": 0.2638, + "num_input_tokens_seen": 20080264, + "step": 10253 + }, + { + "epoch": 1.3590457256461232, + "grad_norm": 0.14455585181713104, + "learning_rate": 2.86955690654234e-06, + "loss": 0.001, + "num_input_tokens_seen": 20081824, + "step": 10254 + }, + { + "epoch": 1.3591782637508283, + "grad_norm": 0.05246931314468384, + "learning_rate": 2.869213579367349e-06, + "loss": 0.0007, + "num_input_tokens_seen": 20083376, + "step": 10255 + }, + { + "epoch": 1.3593108018555335, + "grad_norm": 1.6229512691497803, + "learning_rate": 2.868870245073665e-06, + "loss": 0.0123, + "num_input_tokens_seen": 20084744, + "step": 10256 + }, + { + "epoch": 1.3594433399602386, + "grad_norm": 7.217762470245361, + "learning_rate": 2.868526903667908e-06, + "loss": 0.1523, + "num_input_tokens_seen": 20086712, + "step": 10257 + }, + { + "epoch": 1.3595758780649436, + "grad_norm": 3.4183149337768555, + "learning_rate": 2.868183555156699e-06, + "loss": 0.0325, + "num_input_tokens_seen": 20088192, + "step": 10258 + }, + { + "epoch": 1.3597084161696489, + "grad_norm": 11.224275588989258, + "learning_rate": 2.867840199546656e-06, + "loss": 0.1021, + "num_input_tokens_seen": 20090112, + "step": 10259 + }, + { + "epoch": 1.359840954274354, + "grad_norm": 5.306778907775879, + "learning_rate": 2.8674968368444004e-06, + "loss": 0.132, + "num_input_tokens_seen": 20091904, + "step": 10260 + }, + { + "epoch": 1.359973492379059, + "grad_norm": 2.3273115158081055, + "learning_rate": 2.867153467056553e-06, + "loss": 0.0155, + "num_input_tokens_seen": 20093648, + "step": 10261 + }, + { + "epoch": 1.360106030483764, + "grad_norm": 5.035145282745361, + "learning_rate": 2.866810090189732e-06, + "loss": 0.1099, + "num_input_tokens_seen": 20095864, + "step": 10262 + }, + { + "epoch": 1.3602385685884693, + "grad_norm": 6.717398166656494, + "learning_rate": 2.866466706250561e-06, + "loss": 0.1171, + "num_input_tokens_seen": 20098552, + "step": 10263 + }, + { + "epoch": 1.3603711066931743, + "grad_norm": 5.512709140777588, + "learning_rate": 2.8661233152456586e-06, + "loss": 0.0603, + "num_input_tokens_seen": 20100304, + "step": 10264 + }, + { + "epoch": 1.3605036447978793, + "grad_norm": 0.025621457025408745, + "learning_rate": 2.8657799171816454e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20101304, + "step": 10265 + }, + { + "epoch": 1.3606361829025846, + "grad_norm": 2.1676132678985596, + "learning_rate": 2.865436512065144e-06, + "loss": 0.0379, + "num_input_tokens_seen": 20103344, + "step": 10266 + }, + { + "epoch": 1.3607687210072896, + "grad_norm": 0.12138326466083527, + "learning_rate": 2.8650930999027736e-06, + "loss": 0.0009, + "num_input_tokens_seen": 20104968, + "step": 10267 + }, + { + "epoch": 1.3609012591119947, + "grad_norm": 6.712257385253906, + "learning_rate": 2.864749680701157e-06, + "loss": 0.1843, + "num_input_tokens_seen": 20106112, + "step": 10268 + }, + { + "epoch": 1.3610337972166997, + "grad_norm": 4.170565128326416, + "learning_rate": 2.8644062544669147e-06, + "loss": 0.0701, + "num_input_tokens_seen": 20107552, + "step": 10269 + }, + { + "epoch": 1.361166335321405, + "grad_norm": 10.114723205566406, + "learning_rate": 2.864062821206669e-06, + "loss": 0.3739, + "num_input_tokens_seen": 20109184, + "step": 10270 + }, + { + "epoch": 1.36129887342611, + "grad_norm": 12.09151840209961, + "learning_rate": 2.86371938092704e-06, + "loss": 0.2708, + "num_input_tokens_seen": 20112032, + "step": 10271 + }, + { + "epoch": 1.361431411530815, + "grad_norm": 7.512026786804199, + "learning_rate": 2.8633759336346515e-06, + "loss": 0.1339, + "num_input_tokens_seen": 20115056, + "step": 10272 + }, + { + "epoch": 1.3615639496355203, + "grad_norm": 6.696202278137207, + "learning_rate": 2.8630324793361236e-06, + "loss": 0.1857, + "num_input_tokens_seen": 20117688, + "step": 10273 + }, + { + "epoch": 1.3616964877402253, + "grad_norm": 7.93437385559082, + "learning_rate": 2.862689018038079e-06, + "loss": 0.1602, + "num_input_tokens_seen": 20119112, + "step": 10274 + }, + { + "epoch": 1.3618290258449304, + "grad_norm": 0.15057912468910217, + "learning_rate": 2.86234554974714e-06, + "loss": 0.0022, + "num_input_tokens_seen": 20120848, + "step": 10275 + }, + { + "epoch": 1.3619615639496354, + "grad_norm": 0.11671590059995651, + "learning_rate": 2.8620020744699296e-06, + "loss": 0.0008, + "num_input_tokens_seen": 20122192, + "step": 10276 + }, + { + "epoch": 1.3620941020543407, + "grad_norm": 1.5508573055267334, + "learning_rate": 2.8616585922130684e-06, + "loss": 0.0122, + "num_input_tokens_seen": 20123816, + "step": 10277 + }, + { + "epoch": 1.3622266401590457, + "grad_norm": 2.985682964324951, + "learning_rate": 2.8613151029831803e-06, + "loss": 0.0183, + "num_input_tokens_seen": 20125768, + "step": 10278 + }, + { + "epoch": 1.3623591782637507, + "grad_norm": 0.6363770365715027, + "learning_rate": 2.860971606786888e-06, + "loss": 0.0041, + "num_input_tokens_seen": 20127816, + "step": 10279 + }, + { + "epoch": 1.362491716368456, + "grad_norm": 2.284545660018921, + "learning_rate": 2.8606281036308136e-06, + "loss": 0.0328, + "num_input_tokens_seen": 20128968, + "step": 10280 + }, + { + "epoch": 1.362624254473161, + "grad_norm": 4.79076623916626, + "learning_rate": 2.860284593521581e-06, + "loss": 0.0576, + "num_input_tokens_seen": 20130584, + "step": 10281 + }, + { + "epoch": 1.362756792577866, + "grad_norm": 0.6736502647399902, + "learning_rate": 2.8599410764658138e-06, + "loss": 0.0042, + "num_input_tokens_seen": 20131984, + "step": 10282 + }, + { + "epoch": 1.3628893306825711, + "grad_norm": 1.1504400968551636, + "learning_rate": 2.8595975524701326e-06, + "loss": 0.0077, + "num_input_tokens_seen": 20133944, + "step": 10283 + }, + { + "epoch": 1.3630218687872764, + "grad_norm": 0.2669570744037628, + "learning_rate": 2.8592540215411637e-06, + "loss": 0.0019, + "num_input_tokens_seen": 20135000, + "step": 10284 + }, + { + "epoch": 1.3631544068919814, + "grad_norm": 3.761084794998169, + "learning_rate": 2.85891048368553e-06, + "loss": 0.0168, + "num_input_tokens_seen": 20138040, + "step": 10285 + }, + { + "epoch": 1.3632869449966867, + "grad_norm": 5.310940742492676, + "learning_rate": 2.8585669389098525e-06, + "loss": 0.0231, + "num_input_tokens_seen": 20140312, + "step": 10286 + }, + { + "epoch": 1.3634194831013917, + "grad_norm": 22.121137619018555, + "learning_rate": 2.858223387220758e-06, + "loss": 0.1966, + "num_input_tokens_seen": 20141520, + "step": 10287 + }, + { + "epoch": 1.3635520212060968, + "grad_norm": 0.9412453174591064, + "learning_rate": 2.85787982862487e-06, + "loss": 0.0053, + "num_input_tokens_seen": 20143240, + "step": 10288 + }, + { + "epoch": 1.3636845593108018, + "grad_norm": 16.68670082092285, + "learning_rate": 2.857536263128812e-06, + "loss": 0.402, + "num_input_tokens_seen": 20145456, + "step": 10289 + }, + { + "epoch": 1.3638170974155068, + "grad_norm": 9.812102317810059, + "learning_rate": 2.8571926907392077e-06, + "loss": 0.1274, + "num_input_tokens_seen": 20147752, + "step": 10290 + }, + { + "epoch": 1.363949635520212, + "grad_norm": 1.9744746685028076, + "learning_rate": 2.856849111462682e-06, + "loss": 0.0121, + "num_input_tokens_seen": 20149424, + "step": 10291 + }, + { + "epoch": 1.3640821736249171, + "grad_norm": 7.101957321166992, + "learning_rate": 2.8565055253058595e-06, + "loss": 0.1704, + "num_input_tokens_seen": 20151376, + "step": 10292 + }, + { + "epoch": 1.3642147117296224, + "grad_norm": 10.328837394714355, + "learning_rate": 2.8561619322753635e-06, + "loss": 0.151, + "num_input_tokens_seen": 20153816, + "step": 10293 + }, + { + "epoch": 1.3643472498343274, + "grad_norm": 8.471162796020508, + "learning_rate": 2.855818332377821e-06, + "loss": 0.0582, + "num_input_tokens_seen": 20155968, + "step": 10294 + }, + { + "epoch": 1.3644797879390325, + "grad_norm": 5.953080177307129, + "learning_rate": 2.8554747256198555e-06, + "loss": 0.0525, + "num_input_tokens_seen": 20157768, + "step": 10295 + }, + { + "epoch": 1.3646123260437375, + "grad_norm": 0.03656305745244026, + "learning_rate": 2.8551311120080915e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20159520, + "step": 10296 + }, + { + "epoch": 1.3647448641484428, + "grad_norm": 0.029501622542738914, + "learning_rate": 2.8547874915491554e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20160872, + "step": 10297 + }, + { + "epoch": 1.3648774022531478, + "grad_norm": 0.943374752998352, + "learning_rate": 2.85444386424967e-06, + "loss": 0.0054, + "num_input_tokens_seen": 20162976, + "step": 10298 + }, + { + "epoch": 1.3650099403578528, + "grad_norm": 2.241506576538086, + "learning_rate": 2.8541002301162647e-06, + "loss": 0.0201, + "num_input_tokens_seen": 20164552, + "step": 10299 + }, + { + "epoch": 1.365142478462558, + "grad_norm": 12.566509246826172, + "learning_rate": 2.8537565891555615e-06, + "loss": 0.414, + "num_input_tokens_seen": 20166928, + "step": 10300 + }, + { + "epoch": 1.3652750165672631, + "grad_norm": 8.582292556762695, + "learning_rate": 2.853412941374187e-06, + "loss": 0.1756, + "num_input_tokens_seen": 20168936, + "step": 10301 + }, + { + "epoch": 1.3654075546719682, + "grad_norm": 2.428239107131958, + "learning_rate": 2.853069286778768e-06, + "loss": 0.0069, + "num_input_tokens_seen": 20171320, + "step": 10302 + }, + { + "epoch": 1.3655400927766732, + "grad_norm": 15.82681941986084, + "learning_rate": 2.8527256253759284e-06, + "loss": 0.3676, + "num_input_tokens_seen": 20173088, + "step": 10303 + }, + { + "epoch": 1.3656726308813785, + "grad_norm": 0.0718628391623497, + "learning_rate": 2.852381957172296e-06, + "loss": 0.0005, + "num_input_tokens_seen": 20176328, + "step": 10304 + }, + { + "epoch": 1.3658051689860835, + "grad_norm": 5.051784515380859, + "learning_rate": 2.852038282174497e-06, + "loss": 0.0263, + "num_input_tokens_seen": 20178168, + "step": 10305 + }, + { + "epoch": 1.3659377070907885, + "grad_norm": 0.08920305222272873, + "learning_rate": 2.8516946003891566e-06, + "loss": 0.0006, + "num_input_tokens_seen": 20180504, + "step": 10306 + }, + { + "epoch": 1.3660702451954938, + "grad_norm": 8.767950057983398, + "learning_rate": 2.851350911822901e-06, + "loss": 0.2199, + "num_input_tokens_seen": 20182944, + "step": 10307 + }, + { + "epoch": 1.3662027833001988, + "grad_norm": 2.9058895111083984, + "learning_rate": 2.851007216482359e-06, + "loss": 0.0114, + "num_input_tokens_seen": 20185000, + "step": 10308 + }, + { + "epoch": 1.3663353214049039, + "grad_norm": 6.984080791473389, + "learning_rate": 2.850663514374154e-06, + "loss": 0.1136, + "num_input_tokens_seen": 20186944, + "step": 10309 + }, + { + "epoch": 1.366467859509609, + "grad_norm": 9.4683256149292, + "learning_rate": 2.8503198055049153e-06, + "loss": 0.104, + "num_input_tokens_seen": 20188608, + "step": 10310 + }, + { + "epoch": 1.3666003976143142, + "grad_norm": 10.349664688110352, + "learning_rate": 2.849976089881269e-06, + "loss": 0.1551, + "num_input_tokens_seen": 20190016, + "step": 10311 + }, + { + "epoch": 1.3667329357190192, + "grad_norm": 1.6757444143295288, + "learning_rate": 2.8496323675098427e-06, + "loss": 0.0103, + "num_input_tokens_seen": 20191664, + "step": 10312 + }, + { + "epoch": 1.3668654738237243, + "grad_norm": 0.00777265802025795, + "learning_rate": 2.8492886383972623e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20192840, + "step": 10313 + }, + { + "epoch": 1.3669980119284295, + "grad_norm": 6.374345779418945, + "learning_rate": 2.8489449025501564e-06, + "loss": 0.1638, + "num_input_tokens_seen": 20194288, + "step": 10314 + }, + { + "epoch": 1.3671305500331346, + "grad_norm": 0.0755545124411583, + "learning_rate": 2.8486011599751516e-06, + "loss": 0.0005, + "num_input_tokens_seen": 20195888, + "step": 10315 + }, + { + "epoch": 1.3672630881378396, + "grad_norm": 0.42357337474823, + "learning_rate": 2.848257410678877e-06, + "loss": 0.0038, + "num_input_tokens_seen": 20197672, + "step": 10316 + }, + { + "epoch": 1.3673956262425446, + "grad_norm": 6.133724689483643, + "learning_rate": 2.847913654667959e-06, + "loss": 0.0958, + "num_input_tokens_seen": 20199616, + "step": 10317 + }, + { + "epoch": 1.3675281643472499, + "grad_norm": 8.292734146118164, + "learning_rate": 2.8475698919490252e-06, + "loss": 0.1518, + "num_input_tokens_seen": 20202296, + "step": 10318 + }, + { + "epoch": 1.367660702451955, + "grad_norm": 3.567265272140503, + "learning_rate": 2.847226122528704e-06, + "loss": 0.0831, + "num_input_tokens_seen": 20204152, + "step": 10319 + }, + { + "epoch": 1.36779324055666, + "grad_norm": 3.355271339416504, + "learning_rate": 2.8468823464136247e-06, + "loss": 0.1029, + "num_input_tokens_seen": 20205176, + "step": 10320 + }, + { + "epoch": 1.3679257786613652, + "grad_norm": 0.9234265685081482, + "learning_rate": 2.846538563610413e-06, + "loss": 0.004, + "num_input_tokens_seen": 20206832, + "step": 10321 + }, + { + "epoch": 1.3680583167660703, + "grad_norm": 0.3051629066467285, + "learning_rate": 2.8461947741256995e-06, + "loss": 0.0019, + "num_input_tokens_seen": 20208704, + "step": 10322 + }, + { + "epoch": 1.3681908548707753, + "grad_norm": 14.910869598388672, + "learning_rate": 2.8458509779661127e-06, + "loss": 0.2117, + "num_input_tokens_seen": 20210728, + "step": 10323 + }, + { + "epoch": 1.3683233929754803, + "grad_norm": 8.039057731628418, + "learning_rate": 2.845507175138279e-06, + "loss": 0.2379, + "num_input_tokens_seen": 20213392, + "step": 10324 + }, + { + "epoch": 1.3684559310801856, + "grad_norm": 0.037988003343343735, + "learning_rate": 2.8451633656488292e-06, + "loss": 0.0003, + "num_input_tokens_seen": 20215456, + "step": 10325 + }, + { + "epoch": 1.3685884691848906, + "grad_norm": 7.742657661437988, + "learning_rate": 2.8448195495043924e-06, + "loss": 0.102, + "num_input_tokens_seen": 20217792, + "step": 10326 + }, + { + "epoch": 1.368721007289596, + "grad_norm": 4.368834972381592, + "learning_rate": 2.844475726711595e-06, + "loss": 0.0826, + "num_input_tokens_seen": 20219568, + "step": 10327 + }, + { + "epoch": 1.368853545394301, + "grad_norm": 7.590453624725342, + "learning_rate": 2.844131897277069e-06, + "loss": 0.1541, + "num_input_tokens_seen": 20221040, + "step": 10328 + }, + { + "epoch": 1.368986083499006, + "grad_norm": 3.8208529949188232, + "learning_rate": 2.843788061207444e-06, + "loss": 0.0478, + "num_input_tokens_seen": 20222896, + "step": 10329 + }, + { + "epoch": 1.369118621603711, + "grad_norm": 0.543812096118927, + "learning_rate": 2.8434442185093453e-06, + "loss": 0.0039, + "num_input_tokens_seen": 20224576, + "step": 10330 + }, + { + "epoch": 1.369251159708416, + "grad_norm": 3.001206874847412, + "learning_rate": 2.843100369189407e-06, + "loss": 0.0813, + "num_input_tokens_seen": 20226528, + "step": 10331 + }, + { + "epoch": 1.3693836978131213, + "grad_norm": 0.16421914100646973, + "learning_rate": 2.8427565132542562e-06, + "loss": 0.0009, + "num_input_tokens_seen": 20228864, + "step": 10332 + }, + { + "epoch": 1.3695162359178263, + "grad_norm": 8.166908264160156, + "learning_rate": 2.842412650710523e-06, + "loss": 0.1447, + "num_input_tokens_seen": 20230992, + "step": 10333 + }, + { + "epoch": 1.3696487740225316, + "grad_norm": 5.81083345413208, + "learning_rate": 2.842068781564839e-06, + "loss": 0.2562, + "num_input_tokens_seen": 20233384, + "step": 10334 + }, + { + "epoch": 1.3697813121272366, + "grad_norm": 0.9662743210792542, + "learning_rate": 2.841724905823832e-06, + "loss": 0.0092, + "num_input_tokens_seen": 20235328, + "step": 10335 + }, + { + "epoch": 1.3699138502319417, + "grad_norm": 17.670150756835938, + "learning_rate": 2.841381023494133e-06, + "loss": 0.4182, + "num_input_tokens_seen": 20236992, + "step": 10336 + }, + { + "epoch": 1.3700463883366467, + "grad_norm": 0.040695711970329285, + "learning_rate": 2.8410371345823722e-06, + "loss": 0.0003, + "num_input_tokens_seen": 20238904, + "step": 10337 + }, + { + "epoch": 1.3701789264413518, + "grad_norm": 0.2721966803073883, + "learning_rate": 2.8406932390951814e-06, + "loss": 0.0014, + "num_input_tokens_seen": 20240208, + "step": 10338 + }, + { + "epoch": 1.370311464546057, + "grad_norm": 18.994857788085938, + "learning_rate": 2.8403493370391887e-06, + "loss": 0.3366, + "num_input_tokens_seen": 20242352, + "step": 10339 + }, + { + "epoch": 1.370444002650762, + "grad_norm": 0.026472801342606544, + "learning_rate": 2.840005428421026e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20243912, + "step": 10340 + }, + { + "epoch": 1.3705765407554673, + "grad_norm": 0.19931645691394806, + "learning_rate": 2.839661513247325e-06, + "loss": 0.0011, + "num_input_tokens_seen": 20245736, + "step": 10341 + }, + { + "epoch": 1.3707090788601723, + "grad_norm": 0.023659413680434227, + "learning_rate": 2.839317591524715e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20247456, + "step": 10342 + }, + { + "epoch": 1.3708416169648774, + "grad_norm": 6.474206447601318, + "learning_rate": 2.8389736632598275e-06, + "loss": 0.1612, + "num_input_tokens_seen": 20249440, + "step": 10343 + }, + { + "epoch": 1.3709741550695824, + "grad_norm": 0.013446916826069355, + "learning_rate": 2.8386297284592944e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20250824, + "step": 10344 + }, + { + "epoch": 1.3711066931742877, + "grad_norm": 14.753133773803711, + "learning_rate": 2.8382857871297458e-06, + "loss": 0.2521, + "num_input_tokens_seen": 20252976, + "step": 10345 + }, + { + "epoch": 1.3712392312789927, + "grad_norm": 4.31685733795166, + "learning_rate": 2.837941839277815e-06, + "loss": 0.0875, + "num_input_tokens_seen": 20254864, + "step": 10346 + }, + { + "epoch": 1.3713717693836978, + "grad_norm": 5.400439262390137, + "learning_rate": 2.8375978849101328e-06, + "loss": 0.1157, + "num_input_tokens_seen": 20256376, + "step": 10347 + }, + { + "epoch": 1.371504307488403, + "grad_norm": 7.819105625152588, + "learning_rate": 2.8372539240333287e-06, + "loss": 0.2554, + "num_input_tokens_seen": 20258000, + "step": 10348 + }, + { + "epoch": 1.371636845593108, + "grad_norm": 11.385879516601562, + "learning_rate": 2.836909956654037e-06, + "loss": 0.2496, + "num_input_tokens_seen": 20259896, + "step": 10349 + }, + { + "epoch": 1.371769383697813, + "grad_norm": 0.12775881588459015, + "learning_rate": 2.8365659827788893e-06, + "loss": 0.0006, + "num_input_tokens_seen": 20262352, + "step": 10350 + }, + { + "epoch": 1.3719019218025181, + "grad_norm": 8.935964584350586, + "learning_rate": 2.8362220024145164e-06, + "loss": 0.1398, + "num_input_tokens_seen": 20264624, + "step": 10351 + }, + { + "epoch": 1.3720344599072234, + "grad_norm": 14.91470718383789, + "learning_rate": 2.8358780155675525e-06, + "loss": 0.3691, + "num_input_tokens_seen": 20266120, + "step": 10352 + }, + { + "epoch": 1.3721669980119284, + "grad_norm": 3.048900604248047, + "learning_rate": 2.835534022244628e-06, + "loss": 0.0316, + "num_input_tokens_seen": 20267808, + "step": 10353 + }, + { + "epoch": 1.3722995361166335, + "grad_norm": 0.12697701156139374, + "learning_rate": 2.8351900224523766e-06, + "loss": 0.0006, + "num_input_tokens_seen": 20269184, + "step": 10354 + }, + { + "epoch": 1.3724320742213387, + "grad_norm": 0.09627808630466461, + "learning_rate": 2.8348460161974294e-06, + "loss": 0.0006, + "num_input_tokens_seen": 20271632, + "step": 10355 + }, + { + "epoch": 1.3725646123260438, + "grad_norm": 3.2159152030944824, + "learning_rate": 2.8345020034864213e-06, + "loss": 0.0407, + "num_input_tokens_seen": 20273248, + "step": 10356 + }, + { + "epoch": 1.3726971504307488, + "grad_norm": 1.2430331707000732, + "learning_rate": 2.834157984325983e-06, + "loss": 0.0038, + "num_input_tokens_seen": 20274544, + "step": 10357 + }, + { + "epoch": 1.3728296885354538, + "grad_norm": 8.720396995544434, + "learning_rate": 2.833813958722748e-06, + "loss": 0.1205, + "num_input_tokens_seen": 20276720, + "step": 10358 + }, + { + "epoch": 1.372962226640159, + "grad_norm": 9.535908699035645, + "learning_rate": 2.83346992668335e-06, + "loss": 0.2483, + "num_input_tokens_seen": 20279160, + "step": 10359 + }, + { + "epoch": 1.3730947647448641, + "grad_norm": 11.56834888458252, + "learning_rate": 2.833125888214422e-06, + "loss": 0.1703, + "num_input_tokens_seen": 20281112, + "step": 10360 + }, + { + "epoch": 1.3732273028495692, + "grad_norm": 7.1938982009887695, + "learning_rate": 2.832781843322597e-06, + "loss": 0.0781, + "num_input_tokens_seen": 20283200, + "step": 10361 + }, + { + "epoch": 1.3733598409542744, + "grad_norm": 8.007373809814453, + "learning_rate": 2.8324377920145085e-06, + "loss": 0.25, + "num_input_tokens_seen": 20285104, + "step": 10362 + }, + { + "epoch": 1.3734923790589795, + "grad_norm": 20.793052673339844, + "learning_rate": 2.8320937342967892e-06, + "loss": 0.6114, + "num_input_tokens_seen": 20287552, + "step": 10363 + }, + { + "epoch": 1.3736249171636845, + "grad_norm": 8.44959831237793, + "learning_rate": 2.8317496701760743e-06, + "loss": 0.1284, + "num_input_tokens_seen": 20289304, + "step": 10364 + }, + { + "epoch": 1.3737574552683895, + "grad_norm": 6.841488838195801, + "learning_rate": 2.8314055996589973e-06, + "loss": 0.0395, + "num_input_tokens_seen": 20291360, + "step": 10365 + }, + { + "epoch": 1.3738899933730948, + "grad_norm": 5.396323204040527, + "learning_rate": 2.831061522752191e-06, + "loss": 0.1209, + "num_input_tokens_seen": 20293432, + "step": 10366 + }, + { + "epoch": 1.3740225314777998, + "grad_norm": 5.499234676361084, + "learning_rate": 2.8307174394622904e-06, + "loss": 0.0999, + "num_input_tokens_seen": 20295392, + "step": 10367 + }, + { + "epoch": 1.374155069582505, + "grad_norm": 0.12524831295013428, + "learning_rate": 2.8303733497959296e-06, + "loss": 0.0008, + "num_input_tokens_seen": 20297832, + "step": 10368 + }, + { + "epoch": 1.3742876076872101, + "grad_norm": 1.9644845724105835, + "learning_rate": 2.8300292537597417e-06, + "loss": 0.0161, + "num_input_tokens_seen": 20300736, + "step": 10369 + }, + { + "epoch": 1.3744201457919152, + "grad_norm": 0.7431864738464355, + "learning_rate": 2.8296851513603636e-06, + "loss": 0.0017, + "num_input_tokens_seen": 20302400, + "step": 10370 + }, + { + "epoch": 1.3745526838966202, + "grad_norm": 0.12795034050941467, + "learning_rate": 2.8293410426044276e-06, + "loss": 0.0008, + "num_input_tokens_seen": 20305808, + "step": 10371 + }, + { + "epoch": 1.3746852220013253, + "grad_norm": 10.205643653869629, + "learning_rate": 2.828996927498569e-06, + "loss": 0.1156, + "num_input_tokens_seen": 20307616, + "step": 10372 + }, + { + "epoch": 1.3748177601060305, + "grad_norm": 10.105992317199707, + "learning_rate": 2.8286528060494233e-06, + "loss": 0.2117, + "num_input_tokens_seen": 20309696, + "step": 10373 + }, + { + "epoch": 1.3749502982107356, + "grad_norm": 2.8062946796417236, + "learning_rate": 2.828308678263624e-06, + "loss": 0.0946, + "num_input_tokens_seen": 20313200, + "step": 10374 + }, + { + "epoch": 1.3750828363154408, + "grad_norm": 3.965085029602051, + "learning_rate": 2.8279645441478072e-06, + "loss": 0.0568, + "num_input_tokens_seen": 20315184, + "step": 10375 + }, + { + "epoch": 1.3752153744201459, + "grad_norm": 4.602263450622559, + "learning_rate": 2.8276204037086073e-06, + "loss": 0.065, + "num_input_tokens_seen": 20317344, + "step": 10376 + }, + { + "epoch": 1.375347912524851, + "grad_norm": 8.008227348327637, + "learning_rate": 2.827276256952661e-06, + "loss": 0.2021, + "num_input_tokens_seen": 20318992, + "step": 10377 + }, + { + "epoch": 1.375480450629556, + "grad_norm": 11.01041316986084, + "learning_rate": 2.826932103886602e-06, + "loss": 0.3012, + "num_input_tokens_seen": 20321840, + "step": 10378 + }, + { + "epoch": 1.375612988734261, + "grad_norm": 6.367805004119873, + "learning_rate": 2.8265879445170662e-06, + "loss": 0.1274, + "num_input_tokens_seen": 20324056, + "step": 10379 + }, + { + "epoch": 1.3757455268389662, + "grad_norm": 2.3177542686462402, + "learning_rate": 2.8262437788506904e-06, + "loss": 0.006, + "num_input_tokens_seen": 20326784, + "step": 10380 + }, + { + "epoch": 1.3758780649436713, + "grad_norm": 0.3130215108394623, + "learning_rate": 2.8258996068941085e-06, + "loss": 0.002, + "num_input_tokens_seen": 20328448, + "step": 10381 + }, + { + "epoch": 1.3760106030483765, + "grad_norm": 4.384000301361084, + "learning_rate": 2.8255554286539583e-06, + "loss": 0.0388, + "num_input_tokens_seen": 20330384, + "step": 10382 + }, + { + "epoch": 1.3761431411530816, + "grad_norm": 12.097065925598145, + "learning_rate": 2.8252112441368744e-06, + "loss": 0.2576, + "num_input_tokens_seen": 20331960, + "step": 10383 + }, + { + "epoch": 1.3762756792577866, + "grad_norm": 10.108831405639648, + "learning_rate": 2.824867053349493e-06, + "loss": 0.285, + "num_input_tokens_seen": 20334048, + "step": 10384 + }, + { + "epoch": 1.3764082173624916, + "grad_norm": 0.09586158394813538, + "learning_rate": 2.8245228562984518e-06, + "loss": 0.0006, + "num_input_tokens_seen": 20336216, + "step": 10385 + }, + { + "epoch": 1.376540755467197, + "grad_norm": 8.426509857177734, + "learning_rate": 2.8241786529903853e-06, + "loss": 0.2734, + "num_input_tokens_seen": 20338056, + "step": 10386 + }, + { + "epoch": 1.376673293571902, + "grad_norm": 12.111716270446777, + "learning_rate": 2.8238344434319306e-06, + "loss": 0.1966, + "num_input_tokens_seen": 20339960, + "step": 10387 + }, + { + "epoch": 1.376805831676607, + "grad_norm": 9.422719955444336, + "learning_rate": 2.823490227629725e-06, + "loss": 0.056, + "num_input_tokens_seen": 20341808, + "step": 10388 + }, + { + "epoch": 1.3769383697813122, + "grad_norm": 0.0920829102396965, + "learning_rate": 2.823146005590404e-06, + "loss": 0.0006, + "num_input_tokens_seen": 20343848, + "step": 10389 + }, + { + "epoch": 1.3770709078860173, + "grad_norm": 12.219758033752441, + "learning_rate": 2.8228017773206052e-06, + "loss": 0.1352, + "num_input_tokens_seen": 20345472, + "step": 10390 + }, + { + "epoch": 1.3772034459907223, + "grad_norm": 6.981125354766846, + "learning_rate": 2.8224575428269663e-06, + "loss": 0.1797, + "num_input_tokens_seen": 20347760, + "step": 10391 + }, + { + "epoch": 1.3773359840954273, + "grad_norm": 2.835998296737671, + "learning_rate": 2.8221133021161225e-06, + "loss": 0.0303, + "num_input_tokens_seen": 20350008, + "step": 10392 + }, + { + "epoch": 1.3774685222001326, + "grad_norm": 0.34046420454978943, + "learning_rate": 2.821769055194712e-06, + "loss": 0.0026, + "num_input_tokens_seen": 20351576, + "step": 10393 + }, + { + "epoch": 1.3776010603048376, + "grad_norm": 0.14385081827640533, + "learning_rate": 2.8214248020693726e-06, + "loss": 0.0008, + "num_input_tokens_seen": 20352944, + "step": 10394 + }, + { + "epoch": 1.3777335984095427, + "grad_norm": 1.46402108669281, + "learning_rate": 2.8210805427467408e-06, + "loss": 0.0086, + "num_input_tokens_seen": 20355936, + "step": 10395 + }, + { + "epoch": 1.377866136514248, + "grad_norm": 9.110713005065918, + "learning_rate": 2.820736277233455e-06, + "loss": 0.1026, + "num_input_tokens_seen": 20357960, + "step": 10396 + }, + { + "epoch": 1.377998674618953, + "grad_norm": 0.09277038276195526, + "learning_rate": 2.8203920055361526e-06, + "loss": 0.0006, + "num_input_tokens_seen": 20361016, + "step": 10397 + }, + { + "epoch": 1.378131212723658, + "grad_norm": 15.511788368225098, + "learning_rate": 2.8200477276614714e-06, + "loss": 0.1336, + "num_input_tokens_seen": 20362536, + "step": 10398 + }, + { + "epoch": 1.378263750828363, + "grad_norm": 9.558394432067871, + "learning_rate": 2.8197034436160494e-06, + "loss": 0.169, + "num_input_tokens_seen": 20364296, + "step": 10399 + }, + { + "epoch": 1.3783962889330683, + "grad_norm": 0.20659379661083221, + "learning_rate": 2.8193591534065233e-06, + "loss": 0.0015, + "num_input_tokens_seen": 20365600, + "step": 10400 + }, + { + "epoch": 1.3785288270377734, + "grad_norm": 0.09778586775064468, + "learning_rate": 2.819014857039533e-06, + "loss": 0.0006, + "num_input_tokens_seen": 20367480, + "step": 10401 + }, + { + "epoch": 1.3786613651424784, + "grad_norm": 0.20377711951732635, + "learning_rate": 2.8186705545217163e-06, + "loss": 0.0009, + "num_input_tokens_seen": 20369072, + "step": 10402 + }, + { + "epoch": 1.3787939032471836, + "grad_norm": 0.22968515753746033, + "learning_rate": 2.818326245859712e-06, + "loss": 0.0011, + "num_input_tokens_seen": 20370496, + "step": 10403 + }, + { + "epoch": 1.3789264413518887, + "grad_norm": 1.510170578956604, + "learning_rate": 2.8179819310601573e-06, + "loss": 0.0086, + "num_input_tokens_seen": 20372352, + "step": 10404 + }, + { + "epoch": 1.3790589794565937, + "grad_norm": 0.6657647490501404, + "learning_rate": 2.817637610129692e-06, + "loss": 0.0036, + "num_input_tokens_seen": 20374136, + "step": 10405 + }, + { + "epoch": 1.3791915175612988, + "grad_norm": 0.04369795322418213, + "learning_rate": 2.8172932830749548e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20375720, + "step": 10406 + }, + { + "epoch": 1.379324055666004, + "grad_norm": 5.477841854095459, + "learning_rate": 2.816948949902583e-06, + "loss": 0.0801, + "num_input_tokens_seen": 20378448, + "step": 10407 + }, + { + "epoch": 1.379456593770709, + "grad_norm": 3.954936981201172, + "learning_rate": 2.8166046106192178e-06, + "loss": 0.066, + "num_input_tokens_seen": 20380944, + "step": 10408 + }, + { + "epoch": 1.379589131875414, + "grad_norm": 0.27908411622047424, + "learning_rate": 2.816260265231498e-06, + "loss": 0.0008, + "num_input_tokens_seen": 20382280, + "step": 10409 + }, + { + "epoch": 1.3797216699801194, + "grad_norm": 0.03562706336379051, + "learning_rate": 2.8159159137460603e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20384168, + "step": 10410 + }, + { + "epoch": 1.3798542080848244, + "grad_norm": 0.389545202255249, + "learning_rate": 2.8155715561695467e-06, + "loss": 0.0013, + "num_input_tokens_seen": 20386056, + "step": 10411 + }, + { + "epoch": 1.3799867461895294, + "grad_norm": 8.96086597442627, + "learning_rate": 2.815227192508596e-06, + "loss": 0.2516, + "num_input_tokens_seen": 20388384, + "step": 10412 + }, + { + "epoch": 1.3801192842942345, + "grad_norm": 0.7904037237167358, + "learning_rate": 2.814882822769847e-06, + "loss": 0.0034, + "num_input_tokens_seen": 20389656, + "step": 10413 + }, + { + "epoch": 1.3802518223989397, + "grad_norm": 0.011315170675516129, + "learning_rate": 2.8145384469599406e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20390872, + "step": 10414 + }, + { + "epoch": 1.3803843605036448, + "grad_norm": 5.591720104217529, + "learning_rate": 2.814194065085516e-06, + "loss": 0.0385, + "num_input_tokens_seen": 20392072, + "step": 10415 + }, + { + "epoch": 1.38051689860835, + "grad_norm": 12.15715217590332, + "learning_rate": 2.8138496771532128e-06, + "loss": 0.2143, + "num_input_tokens_seen": 20393672, + "step": 10416 + }, + { + "epoch": 1.380649436713055, + "grad_norm": 9.443870544433594, + "learning_rate": 2.8135052831696713e-06, + "loss": 0.2079, + "num_input_tokens_seen": 20396280, + "step": 10417 + }, + { + "epoch": 1.38078197481776, + "grad_norm": 13.527814865112305, + "learning_rate": 2.8131608831415313e-06, + "loss": 0.1683, + "num_input_tokens_seen": 20398216, + "step": 10418 + }, + { + "epoch": 1.3809145129224651, + "grad_norm": 0.0022506157401949167, + "learning_rate": 2.812816477075434e-06, + "loss": 0.0, + "num_input_tokens_seen": 20399544, + "step": 10419 + }, + { + "epoch": 1.3810470510271702, + "grad_norm": 8.63974666595459, + "learning_rate": 2.8124720649780186e-06, + "loss": 0.1409, + "num_input_tokens_seen": 20401168, + "step": 10420 + }, + { + "epoch": 1.3811795891318754, + "grad_norm": 0.01717144250869751, + "learning_rate": 2.812127646855927e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20404264, + "step": 10421 + }, + { + "epoch": 1.3813121272365805, + "grad_norm": 1.1958845853805542, + "learning_rate": 2.8117832227157986e-06, + "loss": 0.0058, + "num_input_tokens_seen": 20407736, + "step": 10422 + }, + { + "epoch": 1.3814446653412857, + "grad_norm": 8.497928619384766, + "learning_rate": 2.8114387925642743e-06, + "loss": 0.1717, + "num_input_tokens_seen": 20409248, + "step": 10423 + }, + { + "epoch": 1.3815772034459908, + "grad_norm": 5.585585594177246, + "learning_rate": 2.8110943564079962e-06, + "loss": 0.0402, + "num_input_tokens_seen": 20410664, + "step": 10424 + }, + { + "epoch": 1.3817097415506958, + "grad_norm": 0.012229116633534431, + "learning_rate": 2.8107499142536026e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20412856, + "step": 10425 + }, + { + "epoch": 1.3818422796554009, + "grad_norm": 0.10501024127006531, + "learning_rate": 2.810405466107738e-06, + "loss": 0.0003, + "num_input_tokens_seen": 20414360, + "step": 10426 + }, + { + "epoch": 1.3819748177601061, + "grad_norm": 5.115072250366211, + "learning_rate": 2.8100610119770416e-06, + "loss": 0.129, + "num_input_tokens_seen": 20416760, + "step": 10427 + }, + { + "epoch": 1.3821073558648111, + "grad_norm": 9.215903282165527, + "learning_rate": 2.8097165518681537e-06, + "loss": 0.218, + "num_input_tokens_seen": 20419256, + "step": 10428 + }, + { + "epoch": 1.3822398939695162, + "grad_norm": 4.782633304595947, + "learning_rate": 2.809372085787718e-06, + "loss": 0.1543, + "num_input_tokens_seen": 20421064, + "step": 10429 + }, + { + "epoch": 1.3823724320742214, + "grad_norm": 9.270026206970215, + "learning_rate": 2.8090276137423755e-06, + "loss": 0.1809, + "num_input_tokens_seen": 20423096, + "step": 10430 + }, + { + "epoch": 1.3825049701789265, + "grad_norm": 5.7914814949035645, + "learning_rate": 2.8086831357387667e-06, + "loss": 0.1416, + "num_input_tokens_seen": 20425040, + "step": 10431 + }, + { + "epoch": 1.3826375082836315, + "grad_norm": 6.540102958679199, + "learning_rate": 2.8083386517835343e-06, + "loss": 0.1587, + "num_input_tokens_seen": 20426864, + "step": 10432 + }, + { + "epoch": 1.3827700463883366, + "grad_norm": 0.007017153315246105, + "learning_rate": 2.80799416188332e-06, + "loss": 0.0, + "num_input_tokens_seen": 20428544, + "step": 10433 + }, + { + "epoch": 1.3829025844930418, + "grad_norm": 7.678238868713379, + "learning_rate": 2.8076496660447657e-06, + "loss": 0.2836, + "num_input_tokens_seen": 20430744, + "step": 10434 + }, + { + "epoch": 1.3830351225977469, + "grad_norm": 0.005133752711117268, + "learning_rate": 2.8073051642745144e-06, + "loss": 0.0, + "num_input_tokens_seen": 20431816, + "step": 10435 + }, + { + "epoch": 1.383167660702452, + "grad_norm": 0.03415863215923309, + "learning_rate": 2.806960656579206e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20433424, + "step": 10436 + }, + { + "epoch": 1.3833001988071572, + "grad_norm": 16.254480361938477, + "learning_rate": 2.8066161429654853e-06, + "loss": 0.3303, + "num_input_tokens_seen": 20435128, + "step": 10437 + }, + { + "epoch": 1.3834327369118622, + "grad_norm": 0.008742360398173332, + "learning_rate": 2.806271623439993e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20436848, + "step": 10438 + }, + { + "epoch": 1.3835652750165672, + "grad_norm": 11.590932846069336, + "learning_rate": 2.805927098009374e-06, + "loss": 0.2539, + "num_input_tokens_seen": 20439192, + "step": 10439 + }, + { + "epoch": 1.3836978131212723, + "grad_norm": 0.02244654670357704, + "learning_rate": 2.805582566680269e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20440776, + "step": 10440 + }, + { + "epoch": 1.3838303512259775, + "grad_norm": 0.022556914016604424, + "learning_rate": 2.8052380294593206e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20442616, + "step": 10441 + }, + { + "epoch": 1.3839628893306826, + "grad_norm": 7.46207857131958, + "learning_rate": 2.804893486353173e-06, + "loss": 0.1687, + "num_input_tokens_seen": 20444816, + "step": 10442 + }, + { + "epoch": 1.3840954274353876, + "grad_norm": 8.115246772766113, + "learning_rate": 2.8045489373684686e-06, + "loss": 0.0756, + "num_input_tokens_seen": 20446824, + "step": 10443 + }, + { + "epoch": 1.3842279655400929, + "grad_norm": 0.02196202613413334, + "learning_rate": 2.8042043825118504e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20448632, + "step": 10444 + }, + { + "epoch": 1.384360503644798, + "grad_norm": 4.157558917999268, + "learning_rate": 2.8038598217899614e-06, + "loss": 0.1407, + "num_input_tokens_seen": 20451736, + "step": 10445 + }, + { + "epoch": 1.384493041749503, + "grad_norm": 8.752606391906738, + "learning_rate": 2.8035152552094457e-06, + "loss": 0.1446, + "num_input_tokens_seen": 20453752, + "step": 10446 + }, + { + "epoch": 1.384625579854208, + "grad_norm": 0.03144088760018349, + "learning_rate": 2.803170682776947e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20455720, + "step": 10447 + }, + { + "epoch": 1.3847581179589132, + "grad_norm": 4.156338691711426, + "learning_rate": 2.8028261044991086e-06, + "loss": 0.0466, + "num_input_tokens_seen": 20458520, + "step": 10448 + }, + { + "epoch": 1.3848906560636183, + "grad_norm": 5.109591960906982, + "learning_rate": 2.802481520382573e-06, + "loss": 0.1273, + "num_input_tokens_seen": 20460616, + "step": 10449 + }, + { + "epoch": 1.3850231941683233, + "grad_norm": 0.029196010902523994, + "learning_rate": 2.8021369304339864e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20462608, + "step": 10450 + }, + { + "epoch": 1.3851557322730286, + "grad_norm": 4.7668023109436035, + "learning_rate": 2.8017923346599902e-06, + "loss": 0.0866, + "num_input_tokens_seen": 20463952, + "step": 10451 + }, + { + "epoch": 1.3852882703777336, + "grad_norm": 0.7397273182868958, + "learning_rate": 2.8014477330672297e-06, + "loss": 0.0049, + "num_input_tokens_seen": 20466176, + "step": 10452 + }, + { + "epoch": 1.3854208084824386, + "grad_norm": 4.5694756507873535, + "learning_rate": 2.8011031256623492e-06, + "loss": 0.0425, + "num_input_tokens_seen": 20468616, + "step": 10453 + }, + { + "epoch": 1.3855533465871437, + "grad_norm": 0.07951880246400833, + "learning_rate": 2.8007585124519927e-06, + "loss": 0.0006, + "num_input_tokens_seen": 20470224, + "step": 10454 + }, + { + "epoch": 1.385685884691849, + "grad_norm": 10.578248023986816, + "learning_rate": 2.8004138934428045e-06, + "loss": 0.1397, + "num_input_tokens_seen": 20472488, + "step": 10455 + }, + { + "epoch": 1.385818422796554, + "grad_norm": 9.208282470703125, + "learning_rate": 2.800069268641429e-06, + "loss": 0.2332, + "num_input_tokens_seen": 20474680, + "step": 10456 + }, + { + "epoch": 1.3859509609012592, + "grad_norm": 8.394522666931152, + "learning_rate": 2.799724638054512e-06, + "loss": 0.1886, + "num_input_tokens_seen": 20476384, + "step": 10457 + }, + { + "epoch": 1.3860834990059643, + "grad_norm": 4.234194278717041, + "learning_rate": 2.799380001688696e-06, + "loss": 0.0752, + "num_input_tokens_seen": 20477648, + "step": 10458 + }, + { + "epoch": 1.3862160371106693, + "grad_norm": 7.70643424987793, + "learning_rate": 2.799035359550627e-06, + "loss": 0.1917, + "num_input_tokens_seen": 20479520, + "step": 10459 + }, + { + "epoch": 1.3863485752153744, + "grad_norm": 9.970481872558594, + "learning_rate": 2.79869071164695e-06, + "loss": 0.1534, + "num_input_tokens_seen": 20481296, + "step": 10460 + }, + { + "epoch": 1.3864811133200794, + "grad_norm": 2.355271577835083, + "learning_rate": 2.798346057984311e-06, + "loss": 0.029, + "num_input_tokens_seen": 20482648, + "step": 10461 + }, + { + "epoch": 1.3866136514247847, + "grad_norm": 5.343208312988281, + "learning_rate": 2.798001398569354e-06, + "loss": 0.0774, + "num_input_tokens_seen": 20483984, + "step": 10462 + }, + { + "epoch": 1.3867461895294897, + "grad_norm": 0.054289281368255615, + "learning_rate": 2.7976567334087234e-06, + "loss": 0.0004, + "num_input_tokens_seen": 20485336, + "step": 10463 + }, + { + "epoch": 1.386878727634195, + "grad_norm": 0.11812238395214081, + "learning_rate": 2.7973120625090666e-06, + "loss": 0.0009, + "num_input_tokens_seen": 20487312, + "step": 10464 + }, + { + "epoch": 1.3870112657389, + "grad_norm": 7.9044718742370605, + "learning_rate": 2.796967385877028e-06, + "loss": 0.0839, + "num_input_tokens_seen": 20489240, + "step": 10465 + }, + { + "epoch": 1.387143803843605, + "grad_norm": 6.656596660614014, + "learning_rate": 2.7966227035192536e-06, + "loss": 0.0596, + "num_input_tokens_seen": 20491072, + "step": 10466 + }, + { + "epoch": 1.38727634194831, + "grad_norm": 0.5502724647521973, + "learning_rate": 2.796278015442388e-06, + "loss": 0.0059, + "num_input_tokens_seen": 20492880, + "step": 10467 + }, + { + "epoch": 1.387408880053015, + "grad_norm": 0.517119288444519, + "learning_rate": 2.7959333216530788e-06, + "loss": 0.0063, + "num_input_tokens_seen": 20495496, + "step": 10468 + }, + { + "epoch": 1.3875414181577204, + "grad_norm": 11.155953407287598, + "learning_rate": 2.7955886221579704e-06, + "loss": 0.1368, + "num_input_tokens_seen": 20497496, + "step": 10469 + }, + { + "epoch": 1.3876739562624254, + "grad_norm": 0.07422763854265213, + "learning_rate": 2.795243916963709e-06, + "loss": 0.0005, + "num_input_tokens_seen": 20499408, + "step": 10470 + }, + { + "epoch": 1.3878064943671307, + "grad_norm": 3.8631277084350586, + "learning_rate": 2.7948992060769424e-06, + "loss": 0.0833, + "num_input_tokens_seen": 20500672, + "step": 10471 + }, + { + "epoch": 1.3879390324718357, + "grad_norm": 0.2156079262495041, + "learning_rate": 2.7945544895043144e-06, + "loss": 0.0016, + "num_input_tokens_seen": 20502424, + "step": 10472 + }, + { + "epoch": 1.3880715705765407, + "grad_norm": 1.184791088104248, + "learning_rate": 2.7942097672524737e-06, + "loss": 0.0092, + "num_input_tokens_seen": 20504144, + "step": 10473 + }, + { + "epoch": 1.3882041086812458, + "grad_norm": 0.7888578772544861, + "learning_rate": 2.793865039328066e-06, + "loss": 0.0058, + "num_input_tokens_seen": 20505968, + "step": 10474 + }, + { + "epoch": 1.388336646785951, + "grad_norm": 10.811427116394043, + "learning_rate": 2.793520305737736e-06, + "loss": 0.3473, + "num_input_tokens_seen": 20507728, + "step": 10475 + }, + { + "epoch": 1.388469184890656, + "grad_norm": 9.360350608825684, + "learning_rate": 2.793175566488133e-06, + "loss": 0.3323, + "num_input_tokens_seen": 20509704, + "step": 10476 + }, + { + "epoch": 1.388601722995361, + "grad_norm": 6.565247535705566, + "learning_rate": 2.792830821585903e-06, + "loss": 0.0904, + "num_input_tokens_seen": 20511784, + "step": 10477 + }, + { + "epoch": 1.3887342611000664, + "grad_norm": 5.025900840759277, + "learning_rate": 2.792486071037692e-06, + "loss": 0.1034, + "num_input_tokens_seen": 20513776, + "step": 10478 + }, + { + "epoch": 1.3888667992047714, + "grad_norm": 0.09110893309116364, + "learning_rate": 2.7921413148501484e-06, + "loss": 0.0005, + "num_input_tokens_seen": 20515240, + "step": 10479 + }, + { + "epoch": 1.3889993373094764, + "grad_norm": 5.7710185050964355, + "learning_rate": 2.791796553029919e-06, + "loss": 0.0141, + "num_input_tokens_seen": 20516824, + "step": 10480 + }, + { + "epoch": 1.3891318754141815, + "grad_norm": 6.486245155334473, + "learning_rate": 2.79145178558365e-06, + "loss": 0.1815, + "num_input_tokens_seen": 20518760, + "step": 10481 + }, + { + "epoch": 1.3892644135188867, + "grad_norm": 0.3129856586456299, + "learning_rate": 2.7911070125179897e-06, + "loss": 0.0018, + "num_input_tokens_seen": 20520832, + "step": 10482 + }, + { + "epoch": 1.3893969516235918, + "grad_norm": 3.4867331981658936, + "learning_rate": 2.7907622338395864e-06, + "loss": 0.048, + "num_input_tokens_seen": 20522248, + "step": 10483 + }, + { + "epoch": 1.3895294897282968, + "grad_norm": 1.8596006631851196, + "learning_rate": 2.790417449555085e-06, + "loss": 0.0099, + "num_input_tokens_seen": 20524208, + "step": 10484 + }, + { + "epoch": 1.389662027833002, + "grad_norm": 10.76799488067627, + "learning_rate": 2.7900726596711358e-06, + "loss": 0.2336, + "num_input_tokens_seen": 20526584, + "step": 10485 + }, + { + "epoch": 1.3897945659377071, + "grad_norm": 0.5786974430084229, + "learning_rate": 2.789727864194386e-06, + "loss": 0.003, + "num_input_tokens_seen": 20529440, + "step": 10486 + }, + { + "epoch": 1.3899271040424122, + "grad_norm": 10.499486923217773, + "learning_rate": 2.7893830631314824e-06, + "loss": 0.1422, + "num_input_tokens_seen": 20531344, + "step": 10487 + }, + { + "epoch": 1.3900596421471172, + "grad_norm": 0.1186819076538086, + "learning_rate": 2.789038256489073e-06, + "loss": 0.0008, + "num_input_tokens_seen": 20532800, + "step": 10488 + }, + { + "epoch": 1.3901921802518225, + "grad_norm": 9.143051147460938, + "learning_rate": 2.7886934442738085e-06, + "loss": 0.2226, + "num_input_tokens_seen": 20534520, + "step": 10489 + }, + { + "epoch": 1.3903247183565275, + "grad_norm": 8.500914573669434, + "learning_rate": 2.7883486264923333e-06, + "loss": 0.1719, + "num_input_tokens_seen": 20536328, + "step": 10490 + }, + { + "epoch": 1.3904572564612325, + "grad_norm": 6.91124963760376, + "learning_rate": 2.788003803151299e-06, + "loss": 0.1649, + "num_input_tokens_seen": 20538480, + "step": 10491 + }, + { + "epoch": 1.3905897945659378, + "grad_norm": 0.7869444489479065, + "learning_rate": 2.7876589742573524e-06, + "loss": 0.0064, + "num_input_tokens_seen": 20540096, + "step": 10492 + }, + { + "epoch": 1.3907223326706428, + "grad_norm": 0.2748253047466278, + "learning_rate": 2.7873141398171417e-06, + "loss": 0.0023, + "num_input_tokens_seen": 20542960, + "step": 10493 + }, + { + "epoch": 1.3908548707753479, + "grad_norm": 4.588486194610596, + "learning_rate": 2.786969299837317e-06, + "loss": 0.079, + "num_input_tokens_seen": 20544544, + "step": 10494 + }, + { + "epoch": 1.390987408880053, + "grad_norm": 6.643099784851074, + "learning_rate": 2.786624454324526e-06, + "loss": 0.1162, + "num_input_tokens_seen": 20546856, + "step": 10495 + }, + { + "epoch": 1.3911199469847582, + "grad_norm": 7.182328224182129, + "learning_rate": 2.786279603285417e-06, + "loss": 0.0329, + "num_input_tokens_seen": 20548808, + "step": 10496 + }, + { + "epoch": 1.3912524850894632, + "grad_norm": 4.121879577636719, + "learning_rate": 2.7859347467266407e-06, + "loss": 0.1052, + "num_input_tokens_seen": 20551336, + "step": 10497 + }, + { + "epoch": 1.3913850231941685, + "grad_norm": 4.8759965896606445, + "learning_rate": 2.785589884654845e-06, + "loss": 0.046, + "num_input_tokens_seen": 20553096, + "step": 10498 + }, + { + "epoch": 1.3915175612988735, + "grad_norm": 7.739864349365234, + "learning_rate": 2.785245017076679e-06, + "loss": 0.0229, + "num_input_tokens_seen": 20554416, + "step": 10499 + }, + { + "epoch": 1.3916500994035785, + "grad_norm": 0.04690643027424812, + "learning_rate": 2.7849001439987933e-06, + "loss": 0.0003, + "num_input_tokens_seen": 20556528, + "step": 10500 + }, + { + "epoch": 1.3917826375082836, + "grad_norm": 6.401378154754639, + "learning_rate": 2.7845552654278353e-06, + "loss": 0.1222, + "num_input_tokens_seen": 20558768, + "step": 10501 + }, + { + "epoch": 1.3919151756129886, + "grad_norm": 0.7200358510017395, + "learning_rate": 2.784210381370455e-06, + "loss": 0.0036, + "num_input_tokens_seen": 20560480, + "step": 10502 + }, + { + "epoch": 1.3920477137176939, + "grad_norm": 2.336420774459839, + "learning_rate": 2.7838654918333034e-06, + "loss": 0.0394, + "num_input_tokens_seen": 20561880, + "step": 10503 + }, + { + "epoch": 1.392180251822399, + "grad_norm": 1.5883830785751343, + "learning_rate": 2.7835205968230296e-06, + "loss": 0.0096, + "num_input_tokens_seen": 20563448, + "step": 10504 + }, + { + "epoch": 1.3923127899271042, + "grad_norm": 30.041460037231445, + "learning_rate": 2.7831756963462825e-06, + "loss": 0.3351, + "num_input_tokens_seen": 20564832, + "step": 10505 + }, + { + "epoch": 1.3924453280318092, + "grad_norm": 12.721914291381836, + "learning_rate": 2.7828307904097123e-06, + "loss": 0.2572, + "num_input_tokens_seen": 20566608, + "step": 10506 + }, + { + "epoch": 1.3925778661365142, + "grad_norm": 11.767704963684082, + "learning_rate": 2.78248587901997e-06, + "loss": 0.0743, + "num_input_tokens_seen": 20569624, + "step": 10507 + }, + { + "epoch": 1.3927104042412193, + "grad_norm": 8.095712661743164, + "learning_rate": 2.7821409621837042e-06, + "loss": 0.1937, + "num_input_tokens_seen": 20572648, + "step": 10508 + }, + { + "epoch": 1.3928429423459243, + "grad_norm": 2.4632644653320312, + "learning_rate": 2.7817960399075673e-06, + "loss": 0.0098, + "num_input_tokens_seen": 20574104, + "step": 10509 + }, + { + "epoch": 1.3929754804506296, + "grad_norm": 6.075214862823486, + "learning_rate": 2.781451112198208e-06, + "loss": 0.0718, + "num_input_tokens_seen": 20575552, + "step": 10510 + }, + { + "epoch": 1.3931080185553346, + "grad_norm": 13.98482894897461, + "learning_rate": 2.7811061790622756e-06, + "loss": 0.174, + "num_input_tokens_seen": 20577688, + "step": 10511 + }, + { + "epoch": 1.3932405566600399, + "grad_norm": 9.87360954284668, + "learning_rate": 2.7807612405064238e-06, + "loss": 0.0571, + "num_input_tokens_seen": 20579544, + "step": 10512 + }, + { + "epoch": 1.393373094764745, + "grad_norm": 2.584228038787842, + "learning_rate": 2.780416296537301e-06, + "loss": 0.0258, + "num_input_tokens_seen": 20581544, + "step": 10513 + }, + { + "epoch": 1.39350563286945, + "grad_norm": 6.481777191162109, + "learning_rate": 2.7800713471615586e-06, + "loss": 0.108, + "num_input_tokens_seen": 20583328, + "step": 10514 + }, + { + "epoch": 1.393638170974155, + "grad_norm": 7.916453838348389, + "learning_rate": 2.779726392385848e-06, + "loss": 0.0877, + "num_input_tokens_seen": 20584696, + "step": 10515 + }, + { + "epoch": 1.3937707090788602, + "grad_norm": 7.973959445953369, + "learning_rate": 2.779381432216819e-06, + "loss": 0.0954, + "num_input_tokens_seen": 20586120, + "step": 10516 + }, + { + "epoch": 1.3939032471835653, + "grad_norm": 0.8193455338478088, + "learning_rate": 2.779036466661123e-06, + "loss": 0.0076, + "num_input_tokens_seen": 20587992, + "step": 10517 + }, + { + "epoch": 1.3940357852882703, + "grad_norm": 13.569361686706543, + "learning_rate": 2.7786914957254116e-06, + "loss": 0.1825, + "num_input_tokens_seen": 20590184, + "step": 10518 + }, + { + "epoch": 1.3941683233929756, + "grad_norm": 17.50252914428711, + "learning_rate": 2.778346519416336e-06, + "loss": 0.4022, + "num_input_tokens_seen": 20592592, + "step": 10519 + }, + { + "epoch": 1.3943008614976806, + "grad_norm": 2.511194944381714, + "learning_rate": 2.7780015377405477e-06, + "loss": 0.0108, + "num_input_tokens_seen": 20594016, + "step": 10520 + }, + { + "epoch": 1.3944333996023857, + "grad_norm": 6.79447078704834, + "learning_rate": 2.7776565507046978e-06, + "loss": 0.1039, + "num_input_tokens_seen": 20595808, + "step": 10521 + }, + { + "epoch": 1.3945659377070907, + "grad_norm": 0.4829334616661072, + "learning_rate": 2.777311558315438e-06, + "loss": 0.0024, + "num_input_tokens_seen": 20598008, + "step": 10522 + }, + { + "epoch": 1.394698475811796, + "grad_norm": 1.4145292043685913, + "learning_rate": 2.7769665605794206e-06, + "loss": 0.0056, + "num_input_tokens_seen": 20599320, + "step": 10523 + }, + { + "epoch": 1.394831013916501, + "grad_norm": 6.7324395179748535, + "learning_rate": 2.7766215575032955e-06, + "loss": 0.1325, + "num_input_tokens_seen": 20600720, + "step": 10524 + }, + { + "epoch": 1.394963552021206, + "grad_norm": 3.594271659851074, + "learning_rate": 2.776276549093717e-06, + "loss": 0.0296, + "num_input_tokens_seen": 20602968, + "step": 10525 + }, + { + "epoch": 1.3950960901259113, + "grad_norm": 9.122613906860352, + "learning_rate": 2.775931535357336e-06, + "loss": 0.0354, + "num_input_tokens_seen": 20605104, + "step": 10526 + }, + { + "epoch": 1.3952286282306163, + "grad_norm": 3.760298252105713, + "learning_rate": 2.7755865163008046e-06, + "loss": 0.0616, + "num_input_tokens_seen": 20607424, + "step": 10527 + }, + { + "epoch": 1.3953611663353214, + "grad_norm": 13.919990539550781, + "learning_rate": 2.775241491930775e-06, + "loss": 0.1657, + "num_input_tokens_seen": 20608648, + "step": 10528 + }, + { + "epoch": 1.3954937044400264, + "grad_norm": 10.288064002990723, + "learning_rate": 2.774896462253899e-06, + "loss": 0.2808, + "num_input_tokens_seen": 20610736, + "step": 10529 + }, + { + "epoch": 1.3956262425447317, + "grad_norm": 1.5678014755249023, + "learning_rate": 2.7745514272768304e-06, + "loss": 0.0191, + "num_input_tokens_seen": 20612624, + "step": 10530 + }, + { + "epoch": 1.3957587806494367, + "grad_norm": 5.624784469604492, + "learning_rate": 2.7742063870062202e-06, + "loss": 0.1369, + "num_input_tokens_seen": 20615112, + "step": 10531 + }, + { + "epoch": 1.3958913187541417, + "grad_norm": 17.48431968688965, + "learning_rate": 2.773861341448722e-06, + "loss": 0.3428, + "num_input_tokens_seen": 20618176, + "step": 10532 + }, + { + "epoch": 1.396023856858847, + "grad_norm": 8.184895515441895, + "learning_rate": 2.773516290610989e-06, + "loss": 0.2259, + "num_input_tokens_seen": 20620304, + "step": 10533 + }, + { + "epoch": 1.396156394963552, + "grad_norm": 1.2629934549331665, + "learning_rate": 2.773171234499672e-06, + "loss": 0.0091, + "num_input_tokens_seen": 20621928, + "step": 10534 + }, + { + "epoch": 1.396288933068257, + "grad_norm": 10.707488059997559, + "learning_rate": 2.772826173121425e-06, + "loss": 0.2222, + "num_input_tokens_seen": 20623352, + "step": 10535 + }, + { + "epoch": 1.3964214711729621, + "grad_norm": 9.687740325927734, + "learning_rate": 2.772481106482902e-06, + "loss": 0.4304, + "num_input_tokens_seen": 20626040, + "step": 10536 + }, + { + "epoch": 1.3965540092776674, + "grad_norm": 8.113819122314453, + "learning_rate": 2.7721360345907557e-06, + "loss": 0.2208, + "num_input_tokens_seen": 20628600, + "step": 10537 + }, + { + "epoch": 1.3966865473823724, + "grad_norm": 10.707056045532227, + "learning_rate": 2.7717909574516375e-06, + "loss": 0.2607, + "num_input_tokens_seen": 20630376, + "step": 10538 + }, + { + "epoch": 1.3968190854870777, + "grad_norm": 1.6803975105285645, + "learning_rate": 2.7714458750722027e-06, + "loss": 0.0053, + "num_input_tokens_seen": 20631704, + "step": 10539 + }, + { + "epoch": 1.3969516235917827, + "grad_norm": 6.36403751373291, + "learning_rate": 2.7711007874591046e-06, + "loss": 0.0282, + "num_input_tokens_seen": 20633672, + "step": 10540 + }, + { + "epoch": 1.3970841616964877, + "grad_norm": 0.1906656175851822, + "learning_rate": 2.7707556946189962e-06, + "loss": 0.0013, + "num_input_tokens_seen": 20635192, + "step": 10541 + }, + { + "epoch": 1.3972166998011928, + "grad_norm": 0.12651477754116058, + "learning_rate": 2.770410596558531e-06, + "loss": 0.0008, + "num_input_tokens_seen": 20636976, + "step": 10542 + }, + { + "epoch": 1.3973492379058978, + "grad_norm": 1.0268782377243042, + "learning_rate": 2.770065493284363e-06, + "loss": 0.007, + "num_input_tokens_seen": 20638600, + "step": 10543 + }, + { + "epoch": 1.397481776010603, + "grad_norm": 6.950503826141357, + "learning_rate": 2.7697203848031462e-06, + "loss": 0.0767, + "num_input_tokens_seen": 20640504, + "step": 10544 + }, + { + "epoch": 1.3976143141153081, + "grad_norm": 6.949264049530029, + "learning_rate": 2.7693752711215343e-06, + "loss": 0.1284, + "num_input_tokens_seen": 20642680, + "step": 10545 + }, + { + "epoch": 1.3977468522200134, + "grad_norm": 16.17197608947754, + "learning_rate": 2.769030152246181e-06, + "loss": 0.3429, + "num_input_tokens_seen": 20644952, + "step": 10546 + }, + { + "epoch": 1.3978793903247184, + "grad_norm": 15.016852378845215, + "learning_rate": 2.7686850281837407e-06, + "loss": 0.3566, + "num_input_tokens_seen": 20647016, + "step": 10547 + }, + { + "epoch": 1.3980119284294235, + "grad_norm": 0.28729212284088135, + "learning_rate": 2.7683398989408684e-06, + "loss": 0.0019, + "num_input_tokens_seen": 20648752, + "step": 10548 + }, + { + "epoch": 1.3981444665341285, + "grad_norm": 12.260937690734863, + "learning_rate": 2.7679947645242173e-06, + "loss": 0.0382, + "num_input_tokens_seen": 20649800, + "step": 10549 + }, + { + "epoch": 1.3982770046388335, + "grad_norm": 3.0606260299682617, + "learning_rate": 2.7676496249404416e-06, + "loss": 0.0239, + "num_input_tokens_seen": 20652256, + "step": 10550 + }, + { + "epoch": 1.3984095427435388, + "grad_norm": 0.4068226218223572, + "learning_rate": 2.7673044801961977e-06, + "loss": 0.0024, + "num_input_tokens_seen": 20654480, + "step": 10551 + }, + { + "epoch": 1.3985420808482438, + "grad_norm": 8.831425666809082, + "learning_rate": 2.766959330298138e-06, + "loss": 0.2934, + "num_input_tokens_seen": 20656032, + "step": 10552 + }, + { + "epoch": 1.398674618952949, + "grad_norm": 10.669981956481934, + "learning_rate": 2.7666141752529187e-06, + "loss": 0.171, + "num_input_tokens_seen": 20657976, + "step": 10553 + }, + { + "epoch": 1.3988071570576541, + "grad_norm": 0.029108744114637375, + "learning_rate": 2.7662690150671943e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20659352, + "step": 10554 + }, + { + "epoch": 1.3989396951623592, + "grad_norm": 3.287593364715576, + "learning_rate": 2.7659238497476193e-06, + "loss": 0.0324, + "num_input_tokens_seen": 20660736, + "step": 10555 + }, + { + "epoch": 1.3990722332670642, + "grad_norm": 3.797125816345215, + "learning_rate": 2.7655786793008493e-06, + "loss": 0.0046, + "num_input_tokens_seen": 20662104, + "step": 10556 + }, + { + "epoch": 1.3992047713717695, + "grad_norm": 6.064399242401123, + "learning_rate": 2.7652335037335397e-06, + "loss": 0.0347, + "num_input_tokens_seen": 20663112, + "step": 10557 + }, + { + "epoch": 1.3993373094764745, + "grad_norm": 4.344359874725342, + "learning_rate": 2.7648883230523438e-06, + "loss": 0.0197, + "num_input_tokens_seen": 20664680, + "step": 10558 + }, + { + "epoch": 1.3994698475811795, + "grad_norm": 0.10848988592624664, + "learning_rate": 2.764543137263919e-06, + "loss": 0.0005, + "num_input_tokens_seen": 20666048, + "step": 10559 + }, + { + "epoch": 1.3996023856858848, + "grad_norm": 4.652587890625, + "learning_rate": 2.76419794637492e-06, + "loss": 0.08, + "num_input_tokens_seen": 20667968, + "step": 10560 + }, + { + "epoch": 1.3997349237905898, + "grad_norm": 7.251633644104004, + "learning_rate": 2.763852750392002e-06, + "loss": 0.1393, + "num_input_tokens_seen": 20670672, + "step": 10561 + }, + { + "epoch": 1.3998674618952949, + "grad_norm": 8.513386726379395, + "learning_rate": 2.763507549321822e-06, + "loss": 0.2209, + "num_input_tokens_seen": 20673000, + "step": 10562 + }, + { + "epoch": 1.4, + "grad_norm": 7.966372966766357, + "learning_rate": 2.7631623431710337e-06, + "loss": 0.0758, + "num_input_tokens_seen": 20674840, + "step": 10563 + }, + { + "epoch": 1.4001325381047052, + "grad_norm": 1.2695943117141724, + "learning_rate": 2.7628171319462942e-06, + "loss": 0.0035, + "num_input_tokens_seen": 20676640, + "step": 10564 + }, + { + "epoch": 1.4002650762094102, + "grad_norm": 13.614012718200684, + "learning_rate": 2.762471915654259e-06, + "loss": 0.3232, + "num_input_tokens_seen": 20678640, + "step": 10565 + }, + { + "epoch": 1.4003976143141152, + "grad_norm": 2.442643165588379, + "learning_rate": 2.7621266943015846e-06, + "loss": 0.0585, + "num_input_tokens_seen": 20679928, + "step": 10566 + }, + { + "epoch": 1.4005301524188205, + "grad_norm": 0.011214971542358398, + "learning_rate": 2.761781467894926e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20681112, + "step": 10567 + }, + { + "epoch": 1.4006626905235255, + "grad_norm": 10.345452308654785, + "learning_rate": 2.7614362364409405e-06, + "loss": 0.1659, + "num_input_tokens_seen": 20683088, + "step": 10568 + }, + { + "epoch": 1.4007952286282306, + "grad_norm": 9.527271270751953, + "learning_rate": 2.7610909999462842e-06, + "loss": 0.0851, + "num_input_tokens_seen": 20684592, + "step": 10569 + }, + { + "epoch": 1.4009277667329356, + "grad_norm": 5.224287509918213, + "learning_rate": 2.760745758417612e-06, + "loss": 0.1301, + "num_input_tokens_seen": 20686648, + "step": 10570 + }, + { + "epoch": 1.4010603048376409, + "grad_norm": 4.609777927398682, + "learning_rate": 2.7604005118615834e-06, + "loss": 0.1214, + "num_input_tokens_seen": 20689512, + "step": 10571 + }, + { + "epoch": 1.401192842942346, + "grad_norm": 6.513919830322266, + "learning_rate": 2.760055260284853e-06, + "loss": 0.1348, + "num_input_tokens_seen": 20691216, + "step": 10572 + }, + { + "epoch": 1.401325381047051, + "grad_norm": 4.91895866394043, + "learning_rate": 2.7597100036940766e-06, + "loss": 0.0944, + "num_input_tokens_seen": 20692928, + "step": 10573 + }, + { + "epoch": 1.4014579191517562, + "grad_norm": 5.612612724304199, + "learning_rate": 2.7593647420959134e-06, + "loss": 0.142, + "num_input_tokens_seen": 20695984, + "step": 10574 + }, + { + "epoch": 1.4015904572564613, + "grad_norm": 0.018909305334091187, + "learning_rate": 2.759019475497019e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20697936, + "step": 10575 + }, + { + "epoch": 1.4017229953611663, + "grad_norm": 11.03704833984375, + "learning_rate": 2.7586742039040493e-06, + "loss": 0.2918, + "num_input_tokens_seen": 20699544, + "step": 10576 + }, + { + "epoch": 1.4018555334658713, + "grad_norm": 9.435297012329102, + "learning_rate": 2.7583289273236635e-06, + "loss": 0.1865, + "num_input_tokens_seen": 20700912, + "step": 10577 + }, + { + "epoch": 1.4019880715705766, + "grad_norm": 1.4211225509643555, + "learning_rate": 2.7579836457625177e-06, + "loss": 0.0161, + "num_input_tokens_seen": 20703360, + "step": 10578 + }, + { + "epoch": 1.4021206096752816, + "grad_norm": 5.037970542907715, + "learning_rate": 2.7576383592272687e-06, + "loss": 0.1302, + "num_input_tokens_seen": 20705104, + "step": 10579 + }, + { + "epoch": 1.4022531477799867, + "grad_norm": 0.01099399197846651, + "learning_rate": 2.757293067724575e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20706584, + "step": 10580 + }, + { + "epoch": 1.402385685884692, + "grad_norm": 4.613814830780029, + "learning_rate": 2.7569477712610932e-06, + "loss": 0.0216, + "num_input_tokens_seen": 20708496, + "step": 10581 + }, + { + "epoch": 1.402518223989397, + "grad_norm": 0.16234523057937622, + "learning_rate": 2.7566024698434813e-06, + "loss": 0.0007, + "num_input_tokens_seen": 20710048, + "step": 10582 + }, + { + "epoch": 1.402650762094102, + "grad_norm": 1.3125298023223877, + "learning_rate": 2.7562571634783965e-06, + "loss": 0.0073, + "num_input_tokens_seen": 20712176, + "step": 10583 + }, + { + "epoch": 1.402783300198807, + "grad_norm": 4.0816144943237305, + "learning_rate": 2.755911852172497e-06, + "loss": 0.0514, + "num_input_tokens_seen": 20714112, + "step": 10584 + }, + { + "epoch": 1.4029158383035123, + "grad_norm": 4.365691184997559, + "learning_rate": 2.7555665359324403e-06, + "loss": 0.1042, + "num_input_tokens_seen": 20716224, + "step": 10585 + }, + { + "epoch": 1.4030483764082173, + "grad_norm": 0.07552376389503479, + "learning_rate": 2.7552212147648843e-06, + "loss": 0.0004, + "num_input_tokens_seen": 20718408, + "step": 10586 + }, + { + "epoch": 1.4031809145129226, + "grad_norm": 0.15295080840587616, + "learning_rate": 2.7548758886764874e-06, + "loss": 0.0011, + "num_input_tokens_seen": 20721728, + "step": 10587 + }, + { + "epoch": 1.4033134526176276, + "grad_norm": 5.798828601837158, + "learning_rate": 2.7545305576739077e-06, + "loss": 0.0239, + "num_input_tokens_seen": 20723128, + "step": 10588 + }, + { + "epoch": 1.4034459907223327, + "grad_norm": 1.074324369430542, + "learning_rate": 2.7541852217638034e-06, + "loss": 0.015, + "num_input_tokens_seen": 20725136, + "step": 10589 + }, + { + "epoch": 1.4035785288270377, + "grad_norm": 0.09571792185306549, + "learning_rate": 2.7538398809528316e-06, + "loss": 0.0005, + "num_input_tokens_seen": 20726968, + "step": 10590 + }, + { + "epoch": 1.4037110669317427, + "grad_norm": 8.460127830505371, + "learning_rate": 2.7534945352476527e-06, + "loss": 0.1832, + "num_input_tokens_seen": 20729624, + "step": 10591 + }, + { + "epoch": 1.403843605036448, + "grad_norm": 4.628119945526123, + "learning_rate": 2.7531491846549247e-06, + "loss": 0.0475, + "num_input_tokens_seen": 20731768, + "step": 10592 + }, + { + "epoch": 1.403976143141153, + "grad_norm": 7.634605407714844, + "learning_rate": 2.752803829181305e-06, + "loss": 0.1954, + "num_input_tokens_seen": 20733736, + "step": 10593 + }, + { + "epoch": 1.4041086812458583, + "grad_norm": 0.11401008069515228, + "learning_rate": 2.7524584688334525e-06, + "loss": 0.0005, + "num_input_tokens_seen": 20735000, + "step": 10594 + }, + { + "epoch": 1.4042412193505633, + "grad_norm": 10.275894165039062, + "learning_rate": 2.7521131036180284e-06, + "loss": 0.3567, + "num_input_tokens_seen": 20736984, + "step": 10595 + }, + { + "epoch": 1.4043737574552684, + "grad_norm": 7.303134918212891, + "learning_rate": 2.751767733541688e-06, + "loss": 0.2757, + "num_input_tokens_seen": 20739640, + "step": 10596 + }, + { + "epoch": 1.4045062955599734, + "grad_norm": 0.04334006831049919, + "learning_rate": 2.751422358611093e-06, + "loss": 0.0003, + "num_input_tokens_seen": 20741392, + "step": 10597 + }, + { + "epoch": 1.4046388336646787, + "grad_norm": 9.306764602661133, + "learning_rate": 2.751076978832902e-06, + "loss": 0.1802, + "num_input_tokens_seen": 20743048, + "step": 10598 + }, + { + "epoch": 1.4047713717693837, + "grad_norm": 0.04407084360718727, + "learning_rate": 2.750731594213772e-06, + "loss": 0.0003, + "num_input_tokens_seen": 20744768, + "step": 10599 + }, + { + "epoch": 1.4049039098740888, + "grad_norm": 0.009371750988066196, + "learning_rate": 2.750386204760365e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20746088, + "step": 10600 + }, + { + "epoch": 1.405036447978794, + "grad_norm": 0.025039702653884888, + "learning_rate": 2.7500408104793393e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20747648, + "step": 10601 + }, + { + "epoch": 1.405168986083499, + "grad_norm": 45.71411895751953, + "learning_rate": 2.7496954113773533e-06, + "loss": 0.1113, + "num_input_tokens_seen": 20749200, + "step": 10602 + }, + { + "epoch": 1.405301524188204, + "grad_norm": 8.057454109191895, + "learning_rate": 2.749350007461068e-06, + "loss": 0.187, + "num_input_tokens_seen": 20751184, + "step": 10603 + }, + { + "epoch": 1.4054340622929091, + "grad_norm": 7.636471271514893, + "learning_rate": 2.749004598737143e-06, + "loss": 0.1274, + "num_input_tokens_seen": 20753680, + "step": 10604 + }, + { + "epoch": 1.4055666003976144, + "grad_norm": 7.203907012939453, + "learning_rate": 2.748659185212237e-06, + "loss": 0.1101, + "num_input_tokens_seen": 20755776, + "step": 10605 + }, + { + "epoch": 1.4056991385023194, + "grad_norm": 9.805910110473633, + "learning_rate": 2.748313766893011e-06, + "loss": 0.2876, + "num_input_tokens_seen": 20758088, + "step": 10606 + }, + { + "epoch": 1.4058316766070245, + "grad_norm": 0.05939975008368492, + "learning_rate": 2.747968343786124e-06, + "loss": 0.0004, + "num_input_tokens_seen": 20759776, + "step": 10607 + }, + { + "epoch": 1.4059642147117297, + "grad_norm": 0.27053332328796387, + "learning_rate": 2.747622915898235e-06, + "loss": 0.002, + "num_input_tokens_seen": 20762680, + "step": 10608 + }, + { + "epoch": 1.4060967528164348, + "grad_norm": 5.980823993682861, + "learning_rate": 2.747277483236007e-06, + "loss": 0.0673, + "num_input_tokens_seen": 20764136, + "step": 10609 + }, + { + "epoch": 1.4062292909211398, + "grad_norm": 4.109341621398926, + "learning_rate": 2.7469320458060984e-06, + "loss": 0.0294, + "num_input_tokens_seen": 20765760, + "step": 10610 + }, + { + "epoch": 1.4063618290258448, + "grad_norm": 0.18717600405216217, + "learning_rate": 2.746586603615169e-06, + "loss": 0.0012, + "num_input_tokens_seen": 20767256, + "step": 10611 + }, + { + "epoch": 1.40649436713055, + "grad_norm": 0.011210133321583271, + "learning_rate": 2.74624115666988e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20768464, + "step": 10612 + }, + { + "epoch": 1.4066269052352551, + "grad_norm": 4.336481094360352, + "learning_rate": 2.745895704976892e-06, + "loss": 0.0216, + "num_input_tokens_seen": 20769944, + "step": 10613 + }, + { + "epoch": 1.4067594433399602, + "grad_norm": 5.574187755584717, + "learning_rate": 2.7455502485428647e-06, + "loss": 0.0474, + "num_input_tokens_seen": 20771416, + "step": 10614 + }, + { + "epoch": 1.4068919814446654, + "grad_norm": 5.246311187744141, + "learning_rate": 2.745204787374459e-06, + "loss": 0.0999, + "num_input_tokens_seen": 20772992, + "step": 10615 + }, + { + "epoch": 1.4070245195493705, + "grad_norm": 1.4681001901626587, + "learning_rate": 2.744859321478337e-06, + "loss": 0.0098, + "num_input_tokens_seen": 20774808, + "step": 10616 + }, + { + "epoch": 1.4071570576540755, + "grad_norm": 0.920194685459137, + "learning_rate": 2.744513850861156e-06, + "loss": 0.004, + "num_input_tokens_seen": 20775976, + "step": 10617 + }, + { + "epoch": 1.4072895957587805, + "grad_norm": 8.181239128112793, + "learning_rate": 2.7441683755295816e-06, + "loss": 0.0963, + "num_input_tokens_seen": 20777952, + "step": 10618 + }, + { + "epoch": 1.4074221338634858, + "grad_norm": 0.21132461726665497, + "learning_rate": 2.7438228954902724e-06, + "loss": 0.0013, + "num_input_tokens_seen": 20779576, + "step": 10619 + }, + { + "epoch": 1.4075546719681908, + "grad_norm": 0.06734517216682434, + "learning_rate": 2.743477410749888e-06, + "loss": 0.0003, + "num_input_tokens_seen": 20780864, + "step": 10620 + }, + { + "epoch": 1.4076872100728959, + "grad_norm": 9.145798683166504, + "learning_rate": 2.7431319213150926e-06, + "loss": 0.2204, + "num_input_tokens_seen": 20783208, + "step": 10621 + }, + { + "epoch": 1.4078197481776011, + "grad_norm": 0.5492861270904541, + "learning_rate": 2.7427864271925452e-06, + "loss": 0.0057, + "num_input_tokens_seen": 20784536, + "step": 10622 + }, + { + "epoch": 1.4079522862823062, + "grad_norm": 0.02927401289343834, + "learning_rate": 2.742440928388908e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20786176, + "step": 10623 + }, + { + "epoch": 1.4080848243870112, + "grad_norm": 11.943614959716797, + "learning_rate": 2.742095424910843e-06, + "loss": 0.203, + "num_input_tokens_seen": 20788032, + "step": 10624 + }, + { + "epoch": 1.4082173624917163, + "grad_norm": 8.293085098266602, + "learning_rate": 2.7417499167650107e-06, + "loss": 0.2296, + "num_input_tokens_seen": 20789728, + "step": 10625 + }, + { + "epoch": 1.4083499005964215, + "grad_norm": 6.553684234619141, + "learning_rate": 2.7414044039580736e-06, + "loss": 0.1605, + "num_input_tokens_seen": 20792384, + "step": 10626 + }, + { + "epoch": 1.4084824387011265, + "grad_norm": 7.885653495788574, + "learning_rate": 2.7410588864966925e-06, + "loss": 0.1703, + "num_input_tokens_seen": 20794384, + "step": 10627 + }, + { + "epoch": 1.4086149768058318, + "grad_norm": 3.4076781272888184, + "learning_rate": 2.7407133643875303e-06, + "loss": 0.0581, + "num_input_tokens_seen": 20796440, + "step": 10628 + }, + { + "epoch": 1.4087475149105368, + "grad_norm": 0.09203991293907166, + "learning_rate": 2.7403678376372485e-06, + "loss": 0.0007, + "num_input_tokens_seen": 20798840, + "step": 10629 + }, + { + "epoch": 1.4088800530152419, + "grad_norm": 0.08978790789842606, + "learning_rate": 2.7400223062525084e-06, + "loss": 0.0006, + "num_input_tokens_seen": 20801616, + "step": 10630 + }, + { + "epoch": 1.409012591119947, + "grad_norm": 6.554531574249268, + "learning_rate": 2.739676770239973e-06, + "loss": 0.1438, + "num_input_tokens_seen": 20803384, + "step": 10631 + }, + { + "epoch": 1.409145129224652, + "grad_norm": 6.797333717346191, + "learning_rate": 2.739331229606304e-06, + "loss": 0.1107, + "num_input_tokens_seen": 20805088, + "step": 10632 + }, + { + "epoch": 1.4092776673293572, + "grad_norm": 4.143064498901367, + "learning_rate": 2.7389856843581638e-06, + "loss": 0.0162, + "num_input_tokens_seen": 20806880, + "step": 10633 + }, + { + "epoch": 1.4094102054340623, + "grad_norm": 7.512301445007324, + "learning_rate": 2.738640134502215e-06, + "loss": 0.1972, + "num_input_tokens_seen": 20809072, + "step": 10634 + }, + { + "epoch": 1.4095427435387675, + "grad_norm": 0.07993736863136292, + "learning_rate": 2.738294580045119e-06, + "loss": 0.0006, + "num_input_tokens_seen": 20811048, + "step": 10635 + }, + { + "epoch": 1.4096752816434726, + "grad_norm": 0.6963827013969421, + "learning_rate": 2.73794902099354e-06, + "loss": 0.0046, + "num_input_tokens_seen": 20813232, + "step": 10636 + }, + { + "epoch": 1.4098078197481776, + "grad_norm": 8.478193283081055, + "learning_rate": 2.73760345735414e-06, + "loss": 0.128, + "num_input_tokens_seen": 20815248, + "step": 10637 + }, + { + "epoch": 1.4099403578528826, + "grad_norm": 5.135081768035889, + "learning_rate": 2.73725788913358e-06, + "loss": 0.1353, + "num_input_tokens_seen": 20816984, + "step": 10638 + }, + { + "epoch": 1.4100728959575877, + "grad_norm": 1.0555853843688965, + "learning_rate": 2.7369123163385254e-06, + "loss": 0.0062, + "num_input_tokens_seen": 20819040, + "step": 10639 + }, + { + "epoch": 1.410205434062293, + "grad_norm": 9.966594696044922, + "learning_rate": 2.736566738975637e-06, + "loss": 0.2144, + "num_input_tokens_seen": 20821128, + "step": 10640 + }, + { + "epoch": 1.410337972166998, + "grad_norm": 0.64225172996521, + "learning_rate": 2.7362211570515794e-06, + "loss": 0.0121, + "num_input_tokens_seen": 20823216, + "step": 10641 + }, + { + "epoch": 1.4104705102717032, + "grad_norm": 2.339709758758545, + "learning_rate": 2.735875570573015e-06, + "loss": 0.0201, + "num_input_tokens_seen": 20824944, + "step": 10642 + }, + { + "epoch": 1.4106030483764083, + "grad_norm": 8.702493667602539, + "learning_rate": 2.7355299795466068e-06, + "loss": 0.0893, + "num_input_tokens_seen": 20826512, + "step": 10643 + }, + { + "epoch": 1.4107355864811133, + "grad_norm": 9.747122764587402, + "learning_rate": 2.7351843839790175e-06, + "loss": 0.3861, + "num_input_tokens_seen": 20829056, + "step": 10644 + }, + { + "epoch": 1.4108681245858183, + "grad_norm": 4.787661075592041, + "learning_rate": 2.734838783876912e-06, + "loss": 0.024, + "num_input_tokens_seen": 20830936, + "step": 10645 + }, + { + "epoch": 1.4110006626905236, + "grad_norm": 6.043982028961182, + "learning_rate": 2.734493179246952e-06, + "loss": 0.0842, + "num_input_tokens_seen": 20833352, + "step": 10646 + }, + { + "epoch": 1.4111332007952286, + "grad_norm": 2.3153305053710938, + "learning_rate": 2.734147570095802e-06, + "loss": 0.02, + "num_input_tokens_seen": 20835096, + "step": 10647 + }, + { + "epoch": 1.4112657388999337, + "grad_norm": 1.2299238443374634, + "learning_rate": 2.733801956430125e-06, + "loss": 0.0096, + "num_input_tokens_seen": 20837320, + "step": 10648 + }, + { + "epoch": 1.411398277004639, + "grad_norm": 7.338959693908691, + "learning_rate": 2.7334563382565853e-06, + "loss": 0.2465, + "num_input_tokens_seen": 20840976, + "step": 10649 + }, + { + "epoch": 1.411530815109344, + "grad_norm": 0.06455773115158081, + "learning_rate": 2.7331107155818466e-06, + "loss": 0.0004, + "num_input_tokens_seen": 20842544, + "step": 10650 + }, + { + "epoch": 1.411663353214049, + "grad_norm": 7.122287273406982, + "learning_rate": 2.732765088412572e-06, + "loss": 0.0774, + "num_input_tokens_seen": 20844024, + "step": 10651 + }, + { + "epoch": 1.411795891318754, + "grad_norm": 7.4047160148620605, + "learning_rate": 2.7324194567554265e-06, + "loss": 0.0433, + "num_input_tokens_seen": 20845856, + "step": 10652 + }, + { + "epoch": 1.4119284294234593, + "grad_norm": 4.618725299835205, + "learning_rate": 2.7320738206170724e-06, + "loss": 0.0791, + "num_input_tokens_seen": 20848064, + "step": 10653 + }, + { + "epoch": 1.4120609675281643, + "grad_norm": 7.90389347076416, + "learning_rate": 2.7317281800041756e-06, + "loss": 0.174, + "num_input_tokens_seen": 20850640, + "step": 10654 + }, + { + "epoch": 1.4121935056328694, + "grad_norm": 2.023629903793335, + "learning_rate": 2.7313825349234004e-06, + "loss": 0.0281, + "num_input_tokens_seen": 20852344, + "step": 10655 + }, + { + "epoch": 1.4123260437375746, + "grad_norm": 1.1122645139694214, + "learning_rate": 2.731036885381409e-06, + "loss": 0.0071, + "num_input_tokens_seen": 20854744, + "step": 10656 + }, + { + "epoch": 1.4124585818422797, + "grad_norm": 2.542592763900757, + "learning_rate": 2.7306912313848677e-06, + "loss": 0.0335, + "num_input_tokens_seen": 20855888, + "step": 10657 + }, + { + "epoch": 1.4125911199469847, + "grad_norm": 5.379162788391113, + "learning_rate": 2.7303455729404404e-06, + "loss": 0.1568, + "num_input_tokens_seen": 20857608, + "step": 10658 + }, + { + "epoch": 1.4127236580516898, + "grad_norm": 3.359867572784424, + "learning_rate": 2.729999910054791e-06, + "loss": 0.0878, + "num_input_tokens_seen": 20859744, + "step": 10659 + }, + { + "epoch": 1.412856196156395, + "grad_norm": 15.724868774414062, + "learning_rate": 2.7296542427345856e-06, + "loss": 0.1215, + "num_input_tokens_seen": 20862104, + "step": 10660 + }, + { + "epoch": 1.4129887342611, + "grad_norm": 0.07450693100690842, + "learning_rate": 2.7293085709864874e-06, + "loss": 0.0004, + "num_input_tokens_seen": 20863864, + "step": 10661 + }, + { + "epoch": 1.413121272365805, + "grad_norm": 8.932411193847656, + "learning_rate": 2.7289628948171614e-06, + "loss": 0.0893, + "num_input_tokens_seen": 20866272, + "step": 10662 + }, + { + "epoch": 1.4132538104705104, + "grad_norm": 9.074971199035645, + "learning_rate": 2.728617214233274e-06, + "loss": 0.1708, + "num_input_tokens_seen": 20868080, + "step": 10663 + }, + { + "epoch": 1.4133863485752154, + "grad_norm": 1.7560399770736694, + "learning_rate": 2.728271529241488e-06, + "loss": 0.0037, + "num_input_tokens_seen": 20869928, + "step": 10664 + }, + { + "epoch": 1.4135188866799204, + "grad_norm": 4.777466773986816, + "learning_rate": 2.72792583984847e-06, + "loss": 0.1645, + "num_input_tokens_seen": 20871736, + "step": 10665 + }, + { + "epoch": 1.4136514247846255, + "grad_norm": 5.210172653198242, + "learning_rate": 2.7275801460608836e-06, + "loss": 0.0716, + "num_input_tokens_seen": 20873344, + "step": 10666 + }, + { + "epoch": 1.4137839628893307, + "grad_norm": 13.746232032775879, + "learning_rate": 2.7272344478853964e-06, + "loss": 0.2772, + "num_input_tokens_seen": 20875680, + "step": 10667 + }, + { + "epoch": 1.4139165009940358, + "grad_norm": 10.221718788146973, + "learning_rate": 2.7268887453286715e-06, + "loss": 0.2391, + "num_input_tokens_seen": 20877576, + "step": 10668 + }, + { + "epoch": 1.414049039098741, + "grad_norm": 0.03659462928771973, + "learning_rate": 2.7265430383973746e-06, + "loss": 0.0003, + "num_input_tokens_seen": 20879208, + "step": 10669 + }, + { + "epoch": 1.414181577203446, + "grad_norm": 0.028182020410895348, + "learning_rate": 2.726197327098172e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20880944, + "step": 10670 + }, + { + "epoch": 1.414314115308151, + "grad_norm": 0.773452877998352, + "learning_rate": 2.725851611437729e-06, + "loss": 0.0089, + "num_input_tokens_seen": 20882000, + "step": 10671 + }, + { + "epoch": 1.4144466534128561, + "grad_norm": 9.345169067382812, + "learning_rate": 2.7255058914227112e-06, + "loss": 0.1693, + "num_input_tokens_seen": 20884056, + "step": 10672 + }, + { + "epoch": 1.4145791915175612, + "grad_norm": 7.091841697692871, + "learning_rate": 2.7251601670597842e-06, + "loss": 0.122, + "num_input_tokens_seen": 20887232, + "step": 10673 + }, + { + "epoch": 1.4147117296222664, + "grad_norm": 7.1334123611450195, + "learning_rate": 2.7248144383556135e-06, + "loss": 0.2165, + "num_input_tokens_seen": 20889360, + "step": 10674 + }, + { + "epoch": 1.4148442677269715, + "grad_norm": 0.22957558929920197, + "learning_rate": 2.724468705316866e-06, + "loss": 0.0012, + "num_input_tokens_seen": 20890920, + "step": 10675 + }, + { + "epoch": 1.4149768058316767, + "grad_norm": 9.437118530273438, + "learning_rate": 2.724122967950207e-06, + "loss": 0.242, + "num_input_tokens_seen": 20892672, + "step": 10676 + }, + { + "epoch": 1.4151093439363818, + "grad_norm": 0.013177376240491867, + "learning_rate": 2.723777226262302e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20894232, + "step": 10677 + }, + { + "epoch": 1.4152418820410868, + "grad_norm": 3.774428367614746, + "learning_rate": 2.7234314802598184e-06, + "loss": 0.0439, + "num_input_tokens_seen": 20895816, + "step": 10678 + }, + { + "epoch": 1.4153744201457918, + "grad_norm": 20.24034881591797, + "learning_rate": 2.7230857299494206e-06, + "loss": 0.6097, + "num_input_tokens_seen": 20898528, + "step": 10679 + }, + { + "epoch": 1.4155069582504969, + "grad_norm": 5.048086166381836, + "learning_rate": 2.722739975337777e-06, + "loss": 0.122, + "num_input_tokens_seen": 20900808, + "step": 10680 + }, + { + "epoch": 1.4156394963552021, + "grad_norm": 12.051342010498047, + "learning_rate": 2.7223942164315533e-06, + "loss": 0.3031, + "num_input_tokens_seen": 20902848, + "step": 10681 + }, + { + "epoch": 1.4157720344599072, + "grad_norm": 0.1069600060582161, + "learning_rate": 2.7220484532374147e-06, + "loss": 0.0006, + "num_input_tokens_seen": 20904960, + "step": 10682 + }, + { + "epoch": 1.4159045725646124, + "grad_norm": 0.026960812509059906, + "learning_rate": 2.721702685762029e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20906688, + "step": 10683 + }, + { + "epoch": 1.4160371106693175, + "grad_norm": 13.062935829162598, + "learning_rate": 2.7213569140120627e-06, + "loss": 0.1239, + "num_input_tokens_seen": 20909120, + "step": 10684 + }, + { + "epoch": 1.4161696487740225, + "grad_norm": 13.373900413513184, + "learning_rate": 2.7210111379941824e-06, + "loss": 0.2291, + "num_input_tokens_seen": 20910688, + "step": 10685 + }, + { + "epoch": 1.4163021868787276, + "grad_norm": 6.7151947021484375, + "learning_rate": 2.720665357715055e-06, + "loss": 0.2858, + "num_input_tokens_seen": 20912928, + "step": 10686 + }, + { + "epoch": 1.4164347249834328, + "grad_norm": 0.6810829043388367, + "learning_rate": 2.720319573181347e-06, + "loss": 0.0059, + "num_input_tokens_seen": 20914120, + "step": 10687 + }, + { + "epoch": 1.4165672630881379, + "grad_norm": 0.5847558975219727, + "learning_rate": 2.7199737843997254e-06, + "loss": 0.0023, + "num_input_tokens_seen": 20916328, + "step": 10688 + }, + { + "epoch": 1.416699801192843, + "grad_norm": 0.0157274529337883, + "learning_rate": 2.7196279913768587e-06, + "loss": 0.0001, + "num_input_tokens_seen": 20917672, + "step": 10689 + }, + { + "epoch": 1.4168323392975481, + "grad_norm": 15.493023872375488, + "learning_rate": 2.7192821941194113e-06, + "loss": 0.4019, + "num_input_tokens_seen": 20919904, + "step": 10690 + }, + { + "epoch": 1.4169648774022532, + "grad_norm": 0.033382903784513474, + "learning_rate": 2.7189363926340527e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20922384, + "step": 10691 + }, + { + "epoch": 1.4170974155069582, + "grad_norm": 0.03856700658798218, + "learning_rate": 2.7185905869274495e-06, + "loss": 0.0003, + "num_input_tokens_seen": 20923768, + "step": 10692 + }, + { + "epoch": 1.4172299536116633, + "grad_norm": 9.818392753601074, + "learning_rate": 2.718244777006269e-06, + "loss": 0.4454, + "num_input_tokens_seen": 20926576, + "step": 10693 + }, + { + "epoch": 1.4173624917163685, + "grad_norm": 5.062514305114746, + "learning_rate": 2.7178989628771785e-06, + "loss": 0.1346, + "num_input_tokens_seen": 20928280, + "step": 10694 + }, + { + "epoch": 1.4174950298210736, + "grad_norm": 10.080476760864258, + "learning_rate": 2.7175531445468456e-06, + "loss": 0.2572, + "num_input_tokens_seen": 20930136, + "step": 10695 + }, + { + "epoch": 1.4176275679257786, + "grad_norm": 0.0313757061958313, + "learning_rate": 2.7172073220219382e-06, + "loss": 0.0002, + "num_input_tokens_seen": 20931696, + "step": 10696 + }, + { + "epoch": 1.4177601060304839, + "grad_norm": 2.7627387046813965, + "learning_rate": 2.7168614953091225e-06, + "loss": 0.0133, + "num_input_tokens_seen": 20934456, + "step": 10697 + }, + { + "epoch": 1.417892644135189, + "grad_norm": 11.445853233337402, + "learning_rate": 2.7165156644150692e-06, + "loss": 0.2616, + "num_input_tokens_seen": 20937128, + "step": 10698 + }, + { + "epoch": 1.418025182239894, + "grad_norm": 8.642108917236328, + "learning_rate": 2.7161698293464446e-06, + "loss": 0.106, + "num_input_tokens_seen": 20939064, + "step": 10699 + }, + { + "epoch": 1.418157720344599, + "grad_norm": 0.07817390561103821, + "learning_rate": 2.7158239901099155e-06, + "loss": 0.0006, + "num_input_tokens_seen": 20940432, + "step": 10700 + }, + { + "epoch": 1.4182902584493042, + "grad_norm": 1.8011860847473145, + "learning_rate": 2.7154781467121516e-06, + "loss": 0.0422, + "num_input_tokens_seen": 20941888, + "step": 10701 + }, + { + "epoch": 1.4184227965540093, + "grad_norm": 0.05256745591759682, + "learning_rate": 2.7151322991598207e-06, + "loss": 0.0004, + "num_input_tokens_seen": 20944184, + "step": 10702 + }, + { + "epoch": 1.4185553346587143, + "grad_norm": 4.226961612701416, + "learning_rate": 2.71478644745959e-06, + "loss": 0.1165, + "num_input_tokens_seen": 20945600, + "step": 10703 + }, + { + "epoch": 1.4186878727634196, + "grad_norm": 6.054051876068115, + "learning_rate": 2.714440591618129e-06, + "loss": 0.1086, + "num_input_tokens_seen": 20947544, + "step": 10704 + }, + { + "epoch": 1.4188204108681246, + "grad_norm": 3.5379059314727783, + "learning_rate": 2.7140947316421044e-06, + "loss": 0.0605, + "num_input_tokens_seen": 20949728, + "step": 10705 + }, + { + "epoch": 1.4189529489728296, + "grad_norm": 0.04323343187570572, + "learning_rate": 2.7137488675381864e-06, + "loss": 0.0003, + "num_input_tokens_seen": 20951080, + "step": 10706 + }, + { + "epoch": 1.4190854870775347, + "grad_norm": 0.19374366104602814, + "learning_rate": 2.7134029993130427e-06, + "loss": 0.0014, + "num_input_tokens_seen": 20952472, + "step": 10707 + }, + { + "epoch": 1.41921802518224, + "grad_norm": 9.088922500610352, + "learning_rate": 2.713057126973342e-06, + "loss": 0.2748, + "num_input_tokens_seen": 20954216, + "step": 10708 + }, + { + "epoch": 1.419350563286945, + "grad_norm": 19.48166847229004, + "learning_rate": 2.712711250525753e-06, + "loss": 0.3357, + "num_input_tokens_seen": 20956368, + "step": 10709 + }, + { + "epoch": 1.4194831013916502, + "grad_norm": 5.546043395996094, + "learning_rate": 2.7123653699769438e-06, + "loss": 0.1073, + "num_input_tokens_seen": 20958104, + "step": 10710 + }, + { + "epoch": 1.4196156394963553, + "grad_norm": 0.09428960084915161, + "learning_rate": 2.7120194853335848e-06, + "loss": 0.0007, + "num_input_tokens_seen": 20960904, + "step": 10711 + }, + { + "epoch": 1.4197481776010603, + "grad_norm": 17.581602096557617, + "learning_rate": 2.7116735966023435e-06, + "loss": 0.0708, + "num_input_tokens_seen": 20962376, + "step": 10712 + }, + { + "epoch": 1.4198807157057654, + "grad_norm": 7.995454788208008, + "learning_rate": 2.7113277037898883e-06, + "loss": 0.1375, + "num_input_tokens_seen": 20965408, + "step": 10713 + }, + { + "epoch": 1.4200132538104704, + "grad_norm": 3.8892593383789062, + "learning_rate": 2.7109818069028904e-06, + "loss": 0.1005, + "num_input_tokens_seen": 20968760, + "step": 10714 + }, + { + "epoch": 1.4201457919151756, + "grad_norm": 4.330591201782227, + "learning_rate": 2.710635905948017e-06, + "loss": 0.1271, + "num_input_tokens_seen": 20970616, + "step": 10715 + }, + { + "epoch": 1.4202783300198807, + "grad_norm": 6.15462589263916, + "learning_rate": 2.710290000931938e-06, + "loss": 0.1639, + "num_input_tokens_seen": 20972784, + "step": 10716 + }, + { + "epoch": 1.420410868124586, + "grad_norm": 1.6662498712539673, + "learning_rate": 2.7099440918613233e-06, + "loss": 0.0186, + "num_input_tokens_seen": 20974376, + "step": 10717 + }, + { + "epoch": 1.420543406229291, + "grad_norm": 4.802343368530273, + "learning_rate": 2.7095981787428405e-06, + "loss": 0.0682, + "num_input_tokens_seen": 20976344, + "step": 10718 + }, + { + "epoch": 1.420675944333996, + "grad_norm": 14.069927215576172, + "learning_rate": 2.709252261583162e-06, + "loss": 0.3954, + "num_input_tokens_seen": 20978152, + "step": 10719 + }, + { + "epoch": 1.420808482438701, + "grad_norm": 0.21985822916030884, + "learning_rate": 2.7089063403889547e-06, + "loss": 0.0016, + "num_input_tokens_seen": 20979976, + "step": 10720 + }, + { + "epoch": 1.420941020543406, + "grad_norm": 10.50622844696045, + "learning_rate": 2.7085604151668882e-06, + "loss": 0.2087, + "num_input_tokens_seen": 20982496, + "step": 10721 + }, + { + "epoch": 1.4210735586481114, + "grad_norm": 11.014713287353516, + "learning_rate": 2.708214485923634e-06, + "loss": 0.3833, + "num_input_tokens_seen": 20984784, + "step": 10722 + }, + { + "epoch": 1.4212060967528164, + "grad_norm": 0.06483417004346848, + "learning_rate": 2.7078685526658604e-06, + "loss": 0.0005, + "num_input_tokens_seen": 20986064, + "step": 10723 + }, + { + "epoch": 1.4213386348575217, + "grad_norm": 2.469977855682373, + "learning_rate": 2.7075226154002378e-06, + "loss": 0.0225, + "num_input_tokens_seen": 20989232, + "step": 10724 + }, + { + "epoch": 1.4214711729622267, + "grad_norm": 8.448700904846191, + "learning_rate": 2.7071766741334365e-06, + "loss": 0.1814, + "num_input_tokens_seen": 20992456, + "step": 10725 + }, + { + "epoch": 1.4216037110669317, + "grad_norm": 0.23664437234401703, + "learning_rate": 2.706830728872125e-06, + "loss": 0.0016, + "num_input_tokens_seen": 20993848, + "step": 10726 + }, + { + "epoch": 1.4217362491716368, + "grad_norm": 5.103252410888672, + "learning_rate": 2.7064847796229756e-06, + "loss": 0.05, + "num_input_tokens_seen": 20996960, + "step": 10727 + }, + { + "epoch": 1.421868787276342, + "grad_norm": 13.323234558105469, + "learning_rate": 2.706138826392656e-06, + "loss": 0.2098, + "num_input_tokens_seen": 20999008, + "step": 10728 + }, + { + "epoch": 1.422001325381047, + "grad_norm": 7.097051620483398, + "learning_rate": 2.7057928691878387e-06, + "loss": 0.2471, + "num_input_tokens_seen": 21000824, + "step": 10729 + }, + { + "epoch": 1.422133863485752, + "grad_norm": 0.07095486670732498, + "learning_rate": 2.7054469080151923e-06, + "loss": 0.0005, + "num_input_tokens_seen": 21002216, + "step": 10730 + }, + { + "epoch": 1.4222664015904574, + "grad_norm": 7.510002613067627, + "learning_rate": 2.7051009428813873e-06, + "loss": 0.2944, + "num_input_tokens_seen": 21004144, + "step": 10731 + }, + { + "epoch": 1.4223989396951624, + "grad_norm": 0.1955568790435791, + "learning_rate": 2.704754973793096e-06, + "loss": 0.0014, + "num_input_tokens_seen": 21005544, + "step": 10732 + }, + { + "epoch": 1.4225314777998674, + "grad_norm": 2.6343190670013428, + "learning_rate": 2.704409000756987e-06, + "loss": 0.0198, + "num_input_tokens_seen": 21007368, + "step": 10733 + }, + { + "epoch": 1.4226640159045725, + "grad_norm": 4.825110912322998, + "learning_rate": 2.7040630237797306e-06, + "loss": 0.0478, + "num_input_tokens_seen": 21009352, + "step": 10734 + }, + { + "epoch": 1.4227965540092777, + "grad_norm": 0.5988158583641052, + "learning_rate": 2.7037170428679985e-06, + "loss": 0.0043, + "num_input_tokens_seen": 21011072, + "step": 10735 + }, + { + "epoch": 1.4229290921139828, + "grad_norm": 7.015699863433838, + "learning_rate": 2.7033710580284617e-06, + "loss": 0.0269, + "num_input_tokens_seen": 21013352, + "step": 10736 + }, + { + "epoch": 1.4230616302186878, + "grad_norm": 11.575616836547852, + "learning_rate": 2.703025069267791e-06, + "loss": 0.2996, + "num_input_tokens_seen": 21016008, + "step": 10737 + }, + { + "epoch": 1.423194168323393, + "grad_norm": 0.09432853758335114, + "learning_rate": 2.7026790765926566e-06, + "loss": 0.0007, + "num_input_tokens_seen": 21017272, + "step": 10738 + }, + { + "epoch": 1.423326706428098, + "grad_norm": 6.571661472320557, + "learning_rate": 2.7023330800097295e-06, + "loss": 0.2169, + "num_input_tokens_seen": 21019392, + "step": 10739 + }, + { + "epoch": 1.4234592445328031, + "grad_norm": 8.83761215209961, + "learning_rate": 2.7019870795256813e-06, + "loss": 0.051, + "num_input_tokens_seen": 21021800, + "step": 10740 + }, + { + "epoch": 1.4235917826375082, + "grad_norm": 0.5460322499275208, + "learning_rate": 2.7016410751471827e-06, + "loss": 0.005, + "num_input_tokens_seen": 21023520, + "step": 10741 + }, + { + "epoch": 1.4237243207422134, + "grad_norm": 10.693652153015137, + "learning_rate": 2.7012950668809053e-06, + "loss": 0.1056, + "num_input_tokens_seen": 21024552, + "step": 10742 + }, + { + "epoch": 1.4238568588469185, + "grad_norm": 7.407985210418701, + "learning_rate": 2.7009490547335205e-06, + "loss": 0.0789, + "num_input_tokens_seen": 21026504, + "step": 10743 + }, + { + "epoch": 1.4239893969516235, + "grad_norm": 3.5396578311920166, + "learning_rate": 2.700603038711699e-06, + "loss": 0.0498, + "num_input_tokens_seen": 21028192, + "step": 10744 + }, + { + "epoch": 1.4241219350563288, + "grad_norm": 6.02685022354126, + "learning_rate": 2.7002570188221115e-06, + "loss": 0.0896, + "num_input_tokens_seen": 21030496, + "step": 10745 + }, + { + "epoch": 1.4242544731610338, + "grad_norm": 13.647383689880371, + "learning_rate": 2.699910995071432e-06, + "loss": 0.3442, + "num_input_tokens_seen": 21032368, + "step": 10746 + }, + { + "epoch": 1.4243870112657389, + "grad_norm": 5.454916000366211, + "learning_rate": 2.6995649674663298e-06, + "loss": 0.0846, + "num_input_tokens_seen": 21034920, + "step": 10747 + }, + { + "epoch": 1.424519549370444, + "grad_norm": 6.790247917175293, + "learning_rate": 2.6992189360134776e-06, + "loss": 0.1502, + "num_input_tokens_seen": 21037448, + "step": 10748 + }, + { + "epoch": 1.4246520874751492, + "grad_norm": 2.1342883110046387, + "learning_rate": 2.6988729007195465e-06, + "loss": 0.0252, + "num_input_tokens_seen": 21039400, + "step": 10749 + }, + { + "epoch": 1.4247846255798542, + "grad_norm": 8.948573112487793, + "learning_rate": 2.6985268615912087e-06, + "loss": 0.1485, + "num_input_tokens_seen": 21040976, + "step": 10750 + }, + { + "epoch": 1.4249171636845592, + "grad_norm": 0.0212973952293396, + "learning_rate": 2.6981808186351366e-06, + "loss": 0.0001, + "num_input_tokens_seen": 21042184, + "step": 10751 + }, + { + "epoch": 1.4250497017892645, + "grad_norm": 8.74812126159668, + "learning_rate": 2.697834771858001e-06, + "loss": 0.1209, + "num_input_tokens_seen": 21043952, + "step": 10752 + }, + { + "epoch": 1.4251822398939695, + "grad_norm": 8.053406715393066, + "learning_rate": 2.6974887212664754e-06, + "loss": 0.1652, + "num_input_tokens_seen": 21045896, + "step": 10753 + }, + { + "epoch": 1.4253147779986746, + "grad_norm": 0.5992239713668823, + "learning_rate": 2.69714266686723e-06, + "loss": 0.0063, + "num_input_tokens_seen": 21047760, + "step": 10754 + }, + { + "epoch": 1.4254473161033796, + "grad_norm": 16.952857971191406, + "learning_rate": 2.696796608666939e-06, + "loss": 0.2909, + "num_input_tokens_seen": 21050096, + "step": 10755 + }, + { + "epoch": 1.4255798542080849, + "grad_norm": 0.08713732659816742, + "learning_rate": 2.696450546672273e-06, + "loss": 0.0006, + "num_input_tokens_seen": 21052536, + "step": 10756 + }, + { + "epoch": 1.42571239231279, + "grad_norm": 0.031629063189029694, + "learning_rate": 2.6961044808899043e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21054904, + "step": 10757 + }, + { + "epoch": 1.4258449304174952, + "grad_norm": 0.3914913237094879, + "learning_rate": 2.695758411326507e-06, + "loss": 0.0023, + "num_input_tokens_seen": 21056216, + "step": 10758 + }, + { + "epoch": 1.4259774685222002, + "grad_norm": 11.714195251464844, + "learning_rate": 2.695412337988752e-06, + "loss": 0.3401, + "num_input_tokens_seen": 21058448, + "step": 10759 + }, + { + "epoch": 1.4261100066269052, + "grad_norm": 1.5228157043457031, + "learning_rate": 2.695066260883313e-06, + "loss": 0.0202, + "num_input_tokens_seen": 21060784, + "step": 10760 + }, + { + "epoch": 1.4262425447316103, + "grad_norm": 6.81503438949585, + "learning_rate": 2.6947201800168616e-06, + "loss": 0.0941, + "num_input_tokens_seen": 21062848, + "step": 10761 + }, + { + "epoch": 1.4263750828363153, + "grad_norm": 2.946133852005005, + "learning_rate": 2.6943740953960694e-06, + "loss": 0.0448, + "num_input_tokens_seen": 21065120, + "step": 10762 + }, + { + "epoch": 1.4265076209410206, + "grad_norm": 8.323074340820312, + "learning_rate": 2.694028007027612e-06, + "loss": 0.2973, + "num_input_tokens_seen": 21067592, + "step": 10763 + }, + { + "epoch": 1.4266401590457256, + "grad_norm": 0.021862177178263664, + "learning_rate": 2.693681914918161e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21069480, + "step": 10764 + }, + { + "epoch": 1.4267726971504309, + "grad_norm": 6.99967098236084, + "learning_rate": 2.693335819074388e-06, + "loss": 0.1365, + "num_input_tokens_seen": 21071304, + "step": 10765 + }, + { + "epoch": 1.426905235255136, + "grad_norm": 10.257532119750977, + "learning_rate": 2.692989719502968e-06, + "loss": 0.2691, + "num_input_tokens_seen": 21073104, + "step": 10766 + }, + { + "epoch": 1.427037773359841, + "grad_norm": 4.620853900909424, + "learning_rate": 2.6926436162105725e-06, + "loss": 0.0462, + "num_input_tokens_seen": 21074448, + "step": 10767 + }, + { + "epoch": 1.427170311464546, + "grad_norm": 2.277707099914551, + "learning_rate": 2.692297509203875e-06, + "loss": 0.0173, + "num_input_tokens_seen": 21075952, + "step": 10768 + }, + { + "epoch": 1.4273028495692512, + "grad_norm": 7.918296813964844, + "learning_rate": 2.6919513984895494e-06, + "loss": 0.2352, + "num_input_tokens_seen": 21078216, + "step": 10769 + }, + { + "epoch": 1.4274353876739563, + "grad_norm": 5.6482062339782715, + "learning_rate": 2.6916052840742686e-06, + "loss": 0.0275, + "num_input_tokens_seen": 21080512, + "step": 10770 + }, + { + "epoch": 1.4275679257786613, + "grad_norm": 7.644151210784912, + "learning_rate": 2.691259165964705e-06, + "loss": 0.2771, + "num_input_tokens_seen": 21082248, + "step": 10771 + }, + { + "epoch": 1.4277004638833666, + "grad_norm": 3.9690589904785156, + "learning_rate": 2.6909130441675336e-06, + "loss": 0.0608, + "num_input_tokens_seen": 21084360, + "step": 10772 + }, + { + "epoch": 1.4278330019880716, + "grad_norm": 0.04588577151298523, + "learning_rate": 2.690566918689426e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21085720, + "step": 10773 + }, + { + "epoch": 1.4279655400927767, + "grad_norm": 0.09488814324140549, + "learning_rate": 2.690220789537057e-06, + "loss": 0.0007, + "num_input_tokens_seen": 21086808, + "step": 10774 + }, + { + "epoch": 1.4280980781974817, + "grad_norm": 5.080634117126465, + "learning_rate": 2.6898746567171e-06, + "loss": 0.0617, + "num_input_tokens_seen": 21089792, + "step": 10775 + }, + { + "epoch": 1.428230616302187, + "grad_norm": 4.373073577880859, + "learning_rate": 2.6895285202362294e-06, + "loss": 0.0812, + "num_input_tokens_seen": 21091920, + "step": 10776 + }, + { + "epoch": 1.428363154406892, + "grad_norm": 0.016191670671105385, + "learning_rate": 2.6891823801011176e-06, + "loss": 0.0001, + "num_input_tokens_seen": 21093072, + "step": 10777 + }, + { + "epoch": 1.428495692511597, + "grad_norm": 7.616604328155518, + "learning_rate": 2.6888362363184384e-06, + "loss": 0.0806, + "num_input_tokens_seen": 21095512, + "step": 10778 + }, + { + "epoch": 1.4286282306163023, + "grad_norm": 4.610668182373047, + "learning_rate": 2.6884900888948672e-06, + "loss": 0.1204, + "num_input_tokens_seen": 21097480, + "step": 10779 + }, + { + "epoch": 1.4287607687210073, + "grad_norm": 0.036202505230903625, + "learning_rate": 2.6881439378370763e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21098696, + "step": 10780 + }, + { + "epoch": 1.4288933068257124, + "grad_norm": 2.6236343383789062, + "learning_rate": 2.687797783151741e-06, + "loss": 0.0369, + "num_input_tokens_seen": 21101352, + "step": 10781 + }, + { + "epoch": 1.4290258449304174, + "grad_norm": 1.6541774272918701, + "learning_rate": 2.6874516248455358e-06, + "loss": 0.0156, + "num_input_tokens_seen": 21104160, + "step": 10782 + }, + { + "epoch": 1.4291583830351227, + "grad_norm": 0.02200099267065525, + "learning_rate": 2.687105462925132e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21106520, + "step": 10783 + }, + { + "epoch": 1.4292909211398277, + "grad_norm": 0.012917269952595234, + "learning_rate": 2.6867592973972075e-06, + "loss": 0.0001, + "num_input_tokens_seen": 21107504, + "step": 10784 + }, + { + "epoch": 1.4294234592445327, + "grad_norm": 3.4966816902160645, + "learning_rate": 2.6864131282684336e-06, + "loss": 0.0466, + "num_input_tokens_seen": 21109256, + "step": 10785 + }, + { + "epoch": 1.429555997349238, + "grad_norm": 1.1626818180084229, + "learning_rate": 2.6860669555454868e-06, + "loss": 0.011, + "num_input_tokens_seen": 21112192, + "step": 10786 + }, + { + "epoch": 1.429688535453943, + "grad_norm": 9.177265167236328, + "learning_rate": 2.6857207792350403e-06, + "loss": 0.1326, + "num_input_tokens_seen": 21113808, + "step": 10787 + }, + { + "epoch": 1.429821073558648, + "grad_norm": 8.129093170166016, + "learning_rate": 2.685374599343769e-06, + "loss": 0.2534, + "num_input_tokens_seen": 21115792, + "step": 10788 + }, + { + "epoch": 1.429953611663353, + "grad_norm": 7.252820014953613, + "learning_rate": 2.685028415878348e-06, + "loss": 0.1092, + "num_input_tokens_seen": 21117616, + "step": 10789 + }, + { + "epoch": 1.4300861497680584, + "grad_norm": 4.159541130065918, + "learning_rate": 2.6846822288454518e-06, + "loss": 0.0175, + "num_input_tokens_seen": 21119264, + "step": 10790 + }, + { + "epoch": 1.4302186878727634, + "grad_norm": 18.92696189880371, + "learning_rate": 2.6843360382517542e-06, + "loss": 0.2849, + "num_input_tokens_seen": 21121248, + "step": 10791 + }, + { + "epoch": 1.4303512259774684, + "grad_norm": 6.462379455566406, + "learning_rate": 2.68398984410393e-06, + "loss": 0.1016, + "num_input_tokens_seen": 21123776, + "step": 10792 + }, + { + "epoch": 1.4304837640821737, + "grad_norm": 7.214189052581787, + "learning_rate": 2.683643646408656e-06, + "loss": 0.14, + "num_input_tokens_seen": 21126072, + "step": 10793 + }, + { + "epoch": 1.4306163021868787, + "grad_norm": 0.04476047307252884, + "learning_rate": 2.683297445172605e-06, + "loss": 0.0005, + "num_input_tokens_seen": 21127528, + "step": 10794 + }, + { + "epoch": 1.4307488402915838, + "grad_norm": 1.2396240234375, + "learning_rate": 2.6829512404024526e-06, + "loss": 0.0095, + "num_input_tokens_seen": 21129328, + "step": 10795 + }, + { + "epoch": 1.4308813783962888, + "grad_norm": 0.012335745617747307, + "learning_rate": 2.6826050321048745e-06, + "loss": 0.0001, + "num_input_tokens_seen": 21130856, + "step": 10796 + }, + { + "epoch": 1.431013916500994, + "grad_norm": 12.724159240722656, + "learning_rate": 2.6822588202865464e-06, + "loss": 0.2843, + "num_input_tokens_seen": 21132576, + "step": 10797 + }, + { + "epoch": 1.4311464546056991, + "grad_norm": 7.841065883636475, + "learning_rate": 2.6819126049541404e-06, + "loss": 0.1608, + "num_input_tokens_seen": 21134776, + "step": 10798 + }, + { + "epoch": 1.4312789927104044, + "grad_norm": 13.757062911987305, + "learning_rate": 2.6815663861143356e-06, + "loss": 0.3978, + "num_input_tokens_seen": 21136440, + "step": 10799 + }, + { + "epoch": 1.4314115308151094, + "grad_norm": 17.46832275390625, + "learning_rate": 2.681220163773805e-06, + "loss": 0.674, + "num_input_tokens_seen": 21139616, + "step": 10800 + }, + { + "epoch": 1.4315440689198144, + "grad_norm": 0.028478533029556274, + "learning_rate": 2.680873937939224e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21141000, + "step": 10801 + }, + { + "epoch": 1.4316766070245195, + "grad_norm": 7.294116973876953, + "learning_rate": 2.68052770861727e-06, + "loss": 0.1726, + "num_input_tokens_seen": 21143024, + "step": 10802 + }, + { + "epoch": 1.4318091451292245, + "grad_norm": 6.928138256072998, + "learning_rate": 2.680181475814616e-06, + "loss": 0.1766, + "num_input_tokens_seen": 21144880, + "step": 10803 + }, + { + "epoch": 1.4319416832339298, + "grad_norm": 1.035500168800354, + "learning_rate": 2.6798352395379394e-06, + "loss": 0.0061, + "num_input_tokens_seen": 21147032, + "step": 10804 + }, + { + "epoch": 1.4320742213386348, + "grad_norm": 0.03243763744831085, + "learning_rate": 2.679488999793916e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21148904, + "step": 10805 + }, + { + "epoch": 1.43220675944334, + "grad_norm": 8.883461952209473, + "learning_rate": 2.67914275658922e-06, + "loss": 0.1167, + "num_input_tokens_seen": 21150504, + "step": 10806 + }, + { + "epoch": 1.4323392975480451, + "grad_norm": 4.02717924118042, + "learning_rate": 2.678796509930528e-06, + "loss": 0.0706, + "num_input_tokens_seen": 21152000, + "step": 10807 + }, + { + "epoch": 1.4324718356527502, + "grad_norm": 0.05344383046030998, + "learning_rate": 2.6784502598245166e-06, + "loss": 0.0004, + "num_input_tokens_seen": 21153840, + "step": 10808 + }, + { + "epoch": 1.4326043737574552, + "grad_norm": 1.95729398727417, + "learning_rate": 2.678104006277861e-06, + "loss": 0.0165, + "num_input_tokens_seen": 21155664, + "step": 10809 + }, + { + "epoch": 1.4327369118621602, + "grad_norm": 4.9740309715271, + "learning_rate": 2.677757749297236e-06, + "loss": 0.0505, + "num_input_tokens_seen": 21157600, + "step": 10810 + }, + { + "epoch": 1.4328694499668655, + "grad_norm": 3.5419251918792725, + "learning_rate": 2.67741148888932e-06, + "loss": 0.0594, + "num_input_tokens_seen": 21159448, + "step": 10811 + }, + { + "epoch": 1.4330019880715705, + "grad_norm": 0.08372638374567032, + "learning_rate": 2.6770652250607887e-06, + "loss": 0.0006, + "num_input_tokens_seen": 21161296, + "step": 10812 + }, + { + "epoch": 1.4331345261762758, + "grad_norm": 1.6004853248596191, + "learning_rate": 2.6767189578183166e-06, + "loss": 0.0072, + "num_input_tokens_seen": 21162760, + "step": 10813 + }, + { + "epoch": 1.4332670642809808, + "grad_norm": 7.510328769683838, + "learning_rate": 2.676372687168581e-06, + "loss": 0.1518, + "num_input_tokens_seen": 21164640, + "step": 10814 + }, + { + "epoch": 1.4333996023856859, + "grad_norm": 0.872948169708252, + "learning_rate": 2.676026413118259e-06, + "loss": 0.006, + "num_input_tokens_seen": 21167400, + "step": 10815 + }, + { + "epoch": 1.433532140490391, + "grad_norm": 6.372611045837402, + "learning_rate": 2.675680135674027e-06, + "loss": 0.138, + "num_input_tokens_seen": 21169608, + "step": 10816 + }, + { + "epoch": 1.4336646785950962, + "grad_norm": 0.12144671380519867, + "learning_rate": 2.6753338548425596e-06, + "loss": 0.0009, + "num_input_tokens_seen": 21172496, + "step": 10817 + }, + { + "epoch": 1.4337972166998012, + "grad_norm": 0.8862242698669434, + "learning_rate": 2.674987570630535e-06, + "loss": 0.0043, + "num_input_tokens_seen": 21174272, + "step": 10818 + }, + { + "epoch": 1.4339297548045062, + "grad_norm": 8.42066478729248, + "learning_rate": 2.674641283044629e-06, + "loss": 0.0537, + "num_input_tokens_seen": 21175952, + "step": 10819 + }, + { + "epoch": 1.4340622929092115, + "grad_norm": 3.451714038848877, + "learning_rate": 2.674294992091519e-06, + "loss": 0.0249, + "num_input_tokens_seen": 21177584, + "step": 10820 + }, + { + "epoch": 1.4341948310139165, + "grad_norm": 16.580110549926758, + "learning_rate": 2.673948697777881e-06, + "loss": 0.2487, + "num_input_tokens_seen": 21179592, + "step": 10821 + }, + { + "epoch": 1.4343273691186216, + "grad_norm": 12.697294235229492, + "learning_rate": 2.673602400110392e-06, + "loss": 0.3123, + "num_input_tokens_seen": 21181880, + "step": 10822 + }, + { + "epoch": 1.4344599072233266, + "grad_norm": 11.573830604553223, + "learning_rate": 2.6732560990957295e-06, + "loss": 0.2114, + "num_input_tokens_seen": 21184000, + "step": 10823 + }, + { + "epoch": 1.4345924453280319, + "grad_norm": 6.1512250900268555, + "learning_rate": 2.67290979474057e-06, + "loss": 0.085, + "num_input_tokens_seen": 21186744, + "step": 10824 + }, + { + "epoch": 1.434724983432737, + "grad_norm": 6.671546459197998, + "learning_rate": 2.6725634870515892e-06, + "loss": 0.1158, + "num_input_tokens_seen": 21188272, + "step": 10825 + }, + { + "epoch": 1.434857521537442, + "grad_norm": 6.947902679443359, + "learning_rate": 2.6722171760354666e-06, + "loss": 0.1654, + "num_input_tokens_seen": 21190016, + "step": 10826 + }, + { + "epoch": 1.4349900596421472, + "grad_norm": 10.70562744140625, + "learning_rate": 2.6718708616988774e-06, + "loss": 0.1895, + "num_input_tokens_seen": 21192192, + "step": 10827 + }, + { + "epoch": 1.4351225977468522, + "grad_norm": 5.385256290435791, + "learning_rate": 2.6715245440484993e-06, + "loss": 0.068, + "num_input_tokens_seen": 21193728, + "step": 10828 + }, + { + "epoch": 1.4352551358515573, + "grad_norm": 0.04405488446354866, + "learning_rate": 2.67117822309101e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21195648, + "step": 10829 + }, + { + "epoch": 1.4353876739562623, + "grad_norm": 0.03205659240484238, + "learning_rate": 2.6708318988330855e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21197464, + "step": 10830 + }, + { + "epoch": 1.4355202120609676, + "grad_norm": 14.518558502197266, + "learning_rate": 2.670485571281406e-06, + "loss": 0.2543, + "num_input_tokens_seen": 21199568, + "step": 10831 + }, + { + "epoch": 1.4356527501656726, + "grad_norm": 0.09428905695676804, + "learning_rate": 2.6701392404426456e-06, + "loss": 0.0005, + "num_input_tokens_seen": 21200952, + "step": 10832 + }, + { + "epoch": 1.4357852882703777, + "grad_norm": 9.231392860412598, + "learning_rate": 2.6697929063234833e-06, + "loss": 0.3062, + "num_input_tokens_seen": 21202528, + "step": 10833 + }, + { + "epoch": 1.435917826375083, + "grad_norm": 4.179986476898193, + "learning_rate": 2.6694465689305976e-06, + "loss": 0.0403, + "num_input_tokens_seen": 21204472, + "step": 10834 + }, + { + "epoch": 1.436050364479788, + "grad_norm": 12.07902717590332, + "learning_rate": 2.6691002282706647e-06, + "loss": 0.2788, + "num_input_tokens_seen": 21207408, + "step": 10835 + }, + { + "epoch": 1.436182902584493, + "grad_norm": 8.348955154418945, + "learning_rate": 2.6687538843503622e-06, + "loss": 0.2033, + "num_input_tokens_seen": 21209488, + "step": 10836 + }, + { + "epoch": 1.436315440689198, + "grad_norm": 0.08609013259410858, + "learning_rate": 2.668407537176369e-06, + "loss": 0.0006, + "num_input_tokens_seen": 21211672, + "step": 10837 + }, + { + "epoch": 1.4364479787939033, + "grad_norm": 4.086520195007324, + "learning_rate": 2.6680611867553623e-06, + "loss": 0.0973, + "num_input_tokens_seen": 21214256, + "step": 10838 + }, + { + "epoch": 1.4365805168986083, + "grad_norm": 1.847739577293396, + "learning_rate": 2.66771483309402e-06, + "loss": 0.0153, + "num_input_tokens_seen": 21215864, + "step": 10839 + }, + { + "epoch": 1.4367130550033136, + "grad_norm": 13.70706844329834, + "learning_rate": 2.6673684761990194e-06, + "loss": 0.115, + "num_input_tokens_seen": 21217704, + "step": 10840 + }, + { + "epoch": 1.4368455931080186, + "grad_norm": 0.029677601531147957, + "learning_rate": 2.6670221160770404e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21220272, + "step": 10841 + }, + { + "epoch": 1.4369781312127237, + "grad_norm": 4.11194372177124, + "learning_rate": 2.6666757527347582e-06, + "loss": 0.0307, + "num_input_tokens_seen": 21221440, + "step": 10842 + }, + { + "epoch": 1.4371106693174287, + "grad_norm": 0.0691581666469574, + "learning_rate": 2.6663293861788534e-06, + "loss": 0.0005, + "num_input_tokens_seen": 21222496, + "step": 10843 + }, + { + "epoch": 1.4372432074221337, + "grad_norm": 7.387668609619141, + "learning_rate": 2.665983016416004e-06, + "loss": 0.1529, + "num_input_tokens_seen": 21225360, + "step": 10844 + }, + { + "epoch": 1.437375745526839, + "grad_norm": 0.18080316483974457, + "learning_rate": 2.6656366434528856e-06, + "loss": 0.0012, + "num_input_tokens_seen": 21227840, + "step": 10845 + }, + { + "epoch": 1.437508283631544, + "grad_norm": 5.551860809326172, + "learning_rate": 2.66529026729618e-06, + "loss": 0.0473, + "num_input_tokens_seen": 21229480, + "step": 10846 + }, + { + "epoch": 1.4376408217362493, + "grad_norm": 0.3541734516620636, + "learning_rate": 2.664943887952564e-06, + "loss": 0.0035, + "num_input_tokens_seen": 21230520, + "step": 10847 + }, + { + "epoch": 1.4377733598409543, + "grad_norm": 0.04205181077122688, + "learning_rate": 2.6645975054287147e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21232144, + "step": 10848 + }, + { + "epoch": 1.4379058979456594, + "grad_norm": 0.7722583413124084, + "learning_rate": 2.6642511197313133e-06, + "loss": 0.0051, + "num_input_tokens_seen": 21234744, + "step": 10849 + }, + { + "epoch": 1.4380384360503644, + "grad_norm": 0.3419129550457001, + "learning_rate": 2.663904730867036e-06, + "loss": 0.0032, + "num_input_tokens_seen": 21236128, + "step": 10850 + }, + { + "epoch": 1.4381709741550694, + "grad_norm": 7.903298377990723, + "learning_rate": 2.6635583388425632e-06, + "loss": 0.1407, + "num_input_tokens_seen": 21238600, + "step": 10851 + }, + { + "epoch": 1.4383035122597747, + "grad_norm": 1.1632781028747559, + "learning_rate": 2.663211943664573e-06, + "loss": 0.0074, + "num_input_tokens_seen": 21240104, + "step": 10852 + }, + { + "epoch": 1.4384360503644797, + "grad_norm": 0.1822330355644226, + "learning_rate": 2.6628655453397434e-06, + "loss": 0.0013, + "num_input_tokens_seen": 21241712, + "step": 10853 + }, + { + "epoch": 1.438568588469185, + "grad_norm": 6.655280590057373, + "learning_rate": 2.6625191438747534e-06, + "loss": 0.1056, + "num_input_tokens_seen": 21243016, + "step": 10854 + }, + { + "epoch": 1.43870112657389, + "grad_norm": 0.21163149178028107, + "learning_rate": 2.6621727392762827e-06, + "loss": 0.0015, + "num_input_tokens_seen": 21244776, + "step": 10855 + }, + { + "epoch": 1.438833664678595, + "grad_norm": 7.408387184143066, + "learning_rate": 2.66182633155101e-06, + "loss": 0.1293, + "num_input_tokens_seen": 21246560, + "step": 10856 + }, + { + "epoch": 1.4389662027833001, + "grad_norm": 3.002204179763794, + "learning_rate": 2.6614799207056134e-06, + "loss": 0.0817, + "num_input_tokens_seen": 21248992, + "step": 10857 + }, + { + "epoch": 1.4390987408880054, + "grad_norm": 0.11059904843568802, + "learning_rate": 2.6611335067467726e-06, + "loss": 0.0008, + "num_input_tokens_seen": 21250768, + "step": 10858 + }, + { + "epoch": 1.4392312789927104, + "grad_norm": 10.025544166564941, + "learning_rate": 2.660787089681167e-06, + "loss": 0.0919, + "num_input_tokens_seen": 21253208, + "step": 10859 + }, + { + "epoch": 1.4393638170974155, + "grad_norm": 1.1724212169647217, + "learning_rate": 2.660440669515475e-06, + "loss": 0.0095, + "num_input_tokens_seen": 21256096, + "step": 10860 + }, + { + "epoch": 1.4394963552021207, + "grad_norm": 3.212331533432007, + "learning_rate": 2.660094246256376e-06, + "loss": 0.059, + "num_input_tokens_seen": 21257560, + "step": 10861 + }, + { + "epoch": 1.4396288933068258, + "grad_norm": 0.04032079875469208, + "learning_rate": 2.6597478199105507e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21258816, + "step": 10862 + }, + { + "epoch": 1.4397614314115308, + "grad_norm": 7.889703750610352, + "learning_rate": 2.6594013904846753e-06, + "loss": 0.2138, + "num_input_tokens_seen": 21260544, + "step": 10863 + }, + { + "epoch": 1.4398939695162358, + "grad_norm": 1.3625165224075317, + "learning_rate": 2.659054957985433e-06, + "loss": 0.014, + "num_input_tokens_seen": 21262776, + "step": 10864 + }, + { + "epoch": 1.440026507620941, + "grad_norm": 2.79219651222229, + "learning_rate": 2.658708522419501e-06, + "loss": 0.0253, + "num_input_tokens_seen": 21264744, + "step": 10865 + }, + { + "epoch": 1.4401590457256461, + "grad_norm": 2.9032938480377197, + "learning_rate": 2.658362083793558e-06, + "loss": 0.0077, + "num_input_tokens_seen": 21266288, + "step": 10866 + }, + { + "epoch": 1.4402915838303512, + "grad_norm": 16.138391494750977, + "learning_rate": 2.658015642114286e-06, + "loss": 0.484, + "num_input_tokens_seen": 21268632, + "step": 10867 + }, + { + "epoch": 1.4404241219350564, + "grad_norm": 1.767700433731079, + "learning_rate": 2.657669197388363e-06, + "loss": 0.0201, + "num_input_tokens_seen": 21270600, + "step": 10868 + }, + { + "epoch": 1.4405566600397615, + "grad_norm": 0.4708884060382843, + "learning_rate": 2.6573227496224686e-06, + "loss": 0.0098, + "num_input_tokens_seen": 21272960, + "step": 10869 + }, + { + "epoch": 1.4406891981444665, + "grad_norm": 7.196113109588623, + "learning_rate": 2.6569762988232838e-06, + "loss": 0.0698, + "num_input_tokens_seen": 21275088, + "step": 10870 + }, + { + "epoch": 1.4408217362491715, + "grad_norm": 12.136961936950684, + "learning_rate": 2.6566298449974875e-06, + "loss": 0.1915, + "num_input_tokens_seen": 21277680, + "step": 10871 + }, + { + "epoch": 1.4409542743538768, + "grad_norm": 2.3023693561553955, + "learning_rate": 2.6562833881517597e-06, + "loss": 0.0114, + "num_input_tokens_seen": 21279496, + "step": 10872 + }, + { + "epoch": 1.4410868124585818, + "grad_norm": 8.550653457641602, + "learning_rate": 2.6559369282927803e-06, + "loss": 0.1449, + "num_input_tokens_seen": 21281480, + "step": 10873 + }, + { + "epoch": 1.4412193505632869, + "grad_norm": 8.463733673095703, + "learning_rate": 2.655590465427229e-06, + "loss": 0.0361, + "num_input_tokens_seen": 21283024, + "step": 10874 + }, + { + "epoch": 1.4413518886679921, + "grad_norm": 8.72244930267334, + "learning_rate": 2.6552439995617866e-06, + "loss": 0.1854, + "num_input_tokens_seen": 21285192, + "step": 10875 + }, + { + "epoch": 1.4414844267726972, + "grad_norm": 10.276098251342773, + "learning_rate": 2.6548975307031326e-06, + "loss": 0.164, + "num_input_tokens_seen": 21287384, + "step": 10876 + }, + { + "epoch": 1.4416169648774022, + "grad_norm": 0.015046215616166592, + "learning_rate": 2.6545510588579483e-06, + "loss": 0.0001, + "num_input_tokens_seen": 21288768, + "step": 10877 + }, + { + "epoch": 1.4417495029821072, + "grad_norm": 11.002762794494629, + "learning_rate": 2.6542045840329115e-06, + "loss": 0.1451, + "num_input_tokens_seen": 21290872, + "step": 10878 + }, + { + "epoch": 1.4418820410868125, + "grad_norm": 2.009108781814575, + "learning_rate": 2.6538581062347048e-06, + "loss": 0.0195, + "num_input_tokens_seen": 21292808, + "step": 10879 + }, + { + "epoch": 1.4420145791915175, + "grad_norm": 7.2593183517456055, + "learning_rate": 2.653511625470008e-06, + "loss": 0.2822, + "num_input_tokens_seen": 21294736, + "step": 10880 + }, + { + "epoch": 1.4421471172962228, + "grad_norm": 0.1216149851679802, + "learning_rate": 2.6531651417454995e-06, + "loss": 0.0011, + "num_input_tokens_seen": 21297488, + "step": 10881 + }, + { + "epoch": 1.4422796554009278, + "grad_norm": 6.1806793212890625, + "learning_rate": 2.6528186550678633e-06, + "loss": 0.153, + "num_input_tokens_seen": 21301048, + "step": 10882 + }, + { + "epoch": 1.4424121935056329, + "grad_norm": 4.283254623413086, + "learning_rate": 2.652472165443777e-06, + "loss": 0.1848, + "num_input_tokens_seen": 21303304, + "step": 10883 + }, + { + "epoch": 1.442544731610338, + "grad_norm": 6.099390983581543, + "learning_rate": 2.6521256728799223e-06, + "loss": 0.085, + "num_input_tokens_seen": 21305336, + "step": 10884 + }, + { + "epoch": 1.442677269715043, + "grad_norm": 8.291504859924316, + "learning_rate": 2.65177917738298e-06, + "loss": 0.1675, + "num_input_tokens_seen": 21307608, + "step": 10885 + }, + { + "epoch": 1.4428098078197482, + "grad_norm": 3.575012683868408, + "learning_rate": 2.65143267895963e-06, + "loss": 0.1128, + "num_input_tokens_seen": 21309664, + "step": 10886 + }, + { + "epoch": 1.4429423459244533, + "grad_norm": 10.08403491973877, + "learning_rate": 2.6510861776165536e-06, + "loss": 0.1998, + "num_input_tokens_seen": 21311856, + "step": 10887 + }, + { + "epoch": 1.4430748840291585, + "grad_norm": 6.192297458648682, + "learning_rate": 2.6507396733604317e-06, + "loss": 0.1451, + "num_input_tokens_seen": 21314000, + "step": 10888 + }, + { + "epoch": 1.4432074221338635, + "grad_norm": 0.017191238701343536, + "learning_rate": 2.650393166197945e-06, + "loss": 0.0001, + "num_input_tokens_seen": 21315072, + "step": 10889 + }, + { + "epoch": 1.4433399602385686, + "grad_norm": 0.02423868142068386, + "learning_rate": 2.6500466561357733e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21317256, + "step": 10890 + }, + { + "epoch": 1.4434724983432736, + "grad_norm": 2.660085439682007, + "learning_rate": 2.6497001431806e-06, + "loss": 0.0149, + "num_input_tokens_seen": 21318440, + "step": 10891 + }, + { + "epoch": 1.4436050364479787, + "grad_norm": 6.089280128479004, + "learning_rate": 2.649353627339104e-06, + "loss": 0.1332, + "num_input_tokens_seen": 21320512, + "step": 10892 + }, + { + "epoch": 1.443737574552684, + "grad_norm": 7.312432765960693, + "learning_rate": 2.6490071086179668e-06, + "loss": 0.1663, + "num_input_tokens_seen": 21322416, + "step": 10893 + }, + { + "epoch": 1.443870112657389, + "grad_norm": 5.278479099273682, + "learning_rate": 2.6486605870238702e-06, + "loss": 0.0486, + "num_input_tokens_seen": 21325104, + "step": 10894 + }, + { + "epoch": 1.4440026507620942, + "grad_norm": 0.106948122382164, + "learning_rate": 2.6483140625634947e-06, + "loss": 0.0008, + "num_input_tokens_seen": 21327072, + "step": 10895 + }, + { + "epoch": 1.4441351888667993, + "grad_norm": 5.404239654541016, + "learning_rate": 2.647967535243522e-06, + "loss": 0.1117, + "num_input_tokens_seen": 21329296, + "step": 10896 + }, + { + "epoch": 1.4442677269715043, + "grad_norm": 9.389456748962402, + "learning_rate": 2.6476210050706325e-06, + "loss": 0.1636, + "num_input_tokens_seen": 21331344, + "step": 10897 + }, + { + "epoch": 1.4444002650762093, + "grad_norm": 1.4074392318725586, + "learning_rate": 2.6472744720515087e-06, + "loss": 0.0071, + "num_input_tokens_seen": 21333712, + "step": 10898 + }, + { + "epoch": 1.4445328031809146, + "grad_norm": 0.042017802596092224, + "learning_rate": 2.6469279361928314e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21334840, + "step": 10899 + }, + { + "epoch": 1.4446653412856196, + "grad_norm": 0.08807942271232605, + "learning_rate": 2.646581397501282e-06, + "loss": 0.0007, + "num_input_tokens_seen": 21337976, + "step": 10900 + }, + { + "epoch": 1.4447978793903247, + "grad_norm": 5.1953277587890625, + "learning_rate": 2.646234855983542e-06, + "loss": 0.0893, + "num_input_tokens_seen": 21339112, + "step": 10901 + }, + { + "epoch": 1.44493041749503, + "grad_norm": 8.31412124633789, + "learning_rate": 2.6458883116462938e-06, + "loss": 0.23, + "num_input_tokens_seen": 21341432, + "step": 10902 + }, + { + "epoch": 1.445062955599735, + "grad_norm": 0.4157674312591553, + "learning_rate": 2.645541764496218e-06, + "loss": 0.0039, + "num_input_tokens_seen": 21343064, + "step": 10903 + }, + { + "epoch": 1.44519549370444, + "grad_norm": 8.908576011657715, + "learning_rate": 2.645195214539996e-06, + "loss": 0.1281, + "num_input_tokens_seen": 21344728, + "step": 10904 + }, + { + "epoch": 1.445328031809145, + "grad_norm": 0.04328708350658417, + "learning_rate": 2.6448486617843105e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21346664, + "step": 10905 + }, + { + "epoch": 1.4454605699138503, + "grad_norm": 9.455588340759277, + "learning_rate": 2.6445021062358433e-06, + "loss": 0.3237, + "num_input_tokens_seen": 21348392, + "step": 10906 + }, + { + "epoch": 1.4455931080185553, + "grad_norm": 0.19983354210853577, + "learning_rate": 2.6441555479012755e-06, + "loss": 0.0013, + "num_input_tokens_seen": 21350120, + "step": 10907 + }, + { + "epoch": 1.4457256461232604, + "grad_norm": 0.3083600699901581, + "learning_rate": 2.643808986787289e-06, + "loss": 0.0017, + "num_input_tokens_seen": 21352104, + "step": 10908 + }, + { + "epoch": 1.4458581842279656, + "grad_norm": 0.11944610625505447, + "learning_rate": 2.6434624229005663e-06, + "loss": 0.0006, + "num_input_tokens_seen": 21354288, + "step": 10909 + }, + { + "epoch": 1.4459907223326707, + "grad_norm": 13.957178115844727, + "learning_rate": 2.643115856247789e-06, + "loss": 0.32, + "num_input_tokens_seen": 21356264, + "step": 10910 + }, + { + "epoch": 1.4461232604373757, + "grad_norm": 5.9540629386901855, + "learning_rate": 2.642769286835639e-06, + "loss": 0.1435, + "num_input_tokens_seen": 21358072, + "step": 10911 + }, + { + "epoch": 1.4462557985420808, + "grad_norm": 9.122639656066895, + "learning_rate": 2.642422714670799e-06, + "loss": 0.2269, + "num_input_tokens_seen": 21360208, + "step": 10912 + }, + { + "epoch": 1.446388336646786, + "grad_norm": 1.7268279790878296, + "learning_rate": 2.6420761397599504e-06, + "loss": 0.0259, + "num_input_tokens_seen": 21361904, + "step": 10913 + }, + { + "epoch": 1.446520874751491, + "grad_norm": 3.9890286922454834, + "learning_rate": 2.6417295621097767e-06, + "loss": 0.0265, + "num_input_tokens_seen": 21364192, + "step": 10914 + }, + { + "epoch": 1.446653412856196, + "grad_norm": 8.87393856048584, + "learning_rate": 2.6413829817269586e-06, + "loss": 0.2262, + "num_input_tokens_seen": 21365792, + "step": 10915 + }, + { + "epoch": 1.4467859509609013, + "grad_norm": 8.883562088012695, + "learning_rate": 2.6410363986181794e-06, + "loss": 0.2228, + "num_input_tokens_seen": 21368344, + "step": 10916 + }, + { + "epoch": 1.4469184890656064, + "grad_norm": 0.031127553433179855, + "learning_rate": 2.640689812790121e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21370232, + "step": 10917 + }, + { + "epoch": 1.4470510271703114, + "grad_norm": 1.267810583114624, + "learning_rate": 2.640343224249466e-06, + "loss": 0.0103, + "num_input_tokens_seen": 21371648, + "step": 10918 + }, + { + "epoch": 1.4471835652750165, + "grad_norm": 0.07533013820648193, + "learning_rate": 2.6399966330028965e-06, + "loss": 0.0005, + "num_input_tokens_seen": 21372912, + "step": 10919 + }, + { + "epoch": 1.4473161033797217, + "grad_norm": 12.462226867675781, + "learning_rate": 2.6396500390570956e-06, + "loss": 0.31, + "num_input_tokens_seen": 21375280, + "step": 10920 + }, + { + "epoch": 1.4474486414844268, + "grad_norm": 3.2640459537506104, + "learning_rate": 2.639303442418746e-06, + "loss": 0.0949, + "num_input_tokens_seen": 21378144, + "step": 10921 + }, + { + "epoch": 1.4475811795891318, + "grad_norm": 0.05585382133722305, + "learning_rate": 2.6389568430945288e-06, + "loss": 0.0004, + "num_input_tokens_seen": 21379272, + "step": 10922 + }, + { + "epoch": 1.447713717693837, + "grad_norm": 5.33964729309082, + "learning_rate": 2.6386102410911287e-06, + "loss": 0.1004, + "num_input_tokens_seen": 21380984, + "step": 10923 + }, + { + "epoch": 1.447846255798542, + "grad_norm": 10.410721778869629, + "learning_rate": 2.6382636364152274e-06, + "loss": 0.1765, + "num_input_tokens_seen": 21383400, + "step": 10924 + }, + { + "epoch": 1.4479787939032471, + "grad_norm": 4.276920795440674, + "learning_rate": 2.6379170290735072e-06, + "loss": 0.0848, + "num_input_tokens_seen": 21385544, + "step": 10925 + }, + { + "epoch": 1.4481113320079522, + "grad_norm": 6.234306812286377, + "learning_rate": 2.6375704190726525e-06, + "loss": 0.1099, + "num_input_tokens_seen": 21387352, + "step": 10926 + }, + { + "epoch": 1.4482438701126574, + "grad_norm": 20.285568237304688, + "learning_rate": 2.637223806419345e-06, + "loss": 0.3372, + "num_input_tokens_seen": 21388528, + "step": 10927 + }, + { + "epoch": 1.4483764082173625, + "grad_norm": 8.563652992248535, + "learning_rate": 2.6368771911202673e-06, + "loss": 0.2984, + "num_input_tokens_seen": 21390344, + "step": 10928 + }, + { + "epoch": 1.4485089463220677, + "grad_norm": 0.03882817551493645, + "learning_rate": 2.6365305731821035e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21391824, + "step": 10929 + }, + { + "epoch": 1.4486414844267728, + "grad_norm": 4.560957908630371, + "learning_rate": 2.6361839526115363e-06, + "loss": 0.0641, + "num_input_tokens_seen": 21393432, + "step": 10930 + }, + { + "epoch": 1.4487740225314778, + "grad_norm": 0.08066105842590332, + "learning_rate": 2.6358373294152473e-06, + "loss": 0.0005, + "num_input_tokens_seen": 21394872, + "step": 10931 + }, + { + "epoch": 1.4489065606361828, + "grad_norm": 10.6721830368042, + "learning_rate": 2.6354907035999224e-06, + "loss": 0.1093, + "num_input_tokens_seen": 21396696, + "step": 10932 + }, + { + "epoch": 1.4490390987408879, + "grad_norm": 0.06653618812561035, + "learning_rate": 2.6351440751722423e-06, + "loss": 0.0004, + "num_input_tokens_seen": 21397936, + "step": 10933 + }, + { + "epoch": 1.4491716368455931, + "grad_norm": 0.024140650406479836, + "learning_rate": 2.634797444138891e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21399584, + "step": 10934 + }, + { + "epoch": 1.4493041749502982, + "grad_norm": 5.0089569091796875, + "learning_rate": 2.6344508105065532e-06, + "loss": 0.1128, + "num_input_tokens_seen": 21401192, + "step": 10935 + }, + { + "epoch": 1.4494367130550034, + "grad_norm": 6.013140678405762, + "learning_rate": 2.6341041742819095e-06, + "loss": 0.0845, + "num_input_tokens_seen": 21403056, + "step": 10936 + }, + { + "epoch": 1.4495692511597085, + "grad_norm": 0.23236481845378876, + "learning_rate": 2.6337575354716456e-06, + "loss": 0.0014, + "num_input_tokens_seen": 21405504, + "step": 10937 + }, + { + "epoch": 1.4497017892644135, + "grad_norm": 4.379516124725342, + "learning_rate": 2.633410894082444e-06, + "loss": 0.1337, + "num_input_tokens_seen": 21407592, + "step": 10938 + }, + { + "epoch": 1.4498343273691185, + "grad_norm": 8.200029373168945, + "learning_rate": 2.6330642501209884e-06, + "loss": 0.1795, + "num_input_tokens_seen": 21410032, + "step": 10939 + }, + { + "epoch": 1.4499668654738238, + "grad_norm": 3.719460964202881, + "learning_rate": 2.632717603593962e-06, + "loss": 0.0454, + "num_input_tokens_seen": 21411608, + "step": 10940 + }, + { + "epoch": 1.4500994035785288, + "grad_norm": 8.499351501464844, + "learning_rate": 2.632370954508049e-06, + "loss": 0.092, + "num_input_tokens_seen": 21413704, + "step": 10941 + }, + { + "epoch": 1.4502319416832339, + "grad_norm": 0.04778960347175598, + "learning_rate": 2.632024302869933e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21416464, + "step": 10942 + }, + { + "epoch": 1.4503644797879391, + "grad_norm": 11.142475128173828, + "learning_rate": 2.6316776486862965e-06, + "loss": 0.0975, + "num_input_tokens_seen": 21418176, + "step": 10943 + }, + { + "epoch": 1.4504970178926442, + "grad_norm": 1.1982251405715942, + "learning_rate": 2.631330991963825e-06, + "loss": 0.0111, + "num_input_tokens_seen": 21419832, + "step": 10944 + }, + { + "epoch": 1.4506295559973492, + "grad_norm": 1.6619386672973633, + "learning_rate": 2.630984332709201e-06, + "loss": 0.0182, + "num_input_tokens_seen": 21421408, + "step": 10945 + }, + { + "epoch": 1.4507620941020543, + "grad_norm": 0.17957210540771484, + "learning_rate": 2.6306376709291082e-06, + "loss": 0.001, + "num_input_tokens_seen": 21424392, + "step": 10946 + }, + { + "epoch": 1.4508946322067595, + "grad_norm": 8.476288795471191, + "learning_rate": 2.6302910066302317e-06, + "loss": 0.3443, + "num_input_tokens_seen": 21426824, + "step": 10947 + }, + { + "epoch": 1.4510271703114646, + "grad_norm": 5.011803150177002, + "learning_rate": 2.629944339819254e-06, + "loss": 0.1052, + "num_input_tokens_seen": 21428608, + "step": 10948 + }, + { + "epoch": 1.4511597084161696, + "grad_norm": 1.140215516090393, + "learning_rate": 2.62959767050286e-06, + "loss": 0.0087, + "num_input_tokens_seen": 21430088, + "step": 10949 + }, + { + "epoch": 1.4512922465208749, + "grad_norm": 8.578670501708984, + "learning_rate": 2.6292509986877347e-06, + "loss": 0.2103, + "num_input_tokens_seen": 21432376, + "step": 10950 + }, + { + "epoch": 1.45142478462558, + "grad_norm": 6.808307647705078, + "learning_rate": 2.6289043243805596e-06, + "loss": 0.1431, + "num_input_tokens_seen": 21435048, + "step": 10951 + }, + { + "epoch": 1.451557322730285, + "grad_norm": 5.706945896148682, + "learning_rate": 2.6285576475880203e-06, + "loss": 0.1198, + "num_input_tokens_seen": 21437360, + "step": 10952 + }, + { + "epoch": 1.45168986083499, + "grad_norm": 5.379715919494629, + "learning_rate": 2.628210968316802e-06, + "loss": 0.1032, + "num_input_tokens_seen": 21439088, + "step": 10953 + }, + { + "epoch": 1.4518223989396952, + "grad_norm": 3.1793594360351562, + "learning_rate": 2.6278642865735867e-06, + "loss": 0.0103, + "num_input_tokens_seen": 21441184, + "step": 10954 + }, + { + "epoch": 1.4519549370444003, + "grad_norm": 13.018506050109863, + "learning_rate": 2.6275176023650593e-06, + "loss": 0.2009, + "num_input_tokens_seen": 21443160, + "step": 10955 + }, + { + "epoch": 1.4520874751491053, + "grad_norm": 0.05415809899568558, + "learning_rate": 2.6271709156979052e-06, + "loss": 0.0004, + "num_input_tokens_seen": 21445000, + "step": 10956 + }, + { + "epoch": 1.4522200132538106, + "grad_norm": 6.648740768432617, + "learning_rate": 2.6268242265788085e-06, + "loss": 0.1467, + "num_input_tokens_seen": 21446680, + "step": 10957 + }, + { + "epoch": 1.4523525513585156, + "grad_norm": 6.552952289581299, + "learning_rate": 2.6264775350144526e-06, + "loss": 0.1582, + "num_input_tokens_seen": 21449968, + "step": 10958 + }, + { + "epoch": 1.4524850894632206, + "grad_norm": 6.78386116027832, + "learning_rate": 2.6261308410115227e-06, + "loss": 0.1823, + "num_input_tokens_seen": 21453136, + "step": 10959 + }, + { + "epoch": 1.4526176275679257, + "grad_norm": 0.8264293074607849, + "learning_rate": 2.6257841445767036e-06, + "loss": 0.0053, + "num_input_tokens_seen": 21455712, + "step": 10960 + }, + { + "epoch": 1.452750165672631, + "grad_norm": 0.06400512903928757, + "learning_rate": 2.6254374457166793e-06, + "loss": 0.0005, + "num_input_tokens_seen": 21458552, + "step": 10961 + }, + { + "epoch": 1.452882703777336, + "grad_norm": 0.1834973841905594, + "learning_rate": 2.625090744438134e-06, + "loss": 0.0011, + "num_input_tokens_seen": 21460224, + "step": 10962 + }, + { + "epoch": 1.453015241882041, + "grad_norm": 0.7387140393257141, + "learning_rate": 2.624744040747753e-06, + "loss": 0.0128, + "num_input_tokens_seen": 21462352, + "step": 10963 + }, + { + "epoch": 1.4531477799867463, + "grad_norm": 0.041847970336675644, + "learning_rate": 2.624397334652221e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21464464, + "step": 10964 + }, + { + "epoch": 1.4532803180914513, + "grad_norm": 6.78325080871582, + "learning_rate": 2.6240506261582235e-06, + "loss": 0.0922, + "num_input_tokens_seen": 21466784, + "step": 10965 + }, + { + "epoch": 1.4534128561961563, + "grad_norm": 10.543416023254395, + "learning_rate": 2.623703915272443e-06, + "loss": 0.2573, + "num_input_tokens_seen": 21470120, + "step": 10966 + }, + { + "epoch": 1.4535453943008614, + "grad_norm": 0.02953626587986946, + "learning_rate": 2.623357202001566e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21473312, + "step": 10967 + }, + { + "epoch": 1.4536779324055666, + "grad_norm": 8.097883224487305, + "learning_rate": 2.6230104863522775e-06, + "loss": 0.1904, + "num_input_tokens_seen": 21475128, + "step": 10968 + }, + { + "epoch": 1.4538104705102717, + "grad_norm": 0.0263700932264328, + "learning_rate": 2.6226637683312616e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21476744, + "step": 10969 + }, + { + "epoch": 1.453943008614977, + "grad_norm": 7.196662902832031, + "learning_rate": 2.622317047945203e-06, + "loss": 0.2292, + "num_input_tokens_seen": 21479912, + "step": 10970 + }, + { + "epoch": 1.454075546719682, + "grad_norm": 2.936291456222534, + "learning_rate": 2.621970325200789e-06, + "loss": 0.0644, + "num_input_tokens_seen": 21482000, + "step": 10971 + }, + { + "epoch": 1.454208084824387, + "grad_norm": 4.8482842445373535, + "learning_rate": 2.621623600104702e-06, + "loss": 0.1083, + "num_input_tokens_seen": 21483960, + "step": 10972 + }, + { + "epoch": 1.454340622929092, + "grad_norm": 0.035413652658462524, + "learning_rate": 2.621276872663627e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21486232, + "step": 10973 + }, + { + "epoch": 1.454473161033797, + "grad_norm": 1.3260926008224487, + "learning_rate": 2.620930142884252e-06, + "loss": 0.0115, + "num_input_tokens_seen": 21488536, + "step": 10974 + }, + { + "epoch": 1.4546056991385024, + "grad_norm": 0.007139185909181833, + "learning_rate": 2.620583410773259e-06, + "loss": 0.0, + "num_input_tokens_seen": 21489960, + "step": 10975 + }, + { + "epoch": 1.4547382372432074, + "grad_norm": 0.006348004564642906, + "learning_rate": 2.620236676337336e-06, + "loss": 0.0, + "num_input_tokens_seen": 21491312, + "step": 10976 + }, + { + "epoch": 1.4548707753479126, + "grad_norm": 5.154878616333008, + "learning_rate": 2.6198899395831662e-06, + "loss": 0.1182, + "num_input_tokens_seen": 21493376, + "step": 10977 + }, + { + "epoch": 1.4550033134526177, + "grad_norm": 10.80441951751709, + "learning_rate": 2.6195432005174356e-06, + "loss": 0.3572, + "num_input_tokens_seen": 21495960, + "step": 10978 + }, + { + "epoch": 1.4551358515573227, + "grad_norm": 0.10277149081230164, + "learning_rate": 2.61919645914683e-06, + "loss": 0.0004, + "num_input_tokens_seen": 21497752, + "step": 10979 + }, + { + "epoch": 1.4552683896620278, + "grad_norm": 13.517969131469727, + "learning_rate": 2.618849715478034e-06, + "loss": 0.2683, + "num_input_tokens_seen": 21499216, + "step": 10980 + }, + { + "epoch": 1.4554009277667328, + "grad_norm": 3.810706853866577, + "learning_rate": 2.6185029695177337e-06, + "loss": 0.1581, + "num_input_tokens_seen": 21501224, + "step": 10981 + }, + { + "epoch": 1.455533465871438, + "grad_norm": 0.02137044630944729, + "learning_rate": 2.6181562212726143e-06, + "loss": 0.0001, + "num_input_tokens_seen": 21502640, + "step": 10982 + }, + { + "epoch": 1.455666003976143, + "grad_norm": 7.698780059814453, + "learning_rate": 2.6178094707493616e-06, + "loss": 0.1243, + "num_input_tokens_seen": 21503704, + "step": 10983 + }, + { + "epoch": 1.4557985420808484, + "grad_norm": 3.8445756435394287, + "learning_rate": 2.6174627179546613e-06, + "loss": 0.0652, + "num_input_tokens_seen": 21505080, + "step": 10984 + }, + { + "epoch": 1.4559310801855534, + "grad_norm": 13.640947341918945, + "learning_rate": 2.617115962895198e-06, + "loss": 0.156, + "num_input_tokens_seen": 21506432, + "step": 10985 + }, + { + "epoch": 1.4560636182902584, + "grad_norm": 3.8770592212677, + "learning_rate": 2.6167692055776596e-06, + "loss": 0.1563, + "num_input_tokens_seen": 21508448, + "step": 10986 + }, + { + "epoch": 1.4561961563949635, + "grad_norm": 0.33174797892570496, + "learning_rate": 2.616422446008729e-06, + "loss": 0.0018, + "num_input_tokens_seen": 21510672, + "step": 10987 + }, + { + "epoch": 1.4563286944996687, + "grad_norm": 7.495532035827637, + "learning_rate": 2.6160756841950934e-06, + "loss": 0.1136, + "num_input_tokens_seen": 21512680, + "step": 10988 + }, + { + "epoch": 1.4564612326043738, + "grad_norm": 4.138807773590088, + "learning_rate": 2.615728920143439e-06, + "loss": 0.0477, + "num_input_tokens_seen": 21514432, + "step": 10989 + }, + { + "epoch": 1.4565937707090788, + "grad_norm": 0.24531511962413788, + "learning_rate": 2.6153821538604507e-06, + "loss": 0.0014, + "num_input_tokens_seen": 21517680, + "step": 10990 + }, + { + "epoch": 1.456726308813784, + "grad_norm": 0.2585221230983734, + "learning_rate": 2.6150353853528154e-06, + "loss": 0.0012, + "num_input_tokens_seen": 21519136, + "step": 10991 + }, + { + "epoch": 1.456858846918489, + "grad_norm": 3.002260208129883, + "learning_rate": 2.6146886146272187e-06, + "loss": 0.0561, + "num_input_tokens_seen": 21520584, + "step": 10992 + }, + { + "epoch": 1.4569913850231941, + "grad_norm": 5.412962913513184, + "learning_rate": 2.6143418416903456e-06, + "loss": 0.0672, + "num_input_tokens_seen": 21523176, + "step": 10993 + }, + { + "epoch": 1.4571239231278992, + "grad_norm": 9.048022270202637, + "learning_rate": 2.6139950665488833e-06, + "loss": 0.2233, + "num_input_tokens_seen": 21524968, + "step": 10994 + }, + { + "epoch": 1.4572564612326044, + "grad_norm": 0.10742446035146713, + "learning_rate": 2.613648289209517e-06, + "loss": 0.0007, + "num_input_tokens_seen": 21527304, + "step": 10995 + }, + { + "epoch": 1.4573889993373095, + "grad_norm": 0.040093861520290375, + "learning_rate": 2.6133015096789336e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21529776, + "step": 10996 + }, + { + "epoch": 1.4575215374420145, + "grad_norm": 5.786831378936768, + "learning_rate": 2.6129547279638197e-06, + "loss": 0.1479, + "num_input_tokens_seen": 21532376, + "step": 10997 + }, + { + "epoch": 1.4576540755467198, + "grad_norm": 19.81473731994629, + "learning_rate": 2.61260794407086e-06, + "loss": 0.1038, + "num_input_tokens_seen": 21534592, + "step": 10998 + }, + { + "epoch": 1.4577866136514248, + "grad_norm": 0.03994022682309151, + "learning_rate": 2.6122611580067414e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21535992, + "step": 10999 + }, + { + "epoch": 1.4579191517561298, + "grad_norm": 9.363077163696289, + "learning_rate": 2.61191436977815e-06, + "loss": 0.3072, + "num_input_tokens_seen": 21537944, + "step": 11000 + }, + { + "epoch": 1.4580516898608349, + "grad_norm": 17.945985794067383, + "learning_rate": 2.6115675793917734e-06, + "loss": 0.2591, + "num_input_tokens_seen": 21540072, + "step": 11001 + }, + { + "epoch": 1.4581842279655401, + "grad_norm": 2.4495420455932617, + "learning_rate": 2.611220786854296e-06, + "loss": 0.0327, + "num_input_tokens_seen": 21542440, + "step": 11002 + }, + { + "epoch": 1.4583167660702452, + "grad_norm": 5.7916998863220215, + "learning_rate": 2.610873992172405e-06, + "loss": 0.0779, + "num_input_tokens_seen": 21544808, + "step": 11003 + }, + { + "epoch": 1.4584493041749502, + "grad_norm": 3.049166440963745, + "learning_rate": 2.610527195352788e-06, + "loss": 0.019, + "num_input_tokens_seen": 21547384, + "step": 11004 + }, + { + "epoch": 1.4585818422796555, + "grad_norm": 9.886913299560547, + "learning_rate": 2.6101803964021294e-06, + "loss": 0.2062, + "num_input_tokens_seen": 21549832, + "step": 11005 + }, + { + "epoch": 1.4587143803843605, + "grad_norm": 5.452359676361084, + "learning_rate": 2.609833595327117e-06, + "loss": 0.1347, + "num_input_tokens_seen": 21551896, + "step": 11006 + }, + { + "epoch": 1.4588469184890656, + "grad_norm": 0.06725816428661346, + "learning_rate": 2.6094867921344374e-06, + "loss": 0.0005, + "num_input_tokens_seen": 21554200, + "step": 11007 + }, + { + "epoch": 1.4589794565937706, + "grad_norm": 3.4270777702331543, + "learning_rate": 2.609139986830776e-06, + "loss": 0.0074, + "num_input_tokens_seen": 21555832, + "step": 11008 + }, + { + "epoch": 1.4591119946984759, + "grad_norm": 0.05174370855093002, + "learning_rate": 2.6087931794228217e-06, + "loss": 0.0004, + "num_input_tokens_seen": 21557160, + "step": 11009 + }, + { + "epoch": 1.459244532803181, + "grad_norm": 4.266785621643066, + "learning_rate": 2.6084463699172594e-06, + "loss": 0.0611, + "num_input_tokens_seen": 21558904, + "step": 11010 + }, + { + "epoch": 1.4593770709078862, + "grad_norm": 7.021951675415039, + "learning_rate": 2.608099558320775e-06, + "loss": 0.1487, + "num_input_tokens_seen": 21560736, + "step": 11011 + }, + { + "epoch": 1.4595096090125912, + "grad_norm": 11.409744262695312, + "learning_rate": 2.6077527446400587e-06, + "loss": 0.4967, + "num_input_tokens_seen": 21563880, + "step": 11012 + }, + { + "epoch": 1.4596421471172962, + "grad_norm": 0.5445393919944763, + "learning_rate": 2.6074059288817936e-06, + "loss": 0.005, + "num_input_tokens_seen": 21565136, + "step": 11013 + }, + { + "epoch": 1.4597746852220013, + "grad_norm": 0.10181518644094467, + "learning_rate": 2.607059111052668e-06, + "loss": 0.0007, + "num_input_tokens_seen": 21567688, + "step": 11014 + }, + { + "epoch": 1.4599072233267063, + "grad_norm": 10.874519348144531, + "learning_rate": 2.6067122911593703e-06, + "loss": 0.1048, + "num_input_tokens_seen": 21569704, + "step": 11015 + }, + { + "epoch": 1.4600397614314116, + "grad_norm": 0.13061000406742096, + "learning_rate": 2.606365469208585e-06, + "loss": 0.0007, + "num_input_tokens_seen": 21571176, + "step": 11016 + }, + { + "epoch": 1.4601722995361166, + "grad_norm": 8.87643051147461, + "learning_rate": 2.6060186452069995e-06, + "loss": 0.058, + "num_input_tokens_seen": 21572696, + "step": 11017 + }, + { + "epoch": 1.4603048376408219, + "grad_norm": 0.19947637617588043, + "learning_rate": 2.605671819161303e-06, + "loss": 0.0013, + "num_input_tokens_seen": 21574928, + "step": 11018 + }, + { + "epoch": 1.460437375745527, + "grad_norm": 9.756861686706543, + "learning_rate": 2.60532499107818e-06, + "loss": 0.141, + "num_input_tokens_seen": 21576592, + "step": 11019 + }, + { + "epoch": 1.460569913850232, + "grad_norm": 5.723066806793213, + "learning_rate": 2.6049781609643182e-06, + "loss": 0.1272, + "num_input_tokens_seen": 21579080, + "step": 11020 + }, + { + "epoch": 1.460702451954937, + "grad_norm": 0.43191543221473694, + "learning_rate": 2.6046313288264053e-06, + "loss": 0.0026, + "num_input_tokens_seen": 21581032, + "step": 11021 + }, + { + "epoch": 1.460834990059642, + "grad_norm": 6.983368873596191, + "learning_rate": 2.604284494671129e-06, + "loss": 0.1808, + "num_input_tokens_seen": 21583152, + "step": 11022 + }, + { + "epoch": 1.4609675281643473, + "grad_norm": 5.39556360244751, + "learning_rate": 2.6039376585051745e-06, + "loss": 0.1061, + "num_input_tokens_seen": 21585592, + "step": 11023 + }, + { + "epoch": 1.4611000662690523, + "grad_norm": 7.101685523986816, + "learning_rate": 2.6035908203352308e-06, + "loss": 0.2581, + "num_input_tokens_seen": 21588008, + "step": 11024 + }, + { + "epoch": 1.4612326043737576, + "grad_norm": 0.06250391900539398, + "learning_rate": 2.603243980167984e-06, + "loss": 0.0004, + "num_input_tokens_seen": 21589448, + "step": 11025 + }, + { + "epoch": 1.4613651424784626, + "grad_norm": 9.151115417480469, + "learning_rate": 2.6028971380101226e-06, + "loss": 0.1574, + "num_input_tokens_seen": 21590704, + "step": 11026 + }, + { + "epoch": 1.4614976805831676, + "grad_norm": 10.341407775878906, + "learning_rate": 2.602550293868333e-06, + "loss": 0.1468, + "num_input_tokens_seen": 21592808, + "step": 11027 + }, + { + "epoch": 1.4616302186878727, + "grad_norm": 7.832805633544922, + "learning_rate": 2.6022034477493037e-06, + "loss": 0.1732, + "num_input_tokens_seen": 21595000, + "step": 11028 + }, + { + "epoch": 1.461762756792578, + "grad_norm": 10.837166786193848, + "learning_rate": 2.601856599659721e-06, + "loss": 0.1066, + "num_input_tokens_seen": 21596848, + "step": 11029 + }, + { + "epoch": 1.461895294897283, + "grad_norm": 6.255279541015625, + "learning_rate": 2.601509749606273e-06, + "loss": 0.0909, + "num_input_tokens_seen": 21598400, + "step": 11030 + }, + { + "epoch": 1.462027833001988, + "grad_norm": 0.10824492573738098, + "learning_rate": 2.6011628975956473e-06, + "loss": 0.001, + "num_input_tokens_seen": 21599584, + "step": 11031 + }, + { + "epoch": 1.4621603711066933, + "grad_norm": 4.105659484863281, + "learning_rate": 2.60081604363453e-06, + "loss": 0.0464, + "num_input_tokens_seen": 21601432, + "step": 11032 + }, + { + "epoch": 1.4622929092113983, + "grad_norm": 4.549729824066162, + "learning_rate": 2.6004691877296114e-06, + "loss": 0.0939, + "num_input_tokens_seen": 21603600, + "step": 11033 + }, + { + "epoch": 1.4624254473161034, + "grad_norm": 4.451516151428223, + "learning_rate": 2.600122329887576e-06, + "loss": 0.0936, + "num_input_tokens_seen": 21605520, + "step": 11034 + }, + { + "epoch": 1.4625579854208084, + "grad_norm": 10.242341041564941, + "learning_rate": 2.599775470115114e-06, + "loss": 0.3185, + "num_input_tokens_seen": 21607168, + "step": 11035 + }, + { + "epoch": 1.4626905235255137, + "grad_norm": 14.894968032836914, + "learning_rate": 2.599428608418912e-06, + "loss": 0.4668, + "num_input_tokens_seen": 21608560, + "step": 11036 + }, + { + "epoch": 1.4628230616302187, + "grad_norm": 7.097993850708008, + "learning_rate": 2.5990817448056576e-06, + "loss": 0.1258, + "num_input_tokens_seen": 21611120, + "step": 11037 + }, + { + "epoch": 1.4629555997349237, + "grad_norm": 11.64086627960205, + "learning_rate": 2.598734879282039e-06, + "loss": 0.2424, + "num_input_tokens_seen": 21612976, + "step": 11038 + }, + { + "epoch": 1.463088137839629, + "grad_norm": 0.029997162520885468, + "learning_rate": 2.5983880118547433e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21614288, + "step": 11039 + }, + { + "epoch": 1.463220675944334, + "grad_norm": 2.3856747150421143, + "learning_rate": 2.5980411425304585e-06, + "loss": 0.0362, + "num_input_tokens_seen": 21616240, + "step": 11040 + }, + { + "epoch": 1.463353214049039, + "grad_norm": 0.273572713136673, + "learning_rate": 2.5976942713158745e-06, + "loss": 0.0011, + "num_input_tokens_seen": 21617416, + "step": 11041 + }, + { + "epoch": 1.463485752153744, + "grad_norm": 3.9511852264404297, + "learning_rate": 2.597347398217676e-06, + "loss": 0.0162, + "num_input_tokens_seen": 21619408, + "step": 11042 + }, + { + "epoch": 1.4636182902584494, + "grad_norm": 3.52996826171875, + "learning_rate": 2.597000523242553e-06, + "loss": 0.0414, + "num_input_tokens_seen": 21621824, + "step": 11043 + }, + { + "epoch": 1.4637508283631544, + "grad_norm": 7.73145055770874, + "learning_rate": 2.5966536463971935e-06, + "loss": 0.1554, + "num_input_tokens_seen": 21623480, + "step": 11044 + }, + { + "epoch": 1.4638833664678594, + "grad_norm": 6.404489040374756, + "learning_rate": 2.5963067676882847e-06, + "loss": 0.1023, + "num_input_tokens_seen": 21625000, + "step": 11045 + }, + { + "epoch": 1.4640159045725647, + "grad_norm": 10.446934700012207, + "learning_rate": 2.595959887122515e-06, + "loss": 0.1, + "num_input_tokens_seen": 21627528, + "step": 11046 + }, + { + "epoch": 1.4641484426772697, + "grad_norm": 6.792369365692139, + "learning_rate": 2.595613004706572e-06, + "loss": 0.1165, + "num_input_tokens_seen": 21629600, + "step": 11047 + }, + { + "epoch": 1.4642809807819748, + "grad_norm": 15.01921558380127, + "learning_rate": 2.595266120447145e-06, + "loss": 0.3895, + "num_input_tokens_seen": 21631320, + "step": 11048 + }, + { + "epoch": 1.4644135188866798, + "grad_norm": 0.09403236955404282, + "learning_rate": 2.594919234350921e-06, + "loss": 0.0007, + "num_input_tokens_seen": 21632928, + "step": 11049 + }, + { + "epoch": 1.464546056991385, + "grad_norm": 11.55078125, + "learning_rate": 2.5945723464245888e-06, + "loss": 0.2409, + "num_input_tokens_seen": 21635184, + "step": 11050 + }, + { + "epoch": 1.46467859509609, + "grad_norm": 3.041452169418335, + "learning_rate": 2.594225456674837e-06, + "loss": 0.0378, + "num_input_tokens_seen": 21636920, + "step": 11051 + }, + { + "epoch": 1.4648111332007951, + "grad_norm": 4.928841590881348, + "learning_rate": 2.5938785651083524e-06, + "loss": 0.1688, + "num_input_tokens_seen": 21638568, + "step": 11052 + }, + { + "epoch": 1.4649436713055004, + "grad_norm": 9.234848976135254, + "learning_rate": 2.5935316717318246e-06, + "loss": 0.0434, + "num_input_tokens_seen": 21640032, + "step": 11053 + }, + { + "epoch": 1.4650762094102054, + "grad_norm": 0.16447634994983673, + "learning_rate": 2.593184776551943e-06, + "loss": 0.0012, + "num_input_tokens_seen": 21642496, + "step": 11054 + }, + { + "epoch": 1.4652087475149105, + "grad_norm": 0.13132554292678833, + "learning_rate": 2.5928378795753927e-06, + "loss": 0.001, + "num_input_tokens_seen": 21644368, + "step": 11055 + }, + { + "epoch": 1.4653412856196155, + "grad_norm": 7.2734785079956055, + "learning_rate": 2.592490980808865e-06, + "loss": 0.1554, + "num_input_tokens_seen": 21646200, + "step": 11056 + }, + { + "epoch": 1.4654738237243208, + "grad_norm": 5.767739295959473, + "learning_rate": 2.5921440802590476e-06, + "loss": 0.1359, + "num_input_tokens_seen": 21648080, + "step": 11057 + }, + { + "epoch": 1.4656063618290258, + "grad_norm": 0.20113810896873474, + "learning_rate": 2.591797177932628e-06, + "loss": 0.0015, + "num_input_tokens_seen": 21649760, + "step": 11058 + }, + { + "epoch": 1.465738899933731, + "grad_norm": 0.38479676842689514, + "learning_rate": 2.5914502738362964e-06, + "loss": 0.0038, + "num_input_tokens_seen": 21651056, + "step": 11059 + }, + { + "epoch": 1.4658714380384361, + "grad_norm": 0.834749162197113, + "learning_rate": 2.5911033679767396e-06, + "loss": 0.0062, + "num_input_tokens_seen": 21653416, + "step": 11060 + }, + { + "epoch": 1.4660039761431412, + "grad_norm": 8.273273468017578, + "learning_rate": 2.590756460360647e-06, + "loss": 0.1301, + "num_input_tokens_seen": 21654792, + "step": 11061 + }, + { + "epoch": 1.4661365142478462, + "grad_norm": 7.856949329376221, + "learning_rate": 2.590409550994708e-06, + "loss": 0.2194, + "num_input_tokens_seen": 21656320, + "step": 11062 + }, + { + "epoch": 1.4662690523525512, + "grad_norm": 13.234530448913574, + "learning_rate": 2.59006263988561e-06, + "loss": 0.2757, + "num_input_tokens_seen": 21659568, + "step": 11063 + }, + { + "epoch": 1.4664015904572565, + "grad_norm": 0.28951647877693176, + "learning_rate": 2.5897157270400417e-06, + "loss": 0.002, + "num_input_tokens_seen": 21660672, + "step": 11064 + }, + { + "epoch": 1.4665341285619615, + "grad_norm": 4.913437366485596, + "learning_rate": 2.5893688124646934e-06, + "loss": 0.1141, + "num_input_tokens_seen": 21662856, + "step": 11065 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 2.5340819358825684, + "learning_rate": 2.589021896166252e-06, + "loss": 0.0253, + "num_input_tokens_seen": 21664480, + "step": 11066 + }, + { + "epoch": 1.4667992047713718, + "grad_norm": 2.1130244731903076, + "learning_rate": 2.5886749781514075e-06, + "loss": 0.0385, + "num_input_tokens_seen": 21665928, + "step": 11067 + }, + { + "epoch": 1.4669317428760769, + "grad_norm": 4.872178554534912, + "learning_rate": 2.5883280584268476e-06, + "loss": 0.0828, + "num_input_tokens_seen": 21667912, + "step": 11068 + }, + { + "epoch": 1.467064280980782, + "grad_norm": 0.040651287883520126, + "learning_rate": 2.5879811369992624e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21669392, + "step": 11069 + }, + { + "epoch": 1.4671968190854872, + "grad_norm": 7.2859787940979, + "learning_rate": 2.5876342138753394e-06, + "loss": 0.0592, + "num_input_tokens_seen": 21671608, + "step": 11070 + }, + { + "epoch": 1.4673293571901922, + "grad_norm": 9.997447967529297, + "learning_rate": 2.5872872890617696e-06, + "loss": 0.3044, + "num_input_tokens_seen": 21673408, + "step": 11071 + }, + { + "epoch": 1.4674618952948972, + "grad_norm": 8.22560977935791, + "learning_rate": 2.58694036256524e-06, + "loss": 0.2734, + "num_input_tokens_seen": 21676120, + "step": 11072 + }, + { + "epoch": 1.4675944333996025, + "grad_norm": 9.969096183776855, + "learning_rate": 2.5865934343924393e-06, + "loss": 0.2661, + "num_input_tokens_seen": 21678320, + "step": 11073 + }, + { + "epoch": 1.4677269715043075, + "grad_norm": 0.2750203311443329, + "learning_rate": 2.586246504550059e-06, + "loss": 0.0018, + "num_input_tokens_seen": 21680688, + "step": 11074 + }, + { + "epoch": 1.4678595096090126, + "grad_norm": 0.04734223335981369, + "learning_rate": 2.585899573044786e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21682008, + "step": 11075 + }, + { + "epoch": 1.4679920477137176, + "grad_norm": 0.20386825501918793, + "learning_rate": 2.5855526398833094e-06, + "loss": 0.0014, + "num_input_tokens_seen": 21685120, + "step": 11076 + }, + { + "epoch": 1.4681245858184229, + "grad_norm": 10.253085136413574, + "learning_rate": 2.5852057050723193e-06, + "loss": 0.1797, + "num_input_tokens_seen": 21687352, + "step": 11077 + }, + { + "epoch": 1.468257123923128, + "grad_norm": 2.6124250888824463, + "learning_rate": 2.5848587686185047e-06, + "loss": 0.0141, + "num_input_tokens_seen": 21688568, + "step": 11078 + }, + { + "epoch": 1.468389662027833, + "grad_norm": 4.995578765869141, + "learning_rate": 2.584511830528554e-06, + "loss": 0.0516, + "num_input_tokens_seen": 21690696, + "step": 11079 + }, + { + "epoch": 1.4685222001325382, + "grad_norm": 0.03132272884249687, + "learning_rate": 2.584164890809157e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21691560, + "step": 11080 + }, + { + "epoch": 1.4686547382372432, + "grad_norm": 0.11116857826709747, + "learning_rate": 2.5838179494670035e-06, + "loss": 0.0007, + "num_input_tokens_seen": 21692944, + "step": 11081 + }, + { + "epoch": 1.4687872763419483, + "grad_norm": 7.7167816162109375, + "learning_rate": 2.583471006508781e-06, + "loss": 0.1324, + "num_input_tokens_seen": 21694856, + "step": 11082 + }, + { + "epoch": 1.4689198144466533, + "grad_norm": 0.6264228820800781, + "learning_rate": 2.583124061941181e-06, + "loss": 0.0029, + "num_input_tokens_seen": 21696648, + "step": 11083 + }, + { + "epoch": 1.4690523525513586, + "grad_norm": 11.472622871398926, + "learning_rate": 2.582777115770891e-06, + "loss": 0.1836, + "num_input_tokens_seen": 21698904, + "step": 11084 + }, + { + "epoch": 1.4691848906560636, + "grad_norm": 0.14224688708782196, + "learning_rate": 2.5824301680046017e-06, + "loss": 0.001, + "num_input_tokens_seen": 21700808, + "step": 11085 + }, + { + "epoch": 1.4693174287607687, + "grad_norm": 5.589762210845947, + "learning_rate": 2.582083218649001e-06, + "loss": 0.2335, + "num_input_tokens_seen": 21702936, + "step": 11086 + }, + { + "epoch": 1.469449966865474, + "grad_norm": 4.799900054931641, + "learning_rate": 2.5817362677107807e-06, + "loss": 0.0955, + "num_input_tokens_seen": 21705152, + "step": 11087 + }, + { + "epoch": 1.469582504970179, + "grad_norm": 7.143099308013916, + "learning_rate": 2.5813893151966275e-06, + "loss": 0.1625, + "num_input_tokens_seen": 21707544, + "step": 11088 + }, + { + "epoch": 1.469715043074884, + "grad_norm": 6.8088059425354, + "learning_rate": 2.581042361113232e-06, + "loss": 0.0654, + "num_input_tokens_seen": 21709632, + "step": 11089 + }, + { + "epoch": 1.469847581179589, + "grad_norm": 5.073712348937988, + "learning_rate": 2.580695405467285e-06, + "loss": 0.0977, + "num_input_tokens_seen": 21711960, + "step": 11090 + }, + { + "epoch": 1.4699801192842943, + "grad_norm": 7.132404804229736, + "learning_rate": 2.5803484482654733e-06, + "loss": 0.143, + "num_input_tokens_seen": 21713728, + "step": 11091 + }, + { + "epoch": 1.4701126573889993, + "grad_norm": 1.1447491645812988, + "learning_rate": 2.58000148951449e-06, + "loss": 0.0072, + "num_input_tokens_seen": 21715184, + "step": 11092 + }, + { + "epoch": 1.4702451954937044, + "grad_norm": 8.833479881286621, + "learning_rate": 2.579654529221021e-06, + "loss": 0.2726, + "num_input_tokens_seen": 21717832, + "step": 11093 + }, + { + "epoch": 1.4703777335984096, + "grad_norm": 6.5997467041015625, + "learning_rate": 2.5793075673917587e-06, + "loss": 0.152, + "num_input_tokens_seen": 21719592, + "step": 11094 + }, + { + "epoch": 1.4705102717031147, + "grad_norm": 0.02729985862970352, + "learning_rate": 2.578960604033392e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21721024, + "step": 11095 + }, + { + "epoch": 1.4706428098078197, + "grad_norm": 10.959158897399902, + "learning_rate": 2.57861363915261e-06, + "loss": 0.0868, + "num_input_tokens_seen": 21722576, + "step": 11096 + }, + { + "epoch": 1.4707753479125247, + "grad_norm": 4.057428359985352, + "learning_rate": 2.578266672756103e-06, + "loss": 0.0664, + "num_input_tokens_seen": 21724560, + "step": 11097 + }, + { + "epoch": 1.47090788601723, + "grad_norm": 10.816934585571289, + "learning_rate": 2.5779197048505607e-06, + "loss": 0.2868, + "num_input_tokens_seen": 21727144, + "step": 11098 + }, + { + "epoch": 1.471040424121935, + "grad_norm": 3.0672852993011475, + "learning_rate": 2.577572735442672e-06, + "loss": 0.0357, + "num_input_tokens_seen": 21729112, + "step": 11099 + }, + { + "epoch": 1.4711729622266403, + "grad_norm": 14.973370552062988, + "learning_rate": 2.577225764539128e-06, + "loss": 0.2588, + "num_input_tokens_seen": 21730840, + "step": 11100 + }, + { + "epoch": 1.4713055003313453, + "grad_norm": 1.2575414180755615, + "learning_rate": 2.576878792146618e-06, + "loss": 0.0113, + "num_input_tokens_seen": 21733104, + "step": 11101 + }, + { + "epoch": 1.4714380384360504, + "grad_norm": 5.518712043762207, + "learning_rate": 2.5765318182718318e-06, + "loss": 0.0243, + "num_input_tokens_seen": 21735680, + "step": 11102 + }, + { + "epoch": 1.4715705765407554, + "grad_norm": 0.049268919974565506, + "learning_rate": 2.576184842921459e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21736896, + "step": 11103 + }, + { + "epoch": 1.4717031146454604, + "grad_norm": 12.287383079528809, + "learning_rate": 2.5758378661021898e-06, + "loss": 0.3671, + "num_input_tokens_seen": 21738984, + "step": 11104 + }, + { + "epoch": 1.4718356527501657, + "grad_norm": 12.141632080078125, + "learning_rate": 2.5754908878207146e-06, + "loss": 0.1644, + "num_input_tokens_seen": 21741056, + "step": 11105 + }, + { + "epoch": 1.4719681908548707, + "grad_norm": 12.063698768615723, + "learning_rate": 2.575143908083723e-06, + "loss": 0.3979, + "num_input_tokens_seen": 21743712, + "step": 11106 + }, + { + "epoch": 1.472100728959576, + "grad_norm": 12.591207504272461, + "learning_rate": 2.574796926897905e-06, + "loss": 0.234, + "num_input_tokens_seen": 21745768, + "step": 11107 + }, + { + "epoch": 1.472233267064281, + "grad_norm": 0.1670842468738556, + "learning_rate": 2.5744499442699504e-06, + "loss": 0.0012, + "num_input_tokens_seen": 21747744, + "step": 11108 + }, + { + "epoch": 1.472365805168986, + "grad_norm": 0.1941344439983368, + "learning_rate": 2.5741029602065496e-06, + "loss": 0.0013, + "num_input_tokens_seen": 21749472, + "step": 11109 + }, + { + "epoch": 1.4724983432736911, + "grad_norm": 0.5653542876243591, + "learning_rate": 2.5737559747143935e-06, + "loss": 0.0024, + "num_input_tokens_seen": 21751424, + "step": 11110 + }, + { + "epoch": 1.4726308813783962, + "grad_norm": 0.11951368302106857, + "learning_rate": 2.5734089878001707e-06, + "loss": 0.0008, + "num_input_tokens_seen": 21753200, + "step": 11111 + }, + { + "epoch": 1.4727634194831014, + "grad_norm": 3.1244583129882812, + "learning_rate": 2.573061999470571e-06, + "loss": 0.0452, + "num_input_tokens_seen": 21755344, + "step": 11112 + }, + { + "epoch": 1.4728959575878064, + "grad_norm": 3.4613451957702637, + "learning_rate": 2.572715009732287e-06, + "loss": 0.1063, + "num_input_tokens_seen": 21757176, + "step": 11113 + }, + { + "epoch": 1.4730284956925117, + "grad_norm": 2.556030750274658, + "learning_rate": 2.572368018592007e-06, + "loss": 0.0118, + "num_input_tokens_seen": 21758920, + "step": 11114 + }, + { + "epoch": 1.4731610337972167, + "grad_norm": 3.470649480819702, + "learning_rate": 2.5720210260564216e-06, + "loss": 0.0358, + "num_input_tokens_seen": 21760160, + "step": 11115 + }, + { + "epoch": 1.4732935719019218, + "grad_norm": 5.224481582641602, + "learning_rate": 2.5716740321322215e-06, + "loss": 0.0781, + "num_input_tokens_seen": 21761760, + "step": 11116 + }, + { + "epoch": 1.4734261100066268, + "grad_norm": 0.05140223726630211, + "learning_rate": 2.5713270368260956e-06, + "loss": 0.0004, + "num_input_tokens_seen": 21763616, + "step": 11117 + }, + { + "epoch": 1.473558648111332, + "grad_norm": 9.445391654968262, + "learning_rate": 2.570980040144736e-06, + "loss": 0.1684, + "num_input_tokens_seen": 21766016, + "step": 11118 + }, + { + "epoch": 1.4736911862160371, + "grad_norm": 0.0800311416387558, + "learning_rate": 2.5706330420948323e-06, + "loss": 0.0005, + "num_input_tokens_seen": 21767200, + "step": 11119 + }, + { + "epoch": 1.4738237243207422, + "grad_norm": 4.082507133483887, + "learning_rate": 2.570286042683074e-06, + "loss": 0.0379, + "num_input_tokens_seen": 21768896, + "step": 11120 + }, + { + "epoch": 1.4739562624254474, + "grad_norm": 0.03419762849807739, + "learning_rate": 2.5699390419161528e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21770736, + "step": 11121 + }, + { + "epoch": 1.4740888005301525, + "grad_norm": 8.612048149108887, + "learning_rate": 2.569592039800759e-06, + "loss": 0.228, + "num_input_tokens_seen": 21772824, + "step": 11122 + }, + { + "epoch": 1.4742213386348575, + "grad_norm": 11.682594299316406, + "learning_rate": 2.569245036343582e-06, + "loss": 0.2107, + "num_input_tokens_seen": 21774632, + "step": 11123 + }, + { + "epoch": 1.4743538767395625, + "grad_norm": 7.543234348297119, + "learning_rate": 2.568898031551314e-06, + "loss": 0.1641, + "num_input_tokens_seen": 21777016, + "step": 11124 + }, + { + "epoch": 1.4744864148442678, + "grad_norm": 2.7014565467834473, + "learning_rate": 2.5685510254306434e-06, + "loss": 0.014, + "num_input_tokens_seen": 21778432, + "step": 11125 + }, + { + "epoch": 1.4746189529489728, + "grad_norm": 0.6720866560935974, + "learning_rate": 2.5682040179882625e-06, + "loss": 0.0043, + "num_input_tokens_seen": 21780440, + "step": 11126 + }, + { + "epoch": 1.4747514910536779, + "grad_norm": 6.520196437835693, + "learning_rate": 2.5678570092308603e-06, + "loss": 0.1699, + "num_input_tokens_seen": 21783360, + "step": 11127 + }, + { + "epoch": 1.4748840291583831, + "grad_norm": 5.362974166870117, + "learning_rate": 2.5675099991651286e-06, + "loss": 0.1027, + "num_input_tokens_seen": 21784616, + "step": 11128 + }, + { + "epoch": 1.4750165672630882, + "grad_norm": 0.012301245704293251, + "learning_rate": 2.567162987797757e-06, + "loss": 0.0001, + "num_input_tokens_seen": 21785704, + "step": 11129 + }, + { + "epoch": 1.4751491053677932, + "grad_norm": 10.43653392791748, + "learning_rate": 2.566815975135437e-06, + "loss": 0.2249, + "num_input_tokens_seen": 21787664, + "step": 11130 + }, + { + "epoch": 1.4752816434724982, + "grad_norm": 0.44780564308166504, + "learning_rate": 2.5664689611848597e-06, + "loss": 0.0027, + "num_input_tokens_seen": 21789176, + "step": 11131 + }, + { + "epoch": 1.4754141815772035, + "grad_norm": 6.9935832023620605, + "learning_rate": 2.566121945952714e-06, + "loss": 0.1843, + "num_input_tokens_seen": 21792752, + "step": 11132 + }, + { + "epoch": 1.4755467196819085, + "grad_norm": 0.13129478693008423, + "learning_rate": 2.565774929445692e-06, + "loss": 0.0009, + "num_input_tokens_seen": 21794720, + "step": 11133 + }, + { + "epoch": 1.4756792577866136, + "grad_norm": 0.14713329076766968, + "learning_rate": 2.565427911670484e-06, + "loss": 0.0008, + "num_input_tokens_seen": 21797096, + "step": 11134 + }, + { + "epoch": 1.4758117958913188, + "grad_norm": 1.356232762336731, + "learning_rate": 2.56508089263378e-06, + "loss": 0.0069, + "num_input_tokens_seen": 21798552, + "step": 11135 + }, + { + "epoch": 1.4759443339960239, + "grad_norm": 9.22382926940918, + "learning_rate": 2.5647338723422723e-06, + "loss": 0.1656, + "num_input_tokens_seen": 21800568, + "step": 11136 + }, + { + "epoch": 1.476076872100729, + "grad_norm": 5.675853729248047, + "learning_rate": 2.564386850802651e-06, + "loss": 0.1563, + "num_input_tokens_seen": 21802432, + "step": 11137 + }, + { + "epoch": 1.476209410205434, + "grad_norm": 5.752342700958252, + "learning_rate": 2.564039828021606e-06, + "loss": 0.0472, + "num_input_tokens_seen": 21804080, + "step": 11138 + }, + { + "epoch": 1.4763419483101392, + "grad_norm": 3.5545132160186768, + "learning_rate": 2.5636928040058296e-06, + "loss": 0.1123, + "num_input_tokens_seen": 21806904, + "step": 11139 + }, + { + "epoch": 1.4764744864148442, + "grad_norm": 6.1984758377075195, + "learning_rate": 2.5633457787620115e-06, + "loss": 0.1285, + "num_input_tokens_seen": 21809176, + "step": 11140 + }, + { + "epoch": 1.4766070245195495, + "grad_norm": 6.578401565551758, + "learning_rate": 2.5629987522968435e-06, + "loss": 0.1361, + "num_input_tokens_seen": 21810968, + "step": 11141 + }, + { + "epoch": 1.4767395626242545, + "grad_norm": 2.6197304725646973, + "learning_rate": 2.5626517246170163e-06, + "loss": 0.0744, + "num_input_tokens_seen": 21812416, + "step": 11142 + }, + { + "epoch": 1.4768721007289596, + "grad_norm": 0.10279325395822525, + "learning_rate": 2.56230469572922e-06, + "loss": 0.0007, + "num_input_tokens_seen": 21813776, + "step": 11143 + }, + { + "epoch": 1.4770046388336646, + "grad_norm": 8.896048545837402, + "learning_rate": 2.5619576656401463e-06, + "loss": 0.3092, + "num_input_tokens_seen": 21816064, + "step": 11144 + }, + { + "epoch": 1.4771371769383697, + "grad_norm": 0.13208064436912537, + "learning_rate": 2.5616106343564865e-06, + "loss": 0.001, + "num_input_tokens_seen": 21817640, + "step": 11145 + }, + { + "epoch": 1.477269715043075, + "grad_norm": 0.09150807559490204, + "learning_rate": 2.561263601884931e-06, + "loss": 0.0007, + "num_input_tokens_seen": 21819048, + "step": 11146 + }, + { + "epoch": 1.47740225314778, + "grad_norm": 0.23883140087127686, + "learning_rate": 2.5609165682321708e-06, + "loss": 0.0016, + "num_input_tokens_seen": 21820272, + "step": 11147 + }, + { + "epoch": 1.4775347912524852, + "grad_norm": 6.0577473640441895, + "learning_rate": 2.560569533404897e-06, + "loss": 0.1454, + "num_input_tokens_seen": 21822176, + "step": 11148 + }, + { + "epoch": 1.4776673293571903, + "grad_norm": 0.26165109872817993, + "learning_rate": 2.5602224974098016e-06, + "loss": 0.0015, + "num_input_tokens_seen": 21824400, + "step": 11149 + }, + { + "epoch": 1.4777998674618953, + "grad_norm": 7.366633415222168, + "learning_rate": 2.5598754602535743e-06, + "loss": 0.2068, + "num_input_tokens_seen": 21826720, + "step": 11150 + }, + { + "epoch": 1.4779324055666003, + "grad_norm": 5.239055633544922, + "learning_rate": 2.5595284219429064e-06, + "loss": 0.158, + "num_input_tokens_seen": 21828936, + "step": 11151 + }, + { + "epoch": 1.4780649436713054, + "grad_norm": 1.0452977418899536, + "learning_rate": 2.55918138248449e-06, + "loss": 0.0056, + "num_input_tokens_seen": 21831400, + "step": 11152 + }, + { + "epoch": 1.4781974817760106, + "grad_norm": 2.629706382751465, + "learning_rate": 2.5588343418850155e-06, + "loss": 0.1017, + "num_input_tokens_seen": 21833592, + "step": 11153 + }, + { + "epoch": 1.4783300198807157, + "grad_norm": 0.04483118653297424, + "learning_rate": 2.5584873001511745e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21834976, + "step": 11154 + }, + { + "epoch": 1.478462557985421, + "grad_norm": 0.04260517284274101, + "learning_rate": 2.5581402572896585e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21836864, + "step": 11155 + }, + { + "epoch": 1.478595096090126, + "grad_norm": 2.183316946029663, + "learning_rate": 2.557793213307157e-06, + "loss": 0.0291, + "num_input_tokens_seen": 21838624, + "step": 11156 + }, + { + "epoch": 1.478727634194831, + "grad_norm": 14.253841400146484, + "learning_rate": 2.5574461682103636e-06, + "loss": 0.2388, + "num_input_tokens_seen": 21840312, + "step": 11157 + }, + { + "epoch": 1.478860172299536, + "grad_norm": 5.777599334716797, + "learning_rate": 2.557099122005968e-06, + "loss": 0.0843, + "num_input_tokens_seen": 21843056, + "step": 11158 + }, + { + "epoch": 1.4789927104042413, + "grad_norm": 0.06728745996952057, + "learning_rate": 2.5567520747006612e-06, + "loss": 0.0004, + "num_input_tokens_seen": 21844288, + "step": 11159 + }, + { + "epoch": 1.4791252485089463, + "grad_norm": 0.8680945038795471, + "learning_rate": 2.556405026301137e-06, + "loss": 0.0076, + "num_input_tokens_seen": 21846712, + "step": 11160 + }, + { + "epoch": 1.4792577866136514, + "grad_norm": 6.212240219116211, + "learning_rate": 2.5560579768140838e-06, + "loss": 0.1338, + "num_input_tokens_seen": 21848608, + "step": 11161 + }, + { + "epoch": 1.4793903247183566, + "grad_norm": 5.905635833740234, + "learning_rate": 2.555710926246194e-06, + "loss": 0.0404, + "num_input_tokens_seen": 21850848, + "step": 11162 + }, + { + "epoch": 1.4795228628230617, + "grad_norm": 5.782008171081543, + "learning_rate": 2.55536387460416e-06, + "loss": 0.067, + "num_input_tokens_seen": 21852408, + "step": 11163 + }, + { + "epoch": 1.4796554009277667, + "grad_norm": 0.08158911764621735, + "learning_rate": 2.5550168218946714e-06, + "loss": 0.0008, + "num_input_tokens_seen": 21853816, + "step": 11164 + }, + { + "epoch": 1.4797879390324717, + "grad_norm": 0.04930699244141579, + "learning_rate": 2.554669768124421e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21856400, + "step": 11165 + }, + { + "epoch": 1.479920477137177, + "grad_norm": 11.54738998413086, + "learning_rate": 2.554322713300099e-06, + "loss": 0.1988, + "num_input_tokens_seen": 21858440, + "step": 11166 + }, + { + "epoch": 1.480053015241882, + "grad_norm": 13.03319263458252, + "learning_rate": 2.5539756574283985e-06, + "loss": 0.3302, + "num_input_tokens_seen": 21860776, + "step": 11167 + }, + { + "epoch": 1.480185553346587, + "grad_norm": 6.881104469299316, + "learning_rate": 2.55362860051601e-06, + "loss": 0.1164, + "num_input_tokens_seen": 21862776, + "step": 11168 + }, + { + "epoch": 1.4803180914512923, + "grad_norm": 8.74828815460205, + "learning_rate": 2.5532815425696245e-06, + "loss": 0.1805, + "num_input_tokens_seen": 21864552, + "step": 11169 + }, + { + "epoch": 1.4804506295559974, + "grad_norm": 4.866464614868164, + "learning_rate": 2.552934483595935e-06, + "loss": 0.1057, + "num_input_tokens_seen": 21867120, + "step": 11170 + }, + { + "epoch": 1.4805831676607024, + "grad_norm": 0.12263792008161545, + "learning_rate": 2.5525874236016307e-06, + "loss": 0.0007, + "num_input_tokens_seen": 21868400, + "step": 11171 + }, + { + "epoch": 1.4807157057654075, + "grad_norm": 0.25782310962677, + "learning_rate": 2.5522403625934063e-06, + "loss": 0.0019, + "num_input_tokens_seen": 21870776, + "step": 11172 + }, + { + "epoch": 1.4808482438701127, + "grad_norm": 0.02431444823741913, + "learning_rate": 2.5518933005779504e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21872440, + "step": 11173 + }, + { + "epoch": 1.4809807819748178, + "grad_norm": 5.497162818908691, + "learning_rate": 2.5515462375619563e-06, + "loss": 0.0649, + "num_input_tokens_seen": 21874600, + "step": 11174 + }, + { + "epoch": 1.4811133200795228, + "grad_norm": 4.860320091247559, + "learning_rate": 2.5511991735521157e-06, + "loss": 0.0184, + "num_input_tokens_seen": 21875800, + "step": 11175 + }, + { + "epoch": 1.481245858184228, + "grad_norm": 9.03791332244873, + "learning_rate": 2.550852108555119e-06, + "loss": 0.1615, + "num_input_tokens_seen": 21877776, + "step": 11176 + }, + { + "epoch": 1.481378396288933, + "grad_norm": 1.5148662328720093, + "learning_rate": 2.550505042577659e-06, + "loss": 0.0253, + "num_input_tokens_seen": 21878968, + "step": 11177 + }, + { + "epoch": 1.4815109343936381, + "grad_norm": 0.45201557874679565, + "learning_rate": 2.5501579756264273e-06, + "loss": 0.0016, + "num_input_tokens_seen": 21881048, + "step": 11178 + }, + { + "epoch": 1.4816434724983432, + "grad_norm": 1.0872268676757812, + "learning_rate": 2.549810907708114e-06, + "loss": 0.0096, + "num_input_tokens_seen": 21883864, + "step": 11179 + }, + { + "epoch": 1.4817760106030484, + "grad_norm": 0.534960150718689, + "learning_rate": 2.5494638388294128e-06, + "loss": 0.004, + "num_input_tokens_seen": 21885648, + "step": 11180 + }, + { + "epoch": 1.4819085487077535, + "grad_norm": 2.264981985092163, + "learning_rate": 2.5491167689970147e-06, + "loss": 0.0186, + "num_input_tokens_seen": 21887752, + "step": 11181 + }, + { + "epoch": 1.4820410868124587, + "grad_norm": 4.386968612670898, + "learning_rate": 2.5487696982176113e-06, + "loss": 0.0152, + "num_input_tokens_seen": 21889368, + "step": 11182 + }, + { + "epoch": 1.4821736249171638, + "grad_norm": 6.181941986083984, + "learning_rate": 2.5484226264978946e-06, + "loss": 0.0861, + "num_input_tokens_seen": 21890744, + "step": 11183 + }, + { + "epoch": 1.4823061630218688, + "grad_norm": 7.733394622802734, + "learning_rate": 2.5480755538445557e-06, + "loss": 0.0939, + "num_input_tokens_seen": 21892368, + "step": 11184 + }, + { + "epoch": 1.4824387011265738, + "grad_norm": 0.04716675356030464, + "learning_rate": 2.547728480264287e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21894520, + "step": 11185 + }, + { + "epoch": 1.4825712392312789, + "grad_norm": 6.8333563804626465, + "learning_rate": 2.547381405763781e-06, + "loss": 0.0525, + "num_input_tokens_seen": 21896056, + "step": 11186 + }, + { + "epoch": 1.4827037773359841, + "grad_norm": 3.5846142768859863, + "learning_rate": 2.5470343303497286e-06, + "loss": 0.058, + "num_input_tokens_seen": 21898688, + "step": 11187 + }, + { + "epoch": 1.4828363154406892, + "grad_norm": 0.02812708541750908, + "learning_rate": 2.546687254028822e-06, + "loss": 0.0003, + "num_input_tokens_seen": 21900088, + "step": 11188 + }, + { + "epoch": 1.4829688535453944, + "grad_norm": 16.55612564086914, + "learning_rate": 2.5463401768077528e-06, + "loss": 0.3377, + "num_input_tokens_seen": 21901880, + "step": 11189 + }, + { + "epoch": 1.4831013916500995, + "grad_norm": 6.110386848449707, + "learning_rate": 2.5459930986932126e-06, + "loss": 0.0955, + "num_input_tokens_seen": 21903960, + "step": 11190 + }, + { + "epoch": 1.4832339297548045, + "grad_norm": 7.810425281524658, + "learning_rate": 2.5456460196918938e-06, + "loss": 0.145, + "num_input_tokens_seen": 21906384, + "step": 11191 + }, + { + "epoch": 1.4833664678595095, + "grad_norm": 4.55702018737793, + "learning_rate": 2.545298939810488e-06, + "loss": 0.0796, + "num_input_tokens_seen": 21908472, + "step": 11192 + }, + { + "epoch": 1.4834990059642146, + "grad_norm": 15.048956871032715, + "learning_rate": 2.5449518590556888e-06, + "loss": 0.3339, + "num_input_tokens_seen": 21910632, + "step": 11193 + }, + { + "epoch": 1.4836315440689198, + "grad_norm": 1.0949821472167969, + "learning_rate": 2.5446047774341853e-06, + "loss": 0.007, + "num_input_tokens_seen": 21912552, + "step": 11194 + }, + { + "epoch": 1.4837640821736249, + "grad_norm": 3.7838027477264404, + "learning_rate": 2.5442576949526715e-06, + "loss": 0.1036, + "num_input_tokens_seen": 21914480, + "step": 11195 + }, + { + "epoch": 1.4838966202783301, + "grad_norm": 8.974052429199219, + "learning_rate": 2.5439106116178393e-06, + "loss": 0.0848, + "num_input_tokens_seen": 21915944, + "step": 11196 + }, + { + "epoch": 1.4840291583830352, + "grad_norm": 8.462437629699707, + "learning_rate": 2.543563527436379e-06, + "loss": 0.1722, + "num_input_tokens_seen": 21917752, + "step": 11197 + }, + { + "epoch": 1.4841616964877402, + "grad_norm": 10.801608085632324, + "learning_rate": 2.5432164424149845e-06, + "loss": 0.2548, + "num_input_tokens_seen": 21919552, + "step": 11198 + }, + { + "epoch": 1.4842942345924452, + "grad_norm": 0.5651176571846008, + "learning_rate": 2.5428693565603475e-06, + "loss": 0.0053, + "num_input_tokens_seen": 21921720, + "step": 11199 + }, + { + "epoch": 1.4844267726971505, + "grad_norm": 8.296182632446289, + "learning_rate": 2.5425222698791592e-06, + "loss": 0.117, + "num_input_tokens_seen": 21923784, + "step": 11200 + }, + { + "epoch": 1.4845593108018555, + "grad_norm": 0.13115756213665009, + "learning_rate": 2.5421751823781125e-06, + "loss": 0.001, + "num_input_tokens_seen": 21927096, + "step": 11201 + }, + { + "epoch": 1.4846918489065606, + "grad_norm": 6.350335597991943, + "learning_rate": 2.5418280940638993e-06, + "loss": 0.0399, + "num_input_tokens_seen": 21928608, + "step": 11202 + }, + { + "epoch": 1.4848243870112658, + "grad_norm": 6.139593601226807, + "learning_rate": 2.541481004943211e-06, + "loss": 0.2025, + "num_input_tokens_seen": 21930472, + "step": 11203 + }, + { + "epoch": 1.4849569251159709, + "grad_norm": 6.233639240264893, + "learning_rate": 2.541133915022741e-06, + "loss": 0.1612, + "num_input_tokens_seen": 21932496, + "step": 11204 + }, + { + "epoch": 1.485089463220676, + "grad_norm": 10.467829704284668, + "learning_rate": 2.54078682430918e-06, + "loss": 0.1516, + "num_input_tokens_seen": 21934832, + "step": 11205 + }, + { + "epoch": 1.485222001325381, + "grad_norm": 0.09774071723222733, + "learning_rate": 2.5404397328092214e-06, + "loss": 0.0006, + "num_input_tokens_seen": 21936296, + "step": 11206 + }, + { + "epoch": 1.4853545394300862, + "grad_norm": 0.5842940211296082, + "learning_rate": 2.5400926405295574e-06, + "loss": 0.003, + "num_input_tokens_seen": 21938528, + "step": 11207 + }, + { + "epoch": 1.4854870775347913, + "grad_norm": 13.543889999389648, + "learning_rate": 2.5397455474768784e-06, + "loss": 0.1856, + "num_input_tokens_seen": 21941208, + "step": 11208 + }, + { + "epoch": 1.4856196156394963, + "grad_norm": 4.807651996612549, + "learning_rate": 2.539398453657878e-06, + "loss": 0.0981, + "num_input_tokens_seen": 21943056, + "step": 11209 + }, + { + "epoch": 1.4857521537442016, + "grad_norm": 0.08717454224824905, + "learning_rate": 2.5390513590792486e-06, + "loss": 0.0005, + "num_input_tokens_seen": 21945888, + "step": 11210 + }, + { + "epoch": 1.4858846918489066, + "grad_norm": 0.8177707195281982, + "learning_rate": 2.5387042637476826e-06, + "loss": 0.0057, + "num_input_tokens_seen": 21947136, + "step": 11211 + }, + { + "epoch": 1.4860172299536116, + "grad_norm": 0.14777931571006775, + "learning_rate": 2.5383571676698708e-06, + "loss": 0.0011, + "num_input_tokens_seen": 21949320, + "step": 11212 + }, + { + "epoch": 1.4861497680583167, + "grad_norm": 3.502070903778076, + "learning_rate": 2.538010070852507e-06, + "loss": 0.0596, + "num_input_tokens_seen": 21951000, + "step": 11213 + }, + { + "epoch": 1.486282306163022, + "grad_norm": 4.961649417877197, + "learning_rate": 2.537662973302283e-06, + "loss": 0.0815, + "num_input_tokens_seen": 21953120, + "step": 11214 + }, + { + "epoch": 1.486414844267727, + "grad_norm": 10.793278694152832, + "learning_rate": 2.53731587502589e-06, + "loss": 0.1703, + "num_input_tokens_seen": 21955160, + "step": 11215 + }, + { + "epoch": 1.486547382372432, + "grad_norm": 7.7959065437316895, + "learning_rate": 2.5369687760300216e-06, + "loss": 0.2909, + "num_input_tokens_seen": 21956848, + "step": 11216 + }, + { + "epoch": 1.4866799204771373, + "grad_norm": 4.264549732208252, + "learning_rate": 2.53662167632137e-06, + "loss": 0.0563, + "num_input_tokens_seen": 21958560, + "step": 11217 + }, + { + "epoch": 1.4868124585818423, + "grad_norm": 3.288600206375122, + "learning_rate": 2.5362745759066263e-06, + "loss": 0.0234, + "num_input_tokens_seen": 21960376, + "step": 11218 + }, + { + "epoch": 1.4869449966865473, + "grad_norm": 14.401220321655273, + "learning_rate": 2.535927474792485e-06, + "loss": 0.2336, + "num_input_tokens_seen": 21962360, + "step": 11219 + }, + { + "epoch": 1.4870775347912524, + "grad_norm": 4.68358850479126, + "learning_rate": 2.535580372985637e-06, + "loss": 0.0825, + "num_input_tokens_seen": 21963888, + "step": 11220 + }, + { + "epoch": 1.4872100728959576, + "grad_norm": 0.11901406943798065, + "learning_rate": 2.535233270492774e-06, + "loss": 0.0007, + "num_input_tokens_seen": 21965424, + "step": 11221 + }, + { + "epoch": 1.4873426110006627, + "grad_norm": 7.619617938995361, + "learning_rate": 2.53488616732059e-06, + "loss": 0.2111, + "num_input_tokens_seen": 21966888, + "step": 11222 + }, + { + "epoch": 1.4874751491053677, + "grad_norm": 4.48398494720459, + "learning_rate": 2.5345390634757763e-06, + "loss": 0.0881, + "num_input_tokens_seen": 21968504, + "step": 11223 + }, + { + "epoch": 1.487607687210073, + "grad_norm": 9.354607582092285, + "learning_rate": 2.5341919589650256e-06, + "loss": 0.235, + "num_input_tokens_seen": 21970744, + "step": 11224 + }, + { + "epoch": 1.487740225314778, + "grad_norm": 5.516051292419434, + "learning_rate": 2.5338448537950304e-06, + "loss": 0.0601, + "num_input_tokens_seen": 21972840, + "step": 11225 + }, + { + "epoch": 1.487872763419483, + "grad_norm": 4.914371967315674, + "learning_rate": 2.533497747972483e-06, + "loss": 0.025, + "num_input_tokens_seen": 21973768, + "step": 11226 + }, + { + "epoch": 1.488005301524188, + "grad_norm": 0.1430598646402359, + "learning_rate": 2.533150641504076e-06, + "loss": 0.0007, + "num_input_tokens_seen": 21976832, + "step": 11227 + }, + { + "epoch": 1.4881378396288933, + "grad_norm": 1.527557611465454, + "learning_rate": 2.5328035343965012e-06, + "loss": 0.0232, + "num_input_tokens_seen": 21978224, + "step": 11228 + }, + { + "epoch": 1.4882703777335984, + "grad_norm": 7.597232341766357, + "learning_rate": 2.5324564266564527e-06, + "loss": 0.1311, + "num_input_tokens_seen": 21980176, + "step": 11229 + }, + { + "epoch": 1.4884029158383036, + "grad_norm": 12.762622833251953, + "learning_rate": 2.5321093182906217e-06, + "loss": 0.1823, + "num_input_tokens_seen": 21983904, + "step": 11230 + }, + { + "epoch": 1.4885354539430087, + "grad_norm": 0.03356817364692688, + "learning_rate": 2.5317622093057e-06, + "loss": 0.0002, + "num_input_tokens_seen": 21985240, + "step": 11231 + }, + { + "epoch": 1.4886679920477137, + "grad_norm": 11.758745193481445, + "learning_rate": 2.531415099708382e-06, + "loss": 0.0957, + "num_input_tokens_seen": 21987504, + "step": 11232 + }, + { + "epoch": 1.4888005301524188, + "grad_norm": 7.603974342346191, + "learning_rate": 2.5310679895053585e-06, + "loss": 0.0304, + "num_input_tokens_seen": 21989088, + "step": 11233 + }, + { + "epoch": 1.4889330682571238, + "grad_norm": 14.615354537963867, + "learning_rate": 2.530720878703323e-06, + "loss": 0.4464, + "num_input_tokens_seen": 21991032, + "step": 11234 + }, + { + "epoch": 1.489065606361829, + "grad_norm": 3.262742757797241, + "learning_rate": 2.5303737673089686e-06, + "loss": 0.0468, + "num_input_tokens_seen": 21993072, + "step": 11235 + }, + { + "epoch": 1.489198144466534, + "grad_norm": 5.871703147888184, + "learning_rate": 2.530026655328985e-06, + "loss": 0.1643, + "num_input_tokens_seen": 21994792, + "step": 11236 + }, + { + "epoch": 1.4893306825712394, + "grad_norm": 0.1343318372964859, + "learning_rate": 2.529679542770069e-06, + "loss": 0.001, + "num_input_tokens_seen": 21996856, + "step": 11237 + }, + { + "epoch": 1.4894632206759444, + "grad_norm": 2.609346628189087, + "learning_rate": 2.529332429638909e-06, + "loss": 0.0388, + "num_input_tokens_seen": 21999624, + "step": 11238 + }, + { + "epoch": 1.4895957587806494, + "grad_norm": 2.5073983669281006, + "learning_rate": 2.5289853159422005e-06, + "loss": 0.0495, + "num_input_tokens_seen": 22002560, + "step": 11239 + }, + { + "epoch": 1.4897282968853545, + "grad_norm": 0.22896023094654083, + "learning_rate": 2.5286382016866356e-06, + "loss": 0.0015, + "num_input_tokens_seen": 22004808, + "step": 11240 + }, + { + "epoch": 1.4898608349900597, + "grad_norm": 2.1829471588134766, + "learning_rate": 2.5282910868789056e-06, + "loss": 0.0235, + "num_input_tokens_seen": 22006728, + "step": 11241 + }, + { + "epoch": 1.4899933730947648, + "grad_norm": 3.60050106048584, + "learning_rate": 2.5279439715257037e-06, + "loss": 0.0379, + "num_input_tokens_seen": 22008480, + "step": 11242 + }, + { + "epoch": 1.4901259111994698, + "grad_norm": 6.7282023429870605, + "learning_rate": 2.5275968556337237e-06, + "loss": 0.0827, + "num_input_tokens_seen": 22011032, + "step": 11243 + }, + { + "epoch": 1.490258449304175, + "grad_norm": 2.6511669158935547, + "learning_rate": 2.5272497392096566e-06, + "loss": 0.0363, + "num_input_tokens_seen": 22012536, + "step": 11244 + }, + { + "epoch": 1.49039098740888, + "grad_norm": 0.2613256871700287, + "learning_rate": 2.526902622260196e-06, + "loss": 0.0018, + "num_input_tokens_seen": 22015072, + "step": 11245 + }, + { + "epoch": 1.4905235255135851, + "grad_norm": 0.10562022775411606, + "learning_rate": 2.5265555047920347e-06, + "loss": 0.0006, + "num_input_tokens_seen": 22016744, + "step": 11246 + }, + { + "epoch": 1.4906560636182902, + "grad_norm": 0.04290936142206192, + "learning_rate": 2.5262083868118643e-06, + "loss": 0.0003, + "num_input_tokens_seen": 22018624, + "step": 11247 + }, + { + "epoch": 1.4907886017229954, + "grad_norm": 6.47480583190918, + "learning_rate": 2.5258612683263782e-06, + "loss": 0.0377, + "num_input_tokens_seen": 22020256, + "step": 11248 + }, + { + "epoch": 1.4909211398277005, + "grad_norm": 2.856600284576416, + "learning_rate": 2.5255141493422684e-06, + "loss": 0.0397, + "num_input_tokens_seen": 22023384, + "step": 11249 + }, + { + "epoch": 1.4910536779324055, + "grad_norm": 0.0670863464474678, + "learning_rate": 2.525167029866229e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22025376, + "step": 11250 + }, + { + "epoch": 1.4911862160371108, + "grad_norm": 24.312685012817383, + "learning_rate": 2.5248199099049527e-06, + "loss": 0.4785, + "num_input_tokens_seen": 22028736, + "step": 11251 + }, + { + "epoch": 1.4913187541418158, + "grad_norm": 7.973196983337402, + "learning_rate": 2.52447278946513e-06, + "loss": 0.2607, + "num_input_tokens_seen": 22031008, + "step": 11252 + }, + { + "epoch": 1.4914512922465208, + "grad_norm": 0.0632823035120964, + "learning_rate": 2.5241256685534556e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22033024, + "step": 11253 + }, + { + "epoch": 1.4915838303512259, + "grad_norm": 2.3455021381378174, + "learning_rate": 2.5237785471766213e-06, + "loss": 0.0234, + "num_input_tokens_seen": 22034880, + "step": 11254 + }, + { + "epoch": 1.4917163684559311, + "grad_norm": 6.446322917938232, + "learning_rate": 2.523431425341321e-06, + "loss": 0.1945, + "num_input_tokens_seen": 22036944, + "step": 11255 + }, + { + "epoch": 1.4918489065606362, + "grad_norm": 7.368457317352295, + "learning_rate": 2.5230843030542456e-06, + "loss": 0.1715, + "num_input_tokens_seen": 22039776, + "step": 11256 + }, + { + "epoch": 1.4919814446653412, + "grad_norm": 0.030115073546767235, + "learning_rate": 2.5227371803220896e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22041120, + "step": 11257 + }, + { + "epoch": 1.4921139827700465, + "grad_norm": 0.5102638602256775, + "learning_rate": 2.5223900571515452e-06, + "loss": 0.0043, + "num_input_tokens_seen": 22042616, + "step": 11258 + }, + { + "epoch": 1.4922465208747515, + "grad_norm": 7.951561450958252, + "learning_rate": 2.5220429335493046e-06, + "loss": 0.1131, + "num_input_tokens_seen": 22044288, + "step": 11259 + }, + { + "epoch": 1.4923790589794566, + "grad_norm": 9.365495681762695, + "learning_rate": 2.521695809522061e-06, + "loss": 0.1356, + "num_input_tokens_seen": 22045936, + "step": 11260 + }, + { + "epoch": 1.4925115970841616, + "grad_norm": 3.4460318088531494, + "learning_rate": 2.5213486850765085e-06, + "loss": 0.0487, + "num_input_tokens_seen": 22047864, + "step": 11261 + }, + { + "epoch": 1.4926441351888668, + "grad_norm": 3.424572467803955, + "learning_rate": 2.5210015602193373e-06, + "loss": 0.0704, + "num_input_tokens_seen": 22049240, + "step": 11262 + }, + { + "epoch": 1.4927766732935719, + "grad_norm": 4.318841457366943, + "learning_rate": 2.5206544349572416e-06, + "loss": 0.0785, + "num_input_tokens_seen": 22050688, + "step": 11263 + }, + { + "epoch": 1.492909211398277, + "grad_norm": 13.93221664428711, + "learning_rate": 2.520307309296915e-06, + "loss": 0.2348, + "num_input_tokens_seen": 22053128, + "step": 11264 + }, + { + "epoch": 1.4930417495029822, + "grad_norm": 7.366179466247559, + "learning_rate": 2.519960183245048e-06, + "loss": 0.0477, + "num_input_tokens_seen": 22055872, + "step": 11265 + }, + { + "epoch": 1.4931742876076872, + "grad_norm": 6.625668525695801, + "learning_rate": 2.519613056808336e-06, + "loss": 0.1607, + "num_input_tokens_seen": 22057984, + "step": 11266 + }, + { + "epoch": 1.4933068257123923, + "grad_norm": 10.712101936340332, + "learning_rate": 2.5192659299934703e-06, + "loss": 0.0988, + "num_input_tokens_seen": 22059464, + "step": 11267 + }, + { + "epoch": 1.4934393638170973, + "grad_norm": 16.251506805419922, + "learning_rate": 2.5189188028071444e-06, + "loss": 0.2147, + "num_input_tokens_seen": 22062080, + "step": 11268 + }, + { + "epoch": 1.4935719019218026, + "grad_norm": 5.481698036193848, + "learning_rate": 2.5185716752560515e-06, + "loss": 0.0641, + "num_input_tokens_seen": 22065144, + "step": 11269 + }, + { + "epoch": 1.4937044400265076, + "grad_norm": 19.051408767700195, + "learning_rate": 2.518224547346883e-06, + "loss": 0.6381, + "num_input_tokens_seen": 22067528, + "step": 11270 + }, + { + "epoch": 1.4938369781312129, + "grad_norm": 5.339697360992432, + "learning_rate": 2.5178774190863326e-06, + "loss": 0.134, + "num_input_tokens_seen": 22070088, + "step": 11271 + }, + { + "epoch": 1.493969516235918, + "grad_norm": 0.02127126231789589, + "learning_rate": 2.517530290481093e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22072176, + "step": 11272 + }, + { + "epoch": 1.494102054340623, + "grad_norm": 5.8056416511535645, + "learning_rate": 2.5171831615378587e-06, + "loss": 0.1708, + "num_input_tokens_seen": 22075088, + "step": 11273 + }, + { + "epoch": 1.494234592445328, + "grad_norm": 3.2386772632598877, + "learning_rate": 2.51683603226332e-06, + "loss": 0.0211, + "num_input_tokens_seen": 22076632, + "step": 11274 + }, + { + "epoch": 1.494367130550033, + "grad_norm": 0.04409939795732498, + "learning_rate": 2.5164889026641714e-06, + "loss": 0.0003, + "num_input_tokens_seen": 22078208, + "step": 11275 + }, + { + "epoch": 1.4944996686547383, + "grad_norm": 7.009223937988281, + "learning_rate": 2.5161417727471053e-06, + "loss": 0.1532, + "num_input_tokens_seen": 22079960, + "step": 11276 + }, + { + "epoch": 1.4946322067594433, + "grad_norm": 0.036608558148145676, + "learning_rate": 2.5157946425188153e-06, + "loss": 0.0002, + "num_input_tokens_seen": 22081944, + "step": 11277 + }, + { + "epoch": 1.4947647448641486, + "grad_norm": 1.2869271039962769, + "learning_rate": 2.5154475119859927e-06, + "loss": 0.0116, + "num_input_tokens_seen": 22083152, + "step": 11278 + }, + { + "epoch": 1.4948972829688536, + "grad_norm": 0.39250287413597107, + "learning_rate": 2.515100381155332e-06, + "loss": 0.0037, + "num_input_tokens_seen": 22084600, + "step": 11279 + }, + { + "epoch": 1.4950298210735586, + "grad_norm": 17.673362731933594, + "learning_rate": 2.514753250033525e-06, + "loss": 0.1486, + "num_input_tokens_seen": 22087552, + "step": 11280 + }, + { + "epoch": 1.4951623591782637, + "grad_norm": 4.3808441162109375, + "learning_rate": 2.5144061186272656e-06, + "loss": 0.0296, + "num_input_tokens_seen": 22088840, + "step": 11281 + }, + { + "epoch": 1.4952948972829687, + "grad_norm": 0.08708979189395905, + "learning_rate": 2.514058986943247e-06, + "loss": 0.0005, + "num_input_tokens_seen": 22091560, + "step": 11282 + }, + { + "epoch": 1.495427435387674, + "grad_norm": 5.112151622772217, + "learning_rate": 2.5137118549881594e-06, + "loss": 0.1537, + "num_input_tokens_seen": 22093336, + "step": 11283 + }, + { + "epoch": 1.495559973492379, + "grad_norm": 0.03771781921386719, + "learning_rate": 2.5133647227687e-06, + "loss": 0.0003, + "num_input_tokens_seen": 22094512, + "step": 11284 + }, + { + "epoch": 1.4956925115970843, + "grad_norm": 0.16863234341144562, + "learning_rate": 2.513017590291558e-06, + "loss": 0.0009, + "num_input_tokens_seen": 22097056, + "step": 11285 + }, + { + "epoch": 1.4958250497017893, + "grad_norm": 6.053509712219238, + "learning_rate": 2.512670457563428e-06, + "loss": 0.1273, + "num_input_tokens_seen": 22099080, + "step": 11286 + }, + { + "epoch": 1.4959575878064943, + "grad_norm": 0.22670598328113556, + "learning_rate": 2.512323324591004e-06, + "loss": 0.0009, + "num_input_tokens_seen": 22100424, + "step": 11287 + }, + { + "epoch": 1.4960901259111994, + "grad_norm": 11.104025840759277, + "learning_rate": 2.511976191380977e-06, + "loss": 0.2368, + "num_input_tokens_seen": 22102352, + "step": 11288 + }, + { + "epoch": 1.4962226640159046, + "grad_norm": 3.6719870567321777, + "learning_rate": 2.5116290579400406e-06, + "loss": 0.0383, + "num_input_tokens_seen": 22104312, + "step": 11289 + }, + { + "epoch": 1.4963552021206097, + "grad_norm": 0.06800150871276855, + "learning_rate": 2.5112819242748885e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22107704, + "step": 11290 + }, + { + "epoch": 1.4964877402253147, + "grad_norm": 8.735757827758789, + "learning_rate": 2.510934790392213e-06, + "loss": 0.2587, + "num_input_tokens_seen": 22109520, + "step": 11291 + }, + { + "epoch": 1.49662027833002, + "grad_norm": 9.229066848754883, + "learning_rate": 2.5105876562987064e-06, + "loss": 0.147, + "num_input_tokens_seen": 22111552, + "step": 11292 + }, + { + "epoch": 1.496752816434725, + "grad_norm": 1.3582638502120972, + "learning_rate": 2.510240522001063e-06, + "loss": 0.0128, + "num_input_tokens_seen": 22113072, + "step": 11293 + }, + { + "epoch": 1.49688535453943, + "grad_norm": 4.1831889152526855, + "learning_rate": 2.5098933875059757e-06, + "loss": 0.0381, + "num_input_tokens_seen": 22114664, + "step": 11294 + }, + { + "epoch": 1.497017892644135, + "grad_norm": 11.545395851135254, + "learning_rate": 2.5095462528201366e-06, + "loss": 0.1321, + "num_input_tokens_seen": 22117368, + "step": 11295 + }, + { + "epoch": 1.4971504307488404, + "grad_norm": 0.4734537601470947, + "learning_rate": 2.509199117950239e-06, + "loss": 0.0032, + "num_input_tokens_seen": 22119296, + "step": 11296 + }, + { + "epoch": 1.4972829688535454, + "grad_norm": 5.652725696563721, + "learning_rate": 2.508851982902977e-06, + "loss": 0.0973, + "num_input_tokens_seen": 22120976, + "step": 11297 + }, + { + "epoch": 1.4974155069582504, + "grad_norm": 7.9530744552612305, + "learning_rate": 2.508504847685041e-06, + "loss": 0.0364, + "num_input_tokens_seen": 22122320, + "step": 11298 + }, + { + "epoch": 1.4975480450629557, + "grad_norm": 5.118899345397949, + "learning_rate": 2.508157712303127e-06, + "loss": 0.2109, + "num_input_tokens_seen": 22124688, + "step": 11299 + }, + { + "epoch": 1.4976805831676607, + "grad_norm": 14.509099960327148, + "learning_rate": 2.5078105767639265e-06, + "loss": 0.388, + "num_input_tokens_seen": 22127072, + "step": 11300 + }, + { + "epoch": 1.4978131212723658, + "grad_norm": 12.065324783325195, + "learning_rate": 2.5074634410741323e-06, + "loss": 0.2531, + "num_input_tokens_seen": 22128688, + "step": 11301 + }, + { + "epoch": 1.4979456593770708, + "grad_norm": 6.663219928741455, + "learning_rate": 2.5071163052404378e-06, + "loss": 0.0614, + "num_input_tokens_seen": 22130416, + "step": 11302 + }, + { + "epoch": 1.498078197481776, + "grad_norm": 9.804076194763184, + "learning_rate": 2.5067691692695366e-06, + "loss": 0.221, + "num_input_tokens_seen": 22132512, + "step": 11303 + }, + { + "epoch": 1.498210735586481, + "grad_norm": 1.0664399862289429, + "learning_rate": 2.5064220331681204e-06, + "loss": 0.0051, + "num_input_tokens_seen": 22134480, + "step": 11304 + }, + { + "epoch": 1.4983432736911861, + "grad_norm": 0.10796388983726501, + "learning_rate": 2.506074896942884e-06, + "loss": 0.001, + "num_input_tokens_seen": 22136768, + "step": 11305 + }, + { + "epoch": 1.4984758117958914, + "grad_norm": 10.3233003616333, + "learning_rate": 2.5057277606005187e-06, + "loss": 0.2936, + "num_input_tokens_seen": 22139416, + "step": 11306 + }, + { + "epoch": 1.4986083499005964, + "grad_norm": 9.231203079223633, + "learning_rate": 2.505380624147718e-06, + "loss": 0.1681, + "num_input_tokens_seen": 22142064, + "step": 11307 + }, + { + "epoch": 1.4987408880053015, + "grad_norm": 12.606380462646484, + "learning_rate": 2.505033487591176e-06, + "loss": 0.3864, + "num_input_tokens_seen": 22144416, + "step": 11308 + }, + { + "epoch": 1.4988734261100065, + "grad_norm": 7.775861740112305, + "learning_rate": 2.504686350937584e-06, + "loss": 0.1466, + "num_input_tokens_seen": 22146768, + "step": 11309 + }, + { + "epoch": 1.4990059642147118, + "grad_norm": 7.8473005294799805, + "learning_rate": 2.5043392141936364e-06, + "loss": 0.0993, + "num_input_tokens_seen": 22148760, + "step": 11310 + }, + { + "epoch": 1.4991385023194168, + "grad_norm": 6.642422676086426, + "learning_rate": 2.5039920773660254e-06, + "loss": 0.0623, + "num_input_tokens_seen": 22150768, + "step": 11311 + }, + { + "epoch": 1.499271040424122, + "grad_norm": 1.4296715259552002, + "learning_rate": 2.503644940461445e-06, + "loss": 0.0065, + "num_input_tokens_seen": 22153208, + "step": 11312 + }, + { + "epoch": 1.499403578528827, + "grad_norm": 0.5597422122955322, + "learning_rate": 2.503297803486587e-06, + "loss": 0.0048, + "num_input_tokens_seen": 22155432, + "step": 11313 + }, + { + "epoch": 1.4995361166335321, + "grad_norm": 0.233292818069458, + "learning_rate": 2.502950666448145e-06, + "loss": 0.0016, + "num_input_tokens_seen": 22157696, + "step": 11314 + }, + { + "epoch": 1.4996686547382372, + "grad_norm": 0.19976887106895447, + "learning_rate": 2.502603529352813e-06, + "loss": 0.0013, + "num_input_tokens_seen": 22159360, + "step": 11315 + }, + { + "epoch": 1.4998011928429422, + "grad_norm": 4.656866550445557, + "learning_rate": 2.5022563922072825e-06, + "loss": 0.1073, + "num_input_tokens_seen": 22161176, + "step": 11316 + }, + { + "epoch": 1.4999337309476475, + "grad_norm": 12.248750686645508, + "learning_rate": 2.5019092550182467e-06, + "loss": 0.1918, + "num_input_tokens_seen": 22163416, + "step": 11317 + }, + { + "epoch": 1.5000662690523525, + "grad_norm": 23.958274841308594, + "learning_rate": 2.5015621177923998e-06, + "loss": 0.5868, + "num_input_tokens_seen": 22165064, + "step": 11318 + }, + { + "epoch": 1.5001988071570578, + "grad_norm": 16.156267166137695, + "learning_rate": 2.501214980536434e-06, + "loss": 0.4877, + "num_input_tokens_seen": 22167104, + "step": 11319 + }, + { + "epoch": 1.5003313452617628, + "grad_norm": 6.773266315460205, + "learning_rate": 2.500867843257043e-06, + "loss": 0.1135, + "num_input_tokens_seen": 22170016, + "step": 11320 + }, + { + "epoch": 1.5004638833664679, + "grad_norm": 8.740592956542969, + "learning_rate": 2.5005207059609187e-06, + "loss": 0.1243, + "num_input_tokens_seen": 22171416, + "step": 11321 + }, + { + "epoch": 1.500596421471173, + "grad_norm": 9.475445747375488, + "learning_rate": 2.5001735686547553e-06, + "loss": 0.0828, + "num_input_tokens_seen": 22173496, + "step": 11322 + }, + { + "epoch": 1.500728959575878, + "grad_norm": 0.007296200376003981, + "learning_rate": 2.499826431345245e-06, + "loss": 0.0, + "num_input_tokens_seen": 22174976, + "step": 11323 + }, + { + "epoch": 1.5008614976805832, + "grad_norm": 0.19035208225250244, + "learning_rate": 2.4994792940390817e-06, + "loss": 0.0018, + "num_input_tokens_seen": 22176624, + "step": 11324 + }, + { + "epoch": 1.5009940357852882, + "grad_norm": 6.67063570022583, + "learning_rate": 2.4991321567429577e-06, + "loss": 0.0483, + "num_input_tokens_seen": 22178128, + "step": 11325 + }, + { + "epoch": 1.5011265738899935, + "grad_norm": 4.840065002441406, + "learning_rate": 2.498785019463567e-06, + "loss": 0.1344, + "num_input_tokens_seen": 22180536, + "step": 11326 + }, + { + "epoch": 1.5012591119946985, + "grad_norm": 3.436634063720703, + "learning_rate": 2.498437882207601e-06, + "loss": 0.0638, + "num_input_tokens_seen": 22182216, + "step": 11327 + }, + { + "epoch": 1.5013916500994036, + "grad_norm": 0.12780870497226715, + "learning_rate": 2.4980907449817533e-06, + "loss": 0.0008, + "num_input_tokens_seen": 22184384, + "step": 11328 + }, + { + "epoch": 1.5015241882041086, + "grad_norm": 4.5274858474731445, + "learning_rate": 2.4977436077927184e-06, + "loss": 0.1023, + "num_input_tokens_seen": 22186648, + "step": 11329 + }, + { + "epoch": 1.5016567263088136, + "grad_norm": 4.441134452819824, + "learning_rate": 2.497396470647188e-06, + "loss": 0.0203, + "num_input_tokens_seen": 22188384, + "step": 11330 + }, + { + "epoch": 1.501789264413519, + "grad_norm": 17.046367645263672, + "learning_rate": 2.497049333551856e-06, + "loss": 0.5917, + "num_input_tokens_seen": 22190168, + "step": 11331 + }, + { + "epoch": 1.501921802518224, + "grad_norm": 6.989553928375244, + "learning_rate": 2.4967021965134143e-06, + "loss": 0.1428, + "num_input_tokens_seen": 22191912, + "step": 11332 + }, + { + "epoch": 1.5020543406229292, + "grad_norm": 1.971257209777832, + "learning_rate": 2.496355059538556e-06, + "loss": 0.0107, + "num_input_tokens_seen": 22194072, + "step": 11333 + }, + { + "epoch": 1.5021868787276342, + "grad_norm": 0.024518702179193497, + "learning_rate": 2.496007922633975e-06, + "loss": 0.0002, + "num_input_tokens_seen": 22195328, + "step": 11334 + }, + { + "epoch": 1.5023194168323393, + "grad_norm": 4.5171427726745605, + "learning_rate": 2.495660785806365e-06, + "loss": 0.1023, + "num_input_tokens_seen": 22197688, + "step": 11335 + }, + { + "epoch": 1.5024519549370443, + "grad_norm": 0.1347322165966034, + "learning_rate": 2.4953136490624165e-06, + "loss": 0.001, + "num_input_tokens_seen": 22199952, + "step": 11336 + }, + { + "epoch": 1.5025844930417493, + "grad_norm": 0.06904023885726929, + "learning_rate": 2.4949665124088254e-06, + "loss": 0.0003, + "num_input_tokens_seen": 22201776, + "step": 11337 + }, + { + "epoch": 1.5027170311464546, + "grad_norm": 6.8512864112854, + "learning_rate": 2.494619375852283e-06, + "loss": 0.1023, + "num_input_tokens_seen": 22204408, + "step": 11338 + }, + { + "epoch": 1.5028495692511599, + "grad_norm": 12.186630249023438, + "learning_rate": 2.494272239399482e-06, + "loss": 0.3112, + "num_input_tokens_seen": 22206568, + "step": 11339 + }, + { + "epoch": 1.502982107355865, + "grad_norm": 5.696072101593018, + "learning_rate": 2.4939251030571168e-06, + "loss": 0.1162, + "num_input_tokens_seen": 22209192, + "step": 11340 + }, + { + "epoch": 1.50311464546057, + "grad_norm": 9.824942588806152, + "learning_rate": 2.49357796683188e-06, + "loss": 0.1622, + "num_input_tokens_seen": 22210696, + "step": 11341 + }, + { + "epoch": 1.503247183565275, + "grad_norm": 11.528942108154297, + "learning_rate": 2.493230830730464e-06, + "loss": 0.3344, + "num_input_tokens_seen": 22212600, + "step": 11342 + }, + { + "epoch": 1.50337972166998, + "grad_norm": 11.75717544555664, + "learning_rate": 2.492883694759563e-06, + "loss": 0.059, + "num_input_tokens_seen": 22214792, + "step": 11343 + }, + { + "epoch": 1.503512259774685, + "grad_norm": 0.1370205283164978, + "learning_rate": 2.492536558925868e-06, + "loss": 0.001, + "num_input_tokens_seen": 22216464, + "step": 11344 + }, + { + "epoch": 1.5036447978793903, + "grad_norm": 12.367904663085938, + "learning_rate": 2.492189423236074e-06, + "loss": 0.2596, + "num_input_tokens_seen": 22218528, + "step": 11345 + }, + { + "epoch": 1.5037773359840956, + "grad_norm": 9.471757888793945, + "learning_rate": 2.4918422876968737e-06, + "loss": 0.1572, + "num_input_tokens_seen": 22220056, + "step": 11346 + }, + { + "epoch": 1.5039098740888006, + "grad_norm": 9.884944915771484, + "learning_rate": 2.49149515231496e-06, + "loss": 0.3949, + "num_input_tokens_seen": 22222008, + "step": 11347 + }, + { + "epoch": 1.5040424121935057, + "grad_norm": 0.5098044872283936, + "learning_rate": 2.4911480170970244e-06, + "loss": 0.0031, + "num_input_tokens_seen": 22224440, + "step": 11348 + }, + { + "epoch": 1.5041749502982107, + "grad_norm": 5.8175048828125, + "learning_rate": 2.490800882049761e-06, + "loss": 0.0695, + "num_input_tokens_seen": 22226640, + "step": 11349 + }, + { + "epoch": 1.5043074884029157, + "grad_norm": 1.5691171884536743, + "learning_rate": 2.4904537471798642e-06, + "loss": 0.0155, + "num_input_tokens_seen": 22228184, + "step": 11350 + }, + { + "epoch": 1.504440026507621, + "grad_norm": 4.199514389038086, + "learning_rate": 2.490106612494025e-06, + "loss": 0.0582, + "num_input_tokens_seen": 22230208, + "step": 11351 + }, + { + "epoch": 1.504572564612326, + "grad_norm": 0.05354466661810875, + "learning_rate": 2.489759477998938e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22232216, + "step": 11352 + }, + { + "epoch": 1.5047051027170313, + "grad_norm": 11.919499397277832, + "learning_rate": 2.489412343701295e-06, + "loss": 0.1582, + "num_input_tokens_seen": 22235016, + "step": 11353 + }, + { + "epoch": 1.5048376408217363, + "grad_norm": 8.071243286132812, + "learning_rate": 2.489065209607788e-06, + "loss": 0.1582, + "num_input_tokens_seen": 22237776, + "step": 11354 + }, + { + "epoch": 1.5049701789264414, + "grad_norm": 11.218445777893066, + "learning_rate": 2.4887180757251123e-06, + "loss": 0.1551, + "num_input_tokens_seen": 22239856, + "step": 11355 + }, + { + "epoch": 1.5051027170311464, + "grad_norm": 12.45864486694336, + "learning_rate": 2.48837094205996e-06, + "loss": 0.3021, + "num_input_tokens_seen": 22242200, + "step": 11356 + }, + { + "epoch": 1.5052352551358514, + "grad_norm": 0.0641266256570816, + "learning_rate": 2.488023808619024e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22243768, + "step": 11357 + }, + { + "epoch": 1.5053677932405567, + "grad_norm": 5.266931056976318, + "learning_rate": 2.487676675408997e-06, + "loss": 0.1245, + "num_input_tokens_seen": 22245696, + "step": 11358 + }, + { + "epoch": 1.5055003313452617, + "grad_norm": 5.88974666595459, + "learning_rate": 2.487329542436572e-06, + "loss": 0.0891, + "num_input_tokens_seen": 22247288, + "step": 11359 + }, + { + "epoch": 1.505632869449967, + "grad_norm": 1.401282787322998, + "learning_rate": 2.4869824097084424e-06, + "loss": 0.0098, + "num_input_tokens_seen": 22249200, + "step": 11360 + }, + { + "epoch": 1.505765407554672, + "grad_norm": 16.31235694885254, + "learning_rate": 2.486635277231301e-06, + "loss": 0.1813, + "num_input_tokens_seen": 22251648, + "step": 11361 + }, + { + "epoch": 1.505897945659377, + "grad_norm": 6.742690086364746, + "learning_rate": 2.486288145011841e-06, + "loss": 0.11, + "num_input_tokens_seen": 22254248, + "step": 11362 + }, + { + "epoch": 1.506030483764082, + "grad_norm": 5.9086689949035645, + "learning_rate": 2.485941013056755e-06, + "loss": 0.0669, + "num_input_tokens_seen": 22256360, + "step": 11363 + }, + { + "epoch": 1.5061630218687871, + "grad_norm": 10.00708293914795, + "learning_rate": 2.4855938813727344e-06, + "loss": 0.2642, + "num_input_tokens_seen": 22258288, + "step": 11364 + }, + { + "epoch": 1.5062955599734924, + "grad_norm": 4.125768184661865, + "learning_rate": 2.485246749966475e-06, + "loss": 0.0427, + "num_input_tokens_seen": 22260032, + "step": 11365 + }, + { + "epoch": 1.5064280980781974, + "grad_norm": 3.1132164001464844, + "learning_rate": 2.4848996188446687e-06, + "loss": 0.0146, + "num_input_tokens_seen": 22262440, + "step": 11366 + }, + { + "epoch": 1.5065606361829027, + "grad_norm": 5.616835594177246, + "learning_rate": 2.484552488014008e-06, + "loss": 0.1172, + "num_input_tokens_seen": 22264552, + "step": 11367 + }, + { + "epoch": 1.5066931742876077, + "grad_norm": 2.5037827491760254, + "learning_rate": 2.4842053574811864e-06, + "loss": 0.042, + "num_input_tokens_seen": 22266056, + "step": 11368 + }, + { + "epoch": 1.5068257123923128, + "grad_norm": 5.691380023956299, + "learning_rate": 2.4838582272528955e-06, + "loss": 0.0329, + "num_input_tokens_seen": 22268304, + "step": 11369 + }, + { + "epoch": 1.5069582504970178, + "grad_norm": 12.696510314941406, + "learning_rate": 2.4835110973358286e-06, + "loss": 0.2086, + "num_input_tokens_seen": 22269872, + "step": 11370 + }, + { + "epoch": 1.5070907886017229, + "grad_norm": 0.20764188468456268, + "learning_rate": 2.4831639677366807e-06, + "loss": 0.0013, + "num_input_tokens_seen": 22271280, + "step": 11371 + }, + { + "epoch": 1.5072233267064281, + "grad_norm": 0.033183302730321884, + "learning_rate": 2.482816838462142e-06, + "loss": 0.0002, + "num_input_tokens_seen": 22272664, + "step": 11372 + }, + { + "epoch": 1.5073558648111332, + "grad_norm": 5.8083977699279785, + "learning_rate": 2.4824697095189073e-06, + "loss": 0.1131, + "num_input_tokens_seen": 22274440, + "step": 11373 + }, + { + "epoch": 1.5074884029158384, + "grad_norm": 0.038225311785936356, + "learning_rate": 2.4821225809136683e-06, + "loss": 0.0003, + "num_input_tokens_seen": 22276864, + "step": 11374 + }, + { + "epoch": 1.5076209410205434, + "grad_norm": 10.296028137207031, + "learning_rate": 2.4817754526531175e-06, + "loss": 0.1572, + "num_input_tokens_seen": 22279728, + "step": 11375 + }, + { + "epoch": 1.5077534791252485, + "grad_norm": 7.478351593017578, + "learning_rate": 2.4814283247439494e-06, + "loss": 0.201, + "num_input_tokens_seen": 22281496, + "step": 11376 + }, + { + "epoch": 1.5078860172299535, + "grad_norm": 3.580753803253174, + "learning_rate": 2.4810811971928564e-06, + "loss": 0.0797, + "num_input_tokens_seen": 22283240, + "step": 11377 + }, + { + "epoch": 1.5080185553346586, + "grad_norm": 17.45608901977539, + "learning_rate": 2.48073407000653e-06, + "loss": 0.3048, + "num_input_tokens_seen": 22285456, + "step": 11378 + }, + { + "epoch": 1.5081510934393638, + "grad_norm": 8.84461784362793, + "learning_rate": 2.480386943191665e-06, + "loss": 0.1207, + "num_input_tokens_seen": 22287304, + "step": 11379 + }, + { + "epoch": 1.508283631544069, + "grad_norm": 1.9499233961105347, + "learning_rate": 2.4800398167549524e-06, + "loss": 0.0237, + "num_input_tokens_seen": 22289664, + "step": 11380 + }, + { + "epoch": 1.5084161696487741, + "grad_norm": 11.147247314453125, + "learning_rate": 2.479692690703086e-06, + "loss": 0.1485, + "num_input_tokens_seen": 22291472, + "step": 11381 + }, + { + "epoch": 1.5085487077534792, + "grad_norm": 0.19540995359420776, + "learning_rate": 2.479345565042759e-06, + "loss": 0.0014, + "num_input_tokens_seen": 22292472, + "step": 11382 + }, + { + "epoch": 1.5086812458581842, + "grad_norm": 6.15656042098999, + "learning_rate": 2.4789984397806635e-06, + "loss": 0.1339, + "num_input_tokens_seen": 22294344, + "step": 11383 + }, + { + "epoch": 1.5088137839628892, + "grad_norm": 0.4155294597148895, + "learning_rate": 2.4786513149234932e-06, + "loss": 0.0018, + "num_input_tokens_seen": 22296048, + "step": 11384 + }, + { + "epoch": 1.5089463220675943, + "grad_norm": 0.4467773139476776, + "learning_rate": 2.4783041904779386e-06, + "loss": 0.0024, + "num_input_tokens_seen": 22297952, + "step": 11385 + }, + { + "epoch": 1.5090788601722995, + "grad_norm": 0.2392825037240982, + "learning_rate": 2.477957066450696e-06, + "loss": 0.0018, + "num_input_tokens_seen": 22299976, + "step": 11386 + }, + { + "epoch": 1.5092113982770048, + "grad_norm": 0.013642658479511738, + "learning_rate": 2.4776099428484556e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22301280, + "step": 11387 + }, + { + "epoch": 1.5093439363817098, + "grad_norm": 6.525798797607422, + "learning_rate": 2.4772628196779113e-06, + "loss": 0.1094, + "num_input_tokens_seen": 22302816, + "step": 11388 + }, + { + "epoch": 1.5094764744864149, + "grad_norm": 9.64405345916748, + "learning_rate": 2.4769156969457548e-06, + "loss": 0.1949, + "num_input_tokens_seen": 22305040, + "step": 11389 + }, + { + "epoch": 1.50960901259112, + "grad_norm": 0.1423177272081375, + "learning_rate": 2.4765685746586794e-06, + "loss": 0.0009, + "num_input_tokens_seen": 22306472, + "step": 11390 + }, + { + "epoch": 1.509741550695825, + "grad_norm": 13.29039192199707, + "learning_rate": 2.4762214528233787e-06, + "loss": 0.2779, + "num_input_tokens_seen": 22309280, + "step": 11391 + }, + { + "epoch": 1.5098740888005302, + "grad_norm": 4.106523036956787, + "learning_rate": 2.4758743314465452e-06, + "loss": 0.0998, + "num_input_tokens_seen": 22310832, + "step": 11392 + }, + { + "epoch": 1.5100066269052352, + "grad_norm": 2.65285062789917, + "learning_rate": 2.475527210534871e-06, + "loss": 0.0604, + "num_input_tokens_seen": 22312120, + "step": 11393 + }, + { + "epoch": 1.5101391650099405, + "grad_norm": 0.539609968662262, + "learning_rate": 2.475180090095049e-06, + "loss": 0.0021, + "num_input_tokens_seen": 22313296, + "step": 11394 + }, + { + "epoch": 1.5102717031146455, + "grad_norm": 0.15880119800567627, + "learning_rate": 2.4748329701337715e-06, + "loss": 0.001, + "num_input_tokens_seen": 22315272, + "step": 11395 + }, + { + "epoch": 1.5104042412193506, + "grad_norm": 5.109264850616455, + "learning_rate": 2.474485850657731e-06, + "loss": 0.0904, + "num_input_tokens_seen": 22317048, + "step": 11396 + }, + { + "epoch": 1.5105367793240556, + "grad_norm": 5.510128021240234, + "learning_rate": 2.4741387316736226e-06, + "loss": 0.1304, + "num_input_tokens_seen": 22318432, + "step": 11397 + }, + { + "epoch": 1.5106693174287606, + "grad_norm": 11.299368858337402, + "learning_rate": 2.4737916131881365e-06, + "loss": 0.1552, + "num_input_tokens_seen": 22320624, + "step": 11398 + }, + { + "epoch": 1.510801855533466, + "grad_norm": 6.239479064941406, + "learning_rate": 2.4734444952079665e-06, + "loss": 0.0913, + "num_input_tokens_seen": 22322336, + "step": 11399 + }, + { + "epoch": 1.510934393638171, + "grad_norm": 8.359880447387695, + "learning_rate": 2.473097377739805e-06, + "loss": 0.1226, + "num_input_tokens_seen": 22324336, + "step": 11400 + }, + { + "epoch": 1.5110669317428762, + "grad_norm": 0.19654779136180878, + "learning_rate": 2.472750260790344e-06, + "loss": 0.0012, + "num_input_tokens_seen": 22326736, + "step": 11401 + }, + { + "epoch": 1.5111994698475812, + "grad_norm": 0.013369173742830753, + "learning_rate": 2.4724031443662767e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22328184, + "step": 11402 + }, + { + "epoch": 1.5113320079522863, + "grad_norm": 1.897829294204712, + "learning_rate": 2.4720560284742967e-06, + "loss": 0.023, + "num_input_tokens_seen": 22329968, + "step": 11403 + }, + { + "epoch": 1.5114645460569913, + "grad_norm": 0.008718353696167469, + "learning_rate": 2.4717089131210952e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22331864, + "step": 11404 + }, + { + "epoch": 1.5115970841616964, + "grad_norm": 12.930502891540527, + "learning_rate": 2.471361798313366e-06, + "loss": 0.1683, + "num_input_tokens_seen": 22333784, + "step": 11405 + }, + { + "epoch": 1.5117296222664016, + "grad_norm": 5.951053619384766, + "learning_rate": 2.4710146840578e-06, + "loss": 0.0765, + "num_input_tokens_seen": 22336264, + "step": 11406 + }, + { + "epoch": 1.5118621603711067, + "grad_norm": 3.437422513961792, + "learning_rate": 2.470667570361091e-06, + "loss": 0.0255, + "num_input_tokens_seen": 22339152, + "step": 11407 + }, + { + "epoch": 1.511994698475812, + "grad_norm": 0.0223418939858675, + "learning_rate": 2.470320457229932e-06, + "loss": 0.0002, + "num_input_tokens_seen": 22341608, + "step": 11408 + }, + { + "epoch": 1.512127236580517, + "grad_norm": 0.13284729421138763, + "learning_rate": 2.4699733446710156e-06, + "loss": 0.001, + "num_input_tokens_seen": 22343456, + "step": 11409 + }, + { + "epoch": 1.512259774685222, + "grad_norm": 0.01674438640475273, + "learning_rate": 2.469626232691033e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22345160, + "step": 11410 + }, + { + "epoch": 1.512392312789927, + "grad_norm": 13.227300643920898, + "learning_rate": 2.4692791212966773e-06, + "loss": 0.5566, + "num_input_tokens_seen": 22347152, + "step": 11411 + }, + { + "epoch": 1.512524850894632, + "grad_norm": 3.151787281036377, + "learning_rate": 2.468932010494642e-06, + "loss": 0.0134, + "num_input_tokens_seen": 22348592, + "step": 11412 + }, + { + "epoch": 1.5126573889993373, + "grad_norm": 1.0811048746109009, + "learning_rate": 2.4685849002916184e-06, + "loss": 0.0103, + "num_input_tokens_seen": 22350464, + "step": 11413 + }, + { + "epoch": 1.5127899271040424, + "grad_norm": 8.103339195251465, + "learning_rate": 2.4682377906943006e-06, + "loss": 0.1068, + "num_input_tokens_seen": 22352208, + "step": 11414 + }, + { + "epoch": 1.5129224652087476, + "grad_norm": 6.376973628997803, + "learning_rate": 2.4678906817093796e-06, + "loss": 0.1227, + "num_input_tokens_seen": 22355856, + "step": 11415 + }, + { + "epoch": 1.5130550033134527, + "grad_norm": 6.41684627532959, + "learning_rate": 2.4675435733435477e-06, + "loss": 0.1571, + "num_input_tokens_seen": 22357024, + "step": 11416 + }, + { + "epoch": 1.5131875414181577, + "grad_norm": 0.519968569278717, + "learning_rate": 2.4671964656034988e-06, + "loss": 0.0054, + "num_input_tokens_seen": 22358864, + "step": 11417 + }, + { + "epoch": 1.5133200795228627, + "grad_norm": 3.497556209564209, + "learning_rate": 2.4668493584959246e-06, + "loss": 0.0234, + "num_input_tokens_seen": 22360144, + "step": 11418 + }, + { + "epoch": 1.5134526176275678, + "grad_norm": 7.769497871398926, + "learning_rate": 2.4665022520275177e-06, + "loss": 0.1806, + "num_input_tokens_seen": 22362144, + "step": 11419 + }, + { + "epoch": 1.513585155732273, + "grad_norm": 0.045171912759542465, + "learning_rate": 2.466155146204971e-06, + "loss": 0.0003, + "num_input_tokens_seen": 22364552, + "step": 11420 + }, + { + "epoch": 1.5137176938369783, + "grad_norm": 0.009083000011742115, + "learning_rate": 2.465808041034975e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22366112, + "step": 11421 + }, + { + "epoch": 1.5138502319416833, + "grad_norm": 9.206603050231934, + "learning_rate": 2.4654609365242246e-06, + "loss": 0.1156, + "num_input_tokens_seen": 22367856, + "step": 11422 + }, + { + "epoch": 1.5139827700463884, + "grad_norm": 0.048868078738451004, + "learning_rate": 2.4651138326794107e-06, + "loss": 0.0003, + "num_input_tokens_seen": 22370488, + "step": 11423 + }, + { + "epoch": 1.5141153081510934, + "grad_norm": 0.015876734629273415, + "learning_rate": 2.464766729507227e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22371896, + "step": 11424 + }, + { + "epoch": 1.5142478462557984, + "grad_norm": 3.494384765625, + "learning_rate": 2.464419627014364e-06, + "loss": 0.0091, + "num_input_tokens_seen": 22372992, + "step": 11425 + }, + { + "epoch": 1.5143803843605035, + "grad_norm": 13.920032501220703, + "learning_rate": 2.4640725252075163e-06, + "loss": 0.1951, + "num_input_tokens_seen": 22375128, + "step": 11426 + }, + { + "epoch": 1.5145129224652087, + "grad_norm": 6.217750072479248, + "learning_rate": 2.4637254240933737e-06, + "loss": 0.0553, + "num_input_tokens_seen": 22376912, + "step": 11427 + }, + { + "epoch": 1.514645460569914, + "grad_norm": 9.408684730529785, + "learning_rate": 2.4633783236786303e-06, + "loss": 0.1186, + "num_input_tokens_seen": 22378952, + "step": 11428 + }, + { + "epoch": 1.514777998674619, + "grad_norm": 4.269500255584717, + "learning_rate": 2.4630312239699788e-06, + "loss": 0.0627, + "num_input_tokens_seen": 22381456, + "step": 11429 + }, + { + "epoch": 1.514910536779324, + "grad_norm": 13.82970142364502, + "learning_rate": 2.462684124974111e-06, + "loss": 0.1542, + "num_input_tokens_seen": 22383128, + "step": 11430 + }, + { + "epoch": 1.5150430748840291, + "grad_norm": 8.828168869018555, + "learning_rate": 2.4623370266977184e-06, + "loss": 0.1288, + "num_input_tokens_seen": 22384832, + "step": 11431 + }, + { + "epoch": 1.5151756129887342, + "grad_norm": 16.929853439331055, + "learning_rate": 2.461989929147493e-06, + "loss": 0.3295, + "num_input_tokens_seen": 22386704, + "step": 11432 + }, + { + "epoch": 1.5153081510934394, + "grad_norm": 5.446898460388184, + "learning_rate": 2.4616428323301296e-06, + "loss": 0.1645, + "num_input_tokens_seen": 22388280, + "step": 11433 + }, + { + "epoch": 1.5154406891981445, + "grad_norm": 3.7495453357696533, + "learning_rate": 2.461295736252318e-06, + "loss": 0.0459, + "num_input_tokens_seen": 22389760, + "step": 11434 + }, + { + "epoch": 1.5155732273028497, + "grad_norm": 0.29297253489494324, + "learning_rate": 2.4609486409207518e-06, + "loss": 0.0021, + "num_input_tokens_seen": 22391232, + "step": 11435 + }, + { + "epoch": 1.5157057654075548, + "grad_norm": 0.07760973274707794, + "learning_rate": 2.4606015463421222e-06, + "loss": 0.0005, + "num_input_tokens_seen": 22393704, + "step": 11436 + }, + { + "epoch": 1.5158383035122598, + "grad_norm": 0.3208957314491272, + "learning_rate": 2.4602544525231215e-06, + "loss": 0.003, + "num_input_tokens_seen": 22394752, + "step": 11437 + }, + { + "epoch": 1.5159708416169648, + "grad_norm": 0.01919209584593773, + "learning_rate": 2.459907359470444e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22396840, + "step": 11438 + }, + { + "epoch": 1.5161033797216699, + "grad_norm": 6.300719738006592, + "learning_rate": 2.4595602671907795e-06, + "loss": 0.0483, + "num_input_tokens_seen": 22398016, + "step": 11439 + }, + { + "epoch": 1.5162359178263751, + "grad_norm": 4.877910614013672, + "learning_rate": 2.4592131756908203e-06, + "loss": 0.0982, + "num_input_tokens_seen": 22399688, + "step": 11440 + }, + { + "epoch": 1.5163684559310802, + "grad_norm": 9.544331550598145, + "learning_rate": 2.4588660849772605e-06, + "loss": 0.1411, + "num_input_tokens_seen": 22401344, + "step": 11441 + }, + { + "epoch": 1.5165009940357854, + "grad_norm": 1.895017147064209, + "learning_rate": 2.4585189950567896e-06, + "loss": 0.0436, + "num_input_tokens_seen": 22403640, + "step": 11442 + }, + { + "epoch": 1.5166335321404905, + "grad_norm": 8.639747619628906, + "learning_rate": 2.4581719059361015e-06, + "loss": 0.1444, + "num_input_tokens_seen": 22404688, + "step": 11443 + }, + { + "epoch": 1.5167660702451955, + "grad_norm": 1.1425280570983887, + "learning_rate": 2.4578248176218883e-06, + "loss": 0.0084, + "num_input_tokens_seen": 22406688, + "step": 11444 + }, + { + "epoch": 1.5168986083499005, + "grad_norm": 6.7906174659729, + "learning_rate": 2.457477730120842e-06, + "loss": 0.0381, + "num_input_tokens_seen": 22408248, + "step": 11445 + }, + { + "epoch": 1.5170311464546056, + "grad_norm": 9.517753601074219, + "learning_rate": 2.4571306434396538e-06, + "loss": 0.2562, + "num_input_tokens_seen": 22410584, + "step": 11446 + }, + { + "epoch": 1.5171636845593108, + "grad_norm": 0.013001424260437489, + "learning_rate": 2.4567835575850155e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22411752, + "step": 11447 + }, + { + "epoch": 1.5172962226640159, + "grad_norm": 10.140884399414062, + "learning_rate": 2.4564364725636213e-06, + "loss": 0.1911, + "num_input_tokens_seen": 22414616, + "step": 11448 + }, + { + "epoch": 1.5174287607687211, + "grad_norm": 1.1815012693405151, + "learning_rate": 2.4560893883821615e-06, + "loss": 0.01, + "num_input_tokens_seen": 22415856, + "step": 11449 + }, + { + "epoch": 1.5175612988734262, + "grad_norm": 1.2157396078109741, + "learning_rate": 2.4557423050473293e-06, + "loss": 0.0248, + "num_input_tokens_seen": 22417336, + "step": 11450 + }, + { + "epoch": 1.5176938369781312, + "grad_norm": 7.1166510581970215, + "learning_rate": 2.4553952225658155e-06, + "loss": 0.1643, + "num_input_tokens_seen": 22418880, + "step": 11451 + }, + { + "epoch": 1.5178263750828362, + "grad_norm": 1.477338433265686, + "learning_rate": 2.455048140944313e-06, + "loss": 0.0216, + "num_input_tokens_seen": 22421344, + "step": 11452 + }, + { + "epoch": 1.5179589131875413, + "grad_norm": 4.89297342300415, + "learning_rate": 2.4547010601895118e-06, + "loss": 0.1136, + "num_input_tokens_seen": 22423656, + "step": 11453 + }, + { + "epoch": 1.5180914512922465, + "grad_norm": 7.648576736450195, + "learning_rate": 2.4543539803081066e-06, + "loss": 0.1621, + "num_input_tokens_seen": 22425856, + "step": 11454 + }, + { + "epoch": 1.5182239893969516, + "grad_norm": 7.1625590324401855, + "learning_rate": 2.454006901306788e-06, + "loss": 0.1007, + "num_input_tokens_seen": 22427728, + "step": 11455 + }, + { + "epoch": 1.5183565275016568, + "grad_norm": 11.234631538391113, + "learning_rate": 2.453659823192249e-06, + "loss": 0.1801, + "num_input_tokens_seen": 22429960, + "step": 11456 + }, + { + "epoch": 1.5184890656063619, + "grad_norm": 4.8884053230285645, + "learning_rate": 2.453312745971179e-06, + "loss": 0.0665, + "num_input_tokens_seen": 22432608, + "step": 11457 + }, + { + "epoch": 1.518621603711067, + "grad_norm": 0.7603376507759094, + "learning_rate": 2.4529656696502714e-06, + "loss": 0.0031, + "num_input_tokens_seen": 22433648, + "step": 11458 + }, + { + "epoch": 1.518754141815772, + "grad_norm": 13.671935081481934, + "learning_rate": 2.452618594236219e-06, + "loss": 0.268, + "num_input_tokens_seen": 22435272, + "step": 11459 + }, + { + "epoch": 1.518886679920477, + "grad_norm": 20.833864212036133, + "learning_rate": 2.4522715197357137e-06, + "loss": 0.5292, + "num_input_tokens_seen": 22437528, + "step": 11460 + }, + { + "epoch": 1.5190192180251822, + "grad_norm": 7.143860816955566, + "learning_rate": 2.451924446155445e-06, + "loss": 0.047, + "num_input_tokens_seen": 22439872, + "step": 11461 + }, + { + "epoch": 1.5191517561298873, + "grad_norm": 12.772725105285645, + "learning_rate": 2.451577373502107e-06, + "loss": 0.3187, + "num_input_tokens_seen": 22441984, + "step": 11462 + }, + { + "epoch": 1.5192842942345925, + "grad_norm": 5.707908630371094, + "learning_rate": 2.4512303017823896e-06, + "loss": 0.0575, + "num_input_tokens_seen": 22443760, + "step": 11463 + }, + { + "epoch": 1.5194168323392976, + "grad_norm": 11.056180953979492, + "learning_rate": 2.4508832310029857e-06, + "loss": 0.294, + "num_input_tokens_seen": 22446008, + "step": 11464 + }, + { + "epoch": 1.5195493704440026, + "grad_norm": 6.699303150177002, + "learning_rate": 2.450536161170588e-06, + "loss": 0.1604, + "num_input_tokens_seen": 22448352, + "step": 11465 + }, + { + "epoch": 1.5196819085487077, + "grad_norm": 0.10472378879785538, + "learning_rate": 2.450189092291887e-06, + "loss": 0.0007, + "num_input_tokens_seen": 22451504, + "step": 11466 + }, + { + "epoch": 1.5198144466534127, + "grad_norm": 0.6150741577148438, + "learning_rate": 2.4498420243735744e-06, + "loss": 0.0011, + "num_input_tokens_seen": 22453656, + "step": 11467 + }, + { + "epoch": 1.519946984758118, + "grad_norm": 0.3481442928314209, + "learning_rate": 2.4494949574223415e-06, + "loss": 0.0025, + "num_input_tokens_seen": 22454768, + "step": 11468 + }, + { + "epoch": 1.5200795228628232, + "grad_norm": 7.7076029777526855, + "learning_rate": 2.4491478914448818e-06, + "loss": 0.1037, + "num_input_tokens_seen": 22456544, + "step": 11469 + }, + { + "epoch": 1.5202120609675283, + "grad_norm": 3.316929578781128, + "learning_rate": 2.4488008264478848e-06, + "loss": 0.0368, + "num_input_tokens_seen": 22458456, + "step": 11470 + }, + { + "epoch": 1.5203445990722333, + "grad_norm": 0.1612437516450882, + "learning_rate": 2.4484537624380445e-06, + "loss": 0.001, + "num_input_tokens_seen": 22459920, + "step": 11471 + }, + { + "epoch": 1.5204771371769383, + "grad_norm": 3.084296464920044, + "learning_rate": 2.44810669942205e-06, + "loss": 0.0345, + "num_input_tokens_seen": 22463040, + "step": 11472 + }, + { + "epoch": 1.5206096752816434, + "grad_norm": 3.710674524307251, + "learning_rate": 2.447759637406594e-06, + "loss": 0.0274, + "num_input_tokens_seen": 22464744, + "step": 11473 + }, + { + "epoch": 1.5207422133863486, + "grad_norm": 18.078521728515625, + "learning_rate": 2.4474125763983693e-06, + "loss": 0.4338, + "num_input_tokens_seen": 22466776, + "step": 11474 + }, + { + "epoch": 1.5208747514910537, + "grad_norm": 7.830974102020264, + "learning_rate": 2.4470655164040657e-06, + "loss": 0.1839, + "num_input_tokens_seen": 22470352, + "step": 11475 + }, + { + "epoch": 1.521007289595759, + "grad_norm": 1.2614049911499023, + "learning_rate": 2.4467184574303763e-06, + "loss": 0.016, + "num_input_tokens_seen": 22471632, + "step": 11476 + }, + { + "epoch": 1.521139827700464, + "grad_norm": 7.095685005187988, + "learning_rate": 2.4463713994839913e-06, + "loss": 0.1117, + "num_input_tokens_seen": 22474816, + "step": 11477 + }, + { + "epoch": 1.521272365805169, + "grad_norm": 20.29723358154297, + "learning_rate": 2.446024342571602e-06, + "loss": 0.2885, + "num_input_tokens_seen": 22476416, + "step": 11478 + }, + { + "epoch": 1.521404903909874, + "grad_norm": 0.24325886368751526, + "learning_rate": 2.445677286699901e-06, + "loss": 0.0012, + "num_input_tokens_seen": 22477736, + "step": 11479 + }, + { + "epoch": 1.521537442014579, + "grad_norm": 0.00675522955134511, + "learning_rate": 2.44533023187558e-06, + "loss": 0.0, + "num_input_tokens_seen": 22479208, + "step": 11480 + }, + { + "epoch": 1.5216699801192843, + "grad_norm": 7.187641143798828, + "learning_rate": 2.4449831781053295e-06, + "loss": 0.1026, + "num_input_tokens_seen": 22481136, + "step": 11481 + }, + { + "epoch": 1.5218025182239894, + "grad_norm": 2.886211395263672, + "learning_rate": 2.4446361253958413e-06, + "loss": 0.0116, + "num_input_tokens_seen": 22482368, + "step": 11482 + }, + { + "epoch": 1.5219350563286946, + "grad_norm": 12.720470428466797, + "learning_rate": 2.4442890737538073e-06, + "loss": 0.14, + "num_input_tokens_seen": 22484624, + "step": 11483 + }, + { + "epoch": 1.5220675944333997, + "grad_norm": 8.339862823486328, + "learning_rate": 2.443942023185917e-06, + "loss": 0.1009, + "num_input_tokens_seen": 22487608, + "step": 11484 + }, + { + "epoch": 1.5222001325381047, + "grad_norm": 6.182653427124023, + "learning_rate": 2.443594973698864e-06, + "loss": 0.1349, + "num_input_tokens_seen": 22489672, + "step": 11485 + }, + { + "epoch": 1.5223326706428097, + "grad_norm": 24.37546730041504, + "learning_rate": 2.443247925299339e-06, + "loss": 0.0853, + "num_input_tokens_seen": 22491096, + "step": 11486 + }, + { + "epoch": 1.5224652087475148, + "grad_norm": 7.647595405578613, + "learning_rate": 2.442900877994033e-06, + "loss": 0.1444, + "num_input_tokens_seen": 22493976, + "step": 11487 + }, + { + "epoch": 1.52259774685222, + "grad_norm": 10.800320625305176, + "learning_rate": 2.4425538317896376e-06, + "loss": 0.2408, + "num_input_tokens_seen": 22495824, + "step": 11488 + }, + { + "epoch": 1.522730284956925, + "grad_norm": 13.868978500366211, + "learning_rate": 2.4422067866928432e-06, + "loss": 0.1649, + "num_input_tokens_seen": 22497616, + "step": 11489 + }, + { + "epoch": 1.5228628230616303, + "grad_norm": 8.312049865722656, + "learning_rate": 2.4418597427103423e-06, + "loss": 0.1875, + "num_input_tokens_seen": 22499592, + "step": 11490 + }, + { + "epoch": 1.5229953611663354, + "grad_norm": 0.05519895628094673, + "learning_rate": 2.441512699848826e-06, + "loss": 0.0006, + "num_input_tokens_seen": 22501064, + "step": 11491 + }, + { + "epoch": 1.5231278992710404, + "grad_norm": 3.442178249359131, + "learning_rate": 2.4411656581149853e-06, + "loss": 0.0179, + "num_input_tokens_seen": 22502928, + "step": 11492 + }, + { + "epoch": 1.5232604373757455, + "grad_norm": 8.075897216796875, + "learning_rate": 2.4408186175155107e-06, + "loss": 0.1547, + "num_input_tokens_seen": 22505056, + "step": 11493 + }, + { + "epoch": 1.5233929754804505, + "grad_norm": 1.9534374475479126, + "learning_rate": 2.4404715780570936e-06, + "loss": 0.0163, + "num_input_tokens_seen": 22506888, + "step": 11494 + }, + { + "epoch": 1.5235255135851558, + "grad_norm": 0.00805640872567892, + "learning_rate": 2.440124539746427e-06, + "loss": 0.0, + "num_input_tokens_seen": 22508080, + "step": 11495 + }, + { + "epoch": 1.5236580516898608, + "grad_norm": 5.63231897354126, + "learning_rate": 2.4397775025901993e-06, + "loss": 0.1749, + "num_input_tokens_seen": 22510232, + "step": 11496 + }, + { + "epoch": 1.523790589794566, + "grad_norm": 0.18145166337490082, + "learning_rate": 2.439430466595104e-06, + "loss": 0.0011, + "num_input_tokens_seen": 22512424, + "step": 11497 + }, + { + "epoch": 1.523923127899271, + "grad_norm": 13.174072265625, + "learning_rate": 2.4390834317678305e-06, + "loss": 0.252, + "num_input_tokens_seen": 22514968, + "step": 11498 + }, + { + "epoch": 1.5240556660039761, + "grad_norm": 7.739680767059326, + "learning_rate": 2.4387363981150696e-06, + "loss": 0.2095, + "num_input_tokens_seen": 22516968, + "step": 11499 + }, + { + "epoch": 1.5241882041086812, + "grad_norm": 10.621193885803223, + "learning_rate": 2.438389365643514e-06, + "loss": 0.3333, + "num_input_tokens_seen": 22518920, + "step": 11500 + }, + { + "epoch": 1.5243207422133862, + "grad_norm": 6.683601379394531, + "learning_rate": 2.4380423343598546e-06, + "loss": 0.1188, + "num_input_tokens_seen": 22520880, + "step": 11501 + }, + { + "epoch": 1.5244532803180915, + "grad_norm": 1.6892681121826172, + "learning_rate": 2.4376953042707806e-06, + "loss": 0.009, + "num_input_tokens_seen": 22522424, + "step": 11502 + }, + { + "epoch": 1.5245858184227965, + "grad_norm": 0.021394947543740273, + "learning_rate": 2.437348275382985e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22524504, + "step": 11503 + }, + { + "epoch": 1.5247183565275018, + "grad_norm": 0.08736203610897064, + "learning_rate": 2.437001247703157e-06, + "loss": 0.0006, + "num_input_tokens_seen": 22525984, + "step": 11504 + }, + { + "epoch": 1.5248508946322068, + "grad_norm": 0.8170214295387268, + "learning_rate": 2.4366542212379885e-06, + "loss": 0.0089, + "num_input_tokens_seen": 22527560, + "step": 11505 + }, + { + "epoch": 1.5249834327369118, + "grad_norm": 7.6855645179748535, + "learning_rate": 2.4363071959941713e-06, + "loss": 0.1148, + "num_input_tokens_seen": 22528704, + "step": 11506 + }, + { + "epoch": 1.5251159708416169, + "grad_norm": 0.0020029114093631506, + "learning_rate": 2.4359601719783947e-06, + "loss": 0.0, + "num_input_tokens_seen": 22529704, + "step": 11507 + }, + { + "epoch": 1.525248508946322, + "grad_norm": 0.7428519129753113, + "learning_rate": 2.43561314919735e-06, + "loss": 0.006, + "num_input_tokens_seen": 22530992, + "step": 11508 + }, + { + "epoch": 1.5253810470510272, + "grad_norm": 5.047426223754883, + "learning_rate": 2.435266127657729e-06, + "loss": 0.1471, + "num_input_tokens_seen": 22532592, + "step": 11509 + }, + { + "epoch": 1.5255135851557324, + "grad_norm": 7.155620098114014, + "learning_rate": 2.4349191073662203e-06, + "loss": 0.1497, + "num_input_tokens_seen": 22535048, + "step": 11510 + }, + { + "epoch": 1.5256461232604375, + "grad_norm": 0.2275935411453247, + "learning_rate": 2.434572088329517e-06, + "loss": 0.0019, + "num_input_tokens_seen": 22536592, + "step": 11511 + }, + { + "epoch": 1.5257786613651425, + "grad_norm": 0.33775341510772705, + "learning_rate": 2.434225070554309e-06, + "loss": 0.0021, + "num_input_tokens_seen": 22538424, + "step": 11512 + }, + { + "epoch": 1.5259111994698475, + "grad_norm": 12.338313102722168, + "learning_rate": 2.4338780540472872e-06, + "loss": 0.2175, + "num_input_tokens_seen": 22540288, + "step": 11513 + }, + { + "epoch": 1.5260437375745526, + "grad_norm": 3.4022042751312256, + "learning_rate": 2.4335310388151415e-06, + "loss": 0.0835, + "num_input_tokens_seen": 22542672, + "step": 11514 + }, + { + "epoch": 1.5261762756792576, + "grad_norm": 11.534870147705078, + "learning_rate": 2.433184024864563e-06, + "loss": 0.2833, + "num_input_tokens_seen": 22545552, + "step": 11515 + }, + { + "epoch": 1.5263088137839629, + "grad_norm": 0.021259797737002373, + "learning_rate": 2.4328370122022433e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22546752, + "step": 11516 + }, + { + "epoch": 1.5264413518886681, + "grad_norm": 11.607074737548828, + "learning_rate": 2.432490000834872e-06, + "loss": 0.2568, + "num_input_tokens_seen": 22549136, + "step": 11517 + }, + { + "epoch": 1.5265738899933732, + "grad_norm": 0.23991453647613525, + "learning_rate": 2.432142990769141e-06, + "loss": 0.0008, + "num_input_tokens_seen": 22550568, + "step": 11518 + }, + { + "epoch": 1.5267064280980782, + "grad_norm": 9.1854887008667, + "learning_rate": 2.431795982011739e-06, + "loss": 0.0972, + "num_input_tokens_seen": 22552448, + "step": 11519 + }, + { + "epoch": 1.5268389662027833, + "grad_norm": 5.432278156280518, + "learning_rate": 2.4314489745693566e-06, + "loss": 0.0454, + "num_input_tokens_seen": 22554016, + "step": 11520 + }, + { + "epoch": 1.5269715043074883, + "grad_norm": 7.507198333740234, + "learning_rate": 2.431101968448687e-06, + "loss": 0.0777, + "num_input_tokens_seen": 22555104, + "step": 11521 + }, + { + "epoch": 1.5271040424121936, + "grad_norm": 5.396219730377197, + "learning_rate": 2.4307549636564186e-06, + "loss": 0.0728, + "num_input_tokens_seen": 22557552, + "step": 11522 + }, + { + "epoch": 1.5272365805168986, + "grad_norm": 8.457440376281738, + "learning_rate": 2.4304079601992418e-06, + "loss": 0.2236, + "num_input_tokens_seen": 22559624, + "step": 11523 + }, + { + "epoch": 1.5273691186216038, + "grad_norm": 13.04212474822998, + "learning_rate": 2.430060958083848e-06, + "loss": 0.2716, + "num_input_tokens_seen": 22561504, + "step": 11524 + }, + { + "epoch": 1.5275016567263089, + "grad_norm": 3.1242918968200684, + "learning_rate": 2.429713957316926e-06, + "loss": 0.0547, + "num_input_tokens_seen": 22563264, + "step": 11525 + }, + { + "epoch": 1.527634194831014, + "grad_norm": 16.329011917114258, + "learning_rate": 2.4293669579051685e-06, + "loss": 0.2914, + "num_input_tokens_seen": 22565200, + "step": 11526 + }, + { + "epoch": 1.527766732935719, + "grad_norm": 8.383599281311035, + "learning_rate": 2.429019959855265e-06, + "loss": 0.0876, + "num_input_tokens_seen": 22567472, + "step": 11527 + }, + { + "epoch": 1.527899271040424, + "grad_norm": 4.4311323165893555, + "learning_rate": 2.428672963173905e-06, + "loss": 0.0401, + "num_input_tokens_seen": 22569312, + "step": 11528 + }, + { + "epoch": 1.5280318091451293, + "grad_norm": 5.711136817932129, + "learning_rate": 2.42832596786778e-06, + "loss": 0.1183, + "num_input_tokens_seen": 22572480, + "step": 11529 + }, + { + "epoch": 1.5281643472498343, + "grad_norm": 0.024878622964024544, + "learning_rate": 2.4279789739435788e-06, + "loss": 0.0002, + "num_input_tokens_seen": 22574360, + "step": 11530 + }, + { + "epoch": 1.5282968853545396, + "grad_norm": 0.07630543410778046, + "learning_rate": 2.427631981407994e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22575928, + "step": 11531 + }, + { + "epoch": 1.5284294234592446, + "grad_norm": 0.13784672319889069, + "learning_rate": 2.4272849902677132e-06, + "loss": 0.0009, + "num_input_tokens_seen": 22577856, + "step": 11532 + }, + { + "epoch": 1.5285619615639496, + "grad_norm": 8.90227222442627, + "learning_rate": 2.4269380005294293e-06, + "loss": 0.1216, + "num_input_tokens_seen": 22579280, + "step": 11533 + }, + { + "epoch": 1.5286944996686547, + "grad_norm": 12.536809921264648, + "learning_rate": 2.42659101219983e-06, + "loss": 0.2887, + "num_input_tokens_seen": 22581808, + "step": 11534 + }, + { + "epoch": 1.5288270377733597, + "grad_norm": 5.591627597808838, + "learning_rate": 2.426244025285607e-06, + "loss": 0.1447, + "num_input_tokens_seen": 22583568, + "step": 11535 + }, + { + "epoch": 1.528959575878065, + "grad_norm": 5.494261741638184, + "learning_rate": 2.4258970397934504e-06, + "loss": 0.0464, + "num_input_tokens_seen": 22585456, + "step": 11536 + }, + { + "epoch": 1.52909211398277, + "grad_norm": 5.630473613739014, + "learning_rate": 2.42555005573005e-06, + "loss": 0.1614, + "num_input_tokens_seen": 22587216, + "step": 11537 + }, + { + "epoch": 1.5292246520874753, + "grad_norm": 2.4623281955718994, + "learning_rate": 2.4252030731020955e-06, + "loss": 0.0236, + "num_input_tokens_seen": 22588800, + "step": 11538 + }, + { + "epoch": 1.5293571901921803, + "grad_norm": 0.1985958367586136, + "learning_rate": 2.424856091916278e-06, + "loss": 0.0012, + "num_input_tokens_seen": 22591200, + "step": 11539 + }, + { + "epoch": 1.5294897282968853, + "grad_norm": 0.015358200296759605, + "learning_rate": 2.4245091121792862e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22592600, + "step": 11540 + }, + { + "epoch": 1.5296222664015904, + "grad_norm": 7.7459259033203125, + "learning_rate": 2.4241621338978102e-06, + "loss": 0.1888, + "num_input_tokens_seen": 22595448, + "step": 11541 + }, + { + "epoch": 1.5297548045062954, + "grad_norm": 0.007106230594217777, + "learning_rate": 2.4238151570785418e-06, + "loss": 0.0, + "num_input_tokens_seen": 22596456, + "step": 11542 + }, + { + "epoch": 1.5298873426110007, + "grad_norm": 0.021753041073679924, + "learning_rate": 2.423468181728169e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22598816, + "step": 11543 + }, + { + "epoch": 1.5300198807157057, + "grad_norm": 0.044503118842840195, + "learning_rate": 2.4231212078533833e-06, + "loss": 0.0003, + "num_input_tokens_seen": 22600976, + "step": 11544 + }, + { + "epoch": 1.530152418820411, + "grad_norm": 0.06732330471277237, + "learning_rate": 2.4227742354608737e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22602488, + "step": 11545 + }, + { + "epoch": 1.530284956925116, + "grad_norm": 12.244284629821777, + "learning_rate": 2.4224272645573287e-06, + "loss": 0.4139, + "num_input_tokens_seen": 22605520, + "step": 11546 + }, + { + "epoch": 1.530417495029821, + "grad_norm": 8.19758129119873, + "learning_rate": 2.42208029514944e-06, + "loss": 0.0717, + "num_input_tokens_seen": 22607176, + "step": 11547 + }, + { + "epoch": 1.530550033134526, + "grad_norm": 1.9976475238800049, + "learning_rate": 2.421733327243898e-06, + "loss": 0.0157, + "num_input_tokens_seen": 22609984, + "step": 11548 + }, + { + "epoch": 1.5306825712392311, + "grad_norm": 4.700127601623535, + "learning_rate": 2.4213863608473907e-06, + "loss": 0.0689, + "num_input_tokens_seen": 22611800, + "step": 11549 + }, + { + "epoch": 1.5308151093439364, + "grad_norm": 0.09441649168729782, + "learning_rate": 2.421039395966609e-06, + "loss": 0.0006, + "num_input_tokens_seen": 22613608, + "step": 11550 + }, + { + "epoch": 1.5309476474486416, + "grad_norm": 6.115640640258789, + "learning_rate": 2.4206924326082417e-06, + "loss": 0.0963, + "num_input_tokens_seen": 22615088, + "step": 11551 + }, + { + "epoch": 1.5310801855533467, + "grad_norm": 0.23396918177604675, + "learning_rate": 2.4203454707789793e-06, + "loss": 0.0017, + "num_input_tokens_seen": 22616496, + "step": 11552 + }, + { + "epoch": 1.5312127236580517, + "grad_norm": 5.720498085021973, + "learning_rate": 2.4199985104855113e-06, + "loss": 0.1663, + "num_input_tokens_seen": 22619448, + "step": 11553 + }, + { + "epoch": 1.5313452617627568, + "grad_norm": 0.01905905455350876, + "learning_rate": 2.4196515517345275e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22621200, + "step": 11554 + }, + { + "epoch": 1.5314777998674618, + "grad_norm": 11.752032279968262, + "learning_rate": 2.4193045945327166e-06, + "loss": 0.2048, + "num_input_tokens_seen": 22623416, + "step": 11555 + }, + { + "epoch": 1.5316103379721668, + "grad_norm": 3.8344991207122803, + "learning_rate": 2.418957638886768e-06, + "loss": 0.0344, + "num_input_tokens_seen": 22625224, + "step": 11556 + }, + { + "epoch": 1.531742876076872, + "grad_norm": 0.011460091918706894, + "learning_rate": 2.4186106848033737e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22627272, + "step": 11557 + }, + { + "epoch": 1.5318754141815774, + "grad_norm": 2.23699951171875, + "learning_rate": 2.4182637322892206e-06, + "loss": 0.0361, + "num_input_tokens_seen": 22629200, + "step": 11558 + }, + { + "epoch": 1.5320079522862824, + "grad_norm": 10.898748397827148, + "learning_rate": 2.4179167813509994e-06, + "loss": 0.1935, + "num_input_tokens_seen": 22631784, + "step": 11559 + }, + { + "epoch": 1.5321404903909874, + "grad_norm": 1.1473876237869263, + "learning_rate": 2.4175698319954e-06, + "loss": 0.0012, + "num_input_tokens_seen": 22633456, + "step": 11560 + }, + { + "epoch": 1.5322730284956925, + "grad_norm": 3.6401376724243164, + "learning_rate": 2.4172228842291096e-06, + "loss": 0.0767, + "num_input_tokens_seen": 22635888, + "step": 11561 + }, + { + "epoch": 1.5324055666003975, + "grad_norm": 6.430000305175781, + "learning_rate": 2.4168759380588195e-06, + "loss": 0.0624, + "num_input_tokens_seen": 22637480, + "step": 11562 + }, + { + "epoch": 1.5325381047051028, + "grad_norm": 0.029940148815512657, + "learning_rate": 2.4165289934912195e-06, + "loss": 0.0002, + "num_input_tokens_seen": 22639968, + "step": 11563 + }, + { + "epoch": 1.5326706428098078, + "grad_norm": 0.03596257418394089, + "learning_rate": 2.4161820505329973e-06, + "loss": 0.0002, + "num_input_tokens_seen": 22641952, + "step": 11564 + }, + { + "epoch": 1.532803180914513, + "grad_norm": 12.339677810668945, + "learning_rate": 2.4158351091908437e-06, + "loss": 0.3235, + "num_input_tokens_seen": 22643944, + "step": 11565 + }, + { + "epoch": 1.532935719019218, + "grad_norm": 0.09271947294473648, + "learning_rate": 2.415488169471447e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22645680, + "step": 11566 + }, + { + "epoch": 1.5330682571239231, + "grad_norm": 0.058309148997068405, + "learning_rate": 2.415141231381496e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22646880, + "step": 11567 + }, + { + "epoch": 1.5332007952286282, + "grad_norm": 8.9210205078125, + "learning_rate": 2.414794294927681e-06, + "loss": 0.0469, + "num_input_tokens_seen": 22648576, + "step": 11568 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 9.173518180847168, + "learning_rate": 2.4144473601166915e-06, + "loss": 0.2022, + "num_input_tokens_seen": 22650688, + "step": 11569 + }, + { + "epoch": 1.5334658714380385, + "grad_norm": 2.9558844566345215, + "learning_rate": 2.414100426955215e-06, + "loss": 0.0529, + "num_input_tokens_seen": 22652552, + "step": 11570 + }, + { + "epoch": 1.5335984095427435, + "grad_norm": 3.058595895767212, + "learning_rate": 2.4137534954499427e-06, + "loss": 0.0297, + "num_input_tokens_seen": 22654136, + "step": 11571 + }, + { + "epoch": 1.5337309476474488, + "grad_norm": 32.69734191894531, + "learning_rate": 2.4134065656075607e-06, + "loss": 0.414, + "num_input_tokens_seen": 22655656, + "step": 11572 + }, + { + "epoch": 1.5338634857521538, + "grad_norm": 0.014726991765201092, + "learning_rate": 2.4130596374347607e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22657776, + "step": 11573 + }, + { + "epoch": 1.5339960238568588, + "grad_norm": 6.930245876312256, + "learning_rate": 2.4127127109382317e-06, + "loss": 0.1745, + "num_input_tokens_seen": 22659576, + "step": 11574 + }, + { + "epoch": 1.5341285619615639, + "grad_norm": 0.002220445079728961, + "learning_rate": 2.4123657861246614e-06, + "loss": 0.0, + "num_input_tokens_seen": 22660840, + "step": 11575 + }, + { + "epoch": 1.534261100066269, + "grad_norm": 9.52695369720459, + "learning_rate": 2.4120188630007385e-06, + "loss": 0.1791, + "num_input_tokens_seen": 22662512, + "step": 11576 + }, + { + "epoch": 1.5343936381709742, + "grad_norm": 5.009772300720215, + "learning_rate": 2.4116719415731524e-06, + "loss": 0.1079, + "num_input_tokens_seen": 22664344, + "step": 11577 + }, + { + "epoch": 1.5345261762756792, + "grad_norm": 8.084025382995605, + "learning_rate": 2.4113250218485933e-06, + "loss": 0.1634, + "num_input_tokens_seen": 22666312, + "step": 11578 + }, + { + "epoch": 1.5346587143803845, + "grad_norm": 11.554540634155273, + "learning_rate": 2.4109781038337487e-06, + "loss": 0.1441, + "num_input_tokens_seen": 22668240, + "step": 11579 + }, + { + "epoch": 1.5347912524850895, + "grad_norm": 9.741320610046387, + "learning_rate": 2.4106311875353075e-06, + "loss": 0.1973, + "num_input_tokens_seen": 22670096, + "step": 11580 + }, + { + "epoch": 1.5349237905897946, + "grad_norm": 9.415050506591797, + "learning_rate": 2.4102842729599587e-06, + "loss": 0.144, + "num_input_tokens_seen": 22671520, + "step": 11581 + }, + { + "epoch": 1.5350563286944996, + "grad_norm": 8.422316551208496, + "learning_rate": 2.40993736011439e-06, + "loss": 0.0912, + "num_input_tokens_seen": 22673424, + "step": 11582 + }, + { + "epoch": 1.5351888667992046, + "grad_norm": 9.129530906677246, + "learning_rate": 2.409590449005293e-06, + "loss": 0.0701, + "num_input_tokens_seen": 22675176, + "step": 11583 + }, + { + "epoch": 1.53532140490391, + "grad_norm": 13.418356895446777, + "learning_rate": 2.4092435396393534e-06, + "loss": 0.2364, + "num_input_tokens_seen": 22676664, + "step": 11584 + }, + { + "epoch": 1.535453943008615, + "grad_norm": 2.6037704944610596, + "learning_rate": 2.4088966320232608e-06, + "loss": 0.0426, + "num_input_tokens_seen": 22678632, + "step": 11585 + }, + { + "epoch": 1.5355864811133202, + "grad_norm": 16.049972534179688, + "learning_rate": 2.408549726163705e-06, + "loss": 0.253, + "num_input_tokens_seen": 22680784, + "step": 11586 + }, + { + "epoch": 1.5357190192180252, + "grad_norm": 0.08646835386753082, + "learning_rate": 2.408202822067372e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22682856, + "step": 11587 + }, + { + "epoch": 1.5358515573227303, + "grad_norm": 5.785270690917969, + "learning_rate": 2.407855919740953e-06, + "loss": 0.1299, + "num_input_tokens_seen": 22684280, + "step": 11588 + }, + { + "epoch": 1.5359840954274353, + "grad_norm": 3.449090003967285, + "learning_rate": 2.4075090191911356e-06, + "loss": 0.0347, + "num_input_tokens_seen": 22685984, + "step": 11589 + }, + { + "epoch": 1.5361166335321403, + "grad_norm": 0.15381969511508942, + "learning_rate": 2.407162120424608e-06, + "loss": 0.0007, + "num_input_tokens_seen": 22687712, + "step": 11590 + }, + { + "epoch": 1.5362491716368456, + "grad_norm": 5.978672504425049, + "learning_rate": 2.4068152234480587e-06, + "loss": 0.1444, + "num_input_tokens_seen": 22689744, + "step": 11591 + }, + { + "epoch": 1.5363817097415509, + "grad_norm": 0.07744533568620682, + "learning_rate": 2.406468328268175e-06, + "loss": 0.0005, + "num_input_tokens_seen": 22691632, + "step": 11592 + }, + { + "epoch": 1.536514247846256, + "grad_norm": 2.2990314960479736, + "learning_rate": 2.406121434891648e-06, + "loss": 0.0851, + "num_input_tokens_seen": 22693552, + "step": 11593 + }, + { + "epoch": 1.536646785950961, + "grad_norm": 7.156403064727783, + "learning_rate": 2.4057745433251637e-06, + "loss": 0.2251, + "num_input_tokens_seen": 22695720, + "step": 11594 + }, + { + "epoch": 1.536779324055666, + "grad_norm": 2.482516288757324, + "learning_rate": 2.405427653575412e-06, + "loss": 0.052, + "num_input_tokens_seen": 22697368, + "step": 11595 + }, + { + "epoch": 1.536911862160371, + "grad_norm": 14.224454879760742, + "learning_rate": 2.4050807656490797e-06, + "loss": 0.1454, + "num_input_tokens_seen": 22699456, + "step": 11596 + }, + { + "epoch": 1.537044400265076, + "grad_norm": 4.463079452514648, + "learning_rate": 2.4047338795528565e-06, + "loss": 0.0806, + "num_input_tokens_seen": 22702504, + "step": 11597 + }, + { + "epoch": 1.5371769383697813, + "grad_norm": 0.7017115950584412, + "learning_rate": 2.404386995293428e-06, + "loss": 0.0035, + "num_input_tokens_seen": 22703976, + "step": 11598 + }, + { + "epoch": 1.5373094764744866, + "grad_norm": 0.037335388362407684, + "learning_rate": 2.404040112877486e-06, + "loss": 0.0003, + "num_input_tokens_seen": 22705288, + "step": 11599 + }, + { + "epoch": 1.5374420145791916, + "grad_norm": 0.09657034277915955, + "learning_rate": 2.403693232311716e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22707416, + "step": 11600 + }, + { + "epoch": 1.5375745526838966, + "grad_norm": 1.8468842506408691, + "learning_rate": 2.4033463536028078e-06, + "loss": 0.0076, + "num_input_tokens_seen": 22709400, + "step": 11601 + }, + { + "epoch": 1.5377070907886017, + "grad_norm": 11.107010841369629, + "learning_rate": 2.4029994767574473e-06, + "loss": 0.2856, + "num_input_tokens_seen": 22711712, + "step": 11602 + }, + { + "epoch": 1.5378396288933067, + "grad_norm": 0.019703494384884834, + "learning_rate": 2.402652601782324e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22714296, + "step": 11603 + }, + { + "epoch": 1.537972166998012, + "grad_norm": 4.242217540740967, + "learning_rate": 2.4023057286841264e-06, + "loss": 0.0143, + "num_input_tokens_seen": 22716440, + "step": 11604 + }, + { + "epoch": 1.538104705102717, + "grad_norm": 0.0658925473690033, + "learning_rate": 2.401958857469542e-06, + "loss": 0.0002, + "num_input_tokens_seen": 22717744, + "step": 11605 + }, + { + "epoch": 1.5382372432074223, + "grad_norm": 4.9633893966674805, + "learning_rate": 2.4016119881452576e-06, + "loss": 0.035, + "num_input_tokens_seen": 22719968, + "step": 11606 + }, + { + "epoch": 1.5383697813121273, + "grad_norm": 7.452939033508301, + "learning_rate": 2.401265120717963e-06, + "loss": 0.1678, + "num_input_tokens_seen": 22721384, + "step": 11607 + }, + { + "epoch": 1.5385023194168324, + "grad_norm": 1.522218108177185, + "learning_rate": 2.4009182551943432e-06, + "loss": 0.0062, + "num_input_tokens_seen": 22723912, + "step": 11608 + }, + { + "epoch": 1.5386348575215374, + "grad_norm": 0.11837451905012131, + "learning_rate": 2.4005713915810888e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22725808, + "step": 11609 + }, + { + "epoch": 1.5387673956262424, + "grad_norm": 6.951574325561523, + "learning_rate": 2.400224529884887e-06, + "loss": 0.2079, + "num_input_tokens_seen": 22727592, + "step": 11610 + }, + { + "epoch": 1.5388999337309477, + "grad_norm": 0.06466217339038849, + "learning_rate": 2.3998776701124243e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22730048, + "step": 11611 + }, + { + "epoch": 1.5390324718356527, + "grad_norm": 17.191783905029297, + "learning_rate": 2.39953081227039e-06, + "loss": 0.1756, + "num_input_tokens_seen": 22732816, + "step": 11612 + }, + { + "epoch": 1.539165009940358, + "grad_norm": 0.0008452195324935019, + "learning_rate": 2.3991839563654702e-06, + "loss": 0.0, + "num_input_tokens_seen": 22734176, + "step": 11613 + }, + { + "epoch": 1.539297548045063, + "grad_norm": 9.631402015686035, + "learning_rate": 2.398837102404354e-06, + "loss": 0.1509, + "num_input_tokens_seen": 22736464, + "step": 11614 + }, + { + "epoch": 1.539430086149768, + "grad_norm": 0.023336652666330338, + "learning_rate": 2.3984902503937276e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22738288, + "step": 11615 + }, + { + "epoch": 1.539562624254473, + "grad_norm": 1.3398380279541016, + "learning_rate": 2.3981434003402797e-06, + "loss": 0.0112, + "num_input_tokens_seen": 22739800, + "step": 11616 + }, + { + "epoch": 1.5396951623591781, + "grad_norm": 8.471689224243164, + "learning_rate": 2.3977965522506967e-06, + "loss": 0.2433, + "num_input_tokens_seen": 22741392, + "step": 11617 + }, + { + "epoch": 1.5398277004638834, + "grad_norm": 0.0248655304312706, + "learning_rate": 2.3974497061316664e-06, + "loss": 0.0002, + "num_input_tokens_seen": 22743144, + "step": 11618 + }, + { + "epoch": 1.5399602385685884, + "grad_norm": 8.336241722106934, + "learning_rate": 2.397102861989878e-06, + "loss": 0.1343, + "num_input_tokens_seen": 22744688, + "step": 11619 + }, + { + "epoch": 1.5400927766732937, + "grad_norm": 4.331550121307373, + "learning_rate": 2.396756019832016e-06, + "loss": 0.0218, + "num_input_tokens_seen": 22746504, + "step": 11620 + }, + { + "epoch": 1.5402253147779987, + "grad_norm": 5.52543306350708, + "learning_rate": 2.3964091796647705e-06, + "loss": 0.1194, + "num_input_tokens_seen": 22748088, + "step": 11621 + }, + { + "epoch": 1.5403578528827038, + "grad_norm": 3.9056236743927, + "learning_rate": 2.396062341494827e-06, + "loss": 0.0882, + "num_input_tokens_seen": 22749832, + "step": 11622 + }, + { + "epoch": 1.5404903909874088, + "grad_norm": 5.999610424041748, + "learning_rate": 2.3957155053288724e-06, + "loss": 0.0817, + "num_input_tokens_seen": 22751384, + "step": 11623 + }, + { + "epoch": 1.5406229290921138, + "grad_norm": 4.145784854888916, + "learning_rate": 2.395368671173595e-06, + "loss": 0.0633, + "num_input_tokens_seen": 22753808, + "step": 11624 + }, + { + "epoch": 1.540755467196819, + "grad_norm": 5.507304668426514, + "learning_rate": 2.3950218390356826e-06, + "loss": 0.0597, + "num_input_tokens_seen": 22756864, + "step": 11625 + }, + { + "epoch": 1.5408880053015241, + "grad_norm": 5.125180721282959, + "learning_rate": 2.394675008921821e-06, + "loss": 0.1665, + "num_input_tokens_seen": 22759200, + "step": 11626 + }, + { + "epoch": 1.5410205434062294, + "grad_norm": 0.09268554300069809, + "learning_rate": 2.3943281808386984e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22760480, + "step": 11627 + }, + { + "epoch": 1.5411530815109344, + "grad_norm": 0.01375078596174717, + "learning_rate": 2.3939813547930013e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22763176, + "step": 11628 + }, + { + "epoch": 1.5412856196156395, + "grad_norm": 3.6710283756256104, + "learning_rate": 2.393634530791416e-06, + "loss": 0.0536, + "num_input_tokens_seen": 22765472, + "step": 11629 + }, + { + "epoch": 1.5414181577203445, + "grad_norm": 0.0025124638341367245, + "learning_rate": 2.3932877088406306e-06, + "loss": 0.0, + "num_input_tokens_seen": 22766672, + "step": 11630 + }, + { + "epoch": 1.5415506958250496, + "grad_norm": 5.217998504638672, + "learning_rate": 2.3929408889473323e-06, + "loss": 0.1543, + "num_input_tokens_seen": 22768872, + "step": 11631 + }, + { + "epoch": 1.5416832339297548, + "grad_norm": 0.700996994972229, + "learning_rate": 2.3925940711182072e-06, + "loss": 0.0029, + "num_input_tokens_seen": 22771880, + "step": 11632 + }, + { + "epoch": 1.5418157720344599, + "grad_norm": 0.022334029898047447, + "learning_rate": 2.392247255359943e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22773624, + "step": 11633 + }, + { + "epoch": 1.5419483101391651, + "grad_norm": 8.716713905334473, + "learning_rate": 2.391900441679225e-06, + "loss": 0.2371, + "num_input_tokens_seen": 22775720, + "step": 11634 + }, + { + "epoch": 1.5420808482438702, + "grad_norm": 13.455893516540527, + "learning_rate": 2.3915536300827414e-06, + "loss": 0.261, + "num_input_tokens_seen": 22777528, + "step": 11635 + }, + { + "epoch": 1.5422133863485752, + "grad_norm": 1.5254008769989014, + "learning_rate": 2.3912068205771795e-06, + "loss": 0.0425, + "num_input_tokens_seen": 22779312, + "step": 11636 + }, + { + "epoch": 1.5423459244532802, + "grad_norm": 10.9517822265625, + "learning_rate": 2.390860013169225e-06, + "loss": 0.1086, + "num_input_tokens_seen": 22780848, + "step": 11637 + }, + { + "epoch": 1.5424784625579853, + "grad_norm": 5.070095062255859, + "learning_rate": 2.390513207865564e-06, + "loss": 0.1246, + "num_input_tokens_seen": 22782624, + "step": 11638 + }, + { + "epoch": 1.5426110006626905, + "grad_norm": 4.699265480041504, + "learning_rate": 2.390166404672883e-06, + "loss": 0.05, + "num_input_tokens_seen": 22784496, + "step": 11639 + }, + { + "epoch": 1.5427435387673958, + "grad_norm": 0.004969986155629158, + "learning_rate": 2.3898196035978715e-06, + "loss": 0.0, + "num_input_tokens_seen": 22786216, + "step": 11640 + }, + { + "epoch": 1.5428760768721008, + "grad_norm": 6.936388969421387, + "learning_rate": 2.389472804647213e-06, + "loss": 0.319, + "num_input_tokens_seen": 22788304, + "step": 11641 + }, + { + "epoch": 1.5430086149768059, + "grad_norm": 5.080018043518066, + "learning_rate": 2.3891260078275954e-06, + "loss": 0.0752, + "num_input_tokens_seen": 22791120, + "step": 11642 + }, + { + "epoch": 1.543141153081511, + "grad_norm": 6.296807765960693, + "learning_rate": 2.388779213145705e-06, + "loss": 0.0812, + "num_input_tokens_seen": 22792584, + "step": 11643 + }, + { + "epoch": 1.543273691186216, + "grad_norm": 2.7061445713043213, + "learning_rate": 2.3884324206082274e-06, + "loss": 0.0492, + "num_input_tokens_seen": 22794072, + "step": 11644 + }, + { + "epoch": 1.5434062292909212, + "grad_norm": 0.013041294179856777, + "learning_rate": 2.38808563022185e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22795408, + "step": 11645 + }, + { + "epoch": 1.5435387673956262, + "grad_norm": 12.966225624084473, + "learning_rate": 2.3877388419932594e-06, + "loss": 0.1733, + "num_input_tokens_seen": 22797192, + "step": 11646 + }, + { + "epoch": 1.5436713055003315, + "grad_norm": 0.03698479384183884, + "learning_rate": 2.387392055929141e-06, + "loss": 0.0002, + "num_input_tokens_seen": 22798240, + "step": 11647 + }, + { + "epoch": 1.5438038436050365, + "grad_norm": 0.4972839653491974, + "learning_rate": 2.3870452720361815e-06, + "loss": 0.0038, + "num_input_tokens_seen": 22799960, + "step": 11648 + }, + { + "epoch": 1.5439363817097416, + "grad_norm": 0.0447104349732399, + "learning_rate": 2.3866984903210663e-06, + "loss": 0.0002, + "num_input_tokens_seen": 22801488, + "step": 11649 + }, + { + "epoch": 1.5440689198144466, + "grad_norm": 1.4432384967803955, + "learning_rate": 2.386351710790483e-06, + "loss": 0.0136, + "num_input_tokens_seen": 22803632, + "step": 11650 + }, + { + "epoch": 1.5442014579191516, + "grad_norm": 1.0031282901763916, + "learning_rate": 2.386004933451117e-06, + "loss": 0.0083, + "num_input_tokens_seen": 22805000, + "step": 11651 + }, + { + "epoch": 1.544333996023857, + "grad_norm": 9.571556091308594, + "learning_rate": 2.3856581583096557e-06, + "loss": 0.2114, + "num_input_tokens_seen": 22807920, + "step": 11652 + }, + { + "epoch": 1.544466534128562, + "grad_norm": 6.322209358215332, + "learning_rate": 2.3853113853727825e-06, + "loss": 0.1103, + "num_input_tokens_seen": 22809584, + "step": 11653 + }, + { + "epoch": 1.5445990722332672, + "grad_norm": 7.201266288757324, + "learning_rate": 2.384964614647186e-06, + "loss": 0.158, + "num_input_tokens_seen": 22811496, + "step": 11654 + }, + { + "epoch": 1.5447316103379722, + "grad_norm": 8.91861629486084, + "learning_rate": 2.3846178461395497e-06, + "loss": 0.2733, + "num_input_tokens_seen": 22813648, + "step": 11655 + }, + { + "epoch": 1.5448641484426773, + "grad_norm": 7.845762729644775, + "learning_rate": 2.3842710798565616e-06, + "loss": 0.2225, + "num_input_tokens_seen": 22815704, + "step": 11656 + }, + { + "epoch": 1.5449966865473823, + "grad_norm": 0.5984724760055542, + "learning_rate": 2.3839243158049074e-06, + "loss": 0.0018, + "num_input_tokens_seen": 22817936, + "step": 11657 + }, + { + "epoch": 1.5451292246520874, + "grad_norm": 0.3342927098274231, + "learning_rate": 2.3835775539912726e-06, + "loss": 0.0021, + "num_input_tokens_seen": 22819808, + "step": 11658 + }, + { + "epoch": 1.5452617627567926, + "grad_norm": 8.167765617370605, + "learning_rate": 2.383230794422342e-06, + "loss": 0.2535, + "num_input_tokens_seen": 22821808, + "step": 11659 + }, + { + "epoch": 1.5453943008614976, + "grad_norm": 7.889893531799316, + "learning_rate": 2.3828840371048022e-06, + "loss": 0.0531, + "num_input_tokens_seen": 22823440, + "step": 11660 + }, + { + "epoch": 1.545526838966203, + "grad_norm": 2.6872446537017822, + "learning_rate": 2.3825372820453395e-06, + "loss": 0.0295, + "num_input_tokens_seen": 22824768, + "step": 11661 + }, + { + "epoch": 1.545659377070908, + "grad_norm": 3.2457728385925293, + "learning_rate": 2.382190529250639e-06, + "loss": 0.0355, + "num_input_tokens_seen": 22826880, + "step": 11662 + }, + { + "epoch": 1.545791915175613, + "grad_norm": 4.213348388671875, + "learning_rate": 2.381843778727386e-06, + "loss": 0.0469, + "num_input_tokens_seen": 22828952, + "step": 11663 + }, + { + "epoch": 1.545924453280318, + "grad_norm": 0.07602731138467789, + "learning_rate": 2.3814970304822667e-06, + "loss": 0.0003, + "num_input_tokens_seen": 22831840, + "step": 11664 + }, + { + "epoch": 1.546056991385023, + "grad_norm": 6.952153205871582, + "learning_rate": 2.3811502845219657e-06, + "loss": 0.0725, + "num_input_tokens_seen": 22833960, + "step": 11665 + }, + { + "epoch": 1.5461895294897283, + "grad_norm": 6.129274845123291, + "learning_rate": 2.3808035408531704e-06, + "loss": 0.2096, + "num_input_tokens_seen": 22836168, + "step": 11666 + }, + { + "epoch": 1.5463220675944334, + "grad_norm": 0.009145737625658512, + "learning_rate": 2.380456799482565e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22837872, + "step": 11667 + }, + { + "epoch": 1.5464546056991386, + "grad_norm": 3.4855196475982666, + "learning_rate": 2.3801100604168346e-06, + "loss": 0.0832, + "num_input_tokens_seen": 22839872, + "step": 11668 + }, + { + "epoch": 1.5465871438038437, + "grad_norm": 7.353240966796875, + "learning_rate": 2.3797633236626653e-06, + "loss": 0.0907, + "num_input_tokens_seen": 22841376, + "step": 11669 + }, + { + "epoch": 1.5467196819085487, + "grad_norm": 6.336739540100098, + "learning_rate": 2.379416589226741e-06, + "loss": 0.091, + "num_input_tokens_seen": 22843968, + "step": 11670 + }, + { + "epoch": 1.5468522200132537, + "grad_norm": 7.328383445739746, + "learning_rate": 2.379069857115749e-06, + "loss": 0.1129, + "num_input_tokens_seen": 22845600, + "step": 11671 + }, + { + "epoch": 1.5469847581179588, + "grad_norm": 4.484687328338623, + "learning_rate": 2.3787231273363732e-06, + "loss": 0.0332, + "num_input_tokens_seen": 22848136, + "step": 11672 + }, + { + "epoch": 1.547117296222664, + "grad_norm": 2.0273196697235107, + "learning_rate": 2.378376399895299e-06, + "loss": 0.0088, + "num_input_tokens_seen": 22850448, + "step": 11673 + }, + { + "epoch": 1.547249834327369, + "grad_norm": 0.05740797147154808, + "learning_rate": 2.3780296747992122e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22852600, + "step": 11674 + }, + { + "epoch": 1.5473823724320743, + "grad_norm": 9.62387752532959, + "learning_rate": 2.377682952054797e-06, + "loss": 0.1257, + "num_input_tokens_seen": 22854600, + "step": 11675 + }, + { + "epoch": 1.5475149105367794, + "grad_norm": 6.483031272888184, + "learning_rate": 2.377336231668739e-06, + "loss": 0.2746, + "num_input_tokens_seen": 22856336, + "step": 11676 + }, + { + "epoch": 1.5476474486414844, + "grad_norm": 2.4936413764953613, + "learning_rate": 2.3769895136477234e-06, + "loss": 0.0467, + "num_input_tokens_seen": 22858080, + "step": 11677 + }, + { + "epoch": 1.5477799867461894, + "grad_norm": 0.14681215584278107, + "learning_rate": 2.3766427979984345e-06, + "loss": 0.0006, + "num_input_tokens_seen": 22860856, + "step": 11678 + }, + { + "epoch": 1.5479125248508945, + "grad_norm": 7.2523016929626465, + "learning_rate": 2.376296084727558e-06, + "loss": 0.2218, + "num_input_tokens_seen": 22862856, + "step": 11679 + }, + { + "epoch": 1.5480450629555997, + "grad_norm": 9.062152862548828, + "learning_rate": 2.375949373841777e-06, + "loss": 0.2012, + "num_input_tokens_seen": 22865264, + "step": 11680 + }, + { + "epoch": 1.548177601060305, + "grad_norm": 8.32116985321045, + "learning_rate": 2.375602665347779e-06, + "loss": 0.0319, + "num_input_tokens_seen": 22866816, + "step": 11681 + }, + { + "epoch": 1.54831013916501, + "grad_norm": 16.30388641357422, + "learning_rate": 2.3752559592522476e-06, + "loss": 0.4519, + "num_input_tokens_seen": 22869192, + "step": 11682 + }, + { + "epoch": 1.548442677269715, + "grad_norm": 15.974126815795898, + "learning_rate": 2.3749092555618663e-06, + "loss": 0.2502, + "num_input_tokens_seen": 22870864, + "step": 11683 + }, + { + "epoch": 1.54857521537442, + "grad_norm": 0.12354584038257599, + "learning_rate": 2.374562554283322e-06, + "loss": 0.0005, + "num_input_tokens_seen": 22872176, + "step": 11684 + }, + { + "epoch": 1.5487077534791251, + "grad_norm": 0.011220953427255154, + "learning_rate": 2.374215855423297e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22875008, + "step": 11685 + }, + { + "epoch": 1.5488402915838302, + "grad_norm": 0.06803762167692184, + "learning_rate": 2.3738691589884773e-06, + "loss": 0.0004, + "num_input_tokens_seen": 22876488, + "step": 11686 + }, + { + "epoch": 1.5489728296885354, + "grad_norm": 1.0669276714324951, + "learning_rate": 2.373522464985548e-06, + "loss": 0.0117, + "num_input_tokens_seen": 22878344, + "step": 11687 + }, + { + "epoch": 1.5491053677932407, + "grad_norm": 0.011777970008552074, + "learning_rate": 2.3731757734211923e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22880008, + "step": 11688 + }, + { + "epoch": 1.5492379058979457, + "grad_norm": 0.03759561479091644, + "learning_rate": 2.3728290843020956e-06, + "loss": 0.0002, + "num_input_tokens_seen": 22881480, + "step": 11689 + }, + { + "epoch": 1.5493704440026508, + "grad_norm": 7.132319450378418, + "learning_rate": 2.3724823976349415e-06, + "loss": 0.1799, + "num_input_tokens_seen": 22884288, + "step": 11690 + }, + { + "epoch": 1.5495029821073558, + "grad_norm": 12.136001586914062, + "learning_rate": 2.3721357134264146e-06, + "loss": 0.0529, + "num_input_tokens_seen": 22885648, + "step": 11691 + }, + { + "epoch": 1.5496355202120609, + "grad_norm": 2.5058939456939697, + "learning_rate": 2.371789031683199e-06, + "loss": 0.0238, + "num_input_tokens_seen": 22887920, + "step": 11692 + }, + { + "epoch": 1.5497680583167661, + "grad_norm": 9.547260284423828, + "learning_rate": 2.3714423524119805e-06, + "loss": 0.3117, + "num_input_tokens_seen": 22890464, + "step": 11693 + }, + { + "epoch": 1.5499005964214712, + "grad_norm": 9.446438789367676, + "learning_rate": 2.3710956756194413e-06, + "loss": 0.2493, + "num_input_tokens_seen": 22892624, + "step": 11694 + }, + { + "epoch": 1.5500331345261764, + "grad_norm": 3.51804780960083, + "learning_rate": 2.370749001312267e-06, + "loss": 0.0665, + "num_input_tokens_seen": 22893944, + "step": 11695 + }, + { + "epoch": 1.5501656726308815, + "grad_norm": 12.27486801147461, + "learning_rate": 2.37040232949714e-06, + "loss": 0.2455, + "num_input_tokens_seen": 22896416, + "step": 11696 + }, + { + "epoch": 1.5502982107355865, + "grad_norm": 0.02395310066640377, + "learning_rate": 2.3700556601807463e-06, + "loss": 0.0002, + "num_input_tokens_seen": 22898392, + "step": 11697 + }, + { + "epoch": 1.5504307488402915, + "grad_norm": 0.013616015203297138, + "learning_rate": 2.369708993369769e-06, + "loss": 0.0001, + "num_input_tokens_seen": 22899992, + "step": 11698 + }, + { + "epoch": 1.5505632869449966, + "grad_norm": 9.419471740722656, + "learning_rate": 2.3693623290708926e-06, + "loss": 0.1168, + "num_input_tokens_seen": 22901832, + "step": 11699 + }, + { + "epoch": 1.5506958250497018, + "grad_norm": 3.9283926486968994, + "learning_rate": 2.3690156672908003e-06, + "loss": 0.0413, + "num_input_tokens_seen": 22903344, + "step": 11700 + }, + { + "epoch": 1.5508283631544069, + "grad_norm": 6.461010932922363, + "learning_rate": 2.368669008036175e-06, + "loss": 0.1308, + "num_input_tokens_seen": 22905000, + "step": 11701 + }, + { + "epoch": 1.5509609012591121, + "grad_norm": 8.997200965881348, + "learning_rate": 2.368322351313704e-06, + "loss": 0.2779, + "num_input_tokens_seen": 22907016, + "step": 11702 + }, + { + "epoch": 1.5510934393638172, + "grad_norm": 5.607660293579102, + "learning_rate": 2.3679756971300673e-06, + "loss": 0.0737, + "num_input_tokens_seen": 22909264, + "step": 11703 + }, + { + "epoch": 1.5512259774685222, + "grad_norm": 1.3159074783325195, + "learning_rate": 2.3676290454919515e-06, + "loss": 0.0063, + "num_input_tokens_seen": 22911088, + "step": 11704 + }, + { + "epoch": 1.5513585155732272, + "grad_norm": 6.545577049255371, + "learning_rate": 2.3672823964060387e-06, + "loss": 0.1858, + "num_input_tokens_seen": 22913120, + "step": 11705 + }, + { + "epoch": 1.5514910536779323, + "grad_norm": 11.555002212524414, + "learning_rate": 2.366935749879012e-06, + "loss": 0.2645, + "num_input_tokens_seen": 22914472, + "step": 11706 + }, + { + "epoch": 1.5516235917826375, + "grad_norm": 4.435707092285156, + "learning_rate": 2.366589105917556e-06, + "loss": 0.0619, + "num_input_tokens_seen": 22915984, + "step": 11707 + }, + { + "epoch": 1.5517561298873426, + "grad_norm": 10.366458892822266, + "learning_rate": 2.366242464528355e-06, + "loss": 0.2164, + "num_input_tokens_seen": 22918848, + "step": 11708 + }, + { + "epoch": 1.5518886679920478, + "grad_norm": 6.799312591552734, + "learning_rate": 2.365895825718091e-06, + "loss": 0.1911, + "num_input_tokens_seen": 22921504, + "step": 11709 + }, + { + "epoch": 1.5520212060967529, + "grad_norm": 16.173952102661133, + "learning_rate": 2.3655491894934484e-06, + "loss": 0.0655, + "num_input_tokens_seen": 22923520, + "step": 11710 + }, + { + "epoch": 1.552153744201458, + "grad_norm": 5.98959493637085, + "learning_rate": 2.3652025558611097e-06, + "loss": 0.0686, + "num_input_tokens_seen": 22925544, + "step": 11711 + }, + { + "epoch": 1.552286282306163, + "grad_norm": 0.2014966607093811, + "learning_rate": 2.3648559248277585e-06, + "loss": 0.002, + "num_input_tokens_seen": 22927336, + "step": 11712 + }, + { + "epoch": 1.552418820410868, + "grad_norm": 6.154041290283203, + "learning_rate": 2.3645092964000785e-06, + "loss": 0.0415, + "num_input_tokens_seen": 22929864, + "step": 11713 + }, + { + "epoch": 1.5525513585155732, + "grad_norm": 5.471816539764404, + "learning_rate": 2.3641626705847535e-06, + "loss": 0.1061, + "num_input_tokens_seen": 22931864, + "step": 11714 + }, + { + "epoch": 1.5526838966202783, + "grad_norm": 6.523082733154297, + "learning_rate": 2.363816047388465e-06, + "loss": 0.0851, + "num_input_tokens_seen": 22933944, + "step": 11715 + }, + { + "epoch": 1.5528164347249835, + "grad_norm": 2.2643446922302246, + "learning_rate": 2.3634694268178977e-06, + "loss": 0.0338, + "num_input_tokens_seen": 22936000, + "step": 11716 + }, + { + "epoch": 1.5529489728296886, + "grad_norm": 6.2633442878723145, + "learning_rate": 2.3631228088797327e-06, + "loss": 0.1522, + "num_input_tokens_seen": 22937520, + "step": 11717 + }, + { + "epoch": 1.5530815109343936, + "grad_norm": 0.15778692066669464, + "learning_rate": 2.3627761935806555e-06, + "loss": 0.0011, + "num_input_tokens_seen": 22938600, + "step": 11718 + }, + { + "epoch": 1.5532140490390987, + "grad_norm": 7.601447105407715, + "learning_rate": 2.362429580927348e-06, + "loss": 0.2649, + "num_input_tokens_seen": 22941072, + "step": 11719 + }, + { + "epoch": 1.5533465871438037, + "grad_norm": 0.777221143245697, + "learning_rate": 2.362082970926493e-06, + "loss": 0.0036, + "num_input_tokens_seen": 22943856, + "step": 11720 + }, + { + "epoch": 1.553479125248509, + "grad_norm": 3.7049474716186523, + "learning_rate": 2.361736363584774e-06, + "loss": 0.0368, + "num_input_tokens_seen": 22945808, + "step": 11721 + }, + { + "epoch": 1.5536116633532142, + "grad_norm": 10.061142921447754, + "learning_rate": 2.3613897589088717e-06, + "loss": 0.2484, + "num_input_tokens_seen": 22947736, + "step": 11722 + }, + { + "epoch": 1.5537442014579192, + "grad_norm": 12.228292465209961, + "learning_rate": 2.3610431569054716e-06, + "loss": 0.2504, + "num_input_tokens_seen": 22950312, + "step": 11723 + }, + { + "epoch": 1.5538767395626243, + "grad_norm": 0.1607706993818283, + "learning_rate": 2.360696557581255e-06, + "loss": 0.0012, + "num_input_tokens_seen": 22951976, + "step": 11724 + }, + { + "epoch": 1.5540092776673293, + "grad_norm": 3.754638910293579, + "learning_rate": 2.3603499609429053e-06, + "loss": 0.0489, + "num_input_tokens_seen": 22954912, + "step": 11725 + }, + { + "epoch": 1.5541418157720344, + "grad_norm": 4.418231964111328, + "learning_rate": 2.3600033669971043e-06, + "loss": 0.0515, + "num_input_tokens_seen": 22957496, + "step": 11726 + }, + { + "epoch": 1.5542743538767394, + "grad_norm": 3.307616710662842, + "learning_rate": 2.3596567757505343e-06, + "loss": 0.0322, + "num_input_tokens_seen": 22960120, + "step": 11727 + }, + { + "epoch": 1.5544068919814447, + "grad_norm": 10.511303901672363, + "learning_rate": 2.3593101872098797e-06, + "loss": 0.164, + "num_input_tokens_seen": 22962712, + "step": 11728 + }, + { + "epoch": 1.55453943008615, + "grad_norm": 6.065456390380859, + "learning_rate": 2.3589636013818215e-06, + "loss": 0.1166, + "num_input_tokens_seen": 22964624, + "step": 11729 + }, + { + "epoch": 1.554671968190855, + "grad_norm": 6.678621292114258, + "learning_rate": 2.3586170182730418e-06, + "loss": 0.1637, + "num_input_tokens_seen": 22966064, + "step": 11730 + }, + { + "epoch": 1.55480450629556, + "grad_norm": 5.360035419464111, + "learning_rate": 2.358270437890224e-06, + "loss": 0.078, + "num_input_tokens_seen": 22969080, + "step": 11731 + }, + { + "epoch": 1.554937044400265, + "grad_norm": 4.380037307739258, + "learning_rate": 2.3579238602400496e-06, + "loss": 0.1009, + "num_input_tokens_seen": 22971424, + "step": 11732 + }, + { + "epoch": 1.55506958250497, + "grad_norm": 0.137910395860672, + "learning_rate": 2.3575772853292012e-06, + "loss": 0.0007, + "num_input_tokens_seen": 22973416, + "step": 11733 + }, + { + "epoch": 1.5552021206096753, + "grad_norm": 1.6743347644805908, + "learning_rate": 2.3572307131643616e-06, + "loss": 0.0195, + "num_input_tokens_seen": 22975144, + "step": 11734 + }, + { + "epoch": 1.5553346587143804, + "grad_norm": 3.458446979522705, + "learning_rate": 2.3568841437522123e-06, + "loss": 0.0882, + "num_input_tokens_seen": 22976768, + "step": 11735 + }, + { + "epoch": 1.5554671968190856, + "grad_norm": 20.108211517333984, + "learning_rate": 2.356537577099435e-06, + "loss": 0.454, + "num_input_tokens_seen": 22978304, + "step": 11736 + }, + { + "epoch": 1.5555997349237907, + "grad_norm": 10.755016326904297, + "learning_rate": 2.3561910132127115e-06, + "loss": 0.2422, + "num_input_tokens_seen": 22980656, + "step": 11737 + }, + { + "epoch": 1.5557322730284957, + "grad_norm": 0.616950273513794, + "learning_rate": 2.3558444520987254e-06, + "loss": 0.0069, + "num_input_tokens_seen": 22982896, + "step": 11738 + }, + { + "epoch": 1.5558648111332007, + "grad_norm": 5.336042404174805, + "learning_rate": 2.355497893764157e-06, + "loss": 0.1459, + "num_input_tokens_seen": 22985216, + "step": 11739 + }, + { + "epoch": 1.5559973492379058, + "grad_norm": 0.046280067414045334, + "learning_rate": 2.35515133821569e-06, + "loss": 0.0003, + "num_input_tokens_seen": 22986872, + "step": 11740 + }, + { + "epoch": 1.556129887342611, + "grad_norm": 8.701187133789062, + "learning_rate": 2.3548047854600043e-06, + "loss": 0.1642, + "num_input_tokens_seen": 22988816, + "step": 11741 + }, + { + "epoch": 1.556262425447316, + "grad_norm": 0.2147613912820816, + "learning_rate": 2.3544582355037833e-06, + "loss": 0.0016, + "num_input_tokens_seen": 22992088, + "step": 11742 + }, + { + "epoch": 1.5563949635520213, + "grad_norm": 10.167132377624512, + "learning_rate": 2.3541116883537066e-06, + "loss": 0.1277, + "num_input_tokens_seen": 22994840, + "step": 11743 + }, + { + "epoch": 1.5565275016567264, + "grad_norm": 11.632242202758789, + "learning_rate": 2.3537651440164587e-06, + "loss": 0.3117, + "num_input_tokens_seen": 22997552, + "step": 11744 + }, + { + "epoch": 1.5566600397614314, + "grad_norm": 0.7559077143669128, + "learning_rate": 2.3534186024987183e-06, + "loss": 0.005, + "num_input_tokens_seen": 22999352, + "step": 11745 + }, + { + "epoch": 1.5567925778661365, + "grad_norm": 0.6411713361740112, + "learning_rate": 2.35307206380717e-06, + "loss": 0.0069, + "num_input_tokens_seen": 23001104, + "step": 11746 + }, + { + "epoch": 1.5569251159708415, + "grad_norm": 4.030973434448242, + "learning_rate": 2.352725527948492e-06, + "loss": 0.068, + "num_input_tokens_seen": 23002368, + "step": 11747 + }, + { + "epoch": 1.5570576540755467, + "grad_norm": 2.213083267211914, + "learning_rate": 2.3523789949293675e-06, + "loss": 0.0326, + "num_input_tokens_seen": 23004112, + "step": 11748 + }, + { + "epoch": 1.5571901921802518, + "grad_norm": 2.434950828552246, + "learning_rate": 2.352032464756479e-06, + "loss": 0.0184, + "num_input_tokens_seen": 23006128, + "step": 11749 + }, + { + "epoch": 1.557322730284957, + "grad_norm": 9.92917537689209, + "learning_rate": 2.3516859374365066e-06, + "loss": 0.1264, + "num_input_tokens_seen": 23007864, + "step": 11750 + }, + { + "epoch": 1.557455268389662, + "grad_norm": 2.113671064376831, + "learning_rate": 2.3513394129761306e-06, + "loss": 0.0384, + "num_input_tokens_seen": 23009960, + "step": 11751 + }, + { + "epoch": 1.5575878064943671, + "grad_norm": 6.396363735198975, + "learning_rate": 2.3509928913820345e-06, + "loss": 0.1262, + "num_input_tokens_seen": 23011576, + "step": 11752 + }, + { + "epoch": 1.5577203445990722, + "grad_norm": 7.60141658782959, + "learning_rate": 2.3506463726608967e-06, + "loss": 0.198, + "num_input_tokens_seen": 23013320, + "step": 11753 + }, + { + "epoch": 1.5578528827037772, + "grad_norm": 10.095383644104004, + "learning_rate": 2.3502998568194003e-06, + "loss": 0.3141, + "num_input_tokens_seen": 23015608, + "step": 11754 + }, + { + "epoch": 1.5579854208084825, + "grad_norm": 0.28539103269577026, + "learning_rate": 2.349953343864227e-06, + "loss": 0.0015, + "num_input_tokens_seen": 23017864, + "step": 11755 + }, + { + "epoch": 1.5581179589131875, + "grad_norm": 0.18585848808288574, + "learning_rate": 2.349606833802056e-06, + "loss": 0.0012, + "num_input_tokens_seen": 23019232, + "step": 11756 + }, + { + "epoch": 1.5582504970178928, + "grad_norm": 0.05250381678342819, + "learning_rate": 2.3492603266395696e-06, + "loss": 0.0004, + "num_input_tokens_seen": 23020424, + "step": 11757 + }, + { + "epoch": 1.5583830351225978, + "grad_norm": 4.3008928298950195, + "learning_rate": 2.3489138223834463e-06, + "loss": 0.0319, + "num_input_tokens_seen": 23021936, + "step": 11758 + }, + { + "epoch": 1.5585155732273028, + "grad_norm": 0.09873567521572113, + "learning_rate": 2.348567321040371e-06, + "loss": 0.0006, + "num_input_tokens_seen": 23023320, + "step": 11759 + }, + { + "epoch": 1.5586481113320079, + "grad_norm": 7.6662445068359375, + "learning_rate": 2.348220822617021e-06, + "loss": 0.1673, + "num_input_tokens_seen": 23025536, + "step": 11760 + }, + { + "epoch": 1.558780649436713, + "grad_norm": 0.0330217070877552, + "learning_rate": 2.347874327120079e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23026952, + "step": 11761 + }, + { + "epoch": 1.5589131875414182, + "grad_norm": 6.7171478271484375, + "learning_rate": 2.347527834556224e-06, + "loss": 0.1008, + "num_input_tokens_seen": 23029464, + "step": 11762 + }, + { + "epoch": 1.5590457256461234, + "grad_norm": 8.293986320495605, + "learning_rate": 2.347181344932137e-06, + "loss": 0.0812, + "num_input_tokens_seen": 23031480, + "step": 11763 + }, + { + "epoch": 1.5591782637508285, + "grad_norm": 0.10536544024944305, + "learning_rate": 2.3468348582545005e-06, + "loss": 0.0007, + "num_input_tokens_seen": 23032760, + "step": 11764 + }, + { + "epoch": 1.5593108018555335, + "grad_norm": 2.951874256134033, + "learning_rate": 2.346488374529993e-06, + "loss": 0.0353, + "num_input_tokens_seen": 23034832, + "step": 11765 + }, + { + "epoch": 1.5594433399602385, + "grad_norm": 11.947152137756348, + "learning_rate": 2.346141893765296e-06, + "loss": 0.1959, + "num_input_tokens_seen": 23036456, + "step": 11766 + }, + { + "epoch": 1.5595758780649436, + "grad_norm": 0.2531041204929352, + "learning_rate": 2.3457954159670897e-06, + "loss": 0.0013, + "num_input_tokens_seen": 23037856, + "step": 11767 + }, + { + "epoch": 1.5597084161696486, + "grad_norm": 3.4456894397735596, + "learning_rate": 2.3454489411420534e-06, + "loss": 0.041, + "num_input_tokens_seen": 23039472, + "step": 11768 + }, + { + "epoch": 1.5598409542743539, + "grad_norm": 3.3264551162719727, + "learning_rate": 2.3451024692968674e-06, + "loss": 0.071, + "num_input_tokens_seen": 23042496, + "step": 11769 + }, + { + "epoch": 1.5599734923790591, + "grad_norm": 3.6480424404144287, + "learning_rate": 2.344756000438214e-06, + "loss": 0.0252, + "num_input_tokens_seen": 23045176, + "step": 11770 + }, + { + "epoch": 1.5601060304837642, + "grad_norm": 0.1852840632200241, + "learning_rate": 2.3444095345727713e-06, + "loss": 0.0013, + "num_input_tokens_seen": 23046528, + "step": 11771 + }, + { + "epoch": 1.5602385685884692, + "grad_norm": 6.85092306137085, + "learning_rate": 2.344063071707221e-06, + "loss": 0.1052, + "num_input_tokens_seen": 23048792, + "step": 11772 + }, + { + "epoch": 1.5603711066931742, + "grad_norm": 0.10513570159673691, + "learning_rate": 2.343716611848242e-06, + "loss": 0.0007, + "num_input_tokens_seen": 23050896, + "step": 11773 + }, + { + "epoch": 1.5605036447978793, + "grad_norm": 4.905401229858398, + "learning_rate": 2.3433701550025133e-06, + "loss": 0.0328, + "num_input_tokens_seen": 23053088, + "step": 11774 + }, + { + "epoch": 1.5606361829025845, + "grad_norm": 4.9157023429870605, + "learning_rate": 2.3430237011767166e-06, + "loss": 0.064, + "num_input_tokens_seen": 23054568, + "step": 11775 + }, + { + "epoch": 1.5607687210072896, + "grad_norm": 1.8303897380828857, + "learning_rate": 2.3426772503775318e-06, + "loss": 0.0093, + "num_input_tokens_seen": 23056368, + "step": 11776 + }, + { + "epoch": 1.5609012591119948, + "grad_norm": 7.003386497497559, + "learning_rate": 2.342330802611638e-06, + "loss": 0.2108, + "num_input_tokens_seen": 23059064, + "step": 11777 + }, + { + "epoch": 1.5610337972166999, + "grad_norm": 4.772353172302246, + "learning_rate": 2.3419843578857153e-06, + "loss": 0.0504, + "num_input_tokens_seen": 23062176, + "step": 11778 + }, + { + "epoch": 1.561166335321405, + "grad_norm": 0.053890857845544815, + "learning_rate": 2.3416379162064423e-06, + "loss": 0.0004, + "num_input_tokens_seen": 23063912, + "step": 11779 + }, + { + "epoch": 1.56129887342611, + "grad_norm": 7.776603698730469, + "learning_rate": 2.3412914775805e-06, + "loss": 0.0501, + "num_input_tokens_seen": 23065352, + "step": 11780 + }, + { + "epoch": 1.561431411530815, + "grad_norm": 0.02211894653737545, + "learning_rate": 2.340945042014568e-06, + "loss": 0.0001, + "num_input_tokens_seen": 23066768, + "step": 11781 + }, + { + "epoch": 1.5615639496355203, + "grad_norm": 7.118942737579346, + "learning_rate": 2.340598609515325e-06, + "loss": 0.141, + "num_input_tokens_seen": 23069176, + "step": 11782 + }, + { + "epoch": 1.5616964877402253, + "grad_norm": 3.627549886703491, + "learning_rate": 2.3402521800894505e-06, + "loss": 0.0252, + "num_input_tokens_seen": 23071648, + "step": 11783 + }, + { + "epoch": 1.5618290258449306, + "grad_norm": 11.284818649291992, + "learning_rate": 2.3399057537436236e-06, + "loss": 0.3394, + "num_input_tokens_seen": 23075664, + "step": 11784 + }, + { + "epoch": 1.5619615639496356, + "grad_norm": 0.9425681233406067, + "learning_rate": 2.3395593304845254e-06, + "loss": 0.0258, + "num_input_tokens_seen": 23078376, + "step": 11785 + }, + { + "epoch": 1.5620941020543406, + "grad_norm": 5.32555627822876, + "learning_rate": 2.3392129103188334e-06, + "loss": 0.0825, + "num_input_tokens_seen": 23081232, + "step": 11786 + }, + { + "epoch": 1.5622266401590457, + "grad_norm": 9.754157066345215, + "learning_rate": 2.338866493253228e-06, + "loss": 0.1463, + "num_input_tokens_seen": 23083656, + "step": 11787 + }, + { + "epoch": 1.5623591782637507, + "grad_norm": 10.609440803527832, + "learning_rate": 2.338520079294388e-06, + "loss": 0.1795, + "num_input_tokens_seen": 23086552, + "step": 11788 + }, + { + "epoch": 1.562491716368456, + "grad_norm": 0.018197791650891304, + "learning_rate": 2.3381736684489907e-06, + "loss": 0.0001, + "num_input_tokens_seen": 23088304, + "step": 11789 + }, + { + "epoch": 1.562624254473161, + "grad_norm": 0.07660821080207825, + "learning_rate": 2.3378272607237177e-06, + "loss": 0.0004, + "num_input_tokens_seen": 23089392, + "step": 11790 + }, + { + "epoch": 1.5627567925778663, + "grad_norm": 10.474480628967285, + "learning_rate": 2.337480856125247e-06, + "loss": 0.0642, + "num_input_tokens_seen": 23091824, + "step": 11791 + }, + { + "epoch": 1.5628893306825713, + "grad_norm": 0.08091287314891815, + "learning_rate": 2.3371344546602574e-06, + "loss": 0.0005, + "num_input_tokens_seen": 23093144, + "step": 11792 + }, + { + "epoch": 1.5630218687872763, + "grad_norm": 6.021177291870117, + "learning_rate": 2.3367880563354285e-06, + "loss": 0.083, + "num_input_tokens_seen": 23095520, + "step": 11793 + }, + { + "epoch": 1.5631544068919814, + "grad_norm": 0.04064526408910751, + "learning_rate": 2.336441661157437e-06, + "loss": 0.0003, + "num_input_tokens_seen": 23097264, + "step": 11794 + }, + { + "epoch": 1.5632869449966864, + "grad_norm": 0.4156019985675812, + "learning_rate": 2.336095269132964e-06, + "loss": 0.003, + "num_input_tokens_seen": 23099136, + "step": 11795 + }, + { + "epoch": 1.5634194831013917, + "grad_norm": 3.1713433265686035, + "learning_rate": 2.335748880268687e-06, + "loss": 0.0576, + "num_input_tokens_seen": 23101040, + "step": 11796 + }, + { + "epoch": 1.5635520212060967, + "grad_norm": 2.0922791957855225, + "learning_rate": 2.335402494571286e-06, + "loss": 0.0174, + "num_input_tokens_seen": 23102712, + "step": 11797 + }, + { + "epoch": 1.563684559310802, + "grad_norm": 7.8315019607543945, + "learning_rate": 2.3350561120474376e-06, + "loss": 0.1846, + "num_input_tokens_seen": 23105128, + "step": 11798 + }, + { + "epoch": 1.563817097415507, + "grad_norm": 5.811864852905273, + "learning_rate": 2.3347097327038217e-06, + "loss": 0.1285, + "num_input_tokens_seen": 23107112, + "step": 11799 + }, + { + "epoch": 1.563949635520212, + "grad_norm": 4.618442535400391, + "learning_rate": 2.3343633565471148e-06, + "loss": 0.0811, + "num_input_tokens_seen": 23108696, + "step": 11800 + }, + { + "epoch": 1.564082173624917, + "grad_norm": 4.920743942260742, + "learning_rate": 2.334016983583997e-06, + "loss": 0.0592, + "num_input_tokens_seen": 23110576, + "step": 11801 + }, + { + "epoch": 1.5642147117296221, + "grad_norm": 9.9241943359375, + "learning_rate": 2.3336706138211474e-06, + "loss": 0.1294, + "num_input_tokens_seen": 23113712, + "step": 11802 + }, + { + "epoch": 1.5643472498343274, + "grad_norm": 5.606466293334961, + "learning_rate": 2.333324247265243e-06, + "loss": 0.2304, + "num_input_tokens_seen": 23116328, + "step": 11803 + }, + { + "epoch": 1.5644797879390324, + "grad_norm": 2.9157915115356445, + "learning_rate": 2.3329778839229613e-06, + "loss": 0.038, + "num_input_tokens_seen": 23118600, + "step": 11804 + }, + { + "epoch": 1.5646123260437377, + "grad_norm": 0.08944808691740036, + "learning_rate": 2.3326315238009806e-06, + "loss": 0.0006, + "num_input_tokens_seen": 23120120, + "step": 11805 + }, + { + "epoch": 1.5647448641484427, + "grad_norm": 1.166289210319519, + "learning_rate": 2.332285166905981e-06, + "loss": 0.0097, + "num_input_tokens_seen": 23122632, + "step": 11806 + }, + { + "epoch": 1.5648774022531478, + "grad_norm": 0.024734152480959892, + "learning_rate": 2.331938813244638e-06, + "loss": 0.0001, + "num_input_tokens_seen": 23123880, + "step": 11807 + }, + { + "epoch": 1.5650099403578528, + "grad_norm": 9.396806716918945, + "learning_rate": 2.3315924628236318e-06, + "loss": 0.2541, + "num_input_tokens_seen": 23126272, + "step": 11808 + }, + { + "epoch": 1.5651424784625578, + "grad_norm": 0.024552958086133003, + "learning_rate": 2.331246115649638e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23127720, + "step": 11809 + }, + { + "epoch": 1.565275016567263, + "grad_norm": 6.480804443359375, + "learning_rate": 2.3308997717293357e-06, + "loss": 0.1237, + "num_input_tokens_seen": 23129544, + "step": 11810 + }, + { + "epoch": 1.5654075546719683, + "grad_norm": 4.319464683532715, + "learning_rate": 2.3305534310694033e-06, + "loss": 0.1092, + "num_input_tokens_seen": 23131088, + "step": 11811 + }, + { + "epoch": 1.5655400927766734, + "grad_norm": 0.13295838236808777, + "learning_rate": 2.330207093676517e-06, + "loss": 0.0006, + "num_input_tokens_seen": 23132952, + "step": 11812 + }, + { + "epoch": 1.5656726308813784, + "grad_norm": 10.54491138458252, + "learning_rate": 2.3298607595573553e-06, + "loss": 0.0984, + "num_input_tokens_seen": 23134888, + "step": 11813 + }, + { + "epoch": 1.5658051689860835, + "grad_norm": 5.460895538330078, + "learning_rate": 2.3295144287185954e-06, + "loss": 0.0517, + "num_input_tokens_seen": 23137176, + "step": 11814 + }, + { + "epoch": 1.5659377070907885, + "grad_norm": 13.162677764892578, + "learning_rate": 2.329168101166914e-06, + "loss": 0.2229, + "num_input_tokens_seen": 23139800, + "step": 11815 + }, + { + "epoch": 1.5660702451954938, + "grad_norm": 3.8460566997528076, + "learning_rate": 2.3288217769089903e-06, + "loss": 0.0299, + "num_input_tokens_seen": 23141800, + "step": 11816 + }, + { + "epoch": 1.5662027833001988, + "grad_norm": 16.859697341918945, + "learning_rate": 2.3284754559515016e-06, + "loss": 0.46, + "num_input_tokens_seen": 23143304, + "step": 11817 + }, + { + "epoch": 1.566335321404904, + "grad_norm": 16.181333541870117, + "learning_rate": 2.3281291383011234e-06, + "loss": 0.8357, + "num_input_tokens_seen": 23146352, + "step": 11818 + }, + { + "epoch": 1.566467859509609, + "grad_norm": 1.2226130962371826, + "learning_rate": 2.3277828239645347e-06, + "loss": 0.0158, + "num_input_tokens_seen": 23148448, + "step": 11819 + }, + { + "epoch": 1.5666003976143141, + "grad_norm": 0.32900407910346985, + "learning_rate": 2.3274365129484108e-06, + "loss": 0.0021, + "num_input_tokens_seen": 23150240, + "step": 11820 + }, + { + "epoch": 1.5667329357190192, + "grad_norm": 0.9531776309013367, + "learning_rate": 2.3270902052594314e-06, + "loss": 0.0069, + "num_input_tokens_seen": 23153152, + "step": 11821 + }, + { + "epoch": 1.5668654738237242, + "grad_norm": 3.535245180130005, + "learning_rate": 2.3267439009042713e-06, + "loss": 0.0572, + "num_input_tokens_seen": 23154768, + "step": 11822 + }, + { + "epoch": 1.5669980119284295, + "grad_norm": 2.0109262466430664, + "learning_rate": 2.326397599889609e-06, + "loss": 0.0359, + "num_input_tokens_seen": 23156440, + "step": 11823 + }, + { + "epoch": 1.5671305500331345, + "grad_norm": 7.079641342163086, + "learning_rate": 2.32605130222212e-06, + "loss": 0.1706, + "num_input_tokens_seen": 23158144, + "step": 11824 + }, + { + "epoch": 1.5672630881378398, + "grad_norm": 2.586860418319702, + "learning_rate": 2.3257050079084823e-06, + "loss": 0.0291, + "num_input_tokens_seen": 23159648, + "step": 11825 + }, + { + "epoch": 1.5673956262425448, + "grad_norm": 12.001721382141113, + "learning_rate": 2.3253587169553714e-06, + "loss": 0.1553, + "num_input_tokens_seen": 23162024, + "step": 11826 + }, + { + "epoch": 1.5675281643472498, + "grad_norm": 1.3586593866348267, + "learning_rate": 2.325012429369466e-06, + "loss": 0.0082, + "num_input_tokens_seen": 23164080, + "step": 11827 + }, + { + "epoch": 1.5676607024519549, + "grad_norm": 0.04144002124667168, + "learning_rate": 2.3246661451574413e-06, + "loss": 0.0003, + "num_input_tokens_seen": 23165704, + "step": 11828 + }, + { + "epoch": 1.56779324055666, + "grad_norm": 0.028851399198174477, + "learning_rate": 2.3243198643259747e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23167256, + "step": 11829 + }, + { + "epoch": 1.5679257786613652, + "grad_norm": 0.06583758443593979, + "learning_rate": 2.3239735868817416e-06, + "loss": 0.0005, + "num_input_tokens_seen": 23168824, + "step": 11830 + }, + { + "epoch": 1.5680583167660702, + "grad_norm": 22.437429428100586, + "learning_rate": 2.3236273128314184e-06, + "loss": 0.3746, + "num_input_tokens_seen": 23170304, + "step": 11831 + }, + { + "epoch": 1.5681908548707755, + "grad_norm": 6.973607540130615, + "learning_rate": 2.323281042181684e-06, + "loss": 0.0543, + "num_input_tokens_seen": 23171728, + "step": 11832 + }, + { + "epoch": 1.5683233929754805, + "grad_norm": 3.017742156982422, + "learning_rate": 2.322934774939212e-06, + "loss": 0.0108, + "num_input_tokens_seen": 23173360, + "step": 11833 + }, + { + "epoch": 1.5684559310801856, + "grad_norm": 9.6630859375, + "learning_rate": 2.3225885111106805e-06, + "loss": 0.1628, + "num_input_tokens_seen": 23175224, + "step": 11834 + }, + { + "epoch": 1.5685884691848906, + "grad_norm": 11.627849578857422, + "learning_rate": 2.322242250702765e-06, + "loss": 0.1733, + "num_input_tokens_seen": 23176816, + "step": 11835 + }, + { + "epoch": 1.5687210072895956, + "grad_norm": 12.374133110046387, + "learning_rate": 2.3218959937221404e-06, + "loss": 0.1513, + "num_input_tokens_seen": 23178216, + "step": 11836 + }, + { + "epoch": 1.5688535453943009, + "grad_norm": 3.9707603454589844, + "learning_rate": 2.321549740175484e-06, + "loss": 0.0925, + "num_input_tokens_seen": 23180048, + "step": 11837 + }, + { + "epoch": 1.568986083499006, + "grad_norm": 5.106534481048584, + "learning_rate": 2.321203490069473e-06, + "loss": 0.0237, + "num_input_tokens_seen": 23181632, + "step": 11838 + }, + { + "epoch": 1.5691186216037112, + "grad_norm": 0.05615193024277687, + "learning_rate": 2.320857243410781e-06, + "loss": 0.0004, + "num_input_tokens_seen": 23183192, + "step": 11839 + }, + { + "epoch": 1.5692511597084162, + "grad_norm": 5.115355491638184, + "learning_rate": 2.3205110002060854e-06, + "loss": 0.0855, + "num_input_tokens_seen": 23185440, + "step": 11840 + }, + { + "epoch": 1.5693836978131213, + "grad_norm": 2.50343918800354, + "learning_rate": 2.3201647604620606e-06, + "loss": 0.029, + "num_input_tokens_seen": 23187936, + "step": 11841 + }, + { + "epoch": 1.5695162359178263, + "grad_norm": 2.482462167739868, + "learning_rate": 2.3198185241853842e-06, + "loss": 0.0381, + "num_input_tokens_seen": 23189600, + "step": 11842 + }, + { + "epoch": 1.5696487740225313, + "grad_norm": 0.015036329627037048, + "learning_rate": 2.319472291382731e-06, + "loss": 0.0001, + "num_input_tokens_seen": 23191032, + "step": 11843 + }, + { + "epoch": 1.5697813121272366, + "grad_norm": 0.009527535177767277, + "learning_rate": 2.3191260620607766e-06, + "loss": 0.0001, + "num_input_tokens_seen": 23192520, + "step": 11844 + }, + { + "epoch": 1.5699138502319416, + "grad_norm": 6.501735687255859, + "learning_rate": 2.3187798362261958e-06, + "loss": 0.0412, + "num_input_tokens_seen": 23194288, + "step": 11845 + }, + { + "epoch": 1.570046388336647, + "grad_norm": 3.4656929969787598, + "learning_rate": 2.318433613885665e-06, + "loss": 0.0522, + "num_input_tokens_seen": 23195936, + "step": 11846 + }, + { + "epoch": 1.570178926441352, + "grad_norm": 13.463749885559082, + "learning_rate": 2.31808739504586e-06, + "loss": 0.4768, + "num_input_tokens_seen": 23197880, + "step": 11847 + }, + { + "epoch": 1.570311464546057, + "grad_norm": 0.23207640647888184, + "learning_rate": 2.317741179713455e-06, + "loss": 0.0029, + "num_input_tokens_seen": 23200016, + "step": 11848 + }, + { + "epoch": 1.570444002650762, + "grad_norm": 10.478626251220703, + "learning_rate": 2.317394967895126e-06, + "loss": 0.2562, + "num_input_tokens_seen": 23201928, + "step": 11849 + }, + { + "epoch": 1.570576540755467, + "grad_norm": 0.2518407702445984, + "learning_rate": 2.3170487595975482e-06, + "loss": 0.0012, + "num_input_tokens_seen": 23203104, + "step": 11850 + }, + { + "epoch": 1.5707090788601723, + "grad_norm": 0.5737168192863464, + "learning_rate": 2.3167025548273957e-06, + "loss": 0.004, + "num_input_tokens_seen": 23205216, + "step": 11851 + }, + { + "epoch": 1.5708416169648776, + "grad_norm": 4.06577205657959, + "learning_rate": 2.3163563535913445e-06, + "loss": 0.0625, + "num_input_tokens_seen": 23206864, + "step": 11852 + }, + { + "epoch": 1.5709741550695826, + "grad_norm": 6.196745872497559, + "learning_rate": 2.3160101558960703e-06, + "loss": 0.1276, + "num_input_tokens_seen": 23209072, + "step": 11853 + }, + { + "epoch": 1.5711066931742876, + "grad_norm": 15.247641563415527, + "learning_rate": 2.3156639617482466e-06, + "loss": 0.3407, + "num_input_tokens_seen": 23210736, + "step": 11854 + }, + { + "epoch": 1.5712392312789927, + "grad_norm": 16.242324829101562, + "learning_rate": 2.3153177711545495e-06, + "loss": 0.2255, + "num_input_tokens_seen": 23213032, + "step": 11855 + }, + { + "epoch": 1.5713717693836977, + "grad_norm": 4.422913551330566, + "learning_rate": 2.314971584121653e-06, + "loss": 0.0427, + "num_input_tokens_seen": 23214640, + "step": 11856 + }, + { + "epoch": 1.5715043074884028, + "grad_norm": 0.023176299408078194, + "learning_rate": 2.3146254006562312e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23217176, + "step": 11857 + }, + { + "epoch": 1.571636845593108, + "grad_norm": 0.024347057566046715, + "learning_rate": 2.31427922076496e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23219208, + "step": 11858 + }, + { + "epoch": 1.5717693836978133, + "grad_norm": 0.03210686892271042, + "learning_rate": 2.3139330444545145e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23221456, + "step": 11859 + }, + { + "epoch": 1.5719019218025183, + "grad_norm": 4.715926647186279, + "learning_rate": 2.313586871731567e-06, + "loss": 0.0836, + "num_input_tokens_seen": 23223744, + "step": 11860 + }, + { + "epoch": 1.5720344599072233, + "grad_norm": 7.625388145446777, + "learning_rate": 2.313240702602794e-06, + "loss": 0.0757, + "num_input_tokens_seen": 23225168, + "step": 11861 + }, + { + "epoch": 1.5721669980119284, + "grad_norm": 7.96759033203125, + "learning_rate": 2.3128945370748683e-06, + "loss": 0.2099, + "num_input_tokens_seen": 23226952, + "step": 11862 + }, + { + "epoch": 1.5722995361166334, + "grad_norm": 0.012869305908679962, + "learning_rate": 2.3125483751544655e-06, + "loss": 0.0001, + "num_input_tokens_seen": 23229128, + "step": 11863 + }, + { + "epoch": 1.5724320742213387, + "grad_norm": 0.007695955224335194, + "learning_rate": 2.3122022168482595e-06, + "loss": 0.0001, + "num_input_tokens_seen": 23230752, + "step": 11864 + }, + { + "epoch": 1.5725646123260437, + "grad_norm": 0.04273669049143791, + "learning_rate": 2.311856062162924e-06, + "loss": 0.0003, + "num_input_tokens_seen": 23233672, + "step": 11865 + }, + { + "epoch": 1.572697150430749, + "grad_norm": 5.344576835632324, + "learning_rate": 2.3115099111051336e-06, + "loss": 0.0859, + "num_input_tokens_seen": 23235552, + "step": 11866 + }, + { + "epoch": 1.572829688535454, + "grad_norm": 0.005217389203608036, + "learning_rate": 2.3111637636815616e-06, + "loss": 0.0, + "num_input_tokens_seen": 23237144, + "step": 11867 + }, + { + "epoch": 1.572962226640159, + "grad_norm": 6.825217247009277, + "learning_rate": 2.3108176198988833e-06, + "loss": 0.1558, + "num_input_tokens_seen": 23238616, + "step": 11868 + }, + { + "epoch": 1.573094764744864, + "grad_norm": 0.038383398205041885, + "learning_rate": 2.3104714797637714e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23241080, + "step": 11869 + }, + { + "epoch": 1.5732273028495691, + "grad_norm": 6.217370510101318, + "learning_rate": 2.310125343282901e-06, + "loss": 0.0794, + "num_input_tokens_seen": 23242888, + "step": 11870 + }, + { + "epoch": 1.5733598409542744, + "grad_norm": 9.227302551269531, + "learning_rate": 2.3097792104629434e-06, + "loss": 0.1598, + "num_input_tokens_seen": 23244496, + "step": 11871 + }, + { + "epoch": 1.5734923790589794, + "grad_norm": 0.029518185183405876, + "learning_rate": 2.3094330813105743e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23246536, + "step": 11872 + }, + { + "epoch": 1.5736249171636847, + "grad_norm": 2.307223320007324, + "learning_rate": 2.3090869558324676e-06, + "loss": 0.0317, + "num_input_tokens_seen": 23249048, + "step": 11873 + }, + { + "epoch": 1.5737574552683897, + "grad_norm": 8.633649826049805, + "learning_rate": 2.308740834035296e-06, + "loss": 0.0822, + "num_input_tokens_seen": 23251624, + "step": 11874 + }, + { + "epoch": 1.5738899933730948, + "grad_norm": 9.78980827331543, + "learning_rate": 2.3083947159257327e-06, + "loss": 0.0592, + "num_input_tokens_seen": 23253776, + "step": 11875 + }, + { + "epoch": 1.5740225314777998, + "grad_norm": 10.400908470153809, + "learning_rate": 2.308048601510452e-06, + "loss": 0.1832, + "num_input_tokens_seen": 23255784, + "step": 11876 + }, + { + "epoch": 1.5741550695825048, + "grad_norm": 15.188549041748047, + "learning_rate": 2.3077024907961253e-06, + "loss": 0.194, + "num_input_tokens_seen": 23258112, + "step": 11877 + }, + { + "epoch": 1.57428760768721, + "grad_norm": 4.213099002838135, + "learning_rate": 2.307356383789428e-06, + "loss": 0.1282, + "num_input_tokens_seen": 23259896, + "step": 11878 + }, + { + "epoch": 1.5744201457919151, + "grad_norm": 7.259431838989258, + "learning_rate": 2.3070102804970327e-06, + "loss": 0.1013, + "num_input_tokens_seen": 23262048, + "step": 11879 + }, + { + "epoch": 1.5745526838966204, + "grad_norm": 0.03413210064172745, + "learning_rate": 2.306664180925613e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23263776, + "step": 11880 + }, + { + "epoch": 1.5746852220013254, + "grad_norm": 14.316970825195312, + "learning_rate": 2.30631808508184e-06, + "loss": 0.4378, + "num_input_tokens_seen": 23266048, + "step": 11881 + }, + { + "epoch": 1.5748177601060305, + "grad_norm": 0.009788365103304386, + "learning_rate": 2.305971992972389e-06, + "loss": 0.0001, + "num_input_tokens_seen": 23268608, + "step": 11882 + }, + { + "epoch": 1.5749502982107355, + "grad_norm": 0.08582916110754013, + "learning_rate": 2.3056259046039306e-06, + "loss": 0.0006, + "num_input_tokens_seen": 23270232, + "step": 11883 + }, + { + "epoch": 1.5750828363154405, + "grad_norm": 0.04013047739863396, + "learning_rate": 2.3052798199831396e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23272088, + "step": 11884 + }, + { + "epoch": 1.5752153744201458, + "grad_norm": 0.03055867925286293, + "learning_rate": 2.3049337391166884e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23273680, + "step": 11885 + }, + { + "epoch": 1.5753479125248508, + "grad_norm": 0.029550379142165184, + "learning_rate": 2.3045876620112484e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23274616, + "step": 11886 + }, + { + "epoch": 1.575480450629556, + "grad_norm": 7.734893321990967, + "learning_rate": 2.304241588673494e-06, + "loss": 0.0682, + "num_input_tokens_seen": 23277368, + "step": 11887 + }, + { + "epoch": 1.5756129887342611, + "grad_norm": 0.3007345199584961, + "learning_rate": 2.3038955191100957e-06, + "loss": 0.002, + "num_input_tokens_seen": 23279808, + "step": 11888 + }, + { + "epoch": 1.5757455268389662, + "grad_norm": 5.712344169616699, + "learning_rate": 2.3035494533277282e-06, + "loss": 0.0297, + "num_input_tokens_seen": 23281280, + "step": 11889 + }, + { + "epoch": 1.5758780649436712, + "grad_norm": 11.294878959655762, + "learning_rate": 2.3032033913330618e-06, + "loss": 0.1981, + "num_input_tokens_seen": 23283136, + "step": 11890 + }, + { + "epoch": 1.5760106030483763, + "grad_norm": 0.029315093532204628, + "learning_rate": 2.302857333132771e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23285472, + "step": 11891 + }, + { + "epoch": 1.5761431411530815, + "grad_norm": 6.328197479248047, + "learning_rate": 2.302511278733526e-06, + "loss": 0.132, + "num_input_tokens_seen": 23288016, + "step": 11892 + }, + { + "epoch": 1.5762756792577868, + "grad_norm": 4.279500961303711, + "learning_rate": 2.3021652281419985e-06, + "loss": 0.0727, + "num_input_tokens_seen": 23289624, + "step": 11893 + }, + { + "epoch": 1.5764082173624918, + "grad_norm": 11.489892959594727, + "learning_rate": 2.301819181364864e-06, + "loss": 0.2265, + "num_input_tokens_seen": 23291864, + "step": 11894 + }, + { + "epoch": 1.5765407554671969, + "grad_norm": 4.643187046051025, + "learning_rate": 2.3014731384087917e-06, + "loss": 0.1017, + "num_input_tokens_seen": 23293312, + "step": 11895 + }, + { + "epoch": 1.576673293571902, + "grad_norm": 0.060901664197444916, + "learning_rate": 2.3011270992804543e-06, + "loss": 0.0004, + "num_input_tokens_seen": 23294728, + "step": 11896 + }, + { + "epoch": 1.576805831676607, + "grad_norm": 12.484745025634766, + "learning_rate": 2.3007810639865237e-06, + "loss": 0.3547, + "num_input_tokens_seen": 23296408, + "step": 11897 + }, + { + "epoch": 1.576938369781312, + "grad_norm": 2.3672783374786377, + "learning_rate": 2.3004350325336706e-06, + "loss": 0.0221, + "num_input_tokens_seen": 23298872, + "step": 11898 + }, + { + "epoch": 1.5770709078860172, + "grad_norm": 0.263560950756073, + "learning_rate": 2.300089004928569e-06, + "loss": 0.0016, + "num_input_tokens_seen": 23300688, + "step": 11899 + }, + { + "epoch": 1.5772034459907225, + "grad_norm": 7.4764790534973145, + "learning_rate": 2.299742981177889e-06, + "loss": 0.1596, + "num_input_tokens_seen": 23302776, + "step": 11900 + }, + { + "epoch": 1.5773359840954275, + "grad_norm": 2.0159964561462402, + "learning_rate": 2.2993969612883023e-06, + "loss": 0.0307, + "num_input_tokens_seen": 23306056, + "step": 11901 + }, + { + "epoch": 1.5774685222001326, + "grad_norm": 5.1496477127075195, + "learning_rate": 2.2990509452664808e-06, + "loss": 0.1122, + "num_input_tokens_seen": 23308840, + "step": 11902 + }, + { + "epoch": 1.5776010603048376, + "grad_norm": 3.5109922885894775, + "learning_rate": 2.2987049331190946e-06, + "loss": 0.033, + "num_input_tokens_seen": 23310072, + "step": 11903 + }, + { + "epoch": 1.5777335984095426, + "grad_norm": 0.06520876288414001, + "learning_rate": 2.2983589248528177e-06, + "loss": 0.0004, + "num_input_tokens_seen": 23312200, + "step": 11904 + }, + { + "epoch": 1.577866136514248, + "grad_norm": 4.272860527038574, + "learning_rate": 2.298012920474319e-06, + "loss": 0.0258, + "num_input_tokens_seen": 23314728, + "step": 11905 + }, + { + "epoch": 1.577998674618953, + "grad_norm": 2.5731828212738037, + "learning_rate": 2.2976669199902713e-06, + "loss": 0.051, + "num_input_tokens_seen": 23316632, + "step": 11906 + }, + { + "epoch": 1.5781312127236582, + "grad_norm": 0.07348982244729996, + "learning_rate": 2.297320923407344e-06, + "loss": 0.0005, + "num_input_tokens_seen": 23318664, + "step": 11907 + }, + { + "epoch": 1.5782637508283632, + "grad_norm": 8.55766487121582, + "learning_rate": 2.296974930732209e-06, + "loss": 0.1826, + "num_input_tokens_seen": 23320432, + "step": 11908 + }, + { + "epoch": 1.5783962889330683, + "grad_norm": 0.46611157059669495, + "learning_rate": 2.2966289419715383e-06, + "loss": 0.003, + "num_input_tokens_seen": 23321632, + "step": 11909 + }, + { + "epoch": 1.5785288270377733, + "grad_norm": 1.407111406326294, + "learning_rate": 2.296282957132002e-06, + "loss": 0.0157, + "num_input_tokens_seen": 23323432, + "step": 11910 + }, + { + "epoch": 1.5786613651424783, + "grad_norm": 4.4142680168151855, + "learning_rate": 2.2959369762202703e-06, + "loss": 0.101, + "num_input_tokens_seen": 23325624, + "step": 11911 + }, + { + "epoch": 1.5787939032471836, + "grad_norm": 0.7156218886375427, + "learning_rate": 2.2955909992430148e-06, + "loss": 0.0066, + "num_input_tokens_seen": 23326840, + "step": 11912 + }, + { + "epoch": 1.5789264413518886, + "grad_norm": 0.02910565212368965, + "learning_rate": 2.2952450262069053e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23328248, + "step": 11913 + }, + { + "epoch": 1.579058979456594, + "grad_norm": 2.8141586780548096, + "learning_rate": 2.2948990571186127e-06, + "loss": 0.0342, + "num_input_tokens_seen": 23330136, + "step": 11914 + }, + { + "epoch": 1.579191517561299, + "grad_norm": 0.015408039093017578, + "learning_rate": 2.2945530919848086e-06, + "loss": 0.0001, + "num_input_tokens_seen": 23331512, + "step": 11915 + }, + { + "epoch": 1.579324055666004, + "grad_norm": 6.919739246368408, + "learning_rate": 2.294207130812162e-06, + "loss": 0.0963, + "num_input_tokens_seen": 23334824, + "step": 11916 + }, + { + "epoch": 1.579456593770709, + "grad_norm": 0.03383958712220192, + "learning_rate": 2.2938611736073445e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23336248, + "step": 11917 + }, + { + "epoch": 1.579589131875414, + "grad_norm": 8.613393783569336, + "learning_rate": 2.293515220377026e-06, + "loss": 0.2771, + "num_input_tokens_seen": 23338864, + "step": 11918 + }, + { + "epoch": 1.5797216699801193, + "grad_norm": 5.089381694793701, + "learning_rate": 2.2931692711278752e-06, + "loss": 0.0641, + "num_input_tokens_seen": 23341280, + "step": 11919 + }, + { + "epoch": 1.5798542080848244, + "grad_norm": 10.700448036193848, + "learning_rate": 2.292823325866564e-06, + "loss": 0.2213, + "num_input_tokens_seen": 23343136, + "step": 11920 + }, + { + "epoch": 1.5799867461895296, + "grad_norm": 4.684841632843018, + "learning_rate": 2.292477384599763e-06, + "loss": 0.1901, + "num_input_tokens_seen": 23344624, + "step": 11921 + }, + { + "epoch": 1.5801192842942346, + "grad_norm": 9.807380676269531, + "learning_rate": 2.2921314473341404e-06, + "loss": 0.2262, + "num_input_tokens_seen": 23346320, + "step": 11922 + }, + { + "epoch": 1.5802518223989397, + "grad_norm": 6.636413097381592, + "learning_rate": 2.291785514076367e-06, + "loss": 0.1269, + "num_input_tokens_seen": 23348272, + "step": 11923 + }, + { + "epoch": 1.5803843605036447, + "grad_norm": 9.999238967895508, + "learning_rate": 2.291439584833112e-06, + "loss": 0.0324, + "num_input_tokens_seen": 23349816, + "step": 11924 + }, + { + "epoch": 1.5805168986083498, + "grad_norm": 1.6644606590270996, + "learning_rate": 2.291093659611046e-06, + "loss": 0.0148, + "num_input_tokens_seen": 23351240, + "step": 11925 + }, + { + "epoch": 1.580649436713055, + "grad_norm": 5.8127264976501465, + "learning_rate": 2.290747738416839e-06, + "loss": 0.0809, + "num_input_tokens_seen": 23352664, + "step": 11926 + }, + { + "epoch": 1.58078197481776, + "grad_norm": 0.04623502865433693, + "learning_rate": 2.29040182125716e-06, + "loss": 0.0003, + "num_input_tokens_seen": 23354616, + "step": 11927 + }, + { + "epoch": 1.5809145129224653, + "grad_norm": 10.933884620666504, + "learning_rate": 2.290055908138678e-06, + "loss": 0.2594, + "num_input_tokens_seen": 23356488, + "step": 11928 + }, + { + "epoch": 1.5810470510271704, + "grad_norm": 0.3446574807167053, + "learning_rate": 2.2897099990680623e-06, + "loss": 0.003, + "num_input_tokens_seen": 23357832, + "step": 11929 + }, + { + "epoch": 1.5811795891318754, + "grad_norm": 0.4280853569507599, + "learning_rate": 2.2893640940519837e-06, + "loss": 0.003, + "num_input_tokens_seen": 23360488, + "step": 11930 + }, + { + "epoch": 1.5813121272365804, + "grad_norm": 4.509779930114746, + "learning_rate": 2.2890181930971105e-06, + "loss": 0.1045, + "num_input_tokens_seen": 23362224, + "step": 11931 + }, + { + "epoch": 1.5814446653412855, + "grad_norm": 3.114455223083496, + "learning_rate": 2.2886722962101125e-06, + "loss": 0.0419, + "num_input_tokens_seen": 23364000, + "step": 11932 + }, + { + "epoch": 1.5815772034459907, + "grad_norm": 6.655788421630859, + "learning_rate": 2.288326403397658e-06, + "loss": 0.0708, + "num_input_tokens_seen": 23367056, + "step": 11933 + }, + { + "epoch": 1.581709741550696, + "grad_norm": 6.719675064086914, + "learning_rate": 2.287980514666416e-06, + "loss": 0.0933, + "num_input_tokens_seen": 23369784, + "step": 11934 + }, + { + "epoch": 1.581842279655401, + "grad_norm": 6.188154220581055, + "learning_rate": 2.287634630023056e-06, + "loss": 0.1374, + "num_input_tokens_seen": 23372280, + "step": 11935 + }, + { + "epoch": 1.581974817760106, + "grad_norm": 2.4546737670898438, + "learning_rate": 2.287288749474248e-06, + "loss": 0.0707, + "num_input_tokens_seen": 23373552, + "step": 11936 + }, + { + "epoch": 1.582107355864811, + "grad_norm": 12.997584342956543, + "learning_rate": 2.2869428730266583e-06, + "loss": 0.3838, + "num_input_tokens_seen": 23375656, + "step": 11937 + }, + { + "epoch": 1.5822398939695161, + "grad_norm": 3.504216194152832, + "learning_rate": 2.286597000686958e-06, + "loss": 0.1107, + "num_input_tokens_seen": 23377528, + "step": 11938 + }, + { + "epoch": 1.5823724320742212, + "grad_norm": 18.248077392578125, + "learning_rate": 2.2862511324618144e-06, + "loss": 0.4098, + "num_input_tokens_seen": 23378912, + "step": 11939 + }, + { + "epoch": 1.5825049701789264, + "grad_norm": 5.755631923675537, + "learning_rate": 2.285905268357896e-06, + "loss": 0.0519, + "num_input_tokens_seen": 23380480, + "step": 11940 + }, + { + "epoch": 1.5826375082836317, + "grad_norm": 0.0422113761305809, + "learning_rate": 2.285559408381872e-06, + "loss": 0.0003, + "num_input_tokens_seen": 23381960, + "step": 11941 + }, + { + "epoch": 1.5827700463883367, + "grad_norm": 2.829979658126831, + "learning_rate": 2.285213552540411e-06, + "loss": 0.0265, + "num_input_tokens_seen": 23383888, + "step": 11942 + }, + { + "epoch": 1.5829025844930418, + "grad_norm": 0.0245696771889925, + "learning_rate": 2.2848677008401805e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23386416, + "step": 11943 + }, + { + "epoch": 1.5830351225977468, + "grad_norm": 8.891592979431152, + "learning_rate": 2.2845218532878496e-06, + "loss": 0.2918, + "num_input_tokens_seen": 23388488, + "step": 11944 + }, + { + "epoch": 1.5831676607024519, + "grad_norm": 4.753551483154297, + "learning_rate": 2.284176009890085e-06, + "loss": 0.2055, + "num_input_tokens_seen": 23390168, + "step": 11945 + }, + { + "epoch": 1.583300198807157, + "grad_norm": 0.036085184663534164, + "learning_rate": 2.283830170653556e-06, + "loss": 0.0003, + "num_input_tokens_seen": 23391936, + "step": 11946 + }, + { + "epoch": 1.5834327369118621, + "grad_norm": 2.4430947303771973, + "learning_rate": 2.283484335584931e-06, + "loss": 0.0421, + "num_input_tokens_seen": 23393616, + "step": 11947 + }, + { + "epoch": 1.5835652750165674, + "grad_norm": 9.587576866149902, + "learning_rate": 2.283138504690878e-06, + "loss": 0.1652, + "num_input_tokens_seen": 23395664, + "step": 11948 + }, + { + "epoch": 1.5836978131212724, + "grad_norm": 0.09076646715402603, + "learning_rate": 2.282792677978063e-06, + "loss": 0.0006, + "num_input_tokens_seen": 23397112, + "step": 11949 + }, + { + "epoch": 1.5838303512259775, + "grad_norm": 0.10874997824430466, + "learning_rate": 2.2824468554531548e-06, + "loss": 0.0006, + "num_input_tokens_seen": 23398328, + "step": 11950 + }, + { + "epoch": 1.5839628893306825, + "grad_norm": 0.1414063274860382, + "learning_rate": 2.2821010371228223e-06, + "loss": 0.0009, + "num_input_tokens_seen": 23399776, + "step": 11951 + }, + { + "epoch": 1.5840954274353876, + "grad_norm": 2.4859063625335693, + "learning_rate": 2.2817552229937317e-06, + "loss": 0.0924, + "num_input_tokens_seen": 23401448, + "step": 11952 + }, + { + "epoch": 1.5842279655400928, + "grad_norm": 8.461024284362793, + "learning_rate": 2.2814094130725514e-06, + "loss": 0.1256, + "num_input_tokens_seen": 23403288, + "step": 11953 + }, + { + "epoch": 1.5843605036447979, + "grad_norm": 3.601931095123291, + "learning_rate": 2.2810636073659477e-06, + "loss": 0.0498, + "num_input_tokens_seen": 23406640, + "step": 11954 + }, + { + "epoch": 1.5844930417495031, + "grad_norm": 6.00985050201416, + "learning_rate": 2.2807178058805883e-06, + "loss": 0.0765, + "num_input_tokens_seen": 23407760, + "step": 11955 + }, + { + "epoch": 1.5846255798542082, + "grad_norm": 2.741731882095337, + "learning_rate": 2.280372008623142e-06, + "loss": 0.0149, + "num_input_tokens_seen": 23409248, + "step": 11956 + }, + { + "epoch": 1.5847581179589132, + "grad_norm": 0.04992765188217163, + "learning_rate": 2.280026215600275e-06, + "loss": 0.0003, + "num_input_tokens_seen": 23410656, + "step": 11957 + }, + { + "epoch": 1.5848906560636182, + "grad_norm": 11.561676025390625, + "learning_rate": 2.279680426818654e-06, + "loss": 0.1557, + "num_input_tokens_seen": 23412480, + "step": 11958 + }, + { + "epoch": 1.5850231941683233, + "grad_norm": 0.11860350519418716, + "learning_rate": 2.279334642284946e-06, + "loss": 0.0008, + "num_input_tokens_seen": 23414208, + "step": 11959 + }, + { + "epoch": 1.5851557322730285, + "grad_norm": 0.17873084545135498, + "learning_rate": 2.278988862005818e-06, + "loss": 0.0013, + "num_input_tokens_seen": 23416488, + "step": 11960 + }, + { + "epoch": 1.5852882703777336, + "grad_norm": 3.4755923748016357, + "learning_rate": 2.2786430859879377e-06, + "loss": 0.055, + "num_input_tokens_seen": 23417872, + "step": 11961 + }, + { + "epoch": 1.5854208084824388, + "grad_norm": 16.926040649414062, + "learning_rate": 2.278297314237972e-06, + "loss": 0.2674, + "num_input_tokens_seen": 23419176, + "step": 11962 + }, + { + "epoch": 1.5855533465871439, + "grad_norm": 0.7425496578216553, + "learning_rate": 2.277951546762586e-06, + "loss": 0.0077, + "num_input_tokens_seen": 23421984, + "step": 11963 + }, + { + "epoch": 1.585685884691849, + "grad_norm": 7.739573955535889, + "learning_rate": 2.2776057835684484e-06, + "loss": 0.2317, + "num_input_tokens_seen": 23424104, + "step": 11964 + }, + { + "epoch": 1.585818422796554, + "grad_norm": 0.1579480618238449, + "learning_rate": 2.2772600246622235e-06, + "loss": 0.0011, + "num_input_tokens_seen": 23427024, + "step": 11965 + }, + { + "epoch": 1.585950960901259, + "grad_norm": 6.025252819061279, + "learning_rate": 2.2769142700505798e-06, + "loss": 0.1281, + "num_input_tokens_seen": 23428712, + "step": 11966 + }, + { + "epoch": 1.5860834990059642, + "grad_norm": 3.7656054496765137, + "learning_rate": 2.2765685197401825e-06, + "loss": 0.0312, + "num_input_tokens_seen": 23431160, + "step": 11967 + }, + { + "epoch": 1.5862160371106693, + "grad_norm": 0.3239903151988983, + "learning_rate": 2.2762227737376986e-06, + "loss": 0.0023, + "num_input_tokens_seen": 23432720, + "step": 11968 + }, + { + "epoch": 1.5863485752153745, + "grad_norm": 6.767882823944092, + "learning_rate": 2.275877032049794e-06, + "loss": 0.1052, + "num_input_tokens_seen": 23435072, + "step": 11969 + }, + { + "epoch": 1.5864811133200796, + "grad_norm": 13.696349143981934, + "learning_rate": 2.275531294683135e-06, + "loss": 0.3848, + "num_input_tokens_seen": 23437320, + "step": 11970 + }, + { + "epoch": 1.5866136514247846, + "grad_norm": 0.4737248718738556, + "learning_rate": 2.2751855616443865e-06, + "loss": 0.004, + "num_input_tokens_seen": 23438672, + "step": 11971 + }, + { + "epoch": 1.5867461895294896, + "grad_norm": 1.248619556427002, + "learning_rate": 2.2748398329402166e-06, + "loss": 0.0078, + "num_input_tokens_seen": 23440456, + "step": 11972 + }, + { + "epoch": 1.5868787276341947, + "grad_norm": 2.4747486114501953, + "learning_rate": 2.274494108577289e-06, + "loss": 0.0157, + "num_input_tokens_seen": 23442312, + "step": 11973 + }, + { + "epoch": 1.5870112657389, + "grad_norm": 3.581362009048462, + "learning_rate": 2.274148388562272e-06, + "loss": 0.0611, + "num_input_tokens_seen": 23444000, + "step": 11974 + }, + { + "epoch": 1.587143803843605, + "grad_norm": 0.11212000250816345, + "learning_rate": 2.2738026729018285e-06, + "loss": 0.0008, + "num_input_tokens_seen": 23446304, + "step": 11975 + }, + { + "epoch": 1.5872763419483102, + "grad_norm": 2.074960708618164, + "learning_rate": 2.2734569616026254e-06, + "loss": 0.0198, + "num_input_tokens_seen": 23448416, + "step": 11976 + }, + { + "epoch": 1.5874088800530153, + "grad_norm": 2.1159725189208984, + "learning_rate": 2.2731112546713298e-06, + "loss": 0.0308, + "num_input_tokens_seen": 23450048, + "step": 11977 + }, + { + "epoch": 1.5875414181577203, + "grad_norm": 7.115092754364014, + "learning_rate": 2.272765552114605e-06, + "loss": 0.1194, + "num_input_tokens_seen": 23452120, + "step": 11978 + }, + { + "epoch": 1.5876739562624254, + "grad_norm": 6.320615291595459, + "learning_rate": 2.2724198539391172e-06, + "loss": 0.1532, + "num_input_tokens_seen": 23453704, + "step": 11979 + }, + { + "epoch": 1.5878064943671304, + "grad_norm": 2.811638116836548, + "learning_rate": 2.2720741601515318e-06, + "loss": 0.0388, + "num_input_tokens_seen": 23455464, + "step": 11980 + }, + { + "epoch": 1.5879390324718357, + "grad_norm": 6.016495227813721, + "learning_rate": 2.271728470758513e-06, + "loss": 0.0564, + "num_input_tokens_seen": 23457648, + "step": 11981 + }, + { + "epoch": 1.588071570576541, + "grad_norm": 1.3111093044281006, + "learning_rate": 2.2713827857667267e-06, + "loss": 0.0237, + "num_input_tokens_seen": 23458960, + "step": 11982 + }, + { + "epoch": 1.588204108681246, + "grad_norm": 0.14139525592327118, + "learning_rate": 2.271037105182839e-06, + "loss": 0.001, + "num_input_tokens_seen": 23461624, + "step": 11983 + }, + { + "epoch": 1.588336646785951, + "grad_norm": 5.6465349197387695, + "learning_rate": 2.270691429013513e-06, + "loss": 0.0891, + "num_input_tokens_seen": 23463280, + "step": 11984 + }, + { + "epoch": 1.588469184890656, + "grad_norm": 12.565289497375488, + "learning_rate": 2.2703457572654156e-06, + "loss": 0.4484, + "num_input_tokens_seen": 23466072, + "step": 11985 + }, + { + "epoch": 1.588601722995361, + "grad_norm": 4.7015252113342285, + "learning_rate": 2.2700000899452092e-06, + "loss": 0.0434, + "num_input_tokens_seen": 23468232, + "step": 11986 + }, + { + "epoch": 1.588734261100066, + "grad_norm": 13.96502685546875, + "learning_rate": 2.2696544270595604e-06, + "loss": 0.2317, + "num_input_tokens_seen": 23470192, + "step": 11987 + }, + { + "epoch": 1.5888667992047714, + "grad_norm": 3.691803455352783, + "learning_rate": 2.2693087686151328e-06, + "loss": 0.0803, + "num_input_tokens_seen": 23472304, + "step": 11988 + }, + { + "epoch": 1.5889993373094766, + "grad_norm": 0.108022540807724, + "learning_rate": 2.268963114618592e-06, + "loss": 0.0004, + "num_input_tokens_seen": 23473744, + "step": 11989 + }, + { + "epoch": 1.5891318754141817, + "grad_norm": 5.816701889038086, + "learning_rate": 2.268617465076601e-06, + "loss": 0.0995, + "num_input_tokens_seen": 23475424, + "step": 11990 + }, + { + "epoch": 1.5892644135188867, + "grad_norm": 2.1570260524749756, + "learning_rate": 2.2682718199958243e-06, + "loss": 0.058, + "num_input_tokens_seen": 23478064, + "step": 11991 + }, + { + "epoch": 1.5893969516235917, + "grad_norm": 0.9933966398239136, + "learning_rate": 2.267926179382928e-06, + "loss": 0.0086, + "num_input_tokens_seen": 23479568, + "step": 11992 + }, + { + "epoch": 1.5895294897282968, + "grad_norm": 14.166241645812988, + "learning_rate": 2.2675805432445743e-06, + "loss": 0.2075, + "num_input_tokens_seen": 23481696, + "step": 11993 + }, + { + "epoch": 1.589662027833002, + "grad_norm": 16.522523880004883, + "learning_rate": 2.267234911587429e-06, + "loss": 0.4768, + "num_input_tokens_seen": 23484376, + "step": 11994 + }, + { + "epoch": 1.589794565937707, + "grad_norm": 0.32498085498809814, + "learning_rate": 2.266889284418155e-06, + "loss": 0.0023, + "num_input_tokens_seen": 23486512, + "step": 11995 + }, + { + "epoch": 1.5899271040424123, + "grad_norm": 0.043517787009477615, + "learning_rate": 2.266543661743416e-06, + "loss": 0.0003, + "num_input_tokens_seen": 23488208, + "step": 11996 + }, + { + "epoch": 1.5900596421471174, + "grad_norm": 3.0836009979248047, + "learning_rate": 2.266198043569875e-06, + "loss": 0.013, + "num_input_tokens_seen": 23489824, + "step": 11997 + }, + { + "epoch": 1.5901921802518224, + "grad_norm": 0.045807354152202606, + "learning_rate": 2.265852429904199e-06, + "loss": 0.0003, + "num_input_tokens_seen": 23491776, + "step": 11998 + }, + { + "epoch": 1.5903247183565274, + "grad_norm": 8.787790298461914, + "learning_rate": 2.2655068207530486e-06, + "loss": 0.1947, + "num_input_tokens_seen": 23494568, + "step": 11999 + }, + { + "epoch": 1.5904572564612325, + "grad_norm": 12.732240676879883, + "learning_rate": 2.2651612161230894e-06, + "loss": 0.178, + "num_input_tokens_seen": 23497144, + "step": 12000 + }, + { + "epoch": 1.5905897945659377, + "grad_norm": 6.9903340339660645, + "learning_rate": 2.2648156160209838e-06, + "loss": 0.0812, + "num_input_tokens_seen": 23499120, + "step": 12001 + }, + { + "epoch": 1.5907223326706428, + "grad_norm": 10.034601211547852, + "learning_rate": 2.2644700204533945e-06, + "loss": 0.192, + "num_input_tokens_seen": 23501688, + "step": 12002 + }, + { + "epoch": 1.590854870775348, + "grad_norm": 0.11681302636861801, + "learning_rate": 2.2641244294269855e-06, + "loss": 0.0008, + "num_input_tokens_seen": 23503840, + "step": 12003 + }, + { + "epoch": 1.590987408880053, + "grad_norm": 4.4627909660339355, + "learning_rate": 2.2637788429484215e-06, + "loss": 0.068, + "num_input_tokens_seen": 23506216, + "step": 12004 + }, + { + "epoch": 1.5911199469847581, + "grad_norm": 5.976934432983398, + "learning_rate": 2.2634332610243633e-06, + "loss": 0.1195, + "num_input_tokens_seen": 23508312, + "step": 12005 + }, + { + "epoch": 1.5912524850894632, + "grad_norm": 0.0939573273062706, + "learning_rate": 2.263087683661476e-06, + "loss": 0.0007, + "num_input_tokens_seen": 23510160, + "step": 12006 + }, + { + "epoch": 1.5913850231941682, + "grad_norm": 0.2906567454338074, + "learning_rate": 2.2627421108664203e-06, + "loss": 0.0017, + "num_input_tokens_seen": 23512960, + "step": 12007 + }, + { + "epoch": 1.5915175612988735, + "grad_norm": 6.636651039123535, + "learning_rate": 2.262396542645861e-06, + "loss": 0.1029, + "num_input_tokens_seen": 23515040, + "step": 12008 + }, + { + "epoch": 1.5916500994035785, + "grad_norm": 4.221116542816162, + "learning_rate": 2.262050979006461e-06, + "loss": 0.0411, + "num_input_tokens_seen": 23517824, + "step": 12009 + }, + { + "epoch": 1.5917826375082837, + "grad_norm": 0.06319459527730942, + "learning_rate": 2.261705419954882e-06, + "loss": 0.0004, + "num_input_tokens_seen": 23520768, + "step": 12010 + }, + { + "epoch": 1.5919151756129888, + "grad_norm": 0.10651376098394394, + "learning_rate": 2.261359865497786e-06, + "loss": 0.0008, + "num_input_tokens_seen": 23523056, + "step": 12011 + }, + { + "epoch": 1.5920477137176938, + "grad_norm": 0.1507127285003662, + "learning_rate": 2.2610143156418367e-06, + "loss": 0.0009, + "num_input_tokens_seen": 23524640, + "step": 12012 + }, + { + "epoch": 1.5921802518223989, + "grad_norm": 10.325170516967773, + "learning_rate": 2.2606687703936965e-06, + "loss": 0.2528, + "num_input_tokens_seen": 23527560, + "step": 12013 + }, + { + "epoch": 1.592312789927104, + "grad_norm": 9.946887016296387, + "learning_rate": 2.2603232297600275e-06, + "loss": 0.1841, + "num_input_tokens_seen": 23529936, + "step": 12014 + }, + { + "epoch": 1.5924453280318092, + "grad_norm": 5.147143840789795, + "learning_rate": 2.259977693747493e-06, + "loss": 0.1389, + "num_input_tokens_seen": 23531896, + "step": 12015 + }, + { + "epoch": 1.5925778661365142, + "grad_norm": 1.8185546398162842, + "learning_rate": 2.259632162362753e-06, + "loss": 0.0165, + "num_input_tokens_seen": 23533496, + "step": 12016 + }, + { + "epoch": 1.5927104042412195, + "grad_norm": 2.1618411540985107, + "learning_rate": 2.25928663561247e-06, + "loss": 0.0456, + "num_input_tokens_seen": 23534992, + "step": 12017 + }, + { + "epoch": 1.5928429423459245, + "grad_norm": 11.428789138793945, + "learning_rate": 2.2589411135033074e-06, + "loss": 0.1621, + "num_input_tokens_seen": 23536712, + "step": 12018 + }, + { + "epoch": 1.5929754804506295, + "grad_norm": 0.06381946057081223, + "learning_rate": 2.2585955960419273e-06, + "loss": 0.0004, + "num_input_tokens_seen": 23538400, + "step": 12019 + }, + { + "epoch": 1.5931080185553346, + "grad_norm": 7.035248279571533, + "learning_rate": 2.2582500832349897e-06, + "loss": 0.171, + "num_input_tokens_seen": 23540568, + "step": 12020 + }, + { + "epoch": 1.5932405566600396, + "grad_norm": 10.3733491897583, + "learning_rate": 2.257904575089158e-06, + "loss": 0.2688, + "num_input_tokens_seen": 23542640, + "step": 12021 + }, + { + "epoch": 1.5933730947647449, + "grad_norm": 4.812440872192383, + "learning_rate": 2.2575590716110924e-06, + "loss": 0.0202, + "num_input_tokens_seen": 23545248, + "step": 12022 + }, + { + "epoch": 1.5935056328694501, + "grad_norm": 4.870852947235107, + "learning_rate": 2.257213572807455e-06, + "loss": 0.0537, + "num_input_tokens_seen": 23547008, + "step": 12023 + }, + { + "epoch": 1.5936381709741552, + "grad_norm": 3.8645684719085693, + "learning_rate": 2.2568680786849083e-06, + "loss": 0.0597, + "num_input_tokens_seen": 23548656, + "step": 12024 + }, + { + "epoch": 1.5937707090788602, + "grad_norm": 0.41583776473999023, + "learning_rate": 2.2565225892501127e-06, + "loss": 0.0028, + "num_input_tokens_seen": 23549888, + "step": 12025 + }, + { + "epoch": 1.5939032471835652, + "grad_norm": 0.04926043003797531, + "learning_rate": 2.2561771045097292e-06, + "loss": 0.0003, + "num_input_tokens_seen": 23551088, + "step": 12026 + }, + { + "epoch": 1.5940357852882703, + "grad_norm": 0.017140159383416176, + "learning_rate": 2.2558316244704197e-06, + "loss": 0.0001, + "num_input_tokens_seen": 23552280, + "step": 12027 + }, + { + "epoch": 1.5941683233929753, + "grad_norm": 6.090574264526367, + "learning_rate": 2.2554861491388434e-06, + "loss": 0.0491, + "num_input_tokens_seen": 23555104, + "step": 12028 + }, + { + "epoch": 1.5943008614976806, + "grad_norm": 0.07023546099662781, + "learning_rate": 2.255140678521664e-06, + "loss": 0.0005, + "num_input_tokens_seen": 23556480, + "step": 12029 + }, + { + "epoch": 1.5944333996023858, + "grad_norm": 9.635007858276367, + "learning_rate": 2.2547952126255414e-06, + "loss": 0.1734, + "num_input_tokens_seen": 23558264, + "step": 12030 + }, + { + "epoch": 1.5945659377070909, + "grad_norm": 0.06144785135984421, + "learning_rate": 2.254449751457136e-06, + "loss": 0.0012, + "num_input_tokens_seen": 23560264, + "step": 12031 + }, + { + "epoch": 1.594698475811796, + "grad_norm": 7.15757417678833, + "learning_rate": 2.2541042950231093e-06, + "loss": 0.142, + "num_input_tokens_seen": 23563328, + "step": 12032 + }, + { + "epoch": 1.594831013916501, + "grad_norm": 11.77768611907959, + "learning_rate": 2.2537588433301203e-06, + "loss": 0.1332, + "num_input_tokens_seen": 23565016, + "step": 12033 + }, + { + "epoch": 1.594963552021206, + "grad_norm": 7.994359016418457, + "learning_rate": 2.2534133963848314e-06, + "loss": 0.1303, + "num_input_tokens_seen": 23566696, + "step": 12034 + }, + { + "epoch": 1.5950960901259112, + "grad_norm": 2.325803279876709, + "learning_rate": 2.2530679541939024e-06, + "loss": 0.0321, + "num_input_tokens_seen": 23568384, + "step": 12035 + }, + { + "epoch": 1.5952286282306163, + "grad_norm": 3.0436060428619385, + "learning_rate": 2.2527225167639936e-06, + "loss": 0.0304, + "num_input_tokens_seen": 23570288, + "step": 12036 + }, + { + "epoch": 1.5953611663353215, + "grad_norm": 0.03247248753905296, + "learning_rate": 2.252377084101765e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23571648, + "step": 12037 + }, + { + "epoch": 1.5954937044400266, + "grad_norm": 4.5432448387146, + "learning_rate": 2.2520316562138765e-06, + "loss": 0.0422, + "num_input_tokens_seen": 23574200, + "step": 12038 + }, + { + "epoch": 1.5956262425447316, + "grad_norm": 4.093209266662598, + "learning_rate": 2.25168623310699e-06, + "loss": 0.1059, + "num_input_tokens_seen": 23576048, + "step": 12039 + }, + { + "epoch": 1.5957587806494367, + "grad_norm": 11.41279125213623, + "learning_rate": 2.251340814787764e-06, + "loss": 0.2318, + "num_input_tokens_seen": 23578136, + "step": 12040 + }, + { + "epoch": 1.5958913187541417, + "grad_norm": 9.939678192138672, + "learning_rate": 2.250995401262858e-06, + "loss": 0.3404, + "num_input_tokens_seen": 23579720, + "step": 12041 + }, + { + "epoch": 1.596023856858847, + "grad_norm": 19.935405731201172, + "learning_rate": 2.250649992538933e-06, + "loss": 0.1322, + "num_input_tokens_seen": 23581400, + "step": 12042 + }, + { + "epoch": 1.596156394963552, + "grad_norm": 3.9748375415802, + "learning_rate": 2.250304588622647e-06, + "loss": 0.0503, + "num_input_tokens_seen": 23583840, + "step": 12043 + }, + { + "epoch": 1.5962889330682573, + "grad_norm": 9.957401275634766, + "learning_rate": 2.2499591895206616e-06, + "loss": 0.0559, + "num_input_tokens_seen": 23585616, + "step": 12044 + }, + { + "epoch": 1.5964214711729623, + "grad_norm": 3.113889694213867, + "learning_rate": 2.249613795239636e-06, + "loss": 0.0319, + "num_input_tokens_seen": 23589016, + "step": 12045 + }, + { + "epoch": 1.5965540092776673, + "grad_norm": 8.364652633666992, + "learning_rate": 2.2492684057862285e-06, + "loss": 0.1706, + "num_input_tokens_seen": 23590984, + "step": 12046 + }, + { + "epoch": 1.5966865473823724, + "grad_norm": 13.393416404724121, + "learning_rate": 2.2489230211671e-06, + "loss": 0.1564, + "num_input_tokens_seen": 23592888, + "step": 12047 + }, + { + "epoch": 1.5968190854870774, + "grad_norm": 2.425830602645874, + "learning_rate": 2.248577641388907e-06, + "loss": 0.0155, + "num_input_tokens_seen": 23594240, + "step": 12048 + }, + { + "epoch": 1.5969516235917827, + "grad_norm": 9.745949745178223, + "learning_rate": 2.2482322664583125e-06, + "loss": 0.1156, + "num_input_tokens_seen": 23596344, + "step": 12049 + }, + { + "epoch": 1.5970841616964877, + "grad_norm": 0.1863100528717041, + "learning_rate": 2.2478868963819725e-06, + "loss": 0.0011, + "num_input_tokens_seen": 23597784, + "step": 12050 + }, + { + "epoch": 1.597216699801193, + "grad_norm": 5.555985450744629, + "learning_rate": 2.247541531166548e-06, + "loss": 0.0306, + "num_input_tokens_seen": 23599744, + "step": 12051 + }, + { + "epoch": 1.597349237905898, + "grad_norm": 7.286861419677734, + "learning_rate": 2.247196170818696e-06, + "loss": 0.0855, + "num_input_tokens_seen": 23601440, + "step": 12052 + }, + { + "epoch": 1.597481776010603, + "grad_norm": 0.03255829960107803, + "learning_rate": 2.246850815345077e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23603024, + "step": 12053 + }, + { + "epoch": 1.597614314115308, + "grad_norm": 10.03581428527832, + "learning_rate": 2.2465054647523477e-06, + "loss": 0.2557, + "num_input_tokens_seen": 23605136, + "step": 12054 + }, + { + "epoch": 1.5977468522200131, + "grad_norm": 12.676031112670898, + "learning_rate": 2.2461601190471688e-06, + "loss": 0.225, + "num_input_tokens_seen": 23608496, + "step": 12055 + }, + { + "epoch": 1.5978793903247184, + "grad_norm": 5.926129341125488, + "learning_rate": 2.245814778236198e-06, + "loss": 0.0662, + "num_input_tokens_seen": 23611240, + "step": 12056 + }, + { + "epoch": 1.5980119284294234, + "grad_norm": 4.3048810958862305, + "learning_rate": 2.2454694423260935e-06, + "loss": 0.0966, + "num_input_tokens_seen": 23614336, + "step": 12057 + }, + { + "epoch": 1.5981444665341287, + "grad_norm": 0.2580191195011139, + "learning_rate": 2.2451241113235134e-06, + "loss": 0.0015, + "num_input_tokens_seen": 23616680, + "step": 12058 + }, + { + "epoch": 1.5982770046388337, + "grad_norm": 10.080615043640137, + "learning_rate": 2.244778785235116e-06, + "loss": 0.1924, + "num_input_tokens_seen": 23619168, + "step": 12059 + }, + { + "epoch": 1.5984095427435387, + "grad_norm": 2.785395622253418, + "learning_rate": 2.2444334640675605e-06, + "loss": 0.0168, + "num_input_tokens_seen": 23620712, + "step": 12060 + }, + { + "epoch": 1.5985420808482438, + "grad_norm": 13.239164352416992, + "learning_rate": 2.2440881478275035e-06, + "loss": 0.4522, + "num_input_tokens_seen": 23622944, + "step": 12061 + }, + { + "epoch": 1.5986746189529488, + "grad_norm": 5.530117034912109, + "learning_rate": 2.2437428365216048e-06, + "loss": 0.0441, + "num_input_tokens_seen": 23624376, + "step": 12062 + }, + { + "epoch": 1.598807157057654, + "grad_norm": 2.131474018096924, + "learning_rate": 2.2433975301565204e-06, + "loss": 0.0513, + "num_input_tokens_seen": 23626312, + "step": 12063 + }, + { + "epoch": 1.5989396951623593, + "grad_norm": 4.599630832672119, + "learning_rate": 2.243052228738907e-06, + "loss": 0.0557, + "num_input_tokens_seen": 23628120, + "step": 12064 + }, + { + "epoch": 1.5990722332670644, + "grad_norm": 1.8975003957748413, + "learning_rate": 2.2427069322754253e-06, + "loss": 0.0291, + "num_input_tokens_seen": 23630120, + "step": 12065 + }, + { + "epoch": 1.5992047713717694, + "grad_norm": 1.3744724988937378, + "learning_rate": 2.2423616407727317e-06, + "loss": 0.0262, + "num_input_tokens_seen": 23631912, + "step": 12066 + }, + { + "epoch": 1.5993373094764745, + "grad_norm": 1.3285212516784668, + "learning_rate": 2.242016354237483e-06, + "loss": 0.0086, + "num_input_tokens_seen": 23633504, + "step": 12067 + }, + { + "epoch": 1.5994698475811795, + "grad_norm": 0.34640151262283325, + "learning_rate": 2.2416710726763373e-06, + "loss": 0.0015, + "num_input_tokens_seen": 23634936, + "step": 12068 + }, + { + "epoch": 1.5996023856858845, + "grad_norm": 6.452073574066162, + "learning_rate": 2.241325796095951e-06, + "loss": 0.1873, + "num_input_tokens_seen": 23637376, + "step": 12069 + }, + { + "epoch": 1.5997349237905898, + "grad_norm": 2.916264533996582, + "learning_rate": 2.2409805245029816e-06, + "loss": 0.0114, + "num_input_tokens_seen": 23638984, + "step": 12070 + }, + { + "epoch": 1.599867461895295, + "grad_norm": 16.574281692504883, + "learning_rate": 2.240635257904087e-06, + "loss": 0.2884, + "num_input_tokens_seen": 23641584, + "step": 12071 + }, + { + "epoch": 1.6, + "grad_norm": 0.5606725811958313, + "learning_rate": 2.2402899963059243e-06, + "loss": 0.0095, + "num_input_tokens_seen": 23643232, + "step": 12072 + }, + { + "epoch": 1.6001325381047051, + "grad_norm": 4.341292381286621, + "learning_rate": 2.2399447397151483e-06, + "loss": 0.0386, + "num_input_tokens_seen": 23644632, + "step": 12073 + }, + { + "epoch": 1.6002650762094102, + "grad_norm": 2.5793914794921875, + "learning_rate": 2.2395994881384166e-06, + "loss": 0.045, + "num_input_tokens_seen": 23646016, + "step": 12074 + }, + { + "epoch": 1.6003976143141152, + "grad_norm": 1.784447431564331, + "learning_rate": 2.239254241582388e-06, + "loss": 0.0264, + "num_input_tokens_seen": 23648408, + "step": 12075 + }, + { + "epoch": 1.6005301524188205, + "grad_norm": 11.753952026367188, + "learning_rate": 2.2389090000537166e-06, + "loss": 0.1769, + "num_input_tokens_seen": 23650296, + "step": 12076 + }, + { + "epoch": 1.6006626905235255, + "grad_norm": 3.830230236053467, + "learning_rate": 2.2385637635590604e-06, + "loss": 0.1603, + "num_input_tokens_seen": 23651904, + "step": 12077 + }, + { + "epoch": 1.6007952286282308, + "grad_norm": 16.25090980529785, + "learning_rate": 2.238218532105075e-06, + "loss": 0.2608, + "num_input_tokens_seen": 23654064, + "step": 12078 + }, + { + "epoch": 1.6009277667329358, + "grad_norm": 0.06424489617347717, + "learning_rate": 2.2378733056984166e-06, + "loss": 0.0003, + "num_input_tokens_seen": 23656296, + "step": 12079 + }, + { + "epoch": 1.6010603048376408, + "grad_norm": 18.754308700561523, + "learning_rate": 2.2375280843457415e-06, + "loss": 0.2702, + "num_input_tokens_seen": 23658752, + "step": 12080 + }, + { + "epoch": 1.6011928429423459, + "grad_norm": 0.4402199685573578, + "learning_rate": 2.2371828680537066e-06, + "loss": 0.0019, + "num_input_tokens_seen": 23660688, + "step": 12081 + }, + { + "epoch": 1.601325381047051, + "grad_norm": 2.9767754077911377, + "learning_rate": 2.2368376568289667e-06, + "loss": 0.038, + "num_input_tokens_seen": 23662664, + "step": 12082 + }, + { + "epoch": 1.6014579191517562, + "grad_norm": 3.2979800701141357, + "learning_rate": 2.2364924506781795e-06, + "loss": 0.0462, + "num_input_tokens_seen": 23663872, + "step": 12083 + }, + { + "epoch": 1.6015904572564612, + "grad_norm": 0.8572508692741394, + "learning_rate": 2.2361472496079984e-06, + "loss": 0.0084, + "num_input_tokens_seen": 23665096, + "step": 12084 + }, + { + "epoch": 1.6017229953611665, + "grad_norm": 0.3461538553237915, + "learning_rate": 2.2358020536250802e-06, + "loss": 0.0021, + "num_input_tokens_seen": 23666456, + "step": 12085 + }, + { + "epoch": 1.6018555334658715, + "grad_norm": 0.012356928549706936, + "learning_rate": 2.2354568627360814e-06, + "loss": 0.0001, + "num_input_tokens_seen": 23667640, + "step": 12086 + }, + { + "epoch": 1.6019880715705765, + "grad_norm": 0.007093160878866911, + "learning_rate": 2.235111676947657e-06, + "loss": 0.0, + "num_input_tokens_seen": 23669040, + "step": 12087 + }, + { + "epoch": 1.6021206096752816, + "grad_norm": 11.110785484313965, + "learning_rate": 2.2347664962664615e-06, + "loss": 0.168, + "num_input_tokens_seen": 23672136, + "step": 12088 + }, + { + "epoch": 1.6022531477799866, + "grad_norm": 4.414505958557129, + "learning_rate": 2.234421320699152e-06, + "loss": 0.0555, + "num_input_tokens_seen": 23674040, + "step": 12089 + }, + { + "epoch": 1.6023856858846919, + "grad_norm": 14.67821979522705, + "learning_rate": 2.234076150252381e-06, + "loss": 0.3394, + "num_input_tokens_seen": 23675800, + "step": 12090 + }, + { + "epoch": 1.602518223989397, + "grad_norm": 9.597174644470215, + "learning_rate": 2.233730984932806e-06, + "loss": 0.2338, + "num_input_tokens_seen": 23677872, + "step": 12091 + }, + { + "epoch": 1.6026507620941022, + "grad_norm": 10.621548652648926, + "learning_rate": 2.2333858247470817e-06, + "loss": 0.0426, + "num_input_tokens_seen": 23679944, + "step": 12092 + }, + { + "epoch": 1.6027833001988072, + "grad_norm": 8.552433013916016, + "learning_rate": 2.233040669701863e-06, + "loss": 0.2142, + "num_input_tokens_seen": 23682248, + "step": 12093 + }, + { + "epoch": 1.6029158383035123, + "grad_norm": 17.8165225982666, + "learning_rate": 2.2326955198038035e-06, + "loss": 0.1476, + "num_input_tokens_seen": 23684192, + "step": 12094 + }, + { + "epoch": 1.6030483764082173, + "grad_norm": 3.8980014324188232, + "learning_rate": 2.2323503750595584e-06, + "loss": 0.069, + "num_input_tokens_seen": 23685968, + "step": 12095 + }, + { + "epoch": 1.6031809145129223, + "grad_norm": 4.095605850219727, + "learning_rate": 2.232005235475784e-06, + "loss": 0.0695, + "num_input_tokens_seen": 23688008, + "step": 12096 + }, + { + "epoch": 1.6033134526176276, + "grad_norm": 0.07420234382152557, + "learning_rate": 2.2316601010591324e-06, + "loss": 0.0004, + "num_input_tokens_seen": 23689200, + "step": 12097 + }, + { + "epoch": 1.6034459907223326, + "grad_norm": 17.663143157958984, + "learning_rate": 2.23131497181626e-06, + "loss": 0.4008, + "num_input_tokens_seen": 23691528, + "step": 12098 + }, + { + "epoch": 1.6035785288270379, + "grad_norm": 5.747623443603516, + "learning_rate": 2.23096984775382e-06, + "loss": 0.1598, + "num_input_tokens_seen": 23693224, + "step": 12099 + }, + { + "epoch": 1.603711066931743, + "grad_norm": 0.09166564792394638, + "learning_rate": 2.230624728878466e-06, + "loss": 0.0005, + "num_input_tokens_seen": 23694736, + "step": 12100 + }, + { + "epoch": 1.603843605036448, + "grad_norm": 5.317575454711914, + "learning_rate": 2.230279615196854e-06, + "loss": 0.1417, + "num_input_tokens_seen": 23696600, + "step": 12101 + }, + { + "epoch": 1.603976143141153, + "grad_norm": 4.6134538650512695, + "learning_rate": 2.229934506715638e-06, + "loss": 0.0453, + "num_input_tokens_seen": 23698768, + "step": 12102 + }, + { + "epoch": 1.604108681245858, + "grad_norm": 10.47490119934082, + "learning_rate": 2.2295894034414698e-06, + "loss": 0.2076, + "num_input_tokens_seen": 23701032, + "step": 12103 + }, + { + "epoch": 1.6042412193505633, + "grad_norm": 0.7334733009338379, + "learning_rate": 2.229244305381005e-06, + "loss": 0.0076, + "num_input_tokens_seen": 23702520, + "step": 12104 + }, + { + "epoch": 1.6043737574552683, + "grad_norm": 7.444337844848633, + "learning_rate": 2.228899212540896e-06, + "loss": 0.2234, + "num_input_tokens_seen": 23704664, + "step": 12105 + }, + { + "epoch": 1.6045062955599736, + "grad_norm": 0.06223238632082939, + "learning_rate": 2.2285541249277973e-06, + "loss": 0.0003, + "num_input_tokens_seen": 23706016, + "step": 12106 + }, + { + "epoch": 1.6046388336646786, + "grad_norm": 12.7566556930542, + "learning_rate": 2.228209042548363e-06, + "loss": 0.2054, + "num_input_tokens_seen": 23708592, + "step": 12107 + }, + { + "epoch": 1.6047713717693837, + "grad_norm": 0.07718707621097565, + "learning_rate": 2.2278639654092456e-06, + "loss": 0.0005, + "num_input_tokens_seen": 23710488, + "step": 12108 + }, + { + "epoch": 1.6049039098740887, + "grad_norm": 4.496670722961426, + "learning_rate": 2.227518893517099e-06, + "loss": 0.1394, + "num_input_tokens_seen": 23712448, + "step": 12109 + }, + { + "epoch": 1.6050364479787937, + "grad_norm": 12.513678550720215, + "learning_rate": 2.227173826878575e-06, + "loss": 0.1975, + "num_input_tokens_seen": 23714136, + "step": 12110 + }, + { + "epoch": 1.605168986083499, + "grad_norm": 12.951325416564941, + "learning_rate": 2.226828765500329e-06, + "loss": 0.3007, + "num_input_tokens_seen": 23716624, + "step": 12111 + }, + { + "epoch": 1.6053015241882043, + "grad_norm": 7.823023796081543, + "learning_rate": 2.226483709389012e-06, + "loss": 0.0918, + "num_input_tokens_seen": 23718376, + "step": 12112 + }, + { + "epoch": 1.6054340622929093, + "grad_norm": 6.269279479980469, + "learning_rate": 2.2261386585512787e-06, + "loss": 0.135, + "num_input_tokens_seen": 23720416, + "step": 12113 + }, + { + "epoch": 1.6055666003976143, + "grad_norm": 8.583213806152344, + "learning_rate": 2.2257936129937806e-06, + "loss": 0.2076, + "num_input_tokens_seen": 23723056, + "step": 12114 + }, + { + "epoch": 1.6056991385023194, + "grad_norm": 5.151157379150391, + "learning_rate": 2.225448572723171e-06, + "loss": 0.1087, + "num_input_tokens_seen": 23724376, + "step": 12115 + }, + { + "epoch": 1.6058316766070244, + "grad_norm": 4.307162761688232, + "learning_rate": 2.2251035377461015e-06, + "loss": 0.041, + "num_input_tokens_seen": 23725952, + "step": 12116 + }, + { + "epoch": 1.6059642147117297, + "grad_norm": 3.728837251663208, + "learning_rate": 2.224758508069226e-06, + "loss": 0.0468, + "num_input_tokens_seen": 23727456, + "step": 12117 + }, + { + "epoch": 1.6060967528164347, + "grad_norm": 0.5464895367622375, + "learning_rate": 2.224413483699196e-06, + "loss": 0.0043, + "num_input_tokens_seen": 23729256, + "step": 12118 + }, + { + "epoch": 1.60622929092114, + "grad_norm": 4.899016857147217, + "learning_rate": 2.224068464642665e-06, + "loss": 0.0786, + "num_input_tokens_seen": 23732024, + "step": 12119 + }, + { + "epoch": 1.606361829025845, + "grad_norm": 14.704739570617676, + "learning_rate": 2.223723450906284e-06, + "loss": 0.6352, + "num_input_tokens_seen": 23735016, + "step": 12120 + }, + { + "epoch": 1.60649436713055, + "grad_norm": 9.16898250579834, + "learning_rate": 2.2233784424967045e-06, + "loss": 0.2791, + "num_input_tokens_seen": 23736784, + "step": 12121 + }, + { + "epoch": 1.606626905235255, + "grad_norm": 2.2359039783477783, + "learning_rate": 2.2230334394205807e-06, + "loss": 0.0458, + "num_input_tokens_seen": 23738264, + "step": 12122 + }, + { + "epoch": 1.6067594433399601, + "grad_norm": 0.3673429787158966, + "learning_rate": 2.2226884416845625e-06, + "loss": 0.0048, + "num_input_tokens_seen": 23739744, + "step": 12123 + }, + { + "epoch": 1.6068919814446654, + "grad_norm": 7.755154609680176, + "learning_rate": 2.222343449295303e-06, + "loss": 0.1323, + "num_input_tokens_seen": 23742312, + "step": 12124 + }, + { + "epoch": 1.6070245195493704, + "grad_norm": 6.227529048919678, + "learning_rate": 2.2219984622594536e-06, + "loss": 0.1242, + "num_input_tokens_seen": 23744384, + "step": 12125 + }, + { + "epoch": 1.6071570576540757, + "grad_norm": 0.8258019089698792, + "learning_rate": 2.2216534805836642e-06, + "loss": 0.004, + "num_input_tokens_seen": 23747992, + "step": 12126 + }, + { + "epoch": 1.6072895957587807, + "grad_norm": 12.757667541503906, + "learning_rate": 2.2213085042745888e-06, + "loss": 0.2904, + "num_input_tokens_seen": 23750440, + "step": 12127 + }, + { + "epoch": 1.6074221338634858, + "grad_norm": 11.85292911529541, + "learning_rate": 2.2209635333388777e-06, + "loss": 0.202, + "num_input_tokens_seen": 23752408, + "step": 12128 + }, + { + "epoch": 1.6075546719681908, + "grad_norm": 0.10564951598644257, + "learning_rate": 2.220618567783182e-06, + "loss": 0.0007, + "num_input_tokens_seen": 23753760, + "step": 12129 + }, + { + "epoch": 1.6076872100728958, + "grad_norm": 7.963826656341553, + "learning_rate": 2.2202736076141533e-06, + "loss": 0.1918, + "num_input_tokens_seen": 23755616, + "step": 12130 + }, + { + "epoch": 1.607819748177601, + "grad_norm": 4.999587059020996, + "learning_rate": 2.2199286528384414e-06, + "loss": 0.0739, + "num_input_tokens_seen": 23757544, + "step": 12131 + }, + { + "epoch": 1.6079522862823061, + "grad_norm": 7.816965579986572, + "learning_rate": 2.2195837034626995e-06, + "loss": 0.1504, + "num_input_tokens_seen": 23759440, + "step": 12132 + }, + { + "epoch": 1.6080848243870114, + "grad_norm": 8.595309257507324, + "learning_rate": 2.219238759493577e-06, + "loss": 0.14, + "num_input_tokens_seen": 23761608, + "step": 12133 + }, + { + "epoch": 1.6082173624917164, + "grad_norm": 6.3340535163879395, + "learning_rate": 2.218893820937725e-06, + "loss": 0.0514, + "num_input_tokens_seen": 23763184, + "step": 12134 + }, + { + "epoch": 1.6083499005964215, + "grad_norm": 0.1750938892364502, + "learning_rate": 2.2185488878017934e-06, + "loss": 0.0013, + "num_input_tokens_seen": 23765240, + "step": 12135 + }, + { + "epoch": 1.6084824387011265, + "grad_norm": 1.368272304534912, + "learning_rate": 2.218203960092433e-06, + "loss": 0.0114, + "num_input_tokens_seen": 23766736, + "step": 12136 + }, + { + "epoch": 1.6086149768058315, + "grad_norm": 0.23667441308498383, + "learning_rate": 2.2178590378162957e-06, + "loss": 0.0017, + "num_input_tokens_seen": 23768176, + "step": 12137 + }, + { + "epoch": 1.6087475149105368, + "grad_norm": 1.089863896369934, + "learning_rate": 2.2175141209800306e-06, + "loss": 0.0076, + "num_input_tokens_seen": 23770864, + "step": 12138 + }, + { + "epoch": 1.6088800530152418, + "grad_norm": 4.311971664428711, + "learning_rate": 2.2171692095902885e-06, + "loss": 0.0177, + "num_input_tokens_seen": 23772016, + "step": 12139 + }, + { + "epoch": 1.609012591119947, + "grad_norm": 2.144955635070801, + "learning_rate": 2.216824303653719e-06, + "loss": 0.0117, + "num_input_tokens_seen": 23773832, + "step": 12140 + }, + { + "epoch": 1.6091451292246521, + "grad_norm": 0.579552173614502, + "learning_rate": 2.2164794031769717e-06, + "loss": 0.004, + "num_input_tokens_seen": 23775696, + "step": 12141 + }, + { + "epoch": 1.6092776673293572, + "grad_norm": 7.876455307006836, + "learning_rate": 2.216134508166697e-06, + "loss": 0.191, + "num_input_tokens_seen": 23777464, + "step": 12142 + }, + { + "epoch": 1.6094102054340622, + "grad_norm": 1.7364652156829834, + "learning_rate": 2.2157896186295452e-06, + "loss": 0.0105, + "num_input_tokens_seen": 23779600, + "step": 12143 + }, + { + "epoch": 1.6095427435387673, + "grad_norm": 2.3677937984466553, + "learning_rate": 2.2154447345721655e-06, + "loss": 0.0159, + "num_input_tokens_seen": 23780688, + "step": 12144 + }, + { + "epoch": 1.6096752816434725, + "grad_norm": 8.735386848449707, + "learning_rate": 2.2150998560012084e-06, + "loss": 0.0724, + "num_input_tokens_seen": 23782728, + "step": 12145 + }, + { + "epoch": 1.6098078197481775, + "grad_norm": 0.06201193109154701, + "learning_rate": 2.214754982923322e-06, + "loss": 0.0004, + "num_input_tokens_seen": 23784336, + "step": 12146 + }, + { + "epoch": 1.6099403578528828, + "grad_norm": 8.60351276397705, + "learning_rate": 2.2144101153451556e-06, + "loss": 0.154, + "num_input_tokens_seen": 23786096, + "step": 12147 + }, + { + "epoch": 1.6100728959575878, + "grad_norm": 12.183140754699707, + "learning_rate": 2.2140652532733597e-06, + "loss": 0.1779, + "num_input_tokens_seen": 23787728, + "step": 12148 + }, + { + "epoch": 1.6102054340622929, + "grad_norm": 0.7764325141906738, + "learning_rate": 2.2137203967145836e-06, + "loss": 0.0133, + "num_input_tokens_seen": 23789120, + "step": 12149 + }, + { + "epoch": 1.610337972166998, + "grad_norm": 0.46567386388778687, + "learning_rate": 2.2133755456754753e-06, + "loss": 0.0022, + "num_input_tokens_seen": 23790624, + "step": 12150 + }, + { + "epoch": 1.610470510271703, + "grad_norm": 0.22973310947418213, + "learning_rate": 2.2130307001626843e-06, + "loss": 0.0016, + "num_input_tokens_seen": 23792976, + "step": 12151 + }, + { + "epoch": 1.6106030483764082, + "grad_norm": 5.251440048217773, + "learning_rate": 2.2126858601828587e-06, + "loss": 0.0812, + "num_input_tokens_seen": 23795824, + "step": 12152 + }, + { + "epoch": 1.6107355864811135, + "grad_norm": 0.01834111474454403, + "learning_rate": 2.2123410257426485e-06, + "loss": 0.0001, + "num_input_tokens_seen": 23797272, + "step": 12153 + }, + { + "epoch": 1.6108681245858185, + "grad_norm": 2.3947370052337646, + "learning_rate": 2.211996196848702e-06, + "loss": 0.0113, + "num_input_tokens_seen": 23799352, + "step": 12154 + }, + { + "epoch": 1.6110006626905236, + "grad_norm": 12.605746269226074, + "learning_rate": 2.2116513735076675e-06, + "loss": 0.2007, + "num_input_tokens_seen": 23801032, + "step": 12155 + }, + { + "epoch": 1.6111332007952286, + "grad_norm": 0.2836482524871826, + "learning_rate": 2.211306555726193e-06, + "loss": 0.0015, + "num_input_tokens_seen": 23803352, + "step": 12156 + }, + { + "epoch": 1.6112657388999336, + "grad_norm": 11.056133270263672, + "learning_rate": 2.210961743510927e-06, + "loss": 0.2395, + "num_input_tokens_seen": 23805064, + "step": 12157 + }, + { + "epoch": 1.6113982770046387, + "grad_norm": 4.949639320373535, + "learning_rate": 2.210616936868519e-06, + "loss": 0.0773, + "num_input_tokens_seen": 23806920, + "step": 12158 + }, + { + "epoch": 1.611530815109344, + "grad_norm": 1.883800745010376, + "learning_rate": 2.2102721358056152e-06, + "loss": 0.021, + "num_input_tokens_seen": 23808608, + "step": 12159 + }, + { + "epoch": 1.6116633532140492, + "grad_norm": 23.483491897583008, + "learning_rate": 2.209927340328865e-06, + "loss": 0.496, + "num_input_tokens_seen": 23810704, + "step": 12160 + }, + { + "epoch": 1.6117958913187542, + "grad_norm": 8.260041236877441, + "learning_rate": 2.2095825504449155e-06, + "loss": 0.1199, + "num_input_tokens_seen": 23812280, + "step": 12161 + }, + { + "epoch": 1.6119284294234593, + "grad_norm": 8.29055118560791, + "learning_rate": 2.209237766160414e-06, + "loss": 0.2075, + "num_input_tokens_seen": 23814520, + "step": 12162 + }, + { + "epoch": 1.6120609675281643, + "grad_norm": 5.005826950073242, + "learning_rate": 2.2088929874820103e-06, + "loss": 0.1284, + "num_input_tokens_seen": 23818256, + "step": 12163 + }, + { + "epoch": 1.6121935056328693, + "grad_norm": 1.2379045486450195, + "learning_rate": 2.2085482144163505e-06, + "loss": 0.0143, + "num_input_tokens_seen": 23820704, + "step": 12164 + }, + { + "epoch": 1.6123260437375746, + "grad_norm": 7.655517578125, + "learning_rate": 2.2082034469700817e-06, + "loss": 0.0894, + "num_input_tokens_seen": 23822448, + "step": 12165 + }, + { + "epoch": 1.6124585818422796, + "grad_norm": 7.961968421936035, + "learning_rate": 2.2078586851498524e-06, + "loss": 0.1259, + "num_input_tokens_seen": 23824072, + "step": 12166 + }, + { + "epoch": 1.612591119946985, + "grad_norm": 6.188247203826904, + "learning_rate": 2.207513928962308e-06, + "loss": 0.1398, + "num_input_tokens_seen": 23825456, + "step": 12167 + }, + { + "epoch": 1.61272365805169, + "grad_norm": 9.647710800170898, + "learning_rate": 2.2071691784140975e-06, + "loss": 0.1544, + "num_input_tokens_seen": 23828104, + "step": 12168 + }, + { + "epoch": 1.612856196156395, + "grad_norm": 0.005271644797176123, + "learning_rate": 2.2068244335118673e-06, + "loss": 0.0, + "num_input_tokens_seen": 23829312, + "step": 12169 + }, + { + "epoch": 1.6129887342611, + "grad_norm": 1.428570032119751, + "learning_rate": 2.206479694262265e-06, + "loss": 0.0133, + "num_input_tokens_seen": 23831008, + "step": 12170 + }, + { + "epoch": 1.613121272365805, + "grad_norm": 0.07478219270706177, + "learning_rate": 2.2061349606719357e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23832640, + "step": 12171 + }, + { + "epoch": 1.6132538104705103, + "grad_norm": 2.7547760009765625, + "learning_rate": 2.2057902327475276e-06, + "loss": 0.0397, + "num_input_tokens_seen": 23834736, + "step": 12172 + }, + { + "epoch": 1.6133863485752153, + "grad_norm": 0.0428653322160244, + "learning_rate": 2.2054455104956856e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23836536, + "step": 12173 + }, + { + "epoch": 1.6135188866799206, + "grad_norm": 8.600788116455078, + "learning_rate": 2.2051007939230585e-06, + "loss": 0.1556, + "num_input_tokens_seen": 23838424, + "step": 12174 + }, + { + "epoch": 1.6136514247846256, + "grad_norm": 11.870600700378418, + "learning_rate": 2.2047560830362916e-06, + "loss": 0.3106, + "num_input_tokens_seen": 23840848, + "step": 12175 + }, + { + "epoch": 1.6137839628893307, + "grad_norm": 0.012344558723270893, + "learning_rate": 2.2044113778420305e-06, + "loss": 0.0001, + "num_input_tokens_seen": 23843888, + "step": 12176 + }, + { + "epoch": 1.6139165009940357, + "grad_norm": 7.462380886077881, + "learning_rate": 2.2040666783469225e-06, + "loss": 0.1157, + "num_input_tokens_seen": 23846312, + "step": 12177 + }, + { + "epoch": 1.6140490390987408, + "grad_norm": 0.04515473544597626, + "learning_rate": 2.2037219845576123e-06, + "loss": 0.0003, + "num_input_tokens_seen": 23849448, + "step": 12178 + }, + { + "epoch": 1.614181577203446, + "grad_norm": 3.505774974822998, + "learning_rate": 2.2033772964807477e-06, + "loss": 0.0557, + "num_input_tokens_seen": 23851440, + "step": 12179 + }, + { + "epoch": 1.614314115308151, + "grad_norm": 7.5861711502075195, + "learning_rate": 2.2030326141229725e-06, + "loss": 0.1633, + "num_input_tokens_seen": 23853912, + "step": 12180 + }, + { + "epoch": 1.6144466534128563, + "grad_norm": 0.038526929914951324, + "learning_rate": 2.2026879374909342e-06, + "loss": 0.0003, + "num_input_tokens_seen": 23856944, + "step": 12181 + }, + { + "epoch": 1.6145791915175614, + "grad_norm": 0.11298848688602448, + "learning_rate": 2.202343266591277e-06, + "loss": 0.0008, + "num_input_tokens_seen": 23858536, + "step": 12182 + }, + { + "epoch": 1.6147117296222664, + "grad_norm": 0.09078644961118698, + "learning_rate": 2.201998601430646e-06, + "loss": 0.0006, + "num_input_tokens_seen": 23860280, + "step": 12183 + }, + { + "epoch": 1.6148442677269714, + "grad_norm": 0.14313864707946777, + "learning_rate": 2.201653942015689e-06, + "loss": 0.001, + "num_input_tokens_seen": 23862208, + "step": 12184 + }, + { + "epoch": 1.6149768058316765, + "grad_norm": 13.206945419311523, + "learning_rate": 2.2013092883530503e-06, + "loss": 0.2508, + "num_input_tokens_seen": 23864280, + "step": 12185 + }, + { + "epoch": 1.6151093439363817, + "grad_norm": 5.092008590698242, + "learning_rate": 2.2009646404493737e-06, + "loss": 0.0381, + "num_input_tokens_seen": 23865992, + "step": 12186 + }, + { + "epoch": 1.6152418820410868, + "grad_norm": 13.013864517211914, + "learning_rate": 2.2006199983113054e-06, + "loss": 0.267, + "num_input_tokens_seen": 23868304, + "step": 12187 + }, + { + "epoch": 1.615374420145792, + "grad_norm": 1.087438702583313, + "learning_rate": 2.2002753619454893e-06, + "loss": 0.0054, + "num_input_tokens_seen": 23869952, + "step": 12188 + }, + { + "epoch": 1.615506958250497, + "grad_norm": 4.52363395690918, + "learning_rate": 2.199930731358571e-06, + "loss": 0.0875, + "num_input_tokens_seen": 23871216, + "step": 12189 + }, + { + "epoch": 1.615639496355202, + "grad_norm": 6.581776142120361, + "learning_rate": 2.1995861065571964e-06, + "loss": 0.1426, + "num_input_tokens_seen": 23873200, + "step": 12190 + }, + { + "epoch": 1.6157720344599071, + "grad_norm": 3.655064344406128, + "learning_rate": 2.199241487548008e-06, + "loss": 0.0807, + "num_input_tokens_seen": 23874560, + "step": 12191 + }, + { + "epoch": 1.6159045725646122, + "grad_norm": 6.6377058029174805, + "learning_rate": 2.198896874337652e-06, + "loss": 0.0805, + "num_input_tokens_seen": 23876304, + "step": 12192 + }, + { + "epoch": 1.6160371106693174, + "grad_norm": 4.464651584625244, + "learning_rate": 2.1985522669327708e-06, + "loss": 0.0484, + "num_input_tokens_seen": 23878376, + "step": 12193 + }, + { + "epoch": 1.6161696487740227, + "grad_norm": 0.11952562630176544, + "learning_rate": 2.198207665340011e-06, + "loss": 0.0015, + "num_input_tokens_seen": 23880416, + "step": 12194 + }, + { + "epoch": 1.6163021868787277, + "grad_norm": 0.012678629718720913, + "learning_rate": 2.197863069566015e-06, + "loss": 0.0001, + "num_input_tokens_seen": 23881880, + "step": 12195 + }, + { + "epoch": 1.6164347249834328, + "grad_norm": 0.019663801416754723, + "learning_rate": 2.1975184796174277e-06, + "loss": 0.0001, + "num_input_tokens_seen": 23883832, + "step": 12196 + }, + { + "epoch": 1.6165672630881378, + "grad_norm": 0.15596778690814972, + "learning_rate": 2.1971738955008923e-06, + "loss": 0.001, + "num_input_tokens_seen": 23885752, + "step": 12197 + }, + { + "epoch": 1.6166998011928428, + "grad_norm": 2.5241644382476807, + "learning_rate": 2.196829317223054e-06, + "loss": 0.0342, + "num_input_tokens_seen": 23887816, + "step": 12198 + }, + { + "epoch": 1.6168323392975479, + "grad_norm": 0.008277878165245056, + "learning_rate": 2.1964847447905543e-06, + "loss": 0.0001, + "num_input_tokens_seen": 23889464, + "step": 12199 + }, + { + "epoch": 1.6169648774022531, + "grad_norm": 9.978988647460938, + "learning_rate": 2.196140178210039e-06, + "loss": 0.435, + "num_input_tokens_seen": 23892296, + "step": 12200 + }, + { + "epoch": 1.6170974155069584, + "grad_norm": 6.714853286743164, + "learning_rate": 2.1957956174881505e-06, + "loss": 0.1172, + "num_input_tokens_seen": 23894832, + "step": 12201 + }, + { + "epoch": 1.6172299536116634, + "grad_norm": 3.233020544052124, + "learning_rate": 2.1954510626315326e-06, + "loss": 0.0581, + "num_input_tokens_seen": 23897072, + "step": 12202 + }, + { + "epoch": 1.6173624917163685, + "grad_norm": 8.565999984741211, + "learning_rate": 2.195106513646828e-06, + "loss": 0.2356, + "num_input_tokens_seen": 23899080, + "step": 12203 + }, + { + "epoch": 1.6174950298210735, + "grad_norm": 6.642402648925781, + "learning_rate": 2.19476197054068e-06, + "loss": 0.1647, + "num_input_tokens_seen": 23900624, + "step": 12204 + }, + { + "epoch": 1.6176275679257786, + "grad_norm": 13.953596115112305, + "learning_rate": 2.194417433319732e-06, + "loss": 0.1595, + "num_input_tokens_seen": 23902296, + "step": 12205 + }, + { + "epoch": 1.6177601060304838, + "grad_norm": 5.78306245803833, + "learning_rate": 2.1940729019906268e-06, + "loss": 0.1342, + "num_input_tokens_seen": 23904304, + "step": 12206 + }, + { + "epoch": 1.6178926441351889, + "grad_norm": 2.6341352462768555, + "learning_rate": 2.1937283765600072e-06, + "loss": 0.0213, + "num_input_tokens_seen": 23905712, + "step": 12207 + }, + { + "epoch": 1.618025182239894, + "grad_norm": 5.214654922485352, + "learning_rate": 2.193383857034516e-06, + "loss": 0.1133, + "num_input_tokens_seen": 23907856, + "step": 12208 + }, + { + "epoch": 1.6181577203445991, + "grad_norm": 0.029744593426585197, + "learning_rate": 2.1930393434207946e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23909912, + "step": 12209 + }, + { + "epoch": 1.6182902584493042, + "grad_norm": 1.8159228563308716, + "learning_rate": 2.192694835725487e-06, + "loss": 0.0248, + "num_input_tokens_seen": 23911816, + "step": 12210 + }, + { + "epoch": 1.6184227965540092, + "grad_norm": 6.53033447265625, + "learning_rate": 2.192350333955235e-06, + "loss": 0.1269, + "num_input_tokens_seen": 23913992, + "step": 12211 + }, + { + "epoch": 1.6185553346587143, + "grad_norm": 4.488076210021973, + "learning_rate": 2.192005838116681e-06, + "loss": 0.0463, + "num_input_tokens_seen": 23915584, + "step": 12212 + }, + { + "epoch": 1.6186878727634195, + "grad_norm": 5.838597297668457, + "learning_rate": 2.1916613482164666e-06, + "loss": 0.1021, + "num_input_tokens_seen": 23917528, + "step": 12213 + }, + { + "epoch": 1.6188204108681246, + "grad_norm": 0.02485070936381817, + "learning_rate": 2.1913168642612337e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23919232, + "step": 12214 + }, + { + "epoch": 1.6189529489728298, + "grad_norm": 17.391239166259766, + "learning_rate": 2.190972386257625e-06, + "loss": 0.3862, + "num_input_tokens_seen": 23920912, + "step": 12215 + }, + { + "epoch": 1.6190854870775349, + "grad_norm": 7.286650657653809, + "learning_rate": 2.190627914212282e-06, + "loss": 0.1807, + "num_input_tokens_seen": 23922552, + "step": 12216 + }, + { + "epoch": 1.61921802518224, + "grad_norm": 6.92227029800415, + "learning_rate": 2.1902834481318467e-06, + "loss": 0.2654, + "num_input_tokens_seen": 23924512, + "step": 12217 + }, + { + "epoch": 1.619350563286945, + "grad_norm": 0.24118860065937042, + "learning_rate": 2.18993898802296e-06, + "loss": 0.0017, + "num_input_tokens_seen": 23926144, + "step": 12218 + }, + { + "epoch": 1.61948310139165, + "grad_norm": 12.126229286193848, + "learning_rate": 2.189594533892262e-06, + "loss": 0.2976, + "num_input_tokens_seen": 23929144, + "step": 12219 + }, + { + "epoch": 1.6196156394963552, + "grad_norm": 0.032377004623413086, + "learning_rate": 2.1892500857463974e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23930672, + "step": 12220 + }, + { + "epoch": 1.6197481776010603, + "grad_norm": 8.433162689208984, + "learning_rate": 2.1889056435920046e-06, + "loss": 0.1365, + "num_input_tokens_seen": 23932264, + "step": 12221 + }, + { + "epoch": 1.6198807157057655, + "grad_norm": 0.03534359484910965, + "learning_rate": 2.188561207435726e-06, + "loss": 0.0002, + "num_input_tokens_seen": 23934336, + "step": 12222 + }, + { + "epoch": 1.6200132538104706, + "grad_norm": 3.8081541061401367, + "learning_rate": 2.1882167772842027e-06, + "loss": 0.0628, + "num_input_tokens_seen": 23936576, + "step": 12223 + }, + { + "epoch": 1.6201457919151756, + "grad_norm": 1.4687366485595703, + "learning_rate": 2.1878723531440738e-06, + "loss": 0.0074, + "num_input_tokens_seen": 23938280, + "step": 12224 + }, + { + "epoch": 1.6202783300198806, + "grad_norm": 8.705544471740723, + "learning_rate": 2.187527935021982e-06, + "loss": 0.1004, + "num_input_tokens_seen": 23940592, + "step": 12225 + }, + { + "epoch": 1.6204108681245857, + "grad_norm": 0.34899553656578064, + "learning_rate": 2.187183522924567e-06, + "loss": 0.0037, + "num_input_tokens_seen": 23941984, + "step": 12226 + }, + { + "epoch": 1.620543406229291, + "grad_norm": 3.563113212585449, + "learning_rate": 2.186839116858469e-06, + "loss": 0.0335, + "num_input_tokens_seen": 23943344, + "step": 12227 + }, + { + "epoch": 1.620675944333996, + "grad_norm": 5.219413757324219, + "learning_rate": 2.18649471683033e-06, + "loss": 0.0732, + "num_input_tokens_seen": 23945536, + "step": 12228 + }, + { + "epoch": 1.6208084824387012, + "grad_norm": 10.912976264953613, + "learning_rate": 2.1861503228467885e-06, + "loss": 0.2006, + "num_input_tokens_seen": 23947432, + "step": 12229 + }, + { + "epoch": 1.6209410205434063, + "grad_norm": 3.9802210330963135, + "learning_rate": 2.1858059349144843e-06, + "loss": 0.0851, + "num_input_tokens_seen": 23948736, + "step": 12230 + }, + { + "epoch": 1.6210735586481113, + "grad_norm": 14.880814552307129, + "learning_rate": 2.18546155304006e-06, + "loss": 0.3055, + "num_input_tokens_seen": 23952544, + "step": 12231 + }, + { + "epoch": 1.6212060967528164, + "grad_norm": 0.0526886023581028, + "learning_rate": 2.1851171772301537e-06, + "loss": 0.0003, + "num_input_tokens_seen": 23954512, + "step": 12232 + }, + { + "epoch": 1.6213386348575214, + "grad_norm": 5.47169828414917, + "learning_rate": 2.184772807491405e-06, + "loss": 0.1453, + "num_input_tokens_seen": 23956168, + "step": 12233 + }, + { + "epoch": 1.6214711729622266, + "grad_norm": 2.564249277114868, + "learning_rate": 2.184428443830454e-06, + "loss": 0.0151, + "num_input_tokens_seen": 23957664, + "step": 12234 + }, + { + "epoch": 1.621603711066932, + "grad_norm": 7.02336311340332, + "learning_rate": 2.18408408625394e-06, + "loss": 0.1008, + "num_input_tokens_seen": 23959528, + "step": 12235 + }, + { + "epoch": 1.621736249171637, + "grad_norm": 8.289192199707031, + "learning_rate": 2.183739734768503e-06, + "loss": 0.1961, + "num_input_tokens_seen": 23961576, + "step": 12236 + }, + { + "epoch": 1.621868787276342, + "grad_norm": 0.15752029418945312, + "learning_rate": 2.1833953893807826e-06, + "loss": 0.0012, + "num_input_tokens_seen": 23963936, + "step": 12237 + }, + { + "epoch": 1.622001325381047, + "grad_norm": 0.18260298669338226, + "learning_rate": 2.1830510500974177e-06, + "loss": 0.0012, + "num_input_tokens_seen": 23965464, + "step": 12238 + }, + { + "epoch": 1.622133863485752, + "grad_norm": 0.16990914940834045, + "learning_rate": 2.1827067169250465e-06, + "loss": 0.0012, + "num_input_tokens_seen": 23967896, + "step": 12239 + }, + { + "epoch": 1.622266401590457, + "grad_norm": 0.07923067361116409, + "learning_rate": 2.182362389870308e-06, + "loss": 0.0005, + "num_input_tokens_seen": 23970672, + "step": 12240 + }, + { + "epoch": 1.6223989396951624, + "grad_norm": 1.2383215427398682, + "learning_rate": 2.182018068939843e-06, + "loss": 0.0141, + "num_input_tokens_seen": 23972768, + "step": 12241 + }, + { + "epoch": 1.6225314777998676, + "grad_norm": 0.6879251599311829, + "learning_rate": 2.1816737541402887e-06, + "loss": 0.0085, + "num_input_tokens_seen": 23974632, + "step": 12242 + }, + { + "epoch": 1.6226640159045727, + "grad_norm": 4.832513809204102, + "learning_rate": 2.181329445478284e-06, + "loss": 0.0117, + "num_input_tokens_seen": 23977688, + "step": 12243 + }, + { + "epoch": 1.6227965540092777, + "grad_norm": 13.807974815368652, + "learning_rate": 2.1809851429604672e-06, + "loss": 0.3925, + "num_input_tokens_seen": 23980712, + "step": 12244 + }, + { + "epoch": 1.6229290921139827, + "grad_norm": 30.697736740112305, + "learning_rate": 2.1806408465934763e-06, + "loss": 0.2709, + "num_input_tokens_seen": 23983464, + "step": 12245 + }, + { + "epoch": 1.6230616302186878, + "grad_norm": 0.31796765327453613, + "learning_rate": 2.1802965563839515e-06, + "loss": 0.0017, + "num_input_tokens_seen": 23985040, + "step": 12246 + }, + { + "epoch": 1.623194168323393, + "grad_norm": 10.966869354248047, + "learning_rate": 2.17995227233853e-06, + "loss": 0.0759, + "num_input_tokens_seen": 23987336, + "step": 12247 + }, + { + "epoch": 1.623326706428098, + "grad_norm": 0.09356506168842316, + "learning_rate": 2.179607994463848e-06, + "loss": 0.0005, + "num_input_tokens_seen": 23988584, + "step": 12248 + }, + { + "epoch": 1.6234592445328033, + "grad_norm": 10.212505340576172, + "learning_rate": 2.1792637227665457e-06, + "loss": 0.1459, + "num_input_tokens_seen": 23990616, + "step": 12249 + }, + { + "epoch": 1.6235917826375084, + "grad_norm": 7.387820243835449, + "learning_rate": 2.1789194572532592e-06, + "loss": 0.134, + "num_input_tokens_seen": 23992304, + "step": 12250 + }, + { + "epoch": 1.6237243207422134, + "grad_norm": 8.541766166687012, + "learning_rate": 2.178575197930628e-06, + "loss": 0.1404, + "num_input_tokens_seen": 23994072, + "step": 12251 + }, + { + "epoch": 1.6238568588469184, + "grad_norm": 7.629110813140869, + "learning_rate": 2.1782309448052884e-06, + "loss": 0.0681, + "num_input_tokens_seen": 23995712, + "step": 12252 + }, + { + "epoch": 1.6239893969516235, + "grad_norm": 1.5686620473861694, + "learning_rate": 2.1778866978838783e-06, + "loss": 0.0146, + "num_input_tokens_seen": 23997192, + "step": 12253 + }, + { + "epoch": 1.6241219350563287, + "grad_norm": 0.3828531503677368, + "learning_rate": 2.1775424571730354e-06, + "loss": 0.0025, + "num_input_tokens_seen": 23998744, + "step": 12254 + }, + { + "epoch": 1.6242544731610338, + "grad_norm": 0.7175197005271912, + "learning_rate": 2.177198222679396e-06, + "loss": 0.0054, + "num_input_tokens_seen": 24000152, + "step": 12255 + }, + { + "epoch": 1.624387011265739, + "grad_norm": 20.51130485534668, + "learning_rate": 2.176853994409597e-06, + "loss": 0.3362, + "num_input_tokens_seen": 24002440, + "step": 12256 + }, + { + "epoch": 1.624519549370444, + "grad_norm": 6.283359527587891, + "learning_rate": 2.176509772370276e-06, + "loss": 0.1029, + "num_input_tokens_seen": 24003672, + "step": 12257 + }, + { + "epoch": 1.624652087475149, + "grad_norm": 1.2910292148590088, + "learning_rate": 2.17616555656807e-06, + "loss": 0.0061, + "num_input_tokens_seen": 24005912, + "step": 12258 + }, + { + "epoch": 1.6247846255798541, + "grad_norm": 0.6718969345092773, + "learning_rate": 2.1758213470096155e-06, + "loss": 0.0058, + "num_input_tokens_seen": 24007584, + "step": 12259 + }, + { + "epoch": 1.6249171636845592, + "grad_norm": 2.3538734912872314, + "learning_rate": 2.1754771437015495e-06, + "loss": 0.0287, + "num_input_tokens_seen": 24010000, + "step": 12260 + }, + { + "epoch": 1.6250497017892644, + "grad_norm": 0.49122312664985657, + "learning_rate": 2.175132946650507e-06, + "loss": 0.0034, + "num_input_tokens_seen": 24011528, + "step": 12261 + }, + { + "epoch": 1.6251822398939695, + "grad_norm": 0.05257052555680275, + "learning_rate": 2.1747887558631265e-06, + "loss": 0.0004, + "num_input_tokens_seen": 24013600, + "step": 12262 + }, + { + "epoch": 1.6253147779986747, + "grad_norm": 26.45469856262207, + "learning_rate": 2.174444571346042e-06, + "loss": 0.6719, + "num_input_tokens_seen": 24015488, + "step": 12263 + }, + { + "epoch": 1.6254473161033798, + "grad_norm": 0.13446345925331116, + "learning_rate": 2.174100393105892e-06, + "loss": 0.0006, + "num_input_tokens_seen": 24017144, + "step": 12264 + }, + { + "epoch": 1.6255798542080848, + "grad_norm": 3.2098097801208496, + "learning_rate": 2.1737562211493105e-06, + "loss": 0.0454, + "num_input_tokens_seen": 24018296, + "step": 12265 + }, + { + "epoch": 1.6257123923127899, + "grad_norm": 6.359274387359619, + "learning_rate": 2.1734120554829337e-06, + "loss": 0.1989, + "num_input_tokens_seen": 24020728, + "step": 12266 + }, + { + "epoch": 1.625844930417495, + "grad_norm": 2.8993024826049805, + "learning_rate": 2.1730678961133987e-06, + "loss": 0.0459, + "num_input_tokens_seen": 24023336, + "step": 12267 + }, + { + "epoch": 1.6259774685222002, + "grad_norm": 5.659909248352051, + "learning_rate": 2.1727237430473396e-06, + "loss": 0.1446, + "num_input_tokens_seen": 24025200, + "step": 12268 + }, + { + "epoch": 1.6261100066269052, + "grad_norm": 0.09075940400362015, + "learning_rate": 2.172379596291393e-06, + "loss": 0.0006, + "num_input_tokens_seen": 24027120, + "step": 12269 + }, + { + "epoch": 1.6262425447316105, + "grad_norm": 0.17071126401424408, + "learning_rate": 2.172035455852194e-06, + "loss": 0.0011, + "num_input_tokens_seen": 24030344, + "step": 12270 + }, + { + "epoch": 1.6263750828363155, + "grad_norm": 5.186660289764404, + "learning_rate": 2.1716913217363766e-06, + "loss": 0.1196, + "num_input_tokens_seen": 24032856, + "step": 12271 + }, + { + "epoch": 1.6265076209410205, + "grad_norm": 5.331362247467041, + "learning_rate": 2.1713471939505775e-06, + "loss": 0.1713, + "num_input_tokens_seen": 24036096, + "step": 12272 + }, + { + "epoch": 1.6266401590457256, + "grad_norm": 7.923537731170654, + "learning_rate": 2.171003072501432e-06, + "loss": 0.1377, + "num_input_tokens_seen": 24038560, + "step": 12273 + }, + { + "epoch": 1.6267726971504306, + "grad_norm": 12.071828842163086, + "learning_rate": 2.170658957395573e-06, + "loss": 0.2007, + "num_input_tokens_seen": 24040656, + "step": 12274 + }, + { + "epoch": 1.6269052352551359, + "grad_norm": 0.035343680530786514, + "learning_rate": 2.1703148486396376e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24042904, + "step": 12275 + }, + { + "epoch": 1.627037773359841, + "grad_norm": 9.473772048950195, + "learning_rate": 2.169970746240258e-06, + "loss": 0.3479, + "num_input_tokens_seen": 24045128, + "step": 12276 + }, + { + "epoch": 1.6271703114645462, + "grad_norm": 4.2103118896484375, + "learning_rate": 2.1696266502040713e-06, + "loss": 0.0547, + "num_input_tokens_seen": 24048080, + "step": 12277 + }, + { + "epoch": 1.6273028495692512, + "grad_norm": 19.06356430053711, + "learning_rate": 2.16928256053771e-06, + "loss": 0.3067, + "num_input_tokens_seen": 24049984, + "step": 12278 + }, + { + "epoch": 1.6274353876739562, + "grad_norm": 0.35626089572906494, + "learning_rate": 2.16893847724781e-06, + "loss": 0.0043, + "num_input_tokens_seen": 24051648, + "step": 12279 + }, + { + "epoch": 1.6275679257786613, + "grad_norm": 7.960979461669922, + "learning_rate": 2.1685944003410035e-06, + "loss": 0.044, + "num_input_tokens_seen": 24054592, + "step": 12280 + }, + { + "epoch": 1.6277004638833663, + "grad_norm": 0.07624118775129318, + "learning_rate": 2.1682503298239257e-06, + "loss": 0.0005, + "num_input_tokens_seen": 24056368, + "step": 12281 + }, + { + "epoch": 1.6278330019880716, + "grad_norm": 5.082423686981201, + "learning_rate": 2.167906265703211e-06, + "loss": 0.115, + "num_input_tokens_seen": 24059120, + "step": 12282 + }, + { + "epoch": 1.6279655400927768, + "grad_norm": 28.495990753173828, + "learning_rate": 2.1675622079854924e-06, + "loss": 0.5803, + "num_input_tokens_seen": 24062024, + "step": 12283 + }, + { + "epoch": 1.6280980781974819, + "grad_norm": 0.144692525267601, + "learning_rate": 2.167218156677404e-06, + "loss": 0.001, + "num_input_tokens_seen": 24063656, + "step": 12284 + }, + { + "epoch": 1.628230616302187, + "grad_norm": 1.1071245670318604, + "learning_rate": 2.166874111785579e-06, + "loss": 0.012, + "num_input_tokens_seen": 24065440, + "step": 12285 + }, + { + "epoch": 1.628363154406892, + "grad_norm": 0.02892206981778145, + "learning_rate": 2.1665300733166506e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24066680, + "step": 12286 + }, + { + "epoch": 1.628495692511597, + "grad_norm": 4.876189708709717, + "learning_rate": 2.166186041277252e-06, + "loss": 0.0769, + "num_input_tokens_seen": 24068312, + "step": 12287 + }, + { + "epoch": 1.6286282306163022, + "grad_norm": 7.852471828460693, + "learning_rate": 2.1658420156740177e-06, + "loss": 0.0848, + "num_input_tokens_seen": 24069776, + "step": 12288 + }, + { + "epoch": 1.6287607687210073, + "grad_norm": 0.21920527517795563, + "learning_rate": 2.1654979965135796e-06, + "loss": 0.0016, + "num_input_tokens_seen": 24071464, + "step": 12289 + }, + { + "epoch": 1.6288933068257125, + "grad_norm": 3.705523729324341, + "learning_rate": 2.165153983802571e-06, + "loss": 0.0533, + "num_input_tokens_seen": 24073136, + "step": 12290 + }, + { + "epoch": 1.6290258449304176, + "grad_norm": 5.551515102386475, + "learning_rate": 2.164809977547625e-06, + "loss": 0.2117, + "num_input_tokens_seen": 24074856, + "step": 12291 + }, + { + "epoch": 1.6291583830351226, + "grad_norm": 0.3352087140083313, + "learning_rate": 2.1644659777553725e-06, + "loss": 0.0019, + "num_input_tokens_seen": 24076616, + "step": 12292 + }, + { + "epoch": 1.6292909211398277, + "grad_norm": 5.349156856536865, + "learning_rate": 2.164121984432448e-06, + "loss": 0.1621, + "num_input_tokens_seen": 24078456, + "step": 12293 + }, + { + "epoch": 1.6294234592445327, + "grad_norm": 4.887638092041016, + "learning_rate": 2.163777997585484e-06, + "loss": 0.0452, + "num_input_tokens_seen": 24079584, + "step": 12294 + }, + { + "epoch": 1.629555997349238, + "grad_norm": 0.23635423183441162, + "learning_rate": 2.1634340172211115e-06, + "loss": 0.0017, + "num_input_tokens_seen": 24082256, + "step": 12295 + }, + { + "epoch": 1.629688535453943, + "grad_norm": 0.4234890043735504, + "learning_rate": 2.163090043345964e-06, + "loss": 0.0031, + "num_input_tokens_seen": 24084120, + "step": 12296 + }, + { + "epoch": 1.6298210735586482, + "grad_norm": 6.71112585067749, + "learning_rate": 2.1627460759666717e-06, + "loss": 0.1783, + "num_input_tokens_seen": 24086096, + "step": 12297 + }, + { + "epoch": 1.6299536116633533, + "grad_norm": 6.354191780090332, + "learning_rate": 2.1624021150898685e-06, + "loss": 0.1464, + "num_input_tokens_seen": 24088352, + "step": 12298 + }, + { + "epoch": 1.6300861497680583, + "grad_norm": 1.6057822704315186, + "learning_rate": 2.1620581607221857e-06, + "loss": 0.0617, + "num_input_tokens_seen": 24090280, + "step": 12299 + }, + { + "epoch": 1.6302186878727634, + "grad_norm": 3.694124460220337, + "learning_rate": 2.1617142128702547e-06, + "loss": 0.0817, + "num_input_tokens_seen": 24091992, + "step": 12300 + }, + { + "epoch": 1.6303512259774684, + "grad_norm": 5.845182418823242, + "learning_rate": 2.1613702715407065e-06, + "loss": 0.1298, + "num_input_tokens_seen": 24093856, + "step": 12301 + }, + { + "epoch": 1.6304837640821737, + "grad_norm": 2.443789482116699, + "learning_rate": 2.1610263367401725e-06, + "loss": 0.0239, + "num_input_tokens_seen": 24095496, + "step": 12302 + }, + { + "epoch": 1.6306163021868787, + "grad_norm": 1.5129685401916504, + "learning_rate": 2.160682408475286e-06, + "loss": 0.0175, + "num_input_tokens_seen": 24098032, + "step": 12303 + }, + { + "epoch": 1.630748840291584, + "grad_norm": 5.6132402420043945, + "learning_rate": 2.160338486752676e-06, + "loss": 0.0853, + "num_input_tokens_seen": 24100136, + "step": 12304 + }, + { + "epoch": 1.630881378396289, + "grad_norm": 7.386194229125977, + "learning_rate": 2.1599945715789745e-06, + "loss": 0.1064, + "num_input_tokens_seen": 24101976, + "step": 12305 + }, + { + "epoch": 1.631013916500994, + "grad_norm": 8.79554271697998, + "learning_rate": 2.159650662960812e-06, + "loss": 0.1146, + "num_input_tokens_seen": 24103120, + "step": 12306 + }, + { + "epoch": 1.631146454605699, + "grad_norm": 19.793752670288086, + "learning_rate": 2.159306760904819e-06, + "loss": 0.3015, + "num_input_tokens_seen": 24106056, + "step": 12307 + }, + { + "epoch": 1.631278992710404, + "grad_norm": 6.045555114746094, + "learning_rate": 2.1589628654176278e-06, + "loss": 0.2428, + "num_input_tokens_seen": 24109176, + "step": 12308 + }, + { + "epoch": 1.6314115308151094, + "grad_norm": 0.826233446598053, + "learning_rate": 2.1586189765058677e-06, + "loss": 0.0059, + "num_input_tokens_seen": 24111840, + "step": 12309 + }, + { + "epoch": 1.6315440689198144, + "grad_norm": 0.13360217213630676, + "learning_rate": 2.1582750941761687e-06, + "loss": 0.0009, + "num_input_tokens_seen": 24113088, + "step": 12310 + }, + { + "epoch": 1.6316766070245197, + "grad_norm": 11.775442123413086, + "learning_rate": 2.1579312184351624e-06, + "loss": 0.2375, + "num_input_tokens_seen": 24114848, + "step": 12311 + }, + { + "epoch": 1.6318091451292247, + "grad_norm": 1.3531239032745361, + "learning_rate": 2.1575873492894774e-06, + "loss": 0.0175, + "num_input_tokens_seen": 24116712, + "step": 12312 + }, + { + "epoch": 1.6319416832339297, + "grad_norm": 5.879791259765625, + "learning_rate": 2.157243486745744e-06, + "loss": 0.0745, + "num_input_tokens_seen": 24118568, + "step": 12313 + }, + { + "epoch": 1.6320742213386348, + "grad_norm": 7.022495746612549, + "learning_rate": 2.1568996308105938e-06, + "loss": 0.1986, + "num_input_tokens_seen": 24120176, + "step": 12314 + }, + { + "epoch": 1.6322067594433398, + "grad_norm": 8.20421314239502, + "learning_rate": 2.156555781490655e-06, + "loss": 0.0805, + "num_input_tokens_seen": 24122256, + "step": 12315 + }, + { + "epoch": 1.632339297548045, + "grad_norm": 9.54566478729248, + "learning_rate": 2.156211938792558e-06, + "loss": 0.1411, + "num_input_tokens_seen": 24124856, + "step": 12316 + }, + { + "epoch": 1.6324718356527501, + "grad_norm": 0.2227516621351242, + "learning_rate": 2.1558681027229317e-06, + "loss": 0.0013, + "num_input_tokens_seen": 24126744, + "step": 12317 + }, + { + "epoch": 1.6326043737574554, + "grad_norm": 7.287049293518066, + "learning_rate": 2.155524273288405e-06, + "loss": 0.1283, + "num_input_tokens_seen": 24129032, + "step": 12318 + }, + { + "epoch": 1.6327369118621604, + "grad_norm": 5.9384846687316895, + "learning_rate": 2.1551804504956085e-06, + "loss": 0.1684, + "num_input_tokens_seen": 24130664, + "step": 12319 + }, + { + "epoch": 1.6328694499668654, + "grad_norm": 7.376380920410156, + "learning_rate": 2.1548366343511716e-06, + "loss": 0.1757, + "num_input_tokens_seen": 24133184, + "step": 12320 + }, + { + "epoch": 1.6330019880715705, + "grad_norm": 0.1165706068277359, + "learning_rate": 2.1544928248617214e-06, + "loss": 0.0008, + "num_input_tokens_seen": 24134936, + "step": 12321 + }, + { + "epoch": 1.6331345261762755, + "grad_norm": 13.439699172973633, + "learning_rate": 2.154149022033889e-06, + "loss": 0.1369, + "num_input_tokens_seen": 24136848, + "step": 12322 + }, + { + "epoch": 1.6332670642809808, + "grad_norm": 8.839691162109375, + "learning_rate": 2.1538052258743005e-06, + "loss": 0.0795, + "num_input_tokens_seen": 24138272, + "step": 12323 + }, + { + "epoch": 1.633399602385686, + "grad_norm": 1.825400710105896, + "learning_rate": 2.1534614363895874e-06, + "loss": 0.0183, + "num_input_tokens_seen": 24140224, + "step": 12324 + }, + { + "epoch": 1.633532140490391, + "grad_norm": 6.865872383117676, + "learning_rate": 2.153117653586376e-06, + "loss": 0.1034, + "num_input_tokens_seen": 24142640, + "step": 12325 + }, + { + "epoch": 1.6336646785950961, + "grad_norm": 8.83198070526123, + "learning_rate": 2.1527738774712967e-06, + "loss": 0.1994, + "num_input_tokens_seen": 24144344, + "step": 12326 + }, + { + "epoch": 1.6337972166998012, + "grad_norm": 6.99372673034668, + "learning_rate": 2.1524301080509756e-06, + "loss": 0.0293, + "num_input_tokens_seen": 24145384, + "step": 12327 + }, + { + "epoch": 1.6339297548045062, + "grad_norm": 0.6056116223335266, + "learning_rate": 2.152086345332041e-06, + "loss": 0.0033, + "num_input_tokens_seen": 24146600, + "step": 12328 + }, + { + "epoch": 1.6340622929092112, + "grad_norm": 11.416797637939453, + "learning_rate": 2.1517425893211236e-06, + "loss": 0.22, + "num_input_tokens_seen": 24148160, + "step": 12329 + }, + { + "epoch": 1.6341948310139165, + "grad_norm": 11.116165161132812, + "learning_rate": 2.151398840024849e-06, + "loss": 0.1517, + "num_input_tokens_seen": 24149648, + "step": 12330 + }, + { + "epoch": 1.6343273691186218, + "grad_norm": 3.3952934741973877, + "learning_rate": 2.151055097449844e-06, + "loss": 0.0441, + "num_input_tokens_seen": 24151504, + "step": 12331 + }, + { + "epoch": 1.6344599072233268, + "grad_norm": 7.657194137573242, + "learning_rate": 2.150711361602739e-06, + "loss": 0.198, + "num_input_tokens_seen": 24154040, + "step": 12332 + }, + { + "epoch": 1.6345924453280318, + "grad_norm": 1.5254442691802979, + "learning_rate": 2.150367632490158e-06, + "loss": 0.026, + "num_input_tokens_seen": 24155176, + "step": 12333 + }, + { + "epoch": 1.6347249834327369, + "grad_norm": 0.15173156559467316, + "learning_rate": 2.1500239101187314e-06, + "loss": 0.0011, + "num_input_tokens_seen": 24157424, + "step": 12334 + }, + { + "epoch": 1.634857521537442, + "grad_norm": 2.3594679832458496, + "learning_rate": 2.1496801944950855e-06, + "loss": 0.0433, + "num_input_tokens_seen": 24158968, + "step": 12335 + }, + { + "epoch": 1.6349900596421472, + "grad_norm": 2.927417039871216, + "learning_rate": 2.149336485625847e-06, + "loss": 0.0718, + "num_input_tokens_seen": 24160424, + "step": 12336 + }, + { + "epoch": 1.6351225977468522, + "grad_norm": 3.9697771072387695, + "learning_rate": 2.148992783517643e-06, + "loss": 0.0557, + "num_input_tokens_seen": 24162232, + "step": 12337 + }, + { + "epoch": 1.6352551358515575, + "grad_norm": 0.4772087037563324, + "learning_rate": 2.148649088177099e-06, + "loss": 0.003, + "num_input_tokens_seen": 24163720, + "step": 12338 + }, + { + "epoch": 1.6353876739562625, + "grad_norm": 2.2256126403808594, + "learning_rate": 2.148305399610844e-06, + "loss": 0.0521, + "num_input_tokens_seen": 24165072, + "step": 12339 + }, + { + "epoch": 1.6355202120609675, + "grad_norm": 6.883285045623779, + "learning_rate": 2.147961717825504e-06, + "loss": 0.1028, + "num_input_tokens_seen": 24167072, + "step": 12340 + }, + { + "epoch": 1.6356527501656726, + "grad_norm": 7.347526550292969, + "learning_rate": 2.1476180428277045e-06, + "loss": 0.0364, + "num_input_tokens_seen": 24168664, + "step": 12341 + }, + { + "epoch": 1.6357852882703776, + "grad_norm": 19.199338912963867, + "learning_rate": 2.147274374624072e-06, + "loss": 0.3157, + "num_input_tokens_seen": 24170224, + "step": 12342 + }, + { + "epoch": 1.6359178263750829, + "grad_norm": 0.31988224387168884, + "learning_rate": 2.1469307132212336e-06, + "loss": 0.0012, + "num_input_tokens_seen": 24171352, + "step": 12343 + }, + { + "epoch": 1.636050364479788, + "grad_norm": 12.273748397827148, + "learning_rate": 2.1465870586258135e-06, + "loss": 0.1871, + "num_input_tokens_seen": 24173720, + "step": 12344 + }, + { + "epoch": 1.6361829025844932, + "grad_norm": 0.056358467787504196, + "learning_rate": 2.1462434108444398e-06, + "loss": 0.0004, + "num_input_tokens_seen": 24175416, + "step": 12345 + }, + { + "epoch": 1.6363154406891982, + "grad_norm": 9.270435333251953, + "learning_rate": 2.1458997698837366e-06, + "loss": 0.1235, + "num_input_tokens_seen": 24178592, + "step": 12346 + }, + { + "epoch": 1.6364479787939032, + "grad_norm": 0.17531076073646545, + "learning_rate": 2.1455561357503302e-06, + "loss": 0.0011, + "num_input_tokens_seen": 24180328, + "step": 12347 + }, + { + "epoch": 1.6365805168986083, + "grad_norm": 4.468027114868164, + "learning_rate": 2.145212508450846e-06, + "loss": 0.041, + "num_input_tokens_seen": 24182408, + "step": 12348 + }, + { + "epoch": 1.6367130550033133, + "grad_norm": 3.5951788425445557, + "learning_rate": 2.144868887991909e-06, + "loss": 0.0793, + "num_input_tokens_seen": 24185992, + "step": 12349 + }, + { + "epoch": 1.6368455931080186, + "grad_norm": 3.487384557723999, + "learning_rate": 2.1445252743801454e-06, + "loss": 0.0386, + "num_input_tokens_seen": 24188120, + "step": 12350 + }, + { + "epoch": 1.6369781312127236, + "grad_norm": 4.123757839202881, + "learning_rate": 2.1441816676221794e-06, + "loss": 0.096, + "num_input_tokens_seen": 24190144, + "step": 12351 + }, + { + "epoch": 1.6371106693174289, + "grad_norm": 11.132966995239258, + "learning_rate": 2.143838067724637e-06, + "loss": 0.1793, + "num_input_tokens_seen": 24192624, + "step": 12352 + }, + { + "epoch": 1.637243207422134, + "grad_norm": 14.434907913208008, + "learning_rate": 2.143494474694142e-06, + "loss": 0.1034, + "num_input_tokens_seen": 24194280, + "step": 12353 + }, + { + "epoch": 1.637375745526839, + "grad_norm": 0.43844863772392273, + "learning_rate": 2.1431508885373185e-06, + "loss": 0.0047, + "num_input_tokens_seen": 24195656, + "step": 12354 + }, + { + "epoch": 1.637508283631544, + "grad_norm": 12.554174423217773, + "learning_rate": 2.1428073092607928e-06, + "loss": 0.0724, + "num_input_tokens_seen": 24197384, + "step": 12355 + }, + { + "epoch": 1.637640821736249, + "grad_norm": 0.11264834553003311, + "learning_rate": 2.142463736871189e-06, + "loss": 0.0007, + "num_input_tokens_seen": 24199296, + "step": 12356 + }, + { + "epoch": 1.6377733598409543, + "grad_norm": 9.005589485168457, + "learning_rate": 2.1421201713751308e-06, + "loss": 0.13, + "num_input_tokens_seen": 24201184, + "step": 12357 + }, + { + "epoch": 1.6379058979456593, + "grad_norm": 0.09476404637098312, + "learning_rate": 2.1417766127792426e-06, + "loss": 0.0007, + "num_input_tokens_seen": 24203032, + "step": 12358 + }, + { + "epoch": 1.6380384360503646, + "grad_norm": 12.026439666748047, + "learning_rate": 2.141433061090148e-06, + "loss": 0.0946, + "num_input_tokens_seen": 24204848, + "step": 12359 + }, + { + "epoch": 1.6381709741550696, + "grad_norm": 0.17411094903945923, + "learning_rate": 2.1410895163144714e-06, + "loss": 0.001, + "num_input_tokens_seen": 24207568, + "step": 12360 + }, + { + "epoch": 1.6383035122597747, + "grad_norm": 0.8784928917884827, + "learning_rate": 2.1407459784588367e-06, + "loss": 0.0096, + "num_input_tokens_seen": 24209088, + "step": 12361 + }, + { + "epoch": 1.6384360503644797, + "grad_norm": 13.599257469177246, + "learning_rate": 2.140402447529868e-06, + "loss": 0.117, + "num_input_tokens_seen": 24210904, + "step": 12362 + }, + { + "epoch": 1.6385685884691847, + "grad_norm": 5.713837623596191, + "learning_rate": 2.1400589235341875e-06, + "loss": 0.1008, + "num_input_tokens_seen": 24213800, + "step": 12363 + }, + { + "epoch": 1.63870112657389, + "grad_norm": 8.293967247009277, + "learning_rate": 2.139715406478419e-06, + "loss": 0.1466, + "num_input_tokens_seen": 24216280, + "step": 12364 + }, + { + "epoch": 1.6388336646785953, + "grad_norm": 9.258401870727539, + "learning_rate": 2.139371896369187e-06, + "loss": 0.1177, + "num_input_tokens_seen": 24218056, + "step": 12365 + }, + { + "epoch": 1.6389662027833003, + "grad_norm": 2.244084358215332, + "learning_rate": 2.1390283932131124e-06, + "loss": 0.022, + "num_input_tokens_seen": 24220208, + "step": 12366 + }, + { + "epoch": 1.6390987408880053, + "grad_norm": 7.87792444229126, + "learning_rate": 2.1386848970168205e-06, + "loss": 0.2475, + "num_input_tokens_seen": 24222096, + "step": 12367 + }, + { + "epoch": 1.6392312789927104, + "grad_norm": 12.761465072631836, + "learning_rate": 2.138341407786933e-06, + "loss": 0.2378, + "num_input_tokens_seen": 24224024, + "step": 12368 + }, + { + "epoch": 1.6393638170974154, + "grad_norm": 0.7186816334724426, + "learning_rate": 2.137997925530072e-06, + "loss": 0.0051, + "num_input_tokens_seen": 24226008, + "step": 12369 + }, + { + "epoch": 1.6394963552021204, + "grad_norm": 10.363210678100586, + "learning_rate": 2.13765445025286e-06, + "loss": 0.038, + "num_input_tokens_seen": 24227600, + "step": 12370 + }, + { + "epoch": 1.6396288933068257, + "grad_norm": 0.03165941312909126, + "learning_rate": 2.1373109819619217e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24228672, + "step": 12371 + }, + { + "epoch": 1.639761431411531, + "grad_norm": 6.504363536834717, + "learning_rate": 2.1369675206638773e-06, + "loss": 0.1623, + "num_input_tokens_seen": 24230032, + "step": 12372 + }, + { + "epoch": 1.639893969516236, + "grad_norm": 4.499556064605713, + "learning_rate": 2.1366240663653497e-06, + "loss": 0.0421, + "num_input_tokens_seen": 24232336, + "step": 12373 + }, + { + "epoch": 1.640026507620941, + "grad_norm": 6.130017280578613, + "learning_rate": 2.1362806190729603e-06, + "loss": 0.1673, + "num_input_tokens_seen": 24234024, + "step": 12374 + }, + { + "epoch": 1.640159045725646, + "grad_norm": 3.9800562858581543, + "learning_rate": 2.135937178793331e-06, + "loss": 0.0951, + "num_input_tokens_seen": 24235544, + "step": 12375 + }, + { + "epoch": 1.6402915838303511, + "grad_norm": 7.193431854248047, + "learning_rate": 2.1355937455330857e-06, + "loss": 0.1139, + "num_input_tokens_seen": 24237016, + "step": 12376 + }, + { + "epoch": 1.6404241219350564, + "grad_norm": 3.3577399253845215, + "learning_rate": 2.135250319298844e-06, + "loss": 0.0187, + "num_input_tokens_seen": 24239136, + "step": 12377 + }, + { + "epoch": 1.6405566600397614, + "grad_norm": 6.343914031982422, + "learning_rate": 2.134906900097227e-06, + "loss": 0.0498, + "num_input_tokens_seen": 24240192, + "step": 12378 + }, + { + "epoch": 1.6406891981444667, + "grad_norm": 9.846000671386719, + "learning_rate": 2.134563487934857e-06, + "loss": 0.1887, + "num_input_tokens_seen": 24243296, + "step": 12379 + }, + { + "epoch": 1.6408217362491717, + "grad_norm": 5.912119388580322, + "learning_rate": 2.134220082818355e-06, + "loss": 0.1, + "num_input_tokens_seen": 24245424, + "step": 12380 + }, + { + "epoch": 1.6409542743538768, + "grad_norm": 4.190807342529297, + "learning_rate": 2.133876684754342e-06, + "loss": 0.0566, + "num_input_tokens_seen": 24247392, + "step": 12381 + }, + { + "epoch": 1.6410868124585818, + "grad_norm": 9.9558687210083, + "learning_rate": 2.13353329374944e-06, + "loss": 0.1784, + "num_input_tokens_seen": 24249280, + "step": 12382 + }, + { + "epoch": 1.6412193505632868, + "grad_norm": 0.3515898287296295, + "learning_rate": 2.1331899098102686e-06, + "loss": 0.002, + "num_input_tokens_seen": 24251104, + "step": 12383 + }, + { + "epoch": 1.641351888667992, + "grad_norm": 0.014256564900279045, + "learning_rate": 2.1328465329434485e-06, + "loss": 0.0001, + "num_input_tokens_seen": 24252408, + "step": 12384 + }, + { + "epoch": 1.6414844267726971, + "grad_norm": 0.18012477457523346, + "learning_rate": 2.1325031631555996e-06, + "loss": 0.0013, + "num_input_tokens_seen": 24254224, + "step": 12385 + }, + { + "epoch": 1.6416169648774024, + "grad_norm": 5.629945278167725, + "learning_rate": 2.1321598004533443e-06, + "loss": 0.0802, + "num_input_tokens_seen": 24255800, + "step": 12386 + }, + { + "epoch": 1.6417495029821074, + "grad_norm": 9.999616622924805, + "learning_rate": 2.1318164448433018e-06, + "loss": 0.3465, + "num_input_tokens_seen": 24257672, + "step": 12387 + }, + { + "epoch": 1.6418820410868125, + "grad_norm": 0.18788279592990875, + "learning_rate": 2.1314730963320925e-06, + "loss": 0.001, + "num_input_tokens_seen": 24259552, + "step": 12388 + }, + { + "epoch": 1.6420145791915175, + "grad_norm": 6.357253551483154, + "learning_rate": 2.1311297549263355e-06, + "loss": 0.0528, + "num_input_tokens_seen": 24261224, + "step": 12389 + }, + { + "epoch": 1.6421471172962225, + "grad_norm": 3.5071821212768555, + "learning_rate": 2.130786420632651e-06, + "loss": 0.0188, + "num_input_tokens_seen": 24263192, + "step": 12390 + }, + { + "epoch": 1.6422796554009278, + "grad_norm": 7.720834255218506, + "learning_rate": 2.13044309345766e-06, + "loss": 0.1576, + "num_input_tokens_seen": 24264952, + "step": 12391 + }, + { + "epoch": 1.6424121935056328, + "grad_norm": 6.238147258758545, + "learning_rate": 2.130099773407981e-06, + "loss": 0.1239, + "num_input_tokens_seen": 24267832, + "step": 12392 + }, + { + "epoch": 1.642544731610338, + "grad_norm": 0.023673150688409805, + "learning_rate": 2.1297564604902334e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24269312, + "step": 12393 + }, + { + "epoch": 1.6426772697150431, + "grad_norm": 0.4990113079547882, + "learning_rate": 2.1294131547110363e-06, + "loss": 0.0038, + "num_input_tokens_seen": 24271072, + "step": 12394 + }, + { + "epoch": 1.6428098078197482, + "grad_norm": 0.03196888044476509, + "learning_rate": 2.129069856077009e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24272472, + "step": 12395 + }, + { + "epoch": 1.6429423459244532, + "grad_norm": 8.538680076599121, + "learning_rate": 2.1287265645947707e-06, + "loss": 0.1396, + "num_input_tokens_seen": 24274544, + "step": 12396 + }, + { + "epoch": 1.6430748840291582, + "grad_norm": 7.695530414581299, + "learning_rate": 2.1283832802709414e-06, + "loss": 0.1951, + "num_input_tokens_seen": 24276512, + "step": 12397 + }, + { + "epoch": 1.6432074221338635, + "grad_norm": 1.4054946899414062, + "learning_rate": 2.128040003112139e-06, + "loss": 0.0108, + "num_input_tokens_seen": 24278720, + "step": 12398 + }, + { + "epoch": 1.6433399602385685, + "grad_norm": 14.260143280029297, + "learning_rate": 2.127696733124981e-06, + "loss": 0.3091, + "num_input_tokens_seen": 24280632, + "step": 12399 + }, + { + "epoch": 1.6434724983432738, + "grad_norm": 7.628549575805664, + "learning_rate": 2.1273534703160874e-06, + "loss": 0.1414, + "num_input_tokens_seen": 24281744, + "step": 12400 + }, + { + "epoch": 1.6436050364479788, + "grad_norm": 8.776350975036621, + "learning_rate": 2.1270102146920747e-06, + "loss": 0.1993, + "num_input_tokens_seen": 24284008, + "step": 12401 + }, + { + "epoch": 1.6437375745526839, + "grad_norm": 1.8708082437515259, + "learning_rate": 2.1266669662595632e-06, + "loss": 0.0248, + "num_input_tokens_seen": 24286040, + "step": 12402 + }, + { + "epoch": 1.643870112657389, + "grad_norm": 0.015483245253562927, + "learning_rate": 2.1263237250251707e-06, + "loss": 0.0001, + "num_input_tokens_seen": 24287288, + "step": 12403 + }, + { + "epoch": 1.644002650762094, + "grad_norm": 5.133495330810547, + "learning_rate": 2.125980490995514e-06, + "loss": 0.0806, + "num_input_tokens_seen": 24288968, + "step": 12404 + }, + { + "epoch": 1.6441351888667992, + "grad_norm": 11.187590599060059, + "learning_rate": 2.1256372641772117e-06, + "loss": 0.277, + "num_input_tokens_seen": 24290904, + "step": 12405 + }, + { + "epoch": 1.6442677269715045, + "grad_norm": 2.577056407928467, + "learning_rate": 2.1252940445768805e-06, + "loss": 0.0568, + "num_input_tokens_seen": 24292912, + "step": 12406 + }, + { + "epoch": 1.6444002650762095, + "grad_norm": 8.951775550842285, + "learning_rate": 2.1249508322011393e-06, + "loss": 0.1215, + "num_input_tokens_seen": 24295152, + "step": 12407 + }, + { + "epoch": 1.6445328031809145, + "grad_norm": 0.013995311222970486, + "learning_rate": 2.1246076270566044e-06, + "loss": 0.0001, + "num_input_tokens_seen": 24296504, + "step": 12408 + }, + { + "epoch": 1.6446653412856196, + "grad_norm": 3.754473924636841, + "learning_rate": 2.124264429149894e-06, + "loss": 0.0154, + "num_input_tokens_seen": 24298432, + "step": 12409 + }, + { + "epoch": 1.6447978793903246, + "grad_norm": 6.1042304039001465, + "learning_rate": 2.1239212384876244e-06, + "loss": 0.1446, + "num_input_tokens_seen": 24300640, + "step": 12410 + }, + { + "epoch": 1.6449304174950297, + "grad_norm": 0.02652362734079361, + "learning_rate": 2.1235780550764117e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24301912, + "step": 12411 + }, + { + "epoch": 1.645062955599735, + "grad_norm": 7.00295877456665, + "learning_rate": 2.123234878922875e-06, + "loss": 0.0957, + "num_input_tokens_seen": 24303784, + "step": 12412 + }, + { + "epoch": 1.6451954937044402, + "grad_norm": 14.329706192016602, + "learning_rate": 2.1228917100336296e-06, + "loss": 0.2623, + "num_input_tokens_seen": 24306552, + "step": 12413 + }, + { + "epoch": 1.6453280318091452, + "grad_norm": 0.5431999564170837, + "learning_rate": 2.1225485484152926e-06, + "loss": 0.0037, + "num_input_tokens_seen": 24309288, + "step": 12414 + }, + { + "epoch": 1.6454605699138503, + "grad_norm": 9.130784034729004, + "learning_rate": 2.1222053940744797e-06, + "loss": 0.0707, + "num_input_tokens_seen": 24311272, + "step": 12415 + }, + { + "epoch": 1.6455931080185553, + "grad_norm": 2.556790351867676, + "learning_rate": 2.1218622470178062e-06, + "loss": 0.0313, + "num_input_tokens_seen": 24313504, + "step": 12416 + }, + { + "epoch": 1.6457256461232603, + "grad_norm": 3.167233467102051, + "learning_rate": 2.1215191072518905e-06, + "loss": 0.0328, + "num_input_tokens_seen": 24315040, + "step": 12417 + }, + { + "epoch": 1.6458581842279656, + "grad_norm": 0.04148625209927559, + "learning_rate": 2.1211759747833475e-06, + "loss": 0.0003, + "num_input_tokens_seen": 24316048, + "step": 12418 + }, + { + "epoch": 1.6459907223326706, + "grad_norm": 0.01683766581118107, + "learning_rate": 2.120832849618793e-06, + "loss": 0.0001, + "num_input_tokens_seen": 24318232, + "step": 12419 + }, + { + "epoch": 1.646123260437376, + "grad_norm": 0.610998809337616, + "learning_rate": 2.120489731764843e-06, + "loss": 0.006, + "num_input_tokens_seen": 24320088, + "step": 12420 + }, + { + "epoch": 1.646255798542081, + "grad_norm": 3.4160635471343994, + "learning_rate": 2.1201466212281118e-06, + "loss": 0.0355, + "num_input_tokens_seen": 24322248, + "step": 12421 + }, + { + "epoch": 1.646388336646786, + "grad_norm": 8.202816009521484, + "learning_rate": 2.1198035180152172e-06, + "loss": 0.0805, + "num_input_tokens_seen": 24325024, + "step": 12422 + }, + { + "epoch": 1.646520874751491, + "grad_norm": 0.5978389382362366, + "learning_rate": 2.119460422132772e-06, + "loss": 0.0022, + "num_input_tokens_seen": 24327160, + "step": 12423 + }, + { + "epoch": 1.646653412856196, + "grad_norm": 0.4971310794353485, + "learning_rate": 2.119117333587393e-06, + "loss": 0.004, + "num_input_tokens_seen": 24328616, + "step": 12424 + }, + { + "epoch": 1.6467859509609013, + "grad_norm": 3.668064594268799, + "learning_rate": 2.118774252385694e-06, + "loss": 0.0109, + "num_input_tokens_seen": 24329984, + "step": 12425 + }, + { + "epoch": 1.6469184890656063, + "grad_norm": 2.643575429916382, + "learning_rate": 2.1184311785342916e-06, + "loss": 0.0419, + "num_input_tokens_seen": 24331664, + "step": 12426 + }, + { + "epoch": 1.6470510271703116, + "grad_norm": 3.39406418800354, + "learning_rate": 2.118088112039797e-06, + "loss": 0.1277, + "num_input_tokens_seen": 24333272, + "step": 12427 + }, + { + "epoch": 1.6471835652750166, + "grad_norm": 10.239912033081055, + "learning_rate": 2.1177450529088287e-06, + "loss": 0.2437, + "num_input_tokens_seen": 24334920, + "step": 12428 + }, + { + "epoch": 1.6473161033797217, + "grad_norm": 9.564484596252441, + "learning_rate": 2.1174020011479997e-06, + "loss": 0.148, + "num_input_tokens_seen": 24336496, + "step": 12429 + }, + { + "epoch": 1.6474486414844267, + "grad_norm": 6.252840995788574, + "learning_rate": 2.117058956763924e-06, + "loss": 0.1123, + "num_input_tokens_seen": 24339248, + "step": 12430 + }, + { + "epoch": 1.6475811795891318, + "grad_norm": 1.5266553163528442, + "learning_rate": 2.1167159197632155e-06, + "loss": 0.0113, + "num_input_tokens_seen": 24341096, + "step": 12431 + }, + { + "epoch": 1.647713717693837, + "grad_norm": 5.005025863647461, + "learning_rate": 2.1163728901524874e-06, + "loss": 0.0592, + "num_input_tokens_seen": 24343352, + "step": 12432 + }, + { + "epoch": 1.647846255798542, + "grad_norm": 9.23836612701416, + "learning_rate": 2.116029867938356e-06, + "loss": 0.2653, + "num_input_tokens_seen": 24345248, + "step": 12433 + }, + { + "epoch": 1.6479787939032473, + "grad_norm": 5.859608173370361, + "learning_rate": 2.115686853127433e-06, + "loss": 0.1058, + "num_input_tokens_seen": 24347312, + "step": 12434 + }, + { + "epoch": 1.6481113320079523, + "grad_norm": 0.11150187253952026, + "learning_rate": 2.1153438457263333e-06, + "loss": 0.0007, + "num_input_tokens_seen": 24348384, + "step": 12435 + }, + { + "epoch": 1.6482438701126574, + "grad_norm": 8.661711692810059, + "learning_rate": 2.1150008457416697e-06, + "loss": 0.1311, + "num_input_tokens_seen": 24350000, + "step": 12436 + }, + { + "epoch": 1.6483764082173624, + "grad_norm": 0.07138270884752274, + "learning_rate": 2.114657853180054e-06, + "loss": 0.0004, + "num_input_tokens_seen": 24351528, + "step": 12437 + }, + { + "epoch": 1.6485089463220675, + "grad_norm": 0.5834004878997803, + "learning_rate": 2.1143148680481013e-06, + "loss": 0.0037, + "num_input_tokens_seen": 24353728, + "step": 12438 + }, + { + "epoch": 1.6486414844267727, + "grad_norm": 11.578585624694824, + "learning_rate": 2.113971890352425e-06, + "loss": 0.3861, + "num_input_tokens_seen": 24356208, + "step": 12439 + }, + { + "epoch": 1.6487740225314778, + "grad_norm": 0.024569083005189896, + "learning_rate": 2.113628920099636e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24357792, + "step": 12440 + }, + { + "epoch": 1.648906560636183, + "grad_norm": 5.5889434814453125, + "learning_rate": 2.1132859572963484e-06, + "loss": 0.1011, + "num_input_tokens_seen": 24360120, + "step": 12441 + }, + { + "epoch": 1.649039098740888, + "grad_norm": 0.046275217086076736, + "learning_rate": 2.1129430019491734e-06, + "loss": 0.0003, + "num_input_tokens_seen": 24362144, + "step": 12442 + }, + { + "epoch": 1.649171636845593, + "grad_norm": 2.100266695022583, + "learning_rate": 2.112600054064725e-06, + "loss": 0.0423, + "num_input_tokens_seen": 24364320, + "step": 12443 + }, + { + "epoch": 1.6493041749502981, + "grad_norm": 0.007969129830598831, + "learning_rate": 2.1122571136496152e-06, + "loss": 0.0001, + "num_input_tokens_seen": 24365472, + "step": 12444 + }, + { + "epoch": 1.6494367130550032, + "grad_norm": 4.371706008911133, + "learning_rate": 2.111914180710456e-06, + "loss": 0.066, + "num_input_tokens_seen": 24367608, + "step": 12445 + }, + { + "epoch": 1.6495692511597084, + "grad_norm": 0.03658229857683182, + "learning_rate": 2.111571255253858e-06, + "loss": 0.0003, + "num_input_tokens_seen": 24369040, + "step": 12446 + }, + { + "epoch": 1.6497017892644135, + "grad_norm": 13.418811798095703, + "learning_rate": 2.1112283372864343e-06, + "loss": 0.2112, + "num_input_tokens_seen": 24370424, + "step": 12447 + }, + { + "epoch": 1.6498343273691187, + "grad_norm": 3.0554769039154053, + "learning_rate": 2.1108854268147973e-06, + "loss": 0.0312, + "num_input_tokens_seen": 24372360, + "step": 12448 + }, + { + "epoch": 1.6499668654738238, + "grad_norm": 12.054802894592285, + "learning_rate": 2.1105425238455567e-06, + "loss": 0.3503, + "num_input_tokens_seen": 24374264, + "step": 12449 + }, + { + "epoch": 1.6500994035785288, + "grad_norm": 0.2724110782146454, + "learning_rate": 2.110199628385326e-06, + "loss": 0.0019, + "num_input_tokens_seen": 24376688, + "step": 12450 + }, + { + "epoch": 1.6502319416832338, + "grad_norm": 0.466886043548584, + "learning_rate": 2.1098567404407147e-06, + "loss": 0.0065, + "num_input_tokens_seen": 24378304, + "step": 12451 + }, + { + "epoch": 1.6503644797879389, + "grad_norm": 6.198694705963135, + "learning_rate": 2.1095138600183346e-06, + "loss": 0.1658, + "num_input_tokens_seen": 24380472, + "step": 12452 + }, + { + "epoch": 1.6504970178926441, + "grad_norm": 0.0294928140938282, + "learning_rate": 2.1091709871247967e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24381864, + "step": 12453 + }, + { + "epoch": 1.6506295559973494, + "grad_norm": 0.3923022150993347, + "learning_rate": 2.108828121766712e-06, + "loss": 0.0041, + "num_input_tokens_seen": 24384968, + "step": 12454 + }, + { + "epoch": 1.6507620941020544, + "grad_norm": 11.616300582885742, + "learning_rate": 2.108485263950691e-06, + "loss": 0.2926, + "num_input_tokens_seen": 24387480, + "step": 12455 + }, + { + "epoch": 1.6508946322067595, + "grad_norm": 1.1328603029251099, + "learning_rate": 2.1081424136833445e-06, + "loss": 0.0121, + "num_input_tokens_seen": 24389936, + "step": 12456 + }, + { + "epoch": 1.6510271703114645, + "grad_norm": 0.061572685837745667, + "learning_rate": 2.107799570971282e-06, + "loss": 0.0006, + "num_input_tokens_seen": 24391624, + "step": 12457 + }, + { + "epoch": 1.6511597084161695, + "grad_norm": 0.021700914949178696, + "learning_rate": 2.1074567358211138e-06, + "loss": 0.0001, + "num_input_tokens_seen": 24393856, + "step": 12458 + }, + { + "epoch": 1.6512922465208748, + "grad_norm": 0.034048426896333694, + "learning_rate": 2.1071139082394513e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24395576, + "step": 12459 + }, + { + "epoch": 1.6514247846255798, + "grad_norm": 3.639286994934082, + "learning_rate": 2.1067710882329045e-06, + "loss": 0.0542, + "num_input_tokens_seen": 24398848, + "step": 12460 + }, + { + "epoch": 1.651557322730285, + "grad_norm": 7.196837902069092, + "learning_rate": 2.1064282758080813e-06, + "loss": 0.1072, + "num_input_tokens_seen": 24399832, + "step": 12461 + }, + { + "epoch": 1.6516898608349901, + "grad_norm": 9.317455291748047, + "learning_rate": 2.1060854709715926e-06, + "loss": 0.2076, + "num_input_tokens_seen": 24401656, + "step": 12462 + }, + { + "epoch": 1.6518223989396952, + "grad_norm": 5.6045966148376465, + "learning_rate": 2.1057426737300472e-06, + "loss": 0.1836, + "num_input_tokens_seen": 24403688, + "step": 12463 + }, + { + "epoch": 1.6519549370444002, + "grad_norm": 5.364067077636719, + "learning_rate": 2.105399884090056e-06, + "loss": 0.1557, + "num_input_tokens_seen": 24405744, + "step": 12464 + }, + { + "epoch": 1.6520874751491053, + "grad_norm": 0.03390101343393326, + "learning_rate": 2.1050571020582267e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24407672, + "step": 12465 + }, + { + "epoch": 1.6522200132538105, + "grad_norm": 0.23934955894947052, + "learning_rate": 2.1047143276411696e-06, + "loss": 0.002, + "num_input_tokens_seen": 24410664, + "step": 12466 + }, + { + "epoch": 1.6523525513585156, + "grad_norm": 13.47858715057373, + "learning_rate": 2.1043715608454925e-06, + "loss": 0.3914, + "num_input_tokens_seen": 24412504, + "step": 12467 + }, + { + "epoch": 1.6524850894632208, + "grad_norm": 9.070810317993164, + "learning_rate": 2.104028801677804e-06, + "loss": 0.1937, + "num_input_tokens_seen": 24414632, + "step": 12468 + }, + { + "epoch": 1.6526176275679259, + "grad_norm": 0.33887800574302673, + "learning_rate": 2.1036860501447146e-06, + "loss": 0.0024, + "num_input_tokens_seen": 24416144, + "step": 12469 + }, + { + "epoch": 1.6527501656726309, + "grad_norm": 6.521589279174805, + "learning_rate": 2.103343306252831e-06, + "loss": 0.196, + "num_input_tokens_seen": 24419328, + "step": 12470 + }, + { + "epoch": 1.652882703777336, + "grad_norm": 9.708377838134766, + "learning_rate": 2.1030005700087623e-06, + "loss": 0.251, + "num_input_tokens_seen": 24421480, + "step": 12471 + }, + { + "epoch": 1.653015241882041, + "grad_norm": 0.0695500299334526, + "learning_rate": 2.1026578414191158e-06, + "loss": 0.0005, + "num_input_tokens_seen": 24423504, + "step": 12472 + }, + { + "epoch": 1.6531477799867462, + "grad_norm": 0.27429261803627014, + "learning_rate": 2.1023151204905002e-06, + "loss": 0.0015, + "num_input_tokens_seen": 24425160, + "step": 12473 + }, + { + "epoch": 1.6532803180914513, + "grad_norm": 0.379561185836792, + "learning_rate": 2.1019724072295242e-06, + "loss": 0.0019, + "num_input_tokens_seen": 24427488, + "step": 12474 + }, + { + "epoch": 1.6534128561961565, + "grad_norm": 6.344111919403076, + "learning_rate": 2.101629701642795e-06, + "loss": 0.2113, + "num_input_tokens_seen": 24429328, + "step": 12475 + }, + { + "epoch": 1.6535453943008616, + "grad_norm": 5.661983966827393, + "learning_rate": 2.101287003736919e-06, + "loss": 0.2362, + "num_input_tokens_seen": 24431808, + "step": 12476 + }, + { + "epoch": 1.6536779324055666, + "grad_norm": 0.030649060383439064, + "learning_rate": 2.1009443135185055e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24433264, + "step": 12477 + }, + { + "epoch": 1.6538104705102716, + "grad_norm": 1.733249306678772, + "learning_rate": 2.10060163099416e-06, + "loss": 0.0183, + "num_input_tokens_seen": 24434544, + "step": 12478 + }, + { + "epoch": 1.6539430086149767, + "grad_norm": 14.427350044250488, + "learning_rate": 2.100258956170491e-06, + "loss": 0.2573, + "num_input_tokens_seen": 24436464, + "step": 12479 + }, + { + "epoch": 1.654075546719682, + "grad_norm": 4.297144889831543, + "learning_rate": 2.099916289054105e-06, + "loss": 0.0402, + "num_input_tokens_seen": 24439568, + "step": 12480 + }, + { + "epoch": 1.654208084824387, + "grad_norm": 7.878595352172852, + "learning_rate": 2.0995736296516095e-06, + "loss": 0.1424, + "num_input_tokens_seen": 24441384, + "step": 12481 + }, + { + "epoch": 1.6543406229290922, + "grad_norm": 5.498253345489502, + "learning_rate": 2.0992309779696107e-06, + "loss": 0.0533, + "num_input_tokens_seen": 24443880, + "step": 12482 + }, + { + "epoch": 1.6544731610337973, + "grad_norm": 0.5326500535011292, + "learning_rate": 2.0988883340147153e-06, + "loss": 0.005, + "num_input_tokens_seen": 24445688, + "step": 12483 + }, + { + "epoch": 1.6546056991385023, + "grad_norm": 13.26146125793457, + "learning_rate": 2.098545697793528e-06, + "loss": 0.1083, + "num_input_tokens_seen": 24447616, + "step": 12484 + }, + { + "epoch": 1.6547382372432073, + "grad_norm": 8.662447929382324, + "learning_rate": 2.0982030693126573e-06, + "loss": 0.1542, + "num_input_tokens_seen": 24450336, + "step": 12485 + }, + { + "epoch": 1.6548707753479124, + "grad_norm": 0.15563297271728516, + "learning_rate": 2.0978604485787094e-06, + "loss": 0.0009, + "num_input_tokens_seen": 24453336, + "step": 12486 + }, + { + "epoch": 1.6550033134526176, + "grad_norm": 0.02556902915239334, + "learning_rate": 2.097517835598289e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24454760, + "step": 12487 + }, + { + "epoch": 1.6551358515573227, + "grad_norm": 3.9374892711639404, + "learning_rate": 2.0971752303780026e-06, + "loss": 0.1021, + "num_input_tokens_seen": 24456824, + "step": 12488 + }, + { + "epoch": 1.655268389662028, + "grad_norm": 0.982703447341919, + "learning_rate": 2.0968326329244545e-06, + "loss": 0.0086, + "num_input_tokens_seen": 24458912, + "step": 12489 + }, + { + "epoch": 1.655400927766733, + "grad_norm": 15.71157455444336, + "learning_rate": 2.0964900432442527e-06, + "loss": 0.2874, + "num_input_tokens_seen": 24461336, + "step": 12490 + }, + { + "epoch": 1.655533465871438, + "grad_norm": 5.574721336364746, + "learning_rate": 2.0961474613440007e-06, + "loss": 0.0692, + "num_input_tokens_seen": 24463768, + "step": 12491 + }, + { + "epoch": 1.655666003976143, + "grad_norm": 9.55152416229248, + "learning_rate": 2.0958048872303044e-06, + "loss": 0.2041, + "num_input_tokens_seen": 24465600, + "step": 12492 + }, + { + "epoch": 1.655798542080848, + "grad_norm": 13.484582901000977, + "learning_rate": 2.0954623209097687e-06, + "loss": 0.3454, + "num_input_tokens_seen": 24467320, + "step": 12493 + }, + { + "epoch": 1.6559310801855534, + "grad_norm": 3.6844241619110107, + "learning_rate": 2.0951197623889977e-06, + "loss": 0.0194, + "num_input_tokens_seen": 24469040, + "step": 12494 + }, + { + "epoch": 1.6560636182902586, + "grad_norm": 3.451279878616333, + "learning_rate": 2.094777211674598e-06, + "loss": 0.0445, + "num_input_tokens_seen": 24470736, + "step": 12495 + }, + { + "epoch": 1.6561961563949636, + "grad_norm": 7.777037620544434, + "learning_rate": 2.0944346687731727e-06, + "loss": 0.1592, + "num_input_tokens_seen": 24472576, + "step": 12496 + }, + { + "epoch": 1.6563286944996687, + "grad_norm": 3.831416368484497, + "learning_rate": 2.0940921336913277e-06, + "loss": 0.0395, + "num_input_tokens_seen": 24474160, + "step": 12497 + }, + { + "epoch": 1.6564612326043737, + "grad_norm": 6.333034992218018, + "learning_rate": 2.093749606435666e-06, + "loss": 0.1267, + "num_input_tokens_seen": 24475992, + "step": 12498 + }, + { + "epoch": 1.6565937707090788, + "grad_norm": 3.566476345062256, + "learning_rate": 2.093407087012791e-06, + "loss": 0.037, + "num_input_tokens_seen": 24478568, + "step": 12499 + }, + { + "epoch": 1.6567263088137838, + "grad_norm": 4.482105255126953, + "learning_rate": 2.0930645754293088e-06, + "loss": 0.0437, + "num_input_tokens_seen": 24479792, + "step": 12500 + }, + { + "epoch": 1.656858846918489, + "grad_norm": 6.188427448272705, + "learning_rate": 2.0927220716918226e-06, + "loss": 0.1452, + "num_input_tokens_seen": 24482504, + "step": 12501 + }, + { + "epoch": 1.6569913850231943, + "grad_norm": 6.203248500823975, + "learning_rate": 2.0923795758069355e-06, + "loss": 0.1297, + "num_input_tokens_seen": 24485144, + "step": 12502 + }, + { + "epoch": 1.6571239231278994, + "grad_norm": 14.572771072387695, + "learning_rate": 2.0920370877812515e-06, + "loss": 0.1299, + "num_input_tokens_seen": 24487360, + "step": 12503 + }, + { + "epoch": 1.6572564612326044, + "grad_norm": 0.023911679163575172, + "learning_rate": 2.091694607621373e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24489056, + "step": 12504 + }, + { + "epoch": 1.6573889993373094, + "grad_norm": 0.04058672860264778, + "learning_rate": 2.0913521353339047e-06, + "loss": 0.0003, + "num_input_tokens_seen": 24490472, + "step": 12505 + }, + { + "epoch": 1.6575215374420145, + "grad_norm": 0.7940712571144104, + "learning_rate": 2.0910096709254497e-06, + "loss": 0.0047, + "num_input_tokens_seen": 24492440, + "step": 12506 + }, + { + "epoch": 1.6576540755467197, + "grad_norm": 0.32566484808921814, + "learning_rate": 2.0906672144026105e-06, + "loss": 0.0034, + "num_input_tokens_seen": 24493528, + "step": 12507 + }, + { + "epoch": 1.6577866136514248, + "grad_norm": 4.789429664611816, + "learning_rate": 2.0903247657719893e-06, + "loss": 0.0864, + "num_input_tokens_seen": 24495000, + "step": 12508 + }, + { + "epoch": 1.65791915175613, + "grad_norm": 14.886362075805664, + "learning_rate": 2.0899823250401884e-06, + "loss": 0.1383, + "num_input_tokens_seen": 24496552, + "step": 12509 + }, + { + "epoch": 1.658051689860835, + "grad_norm": 2.8449811935424805, + "learning_rate": 2.0896398922138124e-06, + "loss": 0.0455, + "num_input_tokens_seen": 24498448, + "step": 12510 + }, + { + "epoch": 1.65818422796554, + "grad_norm": 5.603843688964844, + "learning_rate": 2.0892974672994616e-06, + "loss": 0.1309, + "num_input_tokens_seen": 24500344, + "step": 12511 + }, + { + "epoch": 1.6583167660702451, + "grad_norm": 0.040200524032115936, + "learning_rate": 2.0889550503037397e-06, + "loss": 0.0003, + "num_input_tokens_seen": 24502240, + "step": 12512 + }, + { + "epoch": 1.6584493041749502, + "grad_norm": 1.5589414834976196, + "learning_rate": 2.088612641233248e-06, + "loss": 0.047, + "num_input_tokens_seen": 24504016, + "step": 12513 + }, + { + "epoch": 1.6585818422796554, + "grad_norm": 0.021897558122873306, + "learning_rate": 2.0882702400945878e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24505520, + "step": 12514 + }, + { + "epoch": 1.6587143803843605, + "grad_norm": 0.0349416583776474, + "learning_rate": 2.087927846894361e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24507280, + "step": 12515 + }, + { + "epoch": 1.6588469184890657, + "grad_norm": 11.089234352111816, + "learning_rate": 2.0875854616391704e-06, + "loss": 0.1081, + "num_input_tokens_seen": 24509048, + "step": 12516 + }, + { + "epoch": 1.6589794565937708, + "grad_norm": 0.02502138912677765, + "learning_rate": 2.087243084335616e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24510776, + "step": 12517 + }, + { + "epoch": 1.6591119946984758, + "grad_norm": 7.318490982055664, + "learning_rate": 2.086900714990301e-06, + "loss": 0.0372, + "num_input_tokens_seen": 24513248, + "step": 12518 + }, + { + "epoch": 1.6592445328031808, + "grad_norm": 0.06691213697195053, + "learning_rate": 2.086558353609824e-06, + "loss": 0.0005, + "num_input_tokens_seen": 24515896, + "step": 12519 + }, + { + "epoch": 1.6593770709078859, + "grad_norm": 9.124857902526855, + "learning_rate": 2.0862160002007862e-06, + "loss": 0.1411, + "num_input_tokens_seen": 24517760, + "step": 12520 + }, + { + "epoch": 1.6595096090125911, + "grad_norm": 8.442754745483398, + "learning_rate": 2.0858736547697913e-06, + "loss": 0.1265, + "num_input_tokens_seen": 24521072, + "step": 12521 + }, + { + "epoch": 1.6596421471172962, + "grad_norm": 12.957094192504883, + "learning_rate": 2.0855313173234376e-06, + "loss": 0.2824, + "num_input_tokens_seen": 24523272, + "step": 12522 + }, + { + "epoch": 1.6597746852220014, + "grad_norm": 11.337525367736816, + "learning_rate": 2.0851889878683256e-06, + "loss": 0.0921, + "num_input_tokens_seen": 24524976, + "step": 12523 + }, + { + "epoch": 1.6599072233267065, + "grad_norm": 0.05655369162559509, + "learning_rate": 2.0848466664110566e-06, + "loss": 0.0004, + "num_input_tokens_seen": 24526736, + "step": 12524 + }, + { + "epoch": 1.6600397614314115, + "grad_norm": 4.300192356109619, + "learning_rate": 2.084504352958229e-06, + "loss": 0.0735, + "num_input_tokens_seen": 24528568, + "step": 12525 + }, + { + "epoch": 1.6601722995361166, + "grad_norm": 8.011896133422852, + "learning_rate": 2.0841620475164453e-06, + "loss": 0.1172, + "num_input_tokens_seen": 24530832, + "step": 12526 + }, + { + "epoch": 1.6603048376408216, + "grad_norm": 0.05731885880231857, + "learning_rate": 2.0838197500923043e-06, + "loss": 0.0004, + "num_input_tokens_seen": 24532168, + "step": 12527 + }, + { + "epoch": 1.6604373757455269, + "grad_norm": 7.204423904418945, + "learning_rate": 2.0834774606924054e-06, + "loss": 0.1312, + "num_input_tokens_seen": 24533784, + "step": 12528 + }, + { + "epoch": 1.660569913850232, + "grad_norm": 6.379366397857666, + "learning_rate": 2.0831351793233483e-06, + "loss": 0.0559, + "num_input_tokens_seen": 24536096, + "step": 12529 + }, + { + "epoch": 1.6607024519549372, + "grad_norm": 12.552786827087402, + "learning_rate": 2.0827929059917314e-06, + "loss": 0.2615, + "num_input_tokens_seen": 24537944, + "step": 12530 + }, + { + "epoch": 1.6608349900596422, + "grad_norm": 1.3013322353363037, + "learning_rate": 2.0824506407041568e-06, + "loss": 0.0131, + "num_input_tokens_seen": 24540208, + "step": 12531 + }, + { + "epoch": 1.6609675281643472, + "grad_norm": 0.026617733761668205, + "learning_rate": 2.082108383467221e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24542360, + "step": 12532 + }, + { + "epoch": 1.6611000662690523, + "grad_norm": 6.615713119506836, + "learning_rate": 2.0817661342875244e-06, + "loss": 0.0683, + "num_input_tokens_seen": 24543928, + "step": 12533 + }, + { + "epoch": 1.6612326043737573, + "grad_norm": 5.202270030975342, + "learning_rate": 2.081423893171665e-06, + "loss": 0.125, + "num_input_tokens_seen": 24546296, + "step": 12534 + }, + { + "epoch": 1.6613651424784626, + "grad_norm": 12.771049499511719, + "learning_rate": 2.0810816601262408e-06, + "loss": 0.2113, + "num_input_tokens_seen": 24548584, + "step": 12535 + }, + { + "epoch": 1.6614976805831678, + "grad_norm": 0.14189909398555756, + "learning_rate": 2.0807394351578525e-06, + "loss": 0.0009, + "num_input_tokens_seen": 24550576, + "step": 12536 + }, + { + "epoch": 1.6616302186878729, + "grad_norm": 8.984435081481934, + "learning_rate": 2.080397218273097e-06, + "loss": 0.2845, + "num_input_tokens_seen": 24552728, + "step": 12537 + }, + { + "epoch": 1.661762756792578, + "grad_norm": 1.418674111366272, + "learning_rate": 2.080055009478572e-06, + "loss": 0.0086, + "num_input_tokens_seen": 24555464, + "step": 12538 + }, + { + "epoch": 1.661895294897283, + "grad_norm": 0.7918649315834045, + "learning_rate": 2.0797128087808768e-06, + "loss": 0.0131, + "num_input_tokens_seen": 24558280, + "step": 12539 + }, + { + "epoch": 1.662027833001988, + "grad_norm": 7.088196277618408, + "learning_rate": 2.079370616186608e-06, + "loss": 0.1107, + "num_input_tokens_seen": 24560496, + "step": 12540 + }, + { + "epoch": 1.662160371106693, + "grad_norm": 7.949176788330078, + "learning_rate": 2.0790284317023633e-06, + "loss": 0.1897, + "num_input_tokens_seen": 24561984, + "step": 12541 + }, + { + "epoch": 1.6622929092113983, + "grad_norm": 19.827503204345703, + "learning_rate": 2.078686255334742e-06, + "loss": 0.5639, + "num_input_tokens_seen": 24563904, + "step": 12542 + }, + { + "epoch": 1.6624254473161035, + "grad_norm": 3.3346900939941406, + "learning_rate": 2.0783440870903403e-06, + "loss": 0.0613, + "num_input_tokens_seen": 24565848, + "step": 12543 + }, + { + "epoch": 1.6625579854208086, + "grad_norm": 8.350682258605957, + "learning_rate": 2.078001926975755e-06, + "loss": 0.0744, + "num_input_tokens_seen": 24567112, + "step": 12544 + }, + { + "epoch": 1.6626905235255136, + "grad_norm": 1.1854501962661743, + "learning_rate": 2.077659774997584e-06, + "loss": 0.0057, + "num_input_tokens_seen": 24568488, + "step": 12545 + }, + { + "epoch": 1.6628230616302186, + "grad_norm": 0.12997469305992126, + "learning_rate": 2.0773176311624233e-06, + "loss": 0.0008, + "num_input_tokens_seen": 24570408, + "step": 12546 + }, + { + "epoch": 1.6629555997349237, + "grad_norm": 8.987651824951172, + "learning_rate": 2.0769754954768706e-06, + "loss": 0.1976, + "num_input_tokens_seen": 24572680, + "step": 12547 + }, + { + "epoch": 1.663088137839629, + "grad_norm": 5.774478912353516, + "learning_rate": 2.0766333679475226e-06, + "loss": 0.024, + "num_input_tokens_seen": 24575288, + "step": 12548 + }, + { + "epoch": 1.663220675944334, + "grad_norm": 0.04823017120361328, + "learning_rate": 2.076291248580975e-06, + "loss": 0.0003, + "num_input_tokens_seen": 24577456, + "step": 12549 + }, + { + "epoch": 1.6633532140490392, + "grad_norm": 3.733994245529175, + "learning_rate": 2.0759491373838247e-06, + "loss": 0.0266, + "num_input_tokens_seen": 24578800, + "step": 12550 + }, + { + "epoch": 1.6634857521537443, + "grad_norm": 0.06245024874806404, + "learning_rate": 2.0756070343626665e-06, + "loss": 0.0003, + "num_input_tokens_seen": 24580608, + "step": 12551 + }, + { + "epoch": 1.6636182902584493, + "grad_norm": 11.032949447631836, + "learning_rate": 2.0752649395240985e-06, + "loss": 0.1617, + "num_input_tokens_seen": 24582216, + "step": 12552 + }, + { + "epoch": 1.6637508283631544, + "grad_norm": 3.4304184913635254, + "learning_rate": 2.0749228528747154e-06, + "loss": 0.0241, + "num_input_tokens_seen": 24584760, + "step": 12553 + }, + { + "epoch": 1.6638833664678594, + "grad_norm": 0.5522821545600891, + "learning_rate": 2.074580774421113e-06, + "loss": 0.0051, + "num_input_tokens_seen": 24586368, + "step": 12554 + }, + { + "epoch": 1.6640159045725647, + "grad_norm": 0.01259677391499281, + "learning_rate": 2.074238704169886e-06, + "loss": 0.0001, + "num_input_tokens_seen": 24587832, + "step": 12555 + }, + { + "epoch": 1.6641484426772697, + "grad_norm": 23.19214630126953, + "learning_rate": 2.0738966421276306e-06, + "loss": 0.5154, + "num_input_tokens_seen": 24589768, + "step": 12556 + }, + { + "epoch": 1.664280980781975, + "grad_norm": 1.189054250717163, + "learning_rate": 2.0735545883009427e-06, + "loss": 0.0122, + "num_input_tokens_seen": 24591176, + "step": 12557 + }, + { + "epoch": 1.66441351888668, + "grad_norm": 6.5798211097717285, + "learning_rate": 2.073212542696416e-06, + "loss": 0.0512, + "num_input_tokens_seen": 24593472, + "step": 12558 + }, + { + "epoch": 1.664546056991385, + "grad_norm": 0.012018929235637188, + "learning_rate": 2.0728705053206466e-06, + "loss": 0.0001, + "num_input_tokens_seen": 24595120, + "step": 12559 + }, + { + "epoch": 1.66467859509609, + "grad_norm": 25.996784210205078, + "learning_rate": 2.072528476180228e-06, + "loss": 0.4534, + "num_input_tokens_seen": 24597960, + "step": 12560 + }, + { + "epoch": 1.664811133200795, + "grad_norm": 14.749024391174316, + "learning_rate": 2.072186455281755e-06, + "loss": 0.218, + "num_input_tokens_seen": 24600592, + "step": 12561 + }, + { + "epoch": 1.6649436713055004, + "grad_norm": 2.8969414234161377, + "learning_rate": 2.071844442631822e-06, + "loss": 0.0393, + "num_input_tokens_seen": 24602592, + "step": 12562 + }, + { + "epoch": 1.6650762094102054, + "grad_norm": 0.9519136548042297, + "learning_rate": 2.0715024382370245e-06, + "loss": 0.0234, + "num_input_tokens_seen": 24604848, + "step": 12563 + }, + { + "epoch": 1.6652087475149107, + "grad_norm": 1.2902613878250122, + "learning_rate": 2.071160442103955e-06, + "loss": 0.0074, + "num_input_tokens_seen": 24606280, + "step": 12564 + }, + { + "epoch": 1.6653412856196157, + "grad_norm": 1.1296002864837646, + "learning_rate": 2.0708184542392084e-06, + "loss": 0.0114, + "num_input_tokens_seen": 24607472, + "step": 12565 + }, + { + "epoch": 1.6654738237243207, + "grad_norm": 2.483086347579956, + "learning_rate": 2.0704764746493773e-06, + "loss": 0.0488, + "num_input_tokens_seen": 24608736, + "step": 12566 + }, + { + "epoch": 1.6656063618290258, + "grad_norm": 11.920053482055664, + "learning_rate": 2.0701345033410568e-06, + "loss": 0.2988, + "num_input_tokens_seen": 24610232, + "step": 12567 + }, + { + "epoch": 1.6657388999337308, + "grad_norm": 0.0851629227399826, + "learning_rate": 2.0697925403208393e-06, + "loss": 0.0006, + "num_input_tokens_seen": 24612040, + "step": 12568 + }, + { + "epoch": 1.665871438038436, + "grad_norm": 4.684647560119629, + "learning_rate": 2.069450585595319e-06, + "loss": 0.0512, + "num_input_tokens_seen": 24614440, + "step": 12569 + }, + { + "epoch": 1.666003976143141, + "grad_norm": 1.0345211029052734, + "learning_rate": 2.069108639171088e-06, + "loss": 0.0064, + "num_input_tokens_seen": 24617192, + "step": 12570 + }, + { + "epoch": 1.6661365142478464, + "grad_norm": 4.9417314529418945, + "learning_rate": 2.0687667010547397e-06, + "loss": 0.1322, + "num_input_tokens_seen": 24618912, + "step": 12571 + }, + { + "epoch": 1.6662690523525514, + "grad_norm": 3.0760343074798584, + "learning_rate": 2.068424771252866e-06, + "loss": 0.0074, + "num_input_tokens_seen": 24621560, + "step": 12572 + }, + { + "epoch": 1.6664015904572564, + "grad_norm": 7.992963790893555, + "learning_rate": 2.0680828497720613e-06, + "loss": 0.2593, + "num_input_tokens_seen": 24623512, + "step": 12573 + }, + { + "epoch": 1.6665341285619615, + "grad_norm": 5.043349266052246, + "learning_rate": 2.0677409366189176e-06, + "loss": 0.0558, + "num_input_tokens_seen": 24624992, + "step": 12574 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 5.909275054931641, + "learning_rate": 2.067399031800027e-06, + "loss": 0.024, + "num_input_tokens_seen": 24627416, + "step": 12575 + }, + { + "epoch": 1.6667992047713718, + "grad_norm": 6.859180450439453, + "learning_rate": 2.0670571353219806e-06, + "loss": 0.1828, + "num_input_tokens_seen": 24629552, + "step": 12576 + }, + { + "epoch": 1.666931742876077, + "grad_norm": 2.6421396732330322, + "learning_rate": 2.066715247191371e-06, + "loss": 0.0644, + "num_input_tokens_seen": 24631256, + "step": 12577 + }, + { + "epoch": 1.667064280980782, + "grad_norm": 1.6464276313781738, + "learning_rate": 2.066373367414791e-06, + "loss": 0.037, + "num_input_tokens_seen": 24633136, + "step": 12578 + }, + { + "epoch": 1.6671968190854871, + "grad_norm": 13.033422470092773, + "learning_rate": 2.0660314959988316e-06, + "loss": 0.1588, + "num_input_tokens_seen": 24635048, + "step": 12579 + }, + { + "epoch": 1.6673293571901922, + "grad_norm": 7.838176727294922, + "learning_rate": 2.065689632950085e-06, + "loss": 0.219, + "num_input_tokens_seen": 24637056, + "step": 12580 + }, + { + "epoch": 1.6674618952948972, + "grad_norm": 0.04336291551589966, + "learning_rate": 2.0653477782751415e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24638640, + "step": 12581 + }, + { + "epoch": 1.6675944333996022, + "grad_norm": 5.1652655601501465, + "learning_rate": 2.065005931980592e-06, + "loss": 0.0847, + "num_input_tokens_seen": 24640464, + "step": 12582 + }, + { + "epoch": 1.6677269715043075, + "grad_norm": 6.65416955947876, + "learning_rate": 2.0646640940730285e-06, + "loss": 0.1027, + "num_input_tokens_seen": 24643320, + "step": 12583 + }, + { + "epoch": 1.6678595096090127, + "grad_norm": 6.0637006759643555, + "learning_rate": 2.064322264559042e-06, + "loss": 0.1455, + "num_input_tokens_seen": 24646224, + "step": 12584 + }, + { + "epoch": 1.6679920477137178, + "grad_norm": 7.643784046173096, + "learning_rate": 2.0639804434452225e-06, + "loss": 0.1238, + "num_input_tokens_seen": 24648456, + "step": 12585 + }, + { + "epoch": 1.6681245858184228, + "grad_norm": 2.7666585445404053, + "learning_rate": 2.063638630738161e-06, + "loss": 0.0637, + "num_input_tokens_seen": 24650008, + "step": 12586 + }, + { + "epoch": 1.6682571239231279, + "grad_norm": 0.07302656024694443, + "learning_rate": 2.063296826444447e-06, + "loss": 0.0005, + "num_input_tokens_seen": 24652016, + "step": 12587 + }, + { + "epoch": 1.668389662027833, + "grad_norm": 4.4107985496521, + "learning_rate": 2.062955030570672e-06, + "loss": 0.0767, + "num_input_tokens_seen": 24653568, + "step": 12588 + }, + { + "epoch": 1.6685222001325382, + "grad_norm": 7.478450298309326, + "learning_rate": 2.0626132431234256e-06, + "loss": 0.0888, + "num_input_tokens_seen": 24655688, + "step": 12589 + }, + { + "epoch": 1.6686547382372432, + "grad_norm": 3.4438467025756836, + "learning_rate": 2.0622714641092986e-06, + "loss": 0.0563, + "num_input_tokens_seen": 24657224, + "step": 12590 + }, + { + "epoch": 1.6687872763419485, + "grad_norm": 12.547554969787598, + "learning_rate": 2.061929693534878e-06, + "loss": 0.245, + "num_input_tokens_seen": 24659200, + "step": 12591 + }, + { + "epoch": 1.6689198144466535, + "grad_norm": 11.405672073364258, + "learning_rate": 2.0615879314067556e-06, + "loss": 0.2422, + "num_input_tokens_seen": 24662256, + "step": 12592 + }, + { + "epoch": 1.6690523525513585, + "grad_norm": 8.424065589904785, + "learning_rate": 2.0612461777315205e-06, + "loss": 0.2784, + "num_input_tokens_seen": 24664896, + "step": 12593 + }, + { + "epoch": 1.6691848906560636, + "grad_norm": 5.941987991333008, + "learning_rate": 2.0609044325157618e-06, + "loss": 0.0543, + "num_input_tokens_seen": 24667176, + "step": 12594 + }, + { + "epoch": 1.6693174287607686, + "grad_norm": 11.535968780517578, + "learning_rate": 2.060562695766069e-06, + "loss": 0.2176, + "num_input_tokens_seen": 24669816, + "step": 12595 + }, + { + "epoch": 1.6694499668654739, + "grad_norm": 1.825566291809082, + "learning_rate": 2.06022096748903e-06, + "loss": 0.0226, + "num_input_tokens_seen": 24671392, + "step": 12596 + }, + { + "epoch": 1.669582504970179, + "grad_norm": 0.07434290647506714, + "learning_rate": 2.059879247691235e-06, + "loss": 0.0005, + "num_input_tokens_seen": 24673224, + "step": 12597 + }, + { + "epoch": 1.6697150430748842, + "grad_norm": 10.774996757507324, + "learning_rate": 2.05953753637927e-06, + "loss": 0.0836, + "num_input_tokens_seen": 24675440, + "step": 12598 + }, + { + "epoch": 1.6698475811795892, + "grad_norm": 0.09268884360790253, + "learning_rate": 2.0591958335597267e-06, + "loss": 0.0006, + "num_input_tokens_seen": 24676808, + "step": 12599 + }, + { + "epoch": 1.6699801192842942, + "grad_norm": 15.444400787353516, + "learning_rate": 2.058854139239191e-06, + "loss": 0.5013, + "num_input_tokens_seen": 24679704, + "step": 12600 + }, + { + "epoch": 1.6701126573889993, + "grad_norm": 8.134020805358887, + "learning_rate": 2.058512453424253e-06, + "loss": 0.1696, + "num_input_tokens_seen": 24681776, + "step": 12601 + }, + { + "epoch": 1.6702451954937043, + "grad_norm": 0.23860086500644684, + "learning_rate": 2.0581707761214983e-06, + "loss": 0.0016, + "num_input_tokens_seen": 24684072, + "step": 12602 + }, + { + "epoch": 1.6703777335984096, + "grad_norm": 0.11969126760959625, + "learning_rate": 2.0578291073375154e-06, + "loss": 0.0008, + "num_input_tokens_seen": 24687512, + "step": 12603 + }, + { + "epoch": 1.6705102717031146, + "grad_norm": 6.728306293487549, + "learning_rate": 2.0574874470788936e-06, + "loss": 0.097, + "num_input_tokens_seen": 24688808, + "step": 12604 + }, + { + "epoch": 1.6706428098078199, + "grad_norm": 8.078740119934082, + "learning_rate": 2.057145795352219e-06, + "loss": 0.1575, + "num_input_tokens_seen": 24691184, + "step": 12605 + }, + { + "epoch": 1.670775347912525, + "grad_norm": 4.105355262756348, + "learning_rate": 2.0568041521640785e-06, + "loss": 0.0585, + "num_input_tokens_seen": 24692968, + "step": 12606 + }, + { + "epoch": 1.67090788601723, + "grad_norm": 0.03174453601241112, + "learning_rate": 2.05646251752106e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24694040, + "step": 12607 + }, + { + "epoch": 1.671040424121935, + "grad_norm": 2.968130350112915, + "learning_rate": 2.056120891429749e-06, + "loss": 0.0826, + "num_input_tokens_seen": 24697192, + "step": 12608 + }, + { + "epoch": 1.67117296222664, + "grad_norm": 9.39665699005127, + "learning_rate": 2.0557792738967343e-06, + "loss": 0.2193, + "num_input_tokens_seen": 24699000, + "step": 12609 + }, + { + "epoch": 1.6713055003313453, + "grad_norm": 1.4442487955093384, + "learning_rate": 2.0554376649286023e-06, + "loss": 0.0072, + "num_input_tokens_seen": 24701464, + "step": 12610 + }, + { + "epoch": 1.6714380384360503, + "grad_norm": 0.024023912847042084, + "learning_rate": 2.0550960645319377e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24702696, + "step": 12611 + }, + { + "epoch": 1.6715705765407556, + "grad_norm": 6.288366317749023, + "learning_rate": 2.054754472713329e-06, + "loss": 0.133, + "num_input_tokens_seen": 24704576, + "step": 12612 + }, + { + "epoch": 1.6717031146454606, + "grad_norm": 0.5846000909805298, + "learning_rate": 2.05441288947936e-06, + "loss": 0.0059, + "num_input_tokens_seen": 24706032, + "step": 12613 + }, + { + "epoch": 1.6718356527501657, + "grad_norm": 0.11612889915704727, + "learning_rate": 2.0540713148366185e-06, + "loss": 0.0007, + "num_input_tokens_seen": 24707640, + "step": 12614 + }, + { + "epoch": 1.6719681908548707, + "grad_norm": 5.25131368637085, + "learning_rate": 2.0537297487916898e-06, + "loss": 0.1101, + "num_input_tokens_seen": 24710624, + "step": 12615 + }, + { + "epoch": 1.6721007289595757, + "grad_norm": 0.0853138193488121, + "learning_rate": 2.0533881913511597e-06, + "loss": 0.0006, + "num_input_tokens_seen": 24712256, + "step": 12616 + }, + { + "epoch": 1.672233267064281, + "grad_norm": 0.12376310676336288, + "learning_rate": 2.0530466425216127e-06, + "loss": 0.0007, + "num_input_tokens_seen": 24713880, + "step": 12617 + }, + { + "epoch": 1.672365805168986, + "grad_norm": 7.602744102478027, + "learning_rate": 2.0527051023096345e-06, + "loss": 0.129, + "num_input_tokens_seen": 24715856, + "step": 12618 + }, + { + "epoch": 1.6724983432736913, + "grad_norm": 1.3021348714828491, + "learning_rate": 2.0523635707218116e-06, + "loss": 0.0116, + "num_input_tokens_seen": 24717552, + "step": 12619 + }, + { + "epoch": 1.6726308813783963, + "grad_norm": 7.130713939666748, + "learning_rate": 2.0520220477647272e-06, + "loss": 0.2238, + "num_input_tokens_seen": 24719904, + "step": 12620 + }, + { + "epoch": 1.6727634194831014, + "grad_norm": 0.33442631363868713, + "learning_rate": 2.051680533444967e-06, + "loss": 0.0023, + "num_input_tokens_seen": 24722032, + "step": 12621 + }, + { + "epoch": 1.6728959575878064, + "grad_norm": 9.061714172363281, + "learning_rate": 2.0513390277691156e-06, + "loss": 0.1026, + "num_input_tokens_seen": 24724336, + "step": 12622 + }, + { + "epoch": 1.6730284956925114, + "grad_norm": 6.094336032867432, + "learning_rate": 2.0509975307437564e-06, + "loss": 0.1265, + "num_input_tokens_seen": 24725856, + "step": 12623 + }, + { + "epoch": 1.6731610337972167, + "grad_norm": 0.11674538999795914, + "learning_rate": 2.050656042375475e-06, + "loss": 0.0008, + "num_input_tokens_seen": 24727832, + "step": 12624 + }, + { + "epoch": 1.673293571901922, + "grad_norm": 0.40906742215156555, + "learning_rate": 2.0503145626708554e-06, + "loss": 0.0041, + "num_input_tokens_seen": 24729560, + "step": 12625 + }, + { + "epoch": 1.673426110006627, + "grad_norm": 1.3542370796203613, + "learning_rate": 2.0499730916364806e-06, + "loss": 0.0046, + "num_input_tokens_seen": 24731616, + "step": 12626 + }, + { + "epoch": 1.673558648111332, + "grad_norm": 8.894756317138672, + "learning_rate": 2.049631629278936e-06, + "loss": 0.0362, + "num_input_tokens_seen": 24733104, + "step": 12627 + }, + { + "epoch": 1.673691186216037, + "grad_norm": 0.17146232724189758, + "learning_rate": 2.049290175604804e-06, + "loss": 0.0012, + "num_input_tokens_seen": 24735264, + "step": 12628 + }, + { + "epoch": 1.6738237243207421, + "grad_norm": 3.6801795959472656, + "learning_rate": 2.0489487306206676e-06, + "loss": 0.0224, + "num_input_tokens_seen": 24736672, + "step": 12629 + }, + { + "epoch": 1.6739562624254472, + "grad_norm": 11.28310775756836, + "learning_rate": 2.048607294333111e-06, + "loss": 0.1242, + "num_input_tokens_seen": 24737696, + "step": 12630 + }, + { + "epoch": 1.6740888005301524, + "grad_norm": 13.773236274719238, + "learning_rate": 2.0482658667487183e-06, + "loss": 0.227, + "num_input_tokens_seen": 24739472, + "step": 12631 + }, + { + "epoch": 1.6742213386348577, + "grad_norm": 0.10588119179010391, + "learning_rate": 2.04792444787407e-06, + "loss": 0.0007, + "num_input_tokens_seen": 24741400, + "step": 12632 + }, + { + "epoch": 1.6743538767395627, + "grad_norm": 0.23910683393478394, + "learning_rate": 2.047583037715751e-06, + "loss": 0.0012, + "num_input_tokens_seen": 24743104, + "step": 12633 + }, + { + "epoch": 1.6744864148442677, + "grad_norm": 0.7921948432922363, + "learning_rate": 2.047241636280343e-06, + "loss": 0.0059, + "num_input_tokens_seen": 24744648, + "step": 12634 + }, + { + "epoch": 1.6746189529489728, + "grad_norm": 7.8460845947265625, + "learning_rate": 2.0469002435744285e-06, + "loss": 0.2081, + "num_input_tokens_seen": 24746320, + "step": 12635 + }, + { + "epoch": 1.6747514910536778, + "grad_norm": 2.4047162532806396, + "learning_rate": 2.0465588596045896e-06, + "loss": 0.0723, + "num_input_tokens_seen": 24748048, + "step": 12636 + }, + { + "epoch": 1.674884029158383, + "grad_norm": 12.07406234741211, + "learning_rate": 2.0462174843774094e-06, + "loss": 0.1636, + "num_input_tokens_seen": 24750512, + "step": 12637 + }, + { + "epoch": 1.6750165672630881, + "grad_norm": 10.994248390197754, + "learning_rate": 2.0458761178994684e-06, + "loss": 0.248, + "num_input_tokens_seen": 24752584, + "step": 12638 + }, + { + "epoch": 1.6751491053677934, + "grad_norm": 4.051455974578857, + "learning_rate": 2.045534760177349e-06, + "loss": 0.036, + "num_input_tokens_seen": 24754664, + "step": 12639 + }, + { + "epoch": 1.6752816434724984, + "grad_norm": 7.72996187210083, + "learning_rate": 2.0451934112176335e-06, + "loss": 0.0352, + "num_input_tokens_seen": 24756048, + "step": 12640 + }, + { + "epoch": 1.6754141815772035, + "grad_norm": 4.4396162033081055, + "learning_rate": 2.0448520710269026e-06, + "loss": 0.0854, + "num_input_tokens_seen": 24758408, + "step": 12641 + }, + { + "epoch": 1.6755467196819085, + "grad_norm": 0.05243891850113869, + "learning_rate": 2.0445107396117384e-06, + "loss": 0.0003, + "num_input_tokens_seen": 24759536, + "step": 12642 + }, + { + "epoch": 1.6756792577866135, + "grad_norm": 0.19418716430664062, + "learning_rate": 2.044169416978721e-06, + "loss": 0.002, + "num_input_tokens_seen": 24761176, + "step": 12643 + }, + { + "epoch": 1.6758117958913188, + "grad_norm": 12.619674682617188, + "learning_rate": 2.043828103134431e-06, + "loss": 0.1568, + "num_input_tokens_seen": 24763120, + "step": 12644 + }, + { + "epoch": 1.6759443339960238, + "grad_norm": 5.470091342926025, + "learning_rate": 2.04348679808545e-06, + "loss": 0.176, + "num_input_tokens_seen": 24765272, + "step": 12645 + }, + { + "epoch": 1.676076872100729, + "grad_norm": 4.8347625732421875, + "learning_rate": 2.0431455018383595e-06, + "loss": 0.0637, + "num_input_tokens_seen": 24767480, + "step": 12646 + }, + { + "epoch": 1.6762094102054341, + "grad_norm": 1.0146862268447876, + "learning_rate": 2.0428042143997377e-06, + "loss": 0.0056, + "num_input_tokens_seen": 24769416, + "step": 12647 + }, + { + "epoch": 1.6763419483101392, + "grad_norm": 9.917863845825195, + "learning_rate": 2.042462935776167e-06, + "loss": 0.1069, + "num_input_tokens_seen": 24771648, + "step": 12648 + }, + { + "epoch": 1.6764744864148442, + "grad_norm": 14.398921012878418, + "learning_rate": 2.0421216659742256e-06, + "loss": 0.302, + "num_input_tokens_seen": 24774280, + "step": 12649 + }, + { + "epoch": 1.6766070245195492, + "grad_norm": 6.200052261352539, + "learning_rate": 2.0417804050004947e-06, + "loss": 0.1898, + "num_input_tokens_seen": 24775496, + "step": 12650 + }, + { + "epoch": 1.6767395626242545, + "grad_norm": 7.860629558563232, + "learning_rate": 2.0414391528615543e-06, + "loss": 0.2151, + "num_input_tokens_seen": 24778792, + "step": 12651 + }, + { + "epoch": 1.6768721007289595, + "grad_norm": 7.348298072814941, + "learning_rate": 2.041097909563983e-06, + "loss": 0.2003, + "num_input_tokens_seen": 24780336, + "step": 12652 + }, + { + "epoch": 1.6770046388336648, + "grad_norm": 6.182065486907959, + "learning_rate": 2.0407566751143605e-06, + "loss": 0.1058, + "num_input_tokens_seen": 24782248, + "step": 12653 + }, + { + "epoch": 1.6771371769383698, + "grad_norm": 9.230423927307129, + "learning_rate": 2.040415449519266e-06, + "loss": 0.2193, + "num_input_tokens_seen": 24784976, + "step": 12654 + }, + { + "epoch": 1.6772697150430749, + "grad_norm": 10.573271751403809, + "learning_rate": 2.0400742327852786e-06, + "loss": 0.2268, + "num_input_tokens_seen": 24786848, + "step": 12655 + }, + { + "epoch": 1.67740225314778, + "grad_norm": 0.05317999795079231, + "learning_rate": 2.0397330249189775e-06, + "loss": 0.0003, + "num_input_tokens_seen": 24788128, + "step": 12656 + }, + { + "epoch": 1.677534791252485, + "grad_norm": 0.0943717360496521, + "learning_rate": 2.0393918259269414e-06, + "loss": 0.0006, + "num_input_tokens_seen": 24789576, + "step": 12657 + }, + { + "epoch": 1.6776673293571902, + "grad_norm": 8.628497123718262, + "learning_rate": 2.0390506358157493e-06, + "loss": 0.179, + "num_input_tokens_seen": 24791392, + "step": 12658 + }, + { + "epoch": 1.6777998674618952, + "grad_norm": 0.0932103767991066, + "learning_rate": 2.0387094545919776e-06, + "loss": 0.0005, + "num_input_tokens_seen": 24792968, + "step": 12659 + }, + { + "epoch": 1.6779324055666005, + "grad_norm": 10.09632396697998, + "learning_rate": 2.038368282262206e-06, + "loss": 0.0968, + "num_input_tokens_seen": 24794808, + "step": 12660 + }, + { + "epoch": 1.6780649436713055, + "grad_norm": 0.18293797969818115, + "learning_rate": 2.038027118833013e-06, + "loss": 0.0015, + "num_input_tokens_seen": 24796576, + "step": 12661 + }, + { + "epoch": 1.6781974817760106, + "grad_norm": 7.20873498916626, + "learning_rate": 2.0376859643109753e-06, + "loss": 0.2357, + "num_input_tokens_seen": 24798824, + "step": 12662 + }, + { + "epoch": 1.6783300198807156, + "grad_norm": 12.587116241455078, + "learning_rate": 2.0373448187026715e-06, + "loss": 0.1908, + "num_input_tokens_seen": 24799904, + "step": 12663 + }, + { + "epoch": 1.6784625579854207, + "grad_norm": 25.987010955810547, + "learning_rate": 2.037003682014679e-06, + "loss": 0.3322, + "num_input_tokens_seen": 24801768, + "step": 12664 + }, + { + "epoch": 1.678595096090126, + "grad_norm": 0.5166041851043701, + "learning_rate": 2.0366625542535735e-06, + "loss": 0.0031, + "num_input_tokens_seen": 24804696, + "step": 12665 + }, + { + "epoch": 1.6787276341948312, + "grad_norm": 3.3394079208374023, + "learning_rate": 2.0363214354259354e-06, + "loss": 0.0173, + "num_input_tokens_seen": 24805936, + "step": 12666 + }, + { + "epoch": 1.6788601722995362, + "grad_norm": 10.402264595031738, + "learning_rate": 2.0359803255383396e-06, + "loss": 0.0936, + "num_input_tokens_seen": 24808680, + "step": 12667 + }, + { + "epoch": 1.6789927104042413, + "grad_norm": 6.29779577255249, + "learning_rate": 2.035639224597363e-06, + "loss": 0.1491, + "num_input_tokens_seen": 24811680, + "step": 12668 + }, + { + "epoch": 1.6791252485089463, + "grad_norm": 2.9204511642456055, + "learning_rate": 2.0352981326095823e-06, + "loss": 0.0298, + "num_input_tokens_seen": 24813120, + "step": 12669 + }, + { + "epoch": 1.6792577866136513, + "grad_norm": 12.059741020202637, + "learning_rate": 2.0349570495815736e-06, + "loss": 0.2849, + "num_input_tokens_seen": 24816352, + "step": 12670 + }, + { + "epoch": 1.6793903247183564, + "grad_norm": 1.0880364179611206, + "learning_rate": 2.034615975519914e-06, + "loss": 0.0121, + "num_input_tokens_seen": 24818080, + "step": 12671 + }, + { + "epoch": 1.6795228628230616, + "grad_norm": 1.585404396057129, + "learning_rate": 2.0342749104311804e-06, + "loss": 0.0342, + "num_input_tokens_seen": 24821160, + "step": 12672 + }, + { + "epoch": 1.6796554009277669, + "grad_norm": 6.887788772583008, + "learning_rate": 2.033933854321948e-06, + "loss": 0.1198, + "num_input_tokens_seen": 24822936, + "step": 12673 + }, + { + "epoch": 1.679787939032472, + "grad_norm": 11.674654006958008, + "learning_rate": 2.033592807198791e-06, + "loss": 0.4288, + "num_input_tokens_seen": 24825024, + "step": 12674 + }, + { + "epoch": 1.679920477137177, + "grad_norm": 6.087829113006592, + "learning_rate": 2.0332517690682865e-06, + "loss": 0.1138, + "num_input_tokens_seen": 24826968, + "step": 12675 + }, + { + "epoch": 1.680053015241882, + "grad_norm": 3.6697022914886475, + "learning_rate": 2.032910739937011e-06, + "loss": 0.0201, + "num_input_tokens_seen": 24829520, + "step": 12676 + }, + { + "epoch": 1.680185553346587, + "grad_norm": 2.865846633911133, + "learning_rate": 2.032569719811538e-06, + "loss": 0.0543, + "num_input_tokens_seen": 24831992, + "step": 12677 + }, + { + "epoch": 1.6803180914512923, + "grad_norm": 0.08668113499879837, + "learning_rate": 2.0322287086984436e-06, + "loss": 0.0007, + "num_input_tokens_seen": 24834152, + "step": 12678 + }, + { + "epoch": 1.6804506295559973, + "grad_norm": 10.407691955566406, + "learning_rate": 2.031887706604302e-06, + "loss": 0.2593, + "num_input_tokens_seen": 24836208, + "step": 12679 + }, + { + "epoch": 1.6805831676607026, + "grad_norm": 0.05890491232275963, + "learning_rate": 2.031546713535688e-06, + "loss": 0.0007, + "num_input_tokens_seen": 24837576, + "step": 12680 + }, + { + "epoch": 1.6807157057654076, + "grad_norm": 5.605384826660156, + "learning_rate": 2.0312057294991778e-06, + "loss": 0.1024, + "num_input_tokens_seen": 24839760, + "step": 12681 + }, + { + "epoch": 1.6808482438701127, + "grad_norm": 4.910287857055664, + "learning_rate": 2.030864754501344e-06, + "loss": 0.0709, + "num_input_tokens_seen": 24843024, + "step": 12682 + }, + { + "epoch": 1.6809807819748177, + "grad_norm": 1.3881008625030518, + "learning_rate": 2.030523788548761e-06, + "loss": 0.0096, + "num_input_tokens_seen": 24844656, + "step": 12683 + }, + { + "epoch": 1.6811133200795227, + "grad_norm": 8.331984519958496, + "learning_rate": 2.0301828316480036e-06, + "loss": 0.1406, + "num_input_tokens_seen": 24846488, + "step": 12684 + }, + { + "epoch": 1.681245858184228, + "grad_norm": 5.891432762145996, + "learning_rate": 2.0298418838056452e-06, + "loss": 0.1264, + "num_input_tokens_seen": 24848440, + "step": 12685 + }, + { + "epoch": 1.681378396288933, + "grad_norm": 2.178873300552368, + "learning_rate": 2.029500945028259e-06, + "loss": 0.0273, + "num_input_tokens_seen": 24849888, + "step": 12686 + }, + { + "epoch": 1.6815109343936383, + "grad_norm": 0.48465508222579956, + "learning_rate": 2.02916001532242e-06, + "loss": 0.005, + "num_input_tokens_seen": 24851544, + "step": 12687 + }, + { + "epoch": 1.6816434724983433, + "grad_norm": 7.637752056121826, + "learning_rate": 2.0288190946947012e-06, + "loss": 0.1835, + "num_input_tokens_seen": 24853408, + "step": 12688 + }, + { + "epoch": 1.6817760106030484, + "grad_norm": 1.3939573764801025, + "learning_rate": 2.0284781831516743e-06, + "loss": 0.0137, + "num_input_tokens_seen": 24854888, + "step": 12689 + }, + { + "epoch": 1.6819085487077534, + "grad_norm": 0.11237908899784088, + "learning_rate": 2.0281372806999137e-06, + "loss": 0.0008, + "num_input_tokens_seen": 24858272, + "step": 12690 + }, + { + "epoch": 1.6820410868124585, + "grad_norm": 7.96834135055542, + "learning_rate": 2.0277963873459914e-06, + "loss": 0.1319, + "num_input_tokens_seen": 24859968, + "step": 12691 + }, + { + "epoch": 1.6821736249171637, + "grad_norm": 11.280967712402344, + "learning_rate": 2.0274555030964806e-06, + "loss": 0.2455, + "num_input_tokens_seen": 24862744, + "step": 12692 + }, + { + "epoch": 1.6823061630218688, + "grad_norm": 0.0485367625951767, + "learning_rate": 2.0271146279579543e-06, + "loss": 0.0003, + "num_input_tokens_seen": 24864488, + "step": 12693 + }, + { + "epoch": 1.682438701126574, + "grad_norm": 3.847808599472046, + "learning_rate": 2.026773761936984e-06, + "loss": 0.1016, + "num_input_tokens_seen": 24866352, + "step": 12694 + }, + { + "epoch": 1.682571239231279, + "grad_norm": 8.230746269226074, + "learning_rate": 2.0264329050401418e-06, + "loss": 0.1649, + "num_input_tokens_seen": 24869104, + "step": 12695 + }, + { + "epoch": 1.682703777335984, + "grad_norm": 5.045008182525635, + "learning_rate": 2.0260920572739993e-06, + "loss": 0.1183, + "num_input_tokens_seen": 24870480, + "step": 12696 + }, + { + "epoch": 1.6828363154406891, + "grad_norm": 0.29245442152023315, + "learning_rate": 2.02575121864513e-06, + "loss": 0.0043, + "num_input_tokens_seen": 24872616, + "step": 12697 + }, + { + "epoch": 1.6829688535453942, + "grad_norm": 0.054146938025951385, + "learning_rate": 2.0254103891601036e-06, + "loss": 0.0004, + "num_input_tokens_seen": 24873976, + "step": 12698 + }, + { + "epoch": 1.6831013916500994, + "grad_norm": 9.22727108001709, + "learning_rate": 2.0250695688254934e-06, + "loss": 0.0659, + "num_input_tokens_seen": 24875840, + "step": 12699 + }, + { + "epoch": 1.6832339297548045, + "grad_norm": 0.22524474561214447, + "learning_rate": 2.0247287576478683e-06, + "loss": 0.0013, + "num_input_tokens_seen": 24877320, + "step": 12700 + }, + { + "epoch": 1.6833664678595097, + "grad_norm": 0.08207983523607254, + "learning_rate": 2.0243879556338e-06, + "loss": 0.0006, + "num_input_tokens_seen": 24878896, + "step": 12701 + }, + { + "epoch": 1.6834990059642148, + "grad_norm": 5.4703168869018555, + "learning_rate": 2.024047162789862e-06, + "loss": 0.0766, + "num_input_tokens_seen": 24880528, + "step": 12702 + }, + { + "epoch": 1.6836315440689198, + "grad_norm": 8.023066520690918, + "learning_rate": 2.0237063791226217e-06, + "loss": 0.1244, + "num_input_tokens_seen": 24882472, + "step": 12703 + }, + { + "epoch": 1.6837640821736248, + "grad_norm": 2.407456159591675, + "learning_rate": 2.023365604638652e-06, + "loss": 0.0376, + "num_input_tokens_seen": 24884096, + "step": 12704 + }, + { + "epoch": 1.6838966202783299, + "grad_norm": 1.9698694944381714, + "learning_rate": 2.023024839344522e-06, + "loss": 0.0196, + "num_input_tokens_seen": 24885824, + "step": 12705 + }, + { + "epoch": 1.6840291583830351, + "grad_norm": 5.459560394287109, + "learning_rate": 2.0226840832468014e-06, + "loss": 0.1011, + "num_input_tokens_seen": 24887808, + "step": 12706 + }, + { + "epoch": 1.6841616964877404, + "grad_norm": 3.264509439468384, + "learning_rate": 2.022343336352061e-06, + "loss": 0.0386, + "num_input_tokens_seen": 24889064, + "step": 12707 + }, + { + "epoch": 1.6842942345924454, + "grad_norm": 0.021246932446956635, + "learning_rate": 2.0220025986668713e-06, + "loss": 0.0001, + "num_input_tokens_seen": 24890360, + "step": 12708 + }, + { + "epoch": 1.6844267726971505, + "grad_norm": 6.121258735656738, + "learning_rate": 2.021661870197801e-06, + "loss": 0.1554, + "num_input_tokens_seen": 24892232, + "step": 12709 + }, + { + "epoch": 1.6845593108018555, + "grad_norm": 4.035791397094727, + "learning_rate": 2.02132115095142e-06, + "loss": 0.054, + "num_input_tokens_seen": 24893920, + "step": 12710 + }, + { + "epoch": 1.6846918489065605, + "grad_norm": 4.949850082397461, + "learning_rate": 2.0209804409342964e-06, + "loss": 0.0149, + "num_input_tokens_seen": 24895816, + "step": 12711 + }, + { + "epoch": 1.6848243870112656, + "grad_norm": 6.144629955291748, + "learning_rate": 2.0206397401530016e-06, + "loss": 0.1339, + "num_input_tokens_seen": 24897624, + "step": 12712 + }, + { + "epoch": 1.6849569251159708, + "grad_norm": 4.713603973388672, + "learning_rate": 2.0202990486141026e-06, + "loss": 0.0992, + "num_input_tokens_seen": 24899544, + "step": 12713 + }, + { + "epoch": 1.685089463220676, + "grad_norm": 6.870382308959961, + "learning_rate": 2.0199583663241698e-06, + "loss": 0.0739, + "num_input_tokens_seen": 24901504, + "step": 12714 + }, + { + "epoch": 1.6852220013253811, + "grad_norm": 0.4158709943294525, + "learning_rate": 2.0196176932897698e-06, + "loss": 0.0031, + "num_input_tokens_seen": 24902992, + "step": 12715 + }, + { + "epoch": 1.6853545394300862, + "grad_norm": 5.164585113525391, + "learning_rate": 2.019277029517473e-06, + "loss": 0.1671, + "num_input_tokens_seen": 24905624, + "step": 12716 + }, + { + "epoch": 1.6854870775347912, + "grad_norm": 1.1197683811187744, + "learning_rate": 2.0189363750138454e-06, + "loss": 0.0088, + "num_input_tokens_seen": 24908240, + "step": 12717 + }, + { + "epoch": 1.6856196156394962, + "grad_norm": 0.9294412136077881, + "learning_rate": 2.0185957297854567e-06, + "loss": 0.0085, + "num_input_tokens_seen": 24910008, + "step": 12718 + }, + { + "epoch": 1.6857521537442015, + "grad_norm": 11.905794143676758, + "learning_rate": 2.0182550938388754e-06, + "loss": 0.1512, + "num_input_tokens_seen": 24912256, + "step": 12719 + }, + { + "epoch": 1.6858846918489065, + "grad_norm": 3.980170249938965, + "learning_rate": 2.017914467180668e-06, + "loss": 0.063, + "num_input_tokens_seen": 24913640, + "step": 12720 + }, + { + "epoch": 1.6860172299536118, + "grad_norm": 10.411599159240723, + "learning_rate": 2.0175738498174023e-06, + "loss": 0.254, + "num_input_tokens_seen": 24915432, + "step": 12721 + }, + { + "epoch": 1.6861497680583168, + "grad_norm": 3.59291672706604, + "learning_rate": 2.017233241755644e-06, + "loss": 0.0616, + "num_input_tokens_seen": 24918016, + "step": 12722 + }, + { + "epoch": 1.6862823061630219, + "grad_norm": 4.218505382537842, + "learning_rate": 2.0168926430019636e-06, + "loss": 0.0696, + "num_input_tokens_seen": 24920576, + "step": 12723 + }, + { + "epoch": 1.686414844267727, + "grad_norm": 5.425989151000977, + "learning_rate": 2.0165520535629257e-06, + "loss": 0.0682, + "num_input_tokens_seen": 24922336, + "step": 12724 + }, + { + "epoch": 1.686547382372432, + "grad_norm": 5.614704132080078, + "learning_rate": 2.0162114734450983e-06, + "loss": 0.0871, + "num_input_tokens_seen": 24923568, + "step": 12725 + }, + { + "epoch": 1.6866799204771372, + "grad_norm": 5.545320510864258, + "learning_rate": 2.015870902655047e-06, + "loss": 0.034, + "num_input_tokens_seen": 24924760, + "step": 12726 + }, + { + "epoch": 1.6868124585818423, + "grad_norm": 17.401098251342773, + "learning_rate": 2.015530341199338e-06, + "loss": 0.3591, + "num_input_tokens_seen": 24926576, + "step": 12727 + }, + { + "epoch": 1.6869449966865475, + "grad_norm": 4.952393531799316, + "learning_rate": 2.015189789084539e-06, + "loss": 0.0959, + "num_input_tokens_seen": 24929000, + "step": 12728 + }, + { + "epoch": 1.6870775347912526, + "grad_norm": 0.03356529772281647, + "learning_rate": 2.014849246317216e-06, + "loss": 0.0002, + "num_input_tokens_seen": 24930520, + "step": 12729 + }, + { + "epoch": 1.6872100728959576, + "grad_norm": 11.710488319396973, + "learning_rate": 2.014508712903933e-06, + "loss": 0.2033, + "num_input_tokens_seen": 24932792, + "step": 12730 + }, + { + "epoch": 1.6873426110006626, + "grad_norm": 6.032309532165527, + "learning_rate": 2.0141681888512575e-06, + "loss": 0.0901, + "num_input_tokens_seen": 24934104, + "step": 12731 + }, + { + "epoch": 1.6874751491053677, + "grad_norm": 0.9769488573074341, + "learning_rate": 2.0138276741657537e-06, + "loss": 0.005, + "num_input_tokens_seen": 24935912, + "step": 12732 + }, + { + "epoch": 1.687607687210073, + "grad_norm": 0.061998847872018814, + "learning_rate": 2.0134871688539883e-06, + "loss": 0.0004, + "num_input_tokens_seen": 24937304, + "step": 12733 + }, + { + "epoch": 1.687740225314778, + "grad_norm": 9.756516456604004, + "learning_rate": 2.013146672922526e-06, + "loss": 0.0748, + "num_input_tokens_seen": 24940384, + "step": 12734 + }, + { + "epoch": 1.6878727634194832, + "grad_norm": 0.14450809359550476, + "learning_rate": 2.0128061863779324e-06, + "loss": 0.0009, + "num_input_tokens_seen": 24942872, + "step": 12735 + }, + { + "epoch": 1.6880053015241883, + "grad_norm": 9.100576400756836, + "learning_rate": 2.012465709226771e-06, + "loss": 0.1034, + "num_input_tokens_seen": 24944784, + "step": 12736 + }, + { + "epoch": 1.6881378396288933, + "grad_norm": 7.0501909255981445, + "learning_rate": 2.0121252414756058e-06, + "loss": 0.0641, + "num_input_tokens_seen": 24946848, + "step": 12737 + }, + { + "epoch": 1.6882703777335983, + "grad_norm": 11.855430603027344, + "learning_rate": 2.011784783131004e-06, + "loss": 0.394, + "num_input_tokens_seen": 24949040, + "step": 12738 + }, + { + "epoch": 1.6884029158383034, + "grad_norm": 3.8623764514923096, + "learning_rate": 2.0114443341995284e-06, + "loss": 0.0164, + "num_input_tokens_seen": 24951240, + "step": 12739 + }, + { + "epoch": 1.6885354539430086, + "grad_norm": 0.20355388522148132, + "learning_rate": 2.011103894687743e-06, + "loss": 0.0008, + "num_input_tokens_seen": 24952456, + "step": 12740 + }, + { + "epoch": 1.6886679920477137, + "grad_norm": 0.05739414691925049, + "learning_rate": 2.010763464602211e-06, + "loss": 0.0003, + "num_input_tokens_seen": 24955200, + "step": 12741 + }, + { + "epoch": 1.688800530152419, + "grad_norm": 0.08536305278539658, + "learning_rate": 2.0104230439494978e-06, + "loss": 0.0006, + "num_input_tokens_seen": 24957136, + "step": 12742 + }, + { + "epoch": 1.688933068257124, + "grad_norm": 11.392602920532227, + "learning_rate": 2.010082632736165e-06, + "loss": 0.1258, + "num_input_tokens_seen": 24959848, + "step": 12743 + }, + { + "epoch": 1.689065606361829, + "grad_norm": 1.4476748704910278, + "learning_rate": 2.009742230968778e-06, + "loss": 0.0296, + "num_input_tokens_seen": 24961144, + "step": 12744 + }, + { + "epoch": 1.689198144466534, + "grad_norm": 5.229588985443115, + "learning_rate": 2.0094018386538987e-06, + "loss": 0.2428, + "num_input_tokens_seen": 24963456, + "step": 12745 + }, + { + "epoch": 1.689330682571239, + "grad_norm": 5.4506001472473145, + "learning_rate": 2.009061455798091e-06, + "loss": 0.1177, + "num_input_tokens_seen": 24965672, + "step": 12746 + }, + { + "epoch": 1.6894632206759443, + "grad_norm": 7.401181221008301, + "learning_rate": 2.008721082407916e-06, + "loss": 0.155, + "num_input_tokens_seen": 24967712, + "step": 12747 + }, + { + "epoch": 1.6895957587806494, + "grad_norm": 4.8652143478393555, + "learning_rate": 2.0083807184899372e-06, + "loss": 0.0251, + "num_input_tokens_seen": 24970328, + "step": 12748 + }, + { + "epoch": 1.6897282968853546, + "grad_norm": 1.2546063661575317, + "learning_rate": 2.0080403640507186e-06, + "loss": 0.0084, + "num_input_tokens_seen": 24971488, + "step": 12749 + }, + { + "epoch": 1.6898608349900597, + "grad_norm": 7.0598273277282715, + "learning_rate": 2.007700019096821e-06, + "loss": 0.0904, + "num_input_tokens_seen": 24973528, + "step": 12750 + }, + { + "epoch": 1.6899933730947647, + "grad_norm": 0.2379119098186493, + "learning_rate": 2.0073596836348063e-06, + "loss": 0.0011, + "num_input_tokens_seen": 24975168, + "step": 12751 + }, + { + "epoch": 1.6901259111994698, + "grad_norm": 7.3022589683532715, + "learning_rate": 2.007019357671237e-06, + "loss": 0.1053, + "num_input_tokens_seen": 24976704, + "step": 12752 + }, + { + "epoch": 1.6902584493041748, + "grad_norm": 11.376944541931152, + "learning_rate": 2.0066790412126733e-06, + "loss": 0.3253, + "num_input_tokens_seen": 24978536, + "step": 12753 + }, + { + "epoch": 1.69039098740888, + "grad_norm": 0.017835965380072594, + "learning_rate": 2.0063387342656794e-06, + "loss": 0.0001, + "num_input_tokens_seen": 24979792, + "step": 12754 + }, + { + "epoch": 1.6905235255135853, + "grad_norm": 8.806415557861328, + "learning_rate": 2.005998436836815e-06, + "loss": 0.1652, + "num_input_tokens_seen": 24981584, + "step": 12755 + }, + { + "epoch": 1.6906560636182904, + "grad_norm": 9.737452507019043, + "learning_rate": 2.0056581489326412e-06, + "loss": 0.4253, + "num_input_tokens_seen": 24983016, + "step": 12756 + }, + { + "epoch": 1.6907886017229954, + "grad_norm": 4.309791564941406, + "learning_rate": 2.00531787055972e-06, + "loss": 0.072, + "num_input_tokens_seen": 24985824, + "step": 12757 + }, + { + "epoch": 1.6909211398277004, + "grad_norm": 1.5981711149215698, + "learning_rate": 2.0049776017246102e-06, + "loss": 0.018, + "num_input_tokens_seen": 24988160, + "step": 12758 + }, + { + "epoch": 1.6910536779324055, + "grad_norm": 14.912550926208496, + "learning_rate": 2.004637342433875e-06, + "loss": 0.2624, + "num_input_tokens_seen": 24990312, + "step": 12759 + }, + { + "epoch": 1.6911862160371107, + "grad_norm": 9.21568775177002, + "learning_rate": 2.004297092694073e-06, + "loss": 0.2297, + "num_input_tokens_seen": 24992688, + "step": 12760 + }, + { + "epoch": 1.6913187541418158, + "grad_norm": 0.053895000368356705, + "learning_rate": 2.0039568525117654e-06, + "loss": 0.0004, + "num_input_tokens_seen": 24994648, + "step": 12761 + }, + { + "epoch": 1.691451292246521, + "grad_norm": 0.08793166279792786, + "learning_rate": 2.003616621893512e-06, + "loss": 0.0008, + "num_input_tokens_seen": 24996784, + "step": 12762 + }, + { + "epoch": 1.691583830351226, + "grad_norm": 1.3851509094238281, + "learning_rate": 2.003276400845871e-06, + "loss": 0.0131, + "num_input_tokens_seen": 24998768, + "step": 12763 + }, + { + "epoch": 1.691716368455931, + "grad_norm": 6.7313432693481445, + "learning_rate": 2.0029361893754056e-06, + "loss": 0.1055, + "num_input_tokens_seen": 25000112, + "step": 12764 + }, + { + "epoch": 1.6918489065606361, + "grad_norm": 2.242631435394287, + "learning_rate": 2.002595987488673e-06, + "loss": 0.044, + "num_input_tokens_seen": 25001736, + "step": 12765 + }, + { + "epoch": 1.6919814446653412, + "grad_norm": 6.956461429595947, + "learning_rate": 2.0022557951922326e-06, + "loss": 0.099, + "num_input_tokens_seen": 25003088, + "step": 12766 + }, + { + "epoch": 1.6921139827700464, + "grad_norm": 0.08241207897663116, + "learning_rate": 2.001915612492644e-06, + "loss": 0.0006, + "num_input_tokens_seen": 25005336, + "step": 12767 + }, + { + "epoch": 1.6922465208747515, + "grad_norm": 0.11226195842027664, + "learning_rate": 2.001575439396465e-06, + "loss": 0.0007, + "num_input_tokens_seen": 25007080, + "step": 12768 + }, + { + "epoch": 1.6923790589794567, + "grad_norm": 7.938633441925049, + "learning_rate": 2.001235275910256e-06, + "loss": 0.2111, + "num_input_tokens_seen": 25009464, + "step": 12769 + }, + { + "epoch": 1.6925115970841618, + "grad_norm": 4.200077056884766, + "learning_rate": 2.0008951220405755e-06, + "loss": 0.0538, + "num_input_tokens_seen": 25011432, + "step": 12770 + }, + { + "epoch": 1.6926441351888668, + "grad_norm": 10.83367919921875, + "learning_rate": 2.0005549777939807e-06, + "loss": 0.1743, + "num_input_tokens_seen": 25013352, + "step": 12771 + }, + { + "epoch": 1.6927766732935718, + "grad_norm": 7.722321510314941, + "learning_rate": 2.0002148431770317e-06, + "loss": 0.0706, + "num_input_tokens_seen": 25015480, + "step": 12772 + }, + { + "epoch": 1.6929092113982769, + "grad_norm": 2.502812147140503, + "learning_rate": 1.9998747181962847e-06, + "loss": 0.0104, + "num_input_tokens_seen": 25017264, + "step": 12773 + }, + { + "epoch": 1.6930417495029821, + "grad_norm": 7.739894390106201, + "learning_rate": 1.9995346028582972e-06, + "loss": 0.1983, + "num_input_tokens_seen": 25019384, + "step": 12774 + }, + { + "epoch": 1.6931742876076872, + "grad_norm": 7.775674343109131, + "learning_rate": 1.999194497169628e-06, + "loss": 0.1409, + "num_input_tokens_seen": 25021288, + "step": 12775 + }, + { + "epoch": 1.6933068257123924, + "grad_norm": 0.46742701530456543, + "learning_rate": 1.9988544011368353e-06, + "loss": 0.0029, + "num_input_tokens_seen": 25023040, + "step": 12776 + }, + { + "epoch": 1.6934393638170975, + "grad_norm": 0.05308287590742111, + "learning_rate": 1.9985143147664747e-06, + "loss": 0.0004, + "num_input_tokens_seen": 25024832, + "step": 12777 + }, + { + "epoch": 1.6935719019218025, + "grad_norm": 1.8296760320663452, + "learning_rate": 1.998174238065105e-06, + "loss": 0.0253, + "num_input_tokens_seen": 25027320, + "step": 12778 + }, + { + "epoch": 1.6937044400265076, + "grad_norm": 2.3903703689575195, + "learning_rate": 1.997834171039281e-06, + "loss": 0.0252, + "num_input_tokens_seen": 25030016, + "step": 12779 + }, + { + "epoch": 1.6938369781312126, + "grad_norm": 5.6096882820129395, + "learning_rate": 1.9974941136955615e-06, + "loss": 0.0784, + "num_input_tokens_seen": 25031920, + "step": 12780 + }, + { + "epoch": 1.6939695162359178, + "grad_norm": 0.030021576210856438, + "learning_rate": 1.9971540660405015e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25033440, + "step": 12781 + }, + { + "epoch": 1.6941020543406229, + "grad_norm": 14.06274127960205, + "learning_rate": 1.9968140280806584e-06, + "loss": 0.0861, + "num_input_tokens_seen": 25034912, + "step": 12782 + }, + { + "epoch": 1.6942345924453281, + "grad_norm": 1.1634327173233032, + "learning_rate": 1.996473999822588e-06, + "loss": 0.0056, + "num_input_tokens_seen": 25036416, + "step": 12783 + }, + { + "epoch": 1.6943671305500332, + "grad_norm": 5.7439656257629395, + "learning_rate": 1.9961339812728456e-06, + "loss": 0.1512, + "num_input_tokens_seen": 25038952, + "step": 12784 + }, + { + "epoch": 1.6944996686547382, + "grad_norm": 4.169588088989258, + "learning_rate": 1.9957939724379886e-06, + "loss": 0.1342, + "num_input_tokens_seen": 25040208, + "step": 12785 + }, + { + "epoch": 1.6946322067594433, + "grad_norm": 3.202334403991699, + "learning_rate": 1.9954539733245707e-06, + "loss": 0.0534, + "num_input_tokens_seen": 25041784, + "step": 12786 + }, + { + "epoch": 1.6947647448641483, + "grad_norm": 0.019546814262866974, + "learning_rate": 1.9951139839391497e-06, + "loss": 0.0001, + "num_input_tokens_seen": 25042800, + "step": 12787 + }, + { + "epoch": 1.6948972829688536, + "grad_norm": 1.1785850524902344, + "learning_rate": 1.9947740042882784e-06, + "loss": 0.0104, + "num_input_tokens_seen": 25046264, + "step": 12788 + }, + { + "epoch": 1.6950298210735586, + "grad_norm": 14.328036308288574, + "learning_rate": 1.9944340343785123e-06, + "loss": 0.2653, + "num_input_tokens_seen": 25049488, + "step": 12789 + }, + { + "epoch": 1.6951623591782639, + "grad_norm": 3.213524103164673, + "learning_rate": 1.994094074216407e-06, + "loss": 0.0259, + "num_input_tokens_seen": 25051672, + "step": 12790 + }, + { + "epoch": 1.695294897282969, + "grad_norm": 6.974457263946533, + "learning_rate": 1.993754123808518e-06, + "loss": 0.1932, + "num_input_tokens_seen": 25053832, + "step": 12791 + }, + { + "epoch": 1.695427435387674, + "grad_norm": 6.188643932342529, + "learning_rate": 1.993414183161398e-06, + "loss": 0.0151, + "num_input_tokens_seen": 25055656, + "step": 12792 + }, + { + "epoch": 1.695559973492379, + "grad_norm": 0.16637730598449707, + "learning_rate": 1.9930742522816027e-06, + "loss": 0.0009, + "num_input_tokens_seen": 25057560, + "step": 12793 + }, + { + "epoch": 1.695692511597084, + "grad_norm": 1.8586370944976807, + "learning_rate": 1.992734331175684e-06, + "loss": 0.0094, + "num_input_tokens_seen": 25059288, + "step": 12794 + }, + { + "epoch": 1.6958250497017893, + "grad_norm": 4.737232208251953, + "learning_rate": 1.9923944198501983e-06, + "loss": 0.1042, + "num_input_tokens_seen": 25061760, + "step": 12795 + }, + { + "epoch": 1.6959575878064945, + "grad_norm": 4.856111526489258, + "learning_rate": 1.9920545183116983e-06, + "loss": 0.0417, + "num_input_tokens_seen": 25063048, + "step": 12796 + }, + { + "epoch": 1.6960901259111996, + "grad_norm": 11.59985065460205, + "learning_rate": 1.9917146265667386e-06, + "loss": 0.3648, + "num_input_tokens_seen": 25065752, + "step": 12797 + }, + { + "epoch": 1.6962226640159046, + "grad_norm": 0.09558551013469696, + "learning_rate": 1.99137474462187e-06, + "loss": 0.0007, + "num_input_tokens_seen": 25066936, + "step": 12798 + }, + { + "epoch": 1.6963552021206096, + "grad_norm": 0.672170877456665, + "learning_rate": 1.9910348724836486e-06, + "loss": 0.0025, + "num_input_tokens_seen": 25069536, + "step": 12799 + }, + { + "epoch": 1.6964877402253147, + "grad_norm": 6.387729644775391, + "learning_rate": 1.990695010158624e-06, + "loss": 0.0369, + "num_input_tokens_seen": 25071112, + "step": 12800 + }, + { + "epoch": 1.6966202783300197, + "grad_norm": 5.8466410636901855, + "learning_rate": 1.9903551576533524e-06, + "loss": 0.1702, + "num_input_tokens_seen": 25073608, + "step": 12801 + }, + { + "epoch": 1.696752816434725, + "grad_norm": 6.5278000831604, + "learning_rate": 1.990015314974385e-06, + "loss": 0.129, + "num_input_tokens_seen": 25076048, + "step": 12802 + }, + { + "epoch": 1.6968853545394302, + "grad_norm": 3.4784560203552246, + "learning_rate": 1.9896754821282742e-06, + "loss": 0.0971, + "num_input_tokens_seen": 25077384, + "step": 12803 + }, + { + "epoch": 1.6970178926441353, + "grad_norm": 2.5664899349212646, + "learning_rate": 1.9893356591215717e-06, + "loss": 0.0146, + "num_input_tokens_seen": 25079072, + "step": 12804 + }, + { + "epoch": 1.6971504307488403, + "grad_norm": 8.223052978515625, + "learning_rate": 1.9889958459608293e-06, + "loss": 0.2813, + "num_input_tokens_seen": 25080936, + "step": 12805 + }, + { + "epoch": 1.6972829688535453, + "grad_norm": 5.118310451507568, + "learning_rate": 1.9886560426526005e-06, + "loss": 0.1041, + "num_input_tokens_seen": 25083088, + "step": 12806 + }, + { + "epoch": 1.6974155069582504, + "grad_norm": 5.681372165679932, + "learning_rate": 1.9883162492034358e-06, + "loss": 0.102, + "num_input_tokens_seen": 25084576, + "step": 12807 + }, + { + "epoch": 1.6975480450629556, + "grad_norm": 0.0850224569439888, + "learning_rate": 1.987976465619887e-06, + "loss": 0.0006, + "num_input_tokens_seen": 25086408, + "step": 12808 + }, + { + "epoch": 1.6976805831676607, + "grad_norm": 2.82230281829834, + "learning_rate": 1.9876366919085047e-06, + "loss": 0.0387, + "num_input_tokens_seen": 25088248, + "step": 12809 + }, + { + "epoch": 1.697813121272366, + "grad_norm": 8.225298881530762, + "learning_rate": 1.98729692807584e-06, + "loss": 0.2317, + "num_input_tokens_seen": 25090704, + "step": 12810 + }, + { + "epoch": 1.697945659377071, + "grad_norm": 6.414100646972656, + "learning_rate": 1.986957174128445e-06, + "loss": 0.0926, + "num_input_tokens_seen": 25093136, + "step": 12811 + }, + { + "epoch": 1.698078197481776, + "grad_norm": 11.341705322265625, + "learning_rate": 1.9866174300728703e-06, + "loss": 0.176, + "num_input_tokens_seen": 25094864, + "step": 12812 + }, + { + "epoch": 1.698210735586481, + "grad_norm": 0.06421298533678055, + "learning_rate": 1.9862776959156647e-06, + "loss": 0.0004, + "num_input_tokens_seen": 25096928, + "step": 12813 + }, + { + "epoch": 1.698343273691186, + "grad_norm": 0.14488598704338074, + "learning_rate": 1.9859379716633802e-06, + "loss": 0.0009, + "num_input_tokens_seen": 25098656, + "step": 12814 + }, + { + "epoch": 1.6984758117958914, + "grad_norm": 0.122245192527771, + "learning_rate": 1.985598257322565e-06, + "loss": 0.0008, + "num_input_tokens_seen": 25101192, + "step": 12815 + }, + { + "epoch": 1.6986083499005964, + "grad_norm": 0.08149103820323944, + "learning_rate": 1.9852585528997708e-06, + "loss": 0.0005, + "num_input_tokens_seen": 25102432, + "step": 12816 + }, + { + "epoch": 1.6987408880053017, + "grad_norm": 0.32439738512039185, + "learning_rate": 1.9849188584015472e-06, + "loss": 0.002, + "num_input_tokens_seen": 25104568, + "step": 12817 + }, + { + "epoch": 1.6988734261100067, + "grad_norm": 7.605825901031494, + "learning_rate": 1.9845791738344437e-06, + "loss": 0.1147, + "num_input_tokens_seen": 25107384, + "step": 12818 + }, + { + "epoch": 1.6990059642147117, + "grad_norm": 18.420122146606445, + "learning_rate": 1.9842394992050085e-06, + "loss": 0.3156, + "num_input_tokens_seen": 25108872, + "step": 12819 + }, + { + "epoch": 1.6991385023194168, + "grad_norm": 0.06565165519714355, + "learning_rate": 1.9838998345197913e-06, + "loss": 0.0005, + "num_input_tokens_seen": 25110680, + "step": 12820 + }, + { + "epoch": 1.6992710404241218, + "grad_norm": 4.081592559814453, + "learning_rate": 1.9835601797853423e-06, + "loss": 0.1525, + "num_input_tokens_seen": 25112800, + "step": 12821 + }, + { + "epoch": 1.699403578528827, + "grad_norm": 7.03947639465332, + "learning_rate": 1.9832205350082086e-06, + "loss": 0.0638, + "num_input_tokens_seen": 25114688, + "step": 12822 + }, + { + "epoch": 1.699536116633532, + "grad_norm": 0.05665195733308792, + "learning_rate": 1.9828809001949403e-06, + "loss": 0.0007, + "num_input_tokens_seen": 25116784, + "step": 12823 + }, + { + "epoch": 1.6996686547382374, + "grad_norm": 7.985161781311035, + "learning_rate": 1.982541275352084e-06, + "loss": 0.1382, + "num_input_tokens_seen": 25119056, + "step": 12824 + }, + { + "epoch": 1.6998011928429424, + "grad_norm": 2.7630252838134766, + "learning_rate": 1.982201660486189e-06, + "loss": 0.054, + "num_input_tokens_seen": 25120984, + "step": 12825 + }, + { + "epoch": 1.6999337309476474, + "grad_norm": 6.4977240562438965, + "learning_rate": 1.9818620556038038e-06, + "loss": 0.1314, + "num_input_tokens_seen": 25123136, + "step": 12826 + }, + { + "epoch": 1.7000662690523525, + "grad_norm": 5.230261325836182, + "learning_rate": 1.9815224607114756e-06, + "loss": 0.1442, + "num_input_tokens_seen": 25125728, + "step": 12827 + }, + { + "epoch": 1.7001988071570575, + "grad_norm": 3.487452268600464, + "learning_rate": 1.9811828758157516e-06, + "loss": 0.0159, + "num_input_tokens_seen": 25127704, + "step": 12828 + }, + { + "epoch": 1.7003313452617628, + "grad_norm": 0.13720905780792236, + "learning_rate": 1.9808433009231805e-06, + "loss": 0.0009, + "num_input_tokens_seen": 25130696, + "step": 12829 + }, + { + "epoch": 1.7004638833664678, + "grad_norm": 12.22507381439209, + "learning_rate": 1.9805037360403077e-06, + "loss": 0.2611, + "num_input_tokens_seen": 25133464, + "step": 12830 + }, + { + "epoch": 1.700596421471173, + "grad_norm": 6.1097822189331055, + "learning_rate": 1.9801641811736812e-06, + "loss": 0.0701, + "num_input_tokens_seen": 25135320, + "step": 12831 + }, + { + "epoch": 1.700728959575878, + "grad_norm": 14.340020179748535, + "learning_rate": 1.9798246363298483e-06, + "loss": 0.3184, + "num_input_tokens_seen": 25137288, + "step": 12832 + }, + { + "epoch": 1.7008614976805831, + "grad_norm": 2.389646530151367, + "learning_rate": 1.979485101515356e-06, + "loss": 0.0079, + "num_input_tokens_seen": 25139168, + "step": 12833 + }, + { + "epoch": 1.7009940357852882, + "grad_norm": 7.921056747436523, + "learning_rate": 1.979145576736749e-06, + "loss": 0.174, + "num_input_tokens_seen": 25141184, + "step": 12834 + }, + { + "epoch": 1.7011265738899932, + "grad_norm": 5.061671257019043, + "learning_rate": 1.9788060620005753e-06, + "loss": 0.0873, + "num_input_tokens_seen": 25143912, + "step": 12835 + }, + { + "epoch": 1.7012591119946985, + "grad_norm": 0.09500450640916824, + "learning_rate": 1.9784665573133792e-06, + "loss": 0.0007, + "num_input_tokens_seen": 25145664, + "step": 12836 + }, + { + "epoch": 1.7013916500994037, + "grad_norm": 0.07005027681589127, + "learning_rate": 1.978127062681708e-06, + "loss": 0.0005, + "num_input_tokens_seen": 25147416, + "step": 12837 + }, + { + "epoch": 1.7015241882041088, + "grad_norm": 5.714458465576172, + "learning_rate": 1.977787578112108e-06, + "loss": 0.1814, + "num_input_tokens_seen": 25149664, + "step": 12838 + }, + { + "epoch": 1.7016567263088138, + "grad_norm": 0.32779788970947266, + "learning_rate": 1.977448103611123e-06, + "loss": 0.002, + "num_input_tokens_seen": 25151080, + "step": 12839 + }, + { + "epoch": 1.7017892644135189, + "grad_norm": 3.521857738494873, + "learning_rate": 1.9771086391852997e-06, + "loss": 0.142, + "num_input_tokens_seen": 25153248, + "step": 12840 + }, + { + "epoch": 1.701921802518224, + "grad_norm": 2.8645503520965576, + "learning_rate": 1.976769184841181e-06, + "loss": 0.0404, + "num_input_tokens_seen": 25155544, + "step": 12841 + }, + { + "epoch": 1.702054340622929, + "grad_norm": 8.962677001953125, + "learning_rate": 1.976429740585315e-06, + "loss": 0.326, + "num_input_tokens_seen": 25158640, + "step": 12842 + }, + { + "epoch": 1.7021868787276342, + "grad_norm": 7.273736000061035, + "learning_rate": 1.9760903064242443e-06, + "loss": 0.1915, + "num_input_tokens_seen": 25160368, + "step": 12843 + }, + { + "epoch": 1.7023194168323394, + "grad_norm": 10.350948333740234, + "learning_rate": 1.975750882364514e-06, + "loss": 0.1862, + "num_input_tokens_seen": 25162264, + "step": 12844 + }, + { + "epoch": 1.7024519549370445, + "grad_norm": 4.000223159790039, + "learning_rate": 1.975411468412668e-06, + "loss": 0.0811, + "num_input_tokens_seen": 25164552, + "step": 12845 + }, + { + "epoch": 1.7025844930417495, + "grad_norm": 0.11374349892139435, + "learning_rate": 1.9750720645752504e-06, + "loss": 0.0008, + "num_input_tokens_seen": 25166344, + "step": 12846 + }, + { + "epoch": 1.7027170311464546, + "grad_norm": 3.1200685501098633, + "learning_rate": 1.9747326708588068e-06, + "loss": 0.031, + "num_input_tokens_seen": 25169040, + "step": 12847 + }, + { + "epoch": 1.7028495692511596, + "grad_norm": 0.2940390408039093, + "learning_rate": 1.9743932872698782e-06, + "loss": 0.0029, + "num_input_tokens_seen": 25170896, + "step": 12848 + }, + { + "epoch": 1.7029821073558649, + "grad_norm": 6.407944202423096, + "learning_rate": 1.974053913815011e-06, + "loss": 0.0864, + "num_input_tokens_seen": 25173216, + "step": 12849 + }, + { + "epoch": 1.70311464546057, + "grad_norm": 0.043770596385002136, + "learning_rate": 1.973714550500747e-06, + "loss": 0.0004, + "num_input_tokens_seen": 25174736, + "step": 12850 + }, + { + "epoch": 1.7032471835652752, + "grad_norm": 4.929108142852783, + "learning_rate": 1.9733751973336286e-06, + "loss": 0.2188, + "num_input_tokens_seen": 25176568, + "step": 12851 + }, + { + "epoch": 1.7033797216699802, + "grad_norm": 0.05548323318362236, + "learning_rate": 1.9730358543202e-06, + "loss": 0.0004, + "num_input_tokens_seen": 25177736, + "step": 12852 + }, + { + "epoch": 1.7035122597746852, + "grad_norm": 7.9768147468566895, + "learning_rate": 1.972696521467004e-06, + "loss": 0.088, + "num_input_tokens_seen": 25179480, + "step": 12853 + }, + { + "epoch": 1.7036447978793903, + "grad_norm": 7.188138484954834, + "learning_rate": 1.9723571987805827e-06, + "loss": 0.1615, + "num_input_tokens_seen": 25180896, + "step": 12854 + }, + { + "epoch": 1.7037773359840953, + "grad_norm": 0.2759703993797302, + "learning_rate": 1.972017886267479e-06, + "loss": 0.0015, + "num_input_tokens_seen": 25183240, + "step": 12855 + }, + { + "epoch": 1.7039098740888006, + "grad_norm": 4.041205883026123, + "learning_rate": 1.9716785839342346e-06, + "loss": 0.0559, + "num_input_tokens_seen": 25185760, + "step": 12856 + }, + { + "epoch": 1.7040424121935056, + "grad_norm": 2.1722493171691895, + "learning_rate": 1.9713392917873904e-06, + "loss": 0.0279, + "num_input_tokens_seen": 25188016, + "step": 12857 + }, + { + "epoch": 1.7041749502982109, + "grad_norm": 3.214900255203247, + "learning_rate": 1.9710000098334904e-06, + "loss": 0.0429, + "num_input_tokens_seen": 25189560, + "step": 12858 + }, + { + "epoch": 1.704307488402916, + "grad_norm": 4.12236213684082, + "learning_rate": 1.9706607380790753e-06, + "loss": 0.015, + "num_input_tokens_seen": 25191360, + "step": 12859 + }, + { + "epoch": 1.704440026507621, + "grad_norm": 4.766012191772461, + "learning_rate": 1.9703214765306856e-06, + "loss": 0.0578, + "num_input_tokens_seen": 25192976, + "step": 12860 + }, + { + "epoch": 1.704572564612326, + "grad_norm": 4.3406782150268555, + "learning_rate": 1.969982225194864e-06, + "loss": 0.044, + "num_input_tokens_seen": 25195176, + "step": 12861 + }, + { + "epoch": 1.704705102717031, + "grad_norm": 2.634781837463379, + "learning_rate": 1.96964298407815e-06, + "loss": 0.0144, + "num_input_tokens_seen": 25196568, + "step": 12862 + }, + { + "epoch": 1.7048376408217363, + "grad_norm": 3.550370454788208, + "learning_rate": 1.969303753187085e-06, + "loss": 0.0241, + "num_input_tokens_seen": 25197896, + "step": 12863 + }, + { + "epoch": 1.7049701789264413, + "grad_norm": 0.29504385590553284, + "learning_rate": 1.968964532528211e-06, + "loss": 0.0021, + "num_input_tokens_seen": 25199928, + "step": 12864 + }, + { + "epoch": 1.7051027170311466, + "grad_norm": 7.524374485015869, + "learning_rate": 1.968625322108067e-06, + "loss": 0.1624, + "num_input_tokens_seen": 25202344, + "step": 12865 + }, + { + "epoch": 1.7052352551358516, + "grad_norm": 1.0269582271575928, + "learning_rate": 1.968286121933192e-06, + "loss": 0.0111, + "num_input_tokens_seen": 25203776, + "step": 12866 + }, + { + "epoch": 1.7053677932405567, + "grad_norm": 2.7155814170837402, + "learning_rate": 1.967946932010128e-06, + "loss": 0.0263, + "num_input_tokens_seen": 25205744, + "step": 12867 + }, + { + "epoch": 1.7055003313452617, + "grad_norm": 0.11605389416217804, + "learning_rate": 1.9676077523454147e-06, + "loss": 0.0008, + "num_input_tokens_seen": 25207528, + "step": 12868 + }, + { + "epoch": 1.7056328694499667, + "grad_norm": 4.656682014465332, + "learning_rate": 1.9672685829455907e-06, + "loss": 0.0593, + "num_input_tokens_seen": 25209328, + "step": 12869 + }, + { + "epoch": 1.705765407554672, + "grad_norm": 8.918543815612793, + "learning_rate": 1.9669294238171965e-06, + "loss": 0.321, + "num_input_tokens_seen": 25211432, + "step": 12870 + }, + { + "epoch": 1.705897945659377, + "grad_norm": 6.532780170440674, + "learning_rate": 1.966590274966771e-06, + "loss": 0.1576, + "num_input_tokens_seen": 25212968, + "step": 12871 + }, + { + "epoch": 1.7060304837640823, + "grad_norm": 7.527016639709473, + "learning_rate": 1.9662511364008515e-06, + "loss": 0.1332, + "num_input_tokens_seen": 25215680, + "step": 12872 + }, + { + "epoch": 1.7061630218687873, + "grad_norm": 10.352439880371094, + "learning_rate": 1.965912008125979e-06, + "loss": 0.1764, + "num_input_tokens_seen": 25216768, + "step": 12873 + }, + { + "epoch": 1.7062955599734924, + "grad_norm": 8.730112075805664, + "learning_rate": 1.965572890148692e-06, + "loss": 0.2169, + "num_input_tokens_seen": 25218848, + "step": 12874 + }, + { + "epoch": 1.7064280980781974, + "grad_norm": 0.13366344571113586, + "learning_rate": 1.965233782475528e-06, + "loss": 0.001, + "num_input_tokens_seen": 25220408, + "step": 12875 + }, + { + "epoch": 1.7065606361829024, + "grad_norm": 3.3352928161621094, + "learning_rate": 1.9648946851130257e-06, + "loss": 0.0754, + "num_input_tokens_seen": 25223032, + "step": 12876 + }, + { + "epoch": 1.7066931742876077, + "grad_norm": 3.0042777061462402, + "learning_rate": 1.9645555980677223e-06, + "loss": 0.0265, + "num_input_tokens_seen": 25225336, + "step": 12877 + }, + { + "epoch": 1.706825712392313, + "grad_norm": 2.6346731185913086, + "learning_rate": 1.964216521346157e-06, + "loss": 0.0167, + "num_input_tokens_seen": 25227080, + "step": 12878 + }, + { + "epoch": 1.706958250497018, + "grad_norm": 0.4826218783855438, + "learning_rate": 1.9638774549548667e-06, + "loss": 0.0024, + "num_input_tokens_seen": 25228352, + "step": 12879 + }, + { + "epoch": 1.707090788601723, + "grad_norm": 12.289925575256348, + "learning_rate": 1.9635383989003893e-06, + "loss": 0.1238, + "num_input_tokens_seen": 25229624, + "step": 12880 + }, + { + "epoch": 1.707223326706428, + "grad_norm": 1.0928581953048706, + "learning_rate": 1.963199353189261e-06, + "loss": 0.0053, + "num_input_tokens_seen": 25231536, + "step": 12881 + }, + { + "epoch": 1.707355864811133, + "grad_norm": 6.0725531578063965, + "learning_rate": 1.9628603178280194e-06, + "loss": 0.0763, + "num_input_tokens_seen": 25233512, + "step": 12882 + }, + { + "epoch": 1.7074884029158381, + "grad_norm": 0.18519461154937744, + "learning_rate": 1.9625212928232017e-06, + "loss": 0.0013, + "num_input_tokens_seen": 25235264, + "step": 12883 + }, + { + "epoch": 1.7076209410205434, + "grad_norm": 0.1186414584517479, + "learning_rate": 1.962182278181344e-06, + "loss": 0.0009, + "num_input_tokens_seen": 25236376, + "step": 12884 + }, + { + "epoch": 1.7077534791252487, + "grad_norm": 0.196207195520401, + "learning_rate": 1.9618432739089843e-06, + "loss": 0.0022, + "num_input_tokens_seen": 25238568, + "step": 12885 + }, + { + "epoch": 1.7078860172299537, + "grad_norm": 8.262133598327637, + "learning_rate": 1.961504280012657e-06, + "loss": 0.1467, + "num_input_tokens_seen": 25240464, + "step": 12886 + }, + { + "epoch": 1.7080185553346587, + "grad_norm": 9.807334899902344, + "learning_rate": 1.961165296498898e-06, + "loss": 0.0695, + "num_input_tokens_seen": 25242432, + "step": 12887 + }, + { + "epoch": 1.7081510934393638, + "grad_norm": 4.123963832855225, + "learning_rate": 1.9608263233742435e-06, + "loss": 0.1098, + "num_input_tokens_seen": 25244704, + "step": 12888 + }, + { + "epoch": 1.7082836315440688, + "grad_norm": 1.6036781072616577, + "learning_rate": 1.9604873606452303e-06, + "loss": 0.018, + "num_input_tokens_seen": 25246352, + "step": 12889 + }, + { + "epoch": 1.708416169648774, + "grad_norm": 8.156929969787598, + "learning_rate": 1.9601484083183924e-06, + "loss": 0.1065, + "num_input_tokens_seen": 25248592, + "step": 12890 + }, + { + "epoch": 1.7085487077534791, + "grad_norm": 0.09577160328626633, + "learning_rate": 1.9598094664002664e-06, + "loss": 0.0007, + "num_input_tokens_seen": 25250888, + "step": 12891 + }, + { + "epoch": 1.7086812458581844, + "grad_norm": 9.051547050476074, + "learning_rate": 1.9594705348973858e-06, + "loss": 0.211, + "num_input_tokens_seen": 25252800, + "step": 12892 + }, + { + "epoch": 1.7088137839628894, + "grad_norm": 1.5764238834381104, + "learning_rate": 1.959131613816286e-06, + "loss": 0.0167, + "num_input_tokens_seen": 25254112, + "step": 12893 + }, + { + "epoch": 1.7089463220675944, + "grad_norm": 21.03456687927246, + "learning_rate": 1.9587927031635024e-06, + "loss": 0.7037, + "num_input_tokens_seen": 25256128, + "step": 12894 + }, + { + "epoch": 1.7090788601722995, + "grad_norm": 13.756447792053223, + "learning_rate": 1.958453802945569e-06, + "loss": 0.2294, + "num_input_tokens_seen": 25257848, + "step": 12895 + }, + { + "epoch": 1.7092113982770045, + "grad_norm": 7.335909843444824, + "learning_rate": 1.9581149131690192e-06, + "loss": 0.1319, + "num_input_tokens_seen": 25259888, + "step": 12896 + }, + { + "epoch": 1.7093439363817098, + "grad_norm": 14.74302864074707, + "learning_rate": 1.9577760338403887e-06, + "loss": 0.2358, + "num_input_tokens_seen": 25262352, + "step": 12897 + }, + { + "epoch": 1.7094764744864148, + "grad_norm": 11.128101348876953, + "learning_rate": 1.957437164966209e-06, + "loss": 0.0626, + "num_input_tokens_seen": 25264176, + "step": 12898 + }, + { + "epoch": 1.70960901259112, + "grad_norm": 3.5048234462738037, + "learning_rate": 1.9570983065530154e-06, + "loss": 0.048, + "num_input_tokens_seen": 25265752, + "step": 12899 + }, + { + "epoch": 1.7097415506958251, + "grad_norm": 3.2065393924713135, + "learning_rate": 1.956759458607342e-06, + "loss": 0.0916, + "num_input_tokens_seen": 25267608, + "step": 12900 + }, + { + "epoch": 1.7098740888005302, + "grad_norm": 11.249932289123535, + "learning_rate": 1.9564206211357203e-06, + "loss": 0.133, + "num_input_tokens_seen": 25269584, + "step": 12901 + }, + { + "epoch": 1.7100066269052352, + "grad_norm": 3.8407392501831055, + "learning_rate": 1.9560817941446843e-06, + "loss": 0.05, + "num_input_tokens_seen": 25271288, + "step": 12902 + }, + { + "epoch": 1.7101391650099402, + "grad_norm": 1.8990479707717896, + "learning_rate": 1.9557429776407653e-06, + "loss": 0.0228, + "num_input_tokens_seen": 25272728, + "step": 12903 + }, + { + "epoch": 1.7102717031146455, + "grad_norm": 3.9293930530548096, + "learning_rate": 1.9554041716304987e-06, + "loss": 0.1042, + "num_input_tokens_seen": 25275072, + "step": 12904 + }, + { + "epoch": 1.7104042412193505, + "grad_norm": 11.22649097442627, + "learning_rate": 1.9550653761204145e-06, + "loss": 0.2575, + "num_input_tokens_seen": 25277120, + "step": 12905 + }, + { + "epoch": 1.7105367793240558, + "grad_norm": 0.17238593101501465, + "learning_rate": 1.9547265911170465e-06, + "loss": 0.0012, + "num_input_tokens_seen": 25279400, + "step": 12906 + }, + { + "epoch": 1.7106693174287608, + "grad_norm": 0.1866070181131363, + "learning_rate": 1.9543878166269252e-06, + "loss": 0.0013, + "num_input_tokens_seen": 25280752, + "step": 12907 + }, + { + "epoch": 1.7108018555334659, + "grad_norm": 6.779706954956055, + "learning_rate": 1.954049052656583e-06, + "loss": 0.0431, + "num_input_tokens_seen": 25282480, + "step": 12908 + }, + { + "epoch": 1.710934393638171, + "grad_norm": 7.632396697998047, + "learning_rate": 1.953710299212553e-06, + "loss": 0.0676, + "num_input_tokens_seen": 25284840, + "step": 12909 + }, + { + "epoch": 1.711066931742876, + "grad_norm": 0.09920109063386917, + "learning_rate": 1.9533715563013645e-06, + "loss": 0.0012, + "num_input_tokens_seen": 25286504, + "step": 12910 + }, + { + "epoch": 1.7111994698475812, + "grad_norm": 0.17513515055179596, + "learning_rate": 1.9530328239295496e-06, + "loss": 0.0012, + "num_input_tokens_seen": 25288984, + "step": 12911 + }, + { + "epoch": 1.7113320079522862, + "grad_norm": 0.07903392612934113, + "learning_rate": 1.9526941021036395e-06, + "loss": 0.0006, + "num_input_tokens_seen": 25291656, + "step": 12912 + }, + { + "epoch": 1.7114645460569915, + "grad_norm": 0.1345091015100479, + "learning_rate": 1.952355390830164e-06, + "loss": 0.001, + "num_input_tokens_seen": 25293104, + "step": 12913 + }, + { + "epoch": 1.7115970841616965, + "grad_norm": 11.88608169555664, + "learning_rate": 1.952016690115654e-06, + "loss": 0.1905, + "num_input_tokens_seen": 25294808, + "step": 12914 + }, + { + "epoch": 1.7117296222664016, + "grad_norm": 0.10170052945613861, + "learning_rate": 1.9516779999666414e-06, + "loss": 0.0007, + "num_input_tokens_seen": 25296280, + "step": 12915 + }, + { + "epoch": 1.7118621603711066, + "grad_norm": 0.13856835663318634, + "learning_rate": 1.951339320389655e-06, + "loss": 0.001, + "num_input_tokens_seen": 25297952, + "step": 12916 + }, + { + "epoch": 1.7119946984758116, + "grad_norm": 8.13261890411377, + "learning_rate": 1.9510006513912246e-06, + "loss": 0.0702, + "num_input_tokens_seen": 25299320, + "step": 12917 + }, + { + "epoch": 1.712127236580517, + "grad_norm": 0.2003929316997528, + "learning_rate": 1.9506619929778813e-06, + "loss": 0.0013, + "num_input_tokens_seen": 25301040, + "step": 12918 + }, + { + "epoch": 1.712259774685222, + "grad_norm": 0.603008508682251, + "learning_rate": 1.950323345156152e-06, + "loss": 0.0047, + "num_input_tokens_seen": 25302504, + "step": 12919 + }, + { + "epoch": 1.7123923127899272, + "grad_norm": 0.052780456840991974, + "learning_rate": 1.9499847079325685e-06, + "loss": 0.0004, + "num_input_tokens_seen": 25303744, + "step": 12920 + }, + { + "epoch": 1.7125248508946322, + "grad_norm": 10.793846130371094, + "learning_rate": 1.9496460813136602e-06, + "loss": 0.2512, + "num_input_tokens_seen": 25306144, + "step": 12921 + }, + { + "epoch": 1.7126573889993373, + "grad_norm": 5.912188529968262, + "learning_rate": 1.9493074653059542e-06, + "loss": 0.232, + "num_input_tokens_seen": 25307864, + "step": 12922 + }, + { + "epoch": 1.7127899271040423, + "grad_norm": 0.06489851325750351, + "learning_rate": 1.9489688599159807e-06, + "loss": 0.0005, + "num_input_tokens_seen": 25310784, + "step": 12923 + }, + { + "epoch": 1.7129224652087474, + "grad_norm": 5.481436252593994, + "learning_rate": 1.948630265150267e-06, + "loss": 0.134, + "num_input_tokens_seen": 25312800, + "step": 12924 + }, + { + "epoch": 1.7130550033134526, + "grad_norm": 4.623965740203857, + "learning_rate": 1.948291681015343e-06, + "loss": 0.1298, + "num_input_tokens_seen": 25314040, + "step": 12925 + }, + { + "epoch": 1.7131875414181579, + "grad_norm": 9.266488075256348, + "learning_rate": 1.9479531075177354e-06, + "loss": 0.1475, + "num_input_tokens_seen": 25315728, + "step": 12926 + }, + { + "epoch": 1.713320079522863, + "grad_norm": 0.7103762030601501, + "learning_rate": 1.947614544663973e-06, + "loss": 0.0046, + "num_input_tokens_seen": 25317664, + "step": 12927 + }, + { + "epoch": 1.713452617627568, + "grad_norm": 7.603853225708008, + "learning_rate": 1.947275992460583e-06, + "loss": 0.1927, + "num_input_tokens_seen": 25319960, + "step": 12928 + }, + { + "epoch": 1.713585155732273, + "grad_norm": 3.0583081245422363, + "learning_rate": 1.946937450914093e-06, + "loss": 0.0187, + "num_input_tokens_seen": 25322976, + "step": 12929 + }, + { + "epoch": 1.713717693836978, + "grad_norm": 0.05097518861293793, + "learning_rate": 1.9465989200310315e-06, + "loss": 0.0003, + "num_input_tokens_seen": 25324032, + "step": 12930 + }, + { + "epoch": 1.7138502319416833, + "grad_norm": 2.333779811859131, + "learning_rate": 1.9462603998179237e-06, + "loss": 0.0128, + "num_input_tokens_seen": 25326072, + "step": 12931 + }, + { + "epoch": 1.7139827700463883, + "grad_norm": 6.8236823081970215, + "learning_rate": 1.945921890281298e-06, + "loss": 0.1774, + "num_input_tokens_seen": 25327728, + "step": 12932 + }, + { + "epoch": 1.7141153081510936, + "grad_norm": 1.0247021913528442, + "learning_rate": 1.9455833914276805e-06, + "loss": 0.027, + "num_input_tokens_seen": 25329344, + "step": 12933 + }, + { + "epoch": 1.7142478462557986, + "grad_norm": 0.08461688458919525, + "learning_rate": 1.9452449032635966e-06, + "loss": 0.0006, + "num_input_tokens_seen": 25332464, + "step": 12934 + }, + { + "epoch": 1.7143803843605037, + "grad_norm": 12.286478042602539, + "learning_rate": 1.9449064257955745e-06, + "loss": 0.131, + "num_input_tokens_seen": 25334256, + "step": 12935 + }, + { + "epoch": 1.7145129224652087, + "grad_norm": 0.03181973844766617, + "learning_rate": 1.9445679590301395e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25336192, + "step": 12936 + }, + { + "epoch": 1.7146454605699137, + "grad_norm": 9.853219032287598, + "learning_rate": 1.944229502973817e-06, + "loss": 0.2565, + "num_input_tokens_seen": 25338688, + "step": 12937 + }, + { + "epoch": 1.714777998674619, + "grad_norm": 1.6905488967895508, + "learning_rate": 1.9438910576331336e-06, + "loss": 0.0157, + "num_input_tokens_seen": 25340520, + "step": 12938 + }, + { + "epoch": 1.714910536779324, + "grad_norm": 1.551859974861145, + "learning_rate": 1.9435526230146135e-06, + "loss": 0.0169, + "num_input_tokens_seen": 25342424, + "step": 12939 + }, + { + "epoch": 1.7150430748840293, + "grad_norm": 0.029570886865258217, + "learning_rate": 1.9432141991247834e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25344520, + "step": 12940 + }, + { + "epoch": 1.7151756129887343, + "grad_norm": 0.024803733453154564, + "learning_rate": 1.9428757859701676e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25346280, + "step": 12941 + }, + { + "epoch": 1.7153081510934394, + "grad_norm": 4.9073805809021, + "learning_rate": 1.942537383557291e-06, + "loss": 0.0585, + "num_input_tokens_seen": 25348296, + "step": 12942 + }, + { + "epoch": 1.7154406891981444, + "grad_norm": 6.587100505828857, + "learning_rate": 1.9421989918926783e-06, + "loss": 0.0598, + "num_input_tokens_seen": 25350136, + "step": 12943 + }, + { + "epoch": 1.7155732273028494, + "grad_norm": 4.08974027633667, + "learning_rate": 1.9418606109828535e-06, + "loss": 0.1186, + "num_input_tokens_seen": 25351440, + "step": 12944 + }, + { + "epoch": 1.7157057654075547, + "grad_norm": 3.507312297821045, + "learning_rate": 1.9415222408343408e-06, + "loss": 0.0448, + "num_input_tokens_seen": 25352808, + "step": 12945 + }, + { + "epoch": 1.7158383035122597, + "grad_norm": 8.362565994262695, + "learning_rate": 1.941183881453665e-06, + "loss": 0.0936, + "num_input_tokens_seen": 25354960, + "step": 12946 + }, + { + "epoch": 1.715970841616965, + "grad_norm": 7.943873405456543, + "learning_rate": 1.94084553284735e-06, + "loss": 0.1312, + "num_input_tokens_seen": 25357448, + "step": 12947 + }, + { + "epoch": 1.71610337972167, + "grad_norm": 0.4840361773967743, + "learning_rate": 1.9405071950219185e-06, + "loss": 0.0065, + "num_input_tokens_seen": 25360488, + "step": 12948 + }, + { + "epoch": 1.716235917826375, + "grad_norm": 3.952924966812134, + "learning_rate": 1.940168867983894e-06, + "loss": 0.0325, + "num_input_tokens_seen": 25362192, + "step": 12949 + }, + { + "epoch": 1.7163684559310801, + "grad_norm": 9.478523254394531, + "learning_rate": 1.9398305517397994e-06, + "loss": 0.2605, + "num_input_tokens_seen": 25364152, + "step": 12950 + }, + { + "epoch": 1.7165009940357852, + "grad_norm": 0.018006248399615288, + "learning_rate": 1.9394922462961594e-06, + "loss": 0.0001, + "num_input_tokens_seen": 25365736, + "step": 12951 + }, + { + "epoch": 1.7166335321404904, + "grad_norm": 1.2977339029312134, + "learning_rate": 1.9391539516594947e-06, + "loss": 0.0181, + "num_input_tokens_seen": 25367152, + "step": 12952 + }, + { + "epoch": 1.7167660702451955, + "grad_norm": 9.765443801879883, + "learning_rate": 1.9388156678363296e-06, + "loss": 0.161, + "num_input_tokens_seen": 25369000, + "step": 12953 + }, + { + "epoch": 1.7168986083499007, + "grad_norm": 0.019091330468654633, + "learning_rate": 1.938477394833185e-06, + "loss": 0.0001, + "num_input_tokens_seen": 25371936, + "step": 12954 + }, + { + "epoch": 1.7170311464546058, + "grad_norm": 8.533186912536621, + "learning_rate": 1.9381391326565825e-06, + "loss": 0.3349, + "num_input_tokens_seen": 25374328, + "step": 12955 + }, + { + "epoch": 1.7171636845593108, + "grad_norm": 0.030274169519543648, + "learning_rate": 1.9378008813130465e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25376424, + "step": 12956 + }, + { + "epoch": 1.7172962226640158, + "grad_norm": 3.8452515602111816, + "learning_rate": 1.937462640809098e-06, + "loss": 0.0324, + "num_input_tokens_seen": 25379008, + "step": 12957 + }, + { + "epoch": 1.7174287607687209, + "grad_norm": 7.430222511291504, + "learning_rate": 1.9371244111512566e-06, + "loss": 0.227, + "num_input_tokens_seen": 25381856, + "step": 12958 + }, + { + "epoch": 1.7175612988734261, + "grad_norm": 0.11693737655878067, + "learning_rate": 1.9367861923460457e-06, + "loss": 0.0008, + "num_input_tokens_seen": 25383960, + "step": 12959 + }, + { + "epoch": 1.7176938369781312, + "grad_norm": 0.7076490521430969, + "learning_rate": 1.9364479843999844e-06, + "loss": 0.0059, + "num_input_tokens_seen": 25385720, + "step": 12960 + }, + { + "epoch": 1.7178263750828364, + "grad_norm": 0.05456510931253433, + "learning_rate": 1.936109787319595e-06, + "loss": 0.0004, + "num_input_tokens_seen": 25388368, + "step": 12961 + }, + { + "epoch": 1.7179589131875415, + "grad_norm": 0.04048958420753479, + "learning_rate": 1.9357716011113988e-06, + "loss": 0.0003, + "num_input_tokens_seen": 25391392, + "step": 12962 + }, + { + "epoch": 1.7180914512922465, + "grad_norm": 0.015053503215312958, + "learning_rate": 1.9354334257819153e-06, + "loss": 0.0001, + "num_input_tokens_seen": 25393552, + "step": 12963 + }, + { + "epoch": 1.7182239893969515, + "grad_norm": 0.02331184223294258, + "learning_rate": 1.935095261337664e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25394992, + "step": 12964 + }, + { + "epoch": 1.7183565275016566, + "grad_norm": 0.024966707453131676, + "learning_rate": 1.9347571077851657e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25398240, + "step": 12965 + }, + { + "epoch": 1.7184890656063618, + "grad_norm": 9.616430282592773, + "learning_rate": 1.9344189651309413e-06, + "loss": 0.1576, + "num_input_tokens_seen": 25400192, + "step": 12966 + }, + { + "epoch": 1.718621603711067, + "grad_norm": 0.6272745132446289, + "learning_rate": 1.934080833381509e-06, + "loss": 0.0117, + "num_input_tokens_seen": 25402176, + "step": 12967 + }, + { + "epoch": 1.7187541418157721, + "grad_norm": 2.9754433631896973, + "learning_rate": 1.9337427125433887e-06, + "loss": 0.0474, + "num_input_tokens_seen": 25404024, + "step": 12968 + }, + { + "epoch": 1.7188866799204772, + "grad_norm": 6.730123519897461, + "learning_rate": 1.933404602623099e-06, + "loss": 0.1755, + "num_input_tokens_seen": 25406656, + "step": 12969 + }, + { + "epoch": 1.7190192180251822, + "grad_norm": 4.9462995529174805, + "learning_rate": 1.9330665036271605e-06, + "loss": 0.0971, + "num_input_tokens_seen": 25408440, + "step": 12970 + }, + { + "epoch": 1.7191517561298872, + "grad_norm": 3.910475492477417, + "learning_rate": 1.9327284155620894e-06, + "loss": 0.0285, + "num_input_tokens_seen": 25410128, + "step": 12971 + }, + { + "epoch": 1.7192842942345923, + "grad_norm": 4.765516757965088, + "learning_rate": 1.932390338434407e-06, + "loss": 0.1699, + "num_input_tokens_seen": 25412120, + "step": 12972 + }, + { + "epoch": 1.7194168323392975, + "grad_norm": 0.9485150575637817, + "learning_rate": 1.93205227225063e-06, + "loss": 0.0073, + "num_input_tokens_seen": 25413696, + "step": 12973 + }, + { + "epoch": 1.7195493704440028, + "grad_norm": 14.877690315246582, + "learning_rate": 1.9317142170172772e-06, + "loss": 0.1385, + "num_input_tokens_seen": 25415472, + "step": 12974 + }, + { + "epoch": 1.7196819085487078, + "grad_norm": 0.026858855038881302, + "learning_rate": 1.9313761727408665e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25417296, + "step": 12975 + }, + { + "epoch": 1.7198144466534129, + "grad_norm": 0.018342316150665283, + "learning_rate": 1.9310381394279145e-06, + "loss": 0.0001, + "num_input_tokens_seen": 25419584, + "step": 12976 + }, + { + "epoch": 1.719946984758118, + "grad_norm": 8.671225547790527, + "learning_rate": 1.9307001170849404e-06, + "loss": 0.1557, + "num_input_tokens_seen": 25421784, + "step": 12977 + }, + { + "epoch": 1.720079522862823, + "grad_norm": 0.030548563227057457, + "learning_rate": 1.9303621057184612e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25423136, + "step": 12978 + }, + { + "epoch": 1.7202120609675282, + "grad_norm": 0.48374390602111816, + "learning_rate": 1.930024105334993e-06, + "loss": 0.0035, + "num_input_tokens_seen": 25424832, + "step": 12979 + }, + { + "epoch": 1.7203445990722332, + "grad_norm": 0.04003541171550751, + "learning_rate": 1.9296861159410537e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25427464, + "step": 12980 + }, + { + "epoch": 1.7204771371769385, + "grad_norm": 0.11772596836090088, + "learning_rate": 1.9293481375431585e-06, + "loss": 0.0008, + "num_input_tokens_seen": 25430272, + "step": 12981 + }, + { + "epoch": 1.7206096752816435, + "grad_norm": 0.01597120612859726, + "learning_rate": 1.9290101701478254e-06, + "loss": 0.0001, + "num_input_tokens_seen": 25431464, + "step": 12982 + }, + { + "epoch": 1.7207422133863486, + "grad_norm": 6.537518501281738, + "learning_rate": 1.928672213761571e-06, + "loss": 0.0543, + "num_input_tokens_seen": 25433232, + "step": 12983 + }, + { + "epoch": 1.7208747514910536, + "grad_norm": 0.38836613297462463, + "learning_rate": 1.928334268390909e-06, + "loss": 0.0023, + "num_input_tokens_seen": 25435816, + "step": 12984 + }, + { + "epoch": 1.7210072895957587, + "grad_norm": 0.6606959700584412, + "learning_rate": 1.927996334042358e-06, + "loss": 0.0032, + "num_input_tokens_seen": 25437248, + "step": 12985 + }, + { + "epoch": 1.721139827700464, + "grad_norm": 0.01738104596734047, + "learning_rate": 1.927658410722431e-06, + "loss": 0.0001, + "num_input_tokens_seen": 25439360, + "step": 12986 + }, + { + "epoch": 1.721272365805169, + "grad_norm": 7.767855167388916, + "learning_rate": 1.927320498437646e-06, + "loss": 0.1018, + "num_input_tokens_seen": 25441456, + "step": 12987 + }, + { + "epoch": 1.7214049039098742, + "grad_norm": 7.30949592590332, + "learning_rate": 1.9269825971945163e-06, + "loss": 0.0966, + "num_input_tokens_seen": 25444120, + "step": 12988 + }, + { + "epoch": 1.7215374420145793, + "grad_norm": 0.005745692178606987, + "learning_rate": 1.9266447069995574e-06, + "loss": 0.0, + "num_input_tokens_seen": 25445192, + "step": 12989 + }, + { + "epoch": 1.7216699801192843, + "grad_norm": 5.081228256225586, + "learning_rate": 1.926306827859284e-06, + "loss": 0.0333, + "num_input_tokens_seen": 25446992, + "step": 12990 + }, + { + "epoch": 1.7218025182239893, + "grad_norm": 9.082262992858887, + "learning_rate": 1.92596895978021e-06, + "loss": 0.2526, + "num_input_tokens_seen": 25450152, + "step": 12991 + }, + { + "epoch": 1.7219350563286944, + "grad_norm": 3.9443247318267822, + "learning_rate": 1.9256311027688516e-06, + "loss": 0.0712, + "num_input_tokens_seen": 25452712, + "step": 12992 + }, + { + "epoch": 1.7220675944333996, + "grad_norm": 1.3829736709594727, + "learning_rate": 1.9252932568317217e-06, + "loss": 0.0096, + "num_input_tokens_seen": 25454152, + "step": 12993 + }, + { + "epoch": 1.7222001325381047, + "grad_norm": 8.036885261535645, + "learning_rate": 1.924955421975334e-06, + "loss": 0.1776, + "num_input_tokens_seen": 25457376, + "step": 12994 + }, + { + "epoch": 1.72233267064281, + "grad_norm": 0.4590553343296051, + "learning_rate": 1.9246175982062032e-06, + "loss": 0.0038, + "num_input_tokens_seen": 25458912, + "step": 12995 + }, + { + "epoch": 1.722465208747515, + "grad_norm": 9.813371658325195, + "learning_rate": 1.924279785530841e-06, + "loss": 0.2084, + "num_input_tokens_seen": 25460872, + "step": 12996 + }, + { + "epoch": 1.72259774685222, + "grad_norm": 3.721602439880371, + "learning_rate": 1.9239419839557618e-06, + "loss": 0.024, + "num_input_tokens_seen": 25462848, + "step": 12997 + }, + { + "epoch": 1.722730284956925, + "grad_norm": 8.09642505645752, + "learning_rate": 1.923604193487479e-06, + "loss": 0.1626, + "num_input_tokens_seen": 25465584, + "step": 12998 + }, + { + "epoch": 1.72286282306163, + "grad_norm": 3.3792200088500977, + "learning_rate": 1.9232664141325045e-06, + "loss": 0.072, + "num_input_tokens_seen": 25467256, + "step": 12999 + }, + { + "epoch": 1.7229953611663353, + "grad_norm": 6.518139362335205, + "learning_rate": 1.922928645897352e-06, + "loss": 0.0907, + "num_input_tokens_seen": 25469816, + "step": 13000 + }, + { + "epoch": 1.7231278992710404, + "grad_norm": 5.251297473907471, + "learning_rate": 1.9225908887885335e-06, + "loss": 0.0391, + "num_input_tokens_seen": 25471760, + "step": 13001 + }, + { + "epoch": 1.7232604373757456, + "grad_norm": 0.06077706441283226, + "learning_rate": 1.9222531428125594e-06, + "loss": 0.0004, + "num_input_tokens_seen": 25473336, + "step": 13002 + }, + { + "epoch": 1.7233929754804507, + "grad_norm": 0.9986122846603394, + "learning_rate": 1.9219154079759444e-06, + "loss": 0.0044, + "num_input_tokens_seen": 25475424, + "step": 13003 + }, + { + "epoch": 1.7235255135851557, + "grad_norm": 10.366222381591797, + "learning_rate": 1.9215776842851997e-06, + "loss": 0.1737, + "num_input_tokens_seen": 25477024, + "step": 13004 + }, + { + "epoch": 1.7236580516898607, + "grad_norm": 9.17481803894043, + "learning_rate": 1.921239971746835e-06, + "loss": 0.0655, + "num_input_tokens_seen": 25478544, + "step": 13005 + }, + { + "epoch": 1.7237905897945658, + "grad_norm": 0.4798559546470642, + "learning_rate": 1.920902270367364e-06, + "loss": 0.0036, + "num_input_tokens_seen": 25480784, + "step": 13006 + }, + { + "epoch": 1.723923127899271, + "grad_norm": 0.03695900738239288, + "learning_rate": 1.920564580153296e-06, + "loss": 0.0003, + "num_input_tokens_seen": 25482432, + "step": 13007 + }, + { + "epoch": 1.7240556660039763, + "grad_norm": 9.76543140411377, + "learning_rate": 1.9202269011111425e-06, + "loss": 0.1283, + "num_input_tokens_seen": 25483712, + "step": 13008 + }, + { + "epoch": 1.7241882041086813, + "grad_norm": 11.855833053588867, + "learning_rate": 1.919889233247415e-06, + "loss": 0.2655, + "num_input_tokens_seen": 25485432, + "step": 13009 + }, + { + "epoch": 1.7243207422133864, + "grad_norm": 16.928966522216797, + "learning_rate": 1.9195515765686237e-06, + "loss": 0.4454, + "num_input_tokens_seen": 25487728, + "step": 13010 + }, + { + "epoch": 1.7244532803180914, + "grad_norm": 0.03161786124110222, + "learning_rate": 1.9192139310812775e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25489504, + "step": 13011 + }, + { + "epoch": 1.7245858184227965, + "grad_norm": 0.618473470211029, + "learning_rate": 1.9188762967918872e-06, + "loss": 0.003, + "num_input_tokens_seen": 25491968, + "step": 13012 + }, + { + "epoch": 1.7247183565275015, + "grad_norm": 0.17127221822738647, + "learning_rate": 1.9185386737069633e-06, + "loss": 0.0011, + "num_input_tokens_seen": 25493896, + "step": 13013 + }, + { + "epoch": 1.7248508946322068, + "grad_norm": 0.016987809911370277, + "learning_rate": 1.918201061833015e-06, + "loss": 0.0001, + "num_input_tokens_seen": 25496536, + "step": 13014 + }, + { + "epoch": 1.724983432736912, + "grad_norm": 14.260407447814941, + "learning_rate": 1.917863461176552e-06, + "loss": 0.043, + "num_input_tokens_seen": 25497984, + "step": 13015 + }, + { + "epoch": 1.725115970841617, + "grad_norm": 0.014989822171628475, + "learning_rate": 1.9175258717440827e-06, + "loss": 0.0001, + "num_input_tokens_seen": 25499792, + "step": 13016 + }, + { + "epoch": 1.725248508946322, + "grad_norm": 9.37473201751709, + "learning_rate": 1.9171882935421153e-06, + "loss": 0.2087, + "num_input_tokens_seen": 25501992, + "step": 13017 + }, + { + "epoch": 1.7253810470510271, + "grad_norm": 0.022801384329795837, + "learning_rate": 1.9168507265771606e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25503832, + "step": 13018 + }, + { + "epoch": 1.7255135851557322, + "grad_norm": 0.021364616230130196, + "learning_rate": 1.9165131708557264e-06, + "loss": 0.0001, + "num_input_tokens_seen": 25506160, + "step": 13019 + }, + { + "epoch": 1.7256461232604374, + "grad_norm": 0.31378209590911865, + "learning_rate": 1.9161756263843203e-06, + "loss": 0.0014, + "num_input_tokens_seen": 25507712, + "step": 13020 + }, + { + "epoch": 1.7257786613651425, + "grad_norm": 12.725200653076172, + "learning_rate": 1.9158380931694514e-06, + "loss": 0.3018, + "num_input_tokens_seen": 25509632, + "step": 13021 + }, + { + "epoch": 1.7259111994698477, + "grad_norm": 0.026845267042517662, + "learning_rate": 1.9155005712176262e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25511456, + "step": 13022 + }, + { + "epoch": 1.7260437375745528, + "grad_norm": 3.088853359222412, + "learning_rate": 1.9151630605353538e-06, + "loss": 0.1026, + "num_input_tokens_seen": 25512904, + "step": 13023 + }, + { + "epoch": 1.7261762756792578, + "grad_norm": 9.048897743225098, + "learning_rate": 1.9148255611291413e-06, + "loss": 0.1868, + "num_input_tokens_seen": 25514712, + "step": 13024 + }, + { + "epoch": 1.7263088137839628, + "grad_norm": 7.238923072814941, + "learning_rate": 1.914488073005496e-06, + "loss": 0.1861, + "num_input_tokens_seen": 25516072, + "step": 13025 + }, + { + "epoch": 1.7264413518886679, + "grad_norm": 2.513960599899292, + "learning_rate": 1.914150596170924e-06, + "loss": 0.0105, + "num_input_tokens_seen": 25518640, + "step": 13026 + }, + { + "epoch": 1.7265738899933731, + "grad_norm": 0.04520354047417641, + "learning_rate": 1.9138131306319325e-06, + "loss": 0.0003, + "num_input_tokens_seen": 25520896, + "step": 13027 + }, + { + "epoch": 1.7267064280980782, + "grad_norm": 14.103716850280762, + "learning_rate": 1.913475676395028e-06, + "loss": 0.2667, + "num_input_tokens_seen": 25522992, + "step": 13028 + }, + { + "epoch": 1.7268389662027834, + "grad_norm": 5.991722106933594, + "learning_rate": 1.9131382334667173e-06, + "loss": 0.1617, + "num_input_tokens_seen": 25525256, + "step": 13029 + }, + { + "epoch": 1.7269715043074885, + "grad_norm": 0.05218103900551796, + "learning_rate": 1.912800801853507e-06, + "loss": 0.0003, + "num_input_tokens_seen": 25527344, + "step": 13030 + }, + { + "epoch": 1.7271040424121935, + "grad_norm": 0.04634048789739609, + "learning_rate": 1.912463381561902e-06, + "loss": 0.0003, + "num_input_tokens_seen": 25529976, + "step": 13031 + }, + { + "epoch": 1.7272365805168985, + "grad_norm": 0.06538394838571548, + "learning_rate": 1.912125972598408e-06, + "loss": 0.0004, + "num_input_tokens_seen": 25531576, + "step": 13032 + }, + { + "epoch": 1.7273691186216036, + "grad_norm": 8.11008071899414, + "learning_rate": 1.9117885749695303e-06, + "loss": 0.1277, + "num_input_tokens_seen": 25533056, + "step": 13033 + }, + { + "epoch": 1.7275016567263088, + "grad_norm": 9.15585994720459, + "learning_rate": 1.9114511886817753e-06, + "loss": 0.254, + "num_input_tokens_seen": 25535160, + "step": 13034 + }, + { + "epoch": 1.7276341948310139, + "grad_norm": 0.22886191308498383, + "learning_rate": 1.9111138137416475e-06, + "loss": 0.001, + "num_input_tokens_seen": 25536888, + "step": 13035 + }, + { + "epoch": 1.7277667329357191, + "grad_norm": 0.2796797752380371, + "learning_rate": 1.910776450155652e-06, + "loss": 0.0013, + "num_input_tokens_seen": 25538800, + "step": 13036 + }, + { + "epoch": 1.7278992710404242, + "grad_norm": 0.021284453570842743, + "learning_rate": 1.9104390979302923e-06, + "loss": 0.0001, + "num_input_tokens_seen": 25539872, + "step": 13037 + }, + { + "epoch": 1.7280318091451292, + "grad_norm": 17.202625274658203, + "learning_rate": 1.9101017570720732e-06, + "loss": 0.3886, + "num_input_tokens_seen": 25541976, + "step": 13038 + }, + { + "epoch": 1.7281643472498343, + "grad_norm": 0.31213247776031494, + "learning_rate": 1.9097644275875e-06, + "loss": 0.0025, + "num_input_tokens_seen": 25543576, + "step": 13039 + }, + { + "epoch": 1.7282968853545393, + "grad_norm": 18.33671760559082, + "learning_rate": 1.909427109483076e-06, + "loss": 0.4609, + "num_input_tokens_seen": 25545360, + "step": 13040 + }, + { + "epoch": 1.7284294234592446, + "grad_norm": 0.07300771772861481, + "learning_rate": 1.909089802765304e-06, + "loss": 0.0004, + "num_input_tokens_seen": 25546664, + "step": 13041 + }, + { + "epoch": 1.7285619615639496, + "grad_norm": 0.02774183265864849, + "learning_rate": 1.908752507440689e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25547992, + "step": 13042 + }, + { + "epoch": 1.7286944996686548, + "grad_norm": 0.18687549233436584, + "learning_rate": 1.9084152235157326e-06, + "loss": 0.0012, + "num_input_tokens_seen": 25549680, + "step": 13043 + }, + { + "epoch": 1.7288270377733599, + "grad_norm": 13.019149780273438, + "learning_rate": 1.908077950996939e-06, + "loss": 0.1361, + "num_input_tokens_seen": 25552496, + "step": 13044 + }, + { + "epoch": 1.728959575878065, + "grad_norm": 8.08028793334961, + "learning_rate": 1.9077406898908115e-06, + "loss": 0.1816, + "num_input_tokens_seen": 25554640, + "step": 13045 + }, + { + "epoch": 1.72909211398277, + "grad_norm": 4.3986382484436035, + "learning_rate": 1.9074034402038515e-06, + "loss": 0.1204, + "num_input_tokens_seen": 25556280, + "step": 13046 + }, + { + "epoch": 1.729224652087475, + "grad_norm": 8.971397399902344, + "learning_rate": 1.9070662019425626e-06, + "loss": 0.1495, + "num_input_tokens_seen": 25558752, + "step": 13047 + }, + { + "epoch": 1.7293571901921803, + "grad_norm": 4.190964698791504, + "learning_rate": 1.9067289751134455e-06, + "loss": 0.1532, + "num_input_tokens_seen": 25560128, + "step": 13048 + }, + { + "epoch": 1.7294897282968855, + "grad_norm": 12.704618453979492, + "learning_rate": 1.9063917597230035e-06, + "loss": 0.241, + "num_input_tokens_seen": 25562424, + "step": 13049 + }, + { + "epoch": 1.7296222664015906, + "grad_norm": 4.931826591491699, + "learning_rate": 1.9060545557777379e-06, + "loss": 0.07, + "num_input_tokens_seen": 25564824, + "step": 13050 + }, + { + "epoch": 1.7297548045062956, + "grad_norm": 6.741169452667236, + "learning_rate": 1.9057173632841508e-06, + "loss": 0.1122, + "num_input_tokens_seen": 25566456, + "step": 13051 + }, + { + "epoch": 1.7298873426110006, + "grad_norm": 0.03542790561914444, + "learning_rate": 1.905380182248742e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25568600, + "step": 13052 + }, + { + "epoch": 1.7300198807157057, + "grad_norm": 5.197339057922363, + "learning_rate": 1.9050430126780132e-06, + "loss": 0.0363, + "num_input_tokens_seen": 25569840, + "step": 13053 + }, + { + "epoch": 1.7301524188204107, + "grad_norm": 8.976947784423828, + "learning_rate": 1.9047058545784663e-06, + "loss": 0.2729, + "num_input_tokens_seen": 25571432, + "step": 13054 + }, + { + "epoch": 1.730284956925116, + "grad_norm": 0.047798532992601395, + "learning_rate": 1.9043687079566015e-06, + "loss": 0.0003, + "num_input_tokens_seen": 25572864, + "step": 13055 + }, + { + "epoch": 1.7304174950298212, + "grad_norm": 0.2912397086620331, + "learning_rate": 1.9040315728189185e-06, + "loss": 0.0033, + "num_input_tokens_seen": 25574144, + "step": 13056 + }, + { + "epoch": 1.7305500331345263, + "grad_norm": 0.4822416305541992, + "learning_rate": 1.903694449171918e-06, + "loss": 0.0033, + "num_input_tokens_seen": 25575784, + "step": 13057 + }, + { + "epoch": 1.7306825712392313, + "grad_norm": 8.978565216064453, + "learning_rate": 1.9033573370220997e-06, + "loss": 0.2154, + "num_input_tokens_seen": 25578200, + "step": 13058 + }, + { + "epoch": 1.7308151093439363, + "grad_norm": 0.6280280947685242, + "learning_rate": 1.9030202363759632e-06, + "loss": 0.0025, + "num_input_tokens_seen": 25579872, + "step": 13059 + }, + { + "epoch": 1.7309476474486414, + "grad_norm": 5.788191795349121, + "learning_rate": 1.9026831472400089e-06, + "loss": 0.0508, + "num_input_tokens_seen": 25581328, + "step": 13060 + }, + { + "epoch": 1.7310801855533466, + "grad_norm": 14.122097969055176, + "learning_rate": 1.9023460696207355e-06, + "loss": 0.21, + "num_input_tokens_seen": 25583104, + "step": 13061 + }, + { + "epoch": 1.7312127236580517, + "grad_norm": 0.884870707988739, + "learning_rate": 1.9020090035246424e-06, + "loss": 0.0068, + "num_input_tokens_seen": 25585088, + "step": 13062 + }, + { + "epoch": 1.731345261762757, + "grad_norm": 6.882485866546631, + "learning_rate": 1.9016719489582285e-06, + "loss": 0.1528, + "num_input_tokens_seen": 25587280, + "step": 13063 + }, + { + "epoch": 1.731477799867462, + "grad_norm": 6.080814361572266, + "learning_rate": 1.9013349059279909e-06, + "loss": 0.1395, + "num_input_tokens_seen": 25589328, + "step": 13064 + }, + { + "epoch": 1.731610337972167, + "grad_norm": 7.457761764526367, + "learning_rate": 1.90099787444043e-06, + "loss": 0.0887, + "num_input_tokens_seen": 25591040, + "step": 13065 + }, + { + "epoch": 1.731742876076872, + "grad_norm": 1.1022241115570068, + "learning_rate": 1.9006608545020438e-06, + "loss": 0.0104, + "num_input_tokens_seen": 25593216, + "step": 13066 + }, + { + "epoch": 1.731875414181577, + "grad_norm": 9.114222526550293, + "learning_rate": 1.9003238461193293e-06, + "loss": 0.3008, + "num_input_tokens_seen": 25595280, + "step": 13067 + }, + { + "epoch": 1.7320079522862823, + "grad_norm": 0.07977958768606186, + "learning_rate": 1.8999868492987852e-06, + "loss": 0.0005, + "num_input_tokens_seen": 25597384, + "step": 13068 + }, + { + "epoch": 1.7321404903909874, + "grad_norm": 0.48369911313056946, + "learning_rate": 1.8996498640469074e-06, + "loss": 0.0036, + "num_input_tokens_seen": 25598432, + "step": 13069 + }, + { + "epoch": 1.7322730284956926, + "grad_norm": 6.657774448394775, + "learning_rate": 1.8993128903701957e-06, + "loss": 0.0726, + "num_input_tokens_seen": 25600096, + "step": 13070 + }, + { + "epoch": 1.7324055666003977, + "grad_norm": 0.02275262027978897, + "learning_rate": 1.8989759282751452e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25601984, + "step": 13071 + }, + { + "epoch": 1.7325381047051027, + "grad_norm": 5.385279655456543, + "learning_rate": 1.898638977768254e-06, + "loss": 0.0622, + "num_input_tokens_seen": 25604472, + "step": 13072 + }, + { + "epoch": 1.7326706428098078, + "grad_norm": 6.461187362670898, + "learning_rate": 1.8983020388560175e-06, + "loss": 0.0841, + "num_input_tokens_seen": 25606424, + "step": 13073 + }, + { + "epoch": 1.7328031809145128, + "grad_norm": 3.1076791286468506, + "learning_rate": 1.8979651115449323e-06, + "loss": 0.0977, + "num_input_tokens_seen": 25608568, + "step": 13074 + }, + { + "epoch": 1.732935719019218, + "grad_norm": 0.034549444913864136, + "learning_rate": 1.8976281958414963e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25610496, + "step": 13075 + }, + { + "epoch": 1.733068257123923, + "grad_norm": 0.6905530691146851, + "learning_rate": 1.8972912917522035e-06, + "loss": 0.0066, + "num_input_tokens_seen": 25614288, + "step": 13076 + }, + { + "epoch": 1.7332007952286284, + "grad_norm": 7.6261796951293945, + "learning_rate": 1.8969543992835513e-06, + "loss": 0.2133, + "num_input_tokens_seen": 25615848, + "step": 13077 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 4.502918243408203, + "learning_rate": 1.8966175184420339e-06, + "loss": 0.1162, + "num_input_tokens_seen": 25617600, + "step": 13078 + }, + { + "epoch": 1.7334658714380384, + "grad_norm": 0.9794042110443115, + "learning_rate": 1.896280649234146e-06, + "loss": 0.0097, + "num_input_tokens_seen": 25619568, + "step": 13079 + }, + { + "epoch": 1.7335984095427435, + "grad_norm": 10.694192886352539, + "learning_rate": 1.8959437916663842e-06, + "loss": 0.2792, + "num_input_tokens_seen": 25621488, + "step": 13080 + }, + { + "epoch": 1.7337309476474485, + "grad_norm": 14.452932357788086, + "learning_rate": 1.8956069457452435e-06, + "loss": 0.2473, + "num_input_tokens_seen": 25623264, + "step": 13081 + }, + { + "epoch": 1.7338634857521538, + "grad_norm": 3.105473279953003, + "learning_rate": 1.895270111477217e-06, + "loss": 0.0947, + "num_input_tokens_seen": 25625224, + "step": 13082 + }, + { + "epoch": 1.7339960238568588, + "grad_norm": 2.924893617630005, + "learning_rate": 1.894933288868801e-06, + "loss": 0.0447, + "num_input_tokens_seen": 25628176, + "step": 13083 + }, + { + "epoch": 1.734128561961564, + "grad_norm": 0.10207723826169968, + "learning_rate": 1.8945964779264875e-06, + "loss": 0.0007, + "num_input_tokens_seen": 25629584, + "step": 13084 + }, + { + "epoch": 1.734261100066269, + "grad_norm": 2.798213481903076, + "learning_rate": 1.8942596786567718e-06, + "loss": 0.0541, + "num_input_tokens_seen": 25631152, + "step": 13085 + }, + { + "epoch": 1.7343936381709741, + "grad_norm": 11.149018287658691, + "learning_rate": 1.8939228910661475e-06, + "loss": 0.3583, + "num_input_tokens_seen": 25633720, + "step": 13086 + }, + { + "epoch": 1.7345261762756792, + "grad_norm": 8.458293914794922, + "learning_rate": 1.8935861151611085e-06, + "loss": 0.1327, + "num_input_tokens_seen": 25635400, + "step": 13087 + }, + { + "epoch": 1.7346587143803842, + "grad_norm": 4.468495845794678, + "learning_rate": 1.8932493509481467e-06, + "loss": 0.0839, + "num_input_tokens_seen": 25637768, + "step": 13088 + }, + { + "epoch": 1.7347912524850895, + "grad_norm": 7.832223892211914, + "learning_rate": 1.8929125984337568e-06, + "loss": 0.0785, + "num_input_tokens_seen": 25639840, + "step": 13089 + }, + { + "epoch": 1.7349237905897945, + "grad_norm": 0.06582982838153839, + "learning_rate": 1.8925758576244296e-06, + "loss": 0.0005, + "num_input_tokens_seen": 25641128, + "step": 13090 + }, + { + "epoch": 1.7350563286944998, + "grad_norm": 3.918466567993164, + "learning_rate": 1.8922391285266594e-06, + "loss": 0.1126, + "num_input_tokens_seen": 25643216, + "step": 13091 + }, + { + "epoch": 1.7351888667992048, + "grad_norm": 8.050939559936523, + "learning_rate": 1.8919024111469386e-06, + "loss": 0.1419, + "num_input_tokens_seen": 25646328, + "step": 13092 + }, + { + "epoch": 1.7353214049039098, + "grad_norm": 1.9532281160354614, + "learning_rate": 1.8915657054917592e-06, + "loss": 0.0326, + "num_input_tokens_seen": 25647792, + "step": 13093 + }, + { + "epoch": 1.7354539430086149, + "grad_norm": 17.39086151123047, + "learning_rate": 1.891229011567612e-06, + "loss": 0.4299, + "num_input_tokens_seen": 25650672, + "step": 13094 + }, + { + "epoch": 1.73558648111332, + "grad_norm": 11.847719192504883, + "learning_rate": 1.8908923293809886e-06, + "loss": 0.3618, + "num_input_tokens_seen": 25652352, + "step": 13095 + }, + { + "epoch": 1.7357190192180252, + "grad_norm": 1.2087526321411133, + "learning_rate": 1.8905556589383825e-06, + "loss": 0.0447, + "num_input_tokens_seen": 25654112, + "step": 13096 + }, + { + "epoch": 1.7358515573227304, + "grad_norm": 18.694190979003906, + "learning_rate": 1.8902190002462833e-06, + "loss": 0.3111, + "num_input_tokens_seen": 25655784, + "step": 13097 + }, + { + "epoch": 1.7359840954274355, + "grad_norm": 6.14501428604126, + "learning_rate": 1.889882353311183e-06, + "loss": 0.1424, + "num_input_tokens_seen": 25657400, + "step": 13098 + }, + { + "epoch": 1.7361166335321405, + "grad_norm": 1.7411673069000244, + "learning_rate": 1.889545718139571e-06, + "loss": 0.0089, + "num_input_tokens_seen": 25658872, + "step": 13099 + }, + { + "epoch": 1.7362491716368456, + "grad_norm": 0.4039137363433838, + "learning_rate": 1.8892090947379385e-06, + "loss": 0.0019, + "num_input_tokens_seen": 25660040, + "step": 13100 + }, + { + "epoch": 1.7363817097415506, + "grad_norm": 0.22721141576766968, + "learning_rate": 1.888872483112777e-06, + "loss": 0.0024, + "num_input_tokens_seen": 25661816, + "step": 13101 + }, + { + "epoch": 1.7365142478462559, + "grad_norm": 27.439496994018555, + "learning_rate": 1.8885358832705756e-06, + "loss": 0.3079, + "num_input_tokens_seen": 25663632, + "step": 13102 + }, + { + "epoch": 1.736646785950961, + "grad_norm": 10.742377281188965, + "learning_rate": 1.8881992952178235e-06, + "loss": 0.3334, + "num_input_tokens_seen": 25665832, + "step": 13103 + }, + { + "epoch": 1.7367793240556662, + "grad_norm": 6.057638645172119, + "learning_rate": 1.8878627189610118e-06, + "loss": 0.0926, + "num_input_tokens_seen": 25667472, + "step": 13104 + }, + { + "epoch": 1.7369118621603712, + "grad_norm": 1.8793964385986328, + "learning_rate": 1.8875261545066281e-06, + "loss": 0.014, + "num_input_tokens_seen": 25668928, + "step": 13105 + }, + { + "epoch": 1.7370444002650762, + "grad_norm": 8.9644136428833, + "learning_rate": 1.887189601861163e-06, + "loss": 0.126, + "num_input_tokens_seen": 25670928, + "step": 13106 + }, + { + "epoch": 1.7371769383697813, + "grad_norm": 4.533306121826172, + "learning_rate": 1.886853061031106e-06, + "loss": 0.1174, + "num_input_tokens_seen": 25673216, + "step": 13107 + }, + { + "epoch": 1.7373094764744863, + "grad_norm": 7.831053256988525, + "learning_rate": 1.8865165320229445e-06, + "loss": 0.0471, + "num_input_tokens_seen": 25674904, + "step": 13108 + }, + { + "epoch": 1.7374420145791916, + "grad_norm": 6.345348358154297, + "learning_rate": 1.8861800148431675e-06, + "loss": 0.0609, + "num_input_tokens_seen": 25676400, + "step": 13109 + }, + { + "epoch": 1.7375745526838966, + "grad_norm": 9.677519798278809, + "learning_rate": 1.8858435094982624e-06, + "loss": 0.1029, + "num_input_tokens_seen": 25678256, + "step": 13110 + }, + { + "epoch": 1.7377070907886019, + "grad_norm": 2.7851407527923584, + "learning_rate": 1.8855070159947192e-06, + "loss": 0.0266, + "num_input_tokens_seen": 25680216, + "step": 13111 + }, + { + "epoch": 1.737839628893307, + "grad_norm": 3.6120333671569824, + "learning_rate": 1.8851705343390241e-06, + "loss": 0.0361, + "num_input_tokens_seen": 25682280, + "step": 13112 + }, + { + "epoch": 1.737972166998012, + "grad_norm": 0.8822866678237915, + "learning_rate": 1.8848340645376659e-06, + "loss": 0.0057, + "num_input_tokens_seen": 25683912, + "step": 13113 + }, + { + "epoch": 1.738104705102717, + "grad_norm": 12.27357006072998, + "learning_rate": 1.8844976065971306e-06, + "loss": 0.1819, + "num_input_tokens_seen": 25685528, + "step": 13114 + }, + { + "epoch": 1.738237243207422, + "grad_norm": 0.29308661818504333, + "learning_rate": 1.8841611605239068e-06, + "loss": 0.0021, + "num_input_tokens_seen": 25687584, + "step": 13115 + }, + { + "epoch": 1.7383697813121273, + "grad_norm": 0.20190446078777313, + "learning_rate": 1.8838247263244797e-06, + "loss": 0.0014, + "num_input_tokens_seen": 25689856, + "step": 13116 + }, + { + "epoch": 1.7385023194168323, + "grad_norm": 0.26959624886512756, + "learning_rate": 1.8834883040053376e-06, + "loss": 0.002, + "num_input_tokens_seen": 25691384, + "step": 13117 + }, + { + "epoch": 1.7386348575215376, + "grad_norm": 0.5388979911804199, + "learning_rate": 1.8831518935729658e-06, + "loss": 0.0038, + "num_input_tokens_seen": 25693080, + "step": 13118 + }, + { + "epoch": 1.7387673956262426, + "grad_norm": 13.03867244720459, + "learning_rate": 1.882815495033852e-06, + "loss": 0.3087, + "num_input_tokens_seen": 25696168, + "step": 13119 + }, + { + "epoch": 1.7388999337309476, + "grad_norm": 3.3906145095825195, + "learning_rate": 1.8824791083944804e-06, + "loss": 0.0434, + "num_input_tokens_seen": 25698120, + "step": 13120 + }, + { + "epoch": 1.7390324718356527, + "grad_norm": 0.5830864310264587, + "learning_rate": 1.8821427336613373e-06, + "loss": 0.004, + "num_input_tokens_seen": 25700224, + "step": 13121 + }, + { + "epoch": 1.7391650099403577, + "grad_norm": 7.49354362487793, + "learning_rate": 1.8818063708409093e-06, + "loss": 0.1101, + "num_input_tokens_seen": 25701640, + "step": 13122 + }, + { + "epoch": 1.739297548045063, + "grad_norm": 16.447891235351562, + "learning_rate": 1.8814700199396813e-06, + "loss": 0.305, + "num_input_tokens_seen": 25703376, + "step": 13123 + }, + { + "epoch": 1.739430086149768, + "grad_norm": 1.42312753200531, + "learning_rate": 1.881133680964137e-06, + "loss": 0.0097, + "num_input_tokens_seen": 25705088, + "step": 13124 + }, + { + "epoch": 1.7395626242544733, + "grad_norm": 7.820713520050049, + "learning_rate": 1.880797353920763e-06, + "loss": 0.1402, + "num_input_tokens_seen": 25707064, + "step": 13125 + }, + { + "epoch": 1.7396951623591783, + "grad_norm": 3.5970516204833984, + "learning_rate": 1.8804610388160425e-06, + "loss": 0.0959, + "num_input_tokens_seen": 25709032, + "step": 13126 + }, + { + "epoch": 1.7398277004638834, + "grad_norm": 0.07361502200365067, + "learning_rate": 1.8801247356564605e-06, + "loss": 0.0005, + "num_input_tokens_seen": 25710504, + "step": 13127 + }, + { + "epoch": 1.7399602385685884, + "grad_norm": 0.124573715031147, + "learning_rate": 1.879788444448502e-06, + "loss": 0.0009, + "num_input_tokens_seen": 25713072, + "step": 13128 + }, + { + "epoch": 1.7400927766732934, + "grad_norm": 2.5478551387786865, + "learning_rate": 1.8794521651986497e-06, + "loss": 0.0204, + "num_input_tokens_seen": 25715048, + "step": 13129 + }, + { + "epoch": 1.7402253147779987, + "grad_norm": 0.17135240137577057, + "learning_rate": 1.8791158979133883e-06, + "loss": 0.0012, + "num_input_tokens_seen": 25716656, + "step": 13130 + }, + { + "epoch": 1.7403578528827037, + "grad_norm": 9.008345603942871, + "learning_rate": 1.8787796425991998e-06, + "loss": 0.2389, + "num_input_tokens_seen": 25719440, + "step": 13131 + }, + { + "epoch": 1.740490390987409, + "grad_norm": 4.280391216278076, + "learning_rate": 1.8784433992625689e-06, + "loss": 0.0587, + "num_input_tokens_seen": 25722336, + "step": 13132 + }, + { + "epoch": 1.740622929092114, + "grad_norm": 7.120774745941162, + "learning_rate": 1.878107167909978e-06, + "loss": 0.1713, + "num_input_tokens_seen": 25723928, + "step": 13133 + }, + { + "epoch": 1.740755467196819, + "grad_norm": 5.924616813659668, + "learning_rate": 1.8777709485479102e-06, + "loss": 0.056, + "num_input_tokens_seen": 25725840, + "step": 13134 + }, + { + "epoch": 1.740888005301524, + "grad_norm": 5.2308430671691895, + "learning_rate": 1.8774347411828472e-06, + "loss": 0.077, + "num_input_tokens_seen": 25728808, + "step": 13135 + }, + { + "epoch": 1.7410205434062291, + "grad_norm": 0.045758865773677826, + "learning_rate": 1.8770985458212714e-06, + "loss": 0.0003, + "num_input_tokens_seen": 25730144, + "step": 13136 + }, + { + "epoch": 1.7411530815109344, + "grad_norm": 15.992263793945312, + "learning_rate": 1.8767623624696669e-06, + "loss": 0.2872, + "num_input_tokens_seen": 25732384, + "step": 13137 + }, + { + "epoch": 1.7412856196156397, + "grad_norm": 12.600237846374512, + "learning_rate": 1.8764261911345128e-06, + "loss": 0.2052, + "num_input_tokens_seen": 25734296, + "step": 13138 + }, + { + "epoch": 1.7414181577203447, + "grad_norm": 6.148332118988037, + "learning_rate": 1.8760900318222927e-06, + "loss": 0.1472, + "num_input_tokens_seen": 25736088, + "step": 13139 + }, + { + "epoch": 1.7415506958250497, + "grad_norm": 0.052879493683576584, + "learning_rate": 1.875753884539487e-06, + "loss": 0.0004, + "num_input_tokens_seen": 25737416, + "step": 13140 + }, + { + "epoch": 1.7416832339297548, + "grad_norm": 10.572572708129883, + "learning_rate": 1.8754177492925762e-06, + "loss": 0.2063, + "num_input_tokens_seen": 25739088, + "step": 13141 + }, + { + "epoch": 1.7418157720344598, + "grad_norm": 0.04633089900016785, + "learning_rate": 1.8750816260880428e-06, + "loss": 0.0003, + "num_input_tokens_seen": 25740352, + "step": 13142 + }, + { + "epoch": 1.7419483101391648, + "grad_norm": 8.72778606414795, + "learning_rate": 1.874745514932367e-06, + "loss": 0.2329, + "num_input_tokens_seen": 25742000, + "step": 13143 + }, + { + "epoch": 1.74208084824387, + "grad_norm": 5.052142143249512, + "learning_rate": 1.8744094158320286e-06, + "loss": 0.0846, + "num_input_tokens_seen": 25743904, + "step": 13144 + }, + { + "epoch": 1.7422133863485754, + "grad_norm": 5.877135276794434, + "learning_rate": 1.8740733287935087e-06, + "loss": 0.1993, + "num_input_tokens_seen": 25745936, + "step": 13145 + }, + { + "epoch": 1.7423459244532804, + "grad_norm": 0.03169787675142288, + "learning_rate": 1.8737372538232867e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25747952, + "step": 13146 + }, + { + "epoch": 1.7424784625579854, + "grad_norm": 6.226495265960693, + "learning_rate": 1.8734011909278416e-06, + "loss": 0.0393, + "num_input_tokens_seen": 25749992, + "step": 13147 + }, + { + "epoch": 1.7426110006626905, + "grad_norm": 0.28707942366600037, + "learning_rate": 1.8730651401136545e-06, + "loss": 0.0018, + "num_input_tokens_seen": 25751616, + "step": 13148 + }, + { + "epoch": 1.7427435387673955, + "grad_norm": 16.98242950439453, + "learning_rate": 1.8727291013872043e-06, + "loss": 0.3908, + "num_input_tokens_seen": 25753624, + "step": 13149 + }, + { + "epoch": 1.7428760768721008, + "grad_norm": 23.877166748046875, + "learning_rate": 1.872393074754969e-06, + "loss": 0.1701, + "num_input_tokens_seen": 25755880, + "step": 13150 + }, + { + "epoch": 1.7430086149768058, + "grad_norm": 12.88788890838623, + "learning_rate": 1.8720570602234292e-06, + "loss": 0.3201, + "num_input_tokens_seen": 25758312, + "step": 13151 + }, + { + "epoch": 1.743141153081511, + "grad_norm": 0.09495490789413452, + "learning_rate": 1.8717210577990613e-06, + "loss": 0.0007, + "num_input_tokens_seen": 25760040, + "step": 13152 + }, + { + "epoch": 1.7432736911862161, + "grad_norm": 0.08636965602636337, + "learning_rate": 1.871385067488345e-06, + "loss": 0.0012, + "num_input_tokens_seen": 25761632, + "step": 13153 + }, + { + "epoch": 1.7434062292909212, + "grad_norm": 5.337892532348633, + "learning_rate": 1.871049089297759e-06, + "loss": 0.044, + "num_input_tokens_seen": 25763864, + "step": 13154 + }, + { + "epoch": 1.7435387673956262, + "grad_norm": 0.041656509041786194, + "learning_rate": 1.8707131232337805e-06, + "loss": 0.0003, + "num_input_tokens_seen": 25765960, + "step": 13155 + }, + { + "epoch": 1.7436713055003312, + "grad_norm": 0.0716482549905777, + "learning_rate": 1.8703771693028867e-06, + "loss": 0.0005, + "num_input_tokens_seen": 25767184, + "step": 13156 + }, + { + "epoch": 1.7438038436050365, + "grad_norm": 3.553178071975708, + "learning_rate": 1.8700412275115548e-06, + "loss": 0.0665, + "num_input_tokens_seen": 25768808, + "step": 13157 + }, + { + "epoch": 1.7439363817097415, + "grad_norm": 0.07221023738384247, + "learning_rate": 1.8697052978662637e-06, + "loss": 0.0005, + "num_input_tokens_seen": 25769976, + "step": 13158 + }, + { + "epoch": 1.7440689198144468, + "grad_norm": 4.6998515129089355, + "learning_rate": 1.8693693803734885e-06, + "loss": 0.0673, + "num_input_tokens_seen": 25772496, + "step": 13159 + }, + { + "epoch": 1.7442014579191518, + "grad_norm": 4.6739044189453125, + "learning_rate": 1.8690334750397076e-06, + "loss": 0.1554, + "num_input_tokens_seen": 25775072, + "step": 13160 + }, + { + "epoch": 1.7443339960238569, + "grad_norm": 0.10845929384231567, + "learning_rate": 1.8686975818713966e-06, + "loss": 0.0008, + "num_input_tokens_seen": 25777240, + "step": 13161 + }, + { + "epoch": 1.744466534128562, + "grad_norm": 0.8639370799064636, + "learning_rate": 1.86836170087503e-06, + "loss": 0.0073, + "num_input_tokens_seen": 25778984, + "step": 13162 + }, + { + "epoch": 1.744599072233267, + "grad_norm": 13.141666412353516, + "learning_rate": 1.8680258320570865e-06, + "loss": 0.3134, + "num_input_tokens_seen": 25781232, + "step": 13163 + }, + { + "epoch": 1.7447316103379722, + "grad_norm": 4.9684062004089355, + "learning_rate": 1.867689975424042e-06, + "loss": 0.0418, + "num_input_tokens_seen": 25782568, + "step": 13164 + }, + { + "epoch": 1.7448641484426772, + "grad_norm": 7.025899887084961, + "learning_rate": 1.8673541309823695e-06, + "loss": 0.1701, + "num_input_tokens_seen": 25784304, + "step": 13165 + }, + { + "epoch": 1.7449966865473825, + "grad_norm": 12.610382080078125, + "learning_rate": 1.867018298738547e-06, + "loss": 0.2313, + "num_input_tokens_seen": 25786864, + "step": 13166 + }, + { + "epoch": 1.7451292246520875, + "grad_norm": 15.220202445983887, + "learning_rate": 1.8666824786990472e-06, + "loss": 0.6231, + "num_input_tokens_seen": 25789456, + "step": 13167 + }, + { + "epoch": 1.7452617627567926, + "grad_norm": 0.42006996273994446, + "learning_rate": 1.8663466708703464e-06, + "loss": 0.0034, + "num_input_tokens_seen": 25790816, + "step": 13168 + }, + { + "epoch": 1.7453943008614976, + "grad_norm": 0.023408928886055946, + "learning_rate": 1.8660108752589195e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25792344, + "step": 13169 + }, + { + "epoch": 1.7455268389662026, + "grad_norm": 7.248992443084717, + "learning_rate": 1.8656750918712407e-06, + "loss": 0.1105, + "num_input_tokens_seen": 25794048, + "step": 13170 + }, + { + "epoch": 1.745659377070908, + "grad_norm": 12.495877265930176, + "learning_rate": 1.8653393207137827e-06, + "loss": 0.3723, + "num_input_tokens_seen": 25796568, + "step": 13171 + }, + { + "epoch": 1.745791915175613, + "grad_norm": 0.10930247604846954, + "learning_rate": 1.8650035617930213e-06, + "loss": 0.0007, + "num_input_tokens_seen": 25799536, + "step": 13172 + }, + { + "epoch": 1.7459244532803182, + "grad_norm": 1.6016252040863037, + "learning_rate": 1.8646678151154279e-06, + "loss": 0.0056, + "num_input_tokens_seen": 25801736, + "step": 13173 + }, + { + "epoch": 1.7460569913850232, + "grad_norm": 1.1544947624206543, + "learning_rate": 1.8643320806874782e-06, + "loss": 0.0204, + "num_input_tokens_seen": 25802848, + "step": 13174 + }, + { + "epoch": 1.7461895294897283, + "grad_norm": 5.832897663116455, + "learning_rate": 1.8639963585156448e-06, + "loss": 0.0876, + "num_input_tokens_seen": 25804440, + "step": 13175 + }, + { + "epoch": 1.7463220675944333, + "grad_norm": 5.867482662200928, + "learning_rate": 1.8636606486064004e-06, + "loss": 0.0873, + "num_input_tokens_seen": 25806144, + "step": 13176 + }, + { + "epoch": 1.7464546056991384, + "grad_norm": 12.540392875671387, + "learning_rate": 1.8633249509662171e-06, + "loss": 0.28, + "num_input_tokens_seen": 25808000, + "step": 13177 + }, + { + "epoch": 1.7465871438038436, + "grad_norm": 2.272188425064087, + "learning_rate": 1.8629892656015674e-06, + "loss": 0.0169, + "num_input_tokens_seen": 25810016, + "step": 13178 + }, + { + "epoch": 1.7467196819085489, + "grad_norm": 6.704890251159668, + "learning_rate": 1.8626535925189254e-06, + "loss": 0.0627, + "num_input_tokens_seen": 25812464, + "step": 13179 + }, + { + "epoch": 1.746852220013254, + "grad_norm": 4.896420001983643, + "learning_rate": 1.8623179317247613e-06, + "loss": 0.1029, + "num_input_tokens_seen": 25814416, + "step": 13180 + }, + { + "epoch": 1.746984758117959, + "grad_norm": 6.29615592956543, + "learning_rate": 1.861982283225548e-06, + "loss": 0.1936, + "num_input_tokens_seen": 25817056, + "step": 13181 + }, + { + "epoch": 1.747117296222664, + "grad_norm": 0.054206494241952896, + "learning_rate": 1.8616466470277556e-06, + "loss": 0.0004, + "num_input_tokens_seen": 25818648, + "step": 13182 + }, + { + "epoch": 1.747249834327369, + "grad_norm": 0.7271104454994202, + "learning_rate": 1.8613110231378556e-06, + "loss": 0.0051, + "num_input_tokens_seen": 25820040, + "step": 13183 + }, + { + "epoch": 1.747382372432074, + "grad_norm": 0.153843954205513, + "learning_rate": 1.860975411562321e-06, + "loss": 0.001, + "num_input_tokens_seen": 25821496, + "step": 13184 + }, + { + "epoch": 1.7475149105367793, + "grad_norm": 3.7120132446289062, + "learning_rate": 1.8606398123076214e-06, + "loss": 0.0163, + "num_input_tokens_seen": 25823304, + "step": 13185 + }, + { + "epoch": 1.7476474486414846, + "grad_norm": 0.08009041100740433, + "learning_rate": 1.8603042253802265e-06, + "loss": 0.0005, + "num_input_tokens_seen": 25824448, + "step": 13186 + }, + { + "epoch": 1.7477799867461896, + "grad_norm": 4.458845138549805, + "learning_rate": 1.859968650786608e-06, + "loss": 0.1101, + "num_input_tokens_seen": 25826256, + "step": 13187 + }, + { + "epoch": 1.7479125248508947, + "grad_norm": 23.582969665527344, + "learning_rate": 1.8596330885332343e-06, + "loss": 0.1904, + "num_input_tokens_seen": 25828232, + "step": 13188 + }, + { + "epoch": 1.7480450629555997, + "grad_norm": 0.07295103371143341, + "learning_rate": 1.859297538626577e-06, + "loss": 0.0005, + "num_input_tokens_seen": 25829760, + "step": 13189 + }, + { + "epoch": 1.7481776010603047, + "grad_norm": 1.8930844068527222, + "learning_rate": 1.858962001073106e-06, + "loss": 0.0168, + "num_input_tokens_seen": 25831504, + "step": 13190 + }, + { + "epoch": 1.74831013916501, + "grad_norm": 0.02087903395295143, + "learning_rate": 1.8586264758792891e-06, + "loss": 0.0001, + "num_input_tokens_seen": 25832832, + "step": 13191 + }, + { + "epoch": 1.748442677269715, + "grad_norm": 5.503264427185059, + "learning_rate": 1.8582909630515966e-06, + "loss": 0.0901, + "num_input_tokens_seen": 25834648, + "step": 13192 + }, + { + "epoch": 1.7485752153744203, + "grad_norm": 7.643683433532715, + "learning_rate": 1.8579554625964963e-06, + "loss": 0.1621, + "num_input_tokens_seen": 25836880, + "step": 13193 + }, + { + "epoch": 1.7487077534791253, + "grad_norm": 0.032397691160440445, + "learning_rate": 1.8576199745204584e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25839256, + "step": 13194 + }, + { + "epoch": 1.7488402915838304, + "grad_norm": 10.693230628967285, + "learning_rate": 1.8572844988299499e-06, + "loss": 0.0633, + "num_input_tokens_seen": 25841176, + "step": 13195 + }, + { + "epoch": 1.7489728296885354, + "grad_norm": 0.033294517546892166, + "learning_rate": 1.8569490355314406e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25842200, + "step": 13196 + }, + { + "epoch": 1.7491053677932404, + "grad_norm": 2.4439501762390137, + "learning_rate": 1.8566135846313964e-06, + "loss": 0.0182, + "num_input_tokens_seen": 25845208, + "step": 13197 + }, + { + "epoch": 1.7492379058979457, + "grad_norm": 4.2268171310424805, + "learning_rate": 1.8562781461362862e-06, + "loss": 0.0883, + "num_input_tokens_seen": 25847032, + "step": 13198 + }, + { + "epoch": 1.7493704440026507, + "grad_norm": 5.963039875030518, + "learning_rate": 1.8559427200525781e-06, + "loss": 0.0587, + "num_input_tokens_seen": 25849032, + "step": 13199 + }, + { + "epoch": 1.749502982107356, + "grad_norm": 4.655706405639648, + "learning_rate": 1.855607306386739e-06, + "loss": 0.0348, + "num_input_tokens_seen": 25851224, + "step": 13200 + }, + { + "epoch": 1.749635520212061, + "grad_norm": 1.0831652879714966, + "learning_rate": 1.8552719051452351e-06, + "loss": 0.0093, + "num_input_tokens_seen": 25854504, + "step": 13201 + }, + { + "epoch": 1.749768058316766, + "grad_norm": 2.2987921237945557, + "learning_rate": 1.8549365163345337e-06, + "loss": 0.0074, + "num_input_tokens_seen": 25856656, + "step": 13202 + }, + { + "epoch": 1.749900596421471, + "grad_norm": 4.251829624176025, + "learning_rate": 1.8546011399611008e-06, + "loss": 0.0359, + "num_input_tokens_seen": 25859024, + "step": 13203 + }, + { + "epoch": 1.7500331345261761, + "grad_norm": 0.08689743280410767, + "learning_rate": 1.854265776031403e-06, + "loss": 0.0005, + "num_input_tokens_seen": 25860296, + "step": 13204 + }, + { + "epoch": 1.7501656726308814, + "grad_norm": 0.07350675761699677, + "learning_rate": 1.8539304245519074e-06, + "loss": 0.0005, + "num_input_tokens_seen": 25861688, + "step": 13205 + }, + { + "epoch": 1.7502982107355864, + "grad_norm": 8.132402420043945, + "learning_rate": 1.8535950855290782e-06, + "loss": 0.117, + "num_input_tokens_seen": 25864864, + "step": 13206 + }, + { + "epoch": 1.7504307488402917, + "grad_norm": 15.59964656829834, + "learning_rate": 1.8532597589693824e-06, + "loss": 0.2806, + "num_input_tokens_seen": 25866456, + "step": 13207 + }, + { + "epoch": 1.7505632869449967, + "grad_norm": 5.976990222930908, + "learning_rate": 1.8529244448792846e-06, + "loss": 0.0977, + "num_input_tokens_seen": 25868176, + "step": 13208 + }, + { + "epoch": 1.7506958250497018, + "grad_norm": 0.10554125905036926, + "learning_rate": 1.8525891432652487e-06, + "loss": 0.0007, + "num_input_tokens_seen": 25871160, + "step": 13209 + }, + { + "epoch": 1.7508283631544068, + "grad_norm": 0.3990043103694916, + "learning_rate": 1.8522538541337416e-06, + "loss": 0.0039, + "num_input_tokens_seen": 25872864, + "step": 13210 + }, + { + "epoch": 1.7509609012591119, + "grad_norm": 3.1750261783599854, + "learning_rate": 1.8519185774912274e-06, + "loss": 0.0184, + "num_input_tokens_seen": 25874800, + "step": 13211 + }, + { + "epoch": 1.7510934393638171, + "grad_norm": 2.0271036624908447, + "learning_rate": 1.8515833133441693e-06, + "loss": 0.0138, + "num_input_tokens_seen": 25877088, + "step": 13212 + }, + { + "epoch": 1.7512259774685222, + "grad_norm": 3.635862350463867, + "learning_rate": 1.8512480616990331e-06, + "loss": 0.0726, + "num_input_tokens_seen": 25878552, + "step": 13213 + }, + { + "epoch": 1.7513585155732274, + "grad_norm": 6.329824447631836, + "learning_rate": 1.8509128225622809e-06, + "loss": 0.0854, + "num_input_tokens_seen": 25880456, + "step": 13214 + }, + { + "epoch": 1.7514910536779325, + "grad_norm": 0.08290381729602814, + "learning_rate": 1.850577595940378e-06, + "loss": 0.0007, + "num_input_tokens_seen": 25881848, + "step": 13215 + }, + { + "epoch": 1.7516235917826375, + "grad_norm": 4.596584796905518, + "learning_rate": 1.8502423818397866e-06, + "loss": 0.0818, + "num_input_tokens_seen": 25883976, + "step": 13216 + }, + { + "epoch": 1.7517561298873425, + "grad_norm": 0.05735500901937485, + "learning_rate": 1.849907180266971e-06, + "loss": 0.0004, + "num_input_tokens_seen": 25885832, + "step": 13217 + }, + { + "epoch": 1.7518886679920476, + "grad_norm": 7.78150749206543, + "learning_rate": 1.8495719912283927e-06, + "loss": 0.1335, + "num_input_tokens_seen": 25887232, + "step": 13218 + }, + { + "epoch": 1.7520212060967528, + "grad_norm": 3.030229330062866, + "learning_rate": 1.8492368147305146e-06, + "loss": 0.0542, + "num_input_tokens_seen": 25888656, + "step": 13219 + }, + { + "epoch": 1.752153744201458, + "grad_norm": 10.420186042785645, + "learning_rate": 1.8489016507798008e-06, + "loss": 0.1322, + "num_input_tokens_seen": 25890904, + "step": 13220 + }, + { + "epoch": 1.7522862823061631, + "grad_norm": 5.942259311676025, + "learning_rate": 1.8485664993827118e-06, + "loss": 0.0843, + "num_input_tokens_seen": 25892224, + "step": 13221 + }, + { + "epoch": 1.7524188204108682, + "grad_norm": 5.46798038482666, + "learning_rate": 1.848231360545711e-06, + "loss": 0.0505, + "num_input_tokens_seen": 25893960, + "step": 13222 + }, + { + "epoch": 1.7525513585155732, + "grad_norm": 14.25269889831543, + "learning_rate": 1.8478962342752584e-06, + "loss": 0.2394, + "num_input_tokens_seen": 25896312, + "step": 13223 + }, + { + "epoch": 1.7526838966202782, + "grad_norm": 0.0497421957552433, + "learning_rate": 1.8475611205778157e-06, + "loss": 0.0003, + "num_input_tokens_seen": 25898016, + "step": 13224 + }, + { + "epoch": 1.7528164347249833, + "grad_norm": 0.10333182662725449, + "learning_rate": 1.8472260194598452e-06, + "loss": 0.0006, + "num_input_tokens_seen": 25899432, + "step": 13225 + }, + { + "epoch": 1.7529489728296885, + "grad_norm": 18.27739715576172, + "learning_rate": 1.8468909309278077e-06, + "loss": 0.4308, + "num_input_tokens_seen": 25901824, + "step": 13226 + }, + { + "epoch": 1.7530815109343938, + "grad_norm": 4.959158420562744, + "learning_rate": 1.8465558549881634e-06, + "loss": 0.0749, + "num_input_tokens_seen": 25903376, + "step": 13227 + }, + { + "epoch": 1.7532140490390988, + "grad_norm": 8.135969161987305, + "learning_rate": 1.8462207916473734e-06, + "loss": 0.0701, + "num_input_tokens_seen": 25905720, + "step": 13228 + }, + { + "epoch": 1.7533465871438039, + "grad_norm": 10.232457160949707, + "learning_rate": 1.845885740911897e-06, + "loss": 0.1634, + "num_input_tokens_seen": 25908840, + "step": 13229 + }, + { + "epoch": 1.753479125248509, + "grad_norm": 3.797595739364624, + "learning_rate": 1.8455507027881947e-06, + "loss": 0.0386, + "num_input_tokens_seen": 25911160, + "step": 13230 + }, + { + "epoch": 1.753611663353214, + "grad_norm": 0.034186821430921555, + "learning_rate": 1.8452156772827262e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25912192, + "step": 13231 + }, + { + "epoch": 1.7537442014579192, + "grad_norm": 0.04699575528502464, + "learning_rate": 1.8448806644019521e-06, + "loss": 0.0003, + "num_input_tokens_seen": 25913832, + "step": 13232 + }, + { + "epoch": 1.7538767395626242, + "grad_norm": 0.11514517664909363, + "learning_rate": 1.8445456641523301e-06, + "loss": 0.0008, + "num_input_tokens_seen": 25915744, + "step": 13233 + }, + { + "epoch": 1.7540092776673295, + "grad_norm": 8.83983325958252, + "learning_rate": 1.84421067654032e-06, + "loss": 0.1713, + "num_input_tokens_seen": 25917528, + "step": 13234 + }, + { + "epoch": 1.7541418157720345, + "grad_norm": 0.04224582016468048, + "learning_rate": 1.8438757015723799e-06, + "loss": 0.0003, + "num_input_tokens_seen": 25918664, + "step": 13235 + }, + { + "epoch": 1.7542743538767396, + "grad_norm": 5.819526195526123, + "learning_rate": 1.8435407392549693e-06, + "loss": 0.1828, + "num_input_tokens_seen": 25920456, + "step": 13236 + }, + { + "epoch": 1.7544068919814446, + "grad_norm": 0.0651773065328598, + "learning_rate": 1.8432057895945465e-06, + "loss": 0.0004, + "num_input_tokens_seen": 25922968, + "step": 13237 + }, + { + "epoch": 1.7545394300861497, + "grad_norm": 7.067255020141602, + "learning_rate": 1.8428708525975691e-06, + "loss": 0.2533, + "num_input_tokens_seen": 25924688, + "step": 13238 + }, + { + "epoch": 1.754671968190855, + "grad_norm": 17.50310516357422, + "learning_rate": 1.8425359282704947e-06, + "loss": 0.5267, + "num_input_tokens_seen": 25926256, + "step": 13239 + }, + { + "epoch": 1.75480450629556, + "grad_norm": 0.031550925225019455, + "learning_rate": 1.8422010166197808e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25927696, + "step": 13240 + }, + { + "epoch": 1.7549370444002652, + "grad_norm": 0.14289146661758423, + "learning_rate": 1.8418661176518856e-06, + "loss": 0.001, + "num_input_tokens_seen": 25929728, + "step": 13241 + }, + { + "epoch": 1.7550695825049702, + "grad_norm": 5.97039794921875, + "learning_rate": 1.8415312313732654e-06, + "loss": 0.0255, + "num_input_tokens_seen": 25931208, + "step": 13242 + }, + { + "epoch": 1.7552021206096753, + "grad_norm": 11.62657642364502, + "learning_rate": 1.841196357790378e-06, + "loss": 0.3737, + "num_input_tokens_seen": 25933816, + "step": 13243 + }, + { + "epoch": 1.7553346587143803, + "grad_norm": 5.982364177703857, + "learning_rate": 1.8408614969096786e-06, + "loss": 0.1064, + "num_input_tokens_seen": 25935712, + "step": 13244 + }, + { + "epoch": 1.7554671968190854, + "grad_norm": 1.9968152046203613, + "learning_rate": 1.840526648737624e-06, + "loss": 0.0057, + "num_input_tokens_seen": 25937248, + "step": 13245 + }, + { + "epoch": 1.7555997349237906, + "grad_norm": 13.09157943725586, + "learning_rate": 1.8401918132806712e-06, + "loss": 0.1437, + "num_input_tokens_seen": 25940280, + "step": 13246 + }, + { + "epoch": 1.7557322730284957, + "grad_norm": 7.104730129241943, + "learning_rate": 1.8398569905452758e-06, + "loss": 0.1337, + "num_input_tokens_seen": 25942664, + "step": 13247 + }, + { + "epoch": 1.755864811133201, + "grad_norm": 6.378207206726074, + "learning_rate": 1.839522180537892e-06, + "loss": 0.1762, + "num_input_tokens_seen": 25945864, + "step": 13248 + }, + { + "epoch": 1.755997349237906, + "grad_norm": 0.8392379879951477, + "learning_rate": 1.839187383264977e-06, + "loss": 0.0068, + "num_input_tokens_seen": 25947384, + "step": 13249 + }, + { + "epoch": 1.756129887342611, + "grad_norm": 3.1426799297332764, + "learning_rate": 1.838852598732984e-06, + "loss": 0.0446, + "num_input_tokens_seen": 25948784, + "step": 13250 + }, + { + "epoch": 1.756262425447316, + "grad_norm": 9.247245788574219, + "learning_rate": 1.83851782694837e-06, + "loss": 0.1077, + "num_input_tokens_seen": 25950808, + "step": 13251 + }, + { + "epoch": 1.756394963552021, + "grad_norm": 3.5976529121398926, + "learning_rate": 1.8381830679175888e-06, + "loss": 0.08, + "num_input_tokens_seen": 25952800, + "step": 13252 + }, + { + "epoch": 1.7565275016567263, + "grad_norm": 9.186699867248535, + "learning_rate": 1.8378483216470947e-06, + "loss": 0.1164, + "num_input_tokens_seen": 25954840, + "step": 13253 + }, + { + "epoch": 1.7566600397614314, + "grad_norm": 1.1243354082107544, + "learning_rate": 1.837513588143341e-06, + "loss": 0.0089, + "num_input_tokens_seen": 25957080, + "step": 13254 + }, + { + "epoch": 1.7567925778661366, + "grad_norm": 3.6414906978607178, + "learning_rate": 1.837178867412782e-06, + "loss": 0.1142, + "num_input_tokens_seen": 25959032, + "step": 13255 + }, + { + "epoch": 1.7569251159708417, + "grad_norm": 8.727334022521973, + "learning_rate": 1.8368441594618728e-06, + "loss": 0.1718, + "num_input_tokens_seen": 25961208, + "step": 13256 + }, + { + "epoch": 1.7570576540755467, + "grad_norm": 10.041175842285156, + "learning_rate": 1.836509464297065e-06, + "loss": 0.3581, + "num_input_tokens_seen": 25963264, + "step": 13257 + }, + { + "epoch": 1.7571901921802517, + "grad_norm": 6.096699237823486, + "learning_rate": 1.8361747819248132e-06, + "loss": 0.1799, + "num_input_tokens_seen": 25964760, + "step": 13258 + }, + { + "epoch": 1.7573227302849568, + "grad_norm": 10.178114891052246, + "learning_rate": 1.8358401123515687e-06, + "loss": 0.1833, + "num_input_tokens_seen": 25966856, + "step": 13259 + }, + { + "epoch": 1.757455268389662, + "grad_norm": 8.405535697937012, + "learning_rate": 1.835505455583786e-06, + "loss": 0.2639, + "num_input_tokens_seen": 25969736, + "step": 13260 + }, + { + "epoch": 1.757587806494367, + "grad_norm": 0.2606665790081024, + "learning_rate": 1.835170811627915e-06, + "loss": 0.0017, + "num_input_tokens_seen": 25971208, + "step": 13261 + }, + { + "epoch": 1.7577203445990723, + "grad_norm": 11.175729751586914, + "learning_rate": 1.8348361804904108e-06, + "loss": 0.1123, + "num_input_tokens_seen": 25972960, + "step": 13262 + }, + { + "epoch": 1.7578528827037774, + "grad_norm": 3.448608636856079, + "learning_rate": 1.8345015621777229e-06, + "loss": 0.1087, + "num_input_tokens_seen": 25974592, + "step": 13263 + }, + { + "epoch": 1.7579854208084824, + "grad_norm": 11.325621604919434, + "learning_rate": 1.8341669566963049e-06, + "loss": 0.358, + "num_input_tokens_seen": 25976800, + "step": 13264 + }, + { + "epoch": 1.7581179589131875, + "grad_norm": 2.2079126834869385, + "learning_rate": 1.8338323640526067e-06, + "loss": 0.0143, + "num_input_tokens_seen": 25978360, + "step": 13265 + }, + { + "epoch": 1.7582504970178925, + "grad_norm": 0.7914609313011169, + "learning_rate": 1.8334977842530796e-06, + "loss": 0.0049, + "num_input_tokens_seen": 25980064, + "step": 13266 + }, + { + "epoch": 1.7583830351225977, + "grad_norm": 6.6813530921936035, + "learning_rate": 1.833163217304176e-06, + "loss": 0.0539, + "num_input_tokens_seen": 25981752, + "step": 13267 + }, + { + "epoch": 1.758515573227303, + "grad_norm": 1.5735331773757935, + "learning_rate": 1.8328286632123456e-06, + "loss": 0.0162, + "num_input_tokens_seen": 25983608, + "step": 13268 + }, + { + "epoch": 1.758648111332008, + "grad_norm": 10.642547607421875, + "learning_rate": 1.8324941219840386e-06, + "loss": 0.0702, + "num_input_tokens_seen": 25985752, + "step": 13269 + }, + { + "epoch": 1.758780649436713, + "grad_norm": 0.03585246950387955, + "learning_rate": 1.8321595936257056e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25987496, + "step": 13270 + }, + { + "epoch": 1.7589131875414181, + "grad_norm": 4.550493240356445, + "learning_rate": 1.8318250781437954e-06, + "loss": 0.0845, + "num_input_tokens_seen": 25989600, + "step": 13271 + }, + { + "epoch": 1.7590457256461232, + "grad_norm": 0.033398739993572235, + "learning_rate": 1.8314905755447593e-06, + "loss": 0.0002, + "num_input_tokens_seen": 25991584, + "step": 13272 + }, + { + "epoch": 1.7591782637508282, + "grad_norm": 0.43674394488334656, + "learning_rate": 1.8311560858350464e-06, + "loss": 0.0031, + "num_input_tokens_seen": 25994128, + "step": 13273 + }, + { + "epoch": 1.7593108018555335, + "grad_norm": 3.791036367416382, + "learning_rate": 1.8308216090211051e-06, + "loss": 0.0377, + "num_input_tokens_seen": 25996496, + "step": 13274 + }, + { + "epoch": 1.7594433399602387, + "grad_norm": 2.650909185409546, + "learning_rate": 1.8304871451093854e-06, + "loss": 0.0178, + "num_input_tokens_seen": 25999168, + "step": 13275 + }, + { + "epoch": 1.7595758780649438, + "grad_norm": 9.707881927490234, + "learning_rate": 1.8301526941063342e-06, + "loss": 0.2642, + "num_input_tokens_seen": 26001224, + "step": 13276 + }, + { + "epoch": 1.7597084161696488, + "grad_norm": 4.185418128967285, + "learning_rate": 1.8298182560184024e-06, + "loss": 0.0634, + "num_input_tokens_seen": 26002856, + "step": 13277 + }, + { + "epoch": 1.7598409542743538, + "grad_norm": 0.6371215581893921, + "learning_rate": 1.8294838308520364e-06, + "loss": 0.0051, + "num_input_tokens_seen": 26004864, + "step": 13278 + }, + { + "epoch": 1.7599734923790589, + "grad_norm": 0.11040137708187103, + "learning_rate": 1.8291494186136854e-06, + "loss": 0.0008, + "num_input_tokens_seen": 26007696, + "step": 13279 + }, + { + "epoch": 1.7601060304837641, + "grad_norm": 5.452227592468262, + "learning_rate": 1.8288150193097957e-06, + "loss": 0.0456, + "num_input_tokens_seen": 26009728, + "step": 13280 + }, + { + "epoch": 1.7602385685884692, + "grad_norm": 0.08406528830528259, + "learning_rate": 1.8284806329468147e-06, + "loss": 0.0006, + "num_input_tokens_seen": 26011696, + "step": 13281 + }, + { + "epoch": 1.7603711066931744, + "grad_norm": 9.989354133605957, + "learning_rate": 1.8281462595311916e-06, + "loss": 0.2924, + "num_input_tokens_seen": 26014136, + "step": 13282 + }, + { + "epoch": 1.7605036447978795, + "grad_norm": 3.976863145828247, + "learning_rate": 1.8278118990693715e-06, + "loss": 0.133, + "num_input_tokens_seen": 26016520, + "step": 13283 + }, + { + "epoch": 1.7606361829025845, + "grad_norm": 6.685466766357422, + "learning_rate": 1.8274775515678025e-06, + "loss": 0.1611, + "num_input_tokens_seen": 26018384, + "step": 13284 + }, + { + "epoch": 1.7607687210072895, + "grad_norm": 7.333414077758789, + "learning_rate": 1.8271432170329301e-06, + "loss": 0.1233, + "num_input_tokens_seen": 26020448, + "step": 13285 + }, + { + "epoch": 1.7609012591119946, + "grad_norm": 0.15389417111873627, + "learning_rate": 1.8268088954712002e-06, + "loss": 0.0009, + "num_input_tokens_seen": 26021984, + "step": 13286 + }, + { + "epoch": 1.7610337972166998, + "grad_norm": 0.13992318511009216, + "learning_rate": 1.8264745868890588e-06, + "loss": 0.0009, + "num_input_tokens_seen": 26024240, + "step": 13287 + }, + { + "epoch": 1.7611663353214049, + "grad_norm": 6.342142105102539, + "learning_rate": 1.8261402912929526e-06, + "loss": 0.0304, + "num_input_tokens_seen": 26025840, + "step": 13288 + }, + { + "epoch": 1.7612988734261101, + "grad_norm": 4.117879390716553, + "learning_rate": 1.8258060086893264e-06, + "loss": 0.0647, + "num_input_tokens_seen": 26028048, + "step": 13289 + }, + { + "epoch": 1.7614314115308152, + "grad_norm": 13.412225723266602, + "learning_rate": 1.8254717390846256e-06, + "loss": 0.1972, + "num_input_tokens_seen": 26029712, + "step": 13290 + }, + { + "epoch": 1.7615639496355202, + "grad_norm": 0.11279463022947311, + "learning_rate": 1.8251374824852952e-06, + "loss": 0.0014, + "num_input_tokens_seen": 26031128, + "step": 13291 + }, + { + "epoch": 1.7616964877402252, + "grad_norm": 6.854719161987305, + "learning_rate": 1.8248032388977789e-06, + "loss": 0.0356, + "num_input_tokens_seen": 26032952, + "step": 13292 + }, + { + "epoch": 1.7618290258449303, + "grad_norm": 7.266012668609619, + "learning_rate": 1.8244690083285222e-06, + "loss": 0.1225, + "num_input_tokens_seen": 26035016, + "step": 13293 + }, + { + "epoch": 1.7619615639496355, + "grad_norm": 3.237112045288086, + "learning_rate": 1.8241347907839693e-06, + "loss": 0.0533, + "num_input_tokens_seen": 26037192, + "step": 13294 + }, + { + "epoch": 1.7620941020543406, + "grad_norm": 0.10018216073513031, + "learning_rate": 1.8238005862705637e-06, + "loss": 0.0007, + "num_input_tokens_seen": 26038728, + "step": 13295 + }, + { + "epoch": 1.7622266401590458, + "grad_norm": 0.10982474684715271, + "learning_rate": 1.8234663947947495e-06, + "loss": 0.0009, + "num_input_tokens_seen": 26040496, + "step": 13296 + }, + { + "epoch": 1.7623591782637509, + "grad_norm": 6.79662561416626, + "learning_rate": 1.8231322163629694e-06, + "loss": 0.1501, + "num_input_tokens_seen": 26042608, + "step": 13297 + }, + { + "epoch": 1.762491716368456, + "grad_norm": 0.019663462415337563, + "learning_rate": 1.8227980509816672e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26043832, + "step": 13298 + }, + { + "epoch": 1.762624254473161, + "grad_norm": 4.207809925079346, + "learning_rate": 1.8224638986572866e-06, + "loss": 0.0248, + "num_input_tokens_seen": 26046776, + "step": 13299 + }, + { + "epoch": 1.762756792577866, + "grad_norm": 6.082076549530029, + "learning_rate": 1.8221297593962693e-06, + "loss": 0.0332, + "num_input_tokens_seen": 26048992, + "step": 13300 + }, + { + "epoch": 1.7628893306825713, + "grad_norm": 5.609279632568359, + "learning_rate": 1.8217956332050574e-06, + "loss": 0.0613, + "num_input_tokens_seen": 26050976, + "step": 13301 + }, + { + "epoch": 1.7630218687872763, + "grad_norm": 0.029026927426457405, + "learning_rate": 1.8214615200900928e-06, + "loss": 0.0002, + "num_input_tokens_seen": 26052616, + "step": 13302 + }, + { + "epoch": 1.7631544068919816, + "grad_norm": 8.886504173278809, + "learning_rate": 1.8211274200578195e-06, + "loss": 0.164, + "num_input_tokens_seen": 26054576, + "step": 13303 + }, + { + "epoch": 1.7632869449966866, + "grad_norm": 5.733160972595215, + "learning_rate": 1.820793333114677e-06, + "loss": 0.0919, + "num_input_tokens_seen": 26057008, + "step": 13304 + }, + { + "epoch": 1.7634194831013916, + "grad_norm": 3.641957998275757, + "learning_rate": 1.8204592592671086e-06, + "loss": 0.0445, + "num_input_tokens_seen": 26058720, + "step": 13305 + }, + { + "epoch": 1.7635520212060967, + "grad_norm": 12.187167167663574, + "learning_rate": 1.8201251985215543e-06, + "loss": 0.3685, + "num_input_tokens_seen": 26060536, + "step": 13306 + }, + { + "epoch": 1.7636845593108017, + "grad_norm": 2.0413999557495117, + "learning_rate": 1.819791150884454e-06, + "loss": 0.0231, + "num_input_tokens_seen": 26062424, + "step": 13307 + }, + { + "epoch": 1.763817097415507, + "grad_norm": 0.06175190210342407, + "learning_rate": 1.8194571163622501e-06, + "loss": 0.0004, + "num_input_tokens_seen": 26064480, + "step": 13308 + }, + { + "epoch": 1.7639496355202122, + "grad_norm": 9.745804786682129, + "learning_rate": 1.819123094961383e-06, + "loss": 0.2713, + "num_input_tokens_seen": 26066920, + "step": 13309 + }, + { + "epoch": 1.7640821736249173, + "grad_norm": 5.513734340667725, + "learning_rate": 1.818789086688292e-06, + "loss": 0.0445, + "num_input_tokens_seen": 26069264, + "step": 13310 + }, + { + "epoch": 1.7642147117296223, + "grad_norm": 9.865388870239258, + "learning_rate": 1.8184550915494178e-06, + "loss": 0.1193, + "num_input_tokens_seen": 26072400, + "step": 13311 + }, + { + "epoch": 1.7643472498343273, + "grad_norm": 0.043767258524894714, + "learning_rate": 1.8181211095511987e-06, + "loss": 0.0003, + "num_input_tokens_seen": 26073728, + "step": 13312 + }, + { + "epoch": 1.7644797879390324, + "grad_norm": 7.026047706604004, + "learning_rate": 1.8177871407000752e-06, + "loss": 0.1611, + "num_input_tokens_seen": 26075376, + "step": 13313 + }, + { + "epoch": 1.7646123260437374, + "grad_norm": 6.628903865814209, + "learning_rate": 1.8174531850024872e-06, + "loss": 0.1508, + "num_input_tokens_seen": 26077440, + "step": 13314 + }, + { + "epoch": 1.7647448641484427, + "grad_norm": 8.901960372924805, + "learning_rate": 1.8171192424648723e-06, + "loss": 0.1671, + "num_input_tokens_seen": 26078960, + "step": 13315 + }, + { + "epoch": 1.764877402253148, + "grad_norm": 4.426534652709961, + "learning_rate": 1.816785313093669e-06, + "loss": 0.129, + "num_input_tokens_seen": 26080512, + "step": 13316 + }, + { + "epoch": 1.765009940357853, + "grad_norm": 4.744714736938477, + "learning_rate": 1.8164513968953168e-06, + "loss": 0.0673, + "num_input_tokens_seen": 26081912, + "step": 13317 + }, + { + "epoch": 1.765142478462558, + "grad_norm": 8.981192588806152, + "learning_rate": 1.816117493876252e-06, + "loss": 0.2466, + "num_input_tokens_seen": 26084656, + "step": 13318 + }, + { + "epoch": 1.765275016567263, + "grad_norm": 0.03272116929292679, + "learning_rate": 1.8157836040429148e-06, + "loss": 0.0002, + "num_input_tokens_seen": 26085960, + "step": 13319 + }, + { + "epoch": 1.765407554671968, + "grad_norm": 1.422000527381897, + "learning_rate": 1.8154497274017419e-06, + "loss": 0.0131, + "num_input_tokens_seen": 26088224, + "step": 13320 + }, + { + "epoch": 1.7655400927766733, + "grad_norm": 2.6446731090545654, + "learning_rate": 1.8151158639591704e-06, + "loss": 0.0217, + "num_input_tokens_seen": 26090296, + "step": 13321 + }, + { + "epoch": 1.7656726308813784, + "grad_norm": 0.13455215096473694, + "learning_rate": 1.8147820137216372e-06, + "loss": 0.0009, + "num_input_tokens_seen": 26092144, + "step": 13322 + }, + { + "epoch": 1.7658051689860836, + "grad_norm": 0.1676560491323471, + "learning_rate": 1.8144481766955786e-06, + "loss": 0.0011, + "num_input_tokens_seen": 26094824, + "step": 13323 + }, + { + "epoch": 1.7659377070907887, + "grad_norm": 0.08842529356479645, + "learning_rate": 1.8141143528874333e-06, + "loss": 0.0006, + "num_input_tokens_seen": 26096760, + "step": 13324 + }, + { + "epoch": 1.7660702451954937, + "grad_norm": 0.8473754525184631, + "learning_rate": 1.8137805423036362e-06, + "loss": 0.0069, + "num_input_tokens_seen": 26098376, + "step": 13325 + }, + { + "epoch": 1.7662027833001988, + "grad_norm": 9.894218444824219, + "learning_rate": 1.8134467449506238e-06, + "loss": 0.1822, + "num_input_tokens_seen": 26100976, + "step": 13326 + }, + { + "epoch": 1.7663353214049038, + "grad_norm": 2.4969701766967773, + "learning_rate": 1.8131129608348313e-06, + "loss": 0.0435, + "num_input_tokens_seen": 26103296, + "step": 13327 + }, + { + "epoch": 1.766467859509609, + "grad_norm": 12.712018966674805, + "learning_rate": 1.8127791899626945e-06, + "loss": 0.0785, + "num_input_tokens_seen": 26105696, + "step": 13328 + }, + { + "epoch": 1.766600397614314, + "grad_norm": 6.921119213104248, + "learning_rate": 1.8124454323406498e-06, + "loss": 0.2801, + "num_input_tokens_seen": 26107520, + "step": 13329 + }, + { + "epoch": 1.7667329357190193, + "grad_norm": 0.08384755253791809, + "learning_rate": 1.8121116879751318e-06, + "loss": 0.0006, + "num_input_tokens_seen": 26109392, + "step": 13330 + }, + { + "epoch": 1.7668654738237244, + "grad_norm": 8.647369384765625, + "learning_rate": 1.8117779568725744e-06, + "loss": 0.1132, + "num_input_tokens_seen": 26111032, + "step": 13331 + }, + { + "epoch": 1.7669980119284294, + "grad_norm": 7.249828815460205, + "learning_rate": 1.8114442390394133e-06, + "loss": 0.102, + "num_input_tokens_seen": 26112808, + "step": 13332 + }, + { + "epoch": 1.7671305500331345, + "grad_norm": 11.361310958862305, + "learning_rate": 1.811110534482081e-06, + "loss": 0.2261, + "num_input_tokens_seen": 26114904, + "step": 13333 + }, + { + "epoch": 1.7672630881378395, + "grad_norm": 6.145482540130615, + "learning_rate": 1.8107768432070132e-06, + "loss": 0.1693, + "num_input_tokens_seen": 26117480, + "step": 13334 + }, + { + "epoch": 1.7673956262425448, + "grad_norm": 0.0804702639579773, + "learning_rate": 1.8104431652206445e-06, + "loss": 0.0006, + "num_input_tokens_seen": 26119056, + "step": 13335 + }, + { + "epoch": 1.7675281643472498, + "grad_norm": 5.249117851257324, + "learning_rate": 1.8101095005294062e-06, + "loss": 0.1283, + "num_input_tokens_seen": 26121800, + "step": 13336 + }, + { + "epoch": 1.767660702451955, + "grad_norm": 10.45899772644043, + "learning_rate": 1.8097758491397333e-06, + "loss": 0.0526, + "num_input_tokens_seen": 26123496, + "step": 13337 + }, + { + "epoch": 1.76779324055666, + "grad_norm": 4.932031631469727, + "learning_rate": 1.8094422110580572e-06, + "loss": 0.0606, + "num_input_tokens_seen": 26125240, + "step": 13338 + }, + { + "epoch": 1.7679257786613651, + "grad_norm": 23.031370162963867, + "learning_rate": 1.8091085862908129e-06, + "loss": 0.5152, + "num_input_tokens_seen": 26127216, + "step": 13339 + }, + { + "epoch": 1.7680583167660702, + "grad_norm": 14.864603996276855, + "learning_rate": 1.808774974844431e-06, + "loss": 0.147, + "num_input_tokens_seen": 26128664, + "step": 13340 + }, + { + "epoch": 1.7681908548707752, + "grad_norm": 0.20994803309440613, + "learning_rate": 1.808441376725345e-06, + "loss": 0.0014, + "num_input_tokens_seen": 26131368, + "step": 13341 + }, + { + "epoch": 1.7683233929754805, + "grad_norm": 7.267364978790283, + "learning_rate": 1.8081077919399856e-06, + "loss": 0.1685, + "num_input_tokens_seen": 26132968, + "step": 13342 + }, + { + "epoch": 1.7684559310801855, + "grad_norm": 6.2768025398254395, + "learning_rate": 1.807774220494786e-06, + "loss": 0.1362, + "num_input_tokens_seen": 26135368, + "step": 13343 + }, + { + "epoch": 1.7685884691848908, + "grad_norm": 0.21776196360588074, + "learning_rate": 1.807440662396176e-06, + "loss": 0.0009, + "num_input_tokens_seen": 26136968, + "step": 13344 + }, + { + "epoch": 1.7687210072895958, + "grad_norm": 13.371339797973633, + "learning_rate": 1.8071071176505885e-06, + "loss": 0.2396, + "num_input_tokens_seen": 26139048, + "step": 13345 + }, + { + "epoch": 1.7688535453943008, + "grad_norm": 0.07095903903245926, + "learning_rate": 1.8067735862644538e-06, + "loss": 0.0005, + "num_input_tokens_seen": 26140208, + "step": 13346 + }, + { + "epoch": 1.7689860834990059, + "grad_norm": 2.3926947116851807, + "learning_rate": 1.8064400682442026e-06, + "loss": 0.0162, + "num_input_tokens_seen": 26141832, + "step": 13347 + }, + { + "epoch": 1.769118621603711, + "grad_norm": 11.18416976928711, + "learning_rate": 1.806106563596265e-06, + "loss": 0.1153, + "num_input_tokens_seen": 26144008, + "step": 13348 + }, + { + "epoch": 1.7692511597084162, + "grad_norm": 0.2509409487247467, + "learning_rate": 1.805773072327071e-06, + "loss": 0.0023, + "num_input_tokens_seen": 26145536, + "step": 13349 + }, + { + "epoch": 1.7693836978131214, + "grad_norm": 0.05051880329847336, + "learning_rate": 1.8054395944430522e-06, + "loss": 0.0004, + "num_input_tokens_seen": 26147216, + "step": 13350 + }, + { + "epoch": 1.7695162359178265, + "grad_norm": 11.642457008361816, + "learning_rate": 1.8051061299506367e-06, + "loss": 0.1048, + "num_input_tokens_seen": 26148696, + "step": 13351 + }, + { + "epoch": 1.7696487740225315, + "grad_norm": 5.685125350952148, + "learning_rate": 1.8047726788562547e-06, + "loss": 0.1111, + "num_input_tokens_seen": 26150272, + "step": 13352 + }, + { + "epoch": 1.7697813121272365, + "grad_norm": 0.08838338404893875, + "learning_rate": 1.8044392411663352e-06, + "loss": 0.0009, + "num_input_tokens_seen": 26152208, + "step": 13353 + }, + { + "epoch": 1.7699138502319416, + "grad_norm": 10.367325782775879, + "learning_rate": 1.804105816887306e-06, + "loss": 0.159, + "num_input_tokens_seen": 26153952, + "step": 13354 + }, + { + "epoch": 1.7700463883366466, + "grad_norm": 9.181132316589355, + "learning_rate": 1.8037724060255974e-06, + "loss": 0.2002, + "num_input_tokens_seen": 26156168, + "step": 13355 + }, + { + "epoch": 1.7701789264413519, + "grad_norm": 0.7446281909942627, + "learning_rate": 1.8034390085876374e-06, + "loss": 0.0051, + "num_input_tokens_seen": 26158112, + "step": 13356 + }, + { + "epoch": 1.7703114645460571, + "grad_norm": 3.625669240951538, + "learning_rate": 1.8031056245798534e-06, + "loss": 0.0501, + "num_input_tokens_seen": 26159800, + "step": 13357 + }, + { + "epoch": 1.7704440026507622, + "grad_norm": 6.587719440460205, + "learning_rate": 1.802772254008674e-06, + "loss": 0.1169, + "num_input_tokens_seen": 26162200, + "step": 13358 + }, + { + "epoch": 1.7705765407554672, + "grad_norm": 8.141780853271484, + "learning_rate": 1.8024388968805256e-06, + "loss": 0.2387, + "num_input_tokens_seen": 26164584, + "step": 13359 + }, + { + "epoch": 1.7707090788601723, + "grad_norm": 0.7558437585830688, + "learning_rate": 1.8021055532018378e-06, + "loss": 0.0027, + "num_input_tokens_seen": 26165920, + "step": 13360 + }, + { + "epoch": 1.7708416169648773, + "grad_norm": 7.009064674377441, + "learning_rate": 1.8017722229790357e-06, + "loss": 0.0551, + "num_input_tokens_seen": 26168640, + "step": 13361 + }, + { + "epoch": 1.7709741550695826, + "grad_norm": 11.312932968139648, + "learning_rate": 1.8014389062185471e-06, + "loss": 0.2057, + "num_input_tokens_seen": 26170904, + "step": 13362 + }, + { + "epoch": 1.7711066931742876, + "grad_norm": 8.712352752685547, + "learning_rate": 1.8011056029267982e-06, + "loss": 0.1786, + "num_input_tokens_seen": 26173232, + "step": 13363 + }, + { + "epoch": 1.7712392312789929, + "grad_norm": 1.4643484354019165, + "learning_rate": 1.8007723131102145e-06, + "loss": 0.0031, + "num_input_tokens_seen": 26175232, + "step": 13364 + }, + { + "epoch": 1.771371769383698, + "grad_norm": 5.197396278381348, + "learning_rate": 1.8004390367752246e-06, + "loss": 0.1762, + "num_input_tokens_seen": 26176536, + "step": 13365 + }, + { + "epoch": 1.771504307488403, + "grad_norm": 5.54130220413208, + "learning_rate": 1.8001057739282516e-06, + "loss": 0.0538, + "num_input_tokens_seen": 26177944, + "step": 13366 + }, + { + "epoch": 1.771636845593108, + "grad_norm": 2.771286725997925, + "learning_rate": 1.7997725245757228e-06, + "loss": 0.0386, + "num_input_tokens_seen": 26180376, + "step": 13367 + }, + { + "epoch": 1.771769383697813, + "grad_norm": 2.1708176136016846, + "learning_rate": 1.7994392887240627e-06, + "loss": 0.0346, + "num_input_tokens_seen": 26182280, + "step": 13368 + }, + { + "epoch": 1.7719019218025183, + "grad_norm": 8.870991706848145, + "learning_rate": 1.7991060663796955e-06, + "loss": 0.1807, + "num_input_tokens_seen": 26184008, + "step": 13369 + }, + { + "epoch": 1.7720344599072233, + "grad_norm": 0.03824019432067871, + "learning_rate": 1.7987728575490473e-06, + "loss": 0.0003, + "num_input_tokens_seen": 26185960, + "step": 13370 + }, + { + "epoch": 1.7721669980119286, + "grad_norm": 0.05623386800289154, + "learning_rate": 1.7984396622385428e-06, + "loss": 0.0003, + "num_input_tokens_seen": 26187536, + "step": 13371 + }, + { + "epoch": 1.7722995361166336, + "grad_norm": 3.9962210655212402, + "learning_rate": 1.7981064804546051e-06, + "loss": 0.0338, + "num_input_tokens_seen": 26188712, + "step": 13372 + }, + { + "epoch": 1.7724320742213386, + "grad_norm": 1.25719153881073, + "learning_rate": 1.7977733122036588e-06, + "loss": 0.022, + "num_input_tokens_seen": 26191064, + "step": 13373 + }, + { + "epoch": 1.7725646123260437, + "grad_norm": 3.2366583347320557, + "learning_rate": 1.797440157492128e-06, + "loss": 0.0264, + "num_input_tokens_seen": 26193072, + "step": 13374 + }, + { + "epoch": 1.7726971504307487, + "grad_norm": 0.09606379270553589, + "learning_rate": 1.7971070163264342e-06, + "loss": 0.0006, + "num_input_tokens_seen": 26194640, + "step": 13375 + }, + { + "epoch": 1.772829688535454, + "grad_norm": 0.278709352016449, + "learning_rate": 1.7967738887130027e-06, + "loss": 0.0021, + "num_input_tokens_seen": 26196248, + "step": 13376 + }, + { + "epoch": 1.772962226640159, + "grad_norm": 16.020023345947266, + "learning_rate": 1.7964407746582564e-06, + "loss": 0.3367, + "num_input_tokens_seen": 26198400, + "step": 13377 + }, + { + "epoch": 1.7730947647448643, + "grad_norm": 17.73349952697754, + "learning_rate": 1.7961076741686167e-06, + "loss": 0.328, + "num_input_tokens_seen": 26200416, + "step": 13378 + }, + { + "epoch": 1.7732273028495693, + "grad_norm": 9.19074821472168, + "learning_rate": 1.7957745872505073e-06, + "loss": 0.1946, + "num_input_tokens_seen": 26201848, + "step": 13379 + }, + { + "epoch": 1.7733598409542743, + "grad_norm": 6.2438645362854, + "learning_rate": 1.7954415139103482e-06, + "loss": 0.1587, + "num_input_tokens_seen": 26204504, + "step": 13380 + }, + { + "epoch": 1.7734923790589794, + "grad_norm": 0.054031115025281906, + "learning_rate": 1.7951084541545635e-06, + "loss": 0.0003, + "num_input_tokens_seen": 26206032, + "step": 13381 + }, + { + "epoch": 1.7736249171636844, + "grad_norm": 5.091292858123779, + "learning_rate": 1.7947754079895746e-06, + "loss": 0.0649, + "num_input_tokens_seen": 26208136, + "step": 13382 + }, + { + "epoch": 1.7737574552683897, + "grad_norm": 2.5021262168884277, + "learning_rate": 1.7944423754218027e-06, + "loss": 0.0196, + "num_input_tokens_seen": 26209288, + "step": 13383 + }, + { + "epoch": 1.7738899933730947, + "grad_norm": 3.8201444149017334, + "learning_rate": 1.7941093564576675e-06, + "loss": 0.1095, + "num_input_tokens_seen": 26211056, + "step": 13384 + }, + { + "epoch": 1.7740225314778, + "grad_norm": 8.680951118469238, + "learning_rate": 1.7937763511035904e-06, + "loss": 0.2078, + "num_input_tokens_seen": 26213440, + "step": 13385 + }, + { + "epoch": 1.774155069582505, + "grad_norm": 5.585438251495361, + "learning_rate": 1.793443359365994e-06, + "loss": 0.0488, + "num_input_tokens_seen": 26214968, + "step": 13386 + }, + { + "epoch": 1.77428760768721, + "grad_norm": 4.130827903747559, + "learning_rate": 1.7931103812512961e-06, + "loss": 0.0409, + "num_input_tokens_seen": 26217280, + "step": 13387 + }, + { + "epoch": 1.774420145791915, + "grad_norm": 0.0676233321428299, + "learning_rate": 1.7927774167659184e-06, + "loss": 0.0005, + "num_input_tokens_seen": 26219144, + "step": 13388 + }, + { + "epoch": 1.7745526838966201, + "grad_norm": 2.69486927986145, + "learning_rate": 1.7924444659162794e-06, + "loss": 0.0381, + "num_input_tokens_seen": 26220576, + "step": 13389 + }, + { + "epoch": 1.7746852220013254, + "grad_norm": 0.058274440467357635, + "learning_rate": 1.7921115287087987e-06, + "loss": 0.0004, + "num_input_tokens_seen": 26221680, + "step": 13390 + }, + { + "epoch": 1.7748177601060304, + "grad_norm": 1.6165335178375244, + "learning_rate": 1.791778605149897e-06, + "loss": 0.0197, + "num_input_tokens_seen": 26223264, + "step": 13391 + }, + { + "epoch": 1.7749502982107357, + "grad_norm": 0.11474625021219254, + "learning_rate": 1.7914456952459925e-06, + "loss": 0.0007, + "num_input_tokens_seen": 26224720, + "step": 13392 + }, + { + "epoch": 1.7750828363154407, + "grad_norm": 10.287863731384277, + "learning_rate": 1.7911127990035032e-06, + "loss": 0.2253, + "num_input_tokens_seen": 26227888, + "step": 13393 + }, + { + "epoch": 1.7752153744201458, + "grad_norm": 0.13895466923713684, + "learning_rate": 1.7907799164288488e-06, + "loss": 0.0008, + "num_input_tokens_seen": 26229400, + "step": 13394 + }, + { + "epoch": 1.7753479125248508, + "grad_norm": 13.10749340057373, + "learning_rate": 1.7904470475284458e-06, + "loss": 0.0616, + "num_input_tokens_seen": 26231744, + "step": 13395 + }, + { + "epoch": 1.7754804506295558, + "grad_norm": 1.2784216403961182, + "learning_rate": 1.790114192308714e-06, + "loss": 0.0084, + "num_input_tokens_seen": 26233288, + "step": 13396 + }, + { + "epoch": 1.775612988734261, + "grad_norm": 7.149410247802734, + "learning_rate": 1.7897813507760703e-06, + "loss": 0.1859, + "num_input_tokens_seen": 26235072, + "step": 13397 + }, + { + "epoch": 1.7757455268389664, + "grad_norm": 2.749980926513672, + "learning_rate": 1.7894485229369324e-06, + "loss": 0.0439, + "num_input_tokens_seen": 26237280, + "step": 13398 + }, + { + "epoch": 1.7758780649436714, + "grad_norm": 14.785832405090332, + "learning_rate": 1.7891157087977167e-06, + "loss": 0.1956, + "num_input_tokens_seen": 26239000, + "step": 13399 + }, + { + "epoch": 1.7760106030483764, + "grad_norm": 6.9281768798828125, + "learning_rate": 1.7887829083648412e-06, + "loss": 0.1365, + "num_input_tokens_seen": 26240544, + "step": 13400 + }, + { + "epoch": 1.7761431411530815, + "grad_norm": 4.815896034240723, + "learning_rate": 1.7884501216447203e-06, + "loss": 0.0491, + "num_input_tokens_seen": 26242096, + "step": 13401 + }, + { + "epoch": 1.7762756792577865, + "grad_norm": 0.03162899985909462, + "learning_rate": 1.7881173486437727e-06, + "loss": 0.0002, + "num_input_tokens_seen": 26243392, + "step": 13402 + }, + { + "epoch": 1.7764082173624918, + "grad_norm": 0.020663641393184662, + "learning_rate": 1.7877845893684142e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26245264, + "step": 13403 + }, + { + "epoch": 1.7765407554671968, + "grad_norm": 0.09768334776163101, + "learning_rate": 1.7874518438250598e-06, + "loss": 0.0005, + "num_input_tokens_seen": 26246472, + "step": 13404 + }, + { + "epoch": 1.776673293571902, + "grad_norm": 5.173131465911865, + "learning_rate": 1.7871191120201258e-06, + "loss": 0.2027, + "num_input_tokens_seen": 26248504, + "step": 13405 + }, + { + "epoch": 1.776805831676607, + "grad_norm": 5.359694957733154, + "learning_rate": 1.7867863939600264e-06, + "loss": 0.116, + "num_input_tokens_seen": 26250160, + "step": 13406 + }, + { + "epoch": 1.7769383697813121, + "grad_norm": 12.673229217529297, + "learning_rate": 1.786453689651178e-06, + "loss": 0.1645, + "num_input_tokens_seen": 26252048, + "step": 13407 + }, + { + "epoch": 1.7770709078860172, + "grad_norm": 1.9817533493041992, + "learning_rate": 1.7861209990999945e-06, + "loss": 0.0183, + "num_input_tokens_seen": 26254136, + "step": 13408 + }, + { + "epoch": 1.7772034459907222, + "grad_norm": 4.115328311920166, + "learning_rate": 1.7857883223128907e-06, + "loss": 0.0867, + "num_input_tokens_seen": 26256184, + "step": 13409 + }, + { + "epoch": 1.7773359840954275, + "grad_norm": 2.607085943222046, + "learning_rate": 1.7854556592962808e-06, + "loss": 0.009, + "num_input_tokens_seen": 26258104, + "step": 13410 + }, + { + "epoch": 1.7774685222001325, + "grad_norm": 4.658054351806641, + "learning_rate": 1.7851230100565775e-06, + "loss": 0.0533, + "num_input_tokens_seen": 26260872, + "step": 13411 + }, + { + "epoch": 1.7776010603048378, + "grad_norm": 0.08323761075735092, + "learning_rate": 1.7847903746001975e-06, + "loss": 0.0006, + "num_input_tokens_seen": 26263280, + "step": 13412 + }, + { + "epoch": 1.7777335984095428, + "grad_norm": 0.019861726090312004, + "learning_rate": 1.7844577529335522e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26265576, + "step": 13413 + }, + { + "epoch": 1.7778661365142479, + "grad_norm": 15.19128704071045, + "learning_rate": 1.7841251450630542e-06, + "loss": 0.3136, + "num_input_tokens_seen": 26267568, + "step": 13414 + }, + { + "epoch": 1.777998674618953, + "grad_norm": 6.320358753204346, + "learning_rate": 1.783792550995118e-06, + "loss": 0.1024, + "num_input_tokens_seen": 26269344, + "step": 13415 + }, + { + "epoch": 1.778131212723658, + "grad_norm": 6.339293956756592, + "learning_rate": 1.7834599707361546e-06, + "loss": 0.1564, + "num_input_tokens_seen": 26271072, + "step": 13416 + }, + { + "epoch": 1.7782637508283632, + "grad_norm": 0.14896027743816376, + "learning_rate": 1.783127404292578e-06, + "loss": 0.0008, + "num_input_tokens_seen": 26272920, + "step": 13417 + }, + { + "epoch": 1.7783962889330682, + "grad_norm": 9.629888534545898, + "learning_rate": 1.7827948516707997e-06, + "loss": 0.3419, + "num_input_tokens_seen": 26275072, + "step": 13418 + }, + { + "epoch": 1.7785288270377735, + "grad_norm": 8.338888168334961, + "learning_rate": 1.782462312877231e-06, + "loss": 0.1074, + "num_input_tokens_seen": 26276800, + "step": 13419 + }, + { + "epoch": 1.7786613651424785, + "grad_norm": 7.826952934265137, + "learning_rate": 1.7821297879182845e-06, + "loss": 0.1438, + "num_input_tokens_seen": 26279448, + "step": 13420 + }, + { + "epoch": 1.7787939032471836, + "grad_norm": 0.3374713957309723, + "learning_rate": 1.7817972768003696e-06, + "loss": 0.0019, + "num_input_tokens_seen": 26281344, + "step": 13421 + }, + { + "epoch": 1.7789264413518886, + "grad_norm": 2.3968751430511475, + "learning_rate": 1.7814647795299e-06, + "loss": 0.0132, + "num_input_tokens_seen": 26283264, + "step": 13422 + }, + { + "epoch": 1.7790589794565936, + "grad_norm": 3.1393566131591797, + "learning_rate": 1.7811322961132848e-06, + "loss": 0.0354, + "num_input_tokens_seen": 26285936, + "step": 13423 + }, + { + "epoch": 1.779191517561299, + "grad_norm": 10.10363483428955, + "learning_rate": 1.780799826556935e-06, + "loss": 0.3335, + "num_input_tokens_seen": 26288504, + "step": 13424 + }, + { + "epoch": 1.779324055666004, + "grad_norm": 5.010140895843506, + "learning_rate": 1.7804673708672605e-06, + "loss": 0.197, + "num_input_tokens_seen": 26291208, + "step": 13425 + }, + { + "epoch": 1.7794565937707092, + "grad_norm": 1.3880749940872192, + "learning_rate": 1.7801349290506707e-06, + "loss": 0.0411, + "num_input_tokens_seen": 26293248, + "step": 13426 + }, + { + "epoch": 1.7795891318754142, + "grad_norm": 10.586915016174316, + "learning_rate": 1.7798025011135771e-06, + "loss": 0.1511, + "num_input_tokens_seen": 26295120, + "step": 13427 + }, + { + "epoch": 1.7797216699801193, + "grad_norm": 9.952249526977539, + "learning_rate": 1.7794700870623883e-06, + "loss": 0.293, + "num_input_tokens_seen": 26297024, + "step": 13428 + }, + { + "epoch": 1.7798542080848243, + "grad_norm": 6.022063732147217, + "learning_rate": 1.7791376869035126e-06, + "loss": 0.051, + "num_input_tokens_seen": 26299584, + "step": 13429 + }, + { + "epoch": 1.7799867461895293, + "grad_norm": 0.047561485320329666, + "learning_rate": 1.7788053006433603e-06, + "loss": 0.0003, + "num_input_tokens_seen": 26301144, + "step": 13430 + }, + { + "epoch": 1.7801192842942346, + "grad_norm": 9.478787422180176, + "learning_rate": 1.7784729282883387e-06, + "loss": 0.1083, + "num_input_tokens_seen": 26303168, + "step": 13431 + }, + { + "epoch": 1.7802518223989396, + "grad_norm": 9.005231857299805, + "learning_rate": 1.7781405698448561e-06, + "loss": 0.0965, + "num_input_tokens_seen": 26304840, + "step": 13432 + }, + { + "epoch": 1.780384360503645, + "grad_norm": 6.288226127624512, + "learning_rate": 1.7778082253193228e-06, + "loss": 0.065, + "num_input_tokens_seen": 26306496, + "step": 13433 + }, + { + "epoch": 1.78051689860835, + "grad_norm": 6.611120223999023, + "learning_rate": 1.7774758947181441e-06, + "loss": 0.1703, + "num_input_tokens_seen": 26308640, + "step": 13434 + }, + { + "epoch": 1.780649436713055, + "grad_norm": 5.324704647064209, + "learning_rate": 1.7771435780477295e-06, + "loss": 0.097, + "num_input_tokens_seen": 26310256, + "step": 13435 + }, + { + "epoch": 1.78078197481776, + "grad_norm": 14.94013786315918, + "learning_rate": 1.7768112753144851e-06, + "loss": 0.3921, + "num_input_tokens_seen": 26314352, + "step": 13436 + }, + { + "epoch": 1.780914512922465, + "grad_norm": 6.602219581604004, + "learning_rate": 1.7764789865248172e-06, + "loss": 0.1216, + "num_input_tokens_seen": 26316408, + "step": 13437 + }, + { + "epoch": 1.7810470510271703, + "grad_norm": 11.934491157531738, + "learning_rate": 1.7761467116851344e-06, + "loss": 0.1263, + "num_input_tokens_seen": 26317904, + "step": 13438 + }, + { + "epoch": 1.7811795891318756, + "grad_norm": 0.04531030356884003, + "learning_rate": 1.7758144508018426e-06, + "loss": 0.0003, + "num_input_tokens_seen": 26319080, + "step": 13439 + }, + { + "epoch": 1.7813121272365806, + "grad_norm": 0.6069450974464417, + "learning_rate": 1.7754822038813474e-06, + "loss": 0.0092, + "num_input_tokens_seen": 26320784, + "step": 13440 + }, + { + "epoch": 1.7814446653412856, + "grad_norm": 0.36157214641571045, + "learning_rate": 1.7751499709300557e-06, + "loss": 0.003, + "num_input_tokens_seen": 26323464, + "step": 13441 + }, + { + "epoch": 1.7815772034459907, + "grad_norm": 0.5376377105712891, + "learning_rate": 1.7748177519543713e-06, + "loss": 0.0048, + "num_input_tokens_seen": 26324952, + "step": 13442 + }, + { + "epoch": 1.7817097415506957, + "grad_norm": 0.12397444248199463, + "learning_rate": 1.7744855469607016e-06, + "loss": 0.0008, + "num_input_tokens_seen": 26326344, + "step": 13443 + }, + { + "epoch": 1.7818422796554008, + "grad_norm": 5.922483444213867, + "learning_rate": 1.7741533559554517e-06, + "loss": 0.1018, + "num_input_tokens_seen": 26328088, + "step": 13444 + }, + { + "epoch": 1.781974817760106, + "grad_norm": 1.7656452655792236, + "learning_rate": 1.7738211789450256e-06, + "loss": 0.0189, + "num_input_tokens_seen": 26329424, + "step": 13445 + }, + { + "epoch": 1.7821073558648113, + "grad_norm": 8.005962371826172, + "learning_rate": 1.7734890159358275e-06, + "loss": 0.2931, + "num_input_tokens_seen": 26331264, + "step": 13446 + }, + { + "epoch": 1.7822398939695163, + "grad_norm": 13.92824935913086, + "learning_rate": 1.7731568669342626e-06, + "loss": 0.1386, + "num_input_tokens_seen": 26333056, + "step": 13447 + }, + { + "epoch": 1.7823724320742214, + "grad_norm": 13.277717590332031, + "learning_rate": 1.772824731946735e-06, + "loss": 0.4788, + "num_input_tokens_seen": 26335320, + "step": 13448 + }, + { + "epoch": 1.7825049701789264, + "grad_norm": 0.05376683548092842, + "learning_rate": 1.772492610979648e-06, + "loss": 0.0003, + "num_input_tokens_seen": 26336736, + "step": 13449 + }, + { + "epoch": 1.7826375082836314, + "grad_norm": 2.0635359287261963, + "learning_rate": 1.7721605040394057e-06, + "loss": 0.041, + "num_input_tokens_seen": 26338536, + "step": 13450 + }, + { + "epoch": 1.7827700463883367, + "grad_norm": 7.3322954177856445, + "learning_rate": 1.7718284111324111e-06, + "loss": 0.0747, + "num_input_tokens_seen": 26340760, + "step": 13451 + }, + { + "epoch": 1.7829025844930417, + "grad_norm": 4.662420749664307, + "learning_rate": 1.771496332265066e-06, + "loss": 0.1281, + "num_input_tokens_seen": 26343128, + "step": 13452 + }, + { + "epoch": 1.783035122597747, + "grad_norm": 15.219648361206055, + "learning_rate": 1.7711642674437745e-06, + "loss": 0.2469, + "num_input_tokens_seen": 26345048, + "step": 13453 + }, + { + "epoch": 1.783167660702452, + "grad_norm": 0.1409814953804016, + "learning_rate": 1.7708322166749398e-06, + "loss": 0.0009, + "num_input_tokens_seen": 26347120, + "step": 13454 + }, + { + "epoch": 1.783300198807157, + "grad_norm": 3.001920700073242, + "learning_rate": 1.7705001799649621e-06, + "loss": 0.0281, + "num_input_tokens_seen": 26348784, + "step": 13455 + }, + { + "epoch": 1.783432736911862, + "grad_norm": 7.071722030639648, + "learning_rate": 1.7701681573202447e-06, + "loss": 0.2493, + "num_input_tokens_seen": 26351200, + "step": 13456 + }, + { + "epoch": 1.7835652750165671, + "grad_norm": 0.050525981932878494, + "learning_rate": 1.7698361487471883e-06, + "loss": 0.0003, + "num_input_tokens_seen": 26352632, + "step": 13457 + }, + { + "epoch": 1.7836978131212724, + "grad_norm": 2.120507001876831, + "learning_rate": 1.769504154252194e-06, + "loss": 0.011, + "num_input_tokens_seen": 26354232, + "step": 13458 + }, + { + "epoch": 1.7838303512259774, + "grad_norm": 9.166775703430176, + "learning_rate": 1.7691721738416648e-06, + "loss": 0.1264, + "num_input_tokens_seen": 26356088, + "step": 13459 + }, + { + "epoch": 1.7839628893306827, + "grad_norm": 0.021382562816143036, + "learning_rate": 1.7688402075220001e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26357616, + "step": 13460 + }, + { + "epoch": 1.7840954274353877, + "grad_norm": 2.573509454727173, + "learning_rate": 1.7685082552996003e-06, + "loss": 0.0812, + "num_input_tokens_seen": 26359840, + "step": 13461 + }, + { + "epoch": 1.7842279655400928, + "grad_norm": 0.966750979423523, + "learning_rate": 1.7681763171808661e-06, + "loss": 0.0089, + "num_input_tokens_seen": 26361320, + "step": 13462 + }, + { + "epoch": 1.7843605036447978, + "grad_norm": 6.177943229675293, + "learning_rate": 1.7678443931721967e-06, + "loss": 0.0357, + "num_input_tokens_seen": 26363328, + "step": 13463 + }, + { + "epoch": 1.7844930417495029, + "grad_norm": 2.3456945419311523, + "learning_rate": 1.7675124832799928e-06, + "loss": 0.0145, + "num_input_tokens_seen": 26364944, + "step": 13464 + }, + { + "epoch": 1.784625579854208, + "grad_norm": 5.720907688140869, + "learning_rate": 1.7671805875106541e-06, + "loss": 0.0675, + "num_input_tokens_seen": 26367280, + "step": 13465 + }, + { + "epoch": 1.7847581179589131, + "grad_norm": 11.857833862304688, + "learning_rate": 1.7668487058705791e-06, + "loss": 0.1665, + "num_input_tokens_seen": 26369504, + "step": 13466 + }, + { + "epoch": 1.7848906560636184, + "grad_norm": 2.4518649578094482, + "learning_rate": 1.7665168383661662e-06, + "loss": 0.021, + "num_input_tokens_seen": 26370704, + "step": 13467 + }, + { + "epoch": 1.7850231941683234, + "grad_norm": 10.11327075958252, + "learning_rate": 1.7661849850038146e-06, + "loss": 0.1587, + "num_input_tokens_seen": 26373680, + "step": 13468 + }, + { + "epoch": 1.7851557322730285, + "grad_norm": 3.9352738857269287, + "learning_rate": 1.7658531457899235e-06, + "loss": 0.0232, + "num_input_tokens_seen": 26375024, + "step": 13469 + }, + { + "epoch": 1.7852882703777335, + "grad_norm": 6.68298864364624, + "learning_rate": 1.7655213207308896e-06, + "loss": 0.1716, + "num_input_tokens_seen": 26377064, + "step": 13470 + }, + { + "epoch": 1.7854208084824386, + "grad_norm": 0.03484855964779854, + "learning_rate": 1.765189509833112e-06, + "loss": 0.0002, + "num_input_tokens_seen": 26379728, + "step": 13471 + }, + { + "epoch": 1.7855533465871438, + "grad_norm": 10.722467422485352, + "learning_rate": 1.764857713102987e-06, + "loss": 0.3023, + "num_input_tokens_seen": 26381392, + "step": 13472 + }, + { + "epoch": 1.7856858846918489, + "grad_norm": 14.940919876098633, + "learning_rate": 1.764525930546912e-06, + "loss": 0.4127, + "num_input_tokens_seen": 26383632, + "step": 13473 + }, + { + "epoch": 1.7858184227965541, + "grad_norm": 6.522045135498047, + "learning_rate": 1.7641941621712855e-06, + "loss": 0.1384, + "num_input_tokens_seen": 26386040, + "step": 13474 + }, + { + "epoch": 1.7859509609012592, + "grad_norm": 0.385433554649353, + "learning_rate": 1.7638624079825028e-06, + "loss": 0.0017, + "num_input_tokens_seen": 26387192, + "step": 13475 + }, + { + "epoch": 1.7860834990059642, + "grad_norm": 0.13633976876735687, + "learning_rate": 1.7635306679869608e-06, + "loss": 0.0006, + "num_input_tokens_seen": 26388840, + "step": 13476 + }, + { + "epoch": 1.7862160371106692, + "grad_norm": 14.278340339660645, + "learning_rate": 1.7631989421910556e-06, + "loss": 0.3722, + "num_input_tokens_seen": 26390304, + "step": 13477 + }, + { + "epoch": 1.7863485752153743, + "grad_norm": 0.1277688592672348, + "learning_rate": 1.7628672306011823e-06, + "loss": 0.0008, + "num_input_tokens_seen": 26392192, + "step": 13478 + }, + { + "epoch": 1.7864811133200795, + "grad_norm": 6.736323356628418, + "learning_rate": 1.7625355332237375e-06, + "loss": 0.0655, + "num_input_tokens_seen": 26394368, + "step": 13479 + }, + { + "epoch": 1.7866136514247848, + "grad_norm": 0.057496052235364914, + "learning_rate": 1.7622038500651174e-06, + "loss": 0.0004, + "num_input_tokens_seen": 26395656, + "step": 13480 + }, + { + "epoch": 1.7867461895294898, + "grad_norm": 6.027022838592529, + "learning_rate": 1.7618721811317151e-06, + "loss": 0.1119, + "num_input_tokens_seen": 26397816, + "step": 13481 + }, + { + "epoch": 1.7868787276341949, + "grad_norm": 0.9153688549995422, + "learning_rate": 1.7615405264299273e-06, + "loss": 0.0085, + "num_input_tokens_seen": 26400648, + "step": 13482 + }, + { + "epoch": 1.7870112657389, + "grad_norm": 9.573477745056152, + "learning_rate": 1.761208885966146e-06, + "loss": 0.1165, + "num_input_tokens_seen": 26402616, + "step": 13483 + }, + { + "epoch": 1.787143803843605, + "grad_norm": 0.09726032614707947, + "learning_rate": 1.7608772597467685e-06, + "loss": 0.0004, + "num_input_tokens_seen": 26405488, + "step": 13484 + }, + { + "epoch": 1.78727634194831, + "grad_norm": 8.382540702819824, + "learning_rate": 1.7605456477781868e-06, + "loss": 0.0468, + "num_input_tokens_seen": 26406856, + "step": 13485 + }, + { + "epoch": 1.7874088800530152, + "grad_norm": 9.463828086853027, + "learning_rate": 1.7602140500667956e-06, + "loss": 0.1541, + "num_input_tokens_seen": 26408696, + "step": 13486 + }, + { + "epoch": 1.7875414181577205, + "grad_norm": 5.387556076049805, + "learning_rate": 1.7598824666189873e-06, + "loss": 0.0759, + "num_input_tokens_seen": 26410920, + "step": 13487 + }, + { + "epoch": 1.7876739562624255, + "grad_norm": 5.628362655639648, + "learning_rate": 1.759550897441156e-06, + "loss": 0.0813, + "num_input_tokens_seen": 26412512, + "step": 13488 + }, + { + "epoch": 1.7878064943671306, + "grad_norm": 0.048675842583179474, + "learning_rate": 1.759219342539693e-06, + "loss": 0.0003, + "num_input_tokens_seen": 26414216, + "step": 13489 + }, + { + "epoch": 1.7879390324718356, + "grad_norm": 18.880474090576172, + "learning_rate": 1.7588878019209937e-06, + "loss": 0.3935, + "num_input_tokens_seen": 26416240, + "step": 13490 + }, + { + "epoch": 1.7880715705765406, + "grad_norm": 7.306013107299805, + "learning_rate": 1.7585562755914483e-06, + "loss": 0.1431, + "num_input_tokens_seen": 26418304, + "step": 13491 + }, + { + "epoch": 1.788204108681246, + "grad_norm": 0.01743192970752716, + "learning_rate": 1.7582247635574496e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26419608, + "step": 13492 + }, + { + "epoch": 1.788336646785951, + "grad_norm": 4.326083660125732, + "learning_rate": 1.7578932658253887e-06, + "loss": 0.0285, + "num_input_tokens_seen": 26421672, + "step": 13493 + }, + { + "epoch": 1.7884691848906562, + "grad_norm": 6.223625183105469, + "learning_rate": 1.7575617824016573e-06, + "loss": 0.1041, + "num_input_tokens_seen": 26423560, + "step": 13494 + }, + { + "epoch": 1.7886017229953612, + "grad_norm": 3.8484771251678467, + "learning_rate": 1.7572303132926478e-06, + "loss": 0.0275, + "num_input_tokens_seen": 26425144, + "step": 13495 + }, + { + "epoch": 1.7887342611000663, + "grad_norm": 2.3663957118988037, + "learning_rate": 1.7568988585047494e-06, + "loss": 0.041, + "num_input_tokens_seen": 26426984, + "step": 13496 + }, + { + "epoch": 1.7888667992047713, + "grad_norm": 9.78691291809082, + "learning_rate": 1.756567418044355e-06, + "loss": 0.1365, + "num_input_tokens_seen": 26429088, + "step": 13497 + }, + { + "epoch": 1.7889993373094764, + "grad_norm": 7.348280906677246, + "learning_rate": 1.756235991917853e-06, + "loss": 0.0833, + "num_input_tokens_seen": 26432088, + "step": 13498 + }, + { + "epoch": 1.7891318754141816, + "grad_norm": 0.9052292108535767, + "learning_rate": 1.7559045801316332e-06, + "loss": 0.004, + "num_input_tokens_seen": 26433784, + "step": 13499 + }, + { + "epoch": 1.7892644135188867, + "grad_norm": 5.542092800140381, + "learning_rate": 1.7555731826920868e-06, + "loss": 0.0684, + "num_input_tokens_seen": 26435824, + "step": 13500 + }, + { + "epoch": 1.789396951623592, + "grad_norm": 4.9773054122924805, + "learning_rate": 1.7552417996056036e-06, + "loss": 0.103, + "num_input_tokens_seen": 26437856, + "step": 13501 + }, + { + "epoch": 1.789529489728297, + "grad_norm": 4.7303853034973145, + "learning_rate": 1.7549104308785719e-06, + "loss": 0.0561, + "num_input_tokens_seen": 26439624, + "step": 13502 + }, + { + "epoch": 1.789662027833002, + "grad_norm": 16.13039779663086, + "learning_rate": 1.754579076517382e-06, + "loss": 0.3716, + "num_input_tokens_seen": 26441712, + "step": 13503 + }, + { + "epoch": 1.789794565937707, + "grad_norm": 9.999297142028809, + "learning_rate": 1.7542477365284203e-06, + "loss": 0.2693, + "num_input_tokens_seen": 26444336, + "step": 13504 + }, + { + "epoch": 1.789927104042412, + "grad_norm": 5.910900115966797, + "learning_rate": 1.753916410918078e-06, + "loss": 0.1271, + "num_input_tokens_seen": 26446168, + "step": 13505 + }, + { + "epoch": 1.7900596421471173, + "grad_norm": 3.7617299556732178, + "learning_rate": 1.753585099692741e-06, + "loss": 0.0268, + "num_input_tokens_seen": 26448120, + "step": 13506 + }, + { + "epoch": 1.7901921802518224, + "grad_norm": 6.987460136413574, + "learning_rate": 1.753253802858799e-06, + "loss": 0.1695, + "num_input_tokens_seen": 26449784, + "step": 13507 + }, + { + "epoch": 1.7903247183565276, + "grad_norm": 10.058489799499512, + "learning_rate": 1.752922520422638e-06, + "loss": 0.1045, + "num_input_tokens_seen": 26451184, + "step": 13508 + }, + { + "epoch": 1.7904572564612327, + "grad_norm": 5.993460178375244, + "learning_rate": 1.7525912523906463e-06, + "loss": 0.0867, + "num_input_tokens_seen": 26453200, + "step": 13509 + }, + { + "epoch": 1.7905897945659377, + "grad_norm": 0.06365442276000977, + "learning_rate": 1.7522599987692113e-06, + "loss": 0.0004, + "num_input_tokens_seen": 26455272, + "step": 13510 + }, + { + "epoch": 1.7907223326706427, + "grad_norm": 3.2359840869903564, + "learning_rate": 1.7519287595647191e-06, + "loss": 0.0837, + "num_input_tokens_seen": 26458008, + "step": 13511 + }, + { + "epoch": 1.7908548707753478, + "grad_norm": 0.769130527973175, + "learning_rate": 1.7515975347835574e-06, + "loss": 0.0083, + "num_input_tokens_seen": 26459280, + "step": 13512 + }, + { + "epoch": 1.790987408880053, + "grad_norm": 0.13804055750370026, + "learning_rate": 1.751266324432111e-06, + "loss": 0.0007, + "num_input_tokens_seen": 26461704, + "step": 13513 + }, + { + "epoch": 1.791119946984758, + "grad_norm": 0.006949859205633402, + "learning_rate": 1.750935128516766e-06, + "loss": 0.0, + "num_input_tokens_seen": 26463096, + "step": 13514 + }, + { + "epoch": 1.7912524850894633, + "grad_norm": 3.3558826446533203, + "learning_rate": 1.750603947043908e-06, + "loss": 0.0282, + "num_input_tokens_seen": 26464760, + "step": 13515 + }, + { + "epoch": 1.7913850231941684, + "grad_norm": 3.903610944747925, + "learning_rate": 1.7502727800199236e-06, + "loss": 0.0898, + "num_input_tokens_seen": 26466440, + "step": 13516 + }, + { + "epoch": 1.7915175612988734, + "grad_norm": 19.085941314697266, + "learning_rate": 1.7499416274511972e-06, + "loss": 0.294, + "num_input_tokens_seen": 26468912, + "step": 13517 + }, + { + "epoch": 1.7916500994035784, + "grad_norm": 8.417341232299805, + "learning_rate": 1.749610489344114e-06, + "loss": 0.1095, + "num_input_tokens_seen": 26470760, + "step": 13518 + }, + { + "epoch": 1.7917826375082835, + "grad_norm": 4.599940299987793, + "learning_rate": 1.749279365705058e-06, + "loss": 0.0586, + "num_input_tokens_seen": 26473040, + "step": 13519 + }, + { + "epoch": 1.7919151756129887, + "grad_norm": 11.888713836669922, + "learning_rate": 1.7489482565404132e-06, + "loss": 0.0688, + "num_input_tokens_seen": 26475120, + "step": 13520 + }, + { + "epoch": 1.792047713717694, + "grad_norm": 6.165769577026367, + "learning_rate": 1.748617161856564e-06, + "loss": 0.1128, + "num_input_tokens_seen": 26477328, + "step": 13521 + }, + { + "epoch": 1.792180251822399, + "grad_norm": 5.052001953125, + "learning_rate": 1.748286081659895e-06, + "loss": 0.1703, + "num_input_tokens_seen": 26479664, + "step": 13522 + }, + { + "epoch": 1.792312789927104, + "grad_norm": 0.01829586736857891, + "learning_rate": 1.7479550159567887e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26481896, + "step": 13523 + }, + { + "epoch": 1.7924453280318091, + "grad_norm": 14.04226303100586, + "learning_rate": 1.7476239647536288e-06, + "loss": 0.2669, + "num_input_tokens_seen": 26483984, + "step": 13524 + }, + { + "epoch": 1.7925778661365142, + "grad_norm": 5.46922492980957, + "learning_rate": 1.747292928056797e-06, + "loss": 0.0745, + "num_input_tokens_seen": 26486808, + "step": 13525 + }, + { + "epoch": 1.7927104042412192, + "grad_norm": 2.4571304321289062, + "learning_rate": 1.7469619058726772e-06, + "loss": 0.0448, + "num_input_tokens_seen": 26488192, + "step": 13526 + }, + { + "epoch": 1.7928429423459245, + "grad_norm": 8.714128494262695, + "learning_rate": 1.7466308982076518e-06, + "loss": 0.11, + "num_input_tokens_seen": 26489440, + "step": 13527 + }, + { + "epoch": 1.7929754804506297, + "grad_norm": 10.794597625732422, + "learning_rate": 1.7462999050681026e-06, + "loss": 0.1668, + "num_input_tokens_seen": 26491104, + "step": 13528 + }, + { + "epoch": 1.7931080185553347, + "grad_norm": 5.617539405822754, + "learning_rate": 1.7459689264604107e-06, + "loss": 0.2166, + "num_input_tokens_seen": 26493376, + "step": 13529 + }, + { + "epoch": 1.7932405566600398, + "grad_norm": 1.0309244394302368, + "learning_rate": 1.7456379623909575e-06, + "loss": 0.0116, + "num_input_tokens_seen": 26495320, + "step": 13530 + }, + { + "epoch": 1.7933730947647448, + "grad_norm": 0.028823330998420715, + "learning_rate": 1.7453070128661256e-06, + "loss": 0.0002, + "num_input_tokens_seen": 26496744, + "step": 13531 + }, + { + "epoch": 1.7935056328694499, + "grad_norm": 4.579731464385986, + "learning_rate": 1.7449760778922947e-06, + "loss": 0.0972, + "num_input_tokens_seen": 26499152, + "step": 13532 + }, + { + "epoch": 1.7936381709741551, + "grad_norm": 7.258098602294922, + "learning_rate": 1.7446451574758466e-06, + "loss": 0.1696, + "num_input_tokens_seen": 26500872, + "step": 13533 + }, + { + "epoch": 1.7937707090788602, + "grad_norm": 8.057473182678223, + "learning_rate": 1.7443142516231603e-06, + "loss": 0.083, + "num_input_tokens_seen": 26502896, + "step": 13534 + }, + { + "epoch": 1.7939032471835654, + "grad_norm": 0.05750260874629021, + "learning_rate": 1.7439833603406158e-06, + "loss": 0.0004, + "num_input_tokens_seen": 26504624, + "step": 13535 + }, + { + "epoch": 1.7940357852882705, + "grad_norm": 10.32970905303955, + "learning_rate": 1.7436524836345952e-06, + "loss": 0.1633, + "num_input_tokens_seen": 26506696, + "step": 13536 + }, + { + "epoch": 1.7941683233929755, + "grad_norm": 0.13353323936462402, + "learning_rate": 1.7433216215114763e-06, + "loss": 0.0009, + "num_input_tokens_seen": 26508144, + "step": 13537 + }, + { + "epoch": 1.7943008614976805, + "grad_norm": 0.14368011057376862, + "learning_rate": 1.742990773977638e-06, + "loss": 0.0009, + "num_input_tokens_seen": 26509864, + "step": 13538 + }, + { + "epoch": 1.7944333996023856, + "grad_norm": 4.054242134094238, + "learning_rate": 1.7426599410394602e-06, + "loss": 0.0812, + "num_input_tokens_seen": 26511192, + "step": 13539 + }, + { + "epoch": 1.7945659377070908, + "grad_norm": 1.5258116722106934, + "learning_rate": 1.74232912270332e-06, + "loss": 0.0156, + "num_input_tokens_seen": 26512832, + "step": 13540 + }, + { + "epoch": 1.7946984758117959, + "grad_norm": 4.265691757202148, + "learning_rate": 1.741998318975598e-06, + "loss": 0.0349, + "num_input_tokens_seen": 26514280, + "step": 13541 + }, + { + "epoch": 1.7948310139165011, + "grad_norm": 0.618071436882019, + "learning_rate": 1.7416675298626718e-06, + "loss": 0.0063, + "num_input_tokens_seen": 26516008, + "step": 13542 + }, + { + "epoch": 1.7949635520212062, + "grad_norm": 3.9547102451324463, + "learning_rate": 1.7413367553709185e-06, + "loss": 0.0895, + "num_input_tokens_seen": 26517992, + "step": 13543 + }, + { + "epoch": 1.7950960901259112, + "grad_norm": 2.9521284103393555, + "learning_rate": 1.7410059955067154e-06, + "loss": 0.0578, + "num_input_tokens_seen": 26520208, + "step": 13544 + }, + { + "epoch": 1.7952286282306162, + "grad_norm": 1.614125370979309, + "learning_rate": 1.740675250276441e-06, + "loss": 0.0085, + "num_input_tokens_seen": 26522472, + "step": 13545 + }, + { + "epoch": 1.7953611663353213, + "grad_norm": 0.09426449984312057, + "learning_rate": 1.7403445196864703e-06, + "loss": 0.0006, + "num_input_tokens_seen": 26524544, + "step": 13546 + }, + { + "epoch": 1.7954937044400265, + "grad_norm": 0.11491863429546356, + "learning_rate": 1.7400138037431823e-06, + "loss": 0.0008, + "num_input_tokens_seen": 26526264, + "step": 13547 + }, + { + "epoch": 1.7956262425447316, + "grad_norm": 2.086135149002075, + "learning_rate": 1.7396831024529525e-06, + "loss": 0.0179, + "num_input_tokens_seen": 26528608, + "step": 13548 + }, + { + "epoch": 1.7957587806494368, + "grad_norm": 5.821713924407959, + "learning_rate": 1.739352415822157e-06, + "loss": 0.1035, + "num_input_tokens_seen": 26531048, + "step": 13549 + }, + { + "epoch": 1.7958913187541419, + "grad_norm": 6.0157790184021, + "learning_rate": 1.7390217438571715e-06, + "loss": 0.0893, + "num_input_tokens_seen": 26533936, + "step": 13550 + }, + { + "epoch": 1.796023856858847, + "grad_norm": 4.216887950897217, + "learning_rate": 1.7386910865643713e-06, + "loss": 0.0398, + "num_input_tokens_seen": 26537224, + "step": 13551 + }, + { + "epoch": 1.796156394963552, + "grad_norm": 7.900070667266846, + "learning_rate": 1.7383604439501329e-06, + "loss": 0.1789, + "num_input_tokens_seen": 26538696, + "step": 13552 + }, + { + "epoch": 1.796288933068257, + "grad_norm": 11.829741477966309, + "learning_rate": 1.7380298160208303e-06, + "loss": 0.2307, + "num_input_tokens_seen": 26541264, + "step": 13553 + }, + { + "epoch": 1.7964214711729622, + "grad_norm": 0.060499157756567, + "learning_rate": 1.7376992027828388e-06, + "loss": 0.0003, + "num_input_tokens_seen": 26543296, + "step": 13554 + }, + { + "epoch": 1.7965540092776673, + "grad_norm": 0.09370407462120056, + "learning_rate": 1.737368604242532e-06, + "loss": 0.0006, + "num_input_tokens_seen": 26544448, + "step": 13555 + }, + { + "epoch": 1.7966865473823725, + "grad_norm": 8.654318809509277, + "learning_rate": 1.7370380204062841e-06, + "loss": 0.0912, + "num_input_tokens_seen": 26546640, + "step": 13556 + }, + { + "epoch": 1.7968190854870776, + "grad_norm": 2.1261088848114014, + "learning_rate": 1.7367074512804707e-06, + "loss": 0.0168, + "num_input_tokens_seen": 26548192, + "step": 13557 + }, + { + "epoch": 1.7969516235917826, + "grad_norm": 10.142276763916016, + "learning_rate": 1.7363768968714642e-06, + "loss": 0.1837, + "num_input_tokens_seen": 26550632, + "step": 13558 + }, + { + "epoch": 1.7970841616964877, + "grad_norm": 0.11329805105924606, + "learning_rate": 1.7360463571856373e-06, + "loss": 0.0005, + "num_input_tokens_seen": 26552040, + "step": 13559 + }, + { + "epoch": 1.7972166998011927, + "grad_norm": 7.3871073722839355, + "learning_rate": 1.7357158322293645e-06, + "loss": 0.0909, + "num_input_tokens_seen": 26554096, + "step": 13560 + }, + { + "epoch": 1.797349237905898, + "grad_norm": 0.03587912768125534, + "learning_rate": 1.735385322009016e-06, + "loss": 0.0002, + "num_input_tokens_seen": 26555496, + "step": 13561 + }, + { + "epoch": 1.797481776010603, + "grad_norm": 13.763813972473145, + "learning_rate": 1.7350548265309672e-06, + "loss": 0.33, + "num_input_tokens_seen": 26557528, + "step": 13562 + }, + { + "epoch": 1.7976143141153083, + "grad_norm": 0.15195626020431519, + "learning_rate": 1.7347243458015892e-06, + "loss": 0.001, + "num_input_tokens_seen": 26558928, + "step": 13563 + }, + { + "epoch": 1.7977468522200133, + "grad_norm": 8.007662773132324, + "learning_rate": 1.7343938798272532e-06, + "loss": 0.2904, + "num_input_tokens_seen": 26561016, + "step": 13564 + }, + { + "epoch": 1.7978793903247183, + "grad_norm": 9.498983383178711, + "learning_rate": 1.734063428614332e-06, + "loss": 0.2363, + "num_input_tokens_seen": 26562632, + "step": 13565 + }, + { + "epoch": 1.7980119284294234, + "grad_norm": 1.003184199333191, + "learning_rate": 1.7337329921691953e-06, + "loss": 0.0166, + "num_input_tokens_seen": 26564696, + "step": 13566 + }, + { + "epoch": 1.7981444665341284, + "grad_norm": 2.855196952819824, + "learning_rate": 1.733402570498216e-06, + "loss": 0.0562, + "num_input_tokens_seen": 26567088, + "step": 13567 + }, + { + "epoch": 1.7982770046388337, + "grad_norm": 4.377383232116699, + "learning_rate": 1.7330721636077636e-06, + "loss": 0.0543, + "num_input_tokens_seen": 26569288, + "step": 13568 + }, + { + "epoch": 1.798409542743539, + "grad_norm": 0.5915123820304871, + "learning_rate": 1.7327417715042094e-06, + "loss": 0.0036, + "num_input_tokens_seen": 26571312, + "step": 13569 + }, + { + "epoch": 1.798542080848244, + "grad_norm": 0.7797229886054993, + "learning_rate": 1.7324113941939226e-06, + "loss": 0.0042, + "num_input_tokens_seen": 26573120, + "step": 13570 + }, + { + "epoch": 1.798674618952949, + "grad_norm": 2.479902982711792, + "learning_rate": 1.7320810316832742e-06, + "loss": 0.0248, + "num_input_tokens_seen": 26574344, + "step": 13571 + }, + { + "epoch": 1.798807157057654, + "grad_norm": 16.192241668701172, + "learning_rate": 1.7317506839786325e-06, + "loss": 0.2575, + "num_input_tokens_seen": 26575472, + "step": 13572 + }, + { + "epoch": 1.798939695162359, + "grad_norm": 4.9803876876831055, + "learning_rate": 1.7314203510863684e-06, + "loss": 0.076, + "num_input_tokens_seen": 26577832, + "step": 13573 + }, + { + "epoch": 1.7990722332670643, + "grad_norm": 3.7477424144744873, + "learning_rate": 1.7310900330128497e-06, + "loss": 0.0598, + "num_input_tokens_seen": 26580248, + "step": 13574 + }, + { + "epoch": 1.7992047713717694, + "grad_norm": 3.946439743041992, + "learning_rate": 1.730759729764446e-06, + "loss": 0.0485, + "num_input_tokens_seen": 26582192, + "step": 13575 + }, + { + "epoch": 1.7993373094764746, + "grad_norm": 7.128304481506348, + "learning_rate": 1.7304294413475249e-06, + "loss": 0.1106, + "num_input_tokens_seen": 26584632, + "step": 13576 + }, + { + "epoch": 1.7994698475811797, + "grad_norm": 8.386787414550781, + "learning_rate": 1.7300991677684548e-06, + "loss": 0.3167, + "num_input_tokens_seen": 26586568, + "step": 13577 + }, + { + "epoch": 1.7996023856858847, + "grad_norm": 10.198125839233398, + "learning_rate": 1.7297689090336045e-06, + "loss": 0.313, + "num_input_tokens_seen": 26588128, + "step": 13578 + }, + { + "epoch": 1.7997349237905897, + "grad_norm": 3.5055384635925293, + "learning_rate": 1.7294386651493405e-06, + "loss": 0.0592, + "num_input_tokens_seen": 26590480, + "step": 13579 + }, + { + "epoch": 1.7998674618952948, + "grad_norm": 1.3968749046325684, + "learning_rate": 1.7291084361220318e-06, + "loss": 0.0158, + "num_input_tokens_seen": 26592504, + "step": 13580 + }, + { + "epoch": 1.8, + "grad_norm": 11.910711288452148, + "learning_rate": 1.7287782219580436e-06, + "loss": 0.2836, + "num_input_tokens_seen": 26594864, + "step": 13581 + }, + { + "epoch": 1.800132538104705, + "grad_norm": 6.525597095489502, + "learning_rate": 1.7284480226637424e-06, + "loss": 0.094, + "num_input_tokens_seen": 26596816, + "step": 13582 + }, + { + "epoch": 1.8002650762094103, + "grad_norm": 0.04156789928674698, + "learning_rate": 1.7281178382454961e-06, + "loss": 0.0003, + "num_input_tokens_seen": 26598440, + "step": 13583 + }, + { + "epoch": 1.8003976143141154, + "grad_norm": 0.04346594214439392, + "learning_rate": 1.7277876687096708e-06, + "loss": 0.0003, + "num_input_tokens_seen": 26600104, + "step": 13584 + }, + { + "epoch": 1.8005301524188204, + "grad_norm": 0.19566258788108826, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.002, + "num_input_tokens_seen": 26601488, + "step": 13585 + }, + { + "epoch": 1.8006626905235255, + "grad_norm": 5.085778713226318, + "learning_rate": 1.7271273743107452e-06, + "loss": 0.1162, + "num_input_tokens_seen": 26604400, + "step": 13586 + }, + { + "epoch": 1.8007952286282305, + "grad_norm": 0.1026371568441391, + "learning_rate": 1.726797249460375e-06, + "loss": 0.0007, + "num_input_tokens_seen": 26605960, + "step": 13587 + }, + { + "epoch": 1.8009277667329358, + "grad_norm": 7.790092945098877, + "learning_rate": 1.7264671395178878e-06, + "loss": 0.174, + "num_input_tokens_seen": 26608232, + "step": 13588 + }, + { + "epoch": 1.8010603048376408, + "grad_norm": 5.643495559692383, + "learning_rate": 1.726137044489648e-06, + "loss": 0.0995, + "num_input_tokens_seen": 26609656, + "step": 13589 + }, + { + "epoch": 1.801192842942346, + "grad_norm": 0.09065470099449158, + "learning_rate": 1.7258069643820202e-06, + "loss": 0.0006, + "num_input_tokens_seen": 26611408, + "step": 13590 + }, + { + "epoch": 1.801325381047051, + "grad_norm": 0.048655908554792404, + "learning_rate": 1.7254768992013677e-06, + "loss": 0.0005, + "num_input_tokens_seen": 26612792, + "step": 13591 + }, + { + "epoch": 1.8014579191517561, + "grad_norm": 7.888485431671143, + "learning_rate": 1.725146848954054e-06, + "loss": 0.1185, + "num_input_tokens_seen": 26615728, + "step": 13592 + }, + { + "epoch": 1.8015904572564612, + "grad_norm": 3.8968069553375244, + "learning_rate": 1.7248168136464448e-06, + "loss": 0.0351, + "num_input_tokens_seen": 26617360, + "step": 13593 + }, + { + "epoch": 1.8017229953611662, + "grad_norm": 2.9872806072235107, + "learning_rate": 1.7244867932849019e-06, + "loss": 0.039, + "num_input_tokens_seen": 26619392, + "step": 13594 + }, + { + "epoch": 1.8018555334658715, + "grad_norm": 12.942712783813477, + "learning_rate": 1.7241567878757889e-06, + "loss": 0.2252, + "num_input_tokens_seen": 26621448, + "step": 13595 + }, + { + "epoch": 1.8019880715705765, + "grad_norm": 9.66864013671875, + "learning_rate": 1.7238267974254685e-06, + "loss": 0.0616, + "num_input_tokens_seen": 26623072, + "step": 13596 + }, + { + "epoch": 1.8021206096752818, + "grad_norm": 10.495954513549805, + "learning_rate": 1.7234968219403013e-06, + "loss": 0.0878, + "num_input_tokens_seen": 26624984, + "step": 13597 + }, + { + "epoch": 1.8022531477799868, + "grad_norm": 10.488879203796387, + "learning_rate": 1.7231668614266522e-06, + "loss": 0.22, + "num_input_tokens_seen": 26627216, + "step": 13598 + }, + { + "epoch": 1.8023856858846918, + "grad_norm": 7.993254661560059, + "learning_rate": 1.7228369158908822e-06, + "loss": 0.1477, + "num_input_tokens_seen": 26629384, + "step": 13599 + }, + { + "epoch": 1.8025182239893969, + "grad_norm": 0.19708530604839325, + "learning_rate": 1.7225069853393522e-06, + "loss": 0.0013, + "num_input_tokens_seen": 26631320, + "step": 13600 + }, + { + "epoch": 1.802650762094102, + "grad_norm": 4.13322639465332, + "learning_rate": 1.7221770697784243e-06, + "loss": 0.0199, + "num_input_tokens_seen": 26633664, + "step": 13601 + }, + { + "epoch": 1.8027833001988072, + "grad_norm": 8.290809631347656, + "learning_rate": 1.7218471692144587e-06, + "loss": 0.0152, + "num_input_tokens_seen": 26634968, + "step": 13602 + }, + { + "epoch": 1.8029158383035122, + "grad_norm": 0.03503880277276039, + "learning_rate": 1.7215172836538157e-06, + "loss": 0.0002, + "num_input_tokens_seen": 26637880, + "step": 13603 + }, + { + "epoch": 1.8030483764082175, + "grad_norm": 0.2906630337238312, + "learning_rate": 1.7211874131028577e-06, + "loss": 0.0018, + "num_input_tokens_seen": 26639416, + "step": 13604 + }, + { + "epoch": 1.8031809145129225, + "grad_norm": 6.96110200881958, + "learning_rate": 1.7208575575679436e-06, + "loss": 0.1194, + "num_input_tokens_seen": 26642928, + "step": 13605 + }, + { + "epoch": 1.8033134526176275, + "grad_norm": 0.18759137392044067, + "learning_rate": 1.720527717055433e-06, + "loss": 0.0009, + "num_input_tokens_seen": 26644112, + "step": 13606 + }, + { + "epoch": 1.8034459907223326, + "grad_norm": 4.520011901855469, + "learning_rate": 1.720197891571686e-06, + "loss": 0.0268, + "num_input_tokens_seen": 26645568, + "step": 13607 + }, + { + "epoch": 1.8035785288270376, + "grad_norm": 1.9104048013687134, + "learning_rate": 1.719868081123061e-06, + "loss": 0.0266, + "num_input_tokens_seen": 26647664, + "step": 13608 + }, + { + "epoch": 1.8037110669317429, + "grad_norm": 2.5512845516204834, + "learning_rate": 1.7195382857159176e-06, + "loss": 0.0091, + "num_input_tokens_seen": 26649456, + "step": 13609 + }, + { + "epoch": 1.8038436050364481, + "grad_norm": 5.563266754150391, + "learning_rate": 1.719208505356615e-06, + "loss": 0.0554, + "num_input_tokens_seen": 26650968, + "step": 13610 + }, + { + "epoch": 1.8039761431411532, + "grad_norm": 6.9681782722473145, + "learning_rate": 1.7188787400515114e-06, + "loss": 0.2038, + "num_input_tokens_seen": 26653208, + "step": 13611 + }, + { + "epoch": 1.8041086812458582, + "grad_norm": 0.06808216124773026, + "learning_rate": 1.7185489898069641e-06, + "loss": 0.0004, + "num_input_tokens_seen": 26654600, + "step": 13612 + }, + { + "epoch": 1.8042412193505633, + "grad_norm": 6.808321952819824, + "learning_rate": 1.7182192546293303e-06, + "loss": 0.1034, + "num_input_tokens_seen": 26656608, + "step": 13613 + }, + { + "epoch": 1.8043737574552683, + "grad_norm": 6.002277851104736, + "learning_rate": 1.71788953452497e-06, + "loss": 0.0643, + "num_input_tokens_seen": 26658680, + "step": 13614 + }, + { + "epoch": 1.8045062955599733, + "grad_norm": 0.5653548240661621, + "learning_rate": 1.7175598295002385e-06, + "loss": 0.0049, + "num_input_tokens_seen": 26660064, + "step": 13615 + }, + { + "epoch": 1.8046388336646786, + "grad_norm": 10.631072998046875, + "learning_rate": 1.7172301395614937e-06, + "loss": 0.2928, + "num_input_tokens_seen": 26662008, + "step": 13616 + }, + { + "epoch": 1.8047713717693838, + "grad_norm": 6.783126354217529, + "learning_rate": 1.7169004647150913e-06, + "loss": 0.0919, + "num_input_tokens_seen": 26663448, + "step": 13617 + }, + { + "epoch": 1.8049039098740889, + "grad_norm": 0.16702903807163239, + "learning_rate": 1.7165708049673878e-06, + "loss": 0.0006, + "num_input_tokens_seen": 26664744, + "step": 13618 + }, + { + "epoch": 1.805036447978794, + "grad_norm": 11.521212577819824, + "learning_rate": 1.7162411603247404e-06, + "loss": 0.2545, + "num_input_tokens_seen": 26666416, + "step": 13619 + }, + { + "epoch": 1.805168986083499, + "grad_norm": 3.0798070430755615, + "learning_rate": 1.7159115307935043e-06, + "loss": 0.0195, + "num_input_tokens_seen": 26668064, + "step": 13620 + }, + { + "epoch": 1.805301524188204, + "grad_norm": 15.158918380737305, + "learning_rate": 1.7155819163800344e-06, + "loss": 0.5022, + "num_input_tokens_seen": 26670824, + "step": 13621 + }, + { + "epoch": 1.8054340622929093, + "grad_norm": 3.5827548503875732, + "learning_rate": 1.7152523170906866e-06, + "loss": 0.0277, + "num_input_tokens_seen": 26672688, + "step": 13622 + }, + { + "epoch": 1.8055666003976143, + "grad_norm": 6.0312724113464355, + "learning_rate": 1.7149227329318146e-06, + "loss": 0.1122, + "num_input_tokens_seen": 26674328, + "step": 13623 + }, + { + "epoch": 1.8056991385023196, + "grad_norm": 14.950557708740234, + "learning_rate": 1.7145931639097743e-06, + "loss": 0.0874, + "num_input_tokens_seen": 26676328, + "step": 13624 + }, + { + "epoch": 1.8058316766070246, + "grad_norm": 0.04970107600092888, + "learning_rate": 1.71426361003092e-06, + "loss": 0.0004, + "num_input_tokens_seen": 26677920, + "step": 13625 + }, + { + "epoch": 1.8059642147117296, + "grad_norm": 0.009666800498962402, + "learning_rate": 1.7139340713016054e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26679504, + "step": 13626 + }, + { + "epoch": 1.8060967528164347, + "grad_norm": 0.17921720445156097, + "learning_rate": 1.7136045477281843e-06, + "loss": 0.001, + "num_input_tokens_seen": 26681328, + "step": 13627 + }, + { + "epoch": 1.8062292909211397, + "grad_norm": 14.232503890991211, + "learning_rate": 1.71327503931701e-06, + "loss": 0.21, + "num_input_tokens_seen": 26682720, + "step": 13628 + }, + { + "epoch": 1.806361829025845, + "grad_norm": 5.11373233795166, + "learning_rate": 1.712945546074435e-06, + "loss": 0.128, + "num_input_tokens_seen": 26684456, + "step": 13629 + }, + { + "epoch": 1.80649436713055, + "grad_norm": 10.042037010192871, + "learning_rate": 1.7126160680068127e-06, + "loss": 0.2852, + "num_input_tokens_seen": 26686768, + "step": 13630 + }, + { + "epoch": 1.8066269052352553, + "grad_norm": 0.00946376658976078, + "learning_rate": 1.7122866051204967e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26688088, + "step": 13631 + }, + { + "epoch": 1.8067594433399603, + "grad_norm": 2.2493419647216797, + "learning_rate": 1.711957157421838e-06, + "loss": 0.0352, + "num_input_tokens_seen": 26689744, + "step": 13632 + }, + { + "epoch": 1.8068919814446653, + "grad_norm": 9.617035865783691, + "learning_rate": 1.7116277249171891e-06, + "loss": 0.0983, + "num_input_tokens_seen": 26691496, + "step": 13633 + }, + { + "epoch": 1.8070245195493704, + "grad_norm": 0.009915320202708244, + "learning_rate": 1.711298307612901e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26692680, + "step": 13634 + }, + { + "epoch": 1.8071570576540754, + "grad_norm": 0.6328532695770264, + "learning_rate": 1.7109689055153261e-06, + "loss": 0.0042, + "num_input_tokens_seen": 26693848, + "step": 13635 + }, + { + "epoch": 1.8072895957587807, + "grad_norm": 5.911764621734619, + "learning_rate": 1.7106395186308146e-06, + "loss": 0.0704, + "num_input_tokens_seen": 26697232, + "step": 13636 + }, + { + "epoch": 1.8074221338634857, + "grad_norm": 0.10018294304609299, + "learning_rate": 1.7103101469657185e-06, + "loss": 0.0005, + "num_input_tokens_seen": 26699112, + "step": 13637 + }, + { + "epoch": 1.807554671968191, + "grad_norm": 5.8439812660217285, + "learning_rate": 1.709980790526387e-06, + "loss": 0.0521, + "num_input_tokens_seen": 26700936, + "step": 13638 + }, + { + "epoch": 1.807687210072896, + "grad_norm": 0.07201984524726868, + "learning_rate": 1.7096514493191701e-06, + "loss": 0.0005, + "num_input_tokens_seen": 26702760, + "step": 13639 + }, + { + "epoch": 1.807819748177601, + "grad_norm": 0.28943994641304016, + "learning_rate": 1.7093221233504198e-06, + "loss": 0.0012, + "num_input_tokens_seen": 26704312, + "step": 13640 + }, + { + "epoch": 1.807952286282306, + "grad_norm": 11.068626403808594, + "learning_rate": 1.7089928126264838e-06, + "loss": 0.1829, + "num_input_tokens_seen": 26706344, + "step": 13641 + }, + { + "epoch": 1.8080848243870111, + "grad_norm": 9.36474609375, + "learning_rate": 1.7086635171537125e-06, + "loss": 0.1136, + "num_input_tokens_seen": 26708640, + "step": 13642 + }, + { + "epoch": 1.8082173624917164, + "grad_norm": 5.960879325866699, + "learning_rate": 1.7083342369384548e-06, + "loss": 0.0941, + "num_input_tokens_seen": 26710696, + "step": 13643 + }, + { + "epoch": 1.8083499005964214, + "grad_norm": 5.929085731506348, + "learning_rate": 1.7080049719870578e-06, + "loss": 0.0572, + "num_input_tokens_seen": 26713216, + "step": 13644 + }, + { + "epoch": 1.8084824387011267, + "grad_norm": 11.264347076416016, + "learning_rate": 1.707675722305872e-06, + "loss": 0.3341, + "num_input_tokens_seen": 26715296, + "step": 13645 + }, + { + "epoch": 1.8086149768058317, + "grad_norm": 0.9554985761642456, + "learning_rate": 1.7073464879012453e-06, + "loss": 0.0138, + "num_input_tokens_seen": 26718112, + "step": 13646 + }, + { + "epoch": 1.8087475149105368, + "grad_norm": 7.452111721038818, + "learning_rate": 1.7070172687795245e-06, + "loss": 0.2013, + "num_input_tokens_seen": 26720048, + "step": 13647 + }, + { + "epoch": 1.8088800530152418, + "grad_norm": 12.128656387329102, + "learning_rate": 1.7066880649470584e-06, + "loss": 0.2462, + "num_input_tokens_seen": 26722552, + "step": 13648 + }, + { + "epoch": 1.8090125911199468, + "grad_norm": 0.11128123104572296, + "learning_rate": 1.7063588764101928e-06, + "loss": 0.0006, + "num_input_tokens_seen": 26724080, + "step": 13649 + }, + { + "epoch": 1.809145129224652, + "grad_norm": 0.059791263192892075, + "learning_rate": 1.7060297031752764e-06, + "loss": 0.0002, + "num_input_tokens_seen": 26726616, + "step": 13650 + }, + { + "epoch": 1.8092776673293574, + "grad_norm": 1.3709524869918823, + "learning_rate": 1.705700545248655e-06, + "loss": 0.0138, + "num_input_tokens_seen": 26728320, + "step": 13651 + }, + { + "epoch": 1.8094102054340624, + "grad_norm": 0.5233966708183289, + "learning_rate": 1.7053714026366753e-06, + "loss": 0.0016, + "num_input_tokens_seen": 26731032, + "step": 13652 + }, + { + "epoch": 1.8095427435387674, + "grad_norm": 4.8162407875061035, + "learning_rate": 1.7050422753456825e-06, + "loss": 0.0477, + "num_input_tokens_seen": 26732992, + "step": 13653 + }, + { + "epoch": 1.8096752816434725, + "grad_norm": 7.267902374267578, + "learning_rate": 1.7047131633820224e-06, + "loss": 0.1934, + "num_input_tokens_seen": 26735048, + "step": 13654 + }, + { + "epoch": 1.8098078197481775, + "grad_norm": 2.571226119995117, + "learning_rate": 1.7043840667520422e-06, + "loss": 0.0513, + "num_input_tokens_seen": 26736256, + "step": 13655 + }, + { + "epoch": 1.8099403578528825, + "grad_norm": 2.307441473007202, + "learning_rate": 1.704054985462086e-06, + "loss": 0.033, + "num_input_tokens_seen": 26737936, + "step": 13656 + }, + { + "epoch": 1.8100728959575878, + "grad_norm": 4.7839484214782715, + "learning_rate": 1.7037259195184991e-06, + "loss": 0.0635, + "num_input_tokens_seen": 26740024, + "step": 13657 + }, + { + "epoch": 1.810205434062293, + "grad_norm": 19.164243698120117, + "learning_rate": 1.7033968689276255e-06, + "loss": 0.1451, + "num_input_tokens_seen": 26741552, + "step": 13658 + }, + { + "epoch": 1.810337972166998, + "grad_norm": 0.010858150199055672, + "learning_rate": 1.7030678336958096e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26742904, + "step": 13659 + }, + { + "epoch": 1.8104705102717031, + "grad_norm": 0.1636306196451187, + "learning_rate": 1.7027388138293949e-06, + "loss": 0.001, + "num_input_tokens_seen": 26745016, + "step": 13660 + }, + { + "epoch": 1.8106030483764082, + "grad_norm": 5.617348670959473, + "learning_rate": 1.7024098093347268e-06, + "loss": 0.0648, + "num_input_tokens_seen": 26747032, + "step": 13661 + }, + { + "epoch": 1.8107355864811132, + "grad_norm": 0.15999209880828857, + "learning_rate": 1.7020808202181477e-06, + "loss": 0.0011, + "num_input_tokens_seen": 26748912, + "step": 13662 + }, + { + "epoch": 1.8108681245858185, + "grad_norm": 0.053092870861291885, + "learning_rate": 1.701751846486001e-06, + "loss": 0.0004, + "num_input_tokens_seen": 26751184, + "step": 13663 + }, + { + "epoch": 1.8110006626905235, + "grad_norm": 9.189582824707031, + "learning_rate": 1.7014228881446298e-06, + "loss": 0.065, + "num_input_tokens_seen": 26753176, + "step": 13664 + }, + { + "epoch": 1.8111332007952288, + "grad_norm": 0.009918643161654472, + "learning_rate": 1.701093945200375e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26754864, + "step": 13665 + }, + { + "epoch": 1.8112657388999338, + "grad_norm": 8.936410903930664, + "learning_rate": 1.7007650176595805e-06, + "loss": 0.1359, + "num_input_tokens_seen": 26756920, + "step": 13666 + }, + { + "epoch": 1.8113982770046388, + "grad_norm": 0.03011741116642952, + "learning_rate": 1.7004361055285884e-06, + "loss": 0.0002, + "num_input_tokens_seen": 26758456, + "step": 13667 + }, + { + "epoch": 1.8115308151093439, + "grad_norm": 10.357450485229492, + "learning_rate": 1.7001072088137394e-06, + "loss": 0.0476, + "num_input_tokens_seen": 26759872, + "step": 13668 + }, + { + "epoch": 1.811663353214049, + "grad_norm": 0.13559092581272125, + "learning_rate": 1.6997783275213753e-06, + "loss": 0.0007, + "num_input_tokens_seen": 26762368, + "step": 13669 + }, + { + "epoch": 1.8117958913187542, + "grad_norm": 0.04285366088151932, + "learning_rate": 1.6994494616578364e-06, + "loss": 0.0003, + "num_input_tokens_seen": 26763496, + "step": 13670 + }, + { + "epoch": 1.8119284294234592, + "grad_norm": 9.891592979431152, + "learning_rate": 1.6991206112294645e-06, + "loss": 0.1344, + "num_input_tokens_seen": 26765280, + "step": 13671 + }, + { + "epoch": 1.8120609675281645, + "grad_norm": 10.47362995147705, + "learning_rate": 1.6987917762426004e-06, + "loss": 0.3321, + "num_input_tokens_seen": 26767248, + "step": 13672 + }, + { + "epoch": 1.8121935056328695, + "grad_norm": 8.145865440368652, + "learning_rate": 1.6984629567035837e-06, + "loss": 0.0749, + "num_input_tokens_seen": 26769912, + "step": 13673 + }, + { + "epoch": 1.8123260437375746, + "grad_norm": 0.024657173082232475, + "learning_rate": 1.6981341526187534e-06, + "loss": 0.0002, + "num_input_tokens_seen": 26771528, + "step": 13674 + }, + { + "epoch": 1.8124585818422796, + "grad_norm": 1.251681923866272, + "learning_rate": 1.6978053639944492e-06, + "loss": 0.0104, + "num_input_tokens_seen": 26773696, + "step": 13675 + }, + { + "epoch": 1.8125911199469846, + "grad_norm": 0.3379829525947571, + "learning_rate": 1.6974765908370122e-06, + "loss": 0.0024, + "num_input_tokens_seen": 26774888, + "step": 13676 + }, + { + "epoch": 1.81272365805169, + "grad_norm": 10.027912139892578, + "learning_rate": 1.6971478331527795e-06, + "loss": 0.224, + "num_input_tokens_seen": 26777000, + "step": 13677 + }, + { + "epoch": 1.812856196156395, + "grad_norm": 0.03354233130812645, + "learning_rate": 1.6968190909480914e-06, + "loss": 0.0002, + "num_input_tokens_seen": 26778480, + "step": 13678 + }, + { + "epoch": 1.8129887342611002, + "grad_norm": 0.06725697219371796, + "learning_rate": 1.6964903642292844e-06, + "loss": 0.0005, + "num_input_tokens_seen": 26780264, + "step": 13679 + }, + { + "epoch": 1.8131212723658052, + "grad_norm": 8.696191787719727, + "learning_rate": 1.696161653002697e-06, + "loss": 0.0966, + "num_input_tokens_seen": 26782072, + "step": 13680 + }, + { + "epoch": 1.8132538104705103, + "grad_norm": 7.6293230056762695, + "learning_rate": 1.6958329572746684e-06, + "loss": 0.2191, + "num_input_tokens_seen": 26783656, + "step": 13681 + }, + { + "epoch": 1.8133863485752153, + "grad_norm": 10.231730461120605, + "learning_rate": 1.6955042770515355e-06, + "loss": 0.0424, + "num_input_tokens_seen": 26786080, + "step": 13682 + }, + { + "epoch": 1.8135188866799203, + "grad_norm": 4.857458591461182, + "learning_rate": 1.6951756123396343e-06, + "loss": 0.0611, + "num_input_tokens_seen": 26787528, + "step": 13683 + }, + { + "epoch": 1.8136514247846256, + "grad_norm": 4.548752784729004, + "learning_rate": 1.6948469631453028e-06, + "loss": 0.0782, + "num_input_tokens_seen": 26789128, + "step": 13684 + }, + { + "epoch": 1.8137839628893306, + "grad_norm": 3.4163079261779785, + "learning_rate": 1.6945183294748769e-06, + "loss": 0.0294, + "num_input_tokens_seen": 26791208, + "step": 13685 + }, + { + "epoch": 1.813916500994036, + "grad_norm": 9.968764305114746, + "learning_rate": 1.6941897113346933e-06, + "loss": 0.1163, + "num_input_tokens_seen": 26793928, + "step": 13686 + }, + { + "epoch": 1.814049039098741, + "grad_norm": 2.150343179702759, + "learning_rate": 1.693861108731089e-06, + "loss": 0.0146, + "num_input_tokens_seen": 26796392, + "step": 13687 + }, + { + "epoch": 1.814181577203446, + "grad_norm": 1.8683606386184692, + "learning_rate": 1.6935325216703983e-06, + "loss": 0.0198, + "num_input_tokens_seen": 26798760, + "step": 13688 + }, + { + "epoch": 1.814314115308151, + "grad_norm": 3.1207141876220703, + "learning_rate": 1.6932039501589564e-06, + "loss": 0.0068, + "num_input_tokens_seen": 26800560, + "step": 13689 + }, + { + "epoch": 1.814446653412856, + "grad_norm": 1.9343913793563843, + "learning_rate": 1.6928753942030995e-06, + "loss": 0.0057, + "num_input_tokens_seen": 26801992, + "step": 13690 + }, + { + "epoch": 1.8145791915175613, + "grad_norm": 0.008053869009017944, + "learning_rate": 1.6925468538091605e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26803168, + "step": 13691 + }, + { + "epoch": 1.8147117296222666, + "grad_norm": 10.861091613769531, + "learning_rate": 1.6922183289834763e-06, + "loss": 0.119, + "num_input_tokens_seen": 26805744, + "step": 13692 + }, + { + "epoch": 1.8148442677269716, + "grad_norm": 0.044451646506786346, + "learning_rate": 1.6918898197323801e-06, + "loss": 0.0003, + "num_input_tokens_seen": 26807080, + "step": 13693 + }, + { + "epoch": 1.8149768058316766, + "grad_norm": 6.328668117523193, + "learning_rate": 1.6915613260622052e-06, + "loss": 0.1501, + "num_input_tokens_seen": 26808856, + "step": 13694 + }, + { + "epoch": 1.8151093439363817, + "grad_norm": 0.015891091898083687, + "learning_rate": 1.6912328479792862e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26810664, + "step": 13695 + }, + { + "epoch": 1.8152418820410867, + "grad_norm": 5.2263407707214355, + "learning_rate": 1.6909043854899549e-06, + "loss": 0.2133, + "num_input_tokens_seen": 26812336, + "step": 13696 + }, + { + "epoch": 1.8153744201457918, + "grad_norm": 0.04794013872742653, + "learning_rate": 1.690575938600546e-06, + "loss": 0.0002, + "num_input_tokens_seen": 26814384, + "step": 13697 + }, + { + "epoch": 1.815506958250497, + "grad_norm": 0.009285589680075645, + "learning_rate": 1.6902475073173907e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26816448, + "step": 13698 + }, + { + "epoch": 1.8156394963552023, + "grad_norm": 3.815653085708618, + "learning_rate": 1.689919091646823e-06, + "loss": 0.0531, + "num_input_tokens_seen": 26818128, + "step": 13699 + }, + { + "epoch": 1.8157720344599073, + "grad_norm": 0.13947026431560516, + "learning_rate": 1.6895906915951735e-06, + "loss": 0.0006, + "num_input_tokens_seen": 26819392, + "step": 13700 + }, + { + "epoch": 1.8159045725646124, + "grad_norm": 3.3179209232330322, + "learning_rate": 1.689262307168774e-06, + "loss": 0.0241, + "num_input_tokens_seen": 26820648, + "step": 13701 + }, + { + "epoch": 1.8160371106693174, + "grad_norm": 3.806905746459961, + "learning_rate": 1.6889339383739575e-06, + "loss": 0.0397, + "num_input_tokens_seen": 26822816, + "step": 13702 + }, + { + "epoch": 1.8161696487740224, + "grad_norm": 13.256583213806152, + "learning_rate": 1.6886055852170542e-06, + "loss": 0.168, + "num_input_tokens_seen": 26824760, + "step": 13703 + }, + { + "epoch": 1.8163021868787277, + "grad_norm": 7.474447250366211, + "learning_rate": 1.6882772477043947e-06, + "loss": 0.0962, + "num_input_tokens_seen": 26825984, + "step": 13704 + }, + { + "epoch": 1.8164347249834327, + "grad_norm": 0.0040419134311378, + "learning_rate": 1.6879489258423098e-06, + "loss": 0.0, + "num_input_tokens_seen": 26827184, + "step": 13705 + }, + { + "epoch": 1.816567263088138, + "grad_norm": 7.081380367279053, + "learning_rate": 1.6876206196371292e-06, + "loss": 0.1079, + "num_input_tokens_seen": 26829016, + "step": 13706 + }, + { + "epoch": 1.816699801192843, + "grad_norm": 0.09496753662824631, + "learning_rate": 1.6872923290951838e-06, + "loss": 0.0004, + "num_input_tokens_seen": 26830672, + "step": 13707 + }, + { + "epoch": 1.816832339297548, + "grad_norm": 0.4694381058216095, + "learning_rate": 1.6869640542228035e-06, + "loss": 0.0037, + "num_input_tokens_seen": 26832448, + "step": 13708 + }, + { + "epoch": 1.816964877402253, + "grad_norm": 7.322230815887451, + "learning_rate": 1.686635795026317e-06, + "loss": 0.171, + "num_input_tokens_seen": 26834768, + "step": 13709 + }, + { + "epoch": 1.8170974155069581, + "grad_norm": 0.045652344822883606, + "learning_rate": 1.6863075515120537e-06, + "loss": 0.0003, + "num_input_tokens_seen": 26836552, + "step": 13710 + }, + { + "epoch": 1.8172299536116634, + "grad_norm": 4.509487152099609, + "learning_rate": 1.685979323686341e-06, + "loss": 0.0526, + "num_input_tokens_seen": 26838272, + "step": 13711 + }, + { + "epoch": 1.8173624917163684, + "grad_norm": 9.306081771850586, + "learning_rate": 1.6856511115555094e-06, + "loss": 0.2782, + "num_input_tokens_seen": 26840216, + "step": 13712 + }, + { + "epoch": 1.8174950298210737, + "grad_norm": 0.01784774661064148, + "learning_rate": 1.685322915125886e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26841656, + "step": 13713 + }, + { + "epoch": 1.8176275679257787, + "grad_norm": 1.1983287334442139, + "learning_rate": 1.6849947344037992e-06, + "loss": 0.0116, + "num_input_tokens_seen": 26844424, + "step": 13714 + }, + { + "epoch": 1.8177601060304838, + "grad_norm": 6.389251232147217, + "learning_rate": 1.6846665693955755e-06, + "loss": 0.1549, + "num_input_tokens_seen": 26846376, + "step": 13715 + }, + { + "epoch": 1.8178926441351888, + "grad_norm": 0.005492255557328463, + "learning_rate": 1.6843384201075435e-06, + "loss": 0.0, + "num_input_tokens_seen": 26847488, + "step": 13716 + }, + { + "epoch": 1.8180251822398938, + "grad_norm": 0.051206156611442566, + "learning_rate": 1.6840102865460283e-06, + "loss": 0.0002, + "num_input_tokens_seen": 26849768, + "step": 13717 + }, + { + "epoch": 1.818157720344599, + "grad_norm": 4.382839679718018, + "learning_rate": 1.6836821687173587e-06, + "loss": 0.0479, + "num_input_tokens_seen": 26852280, + "step": 13718 + }, + { + "epoch": 1.8182902584493041, + "grad_norm": 9.08458423614502, + "learning_rate": 1.6833540666278597e-06, + "loss": 0.1701, + "num_input_tokens_seen": 26854712, + "step": 13719 + }, + { + "epoch": 1.8184227965540094, + "grad_norm": 0.05404914170503616, + "learning_rate": 1.6830259802838583e-06, + "loss": 0.0003, + "num_input_tokens_seen": 26857440, + "step": 13720 + }, + { + "epoch": 1.8185553346587144, + "grad_norm": 6.850743293762207, + "learning_rate": 1.682697909691679e-06, + "loss": 0.1703, + "num_input_tokens_seen": 26859160, + "step": 13721 + }, + { + "epoch": 1.8186878727634195, + "grad_norm": 5.739080429077148, + "learning_rate": 1.6823698548576472e-06, + "loss": 0.0481, + "num_input_tokens_seen": 26860976, + "step": 13722 + }, + { + "epoch": 1.8188204108681245, + "grad_norm": 0.010712482035160065, + "learning_rate": 1.6820418157880897e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26862392, + "step": 13723 + }, + { + "epoch": 1.8189529489728296, + "grad_norm": 12.20919132232666, + "learning_rate": 1.6817137924893294e-06, + "loss": 0.1654, + "num_input_tokens_seen": 26864872, + "step": 13724 + }, + { + "epoch": 1.8190854870775348, + "grad_norm": 0.7742608785629272, + "learning_rate": 1.6813857849676928e-06, + "loss": 0.0062, + "num_input_tokens_seen": 26866232, + "step": 13725 + }, + { + "epoch": 1.8192180251822399, + "grad_norm": 0.014639589935541153, + "learning_rate": 1.6810577932295027e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26868664, + "step": 13726 + }, + { + "epoch": 1.819350563286945, + "grad_norm": 6.568999767303467, + "learning_rate": 1.6807298172810823e-06, + "loss": 0.1415, + "num_input_tokens_seen": 26870320, + "step": 13727 + }, + { + "epoch": 1.8194831013916501, + "grad_norm": 4.937580585479736, + "learning_rate": 1.6804018571287567e-06, + "loss": 0.1147, + "num_input_tokens_seen": 26872040, + "step": 13728 + }, + { + "epoch": 1.8196156394963552, + "grad_norm": 4.05419921875, + "learning_rate": 1.6800739127788496e-06, + "loss": 0.0751, + "num_input_tokens_seen": 26873576, + "step": 13729 + }, + { + "epoch": 1.8197481776010602, + "grad_norm": 0.010826563462615013, + "learning_rate": 1.6797459842376822e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26875024, + "step": 13730 + }, + { + "epoch": 1.8198807157057653, + "grad_norm": 13.572599411010742, + "learning_rate": 1.6794180715115789e-06, + "loss": 0.2934, + "num_input_tokens_seen": 26876344, + "step": 13731 + }, + { + "epoch": 1.8200132538104705, + "grad_norm": 0.021069012582302094, + "learning_rate": 1.6790901746068596e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26878216, + "step": 13732 + }, + { + "epoch": 1.8201457919151756, + "grad_norm": 3.3978617191314697, + "learning_rate": 1.6787622935298492e-06, + "loss": 0.0416, + "num_input_tokens_seen": 26881376, + "step": 13733 + }, + { + "epoch": 1.8202783300198808, + "grad_norm": 8.631749153137207, + "learning_rate": 1.6784344282868687e-06, + "loss": 0.1326, + "num_input_tokens_seen": 26883344, + "step": 13734 + }, + { + "epoch": 1.8204108681245859, + "grad_norm": 12.571732521057129, + "learning_rate": 1.6781065788842389e-06, + "loss": 0.3232, + "num_input_tokens_seen": 26885592, + "step": 13735 + }, + { + "epoch": 1.820543406229291, + "grad_norm": 0.009790387004613876, + "learning_rate": 1.6777787453282809e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26887328, + "step": 13736 + }, + { + "epoch": 1.820675944333996, + "grad_norm": 0.017618641257286072, + "learning_rate": 1.6774509276253153e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26888496, + "step": 13737 + }, + { + "epoch": 1.820808482438701, + "grad_norm": 6.8649492263793945, + "learning_rate": 1.6771231257816644e-06, + "loss": 0.1033, + "num_input_tokens_seen": 26890752, + "step": 13738 + }, + { + "epoch": 1.8209410205434062, + "grad_norm": 11.55799674987793, + "learning_rate": 1.6767953398036472e-06, + "loss": 0.2336, + "num_input_tokens_seen": 26892608, + "step": 13739 + }, + { + "epoch": 1.8210735586481115, + "grad_norm": 0.009338089264929295, + "learning_rate": 1.6764675696975839e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26893728, + "step": 13740 + }, + { + "epoch": 1.8212060967528165, + "grad_norm": 3.628601551055908, + "learning_rate": 1.676139815469794e-06, + "loss": 0.0603, + "num_input_tokens_seen": 26895360, + "step": 13741 + }, + { + "epoch": 1.8213386348575216, + "grad_norm": 0.8932361602783203, + "learning_rate": 1.6758120771265956e-06, + "loss": 0.0059, + "num_input_tokens_seen": 26896688, + "step": 13742 + }, + { + "epoch": 1.8214711729622266, + "grad_norm": 2.2833235263824463, + "learning_rate": 1.6754843546743097e-06, + "loss": 0.012, + "num_input_tokens_seen": 26897864, + "step": 13743 + }, + { + "epoch": 1.8216037110669316, + "grad_norm": 0.19593948125839233, + "learning_rate": 1.6751566481192546e-06, + "loss": 0.0011, + "num_input_tokens_seen": 26899728, + "step": 13744 + }, + { + "epoch": 1.821736249171637, + "grad_norm": 9.055383682250977, + "learning_rate": 1.6748289574677478e-06, + "loss": 0.1494, + "num_input_tokens_seen": 26901688, + "step": 13745 + }, + { + "epoch": 1.821868787276342, + "grad_norm": 0.0995856523513794, + "learning_rate": 1.6745012827261086e-06, + "loss": 0.0006, + "num_input_tokens_seen": 26903544, + "step": 13746 + }, + { + "epoch": 1.8220013253810472, + "grad_norm": 0.008811777457594872, + "learning_rate": 1.6741736239006537e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26904848, + "step": 13747 + }, + { + "epoch": 1.8221338634857522, + "grad_norm": 6.910976886749268, + "learning_rate": 1.6738459809977003e-06, + "loss": 0.1535, + "num_input_tokens_seen": 26907536, + "step": 13748 + }, + { + "epoch": 1.8222664015904573, + "grad_norm": 7.716386795043945, + "learning_rate": 1.6735183540235679e-06, + "loss": 0.1874, + "num_input_tokens_seen": 26909880, + "step": 13749 + }, + { + "epoch": 1.8223989396951623, + "grad_norm": 1.5253716707229614, + "learning_rate": 1.6731907429845712e-06, + "loss": 0.0262, + "num_input_tokens_seen": 26912184, + "step": 13750 + }, + { + "epoch": 1.8225314777998673, + "grad_norm": 2.856844663619995, + "learning_rate": 1.6728631478870268e-06, + "loss": 0.0343, + "num_input_tokens_seen": 26914560, + "step": 13751 + }, + { + "epoch": 1.8226640159045726, + "grad_norm": 2.0767605304718018, + "learning_rate": 1.6725355687372523e-06, + "loss": 0.0837, + "num_input_tokens_seen": 26915920, + "step": 13752 + }, + { + "epoch": 1.8227965540092776, + "grad_norm": 0.07072246819734573, + "learning_rate": 1.6722080055415617e-06, + "loss": 0.0004, + "num_input_tokens_seen": 26917720, + "step": 13753 + }, + { + "epoch": 1.822929092113983, + "grad_norm": 2.274055004119873, + "learning_rate": 1.6718804583062726e-06, + "loss": 0.0202, + "num_input_tokens_seen": 26920904, + "step": 13754 + }, + { + "epoch": 1.823061630218688, + "grad_norm": 9.156266212463379, + "learning_rate": 1.6715529270377002e-06, + "loss": 0.1091, + "num_input_tokens_seen": 26923824, + "step": 13755 + }, + { + "epoch": 1.823194168323393, + "grad_norm": 0.47864511609077454, + "learning_rate": 1.6712254117421584e-06, + "loss": 0.0034, + "num_input_tokens_seen": 26925224, + "step": 13756 + }, + { + "epoch": 1.823326706428098, + "grad_norm": 9.103716850280762, + "learning_rate": 1.6708979124259623e-06, + "loss": 0.2924, + "num_input_tokens_seen": 26927512, + "step": 13757 + }, + { + "epoch": 1.823459244532803, + "grad_norm": 0.0825406014919281, + "learning_rate": 1.6705704290954256e-06, + "loss": 0.0004, + "num_input_tokens_seen": 26929104, + "step": 13758 + }, + { + "epoch": 1.8235917826375083, + "grad_norm": 0.38563787937164307, + "learning_rate": 1.6702429617568642e-06, + "loss": 0.0023, + "num_input_tokens_seen": 26931560, + "step": 13759 + }, + { + "epoch": 1.8237243207422134, + "grad_norm": 11.694995880126953, + "learning_rate": 1.6699155104165903e-06, + "loss": 0.3276, + "num_input_tokens_seen": 26934328, + "step": 13760 + }, + { + "epoch": 1.8238568588469186, + "grad_norm": 0.1280740350484848, + "learning_rate": 1.669588075080919e-06, + "loss": 0.0008, + "num_input_tokens_seen": 26936952, + "step": 13761 + }, + { + "epoch": 1.8239893969516237, + "grad_norm": 11.53473949432373, + "learning_rate": 1.6692606557561617e-06, + "loss": 0.2586, + "num_input_tokens_seen": 26938784, + "step": 13762 + }, + { + "epoch": 1.8241219350563287, + "grad_norm": 0.010729270987212658, + "learning_rate": 1.6689332524486316e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26939944, + "step": 13763 + }, + { + "epoch": 1.8242544731610337, + "grad_norm": 1.6182721853256226, + "learning_rate": 1.6686058651646424e-06, + "loss": 0.035, + "num_input_tokens_seen": 26941680, + "step": 13764 + }, + { + "epoch": 1.8243870112657388, + "grad_norm": 15.488408088684082, + "learning_rate": 1.6682784939105062e-06, + "loss": 0.3258, + "num_input_tokens_seen": 26944840, + "step": 13765 + }, + { + "epoch": 1.824519549370444, + "grad_norm": 8.061427116394043, + "learning_rate": 1.6679511386925337e-06, + "loss": 0.0239, + "num_input_tokens_seen": 26946376, + "step": 13766 + }, + { + "epoch": 1.824652087475149, + "grad_norm": 3.0624942779541016, + "learning_rate": 1.6676237995170378e-06, + "loss": 0.036, + "num_input_tokens_seen": 26947840, + "step": 13767 + }, + { + "epoch": 1.8247846255798543, + "grad_norm": 0.7073823809623718, + "learning_rate": 1.6672964763903283e-06, + "loss": 0.0034, + "num_input_tokens_seen": 26949824, + "step": 13768 + }, + { + "epoch": 1.8249171636845594, + "grad_norm": 1.0578587055206299, + "learning_rate": 1.6669691693187173e-06, + "loss": 0.0062, + "num_input_tokens_seen": 26951168, + "step": 13769 + }, + { + "epoch": 1.8250497017892644, + "grad_norm": 10.005724906921387, + "learning_rate": 1.6666418783085164e-06, + "loss": 0.1139, + "num_input_tokens_seen": 26953392, + "step": 13770 + }, + { + "epoch": 1.8251822398939694, + "grad_norm": 0.40264299511909485, + "learning_rate": 1.6663146033660342e-06, + "loss": 0.0027, + "num_input_tokens_seen": 26954656, + "step": 13771 + }, + { + "epoch": 1.8253147779986745, + "grad_norm": 3.569605588912964, + "learning_rate": 1.6659873444975822e-06, + "loss": 0.0326, + "num_input_tokens_seen": 26956848, + "step": 13772 + }, + { + "epoch": 1.8254473161033797, + "grad_norm": 14.310128211975098, + "learning_rate": 1.6656601017094695e-06, + "loss": 0.6194, + "num_input_tokens_seen": 26959960, + "step": 13773 + }, + { + "epoch": 1.8255798542080848, + "grad_norm": 2.8636529445648193, + "learning_rate": 1.6653328750080044e-06, + "loss": 0.0351, + "num_input_tokens_seen": 26961912, + "step": 13774 + }, + { + "epoch": 1.82571239231279, + "grad_norm": 0.026560654863715172, + "learning_rate": 1.6650056643994984e-06, + "loss": 0.0002, + "num_input_tokens_seen": 26963896, + "step": 13775 + }, + { + "epoch": 1.825844930417495, + "grad_norm": 0.09332794696092606, + "learning_rate": 1.664678469890259e-06, + "loss": 0.0004, + "num_input_tokens_seen": 26965544, + "step": 13776 + }, + { + "epoch": 1.8259774685222, + "grad_norm": 6.64898157119751, + "learning_rate": 1.664351291486595e-06, + "loss": 0.2414, + "num_input_tokens_seen": 26968288, + "step": 13777 + }, + { + "epoch": 1.8261100066269051, + "grad_norm": 10.039253234863281, + "learning_rate": 1.6640241291948149e-06, + "loss": 0.1969, + "num_input_tokens_seen": 26970664, + "step": 13778 + }, + { + "epoch": 1.8262425447316102, + "grad_norm": 0.018191635608673096, + "learning_rate": 1.6636969830212252e-06, + "loss": 0.0001, + "num_input_tokens_seen": 26973448, + "step": 13779 + }, + { + "epoch": 1.8263750828363154, + "grad_norm": 7.044430732727051, + "learning_rate": 1.6633698529721356e-06, + "loss": 0.186, + "num_input_tokens_seen": 26975616, + "step": 13780 + }, + { + "epoch": 1.8265076209410207, + "grad_norm": 0.45687752962112427, + "learning_rate": 1.663042739053852e-06, + "loss": 0.0018, + "num_input_tokens_seen": 26978536, + "step": 13781 + }, + { + "epoch": 1.8266401590457257, + "grad_norm": 0.25599250197410583, + "learning_rate": 1.6627156412726822e-06, + "loss": 0.0017, + "num_input_tokens_seen": 26980416, + "step": 13782 + }, + { + "epoch": 1.8267726971504308, + "grad_norm": 0.06131959706544876, + "learning_rate": 1.6623885596349322e-06, + "loss": 0.0004, + "num_input_tokens_seen": 26983392, + "step": 13783 + }, + { + "epoch": 1.8269052352551358, + "grad_norm": 1.2535897493362427, + "learning_rate": 1.6620614941469077e-06, + "loss": 0.0107, + "num_input_tokens_seen": 26985216, + "step": 13784 + }, + { + "epoch": 1.8270377733598409, + "grad_norm": 12.878814697265625, + "learning_rate": 1.661734444814917e-06, + "loss": 0.3665, + "num_input_tokens_seen": 26987192, + "step": 13785 + }, + { + "epoch": 1.827170311464546, + "grad_norm": 0.008703788742423058, + "learning_rate": 1.661407411645264e-06, + "loss": 0.0, + "num_input_tokens_seen": 26989336, + "step": 13786 + }, + { + "epoch": 1.8273028495692512, + "grad_norm": 0.16823241114616394, + "learning_rate": 1.6610803946442546e-06, + "loss": 0.0008, + "num_input_tokens_seen": 26990960, + "step": 13787 + }, + { + "epoch": 1.8274353876739564, + "grad_norm": 1.7259310483932495, + "learning_rate": 1.6607533938181947e-06, + "loss": 0.0045, + "num_input_tokens_seen": 26992416, + "step": 13788 + }, + { + "epoch": 1.8275679257786615, + "grad_norm": 5.079961776733398, + "learning_rate": 1.6604264091733868e-06, + "loss": 0.1147, + "num_input_tokens_seen": 26995592, + "step": 13789 + }, + { + "epoch": 1.8277004638833665, + "grad_norm": 13.560264587402344, + "learning_rate": 1.6600994407161374e-06, + "loss": 0.3034, + "num_input_tokens_seen": 26998032, + "step": 13790 + }, + { + "epoch": 1.8278330019880715, + "grad_norm": 13.923527717590332, + "learning_rate": 1.659772488452751e-06, + "loss": 0.1741, + "num_input_tokens_seen": 27000152, + "step": 13791 + }, + { + "epoch": 1.8279655400927766, + "grad_norm": 5.546554088592529, + "learning_rate": 1.6594455523895297e-06, + "loss": 0.0912, + "num_input_tokens_seen": 27001560, + "step": 13792 + }, + { + "epoch": 1.8280980781974818, + "grad_norm": 0.16090849041938782, + "learning_rate": 1.659118632532779e-06, + "loss": 0.0008, + "num_input_tokens_seen": 27004304, + "step": 13793 + }, + { + "epoch": 1.8282306163021869, + "grad_norm": 7.135461807250977, + "learning_rate": 1.6587917288888e-06, + "loss": 0.1001, + "num_input_tokens_seen": 27005992, + "step": 13794 + }, + { + "epoch": 1.8283631544068921, + "grad_norm": 12.784433364868164, + "learning_rate": 1.6584648414638978e-06, + "loss": 0.3311, + "num_input_tokens_seen": 27007816, + "step": 13795 + }, + { + "epoch": 1.8284956925115972, + "grad_norm": 8.874312400817871, + "learning_rate": 1.6581379702643735e-06, + "loss": 0.2221, + "num_input_tokens_seen": 27009736, + "step": 13796 + }, + { + "epoch": 1.8286282306163022, + "grad_norm": 3.3028244972229004, + "learning_rate": 1.6578111152965304e-06, + "loss": 0.0928, + "num_input_tokens_seen": 27011040, + "step": 13797 + }, + { + "epoch": 1.8287607687210072, + "grad_norm": 0.1971087008714676, + "learning_rate": 1.6574842765666696e-06, + "loss": 0.0017, + "num_input_tokens_seen": 27012776, + "step": 13798 + }, + { + "epoch": 1.8288933068257123, + "grad_norm": 11.006721496582031, + "learning_rate": 1.6571574540810924e-06, + "loss": 0.143, + "num_input_tokens_seen": 27015296, + "step": 13799 + }, + { + "epoch": 1.8290258449304175, + "grad_norm": 13.00698184967041, + "learning_rate": 1.6568306478461021e-06, + "loss": 0.3248, + "num_input_tokens_seen": 27017784, + "step": 13800 + }, + { + "epoch": 1.8291583830351226, + "grad_norm": 0.019007472321391106, + "learning_rate": 1.6565038578679981e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27019336, + "step": 13801 + }, + { + "epoch": 1.8292909211398278, + "grad_norm": 10.42569637298584, + "learning_rate": 1.6561770841530822e-06, + "loss": 0.225, + "num_input_tokens_seen": 27021536, + "step": 13802 + }, + { + "epoch": 1.8294234592445329, + "grad_norm": 0.6118856072425842, + "learning_rate": 1.6558503267076542e-06, + "loss": 0.0009, + "num_input_tokens_seen": 27023296, + "step": 13803 + }, + { + "epoch": 1.829555997349238, + "grad_norm": 9.374180793762207, + "learning_rate": 1.6555235855380137e-06, + "loss": 0.1746, + "num_input_tokens_seen": 27025944, + "step": 13804 + }, + { + "epoch": 1.829688535453943, + "grad_norm": 0.3436411917209625, + "learning_rate": 1.6551968606504604e-06, + "loss": 0.0019, + "num_input_tokens_seen": 27028376, + "step": 13805 + }, + { + "epoch": 1.829821073558648, + "grad_norm": 3.397745132446289, + "learning_rate": 1.6548701520512954e-06, + "loss": 0.0346, + "num_input_tokens_seen": 27029856, + "step": 13806 + }, + { + "epoch": 1.8299536116633532, + "grad_norm": 5.769129753112793, + "learning_rate": 1.6545434597468169e-06, + "loss": 0.076, + "num_input_tokens_seen": 27031688, + "step": 13807 + }, + { + "epoch": 1.8300861497680583, + "grad_norm": 5.274041175842285, + "learning_rate": 1.6542167837433237e-06, + "loss": 0.1264, + "num_input_tokens_seen": 27033400, + "step": 13808 + }, + { + "epoch": 1.8302186878727635, + "grad_norm": 2.8539609909057617, + "learning_rate": 1.6538901240471146e-06, + "loss": 0.0187, + "num_input_tokens_seen": 27035072, + "step": 13809 + }, + { + "epoch": 1.8303512259774686, + "grad_norm": 6.022241115570068, + "learning_rate": 1.6535634806644868e-06, + "loss": 0.171, + "num_input_tokens_seen": 27037248, + "step": 13810 + }, + { + "epoch": 1.8304837640821736, + "grad_norm": 5.588782787322998, + "learning_rate": 1.6532368536017393e-06, + "loss": 0.0299, + "num_input_tokens_seen": 27040104, + "step": 13811 + }, + { + "epoch": 1.8306163021868787, + "grad_norm": 11.182650566101074, + "learning_rate": 1.65291024286517e-06, + "loss": 0.3177, + "num_input_tokens_seen": 27042304, + "step": 13812 + }, + { + "epoch": 1.8307488402915837, + "grad_norm": 2.9568207263946533, + "learning_rate": 1.652583648461075e-06, + "loss": 0.0317, + "num_input_tokens_seen": 27044256, + "step": 13813 + }, + { + "epoch": 1.830881378396289, + "grad_norm": 7.3278117179870605, + "learning_rate": 1.6522570703957524e-06, + "loss": 0.1726, + "num_input_tokens_seen": 27046656, + "step": 13814 + }, + { + "epoch": 1.831013916500994, + "grad_norm": 6.596843719482422, + "learning_rate": 1.6519305086754973e-06, + "loss": 0.0889, + "num_input_tokens_seen": 27048896, + "step": 13815 + }, + { + "epoch": 1.8311464546056992, + "grad_norm": 1.1608270406723022, + "learning_rate": 1.6516039633066072e-06, + "loss": 0.0064, + "num_input_tokens_seen": 27050328, + "step": 13816 + }, + { + "epoch": 1.8312789927104043, + "grad_norm": 0.11678385734558105, + "learning_rate": 1.651277434295379e-06, + "loss": 0.0009, + "num_input_tokens_seen": 27052944, + "step": 13817 + }, + { + "epoch": 1.8314115308151093, + "grad_norm": 1.559276819229126, + "learning_rate": 1.6509509216481074e-06, + "loss": 0.0065, + "num_input_tokens_seen": 27054648, + "step": 13818 + }, + { + "epoch": 1.8315440689198144, + "grad_norm": 14.69687557220459, + "learning_rate": 1.6506244253710867e-06, + "loss": 0.2799, + "num_input_tokens_seen": 27056528, + "step": 13819 + }, + { + "epoch": 1.8316766070245194, + "grad_norm": 8.848258972167969, + "learning_rate": 1.6502979454706132e-06, + "loss": 0.1513, + "num_input_tokens_seen": 27058360, + "step": 13820 + }, + { + "epoch": 1.8318091451292247, + "grad_norm": 0.059766095131635666, + "learning_rate": 1.6499714819529822e-06, + "loss": 0.0004, + "num_input_tokens_seen": 27061080, + "step": 13821 + }, + { + "epoch": 1.83194168323393, + "grad_norm": 5.284430027008057, + "learning_rate": 1.6496450348244866e-06, + "loss": 0.1444, + "num_input_tokens_seen": 27063464, + "step": 13822 + }, + { + "epoch": 1.832074221338635, + "grad_norm": 0.013883347623050213, + "learning_rate": 1.6493186040914221e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27065136, + "step": 13823 + }, + { + "epoch": 1.83220675944334, + "grad_norm": 0.011097117327153683, + "learning_rate": 1.6489921897600814e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27066456, + "step": 13824 + }, + { + "epoch": 1.832339297548045, + "grad_norm": 3.7872228622436523, + "learning_rate": 1.6486657918367577e-06, + "loss": 0.0279, + "num_input_tokens_seen": 27067928, + "step": 13825 + }, + { + "epoch": 1.83247183565275, + "grad_norm": 2.3467352390289307, + "learning_rate": 1.648339410327746e-06, + "loss": 0.0238, + "num_input_tokens_seen": 27069232, + "step": 13826 + }, + { + "epoch": 1.832604373757455, + "grad_norm": 6.7321014404296875, + "learning_rate": 1.6480130452393378e-06, + "loss": 0.0255, + "num_input_tokens_seen": 27070784, + "step": 13827 + }, + { + "epoch": 1.8327369118621604, + "grad_norm": 1.1521660089492798, + "learning_rate": 1.6476866965778255e-06, + "loss": 0.0061, + "num_input_tokens_seen": 27072480, + "step": 13828 + }, + { + "epoch": 1.8328694499668656, + "grad_norm": 7.264502048492432, + "learning_rate": 1.6473603643495022e-06, + "loss": 0.0687, + "num_input_tokens_seen": 27074232, + "step": 13829 + }, + { + "epoch": 1.8330019880715707, + "grad_norm": 0.0028025333303958178, + "learning_rate": 1.6470340485606589e-06, + "loss": 0.0, + "num_input_tokens_seen": 27075144, + "step": 13830 + }, + { + "epoch": 1.8331345261762757, + "grad_norm": 9.593247413635254, + "learning_rate": 1.6467077492175868e-06, + "loss": 0.2325, + "num_input_tokens_seen": 27077752, + "step": 13831 + }, + { + "epoch": 1.8332670642809807, + "grad_norm": 4.072556018829346, + "learning_rate": 1.646381466326579e-06, + "loss": 0.0884, + "num_input_tokens_seen": 27079960, + "step": 13832 + }, + { + "epoch": 1.8333996023856858, + "grad_norm": 4.867427825927734, + "learning_rate": 1.6460551998939256e-06, + "loss": 0.0427, + "num_input_tokens_seen": 27083048, + "step": 13833 + }, + { + "epoch": 1.833532140490391, + "grad_norm": 5.690303325653076, + "learning_rate": 1.6457289499259163e-06, + "loss": 0.206, + "num_input_tokens_seen": 27085240, + "step": 13834 + }, + { + "epoch": 1.833664678595096, + "grad_norm": 0.08800266683101654, + "learning_rate": 1.6454027164288427e-06, + "loss": 0.0006, + "num_input_tokens_seen": 27086888, + "step": 13835 + }, + { + "epoch": 1.8337972166998013, + "grad_norm": 11.869974136352539, + "learning_rate": 1.6450764994089936e-06, + "loss": 0.2796, + "num_input_tokens_seen": 27089128, + "step": 13836 + }, + { + "epoch": 1.8339297548045064, + "grad_norm": 2.932394027709961, + "learning_rate": 1.6447502988726593e-06, + "loss": 0.0314, + "num_input_tokens_seen": 27091160, + "step": 13837 + }, + { + "epoch": 1.8340622929092114, + "grad_norm": 0.0045118173584342, + "learning_rate": 1.6444241148261304e-06, + "loss": 0.0, + "num_input_tokens_seen": 27092552, + "step": 13838 + }, + { + "epoch": 1.8341948310139164, + "grad_norm": 9.20363712310791, + "learning_rate": 1.6440979472756937e-06, + "loss": 0.1359, + "num_input_tokens_seen": 27095384, + "step": 13839 + }, + { + "epoch": 1.8343273691186215, + "grad_norm": 6.207511901855469, + "learning_rate": 1.6437717962276401e-06, + "loss": 0.144, + "num_input_tokens_seen": 27097704, + "step": 13840 + }, + { + "epoch": 1.8344599072233267, + "grad_norm": 0.47158950567245483, + "learning_rate": 1.6434456616882555e-06, + "loss": 0.0027, + "num_input_tokens_seen": 27100760, + "step": 13841 + }, + { + "epoch": 1.8345924453280318, + "grad_norm": 3.188619375228882, + "learning_rate": 1.6431195436638306e-06, + "loss": 0.0398, + "num_input_tokens_seen": 27102584, + "step": 13842 + }, + { + "epoch": 1.834724983432737, + "grad_norm": 8.77248477935791, + "learning_rate": 1.6427934421606518e-06, + "loss": 0.1688, + "num_input_tokens_seen": 27104424, + "step": 13843 + }, + { + "epoch": 1.834857521537442, + "grad_norm": 0.036850396543741226, + "learning_rate": 1.642467357185007e-06, + "loss": 0.0004, + "num_input_tokens_seen": 27105824, + "step": 13844 + }, + { + "epoch": 1.8349900596421471, + "grad_norm": 4.096722602844238, + "learning_rate": 1.6421412887431827e-06, + "loss": 0.0875, + "num_input_tokens_seen": 27107904, + "step": 13845 + }, + { + "epoch": 1.8351225977468522, + "grad_norm": 1.0746254920959473, + "learning_rate": 1.6418152368414653e-06, + "loss": 0.0095, + "num_input_tokens_seen": 27109304, + "step": 13846 + }, + { + "epoch": 1.8352551358515572, + "grad_norm": 3.4912657737731934, + "learning_rate": 1.6414892014861435e-06, + "loss": 0.0568, + "num_input_tokens_seen": 27111008, + "step": 13847 + }, + { + "epoch": 1.8353876739562625, + "grad_norm": 0.034731313586235046, + "learning_rate": 1.6411631826835023e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27112296, + "step": 13848 + }, + { + "epoch": 1.8355202120609675, + "grad_norm": 0.11095807701349258, + "learning_rate": 1.6408371804398265e-06, + "loss": 0.0008, + "num_input_tokens_seen": 27114776, + "step": 13849 + }, + { + "epoch": 1.8356527501656728, + "grad_norm": 0.028012752532958984, + "learning_rate": 1.6405111947614036e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27116176, + "step": 13850 + }, + { + "epoch": 1.8357852882703778, + "grad_norm": 0.01476307399570942, + "learning_rate": 1.6401852256545164e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27117632, + "step": 13851 + }, + { + "epoch": 1.8359178263750828, + "grad_norm": 5.5086870193481445, + "learning_rate": 1.639859273125452e-06, + "loss": 0.151, + "num_input_tokens_seen": 27121088, + "step": 13852 + }, + { + "epoch": 1.8360503644797879, + "grad_norm": 0.03542112931609154, + "learning_rate": 1.6395333371804945e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27122480, + "step": 13853 + }, + { + "epoch": 1.836182902584493, + "grad_norm": 8.717010498046875, + "learning_rate": 1.6392074178259275e-06, + "loss": 0.2459, + "num_input_tokens_seen": 27124240, + "step": 13854 + }, + { + "epoch": 1.8363154406891982, + "grad_norm": 2.0424365997314453, + "learning_rate": 1.6388815150680354e-06, + "loss": 0.0335, + "num_input_tokens_seen": 27125632, + "step": 13855 + }, + { + "epoch": 1.8364479787939032, + "grad_norm": 8.256789207458496, + "learning_rate": 1.6385556289131011e-06, + "loss": 0.0681, + "num_input_tokens_seen": 27127088, + "step": 13856 + }, + { + "epoch": 1.8365805168986085, + "grad_norm": 0.03946193307638168, + "learning_rate": 1.6382297593674095e-06, + "loss": 0.0003, + "num_input_tokens_seen": 27129312, + "step": 13857 + }, + { + "epoch": 1.8367130550033135, + "grad_norm": 0.06909149140119553, + "learning_rate": 1.637903906437242e-06, + "loss": 0.0005, + "num_input_tokens_seen": 27131520, + "step": 13858 + }, + { + "epoch": 1.8368455931080185, + "grad_norm": 5.448919773101807, + "learning_rate": 1.637578070128883e-06, + "loss": 0.0683, + "num_input_tokens_seen": 27132952, + "step": 13859 + }, + { + "epoch": 1.8369781312127236, + "grad_norm": 5.2275896072387695, + "learning_rate": 1.6372522504486128e-06, + "loss": 0.1855, + "num_input_tokens_seen": 27134560, + "step": 13860 + }, + { + "epoch": 1.8371106693174286, + "grad_norm": 0.21436308324337006, + "learning_rate": 1.6369264474027147e-06, + "loss": 0.002, + "num_input_tokens_seen": 27136256, + "step": 13861 + }, + { + "epoch": 1.8372432074221339, + "grad_norm": 8.917954444885254, + "learning_rate": 1.6366006609974694e-06, + "loss": 0.1654, + "num_input_tokens_seen": 27138432, + "step": 13862 + }, + { + "epoch": 1.8373757455268391, + "grad_norm": 0.016078729182481766, + "learning_rate": 1.63627489123916e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27139904, + "step": 13863 + }, + { + "epoch": 1.8375082836315442, + "grad_norm": 0.3419841527938843, + "learning_rate": 1.6359491381340659e-06, + "loss": 0.002, + "num_input_tokens_seen": 27141424, + "step": 13864 + }, + { + "epoch": 1.8376408217362492, + "grad_norm": 18.228151321411133, + "learning_rate": 1.635623401688469e-06, + "loss": 0.4718, + "num_input_tokens_seen": 27143824, + "step": 13865 + }, + { + "epoch": 1.8377733598409542, + "grad_norm": 0.36260828375816345, + "learning_rate": 1.6352976819086487e-06, + "loss": 0.0024, + "num_input_tokens_seen": 27146424, + "step": 13866 + }, + { + "epoch": 1.8379058979456593, + "grad_norm": 7.668232440948486, + "learning_rate": 1.634971978800885e-06, + "loss": 0.1208, + "num_input_tokens_seen": 27148760, + "step": 13867 + }, + { + "epoch": 1.8380384360503643, + "grad_norm": 8.884526252746582, + "learning_rate": 1.6346462923714595e-06, + "loss": 0.1417, + "num_input_tokens_seen": 27150520, + "step": 13868 + }, + { + "epoch": 1.8381709741550696, + "grad_norm": 0.27711406350135803, + "learning_rate": 1.63432062262665e-06, + "loss": 0.0022, + "num_input_tokens_seen": 27151784, + "step": 13869 + }, + { + "epoch": 1.8383035122597748, + "grad_norm": 1.8439300060272217, + "learning_rate": 1.6339949695727366e-06, + "loss": 0.0177, + "num_input_tokens_seen": 27154184, + "step": 13870 + }, + { + "epoch": 1.8384360503644799, + "grad_norm": 10.513206481933594, + "learning_rate": 1.6336693332159976e-06, + "loss": 0.262, + "num_input_tokens_seen": 27156032, + "step": 13871 + }, + { + "epoch": 1.838568588469185, + "grad_norm": 6.80433464050293, + "learning_rate": 1.6333437135627106e-06, + "loss": 0.1038, + "num_input_tokens_seen": 27157584, + "step": 13872 + }, + { + "epoch": 1.83870112657389, + "grad_norm": 3.8492870330810547, + "learning_rate": 1.633018110619155e-06, + "loss": 0.0385, + "num_input_tokens_seen": 27159904, + "step": 13873 + }, + { + "epoch": 1.838833664678595, + "grad_norm": 1.2773135900497437, + "learning_rate": 1.6326925243916086e-06, + "loss": 0.0114, + "num_input_tokens_seen": 27162224, + "step": 13874 + }, + { + "epoch": 1.8389662027833003, + "grad_norm": 4.104182720184326, + "learning_rate": 1.6323669548863486e-06, + "loss": 0.0776, + "num_input_tokens_seen": 27163848, + "step": 13875 + }, + { + "epoch": 1.8390987408880053, + "grad_norm": 4.230804920196533, + "learning_rate": 1.6320414021096525e-06, + "loss": 0.0323, + "num_input_tokens_seen": 27165672, + "step": 13876 + }, + { + "epoch": 1.8392312789927105, + "grad_norm": 7.107476711273193, + "learning_rate": 1.6317158660677962e-06, + "loss": 0.104, + "num_input_tokens_seen": 27167248, + "step": 13877 + }, + { + "epoch": 1.8393638170974156, + "grad_norm": 0.048487525433301926, + "learning_rate": 1.6313903467670572e-06, + "loss": 0.0003, + "num_input_tokens_seen": 27168760, + "step": 13878 + }, + { + "epoch": 1.8394963552021206, + "grad_norm": 6.286853313446045, + "learning_rate": 1.6310648442137122e-06, + "loss": 0.0352, + "num_input_tokens_seen": 27170928, + "step": 13879 + }, + { + "epoch": 1.8396288933068257, + "grad_norm": 0.051462478935718536, + "learning_rate": 1.6307393584140362e-06, + "loss": 0.0003, + "num_input_tokens_seen": 27172368, + "step": 13880 + }, + { + "epoch": 1.8397614314115307, + "grad_norm": 1.7079052925109863, + "learning_rate": 1.6304138893743049e-06, + "loss": 0.0109, + "num_input_tokens_seen": 27174272, + "step": 13881 + }, + { + "epoch": 1.839893969516236, + "grad_norm": 3.1776630878448486, + "learning_rate": 1.630088437100793e-06, + "loss": 0.0732, + "num_input_tokens_seen": 27175648, + "step": 13882 + }, + { + "epoch": 1.840026507620941, + "grad_norm": 1.640306830406189, + "learning_rate": 1.6297630015997772e-06, + "loss": 0.0073, + "num_input_tokens_seen": 27177552, + "step": 13883 + }, + { + "epoch": 1.8401590457256463, + "grad_norm": 5.89121150970459, + "learning_rate": 1.6294375828775306e-06, + "loss": 0.1325, + "num_input_tokens_seen": 27179584, + "step": 13884 + }, + { + "epoch": 1.8402915838303513, + "grad_norm": 0.9397594928741455, + "learning_rate": 1.6291121809403287e-06, + "loss": 0.008, + "num_input_tokens_seen": 27181080, + "step": 13885 + }, + { + "epoch": 1.8404241219350563, + "grad_norm": 12.027832984924316, + "learning_rate": 1.6287867957944443e-06, + "loss": 0.1802, + "num_input_tokens_seen": 27183104, + "step": 13886 + }, + { + "epoch": 1.8405566600397614, + "grad_norm": 0.012186788953840733, + "learning_rate": 1.6284614274461514e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27184432, + "step": 13887 + }, + { + "epoch": 1.8406891981444664, + "grad_norm": 5.671597003936768, + "learning_rate": 1.6281360759017225e-06, + "loss": 0.0672, + "num_input_tokens_seen": 27186416, + "step": 13888 + }, + { + "epoch": 1.8408217362491717, + "grad_norm": 4.935153961181641, + "learning_rate": 1.6278107411674326e-06, + "loss": 0.0794, + "num_input_tokens_seen": 27188088, + "step": 13889 + }, + { + "epoch": 1.8409542743538767, + "grad_norm": 0.025124434381723404, + "learning_rate": 1.6274854232495533e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27189216, + "step": 13890 + }, + { + "epoch": 1.841086812458582, + "grad_norm": 0.014137721620500088, + "learning_rate": 1.627160122154357e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27190336, + "step": 13891 + }, + { + "epoch": 1.841219350563287, + "grad_norm": 8.36864185333252, + "learning_rate": 1.6268348378881154e-06, + "loss": 0.1229, + "num_input_tokens_seen": 27192560, + "step": 13892 + }, + { + "epoch": 1.841351888667992, + "grad_norm": 2.253206729888916, + "learning_rate": 1.6265095704570998e-06, + "loss": 0.0254, + "num_input_tokens_seen": 27194656, + "step": 13893 + }, + { + "epoch": 1.841484426772697, + "grad_norm": 0.022708438336849213, + "learning_rate": 1.6261843198675834e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27196712, + "step": 13894 + }, + { + "epoch": 1.8416169648774021, + "grad_norm": 15.598467826843262, + "learning_rate": 1.625859086125836e-06, + "loss": 0.4337, + "num_input_tokens_seen": 27198640, + "step": 13895 + }, + { + "epoch": 1.8417495029821074, + "grad_norm": 3.3812882900238037, + "learning_rate": 1.6255338692381281e-06, + "loss": 0.0298, + "num_input_tokens_seen": 27201216, + "step": 13896 + }, + { + "epoch": 1.8418820410868124, + "grad_norm": 0.04800642654299736, + "learning_rate": 1.6252086692107311e-06, + "loss": 0.0003, + "num_input_tokens_seen": 27202672, + "step": 13897 + }, + { + "epoch": 1.8420145791915177, + "grad_norm": 6.243416786193848, + "learning_rate": 1.624883486049913e-06, + "loss": 0.0231, + "num_input_tokens_seen": 27205240, + "step": 13898 + }, + { + "epoch": 1.8421471172962227, + "grad_norm": 5.604911804199219, + "learning_rate": 1.6245583197619457e-06, + "loss": 0.1072, + "num_input_tokens_seen": 27207776, + "step": 13899 + }, + { + "epoch": 1.8422796554009278, + "grad_norm": 11.394950866699219, + "learning_rate": 1.6242331703530987e-06, + "loss": 0.266, + "num_input_tokens_seen": 27210536, + "step": 13900 + }, + { + "epoch": 1.8424121935056328, + "grad_norm": 0.3222777545452118, + "learning_rate": 1.62390803782964e-06, + "loss": 0.0018, + "num_input_tokens_seen": 27212432, + "step": 13901 + }, + { + "epoch": 1.8425447316103378, + "grad_norm": 0.02999718300998211, + "learning_rate": 1.6235829221978384e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27215920, + "step": 13902 + }, + { + "epoch": 1.842677269715043, + "grad_norm": 0.12040308117866516, + "learning_rate": 1.623257823463962e-06, + "loss": 0.0009, + "num_input_tokens_seen": 27217680, + "step": 13903 + }, + { + "epoch": 1.8428098078197481, + "grad_norm": 12.774290084838867, + "learning_rate": 1.6229327416342807e-06, + "loss": 0.2924, + "num_input_tokens_seen": 27220720, + "step": 13904 + }, + { + "epoch": 1.8429423459244534, + "grad_norm": 0.04908725991845131, + "learning_rate": 1.6226076767150606e-06, + "loss": 0.0003, + "num_input_tokens_seen": 27222592, + "step": 13905 + }, + { + "epoch": 1.8430748840291584, + "grad_norm": 0.043988581746816635, + "learning_rate": 1.6222826287125703e-06, + "loss": 0.0003, + "num_input_tokens_seen": 27224520, + "step": 13906 + }, + { + "epoch": 1.8432074221338635, + "grad_norm": 3.9521000385284424, + "learning_rate": 1.6219575976330761e-06, + "loss": 0.125, + "num_input_tokens_seen": 27226824, + "step": 13907 + }, + { + "epoch": 1.8433399602385685, + "grad_norm": 0.17472125589847565, + "learning_rate": 1.6216325834828448e-06, + "loss": 0.0006, + "num_input_tokens_seen": 27228456, + "step": 13908 + }, + { + "epoch": 1.8434724983432735, + "grad_norm": 8.633052825927734, + "learning_rate": 1.6213075862681438e-06, + "loss": 0.1033, + "num_input_tokens_seen": 27230704, + "step": 13909 + }, + { + "epoch": 1.8436050364479788, + "grad_norm": 0.26491162180900574, + "learning_rate": 1.6209826059952389e-06, + "loss": 0.0014, + "num_input_tokens_seen": 27232040, + "step": 13910 + }, + { + "epoch": 1.843737574552684, + "grad_norm": 5.425374984741211, + "learning_rate": 1.6206576426703955e-06, + "loss": 0.1877, + "num_input_tokens_seen": 27234424, + "step": 13911 + }, + { + "epoch": 1.843870112657389, + "grad_norm": 0.01962396875023842, + "learning_rate": 1.6203326962998795e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27236416, + "step": 13912 + }, + { + "epoch": 1.8440026507620941, + "grad_norm": 3.920288324356079, + "learning_rate": 1.620007766889955e-06, + "loss": 0.0815, + "num_input_tokens_seen": 27238360, + "step": 13913 + }, + { + "epoch": 1.8441351888667992, + "grad_norm": 0.009308967739343643, + "learning_rate": 1.6196828544468884e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27239960, + "step": 13914 + }, + { + "epoch": 1.8442677269715042, + "grad_norm": 4.488587856292725, + "learning_rate": 1.6193579589769442e-06, + "loss": 0.0794, + "num_input_tokens_seen": 27241488, + "step": 13915 + }, + { + "epoch": 1.8444002650762092, + "grad_norm": 0.12009155750274658, + "learning_rate": 1.6190330804863862e-06, + "loss": 0.0009, + "num_input_tokens_seen": 27242688, + "step": 13916 + }, + { + "epoch": 1.8445328031809145, + "grad_norm": 6.791052341461182, + "learning_rate": 1.6187082189814776e-06, + "loss": 0.0646, + "num_input_tokens_seen": 27244456, + "step": 13917 + }, + { + "epoch": 1.8446653412856198, + "grad_norm": 0.3420785963535309, + "learning_rate": 1.618383374468483e-06, + "loss": 0.0017, + "num_input_tokens_seen": 27247152, + "step": 13918 + }, + { + "epoch": 1.8447978793903248, + "grad_norm": 7.968814849853516, + "learning_rate": 1.6180585469536642e-06, + "loss": 0.1689, + "num_input_tokens_seen": 27248880, + "step": 13919 + }, + { + "epoch": 1.8449304174950298, + "grad_norm": 12.798491477966309, + "learning_rate": 1.6177337364432856e-06, + "loss": 0.1882, + "num_input_tokens_seen": 27251168, + "step": 13920 + }, + { + "epoch": 1.8450629555997349, + "grad_norm": 0.011770655401051044, + "learning_rate": 1.617408942943609e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27252512, + "step": 13921 + }, + { + "epoch": 1.84519549370444, + "grad_norm": 8.078296661376953, + "learning_rate": 1.6170841664608972e-06, + "loss": 0.0919, + "num_input_tokens_seen": 27254704, + "step": 13922 + }, + { + "epoch": 1.8453280318091452, + "grad_norm": 10.796738624572754, + "learning_rate": 1.616759407001412e-06, + "loss": 0.1064, + "num_input_tokens_seen": 27256600, + "step": 13923 + }, + { + "epoch": 1.8454605699138502, + "grad_norm": 5.751252174377441, + "learning_rate": 1.6164346645714136e-06, + "loss": 0.0367, + "num_input_tokens_seen": 27258288, + "step": 13924 + }, + { + "epoch": 1.8455931080185555, + "grad_norm": 7.025866985321045, + "learning_rate": 1.6161099391771657e-06, + "loss": 0.1076, + "num_input_tokens_seen": 27260544, + "step": 13925 + }, + { + "epoch": 1.8457256461232605, + "grad_norm": 0.058026351034641266, + "learning_rate": 1.615785230824927e-06, + "loss": 0.0004, + "num_input_tokens_seen": 27262656, + "step": 13926 + }, + { + "epoch": 1.8458581842279655, + "grad_norm": 9.957240104675293, + "learning_rate": 1.61546053952096e-06, + "loss": 0.1794, + "num_input_tokens_seen": 27265104, + "step": 13927 + }, + { + "epoch": 1.8459907223326706, + "grad_norm": 0.019311200827360153, + "learning_rate": 1.6151358652715232e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27266552, + "step": 13928 + }, + { + "epoch": 1.8461232604373756, + "grad_norm": 5.569981098175049, + "learning_rate": 1.6148112080828769e-06, + "loss": 0.1175, + "num_input_tokens_seen": 27267888, + "step": 13929 + }, + { + "epoch": 1.8462557985420809, + "grad_norm": 13.44492244720459, + "learning_rate": 1.614486567961282e-06, + "loss": 0.3523, + "num_input_tokens_seen": 27270104, + "step": 13930 + }, + { + "epoch": 1.846388336646786, + "grad_norm": 0.009062015451490879, + "learning_rate": 1.6141619449129967e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27272192, + "step": 13931 + }, + { + "epoch": 1.8465208747514912, + "grad_norm": 6.686935901641846, + "learning_rate": 1.6138373389442808e-06, + "loss": 0.1667, + "num_input_tokens_seen": 27274240, + "step": 13932 + }, + { + "epoch": 1.8466534128561962, + "grad_norm": 10.427565574645996, + "learning_rate": 1.6135127500613923e-06, + "loss": 0.2066, + "num_input_tokens_seen": 27275936, + "step": 13933 + }, + { + "epoch": 1.8467859509609013, + "grad_norm": 0.014309673570096493, + "learning_rate": 1.6131881782705882e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27277224, + "step": 13934 + }, + { + "epoch": 1.8469184890656063, + "grad_norm": 1.2452702522277832, + "learning_rate": 1.6128636235781287e-06, + "loss": 0.0141, + "num_input_tokens_seen": 27279720, + "step": 13935 + }, + { + "epoch": 1.8470510271703113, + "grad_norm": 7.199913024902344, + "learning_rate": 1.6125390859902707e-06, + "loss": 0.045, + "num_input_tokens_seen": 27281352, + "step": 13936 + }, + { + "epoch": 1.8471835652750166, + "grad_norm": 4.5833024978637695, + "learning_rate": 1.6122145655132708e-06, + "loss": 0.0523, + "num_input_tokens_seen": 27283384, + "step": 13937 + }, + { + "epoch": 1.8473161033797216, + "grad_norm": 0.027738282456994057, + "learning_rate": 1.6118900621533876e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27284768, + "step": 13938 + }, + { + "epoch": 1.847448641484427, + "grad_norm": 8.264155387878418, + "learning_rate": 1.6115655759168752e-06, + "loss": 0.1847, + "num_input_tokens_seen": 27286624, + "step": 13939 + }, + { + "epoch": 1.847581179589132, + "grad_norm": 0.049756504595279694, + "learning_rate": 1.6112411068099926e-06, + "loss": 0.0003, + "num_input_tokens_seen": 27288376, + "step": 13940 + }, + { + "epoch": 1.847713717693837, + "grad_norm": 4.072049140930176, + "learning_rate": 1.6109166548389944e-06, + "loss": 0.0824, + "num_input_tokens_seen": 27290784, + "step": 13941 + }, + { + "epoch": 1.847846255798542, + "grad_norm": 0.01436234824359417, + "learning_rate": 1.6105922200101363e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27292512, + "step": 13942 + }, + { + "epoch": 1.847978793903247, + "grad_norm": 9.681053161621094, + "learning_rate": 1.6102678023296736e-06, + "loss": 0.0317, + "num_input_tokens_seen": 27294440, + "step": 13943 + }, + { + "epoch": 1.8481113320079523, + "grad_norm": 6.828869819641113, + "learning_rate": 1.6099434018038618e-06, + "loss": 0.1161, + "num_input_tokens_seen": 27296840, + "step": 13944 + }, + { + "epoch": 1.8482438701126573, + "grad_norm": 0.17491109669208527, + "learning_rate": 1.6096190184389543e-06, + "loss": 0.0012, + "num_input_tokens_seen": 27298392, + "step": 13945 + }, + { + "epoch": 1.8483764082173626, + "grad_norm": 0.04074319452047348, + "learning_rate": 1.6092946522412067e-06, + "loss": 0.0003, + "num_input_tokens_seen": 27299800, + "step": 13946 + }, + { + "epoch": 1.8485089463220676, + "grad_norm": 1.0000303983688354, + "learning_rate": 1.6089703032168736e-06, + "loss": 0.008, + "num_input_tokens_seen": 27301680, + "step": 13947 + }, + { + "epoch": 1.8486414844267727, + "grad_norm": 5.7232489585876465, + "learning_rate": 1.6086459713722074e-06, + "loss": 0.052, + "num_input_tokens_seen": 27303464, + "step": 13948 + }, + { + "epoch": 1.8487740225314777, + "grad_norm": 0.5723093152046204, + "learning_rate": 1.6083216567134613e-06, + "loss": 0.0028, + "num_input_tokens_seen": 27305280, + "step": 13949 + }, + { + "epoch": 1.8489065606361827, + "grad_norm": 7.835122108459473, + "learning_rate": 1.6079973592468885e-06, + "loss": 0.3004, + "num_input_tokens_seen": 27307936, + "step": 13950 + }, + { + "epoch": 1.849039098740888, + "grad_norm": 6.931849479675293, + "learning_rate": 1.6076730789787432e-06, + "loss": 0.053, + "num_input_tokens_seen": 27309952, + "step": 13951 + }, + { + "epoch": 1.8491716368455933, + "grad_norm": 6.869166374206543, + "learning_rate": 1.6073488159152756e-06, + "loss": 0.1109, + "num_input_tokens_seen": 27311904, + "step": 13952 + }, + { + "epoch": 1.8493041749502983, + "grad_norm": 1.4577000141143799, + "learning_rate": 1.607024570062739e-06, + "loss": 0.0096, + "num_input_tokens_seen": 27313232, + "step": 13953 + }, + { + "epoch": 1.8494367130550033, + "grad_norm": 0.00696646748110652, + "learning_rate": 1.6067003414273852e-06, + "loss": 0.0, + "num_input_tokens_seen": 27314680, + "step": 13954 + }, + { + "epoch": 1.8495692511597084, + "grad_norm": 6.642423152923584, + "learning_rate": 1.6063761300154637e-06, + "loss": 0.0996, + "num_input_tokens_seen": 27316664, + "step": 13955 + }, + { + "epoch": 1.8497017892644134, + "grad_norm": 0.1272624433040619, + "learning_rate": 1.6060519358332274e-06, + "loss": 0.0008, + "num_input_tokens_seen": 27318480, + "step": 13956 + }, + { + "epoch": 1.8498343273691185, + "grad_norm": 0.01872388646006584, + "learning_rate": 1.6057277588869272e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27319840, + "step": 13957 + }, + { + "epoch": 1.8499668654738237, + "grad_norm": 13.574581146240234, + "learning_rate": 1.605403599182812e-06, + "loss": 0.2887, + "num_input_tokens_seen": 27321576, + "step": 13958 + }, + { + "epoch": 1.850099403578529, + "grad_norm": 10.073512077331543, + "learning_rate": 1.6050794567271332e-06, + "loss": 0.1268, + "num_input_tokens_seen": 27323912, + "step": 13959 + }, + { + "epoch": 1.850231941683234, + "grad_norm": 7.7751784324646, + "learning_rate": 1.6047553315261389e-06, + "loss": 0.1739, + "num_input_tokens_seen": 27326104, + "step": 13960 + }, + { + "epoch": 1.850364479787939, + "grad_norm": 0.280917763710022, + "learning_rate": 1.6044312235860798e-06, + "loss": 0.0029, + "num_input_tokens_seen": 27328960, + "step": 13961 + }, + { + "epoch": 1.850497017892644, + "grad_norm": 0.022387413308024406, + "learning_rate": 1.6041071329132051e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27330936, + "step": 13962 + }, + { + "epoch": 1.8506295559973491, + "grad_norm": 7.828739643096924, + "learning_rate": 1.603783059513763e-06, + "loss": 0.186, + "num_input_tokens_seen": 27332472, + "step": 13963 + }, + { + "epoch": 1.8507620941020544, + "grad_norm": 11.340049743652344, + "learning_rate": 1.6034590033940012e-06, + "loss": 0.3185, + "num_input_tokens_seen": 27334648, + "step": 13964 + }, + { + "epoch": 1.8508946322067594, + "grad_norm": 0.044904373586177826, + "learning_rate": 1.6031349645601676e-06, + "loss": 0.0003, + "num_input_tokens_seen": 27338744, + "step": 13965 + }, + { + "epoch": 1.8510271703114647, + "grad_norm": 3.734807014465332, + "learning_rate": 1.6028109430185118e-06, + "loss": 0.1173, + "num_input_tokens_seen": 27340216, + "step": 13966 + }, + { + "epoch": 1.8511597084161697, + "grad_norm": 8.725793838500977, + "learning_rate": 1.6024869387752795e-06, + "loss": 0.2331, + "num_input_tokens_seen": 27342480, + "step": 13967 + }, + { + "epoch": 1.8512922465208748, + "grad_norm": 0.06117292121052742, + "learning_rate": 1.6021629518367187e-06, + "loss": 0.0004, + "num_input_tokens_seen": 27343904, + "step": 13968 + }, + { + "epoch": 1.8514247846255798, + "grad_norm": 0.13022427260875702, + "learning_rate": 1.6018389822090749e-06, + "loss": 0.0009, + "num_input_tokens_seen": 27345912, + "step": 13969 + }, + { + "epoch": 1.8515573227302848, + "grad_norm": 1.2696954011917114, + "learning_rate": 1.6015150298985955e-06, + "loss": 0.0085, + "num_input_tokens_seen": 27347576, + "step": 13970 + }, + { + "epoch": 1.85168986083499, + "grad_norm": 3.114445209503174, + "learning_rate": 1.6011910949115255e-06, + "loss": 0.0297, + "num_input_tokens_seen": 27349512, + "step": 13971 + }, + { + "epoch": 1.8518223989396951, + "grad_norm": 1.1157437562942505, + "learning_rate": 1.6008671772541123e-06, + "loss": 0.008, + "num_input_tokens_seen": 27351552, + "step": 13972 + }, + { + "epoch": 1.8519549370444004, + "grad_norm": 7.569608211517334, + "learning_rate": 1.6005432769325996e-06, + "loss": 0.2241, + "num_input_tokens_seen": 27354200, + "step": 13973 + }, + { + "epoch": 1.8520874751491054, + "grad_norm": 4.073066234588623, + "learning_rate": 1.6002193939532334e-06, + "loss": 0.0974, + "num_input_tokens_seen": 27355872, + "step": 13974 + }, + { + "epoch": 1.8522200132538105, + "grad_norm": 15.10389518737793, + "learning_rate": 1.5998955283222575e-06, + "loss": 0.3372, + "num_input_tokens_seen": 27358112, + "step": 13975 + }, + { + "epoch": 1.8523525513585155, + "grad_norm": 0.165694460272789, + "learning_rate": 1.5995716800459165e-06, + "loss": 0.0017, + "num_input_tokens_seen": 27360560, + "step": 13976 + }, + { + "epoch": 1.8524850894632205, + "grad_norm": 0.12496459484100342, + "learning_rate": 1.5992478491304553e-06, + "loss": 0.0007, + "num_input_tokens_seen": 27362000, + "step": 13977 + }, + { + "epoch": 1.8526176275679258, + "grad_norm": 14.195602416992188, + "learning_rate": 1.5989240355821174e-06, + "loss": 0.2502, + "num_input_tokens_seen": 27363720, + "step": 13978 + }, + { + "epoch": 1.8527501656726308, + "grad_norm": 7.37322473526001, + "learning_rate": 1.5986002394071448e-06, + "loss": 0.1504, + "num_input_tokens_seen": 27366216, + "step": 13979 + }, + { + "epoch": 1.852882703777336, + "grad_norm": 0.9753274321556091, + "learning_rate": 1.5982764606117824e-06, + "loss": 0.0058, + "num_input_tokens_seen": 27369520, + "step": 13980 + }, + { + "epoch": 1.8530152418820411, + "grad_norm": 0.024563701823353767, + "learning_rate": 1.5979526992022704e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27370848, + "step": 13981 + }, + { + "epoch": 1.8531477799867462, + "grad_norm": 6.1171159744262695, + "learning_rate": 1.5976289551848534e-06, + "loss": 0.1848, + "num_input_tokens_seen": 27373552, + "step": 13982 + }, + { + "epoch": 1.8532803180914512, + "grad_norm": 0.3338046967983246, + "learning_rate": 1.5973052285657732e-06, + "loss": 0.0022, + "num_input_tokens_seen": 27375648, + "step": 13983 + }, + { + "epoch": 1.8534128561961563, + "grad_norm": 0.04239899292588234, + "learning_rate": 1.5969815193512706e-06, + "loss": 0.0003, + "num_input_tokens_seen": 27378784, + "step": 13984 + }, + { + "epoch": 1.8535453943008615, + "grad_norm": 6.776947021484375, + "learning_rate": 1.5966578275475874e-06, + "loss": 0.1346, + "num_input_tokens_seen": 27380392, + "step": 13985 + }, + { + "epoch": 1.8536779324055666, + "grad_norm": 4.0636982917785645, + "learning_rate": 1.5963341531609636e-06, + "loss": 0.0526, + "num_input_tokens_seen": 27382176, + "step": 13986 + }, + { + "epoch": 1.8538104705102718, + "grad_norm": 9.393250465393066, + "learning_rate": 1.5960104961976416e-06, + "loss": 0.131, + "num_input_tokens_seen": 27384184, + "step": 13987 + }, + { + "epoch": 1.8539430086149769, + "grad_norm": 9.086524963378906, + "learning_rate": 1.5956868566638606e-06, + "loss": 0.1133, + "num_input_tokens_seen": 27387496, + "step": 13988 + }, + { + "epoch": 1.8540755467196819, + "grad_norm": 0.10945142805576324, + "learning_rate": 1.5953632345658613e-06, + "loss": 0.0006, + "num_input_tokens_seen": 27389392, + "step": 13989 + }, + { + "epoch": 1.854208084824387, + "grad_norm": 0.21892905235290527, + "learning_rate": 1.5950396299098824e-06, + "loss": 0.0022, + "num_input_tokens_seen": 27391104, + "step": 13990 + }, + { + "epoch": 1.854340622929092, + "grad_norm": 4.318899631500244, + "learning_rate": 1.594716042702163e-06, + "loss": 0.1062, + "num_input_tokens_seen": 27393056, + "step": 13991 + }, + { + "epoch": 1.8544731610337972, + "grad_norm": 8.38369369506836, + "learning_rate": 1.594392472948944e-06, + "loss": 0.1179, + "num_input_tokens_seen": 27394632, + "step": 13992 + }, + { + "epoch": 1.8546056991385025, + "grad_norm": 2.521267890930176, + "learning_rate": 1.594068920656463e-06, + "loss": 0.0247, + "num_input_tokens_seen": 27396520, + "step": 13993 + }, + { + "epoch": 1.8547382372432075, + "grad_norm": 9.66817569732666, + "learning_rate": 1.5937453858309576e-06, + "loss": 0.0745, + "num_input_tokens_seen": 27398440, + "step": 13994 + }, + { + "epoch": 1.8548707753479126, + "grad_norm": 0.025035610422492027, + "learning_rate": 1.5934218684786668e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27400864, + "step": 13995 + }, + { + "epoch": 1.8550033134526176, + "grad_norm": 7.201386451721191, + "learning_rate": 1.5930983686058268e-06, + "loss": 0.0854, + "num_input_tokens_seen": 27402648, + "step": 13996 + }, + { + "epoch": 1.8551358515573226, + "grad_norm": 3.18255615234375, + "learning_rate": 1.5927748862186766e-06, + "loss": 0.0411, + "num_input_tokens_seen": 27404400, + "step": 13997 + }, + { + "epoch": 1.8552683896620277, + "grad_norm": 3.538731336593628, + "learning_rate": 1.5924514213234527e-06, + "loss": 0.0599, + "num_input_tokens_seen": 27406736, + "step": 13998 + }, + { + "epoch": 1.855400927766733, + "grad_norm": 4.173019886016846, + "learning_rate": 1.5921279739263912e-06, + "loss": 0.1204, + "num_input_tokens_seen": 27408944, + "step": 13999 + }, + { + "epoch": 1.8555334658714382, + "grad_norm": 7.232189655303955, + "learning_rate": 1.5918045440337288e-06, + "loss": 0.0291, + "num_input_tokens_seen": 27410552, + "step": 14000 + }, + { + "epoch": 1.8556660039761432, + "grad_norm": 2.672259569168091, + "learning_rate": 1.5914811316517014e-06, + "loss": 0.0254, + "num_input_tokens_seen": 27412000, + "step": 14001 + }, + { + "epoch": 1.8557985420808483, + "grad_norm": 9.737458229064941, + "learning_rate": 1.591157736786544e-06, + "loss": 0.1322, + "num_input_tokens_seen": 27413680, + "step": 14002 + }, + { + "epoch": 1.8559310801855533, + "grad_norm": 10.601056098937988, + "learning_rate": 1.5908343594444925e-06, + "loss": 0.2236, + "num_input_tokens_seen": 27415424, + "step": 14003 + }, + { + "epoch": 1.8560636182902583, + "grad_norm": 4.919938087463379, + "learning_rate": 1.5905109996317824e-06, + "loss": 0.1042, + "num_input_tokens_seen": 27417976, + "step": 14004 + }, + { + "epoch": 1.8561961563949636, + "grad_norm": 7.361004829406738, + "learning_rate": 1.590187657354647e-06, + "loss": 0.1729, + "num_input_tokens_seen": 27420120, + "step": 14005 + }, + { + "epoch": 1.8563286944996686, + "grad_norm": 5.509590148925781, + "learning_rate": 1.589864332619322e-06, + "loss": 0.133, + "num_input_tokens_seen": 27422488, + "step": 14006 + }, + { + "epoch": 1.856461232604374, + "grad_norm": 9.648595809936523, + "learning_rate": 1.5895410254320392e-06, + "loss": 0.1818, + "num_input_tokens_seen": 27425128, + "step": 14007 + }, + { + "epoch": 1.856593770709079, + "grad_norm": 7.623651027679443, + "learning_rate": 1.5892177357990346e-06, + "loss": 0.1737, + "num_input_tokens_seen": 27427800, + "step": 14008 + }, + { + "epoch": 1.856726308813784, + "grad_norm": 11.454166412353516, + "learning_rate": 1.5888944637265403e-06, + "loss": 0.245, + "num_input_tokens_seen": 27429896, + "step": 14009 + }, + { + "epoch": 1.856858846918489, + "grad_norm": 0.018681271001696587, + "learning_rate": 1.588571209220789e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27431144, + "step": 14010 + }, + { + "epoch": 1.856991385023194, + "grad_norm": 5.7312774658203125, + "learning_rate": 1.5882479722880137e-06, + "loss": 0.1725, + "num_input_tokens_seen": 27433432, + "step": 14011 + }, + { + "epoch": 1.8571239231278993, + "grad_norm": 8.795912742614746, + "learning_rate": 1.5879247529344458e-06, + "loss": 0.2056, + "num_input_tokens_seen": 27435512, + "step": 14012 + }, + { + "epoch": 1.8572564612326043, + "grad_norm": 4.559013843536377, + "learning_rate": 1.5876015511663188e-06, + "loss": 0.0343, + "num_input_tokens_seen": 27437560, + "step": 14013 + }, + { + "epoch": 1.8573889993373096, + "grad_norm": 2.495793342590332, + "learning_rate": 1.5872783669898627e-06, + "loss": 0.0298, + "num_input_tokens_seen": 27440328, + "step": 14014 + }, + { + "epoch": 1.8575215374420146, + "grad_norm": 8.94723892211914, + "learning_rate": 1.58695520041131e-06, + "loss": 0.1245, + "num_input_tokens_seen": 27442192, + "step": 14015 + }, + { + "epoch": 1.8576540755467197, + "grad_norm": 6.600645065307617, + "learning_rate": 1.586632051436891e-06, + "loss": 0.1558, + "num_input_tokens_seen": 27444648, + "step": 14016 + }, + { + "epoch": 1.8577866136514247, + "grad_norm": 0.3052669167518616, + "learning_rate": 1.5863089200728348e-06, + "loss": 0.0021, + "num_input_tokens_seen": 27446096, + "step": 14017 + }, + { + "epoch": 1.8579191517561298, + "grad_norm": 0.01038262527436018, + "learning_rate": 1.585985806325374e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27447840, + "step": 14018 + }, + { + "epoch": 1.858051689860835, + "grad_norm": 0.07914943993091583, + "learning_rate": 1.5856627102007377e-06, + "loss": 0.0005, + "num_input_tokens_seen": 27449352, + "step": 14019 + }, + { + "epoch": 1.85818422796554, + "grad_norm": 4.2371110916137695, + "learning_rate": 1.5853396317051545e-06, + "loss": 0.0667, + "num_input_tokens_seen": 27451104, + "step": 14020 + }, + { + "epoch": 1.8583167660702453, + "grad_norm": 3.034331798553467, + "learning_rate": 1.5850165708448544e-06, + "loss": 0.058, + "num_input_tokens_seen": 27453960, + "step": 14021 + }, + { + "epoch": 1.8584493041749504, + "grad_norm": 7.8173909187316895, + "learning_rate": 1.5846935276260655e-06, + "loss": 0.1548, + "num_input_tokens_seen": 27456320, + "step": 14022 + }, + { + "epoch": 1.8585818422796554, + "grad_norm": 0.024954702705144882, + "learning_rate": 1.584370502055017e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27458512, + "step": 14023 + }, + { + "epoch": 1.8587143803843604, + "grad_norm": 0.014869432896375656, + "learning_rate": 1.5840474941379377e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27460192, + "step": 14024 + }, + { + "epoch": 1.8588469184890655, + "grad_norm": 1.6893668174743652, + "learning_rate": 1.5837245038810545e-06, + "loss": 0.0391, + "num_input_tokens_seen": 27461488, + "step": 14025 + }, + { + "epoch": 1.8589794565937707, + "grad_norm": 19.471981048583984, + "learning_rate": 1.5834015312905942e-06, + "loss": 0.1645, + "num_input_tokens_seen": 27463680, + "step": 14026 + }, + { + "epoch": 1.8591119946984758, + "grad_norm": 3.3017728328704834, + "learning_rate": 1.5830785763727846e-06, + "loss": 0.0314, + "num_input_tokens_seen": 27466160, + "step": 14027 + }, + { + "epoch": 1.859244532803181, + "grad_norm": 4.193112373352051, + "learning_rate": 1.5827556391338533e-06, + "loss": 0.0901, + "num_input_tokens_seen": 27467984, + "step": 14028 + }, + { + "epoch": 1.859377070907886, + "grad_norm": 7.224668502807617, + "learning_rate": 1.5824327195800258e-06, + "loss": 0.0945, + "num_input_tokens_seen": 27470184, + "step": 14029 + }, + { + "epoch": 1.859509609012591, + "grad_norm": 9.425884246826172, + "learning_rate": 1.5821098177175286e-06, + "loss": 0.0956, + "num_input_tokens_seen": 27472040, + "step": 14030 + }, + { + "epoch": 1.8596421471172961, + "grad_norm": 0.13361337780952454, + "learning_rate": 1.5817869335525875e-06, + "loss": 0.0009, + "num_input_tokens_seen": 27473744, + "step": 14031 + }, + { + "epoch": 1.8597746852220012, + "grad_norm": 0.06496568769216537, + "learning_rate": 1.5814640670914277e-06, + "loss": 0.0004, + "num_input_tokens_seen": 27475400, + "step": 14032 + }, + { + "epoch": 1.8599072233267064, + "grad_norm": 1.2887072563171387, + "learning_rate": 1.5811412183402733e-06, + "loss": 0.0142, + "num_input_tokens_seen": 27477552, + "step": 14033 + }, + { + "epoch": 1.8600397614314115, + "grad_norm": 13.932268142700195, + "learning_rate": 1.5808183873053512e-06, + "loss": 0.1555, + "num_input_tokens_seen": 27479376, + "step": 14034 + }, + { + "epoch": 1.8601722995361167, + "grad_norm": 0.6396550536155701, + "learning_rate": 1.5804955739928845e-06, + "loss": 0.0031, + "num_input_tokens_seen": 27481344, + "step": 14035 + }, + { + "epoch": 1.8603048376408218, + "grad_norm": 0.10826234519481659, + "learning_rate": 1.5801727784090975e-06, + "loss": 0.0007, + "num_input_tokens_seen": 27483784, + "step": 14036 + }, + { + "epoch": 1.8604373757455268, + "grad_norm": 15.175018310546875, + "learning_rate": 1.5798500005602138e-06, + "loss": 0.1715, + "num_input_tokens_seen": 27486864, + "step": 14037 + }, + { + "epoch": 1.8605699138502318, + "grad_norm": 7.901054859161377, + "learning_rate": 1.5795272404524562e-06, + "loss": 0.1474, + "num_input_tokens_seen": 27488392, + "step": 14038 + }, + { + "epoch": 1.8607024519549369, + "grad_norm": 7.47947883605957, + "learning_rate": 1.5792044980920495e-06, + "loss": 0.2055, + "num_input_tokens_seen": 27490576, + "step": 14039 + }, + { + "epoch": 1.8608349900596421, + "grad_norm": 6.945105075836182, + "learning_rate": 1.5788817734852153e-06, + "loss": 0.076, + "num_input_tokens_seen": 27492160, + "step": 14040 + }, + { + "epoch": 1.8609675281643474, + "grad_norm": 0.22695119678974152, + "learning_rate": 1.5785590666381751e-06, + "loss": 0.0016, + "num_input_tokens_seen": 27493832, + "step": 14041 + }, + { + "epoch": 1.8611000662690524, + "grad_norm": 10.272282600402832, + "learning_rate": 1.5782363775571528e-06, + "loss": 0.1275, + "num_input_tokens_seen": 27495520, + "step": 14042 + }, + { + "epoch": 1.8612326043737575, + "grad_norm": 18.20184326171875, + "learning_rate": 1.5779137062483673e-06, + "loss": 0.121, + "num_input_tokens_seen": 27497152, + "step": 14043 + }, + { + "epoch": 1.8613651424784625, + "grad_norm": 0.03172851726412773, + "learning_rate": 1.5775910527180426e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27498416, + "step": 14044 + }, + { + "epoch": 1.8614976805831676, + "grad_norm": 0.014296247623860836, + "learning_rate": 1.5772684169723994e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27500552, + "step": 14045 + }, + { + "epoch": 1.8616302186878728, + "grad_norm": 1.530669927597046, + "learning_rate": 1.5769457990176578e-06, + "loss": 0.0152, + "num_input_tokens_seen": 27502200, + "step": 14046 + }, + { + "epoch": 1.8617627567925779, + "grad_norm": 13.09257984161377, + "learning_rate": 1.5766231988600372e-06, + "loss": 0.3608, + "num_input_tokens_seen": 27504616, + "step": 14047 + }, + { + "epoch": 1.8618952948972831, + "grad_norm": 5.582982063293457, + "learning_rate": 1.5763006165057576e-06, + "loss": 0.0573, + "num_input_tokens_seen": 27507344, + "step": 14048 + }, + { + "epoch": 1.8620278330019882, + "grad_norm": 0.08706431835889816, + "learning_rate": 1.5759780519610408e-06, + "loss": 0.0004, + "num_input_tokens_seen": 27509656, + "step": 14049 + }, + { + "epoch": 1.8621603711066932, + "grad_norm": 7.549373149871826, + "learning_rate": 1.5756555052321038e-06, + "loss": 0.1238, + "num_input_tokens_seen": 27511616, + "step": 14050 + }, + { + "epoch": 1.8622929092113982, + "grad_norm": 0.08817126601934433, + "learning_rate": 1.575332976325167e-06, + "loss": 0.0005, + "num_input_tokens_seen": 27513216, + "step": 14051 + }, + { + "epoch": 1.8624254473161033, + "grad_norm": 4.608008861541748, + "learning_rate": 1.5750104652464475e-06, + "loss": 0.0692, + "num_input_tokens_seen": 27515032, + "step": 14052 + }, + { + "epoch": 1.8625579854208085, + "grad_norm": 0.2380451112985611, + "learning_rate": 1.5746879720021643e-06, + "loss": 0.002, + "num_input_tokens_seen": 27516744, + "step": 14053 + }, + { + "epoch": 1.8626905235255136, + "grad_norm": 0.027397483587265015, + "learning_rate": 1.574365496598536e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27518568, + "step": 14054 + }, + { + "epoch": 1.8628230616302188, + "grad_norm": 0.028521781787276268, + "learning_rate": 1.57404303904178e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27519960, + "step": 14055 + }, + { + "epoch": 1.8629555997349239, + "grad_norm": 10.670886039733887, + "learning_rate": 1.573720599338112e-06, + "loss": 0.1222, + "num_input_tokens_seen": 27521696, + "step": 14056 + }, + { + "epoch": 1.863088137839629, + "grad_norm": 13.77548599243164, + "learning_rate": 1.5733981774937504e-06, + "loss": 0.3294, + "num_input_tokens_seen": 27524016, + "step": 14057 + }, + { + "epoch": 1.863220675944334, + "grad_norm": 0.04214996099472046, + "learning_rate": 1.5730757735149108e-06, + "loss": 0.0003, + "num_input_tokens_seen": 27526088, + "step": 14058 + }, + { + "epoch": 1.863353214049039, + "grad_norm": 0.023070091381669044, + "learning_rate": 1.572753387407809e-06, + "loss": 0.0003, + "num_input_tokens_seen": 27527896, + "step": 14059 + }, + { + "epoch": 1.8634857521537442, + "grad_norm": 4.00438928604126, + "learning_rate": 1.572431019178663e-06, + "loss": 0.0983, + "num_input_tokens_seen": 27529944, + "step": 14060 + }, + { + "epoch": 1.8636182902584493, + "grad_norm": 9.056439399719238, + "learning_rate": 1.5721086688336865e-06, + "loss": 0.1958, + "num_input_tokens_seen": 27531616, + "step": 14061 + }, + { + "epoch": 1.8637508283631545, + "grad_norm": 0.13469092547893524, + "learning_rate": 1.5717863363790944e-06, + "loss": 0.0012, + "num_input_tokens_seen": 27533216, + "step": 14062 + }, + { + "epoch": 1.8638833664678596, + "grad_norm": 6.646396160125732, + "learning_rate": 1.571464021821103e-06, + "loss": 0.1111, + "num_input_tokens_seen": 27535040, + "step": 14063 + }, + { + "epoch": 1.8640159045725646, + "grad_norm": 12.421241760253906, + "learning_rate": 1.5711417251659244e-06, + "loss": 0.2117, + "num_input_tokens_seen": 27537416, + "step": 14064 + }, + { + "epoch": 1.8641484426772696, + "grad_norm": 10.393312454223633, + "learning_rate": 1.5708194464197746e-06, + "loss": 0.1951, + "num_input_tokens_seen": 27538888, + "step": 14065 + }, + { + "epoch": 1.8642809807819747, + "grad_norm": 0.6602757573127747, + "learning_rate": 1.5704971855888678e-06, + "loss": 0.0036, + "num_input_tokens_seen": 27541072, + "step": 14066 + }, + { + "epoch": 1.86441351888668, + "grad_norm": 2.201115846633911, + "learning_rate": 1.5701749426794155e-06, + "loss": 0.0082, + "num_input_tokens_seen": 27542296, + "step": 14067 + }, + { + "epoch": 1.864546056991385, + "grad_norm": 8.992050170898438, + "learning_rate": 1.5698527176976327e-06, + "loss": 0.0599, + "num_input_tokens_seen": 27545088, + "step": 14068 + }, + { + "epoch": 1.8646785950960902, + "grad_norm": 0.030985083431005478, + "learning_rate": 1.5695305106497299e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27547440, + "step": 14069 + }, + { + "epoch": 1.8648111332007953, + "grad_norm": 0.2746868133544922, + "learning_rate": 1.5692083215419219e-06, + "loss": 0.0018, + "num_input_tokens_seen": 27548440, + "step": 14070 + }, + { + "epoch": 1.8649436713055003, + "grad_norm": 6.842190265655518, + "learning_rate": 1.5688861503804193e-06, + "loss": 0.1316, + "num_input_tokens_seen": 27551152, + "step": 14071 + }, + { + "epoch": 1.8650762094102054, + "grad_norm": 0.06427037715911865, + "learning_rate": 1.5685639971714342e-06, + "loss": 0.0004, + "num_input_tokens_seen": 27552376, + "step": 14072 + }, + { + "epoch": 1.8652087475149104, + "grad_norm": 0.01843716762959957, + "learning_rate": 1.5682418619211773e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27553976, + "step": 14073 + }, + { + "epoch": 1.8653412856196157, + "grad_norm": 0.5826998949050903, + "learning_rate": 1.5679197446358597e-06, + "loss": 0.0033, + "num_input_tokens_seen": 27555176, + "step": 14074 + }, + { + "epoch": 1.8654738237243207, + "grad_norm": 4.751593112945557, + "learning_rate": 1.5675976453216934e-06, + "loss": 0.024, + "num_input_tokens_seen": 27557160, + "step": 14075 + }, + { + "epoch": 1.865606361829026, + "grad_norm": 6.868110179901123, + "learning_rate": 1.5672755639848875e-06, + "loss": 0.0942, + "num_input_tokens_seen": 27559640, + "step": 14076 + }, + { + "epoch": 1.865738899933731, + "grad_norm": 13.622636795043945, + "learning_rate": 1.5669535006316522e-06, + "loss": 0.1881, + "num_input_tokens_seen": 27562296, + "step": 14077 + }, + { + "epoch": 1.865871438038436, + "grad_norm": 5.738794326782227, + "learning_rate": 1.5666314552681971e-06, + "loss": 0.1445, + "num_input_tokens_seen": 27564360, + "step": 14078 + }, + { + "epoch": 1.866003976143141, + "grad_norm": 0.025654606521129608, + "learning_rate": 1.5663094279007307e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27566120, + "step": 14079 + }, + { + "epoch": 1.866136514247846, + "grad_norm": 1.25222647190094, + "learning_rate": 1.5659874185354635e-06, + "loss": 0.0031, + "num_input_tokens_seen": 27567736, + "step": 14080 + }, + { + "epoch": 1.8662690523525514, + "grad_norm": 10.99868392944336, + "learning_rate": 1.5656654271786032e-06, + "loss": 0.2584, + "num_input_tokens_seen": 27570424, + "step": 14081 + }, + { + "epoch": 1.8664015904572566, + "grad_norm": 18.30653190612793, + "learning_rate": 1.5653434538363576e-06, + "loss": 0.4025, + "num_input_tokens_seen": 27572344, + "step": 14082 + }, + { + "epoch": 1.8665341285619617, + "grad_norm": 0.05185495689511299, + "learning_rate": 1.5650214985149354e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27574608, + "step": 14083 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 10.017414093017578, + "learning_rate": 1.5646995612205428e-06, + "loss": 0.2672, + "num_input_tokens_seen": 27576512, + "step": 14084 + }, + { + "epoch": 1.8667992047713717, + "grad_norm": 5.13302755355835, + "learning_rate": 1.5643776419593892e-06, + "loss": 0.1458, + "num_input_tokens_seen": 27579064, + "step": 14085 + }, + { + "epoch": 1.8669317428760768, + "grad_norm": 2.797567367553711, + "learning_rate": 1.5640557407376795e-06, + "loss": 0.0275, + "num_input_tokens_seen": 27580768, + "step": 14086 + }, + { + "epoch": 1.8670642809807818, + "grad_norm": 5.849459171295166, + "learning_rate": 1.563733857561621e-06, + "loss": 0.2215, + "num_input_tokens_seen": 27582680, + "step": 14087 + }, + { + "epoch": 1.867196819085487, + "grad_norm": 5.6513752937316895, + "learning_rate": 1.563411992437419e-06, + "loss": 0.1996, + "num_input_tokens_seen": 27584880, + "step": 14088 + }, + { + "epoch": 1.8673293571901923, + "grad_norm": 6.223866939544678, + "learning_rate": 1.563090145371281e-06, + "loss": 0.0866, + "num_input_tokens_seen": 27586944, + "step": 14089 + }, + { + "epoch": 1.8674618952948974, + "grad_norm": 0.02455967105925083, + "learning_rate": 1.5627683163694099e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27588392, + "step": 14090 + }, + { + "epoch": 1.8675944333996024, + "grad_norm": 9.010183334350586, + "learning_rate": 1.5624465054380127e-06, + "loss": 0.1063, + "num_input_tokens_seen": 27589840, + "step": 14091 + }, + { + "epoch": 1.8677269715043074, + "grad_norm": 14.108307838439941, + "learning_rate": 1.5621247125832945e-06, + "loss": 0.1836, + "num_input_tokens_seen": 27590992, + "step": 14092 + }, + { + "epoch": 1.8678595096090125, + "grad_norm": 2.7079474925994873, + "learning_rate": 1.5618029378114586e-06, + "loss": 0.0192, + "num_input_tokens_seen": 27592352, + "step": 14093 + }, + { + "epoch": 1.8679920477137177, + "grad_norm": 7.940439701080322, + "learning_rate": 1.561481181128709e-06, + "loss": 0.0523, + "num_input_tokens_seen": 27594800, + "step": 14094 + }, + { + "epoch": 1.8681245858184228, + "grad_norm": 2.4309580326080322, + "learning_rate": 1.561159442541249e-06, + "loss": 0.0276, + "num_input_tokens_seen": 27596448, + "step": 14095 + }, + { + "epoch": 1.868257123923128, + "grad_norm": 0.06321682035923004, + "learning_rate": 1.5608377220552833e-06, + "loss": 0.0004, + "num_input_tokens_seen": 27597680, + "step": 14096 + }, + { + "epoch": 1.868389662027833, + "grad_norm": 14.411108016967773, + "learning_rate": 1.560516019677014e-06, + "loss": 0.198, + "num_input_tokens_seen": 27599624, + "step": 14097 + }, + { + "epoch": 1.8685222001325381, + "grad_norm": 7.884324550628662, + "learning_rate": 1.5601943354126448e-06, + "loss": 0.2242, + "num_input_tokens_seen": 27602272, + "step": 14098 + }, + { + "epoch": 1.8686547382372432, + "grad_norm": 0.11647869646549225, + "learning_rate": 1.5598726692683768e-06, + "loss": 0.0008, + "num_input_tokens_seen": 27603960, + "step": 14099 + }, + { + "epoch": 1.8687872763419482, + "grad_norm": 4.44553279876709, + "learning_rate": 1.5595510212504117e-06, + "loss": 0.0141, + "num_input_tokens_seen": 27606096, + "step": 14100 + }, + { + "epoch": 1.8689198144466534, + "grad_norm": 7.342031002044678, + "learning_rate": 1.5592293913649518e-06, + "loss": 0.1585, + "num_input_tokens_seen": 27608552, + "step": 14101 + }, + { + "epoch": 1.8690523525513585, + "grad_norm": 8.569260597229004, + "learning_rate": 1.5589077796181987e-06, + "loss": 0.0692, + "num_input_tokens_seen": 27609752, + "step": 14102 + }, + { + "epoch": 1.8691848906560637, + "grad_norm": 8.670154571533203, + "learning_rate": 1.5585861860163527e-06, + "loss": 0.3442, + "num_input_tokens_seen": 27612616, + "step": 14103 + }, + { + "epoch": 1.8693174287607688, + "grad_norm": 0.08332082629203796, + "learning_rate": 1.5582646105656144e-06, + "loss": 0.0006, + "num_input_tokens_seen": 27614520, + "step": 14104 + }, + { + "epoch": 1.8694499668654738, + "grad_norm": 7.1704421043396, + "learning_rate": 1.5579430532721834e-06, + "loss": 0.1656, + "num_input_tokens_seen": 27616672, + "step": 14105 + }, + { + "epoch": 1.8695825049701789, + "grad_norm": 6.13423490524292, + "learning_rate": 1.5576215141422607e-06, + "loss": 0.1529, + "num_input_tokens_seen": 27618888, + "step": 14106 + }, + { + "epoch": 1.869715043074884, + "grad_norm": 0.7433794140815735, + "learning_rate": 1.557299993182046e-06, + "loss": 0.0065, + "num_input_tokens_seen": 27620320, + "step": 14107 + }, + { + "epoch": 1.8698475811795892, + "grad_norm": 5.6730637550354, + "learning_rate": 1.5569784903977374e-06, + "loss": 0.0713, + "num_input_tokens_seen": 27622528, + "step": 14108 + }, + { + "epoch": 1.8699801192842942, + "grad_norm": 9.642454147338867, + "learning_rate": 1.5566570057955338e-06, + "loss": 0.1931, + "num_input_tokens_seen": 27624272, + "step": 14109 + }, + { + "epoch": 1.8701126573889995, + "grad_norm": 8.89737606048584, + "learning_rate": 1.5563355393816331e-06, + "loss": 0.0978, + "num_input_tokens_seen": 27625856, + "step": 14110 + }, + { + "epoch": 1.8702451954937045, + "grad_norm": 3.121096134185791, + "learning_rate": 1.5560140911622356e-06, + "loss": 0.0478, + "num_input_tokens_seen": 27628000, + "step": 14111 + }, + { + "epoch": 1.8703777335984095, + "grad_norm": 5.902791500091553, + "learning_rate": 1.555692661143537e-06, + "loss": 0.076, + "num_input_tokens_seen": 27630312, + "step": 14112 + }, + { + "epoch": 1.8705102717031146, + "grad_norm": 0.08562323451042175, + "learning_rate": 1.555371249331736e-06, + "loss": 0.0005, + "num_input_tokens_seen": 27632248, + "step": 14113 + }, + { + "epoch": 1.8706428098078196, + "grad_norm": 0.014794080518186092, + "learning_rate": 1.5550498557330285e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27633880, + "step": 14114 + }, + { + "epoch": 1.8707753479125249, + "grad_norm": 4.893691539764404, + "learning_rate": 1.554728480353612e-06, + "loss": 0.0827, + "num_input_tokens_seen": 27636152, + "step": 14115 + }, + { + "epoch": 1.87090788601723, + "grad_norm": 0.2623775899410248, + "learning_rate": 1.5544071231996817e-06, + "loss": 0.0014, + "num_input_tokens_seen": 27637648, + "step": 14116 + }, + { + "epoch": 1.8710404241219352, + "grad_norm": 16.594982147216797, + "learning_rate": 1.5540857842774348e-06, + "loss": 0.127, + "num_input_tokens_seen": 27639688, + "step": 14117 + }, + { + "epoch": 1.8711729622266402, + "grad_norm": 0.02024855650961399, + "learning_rate": 1.5537644635930666e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27641480, + "step": 14118 + }, + { + "epoch": 1.8713055003313452, + "grad_norm": 0.43249231576919556, + "learning_rate": 1.5534431611527728e-06, + "loss": 0.0029, + "num_input_tokens_seen": 27642728, + "step": 14119 + }, + { + "epoch": 1.8714380384360503, + "grad_norm": 0.016199866309762, + "learning_rate": 1.5531218769627471e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27643880, + "step": 14120 + }, + { + "epoch": 1.8715705765407553, + "grad_norm": 6.5732269287109375, + "learning_rate": 1.5528006110291843e-06, + "loss": 0.1685, + "num_input_tokens_seen": 27645680, + "step": 14121 + }, + { + "epoch": 1.8717031146454606, + "grad_norm": 5.988508701324463, + "learning_rate": 1.5524793633582803e-06, + "loss": 0.0793, + "num_input_tokens_seen": 27647824, + "step": 14122 + }, + { + "epoch": 1.8718356527501658, + "grad_norm": 0.005703015718609095, + "learning_rate": 1.552158133956228e-06, + "loss": 0.0, + "num_input_tokens_seen": 27649104, + "step": 14123 + }, + { + "epoch": 1.8719681908548709, + "grad_norm": 3.762333869934082, + "learning_rate": 1.55183692282922e-06, + "loss": 0.0514, + "num_input_tokens_seen": 27651296, + "step": 14124 + }, + { + "epoch": 1.872100728959576, + "grad_norm": 2.4711837768554688, + "learning_rate": 1.551515729983451e-06, + "loss": 0.0518, + "num_input_tokens_seen": 27652752, + "step": 14125 + }, + { + "epoch": 1.872233267064281, + "grad_norm": 0.010249309241771698, + "learning_rate": 1.5511945554251118e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27654984, + "step": 14126 + }, + { + "epoch": 1.872365805168986, + "grad_norm": 0.12878605723381042, + "learning_rate": 1.5508733991603963e-06, + "loss": 0.0009, + "num_input_tokens_seen": 27657000, + "step": 14127 + }, + { + "epoch": 1.872498343273691, + "grad_norm": 8.662306785583496, + "learning_rate": 1.5505522611954977e-06, + "loss": 0.0594, + "num_input_tokens_seen": 27658360, + "step": 14128 + }, + { + "epoch": 1.8726308813783963, + "grad_norm": 0.018929364159703255, + "learning_rate": 1.5502311415366055e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27659688, + "step": 14129 + }, + { + "epoch": 1.8727634194831015, + "grad_norm": 9.33077335357666, + "learning_rate": 1.5499100401899128e-06, + "loss": 0.2064, + "num_input_tokens_seen": 27662104, + "step": 14130 + }, + { + "epoch": 1.8728959575878066, + "grad_norm": 0.01811094768345356, + "learning_rate": 1.5495889571616088e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27664192, + "step": 14131 + }, + { + "epoch": 1.8730284956925116, + "grad_norm": 0.36984556913375854, + "learning_rate": 1.5492678924578862e-06, + "loss": 0.0014, + "num_input_tokens_seen": 27665664, + "step": 14132 + }, + { + "epoch": 1.8731610337972167, + "grad_norm": 7.323185443878174, + "learning_rate": 1.5489468460849343e-06, + "loss": 0.2072, + "num_input_tokens_seen": 27667776, + "step": 14133 + }, + { + "epoch": 1.8732935719019217, + "grad_norm": 5.878580093383789, + "learning_rate": 1.5486258180489438e-06, + "loss": 0.1306, + "num_input_tokens_seen": 27670016, + "step": 14134 + }, + { + "epoch": 1.873426110006627, + "grad_norm": 15.401344299316406, + "learning_rate": 1.5483048083561036e-06, + "loss": 0.3602, + "num_input_tokens_seen": 27672016, + "step": 14135 + }, + { + "epoch": 1.873558648111332, + "grad_norm": 3.4676899909973145, + "learning_rate": 1.547983817012602e-06, + "loss": 0.04, + "num_input_tokens_seen": 27674544, + "step": 14136 + }, + { + "epoch": 1.8736911862160373, + "grad_norm": 0.5248491168022156, + "learning_rate": 1.547662844024631e-06, + "loss": 0.0035, + "num_input_tokens_seen": 27676320, + "step": 14137 + }, + { + "epoch": 1.8738237243207423, + "grad_norm": 11.228452682495117, + "learning_rate": 1.547341889398377e-06, + "loss": 0.235, + "num_input_tokens_seen": 27678528, + "step": 14138 + }, + { + "epoch": 1.8739562624254473, + "grad_norm": 0.4221613109111786, + "learning_rate": 1.547020953140028e-06, + "loss": 0.0025, + "num_input_tokens_seen": 27680944, + "step": 14139 + }, + { + "epoch": 1.8740888005301524, + "grad_norm": 0.4591045677661896, + "learning_rate": 1.5467000352557728e-06, + "loss": 0.0027, + "num_input_tokens_seen": 27683312, + "step": 14140 + }, + { + "epoch": 1.8742213386348574, + "grad_norm": 0.1005808413028717, + "learning_rate": 1.546379135751798e-06, + "loss": 0.0007, + "num_input_tokens_seen": 27685488, + "step": 14141 + }, + { + "epoch": 1.8743538767395627, + "grad_norm": 3.4582455158233643, + "learning_rate": 1.5460582546342914e-06, + "loss": 0.0943, + "num_input_tokens_seen": 27687320, + "step": 14142 + }, + { + "epoch": 1.8744864148442677, + "grad_norm": 2.817018508911133, + "learning_rate": 1.5457373919094403e-06, + "loss": 0.0743, + "num_input_tokens_seen": 27689304, + "step": 14143 + }, + { + "epoch": 1.874618952948973, + "grad_norm": 7.5096659660339355, + "learning_rate": 1.5454165475834304e-06, + "loss": 0.2677, + "num_input_tokens_seen": 27690992, + "step": 14144 + }, + { + "epoch": 1.874751491053678, + "grad_norm": 0.29318150877952576, + "learning_rate": 1.5450957216624485e-06, + "loss": 0.002, + "num_input_tokens_seen": 27693664, + "step": 14145 + }, + { + "epoch": 1.874884029158383, + "grad_norm": 3.20991849899292, + "learning_rate": 1.5447749141526797e-06, + "loss": 0.0355, + "num_input_tokens_seen": 27695512, + "step": 14146 + }, + { + "epoch": 1.875016567263088, + "grad_norm": 8.455756187438965, + "learning_rate": 1.5444541250603085e-06, + "loss": 0.1597, + "num_input_tokens_seen": 27697240, + "step": 14147 + }, + { + "epoch": 1.8751491053677931, + "grad_norm": 0.04607174172997475, + "learning_rate": 1.5441333543915213e-06, + "loss": 0.0003, + "num_input_tokens_seen": 27699272, + "step": 14148 + }, + { + "epoch": 1.8752816434724984, + "grad_norm": 0.011877795681357384, + "learning_rate": 1.543812602152503e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27701264, + "step": 14149 + }, + { + "epoch": 1.8754141815772034, + "grad_norm": 0.01679817959666252, + "learning_rate": 1.5434918683494372e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27704680, + "step": 14150 + }, + { + "epoch": 1.8755467196819087, + "grad_norm": 11.755133628845215, + "learning_rate": 1.5431711529885083e-06, + "loss": 0.1482, + "num_input_tokens_seen": 27707016, + "step": 14151 + }, + { + "epoch": 1.8756792577866137, + "grad_norm": 0.7682393789291382, + "learning_rate": 1.5428504560758985e-06, + "loss": 0.0065, + "num_input_tokens_seen": 27709136, + "step": 14152 + }, + { + "epoch": 1.8758117958913187, + "grad_norm": 4.193154335021973, + "learning_rate": 1.5425297776177934e-06, + "loss": 0.0505, + "num_input_tokens_seen": 27711376, + "step": 14153 + }, + { + "epoch": 1.8759443339960238, + "grad_norm": 3.6658122539520264, + "learning_rate": 1.542209117620374e-06, + "loss": 0.1059, + "num_input_tokens_seen": 27712952, + "step": 14154 + }, + { + "epoch": 1.8760768721007288, + "grad_norm": 4.236730575561523, + "learning_rate": 1.5418884760898246e-06, + "loss": 0.0584, + "num_input_tokens_seen": 27714352, + "step": 14155 + }, + { + "epoch": 1.876209410205434, + "grad_norm": 0.13003882765769958, + "learning_rate": 1.5415678530323253e-06, + "loss": 0.0013, + "num_input_tokens_seen": 27716488, + "step": 14156 + }, + { + "epoch": 1.8763419483101391, + "grad_norm": 4.575048923492432, + "learning_rate": 1.5412472484540586e-06, + "loss": 0.1099, + "num_input_tokens_seen": 27718232, + "step": 14157 + }, + { + "epoch": 1.8764744864148444, + "grad_norm": 0.09059682488441467, + "learning_rate": 1.5409266623612075e-06, + "loss": 0.001, + "num_input_tokens_seen": 27719440, + "step": 14158 + }, + { + "epoch": 1.8766070245195494, + "grad_norm": 2.265956401824951, + "learning_rate": 1.5406060947599516e-06, + "loss": 0.0315, + "num_input_tokens_seen": 27721368, + "step": 14159 + }, + { + "epoch": 1.8767395626242545, + "grad_norm": 3.48268985748291, + "learning_rate": 1.5402855456564724e-06, + "loss": 0.0369, + "num_input_tokens_seen": 27723808, + "step": 14160 + }, + { + "epoch": 1.8768721007289595, + "grad_norm": 8.093901634216309, + "learning_rate": 1.5399650150569497e-06, + "loss": 0.134, + "num_input_tokens_seen": 27726160, + "step": 14161 + }, + { + "epoch": 1.8770046388336645, + "grad_norm": 0.06780815124511719, + "learning_rate": 1.5396445029675633e-06, + "loss": 0.0007, + "num_input_tokens_seen": 27727704, + "step": 14162 + }, + { + "epoch": 1.8771371769383698, + "grad_norm": 0.02056991308927536, + "learning_rate": 1.5393240093944933e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27729304, + "step": 14163 + }, + { + "epoch": 1.877269715043075, + "grad_norm": 0.01351380255073309, + "learning_rate": 1.5390035343439203e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27730728, + "step": 14164 + }, + { + "epoch": 1.87740225314778, + "grad_norm": 0.03180774301290512, + "learning_rate": 1.5386830778220212e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27732752, + "step": 14165 + }, + { + "epoch": 1.8775347912524851, + "grad_norm": 0.5520296692848206, + "learning_rate": 1.538362639834976e-06, + "loss": 0.0026, + "num_input_tokens_seen": 27734064, + "step": 14166 + }, + { + "epoch": 1.8776673293571902, + "grad_norm": 3.5574514865875244, + "learning_rate": 1.5380422203889617e-06, + "loss": 0.0891, + "num_input_tokens_seen": 27737048, + "step": 14167 + }, + { + "epoch": 1.8777998674618952, + "grad_norm": 6.2398858070373535, + "learning_rate": 1.5377218194901572e-06, + "loss": 0.1338, + "num_input_tokens_seen": 27739168, + "step": 14168 + }, + { + "epoch": 1.8779324055666002, + "grad_norm": 6.654844760894775, + "learning_rate": 1.5374014371447407e-06, + "loss": 0.1584, + "num_input_tokens_seen": 27741224, + "step": 14169 + }, + { + "epoch": 1.8780649436713055, + "grad_norm": 0.8270488977432251, + "learning_rate": 1.5370810733588886e-06, + "loss": 0.0056, + "num_input_tokens_seen": 27742832, + "step": 14170 + }, + { + "epoch": 1.8781974817760108, + "grad_norm": 3.3100860118865967, + "learning_rate": 1.536760728138777e-06, + "loss": 0.0175, + "num_input_tokens_seen": 27744520, + "step": 14171 + }, + { + "epoch": 1.8783300198807158, + "grad_norm": 3.5916621685028076, + "learning_rate": 1.5364404014905826e-06, + "loss": 0.0373, + "num_input_tokens_seen": 27746240, + "step": 14172 + }, + { + "epoch": 1.8784625579854208, + "grad_norm": 11.471362113952637, + "learning_rate": 1.5361200934204828e-06, + "loss": 0.3222, + "num_input_tokens_seen": 27749224, + "step": 14173 + }, + { + "epoch": 1.8785950960901259, + "grad_norm": 13.595063209533691, + "learning_rate": 1.5357998039346522e-06, + "loss": 0.2773, + "num_input_tokens_seen": 27751168, + "step": 14174 + }, + { + "epoch": 1.878727634194831, + "grad_norm": 8.046184539794922, + "learning_rate": 1.5354795330392671e-06, + "loss": 0.2175, + "num_input_tokens_seen": 27752904, + "step": 14175 + }, + { + "epoch": 1.8788601722995362, + "grad_norm": 20.819852828979492, + "learning_rate": 1.5351592807405019e-06, + "loss": 0.4446, + "num_input_tokens_seen": 27754832, + "step": 14176 + }, + { + "epoch": 1.8789927104042412, + "grad_norm": 13.947122573852539, + "learning_rate": 1.5348390470445313e-06, + "loss": 0.2745, + "num_input_tokens_seen": 27757136, + "step": 14177 + }, + { + "epoch": 1.8791252485089465, + "grad_norm": 7.672341823577881, + "learning_rate": 1.534518831957529e-06, + "loss": 0.1065, + "num_input_tokens_seen": 27758952, + "step": 14178 + }, + { + "epoch": 1.8792577866136515, + "grad_norm": 0.04887288808822632, + "learning_rate": 1.5341986354856704e-06, + "loss": 0.0003, + "num_input_tokens_seen": 27760208, + "step": 14179 + }, + { + "epoch": 1.8793903247183565, + "grad_norm": 14.544113159179688, + "learning_rate": 1.5338784576351279e-06, + "loss": 0.2286, + "num_input_tokens_seen": 27761952, + "step": 14180 + }, + { + "epoch": 1.8795228628230616, + "grad_norm": 7.330190658569336, + "learning_rate": 1.5335582984120755e-06, + "loss": 0.2086, + "num_input_tokens_seen": 27764008, + "step": 14181 + }, + { + "epoch": 1.8796554009277666, + "grad_norm": 5.348171234130859, + "learning_rate": 1.5332381578226856e-06, + "loss": 0.1372, + "num_input_tokens_seen": 27766392, + "step": 14182 + }, + { + "epoch": 1.8797879390324719, + "grad_norm": 2.904508113861084, + "learning_rate": 1.5329180358731307e-06, + "loss": 0.036, + "num_input_tokens_seen": 27767544, + "step": 14183 + }, + { + "epoch": 1.879920477137177, + "grad_norm": 1.784377098083496, + "learning_rate": 1.532597932569584e-06, + "loss": 0.0108, + "num_input_tokens_seen": 27769288, + "step": 14184 + }, + { + "epoch": 1.8800530152418822, + "grad_norm": 7.41031551361084, + "learning_rate": 1.5322778479182165e-06, + "loss": 0.2059, + "num_input_tokens_seen": 27771568, + "step": 14185 + }, + { + "epoch": 1.8801855533465872, + "grad_norm": 6.613982200622559, + "learning_rate": 1.5319577819251994e-06, + "loss": 0.0737, + "num_input_tokens_seen": 27773672, + "step": 14186 + }, + { + "epoch": 1.8803180914512923, + "grad_norm": 8.324676513671875, + "learning_rate": 1.5316377345967044e-06, + "loss": 0.2048, + "num_input_tokens_seen": 27775744, + "step": 14187 + }, + { + "epoch": 1.8804506295559973, + "grad_norm": 0.004979959223419428, + "learning_rate": 1.531317705938901e-06, + "loss": 0.0, + "num_input_tokens_seen": 27777024, + "step": 14188 + }, + { + "epoch": 1.8805831676607023, + "grad_norm": 4.630685329437256, + "learning_rate": 1.5309976959579608e-06, + "loss": 0.0372, + "num_input_tokens_seen": 27778352, + "step": 14189 + }, + { + "epoch": 1.8807157057654076, + "grad_norm": 0.2787049412727356, + "learning_rate": 1.530677704660054e-06, + "loss": 0.0016, + "num_input_tokens_seen": 27780600, + "step": 14190 + }, + { + "epoch": 1.8808482438701126, + "grad_norm": 6.769437313079834, + "learning_rate": 1.5303577320513503e-06, + "loss": 0.128, + "num_input_tokens_seen": 27782112, + "step": 14191 + }, + { + "epoch": 1.8809807819748179, + "grad_norm": 6.621524333953857, + "learning_rate": 1.5300377781380174e-06, + "loss": 0.0603, + "num_input_tokens_seen": 27783912, + "step": 14192 + }, + { + "epoch": 1.881113320079523, + "grad_norm": 3.9912326335906982, + "learning_rate": 1.5297178429262249e-06, + "loss": 0.056, + "num_input_tokens_seen": 27785288, + "step": 14193 + }, + { + "epoch": 1.881245858184228, + "grad_norm": 8.607402801513672, + "learning_rate": 1.5293979264221428e-06, + "loss": 0.2546, + "num_input_tokens_seen": 27787424, + "step": 14194 + }, + { + "epoch": 1.881378396288933, + "grad_norm": 2.973525047302246, + "learning_rate": 1.5290780286319378e-06, + "loss": 0.0305, + "num_input_tokens_seen": 27788792, + "step": 14195 + }, + { + "epoch": 1.881510934393638, + "grad_norm": 1.6846907138824463, + "learning_rate": 1.5287581495617787e-06, + "loss": 0.0144, + "num_input_tokens_seen": 27791840, + "step": 14196 + }, + { + "epoch": 1.8816434724983433, + "grad_norm": 0.5597503781318665, + "learning_rate": 1.5284382892178318e-06, + "loss": 0.0026, + "num_input_tokens_seen": 27792976, + "step": 14197 + }, + { + "epoch": 1.8817760106030483, + "grad_norm": 14.210325241088867, + "learning_rate": 1.5281184476062646e-06, + "loss": 0.4343, + "num_input_tokens_seen": 27794680, + "step": 14198 + }, + { + "epoch": 1.8819085487077536, + "grad_norm": 3.1090426445007324, + "learning_rate": 1.5277986247332452e-06, + "loss": 0.0166, + "num_input_tokens_seen": 27795664, + "step": 14199 + }, + { + "epoch": 1.8820410868124586, + "grad_norm": 7.3814167976379395, + "learning_rate": 1.527478820604939e-06, + "loss": 0.0908, + "num_input_tokens_seen": 27797904, + "step": 14200 + }, + { + "epoch": 1.8821736249171637, + "grad_norm": 4.604297161102295, + "learning_rate": 1.5271590352275115e-06, + "loss": 0.079, + "num_input_tokens_seen": 27800208, + "step": 14201 + }, + { + "epoch": 1.8823061630218687, + "grad_norm": 0.49099692702293396, + "learning_rate": 1.5268392686071294e-06, + "loss": 0.0035, + "num_input_tokens_seen": 27802016, + "step": 14202 + }, + { + "epoch": 1.8824387011265737, + "grad_norm": 6.846774101257324, + "learning_rate": 1.5265195207499572e-06, + "loss": 0.2203, + "num_input_tokens_seen": 27803824, + "step": 14203 + }, + { + "epoch": 1.882571239231279, + "grad_norm": 12.261384963989258, + "learning_rate": 1.5261997916621596e-06, + "loss": 0.2852, + "num_input_tokens_seen": 27805592, + "step": 14204 + }, + { + "epoch": 1.882703777335984, + "grad_norm": 0.028447190299630165, + "learning_rate": 1.5258800813499029e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27807104, + "step": 14205 + }, + { + "epoch": 1.8828363154406893, + "grad_norm": 4.67616605758667, + "learning_rate": 1.52556038981935e-06, + "loss": 0.0867, + "num_input_tokens_seen": 27809288, + "step": 14206 + }, + { + "epoch": 1.8829688535453943, + "grad_norm": 12.975214004516602, + "learning_rate": 1.5252407170766648e-06, + "loss": 0.3973, + "num_input_tokens_seen": 27811560, + "step": 14207 + }, + { + "epoch": 1.8831013916500994, + "grad_norm": 11.32382869720459, + "learning_rate": 1.5249210631280115e-06, + "loss": 0.1903, + "num_input_tokens_seen": 27813304, + "step": 14208 + }, + { + "epoch": 1.8832339297548044, + "grad_norm": 11.548720359802246, + "learning_rate": 1.5246014279795514e-06, + "loss": 0.1593, + "num_input_tokens_seen": 27815688, + "step": 14209 + }, + { + "epoch": 1.8833664678595095, + "grad_norm": 7.484272003173828, + "learning_rate": 1.5242818116374496e-06, + "loss": 0.1952, + "num_input_tokens_seen": 27817856, + "step": 14210 + }, + { + "epoch": 1.8834990059642147, + "grad_norm": 4.7804975509643555, + "learning_rate": 1.523962214107868e-06, + "loss": 0.1388, + "num_input_tokens_seen": 27819472, + "step": 14211 + }, + { + "epoch": 1.88363154406892, + "grad_norm": 6.387711524963379, + "learning_rate": 1.5236426353969674e-06, + "loss": 0.11, + "num_input_tokens_seen": 27821720, + "step": 14212 + }, + { + "epoch": 1.883764082173625, + "grad_norm": 0.024781053885817528, + "learning_rate": 1.5233230755109113e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27823816, + "step": 14213 + }, + { + "epoch": 1.88389662027833, + "grad_norm": 3.994509220123291, + "learning_rate": 1.5230035344558591e-06, + "loss": 0.0941, + "num_input_tokens_seen": 27826544, + "step": 14214 + }, + { + "epoch": 1.884029158383035, + "grad_norm": 0.47853708267211914, + "learning_rate": 1.5226840122379733e-06, + "loss": 0.0039, + "num_input_tokens_seen": 27827976, + "step": 14215 + }, + { + "epoch": 1.8841616964877401, + "grad_norm": 0.031244471669197083, + "learning_rate": 1.5223645088634137e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27829864, + "step": 14216 + }, + { + "epoch": 1.8842942345924454, + "grad_norm": 5.578570365905762, + "learning_rate": 1.5220450243383414e-06, + "loss": 0.2205, + "num_input_tokens_seen": 27831688, + "step": 14217 + }, + { + "epoch": 1.8844267726971504, + "grad_norm": 0.08892669528722763, + "learning_rate": 1.521725558668915e-06, + "loss": 0.0005, + "num_input_tokens_seen": 27833240, + "step": 14218 + }, + { + "epoch": 1.8845593108018557, + "grad_norm": 0.7252469062805176, + "learning_rate": 1.5214061118612944e-06, + "loss": 0.0078, + "num_input_tokens_seen": 27835168, + "step": 14219 + }, + { + "epoch": 1.8846918489065607, + "grad_norm": 8.59192180633545, + "learning_rate": 1.5210866839216398e-06, + "loss": 0.1983, + "num_input_tokens_seen": 27837544, + "step": 14220 + }, + { + "epoch": 1.8848243870112658, + "grad_norm": 0.0762672871351242, + "learning_rate": 1.5207672748561091e-06, + "loss": 0.0004, + "num_input_tokens_seen": 27840384, + "step": 14221 + }, + { + "epoch": 1.8849569251159708, + "grad_norm": 0.011521776206791401, + "learning_rate": 1.5204478846708613e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27842280, + "step": 14222 + }, + { + "epoch": 1.8850894632206758, + "grad_norm": 10.749360084533691, + "learning_rate": 1.520128513372054e-06, + "loss": 0.1443, + "num_input_tokens_seen": 27844736, + "step": 14223 + }, + { + "epoch": 1.885222001325381, + "grad_norm": 9.174150466918945, + "learning_rate": 1.519809160965844e-06, + "loss": 0.1424, + "num_input_tokens_seen": 27845864, + "step": 14224 + }, + { + "epoch": 1.8853545394300861, + "grad_norm": 8.902331352233887, + "learning_rate": 1.5194898274583897e-06, + "loss": 0.225, + "num_input_tokens_seen": 27848304, + "step": 14225 + }, + { + "epoch": 1.8854870775347914, + "grad_norm": 0.05295057222247124, + "learning_rate": 1.5191705128558486e-06, + "loss": 0.0003, + "num_input_tokens_seen": 27850288, + "step": 14226 + }, + { + "epoch": 1.8856196156394964, + "grad_norm": 5.496303558349609, + "learning_rate": 1.5188512171643766e-06, + "loss": 0.1148, + "num_input_tokens_seen": 27853224, + "step": 14227 + }, + { + "epoch": 1.8857521537442015, + "grad_norm": 5.95568323135376, + "learning_rate": 1.51853194039013e-06, + "loss": 0.0913, + "num_input_tokens_seen": 27854952, + "step": 14228 + }, + { + "epoch": 1.8858846918489065, + "grad_norm": 8.58351993560791, + "learning_rate": 1.5182126825392637e-06, + "loss": 0.0292, + "num_input_tokens_seen": 27856480, + "step": 14229 + }, + { + "epoch": 1.8860172299536115, + "grad_norm": 0.4832451045513153, + "learning_rate": 1.5178934436179355e-06, + "loss": 0.003, + "num_input_tokens_seen": 27859520, + "step": 14230 + }, + { + "epoch": 1.8861497680583168, + "grad_norm": 12.110719680786133, + "learning_rate": 1.5175742236322983e-06, + "loss": 0.2945, + "num_input_tokens_seen": 27861232, + "step": 14231 + }, + { + "epoch": 1.8862823061630218, + "grad_norm": 0.042423758655786514, + "learning_rate": 1.5172550225885085e-06, + "loss": 0.0003, + "num_input_tokens_seen": 27862368, + "step": 14232 + }, + { + "epoch": 1.886414844267727, + "grad_norm": 13.581439018249512, + "learning_rate": 1.5169358404927195e-06, + "loss": 0.165, + "num_input_tokens_seen": 27864432, + "step": 14233 + }, + { + "epoch": 1.8865473823724321, + "grad_norm": 8.45539379119873, + "learning_rate": 1.5166166773510859e-06, + "loss": 0.2729, + "num_input_tokens_seen": 27867288, + "step": 14234 + }, + { + "epoch": 1.8866799204771372, + "grad_norm": 5.302933692932129, + "learning_rate": 1.5162975331697605e-06, + "loss": 0.0498, + "num_input_tokens_seen": 27869392, + "step": 14235 + }, + { + "epoch": 1.8868124585818422, + "grad_norm": 0.05805386230349541, + "learning_rate": 1.5159784079548976e-06, + "loss": 0.0004, + "num_input_tokens_seen": 27872304, + "step": 14236 + }, + { + "epoch": 1.8869449966865472, + "grad_norm": 16.82742691040039, + "learning_rate": 1.5156593017126503e-06, + "loss": 0.3361, + "num_input_tokens_seen": 27874224, + "step": 14237 + }, + { + "epoch": 1.8870775347912525, + "grad_norm": 4.378699779510498, + "learning_rate": 1.5153402144491709e-06, + "loss": 0.1523, + "num_input_tokens_seen": 27876304, + "step": 14238 + }, + { + "epoch": 1.8872100728959575, + "grad_norm": 8.531929969787598, + "learning_rate": 1.5150211461706104e-06, + "loss": 0.1984, + "num_input_tokens_seen": 27878312, + "step": 14239 + }, + { + "epoch": 1.8873426110006628, + "grad_norm": 5.728397369384766, + "learning_rate": 1.5147020968831217e-06, + "loss": 0.1479, + "num_input_tokens_seen": 27880432, + "step": 14240 + }, + { + "epoch": 1.8874751491053678, + "grad_norm": 0.06679793447256088, + "learning_rate": 1.5143830665928572e-06, + "loss": 0.0005, + "num_input_tokens_seen": 27882560, + "step": 14241 + }, + { + "epoch": 1.8876076872100729, + "grad_norm": 7.64691162109375, + "learning_rate": 1.5140640553059665e-06, + "loss": 0.1823, + "num_input_tokens_seen": 27884456, + "step": 14242 + }, + { + "epoch": 1.887740225314778, + "grad_norm": 6.689384460449219, + "learning_rate": 1.5137450630286017e-06, + "loss": 0.0674, + "num_input_tokens_seen": 27886184, + "step": 14243 + }, + { + "epoch": 1.887872763419483, + "grad_norm": 5.318139553070068, + "learning_rate": 1.5134260897669124e-06, + "loss": 0.2196, + "num_input_tokens_seen": 27888120, + "step": 14244 + }, + { + "epoch": 1.8880053015241882, + "grad_norm": 6.747684001922607, + "learning_rate": 1.5131071355270477e-06, + "loss": 0.0582, + "num_input_tokens_seen": 27889992, + "step": 14245 + }, + { + "epoch": 1.8881378396288933, + "grad_norm": 7.919478893280029, + "learning_rate": 1.5127882003151586e-06, + "loss": 0.0899, + "num_input_tokens_seen": 27891440, + "step": 14246 + }, + { + "epoch": 1.8882703777335985, + "grad_norm": 1.8978681564331055, + "learning_rate": 1.5124692841373946e-06, + "loss": 0.0181, + "num_input_tokens_seen": 27893120, + "step": 14247 + }, + { + "epoch": 1.8884029158383036, + "grad_norm": 10.8031587600708, + "learning_rate": 1.5121503869999039e-06, + "loss": 0.1444, + "num_input_tokens_seen": 27895096, + "step": 14248 + }, + { + "epoch": 1.8885354539430086, + "grad_norm": 0.20030514895915985, + "learning_rate": 1.5118315089088352e-06, + "loss": 0.0013, + "num_input_tokens_seen": 27896880, + "step": 14249 + }, + { + "epoch": 1.8886679920477136, + "grad_norm": 5.710576057434082, + "learning_rate": 1.5115126498703358e-06, + "loss": 0.1196, + "num_input_tokens_seen": 27898664, + "step": 14250 + }, + { + "epoch": 1.8888005301524187, + "grad_norm": 1.7244715690612793, + "learning_rate": 1.5111938098905551e-06, + "loss": 0.0282, + "num_input_tokens_seen": 27900248, + "step": 14251 + }, + { + "epoch": 1.888933068257124, + "grad_norm": 8.273646354675293, + "learning_rate": 1.5108749889756402e-06, + "loss": 0.2399, + "num_input_tokens_seen": 27902208, + "step": 14252 + }, + { + "epoch": 1.8890656063618292, + "grad_norm": 0.2664487361907959, + "learning_rate": 1.5105561871317382e-06, + "loss": 0.0012, + "num_input_tokens_seen": 27903720, + "step": 14253 + }, + { + "epoch": 1.8891981444665342, + "grad_norm": 0.879421055316925, + "learning_rate": 1.5102374043649949e-06, + "loss": 0.008, + "num_input_tokens_seen": 27905344, + "step": 14254 + }, + { + "epoch": 1.8893306825712393, + "grad_norm": 3.511857748031616, + "learning_rate": 1.5099186406815566e-06, + "loss": 0.107, + "num_input_tokens_seen": 27907000, + "step": 14255 + }, + { + "epoch": 1.8894632206759443, + "grad_norm": 13.081283569335938, + "learning_rate": 1.509599896087571e-06, + "loss": 0.2253, + "num_input_tokens_seen": 27909448, + "step": 14256 + }, + { + "epoch": 1.8895957587806493, + "grad_norm": 2.9708056449890137, + "learning_rate": 1.5092811705891821e-06, + "loss": 0.0578, + "num_input_tokens_seen": 27911520, + "step": 14257 + }, + { + "epoch": 1.8897282968853544, + "grad_norm": 7.5729570388793945, + "learning_rate": 1.5089624641925364e-06, + "loss": 0.1322, + "num_input_tokens_seen": 27913576, + "step": 14258 + }, + { + "epoch": 1.8898608349900596, + "grad_norm": 5.853022575378418, + "learning_rate": 1.5086437769037774e-06, + "loss": 0.0632, + "num_input_tokens_seen": 27915648, + "step": 14259 + }, + { + "epoch": 1.889993373094765, + "grad_norm": 4.938845157623291, + "learning_rate": 1.5083251087290506e-06, + "loss": 0.1517, + "num_input_tokens_seen": 27917456, + "step": 14260 + }, + { + "epoch": 1.89012591119947, + "grad_norm": 9.295892715454102, + "learning_rate": 1.5080064596744991e-06, + "loss": 0.1141, + "num_input_tokens_seen": 27919840, + "step": 14261 + }, + { + "epoch": 1.890258449304175, + "grad_norm": 0.05260400101542473, + "learning_rate": 1.5076878297462682e-06, + "loss": 0.0004, + "num_input_tokens_seen": 27921872, + "step": 14262 + }, + { + "epoch": 1.89039098740888, + "grad_norm": 6.452965259552002, + "learning_rate": 1.5073692189505002e-06, + "loss": 0.0754, + "num_input_tokens_seen": 27923656, + "step": 14263 + }, + { + "epoch": 1.890523525513585, + "grad_norm": 9.336358070373535, + "learning_rate": 1.5070506272933388e-06, + "loss": 0.132, + "num_input_tokens_seen": 27925384, + "step": 14264 + }, + { + "epoch": 1.8906560636182903, + "grad_norm": 12.07724666595459, + "learning_rate": 1.5067320547809262e-06, + "loss": 0.1299, + "num_input_tokens_seen": 27927048, + "step": 14265 + }, + { + "epoch": 1.8907886017229953, + "grad_norm": 4.5465497970581055, + "learning_rate": 1.5064135014194038e-06, + "loss": 0.0925, + "num_input_tokens_seen": 27929672, + "step": 14266 + }, + { + "epoch": 1.8909211398277006, + "grad_norm": 0.21656224131584167, + "learning_rate": 1.5060949672149155e-06, + "loss": 0.0016, + "num_input_tokens_seen": 27931512, + "step": 14267 + }, + { + "epoch": 1.8910536779324056, + "grad_norm": 3.129467010498047, + "learning_rate": 1.5057764521736023e-06, + "loss": 0.0421, + "num_input_tokens_seen": 27933240, + "step": 14268 + }, + { + "epoch": 1.8911862160371107, + "grad_norm": 0.29531675577163696, + "learning_rate": 1.5054579563016043e-06, + "loss": 0.002, + "num_input_tokens_seen": 27935240, + "step": 14269 + }, + { + "epoch": 1.8913187541418157, + "grad_norm": 2.3875887393951416, + "learning_rate": 1.5051394796050633e-06, + "loss": 0.0639, + "num_input_tokens_seen": 27936672, + "step": 14270 + }, + { + "epoch": 1.8914512922465208, + "grad_norm": 2.8848841190338135, + "learning_rate": 1.5048210220901188e-06, + "loss": 0.0573, + "num_input_tokens_seen": 27938768, + "step": 14271 + }, + { + "epoch": 1.891583830351226, + "grad_norm": 1.1078637838363647, + "learning_rate": 1.5045025837629119e-06, + "loss": 0.0176, + "num_input_tokens_seen": 27940488, + "step": 14272 + }, + { + "epoch": 1.891716368455931, + "grad_norm": 5.7457804679870605, + "learning_rate": 1.5041841646295823e-06, + "loss": 0.083, + "num_input_tokens_seen": 27943296, + "step": 14273 + }, + { + "epoch": 1.8918489065606363, + "grad_norm": 3.2154109477996826, + "learning_rate": 1.5038657646962685e-06, + "loss": 0.046, + "num_input_tokens_seen": 27945424, + "step": 14274 + }, + { + "epoch": 1.8919814446653413, + "grad_norm": 0.18191538751125336, + "learning_rate": 1.5035473839691106e-06, + "loss": 0.0013, + "num_input_tokens_seen": 27946792, + "step": 14275 + }, + { + "epoch": 1.8921139827700464, + "grad_norm": 6.376058578491211, + "learning_rate": 1.5032290224542457e-06, + "loss": 0.1323, + "num_input_tokens_seen": 27948536, + "step": 14276 + }, + { + "epoch": 1.8922465208747514, + "grad_norm": 1.948290467262268, + "learning_rate": 1.5029106801578137e-06, + "loss": 0.0232, + "num_input_tokens_seen": 27951112, + "step": 14277 + }, + { + "epoch": 1.8923790589794565, + "grad_norm": 2.884453296661377, + "learning_rate": 1.5025923570859507e-06, + "loss": 0.0565, + "num_input_tokens_seen": 27953256, + "step": 14278 + }, + { + "epoch": 1.8925115970841617, + "grad_norm": 4.846578598022461, + "learning_rate": 1.5022740532447963e-06, + "loss": 0.0617, + "num_input_tokens_seen": 27955088, + "step": 14279 + }, + { + "epoch": 1.8926441351888668, + "grad_norm": 8.830873489379883, + "learning_rate": 1.5019557686404857e-06, + "loss": 0.1141, + "num_input_tokens_seen": 27957256, + "step": 14280 + }, + { + "epoch": 1.892776673293572, + "grad_norm": 11.722908020019531, + "learning_rate": 1.5016375032791557e-06, + "loss": 0.2234, + "num_input_tokens_seen": 27959528, + "step": 14281 + }, + { + "epoch": 1.892909211398277, + "grad_norm": 2.970183849334717, + "learning_rate": 1.5013192571669447e-06, + "loss": 0.0451, + "num_input_tokens_seen": 27961360, + "step": 14282 + }, + { + "epoch": 1.893041749502982, + "grad_norm": 0.02193407341837883, + "learning_rate": 1.5010010303099872e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27962528, + "step": 14283 + }, + { + "epoch": 1.8931742876076871, + "grad_norm": 8.791484832763672, + "learning_rate": 1.5006828227144188e-06, + "loss": 0.1738, + "num_input_tokens_seen": 27964928, + "step": 14284 + }, + { + "epoch": 1.8933068257123922, + "grad_norm": 10.523104667663574, + "learning_rate": 1.500364634386375e-06, + "loss": 0.2605, + "num_input_tokens_seen": 27967328, + "step": 14285 + }, + { + "epoch": 1.8934393638170974, + "grad_norm": 7.065920352935791, + "learning_rate": 1.5000464653319898e-06, + "loss": 0.084, + "num_input_tokens_seen": 27970240, + "step": 14286 + }, + { + "epoch": 1.8935719019218025, + "grad_norm": 0.354267954826355, + "learning_rate": 1.4997283155573995e-06, + "loss": 0.0026, + "num_input_tokens_seen": 27972456, + "step": 14287 + }, + { + "epoch": 1.8937044400265077, + "grad_norm": 7.797153949737549, + "learning_rate": 1.4994101850687373e-06, + "loss": 0.1817, + "num_input_tokens_seen": 27974952, + "step": 14288 + }, + { + "epoch": 1.8938369781312128, + "grad_norm": 1.6423008441925049, + "learning_rate": 1.4990920738721366e-06, + "loss": 0.0185, + "num_input_tokens_seen": 27976568, + "step": 14289 + }, + { + "epoch": 1.8939695162359178, + "grad_norm": 5.955856800079346, + "learning_rate": 1.4987739819737318e-06, + "loss": 0.0578, + "num_input_tokens_seen": 27978704, + "step": 14290 + }, + { + "epoch": 1.8941020543406228, + "grad_norm": 1.1075438261032104, + "learning_rate": 1.4984559093796556e-06, + "loss": 0.0055, + "num_input_tokens_seen": 27980928, + "step": 14291 + }, + { + "epoch": 1.8942345924453279, + "grad_norm": 6.061550140380859, + "learning_rate": 1.4981378560960391e-06, + "loss": 0.2094, + "num_input_tokens_seen": 27982896, + "step": 14292 + }, + { + "epoch": 1.8943671305500331, + "grad_norm": 0.028597114607691765, + "learning_rate": 1.4978198221290162e-06, + "loss": 0.0002, + "num_input_tokens_seen": 27984288, + "step": 14293 + }, + { + "epoch": 1.8944996686547384, + "grad_norm": 0.02162368781864643, + "learning_rate": 1.4975018074847192e-06, + "loss": 0.0001, + "num_input_tokens_seen": 27985624, + "step": 14294 + }, + { + "epoch": 1.8946322067594434, + "grad_norm": 7.240030288696289, + "learning_rate": 1.4971838121692787e-06, + "loss": 0.0964, + "num_input_tokens_seen": 27987408, + "step": 14295 + }, + { + "epoch": 1.8947647448641485, + "grad_norm": 3.035247802734375, + "learning_rate": 1.4968658361888263e-06, + "loss": 0.0395, + "num_input_tokens_seen": 27988832, + "step": 14296 + }, + { + "epoch": 1.8948972829688535, + "grad_norm": 0.40507200360298157, + "learning_rate": 1.4965478795494918e-06, + "loss": 0.0033, + "num_input_tokens_seen": 27990640, + "step": 14297 + }, + { + "epoch": 1.8950298210735586, + "grad_norm": 0.08990246802568436, + "learning_rate": 1.4962299422574073e-06, + "loss": 0.0011, + "num_input_tokens_seen": 27992480, + "step": 14298 + }, + { + "epoch": 1.8951623591782636, + "grad_norm": 5.55600643157959, + "learning_rate": 1.4959120243187014e-06, + "loss": 0.1198, + "num_input_tokens_seen": 27994440, + "step": 14299 + }, + { + "epoch": 1.8952948972829688, + "grad_norm": 0.4813433289527893, + "learning_rate": 1.4955941257395052e-06, + "loss": 0.0026, + "num_input_tokens_seen": 27995992, + "step": 14300 + }, + { + "epoch": 1.895427435387674, + "grad_norm": 7.863996505737305, + "learning_rate": 1.4952762465259462e-06, + "loss": 0.2081, + "num_input_tokens_seen": 27998032, + "step": 14301 + }, + { + "epoch": 1.8955599734923791, + "grad_norm": 6.6828460693359375, + "learning_rate": 1.494958386684154e-06, + "loss": 0.1, + "num_input_tokens_seen": 27999768, + "step": 14302 + }, + { + "epoch": 1.8956925115970842, + "grad_norm": 0.056097324937582016, + "learning_rate": 1.4946405462202584e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28001960, + "step": 14303 + }, + { + "epoch": 1.8958250497017892, + "grad_norm": 2.3371753692626953, + "learning_rate": 1.4943227251403864e-06, + "loss": 0.0228, + "num_input_tokens_seen": 28003872, + "step": 14304 + }, + { + "epoch": 1.8959575878064943, + "grad_norm": 9.442514419555664, + "learning_rate": 1.494004923450666e-06, + "loss": 0.0682, + "num_input_tokens_seen": 28006552, + "step": 14305 + }, + { + "epoch": 1.8960901259111995, + "grad_norm": 0.09930270910263062, + "learning_rate": 1.4936871411572252e-06, + "loss": 0.0007, + "num_input_tokens_seen": 28008416, + "step": 14306 + }, + { + "epoch": 1.8962226640159046, + "grad_norm": 5.885056495666504, + "learning_rate": 1.493369378266189e-06, + "loss": 0.1503, + "num_input_tokens_seen": 28010272, + "step": 14307 + }, + { + "epoch": 1.8963552021206098, + "grad_norm": 8.787714958190918, + "learning_rate": 1.4930516347836866e-06, + "loss": 0.0635, + "num_input_tokens_seen": 28012504, + "step": 14308 + }, + { + "epoch": 1.8964877402253149, + "grad_norm": 0.06251006573438644, + "learning_rate": 1.4927339107158437e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28013752, + "step": 14309 + }, + { + "epoch": 1.89662027833002, + "grad_norm": 13.839811325073242, + "learning_rate": 1.4924162060687852e-06, + "loss": 0.1318, + "num_input_tokens_seen": 28015640, + "step": 14310 + }, + { + "epoch": 1.896752816434725, + "grad_norm": 8.636619567871094, + "learning_rate": 1.4920985208486383e-06, + "loss": 0.2112, + "num_input_tokens_seen": 28017792, + "step": 14311 + }, + { + "epoch": 1.89688535453943, + "grad_norm": 0.3030971586704254, + "learning_rate": 1.4917808550615259e-06, + "loss": 0.002, + "num_input_tokens_seen": 28019632, + "step": 14312 + }, + { + "epoch": 1.8970178926441352, + "grad_norm": 7.821743488311768, + "learning_rate": 1.4914632087135746e-06, + "loss": 0.1602, + "num_input_tokens_seen": 28021112, + "step": 14313 + }, + { + "epoch": 1.8971504307488403, + "grad_norm": 4.045699596405029, + "learning_rate": 1.4911455818109088e-06, + "loss": 0.0127, + "num_input_tokens_seen": 28023192, + "step": 14314 + }, + { + "epoch": 1.8972829688535455, + "grad_norm": 0.016956863924860954, + "learning_rate": 1.4908279743596528e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28024776, + "step": 14315 + }, + { + "epoch": 1.8974155069582506, + "grad_norm": 12.531383514404297, + "learning_rate": 1.4905103863659283e-06, + "loss": 0.2924, + "num_input_tokens_seen": 28026712, + "step": 14316 + }, + { + "epoch": 1.8975480450629556, + "grad_norm": 0.20813694596290588, + "learning_rate": 1.490192817835861e-06, + "loss": 0.0008, + "num_input_tokens_seen": 28028040, + "step": 14317 + }, + { + "epoch": 1.8976805831676606, + "grad_norm": 0.11262309551239014, + "learning_rate": 1.489875268775572e-06, + "loss": 0.0008, + "num_input_tokens_seen": 28029880, + "step": 14318 + }, + { + "epoch": 1.8978131212723657, + "grad_norm": 14.015881538391113, + "learning_rate": 1.4895577391911845e-06, + "loss": 0.2578, + "num_input_tokens_seen": 28031792, + "step": 14319 + }, + { + "epoch": 1.897945659377071, + "grad_norm": 6.26746940612793, + "learning_rate": 1.4892402290888216e-06, + "loss": 0.1009, + "num_input_tokens_seen": 28033560, + "step": 14320 + }, + { + "epoch": 1.898078197481776, + "grad_norm": 2.322051525115967, + "learning_rate": 1.4889227384746046e-06, + "loss": 0.0801, + "num_input_tokens_seen": 28035584, + "step": 14321 + }, + { + "epoch": 1.8982107355864812, + "grad_norm": 17.970823287963867, + "learning_rate": 1.4886052673546541e-06, + "loss": 0.7695, + "num_input_tokens_seen": 28038384, + "step": 14322 + }, + { + "epoch": 1.8983432736911863, + "grad_norm": 11.251123428344727, + "learning_rate": 1.4882878157350911e-06, + "loss": 0.2448, + "num_input_tokens_seen": 28041272, + "step": 14323 + }, + { + "epoch": 1.8984758117958913, + "grad_norm": 5.492765426635742, + "learning_rate": 1.4879703836220384e-06, + "loss": 0.1414, + "num_input_tokens_seen": 28043456, + "step": 14324 + }, + { + "epoch": 1.8986083499005963, + "grad_norm": 2.0926756858825684, + "learning_rate": 1.4876529710216137e-06, + "loss": 0.032, + "num_input_tokens_seen": 28045240, + "step": 14325 + }, + { + "epoch": 1.8987408880053014, + "grad_norm": 8.932882308959961, + "learning_rate": 1.487335577939939e-06, + "loss": 0.1369, + "num_input_tokens_seen": 28047104, + "step": 14326 + }, + { + "epoch": 1.8988734261100066, + "grad_norm": 0.20662327110767365, + "learning_rate": 1.4870182043831324e-06, + "loss": 0.0012, + "num_input_tokens_seen": 28048776, + "step": 14327 + }, + { + "epoch": 1.8990059642147117, + "grad_norm": 7.576310634613037, + "learning_rate": 1.4867008503573132e-06, + "loss": 0.1316, + "num_input_tokens_seen": 28050712, + "step": 14328 + }, + { + "epoch": 1.899138502319417, + "grad_norm": 0.0881069079041481, + "learning_rate": 1.4863835158686017e-06, + "loss": 0.0006, + "num_input_tokens_seen": 28052496, + "step": 14329 + }, + { + "epoch": 1.899271040424122, + "grad_norm": 0.13089138269424438, + "learning_rate": 1.486066200923115e-06, + "loss": 0.0006, + "num_input_tokens_seen": 28055888, + "step": 14330 + }, + { + "epoch": 1.899403578528827, + "grad_norm": 0.008391792885959148, + "learning_rate": 1.485748905526971e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28057192, + "step": 14331 + }, + { + "epoch": 1.899536116633532, + "grad_norm": 0.11081094294786453, + "learning_rate": 1.4854316296862886e-06, + "loss": 0.0008, + "num_input_tokens_seen": 28059696, + "step": 14332 + }, + { + "epoch": 1.899668654738237, + "grad_norm": 12.982908248901367, + "learning_rate": 1.485114373407183e-06, + "loss": 0.2887, + "num_input_tokens_seen": 28061152, + "step": 14333 + }, + { + "epoch": 1.8998011928429424, + "grad_norm": 10.592294692993164, + "learning_rate": 1.4847971366957731e-06, + "loss": 0.137, + "num_input_tokens_seen": 28063424, + "step": 14334 + }, + { + "epoch": 1.8999337309476476, + "grad_norm": 13.612466812133789, + "learning_rate": 1.4844799195581748e-06, + "loss": 0.229, + "num_input_tokens_seen": 28066184, + "step": 14335 + }, + { + "epoch": 1.9000662690523527, + "grad_norm": 6.599224090576172, + "learning_rate": 1.484162722000505e-06, + "loss": 0.0817, + "num_input_tokens_seen": 28067912, + "step": 14336 + }, + { + "epoch": 1.9001988071570577, + "grad_norm": 7.283559322357178, + "learning_rate": 1.4838455440288776e-06, + "loss": 0.0855, + "num_input_tokens_seen": 28070736, + "step": 14337 + }, + { + "epoch": 1.9003313452617627, + "grad_norm": 3.0039098262786865, + "learning_rate": 1.4835283856494087e-06, + "loss": 0.0327, + "num_input_tokens_seen": 28072152, + "step": 14338 + }, + { + "epoch": 1.9004638833664678, + "grad_norm": 0.7411717176437378, + "learning_rate": 1.483211246868215e-06, + "loss": 0.0047, + "num_input_tokens_seen": 28074344, + "step": 14339 + }, + { + "epoch": 1.9005964214711728, + "grad_norm": 0.8947293162345886, + "learning_rate": 1.4828941276914095e-06, + "loss": 0.0052, + "num_input_tokens_seen": 28075616, + "step": 14340 + }, + { + "epoch": 1.900728959575878, + "grad_norm": 0.17384140193462372, + "learning_rate": 1.4825770281251067e-06, + "loss": 0.0012, + "num_input_tokens_seen": 28077608, + "step": 14341 + }, + { + "epoch": 1.9008614976805833, + "grad_norm": 2.4023985862731934, + "learning_rate": 1.4822599481754207e-06, + "loss": 0.043, + "num_input_tokens_seen": 28079160, + "step": 14342 + }, + { + "epoch": 1.9009940357852884, + "grad_norm": 0.0192425400018692, + "learning_rate": 1.4819428878484646e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28080592, + "step": 14343 + }, + { + "epoch": 1.9011265738899934, + "grad_norm": 6.405759811401367, + "learning_rate": 1.4816258471503528e-06, + "loss": 0.178, + "num_input_tokens_seen": 28082840, + "step": 14344 + }, + { + "epoch": 1.9012591119946984, + "grad_norm": 7.292095184326172, + "learning_rate": 1.4813088260871971e-06, + "loss": 0.2574, + "num_input_tokens_seen": 28085552, + "step": 14345 + }, + { + "epoch": 1.9013916500994035, + "grad_norm": 0.047887712717056274, + "learning_rate": 1.4809918246651094e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28087632, + "step": 14346 + }, + { + "epoch": 1.9015241882041087, + "grad_norm": 1.4407761096954346, + "learning_rate": 1.4806748428902031e-06, + "loss": 0.007, + "num_input_tokens_seen": 28089304, + "step": 14347 + }, + { + "epoch": 1.9016567263088138, + "grad_norm": 8.873480796813965, + "learning_rate": 1.4803578807685887e-06, + "loss": 0.1905, + "num_input_tokens_seen": 28091848, + "step": 14348 + }, + { + "epoch": 1.901789264413519, + "grad_norm": 6.293337345123291, + "learning_rate": 1.4800409383063768e-06, + "loss": 0.0805, + "num_input_tokens_seen": 28093408, + "step": 14349 + }, + { + "epoch": 1.901921802518224, + "grad_norm": 10.04287052154541, + "learning_rate": 1.4797240155096804e-06, + "loss": 0.1499, + "num_input_tokens_seen": 28095336, + "step": 14350 + }, + { + "epoch": 1.902054340622929, + "grad_norm": 9.148497581481934, + "learning_rate": 1.4794071123846088e-06, + "loss": 0.2759, + "num_input_tokens_seen": 28097592, + "step": 14351 + }, + { + "epoch": 1.9021868787276341, + "grad_norm": 4.605007648468018, + "learning_rate": 1.4790902289372717e-06, + "loss": 0.0422, + "num_input_tokens_seen": 28099000, + "step": 14352 + }, + { + "epoch": 1.9023194168323392, + "grad_norm": 2.3926753997802734, + "learning_rate": 1.4787733651737798e-06, + "loss": 0.0389, + "num_input_tokens_seen": 28100640, + "step": 14353 + }, + { + "epoch": 1.9024519549370444, + "grad_norm": 4.708327770233154, + "learning_rate": 1.4784565211002411e-06, + "loss": 0.1527, + "num_input_tokens_seen": 28102488, + "step": 14354 + }, + { + "epoch": 1.9025844930417495, + "grad_norm": 5.654538631439209, + "learning_rate": 1.4781396967227657e-06, + "loss": 0.0778, + "num_input_tokens_seen": 28104552, + "step": 14355 + }, + { + "epoch": 1.9027170311464547, + "grad_norm": 3.59625244140625, + "learning_rate": 1.477822892047462e-06, + "loss": 0.0229, + "num_input_tokens_seen": 28105848, + "step": 14356 + }, + { + "epoch": 1.9028495692511598, + "grad_norm": 11.639248847961426, + "learning_rate": 1.4775061070804381e-06, + "loss": 0.2296, + "num_input_tokens_seen": 28107576, + "step": 14357 + }, + { + "epoch": 1.9029821073558648, + "grad_norm": 0.9375972151756287, + "learning_rate": 1.4771893418278021e-06, + "loss": 0.0052, + "num_input_tokens_seen": 28109224, + "step": 14358 + }, + { + "epoch": 1.9031146454605699, + "grad_norm": 7.5576934814453125, + "learning_rate": 1.4768725962956604e-06, + "loss": 0.1128, + "num_input_tokens_seen": 28111720, + "step": 14359 + }, + { + "epoch": 1.903247183565275, + "grad_norm": 0.03962313383817673, + "learning_rate": 1.4765558704901216e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28113136, + "step": 14360 + }, + { + "epoch": 1.9033797216699802, + "grad_norm": 12.488739013671875, + "learning_rate": 1.476239164417292e-06, + "loss": 0.3789, + "num_input_tokens_seen": 28115472, + "step": 14361 + }, + { + "epoch": 1.9035122597746852, + "grad_norm": 10.276759147644043, + "learning_rate": 1.475922478083277e-06, + "loss": 0.1178, + "num_input_tokens_seen": 28117432, + "step": 14362 + }, + { + "epoch": 1.9036447978793904, + "grad_norm": 5.166616916656494, + "learning_rate": 1.4756058114941829e-06, + "loss": 0.1, + "num_input_tokens_seen": 28119168, + "step": 14363 + }, + { + "epoch": 1.9037773359840955, + "grad_norm": 8.708499908447266, + "learning_rate": 1.475289164656115e-06, + "loss": 0.11, + "num_input_tokens_seen": 28121496, + "step": 14364 + }, + { + "epoch": 1.9039098740888005, + "grad_norm": 2.347527265548706, + "learning_rate": 1.47497253757518e-06, + "loss": 0.0266, + "num_input_tokens_seen": 28123840, + "step": 14365 + }, + { + "epoch": 1.9040424121935056, + "grad_norm": 3.3438308238983154, + "learning_rate": 1.4746559302574813e-06, + "loss": 0.0506, + "num_input_tokens_seen": 28125752, + "step": 14366 + }, + { + "epoch": 1.9041749502982106, + "grad_norm": 0.06118813902139664, + "learning_rate": 1.4743393427091241e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28127592, + "step": 14367 + }, + { + "epoch": 1.9043074884029159, + "grad_norm": 5.182222366333008, + "learning_rate": 1.4740227749362116e-06, + "loss": 0.0208, + "num_input_tokens_seen": 28129648, + "step": 14368 + }, + { + "epoch": 1.904440026507621, + "grad_norm": 0.24628907442092896, + "learning_rate": 1.4737062269448467e-06, + "loss": 0.0029, + "num_input_tokens_seen": 28131216, + "step": 14369 + }, + { + "epoch": 1.9045725646123262, + "grad_norm": 0.0353533960878849, + "learning_rate": 1.4733896987411344e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28133968, + "step": 14370 + }, + { + "epoch": 1.9047051027170312, + "grad_norm": 15.884420394897461, + "learning_rate": 1.4730731903311777e-06, + "loss": 0.2577, + "num_input_tokens_seen": 28135648, + "step": 14371 + }, + { + "epoch": 1.9048376408217362, + "grad_norm": 2.725804090499878, + "learning_rate": 1.4727567017210775e-06, + "loss": 0.0388, + "num_input_tokens_seen": 28137824, + "step": 14372 + }, + { + "epoch": 1.9049701789264413, + "grad_norm": 0.030706198886036873, + "learning_rate": 1.4724402329169374e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28139976, + "step": 14373 + }, + { + "epoch": 1.9051027170311463, + "grad_norm": 3.243273973464966, + "learning_rate": 1.4721237839248586e-06, + "loss": 0.0252, + "num_input_tokens_seen": 28141480, + "step": 14374 + }, + { + "epoch": 1.9052352551358516, + "grad_norm": 5.909368515014648, + "learning_rate": 1.471807354750941e-06, + "loss": 0.1111, + "num_input_tokens_seen": 28143864, + "step": 14375 + }, + { + "epoch": 1.9053677932405566, + "grad_norm": 0.021473951637744904, + "learning_rate": 1.4714909454012877e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28145512, + "step": 14376 + }, + { + "epoch": 1.9055003313452619, + "grad_norm": 13.313080787658691, + "learning_rate": 1.471174555881999e-06, + "loss": 0.1762, + "num_input_tokens_seen": 28147960, + "step": 14377 + }, + { + "epoch": 1.905632869449967, + "grad_norm": 0.03659152612090111, + "learning_rate": 1.4708581861991738e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28149416, + "step": 14378 + }, + { + "epoch": 1.905765407554672, + "grad_norm": 0.6014538407325745, + "learning_rate": 1.4705418363589136e-06, + "loss": 0.0063, + "num_input_tokens_seen": 28151248, + "step": 14379 + }, + { + "epoch": 1.905897945659377, + "grad_norm": 7.331469535827637, + "learning_rate": 1.470225506367316e-06, + "loss": 0.1582, + "num_input_tokens_seen": 28152584, + "step": 14380 + }, + { + "epoch": 1.906030483764082, + "grad_norm": 6.527358531951904, + "learning_rate": 1.4699091962304813e-06, + "loss": 0.0996, + "num_input_tokens_seen": 28154832, + "step": 14381 + }, + { + "epoch": 1.9061630218687873, + "grad_norm": 4.263115406036377, + "learning_rate": 1.4695929059545084e-06, + "loss": 0.0608, + "num_input_tokens_seen": 28156968, + "step": 14382 + }, + { + "epoch": 1.9062955599734925, + "grad_norm": 0.8157362937927246, + "learning_rate": 1.4692766355454953e-06, + "loss": 0.0062, + "num_input_tokens_seen": 28158504, + "step": 14383 + }, + { + "epoch": 1.9064280980781976, + "grad_norm": 7.034056186676025, + "learning_rate": 1.468960385009539e-06, + "loss": 0.081, + "num_input_tokens_seen": 28160304, + "step": 14384 + }, + { + "epoch": 1.9065606361829026, + "grad_norm": 4.196228981018066, + "learning_rate": 1.4686441543527374e-06, + "loss": 0.0594, + "num_input_tokens_seen": 28161808, + "step": 14385 + }, + { + "epoch": 1.9066931742876077, + "grad_norm": 7.431917190551758, + "learning_rate": 1.4683279435811891e-06, + "loss": 0.1528, + "num_input_tokens_seen": 28164776, + "step": 14386 + }, + { + "epoch": 1.9068257123923127, + "grad_norm": 0.017847903072834015, + "learning_rate": 1.4680117527009893e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28167688, + "step": 14387 + }, + { + "epoch": 1.9069582504970177, + "grad_norm": 14.12850570678711, + "learning_rate": 1.4676955817182353e-06, + "loss": 0.1797, + "num_input_tokens_seen": 28169632, + "step": 14388 + }, + { + "epoch": 1.907090788601723, + "grad_norm": 0.0770014226436615, + "learning_rate": 1.4673794306390227e-06, + "loss": 0.0005, + "num_input_tokens_seen": 28171432, + "step": 14389 + }, + { + "epoch": 1.9072233267064282, + "grad_norm": 1.719918131828308, + "learning_rate": 1.4670632994694456e-06, + "loss": 0.0173, + "num_input_tokens_seen": 28174176, + "step": 14390 + }, + { + "epoch": 1.9073558648111333, + "grad_norm": 11.846678733825684, + "learning_rate": 1.4667471882156014e-06, + "loss": 0.3294, + "num_input_tokens_seen": 28176272, + "step": 14391 + }, + { + "epoch": 1.9074884029158383, + "grad_norm": 16.303253173828125, + "learning_rate": 1.466431096883585e-06, + "loss": 0.3537, + "num_input_tokens_seen": 28178376, + "step": 14392 + }, + { + "epoch": 1.9076209410205434, + "grad_norm": 13.694519996643066, + "learning_rate": 1.4661150254794893e-06, + "loss": 0.3723, + "num_input_tokens_seen": 28180328, + "step": 14393 + }, + { + "epoch": 1.9077534791252484, + "grad_norm": 1.6928051710128784, + "learning_rate": 1.4657989740094098e-06, + "loss": 0.0158, + "num_input_tokens_seen": 28183784, + "step": 14394 + }, + { + "epoch": 1.9078860172299537, + "grad_norm": 3.777658700942993, + "learning_rate": 1.4654829424794382e-06, + "loss": 0.0392, + "num_input_tokens_seen": 28185912, + "step": 14395 + }, + { + "epoch": 1.9080185553346587, + "grad_norm": 7.629262447357178, + "learning_rate": 1.46516693089567e-06, + "loss": 0.1371, + "num_input_tokens_seen": 28188032, + "step": 14396 + }, + { + "epoch": 1.908151093439364, + "grad_norm": 0.03158547356724739, + "learning_rate": 1.4648509392641978e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28189152, + "step": 14397 + }, + { + "epoch": 1.908283631544069, + "grad_norm": 8.083187103271484, + "learning_rate": 1.4645349675911134e-06, + "loss": 0.177, + "num_input_tokens_seen": 28191096, + "step": 14398 + }, + { + "epoch": 1.908416169648774, + "grad_norm": 8.413670539855957, + "learning_rate": 1.4642190158825087e-06, + "loss": 0.113, + "num_input_tokens_seen": 28192832, + "step": 14399 + }, + { + "epoch": 1.908548707753479, + "grad_norm": 2.553278684616089, + "learning_rate": 1.4639030841444755e-06, + "loss": 0.0124, + "num_input_tokens_seen": 28194032, + "step": 14400 + }, + { + "epoch": 1.908681245858184, + "grad_norm": 3.7777161598205566, + "learning_rate": 1.4635871723831063e-06, + "loss": 0.0247, + "num_input_tokens_seen": 28196496, + "step": 14401 + }, + { + "epoch": 1.9088137839628894, + "grad_norm": 7.240584850311279, + "learning_rate": 1.4632712806044912e-06, + "loss": 0.2198, + "num_input_tokens_seen": 28199000, + "step": 14402 + }, + { + "epoch": 1.9089463220675944, + "grad_norm": 4.616669178009033, + "learning_rate": 1.4629554088147216e-06, + "loss": 0.0736, + "num_input_tokens_seen": 28201512, + "step": 14403 + }, + { + "epoch": 1.9090788601722997, + "grad_norm": 0.03577223792672157, + "learning_rate": 1.462639557019887e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28203448, + "step": 14404 + }, + { + "epoch": 1.9092113982770047, + "grad_norm": 0.5985246300697327, + "learning_rate": 1.4623237252260768e-06, + "loss": 0.0045, + "num_input_tokens_seen": 28205032, + "step": 14405 + }, + { + "epoch": 1.9093439363817097, + "grad_norm": 1.786458134651184, + "learning_rate": 1.4620079134393805e-06, + "loss": 0.0104, + "num_input_tokens_seen": 28207592, + "step": 14406 + }, + { + "epoch": 1.9094764744864148, + "grad_norm": 9.66949462890625, + "learning_rate": 1.4616921216658888e-06, + "loss": 0.0831, + "num_input_tokens_seen": 28209416, + "step": 14407 + }, + { + "epoch": 1.9096090125911198, + "grad_norm": 3.496964931488037, + "learning_rate": 1.461376349911689e-06, + "loss": 0.1269, + "num_input_tokens_seen": 28211272, + "step": 14408 + }, + { + "epoch": 1.909741550695825, + "grad_norm": 7.710900783538818, + "learning_rate": 1.4610605981828698e-06, + "loss": 0.0361, + "num_input_tokens_seen": 28212904, + "step": 14409 + }, + { + "epoch": 1.9098740888005301, + "grad_norm": 10.75462532043457, + "learning_rate": 1.460744866485519e-06, + "loss": 0.1921, + "num_input_tokens_seen": 28214664, + "step": 14410 + }, + { + "epoch": 1.9100066269052354, + "grad_norm": 4.333770275115967, + "learning_rate": 1.4604291548257232e-06, + "loss": 0.0662, + "num_input_tokens_seen": 28216712, + "step": 14411 + }, + { + "epoch": 1.9101391650099404, + "grad_norm": 0.3493688702583313, + "learning_rate": 1.4601134632095714e-06, + "loss": 0.0016, + "num_input_tokens_seen": 28218592, + "step": 14412 + }, + { + "epoch": 1.9102717031146454, + "grad_norm": 7.69537353515625, + "learning_rate": 1.4597977916431495e-06, + "loss": 0.2792, + "num_input_tokens_seen": 28220784, + "step": 14413 + }, + { + "epoch": 1.9104042412193505, + "grad_norm": 0.009506812319159508, + "learning_rate": 1.4594821401325435e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28222192, + "step": 14414 + }, + { + "epoch": 1.9105367793240555, + "grad_norm": 0.018124233931303024, + "learning_rate": 1.45916650868384e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28223536, + "step": 14415 + }, + { + "epoch": 1.9106693174287608, + "grad_norm": 8.8605375289917, + "learning_rate": 1.4588508973031232e-06, + "loss": 0.1683, + "num_input_tokens_seen": 28225344, + "step": 14416 + }, + { + "epoch": 1.9108018555334658, + "grad_norm": 0.038766033947467804, + "learning_rate": 1.4585353059964802e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28228392, + "step": 14417 + }, + { + "epoch": 1.910934393638171, + "grad_norm": 3.79392409324646, + "learning_rate": 1.4582197347699945e-06, + "loss": 0.0555, + "num_input_tokens_seen": 28230320, + "step": 14418 + }, + { + "epoch": 1.9110669317428761, + "grad_norm": 0.014220799319446087, + "learning_rate": 1.4579041836297513e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28232240, + "step": 14419 + }, + { + "epoch": 1.9111994698475812, + "grad_norm": 0.041857972741127014, + "learning_rate": 1.457588652581835e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28234104, + "step": 14420 + }, + { + "epoch": 1.9113320079522862, + "grad_norm": 4.487502574920654, + "learning_rate": 1.4572731416323271e-06, + "loss": 0.0583, + "num_input_tokens_seen": 28235904, + "step": 14421 + }, + { + "epoch": 1.9114645460569912, + "grad_norm": 6.965250015258789, + "learning_rate": 1.4569576507873136e-06, + "loss": 0.1566, + "num_input_tokens_seen": 28237464, + "step": 14422 + }, + { + "epoch": 1.9115970841616965, + "grad_norm": 2.7727701663970947, + "learning_rate": 1.4566421800528763e-06, + "loss": 0.0352, + "num_input_tokens_seen": 28239080, + "step": 14423 + }, + { + "epoch": 1.9117296222664018, + "grad_norm": 2.2861809730529785, + "learning_rate": 1.4563267294350963e-06, + "loss": 0.0076, + "num_input_tokens_seen": 28240456, + "step": 14424 + }, + { + "epoch": 1.9118621603711068, + "grad_norm": 4.952845096588135, + "learning_rate": 1.456011298940058e-06, + "loss": 0.1238, + "num_input_tokens_seen": 28242544, + "step": 14425 + }, + { + "epoch": 1.9119946984758118, + "grad_norm": 0.06084465608000755, + "learning_rate": 1.455695888573841e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28243800, + "step": 14426 + }, + { + "epoch": 1.9121272365805169, + "grad_norm": 0.1038113608956337, + "learning_rate": 1.455380498342529e-06, + "loss": 0.0007, + "num_input_tokens_seen": 28245056, + "step": 14427 + }, + { + "epoch": 1.912259774685222, + "grad_norm": 0.410986065864563, + "learning_rate": 1.4550651282522013e-06, + "loss": 0.0026, + "num_input_tokens_seen": 28247008, + "step": 14428 + }, + { + "epoch": 1.912392312789927, + "grad_norm": 4.654015064239502, + "learning_rate": 1.454749778308939e-06, + "loss": 0.0305, + "num_input_tokens_seen": 28249144, + "step": 14429 + }, + { + "epoch": 1.9125248508946322, + "grad_norm": 0.7885858416557312, + "learning_rate": 1.454434448518821e-06, + "loss": 0.004, + "num_input_tokens_seen": 28250840, + "step": 14430 + }, + { + "epoch": 1.9126573889993375, + "grad_norm": 0.9576358795166016, + "learning_rate": 1.454119138887929e-06, + "loss": 0.0113, + "num_input_tokens_seen": 28252736, + "step": 14431 + }, + { + "epoch": 1.9127899271040425, + "grad_norm": 0.18812473118305206, + "learning_rate": 1.4538038494223406e-06, + "loss": 0.0013, + "num_input_tokens_seen": 28254168, + "step": 14432 + }, + { + "epoch": 1.9129224652087475, + "grad_norm": 2.846705913543701, + "learning_rate": 1.4534885801281368e-06, + "loss": 0.0578, + "num_input_tokens_seen": 28255824, + "step": 14433 + }, + { + "epoch": 1.9130550033134526, + "grad_norm": 0.4817339777946472, + "learning_rate": 1.453173331011395e-06, + "loss": 0.0035, + "num_input_tokens_seen": 28257200, + "step": 14434 + }, + { + "epoch": 1.9131875414181576, + "grad_norm": 15.955060005187988, + "learning_rate": 1.4528581020781934e-06, + "loss": 0.3218, + "num_input_tokens_seen": 28258792, + "step": 14435 + }, + { + "epoch": 1.9133200795228629, + "grad_norm": 0.01600560173392296, + "learning_rate": 1.4525428933346095e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28261008, + "step": 14436 + }, + { + "epoch": 1.913452617627568, + "grad_norm": 0.1418578028678894, + "learning_rate": 1.4522277047867208e-06, + "loss": 0.0009, + "num_input_tokens_seen": 28262264, + "step": 14437 + }, + { + "epoch": 1.9135851557322732, + "grad_norm": 0.00897623784840107, + "learning_rate": 1.451912536440606e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28263784, + "step": 14438 + }, + { + "epoch": 1.9137176938369782, + "grad_norm": 8.139069557189941, + "learning_rate": 1.4515973883023404e-06, + "loss": 0.198, + "num_input_tokens_seen": 28265240, + "step": 14439 + }, + { + "epoch": 1.9138502319416832, + "grad_norm": 13.951101303100586, + "learning_rate": 1.4512822603780003e-06, + "loss": 0.2427, + "num_input_tokens_seen": 28266936, + "step": 14440 + }, + { + "epoch": 1.9139827700463883, + "grad_norm": 5.647830486297607, + "learning_rate": 1.4509671526736608e-06, + "loss": 0.0571, + "num_input_tokens_seen": 28268864, + "step": 14441 + }, + { + "epoch": 1.9141153081510933, + "grad_norm": 2.023261308670044, + "learning_rate": 1.4506520651953984e-06, + "loss": 0.0148, + "num_input_tokens_seen": 28270984, + "step": 14442 + }, + { + "epoch": 1.9142478462557986, + "grad_norm": 0.047415055334568024, + "learning_rate": 1.450336997949289e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28272856, + "step": 14443 + }, + { + "epoch": 1.9143803843605036, + "grad_norm": 6.8397440910339355, + "learning_rate": 1.4500219509414065e-06, + "loss": 0.148, + "num_input_tokens_seen": 28274344, + "step": 14444 + }, + { + "epoch": 1.9145129224652089, + "grad_norm": 5.173931121826172, + "learning_rate": 1.449706924177825e-06, + "loss": 0.1555, + "num_input_tokens_seen": 28277216, + "step": 14445 + }, + { + "epoch": 1.914645460569914, + "grad_norm": 4.43999719619751, + "learning_rate": 1.4493919176646187e-06, + "loss": 0.0803, + "num_input_tokens_seen": 28279592, + "step": 14446 + }, + { + "epoch": 1.914777998674619, + "grad_norm": 0.019125768914818764, + "learning_rate": 1.4490769314078596e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28281408, + "step": 14447 + }, + { + "epoch": 1.914910536779324, + "grad_norm": 0.1252020001411438, + "learning_rate": 1.4487619654136225e-06, + "loss": 0.0009, + "num_input_tokens_seen": 28282864, + "step": 14448 + }, + { + "epoch": 1.915043074884029, + "grad_norm": 12.181060791015625, + "learning_rate": 1.4484470196879813e-06, + "loss": 0.181, + "num_input_tokens_seen": 28284616, + "step": 14449 + }, + { + "epoch": 1.9151756129887343, + "grad_norm": 1.028161644935608, + "learning_rate": 1.4481320942370065e-06, + "loss": 0.0237, + "num_input_tokens_seen": 28286152, + "step": 14450 + }, + { + "epoch": 1.9153081510934393, + "grad_norm": 0.042308032512664795, + "learning_rate": 1.4478171890667709e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28287936, + "step": 14451 + }, + { + "epoch": 1.9154406891981446, + "grad_norm": 4.946495532989502, + "learning_rate": 1.4475023041833445e-06, + "loss": 0.0608, + "num_input_tokens_seen": 28290448, + "step": 14452 + }, + { + "epoch": 1.9155732273028496, + "grad_norm": 7.273913383483887, + "learning_rate": 1.447187439592801e-06, + "loss": 0.0919, + "num_input_tokens_seen": 28292752, + "step": 14453 + }, + { + "epoch": 1.9157057654075547, + "grad_norm": 12.199535369873047, + "learning_rate": 1.4468725953012086e-06, + "loss": 0.263, + "num_input_tokens_seen": 28295144, + "step": 14454 + }, + { + "epoch": 1.9158383035122597, + "grad_norm": 0.02083752676844597, + "learning_rate": 1.4465577713146404e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28296528, + "step": 14455 + }, + { + "epoch": 1.9159708416169647, + "grad_norm": 5.584634304046631, + "learning_rate": 1.4462429676391648e-06, + "loss": 0.1056, + "num_input_tokens_seen": 28297968, + "step": 14456 + }, + { + "epoch": 1.91610337972167, + "grad_norm": 9.967475891113281, + "learning_rate": 1.4459281842808509e-06, + "loss": 0.168, + "num_input_tokens_seen": 28299520, + "step": 14457 + }, + { + "epoch": 1.916235917826375, + "grad_norm": 6.392851829528809, + "learning_rate": 1.44561342124577e-06, + "loss": 0.0691, + "num_input_tokens_seen": 28301536, + "step": 14458 + }, + { + "epoch": 1.9163684559310803, + "grad_norm": 0.0661567896604538, + "learning_rate": 1.4452986785399895e-06, + "loss": 0.0005, + "num_input_tokens_seen": 28303600, + "step": 14459 + }, + { + "epoch": 1.9165009940357853, + "grad_norm": 0.017987767234444618, + "learning_rate": 1.4449839561695772e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28305760, + "step": 14460 + }, + { + "epoch": 1.9166335321404904, + "grad_norm": 2.0596981048583984, + "learning_rate": 1.4446692541406026e-06, + "loss": 0.0193, + "num_input_tokens_seen": 28307536, + "step": 14461 + }, + { + "epoch": 1.9167660702451954, + "grad_norm": 4.042823314666748, + "learning_rate": 1.4443545724591334e-06, + "loss": 0.1048, + "num_input_tokens_seen": 28310744, + "step": 14462 + }, + { + "epoch": 1.9168986083499004, + "grad_norm": 0.46401628851890564, + "learning_rate": 1.444039911131235e-06, + "loss": 0.0026, + "num_input_tokens_seen": 28313048, + "step": 14463 + }, + { + "epoch": 1.9170311464546057, + "grad_norm": 6.0290021896362305, + "learning_rate": 1.4437252701629768e-06, + "loss": 0.1615, + "num_input_tokens_seen": 28314656, + "step": 14464 + }, + { + "epoch": 1.917163684559311, + "grad_norm": 13.83427906036377, + "learning_rate": 1.4434106495604228e-06, + "loss": 0.1866, + "num_input_tokens_seen": 28317160, + "step": 14465 + }, + { + "epoch": 1.917296222664016, + "grad_norm": 0.4276355803012848, + "learning_rate": 1.4430960493296415e-06, + "loss": 0.002, + "num_input_tokens_seen": 28318520, + "step": 14466 + }, + { + "epoch": 1.917428760768721, + "grad_norm": 8.597620964050293, + "learning_rate": 1.4427814694766972e-06, + "loss": 0.1137, + "num_input_tokens_seen": 28320352, + "step": 14467 + }, + { + "epoch": 1.917561298873426, + "grad_norm": 0.1282566487789154, + "learning_rate": 1.4424669100076548e-06, + "loss": 0.0008, + "num_input_tokens_seen": 28321768, + "step": 14468 + }, + { + "epoch": 1.9176938369781311, + "grad_norm": 0.11541794240474701, + "learning_rate": 1.4421523709285812e-06, + "loss": 0.0008, + "num_input_tokens_seen": 28323864, + "step": 14469 + }, + { + "epoch": 1.9178263750828362, + "grad_norm": 0.02432304620742798, + "learning_rate": 1.4418378522455396e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28325320, + "step": 14470 + }, + { + "epoch": 1.9179589131875414, + "grad_norm": 0.05120117962360382, + "learning_rate": 1.4415233539645929e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28326944, + "step": 14471 + }, + { + "epoch": 1.9180914512922467, + "grad_norm": 7.1296000480651855, + "learning_rate": 1.441208876091807e-06, + "loss": 0.204, + "num_input_tokens_seen": 28329280, + "step": 14472 + }, + { + "epoch": 1.9182239893969517, + "grad_norm": 0.5529530048370361, + "learning_rate": 1.4408944186332434e-06, + "loss": 0.0035, + "num_input_tokens_seen": 28330808, + "step": 14473 + }, + { + "epoch": 1.9183565275016567, + "grad_norm": 15.381190299987793, + "learning_rate": 1.4405799815949675e-06, + "loss": 0.0981, + "num_input_tokens_seen": 28333024, + "step": 14474 + }, + { + "epoch": 1.9184890656063618, + "grad_norm": 0.797319233417511, + "learning_rate": 1.4402655649830402e-06, + "loss": 0.0064, + "num_input_tokens_seen": 28334656, + "step": 14475 + }, + { + "epoch": 1.9186216037110668, + "grad_norm": 0.040282685309648514, + "learning_rate": 1.439951168803524e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28336960, + "step": 14476 + }, + { + "epoch": 1.918754141815772, + "grad_norm": 5.1428375244140625, + "learning_rate": 1.4396367930624794e-06, + "loss": 0.0402, + "num_input_tokens_seen": 28338560, + "step": 14477 + }, + { + "epoch": 1.9188866799204771, + "grad_norm": 0.1068536713719368, + "learning_rate": 1.4393224377659686e-06, + "loss": 0.0005, + "num_input_tokens_seen": 28340040, + "step": 14478 + }, + { + "epoch": 1.9190192180251824, + "grad_norm": 11.043889999389648, + "learning_rate": 1.4390081029200542e-06, + "loss": 0.1553, + "num_input_tokens_seen": 28341696, + "step": 14479 + }, + { + "epoch": 1.9191517561298874, + "grad_norm": 8.010275840759277, + "learning_rate": 1.4386937885307957e-06, + "loss": 0.106, + "num_input_tokens_seen": 28343592, + "step": 14480 + }, + { + "epoch": 1.9192842942345925, + "grad_norm": 6.7649688720703125, + "learning_rate": 1.4383794946042532e-06, + "loss": 0.0336, + "num_input_tokens_seen": 28345120, + "step": 14481 + }, + { + "epoch": 1.9194168323392975, + "grad_norm": 4.509862899780273, + "learning_rate": 1.4380652211464863e-06, + "loss": 0.0782, + "num_input_tokens_seen": 28347256, + "step": 14482 + }, + { + "epoch": 1.9195493704440025, + "grad_norm": 0.003938871435821056, + "learning_rate": 1.4377509681635532e-06, + "loss": 0.0, + "num_input_tokens_seen": 28348584, + "step": 14483 + }, + { + "epoch": 1.9196819085487078, + "grad_norm": 7.963360786437988, + "learning_rate": 1.4374367356615147e-06, + "loss": 0.1787, + "num_input_tokens_seen": 28352040, + "step": 14484 + }, + { + "epoch": 1.9198144466534128, + "grad_norm": 16.060741424560547, + "learning_rate": 1.4371225236464298e-06, + "loss": 0.1265, + "num_input_tokens_seen": 28353848, + "step": 14485 + }, + { + "epoch": 1.919946984758118, + "grad_norm": 5.6197829246521, + "learning_rate": 1.4368083321243556e-06, + "loss": 0.0287, + "num_input_tokens_seen": 28355648, + "step": 14486 + }, + { + "epoch": 1.9200795228628231, + "grad_norm": 18.31355094909668, + "learning_rate": 1.4364941611013505e-06, + "loss": 0.4455, + "num_input_tokens_seen": 28357760, + "step": 14487 + }, + { + "epoch": 1.9202120609675282, + "grad_norm": 10.761831283569336, + "learning_rate": 1.4361800105834717e-06, + "loss": 0.3233, + "num_input_tokens_seen": 28359992, + "step": 14488 + }, + { + "epoch": 1.9203445990722332, + "grad_norm": 0.015185891650617123, + "learning_rate": 1.435865880576775e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28361888, + "step": 14489 + }, + { + "epoch": 1.9204771371769382, + "grad_norm": 13.749794006347656, + "learning_rate": 1.4355517710873184e-06, + "loss": 0.2082, + "num_input_tokens_seen": 28363392, + "step": 14490 + }, + { + "epoch": 1.9206096752816435, + "grad_norm": 0.10136348009109497, + "learning_rate": 1.4352376821211589e-06, + "loss": 0.0006, + "num_input_tokens_seen": 28366384, + "step": 14491 + }, + { + "epoch": 1.9207422133863485, + "grad_norm": 1.2167774438858032, + "learning_rate": 1.4349236136843514e-06, + "loss": 0.0148, + "num_input_tokens_seen": 28369472, + "step": 14492 + }, + { + "epoch": 1.9208747514910538, + "grad_norm": 6.388339996337891, + "learning_rate": 1.4346095657829512e-06, + "loss": 0.1352, + "num_input_tokens_seen": 28372040, + "step": 14493 + }, + { + "epoch": 1.9210072895957588, + "grad_norm": 11.298988342285156, + "learning_rate": 1.4342955384230128e-06, + "loss": 0.1768, + "num_input_tokens_seen": 28374000, + "step": 14494 + }, + { + "epoch": 1.9211398277004639, + "grad_norm": 13.300827980041504, + "learning_rate": 1.4339815316105915e-06, + "loss": 0.0622, + "num_input_tokens_seen": 28375688, + "step": 14495 + }, + { + "epoch": 1.921272365805169, + "grad_norm": 0.8824145197868347, + "learning_rate": 1.433667545351743e-06, + "loss": 0.0082, + "num_input_tokens_seen": 28377128, + "step": 14496 + }, + { + "epoch": 1.921404903909874, + "grad_norm": 5.568937301635742, + "learning_rate": 1.4333535796525194e-06, + "loss": 0.1287, + "num_input_tokens_seen": 28379520, + "step": 14497 + }, + { + "epoch": 1.9215374420145792, + "grad_norm": 4.481696128845215, + "learning_rate": 1.433039634518975e-06, + "loss": 0.193, + "num_input_tokens_seen": 28381736, + "step": 14498 + }, + { + "epoch": 1.9216699801192842, + "grad_norm": 3.872025966644287, + "learning_rate": 1.4327257099571613e-06, + "loss": 0.0855, + "num_input_tokens_seen": 28383640, + "step": 14499 + }, + { + "epoch": 1.9218025182239895, + "grad_norm": 0.012724610045552254, + "learning_rate": 1.4324118059731333e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28385472, + "step": 14500 + }, + { + "epoch": 1.9219350563286945, + "grad_norm": 0.017046354711055756, + "learning_rate": 1.4320979225729408e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28386776, + "step": 14501 + }, + { + "epoch": 1.9220675944333996, + "grad_norm": 15.89002513885498, + "learning_rate": 1.4317840597626387e-06, + "loss": 0.2919, + "num_input_tokens_seen": 28389416, + "step": 14502 + }, + { + "epoch": 1.9222001325381046, + "grad_norm": 12.773972511291504, + "learning_rate": 1.4314702175482764e-06, + "loss": 0.3049, + "num_input_tokens_seen": 28392144, + "step": 14503 + }, + { + "epoch": 1.9223326706428097, + "grad_norm": 6.142188549041748, + "learning_rate": 1.4311563959359043e-06, + "loss": 0.1264, + "num_input_tokens_seen": 28393408, + "step": 14504 + }, + { + "epoch": 1.922465208747515, + "grad_norm": 0.16367526352405548, + "learning_rate": 1.4308425949315758e-06, + "loss": 0.0009, + "num_input_tokens_seen": 28395808, + "step": 14505 + }, + { + "epoch": 1.9225977468522202, + "grad_norm": 10.55117416381836, + "learning_rate": 1.4305288145413392e-06, + "loss": 0.2586, + "num_input_tokens_seen": 28398720, + "step": 14506 + }, + { + "epoch": 1.9227302849569252, + "grad_norm": 0.3092440366744995, + "learning_rate": 1.430215054771244e-06, + "loss": 0.0021, + "num_input_tokens_seen": 28401272, + "step": 14507 + }, + { + "epoch": 1.9228628230616303, + "grad_norm": 3.0988776683807373, + "learning_rate": 1.4299013156273415e-06, + "loss": 0.0202, + "num_input_tokens_seen": 28402992, + "step": 14508 + }, + { + "epoch": 1.9229953611663353, + "grad_norm": 12.431899070739746, + "learning_rate": 1.429587597115679e-06, + "loss": 0.2637, + "num_input_tokens_seen": 28404952, + "step": 14509 + }, + { + "epoch": 1.9231278992710403, + "grad_norm": 6.113668918609619, + "learning_rate": 1.4292738992423066e-06, + "loss": 0.065, + "num_input_tokens_seen": 28406928, + "step": 14510 + }, + { + "epoch": 1.9232604373757454, + "grad_norm": 8.003411293029785, + "learning_rate": 1.4289602220132725e-06, + "loss": 0.1521, + "num_input_tokens_seen": 28409048, + "step": 14511 + }, + { + "epoch": 1.9233929754804506, + "grad_norm": 0.47147056460380554, + "learning_rate": 1.4286465654346243e-06, + "loss": 0.0016, + "num_input_tokens_seen": 28411096, + "step": 14512 + }, + { + "epoch": 1.9235255135851559, + "grad_norm": 1.3238030672073364, + "learning_rate": 1.4283329295124082e-06, + "loss": 0.0102, + "num_input_tokens_seen": 28412552, + "step": 14513 + }, + { + "epoch": 1.923658051689861, + "grad_norm": 0.018518054857850075, + "learning_rate": 1.4280193142526728e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28413448, + "step": 14514 + }, + { + "epoch": 1.923790589794566, + "grad_norm": 8.533416748046875, + "learning_rate": 1.4277057196614652e-06, + "loss": 0.0843, + "num_input_tokens_seen": 28415400, + "step": 14515 + }, + { + "epoch": 1.923923127899271, + "grad_norm": 5.133886814117432, + "learning_rate": 1.4273921457448309e-06, + "loss": 0.0702, + "num_input_tokens_seen": 28417248, + "step": 14516 + }, + { + "epoch": 1.924055666003976, + "grad_norm": 9.422061920166016, + "learning_rate": 1.4270785925088166e-06, + "loss": 0.1534, + "num_input_tokens_seen": 28419720, + "step": 14517 + }, + { + "epoch": 1.9241882041086813, + "grad_norm": 0.08589878678321838, + "learning_rate": 1.4267650599594656e-06, + "loss": 0.0006, + "num_input_tokens_seen": 28421624, + "step": 14518 + }, + { + "epoch": 1.9243207422133863, + "grad_norm": 12.224078178405762, + "learning_rate": 1.426451548102826e-06, + "loss": 0.3511, + "num_input_tokens_seen": 28424400, + "step": 14519 + }, + { + "epoch": 1.9244532803180916, + "grad_norm": 2.035879373550415, + "learning_rate": 1.4261380569449402e-06, + "loss": 0.0303, + "num_input_tokens_seen": 28426280, + "step": 14520 + }, + { + "epoch": 1.9245858184227966, + "grad_norm": 5.446162223815918, + "learning_rate": 1.425824586491854e-06, + "loss": 0.1369, + "num_input_tokens_seen": 28428648, + "step": 14521 + }, + { + "epoch": 1.9247183565275017, + "grad_norm": 6.415218353271484, + "learning_rate": 1.4255111367496111e-06, + "loss": 0.0557, + "num_input_tokens_seen": 28430432, + "step": 14522 + }, + { + "epoch": 1.9248508946322067, + "grad_norm": 2.677095413208008, + "learning_rate": 1.4251977077242549e-06, + "loss": 0.03, + "num_input_tokens_seen": 28433368, + "step": 14523 + }, + { + "epoch": 1.9249834327369117, + "grad_norm": 0.11434061080217361, + "learning_rate": 1.424884299421827e-06, + "loss": 0.0006, + "num_input_tokens_seen": 28435096, + "step": 14524 + }, + { + "epoch": 1.925115970841617, + "grad_norm": 6.683300971984863, + "learning_rate": 1.424570911848372e-06, + "loss": 0.0791, + "num_input_tokens_seen": 28436568, + "step": 14525 + }, + { + "epoch": 1.925248508946322, + "grad_norm": 0.058350272476673126, + "learning_rate": 1.4242575450099324e-06, + "loss": 0.0004, + "num_input_tokens_seen": 28437888, + "step": 14526 + }, + { + "epoch": 1.9253810470510273, + "grad_norm": 1.7590957880020142, + "learning_rate": 1.4239441989125497e-06, + "loss": 0.0119, + "num_input_tokens_seen": 28439328, + "step": 14527 + }, + { + "epoch": 1.9255135851557323, + "grad_norm": 0.2827964425086975, + "learning_rate": 1.4236308735622648e-06, + "loss": 0.0014, + "num_input_tokens_seen": 28440480, + "step": 14528 + }, + { + "epoch": 1.9256461232604374, + "grad_norm": 0.735896646976471, + "learning_rate": 1.4233175689651193e-06, + "loss": 0.0034, + "num_input_tokens_seen": 28441984, + "step": 14529 + }, + { + "epoch": 1.9257786613651424, + "grad_norm": 8.681852340698242, + "learning_rate": 1.4230042851271531e-06, + "loss": 0.2294, + "num_input_tokens_seen": 28444504, + "step": 14530 + }, + { + "epoch": 1.9259111994698475, + "grad_norm": 0.06583154201507568, + "learning_rate": 1.422691022054407e-06, + "loss": 0.0004, + "num_input_tokens_seen": 28446456, + "step": 14531 + }, + { + "epoch": 1.9260437375745527, + "grad_norm": 10.106581687927246, + "learning_rate": 1.422377779752922e-06, + "loss": 0.417, + "num_input_tokens_seen": 28449344, + "step": 14532 + }, + { + "epoch": 1.9261762756792578, + "grad_norm": 4.655391216278076, + "learning_rate": 1.4220645582287373e-06, + "loss": 0.0279, + "num_input_tokens_seen": 28452056, + "step": 14533 + }, + { + "epoch": 1.926308813783963, + "grad_norm": 7.4563727378845215, + "learning_rate": 1.4217513574878913e-06, + "loss": 0.2362, + "num_input_tokens_seen": 28453816, + "step": 14534 + }, + { + "epoch": 1.926441351888668, + "grad_norm": 6.052512168884277, + "learning_rate": 1.4214381775364223e-06, + "loss": 0.0793, + "num_input_tokens_seen": 28456120, + "step": 14535 + }, + { + "epoch": 1.926573889993373, + "grad_norm": 0.5566394329071045, + "learning_rate": 1.4211250183803698e-06, + "loss": 0.0042, + "num_input_tokens_seen": 28458064, + "step": 14536 + }, + { + "epoch": 1.9267064280980781, + "grad_norm": 8.530388832092285, + "learning_rate": 1.4208118800257703e-06, + "loss": 0.129, + "num_input_tokens_seen": 28461024, + "step": 14537 + }, + { + "epoch": 1.9268389662027832, + "grad_norm": 7.138511657714844, + "learning_rate": 1.4204987624786634e-06, + "loss": 0.1371, + "num_input_tokens_seen": 28462256, + "step": 14538 + }, + { + "epoch": 1.9269715043074884, + "grad_norm": 2.7308220863342285, + "learning_rate": 1.4201856657450853e-06, + "loss": 0.0509, + "num_input_tokens_seen": 28464200, + "step": 14539 + }, + { + "epoch": 1.9271040424121935, + "grad_norm": 11.084391593933105, + "learning_rate": 1.4198725898310712e-06, + "loss": 0.0417, + "num_input_tokens_seen": 28465608, + "step": 14540 + }, + { + "epoch": 1.9272365805168987, + "grad_norm": 6.178929328918457, + "learning_rate": 1.4195595347426593e-06, + "loss": 0.1377, + "num_input_tokens_seen": 28467528, + "step": 14541 + }, + { + "epoch": 1.9273691186216038, + "grad_norm": 8.426530838012695, + "learning_rate": 1.4192465004858856e-06, + "loss": 0.1989, + "num_input_tokens_seen": 28469400, + "step": 14542 + }, + { + "epoch": 1.9275016567263088, + "grad_norm": 0.036309584975242615, + "learning_rate": 1.4189334870667836e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28471080, + "step": 14543 + }, + { + "epoch": 1.9276341948310138, + "grad_norm": 6.903929233551025, + "learning_rate": 1.4186204944913912e-06, + "loss": 0.2244, + "num_input_tokens_seen": 28472616, + "step": 14544 + }, + { + "epoch": 1.9277667329357189, + "grad_norm": 0.0949307307600975, + "learning_rate": 1.418307522765741e-06, + "loss": 0.0007, + "num_input_tokens_seen": 28475080, + "step": 14545 + }, + { + "epoch": 1.9278992710404241, + "grad_norm": 8.07598876953125, + "learning_rate": 1.4179945718958677e-06, + "loss": 0.2139, + "num_input_tokens_seen": 28477320, + "step": 14546 + }, + { + "epoch": 1.9280318091451292, + "grad_norm": 2.8299310207366943, + "learning_rate": 1.417681641887806e-06, + "loss": 0.0325, + "num_input_tokens_seen": 28478616, + "step": 14547 + }, + { + "epoch": 1.9281643472498344, + "grad_norm": 4.273895263671875, + "learning_rate": 1.417368732747588e-06, + "loss": 0.0073, + "num_input_tokens_seen": 28480344, + "step": 14548 + }, + { + "epoch": 1.9282968853545395, + "grad_norm": 8.862760543823242, + "learning_rate": 1.4170558444812488e-06, + "loss": 0.288, + "num_input_tokens_seen": 28483040, + "step": 14549 + }, + { + "epoch": 1.9284294234592445, + "grad_norm": 10.843574523925781, + "learning_rate": 1.41674297709482e-06, + "loss": 0.308, + "num_input_tokens_seen": 28484592, + "step": 14550 + }, + { + "epoch": 1.9285619615639495, + "grad_norm": 2.0836310386657715, + "learning_rate": 1.4164301305943327e-06, + "loss": 0.0151, + "num_input_tokens_seen": 28486000, + "step": 14551 + }, + { + "epoch": 1.9286944996686546, + "grad_norm": 6.719988822937012, + "learning_rate": 1.4161173049858212e-06, + "loss": 0.1317, + "num_input_tokens_seen": 28488416, + "step": 14552 + }, + { + "epoch": 1.9288270377733598, + "grad_norm": 0.10176331549882889, + "learning_rate": 1.4158045002753156e-06, + "loss": 0.0007, + "num_input_tokens_seen": 28490080, + "step": 14553 + }, + { + "epoch": 1.928959575878065, + "grad_norm": 0.10741537809371948, + "learning_rate": 1.4154917164688464e-06, + "loss": 0.0006, + "num_input_tokens_seen": 28491152, + "step": 14554 + }, + { + "epoch": 1.9290921139827701, + "grad_norm": 7.245759963989258, + "learning_rate": 1.4151789535724458e-06, + "loss": 0.1009, + "num_input_tokens_seen": 28493680, + "step": 14555 + }, + { + "epoch": 1.9292246520874752, + "grad_norm": 2.1338467597961426, + "learning_rate": 1.4148662115921425e-06, + "loss": 0.0261, + "num_input_tokens_seen": 28495600, + "step": 14556 + }, + { + "epoch": 1.9293571901921802, + "grad_norm": 5.444281578063965, + "learning_rate": 1.4145534905339683e-06, + "loss": 0.0332, + "num_input_tokens_seen": 28498256, + "step": 14557 + }, + { + "epoch": 1.9294897282968853, + "grad_norm": 0.01628575660288334, + "learning_rate": 1.4142407904039516e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28499712, + "step": 14558 + }, + { + "epoch": 1.9296222664015903, + "grad_norm": 0.2893092632293701, + "learning_rate": 1.4139281112081216e-06, + "loss": 0.0016, + "num_input_tokens_seen": 28501712, + "step": 14559 + }, + { + "epoch": 1.9297548045062956, + "grad_norm": 3.414501190185547, + "learning_rate": 1.4136154529525053e-06, + "loss": 0.0326, + "num_input_tokens_seen": 28503560, + "step": 14560 + }, + { + "epoch": 1.9298873426110008, + "grad_norm": 3.0467071533203125, + "learning_rate": 1.4133028156431333e-06, + "loss": 0.0195, + "num_input_tokens_seen": 28506384, + "step": 14561 + }, + { + "epoch": 1.9300198807157058, + "grad_norm": 2.362006902694702, + "learning_rate": 1.4129901992860335e-06, + "loss": 0.0192, + "num_input_tokens_seen": 28507744, + "step": 14562 + }, + { + "epoch": 1.9301524188204109, + "grad_norm": 2.259338855743408, + "learning_rate": 1.4126776038872319e-06, + "loss": 0.0253, + "num_input_tokens_seen": 28509584, + "step": 14563 + }, + { + "epoch": 1.930284956925116, + "grad_norm": 0.021720731630921364, + "learning_rate": 1.4123650294527568e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28511840, + "step": 14564 + }, + { + "epoch": 1.930417495029821, + "grad_norm": 0.5089496970176697, + "learning_rate": 1.4120524759886343e-06, + "loss": 0.0024, + "num_input_tokens_seen": 28513416, + "step": 14565 + }, + { + "epoch": 1.9305500331345262, + "grad_norm": 6.575616836547852, + "learning_rate": 1.4117399435008895e-06, + "loss": 0.1721, + "num_input_tokens_seen": 28515256, + "step": 14566 + }, + { + "epoch": 1.9306825712392313, + "grad_norm": 13.048566818237305, + "learning_rate": 1.411427431995549e-06, + "loss": 0.3847, + "num_input_tokens_seen": 28517632, + "step": 14567 + }, + { + "epoch": 1.9308151093439365, + "grad_norm": 3.2150468826293945, + "learning_rate": 1.4111149414786401e-06, + "loss": 0.0263, + "num_input_tokens_seen": 28519672, + "step": 14568 + }, + { + "epoch": 1.9309476474486416, + "grad_norm": 4.492439270019531, + "learning_rate": 1.4108024719561857e-06, + "loss": 0.0916, + "num_input_tokens_seen": 28521608, + "step": 14569 + }, + { + "epoch": 1.9310801855533466, + "grad_norm": 12.285988807678223, + "learning_rate": 1.4104900234342118e-06, + "loss": 0.1564, + "num_input_tokens_seen": 28523072, + "step": 14570 + }, + { + "epoch": 1.9312127236580516, + "grad_norm": 0.035410039126873016, + "learning_rate": 1.4101775959187402e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28525472, + "step": 14571 + }, + { + "epoch": 1.9313452617627567, + "grad_norm": 0.3409900963306427, + "learning_rate": 1.4098651894157967e-06, + "loss": 0.0023, + "num_input_tokens_seen": 28527584, + "step": 14572 + }, + { + "epoch": 1.931477799867462, + "grad_norm": 2.896045684814453, + "learning_rate": 1.4095528039314055e-06, + "loss": 0.0978, + "num_input_tokens_seen": 28529592, + "step": 14573 + }, + { + "epoch": 1.931610337972167, + "grad_norm": 6.497830867767334, + "learning_rate": 1.4092404394715884e-06, + "loss": 0.1189, + "num_input_tokens_seen": 28531528, + "step": 14574 + }, + { + "epoch": 1.9317428760768722, + "grad_norm": 12.242908477783203, + "learning_rate": 1.408928096042368e-06, + "loss": 0.2839, + "num_input_tokens_seen": 28533696, + "step": 14575 + }, + { + "epoch": 1.9318754141815773, + "grad_norm": 9.6052885055542, + "learning_rate": 1.4086157736497667e-06, + "loss": 0.183, + "num_input_tokens_seen": 28536056, + "step": 14576 + }, + { + "epoch": 1.9320079522862823, + "grad_norm": 5.231362342834473, + "learning_rate": 1.4083034722998052e-06, + "loss": 0.1055, + "num_input_tokens_seen": 28537856, + "step": 14577 + }, + { + "epoch": 1.9321404903909873, + "grad_norm": 1.3147205114364624, + "learning_rate": 1.4079911919985055e-06, + "loss": 0.0289, + "num_input_tokens_seen": 28539440, + "step": 14578 + }, + { + "epoch": 1.9322730284956924, + "grad_norm": 0.8507856130599976, + "learning_rate": 1.4076789327518903e-06, + "loss": 0.0049, + "num_input_tokens_seen": 28541352, + "step": 14579 + }, + { + "epoch": 1.9324055666003976, + "grad_norm": 0.0316254086792469, + "learning_rate": 1.407366694565979e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28543128, + "step": 14580 + }, + { + "epoch": 1.9325381047051027, + "grad_norm": 11.630775451660156, + "learning_rate": 1.4070544774467914e-06, + "loss": 0.2546, + "num_input_tokens_seen": 28545184, + "step": 14581 + }, + { + "epoch": 1.932670642809808, + "grad_norm": 7.414968490600586, + "learning_rate": 1.4067422814003463e-06, + "loss": 0.1368, + "num_input_tokens_seen": 28547024, + "step": 14582 + }, + { + "epoch": 1.932803180914513, + "grad_norm": 5.031907081604004, + "learning_rate": 1.4064301064326654e-06, + "loss": 0.0291, + "num_input_tokens_seen": 28548816, + "step": 14583 + }, + { + "epoch": 1.932935719019218, + "grad_norm": 1.428748607635498, + "learning_rate": 1.4061179525497653e-06, + "loss": 0.0119, + "num_input_tokens_seen": 28550096, + "step": 14584 + }, + { + "epoch": 1.933068257123923, + "grad_norm": 0.13369989395141602, + "learning_rate": 1.4058058197576671e-06, + "loss": 0.0009, + "num_input_tokens_seen": 28552456, + "step": 14585 + }, + { + "epoch": 1.933200795228628, + "grad_norm": 6.122941493988037, + "learning_rate": 1.4054937080623871e-06, + "loss": 0.1364, + "num_input_tokens_seen": 28553968, + "step": 14586 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.22955013811588287, + "learning_rate": 1.4051816174699428e-06, + "loss": 0.0016, + "num_input_tokens_seen": 28556112, + "step": 14587 + }, + { + "epoch": 1.9334658714380384, + "grad_norm": 1.2341172695159912, + "learning_rate": 1.404869547986353e-06, + "loss": 0.0216, + "num_input_tokens_seen": 28558024, + "step": 14588 + }, + { + "epoch": 1.9335984095427436, + "grad_norm": 0.12363634258508682, + "learning_rate": 1.404557499617634e-06, + "loss": 0.0009, + "num_input_tokens_seen": 28560184, + "step": 14589 + }, + { + "epoch": 1.9337309476474487, + "grad_norm": 5.505316734313965, + "learning_rate": 1.4042454723698006e-06, + "loss": 0.0758, + "num_input_tokens_seen": 28562176, + "step": 14590 + }, + { + "epoch": 1.9338634857521537, + "grad_norm": 7.320895671844482, + "learning_rate": 1.403933466248872e-06, + "loss": 0.0663, + "num_input_tokens_seen": 28564672, + "step": 14591 + }, + { + "epoch": 1.9339960238568588, + "grad_norm": 2.202357053756714, + "learning_rate": 1.4036214812608613e-06, + "loss": 0.0212, + "num_input_tokens_seen": 28566352, + "step": 14592 + }, + { + "epoch": 1.9341285619615638, + "grad_norm": 7.182968616485596, + "learning_rate": 1.4033095174117856e-06, + "loss": 0.2428, + "num_input_tokens_seen": 28568816, + "step": 14593 + }, + { + "epoch": 1.934261100066269, + "grad_norm": 0.03679567575454712, + "learning_rate": 1.4029975747076588e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28570288, + "step": 14594 + }, + { + "epoch": 1.9343936381709743, + "grad_norm": 0.23885899782180786, + "learning_rate": 1.4026856531544957e-06, + "loss": 0.0007, + "num_input_tokens_seen": 28571896, + "step": 14595 + }, + { + "epoch": 1.9345261762756794, + "grad_norm": 4.883234977722168, + "learning_rate": 1.4023737527583092e-06, + "loss": 0.1274, + "num_input_tokens_seen": 28574656, + "step": 14596 + }, + { + "epoch": 1.9346587143803844, + "grad_norm": 0.20378774404525757, + "learning_rate": 1.402061873525114e-06, + "loss": 0.001, + "num_input_tokens_seen": 28576904, + "step": 14597 + }, + { + "epoch": 1.9347912524850894, + "grad_norm": 6.817506313323975, + "learning_rate": 1.4017500154609243e-06, + "loss": 0.1862, + "num_input_tokens_seen": 28579128, + "step": 14598 + }, + { + "epoch": 1.9349237905897945, + "grad_norm": 0.020190421491861343, + "learning_rate": 1.4014381785717517e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28580552, + "step": 14599 + }, + { + "epoch": 1.9350563286944995, + "grad_norm": 7.722602367401123, + "learning_rate": 1.401126362863609e-06, + "loss": 0.0682, + "num_input_tokens_seen": 28582944, + "step": 14600 + }, + { + "epoch": 1.9351888667992048, + "grad_norm": 2.3326830863952637, + "learning_rate": 1.4008145683425072e-06, + "loss": 0.0194, + "num_input_tokens_seen": 28584680, + "step": 14601 + }, + { + "epoch": 1.93532140490391, + "grad_norm": 0.14842797815799713, + "learning_rate": 1.40050279501446e-06, + "loss": 0.0009, + "num_input_tokens_seen": 28586224, + "step": 14602 + }, + { + "epoch": 1.935453943008615, + "grad_norm": 9.247849464416504, + "learning_rate": 1.4001910428854762e-06, + "loss": 0.038, + "num_input_tokens_seen": 28587928, + "step": 14603 + }, + { + "epoch": 1.93558648111332, + "grad_norm": 3.5706992149353027, + "learning_rate": 1.399879311961569e-06, + "loss": 0.0604, + "num_input_tokens_seen": 28590128, + "step": 14604 + }, + { + "epoch": 1.9357190192180251, + "grad_norm": 0.06635161489248276, + "learning_rate": 1.3995676022487472e-06, + "loss": 0.0005, + "num_input_tokens_seen": 28591568, + "step": 14605 + }, + { + "epoch": 1.9358515573227302, + "grad_norm": 6.257473945617676, + "learning_rate": 1.3992559137530215e-06, + "loss": 0.1042, + "num_input_tokens_seen": 28592904, + "step": 14606 + }, + { + "epoch": 1.9359840954274354, + "grad_norm": 10.173773765563965, + "learning_rate": 1.3989442464804e-06, + "loss": 0.0915, + "num_input_tokens_seen": 28595128, + "step": 14607 + }, + { + "epoch": 1.9361166335321405, + "grad_norm": 6.5874786376953125, + "learning_rate": 1.3986326004368934e-06, + "loss": 0.096, + "num_input_tokens_seen": 28596920, + "step": 14608 + }, + { + "epoch": 1.9362491716368457, + "grad_norm": 0.01202325988560915, + "learning_rate": 1.3983209756285105e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28598648, + "step": 14609 + }, + { + "epoch": 1.9363817097415508, + "grad_norm": 7.9543986320495605, + "learning_rate": 1.3980093720612592e-06, + "loss": 0.1664, + "num_input_tokens_seen": 28601216, + "step": 14610 + }, + { + "epoch": 1.9365142478462558, + "grad_norm": 14.929435729980469, + "learning_rate": 1.3976977897411478e-06, + "loss": 0.1642, + "num_input_tokens_seen": 28603568, + "step": 14611 + }, + { + "epoch": 1.9366467859509608, + "grad_norm": 0.027997706085443497, + "learning_rate": 1.3973862286741836e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28605384, + "step": 14612 + }, + { + "epoch": 1.9367793240556659, + "grad_norm": 0.03364469110965729, + "learning_rate": 1.3970746888663722e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28607496, + "step": 14613 + }, + { + "epoch": 1.9369118621603711, + "grad_norm": 0.009779251180589199, + "learning_rate": 1.3967631703237217e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28609008, + "step": 14614 + }, + { + "epoch": 1.9370444002650762, + "grad_norm": 6.381690502166748, + "learning_rate": 1.3964516730522393e-06, + "loss": 0.1407, + "num_input_tokens_seen": 28611136, + "step": 14615 + }, + { + "epoch": 1.9371769383697814, + "grad_norm": 0.03693179041147232, + "learning_rate": 1.39614019705793e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28612552, + "step": 14616 + }, + { + "epoch": 1.9373094764744865, + "grad_norm": 2.7750744819641113, + "learning_rate": 1.3958287423467991e-06, + "loss": 0.0243, + "num_input_tokens_seen": 28614976, + "step": 14617 + }, + { + "epoch": 1.9374420145791915, + "grad_norm": 0.13844090700149536, + "learning_rate": 1.395517308924851e-06, + "loss": 0.0005, + "num_input_tokens_seen": 28616448, + "step": 14618 + }, + { + "epoch": 1.9375745526838966, + "grad_norm": 9.090386390686035, + "learning_rate": 1.3952058967980922e-06, + "loss": 0.1497, + "num_input_tokens_seen": 28618648, + "step": 14619 + }, + { + "epoch": 1.9377070907886016, + "grad_norm": 0.8743656277656555, + "learning_rate": 1.3948945059725247e-06, + "loss": 0.0044, + "num_input_tokens_seen": 28621160, + "step": 14620 + }, + { + "epoch": 1.9378396288933069, + "grad_norm": 5.282532691955566, + "learning_rate": 1.3945831364541545e-06, + "loss": 0.118, + "num_input_tokens_seen": 28623200, + "step": 14621 + }, + { + "epoch": 1.937972166998012, + "grad_norm": 0.013399886898696423, + "learning_rate": 1.3942717882489841e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28624480, + "step": 14622 + }, + { + "epoch": 1.9381047051027172, + "grad_norm": 4.835678577423096, + "learning_rate": 1.3939604613630154e-06, + "loss": 0.1337, + "num_input_tokens_seen": 28626880, + "step": 14623 + }, + { + "epoch": 1.9382372432074222, + "grad_norm": 0.15624113380908966, + "learning_rate": 1.393649155802253e-06, + "loss": 0.0011, + "num_input_tokens_seen": 28628120, + "step": 14624 + }, + { + "epoch": 1.9383697813121272, + "grad_norm": 5.864776134490967, + "learning_rate": 1.3933378715726969e-06, + "loss": 0.1204, + "num_input_tokens_seen": 28630312, + "step": 14625 + }, + { + "epoch": 1.9385023194168323, + "grad_norm": 8.581633567810059, + "learning_rate": 1.3930266086803513e-06, + "loss": 0.1206, + "num_input_tokens_seen": 28632528, + "step": 14626 + }, + { + "epoch": 1.9386348575215373, + "grad_norm": 11.165970802307129, + "learning_rate": 1.3927153671312166e-06, + "loss": 0.0718, + "num_input_tokens_seen": 28634224, + "step": 14627 + }, + { + "epoch": 1.9387673956262426, + "grad_norm": 4.139102458953857, + "learning_rate": 1.3924041469312922e-06, + "loss": 0.0701, + "num_input_tokens_seen": 28635904, + "step": 14628 + }, + { + "epoch": 1.9388999337309476, + "grad_norm": 12.607486724853516, + "learning_rate": 1.3920929480865808e-06, + "loss": 0.1329, + "num_input_tokens_seen": 28637448, + "step": 14629 + }, + { + "epoch": 1.9390324718356529, + "grad_norm": 0.06494516879320145, + "learning_rate": 1.3917817706030816e-06, + "loss": 0.0005, + "num_input_tokens_seen": 28639344, + "step": 14630 + }, + { + "epoch": 1.939165009940358, + "grad_norm": 6.403761863708496, + "learning_rate": 1.391470614486794e-06, + "loss": 0.1125, + "num_input_tokens_seen": 28641000, + "step": 14631 + }, + { + "epoch": 1.939297548045063, + "grad_norm": 5.491398334503174, + "learning_rate": 1.3911594797437184e-06, + "loss": 0.156, + "num_input_tokens_seen": 28643280, + "step": 14632 + }, + { + "epoch": 1.939430086149768, + "grad_norm": 13.304581642150879, + "learning_rate": 1.3908483663798528e-06, + "loss": 0.205, + "num_input_tokens_seen": 28645368, + "step": 14633 + }, + { + "epoch": 1.939562624254473, + "grad_norm": 0.6657033562660217, + "learning_rate": 1.3905372744011946e-06, + "loss": 0.0064, + "num_input_tokens_seen": 28646632, + "step": 14634 + }, + { + "epoch": 1.9396951623591783, + "grad_norm": 12.469066619873047, + "learning_rate": 1.3902262038137449e-06, + "loss": 0.2113, + "num_input_tokens_seen": 28649280, + "step": 14635 + }, + { + "epoch": 1.9398277004638835, + "grad_norm": 9.4768705368042, + "learning_rate": 1.389915154623499e-06, + "loss": 0.1522, + "num_input_tokens_seen": 28650688, + "step": 14636 + }, + { + "epoch": 1.9399602385685886, + "grad_norm": 0.017998971045017242, + "learning_rate": 1.3896041268364539e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28652944, + "step": 14637 + }, + { + "epoch": 1.9400927766732936, + "grad_norm": 4.276162147521973, + "learning_rate": 1.389293120458608e-06, + "loss": 0.0487, + "num_input_tokens_seen": 28654792, + "step": 14638 + }, + { + "epoch": 1.9402253147779986, + "grad_norm": 0.05417890101671219, + "learning_rate": 1.3889821354959565e-06, + "loss": 0.0004, + "num_input_tokens_seen": 28657328, + "step": 14639 + }, + { + "epoch": 1.9403578528827037, + "grad_norm": 0.012029226869344711, + "learning_rate": 1.3886711719544965e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28658552, + "step": 14640 + }, + { + "epoch": 1.9404903909874087, + "grad_norm": 6.007042407989502, + "learning_rate": 1.3883602298402232e-06, + "loss": 0.1798, + "num_input_tokens_seen": 28661368, + "step": 14641 + }, + { + "epoch": 1.940622929092114, + "grad_norm": 3.0820279121398926, + "learning_rate": 1.3880493091591314e-06, + "loss": 0.0209, + "num_input_tokens_seen": 28663264, + "step": 14642 + }, + { + "epoch": 1.9407554671968192, + "grad_norm": 0.015500135719776154, + "learning_rate": 1.3877384099172147e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28664552, + "step": 14643 + }, + { + "epoch": 1.9408880053015243, + "grad_norm": 0.01828496716916561, + "learning_rate": 1.3874275321204694e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28666448, + "step": 14644 + }, + { + "epoch": 1.9410205434062293, + "grad_norm": 2.442807674407959, + "learning_rate": 1.3871166757748893e-06, + "loss": 0.05, + "num_input_tokens_seen": 28668632, + "step": 14645 + }, + { + "epoch": 1.9411530815109344, + "grad_norm": 0.013095048256218433, + "learning_rate": 1.3868058408864676e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28670520, + "step": 14646 + }, + { + "epoch": 1.9412856196156394, + "grad_norm": 4.288139820098877, + "learning_rate": 1.3864950274611974e-06, + "loss": 0.1245, + "num_input_tokens_seen": 28672664, + "step": 14647 + }, + { + "epoch": 1.9414181577203447, + "grad_norm": 4.198489665985107, + "learning_rate": 1.3861842355050712e-06, + "loss": 0.0641, + "num_input_tokens_seen": 28674776, + "step": 14648 + }, + { + "epoch": 1.9415506958250497, + "grad_norm": 5.618628978729248, + "learning_rate": 1.3858734650240802e-06, + "loss": 0.1448, + "num_input_tokens_seen": 28677080, + "step": 14649 + }, + { + "epoch": 1.941683233929755, + "grad_norm": 0.05193592235445976, + "learning_rate": 1.3855627160242175e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28679472, + "step": 14650 + }, + { + "epoch": 1.94181577203446, + "grad_norm": 7.951456069946289, + "learning_rate": 1.3852519885114751e-06, + "loss": 0.2437, + "num_input_tokens_seen": 28682320, + "step": 14651 + }, + { + "epoch": 1.941948310139165, + "grad_norm": 0.0734550878405571, + "learning_rate": 1.3849412824918438e-06, + "loss": 0.0005, + "num_input_tokens_seen": 28684384, + "step": 14652 + }, + { + "epoch": 1.94208084824387, + "grad_norm": 7.789407730102539, + "learning_rate": 1.3846305979713136e-06, + "loss": 0.1154, + "num_input_tokens_seen": 28685984, + "step": 14653 + }, + { + "epoch": 1.942213386348575, + "grad_norm": 2.748812437057495, + "learning_rate": 1.3843199349558738e-06, + "loss": 0.057, + "num_input_tokens_seen": 28688784, + "step": 14654 + }, + { + "epoch": 1.9423459244532804, + "grad_norm": 8.302749633789062, + "learning_rate": 1.384009293451516e-06, + "loss": 0.23, + "num_input_tokens_seen": 28691192, + "step": 14655 + }, + { + "epoch": 1.9424784625579854, + "grad_norm": 0.023133253678679466, + "learning_rate": 1.3836986734642295e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28692840, + "step": 14656 + }, + { + "epoch": 1.9426110006626907, + "grad_norm": 9.725693702697754, + "learning_rate": 1.3833880750000023e-06, + "loss": 0.2032, + "num_input_tokens_seen": 28695208, + "step": 14657 + }, + { + "epoch": 1.9427435387673957, + "grad_norm": 0.8284279704093933, + "learning_rate": 1.3830774980648237e-06, + "loss": 0.011, + "num_input_tokens_seen": 28696320, + "step": 14658 + }, + { + "epoch": 1.9428760768721007, + "grad_norm": 9.413773536682129, + "learning_rate": 1.3827669426646812e-06, + "loss": 0.1615, + "num_input_tokens_seen": 28698312, + "step": 14659 + }, + { + "epoch": 1.9430086149768058, + "grad_norm": 9.086424827575684, + "learning_rate": 1.382456408805562e-06, + "loss": 0.0592, + "num_input_tokens_seen": 28701128, + "step": 14660 + }, + { + "epoch": 1.9431411530815108, + "grad_norm": 3.0442428588867188, + "learning_rate": 1.3821458964934542e-06, + "loss": 0.0153, + "num_input_tokens_seen": 28703192, + "step": 14661 + }, + { + "epoch": 1.943273691186216, + "grad_norm": 7.821380615234375, + "learning_rate": 1.3818354057343459e-06, + "loss": 0.2679, + "num_input_tokens_seen": 28705632, + "step": 14662 + }, + { + "epoch": 1.943406229290921, + "grad_norm": 5.3158793449401855, + "learning_rate": 1.381524936534222e-06, + "loss": 0.0335, + "num_input_tokens_seen": 28707256, + "step": 14663 + }, + { + "epoch": 1.9435387673956264, + "grad_norm": 8.956122398376465, + "learning_rate": 1.381214488899069e-06, + "loss": 0.1111, + "num_input_tokens_seen": 28709672, + "step": 14664 + }, + { + "epoch": 1.9436713055003314, + "grad_norm": 0.029865404590964317, + "learning_rate": 1.3809040628348714e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28710904, + "step": 14665 + }, + { + "epoch": 1.9438038436050364, + "grad_norm": 0.052165061235427856, + "learning_rate": 1.3805936583476164e-06, + "loss": 0.0004, + "num_input_tokens_seen": 28713232, + "step": 14666 + }, + { + "epoch": 1.9439363817097415, + "grad_norm": 8.316707611083984, + "learning_rate": 1.380283275443287e-06, + "loss": 0.2502, + "num_input_tokens_seen": 28715352, + "step": 14667 + }, + { + "epoch": 1.9440689198144465, + "grad_norm": 0.02401457354426384, + "learning_rate": 1.3799729141278701e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28717496, + "step": 14668 + }, + { + "epoch": 1.9442014579191518, + "grad_norm": 11.197239875793457, + "learning_rate": 1.3796625744073471e-06, + "loss": 0.1392, + "num_input_tokens_seen": 28719832, + "step": 14669 + }, + { + "epoch": 1.9443339960238568, + "grad_norm": 2.6308822631835938, + "learning_rate": 1.379352256287702e-06, + "loss": 0.0333, + "num_input_tokens_seen": 28721440, + "step": 14670 + }, + { + "epoch": 1.944466534128562, + "grad_norm": 0.01773657090961933, + "learning_rate": 1.3790419597749198e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28722944, + "step": 14671 + }, + { + "epoch": 1.9445990722332671, + "grad_norm": 2.6789276599884033, + "learning_rate": 1.3787316848749816e-06, + "loss": 0.0292, + "num_input_tokens_seen": 28724200, + "step": 14672 + }, + { + "epoch": 1.9447316103379721, + "grad_norm": 4.435582160949707, + "learning_rate": 1.3784214315938691e-06, + "loss": 0.0155, + "num_input_tokens_seen": 28726272, + "step": 14673 + }, + { + "epoch": 1.9448641484426772, + "grad_norm": 3.9609580039978027, + "learning_rate": 1.3781111999375662e-06, + "loss": 0.0947, + "num_input_tokens_seen": 28728752, + "step": 14674 + }, + { + "epoch": 1.9449966865473822, + "grad_norm": 7.49310827255249, + "learning_rate": 1.377800989912052e-06, + "loss": 0.0435, + "num_input_tokens_seen": 28730496, + "step": 14675 + }, + { + "epoch": 1.9451292246520875, + "grad_norm": 0.042895857244729996, + "learning_rate": 1.3774908015233102e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28732424, + "step": 14676 + }, + { + "epoch": 1.9452617627567925, + "grad_norm": 1.0791136026382446, + "learning_rate": 1.37718063477732e-06, + "loss": 0.0039, + "num_input_tokens_seen": 28734376, + "step": 14677 + }, + { + "epoch": 1.9453943008614978, + "grad_norm": 9.463659286499023, + "learning_rate": 1.3768704896800608e-06, + "loss": 0.1, + "num_input_tokens_seen": 28736528, + "step": 14678 + }, + { + "epoch": 1.9455268389662028, + "grad_norm": 0.14655163884162903, + "learning_rate": 1.3765603662375143e-06, + "loss": 0.0009, + "num_input_tokens_seen": 28738208, + "step": 14679 + }, + { + "epoch": 1.9456593770709079, + "grad_norm": 7.303836345672607, + "learning_rate": 1.376250264455658e-06, + "loss": 0.1555, + "num_input_tokens_seen": 28740544, + "step": 14680 + }, + { + "epoch": 1.945791915175613, + "grad_norm": 0.0055287862196564674, + "learning_rate": 1.3759401843404728e-06, + "loss": 0.0, + "num_input_tokens_seen": 28741696, + "step": 14681 + }, + { + "epoch": 1.945924453280318, + "grad_norm": 0.026806052774190903, + "learning_rate": 1.3756301258979366e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28743088, + "step": 14682 + }, + { + "epoch": 1.9460569913850232, + "grad_norm": 2.64050030708313, + "learning_rate": 1.3753200891340268e-06, + "loss": 0.0314, + "num_input_tokens_seen": 28744848, + "step": 14683 + }, + { + "epoch": 1.9461895294897285, + "grad_norm": 9.417302131652832, + "learning_rate": 1.375010074054721e-06, + "loss": 0.0711, + "num_input_tokens_seen": 28746840, + "step": 14684 + }, + { + "epoch": 1.9463220675944335, + "grad_norm": 0.08746056258678436, + "learning_rate": 1.3747000806659966e-06, + "loss": 0.0006, + "num_input_tokens_seen": 28748728, + "step": 14685 + }, + { + "epoch": 1.9464546056991385, + "grad_norm": 4.36961555480957, + "learning_rate": 1.3743901089738321e-06, + "loss": 0.0159, + "num_input_tokens_seen": 28750760, + "step": 14686 + }, + { + "epoch": 1.9465871438038436, + "grad_norm": 6.71688985824585, + "learning_rate": 1.3740801589842029e-06, + "loss": 0.2345, + "num_input_tokens_seen": 28752752, + "step": 14687 + }, + { + "epoch": 1.9467196819085486, + "grad_norm": 0.05255643650889397, + "learning_rate": 1.373770230703085e-06, + "loss": 0.0004, + "num_input_tokens_seen": 28755400, + "step": 14688 + }, + { + "epoch": 1.9468522200132539, + "grad_norm": 0.06645620614290237, + "learning_rate": 1.3734603241364542e-06, + "loss": 0.0004, + "num_input_tokens_seen": 28758176, + "step": 14689 + }, + { + "epoch": 1.946984758117959, + "grad_norm": 6.391659259796143, + "learning_rate": 1.3731504392902838e-06, + "loss": 0.212, + "num_input_tokens_seen": 28760712, + "step": 14690 + }, + { + "epoch": 1.9471172962226642, + "grad_norm": 4.438530445098877, + "learning_rate": 1.372840576170551e-06, + "loss": 0.1191, + "num_input_tokens_seen": 28762472, + "step": 14691 + }, + { + "epoch": 1.9472498343273692, + "grad_norm": 4.386795997619629, + "learning_rate": 1.37253073478323e-06, + "loss": 0.0904, + "num_input_tokens_seen": 28764632, + "step": 14692 + }, + { + "epoch": 1.9473823724320742, + "grad_norm": 18.760112762451172, + "learning_rate": 1.3722209151342947e-06, + "loss": 0.2118, + "num_input_tokens_seen": 28766648, + "step": 14693 + }, + { + "epoch": 1.9475149105367793, + "grad_norm": 4.268288612365723, + "learning_rate": 1.3719111172297178e-06, + "loss": 0.112, + "num_input_tokens_seen": 28768864, + "step": 14694 + }, + { + "epoch": 1.9476474486414843, + "grad_norm": 18.963247299194336, + "learning_rate": 1.371601341075473e-06, + "loss": 0.1057, + "num_input_tokens_seen": 28770656, + "step": 14695 + }, + { + "epoch": 1.9477799867461896, + "grad_norm": 19.655405044555664, + "learning_rate": 1.3712915866775318e-06, + "loss": 0.1838, + "num_input_tokens_seen": 28772496, + "step": 14696 + }, + { + "epoch": 1.9479125248508946, + "grad_norm": 2.468867063522339, + "learning_rate": 1.3709818540418673e-06, + "loss": 0.0346, + "num_input_tokens_seen": 28774016, + "step": 14697 + }, + { + "epoch": 1.9480450629555999, + "grad_norm": 0.01792384870350361, + "learning_rate": 1.3706721431744527e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28775704, + "step": 14698 + }, + { + "epoch": 1.948177601060305, + "grad_norm": 13.417511940002441, + "learning_rate": 1.3703624540812582e-06, + "loss": 0.1681, + "num_input_tokens_seen": 28777936, + "step": 14699 + }, + { + "epoch": 1.94831013916501, + "grad_norm": 0.07239798456430435, + "learning_rate": 1.3700527867682547e-06, + "loss": 0.0004, + "num_input_tokens_seen": 28779744, + "step": 14700 + }, + { + "epoch": 1.948442677269715, + "grad_norm": 0.047060493379831314, + "learning_rate": 1.3697431412414123e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28781784, + "step": 14701 + }, + { + "epoch": 1.94857521537442, + "grad_norm": 5.032929420471191, + "learning_rate": 1.3694335175067024e-06, + "loss": 0.0318, + "num_input_tokens_seen": 28783944, + "step": 14702 + }, + { + "epoch": 1.9487077534791253, + "grad_norm": 13.099404335021973, + "learning_rate": 1.3691239155700936e-06, + "loss": 0.5596, + "num_input_tokens_seen": 28786248, + "step": 14703 + }, + { + "epoch": 1.9488402915838303, + "grad_norm": 0.6062484979629517, + "learning_rate": 1.3688143354375568e-06, + "loss": 0.0024, + "num_input_tokens_seen": 28787976, + "step": 14704 + }, + { + "epoch": 1.9489728296885356, + "grad_norm": 11.420902252197266, + "learning_rate": 1.36850477711506e-06, + "loss": 0.3039, + "num_input_tokens_seen": 28790520, + "step": 14705 + }, + { + "epoch": 1.9491053677932406, + "grad_norm": 3.5838351249694824, + "learning_rate": 1.3681952406085707e-06, + "loss": 0.044, + "num_input_tokens_seen": 28792464, + "step": 14706 + }, + { + "epoch": 1.9492379058979457, + "grad_norm": 0.1063372939825058, + "learning_rate": 1.3678857259240588e-06, + "loss": 0.0007, + "num_input_tokens_seen": 28795000, + "step": 14707 + }, + { + "epoch": 1.9493704440026507, + "grad_norm": 13.210976600646973, + "learning_rate": 1.3675762330674902e-06, + "loss": 0.243, + "num_input_tokens_seen": 28796944, + "step": 14708 + }, + { + "epoch": 1.9495029821073557, + "grad_norm": 0.0642801821231842, + "learning_rate": 1.3672667620448338e-06, + "loss": 0.0005, + "num_input_tokens_seen": 28798032, + "step": 14709 + }, + { + "epoch": 1.949635520212061, + "grad_norm": 4.017451763153076, + "learning_rate": 1.366957312862056e-06, + "loss": 0.0535, + "num_input_tokens_seen": 28800880, + "step": 14710 + }, + { + "epoch": 1.949768058316766, + "grad_norm": 0.038332462310791016, + "learning_rate": 1.366647885525122e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28803424, + "step": 14711 + }, + { + "epoch": 1.9499005964214713, + "grad_norm": 0.19714200496673584, + "learning_rate": 1.3663384800399992e-06, + "loss": 0.001, + "num_input_tokens_seen": 28806088, + "step": 14712 + }, + { + "epoch": 1.9500331345261763, + "grad_norm": 2.0855164527893066, + "learning_rate": 1.3660290964126527e-06, + "loss": 0.0348, + "num_input_tokens_seen": 28808120, + "step": 14713 + }, + { + "epoch": 1.9501656726308814, + "grad_norm": 6.318480491638184, + "learning_rate": 1.3657197346490463e-06, + "loss": 0.1104, + "num_input_tokens_seen": 28810040, + "step": 14714 + }, + { + "epoch": 1.9502982107355864, + "grad_norm": 0.8638790249824524, + "learning_rate": 1.3654103947551473e-06, + "loss": 0.0056, + "num_input_tokens_seen": 28811488, + "step": 14715 + }, + { + "epoch": 1.9504307488402914, + "grad_norm": 0.014980366453528404, + "learning_rate": 1.3651010767369178e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28813856, + "step": 14716 + }, + { + "epoch": 1.9505632869449967, + "grad_norm": 7.972902297973633, + "learning_rate": 1.364791780600323e-06, + "loss": 0.2391, + "num_input_tokens_seen": 28815328, + "step": 14717 + }, + { + "epoch": 1.9506958250497017, + "grad_norm": 1.5050245523452759, + "learning_rate": 1.3644825063513259e-06, + "loss": 0.0154, + "num_input_tokens_seen": 28817680, + "step": 14718 + }, + { + "epoch": 1.950828363154407, + "grad_norm": 7.361939907073975, + "learning_rate": 1.3641732539958895e-06, + "loss": 0.1417, + "num_input_tokens_seen": 28821288, + "step": 14719 + }, + { + "epoch": 1.950960901259112, + "grad_norm": 7.258730888366699, + "learning_rate": 1.3638640235399753e-06, + "loss": 0.1803, + "num_input_tokens_seen": 28822944, + "step": 14720 + }, + { + "epoch": 1.951093439363817, + "grad_norm": 11.681361198425293, + "learning_rate": 1.3635548149895475e-06, + "loss": 0.1543, + "num_input_tokens_seen": 28825040, + "step": 14721 + }, + { + "epoch": 1.951225977468522, + "grad_norm": 11.65433406829834, + "learning_rate": 1.3632456283505658e-06, + "loss": 0.0755, + "num_input_tokens_seen": 28826760, + "step": 14722 + }, + { + "epoch": 1.9513585155732271, + "grad_norm": 11.703380584716797, + "learning_rate": 1.3629364636289939e-06, + "loss": 0.3211, + "num_input_tokens_seen": 28828416, + "step": 14723 + }, + { + "epoch": 1.9514910536779324, + "grad_norm": 7.617846965789795, + "learning_rate": 1.3626273208307912e-06, + "loss": 0.1217, + "num_input_tokens_seen": 28830176, + "step": 14724 + }, + { + "epoch": 1.9516235917826377, + "grad_norm": 0.021498100832104683, + "learning_rate": 1.3623181999619185e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28831640, + "step": 14725 + }, + { + "epoch": 1.9517561298873427, + "grad_norm": 0.011297020129859447, + "learning_rate": 1.3620091010283342e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28832944, + "step": 14726 + }, + { + "epoch": 1.9518886679920477, + "grad_norm": 0.020409978926181793, + "learning_rate": 1.361700024036e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28834448, + "step": 14727 + }, + { + "epoch": 1.9520212060967528, + "grad_norm": 12.548238754272461, + "learning_rate": 1.3613909689908755e-06, + "loss": 0.1465, + "num_input_tokens_seen": 28836440, + "step": 14728 + }, + { + "epoch": 1.9521537442014578, + "grad_norm": 3.1456000804901123, + "learning_rate": 1.3610819358989186e-06, + "loss": 0.0533, + "num_input_tokens_seen": 28838608, + "step": 14729 + }, + { + "epoch": 1.9522862823061629, + "grad_norm": 8.6223783493042, + "learning_rate": 1.3607729247660877e-06, + "loss": 0.0548, + "num_input_tokens_seen": 28839856, + "step": 14730 + }, + { + "epoch": 1.9524188204108681, + "grad_norm": 5.883285999298096, + "learning_rate": 1.3604639355983395e-06, + "loss": 0.0986, + "num_input_tokens_seen": 28841472, + "step": 14731 + }, + { + "epoch": 1.9525513585155734, + "grad_norm": 3.47570538520813, + "learning_rate": 1.3601549684016332e-06, + "loss": 0.0196, + "num_input_tokens_seen": 28842608, + "step": 14732 + }, + { + "epoch": 1.9526838966202784, + "grad_norm": 0.17962372303009033, + "learning_rate": 1.359846023181926e-06, + "loss": 0.0013, + "num_input_tokens_seen": 28845312, + "step": 14733 + }, + { + "epoch": 1.9528164347249835, + "grad_norm": 9.276968955993652, + "learning_rate": 1.3595370999451745e-06, + "loss": 0.0722, + "num_input_tokens_seen": 28847736, + "step": 14734 + }, + { + "epoch": 1.9529489728296885, + "grad_norm": 6.3852219581604, + "learning_rate": 1.3592281986973343e-06, + "loss": 0.1305, + "num_input_tokens_seen": 28849720, + "step": 14735 + }, + { + "epoch": 1.9530815109343935, + "grad_norm": 1.415165662765503, + "learning_rate": 1.3589193194443612e-06, + "loss": 0.0125, + "num_input_tokens_seen": 28851184, + "step": 14736 + }, + { + "epoch": 1.9532140490390988, + "grad_norm": 3.0497210025787354, + "learning_rate": 1.35861046219221e-06, + "loss": 0.0207, + "num_input_tokens_seen": 28852312, + "step": 14737 + }, + { + "epoch": 1.9533465871438038, + "grad_norm": 5.9520087242126465, + "learning_rate": 1.3583016269468362e-06, + "loss": 0.0929, + "num_input_tokens_seen": 28854000, + "step": 14738 + }, + { + "epoch": 1.953479125248509, + "grad_norm": 0.1340920776128769, + "learning_rate": 1.3579928137141962e-06, + "loss": 0.0009, + "num_input_tokens_seen": 28856472, + "step": 14739 + }, + { + "epoch": 1.9536116633532141, + "grad_norm": 2.7222604751586914, + "learning_rate": 1.357684022500242e-06, + "loss": 0.0634, + "num_input_tokens_seen": 28858360, + "step": 14740 + }, + { + "epoch": 1.9537442014579192, + "grad_norm": 13.631305694580078, + "learning_rate": 1.3573752533109283e-06, + "loss": 0.2023, + "num_input_tokens_seen": 28860544, + "step": 14741 + }, + { + "epoch": 1.9538767395626242, + "grad_norm": 0.012690866366028786, + "learning_rate": 1.3570665061522067e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28862008, + "step": 14742 + }, + { + "epoch": 1.9540092776673292, + "grad_norm": 1.4957959651947021, + "learning_rate": 1.3567577810300325e-06, + "loss": 0.0172, + "num_input_tokens_seen": 28863200, + "step": 14743 + }, + { + "epoch": 1.9541418157720345, + "grad_norm": 5.015166282653809, + "learning_rate": 1.3564490779503558e-06, + "loss": 0.1212, + "num_input_tokens_seen": 28864504, + "step": 14744 + }, + { + "epoch": 1.9542743538767395, + "grad_norm": 9.7816801071167, + "learning_rate": 1.3561403969191311e-06, + "loss": 0.1368, + "num_input_tokens_seen": 28866632, + "step": 14745 + }, + { + "epoch": 1.9544068919814448, + "grad_norm": 14.755289077758789, + "learning_rate": 1.3558317379423083e-06, + "loss": 0.0704, + "num_input_tokens_seen": 28868200, + "step": 14746 + }, + { + "epoch": 1.9545394300861498, + "grad_norm": 0.02982102334499359, + "learning_rate": 1.355523101025839e-06, + "loss": 0.0001, + "num_input_tokens_seen": 28870272, + "step": 14747 + }, + { + "epoch": 1.9546719681908549, + "grad_norm": 0.2428574562072754, + "learning_rate": 1.355214486175673e-06, + "loss": 0.0014, + "num_input_tokens_seen": 28871696, + "step": 14748 + }, + { + "epoch": 1.95480450629556, + "grad_norm": 6.164900302886963, + "learning_rate": 1.3549058933977625e-06, + "loss": 0.2072, + "num_input_tokens_seen": 28873736, + "step": 14749 + }, + { + "epoch": 1.954937044400265, + "grad_norm": 5.2071075439453125, + "learning_rate": 1.3545973226980553e-06, + "loss": 0.0374, + "num_input_tokens_seen": 28875728, + "step": 14750 + }, + { + "epoch": 1.9550695825049702, + "grad_norm": 0.033458635210990906, + "learning_rate": 1.3542887740825025e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28877208, + "step": 14751 + }, + { + "epoch": 1.9552021206096752, + "grad_norm": 7.464413642883301, + "learning_rate": 1.353980247557053e-06, + "loss": 0.1056, + "num_input_tokens_seen": 28878968, + "step": 14752 + }, + { + "epoch": 1.9553346587143805, + "grad_norm": 0.07793564349412918, + "learning_rate": 1.3536717431276535e-06, + "loss": 0.0007, + "num_input_tokens_seen": 28880576, + "step": 14753 + }, + { + "epoch": 1.9554671968190855, + "grad_norm": 8.440590858459473, + "learning_rate": 1.353363260800255e-06, + "loss": 0.1812, + "num_input_tokens_seen": 28882184, + "step": 14754 + }, + { + "epoch": 1.9555997349237906, + "grad_norm": 6.732773780822754, + "learning_rate": 1.3530548005808026e-06, + "loss": 0.0935, + "num_input_tokens_seen": 28883616, + "step": 14755 + }, + { + "epoch": 1.9557322730284956, + "grad_norm": 18.820287704467773, + "learning_rate": 1.352746362475246e-06, + "loss": 0.3558, + "num_input_tokens_seen": 28886080, + "step": 14756 + }, + { + "epoch": 1.9558648111332007, + "grad_norm": 5.306849002838135, + "learning_rate": 1.352437946489531e-06, + "loss": 0.118, + "num_input_tokens_seen": 28887760, + "step": 14757 + }, + { + "epoch": 1.955997349237906, + "grad_norm": 3.3443362712860107, + "learning_rate": 1.352129552629603e-06, + "loss": 0.0818, + "num_input_tokens_seen": 28889480, + "step": 14758 + }, + { + "epoch": 1.956129887342611, + "grad_norm": 3.163560628890991, + "learning_rate": 1.3518211809014098e-06, + "loss": 0.0115, + "num_input_tokens_seen": 28892896, + "step": 14759 + }, + { + "epoch": 1.9562624254473162, + "grad_norm": 2.148686408996582, + "learning_rate": 1.3515128313108966e-06, + "loss": 0.0085, + "num_input_tokens_seen": 28895552, + "step": 14760 + }, + { + "epoch": 1.9563949635520212, + "grad_norm": 1.6802475452423096, + "learning_rate": 1.3512045038640075e-06, + "loss": 0.0133, + "num_input_tokens_seen": 28897432, + "step": 14761 + }, + { + "epoch": 1.9565275016567263, + "grad_norm": 5.437595844268799, + "learning_rate": 1.350896198566689e-06, + "loss": 0.0432, + "num_input_tokens_seen": 28899544, + "step": 14762 + }, + { + "epoch": 1.9566600397614313, + "grad_norm": 2.3035781383514404, + "learning_rate": 1.3505879154248835e-06, + "loss": 0.0299, + "num_input_tokens_seen": 28900832, + "step": 14763 + }, + { + "epoch": 1.9567925778661364, + "grad_norm": 7.638674259185791, + "learning_rate": 1.3502796544445368e-06, + "loss": 0.1946, + "num_input_tokens_seen": 28902944, + "step": 14764 + }, + { + "epoch": 1.9569251159708416, + "grad_norm": 7.515084743499756, + "learning_rate": 1.3499714156315916e-06, + "loss": 0.1615, + "num_input_tokens_seen": 28904680, + "step": 14765 + }, + { + "epoch": 1.9570576540755469, + "grad_norm": 0.0420452281832695, + "learning_rate": 1.3496631989919907e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28906528, + "step": 14766 + }, + { + "epoch": 1.957190192180252, + "grad_norm": 8.105485916137695, + "learning_rate": 1.3493550045316762e-06, + "loss": 0.1913, + "num_input_tokens_seen": 28908128, + "step": 14767 + }, + { + "epoch": 1.957322730284957, + "grad_norm": 8.300603866577148, + "learning_rate": 1.3490468322565908e-06, + "loss": 0.2639, + "num_input_tokens_seen": 28910664, + "step": 14768 + }, + { + "epoch": 1.957455268389662, + "grad_norm": 2.736945390701294, + "learning_rate": 1.3487386821726774e-06, + "loss": 0.0196, + "num_input_tokens_seen": 28912144, + "step": 14769 + }, + { + "epoch": 1.957587806494367, + "grad_norm": 3.5149857997894287, + "learning_rate": 1.3484305542858768e-06, + "loss": 0.0309, + "num_input_tokens_seen": 28913920, + "step": 14770 + }, + { + "epoch": 1.957720344599072, + "grad_norm": 7.921489715576172, + "learning_rate": 1.348122448602129e-06, + "loss": 0.0791, + "num_input_tokens_seen": 28915960, + "step": 14771 + }, + { + "epoch": 1.9578528827037773, + "grad_norm": 0.1012229174375534, + "learning_rate": 1.3478143651273757e-06, + "loss": 0.0007, + "num_input_tokens_seen": 28917984, + "step": 14772 + }, + { + "epoch": 1.9579854208084826, + "grad_norm": 0.15451732277870178, + "learning_rate": 1.3475063038675546e-06, + "loss": 0.001, + "num_input_tokens_seen": 28921048, + "step": 14773 + }, + { + "epoch": 1.9581179589131876, + "grad_norm": 0.0649258941411972, + "learning_rate": 1.3471982648286075e-06, + "loss": 0.0004, + "num_input_tokens_seen": 28922800, + "step": 14774 + }, + { + "epoch": 1.9582504970178927, + "grad_norm": 0.14012020826339722, + "learning_rate": 1.346890248016474e-06, + "loss": 0.0008, + "num_input_tokens_seen": 28924640, + "step": 14775 + }, + { + "epoch": 1.9583830351225977, + "grad_norm": 0.11704191565513611, + "learning_rate": 1.3465822534370921e-06, + "loss": 0.002, + "num_input_tokens_seen": 28925744, + "step": 14776 + }, + { + "epoch": 1.9585155732273027, + "grad_norm": 4.9457292556762695, + "learning_rate": 1.3462742810963997e-06, + "loss": 0.062, + "num_input_tokens_seen": 28927888, + "step": 14777 + }, + { + "epoch": 1.958648111332008, + "grad_norm": 0.006525153294205666, + "learning_rate": 1.345966331000335e-06, + "loss": 0.0, + "num_input_tokens_seen": 28929232, + "step": 14778 + }, + { + "epoch": 1.958780649436713, + "grad_norm": 9.499655723571777, + "learning_rate": 1.3456584031548354e-06, + "loss": 0.1507, + "num_input_tokens_seen": 28931536, + "step": 14779 + }, + { + "epoch": 1.9589131875414183, + "grad_norm": 1.9727414846420288, + "learning_rate": 1.3453504975658375e-06, + "loss": 0.0067, + "num_input_tokens_seen": 28932968, + "step": 14780 + }, + { + "epoch": 1.9590457256461233, + "grad_norm": 5.839777946472168, + "learning_rate": 1.3450426142392796e-06, + "loss": 0.0223, + "num_input_tokens_seen": 28935296, + "step": 14781 + }, + { + "epoch": 1.9591782637508284, + "grad_norm": 0.09975003451108932, + "learning_rate": 1.3447347531810967e-06, + "loss": 0.0006, + "num_input_tokens_seen": 28938112, + "step": 14782 + }, + { + "epoch": 1.9593108018555334, + "grad_norm": 5.660719871520996, + "learning_rate": 1.344426914397225e-06, + "loss": 0.1118, + "num_input_tokens_seen": 28940288, + "step": 14783 + }, + { + "epoch": 1.9594433399602385, + "grad_norm": 0.7562618255615234, + "learning_rate": 1.3441190978935987e-06, + "loss": 0.0064, + "num_input_tokens_seen": 28941848, + "step": 14784 + }, + { + "epoch": 1.9595758780649437, + "grad_norm": 5.044239521026611, + "learning_rate": 1.3438113036761536e-06, + "loss": 0.0834, + "num_input_tokens_seen": 28943928, + "step": 14785 + }, + { + "epoch": 1.9597084161696487, + "grad_norm": 5.52391242980957, + "learning_rate": 1.343503531750825e-06, + "loss": 0.0968, + "num_input_tokens_seen": 28945944, + "step": 14786 + }, + { + "epoch": 1.959840954274354, + "grad_norm": 0.2941533923149109, + "learning_rate": 1.3431957821235457e-06, + "loss": 0.0012, + "num_input_tokens_seen": 28948536, + "step": 14787 + }, + { + "epoch": 1.959973492379059, + "grad_norm": 16.821487426757812, + "learning_rate": 1.3428880548002504e-06, + "loss": 0.3574, + "num_input_tokens_seen": 28950568, + "step": 14788 + }, + { + "epoch": 1.960106030483764, + "grad_norm": 17.752267837524414, + "learning_rate": 1.3425803497868706e-06, + "loss": 0.2871, + "num_input_tokens_seen": 28953712, + "step": 14789 + }, + { + "epoch": 1.9602385685884691, + "grad_norm": 12.329522132873535, + "learning_rate": 1.342272667089341e-06, + "loss": 0.3062, + "num_input_tokens_seen": 28956760, + "step": 14790 + }, + { + "epoch": 1.9603711066931742, + "grad_norm": 3.807431697845459, + "learning_rate": 1.341965006713592e-06, + "loss": 0.0854, + "num_input_tokens_seen": 28958576, + "step": 14791 + }, + { + "epoch": 1.9605036447978794, + "grad_norm": 9.05179500579834, + "learning_rate": 1.3416573686655576e-06, + "loss": 0.2176, + "num_input_tokens_seen": 28959904, + "step": 14792 + }, + { + "epoch": 1.9606361829025845, + "grad_norm": 7.315910339355469, + "learning_rate": 1.3413497529511683e-06, + "loss": 0.0312, + "num_input_tokens_seen": 28960888, + "step": 14793 + }, + { + "epoch": 1.9607687210072897, + "grad_norm": 11.204451560974121, + "learning_rate": 1.341042159576354e-06, + "loss": 0.2346, + "num_input_tokens_seen": 28962776, + "step": 14794 + }, + { + "epoch": 1.9609012591119948, + "grad_norm": 0.03700202703475952, + "learning_rate": 1.3407345885470472e-06, + "loss": 0.0002, + "num_input_tokens_seen": 28964880, + "step": 14795 + }, + { + "epoch": 1.9610337972166998, + "grad_norm": 9.792357444763184, + "learning_rate": 1.3404270398691774e-06, + "loss": 0.1411, + "num_input_tokens_seen": 28966656, + "step": 14796 + }, + { + "epoch": 1.9611663353214048, + "grad_norm": 0.025123370811343193, + "learning_rate": 1.3401195135486733e-06, + "loss": 0.0003, + "num_input_tokens_seen": 28968184, + "step": 14797 + }, + { + "epoch": 1.9612988734261099, + "grad_norm": 19.056276321411133, + "learning_rate": 1.3398120095914663e-06, + "loss": 0.3234, + "num_input_tokens_seen": 28969728, + "step": 14798 + }, + { + "epoch": 1.9614314115308151, + "grad_norm": 6.127427101135254, + "learning_rate": 1.339504528003483e-06, + "loss": 0.0672, + "num_input_tokens_seen": 28972272, + "step": 14799 + }, + { + "epoch": 1.9615639496355202, + "grad_norm": 3.6329784393310547, + "learning_rate": 1.3391970687906536e-06, + "loss": 0.0301, + "num_input_tokens_seen": 28974416, + "step": 14800 + }, + { + "epoch": 1.9616964877402254, + "grad_norm": 13.48880386352539, + "learning_rate": 1.3388896319589059e-06, + "loss": 0.3449, + "num_input_tokens_seen": 28976384, + "step": 14801 + }, + { + "epoch": 1.9618290258449305, + "grad_norm": 2.9296669960021973, + "learning_rate": 1.3385822175141672e-06, + "loss": 0.0217, + "num_input_tokens_seen": 28978552, + "step": 14802 + }, + { + "epoch": 1.9619615639496355, + "grad_norm": 11.347014427185059, + "learning_rate": 1.3382748254623633e-06, + "loss": 0.2705, + "num_input_tokens_seen": 28980192, + "step": 14803 + }, + { + "epoch": 1.9620941020543405, + "grad_norm": 15.219050407409668, + "learning_rate": 1.3379674558094233e-06, + "loss": 0.2193, + "num_input_tokens_seen": 28982120, + "step": 14804 + }, + { + "epoch": 1.9622266401590456, + "grad_norm": 0.1350218802690506, + "learning_rate": 1.3376601085612712e-06, + "loss": 0.0009, + "num_input_tokens_seen": 28984304, + "step": 14805 + }, + { + "epoch": 1.9623591782637508, + "grad_norm": 7.657247543334961, + "learning_rate": 1.3373527837238354e-06, + "loss": 0.0721, + "num_input_tokens_seen": 28985760, + "step": 14806 + }, + { + "epoch": 1.962491716368456, + "grad_norm": 1.1194578409194946, + "learning_rate": 1.3370454813030398e-06, + "loss": 0.0041, + "num_input_tokens_seen": 28987376, + "step": 14807 + }, + { + "epoch": 1.9626242544731611, + "grad_norm": 9.307700157165527, + "learning_rate": 1.3367382013048086e-06, + "loss": 0.3009, + "num_input_tokens_seen": 28989408, + "step": 14808 + }, + { + "epoch": 1.9627567925778662, + "grad_norm": 1.810801386833191, + "learning_rate": 1.3364309437350687e-06, + "loss": 0.0038, + "num_input_tokens_seen": 28990928, + "step": 14809 + }, + { + "epoch": 1.9628893306825712, + "grad_norm": 9.638711929321289, + "learning_rate": 1.3361237085997413e-06, + "loss": 0.0461, + "num_input_tokens_seen": 28993784, + "step": 14810 + }, + { + "epoch": 1.9630218687872762, + "grad_norm": 8.024345397949219, + "learning_rate": 1.3358164959047532e-06, + "loss": 0.2054, + "num_input_tokens_seen": 28996848, + "step": 14811 + }, + { + "epoch": 1.9631544068919813, + "grad_norm": 7.8461432456970215, + "learning_rate": 1.3355093056560261e-06, + "loss": 0.228, + "num_input_tokens_seen": 28998736, + "step": 14812 + }, + { + "epoch": 1.9632869449966865, + "grad_norm": 4.576639175415039, + "learning_rate": 1.3352021378594828e-06, + "loss": 0.0702, + "num_input_tokens_seen": 29001328, + "step": 14813 + }, + { + "epoch": 1.9634194831013918, + "grad_norm": 7.119454860687256, + "learning_rate": 1.334894992521045e-06, + "loss": 0.1664, + "num_input_tokens_seen": 29003232, + "step": 14814 + }, + { + "epoch": 1.9635520212060968, + "grad_norm": 11.186477661132812, + "learning_rate": 1.3345878696466352e-06, + "loss": 0.2555, + "num_input_tokens_seen": 29006120, + "step": 14815 + }, + { + "epoch": 1.9636845593108019, + "grad_norm": 0.7726321220397949, + "learning_rate": 1.3342807692421766e-06, + "loss": 0.0082, + "num_input_tokens_seen": 29008304, + "step": 14816 + }, + { + "epoch": 1.963817097415507, + "grad_norm": 0.3907501697540283, + "learning_rate": 1.333973691313589e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29010816, + "step": 14817 + }, + { + "epoch": 1.963949635520212, + "grad_norm": 0.6814530491828918, + "learning_rate": 1.3336666358667928e-06, + "loss": 0.003, + "num_input_tokens_seen": 29012416, + "step": 14818 + }, + { + "epoch": 1.9640821736249172, + "grad_norm": 4.373108386993408, + "learning_rate": 1.3333596029077085e-06, + "loss": 0.0409, + "num_input_tokens_seen": 29013872, + "step": 14819 + }, + { + "epoch": 1.9642147117296223, + "grad_norm": 13.989361763000488, + "learning_rate": 1.333052592442255e-06, + "loss": 0.2727, + "num_input_tokens_seen": 29015744, + "step": 14820 + }, + { + "epoch": 1.9643472498343275, + "grad_norm": 10.115144729614258, + "learning_rate": 1.3327456044763525e-06, + "loss": 0.1478, + "num_input_tokens_seen": 29017968, + "step": 14821 + }, + { + "epoch": 1.9644797879390326, + "grad_norm": 3.979059934616089, + "learning_rate": 1.3324386390159209e-06, + "loss": 0.0426, + "num_input_tokens_seen": 29019912, + "step": 14822 + }, + { + "epoch": 1.9646123260437376, + "grad_norm": 1.4778681993484497, + "learning_rate": 1.3321316960668778e-06, + "loss": 0.0071, + "num_input_tokens_seen": 29021576, + "step": 14823 + }, + { + "epoch": 1.9647448641484426, + "grad_norm": 0.6960933208465576, + "learning_rate": 1.3318247756351414e-06, + "loss": 0.0095, + "num_input_tokens_seen": 29025288, + "step": 14824 + }, + { + "epoch": 1.9648774022531477, + "grad_norm": 3.7301688194274902, + "learning_rate": 1.331517877726628e-06, + "loss": 0.032, + "num_input_tokens_seen": 29027112, + "step": 14825 + }, + { + "epoch": 1.965009940357853, + "grad_norm": 10.736807823181152, + "learning_rate": 1.3312110023472572e-06, + "loss": 0.2991, + "num_input_tokens_seen": 29029048, + "step": 14826 + }, + { + "epoch": 1.965142478462558, + "grad_norm": 12.247562408447266, + "learning_rate": 1.3309041495029435e-06, + "loss": 0.1796, + "num_input_tokens_seen": 29031320, + "step": 14827 + }, + { + "epoch": 1.9652750165672632, + "grad_norm": 7.985411643981934, + "learning_rate": 1.3305973191996051e-06, + "loss": 0.0956, + "num_input_tokens_seen": 29033432, + "step": 14828 + }, + { + "epoch": 1.9654075546719683, + "grad_norm": 6.477376461029053, + "learning_rate": 1.3302905114431575e-06, + "loss": 0.1322, + "num_input_tokens_seen": 29034968, + "step": 14829 + }, + { + "epoch": 1.9655400927766733, + "grad_norm": 6.349009037017822, + "learning_rate": 1.3299837262395149e-06, + "loss": 0.1587, + "num_input_tokens_seen": 29037440, + "step": 14830 + }, + { + "epoch": 1.9656726308813783, + "grad_norm": 2.4001638889312744, + "learning_rate": 1.3296769635945938e-06, + "loss": 0.0535, + "num_input_tokens_seen": 29038912, + "step": 14831 + }, + { + "epoch": 1.9658051689860834, + "grad_norm": 0.303119421005249, + "learning_rate": 1.3293702235143085e-06, + "loss": 0.0032, + "num_input_tokens_seen": 29040984, + "step": 14832 + }, + { + "epoch": 1.9659377070907886, + "grad_norm": 9.171063423156738, + "learning_rate": 1.3290635060045715e-06, + "loss": 0.188, + "num_input_tokens_seen": 29043144, + "step": 14833 + }, + { + "epoch": 1.9660702451954937, + "grad_norm": 7.708461284637451, + "learning_rate": 1.3287568110712992e-06, + "loss": 0.0982, + "num_input_tokens_seen": 29044872, + "step": 14834 + }, + { + "epoch": 1.966202783300199, + "grad_norm": 0.05881329998373985, + "learning_rate": 1.3284501387204035e-06, + "loss": 0.0004, + "num_input_tokens_seen": 29046184, + "step": 14835 + }, + { + "epoch": 1.966335321404904, + "grad_norm": 8.188450813293457, + "learning_rate": 1.3281434889577965e-06, + "loss": 0.0541, + "num_input_tokens_seen": 29047728, + "step": 14836 + }, + { + "epoch": 1.966467859509609, + "grad_norm": 3.0618462562561035, + "learning_rate": 1.3278368617893921e-06, + "loss": 0.0167, + "num_input_tokens_seen": 29050424, + "step": 14837 + }, + { + "epoch": 1.966600397614314, + "grad_norm": 7.491856575012207, + "learning_rate": 1.3275302572211008e-06, + "loss": 0.2012, + "num_input_tokens_seen": 29053792, + "step": 14838 + }, + { + "epoch": 1.966732935719019, + "grad_norm": 4.789963722229004, + "learning_rate": 1.327223675258836e-06, + "loss": 0.0283, + "num_input_tokens_seen": 29055400, + "step": 14839 + }, + { + "epoch": 1.9668654738237243, + "grad_norm": 6.070686340332031, + "learning_rate": 1.326917115908508e-06, + "loss": 0.1422, + "num_input_tokens_seen": 29057416, + "step": 14840 + }, + { + "epoch": 1.9669980119284294, + "grad_norm": 7.813318252563477, + "learning_rate": 1.3266105791760258e-06, + "loss": 0.105, + "num_input_tokens_seen": 29059456, + "step": 14841 + }, + { + "epoch": 1.9671305500331346, + "grad_norm": 9.205842018127441, + "learning_rate": 1.3263040650673026e-06, + "loss": 0.1905, + "num_input_tokens_seen": 29061224, + "step": 14842 + }, + { + "epoch": 1.9672630881378397, + "grad_norm": 0.06410596519708633, + "learning_rate": 1.3259975735882462e-06, + "loss": 0.0005, + "num_input_tokens_seen": 29063216, + "step": 14843 + }, + { + "epoch": 1.9673956262425447, + "grad_norm": 0.04958916828036308, + "learning_rate": 1.3256911047447662e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29065608, + "step": 14844 + }, + { + "epoch": 1.9675281643472498, + "grad_norm": 1.9039580821990967, + "learning_rate": 1.3253846585427724e-06, + "loss": 0.0269, + "num_input_tokens_seen": 29068024, + "step": 14845 + }, + { + "epoch": 1.9676607024519548, + "grad_norm": 7.274997711181641, + "learning_rate": 1.3250782349881716e-06, + "loss": 0.103, + "num_input_tokens_seen": 29070232, + "step": 14846 + }, + { + "epoch": 1.96779324055666, + "grad_norm": 0.006930314935743809, + "learning_rate": 1.3247718340868744e-06, + "loss": 0.0, + "num_input_tokens_seen": 29071296, + "step": 14847 + }, + { + "epoch": 1.967925778661365, + "grad_norm": 5.472148418426514, + "learning_rate": 1.3244654558447863e-06, + "loss": 0.054, + "num_input_tokens_seen": 29073296, + "step": 14848 + }, + { + "epoch": 1.9680583167660703, + "grad_norm": 8.862009048461914, + "learning_rate": 1.3241591002678155e-06, + "loss": 0.2247, + "num_input_tokens_seen": 29075688, + "step": 14849 + }, + { + "epoch": 1.9681908548707754, + "grad_norm": 0.2510817050933838, + "learning_rate": 1.3238527673618675e-06, + "loss": 0.0015, + "num_input_tokens_seen": 29077016, + "step": 14850 + }, + { + "epoch": 1.9683233929754804, + "grad_norm": 10.646903991699219, + "learning_rate": 1.3235464571328497e-06, + "loss": 0.1412, + "num_input_tokens_seen": 29078744, + "step": 14851 + }, + { + "epoch": 1.9684559310801855, + "grad_norm": 12.21786880493164, + "learning_rate": 1.3232401695866686e-06, + "loss": 0.1384, + "num_input_tokens_seen": 29081712, + "step": 14852 + }, + { + "epoch": 1.9685884691848905, + "grad_norm": 3.819003105163574, + "learning_rate": 1.322933904729229e-06, + "loss": 0.0447, + "num_input_tokens_seen": 29084032, + "step": 14853 + }, + { + "epoch": 1.9687210072895958, + "grad_norm": 4.5528717041015625, + "learning_rate": 1.3226276625664353e-06, + "loss": 0.089, + "num_input_tokens_seen": 29085648, + "step": 14854 + }, + { + "epoch": 1.968853545394301, + "grad_norm": 5.275665760040283, + "learning_rate": 1.3223214431041929e-06, + "loss": 0.0181, + "num_input_tokens_seen": 29087120, + "step": 14855 + }, + { + "epoch": 1.968986083499006, + "grad_norm": 10.391576766967773, + "learning_rate": 1.3220152463484042e-06, + "loss": 0.1076, + "num_input_tokens_seen": 29088736, + "step": 14856 + }, + { + "epoch": 1.969118621603711, + "grad_norm": 0.052978191524744034, + "learning_rate": 1.3217090723049745e-06, + "loss": 0.0004, + "num_input_tokens_seen": 29091064, + "step": 14857 + }, + { + "epoch": 1.9692511597084161, + "grad_norm": 3.019630193710327, + "learning_rate": 1.3214029209798074e-06, + "loss": 0.0401, + "num_input_tokens_seen": 29093032, + "step": 14858 + }, + { + "epoch": 1.9693836978131212, + "grad_norm": 2.8398525714874268, + "learning_rate": 1.3210967923788054e-06, + "loss": 0.0994, + "num_input_tokens_seen": 29094688, + "step": 14859 + }, + { + "epoch": 1.9695162359178264, + "grad_norm": 15.511089324951172, + "learning_rate": 1.3207906865078704e-06, + "loss": 0.2524, + "num_input_tokens_seen": 29096784, + "step": 14860 + }, + { + "epoch": 1.9696487740225315, + "grad_norm": 0.3232240676879883, + "learning_rate": 1.3204846033729036e-06, + "loss": 0.0024, + "num_input_tokens_seen": 29098552, + "step": 14861 + }, + { + "epoch": 1.9697813121272367, + "grad_norm": 6.295639514923096, + "learning_rate": 1.320178542979807e-06, + "loss": 0.0469, + "num_input_tokens_seen": 29101168, + "step": 14862 + }, + { + "epoch": 1.9699138502319418, + "grad_norm": 4.049402713775635, + "learning_rate": 1.3198725053344833e-06, + "loss": 0.0713, + "num_input_tokens_seen": 29103640, + "step": 14863 + }, + { + "epoch": 1.9700463883366468, + "grad_norm": 8.442045211791992, + "learning_rate": 1.3195664904428318e-06, + "loss": 0.0896, + "num_input_tokens_seen": 29104888, + "step": 14864 + }, + { + "epoch": 1.9701789264413518, + "grad_norm": 6.76145076751709, + "learning_rate": 1.3192604983107525e-06, + "loss": 0.1174, + "num_input_tokens_seen": 29106536, + "step": 14865 + }, + { + "epoch": 1.9703114645460569, + "grad_norm": 0.0621190145611763, + "learning_rate": 1.3189545289441457e-06, + "loss": 0.0004, + "num_input_tokens_seen": 29108056, + "step": 14866 + }, + { + "epoch": 1.9704440026507621, + "grad_norm": 0.18670029938220978, + "learning_rate": 1.318648582348909e-06, + "loss": 0.0009, + "num_input_tokens_seen": 29109832, + "step": 14867 + }, + { + "epoch": 1.9705765407554672, + "grad_norm": 9.867266654968262, + "learning_rate": 1.3183426585309428e-06, + "loss": 0.1914, + "num_input_tokens_seen": 29111520, + "step": 14868 + }, + { + "epoch": 1.9707090788601724, + "grad_norm": 3.6712915897369385, + "learning_rate": 1.3180367574961461e-06, + "loss": 0.0461, + "num_input_tokens_seen": 29113688, + "step": 14869 + }, + { + "epoch": 1.9708416169648775, + "grad_norm": 8.011804580688477, + "learning_rate": 1.3177308792504164e-06, + "loss": 0.1234, + "num_input_tokens_seen": 29116272, + "step": 14870 + }, + { + "epoch": 1.9709741550695825, + "grad_norm": 9.179527282714844, + "learning_rate": 1.3174250237996509e-06, + "loss": 0.3132, + "num_input_tokens_seen": 29118408, + "step": 14871 + }, + { + "epoch": 1.9711066931742875, + "grad_norm": 12.198481559753418, + "learning_rate": 1.3171191911497456e-06, + "loss": 0.3657, + "num_input_tokens_seen": 29120560, + "step": 14872 + }, + { + "epoch": 1.9712392312789926, + "grad_norm": 7.397515296936035, + "learning_rate": 1.3168133813065994e-06, + "loss": 0.1178, + "num_input_tokens_seen": 29122136, + "step": 14873 + }, + { + "epoch": 1.9713717693836978, + "grad_norm": 6.7663254737854, + "learning_rate": 1.3165075942761065e-06, + "loss": 0.1387, + "num_input_tokens_seen": 29124096, + "step": 14874 + }, + { + "epoch": 1.9715043074884029, + "grad_norm": 7.305816650390625, + "learning_rate": 1.3162018300641647e-06, + "loss": 0.162, + "num_input_tokens_seen": 29126896, + "step": 14875 + }, + { + "epoch": 1.9716368455931081, + "grad_norm": 0.1084723025560379, + "learning_rate": 1.3158960886766681e-06, + "loss": 0.0007, + "num_input_tokens_seen": 29129072, + "step": 14876 + }, + { + "epoch": 1.9717693836978132, + "grad_norm": 0.490786075592041, + "learning_rate": 1.3155903701195108e-06, + "loss": 0.0028, + "num_input_tokens_seen": 29130328, + "step": 14877 + }, + { + "epoch": 1.9719019218025182, + "grad_norm": 0.09074121713638306, + "learning_rate": 1.3152846743985892e-06, + "loss": 0.0006, + "num_input_tokens_seen": 29132168, + "step": 14878 + }, + { + "epoch": 1.9720344599072233, + "grad_norm": 4.867978096008301, + "learning_rate": 1.3149790015197967e-06, + "loss": 0.118, + "num_input_tokens_seen": 29134792, + "step": 14879 + }, + { + "epoch": 1.9721669980119283, + "grad_norm": 0.21490608155727386, + "learning_rate": 1.3146733514890252e-06, + "loss": 0.0015, + "num_input_tokens_seen": 29136160, + "step": 14880 + }, + { + "epoch": 1.9722995361166336, + "grad_norm": 1.7828940153121948, + "learning_rate": 1.3143677243121705e-06, + "loss": 0.0064, + "num_input_tokens_seen": 29138280, + "step": 14881 + }, + { + "epoch": 1.9724320742213386, + "grad_norm": 0.07908859103918076, + "learning_rate": 1.3140621199951226e-06, + "loss": 0.0005, + "num_input_tokens_seen": 29139656, + "step": 14882 + }, + { + "epoch": 1.9725646123260439, + "grad_norm": 9.952167510986328, + "learning_rate": 1.3137565385437764e-06, + "loss": 0.2031, + "num_input_tokens_seen": 29141464, + "step": 14883 + }, + { + "epoch": 1.972697150430749, + "grad_norm": 0.9749848246574402, + "learning_rate": 1.3134509799640223e-06, + "loss": 0.0089, + "num_input_tokens_seen": 29143280, + "step": 14884 + }, + { + "epoch": 1.972829688535454, + "grad_norm": 0.8748016357421875, + "learning_rate": 1.3131454442617523e-06, + "loss": 0.0075, + "num_input_tokens_seen": 29144672, + "step": 14885 + }, + { + "epoch": 1.972962226640159, + "grad_norm": 0.11748529970645905, + "learning_rate": 1.3128399314428556e-06, + "loss": 0.0009, + "num_input_tokens_seen": 29147152, + "step": 14886 + }, + { + "epoch": 1.973094764744864, + "grad_norm": 3.612283706665039, + "learning_rate": 1.312534441513224e-06, + "loss": 0.049, + "num_input_tokens_seen": 29149008, + "step": 14887 + }, + { + "epoch": 1.9732273028495693, + "grad_norm": 0.13075599074363708, + "learning_rate": 1.3122289744787487e-06, + "loss": 0.0009, + "num_input_tokens_seen": 29152248, + "step": 14888 + }, + { + "epoch": 1.9733598409542743, + "grad_norm": 10.999829292297363, + "learning_rate": 1.311923530345318e-06, + "loss": 0.1947, + "num_input_tokens_seen": 29153776, + "step": 14889 + }, + { + "epoch": 1.9734923790589796, + "grad_norm": 5.921285152435303, + "learning_rate": 1.3116181091188212e-06, + "loss": 0.1024, + "num_input_tokens_seen": 29155592, + "step": 14890 + }, + { + "epoch": 1.9736249171636846, + "grad_norm": 6.906350612640381, + "learning_rate": 1.3113127108051463e-06, + "loss": 0.1521, + "num_input_tokens_seen": 29157848, + "step": 14891 + }, + { + "epoch": 1.9737574552683896, + "grad_norm": 4.8795037269592285, + "learning_rate": 1.3110073354101832e-06, + "loss": 0.111, + "num_input_tokens_seen": 29159712, + "step": 14892 + }, + { + "epoch": 1.9738899933730947, + "grad_norm": 13.084979057312012, + "learning_rate": 1.3107019829398182e-06, + "loss": 0.2393, + "num_input_tokens_seen": 29161472, + "step": 14893 + }, + { + "epoch": 1.9740225314777997, + "grad_norm": 6.58968448638916, + "learning_rate": 1.3103966533999402e-06, + "loss": 0.0351, + "num_input_tokens_seen": 29162904, + "step": 14894 + }, + { + "epoch": 1.974155069582505, + "grad_norm": 2.4697012901306152, + "learning_rate": 1.3100913467964351e-06, + "loss": 0.0163, + "num_input_tokens_seen": 29164904, + "step": 14895 + }, + { + "epoch": 1.9742876076872102, + "grad_norm": 1.950269341468811, + "learning_rate": 1.30978606313519e-06, + "loss": 0.0211, + "num_input_tokens_seen": 29166832, + "step": 14896 + }, + { + "epoch": 1.9744201457919153, + "grad_norm": 0.040524259209632874, + "learning_rate": 1.3094808024220898e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29168232, + "step": 14897 + }, + { + "epoch": 1.9745526838966203, + "grad_norm": 3.985295057296753, + "learning_rate": 1.3091755646630212e-06, + "loss": 0.0162, + "num_input_tokens_seen": 29171008, + "step": 14898 + }, + { + "epoch": 1.9746852220013253, + "grad_norm": 3.2362852096557617, + "learning_rate": 1.3088703498638696e-06, + "loss": 0.0734, + "num_input_tokens_seen": 29172720, + "step": 14899 + }, + { + "epoch": 1.9748177601060304, + "grad_norm": 5.5034894943237305, + "learning_rate": 1.3085651580305197e-06, + "loss": 0.0695, + "num_input_tokens_seen": 29173792, + "step": 14900 + }, + { + "epoch": 1.9749502982107354, + "grad_norm": 1.1807979345321655, + "learning_rate": 1.3082599891688559e-06, + "loss": 0.0089, + "num_input_tokens_seen": 29174992, + "step": 14901 + }, + { + "epoch": 1.9750828363154407, + "grad_norm": 7.608094692230225, + "learning_rate": 1.307954843284761e-06, + "loss": 0.1127, + "num_input_tokens_seen": 29177336, + "step": 14902 + }, + { + "epoch": 1.975215374420146, + "grad_norm": 0.7388755679130554, + "learning_rate": 1.3076497203841181e-06, + "loss": 0.004, + "num_input_tokens_seen": 29179072, + "step": 14903 + }, + { + "epoch": 1.975347912524851, + "grad_norm": 0.21701079607009888, + "learning_rate": 1.3073446204728115e-06, + "loss": 0.0009, + "num_input_tokens_seen": 29180800, + "step": 14904 + }, + { + "epoch": 1.975480450629556, + "grad_norm": 17.94910430908203, + "learning_rate": 1.3070395435567244e-06, + "loss": 0.1879, + "num_input_tokens_seen": 29183088, + "step": 14905 + }, + { + "epoch": 1.975612988734261, + "grad_norm": 11.324033737182617, + "learning_rate": 1.3067344896417378e-06, + "loss": 0.1777, + "num_input_tokens_seen": 29185368, + "step": 14906 + }, + { + "epoch": 1.975745526838966, + "grad_norm": 8.709357261657715, + "learning_rate": 1.3064294587337332e-06, + "loss": 0.1871, + "num_input_tokens_seen": 29187272, + "step": 14907 + }, + { + "epoch": 1.9758780649436714, + "grad_norm": 7.662909984588623, + "learning_rate": 1.3061244508385913e-06, + "loss": 0.1256, + "num_input_tokens_seen": 29189696, + "step": 14908 + }, + { + "epoch": 1.9760106030483764, + "grad_norm": 6.987009048461914, + "learning_rate": 1.3058194659621943e-06, + "loss": 0.0594, + "num_input_tokens_seen": 29190872, + "step": 14909 + }, + { + "epoch": 1.9761431411530817, + "grad_norm": 0.11000020802021027, + "learning_rate": 1.305514504110421e-06, + "loss": 0.0008, + "num_input_tokens_seen": 29192568, + "step": 14910 + }, + { + "epoch": 1.9762756792577867, + "grad_norm": 2.610076427459717, + "learning_rate": 1.3052095652891534e-06, + "loss": 0.0745, + "num_input_tokens_seen": 29194888, + "step": 14911 + }, + { + "epoch": 1.9764082173624917, + "grad_norm": 6.124466419219971, + "learning_rate": 1.3049046495042692e-06, + "loss": 0.1152, + "num_input_tokens_seen": 29196760, + "step": 14912 + }, + { + "epoch": 1.9765407554671968, + "grad_norm": 5.125068664550781, + "learning_rate": 1.3045997567616469e-06, + "loss": 0.1195, + "num_input_tokens_seen": 29198416, + "step": 14913 + }, + { + "epoch": 1.9766732935719018, + "grad_norm": 4.630000591278076, + "learning_rate": 1.304294887067167e-06, + "loss": 0.0866, + "num_input_tokens_seen": 29200840, + "step": 14914 + }, + { + "epoch": 1.976805831676607, + "grad_norm": 3.579505681991577, + "learning_rate": 1.3039900404267056e-06, + "loss": 0.089, + "num_input_tokens_seen": 29202832, + "step": 14915 + }, + { + "epoch": 1.976938369781312, + "grad_norm": 5.4929938316345215, + "learning_rate": 1.3036852168461421e-06, + "loss": 0.0656, + "num_input_tokens_seen": 29205192, + "step": 14916 + }, + { + "epoch": 1.9770709078860174, + "grad_norm": 1.0215038061141968, + "learning_rate": 1.3033804163313531e-06, + "loss": 0.0107, + "num_input_tokens_seen": 29206824, + "step": 14917 + }, + { + "epoch": 1.9772034459907224, + "grad_norm": 0.11023305356502533, + "learning_rate": 1.3030756388882155e-06, + "loss": 0.0006, + "num_input_tokens_seen": 29208696, + "step": 14918 + }, + { + "epoch": 1.9773359840954274, + "grad_norm": 12.214183807373047, + "learning_rate": 1.302770884522604e-06, + "loss": 0.1962, + "num_input_tokens_seen": 29210240, + "step": 14919 + }, + { + "epoch": 1.9774685222001325, + "grad_norm": 3.282313346862793, + "learning_rate": 1.3024661532403966e-06, + "loss": 0.0581, + "num_input_tokens_seen": 29211904, + "step": 14920 + }, + { + "epoch": 1.9776010603048375, + "grad_norm": 11.025667190551758, + "learning_rate": 1.3021614450474667e-06, + "loss": 0.0229, + "num_input_tokens_seen": 29213560, + "step": 14921 + }, + { + "epoch": 1.9777335984095428, + "grad_norm": 5.648624420166016, + "learning_rate": 1.301856759949692e-06, + "loss": 0.0601, + "num_input_tokens_seen": 29215976, + "step": 14922 + }, + { + "epoch": 1.9778661365142478, + "grad_norm": 2.7478837966918945, + "learning_rate": 1.3015520979529452e-06, + "loss": 0.0413, + "num_input_tokens_seen": 29218344, + "step": 14923 + }, + { + "epoch": 1.977998674618953, + "grad_norm": 0.031490396708250046, + "learning_rate": 1.3012474590631e-06, + "loss": 0.0002, + "num_input_tokens_seen": 29219864, + "step": 14924 + }, + { + "epoch": 1.978131212723658, + "grad_norm": 0.01297858078032732, + "learning_rate": 1.3009428432860316e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29220920, + "step": 14925 + }, + { + "epoch": 1.9782637508283631, + "grad_norm": 5.139960289001465, + "learning_rate": 1.3006382506276124e-06, + "loss": 0.0232, + "num_input_tokens_seen": 29222472, + "step": 14926 + }, + { + "epoch": 1.9783962889330682, + "grad_norm": 2.93013858795166, + "learning_rate": 1.300333681093714e-06, + "loss": 0.0579, + "num_input_tokens_seen": 29224632, + "step": 14927 + }, + { + "epoch": 1.9785288270377732, + "grad_norm": 12.069201469421387, + "learning_rate": 1.300029134690211e-06, + "loss": 0.1486, + "num_input_tokens_seen": 29227104, + "step": 14928 + }, + { + "epoch": 1.9786613651424785, + "grad_norm": 0.27290380001068115, + "learning_rate": 1.2997246114229734e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29228960, + "step": 14929 + }, + { + "epoch": 1.9787939032471835, + "grad_norm": 0.05895405262708664, + "learning_rate": 1.2994201112978738e-06, + "loss": 0.0004, + "num_input_tokens_seen": 29230672, + "step": 14930 + }, + { + "epoch": 1.9789264413518888, + "grad_norm": 11.887923240661621, + "learning_rate": 1.2991156343207828e-06, + "loss": 0.1428, + "num_input_tokens_seen": 29231792, + "step": 14931 + }, + { + "epoch": 1.9790589794565938, + "grad_norm": 2.5906054973602295, + "learning_rate": 1.298811180497571e-06, + "loss": 0.04, + "num_input_tokens_seen": 29234272, + "step": 14932 + }, + { + "epoch": 1.9791915175612989, + "grad_norm": 1.6383649110794067, + "learning_rate": 1.298506749834107e-06, + "loss": 0.0149, + "num_input_tokens_seen": 29236328, + "step": 14933 + }, + { + "epoch": 1.979324055666004, + "grad_norm": 5.924740314483643, + "learning_rate": 1.2982023423362623e-06, + "loss": 0.0668, + "num_input_tokens_seen": 29237912, + "step": 14934 + }, + { + "epoch": 1.979456593770709, + "grad_norm": 9.834704399108887, + "learning_rate": 1.2978979580099057e-06, + "loss": 0.1525, + "num_input_tokens_seen": 29239856, + "step": 14935 + }, + { + "epoch": 1.9795891318754142, + "grad_norm": 0.2781409025192261, + "learning_rate": 1.2975935968609062e-06, + "loss": 0.002, + "num_input_tokens_seen": 29242096, + "step": 14936 + }, + { + "epoch": 1.9797216699801194, + "grad_norm": 6.552262306213379, + "learning_rate": 1.297289258895132e-06, + "loss": 0.1053, + "num_input_tokens_seen": 29244288, + "step": 14937 + }, + { + "epoch": 1.9798542080848245, + "grad_norm": 6.329280376434326, + "learning_rate": 1.2969849441184502e-06, + "loss": 0.0869, + "num_input_tokens_seen": 29245704, + "step": 14938 + }, + { + "epoch": 1.9799867461895295, + "grad_norm": 0.014991187490522861, + "learning_rate": 1.2966806525367272e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29247408, + "step": 14939 + }, + { + "epoch": 1.9801192842942346, + "grad_norm": 0.03031809628009796, + "learning_rate": 1.296376384155832e-06, + "loss": 0.0002, + "num_input_tokens_seen": 29249080, + "step": 14940 + }, + { + "epoch": 1.9802518223989396, + "grad_norm": 0.1248452365398407, + "learning_rate": 1.296072138981631e-06, + "loss": 0.0008, + "num_input_tokens_seen": 29251096, + "step": 14941 + }, + { + "epoch": 1.9803843605036446, + "grad_norm": 6.064358711242676, + "learning_rate": 1.29576791701999e-06, + "loss": 0.0806, + "num_input_tokens_seen": 29253056, + "step": 14942 + }, + { + "epoch": 1.98051689860835, + "grad_norm": 1.6882566213607788, + "learning_rate": 1.2954637182767744e-06, + "loss": 0.0132, + "num_input_tokens_seen": 29254328, + "step": 14943 + }, + { + "epoch": 1.9806494367130552, + "grad_norm": 2.212655544281006, + "learning_rate": 1.295159542757848e-06, + "loss": 0.0274, + "num_input_tokens_seen": 29256456, + "step": 14944 + }, + { + "epoch": 1.9807819748177602, + "grad_norm": 15.62509536743164, + "learning_rate": 1.294855390469077e-06, + "loss": 0.2974, + "num_input_tokens_seen": 29258448, + "step": 14945 + }, + { + "epoch": 1.9809145129224652, + "grad_norm": 9.234081268310547, + "learning_rate": 1.2945512614163264e-06, + "loss": 0.2232, + "num_input_tokens_seen": 29260472, + "step": 14946 + }, + { + "epoch": 1.9810470510271703, + "grad_norm": 0.1798482984304428, + "learning_rate": 1.2942471556054592e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29262472, + "step": 14947 + }, + { + "epoch": 1.9811795891318753, + "grad_norm": 0.27187666296958923, + "learning_rate": 1.2939430730423384e-06, + "loss": 0.0018, + "num_input_tokens_seen": 29263880, + "step": 14948 + }, + { + "epoch": 1.9813121272365806, + "grad_norm": 3.7519893646240234, + "learning_rate": 1.2936390137328272e-06, + "loss": 0.0446, + "num_input_tokens_seen": 29265784, + "step": 14949 + }, + { + "epoch": 1.9814446653412856, + "grad_norm": 8.041322708129883, + "learning_rate": 1.293334977682787e-06, + "loss": 0.0829, + "num_input_tokens_seen": 29267456, + "step": 14950 + }, + { + "epoch": 1.9815772034459909, + "grad_norm": 8.203190803527832, + "learning_rate": 1.293030964898081e-06, + "loss": 0.1435, + "num_input_tokens_seen": 29269440, + "step": 14951 + }, + { + "epoch": 1.981709741550696, + "grad_norm": 1.8591647148132324, + "learning_rate": 1.2927269753845716e-06, + "loss": 0.0345, + "num_input_tokens_seen": 29271752, + "step": 14952 + }, + { + "epoch": 1.981842279655401, + "grad_norm": 4.607569217681885, + "learning_rate": 1.292423009148119e-06, + "loss": 0.0684, + "num_input_tokens_seen": 29273344, + "step": 14953 + }, + { + "epoch": 1.981974817760106, + "grad_norm": 11.287821769714355, + "learning_rate": 1.292119066194584e-06, + "loss": 0.2822, + "num_input_tokens_seen": 29276672, + "step": 14954 + }, + { + "epoch": 1.982107355864811, + "grad_norm": 1.4310781955718994, + "learning_rate": 1.2918151465298256e-06, + "loss": 0.0052, + "num_input_tokens_seen": 29278536, + "step": 14955 + }, + { + "epoch": 1.9822398939695163, + "grad_norm": 5.801882743835449, + "learning_rate": 1.2915112501597055e-06, + "loss": 0.0541, + "num_input_tokens_seen": 29280320, + "step": 14956 + }, + { + "epoch": 1.9823724320742213, + "grad_norm": 3.4595510959625244, + "learning_rate": 1.2912073770900813e-06, + "loss": 0.0695, + "num_input_tokens_seen": 29284408, + "step": 14957 + }, + { + "epoch": 1.9825049701789266, + "grad_norm": 0.13448534905910492, + "learning_rate": 1.2909035273268137e-06, + "loss": 0.0008, + "num_input_tokens_seen": 29286696, + "step": 14958 + }, + { + "epoch": 1.9826375082836316, + "grad_norm": 3.0875461101531982, + "learning_rate": 1.2905997008757604e-06, + "loss": 0.0182, + "num_input_tokens_seen": 29290048, + "step": 14959 + }, + { + "epoch": 1.9827700463883366, + "grad_norm": 11.961390495300293, + "learning_rate": 1.2902958977427776e-06, + "loss": 0.1786, + "num_input_tokens_seen": 29291840, + "step": 14960 + }, + { + "epoch": 1.9829025844930417, + "grad_norm": 1.6517081260681152, + "learning_rate": 1.289992117933726e-06, + "loss": 0.0101, + "num_input_tokens_seen": 29293864, + "step": 14961 + }, + { + "epoch": 1.9830351225977467, + "grad_norm": 6.134220600128174, + "learning_rate": 1.2896883614544609e-06, + "loss": 0.0632, + "num_input_tokens_seen": 29296264, + "step": 14962 + }, + { + "epoch": 1.983167660702452, + "grad_norm": 10.05469799041748, + "learning_rate": 1.2893846283108385e-06, + "loss": 0.2714, + "num_input_tokens_seen": 29298672, + "step": 14963 + }, + { + "epoch": 1.983300198807157, + "grad_norm": 0.03645828366279602, + "learning_rate": 1.289080918508716e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29300440, + "step": 14964 + }, + { + "epoch": 1.9834327369118623, + "grad_norm": 0.010693750344216824, + "learning_rate": 1.2887772320539482e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29301776, + "step": 14965 + }, + { + "epoch": 1.9835652750165673, + "grad_norm": 12.394100189208984, + "learning_rate": 1.2884735689523919e-06, + "loss": 0.1351, + "num_input_tokens_seen": 29304024, + "step": 14966 + }, + { + "epoch": 1.9836978131212724, + "grad_norm": 4.284935474395752, + "learning_rate": 1.288169929209901e-06, + "loss": 0.0238, + "num_input_tokens_seen": 29305296, + "step": 14967 + }, + { + "epoch": 1.9838303512259774, + "grad_norm": 1.1791915893554688, + "learning_rate": 1.2878663128323288e-06, + "loss": 0.0041, + "num_input_tokens_seen": 29307056, + "step": 14968 + }, + { + "epoch": 1.9839628893306824, + "grad_norm": 6.600908279418945, + "learning_rate": 1.2875627198255314e-06, + "loss": 0.0933, + "num_input_tokens_seen": 29309096, + "step": 14969 + }, + { + "epoch": 1.9840954274353877, + "grad_norm": 3.78352427482605, + "learning_rate": 1.2872591501953599e-06, + "loss": 0.0663, + "num_input_tokens_seen": 29311128, + "step": 14970 + }, + { + "epoch": 1.9842279655400927, + "grad_norm": 9.576079368591309, + "learning_rate": 1.28695560394767e-06, + "loss": 0.1543, + "num_input_tokens_seen": 29313704, + "step": 14971 + }, + { + "epoch": 1.984360503644798, + "grad_norm": 13.428544044494629, + "learning_rate": 1.2866520810883123e-06, + "loss": 0.2213, + "num_input_tokens_seen": 29315752, + "step": 14972 + }, + { + "epoch": 1.984493041749503, + "grad_norm": 9.78488826751709, + "learning_rate": 1.28634858162314e-06, + "loss": 0.1732, + "num_input_tokens_seen": 29317384, + "step": 14973 + }, + { + "epoch": 1.984625579854208, + "grad_norm": 0.15593236684799194, + "learning_rate": 1.2860451055580032e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29319288, + "step": 14974 + }, + { + "epoch": 1.984758117958913, + "grad_norm": 8.514893531799316, + "learning_rate": 1.285741652898755e-06, + "loss": 0.147, + "num_input_tokens_seen": 29321472, + "step": 14975 + }, + { + "epoch": 1.9848906560636181, + "grad_norm": 0.022653639316558838, + "learning_rate": 1.2854382236512447e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29323360, + "step": 14976 + }, + { + "epoch": 1.9850231941683234, + "grad_norm": 13.34898853302002, + "learning_rate": 1.2851348178213244e-06, + "loss": 0.3377, + "num_input_tokens_seen": 29325512, + "step": 14977 + }, + { + "epoch": 1.9851557322730287, + "grad_norm": 2.5087296962738037, + "learning_rate": 1.2848314354148428e-06, + "loss": 0.048, + "num_input_tokens_seen": 29327640, + "step": 14978 + }, + { + "epoch": 1.9852882703777337, + "grad_norm": 12.470858573913574, + "learning_rate": 1.2845280764376494e-06, + "loss": 0.1751, + "num_input_tokens_seen": 29329704, + "step": 14979 + }, + { + "epoch": 1.9854208084824387, + "grad_norm": 9.162280082702637, + "learning_rate": 1.2842247408955922e-06, + "loss": 0.1787, + "num_input_tokens_seen": 29332008, + "step": 14980 + }, + { + "epoch": 1.9855533465871438, + "grad_norm": 0.6234849691390991, + "learning_rate": 1.2839214287945206e-06, + "loss": 0.0087, + "num_input_tokens_seen": 29335288, + "step": 14981 + }, + { + "epoch": 1.9856858846918488, + "grad_norm": 6.43628454208374, + "learning_rate": 1.283618140140284e-06, + "loss": 0.0763, + "num_input_tokens_seen": 29336896, + "step": 14982 + }, + { + "epoch": 1.9858184227965539, + "grad_norm": 0.008916890248656273, + "learning_rate": 1.2833148749387287e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29338760, + "step": 14983 + }, + { + "epoch": 1.985950960901259, + "grad_norm": 0.018107829615473747, + "learning_rate": 1.2830116331957019e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29340080, + "step": 14984 + }, + { + "epoch": 1.9860834990059644, + "grad_norm": 12.252679824829102, + "learning_rate": 1.2827084149170504e-06, + "loss": 0.2424, + "num_input_tokens_seen": 29342512, + "step": 14985 + }, + { + "epoch": 1.9862160371106694, + "grad_norm": 5.954042434692383, + "learning_rate": 1.2824052201086194e-06, + "loss": 0.0339, + "num_input_tokens_seen": 29343576, + "step": 14986 + }, + { + "epoch": 1.9863485752153744, + "grad_norm": 8.77004337310791, + "learning_rate": 1.282102048776256e-06, + "loss": 0.3532, + "num_input_tokens_seen": 29346144, + "step": 14987 + }, + { + "epoch": 1.9864811133200795, + "grad_norm": 0.08105558902025223, + "learning_rate": 1.281798900925806e-06, + "loss": 0.0005, + "num_input_tokens_seen": 29347616, + "step": 14988 + }, + { + "epoch": 1.9866136514247845, + "grad_norm": 1.0167502164840698, + "learning_rate": 1.2814957765631133e-06, + "loss": 0.0113, + "num_input_tokens_seen": 29350208, + "step": 14989 + }, + { + "epoch": 1.9867461895294898, + "grad_norm": 1.1385411024093628, + "learning_rate": 1.281192675694023e-06, + "loss": 0.0074, + "num_input_tokens_seen": 29351832, + "step": 14990 + }, + { + "epoch": 1.9868787276341948, + "grad_norm": 0.15233564376831055, + "learning_rate": 1.2808895983243777e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29353456, + "step": 14991 + }, + { + "epoch": 1.9870112657389, + "grad_norm": 21.02981948852539, + "learning_rate": 1.280586544460023e-06, + "loss": 0.1437, + "num_input_tokens_seen": 29355392, + "step": 14992 + }, + { + "epoch": 1.9871438038436051, + "grad_norm": 0.1611795574426651, + "learning_rate": 1.2802835141067998e-06, + "loss": 0.0009, + "num_input_tokens_seen": 29357824, + "step": 14993 + }, + { + "epoch": 1.9872763419483102, + "grad_norm": 7.989131450653076, + "learning_rate": 1.2799805072705528e-06, + "loss": 0.1645, + "num_input_tokens_seen": 29359688, + "step": 14994 + }, + { + "epoch": 1.9874088800530152, + "grad_norm": 5.402070045471191, + "learning_rate": 1.2796775239571238e-06, + "loss": 0.1805, + "num_input_tokens_seen": 29361864, + "step": 14995 + }, + { + "epoch": 1.9875414181577202, + "grad_norm": 4.802786827087402, + "learning_rate": 1.2793745641723526e-06, + "loss": 0.0348, + "num_input_tokens_seen": 29363776, + "step": 14996 + }, + { + "epoch": 1.9876739562624255, + "grad_norm": 6.909824371337891, + "learning_rate": 1.279071627922083e-06, + "loss": 0.0937, + "num_input_tokens_seen": 29366240, + "step": 14997 + }, + { + "epoch": 1.9878064943671305, + "grad_norm": 0.03242842108011246, + "learning_rate": 1.2787687152121536e-06, + "loss": 0.0002, + "num_input_tokens_seen": 29368624, + "step": 14998 + }, + { + "epoch": 1.9879390324718358, + "grad_norm": 9.624953269958496, + "learning_rate": 1.2784658260484067e-06, + "loss": 0.1457, + "num_input_tokens_seen": 29370592, + "step": 14999 + }, + { + "epoch": 1.9880715705765408, + "grad_norm": 9.715973854064941, + "learning_rate": 1.278162960436682e-06, + "loss": 0.3169, + "num_input_tokens_seen": 29373352, + "step": 15000 + }, + { + "epoch": 1.9882041086812459, + "grad_norm": 8.819573402404785, + "learning_rate": 1.2778601183828166e-06, + "loss": 0.0317, + "num_input_tokens_seen": 29375288, + "step": 15001 + }, + { + "epoch": 1.988336646785951, + "grad_norm": 0.09542513638734818, + "learning_rate": 1.2775572998926525e-06, + "loss": 0.0007, + "num_input_tokens_seen": 29378192, + "step": 15002 + }, + { + "epoch": 1.988469184890656, + "grad_norm": 5.76531457901001, + "learning_rate": 1.277254504972027e-06, + "loss": 0.061, + "num_input_tokens_seen": 29379672, + "step": 15003 + }, + { + "epoch": 1.9886017229953612, + "grad_norm": 6.633455753326416, + "learning_rate": 1.276951733626777e-06, + "loss": 0.0427, + "num_input_tokens_seen": 29380944, + "step": 15004 + }, + { + "epoch": 1.9887342611000662, + "grad_norm": 0.007879002019762993, + "learning_rate": 1.2766489858627423e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29382496, + "step": 15005 + }, + { + "epoch": 1.9888667992047715, + "grad_norm": 4.493411064147949, + "learning_rate": 1.2763462616857595e-06, + "loss": 0.0926, + "num_input_tokens_seen": 29385048, + "step": 15006 + }, + { + "epoch": 1.9889993373094765, + "grad_norm": 0.019767822697758675, + "learning_rate": 1.2760435611016635e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29386640, + "step": 15007 + }, + { + "epoch": 1.9891318754141816, + "grad_norm": 8.19939136505127, + "learning_rate": 1.2757408841162932e-06, + "loss": 0.1468, + "num_input_tokens_seen": 29388456, + "step": 15008 + }, + { + "epoch": 1.9892644135188866, + "grad_norm": 2.3931753635406494, + "learning_rate": 1.2754382307354834e-06, + "loss": 0.028, + "num_input_tokens_seen": 29391184, + "step": 15009 + }, + { + "epoch": 1.9893969516235916, + "grad_norm": 0.019784769043326378, + "learning_rate": 1.275135600965068e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29392904, + "step": 15010 + }, + { + "epoch": 1.989529489728297, + "grad_norm": 0.04237458482384682, + "learning_rate": 1.2748329948108844e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29394976, + "step": 15011 + }, + { + "epoch": 1.989662027833002, + "grad_norm": 10.403319358825684, + "learning_rate": 1.274530412278765e-06, + "loss": 0.0748, + "num_input_tokens_seen": 29397088, + "step": 15012 + }, + { + "epoch": 1.9897945659377072, + "grad_norm": 3.1516165733337402, + "learning_rate": 1.274227853374545e-06, + "loss": 0.0216, + "num_input_tokens_seen": 29399208, + "step": 15013 + }, + { + "epoch": 1.9899271040424122, + "grad_norm": 0.20235677063465118, + "learning_rate": 1.2739253181040584e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29401288, + "step": 15014 + }, + { + "epoch": 1.9900596421471173, + "grad_norm": 9.354519844055176, + "learning_rate": 1.273622806473137e-06, + "loss": 0.1541, + "num_input_tokens_seen": 29403640, + "step": 15015 + }, + { + "epoch": 1.9901921802518223, + "grad_norm": 9.492431640625, + "learning_rate": 1.2733203184876131e-06, + "loss": 0.2732, + "num_input_tokens_seen": 29405320, + "step": 15016 + }, + { + "epoch": 1.9903247183565274, + "grad_norm": 0.05043940991163254, + "learning_rate": 1.2730178541533194e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29408200, + "step": 15017 + }, + { + "epoch": 1.9904572564612326, + "grad_norm": 12.103384017944336, + "learning_rate": 1.2727154134760895e-06, + "loss": 0.2796, + "num_input_tokens_seen": 29410792, + "step": 15018 + }, + { + "epoch": 1.9905897945659377, + "grad_norm": 0.019128022715449333, + "learning_rate": 1.2724129964617526e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29412104, + "step": 15019 + }, + { + "epoch": 1.990722332670643, + "grad_norm": 10.11974811553955, + "learning_rate": 1.2721106031161402e-06, + "loss": 0.1888, + "num_input_tokens_seen": 29413384, + "step": 15020 + }, + { + "epoch": 1.990854870775348, + "grad_norm": 5.954976558685303, + "learning_rate": 1.2718082334450815e-06, + "loss": 0.0819, + "num_input_tokens_seen": 29415640, + "step": 15021 + }, + { + "epoch": 1.990987408880053, + "grad_norm": 0.015992214903235435, + "learning_rate": 1.2715058874544072e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29417184, + "step": 15022 + }, + { + "epoch": 1.991119946984758, + "grad_norm": 0.13484159111976624, + "learning_rate": 1.271203565149948e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29418928, + "step": 15023 + }, + { + "epoch": 1.991252485089463, + "grad_norm": 5.495573043823242, + "learning_rate": 1.2709012665375317e-06, + "loss": 0.1355, + "num_input_tokens_seen": 29421456, + "step": 15024 + }, + { + "epoch": 1.9913850231941683, + "grad_norm": 0.017603913322091103, + "learning_rate": 1.2705989916229868e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29423592, + "step": 15025 + }, + { + "epoch": 1.9915175612988736, + "grad_norm": 0.016949553042650223, + "learning_rate": 1.2702967404121416e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29424832, + "step": 15026 + }, + { + "epoch": 1.9916500994035786, + "grad_norm": 0.02186252363026142, + "learning_rate": 1.2699945129108227e-06, + "loss": 0.0002, + "num_input_tokens_seen": 29426384, + "step": 15027 + }, + { + "epoch": 1.9917826375082837, + "grad_norm": 10.439915657043457, + "learning_rate": 1.2696923091248578e-06, + "loss": 0.2374, + "num_input_tokens_seen": 29429032, + "step": 15028 + }, + { + "epoch": 1.9919151756129887, + "grad_norm": 2.7696712017059326, + "learning_rate": 1.2693901290600751e-06, + "loss": 0.0247, + "num_input_tokens_seen": 29430120, + "step": 15029 + }, + { + "epoch": 1.9920477137176937, + "grad_norm": 0.057111505419015884, + "learning_rate": 1.2690879727223e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29431440, + "step": 15030 + }, + { + "epoch": 1.9921802518223988, + "grad_norm": 11.524256706237793, + "learning_rate": 1.2687858401173576e-06, + "loss": 0.1556, + "num_input_tokens_seen": 29434448, + "step": 15031 + }, + { + "epoch": 1.992312789927104, + "grad_norm": 0.019454503431916237, + "learning_rate": 1.2684837312510735e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29437064, + "step": 15032 + }, + { + "epoch": 1.9924453280318093, + "grad_norm": 0.01588483527302742, + "learning_rate": 1.2681816461292715e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29438856, + "step": 15033 + }, + { + "epoch": 1.9925778661365143, + "grad_norm": 3.3085122108459473, + "learning_rate": 1.2678795847577773e-06, + "loss": 0.0475, + "num_input_tokens_seen": 29440328, + "step": 15034 + }, + { + "epoch": 1.9927104042412194, + "grad_norm": 6.721047401428223, + "learning_rate": 1.267577547142416e-06, + "loss": 0.1052, + "num_input_tokens_seen": 29441952, + "step": 15035 + }, + { + "epoch": 1.9928429423459244, + "grad_norm": 1.3945090770721436, + "learning_rate": 1.2672755332890098e-06, + "loss": 0.0092, + "num_input_tokens_seen": 29444152, + "step": 15036 + }, + { + "epoch": 1.9929754804506294, + "grad_norm": 4.494974136352539, + "learning_rate": 1.266973543203381e-06, + "loss": 0.0672, + "num_input_tokens_seen": 29446344, + "step": 15037 + }, + { + "epoch": 1.9931080185553347, + "grad_norm": 0.1965724527835846, + "learning_rate": 1.2666715768913526e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29448096, + "step": 15038 + }, + { + "epoch": 1.9932405566600397, + "grad_norm": 8.084900856018066, + "learning_rate": 1.266369634358748e-06, + "loss": 0.0702, + "num_input_tokens_seen": 29450032, + "step": 15039 + }, + { + "epoch": 1.993373094764745, + "grad_norm": 4.564589023590088, + "learning_rate": 1.2660677156113866e-06, + "loss": 0.0435, + "num_input_tokens_seen": 29452160, + "step": 15040 + }, + { + "epoch": 1.99350563286945, + "grad_norm": 0.008167847990989685, + "learning_rate": 1.2657658206550922e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29454440, + "step": 15041 + }, + { + "epoch": 1.993638170974155, + "grad_norm": 8.761433601379395, + "learning_rate": 1.2654639494956839e-06, + "loss": 0.1451, + "num_input_tokens_seen": 29457368, + "step": 15042 + }, + { + "epoch": 1.9937707090788601, + "grad_norm": 0.22472958266735077, + "learning_rate": 1.2651621021389815e-06, + "loss": 0.0021, + "num_input_tokens_seen": 29459496, + "step": 15043 + }, + { + "epoch": 1.9939032471835652, + "grad_norm": 5.541337013244629, + "learning_rate": 1.2648602785908066e-06, + "loss": 0.032, + "num_input_tokens_seen": 29461240, + "step": 15044 + }, + { + "epoch": 1.9940357852882704, + "grad_norm": 9.68825912475586, + "learning_rate": 1.2645584788569766e-06, + "loss": 0.1136, + "num_input_tokens_seen": 29463320, + "step": 15045 + }, + { + "epoch": 1.9941683233929755, + "grad_norm": 12.671784400939941, + "learning_rate": 1.2642567029433122e-06, + "loss": 0.294, + "num_input_tokens_seen": 29465024, + "step": 15046 + }, + { + "epoch": 1.9943008614976807, + "grad_norm": 7.378289222717285, + "learning_rate": 1.2639549508556315e-06, + "loss": 0.1446, + "num_input_tokens_seen": 29467136, + "step": 15047 + }, + { + "epoch": 1.9944333996023857, + "grad_norm": 13.606983184814453, + "learning_rate": 1.2636532225997508e-06, + "loss": 0.2464, + "num_input_tokens_seen": 29470032, + "step": 15048 + }, + { + "epoch": 1.9945659377070908, + "grad_norm": 11.810920715332031, + "learning_rate": 1.2633515181814899e-06, + "loss": 0.3046, + "num_input_tokens_seen": 29471640, + "step": 15049 + }, + { + "epoch": 1.9946984758117958, + "grad_norm": 0.07023024559020996, + "learning_rate": 1.2630498376066648e-06, + "loss": 0.0007, + "num_input_tokens_seen": 29472784, + "step": 15050 + }, + { + "epoch": 1.9948310139165009, + "grad_norm": 4.564071178436279, + "learning_rate": 1.2627481808810916e-06, + "loss": 0.085, + "num_input_tokens_seen": 29475096, + "step": 15051 + }, + { + "epoch": 1.9949635520212061, + "grad_norm": 14.904413223266602, + "learning_rate": 1.2624465480105873e-06, + "loss": 0.1335, + "num_input_tokens_seen": 29477208, + "step": 15052 + }, + { + "epoch": 1.9950960901259112, + "grad_norm": 1.5181065797805786, + "learning_rate": 1.2621449390009665e-06, + "loss": 0.0058, + "num_input_tokens_seen": 29479152, + "step": 15053 + }, + { + "epoch": 1.9952286282306164, + "grad_norm": 3.748671531677246, + "learning_rate": 1.2618433538580467e-06, + "loss": 0.0141, + "num_input_tokens_seen": 29481648, + "step": 15054 + }, + { + "epoch": 1.9953611663353215, + "grad_norm": 8.37869930267334, + "learning_rate": 1.2615417925876404e-06, + "loss": 0.0841, + "num_input_tokens_seen": 29484088, + "step": 15055 + }, + { + "epoch": 1.9954937044400265, + "grad_norm": 12.78702163696289, + "learning_rate": 1.2612402551955635e-06, + "loss": 0.3809, + "num_input_tokens_seen": 29487416, + "step": 15056 + }, + { + "epoch": 1.9956262425447315, + "grad_norm": 5.244921684265137, + "learning_rate": 1.2609387416876278e-06, + "loss": 0.0544, + "num_input_tokens_seen": 29489216, + "step": 15057 + }, + { + "epoch": 1.9957587806494366, + "grad_norm": 0.020142938941717148, + "learning_rate": 1.260637252069648e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29490456, + "step": 15058 + }, + { + "epoch": 1.9958913187541418, + "grad_norm": 0.10015234351158142, + "learning_rate": 1.2603357863474377e-06, + "loss": 0.0009, + "num_input_tokens_seen": 29491752, + "step": 15059 + }, + { + "epoch": 1.9960238568588469, + "grad_norm": 0.058342497795820236, + "learning_rate": 1.2600343445268088e-06, + "loss": 0.0004, + "num_input_tokens_seen": 29493824, + "step": 15060 + }, + { + "epoch": 1.9961563949635521, + "grad_norm": 11.053287506103516, + "learning_rate": 1.2597329266135733e-06, + "loss": 0.2014, + "num_input_tokens_seen": 29495696, + "step": 15061 + }, + { + "epoch": 1.9962889330682572, + "grad_norm": 0.010415097698569298, + "learning_rate": 1.259431532613542e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29497568, + "step": 15062 + }, + { + "epoch": 1.9964214711729622, + "grad_norm": 9.868606567382812, + "learning_rate": 1.2591301625325263e-06, + "loss": 0.0954, + "num_input_tokens_seen": 29499272, + "step": 15063 + }, + { + "epoch": 1.9965540092776672, + "grad_norm": 1.5083472728729248, + "learning_rate": 1.2588288163763368e-06, + "loss": 0.0048, + "num_input_tokens_seen": 29500944, + "step": 15064 + }, + { + "epoch": 1.9966865473823723, + "grad_norm": 0.022500133141875267, + "learning_rate": 1.2585274941507849e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29503232, + "step": 15065 + }, + { + "epoch": 1.9968190854870775, + "grad_norm": 5.153323173522949, + "learning_rate": 1.2582261958616793e-06, + "loss": 0.0779, + "num_input_tokens_seen": 29506240, + "step": 15066 + }, + { + "epoch": 1.9969516235917828, + "grad_norm": 8.314993858337402, + "learning_rate": 1.2579249215148295e-06, + "loss": 0.1002, + "num_input_tokens_seen": 29508104, + "step": 15067 + }, + { + "epoch": 1.9970841616964878, + "grad_norm": 0.017672164365649223, + "learning_rate": 1.257623671116044e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29510288, + "step": 15068 + }, + { + "epoch": 1.9972166998011929, + "grad_norm": 7.050225734710693, + "learning_rate": 1.2573224446711302e-06, + "loss": 0.0436, + "num_input_tokens_seen": 29511936, + "step": 15069 + }, + { + "epoch": 1.997349237905898, + "grad_norm": 0.0177437886595726, + "learning_rate": 1.2570212421858968e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29514080, + "step": 15070 + }, + { + "epoch": 1.997481776010603, + "grad_norm": 7.996880531311035, + "learning_rate": 1.2567200636661525e-06, + "loss": 0.1354, + "num_input_tokens_seen": 29515600, + "step": 15071 + }, + { + "epoch": 1.997614314115308, + "grad_norm": 1.1732653379440308, + "learning_rate": 1.256418909117703e-06, + "loss": 0.0033, + "num_input_tokens_seen": 29517688, + "step": 15072 + }, + { + "epoch": 1.9977468522200132, + "grad_norm": 2.853372573852539, + "learning_rate": 1.2561177785463551e-06, + "loss": 0.1032, + "num_input_tokens_seen": 29519392, + "step": 15073 + }, + { + "epoch": 1.9978793903247185, + "grad_norm": 0.03616528958082199, + "learning_rate": 1.2558166719579135e-06, + "loss": 0.0002, + "num_input_tokens_seen": 29520968, + "step": 15074 + }, + { + "epoch": 1.9980119284294235, + "grad_norm": 12.990983963012695, + "learning_rate": 1.2555155893581844e-06, + "loss": 0.101, + "num_input_tokens_seen": 29522328, + "step": 15075 + }, + { + "epoch": 1.9981444665341286, + "grad_norm": 8.177651405334473, + "learning_rate": 1.2552145307529744e-06, + "loss": 0.0778, + "num_input_tokens_seen": 29523648, + "step": 15076 + }, + { + "epoch": 1.9982770046388336, + "grad_norm": 9.80813980102539, + "learning_rate": 1.2549134961480869e-06, + "loss": 0.1563, + "num_input_tokens_seen": 29525496, + "step": 15077 + }, + { + "epoch": 1.9984095427435387, + "grad_norm": 5.188504219055176, + "learning_rate": 1.2546124855493264e-06, + "loss": 0.0124, + "num_input_tokens_seen": 29527768, + "step": 15078 + }, + { + "epoch": 1.998542080848244, + "grad_norm": 9.325885772705078, + "learning_rate": 1.2543114989624952e-06, + "loss": 0.1933, + "num_input_tokens_seen": 29530048, + "step": 15079 + }, + { + "epoch": 1.998674618952949, + "grad_norm": 0.05279998108744621, + "learning_rate": 1.2540105363933986e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29533176, + "step": 15080 + }, + { + "epoch": 1.9988071570576542, + "grad_norm": 0.13106754422187805, + "learning_rate": 1.2537095978478376e-06, + "loss": 0.0009, + "num_input_tokens_seen": 29534296, + "step": 15081 + }, + { + "epoch": 1.9989396951623593, + "grad_norm": 9.442612648010254, + "learning_rate": 1.2534086833316161e-06, + "loss": 0.1579, + "num_input_tokens_seen": 29536736, + "step": 15082 + }, + { + "epoch": 1.9990722332670643, + "grad_norm": 7.0040202140808105, + "learning_rate": 1.2531077928505352e-06, + "loss": 0.1403, + "num_input_tokens_seen": 29538600, + "step": 15083 + }, + { + "epoch": 1.9992047713717693, + "grad_norm": 6.355255603790283, + "learning_rate": 1.2528069264103953e-06, + "loss": 0.0355, + "num_input_tokens_seen": 29540384, + "step": 15084 + }, + { + "epoch": 1.9993373094764744, + "grad_norm": 0.04498272016644478, + "learning_rate": 1.2525060840169994e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29542256, + "step": 15085 + }, + { + "epoch": 1.9994698475811796, + "grad_norm": 3.21889066696167, + "learning_rate": 1.2522052656761468e-06, + "loss": 0.0246, + "num_input_tokens_seen": 29544240, + "step": 15086 + }, + { + "epoch": 1.9996023856858847, + "grad_norm": 0.15578404068946838, + "learning_rate": 1.2519044713936362e-06, + "loss": 0.0008, + "num_input_tokens_seen": 29545400, + "step": 15087 + }, + { + "epoch": 1.99973492379059, + "grad_norm": 0.052895065397024155, + "learning_rate": 1.2516037011752697e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29547640, + "step": 15088 + }, + { + "epoch": 1.999867461895295, + "grad_norm": 0.007841318845748901, + "learning_rate": 1.2513029550268448e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29549512, + "step": 15089 + }, + { + "epoch": 2.0, + "grad_norm": 0.009754818864166737, + "learning_rate": 1.2510022329541595e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29550656, + "step": 15090 + }, + { + "epoch": 2.000132538104705, + "grad_norm": 5.919904708862305, + "learning_rate": 1.2507015349630138e-06, + "loss": 0.1083, + "num_input_tokens_seen": 29552936, + "step": 15091 + }, + { + "epoch": 2.00026507620941, + "grad_norm": 4.115756511688232, + "learning_rate": 1.2504008610592044e-06, + "loss": 0.0574, + "num_input_tokens_seen": 29554888, + "step": 15092 + }, + { + "epoch": 2.000397614314115, + "grad_norm": 0.02611556090414524, + "learning_rate": 1.2501002112485272e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29556392, + "step": 15093 + }, + { + "epoch": 2.0005301524188206, + "grad_norm": 1.6775933504104614, + "learning_rate": 1.2497995855367815e-06, + "loss": 0.0267, + "num_input_tokens_seen": 29558520, + "step": 15094 + }, + { + "epoch": 2.0006626905235256, + "grad_norm": 6.486643314361572, + "learning_rate": 1.249498983929761e-06, + "loss": 0.0855, + "num_input_tokens_seen": 29561528, + "step": 15095 + }, + { + "epoch": 2.0007952286282307, + "grad_norm": 20.230329513549805, + "learning_rate": 1.2491984064332641e-06, + "loss": 0.342, + "num_input_tokens_seen": 29564856, + "step": 15096 + }, + { + "epoch": 2.0009277667329357, + "grad_norm": 3.673888683319092, + "learning_rate": 1.2488978530530846e-06, + "loss": 0.0249, + "num_input_tokens_seen": 29566744, + "step": 15097 + }, + { + "epoch": 2.0010603048376407, + "grad_norm": 0.041721418499946594, + "learning_rate": 1.2485973237950166e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29569040, + "step": 15098 + }, + { + "epoch": 2.001192842942346, + "grad_norm": 4.339807510375977, + "learning_rate": 1.2482968186648564e-06, + "loss": 0.0357, + "num_input_tokens_seen": 29570504, + "step": 15099 + }, + { + "epoch": 2.001325381047051, + "grad_norm": 0.12106949090957642, + "learning_rate": 1.2479963376683961e-06, + "loss": 0.0016, + "num_input_tokens_seen": 29572128, + "step": 15100 + }, + { + "epoch": 2.0014579191517563, + "grad_norm": 0.1160544604063034, + "learning_rate": 1.2476958808114311e-06, + "loss": 0.0008, + "num_input_tokens_seen": 29573536, + "step": 15101 + }, + { + "epoch": 2.0015904572564613, + "grad_norm": 6.40834379196167, + "learning_rate": 1.2473954480997535e-06, + "loss": 0.0735, + "num_input_tokens_seen": 29575272, + "step": 15102 + }, + { + "epoch": 2.0017229953611664, + "grad_norm": 3.791527271270752, + "learning_rate": 1.247095039539156e-06, + "loss": 0.0761, + "num_input_tokens_seen": 29577560, + "step": 15103 + }, + { + "epoch": 2.0018555334658714, + "grad_norm": 7.423145771026611, + "learning_rate": 1.2467946551354292e-06, + "loss": 0.0704, + "num_input_tokens_seen": 29579320, + "step": 15104 + }, + { + "epoch": 2.0019880715705765, + "grad_norm": 0.035152118653059006, + "learning_rate": 1.2464942948943659e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29581824, + "step": 15105 + }, + { + "epoch": 2.0021206096752815, + "grad_norm": 3.599280834197998, + "learning_rate": 1.2461939588217587e-06, + "loss": 0.0434, + "num_input_tokens_seen": 29584464, + "step": 15106 + }, + { + "epoch": 2.0022531477799865, + "grad_norm": 0.05427742376923561, + "learning_rate": 1.2458936469233964e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29586088, + "step": 15107 + }, + { + "epoch": 2.002385685884692, + "grad_norm": 12.067769050598145, + "learning_rate": 1.2455933592050704e-06, + "loss": 0.0512, + "num_input_tokens_seen": 29587672, + "step": 15108 + }, + { + "epoch": 2.002518223989397, + "grad_norm": 0.18524309992790222, + "learning_rate": 1.2452930956725692e-06, + "loss": 0.0016, + "num_input_tokens_seen": 29589080, + "step": 15109 + }, + { + "epoch": 2.002650762094102, + "grad_norm": 0.008161114528775215, + "learning_rate": 1.244992856331682e-06, + "loss": 0.0, + "num_input_tokens_seen": 29590432, + "step": 15110 + }, + { + "epoch": 2.002783300198807, + "grad_norm": 0.12214435636997223, + "learning_rate": 1.2446926411881984e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29592240, + "step": 15111 + }, + { + "epoch": 2.002915838303512, + "grad_norm": 6.472517967224121, + "learning_rate": 1.2443924502479077e-06, + "loss": 0.1308, + "num_input_tokens_seen": 29594880, + "step": 15112 + }, + { + "epoch": 2.003048376408217, + "grad_norm": 0.4883570671081543, + "learning_rate": 1.2440922835165964e-06, + "loss": 0.0046, + "num_input_tokens_seen": 29596896, + "step": 15113 + }, + { + "epoch": 2.0031809145129227, + "grad_norm": 3.0127828121185303, + "learning_rate": 1.2437921410000525e-06, + "loss": 0.0507, + "num_input_tokens_seen": 29599512, + "step": 15114 + }, + { + "epoch": 2.0033134526176277, + "grad_norm": 9.517441749572754, + "learning_rate": 1.2434920227040617e-06, + "loss": 0.1389, + "num_input_tokens_seen": 29601368, + "step": 15115 + }, + { + "epoch": 2.0034459907223328, + "grad_norm": 2.953688144683838, + "learning_rate": 1.2431919286344126e-06, + "loss": 0.0378, + "num_input_tokens_seen": 29602944, + "step": 15116 + }, + { + "epoch": 2.003578528827038, + "grad_norm": 0.25521379709243774, + "learning_rate": 1.2428918587968891e-06, + "loss": 0.0021, + "num_input_tokens_seen": 29604312, + "step": 15117 + }, + { + "epoch": 2.003711066931743, + "grad_norm": 7.0436015129089355, + "learning_rate": 1.2425918131972789e-06, + "loss": 0.0587, + "num_input_tokens_seen": 29605992, + "step": 15118 + }, + { + "epoch": 2.003843605036448, + "grad_norm": 0.0047504594549536705, + "learning_rate": 1.2422917918413658e-06, + "loss": 0.0, + "num_input_tokens_seen": 29607248, + "step": 15119 + }, + { + "epoch": 2.003976143141153, + "grad_norm": 0.08487773686647415, + "learning_rate": 1.2419917947349347e-06, + "loss": 0.0004, + "num_input_tokens_seen": 29608984, + "step": 15120 + }, + { + "epoch": 2.0041086812458584, + "grad_norm": 1.627276062965393, + "learning_rate": 1.2416918218837683e-06, + "loss": 0.012, + "num_input_tokens_seen": 29611736, + "step": 15121 + }, + { + "epoch": 2.0042412193505634, + "grad_norm": 5.123021125793457, + "learning_rate": 1.2413918732936531e-06, + "loss": 0.1271, + "num_input_tokens_seen": 29613368, + "step": 15122 + }, + { + "epoch": 2.0043737574552685, + "grad_norm": 2.0752036571502686, + "learning_rate": 1.24109194897037e-06, + "loss": 0.0386, + "num_input_tokens_seen": 29615496, + "step": 15123 + }, + { + "epoch": 2.0045062955599735, + "grad_norm": 5.433506011962891, + "learning_rate": 1.240792048919703e-06, + "loss": 0.0739, + "num_input_tokens_seen": 29617728, + "step": 15124 + }, + { + "epoch": 2.0046388336646785, + "grad_norm": 0.01832788623869419, + "learning_rate": 1.2404921731474345e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29621160, + "step": 15125 + }, + { + "epoch": 2.0047713717693836, + "grad_norm": 0.01260112039744854, + "learning_rate": 1.2401923216593444e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29623032, + "step": 15126 + }, + { + "epoch": 2.0049039098740886, + "grad_norm": 0.840470552444458, + "learning_rate": 1.2398924944612167e-06, + "loss": 0.011, + "num_input_tokens_seen": 29624856, + "step": 15127 + }, + { + "epoch": 2.005036447978794, + "grad_norm": 0.4741806089878082, + "learning_rate": 1.23959269155883e-06, + "loss": 0.003, + "num_input_tokens_seen": 29626432, + "step": 15128 + }, + { + "epoch": 2.005168986083499, + "grad_norm": 10.055848121643066, + "learning_rate": 1.2392929129579668e-06, + "loss": 0.086, + "num_input_tokens_seen": 29629224, + "step": 15129 + }, + { + "epoch": 2.005301524188204, + "grad_norm": 5.045922756195068, + "learning_rate": 1.2389931586644055e-06, + "loss": 0.0784, + "num_input_tokens_seen": 29631120, + "step": 15130 + }, + { + "epoch": 2.005434062292909, + "grad_norm": 0.01161034032702446, + "learning_rate": 1.2386934286839255e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29632512, + "step": 15131 + }, + { + "epoch": 2.0055666003976143, + "grad_norm": 0.11529643088579178, + "learning_rate": 1.238393723022307e-06, + "loss": 0.0007, + "num_input_tokens_seen": 29634728, + "step": 15132 + }, + { + "epoch": 2.0056991385023193, + "grad_norm": 4.420339107513428, + "learning_rate": 1.2380940416853281e-06, + "loss": 0.0503, + "num_input_tokens_seen": 29636776, + "step": 15133 + }, + { + "epoch": 2.0058316766070243, + "grad_norm": 0.012172609567642212, + "learning_rate": 1.2377943846787654e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29639592, + "step": 15134 + }, + { + "epoch": 2.00596421471173, + "grad_norm": 1.3013160228729248, + "learning_rate": 1.237494752008399e-06, + "loss": 0.0107, + "num_input_tokens_seen": 29641176, + "step": 15135 + }, + { + "epoch": 2.006096752816435, + "grad_norm": 0.016934173181653023, + "learning_rate": 1.2371951436800036e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29642672, + "step": 15136 + }, + { + "epoch": 2.00622929092114, + "grad_norm": 13.0425386428833, + "learning_rate": 1.2368955596993579e-06, + "loss": 0.215, + "num_input_tokens_seen": 29644856, + "step": 15137 + }, + { + "epoch": 2.006361829025845, + "grad_norm": 0.5551536679267883, + "learning_rate": 1.2365960000722373e-06, + "loss": 0.003, + "num_input_tokens_seen": 29648072, + "step": 15138 + }, + { + "epoch": 2.00649436713055, + "grad_norm": 0.7830763459205627, + "learning_rate": 1.2362964648044172e-06, + "loss": 0.0058, + "num_input_tokens_seen": 29649560, + "step": 15139 + }, + { + "epoch": 2.006626905235255, + "grad_norm": 3.7863247394561768, + "learning_rate": 1.2359969539016723e-06, + "loss": 0.0881, + "num_input_tokens_seen": 29651840, + "step": 15140 + }, + { + "epoch": 2.00675944333996, + "grad_norm": 0.06023525074124336, + "learning_rate": 1.235697467369778e-06, + "loss": 0.0004, + "num_input_tokens_seen": 29653584, + "step": 15141 + }, + { + "epoch": 2.0068919814446655, + "grad_norm": 0.13295461237430573, + "learning_rate": 1.2353980052145097e-06, + "loss": 0.0009, + "num_input_tokens_seen": 29654976, + "step": 15142 + }, + { + "epoch": 2.0070245195493706, + "grad_norm": 5.31843900680542, + "learning_rate": 1.2350985674416405e-06, + "loss": 0.0438, + "num_input_tokens_seen": 29657848, + "step": 15143 + }, + { + "epoch": 2.0071570576540756, + "grad_norm": 1.993880033493042, + "learning_rate": 1.2347991540569432e-06, + "loss": 0.0147, + "num_input_tokens_seen": 29659848, + "step": 15144 + }, + { + "epoch": 2.0072895957587806, + "grad_norm": 5.006574630737305, + "learning_rate": 1.234499765066191e-06, + "loss": 0.0817, + "num_input_tokens_seen": 29661752, + "step": 15145 + }, + { + "epoch": 2.0074221338634857, + "grad_norm": 0.037379782646894455, + "learning_rate": 1.2342004004751556e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29663624, + "step": 15146 + }, + { + "epoch": 2.0075546719681907, + "grad_norm": 11.495149612426758, + "learning_rate": 1.2339010602896099e-06, + "loss": 0.2195, + "num_input_tokens_seen": 29665784, + "step": 15147 + }, + { + "epoch": 2.0076872100728957, + "grad_norm": 0.45331811904907227, + "learning_rate": 1.2336017445153259e-06, + "loss": 0.004, + "num_input_tokens_seen": 29667904, + "step": 15148 + }, + { + "epoch": 2.0078197481776012, + "grad_norm": 0.0663229376077652, + "learning_rate": 1.2333024531580737e-06, + "loss": 0.0004, + "num_input_tokens_seen": 29669104, + "step": 15149 + }, + { + "epoch": 2.0079522862823063, + "grad_norm": 0.05696311593055725, + "learning_rate": 1.2330031862236243e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29671192, + "step": 15150 + }, + { + "epoch": 2.0080848243870113, + "grad_norm": 4.318670749664307, + "learning_rate": 1.2327039437177466e-06, + "loss": 0.084, + "num_input_tokens_seen": 29673240, + "step": 15151 + }, + { + "epoch": 2.0082173624917163, + "grad_norm": 6.141419887542725, + "learning_rate": 1.232404725646211e-06, + "loss": 0.1721, + "num_input_tokens_seen": 29676536, + "step": 15152 + }, + { + "epoch": 2.0083499005964214, + "grad_norm": 7.464241027832031, + "learning_rate": 1.232105532014788e-06, + "loss": 0.1398, + "num_input_tokens_seen": 29679320, + "step": 15153 + }, + { + "epoch": 2.0084824387011264, + "grad_norm": 6.127373218536377, + "learning_rate": 1.2318063628292446e-06, + "loss": 0.1356, + "num_input_tokens_seen": 29681480, + "step": 15154 + }, + { + "epoch": 2.008614976805832, + "grad_norm": 1.2671122550964355, + "learning_rate": 1.2315072180953498e-06, + "loss": 0.005, + "num_input_tokens_seen": 29682904, + "step": 15155 + }, + { + "epoch": 2.008747514910537, + "grad_norm": 2.4507052898406982, + "learning_rate": 1.2312080978188706e-06, + "loss": 0.0517, + "num_input_tokens_seen": 29684504, + "step": 15156 + }, + { + "epoch": 2.008880053015242, + "grad_norm": 6.647762775421143, + "learning_rate": 1.2309090020055737e-06, + "loss": 0.1557, + "num_input_tokens_seen": 29686880, + "step": 15157 + }, + { + "epoch": 2.009012591119947, + "grad_norm": 2.059262275695801, + "learning_rate": 1.2306099306612269e-06, + "loss": 0.0223, + "num_input_tokens_seen": 29688896, + "step": 15158 + }, + { + "epoch": 2.009145129224652, + "grad_norm": 9.299222946166992, + "learning_rate": 1.230310883791597e-06, + "loss": 0.0602, + "num_input_tokens_seen": 29690840, + "step": 15159 + }, + { + "epoch": 2.009277667329357, + "grad_norm": 0.022173378616571426, + "learning_rate": 1.2300118614024495e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29692296, + "step": 15160 + }, + { + "epoch": 2.009410205434062, + "grad_norm": 1.4448367357254028, + "learning_rate": 1.2297128634995491e-06, + "loss": 0.0052, + "num_input_tokens_seen": 29695232, + "step": 15161 + }, + { + "epoch": 2.0095427435387676, + "grad_norm": 0.004576769657433033, + "learning_rate": 1.2294138900886604e-06, + "loss": 0.0, + "num_input_tokens_seen": 29696816, + "step": 15162 + }, + { + "epoch": 2.0096752816434726, + "grad_norm": 0.49271130561828613, + "learning_rate": 1.2291149411755494e-06, + "loss": 0.0027, + "num_input_tokens_seen": 29698840, + "step": 15163 + }, + { + "epoch": 2.0098078197481777, + "grad_norm": 2.2824604511260986, + "learning_rate": 1.2288160167659777e-06, + "loss": 0.0253, + "num_input_tokens_seen": 29701192, + "step": 15164 + }, + { + "epoch": 2.0099403578528827, + "grad_norm": 0.44211632013320923, + "learning_rate": 1.2285171168657115e-06, + "loss": 0.0034, + "num_input_tokens_seen": 29703864, + "step": 15165 + }, + { + "epoch": 2.0100728959575878, + "grad_norm": 0.0071510993875563145, + "learning_rate": 1.2282182414805122e-06, + "loss": 0.0, + "num_input_tokens_seen": 29705888, + "step": 15166 + }, + { + "epoch": 2.010205434062293, + "grad_norm": 0.02814430743455887, + "learning_rate": 1.2279193906161419e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29707912, + "step": 15167 + }, + { + "epoch": 2.010337972166998, + "grad_norm": 1.4974178075790405, + "learning_rate": 1.227620564278364e-06, + "loss": 0.0208, + "num_input_tokens_seen": 29709648, + "step": 15168 + }, + { + "epoch": 2.0104705102717033, + "grad_norm": 3.9743356704711914, + "learning_rate": 1.2273217624729394e-06, + "loss": 0.0344, + "num_input_tokens_seen": 29711232, + "step": 15169 + }, + { + "epoch": 2.0106030483764084, + "grad_norm": 0.022412829101085663, + "learning_rate": 1.2270229852056282e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29712976, + "step": 15170 + }, + { + "epoch": 2.0107355864811134, + "grad_norm": 0.027235014364123344, + "learning_rate": 1.2267242324821933e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29714584, + "step": 15171 + }, + { + "epoch": 2.0108681245858184, + "grad_norm": 5.773441314697266, + "learning_rate": 1.2264255043083922e-06, + "loss": 0.0932, + "num_input_tokens_seen": 29716632, + "step": 15172 + }, + { + "epoch": 2.0110006626905235, + "grad_norm": 3.4036712646484375, + "learning_rate": 1.2261268006899871e-06, + "loss": 0.0605, + "num_input_tokens_seen": 29719256, + "step": 15173 + }, + { + "epoch": 2.0111332007952285, + "grad_norm": 6.906212329864502, + "learning_rate": 1.2258281216327362e-06, + "loss": 0.0501, + "num_input_tokens_seen": 29721216, + "step": 15174 + }, + { + "epoch": 2.0112657388999335, + "grad_norm": 0.5139371156692505, + "learning_rate": 1.2255294671423981e-06, + "loss": 0.0034, + "num_input_tokens_seen": 29722312, + "step": 15175 + }, + { + "epoch": 2.011398277004639, + "grad_norm": 7.2212324142456055, + "learning_rate": 1.2252308372247302e-06, + "loss": 0.1799, + "num_input_tokens_seen": 29724480, + "step": 15176 + }, + { + "epoch": 2.011530815109344, + "grad_norm": 0.012958239763975143, + "learning_rate": 1.2249322318854917e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29725984, + "step": 15177 + }, + { + "epoch": 2.011663353214049, + "grad_norm": 0.0588473379611969, + "learning_rate": 1.2246336511304387e-06, + "loss": 0.0007, + "num_input_tokens_seen": 29727384, + "step": 15178 + }, + { + "epoch": 2.011795891318754, + "grad_norm": 4.8784332275390625, + "learning_rate": 1.22433509496533e-06, + "loss": 0.083, + "num_input_tokens_seen": 29729664, + "step": 15179 + }, + { + "epoch": 2.011928429423459, + "grad_norm": 1.375204086303711, + "learning_rate": 1.2240365633959204e-06, + "loss": 0.0118, + "num_input_tokens_seen": 29731392, + "step": 15180 + }, + { + "epoch": 2.012060967528164, + "grad_norm": 4.534610748291016, + "learning_rate": 1.2237380564279653e-06, + "loss": 0.0186, + "num_input_tokens_seen": 29733168, + "step": 15181 + }, + { + "epoch": 2.0121935056328693, + "grad_norm": 3.7775685787200928, + "learning_rate": 1.2234395740672214e-06, + "loss": 0.0965, + "num_input_tokens_seen": 29735376, + "step": 15182 + }, + { + "epoch": 2.0123260437375747, + "grad_norm": 0.049699317663908005, + "learning_rate": 1.2231411163194427e-06, + "loss": 0.0002, + "num_input_tokens_seen": 29736832, + "step": 15183 + }, + { + "epoch": 2.0124585818422798, + "grad_norm": 0.007601501420140266, + "learning_rate": 1.222842683190385e-06, + "loss": 0.0, + "num_input_tokens_seen": 29738048, + "step": 15184 + }, + { + "epoch": 2.012591119946985, + "grad_norm": 0.006033395882695913, + "learning_rate": 1.2225442746858013e-06, + "loss": 0.0, + "num_input_tokens_seen": 29740464, + "step": 15185 + }, + { + "epoch": 2.01272365805169, + "grad_norm": 0.007894371636211872, + "learning_rate": 1.2222458908114454e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29741992, + "step": 15186 + }, + { + "epoch": 2.012856196156395, + "grad_norm": 5.792721271514893, + "learning_rate": 1.221947531573069e-06, + "loss": 0.0975, + "num_input_tokens_seen": 29743776, + "step": 15187 + }, + { + "epoch": 2.0129887342611, + "grad_norm": 0.027021728456020355, + "learning_rate": 1.2216491969764264e-06, + "loss": 0.0002, + "num_input_tokens_seen": 29746424, + "step": 15188 + }, + { + "epoch": 2.013121272365805, + "grad_norm": 2.9402287006378174, + "learning_rate": 1.2213508870272697e-06, + "loss": 0.0246, + "num_input_tokens_seen": 29747968, + "step": 15189 + }, + { + "epoch": 2.0132538104705104, + "grad_norm": 0.02559376321732998, + "learning_rate": 1.22105260173135e-06, + "loss": 0.0002, + "num_input_tokens_seen": 29749696, + "step": 15190 + }, + { + "epoch": 2.0133863485752155, + "grad_norm": 8.399367332458496, + "learning_rate": 1.2207543410944187e-06, + "loss": 0.0762, + "num_input_tokens_seen": 29752280, + "step": 15191 + }, + { + "epoch": 2.0135188866799205, + "grad_norm": 0.15359823405742645, + "learning_rate": 1.2204561051222257e-06, + "loss": 0.0016, + "num_input_tokens_seen": 29754736, + "step": 15192 + }, + { + "epoch": 2.0136514247846256, + "grad_norm": 0.07110943645238876, + "learning_rate": 1.2201578938205213e-06, + "loss": 0.0004, + "num_input_tokens_seen": 29756904, + "step": 15193 + }, + { + "epoch": 2.0137839628893306, + "grad_norm": 2.5328383445739746, + "learning_rate": 1.2198597071950554e-06, + "loss": 0.0143, + "num_input_tokens_seen": 29758304, + "step": 15194 + }, + { + "epoch": 2.0139165009940356, + "grad_norm": 0.02865474298596382, + "learning_rate": 1.2195615452515786e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29759600, + "step": 15195 + }, + { + "epoch": 2.0140490390987407, + "grad_norm": 4.0343828201293945, + "learning_rate": 1.2192634079958383e-06, + "loss": 0.0431, + "num_input_tokens_seen": 29762208, + "step": 15196 + }, + { + "epoch": 2.014181577203446, + "grad_norm": 0.005436458624899387, + "learning_rate": 1.2189652954335831e-06, + "loss": 0.0, + "num_input_tokens_seen": 29764696, + "step": 15197 + }, + { + "epoch": 2.014314115308151, + "grad_norm": 6.861377716064453, + "learning_rate": 1.2186672075705597e-06, + "loss": 0.1206, + "num_input_tokens_seen": 29767304, + "step": 15198 + }, + { + "epoch": 2.0144466534128562, + "grad_norm": 0.002730043139308691, + "learning_rate": 1.2183691444125175e-06, + "loss": 0.0, + "num_input_tokens_seen": 29768368, + "step": 15199 + }, + { + "epoch": 2.0145791915175613, + "grad_norm": 2.558640480041504, + "learning_rate": 1.2180711059652014e-06, + "loss": 0.0122, + "num_input_tokens_seen": 29770232, + "step": 15200 + }, + { + "epoch": 2.0147117296222663, + "grad_norm": 7.908247470855713, + "learning_rate": 1.2177730922343596e-06, + "loss": 0.0613, + "num_input_tokens_seen": 29772016, + "step": 15201 + }, + { + "epoch": 2.0148442677269713, + "grad_norm": 4.929527759552002, + "learning_rate": 1.2174751032257377e-06, + "loss": 0.1274, + "num_input_tokens_seen": 29773888, + "step": 15202 + }, + { + "epoch": 2.014976805831677, + "grad_norm": 0.01971244625747204, + "learning_rate": 1.217177138945079e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29776472, + "step": 15203 + }, + { + "epoch": 2.015109343936382, + "grad_norm": 0.012376708909869194, + "learning_rate": 1.216879199398131e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29778960, + "step": 15204 + }, + { + "epoch": 2.015241882041087, + "grad_norm": 3.049663543701172, + "learning_rate": 1.2165812845906367e-06, + "loss": 0.0538, + "num_input_tokens_seen": 29780928, + "step": 15205 + }, + { + "epoch": 2.015374420145792, + "grad_norm": 2.2577626705169678, + "learning_rate": 1.2162833945283413e-06, + "loss": 0.0196, + "num_input_tokens_seen": 29783616, + "step": 15206 + }, + { + "epoch": 2.015506958250497, + "grad_norm": 0.9467397928237915, + "learning_rate": 1.2159855292169876e-06, + "loss": 0.0059, + "num_input_tokens_seen": 29785344, + "step": 15207 + }, + { + "epoch": 2.015639496355202, + "grad_norm": 0.0121595598757267, + "learning_rate": 1.2156876886623187e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29786688, + "step": 15208 + }, + { + "epoch": 2.015772034459907, + "grad_norm": 4.222128868103027, + "learning_rate": 1.215389872870076e-06, + "loss": 0.0808, + "num_input_tokens_seen": 29788960, + "step": 15209 + }, + { + "epoch": 2.0159045725646125, + "grad_norm": 0.03825254738330841, + "learning_rate": 1.215092081846004e-06, + "loss": 0.0002, + "num_input_tokens_seen": 29790944, + "step": 15210 + }, + { + "epoch": 2.0160371106693176, + "grad_norm": 5.104211330413818, + "learning_rate": 1.2147943155958422e-06, + "loss": 0.1107, + "num_input_tokens_seen": 29792952, + "step": 15211 + }, + { + "epoch": 2.0161696487740226, + "grad_norm": 0.14616405963897705, + "learning_rate": 1.2144965741253334e-06, + "loss": 0.0013, + "num_input_tokens_seen": 29794592, + "step": 15212 + }, + { + "epoch": 2.0163021868787276, + "grad_norm": 7.3096418380737305, + "learning_rate": 1.2141988574402176e-06, + "loss": 0.2027, + "num_input_tokens_seen": 29796712, + "step": 15213 + }, + { + "epoch": 2.0164347249834327, + "grad_norm": 0.05767657235264778, + "learning_rate": 1.2139011655462338e-06, + "loss": 0.0004, + "num_input_tokens_seen": 29798760, + "step": 15214 + }, + { + "epoch": 2.0165672630881377, + "grad_norm": 0.0084043825045228, + "learning_rate": 1.2136034984491239e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29800232, + "step": 15215 + }, + { + "epoch": 2.0166998011928428, + "grad_norm": 1.687228798866272, + "learning_rate": 1.2133058561546256e-06, + "loss": 0.0072, + "num_input_tokens_seen": 29802072, + "step": 15216 + }, + { + "epoch": 2.0168323392975482, + "grad_norm": 5.91296911239624, + "learning_rate": 1.2130082386684774e-06, + "loss": 0.0874, + "num_input_tokens_seen": 29804128, + "step": 15217 + }, + { + "epoch": 2.0169648774022533, + "grad_norm": 1.568162202835083, + "learning_rate": 1.212710645996419e-06, + "loss": 0.0155, + "num_input_tokens_seen": 29806688, + "step": 15218 + }, + { + "epoch": 2.0170974155069583, + "grad_norm": 0.6029535531997681, + "learning_rate": 1.2124130781441866e-06, + "loss": 0.0062, + "num_input_tokens_seen": 29807960, + "step": 15219 + }, + { + "epoch": 2.0172299536116634, + "grad_norm": 0.06386114656925201, + "learning_rate": 1.2121155351175192e-06, + "loss": 0.0004, + "num_input_tokens_seen": 29809432, + "step": 15220 + }, + { + "epoch": 2.0173624917163684, + "grad_norm": 1.3042752742767334, + "learning_rate": 1.2118180169221528e-06, + "loss": 0.0114, + "num_input_tokens_seen": 29810968, + "step": 15221 + }, + { + "epoch": 2.0174950298210734, + "grad_norm": 0.684651792049408, + "learning_rate": 1.2115205235638236e-06, + "loss": 0.0041, + "num_input_tokens_seen": 29813104, + "step": 15222 + }, + { + "epoch": 2.0176275679257785, + "grad_norm": 0.15129369497299194, + "learning_rate": 1.2112230550482667e-06, + "loss": 0.0012, + "num_input_tokens_seen": 29815968, + "step": 15223 + }, + { + "epoch": 2.017760106030484, + "grad_norm": 4.362415790557861, + "learning_rate": 1.2109256113812184e-06, + "loss": 0.1207, + "num_input_tokens_seen": 29818456, + "step": 15224 + }, + { + "epoch": 2.017892644135189, + "grad_norm": 0.029975976794958115, + "learning_rate": 1.2106281925684143e-06, + "loss": 0.0002, + "num_input_tokens_seen": 29821184, + "step": 15225 + }, + { + "epoch": 2.018025182239894, + "grad_norm": 0.04778642579913139, + "learning_rate": 1.2103307986155886e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29822928, + "step": 15226 + }, + { + "epoch": 2.018157720344599, + "grad_norm": 8.91598129272461, + "learning_rate": 1.2100334295284743e-06, + "loss": 0.1414, + "num_input_tokens_seen": 29824496, + "step": 15227 + }, + { + "epoch": 2.018290258449304, + "grad_norm": 3.7696890830993652, + "learning_rate": 1.2097360853128056e-06, + "loss": 0.0197, + "num_input_tokens_seen": 29826248, + "step": 15228 + }, + { + "epoch": 2.018422796554009, + "grad_norm": 1.7981632947921753, + "learning_rate": 1.2094387659743146e-06, + "loss": 0.0167, + "num_input_tokens_seen": 29827568, + "step": 15229 + }, + { + "epoch": 2.018555334658714, + "grad_norm": 0.0281238816678524, + "learning_rate": 1.2091414715187342e-06, + "loss": 0.0002, + "num_input_tokens_seen": 29829200, + "step": 15230 + }, + { + "epoch": 2.0186878727634197, + "grad_norm": 7.779087066650391, + "learning_rate": 1.208844201951798e-06, + "loss": 0.0916, + "num_input_tokens_seen": 29831248, + "step": 15231 + }, + { + "epoch": 2.0188204108681247, + "grad_norm": 0.7851071953773499, + "learning_rate": 1.2085469572792358e-06, + "loss": 0.0051, + "num_input_tokens_seen": 29833504, + "step": 15232 + }, + { + "epoch": 2.0189529489728297, + "grad_norm": 3.720806837081909, + "learning_rate": 1.2082497375067794e-06, + "loss": 0.0078, + "num_input_tokens_seen": 29835680, + "step": 15233 + }, + { + "epoch": 2.0190854870775348, + "grad_norm": 9.002038955688477, + "learning_rate": 1.2079525426401584e-06, + "loss": 0.0302, + "num_input_tokens_seen": 29837360, + "step": 15234 + }, + { + "epoch": 2.01921802518224, + "grad_norm": 0.008532645180821419, + "learning_rate": 1.2076553726851037e-06, + "loss": 0.0, + "num_input_tokens_seen": 29838520, + "step": 15235 + }, + { + "epoch": 2.019350563286945, + "grad_norm": 1.1136425733566284, + "learning_rate": 1.2073582276473458e-06, + "loss": 0.0268, + "num_input_tokens_seen": 29840016, + "step": 15236 + }, + { + "epoch": 2.01948310139165, + "grad_norm": 0.3532464802265167, + "learning_rate": 1.207061107532613e-06, + "loss": 0.003, + "num_input_tokens_seen": 29842464, + "step": 15237 + }, + { + "epoch": 2.0196156394963554, + "grad_norm": 0.44632378220558167, + "learning_rate": 1.206764012346634e-06, + "loss": 0.001, + "num_input_tokens_seen": 29843464, + "step": 15238 + }, + { + "epoch": 2.0197481776010604, + "grad_norm": 0.03249475359916687, + "learning_rate": 1.2064669420951372e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29844944, + "step": 15239 + }, + { + "epoch": 2.0198807157057654, + "grad_norm": 0.2376423329114914, + "learning_rate": 1.206169896783849e-06, + "loss": 0.0012, + "num_input_tokens_seen": 29846344, + "step": 15240 + }, + { + "epoch": 2.0200132538104705, + "grad_norm": 6.868544101715088, + "learning_rate": 1.2058728764184976e-06, + "loss": 0.0905, + "num_input_tokens_seen": 29848288, + "step": 15241 + }, + { + "epoch": 2.0201457919151755, + "grad_norm": 1.6024513244628906, + "learning_rate": 1.2055758810048112e-06, + "loss": 0.0185, + "num_input_tokens_seen": 29850584, + "step": 15242 + }, + { + "epoch": 2.0202783300198806, + "grad_norm": 21.54218292236328, + "learning_rate": 1.2052789105485147e-06, + "loss": 0.1268, + "num_input_tokens_seen": 29852320, + "step": 15243 + }, + { + "epoch": 2.020410868124586, + "grad_norm": 7.852740287780762, + "learning_rate": 1.2049819650553337e-06, + "loss": 0.1137, + "num_input_tokens_seen": 29854552, + "step": 15244 + }, + { + "epoch": 2.020543406229291, + "grad_norm": 0.07821565121412277, + "learning_rate": 1.2046850445309931e-06, + "loss": 0.0005, + "num_input_tokens_seen": 29856416, + "step": 15245 + }, + { + "epoch": 2.020675944333996, + "grad_norm": 0.07747553288936615, + "learning_rate": 1.2043881489812195e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29857760, + "step": 15246 + }, + { + "epoch": 2.020808482438701, + "grad_norm": 7.670420169830322, + "learning_rate": 1.204091278411735e-06, + "loss": 0.1363, + "num_input_tokens_seen": 29859568, + "step": 15247 + }, + { + "epoch": 2.020941020543406, + "grad_norm": 1.9448819160461426, + "learning_rate": 1.2037944328282658e-06, + "loss": 0.0176, + "num_input_tokens_seen": 29862856, + "step": 15248 + }, + { + "epoch": 2.0210735586481112, + "grad_norm": 4.692911624908447, + "learning_rate": 1.2034976122365342e-06, + "loss": 0.0864, + "num_input_tokens_seen": 29864464, + "step": 15249 + }, + { + "epoch": 2.0212060967528163, + "grad_norm": 2.484718084335327, + "learning_rate": 1.2032008166422619e-06, + "loss": 0.0264, + "num_input_tokens_seen": 29866120, + "step": 15250 + }, + { + "epoch": 2.0213386348575217, + "grad_norm": 0.4270928204059601, + "learning_rate": 1.202904046051173e-06, + "loss": 0.0013, + "num_input_tokens_seen": 29868520, + "step": 15251 + }, + { + "epoch": 2.021471172962227, + "grad_norm": 0.013615209609270096, + "learning_rate": 1.2026073004689898e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29870120, + "step": 15252 + }, + { + "epoch": 2.021603711066932, + "grad_norm": 0.7138726711273193, + "learning_rate": 1.2023105799014312e-06, + "loss": 0.0064, + "num_input_tokens_seen": 29872192, + "step": 15253 + }, + { + "epoch": 2.021736249171637, + "grad_norm": 0.44421058893203735, + "learning_rate": 1.2020138843542212e-06, + "loss": 0.0031, + "num_input_tokens_seen": 29874232, + "step": 15254 + }, + { + "epoch": 2.021868787276342, + "grad_norm": 2.6380679607391357, + "learning_rate": 1.201717213833078e-06, + "loss": 0.0705, + "num_input_tokens_seen": 29876648, + "step": 15255 + }, + { + "epoch": 2.022001325381047, + "grad_norm": 1.0653774738311768, + "learning_rate": 1.2014205683437233e-06, + "loss": 0.0068, + "num_input_tokens_seen": 29878928, + "step": 15256 + }, + { + "epoch": 2.022133863485752, + "grad_norm": 0.010241253301501274, + "learning_rate": 1.201123947891876e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29880376, + "step": 15257 + }, + { + "epoch": 2.0222664015904575, + "grad_norm": 0.19768106937408447, + "learning_rate": 1.200827352483254e-06, + "loss": 0.001, + "num_input_tokens_seen": 29882480, + "step": 15258 + }, + { + "epoch": 2.0223989396951625, + "grad_norm": 4.946951389312744, + "learning_rate": 1.200530782123578e-06, + "loss": 0.0185, + "num_input_tokens_seen": 29885456, + "step": 15259 + }, + { + "epoch": 2.0225314777998675, + "grad_norm": 0.0911523774266243, + "learning_rate": 1.2002342368185638e-06, + "loss": 0.0004, + "num_input_tokens_seen": 29887768, + "step": 15260 + }, + { + "epoch": 2.0226640159045726, + "grad_norm": 2.6283979415893555, + "learning_rate": 1.1999377165739313e-06, + "loss": 0.0332, + "num_input_tokens_seen": 29890248, + "step": 15261 + }, + { + "epoch": 2.0227965540092776, + "grad_norm": 0.0077752284705638885, + "learning_rate": 1.1996412213953965e-06, + "loss": 0.0, + "num_input_tokens_seen": 29893032, + "step": 15262 + }, + { + "epoch": 2.0229290921139826, + "grad_norm": 7.464926719665527, + "learning_rate": 1.1993447512886762e-06, + "loss": 0.106, + "num_input_tokens_seen": 29895488, + "step": 15263 + }, + { + "epoch": 2.0230616302186877, + "grad_norm": 1.0570793151855469, + "learning_rate": 1.199048306259485e-06, + "loss": 0.0046, + "num_input_tokens_seen": 29897672, + "step": 15264 + }, + { + "epoch": 2.023194168323393, + "grad_norm": 7.703364372253418, + "learning_rate": 1.1987518863135411e-06, + "loss": 0.1396, + "num_input_tokens_seen": 29899144, + "step": 15265 + }, + { + "epoch": 2.023326706428098, + "grad_norm": 7.377545356750488, + "learning_rate": 1.1984554914565574e-06, + "loss": 0.0811, + "num_input_tokens_seen": 29901024, + "step": 15266 + }, + { + "epoch": 2.0234592445328032, + "grad_norm": 2.522902488708496, + "learning_rate": 1.198159121694251e-06, + "loss": 0.0191, + "num_input_tokens_seen": 29902280, + "step": 15267 + }, + { + "epoch": 2.0235917826375083, + "grad_norm": 0.2588520050048828, + "learning_rate": 1.1978627770323346e-06, + "loss": 0.0017, + "num_input_tokens_seen": 29904928, + "step": 15268 + }, + { + "epoch": 2.0237243207422133, + "grad_norm": 3.2792465686798096, + "learning_rate": 1.1975664574765223e-06, + "loss": 0.0315, + "num_input_tokens_seen": 29906792, + "step": 15269 + }, + { + "epoch": 2.0238568588469183, + "grad_norm": 2.2357795238494873, + "learning_rate": 1.1972701630325259e-06, + "loss": 0.0241, + "num_input_tokens_seen": 29908608, + "step": 15270 + }, + { + "epoch": 2.0239893969516234, + "grad_norm": 4.498013019561768, + "learning_rate": 1.19697389370606e-06, + "loss": 0.0515, + "num_input_tokens_seen": 29910512, + "step": 15271 + }, + { + "epoch": 2.024121935056329, + "grad_norm": 0.0038268158677965403, + "learning_rate": 1.1966776495028367e-06, + "loss": 0.0, + "num_input_tokens_seen": 29911696, + "step": 15272 + }, + { + "epoch": 2.024254473161034, + "grad_norm": 0.4202002286911011, + "learning_rate": 1.1963814304285678e-06, + "loss": 0.0013, + "num_input_tokens_seen": 29913736, + "step": 15273 + }, + { + "epoch": 2.024387011265739, + "grad_norm": 0.04044542461633682, + "learning_rate": 1.196085236488964e-06, + "loss": 0.0002, + "num_input_tokens_seen": 29914976, + "step": 15274 + }, + { + "epoch": 2.024519549370444, + "grad_norm": 41.17549514770508, + "learning_rate": 1.195789067689737e-06, + "loss": 0.1119, + "num_input_tokens_seen": 29916816, + "step": 15275 + }, + { + "epoch": 2.024652087475149, + "grad_norm": 0.0474514402449131, + "learning_rate": 1.1954929240365948e-06, + "loss": 0.0002, + "num_input_tokens_seen": 29918648, + "step": 15276 + }, + { + "epoch": 2.024784625579854, + "grad_norm": 4.464804172515869, + "learning_rate": 1.1951968055352492e-06, + "loss": 0.0194, + "num_input_tokens_seen": 29920696, + "step": 15277 + }, + { + "epoch": 2.024917163684559, + "grad_norm": 1.248387336730957, + "learning_rate": 1.1949007121914104e-06, + "loss": 0.0072, + "num_input_tokens_seen": 29922424, + "step": 15278 + }, + { + "epoch": 2.0250497017892646, + "grad_norm": 0.38243377208709717, + "learning_rate": 1.1946046440107862e-06, + "loss": 0.0018, + "num_input_tokens_seen": 29924016, + "step": 15279 + }, + { + "epoch": 2.0251822398939696, + "grad_norm": 0.020485827699303627, + "learning_rate": 1.194308600999085e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29925976, + "step": 15280 + }, + { + "epoch": 2.0253147779986747, + "grad_norm": 5.347994327545166, + "learning_rate": 1.1940125831620136e-06, + "loss": 0.0599, + "num_input_tokens_seen": 29928000, + "step": 15281 + }, + { + "epoch": 2.0254473161033797, + "grad_norm": 5.370848655700684, + "learning_rate": 1.1937165905052816e-06, + "loss": 0.0452, + "num_input_tokens_seen": 29929752, + "step": 15282 + }, + { + "epoch": 2.0255798542080847, + "grad_norm": 1.3163117170333862, + "learning_rate": 1.1934206230345941e-06, + "loss": 0.0034, + "num_input_tokens_seen": 29932064, + "step": 15283 + }, + { + "epoch": 2.0257123923127898, + "grad_norm": 23.513174057006836, + "learning_rate": 1.193124680755659e-06, + "loss": 0.0903, + "num_input_tokens_seen": 29933504, + "step": 15284 + }, + { + "epoch": 2.0258449304174952, + "grad_norm": 6.303351879119873, + "learning_rate": 1.192828763674182e-06, + "loss": 0.1034, + "num_input_tokens_seen": 29935928, + "step": 15285 + }, + { + "epoch": 2.0259774685222003, + "grad_norm": 0.9886231422424316, + "learning_rate": 1.192532871795867e-06, + "loss": 0.0022, + "num_input_tokens_seen": 29937720, + "step": 15286 + }, + { + "epoch": 2.0261100066269053, + "grad_norm": 0.08166089653968811, + "learning_rate": 1.1922370051264212e-06, + "loss": 0.0005, + "num_input_tokens_seen": 29939936, + "step": 15287 + }, + { + "epoch": 2.0262425447316104, + "grad_norm": 0.05150807648897171, + "learning_rate": 1.1919411636715473e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29941608, + "step": 15288 + }, + { + "epoch": 2.0263750828363154, + "grad_norm": 0.008243517950177193, + "learning_rate": 1.1916453474369509e-06, + "loss": 0.0, + "num_input_tokens_seen": 29942688, + "step": 15289 + }, + { + "epoch": 2.0265076209410204, + "grad_norm": 2.652451515197754, + "learning_rate": 1.1913495564283347e-06, + "loss": 0.0262, + "num_input_tokens_seen": 29945032, + "step": 15290 + }, + { + "epoch": 2.0266401590457255, + "grad_norm": 0.046244293451309204, + "learning_rate": 1.1910537906514022e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29947072, + "step": 15291 + }, + { + "epoch": 2.026772697150431, + "grad_norm": 0.011096510104835033, + "learning_rate": 1.1907580501118543e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29948936, + "step": 15292 + }, + { + "epoch": 2.026905235255136, + "grad_norm": 0.0010619750246405602, + "learning_rate": 1.1904623348153957e-06, + "loss": 0.0, + "num_input_tokens_seen": 29950152, + "step": 15293 + }, + { + "epoch": 2.027037773359841, + "grad_norm": 7.358760356903076, + "learning_rate": 1.1901666447677254e-06, + "loss": 0.0592, + "num_input_tokens_seen": 29952048, + "step": 15294 + }, + { + "epoch": 2.027170311464546, + "grad_norm": 8.173836708068848, + "learning_rate": 1.1898709799745469e-06, + "loss": 0.1449, + "num_input_tokens_seen": 29954272, + "step": 15295 + }, + { + "epoch": 2.027302849569251, + "grad_norm": 3.8574378490448, + "learning_rate": 1.1895753404415597e-06, + "loss": 0.0348, + "num_input_tokens_seen": 29956792, + "step": 15296 + }, + { + "epoch": 2.027435387673956, + "grad_norm": 2.9293763637542725, + "learning_rate": 1.189279726174463e-06, + "loss": 0.0246, + "num_input_tokens_seen": 29958752, + "step": 15297 + }, + { + "epoch": 2.027567925778661, + "grad_norm": 0.022476427257061005, + "learning_rate": 1.1889841371789585e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29960248, + "step": 15298 + }, + { + "epoch": 2.0277004638833667, + "grad_norm": 8.384184837341309, + "learning_rate": 1.1886885734607439e-06, + "loss": 0.1421, + "num_input_tokens_seen": 29962328, + "step": 15299 + }, + { + "epoch": 2.0278330019880717, + "grad_norm": 7.027342796325684, + "learning_rate": 1.1883930350255177e-06, + "loss": 0.2152, + "num_input_tokens_seen": 29963808, + "step": 15300 + }, + { + "epoch": 2.0279655400927767, + "grad_norm": 4.785160541534424, + "learning_rate": 1.1880975218789795e-06, + "loss": 0.1047, + "num_input_tokens_seen": 29965368, + "step": 15301 + }, + { + "epoch": 2.028098078197482, + "grad_norm": 0.04877548664808273, + "learning_rate": 1.1878020340268247e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29966448, + "step": 15302 + }, + { + "epoch": 2.028230616302187, + "grad_norm": 3.3449134826660156, + "learning_rate": 1.1875065714747534e-06, + "loss": 0.0544, + "num_input_tokens_seen": 29968504, + "step": 15303 + }, + { + "epoch": 2.028363154406892, + "grad_norm": 0.007268261164426804, + "learning_rate": 1.1872111342284607e-06, + "loss": 0.0, + "num_input_tokens_seen": 29970248, + "step": 15304 + }, + { + "epoch": 2.028495692511597, + "grad_norm": 2.8656768798828125, + "learning_rate": 1.186915722293643e-06, + "loss": 0.045, + "num_input_tokens_seen": 29971792, + "step": 15305 + }, + { + "epoch": 2.0286282306163024, + "grad_norm": 2.714986801147461, + "learning_rate": 1.1866203356759951e-06, + "loss": 0.0388, + "num_input_tokens_seen": 29973912, + "step": 15306 + }, + { + "epoch": 2.0287607687210074, + "grad_norm": 3.7185399532318115, + "learning_rate": 1.186324974381213e-06, + "loss": 0.0388, + "num_input_tokens_seen": 29976600, + "step": 15307 + }, + { + "epoch": 2.0288933068257125, + "grad_norm": 3.9431533813476562, + "learning_rate": 1.1860296384149927e-06, + "loss": 0.0192, + "num_input_tokens_seen": 29978608, + "step": 15308 + }, + { + "epoch": 2.0290258449304175, + "grad_norm": 8.5753173828125, + "learning_rate": 1.1857343277830274e-06, + "loss": 0.1883, + "num_input_tokens_seen": 29980800, + "step": 15309 + }, + { + "epoch": 2.0291583830351225, + "grad_norm": 15.502911567687988, + "learning_rate": 1.1854390424910109e-06, + "loss": 0.0947, + "num_input_tokens_seen": 29982400, + "step": 15310 + }, + { + "epoch": 2.0292909211398276, + "grad_norm": 6.202790260314941, + "learning_rate": 1.1851437825446357e-06, + "loss": 0.108, + "num_input_tokens_seen": 29984176, + "step": 15311 + }, + { + "epoch": 2.0294234592445326, + "grad_norm": 1.4931821823120117, + "learning_rate": 1.1848485479495952e-06, + "loss": 0.0043, + "num_input_tokens_seen": 29985728, + "step": 15312 + }, + { + "epoch": 2.029555997349238, + "grad_norm": 6.15144157409668, + "learning_rate": 1.1845533387115833e-06, + "loss": 0.0126, + "num_input_tokens_seen": 29987344, + "step": 15313 + }, + { + "epoch": 2.029688535453943, + "grad_norm": 19.61886215209961, + "learning_rate": 1.1842581548362903e-06, + "loss": 0.065, + "num_input_tokens_seen": 29989896, + "step": 15314 + }, + { + "epoch": 2.029821073558648, + "grad_norm": 0.01945546269416809, + "learning_rate": 1.1839629963294078e-06, + "loss": 0.0001, + "num_input_tokens_seen": 29991496, + "step": 15315 + }, + { + "epoch": 2.029953611663353, + "grad_norm": 11.86999225616455, + "learning_rate": 1.1836678631966264e-06, + "loss": 0.2258, + "num_input_tokens_seen": 29993552, + "step": 15316 + }, + { + "epoch": 2.0300861497680582, + "grad_norm": 0.04611857980489731, + "learning_rate": 1.1833727554436362e-06, + "loss": 0.0003, + "num_input_tokens_seen": 29995008, + "step": 15317 + }, + { + "epoch": 2.0302186878727633, + "grad_norm": 0.006616382859647274, + "learning_rate": 1.183077673076127e-06, + "loss": 0.0, + "num_input_tokens_seen": 29996512, + "step": 15318 + }, + { + "epoch": 2.0303512259774683, + "grad_norm": 10.329608917236328, + "learning_rate": 1.1827826160997902e-06, + "loss": 0.1922, + "num_input_tokens_seen": 29998448, + "step": 15319 + }, + { + "epoch": 2.030483764082174, + "grad_norm": 3.6575441360473633, + "learning_rate": 1.1824875845203128e-06, + "loss": 0.0373, + "num_input_tokens_seen": 30000648, + "step": 15320 + }, + { + "epoch": 2.030616302186879, + "grad_norm": 4.689066410064697, + "learning_rate": 1.182192578343384e-06, + "loss": 0.0727, + "num_input_tokens_seen": 30002952, + "step": 15321 + }, + { + "epoch": 2.030748840291584, + "grad_norm": 0.13371823728084564, + "learning_rate": 1.1818975975746912e-06, + "loss": 0.0006, + "num_input_tokens_seen": 30005776, + "step": 15322 + }, + { + "epoch": 2.030881378396289, + "grad_norm": 0.005644764751195908, + "learning_rate": 1.1816026422199214e-06, + "loss": 0.0, + "num_input_tokens_seen": 30007320, + "step": 15323 + }, + { + "epoch": 2.031013916500994, + "grad_norm": 2.0594704151153564, + "learning_rate": 1.1813077122847621e-06, + "loss": 0.0107, + "num_input_tokens_seen": 30009336, + "step": 15324 + }, + { + "epoch": 2.031146454605699, + "grad_norm": 0.21792414784431458, + "learning_rate": 1.1810128077749007e-06, + "loss": 0.0015, + "num_input_tokens_seen": 30012256, + "step": 15325 + }, + { + "epoch": 2.031278992710404, + "grad_norm": 2.6742823123931885, + "learning_rate": 1.1807179286960222e-06, + "loss": 0.014, + "num_input_tokens_seen": 30014024, + "step": 15326 + }, + { + "epoch": 2.0314115308151095, + "grad_norm": 3.1119117736816406, + "learning_rate": 1.1804230750538123e-06, + "loss": 0.0187, + "num_input_tokens_seen": 30015544, + "step": 15327 + }, + { + "epoch": 2.0315440689198145, + "grad_norm": 0.002943409839645028, + "learning_rate": 1.1801282468539546e-06, + "loss": 0.0, + "num_input_tokens_seen": 30016928, + "step": 15328 + }, + { + "epoch": 2.0316766070245196, + "grad_norm": 7.745203971862793, + "learning_rate": 1.1798334441021362e-06, + "loss": 0.0868, + "num_input_tokens_seen": 30019240, + "step": 15329 + }, + { + "epoch": 2.0318091451292246, + "grad_norm": 6.1142168045043945, + "learning_rate": 1.1795386668040384e-06, + "loss": 0.0452, + "num_input_tokens_seen": 30021848, + "step": 15330 + }, + { + "epoch": 2.0319416832339297, + "grad_norm": 11.032805442810059, + "learning_rate": 1.1792439149653473e-06, + "loss": 0.1326, + "num_input_tokens_seen": 30023656, + "step": 15331 + }, + { + "epoch": 2.0320742213386347, + "grad_norm": 0.008492165245115757, + "learning_rate": 1.1789491885917446e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30025192, + "step": 15332 + }, + { + "epoch": 2.03220675944334, + "grad_norm": 11.665322303771973, + "learning_rate": 1.178654487688912e-06, + "loss": 0.1104, + "num_input_tokens_seen": 30027864, + "step": 15333 + }, + { + "epoch": 2.032339297548045, + "grad_norm": 0.021263157948851585, + "learning_rate": 1.1783598122625331e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30029488, + "step": 15334 + }, + { + "epoch": 2.0324718356527502, + "grad_norm": 0.06934729218482971, + "learning_rate": 1.178065162318289e-06, + "loss": 0.0004, + "num_input_tokens_seen": 30031128, + "step": 15335 + }, + { + "epoch": 2.0326043737574553, + "grad_norm": 3.3151307106018066, + "learning_rate": 1.1777705378618596e-06, + "loss": 0.0805, + "num_input_tokens_seen": 30033104, + "step": 15336 + }, + { + "epoch": 2.0327369118621603, + "grad_norm": 0.015553489327430725, + "learning_rate": 1.1774759388989272e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30034344, + "step": 15337 + }, + { + "epoch": 2.0328694499668654, + "grad_norm": 1.8725436925888062, + "learning_rate": 1.17718136543517e-06, + "loss": 0.0257, + "num_input_tokens_seen": 30035944, + "step": 15338 + }, + { + "epoch": 2.0330019880715704, + "grad_norm": 5.809375286102295, + "learning_rate": 1.17688681747627e-06, + "loss": 0.1169, + "num_input_tokens_seen": 30038672, + "step": 15339 + }, + { + "epoch": 2.033134526176276, + "grad_norm": 0.1271497756242752, + "learning_rate": 1.1765922950279044e-06, + "loss": 0.0008, + "num_input_tokens_seen": 30040784, + "step": 15340 + }, + { + "epoch": 2.033267064280981, + "grad_norm": 2.8759517669677734, + "learning_rate": 1.1762977980957515e-06, + "loss": 0.0239, + "num_input_tokens_seen": 30043168, + "step": 15341 + }, + { + "epoch": 2.033399602385686, + "grad_norm": 0.804709255695343, + "learning_rate": 1.1760033266854915e-06, + "loss": 0.0053, + "num_input_tokens_seen": 30044808, + "step": 15342 + }, + { + "epoch": 2.033532140490391, + "grad_norm": 0.05708109959959984, + "learning_rate": 1.1757088808027997e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30046544, + "step": 15343 + }, + { + "epoch": 2.033664678595096, + "grad_norm": 0.03585457056760788, + "learning_rate": 1.1754144604533553e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30048896, + "step": 15344 + }, + { + "epoch": 2.033797216699801, + "grad_norm": 0.022247979417443275, + "learning_rate": 1.1751200656428334e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30050048, + "step": 15345 + }, + { + "epoch": 2.033929754804506, + "grad_norm": 5.2681732177734375, + "learning_rate": 1.1748256963769108e-06, + "loss": 0.0535, + "num_input_tokens_seen": 30051568, + "step": 15346 + }, + { + "epoch": 2.0340622929092116, + "grad_norm": 0.027139918878674507, + "learning_rate": 1.1745313526612623e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30052864, + "step": 15347 + }, + { + "epoch": 2.0341948310139166, + "grad_norm": 0.004139519762247801, + "learning_rate": 1.1742370345015644e-06, + "loss": 0.0, + "num_input_tokens_seen": 30054848, + "step": 15348 + }, + { + "epoch": 2.0343273691186217, + "grad_norm": 18.6748104095459, + "learning_rate": 1.1739427419034905e-06, + "loss": 0.2886, + "num_input_tokens_seen": 30055912, + "step": 15349 + }, + { + "epoch": 2.0344599072233267, + "grad_norm": 1.8263636827468872, + "learning_rate": 1.1736484748727158e-06, + "loss": 0.0225, + "num_input_tokens_seen": 30059152, + "step": 15350 + }, + { + "epoch": 2.0345924453280317, + "grad_norm": 5.624429702758789, + "learning_rate": 1.173354233414914e-06, + "loss": 0.0521, + "num_input_tokens_seen": 30061048, + "step": 15351 + }, + { + "epoch": 2.0347249834327368, + "grad_norm": 6.0545454025268555, + "learning_rate": 1.1730600175357574e-06, + "loss": 0.0436, + "num_input_tokens_seen": 30062432, + "step": 15352 + }, + { + "epoch": 2.034857521537442, + "grad_norm": 12.210949897766113, + "learning_rate": 1.1727658272409182e-06, + "loss": 0.0577, + "num_input_tokens_seen": 30064376, + "step": 15353 + }, + { + "epoch": 2.0349900596421473, + "grad_norm": 2.5179665088653564, + "learning_rate": 1.1724716625360693e-06, + "loss": 0.0192, + "num_input_tokens_seen": 30066808, + "step": 15354 + }, + { + "epoch": 2.0351225977468523, + "grad_norm": 0.8342695832252502, + "learning_rate": 1.172177523426884e-06, + "loss": 0.0075, + "num_input_tokens_seen": 30068480, + "step": 15355 + }, + { + "epoch": 2.0352551358515574, + "grad_norm": 3.4192352294921875, + "learning_rate": 1.1718834099190316e-06, + "loss": 0.0401, + "num_input_tokens_seen": 30070832, + "step": 15356 + }, + { + "epoch": 2.0353876739562624, + "grad_norm": 0.4294804036617279, + "learning_rate": 1.1715893220181834e-06, + "loss": 0.0012, + "num_input_tokens_seen": 30073520, + "step": 15357 + }, + { + "epoch": 2.0355202120609674, + "grad_norm": 4.821839809417725, + "learning_rate": 1.1712952597300096e-06, + "loss": 0.0843, + "num_input_tokens_seen": 30075176, + "step": 15358 + }, + { + "epoch": 2.0356527501656725, + "grad_norm": 4.818764686584473, + "learning_rate": 1.171001223060179e-06, + "loss": 0.2097, + "num_input_tokens_seen": 30077112, + "step": 15359 + }, + { + "epoch": 2.0357852882703775, + "grad_norm": 4.719126224517822, + "learning_rate": 1.1707072120143614e-06, + "loss": 0.0727, + "num_input_tokens_seen": 30079016, + "step": 15360 + }, + { + "epoch": 2.035917826375083, + "grad_norm": 0.1781775951385498, + "learning_rate": 1.1704132265982267e-06, + "loss": 0.0013, + "num_input_tokens_seen": 30082448, + "step": 15361 + }, + { + "epoch": 2.036050364479788, + "grad_norm": 0.010710245929658413, + "learning_rate": 1.1701192668174425e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30084792, + "step": 15362 + }, + { + "epoch": 2.036182902584493, + "grad_norm": 7.136807441711426, + "learning_rate": 1.169825332677676e-06, + "loss": 0.0495, + "num_input_tokens_seen": 30087312, + "step": 15363 + }, + { + "epoch": 2.036315440689198, + "grad_norm": 2.6020267009735107, + "learning_rate": 1.1695314241845937e-06, + "loss": 0.1133, + "num_input_tokens_seen": 30089120, + "step": 15364 + }, + { + "epoch": 2.036447978793903, + "grad_norm": 0.005258308723568916, + "learning_rate": 1.1692375413438637e-06, + "loss": 0.0, + "num_input_tokens_seen": 30090448, + "step": 15365 + }, + { + "epoch": 2.036580516898608, + "grad_norm": 13.674539566040039, + "learning_rate": 1.168943684161153e-06, + "loss": 0.3189, + "num_input_tokens_seen": 30092168, + "step": 15366 + }, + { + "epoch": 2.0367130550033137, + "grad_norm": 0.2764515280723572, + "learning_rate": 1.1686498526421258e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30094032, + "step": 15367 + }, + { + "epoch": 2.0368455931080187, + "grad_norm": 5.22561502456665, + "learning_rate": 1.1683560467924482e-06, + "loss": 0.0812, + "num_input_tokens_seen": 30096536, + "step": 15368 + }, + { + "epoch": 2.0369781312127238, + "grad_norm": 4.668165683746338, + "learning_rate": 1.1680622666177837e-06, + "loss": 0.0279, + "num_input_tokens_seen": 30098408, + "step": 15369 + }, + { + "epoch": 2.037110669317429, + "grad_norm": 1.4709421396255493, + "learning_rate": 1.1677685121237985e-06, + "loss": 0.004, + "num_input_tokens_seen": 30100256, + "step": 15370 + }, + { + "epoch": 2.037243207422134, + "grad_norm": 5.9648942947387695, + "learning_rate": 1.1674747833161546e-06, + "loss": 0.0613, + "num_input_tokens_seen": 30102192, + "step": 15371 + }, + { + "epoch": 2.037375745526839, + "grad_norm": 0.2224637269973755, + "learning_rate": 1.1671810802005168e-06, + "loss": 0.0011, + "num_input_tokens_seen": 30103496, + "step": 15372 + }, + { + "epoch": 2.037508283631544, + "grad_norm": 0.009925957769155502, + "learning_rate": 1.1668874027825475e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30104880, + "step": 15373 + }, + { + "epoch": 2.0376408217362494, + "grad_norm": 0.4790247082710266, + "learning_rate": 1.1665937510679076e-06, + "loss": 0.0039, + "num_input_tokens_seen": 30106464, + "step": 15374 + }, + { + "epoch": 2.0377733598409544, + "grad_norm": 0.19355347752571106, + "learning_rate": 1.1663001250622612e-06, + "loss": 0.0007, + "num_input_tokens_seen": 30107752, + "step": 15375 + }, + { + "epoch": 2.0379058979456595, + "grad_norm": 0.020875394344329834, + "learning_rate": 1.1660065247712685e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30109184, + "step": 15376 + }, + { + "epoch": 2.0380384360503645, + "grad_norm": 5.725100994110107, + "learning_rate": 1.165712950200589e-06, + "loss": 0.0322, + "num_input_tokens_seen": 30111824, + "step": 15377 + }, + { + "epoch": 2.0381709741550695, + "grad_norm": 2.8452649116516113, + "learning_rate": 1.1654194013558856e-06, + "loss": 0.0155, + "num_input_tokens_seen": 30113328, + "step": 15378 + }, + { + "epoch": 2.0383035122597746, + "grad_norm": 2.3905649185180664, + "learning_rate": 1.1651258782428165e-06, + "loss": 0.0407, + "num_input_tokens_seen": 30115192, + "step": 15379 + }, + { + "epoch": 2.0384360503644796, + "grad_norm": 8.178534507751465, + "learning_rate": 1.1648323808670405e-06, + "loss": 0.0886, + "num_input_tokens_seen": 30117032, + "step": 15380 + }, + { + "epoch": 2.038568588469185, + "grad_norm": 1.2329213619232178, + "learning_rate": 1.1645389092342186e-06, + "loss": 0.0078, + "num_input_tokens_seen": 30118664, + "step": 15381 + }, + { + "epoch": 2.03870112657389, + "grad_norm": 2.0993881225585938, + "learning_rate": 1.1642454633500074e-06, + "loss": 0.0167, + "num_input_tokens_seen": 30120672, + "step": 15382 + }, + { + "epoch": 2.038833664678595, + "grad_norm": 6.3968071937561035, + "learning_rate": 1.1639520432200642e-06, + "loss": 0.0933, + "num_input_tokens_seen": 30123200, + "step": 15383 + }, + { + "epoch": 2.0389662027833, + "grad_norm": 0.024023214355111122, + "learning_rate": 1.1636586488500485e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30124688, + "step": 15384 + }, + { + "epoch": 2.0390987408880052, + "grad_norm": 3.002664804458618, + "learning_rate": 1.163365280245615e-06, + "loss": 0.0438, + "num_input_tokens_seen": 30126192, + "step": 15385 + }, + { + "epoch": 2.0392312789927103, + "grad_norm": 6.3573503494262695, + "learning_rate": 1.1630719374124217e-06, + "loss": 0.0181, + "num_input_tokens_seen": 30127984, + "step": 15386 + }, + { + "epoch": 2.0393638170974153, + "grad_norm": 11.962425231933594, + "learning_rate": 1.1627786203561238e-06, + "loss": 0.2718, + "num_input_tokens_seen": 30129600, + "step": 15387 + }, + { + "epoch": 2.039496355202121, + "grad_norm": 0.06466907262802124, + "learning_rate": 1.1624853290823756e-06, + "loss": 0.0004, + "num_input_tokens_seen": 30132280, + "step": 15388 + }, + { + "epoch": 2.039628893306826, + "grad_norm": 0.05370205640792847, + "learning_rate": 1.1621920635968342e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30133528, + "step": 15389 + }, + { + "epoch": 2.039761431411531, + "grad_norm": 2.077615976333618, + "learning_rate": 1.1618988239051516e-06, + "loss": 0.0109, + "num_input_tokens_seen": 30135600, + "step": 15390 + }, + { + "epoch": 2.039893969516236, + "grad_norm": 0.1612795740365982, + "learning_rate": 1.1616056100129836e-06, + "loss": 0.0017, + "num_input_tokens_seen": 30137016, + "step": 15391 + }, + { + "epoch": 2.040026507620941, + "grad_norm": 2.0692880153656006, + "learning_rate": 1.161312421925983e-06, + "loss": 0.0117, + "num_input_tokens_seen": 30138352, + "step": 15392 + }, + { + "epoch": 2.040159045725646, + "grad_norm": 1.9984077215194702, + "learning_rate": 1.161019259649802e-06, + "loss": 0.0212, + "num_input_tokens_seen": 30140160, + "step": 15393 + }, + { + "epoch": 2.040291583830351, + "grad_norm": 8.414917945861816, + "learning_rate": 1.160726123190093e-06, + "loss": 0.1568, + "num_input_tokens_seen": 30142336, + "step": 15394 + }, + { + "epoch": 2.0404241219350565, + "grad_norm": 0.40815141797065735, + "learning_rate": 1.160433012552508e-06, + "loss": 0.0011, + "num_input_tokens_seen": 30144568, + "step": 15395 + }, + { + "epoch": 2.0405566600397615, + "grad_norm": 1.9453282356262207, + "learning_rate": 1.1601399277426995e-06, + "loss": 0.0163, + "num_input_tokens_seen": 30146416, + "step": 15396 + }, + { + "epoch": 2.0406891981444666, + "grad_norm": 1.9006175994873047, + "learning_rate": 1.1598468687663176e-06, + "loss": 0.0121, + "num_input_tokens_seen": 30148232, + "step": 15397 + }, + { + "epoch": 2.0408217362491716, + "grad_norm": 0.01095342356711626, + "learning_rate": 1.1595538356290126e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30149208, + "step": 15398 + }, + { + "epoch": 2.0409542743538767, + "grad_norm": 0.33900848031044006, + "learning_rate": 1.1592608283364343e-06, + "loss": 0.0013, + "num_input_tokens_seen": 30152288, + "step": 15399 + }, + { + "epoch": 2.0410868124585817, + "grad_norm": 0.035532236099243164, + "learning_rate": 1.1589678468942314e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30154544, + "step": 15400 + }, + { + "epoch": 2.0412193505632867, + "grad_norm": 22.57343292236328, + "learning_rate": 1.1586748913080535e-06, + "loss": 0.2515, + "num_input_tokens_seen": 30157216, + "step": 15401 + }, + { + "epoch": 2.041351888667992, + "grad_norm": 0.033173151314258575, + "learning_rate": 1.15838196158355e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30159656, + "step": 15402 + }, + { + "epoch": 2.0414844267726973, + "grad_norm": 4.623154163360596, + "learning_rate": 1.1580890577263676e-06, + "loss": 0.0924, + "num_input_tokens_seen": 30161840, + "step": 15403 + }, + { + "epoch": 2.0416169648774023, + "grad_norm": 0.023964587599039078, + "learning_rate": 1.157796179742154e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30163008, + "step": 15404 + }, + { + "epoch": 2.0417495029821073, + "grad_norm": 2.778046131134033, + "learning_rate": 1.157503327636556e-06, + "loss": 0.0333, + "num_input_tokens_seen": 30164592, + "step": 15405 + }, + { + "epoch": 2.0418820410868124, + "grad_norm": 5.143064022064209, + "learning_rate": 1.1572105014152191e-06, + "loss": 0.0285, + "num_input_tokens_seen": 30166896, + "step": 15406 + }, + { + "epoch": 2.0420145791915174, + "grad_norm": 4.412442684173584, + "learning_rate": 1.1569177010837898e-06, + "loss": 0.0589, + "num_input_tokens_seen": 30169000, + "step": 15407 + }, + { + "epoch": 2.0421471172962224, + "grad_norm": 0.06809312850236893, + "learning_rate": 1.156624926647915e-06, + "loss": 0.0005, + "num_input_tokens_seen": 30171584, + "step": 15408 + }, + { + "epoch": 2.042279655400928, + "grad_norm": 1.3891651630401611, + "learning_rate": 1.1563321781132382e-06, + "loss": 0.01, + "num_input_tokens_seen": 30173872, + "step": 15409 + }, + { + "epoch": 2.042412193505633, + "grad_norm": 3.154660940170288, + "learning_rate": 1.156039455485404e-06, + "loss": 0.0681, + "num_input_tokens_seen": 30175920, + "step": 15410 + }, + { + "epoch": 2.042544731610338, + "grad_norm": 0.14920218288898468, + "learning_rate": 1.1557467587700554e-06, + "loss": 0.0009, + "num_input_tokens_seen": 30177496, + "step": 15411 + }, + { + "epoch": 2.042677269715043, + "grad_norm": 0.16647225618362427, + "learning_rate": 1.1554540879728377e-06, + "loss": 0.001, + "num_input_tokens_seen": 30179216, + "step": 15412 + }, + { + "epoch": 2.042809807819748, + "grad_norm": 8.918163299560547, + "learning_rate": 1.1551614430993915e-06, + "loss": 0.0918, + "num_input_tokens_seen": 30181696, + "step": 15413 + }, + { + "epoch": 2.042942345924453, + "grad_norm": 0.14532943069934845, + "learning_rate": 1.1548688241553617e-06, + "loss": 0.0006, + "num_input_tokens_seen": 30183464, + "step": 15414 + }, + { + "epoch": 2.0430748840291586, + "grad_norm": 2.7827036380767822, + "learning_rate": 1.1545762311463887e-06, + "loss": 0.0224, + "num_input_tokens_seen": 30185928, + "step": 15415 + }, + { + "epoch": 2.0432074221338636, + "grad_norm": 9.949211120605469, + "learning_rate": 1.1542836640781133e-06, + "loss": 0.1479, + "num_input_tokens_seen": 30187944, + "step": 15416 + }, + { + "epoch": 2.0433399602385687, + "grad_norm": 0.08917324244976044, + "learning_rate": 1.1539911229561779e-06, + "loss": 0.0004, + "num_input_tokens_seen": 30189712, + "step": 15417 + }, + { + "epoch": 2.0434724983432737, + "grad_norm": 10.60329532623291, + "learning_rate": 1.1536986077862217e-06, + "loss": 0.0888, + "num_input_tokens_seen": 30191384, + "step": 15418 + }, + { + "epoch": 2.0436050364479788, + "grad_norm": 1.9410332441329956, + "learning_rate": 1.1534061185738856e-06, + "loss": 0.0221, + "num_input_tokens_seen": 30193528, + "step": 15419 + }, + { + "epoch": 2.043737574552684, + "grad_norm": 3.2017059326171875, + "learning_rate": 1.1531136553248087e-06, + "loss": 0.0387, + "num_input_tokens_seen": 30196168, + "step": 15420 + }, + { + "epoch": 2.043870112657389, + "grad_norm": 9.080785751342773, + "learning_rate": 1.1528212180446288e-06, + "loss": 0.0347, + "num_input_tokens_seen": 30198264, + "step": 15421 + }, + { + "epoch": 2.0440026507620943, + "grad_norm": 2.7377426624298096, + "learning_rate": 1.1525288067389861e-06, + "loss": 0.0615, + "num_input_tokens_seen": 30199400, + "step": 15422 + }, + { + "epoch": 2.0441351888667993, + "grad_norm": 0.007506837602704763, + "learning_rate": 1.1522364214135175e-06, + "loss": 0.0, + "num_input_tokens_seen": 30200568, + "step": 15423 + }, + { + "epoch": 2.0442677269715044, + "grad_norm": 0.007028147578239441, + "learning_rate": 1.1519440620738597e-06, + "loss": 0.0, + "num_input_tokens_seen": 30201848, + "step": 15424 + }, + { + "epoch": 2.0444002650762094, + "grad_norm": 2.7612202167510986, + "learning_rate": 1.1516517287256512e-06, + "loss": 0.0168, + "num_input_tokens_seen": 30204040, + "step": 15425 + }, + { + "epoch": 2.0445328031809145, + "grad_norm": 0.12472488731145859, + "learning_rate": 1.1513594213745266e-06, + "loss": 0.0012, + "num_input_tokens_seen": 30205464, + "step": 15426 + }, + { + "epoch": 2.0446653412856195, + "grad_norm": 0.03016464225947857, + "learning_rate": 1.1510671400261237e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30207496, + "step": 15427 + }, + { + "epoch": 2.0447978793903245, + "grad_norm": 5.654534816741943, + "learning_rate": 1.150774884686077e-06, + "loss": 0.0939, + "num_input_tokens_seen": 30209024, + "step": 15428 + }, + { + "epoch": 2.04493041749503, + "grad_norm": 2.3443603515625, + "learning_rate": 1.1504826553600209e-06, + "loss": 0.0158, + "num_input_tokens_seen": 30211512, + "step": 15429 + }, + { + "epoch": 2.045062955599735, + "grad_norm": 6.268019199371338, + "learning_rate": 1.1501904520535898e-06, + "loss": 0.0263, + "num_input_tokens_seen": 30213200, + "step": 15430 + }, + { + "epoch": 2.04519549370444, + "grad_norm": 0.08043542504310608, + "learning_rate": 1.1498982747724177e-06, + "loss": 0.0005, + "num_input_tokens_seen": 30215480, + "step": 15431 + }, + { + "epoch": 2.045328031809145, + "grad_norm": 0.059432096779346466, + "learning_rate": 1.1496061235221393e-06, + "loss": 0.0003, + "num_input_tokens_seen": 30217360, + "step": 15432 + }, + { + "epoch": 2.04546056991385, + "grad_norm": 0.03536738455295563, + "learning_rate": 1.1493139983083862e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30218544, + "step": 15433 + }, + { + "epoch": 2.045593108018555, + "grad_norm": 4.008764743804932, + "learning_rate": 1.149021899136791e-06, + "loss": 0.0735, + "num_input_tokens_seen": 30220376, + "step": 15434 + }, + { + "epoch": 2.0457256461232602, + "grad_norm": 0.033605098724365234, + "learning_rate": 1.148729826012986e-06, + "loss": 0.0003, + "num_input_tokens_seen": 30222848, + "step": 15435 + }, + { + "epoch": 2.0458581842279657, + "grad_norm": 0.011303989216685295, + "learning_rate": 1.1484377789426006e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30224120, + "step": 15436 + }, + { + "epoch": 2.0459907223326708, + "grad_norm": 0.6575780510902405, + "learning_rate": 1.1481457579312678e-06, + "loss": 0.0018, + "num_input_tokens_seen": 30226472, + "step": 15437 + }, + { + "epoch": 2.046123260437376, + "grad_norm": 0.8789747953414917, + "learning_rate": 1.1478537629846182e-06, + "loss": 0.0047, + "num_input_tokens_seen": 30227880, + "step": 15438 + }, + { + "epoch": 2.046255798542081, + "grad_norm": 7.1446919441223145, + "learning_rate": 1.1475617941082806e-06, + "loss": 0.1004, + "num_input_tokens_seen": 30229840, + "step": 15439 + }, + { + "epoch": 2.046388336646786, + "grad_norm": 0.6669303178787231, + "learning_rate": 1.1472698513078848e-06, + "loss": 0.0028, + "num_input_tokens_seen": 30231168, + "step": 15440 + }, + { + "epoch": 2.046520874751491, + "grad_norm": 0.5729780197143555, + "learning_rate": 1.1469779345890583e-06, + "loss": 0.0028, + "num_input_tokens_seen": 30233496, + "step": 15441 + }, + { + "epoch": 2.046653412856196, + "grad_norm": 0.06405214220285416, + "learning_rate": 1.1466860439574306e-06, + "loss": 0.0003, + "num_input_tokens_seen": 30234848, + "step": 15442 + }, + { + "epoch": 2.0467859509609014, + "grad_norm": 6.955703258514404, + "learning_rate": 1.146394179418631e-06, + "loss": 0.0564, + "num_input_tokens_seen": 30236880, + "step": 15443 + }, + { + "epoch": 2.0469184890656065, + "grad_norm": 5.256700038909912, + "learning_rate": 1.146102340978285e-06, + "loss": 0.1062, + "num_input_tokens_seen": 30238968, + "step": 15444 + }, + { + "epoch": 2.0470510271703115, + "grad_norm": 16.536304473876953, + "learning_rate": 1.1458105286420202e-06, + "loss": 0.186, + "num_input_tokens_seen": 30240384, + "step": 15445 + }, + { + "epoch": 2.0471835652750165, + "grad_norm": 3.872830867767334, + "learning_rate": 1.1455187424154621e-06, + "loss": 0.0179, + "num_input_tokens_seen": 30242088, + "step": 15446 + }, + { + "epoch": 2.0473161033797216, + "grad_norm": 0.3322814106941223, + "learning_rate": 1.1452269823042363e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30243600, + "step": 15447 + }, + { + "epoch": 2.0474486414844266, + "grad_norm": 0.013216373510658741, + "learning_rate": 1.1449352483139693e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30246392, + "step": 15448 + }, + { + "epoch": 2.0475811795891317, + "grad_norm": 0.015294334851205349, + "learning_rate": 1.144643540450286e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30247912, + "step": 15449 + }, + { + "epoch": 2.047713717693837, + "grad_norm": 0.11774849146604538, + "learning_rate": 1.1443518587188104e-06, + "loss": 0.0008, + "num_input_tokens_seen": 30250704, + "step": 15450 + }, + { + "epoch": 2.047846255798542, + "grad_norm": 0.2029794603586197, + "learning_rate": 1.1440602031251662e-06, + "loss": 0.0011, + "num_input_tokens_seen": 30252040, + "step": 15451 + }, + { + "epoch": 2.047978793903247, + "grad_norm": 4.941562175750732, + "learning_rate": 1.143768573674976e-06, + "loss": 0.0869, + "num_input_tokens_seen": 30253912, + "step": 15452 + }, + { + "epoch": 2.0481113320079523, + "grad_norm": 0.008615760132670403, + "learning_rate": 1.143476970373864e-06, + "loss": 0.0, + "num_input_tokens_seen": 30255840, + "step": 15453 + }, + { + "epoch": 2.0482438701126573, + "grad_norm": 0.020869944244623184, + "learning_rate": 1.1431853932274508e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30257608, + "step": 15454 + }, + { + "epoch": 2.0483764082173623, + "grad_norm": 3.7428441047668457, + "learning_rate": 1.1428938422413606e-06, + "loss": 0.0487, + "num_input_tokens_seen": 30259896, + "step": 15455 + }, + { + "epoch": 2.048508946322068, + "grad_norm": 1.587575078010559, + "learning_rate": 1.1426023174212129e-06, + "loss": 0.0066, + "num_input_tokens_seen": 30261880, + "step": 15456 + }, + { + "epoch": 2.048641484426773, + "grad_norm": 0.0302698016166687, + "learning_rate": 1.1423108187726282e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30263880, + "step": 15457 + }, + { + "epoch": 2.048774022531478, + "grad_norm": 2.4072816371917725, + "learning_rate": 1.1420193463012283e-06, + "loss": 0.0148, + "num_input_tokens_seen": 30265400, + "step": 15458 + }, + { + "epoch": 2.048906560636183, + "grad_norm": 1.6232753992080688, + "learning_rate": 1.1417279000126322e-06, + "loss": 0.0291, + "num_input_tokens_seen": 30266712, + "step": 15459 + }, + { + "epoch": 2.049039098740888, + "grad_norm": 0.09677279740571976, + "learning_rate": 1.1414364799124583e-06, + "loss": 0.0004, + "num_input_tokens_seen": 30268440, + "step": 15460 + }, + { + "epoch": 2.049171636845593, + "grad_norm": 5.05303955078125, + "learning_rate": 1.1411450860063276e-06, + "loss": 0.1457, + "num_input_tokens_seen": 30271320, + "step": 15461 + }, + { + "epoch": 2.049304174950298, + "grad_norm": 2.8378937244415283, + "learning_rate": 1.1408537182998566e-06, + "loss": 0.0142, + "num_input_tokens_seen": 30272864, + "step": 15462 + }, + { + "epoch": 2.0494367130550035, + "grad_norm": 0.013245287351310253, + "learning_rate": 1.140562376798663e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30274024, + "step": 15463 + }, + { + "epoch": 2.0495692511597086, + "grad_norm": 13.191045761108398, + "learning_rate": 1.140271061508365e-06, + "loss": 0.2502, + "num_input_tokens_seen": 30276216, + "step": 15464 + }, + { + "epoch": 2.0497017892644136, + "grad_norm": 0.01051954086869955, + "learning_rate": 1.1399797724345796e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30278240, + "step": 15465 + }, + { + "epoch": 2.0498343273691186, + "grad_norm": 4.330178260803223, + "learning_rate": 1.139688509582921e-06, + "loss": 0.0488, + "num_input_tokens_seen": 30280528, + "step": 15466 + }, + { + "epoch": 2.0499668654738237, + "grad_norm": 7.327389717102051, + "learning_rate": 1.1393972729590074e-06, + "loss": 0.114, + "num_input_tokens_seen": 30282336, + "step": 15467 + }, + { + "epoch": 2.0500994035785287, + "grad_norm": 0.01825900934636593, + "learning_rate": 1.1391060625684525e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30284048, + "step": 15468 + }, + { + "epoch": 2.0502319416832337, + "grad_norm": 0.23606400191783905, + "learning_rate": 1.1388148784168721e-06, + "loss": 0.0011, + "num_input_tokens_seen": 30286488, + "step": 15469 + }, + { + "epoch": 2.0503644797879392, + "grad_norm": 4.142344951629639, + "learning_rate": 1.1385237205098798e-06, + "loss": 0.0571, + "num_input_tokens_seen": 30289064, + "step": 15470 + }, + { + "epoch": 2.0504970178926443, + "grad_norm": 4.883716106414795, + "learning_rate": 1.1382325888530888e-06, + "loss": 0.0405, + "num_input_tokens_seen": 30291088, + "step": 15471 + }, + { + "epoch": 2.0506295559973493, + "grad_norm": 0.017013655975461006, + "learning_rate": 1.1379414834521135e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30293072, + "step": 15472 + }, + { + "epoch": 2.0507620941020543, + "grad_norm": 8.204032897949219, + "learning_rate": 1.1376504043125653e-06, + "loss": 0.0763, + "num_input_tokens_seen": 30294864, + "step": 15473 + }, + { + "epoch": 2.0508946322067594, + "grad_norm": 0.7804144620895386, + "learning_rate": 1.137359351440058e-06, + "loss": 0.0048, + "num_input_tokens_seen": 30296696, + "step": 15474 + }, + { + "epoch": 2.0510271703114644, + "grad_norm": 7.004912376403809, + "learning_rate": 1.1370683248402022e-06, + "loss": 0.0242, + "num_input_tokens_seen": 30298160, + "step": 15475 + }, + { + "epoch": 2.0511597084161695, + "grad_norm": 5.126420974731445, + "learning_rate": 1.1367773245186099e-06, + "loss": 0.0514, + "num_input_tokens_seen": 30299824, + "step": 15476 + }, + { + "epoch": 2.051292246520875, + "grad_norm": 0.9875059723854065, + "learning_rate": 1.13648635048089e-06, + "loss": 0.0079, + "num_input_tokens_seen": 30301712, + "step": 15477 + }, + { + "epoch": 2.05142478462558, + "grad_norm": 4.170960426330566, + "learning_rate": 1.136195402732654e-06, + "loss": 0.0148, + "num_input_tokens_seen": 30303296, + "step": 15478 + }, + { + "epoch": 2.051557322730285, + "grad_norm": 6.785438060760498, + "learning_rate": 1.135904481279512e-06, + "loss": 0.0868, + "num_input_tokens_seen": 30306048, + "step": 15479 + }, + { + "epoch": 2.05168986083499, + "grad_norm": 0.008930460549890995, + "learning_rate": 1.135613586127073e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30307528, + "step": 15480 + }, + { + "epoch": 2.051822398939695, + "grad_norm": 0.007853156886994839, + "learning_rate": 1.1353227172809453e-06, + "loss": 0.0, + "num_input_tokens_seen": 30308824, + "step": 15481 + }, + { + "epoch": 2.0519549370444, + "grad_norm": 0.020774422213435173, + "learning_rate": 1.135031874746737e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30310824, + "step": 15482 + }, + { + "epoch": 2.052087475149105, + "grad_norm": 5.407145023345947, + "learning_rate": 1.1347410585300548e-06, + "loss": 0.1288, + "num_input_tokens_seen": 30312184, + "step": 15483 + }, + { + "epoch": 2.0522200132538106, + "grad_norm": 12.113182067871094, + "learning_rate": 1.1344502686365068e-06, + "loss": 0.4789, + "num_input_tokens_seen": 30315592, + "step": 15484 + }, + { + "epoch": 2.0523525513585157, + "grad_norm": 0.30816036462783813, + "learning_rate": 1.1341595050717007e-06, + "loss": 0.0025, + "num_input_tokens_seen": 30317504, + "step": 15485 + }, + { + "epoch": 2.0524850894632207, + "grad_norm": 0.20900918543338776, + "learning_rate": 1.1338687678412415e-06, + "loss": 0.0013, + "num_input_tokens_seen": 30319048, + "step": 15486 + }, + { + "epoch": 2.0526176275679258, + "grad_norm": 2.184577226638794, + "learning_rate": 1.133578056950735e-06, + "loss": 0.0158, + "num_input_tokens_seen": 30321536, + "step": 15487 + }, + { + "epoch": 2.052750165672631, + "grad_norm": 11.88760757446289, + "learning_rate": 1.1332873724057855e-06, + "loss": 0.2146, + "num_input_tokens_seen": 30324288, + "step": 15488 + }, + { + "epoch": 2.052882703777336, + "grad_norm": 2.711827516555786, + "learning_rate": 1.1329967142119987e-06, + "loss": 0.0231, + "num_input_tokens_seen": 30326168, + "step": 15489 + }, + { + "epoch": 2.053015241882041, + "grad_norm": 0.0417311005294323, + "learning_rate": 1.1327060823749781e-06, + "loss": 0.0003, + "num_input_tokens_seen": 30328504, + "step": 15490 + }, + { + "epoch": 2.0531477799867464, + "grad_norm": 0.04743383079767227, + "learning_rate": 1.1324154769003278e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30331040, + "step": 15491 + }, + { + "epoch": 2.0532803180914514, + "grad_norm": 7.717831134796143, + "learning_rate": 1.1321248977936507e-06, + "loss": 0.1109, + "num_input_tokens_seen": 30333448, + "step": 15492 + }, + { + "epoch": 2.0534128561961564, + "grad_norm": 5.353625297546387, + "learning_rate": 1.1318343450605494e-06, + "loss": 0.0681, + "num_input_tokens_seen": 30335584, + "step": 15493 + }, + { + "epoch": 2.0535453943008615, + "grad_norm": 0.005502821411937475, + "learning_rate": 1.1315438187066248e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30337128, + "step": 15494 + }, + { + "epoch": 2.0536779324055665, + "grad_norm": 0.4938804805278778, + "learning_rate": 1.1312533187374791e-06, + "loss": 0.0019, + "num_input_tokens_seen": 30339712, + "step": 15495 + }, + { + "epoch": 2.0538104705102715, + "grad_norm": 0.006141394376754761, + "learning_rate": 1.1309628451587148e-06, + "loss": 0.0, + "num_input_tokens_seen": 30340648, + "step": 15496 + }, + { + "epoch": 2.0539430086149766, + "grad_norm": 0.20575563609600067, + "learning_rate": 1.1306723979759312e-06, + "loss": 0.001, + "num_input_tokens_seen": 30342432, + "step": 15497 + }, + { + "epoch": 2.054075546719682, + "grad_norm": 5.541965484619141, + "learning_rate": 1.1303819771947286e-06, + "loss": 0.1498, + "num_input_tokens_seen": 30344944, + "step": 15498 + }, + { + "epoch": 2.054208084824387, + "grad_norm": 2.4689931869506836, + "learning_rate": 1.1300915828207054e-06, + "loss": 0.0327, + "num_input_tokens_seen": 30346720, + "step": 15499 + }, + { + "epoch": 2.054340622929092, + "grad_norm": 0.0040749190375208855, + "learning_rate": 1.1298012148594622e-06, + "loss": 0.0, + "num_input_tokens_seen": 30347888, + "step": 15500 + }, + { + "epoch": 2.054473161033797, + "grad_norm": 0.02767675742506981, + "learning_rate": 1.1295108733165962e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30349184, + "step": 15501 + }, + { + "epoch": 2.054605699138502, + "grad_norm": 11.09276294708252, + "learning_rate": 1.129220558197707e-06, + "loss": 0.1451, + "num_input_tokens_seen": 30351272, + "step": 15502 + }, + { + "epoch": 2.0547382372432073, + "grad_norm": 11.46237850189209, + "learning_rate": 1.128930269508391e-06, + "loss": 0.1995, + "num_input_tokens_seen": 30353744, + "step": 15503 + }, + { + "epoch": 2.0548707753479127, + "grad_norm": 6.3047194480896, + "learning_rate": 1.1286400072542442e-06, + "loss": 0.02, + "num_input_tokens_seen": 30355888, + "step": 15504 + }, + { + "epoch": 2.0550033134526178, + "grad_norm": 0.3586597442626953, + "learning_rate": 1.128349771440865e-06, + "loss": 0.0013, + "num_input_tokens_seen": 30358368, + "step": 15505 + }, + { + "epoch": 2.055135851557323, + "grad_norm": 1.7006893157958984, + "learning_rate": 1.1280595620738485e-06, + "loss": 0.0076, + "num_input_tokens_seen": 30360104, + "step": 15506 + }, + { + "epoch": 2.055268389662028, + "grad_norm": 0.008015359751880169, + "learning_rate": 1.1277693791587893e-06, + "loss": 0.0, + "num_input_tokens_seen": 30361744, + "step": 15507 + }, + { + "epoch": 2.055400927766733, + "grad_norm": 5.141878604888916, + "learning_rate": 1.127479222701284e-06, + "loss": 0.0734, + "num_input_tokens_seen": 30363432, + "step": 15508 + }, + { + "epoch": 2.055533465871438, + "grad_norm": 6.357543468475342, + "learning_rate": 1.127189092706925e-06, + "loss": 0.0214, + "num_input_tokens_seen": 30364920, + "step": 15509 + }, + { + "epoch": 2.055666003976143, + "grad_norm": 0.00899139791727066, + "learning_rate": 1.1268989891813085e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30366640, + "step": 15510 + }, + { + "epoch": 2.0557985420808484, + "grad_norm": 3.8238232135772705, + "learning_rate": 1.1266089121300266e-06, + "loss": 0.0286, + "num_input_tokens_seen": 30368456, + "step": 15511 + }, + { + "epoch": 2.0559310801855535, + "grad_norm": 3.2958567142486572, + "learning_rate": 1.1263188615586727e-06, + "loss": 0.048, + "num_input_tokens_seen": 30369848, + "step": 15512 + }, + { + "epoch": 2.0560636182902585, + "grad_norm": 4.718137741088867, + "learning_rate": 1.1260288374728375e-06, + "loss": 0.0691, + "num_input_tokens_seen": 30372616, + "step": 15513 + }, + { + "epoch": 2.0561961563949636, + "grad_norm": 3.1013102531433105, + "learning_rate": 1.125738839878114e-06, + "loss": 0.0253, + "num_input_tokens_seen": 30374704, + "step": 15514 + }, + { + "epoch": 2.0563286944996686, + "grad_norm": 0.003794013289734721, + "learning_rate": 1.1254488687800952e-06, + "loss": 0.0, + "num_input_tokens_seen": 30376192, + "step": 15515 + }, + { + "epoch": 2.0564612326043736, + "grad_norm": 30.242639541625977, + "learning_rate": 1.1251589241843702e-06, + "loss": 0.0751, + "num_input_tokens_seen": 30377376, + "step": 15516 + }, + { + "epoch": 2.0565937707090787, + "grad_norm": 0.0033865952864289284, + "learning_rate": 1.1248690060965294e-06, + "loss": 0.0, + "num_input_tokens_seen": 30378576, + "step": 15517 + }, + { + "epoch": 2.056726308813784, + "grad_norm": 0.4592157304286957, + "learning_rate": 1.1245791145221632e-06, + "loss": 0.0031, + "num_input_tokens_seen": 30380880, + "step": 15518 + }, + { + "epoch": 2.056858846918489, + "grad_norm": 0.007732014171779156, + "learning_rate": 1.1242892494668592e-06, + "loss": 0.0, + "num_input_tokens_seen": 30382664, + "step": 15519 + }, + { + "epoch": 2.0569913850231942, + "grad_norm": 1.5622478723526, + "learning_rate": 1.1239994109362079e-06, + "loss": 0.0083, + "num_input_tokens_seen": 30384704, + "step": 15520 + }, + { + "epoch": 2.0571239231278993, + "grad_norm": 7.317628383636475, + "learning_rate": 1.123709598935798e-06, + "loss": 0.0685, + "num_input_tokens_seen": 30387576, + "step": 15521 + }, + { + "epoch": 2.0572564612326043, + "grad_norm": 4.161335468292236, + "learning_rate": 1.1234198134712162e-06, + "loss": 0.0629, + "num_input_tokens_seen": 30389448, + "step": 15522 + }, + { + "epoch": 2.0573889993373093, + "grad_norm": 9.382643699645996, + "learning_rate": 1.1231300545480505e-06, + "loss": 0.0559, + "num_input_tokens_seen": 30391792, + "step": 15523 + }, + { + "epoch": 2.0575215374420144, + "grad_norm": 5.019509315490723, + "learning_rate": 1.122840322171886e-06, + "loss": 0.0602, + "num_input_tokens_seen": 30393288, + "step": 15524 + }, + { + "epoch": 2.05765407554672, + "grad_norm": 0.18929769098758698, + "learning_rate": 1.1225506163483102e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30394712, + "step": 15525 + }, + { + "epoch": 2.057786613651425, + "grad_norm": 10.043717384338379, + "learning_rate": 1.1222609370829096e-06, + "loss": 0.091, + "num_input_tokens_seen": 30397232, + "step": 15526 + }, + { + "epoch": 2.05791915175613, + "grad_norm": 8.963563919067383, + "learning_rate": 1.1219712843812688e-06, + "loss": 0.17, + "num_input_tokens_seen": 30399200, + "step": 15527 + }, + { + "epoch": 2.058051689860835, + "grad_norm": 0.6126041412353516, + "learning_rate": 1.1216816582489724e-06, + "loss": 0.0059, + "num_input_tokens_seen": 30401032, + "step": 15528 + }, + { + "epoch": 2.05818422796554, + "grad_norm": 0.0023997880052775145, + "learning_rate": 1.121392058691604e-06, + "loss": 0.0, + "num_input_tokens_seen": 30402640, + "step": 15529 + }, + { + "epoch": 2.058316766070245, + "grad_norm": 6.047012805938721, + "learning_rate": 1.1211024857147473e-06, + "loss": 0.1444, + "num_input_tokens_seen": 30404832, + "step": 15530 + }, + { + "epoch": 2.05844930417495, + "grad_norm": 4.93029260635376, + "learning_rate": 1.1208129393239856e-06, + "loss": 0.0412, + "num_input_tokens_seen": 30406320, + "step": 15531 + }, + { + "epoch": 2.0585818422796556, + "grad_norm": 6.695636749267578, + "learning_rate": 1.120523419524903e-06, + "loss": 0.1009, + "num_input_tokens_seen": 30408704, + "step": 15532 + }, + { + "epoch": 2.0587143803843606, + "grad_norm": 2.5120351314544678, + "learning_rate": 1.1202339263230805e-06, + "loss": 0.0102, + "num_input_tokens_seen": 30410976, + "step": 15533 + }, + { + "epoch": 2.0588469184890656, + "grad_norm": 0.014119965955615044, + "learning_rate": 1.1199444597240998e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30412760, + "step": 15534 + }, + { + "epoch": 2.0589794565937707, + "grad_norm": 2.5599968433380127, + "learning_rate": 1.119655019733541e-06, + "loss": 0.04, + "num_input_tokens_seen": 30414296, + "step": 15535 + }, + { + "epoch": 2.0591119946984757, + "grad_norm": 4.034679889678955, + "learning_rate": 1.1193656063569863e-06, + "loss": 0.069, + "num_input_tokens_seen": 30416200, + "step": 15536 + }, + { + "epoch": 2.0592445328031808, + "grad_norm": 4.223156929016113, + "learning_rate": 1.1190762196000147e-06, + "loss": 0.0233, + "num_input_tokens_seen": 30418624, + "step": 15537 + }, + { + "epoch": 2.0593770709078862, + "grad_norm": 0.1604103446006775, + "learning_rate": 1.1187868594682068e-06, + "loss": 0.0008, + "num_input_tokens_seen": 30420280, + "step": 15538 + }, + { + "epoch": 2.0595096090125913, + "grad_norm": 0.03562053292989731, + "learning_rate": 1.1184975259671414e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30422072, + "step": 15539 + }, + { + "epoch": 2.0596421471172963, + "grad_norm": 8.290109634399414, + "learning_rate": 1.1182082191023955e-06, + "loss": 0.028, + "num_input_tokens_seen": 30424096, + "step": 15540 + }, + { + "epoch": 2.0597746852220014, + "grad_norm": 1.834067702293396, + "learning_rate": 1.1179189388795495e-06, + "loss": 0.0529, + "num_input_tokens_seen": 30426288, + "step": 15541 + }, + { + "epoch": 2.0599072233267064, + "grad_norm": 0.09055990725755692, + "learning_rate": 1.1176296853041798e-06, + "loss": 0.0004, + "num_input_tokens_seen": 30428312, + "step": 15542 + }, + { + "epoch": 2.0600397614314114, + "grad_norm": 0.26919493079185486, + "learning_rate": 1.1173404583818625e-06, + "loss": 0.0011, + "num_input_tokens_seen": 30430160, + "step": 15543 + }, + { + "epoch": 2.0601722995361165, + "grad_norm": 22.029064178466797, + "learning_rate": 1.1170512581181758e-06, + "loss": 0.0489, + "num_input_tokens_seen": 30432776, + "step": 15544 + }, + { + "epoch": 2.060304837640822, + "grad_norm": 5.290366172790527, + "learning_rate": 1.1167620845186941e-06, + "loss": 0.0781, + "num_input_tokens_seen": 30435080, + "step": 15545 + }, + { + "epoch": 2.060437375745527, + "grad_norm": 2.166100263595581, + "learning_rate": 1.1164729375889945e-06, + "loss": 0.0091, + "num_input_tokens_seen": 30436928, + "step": 15546 + }, + { + "epoch": 2.060569913850232, + "grad_norm": 0.04409104958176613, + "learning_rate": 1.1161838173346516e-06, + "loss": 0.0003, + "num_input_tokens_seen": 30438928, + "step": 15547 + }, + { + "epoch": 2.060702451954937, + "grad_norm": 2.7763524055480957, + "learning_rate": 1.1158947237612378e-06, + "loss": 0.0603, + "num_input_tokens_seen": 30441016, + "step": 15548 + }, + { + "epoch": 2.060834990059642, + "grad_norm": 0.26584023237228394, + "learning_rate": 1.1156056568743298e-06, + "loss": 0.0006, + "num_input_tokens_seen": 30442504, + "step": 15549 + }, + { + "epoch": 2.060967528164347, + "grad_norm": 0.04696989804506302, + "learning_rate": 1.1153166166794996e-06, + "loss": 0.0003, + "num_input_tokens_seen": 30445112, + "step": 15550 + }, + { + "epoch": 2.061100066269052, + "grad_norm": 0.21734485030174255, + "learning_rate": 1.1150276031823194e-06, + "loss": 0.0009, + "num_input_tokens_seen": 30446656, + "step": 15551 + }, + { + "epoch": 2.0612326043737577, + "grad_norm": 0.039966125041246414, + "learning_rate": 1.1147386163883633e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30449464, + "step": 15552 + }, + { + "epoch": 2.0613651424784627, + "grad_norm": 0.12212853878736496, + "learning_rate": 1.1144496563032026e-06, + "loss": 0.0003, + "num_input_tokens_seen": 30451464, + "step": 15553 + }, + { + "epoch": 2.0614976805831677, + "grad_norm": 0.15539419651031494, + "learning_rate": 1.1141607229324072e-06, + "loss": 0.0005, + "num_input_tokens_seen": 30453408, + "step": 15554 + }, + { + "epoch": 2.0616302186878728, + "grad_norm": 1.5211808681488037, + "learning_rate": 1.11387181628155e-06, + "loss": 0.0093, + "num_input_tokens_seen": 30454696, + "step": 15555 + }, + { + "epoch": 2.061762756792578, + "grad_norm": 0.7057515382766724, + "learning_rate": 1.1135829363561998e-06, + "loss": 0.0082, + "num_input_tokens_seen": 30457304, + "step": 15556 + }, + { + "epoch": 2.061895294897283, + "grad_norm": 0.09861530363559723, + "learning_rate": 1.1132940831619277e-06, + "loss": 0.0005, + "num_input_tokens_seen": 30459696, + "step": 15557 + }, + { + "epoch": 2.062027833001988, + "grad_norm": 1.4271506071090698, + "learning_rate": 1.1130052567043024e-06, + "loss": 0.0305, + "num_input_tokens_seen": 30462216, + "step": 15558 + }, + { + "epoch": 2.0621603711066934, + "grad_norm": 3.823723554611206, + "learning_rate": 1.1127164569888923e-06, + "loss": 0.0536, + "num_input_tokens_seen": 30464016, + "step": 15559 + }, + { + "epoch": 2.0622929092113984, + "grad_norm": 0.010620694607496262, + "learning_rate": 1.1124276840212654e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30465360, + "step": 15560 + }, + { + "epoch": 2.0624254473161034, + "grad_norm": 0.34943994879722595, + "learning_rate": 1.11213893780699e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30468168, + "step": 15561 + }, + { + "epoch": 2.0625579854208085, + "grad_norm": 0.05902186036109924, + "learning_rate": 1.1118502183516338e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30469416, + "step": 15562 + }, + { + "epoch": 2.0626905235255135, + "grad_norm": 6.210346221923828, + "learning_rate": 1.1115615256607634e-06, + "loss": 0.0733, + "num_input_tokens_seen": 30471592, + "step": 15563 + }, + { + "epoch": 2.0628230616302186, + "grad_norm": 1.0637727975845337, + "learning_rate": 1.1112728597399447e-06, + "loss": 0.0133, + "num_input_tokens_seen": 30473728, + "step": 15564 + }, + { + "epoch": 2.0629555997349236, + "grad_norm": 5.535642147064209, + "learning_rate": 1.1109842205947429e-06, + "loss": 0.0697, + "num_input_tokens_seen": 30476608, + "step": 15565 + }, + { + "epoch": 2.063088137839629, + "grad_norm": 7.507172584533691, + "learning_rate": 1.110695608230723e-06, + "loss": 0.0352, + "num_input_tokens_seen": 30479048, + "step": 15566 + }, + { + "epoch": 2.063220675944334, + "grad_norm": 12.416289329528809, + "learning_rate": 1.1104070226534502e-06, + "loss": 0.0238, + "num_input_tokens_seen": 30482240, + "step": 15567 + }, + { + "epoch": 2.063353214049039, + "grad_norm": 4.581946849822998, + "learning_rate": 1.1101184638684895e-06, + "loss": 0.1026, + "num_input_tokens_seen": 30484552, + "step": 15568 + }, + { + "epoch": 2.063485752153744, + "grad_norm": 3.624699354171753, + "learning_rate": 1.1098299318814034e-06, + "loss": 0.0426, + "num_input_tokens_seen": 30487320, + "step": 15569 + }, + { + "epoch": 2.0636182902584492, + "grad_norm": 0.01999441161751747, + "learning_rate": 1.1095414266977555e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30488840, + "step": 15570 + }, + { + "epoch": 2.0637508283631543, + "grad_norm": 0.0074171340093016624, + "learning_rate": 1.1092529483231072e-06, + "loss": 0.0, + "num_input_tokens_seen": 30490904, + "step": 15571 + }, + { + "epoch": 2.0638833664678593, + "grad_norm": 0.4564054310321808, + "learning_rate": 1.1089644967630222e-06, + "loss": 0.0018, + "num_input_tokens_seen": 30493416, + "step": 15572 + }, + { + "epoch": 2.064015904572565, + "grad_norm": 3.3340210914611816, + "learning_rate": 1.1086760720230608e-06, + "loss": 0.0284, + "num_input_tokens_seen": 30495224, + "step": 15573 + }, + { + "epoch": 2.06414844267727, + "grad_norm": 4.7557573318481445, + "learning_rate": 1.108387674108785e-06, + "loss": 0.0622, + "num_input_tokens_seen": 30497176, + "step": 15574 + }, + { + "epoch": 2.064280980781975, + "grad_norm": 3.7444674968719482, + "learning_rate": 1.1080993030257552e-06, + "loss": 0.0214, + "num_input_tokens_seen": 30499200, + "step": 15575 + }, + { + "epoch": 2.06441351888668, + "grad_norm": 0.49361658096313477, + "learning_rate": 1.1078109587795311e-06, + "loss": 0.0016, + "num_input_tokens_seen": 30501944, + "step": 15576 + }, + { + "epoch": 2.064546056991385, + "grad_norm": 0.2739591896533966, + "learning_rate": 1.107522641375671e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30504800, + "step": 15577 + }, + { + "epoch": 2.06467859509609, + "grad_norm": 3.0551302433013916, + "learning_rate": 1.107234350819735e-06, + "loss": 0.0338, + "num_input_tokens_seen": 30506624, + "step": 15578 + }, + { + "epoch": 2.064811133200795, + "grad_norm": 0.0026384168304502964, + "learning_rate": 1.1069460871172824e-06, + "loss": 0.0, + "num_input_tokens_seen": 30508160, + "step": 15579 + }, + { + "epoch": 2.0649436713055005, + "grad_norm": 8.979805946350098, + "learning_rate": 1.1066578502738704e-06, + "loss": 0.0652, + "num_input_tokens_seen": 30510032, + "step": 15580 + }, + { + "epoch": 2.0650762094102055, + "grad_norm": 0.24447518587112427, + "learning_rate": 1.1063696402950564e-06, + "loss": 0.0012, + "num_input_tokens_seen": 30511552, + "step": 15581 + }, + { + "epoch": 2.0652087475149106, + "grad_norm": 0.006602340843528509, + "learning_rate": 1.106081457186396e-06, + "loss": 0.0, + "num_input_tokens_seen": 30513392, + "step": 15582 + }, + { + "epoch": 2.0653412856196156, + "grad_norm": 0.08674134314060211, + "learning_rate": 1.1057933009534477e-06, + "loss": 0.0003, + "num_input_tokens_seen": 30515072, + "step": 15583 + }, + { + "epoch": 2.0654738237243206, + "grad_norm": 4.394766807556152, + "learning_rate": 1.1055051716017653e-06, + "loss": 0.039, + "num_input_tokens_seen": 30516920, + "step": 15584 + }, + { + "epoch": 2.0656063618290257, + "grad_norm": 0.7011305093765259, + "learning_rate": 1.1052170691369064e-06, + "loss": 0.003, + "num_input_tokens_seen": 30518464, + "step": 15585 + }, + { + "epoch": 2.0657388999337307, + "grad_norm": 2.0482728481292725, + "learning_rate": 1.1049289935644245e-06, + "loss": 0.0063, + "num_input_tokens_seen": 30520368, + "step": 15586 + }, + { + "epoch": 2.065871438038436, + "grad_norm": 0.005785080138593912, + "learning_rate": 1.1046409448898732e-06, + "loss": 0.0, + "num_input_tokens_seen": 30521688, + "step": 15587 + }, + { + "epoch": 2.0660039761431412, + "grad_norm": 1.315003752708435, + "learning_rate": 1.104352923118808e-06, + "loss": 0.0059, + "num_input_tokens_seen": 30523168, + "step": 15588 + }, + { + "epoch": 2.0661365142478463, + "grad_norm": 8.820904731750488, + "learning_rate": 1.1040649282567812e-06, + "loss": 0.0734, + "num_input_tokens_seen": 30524560, + "step": 15589 + }, + { + "epoch": 2.0662690523525513, + "grad_norm": 4.032207012176514, + "learning_rate": 1.1037769603093445e-06, + "loss": 0.0164, + "num_input_tokens_seen": 30526568, + "step": 15590 + }, + { + "epoch": 2.0664015904572564, + "grad_norm": 0.007713421247899532, + "learning_rate": 1.1034890192820522e-06, + "loss": 0.0, + "num_input_tokens_seen": 30528984, + "step": 15591 + }, + { + "epoch": 2.0665341285619614, + "grad_norm": 0.1903281807899475, + "learning_rate": 1.1032011051804542e-06, + "loss": 0.0013, + "num_input_tokens_seen": 30531552, + "step": 15592 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 1.278005599975586, + "learning_rate": 1.1029132180101031e-06, + "loss": 0.0074, + "num_input_tokens_seen": 30533880, + "step": 15593 + }, + { + "epoch": 2.066799204771372, + "grad_norm": 2.003617525100708, + "learning_rate": 1.1026253577765494e-06, + "loss": 0.016, + "num_input_tokens_seen": 30536072, + "step": 15594 + }, + { + "epoch": 2.066931742876077, + "grad_norm": 11.7857666015625, + "learning_rate": 1.1023375244853427e-06, + "loss": 0.2563, + "num_input_tokens_seen": 30538456, + "step": 15595 + }, + { + "epoch": 2.067064280980782, + "grad_norm": 17.706703186035156, + "learning_rate": 1.1020497181420317e-06, + "loss": 0.1293, + "num_input_tokens_seen": 30540712, + "step": 15596 + }, + { + "epoch": 2.067196819085487, + "grad_norm": 5.082114219665527, + "learning_rate": 1.1017619387521667e-06, + "loss": 0.028, + "num_input_tokens_seen": 30542912, + "step": 15597 + }, + { + "epoch": 2.067329357190192, + "grad_norm": 3.0628409385681152, + "learning_rate": 1.1014741863212969e-06, + "loss": 0.0083, + "num_input_tokens_seen": 30544888, + "step": 15598 + }, + { + "epoch": 2.067461895294897, + "grad_norm": 0.015810780227184296, + "learning_rate": 1.1011864608549694e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30546912, + "step": 15599 + }, + { + "epoch": 2.0675944333996026, + "grad_norm": 0.07897867262363434, + "learning_rate": 1.1008987623587323e-06, + "loss": 0.0004, + "num_input_tokens_seen": 30548136, + "step": 15600 + }, + { + "epoch": 2.0677269715043076, + "grad_norm": 7.408140659332275, + "learning_rate": 1.100611090838131e-06, + "loss": 0.0762, + "num_input_tokens_seen": 30549832, + "step": 15601 + }, + { + "epoch": 2.0678595096090127, + "grad_norm": 0.05522603169083595, + "learning_rate": 1.1003234462987134e-06, + "loss": 0.0003, + "num_input_tokens_seen": 30552488, + "step": 15602 + }, + { + "epoch": 2.0679920477137177, + "grad_norm": 2.586780071258545, + "learning_rate": 1.1000358287460264e-06, + "loss": 0.0204, + "num_input_tokens_seen": 30554136, + "step": 15603 + }, + { + "epoch": 2.0681245858184227, + "grad_norm": 7.495153903961182, + "learning_rate": 1.0997482381856142e-06, + "loss": 0.0526, + "num_input_tokens_seen": 30555520, + "step": 15604 + }, + { + "epoch": 2.0682571239231278, + "grad_norm": 3.767066478729248, + "learning_rate": 1.0994606746230218e-06, + "loss": 0.0153, + "num_input_tokens_seen": 30557344, + "step": 15605 + }, + { + "epoch": 2.068389662027833, + "grad_norm": 0.003917413763701916, + "learning_rate": 1.099173138063794e-06, + "loss": 0.0, + "num_input_tokens_seen": 30560048, + "step": 15606 + }, + { + "epoch": 2.0685222001325383, + "grad_norm": 10.267508506774902, + "learning_rate": 1.0988856285134738e-06, + "loss": 0.0995, + "num_input_tokens_seen": 30562744, + "step": 15607 + }, + { + "epoch": 2.0686547382372433, + "grad_norm": 5.591651439666748, + "learning_rate": 1.0985981459776049e-06, + "loss": 0.0298, + "num_input_tokens_seen": 30563864, + "step": 15608 + }, + { + "epoch": 2.0687872763419484, + "grad_norm": 0.0009399955742992461, + "learning_rate": 1.0983106904617316e-06, + "loss": 0.0, + "num_input_tokens_seen": 30565544, + "step": 15609 + }, + { + "epoch": 2.0689198144466534, + "grad_norm": 3.633188247680664, + "learning_rate": 1.098023261971395e-06, + "loss": 0.0162, + "num_input_tokens_seen": 30566696, + "step": 15610 + }, + { + "epoch": 2.0690523525513584, + "grad_norm": 4.150951385498047, + "learning_rate": 1.0977358605121375e-06, + "loss": 0.0442, + "num_input_tokens_seen": 30568888, + "step": 15611 + }, + { + "epoch": 2.0691848906560635, + "grad_norm": 3.9707016944885254, + "learning_rate": 1.0974484860894996e-06, + "loss": 0.0364, + "num_input_tokens_seen": 30570688, + "step": 15612 + }, + { + "epoch": 2.0693174287607685, + "grad_norm": 10.351153373718262, + "learning_rate": 1.0971611387090213e-06, + "loss": 0.174, + "num_input_tokens_seen": 30573416, + "step": 15613 + }, + { + "epoch": 2.069449966865474, + "grad_norm": 0.05251673609018326, + "learning_rate": 1.0968738183762446e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30575064, + "step": 15614 + }, + { + "epoch": 2.069582504970179, + "grad_norm": 0.006597693543881178, + "learning_rate": 1.0965865250967093e-06, + "loss": 0.0, + "num_input_tokens_seen": 30576736, + "step": 15615 + }, + { + "epoch": 2.069715043074884, + "grad_norm": 0.0020662625320255756, + "learning_rate": 1.0962992588759539e-06, + "loss": 0.0, + "num_input_tokens_seen": 30578488, + "step": 15616 + }, + { + "epoch": 2.069847581179589, + "grad_norm": 11.229107856750488, + "learning_rate": 1.0960120197195173e-06, + "loss": 0.0317, + "num_input_tokens_seen": 30580072, + "step": 15617 + }, + { + "epoch": 2.069980119284294, + "grad_norm": 0.003116697771474719, + "learning_rate": 1.0957248076329365e-06, + "loss": 0.0, + "num_input_tokens_seen": 30581568, + "step": 15618 + }, + { + "epoch": 2.070112657388999, + "grad_norm": 6.35180139541626, + "learning_rate": 1.0954376226217512e-06, + "loss": 0.0588, + "num_input_tokens_seen": 30583744, + "step": 15619 + }, + { + "epoch": 2.0702451954937042, + "grad_norm": 8.3189697265625, + "learning_rate": 1.0951504646914965e-06, + "loss": 0.0829, + "num_input_tokens_seen": 30585736, + "step": 15620 + }, + { + "epoch": 2.0703777335984097, + "grad_norm": 0.0018414610531181097, + "learning_rate": 1.0948633338477108e-06, + "loss": 0.0, + "num_input_tokens_seen": 30587112, + "step": 15621 + }, + { + "epoch": 2.0705102717031147, + "grad_norm": 2.02024245262146, + "learning_rate": 1.0945762300959295e-06, + "loss": 0.0086, + "num_input_tokens_seen": 30588616, + "step": 15622 + }, + { + "epoch": 2.07064280980782, + "grad_norm": 1.6707075834274292, + "learning_rate": 1.0942891534416872e-06, + "loss": 0.0138, + "num_input_tokens_seen": 30590376, + "step": 15623 + }, + { + "epoch": 2.070775347912525, + "grad_norm": 8.425698280334473, + "learning_rate": 1.0940021038905204e-06, + "loss": 0.0832, + "num_input_tokens_seen": 30592304, + "step": 15624 + }, + { + "epoch": 2.07090788601723, + "grad_norm": 1.5428061485290527, + "learning_rate": 1.0937150814479633e-06, + "loss": 0.0109, + "num_input_tokens_seen": 30594056, + "step": 15625 + }, + { + "epoch": 2.071040424121935, + "grad_norm": 0.11055504530668259, + "learning_rate": 1.0934280861195483e-06, + "loss": 0.0008, + "num_input_tokens_seen": 30596888, + "step": 15626 + }, + { + "epoch": 2.0711729622266404, + "grad_norm": 8.346808433532715, + "learning_rate": 1.0931411179108115e-06, + "loss": 0.0952, + "num_input_tokens_seen": 30598880, + "step": 15627 + }, + { + "epoch": 2.0713055003313454, + "grad_norm": 15.91106128692627, + "learning_rate": 1.0928541768272833e-06, + "loss": 0.2532, + "num_input_tokens_seen": 30601056, + "step": 15628 + }, + { + "epoch": 2.0714380384360505, + "grad_norm": 0.3768092095851898, + "learning_rate": 1.0925672628744986e-06, + "loss": 0.0016, + "num_input_tokens_seen": 30602640, + "step": 15629 + }, + { + "epoch": 2.0715705765407555, + "grad_norm": 6.7477192878723145, + "learning_rate": 1.0922803760579875e-06, + "loss": 0.116, + "num_input_tokens_seen": 30604736, + "step": 15630 + }, + { + "epoch": 2.0717031146454605, + "grad_norm": 3.7975878715515137, + "learning_rate": 1.0919935163832812e-06, + "loss": 0.0343, + "num_input_tokens_seen": 30607064, + "step": 15631 + }, + { + "epoch": 2.0718356527501656, + "grad_norm": 0.006137949414551258, + "learning_rate": 1.091706683855912e-06, + "loss": 0.0, + "num_input_tokens_seen": 30609384, + "step": 15632 + }, + { + "epoch": 2.0719681908548706, + "grad_norm": 5.424675464630127, + "learning_rate": 1.0914198784814098e-06, + "loss": 0.0522, + "num_input_tokens_seen": 30611400, + "step": 15633 + }, + { + "epoch": 2.072100728959576, + "grad_norm": 0.001382120419293642, + "learning_rate": 1.0911331002653028e-06, + "loss": 0.0, + "num_input_tokens_seen": 30613456, + "step": 15634 + }, + { + "epoch": 2.072233267064281, + "grad_norm": 11.579028129577637, + "learning_rate": 1.0908463492131227e-06, + "loss": 0.1353, + "num_input_tokens_seen": 30615264, + "step": 15635 + }, + { + "epoch": 2.072365805168986, + "grad_norm": 0.1011621356010437, + "learning_rate": 1.0905596253303968e-06, + "loss": 0.0005, + "num_input_tokens_seen": 30617640, + "step": 15636 + }, + { + "epoch": 2.072498343273691, + "grad_norm": 0.0009561904589645565, + "learning_rate": 1.0902729286226533e-06, + "loss": 0.0, + "num_input_tokens_seen": 30619496, + "step": 15637 + }, + { + "epoch": 2.0726308813783962, + "grad_norm": 6.552178382873535, + "learning_rate": 1.0899862590954208e-06, + "loss": 0.0836, + "num_input_tokens_seen": 30621368, + "step": 15638 + }, + { + "epoch": 2.0727634194831013, + "grad_norm": 10.103249549865723, + "learning_rate": 1.0896996167542248e-06, + "loss": 0.0541, + "num_input_tokens_seen": 30623288, + "step": 15639 + }, + { + "epoch": 2.0728959575878063, + "grad_norm": 0.0059693981893360615, + "learning_rate": 1.0894130016045945e-06, + "loss": 0.0, + "num_input_tokens_seen": 30624696, + "step": 15640 + }, + { + "epoch": 2.073028495692512, + "grad_norm": 0.0026253091637045145, + "learning_rate": 1.0891264136520543e-06, + "loss": 0.0, + "num_input_tokens_seen": 30626360, + "step": 15641 + }, + { + "epoch": 2.073161033797217, + "grad_norm": 0.027038373053073883, + "learning_rate": 1.0888398529021305e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30628784, + "step": 15642 + }, + { + "epoch": 2.073293571901922, + "grad_norm": 4.406414985656738, + "learning_rate": 1.0885533193603465e-06, + "loss": 0.0233, + "num_input_tokens_seen": 30630728, + "step": 15643 + }, + { + "epoch": 2.073426110006627, + "grad_norm": 10.237838745117188, + "learning_rate": 1.0882668130322286e-06, + "loss": 0.0855, + "num_input_tokens_seen": 30633304, + "step": 15644 + }, + { + "epoch": 2.073558648111332, + "grad_norm": 0.11725832521915436, + "learning_rate": 1.0879803339233012e-06, + "loss": 0.0004, + "num_input_tokens_seen": 30634888, + "step": 15645 + }, + { + "epoch": 2.073691186216037, + "grad_norm": 0.007014408241957426, + "learning_rate": 1.087693882039087e-06, + "loss": 0.0, + "num_input_tokens_seen": 30636456, + "step": 15646 + }, + { + "epoch": 2.073823724320742, + "grad_norm": 3.143859624862671, + "learning_rate": 1.087407457385109e-06, + "loss": 0.0237, + "num_input_tokens_seen": 30638368, + "step": 15647 + }, + { + "epoch": 2.0739562624254475, + "grad_norm": 0.0034100699704140425, + "learning_rate": 1.08712105996689e-06, + "loss": 0.0, + "num_input_tokens_seen": 30640248, + "step": 15648 + }, + { + "epoch": 2.0740888005301525, + "grad_norm": 0.0039882175624370575, + "learning_rate": 1.0868346897899504e-06, + "loss": 0.0, + "num_input_tokens_seen": 30641344, + "step": 15649 + }, + { + "epoch": 2.0742213386348576, + "grad_norm": 1.9186447858810425, + "learning_rate": 1.0865483468598131e-06, + "loss": 0.0037, + "num_input_tokens_seen": 30642760, + "step": 15650 + }, + { + "epoch": 2.0743538767395626, + "grad_norm": 7.453917026519775, + "learning_rate": 1.0862620311819995e-06, + "loss": 0.1281, + "num_input_tokens_seen": 30645128, + "step": 15651 + }, + { + "epoch": 2.0744864148442677, + "grad_norm": 0.030806077644228935, + "learning_rate": 1.0859757427620293e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30647232, + "step": 15652 + }, + { + "epoch": 2.0746189529489727, + "grad_norm": 1.731955647468567, + "learning_rate": 1.0856894816054226e-06, + "loss": 0.0041, + "num_input_tokens_seen": 30648952, + "step": 15653 + }, + { + "epoch": 2.0747514910536777, + "grad_norm": 0.0033239470794796944, + "learning_rate": 1.085403247717697e-06, + "loss": 0.0, + "num_input_tokens_seen": 30650640, + "step": 15654 + }, + { + "epoch": 2.074884029158383, + "grad_norm": 3.5409607887268066, + "learning_rate": 1.085117041104373e-06, + "loss": 0.0253, + "num_input_tokens_seen": 30652376, + "step": 15655 + }, + { + "epoch": 2.0750165672630883, + "grad_norm": 0.012350587174296379, + "learning_rate": 1.0848308617709692e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30653896, + "step": 15656 + }, + { + "epoch": 2.0751491053677933, + "grad_norm": 15.54128360748291, + "learning_rate": 1.0845447097230028e-06, + "loss": 0.1714, + "num_input_tokens_seen": 30656072, + "step": 15657 + }, + { + "epoch": 2.0752816434724983, + "grad_norm": 5.086503505706787, + "learning_rate": 1.0842585849659907e-06, + "loss": 0.0153, + "num_input_tokens_seen": 30658000, + "step": 15658 + }, + { + "epoch": 2.0754141815772034, + "grad_norm": 3.505248785018921, + "learning_rate": 1.0839724875054489e-06, + "loss": 0.0341, + "num_input_tokens_seen": 30660904, + "step": 15659 + }, + { + "epoch": 2.0755467196819084, + "grad_norm": 0.0017466357676312327, + "learning_rate": 1.0836864173468952e-06, + "loss": 0.0, + "num_input_tokens_seen": 30662008, + "step": 15660 + }, + { + "epoch": 2.0756792577866134, + "grad_norm": 8.56963062286377, + "learning_rate": 1.0834003744958437e-06, + "loss": 0.0618, + "num_input_tokens_seen": 30664192, + "step": 15661 + }, + { + "epoch": 2.075811795891319, + "grad_norm": 4.824145317077637, + "learning_rate": 1.0831143589578114e-06, + "loss": 0.0246, + "num_input_tokens_seen": 30666016, + "step": 15662 + }, + { + "epoch": 2.075944333996024, + "grad_norm": 2.6475234031677246, + "learning_rate": 1.0828283707383114e-06, + "loss": 0.0212, + "num_input_tokens_seen": 30668576, + "step": 15663 + }, + { + "epoch": 2.076076872100729, + "grad_norm": 0.005755844060331583, + "learning_rate": 1.0825424098428584e-06, + "loss": 0.0, + "num_input_tokens_seen": 30669928, + "step": 15664 + }, + { + "epoch": 2.076209410205434, + "grad_norm": 0.6506161689758301, + "learning_rate": 1.0822564762769642e-06, + "loss": 0.003, + "num_input_tokens_seen": 30671304, + "step": 15665 + }, + { + "epoch": 2.076341948310139, + "grad_norm": 6.855581283569336, + "learning_rate": 1.0819705700461444e-06, + "loss": 0.1124, + "num_input_tokens_seen": 30674312, + "step": 15666 + }, + { + "epoch": 2.076474486414844, + "grad_norm": 0.0036196091677993536, + "learning_rate": 1.0816846911559093e-06, + "loss": 0.0, + "num_input_tokens_seen": 30675928, + "step": 15667 + }, + { + "epoch": 2.076607024519549, + "grad_norm": 6.430718421936035, + "learning_rate": 1.0813988396117726e-06, + "loss": 0.0972, + "num_input_tokens_seen": 30678384, + "step": 15668 + }, + { + "epoch": 2.0767395626242546, + "grad_norm": 0.9290914535522461, + "learning_rate": 1.0811130154192449e-06, + "loss": 0.0041, + "num_input_tokens_seen": 30679816, + "step": 15669 + }, + { + "epoch": 2.0768721007289597, + "grad_norm": 0.01108595822006464, + "learning_rate": 1.0808272185838365e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30682344, + "step": 15670 + }, + { + "epoch": 2.0770046388336647, + "grad_norm": 0.23477958142757416, + "learning_rate": 1.080541449111059e-06, + "loss": 0.0009, + "num_input_tokens_seen": 30683728, + "step": 15671 + }, + { + "epoch": 2.0771371769383697, + "grad_norm": 0.3388933539390564, + "learning_rate": 1.0802557070064216e-06, + "loss": 0.0017, + "num_input_tokens_seen": 30685456, + "step": 15672 + }, + { + "epoch": 2.077269715043075, + "grad_norm": 0.6729179620742798, + "learning_rate": 1.0799699922754325e-06, + "loss": 0.0061, + "num_input_tokens_seen": 30687184, + "step": 15673 + }, + { + "epoch": 2.07740225314778, + "grad_norm": 3.9146993160247803, + "learning_rate": 1.0796843049236029e-06, + "loss": 0.0381, + "num_input_tokens_seen": 30689368, + "step": 15674 + }, + { + "epoch": 2.0775347912524853, + "grad_norm": 1.718767762184143, + "learning_rate": 1.0793986449564386e-06, + "loss": 0.0074, + "num_input_tokens_seen": 30692136, + "step": 15675 + }, + { + "epoch": 2.0776673293571903, + "grad_norm": 0.03927424177527428, + "learning_rate": 1.0791130123794493e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30693712, + "step": 15676 + }, + { + "epoch": 2.0777998674618954, + "grad_norm": 10.546500205993652, + "learning_rate": 1.0788274071981417e-06, + "loss": 0.1615, + "num_input_tokens_seen": 30696520, + "step": 15677 + }, + { + "epoch": 2.0779324055666004, + "grad_norm": 4.845993995666504, + "learning_rate": 1.0785418294180217e-06, + "loss": 0.0101, + "num_input_tokens_seen": 30698120, + "step": 15678 + }, + { + "epoch": 2.0780649436713055, + "grad_norm": 0.7104319930076599, + "learning_rate": 1.0782562790445952e-06, + "loss": 0.003, + "num_input_tokens_seen": 30700216, + "step": 15679 + }, + { + "epoch": 2.0781974817760105, + "grad_norm": 0.008045575581490993, + "learning_rate": 1.0779707560833683e-06, + "loss": 0.0, + "num_input_tokens_seen": 30702304, + "step": 15680 + }, + { + "epoch": 2.0783300198807155, + "grad_norm": 1.4252724647521973, + "learning_rate": 1.077685260539847e-06, + "loss": 0.0125, + "num_input_tokens_seen": 30704840, + "step": 15681 + }, + { + "epoch": 2.078462557985421, + "grad_norm": 0.8729941844940186, + "learning_rate": 1.0773997924195354e-06, + "loss": 0.0107, + "num_input_tokens_seen": 30706792, + "step": 15682 + }, + { + "epoch": 2.078595096090126, + "grad_norm": 0.23830415308475494, + "learning_rate": 1.077114351727937e-06, + "loss": 0.0008, + "num_input_tokens_seen": 30709048, + "step": 15683 + }, + { + "epoch": 2.078727634194831, + "grad_norm": 0.021350722759962082, + "learning_rate": 1.0768289384705547e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30710952, + "step": 15684 + }, + { + "epoch": 2.078860172299536, + "grad_norm": 6.688378810882568, + "learning_rate": 1.0765435526528921e-06, + "loss": 0.0804, + "num_input_tokens_seen": 30713296, + "step": 15685 + }, + { + "epoch": 2.078992710404241, + "grad_norm": 0.025096315890550613, + "learning_rate": 1.0762581942804528e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30715184, + "step": 15686 + }, + { + "epoch": 2.079125248508946, + "grad_norm": 0.0030500064603984356, + "learning_rate": 1.0759728633587376e-06, + "loss": 0.0, + "num_input_tokens_seen": 30717624, + "step": 15687 + }, + { + "epoch": 2.0792577866136512, + "grad_norm": 0.0016398221487179399, + "learning_rate": 1.0756875598932483e-06, + "loss": 0.0, + "num_input_tokens_seen": 30719416, + "step": 15688 + }, + { + "epoch": 2.0793903247183567, + "grad_norm": 0.026959480717778206, + "learning_rate": 1.075402283889485e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30720976, + "step": 15689 + }, + { + "epoch": 2.0795228628230618, + "grad_norm": 8.587615966796875, + "learning_rate": 1.075117035352948e-06, + "loss": 0.144, + "num_input_tokens_seen": 30722408, + "step": 15690 + }, + { + "epoch": 2.079655400927767, + "grad_norm": 26.740758895874023, + "learning_rate": 1.0748318142891373e-06, + "loss": 0.2816, + "num_input_tokens_seen": 30724808, + "step": 15691 + }, + { + "epoch": 2.079787939032472, + "grad_norm": 12.042533874511719, + "learning_rate": 1.0745466207035532e-06, + "loss": 0.1271, + "num_input_tokens_seen": 30727648, + "step": 15692 + }, + { + "epoch": 2.079920477137177, + "grad_norm": 0.0011338124750182033, + "learning_rate": 1.0742614546016938e-06, + "loss": 0.0, + "num_input_tokens_seen": 30729016, + "step": 15693 + }, + { + "epoch": 2.080053015241882, + "grad_norm": 9.849433898925781, + "learning_rate": 1.073976315989057e-06, + "loss": 0.2909, + "num_input_tokens_seen": 30731112, + "step": 15694 + }, + { + "epoch": 2.080185553346587, + "grad_norm": 0.049355655908584595, + "learning_rate": 1.0736912048711406e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30732456, + "step": 15695 + }, + { + "epoch": 2.0803180914512924, + "grad_norm": 0.24109269678592682, + "learning_rate": 1.073406121253441e-06, + "loss": 0.0004, + "num_input_tokens_seen": 30734696, + "step": 15696 + }, + { + "epoch": 2.0804506295559975, + "grad_norm": 5.256565570831299, + "learning_rate": 1.0731210651414554e-06, + "loss": 0.0737, + "num_input_tokens_seen": 30736760, + "step": 15697 + }, + { + "epoch": 2.0805831676607025, + "grad_norm": 3.112889528274536, + "learning_rate": 1.072836036540681e-06, + "loss": 0.0072, + "num_input_tokens_seen": 30738496, + "step": 15698 + }, + { + "epoch": 2.0807157057654075, + "grad_norm": 0.0024426295422017574, + "learning_rate": 1.072551035456612e-06, + "loss": 0.0, + "num_input_tokens_seen": 30739992, + "step": 15699 + }, + { + "epoch": 2.0808482438701126, + "grad_norm": 0.4003775715827942, + "learning_rate": 1.072266061894744e-06, + "loss": 0.003, + "num_input_tokens_seen": 30741296, + "step": 15700 + }, + { + "epoch": 2.0809807819748176, + "grad_norm": 0.0005502295680344105, + "learning_rate": 1.0719811158605705e-06, + "loss": 0.0, + "num_input_tokens_seen": 30742888, + "step": 15701 + }, + { + "epoch": 2.0811133200795227, + "grad_norm": 27.595748901367188, + "learning_rate": 1.0716961973595872e-06, + "loss": 0.3875, + "num_input_tokens_seen": 30745384, + "step": 15702 + }, + { + "epoch": 2.081245858184228, + "grad_norm": 0.013666347600519657, + "learning_rate": 1.0714113063972855e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30747200, + "step": 15703 + }, + { + "epoch": 2.081378396288933, + "grad_norm": 9.846649169921875, + "learning_rate": 1.0711264429791603e-06, + "loss": 0.1984, + "num_input_tokens_seen": 30749872, + "step": 15704 + }, + { + "epoch": 2.081510934393638, + "grad_norm": 0.540513277053833, + "learning_rate": 1.070841607110703e-06, + "loss": 0.0012, + "num_input_tokens_seen": 30752200, + "step": 15705 + }, + { + "epoch": 2.0816434724983433, + "grad_norm": 0.006293957587331533, + "learning_rate": 1.0705567987974044e-06, + "loss": 0.0, + "num_input_tokens_seen": 30754784, + "step": 15706 + }, + { + "epoch": 2.0817760106030483, + "grad_norm": 8.046182632446289, + "learning_rate": 1.070272018044758e-06, + "loss": 0.0724, + "num_input_tokens_seen": 30756776, + "step": 15707 + }, + { + "epoch": 2.0819085487077533, + "grad_norm": 5.029483795166016, + "learning_rate": 1.0699872648582524e-06, + "loss": 0.0823, + "num_input_tokens_seen": 30758872, + "step": 15708 + }, + { + "epoch": 2.082041086812459, + "grad_norm": 7.679155349731445, + "learning_rate": 1.0697025392433796e-06, + "loss": 0.2164, + "num_input_tokens_seen": 30761368, + "step": 15709 + }, + { + "epoch": 2.082173624917164, + "grad_norm": 0.005175208672881126, + "learning_rate": 1.0694178412056286e-06, + "loss": 0.0, + "num_input_tokens_seen": 30763600, + "step": 15710 + }, + { + "epoch": 2.082306163021869, + "grad_norm": 5.752948760986328, + "learning_rate": 1.0691331707504876e-06, + "loss": 0.1721, + "num_input_tokens_seen": 30766288, + "step": 15711 + }, + { + "epoch": 2.082438701126574, + "grad_norm": 0.7267287969589233, + "learning_rate": 1.0688485278834473e-06, + "loss": 0.0034, + "num_input_tokens_seen": 30768656, + "step": 15712 + }, + { + "epoch": 2.082571239231279, + "grad_norm": 0.00043109190301038325, + "learning_rate": 1.0685639126099946e-06, + "loss": 0.0, + "num_input_tokens_seen": 30770320, + "step": 15713 + }, + { + "epoch": 2.082703777335984, + "grad_norm": 0.012856950983405113, + "learning_rate": 1.068279324935616e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30771712, + "step": 15714 + }, + { + "epoch": 2.082836315440689, + "grad_norm": 0.01941719278693199, + "learning_rate": 1.067994764865801e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30774136, + "step": 15715 + }, + { + "epoch": 2.0829688535453945, + "grad_norm": 8.578277587890625, + "learning_rate": 1.0677102324060337e-06, + "loss": 0.0513, + "num_input_tokens_seen": 30776848, + "step": 15716 + }, + { + "epoch": 2.0831013916500996, + "grad_norm": 0.0035853462759405375, + "learning_rate": 1.0674257275618022e-06, + "loss": 0.0, + "num_input_tokens_seen": 30778608, + "step": 15717 + }, + { + "epoch": 2.0832339297548046, + "grad_norm": 1.1351906061172485, + "learning_rate": 1.0671412503385909e-06, + "loss": 0.0083, + "num_input_tokens_seen": 30780360, + "step": 15718 + }, + { + "epoch": 2.0833664678595096, + "grad_norm": 0.0022894712164998055, + "learning_rate": 1.0668568007418847e-06, + "loss": 0.0, + "num_input_tokens_seen": 30782232, + "step": 15719 + }, + { + "epoch": 2.0834990059642147, + "grad_norm": 17.818450927734375, + "learning_rate": 1.0665723787771674e-06, + "loss": 0.0557, + "num_input_tokens_seen": 30784704, + "step": 15720 + }, + { + "epoch": 2.0836315440689197, + "grad_norm": 0.0014743497595191002, + "learning_rate": 1.066287984449924e-06, + "loss": 0.0, + "num_input_tokens_seen": 30786176, + "step": 15721 + }, + { + "epoch": 2.0837640821736247, + "grad_norm": 0.0021800766699016094, + "learning_rate": 1.0660036177656368e-06, + "loss": 0.0, + "num_input_tokens_seen": 30788672, + "step": 15722 + }, + { + "epoch": 2.0838966202783302, + "grad_norm": 0.005285558290779591, + "learning_rate": 1.0657192787297899e-06, + "loss": 0.0, + "num_input_tokens_seen": 30790512, + "step": 15723 + }, + { + "epoch": 2.0840291583830353, + "grad_norm": 7.095405101776123, + "learning_rate": 1.0654349673478647e-06, + "loss": 0.0623, + "num_input_tokens_seen": 30793192, + "step": 15724 + }, + { + "epoch": 2.0841616964877403, + "grad_norm": 12.228784561157227, + "learning_rate": 1.0651506836253432e-06, + "loss": 0.0968, + "num_input_tokens_seen": 30795608, + "step": 15725 + }, + { + "epoch": 2.0842942345924453, + "grad_norm": 7.088786602020264, + "learning_rate": 1.064866427567705e-06, + "loss": 0.0733, + "num_input_tokens_seen": 30797592, + "step": 15726 + }, + { + "epoch": 2.0844267726971504, + "grad_norm": 1.7811346054077148, + "learning_rate": 1.0645821991804325e-06, + "loss": 0.0037, + "num_input_tokens_seen": 30799416, + "step": 15727 + }, + { + "epoch": 2.0845593108018554, + "grad_norm": 0.026214804500341415, + "learning_rate": 1.0642979984690062e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30800784, + "step": 15728 + }, + { + "epoch": 2.0846918489065605, + "grad_norm": 0.023870134726166725, + "learning_rate": 1.0640138254389051e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30803352, + "step": 15729 + }, + { + "epoch": 2.084824387011266, + "grad_norm": 8.722799301147461, + "learning_rate": 1.063729680095608e-06, + "loss": 0.0493, + "num_input_tokens_seen": 30805408, + "step": 15730 + }, + { + "epoch": 2.084956925115971, + "grad_norm": 13.073636054992676, + "learning_rate": 1.0634455624445927e-06, + "loss": 0.1758, + "num_input_tokens_seen": 30807136, + "step": 15731 + }, + { + "epoch": 2.085089463220676, + "grad_norm": 3.7375450134277344, + "learning_rate": 1.0631614724913387e-06, + "loss": 0.0738, + "num_input_tokens_seen": 30808688, + "step": 15732 + }, + { + "epoch": 2.085222001325381, + "grad_norm": 2.272077798843384, + "learning_rate": 1.062877410241322e-06, + "loss": 0.0223, + "num_input_tokens_seen": 30810712, + "step": 15733 + }, + { + "epoch": 2.085354539430086, + "grad_norm": 4.507503509521484, + "learning_rate": 1.0625933757000209e-06, + "loss": 0.0783, + "num_input_tokens_seen": 30813480, + "step": 15734 + }, + { + "epoch": 2.085487077534791, + "grad_norm": 6.6946702003479, + "learning_rate": 1.0623093688729114e-06, + "loss": 0.0495, + "num_input_tokens_seen": 30815104, + "step": 15735 + }, + { + "epoch": 2.085619615639496, + "grad_norm": 10.333877563476562, + "learning_rate": 1.062025389765469e-06, + "loss": 0.1036, + "num_input_tokens_seen": 30816592, + "step": 15736 + }, + { + "epoch": 2.0857521537442016, + "grad_norm": 0.027525320649147034, + "learning_rate": 1.0617414383831682e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30818584, + "step": 15737 + }, + { + "epoch": 2.0858846918489067, + "grad_norm": 0.006066696718335152, + "learning_rate": 1.0614575147314845e-06, + "loss": 0.0, + "num_input_tokens_seen": 30820576, + "step": 15738 + }, + { + "epoch": 2.0860172299536117, + "grad_norm": 0.0009308741427958012, + "learning_rate": 1.0611736188158934e-06, + "loss": 0.0, + "num_input_tokens_seen": 30821864, + "step": 15739 + }, + { + "epoch": 2.0861497680583168, + "grad_norm": 0.46596428751945496, + "learning_rate": 1.0608897506418676e-06, + "loss": 0.0012, + "num_input_tokens_seen": 30823344, + "step": 15740 + }, + { + "epoch": 2.086282306163022, + "grad_norm": 0.00742074241861701, + "learning_rate": 1.0606059102148799e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30825264, + "step": 15741 + }, + { + "epoch": 2.086414844267727, + "grad_norm": 0.006214355118572712, + "learning_rate": 1.0603220975404024e-06, + "loss": 0.0, + "num_input_tokens_seen": 30827480, + "step": 15742 + }, + { + "epoch": 2.086547382372432, + "grad_norm": 0.006490678060799837, + "learning_rate": 1.060038312623909e-06, + "loss": 0.0, + "num_input_tokens_seen": 30829008, + "step": 15743 + }, + { + "epoch": 2.0866799204771374, + "grad_norm": 1.5100146532058716, + "learning_rate": 1.0597545554708696e-06, + "loss": 0.0117, + "num_input_tokens_seen": 30830976, + "step": 15744 + }, + { + "epoch": 2.0868124585818424, + "grad_norm": 3.784529447555542, + "learning_rate": 1.0594708260867567e-06, + "loss": 0.0677, + "num_input_tokens_seen": 30832760, + "step": 15745 + }, + { + "epoch": 2.0869449966865474, + "grad_norm": 0.9434930682182312, + "learning_rate": 1.05918712447704e-06, + "loss": 0.014, + "num_input_tokens_seen": 30834480, + "step": 15746 + }, + { + "epoch": 2.0870775347912525, + "grad_norm": 0.37713301181793213, + "learning_rate": 1.0589034506471888e-06, + "loss": 0.0017, + "num_input_tokens_seen": 30835880, + "step": 15747 + }, + { + "epoch": 2.0872100728959575, + "grad_norm": 4.0520477294921875, + "learning_rate": 1.0586198046026739e-06, + "loss": 0.0194, + "num_input_tokens_seen": 30838720, + "step": 15748 + }, + { + "epoch": 2.0873426110006625, + "grad_norm": 0.01979968696832657, + "learning_rate": 1.0583361863489637e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30841552, + "step": 15749 + }, + { + "epoch": 2.0874751491053676, + "grad_norm": 5.28080940246582, + "learning_rate": 1.0580525958915252e-06, + "loss": 0.0282, + "num_input_tokens_seen": 30843368, + "step": 15750 + }, + { + "epoch": 2.087607687210073, + "grad_norm": 0.10151394456624985, + "learning_rate": 1.0577690332358284e-06, + "loss": 0.0009, + "num_input_tokens_seen": 30845472, + "step": 15751 + }, + { + "epoch": 2.087740225314778, + "grad_norm": 7.579641342163086, + "learning_rate": 1.0574854983873399e-06, + "loss": 0.0678, + "num_input_tokens_seen": 30846880, + "step": 15752 + }, + { + "epoch": 2.087872763419483, + "grad_norm": 2.807560682296753, + "learning_rate": 1.057201991351525e-06, + "loss": 0.0331, + "num_input_tokens_seen": 30848208, + "step": 15753 + }, + { + "epoch": 2.088005301524188, + "grad_norm": 0.01925724372267723, + "learning_rate": 1.0569185121338518e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30849456, + "step": 15754 + }, + { + "epoch": 2.088137839628893, + "grad_norm": 0.04092009738087654, + "learning_rate": 1.0566350607397855e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30851448, + "step": 15755 + }, + { + "epoch": 2.0882703777335982, + "grad_norm": 0.0008798489579930902, + "learning_rate": 1.0563516371747899e-06, + "loss": 0.0, + "num_input_tokens_seen": 30852776, + "step": 15756 + }, + { + "epoch": 2.0884029158383033, + "grad_norm": 4.096704483032227, + "learning_rate": 1.0560682414443315e-06, + "loss": 0.0821, + "num_input_tokens_seen": 30854656, + "step": 15757 + }, + { + "epoch": 2.0885354539430088, + "grad_norm": 5.232241153717041, + "learning_rate": 1.055784873553873e-06, + "loss": 0.0692, + "num_input_tokens_seen": 30857176, + "step": 15758 + }, + { + "epoch": 2.088667992047714, + "grad_norm": 0.0029214764945209026, + "learning_rate": 1.0555015335088786e-06, + "loss": 0.0, + "num_input_tokens_seen": 30858496, + "step": 15759 + }, + { + "epoch": 2.088800530152419, + "grad_norm": 0.010639924556016922, + "learning_rate": 1.0552182213148119e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30861792, + "step": 15760 + }, + { + "epoch": 2.088933068257124, + "grad_norm": 0.005878801457583904, + "learning_rate": 1.0549349369771333e-06, + "loss": 0.0, + "num_input_tokens_seen": 30863576, + "step": 15761 + }, + { + "epoch": 2.089065606361829, + "grad_norm": 0.4838116765022278, + "learning_rate": 1.0546516805013068e-06, + "loss": 0.0034, + "num_input_tokens_seen": 30865000, + "step": 15762 + }, + { + "epoch": 2.089198144466534, + "grad_norm": 4.713444232940674, + "learning_rate": 1.0543684518927924e-06, + "loss": 0.0361, + "num_input_tokens_seen": 30866720, + "step": 15763 + }, + { + "epoch": 2.0893306825712394, + "grad_norm": 1.145693302154541, + "learning_rate": 1.0540852511570524e-06, + "loss": 0.0094, + "num_input_tokens_seen": 30868248, + "step": 15764 + }, + { + "epoch": 2.0894632206759445, + "grad_norm": 0.01238220650702715, + "learning_rate": 1.053802078299546e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30869544, + "step": 15765 + }, + { + "epoch": 2.0895957587806495, + "grad_norm": 10.222021102905273, + "learning_rate": 1.0535189333257334e-06, + "loss": 0.1139, + "num_input_tokens_seen": 30872752, + "step": 15766 + }, + { + "epoch": 2.0897282968853546, + "grad_norm": 8.535482406616211, + "learning_rate": 1.0532358162410728e-06, + "loss": 0.0782, + "num_input_tokens_seen": 30874544, + "step": 15767 + }, + { + "epoch": 2.0898608349900596, + "grad_norm": 0.03400085121393204, + "learning_rate": 1.0529527270510236e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30875968, + "step": 15768 + }, + { + "epoch": 2.0899933730947646, + "grad_norm": 0.1607217788696289, + "learning_rate": 1.0526696657610452e-06, + "loss": 0.0003, + "num_input_tokens_seen": 30877944, + "step": 15769 + }, + { + "epoch": 2.0901259111994697, + "grad_norm": 4.264970779418945, + "learning_rate": 1.0523866323765939e-06, + "loss": 0.0465, + "num_input_tokens_seen": 30879688, + "step": 15770 + }, + { + "epoch": 2.090258449304175, + "grad_norm": 0.02093798853456974, + "learning_rate": 1.0521036269031272e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30881104, + "step": 15771 + }, + { + "epoch": 2.09039098740888, + "grad_norm": 0.010911175981163979, + "learning_rate": 1.0518206493461011e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30882808, + "step": 15772 + }, + { + "epoch": 2.0905235255135852, + "grad_norm": 0.8723084926605225, + "learning_rate": 1.0515376997109714e-06, + "loss": 0.0039, + "num_input_tokens_seen": 30885168, + "step": 15773 + }, + { + "epoch": 2.0906560636182903, + "grad_norm": 0.6231458783149719, + "learning_rate": 1.0512547780031936e-06, + "loss": 0.0034, + "num_input_tokens_seen": 30886416, + "step": 15774 + }, + { + "epoch": 2.0907886017229953, + "grad_norm": 1.060481309890747, + "learning_rate": 1.0509718842282244e-06, + "loss": 0.0116, + "num_input_tokens_seen": 30888296, + "step": 15775 + }, + { + "epoch": 2.0909211398277003, + "grad_norm": 0.2933771312236786, + "learning_rate": 1.050689018391517e-06, + "loss": 0.0019, + "num_input_tokens_seen": 30889768, + "step": 15776 + }, + { + "epoch": 2.0910536779324054, + "grad_norm": 0.39020004868507385, + "learning_rate": 1.050406180498525e-06, + "loss": 0.003, + "num_input_tokens_seen": 30892304, + "step": 15777 + }, + { + "epoch": 2.091186216037111, + "grad_norm": 0.008754143491387367, + "learning_rate": 1.0501233705547017e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30893848, + "step": 15778 + }, + { + "epoch": 2.091318754141816, + "grad_norm": 3.399904489517212, + "learning_rate": 1.049840588565499e-06, + "loss": 0.0442, + "num_input_tokens_seen": 30895400, + "step": 15779 + }, + { + "epoch": 2.091451292246521, + "grad_norm": 9.70892333984375, + "learning_rate": 1.0495578345363707e-06, + "loss": 0.1642, + "num_input_tokens_seen": 30896888, + "step": 15780 + }, + { + "epoch": 2.091583830351226, + "grad_norm": 0.03751688078045845, + "learning_rate": 1.0492751084727684e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30898304, + "step": 15781 + }, + { + "epoch": 2.091716368455931, + "grad_norm": 0.0934995487332344, + "learning_rate": 1.0489924103801433e-06, + "loss": 0.0003, + "num_input_tokens_seen": 30900344, + "step": 15782 + }, + { + "epoch": 2.091848906560636, + "grad_norm": 2.26963210105896, + "learning_rate": 1.0487097402639452e-06, + "loss": 0.0145, + "num_input_tokens_seen": 30903096, + "step": 15783 + }, + { + "epoch": 2.091981444665341, + "grad_norm": 2.6690514087677, + "learning_rate": 1.0484270981296236e-06, + "loss": 0.021, + "num_input_tokens_seen": 30904928, + "step": 15784 + }, + { + "epoch": 2.0921139827700466, + "grad_norm": 0.009096693247556686, + "learning_rate": 1.0481444839826288e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30906928, + "step": 15785 + }, + { + "epoch": 2.0922465208747516, + "grad_norm": 0.009975298307836056, + "learning_rate": 1.047861897828411e-06, + "loss": 0.0, + "num_input_tokens_seen": 30908728, + "step": 15786 + }, + { + "epoch": 2.0923790589794566, + "grad_norm": 0.12505173683166504, + "learning_rate": 1.0475793396724176e-06, + "loss": 0.0008, + "num_input_tokens_seen": 30911792, + "step": 15787 + }, + { + "epoch": 2.0925115970841617, + "grad_norm": 0.007718898355960846, + "learning_rate": 1.0472968095200967e-06, + "loss": 0.0, + "num_input_tokens_seen": 30913344, + "step": 15788 + }, + { + "epoch": 2.0926441351888667, + "grad_norm": 0.005531830247491598, + "learning_rate": 1.0470143073768943e-06, + "loss": 0.0, + "num_input_tokens_seen": 30915392, + "step": 15789 + }, + { + "epoch": 2.0927766732935718, + "grad_norm": 1.8126064538955688, + "learning_rate": 1.0467318332482593e-06, + "loss": 0.0171, + "num_input_tokens_seen": 30917344, + "step": 15790 + }, + { + "epoch": 2.092909211398277, + "grad_norm": 2.6063647270202637, + "learning_rate": 1.0464493871396363e-06, + "loss": 0.019, + "num_input_tokens_seen": 30919432, + "step": 15791 + }, + { + "epoch": 2.0930417495029823, + "grad_norm": 15.73050594329834, + "learning_rate": 1.0461669690564727e-06, + "loss": 0.0927, + "num_input_tokens_seen": 30922680, + "step": 15792 + }, + { + "epoch": 2.0931742876076873, + "grad_norm": 0.19147707521915436, + "learning_rate": 1.045884579004213e-06, + "loss": 0.0007, + "num_input_tokens_seen": 30924400, + "step": 15793 + }, + { + "epoch": 2.0933068257123923, + "grad_norm": 0.904897153377533, + "learning_rate": 1.0456022169883009e-06, + "loss": 0.0036, + "num_input_tokens_seen": 30926376, + "step": 15794 + }, + { + "epoch": 2.0934393638170974, + "grad_norm": 1.8791215419769287, + "learning_rate": 1.0453198830141823e-06, + "loss": 0.0051, + "num_input_tokens_seen": 30928608, + "step": 15795 + }, + { + "epoch": 2.0935719019218024, + "grad_norm": 8.004446029663086, + "learning_rate": 1.0450375770872997e-06, + "loss": 0.2814, + "num_input_tokens_seen": 30930464, + "step": 15796 + }, + { + "epoch": 2.0937044400265075, + "grad_norm": 0.0018178971949964762, + "learning_rate": 1.0447552992130956e-06, + "loss": 0.0, + "num_input_tokens_seen": 30931936, + "step": 15797 + }, + { + "epoch": 2.093836978131213, + "grad_norm": 2.8168511390686035, + "learning_rate": 1.044473049397014e-06, + "loss": 0.0341, + "num_input_tokens_seen": 30934056, + "step": 15798 + }, + { + "epoch": 2.093969516235918, + "grad_norm": 0.004515695385634899, + "learning_rate": 1.0441908276444953e-06, + "loss": 0.0, + "num_input_tokens_seen": 30937272, + "step": 15799 + }, + { + "epoch": 2.094102054340623, + "grad_norm": 3.269775867462158, + "learning_rate": 1.0439086339609828e-06, + "loss": 0.0326, + "num_input_tokens_seen": 30939704, + "step": 15800 + }, + { + "epoch": 2.094234592445328, + "grad_norm": 0.002671262715011835, + "learning_rate": 1.0436264683519163e-06, + "loss": 0.0, + "num_input_tokens_seen": 30941200, + "step": 15801 + }, + { + "epoch": 2.094367130550033, + "grad_norm": 2.299652576446533, + "learning_rate": 1.043344330822736e-06, + "loss": 0.0179, + "num_input_tokens_seen": 30943296, + "step": 15802 + }, + { + "epoch": 2.094499668654738, + "grad_norm": 0.0012987097725272179, + "learning_rate": 1.043062221378881e-06, + "loss": 0.0, + "num_input_tokens_seen": 30944720, + "step": 15803 + }, + { + "epoch": 2.094632206759443, + "grad_norm": 8.830548286437988, + "learning_rate": 1.0427801400257917e-06, + "loss": 0.0977, + "num_input_tokens_seen": 30947608, + "step": 15804 + }, + { + "epoch": 2.0947647448641487, + "grad_norm": 10.112425804138184, + "learning_rate": 1.042498086768907e-06, + "loss": 0.142, + "num_input_tokens_seen": 30949600, + "step": 15805 + }, + { + "epoch": 2.0948972829688537, + "grad_norm": 0.003482073312625289, + "learning_rate": 1.0422160616136653e-06, + "loss": 0.0, + "num_input_tokens_seen": 30951128, + "step": 15806 + }, + { + "epoch": 2.0950298210735587, + "grad_norm": 0.055163461714982986, + "learning_rate": 1.0419340645655032e-06, + "loss": 0.0004, + "num_input_tokens_seen": 30952920, + "step": 15807 + }, + { + "epoch": 2.0951623591782638, + "grad_norm": 5.391628742218018, + "learning_rate": 1.0416520956298585e-06, + "loss": 0.1044, + "num_input_tokens_seen": 30954656, + "step": 15808 + }, + { + "epoch": 2.095294897282969, + "grad_norm": 0.007142943330109119, + "learning_rate": 1.0413701548121664e-06, + "loss": 0.0, + "num_input_tokens_seen": 30956592, + "step": 15809 + }, + { + "epoch": 2.095427435387674, + "grad_norm": 4.809557914733887, + "learning_rate": 1.041088242117864e-06, + "loss": 0.0855, + "num_input_tokens_seen": 30959008, + "step": 15810 + }, + { + "epoch": 2.095559973492379, + "grad_norm": 5.410335063934326, + "learning_rate": 1.0408063575523874e-06, + "loss": 0.0861, + "num_input_tokens_seen": 30961624, + "step": 15811 + }, + { + "epoch": 2.0956925115970844, + "grad_norm": 0.03331515192985535, + "learning_rate": 1.040524501121171e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30964056, + "step": 15812 + }, + { + "epoch": 2.0958250497017894, + "grad_norm": 0.6891270875930786, + "learning_rate": 1.0402426728296495e-06, + "loss": 0.0035, + "num_input_tokens_seen": 30966128, + "step": 15813 + }, + { + "epoch": 2.0959575878064944, + "grad_norm": 5.943944931030273, + "learning_rate": 1.039960872683255e-06, + "loss": 0.035, + "num_input_tokens_seen": 30967760, + "step": 15814 + }, + { + "epoch": 2.0960901259111995, + "grad_norm": 1.827434778213501, + "learning_rate": 1.0396791006874218e-06, + "loss": 0.0212, + "num_input_tokens_seen": 30970392, + "step": 15815 + }, + { + "epoch": 2.0962226640159045, + "grad_norm": 10.489002227783203, + "learning_rate": 1.0393973568475843e-06, + "loss": 0.0826, + "num_input_tokens_seen": 30971784, + "step": 15816 + }, + { + "epoch": 2.0963552021206096, + "grad_norm": 0.013517944142222404, + "learning_rate": 1.0391156411691733e-06, + "loss": 0.0001, + "num_input_tokens_seen": 30973392, + "step": 15817 + }, + { + "epoch": 2.0964877402253146, + "grad_norm": 8.6538667678833, + "learning_rate": 1.0388339536576206e-06, + "loss": 0.1636, + "num_input_tokens_seen": 30975472, + "step": 15818 + }, + { + "epoch": 2.09662027833002, + "grad_norm": 8.85480785369873, + "learning_rate": 1.0385522943183572e-06, + "loss": 0.1616, + "num_input_tokens_seen": 30977696, + "step": 15819 + }, + { + "epoch": 2.096752816434725, + "grad_norm": 0.04853692278265953, + "learning_rate": 1.038270663156813e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30980488, + "step": 15820 + }, + { + "epoch": 2.09688535453943, + "grad_norm": 0.42406994104385376, + "learning_rate": 1.0379890601784184e-06, + "loss": 0.0031, + "num_input_tokens_seen": 30982168, + "step": 15821 + }, + { + "epoch": 2.097017892644135, + "grad_norm": 8.757364273071289, + "learning_rate": 1.0377074853886045e-06, + "loss": 0.202, + "num_input_tokens_seen": 30984176, + "step": 15822 + }, + { + "epoch": 2.09715043074884, + "grad_norm": 0.6955289840698242, + "learning_rate": 1.0374259387927992e-06, + "loss": 0.0037, + "num_input_tokens_seen": 30985968, + "step": 15823 + }, + { + "epoch": 2.0972829688535453, + "grad_norm": 4.125407695770264, + "learning_rate": 1.0371444203964307e-06, + "loss": 0.0324, + "num_input_tokens_seen": 30987344, + "step": 15824 + }, + { + "epoch": 2.0974155069582503, + "grad_norm": 1.7335323095321655, + "learning_rate": 1.036862930204926e-06, + "loss": 0.0097, + "num_input_tokens_seen": 30990208, + "step": 15825 + }, + { + "epoch": 2.097548045062956, + "grad_norm": 0.03396067023277283, + "learning_rate": 1.0365814682237142e-06, + "loss": 0.0002, + "num_input_tokens_seen": 30992280, + "step": 15826 + }, + { + "epoch": 2.097680583167661, + "grad_norm": 0.005177884362637997, + "learning_rate": 1.0363000344582205e-06, + "loss": 0.0, + "num_input_tokens_seen": 30993784, + "step": 15827 + }, + { + "epoch": 2.097813121272366, + "grad_norm": 7.927840709686279, + "learning_rate": 1.0360186289138727e-06, + "loss": 0.1362, + "num_input_tokens_seen": 30995592, + "step": 15828 + }, + { + "epoch": 2.097945659377071, + "grad_norm": 6.203861236572266, + "learning_rate": 1.0357372515960956e-06, + "loss": 0.057, + "num_input_tokens_seen": 30997440, + "step": 15829 + }, + { + "epoch": 2.098078197481776, + "grad_norm": 0.4799310266971588, + "learning_rate": 1.0354559025103136e-06, + "loss": 0.0013, + "num_input_tokens_seen": 30999304, + "step": 15830 + }, + { + "epoch": 2.098210735586481, + "grad_norm": 6.7321906089782715, + "learning_rate": 1.035174581661953e-06, + "loss": 0.0681, + "num_input_tokens_seen": 31001368, + "step": 15831 + }, + { + "epoch": 2.098343273691186, + "grad_norm": 0.24848414957523346, + "learning_rate": 1.0348932890564373e-06, + "loss": 0.0013, + "num_input_tokens_seen": 31002600, + "step": 15832 + }, + { + "epoch": 2.0984758117958915, + "grad_norm": 6.72873067855835, + "learning_rate": 1.0346120246991884e-06, + "loss": 0.0252, + "num_input_tokens_seen": 31004312, + "step": 15833 + }, + { + "epoch": 2.0986083499005965, + "grad_norm": 0.00857654120773077, + "learning_rate": 1.0343307885956315e-06, + "loss": 0.0, + "num_input_tokens_seen": 31005856, + "step": 15834 + }, + { + "epoch": 2.0987408880053016, + "grad_norm": 8.274728775024414, + "learning_rate": 1.0340495807511882e-06, + "loss": 0.2256, + "num_input_tokens_seen": 31007416, + "step": 15835 + }, + { + "epoch": 2.0988734261100066, + "grad_norm": 0.3416224420070648, + "learning_rate": 1.0337684011712796e-06, + "loss": 0.0009, + "num_input_tokens_seen": 31009888, + "step": 15836 + }, + { + "epoch": 2.0990059642147116, + "grad_norm": 0.013166550546884537, + "learning_rate": 1.0334872498613285e-06, + "loss": 0.0001, + "num_input_tokens_seen": 31011936, + "step": 15837 + }, + { + "epoch": 2.0991385023194167, + "grad_norm": 5.158039093017578, + "learning_rate": 1.033206126826754e-06, + "loss": 0.0488, + "num_input_tokens_seen": 31013952, + "step": 15838 + }, + { + "epoch": 2.0992710404241217, + "grad_norm": 14.018601417541504, + "learning_rate": 1.032925032072978e-06, + "loss": 0.1567, + "num_input_tokens_seen": 31015928, + "step": 15839 + }, + { + "epoch": 2.099403578528827, + "grad_norm": 0.015515340492129326, + "learning_rate": 1.03264396560542e-06, + "loss": 0.0001, + "num_input_tokens_seen": 31017544, + "step": 15840 + }, + { + "epoch": 2.0995361166335322, + "grad_norm": 11.384153366088867, + "learning_rate": 1.032362927429497e-06, + "loss": 0.0522, + "num_input_tokens_seen": 31018744, + "step": 15841 + }, + { + "epoch": 2.0996686547382373, + "grad_norm": 0.00803851056843996, + "learning_rate": 1.0320819175506305e-06, + "loss": 0.0, + "num_input_tokens_seen": 31020168, + "step": 15842 + }, + { + "epoch": 2.0998011928429423, + "grad_norm": 1.1091930866241455, + "learning_rate": 1.0318009359742373e-06, + "loss": 0.0043, + "num_input_tokens_seen": 31021616, + "step": 15843 + }, + { + "epoch": 2.0999337309476473, + "grad_norm": 3.204047918319702, + "learning_rate": 1.031519982705734e-06, + "loss": 0.0057, + "num_input_tokens_seen": 31023720, + "step": 15844 + }, + { + "epoch": 2.1000662690523524, + "grad_norm": 1.6260164976119995, + "learning_rate": 1.0312390577505393e-06, + "loss": 0.0068, + "num_input_tokens_seen": 31025744, + "step": 15845 + }, + { + "epoch": 2.100198807157058, + "grad_norm": 3.710658550262451, + "learning_rate": 1.0309581611140681e-06, + "loss": 0.0631, + "num_input_tokens_seen": 31027752, + "step": 15846 + }, + { + "epoch": 2.100331345261763, + "grad_norm": 8.605237007141113, + "learning_rate": 1.0306772928017378e-06, + "loss": 0.1073, + "num_input_tokens_seen": 31030376, + "step": 15847 + }, + { + "epoch": 2.100463883366468, + "grad_norm": 3.5550413131713867, + "learning_rate": 1.0303964528189629e-06, + "loss": 0.0655, + "num_input_tokens_seen": 31033416, + "step": 15848 + }, + { + "epoch": 2.100596421471173, + "grad_norm": 0.021615002304315567, + "learning_rate": 1.0301156411711582e-06, + "loss": 0.0001, + "num_input_tokens_seen": 31035224, + "step": 15849 + }, + { + "epoch": 2.100728959575878, + "grad_norm": 0.008738775737583637, + "learning_rate": 1.029834857863737e-06, + "loss": 0.0, + "num_input_tokens_seen": 31036480, + "step": 15850 + }, + { + "epoch": 2.100861497680583, + "grad_norm": 0.017540819942951202, + "learning_rate": 1.0295541029021142e-06, + "loss": 0.0001, + "num_input_tokens_seen": 31037800, + "step": 15851 + }, + { + "epoch": 2.100994035785288, + "grad_norm": 1.212479829788208, + "learning_rate": 1.0292733762917033e-06, + "loss": 0.0077, + "num_input_tokens_seen": 31039352, + "step": 15852 + }, + { + "epoch": 2.1011265738899936, + "grad_norm": 2.112584114074707, + "learning_rate": 1.0289926780379165e-06, + "loss": 0.0166, + "num_input_tokens_seen": 31041736, + "step": 15853 + }, + { + "epoch": 2.1012591119946986, + "grad_norm": 0.16184380650520325, + "learning_rate": 1.0287120081461654e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31043096, + "step": 15854 + }, + { + "epoch": 2.1013916500994037, + "grad_norm": 0.008505909703671932, + "learning_rate": 1.028431366621862e-06, + "loss": 0.0001, + "num_input_tokens_seen": 31044904, + "step": 15855 + }, + { + "epoch": 2.1015241882041087, + "grad_norm": 1.3323477506637573, + "learning_rate": 1.0281507534704164e-06, + "loss": 0.0119, + "num_input_tokens_seen": 31046952, + "step": 15856 + }, + { + "epoch": 2.1016567263088137, + "grad_norm": 2.7004241943359375, + "learning_rate": 1.0278701686972391e-06, + "loss": 0.0187, + "num_input_tokens_seen": 31048984, + "step": 15857 + }, + { + "epoch": 2.1017892644135188, + "grad_norm": 0.1896669864654541, + "learning_rate": 1.0275896123077416e-06, + "loss": 0.0005, + "num_input_tokens_seen": 31050856, + "step": 15858 + }, + { + "epoch": 2.101921802518224, + "grad_norm": 9.643930435180664, + "learning_rate": 1.0273090843073324e-06, + "loss": 0.195, + "num_input_tokens_seen": 31053096, + "step": 15859 + }, + { + "epoch": 2.1020543406229293, + "grad_norm": 2.5604121685028076, + "learning_rate": 1.02702858470142e-06, + "loss": 0.0164, + "num_input_tokens_seen": 31054928, + "step": 15860 + }, + { + "epoch": 2.1021868787276343, + "grad_norm": 0.011899290606379509, + "learning_rate": 1.0267481134954113e-06, + "loss": 0.0, + "num_input_tokens_seen": 31056072, + "step": 15861 + }, + { + "epoch": 2.1023194168323394, + "grad_norm": 2.410548448562622, + "learning_rate": 1.0264676706947166e-06, + "loss": 0.0429, + "num_input_tokens_seen": 31057920, + "step": 15862 + }, + { + "epoch": 2.1024519549370444, + "grad_norm": 0.5909061431884766, + "learning_rate": 1.0261872563047406e-06, + "loss": 0.0039, + "num_input_tokens_seen": 31059768, + "step": 15863 + }, + { + "epoch": 2.1025844930417494, + "grad_norm": 5.635152339935303, + "learning_rate": 1.0259068703308923e-06, + "loss": 0.039, + "num_input_tokens_seen": 31061800, + "step": 15864 + }, + { + "epoch": 2.1027170311464545, + "grad_norm": 12.06176471710205, + "learning_rate": 1.0256265127785762e-06, + "loss": 0.1931, + "num_input_tokens_seen": 31064000, + "step": 15865 + }, + { + "epoch": 2.1028495692511595, + "grad_norm": 1.787699580192566, + "learning_rate": 1.025346183653198e-06, + "loss": 0.008, + "num_input_tokens_seen": 31065752, + "step": 15866 + }, + { + "epoch": 2.102982107355865, + "grad_norm": 0.023606708273291588, + "learning_rate": 1.0250658829601617e-06, + "loss": 0.0002, + "num_input_tokens_seen": 31067616, + "step": 15867 + }, + { + "epoch": 2.10311464546057, + "grad_norm": 12.96444034576416, + "learning_rate": 1.024785610704873e-06, + "loss": 0.1763, + "num_input_tokens_seen": 31069712, + "step": 15868 + }, + { + "epoch": 2.103247183565275, + "grad_norm": 9.074623107910156, + "learning_rate": 1.0245053668927362e-06, + "loss": 0.1813, + "num_input_tokens_seen": 31072392, + "step": 15869 + }, + { + "epoch": 2.10337972166998, + "grad_norm": 0.41853076219558716, + "learning_rate": 1.0242251515291537e-06, + "loss": 0.0022, + "num_input_tokens_seen": 31073896, + "step": 15870 + }, + { + "epoch": 2.103512259774685, + "grad_norm": 0.07969501614570618, + "learning_rate": 1.0239449646195285e-06, + "loss": 0.0004, + "num_input_tokens_seen": 31076264, + "step": 15871 + }, + { + "epoch": 2.10364479787939, + "grad_norm": 8.31026554107666, + "learning_rate": 1.0236648061692616e-06, + "loss": 0.1045, + "num_input_tokens_seen": 31077656, + "step": 15872 + }, + { + "epoch": 2.103777335984095, + "grad_norm": 8.121954917907715, + "learning_rate": 1.0233846761837567e-06, + "loss": 0.0953, + "num_input_tokens_seen": 31079920, + "step": 15873 + }, + { + "epoch": 2.1039098740888007, + "grad_norm": 0.7636157274246216, + "learning_rate": 1.0231045746684133e-06, + "loss": 0.0024, + "num_input_tokens_seen": 31082640, + "step": 15874 + }, + { + "epoch": 2.1040424121935057, + "grad_norm": 10.376847267150879, + "learning_rate": 1.0228245016286331e-06, + "loss": 0.113, + "num_input_tokens_seen": 31085368, + "step": 15875 + }, + { + "epoch": 2.1041749502982108, + "grad_norm": 2.506401777267456, + "learning_rate": 1.0225444570698156e-06, + "loss": 0.0224, + "num_input_tokens_seen": 31087456, + "step": 15876 + }, + { + "epoch": 2.104307488402916, + "grad_norm": 9.249018669128418, + "learning_rate": 1.0222644409973596e-06, + "loss": 0.0401, + "num_input_tokens_seen": 31089272, + "step": 15877 + }, + { + "epoch": 2.104440026507621, + "grad_norm": 4.882899761199951, + "learning_rate": 1.021984453416665e-06, + "loss": 0.0245, + "num_input_tokens_seen": 31091184, + "step": 15878 + }, + { + "epoch": 2.104572564612326, + "grad_norm": 6.868321418762207, + "learning_rate": 1.0217044943331301e-06, + "loss": 0.0598, + "num_input_tokens_seen": 31093072, + "step": 15879 + }, + { + "epoch": 2.1047051027170314, + "grad_norm": 2.662890672683716, + "learning_rate": 1.0214245637521514e-06, + "loss": 0.0146, + "num_input_tokens_seen": 31094528, + "step": 15880 + }, + { + "epoch": 2.1048376408217364, + "grad_norm": 2.5376861095428467, + "learning_rate": 1.0211446616791281e-06, + "loss": 0.0185, + "num_input_tokens_seen": 31097088, + "step": 15881 + }, + { + "epoch": 2.1049701789264414, + "grad_norm": 0.005738845095038414, + "learning_rate": 1.020864788119455e-06, + "loss": 0.0, + "num_input_tokens_seen": 31098992, + "step": 15882 + }, + { + "epoch": 2.1051027170311465, + "grad_norm": 0.010538849979639053, + "learning_rate": 1.02058494307853e-06, + "loss": 0.0001, + "num_input_tokens_seen": 31101856, + "step": 15883 + }, + { + "epoch": 2.1052352551358515, + "grad_norm": 0.040175389498472214, + "learning_rate": 1.020305126561748e-06, + "loss": 0.0002, + "num_input_tokens_seen": 31104856, + "step": 15884 + }, + { + "epoch": 2.1053677932405566, + "grad_norm": 0.065611831843853, + "learning_rate": 1.020025338574504e-06, + "loss": 0.0003, + "num_input_tokens_seen": 31106136, + "step": 15885 + }, + { + "epoch": 2.1055003313452616, + "grad_norm": 0.049114301800727844, + "learning_rate": 1.0197455791221913e-06, + "loss": 0.0004, + "num_input_tokens_seen": 31108328, + "step": 15886 + }, + { + "epoch": 2.105632869449967, + "grad_norm": 3.7536838054656982, + "learning_rate": 1.0194658482102051e-06, + "loss": 0.0462, + "num_input_tokens_seen": 31110456, + "step": 15887 + }, + { + "epoch": 2.105765407554672, + "grad_norm": 7.078662872314453, + "learning_rate": 1.0191861458439397e-06, + "loss": 0.1159, + "num_input_tokens_seen": 31112360, + "step": 15888 + }, + { + "epoch": 2.105897945659377, + "grad_norm": 1.5046520233154297, + "learning_rate": 1.0189064720287871e-06, + "loss": 0.0147, + "num_input_tokens_seen": 31114248, + "step": 15889 + }, + { + "epoch": 2.106030483764082, + "grad_norm": 0.04844164475798607, + "learning_rate": 1.0186268267701393e-06, + "loss": 0.0003, + "num_input_tokens_seen": 31115720, + "step": 15890 + }, + { + "epoch": 2.1061630218687872, + "grad_norm": 3.4674792289733887, + "learning_rate": 1.0183472100733874e-06, + "loss": 0.0072, + "num_input_tokens_seen": 31117232, + "step": 15891 + }, + { + "epoch": 2.1062955599734923, + "grad_norm": 5.048121929168701, + "learning_rate": 1.0180676219439242e-06, + "loss": 0.0588, + "num_input_tokens_seen": 31119160, + "step": 15892 + }, + { + "epoch": 2.1064280980781973, + "grad_norm": 0.27222344279289246, + "learning_rate": 1.0177880623871389e-06, + "loss": 0.002, + "num_input_tokens_seen": 31120824, + "step": 15893 + }, + { + "epoch": 2.106560636182903, + "grad_norm": 9.71750259399414, + "learning_rate": 1.017508531408423e-06, + "loss": 0.1307, + "num_input_tokens_seen": 31122784, + "step": 15894 + }, + { + "epoch": 2.106693174287608, + "grad_norm": 0.01615164987742901, + "learning_rate": 1.0172290290131654e-06, + "loss": 0.0001, + "num_input_tokens_seen": 31124168, + "step": 15895 + }, + { + "epoch": 2.106825712392313, + "grad_norm": 12.569095611572266, + "learning_rate": 1.0169495552067549e-06, + "loss": 0.1598, + "num_input_tokens_seen": 31126464, + "step": 15896 + }, + { + "epoch": 2.106958250497018, + "grad_norm": 0.011246584355831146, + "learning_rate": 1.0166701099945793e-06, + "loss": 0.0001, + "num_input_tokens_seen": 31127872, + "step": 15897 + }, + { + "epoch": 2.107090788601723, + "grad_norm": 0.0025829714722931385, + "learning_rate": 1.0163906933820275e-06, + "loss": 0.0, + "num_input_tokens_seen": 31129064, + "step": 15898 + }, + { + "epoch": 2.107223326706428, + "grad_norm": 0.15759709477424622, + "learning_rate": 1.0161113053744872e-06, + "loss": 0.0005, + "num_input_tokens_seen": 31130464, + "step": 15899 + }, + { + "epoch": 2.107355864811133, + "grad_norm": 0.044281162321567535, + "learning_rate": 1.0158319459773448e-06, + "loss": 0.0003, + "num_input_tokens_seen": 31132808, + "step": 15900 + }, + { + "epoch": 2.1074884029158385, + "grad_norm": 6.325697422027588, + "learning_rate": 1.015552615195986e-06, + "loss": 0.1642, + "num_input_tokens_seen": 31134688, + "step": 15901 + }, + { + "epoch": 2.1076209410205435, + "grad_norm": 0.09797005355358124, + "learning_rate": 1.0152733130357972e-06, + "loss": 0.0004, + "num_input_tokens_seen": 31136432, + "step": 15902 + }, + { + "epoch": 2.1077534791252486, + "grad_norm": 0.2263217568397522, + "learning_rate": 1.0149940395021623e-06, + "loss": 0.0011, + "num_input_tokens_seen": 31138456, + "step": 15903 + }, + { + "epoch": 2.1078860172299536, + "grad_norm": 5.082943916320801, + "learning_rate": 1.0147147946004666e-06, + "loss": 0.0426, + "num_input_tokens_seen": 31140576, + "step": 15904 + }, + { + "epoch": 2.1080185553346587, + "grad_norm": 0.004184241872280836, + "learning_rate": 1.0144355783360953e-06, + "loss": 0.0, + "num_input_tokens_seen": 31142120, + "step": 15905 + }, + { + "epoch": 2.1081510934393637, + "grad_norm": 0.0060673123225569725, + "learning_rate": 1.0141563907144307e-06, + "loss": 0.0, + "num_input_tokens_seen": 31143664, + "step": 15906 + }, + { + "epoch": 2.1082836315440687, + "grad_norm": 7.490222930908203, + "learning_rate": 1.0138772317408563e-06, + "loss": 0.0756, + "num_input_tokens_seen": 31145864, + "step": 15907 + }, + { + "epoch": 2.108416169648774, + "grad_norm": 7.861691951751709, + "learning_rate": 1.013598101420753e-06, + "loss": 0.1636, + "num_input_tokens_seen": 31148160, + "step": 15908 + }, + { + "epoch": 2.1085487077534792, + "grad_norm": 5.4685750007629395, + "learning_rate": 1.0133189997595045e-06, + "loss": 0.0634, + "num_input_tokens_seen": 31149776, + "step": 15909 + }, + { + "epoch": 2.1086812458581843, + "grad_norm": 8.065530776977539, + "learning_rate": 1.0130399267624905e-06, + "loss": 0.0376, + "num_input_tokens_seen": 31151288, + "step": 15910 + }, + { + "epoch": 2.1088137839628893, + "grad_norm": 6.5656890869140625, + "learning_rate": 1.0127608824350936e-06, + "loss": 0.0458, + "num_input_tokens_seen": 31153056, + "step": 15911 + }, + { + "epoch": 2.1089463220675944, + "grad_norm": 10.338579177856445, + "learning_rate": 1.0124818667826924e-06, + "loss": 0.1058, + "num_input_tokens_seen": 31155616, + "step": 15912 + }, + { + "epoch": 2.1090788601722994, + "grad_norm": 0.01501214038580656, + "learning_rate": 1.0122028798106664e-06, + "loss": 0.0001, + "num_input_tokens_seen": 31157160, + "step": 15913 + }, + { + "epoch": 2.1092113982770044, + "grad_norm": 0.12210843712091446, + "learning_rate": 1.0119239215243963e-06, + "loss": 0.0008, + "num_input_tokens_seen": 31158744, + "step": 15914 + }, + { + "epoch": 2.10934393638171, + "grad_norm": 2.61883544921875, + "learning_rate": 1.0116449919292593e-06, + "loss": 0.0112, + "num_input_tokens_seen": 31161656, + "step": 15915 + }, + { + "epoch": 2.109476474486415, + "grad_norm": 0.305880606174469, + "learning_rate": 1.0113660910306328e-06, + "loss": 0.0013, + "num_input_tokens_seen": 31163680, + "step": 15916 + }, + { + "epoch": 2.10960901259112, + "grad_norm": 3.902306079864502, + "learning_rate": 1.011087218833896e-06, + "loss": 0.0667, + "num_input_tokens_seen": 31165960, + "step": 15917 + }, + { + "epoch": 2.109741550695825, + "grad_norm": 0.007223166991025209, + "learning_rate": 1.0108083753444236e-06, + "loss": 0.0, + "num_input_tokens_seen": 31167888, + "step": 15918 + }, + { + "epoch": 2.10987408880053, + "grad_norm": 3.985076904296875, + "learning_rate": 1.0105295605675944e-06, + "loss": 0.055, + "num_input_tokens_seen": 31169528, + "step": 15919 + }, + { + "epoch": 2.110006626905235, + "grad_norm": 0.010819315910339355, + "learning_rate": 1.0102507745087824e-06, + "loss": 0.0001, + "num_input_tokens_seen": 31171032, + "step": 15920 + }, + { + "epoch": 2.11013916500994, + "grad_norm": 0.2970164120197296, + "learning_rate": 1.0099720171733625e-06, + "loss": 0.0014, + "num_input_tokens_seen": 31173288, + "step": 15921 + }, + { + "epoch": 2.1102717031146456, + "grad_norm": 1.3870792388916016, + "learning_rate": 1.009693288566711e-06, + "loss": 0.0072, + "num_input_tokens_seen": 31175504, + "step": 15922 + }, + { + "epoch": 2.1104042412193507, + "grad_norm": 3.8207406997680664, + "learning_rate": 1.0094145886942009e-06, + "loss": 0.0414, + "num_input_tokens_seen": 31177416, + "step": 15923 + }, + { + "epoch": 2.1105367793240557, + "grad_norm": 0.01220555417239666, + "learning_rate": 1.0091359175612048e-06, + "loss": 0.0001, + "num_input_tokens_seen": 31178704, + "step": 15924 + }, + { + "epoch": 2.1106693174287607, + "grad_norm": 5.103734970092773, + "learning_rate": 1.0088572751730979e-06, + "loss": 0.1156, + "num_input_tokens_seen": 31180560, + "step": 15925 + }, + { + "epoch": 2.1108018555334658, + "grad_norm": 2.618957042694092, + "learning_rate": 1.008578661535251e-06, + "loss": 0.0129, + "num_input_tokens_seen": 31182320, + "step": 15926 + }, + { + "epoch": 2.110934393638171, + "grad_norm": 0.3424279987812042, + "learning_rate": 1.008300076653036e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31184304, + "step": 15927 + }, + { + "epoch": 2.111066931742876, + "grad_norm": 4.514231204986572, + "learning_rate": 1.0080215205318254e-06, + "loss": 0.0777, + "num_input_tokens_seen": 31187208, + "step": 15928 + }, + { + "epoch": 2.1111994698475813, + "grad_norm": 15.18362045288086, + "learning_rate": 1.0077429931769879e-06, + "loss": 0.0963, + "num_input_tokens_seen": 31188136, + "step": 15929 + }, + { + "epoch": 2.1113320079522864, + "grad_norm": 0.2735540270805359, + "learning_rate": 1.0074644945938961e-06, + "loss": 0.0008, + "num_input_tokens_seen": 31189872, + "step": 15930 + }, + { + "epoch": 2.1114645460569914, + "grad_norm": 4.074028968811035, + "learning_rate": 1.0071860247879183e-06, + "loss": 0.1265, + "num_input_tokens_seen": 31191560, + "step": 15931 + }, + { + "epoch": 2.1115970841616964, + "grad_norm": 8.576210975646973, + "learning_rate": 1.0069075837644239e-06, + "loss": 0.1008, + "num_input_tokens_seen": 31193600, + "step": 15932 + }, + { + "epoch": 2.1117296222664015, + "grad_norm": 5.452406406402588, + "learning_rate": 1.0066291715287805e-06, + "loss": 0.0579, + "num_input_tokens_seen": 31196304, + "step": 15933 + }, + { + "epoch": 2.1118621603711065, + "grad_norm": 0.02765427902340889, + "learning_rate": 1.006350788086357e-06, + "loss": 0.0003, + "num_input_tokens_seen": 31198392, + "step": 15934 + }, + { + "epoch": 2.111994698475812, + "grad_norm": 8.22218132019043, + "learning_rate": 1.0060724334425212e-06, + "loss": 0.0574, + "num_input_tokens_seen": 31200320, + "step": 15935 + }, + { + "epoch": 2.112127236580517, + "grad_norm": 1.6232014894485474, + "learning_rate": 1.0057941076026399e-06, + "loss": 0.009, + "num_input_tokens_seen": 31203344, + "step": 15936 + }, + { + "epoch": 2.112259774685222, + "grad_norm": 0.7143456935882568, + "learning_rate": 1.0055158105720785e-06, + "loss": 0.004, + "num_input_tokens_seen": 31205064, + "step": 15937 + }, + { + "epoch": 2.112392312789927, + "grad_norm": 2.280484199523926, + "learning_rate": 1.0052375423562038e-06, + "loss": 0.0142, + "num_input_tokens_seen": 31207080, + "step": 15938 + }, + { + "epoch": 2.112524850894632, + "grad_norm": 0.03964947536587715, + "learning_rate": 1.0049593029603794e-06, + "loss": 0.0004, + "num_input_tokens_seen": 31208808, + "step": 15939 + }, + { + "epoch": 2.112657388999337, + "grad_norm": 0.06965214759111404, + "learning_rate": 1.0046810923899707e-06, + "loss": 0.0003, + "num_input_tokens_seen": 31210128, + "step": 15940 + }, + { + "epoch": 2.1127899271040422, + "grad_norm": 1.1292997598648071, + "learning_rate": 1.0044029106503437e-06, + "loss": 0.0075, + "num_input_tokens_seen": 31212664, + "step": 15941 + }, + { + "epoch": 2.1129224652087477, + "grad_norm": 3.159147262573242, + "learning_rate": 1.0041247577468596e-06, + "loss": 0.0537, + "num_input_tokens_seen": 31215080, + "step": 15942 + }, + { + "epoch": 2.1130550033134528, + "grad_norm": 7.49318790435791, + "learning_rate": 1.0038466336848826e-06, + "loss": 0.1755, + "num_input_tokens_seen": 31217272, + "step": 15943 + }, + { + "epoch": 2.113187541418158, + "grad_norm": 0.04262695461511612, + "learning_rate": 1.0035685384697736e-06, + "loss": 0.0002, + "num_input_tokens_seen": 31218632, + "step": 15944 + }, + { + "epoch": 2.113320079522863, + "grad_norm": 5.605983734130859, + "learning_rate": 1.0032904721068953e-06, + "loss": 0.0864, + "num_input_tokens_seen": 31220264, + "step": 15945 + }, + { + "epoch": 2.113452617627568, + "grad_norm": 3.892982006072998, + "learning_rate": 1.0030124346016104e-06, + "loss": 0.023, + "num_input_tokens_seen": 31223744, + "step": 15946 + }, + { + "epoch": 2.113585155732273, + "grad_norm": 0.2520269751548767, + "learning_rate": 1.0027344259592786e-06, + "loss": 0.0009, + "num_input_tokens_seen": 31225840, + "step": 15947 + }, + { + "epoch": 2.113717693836978, + "grad_norm": 3.2943215370178223, + "learning_rate": 1.0024564461852598e-06, + "loss": 0.0315, + "num_input_tokens_seen": 31227520, + "step": 15948 + }, + { + "epoch": 2.1138502319416834, + "grad_norm": 10.864590644836426, + "learning_rate": 1.0021784952849137e-06, + "loss": 0.155, + "num_input_tokens_seen": 31228864, + "step": 15949 + }, + { + "epoch": 2.1139827700463885, + "grad_norm": 1.5398967266082764, + "learning_rate": 1.0019005732635987e-06, + "loss": 0.0094, + "num_input_tokens_seen": 31230448, + "step": 15950 + }, + { + "epoch": 2.1141153081510935, + "grad_norm": 12.04830551147461, + "learning_rate": 1.0016226801266744e-06, + "loss": 0.2083, + "num_input_tokens_seen": 31233456, + "step": 15951 + }, + { + "epoch": 2.1142478462557985, + "grad_norm": 0.008589185774326324, + "learning_rate": 1.001344815879499e-06, + "loss": 0.0001, + "num_input_tokens_seen": 31234648, + "step": 15952 + }, + { + "epoch": 2.1143803843605036, + "grad_norm": 1.2209203243255615, + "learning_rate": 1.0010669805274295e-06, + "loss": 0.0027, + "num_input_tokens_seen": 31236400, + "step": 15953 + }, + { + "epoch": 2.1145129224652086, + "grad_norm": 0.07187455892562866, + "learning_rate": 1.0007891740758227e-06, + "loss": 0.0005, + "num_input_tokens_seen": 31238224, + "step": 15954 + }, + { + "epoch": 2.1146454605699136, + "grad_norm": 0.7243847250938416, + "learning_rate": 1.0005113965300343e-06, + "loss": 0.0043, + "num_input_tokens_seen": 31240200, + "step": 15955 + }, + { + "epoch": 2.114777998674619, + "grad_norm": 0.3958112895488739, + "learning_rate": 1.0002336478954211e-06, + "loss": 0.0009, + "num_input_tokens_seen": 31242016, + "step": 15956 + }, + { + "epoch": 2.114910536779324, + "grad_norm": 1.3609455823898315, + "learning_rate": 9.99955928177337e-07, + "loss": 0.0191, + "num_input_tokens_seen": 31244464, + "step": 15957 + }, + { + "epoch": 2.115043074884029, + "grad_norm": 2.805771827697754, + "learning_rate": 9.996782373811384e-07, + "loss": 0.0157, + "num_input_tokens_seen": 31246024, + "step": 15958 + }, + { + "epoch": 2.1151756129887342, + "grad_norm": 1.5725172758102417, + "learning_rate": 9.994005755121784e-07, + "loss": 0.0156, + "num_input_tokens_seen": 31247872, + "step": 15959 + }, + { + "epoch": 2.1153081510934393, + "grad_norm": 0.12252626568078995, + "learning_rate": 9.991229425758095e-07, + "loss": 0.0009, + "num_input_tokens_seen": 31249864, + "step": 15960 + }, + { + "epoch": 2.1154406891981443, + "grad_norm": 0.005042593460530043, + "learning_rate": 9.988453385773866e-07, + "loss": 0.0, + "num_input_tokens_seen": 31251632, + "step": 15961 + }, + { + "epoch": 2.1155732273028494, + "grad_norm": 0.13577313721179962, + "learning_rate": 9.985677635222612e-07, + "loss": 0.0006, + "num_input_tokens_seen": 31254840, + "step": 15962 + }, + { + "epoch": 2.115705765407555, + "grad_norm": 1.2255592346191406, + "learning_rate": 9.982902174157844e-07, + "loss": 0.0101, + "num_input_tokens_seen": 31256480, + "step": 15963 + }, + { + "epoch": 2.11583830351226, + "grad_norm": 3.4641661643981934, + "learning_rate": 9.980127002633088e-07, + "loss": 0.0094, + "num_input_tokens_seen": 31258584, + "step": 15964 + }, + { + "epoch": 2.115970841616965, + "grad_norm": 1.2744196653366089, + "learning_rate": 9.97735212070184e-07, + "loss": 0.007, + "num_input_tokens_seen": 31261520, + "step": 15965 + }, + { + "epoch": 2.11610337972167, + "grad_norm": 8.298657417297363, + "learning_rate": 9.974577528417613e-07, + "loss": 0.081, + "num_input_tokens_seen": 31263224, + "step": 15966 + }, + { + "epoch": 2.116235917826375, + "grad_norm": 5.913610458374023, + "learning_rate": 9.971803225833894e-07, + "loss": 0.1112, + "num_input_tokens_seen": 31266648, + "step": 15967 + }, + { + "epoch": 2.11636845593108, + "grad_norm": 0.01634967513382435, + "learning_rate": 9.96902921300418e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31269184, + "step": 15968 + }, + { + "epoch": 2.1165009940357855, + "grad_norm": 5.69858455657959, + "learning_rate": 9.966255489981943e-07, + "loss": 0.1304, + "num_input_tokens_seen": 31270872, + "step": 15969 + }, + { + "epoch": 2.1166335321404905, + "grad_norm": 8.893097877502441, + "learning_rate": 9.963482056820669e-07, + "loss": 0.1104, + "num_input_tokens_seen": 31273080, + "step": 15970 + }, + { + "epoch": 2.1167660702451956, + "grad_norm": 3.0060222148895264, + "learning_rate": 9.960708913573846e-07, + "loss": 0.0116, + "num_input_tokens_seen": 31274496, + "step": 15971 + }, + { + "epoch": 2.1168986083499006, + "grad_norm": 3.0118281841278076, + "learning_rate": 9.957936060294928e-07, + "loss": 0.0218, + "num_input_tokens_seen": 31276672, + "step": 15972 + }, + { + "epoch": 2.1170311464546057, + "grad_norm": 3.4252753257751465, + "learning_rate": 9.955163497037382e-07, + "loss": 0.0354, + "num_input_tokens_seen": 31278472, + "step": 15973 + }, + { + "epoch": 2.1171636845593107, + "grad_norm": 4.892392158508301, + "learning_rate": 9.952391223854653e-07, + "loss": 0.0609, + "num_input_tokens_seen": 31280056, + "step": 15974 + }, + { + "epoch": 2.1172962226640157, + "grad_norm": 21.116289138793945, + "learning_rate": 9.949619240800198e-07, + "loss": 0.1467, + "num_input_tokens_seen": 31281928, + "step": 15975 + }, + { + "epoch": 2.117428760768721, + "grad_norm": 0.01190525945276022, + "learning_rate": 9.94684754792748e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31283376, + "step": 15976 + }, + { + "epoch": 2.1175612988734263, + "grad_norm": 3.9007999897003174, + "learning_rate": 9.944076145289925e-07, + "loss": 0.0598, + "num_input_tokens_seen": 31285232, + "step": 15977 + }, + { + "epoch": 2.1176938369781313, + "grad_norm": 0.11487153172492981, + "learning_rate": 9.941305032940968e-07, + "loss": 0.0007, + "num_input_tokens_seen": 31286440, + "step": 15978 + }, + { + "epoch": 2.1178263750828363, + "grad_norm": 0.6625118851661682, + "learning_rate": 9.938534210934038e-07, + "loss": 0.0047, + "num_input_tokens_seen": 31287760, + "step": 15979 + }, + { + "epoch": 2.1179589131875414, + "grad_norm": 18.259695053100586, + "learning_rate": 9.935763679322553e-07, + "loss": 0.2927, + "num_input_tokens_seen": 31289680, + "step": 15980 + }, + { + "epoch": 2.1180914512922464, + "grad_norm": 18.65428924560547, + "learning_rate": 9.932993438159931e-07, + "loss": 0.0817, + "num_input_tokens_seen": 31291560, + "step": 15981 + }, + { + "epoch": 2.1182239893969514, + "grad_norm": 1.9163814783096313, + "learning_rate": 9.930223487499605e-07, + "loss": 0.0079, + "num_input_tokens_seen": 31293960, + "step": 15982 + }, + { + "epoch": 2.118356527501657, + "grad_norm": 2.2967591285705566, + "learning_rate": 9.92745382739496e-07, + "loss": 0.0155, + "num_input_tokens_seen": 31296736, + "step": 15983 + }, + { + "epoch": 2.118489065606362, + "grad_norm": 0.02329343557357788, + "learning_rate": 9.924684457899408e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31298504, + "step": 15984 + }, + { + "epoch": 2.118621603711067, + "grad_norm": 0.04614463821053505, + "learning_rate": 9.92191537906634e-07, + "loss": 0.0003, + "num_input_tokens_seen": 31300224, + "step": 15985 + }, + { + "epoch": 2.118754141815772, + "grad_norm": 3.9793007373809814, + "learning_rate": 9.919146590949133e-07, + "loss": 0.0707, + "num_input_tokens_seen": 31301880, + "step": 15986 + }, + { + "epoch": 2.118886679920477, + "grad_norm": 0.554431140422821, + "learning_rate": 9.916378093601184e-07, + "loss": 0.0022, + "num_input_tokens_seen": 31303712, + "step": 15987 + }, + { + "epoch": 2.119019218025182, + "grad_norm": 5.004972457885742, + "learning_rate": 9.913609887075881e-07, + "loss": 0.1117, + "num_input_tokens_seen": 31306464, + "step": 15988 + }, + { + "epoch": 2.119151756129887, + "grad_norm": 12.363912582397461, + "learning_rate": 9.910841971426589e-07, + "loss": 0.2642, + "num_input_tokens_seen": 31309136, + "step": 15989 + }, + { + "epoch": 2.1192842942345926, + "grad_norm": 2.2543981075286865, + "learning_rate": 9.90807434670667e-07, + "loss": 0.0288, + "num_input_tokens_seen": 31311288, + "step": 15990 + }, + { + "epoch": 2.1194168323392977, + "grad_norm": 2.8324477672576904, + "learning_rate": 9.905307012969481e-07, + "loss": 0.0156, + "num_input_tokens_seen": 31313904, + "step": 15991 + }, + { + "epoch": 2.1195493704440027, + "grad_norm": 0.549984335899353, + "learning_rate": 9.902539970268393e-07, + "loss": 0.0048, + "num_input_tokens_seen": 31315568, + "step": 15992 + }, + { + "epoch": 2.1196819085487077, + "grad_norm": 0.00872162077575922, + "learning_rate": 9.899773218656745e-07, + "loss": 0.0, + "num_input_tokens_seen": 31317144, + "step": 15993 + }, + { + "epoch": 2.119814446653413, + "grad_norm": 7.813802242279053, + "learning_rate": 9.89700675818789e-07, + "loss": 0.1264, + "num_input_tokens_seen": 31318880, + "step": 15994 + }, + { + "epoch": 2.119946984758118, + "grad_norm": 2.485776662826538, + "learning_rate": 9.894240588915168e-07, + "loss": 0.033, + "num_input_tokens_seen": 31320832, + "step": 15995 + }, + { + "epoch": 2.120079522862823, + "grad_norm": 0.8729603886604309, + "learning_rate": 9.8914747108919e-07, + "loss": 0.0046, + "num_input_tokens_seen": 31322296, + "step": 15996 + }, + { + "epoch": 2.1202120609675283, + "grad_norm": 1.4836153984069824, + "learning_rate": 9.888709124171429e-07, + "loss": 0.0095, + "num_input_tokens_seen": 31324000, + "step": 15997 + }, + { + "epoch": 2.1203445990722334, + "grad_norm": 0.16681288182735443, + "learning_rate": 9.885943828807062e-07, + "loss": 0.0004, + "num_input_tokens_seen": 31325640, + "step": 15998 + }, + { + "epoch": 2.1204771371769384, + "grad_norm": 7.054441452026367, + "learning_rate": 9.883178824852133e-07, + "loss": 0.0537, + "num_input_tokens_seen": 31326896, + "step": 15999 + }, + { + "epoch": 2.1206096752816435, + "grad_norm": 4.794320583343506, + "learning_rate": 9.880414112359945e-07, + "loss": 0.0548, + "num_input_tokens_seen": 31328744, + "step": 16000 + }, + { + "epoch": 2.1207422133863485, + "grad_norm": 0.05177433416247368, + "learning_rate": 9.877649691383797e-07, + "loss": 0.0003, + "num_input_tokens_seen": 31330512, + "step": 16001 + }, + { + "epoch": 2.1208747514910535, + "grad_norm": 1.4180458784103394, + "learning_rate": 9.874885561977e-07, + "loss": 0.0101, + "num_input_tokens_seen": 31333200, + "step": 16002 + }, + { + "epoch": 2.1210072895957586, + "grad_norm": 0.0033680996857583523, + "learning_rate": 9.872121724192849e-07, + "loss": 0.0, + "num_input_tokens_seen": 31334888, + "step": 16003 + }, + { + "epoch": 2.121139827700464, + "grad_norm": 3.0794763565063477, + "learning_rate": 9.869358178084614e-07, + "loss": 0.0148, + "num_input_tokens_seen": 31336512, + "step": 16004 + }, + { + "epoch": 2.121272365805169, + "grad_norm": 0.07193078100681305, + "learning_rate": 9.866594923705602e-07, + "loss": 0.0004, + "num_input_tokens_seen": 31339328, + "step": 16005 + }, + { + "epoch": 2.121404903909874, + "grad_norm": 0.027965696528553963, + "learning_rate": 9.86383196110908e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31341512, + "step": 16006 + }, + { + "epoch": 2.121537442014579, + "grad_norm": 6.011144638061523, + "learning_rate": 9.861069290348313e-07, + "loss": 0.0635, + "num_input_tokens_seen": 31342960, + "step": 16007 + }, + { + "epoch": 2.121669980119284, + "grad_norm": 9.07699966430664, + "learning_rate": 9.858306911476577e-07, + "loss": 0.1208, + "num_input_tokens_seen": 31346008, + "step": 16008 + }, + { + "epoch": 2.1218025182239892, + "grad_norm": 17.39802360534668, + "learning_rate": 9.855544824547133e-07, + "loss": 0.1293, + "num_input_tokens_seen": 31348080, + "step": 16009 + }, + { + "epoch": 2.1219350563286943, + "grad_norm": 5.495151519775391, + "learning_rate": 9.852783029613224e-07, + "loss": 0.0322, + "num_input_tokens_seen": 31350000, + "step": 16010 + }, + { + "epoch": 2.1220675944333998, + "grad_norm": 3.1166462898254395, + "learning_rate": 9.850021526728119e-07, + "loss": 0.0201, + "num_input_tokens_seen": 31351936, + "step": 16011 + }, + { + "epoch": 2.122200132538105, + "grad_norm": 0.42338091135025024, + "learning_rate": 9.847260315945038e-07, + "loss": 0.0023, + "num_input_tokens_seen": 31355128, + "step": 16012 + }, + { + "epoch": 2.12233267064281, + "grad_norm": 0.005450528580695391, + "learning_rate": 9.844499397317239e-07, + "loss": 0.0, + "num_input_tokens_seen": 31356600, + "step": 16013 + }, + { + "epoch": 2.122465208747515, + "grad_norm": 5.787471771240234, + "learning_rate": 9.841738770897948e-07, + "loss": 0.0883, + "num_input_tokens_seen": 31358824, + "step": 16014 + }, + { + "epoch": 2.12259774685222, + "grad_norm": 10.324423789978027, + "learning_rate": 9.838978436740393e-07, + "loss": 0.1273, + "num_input_tokens_seen": 31361000, + "step": 16015 + }, + { + "epoch": 2.122730284956925, + "grad_norm": 0.052438125014305115, + "learning_rate": 9.83621839489778e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31362840, + "step": 16016 + }, + { + "epoch": 2.1228628230616304, + "grad_norm": 0.0914560854434967, + "learning_rate": 9.833458645423341e-07, + "loss": 0.0004, + "num_input_tokens_seen": 31364752, + "step": 16017 + }, + { + "epoch": 2.1229953611663355, + "grad_norm": 11.242875099182129, + "learning_rate": 9.830699188370292e-07, + "loss": 0.1178, + "num_input_tokens_seen": 31366752, + "step": 16018 + }, + { + "epoch": 2.1231278992710405, + "grad_norm": 0.5296412706375122, + "learning_rate": 9.827940023791823e-07, + "loss": 0.0024, + "num_input_tokens_seen": 31369016, + "step": 16019 + }, + { + "epoch": 2.1232604373757455, + "grad_norm": 4.376650810241699, + "learning_rate": 9.825181151741138e-07, + "loss": 0.02, + "num_input_tokens_seen": 31372000, + "step": 16020 + }, + { + "epoch": 2.1233929754804506, + "grad_norm": 0.0015726073179394007, + "learning_rate": 9.822422572271422e-07, + "loss": 0.0, + "num_input_tokens_seen": 31373640, + "step": 16021 + }, + { + "epoch": 2.1235255135851556, + "grad_norm": 1.7273216247558594, + "learning_rate": 9.81966428543588e-07, + "loss": 0.0129, + "num_input_tokens_seen": 31375480, + "step": 16022 + }, + { + "epoch": 2.1236580516898607, + "grad_norm": 0.04559807851910591, + "learning_rate": 9.81690629128767e-07, + "loss": 0.0004, + "num_input_tokens_seen": 31376720, + "step": 16023 + }, + { + "epoch": 2.123790589794566, + "grad_norm": 4.228126525878906, + "learning_rate": 9.81414858987999e-07, + "loss": 0.0361, + "num_input_tokens_seen": 31378512, + "step": 16024 + }, + { + "epoch": 2.123923127899271, + "grad_norm": 0.025753268972039223, + "learning_rate": 9.811391181266003e-07, + "loss": 0.0002, + "num_input_tokens_seen": 31381368, + "step": 16025 + }, + { + "epoch": 2.124055666003976, + "grad_norm": 7.938784122467041, + "learning_rate": 9.80863406549887e-07, + "loss": 0.1007, + "num_input_tokens_seen": 31383648, + "step": 16026 + }, + { + "epoch": 2.1241882041086813, + "grad_norm": 0.003898393828421831, + "learning_rate": 9.805877242631745e-07, + "loss": 0.0, + "num_input_tokens_seen": 31386144, + "step": 16027 + }, + { + "epoch": 2.1243207422133863, + "grad_norm": 0.2597859799861908, + "learning_rate": 9.80312071271779e-07, + "loss": 0.0013, + "num_input_tokens_seen": 31388736, + "step": 16028 + }, + { + "epoch": 2.1244532803180913, + "grad_norm": 0.004055833909660578, + "learning_rate": 9.800364475810158e-07, + "loss": 0.0, + "num_input_tokens_seen": 31391560, + "step": 16029 + }, + { + "epoch": 2.1245858184227964, + "grad_norm": 0.02276485599577427, + "learning_rate": 9.797608531961986e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31392736, + "step": 16030 + }, + { + "epoch": 2.124718356527502, + "grad_norm": 0.012399180792272091, + "learning_rate": 9.79485288122641e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31394608, + "step": 16031 + }, + { + "epoch": 2.124850894632207, + "grad_norm": 1.0706788301467896, + "learning_rate": 9.79209752365655e-07, + "loss": 0.0049, + "num_input_tokens_seen": 31396424, + "step": 16032 + }, + { + "epoch": 2.124983432736912, + "grad_norm": 15.55600643157959, + "learning_rate": 9.789342459305547e-07, + "loss": 0.0434, + "num_input_tokens_seen": 31398288, + "step": 16033 + }, + { + "epoch": 2.125115970841617, + "grad_norm": 0.31384390592575073, + "learning_rate": 9.786587688226509e-07, + "loss": 0.0009, + "num_input_tokens_seen": 31400048, + "step": 16034 + }, + { + "epoch": 2.125248508946322, + "grad_norm": 2.1488723754882812, + "learning_rate": 9.783833210472566e-07, + "loss": 0.0171, + "num_input_tokens_seen": 31402672, + "step": 16035 + }, + { + "epoch": 2.125381047051027, + "grad_norm": 1.6600793600082397, + "learning_rate": 9.781079026096813e-07, + "loss": 0.0088, + "num_input_tokens_seen": 31406328, + "step": 16036 + }, + { + "epoch": 2.125513585155732, + "grad_norm": 9.093141555786133, + "learning_rate": 9.778325135152356e-07, + "loss": 0.0366, + "num_input_tokens_seen": 31408984, + "step": 16037 + }, + { + "epoch": 2.1256461232604376, + "grad_norm": 7.948393821716309, + "learning_rate": 9.775571537692282e-07, + "loss": 0.0802, + "num_input_tokens_seen": 31410848, + "step": 16038 + }, + { + "epoch": 2.1257786613651426, + "grad_norm": 0.02403217926621437, + "learning_rate": 9.7728182337697e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31412928, + "step": 16039 + }, + { + "epoch": 2.1259111994698476, + "grad_norm": 5.993346214294434, + "learning_rate": 9.77006522343768e-07, + "loss": 0.0818, + "num_input_tokens_seen": 31414568, + "step": 16040 + }, + { + "epoch": 2.1260437375745527, + "grad_norm": 0.8455209136009216, + "learning_rate": 9.76731250674932e-07, + "loss": 0.0045, + "num_input_tokens_seen": 31416680, + "step": 16041 + }, + { + "epoch": 2.1261762756792577, + "grad_norm": 3.664370059967041, + "learning_rate": 9.76456008375768e-07, + "loss": 0.0114, + "num_input_tokens_seen": 31418768, + "step": 16042 + }, + { + "epoch": 2.1263088137839627, + "grad_norm": 2.9327104091644287, + "learning_rate": 9.761807954515822e-07, + "loss": 0.0336, + "num_input_tokens_seen": 31421104, + "step": 16043 + }, + { + "epoch": 2.126441351888668, + "grad_norm": 1.1890151500701904, + "learning_rate": 9.759056119076828e-07, + "loss": 0.0064, + "num_input_tokens_seen": 31422896, + "step": 16044 + }, + { + "epoch": 2.1265738899933733, + "grad_norm": 9.620471000671387, + "learning_rate": 9.75630457749375e-07, + "loss": 0.0794, + "num_input_tokens_seen": 31423800, + "step": 16045 + }, + { + "epoch": 2.1267064280980783, + "grad_norm": 0.2077944129705429, + "learning_rate": 9.753553329819625e-07, + "loss": 0.0005, + "num_input_tokens_seen": 31425248, + "step": 16046 + }, + { + "epoch": 2.1268389662027833, + "grad_norm": 2.230100154876709, + "learning_rate": 9.750802376107518e-07, + "loss": 0.0072, + "num_input_tokens_seen": 31426896, + "step": 16047 + }, + { + "epoch": 2.1269715043074884, + "grad_norm": 0.0026732729747891426, + "learning_rate": 9.748051716410453e-07, + "loss": 0.0, + "num_input_tokens_seen": 31428568, + "step": 16048 + }, + { + "epoch": 2.1271040424121934, + "grad_norm": 16.94239616394043, + "learning_rate": 9.74530135078148e-07, + "loss": 0.2333, + "num_input_tokens_seen": 31430664, + "step": 16049 + }, + { + "epoch": 2.1272365805168985, + "grad_norm": 5.136758327484131, + "learning_rate": 9.742551279273625e-07, + "loss": 0.1026, + "num_input_tokens_seen": 31433024, + "step": 16050 + }, + { + "epoch": 2.127369118621604, + "grad_norm": 0.01774507761001587, + "learning_rate": 9.73980150193989e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31434856, + "step": 16051 + }, + { + "epoch": 2.127501656726309, + "grad_norm": 8.356134414672852, + "learning_rate": 9.737052018833326e-07, + "loss": 0.1812, + "num_input_tokens_seen": 31437320, + "step": 16052 + }, + { + "epoch": 2.127634194831014, + "grad_norm": 7.8876800537109375, + "learning_rate": 9.734302830006917e-07, + "loss": 0.1496, + "num_input_tokens_seen": 31439160, + "step": 16053 + }, + { + "epoch": 2.127766732935719, + "grad_norm": 0.06373758614063263, + "learning_rate": 9.731553935513688e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31440584, + "step": 16054 + }, + { + "epoch": 2.127899271040424, + "grad_norm": 0.20476488769054413, + "learning_rate": 9.728805335406635e-07, + "loss": 0.0012, + "num_input_tokens_seen": 31442464, + "step": 16055 + }, + { + "epoch": 2.128031809145129, + "grad_norm": 0.143484428524971, + "learning_rate": 9.726057029738748e-07, + "loss": 0.0007, + "num_input_tokens_seen": 31444384, + "step": 16056 + }, + { + "epoch": 2.128164347249834, + "grad_norm": 0.20091512799263, + "learning_rate": 9.72330901856301e-07, + "loss": 0.0008, + "num_input_tokens_seen": 31446272, + "step": 16057 + }, + { + "epoch": 2.1282968853545396, + "grad_norm": 0.10682906955480576, + "learning_rate": 9.720561301932413e-07, + "loss": 0.0006, + "num_input_tokens_seen": 31448248, + "step": 16058 + }, + { + "epoch": 2.1284294234592447, + "grad_norm": 5.417721748352051, + "learning_rate": 9.717813879899946e-07, + "loss": 0.0129, + "num_input_tokens_seen": 31450112, + "step": 16059 + }, + { + "epoch": 2.1285619615639497, + "grad_norm": 9.350624084472656, + "learning_rate": 9.715066752518567e-07, + "loss": 0.029, + "num_input_tokens_seen": 31452304, + "step": 16060 + }, + { + "epoch": 2.1286944996686548, + "grad_norm": 10.480183601379395, + "learning_rate": 9.712319919841246e-07, + "loss": 0.0676, + "num_input_tokens_seen": 31454384, + "step": 16061 + }, + { + "epoch": 2.12882703777336, + "grad_norm": 0.050779614597558975, + "learning_rate": 9.709573381920945e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31455936, + "step": 16062 + }, + { + "epoch": 2.128959575878065, + "grad_norm": 0.06771764904260635, + "learning_rate": 9.706827138810608e-07, + "loss": 0.0003, + "num_input_tokens_seen": 31458576, + "step": 16063 + }, + { + "epoch": 2.12909211398277, + "grad_norm": 0.0005205015768297017, + "learning_rate": 9.704081190563192e-07, + "loss": 0.0, + "num_input_tokens_seen": 31459744, + "step": 16064 + }, + { + "epoch": 2.1292246520874754, + "grad_norm": 8.332296371459961, + "learning_rate": 9.701335537231655e-07, + "loss": 0.155, + "num_input_tokens_seen": 31461928, + "step": 16065 + }, + { + "epoch": 2.1293571901921804, + "grad_norm": 0.04889029264450073, + "learning_rate": 9.698590178868919e-07, + "loss": 0.0004, + "num_input_tokens_seen": 31463544, + "step": 16066 + }, + { + "epoch": 2.1294897282968854, + "grad_norm": 7.734241008758545, + "learning_rate": 9.695845115527924e-07, + "loss": 0.0321, + "num_input_tokens_seen": 31466328, + "step": 16067 + }, + { + "epoch": 2.1296222664015905, + "grad_norm": 0.019083110615611076, + "learning_rate": 9.693100347261592e-07, + "loss": 0.0, + "num_input_tokens_seen": 31467624, + "step": 16068 + }, + { + "epoch": 2.1297548045062955, + "grad_norm": 0.463288813829422, + "learning_rate": 9.690355874122833e-07, + "loss": 0.0015, + "num_input_tokens_seen": 31469360, + "step": 16069 + }, + { + "epoch": 2.1298873426110005, + "grad_norm": 2.1234171390533447, + "learning_rate": 9.687611696164579e-07, + "loss": 0.0048, + "num_input_tokens_seen": 31471312, + "step": 16070 + }, + { + "epoch": 2.1300198807157056, + "grad_norm": 2.0269062519073486, + "learning_rate": 9.68486781343974e-07, + "loss": 0.0138, + "num_input_tokens_seen": 31473232, + "step": 16071 + }, + { + "epoch": 2.130152418820411, + "grad_norm": 0.0676170065999031, + "learning_rate": 9.682124226001216e-07, + "loss": 0.0003, + "num_input_tokens_seen": 31475432, + "step": 16072 + }, + { + "epoch": 2.130284956925116, + "grad_norm": 0.0007454505539499223, + "learning_rate": 9.679380933901905e-07, + "loss": 0.0, + "num_input_tokens_seen": 31476696, + "step": 16073 + }, + { + "epoch": 2.130417495029821, + "grad_norm": 10.975934982299805, + "learning_rate": 9.67663793719469e-07, + "loss": 0.3094, + "num_input_tokens_seen": 31479936, + "step": 16074 + }, + { + "epoch": 2.130550033134526, + "grad_norm": 5.366577625274658, + "learning_rate": 9.673895235932465e-07, + "loss": 0.0135, + "num_input_tokens_seen": 31481816, + "step": 16075 + }, + { + "epoch": 2.130682571239231, + "grad_norm": 0.029743866994976997, + "learning_rate": 9.671152830168124e-07, + "loss": 0.0002, + "num_input_tokens_seen": 31483224, + "step": 16076 + }, + { + "epoch": 2.1308151093439363, + "grad_norm": 0.019760040566325188, + "learning_rate": 9.66841071995453e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31485992, + "step": 16077 + }, + { + "epoch": 2.1309476474486413, + "grad_norm": 4.939930438995361, + "learning_rate": 9.665668905344555e-07, + "loss": 0.0324, + "num_input_tokens_seen": 31487416, + "step": 16078 + }, + { + "epoch": 2.1310801855533468, + "grad_norm": 8.268176078796387, + "learning_rate": 9.662927386391053e-07, + "loss": 0.2331, + "num_input_tokens_seen": 31490056, + "step": 16079 + }, + { + "epoch": 2.131212723658052, + "grad_norm": 4.270195960998535, + "learning_rate": 9.660186163146898e-07, + "loss": 0.0779, + "num_input_tokens_seen": 31493184, + "step": 16080 + }, + { + "epoch": 2.131345261762757, + "grad_norm": 2.958547592163086, + "learning_rate": 9.65744523566493e-07, + "loss": 0.0077, + "num_input_tokens_seen": 31494936, + "step": 16081 + }, + { + "epoch": 2.131477799867462, + "grad_norm": 0.027521083131432533, + "learning_rate": 9.654704603998011e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31497288, + "step": 16082 + }, + { + "epoch": 2.131610337972167, + "grad_norm": 0.007150406017899513, + "learning_rate": 9.65196426819897e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31499520, + "step": 16083 + }, + { + "epoch": 2.131742876076872, + "grad_norm": 0.10605286061763763, + "learning_rate": 9.64922422832064e-07, + "loss": 0.0005, + "num_input_tokens_seen": 31501992, + "step": 16084 + }, + { + "epoch": 2.131875414181577, + "grad_norm": 0.02458733692765236, + "learning_rate": 9.646484484415867e-07, + "loss": 0.0002, + "num_input_tokens_seen": 31504480, + "step": 16085 + }, + { + "epoch": 2.1320079522862825, + "grad_norm": 0.4717639088630676, + "learning_rate": 9.643745036537461e-07, + "loss": 0.0014, + "num_input_tokens_seen": 31506680, + "step": 16086 + }, + { + "epoch": 2.1321404903909875, + "grad_norm": 5.20466947555542, + "learning_rate": 9.64100588473824e-07, + "loss": 0.0596, + "num_input_tokens_seen": 31508600, + "step": 16087 + }, + { + "epoch": 2.1322730284956926, + "grad_norm": 2.5027499198913574, + "learning_rate": 9.638267029071027e-07, + "loss": 0.0214, + "num_input_tokens_seen": 31510616, + "step": 16088 + }, + { + "epoch": 2.1324055666003976, + "grad_norm": 2.1486763954162598, + "learning_rate": 9.635528469588614e-07, + "loss": 0.0115, + "num_input_tokens_seen": 31512112, + "step": 16089 + }, + { + "epoch": 2.1325381047051026, + "grad_norm": 2.9380295276641846, + "learning_rate": 9.632790206343822e-07, + "loss": 0.0243, + "num_input_tokens_seen": 31513848, + "step": 16090 + }, + { + "epoch": 2.1326706428098077, + "grad_norm": 0.004848671145737171, + "learning_rate": 9.630052239389438e-07, + "loss": 0.0, + "num_input_tokens_seen": 31515720, + "step": 16091 + }, + { + "epoch": 2.1328031809145127, + "grad_norm": 2.190732955932617, + "learning_rate": 9.627314568778246e-07, + "loss": 0.0184, + "num_input_tokens_seen": 31517528, + "step": 16092 + }, + { + "epoch": 2.132935719019218, + "grad_norm": 3.7082090377807617, + "learning_rate": 9.62457719456303e-07, + "loss": 0.017, + "num_input_tokens_seen": 31520472, + "step": 16093 + }, + { + "epoch": 2.1330682571239232, + "grad_norm": 0.31992393732070923, + "learning_rate": 9.621840116796577e-07, + "loss": 0.0021, + "num_input_tokens_seen": 31522328, + "step": 16094 + }, + { + "epoch": 2.1332007952286283, + "grad_norm": 16.42392349243164, + "learning_rate": 9.619103335531646e-07, + "loss": 0.1801, + "num_input_tokens_seen": 31524352, + "step": 16095 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.23243871331214905, + "learning_rate": 9.616366850821025e-07, + "loss": 0.0012, + "num_input_tokens_seen": 31526336, + "step": 16096 + }, + { + "epoch": 2.1334658714380383, + "grad_norm": 5.045673847198486, + "learning_rate": 9.613630662717463e-07, + "loss": 0.0154, + "num_input_tokens_seen": 31528144, + "step": 16097 + }, + { + "epoch": 2.1335984095427434, + "grad_norm": 0.18616047501564026, + "learning_rate": 9.610894771273716e-07, + "loss": 0.0012, + "num_input_tokens_seen": 31529640, + "step": 16098 + }, + { + "epoch": 2.1337309476474484, + "grad_norm": 2.5941483974456787, + "learning_rate": 9.608159176542525e-07, + "loss": 0.0059, + "num_input_tokens_seen": 31531184, + "step": 16099 + }, + { + "epoch": 2.133863485752154, + "grad_norm": 3.2780237197875977, + "learning_rate": 9.605423878576646e-07, + "loss": 0.0554, + "num_input_tokens_seen": 31533720, + "step": 16100 + }, + { + "epoch": 2.133996023856859, + "grad_norm": 0.0056445179507136345, + "learning_rate": 9.602688877428822e-07, + "loss": 0.0, + "num_input_tokens_seen": 31535168, + "step": 16101 + }, + { + "epoch": 2.134128561961564, + "grad_norm": 11.270421028137207, + "learning_rate": 9.59995417315178e-07, + "loss": 0.1496, + "num_input_tokens_seen": 31537376, + "step": 16102 + }, + { + "epoch": 2.134261100066269, + "grad_norm": 3.9139246940612793, + "learning_rate": 9.597219765798241e-07, + "loss": 0.0455, + "num_input_tokens_seen": 31539704, + "step": 16103 + }, + { + "epoch": 2.134393638170974, + "grad_norm": 4.7583746910095215, + "learning_rate": 9.594485655420926e-07, + "loss": 0.0339, + "num_input_tokens_seen": 31541552, + "step": 16104 + }, + { + "epoch": 2.134526176275679, + "grad_norm": 8.119093894958496, + "learning_rate": 9.591751842072552e-07, + "loss": 0.069, + "num_input_tokens_seen": 31543048, + "step": 16105 + }, + { + "epoch": 2.134658714380384, + "grad_norm": 0.2847495377063751, + "learning_rate": 9.589018325805842e-07, + "loss": 0.001, + "num_input_tokens_seen": 31544696, + "step": 16106 + }, + { + "epoch": 2.1347912524850896, + "grad_norm": 0.023146186023950577, + "learning_rate": 9.586285106673492e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31545984, + "step": 16107 + }, + { + "epoch": 2.1349237905897946, + "grad_norm": 0.024002045392990112, + "learning_rate": 9.5835521847282e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31547232, + "step": 16108 + }, + { + "epoch": 2.1350563286944997, + "grad_norm": 0.11687221378087997, + "learning_rate": 9.580819560022654e-07, + "loss": 0.0006, + "num_input_tokens_seen": 31548888, + "step": 16109 + }, + { + "epoch": 2.1351888667992047, + "grad_norm": 0.19441835582256317, + "learning_rate": 9.578087232609536e-07, + "loss": 0.0004, + "num_input_tokens_seen": 31550224, + "step": 16110 + }, + { + "epoch": 2.1353214049039098, + "grad_norm": 1.8425302505493164, + "learning_rate": 9.575355202541533e-07, + "loss": 0.0129, + "num_input_tokens_seen": 31552104, + "step": 16111 + }, + { + "epoch": 2.135453943008615, + "grad_norm": 0.005997309461236, + "learning_rate": 9.572623469871336e-07, + "loss": 0.0, + "num_input_tokens_seen": 31553696, + "step": 16112 + }, + { + "epoch": 2.1355864811133203, + "grad_norm": 0.42871078848838806, + "learning_rate": 9.569892034651598e-07, + "loss": 0.002, + "num_input_tokens_seen": 31555336, + "step": 16113 + }, + { + "epoch": 2.1357190192180253, + "grad_norm": 0.8083357214927673, + "learning_rate": 9.567160896934988e-07, + "loss": 0.0017, + "num_input_tokens_seen": 31557016, + "step": 16114 + }, + { + "epoch": 2.1358515573227304, + "grad_norm": 0.09059926122426987, + "learning_rate": 9.564430056774151e-07, + "loss": 0.0003, + "num_input_tokens_seen": 31558904, + "step": 16115 + }, + { + "epoch": 2.1359840954274354, + "grad_norm": 0.07792149484157562, + "learning_rate": 9.561699514221763e-07, + "loss": 0.0002, + "num_input_tokens_seen": 31560864, + "step": 16116 + }, + { + "epoch": 2.1361166335321404, + "grad_norm": 1.5018413066864014, + "learning_rate": 9.558969269330448e-07, + "loss": 0.0065, + "num_input_tokens_seen": 31563040, + "step": 16117 + }, + { + "epoch": 2.1362491716368455, + "grad_norm": 1.0631963014602661, + "learning_rate": 9.556239322152866e-07, + "loss": 0.0048, + "num_input_tokens_seen": 31564928, + "step": 16118 + }, + { + "epoch": 2.1363817097415505, + "grad_norm": 4.5471625328063965, + "learning_rate": 9.553509672741646e-07, + "loss": 0.1125, + "num_input_tokens_seen": 31567320, + "step": 16119 + }, + { + "epoch": 2.136514247846256, + "grad_norm": 0.09209774434566498, + "learning_rate": 9.550780321149414e-07, + "loss": 0.0003, + "num_input_tokens_seen": 31568448, + "step": 16120 + }, + { + "epoch": 2.136646785950961, + "grad_norm": 1.036643147468567, + "learning_rate": 9.548051267428787e-07, + "loss": 0.0063, + "num_input_tokens_seen": 31570088, + "step": 16121 + }, + { + "epoch": 2.136779324055666, + "grad_norm": 0.916536808013916, + "learning_rate": 9.545322511632401e-07, + "loss": 0.0044, + "num_input_tokens_seen": 31572064, + "step": 16122 + }, + { + "epoch": 2.136911862160371, + "grad_norm": 4.551767349243164, + "learning_rate": 9.542594053812848e-07, + "loss": 0.0298, + "num_input_tokens_seen": 31573320, + "step": 16123 + }, + { + "epoch": 2.137044400265076, + "grad_norm": 7.391839027404785, + "learning_rate": 9.539865894022755e-07, + "loss": 0.0669, + "num_input_tokens_seen": 31575096, + "step": 16124 + }, + { + "epoch": 2.137176938369781, + "grad_norm": 0.07263413816690445, + "learning_rate": 9.537138032314713e-07, + "loss": 0.0005, + "num_input_tokens_seen": 31576344, + "step": 16125 + }, + { + "epoch": 2.137309476474486, + "grad_norm": 25.39282989501953, + "learning_rate": 9.534410468741309e-07, + "loss": 0.2838, + "num_input_tokens_seen": 31578712, + "step": 16126 + }, + { + "epoch": 2.1374420145791917, + "grad_norm": 0.557320773601532, + "learning_rate": 9.531683203355146e-07, + "loss": 0.0024, + "num_input_tokens_seen": 31580344, + "step": 16127 + }, + { + "epoch": 2.1375745526838967, + "grad_norm": 0.2329101413488388, + "learning_rate": 9.528956236208797e-07, + "loss": 0.0008, + "num_input_tokens_seen": 31581864, + "step": 16128 + }, + { + "epoch": 2.1377070907886018, + "grad_norm": 6.1121087074279785, + "learning_rate": 9.52622956735485e-07, + "loss": 0.0442, + "num_input_tokens_seen": 31583704, + "step": 16129 + }, + { + "epoch": 2.137839628893307, + "grad_norm": 0.03288009390234947, + "learning_rate": 9.523503196845873e-07, + "loss": 0.0002, + "num_input_tokens_seen": 31585480, + "step": 16130 + }, + { + "epoch": 2.137972166998012, + "grad_norm": 7.6009368896484375, + "learning_rate": 9.520777124734423e-07, + "loss": 0.0741, + "num_input_tokens_seen": 31587488, + "step": 16131 + }, + { + "epoch": 2.138104705102717, + "grad_norm": 0.21550118923187256, + "learning_rate": 9.518051351073079e-07, + "loss": 0.0009, + "num_input_tokens_seen": 31589280, + "step": 16132 + }, + { + "epoch": 2.1382372432074224, + "grad_norm": 0.002960493788123131, + "learning_rate": 9.515325875914383e-07, + "loss": 0.0, + "num_input_tokens_seen": 31591024, + "step": 16133 + }, + { + "epoch": 2.1383697813121274, + "grad_norm": 0.2940198481082916, + "learning_rate": 9.512600699310878e-07, + "loss": 0.0015, + "num_input_tokens_seen": 31592736, + "step": 16134 + }, + { + "epoch": 2.1385023194168324, + "grad_norm": 8.266256332397461, + "learning_rate": 9.509875821315126e-07, + "loss": 0.1394, + "num_input_tokens_seen": 31594472, + "step": 16135 + }, + { + "epoch": 2.1386348575215375, + "grad_norm": 0.0049509345553815365, + "learning_rate": 9.507151241979643e-07, + "loss": 0.0, + "num_input_tokens_seen": 31596128, + "step": 16136 + }, + { + "epoch": 2.1387673956262425, + "grad_norm": 0.012205123901367188, + "learning_rate": 9.504426961356983e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31598272, + "step": 16137 + }, + { + "epoch": 2.1388999337309476, + "grad_norm": 3.4081907272338867, + "learning_rate": 9.501702979499658e-07, + "loss": 0.0113, + "num_input_tokens_seen": 31599664, + "step": 16138 + }, + { + "epoch": 2.1390324718356526, + "grad_norm": 0.01042995322495699, + "learning_rate": 9.498979296460195e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31602992, + "step": 16139 + }, + { + "epoch": 2.139165009940358, + "grad_norm": 0.04743897169828415, + "learning_rate": 9.496255912291095e-07, + "loss": 0.0003, + "num_input_tokens_seen": 31605688, + "step": 16140 + }, + { + "epoch": 2.139297548045063, + "grad_norm": 0.01022800151258707, + "learning_rate": 9.493532827044874e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31607216, + "step": 16141 + }, + { + "epoch": 2.139430086149768, + "grad_norm": 0.012211198918521404, + "learning_rate": 9.490810040774048e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31609624, + "step": 16142 + }, + { + "epoch": 2.139562624254473, + "grad_norm": 0.0026489999145269394, + "learning_rate": 9.488087553531103e-07, + "loss": 0.0, + "num_input_tokens_seen": 31611952, + "step": 16143 + }, + { + "epoch": 2.1396951623591782, + "grad_norm": 9.145845413208008, + "learning_rate": 9.485365365368532e-07, + "loss": 0.0543, + "num_input_tokens_seen": 31614104, + "step": 16144 + }, + { + "epoch": 2.1398277004638833, + "grad_norm": 0.007660697679966688, + "learning_rate": 9.482643476338821e-07, + "loss": 0.0, + "num_input_tokens_seen": 31615896, + "step": 16145 + }, + { + "epoch": 2.1399602385685883, + "grad_norm": 9.002161979675293, + "learning_rate": 9.479921886494437e-07, + "loss": 0.144, + "num_input_tokens_seen": 31617768, + "step": 16146 + }, + { + "epoch": 2.140092776673294, + "grad_norm": 0.4844762086868286, + "learning_rate": 9.477200595887868e-07, + "loss": 0.0024, + "num_input_tokens_seen": 31619504, + "step": 16147 + }, + { + "epoch": 2.140225314777999, + "grad_norm": 0.017951352521777153, + "learning_rate": 9.474479604571588e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31621960, + "step": 16148 + }, + { + "epoch": 2.140357852882704, + "grad_norm": 0.007865986786782742, + "learning_rate": 9.47175891259805e-07, + "loss": 0.0, + "num_input_tokens_seen": 31623280, + "step": 16149 + }, + { + "epoch": 2.140490390987409, + "grad_norm": 0.0025304299779236317, + "learning_rate": 9.469038520019716e-07, + "loss": 0.0, + "num_input_tokens_seen": 31624776, + "step": 16150 + }, + { + "epoch": 2.140622929092114, + "grad_norm": 4.182048320770264, + "learning_rate": 9.466318426889032e-07, + "loss": 0.0633, + "num_input_tokens_seen": 31626616, + "step": 16151 + }, + { + "epoch": 2.140755467196819, + "grad_norm": 1.7968076467514038, + "learning_rate": 9.463598633258437e-07, + "loss": 0.009, + "num_input_tokens_seen": 31629864, + "step": 16152 + }, + { + "epoch": 2.140888005301524, + "grad_norm": 0.07165373116731644, + "learning_rate": 9.460879139180376e-07, + "loss": 0.0004, + "num_input_tokens_seen": 31631792, + "step": 16153 + }, + { + "epoch": 2.1410205434062295, + "grad_norm": 8.528525352478027, + "learning_rate": 9.458159944707296e-07, + "loss": 0.1164, + "num_input_tokens_seen": 31633744, + "step": 16154 + }, + { + "epoch": 2.1411530815109345, + "grad_norm": 12.273754119873047, + "learning_rate": 9.455441049891612e-07, + "loss": 0.0443, + "num_input_tokens_seen": 31635440, + "step": 16155 + }, + { + "epoch": 2.1412856196156396, + "grad_norm": 0.01286132913082838, + "learning_rate": 9.452722454785749e-07, + "loss": 0.0, + "num_input_tokens_seen": 31637784, + "step": 16156 + }, + { + "epoch": 2.1414181577203446, + "grad_norm": 0.09625675529241562, + "learning_rate": 9.450004159442114e-07, + "loss": 0.0007, + "num_input_tokens_seen": 31639752, + "step": 16157 + }, + { + "epoch": 2.1415506958250496, + "grad_norm": 0.0034940026234835386, + "learning_rate": 9.447286163913128e-07, + "loss": 0.0, + "num_input_tokens_seen": 31641744, + "step": 16158 + }, + { + "epoch": 2.1416832339297547, + "grad_norm": 1.0387438535690308, + "learning_rate": 9.4445684682512e-07, + "loss": 0.0025, + "num_input_tokens_seen": 31643552, + "step": 16159 + }, + { + "epoch": 2.1418157720344597, + "grad_norm": 0.13720466196537018, + "learning_rate": 9.441851072508726e-07, + "loss": 0.0006, + "num_input_tokens_seen": 31645824, + "step": 16160 + }, + { + "epoch": 2.141948310139165, + "grad_norm": 7.158527851104736, + "learning_rate": 9.439133976738094e-07, + "loss": 0.0594, + "num_input_tokens_seen": 31647216, + "step": 16161 + }, + { + "epoch": 2.1420808482438702, + "grad_norm": 3.2074477672576904, + "learning_rate": 9.436417180991686e-07, + "loss": 0.0703, + "num_input_tokens_seen": 31648936, + "step": 16162 + }, + { + "epoch": 2.1422133863485753, + "grad_norm": 0.04987587034702301, + "learning_rate": 9.4337006853219e-07, + "loss": 0.0002, + "num_input_tokens_seen": 31651040, + "step": 16163 + }, + { + "epoch": 2.1423459244532803, + "grad_norm": 0.001366147887893021, + "learning_rate": 9.430984489781095e-07, + "loss": 0.0, + "num_input_tokens_seen": 31653640, + "step": 16164 + }, + { + "epoch": 2.1424784625579854, + "grad_norm": 8.50261402130127, + "learning_rate": 9.428268594421658e-07, + "loss": 0.1152, + "num_input_tokens_seen": 31655560, + "step": 16165 + }, + { + "epoch": 2.1426110006626904, + "grad_norm": 0.003825852647423744, + "learning_rate": 9.425552999295945e-07, + "loss": 0.0, + "num_input_tokens_seen": 31657472, + "step": 16166 + }, + { + "epoch": 2.1427435387673954, + "grad_norm": 12.706666946411133, + "learning_rate": 9.422837704456306e-07, + "loss": 0.1536, + "num_input_tokens_seen": 31659328, + "step": 16167 + }, + { + "epoch": 2.142876076872101, + "grad_norm": 10.95345401763916, + "learning_rate": 9.420122709955109e-07, + "loss": 0.1022, + "num_input_tokens_seen": 31661112, + "step": 16168 + }, + { + "epoch": 2.143008614976806, + "grad_norm": 0.2134910523891449, + "learning_rate": 9.417408015844695e-07, + "loss": 0.0007, + "num_input_tokens_seen": 31662688, + "step": 16169 + }, + { + "epoch": 2.143141153081511, + "grad_norm": 0.00654176389798522, + "learning_rate": 9.414693622177398e-07, + "loss": 0.0, + "num_input_tokens_seen": 31664840, + "step": 16170 + }, + { + "epoch": 2.143273691186216, + "grad_norm": 0.009351802058517933, + "learning_rate": 9.411979529005566e-07, + "loss": 0.0, + "num_input_tokens_seen": 31666400, + "step": 16171 + }, + { + "epoch": 2.143406229290921, + "grad_norm": 8.891071319580078, + "learning_rate": 9.409265736381515e-07, + "loss": 0.1067, + "num_input_tokens_seen": 31667960, + "step": 16172 + }, + { + "epoch": 2.143538767395626, + "grad_norm": 9.3027925491333, + "learning_rate": 9.406552244357584e-07, + "loss": 0.1951, + "num_input_tokens_seen": 31670512, + "step": 16173 + }, + { + "epoch": 2.143671305500331, + "grad_norm": 11.335594177246094, + "learning_rate": 9.403839052986083e-07, + "loss": 0.0715, + "num_input_tokens_seen": 31673912, + "step": 16174 + }, + { + "epoch": 2.1438038436050366, + "grad_norm": 0.0008645054185763001, + "learning_rate": 9.401126162319324e-07, + "loss": 0.0, + "num_input_tokens_seen": 31675224, + "step": 16175 + }, + { + "epoch": 2.1439363817097417, + "grad_norm": 0.1555292159318924, + "learning_rate": 9.398413572409606e-07, + "loss": 0.0008, + "num_input_tokens_seen": 31676928, + "step": 16176 + }, + { + "epoch": 2.1440689198144467, + "grad_norm": 0.08556928485631943, + "learning_rate": 9.395701283309245e-07, + "loss": 0.0005, + "num_input_tokens_seen": 31679056, + "step": 16177 + }, + { + "epoch": 2.1442014579191517, + "grad_norm": 0.001294081099331379, + "learning_rate": 9.39298929507052e-07, + "loss": 0.0, + "num_input_tokens_seen": 31680600, + "step": 16178 + }, + { + "epoch": 2.1443339960238568, + "grad_norm": 6.066516876220703, + "learning_rate": 9.390277607745738e-07, + "loss": 0.051, + "num_input_tokens_seen": 31682176, + "step": 16179 + }, + { + "epoch": 2.144466534128562, + "grad_norm": 0.6687232851982117, + "learning_rate": 9.38756622138717e-07, + "loss": 0.0039, + "num_input_tokens_seen": 31684032, + "step": 16180 + }, + { + "epoch": 2.144599072233267, + "grad_norm": 2.428736448287964, + "learning_rate": 9.384855136047088e-07, + "loss": 0.0211, + "num_input_tokens_seen": 31686432, + "step": 16181 + }, + { + "epoch": 2.1447316103379723, + "grad_norm": 3.453789234161377, + "learning_rate": 9.382144351777781e-07, + "loss": 0.0191, + "num_input_tokens_seen": 31688400, + "step": 16182 + }, + { + "epoch": 2.1448641484426774, + "grad_norm": 5.131911277770996, + "learning_rate": 9.379433868631493e-07, + "loss": 0.0699, + "num_input_tokens_seen": 31690552, + "step": 16183 + }, + { + "epoch": 2.1449966865473824, + "grad_norm": 0.19395923614501953, + "learning_rate": 9.376723686660508e-07, + "loss": 0.0009, + "num_input_tokens_seen": 31691560, + "step": 16184 + }, + { + "epoch": 2.1451292246520874, + "grad_norm": 2.869584560394287, + "learning_rate": 9.374013805917068e-07, + "loss": 0.0236, + "num_input_tokens_seen": 31693864, + "step": 16185 + }, + { + "epoch": 2.1452617627567925, + "grad_norm": 0.021380914375185966, + "learning_rate": 9.37130422645342e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31695288, + "step": 16186 + }, + { + "epoch": 2.1453943008614975, + "grad_norm": 0.022113464772701263, + "learning_rate": 9.368594948321802e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31696592, + "step": 16187 + }, + { + "epoch": 2.1455268389662026, + "grad_norm": 0.000471513340016827, + "learning_rate": 9.365885971574456e-07, + "loss": 0.0, + "num_input_tokens_seen": 31698200, + "step": 16188 + }, + { + "epoch": 2.145659377070908, + "grad_norm": 2.6180081367492676, + "learning_rate": 9.36317729626362e-07, + "loss": 0.0118, + "num_input_tokens_seen": 31699976, + "step": 16189 + }, + { + "epoch": 2.145791915175613, + "grad_norm": 0.01344381459057331, + "learning_rate": 9.360468922441518e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31703056, + "step": 16190 + }, + { + "epoch": 2.145924453280318, + "grad_norm": 6.0160722732543945, + "learning_rate": 9.357760850160361e-07, + "loss": 0.0688, + "num_input_tokens_seen": 31705304, + "step": 16191 + }, + { + "epoch": 2.146056991385023, + "grad_norm": 0.00019644621352199465, + "learning_rate": 9.355053079472365e-07, + "loss": 0.0, + "num_input_tokens_seen": 31706328, + "step": 16192 + }, + { + "epoch": 2.146189529489728, + "grad_norm": 4.057782173156738, + "learning_rate": 9.352345610429733e-07, + "loss": 0.0427, + "num_input_tokens_seen": 31708208, + "step": 16193 + }, + { + "epoch": 2.1463220675944332, + "grad_norm": 9.826642990112305, + "learning_rate": 9.349638443084669e-07, + "loss": 0.133, + "num_input_tokens_seen": 31710168, + "step": 16194 + }, + { + "epoch": 2.1464546056991387, + "grad_norm": 0.0009362462442368269, + "learning_rate": 9.346931577489383e-07, + "loss": 0.0, + "num_input_tokens_seen": 31711272, + "step": 16195 + }, + { + "epoch": 2.1465871438038437, + "grad_norm": 0.16852040588855743, + "learning_rate": 9.344225013696054e-07, + "loss": 0.0009, + "num_input_tokens_seen": 31713656, + "step": 16196 + }, + { + "epoch": 2.146719681908549, + "grad_norm": 0.003864973085001111, + "learning_rate": 9.341518751756867e-07, + "loss": 0.0, + "num_input_tokens_seen": 31715360, + "step": 16197 + }, + { + "epoch": 2.146852220013254, + "grad_norm": 0.02112683095037937, + "learning_rate": 9.338812791723995e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31717800, + "step": 16198 + }, + { + "epoch": 2.146984758117959, + "grad_norm": 0.005130508448928595, + "learning_rate": 9.336107133649622e-07, + "loss": 0.0, + "num_input_tokens_seen": 31720088, + "step": 16199 + }, + { + "epoch": 2.147117296222664, + "grad_norm": 2.187309741973877, + "learning_rate": 9.333401777585902e-07, + "loss": 0.0057, + "num_input_tokens_seen": 31724184, + "step": 16200 + }, + { + "epoch": 2.147249834327369, + "grad_norm": 0.09846512228250504, + "learning_rate": 9.330696723585012e-07, + "loss": 0.0005, + "num_input_tokens_seen": 31726112, + "step": 16201 + }, + { + "epoch": 2.1473823724320744, + "grad_norm": 2.1234724521636963, + "learning_rate": 9.327991971699097e-07, + "loss": 0.0204, + "num_input_tokens_seen": 31728040, + "step": 16202 + }, + { + "epoch": 2.1475149105367795, + "grad_norm": 3.657836437225342, + "learning_rate": 9.325287521980303e-07, + "loss": 0.0352, + "num_input_tokens_seen": 31729840, + "step": 16203 + }, + { + "epoch": 2.1476474486414845, + "grad_norm": 0.13401801884174347, + "learning_rate": 9.322583374480784e-07, + "loss": 0.0004, + "num_input_tokens_seen": 31731120, + "step": 16204 + }, + { + "epoch": 2.1477799867461895, + "grad_norm": 0.01660325564444065, + "learning_rate": 9.319879529252676e-07, + "loss": 0.0, + "num_input_tokens_seen": 31732256, + "step": 16205 + }, + { + "epoch": 2.1479125248508946, + "grad_norm": 0.04423169046640396, + "learning_rate": 9.317175986348098e-07, + "loss": 0.0003, + "num_input_tokens_seen": 31733656, + "step": 16206 + }, + { + "epoch": 2.1480450629555996, + "grad_norm": 1.9051802158355713, + "learning_rate": 9.314472745819194e-07, + "loss": 0.0105, + "num_input_tokens_seen": 31734736, + "step": 16207 + }, + { + "epoch": 2.1481776010603046, + "grad_norm": 0.26849591732025146, + "learning_rate": 9.311769807718076e-07, + "loss": 0.0012, + "num_input_tokens_seen": 31735888, + "step": 16208 + }, + { + "epoch": 2.14831013916501, + "grad_norm": 0.002372746355831623, + "learning_rate": 9.309067172096853e-07, + "loss": 0.0, + "num_input_tokens_seen": 31737800, + "step": 16209 + }, + { + "epoch": 2.148442677269715, + "grad_norm": 13.417776107788086, + "learning_rate": 9.306364839007645e-07, + "loss": 0.16, + "num_input_tokens_seen": 31739504, + "step": 16210 + }, + { + "epoch": 2.14857521537442, + "grad_norm": 4.874273300170898, + "learning_rate": 9.303662808502542e-07, + "loss": 0.0115, + "num_input_tokens_seen": 31741464, + "step": 16211 + }, + { + "epoch": 2.1487077534791252, + "grad_norm": 9.336577415466309, + "learning_rate": 9.300961080633659e-07, + "loss": 0.056, + "num_input_tokens_seen": 31743016, + "step": 16212 + }, + { + "epoch": 2.1488402915838303, + "grad_norm": 0.1485712081193924, + "learning_rate": 9.298259655453074e-07, + "loss": 0.0011, + "num_input_tokens_seen": 31745192, + "step": 16213 + }, + { + "epoch": 2.1489728296885353, + "grad_norm": 1.395764946937561, + "learning_rate": 9.295558533012867e-07, + "loss": 0.0223, + "num_input_tokens_seen": 31747440, + "step": 16214 + }, + { + "epoch": 2.1491053677932404, + "grad_norm": 0.10491873323917389, + "learning_rate": 9.292857713365134e-07, + "loss": 0.0007, + "num_input_tokens_seen": 31749304, + "step": 16215 + }, + { + "epoch": 2.149237905897946, + "grad_norm": 1.2465040683746338, + "learning_rate": 9.290157196561939e-07, + "loss": 0.0081, + "num_input_tokens_seen": 31751832, + "step": 16216 + }, + { + "epoch": 2.149370444002651, + "grad_norm": 0.0006115460419096053, + "learning_rate": 9.287456982655343e-07, + "loss": 0.0, + "num_input_tokens_seen": 31753424, + "step": 16217 + }, + { + "epoch": 2.149502982107356, + "grad_norm": 11.82949447631836, + "learning_rate": 9.284757071697426e-07, + "loss": 0.0768, + "num_input_tokens_seen": 31755656, + "step": 16218 + }, + { + "epoch": 2.149635520212061, + "grad_norm": 5.837401390075684, + "learning_rate": 9.282057463740224e-07, + "loss": 0.0344, + "num_input_tokens_seen": 31757504, + "step": 16219 + }, + { + "epoch": 2.149768058316766, + "grad_norm": 0.28341424465179443, + "learning_rate": 9.279358158835808e-07, + "loss": 0.0015, + "num_input_tokens_seen": 31759040, + "step": 16220 + }, + { + "epoch": 2.149900596421471, + "grad_norm": 11.043609619140625, + "learning_rate": 9.276659157036208e-07, + "loss": 0.1107, + "num_input_tokens_seen": 31761504, + "step": 16221 + }, + { + "epoch": 2.1500331345261765, + "grad_norm": 2.8094217777252197, + "learning_rate": 9.273960458393469e-07, + "loss": 0.0186, + "num_input_tokens_seen": 31763600, + "step": 16222 + }, + { + "epoch": 2.1501656726308815, + "grad_norm": 19.75331687927246, + "learning_rate": 9.271262062959613e-07, + "loss": 0.1066, + "num_input_tokens_seen": 31766008, + "step": 16223 + }, + { + "epoch": 2.1502982107355866, + "grad_norm": 4.31569766998291, + "learning_rate": 9.268563970786673e-07, + "loss": 0.0426, + "num_input_tokens_seen": 31767904, + "step": 16224 + }, + { + "epoch": 2.1504307488402916, + "grad_norm": 11.932907104492188, + "learning_rate": 9.265866181926683e-07, + "loss": 0.1437, + "num_input_tokens_seen": 31770104, + "step": 16225 + }, + { + "epoch": 2.1505632869449967, + "grad_norm": 13.723921775817871, + "learning_rate": 9.263168696431646e-07, + "loss": 0.2389, + "num_input_tokens_seen": 31772200, + "step": 16226 + }, + { + "epoch": 2.1506958250497017, + "grad_norm": 0.01985761523246765, + "learning_rate": 9.260471514353575e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31774056, + "step": 16227 + }, + { + "epoch": 2.1508283631544067, + "grad_norm": 6.340223789215088, + "learning_rate": 9.257774635744468e-07, + "loss": 0.0343, + "num_input_tokens_seen": 31775864, + "step": 16228 + }, + { + "epoch": 2.150960901259112, + "grad_norm": 0.005917272996157408, + "learning_rate": 9.255078060656319e-07, + "loss": 0.0, + "num_input_tokens_seen": 31777704, + "step": 16229 + }, + { + "epoch": 2.1510934393638173, + "grad_norm": 1.6982152462005615, + "learning_rate": 9.252381789141126e-07, + "loss": 0.012, + "num_input_tokens_seen": 31780568, + "step": 16230 + }, + { + "epoch": 2.1512259774685223, + "grad_norm": 0.007099973503500223, + "learning_rate": 9.249685821250887e-07, + "loss": 0.0, + "num_input_tokens_seen": 31782160, + "step": 16231 + }, + { + "epoch": 2.1513585155732273, + "grad_norm": 4.672240734100342, + "learning_rate": 9.246990157037569e-07, + "loss": 0.039, + "num_input_tokens_seen": 31784696, + "step": 16232 + }, + { + "epoch": 2.1514910536779324, + "grad_norm": 0.9148737788200378, + "learning_rate": 9.24429479655315e-07, + "loss": 0.0036, + "num_input_tokens_seen": 31786296, + "step": 16233 + }, + { + "epoch": 2.1516235917826374, + "grad_norm": 6.01843786239624, + "learning_rate": 9.241599739849585e-07, + "loss": 0.0548, + "num_input_tokens_seen": 31788592, + "step": 16234 + }, + { + "epoch": 2.1517561298873424, + "grad_norm": 7.280119895935059, + "learning_rate": 9.23890498697885e-07, + "loss": 0.2176, + "num_input_tokens_seen": 31790984, + "step": 16235 + }, + { + "epoch": 2.151888667992048, + "grad_norm": 0.0006190254935063422, + "learning_rate": 9.23621053799291e-07, + "loss": 0.0, + "num_input_tokens_seen": 31792400, + "step": 16236 + }, + { + "epoch": 2.152021206096753, + "grad_norm": 0.054704733192920685, + "learning_rate": 9.233516392943703e-07, + "loss": 0.0002, + "num_input_tokens_seen": 31794048, + "step": 16237 + }, + { + "epoch": 2.152153744201458, + "grad_norm": 0.028584782034158707, + "learning_rate": 9.230822551883179e-07, + "loss": 0.0002, + "num_input_tokens_seen": 31796120, + "step": 16238 + }, + { + "epoch": 2.152286282306163, + "grad_norm": 0.0022372419480234385, + "learning_rate": 9.228129014863271e-07, + "loss": 0.0, + "num_input_tokens_seen": 31797432, + "step": 16239 + }, + { + "epoch": 2.152418820410868, + "grad_norm": 7.567423343658447, + "learning_rate": 9.225435781935909e-07, + "loss": 0.1204, + "num_input_tokens_seen": 31799864, + "step": 16240 + }, + { + "epoch": 2.152551358515573, + "grad_norm": 0.0038301527965813875, + "learning_rate": 9.22274285315303e-07, + "loss": 0.0, + "num_input_tokens_seen": 31801048, + "step": 16241 + }, + { + "epoch": 2.152683896620278, + "grad_norm": 3.1408138275146484, + "learning_rate": 9.22005022856656e-07, + "loss": 0.0355, + "num_input_tokens_seen": 31803160, + "step": 16242 + }, + { + "epoch": 2.1528164347249836, + "grad_norm": 2.440631866455078, + "learning_rate": 9.217357908228409e-07, + "loss": 0.0166, + "num_input_tokens_seen": 31805464, + "step": 16243 + }, + { + "epoch": 2.1529489728296887, + "grad_norm": 11.36818790435791, + "learning_rate": 9.214665892190484e-07, + "loss": 0.1031, + "num_input_tokens_seen": 31807144, + "step": 16244 + }, + { + "epoch": 2.1530815109343937, + "grad_norm": 8.45652961730957, + "learning_rate": 9.211974180504679e-07, + "loss": 0.0603, + "num_input_tokens_seen": 31809568, + "step": 16245 + }, + { + "epoch": 2.1532140490390987, + "grad_norm": 0.635740339756012, + "learning_rate": 9.209282773222916e-07, + "loss": 0.0047, + "num_input_tokens_seen": 31811128, + "step": 16246 + }, + { + "epoch": 2.153346587143804, + "grad_norm": 3.3793442249298096, + "learning_rate": 9.206591670397064e-07, + "loss": 0.0206, + "num_input_tokens_seen": 31812496, + "step": 16247 + }, + { + "epoch": 2.153479125248509, + "grad_norm": 20.20541763305664, + "learning_rate": 9.20390087207903e-07, + "loss": 0.3728, + "num_input_tokens_seen": 31814680, + "step": 16248 + }, + { + "epoch": 2.153611663353214, + "grad_norm": 1.4733071327209473, + "learning_rate": 9.201210378320683e-07, + "loss": 0.0057, + "num_input_tokens_seen": 31816064, + "step": 16249 + }, + { + "epoch": 2.1537442014579193, + "grad_norm": 13.205214500427246, + "learning_rate": 9.19852018917389e-07, + "loss": 0.0845, + "num_input_tokens_seen": 31818832, + "step": 16250 + }, + { + "epoch": 2.1538767395626244, + "grad_norm": 4.623105525970459, + "learning_rate": 9.195830304690539e-07, + "loss": 0.0465, + "num_input_tokens_seen": 31820320, + "step": 16251 + }, + { + "epoch": 2.1540092776673294, + "grad_norm": 0.00034794409293681383, + "learning_rate": 9.193140724922481e-07, + "loss": 0.0, + "num_input_tokens_seen": 31821624, + "step": 16252 + }, + { + "epoch": 2.1541418157720345, + "grad_norm": 0.00833373237401247, + "learning_rate": 9.190451449921567e-07, + "loss": 0.0, + "num_input_tokens_seen": 31823232, + "step": 16253 + }, + { + "epoch": 2.1542743538767395, + "grad_norm": 0.908818244934082, + "learning_rate": 9.187762479739665e-07, + "loss": 0.0029, + "num_input_tokens_seen": 31824936, + "step": 16254 + }, + { + "epoch": 2.1544068919814445, + "grad_norm": 13.074993133544922, + "learning_rate": 9.185073814428599e-07, + "loss": 0.0965, + "num_input_tokens_seen": 31826928, + "step": 16255 + }, + { + "epoch": 2.1545394300861496, + "grad_norm": 1.2655054330825806, + "learning_rate": 9.18238545404023e-07, + "loss": 0.0096, + "num_input_tokens_seen": 31829144, + "step": 16256 + }, + { + "epoch": 2.154671968190855, + "grad_norm": 0.42516398429870605, + "learning_rate": 9.179697398626383e-07, + "loss": 0.0012, + "num_input_tokens_seen": 31831304, + "step": 16257 + }, + { + "epoch": 2.15480450629556, + "grad_norm": 0.005742785520851612, + "learning_rate": 9.177009648238882e-07, + "loss": 0.0, + "num_input_tokens_seen": 31832680, + "step": 16258 + }, + { + "epoch": 2.154937044400265, + "grad_norm": 5.27374267578125, + "learning_rate": 9.174322202929542e-07, + "loss": 0.0438, + "num_input_tokens_seen": 31835200, + "step": 16259 + }, + { + "epoch": 2.15506958250497, + "grad_norm": 0.4690706431865692, + "learning_rate": 9.171635062750189e-07, + "loss": 0.0012, + "num_input_tokens_seen": 31837488, + "step": 16260 + }, + { + "epoch": 2.155202120609675, + "grad_norm": 2.084514856338501, + "learning_rate": 9.168948227752639e-07, + "loss": 0.009, + "num_input_tokens_seen": 31839624, + "step": 16261 + }, + { + "epoch": 2.1553346587143802, + "grad_norm": 0.22302556037902832, + "learning_rate": 9.166261697988687e-07, + "loss": 0.0005, + "num_input_tokens_seen": 31841016, + "step": 16262 + }, + { + "epoch": 2.1554671968190853, + "grad_norm": 0.0037711237091571093, + "learning_rate": 9.163575473510136e-07, + "loss": 0.0, + "num_input_tokens_seen": 31842840, + "step": 16263 + }, + { + "epoch": 2.1555997349237908, + "grad_norm": 21.521520614624023, + "learning_rate": 9.160889554368766e-07, + "loss": 0.1458, + "num_input_tokens_seen": 31844912, + "step": 16264 + }, + { + "epoch": 2.155732273028496, + "grad_norm": 10.142009735107422, + "learning_rate": 9.158203940616378e-07, + "loss": 0.0926, + "num_input_tokens_seen": 31846984, + "step": 16265 + }, + { + "epoch": 2.155864811133201, + "grad_norm": 0.055978547781705856, + "learning_rate": 9.155518632304738e-07, + "loss": 0.0002, + "num_input_tokens_seen": 31849272, + "step": 16266 + }, + { + "epoch": 2.155997349237906, + "grad_norm": 0.02795438840985298, + "learning_rate": 9.152833629485641e-07, + "loss": 0.0002, + "num_input_tokens_seen": 31851536, + "step": 16267 + }, + { + "epoch": 2.156129887342611, + "grad_norm": 0.01588273234665394, + "learning_rate": 9.150148932210842e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31852920, + "step": 16268 + }, + { + "epoch": 2.156262425447316, + "grad_norm": 0.002919077407568693, + "learning_rate": 9.147464540532108e-07, + "loss": 0.0, + "num_input_tokens_seen": 31854528, + "step": 16269 + }, + { + "epoch": 2.156394963552021, + "grad_norm": 15.744500160217285, + "learning_rate": 9.144780454501181e-07, + "loss": 0.021, + "num_input_tokens_seen": 31857072, + "step": 16270 + }, + { + "epoch": 2.1565275016567265, + "grad_norm": 0.1647627055644989, + "learning_rate": 9.142096674169829e-07, + "loss": 0.001, + "num_input_tokens_seen": 31858992, + "step": 16271 + }, + { + "epoch": 2.1566600397614315, + "grad_norm": 3.7383735179901123, + "learning_rate": 9.1394131995898e-07, + "loss": 0.0498, + "num_input_tokens_seen": 31861304, + "step": 16272 + }, + { + "epoch": 2.1567925778661365, + "grad_norm": 0.02968645468354225, + "learning_rate": 9.136730030812827e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31863736, + "step": 16273 + }, + { + "epoch": 2.1569251159708416, + "grad_norm": 6.896124839782715, + "learning_rate": 9.134047167890639e-07, + "loss": 0.0399, + "num_input_tokens_seen": 31865544, + "step": 16274 + }, + { + "epoch": 2.1570576540755466, + "grad_norm": 0.00278258603066206, + "learning_rate": 9.131364610874971e-07, + "loss": 0.0, + "num_input_tokens_seen": 31867304, + "step": 16275 + }, + { + "epoch": 2.1571901921802517, + "grad_norm": 0.0015640315832570195, + "learning_rate": 9.128682359817532e-07, + "loss": 0.0, + "num_input_tokens_seen": 31869168, + "step": 16276 + }, + { + "epoch": 2.1573227302849567, + "grad_norm": 2.173910140991211, + "learning_rate": 9.126000414770042e-07, + "loss": 0.0099, + "num_input_tokens_seen": 31871640, + "step": 16277 + }, + { + "epoch": 2.157455268389662, + "grad_norm": 0.7526139616966248, + "learning_rate": 9.123318775784227e-07, + "loss": 0.0015, + "num_input_tokens_seen": 31873528, + "step": 16278 + }, + { + "epoch": 2.157587806494367, + "grad_norm": 8.941062927246094, + "learning_rate": 9.120637442911779e-07, + "loss": 0.121, + "num_input_tokens_seen": 31875728, + "step": 16279 + }, + { + "epoch": 2.1577203445990722, + "grad_norm": 0.17758284509181976, + "learning_rate": 9.117956416204393e-07, + "loss": 0.0006, + "num_input_tokens_seen": 31877872, + "step": 16280 + }, + { + "epoch": 2.1578528827037773, + "grad_norm": 0.0022955616004765034, + "learning_rate": 9.115275695713757e-07, + "loss": 0.0, + "num_input_tokens_seen": 31879272, + "step": 16281 + }, + { + "epoch": 2.1579854208084823, + "grad_norm": 0.0018755816854536533, + "learning_rate": 9.112595281491571e-07, + "loss": 0.0, + "num_input_tokens_seen": 31881072, + "step": 16282 + }, + { + "epoch": 2.1581179589131874, + "grad_norm": 9.561823844909668, + "learning_rate": 9.109915173589501e-07, + "loss": 0.061, + "num_input_tokens_seen": 31883496, + "step": 16283 + }, + { + "epoch": 2.158250497017893, + "grad_norm": 1.0439566373825073, + "learning_rate": 9.107235372059236e-07, + "loss": 0.002, + "num_input_tokens_seen": 31886032, + "step": 16284 + }, + { + "epoch": 2.158383035122598, + "grad_norm": 1.5033072233200073, + "learning_rate": 9.10455587695244e-07, + "loss": 0.014, + "num_input_tokens_seen": 31888432, + "step": 16285 + }, + { + "epoch": 2.158515573227303, + "grad_norm": 0.009445133619010448, + "learning_rate": 9.10187668832076e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31890104, + "step": 16286 + }, + { + "epoch": 2.158648111332008, + "grad_norm": 6.966722011566162, + "learning_rate": 9.099197806215875e-07, + "loss": 0.1207, + "num_input_tokens_seen": 31893096, + "step": 16287 + }, + { + "epoch": 2.158780649436713, + "grad_norm": 6.857524871826172, + "learning_rate": 9.096519230689418e-07, + "loss": 0.0516, + "num_input_tokens_seen": 31895480, + "step": 16288 + }, + { + "epoch": 2.158913187541418, + "grad_norm": 0.0023178448900580406, + "learning_rate": 9.093840961793048e-07, + "loss": 0.0, + "num_input_tokens_seen": 31896976, + "step": 16289 + }, + { + "epoch": 2.159045725646123, + "grad_norm": 15.556846618652344, + "learning_rate": 9.091162999578398e-07, + "loss": 0.0339, + "num_input_tokens_seen": 31898712, + "step": 16290 + }, + { + "epoch": 2.1591782637508286, + "grad_norm": 5.336599349975586, + "learning_rate": 9.088485344097095e-07, + "loss": 0.1755, + "num_input_tokens_seen": 31901136, + "step": 16291 + }, + { + "epoch": 2.1593108018555336, + "grad_norm": 10.42993450164795, + "learning_rate": 9.085807995400775e-07, + "loss": 0.134, + "num_input_tokens_seen": 31903784, + "step": 16292 + }, + { + "epoch": 2.1594433399602386, + "grad_norm": 15.401012420654297, + "learning_rate": 9.083130953541058e-07, + "loss": 0.2098, + "num_input_tokens_seen": 31905600, + "step": 16293 + }, + { + "epoch": 2.1595758780649437, + "grad_norm": 5.290477275848389, + "learning_rate": 9.080454218569548e-07, + "loss": 0.034, + "num_input_tokens_seen": 31907768, + "step": 16294 + }, + { + "epoch": 2.1597084161696487, + "grad_norm": 0.0001051467697834596, + "learning_rate": 9.077777790537873e-07, + "loss": 0.0, + "num_input_tokens_seen": 31908672, + "step": 16295 + }, + { + "epoch": 2.1598409542743537, + "grad_norm": 6.950831413269043, + "learning_rate": 9.075101669497624e-07, + "loss": 0.0871, + "num_input_tokens_seen": 31911128, + "step": 16296 + }, + { + "epoch": 2.159973492379059, + "grad_norm": 0.002090666675940156, + "learning_rate": 9.072425855500394e-07, + "loss": 0.0, + "num_input_tokens_seen": 31912776, + "step": 16297 + }, + { + "epoch": 2.1601060304837643, + "grad_norm": 0.03462889418005943, + "learning_rate": 9.069750348597789e-07, + "loss": 0.0002, + "num_input_tokens_seen": 31915280, + "step": 16298 + }, + { + "epoch": 2.1602385685884693, + "grad_norm": 5.907403945922852, + "learning_rate": 9.067075148841387e-07, + "loss": 0.037, + "num_input_tokens_seen": 31917376, + "step": 16299 + }, + { + "epoch": 2.1603711066931743, + "grad_norm": 0.0038726096972823143, + "learning_rate": 9.064400256282757e-07, + "loss": 0.0, + "num_input_tokens_seen": 31919512, + "step": 16300 + }, + { + "epoch": 2.1605036447978794, + "grad_norm": 6.817558765411377, + "learning_rate": 9.061725670973495e-07, + "loss": 0.0686, + "num_input_tokens_seen": 31921544, + "step": 16301 + }, + { + "epoch": 2.1606361829025844, + "grad_norm": 0.0014543962897732854, + "learning_rate": 9.059051392965149e-07, + "loss": 0.0, + "num_input_tokens_seen": 31922904, + "step": 16302 + }, + { + "epoch": 2.1607687210072895, + "grad_norm": 0.020216206088662148, + "learning_rate": 9.056377422309295e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31924352, + "step": 16303 + }, + { + "epoch": 2.160901259111995, + "grad_norm": 0.029913511127233505, + "learning_rate": 9.053703759057486e-07, + "loss": 0.0002, + "num_input_tokens_seen": 31926120, + "step": 16304 + }, + { + "epoch": 2.1610337972167, + "grad_norm": 0.002127459505572915, + "learning_rate": 9.051030403261268e-07, + "loss": 0.0, + "num_input_tokens_seen": 31927632, + "step": 16305 + }, + { + "epoch": 2.161166335321405, + "grad_norm": 0.481268048286438, + "learning_rate": 9.048357354972179e-07, + "loss": 0.0015, + "num_input_tokens_seen": 31929504, + "step": 16306 + }, + { + "epoch": 2.16129887342611, + "grad_norm": 1.5549007654190063, + "learning_rate": 9.045684614241762e-07, + "loss": 0.0152, + "num_input_tokens_seen": 31931032, + "step": 16307 + }, + { + "epoch": 2.161431411530815, + "grad_norm": 11.882316589355469, + "learning_rate": 9.043012181121563e-07, + "loss": 0.1671, + "num_input_tokens_seen": 31933400, + "step": 16308 + }, + { + "epoch": 2.16156394963552, + "grad_norm": 0.005381524097174406, + "learning_rate": 9.040340055663097e-07, + "loss": 0.0, + "num_input_tokens_seen": 31934976, + "step": 16309 + }, + { + "epoch": 2.161696487740225, + "grad_norm": 21.94584083557129, + "learning_rate": 9.037668237917885e-07, + "loss": 0.1753, + "num_input_tokens_seen": 31936584, + "step": 16310 + }, + { + "epoch": 2.1618290258449306, + "grad_norm": 12.660004615783691, + "learning_rate": 9.03499672793744e-07, + "loss": 0.1887, + "num_input_tokens_seen": 31938328, + "step": 16311 + }, + { + "epoch": 2.1619615639496357, + "grad_norm": 9.35374641418457, + "learning_rate": 9.032325525773263e-07, + "loss": 0.1923, + "num_input_tokens_seen": 31941208, + "step": 16312 + }, + { + "epoch": 2.1620941020543407, + "grad_norm": 0.05568844452500343, + "learning_rate": 9.029654631476867e-07, + "loss": 0.0003, + "num_input_tokens_seen": 31942760, + "step": 16313 + }, + { + "epoch": 2.1622266401590458, + "grad_norm": 2.158252239227295, + "learning_rate": 9.026984045099754e-07, + "loss": 0.0132, + "num_input_tokens_seen": 31944504, + "step": 16314 + }, + { + "epoch": 2.162359178263751, + "grad_norm": 4.426802158355713, + "learning_rate": 9.02431376669341e-07, + "loss": 0.049, + "num_input_tokens_seen": 31946232, + "step": 16315 + }, + { + "epoch": 2.162491716368456, + "grad_norm": 0.012634278275072575, + "learning_rate": 9.021643796309315e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31947648, + "step": 16316 + }, + { + "epoch": 2.162624254473161, + "grad_norm": 0.016432592645287514, + "learning_rate": 9.018974133998945e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31949688, + "step": 16317 + }, + { + "epoch": 2.1627567925778663, + "grad_norm": 11.417874336242676, + "learning_rate": 9.016304779813776e-07, + "loss": 0.1375, + "num_input_tokens_seen": 31952120, + "step": 16318 + }, + { + "epoch": 2.1628893306825714, + "grad_norm": 4.407896041870117, + "learning_rate": 9.013635733805287e-07, + "loss": 0.04, + "num_input_tokens_seen": 31954096, + "step": 16319 + }, + { + "epoch": 2.1630218687872764, + "grad_norm": 0.015102413482964039, + "learning_rate": 9.010966996024928e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31955704, + "step": 16320 + }, + { + "epoch": 2.1631544068919815, + "grad_norm": 2.9844865798950195, + "learning_rate": 9.008298566524157e-07, + "loss": 0.0215, + "num_input_tokens_seen": 31957776, + "step": 16321 + }, + { + "epoch": 2.1632869449966865, + "grad_norm": 3.856421947479248, + "learning_rate": 9.005630445354421e-07, + "loss": 0.0098, + "num_input_tokens_seen": 31959696, + "step": 16322 + }, + { + "epoch": 2.1634194831013915, + "grad_norm": 4.188047885894775, + "learning_rate": 9.002962632567159e-07, + "loss": 0.0515, + "num_input_tokens_seen": 31961584, + "step": 16323 + }, + { + "epoch": 2.1635520212060966, + "grad_norm": 12.772509574890137, + "learning_rate": 9.00029512821381e-07, + "loss": 0.213, + "num_input_tokens_seen": 31963224, + "step": 16324 + }, + { + "epoch": 2.163684559310802, + "grad_norm": 5.330288887023926, + "learning_rate": 8.997627932345815e-07, + "loss": 0.0212, + "num_input_tokens_seen": 31965448, + "step": 16325 + }, + { + "epoch": 2.163817097415507, + "grad_norm": 1.5916491746902466, + "learning_rate": 8.994961045014597e-07, + "loss": 0.0035, + "num_input_tokens_seen": 31967560, + "step": 16326 + }, + { + "epoch": 2.163949635520212, + "grad_norm": 3.6776208877563477, + "learning_rate": 8.992294466271567e-07, + "loss": 0.0205, + "num_input_tokens_seen": 31969520, + "step": 16327 + }, + { + "epoch": 2.164082173624917, + "grad_norm": 0.0019953283481299877, + "learning_rate": 8.989628196168137e-07, + "loss": 0.0, + "num_input_tokens_seen": 31971832, + "step": 16328 + }, + { + "epoch": 2.164214711729622, + "grad_norm": 7.119016170501709, + "learning_rate": 8.98696223475573e-07, + "loss": 0.1252, + "num_input_tokens_seen": 31974008, + "step": 16329 + }, + { + "epoch": 2.1643472498343272, + "grad_norm": 0.41225719451904297, + "learning_rate": 8.984296582085727e-07, + "loss": 0.0019, + "num_input_tokens_seen": 31976488, + "step": 16330 + }, + { + "epoch": 2.1644797879390323, + "grad_norm": 0.029439054429531097, + "learning_rate": 8.981631238209546e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31977584, + "step": 16331 + }, + { + "epoch": 2.1646123260437378, + "grad_norm": 1.5935410261154175, + "learning_rate": 8.978966203178562e-07, + "loss": 0.0069, + "num_input_tokens_seen": 31980568, + "step": 16332 + }, + { + "epoch": 2.164744864148443, + "grad_norm": 2.0579516887664795, + "learning_rate": 8.976301477044156e-07, + "loss": 0.0139, + "num_input_tokens_seen": 31982512, + "step": 16333 + }, + { + "epoch": 2.164877402253148, + "grad_norm": 0.0004243080038577318, + "learning_rate": 8.973637059857718e-07, + "loss": 0.0, + "num_input_tokens_seen": 31983808, + "step": 16334 + }, + { + "epoch": 2.165009940357853, + "grad_norm": 0.023278528824448586, + "learning_rate": 8.970972951670615e-07, + "loss": 0.0001, + "num_input_tokens_seen": 31986056, + "step": 16335 + }, + { + "epoch": 2.165142478462558, + "grad_norm": 0.03380856662988663, + "learning_rate": 8.968309152534202e-07, + "loss": 0.0002, + "num_input_tokens_seen": 31987320, + "step": 16336 + }, + { + "epoch": 2.165275016567263, + "grad_norm": 0.03233925253152847, + "learning_rate": 8.965645662499858e-07, + "loss": 0.0002, + "num_input_tokens_seen": 31989424, + "step": 16337 + }, + { + "epoch": 2.165407554671968, + "grad_norm": 6.156039237976074, + "learning_rate": 8.962982481618917e-07, + "loss": 0.0337, + "num_input_tokens_seen": 31991760, + "step": 16338 + }, + { + "epoch": 2.1655400927766735, + "grad_norm": 0.1429811418056488, + "learning_rate": 8.960319609942747e-07, + "loss": 0.0008, + "num_input_tokens_seen": 31993368, + "step": 16339 + }, + { + "epoch": 2.1656726308813785, + "grad_norm": 6.612421989440918, + "learning_rate": 8.95765704752268e-07, + "loss": 0.0419, + "num_input_tokens_seen": 31994936, + "step": 16340 + }, + { + "epoch": 2.1658051689860836, + "grad_norm": 14.760262489318848, + "learning_rate": 8.954994794410044e-07, + "loss": 0.1268, + "num_input_tokens_seen": 31997000, + "step": 16341 + }, + { + "epoch": 2.1659377070907886, + "grad_norm": 5.923161029815674, + "learning_rate": 8.952332850656184e-07, + "loss": 0.0384, + "num_input_tokens_seen": 31999112, + "step": 16342 + }, + { + "epoch": 2.1660702451954936, + "grad_norm": 1.843796730041504, + "learning_rate": 8.949671216312411e-07, + "loss": 0.0161, + "num_input_tokens_seen": 32000936, + "step": 16343 + }, + { + "epoch": 2.1662027833001987, + "grad_norm": 0.029117045924067497, + "learning_rate": 8.947009891430056e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32002680, + "step": 16344 + }, + { + "epoch": 2.1663353214049037, + "grad_norm": 0.0009290559682995081, + "learning_rate": 8.944348876060426e-07, + "loss": 0.0, + "num_input_tokens_seen": 32003992, + "step": 16345 + }, + { + "epoch": 2.166467859509609, + "grad_norm": 12.250194549560547, + "learning_rate": 8.941688170254826e-07, + "loss": 0.2548, + "num_input_tokens_seen": 32006504, + "step": 16346 + }, + { + "epoch": 2.166600397614314, + "grad_norm": 0.05723026394844055, + "learning_rate": 8.939027774064546e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32007704, + "step": 16347 + }, + { + "epoch": 2.1667329357190193, + "grad_norm": 5.852693557739258, + "learning_rate": 8.936367687540893e-07, + "loss": 0.0433, + "num_input_tokens_seen": 32009424, + "step": 16348 + }, + { + "epoch": 2.1668654738237243, + "grad_norm": 7.560417175292969, + "learning_rate": 8.933707910735159e-07, + "loss": 0.1496, + "num_input_tokens_seen": 32011336, + "step": 16349 + }, + { + "epoch": 2.1669980119284293, + "grad_norm": 0.049138542264699936, + "learning_rate": 8.93104844369862e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32013048, + "step": 16350 + }, + { + "epoch": 2.1671305500331344, + "grad_norm": 0.06259223073720932, + "learning_rate": 8.928389286482555e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32014336, + "step": 16351 + }, + { + "epoch": 2.1672630881378394, + "grad_norm": 0.0021550392266362906, + "learning_rate": 8.925730439138231e-07, + "loss": 0.0, + "num_input_tokens_seen": 32015744, + "step": 16352 + }, + { + "epoch": 2.167395626242545, + "grad_norm": 1.1689785718917847, + "learning_rate": 8.923071901716903e-07, + "loss": 0.0093, + "num_input_tokens_seen": 32017672, + "step": 16353 + }, + { + "epoch": 2.16752816434725, + "grad_norm": 1.061637043952942, + "learning_rate": 8.92041367426984e-07, + "loss": 0.0057, + "num_input_tokens_seen": 32019680, + "step": 16354 + }, + { + "epoch": 2.167660702451955, + "grad_norm": 0.0325373113155365, + "learning_rate": 8.917755756848304e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32021520, + "step": 16355 + }, + { + "epoch": 2.16779324055666, + "grad_norm": 0.025693921372294426, + "learning_rate": 8.915098149503528e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32023792, + "step": 16356 + }, + { + "epoch": 2.167925778661365, + "grad_norm": 0.03290697559714317, + "learning_rate": 8.912440852286761e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32025896, + "step": 16357 + }, + { + "epoch": 2.16805831676607, + "grad_norm": 12.221582412719727, + "learning_rate": 8.90978386524923e-07, + "loss": 0.1534, + "num_input_tokens_seen": 32028048, + "step": 16358 + }, + { + "epoch": 2.168190854870775, + "grad_norm": 9.645878791809082, + "learning_rate": 8.907127188442158e-07, + "loss": 0.1706, + "num_input_tokens_seen": 32029840, + "step": 16359 + }, + { + "epoch": 2.1683233929754806, + "grad_norm": 0.11162801086902618, + "learning_rate": 8.904470821916775e-07, + "loss": 0.0008, + "num_input_tokens_seen": 32033240, + "step": 16360 + }, + { + "epoch": 2.1684559310801856, + "grad_norm": 0.0052468422800302505, + "learning_rate": 8.901814765724307e-07, + "loss": 0.0, + "num_input_tokens_seen": 32035536, + "step": 16361 + }, + { + "epoch": 2.1685884691848907, + "grad_norm": 1.023766040802002, + "learning_rate": 8.899159019915957e-07, + "loss": 0.0048, + "num_input_tokens_seen": 32037288, + "step": 16362 + }, + { + "epoch": 2.1687210072895957, + "grad_norm": 2.839874029159546, + "learning_rate": 8.896503584542926e-07, + "loss": 0.0263, + "num_input_tokens_seen": 32039344, + "step": 16363 + }, + { + "epoch": 2.1688535453943008, + "grad_norm": 3.3941948413848877, + "learning_rate": 8.893848459656409e-07, + "loss": 0.0192, + "num_input_tokens_seen": 32040592, + "step": 16364 + }, + { + "epoch": 2.168986083499006, + "grad_norm": 9.822312355041504, + "learning_rate": 8.891193645307614e-07, + "loss": 0.1876, + "num_input_tokens_seen": 32043248, + "step": 16365 + }, + { + "epoch": 2.1691186216037113, + "grad_norm": 4.423951148986816, + "learning_rate": 8.888539141547709e-07, + "loss": 0.0195, + "num_input_tokens_seen": 32045520, + "step": 16366 + }, + { + "epoch": 2.1692511597084163, + "grad_norm": 0.8300106525421143, + "learning_rate": 8.885884948427892e-07, + "loss": 0.0025, + "num_input_tokens_seen": 32046968, + "step": 16367 + }, + { + "epoch": 2.1693836978131213, + "grad_norm": 4.133303165435791, + "learning_rate": 8.883231065999329e-07, + "loss": 0.0396, + "num_input_tokens_seen": 32049584, + "step": 16368 + }, + { + "epoch": 2.1695162359178264, + "grad_norm": 2.550511121749878, + "learning_rate": 8.88057749431318e-07, + "loss": 0.0129, + "num_input_tokens_seen": 32052280, + "step": 16369 + }, + { + "epoch": 2.1696487740225314, + "grad_norm": 0.015759941190481186, + "learning_rate": 8.877924233420629e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32054144, + "step": 16370 + }, + { + "epoch": 2.1697813121272365, + "grad_norm": 7.0325212478637695, + "learning_rate": 8.87527128337281e-07, + "loss": 0.0467, + "num_input_tokens_seen": 32056360, + "step": 16371 + }, + { + "epoch": 2.1699138502319415, + "grad_norm": 0.008993811905384064, + "learning_rate": 8.872618644220895e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32058656, + "step": 16372 + }, + { + "epoch": 2.170046388336647, + "grad_norm": 0.009372849948704243, + "learning_rate": 8.869966316016015e-07, + "loss": 0.0, + "num_input_tokens_seen": 32060056, + "step": 16373 + }, + { + "epoch": 2.170178926441352, + "grad_norm": 9.357629776000977, + "learning_rate": 8.867314298809307e-07, + "loss": 0.0731, + "num_input_tokens_seen": 32062008, + "step": 16374 + }, + { + "epoch": 2.170311464546057, + "grad_norm": 0.0015723721589893103, + "learning_rate": 8.864662592651915e-07, + "loss": 0.0, + "num_input_tokens_seen": 32063568, + "step": 16375 + }, + { + "epoch": 2.170444002650762, + "grad_norm": 6.99209451675415, + "learning_rate": 8.86201119759496e-07, + "loss": 0.0972, + "num_input_tokens_seen": 32065272, + "step": 16376 + }, + { + "epoch": 2.170576540755467, + "grad_norm": 2.6327908039093018, + "learning_rate": 8.859360113689555e-07, + "loss": 0.0238, + "num_input_tokens_seen": 32067056, + "step": 16377 + }, + { + "epoch": 2.170709078860172, + "grad_norm": 0.40840256214141846, + "learning_rate": 8.85670934098683e-07, + "loss": 0.0027, + "num_input_tokens_seen": 32069440, + "step": 16378 + }, + { + "epoch": 2.170841616964877, + "grad_norm": 8.189689636230469, + "learning_rate": 8.854058879537883e-07, + "loss": 0.1326, + "num_input_tokens_seen": 32071336, + "step": 16379 + }, + { + "epoch": 2.1709741550695827, + "grad_norm": 0.0016157937934622169, + "learning_rate": 8.851408729393812e-07, + "loss": 0.0, + "num_input_tokens_seen": 32072680, + "step": 16380 + }, + { + "epoch": 2.1711066931742877, + "grad_norm": 0.0027830060571432114, + "learning_rate": 8.848758890605732e-07, + "loss": 0.0, + "num_input_tokens_seen": 32073808, + "step": 16381 + }, + { + "epoch": 2.1712392312789928, + "grad_norm": 8.20941162109375, + "learning_rate": 8.846109363224719e-07, + "loss": 0.0528, + "num_input_tokens_seen": 32075672, + "step": 16382 + }, + { + "epoch": 2.171371769383698, + "grad_norm": 11.36362361907959, + "learning_rate": 8.843460147301855e-07, + "loss": 0.2182, + "num_input_tokens_seen": 32078504, + "step": 16383 + }, + { + "epoch": 2.171504307488403, + "grad_norm": 12.60506820678711, + "learning_rate": 8.840811242888234e-07, + "loss": 0.1181, + "num_input_tokens_seen": 32080232, + "step": 16384 + }, + { + "epoch": 2.171636845593108, + "grad_norm": 0.005351016763597727, + "learning_rate": 8.838162650034912e-07, + "loss": 0.0, + "num_input_tokens_seen": 32081528, + "step": 16385 + }, + { + "epoch": 2.171769383697813, + "grad_norm": 0.03924409672617912, + "learning_rate": 8.835514368792972e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32082912, + "step": 16386 + }, + { + "epoch": 2.1719019218025184, + "grad_norm": 0.007604679558426142, + "learning_rate": 8.832866399213463e-07, + "loss": 0.0, + "num_input_tokens_seen": 32084208, + "step": 16387 + }, + { + "epoch": 2.1720344599072234, + "grad_norm": 0.004771833308041096, + "learning_rate": 8.830218741347446e-07, + "loss": 0.0, + "num_input_tokens_seen": 32085776, + "step": 16388 + }, + { + "epoch": 2.1721669980119285, + "grad_norm": 8.157522201538086, + "learning_rate": 8.827571395245957e-07, + "loss": 0.1436, + "num_input_tokens_seen": 32088256, + "step": 16389 + }, + { + "epoch": 2.1722995361166335, + "grad_norm": 0.0032732984982430935, + "learning_rate": 8.824924360960049e-07, + "loss": 0.0, + "num_input_tokens_seen": 32089280, + "step": 16390 + }, + { + "epoch": 2.1724320742213385, + "grad_norm": 2.30367374420166, + "learning_rate": 8.822277638540766e-07, + "loss": 0.0134, + "num_input_tokens_seen": 32091872, + "step": 16391 + }, + { + "epoch": 2.1725646123260436, + "grad_norm": 2.602233648300171, + "learning_rate": 8.81963122803913e-07, + "loss": 0.01, + "num_input_tokens_seen": 32093488, + "step": 16392 + }, + { + "epoch": 2.172697150430749, + "grad_norm": 0.10372641682624817, + "learning_rate": 8.816985129506167e-07, + "loss": 0.0004, + "num_input_tokens_seen": 32095336, + "step": 16393 + }, + { + "epoch": 2.172829688535454, + "grad_norm": 0.8153428435325623, + "learning_rate": 8.814339342992886e-07, + "loss": 0.0028, + "num_input_tokens_seen": 32096808, + "step": 16394 + }, + { + "epoch": 2.172962226640159, + "grad_norm": 0.026446107774972916, + "learning_rate": 8.811693868550309e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32098168, + "step": 16395 + }, + { + "epoch": 2.173094764744864, + "grad_norm": 0.0017325254157185555, + "learning_rate": 8.809048706229451e-07, + "loss": 0.0, + "num_input_tokens_seen": 32099296, + "step": 16396 + }, + { + "epoch": 2.173227302849569, + "grad_norm": 2.445796012878418, + "learning_rate": 8.806403856081302e-07, + "loss": 0.0075, + "num_input_tokens_seen": 32101200, + "step": 16397 + }, + { + "epoch": 2.1733598409542743, + "grad_norm": 0.000621061073616147, + "learning_rate": 8.803759318156857e-07, + "loss": 0.0, + "num_input_tokens_seen": 32103104, + "step": 16398 + }, + { + "epoch": 2.1734923790589793, + "grad_norm": 12.472467422485352, + "learning_rate": 8.801115092507109e-07, + "loss": 0.0221, + "num_input_tokens_seen": 32104904, + "step": 16399 + }, + { + "epoch": 2.1736249171636848, + "grad_norm": 0.02417619526386261, + "learning_rate": 8.798471179183027e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32106984, + "step": 16400 + }, + { + "epoch": 2.17375745526839, + "grad_norm": 0.0045822300016880035, + "learning_rate": 8.795827578235597e-07, + "loss": 0.0, + "num_input_tokens_seen": 32109672, + "step": 16401 + }, + { + "epoch": 2.173889993373095, + "grad_norm": 6.171019554138184, + "learning_rate": 8.793184289715797e-07, + "loss": 0.0263, + "num_input_tokens_seen": 32111304, + "step": 16402 + }, + { + "epoch": 2.1740225314778, + "grad_norm": 4.854536056518555, + "learning_rate": 8.790541313674586e-07, + "loss": 0.0605, + "num_input_tokens_seen": 32113288, + "step": 16403 + }, + { + "epoch": 2.174155069582505, + "grad_norm": 7.49040412902832, + "learning_rate": 8.787898650162921e-07, + "loss": 0.0583, + "num_input_tokens_seen": 32115152, + "step": 16404 + }, + { + "epoch": 2.17428760768721, + "grad_norm": 28.883913040161133, + "learning_rate": 8.785256299231745e-07, + "loss": 0.1415, + "num_input_tokens_seen": 32117824, + "step": 16405 + }, + { + "epoch": 2.174420145791915, + "grad_norm": 0.005794128403067589, + "learning_rate": 8.782614260932024e-07, + "loss": 0.0, + "num_input_tokens_seen": 32119520, + "step": 16406 + }, + { + "epoch": 2.1745526838966205, + "grad_norm": 0.000828034186270088, + "learning_rate": 8.779972535314676e-07, + "loss": 0.0, + "num_input_tokens_seen": 32120832, + "step": 16407 + }, + { + "epoch": 2.1746852220013255, + "grad_norm": 0.02971603535115719, + "learning_rate": 8.777331122430658e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32122680, + "step": 16408 + }, + { + "epoch": 2.1748177601060306, + "grad_norm": 0.3095114231109619, + "learning_rate": 8.774690022330887e-07, + "loss": 0.0009, + "num_input_tokens_seen": 32124488, + "step": 16409 + }, + { + "epoch": 2.1749502982107356, + "grad_norm": 0.0004727187333628535, + "learning_rate": 8.772049235066285e-07, + "loss": 0.0, + "num_input_tokens_seen": 32125944, + "step": 16410 + }, + { + "epoch": 2.1750828363154406, + "grad_norm": 9.651729583740234, + "learning_rate": 8.769408760687762e-07, + "loss": 0.0438, + "num_input_tokens_seen": 32128248, + "step": 16411 + }, + { + "epoch": 2.1752153744201457, + "grad_norm": 6.555649280548096, + "learning_rate": 8.766768599246242e-07, + "loss": 0.1173, + "num_input_tokens_seen": 32130064, + "step": 16412 + }, + { + "epoch": 2.1753479125248507, + "grad_norm": 5.126222133636475, + "learning_rate": 8.764128750792614e-07, + "loss": 0.0537, + "num_input_tokens_seen": 32132144, + "step": 16413 + }, + { + "epoch": 2.175480450629556, + "grad_norm": 0.00010566495620878413, + "learning_rate": 8.761489215377794e-07, + "loss": 0.0, + "num_input_tokens_seen": 32133336, + "step": 16414 + }, + { + "epoch": 2.1756129887342612, + "grad_norm": 0.0008844821713864803, + "learning_rate": 8.758849993052662e-07, + "loss": 0.0, + "num_input_tokens_seen": 32134744, + "step": 16415 + }, + { + "epoch": 2.1757455268389663, + "grad_norm": 7.598770618438721, + "learning_rate": 8.756211083868099e-07, + "loss": 0.1315, + "num_input_tokens_seen": 32136376, + "step": 16416 + }, + { + "epoch": 2.1758780649436713, + "grad_norm": 1.847217082977295, + "learning_rate": 8.753572487875e-07, + "loss": 0.0246, + "num_input_tokens_seen": 32138960, + "step": 16417 + }, + { + "epoch": 2.1760106030483763, + "grad_norm": 4.659603595733643, + "learning_rate": 8.750934205124226e-07, + "loss": 0.0441, + "num_input_tokens_seen": 32140568, + "step": 16418 + }, + { + "epoch": 2.1761431411530814, + "grad_norm": 3.483297348022461, + "learning_rate": 8.748296235666653e-07, + "loss": 0.0112, + "num_input_tokens_seen": 32142280, + "step": 16419 + }, + { + "epoch": 2.1762756792577864, + "grad_norm": 0.6684598326683044, + "learning_rate": 8.745658579553146e-07, + "loss": 0.0035, + "num_input_tokens_seen": 32145432, + "step": 16420 + }, + { + "epoch": 2.176408217362492, + "grad_norm": 0.006277100648730993, + "learning_rate": 8.743021236834543e-07, + "loss": 0.0, + "num_input_tokens_seen": 32146944, + "step": 16421 + }, + { + "epoch": 2.176540755467197, + "grad_norm": 10.952794075012207, + "learning_rate": 8.740384207561715e-07, + "loss": 0.1437, + "num_input_tokens_seen": 32149264, + "step": 16422 + }, + { + "epoch": 2.176673293571902, + "grad_norm": 0.21170520782470703, + "learning_rate": 8.737747491785495e-07, + "loss": 0.0016, + "num_input_tokens_seen": 32151928, + "step": 16423 + }, + { + "epoch": 2.176805831676607, + "grad_norm": 4.580447196960449, + "learning_rate": 8.735111089556716e-07, + "loss": 0.0668, + "num_input_tokens_seen": 32154184, + "step": 16424 + }, + { + "epoch": 2.176938369781312, + "grad_norm": 0.015064590610563755, + "learning_rate": 8.732475000926222e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32156408, + "step": 16425 + }, + { + "epoch": 2.177070907886017, + "grad_norm": 0.00421043811365962, + "learning_rate": 8.729839225944825e-07, + "loss": 0.0, + "num_input_tokens_seen": 32157944, + "step": 16426 + }, + { + "epoch": 2.177203445990722, + "grad_norm": 6.991897106170654, + "learning_rate": 8.727203764663361e-07, + "loss": 0.1093, + "num_input_tokens_seen": 32159584, + "step": 16427 + }, + { + "epoch": 2.1773359840954276, + "grad_norm": 7.092871189117432, + "learning_rate": 8.724568617132634e-07, + "loss": 0.0364, + "num_input_tokens_seen": 32162008, + "step": 16428 + }, + { + "epoch": 2.1774685222001327, + "grad_norm": 0.14041098952293396, + "learning_rate": 8.721933783403453e-07, + "loss": 0.0007, + "num_input_tokens_seen": 32163584, + "step": 16429 + }, + { + "epoch": 2.1776010603048377, + "grad_norm": 1.4582666158676147, + "learning_rate": 8.719299263526607e-07, + "loss": 0.011, + "num_input_tokens_seen": 32165488, + "step": 16430 + }, + { + "epoch": 2.1777335984095427, + "grad_norm": 0.014736780896782875, + "learning_rate": 8.716665057552903e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32167400, + "step": 16431 + }, + { + "epoch": 2.1778661365142478, + "grad_norm": 0.014151995070278645, + "learning_rate": 8.71403116553314e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32168904, + "step": 16432 + }, + { + "epoch": 2.177998674618953, + "grad_norm": 31.933948516845703, + "learning_rate": 8.711397587518092e-07, + "loss": 0.1483, + "num_input_tokens_seen": 32171248, + "step": 16433 + }, + { + "epoch": 2.178131212723658, + "grad_norm": 10.695718765258789, + "learning_rate": 8.708764323558533e-07, + "loss": 0.1239, + "num_input_tokens_seen": 32173288, + "step": 16434 + }, + { + "epoch": 2.1782637508283633, + "grad_norm": 0.054913926869630814, + "learning_rate": 8.70613137370524e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32175416, + "step": 16435 + }, + { + "epoch": 2.1783962889330684, + "grad_norm": 0.011469722725450993, + "learning_rate": 8.703498738008965e-07, + "loss": 0.0, + "num_input_tokens_seen": 32176960, + "step": 16436 + }, + { + "epoch": 2.1785288270377734, + "grad_norm": 3.1874213218688965, + "learning_rate": 8.700866416520473e-07, + "loss": 0.0561, + "num_input_tokens_seen": 32179128, + "step": 16437 + }, + { + "epoch": 2.1786613651424784, + "grad_norm": 0.2929127812385559, + "learning_rate": 8.698234409290535e-07, + "loss": 0.0007, + "num_input_tokens_seen": 32181264, + "step": 16438 + }, + { + "epoch": 2.1787939032471835, + "grad_norm": 0.13607093691825867, + "learning_rate": 8.69560271636988e-07, + "loss": 0.0006, + "num_input_tokens_seen": 32182888, + "step": 16439 + }, + { + "epoch": 2.1789264413518885, + "grad_norm": 10.752280235290527, + "learning_rate": 8.692971337809253e-07, + "loss": 0.1905, + "num_input_tokens_seen": 32184776, + "step": 16440 + }, + { + "epoch": 2.1790589794565935, + "grad_norm": 0.25187352299690247, + "learning_rate": 8.690340273659387e-07, + "loss": 0.0019, + "num_input_tokens_seen": 32186816, + "step": 16441 + }, + { + "epoch": 2.179191517561299, + "grad_norm": 0.00032210929202847183, + "learning_rate": 8.687709523971005e-07, + "loss": 0.0, + "num_input_tokens_seen": 32188776, + "step": 16442 + }, + { + "epoch": 2.179324055666004, + "grad_norm": 0.8406180143356323, + "learning_rate": 8.685079088794838e-07, + "loss": 0.0064, + "num_input_tokens_seen": 32190864, + "step": 16443 + }, + { + "epoch": 2.179456593770709, + "grad_norm": 1.4805094003677368, + "learning_rate": 8.682448968181606e-07, + "loss": 0.0067, + "num_input_tokens_seen": 32193240, + "step": 16444 + }, + { + "epoch": 2.179589131875414, + "grad_norm": 4.409290313720703, + "learning_rate": 8.679819162182019e-07, + "loss": 0.0389, + "num_input_tokens_seen": 32195128, + "step": 16445 + }, + { + "epoch": 2.179721669980119, + "grad_norm": 10.170660972595215, + "learning_rate": 8.677189670846775e-07, + "loss": 0.1347, + "num_input_tokens_seen": 32196936, + "step": 16446 + }, + { + "epoch": 2.179854208084824, + "grad_norm": 0.0032179669942706823, + "learning_rate": 8.674560494226566e-07, + "loss": 0.0, + "num_input_tokens_seen": 32199496, + "step": 16447 + }, + { + "epoch": 2.1799867461895293, + "grad_norm": 0.002872919663786888, + "learning_rate": 8.671931632372094e-07, + "loss": 0.0, + "num_input_tokens_seen": 32201520, + "step": 16448 + }, + { + "epoch": 2.1801192842942347, + "grad_norm": 16.09385871887207, + "learning_rate": 8.669303085334052e-07, + "loss": 0.0794, + "num_input_tokens_seen": 32203240, + "step": 16449 + }, + { + "epoch": 2.1802518223989398, + "grad_norm": 0.00070221844362095, + "learning_rate": 8.666674853163112e-07, + "loss": 0.0, + "num_input_tokens_seen": 32204696, + "step": 16450 + }, + { + "epoch": 2.180384360503645, + "grad_norm": 0.7485249638557434, + "learning_rate": 8.664046935909951e-07, + "loss": 0.0079, + "num_input_tokens_seen": 32206840, + "step": 16451 + }, + { + "epoch": 2.18051689860835, + "grad_norm": 0.018234029412269592, + "learning_rate": 8.661419333625223e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32208752, + "step": 16452 + }, + { + "epoch": 2.180649436713055, + "grad_norm": 0.006148578133434057, + "learning_rate": 8.658792046359612e-07, + "loss": 0.0, + "num_input_tokens_seen": 32210688, + "step": 16453 + }, + { + "epoch": 2.18078197481776, + "grad_norm": 0.008509054780006409, + "learning_rate": 8.656165074163755e-07, + "loss": 0.0, + "num_input_tokens_seen": 32213352, + "step": 16454 + }, + { + "epoch": 2.1809145129224654, + "grad_norm": 0.5992113351821899, + "learning_rate": 8.65353841708832e-07, + "loss": 0.0029, + "num_input_tokens_seen": 32216352, + "step": 16455 + }, + { + "epoch": 2.1810470510271704, + "grad_norm": 4.760822772979736, + "learning_rate": 8.650912075183938e-07, + "loss": 0.0668, + "num_input_tokens_seen": 32217840, + "step": 16456 + }, + { + "epoch": 2.1811795891318755, + "grad_norm": 5.125214576721191, + "learning_rate": 8.648286048501245e-07, + "loss": 0.0355, + "num_input_tokens_seen": 32219240, + "step": 16457 + }, + { + "epoch": 2.1813121272365805, + "grad_norm": 0.00920234713703394, + "learning_rate": 8.645660337090883e-07, + "loss": 0.0, + "num_input_tokens_seen": 32221272, + "step": 16458 + }, + { + "epoch": 2.1814446653412856, + "grad_norm": 4.813745498657227, + "learning_rate": 8.643034941003473e-07, + "loss": 0.0245, + "num_input_tokens_seen": 32223256, + "step": 16459 + }, + { + "epoch": 2.1815772034459906, + "grad_norm": 2.919987916946411, + "learning_rate": 8.640409860289625e-07, + "loss": 0.0132, + "num_input_tokens_seen": 32225296, + "step": 16460 + }, + { + "epoch": 2.1817097415506956, + "grad_norm": 0.3796074092388153, + "learning_rate": 8.63778509499997e-07, + "loss": 0.0021, + "num_input_tokens_seen": 32227656, + "step": 16461 + }, + { + "epoch": 2.181842279655401, + "grad_norm": 0.007414957042783499, + "learning_rate": 8.635160645185095e-07, + "loss": 0.0, + "num_input_tokens_seen": 32229696, + "step": 16462 + }, + { + "epoch": 2.181974817760106, + "grad_norm": 9.742204666137695, + "learning_rate": 8.632536510895623e-07, + "loss": 0.0411, + "num_input_tokens_seen": 32231600, + "step": 16463 + }, + { + "epoch": 2.182107355864811, + "grad_norm": 0.11146488785743713, + "learning_rate": 8.629912692182138e-07, + "loss": 0.0005, + "num_input_tokens_seen": 32233768, + "step": 16464 + }, + { + "epoch": 2.1822398939695162, + "grad_norm": 9.099875450134277, + "learning_rate": 8.627289189095231e-07, + "loss": 0.0349, + "num_input_tokens_seen": 32235808, + "step": 16465 + }, + { + "epoch": 2.1823724320742213, + "grad_norm": 3.2788429260253906, + "learning_rate": 8.624666001685472e-07, + "loss": 0.0173, + "num_input_tokens_seen": 32237232, + "step": 16466 + }, + { + "epoch": 2.1825049701789263, + "grad_norm": 9.758905410766602, + "learning_rate": 8.62204313000346e-07, + "loss": 0.2092, + "num_input_tokens_seen": 32239024, + "step": 16467 + }, + { + "epoch": 2.1826375082836313, + "grad_norm": 0.002124861115589738, + "learning_rate": 8.619420574099746e-07, + "loss": 0.0, + "num_input_tokens_seen": 32240352, + "step": 16468 + }, + { + "epoch": 2.182770046388337, + "grad_norm": 11.174568176269531, + "learning_rate": 8.616798334024912e-07, + "loss": 0.0916, + "num_input_tokens_seen": 32242624, + "step": 16469 + }, + { + "epoch": 2.182902584493042, + "grad_norm": 2.3794476985931396, + "learning_rate": 8.614176409829509e-07, + "loss": 0.0285, + "num_input_tokens_seen": 32244880, + "step": 16470 + }, + { + "epoch": 2.183035122597747, + "grad_norm": 12.936930656433105, + "learning_rate": 8.611554801564079e-07, + "loss": 0.1134, + "num_input_tokens_seen": 32246784, + "step": 16471 + }, + { + "epoch": 2.183167660702452, + "grad_norm": 4.877511978149414, + "learning_rate": 8.608933509279188e-07, + "loss": 0.0481, + "num_input_tokens_seen": 32249168, + "step": 16472 + }, + { + "epoch": 2.183300198807157, + "grad_norm": 5.332021236419678, + "learning_rate": 8.606312533025358e-07, + "loss": 0.0173, + "num_input_tokens_seen": 32251776, + "step": 16473 + }, + { + "epoch": 2.183432736911862, + "grad_norm": 0.06053527072072029, + "learning_rate": 8.603691872853137e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32254008, + "step": 16474 + }, + { + "epoch": 2.1835652750165675, + "grad_norm": 6.473111152648926, + "learning_rate": 8.601071528813051e-07, + "loss": 0.0711, + "num_input_tokens_seen": 32255752, + "step": 16475 + }, + { + "epoch": 2.1836978131212725, + "grad_norm": 0.3763246536254883, + "learning_rate": 8.598451500955618e-07, + "loss": 0.0015, + "num_input_tokens_seen": 32257264, + "step": 16476 + }, + { + "epoch": 2.1838303512259776, + "grad_norm": 11.567811012268066, + "learning_rate": 8.59583178933135e-07, + "loss": 0.0955, + "num_input_tokens_seen": 32259352, + "step": 16477 + }, + { + "epoch": 2.1839628893306826, + "grad_norm": 0.2376769632101059, + "learning_rate": 8.593212393990757e-07, + "loss": 0.0008, + "num_input_tokens_seen": 32261312, + "step": 16478 + }, + { + "epoch": 2.1840954274353876, + "grad_norm": 1.1171437501907349, + "learning_rate": 8.590593314984358e-07, + "loss": 0.0068, + "num_input_tokens_seen": 32264008, + "step": 16479 + }, + { + "epoch": 2.1842279655400927, + "grad_norm": 3.2087719440460205, + "learning_rate": 8.587974552362641e-07, + "loss": 0.0166, + "num_input_tokens_seen": 32265616, + "step": 16480 + }, + { + "epoch": 2.1843605036447977, + "grad_norm": 0.0557829886674881, + "learning_rate": 8.585356106176093e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32266856, + "step": 16481 + }, + { + "epoch": 2.184493041749503, + "grad_norm": 0.45588070154190063, + "learning_rate": 8.582737976475205e-07, + "loss": 0.0015, + "num_input_tokens_seen": 32268440, + "step": 16482 + }, + { + "epoch": 2.1846255798542082, + "grad_norm": 8.27389144897461, + "learning_rate": 8.580120163310449e-07, + "loss": 0.1307, + "num_input_tokens_seen": 32270936, + "step": 16483 + }, + { + "epoch": 2.1847581179589133, + "grad_norm": 0.00022269890177994967, + "learning_rate": 8.577502666732301e-07, + "loss": 0.0, + "num_input_tokens_seen": 32272136, + "step": 16484 + }, + { + "epoch": 2.1848906560636183, + "grad_norm": 0.062458913773298264, + "learning_rate": 8.57488548679124e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32274168, + "step": 16485 + }, + { + "epoch": 2.1850231941683234, + "grad_norm": 1.0263233184814453, + "learning_rate": 8.572268623537719e-07, + "loss": 0.004, + "num_input_tokens_seen": 32275984, + "step": 16486 + }, + { + "epoch": 2.1851557322730284, + "grad_norm": 9.072057723999023, + "learning_rate": 8.569652077022189e-07, + "loss": 0.1722, + "num_input_tokens_seen": 32278328, + "step": 16487 + }, + { + "epoch": 2.1852882703777334, + "grad_norm": 4.590048313140869, + "learning_rate": 8.567035847295096e-07, + "loss": 0.0146, + "num_input_tokens_seen": 32280760, + "step": 16488 + }, + { + "epoch": 2.185420808482439, + "grad_norm": 5.983201503753662, + "learning_rate": 8.564419934406895e-07, + "loss": 0.0707, + "num_input_tokens_seen": 32282480, + "step": 16489 + }, + { + "epoch": 2.185553346587144, + "grad_norm": 0.006004919297993183, + "learning_rate": 8.561804338408008e-07, + "loss": 0.0, + "num_input_tokens_seen": 32284248, + "step": 16490 + }, + { + "epoch": 2.185685884691849, + "grad_norm": 0.013163235038518906, + "learning_rate": 8.559189059348883e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32285648, + "step": 16491 + }, + { + "epoch": 2.185818422796554, + "grad_norm": 0.0029043289832770824, + "learning_rate": 8.556574097279933e-07, + "loss": 0.0, + "num_input_tokens_seen": 32287144, + "step": 16492 + }, + { + "epoch": 2.185950960901259, + "grad_norm": 11.34067153930664, + "learning_rate": 8.553959452251579e-07, + "loss": 0.0773, + "num_input_tokens_seen": 32288296, + "step": 16493 + }, + { + "epoch": 2.186083499005964, + "grad_norm": 4.677226543426514, + "learning_rate": 8.551345124314225e-07, + "loss": 0.1079, + "num_input_tokens_seen": 32290272, + "step": 16494 + }, + { + "epoch": 2.186216037110669, + "grad_norm": 2.2177865505218506, + "learning_rate": 8.548731113518292e-07, + "loss": 0.0089, + "num_input_tokens_seen": 32292848, + "step": 16495 + }, + { + "epoch": 2.1863485752153746, + "grad_norm": 0.01839907467365265, + "learning_rate": 8.546117419914162e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32294272, + "step": 16496 + }, + { + "epoch": 2.1864811133200797, + "grad_norm": 9.268694877624512, + "learning_rate": 8.543504043552251e-07, + "loss": 0.072, + "num_input_tokens_seen": 32296432, + "step": 16497 + }, + { + "epoch": 2.1866136514247847, + "grad_norm": 0.09686466306447983, + "learning_rate": 8.540890984482933e-07, + "loss": 0.0005, + "num_input_tokens_seen": 32298352, + "step": 16498 + }, + { + "epoch": 2.1867461895294897, + "grad_norm": 4.109129428863525, + "learning_rate": 8.538278242756581e-07, + "loss": 0.0428, + "num_input_tokens_seen": 32300280, + "step": 16499 + }, + { + "epoch": 2.1868787276341948, + "grad_norm": 0.01647254452109337, + "learning_rate": 8.535665818423591e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32303048, + "step": 16500 + }, + { + "epoch": 2.1870112657389, + "grad_norm": 0.8939768671989441, + "learning_rate": 8.533053711534314e-07, + "loss": 0.0059, + "num_input_tokens_seen": 32304792, + "step": 16501 + }, + { + "epoch": 2.187143803843605, + "grad_norm": 0.7834133505821228, + "learning_rate": 8.530441922139132e-07, + "loss": 0.0049, + "num_input_tokens_seen": 32306704, + "step": 16502 + }, + { + "epoch": 2.1872763419483103, + "grad_norm": 0.9840855002403259, + "learning_rate": 8.527830450288387e-07, + "loss": 0.0031, + "num_input_tokens_seen": 32308656, + "step": 16503 + }, + { + "epoch": 2.1874088800530154, + "grad_norm": 0.002331552328541875, + "learning_rate": 8.52521929603243e-07, + "loss": 0.0, + "num_input_tokens_seen": 32310176, + "step": 16504 + }, + { + "epoch": 2.1875414181577204, + "grad_norm": 0.054897282272577286, + "learning_rate": 8.522608459421617e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32311280, + "step": 16505 + }, + { + "epoch": 2.1876739562624254, + "grad_norm": 0.0017160241259261966, + "learning_rate": 8.519997940506281e-07, + "loss": 0.0, + "num_input_tokens_seen": 32312800, + "step": 16506 + }, + { + "epoch": 2.1878064943671305, + "grad_norm": 6.117879867553711, + "learning_rate": 8.517387739336743e-07, + "loss": 0.151, + "num_input_tokens_seen": 32314872, + "step": 16507 + }, + { + "epoch": 2.1879390324718355, + "grad_norm": 6.077617645263672, + "learning_rate": 8.514777855963349e-07, + "loss": 0.037, + "num_input_tokens_seen": 32316736, + "step": 16508 + }, + { + "epoch": 2.1880715705765406, + "grad_norm": 0.6500634551048279, + "learning_rate": 8.512168290436404e-07, + "loss": 0.0018, + "num_input_tokens_seen": 32318048, + "step": 16509 + }, + { + "epoch": 2.188204108681246, + "grad_norm": 0.3877097964286804, + "learning_rate": 8.509559042806237e-07, + "loss": 0.0014, + "num_input_tokens_seen": 32319984, + "step": 16510 + }, + { + "epoch": 2.188336646785951, + "grad_norm": 4.627021312713623, + "learning_rate": 8.506950113123147e-07, + "loss": 0.0462, + "num_input_tokens_seen": 32321976, + "step": 16511 + }, + { + "epoch": 2.188469184890656, + "grad_norm": 0.014595831744372845, + "learning_rate": 8.504341501437435e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32323768, + "step": 16512 + }, + { + "epoch": 2.188601722995361, + "grad_norm": 1.4934358596801758, + "learning_rate": 8.501733207799393e-07, + "loss": 0.0055, + "num_input_tokens_seen": 32326672, + "step": 16513 + }, + { + "epoch": 2.188734261100066, + "grad_norm": 0.03707326203584671, + "learning_rate": 8.499125232259318e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32328864, + "step": 16514 + }, + { + "epoch": 2.1888667992047712, + "grad_norm": 7.298274993896484, + "learning_rate": 8.496517574867496e-07, + "loss": 0.0906, + "num_input_tokens_seen": 32330664, + "step": 16515 + }, + { + "epoch": 2.1889993373094763, + "grad_norm": 6.642621040344238, + "learning_rate": 8.493910235674202e-07, + "loss": 0.0201, + "num_input_tokens_seen": 32332488, + "step": 16516 + }, + { + "epoch": 2.1891318754141817, + "grad_norm": 4.521309852600098, + "learning_rate": 8.491303214729704e-07, + "loss": 0.0748, + "num_input_tokens_seen": 32334168, + "step": 16517 + }, + { + "epoch": 2.189264413518887, + "grad_norm": 4.338212490081787, + "learning_rate": 8.488696512084271e-07, + "loss": 0.0217, + "num_input_tokens_seen": 32335400, + "step": 16518 + }, + { + "epoch": 2.189396951623592, + "grad_norm": 0.009581783786416054, + "learning_rate": 8.486090127788152e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32337752, + "step": 16519 + }, + { + "epoch": 2.189529489728297, + "grad_norm": 0.30978938937187195, + "learning_rate": 8.483484061891606e-07, + "loss": 0.0008, + "num_input_tokens_seen": 32339376, + "step": 16520 + }, + { + "epoch": 2.189662027833002, + "grad_norm": 5.421680450439453, + "learning_rate": 8.48087831444489e-07, + "loss": 0.1188, + "num_input_tokens_seen": 32341144, + "step": 16521 + }, + { + "epoch": 2.189794565937707, + "grad_norm": 0.004379692953079939, + "learning_rate": 8.478272885498237e-07, + "loss": 0.0, + "num_input_tokens_seen": 32342976, + "step": 16522 + }, + { + "epoch": 2.189927104042412, + "grad_norm": 7.226802349090576, + "learning_rate": 8.475667775101881e-07, + "loss": 0.0322, + "num_input_tokens_seen": 32344952, + "step": 16523 + }, + { + "epoch": 2.1900596421471175, + "grad_norm": 4.630097389221191, + "learning_rate": 8.47306298330604e-07, + "loss": 0.0721, + "num_input_tokens_seen": 32346784, + "step": 16524 + }, + { + "epoch": 2.1901921802518225, + "grad_norm": 0.0009398441179655492, + "learning_rate": 8.470458510160945e-07, + "loss": 0.0, + "num_input_tokens_seen": 32347896, + "step": 16525 + }, + { + "epoch": 2.1903247183565275, + "grad_norm": 12.004461288452148, + "learning_rate": 8.467854355716823e-07, + "loss": 0.0944, + "num_input_tokens_seen": 32350472, + "step": 16526 + }, + { + "epoch": 2.1904572564612326, + "grad_norm": 3.0960607528686523, + "learning_rate": 8.465250520023874e-07, + "loss": 0.0292, + "num_input_tokens_seen": 32352168, + "step": 16527 + }, + { + "epoch": 2.1905897945659376, + "grad_norm": 0.22534780204296112, + "learning_rate": 8.462647003132301e-07, + "loss": 0.0017, + "num_input_tokens_seen": 32355112, + "step": 16528 + }, + { + "epoch": 2.1907223326706426, + "grad_norm": 0.004439543467015028, + "learning_rate": 8.460043805092305e-07, + "loss": 0.0, + "num_input_tokens_seen": 32356496, + "step": 16529 + }, + { + "epoch": 2.1908548707753477, + "grad_norm": 0.5017176270484924, + "learning_rate": 8.457440925954063e-07, + "loss": 0.0021, + "num_input_tokens_seen": 32358320, + "step": 16530 + }, + { + "epoch": 2.190987408880053, + "grad_norm": 0.25894895195961, + "learning_rate": 8.454838365767773e-07, + "loss": 0.0015, + "num_input_tokens_seen": 32360008, + "step": 16531 + }, + { + "epoch": 2.191119946984758, + "grad_norm": 2.5129244327545166, + "learning_rate": 8.452236124583624e-07, + "loss": 0.0112, + "num_input_tokens_seen": 32361728, + "step": 16532 + }, + { + "epoch": 2.1912524850894632, + "grad_norm": 0.026141416281461716, + "learning_rate": 8.449634202451775e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32364472, + "step": 16533 + }, + { + "epoch": 2.1913850231941683, + "grad_norm": 19.51425552368164, + "learning_rate": 8.447032599422398e-07, + "loss": 0.0969, + "num_input_tokens_seen": 32367040, + "step": 16534 + }, + { + "epoch": 2.1915175612988733, + "grad_norm": 2.536156415939331, + "learning_rate": 8.444431315545642e-07, + "loss": 0.0434, + "num_input_tokens_seen": 32369136, + "step": 16535 + }, + { + "epoch": 2.1916500994035784, + "grad_norm": 22.4178466796875, + "learning_rate": 8.441830350871679e-07, + "loss": 0.2222, + "num_input_tokens_seen": 32371248, + "step": 16536 + }, + { + "epoch": 2.191782637508284, + "grad_norm": 0.6694629192352295, + "learning_rate": 8.439229705450644e-07, + "loss": 0.0033, + "num_input_tokens_seen": 32372928, + "step": 16537 + }, + { + "epoch": 2.191915175612989, + "grad_norm": 4.045031547546387, + "learning_rate": 8.436629379332693e-07, + "loss": 0.1143, + "num_input_tokens_seen": 32374416, + "step": 16538 + }, + { + "epoch": 2.192047713717694, + "grad_norm": 13.851426124572754, + "learning_rate": 8.434029372567953e-07, + "loss": 0.1598, + "num_input_tokens_seen": 32376856, + "step": 16539 + }, + { + "epoch": 2.192180251822399, + "grad_norm": 3.52768874168396, + "learning_rate": 8.431429685206547e-07, + "loss": 0.0589, + "num_input_tokens_seen": 32379648, + "step": 16540 + }, + { + "epoch": 2.192312789927104, + "grad_norm": 0.001265904982574284, + "learning_rate": 8.428830317298614e-07, + "loss": 0.0, + "num_input_tokens_seen": 32380928, + "step": 16541 + }, + { + "epoch": 2.192445328031809, + "grad_norm": 17.802461624145508, + "learning_rate": 8.426231268894266e-07, + "loss": 0.3686, + "num_input_tokens_seen": 32383712, + "step": 16542 + }, + { + "epoch": 2.192577866136514, + "grad_norm": 6.707571506500244, + "learning_rate": 8.423632540043603e-07, + "loss": 0.0644, + "num_input_tokens_seen": 32385624, + "step": 16543 + }, + { + "epoch": 2.1927104042412195, + "grad_norm": 1.2263346910476685, + "learning_rate": 8.421034130796748e-07, + "loss": 0.0016, + "num_input_tokens_seen": 32387808, + "step": 16544 + }, + { + "epoch": 2.1928429423459246, + "grad_norm": 0.021623998880386353, + "learning_rate": 8.418436041203787e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32390280, + "step": 16545 + }, + { + "epoch": 2.1929754804506296, + "grad_norm": 0.09988313168287277, + "learning_rate": 8.415838271314825e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32391824, + "step": 16546 + }, + { + "epoch": 2.1931080185553347, + "grad_norm": 0.05018071457743645, + "learning_rate": 8.413240821179941e-07, + "loss": 0.0004, + "num_input_tokens_seen": 32393304, + "step": 16547 + }, + { + "epoch": 2.1932405566600397, + "grad_norm": 6.960991382598877, + "learning_rate": 8.410643690849215e-07, + "loss": 0.0847, + "num_input_tokens_seen": 32395504, + "step": 16548 + }, + { + "epoch": 2.1933730947647447, + "grad_norm": 0.014155656099319458, + "learning_rate": 8.408046880372717e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32397216, + "step": 16549 + }, + { + "epoch": 2.1935056328694498, + "grad_norm": 3.7262864112854004, + "learning_rate": 8.40545038980053e-07, + "loss": 0.0348, + "num_input_tokens_seen": 32399032, + "step": 16550 + }, + { + "epoch": 2.1936381709741553, + "grad_norm": 5.613471031188965, + "learning_rate": 8.402854219182699e-07, + "loss": 0.117, + "num_input_tokens_seen": 32400608, + "step": 16551 + }, + { + "epoch": 2.1937707090788603, + "grad_norm": 0.0005248048692010343, + "learning_rate": 8.400258368569297e-07, + "loss": 0.0, + "num_input_tokens_seen": 32401912, + "step": 16552 + }, + { + "epoch": 2.1939032471835653, + "grad_norm": 11.366371154785156, + "learning_rate": 8.397662838010365e-07, + "loss": 0.1462, + "num_input_tokens_seen": 32404024, + "step": 16553 + }, + { + "epoch": 2.1940357852882704, + "grad_norm": 14.923547744750977, + "learning_rate": 8.395067627555937e-07, + "loss": 0.136, + "num_input_tokens_seen": 32405784, + "step": 16554 + }, + { + "epoch": 2.1941683233929754, + "grad_norm": 8.638009071350098, + "learning_rate": 8.392472737256069e-07, + "loss": 0.1959, + "num_input_tokens_seen": 32407592, + "step": 16555 + }, + { + "epoch": 2.1943008614976804, + "grad_norm": 3.1502857208251953, + "learning_rate": 8.389878167160775e-07, + "loss": 0.0251, + "num_input_tokens_seen": 32410416, + "step": 16556 + }, + { + "epoch": 2.1944333996023855, + "grad_norm": 6.3921966552734375, + "learning_rate": 8.387283917320099e-07, + "loss": 0.0642, + "num_input_tokens_seen": 32412280, + "step": 16557 + }, + { + "epoch": 2.194565937707091, + "grad_norm": 4.556399345397949, + "learning_rate": 8.384689987784047e-07, + "loss": 0.0306, + "num_input_tokens_seen": 32414312, + "step": 16558 + }, + { + "epoch": 2.194698475811796, + "grad_norm": 0.0019420023309066892, + "learning_rate": 8.382096378602639e-07, + "loss": 0.0, + "num_input_tokens_seen": 32415872, + "step": 16559 + }, + { + "epoch": 2.194831013916501, + "grad_norm": 0.4089391529560089, + "learning_rate": 8.379503089825863e-07, + "loss": 0.0013, + "num_input_tokens_seen": 32417544, + "step": 16560 + }, + { + "epoch": 2.194963552021206, + "grad_norm": 0.0075423214584589005, + "learning_rate": 8.376910121503737e-07, + "loss": 0.0, + "num_input_tokens_seen": 32419272, + "step": 16561 + }, + { + "epoch": 2.195096090125911, + "grad_norm": 15.569775581359863, + "learning_rate": 8.374317473686259e-07, + "loss": 0.4376, + "num_input_tokens_seen": 32421000, + "step": 16562 + }, + { + "epoch": 2.195228628230616, + "grad_norm": 2.2102811336517334, + "learning_rate": 8.371725146423409e-07, + "loss": 0.0118, + "num_input_tokens_seen": 32422856, + "step": 16563 + }, + { + "epoch": 2.1953611663353216, + "grad_norm": 0.0014408637071028352, + "learning_rate": 8.369133139765171e-07, + "loss": 0.0, + "num_input_tokens_seen": 32424520, + "step": 16564 + }, + { + "epoch": 2.1954937044400267, + "grad_norm": 0.6970670819282532, + "learning_rate": 8.366541453761515e-07, + "loss": 0.0043, + "num_input_tokens_seen": 32426328, + "step": 16565 + }, + { + "epoch": 2.1956262425447317, + "grad_norm": 0.0062262252904474735, + "learning_rate": 8.363950088462411e-07, + "loss": 0.0, + "num_input_tokens_seen": 32428272, + "step": 16566 + }, + { + "epoch": 2.1957587806494367, + "grad_norm": 0.2609323263168335, + "learning_rate": 8.361359043917824e-07, + "loss": 0.0008, + "num_input_tokens_seen": 32430400, + "step": 16567 + }, + { + "epoch": 2.195891318754142, + "grad_norm": 3.6146416664123535, + "learning_rate": 8.358768320177721e-07, + "loss": 0.0305, + "num_input_tokens_seen": 32432120, + "step": 16568 + }, + { + "epoch": 2.196023856858847, + "grad_norm": 0.02466299571096897, + "learning_rate": 8.356177917292044e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32434464, + "step": 16569 + }, + { + "epoch": 2.196156394963552, + "grad_norm": 4.014787197113037, + "learning_rate": 8.353587835310742e-07, + "loss": 0.1579, + "num_input_tokens_seen": 32436200, + "step": 16570 + }, + { + "epoch": 2.1962889330682573, + "grad_norm": 3.356247901916504, + "learning_rate": 8.350998074283739e-07, + "loss": 0.0287, + "num_input_tokens_seen": 32438448, + "step": 16571 + }, + { + "epoch": 2.1964214711729624, + "grad_norm": 1.1415212154388428, + "learning_rate": 8.348408634260988e-07, + "loss": 0.056, + "num_input_tokens_seen": 32439848, + "step": 16572 + }, + { + "epoch": 2.1965540092776674, + "grad_norm": 0.006085709203034639, + "learning_rate": 8.345819515292397e-07, + "loss": 0.0, + "num_input_tokens_seen": 32441440, + "step": 16573 + }, + { + "epoch": 2.1966865473823725, + "grad_norm": 0.005499835126101971, + "learning_rate": 8.343230717427906e-07, + "loss": 0.0, + "num_input_tokens_seen": 32443456, + "step": 16574 + }, + { + "epoch": 2.1968190854870775, + "grad_norm": 0.042069803923368454, + "learning_rate": 8.340642240717414e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32444768, + "step": 16575 + }, + { + "epoch": 2.1969516235917825, + "grad_norm": 0.10017409920692444, + "learning_rate": 8.338054085210826e-07, + "loss": 0.0006, + "num_input_tokens_seen": 32446032, + "step": 16576 + }, + { + "epoch": 2.1970841616964876, + "grad_norm": 5.813887119293213, + "learning_rate": 8.335466250958058e-07, + "loss": 0.0932, + "num_input_tokens_seen": 32448032, + "step": 16577 + }, + { + "epoch": 2.197216699801193, + "grad_norm": 0.003848862834274769, + "learning_rate": 8.332878738008987e-07, + "loss": 0.0, + "num_input_tokens_seen": 32449888, + "step": 16578 + }, + { + "epoch": 2.197349237905898, + "grad_norm": 7.274786472320557, + "learning_rate": 8.330291546413522e-07, + "loss": 0.1682, + "num_input_tokens_seen": 32451952, + "step": 16579 + }, + { + "epoch": 2.197481776010603, + "grad_norm": 0.03515326976776123, + "learning_rate": 8.327704676221534e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32453320, + "step": 16580 + }, + { + "epoch": 2.197614314115308, + "grad_norm": 1.4648410081863403, + "learning_rate": 8.325118127482903e-07, + "loss": 0.0101, + "num_input_tokens_seen": 32455504, + "step": 16581 + }, + { + "epoch": 2.197746852220013, + "grad_norm": 0.4241339862346649, + "learning_rate": 8.322531900247491e-07, + "loss": 0.0018, + "num_input_tokens_seen": 32457264, + "step": 16582 + }, + { + "epoch": 2.1978793903247182, + "grad_norm": 0.006087826564908028, + "learning_rate": 8.319945994565173e-07, + "loss": 0.0, + "num_input_tokens_seen": 32458400, + "step": 16583 + }, + { + "epoch": 2.1980119284294233, + "grad_norm": 4.9124579429626465, + "learning_rate": 8.3173604104858e-07, + "loss": 0.0198, + "num_input_tokens_seen": 32460224, + "step": 16584 + }, + { + "epoch": 2.1981444665341288, + "grad_norm": 0.5875461101531982, + "learning_rate": 8.314775148059231e-07, + "loss": 0.003, + "num_input_tokens_seen": 32461784, + "step": 16585 + }, + { + "epoch": 2.198277004638834, + "grad_norm": 0.9466050267219543, + "learning_rate": 8.312190207335311e-07, + "loss": 0.0109, + "num_input_tokens_seen": 32463432, + "step": 16586 + }, + { + "epoch": 2.198409542743539, + "grad_norm": 0.1075681522488594, + "learning_rate": 8.309605588363867e-07, + "loss": 0.0006, + "num_input_tokens_seen": 32465416, + "step": 16587 + }, + { + "epoch": 2.198542080848244, + "grad_norm": 0.742644190788269, + "learning_rate": 8.30702129119475e-07, + "loss": 0.002, + "num_input_tokens_seen": 32467280, + "step": 16588 + }, + { + "epoch": 2.198674618952949, + "grad_norm": 0.026090240105986595, + "learning_rate": 8.304437315877775e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32469152, + "step": 16589 + }, + { + "epoch": 2.198807157057654, + "grad_norm": 2.9998509883880615, + "learning_rate": 8.301853662462761e-07, + "loss": 0.0238, + "num_input_tokens_seen": 32470744, + "step": 16590 + }, + { + "epoch": 2.198939695162359, + "grad_norm": 2.4081480503082275, + "learning_rate": 8.299270330999537e-07, + "loss": 0.0113, + "num_input_tokens_seen": 32472704, + "step": 16591 + }, + { + "epoch": 2.1990722332670645, + "grad_norm": 9.72903060913086, + "learning_rate": 8.296687321537892e-07, + "loss": 0.1357, + "num_input_tokens_seen": 32474560, + "step": 16592 + }, + { + "epoch": 2.1992047713717695, + "grad_norm": 19.19078826904297, + "learning_rate": 8.294104634127648e-07, + "loss": 0.2384, + "num_input_tokens_seen": 32476024, + "step": 16593 + }, + { + "epoch": 2.1993373094764745, + "grad_norm": 3.7494773864746094, + "learning_rate": 8.291522268818592e-07, + "loss": 0.0314, + "num_input_tokens_seen": 32478448, + "step": 16594 + }, + { + "epoch": 2.1994698475811796, + "grad_norm": 0.023266751319169998, + "learning_rate": 8.288940225660511e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32479872, + "step": 16595 + }, + { + "epoch": 2.1996023856858846, + "grad_norm": 7.55430269241333, + "learning_rate": 8.286358504703185e-07, + "loss": 0.054, + "num_input_tokens_seen": 32481576, + "step": 16596 + }, + { + "epoch": 2.1997349237905897, + "grad_norm": 4.618304252624512, + "learning_rate": 8.283777105996393e-07, + "loss": 0.0751, + "num_input_tokens_seen": 32484224, + "step": 16597 + }, + { + "epoch": 2.1998674618952947, + "grad_norm": 4.229299545288086, + "learning_rate": 8.281196029589922e-07, + "loss": 0.0499, + "num_input_tokens_seen": 32486936, + "step": 16598 + }, + { + "epoch": 2.2, + "grad_norm": 7.89039421081543, + "learning_rate": 8.278615275533525e-07, + "loss": 0.0353, + "num_input_tokens_seen": 32488552, + "step": 16599 + }, + { + "epoch": 2.200132538104705, + "grad_norm": 0.11992044746875763, + "learning_rate": 8.276034843876963e-07, + "loss": 0.0005, + "num_input_tokens_seen": 32490840, + "step": 16600 + }, + { + "epoch": 2.2002650762094103, + "grad_norm": 0.6093519926071167, + "learning_rate": 8.273454734669984e-07, + "loss": 0.0024, + "num_input_tokens_seen": 32492768, + "step": 16601 + }, + { + "epoch": 2.2003976143141153, + "grad_norm": 9.697235107421875, + "learning_rate": 8.270874947962326e-07, + "loss": 0.0194, + "num_input_tokens_seen": 32494560, + "step": 16602 + }, + { + "epoch": 2.2005301524188203, + "grad_norm": 6.168498992919922, + "learning_rate": 8.268295483803745e-07, + "loss": 0.0408, + "num_input_tokens_seen": 32496280, + "step": 16603 + }, + { + "epoch": 2.2006626905235254, + "grad_norm": 0.09125087410211563, + "learning_rate": 8.265716342243973e-07, + "loss": 0.0004, + "num_input_tokens_seen": 32497696, + "step": 16604 + }, + { + "epoch": 2.2007952286282304, + "grad_norm": 0.1364143341779709, + "learning_rate": 8.263137523332737e-07, + "loss": 0.0006, + "num_input_tokens_seen": 32499456, + "step": 16605 + }, + { + "epoch": 2.200927766732936, + "grad_norm": 0.16730304062366486, + "learning_rate": 8.260559027119758e-07, + "loss": 0.0009, + "num_input_tokens_seen": 32501152, + "step": 16606 + }, + { + "epoch": 2.201060304837641, + "grad_norm": 0.7307750582695007, + "learning_rate": 8.257980853654737e-07, + "loss": 0.005, + "num_input_tokens_seen": 32503192, + "step": 16607 + }, + { + "epoch": 2.201192842942346, + "grad_norm": 4.9991278648376465, + "learning_rate": 8.255403002987397e-07, + "loss": 0.0624, + "num_input_tokens_seen": 32504960, + "step": 16608 + }, + { + "epoch": 2.201325381047051, + "grad_norm": 1.3817178010940552, + "learning_rate": 8.252825475167445e-07, + "loss": 0.0081, + "num_input_tokens_seen": 32506576, + "step": 16609 + }, + { + "epoch": 2.201457919151756, + "grad_norm": 6.537020683288574, + "learning_rate": 8.250248270244571e-07, + "loss": 0.0729, + "num_input_tokens_seen": 32508504, + "step": 16610 + }, + { + "epoch": 2.201590457256461, + "grad_norm": 0.4114634394645691, + "learning_rate": 8.247671388268466e-07, + "loss": 0.0025, + "num_input_tokens_seen": 32510600, + "step": 16611 + }, + { + "epoch": 2.201722995361166, + "grad_norm": 0.014505065977573395, + "learning_rate": 8.245094829288816e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32511728, + "step": 16612 + }, + { + "epoch": 2.2018555334658716, + "grad_norm": 4.720791339874268, + "learning_rate": 8.242518593355287e-07, + "loss": 0.0358, + "num_input_tokens_seen": 32513384, + "step": 16613 + }, + { + "epoch": 2.2019880715705766, + "grad_norm": 0.02104461006820202, + "learning_rate": 8.239942680517557e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32514920, + "step": 16614 + }, + { + "epoch": 2.2021206096752817, + "grad_norm": 5.362493991851807, + "learning_rate": 8.237367090825305e-07, + "loss": 0.0328, + "num_input_tokens_seen": 32516760, + "step": 16615 + }, + { + "epoch": 2.2022531477799867, + "grad_norm": 4.5430588722229, + "learning_rate": 8.234791824328179e-07, + "loss": 0.0263, + "num_input_tokens_seen": 32518536, + "step": 16616 + }, + { + "epoch": 2.2023856858846917, + "grad_norm": 0.19859392940998077, + "learning_rate": 8.232216881075833e-07, + "loss": 0.0007, + "num_input_tokens_seen": 32520016, + "step": 16617 + }, + { + "epoch": 2.202518223989397, + "grad_norm": 9.521278381347656, + "learning_rate": 8.229642261117904e-07, + "loss": 0.082, + "num_input_tokens_seen": 32522224, + "step": 16618 + }, + { + "epoch": 2.202650762094102, + "grad_norm": 0.004801636096090078, + "learning_rate": 8.227067964504049e-07, + "loss": 0.0, + "num_input_tokens_seen": 32524912, + "step": 16619 + }, + { + "epoch": 2.2027833001988073, + "grad_norm": 0.005690832156687975, + "learning_rate": 8.224493991283886e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32528456, + "step": 16620 + }, + { + "epoch": 2.2029158383035123, + "grad_norm": 7.751559734344482, + "learning_rate": 8.221920341507062e-07, + "loss": 0.0428, + "num_input_tokens_seen": 32530576, + "step": 16621 + }, + { + "epoch": 2.2030483764082174, + "grad_norm": 6.102159023284912, + "learning_rate": 8.219347015223186e-07, + "loss": 0.1543, + "num_input_tokens_seen": 32532640, + "step": 16622 + }, + { + "epoch": 2.2031809145129224, + "grad_norm": 8.567307472229004, + "learning_rate": 8.216774012481871e-07, + "loss": 0.1467, + "num_input_tokens_seen": 32534216, + "step": 16623 + }, + { + "epoch": 2.2033134526176275, + "grad_norm": 0.2641359865665436, + "learning_rate": 8.214201333332735e-07, + "loss": 0.001, + "num_input_tokens_seen": 32536472, + "step": 16624 + }, + { + "epoch": 2.2034459907223325, + "grad_norm": 3.0709033012390137, + "learning_rate": 8.211628977825381e-07, + "loss": 0.0175, + "num_input_tokens_seen": 32538728, + "step": 16625 + }, + { + "epoch": 2.203578528827038, + "grad_norm": 1.872612476348877, + "learning_rate": 8.209056946009394e-07, + "loss": 0.01, + "num_input_tokens_seen": 32541896, + "step": 16626 + }, + { + "epoch": 2.203711066931743, + "grad_norm": 7.065799236297607, + "learning_rate": 8.206485237934378e-07, + "loss": 0.1257, + "num_input_tokens_seen": 32543664, + "step": 16627 + }, + { + "epoch": 2.203843605036448, + "grad_norm": 39.87990188598633, + "learning_rate": 8.203913853649906e-07, + "loss": 0.2339, + "num_input_tokens_seen": 32545744, + "step": 16628 + }, + { + "epoch": 2.203976143141153, + "grad_norm": 0.061339370906353, + "learning_rate": 8.201342793205569e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32547568, + "step": 16629 + }, + { + "epoch": 2.204108681245858, + "grad_norm": 2.6804893016815186, + "learning_rate": 8.198772056650933e-07, + "loss": 0.0112, + "num_input_tokens_seen": 32549776, + "step": 16630 + }, + { + "epoch": 2.204241219350563, + "grad_norm": 1.8356661796569824, + "learning_rate": 8.196201644035551e-07, + "loss": 0.0081, + "num_input_tokens_seen": 32552144, + "step": 16631 + }, + { + "epoch": 2.204373757455268, + "grad_norm": 5.348264217376709, + "learning_rate": 8.193631555409004e-07, + "loss": 0.017, + "num_input_tokens_seen": 32554296, + "step": 16632 + }, + { + "epoch": 2.2045062955599737, + "grad_norm": 0.0521753765642643, + "learning_rate": 8.191061790820829e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32555704, + "step": 16633 + }, + { + "epoch": 2.2046388336646787, + "grad_norm": 5.644750118255615, + "learning_rate": 8.188492350320582e-07, + "loss": 0.0312, + "num_input_tokens_seen": 32558272, + "step": 16634 + }, + { + "epoch": 2.2047713717693838, + "grad_norm": 0.15384791791439056, + "learning_rate": 8.185923233957802e-07, + "loss": 0.0008, + "num_input_tokens_seen": 32561136, + "step": 16635 + }, + { + "epoch": 2.204903909874089, + "grad_norm": 1.3080133199691772, + "learning_rate": 8.183354441782024e-07, + "loss": 0.0111, + "num_input_tokens_seen": 32564160, + "step": 16636 + }, + { + "epoch": 2.205036447978794, + "grad_norm": 13.994810104370117, + "learning_rate": 8.180785973842761e-07, + "loss": 0.2095, + "num_input_tokens_seen": 32566080, + "step": 16637 + }, + { + "epoch": 2.205168986083499, + "grad_norm": 0.05276124179363251, + "learning_rate": 8.178217830189558e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32568168, + "step": 16638 + }, + { + "epoch": 2.205301524188204, + "grad_norm": 2.325650453567505, + "learning_rate": 8.175650010871914e-07, + "loss": 0.0243, + "num_input_tokens_seen": 32570304, + "step": 16639 + }, + { + "epoch": 2.2054340622929094, + "grad_norm": 7.423624515533447, + "learning_rate": 8.173082515939348e-07, + "loss": 0.0249, + "num_input_tokens_seen": 32571744, + "step": 16640 + }, + { + "epoch": 2.2055666003976144, + "grad_norm": 1.45160973072052, + "learning_rate": 8.170515345441362e-07, + "loss": 0.011, + "num_input_tokens_seen": 32574384, + "step": 16641 + }, + { + "epoch": 2.2056991385023195, + "grad_norm": 4.078460216522217, + "learning_rate": 8.16794849942745e-07, + "loss": 0.0393, + "num_input_tokens_seen": 32576632, + "step": 16642 + }, + { + "epoch": 2.2058316766070245, + "grad_norm": 8.769227981567383, + "learning_rate": 8.165381977947093e-07, + "loss": 0.0647, + "num_input_tokens_seen": 32579056, + "step": 16643 + }, + { + "epoch": 2.2059642147117295, + "grad_norm": 10.753658294677734, + "learning_rate": 8.162815781049787e-07, + "loss": 0.0603, + "num_input_tokens_seen": 32581168, + "step": 16644 + }, + { + "epoch": 2.2060967528164346, + "grad_norm": 1.511300802230835, + "learning_rate": 8.160249908785017e-07, + "loss": 0.0058, + "num_input_tokens_seen": 32582640, + "step": 16645 + }, + { + "epoch": 2.20622929092114, + "grad_norm": 2.334587335586548, + "learning_rate": 8.157684361202242e-07, + "loss": 0.0021, + "num_input_tokens_seen": 32584368, + "step": 16646 + }, + { + "epoch": 2.206361829025845, + "grad_norm": 0.17476649582386017, + "learning_rate": 8.155119138350937e-07, + "loss": 0.0008, + "num_input_tokens_seen": 32585976, + "step": 16647 + }, + { + "epoch": 2.20649436713055, + "grad_norm": 0.07024337351322174, + "learning_rate": 8.152554240280552e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32587624, + "step": 16648 + }, + { + "epoch": 2.206626905235255, + "grad_norm": 11.482390403747559, + "learning_rate": 8.149989667040537e-07, + "loss": 0.1987, + "num_input_tokens_seen": 32590440, + "step": 16649 + }, + { + "epoch": 2.20675944333996, + "grad_norm": 6.394782066345215, + "learning_rate": 8.147425418680343e-07, + "loss": 0.1144, + "num_input_tokens_seen": 32592160, + "step": 16650 + }, + { + "epoch": 2.2068919814446653, + "grad_norm": 0.014118979685008526, + "learning_rate": 8.144861495249424e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32593752, + "step": 16651 + }, + { + "epoch": 2.2070245195493703, + "grad_norm": 0.0030741780065000057, + "learning_rate": 8.142297896797203e-07, + "loss": 0.0, + "num_input_tokens_seen": 32594712, + "step": 16652 + }, + { + "epoch": 2.2071570576540758, + "grad_norm": 1.702795386314392, + "learning_rate": 8.139734623373105e-07, + "loss": 0.0133, + "num_input_tokens_seen": 32596552, + "step": 16653 + }, + { + "epoch": 2.207289595758781, + "grad_norm": 0.07550080865621567, + "learning_rate": 8.137171675026548e-07, + "loss": 0.0004, + "num_input_tokens_seen": 32599304, + "step": 16654 + }, + { + "epoch": 2.207422133863486, + "grad_norm": 0.027994675561785698, + "learning_rate": 8.134609051806964e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32601208, + "step": 16655 + }, + { + "epoch": 2.207554671968191, + "grad_norm": 6.728792667388916, + "learning_rate": 8.132046753763742e-07, + "loss": 0.0448, + "num_input_tokens_seen": 32603712, + "step": 16656 + }, + { + "epoch": 2.207687210072896, + "grad_norm": 0.005654821638017893, + "learning_rate": 8.129484780946304e-07, + "loss": 0.0, + "num_input_tokens_seen": 32604976, + "step": 16657 + }, + { + "epoch": 2.207819748177601, + "grad_norm": 7.937885761260986, + "learning_rate": 8.126923133404038e-07, + "loss": 0.0474, + "num_input_tokens_seen": 32606568, + "step": 16658 + }, + { + "epoch": 2.207952286282306, + "grad_norm": 0.036151982843875885, + "learning_rate": 8.124361811186327e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32609280, + "step": 16659 + }, + { + "epoch": 2.2080848243870115, + "grad_norm": 0.022098803892731667, + "learning_rate": 8.12180081434257e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32610808, + "step": 16660 + }, + { + "epoch": 2.2082173624917165, + "grad_norm": 0.08687372505664825, + "learning_rate": 8.119240142922127e-07, + "loss": 0.0006, + "num_input_tokens_seen": 32613120, + "step": 16661 + }, + { + "epoch": 2.2083499005964216, + "grad_norm": 0.07096479833126068, + "learning_rate": 8.116679796974389e-07, + "loss": 0.0004, + "num_input_tokens_seen": 32615904, + "step": 16662 + }, + { + "epoch": 2.2084824387011266, + "grad_norm": 1.863954782485962, + "learning_rate": 8.114119776548713e-07, + "loss": 0.0535, + "num_input_tokens_seen": 32617392, + "step": 16663 + }, + { + "epoch": 2.2086149768058316, + "grad_norm": 1.3302783966064453, + "learning_rate": 8.111560081694456e-07, + "loss": 0.0041, + "num_input_tokens_seen": 32618928, + "step": 16664 + }, + { + "epoch": 2.2087475149105367, + "grad_norm": 0.08653838187456131, + "learning_rate": 8.109000712460963e-07, + "loss": 0.0005, + "num_input_tokens_seen": 32620528, + "step": 16665 + }, + { + "epoch": 2.2088800530152417, + "grad_norm": 2.831339120864868, + "learning_rate": 8.106441668897597e-07, + "loss": 0.0101, + "num_input_tokens_seen": 32622336, + "step": 16666 + }, + { + "epoch": 2.209012591119947, + "grad_norm": 0.018863758072257042, + "learning_rate": 8.103882951053679e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32624384, + "step": 16667 + }, + { + "epoch": 2.2091451292246522, + "grad_norm": 1.2936036586761475, + "learning_rate": 8.101324558978566e-07, + "loss": 0.01, + "num_input_tokens_seen": 32626240, + "step": 16668 + }, + { + "epoch": 2.2092776673293573, + "grad_norm": 3.5421905517578125, + "learning_rate": 8.098766492721572e-07, + "loss": 0.0242, + "num_input_tokens_seen": 32628296, + "step": 16669 + }, + { + "epoch": 2.2094102054340623, + "grad_norm": 2.020473003387451, + "learning_rate": 8.096208752332007e-07, + "loss": 0.0088, + "num_input_tokens_seen": 32629944, + "step": 16670 + }, + { + "epoch": 2.2095427435387673, + "grad_norm": 1.850967526435852, + "learning_rate": 8.093651337859213e-07, + "loss": 0.0043, + "num_input_tokens_seen": 32632552, + "step": 16671 + }, + { + "epoch": 2.2096752816434724, + "grad_norm": 0.899761974811554, + "learning_rate": 8.091094249352483e-07, + "loss": 0.003, + "num_input_tokens_seen": 32633896, + "step": 16672 + }, + { + "epoch": 2.2098078197481774, + "grad_norm": 0.008522593416273594, + "learning_rate": 8.088537486861109e-07, + "loss": 0.0, + "num_input_tokens_seen": 32635216, + "step": 16673 + }, + { + "epoch": 2.209940357852883, + "grad_norm": 2.7074930667877197, + "learning_rate": 8.08598105043441e-07, + "loss": 0.015, + "num_input_tokens_seen": 32636744, + "step": 16674 + }, + { + "epoch": 2.210072895957588, + "grad_norm": 0.05558546632528305, + "learning_rate": 8.083424940121657e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32638208, + "step": 16675 + }, + { + "epoch": 2.210205434062293, + "grad_norm": 5.4678473472595215, + "learning_rate": 8.080869155972149e-07, + "loss": 0.0389, + "num_input_tokens_seen": 32640824, + "step": 16676 + }, + { + "epoch": 2.210337972166998, + "grad_norm": 1.7655972242355347, + "learning_rate": 8.078313698035154e-07, + "loss": 0.008, + "num_input_tokens_seen": 32642824, + "step": 16677 + }, + { + "epoch": 2.210470510271703, + "grad_norm": 12.888080596923828, + "learning_rate": 8.075758566359946e-07, + "loss": 0.2428, + "num_input_tokens_seen": 32644768, + "step": 16678 + }, + { + "epoch": 2.210603048376408, + "grad_norm": 0.07891834527254105, + "learning_rate": 8.073203760995779e-07, + "loss": 0.0008, + "num_input_tokens_seen": 32647160, + "step": 16679 + }, + { + "epoch": 2.210735586481113, + "grad_norm": 0.09008906036615372, + "learning_rate": 8.070649281991921e-07, + "loss": 0.0005, + "num_input_tokens_seen": 32649656, + "step": 16680 + }, + { + "epoch": 2.2108681245858186, + "grad_norm": 0.0008526545134373009, + "learning_rate": 8.068095129397635e-07, + "loss": 0.0, + "num_input_tokens_seen": 32650712, + "step": 16681 + }, + { + "epoch": 2.2110006626905236, + "grad_norm": 3.7387852668762207, + "learning_rate": 8.06554130326215e-07, + "loss": 0.0449, + "num_input_tokens_seen": 32652728, + "step": 16682 + }, + { + "epoch": 2.2111332007952287, + "grad_norm": 1.3489062786102295, + "learning_rate": 8.062987803634717e-07, + "loss": 0.0107, + "num_input_tokens_seen": 32655664, + "step": 16683 + }, + { + "epoch": 2.2112657388999337, + "grad_norm": 0.04860488697886467, + "learning_rate": 8.060434630564551e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32657376, + "step": 16684 + }, + { + "epoch": 2.2113982770046388, + "grad_norm": 0.004314373712986708, + "learning_rate": 8.057881784100893e-07, + "loss": 0.0, + "num_input_tokens_seen": 32658584, + "step": 16685 + }, + { + "epoch": 2.211530815109344, + "grad_norm": 0.15226362645626068, + "learning_rate": 8.055329264292971e-07, + "loss": 0.0009, + "num_input_tokens_seen": 32661000, + "step": 16686 + }, + { + "epoch": 2.211663353214049, + "grad_norm": 0.039376676082611084, + "learning_rate": 8.052777071189988e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32662072, + "step": 16687 + }, + { + "epoch": 2.2117958913187543, + "grad_norm": 8.777050971984863, + "learning_rate": 8.050225204841155e-07, + "loss": 0.0442, + "num_input_tokens_seen": 32664272, + "step": 16688 + }, + { + "epoch": 2.2119284294234594, + "grad_norm": 0.025790821760892868, + "learning_rate": 8.047673665295674e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32665920, + "step": 16689 + }, + { + "epoch": 2.2120609675281644, + "grad_norm": 3.2006733417510986, + "learning_rate": 8.045122452602733e-07, + "loss": 0.0223, + "num_input_tokens_seen": 32668416, + "step": 16690 + }, + { + "epoch": 2.2121935056328694, + "grad_norm": 0.013851161114871502, + "learning_rate": 8.042571566811524e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32670808, + "step": 16691 + }, + { + "epoch": 2.2123260437375745, + "grad_norm": 8.55340576171875, + "learning_rate": 8.040021007971243e-07, + "loss": 0.1795, + "num_input_tokens_seen": 32673360, + "step": 16692 + }, + { + "epoch": 2.2124585818422795, + "grad_norm": 5.639500617980957, + "learning_rate": 8.03747077613106e-07, + "loss": 0.0623, + "num_input_tokens_seen": 32675440, + "step": 16693 + }, + { + "epoch": 2.2125911199469845, + "grad_norm": 2.0292844772338867, + "learning_rate": 8.034920871340137e-07, + "loss": 0.0284, + "num_input_tokens_seen": 32677120, + "step": 16694 + }, + { + "epoch": 2.21272365805169, + "grad_norm": 3.819126844406128, + "learning_rate": 8.032371293647648e-07, + "loss": 0.0502, + "num_input_tokens_seen": 32679992, + "step": 16695 + }, + { + "epoch": 2.212856196156395, + "grad_norm": 10.78261661529541, + "learning_rate": 8.029822043102736e-07, + "loss": 0.2112, + "num_input_tokens_seen": 32682128, + "step": 16696 + }, + { + "epoch": 2.2129887342611, + "grad_norm": 8.088414192199707, + "learning_rate": 8.027273119754561e-07, + "loss": 0.1415, + "num_input_tokens_seen": 32684480, + "step": 16697 + }, + { + "epoch": 2.213121272365805, + "grad_norm": 0.008392506279051304, + "learning_rate": 8.024724523652278e-07, + "loss": 0.0, + "num_input_tokens_seen": 32686376, + "step": 16698 + }, + { + "epoch": 2.21325381047051, + "grad_norm": 1.8394683599472046, + "learning_rate": 8.022176254845018e-07, + "loss": 0.0032, + "num_input_tokens_seen": 32688608, + "step": 16699 + }, + { + "epoch": 2.213386348575215, + "grad_norm": 0.01865784451365471, + "learning_rate": 8.019628313381911e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32690232, + "step": 16700 + }, + { + "epoch": 2.2135188866799203, + "grad_norm": 11.831122398376465, + "learning_rate": 8.017080699312074e-07, + "loss": 0.1272, + "num_input_tokens_seen": 32692264, + "step": 16701 + }, + { + "epoch": 2.2136514247846257, + "grad_norm": 33.87800979614258, + "learning_rate": 8.01453341268465e-07, + "loss": 0.1935, + "num_input_tokens_seen": 32694768, + "step": 16702 + }, + { + "epoch": 2.2137839628893308, + "grad_norm": 5.259215831756592, + "learning_rate": 8.011986453548728e-07, + "loss": 0.0492, + "num_input_tokens_seen": 32697744, + "step": 16703 + }, + { + "epoch": 2.213916500994036, + "grad_norm": 4.190415382385254, + "learning_rate": 8.009439821953435e-07, + "loss": 0.03, + "num_input_tokens_seen": 32699416, + "step": 16704 + }, + { + "epoch": 2.214049039098741, + "grad_norm": 0.271725594997406, + "learning_rate": 8.006893517947864e-07, + "loss": 0.0009, + "num_input_tokens_seen": 32700832, + "step": 16705 + }, + { + "epoch": 2.214181577203446, + "grad_norm": 6.647470474243164, + "learning_rate": 8.004347541581101e-07, + "loss": 0.1416, + "num_input_tokens_seen": 32702416, + "step": 16706 + }, + { + "epoch": 2.214314115308151, + "grad_norm": 0.08262130618095398, + "learning_rate": 8.00180189290225e-07, + "loss": 0.0005, + "num_input_tokens_seen": 32703392, + "step": 16707 + }, + { + "epoch": 2.2144466534128564, + "grad_norm": 2.116563081741333, + "learning_rate": 7.999256571960384e-07, + "loss": 0.0254, + "num_input_tokens_seen": 32706056, + "step": 16708 + }, + { + "epoch": 2.2145791915175614, + "grad_norm": 4.042857646942139, + "learning_rate": 7.996711578804572e-07, + "loss": 0.0558, + "num_input_tokens_seen": 32707720, + "step": 16709 + }, + { + "epoch": 2.2147117296222665, + "grad_norm": 9.864252090454102, + "learning_rate": 7.9941669134839e-07, + "loss": 0.1283, + "num_input_tokens_seen": 32709656, + "step": 16710 + }, + { + "epoch": 2.2148442677269715, + "grad_norm": 0.07578421384096146, + "learning_rate": 7.991622576047414e-07, + "loss": 0.0005, + "num_input_tokens_seen": 32712440, + "step": 16711 + }, + { + "epoch": 2.2149768058316766, + "grad_norm": 14.956700325012207, + "learning_rate": 7.989078566544184e-07, + "loss": 0.1698, + "num_input_tokens_seen": 32714304, + "step": 16712 + }, + { + "epoch": 2.2151093439363816, + "grad_norm": 0.04846203327178955, + "learning_rate": 7.986534885023256e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32715920, + "step": 16713 + }, + { + "epoch": 2.2152418820410866, + "grad_norm": 0.521878719329834, + "learning_rate": 7.983991531533664e-07, + "loss": 0.0006, + "num_input_tokens_seen": 32717800, + "step": 16714 + }, + { + "epoch": 2.215374420145792, + "grad_norm": 8.667522430419922, + "learning_rate": 7.981448506124462e-07, + "loss": 0.088, + "num_input_tokens_seen": 32719520, + "step": 16715 + }, + { + "epoch": 2.215506958250497, + "grad_norm": 2.6611430644989014, + "learning_rate": 7.978905808844664e-07, + "loss": 0.0219, + "num_input_tokens_seen": 32721576, + "step": 16716 + }, + { + "epoch": 2.215639496355202, + "grad_norm": 0.46542611718177795, + "learning_rate": 7.976363439743312e-07, + "loss": 0.0011, + "num_input_tokens_seen": 32723552, + "step": 16717 + }, + { + "epoch": 2.2157720344599072, + "grad_norm": 0.14953237771987915, + "learning_rate": 7.97382139886942e-07, + "loss": 0.0007, + "num_input_tokens_seen": 32724960, + "step": 16718 + }, + { + "epoch": 2.2159045725646123, + "grad_norm": 1.4781609773635864, + "learning_rate": 7.971279686271996e-07, + "loss": 0.0059, + "num_input_tokens_seen": 32727200, + "step": 16719 + }, + { + "epoch": 2.2160371106693173, + "grad_norm": 0.09826435148715973, + "learning_rate": 7.968738302000037e-07, + "loss": 0.0006, + "num_input_tokens_seen": 32730008, + "step": 16720 + }, + { + "epoch": 2.2161696487740223, + "grad_norm": 0.0050934781320393085, + "learning_rate": 7.966197246102564e-07, + "loss": 0.0, + "num_input_tokens_seen": 32731416, + "step": 16721 + }, + { + "epoch": 2.216302186878728, + "grad_norm": 1.038631558418274, + "learning_rate": 7.963656518628548e-07, + "loss": 0.0045, + "num_input_tokens_seen": 32733712, + "step": 16722 + }, + { + "epoch": 2.216434724983433, + "grad_norm": 0.0030344754923135042, + "learning_rate": 7.961116119626996e-07, + "loss": 0.0, + "num_input_tokens_seen": 32734952, + "step": 16723 + }, + { + "epoch": 2.216567263088138, + "grad_norm": 5.364160060882568, + "learning_rate": 7.958576049146882e-07, + "loss": 0.0527, + "num_input_tokens_seen": 32738368, + "step": 16724 + }, + { + "epoch": 2.216699801192843, + "grad_norm": 6.421642303466797, + "learning_rate": 7.956036307237175e-07, + "loss": 0.0665, + "num_input_tokens_seen": 32741408, + "step": 16725 + }, + { + "epoch": 2.216832339297548, + "grad_norm": 3.126265287399292, + "learning_rate": 7.953496893946841e-07, + "loss": 0.018, + "num_input_tokens_seen": 32743104, + "step": 16726 + }, + { + "epoch": 2.216964877402253, + "grad_norm": 6.314855575561523, + "learning_rate": 7.950957809324844e-07, + "loss": 0.0542, + "num_input_tokens_seen": 32744432, + "step": 16727 + }, + { + "epoch": 2.217097415506958, + "grad_norm": 0.0030501838773489, + "learning_rate": 7.94841905342015e-07, + "loss": 0.0, + "num_input_tokens_seen": 32745808, + "step": 16728 + }, + { + "epoch": 2.2172299536116635, + "grad_norm": 1.86149001121521, + "learning_rate": 7.945880626281702e-07, + "loss": 0.0068, + "num_input_tokens_seen": 32747824, + "step": 16729 + }, + { + "epoch": 2.2173624917163686, + "grad_norm": 0.003636228386312723, + "learning_rate": 7.943342527958439e-07, + "loss": 0.0, + "num_input_tokens_seen": 32749104, + "step": 16730 + }, + { + "epoch": 2.2174950298210736, + "grad_norm": 0.0030643995851278305, + "learning_rate": 7.940804758499302e-07, + "loss": 0.0, + "num_input_tokens_seen": 32750304, + "step": 16731 + }, + { + "epoch": 2.2176275679257786, + "grad_norm": 0.007272868882864714, + "learning_rate": 7.938267317953205e-07, + "loss": 0.0, + "num_input_tokens_seen": 32753040, + "step": 16732 + }, + { + "epoch": 2.2177601060304837, + "grad_norm": 0.0020478698424994946, + "learning_rate": 7.935730206369088e-07, + "loss": 0.0, + "num_input_tokens_seen": 32755064, + "step": 16733 + }, + { + "epoch": 2.2178926441351887, + "grad_norm": 0.010774509981274605, + "learning_rate": 7.933193423795871e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32756472, + "step": 16734 + }, + { + "epoch": 2.218025182239894, + "grad_norm": 0.019999122247099876, + "learning_rate": 7.930656970282457e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32758520, + "step": 16735 + }, + { + "epoch": 2.2181577203445992, + "grad_norm": 5.0175018310546875, + "learning_rate": 7.928120845877754e-07, + "loss": 0.0405, + "num_input_tokens_seen": 32760848, + "step": 16736 + }, + { + "epoch": 2.2182902584493043, + "grad_norm": 0.40085798501968384, + "learning_rate": 7.925585050630652e-07, + "loss": 0.0017, + "num_input_tokens_seen": 32762192, + "step": 16737 + }, + { + "epoch": 2.2184227965540093, + "grad_norm": 1.258528232574463, + "learning_rate": 7.923049584590048e-07, + "loss": 0.0028, + "num_input_tokens_seen": 32764536, + "step": 16738 + }, + { + "epoch": 2.2185553346587144, + "grad_norm": 0.00344264879822731, + "learning_rate": 7.920514447804836e-07, + "loss": 0.0, + "num_input_tokens_seen": 32765776, + "step": 16739 + }, + { + "epoch": 2.2186878727634194, + "grad_norm": 0.1471462845802307, + "learning_rate": 7.917979640323889e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32767744, + "step": 16740 + }, + { + "epoch": 2.2188204108681244, + "grad_norm": 16.883075714111328, + "learning_rate": 7.915445162196078e-07, + "loss": 0.0807, + "num_input_tokens_seen": 32770552, + "step": 16741 + }, + { + "epoch": 2.21895294897283, + "grad_norm": 0.003178149461746216, + "learning_rate": 7.912911013470262e-07, + "loss": 0.0, + "num_input_tokens_seen": 32772016, + "step": 16742 + }, + { + "epoch": 2.219085487077535, + "grad_norm": 0.009037372656166553, + "learning_rate": 7.910377194195318e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32774032, + "step": 16743 + }, + { + "epoch": 2.21921802518224, + "grad_norm": 7.590287208557129, + "learning_rate": 7.907843704420082e-07, + "loss": 0.1487, + "num_input_tokens_seen": 32776160, + "step": 16744 + }, + { + "epoch": 2.219350563286945, + "grad_norm": 16.331134796142578, + "learning_rate": 7.905310544193421e-07, + "loss": 0.1996, + "num_input_tokens_seen": 32779040, + "step": 16745 + }, + { + "epoch": 2.21948310139165, + "grad_norm": 3.98402738571167, + "learning_rate": 7.902777713564163e-07, + "loss": 0.0271, + "num_input_tokens_seen": 32780704, + "step": 16746 + }, + { + "epoch": 2.219615639496355, + "grad_norm": 2.928151845932007, + "learning_rate": 7.90024521258114e-07, + "loss": 0.0434, + "num_input_tokens_seen": 32783016, + "step": 16747 + }, + { + "epoch": 2.21974817760106, + "grad_norm": 2.808687686920166, + "learning_rate": 7.897713041293192e-07, + "loss": 0.0301, + "num_input_tokens_seen": 32784904, + "step": 16748 + }, + { + "epoch": 2.2198807157057656, + "grad_norm": 8.514762878417969, + "learning_rate": 7.895181199749135e-07, + "loss": 0.1574, + "num_input_tokens_seen": 32786824, + "step": 16749 + }, + { + "epoch": 2.2200132538104707, + "grad_norm": 8.368415832519531, + "learning_rate": 7.892649687997778e-07, + "loss": 0.2039, + "num_input_tokens_seen": 32788912, + "step": 16750 + }, + { + "epoch": 2.2201457919151757, + "grad_norm": 0.14122797548770905, + "learning_rate": 7.890118506087946e-07, + "loss": 0.0006, + "num_input_tokens_seen": 32790280, + "step": 16751 + }, + { + "epoch": 2.2202783300198807, + "grad_norm": 0.20348365604877472, + "learning_rate": 7.887587654068429e-07, + "loss": 0.0008, + "num_input_tokens_seen": 32792608, + "step": 16752 + }, + { + "epoch": 2.2204108681245858, + "grad_norm": 0.0017029353184625506, + "learning_rate": 7.885057131988022e-07, + "loss": 0.0, + "num_input_tokens_seen": 32794576, + "step": 16753 + }, + { + "epoch": 2.220543406229291, + "grad_norm": 13.156296730041504, + "learning_rate": 7.882526939895527e-07, + "loss": 0.0717, + "num_input_tokens_seen": 32797072, + "step": 16754 + }, + { + "epoch": 2.220675944333996, + "grad_norm": 0.002089474583044648, + "learning_rate": 7.879997077839721e-07, + "loss": 0.0, + "num_input_tokens_seen": 32799328, + "step": 16755 + }, + { + "epoch": 2.2208084824387013, + "grad_norm": 0.0729425773024559, + "learning_rate": 7.877467545869375e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32800976, + "step": 16756 + }, + { + "epoch": 2.2209410205434064, + "grad_norm": 19.320341110229492, + "learning_rate": 7.874938344033275e-07, + "loss": 0.0668, + "num_input_tokens_seen": 32802664, + "step": 16757 + }, + { + "epoch": 2.2210735586481114, + "grad_norm": 0.7069845795631409, + "learning_rate": 7.872409472380168e-07, + "loss": 0.0024, + "num_input_tokens_seen": 32804888, + "step": 16758 + }, + { + "epoch": 2.2212060967528164, + "grad_norm": 13.998156547546387, + "learning_rate": 7.869880930958831e-07, + "loss": 0.1536, + "num_input_tokens_seen": 32806976, + "step": 16759 + }, + { + "epoch": 2.2213386348575215, + "grad_norm": 11.881112098693848, + "learning_rate": 7.867352719818008e-07, + "loss": 0.1128, + "num_input_tokens_seen": 32808640, + "step": 16760 + }, + { + "epoch": 2.2214711729622265, + "grad_norm": 0.02552521601319313, + "learning_rate": 7.864824839006436e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32810936, + "step": 16761 + }, + { + "epoch": 2.2216037110669316, + "grad_norm": 6.907048225402832, + "learning_rate": 7.862297288572867e-07, + "loss": 0.0552, + "num_input_tokens_seen": 32812712, + "step": 16762 + }, + { + "epoch": 2.221736249171637, + "grad_norm": 0.07258374243974686, + "learning_rate": 7.859770068566025e-07, + "loss": 0.0004, + "num_input_tokens_seen": 32813872, + "step": 16763 + }, + { + "epoch": 2.221868787276342, + "grad_norm": 1.089223027229309, + "learning_rate": 7.857243179034646e-07, + "loss": 0.0089, + "num_input_tokens_seen": 32816008, + "step": 16764 + }, + { + "epoch": 2.222001325381047, + "grad_norm": 0.012824106961488724, + "learning_rate": 7.854716620027444e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32817448, + "step": 16765 + }, + { + "epoch": 2.222133863485752, + "grad_norm": 2.4600746631622314, + "learning_rate": 7.852190391593137e-07, + "loss": 0.0088, + "num_input_tokens_seen": 32819360, + "step": 16766 + }, + { + "epoch": 2.222266401590457, + "grad_norm": 11.200146675109863, + "learning_rate": 7.849664493780418e-07, + "loss": 0.0731, + "num_input_tokens_seen": 32821344, + "step": 16767 + }, + { + "epoch": 2.2223989396951622, + "grad_norm": 8.935330390930176, + "learning_rate": 7.847138926638001e-07, + "loss": 0.0448, + "num_input_tokens_seen": 32823088, + "step": 16768 + }, + { + "epoch": 2.2225314777998673, + "grad_norm": 9.436653137207031, + "learning_rate": 7.844613690214583e-07, + "loss": 0.183, + "num_input_tokens_seen": 32825512, + "step": 16769 + }, + { + "epoch": 2.2226640159045727, + "grad_norm": 2.1117258071899414, + "learning_rate": 7.842088784558852e-07, + "loss": 0.0187, + "num_input_tokens_seen": 32826872, + "step": 16770 + }, + { + "epoch": 2.222796554009278, + "grad_norm": 0.6298040747642517, + "learning_rate": 7.839564209719483e-07, + "loss": 0.0062, + "num_input_tokens_seen": 32828456, + "step": 16771 + }, + { + "epoch": 2.222929092113983, + "grad_norm": 0.007674476597458124, + "learning_rate": 7.837039965745156e-07, + "loss": 0.0, + "num_input_tokens_seen": 32830192, + "step": 16772 + }, + { + "epoch": 2.223061630218688, + "grad_norm": 2.7154605388641357, + "learning_rate": 7.834516052684529e-07, + "loss": 0.0183, + "num_input_tokens_seen": 32831744, + "step": 16773 + }, + { + "epoch": 2.223194168323393, + "grad_norm": 7.294100284576416, + "learning_rate": 7.831992470586278e-07, + "loss": 0.062, + "num_input_tokens_seen": 32833944, + "step": 16774 + }, + { + "epoch": 2.223326706428098, + "grad_norm": 0.8680421710014343, + "learning_rate": 7.829469219499062e-07, + "loss": 0.0032, + "num_input_tokens_seen": 32835720, + "step": 16775 + }, + { + "epoch": 2.223459244532803, + "grad_norm": 4.07961368560791, + "learning_rate": 7.826946299471522e-07, + "loss": 0.0232, + "num_input_tokens_seen": 32837632, + "step": 16776 + }, + { + "epoch": 2.2235917826375085, + "grad_norm": 2.9624900817871094, + "learning_rate": 7.824423710552307e-07, + "loss": 0.024, + "num_input_tokens_seen": 32840040, + "step": 16777 + }, + { + "epoch": 2.2237243207422135, + "grad_norm": 0.04612875357270241, + "learning_rate": 7.821901452790042e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32842168, + "step": 16778 + }, + { + "epoch": 2.2238568588469185, + "grad_norm": 0.7843812108039856, + "learning_rate": 7.819379526233378e-07, + "loss": 0.0022, + "num_input_tokens_seen": 32844008, + "step": 16779 + }, + { + "epoch": 2.2239893969516236, + "grad_norm": 8.857994079589844, + "learning_rate": 7.816857930930921e-07, + "loss": 0.1771, + "num_input_tokens_seen": 32847000, + "step": 16780 + }, + { + "epoch": 2.2241219350563286, + "grad_norm": 0.023332104086875916, + "learning_rate": 7.814336666931302e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32848480, + "step": 16781 + }, + { + "epoch": 2.2242544731610336, + "grad_norm": 0.018700242042541504, + "learning_rate": 7.811815734283132e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32851512, + "step": 16782 + }, + { + "epoch": 2.2243870112657387, + "grad_norm": 0.07183434814214706, + "learning_rate": 7.809295133035008e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32852688, + "step": 16783 + }, + { + "epoch": 2.224519549370444, + "grad_norm": 0.02249935269355774, + "learning_rate": 7.806774863235528e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32854968, + "step": 16784 + }, + { + "epoch": 2.224652087475149, + "grad_norm": 12.225175857543945, + "learning_rate": 7.804254924933297e-07, + "loss": 0.0679, + "num_input_tokens_seen": 32857048, + "step": 16785 + }, + { + "epoch": 2.2247846255798542, + "grad_norm": 9.280887603759766, + "learning_rate": 7.801735318176884e-07, + "loss": 0.068, + "num_input_tokens_seen": 32860120, + "step": 16786 + }, + { + "epoch": 2.2249171636845593, + "grad_norm": 1.8865400552749634, + "learning_rate": 7.79921604301489e-07, + "loss": 0.0093, + "num_input_tokens_seen": 32861808, + "step": 16787 + }, + { + "epoch": 2.2250497017892643, + "grad_norm": 4.242854118347168, + "learning_rate": 7.796697099495873e-07, + "loss": 0.0217, + "num_input_tokens_seen": 32864368, + "step": 16788 + }, + { + "epoch": 2.2251822398939693, + "grad_norm": 2.5046091079711914, + "learning_rate": 7.794178487668399e-07, + "loss": 0.028, + "num_input_tokens_seen": 32866704, + "step": 16789 + }, + { + "epoch": 2.2253147779986744, + "grad_norm": 0.00515584135428071, + "learning_rate": 7.79166020758104e-07, + "loss": 0.0, + "num_input_tokens_seen": 32867920, + "step": 16790 + }, + { + "epoch": 2.22544731610338, + "grad_norm": 1.6025937795639038, + "learning_rate": 7.789142259282337e-07, + "loss": 0.0085, + "num_input_tokens_seen": 32870016, + "step": 16791 + }, + { + "epoch": 2.225579854208085, + "grad_norm": 10.482933044433594, + "learning_rate": 7.78662464282085e-07, + "loss": 0.0865, + "num_input_tokens_seen": 32872256, + "step": 16792 + }, + { + "epoch": 2.22571239231279, + "grad_norm": 0.24362346529960632, + "learning_rate": 7.784107358245116e-07, + "loss": 0.001, + "num_input_tokens_seen": 32874080, + "step": 16793 + }, + { + "epoch": 2.225844930417495, + "grad_norm": 4.1341376304626465, + "learning_rate": 7.781590405603662e-07, + "loss": 0.0325, + "num_input_tokens_seen": 32876624, + "step": 16794 + }, + { + "epoch": 2.2259774685222, + "grad_norm": 0.031135013327002525, + "learning_rate": 7.779073784945029e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32878480, + "step": 16795 + }, + { + "epoch": 2.226110006626905, + "grad_norm": 0.22645603120326996, + "learning_rate": 7.776557496317736e-07, + "loss": 0.0009, + "num_input_tokens_seen": 32880408, + "step": 16796 + }, + { + "epoch": 2.2262425447316105, + "grad_norm": 0.4474324584007263, + "learning_rate": 7.774041539770286e-07, + "loss": 0.0023, + "num_input_tokens_seen": 32882112, + "step": 16797 + }, + { + "epoch": 2.2263750828363156, + "grad_norm": 0.04452750086784363, + "learning_rate": 7.771525915351208e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32883992, + "step": 16798 + }, + { + "epoch": 2.2265076209410206, + "grad_norm": 0.023510918021202087, + "learning_rate": 7.769010623108985e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32885952, + "step": 16799 + }, + { + "epoch": 2.2266401590457257, + "grad_norm": 8.683014869689941, + "learning_rate": 7.766495663092135e-07, + "loss": 0.0579, + "num_input_tokens_seen": 32888128, + "step": 16800 + }, + { + "epoch": 2.2267726971504307, + "grad_norm": 0.09306829422712326, + "learning_rate": 7.763981035349139e-07, + "loss": 0.0009, + "num_input_tokens_seen": 32890128, + "step": 16801 + }, + { + "epoch": 2.2269052352551357, + "grad_norm": 5.03671932220459, + "learning_rate": 7.761466739928477e-07, + "loss": 0.0541, + "num_input_tokens_seen": 32892408, + "step": 16802 + }, + { + "epoch": 2.2270377733598408, + "grad_norm": 8.420702934265137, + "learning_rate": 7.75895277687862e-07, + "loss": 0.099, + "num_input_tokens_seen": 32894032, + "step": 16803 + }, + { + "epoch": 2.2271703114645462, + "grad_norm": 0.0015115085989236832, + "learning_rate": 7.756439146248048e-07, + "loss": 0.0, + "num_input_tokens_seen": 32896496, + "step": 16804 + }, + { + "epoch": 2.2273028495692513, + "grad_norm": 5.855273723602295, + "learning_rate": 7.753925848085233e-07, + "loss": 0.1367, + "num_input_tokens_seen": 32898304, + "step": 16805 + }, + { + "epoch": 2.2274353876739563, + "grad_norm": 5.21196985244751, + "learning_rate": 7.751412882438624e-07, + "loss": 0.0346, + "num_input_tokens_seen": 32899920, + "step": 16806 + }, + { + "epoch": 2.2275679257786614, + "grad_norm": 5.572089195251465, + "learning_rate": 7.748900249356678e-07, + "loss": 0.0711, + "num_input_tokens_seen": 32902280, + "step": 16807 + }, + { + "epoch": 2.2277004638833664, + "grad_norm": 1.6589252948760986, + "learning_rate": 7.746387948887832e-07, + "loss": 0.0224, + "num_input_tokens_seen": 32904160, + "step": 16808 + }, + { + "epoch": 2.2278330019880714, + "grad_norm": 8.232868194580078, + "learning_rate": 7.743875981080523e-07, + "loss": 0.1095, + "num_input_tokens_seen": 32905992, + "step": 16809 + }, + { + "epoch": 2.2279655400927765, + "grad_norm": 0.2717493772506714, + "learning_rate": 7.741364345983188e-07, + "loss": 0.0012, + "num_input_tokens_seen": 32907888, + "step": 16810 + }, + { + "epoch": 2.228098078197482, + "grad_norm": 0.007738631684333086, + "learning_rate": 7.738853043644263e-07, + "loss": 0.0, + "num_input_tokens_seen": 32909512, + "step": 16811 + }, + { + "epoch": 2.228230616302187, + "grad_norm": 1.4256256818771362, + "learning_rate": 7.736342074112161e-07, + "loss": 0.0081, + "num_input_tokens_seen": 32911976, + "step": 16812 + }, + { + "epoch": 2.228363154406892, + "grad_norm": 0.02294052019715309, + "learning_rate": 7.73383143743529e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32913472, + "step": 16813 + }, + { + "epoch": 2.228495692511597, + "grad_norm": 9.606719017028809, + "learning_rate": 7.731321133662054e-07, + "loss": 0.0895, + "num_input_tokens_seen": 32916200, + "step": 16814 + }, + { + "epoch": 2.228628230616302, + "grad_norm": 9.122140884399414, + "learning_rate": 7.72881116284086e-07, + "loss": 0.1038, + "num_input_tokens_seen": 32917632, + "step": 16815 + }, + { + "epoch": 2.228760768721007, + "grad_norm": 11.978522300720215, + "learning_rate": 7.726301525020108e-07, + "loss": 0.1113, + "num_input_tokens_seen": 32919336, + "step": 16816 + }, + { + "epoch": 2.2288933068257126, + "grad_norm": 16.88580894470215, + "learning_rate": 7.72379222024818e-07, + "loss": 0.0639, + "num_input_tokens_seen": 32921392, + "step": 16817 + }, + { + "epoch": 2.2290258449304177, + "grad_norm": 0.0026927597355097532, + "learning_rate": 7.721283248573458e-07, + "loss": 0.0, + "num_input_tokens_seen": 32922456, + "step": 16818 + }, + { + "epoch": 2.2291583830351227, + "grad_norm": 3.9795942306518555, + "learning_rate": 7.718774610044313e-07, + "loss": 0.0689, + "num_input_tokens_seen": 32924256, + "step": 16819 + }, + { + "epoch": 2.2292909211398277, + "grad_norm": 0.08937589079141617, + "learning_rate": 7.716266304709108e-07, + "loss": 0.0005, + "num_input_tokens_seen": 32925872, + "step": 16820 + }, + { + "epoch": 2.229423459244533, + "grad_norm": 0.9465895891189575, + "learning_rate": 7.713758332616211e-07, + "loss": 0.0065, + "num_input_tokens_seen": 32927648, + "step": 16821 + }, + { + "epoch": 2.229555997349238, + "grad_norm": 0.005049168597906828, + "learning_rate": 7.711250693813987e-07, + "loss": 0.0, + "num_input_tokens_seen": 32929048, + "step": 16822 + }, + { + "epoch": 2.229688535453943, + "grad_norm": 7.678413391113281, + "learning_rate": 7.708743388350776e-07, + "loss": 0.1044, + "num_input_tokens_seen": 32931952, + "step": 16823 + }, + { + "epoch": 2.2298210735586483, + "grad_norm": 0.03244945779442787, + "learning_rate": 7.706236416274924e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32933168, + "step": 16824 + }, + { + "epoch": 2.2299536116633534, + "grad_norm": 5.87950325012207, + "learning_rate": 7.703729777634752e-07, + "loss": 0.0991, + "num_input_tokens_seen": 32935240, + "step": 16825 + }, + { + "epoch": 2.2300861497680584, + "grad_norm": 4.280216217041016, + "learning_rate": 7.701223472478614e-07, + "loss": 0.0765, + "num_input_tokens_seen": 32937104, + "step": 16826 + }, + { + "epoch": 2.2302186878727635, + "grad_norm": 4.3898468017578125, + "learning_rate": 7.69871750085481e-07, + "loss": 0.0287, + "num_input_tokens_seen": 32939512, + "step": 16827 + }, + { + "epoch": 2.2303512259774685, + "grad_norm": 0.023879118263721466, + "learning_rate": 7.696211862811678e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32942248, + "step": 16828 + }, + { + "epoch": 2.2304837640821735, + "grad_norm": 0.22583164274692535, + "learning_rate": 7.693706558397518e-07, + "loss": 0.0005, + "num_input_tokens_seen": 32944688, + "step": 16829 + }, + { + "epoch": 2.2306163021868786, + "grad_norm": 6.575562477111816, + "learning_rate": 7.691201587660627e-07, + "loss": 0.0557, + "num_input_tokens_seen": 32946912, + "step": 16830 + }, + { + "epoch": 2.230748840291584, + "grad_norm": 1.9189794063568115, + "learning_rate": 7.688696950649316e-07, + "loss": 0.0181, + "num_input_tokens_seen": 32949096, + "step": 16831 + }, + { + "epoch": 2.230881378396289, + "grad_norm": 0.0566316582262516, + "learning_rate": 7.686192647411872e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32950768, + "step": 16832 + }, + { + "epoch": 2.231013916500994, + "grad_norm": 0.004090065136551857, + "learning_rate": 7.68368867799657e-07, + "loss": 0.0, + "num_input_tokens_seen": 32952216, + "step": 16833 + }, + { + "epoch": 2.231146454605699, + "grad_norm": 0.05288415029644966, + "learning_rate": 7.681185042451705e-07, + "loss": 0.0003, + "num_input_tokens_seen": 32953680, + "step": 16834 + }, + { + "epoch": 2.231278992710404, + "grad_norm": 12.346076965332031, + "learning_rate": 7.67868174082553e-07, + "loss": 0.2145, + "num_input_tokens_seen": 32955704, + "step": 16835 + }, + { + "epoch": 2.2314115308151092, + "grad_norm": 2.4233548641204834, + "learning_rate": 7.676178773166329e-07, + "loss": 0.0339, + "num_input_tokens_seen": 32957976, + "step": 16836 + }, + { + "epoch": 2.2315440689198143, + "grad_norm": 2.8592193126678467, + "learning_rate": 7.673676139522354e-07, + "loss": 0.0298, + "num_input_tokens_seen": 32960248, + "step": 16837 + }, + { + "epoch": 2.2316766070245198, + "grad_norm": 9.551393508911133, + "learning_rate": 7.671173839941853e-07, + "loss": 0.1521, + "num_input_tokens_seen": 32962496, + "step": 16838 + }, + { + "epoch": 2.231809145129225, + "grad_norm": 1.5247143507003784, + "learning_rate": 7.668671874473069e-07, + "loss": 0.0058, + "num_input_tokens_seen": 32964224, + "step": 16839 + }, + { + "epoch": 2.23194168323393, + "grad_norm": 0.003496789839118719, + "learning_rate": 7.666170243164256e-07, + "loss": 0.0, + "num_input_tokens_seen": 32965472, + "step": 16840 + }, + { + "epoch": 2.232074221338635, + "grad_norm": 0.012420128099620342, + "learning_rate": 7.663668946063629e-07, + "loss": 0.0001, + "num_input_tokens_seen": 32967160, + "step": 16841 + }, + { + "epoch": 2.23220675944334, + "grad_norm": 1.791758418083191, + "learning_rate": 7.661167983219434e-07, + "loss": 0.0036, + "num_input_tokens_seen": 32968752, + "step": 16842 + }, + { + "epoch": 2.232339297548045, + "grad_norm": 0.036408431828022, + "learning_rate": 7.65866735467988e-07, + "loss": 0.0002, + "num_input_tokens_seen": 32970248, + "step": 16843 + }, + { + "epoch": 2.23247183565275, + "grad_norm": 0.41861966252326965, + "learning_rate": 7.656167060493177e-07, + "loss": 0.0014, + "num_input_tokens_seen": 32971504, + "step": 16844 + }, + { + "epoch": 2.2326043737574555, + "grad_norm": 6.684903144836426, + "learning_rate": 7.653667100707543e-07, + "loss": 0.0581, + "num_input_tokens_seen": 32973264, + "step": 16845 + }, + { + "epoch": 2.2327369118621605, + "grad_norm": 0.05923507362604141, + "learning_rate": 7.651167475371169e-07, + "loss": 0.0004, + "num_input_tokens_seen": 32975264, + "step": 16846 + }, + { + "epoch": 2.2328694499668655, + "grad_norm": 7.135880470275879, + "learning_rate": 7.648668184532259e-07, + "loss": 0.1775, + "num_input_tokens_seen": 32977424, + "step": 16847 + }, + { + "epoch": 2.2330019880715706, + "grad_norm": 0.12432753294706345, + "learning_rate": 7.646169228239001e-07, + "loss": 0.0007, + "num_input_tokens_seen": 32978816, + "step": 16848 + }, + { + "epoch": 2.2331345261762756, + "grad_norm": 5.3253278732299805, + "learning_rate": 7.643670606539569e-07, + "loss": 0.0423, + "num_input_tokens_seen": 32980712, + "step": 16849 + }, + { + "epoch": 2.2332670642809807, + "grad_norm": 12.118880271911621, + "learning_rate": 7.641172319482132e-07, + "loss": 0.0448, + "num_input_tokens_seen": 32982552, + "step": 16850 + }, + { + "epoch": 2.2333996023856857, + "grad_norm": 7.602474212646484, + "learning_rate": 7.638674367114871e-07, + "loss": 0.0754, + "num_input_tokens_seen": 32983872, + "step": 16851 + }, + { + "epoch": 2.233532140490391, + "grad_norm": 8.997574806213379, + "learning_rate": 7.636176749485952e-07, + "loss": 0.0346, + "num_input_tokens_seen": 32986112, + "step": 16852 + }, + { + "epoch": 2.233664678595096, + "grad_norm": 3.0022573471069336, + "learning_rate": 7.633679466643523e-07, + "loss": 0.0302, + "num_input_tokens_seen": 32988736, + "step": 16853 + }, + { + "epoch": 2.2337972166998012, + "grad_norm": 0.2232588678598404, + "learning_rate": 7.631182518635736e-07, + "loss": 0.001, + "num_input_tokens_seen": 32990432, + "step": 16854 + }, + { + "epoch": 2.2339297548045063, + "grad_norm": 6.605444431304932, + "learning_rate": 7.62868590551073e-07, + "loss": 0.0114, + "num_input_tokens_seen": 32992056, + "step": 16855 + }, + { + "epoch": 2.2340622929092113, + "grad_norm": 3.551065683364868, + "learning_rate": 7.626189627316635e-07, + "loss": 0.0208, + "num_input_tokens_seen": 32994072, + "step": 16856 + }, + { + "epoch": 2.2341948310139164, + "grad_norm": 0.006108480971306562, + "learning_rate": 7.623693684101593e-07, + "loss": 0.0, + "num_input_tokens_seen": 32996192, + "step": 16857 + }, + { + "epoch": 2.2343273691186214, + "grad_norm": 6.323681831359863, + "learning_rate": 7.62119807591373e-07, + "loss": 0.0387, + "num_input_tokens_seen": 32998896, + "step": 16858 + }, + { + "epoch": 2.234459907223327, + "grad_norm": 0.006001184228807688, + "learning_rate": 7.618702802801158e-07, + "loss": 0.0, + "num_input_tokens_seen": 33000384, + "step": 16859 + }, + { + "epoch": 2.234592445328032, + "grad_norm": 0.016904622316360474, + "learning_rate": 7.616207864811986e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33002672, + "step": 16860 + }, + { + "epoch": 2.234724983432737, + "grad_norm": 0.011634291149675846, + "learning_rate": 7.61371326199431e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33005736, + "step": 16861 + }, + { + "epoch": 2.234857521537442, + "grad_norm": 0.1562369465827942, + "learning_rate": 7.611218994396246e-07, + "loss": 0.0009, + "num_input_tokens_seen": 33008200, + "step": 16862 + }, + { + "epoch": 2.234990059642147, + "grad_norm": 6.2754974365234375, + "learning_rate": 7.608725062065867e-07, + "loss": 0.0866, + "num_input_tokens_seen": 33010296, + "step": 16863 + }, + { + "epoch": 2.235122597746852, + "grad_norm": 4.577400207519531, + "learning_rate": 7.606231465051275e-07, + "loss": 0.0477, + "num_input_tokens_seen": 33012424, + "step": 16864 + }, + { + "epoch": 2.235255135851557, + "grad_norm": 8.511613845825195, + "learning_rate": 7.60373820340054e-07, + "loss": 0.019, + "num_input_tokens_seen": 33014248, + "step": 16865 + }, + { + "epoch": 2.2353876739562626, + "grad_norm": 0.03635704889893532, + "learning_rate": 7.601245277161731e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33016752, + "step": 16866 + }, + { + "epoch": 2.2355202120609676, + "grad_norm": 0.031176378950476646, + "learning_rate": 7.59875268638291e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33018336, + "step": 16867 + }, + { + "epoch": 2.2356527501656727, + "grad_norm": 1.4374096393585205, + "learning_rate": 7.596260431112143e-07, + "loss": 0.0057, + "num_input_tokens_seen": 33019848, + "step": 16868 + }, + { + "epoch": 2.2357852882703777, + "grad_norm": 0.0037554167211055756, + "learning_rate": 7.593768511397487e-07, + "loss": 0.0, + "num_input_tokens_seen": 33021632, + "step": 16869 + }, + { + "epoch": 2.2359178263750827, + "grad_norm": 1.0293668508529663, + "learning_rate": 7.591276927286984e-07, + "loss": 0.0035, + "num_input_tokens_seen": 33023120, + "step": 16870 + }, + { + "epoch": 2.2360503644797878, + "grad_norm": 16.602781295776367, + "learning_rate": 7.588785678828669e-07, + "loss": 0.1243, + "num_input_tokens_seen": 33025768, + "step": 16871 + }, + { + "epoch": 2.236182902584493, + "grad_norm": 5.745857238769531, + "learning_rate": 7.586294766070571e-07, + "loss": 0.0997, + "num_input_tokens_seen": 33027528, + "step": 16872 + }, + { + "epoch": 2.2363154406891983, + "grad_norm": 12.850393295288086, + "learning_rate": 7.583804189060729e-07, + "loss": 0.1092, + "num_input_tokens_seen": 33029304, + "step": 16873 + }, + { + "epoch": 2.2364479787939033, + "grad_norm": 0.005411691032350063, + "learning_rate": 7.581313947847152e-07, + "loss": 0.0, + "num_input_tokens_seen": 33030968, + "step": 16874 + }, + { + "epoch": 2.2365805168986084, + "grad_norm": 16.323911666870117, + "learning_rate": 7.578824042477865e-07, + "loss": 0.1628, + "num_input_tokens_seen": 33033288, + "step": 16875 + }, + { + "epoch": 2.2367130550033134, + "grad_norm": 4.657427787780762, + "learning_rate": 7.576334473000868e-07, + "loss": 0.039, + "num_input_tokens_seen": 33036128, + "step": 16876 + }, + { + "epoch": 2.2368455931080184, + "grad_norm": 0.03632344678044319, + "learning_rate": 7.573845239464156e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33038088, + "step": 16877 + }, + { + "epoch": 2.2369781312127235, + "grad_norm": 4.514306545257568, + "learning_rate": 7.571356341915737e-07, + "loss": 0.0317, + "num_input_tokens_seen": 33039920, + "step": 16878 + }, + { + "epoch": 2.237110669317429, + "grad_norm": 0.035691998898983, + "learning_rate": 7.568867780403591e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33042304, + "step": 16879 + }, + { + "epoch": 2.237243207422134, + "grad_norm": 0.023275118321180344, + "learning_rate": 7.566379554975692e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33044136, + "step": 16880 + }, + { + "epoch": 2.237375745526839, + "grad_norm": 0.015332008711993694, + "learning_rate": 7.563891665680029e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33045944, + "step": 16881 + }, + { + "epoch": 2.237508283631544, + "grad_norm": 0.03269106149673462, + "learning_rate": 7.561404112564555e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33047960, + "step": 16882 + }, + { + "epoch": 2.237640821736249, + "grad_norm": 10.386913299560547, + "learning_rate": 7.558916895677249e-07, + "loss": 0.1604, + "num_input_tokens_seen": 33050280, + "step": 16883 + }, + { + "epoch": 2.237773359840954, + "grad_norm": 0.0517621673643589, + "learning_rate": 7.556430015066055e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33052088, + "step": 16884 + }, + { + "epoch": 2.237905897945659, + "grad_norm": 0.02438417449593544, + "learning_rate": 7.553943470778927e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33054248, + "step": 16885 + }, + { + "epoch": 2.2380384360503647, + "grad_norm": 1.4206547737121582, + "learning_rate": 7.551457262863792e-07, + "loss": 0.0036, + "num_input_tokens_seen": 33056528, + "step": 16886 + }, + { + "epoch": 2.2381709741550697, + "grad_norm": 0.03010665439069271, + "learning_rate": 7.548971391368604e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33057968, + "step": 16887 + }, + { + "epoch": 2.2383035122597748, + "grad_norm": 8.699987411499023, + "learning_rate": 7.54648585634129e-07, + "loss": 0.0633, + "num_input_tokens_seen": 33060152, + "step": 16888 + }, + { + "epoch": 2.23843605036448, + "grad_norm": 0.5269138216972351, + "learning_rate": 7.544000657829773e-07, + "loss": 0.0013, + "num_input_tokens_seen": 33062752, + "step": 16889 + }, + { + "epoch": 2.238568588469185, + "grad_norm": 11.476886749267578, + "learning_rate": 7.541515795881962e-07, + "loss": 0.0702, + "num_input_tokens_seen": 33064720, + "step": 16890 + }, + { + "epoch": 2.23870112657389, + "grad_norm": 0.032786667346954346, + "learning_rate": 7.539031270545771e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33066504, + "step": 16891 + }, + { + "epoch": 2.238833664678595, + "grad_norm": 6.707982063293457, + "learning_rate": 7.536547081869097e-07, + "loss": 0.0603, + "num_input_tokens_seen": 33068576, + "step": 16892 + }, + { + "epoch": 2.2389662027833004, + "grad_norm": 6.941318035125732, + "learning_rate": 7.534063229899841e-07, + "loss": 0.0964, + "num_input_tokens_seen": 33070272, + "step": 16893 + }, + { + "epoch": 2.2390987408880054, + "grad_norm": 0.03787539154291153, + "learning_rate": 7.531579714685902e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33072072, + "step": 16894 + }, + { + "epoch": 2.2392312789927105, + "grad_norm": 0.012064690701663494, + "learning_rate": 7.52909653627516e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33073680, + "step": 16895 + }, + { + "epoch": 2.2393638170974155, + "grad_norm": 7.186579704284668, + "learning_rate": 7.526613694715487e-07, + "loss": 0.1506, + "num_input_tokens_seen": 33075944, + "step": 16896 + }, + { + "epoch": 2.2394963552021205, + "grad_norm": 5.0201005935668945, + "learning_rate": 7.524131190054748e-07, + "loss": 0.1431, + "num_input_tokens_seen": 33077960, + "step": 16897 + }, + { + "epoch": 2.2396288933068256, + "grad_norm": 0.024932969361543655, + "learning_rate": 7.521649022340816e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33079896, + "step": 16898 + }, + { + "epoch": 2.2397614314115306, + "grad_norm": 6.946509838104248, + "learning_rate": 7.519167191621557e-07, + "loss": 0.1276, + "num_input_tokens_seen": 33081808, + "step": 16899 + }, + { + "epoch": 2.239893969516236, + "grad_norm": 5.004718780517578, + "learning_rate": 7.516685697944815e-07, + "loss": 0.0327, + "num_input_tokens_seen": 33083904, + "step": 16900 + }, + { + "epoch": 2.240026507620941, + "grad_norm": 0.18041519820690155, + "learning_rate": 7.514204541358433e-07, + "loss": 0.0005, + "num_input_tokens_seen": 33085488, + "step": 16901 + }, + { + "epoch": 2.240159045725646, + "grad_norm": 9.388216972351074, + "learning_rate": 7.511723721910249e-07, + "loss": 0.1134, + "num_input_tokens_seen": 33088224, + "step": 16902 + }, + { + "epoch": 2.240291583830351, + "grad_norm": 20.20255470275879, + "learning_rate": 7.50924323964809e-07, + "loss": 0.0625, + "num_input_tokens_seen": 33090024, + "step": 16903 + }, + { + "epoch": 2.2404241219350562, + "grad_norm": 3.309440851211548, + "learning_rate": 7.506763094619788e-07, + "loss": 0.0896, + "num_input_tokens_seen": 33091528, + "step": 16904 + }, + { + "epoch": 2.2405566600397613, + "grad_norm": 0.022734861820936203, + "learning_rate": 7.504283286873168e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33094696, + "step": 16905 + }, + { + "epoch": 2.2406891981444668, + "grad_norm": 0.40271276235580444, + "learning_rate": 7.501803816456036e-07, + "loss": 0.0025, + "num_input_tokens_seen": 33096208, + "step": 16906 + }, + { + "epoch": 2.240821736249172, + "grad_norm": 2.050246238708496, + "learning_rate": 7.499324683416198e-07, + "loss": 0.0304, + "num_input_tokens_seen": 33098976, + "step": 16907 + }, + { + "epoch": 2.240954274353877, + "grad_norm": 4.149683475494385, + "learning_rate": 7.496845887801447e-07, + "loss": 0.0381, + "num_input_tokens_seen": 33100744, + "step": 16908 + }, + { + "epoch": 2.241086812458582, + "grad_norm": 2.672752618789673, + "learning_rate": 7.494367429659588e-07, + "loss": 0.0177, + "num_input_tokens_seen": 33103808, + "step": 16909 + }, + { + "epoch": 2.241219350563287, + "grad_norm": 0.034271519631147385, + "learning_rate": 7.491889309038391e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33105776, + "step": 16910 + }, + { + "epoch": 2.241351888667992, + "grad_norm": 3.9220025539398193, + "learning_rate": 7.48941152598566e-07, + "loss": 0.0483, + "num_input_tokens_seen": 33107360, + "step": 16911 + }, + { + "epoch": 2.241484426772697, + "grad_norm": 1.9702450037002563, + "learning_rate": 7.486934080549152e-07, + "loss": 0.0052, + "num_input_tokens_seen": 33109944, + "step": 16912 + }, + { + "epoch": 2.2416169648774025, + "grad_norm": 0.05215461179614067, + "learning_rate": 7.484456972776627e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33111712, + "step": 16913 + }, + { + "epoch": 2.2417495029821075, + "grad_norm": 1.3901699781417847, + "learning_rate": 7.481980202715866e-07, + "loss": 0.021, + "num_input_tokens_seen": 33113120, + "step": 16914 + }, + { + "epoch": 2.2418820410868125, + "grad_norm": 3.4148108959198, + "learning_rate": 7.479503770414608e-07, + "loss": 0.0168, + "num_input_tokens_seen": 33114328, + "step": 16915 + }, + { + "epoch": 2.2420145791915176, + "grad_norm": 6.229007244110107, + "learning_rate": 7.477027675920601e-07, + "loss": 0.0885, + "num_input_tokens_seen": 33116640, + "step": 16916 + }, + { + "epoch": 2.2421471172962226, + "grad_norm": 0.06428591907024384, + "learning_rate": 7.474551919281591e-07, + "loss": 0.0004, + "num_input_tokens_seen": 33118752, + "step": 16917 + }, + { + "epoch": 2.2422796554009277, + "grad_norm": 0.08684971183538437, + "learning_rate": 7.472076500545305e-07, + "loss": 0.0004, + "num_input_tokens_seen": 33120712, + "step": 16918 + }, + { + "epoch": 2.2424121935056327, + "grad_norm": 0.01004707906395197, + "learning_rate": 7.469601419759487e-07, + "loss": 0.0, + "num_input_tokens_seen": 33121736, + "step": 16919 + }, + { + "epoch": 2.242544731610338, + "grad_norm": 1.1019450426101685, + "learning_rate": 7.467126676971844e-07, + "loss": 0.0022, + "num_input_tokens_seen": 33123832, + "step": 16920 + }, + { + "epoch": 2.242677269715043, + "grad_norm": 6.03755521774292, + "learning_rate": 7.464652272230086e-07, + "loss": 0.051, + "num_input_tokens_seen": 33125720, + "step": 16921 + }, + { + "epoch": 2.2428098078197483, + "grad_norm": 0.15352152287960052, + "learning_rate": 7.462178205581938e-07, + "loss": 0.0004, + "num_input_tokens_seen": 33126824, + "step": 16922 + }, + { + "epoch": 2.2429423459244533, + "grad_norm": 0.057611990720033646, + "learning_rate": 7.459704477075091e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33128256, + "step": 16923 + }, + { + "epoch": 2.2430748840291583, + "grad_norm": 0.0035571185871958733, + "learning_rate": 7.457231086757238e-07, + "loss": 0.0, + "num_input_tokens_seen": 33129632, + "step": 16924 + }, + { + "epoch": 2.2432074221338634, + "grad_norm": 8.817341804504395, + "learning_rate": 7.454758034676077e-07, + "loss": 0.1276, + "num_input_tokens_seen": 33131984, + "step": 16925 + }, + { + "epoch": 2.2433399602385684, + "grad_norm": 0.9332042336463928, + "learning_rate": 7.452285320879285e-07, + "loss": 0.0072, + "num_input_tokens_seen": 33133360, + "step": 16926 + }, + { + "epoch": 2.243472498343274, + "grad_norm": 0.01454442273825407, + "learning_rate": 7.449812945414531e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33135912, + "step": 16927 + }, + { + "epoch": 2.243605036447979, + "grad_norm": 2.380324125289917, + "learning_rate": 7.447340908329498e-07, + "loss": 0.005, + "num_input_tokens_seen": 33137376, + "step": 16928 + }, + { + "epoch": 2.243737574552684, + "grad_norm": 2.013598680496216, + "learning_rate": 7.444869209671832e-07, + "loss": 0.0081, + "num_input_tokens_seen": 33139472, + "step": 16929 + }, + { + "epoch": 2.243870112657389, + "grad_norm": 2.8410463333129883, + "learning_rate": 7.442397849489208e-07, + "loss": 0.0195, + "num_input_tokens_seen": 33141328, + "step": 16930 + }, + { + "epoch": 2.244002650762094, + "grad_norm": 6.372222423553467, + "learning_rate": 7.439926827829263e-07, + "loss": 0.0395, + "num_input_tokens_seen": 33143000, + "step": 16931 + }, + { + "epoch": 2.244135188866799, + "grad_norm": 3.541367769241333, + "learning_rate": 7.437456144739646e-07, + "loss": 0.0338, + "num_input_tokens_seen": 33145592, + "step": 16932 + }, + { + "epoch": 2.244267726971504, + "grad_norm": 11.826542854309082, + "learning_rate": 7.434985800267982e-07, + "loss": 0.3244, + "num_input_tokens_seen": 33148368, + "step": 16933 + }, + { + "epoch": 2.2444002650762096, + "grad_norm": 0.0016426644288003445, + "learning_rate": 7.432515794461909e-07, + "loss": 0.0, + "num_input_tokens_seen": 33149584, + "step": 16934 + }, + { + "epoch": 2.2445328031809146, + "grad_norm": 0.05555108189582825, + "learning_rate": 7.430046127369056e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33151136, + "step": 16935 + }, + { + "epoch": 2.2446653412856197, + "grad_norm": 33.76434326171875, + "learning_rate": 7.427576799037037e-07, + "loss": 0.0903, + "num_input_tokens_seen": 33153080, + "step": 16936 + }, + { + "epoch": 2.2447978793903247, + "grad_norm": 3.8862385749816895, + "learning_rate": 7.42510780951346e-07, + "loss": 0.0602, + "num_input_tokens_seen": 33154752, + "step": 16937 + }, + { + "epoch": 2.2449304174950298, + "grad_norm": 3.9612860679626465, + "learning_rate": 7.422639158845929e-07, + "loss": 0.0391, + "num_input_tokens_seen": 33157032, + "step": 16938 + }, + { + "epoch": 2.245062955599735, + "grad_norm": 5.586040496826172, + "learning_rate": 7.420170847082031e-07, + "loss": 0.0569, + "num_input_tokens_seen": 33159320, + "step": 16939 + }, + { + "epoch": 2.24519549370444, + "grad_norm": 7.117069244384766, + "learning_rate": 7.417702874269369e-07, + "loss": 0.0452, + "num_input_tokens_seen": 33161272, + "step": 16940 + }, + { + "epoch": 2.2453280318091453, + "grad_norm": 4.0987348556518555, + "learning_rate": 7.415235240455531e-07, + "loss": 0.0441, + "num_input_tokens_seen": 33163008, + "step": 16941 + }, + { + "epoch": 2.2454605699138503, + "grad_norm": 0.15577000379562378, + "learning_rate": 7.41276794568809e-07, + "loss": 0.0008, + "num_input_tokens_seen": 33165608, + "step": 16942 + }, + { + "epoch": 2.2455931080185554, + "grad_norm": 8.19677734375, + "learning_rate": 7.410300990014616e-07, + "loss": 0.0833, + "num_input_tokens_seen": 33167264, + "step": 16943 + }, + { + "epoch": 2.2457256461232604, + "grad_norm": 6.9497456550598145, + "learning_rate": 7.407834373482667e-07, + "loss": 0.1176, + "num_input_tokens_seen": 33169496, + "step": 16944 + }, + { + "epoch": 2.2458581842279655, + "grad_norm": 13.79938793182373, + "learning_rate": 7.405368096139812e-07, + "loss": 0.0209, + "num_input_tokens_seen": 33171328, + "step": 16945 + }, + { + "epoch": 2.2459907223326705, + "grad_norm": 0.0009604765800759196, + "learning_rate": 7.402902158033593e-07, + "loss": 0.0, + "num_input_tokens_seen": 33172608, + "step": 16946 + }, + { + "epoch": 2.2461232604373755, + "grad_norm": 5.148166179656982, + "learning_rate": 7.400436559211566e-07, + "loss": 0.0145, + "num_input_tokens_seen": 33174096, + "step": 16947 + }, + { + "epoch": 2.246255798542081, + "grad_norm": 10.47939395904541, + "learning_rate": 7.397971299721262e-07, + "loss": 0.2954, + "num_input_tokens_seen": 33176136, + "step": 16948 + }, + { + "epoch": 2.246388336646786, + "grad_norm": 0.39862412214279175, + "learning_rate": 7.39550637961021e-07, + "loss": 0.0017, + "num_input_tokens_seen": 33178616, + "step": 16949 + }, + { + "epoch": 2.246520874751491, + "grad_norm": 0.8816923499107361, + "learning_rate": 7.393041798925946e-07, + "loss": 0.0042, + "num_input_tokens_seen": 33180512, + "step": 16950 + }, + { + "epoch": 2.246653412856196, + "grad_norm": 5.178239345550537, + "learning_rate": 7.390577557715975e-07, + "loss": 0.0181, + "num_input_tokens_seen": 33182776, + "step": 16951 + }, + { + "epoch": 2.246785950960901, + "grad_norm": 0.4570491313934326, + "learning_rate": 7.388113656027824e-07, + "loss": 0.0015, + "num_input_tokens_seen": 33184552, + "step": 16952 + }, + { + "epoch": 2.246918489065606, + "grad_norm": 0.24592724442481995, + "learning_rate": 7.385650093908992e-07, + "loss": 0.0008, + "num_input_tokens_seen": 33186048, + "step": 16953 + }, + { + "epoch": 2.2470510271703112, + "grad_norm": 0.02860875241458416, + "learning_rate": 7.383186871406978e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33187976, + "step": 16954 + }, + { + "epoch": 2.2471835652750167, + "grad_norm": 5.490097522735596, + "learning_rate": 7.380723988569266e-07, + "loss": 0.0519, + "num_input_tokens_seen": 33189960, + "step": 16955 + }, + { + "epoch": 2.2473161033797218, + "grad_norm": 0.05248027667403221, + "learning_rate": 7.378261445443358e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33191816, + "step": 16956 + }, + { + "epoch": 2.247448641484427, + "grad_norm": 6.085441589355469, + "learning_rate": 7.375799242076717e-07, + "loss": 0.1032, + "num_input_tokens_seen": 33193520, + "step": 16957 + }, + { + "epoch": 2.247581179589132, + "grad_norm": 1.8667737245559692, + "learning_rate": 7.373337378516835e-07, + "loss": 0.0072, + "num_input_tokens_seen": 33195192, + "step": 16958 + }, + { + "epoch": 2.247713717693837, + "grad_norm": 0.11567371338605881, + "learning_rate": 7.370875854811169e-07, + "loss": 0.0004, + "num_input_tokens_seen": 33196400, + "step": 16959 + }, + { + "epoch": 2.247846255798542, + "grad_norm": 0.025613414123654366, + "learning_rate": 7.368414671007168e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33199000, + "step": 16960 + }, + { + "epoch": 2.247978793903247, + "grad_norm": 9.597737312316895, + "learning_rate": 7.365953827152303e-07, + "loss": 0.0675, + "num_input_tokens_seen": 33200792, + "step": 16961 + }, + { + "epoch": 2.2481113320079524, + "grad_norm": 2.251826047897339, + "learning_rate": 7.363493323294015e-07, + "loss": 0.0272, + "num_input_tokens_seen": 33202880, + "step": 16962 + }, + { + "epoch": 2.2482438701126575, + "grad_norm": 13.39017391204834, + "learning_rate": 7.361033159479735e-07, + "loss": 0.2389, + "num_input_tokens_seen": 33206120, + "step": 16963 + }, + { + "epoch": 2.2483764082173625, + "grad_norm": 0.005103795789182186, + "learning_rate": 7.358573335756911e-07, + "loss": 0.0, + "num_input_tokens_seen": 33207640, + "step": 16964 + }, + { + "epoch": 2.2485089463220675, + "grad_norm": 0.002359428210183978, + "learning_rate": 7.356113852172955e-07, + "loss": 0.0, + "num_input_tokens_seen": 33209136, + "step": 16965 + }, + { + "epoch": 2.2486414844267726, + "grad_norm": 2.287686586380005, + "learning_rate": 7.353654708775307e-07, + "loss": 0.014, + "num_input_tokens_seen": 33211088, + "step": 16966 + }, + { + "epoch": 2.2487740225314776, + "grad_norm": 0.009008623659610748, + "learning_rate": 7.351195905611366e-07, + "loss": 0.0, + "num_input_tokens_seen": 33213160, + "step": 16967 + }, + { + "epoch": 2.248906560636183, + "grad_norm": 2.3068599700927734, + "learning_rate": 7.348737442728543e-07, + "loss": 0.0258, + "num_input_tokens_seen": 33215664, + "step": 16968 + }, + { + "epoch": 2.249039098740888, + "grad_norm": 0.019841045141220093, + "learning_rate": 7.346279320174235e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33218024, + "step": 16969 + }, + { + "epoch": 2.249171636845593, + "grad_norm": 10.485225677490234, + "learning_rate": 7.343821537995838e-07, + "loss": 0.035, + "num_input_tokens_seen": 33220160, + "step": 16970 + }, + { + "epoch": 2.249304174950298, + "grad_norm": 3.2425382137298584, + "learning_rate": 7.34136409624075e-07, + "loss": 0.0347, + "num_input_tokens_seen": 33222120, + "step": 16971 + }, + { + "epoch": 2.2494367130550033, + "grad_norm": 1.1141399145126343, + "learning_rate": 7.338906994956344e-07, + "loss": 0.008, + "num_input_tokens_seen": 33224224, + "step": 16972 + }, + { + "epoch": 2.2495692511597083, + "grad_norm": 8.122712135314941, + "learning_rate": 7.336450234189996e-07, + "loss": 0.0266, + "num_input_tokens_seen": 33225584, + "step": 16973 + }, + { + "epoch": 2.2497017892644133, + "grad_norm": 0.0022021664772182703, + "learning_rate": 7.333993813989063e-07, + "loss": 0.0, + "num_input_tokens_seen": 33227344, + "step": 16974 + }, + { + "epoch": 2.249834327369119, + "grad_norm": 0.004999179393053055, + "learning_rate": 7.331537734400918e-07, + "loss": 0.0, + "num_input_tokens_seen": 33229120, + "step": 16975 + }, + { + "epoch": 2.249966865473824, + "grad_norm": 5.185283184051514, + "learning_rate": 7.329081995472922e-07, + "loss": 0.0417, + "num_input_tokens_seen": 33231208, + "step": 16976 + }, + { + "epoch": 2.250099403578529, + "grad_norm": 0.05826427787542343, + "learning_rate": 7.326626597252415e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33233224, + "step": 16977 + }, + { + "epoch": 2.250231941683234, + "grad_norm": 1.199054479598999, + "learning_rate": 7.32417153978674e-07, + "loss": 0.0137, + "num_input_tokens_seen": 33234856, + "step": 16978 + }, + { + "epoch": 2.250364479787939, + "grad_norm": 0.12127724289894104, + "learning_rate": 7.321716823123232e-07, + "loss": 0.0006, + "num_input_tokens_seen": 33236680, + "step": 16979 + }, + { + "epoch": 2.250497017892644, + "grad_norm": 0.14179617166519165, + "learning_rate": 7.319262447309211e-07, + "loss": 0.0007, + "num_input_tokens_seen": 33237960, + "step": 16980 + }, + { + "epoch": 2.2506295559973495, + "grad_norm": 1.946344017982483, + "learning_rate": 7.316808412392007e-07, + "loss": 0.0224, + "num_input_tokens_seen": 33240376, + "step": 16981 + }, + { + "epoch": 2.2507620941020545, + "grad_norm": 0.01241837628185749, + "learning_rate": 7.314354718418946e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33241496, + "step": 16982 + }, + { + "epoch": 2.2508946322067596, + "grad_norm": 0.029551241546869278, + "learning_rate": 7.311901365437324e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33243336, + "step": 16983 + }, + { + "epoch": 2.2510271703114646, + "grad_norm": 0.0013921392383053899, + "learning_rate": 7.309448353494447e-07, + "loss": 0.0, + "num_input_tokens_seen": 33245632, + "step": 16984 + }, + { + "epoch": 2.2511597084161696, + "grad_norm": 0.009858516976237297, + "learning_rate": 7.30699568263761e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33247704, + "step": 16985 + }, + { + "epoch": 2.2512922465208747, + "grad_norm": 4.5557026863098145, + "learning_rate": 7.304543352914093e-07, + "loss": 0.0337, + "num_input_tokens_seen": 33249880, + "step": 16986 + }, + { + "epoch": 2.2514247846255797, + "grad_norm": 0.007679368369281292, + "learning_rate": 7.302091364371189e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33251720, + "step": 16987 + }, + { + "epoch": 2.251557322730285, + "grad_norm": 0.5825331211090088, + "learning_rate": 7.299639717056181e-07, + "loss": 0.0017, + "num_input_tokens_seen": 33254000, + "step": 16988 + }, + { + "epoch": 2.2516898608349902, + "grad_norm": 0.21338331699371338, + "learning_rate": 7.297188411016329e-07, + "loss": 0.0007, + "num_input_tokens_seen": 33255880, + "step": 16989 + }, + { + "epoch": 2.2518223989396953, + "grad_norm": 0.024999016895890236, + "learning_rate": 7.294737446298897e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33257496, + "step": 16990 + }, + { + "epoch": 2.2519549370444003, + "grad_norm": 8.251734733581543, + "learning_rate": 7.292286822951134e-07, + "loss": 0.0932, + "num_input_tokens_seen": 33260696, + "step": 16991 + }, + { + "epoch": 2.2520874751491053, + "grad_norm": 0.07410049438476562, + "learning_rate": 7.289836541020303e-07, + "loss": 0.0007, + "num_input_tokens_seen": 33262080, + "step": 16992 + }, + { + "epoch": 2.2522200132538104, + "grad_norm": 6.856213092803955, + "learning_rate": 7.287386600553636e-07, + "loss": 0.1598, + "num_input_tokens_seen": 33263912, + "step": 16993 + }, + { + "epoch": 2.2523525513585154, + "grad_norm": 4.953722953796387, + "learning_rate": 7.28493700159838e-07, + "loss": 0.0426, + "num_input_tokens_seen": 33265872, + "step": 16994 + }, + { + "epoch": 2.252485089463221, + "grad_norm": 8.043919563293457, + "learning_rate": 7.282487744201761e-07, + "loss": 0.2656, + "num_input_tokens_seen": 33268296, + "step": 16995 + }, + { + "epoch": 2.252617627567926, + "grad_norm": 0.09154964983463287, + "learning_rate": 7.280038828410993e-07, + "loss": 0.0006, + "num_input_tokens_seen": 33270864, + "step": 16996 + }, + { + "epoch": 2.252750165672631, + "grad_norm": 7.528890609741211, + "learning_rate": 7.277590254273309e-07, + "loss": 0.057, + "num_input_tokens_seen": 33272744, + "step": 16997 + }, + { + "epoch": 2.252882703777336, + "grad_norm": 0.8290294408798218, + "learning_rate": 7.27514202183591e-07, + "loss": 0.0013, + "num_input_tokens_seen": 33275888, + "step": 16998 + }, + { + "epoch": 2.253015241882041, + "grad_norm": 9.201217651367188, + "learning_rate": 7.272694131145993e-07, + "loss": 0.097, + "num_input_tokens_seen": 33278096, + "step": 16999 + }, + { + "epoch": 2.253147779986746, + "grad_norm": 3.091181755065918, + "learning_rate": 7.27024658225077e-07, + "loss": 0.0207, + "num_input_tokens_seen": 33279976, + "step": 17000 + }, + { + "epoch": 2.253280318091451, + "grad_norm": 12.717276573181152, + "learning_rate": 7.267799375197418e-07, + "loss": 0.0488, + "num_input_tokens_seen": 33282160, + "step": 17001 + }, + { + "epoch": 2.2534128561961566, + "grad_norm": 6.23261022567749, + "learning_rate": 7.265352510033133e-07, + "loss": 0.0907, + "num_input_tokens_seen": 33284280, + "step": 17002 + }, + { + "epoch": 2.2535453943008616, + "grad_norm": 0.938535213470459, + "learning_rate": 7.262905986805088e-07, + "loss": 0.0014, + "num_input_tokens_seen": 33285744, + "step": 17003 + }, + { + "epoch": 2.2536779324055667, + "grad_norm": 6.548614501953125, + "learning_rate": 7.26045980556044e-07, + "loss": 0.1454, + "num_input_tokens_seen": 33287680, + "step": 17004 + }, + { + "epoch": 2.2538104705102717, + "grad_norm": 0.0010226767044514418, + "learning_rate": 7.258013966346378e-07, + "loss": 0.0, + "num_input_tokens_seen": 33288992, + "step": 17005 + }, + { + "epoch": 2.2539430086149768, + "grad_norm": 18.0094051361084, + "learning_rate": 7.255568469210033e-07, + "loss": 0.1424, + "num_input_tokens_seen": 33291112, + "step": 17006 + }, + { + "epoch": 2.254075546719682, + "grad_norm": 0.029837286099791527, + "learning_rate": 7.253123314198582e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33292688, + "step": 17007 + }, + { + "epoch": 2.254208084824387, + "grad_norm": 0.45921260118484497, + "learning_rate": 7.250678501359154e-07, + "loss": 0.0015, + "num_input_tokens_seen": 33294184, + "step": 17008 + }, + { + "epoch": 2.2543406229290923, + "grad_norm": 0.15770047903060913, + "learning_rate": 7.248234030738888e-07, + "loss": 0.0007, + "num_input_tokens_seen": 33295864, + "step": 17009 + }, + { + "epoch": 2.2544731610337974, + "grad_norm": 0.04620315134525299, + "learning_rate": 7.245789902384908e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33297752, + "step": 17010 + }, + { + "epoch": 2.2546056991385024, + "grad_norm": 0.03155709058046341, + "learning_rate": 7.243346116344357e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33299992, + "step": 17011 + }, + { + "epoch": 2.2547382372432074, + "grad_norm": 0.09925250709056854, + "learning_rate": 7.240902672664335e-07, + "loss": 0.0007, + "num_input_tokens_seen": 33302624, + "step": 17012 + }, + { + "epoch": 2.2548707753479125, + "grad_norm": 7.438824653625488, + "learning_rate": 7.238459571391967e-07, + "loss": 0.0215, + "num_input_tokens_seen": 33304592, + "step": 17013 + }, + { + "epoch": 2.2550033134526175, + "grad_norm": 11.135329246520996, + "learning_rate": 7.236016812574353e-07, + "loss": 0.145, + "num_input_tokens_seen": 33306424, + "step": 17014 + }, + { + "epoch": 2.2551358515573225, + "grad_norm": 8.121220588684082, + "learning_rate": 7.233574396258589e-07, + "loss": 0.0753, + "num_input_tokens_seen": 33308296, + "step": 17015 + }, + { + "epoch": 2.255268389662028, + "grad_norm": 8.018284797668457, + "learning_rate": 7.231132322491758e-07, + "loss": 0.142, + "num_input_tokens_seen": 33310360, + "step": 17016 + }, + { + "epoch": 2.255400927766733, + "grad_norm": 6.242404937744141, + "learning_rate": 7.228690591320955e-07, + "loss": 0.054, + "num_input_tokens_seen": 33313120, + "step": 17017 + }, + { + "epoch": 2.255533465871438, + "grad_norm": 0.0052351271733641624, + "learning_rate": 7.226249202793264e-07, + "loss": 0.0, + "num_input_tokens_seen": 33314784, + "step": 17018 + }, + { + "epoch": 2.255666003976143, + "grad_norm": 5.323879718780518, + "learning_rate": 7.223808156955752e-07, + "loss": 0.0839, + "num_input_tokens_seen": 33316912, + "step": 17019 + }, + { + "epoch": 2.255798542080848, + "grad_norm": 0.013013062067329884, + "learning_rate": 7.221367453855479e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33318312, + "step": 17020 + }, + { + "epoch": 2.255931080185553, + "grad_norm": 0.5815606713294983, + "learning_rate": 7.21892709353951e-07, + "loss": 0.0028, + "num_input_tokens_seen": 33320152, + "step": 17021 + }, + { + "epoch": 2.2560636182902583, + "grad_norm": 12.585061073303223, + "learning_rate": 7.216487076054881e-07, + "loss": 0.0501, + "num_input_tokens_seen": 33321984, + "step": 17022 + }, + { + "epoch": 2.2561961563949637, + "grad_norm": 9.091734886169434, + "learning_rate": 7.214047401448654e-07, + "loss": 0.0925, + "num_input_tokens_seen": 33324048, + "step": 17023 + }, + { + "epoch": 2.2563286944996688, + "grad_norm": 0.014654953964054585, + "learning_rate": 7.211608069767867e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33326504, + "step": 17024 + }, + { + "epoch": 2.256461232604374, + "grad_norm": 11.473068237304688, + "learning_rate": 7.20916908105955e-07, + "loss": 0.0729, + "num_input_tokens_seen": 33329408, + "step": 17025 + }, + { + "epoch": 2.256593770709079, + "grad_norm": 9.856328010559082, + "learning_rate": 7.206730435370727e-07, + "loss": 0.3116, + "num_input_tokens_seen": 33332712, + "step": 17026 + }, + { + "epoch": 2.256726308813784, + "grad_norm": 3.275170087814331, + "learning_rate": 7.204292132748406e-07, + "loss": 0.028, + "num_input_tokens_seen": 33335248, + "step": 17027 + }, + { + "epoch": 2.256858846918489, + "grad_norm": 13.823945045471191, + "learning_rate": 7.201854173239611e-07, + "loss": 0.0794, + "num_input_tokens_seen": 33337632, + "step": 17028 + }, + { + "epoch": 2.256991385023194, + "grad_norm": 0.018344588577747345, + "learning_rate": 7.199416556891353e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33339544, + "step": 17029 + }, + { + "epoch": 2.2571239231278994, + "grad_norm": 0.004460104275494814, + "learning_rate": 7.196979283750625e-07, + "loss": 0.0, + "num_input_tokens_seen": 33340872, + "step": 17030 + }, + { + "epoch": 2.2572564612326045, + "grad_norm": 0.0099294139072299, + "learning_rate": 7.194542353864417e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33342704, + "step": 17031 + }, + { + "epoch": 2.2573889993373095, + "grad_norm": 10.825511932373047, + "learning_rate": 7.192105767279709e-07, + "loss": 0.0708, + "num_input_tokens_seen": 33345512, + "step": 17032 + }, + { + "epoch": 2.2575215374420146, + "grad_norm": 2.3327972888946533, + "learning_rate": 7.189669524043491e-07, + "loss": 0.0107, + "num_input_tokens_seen": 33347488, + "step": 17033 + }, + { + "epoch": 2.2576540755467196, + "grad_norm": 0.012312586419284344, + "learning_rate": 7.187233624202727e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33349240, + "step": 17034 + }, + { + "epoch": 2.2577866136514246, + "grad_norm": 4.407036781311035, + "learning_rate": 7.184798067804394e-07, + "loss": 0.0186, + "num_input_tokens_seen": 33351416, + "step": 17035 + }, + { + "epoch": 2.2579191517561297, + "grad_norm": 4.566205024719238, + "learning_rate": 7.182362854895445e-07, + "loss": 0.0494, + "num_input_tokens_seen": 33353592, + "step": 17036 + }, + { + "epoch": 2.258051689860835, + "grad_norm": 0.45175901055336, + "learning_rate": 7.17992798552283e-07, + "loss": 0.0014, + "num_input_tokens_seen": 33355712, + "step": 17037 + }, + { + "epoch": 2.25818422796554, + "grad_norm": 9.643722534179688, + "learning_rate": 7.177493459733493e-07, + "loss": 0.0615, + "num_input_tokens_seen": 33357592, + "step": 17038 + }, + { + "epoch": 2.2583167660702452, + "grad_norm": 2.7171199321746826, + "learning_rate": 7.175059277574381e-07, + "loss": 0.013, + "num_input_tokens_seen": 33359768, + "step": 17039 + }, + { + "epoch": 2.2584493041749503, + "grad_norm": 0.007482321932911873, + "learning_rate": 7.172625439092414e-07, + "loss": 0.0, + "num_input_tokens_seen": 33361080, + "step": 17040 + }, + { + "epoch": 2.2585818422796553, + "grad_norm": 3.1417672634124756, + "learning_rate": 7.17019194433454e-07, + "loss": 0.0192, + "num_input_tokens_seen": 33362912, + "step": 17041 + }, + { + "epoch": 2.2587143803843603, + "grad_norm": 1.260101556777954, + "learning_rate": 7.167758793347662e-07, + "loss": 0.007, + "num_input_tokens_seen": 33364456, + "step": 17042 + }, + { + "epoch": 2.2588469184890654, + "grad_norm": 0.04365816339850426, + "learning_rate": 7.16532598617869e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33365776, + "step": 17043 + }, + { + "epoch": 2.258979456593771, + "grad_norm": 2.846550226211548, + "learning_rate": 7.162893522874542e-07, + "loss": 0.0374, + "num_input_tokens_seen": 33367800, + "step": 17044 + }, + { + "epoch": 2.259111994698476, + "grad_norm": 10.436948776245117, + "learning_rate": 7.160461403482116e-07, + "loss": 0.0774, + "num_input_tokens_seen": 33369648, + "step": 17045 + }, + { + "epoch": 2.259244532803181, + "grad_norm": 1.1857064962387085, + "learning_rate": 7.15802962804829e-07, + "loss": 0.0059, + "num_input_tokens_seen": 33371712, + "step": 17046 + }, + { + "epoch": 2.259377070907886, + "grad_norm": 4.265346527099609, + "learning_rate": 7.15559819661997e-07, + "loss": 0.0406, + "num_input_tokens_seen": 33373840, + "step": 17047 + }, + { + "epoch": 2.259509609012591, + "grad_norm": 2.5166361331939697, + "learning_rate": 7.153167109244019e-07, + "loss": 0.0032, + "num_input_tokens_seen": 33375624, + "step": 17048 + }, + { + "epoch": 2.259642147117296, + "grad_norm": 16.485225677490234, + "learning_rate": 7.150736365967326e-07, + "loss": 0.1606, + "num_input_tokens_seen": 33377456, + "step": 17049 + }, + { + "epoch": 2.259774685222001, + "grad_norm": 0.01974525861442089, + "learning_rate": 7.14830596683675e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33379136, + "step": 17050 + }, + { + "epoch": 2.2599072233267066, + "grad_norm": 3.940603494644165, + "learning_rate": 7.145875911899142e-07, + "loss": 0.046, + "num_input_tokens_seen": 33381288, + "step": 17051 + }, + { + "epoch": 2.2600397614314116, + "grad_norm": 4.029561519622803, + "learning_rate": 7.143446201201371e-07, + "loss": 0.0675, + "num_input_tokens_seen": 33383160, + "step": 17052 + }, + { + "epoch": 2.2601722995361166, + "grad_norm": 3.8174209594726562, + "learning_rate": 7.141016834790268e-07, + "loss": 0.0511, + "num_input_tokens_seen": 33385000, + "step": 17053 + }, + { + "epoch": 2.2603048376408217, + "grad_norm": 7.234439373016357, + "learning_rate": 7.138587812712688e-07, + "loss": 0.0859, + "num_input_tokens_seen": 33386880, + "step": 17054 + }, + { + "epoch": 2.2604373757455267, + "grad_norm": 0.0020689249504357576, + "learning_rate": 7.136159135015458e-07, + "loss": 0.0, + "num_input_tokens_seen": 33388736, + "step": 17055 + }, + { + "epoch": 2.2605699138502318, + "grad_norm": 17.782318115234375, + "learning_rate": 7.133730801745403e-07, + "loss": 0.1428, + "num_input_tokens_seen": 33390496, + "step": 17056 + }, + { + "epoch": 2.260702451954937, + "grad_norm": 0.0010639396496117115, + "learning_rate": 7.131302812949336e-07, + "loss": 0.0, + "num_input_tokens_seen": 33392032, + "step": 17057 + }, + { + "epoch": 2.2608349900596423, + "grad_norm": 2.871506690979004, + "learning_rate": 7.128875168674076e-07, + "loss": 0.0456, + "num_input_tokens_seen": 33393448, + "step": 17058 + }, + { + "epoch": 2.2609675281643473, + "grad_norm": 8.979290962219238, + "learning_rate": 7.126447868966441e-07, + "loss": 0.1293, + "num_input_tokens_seen": 33395736, + "step": 17059 + }, + { + "epoch": 2.2611000662690524, + "grad_norm": 1.359527587890625, + "learning_rate": 7.124020913873222e-07, + "loss": 0.0124, + "num_input_tokens_seen": 33397952, + "step": 17060 + }, + { + "epoch": 2.2612326043737574, + "grad_norm": 1.771660327911377, + "learning_rate": 7.121594303441209e-07, + "loss": 0.0221, + "num_input_tokens_seen": 33399672, + "step": 17061 + }, + { + "epoch": 2.2613651424784624, + "grad_norm": 1.9435672760009766, + "learning_rate": 7.119168037717194e-07, + "loss": 0.0082, + "num_input_tokens_seen": 33402112, + "step": 17062 + }, + { + "epoch": 2.2614976805831675, + "grad_norm": 11.306968688964844, + "learning_rate": 7.116742116747946e-07, + "loss": 0.0698, + "num_input_tokens_seen": 33404152, + "step": 17063 + }, + { + "epoch": 2.261630218687873, + "grad_norm": 0.4223754107952118, + "learning_rate": 7.114316540580249e-07, + "loss": 0.0026, + "num_input_tokens_seen": 33406424, + "step": 17064 + }, + { + "epoch": 2.261762756792578, + "grad_norm": 0.0034249506425112486, + "learning_rate": 7.111891309260874e-07, + "loss": 0.0, + "num_input_tokens_seen": 33408680, + "step": 17065 + }, + { + "epoch": 2.261895294897283, + "grad_norm": 6.980777740478516, + "learning_rate": 7.109466422836572e-07, + "loss": 0.0219, + "num_input_tokens_seen": 33410392, + "step": 17066 + }, + { + "epoch": 2.262027833001988, + "grad_norm": 10.331180572509766, + "learning_rate": 7.107041881354105e-07, + "loss": 0.058, + "num_input_tokens_seen": 33412096, + "step": 17067 + }, + { + "epoch": 2.262160371106693, + "grad_norm": 5.07744026184082, + "learning_rate": 7.10461768486021e-07, + "loss": 0.0718, + "num_input_tokens_seen": 33413712, + "step": 17068 + }, + { + "epoch": 2.262292909211398, + "grad_norm": 1.858208417892456, + "learning_rate": 7.102193833401624e-07, + "loss": 0.0205, + "num_input_tokens_seen": 33415520, + "step": 17069 + }, + { + "epoch": 2.2624254473161036, + "grad_norm": 0.005696216598153114, + "learning_rate": 7.099770327025091e-07, + "loss": 0.0, + "num_input_tokens_seen": 33417472, + "step": 17070 + }, + { + "epoch": 2.2625579854208087, + "grad_norm": 0.42151540517807007, + "learning_rate": 7.097347165777337e-07, + "loss": 0.0019, + "num_input_tokens_seen": 33419488, + "step": 17071 + }, + { + "epoch": 2.2626905235255137, + "grad_norm": 0.011717605404555798, + "learning_rate": 7.094924349705085e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33421376, + "step": 17072 + }, + { + "epoch": 2.2628230616302187, + "grad_norm": 1.4437997341156006, + "learning_rate": 7.092501878855043e-07, + "loss": 0.0146, + "num_input_tokens_seen": 33424000, + "step": 17073 + }, + { + "epoch": 2.2629555997349238, + "grad_norm": 8.156682014465332, + "learning_rate": 7.09007975327391e-07, + "loss": 0.0659, + "num_input_tokens_seen": 33425504, + "step": 17074 + }, + { + "epoch": 2.263088137839629, + "grad_norm": 0.010174905881285667, + "learning_rate": 7.087657973008402e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33428224, + "step": 17075 + }, + { + "epoch": 2.263220675944334, + "grad_norm": 0.017349964007735252, + "learning_rate": 7.085236538105197e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33430328, + "step": 17076 + }, + { + "epoch": 2.2633532140490393, + "grad_norm": 2.326019048690796, + "learning_rate": 7.082815448611e-07, + "loss": 0.0141, + "num_input_tokens_seen": 33432640, + "step": 17077 + }, + { + "epoch": 2.2634857521537444, + "grad_norm": 1.7204554080963135, + "learning_rate": 7.080394704572483e-07, + "loss": 0.0089, + "num_input_tokens_seen": 33434496, + "step": 17078 + }, + { + "epoch": 2.2636182902584494, + "grad_norm": 13.064606666564941, + "learning_rate": 7.077974306036306e-07, + "loss": 0.2042, + "num_input_tokens_seen": 33436744, + "step": 17079 + }, + { + "epoch": 2.2637508283631544, + "grad_norm": 7.238364219665527, + "learning_rate": 7.075554253049158e-07, + "loss": 0.1136, + "num_input_tokens_seen": 33438784, + "step": 17080 + }, + { + "epoch": 2.2638833664678595, + "grad_norm": 0.0011723835486918688, + "learning_rate": 7.073134545657681e-07, + "loss": 0.0, + "num_input_tokens_seen": 33440016, + "step": 17081 + }, + { + "epoch": 2.2640159045725645, + "grad_norm": 2.58549165725708, + "learning_rate": 7.070715183908547e-07, + "loss": 0.026, + "num_input_tokens_seen": 33442192, + "step": 17082 + }, + { + "epoch": 2.2641484426772696, + "grad_norm": 0.0011719600297510624, + "learning_rate": 7.06829616784839e-07, + "loss": 0.0, + "num_input_tokens_seen": 33443864, + "step": 17083 + }, + { + "epoch": 2.264280980781975, + "grad_norm": 6.057003498077393, + "learning_rate": 7.065877497523848e-07, + "loss": 0.0824, + "num_input_tokens_seen": 33446200, + "step": 17084 + }, + { + "epoch": 2.26441351888668, + "grad_norm": 0.07533871382474899, + "learning_rate": 7.063459172981568e-07, + "loss": 0.0005, + "num_input_tokens_seen": 33448704, + "step": 17085 + }, + { + "epoch": 2.264546056991385, + "grad_norm": 0.6643083095550537, + "learning_rate": 7.061041194268167e-07, + "loss": 0.0014, + "num_input_tokens_seen": 33449952, + "step": 17086 + }, + { + "epoch": 2.26467859509609, + "grad_norm": 0.0022547622211277485, + "learning_rate": 7.058623561430261e-07, + "loss": 0.0, + "num_input_tokens_seen": 33451216, + "step": 17087 + }, + { + "epoch": 2.264811133200795, + "grad_norm": 0.006819826550781727, + "learning_rate": 7.056206274514476e-07, + "loss": 0.0, + "num_input_tokens_seen": 33452800, + "step": 17088 + }, + { + "epoch": 2.2649436713055002, + "grad_norm": 0.6193521022796631, + "learning_rate": 7.053789333567407e-07, + "loss": 0.0024, + "num_input_tokens_seen": 33454936, + "step": 17089 + }, + { + "epoch": 2.2650762094102053, + "grad_norm": 0.03560137003660202, + "learning_rate": 7.051372738635667e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33456544, + "step": 17090 + }, + { + "epoch": 2.2652087475149107, + "grad_norm": 0.6458995938301086, + "learning_rate": 7.048956489765843e-07, + "loss": 0.0032, + "num_input_tokens_seen": 33458240, + "step": 17091 + }, + { + "epoch": 2.265341285619616, + "grad_norm": 0.011510340496897697, + "learning_rate": 7.046540587004522e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33460168, + "step": 17092 + }, + { + "epoch": 2.265473823724321, + "grad_norm": 5.781800746917725, + "learning_rate": 7.044125030398274e-07, + "loss": 0.0751, + "num_input_tokens_seen": 33462416, + "step": 17093 + }, + { + "epoch": 2.265606361829026, + "grad_norm": 0.3827444314956665, + "learning_rate": 7.041709819993692e-07, + "loss": 0.0021, + "num_input_tokens_seen": 33463584, + "step": 17094 + }, + { + "epoch": 2.265738899933731, + "grad_norm": 0.0319744311273098, + "learning_rate": 7.039294955837325e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33465960, + "step": 17095 + }, + { + "epoch": 2.265871438038436, + "grad_norm": 0.05853966251015663, + "learning_rate": 7.036880437975749e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33467160, + "step": 17096 + }, + { + "epoch": 2.266003976143141, + "grad_norm": 0.000984572572633624, + "learning_rate": 7.034466266455512e-07, + "loss": 0.0, + "num_input_tokens_seen": 33468736, + "step": 17097 + }, + { + "epoch": 2.2661365142478465, + "grad_norm": 3.6588613986968994, + "learning_rate": 7.032052441323159e-07, + "loss": 0.0682, + "num_input_tokens_seen": 33470264, + "step": 17098 + }, + { + "epoch": 2.2662690523525515, + "grad_norm": 0.00226827641017735, + "learning_rate": 7.029638962625221e-07, + "loss": 0.0, + "num_input_tokens_seen": 33472176, + "step": 17099 + }, + { + "epoch": 2.2664015904572565, + "grad_norm": 0.03111991472542286, + "learning_rate": 7.027225830408244e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33473952, + "step": 17100 + }, + { + "epoch": 2.2665341285619616, + "grad_norm": 0.0010720975697040558, + "learning_rate": 7.024813044718756e-07, + "loss": 0.0, + "num_input_tokens_seen": 33475240, + "step": 17101 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.12969861924648285, + "learning_rate": 7.022400605603274e-07, + "loss": 0.0005, + "num_input_tokens_seen": 33476960, + "step": 17102 + }, + { + "epoch": 2.2667992047713716, + "grad_norm": 2.939267635345459, + "learning_rate": 7.019988513108311e-07, + "loss": 0.0157, + "num_input_tokens_seen": 33478912, + "step": 17103 + }, + { + "epoch": 2.2669317428760767, + "grad_norm": 0.008926770649850368, + "learning_rate": 7.017576767280366e-07, + "loss": 0.0, + "num_input_tokens_seen": 33479808, + "step": 17104 + }, + { + "epoch": 2.267064280980782, + "grad_norm": 17.49217987060547, + "learning_rate": 7.015165368165943e-07, + "loss": 0.0523, + "num_input_tokens_seen": 33481952, + "step": 17105 + }, + { + "epoch": 2.267196819085487, + "grad_norm": 5.248671054840088, + "learning_rate": 7.012754315811551e-07, + "loss": 0.0309, + "num_input_tokens_seen": 33483568, + "step": 17106 + }, + { + "epoch": 2.2673293571901922, + "grad_norm": 5.2360029220581055, + "learning_rate": 7.010343610263662e-07, + "loss": 0.0074, + "num_input_tokens_seen": 33485488, + "step": 17107 + }, + { + "epoch": 2.2674618952948973, + "grad_norm": 0.1071607917547226, + "learning_rate": 7.007933251568761e-07, + "loss": 0.0006, + "num_input_tokens_seen": 33487736, + "step": 17108 + }, + { + "epoch": 2.2675944333996023, + "grad_norm": 6.894968509674072, + "learning_rate": 7.005523239773318e-07, + "loss": 0.0496, + "num_input_tokens_seen": 33489752, + "step": 17109 + }, + { + "epoch": 2.2677269715043074, + "grad_norm": 5.43184232711792, + "learning_rate": 7.003113574923792e-07, + "loss": 0.0903, + "num_input_tokens_seen": 33492048, + "step": 17110 + }, + { + "epoch": 2.2678595096090124, + "grad_norm": 18.994386672973633, + "learning_rate": 7.000704257066651e-07, + "loss": 0.3573, + "num_input_tokens_seen": 33495816, + "step": 17111 + }, + { + "epoch": 2.267992047713718, + "grad_norm": 0.03943656384944916, + "learning_rate": 6.998295286248358e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33497576, + "step": 17112 + }, + { + "epoch": 2.268124585818423, + "grad_norm": 8.894815444946289, + "learning_rate": 6.99588666251535e-07, + "loss": 0.0445, + "num_input_tokens_seen": 33500624, + "step": 17113 + }, + { + "epoch": 2.268257123923128, + "grad_norm": 8.034154891967773, + "learning_rate": 6.993478385914068e-07, + "loss": 0.163, + "num_input_tokens_seen": 33503448, + "step": 17114 + }, + { + "epoch": 2.268389662027833, + "grad_norm": 1.729291319847107, + "learning_rate": 6.991070456490937e-07, + "loss": 0.0033, + "num_input_tokens_seen": 33505568, + "step": 17115 + }, + { + "epoch": 2.268522200132538, + "grad_norm": 0.0003514259587973356, + "learning_rate": 6.988662874292399e-07, + "loss": 0.0, + "num_input_tokens_seen": 33506880, + "step": 17116 + }, + { + "epoch": 2.268654738237243, + "grad_norm": 0.053705185651779175, + "learning_rate": 6.986255639364858e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33508824, + "step": 17117 + }, + { + "epoch": 2.268787276341948, + "grad_norm": 0.02403806895017624, + "learning_rate": 6.983848751754743e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33511352, + "step": 17118 + }, + { + "epoch": 2.2689198144466536, + "grad_norm": 7.016064643859863, + "learning_rate": 6.981442211508455e-07, + "loss": 0.0171, + "num_input_tokens_seen": 33512760, + "step": 17119 + }, + { + "epoch": 2.2690523525513586, + "grad_norm": 0.0016484780935570598, + "learning_rate": 6.97903601867238e-07, + "loss": 0.0, + "num_input_tokens_seen": 33514008, + "step": 17120 + }, + { + "epoch": 2.2691848906560637, + "grad_norm": 0.0005891553009860218, + "learning_rate": 6.976630173292934e-07, + "loss": 0.0, + "num_input_tokens_seen": 33515640, + "step": 17121 + }, + { + "epoch": 2.2693174287607687, + "grad_norm": 0.0027264917735010386, + "learning_rate": 6.974224675416492e-07, + "loss": 0.0, + "num_input_tokens_seen": 33516984, + "step": 17122 + }, + { + "epoch": 2.2694499668654737, + "grad_norm": 58.46063995361328, + "learning_rate": 6.971819525089424e-07, + "loss": 0.2015, + "num_input_tokens_seen": 33518608, + "step": 17123 + }, + { + "epoch": 2.2695825049701788, + "grad_norm": 0.3263755738735199, + "learning_rate": 6.96941472235812e-07, + "loss": 0.001, + "num_input_tokens_seen": 33520104, + "step": 17124 + }, + { + "epoch": 2.269715043074884, + "grad_norm": 1.459546446800232, + "learning_rate": 6.96701026726894e-07, + "loss": 0.0166, + "num_input_tokens_seen": 33522696, + "step": 17125 + }, + { + "epoch": 2.2698475811795893, + "grad_norm": 0.0055835675448179245, + "learning_rate": 6.964606159868231e-07, + "loss": 0.0, + "num_input_tokens_seen": 33523824, + "step": 17126 + }, + { + "epoch": 2.2699801192842943, + "grad_norm": 1.5316977500915527, + "learning_rate": 6.962202400202369e-07, + "loss": 0.006, + "num_input_tokens_seen": 33525976, + "step": 17127 + }, + { + "epoch": 2.2701126573889994, + "grad_norm": 2.9116830825805664, + "learning_rate": 6.959798988317687e-07, + "loss": 0.0997, + "num_input_tokens_seen": 33527600, + "step": 17128 + }, + { + "epoch": 2.2702451954937044, + "grad_norm": 0.001662257476709783, + "learning_rate": 6.957395924260519e-07, + "loss": 0.0, + "num_input_tokens_seen": 33528968, + "step": 17129 + }, + { + "epoch": 2.2703777335984094, + "grad_norm": 13.415013313293457, + "learning_rate": 6.954993208077212e-07, + "loss": 0.2684, + "num_input_tokens_seen": 33531096, + "step": 17130 + }, + { + "epoch": 2.2705102717031145, + "grad_norm": 6.666678428649902, + "learning_rate": 6.952590839814075e-07, + "loss": 0.0295, + "num_input_tokens_seen": 33533008, + "step": 17131 + }, + { + "epoch": 2.2706428098078195, + "grad_norm": 1.479818344116211, + "learning_rate": 6.950188819517448e-07, + "loss": 0.0036, + "num_input_tokens_seen": 33534416, + "step": 17132 + }, + { + "epoch": 2.270775347912525, + "grad_norm": 1.9704241752624512, + "learning_rate": 6.947787147233628e-07, + "loss": 0.0255, + "num_input_tokens_seen": 33535944, + "step": 17133 + }, + { + "epoch": 2.27090788601723, + "grad_norm": 10.994791030883789, + "learning_rate": 6.945385823008921e-07, + "loss": 0.1061, + "num_input_tokens_seen": 33538416, + "step": 17134 + }, + { + "epoch": 2.271040424121935, + "grad_norm": 0.4942953288555145, + "learning_rate": 6.942984846889639e-07, + "loss": 0.0011, + "num_input_tokens_seen": 33540640, + "step": 17135 + }, + { + "epoch": 2.27117296222664, + "grad_norm": 0.0034859045408666134, + "learning_rate": 6.940584218922055e-07, + "loss": 0.0, + "num_input_tokens_seen": 33541816, + "step": 17136 + }, + { + "epoch": 2.271305500331345, + "grad_norm": 3.9905009269714355, + "learning_rate": 6.938183939152476e-07, + "loss": 0.0579, + "num_input_tokens_seen": 33543344, + "step": 17137 + }, + { + "epoch": 2.27143803843605, + "grad_norm": 1.971089243888855, + "learning_rate": 6.93578400762717e-07, + "loss": 0.0386, + "num_input_tokens_seen": 33544736, + "step": 17138 + }, + { + "epoch": 2.2715705765407552, + "grad_norm": 0.011892507784068584, + "learning_rate": 6.933384424392411e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33546296, + "step": 17139 + }, + { + "epoch": 2.2717031146454607, + "grad_norm": 2.8978114128112793, + "learning_rate": 6.930985189494455e-07, + "loss": 0.0252, + "num_input_tokens_seen": 33548160, + "step": 17140 + }, + { + "epoch": 2.2718356527501657, + "grad_norm": 0.026118934154510498, + "learning_rate": 6.92858630297957e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33551112, + "step": 17141 + }, + { + "epoch": 2.271968190854871, + "grad_norm": 0.013567560352385044, + "learning_rate": 6.926187764894015e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33552872, + "step": 17142 + }, + { + "epoch": 2.272100728959576, + "grad_norm": 0.000865636277012527, + "learning_rate": 6.923789575284032e-07, + "loss": 0.0, + "num_input_tokens_seen": 33554248, + "step": 17143 + }, + { + "epoch": 2.272233267064281, + "grad_norm": 0.06431833654642105, + "learning_rate": 6.921391734195851e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33556152, + "step": 17144 + }, + { + "epoch": 2.272365805168986, + "grad_norm": 0.014610175043344498, + "learning_rate": 6.918994241675711e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33557872, + "step": 17145 + }, + { + "epoch": 2.2724983432736914, + "grad_norm": 9.243873596191406, + "learning_rate": 6.916597097769828e-07, + "loss": 0.0831, + "num_input_tokens_seen": 33560256, + "step": 17146 + }, + { + "epoch": 2.2726308813783964, + "grad_norm": 1.0632901191711426, + "learning_rate": 6.914200302524429e-07, + "loss": 0.0071, + "num_input_tokens_seen": 33562720, + "step": 17147 + }, + { + "epoch": 2.2727634194831015, + "grad_norm": 0.8729991912841797, + "learning_rate": 6.911803855985732e-07, + "loss": 0.0057, + "num_input_tokens_seen": 33564608, + "step": 17148 + }, + { + "epoch": 2.2728959575878065, + "grad_norm": 4.214803218841553, + "learning_rate": 6.909407758199935e-07, + "loss": 0.0334, + "num_input_tokens_seen": 33566472, + "step": 17149 + }, + { + "epoch": 2.2730284956925115, + "grad_norm": 7.322861671447754, + "learning_rate": 6.907012009213238e-07, + "loss": 0.0863, + "num_input_tokens_seen": 33568248, + "step": 17150 + }, + { + "epoch": 2.2731610337972166, + "grad_norm": 12.07220458984375, + "learning_rate": 6.904616609071829e-07, + "loss": 0.2201, + "num_input_tokens_seen": 33570320, + "step": 17151 + }, + { + "epoch": 2.273293571901922, + "grad_norm": 0.015552198514342308, + "learning_rate": 6.90222155782189e-07, + "loss": 0.0, + "num_input_tokens_seen": 33571584, + "step": 17152 + }, + { + "epoch": 2.273426110006627, + "grad_norm": 0.09032328426837921, + "learning_rate": 6.8998268555096e-07, + "loss": 0.0005, + "num_input_tokens_seen": 33573392, + "step": 17153 + }, + { + "epoch": 2.273558648111332, + "grad_norm": 1.9093289375305176, + "learning_rate": 6.897432502181145e-07, + "loss": 0.012, + "num_input_tokens_seen": 33575624, + "step": 17154 + }, + { + "epoch": 2.273691186216037, + "grad_norm": 0.001122051733545959, + "learning_rate": 6.89503849788268e-07, + "loss": 0.0, + "num_input_tokens_seen": 33577144, + "step": 17155 + }, + { + "epoch": 2.273823724320742, + "grad_norm": 3.0209832191467285, + "learning_rate": 6.892644842660362e-07, + "loss": 0.0154, + "num_input_tokens_seen": 33579280, + "step": 17156 + }, + { + "epoch": 2.2739562624254472, + "grad_norm": 0.7257760167121887, + "learning_rate": 6.890251536560335e-07, + "loss": 0.0045, + "num_input_tokens_seen": 33581336, + "step": 17157 + }, + { + "epoch": 2.2740888005301523, + "grad_norm": 0.005764069501310587, + "learning_rate": 6.887858579628753e-07, + "loss": 0.0, + "num_input_tokens_seen": 33582808, + "step": 17158 + }, + { + "epoch": 2.2742213386348578, + "grad_norm": 0.5104198455810547, + "learning_rate": 6.885465971911762e-07, + "loss": 0.0024, + "num_input_tokens_seen": 33584280, + "step": 17159 + }, + { + "epoch": 2.274353876739563, + "grad_norm": 0.007316774222999811, + "learning_rate": 6.883073713455479e-07, + "loss": 0.0, + "num_input_tokens_seen": 33585656, + "step": 17160 + }, + { + "epoch": 2.274486414844268, + "grad_norm": 0.9329511523246765, + "learning_rate": 6.880681804306036e-07, + "loss": 0.0074, + "num_input_tokens_seen": 33588024, + "step": 17161 + }, + { + "epoch": 2.274618952948973, + "grad_norm": 0.043191444128751755, + "learning_rate": 6.878290244509539e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33589800, + "step": 17162 + }, + { + "epoch": 2.274751491053678, + "grad_norm": 0.42962002754211426, + "learning_rate": 6.875899034112115e-07, + "loss": 0.0024, + "num_input_tokens_seen": 33592000, + "step": 17163 + }, + { + "epoch": 2.274884029158383, + "grad_norm": 5.053799152374268, + "learning_rate": 6.873508173159857e-07, + "loss": 0.0627, + "num_input_tokens_seen": 33594208, + "step": 17164 + }, + { + "epoch": 2.275016567263088, + "grad_norm": 0.026130879297852516, + "learning_rate": 6.87111766169887e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33596808, + "step": 17165 + }, + { + "epoch": 2.2751491053677935, + "grad_norm": 7.647356033325195, + "learning_rate": 6.868727499775241e-07, + "loss": 0.1306, + "num_input_tokens_seen": 33598840, + "step": 17166 + }, + { + "epoch": 2.2752816434724985, + "grad_norm": 0.06074904650449753, + "learning_rate": 6.866337687435046e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33600424, + "step": 17167 + }, + { + "epoch": 2.2754141815772035, + "grad_norm": 1.1922178268432617, + "learning_rate": 6.863948224724379e-07, + "loss": 0.0041, + "num_input_tokens_seen": 33602592, + "step": 17168 + }, + { + "epoch": 2.2755467196819086, + "grad_norm": 0.1906481385231018, + "learning_rate": 6.861559111689303e-07, + "loss": 0.0012, + "num_input_tokens_seen": 33604336, + "step": 17169 + }, + { + "epoch": 2.2756792577866136, + "grad_norm": 0.006680391263216734, + "learning_rate": 6.859170348375871e-07, + "loss": 0.0, + "num_input_tokens_seen": 33605840, + "step": 17170 + }, + { + "epoch": 2.2758117958913187, + "grad_norm": 7.8207106590271, + "learning_rate": 6.856781934830156e-07, + "loss": 0.0804, + "num_input_tokens_seen": 33608032, + "step": 17171 + }, + { + "epoch": 2.2759443339960237, + "grad_norm": 8.933942794799805, + "learning_rate": 6.854393871098195e-07, + "loss": 0.1089, + "num_input_tokens_seen": 33611552, + "step": 17172 + }, + { + "epoch": 2.276076872100729, + "grad_norm": 13.739423751831055, + "learning_rate": 6.852006157226046e-07, + "loss": 0.158, + "num_input_tokens_seen": 33614256, + "step": 17173 + }, + { + "epoch": 2.276209410205434, + "grad_norm": 0.02363816648721695, + "learning_rate": 6.849618793259737e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33616656, + "step": 17174 + }, + { + "epoch": 2.2763419483101393, + "grad_norm": 0.738784670829773, + "learning_rate": 6.8472317792453e-07, + "loss": 0.0029, + "num_input_tokens_seen": 33618368, + "step": 17175 + }, + { + "epoch": 2.2764744864148443, + "grad_norm": 0.012709178030490875, + "learning_rate": 6.844845115228751e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33620192, + "step": 17176 + }, + { + "epoch": 2.2766070245195493, + "grad_norm": 3.3790881633758545, + "learning_rate": 6.84245880125611e-07, + "loss": 0.0358, + "num_input_tokens_seen": 33623224, + "step": 17177 + }, + { + "epoch": 2.2767395626242544, + "grad_norm": 10.765379905700684, + "learning_rate": 6.840072837373398e-07, + "loss": 0.1547, + "num_input_tokens_seen": 33625000, + "step": 17178 + }, + { + "epoch": 2.2768721007289594, + "grad_norm": 0.6045952439308167, + "learning_rate": 6.837687223626612e-07, + "loss": 0.0013, + "num_input_tokens_seen": 33627496, + "step": 17179 + }, + { + "epoch": 2.277004638833665, + "grad_norm": 0.2937418818473816, + "learning_rate": 6.835301960061744e-07, + "loss": 0.0012, + "num_input_tokens_seen": 33629232, + "step": 17180 + }, + { + "epoch": 2.27713717693837, + "grad_norm": 0.7388772964477539, + "learning_rate": 6.832917046724785e-07, + "loss": 0.0018, + "num_input_tokens_seen": 33631200, + "step": 17181 + }, + { + "epoch": 2.277269715043075, + "grad_norm": 2.528766632080078, + "learning_rate": 6.830532483661712e-07, + "loss": 0.0408, + "num_input_tokens_seen": 33633344, + "step": 17182 + }, + { + "epoch": 2.27740225314778, + "grad_norm": 0.02051347680389881, + "learning_rate": 6.828148270918505e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33636488, + "step": 17183 + }, + { + "epoch": 2.277534791252485, + "grad_norm": 0.04230494797229767, + "learning_rate": 6.825764408541144e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33638464, + "step": 17184 + }, + { + "epoch": 2.27766732935719, + "grad_norm": 4.3038787841796875, + "learning_rate": 6.823380896575585e-07, + "loss": 0.0577, + "num_input_tokens_seen": 33640176, + "step": 17185 + }, + { + "epoch": 2.277799867461895, + "grad_norm": 2.9065003395080566, + "learning_rate": 6.820997735067777e-07, + "loss": 0.0167, + "num_input_tokens_seen": 33642520, + "step": 17186 + }, + { + "epoch": 2.2779324055666006, + "grad_norm": 3.8384220600128174, + "learning_rate": 6.818614924063671e-07, + "loss": 0.031, + "num_input_tokens_seen": 33644568, + "step": 17187 + }, + { + "epoch": 2.2780649436713056, + "grad_norm": 0.02768145315349102, + "learning_rate": 6.81623246360921e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33646560, + "step": 17188 + }, + { + "epoch": 2.2781974817760107, + "grad_norm": 0.3521237075328827, + "learning_rate": 6.813850353750337e-07, + "loss": 0.0014, + "num_input_tokens_seen": 33648456, + "step": 17189 + }, + { + "epoch": 2.2783300198807157, + "grad_norm": 0.8258435130119324, + "learning_rate": 6.811468594532977e-07, + "loss": 0.0027, + "num_input_tokens_seen": 33650088, + "step": 17190 + }, + { + "epoch": 2.2784625579854207, + "grad_norm": 4.314850330352783, + "learning_rate": 6.809087186003053e-07, + "loss": 0.0406, + "num_input_tokens_seen": 33653144, + "step": 17191 + }, + { + "epoch": 2.278595096090126, + "grad_norm": 22.659772872924805, + "learning_rate": 6.806706128206473e-07, + "loss": 0.3002, + "num_input_tokens_seen": 33654744, + "step": 17192 + }, + { + "epoch": 2.278727634194831, + "grad_norm": 23.597919464111328, + "learning_rate": 6.804325421189145e-07, + "loss": 0.0992, + "num_input_tokens_seen": 33657344, + "step": 17193 + }, + { + "epoch": 2.2788601722995363, + "grad_norm": 17.917827606201172, + "learning_rate": 6.801945064996973e-07, + "loss": 0.1021, + "num_input_tokens_seen": 33658688, + "step": 17194 + }, + { + "epoch": 2.2789927104042413, + "grad_norm": 12.376590728759766, + "learning_rate": 6.799565059675864e-07, + "loss": 0.0142, + "num_input_tokens_seen": 33661416, + "step": 17195 + }, + { + "epoch": 2.2791252485089464, + "grad_norm": 7.23751974105835, + "learning_rate": 6.797185405271694e-07, + "loss": 0.0291, + "num_input_tokens_seen": 33663792, + "step": 17196 + }, + { + "epoch": 2.2792577866136514, + "grad_norm": 0.008207432925701141, + "learning_rate": 6.794806101830348e-07, + "loss": 0.0, + "num_input_tokens_seen": 33665336, + "step": 17197 + }, + { + "epoch": 2.2793903247183565, + "grad_norm": 6.429232597351074, + "learning_rate": 6.792427149397693e-07, + "loss": 0.0436, + "num_input_tokens_seen": 33667752, + "step": 17198 + }, + { + "epoch": 2.2795228628230615, + "grad_norm": 4.00638484954834, + "learning_rate": 6.790048548019612e-07, + "loss": 0.0528, + "num_input_tokens_seen": 33669544, + "step": 17199 + }, + { + "epoch": 2.2796554009277665, + "grad_norm": 4.34742546081543, + "learning_rate": 6.787670297741947e-07, + "loss": 0.0536, + "num_input_tokens_seen": 33671752, + "step": 17200 + }, + { + "epoch": 2.279787939032472, + "grad_norm": 20.829042434692383, + "learning_rate": 6.785292398610571e-07, + "loss": 0.3345, + "num_input_tokens_seen": 33674176, + "step": 17201 + }, + { + "epoch": 2.279920477137177, + "grad_norm": 20.358360290527344, + "learning_rate": 6.782914850671324e-07, + "loss": 0.1129, + "num_input_tokens_seen": 33676080, + "step": 17202 + }, + { + "epoch": 2.280053015241882, + "grad_norm": 0.4165240526199341, + "learning_rate": 6.780537653970037e-07, + "loss": 0.002, + "num_input_tokens_seen": 33678136, + "step": 17203 + }, + { + "epoch": 2.280185553346587, + "grad_norm": 0.361018568277359, + "learning_rate": 6.778160808552561e-07, + "loss": 0.0022, + "num_input_tokens_seen": 33679800, + "step": 17204 + }, + { + "epoch": 2.280318091451292, + "grad_norm": 6.846236705780029, + "learning_rate": 6.775784314464717e-07, + "loss": 0.0458, + "num_input_tokens_seen": 33681648, + "step": 17205 + }, + { + "epoch": 2.280450629555997, + "grad_norm": 0.1197298914194107, + "learning_rate": 6.773408171752316e-07, + "loss": 0.0006, + "num_input_tokens_seen": 33683120, + "step": 17206 + }, + { + "epoch": 2.2805831676607022, + "grad_norm": 6.323878765106201, + "learning_rate": 6.771032380461184e-07, + "loss": 0.0177, + "num_input_tokens_seen": 33684512, + "step": 17207 + }, + { + "epoch": 2.2807157057654077, + "grad_norm": 8.152327537536621, + "learning_rate": 6.768656940637125e-07, + "loss": 0.1193, + "num_input_tokens_seen": 33686056, + "step": 17208 + }, + { + "epoch": 2.2808482438701128, + "grad_norm": 0.11660125106573105, + "learning_rate": 6.766281852325932e-07, + "loss": 0.0006, + "num_input_tokens_seen": 33687872, + "step": 17209 + }, + { + "epoch": 2.280980781974818, + "grad_norm": 0.023254210129380226, + "learning_rate": 6.763907115573407e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33690192, + "step": 17210 + }, + { + "epoch": 2.281113320079523, + "grad_norm": 3.624354124069214, + "learning_rate": 6.761532730425327e-07, + "loss": 0.0317, + "num_input_tokens_seen": 33692256, + "step": 17211 + }, + { + "epoch": 2.281245858184228, + "grad_norm": 0.04643579199910164, + "learning_rate": 6.759158696927487e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33693376, + "step": 17212 + }, + { + "epoch": 2.281378396288933, + "grad_norm": 0.2431427389383316, + "learning_rate": 6.756785015125649e-07, + "loss": 0.0005, + "num_input_tokens_seen": 33694912, + "step": 17213 + }, + { + "epoch": 2.281510934393638, + "grad_norm": 0.32663822174072266, + "learning_rate": 6.754411685065576e-07, + "loss": 0.0012, + "num_input_tokens_seen": 33696032, + "step": 17214 + }, + { + "epoch": 2.2816434724983434, + "grad_norm": 0.010221016593277454, + "learning_rate": 6.752038706793041e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33697520, + "step": 17215 + }, + { + "epoch": 2.2817760106030485, + "grad_norm": 5.816827774047852, + "learning_rate": 6.749666080353787e-07, + "loss": 0.1025, + "num_input_tokens_seen": 33700296, + "step": 17216 + }, + { + "epoch": 2.2819085487077535, + "grad_norm": 0.02846689522266388, + "learning_rate": 6.747293805793553e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33702616, + "step": 17217 + }, + { + "epoch": 2.2820410868124585, + "grad_norm": 9.767670631408691, + "learning_rate": 6.744921883158092e-07, + "loss": 0.0314, + "num_input_tokens_seen": 33704072, + "step": 17218 + }, + { + "epoch": 2.2821736249171636, + "grad_norm": 7.06099271774292, + "learning_rate": 6.742550312493126e-07, + "loss": 0.119, + "num_input_tokens_seen": 33705896, + "step": 17219 + }, + { + "epoch": 2.2823061630218686, + "grad_norm": 2.4810256958007812, + "learning_rate": 6.740179093844393e-07, + "loss": 0.0113, + "num_input_tokens_seen": 33707768, + "step": 17220 + }, + { + "epoch": 2.2824387011265737, + "grad_norm": 5.996617794036865, + "learning_rate": 6.737808227257603e-07, + "loss": 0.0398, + "num_input_tokens_seen": 33709248, + "step": 17221 + }, + { + "epoch": 2.282571239231279, + "grad_norm": 0.001515745185315609, + "learning_rate": 6.735437712778467e-07, + "loss": 0.0, + "num_input_tokens_seen": 33711072, + "step": 17222 + }, + { + "epoch": 2.282703777335984, + "grad_norm": 27.77338981628418, + "learning_rate": 6.733067550452687e-07, + "loss": 0.0613, + "num_input_tokens_seen": 33713432, + "step": 17223 + }, + { + "epoch": 2.282836315440689, + "grad_norm": 0.020663917064666748, + "learning_rate": 6.730697740325965e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33715088, + "step": 17224 + }, + { + "epoch": 2.2829688535453942, + "grad_norm": 0.8269073963165283, + "learning_rate": 6.728328282444003e-07, + "loss": 0.0031, + "num_input_tokens_seen": 33717072, + "step": 17225 + }, + { + "epoch": 2.2831013916500993, + "grad_norm": 0.0026791368145495653, + "learning_rate": 6.725959176852476e-07, + "loss": 0.0, + "num_input_tokens_seen": 33718352, + "step": 17226 + }, + { + "epoch": 2.2832339297548043, + "grad_norm": 0.3679599463939667, + "learning_rate": 6.723590423597062e-07, + "loss": 0.0022, + "num_input_tokens_seen": 33720272, + "step": 17227 + }, + { + "epoch": 2.2833664678595094, + "grad_norm": 42.52999496459961, + "learning_rate": 6.721222022723431e-07, + "loss": 0.1488, + "num_input_tokens_seen": 33722280, + "step": 17228 + }, + { + "epoch": 2.283499005964215, + "grad_norm": 1.9862474203109741, + "learning_rate": 6.718853974277245e-07, + "loss": 0.0049, + "num_input_tokens_seen": 33724184, + "step": 17229 + }, + { + "epoch": 2.28363154406892, + "grad_norm": 0.30755671858787537, + "learning_rate": 6.716486278304165e-07, + "loss": 0.0009, + "num_input_tokens_seen": 33726280, + "step": 17230 + }, + { + "epoch": 2.283764082173625, + "grad_norm": 15.270941734313965, + "learning_rate": 6.71411893484985e-07, + "loss": 0.1066, + "num_input_tokens_seen": 33728560, + "step": 17231 + }, + { + "epoch": 2.28389662027833, + "grad_norm": 4.877659797668457, + "learning_rate": 6.711751943959938e-07, + "loss": 0.0396, + "num_input_tokens_seen": 33730176, + "step": 17232 + }, + { + "epoch": 2.284029158383035, + "grad_norm": 4.986163139343262, + "learning_rate": 6.709385305680063e-07, + "loss": 0.0268, + "num_input_tokens_seen": 33731800, + "step": 17233 + }, + { + "epoch": 2.28416169648774, + "grad_norm": 0.004596749786287546, + "learning_rate": 6.707019020055849e-07, + "loss": 0.0, + "num_input_tokens_seen": 33733056, + "step": 17234 + }, + { + "epoch": 2.2842942345924455, + "grad_norm": 0.005441832821816206, + "learning_rate": 6.704653087132937e-07, + "loss": 0.0, + "num_input_tokens_seen": 33734616, + "step": 17235 + }, + { + "epoch": 2.2844267726971506, + "grad_norm": 0.09138097614049911, + "learning_rate": 6.702287506956928e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33736160, + "step": 17236 + }, + { + "epoch": 2.2845593108018556, + "grad_norm": 2.222072124481201, + "learning_rate": 6.699922279573445e-07, + "loss": 0.012, + "num_input_tokens_seen": 33738232, + "step": 17237 + }, + { + "epoch": 2.2846918489065606, + "grad_norm": 8.131004333496094, + "learning_rate": 6.697557405028085e-07, + "loss": 0.0893, + "num_input_tokens_seen": 33740632, + "step": 17238 + }, + { + "epoch": 2.2848243870112657, + "grad_norm": 0.1675586998462677, + "learning_rate": 6.695192883366447e-07, + "loss": 0.0009, + "num_input_tokens_seen": 33742424, + "step": 17239 + }, + { + "epoch": 2.2849569251159707, + "grad_norm": 0.0021445814054459333, + "learning_rate": 6.692828714634109e-07, + "loss": 0.0, + "num_input_tokens_seen": 33744584, + "step": 17240 + }, + { + "epoch": 2.285089463220676, + "grad_norm": 3.928619384765625, + "learning_rate": 6.690464898876661e-07, + "loss": 0.0279, + "num_input_tokens_seen": 33746568, + "step": 17241 + }, + { + "epoch": 2.2852220013253812, + "grad_norm": 1.7057392597198486, + "learning_rate": 6.688101436139691e-07, + "loss": 0.0092, + "num_input_tokens_seen": 33748416, + "step": 17242 + }, + { + "epoch": 2.2853545394300863, + "grad_norm": 4.884451389312744, + "learning_rate": 6.685738326468754e-07, + "loss": 0.063, + "num_input_tokens_seen": 33750016, + "step": 17243 + }, + { + "epoch": 2.2854870775347913, + "grad_norm": 2.580627679824829, + "learning_rate": 6.68337556990942e-07, + "loss": 0.0143, + "num_input_tokens_seen": 33752336, + "step": 17244 + }, + { + "epoch": 2.2856196156394963, + "grad_norm": 7.145664215087891, + "learning_rate": 6.681013166507233e-07, + "loss": 0.0333, + "num_input_tokens_seen": 33755384, + "step": 17245 + }, + { + "epoch": 2.2857521537442014, + "grad_norm": 0.000903077598195523, + "learning_rate": 6.678651116307758e-07, + "loss": 0.0, + "num_input_tokens_seen": 33756728, + "step": 17246 + }, + { + "epoch": 2.2858846918489064, + "grad_norm": 0.4160853922367096, + "learning_rate": 6.676289419356521e-07, + "loss": 0.0022, + "num_input_tokens_seen": 33758520, + "step": 17247 + }, + { + "epoch": 2.286017229953612, + "grad_norm": 13.208892822265625, + "learning_rate": 6.673928075699071e-07, + "loss": 0.0622, + "num_input_tokens_seen": 33760368, + "step": 17248 + }, + { + "epoch": 2.286149768058317, + "grad_norm": 0.06685098260641098, + "learning_rate": 6.671567085380932e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33762208, + "step": 17249 + }, + { + "epoch": 2.286282306163022, + "grad_norm": 10.69072151184082, + "learning_rate": 6.669206448447613e-07, + "loss": 0.0501, + "num_input_tokens_seen": 33764400, + "step": 17250 + }, + { + "epoch": 2.286414844267727, + "grad_norm": 1.2607381343841553, + "learning_rate": 6.666846164944649e-07, + "loss": 0.0026, + "num_input_tokens_seen": 33765888, + "step": 17251 + }, + { + "epoch": 2.286547382372432, + "grad_norm": 4.747951507568359, + "learning_rate": 6.664486234917538e-07, + "loss": 0.0309, + "num_input_tokens_seen": 33767912, + "step": 17252 + }, + { + "epoch": 2.286679920477137, + "grad_norm": 0.2989721894264221, + "learning_rate": 6.662126658411775e-07, + "loss": 0.0011, + "num_input_tokens_seen": 33769776, + "step": 17253 + }, + { + "epoch": 2.286812458581842, + "grad_norm": 0.012381265871226788, + "learning_rate": 6.659767435472866e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33772736, + "step": 17254 + }, + { + "epoch": 2.2869449966865476, + "grad_norm": 4.098250865936279, + "learning_rate": 6.657408566146289e-07, + "loss": 0.0235, + "num_input_tokens_seen": 33774912, + "step": 17255 + }, + { + "epoch": 2.2870775347912526, + "grad_norm": 0.8500166535377502, + "learning_rate": 6.655050050477535e-07, + "loss": 0.0052, + "num_input_tokens_seen": 33776888, + "step": 17256 + }, + { + "epoch": 2.2872100728959577, + "grad_norm": 1.0955398082733154, + "learning_rate": 6.65269188851207e-07, + "loss": 0.0048, + "num_input_tokens_seen": 33778176, + "step": 17257 + }, + { + "epoch": 2.2873426110006627, + "grad_norm": 8.449195861816406, + "learning_rate": 6.650334080295365e-07, + "loss": 0.052, + "num_input_tokens_seen": 33780144, + "step": 17258 + }, + { + "epoch": 2.2874751491053678, + "grad_norm": 6.545801162719727, + "learning_rate": 6.647976625872868e-07, + "loss": 0.0737, + "num_input_tokens_seen": 33781968, + "step": 17259 + }, + { + "epoch": 2.287607687210073, + "grad_norm": 6.022459506988525, + "learning_rate": 6.645619525290043e-07, + "loss": 0.0971, + "num_input_tokens_seen": 33783496, + "step": 17260 + }, + { + "epoch": 2.287740225314778, + "grad_norm": 0.01692957989871502, + "learning_rate": 6.643262778592341e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33785808, + "step": 17261 + }, + { + "epoch": 2.2878727634194833, + "grad_norm": 0.012556714937090874, + "learning_rate": 6.6409063858252e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33787776, + "step": 17262 + }, + { + "epoch": 2.2880053015241884, + "grad_norm": 11.481949806213379, + "learning_rate": 6.638550347034045e-07, + "loss": 0.0821, + "num_input_tokens_seen": 33789488, + "step": 17263 + }, + { + "epoch": 2.2881378396288934, + "grad_norm": 0.008078769780695438, + "learning_rate": 6.636194662264303e-07, + "loss": 0.0, + "num_input_tokens_seen": 33790504, + "step": 17264 + }, + { + "epoch": 2.2882703777335984, + "grad_norm": 14.337054252624512, + "learning_rate": 6.633839331561392e-07, + "loss": 0.1183, + "num_input_tokens_seen": 33793288, + "step": 17265 + }, + { + "epoch": 2.2884029158383035, + "grad_norm": 9.385490417480469, + "learning_rate": 6.631484354970741e-07, + "loss": 0.0376, + "num_input_tokens_seen": 33796216, + "step": 17266 + }, + { + "epoch": 2.2885354539430085, + "grad_norm": 8.287616729736328, + "learning_rate": 6.629129732537739e-07, + "loss": 0.0705, + "num_input_tokens_seen": 33798200, + "step": 17267 + }, + { + "epoch": 2.2886679920477135, + "grad_norm": 6.220067977905273, + "learning_rate": 6.626775464307791e-07, + "loss": 0.087, + "num_input_tokens_seen": 33800256, + "step": 17268 + }, + { + "epoch": 2.288800530152419, + "grad_norm": 8.855901718139648, + "learning_rate": 6.62442155032629e-07, + "loss": 0.0838, + "num_input_tokens_seen": 33802768, + "step": 17269 + }, + { + "epoch": 2.288933068257124, + "grad_norm": 0.005684299394488335, + "learning_rate": 6.622067990638605e-07, + "loss": 0.0, + "num_input_tokens_seen": 33804776, + "step": 17270 + }, + { + "epoch": 2.289065606361829, + "grad_norm": 5.73058557510376, + "learning_rate": 6.619714785290132e-07, + "loss": 0.0335, + "num_input_tokens_seen": 33806040, + "step": 17271 + }, + { + "epoch": 2.289198144466534, + "grad_norm": 0.6521172523498535, + "learning_rate": 6.617361934326244e-07, + "loss": 0.0017, + "num_input_tokens_seen": 33808376, + "step": 17272 + }, + { + "epoch": 2.289330682571239, + "grad_norm": 1.0816845893859863, + "learning_rate": 6.615009437792299e-07, + "loss": 0.0024, + "num_input_tokens_seen": 33810080, + "step": 17273 + }, + { + "epoch": 2.289463220675944, + "grad_norm": 5.530097961425781, + "learning_rate": 6.612657295733657e-07, + "loss": 0.0157, + "num_input_tokens_seen": 33811880, + "step": 17274 + }, + { + "epoch": 2.2895957587806492, + "grad_norm": 0.012489169836044312, + "learning_rate": 6.610305508195666e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33813536, + "step": 17275 + }, + { + "epoch": 2.2897282968853547, + "grad_norm": 0.06914237886667252, + "learning_rate": 6.607954075223663e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33816128, + "step": 17276 + }, + { + "epoch": 2.2898608349900598, + "grad_norm": 0.00520706083625555, + "learning_rate": 6.605602996862992e-07, + "loss": 0.0, + "num_input_tokens_seen": 33818088, + "step": 17277 + }, + { + "epoch": 2.289993373094765, + "grad_norm": 1.816707968711853, + "learning_rate": 6.603252273158994e-07, + "loss": 0.0163, + "num_input_tokens_seen": 33820080, + "step": 17278 + }, + { + "epoch": 2.29012591119947, + "grad_norm": 14.06533432006836, + "learning_rate": 6.600901904156984e-07, + "loss": 0.1308, + "num_input_tokens_seen": 33822288, + "step": 17279 + }, + { + "epoch": 2.290258449304175, + "grad_norm": 0.6660693287849426, + "learning_rate": 6.59855188990228e-07, + "loss": 0.0022, + "num_input_tokens_seen": 33824472, + "step": 17280 + }, + { + "epoch": 2.29039098740888, + "grad_norm": 4.842857360839844, + "learning_rate": 6.59620223044018e-07, + "loss": 0.0346, + "num_input_tokens_seen": 33827584, + "step": 17281 + }, + { + "epoch": 2.290523525513585, + "grad_norm": 0.002522021532058716, + "learning_rate": 6.593852925816005e-07, + "loss": 0.0, + "num_input_tokens_seen": 33830136, + "step": 17282 + }, + { + "epoch": 2.2906560636182904, + "grad_norm": 14.116924285888672, + "learning_rate": 6.591503976075034e-07, + "loss": 0.0853, + "num_input_tokens_seen": 33831800, + "step": 17283 + }, + { + "epoch": 2.2907886017229955, + "grad_norm": 0.1627703607082367, + "learning_rate": 6.589155381262577e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33834160, + "step": 17284 + }, + { + "epoch": 2.2909211398277005, + "grad_norm": 4.929999828338623, + "learning_rate": 6.586807141423903e-07, + "loss": 0.0165, + "num_input_tokens_seen": 33835960, + "step": 17285 + }, + { + "epoch": 2.2910536779324056, + "grad_norm": 2.613187074661255, + "learning_rate": 6.584459256604281e-07, + "loss": 0.0136, + "num_input_tokens_seen": 33837936, + "step": 17286 + }, + { + "epoch": 2.2911862160371106, + "grad_norm": 8.95068359375, + "learning_rate": 6.582111726849e-07, + "loss": 0.0701, + "num_input_tokens_seen": 33841376, + "step": 17287 + }, + { + "epoch": 2.2913187541418156, + "grad_norm": 0.3064829409122467, + "learning_rate": 6.579764552203308e-07, + "loss": 0.0021, + "num_input_tokens_seen": 33843752, + "step": 17288 + }, + { + "epoch": 2.2914512922465207, + "grad_norm": 4.905959129333496, + "learning_rate": 6.577417732712458e-07, + "loss": 0.0363, + "num_input_tokens_seen": 33845680, + "step": 17289 + }, + { + "epoch": 2.291583830351226, + "grad_norm": 56.3450813293457, + "learning_rate": 6.575071268421706e-07, + "loss": 0.2672, + "num_input_tokens_seen": 33847416, + "step": 17290 + }, + { + "epoch": 2.291716368455931, + "grad_norm": 0.002270760713145137, + "learning_rate": 6.572725159376286e-07, + "loss": 0.0, + "num_input_tokens_seen": 33848968, + "step": 17291 + }, + { + "epoch": 2.2918489065606362, + "grad_norm": 0.5188500881195068, + "learning_rate": 6.570379405621446e-07, + "loss": 0.005, + "num_input_tokens_seen": 33850560, + "step": 17292 + }, + { + "epoch": 2.2919814446653413, + "grad_norm": 7.926735877990723, + "learning_rate": 6.568034007202403e-07, + "loss": 0.0786, + "num_input_tokens_seen": 33853016, + "step": 17293 + }, + { + "epoch": 2.2921139827700463, + "grad_norm": 0.5771359205245972, + "learning_rate": 6.565688964164374e-07, + "loss": 0.0022, + "num_input_tokens_seen": 33855240, + "step": 17294 + }, + { + "epoch": 2.2922465208747513, + "grad_norm": 0.03052263706922531, + "learning_rate": 6.563344276552586e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33856840, + "step": 17295 + }, + { + "epoch": 2.2923790589794564, + "grad_norm": 0.004999891854822636, + "learning_rate": 6.560999944412238e-07, + "loss": 0.0, + "num_input_tokens_seen": 33858272, + "step": 17296 + }, + { + "epoch": 2.292511597084162, + "grad_norm": 0.029456255957484245, + "learning_rate": 6.558655967788524e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33860792, + "step": 17297 + }, + { + "epoch": 2.292644135188867, + "grad_norm": 0.009995020925998688, + "learning_rate": 6.556312346726654e-07, + "loss": 0.0, + "num_input_tokens_seen": 33862336, + "step": 17298 + }, + { + "epoch": 2.292776673293572, + "grad_norm": 11.427156448364258, + "learning_rate": 6.553969081271805e-07, + "loss": 0.0707, + "num_input_tokens_seen": 33863992, + "step": 17299 + }, + { + "epoch": 2.292909211398277, + "grad_norm": 0.0020285286009311676, + "learning_rate": 6.55162617146915e-07, + "loss": 0.0, + "num_input_tokens_seen": 33865800, + "step": 17300 + }, + { + "epoch": 2.293041749502982, + "grad_norm": 4.617921352386475, + "learning_rate": 6.549283617363875e-07, + "loss": 0.1224, + "num_input_tokens_seen": 33869032, + "step": 17301 + }, + { + "epoch": 2.293174287607687, + "grad_norm": 8.810025215148926, + "learning_rate": 6.546941419001135e-07, + "loss": 0.0938, + "num_input_tokens_seen": 33871104, + "step": 17302 + }, + { + "epoch": 2.293306825712392, + "grad_norm": 6.478766918182373, + "learning_rate": 6.5445995764261e-07, + "loss": 0.0204, + "num_input_tokens_seen": 33873448, + "step": 17303 + }, + { + "epoch": 2.2934393638170976, + "grad_norm": 3.5151381492614746, + "learning_rate": 6.54225808968392e-07, + "loss": 0.0097, + "num_input_tokens_seen": 33876304, + "step": 17304 + }, + { + "epoch": 2.2935719019218026, + "grad_norm": 0.003657284192740917, + "learning_rate": 6.539916958819733e-07, + "loss": 0.0, + "num_input_tokens_seen": 33877808, + "step": 17305 + }, + { + "epoch": 2.2937044400265076, + "grad_norm": 0.00358693883754313, + "learning_rate": 6.53757618387868e-07, + "loss": 0.0, + "num_input_tokens_seen": 33879408, + "step": 17306 + }, + { + "epoch": 2.2938369781312127, + "grad_norm": 12.573284149169922, + "learning_rate": 6.535235764905889e-07, + "loss": 0.2451, + "num_input_tokens_seen": 33881760, + "step": 17307 + }, + { + "epoch": 2.2939695162359177, + "grad_norm": 2.56776762008667, + "learning_rate": 6.532895701946504e-07, + "loss": 0.0287, + "num_input_tokens_seen": 33883376, + "step": 17308 + }, + { + "epoch": 2.2941020543406228, + "grad_norm": 0.7532691955566406, + "learning_rate": 6.530555995045626e-07, + "loss": 0.0051, + "num_input_tokens_seen": 33884928, + "step": 17309 + }, + { + "epoch": 2.294234592445328, + "grad_norm": 4.874903202056885, + "learning_rate": 6.528216644248369e-07, + "loss": 0.0968, + "num_input_tokens_seen": 33886616, + "step": 17310 + }, + { + "epoch": 2.2943671305500333, + "grad_norm": 3.833845615386963, + "learning_rate": 6.52587764959984e-07, + "loss": 0.0647, + "num_input_tokens_seen": 33889248, + "step": 17311 + }, + { + "epoch": 2.2944996686547383, + "grad_norm": 4.895381927490234, + "learning_rate": 6.523539011145128e-07, + "loss": 0.0362, + "num_input_tokens_seen": 33891480, + "step": 17312 + }, + { + "epoch": 2.2946322067594433, + "grad_norm": 4.5541672706604, + "learning_rate": 6.521200728929328e-07, + "loss": 0.0175, + "num_input_tokens_seen": 33893144, + "step": 17313 + }, + { + "epoch": 2.2947647448641484, + "grad_norm": 3.6391258239746094, + "learning_rate": 6.518862802997534e-07, + "loss": 0.0135, + "num_input_tokens_seen": 33895376, + "step": 17314 + }, + { + "epoch": 2.2948972829688534, + "grad_norm": 3.257713794708252, + "learning_rate": 6.516525233394816e-07, + "loss": 0.0188, + "num_input_tokens_seen": 33896896, + "step": 17315 + }, + { + "epoch": 2.2950298210735585, + "grad_norm": 0.002250396180897951, + "learning_rate": 6.51418802016624e-07, + "loss": 0.0, + "num_input_tokens_seen": 33898976, + "step": 17316 + }, + { + "epoch": 2.295162359178264, + "grad_norm": 3.0367236137390137, + "learning_rate": 6.511851163356867e-07, + "loss": 0.0201, + "num_input_tokens_seen": 33901416, + "step": 17317 + }, + { + "epoch": 2.295294897282969, + "grad_norm": 0.1726343184709549, + "learning_rate": 6.509514663011757e-07, + "loss": 0.0007, + "num_input_tokens_seen": 33902760, + "step": 17318 + }, + { + "epoch": 2.295427435387674, + "grad_norm": 0.002547759795561433, + "learning_rate": 6.507178519175966e-07, + "loss": 0.0, + "num_input_tokens_seen": 33905008, + "step": 17319 + }, + { + "epoch": 2.295559973492379, + "grad_norm": 0.0041004871018230915, + "learning_rate": 6.504842731894531e-07, + "loss": 0.0, + "num_input_tokens_seen": 33906432, + "step": 17320 + }, + { + "epoch": 2.295692511597084, + "grad_norm": 0.0021888678893446922, + "learning_rate": 6.502507301212488e-07, + "loss": 0.0, + "num_input_tokens_seen": 33908648, + "step": 17321 + }, + { + "epoch": 2.295825049701789, + "grad_norm": 2.3207578659057617, + "learning_rate": 6.500172227174859e-07, + "loss": 0.0167, + "num_input_tokens_seen": 33910496, + "step": 17322 + }, + { + "epoch": 2.2959575878064946, + "grad_norm": 14.416236877441406, + "learning_rate": 6.497837509826676e-07, + "loss": 0.1378, + "num_input_tokens_seen": 33912680, + "step": 17323 + }, + { + "epoch": 2.2960901259111997, + "grad_norm": 4.982019424438477, + "learning_rate": 6.495503149212945e-07, + "loss": 0.0722, + "num_input_tokens_seen": 33915152, + "step": 17324 + }, + { + "epoch": 2.2962226640159047, + "grad_norm": 3.0305893421173096, + "learning_rate": 6.493169145378686e-07, + "loss": 0.0164, + "num_input_tokens_seen": 33917536, + "step": 17325 + }, + { + "epoch": 2.2963552021206097, + "grad_norm": 2.0287387371063232, + "learning_rate": 6.490835498368892e-07, + "loss": 0.0164, + "num_input_tokens_seen": 33918984, + "step": 17326 + }, + { + "epoch": 2.2964877402253148, + "grad_norm": 1.8714946508407593, + "learning_rate": 6.488502208228559e-07, + "loss": 0.0056, + "num_input_tokens_seen": 33920328, + "step": 17327 + }, + { + "epoch": 2.29662027833002, + "grad_norm": 3.10929012298584, + "learning_rate": 6.486169275002669e-07, + "loss": 0.0195, + "num_input_tokens_seen": 33921904, + "step": 17328 + }, + { + "epoch": 2.296752816434725, + "grad_norm": 2.7655887603759766, + "learning_rate": 6.483836698736213e-07, + "loss": 0.023, + "num_input_tokens_seen": 33924048, + "step": 17329 + }, + { + "epoch": 2.2968853545394303, + "grad_norm": 0.03369118273258209, + "learning_rate": 6.481504479474148e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33926016, + "step": 17330 + }, + { + "epoch": 2.2970178926441354, + "grad_norm": 1.4533121585845947, + "learning_rate": 6.479172617261464e-07, + "loss": 0.0045, + "num_input_tokens_seen": 33927696, + "step": 17331 + }, + { + "epoch": 2.2971504307488404, + "grad_norm": 8.492051124572754, + "learning_rate": 6.476841112143106e-07, + "loss": 0.0806, + "num_input_tokens_seen": 33929656, + "step": 17332 + }, + { + "epoch": 2.2972829688535454, + "grad_norm": 0.09242302924394608, + "learning_rate": 6.474509964164025e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33932552, + "step": 17333 + }, + { + "epoch": 2.2974155069582505, + "grad_norm": 9.955517768859863, + "learning_rate": 6.472179173369178e-07, + "loss": 0.2098, + "num_input_tokens_seen": 33934816, + "step": 17334 + }, + { + "epoch": 2.2975480450629555, + "grad_norm": 4.512258529663086, + "learning_rate": 6.469848739803497e-07, + "loss": 0.0335, + "num_input_tokens_seen": 33937048, + "step": 17335 + }, + { + "epoch": 2.2976805831676606, + "grad_norm": 0.002681705402210355, + "learning_rate": 6.46751866351191e-07, + "loss": 0.0, + "num_input_tokens_seen": 33938520, + "step": 17336 + }, + { + "epoch": 2.297813121272366, + "grad_norm": 0.028903024271130562, + "learning_rate": 6.465188944539356e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33941328, + "step": 17337 + }, + { + "epoch": 2.297945659377071, + "grad_norm": 0.04518900811672211, + "learning_rate": 6.462859582930736e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33943960, + "step": 17338 + }, + { + "epoch": 2.298078197481776, + "grad_norm": 14.531993865966797, + "learning_rate": 6.460530578730981e-07, + "loss": 0.1869, + "num_input_tokens_seen": 33946264, + "step": 17339 + }, + { + "epoch": 2.298210735586481, + "grad_norm": 1.4146671295166016, + "learning_rate": 6.458201931984984e-07, + "loss": 0.0069, + "num_input_tokens_seen": 33948304, + "step": 17340 + }, + { + "epoch": 2.298343273691186, + "grad_norm": 8.857565879821777, + "learning_rate": 6.455873642737645e-07, + "loss": 0.0814, + "num_input_tokens_seen": 33950536, + "step": 17341 + }, + { + "epoch": 2.298475811795891, + "grad_norm": 4.4139204025268555, + "learning_rate": 6.453545711033849e-07, + "loss": 0.0592, + "num_input_tokens_seen": 33952336, + "step": 17342 + }, + { + "epoch": 2.2986083499005963, + "grad_norm": 0.003098440356552601, + "learning_rate": 6.451218136918486e-07, + "loss": 0.0, + "num_input_tokens_seen": 33954080, + "step": 17343 + }, + { + "epoch": 2.2987408880053017, + "grad_norm": 0.04723961278796196, + "learning_rate": 6.44889092043644e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33956408, + "step": 17344 + }, + { + "epoch": 2.298873426110007, + "grad_norm": 0.01167050376534462, + "learning_rate": 6.446564061632576e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33958672, + "step": 17345 + }, + { + "epoch": 2.299005964214712, + "grad_norm": 4.171689033508301, + "learning_rate": 6.444237560551758e-07, + "loss": 0.0231, + "num_input_tokens_seen": 33960352, + "step": 17346 + }, + { + "epoch": 2.299138502319417, + "grad_norm": 11.312682151794434, + "learning_rate": 6.44191141723883e-07, + "loss": 0.1031, + "num_input_tokens_seen": 33962328, + "step": 17347 + }, + { + "epoch": 2.299271040424122, + "grad_norm": 4.8951311111450195, + "learning_rate": 6.439585631738654e-07, + "loss": 0.0783, + "num_input_tokens_seen": 33964296, + "step": 17348 + }, + { + "epoch": 2.299403578528827, + "grad_norm": 0.0019012545235455036, + "learning_rate": 6.43726020409608e-07, + "loss": 0.0, + "num_input_tokens_seen": 33966328, + "step": 17349 + }, + { + "epoch": 2.299536116633532, + "grad_norm": 10.843986511230469, + "learning_rate": 6.434935134355932e-07, + "loss": 0.0733, + "num_input_tokens_seen": 33968160, + "step": 17350 + }, + { + "epoch": 2.2996686547382374, + "grad_norm": 0.03430134057998657, + "learning_rate": 6.432610422563043e-07, + "loss": 0.0002, + "num_input_tokens_seen": 33970848, + "step": 17351 + }, + { + "epoch": 2.2998011928429425, + "grad_norm": 0.8317570686340332, + "learning_rate": 6.430286068762234e-07, + "loss": 0.004, + "num_input_tokens_seen": 33972328, + "step": 17352 + }, + { + "epoch": 2.2999337309476475, + "grad_norm": 0.0016466263914480805, + "learning_rate": 6.427962072998315e-07, + "loss": 0.0, + "num_input_tokens_seen": 33973720, + "step": 17353 + }, + { + "epoch": 2.3000662690523526, + "grad_norm": 0.003895401954650879, + "learning_rate": 6.425638435316098e-07, + "loss": 0.0, + "num_input_tokens_seen": 33975072, + "step": 17354 + }, + { + "epoch": 2.3001988071570576, + "grad_norm": 0.8389959335327148, + "learning_rate": 6.423315155760393e-07, + "loss": 0.0026, + "num_input_tokens_seen": 33976880, + "step": 17355 + }, + { + "epoch": 2.3003313452617626, + "grad_norm": 0.15988609194755554, + "learning_rate": 6.420992234375986e-07, + "loss": 0.0003, + "num_input_tokens_seen": 33978424, + "step": 17356 + }, + { + "epoch": 2.3004638833664677, + "grad_norm": 0.3522300124168396, + "learning_rate": 6.418669671207666e-07, + "loss": 0.0018, + "num_input_tokens_seen": 33980192, + "step": 17357 + }, + { + "epoch": 2.300596421471173, + "grad_norm": 2.64011905528605e-05, + "learning_rate": 6.416347466300216e-07, + "loss": 0.0, + "num_input_tokens_seen": 33981056, + "step": 17358 + }, + { + "epoch": 2.300728959575878, + "grad_norm": 0.010874209925532341, + "learning_rate": 6.414025619698397e-07, + "loss": 0.0, + "num_input_tokens_seen": 33982080, + "step": 17359 + }, + { + "epoch": 2.3008614976805832, + "grad_norm": 52.74601745605469, + "learning_rate": 6.411704131446986e-07, + "loss": 0.7225, + "num_input_tokens_seen": 33984096, + "step": 17360 + }, + { + "epoch": 2.3009940357852883, + "grad_norm": 0.0023135209921747446, + "learning_rate": 6.409383001590752e-07, + "loss": 0.0, + "num_input_tokens_seen": 33985400, + "step": 17361 + }, + { + "epoch": 2.3011265738899933, + "grad_norm": 5.6897196769714355, + "learning_rate": 6.407062230174438e-07, + "loss": 0.0224, + "num_input_tokens_seen": 33986784, + "step": 17362 + }, + { + "epoch": 2.3012591119946983, + "grad_norm": 6.37033224105835, + "learning_rate": 6.40474181724279e-07, + "loss": 0.0405, + "num_input_tokens_seen": 33989128, + "step": 17363 + }, + { + "epoch": 2.3013916500994034, + "grad_norm": 5.6761579513549805, + "learning_rate": 6.402421762840541e-07, + "loss": 0.0201, + "num_input_tokens_seen": 33991496, + "step": 17364 + }, + { + "epoch": 2.301524188204109, + "grad_norm": 3.370162010192871, + "learning_rate": 6.400102067012437e-07, + "loss": 0.0334, + "num_input_tokens_seen": 33993168, + "step": 17365 + }, + { + "epoch": 2.301656726308814, + "grad_norm": 3.429090976715088, + "learning_rate": 6.397782729803189e-07, + "loss": 0.0384, + "num_input_tokens_seen": 33995208, + "step": 17366 + }, + { + "epoch": 2.301789264413519, + "grad_norm": 0.014693419449031353, + "learning_rate": 6.39546375125753e-07, + "loss": 0.0001, + "num_input_tokens_seen": 33996824, + "step": 17367 + }, + { + "epoch": 2.301921802518224, + "grad_norm": 4.771506309509277, + "learning_rate": 6.393145131420167e-07, + "loss": 0.0355, + "num_input_tokens_seen": 33998208, + "step": 17368 + }, + { + "epoch": 2.302054340622929, + "grad_norm": 4.365818500518799, + "learning_rate": 6.390826870335793e-07, + "loss": 0.0469, + "num_input_tokens_seen": 34000552, + "step": 17369 + }, + { + "epoch": 2.302186878727634, + "grad_norm": 0.0018083525355905294, + "learning_rate": 6.388508968049123e-07, + "loss": 0.0, + "num_input_tokens_seen": 34002112, + "step": 17370 + }, + { + "epoch": 2.302319416832339, + "grad_norm": 9.06754207611084, + "learning_rate": 6.386191424604832e-07, + "loss": 0.1127, + "num_input_tokens_seen": 34003632, + "step": 17371 + }, + { + "epoch": 2.3024519549370446, + "grad_norm": 0.02337239868938923, + "learning_rate": 6.383874240047616e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34005224, + "step": 17372 + }, + { + "epoch": 2.3025844930417496, + "grad_norm": 6.863280296325684, + "learning_rate": 6.381557414422151e-07, + "loss": 0.073, + "num_input_tokens_seen": 34007104, + "step": 17373 + }, + { + "epoch": 2.3027170311464547, + "grad_norm": 2.428260326385498, + "learning_rate": 6.379240947773093e-07, + "loss": 0.026, + "num_input_tokens_seen": 34008888, + "step": 17374 + }, + { + "epoch": 2.3028495692511597, + "grad_norm": 6.709317684173584, + "learning_rate": 6.376924840145126e-07, + "loss": 0.0415, + "num_input_tokens_seen": 34010552, + "step": 17375 + }, + { + "epoch": 2.3029821073558647, + "grad_norm": 0.001654457300901413, + "learning_rate": 6.374609091582892e-07, + "loss": 0.0, + "num_input_tokens_seen": 34011992, + "step": 17376 + }, + { + "epoch": 2.3031146454605698, + "grad_norm": 8.586210250854492, + "learning_rate": 6.372293702131038e-07, + "loss": 0.2208, + "num_input_tokens_seen": 34014768, + "step": 17377 + }, + { + "epoch": 2.303247183565275, + "grad_norm": 0.009012611582875252, + "learning_rate": 6.36997867183422e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34017744, + "step": 17378 + }, + { + "epoch": 2.3033797216699803, + "grad_norm": 7.295900821685791, + "learning_rate": 6.367664000737056e-07, + "loss": 0.0199, + "num_input_tokens_seen": 34020088, + "step": 17379 + }, + { + "epoch": 2.3035122597746853, + "grad_norm": 4.810347557067871, + "learning_rate": 6.365349688884193e-07, + "loss": 0.0312, + "num_input_tokens_seen": 34022312, + "step": 17380 + }, + { + "epoch": 2.3036447978793904, + "grad_norm": 10.125640869140625, + "learning_rate": 6.363035736320244e-07, + "loss": 0.2754, + "num_input_tokens_seen": 34023816, + "step": 17381 + }, + { + "epoch": 2.3037773359840954, + "grad_norm": 25.63713836669922, + "learning_rate": 6.360722143089823e-07, + "loss": 0.2223, + "num_input_tokens_seen": 34026080, + "step": 17382 + }, + { + "epoch": 2.3039098740888004, + "grad_norm": 8.104232788085938, + "learning_rate": 6.358408909237529e-07, + "loss": 0.0653, + "num_input_tokens_seen": 34028392, + "step": 17383 + }, + { + "epoch": 2.3040424121935055, + "grad_norm": 0.022738385945558548, + "learning_rate": 6.356096034807979e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34029440, + "step": 17384 + }, + { + "epoch": 2.3041749502982105, + "grad_norm": 5.936624050140381, + "learning_rate": 6.353783519845752e-07, + "loss": 0.0462, + "num_input_tokens_seen": 34031408, + "step": 17385 + }, + { + "epoch": 2.304307488402916, + "grad_norm": 4.948999404907227, + "learning_rate": 6.351471364395448e-07, + "loss": 0.0506, + "num_input_tokens_seen": 34034312, + "step": 17386 + }, + { + "epoch": 2.304440026507621, + "grad_norm": 5.173861026763916, + "learning_rate": 6.349159568501642e-07, + "loss": 0.0325, + "num_input_tokens_seen": 34036928, + "step": 17387 + }, + { + "epoch": 2.304572564612326, + "grad_norm": 0.008088823407888412, + "learning_rate": 6.346848132208907e-07, + "loss": 0.0, + "num_input_tokens_seen": 34039264, + "step": 17388 + }, + { + "epoch": 2.304705102717031, + "grad_norm": 0.00701061962172389, + "learning_rate": 6.344537055561801e-07, + "loss": 0.0, + "num_input_tokens_seen": 34041416, + "step": 17389 + }, + { + "epoch": 2.304837640821736, + "grad_norm": 6.62491512298584, + "learning_rate": 6.34222633860489e-07, + "loss": 0.0709, + "num_input_tokens_seen": 34043184, + "step": 17390 + }, + { + "epoch": 2.304970178926441, + "grad_norm": 8.353791236877441, + "learning_rate": 6.339915981382733e-07, + "loss": 0.0487, + "num_input_tokens_seen": 34045520, + "step": 17391 + }, + { + "epoch": 2.305102717031146, + "grad_norm": 0.5801758170127869, + "learning_rate": 6.337605983939868e-07, + "loss": 0.002, + "num_input_tokens_seen": 34047208, + "step": 17392 + }, + { + "epoch": 2.3052352551358517, + "grad_norm": 0.09612268209457397, + "learning_rate": 6.335296346320835e-07, + "loss": 0.0004, + "num_input_tokens_seen": 34048992, + "step": 17393 + }, + { + "epoch": 2.3053677932405567, + "grad_norm": 4.952916622161865, + "learning_rate": 6.33298706857016e-07, + "loss": 0.0265, + "num_input_tokens_seen": 34051096, + "step": 17394 + }, + { + "epoch": 2.3055003313452618, + "grad_norm": 2.593785285949707, + "learning_rate": 6.330678150732375e-07, + "loss": 0.0081, + "num_input_tokens_seen": 34053120, + "step": 17395 + }, + { + "epoch": 2.305632869449967, + "grad_norm": 0.009736204519867897, + "learning_rate": 6.328369592851991e-07, + "loss": 0.0, + "num_input_tokens_seen": 34055416, + "step": 17396 + }, + { + "epoch": 2.305765407554672, + "grad_norm": 1.550420880317688, + "learning_rate": 6.326061394973526e-07, + "loss": 0.0034, + "num_input_tokens_seen": 34057256, + "step": 17397 + }, + { + "epoch": 2.305897945659377, + "grad_norm": 9.012833595275879, + "learning_rate": 6.323753557141482e-07, + "loss": 0.0569, + "num_input_tokens_seen": 34058736, + "step": 17398 + }, + { + "epoch": 2.306030483764082, + "grad_norm": 0.5295625925064087, + "learning_rate": 6.321446079400353e-07, + "loss": 0.0015, + "num_input_tokens_seen": 34060792, + "step": 17399 + }, + { + "epoch": 2.3061630218687874, + "grad_norm": 0.010379916988313198, + "learning_rate": 6.319138961794622e-07, + "loss": 0.0, + "num_input_tokens_seen": 34062224, + "step": 17400 + }, + { + "epoch": 2.3062955599734924, + "grad_norm": 0.6112679243087769, + "learning_rate": 6.316832204368778e-07, + "loss": 0.0052, + "num_input_tokens_seen": 34063704, + "step": 17401 + }, + { + "epoch": 2.3064280980781975, + "grad_norm": 0.11307967454195023, + "learning_rate": 6.314525807167307e-07, + "loss": 0.0006, + "num_input_tokens_seen": 34065944, + "step": 17402 + }, + { + "epoch": 2.3065606361829025, + "grad_norm": 27.54071044921875, + "learning_rate": 6.312219770234668e-07, + "loss": 0.24, + "num_input_tokens_seen": 34068440, + "step": 17403 + }, + { + "epoch": 2.3066931742876076, + "grad_norm": 0.0004808777302969247, + "learning_rate": 6.309914093615324e-07, + "loss": 0.0, + "num_input_tokens_seen": 34069632, + "step": 17404 + }, + { + "epoch": 2.3068257123923126, + "grad_norm": 1.9486286640167236, + "learning_rate": 6.307608777353721e-07, + "loss": 0.0122, + "num_input_tokens_seen": 34071432, + "step": 17405 + }, + { + "epoch": 2.306958250497018, + "grad_norm": 3.151869535446167, + "learning_rate": 6.305303821494327e-07, + "loss": 0.0369, + "num_input_tokens_seen": 34073680, + "step": 17406 + }, + { + "epoch": 2.307090788601723, + "grad_norm": 0.1785130351781845, + "learning_rate": 6.302999226081563e-07, + "loss": 0.0006, + "num_input_tokens_seen": 34075440, + "step": 17407 + }, + { + "epoch": 2.307223326706428, + "grad_norm": 0.012536933645606041, + "learning_rate": 6.300694991159878e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34076752, + "step": 17408 + }, + { + "epoch": 2.307355864811133, + "grad_norm": 1.7249717712402344, + "learning_rate": 6.298391116773695e-07, + "loss": 0.0086, + "num_input_tokens_seen": 34078056, + "step": 17409 + }, + { + "epoch": 2.3074884029158382, + "grad_norm": 0.023449795320630074, + "learning_rate": 6.296087602967433e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34080032, + "step": 17410 + }, + { + "epoch": 2.3076209410205433, + "grad_norm": 0.009539180435240269, + "learning_rate": 6.293784449785497e-07, + "loss": 0.0, + "num_input_tokens_seen": 34081704, + "step": 17411 + }, + { + "epoch": 2.3077534791252488, + "grad_norm": 9.424128532409668, + "learning_rate": 6.291481657272311e-07, + "loss": 0.0805, + "num_input_tokens_seen": 34083408, + "step": 17412 + }, + { + "epoch": 2.307886017229954, + "grad_norm": 0.021270060911774635, + "learning_rate": 6.289179225472256e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34085712, + "step": 17413 + }, + { + "epoch": 2.308018555334659, + "grad_norm": 0.05455867946147919, + "learning_rate": 6.286877154429741e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34087864, + "step": 17414 + }, + { + "epoch": 2.308151093439364, + "grad_norm": 3.456040620803833, + "learning_rate": 6.284575444189142e-07, + "loss": 0.0066, + "num_input_tokens_seen": 34090424, + "step": 17415 + }, + { + "epoch": 2.308283631544069, + "grad_norm": 5.385505676269531, + "learning_rate": 6.282274094794835e-07, + "loss": 0.0408, + "num_input_tokens_seen": 34092952, + "step": 17416 + }, + { + "epoch": 2.308416169648774, + "grad_norm": 0.07475034892559052, + "learning_rate": 6.279973106291204e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34094800, + "step": 17417 + }, + { + "epoch": 2.308548707753479, + "grad_norm": 0.001812883885577321, + "learning_rate": 6.277672478722605e-07, + "loss": 0.0, + "num_input_tokens_seen": 34096464, + "step": 17418 + }, + { + "epoch": 2.3086812458581845, + "grad_norm": 0.48870426416397095, + "learning_rate": 6.275372212133387e-07, + "loss": 0.0021, + "num_input_tokens_seen": 34097952, + "step": 17419 + }, + { + "epoch": 2.3088137839628895, + "grad_norm": 9.149806022644043, + "learning_rate": 6.273072306567921e-07, + "loss": 0.0641, + "num_input_tokens_seen": 34100152, + "step": 17420 + }, + { + "epoch": 2.3089463220675945, + "grad_norm": 15.399107933044434, + "learning_rate": 6.270772762070529e-07, + "loss": 0.426, + "num_input_tokens_seen": 34102056, + "step": 17421 + }, + { + "epoch": 2.3090788601722996, + "grad_norm": 0.0061037675477564335, + "learning_rate": 6.268473578685569e-07, + "loss": 0.0, + "num_input_tokens_seen": 34103656, + "step": 17422 + }, + { + "epoch": 2.3092113982770046, + "grad_norm": 5.362113952636719, + "learning_rate": 6.266174756457358e-07, + "loss": 0.0199, + "num_input_tokens_seen": 34105312, + "step": 17423 + }, + { + "epoch": 2.3093439363817096, + "grad_norm": 0.015905849635601044, + "learning_rate": 6.263876295430216e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34108112, + "step": 17424 + }, + { + "epoch": 2.3094764744864147, + "grad_norm": 0.21974192559719086, + "learning_rate": 6.261578195648471e-07, + "loss": 0.0008, + "num_input_tokens_seen": 34110136, + "step": 17425 + }, + { + "epoch": 2.30960901259112, + "grad_norm": 7.264702320098877, + "learning_rate": 6.259280457156419e-07, + "loss": 0.0346, + "num_input_tokens_seen": 34112896, + "step": 17426 + }, + { + "epoch": 2.309741550695825, + "grad_norm": 9.846139907836914, + "learning_rate": 6.256983079998375e-07, + "loss": 0.0257, + "num_input_tokens_seen": 34115064, + "step": 17427 + }, + { + "epoch": 2.3098740888005302, + "grad_norm": 8.1863431930542, + "learning_rate": 6.254686064218629e-07, + "loss": 0.0205, + "num_input_tokens_seen": 34116904, + "step": 17428 + }, + { + "epoch": 2.3100066269052353, + "grad_norm": 2.7293477058410645, + "learning_rate": 6.252389409861464e-07, + "loss": 0.0825, + "num_input_tokens_seen": 34119224, + "step": 17429 + }, + { + "epoch": 2.3101391650099403, + "grad_norm": 0.007741585373878479, + "learning_rate": 6.250093116971159e-07, + "loss": 0.0, + "num_input_tokens_seen": 34121072, + "step": 17430 + }, + { + "epoch": 2.3102717031146454, + "grad_norm": 13.848981857299805, + "learning_rate": 6.247797185591994e-07, + "loss": 0.1134, + "num_input_tokens_seen": 34122296, + "step": 17431 + }, + { + "epoch": 2.3104042412193504, + "grad_norm": 0.0018485405016690493, + "learning_rate": 6.245501615768243e-07, + "loss": 0.0, + "num_input_tokens_seen": 34124584, + "step": 17432 + }, + { + "epoch": 2.310536779324056, + "grad_norm": 0.0035898378118872643, + "learning_rate": 6.243206407544158e-07, + "loss": 0.0, + "num_input_tokens_seen": 34126312, + "step": 17433 + }, + { + "epoch": 2.310669317428761, + "grad_norm": 0.7700881958007812, + "learning_rate": 6.240911560963994e-07, + "loss": 0.0014, + "num_input_tokens_seen": 34128536, + "step": 17434 + }, + { + "epoch": 2.310801855533466, + "grad_norm": 0.039068803191185, + "learning_rate": 6.238617076071998e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34130360, + "step": 17435 + }, + { + "epoch": 2.310934393638171, + "grad_norm": 9.645856857299805, + "learning_rate": 6.236322952912397e-07, + "loss": 0.1447, + "num_input_tokens_seen": 34132576, + "step": 17436 + }, + { + "epoch": 2.311066931742876, + "grad_norm": 0.009736839681863785, + "learning_rate": 6.234029191529437e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34134328, + "step": 17437 + }, + { + "epoch": 2.311199469847581, + "grad_norm": 0.0023013781756162643, + "learning_rate": 6.231735791967344e-07, + "loss": 0.0, + "num_input_tokens_seen": 34135768, + "step": 17438 + }, + { + "epoch": 2.311332007952286, + "grad_norm": 0.0032288292422890663, + "learning_rate": 6.229442754270334e-07, + "loss": 0.0, + "num_input_tokens_seen": 34137864, + "step": 17439 + }, + { + "epoch": 2.3114645460569916, + "grad_norm": 12.149410247802734, + "learning_rate": 6.227150078482619e-07, + "loss": 0.1378, + "num_input_tokens_seen": 34139944, + "step": 17440 + }, + { + "epoch": 2.3115970841616966, + "grad_norm": 0.38071051239967346, + "learning_rate": 6.224857764648399e-07, + "loss": 0.0023, + "num_input_tokens_seen": 34142256, + "step": 17441 + }, + { + "epoch": 2.3117296222664017, + "grad_norm": 0.6701205968856812, + "learning_rate": 6.222565812811865e-07, + "loss": 0.0021, + "num_input_tokens_seen": 34144328, + "step": 17442 + }, + { + "epoch": 2.3118621603711067, + "grad_norm": 0.0004493389278650284, + "learning_rate": 6.220274223017217e-07, + "loss": 0.0, + "num_input_tokens_seen": 34145800, + "step": 17443 + }, + { + "epoch": 2.3119946984758117, + "grad_norm": 0.0058547696098685265, + "learning_rate": 6.217982995308647e-07, + "loss": 0.0, + "num_input_tokens_seen": 34147584, + "step": 17444 + }, + { + "epoch": 2.3121272365805168, + "grad_norm": 6.524087905883789, + "learning_rate": 6.215692129730319e-07, + "loss": 0.0741, + "num_input_tokens_seen": 34149656, + "step": 17445 + }, + { + "epoch": 2.312259774685222, + "grad_norm": 0.07347678393125534, + "learning_rate": 6.213401626326407e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34151488, + "step": 17446 + }, + { + "epoch": 2.3123923127899273, + "grad_norm": 5.120787620544434, + "learning_rate": 6.211111485141064e-07, + "loss": 0.0344, + "num_input_tokens_seen": 34153144, + "step": 17447 + }, + { + "epoch": 2.3125248508946323, + "grad_norm": 5.737492561340332, + "learning_rate": 6.208821706218454e-07, + "loss": 0.0298, + "num_input_tokens_seen": 34154944, + "step": 17448 + }, + { + "epoch": 2.3126573889993374, + "grad_norm": 0.05463609844446182, + "learning_rate": 6.206532289602732e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34156928, + "step": 17449 + }, + { + "epoch": 2.3127899271040424, + "grad_norm": 7.841002464294434, + "learning_rate": 6.204243235338031e-07, + "loss": 0.2214, + "num_input_tokens_seen": 34158720, + "step": 17450 + }, + { + "epoch": 2.3129224652087474, + "grad_norm": 7.806709289550781, + "learning_rate": 6.201954543468489e-07, + "loss": 0.1132, + "num_input_tokens_seen": 34161208, + "step": 17451 + }, + { + "epoch": 2.3130550033134525, + "grad_norm": 6.092062473297119, + "learning_rate": 6.199666214038221e-07, + "loss": 0.061, + "num_input_tokens_seen": 34163440, + "step": 17452 + }, + { + "epoch": 2.3131875414181575, + "grad_norm": 12.838902473449707, + "learning_rate": 6.197378247091371e-07, + "loss": 0.219, + "num_input_tokens_seen": 34165624, + "step": 17453 + }, + { + "epoch": 2.313320079522863, + "grad_norm": 8.640066146850586, + "learning_rate": 6.195090642672028e-07, + "loss": 0.1452, + "num_input_tokens_seen": 34168928, + "step": 17454 + }, + { + "epoch": 2.313452617627568, + "grad_norm": 1.8319815397262573, + "learning_rate": 6.192803400824319e-07, + "loss": 0.0304, + "num_input_tokens_seen": 34170888, + "step": 17455 + }, + { + "epoch": 2.313585155732273, + "grad_norm": 15.635564804077148, + "learning_rate": 6.190516521592333e-07, + "loss": 0.2442, + "num_input_tokens_seen": 34173792, + "step": 17456 + }, + { + "epoch": 2.313717693836978, + "grad_norm": 0.3475709557533264, + "learning_rate": 6.18823000502016e-07, + "loss": 0.0011, + "num_input_tokens_seen": 34175000, + "step": 17457 + }, + { + "epoch": 2.313850231941683, + "grad_norm": 10.969120025634766, + "learning_rate": 6.185943851151896e-07, + "loss": 0.077, + "num_input_tokens_seen": 34176984, + "step": 17458 + }, + { + "epoch": 2.313982770046388, + "grad_norm": 9.706406593322754, + "learning_rate": 6.183658060031614e-07, + "loss": 0.0662, + "num_input_tokens_seen": 34179128, + "step": 17459 + }, + { + "epoch": 2.3141153081510932, + "grad_norm": 7.153923511505127, + "learning_rate": 6.181372631703375e-07, + "loss": 0.0451, + "num_input_tokens_seen": 34181608, + "step": 17460 + }, + { + "epoch": 2.3142478462557987, + "grad_norm": 12.300728797912598, + "learning_rate": 6.179087566211264e-07, + "loss": 0.0482, + "num_input_tokens_seen": 34183088, + "step": 17461 + }, + { + "epoch": 2.3143803843605038, + "grad_norm": 6.0571722984313965, + "learning_rate": 6.176802863599321e-07, + "loss": 0.0868, + "num_input_tokens_seen": 34184704, + "step": 17462 + }, + { + "epoch": 2.314512922465209, + "grad_norm": 5.647861480712891, + "learning_rate": 6.174518523911613e-07, + "loss": 0.034, + "num_input_tokens_seen": 34186272, + "step": 17463 + }, + { + "epoch": 2.314645460569914, + "grad_norm": 8.814887046813965, + "learning_rate": 6.172234547192174e-07, + "loss": 0.0987, + "num_input_tokens_seen": 34187792, + "step": 17464 + }, + { + "epoch": 2.314777998674619, + "grad_norm": 1.4750927686691284, + "learning_rate": 6.169950933485042e-07, + "loss": 0.0103, + "num_input_tokens_seen": 34190128, + "step": 17465 + }, + { + "epoch": 2.314910536779324, + "grad_norm": 0.6887134909629822, + "learning_rate": 6.167667682834236e-07, + "loss": 0.0019, + "num_input_tokens_seen": 34192536, + "step": 17466 + }, + { + "epoch": 2.315043074884029, + "grad_norm": 2.6638736724853516, + "learning_rate": 6.165384795283799e-07, + "loss": 0.0078, + "num_input_tokens_seen": 34194648, + "step": 17467 + }, + { + "epoch": 2.3151756129887344, + "grad_norm": 13.005650520324707, + "learning_rate": 6.16310227087773e-07, + "loss": 0.2692, + "num_input_tokens_seen": 34196904, + "step": 17468 + }, + { + "epoch": 2.3153081510934395, + "grad_norm": 11.053622245788574, + "learning_rate": 6.160820109660048e-07, + "loss": 0.1966, + "num_input_tokens_seen": 34199080, + "step": 17469 + }, + { + "epoch": 2.3154406891981445, + "grad_norm": 9.409247398376465, + "learning_rate": 6.158538311674753e-07, + "loss": 0.0866, + "num_input_tokens_seen": 34200928, + "step": 17470 + }, + { + "epoch": 2.3155732273028495, + "grad_norm": 2.0592150688171387, + "learning_rate": 6.156256876965838e-07, + "loss": 0.0075, + "num_input_tokens_seen": 34204216, + "step": 17471 + }, + { + "epoch": 2.3157057654075546, + "grad_norm": 13.114396095275879, + "learning_rate": 6.153975805577281e-07, + "loss": 0.2038, + "num_input_tokens_seen": 34205888, + "step": 17472 + }, + { + "epoch": 2.3158383035122596, + "grad_norm": 0.004014753736555576, + "learning_rate": 6.151695097553073e-07, + "loss": 0.0, + "num_input_tokens_seen": 34207552, + "step": 17473 + }, + { + "epoch": 2.3159708416169646, + "grad_norm": 5.944278240203857, + "learning_rate": 6.149414752937191e-07, + "loss": 0.0299, + "num_input_tokens_seen": 34209816, + "step": 17474 + }, + { + "epoch": 2.31610337972167, + "grad_norm": 0.00847630389034748, + "learning_rate": 6.1471347717736e-07, + "loss": 0.0, + "num_input_tokens_seen": 34211296, + "step": 17475 + }, + { + "epoch": 2.316235917826375, + "grad_norm": 7.452869892120361, + "learning_rate": 6.144855154106256e-07, + "loss": 0.1066, + "num_input_tokens_seen": 34213184, + "step": 17476 + }, + { + "epoch": 2.31636845593108, + "grad_norm": 6.115650177001953, + "learning_rate": 6.142575899979103e-07, + "loss": 0.1479, + "num_input_tokens_seen": 34216040, + "step": 17477 + }, + { + "epoch": 2.3165009940357852, + "grad_norm": 0.02240338735282421, + "learning_rate": 6.140297009436097e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34218040, + "step": 17478 + }, + { + "epoch": 2.3166335321404903, + "grad_norm": 8.943943977355957, + "learning_rate": 6.138018482521183e-07, + "loss": 0.0322, + "num_input_tokens_seen": 34219688, + "step": 17479 + }, + { + "epoch": 2.3167660702451953, + "grad_norm": 0.027780799195170403, + "learning_rate": 6.135740319278285e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34221992, + "step": 17480 + }, + { + "epoch": 2.3168986083499004, + "grad_norm": 7.907403469085693, + "learning_rate": 6.133462519751327e-07, + "loss": 0.0222, + "num_input_tokens_seen": 34223448, + "step": 17481 + }, + { + "epoch": 2.317031146454606, + "grad_norm": 0.031955257058143616, + "learning_rate": 6.131185083984226e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34226208, + "step": 17482 + }, + { + "epoch": 2.317163684559311, + "grad_norm": 0.004978460725396872, + "learning_rate": 6.128908012020887e-07, + "loss": 0.0, + "num_input_tokens_seen": 34227936, + "step": 17483 + }, + { + "epoch": 2.317296222664016, + "grad_norm": 0.023532742634415627, + "learning_rate": 6.12663130390522e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34230144, + "step": 17484 + }, + { + "epoch": 2.317428760768721, + "grad_norm": 5.205182075500488, + "learning_rate": 6.124354959681131e-07, + "loss": 0.093, + "num_input_tokens_seen": 34231816, + "step": 17485 + }, + { + "epoch": 2.317561298873426, + "grad_norm": 0.02311347983777523, + "learning_rate": 6.122078979392498e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34233904, + "step": 17486 + }, + { + "epoch": 2.317693836978131, + "grad_norm": 0.003053916385397315, + "learning_rate": 6.119803363083205e-07, + "loss": 0.0, + "num_input_tokens_seen": 34236184, + "step": 17487 + }, + { + "epoch": 2.317826375082836, + "grad_norm": 5.242100238800049, + "learning_rate": 6.117528110797122e-07, + "loss": 0.0513, + "num_input_tokens_seen": 34238384, + "step": 17488 + }, + { + "epoch": 2.3179589131875415, + "grad_norm": 1.8733874559402466, + "learning_rate": 6.11525322257813e-07, + "loss": 0.0075, + "num_input_tokens_seen": 34241056, + "step": 17489 + }, + { + "epoch": 2.3180914512922466, + "grad_norm": 11.557846069335938, + "learning_rate": 6.112978698470073e-07, + "loss": 0.1964, + "num_input_tokens_seen": 34243896, + "step": 17490 + }, + { + "epoch": 2.3182239893969516, + "grad_norm": 0.04524039849638939, + "learning_rate": 6.110704538516829e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34245288, + "step": 17491 + }, + { + "epoch": 2.3183565275016567, + "grad_norm": 2.667144536972046, + "learning_rate": 6.108430742762231e-07, + "loss": 0.0251, + "num_input_tokens_seen": 34246584, + "step": 17492 + }, + { + "epoch": 2.3184890656063617, + "grad_norm": 5.066293239593506, + "learning_rate": 6.106157311250111e-07, + "loss": 0.0443, + "num_input_tokens_seen": 34248840, + "step": 17493 + }, + { + "epoch": 2.318621603711067, + "grad_norm": 6.315952301025391, + "learning_rate": 6.103884244024322e-07, + "loss": 0.0969, + "num_input_tokens_seen": 34250496, + "step": 17494 + }, + { + "epoch": 2.318754141815772, + "grad_norm": 0.030659189447760582, + "learning_rate": 6.10161154112868e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34252152, + "step": 17495 + }, + { + "epoch": 2.3188866799204773, + "grad_norm": 6.6434645652771, + "learning_rate": 6.099339202606997e-07, + "loss": 0.0696, + "num_input_tokens_seen": 34254248, + "step": 17496 + }, + { + "epoch": 2.3190192180251823, + "grad_norm": 5.905621528625488, + "learning_rate": 6.097067228503101e-07, + "loss": 0.0397, + "num_input_tokens_seen": 34256592, + "step": 17497 + }, + { + "epoch": 2.3191517561298873, + "grad_norm": 0.005513168405741453, + "learning_rate": 6.094795618860789e-07, + "loss": 0.0, + "num_input_tokens_seen": 34258024, + "step": 17498 + }, + { + "epoch": 2.3192842942345924, + "grad_norm": 0.559405505657196, + "learning_rate": 6.092524373723852e-07, + "loss": 0.0017, + "num_input_tokens_seen": 34260064, + "step": 17499 + }, + { + "epoch": 2.3194168323392974, + "grad_norm": 6.910223007202148, + "learning_rate": 6.090253493136098e-07, + "loss": 0.0494, + "num_input_tokens_seen": 34261832, + "step": 17500 + }, + { + "epoch": 2.319549370444003, + "grad_norm": 6.656306266784668, + "learning_rate": 6.087982977141293e-07, + "loss": 0.0762, + "num_input_tokens_seen": 34263784, + "step": 17501 + }, + { + "epoch": 2.319681908548708, + "grad_norm": 0.011607710272073746, + "learning_rate": 6.08571282578323e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34266144, + "step": 17502 + }, + { + "epoch": 2.319814446653413, + "grad_norm": 0.09195312112569809, + "learning_rate": 6.083443039105671e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34267784, + "step": 17503 + }, + { + "epoch": 2.319946984758118, + "grad_norm": 0.049151066690683365, + "learning_rate": 6.081173617152375e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34269048, + "step": 17504 + }, + { + "epoch": 2.320079522862823, + "grad_norm": 1.0855520963668823, + "learning_rate": 6.078904559967108e-07, + "loss": 0.0037, + "num_input_tokens_seen": 34271224, + "step": 17505 + }, + { + "epoch": 2.320212060967528, + "grad_norm": 0.02064576745033264, + "learning_rate": 6.076635867593616e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34272872, + "step": 17506 + }, + { + "epoch": 2.320344599072233, + "grad_norm": 0.24677470326423645, + "learning_rate": 6.074367540075632e-07, + "loss": 0.0006, + "num_input_tokens_seen": 34274416, + "step": 17507 + }, + { + "epoch": 2.3204771371769386, + "grad_norm": 0.006489635445177555, + "learning_rate": 6.072099577456902e-07, + "loss": 0.0, + "num_input_tokens_seen": 34277424, + "step": 17508 + }, + { + "epoch": 2.3206096752816436, + "grad_norm": 0.04955438897013664, + "learning_rate": 6.069831979781146e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34278896, + "step": 17509 + }, + { + "epoch": 2.3207422133863487, + "grad_norm": 0.3823723793029785, + "learning_rate": 6.067564747092095e-07, + "loss": 0.001, + "num_input_tokens_seen": 34281480, + "step": 17510 + }, + { + "epoch": 2.3208747514910537, + "grad_norm": 0.013615868985652924, + "learning_rate": 6.065297879433458e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34283104, + "step": 17511 + }, + { + "epoch": 2.3210072895957587, + "grad_norm": 9.324460983276367, + "learning_rate": 6.063031376848938e-07, + "loss": 0.0176, + "num_input_tokens_seen": 34284752, + "step": 17512 + }, + { + "epoch": 2.321139827700464, + "grad_norm": 0.017228689044713974, + "learning_rate": 6.06076523938223e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34287584, + "step": 17513 + }, + { + "epoch": 2.321272365805169, + "grad_norm": 3.7913687229156494, + "learning_rate": 6.058499467077033e-07, + "loss": 0.0407, + "num_input_tokens_seen": 34289824, + "step": 17514 + }, + { + "epoch": 2.3214049039098743, + "grad_norm": 0.0031578068155795336, + "learning_rate": 6.056234059977043e-07, + "loss": 0.0, + "num_input_tokens_seen": 34291168, + "step": 17515 + }, + { + "epoch": 2.3215374420145793, + "grad_norm": 0.03165988251566887, + "learning_rate": 6.053969018125927e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34293304, + "step": 17516 + }, + { + "epoch": 2.3216699801192844, + "grad_norm": 7.365094184875488, + "learning_rate": 6.051704341567358e-07, + "loss": 0.0518, + "num_input_tokens_seen": 34295616, + "step": 17517 + }, + { + "epoch": 2.3218025182239894, + "grad_norm": 0.009375381283462048, + "learning_rate": 6.049440030345003e-07, + "loss": 0.0, + "num_input_tokens_seen": 34298152, + "step": 17518 + }, + { + "epoch": 2.3219350563286945, + "grad_norm": 8.492448806762695, + "learning_rate": 6.04717608450251e-07, + "loss": 0.0381, + "num_input_tokens_seen": 34299664, + "step": 17519 + }, + { + "epoch": 2.3220675944333995, + "grad_norm": 0.08540981262922287, + "learning_rate": 6.044912504083536e-07, + "loss": 0.0004, + "num_input_tokens_seen": 34301096, + "step": 17520 + }, + { + "epoch": 2.3222001325381045, + "grad_norm": 2.470785140991211, + "learning_rate": 6.042649289131733e-07, + "loss": 0.0222, + "num_input_tokens_seen": 34303216, + "step": 17521 + }, + { + "epoch": 2.32233267064281, + "grad_norm": 1.3713630437850952, + "learning_rate": 6.040386439690729e-07, + "loss": 0.0145, + "num_input_tokens_seen": 34304960, + "step": 17522 + }, + { + "epoch": 2.322465208747515, + "grad_norm": 1.4415295124053955, + "learning_rate": 6.038123955804154e-07, + "loss": 0.0022, + "num_input_tokens_seen": 34306656, + "step": 17523 + }, + { + "epoch": 2.32259774685222, + "grad_norm": 0.616477370262146, + "learning_rate": 6.035861837515633e-07, + "loss": 0.0016, + "num_input_tokens_seen": 34308576, + "step": 17524 + }, + { + "epoch": 2.322730284956925, + "grad_norm": 20.2178897857666, + "learning_rate": 6.033600084868768e-07, + "loss": 0.2979, + "num_input_tokens_seen": 34310992, + "step": 17525 + }, + { + "epoch": 2.32286282306163, + "grad_norm": 19.339744567871094, + "learning_rate": 6.03133869790718e-07, + "loss": 0.0704, + "num_input_tokens_seen": 34313544, + "step": 17526 + }, + { + "epoch": 2.322995361166335, + "grad_norm": 0.0014028099831193686, + "learning_rate": 6.029077676674472e-07, + "loss": 0.0, + "num_input_tokens_seen": 34316032, + "step": 17527 + }, + { + "epoch": 2.3231278992710402, + "grad_norm": 4.5301833152771, + "learning_rate": 6.026817021214237e-07, + "loss": 0.0536, + "num_input_tokens_seen": 34318136, + "step": 17528 + }, + { + "epoch": 2.3232604373757457, + "grad_norm": 6.068673133850098, + "learning_rate": 6.024556731570056e-07, + "loss": 0.1258, + "num_input_tokens_seen": 34320352, + "step": 17529 + }, + { + "epoch": 2.3233929754804508, + "grad_norm": 14.968864440917969, + "learning_rate": 6.022296807785508e-07, + "loss": 0.2476, + "num_input_tokens_seen": 34322648, + "step": 17530 + }, + { + "epoch": 2.323525513585156, + "grad_norm": 0.001745099201798439, + "learning_rate": 6.020037249904164e-07, + "loss": 0.0, + "num_input_tokens_seen": 34324120, + "step": 17531 + }, + { + "epoch": 2.323658051689861, + "grad_norm": 8.776473045349121, + "learning_rate": 6.017778057969609e-07, + "loss": 0.0319, + "num_input_tokens_seen": 34326000, + "step": 17532 + }, + { + "epoch": 2.323790589794566, + "grad_norm": 0.08864894509315491, + "learning_rate": 6.015519232025386e-07, + "loss": 0.0004, + "num_input_tokens_seen": 34327576, + "step": 17533 + }, + { + "epoch": 2.323923127899271, + "grad_norm": 0.003666086122393608, + "learning_rate": 6.013260772115046e-07, + "loss": 0.0, + "num_input_tokens_seen": 34328952, + "step": 17534 + }, + { + "epoch": 2.324055666003976, + "grad_norm": 0.023752545937895775, + "learning_rate": 6.011002678282132e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34332288, + "step": 17535 + }, + { + "epoch": 2.3241882041086814, + "grad_norm": 8.306499481201172, + "learning_rate": 6.008744950570191e-07, + "loss": 0.1092, + "num_input_tokens_seen": 34334576, + "step": 17536 + }, + { + "epoch": 2.3243207422133865, + "grad_norm": 8.844409942626953, + "learning_rate": 6.006487589022744e-07, + "loss": 0.0365, + "num_input_tokens_seen": 34335944, + "step": 17537 + }, + { + "epoch": 2.3244532803180915, + "grad_norm": 0.24395331740379333, + "learning_rate": 6.004230593683327e-07, + "loss": 0.0014, + "num_input_tokens_seen": 34338768, + "step": 17538 + }, + { + "epoch": 2.3245858184227965, + "grad_norm": 0.002414434216916561, + "learning_rate": 6.001973964595448e-07, + "loss": 0.0, + "num_input_tokens_seen": 34339976, + "step": 17539 + }, + { + "epoch": 2.3247183565275016, + "grad_norm": 0.125911682844162, + "learning_rate": 5.99971770180261e-07, + "loss": 0.0007, + "num_input_tokens_seen": 34342448, + "step": 17540 + }, + { + "epoch": 2.3248508946322066, + "grad_norm": 6.829288959503174, + "learning_rate": 5.997461805348329e-07, + "loss": 0.1215, + "num_input_tokens_seen": 34344600, + "step": 17541 + }, + { + "epoch": 2.3249834327369117, + "grad_norm": 5.314018249511719, + "learning_rate": 5.995206275276091e-07, + "loss": 0.0511, + "num_input_tokens_seen": 34346912, + "step": 17542 + }, + { + "epoch": 2.325115970841617, + "grad_norm": 1.6567069292068481, + "learning_rate": 5.992951111629383e-07, + "loss": 0.0018, + "num_input_tokens_seen": 34349424, + "step": 17543 + }, + { + "epoch": 2.325248508946322, + "grad_norm": 0.2776012718677521, + "learning_rate": 5.990696314451697e-07, + "loss": 0.0019, + "num_input_tokens_seen": 34352312, + "step": 17544 + }, + { + "epoch": 2.325381047051027, + "grad_norm": 0.9171609878540039, + "learning_rate": 5.988441883786491e-07, + "loss": 0.0017, + "num_input_tokens_seen": 34354080, + "step": 17545 + }, + { + "epoch": 2.3255135851557323, + "grad_norm": 1.4970611333847046, + "learning_rate": 5.98618781967725e-07, + "loss": 0.0071, + "num_input_tokens_seen": 34355952, + "step": 17546 + }, + { + "epoch": 2.3256461232604373, + "grad_norm": 0.01866193301975727, + "learning_rate": 5.983934122167423e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34357944, + "step": 17547 + }, + { + "epoch": 2.3257786613651423, + "grad_norm": 3.1532461643218994, + "learning_rate": 5.981680791300465e-07, + "loss": 0.0095, + "num_input_tokens_seen": 34359232, + "step": 17548 + }, + { + "epoch": 2.3259111994698474, + "grad_norm": 6.691530704498291, + "learning_rate": 5.979427827119816e-07, + "loss": 0.0979, + "num_input_tokens_seen": 34361424, + "step": 17549 + }, + { + "epoch": 2.326043737574553, + "grad_norm": 0.0038891262374818325, + "learning_rate": 5.977175229668919e-07, + "loss": 0.0, + "num_input_tokens_seen": 34363136, + "step": 17550 + }, + { + "epoch": 2.326176275679258, + "grad_norm": 0.061748627573251724, + "learning_rate": 5.974922998991211e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34364912, + "step": 17551 + }, + { + "epoch": 2.326308813783963, + "grad_norm": 0.21538491547107697, + "learning_rate": 5.972671135130118e-07, + "loss": 0.0011, + "num_input_tokens_seen": 34367016, + "step": 17552 + }, + { + "epoch": 2.326441351888668, + "grad_norm": 0.022838467732071877, + "learning_rate": 5.970419638129046e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34368632, + "step": 17553 + }, + { + "epoch": 2.326573889993373, + "grad_norm": 5.000819206237793, + "learning_rate": 5.968168508031407e-07, + "loss": 0.0393, + "num_input_tokens_seen": 34370168, + "step": 17554 + }, + { + "epoch": 2.326706428098078, + "grad_norm": 3.834779739379883, + "learning_rate": 5.965917744880617e-07, + "loss": 0.0788, + "num_input_tokens_seen": 34372136, + "step": 17555 + }, + { + "epoch": 2.326838966202783, + "grad_norm": 0.0011536399833858013, + "learning_rate": 5.963667348720053e-07, + "loss": 0.0, + "num_input_tokens_seen": 34374408, + "step": 17556 + }, + { + "epoch": 2.3269715043074886, + "grad_norm": 0.04256986454129219, + "learning_rate": 5.961417319593126e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34376312, + "step": 17557 + }, + { + "epoch": 2.3271040424121936, + "grad_norm": 0.032977521419525146, + "learning_rate": 5.959167657543206e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34378272, + "step": 17558 + }, + { + "epoch": 2.3272365805168986, + "grad_norm": 0.0018362957052886486, + "learning_rate": 5.956918362613667e-07, + "loss": 0.0, + "num_input_tokens_seen": 34379568, + "step": 17559 + }, + { + "epoch": 2.3273691186216037, + "grad_norm": 0.31921473145484924, + "learning_rate": 5.954669434847873e-07, + "loss": 0.0011, + "num_input_tokens_seen": 34381752, + "step": 17560 + }, + { + "epoch": 2.3275016567263087, + "grad_norm": 1.303246259689331, + "learning_rate": 5.952420874289192e-07, + "loss": 0.0108, + "num_input_tokens_seen": 34383656, + "step": 17561 + }, + { + "epoch": 2.3276341948310137, + "grad_norm": 4.99975061416626, + "learning_rate": 5.950172680980984e-07, + "loss": 0.035, + "num_input_tokens_seen": 34386096, + "step": 17562 + }, + { + "epoch": 2.327766732935719, + "grad_norm": 0.11590797454118729, + "learning_rate": 5.947924854966589e-07, + "loss": 0.0004, + "num_input_tokens_seen": 34388016, + "step": 17563 + }, + { + "epoch": 2.3278992710404243, + "grad_norm": 0.016185129061341286, + "learning_rate": 5.945677396289345e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34390080, + "step": 17564 + }, + { + "epoch": 2.3280318091451293, + "grad_norm": 8.13571834564209, + "learning_rate": 5.943430304992584e-07, + "loss": 0.0579, + "num_input_tokens_seen": 34392320, + "step": 17565 + }, + { + "epoch": 2.3281643472498343, + "grad_norm": 0.0026644368190318346, + "learning_rate": 5.941183581119628e-07, + "loss": 0.0, + "num_input_tokens_seen": 34393632, + "step": 17566 + }, + { + "epoch": 2.3282968853545394, + "grad_norm": 0.5635635852813721, + "learning_rate": 5.9389372247138e-07, + "loss": 0.0046, + "num_input_tokens_seen": 34395712, + "step": 17567 + }, + { + "epoch": 2.3284294234592444, + "grad_norm": 0.028154077008366585, + "learning_rate": 5.93669123581842e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34397280, + "step": 17568 + }, + { + "epoch": 2.3285619615639495, + "grad_norm": 0.045411452651023865, + "learning_rate": 5.934445614476783e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34400192, + "step": 17569 + }, + { + "epoch": 2.3286944996686545, + "grad_norm": 0.38398876786231995, + "learning_rate": 5.93220036073219e-07, + "loss": 0.0018, + "num_input_tokens_seen": 34402632, + "step": 17570 + }, + { + "epoch": 2.32882703777336, + "grad_norm": 0.02424808032810688, + "learning_rate": 5.929955474627918e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34404312, + "step": 17571 + }, + { + "epoch": 2.328959575878065, + "grad_norm": 0.024519380182027817, + "learning_rate": 5.92771095620727e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34406496, + "step": 17572 + }, + { + "epoch": 2.32909211398277, + "grad_norm": 2.9812142848968506, + "learning_rate": 5.925466805513502e-07, + "loss": 0.0093, + "num_input_tokens_seen": 34408376, + "step": 17573 + }, + { + "epoch": 2.329224652087475, + "grad_norm": 0.002150762127712369, + "learning_rate": 5.9232230225899e-07, + "loss": 0.0, + "num_input_tokens_seen": 34409896, + "step": 17574 + }, + { + "epoch": 2.32935719019218, + "grad_norm": 0.24683372676372528, + "learning_rate": 5.920979607479722e-07, + "loss": 0.0013, + "num_input_tokens_seen": 34411864, + "step": 17575 + }, + { + "epoch": 2.329489728296885, + "grad_norm": 0.06616546958684921, + "learning_rate": 5.918736560226207e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34413440, + "step": 17576 + }, + { + "epoch": 2.3296222664015906, + "grad_norm": 0.39153680205345154, + "learning_rate": 5.916493880872626e-07, + "loss": 0.0008, + "num_input_tokens_seen": 34414840, + "step": 17577 + }, + { + "epoch": 2.3297548045062957, + "grad_norm": 2.3274972438812256, + "learning_rate": 5.914251569462203e-07, + "loss": 0.0064, + "num_input_tokens_seen": 34416264, + "step": 17578 + }, + { + "epoch": 2.3298873426110007, + "grad_norm": 0.011358942836523056, + "learning_rate": 5.912009626038173e-07, + "loss": 0.0, + "num_input_tokens_seen": 34417808, + "step": 17579 + }, + { + "epoch": 2.3300198807157058, + "grad_norm": 1.6097493171691895, + "learning_rate": 5.909768050643774e-07, + "loss": 0.0071, + "num_input_tokens_seen": 34419184, + "step": 17580 + }, + { + "epoch": 2.330152418820411, + "grad_norm": 0.07590187340974808, + "learning_rate": 5.907526843322215e-07, + "loss": 0.0004, + "num_input_tokens_seen": 34421040, + "step": 17581 + }, + { + "epoch": 2.330284956925116, + "grad_norm": 1.1785894632339478, + "learning_rate": 5.905286004116701e-07, + "loss": 0.011, + "num_input_tokens_seen": 34423184, + "step": 17582 + }, + { + "epoch": 2.3304174950298213, + "grad_norm": 0.01239937637001276, + "learning_rate": 5.903045533070453e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34425616, + "step": 17583 + }, + { + "epoch": 2.3305500331345264, + "grad_norm": 4.729865074157715, + "learning_rate": 5.900805430226659e-07, + "loss": 0.0315, + "num_input_tokens_seen": 34427488, + "step": 17584 + }, + { + "epoch": 2.3306825712392314, + "grad_norm": 8.750303268432617, + "learning_rate": 5.898565695628513e-07, + "loss": 0.1421, + "num_input_tokens_seen": 34429720, + "step": 17585 + }, + { + "epoch": 2.3308151093439364, + "grad_norm": 2.342752456665039, + "learning_rate": 5.896326329319202e-07, + "loss": 0.0149, + "num_input_tokens_seen": 34431872, + "step": 17586 + }, + { + "epoch": 2.3309476474486415, + "grad_norm": 0.30999332666397095, + "learning_rate": 5.894087331341888e-07, + "loss": 0.0011, + "num_input_tokens_seen": 34433552, + "step": 17587 + }, + { + "epoch": 2.3310801855533465, + "grad_norm": 0.008141516707837582, + "learning_rate": 5.891848701739761e-07, + "loss": 0.0, + "num_input_tokens_seen": 34435480, + "step": 17588 + }, + { + "epoch": 2.3312127236580515, + "grad_norm": 0.00680879782885313, + "learning_rate": 5.88961044055597e-07, + "loss": 0.0, + "num_input_tokens_seen": 34438048, + "step": 17589 + }, + { + "epoch": 2.331345261762757, + "grad_norm": 2.4601306915283203, + "learning_rate": 5.88737254783367e-07, + "loss": 0.009, + "num_input_tokens_seen": 34439640, + "step": 17590 + }, + { + "epoch": 2.331477799867462, + "grad_norm": 15.724160194396973, + "learning_rate": 5.885135023616018e-07, + "loss": 0.2631, + "num_input_tokens_seen": 34441520, + "step": 17591 + }, + { + "epoch": 2.331610337972167, + "grad_norm": 0.8926312923431396, + "learning_rate": 5.882897867946142e-07, + "loss": 0.0032, + "num_input_tokens_seen": 34443424, + "step": 17592 + }, + { + "epoch": 2.331742876076872, + "grad_norm": 0.7517162561416626, + "learning_rate": 5.880661080867189e-07, + "loss": 0.002, + "num_input_tokens_seen": 34445096, + "step": 17593 + }, + { + "epoch": 2.331875414181577, + "grad_norm": 6.381129741668701, + "learning_rate": 5.87842466242228e-07, + "loss": 0.198, + "num_input_tokens_seen": 34448240, + "step": 17594 + }, + { + "epoch": 2.332007952286282, + "grad_norm": 0.0035911162849515676, + "learning_rate": 5.876188612654535e-07, + "loss": 0.0, + "num_input_tokens_seen": 34450992, + "step": 17595 + }, + { + "epoch": 2.3321404903909873, + "grad_norm": 0.4697306454181671, + "learning_rate": 5.87395293160706e-07, + "loss": 0.0012, + "num_input_tokens_seen": 34452304, + "step": 17596 + }, + { + "epoch": 2.3322730284956927, + "grad_norm": 8.574317932128906, + "learning_rate": 5.87171761932297e-07, + "loss": 0.0329, + "num_input_tokens_seen": 34454016, + "step": 17597 + }, + { + "epoch": 2.3324055666003978, + "grad_norm": 9.321894645690918, + "learning_rate": 5.869482675845362e-07, + "loss": 0.0835, + "num_input_tokens_seen": 34455512, + "step": 17598 + }, + { + "epoch": 2.332538104705103, + "grad_norm": 4.100316047668457, + "learning_rate": 5.86724810121733e-07, + "loss": 0.0285, + "num_input_tokens_seen": 34457192, + "step": 17599 + }, + { + "epoch": 2.332670642809808, + "grad_norm": 0.007517335936427116, + "learning_rate": 5.865013895481949e-07, + "loss": 0.0, + "num_input_tokens_seen": 34459344, + "step": 17600 + }, + { + "epoch": 2.332803180914513, + "grad_norm": 0.024029167369008064, + "learning_rate": 5.862780058682304e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34461416, + "step": 17601 + }, + { + "epoch": 2.332935719019218, + "grad_norm": 1.6193987131118774, + "learning_rate": 5.860546590861454e-07, + "loss": 0.0059, + "num_input_tokens_seen": 34464192, + "step": 17602 + }, + { + "epoch": 2.333068257123923, + "grad_norm": 0.6515053510665894, + "learning_rate": 5.85831349206247e-07, + "loss": 0.003, + "num_input_tokens_seen": 34466248, + "step": 17603 + }, + { + "epoch": 2.3332007952286284, + "grad_norm": 2.240462303161621, + "learning_rate": 5.856080762328411e-07, + "loss": 0.0097, + "num_input_tokens_seen": 34468264, + "step": 17604 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.0017234095139428973, + "learning_rate": 5.853848401702326e-07, + "loss": 0.0, + "num_input_tokens_seen": 34469696, + "step": 17605 + }, + { + "epoch": 2.3334658714380385, + "grad_norm": 13.389839172363281, + "learning_rate": 5.851616410227248e-07, + "loss": 0.074, + "num_input_tokens_seen": 34471480, + "step": 17606 + }, + { + "epoch": 2.3335984095427436, + "grad_norm": 7.411147117614746, + "learning_rate": 5.84938478794621e-07, + "loss": 0.1003, + "num_input_tokens_seen": 34474000, + "step": 17607 + }, + { + "epoch": 2.3337309476474486, + "grad_norm": 3.7476766109466553, + "learning_rate": 5.847153534902244e-07, + "loss": 0.0121, + "num_input_tokens_seen": 34476152, + "step": 17608 + }, + { + "epoch": 2.3338634857521536, + "grad_norm": 8.368447303771973, + "learning_rate": 5.844922651138379e-07, + "loss": 0.0476, + "num_input_tokens_seen": 34477408, + "step": 17609 + }, + { + "epoch": 2.3339960238568587, + "grad_norm": 0.0031103440560400486, + "learning_rate": 5.84269213669762e-07, + "loss": 0.0, + "num_input_tokens_seen": 34479064, + "step": 17610 + }, + { + "epoch": 2.334128561961564, + "grad_norm": 1.5398308038711548, + "learning_rate": 5.840461991622973e-07, + "loss": 0.0027, + "num_input_tokens_seen": 34480920, + "step": 17611 + }, + { + "epoch": 2.334261100066269, + "grad_norm": 1.2834726572036743, + "learning_rate": 5.838232215957435e-07, + "loss": 0.0048, + "num_input_tokens_seen": 34482728, + "step": 17612 + }, + { + "epoch": 2.3343936381709742, + "grad_norm": 11.526973724365234, + "learning_rate": 5.83600280974399e-07, + "loss": 0.1979, + "num_input_tokens_seen": 34485888, + "step": 17613 + }, + { + "epoch": 2.3345261762756793, + "grad_norm": 0.0016845462378114462, + "learning_rate": 5.833773773025637e-07, + "loss": 0.0, + "num_input_tokens_seen": 34487272, + "step": 17614 + }, + { + "epoch": 2.3346587143803843, + "grad_norm": 1.169955849647522, + "learning_rate": 5.831545105845348e-07, + "loss": 0.0036, + "num_input_tokens_seen": 34489872, + "step": 17615 + }, + { + "epoch": 2.3347912524850893, + "grad_norm": 0.0023731132969260216, + "learning_rate": 5.829316808246097e-07, + "loss": 0.0, + "num_input_tokens_seen": 34491024, + "step": 17616 + }, + { + "epoch": 2.3349237905897944, + "grad_norm": 0.0006979045574553311, + "learning_rate": 5.827088880270843e-07, + "loss": 0.0, + "num_input_tokens_seen": 34492744, + "step": 17617 + }, + { + "epoch": 2.3350563286945, + "grad_norm": 30.849979400634766, + "learning_rate": 5.824861321962533e-07, + "loss": 0.1466, + "num_input_tokens_seen": 34494392, + "step": 17618 + }, + { + "epoch": 2.335188866799205, + "grad_norm": 0.17767080664634705, + "learning_rate": 5.822634133364133e-07, + "loss": 0.0005, + "num_input_tokens_seen": 34495960, + "step": 17619 + }, + { + "epoch": 2.33532140490391, + "grad_norm": 1.834625244140625, + "learning_rate": 5.82040731451857e-07, + "loss": 0.0034, + "num_input_tokens_seen": 34498224, + "step": 17620 + }, + { + "epoch": 2.335453943008615, + "grad_norm": 0.0036174836568534374, + "learning_rate": 5.818180865468792e-07, + "loss": 0.0, + "num_input_tokens_seen": 34501312, + "step": 17621 + }, + { + "epoch": 2.33558648111332, + "grad_norm": 1.0485659837722778, + "learning_rate": 5.815954786257721e-07, + "loss": 0.0067, + "num_input_tokens_seen": 34503248, + "step": 17622 + }, + { + "epoch": 2.335719019218025, + "grad_norm": 11.614334106445312, + "learning_rate": 5.813729076928268e-07, + "loss": 0.2444, + "num_input_tokens_seen": 34505016, + "step": 17623 + }, + { + "epoch": 2.33585155732273, + "grad_norm": 0.011938834562897682, + "learning_rate": 5.811503737523361e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34506968, + "step": 17624 + }, + { + "epoch": 2.3359840954274356, + "grad_norm": 0.06251510977745056, + "learning_rate": 5.809278768085902e-07, + "loss": 0.0004, + "num_input_tokens_seen": 34508808, + "step": 17625 + }, + { + "epoch": 2.3361166335321406, + "grad_norm": 0.004024461843073368, + "learning_rate": 5.807054168658777e-07, + "loss": 0.0, + "num_input_tokens_seen": 34510120, + "step": 17626 + }, + { + "epoch": 2.3362491716368456, + "grad_norm": 0.038239527493715286, + "learning_rate": 5.804829939284898e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34512520, + "step": 17627 + }, + { + "epoch": 2.3363817097415507, + "grad_norm": 0.5902060270309448, + "learning_rate": 5.802606080007131e-07, + "loss": 0.0031, + "num_input_tokens_seen": 34515128, + "step": 17628 + }, + { + "epoch": 2.3365142478462557, + "grad_norm": 6.013433456420898, + "learning_rate": 5.80038259086837e-07, + "loss": 0.0362, + "num_input_tokens_seen": 34516472, + "step": 17629 + }, + { + "epoch": 2.3366467859509608, + "grad_norm": 0.7240469455718994, + "learning_rate": 5.798159471911477e-07, + "loss": 0.0019, + "num_input_tokens_seen": 34519112, + "step": 17630 + }, + { + "epoch": 2.336779324055666, + "grad_norm": 10.085693359375, + "learning_rate": 5.795936723179315e-07, + "loss": 0.0955, + "num_input_tokens_seen": 34521000, + "step": 17631 + }, + { + "epoch": 2.3369118621603713, + "grad_norm": 0.003768005408346653, + "learning_rate": 5.793714344714735e-07, + "loss": 0.0, + "num_input_tokens_seen": 34523176, + "step": 17632 + }, + { + "epoch": 2.3370444002650763, + "grad_norm": 0.0018901200965046883, + "learning_rate": 5.791492336560595e-07, + "loss": 0.0, + "num_input_tokens_seen": 34525432, + "step": 17633 + }, + { + "epoch": 2.3371769383697814, + "grad_norm": 0.00073260284261778, + "learning_rate": 5.789270698759739e-07, + "loss": 0.0, + "num_input_tokens_seen": 34526752, + "step": 17634 + }, + { + "epoch": 2.3373094764744864, + "grad_norm": 0.0010563539108261466, + "learning_rate": 5.787049431354996e-07, + "loss": 0.0, + "num_input_tokens_seen": 34528248, + "step": 17635 + }, + { + "epoch": 2.3374420145791914, + "grad_norm": 4.916848182678223, + "learning_rate": 5.784828534389195e-07, + "loss": 0.0468, + "num_input_tokens_seen": 34529840, + "step": 17636 + }, + { + "epoch": 2.3375745526838965, + "grad_norm": 6.7946014404296875, + "learning_rate": 5.782608007905147e-07, + "loss": 0.0493, + "num_input_tokens_seen": 34532160, + "step": 17637 + }, + { + "epoch": 2.3377070907886015, + "grad_norm": 0.02789085917174816, + "learning_rate": 5.780387851945684e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34534480, + "step": 17638 + }, + { + "epoch": 2.337839628893307, + "grad_norm": 0.0032948346342891455, + "learning_rate": 5.778168066553594e-07, + "loss": 0.0, + "num_input_tokens_seen": 34536216, + "step": 17639 + }, + { + "epoch": 2.337972166998012, + "grad_norm": 6.148429870605469, + "learning_rate": 5.77594865177169e-07, + "loss": 0.0682, + "num_input_tokens_seen": 34537800, + "step": 17640 + }, + { + "epoch": 2.338104705102717, + "grad_norm": 1.0397987365722656, + "learning_rate": 5.773729607642758e-07, + "loss": 0.0017, + "num_input_tokens_seen": 34538984, + "step": 17641 + }, + { + "epoch": 2.338237243207422, + "grad_norm": 0.0028499579057097435, + "learning_rate": 5.771510934209584e-07, + "loss": 0.0, + "num_input_tokens_seen": 34541104, + "step": 17642 + }, + { + "epoch": 2.338369781312127, + "grad_norm": 7.302776336669922, + "learning_rate": 5.769292631514936e-07, + "loss": 0.0455, + "num_input_tokens_seen": 34543216, + "step": 17643 + }, + { + "epoch": 2.338502319416832, + "grad_norm": 0.0008341590291820467, + "learning_rate": 5.767074699601594e-07, + "loss": 0.0, + "num_input_tokens_seen": 34544688, + "step": 17644 + }, + { + "epoch": 2.338634857521537, + "grad_norm": 13.567420959472656, + "learning_rate": 5.764857138512328e-07, + "loss": 0.111, + "num_input_tokens_seen": 34546616, + "step": 17645 + }, + { + "epoch": 2.3387673956262427, + "grad_norm": 0.006198303308337927, + "learning_rate": 5.762639948289883e-07, + "loss": 0.0, + "num_input_tokens_seen": 34548192, + "step": 17646 + }, + { + "epoch": 2.3388999337309477, + "grad_norm": 0.005080592352896929, + "learning_rate": 5.760423128977013e-07, + "loss": 0.0, + "num_input_tokens_seen": 34549656, + "step": 17647 + }, + { + "epoch": 2.3390324718356528, + "grad_norm": 0.016734832897782326, + "learning_rate": 5.758206680616458e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34551456, + "step": 17648 + }, + { + "epoch": 2.339165009940358, + "grad_norm": 0.006007571239024401, + "learning_rate": 5.755990603250943e-07, + "loss": 0.0, + "num_input_tokens_seen": 34552504, + "step": 17649 + }, + { + "epoch": 2.339297548045063, + "grad_norm": 0.012889403849840164, + "learning_rate": 5.753774896923206e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34554592, + "step": 17650 + }, + { + "epoch": 2.339430086149768, + "grad_norm": 20.680400848388672, + "learning_rate": 5.751559561675973e-07, + "loss": 0.2099, + "num_input_tokens_seen": 34557712, + "step": 17651 + }, + { + "epoch": 2.339562624254473, + "grad_norm": 2.7195699214935303, + "learning_rate": 5.749344597551954e-07, + "loss": 0.0184, + "num_input_tokens_seen": 34560080, + "step": 17652 + }, + { + "epoch": 2.3396951623591784, + "grad_norm": 8.803916931152344, + "learning_rate": 5.747130004593848e-07, + "loss": 0.0801, + "num_input_tokens_seen": 34562064, + "step": 17653 + }, + { + "epoch": 2.3398277004638834, + "grad_norm": 0.022728994488716125, + "learning_rate": 5.744915782844352e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34563544, + "step": 17654 + }, + { + "epoch": 2.3399602385685885, + "grad_norm": 0.7523548603057861, + "learning_rate": 5.742701932346167e-07, + "loss": 0.0017, + "num_input_tokens_seen": 34565216, + "step": 17655 + }, + { + "epoch": 2.3400927766732935, + "grad_norm": 0.005428750533610582, + "learning_rate": 5.740488453141971e-07, + "loss": 0.0, + "num_input_tokens_seen": 34566696, + "step": 17656 + }, + { + "epoch": 2.3402253147779986, + "grad_norm": 0.010625329799950123, + "learning_rate": 5.738275345274447e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34569632, + "step": 17657 + }, + { + "epoch": 2.3403578528827036, + "grad_norm": 1.113126516342163, + "learning_rate": 5.736062608786267e-07, + "loss": 0.0026, + "num_input_tokens_seen": 34572264, + "step": 17658 + }, + { + "epoch": 2.3404903909874086, + "grad_norm": 1.1560052633285522, + "learning_rate": 5.73385024372008e-07, + "loss": 0.0083, + "num_input_tokens_seen": 34574368, + "step": 17659 + }, + { + "epoch": 2.340622929092114, + "grad_norm": 0.0013780479785054922, + "learning_rate": 5.731638250118556e-07, + "loss": 0.0, + "num_input_tokens_seen": 34575560, + "step": 17660 + }, + { + "epoch": 2.340755467196819, + "grad_norm": 10.694581985473633, + "learning_rate": 5.729426628024334e-07, + "loss": 0.1475, + "num_input_tokens_seen": 34577296, + "step": 17661 + }, + { + "epoch": 2.340888005301524, + "grad_norm": 0.01612936146557331, + "learning_rate": 5.727215377480069e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34578832, + "step": 17662 + }, + { + "epoch": 2.3410205434062292, + "grad_norm": 8.009583473205566, + "learning_rate": 5.725004498528386e-07, + "loss": 0.0343, + "num_input_tokens_seen": 34580296, + "step": 17663 + }, + { + "epoch": 2.3411530815109343, + "grad_norm": 6.21445369720459, + "learning_rate": 5.722793991211905e-07, + "loss": 0.1695, + "num_input_tokens_seen": 34582832, + "step": 17664 + }, + { + "epoch": 2.3412856196156397, + "grad_norm": 0.005807481240481138, + "learning_rate": 5.720583855573264e-07, + "loss": 0.0, + "num_input_tokens_seen": 34584568, + "step": 17665 + }, + { + "epoch": 2.341418157720345, + "grad_norm": 4.9380693435668945, + "learning_rate": 5.718374091655066e-07, + "loss": 0.0559, + "num_input_tokens_seen": 34586744, + "step": 17666 + }, + { + "epoch": 2.34155069582505, + "grad_norm": 0.3747180998325348, + "learning_rate": 5.716164699499907e-07, + "loss": 0.0029, + "num_input_tokens_seen": 34588336, + "step": 17667 + }, + { + "epoch": 2.341683233929755, + "grad_norm": 0.00281967269256711, + "learning_rate": 5.713955679150406e-07, + "loss": 0.0, + "num_input_tokens_seen": 34590536, + "step": 17668 + }, + { + "epoch": 2.34181577203446, + "grad_norm": 0.02476070448756218, + "learning_rate": 5.711747030649146e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34592808, + "step": 17669 + }, + { + "epoch": 2.341948310139165, + "grad_norm": 0.07425132393836975, + "learning_rate": 5.709538754038699e-07, + "loss": 0.0005, + "num_input_tokens_seen": 34594592, + "step": 17670 + }, + { + "epoch": 2.34208084824387, + "grad_norm": 0.524229109287262, + "learning_rate": 5.707330849361662e-07, + "loss": 0.0024, + "num_input_tokens_seen": 34596144, + "step": 17671 + }, + { + "epoch": 2.3422133863485755, + "grad_norm": 0.003684363095089793, + "learning_rate": 5.705123316660593e-07, + "loss": 0.0, + "num_input_tokens_seen": 34597640, + "step": 17672 + }, + { + "epoch": 2.3423459244532805, + "grad_norm": 11.933063507080078, + "learning_rate": 5.702916155978053e-07, + "loss": 0.264, + "num_input_tokens_seen": 34599504, + "step": 17673 + }, + { + "epoch": 2.3424784625579855, + "grad_norm": 0.027486419305205345, + "learning_rate": 5.700709367356605e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34601696, + "step": 17674 + }, + { + "epoch": 2.3426110006626906, + "grad_norm": 15.83261489868164, + "learning_rate": 5.698502950838791e-07, + "loss": 0.1737, + "num_input_tokens_seen": 34603864, + "step": 17675 + }, + { + "epoch": 2.3427435387673956, + "grad_norm": 0.036929138004779816, + "learning_rate": 5.69629690646716e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34606560, + "step": 17676 + }, + { + "epoch": 2.3428760768721006, + "grad_norm": 0.30800551176071167, + "learning_rate": 5.694091234284244e-07, + "loss": 0.0005, + "num_input_tokens_seen": 34607928, + "step": 17677 + }, + { + "epoch": 2.3430086149768057, + "grad_norm": 0.014617717824876308, + "learning_rate": 5.691885934332566e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34609344, + "step": 17678 + }, + { + "epoch": 2.343141153081511, + "grad_norm": 2.2037384510040283, + "learning_rate": 5.68968100665464e-07, + "loss": 0.014, + "num_input_tokens_seen": 34611312, + "step": 17679 + }, + { + "epoch": 2.343273691186216, + "grad_norm": 0.6311166882514954, + "learning_rate": 5.687476451292989e-07, + "loss": 0.0024, + "num_input_tokens_seen": 34613488, + "step": 17680 + }, + { + "epoch": 2.3434062292909212, + "grad_norm": 8.508742332458496, + "learning_rate": 5.685272268290118e-07, + "loss": 0.082, + "num_input_tokens_seen": 34615712, + "step": 17681 + }, + { + "epoch": 2.3435387673956263, + "grad_norm": 13.103141784667969, + "learning_rate": 5.683068457688523e-07, + "loss": 0.2137, + "num_input_tokens_seen": 34617984, + "step": 17682 + }, + { + "epoch": 2.3436713055003313, + "grad_norm": 6.899737358093262, + "learning_rate": 5.680865019530696e-07, + "loss": 0.1721, + "num_input_tokens_seen": 34621272, + "step": 17683 + }, + { + "epoch": 2.3438038436050364, + "grad_norm": 3.798013925552368, + "learning_rate": 5.678661953859119e-07, + "loss": 0.0305, + "num_input_tokens_seen": 34622768, + "step": 17684 + }, + { + "epoch": 2.3439363817097414, + "grad_norm": 1.620809555053711, + "learning_rate": 5.676459260716258e-07, + "loss": 0.004, + "num_input_tokens_seen": 34624488, + "step": 17685 + }, + { + "epoch": 2.344068919814447, + "grad_norm": 0.008004768751561642, + "learning_rate": 5.674256940144596e-07, + "loss": 0.0, + "num_input_tokens_seen": 34626440, + "step": 17686 + }, + { + "epoch": 2.344201457919152, + "grad_norm": 10.122072219848633, + "learning_rate": 5.672054992186598e-07, + "loss": 0.2048, + "num_input_tokens_seen": 34628592, + "step": 17687 + }, + { + "epoch": 2.344333996023857, + "grad_norm": 0.0034033653791993856, + "learning_rate": 5.669853416884716e-07, + "loss": 0.0, + "num_input_tokens_seen": 34630704, + "step": 17688 + }, + { + "epoch": 2.344466534128562, + "grad_norm": 0.010396234691143036, + "learning_rate": 5.667652214281394e-07, + "loss": 0.0, + "num_input_tokens_seen": 34632304, + "step": 17689 + }, + { + "epoch": 2.344599072233267, + "grad_norm": 8.901415824890137, + "learning_rate": 5.665451384419066e-07, + "loss": 0.0744, + "num_input_tokens_seen": 34634696, + "step": 17690 + }, + { + "epoch": 2.344731610337972, + "grad_norm": 3.1280500888824463, + "learning_rate": 5.663250927340175e-07, + "loss": 0.0252, + "num_input_tokens_seen": 34637976, + "step": 17691 + }, + { + "epoch": 2.344864148442677, + "grad_norm": 0.04859183728694916, + "learning_rate": 5.661050843087151e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34639560, + "step": 17692 + }, + { + "epoch": 2.3449966865473826, + "grad_norm": 0.6622272729873657, + "learning_rate": 5.658851131702409e-07, + "loss": 0.0018, + "num_input_tokens_seen": 34641272, + "step": 17693 + }, + { + "epoch": 2.3451292246520876, + "grad_norm": 6.628150463104248, + "learning_rate": 5.65665179322836e-07, + "loss": 0.086, + "num_input_tokens_seen": 34643360, + "step": 17694 + }, + { + "epoch": 2.3452617627567927, + "grad_norm": 4.334777355194092, + "learning_rate": 5.654452827707408e-07, + "loss": 0.0795, + "num_input_tokens_seen": 34645640, + "step": 17695 + }, + { + "epoch": 2.3453943008614977, + "grad_norm": 3.512651205062866, + "learning_rate": 5.652254235181944e-07, + "loss": 0.0051, + "num_input_tokens_seen": 34647968, + "step": 17696 + }, + { + "epoch": 2.3455268389662027, + "grad_norm": 0.21486470103263855, + "learning_rate": 5.650056015694369e-07, + "loss": 0.001, + "num_input_tokens_seen": 34649840, + "step": 17697 + }, + { + "epoch": 2.3456593770709078, + "grad_norm": 5.920050144195557, + "learning_rate": 5.647858169287065e-07, + "loss": 0.1063, + "num_input_tokens_seen": 34651032, + "step": 17698 + }, + { + "epoch": 2.345791915175613, + "grad_norm": 0.037337467074394226, + "learning_rate": 5.645660696002411e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34654096, + "step": 17699 + }, + { + "epoch": 2.3459244532803183, + "grad_norm": 14.79647445678711, + "learning_rate": 5.643463595882767e-07, + "loss": 0.1011, + "num_input_tokens_seen": 34655648, + "step": 17700 + }, + { + "epoch": 2.3460569913850233, + "grad_norm": 2.6657516956329346, + "learning_rate": 5.641266868970493e-07, + "loss": 0.0029, + "num_input_tokens_seen": 34656776, + "step": 17701 + }, + { + "epoch": 2.3461895294897284, + "grad_norm": 0.3144253194332123, + "learning_rate": 5.639070515307954e-07, + "loss": 0.0007, + "num_input_tokens_seen": 34659248, + "step": 17702 + }, + { + "epoch": 2.3463220675944334, + "grad_norm": 6.668276786804199, + "learning_rate": 5.636874534937486e-07, + "loss": 0.0904, + "num_input_tokens_seen": 34660952, + "step": 17703 + }, + { + "epoch": 2.3464546056991384, + "grad_norm": 12.12117862701416, + "learning_rate": 5.634678927901441e-07, + "loss": 0.048, + "num_input_tokens_seen": 34662824, + "step": 17704 + }, + { + "epoch": 2.3465871438038435, + "grad_norm": 0.19158318638801575, + "learning_rate": 5.632483694242147e-07, + "loss": 0.0013, + "num_input_tokens_seen": 34665488, + "step": 17705 + }, + { + "epoch": 2.3467196819085485, + "grad_norm": 7.196081638336182, + "learning_rate": 5.63028883400192e-07, + "loss": 0.0294, + "num_input_tokens_seen": 34668152, + "step": 17706 + }, + { + "epoch": 2.346852220013254, + "grad_norm": 2.298901081085205, + "learning_rate": 5.628094347223095e-07, + "loss": 0.0155, + "num_input_tokens_seen": 34670272, + "step": 17707 + }, + { + "epoch": 2.346984758117959, + "grad_norm": 1.7421051263809204, + "learning_rate": 5.625900233947975e-07, + "loss": 0.0077, + "num_input_tokens_seen": 34672408, + "step": 17708 + }, + { + "epoch": 2.347117296222664, + "grad_norm": 0.01892952248454094, + "learning_rate": 5.623706494218856e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34674040, + "step": 17709 + }, + { + "epoch": 2.347249834327369, + "grad_norm": 0.3770451843738556, + "learning_rate": 5.621513128078051e-07, + "loss": 0.0009, + "num_input_tokens_seen": 34675488, + "step": 17710 + }, + { + "epoch": 2.347382372432074, + "grad_norm": 5.000911235809326, + "learning_rate": 5.619320135567832e-07, + "loss": 0.0468, + "num_input_tokens_seen": 34676968, + "step": 17711 + }, + { + "epoch": 2.347514910536779, + "grad_norm": 0.031268663704395294, + "learning_rate": 5.6171275167305e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34678864, + "step": 17712 + }, + { + "epoch": 2.3476474486414842, + "grad_norm": 0.04435854032635689, + "learning_rate": 5.614935271608321e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34680768, + "step": 17713 + }, + { + "epoch": 2.3477799867461897, + "grad_norm": 5.804210662841797, + "learning_rate": 5.612743400243556e-07, + "loss": 0.0381, + "num_input_tokens_seen": 34682792, + "step": 17714 + }, + { + "epoch": 2.3479125248508947, + "grad_norm": 0.022592786699533463, + "learning_rate": 5.610551902678479e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34684464, + "step": 17715 + }, + { + "epoch": 2.3480450629556, + "grad_norm": 0.5734321475028992, + "learning_rate": 5.608360778955332e-07, + "loss": 0.0037, + "num_input_tokens_seen": 34686480, + "step": 17716 + }, + { + "epoch": 2.348177601060305, + "grad_norm": 0.5023155808448792, + "learning_rate": 5.606170029116372e-07, + "loss": 0.0025, + "num_input_tokens_seen": 34689600, + "step": 17717 + }, + { + "epoch": 2.34831013916501, + "grad_norm": 0.013136826455593109, + "learning_rate": 5.603979653203833e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34692392, + "step": 17718 + }, + { + "epoch": 2.348442677269715, + "grad_norm": 0.43581584095954895, + "learning_rate": 5.601789651259948e-07, + "loss": 0.0021, + "num_input_tokens_seen": 34695192, + "step": 17719 + }, + { + "epoch": 2.34857521537442, + "grad_norm": 0.006335685029625893, + "learning_rate": 5.599600023326935e-07, + "loss": 0.0, + "num_input_tokens_seen": 34696936, + "step": 17720 + }, + { + "epoch": 2.3487077534791254, + "grad_norm": 0.07462137937545776, + "learning_rate": 5.597410769447015e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34698768, + "step": 17721 + }, + { + "epoch": 2.3488402915838305, + "grad_norm": 7.639307022094727, + "learning_rate": 5.595221889662408e-07, + "loss": 0.0667, + "num_input_tokens_seen": 34700992, + "step": 17722 + }, + { + "epoch": 2.3489728296885355, + "grad_norm": 0.016179468482732773, + "learning_rate": 5.593033384015309e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34702368, + "step": 17723 + }, + { + "epoch": 2.3491053677932405, + "grad_norm": 7.36820650100708, + "learning_rate": 5.590845252547914e-07, + "loss": 0.0405, + "num_input_tokens_seen": 34704528, + "step": 17724 + }, + { + "epoch": 2.3492379058979456, + "grad_norm": 5.769047737121582, + "learning_rate": 5.588657495302413e-07, + "loss": 0.0515, + "num_input_tokens_seen": 34706688, + "step": 17725 + }, + { + "epoch": 2.3493704440026506, + "grad_norm": 14.09211254119873, + "learning_rate": 5.586470112320979e-07, + "loss": 0.2951, + "num_input_tokens_seen": 34708408, + "step": 17726 + }, + { + "epoch": 2.3495029821073556, + "grad_norm": 0.00042978691635653377, + "learning_rate": 5.584283103645796e-07, + "loss": 0.0, + "num_input_tokens_seen": 34709968, + "step": 17727 + }, + { + "epoch": 2.349635520212061, + "grad_norm": 2.0365657806396484, + "learning_rate": 5.582096469319037e-07, + "loss": 0.0227, + "num_input_tokens_seen": 34711760, + "step": 17728 + }, + { + "epoch": 2.349768058316766, + "grad_norm": 2.229172945022583, + "learning_rate": 5.579910209382849e-07, + "loss": 0.0134, + "num_input_tokens_seen": 34714864, + "step": 17729 + }, + { + "epoch": 2.349900596421471, + "grad_norm": 0.028426343575119972, + "learning_rate": 5.577724323879394e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34717008, + "step": 17730 + }, + { + "epoch": 2.3500331345261762, + "grad_norm": 0.0038648161571472883, + "learning_rate": 5.575538812850812e-07, + "loss": 0.0, + "num_input_tokens_seen": 34718416, + "step": 17731 + }, + { + "epoch": 2.3501656726308813, + "grad_norm": 0.08603201806545258, + "learning_rate": 5.573353676339233e-07, + "loss": 0.0005, + "num_input_tokens_seen": 34720600, + "step": 17732 + }, + { + "epoch": 2.3502982107355863, + "grad_norm": 0.3695821166038513, + "learning_rate": 5.571168914386796e-07, + "loss": 0.0015, + "num_input_tokens_seen": 34722144, + "step": 17733 + }, + { + "epoch": 2.3504307488402914, + "grad_norm": 7.192691802978516, + "learning_rate": 5.568984527035634e-07, + "loss": 0.0442, + "num_input_tokens_seen": 34723952, + "step": 17734 + }, + { + "epoch": 2.350563286944997, + "grad_norm": 3.2021846771240234, + "learning_rate": 5.566800514327858e-07, + "loss": 0.0188, + "num_input_tokens_seen": 34725880, + "step": 17735 + }, + { + "epoch": 2.350695825049702, + "grad_norm": 0.20106668770313263, + "learning_rate": 5.56461687630557e-07, + "loss": 0.0013, + "num_input_tokens_seen": 34728776, + "step": 17736 + }, + { + "epoch": 2.350828363154407, + "grad_norm": 4.721794128417969, + "learning_rate": 5.562433613010873e-07, + "loss": 0.0585, + "num_input_tokens_seen": 34732240, + "step": 17737 + }, + { + "epoch": 2.350960901259112, + "grad_norm": 6.336400985717773, + "learning_rate": 5.560250724485869e-07, + "loss": 0.2205, + "num_input_tokens_seen": 34733816, + "step": 17738 + }, + { + "epoch": 2.351093439363817, + "grad_norm": 0.14346235990524292, + "learning_rate": 5.558068210772635e-07, + "loss": 0.0005, + "num_input_tokens_seen": 34735024, + "step": 17739 + }, + { + "epoch": 2.351225977468522, + "grad_norm": 13.50838565826416, + "learning_rate": 5.555886071913269e-07, + "loss": 0.0865, + "num_input_tokens_seen": 34736608, + "step": 17740 + }, + { + "epoch": 2.351358515573227, + "grad_norm": 1.2651335000991821, + "learning_rate": 5.553704307949828e-07, + "loss": 0.005, + "num_input_tokens_seen": 34738744, + "step": 17741 + }, + { + "epoch": 2.3514910536779325, + "grad_norm": 12.157574653625488, + "learning_rate": 5.551522918924379e-07, + "loss": 0.1472, + "num_input_tokens_seen": 34740784, + "step": 17742 + }, + { + "epoch": 2.3516235917826376, + "grad_norm": 0.006949228700250387, + "learning_rate": 5.549341904878994e-07, + "loss": 0.0, + "num_input_tokens_seen": 34742096, + "step": 17743 + }, + { + "epoch": 2.3517561298873426, + "grad_norm": 0.06277748942375183, + "learning_rate": 5.547161265855705e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34743528, + "step": 17744 + }, + { + "epoch": 2.3518886679920477, + "grad_norm": 4.195850849151611, + "learning_rate": 5.544981001896577e-07, + "loss": 0.0313, + "num_input_tokens_seen": 34745024, + "step": 17745 + }, + { + "epoch": 2.3520212060967527, + "grad_norm": 0.015423735603690147, + "learning_rate": 5.542801113043633e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34746928, + "step": 17746 + }, + { + "epoch": 2.3521537442014577, + "grad_norm": 0.009266690351068974, + "learning_rate": 5.540621599338902e-07, + "loss": 0.0, + "num_input_tokens_seen": 34748920, + "step": 17747 + }, + { + "epoch": 2.352286282306163, + "grad_norm": 0.4075634479522705, + "learning_rate": 5.538442460824417e-07, + "loss": 0.0006, + "num_input_tokens_seen": 34750248, + "step": 17748 + }, + { + "epoch": 2.3524188204108682, + "grad_norm": 0.6627886295318604, + "learning_rate": 5.536263697542185e-07, + "loss": 0.0029, + "num_input_tokens_seen": 34752216, + "step": 17749 + }, + { + "epoch": 2.3525513585155733, + "grad_norm": 0.010573110543191433, + "learning_rate": 5.534085309534212e-07, + "loss": 0.0, + "num_input_tokens_seen": 34753800, + "step": 17750 + }, + { + "epoch": 2.3526838966202783, + "grad_norm": 0.007072928361594677, + "learning_rate": 5.531907296842509e-07, + "loss": 0.0, + "num_input_tokens_seen": 34756456, + "step": 17751 + }, + { + "epoch": 2.3528164347249834, + "grad_norm": 4.514529228210449, + "learning_rate": 5.529729659509059e-07, + "loss": 0.005, + "num_input_tokens_seen": 34759208, + "step": 17752 + }, + { + "epoch": 2.3529489728296884, + "grad_norm": 0.06037493422627449, + "learning_rate": 5.527552397575858e-07, + "loss": 0.0004, + "num_input_tokens_seen": 34760880, + "step": 17753 + }, + { + "epoch": 2.353081510934394, + "grad_norm": 5.566935062408447, + "learning_rate": 5.525375511084879e-07, + "loss": 0.0514, + "num_input_tokens_seen": 34762376, + "step": 17754 + }, + { + "epoch": 2.353214049039099, + "grad_norm": 0.19817079603672028, + "learning_rate": 5.523199000078097e-07, + "loss": 0.0016, + "num_input_tokens_seen": 34764224, + "step": 17755 + }, + { + "epoch": 2.353346587143804, + "grad_norm": 0.2292715162038803, + "learning_rate": 5.521022864597466e-07, + "loss": 0.0009, + "num_input_tokens_seen": 34766176, + "step": 17756 + }, + { + "epoch": 2.353479125248509, + "grad_norm": 0.10731744021177292, + "learning_rate": 5.51884710468496e-07, + "loss": 0.0007, + "num_input_tokens_seen": 34768736, + "step": 17757 + }, + { + "epoch": 2.353611663353214, + "grad_norm": 0.15225407481193542, + "learning_rate": 5.516671720382513e-07, + "loss": 0.0005, + "num_input_tokens_seen": 34770984, + "step": 17758 + }, + { + "epoch": 2.353744201457919, + "grad_norm": 0.021273259073495865, + "learning_rate": 5.514496711732084e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34773880, + "step": 17759 + }, + { + "epoch": 2.353876739562624, + "grad_norm": 11.22998332977295, + "learning_rate": 5.512322078775603e-07, + "loss": 0.0522, + "num_input_tokens_seen": 34775472, + "step": 17760 + }, + { + "epoch": 2.3540092776673296, + "grad_norm": 0.0019208334852010012, + "learning_rate": 5.510147821554992e-07, + "loss": 0.0, + "num_input_tokens_seen": 34776936, + "step": 17761 + }, + { + "epoch": 2.3541418157720346, + "grad_norm": 9.687651634216309, + "learning_rate": 5.507973940112171e-07, + "loss": 0.0306, + "num_input_tokens_seen": 34778568, + "step": 17762 + }, + { + "epoch": 2.3542743538767397, + "grad_norm": 0.006687212735414505, + "learning_rate": 5.505800434489062e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34779968, + "step": 17763 + }, + { + "epoch": 2.3544068919814447, + "grad_norm": 0.004950502887368202, + "learning_rate": 5.503627304727574e-07, + "loss": 0.0, + "num_input_tokens_seen": 34781096, + "step": 17764 + }, + { + "epoch": 2.3545394300861497, + "grad_norm": 0.000394042901461944, + "learning_rate": 5.501454550869601e-07, + "loss": 0.0, + "num_input_tokens_seen": 34782696, + "step": 17765 + }, + { + "epoch": 2.354671968190855, + "grad_norm": 0.005813139490783215, + "learning_rate": 5.499282172957035e-07, + "loss": 0.0, + "num_input_tokens_seen": 34784456, + "step": 17766 + }, + { + "epoch": 2.35480450629556, + "grad_norm": 0.003222453873604536, + "learning_rate": 5.497110171031755e-07, + "loss": 0.0, + "num_input_tokens_seen": 34786464, + "step": 17767 + }, + { + "epoch": 2.3549370444002653, + "grad_norm": 3.797583818435669, + "learning_rate": 5.494938545135645e-07, + "loss": 0.0179, + "num_input_tokens_seen": 34788392, + "step": 17768 + }, + { + "epoch": 2.3550695825049703, + "grad_norm": 12.619410514831543, + "learning_rate": 5.492767295310584e-07, + "loss": 0.1586, + "num_input_tokens_seen": 34790208, + "step": 17769 + }, + { + "epoch": 2.3552021206096754, + "grad_norm": 1.6835739612579346, + "learning_rate": 5.490596421598426e-07, + "loss": 0.0082, + "num_input_tokens_seen": 34792184, + "step": 17770 + }, + { + "epoch": 2.3553346587143804, + "grad_norm": 0.0028643279802054167, + "learning_rate": 5.488425924041027e-07, + "loss": 0.0, + "num_input_tokens_seen": 34793376, + "step": 17771 + }, + { + "epoch": 2.3554671968190855, + "grad_norm": 0.024906128644943237, + "learning_rate": 5.486255802680237e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34796184, + "step": 17772 + }, + { + "epoch": 2.3555997349237905, + "grad_norm": 0.06921370327472687, + "learning_rate": 5.484086057557892e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34797928, + "step": 17773 + }, + { + "epoch": 2.3557322730284955, + "grad_norm": 4.544421672821045, + "learning_rate": 5.48191668871583e-07, + "loss": 0.0481, + "num_input_tokens_seen": 34801088, + "step": 17774 + }, + { + "epoch": 2.355864811133201, + "grad_norm": 8.340815544128418, + "learning_rate": 5.479747696195887e-07, + "loss": 0.0668, + "num_input_tokens_seen": 34803200, + "step": 17775 + }, + { + "epoch": 2.355997349237906, + "grad_norm": 15.619290351867676, + "learning_rate": 5.477579080039872e-07, + "loss": 0.1167, + "num_input_tokens_seen": 34805032, + "step": 17776 + }, + { + "epoch": 2.356129887342611, + "grad_norm": 0.011956840753555298, + "learning_rate": 5.475410840289603e-07, + "loss": 0.0, + "num_input_tokens_seen": 34806768, + "step": 17777 + }, + { + "epoch": 2.356262425447316, + "grad_norm": 5.885092258453369, + "learning_rate": 5.473242976986873e-07, + "loss": 0.0278, + "num_input_tokens_seen": 34808912, + "step": 17778 + }, + { + "epoch": 2.356394963552021, + "grad_norm": 8.172468185424805, + "learning_rate": 5.471075490173497e-07, + "loss": 0.1538, + "num_input_tokens_seen": 34811248, + "step": 17779 + }, + { + "epoch": 2.356527501656726, + "grad_norm": 0.0726267471909523, + "learning_rate": 5.46890837989125e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34813176, + "step": 17780 + }, + { + "epoch": 2.3566600397614312, + "grad_norm": 3.2818403244018555, + "learning_rate": 5.466741646181931e-07, + "loss": 0.0112, + "num_input_tokens_seen": 34814648, + "step": 17781 + }, + { + "epoch": 2.3567925778661367, + "grad_norm": 2.5179409980773926, + "learning_rate": 5.464575289087309e-07, + "loss": 0.025, + "num_input_tokens_seen": 34816784, + "step": 17782 + }, + { + "epoch": 2.3569251159708418, + "grad_norm": 0.38421401381492615, + "learning_rate": 5.462409308649153e-07, + "loss": 0.0006, + "num_input_tokens_seen": 34819160, + "step": 17783 + }, + { + "epoch": 2.357057654075547, + "grad_norm": 1.3034144639968872, + "learning_rate": 5.460243704909216e-07, + "loss": 0.0037, + "num_input_tokens_seen": 34820752, + "step": 17784 + }, + { + "epoch": 2.357190192180252, + "grad_norm": 0.011089128442108631, + "learning_rate": 5.458078477909265e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34823032, + "step": 17785 + }, + { + "epoch": 2.357322730284957, + "grad_norm": 2.9588115215301514, + "learning_rate": 5.455913627691037e-07, + "loss": 0.0127, + "num_input_tokens_seen": 34824352, + "step": 17786 + }, + { + "epoch": 2.357455268389662, + "grad_norm": 0.6513294577598572, + "learning_rate": 5.453749154296284e-07, + "loss": 0.0033, + "num_input_tokens_seen": 34826720, + "step": 17787 + }, + { + "epoch": 2.357587806494367, + "grad_norm": 0.0477660708129406, + "learning_rate": 5.451585057766731e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34828336, + "step": 17788 + }, + { + "epoch": 2.3577203445990724, + "grad_norm": 0.04405251145362854, + "learning_rate": 5.449421338144098e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34829632, + "step": 17789 + }, + { + "epoch": 2.3578528827037775, + "grad_norm": 0.622642993927002, + "learning_rate": 5.447257995470115e-07, + "loss": 0.0045, + "num_input_tokens_seen": 34831280, + "step": 17790 + }, + { + "epoch": 2.3579854208084825, + "grad_norm": 14.460265159606934, + "learning_rate": 5.445095029786482e-07, + "loss": 0.0946, + "num_input_tokens_seen": 34833664, + "step": 17791 + }, + { + "epoch": 2.3581179589131875, + "grad_norm": 16.176565170288086, + "learning_rate": 5.44293244113491e-07, + "loss": 0.2562, + "num_input_tokens_seen": 34836032, + "step": 17792 + }, + { + "epoch": 2.3582504970178926, + "grad_norm": 0.8949641585350037, + "learning_rate": 5.440770229557096e-07, + "loss": 0.0033, + "num_input_tokens_seen": 34838848, + "step": 17793 + }, + { + "epoch": 2.3583830351225976, + "grad_norm": 2.097344160079956, + "learning_rate": 5.438608395094716e-07, + "loss": 0.0127, + "num_input_tokens_seen": 34841240, + "step": 17794 + }, + { + "epoch": 2.3585155732273027, + "grad_norm": 3.823345422744751, + "learning_rate": 5.43644693778947e-07, + "loss": 0.0848, + "num_input_tokens_seen": 34842928, + "step": 17795 + }, + { + "epoch": 2.358648111332008, + "grad_norm": 9.817488670349121, + "learning_rate": 5.43428585768302e-07, + "loss": 0.1675, + "num_input_tokens_seen": 34844552, + "step": 17796 + }, + { + "epoch": 2.358780649436713, + "grad_norm": 0.4838305115699768, + "learning_rate": 5.432125154817031e-07, + "loss": 0.0014, + "num_input_tokens_seen": 34846448, + "step": 17797 + }, + { + "epoch": 2.358913187541418, + "grad_norm": 0.0014672423712909222, + "learning_rate": 5.429964829233176e-07, + "loss": 0.0, + "num_input_tokens_seen": 34847816, + "step": 17798 + }, + { + "epoch": 2.3590457256461232, + "grad_norm": 12.609827995300293, + "learning_rate": 5.427804880973092e-07, + "loss": 0.0998, + "num_input_tokens_seen": 34849824, + "step": 17799 + }, + { + "epoch": 2.3591782637508283, + "grad_norm": 0.11343824863433838, + "learning_rate": 5.425645310078437e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34851760, + "step": 17800 + }, + { + "epoch": 2.3593108018555333, + "grad_norm": 0.000761420582421124, + "learning_rate": 5.423486116590848e-07, + "loss": 0.0, + "num_input_tokens_seen": 34853312, + "step": 17801 + }, + { + "epoch": 2.3594433399602384, + "grad_norm": 0.0037159365601837635, + "learning_rate": 5.421327300551949e-07, + "loss": 0.0, + "num_input_tokens_seen": 34854920, + "step": 17802 + }, + { + "epoch": 2.359575878064944, + "grad_norm": 0.044683124870061874, + "learning_rate": 5.41916886200336e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34857776, + "step": 17803 + }, + { + "epoch": 2.359708416169649, + "grad_norm": 8.224814414978027, + "learning_rate": 5.417010800986703e-07, + "loss": 0.0627, + "num_input_tokens_seen": 34859904, + "step": 17804 + }, + { + "epoch": 2.359840954274354, + "grad_norm": 0.014492901042103767, + "learning_rate": 5.414853117543595e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34861488, + "step": 17805 + }, + { + "epoch": 2.359973492379059, + "grad_norm": 0.03416812792420387, + "learning_rate": 5.412695811715629e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34863568, + "step": 17806 + }, + { + "epoch": 2.360106030483764, + "grad_norm": 0.07887546718120575, + "learning_rate": 5.410538883544403e-07, + "loss": 0.0004, + "num_input_tokens_seen": 34865272, + "step": 17807 + }, + { + "epoch": 2.360238568588469, + "grad_norm": 3.5303268432617188, + "learning_rate": 5.408382333071499e-07, + "loss": 0.0109, + "num_input_tokens_seen": 34867280, + "step": 17808 + }, + { + "epoch": 2.360371106693174, + "grad_norm": 2.625739812850952, + "learning_rate": 5.406226160338493e-07, + "loss": 0.0112, + "num_input_tokens_seen": 34868576, + "step": 17809 + }, + { + "epoch": 2.3605036447978796, + "grad_norm": 0.021193284541368484, + "learning_rate": 5.404070365386965e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34870904, + "step": 17810 + }, + { + "epoch": 2.3606361829025846, + "grad_norm": 13.384984016418457, + "learning_rate": 5.401914948258482e-07, + "loss": 0.4126, + "num_input_tokens_seen": 34872520, + "step": 17811 + }, + { + "epoch": 2.3607687210072896, + "grad_norm": 6.515951156616211, + "learning_rate": 5.399759908994603e-07, + "loss": 0.0546, + "num_input_tokens_seen": 34874272, + "step": 17812 + }, + { + "epoch": 2.3609012591119947, + "grad_norm": 0.02144785411655903, + "learning_rate": 5.397605247636872e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34876208, + "step": 17813 + }, + { + "epoch": 2.3610337972166997, + "grad_norm": 2.452218770980835, + "learning_rate": 5.395450964226836e-07, + "loss": 0.0139, + "num_input_tokens_seen": 34877984, + "step": 17814 + }, + { + "epoch": 2.3611663353214047, + "grad_norm": 0.08504106849431992, + "learning_rate": 5.393297058806022e-07, + "loss": 0.0004, + "num_input_tokens_seen": 34879712, + "step": 17815 + }, + { + "epoch": 2.36129887342611, + "grad_norm": 4.739107608795166, + "learning_rate": 5.391143531415968e-07, + "loss": 0.0423, + "num_input_tokens_seen": 34881640, + "step": 17816 + }, + { + "epoch": 2.3614314115308153, + "grad_norm": 0.00333572831004858, + "learning_rate": 5.3889903820982e-07, + "loss": 0.0, + "num_input_tokens_seen": 34883232, + "step": 17817 + }, + { + "epoch": 2.3615639496355203, + "grad_norm": 3.4142680168151855, + "learning_rate": 5.386837610894227e-07, + "loss": 0.0266, + "num_input_tokens_seen": 34884760, + "step": 17818 + }, + { + "epoch": 2.3616964877402253, + "grad_norm": 0.03513708338141441, + "learning_rate": 5.384685217845553e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34886192, + "step": 17819 + }, + { + "epoch": 2.3618290258449304, + "grad_norm": 0.0014698661398142576, + "learning_rate": 5.382533202993676e-07, + "loss": 0.0, + "num_input_tokens_seen": 34887560, + "step": 17820 + }, + { + "epoch": 2.3619615639496354, + "grad_norm": 8.751652717590332, + "learning_rate": 5.380381566380091e-07, + "loss": 0.1137, + "num_input_tokens_seen": 34889320, + "step": 17821 + }, + { + "epoch": 2.3620941020543404, + "grad_norm": 1.3840405941009521, + "learning_rate": 5.378230308046292e-07, + "loss": 0.0124, + "num_input_tokens_seen": 34891104, + "step": 17822 + }, + { + "epoch": 2.3622266401590455, + "grad_norm": 0.005729067139327526, + "learning_rate": 5.376079428033748e-07, + "loss": 0.0, + "num_input_tokens_seen": 34892544, + "step": 17823 + }, + { + "epoch": 2.362359178263751, + "grad_norm": 0.006435960065573454, + "learning_rate": 5.373928926383931e-07, + "loss": 0.0, + "num_input_tokens_seen": 34894008, + "step": 17824 + }, + { + "epoch": 2.362491716368456, + "grad_norm": 0.02071291208267212, + "learning_rate": 5.371778803138298e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34895968, + "step": 17825 + }, + { + "epoch": 2.362624254473161, + "grad_norm": 0.08949607610702515, + "learning_rate": 5.369629058338316e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34897824, + "step": 17826 + }, + { + "epoch": 2.362756792577866, + "grad_norm": 4.069151401519775, + "learning_rate": 5.367479692025421e-07, + "loss": 0.0125, + "num_input_tokens_seen": 34899360, + "step": 17827 + }, + { + "epoch": 2.362889330682571, + "grad_norm": 0.1709366738796234, + "learning_rate": 5.365330704241067e-07, + "loss": 0.0004, + "num_input_tokens_seen": 34900800, + "step": 17828 + }, + { + "epoch": 2.363021868787276, + "grad_norm": 3.698108673095703, + "learning_rate": 5.363182095026684e-07, + "loss": 0.1146, + "num_input_tokens_seen": 34901960, + "step": 17829 + }, + { + "epoch": 2.363154406891981, + "grad_norm": 0.12536188960075378, + "learning_rate": 5.361033864423689e-07, + "loss": 0.0004, + "num_input_tokens_seen": 34903920, + "step": 17830 + }, + { + "epoch": 2.3632869449966867, + "grad_norm": 9.294626235961914, + "learning_rate": 5.358886012473519e-07, + "loss": 0.0983, + "num_input_tokens_seen": 34906024, + "step": 17831 + }, + { + "epoch": 2.3634194831013917, + "grad_norm": 7.790512561798096, + "learning_rate": 5.356738539217574e-07, + "loss": 0.1184, + "num_input_tokens_seen": 34908192, + "step": 17832 + }, + { + "epoch": 2.3635520212060968, + "grad_norm": 0.006251086946576834, + "learning_rate": 5.354591444697254e-07, + "loss": 0.0, + "num_input_tokens_seen": 34909984, + "step": 17833 + }, + { + "epoch": 2.363684559310802, + "grad_norm": 0.01941339485347271, + "learning_rate": 5.352444728953971e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34912024, + "step": 17834 + }, + { + "epoch": 2.363817097415507, + "grad_norm": 0.14774712920188904, + "learning_rate": 5.350298392029102e-07, + "loss": 0.0007, + "num_input_tokens_seen": 34914168, + "step": 17835 + }, + { + "epoch": 2.363949635520212, + "grad_norm": 0.02007228322327137, + "learning_rate": 5.348152433964041e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34916856, + "step": 17836 + }, + { + "epoch": 2.3640821736249173, + "grad_norm": 0.005125201307237148, + "learning_rate": 5.346006854800157e-07, + "loss": 0.0, + "num_input_tokens_seen": 34918304, + "step": 17837 + }, + { + "epoch": 2.3642147117296224, + "grad_norm": 0.0013130478328093886, + "learning_rate": 5.343861654578822e-07, + "loss": 0.0, + "num_input_tokens_seen": 34919984, + "step": 17838 + }, + { + "epoch": 2.3643472498343274, + "grad_norm": 0.05215870589017868, + "learning_rate": 5.341716833341387e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34922064, + "step": 17839 + }, + { + "epoch": 2.3644797879390325, + "grad_norm": 0.010648339055478573, + "learning_rate": 5.339572391129219e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34924240, + "step": 17840 + }, + { + "epoch": 2.3646123260437375, + "grad_norm": 8.304643630981445, + "learning_rate": 5.337428327983651e-07, + "loss": 0.0448, + "num_input_tokens_seen": 34927104, + "step": 17841 + }, + { + "epoch": 2.3647448641484425, + "grad_norm": 4.449934482574463, + "learning_rate": 5.335284643946037e-07, + "loss": 0.0302, + "num_input_tokens_seen": 34929184, + "step": 17842 + }, + { + "epoch": 2.364877402253148, + "grad_norm": 4.5544047355651855, + "learning_rate": 5.333141339057701e-07, + "loss": 0.0668, + "num_input_tokens_seen": 34932024, + "step": 17843 + }, + { + "epoch": 2.365009940357853, + "grad_norm": 6.115437984466553, + "learning_rate": 5.330998413359961e-07, + "loss": 0.1089, + "num_input_tokens_seen": 34934024, + "step": 17844 + }, + { + "epoch": 2.365142478462558, + "grad_norm": 0.21305787563323975, + "learning_rate": 5.328855866894148e-07, + "loss": 0.0015, + "num_input_tokens_seen": 34938024, + "step": 17845 + }, + { + "epoch": 2.365275016567263, + "grad_norm": 0.1069553792476654, + "learning_rate": 5.326713699701558e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34940904, + "step": 17846 + }, + { + "epoch": 2.365407554671968, + "grad_norm": 0.09191861748695374, + "learning_rate": 5.324571911823503e-07, + "loss": 0.0005, + "num_input_tokens_seen": 34942432, + "step": 17847 + }, + { + "epoch": 2.365540092776673, + "grad_norm": 0.0708169937133789, + "learning_rate": 5.322430503301279e-07, + "loss": 0.0004, + "num_input_tokens_seen": 34944304, + "step": 17848 + }, + { + "epoch": 2.3656726308813782, + "grad_norm": 0.021539289504289627, + "learning_rate": 5.320289474176168e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34946368, + "step": 17849 + }, + { + "epoch": 2.3658051689860837, + "grad_norm": 10.883551597595215, + "learning_rate": 5.318148824489444e-07, + "loss": 0.2097, + "num_input_tokens_seen": 34948184, + "step": 17850 + }, + { + "epoch": 2.3659377070907888, + "grad_norm": 7.069830894470215, + "learning_rate": 5.31600855428239e-07, + "loss": 0.0622, + "num_input_tokens_seen": 34949784, + "step": 17851 + }, + { + "epoch": 2.366070245195494, + "grad_norm": 0.017010947689414024, + "learning_rate": 5.313868663596278e-07, + "loss": 0.0, + "num_input_tokens_seen": 34950880, + "step": 17852 + }, + { + "epoch": 2.366202783300199, + "grad_norm": 3.331115961074829, + "learning_rate": 5.311729152472359e-07, + "loss": 0.0341, + "num_input_tokens_seen": 34952992, + "step": 17853 + }, + { + "epoch": 2.366335321404904, + "grad_norm": 3.3038928508758545, + "learning_rate": 5.309590020951882e-07, + "loss": 0.0229, + "num_input_tokens_seen": 34954744, + "step": 17854 + }, + { + "epoch": 2.366467859509609, + "grad_norm": 13.353264808654785, + "learning_rate": 5.307451269076094e-07, + "loss": 0.096, + "num_input_tokens_seen": 34956920, + "step": 17855 + }, + { + "epoch": 2.366600397614314, + "grad_norm": 7.725100994110107, + "learning_rate": 5.305312896886225e-07, + "loss": 0.1217, + "num_input_tokens_seen": 34959288, + "step": 17856 + }, + { + "epoch": 2.3667329357190194, + "grad_norm": 7.400303840637207, + "learning_rate": 5.303174904423508e-07, + "loss": 0.0451, + "num_input_tokens_seen": 34961584, + "step": 17857 + }, + { + "epoch": 2.3668654738237245, + "grad_norm": 0.034982409328222275, + "learning_rate": 5.301037291729174e-07, + "loss": 0.0002, + "num_input_tokens_seen": 34964136, + "step": 17858 + }, + { + "epoch": 2.3669980119284295, + "grad_norm": 17.188966751098633, + "learning_rate": 5.298900058844434e-07, + "loss": 0.3595, + "num_input_tokens_seen": 34966144, + "step": 17859 + }, + { + "epoch": 2.3671305500331346, + "grad_norm": 6.802550315856934, + "learning_rate": 5.29676320581049e-07, + "loss": 0.0478, + "num_input_tokens_seen": 34968408, + "step": 17860 + }, + { + "epoch": 2.3672630881378396, + "grad_norm": 0.0053653898648917675, + "learning_rate": 5.294626732668537e-07, + "loss": 0.0, + "num_input_tokens_seen": 34970144, + "step": 17861 + }, + { + "epoch": 2.3673956262425446, + "grad_norm": 0.41255563497543335, + "learning_rate": 5.29249063945978e-07, + "loss": 0.0006, + "num_input_tokens_seen": 34971456, + "step": 17862 + }, + { + "epoch": 2.3675281643472497, + "grad_norm": 0.002705028047785163, + "learning_rate": 5.290354926225394e-07, + "loss": 0.0, + "num_input_tokens_seen": 34972904, + "step": 17863 + }, + { + "epoch": 2.367660702451955, + "grad_norm": 3.1188786029815674, + "learning_rate": 5.288219593006568e-07, + "loss": 0.0151, + "num_input_tokens_seen": 34975032, + "step": 17864 + }, + { + "epoch": 2.36779324055666, + "grad_norm": 5.2777886390686035, + "learning_rate": 5.286084639844466e-07, + "loss": 0.0507, + "num_input_tokens_seen": 34977496, + "step": 17865 + }, + { + "epoch": 2.367925778661365, + "grad_norm": 0.011733113788068295, + "learning_rate": 5.283950066780247e-07, + "loss": 0.0001, + "num_input_tokens_seen": 34980208, + "step": 17866 + }, + { + "epoch": 2.3680583167660703, + "grad_norm": 0.05583785101771355, + "learning_rate": 5.281815873855079e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34981880, + "step": 17867 + }, + { + "epoch": 2.3681908548707753, + "grad_norm": 0.726688802242279, + "learning_rate": 5.279682061110103e-07, + "loss": 0.0054, + "num_input_tokens_seen": 34984176, + "step": 17868 + }, + { + "epoch": 2.3683233929754803, + "grad_norm": 14.965158462524414, + "learning_rate": 5.277548628586455e-07, + "loss": 0.1856, + "num_input_tokens_seen": 34986376, + "step": 17869 + }, + { + "epoch": 2.3684559310801854, + "grad_norm": 0.09659792482852936, + "learning_rate": 5.27541557632528e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34987456, + "step": 17870 + }, + { + "epoch": 2.368588469184891, + "grad_norm": 0.051037922501564026, + "learning_rate": 5.273282904367701e-07, + "loss": 0.0003, + "num_input_tokens_seen": 34989776, + "step": 17871 + }, + { + "epoch": 2.368721007289596, + "grad_norm": 1.24022376537323, + "learning_rate": 5.27115061275483e-07, + "loss": 0.0086, + "num_input_tokens_seen": 34991440, + "step": 17872 + }, + { + "epoch": 2.368853545394301, + "grad_norm": 4.3537421226501465, + "learning_rate": 5.269018701527792e-07, + "loss": 0.0186, + "num_input_tokens_seen": 34993792, + "step": 17873 + }, + { + "epoch": 2.368986083499006, + "grad_norm": 0.8581714630126953, + "learning_rate": 5.26688717072768e-07, + "loss": 0.005, + "num_input_tokens_seen": 34995168, + "step": 17874 + }, + { + "epoch": 2.369118621603711, + "grad_norm": 0.7221105098724365, + "learning_rate": 5.2647560203956e-07, + "loss": 0.0028, + "num_input_tokens_seen": 34997720, + "step": 17875 + }, + { + "epoch": 2.369251159708416, + "grad_norm": 0.7373464703559875, + "learning_rate": 5.262625250572642e-07, + "loss": 0.0027, + "num_input_tokens_seen": 34999224, + "step": 17876 + }, + { + "epoch": 2.369383697813121, + "grad_norm": 1.2851636409759521, + "learning_rate": 5.260494861299875e-07, + "loss": 0.0076, + "num_input_tokens_seen": 35002000, + "step": 17877 + }, + { + "epoch": 2.3695162359178266, + "grad_norm": 0.0021371126640588045, + "learning_rate": 5.258364852618395e-07, + "loss": 0.0, + "num_input_tokens_seen": 35003248, + "step": 17878 + }, + { + "epoch": 2.3696487740225316, + "grad_norm": 0.009958760812878609, + "learning_rate": 5.256235224569259e-07, + "loss": 0.0, + "num_input_tokens_seen": 35004456, + "step": 17879 + }, + { + "epoch": 2.3697813121272366, + "grad_norm": 0.18617406487464905, + "learning_rate": 5.254105977193519e-07, + "loss": 0.0011, + "num_input_tokens_seen": 35005936, + "step": 17880 + }, + { + "epoch": 2.3699138502319417, + "grad_norm": 0.04857746139168739, + "learning_rate": 5.251977110532247e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35007568, + "step": 17881 + }, + { + "epoch": 2.3700463883366467, + "grad_norm": 0.02612837590277195, + "learning_rate": 5.249848624626472e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35008632, + "step": 17882 + }, + { + "epoch": 2.3701789264413518, + "grad_norm": 0.039052385836839676, + "learning_rate": 5.247720519517246e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35011656, + "step": 17883 + }, + { + "epoch": 2.370311464546057, + "grad_norm": 6.019850730895996, + "learning_rate": 5.245592795245597e-07, + "loss": 0.1, + "num_input_tokens_seen": 35013664, + "step": 17884 + }, + { + "epoch": 2.3704440026507623, + "grad_norm": 4.909743785858154, + "learning_rate": 5.243465451852548e-07, + "loss": 0.0281, + "num_input_tokens_seen": 35017024, + "step": 17885 + }, + { + "epoch": 2.3705765407554673, + "grad_norm": 1.2272380590438843, + "learning_rate": 5.241338489379102e-07, + "loss": 0.0013, + "num_input_tokens_seen": 35018928, + "step": 17886 + }, + { + "epoch": 2.3707090788601723, + "grad_norm": 10.323346138000488, + "learning_rate": 5.239211907866288e-07, + "loss": 0.0381, + "num_input_tokens_seen": 35021384, + "step": 17887 + }, + { + "epoch": 2.3708416169648774, + "grad_norm": 1.7019084692001343, + "learning_rate": 5.237085707355103e-07, + "loss": 0.0294, + "num_input_tokens_seen": 35023480, + "step": 17888 + }, + { + "epoch": 2.3709741550695824, + "grad_norm": 10.059270858764648, + "learning_rate": 5.23495988788654e-07, + "loss": 0.0693, + "num_input_tokens_seen": 35026272, + "step": 17889 + }, + { + "epoch": 2.3711066931742875, + "grad_norm": 11.62209415435791, + "learning_rate": 5.232834449501584e-07, + "loss": 0.2196, + "num_input_tokens_seen": 35029488, + "step": 17890 + }, + { + "epoch": 2.3712392312789925, + "grad_norm": 0.04964036867022514, + "learning_rate": 5.230709392241218e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35030808, + "step": 17891 + }, + { + "epoch": 2.371371769383698, + "grad_norm": 0.0075523448176681995, + "learning_rate": 5.228584716146407e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35032568, + "step": 17892 + }, + { + "epoch": 2.371504307488403, + "grad_norm": 5.44835090637207, + "learning_rate": 5.226460421258123e-07, + "loss": 0.0563, + "num_input_tokens_seen": 35034280, + "step": 17893 + }, + { + "epoch": 2.371636845593108, + "grad_norm": 9.484182357788086, + "learning_rate": 5.224336507617328e-07, + "loss": 0.0412, + "num_input_tokens_seen": 35035984, + "step": 17894 + }, + { + "epoch": 2.371769383697813, + "grad_norm": 0.0019039280014112592, + "learning_rate": 5.222212975264967e-07, + "loss": 0.0, + "num_input_tokens_seen": 35037384, + "step": 17895 + }, + { + "epoch": 2.371901921802518, + "grad_norm": 0.003801350947469473, + "learning_rate": 5.220089824241987e-07, + "loss": 0.0, + "num_input_tokens_seen": 35039080, + "step": 17896 + }, + { + "epoch": 2.372034459907223, + "grad_norm": 0.01347715687006712, + "learning_rate": 5.217967054589313e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35041720, + "step": 17897 + }, + { + "epoch": 2.372166998011928, + "grad_norm": 0.5990731716156006, + "learning_rate": 5.215844666347878e-07, + "loss": 0.0029, + "num_input_tokens_seen": 35044024, + "step": 17898 + }, + { + "epoch": 2.3722995361166337, + "grad_norm": 0.0016939198831096292, + "learning_rate": 5.213722659558615e-07, + "loss": 0.0, + "num_input_tokens_seen": 35045528, + "step": 17899 + }, + { + "epoch": 2.3724320742213387, + "grad_norm": 0.015038426034152508, + "learning_rate": 5.211601034262429e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35047216, + "step": 17900 + }, + { + "epoch": 2.3725646123260438, + "grad_norm": 0.00941755436360836, + "learning_rate": 5.209479790500227e-07, + "loss": 0.0, + "num_input_tokens_seen": 35048776, + "step": 17901 + }, + { + "epoch": 2.372697150430749, + "grad_norm": 0.4415956437587738, + "learning_rate": 5.207358928312905e-07, + "loss": 0.0031, + "num_input_tokens_seen": 35050856, + "step": 17902 + }, + { + "epoch": 2.372829688535454, + "grad_norm": 8.411741256713867, + "learning_rate": 5.205238447741354e-07, + "loss": 0.0919, + "num_input_tokens_seen": 35052512, + "step": 17903 + }, + { + "epoch": 2.372962226640159, + "grad_norm": 11.490489959716797, + "learning_rate": 5.203118348826461e-07, + "loss": 0.1827, + "num_input_tokens_seen": 35054720, + "step": 17904 + }, + { + "epoch": 2.373094764744864, + "grad_norm": 4.641931056976318, + "learning_rate": 5.200998631609108e-07, + "loss": 0.0327, + "num_input_tokens_seen": 35056288, + "step": 17905 + }, + { + "epoch": 2.3732273028495694, + "grad_norm": 0.0057508512400090694, + "learning_rate": 5.19887929613016e-07, + "loss": 0.0, + "num_input_tokens_seen": 35057760, + "step": 17906 + }, + { + "epoch": 2.3733598409542744, + "grad_norm": 0.0136649738997221, + "learning_rate": 5.196760342430479e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35059320, + "step": 17907 + }, + { + "epoch": 2.3734923790589795, + "grad_norm": 2.1688294410705566, + "learning_rate": 5.194641770550915e-07, + "loss": 0.0065, + "num_input_tokens_seen": 35061304, + "step": 17908 + }, + { + "epoch": 2.3736249171636845, + "grad_norm": 4.105848789215088, + "learning_rate": 5.192523580532327e-07, + "loss": 0.0157, + "num_input_tokens_seen": 35063072, + "step": 17909 + }, + { + "epoch": 2.3737574552683895, + "grad_norm": 2.588618278503418, + "learning_rate": 5.190405772415544e-07, + "loss": 0.0096, + "num_input_tokens_seen": 35065232, + "step": 17910 + }, + { + "epoch": 2.3738899933730946, + "grad_norm": 7.72862434387207, + "learning_rate": 5.188288346241408e-07, + "loss": 0.1321, + "num_input_tokens_seen": 35067312, + "step": 17911 + }, + { + "epoch": 2.3740225314777996, + "grad_norm": 3.853945255279541, + "learning_rate": 5.186171302050741e-07, + "loss": 0.0312, + "num_input_tokens_seen": 35069312, + "step": 17912 + }, + { + "epoch": 2.374155069582505, + "grad_norm": 0.037188608199357986, + "learning_rate": 5.18405463988435e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35070592, + "step": 17913 + }, + { + "epoch": 2.37428760768721, + "grad_norm": 0.12726527452468872, + "learning_rate": 5.181938359783068e-07, + "loss": 0.0004, + "num_input_tokens_seen": 35073056, + "step": 17914 + }, + { + "epoch": 2.374420145791915, + "grad_norm": 1.373931884765625, + "learning_rate": 5.179822461787685e-07, + "loss": 0.0074, + "num_input_tokens_seen": 35074264, + "step": 17915 + }, + { + "epoch": 2.37455268389662, + "grad_norm": 0.5332595705986023, + "learning_rate": 5.177706945938988e-07, + "loss": 0.0023, + "num_input_tokens_seen": 35075912, + "step": 17916 + }, + { + "epoch": 2.3746852220013253, + "grad_norm": 0.020860079675912857, + "learning_rate": 5.175591812277786e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35078416, + "step": 17917 + }, + { + "epoch": 2.3748177601060303, + "grad_norm": 14.250446319580078, + "learning_rate": 5.17347706084484e-07, + "loss": 0.1237, + "num_input_tokens_seen": 35081096, + "step": 17918 + }, + { + "epoch": 2.3749502982107358, + "grad_norm": 0.016021398827433586, + "learning_rate": 5.171362691680943e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35082424, + "step": 17919 + }, + { + "epoch": 2.375082836315441, + "grad_norm": 5.326563835144043, + "learning_rate": 5.169248704826851e-07, + "loss": 0.0995, + "num_input_tokens_seen": 35084496, + "step": 17920 + }, + { + "epoch": 2.375215374420146, + "grad_norm": 4.2434282302856445, + "learning_rate": 5.167135100323326e-07, + "loss": 0.0501, + "num_input_tokens_seen": 35086184, + "step": 17921 + }, + { + "epoch": 2.375347912524851, + "grad_norm": 0.0033141958992928267, + "learning_rate": 5.165021878211112e-07, + "loss": 0.0, + "num_input_tokens_seen": 35088112, + "step": 17922 + }, + { + "epoch": 2.375480450629556, + "grad_norm": 12.980299949645996, + "learning_rate": 5.162909038530959e-07, + "loss": 0.0782, + "num_input_tokens_seen": 35089776, + "step": 17923 + }, + { + "epoch": 2.375612988734261, + "grad_norm": 0.0279797725379467, + "learning_rate": 5.16079658132361e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35091968, + "step": 17924 + }, + { + "epoch": 2.3757455268389664, + "grad_norm": 0.002451819134876132, + "learning_rate": 5.158684506629791e-07, + "loss": 0.0, + "num_input_tokens_seen": 35093752, + "step": 17925 + }, + { + "epoch": 2.3758780649436715, + "grad_norm": 4.416686534881592, + "learning_rate": 5.156572814490222e-07, + "loss": 0.0184, + "num_input_tokens_seen": 35095728, + "step": 17926 + }, + { + "epoch": 2.3760106030483765, + "grad_norm": 16.32256317138672, + "learning_rate": 5.15446150494561e-07, + "loss": 0.219, + "num_input_tokens_seen": 35098104, + "step": 17927 + }, + { + "epoch": 2.3761431411530816, + "grad_norm": 9.956915855407715, + "learning_rate": 5.15235057803668e-07, + "loss": 0.1266, + "num_input_tokens_seen": 35099584, + "step": 17928 + }, + { + "epoch": 2.3762756792577866, + "grad_norm": 0.011720165610313416, + "learning_rate": 5.150240033804116e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35100448, + "step": 17929 + }, + { + "epoch": 2.3764082173624916, + "grad_norm": 29.025320053100586, + "learning_rate": 5.148129872288624e-07, + "loss": 0.2877, + "num_input_tokens_seen": 35102896, + "step": 17930 + }, + { + "epoch": 2.3765407554671967, + "grad_norm": 4.745757579803467, + "learning_rate": 5.146020093530882e-07, + "loss": 0.068, + "num_input_tokens_seen": 35104752, + "step": 17931 + }, + { + "epoch": 2.376673293571902, + "grad_norm": 5.560002326965332, + "learning_rate": 5.143910697571571e-07, + "loss": 0.0428, + "num_input_tokens_seen": 35106720, + "step": 17932 + }, + { + "epoch": 2.376805831676607, + "grad_norm": 0.0006894422695040703, + "learning_rate": 5.141801684451353e-07, + "loss": 0.0, + "num_input_tokens_seen": 35108136, + "step": 17933 + }, + { + "epoch": 2.3769383697813122, + "grad_norm": 0.004063890781253576, + "learning_rate": 5.139693054210895e-07, + "loss": 0.0, + "num_input_tokens_seen": 35109344, + "step": 17934 + }, + { + "epoch": 2.3770709078860173, + "grad_norm": 2.4895477294921875, + "learning_rate": 5.137584806890866e-07, + "loss": 0.0061, + "num_input_tokens_seen": 35111176, + "step": 17935 + }, + { + "epoch": 2.3772034459907223, + "grad_norm": 0.04672565683722496, + "learning_rate": 5.135476942531903e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35113120, + "step": 17936 + }, + { + "epoch": 2.3773359840954273, + "grad_norm": 11.556010246276855, + "learning_rate": 5.133369461174647e-07, + "loss": 0.1202, + "num_input_tokens_seen": 35115736, + "step": 17937 + }, + { + "epoch": 2.3774685222001324, + "grad_norm": 1.7991528511047363, + "learning_rate": 5.131262362859738e-07, + "loss": 0.0392, + "num_input_tokens_seen": 35117408, + "step": 17938 + }, + { + "epoch": 2.377601060304838, + "grad_norm": 12.910868644714355, + "learning_rate": 5.129155647627787e-07, + "loss": 0.1214, + "num_input_tokens_seen": 35119496, + "step": 17939 + }, + { + "epoch": 2.377733598409543, + "grad_norm": 0.011820790357887745, + "learning_rate": 5.127049315519422e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35121976, + "step": 17940 + }, + { + "epoch": 2.377866136514248, + "grad_norm": 5.301743984222412, + "learning_rate": 5.124943366575263e-07, + "loss": 0.0559, + "num_input_tokens_seen": 35124528, + "step": 17941 + }, + { + "epoch": 2.377998674618953, + "grad_norm": 0.08360441029071808, + "learning_rate": 5.122837800835909e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35126296, + "step": 17942 + }, + { + "epoch": 2.378131212723658, + "grad_norm": 0.007838529534637928, + "learning_rate": 5.120732618341956e-07, + "loss": 0.0, + "num_input_tokens_seen": 35128448, + "step": 17943 + }, + { + "epoch": 2.378263750828363, + "grad_norm": 3.1454050540924072, + "learning_rate": 5.118627819133981e-07, + "loss": 0.0371, + "num_input_tokens_seen": 35130176, + "step": 17944 + }, + { + "epoch": 2.378396288933068, + "grad_norm": 0.10389475524425507, + "learning_rate": 5.116523403252588e-07, + "loss": 0.0005, + "num_input_tokens_seen": 35132152, + "step": 17945 + }, + { + "epoch": 2.3785288270377736, + "grad_norm": 1.531826138496399, + "learning_rate": 5.11441937073833e-07, + "loss": 0.0071, + "num_input_tokens_seen": 35134392, + "step": 17946 + }, + { + "epoch": 2.3786613651424786, + "grad_norm": 9.040937423706055, + "learning_rate": 5.112315721631795e-07, + "loss": 0.0807, + "num_input_tokens_seen": 35136856, + "step": 17947 + }, + { + "epoch": 2.3787939032471836, + "grad_norm": 0.019851993769407272, + "learning_rate": 5.110212455973532e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35138552, + "step": 17948 + }, + { + "epoch": 2.3789264413518887, + "grad_norm": 2.7148897647857666, + "learning_rate": 5.108109573804085e-07, + "loss": 0.0255, + "num_input_tokens_seen": 35140792, + "step": 17949 + }, + { + "epoch": 2.3790589794565937, + "grad_norm": 0.0018293705070391297, + "learning_rate": 5.106007075164018e-07, + "loss": 0.0, + "num_input_tokens_seen": 35141992, + "step": 17950 + }, + { + "epoch": 2.3791915175612988, + "grad_norm": 10.630107879638672, + "learning_rate": 5.10390496009385e-07, + "loss": 0.1317, + "num_input_tokens_seen": 35144960, + "step": 17951 + }, + { + "epoch": 2.379324055666004, + "grad_norm": 0.5853550434112549, + "learning_rate": 5.101803228634127e-07, + "loss": 0.004, + "num_input_tokens_seen": 35146760, + "step": 17952 + }, + { + "epoch": 2.3794565937707093, + "grad_norm": 2.4751431941986084, + "learning_rate": 5.099701880825366e-07, + "loss": 0.0169, + "num_input_tokens_seen": 35148960, + "step": 17953 + }, + { + "epoch": 2.3795891318754143, + "grad_norm": 15.273058891296387, + "learning_rate": 5.097600916708081e-07, + "loss": 0.1693, + "num_input_tokens_seen": 35150912, + "step": 17954 + }, + { + "epoch": 2.3797216699801194, + "grad_norm": 0.010250061750411987, + "learning_rate": 5.095500336322772e-07, + "loss": 0.0, + "num_input_tokens_seen": 35152320, + "step": 17955 + }, + { + "epoch": 2.3798542080848244, + "grad_norm": 0.0028061058837920427, + "learning_rate": 5.093400139709956e-07, + "loss": 0.0, + "num_input_tokens_seen": 35154024, + "step": 17956 + }, + { + "epoch": 2.3799867461895294, + "grad_norm": 0.008211425505578518, + "learning_rate": 5.091300326910112e-07, + "loss": 0.0, + "num_input_tokens_seen": 35155424, + "step": 17957 + }, + { + "epoch": 2.3801192842942345, + "grad_norm": 6.763659954071045, + "learning_rate": 5.089200897963736e-07, + "loss": 0.054, + "num_input_tokens_seen": 35157520, + "step": 17958 + }, + { + "epoch": 2.3802518223989395, + "grad_norm": 0.00388971040956676, + "learning_rate": 5.087101852911305e-07, + "loss": 0.0, + "num_input_tokens_seen": 35159736, + "step": 17959 + }, + { + "epoch": 2.380384360503645, + "grad_norm": 0.019698435440659523, + "learning_rate": 5.085003191793281e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35162384, + "step": 17960 + }, + { + "epoch": 2.38051689860835, + "grad_norm": 8.145330429077148, + "learning_rate": 5.082904914650139e-07, + "loss": 0.0431, + "num_input_tokens_seen": 35164688, + "step": 17961 + }, + { + "epoch": 2.380649436713055, + "grad_norm": 27.213287353515625, + "learning_rate": 5.080807021522332e-07, + "loss": 0.317, + "num_input_tokens_seen": 35167208, + "step": 17962 + }, + { + "epoch": 2.38078197481776, + "grad_norm": 1.1561954021453857, + "learning_rate": 5.0787095124503e-07, + "loss": 0.0031, + "num_input_tokens_seen": 35168752, + "step": 17963 + }, + { + "epoch": 2.380914512922465, + "grad_norm": 5.090138912200928, + "learning_rate": 5.076612387474497e-07, + "loss": 0.0269, + "num_input_tokens_seen": 35170752, + "step": 17964 + }, + { + "epoch": 2.38104705102717, + "grad_norm": 7.058280944824219, + "learning_rate": 5.074515646635348e-07, + "loss": 0.169, + "num_input_tokens_seen": 35172608, + "step": 17965 + }, + { + "epoch": 2.381179589131875, + "grad_norm": 0.06047163903713226, + "learning_rate": 5.072419289973285e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35174192, + "step": 17966 + }, + { + "epoch": 2.3813121272365807, + "grad_norm": 9.287342071533203, + "learning_rate": 5.07032331752873e-07, + "loss": 0.057, + "num_input_tokens_seen": 35175912, + "step": 17967 + }, + { + "epoch": 2.3814446653412857, + "grad_norm": 0.004644722677767277, + "learning_rate": 5.068227729342087e-07, + "loss": 0.0, + "num_input_tokens_seen": 35177384, + "step": 17968 + }, + { + "epoch": 2.3815772034459908, + "grad_norm": 2.6040971279144287, + "learning_rate": 5.06613252545376e-07, + "loss": 0.0165, + "num_input_tokens_seen": 35179480, + "step": 17969 + }, + { + "epoch": 2.381709741550696, + "grad_norm": 0.4529944360256195, + "learning_rate": 5.064037705904145e-07, + "loss": 0.0014, + "num_input_tokens_seen": 35180792, + "step": 17970 + }, + { + "epoch": 2.381842279655401, + "grad_norm": 0.011084871366620064, + "learning_rate": 5.061943270733647e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35183032, + "step": 17971 + }, + { + "epoch": 2.381974817760106, + "grad_norm": 0.0010478443000465631, + "learning_rate": 5.059849219982637e-07, + "loss": 0.0, + "num_input_tokens_seen": 35184392, + "step": 17972 + }, + { + "epoch": 2.382107355864811, + "grad_norm": 0.01291678473353386, + "learning_rate": 5.057755553691488e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35186064, + "step": 17973 + }, + { + "epoch": 2.3822398939695164, + "grad_norm": 0.2940012216567993, + "learning_rate": 5.055662271900571e-07, + "loss": 0.0018, + "num_input_tokens_seen": 35188208, + "step": 17974 + }, + { + "epoch": 2.3823724320742214, + "grad_norm": 0.0004471022402867675, + "learning_rate": 5.053569374650239e-07, + "loss": 0.0, + "num_input_tokens_seen": 35189328, + "step": 17975 + }, + { + "epoch": 2.3825049701789265, + "grad_norm": 0.07502160221338272, + "learning_rate": 5.051476861980847e-07, + "loss": 0.0004, + "num_input_tokens_seen": 35191320, + "step": 17976 + }, + { + "epoch": 2.3826375082836315, + "grad_norm": 2.128720283508301, + "learning_rate": 5.04938473393275e-07, + "loss": 0.0076, + "num_input_tokens_seen": 35193352, + "step": 17977 + }, + { + "epoch": 2.3827700463883366, + "grad_norm": 1.0366307497024536, + "learning_rate": 5.047292990546279e-07, + "loss": 0.0035, + "num_input_tokens_seen": 35195296, + "step": 17978 + }, + { + "epoch": 2.3829025844930416, + "grad_norm": 18.773576736450195, + "learning_rate": 5.045201631861762e-07, + "loss": 0.1827, + "num_input_tokens_seen": 35196928, + "step": 17979 + }, + { + "epoch": 2.3830351225977466, + "grad_norm": 0.2759280502796173, + "learning_rate": 5.04311065791952e-07, + "loss": 0.0007, + "num_input_tokens_seen": 35198584, + "step": 17980 + }, + { + "epoch": 2.383167660702452, + "grad_norm": 0.12578228116035461, + "learning_rate": 5.041020068759872e-07, + "loss": 0.001, + "num_input_tokens_seen": 35200152, + "step": 17981 + }, + { + "epoch": 2.383300198807157, + "grad_norm": 5.767873287200928, + "learning_rate": 5.038929864423131e-07, + "loss": 0.0657, + "num_input_tokens_seen": 35201896, + "step": 17982 + }, + { + "epoch": 2.383432736911862, + "grad_norm": 1.2965185642242432, + "learning_rate": 5.036840044949593e-07, + "loss": 0.0065, + "num_input_tokens_seen": 35204400, + "step": 17983 + }, + { + "epoch": 2.3835652750165672, + "grad_norm": 0.08428892493247986, + "learning_rate": 5.034750610379552e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35205976, + "step": 17984 + }, + { + "epoch": 2.3836978131212723, + "grad_norm": 0.0033565452322363853, + "learning_rate": 5.03266156075329e-07, + "loss": 0.0, + "num_input_tokens_seen": 35207136, + "step": 17985 + }, + { + "epoch": 2.3838303512259773, + "grad_norm": 20.848039627075195, + "learning_rate": 5.030572896111083e-07, + "loss": 0.4158, + "num_input_tokens_seen": 35209056, + "step": 17986 + }, + { + "epoch": 2.3839628893306823, + "grad_norm": 0.0039758263155817986, + "learning_rate": 5.028484616493206e-07, + "loss": 0.0, + "num_input_tokens_seen": 35210576, + "step": 17987 + }, + { + "epoch": 2.384095427435388, + "grad_norm": 0.051597170531749725, + "learning_rate": 5.02639672193993e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35213472, + "step": 17988 + }, + { + "epoch": 2.384227965540093, + "grad_norm": 7.71702241897583, + "learning_rate": 5.024309212491507e-07, + "loss": 0.1154, + "num_input_tokens_seen": 35214656, + "step": 17989 + }, + { + "epoch": 2.384360503644798, + "grad_norm": 0.05339091643691063, + "learning_rate": 5.022222088188181e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35216120, + "step": 17990 + }, + { + "epoch": 2.384493041749503, + "grad_norm": 4.701279163360596, + "learning_rate": 5.020135349070188e-07, + "loss": 0.0208, + "num_input_tokens_seen": 35217824, + "step": 17991 + }, + { + "epoch": 2.384625579854208, + "grad_norm": 0.05870716646313667, + "learning_rate": 5.018048995177774e-07, + "loss": 0.0004, + "num_input_tokens_seen": 35220152, + "step": 17992 + }, + { + "epoch": 2.384758117958913, + "grad_norm": 0.023498330265283585, + "learning_rate": 5.015963026551154e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35221680, + "step": 17993 + }, + { + "epoch": 2.384890656063618, + "grad_norm": 0.011212069541215897, + "learning_rate": 5.013877443230563e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35222912, + "step": 17994 + }, + { + "epoch": 2.3850231941683235, + "grad_norm": 4.699239253997803, + "learning_rate": 5.0117922452562e-07, + "loss": 0.0257, + "num_input_tokens_seen": 35224592, + "step": 17995 + }, + { + "epoch": 2.3851557322730286, + "grad_norm": 0.34986379742622375, + "learning_rate": 5.009707432668265e-07, + "loss": 0.0026, + "num_input_tokens_seen": 35227224, + "step": 17996 + }, + { + "epoch": 2.3852882703777336, + "grad_norm": 0.06769727170467377, + "learning_rate": 5.007623005506971e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35229136, + "step": 17997 + }, + { + "epoch": 2.3854208084824386, + "grad_norm": 0.008928444236516953, + "learning_rate": 5.005538963812493e-07, + "loss": 0.0, + "num_input_tokens_seen": 35230864, + "step": 17998 + }, + { + "epoch": 2.3855533465871437, + "grad_norm": 1.4347572326660156, + "learning_rate": 5.003455307625013e-07, + "loss": 0.0038, + "num_input_tokens_seen": 35232392, + "step": 17999 + }, + { + "epoch": 2.3856858846918487, + "grad_norm": 0.0009519436280243099, + "learning_rate": 5.001372036984717e-07, + "loss": 0.0, + "num_input_tokens_seen": 35233888, + "step": 18000 + }, + { + "epoch": 2.3858184227965538, + "grad_norm": 0.19126462936401367, + "learning_rate": 4.999289151931755e-07, + "loss": 0.0016, + "num_input_tokens_seen": 35235856, + "step": 18001 + }, + { + "epoch": 2.3859509609012592, + "grad_norm": 1.900070071220398, + "learning_rate": 4.997206652506303e-07, + "loss": 0.0222, + "num_input_tokens_seen": 35238312, + "step": 18002 + }, + { + "epoch": 2.3860834990059643, + "grad_norm": 0.0020103466231375933, + "learning_rate": 4.995124538748508e-07, + "loss": 0.0, + "num_input_tokens_seen": 35239352, + "step": 18003 + }, + { + "epoch": 2.3862160371106693, + "grad_norm": 3.927797317504883, + "learning_rate": 4.993042810698507e-07, + "loss": 0.0412, + "num_input_tokens_seen": 35241336, + "step": 18004 + }, + { + "epoch": 2.3863485752153744, + "grad_norm": 6.308338642120361, + "learning_rate": 4.990961468396446e-07, + "loss": 0.0379, + "num_input_tokens_seen": 35243040, + "step": 18005 + }, + { + "epoch": 2.3864811133200794, + "grad_norm": 0.0027931940276175737, + "learning_rate": 4.988880511882447e-07, + "loss": 0.0, + "num_input_tokens_seen": 35244544, + "step": 18006 + }, + { + "epoch": 2.3866136514247844, + "grad_norm": 0.0005692624254152179, + "learning_rate": 4.986799941196643e-07, + "loss": 0.0, + "num_input_tokens_seen": 35245904, + "step": 18007 + }, + { + "epoch": 2.38674618952949, + "grad_norm": 0.8246408700942993, + "learning_rate": 4.984719756379141e-07, + "loss": 0.0058, + "num_input_tokens_seen": 35247760, + "step": 18008 + }, + { + "epoch": 2.386878727634195, + "grad_norm": 4.985980033874512, + "learning_rate": 4.98263995747005e-07, + "loss": 0.1092, + "num_input_tokens_seen": 35250064, + "step": 18009 + }, + { + "epoch": 2.3870112657389, + "grad_norm": 8.307755470275879, + "learning_rate": 4.980560544509467e-07, + "loss": 0.0233, + "num_input_tokens_seen": 35251792, + "step": 18010 + }, + { + "epoch": 2.387143803843605, + "grad_norm": 2.374363899230957, + "learning_rate": 4.97848151753749e-07, + "loss": 0.0041, + "num_input_tokens_seen": 35253312, + "step": 18011 + }, + { + "epoch": 2.38727634194831, + "grad_norm": 0.2714846432209015, + "learning_rate": 4.976402876594196e-07, + "loss": 0.0025, + "num_input_tokens_seen": 35255408, + "step": 18012 + }, + { + "epoch": 2.387408880053015, + "grad_norm": 1.2457352876663208, + "learning_rate": 4.974324621719676e-07, + "loss": 0.0015, + "num_input_tokens_seen": 35256760, + "step": 18013 + }, + { + "epoch": 2.3875414181577206, + "grad_norm": 0.08192801475524902, + "learning_rate": 4.972246752953991e-07, + "loss": 0.0006, + "num_input_tokens_seen": 35258120, + "step": 18014 + }, + { + "epoch": 2.3876739562624256, + "grad_norm": 7.829074382781982, + "learning_rate": 4.970169270337205e-07, + "loss": 0.1039, + "num_input_tokens_seen": 35260672, + "step": 18015 + }, + { + "epoch": 2.3878064943671307, + "grad_norm": 0.00040108885150402784, + "learning_rate": 4.968092173909367e-07, + "loss": 0.0, + "num_input_tokens_seen": 35261984, + "step": 18016 + }, + { + "epoch": 2.3879390324718357, + "grad_norm": 0.026104629039764404, + "learning_rate": 4.96601546371053e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35263496, + "step": 18017 + }, + { + "epoch": 2.3880715705765407, + "grad_norm": 1.5895689725875854, + "learning_rate": 4.963939139780743e-07, + "loss": 0.0134, + "num_input_tokens_seen": 35265152, + "step": 18018 + }, + { + "epoch": 2.3882041086812458, + "grad_norm": 0.054714106023311615, + "learning_rate": 4.961863202160033e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35267224, + "step": 18019 + }, + { + "epoch": 2.388336646785951, + "grad_norm": 0.39415812492370605, + "learning_rate": 4.959787650888423e-07, + "loss": 0.002, + "num_input_tokens_seen": 35268608, + "step": 18020 + }, + { + "epoch": 2.3884691848906563, + "grad_norm": 6.771104335784912, + "learning_rate": 4.957712486005933e-07, + "loss": 0.0345, + "num_input_tokens_seen": 35271072, + "step": 18021 + }, + { + "epoch": 2.3886017229953613, + "grad_norm": 0.0914878249168396, + "learning_rate": 4.955637707552565e-07, + "loss": 0.0006, + "num_input_tokens_seen": 35274304, + "step": 18022 + }, + { + "epoch": 2.3887342611000664, + "grad_norm": 12.44843578338623, + "learning_rate": 4.95356331556833e-07, + "loss": 0.1105, + "num_input_tokens_seen": 35276080, + "step": 18023 + }, + { + "epoch": 2.3888667992047714, + "grad_norm": 0.018801383674144745, + "learning_rate": 4.951489310093229e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35278400, + "step": 18024 + }, + { + "epoch": 2.3889993373094764, + "grad_norm": 1.5168118476867676, + "learning_rate": 4.949415691167248e-07, + "loss": 0.0072, + "num_input_tokens_seen": 35281000, + "step": 18025 + }, + { + "epoch": 2.3891318754141815, + "grad_norm": 1.2630919218063354, + "learning_rate": 4.947342458830362e-07, + "loss": 0.0035, + "num_input_tokens_seen": 35283512, + "step": 18026 + }, + { + "epoch": 2.3892644135188865, + "grad_norm": 6.789336204528809, + "learning_rate": 4.945269613122541e-07, + "loss": 0.0633, + "num_input_tokens_seen": 35285888, + "step": 18027 + }, + { + "epoch": 2.389396951623592, + "grad_norm": 3.4781599044799805, + "learning_rate": 4.943197154083765e-07, + "loss": 0.0615, + "num_input_tokens_seen": 35287712, + "step": 18028 + }, + { + "epoch": 2.389529489728297, + "grad_norm": 3.359518527984619, + "learning_rate": 4.941125081753976e-07, + "loss": 0.0166, + "num_input_tokens_seen": 35289416, + "step": 18029 + }, + { + "epoch": 2.389662027833002, + "grad_norm": 0.02825923264026642, + "learning_rate": 4.93905339617314e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35291576, + "step": 18030 + }, + { + "epoch": 2.389794565937707, + "grad_norm": 0.02149035781621933, + "learning_rate": 4.936982097381193e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35293720, + "step": 18031 + }, + { + "epoch": 2.389927104042412, + "grad_norm": 1.1241884231567383, + "learning_rate": 4.934911185418067e-07, + "loss": 0.0117, + "num_input_tokens_seen": 35295640, + "step": 18032 + }, + { + "epoch": 2.390059642147117, + "grad_norm": 1.6864402294158936, + "learning_rate": 4.932840660323704e-07, + "loss": 0.0055, + "num_input_tokens_seen": 35298008, + "step": 18033 + }, + { + "epoch": 2.3901921802518222, + "grad_norm": 0.28321102261543274, + "learning_rate": 4.930770522138006e-07, + "loss": 0.0007, + "num_input_tokens_seen": 35300616, + "step": 18034 + }, + { + "epoch": 2.3903247183565277, + "grad_norm": 2.5125503540039062, + "learning_rate": 4.928700770900907e-07, + "loss": 0.0148, + "num_input_tokens_seen": 35302488, + "step": 18035 + }, + { + "epoch": 2.3904572564612327, + "grad_norm": 0.0071412259712815285, + "learning_rate": 4.926631406652302e-07, + "loss": 0.0, + "num_input_tokens_seen": 35304448, + "step": 18036 + }, + { + "epoch": 2.390589794565938, + "grad_norm": 1.1198303699493408, + "learning_rate": 4.924562429432087e-07, + "loss": 0.002, + "num_input_tokens_seen": 35306392, + "step": 18037 + }, + { + "epoch": 2.390722332670643, + "grad_norm": 10.347325325012207, + "learning_rate": 4.922493839280162e-07, + "loss": 0.114, + "num_input_tokens_seen": 35308816, + "step": 18038 + }, + { + "epoch": 2.390854870775348, + "grad_norm": 5.873945236206055, + "learning_rate": 4.920425636236406e-07, + "loss": 0.0505, + "num_input_tokens_seen": 35311312, + "step": 18039 + }, + { + "epoch": 2.390987408880053, + "grad_norm": 14.954638481140137, + "learning_rate": 4.918357820340691e-07, + "loss": 0.08, + "num_input_tokens_seen": 35313320, + "step": 18040 + }, + { + "epoch": 2.391119946984758, + "grad_norm": 0.0013999970396980643, + "learning_rate": 4.916290391632897e-07, + "loss": 0.0, + "num_input_tokens_seen": 35314648, + "step": 18041 + }, + { + "epoch": 2.3912524850894634, + "grad_norm": 6.466757297515869, + "learning_rate": 4.914223350152878e-07, + "loss": 0.089, + "num_input_tokens_seen": 35316752, + "step": 18042 + }, + { + "epoch": 2.3913850231941685, + "grad_norm": 11.982085227966309, + "learning_rate": 4.912156695940484e-07, + "loss": 0.2459, + "num_input_tokens_seen": 35319048, + "step": 18043 + }, + { + "epoch": 2.3915175612988735, + "grad_norm": 8.030454635620117, + "learning_rate": 4.910090429035575e-07, + "loss": 0.0442, + "num_input_tokens_seen": 35321152, + "step": 18044 + }, + { + "epoch": 2.3916500994035785, + "grad_norm": 1.0896844863891602, + "learning_rate": 4.90802454947798e-07, + "loss": 0.006, + "num_input_tokens_seen": 35322944, + "step": 18045 + }, + { + "epoch": 2.3917826375082836, + "grad_norm": 0.01842232048511505, + "learning_rate": 4.905959057307525e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35325872, + "step": 18046 + }, + { + "epoch": 2.3919151756129886, + "grad_norm": 2.0054469108581543, + "learning_rate": 4.903893952564049e-07, + "loss": 0.0131, + "num_input_tokens_seen": 35328032, + "step": 18047 + }, + { + "epoch": 2.3920477137176936, + "grad_norm": 5.685820579528809, + "learning_rate": 4.901829235287356e-07, + "loss": 0.0998, + "num_input_tokens_seen": 35330712, + "step": 18048 + }, + { + "epoch": 2.392180251822399, + "grad_norm": 0.6882884502410889, + "learning_rate": 4.899764905517265e-07, + "loss": 0.0009, + "num_input_tokens_seen": 35332624, + "step": 18049 + }, + { + "epoch": 2.392312789927104, + "grad_norm": 0.051814790815114975, + "learning_rate": 4.897700963293572e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35334872, + "step": 18050 + }, + { + "epoch": 2.392445328031809, + "grad_norm": 0.00880675483494997, + "learning_rate": 4.895637408656075e-07, + "loss": 0.0, + "num_input_tokens_seen": 35338232, + "step": 18051 + }, + { + "epoch": 2.3925778661365142, + "grad_norm": 0.015536037273705006, + "learning_rate": 4.89357424164455e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35340032, + "step": 18052 + }, + { + "epoch": 2.3927104042412193, + "grad_norm": 6.628492832183838, + "learning_rate": 4.891511462298784e-07, + "loss": 0.0855, + "num_input_tokens_seen": 35342624, + "step": 18053 + }, + { + "epoch": 2.3928429423459243, + "grad_norm": 8.933393478393555, + "learning_rate": 4.889449070658558e-07, + "loss": 0.1109, + "num_input_tokens_seen": 35344496, + "step": 18054 + }, + { + "epoch": 2.3929754804506294, + "grad_norm": 0.040000446140766144, + "learning_rate": 4.887387066763625e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35346080, + "step": 18055 + }, + { + "epoch": 2.393108018555335, + "grad_norm": 0.0006348509923554957, + "learning_rate": 4.885325450653747e-07, + "loss": 0.0, + "num_input_tokens_seen": 35347320, + "step": 18056 + }, + { + "epoch": 2.39324055666004, + "grad_norm": 2.0450828075408936, + "learning_rate": 4.883264222368661e-07, + "loss": 0.0125, + "num_input_tokens_seen": 35349328, + "step": 18057 + }, + { + "epoch": 2.393373094764745, + "grad_norm": 2.423931837081909, + "learning_rate": 4.881203381948121e-07, + "loss": 0.0036, + "num_input_tokens_seen": 35351672, + "step": 18058 + }, + { + "epoch": 2.39350563286945, + "grad_norm": 7.251317501068115, + "learning_rate": 4.879142929431863e-07, + "loss": 0.0996, + "num_input_tokens_seen": 35353376, + "step": 18059 + }, + { + "epoch": 2.393638170974155, + "grad_norm": 0.002008693525567651, + "learning_rate": 4.877082864859611e-07, + "loss": 0.0, + "num_input_tokens_seen": 35354744, + "step": 18060 + }, + { + "epoch": 2.39377070907886, + "grad_norm": 0.0003734497295226902, + "learning_rate": 4.875023188271085e-07, + "loss": 0.0, + "num_input_tokens_seen": 35356232, + "step": 18061 + }, + { + "epoch": 2.393903247183565, + "grad_norm": 0.4390147626399994, + "learning_rate": 4.872963899705996e-07, + "loss": 0.0021, + "num_input_tokens_seen": 35358208, + "step": 18062 + }, + { + "epoch": 2.3940357852882705, + "grad_norm": 0.10972969979047775, + "learning_rate": 4.87090499920404e-07, + "loss": 0.0004, + "num_input_tokens_seen": 35359712, + "step": 18063 + }, + { + "epoch": 2.3941683233929756, + "grad_norm": 0.0017021679086610675, + "learning_rate": 4.86884648680492e-07, + "loss": 0.0, + "num_input_tokens_seen": 35361240, + "step": 18064 + }, + { + "epoch": 2.3943008614976806, + "grad_norm": 0.0034408760257065296, + "learning_rate": 4.866788362548336e-07, + "loss": 0.0, + "num_input_tokens_seen": 35363384, + "step": 18065 + }, + { + "epoch": 2.3944333996023857, + "grad_norm": 6.084743499755859, + "learning_rate": 4.864730626473962e-07, + "loss": 0.0302, + "num_input_tokens_seen": 35365480, + "step": 18066 + }, + { + "epoch": 2.3945659377070907, + "grad_norm": 0.0009702668758109212, + "learning_rate": 4.862673278621472e-07, + "loss": 0.0, + "num_input_tokens_seen": 35366976, + "step": 18067 + }, + { + "epoch": 2.3946984758117957, + "grad_norm": 0.8273171782493591, + "learning_rate": 4.860616319030534e-07, + "loss": 0.0027, + "num_input_tokens_seen": 35368600, + "step": 18068 + }, + { + "epoch": 2.3948310139165008, + "grad_norm": 13.322151184082031, + "learning_rate": 4.858559747740798e-07, + "loss": 0.2269, + "num_input_tokens_seen": 35371112, + "step": 18069 + }, + { + "epoch": 2.3949635520212063, + "grad_norm": 0.0008569032070226967, + "learning_rate": 4.856503564791926e-07, + "loss": 0.0, + "num_input_tokens_seen": 35372480, + "step": 18070 + }, + { + "epoch": 2.3950960901259113, + "grad_norm": 10.47722339630127, + "learning_rate": 4.85444777022357e-07, + "loss": 0.1041, + "num_input_tokens_seen": 35374320, + "step": 18071 + }, + { + "epoch": 2.3952286282306163, + "grad_norm": 4.897650718688965, + "learning_rate": 4.852392364075357e-07, + "loss": 0.0603, + "num_input_tokens_seen": 35376208, + "step": 18072 + }, + { + "epoch": 2.3953611663353214, + "grad_norm": 0.1023603081703186, + "learning_rate": 4.850337346386916e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35377888, + "step": 18073 + }, + { + "epoch": 2.3954937044400264, + "grad_norm": 0.007483744993805885, + "learning_rate": 4.848282717197866e-07, + "loss": 0.0, + "num_input_tokens_seen": 35380432, + "step": 18074 + }, + { + "epoch": 2.3956262425447314, + "grad_norm": 15.703322410583496, + "learning_rate": 4.846228476547837e-07, + "loss": 0.213, + "num_input_tokens_seen": 35381888, + "step": 18075 + }, + { + "epoch": 2.3957587806494365, + "grad_norm": 0.016008207574486732, + "learning_rate": 4.844174624476417e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35383520, + "step": 18076 + }, + { + "epoch": 2.395891318754142, + "grad_norm": 0.029001623392105103, + "learning_rate": 4.842121161023222e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35384928, + "step": 18077 + }, + { + "epoch": 2.396023856858847, + "grad_norm": 0.0028256913647055626, + "learning_rate": 4.840068086227837e-07, + "loss": 0.0, + "num_input_tokens_seen": 35386688, + "step": 18078 + }, + { + "epoch": 2.396156394963552, + "grad_norm": 0.0021167208906263113, + "learning_rate": 4.838015400129842e-07, + "loss": 0.0, + "num_input_tokens_seen": 35387960, + "step": 18079 + }, + { + "epoch": 2.396288933068257, + "grad_norm": 0.008040952496230602, + "learning_rate": 4.835963102768823e-07, + "loss": 0.0, + "num_input_tokens_seen": 35389696, + "step": 18080 + }, + { + "epoch": 2.396421471172962, + "grad_norm": 5.167471408843994, + "learning_rate": 4.83391119418434e-07, + "loss": 0.0479, + "num_input_tokens_seen": 35391584, + "step": 18081 + }, + { + "epoch": 2.396554009277667, + "grad_norm": 0.015606870874762535, + "learning_rate": 4.83185967441597e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35393152, + "step": 18082 + }, + { + "epoch": 2.396686547382372, + "grad_norm": 0.0010417028097435832, + "learning_rate": 4.829808543503256e-07, + "loss": 0.0, + "num_input_tokens_seen": 35394720, + "step": 18083 + }, + { + "epoch": 2.3968190854870777, + "grad_norm": 12.353904724121094, + "learning_rate": 4.827757801485744e-07, + "loss": 0.3407, + "num_input_tokens_seen": 35396064, + "step": 18084 + }, + { + "epoch": 2.3969516235917827, + "grad_norm": 0.0026123649440705776, + "learning_rate": 4.825707448402983e-07, + "loss": 0.0, + "num_input_tokens_seen": 35397248, + "step": 18085 + }, + { + "epoch": 2.3970841616964877, + "grad_norm": 9.295557022094727, + "learning_rate": 4.8236574842945e-07, + "loss": 0.1123, + "num_input_tokens_seen": 35398992, + "step": 18086 + }, + { + "epoch": 2.397216699801193, + "grad_norm": 0.011044790036976337, + "learning_rate": 4.821607909199813e-07, + "loss": 0.0, + "num_input_tokens_seen": 35400512, + "step": 18087 + }, + { + "epoch": 2.397349237905898, + "grad_norm": 12.825777053833008, + "learning_rate": 4.819558723158452e-07, + "loss": 0.0981, + "num_input_tokens_seen": 35401960, + "step": 18088 + }, + { + "epoch": 2.397481776010603, + "grad_norm": 2.2096049785614014, + "learning_rate": 4.817509926209915e-07, + "loss": 0.0071, + "num_input_tokens_seen": 35404072, + "step": 18089 + }, + { + "epoch": 2.3976143141153083, + "grad_norm": 3.461277961730957, + "learning_rate": 4.815461518393713e-07, + "loss": 0.0305, + "num_input_tokens_seen": 35406176, + "step": 18090 + }, + { + "epoch": 2.3977468522200134, + "grad_norm": 0.8326310515403748, + "learning_rate": 4.813413499749342e-07, + "loss": 0.0053, + "num_input_tokens_seen": 35407744, + "step": 18091 + }, + { + "epoch": 2.3978793903247184, + "grad_norm": 0.1329413205385208, + "learning_rate": 4.811365870316281e-07, + "loss": 0.0008, + "num_input_tokens_seen": 35409624, + "step": 18092 + }, + { + "epoch": 2.3980119284294235, + "grad_norm": 0.04257022216916084, + "learning_rate": 4.809318630134011e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35411664, + "step": 18093 + }, + { + "epoch": 2.3981444665341285, + "grad_norm": 0.1821708083152771, + "learning_rate": 4.807271779242004e-07, + "loss": 0.0009, + "num_input_tokens_seen": 35414072, + "step": 18094 + }, + { + "epoch": 2.3982770046388335, + "grad_norm": 5.76358699798584, + "learning_rate": 4.805225317679735e-07, + "loss": 0.0339, + "num_input_tokens_seen": 35416488, + "step": 18095 + }, + { + "epoch": 2.398409542743539, + "grad_norm": 13.017148971557617, + "learning_rate": 4.803179245486653e-07, + "loss": 0.1863, + "num_input_tokens_seen": 35418200, + "step": 18096 + }, + { + "epoch": 2.398542080848244, + "grad_norm": 7.730273723602295, + "learning_rate": 4.801133562702207e-07, + "loss": 0.1846, + "num_input_tokens_seen": 35419816, + "step": 18097 + }, + { + "epoch": 2.398674618952949, + "grad_norm": 2.7729527950286865, + "learning_rate": 4.799088269365842e-07, + "loss": 0.0062, + "num_input_tokens_seen": 35421520, + "step": 18098 + }, + { + "epoch": 2.398807157057654, + "grad_norm": 1.5554373264312744, + "learning_rate": 4.797043365516985e-07, + "loss": 0.003, + "num_input_tokens_seen": 35423816, + "step": 18099 + }, + { + "epoch": 2.398939695162359, + "grad_norm": 0.007829305715858936, + "learning_rate": 4.794998851195067e-07, + "loss": 0.0, + "num_input_tokens_seen": 35426056, + "step": 18100 + }, + { + "epoch": 2.399072233267064, + "grad_norm": 0.003956565633416176, + "learning_rate": 4.792954726439516e-07, + "loss": 0.0, + "num_input_tokens_seen": 35427320, + "step": 18101 + }, + { + "epoch": 2.3992047713717692, + "grad_norm": 0.22089898586273193, + "learning_rate": 4.790910991289741e-07, + "loss": 0.0004, + "num_input_tokens_seen": 35429160, + "step": 18102 + }, + { + "epoch": 2.3993373094764747, + "grad_norm": 0.002144523896276951, + "learning_rate": 4.788867645785142e-07, + "loss": 0.0, + "num_input_tokens_seen": 35430880, + "step": 18103 + }, + { + "epoch": 2.3994698475811798, + "grad_norm": 1.5161532163619995, + "learning_rate": 4.786824689965119e-07, + "loss": 0.0101, + "num_input_tokens_seen": 35434080, + "step": 18104 + }, + { + "epoch": 2.399602385685885, + "grad_norm": 3.8733556270599365, + "learning_rate": 4.784782123869053e-07, + "loss": 0.0914, + "num_input_tokens_seen": 35437784, + "step": 18105 + }, + { + "epoch": 2.39973492379059, + "grad_norm": 0.0008409291040152311, + "learning_rate": 4.782739947536333e-07, + "loss": 0.0, + "num_input_tokens_seen": 35439264, + "step": 18106 + }, + { + "epoch": 2.399867461895295, + "grad_norm": 12.212885856628418, + "learning_rate": 4.780698161006342e-07, + "loss": 0.3253, + "num_input_tokens_seen": 35440960, + "step": 18107 + }, + { + "epoch": 2.4, + "grad_norm": 0.002437872113659978, + "learning_rate": 4.778656764318438e-07, + "loss": 0.0, + "num_input_tokens_seen": 35442304, + "step": 18108 + }, + { + "epoch": 2.400132538104705, + "grad_norm": 4.9556779861450195, + "learning_rate": 4.776615757511979e-07, + "loss": 0.0659, + "num_input_tokens_seen": 35444136, + "step": 18109 + }, + { + "epoch": 2.4002650762094104, + "grad_norm": 0.32914668321609497, + "learning_rate": 4.774575140626317e-07, + "loss": 0.0023, + "num_input_tokens_seen": 35446416, + "step": 18110 + }, + { + "epoch": 2.4003976143141155, + "grad_norm": 0.006213827058672905, + "learning_rate": 4.772534913700796e-07, + "loss": 0.0, + "num_input_tokens_seen": 35449088, + "step": 18111 + }, + { + "epoch": 2.4005301524188205, + "grad_norm": 0.001021114643663168, + "learning_rate": 4.770495076774764e-07, + "loss": 0.0, + "num_input_tokens_seen": 35450408, + "step": 18112 + }, + { + "epoch": 2.4006626905235255, + "grad_norm": 0.0019876467995345592, + "learning_rate": 4.768455629887544e-07, + "loss": 0.0, + "num_input_tokens_seen": 35451696, + "step": 18113 + }, + { + "epoch": 2.4007952286282306, + "grad_norm": 9.154934883117676, + "learning_rate": 4.7664165730784554e-07, + "loss": 0.1141, + "num_input_tokens_seen": 35453632, + "step": 18114 + }, + { + "epoch": 2.4009277667329356, + "grad_norm": 12.785589218139648, + "learning_rate": 4.764377906386805e-07, + "loss": 0.3063, + "num_input_tokens_seen": 35456144, + "step": 18115 + }, + { + "epoch": 2.4010603048376407, + "grad_norm": 0.00505057442933321, + "learning_rate": 4.762339629851917e-07, + "loss": 0.0, + "num_input_tokens_seen": 35457600, + "step": 18116 + }, + { + "epoch": 2.401192842942346, + "grad_norm": 8.348502159118652, + "learning_rate": 4.7603017435130766e-07, + "loss": 0.0453, + "num_input_tokens_seen": 35459672, + "step": 18117 + }, + { + "epoch": 2.401325381047051, + "grad_norm": 18.144914627075195, + "learning_rate": 4.758264247409586e-07, + "loss": 0.1553, + "num_input_tokens_seen": 35461768, + "step": 18118 + }, + { + "epoch": 2.401457919151756, + "grad_norm": 0.031061789020895958, + "learning_rate": 4.756227141580727e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35463224, + "step": 18119 + }, + { + "epoch": 2.4015904572564613, + "grad_norm": 12.244922637939453, + "learning_rate": 4.754190426065766e-07, + "loss": 0.164, + "num_input_tokens_seen": 35465320, + "step": 18120 + }, + { + "epoch": 2.4017229953611663, + "grad_norm": 12.03762149810791, + "learning_rate": 4.7521541009039855e-07, + "loss": 0.1964, + "num_input_tokens_seen": 35467776, + "step": 18121 + }, + { + "epoch": 2.4018555334658713, + "grad_norm": 7.465327262878418, + "learning_rate": 4.7501181661346455e-07, + "loss": 0.126, + "num_input_tokens_seen": 35470080, + "step": 18122 + }, + { + "epoch": 2.4019880715705764, + "grad_norm": 0.03978775069117546, + "learning_rate": 4.748082621796987e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35471816, + "step": 18123 + }, + { + "epoch": 2.402120609675282, + "grad_norm": 7.178983688354492, + "learning_rate": 4.746047467930276e-07, + "loss": 0.0627, + "num_input_tokens_seen": 35474584, + "step": 18124 + }, + { + "epoch": 2.402253147779987, + "grad_norm": 8.820255279541016, + "learning_rate": 4.7440127045737414e-07, + "loss": 0.0103, + "num_input_tokens_seen": 35476768, + "step": 18125 + }, + { + "epoch": 2.402385685884692, + "grad_norm": 13.912646293640137, + "learning_rate": 4.7419783317666087e-07, + "loss": 0.0864, + "num_input_tokens_seen": 35479400, + "step": 18126 + }, + { + "epoch": 2.402518223989397, + "grad_norm": 0.029091836884617805, + "learning_rate": 4.7399443495481145e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35481680, + "step": 18127 + }, + { + "epoch": 2.402650762094102, + "grad_norm": 0.05253088101744652, + "learning_rate": 4.7379107579574726e-07, + "loss": 0.0004, + "num_input_tokens_seen": 35483344, + "step": 18128 + }, + { + "epoch": 2.402783300198807, + "grad_norm": 5.921629905700684, + "learning_rate": 4.73587755703388e-07, + "loss": 0.0234, + "num_input_tokens_seen": 35485528, + "step": 18129 + }, + { + "epoch": 2.402915838303512, + "grad_norm": 0.014086742885410786, + "learning_rate": 4.733844746816557e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35487296, + "step": 18130 + }, + { + "epoch": 2.4030483764082176, + "grad_norm": 2.082362651824951, + "learning_rate": 4.731812327344681e-07, + "loss": 0.0131, + "num_input_tokens_seen": 35490160, + "step": 18131 + }, + { + "epoch": 2.4031809145129226, + "grad_norm": 1.1473093032836914, + "learning_rate": 4.7297802986574495e-07, + "loss": 0.0091, + "num_input_tokens_seen": 35492288, + "step": 18132 + }, + { + "epoch": 2.4033134526176276, + "grad_norm": 0.01222185231745243, + "learning_rate": 4.7277486607940403e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35494320, + "step": 18133 + }, + { + "epoch": 2.4034459907223327, + "grad_norm": 0.04856293648481369, + "learning_rate": 4.725717413793615e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35496248, + "step": 18134 + }, + { + "epoch": 2.4035785288270377, + "grad_norm": 0.06495780497789383, + "learning_rate": 4.723686557695351e-07, + "loss": 0.0004, + "num_input_tokens_seen": 35497768, + "step": 18135 + }, + { + "epoch": 2.4037110669317427, + "grad_norm": 6.831676483154297, + "learning_rate": 4.72165609253839e-07, + "loss": 0.0228, + "num_input_tokens_seen": 35499680, + "step": 18136 + }, + { + "epoch": 2.403843605036448, + "grad_norm": 6.890519142150879, + "learning_rate": 4.7196260183618977e-07, + "loss": 0.0651, + "num_input_tokens_seen": 35502072, + "step": 18137 + }, + { + "epoch": 2.4039761431411533, + "grad_norm": 6.579684257507324, + "learning_rate": 4.7175963352050045e-07, + "loss": 0.1259, + "num_input_tokens_seen": 35503976, + "step": 18138 + }, + { + "epoch": 2.4041086812458583, + "grad_norm": 8.526119232177734, + "learning_rate": 4.7155670431068473e-07, + "loss": 0.1591, + "num_input_tokens_seen": 35506256, + "step": 18139 + }, + { + "epoch": 2.4042412193505633, + "grad_norm": 7.475154399871826, + "learning_rate": 4.713538142106544e-07, + "loss": 0.0607, + "num_input_tokens_seen": 35508368, + "step": 18140 + }, + { + "epoch": 2.4043737574552684, + "grad_norm": 3.8581767082214355, + "learning_rate": 4.711509632243222e-07, + "loss": 0.0513, + "num_input_tokens_seen": 35510272, + "step": 18141 + }, + { + "epoch": 2.4045062955599734, + "grad_norm": 0.49727436900138855, + "learning_rate": 4.709481513555994e-07, + "loss": 0.002, + "num_input_tokens_seen": 35511800, + "step": 18142 + }, + { + "epoch": 2.4046388336646785, + "grad_norm": 4.463983058929443, + "learning_rate": 4.7074537860839626e-07, + "loss": 0.0226, + "num_input_tokens_seen": 35513488, + "step": 18143 + }, + { + "epoch": 2.4047713717693835, + "grad_norm": 0.013124100863933563, + "learning_rate": 4.705426449866221e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35515504, + "step": 18144 + }, + { + "epoch": 2.404903909874089, + "grad_norm": 0.008876031264662743, + "learning_rate": 4.703399504941858e-07, + "loss": 0.0, + "num_input_tokens_seen": 35517288, + "step": 18145 + }, + { + "epoch": 2.405036447978794, + "grad_norm": 4.817626953125, + "learning_rate": 4.7013729513499475e-07, + "loss": 0.028, + "num_input_tokens_seen": 35519216, + "step": 18146 + }, + { + "epoch": 2.405168986083499, + "grad_norm": 0.09164362400770187, + "learning_rate": 4.699346789129569e-07, + "loss": 0.0005, + "num_input_tokens_seen": 35521224, + "step": 18147 + }, + { + "epoch": 2.405301524188204, + "grad_norm": 0.1068929135799408, + "learning_rate": 4.697321018319795e-07, + "loss": 0.0005, + "num_input_tokens_seen": 35522696, + "step": 18148 + }, + { + "epoch": 2.405434062292909, + "grad_norm": 0.0580691322684288, + "learning_rate": 4.6952956389596795e-07, + "loss": 0.0004, + "num_input_tokens_seen": 35524368, + "step": 18149 + }, + { + "epoch": 2.405566600397614, + "grad_norm": 0.045484162867069244, + "learning_rate": 4.693270651088272e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35527416, + "step": 18150 + }, + { + "epoch": 2.405699138502319, + "grad_norm": 6.020226955413818, + "learning_rate": 4.6912460547446083e-07, + "loss": 0.1113, + "num_input_tokens_seen": 35529304, + "step": 18151 + }, + { + "epoch": 2.4058316766070247, + "grad_norm": 12.10814094543457, + "learning_rate": 4.6892218499677374e-07, + "loss": 0.1609, + "num_input_tokens_seen": 35531280, + "step": 18152 + }, + { + "epoch": 2.4059642147117297, + "grad_norm": 0.019423136487603188, + "learning_rate": 4.6871980367966724e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35532672, + "step": 18153 + }, + { + "epoch": 2.4060967528164348, + "grad_norm": 0.3122580647468567, + "learning_rate": 4.685174615270452e-07, + "loss": 0.0009, + "num_input_tokens_seen": 35535104, + "step": 18154 + }, + { + "epoch": 2.40622929092114, + "grad_norm": 4.540923595428467, + "learning_rate": 4.683151585428078e-07, + "loss": 0.0501, + "num_input_tokens_seen": 35536912, + "step": 18155 + }, + { + "epoch": 2.406361829025845, + "grad_norm": 0.8943704962730408, + "learning_rate": 4.6811289473085554e-07, + "loss": 0.0072, + "num_input_tokens_seen": 35538816, + "step": 18156 + }, + { + "epoch": 2.40649436713055, + "grad_norm": 3.107003927230835, + "learning_rate": 4.679106700950881e-07, + "loss": 0.0107, + "num_input_tokens_seen": 35540688, + "step": 18157 + }, + { + "epoch": 2.406626905235255, + "grad_norm": 0.045324575155973434, + "learning_rate": 4.677084846394053e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35542872, + "step": 18158 + }, + { + "epoch": 2.4067594433399604, + "grad_norm": 9.803524017333984, + "learning_rate": 4.675063383677042e-07, + "loss": 0.116, + "num_input_tokens_seen": 35544576, + "step": 18159 + }, + { + "epoch": 2.4068919814446654, + "grad_norm": 3.5478901863098145, + "learning_rate": 4.673042312838838e-07, + "loss": 0.0256, + "num_input_tokens_seen": 35546848, + "step": 18160 + }, + { + "epoch": 2.4070245195493705, + "grad_norm": 5.151865005493164, + "learning_rate": 4.671021633918399e-07, + "loss": 0.0338, + "num_input_tokens_seen": 35548656, + "step": 18161 + }, + { + "epoch": 2.4071570576540755, + "grad_norm": 11.641858100891113, + "learning_rate": 4.6690013469546825e-07, + "loss": 0.0836, + "num_input_tokens_seen": 35550144, + "step": 18162 + }, + { + "epoch": 2.4072895957587805, + "grad_norm": 0.2483617663383484, + "learning_rate": 4.666981451986652e-07, + "loss": 0.0019, + "num_input_tokens_seen": 35552232, + "step": 18163 + }, + { + "epoch": 2.4074221338634856, + "grad_norm": 2.4867842197418213, + "learning_rate": 4.66496194905324e-07, + "loss": 0.0223, + "num_input_tokens_seen": 35553696, + "step": 18164 + }, + { + "epoch": 2.4075546719681906, + "grad_norm": 0.1135510578751564, + "learning_rate": 4.6629428381933963e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35554904, + "step": 18165 + }, + { + "epoch": 2.407687210072896, + "grad_norm": 2.1824302673339844, + "learning_rate": 4.660924119446045e-07, + "loss": 0.0078, + "num_input_tokens_seen": 35556752, + "step": 18166 + }, + { + "epoch": 2.407819748177601, + "grad_norm": 8.605531692504883, + "learning_rate": 4.658905792850099e-07, + "loss": 0.0802, + "num_input_tokens_seen": 35558224, + "step": 18167 + }, + { + "epoch": 2.407952286282306, + "grad_norm": 0.0029591170605272055, + "learning_rate": 4.656887858444492e-07, + "loss": 0.0, + "num_input_tokens_seen": 35559680, + "step": 18168 + }, + { + "epoch": 2.408084824387011, + "grad_norm": 20.020736694335938, + "learning_rate": 4.6548703162681174e-07, + "loss": 0.1175, + "num_input_tokens_seen": 35560680, + "step": 18169 + }, + { + "epoch": 2.4082173624917163, + "grad_norm": 0.012199283577501774, + "learning_rate": 4.652853166359875e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35562544, + "step": 18170 + }, + { + "epoch": 2.4083499005964213, + "grad_norm": 0.05306343734264374, + "learning_rate": 4.650836408758666e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35565168, + "step": 18171 + }, + { + "epoch": 2.4084824387011263, + "grad_norm": 2.9107887744903564, + "learning_rate": 4.648820043503363e-07, + "loss": 0.0169, + "num_input_tokens_seen": 35567296, + "step": 18172 + }, + { + "epoch": 2.408614976805832, + "grad_norm": 0.0007972423336468637, + "learning_rate": 4.6468040706328543e-07, + "loss": 0.0, + "num_input_tokens_seen": 35569464, + "step": 18173 + }, + { + "epoch": 2.408747514910537, + "grad_norm": 0.023977655917406082, + "learning_rate": 4.644788490186003e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35571880, + "step": 18174 + }, + { + "epoch": 2.408880053015242, + "grad_norm": 0.35315513610839844, + "learning_rate": 4.642773302201675e-07, + "loss": 0.0023, + "num_input_tokens_seen": 35573312, + "step": 18175 + }, + { + "epoch": 2.409012591119947, + "grad_norm": 0.27598655223846436, + "learning_rate": 4.640758506718715e-07, + "loss": 0.0015, + "num_input_tokens_seen": 35575416, + "step": 18176 + }, + { + "epoch": 2.409145129224652, + "grad_norm": 0.06557110697031021, + "learning_rate": 4.6387441037759743e-07, + "loss": 0.0006, + "num_input_tokens_seen": 35576904, + "step": 18177 + }, + { + "epoch": 2.409277667329357, + "grad_norm": 0.02413679100573063, + "learning_rate": 4.6367300934123006e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35579200, + "step": 18178 + }, + { + "epoch": 2.4094102054340625, + "grad_norm": 0.004373766016215086, + "learning_rate": 4.6347164756665175e-07, + "loss": 0.0, + "num_input_tokens_seen": 35580864, + "step": 18179 + }, + { + "epoch": 2.4095427435387675, + "grad_norm": 3.9342076778411865, + "learning_rate": 4.6327032505774526e-07, + "loss": 0.0045, + "num_input_tokens_seen": 35582200, + "step": 18180 + }, + { + "epoch": 2.4096752816434726, + "grad_norm": 3.4398348331451416, + "learning_rate": 4.630690418183917e-07, + "loss": 0.0164, + "num_input_tokens_seen": 35584816, + "step": 18181 + }, + { + "epoch": 2.4098078197481776, + "grad_norm": 0.015541591681540012, + "learning_rate": 4.6286779785247176e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35587200, + "step": 18182 + }, + { + "epoch": 2.4099403578528826, + "grad_norm": 0.014683382585644722, + "learning_rate": 4.62666593163866e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35588848, + "step": 18183 + }, + { + "epoch": 2.4100728959575877, + "grad_norm": 4.659676551818848, + "learning_rate": 4.6246542775645444e-07, + "loss": 0.0771, + "num_input_tokens_seen": 35590664, + "step": 18184 + }, + { + "epoch": 2.410205434062293, + "grad_norm": 8.4768705368042, + "learning_rate": 4.6226430163411496e-07, + "loss": 0.119, + "num_input_tokens_seen": 35592512, + "step": 18185 + }, + { + "epoch": 2.410337972166998, + "grad_norm": 0.028431642800569534, + "learning_rate": 4.620632148007256e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35594600, + "step": 18186 + }, + { + "epoch": 2.4104705102717032, + "grad_norm": 6.089627742767334, + "learning_rate": 4.618621672601628e-07, + "loss": 0.0082, + "num_input_tokens_seen": 35596680, + "step": 18187 + }, + { + "epoch": 2.4106030483764083, + "grad_norm": 2.853551149368286, + "learning_rate": 4.6166115901630336e-07, + "loss": 0.0231, + "num_input_tokens_seen": 35598704, + "step": 18188 + }, + { + "epoch": 2.4107355864811133, + "grad_norm": 0.8654852509498596, + "learning_rate": 4.6146019007302367e-07, + "loss": 0.004, + "num_input_tokens_seen": 35599864, + "step": 18189 + }, + { + "epoch": 2.4108681245858183, + "grad_norm": 0.0009166355594061315, + "learning_rate": 4.6125926043419756e-07, + "loss": 0.0, + "num_input_tokens_seen": 35601096, + "step": 18190 + }, + { + "epoch": 2.4110006626905234, + "grad_norm": 0.09492700546979904, + "learning_rate": 4.610583701036997e-07, + "loss": 0.0006, + "num_input_tokens_seen": 35602792, + "step": 18191 + }, + { + "epoch": 2.411133200795229, + "grad_norm": 6.42224645614624, + "learning_rate": 4.6085751908540285e-07, + "loss": 0.0875, + "num_input_tokens_seen": 35604480, + "step": 18192 + }, + { + "epoch": 2.411265738899934, + "grad_norm": 0.0027386806905269623, + "learning_rate": 4.6065670738317914e-07, + "loss": 0.0, + "num_input_tokens_seen": 35606080, + "step": 18193 + }, + { + "epoch": 2.411398277004639, + "grad_norm": 1.2168889045715332, + "learning_rate": 4.604559350009011e-07, + "loss": 0.0063, + "num_input_tokens_seen": 35607584, + "step": 18194 + }, + { + "epoch": 2.411530815109344, + "grad_norm": 0.04797476530075073, + "learning_rate": 4.602552019424403e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35609128, + "step": 18195 + }, + { + "epoch": 2.411663353214049, + "grad_norm": 2.40864896774292, + "learning_rate": 4.6005450821166616e-07, + "loss": 0.0067, + "num_input_tokens_seen": 35611552, + "step": 18196 + }, + { + "epoch": 2.411795891318754, + "grad_norm": 5.762424945831299, + "learning_rate": 4.598538538124486e-07, + "loss": 0.0573, + "num_input_tokens_seen": 35612904, + "step": 18197 + }, + { + "epoch": 2.411928429423459, + "grad_norm": 0.02459900453686714, + "learning_rate": 4.596532387486555e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35614440, + "step": 18198 + }, + { + "epoch": 2.4120609675281646, + "grad_norm": 7.797262668609619, + "learning_rate": 4.594526630241558e-07, + "loss": 0.0256, + "num_input_tokens_seen": 35616440, + "step": 18199 + }, + { + "epoch": 2.4121935056328696, + "grad_norm": 9.702192306518555, + "learning_rate": 4.592521266428163e-07, + "loss": 0.2826, + "num_input_tokens_seen": 35619856, + "step": 18200 + }, + { + "epoch": 2.4123260437375746, + "grad_norm": 0.0009254252654500306, + "learning_rate": 4.5905162960850403e-07, + "loss": 0.0, + "num_input_tokens_seen": 35621192, + "step": 18201 + }, + { + "epoch": 2.4124585818422797, + "grad_norm": 0.16143274307250977, + "learning_rate": 4.588511719250843e-07, + "loss": 0.0011, + "num_input_tokens_seen": 35624992, + "step": 18202 + }, + { + "epoch": 2.4125911199469847, + "grad_norm": 0.5152807235717773, + "learning_rate": 4.586507535964216e-07, + "loss": 0.0019, + "num_input_tokens_seen": 35627192, + "step": 18203 + }, + { + "epoch": 2.4127236580516898, + "grad_norm": 1.4740726947784424, + "learning_rate": 4.5845037462638104e-07, + "loss": 0.0102, + "num_input_tokens_seen": 35629392, + "step": 18204 + }, + { + "epoch": 2.412856196156395, + "grad_norm": 0.008987060748040676, + "learning_rate": 4.58250035018826e-07, + "loss": 0.0, + "num_input_tokens_seen": 35630704, + "step": 18205 + }, + { + "epoch": 2.4129887342611003, + "grad_norm": 13.788135528564453, + "learning_rate": 4.580497347776178e-07, + "loss": 0.0991, + "num_input_tokens_seen": 35632728, + "step": 18206 + }, + { + "epoch": 2.4131212723658053, + "grad_norm": 6.456593990325928, + "learning_rate": 4.5784947390662027e-07, + "loss": 0.0946, + "num_input_tokens_seen": 35634696, + "step": 18207 + }, + { + "epoch": 2.4132538104705104, + "grad_norm": 0.009794460609555244, + "learning_rate": 4.5764925240969285e-07, + "loss": 0.0, + "num_input_tokens_seen": 35636832, + "step": 18208 + }, + { + "epoch": 2.4133863485752154, + "grad_norm": 0.0008709046524018049, + "learning_rate": 4.574490702906978e-07, + "loss": 0.0, + "num_input_tokens_seen": 35638800, + "step": 18209 + }, + { + "epoch": 2.4135188866799204, + "grad_norm": 7.6576247215271, + "learning_rate": 4.5724892755349333e-07, + "loss": 0.0746, + "num_input_tokens_seen": 35640784, + "step": 18210 + }, + { + "epoch": 2.4136514247846255, + "grad_norm": 4.782994747161865, + "learning_rate": 4.570488242019391e-07, + "loss": 0.0712, + "num_input_tokens_seen": 35643632, + "step": 18211 + }, + { + "epoch": 2.4137839628893305, + "grad_norm": 1.3521519899368286, + "learning_rate": 4.5684876023989187e-07, + "loss": 0.0057, + "num_input_tokens_seen": 35645496, + "step": 18212 + }, + { + "epoch": 2.413916500994036, + "grad_norm": 4.821298122406006, + "learning_rate": 4.56648735671211e-07, + "loss": 0.0156, + "num_input_tokens_seen": 35647336, + "step": 18213 + }, + { + "epoch": 2.414049039098741, + "grad_norm": 4.241542339324951, + "learning_rate": 4.564487504997514e-07, + "loss": 0.0314, + "num_input_tokens_seen": 35649984, + "step": 18214 + }, + { + "epoch": 2.414181577203446, + "grad_norm": 8.686492919921875, + "learning_rate": 4.562488047293703e-07, + "loss": 0.1723, + "num_input_tokens_seen": 35651944, + "step": 18215 + }, + { + "epoch": 2.414314115308151, + "grad_norm": 1.873984456062317, + "learning_rate": 4.560488983639222e-07, + "loss": 0.0031, + "num_input_tokens_seen": 35653344, + "step": 18216 + }, + { + "epoch": 2.414446653412856, + "grad_norm": 0.006413490977138281, + "learning_rate": 4.5584903140726083e-07, + "loss": 0.0, + "num_input_tokens_seen": 35655216, + "step": 18217 + }, + { + "epoch": 2.414579191517561, + "grad_norm": 0.0023879469372332096, + "learning_rate": 4.5564920386324066e-07, + "loss": 0.0, + "num_input_tokens_seen": 35656392, + "step": 18218 + }, + { + "epoch": 2.414711729622266, + "grad_norm": 0.013316905125975609, + "learning_rate": 4.5544941573571393e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35657976, + "step": 18219 + }, + { + "epoch": 2.4148442677269717, + "grad_norm": 0.08912398666143417, + "learning_rate": 4.552496670285333e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35660544, + "step": 18220 + }, + { + "epoch": 2.4149768058316767, + "grad_norm": 4.048020839691162, + "learning_rate": 4.550499577455497e-07, + "loss": 0.0409, + "num_input_tokens_seen": 35661784, + "step": 18221 + }, + { + "epoch": 2.4151093439363818, + "grad_norm": 0.10014895349740982, + "learning_rate": 4.5485028789061386e-07, + "loss": 0.0007, + "num_input_tokens_seen": 35664536, + "step": 18222 + }, + { + "epoch": 2.415241882041087, + "grad_norm": 2.833055019378662, + "learning_rate": 4.546506574675744e-07, + "loss": 0.0241, + "num_input_tokens_seen": 35667472, + "step": 18223 + }, + { + "epoch": 2.415374420145792, + "grad_norm": 10.043545722961426, + "learning_rate": 4.544510664802815e-07, + "loss": 0.2494, + "num_input_tokens_seen": 35669792, + "step": 18224 + }, + { + "epoch": 2.415506958250497, + "grad_norm": 0.24592944979667664, + "learning_rate": 4.5425151493258377e-07, + "loss": 0.0012, + "num_input_tokens_seen": 35672056, + "step": 18225 + }, + { + "epoch": 2.415639496355202, + "grad_norm": 8.025667190551758, + "learning_rate": 4.5405200282832795e-07, + "loss": 0.078, + "num_input_tokens_seen": 35673632, + "step": 18226 + }, + { + "epoch": 2.4157720344599074, + "grad_norm": 7.607851982116699, + "learning_rate": 4.5385253017136114e-07, + "loss": 0.0639, + "num_input_tokens_seen": 35675856, + "step": 18227 + }, + { + "epoch": 2.4159045725646124, + "grad_norm": 0.05425567552447319, + "learning_rate": 4.5365309696552925e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35677888, + "step": 18228 + }, + { + "epoch": 2.4160371106693175, + "grad_norm": 2.7542035579681396, + "learning_rate": 4.534537032146763e-07, + "loss": 0.0266, + "num_input_tokens_seen": 35681464, + "step": 18229 + }, + { + "epoch": 2.4161696487740225, + "grad_norm": 0.04528532922267914, + "learning_rate": 4.5325434892264817e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35682960, + "step": 18230 + }, + { + "epoch": 2.4163021868787276, + "grad_norm": 14.158076286315918, + "learning_rate": 4.530550340932885e-07, + "loss": 0.1303, + "num_input_tokens_seen": 35685704, + "step": 18231 + }, + { + "epoch": 2.4164347249834326, + "grad_norm": 5.1780171394348145, + "learning_rate": 4.5285575873044025e-07, + "loss": 0.0429, + "num_input_tokens_seen": 35687456, + "step": 18232 + }, + { + "epoch": 2.4165672630881376, + "grad_norm": 7.798304557800293, + "learning_rate": 4.5265652283794483e-07, + "loss": 0.0324, + "num_input_tokens_seen": 35689232, + "step": 18233 + }, + { + "epoch": 2.416699801192843, + "grad_norm": 0.0011623292230069637, + "learning_rate": 4.5245732641964384e-07, + "loss": 0.0, + "num_input_tokens_seen": 35690800, + "step": 18234 + }, + { + "epoch": 2.416832339297548, + "grad_norm": 0.032207831740379333, + "learning_rate": 4.5225816947937843e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35692952, + "step": 18235 + }, + { + "epoch": 2.416964877402253, + "grad_norm": 0.027162840589880943, + "learning_rate": 4.520590520209875e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35694680, + "step": 18236 + }, + { + "epoch": 2.4170974155069582, + "grad_norm": 11.363536834716797, + "learning_rate": 4.518599740483118e-07, + "loss": 0.0653, + "num_input_tokens_seen": 35696520, + "step": 18237 + }, + { + "epoch": 2.4172299536116633, + "grad_norm": 12.262215614318848, + "learning_rate": 4.516609355651885e-07, + "loss": 0.0791, + "num_input_tokens_seen": 35697928, + "step": 18238 + }, + { + "epoch": 2.4173624917163683, + "grad_norm": 6.310816764831543, + "learning_rate": 4.514619365754555e-07, + "loss": 0.1987, + "num_input_tokens_seen": 35699168, + "step": 18239 + }, + { + "epoch": 2.4174950298210733, + "grad_norm": 7.653509616851807, + "learning_rate": 4.512629770829488e-07, + "loss": 0.0981, + "num_input_tokens_seen": 35700944, + "step": 18240 + }, + { + "epoch": 2.417627567925779, + "grad_norm": 1.8968865871429443, + "learning_rate": 4.5106405709150536e-07, + "loss": 0.0182, + "num_input_tokens_seen": 35703224, + "step": 18241 + }, + { + "epoch": 2.417760106030484, + "grad_norm": 8.001379013061523, + "learning_rate": 4.5086517660496124e-07, + "loss": 0.0476, + "num_input_tokens_seen": 35705224, + "step": 18242 + }, + { + "epoch": 2.417892644135189, + "grad_norm": 6.293928146362305, + "learning_rate": 4.5066633562714986e-07, + "loss": 0.1107, + "num_input_tokens_seen": 35707064, + "step": 18243 + }, + { + "epoch": 2.418025182239894, + "grad_norm": 2.669416904449463, + "learning_rate": 4.504675341619055e-07, + "loss": 0.0261, + "num_input_tokens_seen": 35709544, + "step": 18244 + }, + { + "epoch": 2.418157720344599, + "grad_norm": 0.004193441942334175, + "learning_rate": 4.5026877221306025e-07, + "loss": 0.0, + "num_input_tokens_seen": 35712192, + "step": 18245 + }, + { + "epoch": 2.418290258449304, + "grad_norm": 0.31449371576309204, + "learning_rate": 4.500700497844476e-07, + "loss": 0.001, + "num_input_tokens_seen": 35714640, + "step": 18246 + }, + { + "epoch": 2.418422796554009, + "grad_norm": 0.7206229567527771, + "learning_rate": 4.4987136687989817e-07, + "loss": 0.0037, + "num_input_tokens_seen": 35716176, + "step": 18247 + }, + { + "epoch": 2.4185553346587145, + "grad_norm": 0.04445688799023628, + "learning_rate": 4.496727235032436e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35717808, + "step": 18248 + }, + { + "epoch": 2.4186878727634196, + "grad_norm": 7.169434547424316, + "learning_rate": 4.494741196583133e-07, + "loss": 0.0396, + "num_input_tokens_seen": 35719792, + "step": 18249 + }, + { + "epoch": 2.4188204108681246, + "grad_norm": 4.15717887878418, + "learning_rate": 4.4927555534893616e-07, + "loss": 0.0127, + "num_input_tokens_seen": 35722048, + "step": 18250 + }, + { + "epoch": 2.4189529489728296, + "grad_norm": 4.315068244934082, + "learning_rate": 4.490770305789413e-07, + "loss": 0.0691, + "num_input_tokens_seen": 35724544, + "step": 18251 + }, + { + "epoch": 2.4190854870775347, + "grad_norm": 10.593759536743164, + "learning_rate": 4.488785453521566e-07, + "loss": 0.1639, + "num_input_tokens_seen": 35726528, + "step": 18252 + }, + { + "epoch": 2.4192180251822397, + "grad_norm": 10.113214492797852, + "learning_rate": 4.486800996724075e-07, + "loss": 0.2198, + "num_input_tokens_seen": 35728560, + "step": 18253 + }, + { + "epoch": 2.4193505632869448, + "grad_norm": 0.004312264733016491, + "learning_rate": 4.484816935435221e-07, + "loss": 0.0, + "num_input_tokens_seen": 35729720, + "step": 18254 + }, + { + "epoch": 2.4194831013916502, + "grad_norm": 5.775733947753906, + "learning_rate": 4.4828332696932405e-07, + "loss": 0.0265, + "num_input_tokens_seen": 35731224, + "step": 18255 + }, + { + "epoch": 2.4196156394963553, + "grad_norm": 0.0003091749385930598, + "learning_rate": 4.480849999536396e-07, + "loss": 0.0, + "num_input_tokens_seen": 35732368, + "step": 18256 + }, + { + "epoch": 2.4197481776010603, + "grad_norm": 0.02270517870783806, + "learning_rate": 4.4788671250029197e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35733888, + "step": 18257 + }, + { + "epoch": 2.4198807157057654, + "grad_norm": 13.564950942993164, + "learning_rate": 4.476884646131044e-07, + "loss": 0.2249, + "num_input_tokens_seen": 35735584, + "step": 18258 + }, + { + "epoch": 2.4200132538104704, + "grad_norm": 1.9107574224472046, + "learning_rate": 4.47490256295898e-07, + "loss": 0.0089, + "num_input_tokens_seen": 35737224, + "step": 18259 + }, + { + "epoch": 2.4201457919151754, + "grad_norm": 0.5527098774909973, + "learning_rate": 4.472920875524958e-07, + "loss": 0.002, + "num_input_tokens_seen": 35739216, + "step": 18260 + }, + { + "epoch": 2.420278330019881, + "grad_norm": 0.0026816122699528933, + "learning_rate": 4.470939583867187e-07, + "loss": 0.0, + "num_input_tokens_seen": 35740584, + "step": 18261 + }, + { + "epoch": 2.420410868124586, + "grad_norm": 1.262227177619934, + "learning_rate": 4.468958688023861e-07, + "loss": 0.0054, + "num_input_tokens_seen": 35742336, + "step": 18262 + }, + { + "epoch": 2.420543406229291, + "grad_norm": 5.2343525886535645, + "learning_rate": 4.4669781880331776e-07, + "loss": 0.0578, + "num_input_tokens_seen": 35744856, + "step": 18263 + }, + { + "epoch": 2.420675944333996, + "grad_norm": 0.018396543338894844, + "learning_rate": 4.4649980839333195e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35747280, + "step": 18264 + }, + { + "epoch": 2.420808482438701, + "grad_norm": 10.031655311584473, + "learning_rate": 4.4630183757624595e-07, + "loss": 0.119, + "num_input_tokens_seen": 35749544, + "step": 18265 + }, + { + "epoch": 2.420941020543406, + "grad_norm": 0.7376269102096558, + "learning_rate": 4.4610390635587696e-07, + "loss": 0.0053, + "num_input_tokens_seen": 35751528, + "step": 18266 + }, + { + "epoch": 2.4210735586481116, + "grad_norm": 4.251041889190674, + "learning_rate": 4.459060147360425e-07, + "loss": 0.0281, + "num_input_tokens_seen": 35753528, + "step": 18267 + }, + { + "epoch": 2.4212060967528166, + "grad_norm": 0.01935018040239811, + "learning_rate": 4.45708162720557e-07, + "loss": 0.0, + "num_input_tokens_seen": 35754856, + "step": 18268 + }, + { + "epoch": 2.4213386348575217, + "grad_norm": 15.803794860839844, + "learning_rate": 4.455103503132352e-07, + "loss": 0.1105, + "num_input_tokens_seen": 35756904, + "step": 18269 + }, + { + "epoch": 2.4214711729622267, + "grad_norm": 0.007716186810284853, + "learning_rate": 4.453125775178907e-07, + "loss": 0.0, + "num_input_tokens_seen": 35759584, + "step": 18270 + }, + { + "epoch": 2.4216037110669317, + "grad_norm": 10.491011619567871, + "learning_rate": 4.4511484433833714e-07, + "loss": 0.0091, + "num_input_tokens_seen": 35761864, + "step": 18271 + }, + { + "epoch": 2.4217362491716368, + "grad_norm": 5.620283603668213, + "learning_rate": 4.449171507783875e-07, + "loss": 0.0522, + "num_input_tokens_seen": 35763880, + "step": 18272 + }, + { + "epoch": 2.421868787276342, + "grad_norm": 11.039116859436035, + "learning_rate": 4.447194968418531e-07, + "loss": 0.1958, + "num_input_tokens_seen": 35765944, + "step": 18273 + }, + { + "epoch": 2.4220013253810473, + "grad_norm": 5.693419456481934, + "learning_rate": 4.445218825325445e-07, + "loss": 0.0275, + "num_input_tokens_seen": 35767992, + "step": 18274 + }, + { + "epoch": 2.4221338634857523, + "grad_norm": 0.0010750835062935948, + "learning_rate": 4.443243078542722e-07, + "loss": 0.0, + "num_input_tokens_seen": 35769080, + "step": 18275 + }, + { + "epoch": 2.4222664015904574, + "grad_norm": 0.7256829142570496, + "learning_rate": 4.441267728108445e-07, + "loss": 0.0045, + "num_input_tokens_seen": 35770760, + "step": 18276 + }, + { + "epoch": 2.4223989396951624, + "grad_norm": 4.712086200714111, + "learning_rate": 4.4392927740607104e-07, + "loss": 0.0445, + "num_input_tokens_seen": 35772648, + "step": 18277 + }, + { + "epoch": 2.4225314777998674, + "grad_norm": 0.4947657883167267, + "learning_rate": 4.437318216437603e-07, + "loss": 0.0032, + "num_input_tokens_seen": 35775304, + "step": 18278 + }, + { + "epoch": 2.4226640159045725, + "grad_norm": 6.993027210235596, + "learning_rate": 4.4353440552771825e-07, + "loss": 0.0742, + "num_input_tokens_seen": 35777192, + "step": 18279 + }, + { + "epoch": 2.4227965540092775, + "grad_norm": 0.017004508525133133, + "learning_rate": 4.433370290617517e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35779096, + "step": 18280 + }, + { + "epoch": 2.422929092113983, + "grad_norm": 0.013942967168986797, + "learning_rate": 4.431396922496656e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35781000, + "step": 18281 + }, + { + "epoch": 2.423061630218688, + "grad_norm": 5.452723026275635, + "learning_rate": 4.429423950952655e-07, + "loss": 0.0702, + "num_input_tokens_seen": 35783872, + "step": 18282 + }, + { + "epoch": 2.423194168323393, + "grad_norm": 0.02666877768933773, + "learning_rate": 4.427451376023548e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35785016, + "step": 18283 + }, + { + "epoch": 2.423326706428098, + "grad_norm": 10.672096252441406, + "learning_rate": 4.4254791977473733e-07, + "loss": 0.1691, + "num_input_tokens_seen": 35787400, + "step": 18284 + }, + { + "epoch": 2.423459244532803, + "grad_norm": 4.464118957519531, + "learning_rate": 4.4235074161621565e-07, + "loss": 0.0414, + "num_input_tokens_seen": 35789800, + "step": 18285 + }, + { + "epoch": 2.423591782637508, + "grad_norm": 0.20853915810585022, + "learning_rate": 4.4215360313059037e-07, + "loss": 0.001, + "num_input_tokens_seen": 35791264, + "step": 18286 + }, + { + "epoch": 2.4237243207422132, + "grad_norm": 4.23888635635376, + "learning_rate": 4.4195650432166393e-07, + "loss": 0.0892, + "num_input_tokens_seen": 35792424, + "step": 18287 + }, + { + "epoch": 2.4238568588469187, + "grad_norm": 0.01026932429522276, + "learning_rate": 4.417594451932361e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35794080, + "step": 18288 + }, + { + "epoch": 2.4239893969516237, + "grad_norm": 9.736784934997559, + "learning_rate": 4.4156242574910524e-07, + "loss": 0.0544, + "num_input_tokens_seen": 35796904, + "step": 18289 + }, + { + "epoch": 2.424121935056329, + "grad_norm": 13.2636079788208, + "learning_rate": 4.4136544599307136e-07, + "loss": 0.1242, + "num_input_tokens_seen": 35798896, + "step": 18290 + }, + { + "epoch": 2.424254473161034, + "grad_norm": 20.14436149597168, + "learning_rate": 4.411685059289314e-07, + "loss": 0.2413, + "num_input_tokens_seen": 35800216, + "step": 18291 + }, + { + "epoch": 2.424387011265739, + "grad_norm": 3.798980236053467, + "learning_rate": 4.4097160556048346e-07, + "loss": 0.0328, + "num_input_tokens_seen": 35803096, + "step": 18292 + }, + { + "epoch": 2.424519549370444, + "grad_norm": 3.8704378604888916, + "learning_rate": 4.407747448915234e-07, + "loss": 0.0206, + "num_input_tokens_seen": 35804936, + "step": 18293 + }, + { + "epoch": 2.424652087475149, + "grad_norm": 2.4681756496429443, + "learning_rate": 4.405779239258465e-07, + "loss": 0.008, + "num_input_tokens_seen": 35806472, + "step": 18294 + }, + { + "epoch": 2.4247846255798544, + "grad_norm": 0.006476826965808868, + "learning_rate": 4.403811426672483e-07, + "loss": 0.0, + "num_input_tokens_seen": 35809216, + "step": 18295 + }, + { + "epoch": 2.4249171636845595, + "grad_norm": 1.637824535369873, + "learning_rate": 4.4018440111952195e-07, + "loss": 0.0072, + "num_input_tokens_seen": 35811184, + "step": 18296 + }, + { + "epoch": 2.4250497017892645, + "grad_norm": 15.572833061218262, + "learning_rate": 4.3998769928646186e-07, + "loss": 0.2156, + "num_input_tokens_seen": 35813072, + "step": 18297 + }, + { + "epoch": 2.4251822398939695, + "grad_norm": 8.276287078857422, + "learning_rate": 4.397910371718603e-07, + "loss": 0.1311, + "num_input_tokens_seen": 35815368, + "step": 18298 + }, + { + "epoch": 2.4253147779986746, + "grad_norm": 0.07721289247274399, + "learning_rate": 4.3959441477950865e-07, + "loss": 0.0004, + "num_input_tokens_seen": 35816976, + "step": 18299 + }, + { + "epoch": 2.4254473161033796, + "grad_norm": 0.024493522942066193, + "learning_rate": 4.3939783211319754e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35818896, + "step": 18300 + }, + { + "epoch": 2.4255798542080846, + "grad_norm": 0.000995882204733789, + "learning_rate": 4.392012891767183e-07, + "loss": 0.0, + "num_input_tokens_seen": 35820344, + "step": 18301 + }, + { + "epoch": 2.42571239231279, + "grad_norm": 17.389694213867188, + "learning_rate": 4.3900478597385945e-07, + "loss": 0.1847, + "num_input_tokens_seen": 35822272, + "step": 18302 + }, + { + "epoch": 2.425844930417495, + "grad_norm": 0.07688923180103302, + "learning_rate": 4.3880832250841056e-07, + "loss": 0.0005, + "num_input_tokens_seen": 35823608, + "step": 18303 + }, + { + "epoch": 2.4259774685222, + "grad_norm": 0.0225998405367136, + "learning_rate": 4.386118987841592e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35824928, + "step": 18304 + }, + { + "epoch": 2.4261100066269052, + "grad_norm": 5.627675533294678, + "learning_rate": 4.3841551480489243e-07, + "loss": 0.1071, + "num_input_tokens_seen": 35827832, + "step": 18305 + }, + { + "epoch": 2.4262425447316103, + "grad_norm": 2.758574962615967, + "learning_rate": 4.382191705743963e-07, + "loss": 0.0154, + "num_input_tokens_seen": 35831160, + "step": 18306 + }, + { + "epoch": 2.4263750828363153, + "grad_norm": 4.219793796539307, + "learning_rate": 4.3802286609645696e-07, + "loss": 0.0689, + "num_input_tokens_seen": 35833424, + "step": 18307 + }, + { + "epoch": 2.4265076209410203, + "grad_norm": 0.21684300899505615, + "learning_rate": 4.3782660137485973e-07, + "loss": 0.0008, + "num_input_tokens_seen": 35835248, + "step": 18308 + }, + { + "epoch": 2.426640159045726, + "grad_norm": 0.025491613894701004, + "learning_rate": 4.3763037641338826e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35837008, + "step": 18309 + }, + { + "epoch": 2.426772697150431, + "grad_norm": 0.019090726971626282, + "learning_rate": 4.374341912158259e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35839072, + "step": 18310 + }, + { + "epoch": 2.426905235255136, + "grad_norm": 0.0326387956738472, + "learning_rate": 4.372380457859551e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35840456, + "step": 18311 + }, + { + "epoch": 2.427037773359841, + "grad_norm": 3.0513579845428467, + "learning_rate": 4.370419401275572e-07, + "loss": 0.0232, + "num_input_tokens_seen": 35842312, + "step": 18312 + }, + { + "epoch": 2.427170311464546, + "grad_norm": 0.13179492950439453, + "learning_rate": 4.36845874244414e-07, + "loss": 0.0006, + "num_input_tokens_seen": 35843760, + "step": 18313 + }, + { + "epoch": 2.427302849569251, + "grad_norm": 19.33431625366211, + "learning_rate": 4.3664984814030615e-07, + "loss": 0.1414, + "num_input_tokens_seen": 35846360, + "step": 18314 + }, + { + "epoch": 2.427435387673956, + "grad_norm": 3.8488097190856934, + "learning_rate": 4.3645386181901283e-07, + "loss": 0.0325, + "num_input_tokens_seen": 35848008, + "step": 18315 + }, + { + "epoch": 2.4275679257786615, + "grad_norm": 9.158230781555176, + "learning_rate": 4.3625791528431246e-07, + "loss": 0.099, + "num_input_tokens_seen": 35850312, + "step": 18316 + }, + { + "epoch": 2.4277004638833666, + "grad_norm": 15.860053062438965, + "learning_rate": 4.360620085399825e-07, + "loss": 0.2384, + "num_input_tokens_seen": 35852096, + "step": 18317 + }, + { + "epoch": 2.4278330019880716, + "grad_norm": 12.958687782287598, + "learning_rate": 4.358661415898016e-07, + "loss": 0.2383, + "num_input_tokens_seen": 35854416, + "step": 18318 + }, + { + "epoch": 2.4279655400927767, + "grad_norm": 0.011256652884185314, + "learning_rate": 4.356703144375449e-07, + "loss": 0.0, + "num_input_tokens_seen": 35855864, + "step": 18319 + }, + { + "epoch": 2.4280980781974817, + "grad_norm": 0.14236581325531006, + "learning_rate": 4.354745270869892e-07, + "loss": 0.0006, + "num_input_tokens_seen": 35857712, + "step": 18320 + }, + { + "epoch": 2.4282306163021867, + "grad_norm": 3.0555832386016846, + "learning_rate": 4.352787795419089e-07, + "loss": 0.0206, + "num_input_tokens_seen": 35859368, + "step": 18321 + }, + { + "epoch": 2.4283631544068918, + "grad_norm": 0.11765335500240326, + "learning_rate": 4.350830718060775e-07, + "loss": 0.0008, + "num_input_tokens_seen": 35861432, + "step": 18322 + }, + { + "epoch": 2.4284956925115972, + "grad_norm": 8.695486068725586, + "learning_rate": 4.3488740388326955e-07, + "loss": 0.0476, + "num_input_tokens_seen": 35862912, + "step": 18323 + }, + { + "epoch": 2.4286282306163023, + "grad_norm": 0.036051925271749496, + "learning_rate": 4.3469177577725645e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35865032, + "step": 18324 + }, + { + "epoch": 2.4287607687210073, + "grad_norm": 3.1093356609344482, + "learning_rate": 4.3449618749181133e-07, + "loss": 0.0231, + "num_input_tokens_seen": 35866704, + "step": 18325 + }, + { + "epoch": 2.4288933068257124, + "grad_norm": 9.437972068786621, + "learning_rate": 4.3430063903070474e-07, + "loss": 0.2568, + "num_input_tokens_seen": 35869376, + "step": 18326 + }, + { + "epoch": 2.4290258449304174, + "grad_norm": 0.008098899386823177, + "learning_rate": 4.341051303977067e-07, + "loss": 0.0, + "num_input_tokens_seen": 35871048, + "step": 18327 + }, + { + "epoch": 2.4291583830351224, + "grad_norm": 9.822820663452148, + "learning_rate": 4.339096615965865e-07, + "loss": 0.0792, + "num_input_tokens_seen": 35872800, + "step": 18328 + }, + { + "epoch": 2.4292909211398275, + "grad_norm": 0.002726078499108553, + "learning_rate": 4.337142326311139e-07, + "loss": 0.0, + "num_input_tokens_seen": 35874592, + "step": 18329 + }, + { + "epoch": 2.429423459244533, + "grad_norm": 0.0192753616720438, + "learning_rate": 4.3351884350505554e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35876048, + "step": 18330 + }, + { + "epoch": 2.429555997349238, + "grad_norm": 1.172118067741394, + "learning_rate": 4.3332349422218033e-07, + "loss": 0.0048, + "num_input_tokens_seen": 35877632, + "step": 18331 + }, + { + "epoch": 2.429688535453943, + "grad_norm": 0.977995753288269, + "learning_rate": 4.3312818478625367e-07, + "loss": 0.0024, + "num_input_tokens_seen": 35879848, + "step": 18332 + }, + { + "epoch": 2.429821073558648, + "grad_norm": 0.13878194987773895, + "learning_rate": 4.3293291520104114e-07, + "loss": 0.0007, + "num_input_tokens_seen": 35881480, + "step": 18333 + }, + { + "epoch": 2.429953611663353, + "grad_norm": 8.973982810974121, + "learning_rate": 4.327376854703086e-07, + "loss": 0.2234, + "num_input_tokens_seen": 35883560, + "step": 18334 + }, + { + "epoch": 2.430086149768058, + "grad_norm": 0.0032459511421620846, + "learning_rate": 4.325424955978194e-07, + "loss": 0.0, + "num_input_tokens_seen": 35885392, + "step": 18335 + }, + { + "epoch": 2.430218687872763, + "grad_norm": 0.0009680012008175254, + "learning_rate": 4.3234734558733644e-07, + "loss": 0.0, + "num_input_tokens_seen": 35886544, + "step": 18336 + }, + { + "epoch": 2.4303512259774687, + "grad_norm": 0.025460895150899887, + "learning_rate": 4.3215223544262387e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35888768, + "step": 18337 + }, + { + "epoch": 2.4304837640821737, + "grad_norm": 0.04833018407225609, + "learning_rate": 4.319571651674423e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35890696, + "step": 18338 + }, + { + "epoch": 2.4306163021868787, + "grad_norm": 9.924674034118652, + "learning_rate": 4.3176213476555347e-07, + "loss": 0.1204, + "num_input_tokens_seen": 35893008, + "step": 18339 + }, + { + "epoch": 2.430748840291584, + "grad_norm": 5.924070358276367, + "learning_rate": 4.3156714424071767e-07, + "loss": 0.0644, + "num_input_tokens_seen": 35894544, + "step": 18340 + }, + { + "epoch": 2.430881378396289, + "grad_norm": 0.02826007455587387, + "learning_rate": 4.313721935966944e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35896200, + "step": 18341 + }, + { + "epoch": 2.431013916500994, + "grad_norm": 3.287541389465332, + "learning_rate": 4.311772828372415e-07, + "loss": 0.0573, + "num_input_tokens_seen": 35898416, + "step": 18342 + }, + { + "epoch": 2.431146454605699, + "grad_norm": 5.100732803344727, + "learning_rate": 4.3098241196611797e-07, + "loss": 0.0261, + "num_input_tokens_seen": 35900336, + "step": 18343 + }, + { + "epoch": 2.4312789927104044, + "grad_norm": 12.204575538635254, + "learning_rate": 4.307875809870815e-07, + "loss": 0.2029, + "num_input_tokens_seen": 35903184, + "step": 18344 + }, + { + "epoch": 2.4314115308151094, + "grad_norm": 8.511176109313965, + "learning_rate": 4.305927899038878e-07, + "loss": 0.135, + "num_input_tokens_seen": 35905824, + "step": 18345 + }, + { + "epoch": 2.4315440689198144, + "grad_norm": 0.020928053185343742, + "learning_rate": 4.3039803872029275e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35908008, + "step": 18346 + }, + { + "epoch": 2.4316766070245195, + "grad_norm": 6.845074653625488, + "learning_rate": 4.302033274400505e-07, + "loss": 0.0719, + "num_input_tokens_seen": 35910456, + "step": 18347 + }, + { + "epoch": 2.4318091451292245, + "grad_norm": 10.301386833190918, + "learning_rate": 4.3000865606691615e-07, + "loss": 0.086, + "num_input_tokens_seen": 35912048, + "step": 18348 + }, + { + "epoch": 2.4319416832339296, + "grad_norm": 6.19709587097168, + "learning_rate": 4.2981402460464366e-07, + "loss": 0.1242, + "num_input_tokens_seen": 35914104, + "step": 18349 + }, + { + "epoch": 2.432074221338635, + "grad_norm": 0.03213613107800484, + "learning_rate": 4.2961943305698474e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35916600, + "step": 18350 + }, + { + "epoch": 2.43220675944334, + "grad_norm": 3.0651750564575195, + "learning_rate": 4.294248814276916e-07, + "loss": 0.0517, + "num_input_tokens_seen": 35919144, + "step": 18351 + }, + { + "epoch": 2.432339297548045, + "grad_norm": 0.013049193657934666, + "learning_rate": 4.2923036972051497e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35920440, + "step": 18352 + }, + { + "epoch": 2.43247183565275, + "grad_norm": 0.011416442692279816, + "learning_rate": 4.290358979392048e-07, + "loss": 0.0, + "num_input_tokens_seen": 35921760, + "step": 18353 + }, + { + "epoch": 2.432604373757455, + "grad_norm": 0.5775468349456787, + "learning_rate": 4.288414660875112e-07, + "loss": 0.0022, + "num_input_tokens_seen": 35923448, + "step": 18354 + }, + { + "epoch": 2.4327369118621602, + "grad_norm": 9.361068725585938, + "learning_rate": 4.2864707416918365e-07, + "loss": 0.1805, + "num_input_tokens_seen": 35925120, + "step": 18355 + }, + { + "epoch": 2.4328694499668657, + "grad_norm": 6.171670913696289, + "learning_rate": 4.2845272218796945e-07, + "loss": 0.0928, + "num_input_tokens_seen": 35926624, + "step": 18356 + }, + { + "epoch": 2.4330019880715708, + "grad_norm": 0.07428599148988724, + "learning_rate": 4.282584101476159e-07, + "loss": 0.0007, + "num_input_tokens_seen": 35927768, + "step": 18357 + }, + { + "epoch": 2.433134526176276, + "grad_norm": 2.862386703491211, + "learning_rate": 4.280641380518691e-07, + "loss": 0.022, + "num_input_tokens_seen": 35929416, + "step": 18358 + }, + { + "epoch": 2.433267064280981, + "grad_norm": 0.2997550964355469, + "learning_rate": 4.278699059044747e-07, + "loss": 0.0007, + "num_input_tokens_seen": 35931384, + "step": 18359 + }, + { + "epoch": 2.433399602385686, + "grad_norm": 9.919206619262695, + "learning_rate": 4.2767571370917806e-07, + "loss": 0.0693, + "num_input_tokens_seen": 35933312, + "step": 18360 + }, + { + "epoch": 2.433532140490391, + "grad_norm": 3.051115036010742, + "learning_rate": 4.2748156146972393e-07, + "loss": 0.008, + "num_input_tokens_seen": 35935224, + "step": 18361 + }, + { + "epoch": 2.433664678595096, + "grad_norm": 2.03763747215271, + "learning_rate": 4.272874491898549e-07, + "loss": 0.0096, + "num_input_tokens_seen": 35937176, + "step": 18362 + }, + { + "epoch": 2.4337972166998014, + "grad_norm": 4.422829627990723, + "learning_rate": 4.2709337687331374e-07, + "loss": 0.0251, + "num_input_tokens_seen": 35938624, + "step": 18363 + }, + { + "epoch": 2.4339297548045065, + "grad_norm": 1.6534972190856934, + "learning_rate": 4.2689934452384175e-07, + "loss": 0.0065, + "num_input_tokens_seen": 35940600, + "step": 18364 + }, + { + "epoch": 2.4340622929092115, + "grad_norm": 2.9853992462158203, + "learning_rate": 4.267053521451811e-07, + "loss": 0.0199, + "num_input_tokens_seen": 35942560, + "step": 18365 + }, + { + "epoch": 2.4341948310139165, + "grad_norm": 0.008861691690981388, + "learning_rate": 4.265113997410711e-07, + "loss": 0.0, + "num_input_tokens_seen": 35944304, + "step": 18366 + }, + { + "epoch": 2.4343273691186216, + "grad_norm": 0.00162942532915622, + "learning_rate": 4.263174873152523e-07, + "loss": 0.0, + "num_input_tokens_seen": 35945384, + "step": 18367 + }, + { + "epoch": 2.4344599072233266, + "grad_norm": 0.04876639321446419, + "learning_rate": 4.2612361487146285e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35946960, + "step": 18368 + }, + { + "epoch": 2.4345924453280317, + "grad_norm": 0.06672238558530807, + "learning_rate": 4.2592978241344007e-07, + "loss": 0.0003, + "num_input_tokens_seen": 35949232, + "step": 18369 + }, + { + "epoch": 2.434724983432737, + "grad_norm": 11.845396041870117, + "learning_rate": 4.257359899449226e-07, + "loss": 0.1805, + "num_input_tokens_seen": 35950680, + "step": 18370 + }, + { + "epoch": 2.434857521537442, + "grad_norm": 0.010267109610140324, + "learning_rate": 4.2554223746964636e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35951856, + "step": 18371 + }, + { + "epoch": 2.434990059642147, + "grad_norm": 2.477538585662842, + "learning_rate": 4.253485249913461e-07, + "loss": 0.0191, + "num_input_tokens_seen": 35954240, + "step": 18372 + }, + { + "epoch": 2.4351225977468522, + "grad_norm": 0.8585048913955688, + "learning_rate": 4.251548525137583e-07, + "loss": 0.0021, + "num_input_tokens_seen": 35956248, + "step": 18373 + }, + { + "epoch": 2.4352551358515573, + "grad_norm": 0.0016985032707452774, + "learning_rate": 4.249612200406156e-07, + "loss": 0.0, + "num_input_tokens_seen": 35957800, + "step": 18374 + }, + { + "epoch": 2.4353876739562623, + "grad_norm": 2.598919153213501, + "learning_rate": 4.247676275756527e-07, + "loss": 0.027, + "num_input_tokens_seen": 35959568, + "step": 18375 + }, + { + "epoch": 2.4355202120609674, + "grad_norm": 6.376715183258057, + "learning_rate": 4.245740751226013e-07, + "loss": 0.0614, + "num_input_tokens_seen": 35962072, + "step": 18376 + }, + { + "epoch": 2.435652750165673, + "grad_norm": 6.837149620056152, + "learning_rate": 4.2438056268519324e-07, + "loss": 0.0519, + "num_input_tokens_seen": 35963872, + "step": 18377 + }, + { + "epoch": 2.435785288270378, + "grad_norm": 0.05405694991350174, + "learning_rate": 4.241870902671602e-07, + "loss": 0.0002, + "num_input_tokens_seen": 35965192, + "step": 18378 + }, + { + "epoch": 2.435917826375083, + "grad_norm": 0.7467674016952515, + "learning_rate": 4.2399365787223176e-07, + "loss": 0.0052, + "num_input_tokens_seen": 35967488, + "step": 18379 + }, + { + "epoch": 2.436050364479788, + "grad_norm": 0.015552084892988205, + "learning_rate": 4.2380026550413816e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35970584, + "step": 18380 + }, + { + "epoch": 2.436182902584493, + "grad_norm": 0.6305965781211853, + "learning_rate": 4.236069131666076e-07, + "loss": 0.0039, + "num_input_tokens_seen": 35972416, + "step": 18381 + }, + { + "epoch": 2.436315440689198, + "grad_norm": 11.510557174682617, + "learning_rate": 4.2341360086336855e-07, + "loss": 0.0937, + "num_input_tokens_seen": 35974368, + "step": 18382 + }, + { + "epoch": 2.436447978793903, + "grad_norm": 1.6999883651733398, + "learning_rate": 4.2322032859814706e-07, + "loss": 0.0154, + "num_input_tokens_seen": 35976912, + "step": 18383 + }, + { + "epoch": 2.4365805168986086, + "grad_norm": 1.2743343114852905, + "learning_rate": 4.230270963746708e-07, + "loss": 0.0053, + "num_input_tokens_seen": 35978344, + "step": 18384 + }, + { + "epoch": 2.4367130550033136, + "grad_norm": 0.001416211947798729, + "learning_rate": 4.228339041966645e-07, + "loss": 0.0, + "num_input_tokens_seen": 35979640, + "step": 18385 + }, + { + "epoch": 2.4368455931080186, + "grad_norm": 4.7986884117126465, + "learning_rate": 4.226407520678541e-07, + "loss": 0.0395, + "num_input_tokens_seen": 35982032, + "step": 18386 + }, + { + "epoch": 2.4369781312127237, + "grad_norm": 6.503503322601318, + "learning_rate": 4.22447639991963e-07, + "loss": 0.0687, + "num_input_tokens_seen": 35984664, + "step": 18387 + }, + { + "epoch": 2.4371106693174287, + "grad_norm": 7.361541271209717, + "learning_rate": 4.222545679727147e-07, + "loss": 0.057, + "num_input_tokens_seen": 35987088, + "step": 18388 + }, + { + "epoch": 2.4372432074221337, + "grad_norm": 3.8410732746124268, + "learning_rate": 4.220615360138311e-07, + "loss": 0.0299, + "num_input_tokens_seen": 35989728, + "step": 18389 + }, + { + "epoch": 2.4373757455268388, + "grad_norm": 7.192863464355469, + "learning_rate": 4.2186854411903456e-07, + "loss": 0.1564, + "num_input_tokens_seen": 35991544, + "step": 18390 + }, + { + "epoch": 2.4375082836315443, + "grad_norm": 9.349955558776855, + "learning_rate": 4.216755922920468e-07, + "loss": 0.1583, + "num_input_tokens_seen": 35993776, + "step": 18391 + }, + { + "epoch": 2.4376408217362493, + "grad_norm": 7.483748912811279, + "learning_rate": 4.214826805365871e-07, + "loss": 0.1314, + "num_input_tokens_seen": 35995232, + "step": 18392 + }, + { + "epoch": 2.4377733598409543, + "grad_norm": 8.77556324005127, + "learning_rate": 4.212898088563755e-07, + "loss": 0.2755, + "num_input_tokens_seen": 35997256, + "step": 18393 + }, + { + "epoch": 2.4379058979456594, + "grad_norm": 0.030269550159573555, + "learning_rate": 4.210969772551304e-07, + "loss": 0.0001, + "num_input_tokens_seen": 35999112, + "step": 18394 + }, + { + "epoch": 2.4380384360503644, + "grad_norm": 3.167874336242676, + "learning_rate": 4.2090418573656895e-07, + "loss": 0.0095, + "num_input_tokens_seen": 36000792, + "step": 18395 + }, + { + "epoch": 2.4381709741550694, + "grad_norm": 0.1536400020122528, + "learning_rate": 4.2071143430440943e-07, + "loss": 0.0005, + "num_input_tokens_seen": 36002392, + "step": 18396 + }, + { + "epoch": 2.4383035122597745, + "grad_norm": 5.353066921234131, + "learning_rate": 4.2051872296236806e-07, + "loss": 0.0901, + "num_input_tokens_seen": 36004824, + "step": 18397 + }, + { + "epoch": 2.43843605036448, + "grad_norm": 0.0030064054299145937, + "learning_rate": 4.203260517141605e-07, + "loss": 0.0, + "num_input_tokens_seen": 36007048, + "step": 18398 + }, + { + "epoch": 2.438568588469185, + "grad_norm": 1.3286194801330566, + "learning_rate": 4.2013342056350133e-07, + "loss": 0.0103, + "num_input_tokens_seen": 36009632, + "step": 18399 + }, + { + "epoch": 2.43870112657389, + "grad_norm": 0.013480495661497116, + "learning_rate": 4.1994082951410414e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36010872, + "step": 18400 + }, + { + "epoch": 2.438833664678595, + "grad_norm": 0.032784976065158844, + "learning_rate": 4.197482785696827e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36012424, + "step": 18401 + }, + { + "epoch": 2.4389662027833, + "grad_norm": 0.007632888853549957, + "learning_rate": 4.1955576773394984e-07, + "loss": 0.0, + "num_input_tokens_seen": 36014024, + "step": 18402 + }, + { + "epoch": 2.439098740888005, + "grad_norm": 0.0037206937558948994, + "learning_rate": 4.1936329701061736e-07, + "loss": 0.0, + "num_input_tokens_seen": 36015680, + "step": 18403 + }, + { + "epoch": 2.43923127899271, + "grad_norm": 1.278149127960205, + "learning_rate": 4.191708664033958e-07, + "loss": 0.0039, + "num_input_tokens_seen": 36017408, + "step": 18404 + }, + { + "epoch": 2.4393638170974157, + "grad_norm": 14.313703536987305, + "learning_rate": 4.1897847591599454e-07, + "loss": 0.0999, + "num_input_tokens_seen": 36018712, + "step": 18405 + }, + { + "epoch": 2.4394963552021207, + "grad_norm": 0.04546213895082474, + "learning_rate": 4.18786125552125e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36020288, + "step": 18406 + }, + { + "epoch": 2.4396288933068258, + "grad_norm": 0.07428284734487534, + "learning_rate": 4.1859381531549363e-07, + "loss": 0.0003, + "num_input_tokens_seen": 36021760, + "step": 18407 + }, + { + "epoch": 2.439761431411531, + "grad_norm": 5.490110397338867, + "learning_rate": 4.184015452098103e-07, + "loss": 0.0182, + "num_input_tokens_seen": 36024528, + "step": 18408 + }, + { + "epoch": 2.439893969516236, + "grad_norm": 4.9820427894592285, + "learning_rate": 4.1820931523878117e-07, + "loss": 0.0251, + "num_input_tokens_seen": 36026720, + "step": 18409 + }, + { + "epoch": 2.440026507620941, + "grad_norm": 15.380413055419922, + "learning_rate": 4.180171254061119e-07, + "loss": 0.1256, + "num_input_tokens_seen": 36029040, + "step": 18410 + }, + { + "epoch": 2.440159045725646, + "grad_norm": 3.798063039779663, + "learning_rate": 4.1782497571550923e-07, + "loss": 0.0145, + "num_input_tokens_seen": 36030840, + "step": 18411 + }, + { + "epoch": 2.4402915838303514, + "grad_norm": 1.2348486185073853, + "learning_rate": 4.1763286617067744e-07, + "loss": 0.0037, + "num_input_tokens_seen": 36033104, + "step": 18412 + }, + { + "epoch": 2.4404241219350564, + "grad_norm": 0.008932110853493214, + "learning_rate": 4.174407967753202e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36035816, + "step": 18413 + }, + { + "epoch": 2.4405566600397615, + "grad_norm": 0.05620727688074112, + "learning_rate": 4.172487675331416e-07, + "loss": 0.0004, + "num_input_tokens_seen": 36038128, + "step": 18414 + }, + { + "epoch": 2.4406891981444665, + "grad_norm": 9.432465553283691, + "learning_rate": 4.1705677844784354e-07, + "loss": 0.1278, + "num_input_tokens_seen": 36039272, + "step": 18415 + }, + { + "epoch": 2.4408217362491715, + "grad_norm": 0.013605829328298569, + "learning_rate": 4.16864829523127e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36040352, + "step": 18416 + }, + { + "epoch": 2.4409542743538766, + "grad_norm": 0.07038331031799316, + "learning_rate": 4.1667292076269434e-07, + "loss": 0.0003, + "num_input_tokens_seen": 36041976, + "step": 18417 + }, + { + "epoch": 2.4410868124585816, + "grad_norm": 2.9397194385528564, + "learning_rate": 4.1648105217024503e-07, + "loss": 0.0096, + "num_input_tokens_seen": 36043920, + "step": 18418 + }, + { + "epoch": 2.441219350563287, + "grad_norm": 0.051928140223026276, + "learning_rate": 4.1628922374947785e-07, + "loss": 0.0006, + "num_input_tokens_seen": 36046032, + "step": 18419 + }, + { + "epoch": 2.441351888667992, + "grad_norm": 0.5381191968917847, + "learning_rate": 4.1609743550409227e-07, + "loss": 0.0029, + "num_input_tokens_seen": 36048112, + "step": 18420 + }, + { + "epoch": 2.441484426772697, + "grad_norm": 0.03849567472934723, + "learning_rate": 4.159056874377854e-07, + "loss": 0.0004, + "num_input_tokens_seen": 36049824, + "step": 18421 + }, + { + "epoch": 2.441616964877402, + "grad_norm": 5.573018550872803, + "learning_rate": 4.1571397955425505e-07, + "loss": 0.1024, + "num_input_tokens_seen": 36051600, + "step": 18422 + }, + { + "epoch": 2.4417495029821072, + "grad_norm": 0.04648677259683609, + "learning_rate": 4.1552231185719715e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36053512, + "step": 18423 + }, + { + "epoch": 2.4418820410868123, + "grad_norm": 1.9818496704101562, + "learning_rate": 4.153306843503063e-07, + "loss": 0.0134, + "num_input_tokens_seen": 36054912, + "step": 18424 + }, + { + "epoch": 2.4420145791915173, + "grad_norm": 4.1605987548828125, + "learning_rate": 4.15139097037279e-07, + "loss": 0.1069, + "num_input_tokens_seen": 36057312, + "step": 18425 + }, + { + "epoch": 2.442147117296223, + "grad_norm": 0.0053329914808273315, + "learning_rate": 4.1494754992180727e-07, + "loss": 0.0, + "num_input_tokens_seen": 36058920, + "step": 18426 + }, + { + "epoch": 2.442279655400928, + "grad_norm": 0.528232753276825, + "learning_rate": 4.1475604300758585e-07, + "loss": 0.0016, + "num_input_tokens_seen": 36061048, + "step": 18427 + }, + { + "epoch": 2.442412193505633, + "grad_norm": 2.681910991668701, + "learning_rate": 4.1456457629830637e-07, + "loss": 0.0113, + "num_input_tokens_seen": 36062864, + "step": 18428 + }, + { + "epoch": 2.442544731610338, + "grad_norm": 3.688399314880371, + "learning_rate": 4.143731497976605e-07, + "loss": 0.0266, + "num_input_tokens_seen": 36065232, + "step": 18429 + }, + { + "epoch": 2.442677269715043, + "grad_norm": 3.1692559719085693, + "learning_rate": 4.141817635093387e-07, + "loss": 0.0199, + "num_input_tokens_seen": 36067208, + "step": 18430 + }, + { + "epoch": 2.442809807819748, + "grad_norm": 3.5751874446868896, + "learning_rate": 4.139904174370313e-07, + "loss": 0.0294, + "num_input_tokens_seen": 36069296, + "step": 18431 + }, + { + "epoch": 2.4429423459244535, + "grad_norm": 12.03130054473877, + "learning_rate": 4.1379911158442835e-07, + "loss": 0.0883, + "num_input_tokens_seen": 36071368, + "step": 18432 + }, + { + "epoch": 2.4430748840291585, + "grad_norm": 0.045436322689056396, + "learning_rate": 4.1360784595521756e-07, + "loss": 0.0003, + "num_input_tokens_seen": 36072624, + "step": 18433 + }, + { + "epoch": 2.4432074221338635, + "grad_norm": 1.3194892406463623, + "learning_rate": 4.134166205530871e-07, + "loss": 0.0031, + "num_input_tokens_seen": 36075000, + "step": 18434 + }, + { + "epoch": 2.4433399602385686, + "grad_norm": 3.343655586242676, + "learning_rate": 4.132254353817236e-07, + "loss": 0.0219, + "num_input_tokens_seen": 36076864, + "step": 18435 + }, + { + "epoch": 2.4434724983432736, + "grad_norm": 4.3004150390625, + "learning_rate": 4.1303429044481256e-07, + "loss": 0.0406, + "num_input_tokens_seen": 36078952, + "step": 18436 + }, + { + "epoch": 2.4436050364479787, + "grad_norm": 28.068340301513672, + "learning_rate": 4.128431857460402e-07, + "loss": 0.1997, + "num_input_tokens_seen": 36080600, + "step": 18437 + }, + { + "epoch": 2.443737574552684, + "grad_norm": 0.027430184185504913, + "learning_rate": 4.126521212890916e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36082432, + "step": 18438 + }, + { + "epoch": 2.443870112657389, + "grad_norm": 4.455007553100586, + "learning_rate": 4.124610970776502e-07, + "loss": 0.0172, + "num_input_tokens_seen": 36084472, + "step": 18439 + }, + { + "epoch": 2.444002650762094, + "grad_norm": 0.14643605053424835, + "learning_rate": 4.122701131153989e-07, + "loss": 0.0008, + "num_input_tokens_seen": 36085864, + "step": 18440 + }, + { + "epoch": 2.4441351888667993, + "grad_norm": 0.015648264437913895, + "learning_rate": 4.120791694060203e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36087848, + "step": 18441 + }, + { + "epoch": 2.4442677269715043, + "grad_norm": 2.361349582672119, + "learning_rate": 4.1188826595319473e-07, + "loss": 0.0167, + "num_input_tokens_seen": 36089888, + "step": 18442 + }, + { + "epoch": 2.4444002650762093, + "grad_norm": 0.0550403818488121, + "learning_rate": 4.116974027606041e-07, + "loss": 0.0004, + "num_input_tokens_seen": 36091792, + "step": 18443 + }, + { + "epoch": 2.4445328031809144, + "grad_norm": 5.834690093994141, + "learning_rate": 4.115065798319287e-07, + "loss": 0.0391, + "num_input_tokens_seen": 36094872, + "step": 18444 + }, + { + "epoch": 2.44466534128562, + "grad_norm": 0.7376577854156494, + "learning_rate": 4.113157971708473e-07, + "loss": 0.0046, + "num_input_tokens_seen": 36096712, + "step": 18445 + }, + { + "epoch": 2.444797879390325, + "grad_norm": 0.0024754374753683805, + "learning_rate": 4.111250547810383e-07, + "loss": 0.0, + "num_input_tokens_seen": 36098256, + "step": 18446 + }, + { + "epoch": 2.44493041749503, + "grad_norm": 0.046855054795742035, + "learning_rate": 4.1093435266617855e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36100848, + "step": 18447 + }, + { + "epoch": 2.445062955599735, + "grad_norm": 0.004084752406924963, + "learning_rate": 4.107436908299464e-07, + "loss": 0.0, + "num_input_tokens_seen": 36102176, + "step": 18448 + }, + { + "epoch": 2.44519549370444, + "grad_norm": 0.012817305512726307, + "learning_rate": 4.1055306927601653e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36103680, + "step": 18449 + }, + { + "epoch": 2.445328031809145, + "grad_norm": 0.9219167232513428, + "learning_rate": 4.103624880080656e-07, + "loss": 0.0024, + "num_input_tokens_seen": 36105008, + "step": 18450 + }, + { + "epoch": 2.44546056991385, + "grad_norm": 4.6600775718688965, + "learning_rate": 4.1017194702976717e-07, + "loss": 0.0429, + "num_input_tokens_seen": 36106784, + "step": 18451 + }, + { + "epoch": 2.4455931080185556, + "grad_norm": 5.627447128295898, + "learning_rate": 4.0998144634479485e-07, + "loss": 0.0164, + "num_input_tokens_seen": 36108064, + "step": 18452 + }, + { + "epoch": 2.4457256461232606, + "grad_norm": 0.22064460813999176, + "learning_rate": 4.0979098595682276e-07, + "loss": 0.0005, + "num_input_tokens_seen": 36109424, + "step": 18453 + }, + { + "epoch": 2.4458581842279656, + "grad_norm": 0.08552543073892593, + "learning_rate": 4.096005658695218e-07, + "loss": 0.0004, + "num_input_tokens_seen": 36111376, + "step": 18454 + }, + { + "epoch": 2.4459907223326707, + "grad_norm": 0.5202300548553467, + "learning_rate": 4.094101860865646e-07, + "loss": 0.0022, + "num_input_tokens_seen": 36113824, + "step": 18455 + }, + { + "epoch": 2.4461232604373757, + "grad_norm": 3.65033221244812, + "learning_rate": 4.0921984661162124e-07, + "loss": 0.0453, + "num_input_tokens_seen": 36116832, + "step": 18456 + }, + { + "epoch": 2.4462557985420808, + "grad_norm": 6.604290962219238, + "learning_rate": 4.0902954744836106e-07, + "loss": 0.0529, + "num_input_tokens_seen": 36118752, + "step": 18457 + }, + { + "epoch": 2.446388336646786, + "grad_norm": 2.9772605895996094, + "learning_rate": 4.0883928860045415e-07, + "loss": 0.0176, + "num_input_tokens_seen": 36120136, + "step": 18458 + }, + { + "epoch": 2.4465208747514913, + "grad_norm": 0.6155550479888916, + "learning_rate": 4.086490700715687e-07, + "loss": 0.0029, + "num_input_tokens_seen": 36122032, + "step": 18459 + }, + { + "epoch": 2.4466534128561963, + "grad_norm": 0.011906280182301998, + "learning_rate": 4.084588918653709e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36123840, + "step": 18460 + }, + { + "epoch": 2.4467859509609013, + "grad_norm": 0.12272311002016068, + "learning_rate": 4.082687539855293e-07, + "loss": 0.0004, + "num_input_tokens_seen": 36125096, + "step": 18461 + }, + { + "epoch": 2.4469184890656064, + "grad_norm": 7.3983588218688965, + "learning_rate": 4.0807865643570865e-07, + "loss": 0.04, + "num_input_tokens_seen": 36126528, + "step": 18462 + }, + { + "epoch": 2.4470510271703114, + "grad_norm": 0.06703300774097443, + "learning_rate": 4.078885992195752e-07, + "loss": 0.0003, + "num_input_tokens_seen": 36128520, + "step": 18463 + }, + { + "epoch": 2.4471835652750165, + "grad_norm": 0.002901841886341572, + "learning_rate": 4.0769858234079277e-07, + "loss": 0.0, + "num_input_tokens_seen": 36130000, + "step": 18464 + }, + { + "epoch": 2.4473161033797215, + "grad_norm": 0.014710960909724236, + "learning_rate": 4.0750860580302503e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36132728, + "step": 18465 + }, + { + "epoch": 2.447448641484427, + "grad_norm": 2.1773293018341064, + "learning_rate": 4.073186696099343e-07, + "loss": 0.011, + "num_input_tokens_seen": 36135328, + "step": 18466 + }, + { + "epoch": 2.447581179589132, + "grad_norm": 0.011082114651799202, + "learning_rate": 4.0712877376518327e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36136640, + "step": 18467 + }, + { + "epoch": 2.447713717693837, + "grad_norm": 11.57297420501709, + "learning_rate": 4.0693891827243397e-07, + "loss": 0.1207, + "num_input_tokens_seen": 36138264, + "step": 18468 + }, + { + "epoch": 2.447846255798542, + "grad_norm": 0.02986838109791279, + "learning_rate": 4.0674910313534627e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36139328, + "step": 18469 + }, + { + "epoch": 2.447978793903247, + "grad_norm": 10.477578163146973, + "learning_rate": 4.0655932835758e-07, + "loss": 0.0428, + "num_input_tokens_seen": 36141024, + "step": 18470 + }, + { + "epoch": 2.448111332007952, + "grad_norm": 0.09542964398860931, + "learning_rate": 4.063695939427939e-07, + "loss": 0.0003, + "num_input_tokens_seen": 36142536, + "step": 18471 + }, + { + "epoch": 2.448243870112657, + "grad_norm": 0.2583884596824646, + "learning_rate": 4.061798998946459e-07, + "loss": 0.001, + "num_input_tokens_seen": 36144256, + "step": 18472 + }, + { + "epoch": 2.4483764082173627, + "grad_norm": 5.531585216522217, + "learning_rate": 4.059902462167939e-07, + "loss": 0.0302, + "num_input_tokens_seen": 36146480, + "step": 18473 + }, + { + "epoch": 2.4485089463220677, + "grad_norm": 0.6507936716079712, + "learning_rate": 4.0580063291289496e-07, + "loss": 0.0035, + "num_input_tokens_seen": 36149096, + "step": 18474 + }, + { + "epoch": 2.4486414844267728, + "grad_norm": 7.938992977142334, + "learning_rate": 4.056110599866045e-07, + "loss": 0.0444, + "num_input_tokens_seen": 36150848, + "step": 18475 + }, + { + "epoch": 2.448774022531478, + "grad_norm": 0.8242024183273315, + "learning_rate": 4.0542152744157794e-07, + "loss": 0.0029, + "num_input_tokens_seen": 36152720, + "step": 18476 + }, + { + "epoch": 2.448906560636183, + "grad_norm": 1.5624604225158691, + "learning_rate": 4.052320352814684e-07, + "loss": 0.0091, + "num_input_tokens_seen": 36154584, + "step": 18477 + }, + { + "epoch": 2.449039098740888, + "grad_norm": 0.1616496741771698, + "learning_rate": 4.050425835099306e-07, + "loss": 0.0011, + "num_input_tokens_seen": 36157448, + "step": 18478 + }, + { + "epoch": 2.449171636845593, + "grad_norm": 0.3109237253665924, + "learning_rate": 4.048531721306173e-07, + "loss": 0.0016, + "num_input_tokens_seen": 36160808, + "step": 18479 + }, + { + "epoch": 2.4493041749502984, + "grad_norm": 10.860774040222168, + "learning_rate": 4.046638011471804e-07, + "loss": 0.3004, + "num_input_tokens_seen": 36163192, + "step": 18480 + }, + { + "epoch": 2.4494367130550034, + "grad_norm": 0.6216098666191101, + "learning_rate": 4.044744705632711e-07, + "loss": 0.0034, + "num_input_tokens_seen": 36165504, + "step": 18481 + }, + { + "epoch": 2.4495692511597085, + "grad_norm": 2.1604909896850586, + "learning_rate": 4.0428518038253926e-07, + "loss": 0.0049, + "num_input_tokens_seen": 36168392, + "step": 18482 + }, + { + "epoch": 2.4497017892644135, + "grad_norm": 0.08943673223257065, + "learning_rate": 4.040959306086345e-07, + "loss": 0.0004, + "num_input_tokens_seen": 36169768, + "step": 18483 + }, + { + "epoch": 2.4498343273691185, + "grad_norm": 5.890272617340088, + "learning_rate": 4.0390672124520607e-07, + "loss": 0.041, + "num_input_tokens_seen": 36171624, + "step": 18484 + }, + { + "epoch": 2.4499668654738236, + "grad_norm": 8.719010353088379, + "learning_rate": 4.0371755229590273e-07, + "loss": 0.071, + "num_input_tokens_seen": 36173328, + "step": 18485 + }, + { + "epoch": 2.4500994035785286, + "grad_norm": 10.766963005065918, + "learning_rate": 4.0352842376437107e-07, + "loss": 0.0944, + "num_input_tokens_seen": 36175152, + "step": 18486 + }, + { + "epoch": 2.450231941683234, + "grad_norm": 1.6075553894042969, + "learning_rate": 4.0333933565425755e-07, + "loss": 0.0151, + "num_input_tokens_seen": 36176800, + "step": 18487 + }, + { + "epoch": 2.450364479787939, + "grad_norm": 12.83462905883789, + "learning_rate": 4.031502879692076e-07, + "loss": 0.0881, + "num_input_tokens_seen": 36178744, + "step": 18488 + }, + { + "epoch": 2.450497017892644, + "grad_norm": 0.001172952470369637, + "learning_rate": 4.029612807128674e-07, + "loss": 0.0, + "num_input_tokens_seen": 36180096, + "step": 18489 + }, + { + "epoch": 2.450629555997349, + "grad_norm": 10.649200439453125, + "learning_rate": 4.0277231388887947e-07, + "loss": 0.0157, + "num_input_tokens_seen": 36181808, + "step": 18490 + }, + { + "epoch": 2.4507620941020543, + "grad_norm": 0.35138416290283203, + "learning_rate": 4.0258338750088886e-07, + "loss": 0.0011, + "num_input_tokens_seen": 36183304, + "step": 18491 + }, + { + "epoch": 2.4508946322067593, + "grad_norm": 0.0034369833301752806, + "learning_rate": 4.0239450155253766e-07, + "loss": 0.0, + "num_input_tokens_seen": 36185160, + "step": 18492 + }, + { + "epoch": 2.4510271703114643, + "grad_norm": 2.6676578521728516, + "learning_rate": 4.022056560474666e-07, + "loss": 0.0943, + "num_input_tokens_seen": 36187656, + "step": 18493 + }, + { + "epoch": 2.45115970841617, + "grad_norm": 0.39595407247543335, + "learning_rate": 4.0201685098931855e-07, + "loss": 0.002, + "num_input_tokens_seen": 36188832, + "step": 18494 + }, + { + "epoch": 2.451292246520875, + "grad_norm": 0.016962377354502678, + "learning_rate": 4.018280863817328e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36191904, + "step": 18495 + }, + { + "epoch": 2.45142478462558, + "grad_norm": 0.001013073604553938, + "learning_rate": 4.016393622283485e-07, + "loss": 0.0, + "num_input_tokens_seen": 36193192, + "step": 18496 + }, + { + "epoch": 2.451557322730285, + "grad_norm": 0.0008697627345100045, + "learning_rate": 4.014506785328054e-07, + "loss": 0.0, + "num_input_tokens_seen": 36195344, + "step": 18497 + }, + { + "epoch": 2.45168986083499, + "grad_norm": 0.18789683282375336, + "learning_rate": 4.012620352987409e-07, + "loss": 0.0004, + "num_input_tokens_seen": 36197296, + "step": 18498 + }, + { + "epoch": 2.451822398939695, + "grad_norm": 0.027016781270503998, + "learning_rate": 4.0107343252979157e-07, + "loss": 0.0003, + "num_input_tokens_seen": 36200280, + "step": 18499 + }, + { + "epoch": 2.4519549370444, + "grad_norm": 9.296707153320312, + "learning_rate": 4.0088487022959507e-07, + "loss": 0.1531, + "num_input_tokens_seen": 36202512, + "step": 18500 + }, + { + "epoch": 2.4520874751491055, + "grad_norm": 5.446923732757568, + "learning_rate": 4.0069634840178645e-07, + "loss": 0.102, + "num_input_tokens_seen": 36204840, + "step": 18501 + }, + { + "epoch": 2.4522200132538106, + "grad_norm": 0.012347945012152195, + "learning_rate": 4.0050786704999985e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36206552, + "step": 18502 + }, + { + "epoch": 2.4523525513585156, + "grad_norm": 1.0581101179122925, + "learning_rate": 4.0031942617787063e-07, + "loss": 0.006, + "num_input_tokens_seen": 36207728, + "step": 18503 + }, + { + "epoch": 2.4524850894632206, + "grad_norm": 6.921823024749756, + "learning_rate": 4.001310257890306e-07, + "loss": 0.0622, + "num_input_tokens_seen": 36209592, + "step": 18504 + }, + { + "epoch": 2.4526176275679257, + "grad_norm": 6.245072364807129, + "learning_rate": 3.999426658871136e-07, + "loss": 0.0174, + "num_input_tokens_seen": 36212592, + "step": 18505 + }, + { + "epoch": 2.4527501656726307, + "grad_norm": 1.3434382677078247, + "learning_rate": 3.9975434647575083e-07, + "loss": 0.0019, + "num_input_tokens_seen": 36213984, + "step": 18506 + }, + { + "epoch": 2.4528827037773357, + "grad_norm": 0.032076429575681686, + "learning_rate": 3.995660675585725e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36215760, + "step": 18507 + }, + { + "epoch": 2.4530152418820412, + "grad_norm": 6.930546283721924, + "learning_rate": 3.9937782913921e-07, + "loss": 0.0521, + "num_input_tokens_seen": 36217704, + "step": 18508 + }, + { + "epoch": 2.4531477799867463, + "grad_norm": 0.9492597579956055, + "learning_rate": 3.9918963122129175e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36219568, + "step": 18509 + }, + { + "epoch": 2.4532803180914513, + "grad_norm": 0.0065710642375051975, + "learning_rate": 3.9900147380844716e-07, + "loss": 0.0, + "num_input_tokens_seen": 36220880, + "step": 18510 + }, + { + "epoch": 2.4534128561961563, + "grad_norm": 0.0010866832453757524, + "learning_rate": 3.9881335690430345e-07, + "loss": 0.0, + "num_input_tokens_seen": 36223008, + "step": 18511 + }, + { + "epoch": 2.4535453943008614, + "grad_norm": 0.23792217671871185, + "learning_rate": 3.9862528051248764e-07, + "loss": 0.0007, + "num_input_tokens_seen": 36224568, + "step": 18512 + }, + { + "epoch": 2.4536779324055664, + "grad_norm": 0.012285387143492699, + "learning_rate": 3.984372446366258e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36226752, + "step": 18513 + }, + { + "epoch": 2.4538104705102715, + "grad_norm": 8.636557579040527, + "learning_rate": 3.9824924928034333e-07, + "loss": 0.2031, + "num_input_tokens_seen": 36228432, + "step": 18514 + }, + { + "epoch": 2.453943008614977, + "grad_norm": 4.483791828155518, + "learning_rate": 3.9806129444726615e-07, + "loss": 0.0713, + "num_input_tokens_seen": 36231328, + "step": 18515 + }, + { + "epoch": 2.454075546719682, + "grad_norm": 0.38840046525001526, + "learning_rate": 3.978733801410173e-07, + "loss": 0.0016, + "num_input_tokens_seen": 36233216, + "step": 18516 + }, + { + "epoch": 2.454208084824387, + "grad_norm": 0.705833911895752, + "learning_rate": 3.9768550636521966e-07, + "loss": 0.0016, + "num_input_tokens_seen": 36234584, + "step": 18517 + }, + { + "epoch": 2.454340622929092, + "grad_norm": 0.3550834655761719, + "learning_rate": 3.9749767312349585e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36236032, + "step": 18518 + }, + { + "epoch": 2.454473161033797, + "grad_norm": 0.39238330721855164, + "learning_rate": 3.9730988041946664e-07, + "loss": 0.001, + "num_input_tokens_seen": 36237704, + "step": 18519 + }, + { + "epoch": 2.454605699138502, + "grad_norm": 0.08152070641517639, + "learning_rate": 3.971221282567536e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36238952, + "step": 18520 + }, + { + "epoch": 2.4547382372432076, + "grad_norm": 9.114176750183105, + "learning_rate": 3.969344166389771e-07, + "loss": 0.0583, + "num_input_tokens_seen": 36240968, + "step": 18521 + }, + { + "epoch": 2.4548707753479126, + "grad_norm": 7.351638317108154, + "learning_rate": 3.96746745569756e-07, + "loss": 0.1613, + "num_input_tokens_seen": 36243296, + "step": 18522 + }, + { + "epoch": 2.4550033134526177, + "grad_norm": 13.040792465209961, + "learning_rate": 3.965591150527087e-07, + "loss": 0.1516, + "num_input_tokens_seen": 36245752, + "step": 18523 + }, + { + "epoch": 2.4551358515573227, + "grad_norm": 0.03130553662776947, + "learning_rate": 3.963715250914521e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36248136, + "step": 18524 + }, + { + "epoch": 2.4552683896620278, + "grad_norm": 5.817678928375244, + "learning_rate": 3.961839756896041e-07, + "loss": 0.053, + "num_input_tokens_seen": 36250272, + "step": 18525 + }, + { + "epoch": 2.455400927766733, + "grad_norm": 0.0018244199454784393, + "learning_rate": 3.9599646685078016e-07, + "loss": 0.0, + "num_input_tokens_seen": 36251736, + "step": 18526 + }, + { + "epoch": 2.4555334658714383, + "grad_norm": 7.069921970367432, + "learning_rate": 3.958089985785962e-07, + "loss": 0.0978, + "num_input_tokens_seen": 36253928, + "step": 18527 + }, + { + "epoch": 2.4556660039761433, + "grad_norm": 4.119227886199951, + "learning_rate": 3.956215708766664e-07, + "loss": 0.0153, + "num_input_tokens_seen": 36256224, + "step": 18528 + }, + { + "epoch": 2.4557985420808484, + "grad_norm": 2.882883071899414, + "learning_rate": 3.9543418374860446e-07, + "loss": 0.0704, + "num_input_tokens_seen": 36258816, + "step": 18529 + }, + { + "epoch": 2.4559310801855534, + "grad_norm": 0.0659695565700531, + "learning_rate": 3.9524683719802255e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36260704, + "step": 18530 + }, + { + "epoch": 2.4560636182902584, + "grad_norm": 0.052687473595142365, + "learning_rate": 3.950595312285338e-07, + "loss": 0.0003, + "num_input_tokens_seen": 36262552, + "step": 18531 + }, + { + "epoch": 2.4561961563949635, + "grad_norm": 0.01680474914610386, + "learning_rate": 3.948722658437501e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36263848, + "step": 18532 + }, + { + "epoch": 2.4563286944996685, + "grad_norm": 14.849852561950684, + "learning_rate": 3.9468504104728107e-07, + "loss": 0.2957, + "num_input_tokens_seen": 36265472, + "step": 18533 + }, + { + "epoch": 2.456461232604374, + "grad_norm": 19.147478103637695, + "learning_rate": 3.944978568427371e-07, + "loss": 0.3487, + "num_input_tokens_seen": 36267112, + "step": 18534 + }, + { + "epoch": 2.456593770709079, + "grad_norm": 1.466558575630188, + "learning_rate": 3.9431071323372623e-07, + "loss": 0.004, + "num_input_tokens_seen": 36269504, + "step": 18535 + }, + { + "epoch": 2.456726308813784, + "grad_norm": 0.021101413294672966, + "learning_rate": 3.94123610223858e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36271240, + "step": 18536 + }, + { + "epoch": 2.456858846918489, + "grad_norm": 3.013065814971924, + "learning_rate": 3.9393654781673893e-07, + "loss": 0.0523, + "num_input_tokens_seen": 36273184, + "step": 18537 + }, + { + "epoch": 2.456991385023194, + "grad_norm": 0.25481125712394714, + "learning_rate": 3.9374952601597654e-07, + "loss": 0.0015, + "num_input_tokens_seen": 36275496, + "step": 18538 + }, + { + "epoch": 2.457123923127899, + "grad_norm": 0.0035919276997447014, + "learning_rate": 3.935625448251765e-07, + "loss": 0.0, + "num_input_tokens_seen": 36276984, + "step": 18539 + }, + { + "epoch": 2.457256461232604, + "grad_norm": 0.009860724210739136, + "learning_rate": 3.933756042479428e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36278680, + "step": 18540 + }, + { + "epoch": 2.4573889993373097, + "grad_norm": 4.453540802001953, + "learning_rate": 3.931887042878815e-07, + "loss": 0.011, + "num_input_tokens_seen": 36280336, + "step": 18541 + }, + { + "epoch": 2.4575215374420147, + "grad_norm": 1.2259290218353271, + "learning_rate": 3.930018449485956e-07, + "loss": 0.0029, + "num_input_tokens_seen": 36282464, + "step": 18542 + }, + { + "epoch": 2.4576540755467198, + "grad_norm": 0.015249701216816902, + "learning_rate": 3.928150262336869e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36283752, + "step": 18543 + }, + { + "epoch": 2.457786613651425, + "grad_norm": 2.2574620246887207, + "learning_rate": 3.926282481467589e-07, + "loss": 0.022, + "num_input_tokens_seen": 36285480, + "step": 18544 + }, + { + "epoch": 2.45791915175613, + "grad_norm": 0.017547357827425003, + "learning_rate": 3.924415106914112e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36287912, + "step": 18545 + }, + { + "epoch": 2.458051689860835, + "grad_norm": 0.019435573369264603, + "learning_rate": 3.9225481387124594e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36289632, + "step": 18546 + }, + { + "epoch": 2.45818422796554, + "grad_norm": 1.5946136713027954, + "learning_rate": 3.920681576898619e-07, + "loss": 0.0062, + "num_input_tokens_seen": 36291336, + "step": 18547 + }, + { + "epoch": 2.4583167660702454, + "grad_norm": 0.0018522939644753933, + "learning_rate": 3.9188154215085786e-07, + "loss": 0.0, + "num_input_tokens_seen": 36292376, + "step": 18548 + }, + { + "epoch": 2.4584493041749504, + "grad_norm": 6.19691801071167, + "learning_rate": 3.916949672578316e-07, + "loss": 0.1191, + "num_input_tokens_seen": 36295440, + "step": 18549 + }, + { + "epoch": 2.4585818422796555, + "grad_norm": 0.012406140565872192, + "learning_rate": 3.9150843301438063e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36297232, + "step": 18550 + }, + { + "epoch": 2.4587143803843605, + "grad_norm": 0.20916685461997986, + "learning_rate": 3.9132193942410244e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36299496, + "step": 18551 + }, + { + "epoch": 2.4588469184890656, + "grad_norm": 0.05357753857970238, + "learning_rate": 3.9113548649059167e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36301848, + "step": 18552 + }, + { + "epoch": 2.4589794565937706, + "grad_norm": 5.810751438140869, + "learning_rate": 3.9094907421744375e-07, + "loss": 0.0474, + "num_input_tokens_seen": 36303944, + "step": 18553 + }, + { + "epoch": 2.4591119946984756, + "grad_norm": 0.12448681145906448, + "learning_rate": 3.907627026082525e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36306536, + "step": 18554 + }, + { + "epoch": 2.459244532803181, + "grad_norm": 0.010513396002352238, + "learning_rate": 3.9057637166661113e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36307888, + "step": 18555 + }, + { + "epoch": 2.459377070907886, + "grad_norm": 11.488607406616211, + "learning_rate": 3.903900813961123e-07, + "loss": 0.0607, + "num_input_tokens_seen": 36309224, + "step": 18556 + }, + { + "epoch": 2.459509609012591, + "grad_norm": 7.867898941040039, + "learning_rate": 3.902038318003487e-07, + "loss": 0.0652, + "num_input_tokens_seen": 36310960, + "step": 18557 + }, + { + "epoch": 2.4596421471172962, + "grad_norm": 0.0015349120367318392, + "learning_rate": 3.9001762288291056e-07, + "loss": 0.0, + "num_input_tokens_seen": 36312072, + "step": 18558 + }, + { + "epoch": 2.4597746852220013, + "grad_norm": 2.8339264392852783, + "learning_rate": 3.898314546473883e-07, + "loss": 0.0121, + "num_input_tokens_seen": 36313712, + "step": 18559 + }, + { + "epoch": 2.4599072233267063, + "grad_norm": 5.4413042068481445, + "learning_rate": 3.8964532709737103e-07, + "loss": 0.0216, + "num_input_tokens_seen": 36316000, + "step": 18560 + }, + { + "epoch": 2.4600397614314113, + "grad_norm": 5.859724998474121, + "learning_rate": 3.894592402364472e-07, + "loss": 0.0722, + "num_input_tokens_seen": 36318744, + "step": 18561 + }, + { + "epoch": 2.460172299536117, + "grad_norm": 0.0006884496542625129, + "learning_rate": 3.8927319406820625e-07, + "loss": 0.0, + "num_input_tokens_seen": 36320192, + "step": 18562 + }, + { + "epoch": 2.460304837640822, + "grad_norm": 8.242111206054688, + "learning_rate": 3.8908718859623413e-07, + "loss": 0.1392, + "num_input_tokens_seen": 36322384, + "step": 18563 + }, + { + "epoch": 2.460437375745527, + "grad_norm": 4.685074329376221, + "learning_rate": 3.8890122382411714e-07, + "loss": 0.0755, + "num_input_tokens_seen": 36324728, + "step": 18564 + }, + { + "epoch": 2.460569913850232, + "grad_norm": 0.04208492860198021, + "learning_rate": 3.8871529975544084e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36326528, + "step": 18565 + }, + { + "epoch": 2.460702451954937, + "grad_norm": 0.0041338251903653145, + "learning_rate": 3.885294163937897e-07, + "loss": 0.0, + "num_input_tokens_seen": 36327976, + "step": 18566 + }, + { + "epoch": 2.460834990059642, + "grad_norm": 20.510738372802734, + "learning_rate": 3.8834357374274793e-07, + "loss": 0.1064, + "num_input_tokens_seen": 36329216, + "step": 18567 + }, + { + "epoch": 2.460967528164347, + "grad_norm": 0.0022442529443651438, + "learning_rate": 3.881577718058993e-07, + "loss": 0.0, + "num_input_tokens_seen": 36330984, + "step": 18568 + }, + { + "epoch": 2.4611000662690525, + "grad_norm": 0.3472481071949005, + "learning_rate": 3.8797201058682595e-07, + "loss": 0.002, + "num_input_tokens_seen": 36333032, + "step": 18569 + }, + { + "epoch": 2.4612326043737576, + "grad_norm": 3.3336098194122314, + "learning_rate": 3.8778629008910913e-07, + "loss": 0.0084, + "num_input_tokens_seen": 36335824, + "step": 18570 + }, + { + "epoch": 2.4613651424784626, + "grad_norm": 1.921223521232605, + "learning_rate": 3.876006103163293e-07, + "loss": 0.0145, + "num_input_tokens_seen": 36337488, + "step": 18571 + }, + { + "epoch": 2.4614976805831676, + "grad_norm": 10.020763397216797, + "learning_rate": 3.8741497127206733e-07, + "loss": 0.2283, + "num_input_tokens_seen": 36339880, + "step": 18572 + }, + { + "epoch": 2.4616302186878727, + "grad_norm": 5.586206912994385, + "learning_rate": 3.8722937295990163e-07, + "loss": 0.0714, + "num_input_tokens_seen": 36341824, + "step": 18573 + }, + { + "epoch": 2.4617627567925777, + "grad_norm": 1.663621187210083, + "learning_rate": 3.870438153834119e-07, + "loss": 0.0161, + "num_input_tokens_seen": 36344568, + "step": 18574 + }, + { + "epoch": 2.4618952948972828, + "grad_norm": 0.010140071623027325, + "learning_rate": 3.86858298546175e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36346104, + "step": 18575 + }, + { + "epoch": 2.4620278330019882, + "grad_norm": 0.01147930882871151, + "learning_rate": 3.866728224517671e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36347664, + "step": 18576 + }, + { + "epoch": 2.4621603711066933, + "grad_norm": 0.15439902245998383, + "learning_rate": 3.8648738710376584e-07, + "loss": 0.0008, + "num_input_tokens_seen": 36349464, + "step": 18577 + }, + { + "epoch": 2.4622929092113983, + "grad_norm": 0.0025804725009948015, + "learning_rate": 3.863019925057457e-07, + "loss": 0.0, + "num_input_tokens_seen": 36350840, + "step": 18578 + }, + { + "epoch": 2.4624254473161034, + "grad_norm": 0.08611530810594559, + "learning_rate": 3.861166386612808e-07, + "loss": 0.0005, + "num_input_tokens_seen": 36353352, + "step": 18579 + }, + { + "epoch": 2.4625579854208084, + "grad_norm": 0.2755880355834961, + "learning_rate": 3.859313255739458e-07, + "loss": 0.0012, + "num_input_tokens_seen": 36355088, + "step": 18580 + }, + { + "epoch": 2.4626905235255134, + "grad_norm": 0.07325536757707596, + "learning_rate": 3.8574605324731287e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36356488, + "step": 18581 + }, + { + "epoch": 2.4628230616302185, + "grad_norm": 0.8562663793563843, + "learning_rate": 3.855608216849552e-07, + "loss": 0.003, + "num_input_tokens_seen": 36357832, + "step": 18582 + }, + { + "epoch": 2.462955599734924, + "grad_norm": 0.017841368913650513, + "learning_rate": 3.8537563089044355e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36360312, + "step": 18583 + }, + { + "epoch": 2.463088137839629, + "grad_norm": 2.6896965503692627, + "learning_rate": 3.851904808673479e-07, + "loss": 0.0278, + "num_input_tokens_seen": 36362816, + "step": 18584 + }, + { + "epoch": 2.463220675944334, + "grad_norm": 0.028345245867967606, + "learning_rate": 3.8500537161923923e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36364936, + "step": 18585 + }, + { + "epoch": 2.463353214049039, + "grad_norm": 4.704026699066162, + "learning_rate": 3.848203031496861e-07, + "loss": 0.076, + "num_input_tokens_seen": 36367544, + "step": 18586 + }, + { + "epoch": 2.463485752153744, + "grad_norm": 8.027031898498535, + "learning_rate": 3.846352754622562e-07, + "loss": 0.1695, + "num_input_tokens_seen": 36369976, + "step": 18587 + }, + { + "epoch": 2.463618290258449, + "grad_norm": 0.032721854746341705, + "learning_rate": 3.844502885605181e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36371568, + "step": 18588 + }, + { + "epoch": 2.463750828363154, + "grad_norm": 5.697801113128662, + "learning_rate": 3.8426534244803774e-07, + "loss": 0.1086, + "num_input_tokens_seen": 36373536, + "step": 18589 + }, + { + "epoch": 2.4638833664678597, + "grad_norm": 14.564227104187012, + "learning_rate": 3.8408043712838074e-07, + "loss": 0.0802, + "num_input_tokens_seen": 36375112, + "step": 18590 + }, + { + "epoch": 2.4640159045725647, + "grad_norm": 0.040207985788583755, + "learning_rate": 3.83895572605113e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36376944, + "step": 18591 + }, + { + "epoch": 2.4641484426772697, + "grad_norm": 0.013511697761714458, + "learning_rate": 3.8371074888179814e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36378896, + "step": 18592 + }, + { + "epoch": 2.4642809807819748, + "grad_norm": 0.003365330398082733, + "learning_rate": 3.835259659620003e-07, + "loss": 0.0, + "num_input_tokens_seen": 36381528, + "step": 18593 + }, + { + "epoch": 2.46441351888668, + "grad_norm": 0.010893715545535088, + "learning_rate": 3.83341223849282e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36383824, + "step": 18594 + }, + { + "epoch": 2.464546056991385, + "grad_norm": 6.247860431671143, + "learning_rate": 3.83156522547205e-07, + "loss": 0.0622, + "num_input_tokens_seen": 36385560, + "step": 18595 + }, + { + "epoch": 2.46467859509609, + "grad_norm": 0.0010095748584717512, + "learning_rate": 3.8297186205933034e-07, + "loss": 0.0, + "num_input_tokens_seen": 36387000, + "step": 18596 + }, + { + "epoch": 2.4648111332007954, + "grad_norm": 5.720786094665527, + "learning_rate": 3.827872423892184e-07, + "loss": 0.026, + "num_input_tokens_seen": 36389624, + "step": 18597 + }, + { + "epoch": 2.4649436713055004, + "grad_norm": 0.07707821577787399, + "learning_rate": 3.8260266354042954e-07, + "loss": 0.0005, + "num_input_tokens_seen": 36391048, + "step": 18598 + }, + { + "epoch": 2.4650762094102054, + "grad_norm": 0.26295992732048035, + "learning_rate": 3.8241812551652215e-07, + "loss": 0.0014, + "num_input_tokens_seen": 36392616, + "step": 18599 + }, + { + "epoch": 2.4652087475149105, + "grad_norm": 0.02838672511279583, + "learning_rate": 3.8223362832105427e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36394232, + "step": 18600 + }, + { + "epoch": 2.4653412856196155, + "grad_norm": 7.368450164794922, + "learning_rate": 3.820491719575828e-07, + "loss": 0.07, + "num_input_tokens_seen": 36396928, + "step": 18601 + }, + { + "epoch": 2.4654738237243206, + "grad_norm": 0.8114721775054932, + "learning_rate": 3.8186475642966376e-07, + "loss": 0.0035, + "num_input_tokens_seen": 36398672, + "step": 18602 + }, + { + "epoch": 2.465606361829026, + "grad_norm": 0.0015176148153841496, + "learning_rate": 3.8168038174085373e-07, + "loss": 0.0, + "num_input_tokens_seen": 36399880, + "step": 18603 + }, + { + "epoch": 2.465738899933731, + "grad_norm": 8.708642959594727, + "learning_rate": 3.8149604789470775e-07, + "loss": 0.2358, + "num_input_tokens_seen": 36403016, + "step": 18604 + }, + { + "epoch": 2.465871438038436, + "grad_norm": 2.6724283695220947, + "learning_rate": 3.813117548947792e-07, + "loss": 0.0057, + "num_input_tokens_seen": 36404504, + "step": 18605 + }, + { + "epoch": 2.466003976143141, + "grad_norm": 6.241840839385986, + "learning_rate": 3.811275027446221e-07, + "loss": 0.0216, + "num_input_tokens_seen": 36406024, + "step": 18606 + }, + { + "epoch": 2.466136514247846, + "grad_norm": 6.874896049499512, + "learning_rate": 3.8094329144778756e-07, + "loss": 0.0768, + "num_input_tokens_seen": 36407568, + "step": 18607 + }, + { + "epoch": 2.4662690523525512, + "grad_norm": 3.1563775539398193, + "learning_rate": 3.8075912100782877e-07, + "loss": 0.026, + "num_input_tokens_seen": 36409168, + "step": 18608 + }, + { + "epoch": 2.4664015904572567, + "grad_norm": 5.883502960205078, + "learning_rate": 3.805749914282955e-07, + "loss": 0.0669, + "num_input_tokens_seen": 36411416, + "step": 18609 + }, + { + "epoch": 2.4665341285619617, + "grad_norm": 0.36826008558273315, + "learning_rate": 3.803909027127395e-07, + "loss": 0.0023, + "num_input_tokens_seen": 36413112, + "step": 18610 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 2.848257064819336, + "learning_rate": 3.8020685486470863e-07, + "loss": 0.0113, + "num_input_tokens_seen": 36416000, + "step": 18611 + }, + { + "epoch": 2.466799204771372, + "grad_norm": 3.7428433895111084, + "learning_rate": 3.800228478877524e-07, + "loss": 0.0316, + "num_input_tokens_seen": 36417912, + "step": 18612 + }, + { + "epoch": 2.466931742876077, + "grad_norm": 2.251021146774292, + "learning_rate": 3.7983888178541734e-07, + "loss": 0.0158, + "num_input_tokens_seen": 36419736, + "step": 18613 + }, + { + "epoch": 2.467064280980782, + "grad_norm": 6.864626407623291, + "learning_rate": 3.796549565612512e-07, + "loss": 0.0238, + "num_input_tokens_seen": 36422304, + "step": 18614 + }, + { + "epoch": 2.467196819085487, + "grad_norm": 8.772578239440918, + "learning_rate": 3.7947107221880107e-07, + "loss": 0.1593, + "num_input_tokens_seen": 36423960, + "step": 18615 + }, + { + "epoch": 2.4673293571901924, + "grad_norm": 10.461966514587402, + "learning_rate": 3.7928722876161143e-07, + "loss": 0.2263, + "num_input_tokens_seen": 36426192, + "step": 18616 + }, + { + "epoch": 2.4674618952948975, + "grad_norm": 21.60616683959961, + "learning_rate": 3.7910342619322694e-07, + "loss": 0.1168, + "num_input_tokens_seen": 36428144, + "step": 18617 + }, + { + "epoch": 2.4675944333996025, + "grad_norm": 0.8281954526901245, + "learning_rate": 3.7891966451719107e-07, + "loss": 0.0033, + "num_input_tokens_seen": 36430032, + "step": 18618 + }, + { + "epoch": 2.4677269715043075, + "grad_norm": 0.02571220137178898, + "learning_rate": 3.7873594373704775e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36431832, + "step": 18619 + }, + { + "epoch": 2.4678595096090126, + "grad_norm": 0.5457932949066162, + "learning_rate": 3.785522638563385e-07, + "loss": 0.0012, + "num_input_tokens_seen": 36434232, + "step": 18620 + }, + { + "epoch": 2.4679920477137176, + "grad_norm": 2.104452610015869, + "learning_rate": 3.783686248786053e-07, + "loss": 0.0132, + "num_input_tokens_seen": 36436032, + "step": 18621 + }, + { + "epoch": 2.4681245858184226, + "grad_norm": 4.263936996459961, + "learning_rate": 3.7818502680738914e-07, + "loss": 0.0272, + "num_input_tokens_seen": 36437688, + "step": 18622 + }, + { + "epoch": 2.468257123923128, + "grad_norm": 2.0617122650146484, + "learning_rate": 3.7800146964622833e-07, + "loss": 0.0054, + "num_input_tokens_seen": 36440208, + "step": 18623 + }, + { + "epoch": 2.468389662027833, + "grad_norm": 11.768367767333984, + "learning_rate": 3.778179533986642e-07, + "loss": 0.0906, + "num_input_tokens_seen": 36441992, + "step": 18624 + }, + { + "epoch": 2.468522200132538, + "grad_norm": 0.14102783799171448, + "learning_rate": 3.7763447806823366e-07, + "loss": 0.0012, + "num_input_tokens_seen": 36446072, + "step": 18625 + }, + { + "epoch": 2.4686547382372432, + "grad_norm": 0.0030637281015515327, + "learning_rate": 3.7745104365847384e-07, + "loss": 0.0, + "num_input_tokens_seen": 36448152, + "step": 18626 + }, + { + "epoch": 2.4687872763419483, + "grad_norm": 2.3659050464630127, + "learning_rate": 3.7726765017292277e-07, + "loss": 0.013, + "num_input_tokens_seen": 36450256, + "step": 18627 + }, + { + "epoch": 2.4689198144466533, + "grad_norm": 0.17747730016708374, + "learning_rate": 3.770842976151154e-07, + "loss": 0.0007, + "num_input_tokens_seen": 36452432, + "step": 18628 + }, + { + "epoch": 2.4690523525513584, + "grad_norm": 3.586778402328491, + "learning_rate": 3.769009859885878e-07, + "loss": 0.016, + "num_input_tokens_seen": 36454640, + "step": 18629 + }, + { + "epoch": 2.469184890656064, + "grad_norm": 0.044169340282678604, + "learning_rate": 3.7671771529687374e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36456752, + "step": 18630 + }, + { + "epoch": 2.469317428760769, + "grad_norm": 11.662440299987793, + "learning_rate": 3.765344855435071e-07, + "loss": 0.1469, + "num_input_tokens_seen": 36458600, + "step": 18631 + }, + { + "epoch": 2.469449966865474, + "grad_norm": 8.108439445495605, + "learning_rate": 3.7635129673201984e-07, + "loss": 0.0599, + "num_input_tokens_seen": 36460736, + "step": 18632 + }, + { + "epoch": 2.469582504970179, + "grad_norm": 28.31918716430664, + "learning_rate": 3.7616814886594467e-07, + "loss": 0.1525, + "num_input_tokens_seen": 36462344, + "step": 18633 + }, + { + "epoch": 2.469715043074884, + "grad_norm": 0.03383000195026398, + "learning_rate": 3.7598504194881315e-07, + "loss": 0.0004, + "num_input_tokens_seen": 36464024, + "step": 18634 + }, + { + "epoch": 2.469847581179589, + "grad_norm": 9.165836334228516, + "learning_rate": 3.7580197598415523e-07, + "loss": 0.1452, + "num_input_tokens_seen": 36465176, + "step": 18635 + }, + { + "epoch": 2.469980119284294, + "grad_norm": 1.2978737354278564, + "learning_rate": 3.756189509755009e-07, + "loss": 0.0039, + "num_input_tokens_seen": 36466720, + "step": 18636 + }, + { + "epoch": 2.4701126573889995, + "grad_norm": 0.005410389043390751, + "learning_rate": 3.75435966926378e-07, + "loss": 0.0, + "num_input_tokens_seen": 36468304, + "step": 18637 + }, + { + "epoch": 2.4702451954937046, + "grad_norm": 3.268007516860962, + "learning_rate": 3.7525302384031543e-07, + "loss": 0.0105, + "num_input_tokens_seen": 36469944, + "step": 18638 + }, + { + "epoch": 2.4703777335984096, + "grad_norm": 0.024959152564406395, + "learning_rate": 3.7507012172084076e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36471800, + "step": 18639 + }, + { + "epoch": 2.4705102717031147, + "grad_norm": 4.738266944885254, + "learning_rate": 3.748872605714801e-07, + "loss": 0.0823, + "num_input_tokens_seen": 36473504, + "step": 18640 + }, + { + "epoch": 2.4706428098078197, + "grad_norm": 0.001865852507762611, + "learning_rate": 3.747044403957592e-07, + "loss": 0.0, + "num_input_tokens_seen": 36475024, + "step": 18641 + }, + { + "epoch": 2.4707753479125247, + "grad_norm": 8.800101280212402, + "learning_rate": 3.7452166119720275e-07, + "loss": 0.1303, + "num_input_tokens_seen": 36478192, + "step": 18642 + }, + { + "epoch": 2.4709078860172298, + "grad_norm": 5.0085768699646, + "learning_rate": 3.7433892297933453e-07, + "loss": 0.023, + "num_input_tokens_seen": 36480392, + "step": 18643 + }, + { + "epoch": 2.4710404241219353, + "grad_norm": 0.019891148433089256, + "learning_rate": 3.7415622574567823e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36482280, + "step": 18644 + }, + { + "epoch": 2.4711729622266403, + "grad_norm": 0.007515651639550924, + "learning_rate": 3.7397356949975704e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36483920, + "step": 18645 + }, + { + "epoch": 2.4713055003313453, + "grad_norm": 4.763917922973633, + "learning_rate": 3.737909542450918e-07, + "loss": 0.0339, + "num_input_tokens_seen": 36486152, + "step": 18646 + }, + { + "epoch": 2.4714380384360504, + "grad_norm": 0.5105686783790588, + "learning_rate": 3.7360837998520416e-07, + "loss": 0.0022, + "num_input_tokens_seen": 36487896, + "step": 18647 + }, + { + "epoch": 2.4715705765407554, + "grad_norm": 3.8065185546875, + "learning_rate": 3.734258467236138e-07, + "loss": 0.04, + "num_input_tokens_seen": 36489792, + "step": 18648 + }, + { + "epoch": 2.4717031146454604, + "grad_norm": 7.1392998695373535, + "learning_rate": 3.732433544638395e-07, + "loss": 0.0216, + "num_input_tokens_seen": 36491608, + "step": 18649 + }, + { + "epoch": 2.4718356527501655, + "grad_norm": 0.06258106231689453, + "learning_rate": 3.7306090320940044e-07, + "loss": 0.0005, + "num_input_tokens_seen": 36493480, + "step": 18650 + }, + { + "epoch": 2.471968190854871, + "grad_norm": 0.07136065512895584, + "learning_rate": 3.7287849296381515e-07, + "loss": 0.0003, + "num_input_tokens_seen": 36495456, + "step": 18651 + }, + { + "epoch": 2.472100728959576, + "grad_norm": 0.003425769740715623, + "learning_rate": 3.726961237305998e-07, + "loss": 0.0, + "num_input_tokens_seen": 36497752, + "step": 18652 + }, + { + "epoch": 2.472233267064281, + "grad_norm": 5.968837738037109, + "learning_rate": 3.725137955132707e-07, + "loss": 0.0432, + "num_input_tokens_seen": 36500104, + "step": 18653 + }, + { + "epoch": 2.472365805168986, + "grad_norm": 0.0020306508522480726, + "learning_rate": 3.723315083153431e-07, + "loss": 0.0, + "num_input_tokens_seen": 36501600, + "step": 18654 + }, + { + "epoch": 2.472498343273691, + "grad_norm": 0.8280400037765503, + "learning_rate": 3.721492621403322e-07, + "loss": 0.0042, + "num_input_tokens_seen": 36504064, + "step": 18655 + }, + { + "epoch": 2.472630881378396, + "grad_norm": 4.275320529937744, + "learning_rate": 3.719670569917508e-07, + "loss": 0.0199, + "num_input_tokens_seen": 36505952, + "step": 18656 + }, + { + "epoch": 2.472763419483101, + "grad_norm": 1.9094735383987427, + "learning_rate": 3.7178489287311295e-07, + "loss": 0.0046, + "num_input_tokens_seen": 36508768, + "step": 18657 + }, + { + "epoch": 2.4728959575878067, + "grad_norm": 0.16728071868419647, + "learning_rate": 3.7160276978793096e-07, + "loss": 0.0011, + "num_input_tokens_seen": 36510216, + "step": 18658 + }, + { + "epoch": 2.4730284956925117, + "grad_norm": 0.047469545155763626, + "learning_rate": 3.71420687739715e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36513248, + "step": 18659 + }, + { + "epoch": 2.4731610337972167, + "grad_norm": 0.0017732606502249837, + "learning_rate": 3.712386467319776e-07, + "loss": 0.0, + "num_input_tokens_seen": 36514752, + "step": 18660 + }, + { + "epoch": 2.473293571901922, + "grad_norm": 4.72714376449585, + "learning_rate": 3.710566467682272e-07, + "loss": 0.0208, + "num_input_tokens_seen": 36516696, + "step": 18661 + }, + { + "epoch": 2.473426110006627, + "grad_norm": 24.258129119873047, + "learning_rate": 3.708746878519731e-07, + "loss": 0.2875, + "num_input_tokens_seen": 36518168, + "step": 18662 + }, + { + "epoch": 2.473558648111332, + "grad_norm": 0.8548184633255005, + "learning_rate": 3.706927699867241e-07, + "loss": 0.0034, + "num_input_tokens_seen": 36519624, + "step": 18663 + }, + { + "epoch": 2.473691186216037, + "grad_norm": 0.6532331705093384, + "learning_rate": 3.705108931759871e-07, + "loss": 0.0029, + "num_input_tokens_seen": 36521544, + "step": 18664 + }, + { + "epoch": 2.4738237243207424, + "grad_norm": 0.7739023566246033, + "learning_rate": 3.703290574232696e-07, + "loss": 0.003, + "num_input_tokens_seen": 36523568, + "step": 18665 + }, + { + "epoch": 2.4739562624254474, + "grad_norm": 4.31135368347168, + "learning_rate": 3.7014726273207706e-07, + "loss": 0.0579, + "num_input_tokens_seen": 36526032, + "step": 18666 + }, + { + "epoch": 2.4740888005301525, + "grad_norm": 0.013768148608505726, + "learning_rate": 3.69965509105914e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36527312, + "step": 18667 + }, + { + "epoch": 2.4742213386348575, + "grad_norm": 1.9507547616958618, + "learning_rate": 3.697837965482859e-07, + "loss": 0.015, + "num_input_tokens_seen": 36528832, + "step": 18668 + }, + { + "epoch": 2.4743538767395625, + "grad_norm": 6.322918891906738, + "learning_rate": 3.696021250626958e-07, + "loss": 0.065, + "num_input_tokens_seen": 36530616, + "step": 18669 + }, + { + "epoch": 2.4744864148442676, + "grad_norm": 7.7717742919921875, + "learning_rate": 3.6942049465264585e-07, + "loss": 0.0878, + "num_input_tokens_seen": 36532472, + "step": 18670 + }, + { + "epoch": 2.4746189529489726, + "grad_norm": 1.8606345653533936, + "learning_rate": 3.6923890532163915e-07, + "loss": 0.0053, + "num_input_tokens_seen": 36534336, + "step": 18671 + }, + { + "epoch": 2.474751491053678, + "grad_norm": 4.085430145263672, + "learning_rate": 3.690573570731765e-07, + "loss": 0.0201, + "num_input_tokens_seen": 36536672, + "step": 18672 + }, + { + "epoch": 2.474884029158383, + "grad_norm": 6.766450881958008, + "learning_rate": 3.6887584991075723e-07, + "loss": 0.0645, + "num_input_tokens_seen": 36539088, + "step": 18673 + }, + { + "epoch": 2.475016567263088, + "grad_norm": 0.13217997550964355, + "learning_rate": 3.686943838378823e-07, + "loss": 0.0004, + "num_input_tokens_seen": 36541056, + "step": 18674 + }, + { + "epoch": 2.475149105367793, + "grad_norm": 8.302579879760742, + "learning_rate": 3.6851295885804966e-07, + "loss": 0.0475, + "num_input_tokens_seen": 36543840, + "step": 18675 + }, + { + "epoch": 2.4752816434724982, + "grad_norm": 8.71478271484375, + "learning_rate": 3.683315749747579e-07, + "loss": 0.0768, + "num_input_tokens_seen": 36546128, + "step": 18676 + }, + { + "epoch": 2.4754141815772033, + "grad_norm": 0.040390629321336746, + "learning_rate": 3.6815023219150427e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36547640, + "step": 18677 + }, + { + "epoch": 2.4755467196819083, + "grad_norm": 1.09035062789917, + "learning_rate": 3.679689305117845e-07, + "loss": 0.005, + "num_input_tokens_seen": 36549896, + "step": 18678 + }, + { + "epoch": 2.475679257786614, + "grad_norm": 0.0018565859645605087, + "learning_rate": 3.677876699390942e-07, + "loss": 0.0, + "num_input_tokens_seen": 36551808, + "step": 18679 + }, + { + "epoch": 2.475811795891319, + "grad_norm": 6.177809238433838, + "learning_rate": 3.6760645047692834e-07, + "loss": 0.021, + "num_input_tokens_seen": 36553616, + "step": 18680 + }, + { + "epoch": 2.475944333996024, + "grad_norm": 8.347784042358398, + "learning_rate": 3.674252721287819e-07, + "loss": 0.0753, + "num_input_tokens_seen": 36555704, + "step": 18681 + }, + { + "epoch": 2.476076872100729, + "grad_norm": 0.8276697993278503, + "learning_rate": 3.672441348981476e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36557544, + "step": 18682 + }, + { + "epoch": 2.476209410205434, + "grad_norm": 5.259163856506348, + "learning_rate": 3.670630387885174e-07, + "loss": 0.0656, + "num_input_tokens_seen": 36559696, + "step": 18683 + }, + { + "epoch": 2.476341948310139, + "grad_norm": 3.6759846210479736, + "learning_rate": 3.6688198380338323e-07, + "loss": 0.1345, + "num_input_tokens_seen": 36561288, + "step": 18684 + }, + { + "epoch": 2.476474486414844, + "grad_norm": 4.614109516143799, + "learning_rate": 3.667009699462357e-07, + "loss": 0.0182, + "num_input_tokens_seen": 36563024, + "step": 18685 + }, + { + "epoch": 2.4766070245195495, + "grad_norm": 0.4406585991382599, + "learning_rate": 3.665199972205649e-07, + "loss": 0.0026, + "num_input_tokens_seen": 36565264, + "step": 18686 + }, + { + "epoch": 2.4767395626242545, + "grad_norm": 0.03262486308813095, + "learning_rate": 3.66339065629861e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36566656, + "step": 18687 + }, + { + "epoch": 2.4768721007289596, + "grad_norm": 7.582146644592285, + "learning_rate": 3.6615817517761194e-07, + "loss": 0.1146, + "num_input_tokens_seen": 36569128, + "step": 18688 + }, + { + "epoch": 2.4770046388336646, + "grad_norm": 3.941302537918091, + "learning_rate": 3.6597732586730523e-07, + "loss": 0.0289, + "num_input_tokens_seen": 36571104, + "step": 18689 + }, + { + "epoch": 2.4771371769383697, + "grad_norm": 0.0013158038491383195, + "learning_rate": 3.6579651770242724e-07, + "loss": 0.0, + "num_input_tokens_seen": 36573192, + "step": 18690 + }, + { + "epoch": 2.4772697150430747, + "grad_norm": 0.001474212622269988, + "learning_rate": 3.6561575068646467e-07, + "loss": 0.0, + "num_input_tokens_seen": 36574432, + "step": 18691 + }, + { + "epoch": 2.47740225314778, + "grad_norm": 15.631548881530762, + "learning_rate": 3.6543502482290354e-07, + "loss": 0.0682, + "num_input_tokens_seen": 36576440, + "step": 18692 + }, + { + "epoch": 2.477534791252485, + "grad_norm": 0.014167402870953083, + "learning_rate": 3.652543401152278e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36578328, + "step": 18693 + }, + { + "epoch": 2.4776673293571903, + "grad_norm": 4.524556636810303, + "learning_rate": 3.6507369656692104e-07, + "loss": 0.0408, + "num_input_tokens_seen": 36580320, + "step": 18694 + }, + { + "epoch": 2.4777998674618953, + "grad_norm": 3.2678520679473877, + "learning_rate": 3.6489309418146546e-07, + "loss": 0.0125, + "num_input_tokens_seen": 36582504, + "step": 18695 + }, + { + "epoch": 2.4779324055666003, + "grad_norm": 0.03931229189038277, + "learning_rate": 3.6471253296234464e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36584416, + "step": 18696 + }, + { + "epoch": 2.4780649436713054, + "grad_norm": 4.788264274597168, + "learning_rate": 3.645320129130386e-07, + "loss": 0.0992, + "num_input_tokens_seen": 36587544, + "step": 18697 + }, + { + "epoch": 2.478197481776011, + "grad_norm": 0.9232210516929626, + "learning_rate": 3.643515340370293e-07, + "loss": 0.0044, + "num_input_tokens_seen": 36589184, + "step": 18698 + }, + { + "epoch": 2.478330019880716, + "grad_norm": 7.604608535766602, + "learning_rate": 3.6417109633779564e-07, + "loss": 0.1169, + "num_input_tokens_seen": 36591768, + "step": 18699 + }, + { + "epoch": 2.478462557985421, + "grad_norm": 9.31635570526123, + "learning_rate": 3.639906998188167e-07, + "loss": 0.1002, + "num_input_tokens_seen": 36593832, + "step": 18700 + }, + { + "epoch": 2.478595096090126, + "grad_norm": 6.626319885253906, + "learning_rate": 3.6381034448356974e-07, + "loss": 0.0567, + "num_input_tokens_seen": 36596096, + "step": 18701 + }, + { + "epoch": 2.478727634194831, + "grad_norm": 6.331774711608887, + "learning_rate": 3.636300303355339e-07, + "loss": 0.1068, + "num_input_tokens_seen": 36597832, + "step": 18702 + }, + { + "epoch": 2.478860172299536, + "grad_norm": 0.019286390393972397, + "learning_rate": 3.6344975737818423e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36599816, + "step": 18703 + }, + { + "epoch": 2.478992710404241, + "grad_norm": 0.4375932514667511, + "learning_rate": 3.632695256149979e-07, + "loss": 0.001, + "num_input_tokens_seen": 36602376, + "step": 18704 + }, + { + "epoch": 2.4791252485089466, + "grad_norm": 10.119908332824707, + "learning_rate": 3.630893350494491e-07, + "loss": 0.0209, + "num_input_tokens_seen": 36605496, + "step": 18705 + }, + { + "epoch": 2.4792577866136516, + "grad_norm": 0.007768499664962292, + "learning_rate": 3.6290918568501145e-07, + "loss": 0.0, + "num_input_tokens_seen": 36606824, + "step": 18706 + }, + { + "epoch": 2.4793903247183566, + "grad_norm": 3.785700559616089, + "learning_rate": 3.6272907752515964e-07, + "loss": 0.0212, + "num_input_tokens_seen": 36609344, + "step": 18707 + }, + { + "epoch": 2.4795228628230617, + "grad_norm": 0.279410183429718, + "learning_rate": 3.625490105733653e-07, + "loss": 0.0008, + "num_input_tokens_seen": 36610656, + "step": 18708 + }, + { + "epoch": 2.4796554009277667, + "grad_norm": 0.046907536685466766, + "learning_rate": 3.6236898483310053e-07, + "loss": 0.0003, + "num_input_tokens_seen": 36612152, + "step": 18709 + }, + { + "epoch": 2.4797879390324717, + "grad_norm": 12.53381633758545, + "learning_rate": 3.6218900030783683e-07, + "loss": 0.0743, + "num_input_tokens_seen": 36614120, + "step": 18710 + }, + { + "epoch": 2.479920477137177, + "grad_norm": 3.6469523906707764, + "learning_rate": 3.6200905700104314e-07, + "loss": 0.0093, + "num_input_tokens_seen": 36617576, + "step": 18711 + }, + { + "epoch": 2.4800530152418823, + "grad_norm": 0.046498291194438934, + "learning_rate": 3.6182915491619065e-07, + "loss": 0.0003, + "num_input_tokens_seen": 36619224, + "step": 18712 + }, + { + "epoch": 2.4801855533465873, + "grad_norm": 0.0005093045765534043, + "learning_rate": 3.61649294056747e-07, + "loss": 0.0, + "num_input_tokens_seen": 36620800, + "step": 18713 + }, + { + "epoch": 2.4803180914512923, + "grad_norm": 3.954202175140381, + "learning_rate": 3.6146947442618015e-07, + "loss": 0.0523, + "num_input_tokens_seen": 36622168, + "step": 18714 + }, + { + "epoch": 2.4804506295559974, + "grad_norm": 9.797955513000488, + "learning_rate": 3.612896960279566e-07, + "loss": 0.0539, + "num_input_tokens_seen": 36624200, + "step": 18715 + }, + { + "epoch": 2.4805831676607024, + "grad_norm": 0.003079358022660017, + "learning_rate": 3.6110995886554283e-07, + "loss": 0.0, + "num_input_tokens_seen": 36626352, + "step": 18716 + }, + { + "epoch": 2.4807157057654075, + "grad_norm": 3.5536882877349854, + "learning_rate": 3.609302629424055e-07, + "loss": 0.041, + "num_input_tokens_seen": 36628224, + "step": 18717 + }, + { + "epoch": 2.4808482438701125, + "grad_norm": 0.0454619862139225, + "learning_rate": 3.6075060826200823e-07, + "loss": 0.0003, + "num_input_tokens_seen": 36631472, + "step": 18718 + }, + { + "epoch": 2.480980781974818, + "grad_norm": 6.280172824859619, + "learning_rate": 3.605709948278152e-07, + "loss": 0.1026, + "num_input_tokens_seen": 36633048, + "step": 18719 + }, + { + "epoch": 2.481113320079523, + "grad_norm": 3.46936297416687, + "learning_rate": 3.6039142264328863e-07, + "loss": 0.0161, + "num_input_tokens_seen": 36634752, + "step": 18720 + }, + { + "epoch": 2.481245858184228, + "grad_norm": 0.0014698642771691084, + "learning_rate": 3.6021189171189133e-07, + "loss": 0.0, + "num_input_tokens_seen": 36635760, + "step": 18721 + }, + { + "epoch": 2.481378396288933, + "grad_norm": 7.820882320404053, + "learning_rate": 3.600324020370857e-07, + "loss": 0.0591, + "num_input_tokens_seen": 36637720, + "step": 18722 + }, + { + "epoch": 2.481510934393638, + "grad_norm": 2.352278470993042, + "learning_rate": 3.598529536223316e-07, + "loss": 0.0374, + "num_input_tokens_seen": 36639440, + "step": 18723 + }, + { + "epoch": 2.481643472498343, + "grad_norm": 1.7555536031723022, + "learning_rate": 3.5967354647108875e-07, + "loss": 0.0045, + "num_input_tokens_seen": 36641040, + "step": 18724 + }, + { + "epoch": 2.481776010603048, + "grad_norm": 1.144910454750061, + "learning_rate": 3.594941805868168e-07, + "loss": 0.0059, + "num_input_tokens_seen": 36642944, + "step": 18725 + }, + { + "epoch": 2.4819085487077537, + "grad_norm": 10.142992973327637, + "learning_rate": 3.593148559729728e-07, + "loss": 0.1221, + "num_input_tokens_seen": 36645096, + "step": 18726 + }, + { + "epoch": 2.4820410868124587, + "grad_norm": 3.3010411262512207, + "learning_rate": 3.5913557263301536e-07, + "loss": 0.0031, + "num_input_tokens_seen": 36647144, + "step": 18727 + }, + { + "epoch": 2.4821736249171638, + "grad_norm": 3.5845608711242676, + "learning_rate": 3.5895633057040163e-07, + "loss": 0.0074, + "num_input_tokens_seen": 36648704, + "step": 18728 + }, + { + "epoch": 2.482306163021869, + "grad_norm": 1.3885129690170288, + "learning_rate": 3.587771297885867e-07, + "loss": 0.0082, + "num_input_tokens_seen": 36650696, + "step": 18729 + }, + { + "epoch": 2.482438701126574, + "grad_norm": 2.4023869037628174, + "learning_rate": 3.585979702910261e-07, + "loss": 0.0124, + "num_input_tokens_seen": 36653528, + "step": 18730 + }, + { + "epoch": 2.482571239231279, + "grad_norm": 0.0190268587321043, + "learning_rate": 3.584188520811738e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36657168, + "step": 18731 + }, + { + "epoch": 2.482703777335984, + "grad_norm": 0.004037338308990002, + "learning_rate": 3.5823977516248277e-07, + "loss": 0.0, + "num_input_tokens_seen": 36659600, + "step": 18732 + }, + { + "epoch": 2.4828363154406894, + "grad_norm": 0.008879962377250195, + "learning_rate": 3.5806073953840643e-07, + "loss": 0.0, + "num_input_tokens_seen": 36662272, + "step": 18733 + }, + { + "epoch": 2.4829688535453944, + "grad_norm": 8.757277488708496, + "learning_rate": 3.5788174521239727e-07, + "loss": 0.2213, + "num_input_tokens_seen": 36663952, + "step": 18734 + }, + { + "epoch": 2.4831013916500995, + "grad_norm": 0.027300016954541206, + "learning_rate": 3.577027921879059e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36666472, + "step": 18735 + }, + { + "epoch": 2.4832339297548045, + "grad_norm": 0.3722897171974182, + "learning_rate": 3.5752388046838225e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36667872, + "step": 18736 + }, + { + "epoch": 2.4833664678595095, + "grad_norm": 7.305905342102051, + "learning_rate": 3.573450100572759e-07, + "loss": 0.0729, + "num_input_tokens_seen": 36670896, + "step": 18737 + }, + { + "epoch": 2.4834990059642146, + "grad_norm": 0.17900992929935455, + "learning_rate": 3.5716618095803654e-07, + "loss": 0.001, + "num_input_tokens_seen": 36673104, + "step": 18738 + }, + { + "epoch": 2.4836315440689196, + "grad_norm": 0.029045643284916878, + "learning_rate": 3.569873931741108e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36674896, + "step": 18739 + }, + { + "epoch": 2.483764082173625, + "grad_norm": 1.4598139524459839, + "learning_rate": 3.568086467089471e-07, + "loss": 0.0114, + "num_input_tokens_seen": 36677144, + "step": 18740 + }, + { + "epoch": 2.48389662027833, + "grad_norm": 1.2496943473815918, + "learning_rate": 3.566299415659913e-07, + "loss": 0.0026, + "num_input_tokens_seen": 36678672, + "step": 18741 + }, + { + "epoch": 2.484029158383035, + "grad_norm": 10.979873657226562, + "learning_rate": 3.564512777486881e-07, + "loss": 0.0962, + "num_input_tokens_seen": 36680784, + "step": 18742 + }, + { + "epoch": 2.48416169648774, + "grad_norm": 0.30730950832366943, + "learning_rate": 3.562726552604839e-07, + "loss": 0.0012, + "num_input_tokens_seen": 36682432, + "step": 18743 + }, + { + "epoch": 2.4842942345924452, + "grad_norm": 18.25986671447754, + "learning_rate": 3.5609407410482103e-07, + "loss": 0.0487, + "num_input_tokens_seen": 36684272, + "step": 18744 + }, + { + "epoch": 2.4844267726971503, + "grad_norm": 1.9328157901763916, + "learning_rate": 3.5591553428514393e-07, + "loss": 0.0043, + "num_input_tokens_seen": 36686232, + "step": 18745 + }, + { + "epoch": 2.4845593108018553, + "grad_norm": 0.07898975163698196, + "learning_rate": 3.557370358048945e-07, + "loss": 0.0007, + "num_input_tokens_seen": 36689208, + "step": 18746 + }, + { + "epoch": 2.484691848906561, + "grad_norm": 0.24928021430969238, + "learning_rate": 3.5555857866751365e-07, + "loss": 0.0011, + "num_input_tokens_seen": 36692160, + "step": 18747 + }, + { + "epoch": 2.484824387011266, + "grad_norm": 19.410900115966797, + "learning_rate": 3.553801628764433e-07, + "loss": 0.3261, + "num_input_tokens_seen": 36694992, + "step": 18748 + }, + { + "epoch": 2.484956925115971, + "grad_norm": 0.05703128129243851, + "learning_rate": 3.5520178843512303e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36697280, + "step": 18749 + }, + { + "epoch": 2.485089463220676, + "grad_norm": 0.06111735850572586, + "learning_rate": 3.550234553469914e-07, + "loss": 0.0004, + "num_input_tokens_seen": 36698976, + "step": 18750 + }, + { + "epoch": 2.485222001325381, + "grad_norm": 0.0015816851519048214, + "learning_rate": 3.5484516361548787e-07, + "loss": 0.0, + "num_input_tokens_seen": 36700176, + "step": 18751 + }, + { + "epoch": 2.485354539430086, + "grad_norm": 17.781381607055664, + "learning_rate": 3.546669132440486e-07, + "loss": 0.1251, + "num_input_tokens_seen": 36702032, + "step": 18752 + }, + { + "epoch": 2.485487077534791, + "grad_norm": 5.4291486740112305, + "learning_rate": 3.5448870423611197e-07, + "loss": 0.0746, + "num_input_tokens_seen": 36704320, + "step": 18753 + }, + { + "epoch": 2.4856196156394965, + "grad_norm": 1.171979308128357, + "learning_rate": 3.5431053659511327e-07, + "loss": 0.008, + "num_input_tokens_seen": 36706648, + "step": 18754 + }, + { + "epoch": 2.4857521537442016, + "grad_norm": 0.03141823038458824, + "learning_rate": 3.541324103244878e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36707816, + "step": 18755 + }, + { + "epoch": 2.4858846918489066, + "grad_norm": 0.00999024510383606, + "learning_rate": 3.5395432542766893e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36709768, + "step": 18756 + }, + { + "epoch": 2.4860172299536116, + "grad_norm": 22.54648208618164, + "learning_rate": 3.53776281908092e-07, + "loss": 0.0932, + "num_input_tokens_seen": 36711208, + "step": 18757 + }, + { + "epoch": 2.4861497680583167, + "grad_norm": 0.007172875106334686, + "learning_rate": 3.5359827976918807e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36712744, + "step": 18758 + }, + { + "epoch": 2.4862823061630217, + "grad_norm": 6.4906415939331055, + "learning_rate": 3.5342031901439087e-07, + "loss": 0.0776, + "num_input_tokens_seen": 36715664, + "step": 18759 + }, + { + "epoch": 2.4864148442677267, + "grad_norm": 0.026268085464835167, + "learning_rate": 3.532423996471307e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36718848, + "step": 18760 + }, + { + "epoch": 2.4865473823724322, + "grad_norm": 10.058436393737793, + "learning_rate": 3.530645216708381e-07, + "loss": 0.1365, + "num_input_tokens_seen": 36721272, + "step": 18761 + }, + { + "epoch": 2.4866799204771373, + "grad_norm": 3.5997114181518555, + "learning_rate": 3.5288668508894203e-07, + "loss": 0.0196, + "num_input_tokens_seen": 36723568, + "step": 18762 + }, + { + "epoch": 2.4868124585818423, + "grad_norm": 5.4899797439575195, + "learning_rate": 3.5270888990487167e-07, + "loss": 0.0488, + "num_input_tokens_seen": 36726104, + "step": 18763 + }, + { + "epoch": 2.4869449966865473, + "grad_norm": 0.05004940554499626, + "learning_rate": 3.5253113612205595e-07, + "loss": 0.0003, + "num_input_tokens_seen": 36728240, + "step": 18764 + }, + { + "epoch": 2.4870775347912524, + "grad_norm": 0.09019999951124191, + "learning_rate": 3.523534237439216e-07, + "loss": 0.0006, + "num_input_tokens_seen": 36731072, + "step": 18765 + }, + { + "epoch": 2.4872100728959574, + "grad_norm": 2.283766984939575, + "learning_rate": 3.5217575277389474e-07, + "loss": 0.0606, + "num_input_tokens_seen": 36734040, + "step": 18766 + }, + { + "epoch": 2.4873426110006625, + "grad_norm": 2.919184684753418, + "learning_rate": 3.519981232154004e-07, + "loss": 0.0152, + "num_input_tokens_seen": 36736088, + "step": 18767 + }, + { + "epoch": 2.487475149105368, + "grad_norm": 1.461369514465332, + "learning_rate": 3.518205350718645e-07, + "loss": 0.0094, + "num_input_tokens_seen": 36737736, + "step": 18768 + }, + { + "epoch": 2.487607687210073, + "grad_norm": 0.05060702934861183, + "learning_rate": 3.516429883467104e-07, + "loss": 0.0003, + "num_input_tokens_seen": 36739496, + "step": 18769 + }, + { + "epoch": 2.487740225314778, + "grad_norm": 5.8037004470825195, + "learning_rate": 3.51465483043362e-07, + "loss": 0.025, + "num_input_tokens_seen": 36741120, + "step": 18770 + }, + { + "epoch": 2.487872763419483, + "grad_norm": 5.818715572357178, + "learning_rate": 3.5128801916524123e-07, + "loss": 0.0746, + "num_input_tokens_seen": 36743888, + "step": 18771 + }, + { + "epoch": 2.488005301524188, + "grad_norm": 0.043172262609004974, + "learning_rate": 3.5111059671576964e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36745448, + "step": 18772 + }, + { + "epoch": 2.488137839628893, + "grad_norm": 5.840713977813721, + "learning_rate": 3.5093321569836776e-07, + "loss": 0.114, + "num_input_tokens_seen": 36747528, + "step": 18773 + }, + { + "epoch": 2.488270377733598, + "grad_norm": 0.13453559577465057, + "learning_rate": 3.507558761164562e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36748832, + "step": 18774 + }, + { + "epoch": 2.4884029158383036, + "grad_norm": 1.9982856512069702, + "learning_rate": 3.505785779734544e-07, + "loss": 0.007, + "num_input_tokens_seen": 36750544, + "step": 18775 + }, + { + "epoch": 2.4885354539430087, + "grad_norm": 0.016761427745223045, + "learning_rate": 3.5040132127278054e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36752064, + "step": 18776 + }, + { + "epoch": 2.4886679920477137, + "grad_norm": 0.32652461528778076, + "learning_rate": 3.5022410601785184e-07, + "loss": 0.001, + "num_input_tokens_seen": 36753928, + "step": 18777 + }, + { + "epoch": 2.4888005301524188, + "grad_norm": 0.7855021953582764, + "learning_rate": 3.5004693221208527e-07, + "loss": 0.0155, + "num_input_tokens_seen": 36756792, + "step": 18778 + }, + { + "epoch": 2.488933068257124, + "grad_norm": 0.022242972627282143, + "learning_rate": 3.4986979985889735e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36758656, + "step": 18779 + }, + { + "epoch": 2.4890656063618293, + "grad_norm": 3.048884630203247, + "learning_rate": 3.496927089617025e-07, + "loss": 0.0207, + "num_input_tokens_seen": 36761224, + "step": 18780 + }, + { + "epoch": 2.4891981444665343, + "grad_norm": 0.005539495497941971, + "learning_rate": 3.495156595239163e-07, + "loss": 0.0, + "num_input_tokens_seen": 36762448, + "step": 18781 + }, + { + "epoch": 2.4893306825712394, + "grad_norm": 0.008072884753346443, + "learning_rate": 3.4933865154895163e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36764176, + "step": 18782 + }, + { + "epoch": 2.4894632206759444, + "grad_norm": 4.643518447875977, + "learning_rate": 3.4916168504022073e-07, + "loss": 0.0816, + "num_input_tokens_seen": 36765904, + "step": 18783 + }, + { + "epoch": 2.4895957587806494, + "grad_norm": 7.051122665405273, + "learning_rate": 3.4898476000113704e-07, + "loss": 0.0417, + "num_input_tokens_seen": 36768200, + "step": 18784 + }, + { + "epoch": 2.4897282968853545, + "grad_norm": 5.871222496032715, + "learning_rate": 3.4880787643511107e-07, + "loss": 0.0408, + "num_input_tokens_seen": 36769920, + "step": 18785 + }, + { + "epoch": 2.4898608349900595, + "grad_norm": 0.01995050720870495, + "learning_rate": 3.486310343455526e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36771152, + "step": 18786 + }, + { + "epoch": 2.489993373094765, + "grad_norm": 5.006399631500244, + "learning_rate": 3.4845423373587287e-07, + "loss": 0.0469, + "num_input_tokens_seen": 36773144, + "step": 18787 + }, + { + "epoch": 2.49012591119947, + "grad_norm": 0.0025954325683414936, + "learning_rate": 3.482774746094794e-07, + "loss": 0.0, + "num_input_tokens_seen": 36775072, + "step": 18788 + }, + { + "epoch": 2.490258449304175, + "grad_norm": 10.669304847717285, + "learning_rate": 3.481007569697803e-07, + "loss": 0.0812, + "num_input_tokens_seen": 36777128, + "step": 18789 + }, + { + "epoch": 2.49039098740888, + "grad_norm": 0.029443757608532906, + "learning_rate": 3.4792408082018336e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36778744, + "step": 18790 + }, + { + "epoch": 2.490523525513585, + "grad_norm": 0.03344215825200081, + "learning_rate": 3.477474461640951e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36780520, + "step": 18791 + }, + { + "epoch": 2.49065606361829, + "grad_norm": 0.032965466380119324, + "learning_rate": 3.4757085300492016e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36781896, + "step": 18792 + }, + { + "epoch": 2.490788601722995, + "grad_norm": 0.0040173460729420185, + "learning_rate": 3.4739430134606455e-07, + "loss": 0.0, + "num_input_tokens_seen": 36783720, + "step": 18793 + }, + { + "epoch": 2.4909211398277007, + "grad_norm": 0.001309259096160531, + "learning_rate": 3.4721779119093133e-07, + "loss": 0.0, + "num_input_tokens_seen": 36785392, + "step": 18794 + }, + { + "epoch": 2.4910536779324057, + "grad_norm": 0.36717554926872253, + "learning_rate": 3.4704132254292503e-07, + "loss": 0.0025, + "num_input_tokens_seen": 36787064, + "step": 18795 + }, + { + "epoch": 2.4911862160371108, + "grad_norm": 5.448441982269287, + "learning_rate": 3.468648954054468e-07, + "loss": 0.0139, + "num_input_tokens_seen": 36788808, + "step": 18796 + }, + { + "epoch": 2.491318754141816, + "grad_norm": 0.44331854581832886, + "learning_rate": 3.466885097818987e-07, + "loss": 0.0019, + "num_input_tokens_seen": 36790104, + "step": 18797 + }, + { + "epoch": 2.491451292246521, + "grad_norm": 8.136528968811035, + "learning_rate": 3.4651216567568174e-07, + "loss": 0.0569, + "num_input_tokens_seen": 36793560, + "step": 18798 + }, + { + "epoch": 2.491583830351226, + "grad_norm": 1.0523157119750977, + "learning_rate": 3.4633586309019555e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36794840, + "step": 18799 + }, + { + "epoch": 2.491716368455931, + "grad_norm": 0.006942837964743376, + "learning_rate": 3.461596020288402e-07, + "loss": 0.0, + "num_input_tokens_seen": 36796552, + "step": 18800 + }, + { + "epoch": 2.4918489065606364, + "grad_norm": 0.05755011737346649, + "learning_rate": 3.4598338249501343e-07, + "loss": 0.0003, + "num_input_tokens_seen": 36798264, + "step": 18801 + }, + { + "epoch": 2.4919814446653414, + "grad_norm": 0.008022606372833252, + "learning_rate": 3.458072044921132e-07, + "loss": 0.0, + "num_input_tokens_seen": 36800528, + "step": 18802 + }, + { + "epoch": 2.4921139827700465, + "grad_norm": 11.006196975708008, + "learning_rate": 3.4563106802353557e-07, + "loss": 0.1532, + "num_input_tokens_seen": 36802488, + "step": 18803 + }, + { + "epoch": 2.4922465208747515, + "grad_norm": 0.010539903305470943, + "learning_rate": 3.4545497309267705e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36803480, + "step": 18804 + }, + { + "epoch": 2.4923790589794566, + "grad_norm": 0.006182404700666666, + "learning_rate": 3.4527891970293356e-07, + "loss": 0.0, + "num_input_tokens_seen": 36804576, + "step": 18805 + }, + { + "epoch": 2.4925115970841616, + "grad_norm": 8.655243873596191, + "learning_rate": 3.4510290785769894e-07, + "loss": 0.0877, + "num_input_tokens_seen": 36806648, + "step": 18806 + }, + { + "epoch": 2.4926441351888666, + "grad_norm": 0.002805377123877406, + "learning_rate": 3.44926937560367e-07, + "loss": 0.0, + "num_input_tokens_seen": 36808416, + "step": 18807 + }, + { + "epoch": 2.492776673293572, + "grad_norm": 0.0010355893755331635, + "learning_rate": 3.447510088143305e-07, + "loss": 0.0, + "num_input_tokens_seen": 36809592, + "step": 18808 + }, + { + "epoch": 2.492909211398277, + "grad_norm": 3.04388165473938, + "learning_rate": 3.445751216229806e-07, + "loss": 0.0164, + "num_input_tokens_seen": 36811328, + "step": 18809 + }, + { + "epoch": 2.493041749502982, + "grad_norm": 0.25169745087623596, + "learning_rate": 3.443992759897094e-07, + "loss": 0.0008, + "num_input_tokens_seen": 36812816, + "step": 18810 + }, + { + "epoch": 2.4931742876076872, + "grad_norm": 5.490306854248047, + "learning_rate": 3.4422347191790773e-07, + "loss": 0.0293, + "num_input_tokens_seen": 36815176, + "step": 18811 + }, + { + "epoch": 2.4933068257123923, + "grad_norm": 0.026315122842788696, + "learning_rate": 3.4404770941096484e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36817016, + "step": 18812 + }, + { + "epoch": 2.4934393638170973, + "grad_norm": 0.003766394918784499, + "learning_rate": 3.4387198847226913e-07, + "loss": 0.0, + "num_input_tokens_seen": 36818464, + "step": 18813 + }, + { + "epoch": 2.4935719019218023, + "grad_norm": 5.842089653015137, + "learning_rate": 3.436963091052092e-07, + "loss": 0.0533, + "num_input_tokens_seen": 36820360, + "step": 18814 + }, + { + "epoch": 2.493704440026508, + "grad_norm": 0.220312237739563, + "learning_rate": 3.435206713131714e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36822632, + "step": 18815 + }, + { + "epoch": 2.493836978131213, + "grad_norm": 0.06877513974905014, + "learning_rate": 3.433450750995426e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36823712, + "step": 18816 + }, + { + "epoch": 2.493969516235918, + "grad_norm": 14.457984924316406, + "learning_rate": 3.4316952046770933e-07, + "loss": 0.1503, + "num_input_tokens_seen": 36826048, + "step": 18817 + }, + { + "epoch": 2.494102054340623, + "grad_norm": 0.01403570082038641, + "learning_rate": 3.4299400742105553e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36828288, + "step": 18818 + }, + { + "epoch": 2.494234592445328, + "grad_norm": 3.67305588722229, + "learning_rate": 3.4281853596296516e-07, + "loss": 0.0304, + "num_input_tokens_seen": 36829888, + "step": 18819 + }, + { + "epoch": 2.494367130550033, + "grad_norm": 2.767547130584717, + "learning_rate": 3.4264310609682105e-07, + "loss": 0.0106, + "num_input_tokens_seen": 36831680, + "step": 18820 + }, + { + "epoch": 2.494499668654738, + "grad_norm": 18.199670791625977, + "learning_rate": 3.4246771782600636e-07, + "loss": 0.0944, + "num_input_tokens_seen": 36833872, + "step": 18821 + }, + { + "epoch": 2.4946322067594435, + "grad_norm": 0.021108925342559814, + "learning_rate": 3.422923711539028e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36836088, + "step": 18822 + }, + { + "epoch": 2.4947647448641486, + "grad_norm": 0.003045568708330393, + "learning_rate": 3.42117066083891e-07, + "loss": 0.0, + "num_input_tokens_seen": 36837384, + "step": 18823 + }, + { + "epoch": 2.4948972829688536, + "grad_norm": 2.9254539012908936, + "learning_rate": 3.419418026193508e-07, + "loss": 0.0091, + "num_input_tokens_seen": 36840648, + "step": 18824 + }, + { + "epoch": 2.4950298210735586, + "grad_norm": 9.042760848999023, + "learning_rate": 3.417665807636608e-07, + "loss": 0.1737, + "num_input_tokens_seen": 36842200, + "step": 18825 + }, + { + "epoch": 2.4951623591782637, + "grad_norm": 2.5750064849853516, + "learning_rate": 3.415914005202006e-07, + "loss": 0.0216, + "num_input_tokens_seen": 36844376, + "step": 18826 + }, + { + "epoch": 2.4952948972829687, + "grad_norm": 11.376178741455078, + "learning_rate": 3.4141626189234665e-07, + "loss": 0.0333, + "num_input_tokens_seen": 36846504, + "step": 18827 + }, + { + "epoch": 2.4954274353876738, + "grad_norm": 9.08418083190918, + "learning_rate": 3.412411648834768e-07, + "loss": 0.1164, + "num_input_tokens_seen": 36848616, + "step": 18828 + }, + { + "epoch": 2.4955599734923792, + "grad_norm": 0.030434440821409225, + "learning_rate": 3.4106610949696666e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36850224, + "step": 18829 + }, + { + "epoch": 2.4956925115970843, + "grad_norm": 0.007946821860969067, + "learning_rate": 3.408910957361908e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36851488, + "step": 18830 + }, + { + "epoch": 2.4958250497017893, + "grad_norm": 0.0014632802922278643, + "learning_rate": 3.407161236045245e-07, + "loss": 0.0, + "num_input_tokens_seen": 36852720, + "step": 18831 + }, + { + "epoch": 2.4959575878064943, + "grad_norm": 9.387550354003906, + "learning_rate": 3.405411931053409e-07, + "loss": 0.0508, + "num_input_tokens_seen": 36854920, + "step": 18832 + }, + { + "epoch": 2.4960901259111994, + "grad_norm": 6.4261474609375, + "learning_rate": 3.4036630424201246e-07, + "loss": 0.0529, + "num_input_tokens_seen": 36856448, + "step": 18833 + }, + { + "epoch": 2.4962226640159044, + "grad_norm": 14.683920860290527, + "learning_rate": 3.4019145701791186e-07, + "loss": 0.049, + "num_input_tokens_seen": 36858048, + "step": 18834 + }, + { + "epoch": 2.4963552021206095, + "grad_norm": 10.79285717010498, + "learning_rate": 3.400166514364095e-07, + "loss": 0.1052, + "num_input_tokens_seen": 36861432, + "step": 18835 + }, + { + "epoch": 2.496487740225315, + "grad_norm": 0.09410117566585541, + "learning_rate": 3.3984188750087686e-07, + "loss": 0.0006, + "num_input_tokens_seen": 36863280, + "step": 18836 + }, + { + "epoch": 2.49662027833002, + "grad_norm": 14.954526901245117, + "learning_rate": 3.396671652146827e-07, + "loss": 0.4126, + "num_input_tokens_seen": 36864784, + "step": 18837 + }, + { + "epoch": 2.496752816434725, + "grad_norm": 4.765635013580322, + "learning_rate": 3.394924845811959e-07, + "loss": 0.0287, + "num_input_tokens_seen": 36868056, + "step": 18838 + }, + { + "epoch": 2.49688535453943, + "grad_norm": 0.38782310485839844, + "learning_rate": 3.3931784560378405e-07, + "loss": 0.0011, + "num_input_tokens_seen": 36869664, + "step": 18839 + }, + { + "epoch": 2.497017892644135, + "grad_norm": 5.310091018676758, + "learning_rate": 3.391432482858148e-07, + "loss": 0.1737, + "num_input_tokens_seen": 36872256, + "step": 18840 + }, + { + "epoch": 2.49715043074884, + "grad_norm": 10.568957328796387, + "learning_rate": 3.389686926306548e-07, + "loss": 0.1863, + "num_input_tokens_seen": 36874584, + "step": 18841 + }, + { + "epoch": 2.497282968853545, + "grad_norm": 0.9207376837730408, + "learning_rate": 3.387941786416693e-07, + "loss": 0.0044, + "num_input_tokens_seen": 36876992, + "step": 18842 + }, + { + "epoch": 2.4974155069582507, + "grad_norm": 0.5306827425956726, + "learning_rate": 3.386197063222227e-07, + "loss": 0.0034, + "num_input_tokens_seen": 36880272, + "step": 18843 + }, + { + "epoch": 2.4975480450629557, + "grad_norm": 12.747725486755371, + "learning_rate": 3.3844527567567953e-07, + "loss": 0.289, + "num_input_tokens_seen": 36882120, + "step": 18844 + }, + { + "epoch": 2.4976805831676607, + "grad_norm": 14.568350791931152, + "learning_rate": 3.3827088670540217e-07, + "loss": 0.3602, + "num_input_tokens_seen": 36884232, + "step": 18845 + }, + { + "epoch": 2.4978131212723658, + "grad_norm": 0.28530025482177734, + "learning_rate": 3.380965394147531e-07, + "loss": 0.0012, + "num_input_tokens_seen": 36886544, + "step": 18846 + }, + { + "epoch": 2.497945659377071, + "grad_norm": 5.931352615356445, + "learning_rate": 3.3792223380709476e-07, + "loss": 0.033, + "num_input_tokens_seen": 36889152, + "step": 18847 + }, + { + "epoch": 2.498078197481776, + "grad_norm": 5.934521198272705, + "learning_rate": 3.3774796988578747e-07, + "loss": 0.0848, + "num_input_tokens_seen": 36891440, + "step": 18848 + }, + { + "epoch": 2.498210735586481, + "grad_norm": 4.612947940826416, + "learning_rate": 3.37573747654191e-07, + "loss": 0.0602, + "num_input_tokens_seen": 36892960, + "step": 18849 + }, + { + "epoch": 2.4983432736911864, + "grad_norm": 0.02554892562329769, + "learning_rate": 3.373995671156638e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36894352, + "step": 18850 + }, + { + "epoch": 2.4984758117958914, + "grad_norm": 1.5647541284561157, + "learning_rate": 3.3722542827356485e-07, + "loss": 0.0126, + "num_input_tokens_seen": 36897216, + "step": 18851 + }, + { + "epoch": 2.4986083499005964, + "grad_norm": 5.874993324279785, + "learning_rate": 3.3705133113125207e-07, + "loss": 0.0533, + "num_input_tokens_seen": 36898960, + "step": 18852 + }, + { + "epoch": 2.4987408880053015, + "grad_norm": 0.021260902285575867, + "learning_rate": 3.3687727569208214e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36900696, + "step": 18853 + }, + { + "epoch": 2.4988734261100065, + "grad_norm": 7.199407577514648, + "learning_rate": 3.3670326195941015e-07, + "loss": 0.0571, + "num_input_tokens_seen": 36902208, + "step": 18854 + }, + { + "epoch": 2.4990059642147116, + "grad_norm": 1.9947681427001953, + "learning_rate": 3.365292899365921e-07, + "loss": 0.0022, + "num_input_tokens_seen": 36903480, + "step": 18855 + }, + { + "epoch": 2.4991385023194166, + "grad_norm": 0.012593426741659641, + "learning_rate": 3.3635535962698086e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36905696, + "step": 18856 + }, + { + "epoch": 2.499271040424122, + "grad_norm": 0.2400263100862503, + "learning_rate": 3.3618147103393116e-07, + "loss": 0.0006, + "num_input_tokens_seen": 36907832, + "step": 18857 + }, + { + "epoch": 2.499403578528827, + "grad_norm": 0.024175256490707397, + "learning_rate": 3.36007624160796e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36909944, + "step": 18858 + }, + { + "epoch": 2.499536116633532, + "grad_norm": 0.02062061056494713, + "learning_rate": 3.358338190109267e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36911248, + "step": 18859 + }, + { + "epoch": 2.499668654738237, + "grad_norm": 5.3736252784729, + "learning_rate": 3.3566005558767433e-07, + "loss": 0.051, + "num_input_tokens_seen": 36913296, + "step": 18860 + }, + { + "epoch": 2.499801192842942, + "grad_norm": 0.0045803021639585495, + "learning_rate": 3.354863338943887e-07, + "loss": 0.0, + "num_input_tokens_seen": 36914952, + "step": 18861 + }, + { + "epoch": 2.4999337309476473, + "grad_norm": 14.366836547851562, + "learning_rate": 3.353126539344201e-07, + "loss": 0.2001, + "num_input_tokens_seen": 36917136, + "step": 18862 + }, + { + "epoch": 2.5000662690523523, + "grad_norm": 0.017743512988090515, + "learning_rate": 3.3513901571111645e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36918592, + "step": 18863 + }, + { + "epoch": 2.500198807157058, + "grad_norm": 9.841443061828613, + "learning_rate": 3.3496541922782676e-07, + "loss": 0.283, + "num_input_tokens_seen": 36920776, + "step": 18864 + }, + { + "epoch": 2.500331345261763, + "grad_norm": 3.488628387451172, + "learning_rate": 3.3479186448789694e-07, + "loss": 0.1371, + "num_input_tokens_seen": 36923184, + "step": 18865 + }, + { + "epoch": 2.500463883366468, + "grad_norm": 0.21213752031326294, + "learning_rate": 3.3461835149467356e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36925400, + "step": 18866 + }, + { + "epoch": 2.500596421471173, + "grad_norm": 3.745331048965454, + "learning_rate": 3.3444488025150265e-07, + "loss": 0.0078, + "num_input_tokens_seen": 36926864, + "step": 18867 + }, + { + "epoch": 2.500728959575878, + "grad_norm": 11.848283767700195, + "learning_rate": 3.342714507617281e-07, + "loss": 0.1565, + "num_input_tokens_seen": 36929280, + "step": 18868 + }, + { + "epoch": 2.5008614976805834, + "grad_norm": 0.0011726762168109417, + "learning_rate": 3.3409806302869385e-07, + "loss": 0.0, + "num_input_tokens_seen": 36930592, + "step": 18869 + }, + { + "epoch": 2.500994035785288, + "grad_norm": 0.02145821414887905, + "learning_rate": 3.339247170557436e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36933128, + "step": 18870 + }, + { + "epoch": 2.5011265738899935, + "grad_norm": 0.0023923423141241074, + "learning_rate": 3.337514128462188e-07, + "loss": 0.0, + "num_input_tokens_seen": 36935896, + "step": 18871 + }, + { + "epoch": 2.5012591119946985, + "grad_norm": 5.935612201690674, + "learning_rate": 3.33578150403461e-07, + "loss": 0.0396, + "num_input_tokens_seen": 36937816, + "step": 18872 + }, + { + "epoch": 2.5013916500994036, + "grad_norm": 1.558914065361023, + "learning_rate": 3.3340492973081144e-07, + "loss": 0.0051, + "num_input_tokens_seen": 36939744, + "step": 18873 + }, + { + "epoch": 2.5015241882041086, + "grad_norm": 1.0587198734283447, + "learning_rate": 3.332317508316088e-07, + "loss": 0.0049, + "num_input_tokens_seen": 36941544, + "step": 18874 + }, + { + "epoch": 2.5016567263088136, + "grad_norm": 1.4513193368911743, + "learning_rate": 3.3305861370919347e-07, + "loss": 0.0058, + "num_input_tokens_seen": 36943024, + "step": 18875 + }, + { + "epoch": 2.501789264413519, + "grad_norm": 2.5282719135284424, + "learning_rate": 3.32885518366903e-07, + "loss": 0.0389, + "num_input_tokens_seen": 36944864, + "step": 18876 + }, + { + "epoch": 2.5019218025182237, + "grad_norm": 5.884391784667969, + "learning_rate": 3.32712464808074e-07, + "loss": 0.0463, + "num_input_tokens_seen": 36948232, + "step": 18877 + }, + { + "epoch": 2.502054340622929, + "grad_norm": 0.08367583155632019, + "learning_rate": 3.3253945303604455e-07, + "loss": 0.0003, + "num_input_tokens_seen": 36950736, + "step": 18878 + }, + { + "epoch": 2.5021868787276342, + "grad_norm": 6.371726036071777, + "learning_rate": 3.3236648305414955e-07, + "loss": 0.081, + "num_input_tokens_seen": 36953824, + "step": 18879 + }, + { + "epoch": 2.5023194168323393, + "grad_norm": 0.0019569601863622665, + "learning_rate": 3.321935548657237e-07, + "loss": 0.0, + "num_input_tokens_seen": 36955888, + "step": 18880 + }, + { + "epoch": 2.5024519549370443, + "grad_norm": 11.512117385864258, + "learning_rate": 3.320206684741023e-07, + "loss": 0.0903, + "num_input_tokens_seen": 36957776, + "step": 18881 + }, + { + "epoch": 2.5025844930417493, + "grad_norm": 0.1372195929288864, + "learning_rate": 3.3184782388261727e-07, + "loss": 0.0006, + "num_input_tokens_seen": 36959832, + "step": 18882 + }, + { + "epoch": 2.502717031146455, + "grad_norm": 5.810431003570557, + "learning_rate": 3.3167502109460236e-07, + "loss": 0.0653, + "num_input_tokens_seen": 36961672, + "step": 18883 + }, + { + "epoch": 2.50284956925116, + "grad_norm": 0.00299549987539649, + "learning_rate": 3.315022601133891e-07, + "loss": 0.0, + "num_input_tokens_seen": 36963160, + "step": 18884 + }, + { + "epoch": 2.502982107355865, + "grad_norm": 0.3385556638240814, + "learning_rate": 3.3132954094230813e-07, + "loss": 0.0016, + "num_input_tokens_seen": 36964856, + "step": 18885 + }, + { + "epoch": 2.50311464546057, + "grad_norm": 2.9573817253112793, + "learning_rate": 3.311568635846893e-07, + "loss": 0.0083, + "num_input_tokens_seen": 36967984, + "step": 18886 + }, + { + "epoch": 2.503247183565275, + "grad_norm": 0.02622293308377266, + "learning_rate": 3.3098422804386217e-07, + "loss": 0.0001, + "num_input_tokens_seen": 36970512, + "step": 18887 + }, + { + "epoch": 2.50337972166998, + "grad_norm": 0.4121561646461487, + "learning_rate": 3.30811634323156e-07, + "loss": 0.0015, + "num_input_tokens_seen": 36972368, + "step": 18888 + }, + { + "epoch": 2.503512259774685, + "grad_norm": 0.04656155779957771, + "learning_rate": 3.306390824258981e-07, + "loss": 0.0002, + "num_input_tokens_seen": 36974080, + "step": 18889 + }, + { + "epoch": 2.5036447978793905, + "grad_norm": 0.22439758479595184, + "learning_rate": 3.304665723554151e-07, + "loss": 0.0011, + "num_input_tokens_seen": 36976576, + "step": 18890 + }, + { + "epoch": 2.5037773359840956, + "grad_norm": 8.613420486450195, + "learning_rate": 3.302941041150334e-07, + "loss": 0.1072, + "num_input_tokens_seen": 36979096, + "step": 18891 + }, + { + "epoch": 2.5039098740888006, + "grad_norm": 10.852705001831055, + "learning_rate": 3.301216777080776e-07, + "loss": 0.129, + "num_input_tokens_seen": 36980840, + "step": 18892 + }, + { + "epoch": 2.5040424121935057, + "grad_norm": 0.00690833106637001, + "learning_rate": 3.299492931378728e-07, + "loss": 0.0, + "num_input_tokens_seen": 36982192, + "step": 18893 + }, + { + "epoch": 2.5041749502982107, + "grad_norm": 5.08875036239624, + "learning_rate": 3.297769504077433e-07, + "loss": 0.021, + "num_input_tokens_seen": 36984088, + "step": 18894 + }, + { + "epoch": 2.5043074884029157, + "grad_norm": 0.0623430572450161, + "learning_rate": 3.296046495210112e-07, + "loss": 0.0003, + "num_input_tokens_seen": 36985680, + "step": 18895 + }, + { + "epoch": 2.5044400265076208, + "grad_norm": 0.8401451110839844, + "learning_rate": 3.294323904809987e-07, + "loss": 0.004, + "num_input_tokens_seen": 36987648, + "step": 18896 + }, + { + "epoch": 2.5045725646123262, + "grad_norm": 14.340987205505371, + "learning_rate": 3.2926017329102667e-07, + "loss": 0.1984, + "num_input_tokens_seen": 36989872, + "step": 18897 + }, + { + "epoch": 2.5047051027170313, + "grad_norm": 0.17604050040245056, + "learning_rate": 3.290879979544165e-07, + "loss": 0.001, + "num_input_tokens_seen": 36992544, + "step": 18898 + }, + { + "epoch": 2.5048376408217363, + "grad_norm": 1.127850890159607, + "learning_rate": 3.2891586447448693e-07, + "loss": 0.0039, + "num_input_tokens_seen": 36995336, + "step": 18899 + }, + { + "epoch": 2.5049701789264414, + "grad_norm": 0.2068016231060028, + "learning_rate": 3.287437728545578e-07, + "loss": 0.0015, + "num_input_tokens_seen": 36997536, + "step": 18900 + }, + { + "epoch": 2.5051027170311464, + "grad_norm": 1.2962006330490112, + "learning_rate": 3.2857172309794654e-07, + "loss": 0.0125, + "num_input_tokens_seen": 36999304, + "step": 18901 + }, + { + "epoch": 2.5052352551358514, + "grad_norm": 0.028670739382505417, + "learning_rate": 3.2839971520797044e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37000904, + "step": 18902 + }, + { + "epoch": 2.5053677932405565, + "grad_norm": 0.5956410765647888, + "learning_rate": 3.282277491879454e-07, + "loss": 0.0027, + "num_input_tokens_seen": 37002960, + "step": 18903 + }, + { + "epoch": 2.505500331345262, + "grad_norm": 0.03883851692080498, + "learning_rate": 3.280558250411875e-07, + "loss": 0.0003, + "num_input_tokens_seen": 37004352, + "step": 18904 + }, + { + "epoch": 2.505632869449967, + "grad_norm": 15.943716049194336, + "learning_rate": 3.278839427710123e-07, + "loss": 0.2493, + "num_input_tokens_seen": 37006704, + "step": 18905 + }, + { + "epoch": 2.505765407554672, + "grad_norm": 6.611527442932129, + "learning_rate": 3.2771210238073307e-07, + "loss": 0.0648, + "num_input_tokens_seen": 37009120, + "step": 18906 + }, + { + "epoch": 2.505897945659377, + "grad_norm": 0.36734530329704285, + "learning_rate": 3.27540303873663e-07, + "loss": 0.0008, + "num_input_tokens_seen": 37011304, + "step": 18907 + }, + { + "epoch": 2.506030483764082, + "grad_norm": 0.05755903571844101, + "learning_rate": 3.273685472531143e-07, + "loss": 0.0004, + "num_input_tokens_seen": 37013656, + "step": 18908 + }, + { + "epoch": 2.506163021868787, + "grad_norm": 2.1778171062469482, + "learning_rate": 3.2719683252239927e-07, + "loss": 0.0186, + "num_input_tokens_seen": 37015496, + "step": 18909 + }, + { + "epoch": 2.506295559973492, + "grad_norm": 5.037810802459717, + "learning_rate": 3.2702515968482757e-07, + "loss": 0.0523, + "num_input_tokens_seen": 37017616, + "step": 18910 + }, + { + "epoch": 2.5064280980781977, + "grad_norm": 8.038813591003418, + "learning_rate": 3.268535287437105e-07, + "loss": 0.0674, + "num_input_tokens_seen": 37019520, + "step": 18911 + }, + { + "epoch": 2.5065606361829027, + "grad_norm": 0.12796185910701752, + "learning_rate": 3.2668193970235655e-07, + "loss": 0.0006, + "num_input_tokens_seen": 37022648, + "step": 18912 + }, + { + "epoch": 2.5066931742876077, + "grad_norm": 0.004791236482560635, + "learning_rate": 3.265103925640736e-07, + "loss": 0.0, + "num_input_tokens_seen": 37023672, + "step": 18913 + }, + { + "epoch": 2.5068257123923128, + "grad_norm": 0.00309697724878788, + "learning_rate": 3.263388873321702e-07, + "loss": 0.0, + "num_input_tokens_seen": 37024816, + "step": 18914 + }, + { + "epoch": 2.506958250497018, + "grad_norm": 0.025032825767993927, + "learning_rate": 3.261674240099524e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37026848, + "step": 18915 + }, + { + "epoch": 2.507090788601723, + "grad_norm": 5.085031509399414, + "learning_rate": 3.259960026007261e-07, + "loss": 0.0806, + "num_input_tokens_seen": 37028904, + "step": 18916 + }, + { + "epoch": 2.507223326706428, + "grad_norm": 2.8521907329559326, + "learning_rate": 3.2582462310779715e-07, + "loss": 0.0233, + "num_input_tokens_seen": 37030584, + "step": 18917 + }, + { + "epoch": 2.5073558648111334, + "grad_norm": 0.29547789692878723, + "learning_rate": 3.256532855344685e-07, + "loss": 0.0011, + "num_input_tokens_seen": 37032120, + "step": 18918 + }, + { + "epoch": 2.5074884029158384, + "grad_norm": 0.03684595227241516, + "learning_rate": 3.2548198988404526e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37033768, + "step": 18919 + }, + { + "epoch": 2.5076209410205434, + "grad_norm": 11.6662015914917, + "learning_rate": 3.253107361598296e-07, + "loss": 0.0687, + "num_input_tokens_seen": 37035912, + "step": 18920 + }, + { + "epoch": 2.5077534791252485, + "grad_norm": 0.011618968099355698, + "learning_rate": 3.2513952436512285e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37037872, + "step": 18921 + }, + { + "epoch": 2.5078860172299535, + "grad_norm": 0.005466192029416561, + "learning_rate": 3.249683545032259e-07, + "loss": 0.0, + "num_input_tokens_seen": 37039392, + "step": 18922 + }, + { + "epoch": 2.5080185553346586, + "grad_norm": 5.545663356781006, + "learning_rate": 3.247972265774396e-07, + "loss": 0.0275, + "num_input_tokens_seen": 37041064, + "step": 18923 + }, + { + "epoch": 2.5081510934393636, + "grad_norm": 2.71606707572937, + "learning_rate": 3.246261405910642e-07, + "loss": 0.0234, + "num_input_tokens_seen": 37042832, + "step": 18924 + }, + { + "epoch": 2.508283631544069, + "grad_norm": 0.01454972755163908, + "learning_rate": 3.244550965473972e-07, + "loss": 0.0, + "num_input_tokens_seen": 37044136, + "step": 18925 + }, + { + "epoch": 2.508416169648774, + "grad_norm": 8.307259559631348, + "learning_rate": 3.242840944497369e-07, + "loss": 0.0944, + "num_input_tokens_seen": 37045904, + "step": 18926 + }, + { + "epoch": 2.508548707753479, + "grad_norm": 7.576040267944336, + "learning_rate": 3.241131343013795e-07, + "loss": 0.0907, + "num_input_tokens_seen": 37047992, + "step": 18927 + }, + { + "epoch": 2.508681245858184, + "grad_norm": 0.03288733586668968, + "learning_rate": 3.239422161056227e-07, + "loss": 0.0004, + "num_input_tokens_seen": 37049896, + "step": 18928 + }, + { + "epoch": 2.5088137839628892, + "grad_norm": 0.012413440272212029, + "learning_rate": 3.2377133986576046e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37052832, + "step": 18929 + }, + { + "epoch": 2.5089463220675943, + "grad_norm": 9.242318153381348, + "learning_rate": 3.236005055850888e-07, + "loss": 0.0069, + "num_input_tokens_seen": 37054136, + "step": 18930 + }, + { + "epoch": 2.5090788601722993, + "grad_norm": 0.0018430276541039348, + "learning_rate": 3.2342971326690066e-07, + "loss": 0.0, + "num_input_tokens_seen": 37056192, + "step": 18931 + }, + { + "epoch": 2.509211398277005, + "grad_norm": 1.339027762413025, + "learning_rate": 3.2325896291448916e-07, + "loss": 0.0072, + "num_input_tokens_seen": 37059624, + "step": 18932 + }, + { + "epoch": 2.50934393638171, + "grad_norm": 0.13388855755329132, + "learning_rate": 3.230882545311462e-07, + "loss": 0.0005, + "num_input_tokens_seen": 37061056, + "step": 18933 + }, + { + "epoch": 2.509476474486415, + "grad_norm": 5.6394219398498535, + "learning_rate": 3.2291758812016324e-07, + "loss": 0.0199, + "num_input_tokens_seen": 37063232, + "step": 18934 + }, + { + "epoch": 2.50960901259112, + "grad_norm": 1.5580370426177979, + "learning_rate": 3.2274696368483153e-07, + "loss": 0.0082, + "num_input_tokens_seen": 37065056, + "step": 18935 + }, + { + "epoch": 2.509741550695825, + "grad_norm": 0.5308508276939392, + "learning_rate": 3.2257638122844047e-07, + "loss": 0.0007, + "num_input_tokens_seen": 37066248, + "step": 18936 + }, + { + "epoch": 2.5098740888005304, + "grad_norm": 9.7587308883667, + "learning_rate": 3.2240584075427906e-07, + "loss": 0.1319, + "num_input_tokens_seen": 37067944, + "step": 18937 + }, + { + "epoch": 2.510006626905235, + "grad_norm": 2.983492374420166, + "learning_rate": 3.222353422656352e-07, + "loss": 0.033, + "num_input_tokens_seen": 37070336, + "step": 18938 + }, + { + "epoch": 2.5101391650099405, + "grad_norm": 0.0697803646326065, + "learning_rate": 3.220648857657957e-07, + "loss": 0.0005, + "num_input_tokens_seen": 37073576, + "step": 18939 + }, + { + "epoch": 2.5102717031146455, + "grad_norm": 2.575366735458374, + "learning_rate": 3.218944712580477e-07, + "loss": 0.0472, + "num_input_tokens_seen": 37075336, + "step": 18940 + }, + { + "epoch": 2.5104042412193506, + "grad_norm": 2.5171701908111572, + "learning_rate": 3.217240987456774e-07, + "loss": 0.0208, + "num_input_tokens_seen": 37077768, + "step": 18941 + }, + { + "epoch": 2.5105367793240556, + "grad_norm": 2.3216018676757812, + "learning_rate": 3.215537682319694e-07, + "loss": 0.0105, + "num_input_tokens_seen": 37079808, + "step": 18942 + }, + { + "epoch": 2.5106693174287606, + "grad_norm": 0.13588246703147888, + "learning_rate": 3.213834797202073e-07, + "loss": 0.0008, + "num_input_tokens_seen": 37081392, + "step": 18943 + }, + { + "epoch": 2.510801855533466, + "grad_norm": 1.2300422191619873, + "learning_rate": 3.212132332136744e-07, + "loss": 0.0053, + "num_input_tokens_seen": 37084744, + "step": 18944 + }, + { + "epoch": 2.5109343936381707, + "grad_norm": 0.013956381939351559, + "learning_rate": 3.210430287156538e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37087000, + "step": 18945 + }, + { + "epoch": 2.511066931742876, + "grad_norm": 10.498563766479492, + "learning_rate": 3.2087286622942635e-07, + "loss": 0.2096, + "num_input_tokens_seen": 37088944, + "step": 18946 + }, + { + "epoch": 2.5111994698475812, + "grad_norm": 3.5410022735595703, + "learning_rate": 3.207027457582737e-07, + "loss": 0.0193, + "num_input_tokens_seen": 37090384, + "step": 18947 + }, + { + "epoch": 2.5113320079522863, + "grad_norm": 2.39983868598938, + "learning_rate": 3.2053266730547567e-07, + "loss": 0.0064, + "num_input_tokens_seen": 37092520, + "step": 18948 + }, + { + "epoch": 2.5114645460569913, + "grad_norm": 1.04558527469635, + "learning_rate": 3.2036263087431075e-07, + "loss": 0.0026, + "num_input_tokens_seen": 37094840, + "step": 18949 + }, + { + "epoch": 2.5115970841616964, + "grad_norm": 3.031851053237915, + "learning_rate": 3.2019263646805854e-07, + "loss": 0.0379, + "num_input_tokens_seen": 37096768, + "step": 18950 + }, + { + "epoch": 2.511729622266402, + "grad_norm": 0.01281486451625824, + "learning_rate": 3.2002268408999616e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37098832, + "step": 18951 + }, + { + "epoch": 2.5118621603711064, + "grad_norm": 0.0042800335213541985, + "learning_rate": 3.198527737433996e-07, + "loss": 0.0, + "num_input_tokens_seen": 37100864, + "step": 18952 + }, + { + "epoch": 2.511994698475812, + "grad_norm": 0.024975012987852097, + "learning_rate": 3.1968290543154615e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37103272, + "step": 18953 + }, + { + "epoch": 2.512127236580517, + "grad_norm": 0.8042102456092834, + "learning_rate": 3.1951307915771e-07, + "loss": 0.002, + "num_input_tokens_seen": 37105488, + "step": 18954 + }, + { + "epoch": 2.512259774685222, + "grad_norm": 2.6198372840881348, + "learning_rate": 3.1934329492516627e-07, + "loss": 0.026, + "num_input_tokens_seen": 37108064, + "step": 18955 + }, + { + "epoch": 2.512392312789927, + "grad_norm": 8.025418281555176, + "learning_rate": 3.191735527371884e-07, + "loss": 0.0307, + "num_input_tokens_seen": 37110024, + "step": 18956 + }, + { + "epoch": 2.512524850894632, + "grad_norm": 2.5593149662017822, + "learning_rate": 3.190038525970482e-07, + "loss": 0.0223, + "num_input_tokens_seen": 37111336, + "step": 18957 + }, + { + "epoch": 2.5126573889993375, + "grad_norm": 0.11412045359611511, + "learning_rate": 3.1883419450801894e-07, + "loss": 0.0008, + "num_input_tokens_seen": 37112720, + "step": 18958 + }, + { + "epoch": 2.512789927104042, + "grad_norm": 2.7619855403900146, + "learning_rate": 3.186645784733711e-07, + "loss": 0.0327, + "num_input_tokens_seen": 37114792, + "step": 18959 + }, + { + "epoch": 2.5129224652087476, + "grad_norm": 7.100040912628174, + "learning_rate": 3.184950044963744e-07, + "loss": 0.0873, + "num_input_tokens_seen": 37116680, + "step": 18960 + }, + { + "epoch": 2.5130550033134527, + "grad_norm": 0.03837904706597328, + "learning_rate": 3.183254725802995e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37120320, + "step": 18961 + }, + { + "epoch": 2.5131875414181577, + "grad_norm": 0.18688632547855377, + "learning_rate": 3.1815598272841474e-07, + "loss": 0.0012, + "num_input_tokens_seen": 37122168, + "step": 18962 + }, + { + "epoch": 2.5133200795228627, + "grad_norm": 0.8362486958503723, + "learning_rate": 3.17986534943987e-07, + "loss": 0.0051, + "num_input_tokens_seen": 37124208, + "step": 18963 + }, + { + "epoch": 2.5134526176275678, + "grad_norm": 5.50975227355957, + "learning_rate": 3.178171292302848e-07, + "loss": 0.0942, + "num_input_tokens_seen": 37125448, + "step": 18964 + }, + { + "epoch": 2.5135851557322733, + "grad_norm": 5.371541500091553, + "learning_rate": 3.1764776559057327e-07, + "loss": 0.0314, + "num_input_tokens_seen": 37127552, + "step": 18965 + }, + { + "epoch": 2.5137176938369783, + "grad_norm": 12.527759552001953, + "learning_rate": 3.1747844402811893e-07, + "loss": 0.1917, + "num_input_tokens_seen": 37129608, + "step": 18966 + }, + { + "epoch": 2.5138502319416833, + "grad_norm": 1.2726984024047852, + "learning_rate": 3.1730916454618565e-07, + "loss": 0.0076, + "num_input_tokens_seen": 37133200, + "step": 18967 + }, + { + "epoch": 2.5139827700463884, + "grad_norm": 7.60409688949585, + "learning_rate": 3.171399271480377e-07, + "loss": 0.0725, + "num_input_tokens_seen": 37134960, + "step": 18968 + }, + { + "epoch": 2.5141153081510934, + "grad_norm": 0.5263475179672241, + "learning_rate": 3.1697073183693723e-07, + "loss": 0.0023, + "num_input_tokens_seen": 37136568, + "step": 18969 + }, + { + "epoch": 2.5142478462557984, + "grad_norm": 0.19430802762508392, + "learning_rate": 3.1680157861614684e-07, + "loss": 0.0003, + "num_input_tokens_seen": 37138056, + "step": 18970 + }, + { + "epoch": 2.5143803843605035, + "grad_norm": 0.0027702138759195805, + "learning_rate": 3.1663246748892875e-07, + "loss": 0.0, + "num_input_tokens_seen": 37139960, + "step": 18971 + }, + { + "epoch": 2.514512922465209, + "grad_norm": 2.9984428882598877, + "learning_rate": 3.1646339845854304e-07, + "loss": 0.0076, + "num_input_tokens_seen": 37141264, + "step": 18972 + }, + { + "epoch": 2.514645460569914, + "grad_norm": 1.7659131288528442, + "learning_rate": 3.162943715282493e-07, + "loss": 0.0058, + "num_input_tokens_seen": 37142400, + "step": 18973 + }, + { + "epoch": 2.514777998674619, + "grad_norm": 2.246964693069458, + "learning_rate": 3.1612538670130644e-07, + "loss": 0.0168, + "num_input_tokens_seen": 37144296, + "step": 18974 + }, + { + "epoch": 2.514910536779324, + "grad_norm": 0.053303394466638565, + "learning_rate": 3.159564439809723e-07, + "loss": 0.0004, + "num_input_tokens_seen": 37145768, + "step": 18975 + }, + { + "epoch": 2.515043074884029, + "grad_norm": 0.1905931681394577, + "learning_rate": 3.157875433705046e-07, + "loss": 0.0013, + "num_input_tokens_seen": 37147384, + "step": 18976 + }, + { + "epoch": 2.515175612988734, + "grad_norm": 4.551131725311279, + "learning_rate": 3.1561868487316016e-07, + "loss": 0.0447, + "num_input_tokens_seen": 37149872, + "step": 18977 + }, + { + "epoch": 2.515308151093439, + "grad_norm": 0.04451128467917442, + "learning_rate": 3.1544986849219473e-07, + "loss": 0.0003, + "num_input_tokens_seen": 37151480, + "step": 18978 + }, + { + "epoch": 2.5154406891981447, + "grad_norm": 0.0039344895631074905, + "learning_rate": 3.1528109423086264e-07, + "loss": 0.0, + "num_input_tokens_seen": 37154368, + "step": 18979 + }, + { + "epoch": 2.5155732273028497, + "grad_norm": 0.3041742146015167, + "learning_rate": 3.151123620924179e-07, + "loss": 0.0015, + "num_input_tokens_seen": 37156664, + "step": 18980 + }, + { + "epoch": 2.5157057654075548, + "grad_norm": 0.10289274156093597, + "learning_rate": 3.149436720801138e-07, + "loss": 0.0004, + "num_input_tokens_seen": 37157912, + "step": 18981 + }, + { + "epoch": 2.51583830351226, + "grad_norm": 7.035677909851074, + "learning_rate": 3.147750241972039e-07, + "loss": 0.0733, + "num_input_tokens_seen": 37159664, + "step": 18982 + }, + { + "epoch": 2.515970841616965, + "grad_norm": 0.8407468795776367, + "learning_rate": 3.1460641844693885e-07, + "loss": 0.0055, + "num_input_tokens_seen": 37161640, + "step": 18983 + }, + { + "epoch": 2.51610337972167, + "grad_norm": 5.838760852813721, + "learning_rate": 3.1443785483256996e-07, + "loss": 0.0194, + "num_input_tokens_seen": 37164136, + "step": 18984 + }, + { + "epoch": 2.516235917826375, + "grad_norm": 10.091032981872559, + "learning_rate": 3.142693333573468e-07, + "loss": 0.1177, + "num_input_tokens_seen": 37166256, + "step": 18985 + }, + { + "epoch": 2.5163684559310804, + "grad_norm": 3.496364116668701, + "learning_rate": 3.141008540245183e-07, + "loss": 0.0086, + "num_input_tokens_seen": 37167584, + "step": 18986 + }, + { + "epoch": 2.5165009940357854, + "grad_norm": 0.020223980769515038, + "learning_rate": 3.1393241683733313e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37168968, + "step": 18987 + }, + { + "epoch": 2.5166335321404905, + "grad_norm": 1.2177257537841797, + "learning_rate": 3.1376402179903953e-07, + "loss": 0.0078, + "num_input_tokens_seen": 37170936, + "step": 18988 + }, + { + "epoch": 2.5167660702451955, + "grad_norm": 0.03331724926829338, + "learning_rate": 3.135956689128841e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37172232, + "step": 18989 + }, + { + "epoch": 2.5168986083499005, + "grad_norm": 0.02631368301808834, + "learning_rate": 3.134273581821123e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37174896, + "step": 18990 + }, + { + "epoch": 2.5170311464546056, + "grad_norm": 0.2929513156414032, + "learning_rate": 3.132590896099691e-07, + "loss": 0.0024, + "num_input_tokens_seen": 37176776, + "step": 18991 + }, + { + "epoch": 2.5171636845593106, + "grad_norm": 4.12161922454834, + "learning_rate": 3.1309086319969957e-07, + "loss": 0.0187, + "num_input_tokens_seen": 37178864, + "step": 18992 + }, + { + "epoch": 2.517296222664016, + "grad_norm": 6.546437740325928, + "learning_rate": 3.129226789545464e-07, + "loss": 0.0523, + "num_input_tokens_seen": 37180184, + "step": 18993 + }, + { + "epoch": 2.517428760768721, + "grad_norm": 0.19710469245910645, + "learning_rate": 3.127545368777532e-07, + "loss": 0.0007, + "num_input_tokens_seen": 37182168, + "step": 18994 + }, + { + "epoch": 2.517561298873426, + "grad_norm": 4.459694862365723, + "learning_rate": 3.125864369725615e-07, + "loss": 0.0765, + "num_input_tokens_seen": 37183944, + "step": 18995 + }, + { + "epoch": 2.517693836978131, + "grad_norm": 10.04840087890625, + "learning_rate": 3.124183792422117e-07, + "loss": 0.0476, + "num_input_tokens_seen": 37186240, + "step": 18996 + }, + { + "epoch": 2.5178263750828362, + "grad_norm": 0.061082880944013596, + "learning_rate": 3.1225036368994545e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37187456, + "step": 18997 + }, + { + "epoch": 2.5179589131875413, + "grad_norm": 0.0019496860913932323, + "learning_rate": 3.1208239031900125e-07, + "loss": 0.0, + "num_input_tokens_seen": 37188664, + "step": 18998 + }, + { + "epoch": 2.5180914512922463, + "grad_norm": 0.0017281313193961978, + "learning_rate": 3.119144591326176e-07, + "loss": 0.0, + "num_input_tokens_seen": 37189520, + "step": 18999 + }, + { + "epoch": 2.518223989396952, + "grad_norm": 4.99782657623291, + "learning_rate": 3.1174657013403303e-07, + "loss": 0.1069, + "num_input_tokens_seen": 37192008, + "step": 19000 + }, + { + "epoch": 2.518356527501657, + "grad_norm": 0.030006704851984978, + "learning_rate": 3.1157872332648357e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37193336, + "step": 19001 + }, + { + "epoch": 2.518489065606362, + "grad_norm": 0.0035458526108413935, + "learning_rate": 3.1141091871320667e-07, + "loss": 0.0, + "num_input_tokens_seen": 37194888, + "step": 19002 + }, + { + "epoch": 2.518621603711067, + "grad_norm": 0.04300444573163986, + "learning_rate": 3.112431562974372e-07, + "loss": 0.0003, + "num_input_tokens_seen": 37195872, + "step": 19003 + }, + { + "epoch": 2.518754141815772, + "grad_norm": 0.09322241693735123, + "learning_rate": 3.110754360824095e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37198440, + "step": 19004 + }, + { + "epoch": 2.518886679920477, + "grad_norm": 0.08047468215227127, + "learning_rate": 3.109077580713568e-07, + "loss": 0.0003, + "num_input_tokens_seen": 37200200, + "step": 19005 + }, + { + "epoch": 2.519019218025182, + "grad_norm": 0.3719690442085266, + "learning_rate": 3.1074012226751297e-07, + "loss": 0.001, + "num_input_tokens_seen": 37202688, + "step": 19006 + }, + { + "epoch": 2.5191517561298875, + "grad_norm": 1.1955033540725708, + "learning_rate": 3.1057252867411013e-07, + "loss": 0.003, + "num_input_tokens_seen": 37205144, + "step": 19007 + }, + { + "epoch": 2.5192842942345925, + "grad_norm": 1.8531615734100342, + "learning_rate": 3.1040497729437955e-07, + "loss": 0.0086, + "num_input_tokens_seen": 37206536, + "step": 19008 + }, + { + "epoch": 2.5194168323392976, + "grad_norm": 0.028094176203012466, + "learning_rate": 3.102374681315515e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37209200, + "step": 19009 + }, + { + "epoch": 2.5195493704440026, + "grad_norm": 13.5997896194458, + "learning_rate": 3.1007000118885523e-07, + "loss": 0.1316, + "num_input_tokens_seen": 37212112, + "step": 19010 + }, + { + "epoch": 2.5196819085487077, + "grad_norm": 6.746583938598633, + "learning_rate": 3.0990257646951985e-07, + "loss": 0.0231, + "num_input_tokens_seen": 37215104, + "step": 19011 + }, + { + "epoch": 2.5198144466534127, + "grad_norm": 0.029580211266875267, + "learning_rate": 3.0973519397677425e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37217136, + "step": 19012 + }, + { + "epoch": 2.5199469847581177, + "grad_norm": 4.094579696655273, + "learning_rate": 3.0956785371384525e-07, + "loss": 0.1179, + "num_input_tokens_seen": 37218408, + "step": 19013 + }, + { + "epoch": 2.520079522862823, + "grad_norm": 8.75252914428711, + "learning_rate": 3.094005556839591e-07, + "loss": 0.1848, + "num_input_tokens_seen": 37220504, + "step": 19014 + }, + { + "epoch": 2.5202120609675283, + "grad_norm": 7.873892784118652, + "learning_rate": 3.092332998903416e-07, + "loss": 0.1052, + "num_input_tokens_seen": 37222256, + "step": 19015 + }, + { + "epoch": 2.5203445990722333, + "grad_norm": 0.0066873664036393166, + "learning_rate": 3.090660863362166e-07, + "loss": 0.0, + "num_input_tokens_seen": 37223744, + "step": 19016 + }, + { + "epoch": 2.5204771371769383, + "grad_norm": 1.5027353763580322, + "learning_rate": 3.0889891502480896e-07, + "loss": 0.0065, + "num_input_tokens_seen": 37226368, + "step": 19017 + }, + { + "epoch": 2.5206096752816434, + "grad_norm": 4.3736748695373535, + "learning_rate": 3.087317859593425e-07, + "loss": 0.0109, + "num_input_tokens_seen": 37228176, + "step": 19018 + }, + { + "epoch": 2.520742213386349, + "grad_norm": 10.199136734008789, + "learning_rate": 3.0856469914303883e-07, + "loss": 0.1101, + "num_input_tokens_seen": 37229752, + "step": 19019 + }, + { + "epoch": 2.5208747514910534, + "grad_norm": 7.644498825073242, + "learning_rate": 3.0839765457911954e-07, + "loss": 0.2549, + "num_input_tokens_seen": 37231832, + "step": 19020 + }, + { + "epoch": 2.521007289595759, + "grad_norm": 0.22376547753810883, + "learning_rate": 3.0823065227080536e-07, + "loss": 0.001, + "num_input_tokens_seen": 37234008, + "step": 19021 + }, + { + "epoch": 2.521139827700464, + "grad_norm": 0.013103737495839596, + "learning_rate": 3.0806369222131565e-07, + "loss": 0.0, + "num_input_tokens_seen": 37236320, + "step": 19022 + }, + { + "epoch": 2.521272365805169, + "grad_norm": 6.850834369659424, + "learning_rate": 3.078967744338701e-07, + "loss": 0.091, + "num_input_tokens_seen": 37238336, + "step": 19023 + }, + { + "epoch": 2.521404903909874, + "grad_norm": 2.564728260040283, + "learning_rate": 3.077298989116875e-07, + "loss": 0.0118, + "num_input_tokens_seen": 37240312, + "step": 19024 + }, + { + "epoch": 2.521537442014579, + "grad_norm": 0.01958959363400936, + "learning_rate": 3.0756306565798476e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37242992, + "step": 19025 + }, + { + "epoch": 2.5216699801192846, + "grad_norm": 8.58803939819336, + "learning_rate": 3.073962746759787e-07, + "loss": 0.102, + "num_input_tokens_seen": 37245400, + "step": 19026 + }, + { + "epoch": 2.521802518223989, + "grad_norm": 8.174155235290527, + "learning_rate": 3.0722952596888457e-07, + "loss": 0.2566, + "num_input_tokens_seen": 37247240, + "step": 19027 + }, + { + "epoch": 2.5219350563286946, + "grad_norm": 6.473876476287842, + "learning_rate": 3.0706281953991834e-07, + "loss": 0.1216, + "num_input_tokens_seen": 37249152, + "step": 19028 + }, + { + "epoch": 2.5220675944333997, + "grad_norm": 4.075709819793701, + "learning_rate": 3.0689615539229336e-07, + "loss": 0.0419, + "num_input_tokens_seen": 37250832, + "step": 19029 + }, + { + "epoch": 2.5222001325381047, + "grad_norm": 0.5353427529335022, + "learning_rate": 3.067295335292239e-07, + "loss": 0.0023, + "num_input_tokens_seen": 37251896, + "step": 19030 + }, + { + "epoch": 2.5223326706428097, + "grad_norm": 0.16234004497528076, + "learning_rate": 3.0656295395392224e-07, + "loss": 0.0013, + "num_input_tokens_seen": 37254816, + "step": 19031 + }, + { + "epoch": 2.522465208747515, + "grad_norm": 0.00024320707598235458, + "learning_rate": 3.063964166695993e-07, + "loss": 0.0, + "num_input_tokens_seen": 37255984, + "step": 19032 + }, + { + "epoch": 2.5225977468522203, + "grad_norm": 16.064165115356445, + "learning_rate": 3.062299216794673e-07, + "loss": 0.1522, + "num_input_tokens_seen": 37257672, + "step": 19033 + }, + { + "epoch": 2.522730284956925, + "grad_norm": 1.3186776638031006, + "learning_rate": 3.060634689867353e-07, + "loss": 0.0094, + "num_input_tokens_seen": 37259344, + "step": 19034 + }, + { + "epoch": 2.5228628230616303, + "grad_norm": 0.007920119911432266, + "learning_rate": 3.058970585946136e-07, + "loss": 0.0, + "num_input_tokens_seen": 37260624, + "step": 19035 + }, + { + "epoch": 2.5229953611663354, + "grad_norm": 0.0016489775152876973, + "learning_rate": 3.057306905063104e-07, + "loss": 0.0, + "num_input_tokens_seen": 37261832, + "step": 19036 + }, + { + "epoch": 2.5231278992710404, + "grad_norm": 11.083040237426758, + "learning_rate": 3.0556436472503256e-07, + "loss": 0.1377, + "num_input_tokens_seen": 37263360, + "step": 19037 + }, + { + "epoch": 2.5232604373757455, + "grad_norm": 10.893657684326172, + "learning_rate": 3.0539808125398807e-07, + "loss": 0.0726, + "num_input_tokens_seen": 37265392, + "step": 19038 + }, + { + "epoch": 2.5233929754804505, + "grad_norm": 0.5558332800865173, + "learning_rate": 3.0523184009638276e-07, + "loss": 0.0032, + "num_input_tokens_seen": 37267272, + "step": 19039 + }, + { + "epoch": 2.523525513585156, + "grad_norm": 10.004725456237793, + "learning_rate": 3.0506564125542123e-07, + "loss": 0.1095, + "num_input_tokens_seen": 37269272, + "step": 19040 + }, + { + "epoch": 2.5236580516898606, + "grad_norm": 0.005695379339158535, + "learning_rate": 3.048994847343087e-07, + "loss": 0.0, + "num_input_tokens_seen": 37271776, + "step": 19041 + }, + { + "epoch": 2.523790589794566, + "grad_norm": 9.489371299743652, + "learning_rate": 3.047333705362482e-07, + "loss": 0.0478, + "num_input_tokens_seen": 37274736, + "step": 19042 + }, + { + "epoch": 2.523923127899271, + "grad_norm": 7.947813987731934, + "learning_rate": 3.045672986644427e-07, + "loss": 0.0474, + "num_input_tokens_seen": 37276320, + "step": 19043 + }, + { + "epoch": 2.524055666003976, + "grad_norm": 5.883983612060547, + "learning_rate": 3.0440126912209434e-07, + "loss": 0.0437, + "num_input_tokens_seen": 37278552, + "step": 19044 + }, + { + "epoch": 2.524188204108681, + "grad_norm": 0.040536753833293915, + "learning_rate": 3.0423528191240447e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37280424, + "step": 19045 + }, + { + "epoch": 2.524320742213386, + "grad_norm": 0.7967009544372559, + "learning_rate": 3.040693370385725e-07, + "loss": 0.0035, + "num_input_tokens_seen": 37283800, + "step": 19046 + }, + { + "epoch": 2.5244532803180917, + "grad_norm": 10.937032699584961, + "learning_rate": 3.039034345037992e-07, + "loss": 0.1011, + "num_input_tokens_seen": 37285336, + "step": 19047 + }, + { + "epoch": 2.5245858184227963, + "grad_norm": 0.004720596130937338, + "learning_rate": 3.0373757431128194e-07, + "loss": 0.0, + "num_input_tokens_seen": 37288136, + "step": 19048 + }, + { + "epoch": 2.5247183565275018, + "grad_norm": 6.496044635772705, + "learning_rate": 3.0357175646422014e-07, + "loss": 0.0526, + "num_input_tokens_seen": 37290536, + "step": 19049 + }, + { + "epoch": 2.524850894632207, + "grad_norm": 2.2700207233428955, + "learning_rate": 3.034059809658099e-07, + "loss": 0.0123, + "num_input_tokens_seen": 37292592, + "step": 19050 + }, + { + "epoch": 2.524983432736912, + "grad_norm": 0.014312869869172573, + "learning_rate": 3.032402478192481e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37293920, + "step": 19051 + }, + { + "epoch": 2.525115970841617, + "grad_norm": 0.012016148306429386, + "learning_rate": 3.0307455702772883e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37295104, + "step": 19052 + }, + { + "epoch": 2.525248508946322, + "grad_norm": 0.0016730931820347905, + "learning_rate": 3.0290890859444784e-07, + "loss": 0.0, + "num_input_tokens_seen": 37296488, + "step": 19053 + }, + { + "epoch": 2.5253810470510274, + "grad_norm": 5.655860900878906, + "learning_rate": 3.0274330252259925e-07, + "loss": 0.0334, + "num_input_tokens_seen": 37298240, + "step": 19054 + }, + { + "epoch": 2.5255135851557324, + "grad_norm": 20.5645694732666, + "learning_rate": 3.0257773881537583e-07, + "loss": 0.1837, + "num_input_tokens_seen": 37300280, + "step": 19055 + }, + { + "epoch": 2.5256461232604375, + "grad_norm": 4.748203754425049, + "learning_rate": 3.0241221747596943e-07, + "loss": 0.0492, + "num_input_tokens_seen": 37302888, + "step": 19056 + }, + { + "epoch": 2.5257786613651425, + "grad_norm": 4.462448596954346, + "learning_rate": 3.022467385075714e-07, + "loss": 0.1032, + "num_input_tokens_seen": 37305408, + "step": 19057 + }, + { + "epoch": 2.5259111994698475, + "grad_norm": 0.002763837343081832, + "learning_rate": 3.02081301913372e-07, + "loss": 0.0, + "num_input_tokens_seen": 37307744, + "step": 19058 + }, + { + "epoch": 2.5260437375745526, + "grad_norm": 1.0047616958618164, + "learning_rate": 3.019159076965611e-07, + "loss": 0.0045, + "num_input_tokens_seen": 37309112, + "step": 19059 + }, + { + "epoch": 2.5261762756792576, + "grad_norm": 0.035448722541332245, + "learning_rate": 3.017505558603287e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37311592, + "step": 19060 + }, + { + "epoch": 2.526308813783963, + "grad_norm": 14.925313949584961, + "learning_rate": 3.01585246407862e-07, + "loss": 0.3552, + "num_input_tokens_seen": 37313856, + "step": 19061 + }, + { + "epoch": 2.526441351888668, + "grad_norm": 0.010677700862288475, + "learning_rate": 3.0141997934234836e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37315296, + "step": 19062 + }, + { + "epoch": 2.526573889993373, + "grad_norm": 3.28476619720459, + "learning_rate": 3.012547546669736e-07, + "loss": 0.0352, + "num_input_tokens_seen": 37318032, + "step": 19063 + }, + { + "epoch": 2.526706428098078, + "grad_norm": 0.023423900827765465, + "learning_rate": 3.0108957238492416e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37319176, + "step": 19064 + }, + { + "epoch": 2.5268389662027833, + "grad_norm": 8.360081672668457, + "learning_rate": 3.009244324993851e-07, + "loss": 0.0708, + "num_input_tokens_seen": 37321464, + "step": 19065 + }, + { + "epoch": 2.5269715043074883, + "grad_norm": 0.007863742299377918, + "learning_rate": 3.0075933501354016e-07, + "loss": 0.0, + "num_input_tokens_seen": 37323968, + "step": 19066 + }, + { + "epoch": 2.5271040424121933, + "grad_norm": 24.61621856689453, + "learning_rate": 3.005942799305725e-07, + "loss": 0.3447, + "num_input_tokens_seen": 37325824, + "step": 19067 + }, + { + "epoch": 2.527236580516899, + "grad_norm": 10.274988174438477, + "learning_rate": 3.004292672536638e-07, + "loss": 0.227, + "num_input_tokens_seen": 37328200, + "step": 19068 + }, + { + "epoch": 2.527369118621604, + "grad_norm": 2.4371399879455566, + "learning_rate": 3.0026429698599673e-07, + "loss": 0.0498, + "num_input_tokens_seen": 37330056, + "step": 19069 + }, + { + "epoch": 2.527501656726309, + "grad_norm": 0.006653674878180027, + "learning_rate": 3.0009936913075106e-07, + "loss": 0.0, + "num_input_tokens_seen": 37332048, + "step": 19070 + }, + { + "epoch": 2.527634194831014, + "grad_norm": 0.015350362285971642, + "learning_rate": 2.999344836911075e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37333768, + "step": 19071 + }, + { + "epoch": 2.527766732935719, + "grad_norm": 8.021217346191406, + "learning_rate": 2.997696406702452e-07, + "loss": 0.2334, + "num_input_tokens_seen": 37335984, + "step": 19072 + }, + { + "epoch": 2.527899271040424, + "grad_norm": 0.05985564738512039, + "learning_rate": 2.996048400713419e-07, + "loss": 0.0004, + "num_input_tokens_seen": 37337944, + "step": 19073 + }, + { + "epoch": 2.528031809145129, + "grad_norm": 0.00936909206211567, + "learning_rate": 2.9944008189757457e-07, + "loss": 0.0, + "num_input_tokens_seen": 37340096, + "step": 19074 + }, + { + "epoch": 2.5281643472498345, + "grad_norm": 7.633286476135254, + "learning_rate": 2.992753661521214e-07, + "loss": 0.086, + "num_input_tokens_seen": 37342664, + "step": 19075 + }, + { + "epoch": 2.5282968853545396, + "grad_norm": 4.702107906341553, + "learning_rate": 2.991106928381565e-07, + "loss": 0.0227, + "num_input_tokens_seen": 37344992, + "step": 19076 + }, + { + "epoch": 2.5284294234592446, + "grad_norm": 7.830382823944092, + "learning_rate": 2.989460619588566e-07, + "loss": 0.035, + "num_input_tokens_seen": 37346728, + "step": 19077 + }, + { + "epoch": 2.5285619615639496, + "grad_norm": 5.351593971252441, + "learning_rate": 2.9878147351739496e-07, + "loss": 0.0329, + "num_input_tokens_seen": 37349240, + "step": 19078 + }, + { + "epoch": 2.5286944996686547, + "grad_norm": 6.137148380279541, + "learning_rate": 2.9861692751694456e-07, + "loss": 0.093, + "num_input_tokens_seen": 37351352, + "step": 19079 + }, + { + "epoch": 2.5288270377733597, + "grad_norm": 0.11415832489728928, + "learning_rate": 2.984524239606787e-07, + "loss": 0.0006, + "num_input_tokens_seen": 37353472, + "step": 19080 + }, + { + "epoch": 2.5289595758780647, + "grad_norm": 1.6513500213623047, + "learning_rate": 2.982879628517693e-07, + "loss": 0.0072, + "num_input_tokens_seen": 37355936, + "step": 19081 + }, + { + "epoch": 2.5290921139827702, + "grad_norm": 0.021916721016168594, + "learning_rate": 2.9812354419338606e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37357768, + "step": 19082 + }, + { + "epoch": 2.5292246520874753, + "grad_norm": 9.223024368286133, + "learning_rate": 2.979591679887003e-07, + "loss": 0.1057, + "num_input_tokens_seen": 37359320, + "step": 19083 + }, + { + "epoch": 2.5293571901921803, + "grad_norm": 0.22989630699157715, + "learning_rate": 2.977948342408807e-07, + "loss": 0.0015, + "num_input_tokens_seen": 37360872, + "step": 19084 + }, + { + "epoch": 2.5294897282968853, + "grad_norm": 0.08252662420272827, + "learning_rate": 2.9763054295309625e-07, + "loss": 0.0004, + "num_input_tokens_seen": 37362600, + "step": 19085 + }, + { + "epoch": 2.5296222664015904, + "grad_norm": 0.03387834504246712, + "learning_rate": 2.974662941285142e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37365024, + "step": 19086 + }, + { + "epoch": 2.5297548045062954, + "grad_norm": 0.009304218925535679, + "learning_rate": 2.973020877703012e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37366976, + "step": 19087 + }, + { + "epoch": 2.5298873426110005, + "grad_norm": 0.0758136734366417, + "learning_rate": 2.971379238816238e-07, + "loss": 0.0003, + "num_input_tokens_seen": 37368656, + "step": 19088 + }, + { + "epoch": 2.530019880715706, + "grad_norm": 3.143813371658325, + "learning_rate": 2.9697380246564654e-07, + "loss": 0.0227, + "num_input_tokens_seen": 37370400, + "step": 19089 + }, + { + "epoch": 2.530152418820411, + "grad_norm": 19.974788665771484, + "learning_rate": 2.968097235255343e-07, + "loss": 0.2177, + "num_input_tokens_seen": 37371952, + "step": 19090 + }, + { + "epoch": 2.530284956925116, + "grad_norm": 4.7282280921936035, + "learning_rate": 2.96645687064451e-07, + "loss": 0.0299, + "num_input_tokens_seen": 37374032, + "step": 19091 + }, + { + "epoch": 2.530417495029821, + "grad_norm": 0.014639724045991898, + "learning_rate": 2.964816930855585e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37376000, + "step": 19092 + }, + { + "epoch": 2.530550033134526, + "grad_norm": 8.441856384277344, + "learning_rate": 2.9631774159201876e-07, + "loss": 0.1273, + "num_input_tokens_seen": 37378024, + "step": 19093 + }, + { + "epoch": 2.530682571239231, + "grad_norm": 13.380906105041504, + "learning_rate": 2.961538325869931e-07, + "loss": 0.0627, + "num_input_tokens_seen": 37380864, + "step": 19094 + }, + { + "epoch": 2.530815109343936, + "grad_norm": 1.010128140449524, + "learning_rate": 2.959899660736426e-07, + "loss": 0.0112, + "num_input_tokens_seen": 37383328, + "step": 19095 + }, + { + "epoch": 2.5309476474486416, + "grad_norm": 0.3424445390701294, + "learning_rate": 2.9582614205512586e-07, + "loss": 0.0024, + "num_input_tokens_seen": 37385664, + "step": 19096 + }, + { + "epoch": 2.5310801855533467, + "grad_norm": 0.008097711019217968, + "learning_rate": 2.956623605346018e-07, + "loss": 0.0, + "num_input_tokens_seen": 37387032, + "step": 19097 + }, + { + "epoch": 2.5312127236580517, + "grad_norm": 1.4307278394699097, + "learning_rate": 2.9549862151522806e-07, + "loss": 0.0138, + "num_input_tokens_seen": 37388592, + "step": 19098 + }, + { + "epoch": 2.5313452617627568, + "grad_norm": 7.536437034606934, + "learning_rate": 2.953349250001611e-07, + "loss": 0.0673, + "num_input_tokens_seen": 37390272, + "step": 19099 + }, + { + "epoch": 2.531477799867462, + "grad_norm": 0.01898566447198391, + "learning_rate": 2.951712709925578e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37391488, + "step": 19100 + }, + { + "epoch": 2.531610337972167, + "grad_norm": 7.185465335845947, + "learning_rate": 2.950076594955739e-07, + "loss": 0.0442, + "num_input_tokens_seen": 37393144, + "step": 19101 + }, + { + "epoch": 2.531742876076872, + "grad_norm": 1.1171246767044067, + "learning_rate": 2.948440905123634e-07, + "loss": 0.0093, + "num_input_tokens_seen": 37395672, + "step": 19102 + }, + { + "epoch": 2.5318754141815774, + "grad_norm": 11.81508731842041, + "learning_rate": 2.9468056404608036e-07, + "loss": 0.1703, + "num_input_tokens_seen": 37398368, + "step": 19103 + }, + { + "epoch": 2.5320079522862824, + "grad_norm": 0.059792373329401016, + "learning_rate": 2.9451708009987727e-07, + "loss": 0.0003, + "num_input_tokens_seen": 37399856, + "step": 19104 + }, + { + "epoch": 2.5321404903909874, + "grad_norm": 1.567510724067688, + "learning_rate": 2.943536386769058e-07, + "loss": 0.0066, + "num_input_tokens_seen": 37401712, + "step": 19105 + }, + { + "epoch": 2.5322730284956925, + "grad_norm": 8.388509750366211, + "learning_rate": 2.9419023978031786e-07, + "loss": 0.0448, + "num_input_tokens_seen": 37404096, + "step": 19106 + }, + { + "epoch": 2.5324055666003975, + "grad_norm": 0.009316482581198215, + "learning_rate": 2.9402688341326427e-07, + "loss": 0.0, + "num_input_tokens_seen": 37405544, + "step": 19107 + }, + { + "epoch": 2.532538104705103, + "grad_norm": 10.814212799072266, + "learning_rate": 2.938635695788941e-07, + "loss": 0.1891, + "num_input_tokens_seen": 37408576, + "step": 19108 + }, + { + "epoch": 2.5326706428098076, + "grad_norm": 0.028352294117212296, + "learning_rate": 2.937002982803563e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37410000, + "step": 19109 + }, + { + "epoch": 2.532803180914513, + "grad_norm": 5.990534782409668, + "learning_rate": 2.9353706952079813e-07, + "loss": 0.0394, + "num_input_tokens_seen": 37412032, + "step": 19110 + }, + { + "epoch": 2.532935719019218, + "grad_norm": 0.01575646921992302, + "learning_rate": 2.933738833033678e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37414208, + "step": 19111 + }, + { + "epoch": 2.533068257123923, + "grad_norm": 0.0018914544489234686, + "learning_rate": 2.9321073963121144e-07, + "loss": 0.0, + "num_input_tokens_seen": 37415752, + "step": 19112 + }, + { + "epoch": 2.533200795228628, + "grad_norm": 0.18259583413600922, + "learning_rate": 2.930476385074746e-07, + "loss": 0.0005, + "num_input_tokens_seen": 37418072, + "step": 19113 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 2.512240409851074, + "learning_rate": 2.9288457993530167e-07, + "loss": 0.0141, + "num_input_tokens_seen": 37420112, + "step": 19114 + }, + { + "epoch": 2.5334658714380387, + "grad_norm": 0.07118674367666245, + "learning_rate": 2.9272156391783627e-07, + "loss": 0.0004, + "num_input_tokens_seen": 37421936, + "step": 19115 + }, + { + "epoch": 2.5335984095427433, + "grad_norm": 5.437435626983643, + "learning_rate": 2.925585904582223e-07, + "loss": 0.0513, + "num_input_tokens_seen": 37424344, + "step": 19116 + }, + { + "epoch": 2.5337309476474488, + "grad_norm": 0.007428389973938465, + "learning_rate": 2.9239565955960113e-07, + "loss": 0.0, + "num_input_tokens_seen": 37426064, + "step": 19117 + }, + { + "epoch": 2.533863485752154, + "grad_norm": 0.5026679635047913, + "learning_rate": 2.9223277122511495e-07, + "loss": 0.0025, + "num_input_tokens_seen": 37427808, + "step": 19118 + }, + { + "epoch": 2.533996023856859, + "grad_norm": 26.859704971313477, + "learning_rate": 2.9206992545790404e-07, + "loss": 0.2483, + "num_input_tokens_seen": 37429256, + "step": 19119 + }, + { + "epoch": 2.534128561961564, + "grad_norm": 4.2761359214782715, + "learning_rate": 2.919071222611075e-07, + "loss": 0.0176, + "num_input_tokens_seen": 37431728, + "step": 19120 + }, + { + "epoch": 2.534261100066269, + "grad_norm": 0.01922869123518467, + "learning_rate": 2.9174436163786566e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37433736, + "step": 19121 + }, + { + "epoch": 2.5343936381709744, + "grad_norm": 6.303127288818359, + "learning_rate": 2.9158164359131576e-07, + "loss": 0.0389, + "num_input_tokens_seen": 37435648, + "step": 19122 + }, + { + "epoch": 2.534526176275679, + "grad_norm": 2.154287099838257, + "learning_rate": 2.914189681245949e-07, + "loss": 0.0106, + "num_input_tokens_seen": 37437536, + "step": 19123 + }, + { + "epoch": 2.5346587143803845, + "grad_norm": 3.551905870437622, + "learning_rate": 2.912563352408404e-07, + "loss": 0.0127, + "num_input_tokens_seen": 37439288, + "step": 19124 + }, + { + "epoch": 2.5347912524850895, + "grad_norm": 2.208467483520508, + "learning_rate": 2.9109374494318693e-07, + "loss": 0.004, + "num_input_tokens_seen": 37440800, + "step": 19125 + }, + { + "epoch": 2.5349237905897946, + "grad_norm": 0.48554596304893494, + "learning_rate": 2.909311972347706e-07, + "loss": 0.0014, + "num_input_tokens_seen": 37442280, + "step": 19126 + }, + { + "epoch": 2.5350563286944996, + "grad_norm": 2.0004382133483887, + "learning_rate": 2.9076869211872445e-07, + "loss": 0.0111, + "num_input_tokens_seen": 37444304, + "step": 19127 + }, + { + "epoch": 2.5351888667992046, + "grad_norm": 0.005225060507655144, + "learning_rate": 2.9060622959818236e-07, + "loss": 0.0, + "num_input_tokens_seen": 37445880, + "step": 19128 + }, + { + "epoch": 2.53532140490391, + "grad_norm": 0.013014730997383595, + "learning_rate": 2.9044380967627544e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37448640, + "step": 19129 + }, + { + "epoch": 2.5354539430086147, + "grad_norm": 0.022811781615018845, + "learning_rate": 2.9028143235613705e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37450856, + "step": 19130 + }, + { + "epoch": 2.53558648111332, + "grad_norm": 9.4717435836792, + "learning_rate": 2.901190976408963e-07, + "loss": 0.1386, + "num_input_tokens_seen": 37453392, + "step": 19131 + }, + { + "epoch": 2.5357190192180252, + "grad_norm": 1.8154377937316895, + "learning_rate": 2.899568055336843e-07, + "loss": 0.0065, + "num_input_tokens_seen": 37455024, + "step": 19132 + }, + { + "epoch": 2.5358515573227303, + "grad_norm": 0.00735109718516469, + "learning_rate": 2.8979455603763003e-07, + "loss": 0.0, + "num_input_tokens_seen": 37456448, + "step": 19133 + }, + { + "epoch": 2.5359840954274353, + "grad_norm": 10.321700096130371, + "learning_rate": 2.8963234915586113e-07, + "loss": 0.0898, + "num_input_tokens_seen": 37458568, + "step": 19134 + }, + { + "epoch": 2.5361166335321403, + "grad_norm": 10.98218822479248, + "learning_rate": 2.894701848915052e-07, + "loss": 0.0311, + "num_input_tokens_seen": 37460848, + "step": 19135 + }, + { + "epoch": 2.536249171636846, + "grad_norm": 3.5249624252319336, + "learning_rate": 2.8930806324768885e-07, + "loss": 0.0074, + "num_input_tokens_seen": 37462752, + "step": 19136 + }, + { + "epoch": 2.536381709741551, + "grad_norm": 0.13867440819740295, + "learning_rate": 2.8914598422753873e-07, + "loss": 0.0004, + "num_input_tokens_seen": 37464688, + "step": 19137 + }, + { + "epoch": 2.536514247846256, + "grad_norm": 0.02282278798520565, + "learning_rate": 2.889839478341791e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37466408, + "step": 19138 + }, + { + "epoch": 2.536646785950961, + "grad_norm": 4.181307315826416, + "learning_rate": 2.888219540707343e-07, + "loss": 0.0329, + "num_input_tokens_seen": 37468176, + "step": 19139 + }, + { + "epoch": 2.536779324055666, + "grad_norm": 3.544297456741333, + "learning_rate": 2.886600029403272e-07, + "loss": 0.0227, + "num_input_tokens_seen": 37470600, + "step": 19140 + }, + { + "epoch": 2.536911862160371, + "grad_norm": 6.721055507659912, + "learning_rate": 2.884980944460808e-07, + "loss": 0.0465, + "num_input_tokens_seen": 37473480, + "step": 19141 + }, + { + "epoch": 2.537044400265076, + "grad_norm": 4.9972991943359375, + "learning_rate": 2.883362285911173e-07, + "loss": 0.0364, + "num_input_tokens_seen": 37475728, + "step": 19142 + }, + { + "epoch": 2.5371769383697815, + "grad_norm": 0.02297527901828289, + "learning_rate": 2.881744053785571e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37477552, + "step": 19143 + }, + { + "epoch": 2.5373094764744866, + "grad_norm": 0.00940045528113842, + "learning_rate": 2.8801262481152e-07, + "loss": 0.0, + "num_input_tokens_seen": 37478872, + "step": 19144 + }, + { + "epoch": 2.5374420145791916, + "grad_norm": 2.807574987411499, + "learning_rate": 2.878508868931254e-07, + "loss": 0.0108, + "num_input_tokens_seen": 37481128, + "step": 19145 + }, + { + "epoch": 2.5375745526838966, + "grad_norm": 19.909229278564453, + "learning_rate": 2.876891916264915e-07, + "loss": 0.112, + "num_input_tokens_seen": 37482936, + "step": 19146 + }, + { + "epoch": 2.5377070907886017, + "grad_norm": 1.5621196031570435, + "learning_rate": 2.875275390147361e-07, + "loss": 0.0133, + "num_input_tokens_seen": 37484656, + "step": 19147 + }, + { + "epoch": 2.5378396288933067, + "grad_norm": 0.06808166950941086, + "learning_rate": 2.8736592906097675e-07, + "loss": 0.0004, + "num_input_tokens_seen": 37486272, + "step": 19148 + }, + { + "epoch": 2.5379721669980118, + "grad_norm": 0.019649337977170944, + "learning_rate": 2.8720436176832847e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37487888, + "step": 19149 + }, + { + "epoch": 2.5381047051027172, + "grad_norm": 0.00863607693463564, + "learning_rate": 2.8704283713990674e-07, + "loss": 0.0, + "num_input_tokens_seen": 37489224, + "step": 19150 + }, + { + "epoch": 2.5382372432074223, + "grad_norm": 18.46916389465332, + "learning_rate": 2.868813551788252e-07, + "loss": 0.125, + "num_input_tokens_seen": 37491152, + "step": 19151 + }, + { + "epoch": 2.5383697813121273, + "grad_norm": 6.598438739776611, + "learning_rate": 2.8671991588819835e-07, + "loss": 0.0706, + "num_input_tokens_seen": 37493456, + "step": 19152 + }, + { + "epoch": 2.5385023194168324, + "grad_norm": 2.5307908058166504, + "learning_rate": 2.8655851927113786e-07, + "loss": 0.0082, + "num_input_tokens_seen": 37495768, + "step": 19153 + }, + { + "epoch": 2.5386348575215374, + "grad_norm": 4.513319492340088, + "learning_rate": 2.863971653307568e-07, + "loss": 0.0237, + "num_input_tokens_seen": 37497352, + "step": 19154 + }, + { + "epoch": 2.5387673956262424, + "grad_norm": 0.7258305549621582, + "learning_rate": 2.862358540701654e-07, + "loss": 0.0036, + "num_input_tokens_seen": 37499752, + "step": 19155 + }, + { + "epoch": 2.5388999337309475, + "grad_norm": 0.010591251775622368, + "learning_rate": 2.860745854924737e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37501384, + "step": 19156 + }, + { + "epoch": 2.539032471835653, + "grad_norm": 0.1885674148797989, + "learning_rate": 2.85913359600791e-07, + "loss": 0.0006, + "num_input_tokens_seen": 37503320, + "step": 19157 + }, + { + "epoch": 2.539165009940358, + "grad_norm": 0.00333246192894876, + "learning_rate": 2.8575217639822686e-07, + "loss": 0.0, + "num_input_tokens_seen": 37505264, + "step": 19158 + }, + { + "epoch": 2.539297548045063, + "grad_norm": 5.3531341552734375, + "learning_rate": 2.8559103588788746e-07, + "loss": 0.0105, + "num_input_tokens_seen": 37506536, + "step": 19159 + }, + { + "epoch": 2.539430086149768, + "grad_norm": 10.839872360229492, + "learning_rate": 2.8542993807288117e-07, + "loss": 0.2191, + "num_input_tokens_seen": 37508328, + "step": 19160 + }, + { + "epoch": 2.539562624254473, + "grad_norm": 0.3431721031665802, + "learning_rate": 2.8526888295631325e-07, + "loss": 0.0014, + "num_input_tokens_seen": 37509992, + "step": 19161 + }, + { + "epoch": 2.539695162359178, + "grad_norm": 2.787536859512329, + "learning_rate": 2.851078705412888e-07, + "loss": 0.0232, + "num_input_tokens_seen": 37512272, + "step": 19162 + }, + { + "epoch": 2.539827700463883, + "grad_norm": 7.519652366638184, + "learning_rate": 2.84946900830913e-07, + "loss": 0.077, + "num_input_tokens_seen": 37514504, + "step": 19163 + }, + { + "epoch": 2.5399602385685887, + "grad_norm": 0.25927790999412537, + "learning_rate": 2.847859738282885e-07, + "loss": 0.0008, + "num_input_tokens_seen": 37516008, + "step": 19164 + }, + { + "epoch": 2.5400927766732937, + "grad_norm": 2.9976625442504883, + "learning_rate": 2.8462508953651935e-07, + "loss": 0.0101, + "num_input_tokens_seen": 37518776, + "step": 19165 + }, + { + "epoch": 2.5402253147779987, + "grad_norm": 0.07422026246786118, + "learning_rate": 2.8446424795870677e-07, + "loss": 0.0003, + "num_input_tokens_seen": 37521936, + "step": 19166 + }, + { + "epoch": 2.5403578528827038, + "grad_norm": 0.12871003150939941, + "learning_rate": 2.84303449097951e-07, + "loss": 0.0007, + "num_input_tokens_seen": 37524424, + "step": 19167 + }, + { + "epoch": 2.540490390987409, + "grad_norm": 1.0708467960357666, + "learning_rate": 2.841426929573543e-07, + "loss": 0.004, + "num_input_tokens_seen": 37527248, + "step": 19168 + }, + { + "epoch": 2.540622929092114, + "grad_norm": 7.618175983428955, + "learning_rate": 2.8398197954001475e-07, + "loss": 0.0963, + "num_input_tokens_seen": 37529184, + "step": 19169 + }, + { + "epoch": 2.540755467196819, + "grad_norm": 1.7641981840133667, + "learning_rate": 2.8382130884903103e-07, + "loss": 0.0047, + "num_input_tokens_seen": 37530952, + "step": 19170 + }, + { + "epoch": 2.5408880053015244, + "grad_norm": 0.013728641904890537, + "learning_rate": 2.836606808875017e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37533264, + "step": 19171 + }, + { + "epoch": 2.5410205434062294, + "grad_norm": 2.108384132385254, + "learning_rate": 2.835000956585232e-07, + "loss": 0.0447, + "num_input_tokens_seen": 37534880, + "step": 19172 + }, + { + "epoch": 2.5411530815109344, + "grad_norm": 0.001985525246709585, + "learning_rate": 2.8333955316519224e-07, + "loss": 0.0, + "num_input_tokens_seen": 37535952, + "step": 19173 + }, + { + "epoch": 2.5412856196156395, + "grad_norm": 27.595172882080078, + "learning_rate": 2.831790534106038e-07, + "loss": 0.4104, + "num_input_tokens_seen": 37538040, + "step": 19174 + }, + { + "epoch": 2.5414181577203445, + "grad_norm": 5.20789098739624, + "learning_rate": 2.830185963978527e-07, + "loss": 0.076, + "num_input_tokens_seen": 37539872, + "step": 19175 + }, + { + "epoch": 2.5415506958250496, + "grad_norm": 0.09321755170822144, + "learning_rate": 2.828581821300319e-07, + "loss": 0.0004, + "num_input_tokens_seen": 37540960, + "step": 19176 + }, + { + "epoch": 2.5416832339297546, + "grad_norm": 0.021792927756905556, + "learning_rate": 2.826978106102349e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37542400, + "step": 19177 + }, + { + "epoch": 2.54181577203446, + "grad_norm": 0.14601649343967438, + "learning_rate": 2.825374818415541e-07, + "loss": 0.0003, + "num_input_tokens_seen": 37544496, + "step": 19178 + }, + { + "epoch": 2.541948310139165, + "grad_norm": 1.7685610055923462, + "learning_rate": 2.823771958270807e-07, + "loss": 0.0087, + "num_input_tokens_seen": 37546704, + "step": 19179 + }, + { + "epoch": 2.54208084824387, + "grad_norm": 0.031205739825963974, + "learning_rate": 2.822169525699045e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37548736, + "step": 19180 + }, + { + "epoch": 2.542213386348575, + "grad_norm": 0.000769249745644629, + "learning_rate": 2.820567520731157e-07, + "loss": 0.0, + "num_input_tokens_seen": 37550016, + "step": 19181 + }, + { + "epoch": 2.5423459244532802, + "grad_norm": 0.19897986948490143, + "learning_rate": 2.8189659433980223e-07, + "loss": 0.0009, + "num_input_tokens_seen": 37551648, + "step": 19182 + }, + { + "epoch": 2.5424784625579853, + "grad_norm": 0.1513667106628418, + "learning_rate": 2.817364793730526e-07, + "loss": 0.0008, + "num_input_tokens_seen": 37554000, + "step": 19183 + }, + { + "epoch": 2.5426110006626903, + "grad_norm": 0.0219032634049654, + "learning_rate": 2.815764071759544e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37556240, + "step": 19184 + }, + { + "epoch": 2.542743538767396, + "grad_norm": 0.0010836496949195862, + "learning_rate": 2.8141637775159327e-07, + "loss": 0.0, + "num_input_tokens_seen": 37557768, + "step": 19185 + }, + { + "epoch": 2.542876076872101, + "grad_norm": 0.5151396989822388, + "learning_rate": 2.8125639110305527e-07, + "loss": 0.0013, + "num_input_tokens_seen": 37559384, + "step": 19186 + }, + { + "epoch": 2.543008614976806, + "grad_norm": 0.008022317662835121, + "learning_rate": 2.810964472334246e-07, + "loss": 0.0, + "num_input_tokens_seen": 37560672, + "step": 19187 + }, + { + "epoch": 2.543141153081511, + "grad_norm": 1.752504587173462, + "learning_rate": 2.809365461457844e-07, + "loss": 0.0113, + "num_input_tokens_seen": 37563432, + "step": 19188 + }, + { + "epoch": 2.543273691186216, + "grad_norm": 10.425017356872559, + "learning_rate": 2.8077668784321887e-07, + "loss": 0.1909, + "num_input_tokens_seen": 37564616, + "step": 19189 + }, + { + "epoch": 2.5434062292909214, + "grad_norm": 0.015368139371275902, + "learning_rate": 2.8061687232880995e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37566272, + "step": 19190 + }, + { + "epoch": 2.543538767395626, + "grad_norm": 0.08149521052837372, + "learning_rate": 2.8045709960563885e-07, + "loss": 0.0006, + "num_input_tokens_seen": 37568368, + "step": 19191 + }, + { + "epoch": 2.5436713055003315, + "grad_norm": 2.3762598037719727, + "learning_rate": 2.802973696767861e-07, + "loss": 0.0089, + "num_input_tokens_seen": 37570648, + "step": 19192 + }, + { + "epoch": 2.5438038436050365, + "grad_norm": 3.361557722091675, + "learning_rate": 2.8013768254533085e-07, + "loss": 0.0282, + "num_input_tokens_seen": 37572640, + "step": 19193 + }, + { + "epoch": 2.5439363817097416, + "grad_norm": 0.05279209092259407, + "learning_rate": 2.799780382143524e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37574104, + "step": 19194 + }, + { + "epoch": 2.5440689198144466, + "grad_norm": 0.0015046474291011691, + "learning_rate": 2.798184366869297e-07, + "loss": 0.0, + "num_input_tokens_seen": 37576128, + "step": 19195 + }, + { + "epoch": 2.5442014579191516, + "grad_norm": 0.7899875044822693, + "learning_rate": 2.796588779661388e-07, + "loss": 0.0041, + "num_input_tokens_seen": 37577696, + "step": 19196 + }, + { + "epoch": 2.544333996023857, + "grad_norm": 1.9544918537139893, + "learning_rate": 2.7949936205505677e-07, + "loss": 0.0093, + "num_input_tokens_seen": 37579336, + "step": 19197 + }, + { + "epoch": 2.5444665341285617, + "grad_norm": 17.732507705688477, + "learning_rate": 2.7933988895675805e-07, + "loss": 0.1545, + "num_input_tokens_seen": 37581248, + "step": 19198 + }, + { + "epoch": 2.544599072233267, + "grad_norm": 3.6685752868652344, + "learning_rate": 2.791804586743191e-07, + "loss": 0.026, + "num_input_tokens_seen": 37582728, + "step": 19199 + }, + { + "epoch": 2.5447316103379722, + "grad_norm": 5.845583915710449, + "learning_rate": 2.7902107121081214e-07, + "loss": 0.0483, + "num_input_tokens_seen": 37584496, + "step": 19200 + }, + { + "epoch": 2.5448641484426773, + "grad_norm": 0.007427958771586418, + "learning_rate": 2.788617265693119e-07, + "loss": 0.0, + "num_input_tokens_seen": 37585664, + "step": 19201 + }, + { + "epoch": 2.5449966865473823, + "grad_norm": 10.987776756286621, + "learning_rate": 2.7870242475288966e-07, + "loss": 0.0613, + "num_input_tokens_seen": 37587448, + "step": 19202 + }, + { + "epoch": 2.5451292246520874, + "grad_norm": 7.798466205596924, + "learning_rate": 2.785431657646165e-07, + "loss": 0.0642, + "num_input_tokens_seen": 37589720, + "step": 19203 + }, + { + "epoch": 2.545261762756793, + "grad_norm": 1.2548589706420898, + "learning_rate": 2.7838394960756436e-07, + "loss": 0.0057, + "num_input_tokens_seen": 37591840, + "step": 19204 + }, + { + "epoch": 2.5453943008614974, + "grad_norm": 0.06541841477155685, + "learning_rate": 2.78224776284802e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37593656, + "step": 19205 + }, + { + "epoch": 2.545526838966203, + "grad_norm": 0.19835114479064941, + "learning_rate": 2.780656457993983e-07, + "loss": 0.0008, + "num_input_tokens_seen": 37595064, + "step": 19206 + }, + { + "epoch": 2.545659377070908, + "grad_norm": 0.21120081841945648, + "learning_rate": 2.779065581544224e-07, + "loss": 0.0008, + "num_input_tokens_seen": 37596536, + "step": 19207 + }, + { + "epoch": 2.545791915175613, + "grad_norm": 0.15744860470294952, + "learning_rate": 2.777475133529403e-07, + "loss": 0.001, + "num_input_tokens_seen": 37598024, + "step": 19208 + }, + { + "epoch": 2.545924453280318, + "grad_norm": 0.012701173312962055, + "learning_rate": 2.775885113980198e-07, + "loss": 0.0, + "num_input_tokens_seen": 37599512, + "step": 19209 + }, + { + "epoch": 2.546056991385023, + "grad_norm": 12.160787582397461, + "learning_rate": 2.774295522927259e-07, + "loss": 0.0622, + "num_input_tokens_seen": 37601528, + "step": 19210 + }, + { + "epoch": 2.5461895294897285, + "grad_norm": 0.011698375456035137, + "learning_rate": 2.7727063604012366e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37603816, + "step": 19211 + }, + { + "epoch": 2.546322067594433, + "grad_norm": 0.06989102810621262, + "learning_rate": 2.771117626432762e-07, + "loss": 0.0003, + "num_input_tokens_seen": 37605976, + "step": 19212 + }, + { + "epoch": 2.5464546056991386, + "grad_norm": 0.026223503053188324, + "learning_rate": 2.7695293210524834e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37608008, + "step": 19213 + }, + { + "epoch": 2.5465871438038437, + "grad_norm": 0.6472287178039551, + "learning_rate": 2.7679414442910057e-07, + "loss": 0.0012, + "num_input_tokens_seen": 37610128, + "step": 19214 + }, + { + "epoch": 2.5467196819085487, + "grad_norm": 0.025056691840291023, + "learning_rate": 2.7663539961789635e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37611824, + "step": 19215 + }, + { + "epoch": 2.5468522200132537, + "grad_norm": 0.002045080531388521, + "learning_rate": 2.764766976746952e-07, + "loss": 0.0, + "num_input_tokens_seen": 37613272, + "step": 19216 + }, + { + "epoch": 2.5469847581179588, + "grad_norm": 5.3993048667907715, + "learning_rate": 2.763180386025566e-07, + "loss": 0.0658, + "num_input_tokens_seen": 37615424, + "step": 19217 + }, + { + "epoch": 2.5471172962226643, + "grad_norm": 5.393725395202637, + "learning_rate": 2.7615942240454106e-07, + "loss": 0.066, + "num_input_tokens_seen": 37617560, + "step": 19218 + }, + { + "epoch": 2.547249834327369, + "grad_norm": 5.836322784423828, + "learning_rate": 2.7600084908370543e-07, + "loss": 0.067, + "num_input_tokens_seen": 37619296, + "step": 19219 + }, + { + "epoch": 2.5473823724320743, + "grad_norm": 0.04517173394560814, + "learning_rate": 2.758423186431081e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37620496, + "step": 19220 + }, + { + "epoch": 2.5475149105367794, + "grad_norm": 2.786724090576172, + "learning_rate": 2.7568383108580536e-07, + "loss": 0.0093, + "num_input_tokens_seen": 37621584, + "step": 19221 + }, + { + "epoch": 2.5476474486414844, + "grad_norm": 0.001452822471037507, + "learning_rate": 2.7552538641485273e-07, + "loss": 0.0, + "num_input_tokens_seen": 37622824, + "step": 19222 + }, + { + "epoch": 2.5477799867461894, + "grad_norm": 0.10762270539999008, + "learning_rate": 2.753669846333046e-07, + "loss": 0.0005, + "num_input_tokens_seen": 37625176, + "step": 19223 + }, + { + "epoch": 2.5479125248508945, + "grad_norm": 0.41375023126602173, + "learning_rate": 2.7520862574421604e-07, + "loss": 0.0023, + "num_input_tokens_seen": 37626936, + "step": 19224 + }, + { + "epoch": 2.5480450629556, + "grad_norm": 9.824305534362793, + "learning_rate": 2.750503097506402e-07, + "loss": 0.0662, + "num_input_tokens_seen": 37628824, + "step": 19225 + }, + { + "epoch": 2.548177601060305, + "grad_norm": 0.004199706017971039, + "learning_rate": 2.748920366556293e-07, + "loss": 0.0, + "num_input_tokens_seen": 37630120, + "step": 19226 + }, + { + "epoch": 2.54831013916501, + "grad_norm": 2.5526397228240967, + "learning_rate": 2.747338064622351e-07, + "loss": 0.0071, + "num_input_tokens_seen": 37631688, + "step": 19227 + }, + { + "epoch": 2.548442677269715, + "grad_norm": 5.496508598327637, + "learning_rate": 2.745756191735083e-07, + "loss": 0.1582, + "num_input_tokens_seen": 37635056, + "step": 19228 + }, + { + "epoch": 2.54857521537442, + "grad_norm": 4.5125250816345215, + "learning_rate": 2.7441747479249825e-07, + "loss": 0.0912, + "num_input_tokens_seen": 37636280, + "step": 19229 + }, + { + "epoch": 2.548707753479125, + "grad_norm": 0.17312651872634888, + "learning_rate": 2.7425937332225456e-07, + "loss": 0.0008, + "num_input_tokens_seen": 37637968, + "step": 19230 + }, + { + "epoch": 2.54884029158383, + "grad_norm": 0.029593251645565033, + "learning_rate": 2.7410131476582615e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37640968, + "step": 19231 + }, + { + "epoch": 2.5489728296885357, + "grad_norm": 0.5128489136695862, + "learning_rate": 2.739432991262597e-07, + "loss": 0.003, + "num_input_tokens_seen": 37642224, + "step": 19232 + }, + { + "epoch": 2.5491053677932407, + "grad_norm": 0.8938080072402954, + "learning_rate": 2.737853264066026e-07, + "loss": 0.0016, + "num_input_tokens_seen": 37644024, + "step": 19233 + }, + { + "epoch": 2.5492379058979457, + "grad_norm": 7.7527055740356445, + "learning_rate": 2.7362739660989927e-07, + "loss": 0.118, + "num_input_tokens_seen": 37646248, + "step": 19234 + }, + { + "epoch": 2.549370444002651, + "grad_norm": 3.9511706829071045, + "learning_rate": 2.734695097391965e-07, + "loss": 0.0244, + "num_input_tokens_seen": 37649048, + "step": 19235 + }, + { + "epoch": 2.549502982107356, + "grad_norm": 0.12787789106369019, + "learning_rate": 2.733116657975368e-07, + "loss": 0.0007, + "num_input_tokens_seen": 37651672, + "step": 19236 + }, + { + "epoch": 2.549635520212061, + "grad_norm": 22.463727951049805, + "learning_rate": 2.73153864787965e-07, + "loss": 0.5195, + "num_input_tokens_seen": 37652952, + "step": 19237 + }, + { + "epoch": 2.549768058316766, + "grad_norm": 2.1905441284179688, + "learning_rate": 2.7299610671352285e-07, + "loss": 0.0137, + "num_input_tokens_seen": 37655040, + "step": 19238 + }, + { + "epoch": 2.5499005964214714, + "grad_norm": 0.0384017750620842, + "learning_rate": 2.7283839157725146e-07, + "loss": 0.0003, + "num_input_tokens_seen": 37657272, + "step": 19239 + }, + { + "epoch": 2.5500331345261764, + "grad_norm": 11.75296401977539, + "learning_rate": 2.7268071938219285e-07, + "loss": 0.1115, + "num_input_tokens_seen": 37659136, + "step": 19240 + }, + { + "epoch": 2.5501656726308815, + "grad_norm": 0.004305411130189896, + "learning_rate": 2.725230901313866e-07, + "loss": 0.0, + "num_input_tokens_seen": 37660808, + "step": 19241 + }, + { + "epoch": 2.5502982107355865, + "grad_norm": 6.9022979736328125, + "learning_rate": 2.7236550382787136e-07, + "loss": 0.1004, + "num_input_tokens_seen": 37663432, + "step": 19242 + }, + { + "epoch": 2.5504307488402915, + "grad_norm": 0.026946282014250755, + "learning_rate": 2.722079604746866e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37665104, + "step": 19243 + }, + { + "epoch": 2.5505632869449966, + "grad_norm": 3.5444226264953613, + "learning_rate": 2.7205046007486885e-07, + "loss": 0.0271, + "num_input_tokens_seen": 37666704, + "step": 19244 + }, + { + "epoch": 2.5506958250497016, + "grad_norm": 0.08248116075992584, + "learning_rate": 2.718930026314551e-07, + "loss": 0.0004, + "num_input_tokens_seen": 37668936, + "step": 19245 + }, + { + "epoch": 2.550828363154407, + "grad_norm": 0.06398060917854309, + "learning_rate": 2.7173558814748173e-07, + "loss": 0.0004, + "num_input_tokens_seen": 37671464, + "step": 19246 + }, + { + "epoch": 2.550960901259112, + "grad_norm": 0.013547554612159729, + "learning_rate": 2.7157821662598285e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37673160, + "step": 19247 + }, + { + "epoch": 2.551093439363817, + "grad_norm": 0.012475227005779743, + "learning_rate": 2.714208880699937e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37675464, + "step": 19248 + }, + { + "epoch": 2.551225977468522, + "grad_norm": 0.47060561180114746, + "learning_rate": 2.712636024825474e-07, + "loss": 0.0012, + "num_input_tokens_seen": 37676688, + "step": 19249 + }, + { + "epoch": 2.5513585155732272, + "grad_norm": 0.4960634708404541, + "learning_rate": 2.7110635986667577e-07, + "loss": 0.0018, + "num_input_tokens_seen": 37678536, + "step": 19250 + }, + { + "epoch": 2.5514910536779323, + "grad_norm": 7.8419976234436035, + "learning_rate": 2.709491602254116e-07, + "loss": 0.0581, + "num_input_tokens_seen": 37681312, + "step": 19251 + }, + { + "epoch": 2.5516235917826373, + "grad_norm": 1.3166944980621338, + "learning_rate": 2.7079200356178557e-07, + "loss": 0.0059, + "num_input_tokens_seen": 37683376, + "step": 19252 + }, + { + "epoch": 2.551756129887343, + "grad_norm": 0.46660521626472473, + "learning_rate": 2.706348898788269e-07, + "loss": 0.0003, + "num_input_tokens_seen": 37685448, + "step": 19253 + }, + { + "epoch": 2.551888667992048, + "grad_norm": 0.11908669769763947, + "learning_rate": 2.704778191795662e-07, + "loss": 0.0008, + "num_input_tokens_seen": 37687192, + "step": 19254 + }, + { + "epoch": 2.552021206096753, + "grad_norm": 0.3663778305053711, + "learning_rate": 2.7032079146703053e-07, + "loss": 0.002, + "num_input_tokens_seen": 37690504, + "step": 19255 + }, + { + "epoch": 2.552153744201458, + "grad_norm": 1.0788378715515137, + "learning_rate": 2.701638067442486e-07, + "loss": 0.0062, + "num_input_tokens_seen": 37692328, + "step": 19256 + }, + { + "epoch": 2.552286282306163, + "grad_norm": 0.003124698530882597, + "learning_rate": 2.7000686501424707e-07, + "loss": 0.0, + "num_input_tokens_seen": 37694048, + "step": 19257 + }, + { + "epoch": 2.552418820410868, + "grad_norm": 6.319106578826904, + "learning_rate": 2.6984996628005136e-07, + "loss": 0.0684, + "num_input_tokens_seen": 37696088, + "step": 19258 + }, + { + "epoch": 2.552551358515573, + "grad_norm": 0.9498832821846008, + "learning_rate": 2.696931105446862e-07, + "loss": 0.0041, + "num_input_tokens_seen": 37698208, + "step": 19259 + }, + { + "epoch": 2.5526838966202785, + "grad_norm": 1.3055788278579712, + "learning_rate": 2.6953629781117654e-07, + "loss": 0.0058, + "num_input_tokens_seen": 37700072, + "step": 19260 + }, + { + "epoch": 2.5528164347249835, + "grad_norm": 0.001714140991680324, + "learning_rate": 2.6937952808254646e-07, + "loss": 0.0, + "num_input_tokens_seen": 37701912, + "step": 19261 + }, + { + "epoch": 2.5529489728296886, + "grad_norm": 0.014134910888969898, + "learning_rate": 2.692228013618178e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37703840, + "step": 19262 + }, + { + "epoch": 2.5530815109343936, + "grad_norm": 4.216078758239746, + "learning_rate": 2.6906611765201236e-07, + "loss": 0.0887, + "num_input_tokens_seen": 37705720, + "step": 19263 + }, + { + "epoch": 2.5532140490390987, + "grad_norm": 0.054867763072252274, + "learning_rate": 2.689094769561515e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37708144, + "step": 19264 + }, + { + "epoch": 2.5533465871438037, + "grad_norm": 0.5840003490447998, + "learning_rate": 2.6875287927725425e-07, + "loss": 0.0015, + "num_input_tokens_seen": 37709696, + "step": 19265 + }, + { + "epoch": 2.5534791252485087, + "grad_norm": 0.18417644500732422, + "learning_rate": 2.6859632461834095e-07, + "loss": 0.0011, + "num_input_tokens_seen": 37711880, + "step": 19266 + }, + { + "epoch": 2.553611663353214, + "grad_norm": 0.951231062412262, + "learning_rate": 2.684398129824303e-07, + "loss": 0.0043, + "num_input_tokens_seen": 37714328, + "step": 19267 + }, + { + "epoch": 2.5537442014579192, + "grad_norm": 0.010857123881578445, + "learning_rate": 2.682833443725394e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37716440, + "step": 19268 + }, + { + "epoch": 2.5538767395626243, + "grad_norm": 0.02390293963253498, + "learning_rate": 2.6812691879168543e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37718104, + "step": 19269 + }, + { + "epoch": 2.5540092776673293, + "grad_norm": 2.378444194793701, + "learning_rate": 2.679705362428836e-07, + "loss": 0.0064, + "num_input_tokens_seen": 37719952, + "step": 19270 + }, + { + "epoch": 2.5541418157720344, + "grad_norm": 0.001149016316048801, + "learning_rate": 2.678141967291495e-07, + "loss": 0.0, + "num_input_tokens_seen": 37720952, + "step": 19271 + }, + { + "epoch": 2.5542743538767394, + "grad_norm": 6.384291648864746, + "learning_rate": 2.676579002534982e-07, + "loss": 0.0276, + "num_input_tokens_seen": 37724000, + "step": 19272 + }, + { + "epoch": 2.5544068919814444, + "grad_norm": 0.007838216610252857, + "learning_rate": 2.675016468189426e-07, + "loss": 0.0, + "num_input_tokens_seen": 37725552, + "step": 19273 + }, + { + "epoch": 2.55453943008615, + "grad_norm": 0.004879423882812262, + "learning_rate": 2.6734543642849553e-07, + "loss": 0.0, + "num_input_tokens_seen": 37727176, + "step": 19274 + }, + { + "epoch": 2.554671968190855, + "grad_norm": 0.009679163806140423, + "learning_rate": 2.671892690851685e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37728360, + "step": 19275 + }, + { + "epoch": 2.55480450629556, + "grad_norm": 7.4938836097717285, + "learning_rate": 2.6703314479197215e-07, + "loss": 0.0764, + "num_input_tokens_seen": 37730544, + "step": 19276 + }, + { + "epoch": 2.554937044400265, + "grad_norm": 0.0029349392279982567, + "learning_rate": 2.6687706355191714e-07, + "loss": 0.0, + "num_input_tokens_seen": 37732768, + "step": 19277 + }, + { + "epoch": 2.55506958250497, + "grad_norm": 2.6308388710021973, + "learning_rate": 2.667210253680136e-07, + "loss": 0.0156, + "num_input_tokens_seen": 37736064, + "step": 19278 + }, + { + "epoch": 2.5552021206096756, + "grad_norm": 3.973207950592041, + "learning_rate": 2.665650302432693e-07, + "loss": 0.0439, + "num_input_tokens_seen": 37738784, + "step": 19279 + }, + { + "epoch": 2.55533465871438, + "grad_norm": 0.0034349444322288036, + "learning_rate": 2.664090781806919e-07, + "loss": 0.0, + "num_input_tokens_seen": 37740120, + "step": 19280 + }, + { + "epoch": 2.5554671968190856, + "grad_norm": 0.006652281153947115, + "learning_rate": 2.6625316918328787e-07, + "loss": 0.0, + "num_input_tokens_seen": 37742248, + "step": 19281 + }, + { + "epoch": 2.5555997349237907, + "grad_norm": 5.322209358215332, + "learning_rate": 2.6609730325406424e-07, + "loss": 0.0717, + "num_input_tokens_seen": 37744368, + "step": 19282 + }, + { + "epoch": 2.5557322730284957, + "grad_norm": 0.8250563740730286, + "learning_rate": 2.659414803960253e-07, + "loss": 0.003, + "num_input_tokens_seen": 37746600, + "step": 19283 + }, + { + "epoch": 2.5558648111332007, + "grad_norm": 6.899227619171143, + "learning_rate": 2.6578570061217614e-07, + "loss": 0.0696, + "num_input_tokens_seen": 37749136, + "step": 19284 + }, + { + "epoch": 2.555997349237906, + "grad_norm": 8.682537078857422, + "learning_rate": 2.6562996390552015e-07, + "loss": 0.0338, + "num_input_tokens_seen": 37752072, + "step": 19285 + }, + { + "epoch": 2.5561298873426113, + "grad_norm": 0.01274094358086586, + "learning_rate": 2.6547427027905944e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37753992, + "step": 19286 + }, + { + "epoch": 2.556262425447316, + "grad_norm": 0.006973768584430218, + "learning_rate": 2.653186197357965e-07, + "loss": 0.0, + "num_input_tokens_seen": 37756224, + "step": 19287 + }, + { + "epoch": 2.5563949635520213, + "grad_norm": 0.036773841828107834, + "learning_rate": 2.6516301227873244e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37759152, + "step": 19288 + }, + { + "epoch": 2.5565275016567264, + "grad_norm": 0.0697697252035141, + "learning_rate": 2.6500744791086664e-07, + "loss": 0.0004, + "num_input_tokens_seen": 37761504, + "step": 19289 + }, + { + "epoch": 2.5566600397614314, + "grad_norm": 0.18524616956710815, + "learning_rate": 2.6485192663519953e-07, + "loss": 0.0008, + "num_input_tokens_seen": 37763056, + "step": 19290 + }, + { + "epoch": 2.5567925778661365, + "grad_norm": 0.08111553639173508, + "learning_rate": 2.6469644845472875e-07, + "loss": 0.0005, + "num_input_tokens_seen": 37766024, + "step": 19291 + }, + { + "epoch": 2.5569251159708415, + "grad_norm": 3.277122974395752, + "learning_rate": 2.6454101337245323e-07, + "loss": 0.0051, + "num_input_tokens_seen": 37767952, + "step": 19292 + }, + { + "epoch": 2.557057654075547, + "grad_norm": 8.206999778747559, + "learning_rate": 2.64385621391369e-07, + "loss": 0.0636, + "num_input_tokens_seen": 37770304, + "step": 19293 + }, + { + "epoch": 2.5571901921802516, + "grad_norm": 0.022250914946198463, + "learning_rate": 2.6423027251447215e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37772840, + "step": 19294 + }, + { + "epoch": 2.557322730284957, + "grad_norm": 0.0005052585038356483, + "learning_rate": 2.6407496674475755e-07, + "loss": 0.0, + "num_input_tokens_seen": 37774176, + "step": 19295 + }, + { + "epoch": 2.557455268389662, + "grad_norm": 0.31731081008911133, + "learning_rate": 2.639197040852201e-07, + "loss": 0.0012, + "num_input_tokens_seen": 37777392, + "step": 19296 + }, + { + "epoch": 2.557587806494367, + "grad_norm": 15.8176851272583, + "learning_rate": 2.63764484538854e-07, + "loss": 0.239, + "num_input_tokens_seen": 37779096, + "step": 19297 + }, + { + "epoch": 2.557720344599072, + "grad_norm": 0.020382242277264595, + "learning_rate": 2.6360930810865103e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37781288, + "step": 19298 + }, + { + "epoch": 2.557852882703777, + "grad_norm": 0.5020246505737305, + "learning_rate": 2.6345417479760354e-07, + "loss": 0.0033, + "num_input_tokens_seen": 37782808, + "step": 19299 + }, + { + "epoch": 2.5579854208084827, + "grad_norm": 0.32342737913131714, + "learning_rate": 2.6329908460870217e-07, + "loss": 0.0017, + "num_input_tokens_seen": 37785024, + "step": 19300 + }, + { + "epoch": 2.5581179589131873, + "grad_norm": 3.78509521484375, + "learning_rate": 2.631440375449379e-07, + "loss": 0.0346, + "num_input_tokens_seen": 37786776, + "step": 19301 + }, + { + "epoch": 2.5582504970178928, + "grad_norm": 0.009793227538466454, + "learning_rate": 2.629890336092991e-07, + "loss": 0.0, + "num_input_tokens_seen": 37788256, + "step": 19302 + }, + { + "epoch": 2.558383035122598, + "grad_norm": 11.976445198059082, + "learning_rate": 2.628340728047754e-07, + "loss": 0.1692, + "num_input_tokens_seen": 37791280, + "step": 19303 + }, + { + "epoch": 2.558515573227303, + "grad_norm": 9.773712158203125, + "learning_rate": 2.626791551343541e-07, + "loss": 0.1128, + "num_input_tokens_seen": 37793368, + "step": 19304 + }, + { + "epoch": 2.558648111332008, + "grad_norm": 2.7494139671325684, + "learning_rate": 2.6252428060102225e-07, + "loss": 0.0332, + "num_input_tokens_seen": 37795424, + "step": 19305 + }, + { + "epoch": 2.558780649436713, + "grad_norm": 0.029653625562787056, + "learning_rate": 2.6236944920776553e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37796552, + "step": 19306 + }, + { + "epoch": 2.5589131875414184, + "grad_norm": 0.1921263039112091, + "learning_rate": 2.622146609575693e-07, + "loss": 0.0006, + "num_input_tokens_seen": 37797856, + "step": 19307 + }, + { + "epoch": 2.5590457256461234, + "grad_norm": 4.230957508087158, + "learning_rate": 2.620599158534184e-07, + "loss": 0.0135, + "num_input_tokens_seen": 37799976, + "step": 19308 + }, + { + "epoch": 2.5591782637508285, + "grad_norm": 3.679116725921631, + "learning_rate": 2.6190521389829665e-07, + "loss": 0.0151, + "num_input_tokens_seen": 37801536, + "step": 19309 + }, + { + "epoch": 2.5593108018555335, + "grad_norm": 0.023809203878045082, + "learning_rate": 2.6175055509518625e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37803616, + "step": 19310 + }, + { + "epoch": 2.5594433399602385, + "grad_norm": 0.1380164474248886, + "learning_rate": 2.6159593944706907e-07, + "loss": 0.0011, + "num_input_tokens_seen": 37805824, + "step": 19311 + }, + { + "epoch": 2.5595758780649436, + "grad_norm": 0.005310839507728815, + "learning_rate": 2.61441366956926e-07, + "loss": 0.0, + "num_input_tokens_seen": 37807304, + "step": 19312 + }, + { + "epoch": 2.5597084161696486, + "grad_norm": 10.6792631149292, + "learning_rate": 2.612868376277375e-07, + "loss": 0.0733, + "num_input_tokens_seen": 37809144, + "step": 19313 + }, + { + "epoch": 2.559840954274354, + "grad_norm": 19.786666870117188, + "learning_rate": 2.6113235146248387e-07, + "loss": 0.1312, + "num_input_tokens_seen": 37811504, + "step": 19314 + }, + { + "epoch": 2.559973492379059, + "grad_norm": 3.425522565841675, + "learning_rate": 2.609779084641431e-07, + "loss": 0.013, + "num_input_tokens_seen": 37813312, + "step": 19315 + }, + { + "epoch": 2.560106030483764, + "grad_norm": 0.09007898718118668, + "learning_rate": 2.608235086356925e-07, + "loss": 0.0005, + "num_input_tokens_seen": 37814888, + "step": 19316 + }, + { + "epoch": 2.560238568588469, + "grad_norm": 0.0021935717668384314, + "learning_rate": 2.606691519801094e-07, + "loss": 0.0, + "num_input_tokens_seen": 37815984, + "step": 19317 + }, + { + "epoch": 2.5603711066931742, + "grad_norm": 0.07366465032100677, + "learning_rate": 2.6051483850037e-07, + "loss": 0.0004, + "num_input_tokens_seen": 37818280, + "step": 19318 + }, + { + "epoch": 2.5605036447978793, + "grad_norm": 0.03161676228046417, + "learning_rate": 2.603605681994489e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37821072, + "step": 19319 + }, + { + "epoch": 2.5606361829025843, + "grad_norm": 0.27088600397109985, + "learning_rate": 2.6020634108032185e-07, + "loss": 0.0009, + "num_input_tokens_seen": 37822672, + "step": 19320 + }, + { + "epoch": 2.56076872100729, + "grad_norm": 3.1013357639312744, + "learning_rate": 2.600521571459616e-07, + "loss": 0.0144, + "num_input_tokens_seen": 37825648, + "step": 19321 + }, + { + "epoch": 2.560901259111995, + "grad_norm": 8.471653938293457, + "learning_rate": 2.598980163993406e-07, + "loss": 0.0646, + "num_input_tokens_seen": 37827592, + "step": 19322 + }, + { + "epoch": 2.5610337972167, + "grad_norm": 10.068063735961914, + "learning_rate": 2.597439188434317e-07, + "loss": 0.0369, + "num_input_tokens_seen": 37829448, + "step": 19323 + }, + { + "epoch": 2.561166335321405, + "grad_norm": 0.9850419759750366, + "learning_rate": 2.5958986448120476e-07, + "loss": 0.0052, + "num_input_tokens_seen": 37831400, + "step": 19324 + }, + { + "epoch": 2.56129887342611, + "grad_norm": 0.22837767004966736, + "learning_rate": 2.5943585331563157e-07, + "loss": 0.0008, + "num_input_tokens_seen": 37832832, + "step": 19325 + }, + { + "epoch": 2.561431411530815, + "grad_norm": 4.082931995391846, + "learning_rate": 2.592818853496809e-07, + "loss": 0.0354, + "num_input_tokens_seen": 37834944, + "step": 19326 + }, + { + "epoch": 2.56156394963552, + "grad_norm": 0.22462134063243866, + "learning_rate": 2.5912796058632063e-07, + "loss": 0.0011, + "num_input_tokens_seen": 37837008, + "step": 19327 + }, + { + "epoch": 2.5616964877402255, + "grad_norm": 0.015617863275110722, + "learning_rate": 2.5897407902851973e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37838872, + "step": 19328 + }, + { + "epoch": 2.5618290258449306, + "grad_norm": 0.010446170344948769, + "learning_rate": 2.588202406792445e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37840000, + "step": 19329 + }, + { + "epoch": 2.5619615639496356, + "grad_norm": 2.222850799560547, + "learning_rate": 2.5866644554146057e-07, + "loss": 0.0069, + "num_input_tokens_seen": 37841872, + "step": 19330 + }, + { + "epoch": 2.5620941020543406, + "grad_norm": 0.017828362062573433, + "learning_rate": 2.585126936181345e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37843432, + "step": 19331 + }, + { + "epoch": 2.5622266401590457, + "grad_norm": 0.03534896299242973, + "learning_rate": 2.583589849122298e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37845600, + "step": 19332 + }, + { + "epoch": 2.5623591782637507, + "grad_norm": 11.328756332397461, + "learning_rate": 2.5820531942671005e-07, + "loss": 0.1134, + "num_input_tokens_seen": 37847304, + "step": 19333 + }, + { + "epoch": 2.5624917163684557, + "grad_norm": 0.7523468732833862, + "learning_rate": 2.580516971645386e-07, + "loss": 0.004, + "num_input_tokens_seen": 37848648, + "step": 19334 + }, + { + "epoch": 2.5626242544731612, + "grad_norm": 5.9187421798706055, + "learning_rate": 2.5789811812867717e-07, + "loss": 0.0544, + "num_input_tokens_seen": 37851128, + "step": 19335 + }, + { + "epoch": 2.5627567925778663, + "grad_norm": 0.001379901310428977, + "learning_rate": 2.5774458232208616e-07, + "loss": 0.0, + "num_input_tokens_seen": 37852520, + "step": 19336 + }, + { + "epoch": 2.5628893306825713, + "grad_norm": 0.5638102293014526, + "learning_rate": 2.5759108974772686e-07, + "loss": 0.0025, + "num_input_tokens_seen": 37854640, + "step": 19337 + }, + { + "epoch": 2.5630218687872763, + "grad_norm": 0.5026506185531616, + "learning_rate": 2.5743764040855795e-07, + "loss": 0.0015, + "num_input_tokens_seen": 37856192, + "step": 19338 + }, + { + "epoch": 2.5631544068919814, + "grad_norm": 3.8375697135925293, + "learning_rate": 2.5728423430753907e-07, + "loss": 0.0283, + "num_input_tokens_seen": 37857440, + "step": 19339 + }, + { + "epoch": 2.5632869449966864, + "grad_norm": 2.4448704719543457, + "learning_rate": 2.57130871447627e-07, + "loss": 0.0096, + "num_input_tokens_seen": 37860000, + "step": 19340 + }, + { + "epoch": 2.5634194831013914, + "grad_norm": 2.7009222507476807, + "learning_rate": 2.569775518317791e-07, + "loss": 0.0631, + "num_input_tokens_seen": 37862280, + "step": 19341 + }, + { + "epoch": 2.563552021206097, + "grad_norm": 0.47407266497612, + "learning_rate": 2.5682427546295103e-07, + "loss": 0.0022, + "num_input_tokens_seen": 37863656, + "step": 19342 + }, + { + "epoch": 2.563684559310802, + "grad_norm": 0.9600747227668762, + "learning_rate": 2.5667104234409824e-07, + "loss": 0.0034, + "num_input_tokens_seen": 37865312, + "step": 19343 + }, + { + "epoch": 2.563817097415507, + "grad_norm": 0.0018711484735831618, + "learning_rate": 2.565178524781761e-07, + "loss": 0.0, + "num_input_tokens_seen": 37866984, + "step": 19344 + }, + { + "epoch": 2.563949635520212, + "grad_norm": 0.045118555426597595, + "learning_rate": 2.563647058681373e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37868448, + "step": 19345 + }, + { + "epoch": 2.564082173624917, + "grad_norm": 5.06044864654541, + "learning_rate": 2.5621160251693476e-07, + "loss": 0.0142, + "num_input_tokens_seen": 37871336, + "step": 19346 + }, + { + "epoch": 2.564214711729622, + "grad_norm": 9.049171447753906, + "learning_rate": 2.560585424275203e-07, + "loss": 0.0994, + "num_input_tokens_seen": 37874000, + "step": 19347 + }, + { + "epoch": 2.564347249834327, + "grad_norm": 0.03440026193857193, + "learning_rate": 2.5590552560284477e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37875832, + "step": 19348 + }, + { + "epoch": 2.5644797879390326, + "grad_norm": 9.536746978759766, + "learning_rate": 2.55752552045859e-07, + "loss": 0.0276, + "num_input_tokens_seen": 37877584, + "step": 19349 + }, + { + "epoch": 2.5646123260437377, + "grad_norm": 3.630237102508545, + "learning_rate": 2.5559962175951246e-07, + "loss": 0.0248, + "num_input_tokens_seen": 37879368, + "step": 19350 + }, + { + "epoch": 2.5647448641484427, + "grad_norm": 3.94929575920105, + "learning_rate": 2.5544673474675376e-07, + "loss": 0.0321, + "num_input_tokens_seen": 37881112, + "step": 19351 + }, + { + "epoch": 2.5648774022531478, + "grad_norm": 6.679806232452393, + "learning_rate": 2.552938910105304e-07, + "loss": 0.0244, + "num_input_tokens_seen": 37883528, + "step": 19352 + }, + { + "epoch": 2.565009940357853, + "grad_norm": 0.3834489583969116, + "learning_rate": 2.551410905537888e-07, + "loss": 0.0015, + "num_input_tokens_seen": 37885984, + "step": 19353 + }, + { + "epoch": 2.565142478462558, + "grad_norm": 0.0032387960236519575, + "learning_rate": 2.5498833337947587e-07, + "loss": 0.0, + "num_input_tokens_seen": 37888456, + "step": 19354 + }, + { + "epoch": 2.565275016567263, + "grad_norm": 0.05228766053915024, + "learning_rate": 2.5483561949053686e-07, + "loss": 0.0003, + "num_input_tokens_seen": 37890064, + "step": 19355 + }, + { + "epoch": 2.5654075546719683, + "grad_norm": 4.123272895812988, + "learning_rate": 2.5468294888991606e-07, + "loss": 0.0394, + "num_input_tokens_seen": 37892072, + "step": 19356 + }, + { + "epoch": 2.5655400927766734, + "grad_norm": 0.0013840856263414025, + "learning_rate": 2.5453032158055695e-07, + "loss": 0.0, + "num_input_tokens_seen": 37893952, + "step": 19357 + }, + { + "epoch": 2.5656726308813784, + "grad_norm": 2.8300416469573975, + "learning_rate": 2.543777375654022e-07, + "loss": 0.0223, + "num_input_tokens_seen": 37895936, + "step": 19358 + }, + { + "epoch": 2.5658051689860835, + "grad_norm": 11.97438907623291, + "learning_rate": 2.5422519684739353e-07, + "loss": 0.0777, + "num_input_tokens_seen": 37898264, + "step": 19359 + }, + { + "epoch": 2.5659377070907885, + "grad_norm": 0.0016476156888529658, + "learning_rate": 2.5407269942947233e-07, + "loss": 0.0, + "num_input_tokens_seen": 37899664, + "step": 19360 + }, + { + "epoch": 2.566070245195494, + "grad_norm": 0.002000865526497364, + "learning_rate": 2.539202453145792e-07, + "loss": 0.0, + "num_input_tokens_seen": 37900840, + "step": 19361 + }, + { + "epoch": 2.5662027833001986, + "grad_norm": 2.706514835357666, + "learning_rate": 2.5376783450565353e-07, + "loss": 0.0161, + "num_input_tokens_seen": 37902120, + "step": 19362 + }, + { + "epoch": 2.566335321404904, + "grad_norm": 0.2820376753807068, + "learning_rate": 2.536154670056332e-07, + "loss": 0.0017, + "num_input_tokens_seen": 37905312, + "step": 19363 + }, + { + "epoch": 2.566467859509609, + "grad_norm": 0.0015822445275261998, + "learning_rate": 2.534631428174561e-07, + "loss": 0.0, + "num_input_tokens_seen": 37906896, + "step": 19364 + }, + { + "epoch": 2.566600397614314, + "grad_norm": 0.03181130811572075, + "learning_rate": 2.5331086194405997e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37908792, + "step": 19365 + }, + { + "epoch": 2.566732935719019, + "grad_norm": 0.0025381636805832386, + "learning_rate": 2.531586243883799e-07, + "loss": 0.0, + "num_input_tokens_seen": 37910280, + "step": 19366 + }, + { + "epoch": 2.566865473823724, + "grad_norm": 3.2885313034057617, + "learning_rate": 2.530064301533519e-07, + "loss": 0.0233, + "num_input_tokens_seen": 37912600, + "step": 19367 + }, + { + "epoch": 2.5669980119284297, + "grad_norm": 0.004531858488917351, + "learning_rate": 2.5285427924191e-07, + "loss": 0.0, + "num_input_tokens_seen": 37913728, + "step": 19368 + }, + { + "epoch": 2.5671305500331343, + "grad_norm": 1.7539565563201904, + "learning_rate": 2.5270217165698747e-07, + "loss": 0.0055, + "num_input_tokens_seen": 37915152, + "step": 19369 + }, + { + "epoch": 2.5672630881378398, + "grad_norm": 0.057718753814697266, + "learning_rate": 2.525501074015177e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37916456, + "step": 19370 + }, + { + "epoch": 2.567395626242545, + "grad_norm": 12.765116691589355, + "learning_rate": 2.5239808647843253e-07, + "loss": 0.1194, + "num_input_tokens_seen": 37918208, + "step": 19371 + }, + { + "epoch": 2.56752816434725, + "grad_norm": 0.01136563066393137, + "learning_rate": 2.522461088906622e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37920672, + "step": 19372 + }, + { + "epoch": 2.567660702451955, + "grad_norm": 14.36377239227295, + "learning_rate": 2.520941746411379e-07, + "loss": 0.1969, + "num_input_tokens_seen": 37923472, + "step": 19373 + }, + { + "epoch": 2.56779324055666, + "grad_norm": 0.7322146892547607, + "learning_rate": 2.519422837327884e-07, + "loss": 0.0035, + "num_input_tokens_seen": 37925856, + "step": 19374 + }, + { + "epoch": 2.5679257786613654, + "grad_norm": 6.907450199127197, + "learning_rate": 2.517904361685428e-07, + "loss": 0.0812, + "num_input_tokens_seen": 37927664, + "step": 19375 + }, + { + "epoch": 2.56805831676607, + "grad_norm": 7.3366498947143555, + "learning_rate": 2.5163863195132865e-07, + "loss": 0.1034, + "num_input_tokens_seen": 37929376, + "step": 19376 + }, + { + "epoch": 2.5681908548707755, + "grad_norm": 0.0013101528165861964, + "learning_rate": 2.514868710840723e-07, + "loss": 0.0, + "num_input_tokens_seen": 37930592, + "step": 19377 + }, + { + "epoch": 2.5683233929754805, + "grad_norm": 3.063955783843994, + "learning_rate": 2.513351535697009e-07, + "loss": 0.0069, + "num_input_tokens_seen": 37932096, + "step": 19378 + }, + { + "epoch": 2.5684559310801856, + "grad_norm": 0.02319006808102131, + "learning_rate": 2.511834794111381e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37934056, + "step": 19379 + }, + { + "epoch": 2.5685884691848906, + "grad_norm": 5.567600250244141, + "learning_rate": 2.5103184861131e-07, + "loss": 0.0254, + "num_input_tokens_seen": 37937240, + "step": 19380 + }, + { + "epoch": 2.5687210072895956, + "grad_norm": 0.0043224976398050785, + "learning_rate": 2.508802611731392e-07, + "loss": 0.0, + "num_input_tokens_seen": 37938816, + "step": 19381 + }, + { + "epoch": 2.568853545394301, + "grad_norm": 2.747615098953247, + "learning_rate": 2.5072871709954874e-07, + "loss": 0.0106, + "num_input_tokens_seen": 37940816, + "step": 19382 + }, + { + "epoch": 2.5689860834990057, + "grad_norm": 1.1238667964935303, + "learning_rate": 2.5057721639345974e-07, + "loss": 0.0033, + "num_input_tokens_seen": 37942704, + "step": 19383 + }, + { + "epoch": 2.569118621603711, + "grad_norm": 3.017476797103882, + "learning_rate": 2.5042575905779356e-07, + "loss": 0.0213, + "num_input_tokens_seen": 37945104, + "step": 19384 + }, + { + "epoch": 2.569251159708416, + "grad_norm": 4.955749034881592, + "learning_rate": 2.5027434509547143e-07, + "loss": 0.0155, + "num_input_tokens_seen": 37947232, + "step": 19385 + }, + { + "epoch": 2.5693836978131213, + "grad_norm": 0.21100133657455444, + "learning_rate": 2.5012297450941166e-07, + "loss": 0.0007, + "num_input_tokens_seen": 37949056, + "step": 19386 + }, + { + "epoch": 2.5695162359178263, + "grad_norm": 6.351183891296387, + "learning_rate": 2.499716473025332e-07, + "loss": 0.0398, + "num_input_tokens_seen": 37950632, + "step": 19387 + }, + { + "epoch": 2.5696487740225313, + "grad_norm": 0.02148575708270073, + "learning_rate": 2.498203634777538e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37951992, + "step": 19388 + }, + { + "epoch": 2.569781312127237, + "grad_norm": 0.032897453755140305, + "learning_rate": 2.496691230379894e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37953448, + "step": 19389 + }, + { + "epoch": 2.5699138502319414, + "grad_norm": 1.4907045364379883, + "learning_rate": 2.4951792598615686e-07, + "loss": 0.0059, + "num_input_tokens_seen": 37955320, + "step": 19390 + }, + { + "epoch": 2.570046388336647, + "grad_norm": 13.814324378967285, + "learning_rate": 2.4936677232517167e-07, + "loss": 0.0575, + "num_input_tokens_seen": 37957568, + "step": 19391 + }, + { + "epoch": 2.570178926441352, + "grad_norm": 0.006732525769621134, + "learning_rate": 2.4921566205794753e-07, + "loss": 0.0, + "num_input_tokens_seen": 37960464, + "step": 19392 + }, + { + "epoch": 2.570311464546057, + "grad_norm": 0.5687081217765808, + "learning_rate": 2.4906459518739823e-07, + "loss": 0.003, + "num_input_tokens_seen": 37962160, + "step": 19393 + }, + { + "epoch": 2.570444002650762, + "grad_norm": 0.6054650545120239, + "learning_rate": 2.4891357171643646e-07, + "loss": 0.0046, + "num_input_tokens_seen": 37964656, + "step": 19394 + }, + { + "epoch": 2.570576540755467, + "grad_norm": 0.7665790915489197, + "learning_rate": 2.487625916479736e-07, + "loss": 0.003, + "num_input_tokens_seen": 37967008, + "step": 19395 + }, + { + "epoch": 2.5707090788601725, + "grad_norm": 0.005277692805975676, + "learning_rate": 2.4861165498492104e-07, + "loss": 0.0, + "num_input_tokens_seen": 37969608, + "step": 19396 + }, + { + "epoch": 2.5708416169648776, + "grad_norm": 1.2636367082595825, + "learning_rate": 2.484607617301893e-07, + "loss": 0.01, + "num_input_tokens_seen": 37971960, + "step": 19397 + }, + { + "epoch": 2.5709741550695826, + "grad_norm": 0.01345908734947443, + "learning_rate": 2.483099118866872e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37973720, + "step": 19398 + }, + { + "epoch": 2.5711066931742876, + "grad_norm": 0.01251994352787733, + "learning_rate": 2.4815910545732347e-07, + "loss": 0.0, + "num_input_tokens_seen": 37975864, + "step": 19399 + }, + { + "epoch": 2.5712392312789927, + "grad_norm": 2.6812777519226074, + "learning_rate": 2.4800834244500525e-07, + "loss": 0.039, + "num_input_tokens_seen": 37977504, + "step": 19400 + }, + { + "epoch": 2.5713717693836977, + "grad_norm": 0.0434565432369709, + "learning_rate": 2.4785762285264025e-07, + "loss": 0.0002, + "num_input_tokens_seen": 37979480, + "step": 19401 + }, + { + "epoch": 2.5715043074884028, + "grad_norm": 1.7957371473312378, + "learning_rate": 2.477069466831333e-07, + "loss": 0.0043, + "num_input_tokens_seen": 37982216, + "step": 19402 + }, + { + "epoch": 2.5716368455931082, + "grad_norm": 10.812566757202148, + "learning_rate": 2.4755631393939077e-07, + "loss": 0.0906, + "num_input_tokens_seen": 37984248, + "step": 19403 + }, + { + "epoch": 2.5717693836978133, + "grad_norm": 0.027528712525963783, + "learning_rate": 2.474057246243164e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37987032, + "step": 19404 + }, + { + "epoch": 2.5719019218025183, + "grad_norm": 1.2115453481674194, + "learning_rate": 2.472551787408131e-07, + "loss": 0.0057, + "num_input_tokens_seen": 37989040, + "step": 19405 + }, + { + "epoch": 2.5720344599072233, + "grad_norm": 0.01788945496082306, + "learning_rate": 2.4710467629178475e-07, + "loss": 0.0001, + "num_input_tokens_seen": 37991800, + "step": 19406 + }, + { + "epoch": 2.5721669980119284, + "grad_norm": 0.0048600719310343266, + "learning_rate": 2.4695421728013176e-07, + "loss": 0.0, + "num_input_tokens_seen": 37993640, + "step": 19407 + }, + { + "epoch": 2.5722995361166334, + "grad_norm": 7.064468860626221, + "learning_rate": 2.4680380170875623e-07, + "loss": 0.1352, + "num_input_tokens_seen": 37995912, + "step": 19408 + }, + { + "epoch": 2.5724320742213385, + "grad_norm": 0.0709582045674324, + "learning_rate": 2.466534295805578e-07, + "loss": 0.0003, + "num_input_tokens_seen": 37998240, + "step": 19409 + }, + { + "epoch": 2.572564612326044, + "grad_norm": 0.09355524182319641, + "learning_rate": 2.465031008984353e-07, + "loss": 0.0003, + "num_input_tokens_seen": 38000184, + "step": 19410 + }, + { + "epoch": 2.572697150430749, + "grad_norm": 0.36136022210121155, + "learning_rate": 2.463528156652881e-07, + "loss": 0.002, + "num_input_tokens_seen": 38001216, + "step": 19411 + }, + { + "epoch": 2.572829688535454, + "grad_norm": 0.009657833725214005, + "learning_rate": 2.462025738840135e-07, + "loss": 0.0, + "num_input_tokens_seen": 38002744, + "step": 19412 + }, + { + "epoch": 2.572962226640159, + "grad_norm": 6.810157299041748, + "learning_rate": 2.460523755575073e-07, + "loss": 0.0534, + "num_input_tokens_seen": 38004672, + "step": 19413 + }, + { + "epoch": 2.573094764744864, + "grad_norm": 7.609522819519043, + "learning_rate": 2.4590222068866685e-07, + "loss": 0.1706, + "num_input_tokens_seen": 38007064, + "step": 19414 + }, + { + "epoch": 2.573227302849569, + "grad_norm": 15.750736236572266, + "learning_rate": 2.457521092803866e-07, + "loss": 0.1735, + "num_input_tokens_seen": 38009248, + "step": 19415 + }, + { + "epoch": 2.573359840954274, + "grad_norm": 0.0074454983696341515, + "learning_rate": 2.4560204133556033e-07, + "loss": 0.0, + "num_input_tokens_seen": 38010984, + "step": 19416 + }, + { + "epoch": 2.5734923790589797, + "grad_norm": 4.2924885749816895, + "learning_rate": 2.454520168570826e-07, + "loss": 0.0201, + "num_input_tokens_seen": 38014264, + "step": 19417 + }, + { + "epoch": 2.5736249171636847, + "grad_norm": 5.503175735473633, + "learning_rate": 2.45302035847845e-07, + "loss": 0.0349, + "num_input_tokens_seen": 38015880, + "step": 19418 + }, + { + "epoch": 2.5737574552683897, + "grad_norm": 9.279829025268555, + "learning_rate": 2.4515209831073915e-07, + "loss": 0.0426, + "num_input_tokens_seen": 38017336, + "step": 19419 + }, + { + "epoch": 2.5738899933730948, + "grad_norm": 0.6075912714004517, + "learning_rate": 2.4500220424865715e-07, + "loss": 0.0029, + "num_input_tokens_seen": 38020208, + "step": 19420 + }, + { + "epoch": 2.5740225314778, + "grad_norm": 0.03560463711619377, + "learning_rate": 2.4485235366448754e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38022656, + "step": 19421 + }, + { + "epoch": 2.574155069582505, + "grad_norm": 4.7690629959106445, + "learning_rate": 2.447025465611208e-07, + "loss": 0.0246, + "num_input_tokens_seen": 38024776, + "step": 19422 + }, + { + "epoch": 2.57428760768721, + "grad_norm": 7.32528829574585, + "learning_rate": 2.4455278294144486e-07, + "loss": 0.0836, + "num_input_tokens_seen": 38026544, + "step": 19423 + }, + { + "epoch": 2.5744201457919154, + "grad_norm": 7.695858478546143, + "learning_rate": 2.444030628083474e-07, + "loss": 0.0485, + "num_input_tokens_seen": 38028320, + "step": 19424 + }, + { + "epoch": 2.5745526838966204, + "grad_norm": 9.423506736755371, + "learning_rate": 2.442533861647142e-07, + "loss": 0.1202, + "num_input_tokens_seen": 38030616, + "step": 19425 + }, + { + "epoch": 2.5746852220013254, + "grad_norm": 23.255292892456055, + "learning_rate": 2.4410375301343186e-07, + "loss": 0.1148, + "num_input_tokens_seen": 38033120, + "step": 19426 + }, + { + "epoch": 2.5748177601060305, + "grad_norm": 0.008706755936145782, + "learning_rate": 2.439541633573861e-07, + "loss": 0.0, + "num_input_tokens_seen": 38034688, + "step": 19427 + }, + { + "epoch": 2.5749502982107355, + "grad_norm": 5.490692615509033, + "learning_rate": 2.438046171994604e-07, + "loss": 0.0445, + "num_input_tokens_seen": 38036232, + "step": 19428 + }, + { + "epoch": 2.5750828363154405, + "grad_norm": 3.0497794151306152, + "learning_rate": 2.436551145425381e-07, + "loss": 0.0089, + "num_input_tokens_seen": 38038600, + "step": 19429 + }, + { + "epoch": 2.5752153744201456, + "grad_norm": 0.4662231504917145, + "learning_rate": 2.435056553895013e-07, + "loss": 0.0012, + "num_input_tokens_seen": 38040184, + "step": 19430 + }, + { + "epoch": 2.575347912524851, + "grad_norm": 0.02149279974400997, + "learning_rate": 2.4335623974323207e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38042480, + "step": 19431 + }, + { + "epoch": 2.575480450629556, + "grad_norm": 3.182718276977539, + "learning_rate": 2.4320686760661184e-07, + "loss": 0.0127, + "num_input_tokens_seen": 38044416, + "step": 19432 + }, + { + "epoch": 2.575612988734261, + "grad_norm": 0.019041849300265312, + "learning_rate": 2.430575389825202e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38046376, + "step": 19433 + }, + { + "epoch": 2.575745526838966, + "grad_norm": 4.771291732788086, + "learning_rate": 2.429082538738359e-07, + "loss": 0.0645, + "num_input_tokens_seen": 38049256, + "step": 19434 + }, + { + "epoch": 2.575878064943671, + "grad_norm": 0.003284144913777709, + "learning_rate": 2.4275901228343777e-07, + "loss": 0.0, + "num_input_tokens_seen": 38050552, + "step": 19435 + }, + { + "epoch": 2.5760106030483763, + "grad_norm": 23.600391387939453, + "learning_rate": 2.426098142142025e-07, + "loss": 0.2925, + "num_input_tokens_seen": 38052880, + "step": 19436 + }, + { + "epoch": 2.5761431411530813, + "grad_norm": 1.059726357460022, + "learning_rate": 2.424606596690074e-07, + "loss": 0.0053, + "num_input_tokens_seen": 38055184, + "step": 19437 + }, + { + "epoch": 2.5762756792577868, + "grad_norm": 9.864409446716309, + "learning_rate": 2.4231154865072883e-07, + "loss": 0.0227, + "num_input_tokens_seen": 38056752, + "step": 19438 + }, + { + "epoch": 2.576408217362492, + "grad_norm": 7.650106430053711, + "learning_rate": 2.4216248116224084e-07, + "loss": 0.0525, + "num_input_tokens_seen": 38058736, + "step": 19439 + }, + { + "epoch": 2.576540755467197, + "grad_norm": 6.320418357849121, + "learning_rate": 2.4201345720641783e-07, + "loss": 0.073, + "num_input_tokens_seen": 38060528, + "step": 19440 + }, + { + "epoch": 2.576673293571902, + "grad_norm": 2.3626770973205566, + "learning_rate": 2.418644767861325e-07, + "loss": 0.0057, + "num_input_tokens_seen": 38062616, + "step": 19441 + }, + { + "epoch": 2.576805831676607, + "grad_norm": 16.425256729125977, + "learning_rate": 2.417155399042587e-07, + "loss": 0.241, + "num_input_tokens_seen": 38064256, + "step": 19442 + }, + { + "epoch": 2.576938369781312, + "grad_norm": 2.527681827545166, + "learning_rate": 2.415666465636665e-07, + "loss": 0.01, + "num_input_tokens_seen": 38066320, + "step": 19443 + }, + { + "epoch": 2.577070907886017, + "grad_norm": 1.396911859512329, + "learning_rate": 2.4141779676722767e-07, + "loss": 0.0149, + "num_input_tokens_seen": 38067856, + "step": 19444 + }, + { + "epoch": 2.5772034459907225, + "grad_norm": 8.709297180175781, + "learning_rate": 2.4126899051781197e-07, + "loss": 0.1239, + "num_input_tokens_seen": 38069904, + "step": 19445 + }, + { + "epoch": 2.5773359840954275, + "grad_norm": 0.007917642593383789, + "learning_rate": 2.4112022781828836e-07, + "loss": 0.0, + "num_input_tokens_seen": 38071352, + "step": 19446 + }, + { + "epoch": 2.5774685222001326, + "grad_norm": 0.009600375778973103, + "learning_rate": 2.4097150867152474e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38073064, + "step": 19447 + }, + { + "epoch": 2.5776010603048376, + "grad_norm": 0.0012321866815909743, + "learning_rate": 2.408228330803891e-07, + "loss": 0.0, + "num_input_tokens_seen": 38074136, + "step": 19448 + }, + { + "epoch": 2.5777335984095426, + "grad_norm": 0.03692163527011871, + "learning_rate": 2.406742010477475e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38076296, + "step": 19449 + }, + { + "epoch": 2.577866136514248, + "grad_norm": 12.00407600402832, + "learning_rate": 2.405256125764663e-07, + "loss": 0.1698, + "num_input_tokens_seen": 38077592, + "step": 19450 + }, + { + "epoch": 2.5779986746189527, + "grad_norm": 0.9321497678756714, + "learning_rate": 2.4037706766941007e-07, + "loss": 0.0028, + "num_input_tokens_seen": 38079432, + "step": 19451 + }, + { + "epoch": 2.578131212723658, + "grad_norm": 0.004490045364946127, + "learning_rate": 2.4022856632944236e-07, + "loss": 0.0, + "num_input_tokens_seen": 38081592, + "step": 19452 + }, + { + "epoch": 2.5782637508283632, + "grad_norm": 0.280457466840744, + "learning_rate": 2.4008010855942734e-07, + "loss": 0.0005, + "num_input_tokens_seen": 38083448, + "step": 19453 + }, + { + "epoch": 2.5783962889330683, + "grad_norm": 3.5100326538085938, + "learning_rate": 2.3993169436222624e-07, + "loss": 0.0321, + "num_input_tokens_seen": 38085864, + "step": 19454 + }, + { + "epoch": 2.5785288270377733, + "grad_norm": 0.13419193029403687, + "learning_rate": 2.3978332374070186e-07, + "loss": 0.0003, + "num_input_tokens_seen": 38087208, + "step": 19455 + }, + { + "epoch": 2.5786613651424783, + "grad_norm": 1.6064867973327637, + "learning_rate": 2.3963499669771406e-07, + "loss": 0.0045, + "num_input_tokens_seen": 38088368, + "step": 19456 + }, + { + "epoch": 2.578793903247184, + "grad_norm": 28.58725357055664, + "learning_rate": 2.3948671323612253e-07, + "loss": 0.1833, + "num_input_tokens_seen": 38090264, + "step": 19457 + }, + { + "epoch": 2.5789264413518884, + "grad_norm": 5.1721272468566895, + "learning_rate": 2.3933847335878695e-07, + "loss": 0.0401, + "num_input_tokens_seen": 38091976, + "step": 19458 + }, + { + "epoch": 2.579058979456594, + "grad_norm": 0.002186292316764593, + "learning_rate": 2.3919027706856526e-07, + "loss": 0.0, + "num_input_tokens_seen": 38093960, + "step": 19459 + }, + { + "epoch": 2.579191517561299, + "grad_norm": 6.574543476104736, + "learning_rate": 2.3904212436831415e-07, + "loss": 0.0453, + "num_input_tokens_seen": 38096240, + "step": 19460 + }, + { + "epoch": 2.579324055666004, + "grad_norm": 0.00038848823169246316, + "learning_rate": 2.388940152608912e-07, + "loss": 0.0, + "num_input_tokens_seen": 38097864, + "step": 19461 + }, + { + "epoch": 2.579456593770709, + "grad_norm": 0.01779775694012642, + "learning_rate": 2.3874594974915094e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38099696, + "step": 19462 + }, + { + "epoch": 2.579589131875414, + "grad_norm": 5.215701103210449, + "learning_rate": 2.38597927835949e-07, + "loss": 0.0398, + "num_input_tokens_seen": 38102000, + "step": 19463 + }, + { + "epoch": 2.5797216699801195, + "grad_norm": 15.066582679748535, + "learning_rate": 2.384499495241394e-07, + "loss": 0.0579, + "num_input_tokens_seen": 38104032, + "step": 19464 + }, + { + "epoch": 2.579854208084824, + "grad_norm": 8.454154014587402, + "learning_rate": 2.383020148165749e-07, + "loss": 0.0599, + "num_input_tokens_seen": 38106080, + "step": 19465 + }, + { + "epoch": 2.5799867461895296, + "grad_norm": 2.4973719120025635, + "learning_rate": 2.3815412371610725e-07, + "loss": 0.0112, + "num_input_tokens_seen": 38107840, + "step": 19466 + }, + { + "epoch": 2.5801192842942346, + "grad_norm": 0.7363899946212769, + "learning_rate": 2.3800627622558858e-07, + "loss": 0.0038, + "num_input_tokens_seen": 38109472, + "step": 19467 + }, + { + "epoch": 2.5802518223989397, + "grad_norm": 4.367335796356201, + "learning_rate": 2.3785847234786968e-07, + "loss": 0.0109, + "num_input_tokens_seen": 38111848, + "step": 19468 + }, + { + "epoch": 2.5803843605036447, + "grad_norm": 0.006820421665906906, + "learning_rate": 2.3771071208580022e-07, + "loss": 0.0, + "num_input_tokens_seen": 38113264, + "step": 19469 + }, + { + "epoch": 2.5805168986083498, + "grad_norm": 12.771903991699219, + "learning_rate": 2.375629954422287e-07, + "loss": 0.0904, + "num_input_tokens_seen": 38115184, + "step": 19470 + }, + { + "epoch": 2.5806494367130552, + "grad_norm": 0.007003570906817913, + "learning_rate": 2.3741532242000347e-07, + "loss": 0.0, + "num_input_tokens_seen": 38116176, + "step": 19471 + }, + { + "epoch": 2.58078197481776, + "grad_norm": 1.1385854482650757, + "learning_rate": 2.372676930219711e-07, + "loss": 0.0049, + "num_input_tokens_seen": 38118240, + "step": 19472 + }, + { + "epoch": 2.5809145129224653, + "grad_norm": 0.1380559653043747, + "learning_rate": 2.3712010725097879e-07, + "loss": 0.0007, + "num_input_tokens_seen": 38120560, + "step": 19473 + }, + { + "epoch": 2.5810470510271704, + "grad_norm": 9.46606731414795, + "learning_rate": 2.3697256510987227e-07, + "loss": 0.2537, + "num_input_tokens_seen": 38123360, + "step": 19474 + }, + { + "epoch": 2.5811795891318754, + "grad_norm": 0.026126910001039505, + "learning_rate": 2.3682506660149597e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38124720, + "step": 19475 + }, + { + "epoch": 2.5813121272365804, + "grad_norm": 0.19773024320602417, + "learning_rate": 2.366776117286934e-07, + "loss": 0.0005, + "num_input_tokens_seen": 38126520, + "step": 19476 + }, + { + "epoch": 2.5814446653412855, + "grad_norm": 6.872732639312744, + "learning_rate": 2.365302004943082e-07, + "loss": 0.0982, + "num_input_tokens_seen": 38129176, + "step": 19477 + }, + { + "epoch": 2.581577203445991, + "grad_norm": 0.002044921973720193, + "learning_rate": 2.3638283290118163e-07, + "loss": 0.0, + "num_input_tokens_seen": 38130544, + "step": 19478 + }, + { + "epoch": 2.581709741550696, + "grad_norm": 0.2781258225440979, + "learning_rate": 2.3623550895215563e-07, + "loss": 0.0007, + "num_input_tokens_seen": 38131624, + "step": 19479 + }, + { + "epoch": 2.581842279655401, + "grad_norm": 4.32877779006958, + "learning_rate": 2.3608822865007097e-07, + "loss": 0.0794, + "num_input_tokens_seen": 38134696, + "step": 19480 + }, + { + "epoch": 2.581974817760106, + "grad_norm": 0.003592919558286667, + "learning_rate": 2.3594099199776733e-07, + "loss": 0.0, + "num_input_tokens_seen": 38136800, + "step": 19481 + }, + { + "epoch": 2.582107355864811, + "grad_norm": 0.002581442706286907, + "learning_rate": 2.3579379899808326e-07, + "loss": 0.0, + "num_input_tokens_seen": 38138488, + "step": 19482 + }, + { + "epoch": 2.582239893969516, + "grad_norm": 0.026846833527088165, + "learning_rate": 2.356466496538562e-07, + "loss": 0.0, + "num_input_tokens_seen": 38139944, + "step": 19483 + }, + { + "epoch": 2.582372432074221, + "grad_norm": 2.8415000438690186, + "learning_rate": 2.3549954396792363e-07, + "loss": 0.0217, + "num_input_tokens_seen": 38141336, + "step": 19484 + }, + { + "epoch": 2.5825049701789267, + "grad_norm": 21.553850173950195, + "learning_rate": 2.3535248194312272e-07, + "loss": 0.0164, + "num_input_tokens_seen": 38143168, + "step": 19485 + }, + { + "epoch": 2.5826375082836317, + "grad_norm": 5.537283420562744, + "learning_rate": 2.352054635822884e-07, + "loss": 0.0496, + "num_input_tokens_seen": 38144864, + "step": 19486 + }, + { + "epoch": 2.5827700463883367, + "grad_norm": 13.871557235717773, + "learning_rate": 2.3505848888825482e-07, + "loss": 0.18, + "num_input_tokens_seen": 38147120, + "step": 19487 + }, + { + "epoch": 2.5829025844930418, + "grad_norm": 4.882599830627441, + "learning_rate": 2.3491155786385583e-07, + "loss": 0.0397, + "num_input_tokens_seen": 38149392, + "step": 19488 + }, + { + "epoch": 2.583035122597747, + "grad_norm": 0.00277733919210732, + "learning_rate": 2.3476467051192497e-07, + "loss": 0.0, + "num_input_tokens_seen": 38150984, + "step": 19489 + }, + { + "epoch": 2.583167660702452, + "grad_norm": 0.09833642840385437, + "learning_rate": 2.346178268352936e-07, + "loss": 0.0004, + "num_input_tokens_seen": 38153768, + "step": 19490 + }, + { + "epoch": 2.583300198807157, + "grad_norm": 8.486614227294922, + "learning_rate": 2.344710268367939e-07, + "loss": 0.1154, + "num_input_tokens_seen": 38155832, + "step": 19491 + }, + { + "epoch": 2.5834327369118624, + "grad_norm": 4.544781684875488, + "learning_rate": 2.3432427051925555e-07, + "loss": 0.0419, + "num_input_tokens_seen": 38158288, + "step": 19492 + }, + { + "epoch": 2.5835652750165674, + "grad_norm": 2.195694923400879, + "learning_rate": 2.3417755788550767e-07, + "loss": 0.0252, + "num_input_tokens_seen": 38160488, + "step": 19493 + }, + { + "epoch": 2.5836978131212724, + "grad_norm": 0.29723379015922546, + "learning_rate": 2.3403088893838021e-07, + "loss": 0.0007, + "num_input_tokens_seen": 38163312, + "step": 19494 + }, + { + "epoch": 2.5838303512259775, + "grad_norm": 1.874769687652588, + "learning_rate": 2.3388426368070067e-07, + "loss": 0.0139, + "num_input_tokens_seen": 38165120, + "step": 19495 + }, + { + "epoch": 2.5839628893306825, + "grad_norm": 0.006234204396605492, + "learning_rate": 2.3373768211529506e-07, + "loss": 0.0, + "num_input_tokens_seen": 38166968, + "step": 19496 + }, + { + "epoch": 2.5840954274353876, + "grad_norm": 0.1711202710866928, + "learning_rate": 2.335911442449909e-07, + "loss": 0.0006, + "num_input_tokens_seen": 38168456, + "step": 19497 + }, + { + "epoch": 2.5842279655400926, + "grad_norm": 9.714573860168457, + "learning_rate": 2.334446500726123e-07, + "loss": 0.0447, + "num_input_tokens_seen": 38170640, + "step": 19498 + }, + { + "epoch": 2.584360503644798, + "grad_norm": 0.0012651566648855805, + "learning_rate": 2.3329819960098532e-07, + "loss": 0.0, + "num_input_tokens_seen": 38172104, + "step": 19499 + }, + { + "epoch": 2.584493041749503, + "grad_norm": 0.005524888634681702, + "learning_rate": 2.3315179283293242e-07, + "loss": 0.0, + "num_input_tokens_seen": 38173224, + "step": 19500 + }, + { + "epoch": 2.584625579854208, + "grad_norm": 12.879678726196289, + "learning_rate": 2.3300542977127695e-07, + "loss": 0.1676, + "num_input_tokens_seen": 38174944, + "step": 19501 + }, + { + "epoch": 2.584758117958913, + "grad_norm": 5.7510085105896, + "learning_rate": 2.3285911041884046e-07, + "loss": 0.0934, + "num_input_tokens_seen": 38176408, + "step": 19502 + }, + { + "epoch": 2.5848906560636182, + "grad_norm": 9.69985294342041, + "learning_rate": 2.3271283477844436e-07, + "loss": 0.1832, + "num_input_tokens_seen": 38179144, + "step": 19503 + }, + { + "epoch": 2.5850231941683233, + "grad_norm": 0.026296650990843773, + "learning_rate": 2.3256660285290887e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38180520, + "step": 19504 + }, + { + "epoch": 2.5851557322730283, + "grad_norm": 1.7171012163162231, + "learning_rate": 2.324204146450537e-07, + "loss": 0.0028, + "num_input_tokens_seen": 38183008, + "step": 19505 + }, + { + "epoch": 2.585288270377734, + "grad_norm": 0.0023586058523505926, + "learning_rate": 2.3227427015769742e-07, + "loss": 0.0, + "num_input_tokens_seen": 38184264, + "step": 19506 + }, + { + "epoch": 2.585420808482439, + "grad_norm": 0.003217240097001195, + "learning_rate": 2.3212816939365695e-07, + "loss": 0.0, + "num_input_tokens_seen": 38185760, + "step": 19507 + }, + { + "epoch": 2.585553346587144, + "grad_norm": 1.6808384656906128, + "learning_rate": 2.3198211235575058e-07, + "loss": 0.0088, + "num_input_tokens_seen": 38187152, + "step": 19508 + }, + { + "epoch": 2.585685884691849, + "grad_norm": 0.23419268429279327, + "learning_rate": 2.3183609904679328e-07, + "loss": 0.0008, + "num_input_tokens_seen": 38188680, + "step": 19509 + }, + { + "epoch": 2.585818422796554, + "grad_norm": 0.5774477124214172, + "learning_rate": 2.3169012946960083e-07, + "loss": 0.0025, + "num_input_tokens_seen": 38190056, + "step": 19510 + }, + { + "epoch": 2.585950960901259, + "grad_norm": 0.6518896222114563, + "learning_rate": 2.31544203626988e-07, + "loss": 0.0039, + "num_input_tokens_seen": 38192360, + "step": 19511 + }, + { + "epoch": 2.586083499005964, + "grad_norm": 8.013463973999023, + "learning_rate": 2.3139832152176744e-07, + "loss": 0.1075, + "num_input_tokens_seen": 38194808, + "step": 19512 + }, + { + "epoch": 2.5862160371106695, + "grad_norm": 5.267388820648193, + "learning_rate": 2.3125248315675196e-07, + "loss": 0.0943, + "num_input_tokens_seen": 38196600, + "step": 19513 + }, + { + "epoch": 2.5863485752153745, + "grad_norm": 8.684226036071777, + "learning_rate": 2.3110668853475378e-07, + "loss": 0.0941, + "num_input_tokens_seen": 38198576, + "step": 19514 + }, + { + "epoch": 2.5864811133200796, + "grad_norm": 7.114933967590332, + "learning_rate": 2.309609376585842e-07, + "loss": 0.076, + "num_input_tokens_seen": 38200440, + "step": 19515 + }, + { + "epoch": 2.5866136514247846, + "grad_norm": 0.06926994770765305, + "learning_rate": 2.3081523053105322e-07, + "loss": 0.0003, + "num_input_tokens_seen": 38202128, + "step": 19516 + }, + { + "epoch": 2.5867461895294896, + "grad_norm": 0.06178737431764603, + "learning_rate": 2.3066956715496998e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38203472, + "step": 19517 + }, + { + "epoch": 2.5868787276341947, + "grad_norm": 8.441856384277344, + "learning_rate": 2.3052394753314305e-07, + "loss": 0.218, + "num_input_tokens_seen": 38205376, + "step": 19518 + }, + { + "epoch": 2.5870112657388997, + "grad_norm": 1.3803714513778687, + "learning_rate": 2.303783716683794e-07, + "loss": 0.0039, + "num_input_tokens_seen": 38207424, + "step": 19519 + }, + { + "epoch": 2.587143803843605, + "grad_norm": 28.150075912475586, + "learning_rate": 2.3023283956348676e-07, + "loss": 0.2354, + "num_input_tokens_seen": 38210624, + "step": 19520 + }, + { + "epoch": 2.5872763419483102, + "grad_norm": 0.023670602589845657, + "learning_rate": 2.3008735122127117e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38212584, + "step": 19521 + }, + { + "epoch": 2.5874088800530153, + "grad_norm": 3.379124641418457, + "learning_rate": 2.2994190664453713e-07, + "loss": 0.0072, + "num_input_tokens_seen": 38215312, + "step": 19522 + }, + { + "epoch": 2.5875414181577203, + "grad_norm": 5.892861366271973, + "learning_rate": 2.2979650583608958e-07, + "loss": 0.0354, + "num_input_tokens_seen": 38216760, + "step": 19523 + }, + { + "epoch": 2.5876739562624254, + "grad_norm": 0.003627873957157135, + "learning_rate": 2.2965114879873074e-07, + "loss": 0.0, + "num_input_tokens_seen": 38218480, + "step": 19524 + }, + { + "epoch": 2.5878064943671304, + "grad_norm": 0.003729845630005002, + "learning_rate": 2.2950583553526474e-07, + "loss": 0.0, + "num_input_tokens_seen": 38220832, + "step": 19525 + }, + { + "epoch": 2.5879390324718354, + "grad_norm": 0.006826526951044798, + "learning_rate": 2.2936056604849184e-07, + "loss": 0.0, + "num_input_tokens_seen": 38223504, + "step": 19526 + }, + { + "epoch": 2.588071570576541, + "grad_norm": 0.001559249241836369, + "learning_rate": 2.2921534034121422e-07, + "loss": 0.0, + "num_input_tokens_seen": 38224968, + "step": 19527 + }, + { + "epoch": 2.588204108681246, + "grad_norm": 0.0007498703198507428, + "learning_rate": 2.2907015841623137e-07, + "loss": 0.0, + "num_input_tokens_seen": 38226328, + "step": 19528 + }, + { + "epoch": 2.588336646785951, + "grad_norm": 3.114485740661621, + "learning_rate": 2.289250202763424e-07, + "loss": 0.0071, + "num_input_tokens_seen": 38227848, + "step": 19529 + }, + { + "epoch": 2.588469184890656, + "grad_norm": 9.804993629455566, + "learning_rate": 2.2877992592434533e-07, + "loss": 0.0302, + "num_input_tokens_seen": 38230464, + "step": 19530 + }, + { + "epoch": 2.588601722995361, + "grad_norm": 2.340512990951538, + "learning_rate": 2.286348753630388e-07, + "loss": 0.0079, + "num_input_tokens_seen": 38232616, + "step": 19531 + }, + { + "epoch": 2.588734261100066, + "grad_norm": 0.005360286682844162, + "learning_rate": 2.2848986859521805e-07, + "loss": 0.0, + "num_input_tokens_seen": 38233736, + "step": 19532 + }, + { + "epoch": 2.588866799204771, + "grad_norm": 0.010043726302683353, + "learning_rate": 2.2834490562368028e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38235552, + "step": 19533 + }, + { + "epoch": 2.5889993373094766, + "grad_norm": 0.9141721725463867, + "learning_rate": 2.2819998645121992e-07, + "loss": 0.006, + "num_input_tokens_seen": 38237320, + "step": 19534 + }, + { + "epoch": 2.5891318754141817, + "grad_norm": 1.2118431329727173, + "learning_rate": 2.2805511108063062e-07, + "loss": 0.0064, + "num_input_tokens_seen": 38240384, + "step": 19535 + }, + { + "epoch": 2.5892644135188867, + "grad_norm": 2.083002805709839, + "learning_rate": 2.2791027951470647e-07, + "loss": 0.0085, + "num_input_tokens_seen": 38242000, + "step": 19536 + }, + { + "epoch": 2.5893969516235917, + "grad_norm": 8.474562644958496, + "learning_rate": 2.2776549175623912e-07, + "loss": 0.141, + "num_input_tokens_seen": 38243280, + "step": 19537 + }, + { + "epoch": 2.5895294897282968, + "grad_norm": 3.1480631828308105, + "learning_rate": 2.2762074780802113e-07, + "loss": 0.0195, + "num_input_tokens_seen": 38245304, + "step": 19538 + }, + { + "epoch": 2.5896620278330023, + "grad_norm": 0.1373857855796814, + "learning_rate": 2.2747604767284272e-07, + "loss": 0.0006, + "num_input_tokens_seen": 38247360, + "step": 19539 + }, + { + "epoch": 2.589794565937707, + "grad_norm": 7.851412773132324, + "learning_rate": 2.273313913534933e-07, + "loss": 0.0486, + "num_input_tokens_seen": 38249344, + "step": 19540 + }, + { + "epoch": 2.5899271040424123, + "grad_norm": 1.4062966108322144, + "learning_rate": 2.271867788527632e-07, + "loss": 0.0071, + "num_input_tokens_seen": 38251040, + "step": 19541 + }, + { + "epoch": 2.5900596421471174, + "grad_norm": 0.0016251326305791736, + "learning_rate": 2.2704221017343957e-07, + "loss": 0.0, + "num_input_tokens_seen": 38253160, + "step": 19542 + }, + { + "epoch": 2.5901921802518224, + "grad_norm": 6.189863204956055, + "learning_rate": 2.2689768531830997e-07, + "loss": 0.0297, + "num_input_tokens_seen": 38254896, + "step": 19543 + }, + { + "epoch": 2.5903247183565274, + "grad_norm": 2.76934814453125, + "learning_rate": 2.2675320429016156e-07, + "loss": 0.0211, + "num_input_tokens_seen": 38257280, + "step": 19544 + }, + { + "epoch": 2.5904572564612325, + "grad_norm": 0.09911293536424637, + "learning_rate": 2.2660876709177908e-07, + "loss": 0.0006, + "num_input_tokens_seen": 38259016, + "step": 19545 + }, + { + "epoch": 2.590589794565938, + "grad_norm": 0.008780150674283504, + "learning_rate": 2.2646437372594838e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38260536, + "step": 19546 + }, + { + "epoch": 2.5907223326706426, + "grad_norm": 0.0021694921888411045, + "learning_rate": 2.2632002419545302e-07, + "loss": 0.0, + "num_input_tokens_seen": 38262192, + "step": 19547 + }, + { + "epoch": 2.590854870775348, + "grad_norm": 2.959047317504883, + "learning_rate": 2.2617571850307606e-07, + "loss": 0.0179, + "num_input_tokens_seen": 38264264, + "step": 19548 + }, + { + "epoch": 2.590987408880053, + "grad_norm": 0.005289941094815731, + "learning_rate": 2.2603145665159948e-07, + "loss": 0.0, + "num_input_tokens_seen": 38265600, + "step": 19549 + }, + { + "epoch": 2.591119946984758, + "grad_norm": 0.02008618600666523, + "learning_rate": 2.2588723864380519e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38267840, + "step": 19550 + }, + { + "epoch": 2.591252485089463, + "grad_norm": 0.018083341419696808, + "learning_rate": 2.2574306448247402e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38270072, + "step": 19551 + }, + { + "epoch": 2.591385023194168, + "grad_norm": 0.40605488419532776, + "learning_rate": 2.2559893417038569e-07, + "loss": 0.0018, + "num_input_tokens_seen": 38272784, + "step": 19552 + }, + { + "epoch": 2.5915175612988737, + "grad_norm": 10.036961555480957, + "learning_rate": 2.2545484771031884e-07, + "loss": 0.1048, + "num_input_tokens_seen": 38274440, + "step": 19553 + }, + { + "epoch": 2.5916500994035783, + "grad_norm": 0.0020510826725512743, + "learning_rate": 2.2531080510505176e-07, + "loss": 0.0, + "num_input_tokens_seen": 38275744, + "step": 19554 + }, + { + "epoch": 2.5917826375082837, + "grad_norm": 6.2653021812438965, + "learning_rate": 2.2516680635736088e-07, + "loss": 0.0927, + "num_input_tokens_seen": 38277856, + "step": 19555 + }, + { + "epoch": 2.591915175612989, + "grad_norm": 0.10841788351535797, + "learning_rate": 2.2502285147002367e-07, + "loss": 0.0004, + "num_input_tokens_seen": 38279880, + "step": 19556 + }, + { + "epoch": 2.592047713717694, + "grad_norm": 4.138819694519043, + "learning_rate": 2.2487894044581544e-07, + "loss": 0.0223, + "num_input_tokens_seen": 38281760, + "step": 19557 + }, + { + "epoch": 2.592180251822399, + "grad_norm": 5.491577625274658, + "learning_rate": 2.2473507328751086e-07, + "loss": 0.0381, + "num_input_tokens_seen": 38283728, + "step": 19558 + }, + { + "epoch": 2.592312789927104, + "grad_norm": 3.116621971130371, + "learning_rate": 2.245912499978839e-07, + "loss": 0.0087, + "num_input_tokens_seen": 38286200, + "step": 19559 + }, + { + "epoch": 2.5924453280318094, + "grad_norm": 10.72745418548584, + "learning_rate": 2.2444747057970674e-07, + "loss": 0.0616, + "num_input_tokens_seen": 38288712, + "step": 19560 + }, + { + "epoch": 2.592577866136514, + "grad_norm": 0.09034241735935211, + "learning_rate": 2.2430373503575214e-07, + "loss": 0.0005, + "num_input_tokens_seen": 38291736, + "step": 19561 + }, + { + "epoch": 2.5927104042412195, + "grad_norm": 8.30131721496582, + "learning_rate": 2.2416004336879183e-07, + "loss": 0.1322, + "num_input_tokens_seen": 38293184, + "step": 19562 + }, + { + "epoch": 2.5928429423459245, + "grad_norm": 0.49436691403388977, + "learning_rate": 2.2401639558159605e-07, + "loss": 0.0025, + "num_input_tokens_seen": 38295272, + "step": 19563 + }, + { + "epoch": 2.5929754804506295, + "grad_norm": 0.1546715945005417, + "learning_rate": 2.2387279167693427e-07, + "loss": 0.0012, + "num_input_tokens_seen": 38297184, + "step": 19564 + }, + { + "epoch": 2.5931080185553346, + "grad_norm": 0.0016089346027001739, + "learning_rate": 2.2372923165757536e-07, + "loss": 0.0, + "num_input_tokens_seen": 38298824, + "step": 19565 + }, + { + "epoch": 2.5932405566600396, + "grad_norm": 0.14386631548404694, + "learning_rate": 2.235857155262866e-07, + "loss": 0.0003, + "num_input_tokens_seen": 38300784, + "step": 19566 + }, + { + "epoch": 2.593373094764745, + "grad_norm": 7.597013473510742, + "learning_rate": 2.2344224328583575e-07, + "loss": 0.062, + "num_input_tokens_seen": 38304640, + "step": 19567 + }, + { + "epoch": 2.59350563286945, + "grad_norm": 0.001097539090551436, + "learning_rate": 2.2329881493898947e-07, + "loss": 0.0, + "num_input_tokens_seen": 38306256, + "step": 19568 + }, + { + "epoch": 2.593638170974155, + "grad_norm": 0.16022011637687683, + "learning_rate": 2.231554304885125e-07, + "loss": 0.0008, + "num_input_tokens_seen": 38308248, + "step": 19569 + }, + { + "epoch": 2.59377070907886, + "grad_norm": 0.034476395696401596, + "learning_rate": 2.230120899371696e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38310272, + "step": 19570 + }, + { + "epoch": 2.5939032471835652, + "grad_norm": 0.002575445221737027, + "learning_rate": 2.2286879328772383e-07, + "loss": 0.0, + "num_input_tokens_seen": 38312592, + "step": 19571 + }, + { + "epoch": 2.5940357852882703, + "grad_norm": 4.131199359893799, + "learning_rate": 2.2272554054293934e-07, + "loss": 0.0283, + "num_input_tokens_seen": 38314752, + "step": 19572 + }, + { + "epoch": 2.5941683233929753, + "grad_norm": 11.348933219909668, + "learning_rate": 2.2258233170557668e-07, + "loss": 0.0175, + "num_input_tokens_seen": 38317176, + "step": 19573 + }, + { + "epoch": 2.594300861497681, + "grad_norm": 7.754245758056641, + "learning_rate": 2.2243916677839844e-07, + "loss": 0.1297, + "num_input_tokens_seen": 38319184, + "step": 19574 + }, + { + "epoch": 2.594433399602386, + "grad_norm": 7.891770362854004, + "learning_rate": 2.222960457641643e-07, + "loss": 0.0711, + "num_input_tokens_seen": 38321304, + "step": 19575 + }, + { + "epoch": 2.594565937707091, + "grad_norm": 10.571821212768555, + "learning_rate": 2.2215296866563317e-07, + "loss": 0.0348, + "num_input_tokens_seen": 38322976, + "step": 19576 + }, + { + "epoch": 2.594698475811796, + "grad_norm": 0.008921639062464237, + "learning_rate": 2.2200993548556483e-07, + "loss": 0.0, + "num_input_tokens_seen": 38323984, + "step": 19577 + }, + { + "epoch": 2.594831013916501, + "grad_norm": 0.0035092367324978113, + "learning_rate": 2.2186694622671618e-07, + "loss": 0.0, + "num_input_tokens_seen": 38327016, + "step": 19578 + }, + { + "epoch": 2.594963552021206, + "grad_norm": 0.07848535478115082, + "learning_rate": 2.217240008918442e-07, + "loss": 0.0005, + "num_input_tokens_seen": 38329952, + "step": 19579 + }, + { + "epoch": 2.595096090125911, + "grad_norm": 3.8175082206726074, + "learning_rate": 2.215810994837056e-07, + "loss": 0.011, + "num_input_tokens_seen": 38332536, + "step": 19580 + }, + { + "epoch": 2.5952286282306165, + "grad_norm": 4.941031455993652, + "learning_rate": 2.2143824200505454e-07, + "loss": 0.0066, + "num_input_tokens_seen": 38335448, + "step": 19581 + }, + { + "epoch": 2.5953611663353215, + "grad_norm": 0.04654120281338692, + "learning_rate": 2.212954284586466e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38337328, + "step": 19582 + }, + { + "epoch": 2.5954937044400266, + "grad_norm": 0.0020885588601231575, + "learning_rate": 2.2115265884723486e-07, + "loss": 0.0, + "num_input_tokens_seen": 38338280, + "step": 19583 + }, + { + "epoch": 2.5956262425447316, + "grad_norm": 5.172085762023926, + "learning_rate": 2.210099331735721e-07, + "loss": 0.0894, + "num_input_tokens_seen": 38340808, + "step": 19584 + }, + { + "epoch": 2.5957587806494367, + "grad_norm": 0.010889441706240177, + "learning_rate": 2.2086725144040922e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38342448, + "step": 19585 + }, + { + "epoch": 2.5958913187541417, + "grad_norm": 0.16189153492450714, + "learning_rate": 2.2072461365049868e-07, + "loss": 0.0005, + "num_input_tokens_seen": 38343816, + "step": 19586 + }, + { + "epoch": 2.5960238568588467, + "grad_norm": 1.4364497661590576, + "learning_rate": 2.2058201980658972e-07, + "loss": 0.0062, + "num_input_tokens_seen": 38345440, + "step": 19587 + }, + { + "epoch": 2.596156394963552, + "grad_norm": 0.010228211060166359, + "learning_rate": 2.2043946991143205e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38347304, + "step": 19588 + }, + { + "epoch": 2.5962889330682573, + "grad_norm": 19.569616317749023, + "learning_rate": 2.2029696396777405e-07, + "loss": 0.1177, + "num_input_tokens_seen": 38350048, + "step": 19589 + }, + { + "epoch": 2.5964214711729623, + "grad_norm": 6.20168399810791, + "learning_rate": 2.2015450197836296e-07, + "loss": 0.0782, + "num_input_tokens_seen": 38351960, + "step": 19590 + }, + { + "epoch": 2.5965540092776673, + "grad_norm": 3.105771541595459, + "learning_rate": 2.200120839459463e-07, + "loss": 0.0286, + "num_input_tokens_seen": 38354864, + "step": 19591 + }, + { + "epoch": 2.5966865473823724, + "grad_norm": 0.001977552194148302, + "learning_rate": 2.1986970987326884e-07, + "loss": 0.0, + "num_input_tokens_seen": 38356464, + "step": 19592 + }, + { + "epoch": 2.5968190854870774, + "grad_norm": 0.0007803388289175928, + "learning_rate": 2.1972737976307697e-07, + "loss": 0.0, + "num_input_tokens_seen": 38358088, + "step": 19593 + }, + { + "epoch": 2.5969516235917824, + "grad_norm": 0.0019044795772060752, + "learning_rate": 2.195850936181143e-07, + "loss": 0.0, + "num_input_tokens_seen": 38360024, + "step": 19594 + }, + { + "epoch": 2.597084161696488, + "grad_norm": 0.026728440076112747, + "learning_rate": 2.1944285144112425e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38362528, + "step": 19595 + }, + { + "epoch": 2.597216699801193, + "grad_norm": 10.568843841552734, + "learning_rate": 2.1930065323484878e-07, + "loss": 0.0612, + "num_input_tokens_seen": 38363920, + "step": 19596 + }, + { + "epoch": 2.597349237905898, + "grad_norm": 1.8160940408706665, + "learning_rate": 2.1915849900203012e-07, + "loss": 0.0078, + "num_input_tokens_seen": 38366984, + "step": 19597 + }, + { + "epoch": 2.597481776010603, + "grad_norm": 16.5985107421875, + "learning_rate": 2.1901638874540941e-07, + "loss": 0.1434, + "num_input_tokens_seen": 38369496, + "step": 19598 + }, + { + "epoch": 2.597614314115308, + "grad_norm": 6.536657333374023, + "learning_rate": 2.1887432246772644e-07, + "loss": 0.0322, + "num_input_tokens_seen": 38372328, + "step": 19599 + }, + { + "epoch": 2.597746852220013, + "grad_norm": 0.053532056510448456, + "learning_rate": 2.1873230017172037e-07, + "loss": 0.0003, + "num_input_tokens_seen": 38373456, + "step": 19600 + }, + { + "epoch": 2.597879390324718, + "grad_norm": 7.111681938171387, + "learning_rate": 2.1859032186012903e-07, + "loss": 0.0939, + "num_input_tokens_seen": 38375688, + "step": 19601 + }, + { + "epoch": 2.5980119284294236, + "grad_norm": 7.735266208648682, + "learning_rate": 2.1844838753568965e-07, + "loss": 0.0383, + "num_input_tokens_seen": 38378328, + "step": 19602 + }, + { + "epoch": 2.5981444665341287, + "grad_norm": 14.824787139892578, + "learning_rate": 2.1830649720113951e-07, + "loss": 0.2376, + "num_input_tokens_seen": 38380912, + "step": 19603 + }, + { + "epoch": 2.5982770046388337, + "grad_norm": 9.278714179992676, + "learning_rate": 2.1816465085921473e-07, + "loss": 0.1838, + "num_input_tokens_seen": 38383064, + "step": 19604 + }, + { + "epoch": 2.5984095427435387, + "grad_norm": 7.4793171882629395, + "learning_rate": 2.1802284851264927e-07, + "loss": 0.0311, + "num_input_tokens_seen": 38385360, + "step": 19605 + }, + { + "epoch": 2.598542080848244, + "grad_norm": 5.897233963012695, + "learning_rate": 2.1788109016417785e-07, + "loss": 0.0108, + "num_input_tokens_seen": 38386712, + "step": 19606 + }, + { + "epoch": 2.598674618952949, + "grad_norm": 0.10957270115613937, + "learning_rate": 2.1773937581653277e-07, + "loss": 0.0005, + "num_input_tokens_seen": 38388752, + "step": 19607 + }, + { + "epoch": 2.598807157057654, + "grad_norm": 0.0015440797433257103, + "learning_rate": 2.1759770547244735e-07, + "loss": 0.0, + "num_input_tokens_seen": 38389944, + "step": 19608 + }, + { + "epoch": 2.5989396951623593, + "grad_norm": 0.0008896925719454885, + "learning_rate": 2.174560791346525e-07, + "loss": 0.0, + "num_input_tokens_seen": 38391552, + "step": 19609 + }, + { + "epoch": 2.5990722332670644, + "grad_norm": 0.22280244529247284, + "learning_rate": 2.1731449680587936e-07, + "loss": 0.0005, + "num_input_tokens_seen": 38392728, + "step": 19610 + }, + { + "epoch": 2.5992047713717694, + "grad_norm": 4.767943382263184, + "learning_rate": 2.1717295848885767e-07, + "loss": 0.0552, + "num_input_tokens_seen": 38394944, + "step": 19611 + }, + { + "epoch": 2.5993373094764745, + "grad_norm": 0.5313977003097534, + "learning_rate": 2.1703146418631553e-07, + "loss": 0.0025, + "num_input_tokens_seen": 38396616, + "step": 19612 + }, + { + "epoch": 2.5994698475811795, + "grad_norm": 0.002957290969789028, + "learning_rate": 2.168900139009822e-07, + "loss": 0.0, + "num_input_tokens_seen": 38398144, + "step": 19613 + }, + { + "epoch": 2.5996023856858845, + "grad_norm": 0.0017025766428560019, + "learning_rate": 2.1674860763558402e-07, + "loss": 0.0, + "num_input_tokens_seen": 38399176, + "step": 19614 + }, + { + "epoch": 2.5997349237905896, + "grad_norm": 2.0331015586853027, + "learning_rate": 2.1660724539284805e-07, + "loss": 0.0081, + "num_input_tokens_seen": 38401336, + "step": 19615 + }, + { + "epoch": 2.599867461895295, + "grad_norm": 0.001170025090686977, + "learning_rate": 2.1646592717549985e-07, + "loss": 0.0, + "num_input_tokens_seen": 38402464, + "step": 19616 + }, + { + "epoch": 2.6, + "grad_norm": 0.6683072447776794, + "learning_rate": 2.1632465298626392e-07, + "loss": 0.0031, + "num_input_tokens_seen": 38405488, + "step": 19617 + }, + { + "epoch": 2.600132538104705, + "grad_norm": 0.004597685765475035, + "learning_rate": 2.1618342282786363e-07, + "loss": 0.0, + "num_input_tokens_seen": 38408296, + "step": 19618 + }, + { + "epoch": 2.60026507620941, + "grad_norm": 3.408809185028076, + "learning_rate": 2.1604223670302293e-07, + "loss": 0.0121, + "num_input_tokens_seen": 38410456, + "step": 19619 + }, + { + "epoch": 2.600397614314115, + "grad_norm": 0.0016405662754550576, + "learning_rate": 2.1590109461446297e-07, + "loss": 0.0, + "num_input_tokens_seen": 38411800, + "step": 19620 + }, + { + "epoch": 2.6005301524188207, + "grad_norm": 11.907539367675781, + "learning_rate": 2.1575999656490626e-07, + "loss": 0.0669, + "num_input_tokens_seen": 38413512, + "step": 19621 + }, + { + "epoch": 2.6006626905235253, + "grad_norm": 0.04382339492440224, + "learning_rate": 2.1561894255707234e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38415216, + "step": 19622 + }, + { + "epoch": 2.6007952286282308, + "grad_norm": 0.1933593600988388, + "learning_rate": 2.1547793259368093e-07, + "loss": 0.0006, + "num_input_tokens_seen": 38417752, + "step": 19623 + }, + { + "epoch": 2.600927766732936, + "grad_norm": 1.444933533668518, + "learning_rate": 2.153369666774513e-07, + "loss": 0.0066, + "num_input_tokens_seen": 38420288, + "step": 19624 + }, + { + "epoch": 2.601060304837641, + "grad_norm": 0.011689778417348862, + "learning_rate": 2.1519604481110124e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38421984, + "step": 19625 + }, + { + "epoch": 2.601192842942346, + "grad_norm": 10.084622383117676, + "learning_rate": 2.1505516699734692e-07, + "loss": 0.2342, + "num_input_tokens_seen": 38424192, + "step": 19626 + }, + { + "epoch": 2.601325381047051, + "grad_norm": 0.38927197456359863, + "learning_rate": 2.1491433323890586e-07, + "loss": 0.0029, + "num_input_tokens_seen": 38426304, + "step": 19627 + }, + { + "epoch": 2.6014579191517564, + "grad_norm": 5.027788162231445, + "learning_rate": 2.1477354353849262e-07, + "loss": 0.0308, + "num_input_tokens_seen": 38427920, + "step": 19628 + }, + { + "epoch": 2.601590457256461, + "grad_norm": 0.04316222295165062, + "learning_rate": 2.1463279789882217e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38429760, + "step": 19629 + }, + { + "epoch": 2.6017229953611665, + "grad_norm": 8.21989917755127, + "learning_rate": 2.1449209632260797e-07, + "loss": 0.1081, + "num_input_tokens_seen": 38431880, + "step": 19630 + }, + { + "epoch": 2.6018555334658715, + "grad_norm": 0.020876847207546234, + "learning_rate": 2.1435143881256282e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38433520, + "step": 19631 + }, + { + "epoch": 2.6019880715705765, + "grad_norm": 0.5855117440223694, + "learning_rate": 2.1421082537139842e-07, + "loss": 0.0016, + "num_input_tokens_seen": 38435848, + "step": 19632 + }, + { + "epoch": 2.6021206096752816, + "grad_norm": 2.1936540603637695, + "learning_rate": 2.1407025600182624e-07, + "loss": 0.0036, + "num_input_tokens_seen": 38437240, + "step": 19633 + }, + { + "epoch": 2.6022531477799866, + "grad_norm": 1.4765433073043823, + "learning_rate": 2.139297307065566e-07, + "loss": 0.006, + "num_input_tokens_seen": 38439280, + "step": 19634 + }, + { + "epoch": 2.602385685884692, + "grad_norm": 5.887489318847656, + "learning_rate": 2.1378924948829926e-07, + "loss": 0.0065, + "num_input_tokens_seen": 38440736, + "step": 19635 + }, + { + "epoch": 2.6025182239893967, + "grad_norm": 16.165857315063477, + "learning_rate": 2.136488123497621e-07, + "loss": 0.2541, + "num_input_tokens_seen": 38443480, + "step": 19636 + }, + { + "epoch": 2.602650762094102, + "grad_norm": 4.440367698669434, + "learning_rate": 2.135084192936532e-07, + "loss": 0.0349, + "num_input_tokens_seen": 38445752, + "step": 19637 + }, + { + "epoch": 2.602783300198807, + "grad_norm": 2.2103991508483887, + "learning_rate": 2.133680703226787e-07, + "loss": 0.0135, + "num_input_tokens_seen": 38447824, + "step": 19638 + }, + { + "epoch": 2.6029158383035123, + "grad_norm": 0.22706687450408936, + "learning_rate": 2.1322776543954565e-07, + "loss": 0.0006, + "num_input_tokens_seen": 38450312, + "step": 19639 + }, + { + "epoch": 2.6030483764082173, + "grad_norm": 0.0026858109049499035, + "learning_rate": 2.130875046469591e-07, + "loss": 0.0, + "num_input_tokens_seen": 38452440, + "step": 19640 + }, + { + "epoch": 2.6031809145129223, + "grad_norm": 1.896992802619934, + "learning_rate": 2.1294728794762297e-07, + "loss": 0.0041, + "num_input_tokens_seen": 38453728, + "step": 19641 + }, + { + "epoch": 2.603313452617628, + "grad_norm": 0.06573653966188431, + "learning_rate": 2.1280711534424097e-07, + "loss": 0.0004, + "num_input_tokens_seen": 38455392, + "step": 19642 + }, + { + "epoch": 2.6034459907223324, + "grad_norm": 10.373908996582031, + "learning_rate": 2.1266698683951535e-07, + "loss": 0.2887, + "num_input_tokens_seen": 38458016, + "step": 19643 + }, + { + "epoch": 2.603578528827038, + "grad_norm": 0.05259266495704651, + "learning_rate": 2.1252690243614814e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38459344, + "step": 19644 + }, + { + "epoch": 2.603711066931743, + "grad_norm": 0.007535157725214958, + "learning_rate": 2.123868621368405e-07, + "loss": 0.0, + "num_input_tokens_seen": 38460624, + "step": 19645 + }, + { + "epoch": 2.603843605036448, + "grad_norm": 10.991229057312012, + "learning_rate": 2.122468659442925e-07, + "loss": 0.0961, + "num_input_tokens_seen": 38462640, + "step": 19646 + }, + { + "epoch": 2.603976143141153, + "grad_norm": 7.567541122436523, + "learning_rate": 2.1210691386120308e-07, + "loss": 0.1298, + "num_input_tokens_seen": 38464632, + "step": 19647 + }, + { + "epoch": 2.604108681245858, + "grad_norm": 7.787850856781006, + "learning_rate": 2.119670058902709e-07, + "loss": 0.0786, + "num_input_tokens_seen": 38466824, + "step": 19648 + }, + { + "epoch": 2.6042412193505635, + "grad_norm": 0.005632633343338966, + "learning_rate": 2.1182714203419276e-07, + "loss": 0.0, + "num_input_tokens_seen": 38469408, + "step": 19649 + }, + { + "epoch": 2.604373757455268, + "grad_norm": 0.001964293885976076, + "learning_rate": 2.1168732229566557e-07, + "loss": 0.0, + "num_input_tokens_seen": 38471224, + "step": 19650 + }, + { + "epoch": 2.6045062955599736, + "grad_norm": 0.06351301819086075, + "learning_rate": 2.1154754667738615e-07, + "loss": 0.0003, + "num_input_tokens_seen": 38473056, + "step": 19651 + }, + { + "epoch": 2.6046388336646786, + "grad_norm": 9.607240676879883, + "learning_rate": 2.1140781518204866e-07, + "loss": 0.1671, + "num_input_tokens_seen": 38475232, + "step": 19652 + }, + { + "epoch": 2.6047713717693837, + "grad_norm": 0.015724049881100655, + "learning_rate": 2.112681278123474e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38476840, + "step": 19653 + }, + { + "epoch": 2.6049039098740887, + "grad_norm": 0.09589043259620667, + "learning_rate": 2.111284845709752e-07, + "loss": 0.0005, + "num_input_tokens_seen": 38478816, + "step": 19654 + }, + { + "epoch": 2.6050364479787937, + "grad_norm": 0.012372137047350407, + "learning_rate": 2.1098888546062514e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38480760, + "step": 19655 + }, + { + "epoch": 2.6051689860834992, + "grad_norm": 0.14548827707767487, + "learning_rate": 2.1084933048398787e-07, + "loss": 0.0003, + "num_input_tokens_seen": 38482240, + "step": 19656 + }, + { + "epoch": 2.6053015241882043, + "grad_norm": 0.029473038390278816, + "learning_rate": 2.107098196437554e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38483968, + "step": 19657 + }, + { + "epoch": 2.6054340622929093, + "grad_norm": 0.029876407235860825, + "learning_rate": 2.1057035294261674e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38485880, + "step": 19658 + }, + { + "epoch": 2.6055666003976143, + "grad_norm": 0.0411628820002079, + "learning_rate": 2.1043093038326078e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38488440, + "step": 19659 + }, + { + "epoch": 2.6056991385023194, + "grad_norm": 0.005252644885331392, + "learning_rate": 2.1029155196837624e-07, + "loss": 0.0, + "num_input_tokens_seen": 38489584, + "step": 19660 + }, + { + "epoch": 2.6058316766070244, + "grad_norm": 0.26746857166290283, + "learning_rate": 2.1015221770065015e-07, + "loss": 0.001, + "num_input_tokens_seen": 38491952, + "step": 19661 + }, + { + "epoch": 2.6059642147117295, + "grad_norm": 0.03166719526052475, + "learning_rate": 2.1001292758276864e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38493488, + "step": 19662 + }, + { + "epoch": 2.606096752816435, + "grad_norm": 3.5101544857025146, + "learning_rate": 2.0987368161741822e-07, + "loss": 0.0137, + "num_input_tokens_seen": 38495272, + "step": 19663 + }, + { + "epoch": 2.60622929092114, + "grad_norm": 3.5140631198883057, + "learning_rate": 2.0973447980728257e-07, + "loss": 0.0277, + "num_input_tokens_seen": 38497840, + "step": 19664 + }, + { + "epoch": 2.606361829025845, + "grad_norm": 0.0383896604180336, + "learning_rate": 2.0959532215504646e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38499408, + "step": 19665 + }, + { + "epoch": 2.60649436713055, + "grad_norm": 6.661445617675781, + "learning_rate": 2.0945620866339277e-07, + "loss": 0.0596, + "num_input_tokens_seen": 38500880, + "step": 19666 + }, + { + "epoch": 2.606626905235255, + "grad_norm": 0.8470848798751831, + "learning_rate": 2.0931713933500298e-07, + "loss": 0.0009, + "num_input_tokens_seen": 38502440, + "step": 19667 + }, + { + "epoch": 2.60675944333996, + "grad_norm": 8.894847869873047, + "learning_rate": 2.0917811417255963e-07, + "loss": 0.1285, + "num_input_tokens_seen": 38503896, + "step": 19668 + }, + { + "epoch": 2.606891981444665, + "grad_norm": 0.014560201205313206, + "learning_rate": 2.0903913317874198e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38506288, + "step": 19669 + }, + { + "epoch": 2.6070245195493706, + "grad_norm": 1.5319017171859741, + "learning_rate": 2.0890019635623092e-07, + "loss": 0.0102, + "num_input_tokens_seen": 38508792, + "step": 19670 + }, + { + "epoch": 2.6071570576540757, + "grad_norm": 5.752842903137207, + "learning_rate": 2.0876130370770438e-07, + "loss": 0.0739, + "num_input_tokens_seen": 38511128, + "step": 19671 + }, + { + "epoch": 2.6072895957587807, + "grad_norm": 0.006269050296396017, + "learning_rate": 2.086224552358407e-07, + "loss": 0.0, + "num_input_tokens_seen": 38512768, + "step": 19672 + }, + { + "epoch": 2.6074221338634858, + "grad_norm": 8.103026390075684, + "learning_rate": 2.0848365094331635e-07, + "loss": 0.0489, + "num_input_tokens_seen": 38515224, + "step": 19673 + }, + { + "epoch": 2.607554671968191, + "grad_norm": 6.0618085861206055, + "learning_rate": 2.083448908328084e-07, + "loss": 0.1086, + "num_input_tokens_seen": 38518168, + "step": 19674 + }, + { + "epoch": 2.607687210072896, + "grad_norm": 4.855641841888428, + "learning_rate": 2.0820617490699163e-07, + "loss": 0.0793, + "num_input_tokens_seen": 38520648, + "step": 19675 + }, + { + "epoch": 2.607819748177601, + "grad_norm": 0.0010878577595576644, + "learning_rate": 2.0806750316854114e-07, + "loss": 0.0, + "num_input_tokens_seen": 38521912, + "step": 19676 + }, + { + "epoch": 2.6079522862823064, + "grad_norm": 1.47963547706604, + "learning_rate": 2.0792887562013032e-07, + "loss": 0.0034, + "num_input_tokens_seen": 38523928, + "step": 19677 + }, + { + "epoch": 2.6080848243870114, + "grad_norm": 2.3803086280822754, + "learning_rate": 2.0779029226443205e-07, + "loss": 0.0096, + "num_input_tokens_seen": 38525632, + "step": 19678 + }, + { + "epoch": 2.6082173624917164, + "grad_norm": 10.975857734680176, + "learning_rate": 2.0765175310411778e-07, + "loss": 0.2862, + "num_input_tokens_seen": 38528128, + "step": 19679 + }, + { + "epoch": 2.6083499005964215, + "grad_norm": 3.9681003093719482, + "learning_rate": 2.07513258141859e-07, + "loss": 0.0132, + "num_input_tokens_seen": 38529696, + "step": 19680 + }, + { + "epoch": 2.6084824387011265, + "grad_norm": 2.673328399658203, + "learning_rate": 2.0737480738032666e-07, + "loss": 0.0138, + "num_input_tokens_seen": 38531536, + "step": 19681 + }, + { + "epoch": 2.6086149768058315, + "grad_norm": 3.9161808490753174, + "learning_rate": 2.0723640082218972e-07, + "loss": 0.0605, + "num_input_tokens_seen": 38533776, + "step": 19682 + }, + { + "epoch": 2.6087475149105366, + "grad_norm": 0.005036740563809872, + "learning_rate": 2.0709803847011655e-07, + "loss": 0.0, + "num_input_tokens_seen": 38535152, + "step": 19683 + }, + { + "epoch": 2.608880053015242, + "grad_norm": 15.925858497619629, + "learning_rate": 2.0695972032677508e-07, + "loss": 0.0739, + "num_input_tokens_seen": 38537336, + "step": 19684 + }, + { + "epoch": 2.609012591119947, + "grad_norm": 8.88155460357666, + "learning_rate": 2.068214463948315e-07, + "loss": 0.0698, + "num_input_tokens_seen": 38538352, + "step": 19685 + }, + { + "epoch": 2.609145129224652, + "grad_norm": 0.048788294196128845, + "learning_rate": 2.066832166769525e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38539952, + "step": 19686 + }, + { + "epoch": 2.609277667329357, + "grad_norm": 0.0354636050760746, + "learning_rate": 2.0654503117580381e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38541880, + "step": 19687 + }, + { + "epoch": 2.609410205434062, + "grad_norm": 5.091892242431641, + "learning_rate": 2.0640688989404883e-07, + "loss": 0.0289, + "num_input_tokens_seen": 38544648, + "step": 19688 + }, + { + "epoch": 2.6095427435387673, + "grad_norm": 2.6286814212799072, + "learning_rate": 2.0626879283435124e-07, + "loss": 0.0339, + "num_input_tokens_seen": 38546424, + "step": 19689 + }, + { + "epoch": 2.6096752816434723, + "grad_norm": 0.005861126352101564, + "learning_rate": 2.061307399993734e-07, + "loss": 0.0, + "num_input_tokens_seen": 38548296, + "step": 19690 + }, + { + "epoch": 2.6098078197481778, + "grad_norm": 2.2982873916625977, + "learning_rate": 2.0599273139177785e-07, + "loss": 0.0072, + "num_input_tokens_seen": 38550120, + "step": 19691 + }, + { + "epoch": 2.609940357852883, + "grad_norm": 0.004637660458683968, + "learning_rate": 2.058547670142244e-07, + "loss": 0.0, + "num_input_tokens_seen": 38552280, + "step": 19692 + }, + { + "epoch": 2.610072895957588, + "grad_norm": 6.655600547790527, + "learning_rate": 2.0571684686937433e-07, + "loss": 0.0874, + "num_input_tokens_seen": 38554056, + "step": 19693 + }, + { + "epoch": 2.610205434062293, + "grad_norm": 0.2024775743484497, + "learning_rate": 2.05578970959886e-07, + "loss": 0.001, + "num_input_tokens_seen": 38556608, + "step": 19694 + }, + { + "epoch": 2.610337972166998, + "grad_norm": 0.004785206634551287, + "learning_rate": 2.0544113928841757e-07, + "loss": 0.0, + "num_input_tokens_seen": 38558032, + "step": 19695 + }, + { + "epoch": 2.610470510271703, + "grad_norm": 0.0558529756963253, + "learning_rate": 2.0530335185762724e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38560616, + "step": 19696 + }, + { + "epoch": 2.610603048376408, + "grad_norm": 0.010985753498971462, + "learning_rate": 2.0516560867017089e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38562504, + "step": 19697 + }, + { + "epoch": 2.6107355864811135, + "grad_norm": 0.0026474446058273315, + "learning_rate": 2.050279097287053e-07, + "loss": 0.0, + "num_input_tokens_seen": 38564704, + "step": 19698 + }, + { + "epoch": 2.6108681245858185, + "grad_norm": 3.0736896991729736, + "learning_rate": 2.048902550358847e-07, + "loss": 0.0157, + "num_input_tokens_seen": 38566416, + "step": 19699 + }, + { + "epoch": 2.6110006626905236, + "grad_norm": 0.7595178484916687, + "learning_rate": 2.0475264459436346e-07, + "loss": 0.0089, + "num_input_tokens_seen": 38567952, + "step": 19700 + }, + { + "epoch": 2.6111332007952286, + "grad_norm": 0.0016683790599927306, + "learning_rate": 2.0461507840679435e-07, + "loss": 0.0, + "num_input_tokens_seen": 38569552, + "step": 19701 + }, + { + "epoch": 2.6112657388999336, + "grad_norm": 0.026529155671596527, + "learning_rate": 2.0447755647583e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38572280, + "step": 19702 + }, + { + "epoch": 2.6113982770046387, + "grad_norm": 0.9153798222541809, + "learning_rate": 2.0434007880412192e-07, + "loss": 0.005, + "num_input_tokens_seen": 38574536, + "step": 19703 + }, + { + "epoch": 2.6115308151093437, + "grad_norm": 0.008862215094268322, + "learning_rate": 2.0420264539432105e-07, + "loss": 0.0, + "num_input_tokens_seen": 38577160, + "step": 19704 + }, + { + "epoch": 2.611663353214049, + "grad_norm": 16.374385833740234, + "learning_rate": 2.0406525624907718e-07, + "loss": 0.054, + "num_input_tokens_seen": 38579064, + "step": 19705 + }, + { + "epoch": 2.6117958913187542, + "grad_norm": 0.15715391933918, + "learning_rate": 2.0392791137103822e-07, + "loss": 0.0008, + "num_input_tokens_seen": 38581208, + "step": 19706 + }, + { + "epoch": 2.6119284294234593, + "grad_norm": 2.4186015129089355, + "learning_rate": 2.03790610762854e-07, + "loss": 0.005, + "num_input_tokens_seen": 38583208, + "step": 19707 + }, + { + "epoch": 2.6120609675281643, + "grad_norm": 2.1412997245788574, + "learning_rate": 2.0365335442717072e-07, + "loss": 0.0141, + "num_input_tokens_seen": 38584760, + "step": 19708 + }, + { + "epoch": 2.6121935056328693, + "grad_norm": 0.009292821399867535, + "learning_rate": 2.035161423666343e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38586568, + "step": 19709 + }, + { + "epoch": 2.612326043737575, + "grad_norm": 0.012547353282570839, + "learning_rate": 2.0337897458389154e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38588744, + "step": 19710 + }, + { + "epoch": 2.6124585818422794, + "grad_norm": 0.0037126350216567516, + "learning_rate": 2.032418510815859e-07, + "loss": 0.0, + "num_input_tokens_seen": 38591640, + "step": 19711 + }, + { + "epoch": 2.612591119946985, + "grad_norm": 0.5583294034004211, + "learning_rate": 2.0310477186236242e-07, + "loss": 0.0012, + "num_input_tokens_seen": 38593264, + "step": 19712 + }, + { + "epoch": 2.61272365805169, + "grad_norm": 1.7309136390686035, + "learning_rate": 2.0296773692886352e-07, + "loss": 0.0069, + "num_input_tokens_seen": 38594472, + "step": 19713 + }, + { + "epoch": 2.612856196156395, + "grad_norm": 0.0544796884059906, + "learning_rate": 2.0283074628373119e-07, + "loss": 0.0003, + "num_input_tokens_seen": 38595904, + "step": 19714 + }, + { + "epoch": 2.6129887342611, + "grad_norm": 5.0832953453063965, + "learning_rate": 2.0269379992960636e-07, + "loss": 0.0216, + "num_input_tokens_seen": 38597760, + "step": 19715 + }, + { + "epoch": 2.613121272365805, + "grad_norm": 12.340690612792969, + "learning_rate": 2.0255689786913003e-07, + "loss": 0.2385, + "num_input_tokens_seen": 38599080, + "step": 19716 + }, + { + "epoch": 2.6132538104705105, + "grad_norm": 8.06580924987793, + "learning_rate": 2.0242004010494198e-07, + "loss": 0.0291, + "num_input_tokens_seen": 38601344, + "step": 19717 + }, + { + "epoch": 2.613386348575215, + "grad_norm": 0.4431365728378296, + "learning_rate": 2.022832266396807e-07, + "loss": 0.001, + "num_input_tokens_seen": 38604200, + "step": 19718 + }, + { + "epoch": 2.6135188866799206, + "grad_norm": 14.249427795410156, + "learning_rate": 2.0214645747598376e-07, + "loss": 0.1641, + "num_input_tokens_seen": 38605816, + "step": 19719 + }, + { + "epoch": 2.6136514247846256, + "grad_norm": 0.4850020706653595, + "learning_rate": 2.02009732616488e-07, + "loss": 0.001, + "num_input_tokens_seen": 38607088, + "step": 19720 + }, + { + "epoch": 2.6137839628893307, + "grad_norm": 11.44346809387207, + "learning_rate": 2.0187305206382984e-07, + "loss": 0.0855, + "num_input_tokens_seen": 38608880, + "step": 19721 + }, + { + "epoch": 2.6139165009940357, + "grad_norm": 6.483201503753662, + "learning_rate": 2.01736415820645e-07, + "loss": 0.1367, + "num_input_tokens_seen": 38610800, + "step": 19722 + }, + { + "epoch": 2.6140490390987408, + "grad_norm": 2.7182555198669434, + "learning_rate": 2.015998238895675e-07, + "loss": 0.0409, + "num_input_tokens_seen": 38612824, + "step": 19723 + }, + { + "epoch": 2.6141815772034462, + "grad_norm": 0.010767446830868721, + "learning_rate": 2.0146327627323102e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38614688, + "step": 19724 + }, + { + "epoch": 2.614314115308151, + "grad_norm": 11.460749626159668, + "learning_rate": 2.013267729742685e-07, + "loss": 0.1839, + "num_input_tokens_seen": 38616656, + "step": 19725 + }, + { + "epoch": 2.6144466534128563, + "grad_norm": 14.148900985717773, + "learning_rate": 2.0119031399531087e-07, + "loss": 0.2544, + "num_input_tokens_seen": 38619312, + "step": 19726 + }, + { + "epoch": 2.6145791915175614, + "grad_norm": 3.1021413803100586, + "learning_rate": 2.010538993389899e-07, + "loss": 0.0367, + "num_input_tokens_seen": 38621176, + "step": 19727 + }, + { + "epoch": 2.6147117296222664, + "grad_norm": 0.01930908113718033, + "learning_rate": 2.0091752900793626e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38623120, + "step": 19728 + }, + { + "epoch": 2.6148442677269714, + "grad_norm": 2.3271484375, + "learning_rate": 2.0078120300477872e-07, + "loss": 0.0228, + "num_input_tokens_seen": 38624912, + "step": 19729 + }, + { + "epoch": 2.6149768058316765, + "grad_norm": 4.01146936416626, + "learning_rate": 2.006449213321457e-07, + "loss": 0.0341, + "num_input_tokens_seen": 38627240, + "step": 19730 + }, + { + "epoch": 2.615109343936382, + "grad_norm": 0.2731715738773346, + "learning_rate": 2.0050868399266483e-07, + "loss": 0.0013, + "num_input_tokens_seen": 38629728, + "step": 19731 + }, + { + "epoch": 2.6152418820410865, + "grad_norm": 13.137104034423828, + "learning_rate": 2.0037249098896234e-07, + "loss": 0.1277, + "num_input_tokens_seen": 38631696, + "step": 19732 + }, + { + "epoch": 2.615374420145792, + "grad_norm": 0.14065180718898773, + "learning_rate": 2.0023634232366473e-07, + "loss": 0.0007, + "num_input_tokens_seen": 38633200, + "step": 19733 + }, + { + "epoch": 2.615506958250497, + "grad_norm": 3.7055394649505615, + "learning_rate": 2.0010023799939743e-07, + "loss": 0.0395, + "num_input_tokens_seen": 38635432, + "step": 19734 + }, + { + "epoch": 2.615639496355202, + "grad_norm": 0.007867217995226383, + "learning_rate": 1.9996417801878416e-07, + "loss": 0.0, + "num_input_tokens_seen": 38637232, + "step": 19735 + }, + { + "epoch": 2.615772034459907, + "grad_norm": 1.7203454971313477, + "learning_rate": 1.9982816238444807e-07, + "loss": 0.0112, + "num_input_tokens_seen": 38638688, + "step": 19736 + }, + { + "epoch": 2.615904572564612, + "grad_norm": 1.2049305438995361, + "learning_rate": 1.9969219109901155e-07, + "loss": 0.0037, + "num_input_tokens_seen": 38641152, + "step": 19737 + }, + { + "epoch": 2.6160371106693177, + "grad_norm": 0.01971018873155117, + "learning_rate": 1.9955626416509693e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38642288, + "step": 19738 + }, + { + "epoch": 2.6161696487740227, + "grad_norm": 0.021865589544177055, + "learning_rate": 1.9942038158532407e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38644648, + "step": 19739 + }, + { + "epoch": 2.6163021868787277, + "grad_norm": 0.001193903386592865, + "learning_rate": 1.992845433623139e-07, + "loss": 0.0, + "num_input_tokens_seen": 38646744, + "step": 19740 + }, + { + "epoch": 2.6164347249834328, + "grad_norm": 0.19450832903385162, + "learning_rate": 1.9914874949868463e-07, + "loss": 0.0005, + "num_input_tokens_seen": 38648472, + "step": 19741 + }, + { + "epoch": 2.616567263088138, + "grad_norm": 0.022375067695975304, + "learning_rate": 1.990129999970547e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38650328, + "step": 19742 + }, + { + "epoch": 2.616699801192843, + "grad_norm": 0.10503179579973221, + "learning_rate": 1.988772948600415e-07, + "loss": 0.0004, + "num_input_tokens_seen": 38652400, + "step": 19743 + }, + { + "epoch": 2.616832339297548, + "grad_norm": 0.010269483551383018, + "learning_rate": 1.9874163409026175e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38654144, + "step": 19744 + }, + { + "epoch": 2.6169648774022534, + "grad_norm": 12.245887756347656, + "learning_rate": 1.9860601769033038e-07, + "loss": 0.0645, + "num_input_tokens_seen": 38655864, + "step": 19745 + }, + { + "epoch": 2.6170974155069584, + "grad_norm": 2.6453616619110107, + "learning_rate": 1.9847044566286305e-07, + "loss": 0.0112, + "num_input_tokens_seen": 38657440, + "step": 19746 + }, + { + "epoch": 2.6172299536116634, + "grad_norm": 5.343145370483398, + "learning_rate": 1.9833491801047296e-07, + "loss": 0.1042, + "num_input_tokens_seen": 38659568, + "step": 19747 + }, + { + "epoch": 2.6173624917163685, + "grad_norm": 10.84970474243164, + "learning_rate": 1.981994347357738e-07, + "loss": 0.0948, + "num_input_tokens_seen": 38661736, + "step": 19748 + }, + { + "epoch": 2.6174950298210735, + "grad_norm": 0.003658079309388995, + "learning_rate": 1.980639958413777e-07, + "loss": 0.0, + "num_input_tokens_seen": 38663016, + "step": 19749 + }, + { + "epoch": 2.6176275679257786, + "grad_norm": 0.006934002973139286, + "learning_rate": 1.9792860132989504e-07, + "loss": 0.0, + "num_input_tokens_seen": 38664952, + "step": 19750 + }, + { + "epoch": 2.6177601060304836, + "grad_norm": 16.418716430664062, + "learning_rate": 1.9779325120393767e-07, + "loss": 0.1344, + "num_input_tokens_seen": 38667184, + "step": 19751 + }, + { + "epoch": 2.617892644135189, + "grad_norm": 8.170684814453125, + "learning_rate": 1.97657945466114e-07, + "loss": 0.2289, + "num_input_tokens_seen": 38670488, + "step": 19752 + }, + { + "epoch": 2.618025182239894, + "grad_norm": 1.1484946012496948, + "learning_rate": 1.9752268411903418e-07, + "loss": 0.0041, + "num_input_tokens_seen": 38673144, + "step": 19753 + }, + { + "epoch": 2.618157720344599, + "grad_norm": 0.0027064899913966656, + "learning_rate": 1.9738746716530504e-07, + "loss": 0.0, + "num_input_tokens_seen": 38674712, + "step": 19754 + }, + { + "epoch": 2.618290258449304, + "grad_norm": 0.07577101141214371, + "learning_rate": 1.9725229460753447e-07, + "loss": 0.0005, + "num_input_tokens_seen": 38676624, + "step": 19755 + }, + { + "epoch": 2.6184227965540092, + "grad_norm": 8.376341819763184, + "learning_rate": 1.9711716644832734e-07, + "loss": 0.0273, + "num_input_tokens_seen": 38679440, + "step": 19756 + }, + { + "epoch": 2.6185553346587143, + "grad_norm": 4.80224609375, + "learning_rate": 1.9698208269029073e-07, + "loss": 0.0567, + "num_input_tokens_seen": 38680960, + "step": 19757 + }, + { + "epoch": 2.6186878727634193, + "grad_norm": 5.651721000671387, + "learning_rate": 1.9684704333602783e-07, + "loss": 0.0269, + "num_input_tokens_seen": 38683400, + "step": 19758 + }, + { + "epoch": 2.618820410868125, + "grad_norm": 0.0049553425051271915, + "learning_rate": 1.9671204838814296e-07, + "loss": 0.0, + "num_input_tokens_seen": 38685408, + "step": 19759 + }, + { + "epoch": 2.61895294897283, + "grad_norm": 0.030724303796887398, + "learning_rate": 1.96577097849239e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38687472, + "step": 19760 + }, + { + "epoch": 2.619085487077535, + "grad_norm": 11.732736587524414, + "learning_rate": 1.9644219172191754e-07, + "loss": 0.0595, + "num_input_tokens_seen": 38689408, + "step": 19761 + }, + { + "epoch": 2.61921802518224, + "grad_norm": 0.009748328477144241, + "learning_rate": 1.963073300087795e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38691312, + "step": 19762 + }, + { + "epoch": 2.619350563286945, + "grad_norm": 0.7420490980148315, + "learning_rate": 1.9617251271242533e-07, + "loss": 0.0298, + "num_input_tokens_seen": 38693256, + "step": 19763 + }, + { + "epoch": 2.61948310139165, + "grad_norm": 0.0563078299164772, + "learning_rate": 1.9603773983545488e-07, + "loss": 0.0003, + "num_input_tokens_seen": 38695472, + "step": 19764 + }, + { + "epoch": 2.619615639496355, + "grad_norm": 0.011395145207643509, + "learning_rate": 1.9590301138046607e-07, + "loss": 0.0, + "num_input_tokens_seen": 38697192, + "step": 19765 + }, + { + "epoch": 2.6197481776010605, + "grad_norm": 0.24805064499378204, + "learning_rate": 1.9576832735005686e-07, + "loss": 0.0017, + "num_input_tokens_seen": 38698840, + "step": 19766 + }, + { + "epoch": 2.6198807157057655, + "grad_norm": 3.8066189289093018, + "learning_rate": 1.9563368774682373e-07, + "loss": 0.0595, + "num_input_tokens_seen": 38701296, + "step": 19767 + }, + { + "epoch": 2.6200132538104706, + "grad_norm": 0.22127187252044678, + "learning_rate": 1.954990925733624e-07, + "loss": 0.0008, + "num_input_tokens_seen": 38702952, + "step": 19768 + }, + { + "epoch": 2.6201457919151756, + "grad_norm": 3.8527517318725586, + "learning_rate": 1.953645418322686e-07, + "loss": 0.0126, + "num_input_tokens_seen": 38705208, + "step": 19769 + }, + { + "epoch": 2.6202783300198806, + "grad_norm": 9.357540130615234, + "learning_rate": 1.952300355261366e-07, + "loss": 0.0837, + "num_input_tokens_seen": 38706672, + "step": 19770 + }, + { + "epoch": 2.6204108681245857, + "grad_norm": 5.711824417114258, + "learning_rate": 1.9509557365755938e-07, + "loss": 0.0542, + "num_input_tokens_seen": 38709752, + "step": 19771 + }, + { + "epoch": 2.6205434062292907, + "grad_norm": 0.030969567596912384, + "learning_rate": 1.949611562291298e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38710776, + "step": 19772 + }, + { + "epoch": 2.620675944333996, + "grad_norm": 2.8518829345703125, + "learning_rate": 1.9482678324343863e-07, + "loss": 0.0289, + "num_input_tokens_seen": 38712656, + "step": 19773 + }, + { + "epoch": 2.6208084824387012, + "grad_norm": 1.0748099088668823, + "learning_rate": 1.9469245470307762e-07, + "loss": 0.0017, + "num_input_tokens_seen": 38714408, + "step": 19774 + }, + { + "epoch": 2.6209410205434063, + "grad_norm": 7.755034446716309, + "learning_rate": 1.945581706106367e-07, + "loss": 0.1486, + "num_input_tokens_seen": 38716488, + "step": 19775 + }, + { + "epoch": 2.6210735586481113, + "grad_norm": 5.888571739196777, + "learning_rate": 1.944239309687046e-07, + "loss": 0.2444, + "num_input_tokens_seen": 38718296, + "step": 19776 + }, + { + "epoch": 2.6212060967528164, + "grad_norm": 0.017607321962714195, + "learning_rate": 1.9428973577986982e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38719936, + "step": 19777 + }, + { + "epoch": 2.6213386348575214, + "grad_norm": 0.03258022293448448, + "learning_rate": 1.9415558504671888e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38721592, + "step": 19778 + }, + { + "epoch": 2.6214711729622264, + "grad_norm": 0.00026192908990196884, + "learning_rate": 1.9402147877183974e-07, + "loss": 0.0, + "num_input_tokens_seen": 38722896, + "step": 19779 + }, + { + "epoch": 2.621603711066932, + "grad_norm": 3.2329914569854736, + "learning_rate": 1.938874169578167e-07, + "loss": 0.0207, + "num_input_tokens_seen": 38726096, + "step": 19780 + }, + { + "epoch": 2.621736249171637, + "grad_norm": 0.029188159853219986, + "learning_rate": 1.937533996072355e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38728080, + "step": 19781 + }, + { + "epoch": 2.621868787276342, + "grad_norm": 0.021158233284950256, + "learning_rate": 1.9361942672267985e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38729856, + "step": 19782 + }, + { + "epoch": 2.622001325381047, + "grad_norm": 0.0012191388523206115, + "learning_rate": 1.9348549830673246e-07, + "loss": 0.0, + "num_input_tokens_seen": 38730856, + "step": 19783 + }, + { + "epoch": 2.622133863485752, + "grad_norm": 8.106405258178711, + "learning_rate": 1.9335161436197598e-07, + "loss": 0.136, + "num_input_tokens_seen": 38732456, + "step": 19784 + }, + { + "epoch": 2.622266401590457, + "grad_norm": 0.0011384871322661638, + "learning_rate": 1.9321777489099192e-07, + "loss": 0.0, + "num_input_tokens_seen": 38733800, + "step": 19785 + }, + { + "epoch": 2.622398939695162, + "grad_norm": 0.012584177777171135, + "learning_rate": 1.930839798963599e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38736632, + "step": 19786 + }, + { + "epoch": 2.6225314777998676, + "grad_norm": 4.519819736480713, + "learning_rate": 1.9295022938066066e-07, + "loss": 0.0261, + "num_input_tokens_seen": 38739512, + "step": 19787 + }, + { + "epoch": 2.6226640159045727, + "grad_norm": 0.11794044077396393, + "learning_rate": 1.9281652334647265e-07, + "loss": 0.0008, + "num_input_tokens_seen": 38741512, + "step": 19788 + }, + { + "epoch": 2.6227965540092777, + "grad_norm": 0.05870175361633301, + "learning_rate": 1.926828617963733e-07, + "loss": 0.0004, + "num_input_tokens_seen": 38742696, + "step": 19789 + }, + { + "epoch": 2.6229290921139827, + "grad_norm": 8.147379875183105, + "learning_rate": 1.9254924473294052e-07, + "loss": 0.078, + "num_input_tokens_seen": 38745128, + "step": 19790 + }, + { + "epoch": 2.6230616302186878, + "grad_norm": 3.9784226417541504, + "learning_rate": 1.9241567215875e-07, + "loss": 0.0438, + "num_input_tokens_seen": 38747608, + "step": 19791 + }, + { + "epoch": 2.6231941683233932, + "grad_norm": 0.04090183600783348, + "learning_rate": 1.9228214407637695e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38748864, + "step": 19792 + }, + { + "epoch": 2.623326706428098, + "grad_norm": 6.516653060913086, + "learning_rate": 1.9214866048839654e-07, + "loss": 0.0429, + "num_input_tokens_seen": 38750536, + "step": 19793 + }, + { + "epoch": 2.6234592445328033, + "grad_norm": 0.03304719924926758, + "learning_rate": 1.9201522139738167e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38753008, + "step": 19794 + }, + { + "epoch": 2.6235917826375084, + "grad_norm": 0.004906546790152788, + "learning_rate": 1.918818268059061e-07, + "loss": 0.0, + "num_input_tokens_seen": 38754536, + "step": 19795 + }, + { + "epoch": 2.6237243207422134, + "grad_norm": 5.391851902008057, + "learning_rate": 1.917484767165409e-07, + "loss": 0.0338, + "num_input_tokens_seen": 38755592, + "step": 19796 + }, + { + "epoch": 2.6238568588469184, + "grad_norm": 1.4228373765945435, + "learning_rate": 1.916151711318573e-07, + "loss": 0.0389, + "num_input_tokens_seen": 38758480, + "step": 19797 + }, + { + "epoch": 2.6239893969516235, + "grad_norm": 0.01757168397307396, + "learning_rate": 1.91481910054426e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38760736, + "step": 19798 + }, + { + "epoch": 2.624121935056329, + "grad_norm": 0.0023080434184521437, + "learning_rate": 1.9134869348681556e-07, + "loss": 0.0, + "num_input_tokens_seen": 38762568, + "step": 19799 + }, + { + "epoch": 2.6242544731610336, + "grad_norm": 8.883512496948242, + "learning_rate": 1.9121552143159528e-07, + "loss": 0.1218, + "num_input_tokens_seen": 38764800, + "step": 19800 + }, + { + "epoch": 2.624387011265739, + "grad_norm": 0.8498203754425049, + "learning_rate": 1.910823938913328e-07, + "loss": 0.0025, + "num_input_tokens_seen": 38766760, + "step": 19801 + }, + { + "epoch": 2.624519549370444, + "grad_norm": 0.004248417913913727, + "learning_rate": 1.909493108685942e-07, + "loss": 0.0, + "num_input_tokens_seen": 38767984, + "step": 19802 + }, + { + "epoch": 2.624652087475149, + "grad_norm": 0.005198840983211994, + "learning_rate": 1.908162723659457e-07, + "loss": 0.0, + "num_input_tokens_seen": 38769504, + "step": 19803 + }, + { + "epoch": 2.624784625579854, + "grad_norm": 0.0010867668315768242, + "learning_rate": 1.9068327838595246e-07, + "loss": 0.0, + "num_input_tokens_seen": 38770896, + "step": 19804 + }, + { + "epoch": 2.624917163684559, + "grad_norm": 0.01152516808360815, + "learning_rate": 1.9055032893117915e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38772896, + "step": 19805 + }, + { + "epoch": 2.6250497017892647, + "grad_norm": 5.803727626800537, + "learning_rate": 1.9041742400418866e-07, + "loss": 0.0076, + "num_input_tokens_seen": 38775808, + "step": 19806 + }, + { + "epoch": 2.6251822398939693, + "grad_norm": 4.524374961853027, + "learning_rate": 1.9028456360754345e-07, + "loss": 0.0321, + "num_input_tokens_seen": 38777784, + "step": 19807 + }, + { + "epoch": 2.6253147779986747, + "grad_norm": 0.0013586783315986395, + "learning_rate": 1.9015174774380528e-07, + "loss": 0.0, + "num_input_tokens_seen": 38778744, + "step": 19808 + }, + { + "epoch": 2.62544731610338, + "grad_norm": 6.243824481964111, + "learning_rate": 1.900189764155347e-07, + "loss": 0.0482, + "num_input_tokens_seen": 38780192, + "step": 19809 + }, + { + "epoch": 2.625579854208085, + "grad_norm": 3.989624500274658, + "learning_rate": 1.8988624962529151e-07, + "loss": 0.0304, + "num_input_tokens_seen": 38783488, + "step": 19810 + }, + { + "epoch": 2.62571239231279, + "grad_norm": 0.006453671958297491, + "learning_rate": 1.8975356737563567e-07, + "loss": 0.0, + "num_input_tokens_seen": 38784560, + "step": 19811 + }, + { + "epoch": 2.625844930417495, + "grad_norm": 10.892070770263672, + "learning_rate": 1.8962092966912483e-07, + "loss": 0.0658, + "num_input_tokens_seen": 38786240, + "step": 19812 + }, + { + "epoch": 2.6259774685222004, + "grad_norm": 0.002781492890790105, + "learning_rate": 1.894883365083164e-07, + "loss": 0.0, + "num_input_tokens_seen": 38787784, + "step": 19813 + }, + { + "epoch": 2.626110006626905, + "grad_norm": 0.007054154761135578, + "learning_rate": 1.8935578789576607e-07, + "loss": 0.0, + "num_input_tokens_seen": 38789096, + "step": 19814 + }, + { + "epoch": 2.6262425447316105, + "grad_norm": 0.28943291306495667, + "learning_rate": 1.8922328383403104e-07, + "loss": 0.0017, + "num_input_tokens_seen": 38791016, + "step": 19815 + }, + { + "epoch": 2.6263750828363155, + "grad_norm": 0.05346442013978958, + "learning_rate": 1.890908243256645e-07, + "loss": 0.0004, + "num_input_tokens_seen": 38792808, + "step": 19816 + }, + { + "epoch": 2.6265076209410205, + "grad_norm": 5.49091911315918, + "learning_rate": 1.8895840937322134e-07, + "loss": 0.1693, + "num_input_tokens_seen": 38795824, + "step": 19817 + }, + { + "epoch": 2.6266401590457256, + "grad_norm": 0.026946239173412323, + "learning_rate": 1.8882603897925482e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38797368, + "step": 19818 + }, + { + "epoch": 2.6267726971504306, + "grad_norm": 0.0009256310295313597, + "learning_rate": 1.8869371314631623e-07, + "loss": 0.0, + "num_input_tokens_seen": 38799104, + "step": 19819 + }, + { + "epoch": 2.626905235255136, + "grad_norm": 5.3360395431518555, + "learning_rate": 1.8856143187695713e-07, + "loss": 0.0324, + "num_input_tokens_seen": 38800496, + "step": 19820 + }, + { + "epoch": 2.6270377733598407, + "grad_norm": 0.17862439155578613, + "learning_rate": 1.8842919517372854e-07, + "loss": 0.0012, + "num_input_tokens_seen": 38803248, + "step": 19821 + }, + { + "epoch": 2.627170311464546, + "grad_norm": 4.026628494262695, + "learning_rate": 1.8829700303917953e-07, + "loss": 0.0225, + "num_input_tokens_seen": 38805848, + "step": 19822 + }, + { + "epoch": 2.627302849569251, + "grad_norm": 7.196382999420166, + "learning_rate": 1.8816485547585916e-07, + "loss": 0.101, + "num_input_tokens_seen": 38807448, + "step": 19823 + }, + { + "epoch": 2.6274353876739562, + "grad_norm": 0.016932401806116104, + "learning_rate": 1.8803275248631542e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38809568, + "step": 19824 + }, + { + "epoch": 2.6275679257786613, + "grad_norm": 0.22900737822055817, + "learning_rate": 1.879006940730946e-07, + "loss": 0.0004, + "num_input_tokens_seen": 38811464, + "step": 19825 + }, + { + "epoch": 2.6277004638833663, + "grad_norm": 0.26289063692092896, + "learning_rate": 1.8776868023874407e-07, + "loss": 0.0014, + "num_input_tokens_seen": 38813176, + "step": 19826 + }, + { + "epoch": 2.627833001988072, + "grad_norm": 14.400627136230469, + "learning_rate": 1.8763671098580792e-07, + "loss": 0.1487, + "num_input_tokens_seen": 38814808, + "step": 19827 + }, + { + "epoch": 2.627965540092777, + "grad_norm": 0.0033969345968216658, + "learning_rate": 1.8750478631683167e-07, + "loss": 0.0, + "num_input_tokens_seen": 38815816, + "step": 19828 + }, + { + "epoch": 2.628098078197482, + "grad_norm": 9.857460975646973, + "learning_rate": 1.873729062343585e-07, + "loss": 0.0272, + "num_input_tokens_seen": 38817824, + "step": 19829 + }, + { + "epoch": 2.628230616302187, + "grad_norm": 0.21560563147068024, + "learning_rate": 1.8724107074093057e-07, + "loss": 0.0007, + "num_input_tokens_seen": 38819672, + "step": 19830 + }, + { + "epoch": 2.628363154406892, + "grad_norm": 5.877389907836914, + "learning_rate": 1.8710927983909084e-07, + "loss": 0.0533, + "num_input_tokens_seen": 38822784, + "step": 19831 + }, + { + "epoch": 2.628495692511597, + "grad_norm": 0.019122915342450142, + "learning_rate": 1.8697753353137976e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38824544, + "step": 19832 + }, + { + "epoch": 2.628628230616302, + "grad_norm": 0.0026731607504189014, + "learning_rate": 1.86845831820337e-07, + "loss": 0.0, + "num_input_tokens_seen": 38825944, + "step": 19833 + }, + { + "epoch": 2.6287607687210075, + "grad_norm": 6.117276191711426, + "learning_rate": 1.8671417470850273e-07, + "loss": 0.16, + "num_input_tokens_seen": 38827936, + "step": 19834 + }, + { + "epoch": 2.6288933068257125, + "grad_norm": 12.60441780090332, + "learning_rate": 1.8658256219841463e-07, + "loss": 0.1214, + "num_input_tokens_seen": 38829528, + "step": 19835 + }, + { + "epoch": 2.6290258449304176, + "grad_norm": 10.467912673950195, + "learning_rate": 1.8645099429261127e-07, + "loss": 0.2085, + "num_input_tokens_seen": 38831216, + "step": 19836 + }, + { + "epoch": 2.6291583830351226, + "grad_norm": 0.005683396942913532, + "learning_rate": 1.8631947099362862e-07, + "loss": 0.0, + "num_input_tokens_seen": 38832512, + "step": 19837 + }, + { + "epoch": 2.6292909211398277, + "grad_norm": 0.00114800117444247, + "learning_rate": 1.8618799230400274e-07, + "loss": 0.0, + "num_input_tokens_seen": 38833880, + "step": 19838 + }, + { + "epoch": 2.6294234592445327, + "grad_norm": 2.264744997024536, + "learning_rate": 1.8605655822626828e-07, + "loss": 0.0152, + "num_input_tokens_seen": 38836352, + "step": 19839 + }, + { + "epoch": 2.6295559973492377, + "grad_norm": 27.523895263671875, + "learning_rate": 1.8592516876295957e-07, + "loss": 0.0684, + "num_input_tokens_seen": 38837696, + "step": 19840 + }, + { + "epoch": 2.629688535453943, + "grad_norm": 0.05799351632595062, + "learning_rate": 1.8579382391661043e-07, + "loss": 0.0003, + "num_input_tokens_seen": 38839480, + "step": 19841 + }, + { + "epoch": 2.6298210735586482, + "grad_norm": 1.1518042087554932, + "learning_rate": 1.8566252368975275e-07, + "loss": 0.0027, + "num_input_tokens_seen": 38841496, + "step": 19842 + }, + { + "epoch": 2.6299536116633533, + "grad_norm": 5.671234607696533, + "learning_rate": 1.8553126808491832e-07, + "loss": 0.0395, + "num_input_tokens_seen": 38843912, + "step": 19843 + }, + { + "epoch": 2.6300861497680583, + "grad_norm": 0.9645246267318726, + "learning_rate": 1.854000571046377e-07, + "loss": 0.0018, + "num_input_tokens_seen": 38846984, + "step": 19844 + }, + { + "epoch": 2.6302186878727634, + "grad_norm": 0.0026462748646736145, + "learning_rate": 1.852688907514402e-07, + "loss": 0.0, + "num_input_tokens_seen": 38848728, + "step": 19845 + }, + { + "epoch": 2.6303512259774684, + "grad_norm": 8.64064884185791, + "learning_rate": 1.851377690278555e-07, + "loss": 0.047, + "num_input_tokens_seen": 38850792, + "step": 19846 + }, + { + "epoch": 2.6304837640821734, + "grad_norm": 4.990572929382324, + "learning_rate": 1.8500669193641186e-07, + "loss": 0.0417, + "num_input_tokens_seen": 38853112, + "step": 19847 + }, + { + "epoch": 2.630616302186879, + "grad_norm": 0.002292696852236986, + "learning_rate": 1.8487565947963638e-07, + "loss": 0.0, + "num_input_tokens_seen": 38855384, + "step": 19848 + }, + { + "epoch": 2.630748840291584, + "grad_norm": 13.123645782470703, + "learning_rate": 1.847446716600551e-07, + "loss": 0.0749, + "num_input_tokens_seen": 38857440, + "step": 19849 + }, + { + "epoch": 2.630881378396289, + "grad_norm": 0.028990715742111206, + "learning_rate": 1.8461372848019355e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38859288, + "step": 19850 + }, + { + "epoch": 2.631013916500994, + "grad_norm": 0.040633898228406906, + "learning_rate": 1.844828299425766e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38860912, + "step": 19851 + }, + { + "epoch": 2.631146454605699, + "grad_norm": 0.005968830548226833, + "learning_rate": 1.8435197604972838e-07, + "loss": 0.0, + "num_input_tokens_seen": 38862248, + "step": 19852 + }, + { + "epoch": 2.631278992710404, + "grad_norm": 0.07968772202730179, + "learning_rate": 1.8422116680417156e-07, + "loss": 0.0003, + "num_input_tokens_seen": 38863480, + "step": 19853 + }, + { + "epoch": 2.631411530815109, + "grad_norm": 0.011876623146235943, + "learning_rate": 1.8409040220842834e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38865680, + "step": 19854 + }, + { + "epoch": 2.6315440689198146, + "grad_norm": 0.02079741843044758, + "learning_rate": 1.839596822650197e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38867120, + "step": 19855 + }, + { + "epoch": 2.6316766070245197, + "grad_norm": 0.18633709847927094, + "learning_rate": 1.8382900697646587e-07, + "loss": 0.0009, + "num_input_tokens_seen": 38868936, + "step": 19856 + }, + { + "epoch": 2.6318091451292247, + "grad_norm": 4.441710948944092, + "learning_rate": 1.8369837634528654e-07, + "loss": 0.0746, + "num_input_tokens_seen": 38871032, + "step": 19857 + }, + { + "epoch": 2.6319416832339297, + "grad_norm": 16.848413467407227, + "learning_rate": 1.8356779037400075e-07, + "loss": 0.1032, + "num_input_tokens_seen": 38873120, + "step": 19858 + }, + { + "epoch": 2.632074221338635, + "grad_norm": 0.01287469919770956, + "learning_rate": 1.8343724906512593e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38874776, + "step": 19859 + }, + { + "epoch": 2.63220675944334, + "grad_norm": 0.027764474973082542, + "learning_rate": 1.8330675242117929e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38875920, + "step": 19860 + }, + { + "epoch": 2.632339297548045, + "grad_norm": 0.007164470851421356, + "learning_rate": 1.8317630044467626e-07, + "loss": 0.0, + "num_input_tokens_seen": 38879152, + "step": 19861 + }, + { + "epoch": 2.6324718356527503, + "grad_norm": 0.00500657269731164, + "learning_rate": 1.8304589313813264e-07, + "loss": 0.0, + "num_input_tokens_seen": 38881712, + "step": 19862 + }, + { + "epoch": 2.6326043737574554, + "grad_norm": 0.07775654643774033, + "learning_rate": 1.8291553050406251e-07, + "loss": 0.0006, + "num_input_tokens_seen": 38883064, + "step": 19863 + }, + { + "epoch": 2.6327369118621604, + "grad_norm": 2.480588436126709, + "learning_rate": 1.8278521254497944e-07, + "loss": 0.011, + "num_input_tokens_seen": 38884392, + "step": 19864 + }, + { + "epoch": 2.6328694499668654, + "grad_norm": 0.3145724833011627, + "learning_rate": 1.826549392633964e-07, + "loss": 0.0014, + "num_input_tokens_seen": 38886384, + "step": 19865 + }, + { + "epoch": 2.6330019880715705, + "grad_norm": 0.016920631751418114, + "learning_rate": 1.8252471066182415e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38888648, + "step": 19866 + }, + { + "epoch": 2.6331345261762755, + "grad_norm": 3.8198182582855225, + "learning_rate": 1.8239452674277485e-07, + "loss": 0.0208, + "num_input_tokens_seen": 38890496, + "step": 19867 + }, + { + "epoch": 2.6332670642809806, + "grad_norm": 3.271207332611084, + "learning_rate": 1.8226438750875763e-07, + "loss": 0.0164, + "num_input_tokens_seen": 38892096, + "step": 19868 + }, + { + "epoch": 2.633399602385686, + "grad_norm": 0.02111208066344261, + "learning_rate": 1.8213429296228158e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38893688, + "step": 19869 + }, + { + "epoch": 2.633532140490391, + "grad_norm": 4.609452724456787, + "learning_rate": 1.8200424310585606e-07, + "loss": 0.0188, + "num_input_tokens_seen": 38896368, + "step": 19870 + }, + { + "epoch": 2.633664678595096, + "grad_norm": 0.015867408365011215, + "learning_rate": 1.8187423794198716e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38897640, + "step": 19871 + }, + { + "epoch": 2.633797216699801, + "grad_norm": 12.80637264251709, + "learning_rate": 1.8174427747318258e-07, + "loss": 0.219, + "num_input_tokens_seen": 38898656, + "step": 19872 + }, + { + "epoch": 2.633929754804506, + "grad_norm": 0.004511797334998846, + "learning_rate": 1.8161436170194778e-07, + "loss": 0.0, + "num_input_tokens_seen": 38900280, + "step": 19873 + }, + { + "epoch": 2.6340622929092112, + "grad_norm": 2.459672212600708, + "learning_rate": 1.8148449063078723e-07, + "loss": 0.0212, + "num_input_tokens_seen": 38902632, + "step": 19874 + }, + { + "epoch": 2.6341948310139163, + "grad_norm": 8.193979263305664, + "learning_rate": 1.8135466426220495e-07, + "loss": 0.0357, + "num_input_tokens_seen": 38904736, + "step": 19875 + }, + { + "epoch": 2.6343273691186218, + "grad_norm": 1.1457719802856445, + "learning_rate": 1.8122488259870452e-07, + "loss": 0.0038, + "num_input_tokens_seen": 38906368, + "step": 19876 + }, + { + "epoch": 2.634459907223327, + "grad_norm": 3.193279504776001, + "learning_rate": 1.8109514564278786e-07, + "loss": 0.0093, + "num_input_tokens_seen": 38908208, + "step": 19877 + }, + { + "epoch": 2.634592445328032, + "grad_norm": 0.01182013563811779, + "learning_rate": 1.8096545339695655e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38909704, + "step": 19878 + }, + { + "epoch": 2.634724983432737, + "grad_norm": 6.101032733917236, + "learning_rate": 1.8083580586371134e-07, + "loss": 0.116, + "num_input_tokens_seen": 38912320, + "step": 19879 + }, + { + "epoch": 2.634857521537442, + "grad_norm": 0.003880533156916499, + "learning_rate": 1.8070620304555142e-07, + "loss": 0.0, + "num_input_tokens_seen": 38914000, + "step": 19880 + }, + { + "epoch": 2.6349900596421474, + "grad_norm": 1.8346939086914062, + "learning_rate": 1.8057664494497613e-07, + "loss": 0.0038, + "num_input_tokens_seen": 38915184, + "step": 19881 + }, + { + "epoch": 2.635122597746852, + "grad_norm": 0.03053249418735504, + "learning_rate": 1.804471315644829e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38917856, + "step": 19882 + }, + { + "epoch": 2.6352551358515575, + "grad_norm": 5.642035007476807, + "learning_rate": 1.803176629065695e-07, + "loss": 0.0425, + "num_input_tokens_seen": 38920192, + "step": 19883 + }, + { + "epoch": 2.6353876739562625, + "grad_norm": 15.781593322753906, + "learning_rate": 1.8018823897373167e-07, + "loss": 0.2407, + "num_input_tokens_seen": 38922224, + "step": 19884 + }, + { + "epoch": 2.6355202120609675, + "grad_norm": 0.04698694124817848, + "learning_rate": 1.8005885976846522e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38923768, + "step": 19885 + }, + { + "epoch": 2.6356527501656726, + "grad_norm": 0.007424511481076479, + "learning_rate": 1.7992952529326369e-07, + "loss": 0.0, + "num_input_tokens_seen": 38925576, + "step": 19886 + }, + { + "epoch": 2.6357852882703776, + "grad_norm": 0.003051064210012555, + "learning_rate": 1.7980023555062175e-07, + "loss": 0.0, + "num_input_tokens_seen": 38926960, + "step": 19887 + }, + { + "epoch": 2.635917826375083, + "grad_norm": 1.1274958848953247, + "learning_rate": 1.7967099054303217e-07, + "loss": 0.0052, + "num_input_tokens_seen": 38928576, + "step": 19888 + }, + { + "epoch": 2.6360503644797877, + "grad_norm": 0.09458813816308975, + "learning_rate": 1.795417902729865e-07, + "loss": 0.0005, + "num_input_tokens_seen": 38930224, + "step": 19889 + }, + { + "epoch": 2.636182902584493, + "grad_norm": 0.07431977242231369, + "learning_rate": 1.7941263474297616e-07, + "loss": 0.0004, + "num_input_tokens_seen": 38931752, + "step": 19890 + }, + { + "epoch": 2.636315440689198, + "grad_norm": 0.05226682871580124, + "learning_rate": 1.7928352395549104e-07, + "loss": 0.0003, + "num_input_tokens_seen": 38934688, + "step": 19891 + }, + { + "epoch": 2.6364479787939032, + "grad_norm": 15.73684310913086, + "learning_rate": 1.7915445791302e-07, + "loss": 0.2547, + "num_input_tokens_seen": 38936656, + "step": 19892 + }, + { + "epoch": 2.6365805168986083, + "grad_norm": 3.693511724472046, + "learning_rate": 1.7902543661805217e-07, + "loss": 0.0431, + "num_input_tokens_seen": 38938440, + "step": 19893 + }, + { + "epoch": 2.6367130550033133, + "grad_norm": 14.132867813110352, + "learning_rate": 1.7889646007307553e-07, + "loss": 0.113, + "num_input_tokens_seen": 38940632, + "step": 19894 + }, + { + "epoch": 2.636845593108019, + "grad_norm": 0.5871493816375732, + "learning_rate": 1.7876752828057647e-07, + "loss": 0.002, + "num_input_tokens_seen": 38942072, + "step": 19895 + }, + { + "epoch": 2.6369781312127234, + "grad_norm": 0.05219053104519844, + "learning_rate": 1.7863864124304047e-07, + "loss": 0.0003, + "num_input_tokens_seen": 38944152, + "step": 19896 + }, + { + "epoch": 2.637110669317429, + "grad_norm": 0.008763797581195831, + "learning_rate": 1.7850979896295278e-07, + "loss": 0.0, + "num_input_tokens_seen": 38945872, + "step": 19897 + }, + { + "epoch": 2.637243207422134, + "grad_norm": 0.024244124069809914, + "learning_rate": 1.7838100144279808e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38947832, + "step": 19898 + }, + { + "epoch": 2.637375745526839, + "grad_norm": 17.92586326599121, + "learning_rate": 1.782522486850588e-07, + "loss": 0.1235, + "num_input_tokens_seen": 38949520, + "step": 19899 + }, + { + "epoch": 2.637508283631544, + "grad_norm": 0.19952444732189178, + "learning_rate": 1.7812354069221826e-07, + "loss": 0.001, + "num_input_tokens_seen": 38950832, + "step": 19900 + }, + { + "epoch": 2.637640821736249, + "grad_norm": 0.004044728819280863, + "learning_rate": 1.7799487746675754e-07, + "loss": 0.0, + "num_input_tokens_seen": 38952136, + "step": 19901 + }, + { + "epoch": 2.6377733598409545, + "grad_norm": 0.09898005425930023, + "learning_rate": 1.7786625901115767e-07, + "loss": 0.0005, + "num_input_tokens_seen": 38955088, + "step": 19902 + }, + { + "epoch": 2.637905897945659, + "grad_norm": 3.3267083168029785, + "learning_rate": 1.777376853278978e-07, + "loss": 0.033, + "num_input_tokens_seen": 38956840, + "step": 19903 + }, + { + "epoch": 2.6380384360503646, + "grad_norm": 0.019604451954364777, + "learning_rate": 1.7760915641945736e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38957904, + "step": 19904 + }, + { + "epoch": 2.6381709741550696, + "grad_norm": 0.34980395436286926, + "learning_rate": 1.774806722883149e-07, + "loss": 0.0015, + "num_input_tokens_seen": 38959536, + "step": 19905 + }, + { + "epoch": 2.6383035122597747, + "grad_norm": 3.580228567123413, + "learning_rate": 1.773522329369473e-07, + "loss": 0.015, + "num_input_tokens_seen": 38961464, + "step": 19906 + }, + { + "epoch": 2.6384360503644797, + "grad_norm": 0.026035038754343987, + "learning_rate": 1.7722383836783096e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38962704, + "step": 19907 + }, + { + "epoch": 2.6385685884691847, + "grad_norm": 0.003725981106981635, + "learning_rate": 1.7709548858344112e-07, + "loss": 0.0, + "num_input_tokens_seen": 38964488, + "step": 19908 + }, + { + "epoch": 2.63870112657389, + "grad_norm": 4.675106525421143, + "learning_rate": 1.7696718358625297e-07, + "loss": 0.0308, + "num_input_tokens_seen": 38966400, + "step": 19909 + }, + { + "epoch": 2.6388336646785953, + "grad_norm": 0.006604394875466824, + "learning_rate": 1.7683892337873987e-07, + "loss": 0.0, + "num_input_tokens_seen": 38967816, + "step": 19910 + }, + { + "epoch": 2.6389662027833003, + "grad_norm": 4.267369270324707, + "learning_rate": 1.7671070796337535e-07, + "loss": 0.0399, + "num_input_tokens_seen": 38969656, + "step": 19911 + }, + { + "epoch": 2.6390987408880053, + "grad_norm": 0.3346015512943268, + "learning_rate": 1.7658253734263108e-07, + "loss": 0.0014, + "num_input_tokens_seen": 38971112, + "step": 19912 + }, + { + "epoch": 2.6392312789927104, + "grad_norm": 0.14386485517024994, + "learning_rate": 1.7645441151897786e-07, + "loss": 0.0004, + "num_input_tokens_seen": 38973728, + "step": 19913 + }, + { + "epoch": 2.6393638170974154, + "grad_norm": 2.9693682193756104, + "learning_rate": 1.76326330494887e-07, + "loss": 0.0138, + "num_input_tokens_seen": 38976016, + "step": 19914 + }, + { + "epoch": 2.6394963552021204, + "grad_norm": 2.772496223449707, + "learning_rate": 1.761982942728277e-07, + "loss": 0.0116, + "num_input_tokens_seen": 38977704, + "step": 19915 + }, + { + "epoch": 2.639628893306826, + "grad_norm": 4.7372026443481445, + "learning_rate": 1.7607030285526767e-07, + "loss": 0.053, + "num_input_tokens_seen": 38980416, + "step": 19916 + }, + { + "epoch": 2.639761431411531, + "grad_norm": 0.011975102126598358, + "learning_rate": 1.7594235624467605e-07, + "loss": 0.0001, + "num_input_tokens_seen": 38981872, + "step": 19917 + }, + { + "epoch": 2.639893969516236, + "grad_norm": 0.3138246238231659, + "learning_rate": 1.7581445444351862e-07, + "loss": 0.0013, + "num_input_tokens_seen": 38983144, + "step": 19918 + }, + { + "epoch": 2.640026507620941, + "grad_norm": 7.129380226135254, + "learning_rate": 1.756865974542621e-07, + "loss": 0.1041, + "num_input_tokens_seen": 38984920, + "step": 19919 + }, + { + "epoch": 2.640159045725646, + "grad_norm": 5.539901256561279, + "learning_rate": 1.7555878527937164e-07, + "loss": 0.0971, + "num_input_tokens_seen": 38986480, + "step": 19920 + }, + { + "epoch": 2.640291583830351, + "grad_norm": 17.023334503173828, + "learning_rate": 1.7543101792131145e-07, + "loss": 0.1507, + "num_input_tokens_seen": 38989328, + "step": 19921 + }, + { + "epoch": 2.640424121935056, + "grad_norm": 14.090907096862793, + "learning_rate": 1.7530329538254425e-07, + "loss": 0.2161, + "num_input_tokens_seen": 38990888, + "step": 19922 + }, + { + "epoch": 2.6405566600397616, + "grad_norm": 0.0051709674298763275, + "learning_rate": 1.7517561766553337e-07, + "loss": 0.0, + "num_input_tokens_seen": 38992536, + "step": 19923 + }, + { + "epoch": 2.6406891981444667, + "grad_norm": 1.495229721069336, + "learning_rate": 1.750479847727407e-07, + "loss": 0.0025, + "num_input_tokens_seen": 38994320, + "step": 19924 + }, + { + "epoch": 2.6408217362491717, + "grad_norm": 0.15092863142490387, + "learning_rate": 1.749203967066268e-07, + "loss": 0.0006, + "num_input_tokens_seen": 38997080, + "step": 19925 + }, + { + "epoch": 2.6409542743538768, + "grad_norm": 0.03637028485536575, + "learning_rate": 1.7479285346965162e-07, + "loss": 0.0002, + "num_input_tokens_seen": 38999280, + "step": 19926 + }, + { + "epoch": 2.641086812458582, + "grad_norm": 4.16127872467041, + "learning_rate": 1.7466535506427434e-07, + "loss": 0.0698, + "num_input_tokens_seen": 39001240, + "step": 19927 + }, + { + "epoch": 2.641219350563287, + "grad_norm": 0.0031853823456913233, + "learning_rate": 1.7453790149295296e-07, + "loss": 0.0, + "num_input_tokens_seen": 39002928, + "step": 19928 + }, + { + "epoch": 2.641351888667992, + "grad_norm": 0.8193962574005127, + "learning_rate": 1.744104927581447e-07, + "loss": 0.0019, + "num_input_tokens_seen": 39004568, + "step": 19929 + }, + { + "epoch": 2.6414844267726973, + "grad_norm": 0.03946051001548767, + "learning_rate": 1.742831288623073e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39006584, + "step": 19930 + }, + { + "epoch": 2.6416169648774024, + "grad_norm": 0.007678720634430647, + "learning_rate": 1.7415580980789522e-07, + "loss": 0.0, + "num_input_tokens_seen": 39008040, + "step": 19931 + }, + { + "epoch": 2.6417495029821074, + "grad_norm": 1.199819564819336, + "learning_rate": 1.7402853559736393e-07, + "loss": 0.0063, + "num_input_tokens_seen": 39010128, + "step": 19932 + }, + { + "epoch": 2.6418820410868125, + "grad_norm": 17.225217819213867, + "learning_rate": 1.7390130623316653e-07, + "loss": 0.207, + "num_input_tokens_seen": 39011696, + "step": 19933 + }, + { + "epoch": 2.6420145791915175, + "grad_norm": 0.04431646689772606, + "learning_rate": 1.7377412171775687e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39014336, + "step": 19934 + }, + { + "epoch": 2.6421471172962225, + "grad_norm": 14.560745239257812, + "learning_rate": 1.7364698205358715e-07, + "loss": 0.0488, + "num_input_tokens_seen": 39016568, + "step": 19935 + }, + { + "epoch": 2.6422796554009276, + "grad_norm": 0.004816378001123667, + "learning_rate": 1.7351988724310848e-07, + "loss": 0.0, + "num_input_tokens_seen": 39018424, + "step": 19936 + }, + { + "epoch": 2.642412193505633, + "grad_norm": 1.3067947626113892, + "learning_rate": 1.7339283728877139e-07, + "loss": 0.0083, + "num_input_tokens_seen": 39020640, + "step": 19937 + }, + { + "epoch": 2.642544731610338, + "grad_norm": 9.525301933288574, + "learning_rate": 1.732658321930253e-07, + "loss": 0.0819, + "num_input_tokens_seen": 39022872, + "step": 19938 + }, + { + "epoch": 2.642677269715043, + "grad_norm": 0.13601581752300262, + "learning_rate": 1.731388719583188e-07, + "loss": 0.0007, + "num_input_tokens_seen": 39024160, + "step": 19939 + }, + { + "epoch": 2.642809807819748, + "grad_norm": 4.722878932952881, + "learning_rate": 1.7301195658709996e-07, + "loss": 0.0496, + "num_input_tokens_seen": 39026040, + "step": 19940 + }, + { + "epoch": 2.642942345924453, + "grad_norm": 11.298348426818848, + "learning_rate": 1.7288508608181652e-07, + "loss": 0.1451, + "num_input_tokens_seen": 39028560, + "step": 19941 + }, + { + "epoch": 2.6430748840291582, + "grad_norm": 0.6074582934379578, + "learning_rate": 1.7275826044491378e-07, + "loss": 0.0017, + "num_input_tokens_seen": 39031816, + "step": 19942 + }, + { + "epoch": 2.6432074221338633, + "grad_norm": 0.04165493696928024, + "learning_rate": 1.7263147967883753e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39033400, + "step": 19943 + }, + { + "epoch": 2.6433399602385688, + "grad_norm": 0.010375391691923141, + "learning_rate": 1.7250474378603137e-07, + "loss": 0.0, + "num_input_tokens_seen": 39034640, + "step": 19944 + }, + { + "epoch": 2.643472498343274, + "grad_norm": 4.652912139892578, + "learning_rate": 1.7237805276893975e-07, + "loss": 0.0718, + "num_input_tokens_seen": 39036184, + "step": 19945 + }, + { + "epoch": 2.643605036447979, + "grad_norm": 21.221359252929688, + "learning_rate": 1.7225140663000462e-07, + "loss": 0.1263, + "num_input_tokens_seen": 39038120, + "step": 19946 + }, + { + "epoch": 2.643737574552684, + "grad_norm": 0.020312394946813583, + "learning_rate": 1.721248053716687e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39039672, + "step": 19947 + }, + { + "epoch": 2.643870112657389, + "grad_norm": 0.028406143188476562, + "learning_rate": 1.719982489963723e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39041048, + "step": 19948 + }, + { + "epoch": 2.644002650762094, + "grad_norm": 3.6935346126556396, + "learning_rate": 1.718717375065554e-07, + "loss": 0.0266, + "num_input_tokens_seen": 39042824, + "step": 19949 + }, + { + "epoch": 2.644135188866799, + "grad_norm": 15.869932174682617, + "learning_rate": 1.7174527090465798e-07, + "loss": 0.0229, + "num_input_tokens_seen": 39044888, + "step": 19950 + }, + { + "epoch": 2.6442677269715045, + "grad_norm": 0.4168020486831665, + "learning_rate": 1.7161884919311783e-07, + "loss": 0.0014, + "num_input_tokens_seen": 39046600, + "step": 19951 + }, + { + "epoch": 2.6444002650762095, + "grad_norm": 21.429616928100586, + "learning_rate": 1.7149247237437215e-07, + "loss": 0.0845, + "num_input_tokens_seen": 39047992, + "step": 19952 + }, + { + "epoch": 2.6445328031809145, + "grad_norm": 0.9962335824966431, + "learning_rate": 1.713661404508582e-07, + "loss": 0.0064, + "num_input_tokens_seen": 39049656, + "step": 19953 + }, + { + "epoch": 2.6446653412856196, + "grad_norm": 0.1471930295228958, + "learning_rate": 1.712398534250112e-07, + "loss": 0.0005, + "num_input_tokens_seen": 39051072, + "step": 19954 + }, + { + "epoch": 2.6447978793903246, + "grad_norm": 4.736299514770508, + "learning_rate": 1.7111361129926674e-07, + "loss": 0.0157, + "num_input_tokens_seen": 39052592, + "step": 19955 + }, + { + "epoch": 2.6449304174950297, + "grad_norm": 0.05444004759192467, + "learning_rate": 1.7098741407605868e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39053912, + "step": 19956 + }, + { + "epoch": 2.6450629555997347, + "grad_norm": 9.947511672973633, + "learning_rate": 1.7086126175781926e-07, + "loss": 0.166, + "num_input_tokens_seen": 39056856, + "step": 19957 + }, + { + "epoch": 2.64519549370444, + "grad_norm": 0.0481526181101799, + "learning_rate": 1.7073515434698207e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39058928, + "step": 19958 + }, + { + "epoch": 2.645328031809145, + "grad_norm": 0.006598897278308868, + "learning_rate": 1.7060909184597797e-07, + "loss": 0.0, + "num_input_tokens_seen": 39060424, + "step": 19959 + }, + { + "epoch": 2.6454605699138503, + "grad_norm": 0.0009480995358899236, + "learning_rate": 1.704830742572372e-07, + "loss": 0.0, + "num_input_tokens_seen": 39061736, + "step": 19960 + }, + { + "epoch": 2.6455931080185553, + "grad_norm": 0.0014780407072976232, + "learning_rate": 1.703571015831901e-07, + "loss": 0.0, + "num_input_tokens_seen": 39062840, + "step": 19961 + }, + { + "epoch": 2.6457256461232603, + "grad_norm": 0.6250894665718079, + "learning_rate": 1.7023117382626518e-07, + "loss": 0.0021, + "num_input_tokens_seen": 39064896, + "step": 19962 + }, + { + "epoch": 2.645858184227966, + "grad_norm": 0.002213862957432866, + "learning_rate": 1.7010529098889034e-07, + "loss": 0.0, + "num_input_tokens_seen": 39066224, + "step": 19963 + }, + { + "epoch": 2.6459907223326704, + "grad_norm": 7.723633766174316, + "learning_rate": 1.6997945307349302e-07, + "loss": 0.0511, + "num_input_tokens_seen": 39069408, + "step": 19964 + }, + { + "epoch": 2.646123260437376, + "grad_norm": 0.018439266830682755, + "learning_rate": 1.6985366008249875e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39070664, + "step": 19965 + }, + { + "epoch": 2.646255798542081, + "grad_norm": 12.208579063415527, + "learning_rate": 1.697279120183337e-07, + "loss": 0.1349, + "num_input_tokens_seen": 39072512, + "step": 19966 + }, + { + "epoch": 2.646388336646786, + "grad_norm": 5.34529447555542, + "learning_rate": 1.6960220888342233e-07, + "loss": 0.0333, + "num_input_tokens_seen": 39074344, + "step": 19967 + }, + { + "epoch": 2.646520874751491, + "grad_norm": 3.3625733852386475, + "learning_rate": 1.6947655068018793e-07, + "loss": 0.0318, + "num_input_tokens_seen": 39076168, + "step": 19968 + }, + { + "epoch": 2.646653412856196, + "grad_norm": 0.09358619153499603, + "learning_rate": 1.6935093741105302e-07, + "loss": 0.0005, + "num_input_tokens_seen": 39078136, + "step": 19969 + }, + { + "epoch": 2.6467859509609015, + "grad_norm": 11.284737586975098, + "learning_rate": 1.6922536907843983e-07, + "loss": 0.105, + "num_input_tokens_seen": 39079288, + "step": 19970 + }, + { + "epoch": 2.646918489065606, + "grad_norm": 12.570952415466309, + "learning_rate": 1.6909984568477007e-07, + "loss": 0.0717, + "num_input_tokens_seen": 39082000, + "step": 19971 + }, + { + "epoch": 2.6470510271703116, + "grad_norm": 15.462092399597168, + "learning_rate": 1.6897436723246284e-07, + "loss": 0.2438, + "num_input_tokens_seen": 39083848, + "step": 19972 + }, + { + "epoch": 2.6471835652750166, + "grad_norm": 0.006328567396849394, + "learning_rate": 1.6884893372393823e-07, + "loss": 0.0, + "num_input_tokens_seen": 39085256, + "step": 19973 + }, + { + "epoch": 2.6473161033797217, + "grad_norm": 0.011133326217532158, + "learning_rate": 1.6872354516161427e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39086768, + "step": 19974 + }, + { + "epoch": 2.6474486414844267, + "grad_norm": 5.95301628112793, + "learning_rate": 1.6859820154790819e-07, + "loss": 0.0932, + "num_input_tokens_seen": 39088624, + "step": 19975 + }, + { + "epoch": 2.6475811795891318, + "grad_norm": 3.5081679821014404, + "learning_rate": 1.6847290288523722e-07, + "loss": 0.0609, + "num_input_tokens_seen": 39090136, + "step": 19976 + }, + { + "epoch": 2.6477137176938372, + "grad_norm": 5.2423624992370605, + "learning_rate": 1.6834764917601749e-07, + "loss": 0.0132, + "num_input_tokens_seen": 39091904, + "step": 19977 + }, + { + "epoch": 2.647846255798542, + "grad_norm": 0.34484416246414185, + "learning_rate": 1.6822244042266345e-07, + "loss": 0.0013, + "num_input_tokens_seen": 39094352, + "step": 19978 + }, + { + "epoch": 2.6479787939032473, + "grad_norm": 0.11933187395334244, + "learning_rate": 1.6809727662758958e-07, + "loss": 0.0004, + "num_input_tokens_seen": 39096840, + "step": 19979 + }, + { + "epoch": 2.6481113320079523, + "grad_norm": 0.03326604142785072, + "learning_rate": 1.6797215779320864e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39098472, + "step": 19980 + }, + { + "epoch": 2.6482438701126574, + "grad_norm": 0.00959748961031437, + "learning_rate": 1.6784708392193344e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39101304, + "step": 19981 + }, + { + "epoch": 2.6483764082173624, + "grad_norm": 4.482297897338867, + "learning_rate": 1.6772205501617482e-07, + "loss": 0.0214, + "num_input_tokens_seen": 39103456, + "step": 19982 + }, + { + "epoch": 2.6485089463220675, + "grad_norm": 4.820776462554932, + "learning_rate": 1.675970710783445e-07, + "loss": 0.0455, + "num_input_tokens_seen": 39105648, + "step": 19983 + }, + { + "epoch": 2.648641484426773, + "grad_norm": 0.055431704968214035, + "learning_rate": 1.674721321108519e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39108032, + "step": 19984 + }, + { + "epoch": 2.6487740225314775, + "grad_norm": 3.939870595932007, + "learning_rate": 1.673472381161051e-07, + "loss": 0.0462, + "num_input_tokens_seen": 39110272, + "step": 19985 + }, + { + "epoch": 2.648906560636183, + "grad_norm": 0.06875581294298172, + "learning_rate": 1.67222389096513e-07, + "loss": 0.0004, + "num_input_tokens_seen": 39113216, + "step": 19986 + }, + { + "epoch": 2.649039098740888, + "grad_norm": 0.004226888995617628, + "learning_rate": 1.670975850544823e-07, + "loss": 0.0, + "num_input_tokens_seen": 39115896, + "step": 19987 + }, + { + "epoch": 2.649171636845593, + "grad_norm": 2.1376240253448486, + "learning_rate": 1.6697282599241998e-07, + "loss": 0.006, + "num_input_tokens_seen": 39116952, + "step": 19988 + }, + { + "epoch": 2.649304174950298, + "grad_norm": 13.674684524536133, + "learning_rate": 1.6684811191273104e-07, + "loss": 0.1966, + "num_input_tokens_seen": 39119520, + "step": 19989 + }, + { + "epoch": 2.649436713055003, + "grad_norm": 0.5422107577323914, + "learning_rate": 1.6672344281781994e-07, + "loss": 0.0021, + "num_input_tokens_seen": 39121696, + "step": 19990 + }, + { + "epoch": 2.6495692511597086, + "grad_norm": 0.005797104444354773, + "learning_rate": 1.6659881871009032e-07, + "loss": 0.0, + "num_input_tokens_seen": 39123520, + "step": 19991 + }, + { + "epoch": 2.6497017892644132, + "grad_norm": 4.735316276550293, + "learning_rate": 1.6647423959194554e-07, + "loss": 0.0399, + "num_input_tokens_seen": 39125328, + "step": 19992 + }, + { + "epoch": 2.6498343273691187, + "grad_norm": 0.011999591253697872, + "learning_rate": 1.6634970546578672e-07, + "loss": 0.0, + "num_input_tokens_seen": 39126584, + "step": 19993 + }, + { + "epoch": 2.6499668654738238, + "grad_norm": 0.24750719964504242, + "learning_rate": 1.6622521633401612e-07, + "loss": 0.0011, + "num_input_tokens_seen": 39128784, + "step": 19994 + }, + { + "epoch": 2.650099403578529, + "grad_norm": 3.515619993209839, + "learning_rate": 1.6610077219903292e-07, + "loss": 0.0575, + "num_input_tokens_seen": 39131256, + "step": 19995 + }, + { + "epoch": 2.650231941683234, + "grad_norm": 7.976091384887695, + "learning_rate": 1.659763730632369e-07, + "loss": 0.1472, + "num_input_tokens_seen": 39133248, + "step": 19996 + }, + { + "epoch": 2.650364479787939, + "grad_norm": 8.442564010620117, + "learning_rate": 1.658520189290269e-07, + "loss": 0.0996, + "num_input_tokens_seen": 39135272, + "step": 19997 + }, + { + "epoch": 2.6504970178926444, + "grad_norm": 0.8371568918228149, + "learning_rate": 1.657277097988e-07, + "loss": 0.0038, + "num_input_tokens_seen": 39137376, + "step": 19998 + }, + { + "epoch": 2.6506295559973494, + "grad_norm": 7.709126949310303, + "learning_rate": 1.6560344567495307e-07, + "loss": 0.0709, + "num_input_tokens_seen": 39139376, + "step": 19999 + }, + { + "epoch": 2.6507620941020544, + "grad_norm": 0.45898982882499695, + "learning_rate": 1.654792265598823e-07, + "loss": 0.002, + "num_input_tokens_seen": 39141376, + "step": 20000 + }, + { + "epoch": 2.6508946322067595, + "grad_norm": 3.2740676403045654, + "learning_rate": 1.6535505245598215e-07, + "loss": 0.0165, + "num_input_tokens_seen": 39143256, + "step": 20001 + }, + { + "epoch": 2.6510271703114645, + "grad_norm": 0.0170175451785326, + "learning_rate": 1.6523092336564768e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39144640, + "step": 20002 + }, + { + "epoch": 2.6511597084161695, + "grad_norm": 11.66884994506836, + "learning_rate": 1.6510683929127192e-07, + "loss": 0.1005, + "num_input_tokens_seen": 39147272, + "step": 20003 + }, + { + "epoch": 2.6512922465208746, + "grad_norm": 0.01447171438485384, + "learning_rate": 1.649828002352466e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39148952, + "step": 20004 + }, + { + "epoch": 2.65142478462558, + "grad_norm": 5.252066612243652, + "learning_rate": 1.648588061999637e-07, + "loss": 0.0328, + "num_input_tokens_seen": 39150856, + "step": 20005 + }, + { + "epoch": 2.651557322730285, + "grad_norm": 7.80180549621582, + "learning_rate": 1.647348571878138e-07, + "loss": 0.0612, + "num_input_tokens_seen": 39152544, + "step": 20006 + }, + { + "epoch": 2.65168986083499, + "grad_norm": 0.15701831877231598, + "learning_rate": 1.646109532011872e-07, + "loss": 0.0009, + "num_input_tokens_seen": 39154024, + "step": 20007 + }, + { + "epoch": 2.651822398939695, + "grad_norm": 3.1685616970062256, + "learning_rate": 1.6448709424247256e-07, + "loss": 0.0168, + "num_input_tokens_seen": 39155264, + "step": 20008 + }, + { + "epoch": 2.6519549370444, + "grad_norm": 0.016454381868243217, + "learning_rate": 1.6436328031405796e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39156616, + "step": 20009 + }, + { + "epoch": 2.6520874751491053, + "grad_norm": 8.28774642944336, + "learning_rate": 1.642395114183301e-07, + "loss": 0.0458, + "num_input_tokens_seen": 39159264, + "step": 20010 + }, + { + "epoch": 2.6522200132538103, + "grad_norm": 4.7118048667907715, + "learning_rate": 1.64115787557676e-07, + "loss": 0.0521, + "num_input_tokens_seen": 39161656, + "step": 20011 + }, + { + "epoch": 2.6523525513585158, + "grad_norm": 2.692826271057129, + "learning_rate": 1.639921087344812e-07, + "loss": 0.0113, + "num_input_tokens_seen": 39163192, + "step": 20012 + }, + { + "epoch": 2.652485089463221, + "grad_norm": 9.105107307434082, + "learning_rate": 1.6386847495113017e-07, + "loss": 0.0741, + "num_input_tokens_seen": 39165008, + "step": 20013 + }, + { + "epoch": 2.652617627567926, + "grad_norm": 3.4778482913970947, + "learning_rate": 1.6374488621000662e-07, + "loss": 0.023, + "num_input_tokens_seen": 39166880, + "step": 20014 + }, + { + "epoch": 2.652750165672631, + "grad_norm": 5.938304901123047, + "learning_rate": 1.6362134251349332e-07, + "loss": 0.0589, + "num_input_tokens_seen": 39169112, + "step": 20015 + }, + { + "epoch": 2.652882703777336, + "grad_norm": 2.8815531730651855, + "learning_rate": 1.634978438639717e-07, + "loss": 0.0362, + "num_input_tokens_seen": 39171424, + "step": 20016 + }, + { + "epoch": 2.653015241882041, + "grad_norm": 5.1492815017700195, + "learning_rate": 1.633743902638238e-07, + "loss": 0.0474, + "num_input_tokens_seen": 39173640, + "step": 20017 + }, + { + "epoch": 2.653147779986746, + "grad_norm": 0.01200445368885994, + "learning_rate": 1.6325098171542986e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39176200, + "step": 20018 + }, + { + "epoch": 2.6532803180914515, + "grad_norm": 0.16650548577308655, + "learning_rate": 1.6312761822116914e-07, + "loss": 0.0007, + "num_input_tokens_seen": 39177848, + "step": 20019 + }, + { + "epoch": 2.6534128561961565, + "grad_norm": 0.0016646881122142076, + "learning_rate": 1.6300429978341997e-07, + "loss": 0.0, + "num_input_tokens_seen": 39179048, + "step": 20020 + }, + { + "epoch": 2.6535453943008616, + "grad_norm": 0.009938289411365986, + "learning_rate": 1.6288102640455993e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39180648, + "step": 20021 + }, + { + "epoch": 2.6536779324055666, + "grad_norm": 0.06868237257003784, + "learning_rate": 1.62757798086966e-07, + "loss": 0.0005, + "num_input_tokens_seen": 39182360, + "step": 20022 + }, + { + "epoch": 2.6538104705102716, + "grad_norm": 0.0029044339898973703, + "learning_rate": 1.626346148330138e-07, + "loss": 0.0, + "num_input_tokens_seen": 39184008, + "step": 20023 + }, + { + "epoch": 2.6539430086149767, + "grad_norm": 3.9993114471435547, + "learning_rate": 1.6251147664507917e-07, + "loss": 0.0263, + "num_input_tokens_seen": 39186424, + "step": 20024 + }, + { + "epoch": 2.6540755467196817, + "grad_norm": 0.1993933618068695, + "learning_rate": 1.6238838352553603e-07, + "loss": 0.001, + "num_input_tokens_seen": 39188064, + "step": 20025 + }, + { + "epoch": 2.654208084824387, + "grad_norm": 0.996856689453125, + "learning_rate": 1.6226533547675726e-07, + "loss": 0.0027, + "num_input_tokens_seen": 39189456, + "step": 20026 + }, + { + "epoch": 2.6543406229290922, + "grad_norm": 1.2736454010009766, + "learning_rate": 1.621423325011151e-07, + "loss": 0.0185, + "num_input_tokens_seen": 39190800, + "step": 20027 + }, + { + "epoch": 2.6544731610337973, + "grad_norm": 18.34808349609375, + "learning_rate": 1.620193746009821e-07, + "loss": 0.2632, + "num_input_tokens_seen": 39192400, + "step": 20028 + }, + { + "epoch": 2.6546056991385023, + "grad_norm": 11.78628158569336, + "learning_rate": 1.6189646177872804e-07, + "loss": 0.24, + "num_input_tokens_seen": 39194800, + "step": 20029 + }, + { + "epoch": 2.6547382372432073, + "grad_norm": 0.11824502795934677, + "learning_rate": 1.617735940367235e-07, + "loss": 0.0008, + "num_input_tokens_seen": 39197024, + "step": 20030 + }, + { + "epoch": 2.6548707753479124, + "grad_norm": 1.2106844186782837, + "learning_rate": 1.6165077137733715e-07, + "loss": 0.0096, + "num_input_tokens_seen": 39199280, + "step": 20031 + }, + { + "epoch": 2.6550033134526174, + "grad_norm": 0.033760473132133484, + "learning_rate": 1.6152799380293683e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39200808, + "step": 20032 + }, + { + "epoch": 2.655135851557323, + "grad_norm": 0.06833388656377792, + "learning_rate": 1.6140526131589035e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39202744, + "step": 20033 + }, + { + "epoch": 2.655268389662028, + "grad_norm": 0.031996142119169235, + "learning_rate": 1.6128257391856362e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39205224, + "step": 20034 + }, + { + "epoch": 2.655400927766733, + "grad_norm": 87.51895141601562, + "learning_rate": 1.6115993161332193e-07, + "loss": 0.1354, + "num_input_tokens_seen": 39207520, + "step": 20035 + }, + { + "epoch": 2.655533465871438, + "grad_norm": 4.961874961853027, + "learning_rate": 1.6103733440253066e-07, + "loss": 0.0359, + "num_input_tokens_seen": 39209240, + "step": 20036 + }, + { + "epoch": 2.655666003976143, + "grad_norm": 0.05917447432875633, + "learning_rate": 1.609147822885526e-07, + "loss": 0.0004, + "num_input_tokens_seen": 39210456, + "step": 20037 + }, + { + "epoch": 2.655798542080848, + "grad_norm": 1.1534103155136108, + "learning_rate": 1.6079227527375175e-07, + "loss": 0.0055, + "num_input_tokens_seen": 39212240, + "step": 20038 + }, + { + "epoch": 2.655931080185553, + "grad_norm": 2.2544357776641846, + "learning_rate": 1.6066981336048947e-07, + "loss": 0.0182, + "num_input_tokens_seen": 39215136, + "step": 20039 + }, + { + "epoch": 2.6560636182902586, + "grad_norm": 1.5119657516479492, + "learning_rate": 1.6054739655112672e-07, + "loss": 0.0063, + "num_input_tokens_seen": 39216776, + "step": 20040 + }, + { + "epoch": 2.6561961563949636, + "grad_norm": 0.021279633045196533, + "learning_rate": 1.6042502484802463e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39218128, + "step": 20041 + }, + { + "epoch": 2.6563286944996687, + "grad_norm": 4.328943252563477, + "learning_rate": 1.6030269825354133e-07, + "loss": 0.0696, + "num_input_tokens_seen": 39219584, + "step": 20042 + }, + { + "epoch": 2.6564612326043737, + "grad_norm": 0.04322853684425354, + "learning_rate": 1.601804167700366e-07, + "loss": 0.0004, + "num_input_tokens_seen": 39221344, + "step": 20043 + }, + { + "epoch": 2.6565937707090788, + "grad_norm": 6.227418422698975, + "learning_rate": 1.600581803998677e-07, + "loss": 0.0517, + "num_input_tokens_seen": 39223648, + "step": 20044 + }, + { + "epoch": 2.656726308813784, + "grad_norm": 49.591705322265625, + "learning_rate": 1.599359891453911e-07, + "loss": 0.1447, + "num_input_tokens_seen": 39226208, + "step": 20045 + }, + { + "epoch": 2.656858846918489, + "grad_norm": 6.170390605926514, + "learning_rate": 1.5981384300896268e-07, + "loss": 0.0707, + "num_input_tokens_seen": 39228200, + "step": 20046 + }, + { + "epoch": 2.6569913850231943, + "grad_norm": 0.04464423283934593, + "learning_rate": 1.5969174199293806e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39230232, + "step": 20047 + }, + { + "epoch": 2.6571239231278994, + "grad_norm": 0.007255627308040857, + "learning_rate": 1.5956968609967094e-07, + "loss": 0.0, + "num_input_tokens_seen": 39231600, + "step": 20048 + }, + { + "epoch": 2.6572564612326044, + "grad_norm": 0.0022484089713543653, + "learning_rate": 1.5944767533151522e-07, + "loss": 0.0, + "num_input_tokens_seen": 39232896, + "step": 20049 + }, + { + "epoch": 2.6573889993373094, + "grad_norm": 23.103734970092773, + "learning_rate": 1.5932570969082296e-07, + "loss": 0.1865, + "num_input_tokens_seen": 39234752, + "step": 20050 + }, + { + "epoch": 2.6575215374420145, + "grad_norm": 0.03214700520038605, + "learning_rate": 1.5920378917994583e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39236856, + "step": 20051 + }, + { + "epoch": 2.65765407554672, + "grad_norm": 0.021375279873609543, + "learning_rate": 1.5908191380123395e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39238544, + "step": 20052 + }, + { + "epoch": 2.6577866136514245, + "grad_norm": 5.277658939361572, + "learning_rate": 1.5896008355703764e-07, + "loss": 0.0318, + "num_input_tokens_seen": 39240864, + "step": 20053 + }, + { + "epoch": 2.65791915175613, + "grad_norm": 11.660393714904785, + "learning_rate": 1.588382984497064e-07, + "loss": 0.2908, + "num_input_tokens_seen": 39242368, + "step": 20054 + }, + { + "epoch": 2.658051689860835, + "grad_norm": 0.002004727255553007, + "learning_rate": 1.587165584815878e-07, + "loss": 0.0, + "num_input_tokens_seen": 39244000, + "step": 20055 + }, + { + "epoch": 2.65818422796554, + "grad_norm": 0.05760011821985245, + "learning_rate": 1.585948636550294e-07, + "loss": 0.0006, + "num_input_tokens_seen": 39246176, + "step": 20056 + }, + { + "epoch": 2.658316766070245, + "grad_norm": 0.01596280187368393, + "learning_rate": 1.5847321397237685e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39247952, + "step": 20057 + }, + { + "epoch": 2.65844930417495, + "grad_norm": 0.008537213318049908, + "learning_rate": 1.5835160943597606e-07, + "loss": 0.0, + "num_input_tokens_seen": 39249896, + "step": 20058 + }, + { + "epoch": 2.6585818422796557, + "grad_norm": 0.33254188299179077, + "learning_rate": 1.5823005004817148e-07, + "loss": 0.0011, + "num_input_tokens_seen": 39251576, + "step": 20059 + }, + { + "epoch": 2.6587143803843603, + "grad_norm": 7.203460693359375, + "learning_rate": 1.581085358113074e-07, + "loss": 0.0325, + "num_input_tokens_seen": 39252744, + "step": 20060 + }, + { + "epoch": 2.6588469184890657, + "grad_norm": 7.312133312225342, + "learning_rate": 1.579870667277264e-07, + "loss": 0.0428, + "num_input_tokens_seen": 39254536, + "step": 20061 + }, + { + "epoch": 2.6589794565937708, + "grad_norm": 0.012560809031128883, + "learning_rate": 1.5786564279977046e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39256592, + "step": 20062 + }, + { + "epoch": 2.659111994698476, + "grad_norm": 6.9692206382751465, + "learning_rate": 1.5774426402978021e-07, + "loss": 0.0631, + "num_input_tokens_seen": 39258216, + "step": 20063 + }, + { + "epoch": 2.659244532803181, + "grad_norm": 3.811007022857666, + "learning_rate": 1.576229304200963e-07, + "loss": 0.0136, + "num_input_tokens_seen": 39260728, + "step": 20064 + }, + { + "epoch": 2.659377070907886, + "grad_norm": 4.833829879760742, + "learning_rate": 1.575016419730588e-07, + "loss": 0.0691, + "num_input_tokens_seen": 39263080, + "step": 20065 + }, + { + "epoch": 2.6595096090125914, + "grad_norm": 0.841262698173523, + "learning_rate": 1.5738039869100552e-07, + "loss": 0.0046, + "num_input_tokens_seen": 39265736, + "step": 20066 + }, + { + "epoch": 2.659642147117296, + "grad_norm": 5.980198383331299, + "learning_rate": 1.5725920057627437e-07, + "loss": 0.097, + "num_input_tokens_seen": 39268448, + "step": 20067 + }, + { + "epoch": 2.6597746852220014, + "grad_norm": 37.981143951416016, + "learning_rate": 1.571380476312015e-07, + "loss": 0.7511, + "num_input_tokens_seen": 39270984, + "step": 20068 + }, + { + "epoch": 2.6599072233267065, + "grad_norm": 6.292543888092041, + "learning_rate": 1.5701693985812365e-07, + "loss": 0.1116, + "num_input_tokens_seen": 39273040, + "step": 20069 + }, + { + "epoch": 2.6600397614314115, + "grad_norm": 0.0018822691636160016, + "learning_rate": 1.568958772593754e-07, + "loss": 0.0, + "num_input_tokens_seen": 39274152, + "step": 20070 + }, + { + "epoch": 2.6601722995361166, + "grad_norm": 2.0688834190368652, + "learning_rate": 1.5677485983729119e-07, + "loss": 0.0071, + "num_input_tokens_seen": 39275664, + "step": 20071 + }, + { + "epoch": 2.6603048376408216, + "grad_norm": 0.001039625145494938, + "learning_rate": 1.5665388759420447e-07, + "loss": 0.0, + "num_input_tokens_seen": 39276824, + "step": 20072 + }, + { + "epoch": 2.660437375745527, + "grad_norm": 0.02869611792266369, + "learning_rate": 1.5653296053244727e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39278672, + "step": 20073 + }, + { + "epoch": 2.6605699138502317, + "grad_norm": 7.082756042480469, + "learning_rate": 1.5641207865435077e-07, + "loss": 0.0675, + "num_input_tokens_seen": 39280544, + "step": 20074 + }, + { + "epoch": 2.660702451954937, + "grad_norm": 0.01785850152373314, + "learning_rate": 1.5629124196224672e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39282352, + "step": 20075 + }, + { + "epoch": 2.660834990059642, + "grad_norm": 6.0198469161987305, + "learning_rate": 1.561704504584638e-07, + "loss": 0.0372, + "num_input_tokens_seen": 39284296, + "step": 20076 + }, + { + "epoch": 2.6609675281643472, + "grad_norm": 0.008046872913837433, + "learning_rate": 1.5604970414533183e-07, + "loss": 0.0, + "num_input_tokens_seen": 39287096, + "step": 20077 + }, + { + "epoch": 2.6611000662690523, + "grad_norm": 0.054896097630262375, + "learning_rate": 1.5592900302517867e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39288112, + "step": 20078 + }, + { + "epoch": 2.6612326043737573, + "grad_norm": 4.530753135681152, + "learning_rate": 1.5580834710033132e-07, + "loss": 0.0738, + "num_input_tokens_seen": 39290848, + "step": 20079 + }, + { + "epoch": 2.661365142478463, + "grad_norm": 7.721060752868652, + "learning_rate": 1.556877363731163e-07, + "loss": 0.0638, + "num_input_tokens_seen": 39292928, + "step": 20080 + }, + { + "epoch": 2.661497680583168, + "grad_norm": 11.560966491699219, + "learning_rate": 1.5556717084585892e-07, + "loss": 0.123, + "num_input_tokens_seen": 39294976, + "step": 20081 + }, + { + "epoch": 2.661630218687873, + "grad_norm": 0.0696026012301445, + "learning_rate": 1.5544665052088376e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39296664, + "step": 20082 + }, + { + "epoch": 2.661762756792578, + "grad_norm": 0.015878230333328247, + "learning_rate": 1.5532617540051476e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39298912, + "step": 20083 + }, + { + "epoch": 2.661895294897283, + "grad_norm": 1.1882286071777344, + "learning_rate": 1.5520574548707422e-07, + "loss": 0.0021, + "num_input_tokens_seen": 39301104, + "step": 20084 + }, + { + "epoch": 2.662027833001988, + "grad_norm": 0.1862114667892456, + "learning_rate": 1.5508536078288478e-07, + "loss": 0.0007, + "num_input_tokens_seen": 39302864, + "step": 20085 + }, + { + "epoch": 2.662160371106693, + "grad_norm": 0.0048501272685825825, + "learning_rate": 1.5496502129026758e-07, + "loss": 0.0, + "num_input_tokens_seen": 39304624, + "step": 20086 + }, + { + "epoch": 2.6622929092113985, + "grad_norm": 0.06142023950815201, + "learning_rate": 1.548447270115422e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39306792, + "step": 20087 + }, + { + "epoch": 2.6624254473161035, + "grad_norm": 11.073729515075684, + "learning_rate": 1.5472447794902811e-07, + "loss": 0.1335, + "num_input_tokens_seen": 39309440, + "step": 20088 + }, + { + "epoch": 2.6625579854208086, + "grad_norm": 3.9331328868865967, + "learning_rate": 1.546042741050441e-07, + "loss": 0.0189, + "num_input_tokens_seen": 39311800, + "step": 20089 + }, + { + "epoch": 2.6626905235255136, + "grad_norm": 10.820836067199707, + "learning_rate": 1.5448411548190795e-07, + "loss": 0.0439, + "num_input_tokens_seen": 39313240, + "step": 20090 + }, + { + "epoch": 2.6628230616302186, + "grad_norm": 2.4432740211486816, + "learning_rate": 1.543640020819359e-07, + "loss": 0.0189, + "num_input_tokens_seen": 39315144, + "step": 20091 + }, + { + "epoch": 2.6629555997349237, + "grad_norm": 0.014453635551035404, + "learning_rate": 1.5424393390744446e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39317112, + "step": 20092 + }, + { + "epoch": 2.6630881378396287, + "grad_norm": 0.008074669167399406, + "learning_rate": 1.5412391096074757e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39319120, + "step": 20093 + }, + { + "epoch": 2.663220675944334, + "grad_norm": 8.264554023742676, + "learning_rate": 1.5400393324416035e-07, + "loss": 0.1257, + "num_input_tokens_seen": 39322072, + "step": 20094 + }, + { + "epoch": 2.6633532140490392, + "grad_norm": 0.02673794887959957, + "learning_rate": 1.5388400075999566e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39323960, + "step": 20095 + }, + { + "epoch": 2.6634857521537443, + "grad_norm": 0.021921608597040176, + "learning_rate": 1.5376411351056635e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39325712, + "step": 20096 + }, + { + "epoch": 2.6636182902584493, + "grad_norm": 0.012615563347935677, + "learning_rate": 1.5364427149818312e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39326856, + "step": 20097 + }, + { + "epoch": 2.6637508283631544, + "grad_norm": 0.03368218615651131, + "learning_rate": 1.5352447472515742e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39329128, + "step": 20098 + }, + { + "epoch": 2.6638833664678594, + "grad_norm": 0.09718035906553268, + "learning_rate": 1.5340472319379795e-07, + "loss": 0.0005, + "num_input_tokens_seen": 39331160, + "step": 20099 + }, + { + "epoch": 2.6640159045725644, + "grad_norm": 0.13043227791786194, + "learning_rate": 1.532850169064143e-07, + "loss": 0.0009, + "num_input_tokens_seen": 39332952, + "step": 20100 + }, + { + "epoch": 2.66414844267727, + "grad_norm": 2.2386772632598877, + "learning_rate": 1.5316535586531483e-07, + "loss": 0.0154, + "num_input_tokens_seen": 39335088, + "step": 20101 + }, + { + "epoch": 2.664280980781975, + "grad_norm": 0.0015371773624792695, + "learning_rate": 1.5304574007280636e-07, + "loss": 0.0, + "num_input_tokens_seen": 39336192, + "step": 20102 + }, + { + "epoch": 2.66441351888668, + "grad_norm": 0.047920484095811844, + "learning_rate": 1.5292616953119483e-07, + "loss": 0.0004, + "num_input_tokens_seen": 39337816, + "step": 20103 + }, + { + "epoch": 2.664546056991385, + "grad_norm": 12.439166069030762, + "learning_rate": 1.5280664424278612e-07, + "loss": 0.0622, + "num_input_tokens_seen": 39340384, + "step": 20104 + }, + { + "epoch": 2.66467859509609, + "grad_norm": 0.24778370559215546, + "learning_rate": 1.5268716420988426e-07, + "loss": 0.0009, + "num_input_tokens_seen": 39342168, + "step": 20105 + }, + { + "epoch": 2.664811133200795, + "grad_norm": 7.261645793914795, + "learning_rate": 1.5256772943479325e-07, + "loss": 0.0372, + "num_input_tokens_seen": 39344040, + "step": 20106 + }, + { + "epoch": 2.6649436713055, + "grad_norm": 0.010847168043255806, + "learning_rate": 1.5244833991981595e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39346056, + "step": 20107 + }, + { + "epoch": 2.6650762094102056, + "grad_norm": 6.037286758422852, + "learning_rate": 1.523289956672544e-07, + "loss": 0.054, + "num_input_tokens_seen": 39348680, + "step": 20108 + }, + { + "epoch": 2.6652087475149107, + "grad_norm": 2.160426616668701, + "learning_rate": 1.5220969667940928e-07, + "loss": 0.0268, + "num_input_tokens_seen": 39350424, + "step": 20109 + }, + { + "epoch": 2.6653412856196157, + "grad_norm": 2.9819531440734863, + "learning_rate": 1.5209044295858038e-07, + "loss": 0.0074, + "num_input_tokens_seen": 39351960, + "step": 20110 + }, + { + "epoch": 2.6654738237243207, + "grad_norm": 2.256443738937378, + "learning_rate": 1.5197123450706786e-07, + "loss": 0.0127, + "num_input_tokens_seen": 39354200, + "step": 20111 + }, + { + "epoch": 2.6656063618290258, + "grad_norm": 0.24322111904621124, + "learning_rate": 1.5185207132716957e-07, + "loss": 0.001, + "num_input_tokens_seen": 39356048, + "step": 20112 + }, + { + "epoch": 2.665738899933731, + "grad_norm": 0.0934867113828659, + "learning_rate": 1.517329534211834e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39357512, + "step": 20113 + }, + { + "epoch": 2.665871438038436, + "grad_norm": 0.02310285158455372, + "learning_rate": 1.516138807914061e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39358864, + "step": 20114 + }, + { + "epoch": 2.6660039761431413, + "grad_norm": 0.023409908637404442, + "learning_rate": 1.5149485344013282e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39360584, + "step": 20115 + }, + { + "epoch": 2.6661365142478464, + "grad_norm": 2.965247631072998, + "learning_rate": 1.513758713696592e-07, + "loss": 0.0225, + "num_input_tokens_seen": 39362496, + "step": 20116 + }, + { + "epoch": 2.6662690523525514, + "grad_norm": 0.01717386208474636, + "learning_rate": 1.5125693458227868e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39364408, + "step": 20117 + }, + { + "epoch": 2.6664015904572564, + "grad_norm": 1.7209481000900269, + "learning_rate": 1.5113804308028502e-07, + "loss": 0.007, + "num_input_tokens_seen": 39366344, + "step": 20118 + }, + { + "epoch": 2.6665341285619615, + "grad_norm": 4.734864234924316, + "learning_rate": 1.510191968659705e-07, + "loss": 0.0473, + "num_input_tokens_seen": 39367672, + "step": 20119 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 20.694290161132812, + "learning_rate": 1.509003959416258e-07, + "loss": 0.1722, + "num_input_tokens_seen": 39370400, + "step": 20120 + }, + { + "epoch": 2.6667992047713716, + "grad_norm": 8.548318862915039, + "learning_rate": 1.5078164030954244e-07, + "loss": 0.0725, + "num_input_tokens_seen": 39373200, + "step": 20121 + }, + { + "epoch": 2.666931742876077, + "grad_norm": 0.075322225689888, + "learning_rate": 1.5066292997201e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39375072, + "step": 20122 + }, + { + "epoch": 2.667064280980782, + "grad_norm": 5.661335468292236, + "learning_rate": 1.505442649313163e-07, + "loss": 0.0442, + "num_input_tokens_seen": 39376968, + "step": 20123 + }, + { + "epoch": 2.667196819085487, + "grad_norm": 0.5694214105606079, + "learning_rate": 1.504256451897504e-07, + "loss": 0.0019, + "num_input_tokens_seen": 39379048, + "step": 20124 + }, + { + "epoch": 2.667329357190192, + "grad_norm": 2.161294937133789, + "learning_rate": 1.503070707495985e-07, + "loss": 0.0085, + "num_input_tokens_seen": 39380848, + "step": 20125 + }, + { + "epoch": 2.667461895294897, + "grad_norm": 0.7247650027275085, + "learning_rate": 1.5018854161314767e-07, + "loss": 0.0033, + "num_input_tokens_seen": 39382856, + "step": 20126 + }, + { + "epoch": 2.6675944333996022, + "grad_norm": 4.434040546417236, + "learning_rate": 1.5007005778268302e-07, + "loss": 0.0301, + "num_input_tokens_seen": 39385216, + "step": 20127 + }, + { + "epoch": 2.6677269715043073, + "grad_norm": 0.012582316994667053, + "learning_rate": 1.4995161926048856e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39386504, + "step": 20128 + }, + { + "epoch": 2.6678595096090127, + "grad_norm": 7.426968097686768, + "learning_rate": 1.4983322604884777e-07, + "loss": 0.0587, + "num_input_tokens_seen": 39387888, + "step": 20129 + }, + { + "epoch": 2.667992047713718, + "grad_norm": 11.776243209838867, + "learning_rate": 1.4971487815004405e-07, + "loss": 0.0874, + "num_input_tokens_seen": 39390048, + "step": 20130 + }, + { + "epoch": 2.668124585818423, + "grad_norm": 2.1628201007843018, + "learning_rate": 1.4959657556635866e-07, + "loss": 0.0272, + "num_input_tokens_seen": 39392488, + "step": 20131 + }, + { + "epoch": 2.668257123923128, + "grad_norm": 0.25032714009284973, + "learning_rate": 1.494783183000728e-07, + "loss": 0.0012, + "num_input_tokens_seen": 39395104, + "step": 20132 + }, + { + "epoch": 2.668389662027833, + "grad_norm": 0.09875693917274475, + "learning_rate": 1.4936010635346666e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39396760, + "step": 20133 + }, + { + "epoch": 2.6685222001325384, + "grad_norm": 0.0052293166518211365, + "learning_rate": 1.4924193972881944e-07, + "loss": 0.0, + "num_input_tokens_seen": 39398128, + "step": 20134 + }, + { + "epoch": 2.668654738237243, + "grad_norm": 21.74353790283203, + "learning_rate": 1.4912381842840884e-07, + "loss": 0.0809, + "num_input_tokens_seen": 39400120, + "step": 20135 + }, + { + "epoch": 2.6687872763419485, + "grad_norm": 9.714179039001465, + "learning_rate": 1.490057424545127e-07, + "loss": 0.1488, + "num_input_tokens_seen": 39402904, + "step": 20136 + }, + { + "epoch": 2.6689198144466535, + "grad_norm": 2.7182931900024414, + "learning_rate": 1.4888771180940815e-07, + "loss": 0.0141, + "num_input_tokens_seen": 39404608, + "step": 20137 + }, + { + "epoch": 2.6690523525513585, + "grad_norm": 6.90073299407959, + "learning_rate": 1.4876972649537053e-07, + "loss": 0.0383, + "num_input_tokens_seen": 39406488, + "step": 20138 + }, + { + "epoch": 2.6691848906560636, + "grad_norm": 1.1990959644317627, + "learning_rate": 1.4865178651467444e-07, + "loss": 0.0203, + "num_input_tokens_seen": 39407992, + "step": 20139 + }, + { + "epoch": 2.6693174287607686, + "grad_norm": 0.1924542784690857, + "learning_rate": 1.4853389186959388e-07, + "loss": 0.0014, + "num_input_tokens_seen": 39409448, + "step": 20140 + }, + { + "epoch": 2.669449966865474, + "grad_norm": 1.3991565704345703, + "learning_rate": 1.4841604256240205e-07, + "loss": 0.006, + "num_input_tokens_seen": 39411000, + "step": 20141 + }, + { + "epoch": 2.6695825049701787, + "grad_norm": 6.912725925445557, + "learning_rate": 1.482982385953713e-07, + "loss": 0.1095, + "num_input_tokens_seen": 39413008, + "step": 20142 + }, + { + "epoch": 2.669715043074884, + "grad_norm": 0.029663022607564926, + "learning_rate": 1.4818047997077312e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39415376, + "step": 20143 + }, + { + "epoch": 2.669847581179589, + "grad_norm": 3.212301015853882, + "learning_rate": 1.4806276669087765e-07, + "loss": 0.0091, + "num_input_tokens_seen": 39417920, + "step": 20144 + }, + { + "epoch": 2.6699801192842942, + "grad_norm": 0.003484267508611083, + "learning_rate": 1.479450987579545e-07, + "loss": 0.0, + "num_input_tokens_seen": 39419040, + "step": 20145 + }, + { + "epoch": 2.6701126573889993, + "grad_norm": 2.7343859672546387, + "learning_rate": 1.4782747617427234e-07, + "loss": 0.0225, + "num_input_tokens_seen": 39421000, + "step": 20146 + }, + { + "epoch": 2.6702451954937043, + "grad_norm": 0.009740045294165611, + "learning_rate": 1.4770989894209886e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39422624, + "step": 20147 + }, + { + "epoch": 2.67037773359841, + "grad_norm": 0.0016459949547424912, + "learning_rate": 1.4759236706370167e-07, + "loss": 0.0, + "num_input_tokens_seen": 39423752, + "step": 20148 + }, + { + "epoch": 2.6705102717031144, + "grad_norm": 0.034503061324357986, + "learning_rate": 1.4747488054134646e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39426056, + "step": 20149 + }, + { + "epoch": 2.67064280980782, + "grad_norm": 0.07384388893842697, + "learning_rate": 1.4735743937729862e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39427912, + "step": 20150 + }, + { + "epoch": 2.670775347912525, + "grad_norm": 12.226487159729004, + "learning_rate": 1.4724004357382192e-07, + "loss": 0.0574, + "num_input_tokens_seen": 39430776, + "step": 20151 + }, + { + "epoch": 2.67090788601723, + "grad_norm": 0.24289873242378235, + "learning_rate": 1.4712269313318067e-07, + "loss": 0.0009, + "num_input_tokens_seen": 39432456, + "step": 20152 + }, + { + "epoch": 2.671040424121935, + "grad_norm": 0.024418234825134277, + "learning_rate": 1.4700538805763664e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39433688, + "step": 20153 + }, + { + "epoch": 2.67117296222664, + "grad_norm": 7.3896965980529785, + "learning_rate": 1.468881283494525e-07, + "loss": 0.0113, + "num_input_tokens_seen": 39435280, + "step": 20154 + }, + { + "epoch": 2.6713055003313455, + "grad_norm": 0.011946018785238266, + "learning_rate": 1.4677091401088833e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39437752, + "step": 20155 + }, + { + "epoch": 2.67143803843605, + "grad_norm": 0.18004576861858368, + "learning_rate": 1.4665374504420404e-07, + "loss": 0.0004, + "num_input_tokens_seen": 39440184, + "step": 20156 + }, + { + "epoch": 2.6715705765407556, + "grad_norm": 0.008589798584580421, + "learning_rate": 1.4653662145165947e-07, + "loss": 0.0, + "num_input_tokens_seen": 39441760, + "step": 20157 + }, + { + "epoch": 2.6717031146454606, + "grad_norm": 4.811905384063721, + "learning_rate": 1.4641954323551226e-07, + "loss": 0.0368, + "num_input_tokens_seen": 39443776, + "step": 20158 + }, + { + "epoch": 2.6718356527501657, + "grad_norm": 0.022709475830197334, + "learning_rate": 1.4630251039801978e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39446088, + "step": 20159 + }, + { + "epoch": 2.6719681908548707, + "grad_norm": 2.7338054180145264, + "learning_rate": 1.4618552294143884e-07, + "loss": 0.0651, + "num_input_tokens_seen": 39447936, + "step": 20160 + }, + { + "epoch": 2.6721007289595757, + "grad_norm": 8.300863265991211, + "learning_rate": 1.4606858086802483e-07, + "loss": 0.0715, + "num_input_tokens_seen": 39450656, + "step": 20161 + }, + { + "epoch": 2.672233267064281, + "grad_norm": 10.302836418151855, + "learning_rate": 1.4595168418003235e-07, + "loss": 0.1274, + "num_input_tokens_seen": 39452368, + "step": 20162 + }, + { + "epoch": 2.672365805168986, + "grad_norm": 6.975489139556885, + "learning_rate": 1.4583483287971546e-07, + "loss": 0.1961, + "num_input_tokens_seen": 39453488, + "step": 20163 + }, + { + "epoch": 2.6724983432736913, + "grad_norm": 0.018723439425230026, + "learning_rate": 1.457180269693273e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39455288, + "step": 20164 + }, + { + "epoch": 2.6726308813783963, + "grad_norm": 6.84435510635376, + "learning_rate": 1.4560126645111915e-07, + "loss": 0.0278, + "num_input_tokens_seen": 39457544, + "step": 20165 + }, + { + "epoch": 2.6727634194831014, + "grad_norm": 0.06169802322983742, + "learning_rate": 1.4548455132734313e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39460576, + "step": 20166 + }, + { + "epoch": 2.6728959575878064, + "grad_norm": 0.0019228770397603512, + "learning_rate": 1.4536788160024879e-07, + "loss": 0.0, + "num_input_tokens_seen": 39461872, + "step": 20167 + }, + { + "epoch": 2.6730284956925114, + "grad_norm": 2.4049911499023438, + "learning_rate": 1.4525125727208682e-07, + "loss": 0.0337, + "num_input_tokens_seen": 39463360, + "step": 20168 + }, + { + "epoch": 2.673161033797217, + "grad_norm": 4.048755168914795, + "learning_rate": 1.4513467834510463e-07, + "loss": 0.0451, + "num_input_tokens_seen": 39465192, + "step": 20169 + }, + { + "epoch": 2.673293571901922, + "grad_norm": 0.01354187447577715, + "learning_rate": 1.450181448215504e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39467872, + "step": 20170 + }, + { + "epoch": 2.673426110006627, + "grad_norm": 0.01715157739818096, + "learning_rate": 1.4490165670367122e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39469712, + "step": 20171 + }, + { + "epoch": 2.673558648111332, + "grad_norm": 2.8186209201812744, + "learning_rate": 1.4478521399371225e-07, + "loss": 0.0193, + "num_input_tokens_seen": 39471752, + "step": 20172 + }, + { + "epoch": 2.673691186216037, + "grad_norm": 5.104423999786377, + "learning_rate": 1.4466881669391976e-07, + "loss": 0.0245, + "num_input_tokens_seen": 39473424, + "step": 20173 + }, + { + "epoch": 2.673823724320742, + "grad_norm": 0.2736017405986786, + "learning_rate": 1.4455246480653717e-07, + "loss": 0.0011, + "num_input_tokens_seen": 39476656, + "step": 20174 + }, + { + "epoch": 2.673956262425447, + "grad_norm": 0.4044327139854431, + "learning_rate": 1.4443615833380804e-07, + "loss": 0.0018, + "num_input_tokens_seen": 39479184, + "step": 20175 + }, + { + "epoch": 2.6740888005301526, + "grad_norm": 10.494331359863281, + "learning_rate": 1.443198972779744e-07, + "loss": 0.2165, + "num_input_tokens_seen": 39480848, + "step": 20176 + }, + { + "epoch": 2.6742213386348577, + "grad_norm": 0.007487769704312086, + "learning_rate": 1.4420368164127813e-07, + "loss": 0.0, + "num_input_tokens_seen": 39482896, + "step": 20177 + }, + { + "epoch": 2.6743538767395627, + "grad_norm": 0.10096654295921326, + "learning_rate": 1.4408751142596073e-07, + "loss": 0.0005, + "num_input_tokens_seen": 39484792, + "step": 20178 + }, + { + "epoch": 2.6744864148442677, + "grad_norm": 10.122913360595703, + "learning_rate": 1.4397138663426125e-07, + "loss": 0.1066, + "num_input_tokens_seen": 39486864, + "step": 20179 + }, + { + "epoch": 2.674618952948973, + "grad_norm": 0.0019637218210846186, + "learning_rate": 1.4385530726841872e-07, + "loss": 0.0, + "num_input_tokens_seen": 39488696, + "step": 20180 + }, + { + "epoch": 2.674751491053678, + "grad_norm": 0.20172810554504395, + "learning_rate": 1.4373927333067113e-07, + "loss": 0.0014, + "num_input_tokens_seen": 39490560, + "step": 20181 + }, + { + "epoch": 2.674884029158383, + "grad_norm": 0.040239885449409485, + "learning_rate": 1.4362328482325577e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39493744, + "step": 20182 + }, + { + "epoch": 2.6750165672630883, + "grad_norm": 5.654191017150879, + "learning_rate": 1.4350734174840897e-07, + "loss": 0.0254, + "num_input_tokens_seen": 39495888, + "step": 20183 + }, + { + "epoch": 2.6751491053677934, + "grad_norm": 4.878161907196045, + "learning_rate": 1.4339144410836642e-07, + "loss": 0.0609, + "num_input_tokens_seen": 39498464, + "step": 20184 + }, + { + "epoch": 2.6752816434724984, + "grad_norm": 0.1461607962846756, + "learning_rate": 1.43275591905363e-07, + "loss": 0.0009, + "num_input_tokens_seen": 39500296, + "step": 20185 + }, + { + "epoch": 2.6754141815772035, + "grad_norm": 2.244842052459717, + "learning_rate": 1.4315978514163164e-07, + "loss": 0.0084, + "num_input_tokens_seen": 39502688, + "step": 20186 + }, + { + "epoch": 2.6755467196819085, + "grad_norm": 0.9511129856109619, + "learning_rate": 1.4304402381940556e-07, + "loss": 0.006, + "num_input_tokens_seen": 39504160, + "step": 20187 + }, + { + "epoch": 2.6756792577866135, + "grad_norm": 0.010936187580227852, + "learning_rate": 1.4292830794091662e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39505400, + "step": 20188 + }, + { + "epoch": 2.6758117958913186, + "grad_norm": 5.239866733551025, + "learning_rate": 1.4281263750839548e-07, + "loss": 0.0753, + "num_input_tokens_seen": 39507408, + "step": 20189 + }, + { + "epoch": 2.675944333996024, + "grad_norm": 24.428874969482422, + "learning_rate": 1.426970125240737e-07, + "loss": 0.1246, + "num_input_tokens_seen": 39509224, + "step": 20190 + }, + { + "epoch": 2.676076872100729, + "grad_norm": 0.005128705408424139, + "learning_rate": 1.4258143299017924e-07, + "loss": 0.0, + "num_input_tokens_seen": 39510896, + "step": 20191 + }, + { + "epoch": 2.676209410205434, + "grad_norm": 0.014148305170238018, + "learning_rate": 1.4246589890894142e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39512632, + "step": 20192 + }, + { + "epoch": 2.676341948310139, + "grad_norm": 0.056967880576848984, + "learning_rate": 1.4235041028258706e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39514264, + "step": 20193 + }, + { + "epoch": 2.676474486414844, + "grad_norm": 0.09654107689857483, + "learning_rate": 1.42234967113343e-07, + "loss": 0.0004, + "num_input_tokens_seen": 39516288, + "step": 20194 + }, + { + "epoch": 2.6766070245195492, + "grad_norm": 6.095205783843994, + "learning_rate": 1.421195694034358e-07, + "loss": 0.0808, + "num_input_tokens_seen": 39518584, + "step": 20195 + }, + { + "epoch": 2.6767395626242543, + "grad_norm": 3.4731605052948, + "learning_rate": 1.4200421715509006e-07, + "loss": 0.0477, + "num_input_tokens_seen": 39520256, + "step": 20196 + }, + { + "epoch": 2.6768721007289598, + "grad_norm": 3.3745572566986084, + "learning_rate": 1.4188891037052953e-07, + "loss": 0.0304, + "num_input_tokens_seen": 39522408, + "step": 20197 + }, + { + "epoch": 2.677004638833665, + "grad_norm": 3.6716883182525635, + "learning_rate": 1.417736490519772e-07, + "loss": 0.0136, + "num_input_tokens_seen": 39524584, + "step": 20198 + }, + { + "epoch": 2.67713717693837, + "grad_norm": 0.1443951576948166, + "learning_rate": 1.4165843320165628e-07, + "loss": 0.0005, + "num_input_tokens_seen": 39525952, + "step": 20199 + }, + { + "epoch": 2.677269715043075, + "grad_norm": 1.744572401046753, + "learning_rate": 1.4154326282178748e-07, + "loss": 0.0124, + "num_input_tokens_seen": 39527408, + "step": 20200 + }, + { + "epoch": 2.67740225314778, + "grad_norm": 1.2068439722061157, + "learning_rate": 1.4142813791459154e-07, + "loss": 0.0037, + "num_input_tokens_seen": 39529424, + "step": 20201 + }, + { + "epoch": 2.677534791252485, + "grad_norm": 0.003543563885614276, + "learning_rate": 1.413130584822886e-07, + "loss": 0.0, + "num_input_tokens_seen": 39531352, + "step": 20202 + }, + { + "epoch": 2.67766732935719, + "grad_norm": 0.08373633027076721, + "learning_rate": 1.4119802452709635e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39532920, + "step": 20203 + }, + { + "epoch": 2.6777998674618955, + "grad_norm": 0.04483358561992645, + "learning_rate": 1.4108303605123413e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39534536, + "step": 20204 + }, + { + "epoch": 2.6779324055666005, + "grad_norm": 4.8608903884887695, + "learning_rate": 1.4096809305691822e-07, + "loss": 0.084, + "num_input_tokens_seen": 39535896, + "step": 20205 + }, + { + "epoch": 2.6780649436713055, + "grad_norm": 0.01978892646729946, + "learning_rate": 1.4085319554636435e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39537560, + "step": 20206 + }, + { + "epoch": 2.6781974817760106, + "grad_norm": 0.4883391857147217, + "learning_rate": 1.4073834352178879e-07, + "loss": 0.0033, + "num_input_tokens_seen": 39538976, + "step": 20207 + }, + { + "epoch": 2.6783300198807156, + "grad_norm": 7.455320835113525, + "learning_rate": 1.4062353698540532e-07, + "loss": 0.1018, + "num_input_tokens_seen": 39541624, + "step": 20208 + }, + { + "epoch": 2.6784625579854207, + "grad_norm": 1.334483027458191, + "learning_rate": 1.4050877593942802e-07, + "loss": 0.0051, + "num_input_tokens_seen": 39543728, + "step": 20209 + }, + { + "epoch": 2.6785950960901257, + "grad_norm": 2.351060628890991, + "learning_rate": 1.40394060386069e-07, + "loss": 0.0168, + "num_input_tokens_seen": 39546000, + "step": 20210 + }, + { + "epoch": 2.678727634194831, + "grad_norm": 0.007957668974995613, + "learning_rate": 1.4027939032754067e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39547480, + "step": 20211 + }, + { + "epoch": 2.678860172299536, + "grad_norm": 0.008340725675225258, + "learning_rate": 1.4016476576605292e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39549320, + "step": 20212 + }, + { + "epoch": 2.6789927104042413, + "grad_norm": 2.645320415496826, + "learning_rate": 1.4005018670381675e-07, + "loss": 0.0143, + "num_input_tokens_seen": 39551960, + "step": 20213 + }, + { + "epoch": 2.6791252485089463, + "grad_norm": 0.0015474415849894285, + "learning_rate": 1.3993565314304126e-07, + "loss": 0.0, + "num_input_tokens_seen": 39553544, + "step": 20214 + }, + { + "epoch": 2.6792577866136513, + "grad_norm": 0.00988653115928173, + "learning_rate": 1.3982116508593436e-07, + "loss": 0.0, + "num_input_tokens_seen": 39555360, + "step": 20215 + }, + { + "epoch": 2.6793903247183564, + "grad_norm": 0.06097639724612236, + "learning_rate": 1.3970672253470375e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39556448, + "step": 20216 + }, + { + "epoch": 2.6795228628230614, + "grad_norm": 0.02217528410255909, + "learning_rate": 1.395923254915557e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39558096, + "step": 20217 + }, + { + "epoch": 2.679655400927767, + "grad_norm": 7.531145095825195, + "learning_rate": 1.3947797395869572e-07, + "loss": 0.075, + "num_input_tokens_seen": 39559928, + "step": 20218 + }, + { + "epoch": 2.679787939032472, + "grad_norm": 7.074980735778809, + "learning_rate": 1.3936366793832895e-07, + "loss": 0.0872, + "num_input_tokens_seen": 39561984, + "step": 20219 + }, + { + "epoch": 2.679920477137177, + "grad_norm": 2.1431424617767334, + "learning_rate": 1.392494074326592e-07, + "loss": 0.0281, + "num_input_tokens_seen": 39563736, + "step": 20220 + }, + { + "epoch": 2.680053015241882, + "grad_norm": 1.197313666343689, + "learning_rate": 1.3913519244388967e-07, + "loss": 0.0051, + "num_input_tokens_seen": 39565968, + "step": 20221 + }, + { + "epoch": 2.680185553346587, + "grad_norm": 0.022982394322752953, + "learning_rate": 1.3902102297422255e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39567864, + "step": 20222 + }, + { + "epoch": 2.6803180914512925, + "grad_norm": 2.231358766555786, + "learning_rate": 1.3890689902585824e-07, + "loss": 0.0182, + "num_input_tokens_seen": 39569672, + "step": 20223 + }, + { + "epoch": 2.680450629555997, + "grad_norm": 11.15102481842041, + "learning_rate": 1.3879282060099754e-07, + "loss": 0.1709, + "num_input_tokens_seen": 39573152, + "step": 20224 + }, + { + "epoch": 2.6805831676607026, + "grad_norm": 1.9883564710617065, + "learning_rate": 1.3867878770184085e-07, + "loss": 0.013, + "num_input_tokens_seen": 39576064, + "step": 20225 + }, + { + "epoch": 2.6807157057654076, + "grad_norm": 0.019533313810825348, + "learning_rate": 1.3856480033058617e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39577584, + "step": 20226 + }, + { + "epoch": 2.6808482438701127, + "grad_norm": 0.006272313185036182, + "learning_rate": 1.384508584894309e-07, + "loss": 0.0, + "num_input_tokens_seen": 39579152, + "step": 20227 + }, + { + "epoch": 2.6809807819748177, + "grad_norm": 8.300698280334473, + "learning_rate": 1.3833696218057247e-07, + "loss": 0.0389, + "num_input_tokens_seen": 39581432, + "step": 20228 + }, + { + "epoch": 2.6811133200795227, + "grad_norm": 0.015048502013087273, + "learning_rate": 1.382231114062063e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39583272, + "step": 20229 + }, + { + "epoch": 2.6812458581842282, + "grad_norm": 13.196106910705566, + "learning_rate": 1.3810930616852764e-07, + "loss": 0.1391, + "num_input_tokens_seen": 39586312, + "step": 20230 + }, + { + "epoch": 2.681378396288933, + "grad_norm": 0.7480204105377197, + "learning_rate": 1.3799554646973163e-07, + "loss": 0.0022, + "num_input_tokens_seen": 39588032, + "step": 20231 + }, + { + "epoch": 2.6815109343936383, + "grad_norm": 7.550303936004639, + "learning_rate": 1.378818323120107e-07, + "loss": 0.0949, + "num_input_tokens_seen": 39589624, + "step": 20232 + }, + { + "epoch": 2.6816434724983433, + "grad_norm": 4.663870334625244, + "learning_rate": 1.3776816369755752e-07, + "loss": 0.0281, + "num_input_tokens_seen": 39591240, + "step": 20233 + }, + { + "epoch": 2.6817760106030484, + "grad_norm": 10.037118911743164, + "learning_rate": 1.3765454062856343e-07, + "loss": 0.0577, + "num_input_tokens_seen": 39593680, + "step": 20234 + }, + { + "epoch": 2.6819085487077534, + "grad_norm": 0.0015440036077052355, + "learning_rate": 1.3754096310721997e-07, + "loss": 0.0, + "num_input_tokens_seen": 39594800, + "step": 20235 + }, + { + "epoch": 2.6820410868124585, + "grad_norm": 0.007052072323858738, + "learning_rate": 1.3742743113571628e-07, + "loss": 0.0, + "num_input_tokens_seen": 39595816, + "step": 20236 + }, + { + "epoch": 2.682173624917164, + "grad_norm": 3.039276599884033, + "learning_rate": 1.3731394471624165e-07, + "loss": 0.0226, + "num_input_tokens_seen": 39598936, + "step": 20237 + }, + { + "epoch": 2.6823061630218685, + "grad_norm": 0.01326199434697628, + "learning_rate": 1.372005038509844e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39600696, + "step": 20238 + }, + { + "epoch": 2.682438701126574, + "grad_norm": 0.032343730330467224, + "learning_rate": 1.3708710854213074e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39602184, + "step": 20239 + }, + { + "epoch": 2.682571239231279, + "grad_norm": 21.16706085205078, + "learning_rate": 1.3697375879186846e-07, + "loss": 0.1707, + "num_input_tokens_seen": 39604720, + "step": 20240 + }, + { + "epoch": 2.682703777335984, + "grad_norm": 0.11128988116979599, + "learning_rate": 1.368604546023819e-07, + "loss": 0.0006, + "num_input_tokens_seen": 39607104, + "step": 20241 + }, + { + "epoch": 2.682836315440689, + "grad_norm": 4.010493278503418, + "learning_rate": 1.3674719597585596e-07, + "loss": 0.0676, + "num_input_tokens_seen": 39610424, + "step": 20242 + }, + { + "epoch": 2.682968853545394, + "grad_norm": 0.05256155878305435, + "learning_rate": 1.3663398291447444e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39611496, + "step": 20243 + }, + { + "epoch": 2.6831013916500996, + "grad_norm": 5.813283443450928, + "learning_rate": 1.3652081542042033e-07, + "loss": 0.0905, + "num_input_tokens_seen": 39614264, + "step": 20244 + }, + { + "epoch": 2.6832339297548042, + "grad_norm": 0.2944090664386749, + "learning_rate": 1.3640769349587523e-07, + "loss": 0.0012, + "num_input_tokens_seen": 39616872, + "step": 20245 + }, + { + "epoch": 2.6833664678595097, + "grad_norm": 0.05702001601457596, + "learning_rate": 1.3629461714302046e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39619592, + "step": 20246 + }, + { + "epoch": 2.6834990059642148, + "grad_norm": 0.014209458604454994, + "learning_rate": 1.361815863640359e-07, + "loss": 0.0, + "num_input_tokens_seen": 39620832, + "step": 20247 + }, + { + "epoch": 2.68363154406892, + "grad_norm": 6.479780197143555, + "learning_rate": 1.360686011611012e-07, + "loss": 0.1706, + "num_input_tokens_seen": 39623872, + "step": 20248 + }, + { + "epoch": 2.683764082173625, + "grad_norm": 3.0763556957244873, + "learning_rate": 1.3595566153639493e-07, + "loss": 0.038, + "num_input_tokens_seen": 39625576, + "step": 20249 + }, + { + "epoch": 2.68389662027833, + "grad_norm": 2.2298595905303955, + "learning_rate": 1.3584276749209367e-07, + "loss": 0.0416, + "num_input_tokens_seen": 39628080, + "step": 20250 + }, + { + "epoch": 2.6840291583830354, + "grad_norm": 0.1068248525261879, + "learning_rate": 1.357299190303754e-07, + "loss": 0.0004, + "num_input_tokens_seen": 39630440, + "step": 20251 + }, + { + "epoch": 2.6841616964877404, + "grad_norm": 20.213184356689453, + "learning_rate": 1.3561711615341534e-07, + "loss": 0.4817, + "num_input_tokens_seen": 39633128, + "step": 20252 + }, + { + "epoch": 2.6842942345924454, + "grad_norm": 10.266946792602539, + "learning_rate": 1.355043588633881e-07, + "loss": 0.0793, + "num_input_tokens_seen": 39635456, + "step": 20253 + }, + { + "epoch": 2.6844267726971505, + "grad_norm": 0.09329124540090561, + "learning_rate": 1.3539164716246806e-07, + "loss": 0.0006, + "num_input_tokens_seen": 39637432, + "step": 20254 + }, + { + "epoch": 2.6845593108018555, + "grad_norm": 0.12536819279193878, + "learning_rate": 1.3527898105282821e-07, + "loss": 0.0005, + "num_input_tokens_seen": 39638616, + "step": 20255 + }, + { + "epoch": 2.6846918489065605, + "grad_norm": 0.00541206868365407, + "learning_rate": 1.351663605366413e-07, + "loss": 0.0, + "num_input_tokens_seen": 39641168, + "step": 20256 + }, + { + "epoch": 2.6848243870112656, + "grad_norm": 9.480112075805664, + "learning_rate": 1.350537856160783e-07, + "loss": 0.3359, + "num_input_tokens_seen": 39643232, + "step": 20257 + }, + { + "epoch": 2.684956925115971, + "grad_norm": 3.5848605632781982, + "learning_rate": 1.3494125629330974e-07, + "loss": 0.0268, + "num_input_tokens_seen": 39645160, + "step": 20258 + }, + { + "epoch": 2.685089463220676, + "grad_norm": 0.03447338566184044, + "learning_rate": 1.3482877257050498e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39647520, + "step": 20259 + }, + { + "epoch": 2.685222001325381, + "grad_norm": 0.03422676771879196, + "learning_rate": 1.3471633444983312e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39650368, + "step": 20260 + }, + { + "epoch": 2.685354539430086, + "grad_norm": 7.019379615783691, + "learning_rate": 1.346039419334627e-07, + "loss": 0.111, + "num_input_tokens_seen": 39653208, + "step": 20261 + }, + { + "epoch": 2.685487077534791, + "grad_norm": 1.5970743894577026, + "learning_rate": 1.344915950235598e-07, + "loss": 0.0041, + "num_input_tokens_seen": 39656200, + "step": 20262 + }, + { + "epoch": 2.6856196156394962, + "grad_norm": 0.007372781168669462, + "learning_rate": 1.3437929372229098e-07, + "loss": 0.0, + "num_input_tokens_seen": 39658296, + "step": 20263 + }, + { + "epoch": 2.6857521537442013, + "grad_norm": 7.088783264160156, + "learning_rate": 1.3426703803182117e-07, + "loss": 0.0953, + "num_input_tokens_seen": 39660624, + "step": 20264 + }, + { + "epoch": 2.6858846918489068, + "grad_norm": 8.35385799407959, + "learning_rate": 1.341548279543145e-07, + "loss": 0.1057, + "num_input_tokens_seen": 39663952, + "step": 20265 + }, + { + "epoch": 2.686017229953612, + "grad_norm": 5.946142196655273, + "learning_rate": 1.3404266349193506e-07, + "loss": 0.0386, + "num_input_tokens_seen": 39666880, + "step": 20266 + }, + { + "epoch": 2.686149768058317, + "grad_norm": 0.013135652057826519, + "learning_rate": 1.3393054464684557e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39668992, + "step": 20267 + }, + { + "epoch": 2.686282306163022, + "grad_norm": 3.1865711212158203, + "learning_rate": 1.3381847142120736e-07, + "loss": 0.0272, + "num_input_tokens_seen": 39671224, + "step": 20268 + }, + { + "epoch": 2.686414844267727, + "grad_norm": 0.03475270792841911, + "learning_rate": 1.3370644381718145e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39672696, + "step": 20269 + }, + { + "epoch": 2.686547382372432, + "grad_norm": 0.36105677485466003, + "learning_rate": 1.3359446183692726e-07, + "loss": 0.002, + "num_input_tokens_seen": 39674256, + "step": 20270 + }, + { + "epoch": 2.686679920477137, + "grad_norm": 0.015268632210791111, + "learning_rate": 1.33482525482605e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39675864, + "step": 20271 + }, + { + "epoch": 2.6868124585818425, + "grad_norm": 0.5047983527183533, + "learning_rate": 1.3337063475637152e-07, + "loss": 0.001, + "num_input_tokens_seen": 39677256, + "step": 20272 + }, + { + "epoch": 2.6869449966865475, + "grad_norm": 7.422905445098877, + "learning_rate": 1.3325878966038542e-07, + "loss": 0.2324, + "num_input_tokens_seen": 39680080, + "step": 20273 + }, + { + "epoch": 2.6870775347912526, + "grad_norm": 9.941550254821777, + "learning_rate": 1.3314699019680245e-07, + "loss": 0.1157, + "num_input_tokens_seen": 39682568, + "step": 20274 + }, + { + "epoch": 2.6872100728959576, + "grad_norm": 0.011494150385260582, + "learning_rate": 1.3303523636777838e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39684024, + "step": 20275 + }, + { + "epoch": 2.6873426110006626, + "grad_norm": 5.960836887359619, + "learning_rate": 1.329235281754676e-07, + "loss": 0.0722, + "num_input_tokens_seen": 39686240, + "step": 20276 + }, + { + "epoch": 2.6874751491053677, + "grad_norm": 1.843982458114624, + "learning_rate": 1.3281186562202396e-07, + "loss": 0.0061, + "num_input_tokens_seen": 39688656, + "step": 20277 + }, + { + "epoch": 2.6876076872100727, + "grad_norm": 0.024961229413747787, + "learning_rate": 1.3270024870960102e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39690328, + "step": 20278 + }, + { + "epoch": 2.687740225314778, + "grad_norm": 2.6101198196411133, + "learning_rate": 1.3258867744035037e-07, + "loss": 0.0412, + "num_input_tokens_seen": 39692704, + "step": 20279 + }, + { + "epoch": 2.6878727634194832, + "grad_norm": 16.861854553222656, + "learning_rate": 1.3247715181642334e-07, + "loss": 0.0859, + "num_input_tokens_seen": 39694600, + "step": 20280 + }, + { + "epoch": 2.6880053015241883, + "grad_norm": 11.09103012084961, + "learning_rate": 1.3236567183996963e-07, + "loss": 0.1181, + "num_input_tokens_seen": 39696768, + "step": 20281 + }, + { + "epoch": 2.6881378396288933, + "grad_norm": 0.32551276683807373, + "learning_rate": 1.3225423751313942e-07, + "loss": 0.0016, + "num_input_tokens_seen": 39698736, + "step": 20282 + }, + { + "epoch": 2.6882703777335983, + "grad_norm": 0.04987425357103348, + "learning_rate": 1.3214284883808076e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39700488, + "step": 20283 + }, + { + "epoch": 2.6884029158383034, + "grad_norm": 2.375267505645752, + "learning_rate": 1.3203150581694163e-07, + "loss": 0.0178, + "num_input_tokens_seen": 39702520, + "step": 20284 + }, + { + "epoch": 2.6885354539430084, + "grad_norm": 0.01946786791086197, + "learning_rate": 1.3192020845186864e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39704864, + "step": 20285 + }, + { + "epoch": 2.688667992047714, + "grad_norm": 14.088971138000488, + "learning_rate": 1.318089567450076e-07, + "loss": 0.1571, + "num_input_tokens_seen": 39707680, + "step": 20286 + }, + { + "epoch": 2.688800530152419, + "grad_norm": 1.4880197048187256, + "learning_rate": 1.3169775069850372e-07, + "loss": 0.0088, + "num_input_tokens_seen": 39709272, + "step": 20287 + }, + { + "epoch": 2.688933068257124, + "grad_norm": 0.015204011462628841, + "learning_rate": 1.315865903145011e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39710888, + "step": 20288 + }, + { + "epoch": 2.689065606361829, + "grad_norm": 0.00498259486630559, + "learning_rate": 1.314754755951425e-07, + "loss": 0.0, + "num_input_tokens_seen": 39713624, + "step": 20289 + }, + { + "epoch": 2.689198144466534, + "grad_norm": 0.3188038170337677, + "learning_rate": 1.3136440654257122e-07, + "loss": 0.0015, + "num_input_tokens_seen": 39715424, + "step": 20290 + }, + { + "epoch": 2.689330682571239, + "grad_norm": 4.13944673538208, + "learning_rate": 1.312533831589277e-07, + "loss": 0.0391, + "num_input_tokens_seen": 39716944, + "step": 20291 + }, + { + "epoch": 2.689463220675944, + "grad_norm": 0.002933989744633436, + "learning_rate": 1.3114240544635337e-07, + "loss": 0.0, + "num_input_tokens_seen": 39718232, + "step": 20292 + }, + { + "epoch": 2.6895957587806496, + "grad_norm": 3.713808059692383, + "learning_rate": 1.3103147340698758e-07, + "loss": 0.0106, + "num_input_tokens_seen": 39720176, + "step": 20293 + }, + { + "epoch": 2.6897282968853546, + "grad_norm": 0.14484186470508575, + "learning_rate": 1.3092058704296945e-07, + "loss": 0.0006, + "num_input_tokens_seen": 39721360, + "step": 20294 + }, + { + "epoch": 2.6898608349900597, + "grad_norm": 0.0018538759322836995, + "learning_rate": 1.3080974635643646e-07, + "loss": 0.0, + "num_input_tokens_seen": 39723048, + "step": 20295 + }, + { + "epoch": 2.6899933730947647, + "grad_norm": 19.220539093017578, + "learning_rate": 1.3069895134952577e-07, + "loss": 0.2743, + "num_input_tokens_seen": 39725920, + "step": 20296 + }, + { + "epoch": 2.6901259111994698, + "grad_norm": 0.016886474564671516, + "learning_rate": 1.3058820202437427e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39727216, + "step": 20297 + }, + { + "epoch": 2.690258449304175, + "grad_norm": 7.439931392669678, + "learning_rate": 1.3047749838311667e-07, + "loss": 0.087, + "num_input_tokens_seen": 39729520, + "step": 20298 + }, + { + "epoch": 2.69039098740888, + "grad_norm": 1.7826076745986938, + "learning_rate": 1.3036684042788766e-07, + "loss": 0.0044, + "num_input_tokens_seen": 39731216, + "step": 20299 + }, + { + "epoch": 2.6905235255135853, + "grad_norm": 0.004218373913317919, + "learning_rate": 1.302562281608205e-07, + "loss": 0.0, + "num_input_tokens_seen": 39733704, + "step": 20300 + }, + { + "epoch": 2.6906560636182904, + "grad_norm": 0.10931126773357391, + "learning_rate": 1.3014566158404824e-07, + "loss": 0.0006, + "num_input_tokens_seen": 39735264, + "step": 20301 + }, + { + "epoch": 2.6907886017229954, + "grad_norm": 2.7251193523406982, + "learning_rate": 1.300351406997022e-07, + "loss": 0.0122, + "num_input_tokens_seen": 39737520, + "step": 20302 + }, + { + "epoch": 2.6909211398277004, + "grad_norm": 0.7056270241737366, + "learning_rate": 1.2992466550991372e-07, + "loss": 0.0043, + "num_input_tokens_seen": 39738840, + "step": 20303 + }, + { + "epoch": 2.6910536779324055, + "grad_norm": 9.411016464233398, + "learning_rate": 1.2981423601681308e-07, + "loss": 0.0859, + "num_input_tokens_seen": 39740784, + "step": 20304 + }, + { + "epoch": 2.691186216037111, + "grad_norm": 0.022363554686307907, + "learning_rate": 1.2970385222252884e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39742792, + "step": 20305 + }, + { + "epoch": 2.6913187541418155, + "grad_norm": 0.0990038514137268, + "learning_rate": 1.295935141291893e-07, + "loss": 0.0006, + "num_input_tokens_seen": 39744568, + "step": 20306 + }, + { + "epoch": 2.691451292246521, + "grad_norm": 5.8893914222717285, + "learning_rate": 1.2948322173892193e-07, + "loss": 0.0662, + "num_input_tokens_seen": 39746648, + "step": 20307 + }, + { + "epoch": 2.691583830351226, + "grad_norm": 33.46512222290039, + "learning_rate": 1.2937297505385387e-07, + "loss": 0.1409, + "num_input_tokens_seen": 39749568, + "step": 20308 + }, + { + "epoch": 2.691716368455931, + "grad_norm": 0.0046052513644099236, + "learning_rate": 1.2926277407611016e-07, + "loss": 0.0, + "num_input_tokens_seen": 39751192, + "step": 20309 + }, + { + "epoch": 2.691848906560636, + "grad_norm": 7.8816022872924805, + "learning_rate": 1.291526188078157e-07, + "loss": 0.0758, + "num_input_tokens_seen": 39752624, + "step": 20310 + }, + { + "epoch": 2.691981444665341, + "grad_norm": 0.2195933610200882, + "learning_rate": 1.290425092510944e-07, + "loss": 0.0009, + "num_input_tokens_seen": 39754816, + "step": 20311 + }, + { + "epoch": 2.6921139827700467, + "grad_norm": 0.024156082421541214, + "learning_rate": 1.2893244540806872e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39756584, + "step": 20312 + }, + { + "epoch": 2.6922465208747512, + "grad_norm": 0.6879380345344543, + "learning_rate": 1.288224272808611e-07, + "loss": 0.01, + "num_input_tokens_seen": 39758992, + "step": 20313 + }, + { + "epoch": 2.6923790589794567, + "grad_norm": 0.022204674780368805, + "learning_rate": 1.2871245487159346e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39760720, + "step": 20314 + }, + { + "epoch": 2.6925115970841618, + "grad_norm": 0.0149612445384264, + "learning_rate": 1.286025281823852e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39762088, + "step": 20315 + }, + { + "epoch": 2.692644135188867, + "grad_norm": 7.528134346008301, + "learning_rate": 1.2849264721535638e-07, + "loss": 0.0646, + "num_input_tokens_seen": 39763896, + "step": 20316 + }, + { + "epoch": 2.692776673293572, + "grad_norm": 0.03728026524186134, + "learning_rate": 1.2838281197262493e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39765560, + "step": 20317 + }, + { + "epoch": 2.692909211398277, + "grad_norm": 1.3240609169006348, + "learning_rate": 1.2827302245630946e-07, + "loss": 0.0055, + "num_input_tokens_seen": 39767280, + "step": 20318 + }, + { + "epoch": 2.6930417495029824, + "grad_norm": 8.028139114379883, + "learning_rate": 1.281632786685258e-07, + "loss": 0.0831, + "num_input_tokens_seen": 39769400, + "step": 20319 + }, + { + "epoch": 2.693174287607687, + "grad_norm": 0.01482705119997263, + "learning_rate": 1.2805358061139084e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39772192, + "step": 20320 + }, + { + "epoch": 2.6933068257123924, + "grad_norm": 0.0503116212785244, + "learning_rate": 1.2794392828701902e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39773496, + "step": 20321 + }, + { + "epoch": 2.6934393638170975, + "grad_norm": 0.022360673174262047, + "learning_rate": 1.2783432169752446e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39776240, + "step": 20322 + }, + { + "epoch": 2.6935719019218025, + "grad_norm": 0.04282457381486893, + "learning_rate": 1.2772476084502078e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39778416, + "step": 20323 + }, + { + "epoch": 2.6937044400265076, + "grad_norm": 0.5014520883560181, + "learning_rate": 1.2761524573162043e-07, + "loss": 0.0019, + "num_input_tokens_seen": 39780656, + "step": 20324 + }, + { + "epoch": 2.6938369781312126, + "grad_norm": 14.330720901489258, + "learning_rate": 1.275057763594345e-07, + "loss": 0.0996, + "num_input_tokens_seen": 39782296, + "step": 20325 + }, + { + "epoch": 2.693969516235918, + "grad_norm": 0.006834366358816624, + "learning_rate": 1.2739635273057411e-07, + "loss": 0.0, + "num_input_tokens_seen": 39783608, + "step": 20326 + }, + { + "epoch": 2.6941020543406227, + "grad_norm": 0.009059164673089981, + "learning_rate": 1.2728697484714836e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39784920, + "step": 20327 + }, + { + "epoch": 2.694234592445328, + "grad_norm": 0.0007869730470702052, + "learning_rate": 1.2717764271126725e-07, + "loss": 0.0, + "num_input_tokens_seen": 39786136, + "step": 20328 + }, + { + "epoch": 2.694367130550033, + "grad_norm": 9.232940673828125, + "learning_rate": 1.2706835632503773e-07, + "loss": 0.0539, + "num_input_tokens_seen": 39787576, + "step": 20329 + }, + { + "epoch": 2.6944996686547382, + "grad_norm": 0.004180433228611946, + "learning_rate": 1.2695911569056723e-07, + "loss": 0.0, + "num_input_tokens_seen": 39788672, + "step": 20330 + }, + { + "epoch": 2.6946322067594433, + "grad_norm": 0.005995085462927818, + "learning_rate": 1.2684992080996245e-07, + "loss": 0.0, + "num_input_tokens_seen": 39790424, + "step": 20331 + }, + { + "epoch": 2.6947647448641483, + "grad_norm": 1.3178893327713013, + "learning_rate": 1.2674077168532806e-07, + "loss": 0.0114, + "num_input_tokens_seen": 39792424, + "step": 20332 + }, + { + "epoch": 2.694897282968854, + "grad_norm": 0.041404444724321365, + "learning_rate": 1.2663166831876878e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39794584, + "step": 20333 + }, + { + "epoch": 2.6950298210735584, + "grad_norm": 9.767465591430664, + "learning_rate": 1.2652261071238847e-07, + "loss": 0.1181, + "num_input_tokens_seen": 39795968, + "step": 20334 + }, + { + "epoch": 2.695162359178264, + "grad_norm": 0.009301705285906792, + "learning_rate": 1.2641359886828935e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39798904, + "step": 20335 + }, + { + "epoch": 2.695294897282969, + "grad_norm": 0.0045341686345636845, + "learning_rate": 1.2630463278857336e-07, + "loss": 0.0, + "num_input_tokens_seen": 39801016, + "step": 20336 + }, + { + "epoch": 2.695427435387674, + "grad_norm": 15.451719284057617, + "learning_rate": 1.2619571247534212e-07, + "loss": 0.1004, + "num_input_tokens_seen": 39802752, + "step": 20337 + }, + { + "epoch": 2.695559973492379, + "grad_norm": 0.46395382285118103, + "learning_rate": 1.2608683793069455e-07, + "loss": 0.0016, + "num_input_tokens_seen": 39804936, + "step": 20338 + }, + { + "epoch": 2.695692511597084, + "grad_norm": 9.996124267578125, + "learning_rate": 1.2597800915673087e-07, + "loss": 0.1075, + "num_input_tokens_seen": 39807032, + "step": 20339 + }, + { + "epoch": 2.6958250497017895, + "grad_norm": 2.716503143310547, + "learning_rate": 1.2586922615554887e-07, + "loss": 0.0128, + "num_input_tokens_seen": 39808768, + "step": 20340 + }, + { + "epoch": 2.6959575878064945, + "grad_norm": 0.00889415480196476, + "learning_rate": 1.2576048892924602e-07, + "loss": 0.0, + "num_input_tokens_seen": 39810992, + "step": 20341 + }, + { + "epoch": 2.6960901259111996, + "grad_norm": 0.9031978845596313, + "learning_rate": 1.2565179747991846e-07, + "loss": 0.0023, + "num_input_tokens_seen": 39813976, + "step": 20342 + }, + { + "epoch": 2.6962226640159046, + "grad_norm": 9.870047569274902, + "learning_rate": 1.2554315180966224e-07, + "loss": 0.1339, + "num_input_tokens_seen": 39815600, + "step": 20343 + }, + { + "epoch": 2.6963552021206096, + "grad_norm": 5.533182621002197, + "learning_rate": 1.254345519205727e-07, + "loss": 0.0702, + "num_input_tokens_seen": 39817440, + "step": 20344 + }, + { + "epoch": 2.6964877402253147, + "grad_norm": 0.0019746068865060806, + "learning_rate": 1.2532599781474282e-07, + "loss": 0.0, + "num_input_tokens_seen": 39818752, + "step": 20345 + }, + { + "epoch": 2.6966202783300197, + "grad_norm": 0.01331594679504633, + "learning_rate": 1.2521748949426594e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39820416, + "step": 20346 + }, + { + "epoch": 2.696752816434725, + "grad_norm": 9.39315128326416, + "learning_rate": 1.2510902696123427e-07, + "loss": 0.08, + "num_input_tokens_seen": 39822208, + "step": 20347 + }, + { + "epoch": 2.6968853545394302, + "grad_norm": 0.024104658514261246, + "learning_rate": 1.2500061021773867e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39824080, + "step": 20348 + }, + { + "epoch": 2.6970178926441353, + "grad_norm": 1.7579188346862793, + "learning_rate": 1.248922392658694e-07, + "loss": 0.0089, + "num_input_tokens_seen": 39826192, + "step": 20349 + }, + { + "epoch": 2.6971504307488403, + "grad_norm": 0.9980867505073547, + "learning_rate": 1.2478391410771701e-07, + "loss": 0.0024, + "num_input_tokens_seen": 39827928, + "step": 20350 + }, + { + "epoch": 2.6972829688535453, + "grad_norm": 0.0054858666844666, + "learning_rate": 1.2467563474536898e-07, + "loss": 0.0, + "num_input_tokens_seen": 39829880, + "step": 20351 + }, + { + "epoch": 2.6974155069582504, + "grad_norm": 0.0034914363641291857, + "learning_rate": 1.2456740118091338e-07, + "loss": 0.0, + "num_input_tokens_seen": 39831112, + "step": 20352 + }, + { + "epoch": 2.6975480450629554, + "grad_norm": 0.0014470757450908422, + "learning_rate": 1.2445921341643686e-07, + "loss": 0.0, + "num_input_tokens_seen": 39832592, + "step": 20353 + }, + { + "epoch": 2.697680583167661, + "grad_norm": 6.767105579376221, + "learning_rate": 1.2435107145402553e-07, + "loss": 0.1556, + "num_input_tokens_seen": 39834424, + "step": 20354 + }, + { + "epoch": 2.697813121272366, + "grad_norm": 1.029522180557251, + "learning_rate": 1.2424297529576469e-07, + "loss": 0.0038, + "num_input_tokens_seen": 39836432, + "step": 20355 + }, + { + "epoch": 2.697945659377071, + "grad_norm": 0.9925873279571533, + "learning_rate": 1.2413492494373818e-07, + "loss": 0.0085, + "num_input_tokens_seen": 39838528, + "step": 20356 + }, + { + "epoch": 2.698078197481776, + "grad_norm": 0.003812601091340184, + "learning_rate": 1.240269204000294e-07, + "loss": 0.0, + "num_input_tokens_seen": 39840032, + "step": 20357 + }, + { + "epoch": 2.698210735586481, + "grad_norm": 1.8558276891708374, + "learning_rate": 1.2391896166672023e-07, + "loss": 0.0056, + "num_input_tokens_seen": 39842040, + "step": 20358 + }, + { + "epoch": 2.698343273691186, + "grad_norm": 10.4681396484375, + "learning_rate": 1.2381104874589294e-07, + "loss": 0.1118, + "num_input_tokens_seen": 39843976, + "step": 20359 + }, + { + "epoch": 2.698475811795891, + "grad_norm": 4.482240676879883, + "learning_rate": 1.237031816396278e-07, + "loss": 0.1225, + "num_input_tokens_seen": 39845632, + "step": 20360 + }, + { + "epoch": 2.6986083499005966, + "grad_norm": 4.063783645629883, + "learning_rate": 1.235953603500048e-07, + "loss": 0.0212, + "num_input_tokens_seen": 39846944, + "step": 20361 + }, + { + "epoch": 2.6987408880053017, + "grad_norm": 0.013657841831445694, + "learning_rate": 1.2348758487910283e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39848408, + "step": 20362 + }, + { + "epoch": 2.6988734261100067, + "grad_norm": 0.329316645860672, + "learning_rate": 1.2337985522899942e-07, + "loss": 0.0011, + "num_input_tokens_seen": 39850600, + "step": 20363 + }, + { + "epoch": 2.6990059642147117, + "grad_norm": 0.6962037682533264, + "learning_rate": 1.2327217140177177e-07, + "loss": 0.002, + "num_input_tokens_seen": 39851992, + "step": 20364 + }, + { + "epoch": 2.6991385023194168, + "grad_norm": 0.0760614275932312, + "learning_rate": 1.2316453339949658e-07, + "loss": 0.0004, + "num_input_tokens_seen": 39853352, + "step": 20365 + }, + { + "epoch": 2.699271040424122, + "grad_norm": 0.0036921033170074224, + "learning_rate": 1.2305694122424884e-07, + "loss": 0.0, + "num_input_tokens_seen": 39854584, + "step": 20366 + }, + { + "epoch": 2.699403578528827, + "grad_norm": 9.803718566894531, + "learning_rate": 1.22949394878103e-07, + "loss": 0.1309, + "num_input_tokens_seen": 39856672, + "step": 20367 + }, + { + "epoch": 2.6995361166335323, + "grad_norm": 11.173903465270996, + "learning_rate": 1.2284189436313265e-07, + "loss": 0.078, + "num_input_tokens_seen": 39858560, + "step": 20368 + }, + { + "epoch": 2.6996686547382374, + "grad_norm": 9.086496353149414, + "learning_rate": 1.2273443968141037e-07, + "loss": 0.0834, + "num_input_tokens_seen": 39860776, + "step": 20369 + }, + { + "epoch": 2.6998011928429424, + "grad_norm": 2.188953161239624, + "learning_rate": 1.2262703083500832e-07, + "loss": 0.0113, + "num_input_tokens_seen": 39862672, + "step": 20370 + }, + { + "epoch": 2.6999337309476474, + "grad_norm": 3.627107858657837, + "learning_rate": 1.2251966782599738e-07, + "loss": 0.0316, + "num_input_tokens_seen": 39864280, + "step": 20371 + }, + { + "epoch": 2.7000662690523525, + "grad_norm": 0.26889508962631226, + "learning_rate": 1.2241235065644675e-07, + "loss": 0.0014, + "num_input_tokens_seen": 39866824, + "step": 20372 + }, + { + "epoch": 2.7001988071570575, + "grad_norm": 18.8335018157959, + "learning_rate": 1.2230507932842668e-07, + "loss": 0.1985, + "num_input_tokens_seen": 39868864, + "step": 20373 + }, + { + "epoch": 2.7003313452617626, + "grad_norm": 6.819803714752197, + "learning_rate": 1.221978538440047e-07, + "loss": 0.059, + "num_input_tokens_seen": 39870808, + "step": 20374 + }, + { + "epoch": 2.700463883366468, + "grad_norm": 0.045212216675281525, + "learning_rate": 1.2209067420524856e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39872376, + "step": 20375 + }, + { + "epoch": 2.700596421471173, + "grad_norm": 2.1256027221679688, + "learning_rate": 1.2198354041422499e-07, + "loss": 0.0237, + "num_input_tokens_seen": 39874992, + "step": 20376 + }, + { + "epoch": 2.700728959575878, + "grad_norm": 0.019258426502346992, + "learning_rate": 1.21876452472999e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39876568, + "step": 20377 + }, + { + "epoch": 2.700861497680583, + "grad_norm": 7.593324661254883, + "learning_rate": 1.2176941038363532e-07, + "loss": 0.0368, + "num_input_tokens_seen": 39878976, + "step": 20378 + }, + { + "epoch": 2.700994035785288, + "grad_norm": 0.007217365317046642, + "learning_rate": 1.2166241414819812e-07, + "loss": 0.0, + "num_input_tokens_seen": 39880120, + "step": 20379 + }, + { + "epoch": 2.701126573889993, + "grad_norm": 22.74839210510254, + "learning_rate": 1.215554637687505e-07, + "loss": 0.2364, + "num_input_tokens_seen": 39881832, + "step": 20380 + }, + { + "epoch": 2.7012591119946983, + "grad_norm": 0.024011442437767982, + "learning_rate": 1.2144855924735467e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39883336, + "step": 20381 + }, + { + "epoch": 2.7013916500994037, + "grad_norm": 0.02151578478515148, + "learning_rate": 1.213417005860712e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39884648, + "step": 20382 + }, + { + "epoch": 2.701524188204109, + "grad_norm": 0.4968040883541107, + "learning_rate": 1.2123488778696042e-07, + "loss": 0.0022, + "num_input_tokens_seen": 39887496, + "step": 20383 + }, + { + "epoch": 2.701656726308814, + "grad_norm": 15.344085693359375, + "learning_rate": 1.2112812085208204e-07, + "loss": 0.1312, + "num_input_tokens_seen": 39889248, + "step": 20384 + }, + { + "epoch": 2.701789264413519, + "grad_norm": 0.46730637550354004, + "learning_rate": 1.2102139978349497e-07, + "loss": 0.0014, + "num_input_tokens_seen": 39890576, + "step": 20385 + }, + { + "epoch": 2.701921802518224, + "grad_norm": 0.03659910336136818, + "learning_rate": 1.2091472458325648e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39892312, + "step": 20386 + }, + { + "epoch": 2.702054340622929, + "grad_norm": 0.5470610857009888, + "learning_rate": 1.2080809525342324e-07, + "loss": 0.0026, + "num_input_tokens_seen": 39894120, + "step": 20387 + }, + { + "epoch": 2.702186878727634, + "grad_norm": 0.20464904606342316, + "learning_rate": 1.2070151179605137e-07, + "loss": 0.0008, + "num_input_tokens_seen": 39895712, + "step": 20388 + }, + { + "epoch": 2.7023194168323394, + "grad_norm": 0.8235451579093933, + "learning_rate": 1.2059497421319537e-07, + "loss": 0.0023, + "num_input_tokens_seen": 39898424, + "step": 20389 + }, + { + "epoch": 2.7024519549370445, + "grad_norm": 1.1988577842712402, + "learning_rate": 1.204884825069097e-07, + "loss": 0.0086, + "num_input_tokens_seen": 39900240, + "step": 20390 + }, + { + "epoch": 2.7025844930417495, + "grad_norm": 19.02069091796875, + "learning_rate": 1.2038203667924797e-07, + "loss": 0.2477, + "num_input_tokens_seen": 39902640, + "step": 20391 + }, + { + "epoch": 2.7027170311464546, + "grad_norm": 2.2827587127685547, + "learning_rate": 1.2027563673226217e-07, + "loss": 0.0039, + "num_input_tokens_seen": 39904408, + "step": 20392 + }, + { + "epoch": 2.7028495692511596, + "grad_norm": 2.2632927894592285, + "learning_rate": 1.20169282668004e-07, + "loss": 0.0135, + "num_input_tokens_seen": 39906648, + "step": 20393 + }, + { + "epoch": 2.702982107355865, + "grad_norm": 0.8650773167610168, + "learning_rate": 1.2006297448852344e-07, + "loss": 0.0061, + "num_input_tokens_seen": 39908064, + "step": 20394 + }, + { + "epoch": 2.7031146454605697, + "grad_norm": 0.004232404287904501, + "learning_rate": 1.1995671219587058e-07, + "loss": 0.0, + "num_input_tokens_seen": 39909440, + "step": 20395 + }, + { + "epoch": 2.703247183565275, + "grad_norm": 0.10431836545467377, + "learning_rate": 1.1985049579209402e-07, + "loss": 0.0003, + "num_input_tokens_seen": 39910840, + "step": 20396 + }, + { + "epoch": 2.70337972166998, + "grad_norm": 0.06244365870952606, + "learning_rate": 1.197443252792424e-07, + "loss": 0.0004, + "num_input_tokens_seen": 39912792, + "step": 20397 + }, + { + "epoch": 2.7035122597746852, + "grad_norm": 0.017905300483107567, + "learning_rate": 1.1963820065936188e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39914464, + "step": 20398 + }, + { + "epoch": 2.7036447978793903, + "grad_norm": 0.09973317384719849, + "learning_rate": 1.1953212193449943e-07, + "loss": 0.0006, + "num_input_tokens_seen": 39916520, + "step": 20399 + }, + { + "epoch": 2.7037773359840953, + "grad_norm": 0.048305001109838486, + "learning_rate": 1.1942608910669924e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39918584, + "step": 20400 + }, + { + "epoch": 2.703909874088801, + "grad_norm": 0.029026715084910393, + "learning_rate": 1.193201021780066e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39920560, + "step": 20401 + }, + { + "epoch": 2.7040424121935054, + "grad_norm": 0.46186208724975586, + "learning_rate": 1.1921416115046463e-07, + "loss": 0.0013, + "num_input_tokens_seen": 39922720, + "step": 20402 + }, + { + "epoch": 2.704174950298211, + "grad_norm": 4.987401485443115, + "learning_rate": 1.1910826602611642e-07, + "loss": 0.0172, + "num_input_tokens_seen": 39925008, + "step": 20403 + }, + { + "epoch": 2.704307488402916, + "grad_norm": 5.420884609222412, + "learning_rate": 1.1900241680700308e-07, + "loss": 0.0275, + "num_input_tokens_seen": 39926752, + "step": 20404 + }, + { + "epoch": 2.704440026507621, + "grad_norm": 0.003904491662979126, + "learning_rate": 1.1889661349516552e-07, + "loss": 0.0, + "num_input_tokens_seen": 39928224, + "step": 20405 + }, + { + "epoch": 2.704572564612326, + "grad_norm": 4.0419602394104, + "learning_rate": 1.187908560926443e-07, + "loss": 0.0225, + "num_input_tokens_seen": 39929936, + "step": 20406 + }, + { + "epoch": 2.704705102717031, + "grad_norm": 0.0070715174078941345, + "learning_rate": 1.1868514460147751e-07, + "loss": 0.0, + "num_input_tokens_seen": 39931288, + "step": 20407 + }, + { + "epoch": 2.7048376408217365, + "grad_norm": 0.02131757140159607, + "learning_rate": 1.1857947902370437e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39933976, + "step": 20408 + }, + { + "epoch": 2.704970178926441, + "grad_norm": 0.0004918212071061134, + "learning_rate": 1.1847385936136186e-07, + "loss": 0.0, + "num_input_tokens_seen": 39935184, + "step": 20409 + }, + { + "epoch": 2.7051027170311466, + "grad_norm": 2.3985321521759033, + "learning_rate": 1.1836828561648583e-07, + "loss": 0.0084, + "num_input_tokens_seen": 39937392, + "step": 20410 + }, + { + "epoch": 2.7052352551358516, + "grad_norm": 1.473010540008545, + "learning_rate": 1.1826275779111274e-07, + "loss": 0.0049, + "num_input_tokens_seen": 39938880, + "step": 20411 + }, + { + "epoch": 2.7053677932405567, + "grad_norm": 5.794705390930176, + "learning_rate": 1.1815727588727648e-07, + "loss": 0.0629, + "num_input_tokens_seen": 39941256, + "step": 20412 + }, + { + "epoch": 2.7055003313452617, + "grad_norm": 2.4563493728637695, + "learning_rate": 1.1805183990701102e-07, + "loss": 0.0135, + "num_input_tokens_seen": 39942840, + "step": 20413 + }, + { + "epoch": 2.7056328694499667, + "grad_norm": 1.3914251327514648, + "learning_rate": 1.1794644985234943e-07, + "loss": 0.0077, + "num_input_tokens_seen": 39944768, + "step": 20414 + }, + { + "epoch": 2.705765407554672, + "grad_norm": 10.71847915649414, + "learning_rate": 1.1784110572532342e-07, + "loss": 0.0653, + "num_input_tokens_seen": 39946248, + "step": 20415 + }, + { + "epoch": 2.705897945659377, + "grad_norm": 0.015905441716313362, + "learning_rate": 1.1773580752796443e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39948800, + "step": 20416 + }, + { + "epoch": 2.7060304837640823, + "grad_norm": 0.02157069742679596, + "learning_rate": 1.176305552623025e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39950520, + "step": 20417 + }, + { + "epoch": 2.7061630218687873, + "grad_norm": 0.00991250853985548, + "learning_rate": 1.175253489303671e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39951816, + "step": 20418 + }, + { + "epoch": 2.7062955599734924, + "grad_norm": 0.0071641551330685616, + "learning_rate": 1.1742018853418608e-07, + "loss": 0.0, + "num_input_tokens_seen": 39954040, + "step": 20419 + }, + { + "epoch": 2.7064280980781974, + "grad_norm": 0.0017371606081724167, + "learning_rate": 1.1731507407578807e-07, + "loss": 0.0, + "num_input_tokens_seen": 39956000, + "step": 20420 + }, + { + "epoch": 2.7065606361829024, + "grad_norm": 1.1266764402389526, + "learning_rate": 1.1721000555719869e-07, + "loss": 0.0043, + "num_input_tokens_seen": 39957664, + "step": 20421 + }, + { + "epoch": 2.706693174287608, + "grad_norm": 0.006660205312073231, + "learning_rate": 1.1710498298044437e-07, + "loss": 0.0, + "num_input_tokens_seen": 39960280, + "step": 20422 + }, + { + "epoch": 2.706825712392313, + "grad_norm": 0.004107495304197073, + "learning_rate": 1.1700000634755015e-07, + "loss": 0.0, + "num_input_tokens_seen": 39961832, + "step": 20423 + }, + { + "epoch": 2.706958250497018, + "grad_norm": 7.976728916168213, + "learning_rate": 1.168950756605397e-07, + "loss": 0.1257, + "num_input_tokens_seen": 39963336, + "step": 20424 + }, + { + "epoch": 2.707090788601723, + "grad_norm": 0.010293417610228062, + "learning_rate": 1.1679019092143585e-07, + "loss": 0.0, + "num_input_tokens_seen": 39964744, + "step": 20425 + }, + { + "epoch": 2.707223326706428, + "grad_norm": 0.04261060804128647, + "learning_rate": 1.1668535213226112e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39966448, + "step": 20426 + }, + { + "epoch": 2.707355864811133, + "grad_norm": 4.443118095397949, + "learning_rate": 1.1658055929503725e-07, + "loss": 0.0578, + "num_input_tokens_seen": 39968440, + "step": 20427 + }, + { + "epoch": 2.707488402915838, + "grad_norm": 0.004546963144093752, + "learning_rate": 1.1647581241178457e-07, + "loss": 0.0, + "num_input_tokens_seen": 39971024, + "step": 20428 + }, + { + "epoch": 2.7076209410205436, + "grad_norm": 0.003629885846748948, + "learning_rate": 1.1637111148452257e-07, + "loss": 0.0, + "num_input_tokens_seen": 39972504, + "step": 20429 + }, + { + "epoch": 2.7077534791252487, + "grad_norm": 0.38038837909698486, + "learning_rate": 1.1626645651526935e-07, + "loss": 0.0009, + "num_input_tokens_seen": 39974376, + "step": 20430 + }, + { + "epoch": 2.7078860172299537, + "grad_norm": 4.97283411026001, + "learning_rate": 1.1616184750604387e-07, + "loss": 0.026, + "num_input_tokens_seen": 39976392, + "step": 20431 + }, + { + "epoch": 2.7080185553346587, + "grad_norm": 0.13921299576759338, + "learning_rate": 1.16057284458862e-07, + "loss": 0.0005, + "num_input_tokens_seen": 39977728, + "step": 20432 + }, + { + "epoch": 2.7081510934393638, + "grad_norm": 0.019957350566983223, + "learning_rate": 1.1595276737574074e-07, + "loss": 0.0001, + "num_input_tokens_seen": 39979536, + "step": 20433 + }, + { + "epoch": 2.708283631544069, + "grad_norm": 4.023921012878418, + "learning_rate": 1.1584829625869487e-07, + "loss": 0.0376, + "num_input_tokens_seen": 39981032, + "step": 20434 + }, + { + "epoch": 2.708416169648774, + "grad_norm": 0.027752919122576714, + "learning_rate": 1.1574387110973834e-07, + "loss": 0.0002, + "num_input_tokens_seen": 39983312, + "step": 20435 + }, + { + "epoch": 2.7085487077534793, + "grad_norm": 0.34686166048049927, + "learning_rate": 1.1563949193088453e-07, + "loss": 0.0014, + "num_input_tokens_seen": 39986016, + "step": 20436 + }, + { + "epoch": 2.7086812458581844, + "grad_norm": 10.66120433807373, + "learning_rate": 1.1553515872414628e-07, + "loss": 0.0998, + "num_input_tokens_seen": 39988152, + "step": 20437 + }, + { + "epoch": 2.7088137839628894, + "grad_norm": 0.003483023727312684, + "learning_rate": 1.1543087149153559e-07, + "loss": 0.0, + "num_input_tokens_seen": 39990192, + "step": 20438 + }, + { + "epoch": 2.7089463220675944, + "grad_norm": 6.696458339691162, + "learning_rate": 1.1532663023506251e-07, + "loss": 0.0492, + "num_input_tokens_seen": 39993136, + "step": 20439 + }, + { + "epoch": 2.7090788601722995, + "grad_norm": 3.7017924785614014, + "learning_rate": 1.1522243495673713e-07, + "loss": 0.0088, + "num_input_tokens_seen": 39995056, + "step": 20440 + }, + { + "epoch": 2.7092113982770045, + "grad_norm": 6.560924530029297, + "learning_rate": 1.1511828565856809e-07, + "loss": 0.0128, + "num_input_tokens_seen": 39996888, + "step": 20441 + }, + { + "epoch": 2.7093439363817096, + "grad_norm": 10.480481147766113, + "learning_rate": 1.1501418234256406e-07, + "loss": 0.0384, + "num_input_tokens_seen": 39998472, + "step": 20442 + }, + { + "epoch": 2.709476474486415, + "grad_norm": 9.672799110412598, + "learning_rate": 1.1491012501073152e-07, + "loss": 0.0716, + "num_input_tokens_seen": 40000760, + "step": 20443 + }, + { + "epoch": 2.70960901259112, + "grad_norm": 0.2651550769805908, + "learning_rate": 1.1480611366507771e-07, + "loss": 0.0021, + "num_input_tokens_seen": 40002512, + "step": 20444 + }, + { + "epoch": 2.709741550695825, + "grad_norm": 0.0073428223840892315, + "learning_rate": 1.1470214830760717e-07, + "loss": 0.0, + "num_input_tokens_seen": 40003960, + "step": 20445 + }, + { + "epoch": 2.70987408880053, + "grad_norm": 0.04767048358917236, + "learning_rate": 1.1459822894032497e-07, + "loss": 0.0003, + "num_input_tokens_seen": 40005648, + "step": 20446 + }, + { + "epoch": 2.710006626905235, + "grad_norm": 0.6073043942451477, + "learning_rate": 1.1449435556523419e-07, + "loss": 0.0026, + "num_input_tokens_seen": 40007568, + "step": 20447 + }, + { + "epoch": 2.7101391650099402, + "grad_norm": 1.798175573348999, + "learning_rate": 1.1439052818433826e-07, + "loss": 0.0113, + "num_input_tokens_seen": 40009664, + "step": 20448 + }, + { + "epoch": 2.7102717031146453, + "grad_norm": 0.03706727921962738, + "learning_rate": 1.1428674679963836e-07, + "loss": 0.0002, + "num_input_tokens_seen": 40011160, + "step": 20449 + }, + { + "epoch": 2.7104042412193508, + "grad_norm": 0.0514959879219532, + "learning_rate": 1.1418301141313592e-07, + "loss": 0.0003, + "num_input_tokens_seen": 40013464, + "step": 20450 + }, + { + "epoch": 2.710536779324056, + "grad_norm": 2.8121190071105957, + "learning_rate": 1.1407932202683103e-07, + "loss": 0.0164, + "num_input_tokens_seen": 40015320, + "step": 20451 + }, + { + "epoch": 2.710669317428761, + "grad_norm": 7.000250339508057, + "learning_rate": 1.1397567864272263e-07, + "loss": 0.0455, + "num_input_tokens_seen": 40017328, + "step": 20452 + }, + { + "epoch": 2.710801855533466, + "grad_norm": 5.649206161499023, + "learning_rate": 1.1387208126280913e-07, + "loss": 0.0467, + "num_input_tokens_seen": 40019280, + "step": 20453 + }, + { + "epoch": 2.710934393638171, + "grad_norm": 1.0253263711929321, + "learning_rate": 1.1376852988908837e-07, + "loss": 0.004, + "num_input_tokens_seen": 40021936, + "step": 20454 + }, + { + "epoch": 2.711066931742876, + "grad_norm": 0.005749219097197056, + "learning_rate": 1.1366502452355599e-07, + "loss": 0.0, + "num_input_tokens_seen": 40023536, + "step": 20455 + }, + { + "epoch": 2.711199469847581, + "grad_norm": 0.35451579093933105, + "learning_rate": 1.135615651682087e-07, + "loss": 0.0005, + "num_input_tokens_seen": 40025264, + "step": 20456 + }, + { + "epoch": 2.7113320079522865, + "grad_norm": 2.415989398956299, + "learning_rate": 1.1345815182504021e-07, + "loss": 0.0093, + "num_input_tokens_seen": 40027440, + "step": 20457 + }, + { + "epoch": 2.7114645460569915, + "grad_norm": 13.429655075073242, + "learning_rate": 1.133547844960453e-07, + "loss": 0.097, + "num_input_tokens_seen": 40029696, + "step": 20458 + }, + { + "epoch": 2.7115970841616965, + "grad_norm": 1.1104086637496948, + "learning_rate": 1.1325146318321684e-07, + "loss": 0.0033, + "num_input_tokens_seen": 40031704, + "step": 20459 + }, + { + "epoch": 2.7117296222664016, + "grad_norm": 0.022360466420650482, + "learning_rate": 1.1314818788854626e-07, + "loss": 0.0001, + "num_input_tokens_seen": 40033472, + "step": 20460 + }, + { + "epoch": 2.7118621603711066, + "grad_norm": 0.0052514513954520226, + "learning_rate": 1.1304495861402531e-07, + "loss": 0.0, + "num_input_tokens_seen": 40035912, + "step": 20461 + }, + { + "epoch": 2.7119946984758116, + "grad_norm": 11.62448501586914, + "learning_rate": 1.1294177536164408e-07, + "loss": 0.1042, + "num_input_tokens_seen": 40038400, + "step": 20462 + }, + { + "epoch": 2.7121272365805167, + "grad_norm": 3.156332492828369, + "learning_rate": 1.1283863813339263e-07, + "loss": 0.0181, + "num_input_tokens_seen": 40040488, + "step": 20463 + }, + { + "epoch": 2.712259774685222, + "grad_norm": 0.008256812579929829, + "learning_rate": 1.1273554693125883e-07, + "loss": 0.0, + "num_input_tokens_seen": 40043160, + "step": 20464 + }, + { + "epoch": 2.712392312789927, + "grad_norm": 0.026214120909571648, + "learning_rate": 1.1263250175723078e-07, + "loss": 0.0001, + "num_input_tokens_seen": 40045056, + "step": 20465 + }, + { + "epoch": 2.7125248508946322, + "grad_norm": 2.1159303188323975, + "learning_rate": 1.125295026132947e-07, + "loss": 0.0114, + "num_input_tokens_seen": 40048360, + "step": 20466 + }, + { + "epoch": 2.7126573889993373, + "grad_norm": 0.03746640682220459, + "learning_rate": 1.1242654950143678e-07, + "loss": 0.0001, + "num_input_tokens_seen": 40049504, + "step": 20467 + }, + { + "epoch": 2.7127899271040423, + "grad_norm": 2.9890410900115967, + "learning_rate": 1.1232364242364236e-07, + "loss": 0.018, + "num_input_tokens_seen": 40051448, + "step": 20468 + }, + { + "epoch": 2.7129224652087474, + "grad_norm": 0.6385653614997864, + "learning_rate": 1.122207813818954e-07, + "loss": 0.0017, + "num_input_tokens_seen": 40053640, + "step": 20469 + }, + { + "epoch": 2.7130550033134524, + "grad_norm": 3.001546621322632, + "learning_rate": 1.1211796637817879e-07, + "loss": 0.0138, + "num_input_tokens_seen": 40056600, + "step": 20470 + }, + { + "epoch": 2.713187541418158, + "grad_norm": 1.5109045505523682, + "learning_rate": 1.1201519741447536e-07, + "loss": 0.0076, + "num_input_tokens_seen": 40058824, + "step": 20471 + }, + { + "epoch": 2.713320079522863, + "grad_norm": 0.00196451204828918, + "learning_rate": 1.1191247449276576e-07, + "loss": 0.0, + "num_input_tokens_seen": 40060240, + "step": 20472 + }, + { + "epoch": 2.713452617627568, + "grad_norm": 0.003959778696298599, + "learning_rate": 1.1180979761503147e-07, + "loss": 0.0, + "num_input_tokens_seen": 40061520, + "step": 20473 + }, + { + "epoch": 2.713585155732273, + "grad_norm": 0.47768163681030273, + "learning_rate": 1.1170716678325171e-07, + "loss": 0.0013, + "num_input_tokens_seen": 40063840, + "step": 20474 + }, + { + "epoch": 2.713717693836978, + "grad_norm": 0.07340512424707413, + "learning_rate": 1.1160458199940577e-07, + "loss": 0.0004, + "num_input_tokens_seen": 40066592, + "step": 20475 + }, + { + "epoch": 2.7138502319416835, + "grad_norm": 1.4453693628311157, + "learning_rate": 1.1150204326547092e-07, + "loss": 0.0166, + "num_input_tokens_seen": 40068968, + "step": 20476 + }, + { + "epoch": 2.713982770046388, + "grad_norm": 0.0018821981502696872, + "learning_rate": 1.113995505834245e-07, + "loss": 0.0, + "num_input_tokens_seen": 40069984, + "step": 20477 + }, + { + "epoch": 2.7141153081510936, + "grad_norm": 2.4498016834259033, + "learning_rate": 1.1129710395524213e-07, + "loss": 0.0067, + "num_input_tokens_seen": 40071968, + "step": 20478 + }, + { + "epoch": 2.7142478462557986, + "grad_norm": 4.701002597808838, + "learning_rate": 1.1119470338289945e-07, + "loss": 0.0252, + "num_input_tokens_seen": 40073848, + "step": 20479 + }, + { + "epoch": 2.7143803843605037, + "grad_norm": 0.08684976398944855, + "learning_rate": 1.1109234886837127e-07, + "loss": 0.0005, + "num_input_tokens_seen": 40076144, + "step": 20480 + }, + { + "epoch": 2.7145129224652087, + "grad_norm": 0.024829061701893806, + "learning_rate": 1.1099004041363049e-07, + "loss": 0.0002, + "num_input_tokens_seen": 40077656, + "step": 20481 + }, + { + "epoch": 2.7146454605699137, + "grad_norm": 0.00728806434199214, + "learning_rate": 1.1088777802064993e-07, + "loss": 0.0, + "num_input_tokens_seen": 40079648, + "step": 20482 + }, + { + "epoch": 2.714777998674619, + "grad_norm": 2.1082491874694824, + "learning_rate": 1.1078556169140109e-07, + "loss": 0.0098, + "num_input_tokens_seen": 40081672, + "step": 20483 + }, + { + "epoch": 2.714910536779324, + "grad_norm": 0.6471569538116455, + "learning_rate": 1.1068339142785462e-07, + "loss": 0.0022, + "num_input_tokens_seen": 40083920, + "step": 20484 + }, + { + "epoch": 2.7150430748840293, + "grad_norm": 0.020151810720562935, + "learning_rate": 1.1058126723198088e-07, + "loss": 0.0001, + "num_input_tokens_seen": 40085984, + "step": 20485 + }, + { + "epoch": 2.7151756129887343, + "grad_norm": 0.01663009449839592, + "learning_rate": 1.1047918910574884e-07, + "loss": 0.0001, + "num_input_tokens_seen": 40087648, + "step": 20486 + }, + { + "epoch": 2.7153081510934394, + "grad_norm": 10.400846481323242, + "learning_rate": 1.1037715705112639e-07, + "loss": 0.0343, + "num_input_tokens_seen": 40089632, + "step": 20487 + }, + { + "epoch": 2.7154406891981444, + "grad_norm": 0.05939977243542671, + "learning_rate": 1.1027517107008057e-07, + "loss": 0.0002, + "num_input_tokens_seen": 40091512, + "step": 20488 + }, + { + "epoch": 2.7155732273028494, + "grad_norm": 14.993009567260742, + "learning_rate": 1.101732311645784e-07, + "loss": 0.1527, + "num_input_tokens_seen": 40093936, + "step": 20489 + }, + { + "epoch": 2.715705765407555, + "grad_norm": 0.001280203927308321, + "learning_rate": 1.100713373365847e-07, + "loss": 0.0, + "num_input_tokens_seen": 40095896, + "step": 20490 + }, + { + "epoch": 2.7158383035122595, + "grad_norm": 0.005018872674554586, + "learning_rate": 1.0996948958806459e-07, + "loss": 0.0, + "num_input_tokens_seen": 40098392, + "step": 20491 + }, + { + "epoch": 2.715970841616965, + "grad_norm": 0.04099457710981369, + "learning_rate": 1.0986768792098146e-07, + "loss": 0.0002, + "num_input_tokens_seen": 40099656, + "step": 20492 + }, + { + "epoch": 2.71610337972167, + "grad_norm": 10.873488426208496, + "learning_rate": 1.0976593233729793e-07, + "loss": 0.0858, + "num_input_tokens_seen": 40101896, + "step": 20493 + }, + { + "epoch": 2.716235917826375, + "grad_norm": 0.0017908806912600994, + "learning_rate": 1.096642228389766e-07, + "loss": 0.0, + "num_input_tokens_seen": 40103064, + "step": 20494 + }, + { + "epoch": 2.71636845593108, + "grad_norm": 0.006518849637359381, + "learning_rate": 1.0956255942797784e-07, + "loss": 0.0, + "num_input_tokens_seen": 40104296, + "step": 20495 + }, + { + "epoch": 2.716500994035785, + "grad_norm": 9.36495304107666, + "learning_rate": 1.0946094210626201e-07, + "loss": 0.1394, + "num_input_tokens_seen": 40107104, + "step": 20496 + }, + { + "epoch": 2.7166335321404906, + "grad_norm": 3.4203884601593018, + "learning_rate": 1.093593708757884e-07, + "loss": 0.0118, + "num_input_tokens_seen": 40109296, + "step": 20497 + }, + { + "epoch": 2.7167660702451952, + "grad_norm": 0.003240822581574321, + "learning_rate": 1.0925784573851517e-07, + "loss": 0.0, + "num_input_tokens_seen": 40111472, + "step": 20498 + }, + { + "epoch": 2.7168986083499007, + "grad_norm": 6.11899995803833, + "learning_rate": 1.0915636669640017e-07, + "loss": 0.0676, + "num_input_tokens_seen": 40113328, + "step": 20499 + }, + { + "epoch": 2.7170311464546058, + "grad_norm": 1.2890818119049072, + "learning_rate": 1.0905493375139963e-07, + "loss": 0.0071, + "num_input_tokens_seen": 40114576, + "step": 20500 + }, + { + "epoch": 2.717163684559311, + "grad_norm": 0.14043845236301422, + "learning_rate": 1.0895354690546978e-07, + "loss": 0.0007, + "num_input_tokens_seen": 40115792, + "step": 20501 + }, + { + "epoch": 2.717296222664016, + "grad_norm": 11.39294147491455, + "learning_rate": 1.0885220616056458e-07, + "loss": 0.2703, + "num_input_tokens_seen": 40118256, + "step": 20502 + }, + { + "epoch": 2.717428760768721, + "grad_norm": 8.292712211608887, + "learning_rate": 1.0875091151863859e-07, + "loss": 0.0941, + "num_input_tokens_seen": 40119832, + "step": 20503 + }, + { + "epoch": 2.7175612988734263, + "grad_norm": 0.020115241408348083, + "learning_rate": 1.0864966298164442e-07, + "loss": 0.0001, + "num_input_tokens_seen": 40121432, + "step": 20504 + }, + { + "epoch": 2.717693836978131, + "grad_norm": 0.0244462713599205, + "learning_rate": 1.0854846055153468e-07, + "loss": 0.0001, + "num_input_tokens_seen": 40122968, + "step": 20505 + }, + { + "epoch": 2.7178263750828364, + "grad_norm": 5.001358509063721, + "learning_rate": 1.0844730423026057e-07, + "loss": 0.0049, + "num_input_tokens_seen": 40125144, + "step": 20506 + }, + { + "epoch": 2.7179589131875415, + "grad_norm": 0.007635052781552076, + "learning_rate": 1.083461940197722e-07, + "loss": 0.0, + "num_input_tokens_seen": 40128648, + "step": 20507 + }, + { + "epoch": 2.7180914512922465, + "grad_norm": 0.08132961392402649, + "learning_rate": 1.0824512992201886e-07, + "loss": 0.0005, + "num_input_tokens_seen": 40131584, + "step": 20508 + }, + { + "epoch": 2.7182239893969515, + "grad_norm": 2.2775979042053223, + "learning_rate": 1.0814411193894952e-07, + "loss": 0.0112, + "num_input_tokens_seen": 40134184, + "step": 20509 + }, + { + "epoch": 2.7183565275016566, + "grad_norm": 0.33755895495414734, + "learning_rate": 1.0804314007251182e-07, + "loss": 0.0019, + "num_input_tokens_seen": 40136680, + "step": 20510 + }, + { + "epoch": 2.718489065606362, + "grad_norm": 0.021252945065498352, + "learning_rate": 1.0794221432465252e-07, + "loss": 0.0001, + "num_input_tokens_seen": 40139896, + "step": 20511 + }, + { + "epoch": 2.718621603711067, + "grad_norm": 2.2145538330078125, + "learning_rate": 1.0784133469731784e-07, + "loss": 0.0116, + "num_input_tokens_seen": 40142248, + "step": 20512 + }, + { + "epoch": 2.718754141815772, + "grad_norm": 17.48003387451172, + "learning_rate": 1.0774050119245178e-07, + "loss": 0.1989, + "num_input_tokens_seen": 40144680, + "step": 20513 + }, + { + "epoch": 2.718886679920477, + "grad_norm": 0.017997637391090393, + "learning_rate": 1.0763971381199945e-07, + "loss": 0.0001, + "num_input_tokens_seen": 40146520, + "step": 20514 + }, + { + "epoch": 2.719019218025182, + "grad_norm": 0.002188366837799549, + "learning_rate": 1.0753897255790402e-07, + "loss": 0.0, + "num_input_tokens_seen": 40147968, + "step": 20515 + }, + { + "epoch": 2.7191517561298872, + "grad_norm": 0.013410797342658043, + "learning_rate": 1.0743827743210783e-07, + "loss": 0.0001, + "num_input_tokens_seen": 40149648, + "step": 20516 + }, + { + "epoch": 2.7192842942345923, + "grad_norm": 0.006881359498947859, + "learning_rate": 1.073376284365521e-07, + "loss": 0.0, + "num_input_tokens_seen": 40151928, + "step": 20517 + }, + { + "epoch": 2.7194168323392978, + "grad_norm": 1.909382700920105, + "learning_rate": 1.072370255731775e-07, + "loss": 0.0054, + "num_input_tokens_seen": 40153800, + "step": 20518 + }, + { + "epoch": 2.719549370444003, + "grad_norm": 0.8635737299919128, + "learning_rate": 1.0713646884392331e-07, + "loss": 0.003, + "num_input_tokens_seen": 40157112, + "step": 20519 + }, + { + "epoch": 2.719681908548708, + "grad_norm": 0.06327981501817703, + "learning_rate": 1.0703595825072882e-07, + "loss": 0.0002, + "num_input_tokens_seen": 40158472, + "step": 20520 + }, + { + "epoch": 2.719814446653413, + "grad_norm": 6.974562644958496, + "learning_rate": 1.0693549379553219e-07, + "loss": 0.0805, + "num_input_tokens_seen": 40160856, + "step": 20521 + }, + { + "epoch": 2.719946984758118, + "grad_norm": 5.166565418243408, + "learning_rate": 1.0683507548027022e-07, + "loss": 0.0498, + "num_input_tokens_seen": 40163760, + "step": 20522 + }, + { + "epoch": 2.720079522862823, + "grad_norm": 11.195116996765137, + "learning_rate": 1.0673470330687913e-07, + "loss": 0.0944, + "num_input_tokens_seen": 40165944, + "step": 20523 + }, + { + "epoch": 2.720212060967528, + "grad_norm": 0.1436731368303299, + "learning_rate": 1.066343772772932e-07, + "loss": 0.0003, + "num_input_tokens_seen": 40168904, + "step": 20524 + }, + { + "epoch": 2.7203445990722335, + "grad_norm": 0.313079833984375, + "learning_rate": 1.0653409739344839e-07, + "loss": 0.0013, + "num_input_tokens_seen": 40170928, + "step": 20525 + }, + { + "epoch": 2.7204771371769385, + "grad_norm": 37.44199752807617, + "learning_rate": 1.0643386365727675e-07, + "loss": 0.1942, + "num_input_tokens_seen": 40172720, + "step": 20526 + }, + { + "epoch": 2.7206096752816435, + "grad_norm": 0.0012883244780823588, + "learning_rate": 1.0633367607071176e-07, + "loss": 0.0, + "num_input_tokens_seen": 40174112, + "step": 20527 + }, + { + "epoch": 2.7207422133863486, + "grad_norm": 0.19330883026123047, + "learning_rate": 1.0623353463568493e-07, + "loss": 0.0008, + "num_input_tokens_seen": 40176360, + "step": 20528 + }, + { + "epoch": 2.7208747514910536, + "grad_norm": 1.4043216705322266, + "learning_rate": 1.0613343935412635e-07, + "loss": 0.0065, + "num_input_tokens_seen": 40177496, + "step": 20529 + }, + { + "epoch": 2.7210072895957587, + "grad_norm": 0.015906674787402153, + "learning_rate": 1.06033390227967e-07, + "loss": 0.0001, + "num_input_tokens_seen": 40180416, + "step": 20530 + }, + { + "epoch": 2.7211398277004637, + "grad_norm": 0.22724610567092896, + "learning_rate": 1.0593338725913532e-07, + "loss": 0.0005, + "num_input_tokens_seen": 40182400, + "step": 20531 + }, + { + "epoch": 2.721272365805169, + "grad_norm": 0.6512168049812317, + "learning_rate": 1.0583343044955923e-07, + "loss": 0.0012, + "num_input_tokens_seen": 40184272, + "step": 20532 + }, + { + "epoch": 2.721404903909874, + "grad_norm": 0.03858548030257225, + "learning_rate": 1.0573351980116664e-07, + "loss": 0.0002, + "num_input_tokens_seen": 40186016, + "step": 20533 + }, + { + "epoch": 2.7215374420145793, + "grad_norm": 8.55422592163086, + "learning_rate": 1.0563365531588349e-07, + "loss": 0.057, + "num_input_tokens_seen": 40187752, + "step": 20534 + }, + { + "epoch": 2.7216699801192843, + "grad_norm": 7.8189592361450195, + "learning_rate": 1.0553383699563491e-07, + "loss": 0.1266, + "num_input_tokens_seen": 40190760, + "step": 20535 + }, + { + "epoch": 2.7218025182239893, + "grad_norm": 9.852140426635742, + "learning_rate": 1.0543406484234631e-07, + "loss": 0.1764, + "num_input_tokens_seen": 40193104, + "step": 20536 + }, + { + "epoch": 2.7219350563286944, + "grad_norm": 0.9803377985954285, + "learning_rate": 1.0533433885794031e-07, + "loss": 0.004, + "num_input_tokens_seen": 40196088, + "step": 20537 + }, + { + "epoch": 2.7220675944333994, + "grad_norm": 2.134528398513794, + "learning_rate": 1.0523465904434066e-07, + "loss": 0.0087, + "num_input_tokens_seen": 40197712, + "step": 20538 + }, + { + "epoch": 2.722200132538105, + "grad_norm": 17.87036895751953, + "learning_rate": 1.0513502540346888e-07, + "loss": 0.019, + "num_input_tokens_seen": 40199376, + "step": 20539 + }, + { + "epoch": 2.72233267064281, + "grad_norm": 0.0033193696290254593, + "learning_rate": 1.0503543793724563e-07, + "loss": 0.0, + "num_input_tokens_seen": 40200840, + "step": 20540 + }, + { + "epoch": 2.722465208747515, + "grad_norm": 1.9549384117126465, + "learning_rate": 1.0493589664759163e-07, + "loss": 0.0129, + "num_input_tokens_seen": 40202664, + "step": 20541 + }, + { + "epoch": 2.72259774685222, + "grad_norm": 5.965747833251953, + "learning_rate": 1.0483640153642561e-07, + "loss": 0.0227, + "num_input_tokens_seen": 40205176, + "step": 20542 + }, + { + "epoch": 2.722730284956925, + "grad_norm": 21.051149368286133, + "learning_rate": 1.0473695260566602e-07, + "loss": 0.3597, + "num_input_tokens_seen": 40207336, + "step": 20543 + }, + { + "epoch": 2.72286282306163, + "grad_norm": 0.5468198657035828, + "learning_rate": 1.0463754985723051e-07, + "loss": 0.0022, + "num_input_tokens_seen": 40209776, + "step": 20544 + }, + { + "epoch": 2.722995361166335, + "grad_norm": 0.15997539460659027, + "learning_rate": 1.045381932930356e-07, + "loss": 0.0004, + "num_input_tokens_seen": 40211464, + "step": 20545 + }, + { + "epoch": 2.7231278992710406, + "grad_norm": 0.167790487408638, + "learning_rate": 1.0443888291499671e-07, + "loss": 0.0012, + "num_input_tokens_seen": 40213968, + "step": 20546 + }, + { + "epoch": 2.7232604373757456, + "grad_norm": 0.7253275513648987, + "learning_rate": 1.0433961872502895e-07, + "loss": 0.003, + "num_input_tokens_seen": 40216184, + "step": 20547 + }, + { + "epoch": 2.7233929754804507, + "grad_norm": 9.830698013305664, + "learning_rate": 1.042404007250461e-07, + "loss": 0.1966, + "num_input_tokens_seen": 40217920, + "step": 20548 + }, + { + "epoch": 2.7235255135851557, + "grad_norm": 0.06328345835208893, + "learning_rate": 1.0414122891696077e-07, + "loss": 0.0003, + "num_input_tokens_seen": 40219920, + "step": 20549 + }, + { + "epoch": 2.7236580516898607, + "grad_norm": 0.11352048814296722, + "learning_rate": 1.0404210330268532e-07, + "loss": 0.0005, + "num_input_tokens_seen": 40221296, + "step": 20550 + }, + { + "epoch": 2.723790589794566, + "grad_norm": 2.7292401790618896, + "learning_rate": 1.039430238841313e-07, + "loss": 0.0172, + "num_input_tokens_seen": 40223568, + "step": 20551 + }, + { + "epoch": 2.723923127899271, + "grad_norm": 8.333146095275879, + "learning_rate": 1.0384399066320883e-07, + "loss": 0.1421, + "num_input_tokens_seen": 40225400, + "step": 20552 + }, + { + "epoch": 2.7240556660039763, + "grad_norm": 0.14083242416381836, + "learning_rate": 1.0374500364182722e-07, + "loss": 0.0006, + "num_input_tokens_seen": 40227656, + "step": 20553 + }, + { + "epoch": 2.7241882041086813, + "grad_norm": 0.000848158320877701, + "learning_rate": 1.0364606282189493e-07, + "loss": 0.0, + "num_input_tokens_seen": 40229248, + "step": 20554 + }, + { + "epoch": 2.7243207422133864, + "grad_norm": 46.73067092895508, + "learning_rate": 1.0354716820531935e-07, + "loss": 0.2044, + "num_input_tokens_seen": 40231304, + "step": 20555 + }, + { + "epoch": 2.7244532803180914, + "grad_norm": 0.04204120859503746, + "learning_rate": 1.0344831979400754e-07, + "loss": 0.0002, + "num_input_tokens_seen": 40233544, + "step": 20556 + }, + { + "epoch": 2.7245858184227965, + "grad_norm": 7.8568315505981445, + "learning_rate": 1.0334951758986606e-07, + "loss": 0.0511, + "num_input_tokens_seen": 40235944, + "step": 20557 + }, + { + "epoch": 2.7247183565275015, + "grad_norm": 0.030321598052978516, + "learning_rate": 1.032507615947989e-07, + "loss": 0.0002, + "num_input_tokens_seen": 40237576, + "step": 20558 + }, + { + "epoch": 2.7248508946322065, + "grad_norm": 0.6467282772064209, + "learning_rate": 1.031520518107107e-07, + "loss": 0.0036, + "num_input_tokens_seen": 40240144, + "step": 20559 + }, + { + "epoch": 2.724983432736912, + "grad_norm": 0.07581186294555664, + "learning_rate": 1.0305338823950434e-07, + "loss": 0.0003, + "num_input_tokens_seen": 40241328, + "step": 20560 + }, + { + "epoch": 2.725115970841617, + "grad_norm": 6.682172775268555, + "learning_rate": 1.0295477088308193e-07, + "loss": 0.0393, + "num_input_tokens_seen": 40243152, + "step": 20561 + }, + { + "epoch": 2.725248508946322, + "grad_norm": 0.06568250805139542, + "learning_rate": 1.0285619974334526e-07, + "loss": 0.0003, + "num_input_tokens_seen": 40245304, + "step": 20562 + }, + { + "epoch": 2.725381047051027, + "grad_norm": 0.04685897380113602, + "learning_rate": 1.0275767482219506e-07, + "loss": 0.0005, + "num_input_tokens_seen": 40247744, + "step": 20563 + }, + { + "epoch": 2.725513585155732, + "grad_norm": 3.6662306785583496, + "learning_rate": 1.0265919612153064e-07, + "loss": 0.0531, + "num_input_tokens_seen": 40249472, + "step": 20564 + }, + { + "epoch": 2.7256461232604376, + "grad_norm": 0.060763824731111526, + "learning_rate": 1.0256076364325073e-07, + "loss": 0.0002, + "num_input_tokens_seen": 40251808, + "step": 20565 + }, + { + "epoch": 2.7257786613651422, + "grad_norm": 0.013884919695556164, + "learning_rate": 1.0246237738925302e-07, + "loss": 0.0, + "num_input_tokens_seen": 40253496, + "step": 20566 + }, + { + "epoch": 2.7259111994698477, + "grad_norm": 0.05732065066695213, + "learning_rate": 1.023640373614343e-07, + "loss": 0.0005, + "num_input_tokens_seen": 40255168, + "step": 20567 + }, + { + "epoch": 2.7260437375745528, + "grad_norm": 0.004776959307491779, + "learning_rate": 1.0226574356169167e-07, + "loss": 0.0, + "num_input_tokens_seen": 40257360, + "step": 20568 + }, + { + "epoch": 2.726176275679258, + "grad_norm": 11.196110725402832, + "learning_rate": 1.0216749599191944e-07, + "loss": 0.0613, + "num_input_tokens_seen": 40259720, + "step": 20569 + }, + { + "epoch": 2.726308813783963, + "grad_norm": 2.9785289764404297, + "learning_rate": 1.0206929465401222e-07, + "loss": 0.0132, + "num_input_tokens_seen": 40262408, + "step": 20570 + }, + { + "epoch": 2.726441351888668, + "grad_norm": 13.020023345947266, + "learning_rate": 1.0197113954986266e-07, + "loss": 0.1254, + "num_input_tokens_seen": 40264656, + "step": 20571 + }, + { + "epoch": 2.7265738899933734, + "grad_norm": 3.677790403366089, + "learning_rate": 1.0187303068136423e-07, + "loss": 0.021, + "num_input_tokens_seen": 40266432, + "step": 20572 + }, + { + "epoch": 2.726706428098078, + "grad_norm": 5.411695957183838, + "learning_rate": 1.0177496805040794e-07, + "loss": 0.0929, + "num_input_tokens_seen": 40268856, + "step": 20573 + }, + { + "epoch": 2.7268389662027834, + "grad_norm": 6.035541534423828, + "learning_rate": 1.0167695165888502e-07, + "loss": 0.0481, + "num_input_tokens_seen": 40271360, + "step": 20574 + }, + { + "epoch": 2.7269715043074885, + "grad_norm": 0.11402358114719391, + "learning_rate": 1.0157898150868512e-07, + "loss": 0.0007, + "num_input_tokens_seen": 40273064, + "step": 20575 + }, + { + "epoch": 2.7271040424121935, + "grad_norm": 17.241437911987305, + "learning_rate": 1.0148105760169669e-07, + "loss": 0.1222, + "num_input_tokens_seen": 40275128, + "step": 20576 + }, + { + "epoch": 2.7272365805168985, + "grad_norm": 10.276262283325195, + "learning_rate": 1.0138317993980823e-07, + "loss": 0.1347, + "num_input_tokens_seen": 40277168, + "step": 20577 + }, + { + "epoch": 2.7273691186216036, + "grad_norm": 2.896395206451416, + "learning_rate": 1.0128534852490685e-07, + "loss": 0.0249, + "num_input_tokens_seen": 40279104, + "step": 20578 + }, + { + "epoch": 2.727501656726309, + "grad_norm": 8.9425630569458, + "learning_rate": 1.0118756335887852e-07, + "loss": 0.2136, + "num_input_tokens_seen": 40281344, + "step": 20579 + }, + { + "epoch": 2.7276341948310137, + "grad_norm": 6.429861545562744, + "learning_rate": 1.0108982444360899e-07, + "loss": 0.0223, + "num_input_tokens_seen": 40283584, + "step": 20580 + }, + { + "epoch": 2.727766732935719, + "grad_norm": 2.63631010055542, + "learning_rate": 1.0099213178098228e-07, + "loss": 0.0156, + "num_input_tokens_seen": 40286416, + "step": 20581 + }, + { + "epoch": 2.727899271040424, + "grad_norm": 8.04603099822998, + "learning_rate": 1.0089448537288271e-07, + "loss": 0.0522, + "num_input_tokens_seen": 40288208, + "step": 20582 + }, + { + "epoch": 2.728031809145129, + "grad_norm": 1.5973155498504639, + "learning_rate": 1.007968852211924e-07, + "loss": 0.0134, + "num_input_tokens_seen": 40290464, + "step": 20583 + }, + { + "epoch": 2.7281643472498343, + "grad_norm": 5.981142520904541, + "learning_rate": 1.0069933132779347e-07, + "loss": 0.0949, + "num_input_tokens_seen": 40292616, + "step": 20584 + }, + { + "epoch": 2.7282968853545393, + "grad_norm": 0.012942900881171227, + "learning_rate": 1.0060182369456606e-07, + "loss": 0.0001, + "num_input_tokens_seen": 40294192, + "step": 20585 + }, + { + "epoch": 2.7284294234592448, + "grad_norm": 7.4312052726745605, + "learning_rate": 1.005043623233909e-07, + "loss": 0.0836, + "num_input_tokens_seen": 40296168, + "step": 20586 + }, + { + "epoch": 2.7285619615639494, + "grad_norm": 2.779864549636841, + "learning_rate": 1.004069472161473e-07, + "loss": 0.0154, + "num_input_tokens_seen": 40298040, + "step": 20587 + }, + { + "epoch": 2.728694499668655, + "grad_norm": 3.4366843700408936, + "learning_rate": 1.0030957837471294e-07, + "loss": 0.0202, + "num_input_tokens_seen": 40301424, + "step": 20588 + }, + { + "epoch": 2.72882703777336, + "grad_norm": 6.422103404998779, + "learning_rate": 1.002122558009655e-07, + "loss": 0.0117, + "num_input_tokens_seen": 40303032, + "step": 20589 + }, + { + "epoch": 2.728959575878065, + "grad_norm": 0.006842087022960186, + "learning_rate": 1.0011497949678096e-07, + "loss": 0.0, + "num_input_tokens_seen": 40305016, + "step": 20590 + }, + { + "epoch": 2.72909211398277, + "grad_norm": 0.023462209850549698, + "learning_rate": 1.0001774946403558e-07, + "loss": 0.0002, + "num_input_tokens_seen": 40306912, + "step": 20591 + }, + { + "epoch": 2.729224652087475, + "grad_norm": 1.7935649156570435, + "learning_rate": 9.992056570460345e-08, + "loss": 0.0077, + "num_input_tokens_seen": 40308384, + "step": 20592 + }, + { + "epoch": 2.7293571901921805, + "grad_norm": 5.512087821960449, + "learning_rate": 9.982342822035862e-08, + "loss": 0.0652, + "num_input_tokens_seen": 40310944, + "step": 20593 + }, + { + "epoch": 2.7294897282968855, + "grad_norm": 0.008757694624364376, + "learning_rate": 9.972633701317403e-08, + "loss": 0.0, + "num_input_tokens_seen": 40313744, + "step": 20594 + }, + { + "epoch": 2.7296222664015906, + "grad_norm": 9.620189666748047, + "learning_rate": 9.96292920849215e-08, + "loss": 0.0826, + "num_input_tokens_seen": 40315488, + "step": 20595 + }, + { + "epoch": 2.7297548045062956, + "grad_norm": 0.0023674629628658295, + "learning_rate": 9.953229343747179e-08, + "loss": 0.0, + "num_input_tokens_seen": 40316768, + "step": 20596 + }, + { + "epoch": 2.7298873426110006, + "grad_norm": 0.0042010038159787655, + "learning_rate": 9.943534107269559e-08, + "loss": 0.0, + "num_input_tokens_seen": 40319152, + "step": 20597 + }, + { + "epoch": 2.7300198807157057, + "grad_norm": 0.09010634571313858, + "learning_rate": 9.933843499246226e-08, + "loss": 0.0003, + "num_input_tokens_seen": 40321376, + "step": 20598 + }, + { + "epoch": 2.7301524188204107, + "grad_norm": 0.0031219448428601027, + "learning_rate": 9.924157519864003e-08, + "loss": 0.0, + "num_input_tokens_seen": 40323592, + "step": 20599 + }, + { + "epoch": 2.730284956925116, + "grad_norm": 0.002106622327119112, + "learning_rate": 9.914476169309655e-08, + "loss": 0.0, + "num_input_tokens_seen": 40324696, + "step": 20600 + }, + { + "epoch": 2.7304174950298212, + "grad_norm": 5.996222496032715, + "learning_rate": 9.904799447769813e-08, + "loss": 0.0206, + "num_input_tokens_seen": 40326336, + "step": 20601 + }, + { + "epoch": 2.7305500331345263, + "grad_norm": 0.03086123801767826, + "learning_rate": 9.89512735543105e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40329136, + "step": 20602 + }, + { + "epoch": 2.7306825712392313, + "grad_norm": 0.11009085178375244, + "learning_rate": 9.885459892479882e-08, + "loss": 0.0004, + "num_input_tokens_seen": 40330688, + "step": 20603 + }, + { + "epoch": 2.7308151093439363, + "grad_norm": 0.027499202638864517, + "learning_rate": 9.875797059102715e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40332768, + "step": 20604 + }, + { + "epoch": 2.7309476474486414, + "grad_norm": 2.2852234840393066, + "learning_rate": 9.866138855485819e-08, + "loss": 0.0065, + "num_input_tokens_seen": 40334464, + "step": 20605 + }, + { + "epoch": 2.7310801855533464, + "grad_norm": 9.260998725891113, + "learning_rate": 9.856485281815459e-08, + "loss": 0.1636, + "num_input_tokens_seen": 40336736, + "step": 20606 + }, + { + "epoch": 2.731212723658052, + "grad_norm": 0.0059330156072974205, + "learning_rate": 9.846836338277682e-08, + "loss": 0.0, + "num_input_tokens_seen": 40337744, + "step": 20607 + }, + { + "epoch": 2.731345261762757, + "grad_norm": 0.020890355110168457, + "learning_rate": 9.83719202505859e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40339264, + "step": 20608 + }, + { + "epoch": 2.731477799867462, + "grad_norm": 4.8022332191467285, + "learning_rate": 9.827552342344116e-08, + "loss": 0.0517, + "num_input_tokens_seen": 40341032, + "step": 20609 + }, + { + "epoch": 2.731610337972167, + "grad_norm": 6.902804851531982, + "learning_rate": 9.817917290320112e-08, + "loss": 0.0809, + "num_input_tokens_seen": 40343896, + "step": 20610 + }, + { + "epoch": 2.731742876076872, + "grad_norm": 0.04349290207028389, + "learning_rate": 9.808286869172373e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40345608, + "step": 20611 + }, + { + "epoch": 2.731875414181577, + "grad_norm": 3.818340539932251, + "learning_rate": 9.798661079086557e-08, + "loss": 0.0233, + "num_input_tokens_seen": 40347328, + "step": 20612 + }, + { + "epoch": 2.732007952286282, + "grad_norm": 0.024737080559134483, + "learning_rate": 9.789039920248266e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40349560, + "step": 20613 + }, + { + "epoch": 2.7321404903909876, + "grad_norm": 3.0820424556732178, + "learning_rate": 9.779423392842991e-08, + "loss": 0.008, + "num_input_tokens_seen": 40350664, + "step": 20614 + }, + { + "epoch": 2.7322730284956926, + "grad_norm": 8.274417877197266, + "learning_rate": 9.769811497056137e-08, + "loss": 0.1507, + "num_input_tokens_seen": 40352328, + "step": 20615 + }, + { + "epoch": 2.7324055666003977, + "grad_norm": 0.007910707965493202, + "learning_rate": 9.760204233073084e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40354336, + "step": 20616 + }, + { + "epoch": 2.7325381047051027, + "grad_norm": 0.0007721353904344141, + "learning_rate": 9.750601601078991e-08, + "loss": 0.0, + "num_input_tokens_seen": 40355704, + "step": 20617 + }, + { + "epoch": 2.7326706428098078, + "grad_norm": 1.7725874185562134, + "learning_rate": 9.741003601259041e-08, + "loss": 0.0289, + "num_input_tokens_seen": 40357560, + "step": 20618 + }, + { + "epoch": 2.732803180914513, + "grad_norm": 9.851848602294922, + "learning_rate": 9.73141023379831e-08, + "loss": 0.1228, + "num_input_tokens_seen": 40359760, + "step": 20619 + }, + { + "epoch": 2.732935719019218, + "grad_norm": 0.0025030917022377253, + "learning_rate": 9.721821498881706e-08, + "loss": 0.0, + "num_input_tokens_seen": 40361512, + "step": 20620 + }, + { + "epoch": 2.7330682571239233, + "grad_norm": 7.826012134552002, + "learning_rate": 9.712237396694191e-08, + "loss": 0.1075, + "num_input_tokens_seen": 40363112, + "step": 20621 + }, + { + "epoch": 2.7332007952286284, + "grad_norm": 0.012881021946668625, + "learning_rate": 9.702657927420478e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40364992, + "step": 20622 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 13.361252784729004, + "learning_rate": 9.693083091245281e-08, + "loss": 0.1845, + "num_input_tokens_seen": 40366976, + "step": 20623 + }, + { + "epoch": 2.7334658714380384, + "grad_norm": 16.721614837646484, + "learning_rate": 9.68351288835323e-08, + "loss": 0.2106, + "num_input_tokens_seen": 40369536, + "step": 20624 + }, + { + "epoch": 2.7335984095427435, + "grad_norm": 0.012255587615072727, + "learning_rate": 9.673947318928873e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40372208, + "step": 20625 + }, + { + "epoch": 2.7337309476474485, + "grad_norm": 9.030549049377441, + "learning_rate": 9.66438638315656e-08, + "loss": 0.1213, + "num_input_tokens_seen": 40375080, + "step": 20626 + }, + { + "epoch": 2.7338634857521535, + "grad_norm": 4.068104267120361, + "learning_rate": 9.6548300812207e-08, + "loss": 0.0208, + "num_input_tokens_seen": 40377256, + "step": 20627 + }, + { + "epoch": 2.733996023856859, + "grad_norm": 9.125537872314453, + "learning_rate": 9.645278413305481e-08, + "loss": 0.046, + "num_input_tokens_seen": 40379832, + "step": 20628 + }, + { + "epoch": 2.734128561961564, + "grad_norm": 1.1615562438964844, + "learning_rate": 9.635731379595143e-08, + "loss": 0.0054, + "num_input_tokens_seen": 40381904, + "step": 20629 + }, + { + "epoch": 2.734261100066269, + "grad_norm": 6.887634754180908, + "learning_rate": 9.626188980273732e-08, + "loss": 0.0542, + "num_input_tokens_seen": 40384024, + "step": 20630 + }, + { + "epoch": 2.734393638170974, + "grad_norm": 14.204608917236328, + "learning_rate": 9.616651215525213e-08, + "loss": 0.1652, + "num_input_tokens_seen": 40386184, + "step": 20631 + }, + { + "epoch": 2.734526176275679, + "grad_norm": 31.906185150146484, + "learning_rate": 9.607118085533468e-08, + "loss": 0.3946, + "num_input_tokens_seen": 40387960, + "step": 20632 + }, + { + "epoch": 2.734658714380384, + "grad_norm": 0.016838302835822105, + "learning_rate": 9.59758959048232e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40390000, + "step": 20633 + }, + { + "epoch": 2.7347912524850893, + "grad_norm": 7.177703857421875, + "learning_rate": 9.588065730555513e-08, + "loss": 0.122, + "num_input_tokens_seen": 40391960, + "step": 20634 + }, + { + "epoch": 2.7349237905897947, + "grad_norm": 12.038917541503906, + "learning_rate": 9.578546505936675e-08, + "loss": 0.0667, + "num_input_tokens_seen": 40394040, + "step": 20635 + }, + { + "epoch": 2.7350563286944998, + "grad_norm": 0.13613131642341614, + "learning_rate": 9.569031916809301e-08, + "loss": 0.0009, + "num_input_tokens_seen": 40395912, + "step": 20636 + }, + { + "epoch": 2.735188866799205, + "grad_norm": 0.00870995968580246, + "learning_rate": 9.559521963356855e-08, + "loss": 0.0, + "num_input_tokens_seen": 40398064, + "step": 20637 + }, + { + "epoch": 2.73532140490391, + "grad_norm": 0.036005210131406784, + "learning_rate": 9.55001664576266e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40399592, + "step": 20638 + }, + { + "epoch": 2.735453943008615, + "grad_norm": 0.003369431011378765, + "learning_rate": 9.540515964210045e-08, + "loss": 0.0, + "num_input_tokens_seen": 40401032, + "step": 20639 + }, + { + "epoch": 2.73558648111332, + "grad_norm": 5.880390167236328, + "learning_rate": 9.531019918882195e-08, + "loss": 0.0289, + "num_input_tokens_seen": 40403776, + "step": 20640 + }, + { + "epoch": 2.735719019218025, + "grad_norm": 0.0331774465739727, + "learning_rate": 9.521528509962158e-08, + "loss": 0.0003, + "num_input_tokens_seen": 40405744, + "step": 20641 + }, + { + "epoch": 2.7358515573227304, + "grad_norm": 0.015433989465236664, + "learning_rate": 9.512041737632955e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40407160, + "step": 20642 + }, + { + "epoch": 2.7359840954274355, + "grad_norm": 2.6368601322174072, + "learning_rate": 9.502559602077438e-08, + "loss": 0.0121, + "num_input_tokens_seen": 40409072, + "step": 20643 + }, + { + "epoch": 2.7361166335321405, + "grad_norm": 0.010278803296387196, + "learning_rate": 9.493082103478519e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40410888, + "step": 20644 + }, + { + "epoch": 2.7362491716368456, + "grad_norm": 0.029607700183987617, + "learning_rate": 9.48360924201891e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40412544, + "step": 20645 + }, + { + "epoch": 2.7363817097415506, + "grad_norm": 0.0024321903474628925, + "learning_rate": 9.474141017881245e-08, + "loss": 0.0, + "num_input_tokens_seen": 40414040, + "step": 20646 + }, + { + "epoch": 2.736514247846256, + "grad_norm": 0.001133545534685254, + "learning_rate": 9.464677431248042e-08, + "loss": 0.0, + "num_input_tokens_seen": 40415272, + "step": 20647 + }, + { + "epoch": 2.7366467859509607, + "grad_norm": 1.2533169984817505, + "learning_rate": 9.455218482301826e-08, + "loss": 0.0061, + "num_input_tokens_seen": 40417864, + "step": 20648 + }, + { + "epoch": 2.736779324055666, + "grad_norm": 17.05585289001465, + "learning_rate": 9.445764171224891e-08, + "loss": 0.1668, + "num_input_tokens_seen": 40419680, + "step": 20649 + }, + { + "epoch": 2.736911862160371, + "grad_norm": 4.985896587371826, + "learning_rate": 9.436314498199595e-08, + "loss": 0.0579, + "num_input_tokens_seen": 40422232, + "step": 20650 + }, + { + "epoch": 2.7370444002650762, + "grad_norm": 0.015136467292904854, + "learning_rate": 9.426869463408122e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40424080, + "step": 20651 + }, + { + "epoch": 2.7371769383697813, + "grad_norm": 0.4803467392921448, + "learning_rate": 9.41742906703258e-08, + "loss": 0.0007, + "num_input_tokens_seen": 40425128, + "step": 20652 + }, + { + "epoch": 2.7373094764744863, + "grad_norm": 7.718040466308594, + "learning_rate": 9.407993309254959e-08, + "loss": 0.1486, + "num_input_tokens_seen": 40426896, + "step": 20653 + }, + { + "epoch": 2.737442014579192, + "grad_norm": 6.133661270141602, + "learning_rate": 9.398562190257199e-08, + "loss": 0.0403, + "num_input_tokens_seen": 40428776, + "step": 20654 + }, + { + "epoch": 2.7375745526838964, + "grad_norm": 0.003038499504327774, + "learning_rate": 9.389135710221153e-08, + "loss": 0.0, + "num_input_tokens_seen": 40430304, + "step": 20655 + }, + { + "epoch": 2.737707090788602, + "grad_norm": 8.800638198852539, + "learning_rate": 9.379713869328538e-08, + "loss": 0.0914, + "num_input_tokens_seen": 40431880, + "step": 20656 + }, + { + "epoch": 2.737839628893307, + "grad_norm": 0.7635812163352966, + "learning_rate": 9.37029666776107e-08, + "loss": 0.0013, + "num_input_tokens_seen": 40433616, + "step": 20657 + }, + { + "epoch": 2.737972166998012, + "grad_norm": 1.2712606191635132, + "learning_rate": 9.360884105700269e-08, + "loss": 0.0106, + "num_input_tokens_seen": 40435472, + "step": 20658 + }, + { + "epoch": 2.738104705102717, + "grad_norm": 0.5395106077194214, + "learning_rate": 9.351476183327601e-08, + "loss": 0.0012, + "num_input_tokens_seen": 40437456, + "step": 20659 + }, + { + "epoch": 2.738237243207422, + "grad_norm": 1.3326441049575806, + "learning_rate": 9.342072900824533e-08, + "loss": 0.007, + "num_input_tokens_seen": 40439872, + "step": 20660 + }, + { + "epoch": 2.7383697813121275, + "grad_norm": 5.483769416809082, + "learning_rate": 9.33267425837231e-08, + "loss": 0.1187, + "num_input_tokens_seen": 40442224, + "step": 20661 + }, + { + "epoch": 2.738502319416832, + "grad_norm": 0.00310563575476408, + "learning_rate": 9.323280256152118e-08, + "loss": 0.0, + "num_input_tokens_seen": 40444128, + "step": 20662 + }, + { + "epoch": 2.7386348575215376, + "grad_norm": 0.024852773174643517, + "learning_rate": 9.313890894345173e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40445720, + "step": 20663 + }, + { + "epoch": 2.7387673956262426, + "grad_norm": 10.2554931640625, + "learning_rate": 9.304506173132416e-08, + "loss": 0.2218, + "num_input_tokens_seen": 40448440, + "step": 20664 + }, + { + "epoch": 2.7388999337309476, + "grad_norm": 4.317249298095703, + "learning_rate": 9.295126092694839e-08, + "loss": 0.0261, + "num_input_tokens_seen": 40449984, + "step": 20665 + }, + { + "epoch": 2.7390324718356527, + "grad_norm": 6.298765659332275, + "learning_rate": 9.2857506532133e-08, + "loss": 0.0526, + "num_input_tokens_seen": 40452104, + "step": 20666 + }, + { + "epoch": 2.7391650099403577, + "grad_norm": 0.005699994508177042, + "learning_rate": 9.27637985486854e-08, + "loss": 0.0, + "num_input_tokens_seen": 40454424, + "step": 20667 + }, + { + "epoch": 2.739297548045063, + "grad_norm": 0.1396913379430771, + "learning_rate": 9.267013697841193e-08, + "loss": 0.0009, + "num_input_tokens_seen": 40457312, + "step": 20668 + }, + { + "epoch": 2.739430086149768, + "grad_norm": 0.010176203213632107, + "learning_rate": 9.257652182311921e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40460192, + "step": 20669 + }, + { + "epoch": 2.7395626242544733, + "grad_norm": 0.4269968867301941, + "learning_rate": 9.24829530846122e-08, + "loss": 0.0039, + "num_input_tokens_seen": 40462104, + "step": 20670 + }, + { + "epoch": 2.7396951623591783, + "grad_norm": 7.088038921356201, + "learning_rate": 9.238943076469469e-08, + "loss": 0.0626, + "num_input_tokens_seen": 40463376, + "step": 20671 + }, + { + "epoch": 2.7398277004638834, + "grad_norm": 5.937478065490723, + "learning_rate": 9.229595486517001e-08, + "loss": 0.0773, + "num_input_tokens_seen": 40465584, + "step": 20672 + }, + { + "epoch": 2.7399602385685884, + "grad_norm": 6.387305736541748, + "learning_rate": 9.220252538784002e-08, + "loss": 0.0335, + "num_input_tokens_seen": 40467264, + "step": 20673 + }, + { + "epoch": 2.7400927766732934, + "grad_norm": 0.07741384953260422, + "learning_rate": 9.210914233450635e-08, + "loss": 0.0004, + "num_input_tokens_seen": 40469080, + "step": 20674 + }, + { + "epoch": 2.740225314777999, + "grad_norm": 2.6026041507720947, + "learning_rate": 9.20158057069695e-08, + "loss": 0.0089, + "num_input_tokens_seen": 40470976, + "step": 20675 + }, + { + "epoch": 2.7403578528827035, + "grad_norm": 2.2284111976623535, + "learning_rate": 9.19225155070294e-08, + "loss": 0.0202, + "num_input_tokens_seen": 40474480, + "step": 20676 + }, + { + "epoch": 2.740490390987409, + "grad_norm": 0.017774587497115135, + "learning_rate": 9.182927173648437e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40476296, + "step": 20677 + }, + { + "epoch": 2.740622929092114, + "grad_norm": 0.17431522905826569, + "learning_rate": 9.173607439713239e-08, + "loss": 0.0008, + "num_input_tokens_seen": 40479224, + "step": 20678 + }, + { + "epoch": 2.740755467196819, + "grad_norm": 6.192979335784912, + "learning_rate": 9.164292349077009e-08, + "loss": 0.0786, + "num_input_tokens_seen": 40481104, + "step": 20679 + }, + { + "epoch": 2.740888005301524, + "grad_norm": 1.1193798780441284, + "learning_rate": 9.154981901919324e-08, + "loss": 0.0033, + "num_input_tokens_seen": 40484024, + "step": 20680 + }, + { + "epoch": 2.741020543406229, + "grad_norm": 18.16061782836914, + "learning_rate": 9.145676098419793e-08, + "loss": 0.0539, + "num_input_tokens_seen": 40486408, + "step": 20681 + }, + { + "epoch": 2.7411530815109346, + "grad_norm": 5.657936096191406, + "learning_rate": 9.136374938757798e-08, + "loss": 0.0112, + "num_input_tokens_seen": 40488936, + "step": 20682 + }, + { + "epoch": 2.7412856196156397, + "grad_norm": 0.3479876220226288, + "learning_rate": 9.12707842311264e-08, + "loss": 0.0024, + "num_input_tokens_seen": 40491696, + "step": 20683 + }, + { + "epoch": 2.7414181577203447, + "grad_norm": 12.141497611999512, + "learning_rate": 9.117786551663594e-08, + "loss": 0.1574, + "num_input_tokens_seen": 40494320, + "step": 20684 + }, + { + "epoch": 2.7415506958250497, + "grad_norm": 0.02492293156683445, + "learning_rate": 9.108499324589765e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40495864, + "step": 20685 + }, + { + "epoch": 2.7416832339297548, + "grad_norm": 0.039177246391773224, + "learning_rate": 9.099216742070232e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40497056, + "step": 20686 + }, + { + "epoch": 2.74181577203446, + "grad_norm": 0.01595601998269558, + "learning_rate": 9.08993880428402e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40498632, + "step": 20687 + }, + { + "epoch": 2.741948310139165, + "grad_norm": 0.010404431261122227, + "learning_rate": 9.080665511410014e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40500528, + "step": 20688 + }, + { + "epoch": 2.7420808482438703, + "grad_norm": 0.025889527052640915, + "learning_rate": 9.071396863626958e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40502856, + "step": 20689 + }, + { + "epoch": 2.7422133863485754, + "grad_norm": 14.627632141113281, + "learning_rate": 9.062132861113542e-08, + "loss": 0.2475, + "num_input_tokens_seen": 40505872, + "step": 20690 + }, + { + "epoch": 2.7423459244532804, + "grad_norm": 4.508406162261963, + "learning_rate": 9.05287350404846e-08, + "loss": 0.0306, + "num_input_tokens_seen": 40507808, + "step": 20691 + }, + { + "epoch": 2.7424784625579854, + "grad_norm": 0.0013602660037577152, + "learning_rate": 9.043618792610176e-08, + "loss": 0.0, + "num_input_tokens_seen": 40509288, + "step": 20692 + }, + { + "epoch": 2.7426110006626905, + "grad_norm": 3.037480115890503, + "learning_rate": 9.034368726977161e-08, + "loss": 0.0234, + "num_input_tokens_seen": 40511576, + "step": 20693 + }, + { + "epoch": 2.7427435387673955, + "grad_norm": 5.327826499938965, + "learning_rate": 9.025123307327771e-08, + "loss": 0.0992, + "num_input_tokens_seen": 40513664, + "step": 20694 + }, + { + "epoch": 2.7428760768721006, + "grad_norm": 17.871667861938477, + "learning_rate": 9.015882533840198e-08, + "loss": 0.3611, + "num_input_tokens_seen": 40516288, + "step": 20695 + }, + { + "epoch": 2.743008614976806, + "grad_norm": 11.369023323059082, + "learning_rate": 9.006646406692688e-08, + "loss": 0.3064, + "num_input_tokens_seen": 40517432, + "step": 20696 + }, + { + "epoch": 2.743141153081511, + "grad_norm": 18.61103630065918, + "learning_rate": 8.997414926063264e-08, + "loss": 0.0495, + "num_input_tokens_seen": 40520264, + "step": 20697 + }, + { + "epoch": 2.743273691186216, + "grad_norm": 0.0007582338294014335, + "learning_rate": 8.988188092129951e-08, + "loss": 0.0, + "num_input_tokens_seen": 40521200, + "step": 20698 + }, + { + "epoch": 2.743406229290921, + "grad_norm": 0.023078473284840584, + "learning_rate": 8.978965905070664e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40522544, + "step": 20699 + }, + { + "epoch": 2.743538767395626, + "grad_norm": 0.019972823560237885, + "learning_rate": 8.969748365063146e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40523696, + "step": 20700 + }, + { + "epoch": 2.7436713055003312, + "grad_norm": 0.027034739032387733, + "learning_rate": 8.960535472285175e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40526896, + "step": 20701 + }, + { + "epoch": 2.7438038436050363, + "grad_norm": 0.005604673642665148, + "learning_rate": 8.951327226914386e-08, + "loss": 0.0, + "num_input_tokens_seen": 40528408, + "step": 20702 + }, + { + "epoch": 2.7439363817097417, + "grad_norm": 8.542337417602539, + "learning_rate": 8.942123629128247e-08, + "loss": 0.0557, + "num_input_tokens_seen": 40530856, + "step": 20703 + }, + { + "epoch": 2.744068919814447, + "grad_norm": 8.739341735839844, + "learning_rate": 8.93292467910431e-08, + "loss": 0.1558, + "num_input_tokens_seen": 40533240, + "step": 20704 + }, + { + "epoch": 2.744201457919152, + "grad_norm": 8.053001403808594, + "learning_rate": 8.92373037701988e-08, + "loss": 0.0333, + "num_input_tokens_seen": 40535304, + "step": 20705 + }, + { + "epoch": 2.744333996023857, + "grad_norm": 0.0044760992750525475, + "learning_rate": 8.9145407230522e-08, + "loss": 0.0, + "num_input_tokens_seen": 40537360, + "step": 20706 + }, + { + "epoch": 2.744466534128562, + "grad_norm": 3.009662389755249, + "learning_rate": 8.905355717378523e-08, + "loss": 0.0062, + "num_input_tokens_seen": 40539520, + "step": 20707 + }, + { + "epoch": 2.744599072233267, + "grad_norm": 12.790186882019043, + "learning_rate": 8.896175360175923e-08, + "loss": 0.0328, + "num_input_tokens_seen": 40541096, + "step": 20708 + }, + { + "epoch": 2.744731610337972, + "grad_norm": 0.020984278991818428, + "learning_rate": 8.886999651621347e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40542824, + "step": 20709 + }, + { + "epoch": 2.7448641484426775, + "grad_norm": 1.5154893398284912, + "learning_rate": 8.877828591891819e-08, + "loss": 0.0093, + "num_input_tokens_seen": 40544512, + "step": 20710 + }, + { + "epoch": 2.7449966865473825, + "grad_norm": 8.19878101348877, + "learning_rate": 8.868662181164028e-08, + "loss": 0.0731, + "num_input_tokens_seen": 40546024, + "step": 20711 + }, + { + "epoch": 2.7451292246520875, + "grad_norm": 0.02542228065431118, + "learning_rate": 8.859500419614808e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40547528, + "step": 20712 + }, + { + "epoch": 2.7452617627567926, + "grad_norm": 0.8276879787445068, + "learning_rate": 8.850343307420794e-08, + "loss": 0.0028, + "num_input_tokens_seen": 40549280, + "step": 20713 + }, + { + "epoch": 2.7453943008614976, + "grad_norm": 6.590904235839844, + "learning_rate": 8.841190844758512e-08, + "loss": 0.1788, + "num_input_tokens_seen": 40550912, + "step": 20714 + }, + { + "epoch": 2.7455268389662026, + "grad_norm": 12.48646354675293, + "learning_rate": 8.832043031804405e-08, + "loss": 0.0503, + "num_input_tokens_seen": 40553280, + "step": 20715 + }, + { + "epoch": 2.7456593770709077, + "grad_norm": 0.0037856181152164936, + "learning_rate": 8.822899868734885e-08, + "loss": 0.0, + "num_input_tokens_seen": 40554576, + "step": 20716 + }, + { + "epoch": 2.745791915175613, + "grad_norm": 0.001498855766840279, + "learning_rate": 8.813761355726286e-08, + "loss": 0.0, + "num_input_tokens_seen": 40556120, + "step": 20717 + }, + { + "epoch": 2.745924453280318, + "grad_norm": 0.00438922131434083, + "learning_rate": 8.804627492954742e-08, + "loss": 0.0, + "num_input_tokens_seen": 40558056, + "step": 20718 + }, + { + "epoch": 2.7460569913850232, + "grad_norm": 6.076627254486084, + "learning_rate": 8.795498280596337e-08, + "loss": 0.0759, + "num_input_tokens_seen": 40559960, + "step": 20719 + }, + { + "epoch": 2.7461895294897283, + "grad_norm": 0.14451400935649872, + "learning_rate": 8.786373718827152e-08, + "loss": 0.0006, + "num_input_tokens_seen": 40562560, + "step": 20720 + }, + { + "epoch": 2.7463220675944333, + "grad_norm": 0.9155290722846985, + "learning_rate": 8.777253807823071e-08, + "loss": 0.005, + "num_input_tokens_seen": 40564296, + "step": 20721 + }, + { + "epoch": 2.7464546056991384, + "grad_norm": 0.08196693658828735, + "learning_rate": 8.76813854775993e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40565544, + "step": 20722 + }, + { + "epoch": 2.7465871438038434, + "grad_norm": 0.009713605046272278, + "learning_rate": 8.75902793881353e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40567576, + "step": 20723 + }, + { + "epoch": 2.746719681908549, + "grad_norm": 3.408336639404297, + "learning_rate": 8.749921981159482e-08, + "loss": 0.0067, + "num_input_tokens_seen": 40570616, + "step": 20724 + }, + { + "epoch": 2.746852220013254, + "grad_norm": 5.829514503479004, + "learning_rate": 8.740820674973393e-08, + "loss": 0.0341, + "num_input_tokens_seen": 40572312, + "step": 20725 + }, + { + "epoch": 2.746984758117959, + "grad_norm": 0.01758240908384323, + "learning_rate": 8.731724020430654e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40574064, + "step": 20726 + }, + { + "epoch": 2.747117296222664, + "grad_norm": 8.522876739501953, + "learning_rate": 8.722632017706762e-08, + "loss": 0.0445, + "num_input_tokens_seen": 40575664, + "step": 20727 + }, + { + "epoch": 2.747249834327369, + "grad_norm": 6.124189853668213, + "learning_rate": 8.713544666976965e-08, + "loss": 0.1465, + "num_input_tokens_seen": 40577464, + "step": 20728 + }, + { + "epoch": 2.747382372432074, + "grad_norm": 8.647217750549316, + "learning_rate": 8.704461968416511e-08, + "loss": 0.13, + "num_input_tokens_seen": 40578904, + "step": 20729 + }, + { + "epoch": 2.747514910536779, + "grad_norm": 7.488789081573486, + "learning_rate": 8.695383922200457e-08, + "loss": 0.014, + "num_input_tokens_seen": 40580384, + "step": 20730 + }, + { + "epoch": 2.7476474486414846, + "grad_norm": 0.019429726526141167, + "learning_rate": 8.686310528503883e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40581840, + "step": 20731 + }, + { + "epoch": 2.7477799867461896, + "grad_norm": 0.6497260928153992, + "learning_rate": 8.677241787501706e-08, + "loss": 0.0048, + "num_input_tokens_seen": 40583824, + "step": 20732 + }, + { + "epoch": 2.7479125248508947, + "grad_norm": 0.7497528195381165, + "learning_rate": 8.668177699368757e-08, + "loss": 0.0013, + "num_input_tokens_seen": 40586168, + "step": 20733 + }, + { + "epoch": 2.7480450629555997, + "grad_norm": 5.29608678817749, + "learning_rate": 8.659118264279898e-08, + "loss": 0.0684, + "num_input_tokens_seen": 40588080, + "step": 20734 + }, + { + "epoch": 2.7481776010603047, + "grad_norm": 12.830812454223633, + "learning_rate": 8.650063482409681e-08, + "loss": 0.1249, + "num_input_tokens_seen": 40591136, + "step": 20735 + }, + { + "epoch": 2.74831013916501, + "grad_norm": 2.3125052452087402, + "learning_rate": 8.641013353932775e-08, + "loss": 0.0184, + "num_input_tokens_seen": 40593960, + "step": 20736 + }, + { + "epoch": 2.748442677269715, + "grad_norm": 1.837138056755066, + "learning_rate": 8.631967879023595e-08, + "loss": 0.0126, + "num_input_tokens_seen": 40595696, + "step": 20737 + }, + { + "epoch": 2.7485752153744203, + "grad_norm": 0.24660219252109528, + "learning_rate": 8.622927057856612e-08, + "loss": 0.0006, + "num_input_tokens_seen": 40597528, + "step": 20738 + }, + { + "epoch": 2.7487077534791253, + "grad_norm": 0.004943624138832092, + "learning_rate": 8.613890890606103e-08, + "loss": 0.0, + "num_input_tokens_seen": 40600872, + "step": 20739 + }, + { + "epoch": 2.7488402915838304, + "grad_norm": 0.009159441106021404, + "learning_rate": 8.604859377446317e-08, + "loss": 0.0, + "num_input_tokens_seen": 40603424, + "step": 20740 + }, + { + "epoch": 2.7489728296885354, + "grad_norm": 0.005963070783764124, + "learning_rate": 8.595832518551367e-08, + "loss": 0.0, + "num_input_tokens_seen": 40605080, + "step": 20741 + }, + { + "epoch": 2.7491053677932404, + "grad_norm": 1.7647782564163208, + "learning_rate": 8.586810314095278e-08, + "loss": 0.0078, + "num_input_tokens_seen": 40607216, + "step": 20742 + }, + { + "epoch": 2.749237905897946, + "grad_norm": 16.16608238220215, + "learning_rate": 8.577792764252052e-08, + "loss": 0.1861, + "num_input_tokens_seen": 40609232, + "step": 20743 + }, + { + "epoch": 2.7493704440026505, + "grad_norm": 2.951291084289551, + "learning_rate": 8.56877986919552e-08, + "loss": 0.0676, + "num_input_tokens_seen": 40611312, + "step": 20744 + }, + { + "epoch": 2.749502982107356, + "grad_norm": 4.2243452072143555, + "learning_rate": 8.559771629099462e-08, + "loss": 0.0249, + "num_input_tokens_seen": 40613688, + "step": 20745 + }, + { + "epoch": 2.749635520212061, + "grad_norm": 0.018397625535726547, + "learning_rate": 8.550768044137597e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40615160, + "step": 20746 + }, + { + "epoch": 2.749768058316766, + "grad_norm": 0.012802147306501865, + "learning_rate": 8.541769114483456e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40616672, + "step": 20747 + }, + { + "epoch": 2.749900596421471, + "grad_norm": 7.6945905685424805, + "learning_rate": 8.532774840310592e-08, + "loss": 0.0935, + "num_input_tokens_seen": 40618640, + "step": 20748 + }, + { + "epoch": 2.750033134526176, + "grad_norm": 0.01403286587446928, + "learning_rate": 8.523785221792424e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40620544, + "step": 20749 + }, + { + "epoch": 2.7501656726308816, + "grad_norm": 17.7221622467041, + "learning_rate": 8.514800259102229e-08, + "loss": 0.1382, + "num_input_tokens_seen": 40622824, + "step": 20750 + }, + { + "epoch": 2.7502982107355862, + "grad_norm": 21.403738021850586, + "learning_rate": 8.505819952413314e-08, + "loss": 0.1369, + "num_input_tokens_seen": 40624528, + "step": 20751 + }, + { + "epoch": 2.7504307488402917, + "grad_norm": 0.014931654557585716, + "learning_rate": 8.49684430189876e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40625824, + "step": 20752 + }, + { + "epoch": 2.7505632869449967, + "grad_norm": 0.003229240421205759, + "learning_rate": 8.487873307731681e-08, + "loss": 0.0, + "num_input_tokens_seen": 40626872, + "step": 20753 + }, + { + "epoch": 2.750695825049702, + "grad_norm": 4.609385967254639, + "learning_rate": 8.478906970084994e-08, + "loss": 0.0191, + "num_input_tokens_seen": 40628712, + "step": 20754 + }, + { + "epoch": 2.750828363154407, + "grad_norm": 4.673152446746826, + "learning_rate": 8.469945289131614e-08, + "loss": 0.0149, + "num_input_tokens_seen": 40631864, + "step": 20755 + }, + { + "epoch": 2.750960901259112, + "grad_norm": 1.2935545444488525, + "learning_rate": 8.460988265044295e-08, + "loss": 0.0198, + "num_input_tokens_seen": 40633184, + "step": 20756 + }, + { + "epoch": 2.7510934393638173, + "grad_norm": 2.5498416423797607, + "learning_rate": 8.45203589799573e-08, + "loss": 0.008, + "num_input_tokens_seen": 40635072, + "step": 20757 + }, + { + "epoch": 2.751225977468522, + "grad_norm": 0.16308946907520294, + "learning_rate": 8.443088188158588e-08, + "loss": 0.0009, + "num_input_tokens_seen": 40636576, + "step": 20758 + }, + { + "epoch": 2.7513585155732274, + "grad_norm": 0.0250396765768528, + "learning_rate": 8.434145135705341e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40639032, + "step": 20759 + }, + { + "epoch": 2.7514910536779325, + "grad_norm": 0.04512974992394447, + "learning_rate": 8.425206740808434e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40640568, + "step": 20760 + }, + { + "epoch": 2.7516235917826375, + "grad_norm": 3.204676628112793, + "learning_rate": 8.416273003640202e-08, + "loss": 0.074, + "num_input_tokens_seen": 40642696, + "step": 20761 + }, + { + "epoch": 2.7517561298873425, + "grad_norm": 0.014112602919340134, + "learning_rate": 8.40734392437284e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40643856, + "step": 20762 + }, + { + "epoch": 2.7518886679920476, + "grad_norm": 0.1166965439915657, + "learning_rate": 8.398419503178545e-08, + "loss": 0.0005, + "num_input_tokens_seen": 40645616, + "step": 20763 + }, + { + "epoch": 2.752021206096753, + "grad_norm": 4.574771881103516, + "learning_rate": 8.389499740229457e-08, + "loss": 0.0248, + "num_input_tokens_seen": 40647272, + "step": 20764 + }, + { + "epoch": 2.752153744201458, + "grad_norm": 0.012392591685056686, + "learning_rate": 8.380584635697464e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40649704, + "step": 20765 + }, + { + "epoch": 2.752286282306163, + "grad_norm": 3.191887855529785, + "learning_rate": 8.371674189754487e-08, + "loss": 0.0184, + "num_input_tokens_seen": 40651872, + "step": 20766 + }, + { + "epoch": 2.752418820410868, + "grad_norm": 0.11252931505441666, + "learning_rate": 8.36276840257233e-08, + "loss": 0.0008, + "num_input_tokens_seen": 40655256, + "step": 20767 + }, + { + "epoch": 2.752551358515573, + "grad_norm": 0.26427769660949707, + "learning_rate": 8.353867274322663e-08, + "loss": 0.0007, + "num_input_tokens_seen": 40656944, + "step": 20768 + }, + { + "epoch": 2.7526838966202782, + "grad_norm": 0.020294105634093285, + "learning_rate": 8.344970805177127e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40658368, + "step": 20769 + }, + { + "epoch": 2.7528164347249833, + "grad_norm": 0.08789995312690735, + "learning_rate": 8.336078995307306e-08, + "loss": 0.0004, + "num_input_tokens_seen": 40660112, + "step": 20770 + }, + { + "epoch": 2.7529489728296888, + "grad_norm": 0.1142592504620552, + "learning_rate": 8.327191844884619e-08, + "loss": 0.0007, + "num_input_tokens_seen": 40661320, + "step": 20771 + }, + { + "epoch": 2.753081510934394, + "grad_norm": 0.009458131156861782, + "learning_rate": 8.318309354080345e-08, + "loss": 0.0, + "num_input_tokens_seen": 40662976, + "step": 20772 + }, + { + "epoch": 2.753214049039099, + "grad_norm": 0.09801451116800308, + "learning_rate": 8.309431523065792e-08, + "loss": 0.0006, + "num_input_tokens_seen": 40665720, + "step": 20773 + }, + { + "epoch": 2.753346587143804, + "grad_norm": 0.7977538108825684, + "learning_rate": 8.300558352012183e-08, + "loss": 0.0019, + "num_input_tokens_seen": 40667968, + "step": 20774 + }, + { + "epoch": 2.753479125248509, + "grad_norm": 3.537339687347412, + "learning_rate": 8.291689841090495e-08, + "loss": 0.0222, + "num_input_tokens_seen": 40670184, + "step": 20775 + }, + { + "epoch": 2.753611663353214, + "grad_norm": 3.0308964252471924, + "learning_rate": 8.28282599047181e-08, + "loss": 0.0219, + "num_input_tokens_seen": 40672376, + "step": 20776 + }, + { + "epoch": 2.753744201457919, + "grad_norm": 0.06346801668405533, + "learning_rate": 8.273966800326965e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40674384, + "step": 20777 + }, + { + "epoch": 2.7538767395626245, + "grad_norm": 17.348079681396484, + "learning_rate": 8.265112270826797e-08, + "loss": 0.0901, + "num_input_tokens_seen": 40676456, + "step": 20778 + }, + { + "epoch": 2.7540092776673295, + "grad_norm": 7.964555263519287, + "learning_rate": 8.256262402142057e-08, + "loss": 0.086, + "num_input_tokens_seen": 40678360, + "step": 20779 + }, + { + "epoch": 2.7541418157720345, + "grad_norm": 0.023211030289530754, + "learning_rate": 8.24741719444333e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40680016, + "step": 20780 + }, + { + "epoch": 2.7542743538767396, + "grad_norm": 5.3977131843566895, + "learning_rate": 8.238576647901175e-08, + "loss": 0.0192, + "num_input_tokens_seen": 40681856, + "step": 20781 + }, + { + "epoch": 2.7544068919814446, + "grad_norm": 0.8784810304641724, + "learning_rate": 8.229740762686068e-08, + "loss": 0.0022, + "num_input_tokens_seen": 40683488, + "step": 20782 + }, + { + "epoch": 2.7545394300861497, + "grad_norm": 0.048423800617456436, + "learning_rate": 8.220909538968314e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40686384, + "step": 20783 + }, + { + "epoch": 2.7546719681908547, + "grad_norm": 6.812203407287598, + "learning_rate": 8.212082976918223e-08, + "loss": 0.0789, + "num_input_tokens_seen": 40688672, + "step": 20784 + }, + { + "epoch": 2.75480450629556, + "grad_norm": 0.0023199389688670635, + "learning_rate": 8.203261076705993e-08, + "loss": 0.0, + "num_input_tokens_seen": 40690200, + "step": 20785 + }, + { + "epoch": 2.754937044400265, + "grad_norm": 4.256412982940674, + "learning_rate": 8.194443838501681e-08, + "loss": 0.0419, + "num_input_tokens_seen": 40692496, + "step": 20786 + }, + { + "epoch": 2.7550695825049702, + "grad_norm": 0.0037354445084929466, + "learning_rate": 8.18563126247532e-08, + "loss": 0.0, + "num_input_tokens_seen": 40693968, + "step": 20787 + }, + { + "epoch": 2.7552021206096753, + "grad_norm": 7.825641632080078, + "learning_rate": 8.176823348796825e-08, + "loss": 0.1034, + "num_input_tokens_seen": 40695640, + "step": 20788 + }, + { + "epoch": 2.7553346587143803, + "grad_norm": 0.028580544516444206, + "learning_rate": 8.168020097635954e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40697176, + "step": 20789 + }, + { + "epoch": 2.7554671968190854, + "grad_norm": 0.24258102476596832, + "learning_rate": 8.159221509162512e-08, + "loss": 0.0014, + "num_input_tokens_seen": 40698808, + "step": 20790 + }, + { + "epoch": 2.7555997349237904, + "grad_norm": 12.722267150878906, + "learning_rate": 8.150427583546117e-08, + "loss": 0.0787, + "num_input_tokens_seen": 40700824, + "step": 20791 + }, + { + "epoch": 2.755732273028496, + "grad_norm": 3.0444629192352295, + "learning_rate": 8.141638320956297e-08, + "loss": 0.005, + "num_input_tokens_seen": 40702904, + "step": 20792 + }, + { + "epoch": 2.755864811133201, + "grad_norm": 11.818035125732422, + "learning_rate": 8.132853721562583e-08, + "loss": 0.1787, + "num_input_tokens_seen": 40704664, + "step": 20793 + }, + { + "epoch": 2.755997349237906, + "grad_norm": 3.7424509525299072, + "learning_rate": 8.124073785534258e-08, + "loss": 0.0346, + "num_input_tokens_seen": 40706432, + "step": 20794 + }, + { + "epoch": 2.756129887342611, + "grad_norm": 3.7779476642608643, + "learning_rate": 8.115298513040687e-08, + "loss": 0.0131, + "num_input_tokens_seen": 40708536, + "step": 20795 + }, + { + "epoch": 2.756262425447316, + "grad_norm": 0.0285557359457016, + "learning_rate": 8.10652790425101e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40710544, + "step": 20796 + }, + { + "epoch": 2.756394963552021, + "grad_norm": 0.7302746176719666, + "learning_rate": 8.097761959334371e-08, + "loss": 0.0097, + "num_input_tokens_seen": 40712112, + "step": 20797 + }, + { + "epoch": 2.756527501656726, + "grad_norm": 0.010622425004839897, + "learning_rate": 8.089000678459746e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40715464, + "step": 20798 + }, + { + "epoch": 2.7566600397614316, + "grad_norm": 5.550331115722656, + "learning_rate": 8.080244061796056e-08, + "loss": 0.0592, + "num_input_tokens_seen": 40717136, + "step": 20799 + }, + { + "epoch": 2.7567925778661366, + "grad_norm": 0.015374637208878994, + "learning_rate": 8.071492109512164e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40719128, + "step": 20800 + }, + { + "epoch": 2.7569251159708417, + "grad_norm": 0.4079461991786957, + "learning_rate": 8.062744821776825e-08, + "loss": 0.002, + "num_input_tokens_seen": 40721336, + "step": 20801 + }, + { + "epoch": 2.7570576540755467, + "grad_norm": 5.408421039581299, + "learning_rate": 8.054002198758654e-08, + "loss": 0.0664, + "num_input_tokens_seen": 40724096, + "step": 20802 + }, + { + "epoch": 2.7571901921802517, + "grad_norm": 31.234981536865234, + "learning_rate": 8.04526424062621e-08, + "loss": 0.4707, + "num_input_tokens_seen": 40726152, + "step": 20803 + }, + { + "epoch": 2.757322730284957, + "grad_norm": 5.949959754943848, + "learning_rate": 8.036530947547999e-08, + "loss": 0.0693, + "num_input_tokens_seen": 40728472, + "step": 20804 + }, + { + "epoch": 2.757455268389662, + "grad_norm": 0.0016995753394439816, + "learning_rate": 8.027802319692413e-08, + "loss": 0.0, + "num_input_tokens_seen": 40730560, + "step": 20805 + }, + { + "epoch": 2.7575878064943673, + "grad_norm": 12.175122261047363, + "learning_rate": 8.019078357227734e-08, + "loss": 0.2992, + "num_input_tokens_seen": 40731576, + "step": 20806 + }, + { + "epoch": 2.7577203445990723, + "grad_norm": 24.102895736694336, + "learning_rate": 8.010359060322131e-08, + "loss": 0.3805, + "num_input_tokens_seen": 40733928, + "step": 20807 + }, + { + "epoch": 2.7578528827037774, + "grad_norm": 0.014085857197642326, + "learning_rate": 8.00164442914378e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40735656, + "step": 20808 + }, + { + "epoch": 2.7579854208084824, + "grad_norm": 0.03189467266201973, + "learning_rate": 7.992934463860624e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40737688, + "step": 20809 + }, + { + "epoch": 2.7581179589131875, + "grad_norm": 2.0268285274505615, + "learning_rate": 7.984229164640645e-08, + "loss": 0.0243, + "num_input_tokens_seen": 40740064, + "step": 20810 + }, + { + "epoch": 2.7582504970178925, + "grad_norm": 0.013389559462666512, + "learning_rate": 7.975528531651705e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40742416, + "step": 20811 + }, + { + "epoch": 2.7583830351225975, + "grad_norm": 0.28944456577301025, + "learning_rate": 7.966832565061561e-08, + "loss": 0.0013, + "num_input_tokens_seen": 40744248, + "step": 20812 + }, + { + "epoch": 2.758515573227303, + "grad_norm": 1.5062832832336426, + "learning_rate": 7.958141265037828e-08, + "loss": 0.0109, + "num_input_tokens_seen": 40746352, + "step": 20813 + }, + { + "epoch": 2.758648111332008, + "grad_norm": 0.026987936347723007, + "learning_rate": 7.949454631748094e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40748312, + "step": 20814 + }, + { + "epoch": 2.758780649436713, + "grad_norm": 0.970618486404419, + "learning_rate": 7.940772665359891e-08, + "loss": 0.0015, + "num_input_tokens_seen": 40749544, + "step": 20815 + }, + { + "epoch": 2.758913187541418, + "grad_norm": 0.051948267966508865, + "learning_rate": 7.93209536604056e-08, + "loss": 0.0004, + "num_input_tokens_seen": 40751040, + "step": 20816 + }, + { + "epoch": 2.759045725646123, + "grad_norm": 0.005507080815732479, + "learning_rate": 7.923422733957437e-08, + "loss": 0.0, + "num_input_tokens_seen": 40752528, + "step": 20817 + }, + { + "epoch": 2.759178263750828, + "grad_norm": 5.02571964263916, + "learning_rate": 7.914754769277721e-08, + "loss": 0.04, + "num_input_tokens_seen": 40754176, + "step": 20818 + }, + { + "epoch": 2.7593108018555332, + "grad_norm": 7.491484642028809, + "learning_rate": 7.906091472168531e-08, + "loss": 0.0364, + "num_input_tokens_seen": 40756376, + "step": 20819 + }, + { + "epoch": 2.7594433399602387, + "grad_norm": 8.351678848266602, + "learning_rate": 7.897432842796898e-08, + "loss": 0.0856, + "num_input_tokens_seen": 40758680, + "step": 20820 + }, + { + "epoch": 2.7595758780649438, + "grad_norm": 0.4821520149707794, + "learning_rate": 7.8887788813298e-08, + "loss": 0.0021, + "num_input_tokens_seen": 40760424, + "step": 20821 + }, + { + "epoch": 2.759708416169649, + "grad_norm": 0.01757717877626419, + "learning_rate": 7.88012958793405e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40761792, + "step": 20822 + }, + { + "epoch": 2.759840954274354, + "grad_norm": 13.423911094665527, + "learning_rate": 7.871484962776454e-08, + "loss": 0.1356, + "num_input_tokens_seen": 40763704, + "step": 20823 + }, + { + "epoch": 2.759973492379059, + "grad_norm": 10.499930381774902, + "learning_rate": 7.862845006023661e-08, + "loss": 0.0595, + "num_input_tokens_seen": 40765816, + "step": 20824 + }, + { + "epoch": 2.7601060304837643, + "grad_norm": 0.08448812365531921, + "learning_rate": 7.854209717842231e-08, + "loss": 0.0004, + "num_input_tokens_seen": 40767688, + "step": 20825 + }, + { + "epoch": 2.760238568588469, + "grad_norm": 0.01673898473381996, + "learning_rate": 7.845579098398698e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40769336, + "step": 20826 + }, + { + "epoch": 2.7603711066931744, + "grad_norm": 0.029192348942160606, + "learning_rate": 7.836953147859455e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40770680, + "step": 20827 + }, + { + "epoch": 2.7605036447978795, + "grad_norm": 9.478084564208984, + "learning_rate": 7.828331866390815e-08, + "loss": 0.0725, + "num_input_tokens_seen": 40772496, + "step": 20828 + }, + { + "epoch": 2.7606361829025845, + "grad_norm": 0.0016814350383356214, + "learning_rate": 7.819715254159033e-08, + "loss": 0.0, + "num_input_tokens_seen": 40773744, + "step": 20829 + }, + { + "epoch": 2.7607687210072895, + "grad_norm": 0.004367490764707327, + "learning_rate": 7.811103311330142e-08, + "loss": 0.0, + "num_input_tokens_seen": 40775584, + "step": 20830 + }, + { + "epoch": 2.7609012591119946, + "grad_norm": 0.003849869128316641, + "learning_rate": 7.802496038070318e-08, + "loss": 0.0, + "num_input_tokens_seen": 40777240, + "step": 20831 + }, + { + "epoch": 2.7610337972167, + "grad_norm": 0.002572358585894108, + "learning_rate": 7.793893434545452e-08, + "loss": 0.0, + "num_input_tokens_seen": 40778800, + "step": 20832 + }, + { + "epoch": 2.7611663353214047, + "grad_norm": 0.003549224929884076, + "learning_rate": 7.785295500921386e-08, + "loss": 0.0, + "num_input_tokens_seen": 40780568, + "step": 20833 + }, + { + "epoch": 2.76129887342611, + "grad_norm": 0.07412654906511307, + "learning_rate": 7.776702237363931e-08, + "loss": 0.0003, + "num_input_tokens_seen": 40782392, + "step": 20834 + }, + { + "epoch": 2.761431411530815, + "grad_norm": 13.964412689208984, + "learning_rate": 7.768113644038733e-08, + "loss": 0.1081, + "num_input_tokens_seen": 40784832, + "step": 20835 + }, + { + "epoch": 2.76156394963552, + "grad_norm": 9.393669128417969, + "learning_rate": 7.759529721111437e-08, + "loss": 0.1035, + "num_input_tokens_seen": 40787456, + "step": 20836 + }, + { + "epoch": 2.7616964877402252, + "grad_norm": 2.863393545150757, + "learning_rate": 7.75095046874752e-08, + "loss": 0.01, + "num_input_tokens_seen": 40789528, + "step": 20837 + }, + { + "epoch": 2.7618290258449303, + "grad_norm": 0.0035786584485322237, + "learning_rate": 7.742375887112407e-08, + "loss": 0.0, + "num_input_tokens_seen": 40790952, + "step": 20838 + }, + { + "epoch": 2.7619615639496358, + "grad_norm": 6.241516590118408, + "learning_rate": 7.733805976371384e-08, + "loss": 0.0718, + "num_input_tokens_seen": 40792808, + "step": 20839 + }, + { + "epoch": 2.7620941020543404, + "grad_norm": 5.104973316192627, + "learning_rate": 7.725240736689704e-08, + "loss": 0.0514, + "num_input_tokens_seen": 40794488, + "step": 20840 + }, + { + "epoch": 2.762226640159046, + "grad_norm": 4.05593204498291, + "learning_rate": 7.716680168232543e-08, + "loss": 0.034, + "num_input_tokens_seen": 40796648, + "step": 20841 + }, + { + "epoch": 2.762359178263751, + "grad_norm": 0.020614732056856155, + "learning_rate": 7.708124271164962e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40798376, + "step": 20842 + }, + { + "epoch": 2.762491716368456, + "grad_norm": 3.2341105937957764, + "learning_rate": 7.69957304565186e-08, + "loss": 0.0205, + "num_input_tokens_seen": 40800704, + "step": 20843 + }, + { + "epoch": 2.762624254473161, + "grad_norm": 2.981468439102173, + "learning_rate": 7.691026491858155e-08, + "loss": 0.0089, + "num_input_tokens_seen": 40802104, + "step": 20844 + }, + { + "epoch": 2.762756792577866, + "grad_norm": 0.06487394869327545, + "learning_rate": 7.682484609948582e-08, + "loss": 0.0003, + "num_input_tokens_seen": 40804848, + "step": 20845 + }, + { + "epoch": 2.7628893306825715, + "grad_norm": 0.026254601776599884, + "learning_rate": 7.673947400087894e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40807112, + "step": 20846 + }, + { + "epoch": 2.763021868787276, + "grad_norm": 0.011117507703602314, + "learning_rate": 7.665414862440684e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40809064, + "step": 20847 + }, + { + "epoch": 2.7631544068919816, + "grad_norm": 0.004128988366574049, + "learning_rate": 7.656886997171459e-08, + "loss": 0.0, + "num_input_tokens_seen": 40810424, + "step": 20848 + }, + { + "epoch": 2.7632869449966866, + "grad_norm": 7.297088623046875, + "learning_rate": 7.648363804444642e-08, + "loss": 0.076, + "num_input_tokens_seen": 40811808, + "step": 20849 + }, + { + "epoch": 2.7634194831013916, + "grad_norm": 2.4560353755950928, + "learning_rate": 7.639845284424546e-08, + "loss": 0.0057, + "num_input_tokens_seen": 40814472, + "step": 20850 + }, + { + "epoch": 2.7635520212060967, + "grad_norm": 0.1690322607755661, + "learning_rate": 7.631331437275402e-08, + "loss": 0.0006, + "num_input_tokens_seen": 40815856, + "step": 20851 + }, + { + "epoch": 2.7636845593108017, + "grad_norm": 0.0036196240689605474, + "learning_rate": 7.622822263161411e-08, + "loss": 0.0, + "num_input_tokens_seen": 40817096, + "step": 20852 + }, + { + "epoch": 2.763817097415507, + "grad_norm": 0.07059808820486069, + "learning_rate": 7.614317762246637e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40818520, + "step": 20853 + }, + { + "epoch": 2.7639496355202122, + "grad_norm": 1.5186347961425781, + "learning_rate": 7.605817934695003e-08, + "loss": 0.0082, + "num_input_tokens_seen": 40820080, + "step": 20854 + }, + { + "epoch": 2.7640821736249173, + "grad_norm": 5.195520401000977, + "learning_rate": 7.597322780670435e-08, + "loss": 0.0338, + "num_input_tokens_seen": 40821768, + "step": 20855 + }, + { + "epoch": 2.7642147117296223, + "grad_norm": 2.5688111782073975, + "learning_rate": 7.588832300336691e-08, + "loss": 0.0115, + "num_input_tokens_seen": 40823824, + "step": 20856 + }, + { + "epoch": 2.7643472498343273, + "grad_norm": 2.1141586303710938, + "learning_rate": 7.58034649385747e-08, + "loss": 0.0115, + "num_input_tokens_seen": 40827040, + "step": 20857 + }, + { + "epoch": 2.7644797879390324, + "grad_norm": 4.456235885620117, + "learning_rate": 7.571865361396452e-08, + "loss": 0.0366, + "num_input_tokens_seen": 40828720, + "step": 20858 + }, + { + "epoch": 2.7646123260437374, + "grad_norm": 0.008224617689847946, + "learning_rate": 7.563388903117114e-08, + "loss": 0.0, + "num_input_tokens_seen": 40830400, + "step": 20859 + }, + { + "epoch": 2.764744864148443, + "grad_norm": 0.008496693335473537, + "learning_rate": 7.55491711918288e-08, + "loss": 0.0, + "num_input_tokens_seen": 40832592, + "step": 20860 + }, + { + "epoch": 2.764877402253148, + "grad_norm": 0.8227792382240295, + "learning_rate": 7.546450009757067e-08, + "loss": 0.0034, + "num_input_tokens_seen": 40834584, + "step": 20861 + }, + { + "epoch": 2.765009940357853, + "grad_norm": 4.207841873168945, + "learning_rate": 7.537987575002986e-08, + "loss": 0.0343, + "num_input_tokens_seen": 40837328, + "step": 20862 + }, + { + "epoch": 2.765142478462558, + "grad_norm": 0.006260550580918789, + "learning_rate": 7.529529815083731e-08, + "loss": 0.0, + "num_input_tokens_seen": 40839520, + "step": 20863 + }, + { + "epoch": 2.765275016567263, + "grad_norm": 22.401485443115234, + "learning_rate": 7.521076730162446e-08, + "loss": 0.0867, + "num_input_tokens_seen": 40842968, + "step": 20864 + }, + { + "epoch": 2.765407554671968, + "grad_norm": 9.3337984085083, + "learning_rate": 7.512628320402088e-08, + "loss": 0.063, + "num_input_tokens_seen": 40844952, + "step": 20865 + }, + { + "epoch": 2.765540092776673, + "grad_norm": 3.445998430252075, + "learning_rate": 7.504184585965523e-08, + "loss": 0.0239, + "num_input_tokens_seen": 40847200, + "step": 20866 + }, + { + "epoch": 2.7656726308813786, + "grad_norm": 4.49904727935791, + "learning_rate": 7.495745527015569e-08, + "loss": 0.013, + "num_input_tokens_seen": 40848712, + "step": 20867 + }, + { + "epoch": 2.7658051689860836, + "grad_norm": 0.01743980497121811, + "learning_rate": 7.487311143714954e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40851400, + "step": 20868 + }, + { + "epoch": 2.7659377070907887, + "grad_norm": 0.26251015067100525, + "learning_rate": 7.478881436226271e-08, + "loss": 0.0007, + "num_input_tokens_seen": 40852968, + "step": 20869 + }, + { + "epoch": 2.7660702451954937, + "grad_norm": 0.0017102982383221388, + "learning_rate": 7.470456404712056e-08, + "loss": 0.0, + "num_input_tokens_seen": 40854416, + "step": 20870 + }, + { + "epoch": 2.7662027833001988, + "grad_norm": 2.284578561782837, + "learning_rate": 7.462036049334736e-08, + "loss": 0.0082, + "num_input_tokens_seen": 40856608, + "step": 20871 + }, + { + "epoch": 2.766335321404904, + "grad_norm": 0.0017699371092021465, + "learning_rate": 7.453620370256709e-08, + "loss": 0.0, + "num_input_tokens_seen": 40858200, + "step": 20872 + }, + { + "epoch": 2.766467859509609, + "grad_norm": 0.3686170279979706, + "learning_rate": 7.445209367640205e-08, + "loss": 0.002, + "num_input_tokens_seen": 40859912, + "step": 20873 + }, + { + "epoch": 2.7666003976143143, + "grad_norm": 14.988755226135254, + "learning_rate": 7.4368030416474e-08, + "loss": 0.0879, + "num_input_tokens_seen": 40861568, + "step": 20874 + }, + { + "epoch": 2.7667329357190193, + "grad_norm": 0.00611950783059001, + "learning_rate": 7.428401392440332e-08, + "loss": 0.0, + "num_input_tokens_seen": 40864296, + "step": 20875 + }, + { + "epoch": 2.7668654738237244, + "grad_norm": 3.680931329727173, + "learning_rate": 7.420004420181065e-08, + "loss": 0.031, + "num_input_tokens_seen": 40867280, + "step": 20876 + }, + { + "epoch": 2.7669980119284294, + "grad_norm": 1.0681432485580444, + "learning_rate": 7.411612125031415e-08, + "loss": 0.0048, + "num_input_tokens_seen": 40869296, + "step": 20877 + }, + { + "epoch": 2.7671305500331345, + "grad_norm": 3.815697431564331, + "learning_rate": 7.403224507153278e-08, + "loss": 0.0536, + "num_input_tokens_seen": 40870872, + "step": 20878 + }, + { + "epoch": 2.7672630881378395, + "grad_norm": 0.001125411712564528, + "learning_rate": 7.394841566708333e-08, + "loss": 0.0, + "num_input_tokens_seen": 40871928, + "step": 20879 + }, + { + "epoch": 2.7673956262425445, + "grad_norm": 3.9910011291503906, + "learning_rate": 7.386463303858171e-08, + "loss": 0.0243, + "num_input_tokens_seen": 40873616, + "step": 20880 + }, + { + "epoch": 2.76752816434725, + "grad_norm": 1.0161473751068115, + "learning_rate": 7.378089718764386e-08, + "loss": 0.0028, + "num_input_tokens_seen": 40875128, + "step": 20881 + }, + { + "epoch": 2.767660702451955, + "grad_norm": 6.072625160217285, + "learning_rate": 7.369720811588405e-08, + "loss": 0.0427, + "num_input_tokens_seen": 40876864, + "step": 20882 + }, + { + "epoch": 2.76779324055666, + "grad_norm": 9.03811264038086, + "learning_rate": 7.361356582491596e-08, + "loss": 0.0916, + "num_input_tokens_seen": 40878352, + "step": 20883 + }, + { + "epoch": 2.767925778661365, + "grad_norm": 3.590909242630005, + "learning_rate": 7.352997031635223e-08, + "loss": 0.0342, + "num_input_tokens_seen": 40880872, + "step": 20884 + }, + { + "epoch": 2.76805831676607, + "grad_norm": 3.073624849319458, + "learning_rate": 7.34464215918046e-08, + "loss": 0.0098, + "num_input_tokens_seen": 40882824, + "step": 20885 + }, + { + "epoch": 2.768190854870775, + "grad_norm": 1.668739676475525, + "learning_rate": 7.336291965288372e-08, + "loss": 0.0082, + "num_input_tokens_seen": 40884744, + "step": 20886 + }, + { + "epoch": 2.7683233929754802, + "grad_norm": 5.776982307434082, + "learning_rate": 7.327946450119999e-08, + "loss": 0.2135, + "num_input_tokens_seen": 40886432, + "step": 20887 + }, + { + "epoch": 2.7684559310801857, + "grad_norm": 17.412174224853516, + "learning_rate": 7.319605613836239e-08, + "loss": 0.167, + "num_input_tokens_seen": 40889000, + "step": 20888 + }, + { + "epoch": 2.7685884691848908, + "grad_norm": 0.002139165299013257, + "learning_rate": 7.31126945659788e-08, + "loss": 0.0, + "num_input_tokens_seen": 40890464, + "step": 20889 + }, + { + "epoch": 2.768721007289596, + "grad_norm": 5.644901752471924, + "learning_rate": 7.302937978565683e-08, + "loss": 0.0466, + "num_input_tokens_seen": 40892568, + "step": 20890 + }, + { + "epoch": 2.768853545394301, + "grad_norm": 0.024387791752815247, + "learning_rate": 7.294611179900297e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40893920, + "step": 20891 + }, + { + "epoch": 2.768986083499006, + "grad_norm": 0.035539500415325165, + "learning_rate": 7.286289060762175e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40896656, + "step": 20892 + }, + { + "epoch": 2.769118621603711, + "grad_norm": 0.0007234837394207716, + "learning_rate": 7.277971621311858e-08, + "loss": 0.0, + "num_input_tokens_seen": 40897744, + "step": 20893 + }, + { + "epoch": 2.769251159708416, + "grad_norm": 12.200126647949219, + "learning_rate": 7.269658861709717e-08, + "loss": 0.0584, + "num_input_tokens_seen": 40899296, + "step": 20894 + }, + { + "epoch": 2.7693836978131214, + "grad_norm": 0.021504683420062065, + "learning_rate": 7.261350782116012e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40900792, + "step": 20895 + }, + { + "epoch": 2.7695162359178265, + "grad_norm": 0.015467756427824497, + "learning_rate": 7.25304738269092e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40902688, + "step": 20896 + }, + { + "epoch": 2.7696487740225315, + "grad_norm": 1.2827223539352417, + "learning_rate": 7.244748663594509e-08, + "loss": 0.0023, + "num_input_tokens_seen": 40904528, + "step": 20897 + }, + { + "epoch": 2.7697813121272365, + "grad_norm": 4.967916965484619, + "learning_rate": 7.236454624986816e-08, + "loss": 0.0251, + "num_input_tokens_seen": 40906280, + "step": 20898 + }, + { + "epoch": 2.7699138502319416, + "grad_norm": 0.008002682588994503, + "learning_rate": 7.228165267027743e-08, + "loss": 0.0, + "num_input_tokens_seen": 40907544, + "step": 20899 + }, + { + "epoch": 2.7700463883366466, + "grad_norm": 0.13958871364593506, + "learning_rate": 7.219880589877159e-08, + "loss": 0.0008, + "num_input_tokens_seen": 40909144, + "step": 20900 + }, + { + "epoch": 2.7701789264413517, + "grad_norm": 9.278409004211426, + "learning_rate": 7.211600593694745e-08, + "loss": 0.1496, + "num_input_tokens_seen": 40911640, + "step": 20901 + }, + { + "epoch": 2.770311464546057, + "grad_norm": 0.06530249118804932, + "learning_rate": 7.20332527864015e-08, + "loss": 0.0004, + "num_input_tokens_seen": 40913104, + "step": 20902 + }, + { + "epoch": 2.770444002650762, + "grad_norm": 2.7947771549224854, + "learning_rate": 7.195054644872968e-08, + "loss": 0.0236, + "num_input_tokens_seen": 40915496, + "step": 20903 + }, + { + "epoch": 2.770576540755467, + "grad_norm": 10.681421279907227, + "learning_rate": 7.186788692552626e-08, + "loss": 0.0613, + "num_input_tokens_seen": 40917552, + "step": 20904 + }, + { + "epoch": 2.7707090788601723, + "grad_norm": 0.03688772767782211, + "learning_rate": 7.178527421838471e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40919312, + "step": 20905 + }, + { + "epoch": 2.7708416169648773, + "grad_norm": 10.164468765258789, + "learning_rate": 7.170270832889875e-08, + "loss": 0.0447, + "num_input_tokens_seen": 40920920, + "step": 20906 + }, + { + "epoch": 2.770974155069583, + "grad_norm": 2.313426971435547, + "learning_rate": 7.16201892586596e-08, + "loss": 0.0384, + "num_input_tokens_seen": 40922656, + "step": 20907 + }, + { + "epoch": 2.7711066931742874, + "grad_norm": 0.006964238826185465, + "learning_rate": 7.153771700925821e-08, + "loss": 0.0, + "num_input_tokens_seen": 40924840, + "step": 20908 + }, + { + "epoch": 2.771239231278993, + "grad_norm": 11.521326065063477, + "learning_rate": 7.145529158228525e-08, + "loss": 0.1032, + "num_input_tokens_seen": 40926456, + "step": 20909 + }, + { + "epoch": 2.771371769383698, + "grad_norm": 0.06942639499902725, + "learning_rate": 7.13729129793292e-08, + "loss": 0.0004, + "num_input_tokens_seen": 40928096, + "step": 20910 + }, + { + "epoch": 2.771504307488403, + "grad_norm": 9.1969633102417, + "learning_rate": 7.12905812019793e-08, + "loss": 0.0378, + "num_input_tokens_seen": 40930856, + "step": 20911 + }, + { + "epoch": 2.771636845593108, + "grad_norm": 9.78416919708252, + "learning_rate": 7.120829625182235e-08, + "loss": 0.0876, + "num_input_tokens_seen": 40932688, + "step": 20912 + }, + { + "epoch": 2.771769383697813, + "grad_norm": 4.0840840339660645, + "learning_rate": 7.112605813044459e-08, + "loss": 0.0267, + "num_input_tokens_seen": 40935208, + "step": 20913 + }, + { + "epoch": 2.7719019218025185, + "grad_norm": 2.2518157958984375, + "learning_rate": 7.104386683943253e-08, + "loss": 0.01, + "num_input_tokens_seen": 40936552, + "step": 20914 + }, + { + "epoch": 2.772034459907223, + "grad_norm": 0.03784725442528725, + "learning_rate": 7.096172238037018e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40939064, + "step": 20915 + }, + { + "epoch": 2.7721669980119286, + "grad_norm": 1.79575777053833, + "learning_rate": 7.087962475484156e-08, + "loss": 0.0144, + "num_input_tokens_seen": 40941840, + "step": 20916 + }, + { + "epoch": 2.7722995361166336, + "grad_norm": 0.2588895559310913, + "learning_rate": 7.079757396442954e-08, + "loss": 0.0012, + "num_input_tokens_seen": 40943616, + "step": 20917 + }, + { + "epoch": 2.7724320742213386, + "grad_norm": 0.004944967571645975, + "learning_rate": 7.071557001071594e-08, + "loss": 0.0, + "num_input_tokens_seen": 40945016, + "step": 20918 + }, + { + "epoch": 2.7725646123260437, + "grad_norm": 0.021314293146133423, + "learning_rate": 7.063361289528225e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40946856, + "step": 20919 + }, + { + "epoch": 2.7726971504307487, + "grad_norm": 2.8499176502227783, + "learning_rate": 7.055170261970861e-08, + "loss": 0.0124, + "num_input_tokens_seen": 40948608, + "step": 20920 + }, + { + "epoch": 2.772829688535454, + "grad_norm": 7.742518424987793, + "learning_rate": 7.046983918557377e-08, + "loss": 0.0411, + "num_input_tokens_seen": 40950288, + "step": 20921 + }, + { + "epoch": 2.772962226640159, + "grad_norm": 0.006479662377387285, + "learning_rate": 7.038802259445643e-08, + "loss": 0.0, + "num_input_tokens_seen": 40951704, + "step": 20922 + }, + { + "epoch": 2.7730947647448643, + "grad_norm": 0.48367395997047424, + "learning_rate": 7.030625284793424e-08, + "loss": 0.0029, + "num_input_tokens_seen": 40953320, + "step": 20923 + }, + { + "epoch": 2.7732273028495693, + "grad_norm": 0.9125158190727234, + "learning_rate": 7.022452994758372e-08, + "loss": 0.0028, + "num_input_tokens_seen": 40955176, + "step": 20924 + }, + { + "epoch": 2.7733598409542743, + "grad_norm": 0.6000428199768066, + "learning_rate": 7.014285389498055e-08, + "loss": 0.002, + "num_input_tokens_seen": 40956960, + "step": 20925 + }, + { + "epoch": 2.7734923790589794, + "grad_norm": 0.008195624686777592, + "learning_rate": 7.006122469169957e-08, + "loss": 0.0, + "num_input_tokens_seen": 40958472, + "step": 20926 + }, + { + "epoch": 2.7736249171636844, + "grad_norm": 4.389254570007324, + "learning_rate": 6.997964233931426e-08, + "loss": 0.0439, + "num_input_tokens_seen": 40960416, + "step": 20927 + }, + { + "epoch": 2.77375745526839, + "grad_norm": 20.38875961303711, + "learning_rate": 6.98981068393978e-08, + "loss": 1.0028, + "num_input_tokens_seen": 40963448, + "step": 20928 + }, + { + "epoch": 2.7738899933730945, + "grad_norm": 17.16028594970703, + "learning_rate": 6.981661819352197e-08, + "loss": 0.1438, + "num_input_tokens_seen": 40966440, + "step": 20929 + }, + { + "epoch": 2.7740225314778, + "grad_norm": 0.0930047258734703, + "learning_rate": 6.973517640325889e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40969664, + "step": 20930 + }, + { + "epoch": 2.774155069582505, + "grad_norm": 4.141513347625732, + "learning_rate": 6.965378147017782e-08, + "loss": 0.0135, + "num_input_tokens_seen": 40972504, + "step": 20931 + }, + { + "epoch": 2.77428760768721, + "grad_norm": 13.593473434448242, + "learning_rate": 6.957243339584862e-08, + "loss": 0.3344, + "num_input_tokens_seen": 40975432, + "step": 20932 + }, + { + "epoch": 2.774420145791915, + "grad_norm": 0.00039170836680568755, + "learning_rate": 6.949113218183923e-08, + "loss": 0.0, + "num_input_tokens_seen": 40976400, + "step": 20933 + }, + { + "epoch": 2.77455268389662, + "grad_norm": 0.09563025832176208, + "learning_rate": 6.940987782971781e-08, + "loss": 0.0004, + "num_input_tokens_seen": 40977664, + "step": 20934 + }, + { + "epoch": 2.7746852220013256, + "grad_norm": 9.69224739074707, + "learning_rate": 6.932867034105062e-08, + "loss": 0.1041, + "num_input_tokens_seen": 40979776, + "step": 20935 + }, + { + "epoch": 2.77481776010603, + "grad_norm": 0.01672360673546791, + "learning_rate": 6.924750971740363e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40982360, + "step": 20936 + }, + { + "epoch": 2.7749502982107357, + "grad_norm": 0.054197996854782104, + "learning_rate": 6.91663959603417e-08, + "loss": 0.0003, + "num_input_tokens_seen": 40984832, + "step": 20937 + }, + { + "epoch": 2.7750828363154407, + "grad_norm": 1.2980748414993286, + "learning_rate": 6.908532907142829e-08, + "loss": 0.003, + "num_input_tokens_seen": 40986464, + "step": 20938 + }, + { + "epoch": 2.7752153744201458, + "grad_norm": 0.8785792589187622, + "learning_rate": 6.90043090522266e-08, + "loss": 0.0022, + "num_input_tokens_seen": 40987720, + "step": 20939 + }, + { + "epoch": 2.775347912524851, + "grad_norm": 0.016567552462220192, + "learning_rate": 6.8923335904299e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40989208, + "step": 20940 + }, + { + "epoch": 2.775480450629556, + "grad_norm": 0.0059869615361094475, + "learning_rate": 6.884240962920674e-08, + "loss": 0.0, + "num_input_tokens_seen": 40990984, + "step": 20941 + }, + { + "epoch": 2.7756129887342613, + "grad_norm": 0.003520689904689789, + "learning_rate": 6.876153022851023e-08, + "loss": 0.0, + "num_input_tokens_seen": 40993672, + "step": 20942 + }, + { + "epoch": 2.7757455268389664, + "grad_norm": 0.013717461377382278, + "learning_rate": 6.868069770376823e-08, + "loss": 0.0001, + "num_input_tokens_seen": 40995248, + "step": 20943 + }, + { + "epoch": 2.7758780649436714, + "grad_norm": 0.04156718775629997, + "learning_rate": 6.859991205653976e-08, + "loss": 0.0002, + "num_input_tokens_seen": 40997632, + "step": 20944 + }, + { + "epoch": 2.7760106030483764, + "grad_norm": 9.642149925231934, + "learning_rate": 6.851917328838248e-08, + "loss": 0.2821, + "num_input_tokens_seen": 40999312, + "step": 20945 + }, + { + "epoch": 2.7761431411530815, + "grad_norm": 1.878269910812378, + "learning_rate": 6.843848140085263e-08, + "loss": 0.0242, + "num_input_tokens_seen": 41001688, + "step": 20946 + }, + { + "epoch": 2.7762756792577865, + "grad_norm": 0.007642155978828669, + "learning_rate": 6.835783639550647e-08, + "loss": 0.0, + "num_input_tokens_seen": 41004504, + "step": 20947 + }, + { + "epoch": 2.7764082173624915, + "grad_norm": 6.041691303253174, + "learning_rate": 6.827723827389888e-08, + "loss": 0.1587, + "num_input_tokens_seen": 41006464, + "step": 20948 + }, + { + "epoch": 2.776540755467197, + "grad_norm": 0.08418311178684235, + "learning_rate": 6.81966870375833e-08, + "loss": 0.0004, + "num_input_tokens_seen": 41008680, + "step": 20949 + }, + { + "epoch": 2.776673293571902, + "grad_norm": 0.005496255587786436, + "learning_rate": 6.811618268811381e-08, + "loss": 0.0, + "num_input_tokens_seen": 41010008, + "step": 20950 + }, + { + "epoch": 2.776805831676607, + "grad_norm": 0.012865745462477207, + "learning_rate": 6.803572522704166e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41011872, + "step": 20951 + }, + { + "epoch": 2.776938369781312, + "grad_norm": 10.924593925476074, + "learning_rate": 6.795531465591838e-08, + "loss": 0.0895, + "num_input_tokens_seen": 41014808, + "step": 20952 + }, + { + "epoch": 2.777070907886017, + "grad_norm": 1.9252498149871826, + "learning_rate": 6.787495097629438e-08, + "loss": 0.0165, + "num_input_tokens_seen": 41016424, + "step": 20953 + }, + { + "epoch": 2.777203445990722, + "grad_norm": 0.0039898911491036415, + "learning_rate": 6.779463418971926e-08, + "loss": 0.0, + "num_input_tokens_seen": 41018072, + "step": 20954 + }, + { + "epoch": 2.7773359840954273, + "grad_norm": 3.61431884765625, + "learning_rate": 6.771436429774153e-08, + "loss": 0.0196, + "num_input_tokens_seen": 41020376, + "step": 20955 + }, + { + "epoch": 2.7774685222001327, + "grad_norm": 0.003355487948283553, + "learning_rate": 6.763414130190882e-08, + "loss": 0.0, + "num_input_tokens_seen": 41021840, + "step": 20956 + }, + { + "epoch": 2.7776010603048378, + "grad_norm": 10.288333892822266, + "learning_rate": 6.755396520376794e-08, + "loss": 0.116, + "num_input_tokens_seen": 41023656, + "step": 20957 + }, + { + "epoch": 2.777733598409543, + "grad_norm": 11.278108596801758, + "learning_rate": 6.747383600486435e-08, + "loss": 0.114, + "num_input_tokens_seen": 41025544, + "step": 20958 + }, + { + "epoch": 2.777866136514248, + "grad_norm": 0.0066740806214511395, + "learning_rate": 6.739375370674317e-08, + "loss": 0.0, + "num_input_tokens_seen": 41027352, + "step": 20959 + }, + { + "epoch": 2.777998674618953, + "grad_norm": 14.183767318725586, + "learning_rate": 6.731371831094901e-08, + "loss": 0.0629, + "num_input_tokens_seen": 41029024, + "step": 20960 + }, + { + "epoch": 2.778131212723658, + "grad_norm": 0.0038550645112991333, + "learning_rate": 6.723372981902481e-08, + "loss": 0.0, + "num_input_tokens_seen": 41030304, + "step": 20961 + }, + { + "epoch": 2.778263750828363, + "grad_norm": 0.11425229161977768, + "learning_rate": 6.715378823251212e-08, + "loss": 0.0006, + "num_input_tokens_seen": 41032240, + "step": 20962 + }, + { + "epoch": 2.7783962889330684, + "grad_norm": 11.547921180725098, + "learning_rate": 6.707389355295273e-08, + "loss": 0.0341, + "num_input_tokens_seen": 41034352, + "step": 20963 + }, + { + "epoch": 2.7785288270377735, + "grad_norm": 0.7887674570083618, + "learning_rate": 6.69940457818874e-08, + "loss": 0.005, + "num_input_tokens_seen": 41035840, + "step": 20964 + }, + { + "epoch": 2.7786613651424785, + "grad_norm": 0.02141967974603176, + "learning_rate": 6.691424492085486e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41038048, + "step": 20965 + }, + { + "epoch": 2.7787939032471836, + "grad_norm": 0.21203473210334778, + "learning_rate": 6.683449097139471e-08, + "loss": 0.0009, + "num_input_tokens_seen": 41039888, + "step": 20966 + }, + { + "epoch": 2.7789264413518886, + "grad_norm": 14.88135051727295, + "learning_rate": 6.675478393504381e-08, + "loss": 0.2256, + "num_input_tokens_seen": 41042184, + "step": 20967 + }, + { + "epoch": 2.7790589794565936, + "grad_norm": 0.012147413566708565, + "learning_rate": 6.667512381333951e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41044184, + "step": 20968 + }, + { + "epoch": 2.7791915175612987, + "grad_norm": 9.570160865783691, + "learning_rate": 6.659551060781699e-08, + "loss": 0.0594, + "num_input_tokens_seen": 41045856, + "step": 20969 + }, + { + "epoch": 2.779324055666004, + "grad_norm": 0.04716145992279053, + "learning_rate": 6.651594432001197e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41047144, + "step": 20970 + }, + { + "epoch": 2.779456593770709, + "grad_norm": 18.108707427978516, + "learning_rate": 6.643642495145847e-08, + "loss": 0.078, + "num_input_tokens_seen": 41049832, + "step": 20971 + }, + { + "epoch": 2.7795891318754142, + "grad_norm": 0.04913478344678879, + "learning_rate": 6.635695250368945e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41052064, + "step": 20972 + }, + { + "epoch": 2.7797216699801193, + "grad_norm": 6.319462776184082, + "learning_rate": 6.627752697823758e-08, + "loss": 0.0428, + "num_input_tokens_seen": 41053824, + "step": 20973 + }, + { + "epoch": 2.7798542080848243, + "grad_norm": 2.920759916305542, + "learning_rate": 6.619814837663357e-08, + "loss": 0.0215, + "num_input_tokens_seen": 41055656, + "step": 20974 + }, + { + "epoch": 2.7799867461895293, + "grad_norm": 0.05145959183573723, + "learning_rate": 6.611881670040815e-08, + "loss": 0.0004, + "num_input_tokens_seen": 41057384, + "step": 20975 + }, + { + "epoch": 2.7801192842942344, + "grad_norm": 0.014735309407114983, + "learning_rate": 6.603953195109119e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41058728, + "step": 20976 + }, + { + "epoch": 2.78025182239894, + "grad_norm": 0.008051185868680477, + "learning_rate": 6.596029413021121e-08, + "loss": 0.0, + "num_input_tokens_seen": 41060328, + "step": 20977 + }, + { + "epoch": 2.780384360503645, + "grad_norm": 0.14655928313732147, + "learning_rate": 6.588110323929586e-08, + "loss": 0.0007, + "num_input_tokens_seen": 41062072, + "step": 20978 + }, + { + "epoch": 2.78051689860835, + "grad_norm": 1.7804315090179443, + "learning_rate": 6.580195927987199e-08, + "loss": 0.0075, + "num_input_tokens_seen": 41063224, + "step": 20979 + }, + { + "epoch": 2.780649436713055, + "grad_norm": 0.5162862539291382, + "learning_rate": 6.572286225346558e-08, + "loss": 0.0028, + "num_input_tokens_seen": 41064680, + "step": 20980 + }, + { + "epoch": 2.78078197481776, + "grad_norm": 7.215555191040039, + "learning_rate": 6.564381216160181e-08, + "loss": 0.0704, + "num_input_tokens_seen": 41066296, + "step": 20981 + }, + { + "epoch": 2.780914512922465, + "grad_norm": 5.670719146728516, + "learning_rate": 6.556480900580448e-08, + "loss": 0.0152, + "num_input_tokens_seen": 41068880, + "step": 20982 + }, + { + "epoch": 2.78104705102717, + "grad_norm": 0.21926355361938477, + "learning_rate": 6.548585278759733e-08, + "loss": 0.0007, + "num_input_tokens_seen": 41071256, + "step": 20983 + }, + { + "epoch": 2.7811795891318756, + "grad_norm": 7.80220365524292, + "learning_rate": 6.540694350850252e-08, + "loss": 0.153, + "num_input_tokens_seen": 41072688, + "step": 20984 + }, + { + "epoch": 2.7813121272365806, + "grad_norm": 6.4989166259765625, + "learning_rate": 6.532808117004102e-08, + "loss": 0.0674, + "num_input_tokens_seen": 41075048, + "step": 20985 + }, + { + "epoch": 2.7814446653412856, + "grad_norm": 0.36534222960472107, + "learning_rate": 6.524926577373414e-08, + "loss": 0.0016, + "num_input_tokens_seen": 41077528, + "step": 20986 + }, + { + "epoch": 2.7815772034459907, + "grad_norm": 0.174643412232399, + "learning_rate": 6.517049732110064e-08, + "loss": 0.0006, + "num_input_tokens_seen": 41079352, + "step": 20987 + }, + { + "epoch": 2.7817097415506957, + "grad_norm": 0.062140483409166336, + "learning_rate": 6.509177581365989e-08, + "loss": 0.0003, + "num_input_tokens_seen": 41080960, + "step": 20988 + }, + { + "epoch": 2.7818422796554008, + "grad_norm": 0.08671267330646515, + "learning_rate": 6.501310125292953e-08, + "loss": 0.0004, + "num_input_tokens_seen": 41082728, + "step": 20989 + }, + { + "epoch": 2.781974817760106, + "grad_norm": 6.544454574584961, + "learning_rate": 6.493447364042643e-08, + "loss": 0.0227, + "num_input_tokens_seen": 41084568, + "step": 20990 + }, + { + "epoch": 2.7821073558648113, + "grad_norm": 6.627952575683594, + "learning_rate": 6.485589297766603e-08, + "loss": 0.0737, + "num_input_tokens_seen": 41086464, + "step": 20991 + }, + { + "epoch": 2.7822398939695163, + "grad_norm": 0.0019624680280685425, + "learning_rate": 6.477735926616436e-08, + "loss": 0.0, + "num_input_tokens_seen": 41087936, + "step": 20992 + }, + { + "epoch": 2.7823724320742214, + "grad_norm": 0.06000867113471031, + "learning_rate": 6.46988725074349e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41089320, + "step": 20993 + }, + { + "epoch": 2.7825049701789264, + "grad_norm": 9.613629341125488, + "learning_rate": 6.462043270299174e-08, + "loss": 0.0933, + "num_input_tokens_seen": 41091288, + "step": 20994 + }, + { + "epoch": 2.7826375082836314, + "grad_norm": 0.061959296464920044, + "learning_rate": 6.454203985434643e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41092800, + "step": 20995 + }, + { + "epoch": 2.782770046388337, + "grad_norm": 0.013948083855211735, + "learning_rate": 6.446369396301055e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41095352, + "step": 20996 + }, + { + "epoch": 2.7829025844930415, + "grad_norm": 6.633408546447754, + "learning_rate": 6.438539503049513e-08, + "loss": 0.0548, + "num_input_tokens_seen": 41097592, + "step": 20997 + }, + { + "epoch": 2.783035122597747, + "grad_norm": 0.3199999928474426, + "learning_rate": 6.43071430583092e-08, + "loss": 0.0015, + "num_input_tokens_seen": 41099096, + "step": 20998 + }, + { + "epoch": 2.783167660702452, + "grad_norm": 0.003453528042882681, + "learning_rate": 6.422893804796215e-08, + "loss": 0.0, + "num_input_tokens_seen": 41100520, + "step": 20999 + }, + { + "epoch": 2.783300198807157, + "grad_norm": 0.913486123085022, + "learning_rate": 6.415078000096137e-08, + "loss": 0.0018, + "num_input_tokens_seen": 41102136, + "step": 21000 + }, + { + "epoch": 2.783432736911862, + "grad_norm": 0.5370334982872009, + "learning_rate": 6.407266891881397e-08, + "loss": 0.0024, + "num_input_tokens_seen": 41104144, + "step": 21001 + }, + { + "epoch": 2.783565275016567, + "grad_norm": 0.19792629778385162, + "learning_rate": 6.399460480302599e-08, + "loss": 0.0004, + "num_input_tokens_seen": 41105504, + "step": 21002 + }, + { + "epoch": 2.7836978131212726, + "grad_norm": 2.6696712970733643, + "learning_rate": 6.391658765510261e-08, + "loss": 0.0065, + "num_input_tokens_seen": 41106944, + "step": 21003 + }, + { + "epoch": 2.783830351225977, + "grad_norm": 0.011088461615145206, + "learning_rate": 6.383861747654818e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41109040, + "step": 21004 + }, + { + "epoch": 2.7839628893306827, + "grad_norm": 0.018623841926455498, + "learning_rate": 6.376069426886538e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41111256, + "step": 21005 + }, + { + "epoch": 2.7840954274353877, + "grad_norm": 0.08331770449876785, + "learning_rate": 6.368281803355692e-08, + "loss": 0.0005, + "num_input_tokens_seen": 41113304, + "step": 21006 + }, + { + "epoch": 2.7842279655400928, + "grad_norm": 0.035351209342479706, + "learning_rate": 6.36049887721249e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41115184, + "step": 21007 + }, + { + "epoch": 2.784360503644798, + "grad_norm": 4.194624900817871, + "learning_rate": 6.352720648606953e-08, + "loss": 0.0173, + "num_input_tokens_seen": 41116832, + "step": 21008 + }, + { + "epoch": 2.784493041749503, + "grad_norm": 0.04171781241893768, + "learning_rate": 6.344947117689043e-08, + "loss": 0.0003, + "num_input_tokens_seen": 41118408, + "step": 21009 + }, + { + "epoch": 2.7846255798542083, + "grad_norm": 0.028762217611074448, + "learning_rate": 6.337178284608641e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41120184, + "step": 21010 + }, + { + "epoch": 2.784758117958913, + "grad_norm": 0.1323934942483902, + "learning_rate": 6.329414149515516e-08, + "loss": 0.0007, + "num_input_tokens_seen": 41122192, + "step": 21011 + }, + { + "epoch": 2.7848906560636184, + "grad_norm": 0.12103257328271866, + "learning_rate": 6.321654712559382e-08, + "loss": 0.0004, + "num_input_tokens_seen": 41123800, + "step": 21012 + }, + { + "epoch": 2.7850231941683234, + "grad_norm": 4.631160259246826, + "learning_rate": 6.31389997388987e-08, + "loss": 0.0287, + "num_input_tokens_seen": 41125768, + "step": 21013 + }, + { + "epoch": 2.7851557322730285, + "grad_norm": 0.3393952548503876, + "learning_rate": 6.306149933656469e-08, + "loss": 0.001, + "num_input_tokens_seen": 41127384, + "step": 21014 + }, + { + "epoch": 2.7852882703777335, + "grad_norm": 0.04788094386458397, + "learning_rate": 6.298404592008617e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41129392, + "step": 21015 + }, + { + "epoch": 2.7854208084824386, + "grad_norm": 3.0164635181427, + "learning_rate": 6.290663949095637e-08, + "loss": 0.0414, + "num_input_tokens_seen": 41130928, + "step": 21016 + }, + { + "epoch": 2.785553346587144, + "grad_norm": 2.257233142852783, + "learning_rate": 6.282928005066773e-08, + "loss": 0.0071, + "num_input_tokens_seen": 41132136, + "step": 21017 + }, + { + "epoch": 2.7856858846918486, + "grad_norm": 1.1426843404769897, + "learning_rate": 6.275196760071212e-08, + "loss": 0.0069, + "num_input_tokens_seen": 41134312, + "step": 21018 + }, + { + "epoch": 2.785818422796554, + "grad_norm": 5.766793251037598, + "learning_rate": 6.267470214257999e-08, + "loss": 0.0566, + "num_input_tokens_seen": 41135912, + "step": 21019 + }, + { + "epoch": 2.785950960901259, + "grad_norm": 0.09359381347894669, + "learning_rate": 6.259748367776098e-08, + "loss": 0.0005, + "num_input_tokens_seen": 41137632, + "step": 21020 + }, + { + "epoch": 2.786083499005964, + "grad_norm": 9.862256050109863, + "learning_rate": 6.252031220774391e-08, + "loss": 0.138, + "num_input_tokens_seen": 41138848, + "step": 21021 + }, + { + "epoch": 2.7862160371106692, + "grad_norm": 0.6382250189781189, + "learning_rate": 6.244318773401676e-08, + "loss": 0.0008, + "num_input_tokens_seen": 41140416, + "step": 21022 + }, + { + "epoch": 2.7863485752153743, + "grad_norm": 0.016723182052373886, + "learning_rate": 6.23661102580661e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41141848, + "step": 21023 + }, + { + "epoch": 2.7864811133200797, + "grad_norm": 30.496265411376953, + "learning_rate": 6.22890797813791e-08, + "loss": 0.1064, + "num_input_tokens_seen": 41143696, + "step": 21024 + }, + { + "epoch": 2.786613651424785, + "grad_norm": 0.16997209191322327, + "learning_rate": 6.221209630544011e-08, + "loss": 0.0005, + "num_input_tokens_seen": 41145048, + "step": 21025 + }, + { + "epoch": 2.78674618952949, + "grad_norm": 0.0016112951561808586, + "learning_rate": 6.21351598317338e-08, + "loss": 0.0, + "num_input_tokens_seen": 41146480, + "step": 21026 + }, + { + "epoch": 2.786878727634195, + "grad_norm": 4.7025580406188965, + "learning_rate": 6.205827036174311e-08, + "loss": 0.0475, + "num_input_tokens_seen": 41148424, + "step": 21027 + }, + { + "epoch": 2.7870112657389, + "grad_norm": 0.00173857097979635, + "learning_rate": 6.198142789695105e-08, + "loss": 0.0, + "num_input_tokens_seen": 41149456, + "step": 21028 + }, + { + "epoch": 2.787143803843605, + "grad_norm": 6.694767475128174, + "learning_rate": 6.190463243883865e-08, + "loss": 0.022, + "num_input_tokens_seen": 41151104, + "step": 21029 + }, + { + "epoch": 2.78727634194831, + "grad_norm": 0.001707921619527042, + "learning_rate": 6.182788398888722e-08, + "loss": 0.0, + "num_input_tokens_seen": 41152808, + "step": 21030 + }, + { + "epoch": 2.7874088800530155, + "grad_norm": 0.8749745488166809, + "learning_rate": 6.175118254857614e-08, + "loss": 0.0032, + "num_input_tokens_seen": 41154528, + "step": 21031 + }, + { + "epoch": 2.7875414181577205, + "grad_norm": 0.003589886473491788, + "learning_rate": 6.167452811938423e-08, + "loss": 0.0, + "num_input_tokens_seen": 41156608, + "step": 21032 + }, + { + "epoch": 2.7876739562624255, + "grad_norm": 4.973377704620361, + "learning_rate": 6.159792070278946e-08, + "loss": 0.0347, + "num_input_tokens_seen": 41158952, + "step": 21033 + }, + { + "epoch": 2.7878064943671306, + "grad_norm": 8.801827430725098, + "learning_rate": 6.152136030026928e-08, + "loss": 0.0778, + "num_input_tokens_seen": 41160904, + "step": 21034 + }, + { + "epoch": 2.7879390324718356, + "grad_norm": 4.197114944458008, + "learning_rate": 6.144484691329916e-08, + "loss": 0.0446, + "num_input_tokens_seen": 41163760, + "step": 21035 + }, + { + "epoch": 2.7880715705765406, + "grad_norm": 6.506473541259766, + "learning_rate": 6.136838054335487e-08, + "loss": 0.0874, + "num_input_tokens_seen": 41165608, + "step": 21036 + }, + { + "epoch": 2.7882041086812457, + "grad_norm": 0.02419748529791832, + "learning_rate": 6.129196119191022e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41167192, + "step": 21037 + }, + { + "epoch": 2.788336646785951, + "grad_norm": 0.007670126389712095, + "learning_rate": 6.121558886043932e-08, + "loss": 0.0, + "num_input_tokens_seen": 41168832, + "step": 21038 + }, + { + "epoch": 2.788469184890656, + "grad_norm": 0.6606788635253906, + "learning_rate": 6.113926355041406e-08, + "loss": 0.002, + "num_input_tokens_seen": 41170656, + "step": 21039 + }, + { + "epoch": 2.7886017229953612, + "grad_norm": 0.015165066346526146, + "learning_rate": 6.106298526330629e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41172280, + "step": 21040 + }, + { + "epoch": 2.7887342611000663, + "grad_norm": 9.663131713867188, + "learning_rate": 6.098675400058679e-08, + "loss": 0.1466, + "num_input_tokens_seen": 41174200, + "step": 21041 + }, + { + "epoch": 2.7888667992047713, + "grad_norm": 10.645883560180664, + "learning_rate": 6.091056976372523e-08, + "loss": 0.045, + "num_input_tokens_seen": 41176288, + "step": 21042 + }, + { + "epoch": 2.7889993373094764, + "grad_norm": 0.006707888096570969, + "learning_rate": 6.083443255419041e-08, + "loss": 0.0, + "num_input_tokens_seen": 41177504, + "step": 21043 + }, + { + "epoch": 2.7891318754141814, + "grad_norm": 0.03991345316171646, + "learning_rate": 6.075834237345062e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41179008, + "step": 21044 + }, + { + "epoch": 2.789264413518887, + "grad_norm": 2.372570276260376, + "learning_rate": 6.068229922297275e-08, + "loss": 0.0134, + "num_input_tokens_seen": 41180824, + "step": 21045 + }, + { + "epoch": 2.789396951623592, + "grad_norm": 0.7408421039581299, + "learning_rate": 6.060630310422283e-08, + "loss": 0.0054, + "num_input_tokens_seen": 41182392, + "step": 21046 + }, + { + "epoch": 2.789529489728297, + "grad_norm": 0.0022928647231310606, + "learning_rate": 6.053035401866636e-08, + "loss": 0.0, + "num_input_tokens_seen": 41184736, + "step": 21047 + }, + { + "epoch": 2.789662027833002, + "grad_norm": 0.0016083281952887774, + "learning_rate": 6.045445196776745e-08, + "loss": 0.0, + "num_input_tokens_seen": 41186016, + "step": 21048 + }, + { + "epoch": 2.789794565937707, + "grad_norm": 0.006037278566509485, + "learning_rate": 6.037859695298964e-08, + "loss": 0.0, + "num_input_tokens_seen": 41187840, + "step": 21049 + }, + { + "epoch": 2.789927104042412, + "grad_norm": 0.0038180644623935223, + "learning_rate": 6.030278897579567e-08, + "loss": 0.0, + "num_input_tokens_seen": 41189240, + "step": 21050 + }, + { + "epoch": 2.790059642147117, + "grad_norm": 0.0028970923740416765, + "learning_rate": 6.022702803764685e-08, + "loss": 0.0, + "num_input_tokens_seen": 41190576, + "step": 21051 + }, + { + "epoch": 2.7901921802518226, + "grad_norm": 0.03345775976777077, + "learning_rate": 6.015131414000397e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41191768, + "step": 21052 + }, + { + "epoch": 2.7903247183565276, + "grad_norm": 0.4801238775253296, + "learning_rate": 6.007564728432669e-08, + "loss": 0.0012, + "num_input_tokens_seen": 41194200, + "step": 21053 + }, + { + "epoch": 2.7904572564612327, + "grad_norm": 0.0259841438382864, + "learning_rate": 6.000002747207468e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41196000, + "step": 21054 + }, + { + "epoch": 2.7905897945659377, + "grad_norm": 0.18014776706695557, + "learning_rate": 5.992445470470537e-08, + "loss": 0.0007, + "num_input_tokens_seen": 41197592, + "step": 21055 + }, + { + "epoch": 2.7907223326706427, + "grad_norm": 0.006258269306272268, + "learning_rate": 5.984892898367567e-08, + "loss": 0.0, + "num_input_tokens_seen": 41198768, + "step": 21056 + }, + { + "epoch": 2.7908548707753478, + "grad_norm": 5.35130500793457, + "learning_rate": 5.977345031044219e-08, + "loss": 0.2189, + "num_input_tokens_seen": 41201680, + "step": 21057 + }, + { + "epoch": 2.790987408880053, + "grad_norm": 0.01256349217146635, + "learning_rate": 5.969801868645958e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41204120, + "step": 21058 + }, + { + "epoch": 2.7911199469847583, + "grad_norm": 4.266199588775635, + "learning_rate": 5.96226341131828e-08, + "loss": 0.0224, + "num_input_tokens_seen": 41206392, + "step": 21059 + }, + { + "epoch": 2.7912524850894633, + "grad_norm": 28.84193229675293, + "learning_rate": 5.954729659206543e-08, + "loss": 0.4753, + "num_input_tokens_seen": 41209168, + "step": 21060 + }, + { + "epoch": 2.7913850231941684, + "grad_norm": 10.301603317260742, + "learning_rate": 5.947200612455961e-08, + "loss": 0.0526, + "num_input_tokens_seen": 41210424, + "step": 21061 + }, + { + "epoch": 2.7915175612988734, + "grad_norm": 6.536253452301025, + "learning_rate": 5.939676271211725e-08, + "loss": 0.0659, + "num_input_tokens_seen": 41212264, + "step": 21062 + }, + { + "epoch": 2.7916500994035784, + "grad_norm": 0.04836488142609596, + "learning_rate": 5.9321566356188856e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41213832, + "step": 21063 + }, + { + "epoch": 2.7917826375082835, + "grad_norm": 5.425322532653809, + "learning_rate": 5.9246417058224645e-08, + "loss": 0.0165, + "num_input_tokens_seen": 41215776, + "step": 21064 + }, + { + "epoch": 2.7919151756129885, + "grad_norm": 8.702264785766602, + "learning_rate": 5.917131481967292e-08, + "loss": 0.1585, + "num_input_tokens_seen": 41217144, + "step": 21065 + }, + { + "epoch": 2.792047713717694, + "grad_norm": 19.054758071899414, + "learning_rate": 5.9096259641982234e-08, + "loss": 0.0849, + "num_input_tokens_seen": 41219560, + "step": 21066 + }, + { + "epoch": 2.792180251822399, + "grad_norm": 5.8528313636779785, + "learning_rate": 5.902125152659977e-08, + "loss": 0.0227, + "num_input_tokens_seen": 41222208, + "step": 21067 + }, + { + "epoch": 2.792312789927104, + "grad_norm": 0.0013631965266540647, + "learning_rate": 5.894629047497102e-08, + "loss": 0.0, + "num_input_tokens_seen": 41223472, + "step": 21068 + }, + { + "epoch": 2.792445328031809, + "grad_norm": 6.5479350090026855, + "learning_rate": 5.887137648854235e-08, + "loss": 0.0461, + "num_input_tokens_seen": 41225312, + "step": 21069 + }, + { + "epoch": 2.792577866136514, + "grad_norm": 3.0046913623809814, + "learning_rate": 5.879650956875704e-08, + "loss": 0.0311, + "num_input_tokens_seen": 41227288, + "step": 21070 + }, + { + "epoch": 2.792710404241219, + "grad_norm": 0.004020113032311201, + "learning_rate": 5.872168971705949e-08, + "loss": 0.0, + "num_input_tokens_seen": 41228664, + "step": 21071 + }, + { + "epoch": 2.7928429423459242, + "grad_norm": 4.783579349517822, + "learning_rate": 5.864691693489216e-08, + "loss": 0.1455, + "num_input_tokens_seen": 41230976, + "step": 21072 + }, + { + "epoch": 2.7929754804506297, + "grad_norm": 0.011726445518434048, + "learning_rate": 5.857219122369584e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41233696, + "step": 21073 + }, + { + "epoch": 2.7931080185553347, + "grad_norm": 18.745065689086914, + "learning_rate": 5.849751258491243e-08, + "loss": 0.1397, + "num_input_tokens_seen": 41235264, + "step": 21074 + }, + { + "epoch": 2.79324055666004, + "grad_norm": 10.672466278076172, + "learning_rate": 5.842288101998106e-08, + "loss": 0.1801, + "num_input_tokens_seen": 41237632, + "step": 21075 + }, + { + "epoch": 2.793373094764745, + "grad_norm": 6.761448383331299, + "learning_rate": 5.834829653034085e-08, + "loss": 0.1107, + "num_input_tokens_seen": 41239984, + "step": 21076 + }, + { + "epoch": 2.79350563286945, + "grad_norm": 4.067572593688965, + "learning_rate": 5.827375911743011e-08, + "loss": 0.0257, + "num_input_tokens_seen": 41242088, + "step": 21077 + }, + { + "epoch": 2.7936381709741553, + "grad_norm": 8.263463973999023, + "learning_rate": 5.8199268782685724e-08, + "loss": 0.0641, + "num_input_tokens_seen": 41244496, + "step": 21078 + }, + { + "epoch": 2.79377070907886, + "grad_norm": 0.0018659187480807304, + "learning_rate": 5.812482552754378e-08, + "loss": 0.0, + "num_input_tokens_seen": 41246264, + "step": 21079 + }, + { + "epoch": 2.7939032471835654, + "grad_norm": 14.75499439239502, + "learning_rate": 5.80504293534398e-08, + "loss": 0.1684, + "num_input_tokens_seen": 41248768, + "step": 21080 + }, + { + "epoch": 2.7940357852882705, + "grad_norm": 5.678816795349121, + "learning_rate": 5.7976080261808456e-08, + "loss": 0.0192, + "num_input_tokens_seen": 41251192, + "step": 21081 + }, + { + "epoch": 2.7941683233929755, + "grad_norm": 0.04435134306550026, + "learning_rate": 5.7901778254082495e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41252600, + "step": 21082 + }, + { + "epoch": 2.7943008614976805, + "grad_norm": 0.01724880561232567, + "learning_rate": 5.782752333169522e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41254024, + "step": 21083 + }, + { + "epoch": 2.7944333996023856, + "grad_norm": 6.973931789398193, + "learning_rate": 5.775331549607799e-08, + "loss": 0.0548, + "num_input_tokens_seen": 41256040, + "step": 21084 + }, + { + "epoch": 2.794565937707091, + "grad_norm": 7.36980676651001, + "learning_rate": 5.767915474866159e-08, + "loss": 0.1465, + "num_input_tokens_seen": 41258208, + "step": 21085 + }, + { + "epoch": 2.7946984758117956, + "grad_norm": 6.036591529846191, + "learning_rate": 5.760504109087628e-08, + "loss": 0.0708, + "num_input_tokens_seen": 41259800, + "step": 21086 + }, + { + "epoch": 2.794831013916501, + "grad_norm": 4.778675556182861, + "learning_rate": 5.753097452415063e-08, + "loss": 0.0308, + "num_input_tokens_seen": 41261512, + "step": 21087 + }, + { + "epoch": 2.794963552021206, + "grad_norm": 0.6089677214622498, + "learning_rate": 5.745695504991239e-08, + "loss": 0.0023, + "num_input_tokens_seen": 41264016, + "step": 21088 + }, + { + "epoch": 2.795096090125911, + "grad_norm": 4.966409683227539, + "learning_rate": 5.738298266958903e-08, + "loss": 0.025, + "num_input_tokens_seen": 41265792, + "step": 21089 + }, + { + "epoch": 2.7952286282306162, + "grad_norm": 0.11304580420255661, + "learning_rate": 5.7309057384607184e-08, + "loss": 0.0005, + "num_input_tokens_seen": 41267536, + "step": 21090 + }, + { + "epoch": 2.7953611663353213, + "grad_norm": 8.519670486450195, + "learning_rate": 5.723517919639182e-08, + "loss": 0.0351, + "num_input_tokens_seen": 41269704, + "step": 21091 + }, + { + "epoch": 2.7954937044400268, + "grad_norm": 0.012391739524900913, + "learning_rate": 5.716134810636736e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41271848, + "step": 21092 + }, + { + "epoch": 2.7956262425447314, + "grad_norm": 4.687692165374756, + "learning_rate": 5.708756411595712e-08, + "loss": 0.0194, + "num_input_tokens_seen": 41273552, + "step": 21093 + }, + { + "epoch": 2.795758780649437, + "grad_norm": 0.19010545313358307, + "learning_rate": 5.7013827226584104e-08, + "loss": 0.0005, + "num_input_tokens_seen": 41275032, + "step": 21094 + }, + { + "epoch": 2.795891318754142, + "grad_norm": 0.004423375241458416, + "learning_rate": 5.6940137439669974e-08, + "loss": 0.0, + "num_input_tokens_seen": 41276496, + "step": 21095 + }, + { + "epoch": 2.796023856858847, + "grad_norm": 3.5739452838897705, + "learning_rate": 5.686649475663525e-08, + "loss": 0.0097, + "num_input_tokens_seen": 41279376, + "step": 21096 + }, + { + "epoch": 2.796156394963552, + "grad_norm": 0.06174447759985924, + "learning_rate": 5.6792899178899905e-08, + "loss": 0.0003, + "num_input_tokens_seen": 41281744, + "step": 21097 + }, + { + "epoch": 2.796288933068257, + "grad_norm": 2.365490674972534, + "learning_rate": 5.6719350707883095e-08, + "loss": 0.0073, + "num_input_tokens_seen": 41283520, + "step": 21098 + }, + { + "epoch": 2.7964214711729625, + "grad_norm": 0.046070653945207596, + "learning_rate": 5.664584934500228e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41285160, + "step": 21099 + }, + { + "epoch": 2.796554009277667, + "grad_norm": 4.9522318840026855, + "learning_rate": 5.6572395091675225e-08, + "loss": 0.1332, + "num_input_tokens_seen": 41287120, + "step": 21100 + }, + { + "epoch": 2.7966865473823725, + "grad_norm": 11.270883560180664, + "learning_rate": 5.64989879493183e-08, + "loss": 0.1348, + "num_input_tokens_seen": 41289152, + "step": 21101 + }, + { + "epoch": 2.7968190854870776, + "grad_norm": 1.6935293674468994, + "learning_rate": 5.6425627919346746e-08, + "loss": 0.0112, + "num_input_tokens_seen": 41290616, + "step": 21102 + }, + { + "epoch": 2.7969516235917826, + "grad_norm": 8.659612655639648, + "learning_rate": 5.635231500317445e-08, + "loss": 0.1384, + "num_input_tokens_seen": 41292912, + "step": 21103 + }, + { + "epoch": 2.7970841616964877, + "grad_norm": 9.90807056427002, + "learning_rate": 5.6279049202215553e-08, + "loss": 0.1649, + "num_input_tokens_seen": 41295760, + "step": 21104 + }, + { + "epoch": 2.7972166998011927, + "grad_norm": 3.3314456939697266, + "learning_rate": 5.620583051788198e-08, + "loss": 0.0261, + "num_input_tokens_seen": 41297728, + "step": 21105 + }, + { + "epoch": 2.797349237905898, + "grad_norm": 7.96833610534668, + "learning_rate": 5.613265895158621e-08, + "loss": 0.0752, + "num_input_tokens_seen": 41300112, + "step": 21106 + }, + { + "epoch": 2.7974817760106028, + "grad_norm": 0.2128216177225113, + "learning_rate": 5.605953450473878e-08, + "loss": 0.001, + "num_input_tokens_seen": 41301680, + "step": 21107 + }, + { + "epoch": 2.7976143141153083, + "grad_norm": 5.336821556091309, + "learning_rate": 5.5986457178749395e-08, + "loss": 0.0091, + "num_input_tokens_seen": 41304136, + "step": 21108 + }, + { + "epoch": 2.7977468522200133, + "grad_norm": 0.013649891130626202, + "learning_rate": 5.591342697502722e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41306096, + "step": 21109 + }, + { + "epoch": 2.7978793903247183, + "grad_norm": 0.033881302922964096, + "learning_rate": 5.584044389498e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41307648, + "step": 21110 + }, + { + "epoch": 2.7980119284294234, + "grad_norm": 0.006086244247853756, + "learning_rate": 5.576750794001551e-08, + "loss": 0.0, + "num_input_tokens_seen": 41310032, + "step": 21111 + }, + { + "epoch": 2.7981444665341284, + "grad_norm": 5.599295616149902, + "learning_rate": 5.569461911153928e-08, + "loss": 0.1362, + "num_input_tokens_seen": 41311648, + "step": 21112 + }, + { + "epoch": 2.798277004638834, + "grad_norm": 0.0022967602126300335, + "learning_rate": 5.562177741095742e-08, + "loss": 0.0, + "num_input_tokens_seen": 41313176, + "step": 21113 + }, + { + "epoch": 2.798409542743539, + "grad_norm": 0.0066515132784843445, + "learning_rate": 5.5548982839673803e-08, + "loss": 0.0, + "num_input_tokens_seen": 41314688, + "step": 21114 + }, + { + "epoch": 2.798542080848244, + "grad_norm": 0.22072042524814606, + "learning_rate": 5.5476235399092026e-08, + "loss": 0.0009, + "num_input_tokens_seen": 41316232, + "step": 21115 + }, + { + "epoch": 2.798674618952949, + "grad_norm": 4.813520431518555, + "learning_rate": 5.540353509061513e-08, + "loss": 0.0683, + "num_input_tokens_seen": 41320200, + "step": 21116 + }, + { + "epoch": 2.798807157057654, + "grad_norm": 0.03269220143556595, + "learning_rate": 5.533088191564423e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41322080, + "step": 21117 + }, + { + "epoch": 2.798939695162359, + "grad_norm": 3.635256052017212, + "learning_rate": 5.5258275875580415e-08, + "loss": 0.0115, + "num_input_tokens_seen": 41324184, + "step": 21118 + }, + { + "epoch": 2.799072233267064, + "grad_norm": 13.741676330566406, + "learning_rate": 5.518571697182395e-08, + "loss": 0.0822, + "num_input_tokens_seen": 41326496, + "step": 21119 + }, + { + "epoch": 2.7992047713717696, + "grad_norm": 0.18275916576385498, + "learning_rate": 5.5113205205772905e-08, + "loss": 0.0015, + "num_input_tokens_seen": 41329088, + "step": 21120 + }, + { + "epoch": 2.7993373094764746, + "grad_norm": 6.986661434173584, + "learning_rate": 5.504074057882641e-08, + "loss": 0.0361, + "num_input_tokens_seen": 41330992, + "step": 21121 + }, + { + "epoch": 2.7994698475811797, + "grad_norm": 2.232555627822876, + "learning_rate": 5.496832309238087e-08, + "loss": 0.0089, + "num_input_tokens_seen": 41333232, + "step": 21122 + }, + { + "epoch": 2.7996023856858847, + "grad_norm": 0.01850157231092453, + "learning_rate": 5.489595274783266e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41335448, + "step": 21123 + }, + { + "epoch": 2.7997349237905897, + "grad_norm": 0.04674866795539856, + "learning_rate": 5.482362954657761e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41337472, + "step": 21124 + }, + { + "epoch": 2.799867461895295, + "grad_norm": 10.575713157653809, + "learning_rate": 5.4751353490009594e-08, + "loss": 0.0665, + "num_input_tokens_seen": 41339416, + "step": 21125 + }, + { + "epoch": 2.8, + "grad_norm": 0.0015305984998121858, + "learning_rate": 5.4679124579522515e-08, + "loss": 0.0, + "num_input_tokens_seen": 41340528, + "step": 21126 + }, + { + "epoch": 2.8001325381047053, + "grad_norm": 2.921983242034912, + "learning_rate": 5.460694281650886e-08, + "loss": 0.0072, + "num_input_tokens_seen": 41342368, + "step": 21127 + }, + { + "epoch": 2.8002650762094103, + "grad_norm": 0.032284971326589584, + "learning_rate": 5.453480820236029e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41344808, + "step": 21128 + }, + { + "epoch": 2.8003976143141154, + "grad_norm": 14.392049789428711, + "learning_rate": 5.446272073846764e-08, + "loss": 0.1713, + "num_input_tokens_seen": 41346672, + "step": 21129 + }, + { + "epoch": 2.8005301524188204, + "grad_norm": 0.002186872297897935, + "learning_rate": 5.4390680426220644e-08, + "loss": 0.0, + "num_input_tokens_seen": 41347848, + "step": 21130 + }, + { + "epoch": 2.8006626905235255, + "grad_norm": 5.429933547973633, + "learning_rate": 5.431868726700873e-08, + "loss": 0.1056, + "num_input_tokens_seen": 41350160, + "step": 21131 + }, + { + "epoch": 2.8007952286282305, + "grad_norm": 7.646900653839111, + "learning_rate": 5.4246741262219685e-08, + "loss": 0.0231, + "num_input_tokens_seen": 41352320, + "step": 21132 + }, + { + "epoch": 2.8009277667329355, + "grad_norm": 2.3302557468414307, + "learning_rate": 5.417484241324072e-08, + "loss": 0.0056, + "num_input_tokens_seen": 41353544, + "step": 21133 + }, + { + "epoch": 2.801060304837641, + "grad_norm": 0.05373460799455643, + "learning_rate": 5.410299072145797e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41355824, + "step": 21134 + }, + { + "epoch": 2.801192842942346, + "grad_norm": 0.8139528632164001, + "learning_rate": 5.40311861882567e-08, + "loss": 0.006, + "num_input_tokens_seen": 41357480, + "step": 21135 + }, + { + "epoch": 2.801325381047051, + "grad_norm": 14.065767288208008, + "learning_rate": 5.3959428815021644e-08, + "loss": 0.3127, + "num_input_tokens_seen": 41359952, + "step": 21136 + }, + { + "epoch": 2.801457919151756, + "grad_norm": 4.053398132324219, + "learning_rate": 5.388771860313669e-08, + "loss": 0.0118, + "num_input_tokens_seen": 41362200, + "step": 21137 + }, + { + "epoch": 2.801590457256461, + "grad_norm": 0.04186026006937027, + "learning_rate": 5.381605555398378e-08, + "loss": 0.0003, + "num_input_tokens_seen": 41364232, + "step": 21138 + }, + { + "epoch": 2.801722995361166, + "grad_norm": 0.034189168363809586, + "learning_rate": 5.374443966894488e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41367536, + "step": 21139 + }, + { + "epoch": 2.8018555334658712, + "grad_norm": 6.118852615356445, + "learning_rate": 5.3672870949400544e-08, + "loss": 0.05, + "num_input_tokens_seen": 41369224, + "step": 21140 + }, + { + "epoch": 2.8019880715705767, + "grad_norm": 4.129786014556885, + "learning_rate": 5.3601349396730784e-08, + "loss": 0.0205, + "num_input_tokens_seen": 41371336, + "step": 21141 + }, + { + "epoch": 2.8021206096752818, + "grad_norm": 0.05202076956629753, + "learning_rate": 5.352987501231477e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41373136, + "step": 21142 + }, + { + "epoch": 2.802253147779987, + "grad_norm": 10.510772705078125, + "learning_rate": 5.345844779753084e-08, + "loss": 0.1063, + "num_input_tokens_seen": 41375912, + "step": 21143 + }, + { + "epoch": 2.802385685884692, + "grad_norm": 0.00344017893075943, + "learning_rate": 5.3387067753755686e-08, + "loss": 0.0, + "num_input_tokens_seen": 41378080, + "step": 21144 + }, + { + "epoch": 2.802518223989397, + "grad_norm": 0.05713677778840065, + "learning_rate": 5.331573488236569e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41379328, + "step": 21145 + }, + { + "epoch": 2.802650762094102, + "grad_norm": 0.03259960561990738, + "learning_rate": 5.324444918473615e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41380600, + "step": 21146 + }, + { + "epoch": 2.802783300198807, + "grad_norm": 0.005337974056601524, + "learning_rate": 5.317321066224124e-08, + "loss": 0.0, + "num_input_tokens_seen": 41382344, + "step": 21147 + }, + { + "epoch": 2.8029158383035124, + "grad_norm": 2.9660754203796387, + "learning_rate": 5.310201931625542e-08, + "loss": 0.0215, + "num_input_tokens_seen": 41384320, + "step": 21148 + }, + { + "epoch": 2.8030483764082175, + "grad_norm": 0.08560872077941895, + "learning_rate": 5.303087514815036e-08, + "loss": 0.0003, + "num_input_tokens_seen": 41386200, + "step": 21149 + }, + { + "epoch": 2.8031809145129225, + "grad_norm": 0.26001647114753723, + "learning_rate": 5.295977815929831e-08, + "loss": 0.001, + "num_input_tokens_seen": 41387824, + "step": 21150 + }, + { + "epoch": 2.8033134526176275, + "grad_norm": 0.021474014967679977, + "learning_rate": 5.2888728351069564e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41389944, + "step": 21151 + }, + { + "epoch": 2.8034459907223326, + "grad_norm": 0.004883387126028538, + "learning_rate": 5.28177257248344e-08, + "loss": 0.0, + "num_input_tokens_seen": 41391568, + "step": 21152 + }, + { + "epoch": 2.8035785288270376, + "grad_norm": 0.02623615972697735, + "learning_rate": 5.274677028196173e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41393992, + "step": 21153 + }, + { + "epoch": 2.8037110669317427, + "grad_norm": 8.970613479614258, + "learning_rate": 5.2675862023819636e-08, + "loss": 0.0371, + "num_input_tokens_seen": 41395920, + "step": 21154 + }, + { + "epoch": 2.803843605036448, + "grad_norm": 12.674955368041992, + "learning_rate": 5.260500095177534e-08, + "loss": 0.235, + "num_input_tokens_seen": 41398048, + "step": 21155 + }, + { + "epoch": 2.803976143141153, + "grad_norm": 0.03006616048514843, + "learning_rate": 5.253418706719471e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41400208, + "step": 21156 + }, + { + "epoch": 2.804108681245858, + "grad_norm": 0.9721900820732117, + "learning_rate": 5.246342037144359e-08, + "loss": 0.0029, + "num_input_tokens_seen": 41402192, + "step": 21157 + }, + { + "epoch": 2.8042412193505633, + "grad_norm": 0.6990938782691956, + "learning_rate": 5.239270086588616e-08, + "loss": 0.0044, + "num_input_tokens_seen": 41403848, + "step": 21158 + }, + { + "epoch": 2.8043737574552683, + "grad_norm": 5.602118015289307, + "learning_rate": 5.232202855188606e-08, + "loss": 0.0319, + "num_input_tokens_seen": 41405936, + "step": 21159 + }, + { + "epoch": 2.8045062955599733, + "grad_norm": 0.005795423407107592, + "learning_rate": 5.2251403430805816e-08, + "loss": 0.0, + "num_input_tokens_seen": 41407456, + "step": 21160 + }, + { + "epoch": 2.8046388336646784, + "grad_norm": 57.483123779296875, + "learning_rate": 5.2180825504007106e-08, + "loss": 0.1406, + "num_input_tokens_seen": 41409024, + "step": 21161 + }, + { + "epoch": 2.804771371769384, + "grad_norm": 0.004263599403202534, + "learning_rate": 5.211029477285051e-08, + "loss": 0.0, + "num_input_tokens_seen": 41410528, + "step": 21162 + }, + { + "epoch": 2.804903909874089, + "grad_norm": 0.0070289019495248795, + "learning_rate": 5.203981123869634e-08, + "loss": 0.0, + "num_input_tokens_seen": 41413056, + "step": 21163 + }, + { + "epoch": 2.805036447978794, + "grad_norm": 17.672651290893555, + "learning_rate": 5.196937490290349e-08, + "loss": 0.0609, + "num_input_tokens_seen": 41415480, + "step": 21164 + }, + { + "epoch": 2.805168986083499, + "grad_norm": 9.004027366638184, + "learning_rate": 5.18989857668295e-08, + "loss": 0.0467, + "num_input_tokens_seen": 41418496, + "step": 21165 + }, + { + "epoch": 2.805301524188204, + "grad_norm": 3.9029955863952637, + "learning_rate": 5.182864383183217e-08, + "loss": 0.0087, + "num_input_tokens_seen": 41420448, + "step": 21166 + }, + { + "epoch": 2.8054340622929095, + "grad_norm": 5.305731773376465, + "learning_rate": 5.175834909926736e-08, + "loss": 0.0848, + "num_input_tokens_seen": 41423072, + "step": 21167 + }, + { + "epoch": 2.805566600397614, + "grad_norm": 0.0071297441609203815, + "learning_rate": 5.1688101570490926e-08, + "loss": 0.0, + "num_input_tokens_seen": 41424376, + "step": 21168 + }, + { + "epoch": 2.8056991385023196, + "grad_norm": 0.0024708209093660116, + "learning_rate": 5.1617901246856785e-08, + "loss": 0.0, + "num_input_tokens_seen": 41425592, + "step": 21169 + }, + { + "epoch": 2.8058316766070246, + "grad_norm": 5.7150559425354, + "learning_rate": 5.1547748129718025e-08, + "loss": 0.0466, + "num_input_tokens_seen": 41427448, + "step": 21170 + }, + { + "epoch": 2.8059642147117296, + "grad_norm": 11.907758712768555, + "learning_rate": 5.147764222042828e-08, + "loss": 0.1049, + "num_input_tokens_seen": 41429320, + "step": 21171 + }, + { + "epoch": 2.8060967528164347, + "grad_norm": 1.3341501951217651, + "learning_rate": 5.14075835203387e-08, + "loss": 0.0077, + "num_input_tokens_seen": 41431096, + "step": 21172 + }, + { + "epoch": 2.8062292909211397, + "grad_norm": 0.06699223816394806, + "learning_rate": 5.1337572030800145e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41433912, + "step": 21173 + }, + { + "epoch": 2.806361829025845, + "grad_norm": 1.0817219018936157, + "learning_rate": 5.1267607753162364e-08, + "loss": 0.0011, + "num_input_tokens_seen": 41435264, + "step": 21174 + }, + { + "epoch": 2.80649436713055, + "grad_norm": 8.84842586517334, + "learning_rate": 5.119769068877456e-08, + "loss": 0.2363, + "num_input_tokens_seen": 41437840, + "step": 21175 + }, + { + "epoch": 2.8066269052352553, + "grad_norm": 0.019686629995703697, + "learning_rate": 5.1127820838984263e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41440560, + "step": 21176 + }, + { + "epoch": 2.8067594433399603, + "grad_norm": 3.311279535293579, + "learning_rate": 5.105799820513902e-08, + "loss": 0.0218, + "num_input_tokens_seen": 41442768, + "step": 21177 + }, + { + "epoch": 2.8068919814446653, + "grad_norm": 0.1113070696592331, + "learning_rate": 5.098822278858523e-08, + "loss": 0.0004, + "num_input_tokens_seen": 41445320, + "step": 21178 + }, + { + "epoch": 2.8070245195493704, + "grad_norm": 0.006468723528087139, + "learning_rate": 5.091849459066794e-08, + "loss": 0.0, + "num_input_tokens_seen": 41447552, + "step": 21179 + }, + { + "epoch": 2.8071570576540754, + "grad_norm": 17.911327362060547, + "learning_rate": 5.084881361273164e-08, + "loss": 0.2814, + "num_input_tokens_seen": 41449624, + "step": 21180 + }, + { + "epoch": 2.807289595758781, + "grad_norm": 4.388555526733398, + "learning_rate": 5.077917985611969e-08, + "loss": 0.0395, + "num_input_tokens_seen": 41451664, + "step": 21181 + }, + { + "epoch": 2.8074221338634855, + "grad_norm": 5.563365459442139, + "learning_rate": 5.07095933221749e-08, + "loss": 0.097, + "num_input_tokens_seen": 41453568, + "step": 21182 + }, + { + "epoch": 2.807554671968191, + "grad_norm": 0.25721418857574463, + "learning_rate": 5.064005401223843e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41456304, + "step": 21183 + }, + { + "epoch": 2.807687210072896, + "grad_norm": 0.8028609156608582, + "learning_rate": 5.0570561927651974e-08, + "loss": 0.0038, + "num_input_tokens_seen": 41458208, + "step": 21184 + }, + { + "epoch": 2.807819748177601, + "grad_norm": 9.616104125976562, + "learning_rate": 5.050111706975475e-08, + "loss": 0.0279, + "num_input_tokens_seen": 41459728, + "step": 21185 + }, + { + "epoch": 2.807952286282306, + "grad_norm": 0.04375437647104263, + "learning_rate": 5.0431719439885674e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41462752, + "step": 21186 + }, + { + "epoch": 2.808084824387011, + "grad_norm": 0.001760742161422968, + "learning_rate": 5.036236903938285e-08, + "loss": 0.0, + "num_input_tokens_seen": 41464248, + "step": 21187 + }, + { + "epoch": 2.8082173624917166, + "grad_norm": 11.687002182006836, + "learning_rate": 5.029306586958355e-08, + "loss": 0.1101, + "num_input_tokens_seen": 41467456, + "step": 21188 + }, + { + "epoch": 2.808349900596421, + "grad_norm": 0.5118199586868286, + "learning_rate": 5.0223809931823906e-08, + "loss": 0.0018, + "num_input_tokens_seen": 41469696, + "step": 21189 + }, + { + "epoch": 2.8084824387011267, + "grad_norm": 12.227437019348145, + "learning_rate": 5.015460122743926e-08, + "loss": 0.2561, + "num_input_tokens_seen": 41472120, + "step": 21190 + }, + { + "epoch": 2.8086149768058317, + "grad_norm": 0.019973209127783775, + "learning_rate": 5.0085439757764085e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41473512, + "step": 21191 + }, + { + "epoch": 2.8087475149105368, + "grad_norm": 0.022125286981463432, + "learning_rate": 5.001632552413177e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41475136, + "step": 21192 + }, + { + "epoch": 2.808880053015242, + "grad_norm": 4.517755031585693, + "learning_rate": 4.994725852787458e-08, + "loss": 0.0572, + "num_input_tokens_seen": 41476720, + "step": 21193 + }, + { + "epoch": 2.809012591119947, + "grad_norm": 0.03753714635968208, + "learning_rate": 4.9878238770324494e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41479128, + "step": 21194 + }, + { + "epoch": 2.8091451292246523, + "grad_norm": 0.012316010892391205, + "learning_rate": 4.980926625281213e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41480760, + "step": 21195 + }, + { + "epoch": 2.8092776673293574, + "grad_norm": 4.032469272613525, + "learning_rate": 4.97403409766678e-08, + "loss": 0.0349, + "num_input_tokens_seen": 41482576, + "step": 21196 + }, + { + "epoch": 2.8094102054340624, + "grad_norm": 0.012639536522328854, + "learning_rate": 4.9671462943219895e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41484592, + "step": 21197 + }, + { + "epoch": 2.8095427435387674, + "grad_norm": 4.469357967376709, + "learning_rate": 4.9602632153796235e-08, + "loss": 0.0561, + "num_input_tokens_seen": 41485800, + "step": 21198 + }, + { + "epoch": 2.8096752816434725, + "grad_norm": 0.02814772166311741, + "learning_rate": 4.953384860972465e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41488176, + "step": 21199 + }, + { + "epoch": 2.8098078197481775, + "grad_norm": 0.008501028642058372, + "learning_rate": 4.9465112312330756e-08, + "loss": 0.0, + "num_input_tokens_seen": 41489520, + "step": 21200 + }, + { + "epoch": 2.8099403578528825, + "grad_norm": 4.758168697357178, + "learning_rate": 4.9396423262940143e-08, + "loss": 0.0157, + "num_input_tokens_seen": 41491520, + "step": 21201 + }, + { + "epoch": 2.810072895957588, + "grad_norm": 0.008862193673849106, + "learning_rate": 4.932778146287703e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41494960, + "step": 21202 + }, + { + "epoch": 2.810205434062293, + "grad_norm": 0.06783480197191238, + "learning_rate": 4.925918691346482e-08, + "loss": 0.0004, + "num_input_tokens_seen": 41496736, + "step": 21203 + }, + { + "epoch": 2.810337972166998, + "grad_norm": 1.5066395998001099, + "learning_rate": 4.9190639616026324e-08, + "loss": 0.009, + "num_input_tokens_seen": 41498720, + "step": 21204 + }, + { + "epoch": 2.810470510271703, + "grad_norm": 2.028540849685669, + "learning_rate": 4.912213957188328e-08, + "loss": 0.0053, + "num_input_tokens_seen": 41500800, + "step": 21205 + }, + { + "epoch": 2.810603048376408, + "grad_norm": 0.027135441079735756, + "learning_rate": 4.9053686782355724e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41502856, + "step": 21206 + }, + { + "epoch": 2.810735586481113, + "grad_norm": 5.882522106170654, + "learning_rate": 4.8985281248764006e-08, + "loss": 0.0677, + "num_input_tokens_seen": 41505528, + "step": 21207 + }, + { + "epoch": 2.8108681245858183, + "grad_norm": 0.051870886236429214, + "learning_rate": 4.891692297242678e-08, + "loss": 0.0003, + "num_input_tokens_seen": 41507496, + "step": 21208 + }, + { + "epoch": 2.8110006626905237, + "grad_norm": 3.69916033744812, + "learning_rate": 4.884861195466245e-08, + "loss": 0.0218, + "num_input_tokens_seen": 41510120, + "step": 21209 + }, + { + "epoch": 2.8111332007952288, + "grad_norm": 0.010116532444953918, + "learning_rate": 4.878034819678801e-08, + "loss": 0.0, + "num_input_tokens_seen": 41512480, + "step": 21210 + }, + { + "epoch": 2.811265738899934, + "grad_norm": 0.03615608438849449, + "learning_rate": 4.871213170011935e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41514264, + "step": 21211 + }, + { + "epoch": 2.811398277004639, + "grad_norm": 3.6562039852142334, + "learning_rate": 4.864396246597153e-08, + "loss": 0.0269, + "num_input_tokens_seen": 41515896, + "step": 21212 + }, + { + "epoch": 2.811530815109344, + "grad_norm": 6.809515476226807, + "learning_rate": 4.8575840495659335e-08, + "loss": 0.1403, + "num_input_tokens_seen": 41518136, + "step": 21213 + }, + { + "epoch": 2.811663353214049, + "grad_norm": 18.032827377319336, + "learning_rate": 4.850776579049615e-08, + "loss": 0.1232, + "num_input_tokens_seen": 41520304, + "step": 21214 + }, + { + "epoch": 2.811795891318754, + "grad_norm": 0.3632330596446991, + "learning_rate": 4.843973835179455e-08, + "loss": 0.0008, + "num_input_tokens_seen": 41522256, + "step": 21215 + }, + { + "epoch": 2.8119284294234594, + "grad_norm": 11.840888023376465, + "learning_rate": 4.837175818086598e-08, + "loss": 0.1179, + "num_input_tokens_seen": 41523456, + "step": 21216 + }, + { + "epoch": 2.8120609675281645, + "grad_norm": 4.350199222564697, + "learning_rate": 4.8303825279021336e-08, + "loss": 0.0243, + "num_input_tokens_seen": 41525304, + "step": 21217 + }, + { + "epoch": 2.8121935056328695, + "grad_norm": 0.08766380697488785, + "learning_rate": 4.823593964757012e-08, + "loss": 0.0004, + "num_input_tokens_seen": 41527112, + "step": 21218 + }, + { + "epoch": 2.8123260437375746, + "grad_norm": 0.14347699284553528, + "learning_rate": 4.816810128782101e-08, + "loss": 0.0004, + "num_input_tokens_seen": 41528528, + "step": 21219 + }, + { + "epoch": 2.8124585818422796, + "grad_norm": 5.1562180519104, + "learning_rate": 4.810031020108297e-08, + "loss": 0.1279, + "num_input_tokens_seen": 41530616, + "step": 21220 + }, + { + "epoch": 2.8125911199469846, + "grad_norm": 0.005039736162871122, + "learning_rate": 4.803256638866216e-08, + "loss": 0.0, + "num_input_tokens_seen": 41531696, + "step": 21221 + }, + { + "epoch": 2.8127236580516897, + "grad_norm": 13.648358345031738, + "learning_rate": 4.7964869851865046e-08, + "loss": 0.1187, + "num_input_tokens_seen": 41533176, + "step": 21222 + }, + { + "epoch": 2.812856196156395, + "grad_norm": 13.263527870178223, + "learning_rate": 4.789722059199669e-08, + "loss": 0.2443, + "num_input_tokens_seen": 41534920, + "step": 21223 + }, + { + "epoch": 2.8129887342611, + "grad_norm": 0.06892092525959015, + "learning_rate": 4.782961861036134e-08, + "loss": 0.0003, + "num_input_tokens_seen": 41536848, + "step": 21224 + }, + { + "epoch": 2.8131212723658052, + "grad_norm": 0.020196134224534035, + "learning_rate": 4.776206390826293e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41538576, + "step": 21225 + }, + { + "epoch": 2.8132538104705103, + "grad_norm": 0.00526905944570899, + "learning_rate": 4.769455648700377e-08, + "loss": 0.0, + "num_input_tokens_seen": 41540400, + "step": 21226 + }, + { + "epoch": 2.8133863485752153, + "grad_norm": 17.075319290161133, + "learning_rate": 4.7627096347885316e-08, + "loss": 0.1684, + "num_input_tokens_seen": 41542064, + "step": 21227 + }, + { + "epoch": 2.8135188866799203, + "grad_norm": 4.7833333015441895, + "learning_rate": 4.7559683492208184e-08, + "loss": 0.0905, + "num_input_tokens_seen": 41543720, + "step": 21228 + }, + { + "epoch": 2.8136514247846254, + "grad_norm": 0.006947199814021587, + "learning_rate": 4.74923179212719e-08, + "loss": 0.0, + "num_input_tokens_seen": 41546040, + "step": 21229 + }, + { + "epoch": 2.813783962889331, + "grad_norm": 0.12753786146640778, + "learning_rate": 4.74249996363757e-08, + "loss": 0.0009, + "num_input_tokens_seen": 41548120, + "step": 21230 + }, + { + "epoch": 2.813916500994036, + "grad_norm": 11.77702808380127, + "learning_rate": 4.7357728638817716e-08, + "loss": 0.0916, + "num_input_tokens_seen": 41549784, + "step": 21231 + }, + { + "epoch": 2.814049039098741, + "grad_norm": 0.006870923563838005, + "learning_rate": 4.72905049298944e-08, + "loss": 0.0, + "num_input_tokens_seen": 41551040, + "step": 21232 + }, + { + "epoch": 2.814181577203446, + "grad_norm": 0.052990805357694626, + "learning_rate": 4.72233285109025e-08, + "loss": 0.0003, + "num_input_tokens_seen": 41552504, + "step": 21233 + }, + { + "epoch": 2.814314115308151, + "grad_norm": 4.455300331115723, + "learning_rate": 4.7156199383136814e-08, + "loss": 0.0683, + "num_input_tokens_seen": 41554696, + "step": 21234 + }, + { + "epoch": 2.814446653412856, + "grad_norm": 1.5458555221557617, + "learning_rate": 4.708911754789158e-08, + "loss": 0.0257, + "num_input_tokens_seen": 41556824, + "step": 21235 + }, + { + "epoch": 2.814579191517561, + "grad_norm": 0.002492510713636875, + "learning_rate": 4.70220830064605e-08, + "loss": 0.0, + "num_input_tokens_seen": 41558136, + "step": 21236 + }, + { + "epoch": 2.8147117296222666, + "grad_norm": 0.006433361209928989, + "learning_rate": 4.695509576013585e-08, + "loss": 0.0, + "num_input_tokens_seen": 41560368, + "step": 21237 + }, + { + "epoch": 2.8148442677269716, + "grad_norm": 0.04005099833011627, + "learning_rate": 4.688815581020911e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41563432, + "step": 21238 + }, + { + "epoch": 2.8149768058316766, + "grad_norm": 7.580732345581055, + "learning_rate": 4.6821263157971195e-08, + "loss": 0.1198, + "num_input_tokens_seen": 41565808, + "step": 21239 + }, + { + "epoch": 2.8151093439363817, + "grad_norm": 9.633988380432129, + "learning_rate": 4.675441780471163e-08, + "loss": 0.0349, + "num_input_tokens_seen": 41568112, + "step": 21240 + }, + { + "epoch": 2.8152418820410867, + "grad_norm": 0.05486686900258064, + "learning_rate": 4.668761975171937e-08, + "loss": 0.0003, + "num_input_tokens_seen": 41570264, + "step": 21241 + }, + { + "epoch": 2.8153744201457918, + "grad_norm": 0.008673026226460934, + "learning_rate": 4.662086900028201e-08, + "loss": 0.0, + "num_input_tokens_seen": 41571336, + "step": 21242 + }, + { + "epoch": 2.815506958250497, + "grad_norm": 3.4414398670196533, + "learning_rate": 4.655416555168713e-08, + "loss": 0.0253, + "num_input_tokens_seen": 41573632, + "step": 21243 + }, + { + "epoch": 2.8156394963552023, + "grad_norm": 0.015590169467031956, + "learning_rate": 4.6487509407220074e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41576104, + "step": 21244 + }, + { + "epoch": 2.8157720344599073, + "grad_norm": 0.007496792823076248, + "learning_rate": 4.642090056816678e-08, + "loss": 0.0, + "num_input_tokens_seen": 41577904, + "step": 21245 + }, + { + "epoch": 2.8159045725646124, + "grad_norm": 16.19715690612793, + "learning_rate": 4.6354339035811215e-08, + "loss": 0.1702, + "num_input_tokens_seen": 41579080, + "step": 21246 + }, + { + "epoch": 2.8160371106693174, + "grad_norm": 1.3593823909759521, + "learning_rate": 4.628782481143651e-08, + "loss": 0.0076, + "num_input_tokens_seen": 41580960, + "step": 21247 + }, + { + "epoch": 2.8161696487740224, + "grad_norm": 0.04407981410622597, + "learning_rate": 4.622135789632526e-08, + "loss": 0.0003, + "num_input_tokens_seen": 41582512, + "step": 21248 + }, + { + "epoch": 2.816302186878728, + "grad_norm": 0.07390999048948288, + "learning_rate": 4.6154938291758936e-08, + "loss": 0.0003, + "num_input_tokens_seen": 41585016, + "step": 21249 + }, + { + "epoch": 2.8164347249834325, + "grad_norm": 5.110679626464844, + "learning_rate": 4.6088565999018174e-08, + "loss": 0.0414, + "num_input_tokens_seen": 41587288, + "step": 21250 + }, + { + "epoch": 2.816567263088138, + "grad_norm": 6.790787220001221, + "learning_rate": 4.602224101938307e-08, + "loss": 0.034, + "num_input_tokens_seen": 41590056, + "step": 21251 + }, + { + "epoch": 2.816699801192843, + "grad_norm": 0.01827739179134369, + "learning_rate": 4.595596335413177e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41592288, + "step": 21252 + }, + { + "epoch": 2.816832339297548, + "grad_norm": 0.005573027767241001, + "learning_rate": 4.588973300454241e-08, + "loss": 0.0, + "num_input_tokens_seen": 41594248, + "step": 21253 + }, + { + "epoch": 2.816964877402253, + "grad_norm": 2.179642915725708, + "learning_rate": 4.582354997189231e-08, + "loss": 0.0043, + "num_input_tokens_seen": 41595848, + "step": 21254 + }, + { + "epoch": 2.817097415506958, + "grad_norm": 4.831472396850586, + "learning_rate": 4.575741425745683e-08, + "loss": 0.0703, + "num_input_tokens_seen": 41598264, + "step": 21255 + }, + { + "epoch": 2.8172299536116636, + "grad_norm": 0.007969295606017113, + "learning_rate": 4.56913258625119e-08, + "loss": 0.0, + "num_input_tokens_seen": 41599344, + "step": 21256 + }, + { + "epoch": 2.817362491716368, + "grad_norm": 3.166008234024048, + "learning_rate": 4.5625284788331226e-08, + "loss": 0.0174, + "num_input_tokens_seen": 41601000, + "step": 21257 + }, + { + "epoch": 2.8174950298210737, + "grad_norm": 3.363495111465454, + "learning_rate": 4.5559291036188227e-08, + "loss": 0.0404, + "num_input_tokens_seen": 41603136, + "step": 21258 + }, + { + "epoch": 2.8176275679257787, + "grad_norm": 0.0033262053038924932, + "learning_rate": 4.5493344607355214e-08, + "loss": 0.0, + "num_input_tokens_seen": 41604808, + "step": 21259 + }, + { + "epoch": 2.8177601060304838, + "grad_norm": 0.0867895856499672, + "learning_rate": 4.542744550310368e-08, + "loss": 0.0003, + "num_input_tokens_seen": 41606848, + "step": 21260 + }, + { + "epoch": 2.817892644135189, + "grad_norm": 0.025034377351403236, + "learning_rate": 4.5361593724704554e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41608472, + "step": 21261 + }, + { + "epoch": 2.818025182239894, + "grad_norm": 0.009451201185584068, + "learning_rate": 4.529578927342737e-08, + "loss": 0.0, + "num_input_tokens_seen": 41609808, + "step": 21262 + }, + { + "epoch": 2.8181577203445993, + "grad_norm": 0.005809496156871319, + "learning_rate": 4.523003215054056e-08, + "loss": 0.0, + "num_input_tokens_seen": 41611984, + "step": 21263 + }, + { + "epoch": 2.818290258449304, + "grad_norm": 8.191415786743164, + "learning_rate": 4.5164322357312276e-08, + "loss": 0.1402, + "num_input_tokens_seen": 41615344, + "step": 21264 + }, + { + "epoch": 2.8184227965540094, + "grad_norm": 0.001972722355276346, + "learning_rate": 4.509865989500928e-08, + "loss": 0.0, + "num_input_tokens_seen": 41617368, + "step": 21265 + }, + { + "epoch": 2.8185553346587144, + "grad_norm": 2.6391422748565674, + "learning_rate": 4.5033044764897503e-08, + "loss": 0.0202, + "num_input_tokens_seen": 41620080, + "step": 21266 + }, + { + "epoch": 2.8186878727634195, + "grad_norm": 4.681728839874268, + "learning_rate": 4.496747696824261e-08, + "loss": 0.0298, + "num_input_tokens_seen": 41621688, + "step": 21267 + }, + { + "epoch": 2.8188204108681245, + "grad_norm": 1.3220518827438354, + "learning_rate": 4.49019565063083e-08, + "loss": 0.0039, + "num_input_tokens_seen": 41623664, + "step": 21268 + }, + { + "epoch": 2.8189529489728296, + "grad_norm": 2.9818472862243652, + "learning_rate": 4.483648338035801e-08, + "loss": 0.0668, + "num_input_tokens_seen": 41625848, + "step": 21269 + }, + { + "epoch": 2.819085487077535, + "grad_norm": 0.0044041285291314125, + "learning_rate": 4.477105759165379e-08, + "loss": 0.0, + "num_input_tokens_seen": 41627408, + "step": 21270 + }, + { + "epoch": 2.8192180251822396, + "grad_norm": 0.0022272781934589148, + "learning_rate": 4.470567914145768e-08, + "loss": 0.0, + "num_input_tokens_seen": 41628464, + "step": 21271 + }, + { + "epoch": 2.819350563286945, + "grad_norm": 8.511528968811035, + "learning_rate": 4.464034803102951e-08, + "loss": 0.0525, + "num_input_tokens_seen": 41630464, + "step": 21272 + }, + { + "epoch": 2.81948310139165, + "grad_norm": 0.007058646529912949, + "learning_rate": 4.4575064261629665e-08, + "loss": 0.0, + "num_input_tokens_seen": 41632064, + "step": 21273 + }, + { + "epoch": 2.819615639496355, + "grad_norm": 1.1773110628128052, + "learning_rate": 4.4509827834516296e-08, + "loss": 0.0075, + "num_input_tokens_seen": 41634360, + "step": 21274 + }, + { + "epoch": 2.8197481776010602, + "grad_norm": 0.005149673204869032, + "learning_rate": 4.4444638750947564e-08, + "loss": 0.0, + "num_input_tokens_seen": 41635688, + "step": 21275 + }, + { + "epoch": 2.8198807157057653, + "grad_norm": 0.001669332617893815, + "learning_rate": 4.4379497012179976e-08, + "loss": 0.0, + "num_input_tokens_seen": 41637104, + "step": 21276 + }, + { + "epoch": 2.8200132538104707, + "grad_norm": 0.4741596579551697, + "learning_rate": 4.431440261946946e-08, + "loss": 0.0011, + "num_input_tokens_seen": 41638936, + "step": 21277 + }, + { + "epoch": 2.8201457919151753, + "grad_norm": 0.005232417490333319, + "learning_rate": 4.424935557407195e-08, + "loss": 0.0, + "num_input_tokens_seen": 41640832, + "step": 21278 + }, + { + "epoch": 2.820278330019881, + "grad_norm": 0.019071562215685844, + "learning_rate": 4.4184355877240906e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41642600, + "step": 21279 + }, + { + "epoch": 2.820410868124586, + "grad_norm": 3.601416826248169, + "learning_rate": 4.4119403530229475e-08, + "loss": 0.0284, + "num_input_tokens_seen": 41644408, + "step": 21280 + }, + { + "epoch": 2.820543406229291, + "grad_norm": 0.1430717557668686, + "learning_rate": 4.4054498534290005e-08, + "loss": 0.0007, + "num_input_tokens_seen": 41645992, + "step": 21281 + }, + { + "epoch": 2.820675944333996, + "grad_norm": 0.1317349672317505, + "learning_rate": 4.3989640890674536e-08, + "loss": 0.0005, + "num_input_tokens_seen": 41648496, + "step": 21282 + }, + { + "epoch": 2.820808482438701, + "grad_norm": 0.006044508423656225, + "learning_rate": 4.392483060063263e-08, + "loss": 0.0, + "num_input_tokens_seen": 41649720, + "step": 21283 + }, + { + "epoch": 2.8209410205434065, + "grad_norm": 0.003988482989370823, + "learning_rate": 4.386006766541467e-08, + "loss": 0.0, + "num_input_tokens_seen": 41651992, + "step": 21284 + }, + { + "epoch": 2.8210735586481115, + "grad_norm": 8.106592178344727, + "learning_rate": 4.379535208626884e-08, + "loss": 0.1144, + "num_input_tokens_seen": 41653992, + "step": 21285 + }, + { + "epoch": 2.8212060967528165, + "grad_norm": 8.010564804077148, + "learning_rate": 4.373068386444301e-08, + "loss": 0.053, + "num_input_tokens_seen": 41655560, + "step": 21286 + }, + { + "epoch": 2.8213386348575216, + "grad_norm": 5.563754081726074, + "learning_rate": 4.3666063001184535e-08, + "loss": 0.0279, + "num_input_tokens_seen": 41657584, + "step": 21287 + }, + { + "epoch": 2.8214711729622266, + "grad_norm": 0.007190825883299112, + "learning_rate": 4.360148949773851e-08, + "loss": 0.0, + "num_input_tokens_seen": 41659704, + "step": 21288 + }, + { + "epoch": 2.8216037110669316, + "grad_norm": 0.5484961867332458, + "learning_rate": 4.3536963355350345e-08, + "loss": 0.002, + "num_input_tokens_seen": 41661472, + "step": 21289 + }, + { + "epoch": 2.8217362491716367, + "grad_norm": 0.05635608732700348, + "learning_rate": 4.3472484575264315e-08, + "loss": 0.0003, + "num_input_tokens_seen": 41663216, + "step": 21290 + }, + { + "epoch": 2.821868787276342, + "grad_norm": 8.621072769165039, + "learning_rate": 4.3408053158723315e-08, + "loss": 0.0541, + "num_input_tokens_seen": 41664952, + "step": 21291 + }, + { + "epoch": 2.822001325381047, + "grad_norm": 5.946181297302246, + "learning_rate": 4.334366910696969e-08, + "loss": 0.0463, + "num_input_tokens_seen": 41666880, + "step": 21292 + }, + { + "epoch": 2.8221338634857522, + "grad_norm": 0.13684526085853577, + "learning_rate": 4.3279332421245214e-08, + "loss": 0.0006, + "num_input_tokens_seen": 41669272, + "step": 21293 + }, + { + "epoch": 2.8222664015904573, + "grad_norm": 5.767205238342285, + "learning_rate": 4.321504310279001e-08, + "loss": 0.0483, + "num_input_tokens_seen": 41671624, + "step": 21294 + }, + { + "epoch": 2.8223989396951623, + "grad_norm": 0.02296694554388523, + "learning_rate": 4.315080115284309e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41673144, + "step": 21295 + }, + { + "epoch": 2.8225314777998673, + "grad_norm": 0.0019314229721203446, + "learning_rate": 4.3086606572644016e-08, + "loss": 0.0, + "num_input_tokens_seen": 41674696, + "step": 21296 + }, + { + "epoch": 2.8226640159045724, + "grad_norm": 5.506106853485107, + "learning_rate": 4.302245936343014e-08, + "loss": 0.0147, + "num_input_tokens_seen": 41677064, + "step": 21297 + }, + { + "epoch": 2.822796554009278, + "grad_norm": 0.2841896712779999, + "learning_rate": 4.2958359526438235e-08, + "loss": 0.0008, + "num_input_tokens_seen": 41679952, + "step": 21298 + }, + { + "epoch": 2.822929092113983, + "grad_norm": 6.363249778747559, + "learning_rate": 4.2894307062903984e-08, + "loss": 0.0063, + "num_input_tokens_seen": 41682360, + "step": 21299 + }, + { + "epoch": 2.823061630218688, + "grad_norm": 0.025439860299229622, + "learning_rate": 4.2830301974062796e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41684904, + "step": 21300 + }, + { + "epoch": 2.823194168323393, + "grad_norm": 3.287243366241455, + "learning_rate": 4.276634426114812e-08, + "loss": 0.0283, + "num_input_tokens_seen": 41687144, + "step": 21301 + }, + { + "epoch": 2.823326706428098, + "grad_norm": 0.7168206572532654, + "learning_rate": 4.2702433925393696e-08, + "loss": 0.005, + "num_input_tokens_seen": 41689640, + "step": 21302 + }, + { + "epoch": 2.823459244532803, + "grad_norm": 0.1533549726009369, + "learning_rate": 4.263857096803159e-08, + "loss": 0.0004, + "num_input_tokens_seen": 41691392, + "step": 21303 + }, + { + "epoch": 2.823591782637508, + "grad_norm": 0.0039391000755131245, + "learning_rate": 4.257475539029332e-08, + "loss": 0.0, + "num_input_tokens_seen": 41692536, + "step": 21304 + }, + { + "epoch": 2.8237243207422136, + "grad_norm": 0.6914017200469971, + "learning_rate": 4.251098719340874e-08, + "loss": 0.0019, + "num_input_tokens_seen": 41694456, + "step": 21305 + }, + { + "epoch": 2.8238568588469186, + "grad_norm": 0.08038034290075302, + "learning_rate": 4.244726637860769e-08, + "loss": 0.0004, + "num_input_tokens_seen": 41696112, + "step": 21306 + }, + { + "epoch": 2.8239893969516237, + "grad_norm": 15.284549713134766, + "learning_rate": 4.238359294711863e-08, + "loss": 0.2504, + "num_input_tokens_seen": 41698728, + "step": 21307 + }, + { + "epoch": 2.8241219350563287, + "grad_norm": 2.6776864528656006, + "learning_rate": 4.231996690016948e-08, + "loss": 0.0087, + "num_input_tokens_seen": 41700464, + "step": 21308 + }, + { + "epoch": 2.8242544731610337, + "grad_norm": 0.01853269338607788, + "learning_rate": 4.225638823898703e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41701808, + "step": 21309 + }, + { + "epoch": 2.8243870112657388, + "grad_norm": 5.682422161102295, + "learning_rate": 4.219285696479669e-08, + "loss": 0.0221, + "num_input_tokens_seen": 41703568, + "step": 21310 + }, + { + "epoch": 2.824519549370444, + "grad_norm": 0.006058938801288605, + "learning_rate": 4.212937307882386e-08, + "loss": 0.0, + "num_input_tokens_seen": 41705352, + "step": 21311 + }, + { + "epoch": 2.8246520874751493, + "grad_norm": 0.004415520001202822, + "learning_rate": 4.206593658229174e-08, + "loss": 0.0, + "num_input_tokens_seen": 41706560, + "step": 21312 + }, + { + "epoch": 2.8247846255798543, + "grad_norm": 8.943802833557129, + "learning_rate": 4.200254747642435e-08, + "loss": 0.0782, + "num_input_tokens_seen": 41708704, + "step": 21313 + }, + { + "epoch": 2.8249171636845594, + "grad_norm": 14.423078536987305, + "learning_rate": 4.1939205762443756e-08, + "loss": 0.164, + "num_input_tokens_seen": 41711392, + "step": 21314 + }, + { + "epoch": 2.8250497017892644, + "grad_norm": 0.022188182920217514, + "learning_rate": 4.187591144157066e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41712616, + "step": 21315 + }, + { + "epoch": 2.8251822398939694, + "grad_norm": 0.9098058342933655, + "learning_rate": 4.1812664515026026e-08, + "loss": 0.0056, + "num_input_tokens_seen": 41715384, + "step": 21316 + }, + { + "epoch": 2.8253147779986745, + "grad_norm": 5.674572467803955, + "learning_rate": 4.1749464984028874e-08, + "loss": 0.0126, + "num_input_tokens_seen": 41717576, + "step": 21317 + }, + { + "epoch": 2.8254473161033795, + "grad_norm": 0.2418396919965744, + "learning_rate": 4.168631284979796e-08, + "loss": 0.0014, + "num_input_tokens_seen": 41718912, + "step": 21318 + }, + { + "epoch": 2.825579854208085, + "grad_norm": 18.161022186279297, + "learning_rate": 4.162320811355064e-08, + "loss": 0.1263, + "num_input_tokens_seen": 41721472, + "step": 21319 + }, + { + "epoch": 2.82571239231279, + "grad_norm": 0.3156079649925232, + "learning_rate": 4.156015077650399e-08, + "loss": 0.0022, + "num_input_tokens_seen": 41723304, + "step": 21320 + }, + { + "epoch": 2.825844930417495, + "grad_norm": 0.10809608548879623, + "learning_rate": 4.1497140839873715e-08, + "loss": 0.0004, + "num_input_tokens_seen": 41724584, + "step": 21321 + }, + { + "epoch": 2.8259774685222, + "grad_norm": 0.03469456732273102, + "learning_rate": 4.14341783048744e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41727160, + "step": 21322 + }, + { + "epoch": 2.826110006626905, + "grad_norm": 2.7486681938171387, + "learning_rate": 4.1371263172720336e-08, + "loss": 0.0285, + "num_input_tokens_seen": 41729304, + "step": 21323 + }, + { + "epoch": 2.82624254473161, + "grad_norm": 0.029182739555835724, + "learning_rate": 4.130839544462445e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41730688, + "step": 21324 + }, + { + "epoch": 2.8263750828363152, + "grad_norm": 2.0474400520324707, + "learning_rate": 4.124557512179883e-08, + "loss": 0.0169, + "num_input_tokens_seen": 41732040, + "step": 21325 + }, + { + "epoch": 2.8265076209410207, + "grad_norm": 2.942699670791626, + "learning_rate": 4.118280220545473e-08, + "loss": 0.0248, + "num_input_tokens_seen": 41733840, + "step": 21326 + }, + { + "epoch": 2.8266401590457257, + "grad_norm": 5.175823211669922, + "learning_rate": 4.112007669680229e-08, + "loss": 0.0368, + "num_input_tokens_seen": 41735592, + "step": 21327 + }, + { + "epoch": 2.826772697150431, + "grad_norm": 0.1909402310848236, + "learning_rate": 4.105739859705138e-08, + "loss": 0.0003, + "num_input_tokens_seen": 41737568, + "step": 21328 + }, + { + "epoch": 2.826905235255136, + "grad_norm": 0.09783075749874115, + "learning_rate": 4.0994767907410196e-08, + "loss": 0.0008, + "num_input_tokens_seen": 41738888, + "step": 21329 + }, + { + "epoch": 2.827037773359841, + "grad_norm": 0.008746135048568249, + "learning_rate": 4.093218462908582e-08, + "loss": 0.0, + "num_input_tokens_seen": 41740200, + "step": 21330 + }, + { + "epoch": 2.827170311464546, + "grad_norm": 0.010528885759413242, + "learning_rate": 4.0869648763285643e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41742360, + "step": 21331 + }, + { + "epoch": 2.827302849569251, + "grad_norm": 0.0025462631601840258, + "learning_rate": 4.0807160311215066e-08, + "loss": 0.0, + "num_input_tokens_seen": 41744048, + "step": 21332 + }, + { + "epoch": 2.8274353876739564, + "grad_norm": 8.750323295593262, + "learning_rate": 4.074471927407897e-08, + "loss": 0.0278, + "num_input_tokens_seen": 41746248, + "step": 21333 + }, + { + "epoch": 2.8275679257786615, + "grad_norm": 0.04769982770085335, + "learning_rate": 4.068232565308139e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41748576, + "step": 21334 + }, + { + "epoch": 2.8277004638833665, + "grad_norm": 0.5535390973091125, + "learning_rate": 4.0619979449424974e-08, + "loss": 0.0039, + "num_input_tokens_seen": 41750904, + "step": 21335 + }, + { + "epoch": 2.8278330019880715, + "grad_norm": 2.0264904499053955, + "learning_rate": 4.0557680664311816e-08, + "loss": 0.0139, + "num_input_tokens_seen": 41752768, + "step": 21336 + }, + { + "epoch": 2.8279655400927766, + "grad_norm": 0.004613807424902916, + "learning_rate": 4.049542929894346e-08, + "loss": 0.0, + "num_input_tokens_seen": 41754112, + "step": 21337 + }, + { + "epoch": 2.828098078197482, + "grad_norm": 3.950345277786255, + "learning_rate": 4.0433225354520046e-08, + "loss": 0.0246, + "num_input_tokens_seen": 41756280, + "step": 21338 + }, + { + "epoch": 2.8282306163021866, + "grad_norm": 0.024955151602625847, + "learning_rate": 4.037106883224062e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41757768, + "step": 21339 + }, + { + "epoch": 2.828363154406892, + "grad_norm": 6.444292068481445, + "learning_rate": 4.0308959733303955e-08, + "loss": 0.1045, + "num_input_tokens_seen": 41759288, + "step": 21340 + }, + { + "epoch": 2.828495692511597, + "grad_norm": 0.023813916370272636, + "learning_rate": 4.024689805890741e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41760536, + "step": 21341 + }, + { + "epoch": 2.828628230616302, + "grad_norm": 3.578228712081909, + "learning_rate": 4.018488381024754e-08, + "loss": 0.0177, + "num_input_tokens_seen": 41762488, + "step": 21342 + }, + { + "epoch": 2.8287607687210072, + "grad_norm": 2.963191270828247, + "learning_rate": 4.012291698851978e-08, + "loss": 0.0151, + "num_input_tokens_seen": 41764304, + "step": 21343 + }, + { + "epoch": 2.8288933068257123, + "grad_norm": 6.463123321533203, + "learning_rate": 4.0060997594919546e-08, + "loss": 0.0299, + "num_input_tokens_seen": 41767464, + "step": 21344 + }, + { + "epoch": 2.8290258449304178, + "grad_norm": 0.004640256520360708, + "learning_rate": 3.999912563064007e-08, + "loss": 0.0, + "num_input_tokens_seen": 41768848, + "step": 21345 + }, + { + "epoch": 2.8291583830351223, + "grad_norm": 7.133349895477295, + "learning_rate": 3.993730109687482e-08, + "loss": 0.1231, + "num_input_tokens_seen": 41771456, + "step": 21346 + }, + { + "epoch": 2.829290921139828, + "grad_norm": 1.109593152999878, + "learning_rate": 3.987552399481537e-08, + "loss": 0.0081, + "num_input_tokens_seen": 41773120, + "step": 21347 + }, + { + "epoch": 2.829423459244533, + "grad_norm": 9.424050331115723, + "learning_rate": 3.981379432565269e-08, + "loss": 0.182, + "num_input_tokens_seen": 41775112, + "step": 21348 + }, + { + "epoch": 2.829555997349238, + "grad_norm": 0.0021270166616886854, + "learning_rate": 3.9752112090577224e-08, + "loss": 0.0, + "num_input_tokens_seen": 41776584, + "step": 21349 + }, + { + "epoch": 2.829688535453943, + "grad_norm": 3.2307441234588623, + "learning_rate": 3.969047729077857e-08, + "loss": 0.0182, + "num_input_tokens_seen": 41778952, + "step": 21350 + }, + { + "epoch": 2.829821073558648, + "grad_norm": 4.46967887878418, + "learning_rate": 3.962888992744496e-08, + "loss": 0.07, + "num_input_tokens_seen": 41780640, + "step": 21351 + }, + { + "epoch": 2.8299536116633535, + "grad_norm": 0.004322728607803583, + "learning_rate": 3.956735000176348e-08, + "loss": 0.0, + "num_input_tokens_seen": 41782632, + "step": 21352 + }, + { + "epoch": 2.830086149768058, + "grad_norm": 0.780724823474884, + "learning_rate": 3.95058575149207e-08, + "loss": 0.0038, + "num_input_tokens_seen": 41784024, + "step": 21353 + }, + { + "epoch": 2.8302186878727635, + "grad_norm": 0.12405820935964584, + "learning_rate": 3.94444124681026e-08, + "loss": 0.0006, + "num_input_tokens_seen": 41786888, + "step": 21354 + }, + { + "epoch": 2.8303512259774686, + "grad_norm": 7.429314613342285, + "learning_rate": 3.9383014862493516e-08, + "loss": 0.047, + "num_input_tokens_seen": 41789008, + "step": 21355 + }, + { + "epoch": 2.8304837640821736, + "grad_norm": 0.05965806171298027, + "learning_rate": 3.9321664699277504e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41790344, + "step": 21356 + }, + { + "epoch": 2.8306163021868787, + "grad_norm": 0.012387754395604134, + "learning_rate": 3.926036197963723e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41792544, + "step": 21357 + }, + { + "epoch": 2.8307488402915837, + "grad_norm": 1.214707851409912, + "learning_rate": 3.91991067047548e-08, + "loss": 0.0068, + "num_input_tokens_seen": 41793968, + "step": 21358 + }, + { + "epoch": 2.830881378396289, + "grad_norm": 0.004752099979668856, + "learning_rate": 3.9137898875811206e-08, + "loss": 0.0, + "num_input_tokens_seen": 41795808, + "step": 21359 + }, + { + "epoch": 2.8310139165009938, + "grad_norm": 0.0037499689497053623, + "learning_rate": 3.907673849398663e-08, + "loss": 0.0, + "num_input_tokens_seen": 41797848, + "step": 21360 + }, + { + "epoch": 2.8311464546056992, + "grad_norm": 0.005664592143148184, + "learning_rate": 3.90156255604604e-08, + "loss": 0.0, + "num_input_tokens_seen": 41799744, + "step": 21361 + }, + { + "epoch": 2.8312789927104043, + "grad_norm": 10.73759651184082, + "learning_rate": 3.8954560076410464e-08, + "loss": 0.1291, + "num_input_tokens_seen": 41802264, + "step": 21362 + }, + { + "epoch": 2.8314115308151093, + "grad_norm": 5.420210838317871, + "learning_rate": 3.889354204301449e-08, + "loss": 0.0437, + "num_input_tokens_seen": 41804536, + "step": 21363 + }, + { + "epoch": 2.8315440689198144, + "grad_norm": 0.01144046988338232, + "learning_rate": 3.883257146144875e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41806208, + "step": 21364 + }, + { + "epoch": 2.8316766070245194, + "grad_norm": 2.0986413955688477, + "learning_rate": 3.877164833288899e-08, + "loss": 0.0127, + "num_input_tokens_seen": 41808464, + "step": 21365 + }, + { + "epoch": 2.831809145129225, + "grad_norm": 12.3529691696167, + "learning_rate": 3.8710772658509534e-08, + "loss": 0.1597, + "num_input_tokens_seen": 41810072, + "step": 21366 + }, + { + "epoch": 2.83194168323393, + "grad_norm": 6.094432353973389, + "learning_rate": 3.864994443948444e-08, + "loss": 0.0578, + "num_input_tokens_seen": 41812304, + "step": 21367 + }, + { + "epoch": 2.832074221338635, + "grad_norm": 0.00438559427857399, + "learning_rate": 3.8589163676986674e-08, + "loss": 0.0, + "num_input_tokens_seen": 41813952, + "step": 21368 + }, + { + "epoch": 2.83220675944334, + "grad_norm": 0.23814915120601654, + "learning_rate": 3.852843037218751e-08, + "loss": 0.001, + "num_input_tokens_seen": 41816160, + "step": 21369 + }, + { + "epoch": 2.832339297548045, + "grad_norm": 5.790221214294434, + "learning_rate": 3.846774452625851e-08, + "loss": 0.0645, + "num_input_tokens_seen": 41818592, + "step": 21370 + }, + { + "epoch": 2.83247183565275, + "grad_norm": 2.2277114391326904, + "learning_rate": 3.8407106140369574e-08, + "loss": 0.009, + "num_input_tokens_seen": 41820536, + "step": 21371 + }, + { + "epoch": 2.832604373757455, + "grad_norm": 0.33863967657089233, + "learning_rate": 3.8346515215689773e-08, + "loss": 0.0025, + "num_input_tokens_seen": 41822264, + "step": 21372 + }, + { + "epoch": 2.8327369118621606, + "grad_norm": 0.05382567271590233, + "learning_rate": 3.828597175338733e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41824784, + "step": 21373 + }, + { + "epoch": 2.8328694499668656, + "grad_norm": 0.4005905091762543, + "learning_rate": 3.822547575462937e-08, + "loss": 0.002, + "num_input_tokens_seen": 41827224, + "step": 21374 + }, + { + "epoch": 2.8330019880715707, + "grad_norm": 0.17862588167190552, + "learning_rate": 3.816502722058274e-08, + "loss": 0.0011, + "num_input_tokens_seen": 41830264, + "step": 21375 + }, + { + "epoch": 2.8331345261762757, + "grad_norm": 13.751076698303223, + "learning_rate": 3.8104626152412894e-08, + "loss": 0.1711, + "num_input_tokens_seen": 41832352, + "step": 21376 + }, + { + "epoch": 2.8332670642809807, + "grad_norm": 7.563451766967773, + "learning_rate": 3.80442725512839e-08, + "loss": 0.0592, + "num_input_tokens_seen": 41834544, + "step": 21377 + }, + { + "epoch": 2.833399602385686, + "grad_norm": 0.03853147104382515, + "learning_rate": 3.798396641835983e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41836288, + "step": 21378 + }, + { + "epoch": 2.833532140490391, + "grad_norm": 0.03218938782811165, + "learning_rate": 3.792370775480336e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41837784, + "step": 21379 + }, + { + "epoch": 2.8336646785950963, + "grad_norm": 1.1423343420028687, + "learning_rate": 3.786349656177635e-08, + "loss": 0.0088, + "num_input_tokens_seen": 41840056, + "step": 21380 + }, + { + "epoch": 2.8337972166998013, + "grad_norm": 0.41114315390586853, + "learning_rate": 3.78033328404398e-08, + "loss": 0.0015, + "num_input_tokens_seen": 41841888, + "step": 21381 + }, + { + "epoch": 2.8339297548045064, + "grad_norm": 0.012818088755011559, + "learning_rate": 3.7743216591953626e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41843248, + "step": 21382 + }, + { + "epoch": 2.8340622929092114, + "grad_norm": 0.01361959706991911, + "learning_rate": 3.768314781747634e-08, + "loss": 0.0, + "num_input_tokens_seen": 41844784, + "step": 21383 + }, + { + "epoch": 2.8341948310139164, + "grad_norm": 0.027121368795633316, + "learning_rate": 3.762312651816702e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41846128, + "step": 21384 + }, + { + "epoch": 2.8343273691186215, + "grad_norm": 0.001676503336057067, + "learning_rate": 3.756315269518251e-08, + "loss": 0.0, + "num_input_tokens_seen": 41847712, + "step": 21385 + }, + { + "epoch": 2.8344599072233265, + "grad_norm": 0.004906956572085619, + "learning_rate": 3.7503226349679124e-08, + "loss": 0.0, + "num_input_tokens_seen": 41849584, + "step": 21386 + }, + { + "epoch": 2.834592445328032, + "grad_norm": 1.1343759298324585, + "learning_rate": 3.744334748281231e-08, + "loss": 0.0086, + "num_input_tokens_seen": 41851280, + "step": 21387 + }, + { + "epoch": 2.834724983432737, + "grad_norm": 0.06775186955928802, + "learning_rate": 3.73835160957367e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41852920, + "step": 21388 + }, + { + "epoch": 2.834857521537442, + "grad_norm": 0.2680496573448181, + "learning_rate": 3.732373218960556e-08, + "loss": 0.0007, + "num_input_tokens_seen": 41855488, + "step": 21389 + }, + { + "epoch": 2.834990059642147, + "grad_norm": 2.7450804710388184, + "learning_rate": 3.726399576557183e-08, + "loss": 0.016, + "num_input_tokens_seen": 41857784, + "step": 21390 + }, + { + "epoch": 2.835122597746852, + "grad_norm": 5.901760578155518, + "learning_rate": 3.720430682478737e-08, + "loss": 0.0878, + "num_input_tokens_seen": 41859792, + "step": 21391 + }, + { + "epoch": 2.835255135851557, + "grad_norm": 5.245533466339111, + "learning_rate": 3.714466536840267e-08, + "loss": 0.0567, + "num_input_tokens_seen": 41863696, + "step": 21392 + }, + { + "epoch": 2.8353876739562622, + "grad_norm": 6.626520156860352, + "learning_rate": 3.708507139756817e-08, + "loss": 0.0736, + "num_input_tokens_seen": 41865384, + "step": 21393 + }, + { + "epoch": 2.8355202120609677, + "grad_norm": 0.03496141731739044, + "learning_rate": 3.702552491343242e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41867072, + "step": 21394 + }, + { + "epoch": 2.8356527501656728, + "grad_norm": 6.933764457702637, + "learning_rate": 3.696602591714338e-08, + "loss": 0.1065, + "num_input_tokens_seen": 41869688, + "step": 21395 + }, + { + "epoch": 2.835785288270378, + "grad_norm": 19.10161781311035, + "learning_rate": 3.690657440984874e-08, + "loss": 0.3098, + "num_input_tokens_seen": 41871736, + "step": 21396 + }, + { + "epoch": 2.835917826375083, + "grad_norm": 1.3146227598190308, + "learning_rate": 3.684717039269453e-08, + "loss": 0.0066, + "num_input_tokens_seen": 41874368, + "step": 21397 + }, + { + "epoch": 2.836050364479788, + "grad_norm": 0.005302970297634602, + "learning_rate": 3.678781386682623e-08, + "loss": 0.0, + "num_input_tokens_seen": 41876864, + "step": 21398 + }, + { + "epoch": 2.836182902584493, + "grad_norm": 14.017012596130371, + "learning_rate": 3.672850483338819e-08, + "loss": 0.3651, + "num_input_tokens_seen": 41878744, + "step": 21399 + }, + { + "epoch": 2.836315440689198, + "grad_norm": 4.511245250701904, + "learning_rate": 3.666924329352367e-08, + "loss": 0.0216, + "num_input_tokens_seen": 41880904, + "step": 21400 + }, + { + "epoch": 2.8364479787939034, + "grad_norm": 0.02728136070072651, + "learning_rate": 3.6610029248375644e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41882472, + "step": 21401 + }, + { + "epoch": 2.8365805168986085, + "grad_norm": 0.0032947382424026728, + "learning_rate": 3.6550862699085696e-08, + "loss": 0.0, + "num_input_tokens_seen": 41884544, + "step": 21402 + }, + { + "epoch": 2.8367130550033135, + "grad_norm": 0.06398927420377731, + "learning_rate": 3.6491743646794584e-08, + "loss": 0.0004, + "num_input_tokens_seen": 41886568, + "step": 21403 + }, + { + "epoch": 2.8368455931080185, + "grad_norm": 0.008374019525945187, + "learning_rate": 3.643267209264195e-08, + "loss": 0.0, + "num_input_tokens_seen": 41890320, + "step": 21404 + }, + { + "epoch": 2.8369781312127236, + "grad_norm": 0.019741594791412354, + "learning_rate": 3.637364803776688e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41892904, + "step": 21405 + }, + { + "epoch": 2.8371106693174286, + "grad_norm": 0.016110295429825783, + "learning_rate": 3.6314671483307916e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41894696, + "step": 21406 + }, + { + "epoch": 2.8372432074221337, + "grad_norm": 0.011761624366044998, + "learning_rate": 3.6255742430401364e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41897480, + "step": 21407 + }, + { + "epoch": 2.837375745526839, + "grad_norm": 0.4244985282421112, + "learning_rate": 3.619686088018354e-08, + "loss": 0.0009, + "num_input_tokens_seen": 41899568, + "step": 21408 + }, + { + "epoch": 2.837508283631544, + "grad_norm": 0.11555639654397964, + "learning_rate": 3.6138026833790193e-08, + "loss": 0.0005, + "num_input_tokens_seen": 41901184, + "step": 21409 + }, + { + "epoch": 2.837640821736249, + "grad_norm": 4.921369552612305, + "learning_rate": 3.607924029235543e-08, + "loss": 0.0608, + "num_input_tokens_seen": 41903408, + "step": 21410 + }, + { + "epoch": 2.8377733598409542, + "grad_norm": 0.5179631114006042, + "learning_rate": 3.6020501257012506e-08, + "loss": 0.0036, + "num_input_tokens_seen": 41905520, + "step": 21411 + }, + { + "epoch": 2.8379058979456593, + "grad_norm": 5.513009548187256, + "learning_rate": 3.5961809728894407e-08, + "loss": 0.0205, + "num_input_tokens_seen": 41906888, + "step": 21412 + }, + { + "epoch": 2.8380384360503643, + "grad_norm": 5.5876784324646, + "learning_rate": 3.590316570913216e-08, + "loss": 0.0605, + "num_input_tokens_seen": 41908656, + "step": 21413 + }, + { + "epoch": 2.8381709741550694, + "grad_norm": 0.36041584610939026, + "learning_rate": 3.584456919885709e-08, + "loss": 0.0013, + "num_input_tokens_seen": 41911320, + "step": 21414 + }, + { + "epoch": 2.838303512259775, + "grad_norm": 8.429523468017578, + "learning_rate": 3.57860201991983e-08, + "loss": 0.0608, + "num_input_tokens_seen": 41913408, + "step": 21415 + }, + { + "epoch": 2.83843605036448, + "grad_norm": 9.752354621887207, + "learning_rate": 3.5727518711285435e-08, + "loss": 0.0828, + "num_input_tokens_seen": 41915208, + "step": 21416 + }, + { + "epoch": 2.838568588469185, + "grad_norm": 11.552865982055664, + "learning_rate": 3.566906473624593e-08, + "loss": 0.1979, + "num_input_tokens_seen": 41916976, + "step": 21417 + }, + { + "epoch": 2.83870112657389, + "grad_norm": 1.6927932500839233, + "learning_rate": 3.561065827520665e-08, + "loss": 0.0062, + "num_input_tokens_seen": 41919360, + "step": 21418 + }, + { + "epoch": 2.838833664678595, + "grad_norm": 0.041958149522542953, + "learning_rate": 3.5552299329294215e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41921504, + "step": 21419 + }, + { + "epoch": 2.8389662027833005, + "grad_norm": 0.17980051040649414, + "learning_rate": 3.549398789963354e-08, + "loss": 0.0008, + "num_input_tokens_seen": 41923512, + "step": 21420 + }, + { + "epoch": 2.839098740888005, + "grad_norm": 17.875896453857422, + "learning_rate": 3.5435723987348734e-08, + "loss": 0.286, + "num_input_tokens_seen": 41925704, + "step": 21421 + }, + { + "epoch": 2.8392312789927105, + "grad_norm": 13.347850799560547, + "learning_rate": 3.537750759356362e-08, + "loss": 0.0199, + "num_input_tokens_seen": 41927312, + "step": 21422 + }, + { + "epoch": 2.8393638170974156, + "grad_norm": 4.012181758880615, + "learning_rate": 3.531933871940035e-08, + "loss": 0.024, + "num_input_tokens_seen": 41929592, + "step": 21423 + }, + { + "epoch": 2.8394963552021206, + "grad_norm": 0.01615353487432003, + "learning_rate": 3.526121736598054e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41931008, + "step": 21424 + }, + { + "epoch": 2.8396288933068257, + "grad_norm": 1.0778565406799316, + "learning_rate": 3.520314353442467e-08, + "loss": 0.0088, + "num_input_tokens_seen": 41933104, + "step": 21425 + }, + { + "epoch": 2.8397614314115307, + "grad_norm": 0.057106051594018936, + "learning_rate": 3.5145117225852675e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41935368, + "step": 21426 + }, + { + "epoch": 2.839893969516236, + "grad_norm": 3.0264198780059814, + "learning_rate": 3.508713844138312e-08, + "loss": 0.0153, + "num_input_tokens_seen": 41937648, + "step": 21427 + }, + { + "epoch": 2.8400265076209408, + "grad_norm": 0.003958333749324083, + "learning_rate": 3.5029207182134264e-08, + "loss": 0.0, + "num_input_tokens_seen": 41939528, + "step": 21428 + }, + { + "epoch": 2.8401590457256463, + "grad_norm": 0.020841307938098907, + "learning_rate": 3.4971323449222716e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41941856, + "step": 21429 + }, + { + "epoch": 2.8402915838303513, + "grad_norm": 11.71628475189209, + "learning_rate": 3.491348724376453e-08, + "loss": 0.0897, + "num_input_tokens_seen": 41945304, + "step": 21430 + }, + { + "epoch": 2.8404241219350563, + "grad_norm": 0.03598780930042267, + "learning_rate": 3.485569856687465e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41947504, + "step": 21431 + }, + { + "epoch": 2.8405566600397614, + "grad_norm": 0.02070513181388378, + "learning_rate": 3.4797957419667736e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41948912, + "step": 21432 + }, + { + "epoch": 2.8406891981444664, + "grad_norm": 9.764309883117676, + "learning_rate": 3.474026380325679e-08, + "loss": 0.0882, + "num_input_tokens_seen": 41950792, + "step": 21433 + }, + { + "epoch": 2.840821736249172, + "grad_norm": 0.20964300632476807, + "learning_rate": 3.468261771875453e-08, + "loss": 0.0007, + "num_input_tokens_seen": 41952944, + "step": 21434 + }, + { + "epoch": 2.8409542743538765, + "grad_norm": 0.00671512121334672, + "learning_rate": 3.462501916727201e-08, + "loss": 0.0, + "num_input_tokens_seen": 41954488, + "step": 21435 + }, + { + "epoch": 2.841086812458582, + "grad_norm": 8.857194900512695, + "learning_rate": 3.4567468149919734e-08, + "loss": 0.0155, + "num_input_tokens_seen": 41955904, + "step": 21436 + }, + { + "epoch": 2.841219350563287, + "grad_norm": 7.722423553466797, + "learning_rate": 3.450996466780737e-08, + "loss": 0.1254, + "num_input_tokens_seen": 41957264, + "step": 21437 + }, + { + "epoch": 2.841351888667992, + "grad_norm": 0.21999236941337585, + "learning_rate": 3.445250872204403e-08, + "loss": 0.0004, + "num_input_tokens_seen": 41959064, + "step": 21438 + }, + { + "epoch": 2.841484426772697, + "grad_norm": 0.046196501702070236, + "learning_rate": 3.439510031373716e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41960728, + "step": 21439 + }, + { + "epoch": 2.841616964877402, + "grad_norm": 0.006579110864549875, + "learning_rate": 3.433773944399365e-08, + "loss": 0.0, + "num_input_tokens_seen": 41962272, + "step": 21440 + }, + { + "epoch": 2.8417495029821076, + "grad_norm": 13.444130897521973, + "learning_rate": 3.42804261139193e-08, + "loss": 0.1641, + "num_input_tokens_seen": 41964056, + "step": 21441 + }, + { + "epoch": 2.841882041086812, + "grad_norm": 0.020831700414419174, + "learning_rate": 3.4223160324619586e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41965792, + "step": 21442 + }, + { + "epoch": 2.8420145791915177, + "grad_norm": 13.28009033203125, + "learning_rate": 3.416594207719837e-08, + "loss": 0.0445, + "num_input_tokens_seen": 41967488, + "step": 21443 + }, + { + "epoch": 2.8421471172962227, + "grad_norm": 0.19583982229232788, + "learning_rate": 3.410877137275892e-08, + "loss": 0.0009, + "num_input_tokens_seen": 41969824, + "step": 21444 + }, + { + "epoch": 2.8422796554009278, + "grad_norm": 9.200377464294434, + "learning_rate": 3.405164821240342e-08, + "loss": 0.0679, + "num_input_tokens_seen": 41973128, + "step": 21445 + }, + { + "epoch": 2.842412193505633, + "grad_norm": 5.699094772338867, + "learning_rate": 3.3994572597233214e-08, + "loss": 0.058, + "num_input_tokens_seen": 41975296, + "step": 21446 + }, + { + "epoch": 2.842544731610338, + "grad_norm": 0.001877969945780933, + "learning_rate": 3.393754452834907e-08, + "loss": 0.0, + "num_input_tokens_seen": 41976736, + "step": 21447 + }, + { + "epoch": 2.8426772697150433, + "grad_norm": 0.019672375172376633, + "learning_rate": 3.388056400685014e-08, + "loss": 0.0002, + "num_input_tokens_seen": 41978280, + "step": 21448 + }, + { + "epoch": 2.842809807819748, + "grad_norm": 3.8077802658081055, + "learning_rate": 3.382363103383524e-08, + "loss": 0.0122, + "num_input_tokens_seen": 41979856, + "step": 21449 + }, + { + "epoch": 2.8429423459244534, + "grad_norm": 0.127094566822052, + "learning_rate": 3.3766745610402106e-08, + "loss": 0.0003, + "num_input_tokens_seen": 41981552, + "step": 21450 + }, + { + "epoch": 2.8430748840291584, + "grad_norm": 0.024055523797869682, + "learning_rate": 3.370990773764765e-08, + "loss": 0.0001, + "num_input_tokens_seen": 41983520, + "step": 21451 + }, + { + "epoch": 2.8432074221338635, + "grad_norm": 0.0021653759758919477, + "learning_rate": 3.365311741666738e-08, + "loss": 0.0, + "num_input_tokens_seen": 41984936, + "step": 21452 + }, + { + "epoch": 2.8433399602385685, + "grad_norm": 7.9955644607543945, + "learning_rate": 3.359637464855653e-08, + "loss": 0.0644, + "num_input_tokens_seen": 41987344, + "step": 21453 + }, + { + "epoch": 2.8434724983432735, + "grad_norm": 3.1849682331085205, + "learning_rate": 3.353967943440895e-08, + "loss": 0.0845, + "num_input_tokens_seen": 41988928, + "step": 21454 + }, + { + "epoch": 2.843605036447979, + "grad_norm": 0.49534451961517334, + "learning_rate": 3.348303177531792e-08, + "loss": 0.0011, + "num_input_tokens_seen": 41990768, + "step": 21455 + }, + { + "epoch": 2.843737574552684, + "grad_norm": 0.05766313895583153, + "learning_rate": 3.342643167237564e-08, + "loss": 0.0003, + "num_input_tokens_seen": 41992720, + "step": 21456 + }, + { + "epoch": 2.843870112657389, + "grad_norm": 0.07199223339557648, + "learning_rate": 3.3369879126673167e-08, + "loss": 0.0004, + "num_input_tokens_seen": 41994744, + "step": 21457 + }, + { + "epoch": 2.844002650762094, + "grad_norm": 9.784435272216797, + "learning_rate": 3.331337413930158e-08, + "loss": 0.2137, + "num_input_tokens_seen": 41996848, + "step": 21458 + }, + { + "epoch": 2.844135188866799, + "grad_norm": 0.064348004758358, + "learning_rate": 3.325691671134945e-08, + "loss": 0.0004, + "num_input_tokens_seen": 41999600, + "step": 21459 + }, + { + "epoch": 2.844267726971504, + "grad_norm": 0.011450551450252533, + "learning_rate": 3.320050684390563e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42002416, + "step": 21460 + }, + { + "epoch": 2.8444002650762092, + "grad_norm": 11.707417488098145, + "learning_rate": 3.314414453805786e-08, + "loss": 0.2008, + "num_input_tokens_seen": 42004528, + "step": 21461 + }, + { + "epoch": 2.8445328031809147, + "grad_norm": 0.00412170821800828, + "learning_rate": 3.3087829794892775e-08, + "loss": 0.0, + "num_input_tokens_seen": 42006824, + "step": 21462 + }, + { + "epoch": 2.8446653412856198, + "grad_norm": 0.0063974191434681416, + "learning_rate": 3.3031562615496174e-08, + "loss": 0.0, + "num_input_tokens_seen": 42008656, + "step": 21463 + }, + { + "epoch": 2.844797879390325, + "grad_norm": 0.005561056546866894, + "learning_rate": 3.297534300095301e-08, + "loss": 0.0, + "num_input_tokens_seen": 42010664, + "step": 21464 + }, + { + "epoch": 2.84493041749503, + "grad_norm": 0.01984023116528988, + "learning_rate": 3.291917095234715e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42012000, + "step": 21465 + }, + { + "epoch": 2.845062955599735, + "grad_norm": 0.01753745973110199, + "learning_rate": 3.2863046470761605e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42014256, + "step": 21466 + }, + { + "epoch": 2.84519549370444, + "grad_norm": 0.5938788056373596, + "learning_rate": 3.28069695572783e-08, + "loss": 0.002, + "num_input_tokens_seen": 42015976, + "step": 21467 + }, + { + "epoch": 2.845328031809145, + "grad_norm": 0.012784359976649284, + "learning_rate": 3.2750940212978865e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42017920, + "step": 21468 + }, + { + "epoch": 2.8454605699138504, + "grad_norm": 9.3268404006958, + "learning_rate": 3.269495843894327e-08, + "loss": 0.0412, + "num_input_tokens_seen": 42019952, + "step": 21469 + }, + { + "epoch": 2.8455931080185555, + "grad_norm": 0.057686544954776764, + "learning_rate": 3.2639024236251207e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42021696, + "step": 21470 + }, + { + "epoch": 2.8457256461232605, + "grad_norm": 4.762962818145752, + "learning_rate": 3.25831376059807e-08, + "loss": 0.0227, + "num_input_tokens_seen": 42024216, + "step": 21471 + }, + { + "epoch": 2.8458581842279655, + "grad_norm": 7.897347450256348, + "learning_rate": 3.25272985492095e-08, + "loss": 0.0801, + "num_input_tokens_seen": 42025952, + "step": 21472 + }, + { + "epoch": 2.8459907223326706, + "grad_norm": 0.46802207827568054, + "learning_rate": 3.2471507067013964e-08, + "loss": 0.0018, + "num_input_tokens_seen": 42027608, + "step": 21473 + }, + { + "epoch": 2.8461232604373756, + "grad_norm": 0.6081012487411499, + "learning_rate": 3.241576316047046e-08, + "loss": 0.0027, + "num_input_tokens_seen": 42029664, + "step": 21474 + }, + { + "epoch": 2.8462557985420807, + "grad_norm": 5.416201591491699, + "learning_rate": 3.2360066830653124e-08, + "loss": 0.0951, + "num_input_tokens_seen": 42031048, + "step": 21475 + }, + { + "epoch": 2.846388336646786, + "grad_norm": 11.232903480529785, + "learning_rate": 3.23044180786361e-08, + "loss": 0.1064, + "num_input_tokens_seen": 42033040, + "step": 21476 + }, + { + "epoch": 2.846520874751491, + "grad_norm": 0.0017418210627511144, + "learning_rate": 3.224881690549242e-08, + "loss": 0.0, + "num_input_tokens_seen": 42034144, + "step": 21477 + }, + { + "epoch": 2.846653412856196, + "grad_norm": 0.0023322266060858965, + "learning_rate": 3.219326331229372e-08, + "loss": 0.0, + "num_input_tokens_seen": 42036288, + "step": 21478 + }, + { + "epoch": 2.8467859509609013, + "grad_norm": 0.02629624493420124, + "learning_rate": 3.2137757300111375e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42038352, + "step": 21479 + }, + { + "epoch": 2.8469184890656063, + "grad_norm": 4.990286350250244, + "learning_rate": 3.208229887001563e-08, + "loss": 0.0299, + "num_input_tokens_seen": 42040088, + "step": 21480 + }, + { + "epoch": 2.8470510271703113, + "grad_norm": 1.9148160219192505, + "learning_rate": 3.202688802307563e-08, + "loss": 0.0128, + "num_input_tokens_seen": 42041880, + "step": 21481 + }, + { + "epoch": 2.8471835652750164, + "grad_norm": 0.26209819316864014, + "learning_rate": 3.1971524760359974e-08, + "loss": 0.0015, + "num_input_tokens_seen": 42043632, + "step": 21482 + }, + { + "epoch": 2.847316103379722, + "grad_norm": 0.004109885543584824, + "learning_rate": 3.191620908293586e-08, + "loss": 0.0, + "num_input_tokens_seen": 42045344, + "step": 21483 + }, + { + "epoch": 2.847448641484427, + "grad_norm": 0.1027844101190567, + "learning_rate": 3.186094099186965e-08, + "loss": 0.0007, + "num_input_tokens_seen": 42047768, + "step": 21484 + }, + { + "epoch": 2.847581179589132, + "grad_norm": 0.005945008248090744, + "learning_rate": 3.1805720488227445e-08, + "loss": 0.0, + "num_input_tokens_seen": 42049000, + "step": 21485 + }, + { + "epoch": 2.847713717693837, + "grad_norm": 3.1874094009399414, + "learning_rate": 3.175054757307366e-08, + "loss": 0.0363, + "num_input_tokens_seen": 42051448, + "step": 21486 + }, + { + "epoch": 2.847846255798542, + "grad_norm": 0.007606113329529762, + "learning_rate": 3.169542224747191e-08, + "loss": 0.0, + "num_input_tokens_seen": 42053168, + "step": 21487 + }, + { + "epoch": 2.847978793903247, + "grad_norm": 7.422728061676025, + "learning_rate": 3.164034451248521e-08, + "loss": 0.0649, + "num_input_tokens_seen": 42055208, + "step": 21488 + }, + { + "epoch": 2.848111332007952, + "grad_norm": 4.923513889312744, + "learning_rate": 3.15853143691755e-08, + "loss": 0.0218, + "num_input_tokens_seen": 42057680, + "step": 21489 + }, + { + "epoch": 2.8482438701126576, + "grad_norm": 0.023214422166347504, + "learning_rate": 3.1530331818603876e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42059320, + "step": 21490 + }, + { + "epoch": 2.8483764082173626, + "grad_norm": 0.005103986710309982, + "learning_rate": 3.1475396861830596e-08, + "loss": 0.0, + "num_input_tokens_seen": 42061832, + "step": 21491 + }, + { + "epoch": 2.8485089463220676, + "grad_norm": 0.05062860995531082, + "learning_rate": 3.1420509499914266e-08, + "loss": 0.0005, + "num_input_tokens_seen": 42063224, + "step": 21492 + }, + { + "epoch": 2.8486414844267727, + "grad_norm": 0.0015596362063661218, + "learning_rate": 3.136566973391347e-08, + "loss": 0.0, + "num_input_tokens_seen": 42064432, + "step": 21493 + }, + { + "epoch": 2.8487740225314777, + "grad_norm": 0.039342768490314484, + "learning_rate": 3.131087756488599e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42066752, + "step": 21494 + }, + { + "epoch": 2.8489065606361827, + "grad_norm": 7.154979705810547, + "learning_rate": 3.1256132993887636e-08, + "loss": 0.0616, + "num_input_tokens_seen": 42069064, + "step": 21495 + }, + { + "epoch": 2.849039098740888, + "grad_norm": 0.0047095525078475475, + "learning_rate": 3.120143602197395e-08, + "loss": 0.0, + "num_input_tokens_seen": 42070960, + "step": 21496 + }, + { + "epoch": 2.8491716368455933, + "grad_norm": 0.011831262148916721, + "learning_rate": 3.114678665019993e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42072360, + "step": 21497 + }, + { + "epoch": 2.8493041749502983, + "grad_norm": 0.7917279601097107, + "learning_rate": 3.1092184879618904e-08, + "loss": 0.0028, + "num_input_tokens_seen": 42074480, + "step": 21498 + }, + { + "epoch": 2.8494367130550033, + "grad_norm": 0.02553957886993885, + "learning_rate": 3.1037630711283904e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42075640, + "step": 21499 + }, + { + "epoch": 2.8495692511597084, + "grad_norm": 0.012959788553416729, + "learning_rate": 3.0983124146246604e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42078296, + "step": 21500 + }, + { + "epoch": 2.8497017892644134, + "grad_norm": 16.327404022216797, + "learning_rate": 3.0928665185557815e-08, + "loss": 0.0726, + "num_input_tokens_seen": 42080184, + "step": 21501 + }, + { + "epoch": 2.8498343273691185, + "grad_norm": 0.19364070892333984, + "learning_rate": 3.0874253830267816e-08, + "loss": 0.0008, + "num_input_tokens_seen": 42081744, + "step": 21502 + }, + { + "epoch": 2.8499668654738235, + "grad_norm": 16.6882381439209, + "learning_rate": 3.0819890081425494e-08, + "loss": 0.0525, + "num_input_tokens_seen": 42084384, + "step": 21503 + }, + { + "epoch": 2.850099403578529, + "grad_norm": 0.223484605550766, + "learning_rate": 3.076557394007917e-08, + "loss": 0.0004, + "num_input_tokens_seen": 42086544, + "step": 21504 + }, + { + "epoch": 2.850231941683234, + "grad_norm": 0.19100338220596313, + "learning_rate": 3.071130540727607e-08, + "loss": 0.0009, + "num_input_tokens_seen": 42088224, + "step": 21505 + }, + { + "epoch": 2.850364479787939, + "grad_norm": 0.31649988889694214, + "learning_rate": 3.065708448406229e-08, + "loss": 0.001, + "num_input_tokens_seen": 42090016, + "step": 21506 + }, + { + "epoch": 2.850497017892644, + "grad_norm": 0.4848686754703522, + "learning_rate": 3.0602911171483674e-08, + "loss": 0.0019, + "num_input_tokens_seen": 42092016, + "step": 21507 + }, + { + "epoch": 2.850629555997349, + "grad_norm": 0.002843445399776101, + "learning_rate": 3.0548785470584095e-08, + "loss": 0.0, + "num_input_tokens_seen": 42093160, + "step": 21508 + }, + { + "epoch": 2.8507620941020546, + "grad_norm": 0.8613713979721069, + "learning_rate": 3.0494707382407727e-08, + "loss": 0.005, + "num_input_tokens_seen": 42095056, + "step": 21509 + }, + { + "epoch": 2.850894632206759, + "grad_norm": 8.344206809997559, + "learning_rate": 3.044067690799707e-08, + "loss": 0.024, + "num_input_tokens_seen": 42096808, + "step": 21510 + }, + { + "epoch": 2.8510271703114647, + "grad_norm": 0.01259254477918148, + "learning_rate": 3.0386694048394064e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42098408, + "step": 21511 + }, + { + "epoch": 2.8511597084161697, + "grad_norm": 10.377538681030273, + "learning_rate": 3.033275880463898e-08, + "loss": 0.1754, + "num_input_tokens_seen": 42100600, + "step": 21512 + }, + { + "epoch": 2.8512922465208748, + "grad_norm": 6.459046840667725, + "learning_rate": 3.0278871177772116e-08, + "loss": 0.0413, + "num_input_tokens_seen": 42103432, + "step": 21513 + }, + { + "epoch": 2.85142478462558, + "grad_norm": 7.098331451416016, + "learning_rate": 3.022503116883208e-08, + "loss": 0.0703, + "num_input_tokens_seen": 42105096, + "step": 21514 + }, + { + "epoch": 2.851557322730285, + "grad_norm": 8.956415176391602, + "learning_rate": 3.017123877885775e-08, + "loss": 0.2043, + "num_input_tokens_seen": 42107208, + "step": 21515 + }, + { + "epoch": 2.8516898608349903, + "grad_norm": 3.7101073265075684, + "learning_rate": 3.011749400888553e-08, + "loss": 0.0206, + "num_input_tokens_seen": 42109648, + "step": 21516 + }, + { + "epoch": 2.851822398939695, + "grad_norm": 0.07020027935504913, + "learning_rate": 3.006379685995209e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42111048, + "step": 21517 + }, + { + "epoch": 2.8519549370444004, + "grad_norm": 3.9727425575256348, + "learning_rate": 3.001014733309243e-08, + "loss": 0.0046, + "num_input_tokens_seen": 42112872, + "step": 21518 + }, + { + "epoch": 2.8520874751491054, + "grad_norm": 0.017590822651982307, + "learning_rate": 2.995654542934101e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42115600, + "step": 21519 + }, + { + "epoch": 2.8522200132538105, + "grad_norm": 0.03579111769795418, + "learning_rate": 2.9902991149731444e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42117944, + "step": 21520 + }, + { + "epoch": 2.8523525513585155, + "grad_norm": 8.085173606872559, + "learning_rate": 2.984948449529651e-08, + "loss": 0.0299, + "num_input_tokens_seen": 42120760, + "step": 21521 + }, + { + "epoch": 2.8524850894632205, + "grad_norm": 0.045443933457136154, + "learning_rate": 2.9796025467067337e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42124080, + "step": 21522 + }, + { + "epoch": 2.852617627567926, + "grad_norm": 8.209148406982422, + "learning_rate": 2.9742614066075038e-08, + "loss": 0.1279, + "num_input_tokens_seen": 42126328, + "step": 21523 + }, + { + "epoch": 2.8527501656726306, + "grad_norm": 0.007594657130539417, + "learning_rate": 2.968925029334907e-08, + "loss": 0.0, + "num_input_tokens_seen": 42127936, + "step": 21524 + }, + { + "epoch": 2.852882703777336, + "grad_norm": 0.0020971691701561213, + "learning_rate": 2.963593414991861e-08, + "loss": 0.0, + "num_input_tokens_seen": 42129240, + "step": 21525 + }, + { + "epoch": 2.853015241882041, + "grad_norm": 0.002518667373806238, + "learning_rate": 2.9582665636811447e-08, + "loss": 0.0, + "num_input_tokens_seen": 42130944, + "step": 21526 + }, + { + "epoch": 2.853147779986746, + "grad_norm": 3.5018136501312256, + "learning_rate": 2.9529444755054816e-08, + "loss": 0.0053, + "num_input_tokens_seen": 42132192, + "step": 21527 + }, + { + "epoch": 2.853280318091451, + "grad_norm": 6.4626288414001465, + "learning_rate": 2.9476271505675115e-08, + "loss": 0.068, + "num_input_tokens_seen": 42133848, + "step": 21528 + }, + { + "epoch": 2.8534128561961563, + "grad_norm": 7.315345764160156, + "learning_rate": 2.9423145889696804e-08, + "loss": 0.0425, + "num_input_tokens_seen": 42135704, + "step": 21529 + }, + { + "epoch": 2.8535453943008617, + "grad_norm": 0.12128118425607681, + "learning_rate": 2.9370067908144895e-08, + "loss": 0.0004, + "num_input_tokens_seen": 42137392, + "step": 21530 + }, + { + "epoch": 2.8536779324055663, + "grad_norm": 0.12631213665008545, + "learning_rate": 2.9317037562042183e-08, + "loss": 0.0004, + "num_input_tokens_seen": 42139688, + "step": 21531 + }, + { + "epoch": 2.853810470510272, + "grad_norm": 8.350948333740234, + "learning_rate": 2.926405485241146e-08, + "loss": 0.0308, + "num_input_tokens_seen": 42142600, + "step": 21532 + }, + { + "epoch": 2.853943008614977, + "grad_norm": 3.4920592308044434, + "learning_rate": 2.921111978027441e-08, + "loss": 0.0385, + "num_input_tokens_seen": 42144504, + "step": 21533 + }, + { + "epoch": 2.854075546719682, + "grad_norm": 3.794992208480835, + "learning_rate": 2.9158232346651605e-08, + "loss": 0.0235, + "num_input_tokens_seen": 42146504, + "step": 21534 + }, + { + "epoch": 2.854208084824387, + "grad_norm": 10.224516868591309, + "learning_rate": 2.910539255256223e-08, + "loss": 0.0334, + "num_input_tokens_seen": 42149472, + "step": 21535 + }, + { + "epoch": 2.854340622929092, + "grad_norm": 0.009973562322556973, + "learning_rate": 2.9052600399025744e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42151656, + "step": 21536 + }, + { + "epoch": 2.8544731610337974, + "grad_norm": 0.02455044910311699, + "learning_rate": 2.899985588705967e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42153504, + "step": 21537 + }, + { + "epoch": 2.8546056991385025, + "grad_norm": 6.4873809814453125, + "learning_rate": 2.894715901768097e-08, + "loss": 0.0721, + "num_input_tokens_seen": 42155608, + "step": 21538 + }, + { + "epoch": 2.8547382372432075, + "grad_norm": 0.05195096880197525, + "learning_rate": 2.8894509791906055e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42157568, + "step": 21539 + }, + { + "epoch": 2.8548707753479126, + "grad_norm": 0.0142402034252882, + "learning_rate": 2.8841908210749114e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42159160, + "step": 21540 + }, + { + "epoch": 2.8550033134526176, + "grad_norm": 0.08431531488895416, + "learning_rate": 2.8789354275225444e-08, + "loss": 0.0004, + "num_input_tokens_seen": 42161712, + "step": 21541 + }, + { + "epoch": 2.8551358515573226, + "grad_norm": 0.004095221403986216, + "learning_rate": 2.873684798634785e-08, + "loss": 0.0, + "num_input_tokens_seen": 42162848, + "step": 21542 + }, + { + "epoch": 2.8552683896620277, + "grad_norm": 0.0011919178068637848, + "learning_rate": 2.8684389345128295e-08, + "loss": 0.0, + "num_input_tokens_seen": 42163880, + "step": 21543 + }, + { + "epoch": 2.855400927766733, + "grad_norm": 3.6090047359466553, + "learning_rate": 2.863197835257875e-08, + "loss": 0.0129, + "num_input_tokens_seen": 42165136, + "step": 21544 + }, + { + "epoch": 2.855533465871438, + "grad_norm": 0.031554218381643295, + "learning_rate": 2.8579615009709517e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42167088, + "step": 21545 + }, + { + "epoch": 2.8556660039761432, + "grad_norm": 2.682666778564453, + "learning_rate": 2.8527299317530345e-08, + "loss": 0.0618, + "num_input_tokens_seen": 42169224, + "step": 21546 + }, + { + "epoch": 2.8557985420808483, + "grad_norm": 0.22623561322689056, + "learning_rate": 2.8475031277050145e-08, + "loss": 0.0015, + "num_input_tokens_seen": 42171216, + "step": 21547 + }, + { + "epoch": 2.8559310801855533, + "grad_norm": 11.862945556640625, + "learning_rate": 2.8422810889275898e-08, + "loss": 0.1963, + "num_input_tokens_seen": 42173960, + "step": 21548 + }, + { + "epoch": 2.8560636182902583, + "grad_norm": 0.015091180801391602, + "learning_rate": 2.8370638155215125e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42175728, + "step": 21549 + }, + { + "epoch": 2.8561961563949634, + "grad_norm": 6.337652683258057, + "learning_rate": 2.8318513075873132e-08, + "loss": 0.0489, + "num_input_tokens_seen": 42177640, + "step": 21550 + }, + { + "epoch": 2.856328694499669, + "grad_norm": 3.359755039215088, + "learning_rate": 2.8266435652255785e-08, + "loss": 0.0186, + "num_input_tokens_seen": 42179592, + "step": 21551 + }, + { + "epoch": 2.856461232604374, + "grad_norm": 1.9638458490371704, + "learning_rate": 2.8214405885366724e-08, + "loss": 0.0113, + "num_input_tokens_seen": 42181832, + "step": 21552 + }, + { + "epoch": 2.856593770709079, + "grad_norm": 0.8414820432662964, + "learning_rate": 2.8162423776209035e-08, + "loss": 0.0031, + "num_input_tokens_seen": 42183512, + "step": 21553 + }, + { + "epoch": 2.856726308813784, + "grad_norm": 9.75345230102539, + "learning_rate": 2.8110489325784974e-08, + "loss": 0.0736, + "num_input_tokens_seen": 42185016, + "step": 21554 + }, + { + "epoch": 2.856858846918489, + "grad_norm": 5.08085298538208, + "learning_rate": 2.8058602535095957e-08, + "loss": 0.1567, + "num_input_tokens_seen": 42186624, + "step": 21555 + }, + { + "epoch": 2.856991385023194, + "grad_norm": 7.474252223968506, + "learning_rate": 2.8006763405142302e-08, + "loss": 0.0576, + "num_input_tokens_seen": 42188296, + "step": 21556 + }, + { + "epoch": 2.857123923127899, + "grad_norm": 0.012581606395542622, + "learning_rate": 2.7954971936923758e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42190144, + "step": 21557 + }, + { + "epoch": 2.8572564612326046, + "grad_norm": 3.340372323989868, + "learning_rate": 2.7903228131438697e-08, + "loss": 0.0196, + "num_input_tokens_seen": 42192488, + "step": 21558 + }, + { + "epoch": 2.8573889993373096, + "grad_norm": 11.64097785949707, + "learning_rate": 2.7851531989684932e-08, + "loss": 0.0741, + "num_input_tokens_seen": 42195144, + "step": 21559 + }, + { + "epoch": 2.8575215374420146, + "grad_norm": 0.012781533412635326, + "learning_rate": 2.779988351265861e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42196904, + "step": 21560 + }, + { + "epoch": 2.8576540755467197, + "grad_norm": 0.5476394295692444, + "learning_rate": 2.774828270135643e-08, + "loss": 0.0017, + "num_input_tokens_seen": 42198904, + "step": 21561 + }, + { + "epoch": 2.8577866136514247, + "grad_norm": 0.5305120944976807, + "learning_rate": 2.7696729556772884e-08, + "loss": 0.0014, + "num_input_tokens_seen": 42200560, + "step": 21562 + }, + { + "epoch": 2.8579191517561298, + "grad_norm": 4.503376007080078, + "learning_rate": 2.7645224079901887e-08, + "loss": 0.047, + "num_input_tokens_seen": 42203152, + "step": 21563 + }, + { + "epoch": 2.858051689860835, + "grad_norm": 0.09058692306280136, + "learning_rate": 2.7593766271736544e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42205832, + "step": 21564 + }, + { + "epoch": 2.8581842279655403, + "grad_norm": 3.534839391708374, + "learning_rate": 2.7542356133268832e-08, + "loss": 0.01, + "num_input_tokens_seen": 42207688, + "step": 21565 + }, + { + "epoch": 2.8583167660702453, + "grad_norm": 15.308016777038574, + "learning_rate": 2.7490993665490462e-08, + "loss": 0.0819, + "num_input_tokens_seen": 42209232, + "step": 21566 + }, + { + "epoch": 2.8584493041749504, + "grad_norm": 2.941460132598877, + "learning_rate": 2.7439678869390917e-08, + "loss": 0.0093, + "num_input_tokens_seen": 42210632, + "step": 21567 + }, + { + "epoch": 2.8585818422796554, + "grad_norm": 3.7181968688964844, + "learning_rate": 2.7388411745960518e-08, + "loss": 0.03, + "num_input_tokens_seen": 42212784, + "step": 21568 + }, + { + "epoch": 2.8587143803843604, + "grad_norm": 6.756582736968994, + "learning_rate": 2.7337192296187364e-08, + "loss": 0.0859, + "num_input_tokens_seen": 42215224, + "step": 21569 + }, + { + "epoch": 2.8588469184890655, + "grad_norm": 0.0030467670876532793, + "learning_rate": 2.7286020521058998e-08, + "loss": 0.0, + "num_input_tokens_seen": 42216896, + "step": 21570 + }, + { + "epoch": 2.8589794565937705, + "grad_norm": 0.009037742391228676, + "learning_rate": 2.7234896421561573e-08, + "loss": 0.0, + "num_input_tokens_seen": 42218184, + "step": 21571 + }, + { + "epoch": 2.859111994698476, + "grad_norm": 2.048365592956543, + "learning_rate": 2.7183819998681526e-08, + "loss": 0.0052, + "num_input_tokens_seen": 42220232, + "step": 21572 + }, + { + "epoch": 2.859244532803181, + "grad_norm": 0.003953130450099707, + "learning_rate": 2.7132791253403345e-08, + "loss": 0.0, + "num_input_tokens_seen": 42221760, + "step": 21573 + }, + { + "epoch": 2.859377070907886, + "grad_norm": 6.039165019989014, + "learning_rate": 2.7081810186710688e-08, + "loss": 0.0333, + "num_input_tokens_seen": 42223472, + "step": 21574 + }, + { + "epoch": 2.859509609012591, + "grad_norm": 4.942966938018799, + "learning_rate": 2.7030876799586935e-08, + "loss": 0.0221, + "num_input_tokens_seen": 42225488, + "step": 21575 + }, + { + "epoch": 2.859642147117296, + "grad_norm": 0.8841825723648071, + "learning_rate": 2.69799910930138e-08, + "loss": 0.0044, + "num_input_tokens_seen": 42226984, + "step": 21576 + }, + { + "epoch": 2.859774685222001, + "grad_norm": 18.89573097229004, + "learning_rate": 2.6929153067972447e-08, + "loss": 0.1817, + "num_input_tokens_seen": 42229656, + "step": 21577 + }, + { + "epoch": 2.859907223326706, + "grad_norm": 15.131582260131836, + "learning_rate": 2.6878362725443198e-08, + "loss": 0.2164, + "num_input_tokens_seen": 42231448, + "step": 21578 + }, + { + "epoch": 2.8600397614314117, + "grad_norm": 0.0098275700584054, + "learning_rate": 2.6827620066405267e-08, + "loss": 0.0, + "num_input_tokens_seen": 42232736, + "step": 21579 + }, + { + "epoch": 2.8601722995361167, + "grad_norm": 2.343675374984741, + "learning_rate": 2.677692509183677e-08, + "loss": 0.0037, + "num_input_tokens_seen": 42235104, + "step": 21580 + }, + { + "epoch": 2.8603048376408218, + "grad_norm": 0.017611177638173103, + "learning_rate": 2.6726277802715528e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42237288, + "step": 21581 + }, + { + "epoch": 2.860437375745527, + "grad_norm": 6.342570781707764, + "learning_rate": 2.667567820001771e-08, + "loss": 0.0689, + "num_input_tokens_seen": 42239728, + "step": 21582 + }, + { + "epoch": 2.860569913850232, + "grad_norm": 0.3231405019760132, + "learning_rate": 2.6625126284719192e-08, + "loss": 0.0012, + "num_input_tokens_seen": 42241704, + "step": 21583 + }, + { + "epoch": 2.860702451954937, + "grad_norm": 7.155397415161133, + "learning_rate": 2.6574622057794762e-08, + "loss": 0.0473, + "num_input_tokens_seen": 42244248, + "step": 21584 + }, + { + "epoch": 2.860834990059642, + "grad_norm": 2.798198699951172, + "learning_rate": 2.6524165520217526e-08, + "loss": 0.0205, + "num_input_tokens_seen": 42247440, + "step": 21585 + }, + { + "epoch": 2.8609675281643474, + "grad_norm": 0.8360603451728821, + "learning_rate": 2.647375667296087e-08, + "loss": 0.0014, + "num_input_tokens_seen": 42248680, + "step": 21586 + }, + { + "epoch": 2.8611000662690524, + "grad_norm": 0.14519105851650238, + "learning_rate": 2.6423395516996798e-08, + "loss": 0.0004, + "num_input_tokens_seen": 42250360, + "step": 21587 + }, + { + "epoch": 2.8612326043737575, + "grad_norm": 5.92405366897583, + "learning_rate": 2.637308205329592e-08, + "loss": 0.0208, + "num_input_tokens_seen": 42252944, + "step": 21588 + }, + { + "epoch": 2.8613651424784625, + "grad_norm": 5.489123344421387, + "learning_rate": 2.6322816282828567e-08, + "loss": 0.0588, + "num_input_tokens_seen": 42254632, + "step": 21589 + }, + { + "epoch": 2.8614976805831676, + "grad_norm": 0.11977170407772064, + "learning_rate": 2.6272598206563692e-08, + "loss": 0.0005, + "num_input_tokens_seen": 42256904, + "step": 21590 + }, + { + "epoch": 2.861630218687873, + "grad_norm": 4.432560920715332, + "learning_rate": 2.6222427825469687e-08, + "loss": 0.0497, + "num_input_tokens_seen": 42258792, + "step": 21591 + }, + { + "epoch": 2.8617627567925776, + "grad_norm": 0.0009176672901958227, + "learning_rate": 2.617230514051383e-08, + "loss": 0.0, + "num_input_tokens_seen": 42260112, + "step": 21592 + }, + { + "epoch": 2.861895294897283, + "grad_norm": 0.002038666745647788, + "learning_rate": 2.6122230152662576e-08, + "loss": 0.0, + "num_input_tokens_seen": 42261280, + "step": 21593 + }, + { + "epoch": 2.862027833001988, + "grad_norm": 16.328203201293945, + "learning_rate": 2.6072202862881534e-08, + "loss": 0.1102, + "num_input_tokens_seen": 42263592, + "step": 21594 + }, + { + "epoch": 2.862160371106693, + "grad_norm": 0.03127090632915497, + "learning_rate": 2.602222327213494e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42265544, + "step": 21595 + }, + { + "epoch": 2.8622929092113982, + "grad_norm": 0.05780075490474701, + "learning_rate": 2.597229138138646e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42267160, + "step": 21596 + }, + { + "epoch": 2.8624254473161033, + "grad_norm": 1.4880731105804443, + "learning_rate": 2.592240719159894e-08, + "loss": 0.0026, + "num_input_tokens_seen": 42269168, + "step": 21597 + }, + { + "epoch": 2.8625579854208087, + "grad_norm": 0.032713957130908966, + "learning_rate": 2.587257070373439e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42271256, + "step": 21598 + }, + { + "epoch": 2.8626905235255133, + "grad_norm": 0.010266105644404888, + "learning_rate": 2.5822781918753426e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42272480, + "step": 21599 + }, + { + "epoch": 2.862823061630219, + "grad_norm": 0.0373479388654232, + "learning_rate": 2.5773040837616116e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42274784, + "step": 21600 + }, + { + "epoch": 2.862955599734924, + "grad_norm": 6.0270562171936035, + "learning_rate": 2.572334746128141e-08, + "loss": 0.0411, + "num_input_tokens_seen": 42276328, + "step": 21601 + }, + { + "epoch": 2.863088137839629, + "grad_norm": 0.4605582058429718, + "learning_rate": 2.567370179070744e-08, + "loss": 0.0006, + "num_input_tokens_seen": 42277968, + "step": 21602 + }, + { + "epoch": 2.863220675944334, + "grad_norm": 0.02336639165878296, + "learning_rate": 2.562410382685149e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42279776, + "step": 21603 + }, + { + "epoch": 2.863353214049039, + "grad_norm": 0.003615527180954814, + "learning_rate": 2.5574553570669736e-08, + "loss": 0.0, + "num_input_tokens_seen": 42281928, + "step": 21604 + }, + { + "epoch": 2.8634857521537445, + "grad_norm": 5.4126973152160645, + "learning_rate": 2.5525051023117807e-08, + "loss": 0.0255, + "num_input_tokens_seen": 42283800, + "step": 21605 + }, + { + "epoch": 2.863618290258449, + "grad_norm": 0.001508770976215601, + "learning_rate": 2.5475596185149944e-08, + "loss": 0.0, + "num_input_tokens_seen": 42285296, + "step": 21606 + }, + { + "epoch": 2.8637508283631545, + "grad_norm": 53.955726623535156, + "learning_rate": 2.542618905771954e-08, + "loss": 0.9585, + "num_input_tokens_seen": 42287376, + "step": 21607 + }, + { + "epoch": 2.8638833664678596, + "grad_norm": 16.93720054626465, + "learning_rate": 2.5376829641779177e-08, + "loss": 0.0801, + "num_input_tokens_seen": 42288912, + "step": 21608 + }, + { + "epoch": 2.8640159045725646, + "grad_norm": 2.663635492324829, + "learning_rate": 2.5327517938280867e-08, + "loss": 0.0158, + "num_input_tokens_seen": 42291776, + "step": 21609 + }, + { + "epoch": 2.8641484426772696, + "grad_norm": 0.9739689230918884, + "learning_rate": 2.5278253948175237e-08, + "loss": 0.0062, + "num_input_tokens_seen": 42293800, + "step": 21610 + }, + { + "epoch": 2.8642809807819747, + "grad_norm": 2.988729476928711, + "learning_rate": 2.522903767241208e-08, + "loss": 0.0103, + "num_input_tokens_seen": 42295512, + "step": 21611 + }, + { + "epoch": 2.86441351888668, + "grad_norm": 2.6339926719665527, + "learning_rate": 2.517986911194009e-08, + "loss": 0.0075, + "num_input_tokens_seen": 42298152, + "step": 21612 + }, + { + "epoch": 2.8645460569913848, + "grad_norm": 3.9611356258392334, + "learning_rate": 2.513074826770795e-08, + "loss": 0.0114, + "num_input_tokens_seen": 42299992, + "step": 21613 + }, + { + "epoch": 2.8646785950960902, + "grad_norm": 0.027816010639071465, + "learning_rate": 2.508167514066212e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42301920, + "step": 21614 + }, + { + "epoch": 2.8648111332007953, + "grad_norm": 0.0120164779946208, + "learning_rate": 2.5032649731748792e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42303096, + "step": 21615 + }, + { + "epoch": 2.8649436713055003, + "grad_norm": 7.948295593261719, + "learning_rate": 2.4983672041913598e-08, + "loss": 0.1091, + "num_input_tokens_seen": 42304824, + "step": 21616 + }, + { + "epoch": 2.8650762094102054, + "grad_norm": 2.3466310501098633, + "learning_rate": 2.4934742072100505e-08, + "loss": 0.0074, + "num_input_tokens_seen": 42307216, + "step": 21617 + }, + { + "epoch": 2.8652087475149104, + "grad_norm": 0.0011500369291752577, + "learning_rate": 2.4885859823252924e-08, + "loss": 0.0, + "num_input_tokens_seen": 42308328, + "step": 21618 + }, + { + "epoch": 2.865341285619616, + "grad_norm": 8.245307922363281, + "learning_rate": 2.4837025296313712e-08, + "loss": 0.1021, + "num_input_tokens_seen": 42310416, + "step": 21619 + }, + { + "epoch": 2.8654738237243205, + "grad_norm": 0.10181114822626114, + "learning_rate": 2.478823849222406e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42312736, + "step": 21620 + }, + { + "epoch": 2.865606361829026, + "grad_norm": 0.00629626726731658, + "learning_rate": 2.4739499411924607e-08, + "loss": 0.0, + "num_input_tokens_seen": 42315488, + "step": 21621 + }, + { + "epoch": 2.865738899933731, + "grad_norm": 6.9653778076171875, + "learning_rate": 2.4690808056355432e-08, + "loss": 0.0901, + "num_input_tokens_seen": 42317952, + "step": 21622 + }, + { + "epoch": 2.865871438038436, + "grad_norm": 0.023661013692617416, + "learning_rate": 2.464216442645495e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42319536, + "step": 21623 + }, + { + "epoch": 2.866003976143141, + "grad_norm": 0.28355541825294495, + "learning_rate": 2.4593568523161303e-08, + "loss": 0.0013, + "num_input_tokens_seen": 42321344, + "step": 21624 + }, + { + "epoch": 2.866136514247846, + "grad_norm": 10.655150413513184, + "learning_rate": 2.454502034741152e-08, + "loss": 0.1781, + "num_input_tokens_seen": 42323304, + "step": 21625 + }, + { + "epoch": 2.8662690523525516, + "grad_norm": 0.012025518342852592, + "learning_rate": 2.4496519900140957e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42324784, + "step": 21626 + }, + { + "epoch": 2.8664015904572566, + "grad_norm": 11.229488372802734, + "learning_rate": 2.4448067182285818e-08, + "loss": 0.1752, + "num_input_tokens_seen": 42326496, + "step": 21627 + }, + { + "epoch": 2.8665341285619617, + "grad_norm": 6.366613388061523, + "learning_rate": 2.4399662194779238e-08, + "loss": 0.0481, + "num_input_tokens_seen": 42329376, + "step": 21628 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 1.2863168716430664, + "learning_rate": 2.4351304938555196e-08, + "loss": 0.006, + "num_input_tokens_seen": 42331800, + "step": 21629 + }, + { + "epoch": 2.8667992047713717, + "grad_norm": 9.116541862487793, + "learning_rate": 2.4302995414546005e-08, + "loss": 0.0714, + "num_input_tokens_seen": 42333480, + "step": 21630 + }, + { + "epoch": 2.8669317428760768, + "grad_norm": 23.86290168762207, + "learning_rate": 2.4254733623682858e-08, + "loss": 0.1617, + "num_input_tokens_seen": 42336456, + "step": 21631 + }, + { + "epoch": 2.867064280980782, + "grad_norm": 0.014077488332986832, + "learning_rate": 2.4206519566896403e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42338240, + "step": 21632 + }, + { + "epoch": 2.8671968190854873, + "grad_norm": 0.4011901617050171, + "learning_rate": 2.415835324511617e-08, + "loss": 0.0008, + "num_input_tokens_seen": 42340160, + "step": 21633 + }, + { + "epoch": 2.8673293571901923, + "grad_norm": 0.0015562600456178188, + "learning_rate": 2.411023465927087e-08, + "loss": 0.0, + "num_input_tokens_seen": 42341928, + "step": 21634 + }, + { + "epoch": 2.8674618952948974, + "grad_norm": 0.1240721195936203, + "learning_rate": 2.4062163810288363e-08, + "loss": 0.0004, + "num_input_tokens_seen": 42343448, + "step": 21635 + }, + { + "epoch": 2.8675944333996024, + "grad_norm": 1.2250924110412598, + "learning_rate": 2.401414069909541e-08, + "loss": 0.0094, + "num_input_tokens_seen": 42345328, + "step": 21636 + }, + { + "epoch": 2.8677269715043074, + "grad_norm": 2.543734550476074, + "learning_rate": 2.396616532661794e-08, + "loss": 0.0108, + "num_input_tokens_seen": 42348208, + "step": 21637 + }, + { + "epoch": 2.8678595096090125, + "grad_norm": 0.0018578172894194722, + "learning_rate": 2.3918237693780767e-08, + "loss": 0.0, + "num_input_tokens_seen": 42349432, + "step": 21638 + }, + { + "epoch": 2.8679920477137175, + "grad_norm": 0.0189815703779459, + "learning_rate": 2.387035780150787e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42350520, + "step": 21639 + }, + { + "epoch": 2.868124585818423, + "grad_norm": 0.00974790845066309, + "learning_rate": 2.382252565072324e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42352304, + "step": 21640 + }, + { + "epoch": 2.868257123923128, + "grad_norm": 4.263134956359863, + "learning_rate": 2.3774741242348355e-08, + "loss": 0.0239, + "num_input_tokens_seen": 42354096, + "step": 21641 + }, + { + "epoch": 2.868389662027833, + "grad_norm": 4.719553470611572, + "learning_rate": 2.37270045773047e-08, + "loss": 0.0676, + "num_input_tokens_seen": 42356560, + "step": 21642 + }, + { + "epoch": 2.868522200132538, + "grad_norm": 0.03760210797190666, + "learning_rate": 2.3679315656512382e-08, + "loss": 0.0004, + "num_input_tokens_seen": 42359408, + "step": 21643 + }, + { + "epoch": 2.868654738237243, + "grad_norm": 1.6443088054656982, + "learning_rate": 2.3631674480891486e-08, + "loss": 0.023, + "num_input_tokens_seen": 42362096, + "step": 21644 + }, + { + "epoch": 2.868787276341948, + "grad_norm": 0.043302327394485474, + "learning_rate": 2.3584081051360174e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42364392, + "step": 21645 + }, + { + "epoch": 2.8689198144466532, + "grad_norm": 0.005287370178848505, + "learning_rate": 2.3536535368836323e-08, + "loss": 0.0, + "num_input_tokens_seen": 42366888, + "step": 21646 + }, + { + "epoch": 2.8690523525513587, + "grad_norm": 0.23173081874847412, + "learning_rate": 2.3489037434236418e-08, + "loss": 0.001, + "num_input_tokens_seen": 42369392, + "step": 21647 + }, + { + "epoch": 2.8691848906560637, + "grad_norm": 3.143131971359253, + "learning_rate": 2.3441587248476394e-08, + "loss": 0.0566, + "num_input_tokens_seen": 42372128, + "step": 21648 + }, + { + "epoch": 2.869317428760769, + "grad_norm": 0.008277269080281258, + "learning_rate": 2.3394184812471078e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42373696, + "step": 21649 + }, + { + "epoch": 2.869449966865474, + "grad_norm": 0.020477479323744774, + "learning_rate": 2.334683012713418e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42375600, + "step": 21650 + }, + { + "epoch": 2.869582504970179, + "grad_norm": 0.20695693790912628, + "learning_rate": 2.3299523193379138e-08, + "loss": 0.0007, + "num_input_tokens_seen": 42378040, + "step": 21651 + }, + { + "epoch": 2.869715043074884, + "grad_norm": 0.0027679097838699818, + "learning_rate": 2.325226401211772e-08, + "loss": 0.0, + "num_input_tokens_seen": 42379232, + "step": 21652 + }, + { + "epoch": 2.869847581179589, + "grad_norm": 5.683372974395752, + "learning_rate": 2.320505258426142e-08, + "loss": 0.0388, + "num_input_tokens_seen": 42380600, + "step": 21653 + }, + { + "epoch": 2.8699801192842944, + "grad_norm": 0.09511028230190277, + "learning_rate": 2.315788891072035e-08, + "loss": 0.0006, + "num_input_tokens_seen": 42383328, + "step": 21654 + }, + { + "epoch": 2.8701126573889995, + "grad_norm": 0.20621126890182495, + "learning_rate": 2.3110772992403774e-08, + "loss": 0.0009, + "num_input_tokens_seen": 42385128, + "step": 21655 + }, + { + "epoch": 2.8702451954937045, + "grad_norm": 0.00502427015453577, + "learning_rate": 2.3063704830220136e-08, + "loss": 0.0, + "num_input_tokens_seen": 42387344, + "step": 21656 + }, + { + "epoch": 2.8703777335984095, + "grad_norm": 6.2559099197387695, + "learning_rate": 2.301668442507732e-08, + "loss": 0.033, + "num_input_tokens_seen": 42389192, + "step": 21657 + }, + { + "epoch": 2.8705102717031146, + "grad_norm": 3.7927937507629395, + "learning_rate": 2.2969711777881264e-08, + "loss": 0.019, + "num_input_tokens_seen": 42392288, + "step": 21658 + }, + { + "epoch": 2.8706428098078196, + "grad_norm": 0.008257139474153519, + "learning_rate": 2.2922786889538196e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42395112, + "step": 21659 + }, + { + "epoch": 2.8707753479125246, + "grad_norm": 10.56733226776123, + "learning_rate": 2.2875909760952665e-08, + "loss": 0.0757, + "num_input_tokens_seen": 42397064, + "step": 21660 + }, + { + "epoch": 2.87090788601723, + "grad_norm": 6.8338541984558105, + "learning_rate": 2.282908039302839e-08, + "loss": 0.0192, + "num_input_tokens_seen": 42398960, + "step": 21661 + }, + { + "epoch": 2.871040424121935, + "grad_norm": 20.688587188720703, + "learning_rate": 2.2782298786668265e-08, + "loss": 0.1028, + "num_input_tokens_seen": 42400576, + "step": 21662 + }, + { + "epoch": 2.87117296222664, + "grad_norm": 10.25086784362793, + "learning_rate": 2.2735564942774622e-08, + "loss": 0.0355, + "num_input_tokens_seen": 42402832, + "step": 21663 + }, + { + "epoch": 2.8713055003313452, + "grad_norm": 0.3733934462070465, + "learning_rate": 2.268887886224813e-08, + "loss": 0.0012, + "num_input_tokens_seen": 42404392, + "step": 21664 + }, + { + "epoch": 2.8714380384360503, + "grad_norm": 7.752166748046875, + "learning_rate": 2.2642240545988904e-08, + "loss": 0.1862, + "num_input_tokens_seen": 42406208, + "step": 21665 + }, + { + "epoch": 2.8715705765407553, + "grad_norm": 0.15452830493450165, + "learning_rate": 2.2595649994896494e-08, + "loss": 0.0011, + "num_input_tokens_seen": 42407744, + "step": 21666 + }, + { + "epoch": 2.8717031146454604, + "grad_norm": 0.09544361382722855, + "learning_rate": 2.2549107209869082e-08, + "loss": 0.0008, + "num_input_tokens_seen": 42409480, + "step": 21667 + }, + { + "epoch": 2.871835652750166, + "grad_norm": 3.3384268283843994, + "learning_rate": 2.2502612191803995e-08, + "loss": 0.0086, + "num_input_tokens_seen": 42411240, + "step": 21668 + }, + { + "epoch": 2.871968190854871, + "grad_norm": 1.4291062355041504, + "learning_rate": 2.2456164941597468e-08, + "loss": 0.0036, + "num_input_tokens_seen": 42413320, + "step": 21669 + }, + { + "epoch": 2.872100728959576, + "grad_norm": 9.687260627746582, + "learning_rate": 2.240976546014545e-08, + "loss": 0.1018, + "num_input_tokens_seen": 42414960, + "step": 21670 + }, + { + "epoch": 2.872233267064281, + "grad_norm": 3.645359754562378, + "learning_rate": 2.2363413748342223e-08, + "loss": 0.0463, + "num_input_tokens_seen": 42417040, + "step": 21671 + }, + { + "epoch": 2.872365805168986, + "grad_norm": 0.12051080912351608, + "learning_rate": 2.2317109807081794e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42418416, + "step": 21672 + }, + { + "epoch": 2.872498343273691, + "grad_norm": 0.31452521681785583, + "learning_rate": 2.2270853637256507e-08, + "loss": 0.0011, + "num_input_tokens_seen": 42419912, + "step": 21673 + }, + { + "epoch": 2.872630881378396, + "grad_norm": 7.765163421630859, + "learning_rate": 2.2224645239758424e-08, + "loss": 0.088, + "num_input_tokens_seen": 42422232, + "step": 21674 + }, + { + "epoch": 2.8727634194831015, + "grad_norm": 6.41986608505249, + "learning_rate": 2.217848461547878e-08, + "loss": 0.0684, + "num_input_tokens_seen": 42423704, + "step": 21675 + }, + { + "epoch": 2.8728959575878066, + "grad_norm": 8.917693138122559, + "learning_rate": 2.2132371765307137e-08, + "loss": 0.0417, + "num_input_tokens_seen": 42426456, + "step": 21676 + }, + { + "epoch": 2.8730284956925116, + "grad_norm": 0.028219355270266533, + "learning_rate": 2.2086306690133062e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42428776, + "step": 21677 + }, + { + "epoch": 2.8731610337972167, + "grad_norm": 0.0066528404131531715, + "learning_rate": 2.2040289390844183e-08, + "loss": 0.0, + "num_input_tokens_seen": 42430672, + "step": 21678 + }, + { + "epoch": 2.8732935719019217, + "grad_norm": 3.178255796432495, + "learning_rate": 2.199431986832784e-08, + "loss": 0.0377, + "num_input_tokens_seen": 42432936, + "step": 21679 + }, + { + "epoch": 2.873426110006627, + "grad_norm": 3.7675628662109375, + "learning_rate": 2.194839812347055e-08, + "loss": 0.0282, + "num_input_tokens_seen": 42434432, + "step": 21680 + }, + { + "epoch": 2.8735586481113318, + "grad_norm": 3.441983222961426, + "learning_rate": 2.1902524157157988e-08, + "loss": 0.0081, + "num_input_tokens_seen": 42436304, + "step": 21681 + }, + { + "epoch": 2.8736911862160373, + "grad_norm": 0.007487286813557148, + "learning_rate": 2.18566979702739e-08, + "loss": 0.0, + "num_input_tokens_seen": 42439264, + "step": 21682 + }, + { + "epoch": 2.8738237243207423, + "grad_norm": 1.8212895393371582, + "learning_rate": 2.181091956370257e-08, + "loss": 0.0116, + "num_input_tokens_seen": 42441544, + "step": 21683 + }, + { + "epoch": 2.8739562624254473, + "grad_norm": 0.8910925984382629, + "learning_rate": 2.1765188938326354e-08, + "loss": 0.0048, + "num_input_tokens_seen": 42442816, + "step": 21684 + }, + { + "epoch": 2.8740888005301524, + "grad_norm": 9.363848686218262, + "learning_rate": 2.1719506095026767e-08, + "loss": 0.2196, + "num_input_tokens_seen": 42445088, + "step": 21685 + }, + { + "epoch": 2.8742213386348574, + "grad_norm": 0.5202776193618774, + "learning_rate": 2.1673871034684492e-08, + "loss": 0.0025, + "num_input_tokens_seen": 42447032, + "step": 21686 + }, + { + "epoch": 2.874353876739563, + "grad_norm": 9.840773582458496, + "learning_rate": 2.1628283758180215e-08, + "loss": 0.0919, + "num_input_tokens_seen": 42448936, + "step": 21687 + }, + { + "epoch": 2.8744864148442675, + "grad_norm": 0.005742209032177925, + "learning_rate": 2.158274426639212e-08, + "loss": 0.0, + "num_input_tokens_seen": 42450832, + "step": 21688 + }, + { + "epoch": 2.874618952948973, + "grad_norm": 0.03901435807347298, + "learning_rate": 2.1537252560198674e-08, + "loss": 0.0004, + "num_input_tokens_seen": 42452984, + "step": 21689 + }, + { + "epoch": 2.874751491053678, + "grad_norm": 0.006571086123585701, + "learning_rate": 2.1491808640476396e-08, + "loss": 0.0, + "num_input_tokens_seen": 42455736, + "step": 21690 + }, + { + "epoch": 2.874884029158383, + "grad_norm": 4.665439128875732, + "learning_rate": 2.1446412508102366e-08, + "loss": 0.0328, + "num_input_tokens_seen": 42458072, + "step": 21691 + }, + { + "epoch": 2.875016567263088, + "grad_norm": 0.6568962931632996, + "learning_rate": 2.140106416395088e-08, + "loss": 0.0027, + "num_input_tokens_seen": 42459448, + "step": 21692 + }, + { + "epoch": 2.875149105367793, + "grad_norm": 2.5970985889434814, + "learning_rate": 2.1355763608897072e-08, + "loss": 0.0147, + "num_input_tokens_seen": 42461104, + "step": 21693 + }, + { + "epoch": 2.8752816434724986, + "grad_norm": 0.8434082865715027, + "learning_rate": 2.1310510843814137e-08, + "loss": 0.0022, + "num_input_tokens_seen": 42463064, + "step": 21694 + }, + { + "epoch": 2.875414181577203, + "grad_norm": 0.04172539338469505, + "learning_rate": 2.1265305869574426e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42464944, + "step": 21695 + }, + { + "epoch": 2.8755467196819087, + "grad_norm": 0.002152903936803341, + "learning_rate": 2.122014868704947e-08, + "loss": 0.0, + "num_input_tokens_seen": 42466488, + "step": 21696 + }, + { + "epoch": 2.8756792577866137, + "grad_norm": 6.342823505401611, + "learning_rate": 2.1175039297110235e-08, + "loss": 0.039, + "num_input_tokens_seen": 42467848, + "step": 21697 + }, + { + "epoch": 2.8758117958913187, + "grad_norm": 0.06377577781677246, + "learning_rate": 2.1129977700626304e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42470000, + "step": 21698 + }, + { + "epoch": 2.875944333996024, + "grad_norm": 1.3428319692611694, + "learning_rate": 2.1084963898466427e-08, + "loss": 0.0027, + "num_input_tokens_seen": 42471216, + "step": 21699 + }, + { + "epoch": 2.876076872100729, + "grad_norm": 0.01016664132475853, + "learning_rate": 2.1039997891498522e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42474608, + "step": 21700 + }, + { + "epoch": 2.8762094102054343, + "grad_norm": 12.080531120300293, + "learning_rate": 2.099507968058967e-08, + "loss": 0.1034, + "num_input_tokens_seen": 42476360, + "step": 21701 + }, + { + "epoch": 2.876341948310139, + "grad_norm": 0.036984700709581375, + "learning_rate": 2.0950209266605846e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42479440, + "step": 21702 + }, + { + "epoch": 2.8764744864148444, + "grad_norm": 8.265290260314941, + "learning_rate": 2.0905386650412197e-08, + "loss": 0.1447, + "num_input_tokens_seen": 42481464, + "step": 21703 + }, + { + "epoch": 2.8766070245195494, + "grad_norm": 0.4845183193683624, + "learning_rate": 2.0860611832873024e-08, + "loss": 0.0022, + "num_input_tokens_seen": 42483000, + "step": 21704 + }, + { + "epoch": 2.8767395626242545, + "grad_norm": 0.013882378116250038, + "learning_rate": 2.081588481485125e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42484728, + "step": 21705 + }, + { + "epoch": 2.8768721007289595, + "grad_norm": 0.2727926969528198, + "learning_rate": 2.0771205597209798e-08, + "loss": 0.0009, + "num_input_tokens_seen": 42486768, + "step": 21706 + }, + { + "epoch": 2.8770046388336645, + "grad_norm": 0.015065119601786137, + "learning_rate": 2.0726574180809646e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42488552, + "step": 21707 + }, + { + "epoch": 2.87713717693837, + "grad_norm": 0.021624000743031502, + "learning_rate": 2.0681990566511768e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42490104, + "step": 21708 + }, + { + "epoch": 2.877269715043075, + "grad_norm": 0.14539587497711182, + "learning_rate": 2.0637454755175203e-08, + "loss": 0.0009, + "num_input_tokens_seen": 42492544, + "step": 21709 + }, + { + "epoch": 2.87740225314778, + "grad_norm": 0.06341344863176346, + "learning_rate": 2.0592966747658983e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42494744, + "step": 21710 + }, + { + "epoch": 2.877534791252485, + "grad_norm": 0.03968055918812752, + "learning_rate": 2.0548526544821033e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42496696, + "step": 21711 + }, + { + "epoch": 2.87766732935719, + "grad_norm": 0.0029678787104785442, + "learning_rate": 2.0504134147517894e-08, + "loss": 0.0, + "num_input_tokens_seen": 42498392, + "step": 21712 + }, + { + "epoch": 2.877799867461895, + "grad_norm": 0.17011205852031708, + "learning_rate": 2.0459789556605548e-08, + "loss": 0.0008, + "num_input_tokens_seen": 42500440, + "step": 21713 + }, + { + "epoch": 2.8779324055666002, + "grad_norm": 4.736098766326904, + "learning_rate": 2.0415492772938862e-08, + "loss": 0.0498, + "num_input_tokens_seen": 42502424, + "step": 21714 + }, + { + "epoch": 2.8780649436713057, + "grad_norm": 6.867688179016113, + "learning_rate": 2.0371243797372152e-08, + "loss": 0.0531, + "num_input_tokens_seen": 42504400, + "step": 21715 + }, + { + "epoch": 2.8781974817760108, + "grad_norm": 0.007529549766331911, + "learning_rate": 2.0327042630758355e-08, + "loss": 0.0, + "num_input_tokens_seen": 42506792, + "step": 21716 + }, + { + "epoch": 2.878330019880716, + "grad_norm": 5.492095947265625, + "learning_rate": 2.0282889273950114e-08, + "loss": 0.0499, + "num_input_tokens_seen": 42508944, + "step": 21717 + }, + { + "epoch": 2.878462557985421, + "grad_norm": 0.07597468048334122, + "learning_rate": 2.0238783727798138e-08, + "loss": 0.0004, + "num_input_tokens_seen": 42510712, + "step": 21718 + }, + { + "epoch": 2.878595096090126, + "grad_norm": 6.721797943115234, + "learning_rate": 2.0194725993153418e-08, + "loss": 0.0291, + "num_input_tokens_seen": 42512656, + "step": 21719 + }, + { + "epoch": 2.878727634194831, + "grad_norm": 4.492206573486328, + "learning_rate": 2.0150716070864995e-08, + "loss": 0.0152, + "num_input_tokens_seen": 42514312, + "step": 21720 + }, + { + "epoch": 2.878860172299536, + "grad_norm": 0.4439235031604767, + "learning_rate": 2.0106753961781354e-08, + "loss": 0.0029, + "num_input_tokens_seen": 42516360, + "step": 21721 + }, + { + "epoch": 2.8789927104042414, + "grad_norm": 0.021872833371162415, + "learning_rate": 2.0062839666750154e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42519032, + "step": 21722 + }, + { + "epoch": 2.8791252485089465, + "grad_norm": 0.0074595315381884575, + "learning_rate": 2.0018973186618494e-08, + "loss": 0.0, + "num_input_tokens_seen": 42520248, + "step": 21723 + }, + { + "epoch": 2.8792577866136515, + "grad_norm": 0.01594364084303379, + "learning_rate": 1.9975154522232087e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42521784, + "step": 21724 + }, + { + "epoch": 2.8793903247183565, + "grad_norm": 0.011900728568434715, + "learning_rate": 1.9931383674435255e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42523408, + "step": 21725 + }, + { + "epoch": 2.8795228628230616, + "grad_norm": 0.009303472936153412, + "learning_rate": 1.9887660644072325e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42524568, + "step": 21726 + }, + { + "epoch": 2.8796554009277666, + "grad_norm": 6.740594387054443, + "learning_rate": 1.9843985431986235e-08, + "loss": 0.0195, + "num_input_tokens_seen": 42526728, + "step": 21727 + }, + { + "epoch": 2.8797879390324717, + "grad_norm": 0.009153252467513084, + "learning_rate": 1.9800358039019086e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42529176, + "step": 21728 + }, + { + "epoch": 2.879920477137177, + "grad_norm": 0.009961471892893314, + "learning_rate": 1.975677846601215e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42530720, + "step": 21729 + }, + { + "epoch": 2.880053015241882, + "grad_norm": 2.2072718143463135, + "learning_rate": 1.9713246713805588e-08, + "loss": 0.0043, + "num_input_tokens_seen": 42531856, + "step": 21730 + }, + { + "epoch": 2.880185553346587, + "grad_norm": 10.182709693908691, + "learning_rate": 1.9669762783238456e-08, + "loss": 0.1179, + "num_input_tokens_seen": 42534376, + "step": 21731 + }, + { + "epoch": 2.8803180914512923, + "grad_norm": 0.14054341614246368, + "learning_rate": 1.9626326675149522e-08, + "loss": 0.0006, + "num_input_tokens_seen": 42536656, + "step": 21732 + }, + { + "epoch": 2.8804506295559973, + "grad_norm": 0.009076383896172047, + "learning_rate": 1.9582938390375893e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42538096, + "step": 21733 + }, + { + "epoch": 2.8805831676607023, + "grad_norm": 0.01048740092664957, + "learning_rate": 1.9539597929754683e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42540968, + "step": 21734 + }, + { + "epoch": 2.8807157057654074, + "grad_norm": 0.07054173201322556, + "learning_rate": 1.949630529412133e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42543272, + "step": 21735 + }, + { + "epoch": 2.880848243870113, + "grad_norm": 0.03623688220977783, + "learning_rate": 1.945306048431017e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42545344, + "step": 21736 + }, + { + "epoch": 2.880980781974818, + "grad_norm": 0.028118977323174477, + "learning_rate": 1.9409863501155258e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42547176, + "step": 21737 + }, + { + "epoch": 2.881113320079523, + "grad_norm": 15.747359275817871, + "learning_rate": 1.9366714345489534e-08, + "loss": 0.1386, + "num_input_tokens_seen": 42550160, + "step": 21738 + }, + { + "epoch": 2.881245858184228, + "grad_norm": 4.383615970611572, + "learning_rate": 1.932361301814456e-08, + "loss": 0.0119, + "num_input_tokens_seen": 42551864, + "step": 21739 + }, + { + "epoch": 2.881378396288933, + "grad_norm": 2.640383005142212, + "learning_rate": 1.928055951995189e-08, + "loss": 0.0349, + "num_input_tokens_seen": 42553472, + "step": 21740 + }, + { + "epoch": 2.881510934393638, + "grad_norm": 1.5517865419387817, + "learning_rate": 1.923755385174142e-08, + "loss": 0.0109, + "num_input_tokens_seen": 42555400, + "step": 21741 + }, + { + "epoch": 2.881643472498343, + "grad_norm": 0.03025534376502037, + "learning_rate": 1.9194596014342204e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42556992, + "step": 21742 + }, + { + "epoch": 2.8817760106030486, + "grad_norm": 14.83239459991455, + "learning_rate": 1.9151686008582747e-08, + "loss": 0.3637, + "num_input_tokens_seen": 42561088, + "step": 21743 + }, + { + "epoch": 2.8819085487077536, + "grad_norm": 16.01445770263672, + "learning_rate": 1.9108823835290168e-08, + "loss": 0.2057, + "num_input_tokens_seen": 42564112, + "step": 21744 + }, + { + "epoch": 2.8820410868124586, + "grad_norm": 2.6291191577911377, + "learning_rate": 1.9066009495291028e-08, + "loss": 0.0071, + "num_input_tokens_seen": 42565552, + "step": 21745 + }, + { + "epoch": 2.8821736249171637, + "grad_norm": 0.9352185726165771, + "learning_rate": 1.9023242989410496e-08, + "loss": 0.0027, + "num_input_tokens_seen": 42567480, + "step": 21746 + }, + { + "epoch": 2.8823061630218687, + "grad_norm": 0.018471039831638336, + "learning_rate": 1.898052431847347e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42570792, + "step": 21747 + }, + { + "epoch": 2.8824387011265737, + "grad_norm": 0.0039279102347791195, + "learning_rate": 1.8937853483303735e-08, + "loss": 0.0, + "num_input_tokens_seen": 42572664, + "step": 21748 + }, + { + "epoch": 2.882571239231279, + "grad_norm": 0.02657441236078739, + "learning_rate": 1.8895230484723692e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42575224, + "step": 21749 + }, + { + "epoch": 2.8827037773359843, + "grad_norm": 12.278406143188477, + "learning_rate": 1.885265532355518e-08, + "loss": 0.0869, + "num_input_tokens_seen": 42577728, + "step": 21750 + }, + { + "epoch": 2.8828363154406893, + "grad_norm": 16.79652214050293, + "learning_rate": 1.8810128000618932e-08, + "loss": 0.0758, + "num_input_tokens_seen": 42579176, + "step": 21751 + }, + { + "epoch": 2.8829688535453943, + "grad_norm": 8.590455055236816, + "learning_rate": 1.8767648516735126e-08, + "loss": 0.1589, + "num_input_tokens_seen": 42581704, + "step": 21752 + }, + { + "epoch": 2.8831013916500994, + "grad_norm": 0.6413662433624268, + "learning_rate": 1.8725216872722828e-08, + "loss": 0.003, + "num_input_tokens_seen": 42583904, + "step": 21753 + }, + { + "epoch": 2.8832339297548044, + "grad_norm": 0.13234147429466248, + "learning_rate": 1.8682833069400274e-08, + "loss": 0.001, + "num_input_tokens_seen": 42585240, + "step": 21754 + }, + { + "epoch": 2.8833664678595095, + "grad_norm": 0.002945690881460905, + "learning_rate": 1.8640497107584586e-08, + "loss": 0.0, + "num_input_tokens_seen": 42586896, + "step": 21755 + }, + { + "epoch": 2.8834990059642145, + "grad_norm": 1.6607190370559692, + "learning_rate": 1.8598208988091504e-08, + "loss": 0.0128, + "num_input_tokens_seen": 42588888, + "step": 21756 + }, + { + "epoch": 2.88363154406892, + "grad_norm": 3.9055895805358887, + "learning_rate": 1.855596871173676e-08, + "loss": 0.0287, + "num_input_tokens_seen": 42590536, + "step": 21757 + }, + { + "epoch": 2.883764082173625, + "grad_norm": 0.08381219953298569, + "learning_rate": 1.851377627933526e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42591760, + "step": 21758 + }, + { + "epoch": 2.88389662027833, + "grad_norm": 0.008764654397964478, + "learning_rate": 1.8471631691699688e-08, + "loss": 0.0, + "num_input_tokens_seen": 42592968, + "step": 21759 + }, + { + "epoch": 2.884029158383035, + "grad_norm": 0.03767247125506401, + "learning_rate": 1.842953494964328e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42595616, + "step": 21760 + }, + { + "epoch": 2.88416169648774, + "grad_norm": 4.955318927764893, + "learning_rate": 1.8387486053977056e-08, + "loss": 0.0516, + "num_input_tokens_seen": 42598472, + "step": 21761 + }, + { + "epoch": 2.8842942345924456, + "grad_norm": 2.9059388637542725, + "learning_rate": 1.834548500551231e-08, + "loss": 0.0247, + "num_input_tokens_seen": 42600120, + "step": 21762 + }, + { + "epoch": 2.88442677269715, + "grad_norm": 25.11546516418457, + "learning_rate": 1.8303531805058394e-08, + "loss": 0.266, + "num_input_tokens_seen": 42601904, + "step": 21763 + }, + { + "epoch": 2.8845593108018557, + "grad_norm": 0.009374380111694336, + "learning_rate": 1.8261626453424663e-08, + "loss": 0.0, + "num_input_tokens_seen": 42603256, + "step": 21764 + }, + { + "epoch": 2.8846918489065607, + "grad_norm": 0.26204389333724976, + "learning_rate": 1.8219768951418527e-08, + "loss": 0.0006, + "num_input_tokens_seen": 42605320, + "step": 21765 + }, + { + "epoch": 2.8848243870112658, + "grad_norm": 4.270134925842285, + "learning_rate": 1.817795929984767e-08, + "loss": 0.0294, + "num_input_tokens_seen": 42606936, + "step": 21766 + }, + { + "epoch": 2.884956925115971, + "grad_norm": 9.965041160583496, + "learning_rate": 1.813619749951784e-08, + "loss": 0.1455, + "num_input_tokens_seen": 42609440, + "step": 21767 + }, + { + "epoch": 2.885089463220676, + "grad_norm": 21.464330673217773, + "learning_rate": 1.8094483551233943e-08, + "loss": 0.2571, + "num_input_tokens_seen": 42610944, + "step": 21768 + }, + { + "epoch": 2.8852220013253813, + "grad_norm": 3.57464599609375, + "learning_rate": 1.8052817455800897e-08, + "loss": 0.0305, + "num_input_tokens_seen": 42612848, + "step": 21769 + }, + { + "epoch": 2.885354539430086, + "grad_norm": 0.2957526743412018, + "learning_rate": 1.801119921402167e-08, + "loss": 0.0015, + "num_input_tokens_seen": 42614472, + "step": 21770 + }, + { + "epoch": 2.8854870775347914, + "grad_norm": 4.880345344543457, + "learning_rate": 1.796962882669867e-08, + "loss": 0.0183, + "num_input_tokens_seen": 42616760, + "step": 21771 + }, + { + "epoch": 2.8856196156394964, + "grad_norm": 8.757031440734863, + "learning_rate": 1.792810629463376e-08, + "loss": 0.0154, + "num_input_tokens_seen": 42619656, + "step": 21772 + }, + { + "epoch": 2.8857521537442015, + "grad_norm": 0.007824905216693878, + "learning_rate": 1.7886631618626858e-08, + "loss": 0.0, + "num_input_tokens_seen": 42620800, + "step": 21773 + }, + { + "epoch": 2.8858846918489065, + "grad_norm": 2.5010392665863037, + "learning_rate": 1.7845204799478434e-08, + "loss": 0.0097, + "num_input_tokens_seen": 42622280, + "step": 21774 + }, + { + "epoch": 2.8860172299536115, + "grad_norm": 1.4397038221359253, + "learning_rate": 1.7803825837986456e-08, + "loss": 0.0067, + "num_input_tokens_seen": 42624280, + "step": 21775 + }, + { + "epoch": 2.886149768058317, + "grad_norm": 0.34597355127334595, + "learning_rate": 1.7762494734949176e-08, + "loss": 0.0011, + "num_input_tokens_seen": 42626472, + "step": 21776 + }, + { + "epoch": 2.8862823061630216, + "grad_norm": 8.39917278289795, + "learning_rate": 1.7721211491163458e-08, + "loss": 0.0552, + "num_input_tokens_seen": 42628840, + "step": 21777 + }, + { + "epoch": 2.886414844267727, + "grad_norm": 5.2002997398376465, + "learning_rate": 1.7679976107425333e-08, + "loss": 0.0254, + "num_input_tokens_seen": 42630688, + "step": 21778 + }, + { + "epoch": 2.886547382372432, + "grad_norm": 6.080469131469727, + "learning_rate": 1.763878858452972e-08, + "loss": 0.0227, + "num_input_tokens_seen": 42633032, + "step": 21779 + }, + { + "epoch": 2.886679920477137, + "grad_norm": 2.3813304901123047, + "learning_rate": 1.7597648923270426e-08, + "loss": 0.0024, + "num_input_tokens_seen": 42634480, + "step": 21780 + }, + { + "epoch": 2.886812458581842, + "grad_norm": 0.039499830454587936, + "learning_rate": 1.7556557124441264e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42636584, + "step": 21781 + }, + { + "epoch": 2.8869449966865472, + "grad_norm": 0.25914257764816284, + "learning_rate": 1.7515513188834375e-08, + "loss": 0.0013, + "num_input_tokens_seen": 42638888, + "step": 21782 + }, + { + "epoch": 2.8870775347912527, + "grad_norm": 6.190903663635254, + "learning_rate": 1.7474517117240518e-08, + "loss": 0.0443, + "num_input_tokens_seen": 42640968, + "step": 21783 + }, + { + "epoch": 2.8872100728959573, + "grad_norm": 7.2174072265625, + "learning_rate": 1.7433568910451003e-08, + "loss": 0.1181, + "num_input_tokens_seen": 42643128, + "step": 21784 + }, + { + "epoch": 2.887342611000663, + "grad_norm": 0.4529682695865631, + "learning_rate": 1.7392668569254645e-08, + "loss": 0.0017, + "num_input_tokens_seen": 42644856, + "step": 21785 + }, + { + "epoch": 2.887475149105368, + "grad_norm": 0.14502069354057312, + "learning_rate": 1.7351816094440534e-08, + "loss": 0.0008, + "num_input_tokens_seen": 42647392, + "step": 21786 + }, + { + "epoch": 2.887607687210073, + "grad_norm": 6.466854095458984, + "learning_rate": 1.7311011486795816e-08, + "loss": 0.0263, + "num_input_tokens_seen": 42649416, + "step": 21787 + }, + { + "epoch": 2.887740225314778, + "grad_norm": 0.2678705155849457, + "learning_rate": 1.727025474710764e-08, + "loss": 0.0007, + "num_input_tokens_seen": 42651040, + "step": 21788 + }, + { + "epoch": 2.887872763419483, + "grad_norm": 3.4545094966888428, + "learning_rate": 1.722954587616177e-08, + "loss": 0.0167, + "num_input_tokens_seen": 42653032, + "step": 21789 + }, + { + "epoch": 2.8880053015241884, + "grad_norm": 16.618690490722656, + "learning_rate": 1.718888487474313e-08, + "loss": 0.1934, + "num_input_tokens_seen": 42654856, + "step": 21790 + }, + { + "epoch": 2.888137839628893, + "grad_norm": 4.440697193145752, + "learning_rate": 1.7148271743635536e-08, + "loss": 0.0119, + "num_input_tokens_seen": 42657368, + "step": 21791 + }, + { + "epoch": 2.8882703777335985, + "grad_norm": 0.014543090015649796, + "learning_rate": 1.71077064836217e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42659016, + "step": 21792 + }, + { + "epoch": 2.8884029158383036, + "grad_norm": 4.472564220428467, + "learning_rate": 1.7067189095484605e-08, + "loss": 0.0598, + "num_input_tokens_seen": 42660656, + "step": 21793 + }, + { + "epoch": 2.8885354539430086, + "grad_norm": 0.02428354322910309, + "learning_rate": 1.7026719580004736e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42662544, + "step": 21794 + }, + { + "epoch": 2.8886679920477136, + "grad_norm": 9.320484161376953, + "learning_rate": 1.698629793796286e-08, + "loss": 0.1232, + "num_input_tokens_seen": 42664512, + "step": 21795 + }, + { + "epoch": 2.8888005301524187, + "grad_norm": 0.9706075191497803, + "learning_rate": 1.6945924170138072e-08, + "loss": 0.0054, + "num_input_tokens_seen": 42666728, + "step": 21796 + }, + { + "epoch": 2.888933068257124, + "grad_norm": 6.47089958190918, + "learning_rate": 1.6905598277308644e-08, + "loss": 0.14, + "num_input_tokens_seen": 42669016, + "step": 21797 + }, + { + "epoch": 2.889065606361829, + "grad_norm": 5.111827373504639, + "learning_rate": 1.686532026025228e-08, + "loss": 0.0289, + "num_input_tokens_seen": 42670728, + "step": 21798 + }, + { + "epoch": 2.8891981444665342, + "grad_norm": 0.054378002882003784, + "learning_rate": 1.682509011974559e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42672608, + "step": 21799 + }, + { + "epoch": 2.8893306825712393, + "grad_norm": 0.5560510754585266, + "learning_rate": 1.6784907856564336e-08, + "loss": 0.0019, + "num_input_tokens_seen": 42674736, + "step": 21800 + }, + { + "epoch": 2.8894632206759443, + "grad_norm": 0.00109343440271914, + "learning_rate": 1.6744773471483178e-08, + "loss": 0.0, + "num_input_tokens_seen": 42676152, + "step": 21801 + }, + { + "epoch": 2.8895957587806493, + "grad_norm": 0.03358738496899605, + "learning_rate": 1.670468696527594e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42677808, + "step": 21802 + }, + { + "epoch": 2.8897282968853544, + "grad_norm": 0.05505892634391785, + "learning_rate": 1.6664648338715338e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42679752, + "step": 21803 + }, + { + "epoch": 2.88986083499006, + "grad_norm": 1.9368375539779663, + "learning_rate": 1.6624657592573534e-08, + "loss": 0.0043, + "num_input_tokens_seen": 42681128, + "step": 21804 + }, + { + "epoch": 2.889993373094765, + "grad_norm": 0.0030798695515841246, + "learning_rate": 1.6584714727621297e-08, + "loss": 0.0, + "num_input_tokens_seen": 42682536, + "step": 21805 + }, + { + "epoch": 2.89012591119947, + "grad_norm": 0.27509772777557373, + "learning_rate": 1.6544819744629124e-08, + "loss": 0.0008, + "num_input_tokens_seen": 42684216, + "step": 21806 + }, + { + "epoch": 2.890258449304175, + "grad_norm": 0.028223685920238495, + "learning_rate": 1.65049726443664e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42685664, + "step": 21807 + }, + { + "epoch": 2.89039098740888, + "grad_norm": 16.499895095825195, + "learning_rate": 1.6465173427600844e-08, + "loss": 0.1276, + "num_input_tokens_seen": 42688664, + "step": 21808 + }, + { + "epoch": 2.890523525513585, + "grad_norm": 0.047535490244627, + "learning_rate": 1.6425422095099898e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42690336, + "step": 21809 + }, + { + "epoch": 2.89065606361829, + "grad_norm": 0.28025877475738525, + "learning_rate": 1.6385718647630166e-08, + "loss": 0.0019, + "num_input_tokens_seen": 42693080, + "step": 21810 + }, + { + "epoch": 2.8907886017229956, + "grad_norm": 0.0013176877982914448, + "learning_rate": 1.6346063085957155e-08, + "loss": 0.0, + "num_input_tokens_seen": 42695720, + "step": 21811 + }, + { + "epoch": 2.8909211398277006, + "grad_norm": 8.358757972717285, + "learning_rate": 1.6306455410845522e-08, + "loss": 0.0827, + "num_input_tokens_seen": 42697360, + "step": 21812 + }, + { + "epoch": 2.8910536779324056, + "grad_norm": 0.2825828790664673, + "learning_rate": 1.626689562305883e-08, + "loss": 0.0019, + "num_input_tokens_seen": 42700296, + "step": 21813 + }, + { + "epoch": 2.8911862160371107, + "grad_norm": 15.404608726501465, + "learning_rate": 1.62273837233598e-08, + "loss": 0.187, + "num_input_tokens_seen": 42701800, + "step": 21814 + }, + { + "epoch": 2.8913187541418157, + "grad_norm": 0.007021215278655291, + "learning_rate": 1.6187919712510324e-08, + "loss": 0.0, + "num_input_tokens_seen": 42703240, + "step": 21815 + }, + { + "epoch": 2.8914512922465208, + "grad_norm": 3.0171751976013184, + "learning_rate": 1.6148503591271182e-08, + "loss": 0.0101, + "num_input_tokens_seen": 42705456, + "step": 21816 + }, + { + "epoch": 2.891583830351226, + "grad_norm": 0.016001442447304726, + "learning_rate": 1.6109135360402595e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42707464, + "step": 21817 + }, + { + "epoch": 2.8917163684559313, + "grad_norm": 12.613093376159668, + "learning_rate": 1.6069815020663125e-08, + "loss": 0.0871, + "num_input_tokens_seen": 42709360, + "step": 21818 + }, + { + "epoch": 2.8918489065606363, + "grad_norm": 0.2810207009315491, + "learning_rate": 1.6030542572811615e-08, + "loss": 0.0011, + "num_input_tokens_seen": 42710944, + "step": 21819 + }, + { + "epoch": 2.8919814446653413, + "grad_norm": 0.003911016974598169, + "learning_rate": 1.5991318017604395e-08, + "loss": 0.0, + "num_input_tokens_seen": 42712712, + "step": 21820 + }, + { + "epoch": 2.8921139827700464, + "grad_norm": 8.695013046264648, + "learning_rate": 1.5952141355798368e-08, + "loss": 0.0594, + "num_input_tokens_seen": 42715336, + "step": 21821 + }, + { + "epoch": 2.8922465208747514, + "grad_norm": 0.08680420368909836, + "learning_rate": 1.5913012588148757e-08, + "loss": 0.0005, + "num_input_tokens_seen": 42717240, + "step": 21822 + }, + { + "epoch": 2.8923790589794565, + "grad_norm": 20.08997917175293, + "learning_rate": 1.587393171540996e-08, + "loss": 0.4566, + "num_input_tokens_seen": 42719456, + "step": 21823 + }, + { + "epoch": 2.8925115970841615, + "grad_norm": 16.35171890258789, + "learning_rate": 1.583489873833527e-08, + "loss": 0.195, + "num_input_tokens_seen": 42720960, + "step": 21824 + }, + { + "epoch": 2.892644135188867, + "grad_norm": 1.8816537857055664, + "learning_rate": 1.5795913657677686e-08, + "loss": 0.0082, + "num_input_tokens_seen": 42723368, + "step": 21825 + }, + { + "epoch": 2.892776673293572, + "grad_norm": 4.2345967292785645, + "learning_rate": 1.5756976474188558e-08, + "loss": 0.1209, + "num_input_tokens_seen": 42725544, + "step": 21826 + }, + { + "epoch": 2.892909211398277, + "grad_norm": 0.023701287806034088, + "learning_rate": 1.5718087188618948e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42727472, + "step": 21827 + }, + { + "epoch": 2.893041749502982, + "grad_norm": 0.15921413898468018, + "learning_rate": 1.5679245801717978e-08, + "loss": 0.0008, + "num_input_tokens_seen": 42729704, + "step": 21828 + }, + { + "epoch": 2.893174287607687, + "grad_norm": 0.1450648456811905, + "learning_rate": 1.5640452314235332e-08, + "loss": 0.0006, + "num_input_tokens_seen": 42731496, + "step": 21829 + }, + { + "epoch": 2.893306825712392, + "grad_norm": 6.471755504608154, + "learning_rate": 1.5601706726918463e-08, + "loss": 0.0444, + "num_input_tokens_seen": 42733696, + "step": 21830 + }, + { + "epoch": 2.893439363817097, + "grad_norm": 4.941380500793457, + "learning_rate": 1.556300904051483e-08, + "loss": 0.0272, + "num_input_tokens_seen": 42735936, + "step": 21831 + }, + { + "epoch": 2.8935719019218027, + "grad_norm": 0.02417609840631485, + "learning_rate": 1.5524359255770227e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42737936, + "step": 21832 + }, + { + "epoch": 2.8937044400265077, + "grad_norm": 0.768993079662323, + "learning_rate": 1.548575737342989e-08, + "loss": 0.0053, + "num_input_tokens_seen": 42739872, + "step": 21833 + }, + { + "epoch": 2.8938369781312128, + "grad_norm": 0.00539828184992075, + "learning_rate": 1.544720339423794e-08, + "loss": 0.0, + "num_input_tokens_seen": 42741704, + "step": 21834 + }, + { + "epoch": 2.893969516235918, + "grad_norm": 1.2141324281692505, + "learning_rate": 1.540869731893796e-08, + "loss": 0.0046, + "num_input_tokens_seen": 42743696, + "step": 21835 + }, + { + "epoch": 2.894102054340623, + "grad_norm": 0.002828523050993681, + "learning_rate": 1.5370239148272682e-08, + "loss": 0.0, + "num_input_tokens_seen": 42745360, + "step": 21836 + }, + { + "epoch": 2.894234592445328, + "grad_norm": 0.053335040807724, + "learning_rate": 1.53318288829829e-08, + "loss": 0.0004, + "num_input_tokens_seen": 42747808, + "step": 21837 + }, + { + "epoch": 2.894367130550033, + "grad_norm": 8.15924072265625, + "learning_rate": 1.5293466523809697e-08, + "loss": 0.0482, + "num_input_tokens_seen": 42749888, + "step": 21838 + }, + { + "epoch": 2.8944996686547384, + "grad_norm": 2.0875802040100098, + "learning_rate": 1.5255152071492475e-08, + "loss": 0.0077, + "num_input_tokens_seen": 42752392, + "step": 21839 + }, + { + "epoch": 2.8946322067594434, + "grad_norm": 0.19174259901046753, + "learning_rate": 1.5216885526770087e-08, + "loss": 0.0009, + "num_input_tokens_seen": 42754184, + "step": 21840 + }, + { + "epoch": 2.8947647448641485, + "grad_norm": 1.917540192604065, + "learning_rate": 1.5178666890380278e-08, + "loss": 0.0066, + "num_input_tokens_seen": 42755624, + "step": 21841 + }, + { + "epoch": 2.8948972829688535, + "grad_norm": 12.75284481048584, + "learning_rate": 1.5140496163060236e-08, + "loss": 0.0721, + "num_input_tokens_seen": 42757320, + "step": 21842 + }, + { + "epoch": 2.8950298210735586, + "grad_norm": 5.695208549499512, + "learning_rate": 1.5102373345545484e-08, + "loss": 0.0247, + "num_input_tokens_seen": 42760448, + "step": 21843 + }, + { + "epoch": 2.8951623591782636, + "grad_norm": 9.881443977355957, + "learning_rate": 1.506429843857099e-08, + "loss": 0.0937, + "num_input_tokens_seen": 42762704, + "step": 21844 + }, + { + "epoch": 2.8952948972829686, + "grad_norm": 1.167162299156189, + "learning_rate": 1.5026271442871165e-08, + "loss": 0.0035, + "num_input_tokens_seen": 42764848, + "step": 21845 + }, + { + "epoch": 2.895427435387674, + "grad_norm": 0.0025954912416636944, + "learning_rate": 1.4988292359179314e-08, + "loss": 0.0, + "num_input_tokens_seen": 42767504, + "step": 21846 + }, + { + "epoch": 2.895559973492379, + "grad_norm": 0.003800133476033807, + "learning_rate": 1.4950361188227348e-08, + "loss": 0.0, + "num_input_tokens_seen": 42769008, + "step": 21847 + }, + { + "epoch": 2.895692511597084, + "grad_norm": 0.02113240770995617, + "learning_rate": 1.4912477930746905e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42771008, + "step": 21848 + }, + { + "epoch": 2.8958250497017892, + "grad_norm": 0.0873739942908287, + "learning_rate": 1.4874642587468235e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42772040, + "step": 21849 + }, + { + "epoch": 2.8959575878064943, + "grad_norm": 1.607867956161499, + "learning_rate": 1.4836855159120755e-08, + "loss": 0.0032, + "num_input_tokens_seen": 42773800, + "step": 21850 + }, + { + "epoch": 2.8960901259111997, + "grad_norm": 3.8669376373291016, + "learning_rate": 1.4799115646433327e-08, + "loss": 0.0213, + "num_input_tokens_seen": 42776120, + "step": 21851 + }, + { + "epoch": 2.8962226640159043, + "grad_norm": 15.551241874694824, + "learning_rate": 1.4761424050133144e-08, + "loss": 0.1471, + "num_input_tokens_seen": 42778688, + "step": 21852 + }, + { + "epoch": 2.89635520212061, + "grad_norm": 1.2815444469451904, + "learning_rate": 1.4723780370947405e-08, + "loss": 0.0062, + "num_input_tokens_seen": 42781064, + "step": 21853 + }, + { + "epoch": 2.896487740225315, + "grad_norm": 0.01603313535451889, + "learning_rate": 1.4686184609601639e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42782680, + "step": 21854 + }, + { + "epoch": 2.89662027833002, + "grad_norm": 0.8123252391815186, + "learning_rate": 1.4648636766820546e-08, + "loss": 0.0028, + "num_input_tokens_seen": 42783784, + "step": 21855 + }, + { + "epoch": 2.896752816434725, + "grad_norm": 0.06824760884046555, + "learning_rate": 1.4611136843328544e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42786288, + "step": 21856 + }, + { + "epoch": 2.89688535453943, + "grad_norm": 0.5590487122535706, + "learning_rate": 1.4573684839848112e-08, + "loss": 0.0024, + "num_input_tokens_seen": 42787888, + "step": 21857 + }, + { + "epoch": 2.8970178926441355, + "grad_norm": 0.01573786325752735, + "learning_rate": 1.4536280757102005e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42789584, + "step": 21858 + }, + { + "epoch": 2.89715043074884, + "grad_norm": 0.003916813526302576, + "learning_rate": 1.4498924595810759e-08, + "loss": 0.0, + "num_input_tokens_seen": 42791976, + "step": 21859 + }, + { + "epoch": 2.8972829688535455, + "grad_norm": 0.018391158431768417, + "learning_rate": 1.4461616356695185e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42794936, + "step": 21860 + }, + { + "epoch": 2.8974155069582506, + "grad_norm": 0.15034545958042145, + "learning_rate": 1.4424356040474152e-08, + "loss": 0.0005, + "num_input_tokens_seen": 42796288, + "step": 21861 + }, + { + "epoch": 2.8975480450629556, + "grad_norm": 7.587205410003662, + "learning_rate": 1.4387143647866253e-08, + "loss": 0.0767, + "num_input_tokens_seen": 42798000, + "step": 21862 + }, + { + "epoch": 2.8976805831676606, + "grad_norm": 0.005003215279430151, + "learning_rate": 1.4349979179588969e-08, + "loss": 0.0, + "num_input_tokens_seen": 42799408, + "step": 21863 + }, + { + "epoch": 2.8978131212723657, + "grad_norm": 8.567745208740234, + "learning_rate": 1.431286263635867e-08, + "loss": 0.0899, + "num_input_tokens_seen": 42800928, + "step": 21864 + }, + { + "epoch": 2.897945659377071, + "grad_norm": 0.012640919536352158, + "learning_rate": 1.4275794018891454e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42802632, + "step": 21865 + }, + { + "epoch": 2.8980781974817758, + "grad_norm": 5.966353893280029, + "learning_rate": 1.4238773327901745e-08, + "loss": 0.0344, + "num_input_tokens_seen": 42804760, + "step": 21866 + }, + { + "epoch": 2.8982107355864812, + "grad_norm": 7.464968204498291, + "learning_rate": 1.420180056410314e-08, + "loss": 0.0582, + "num_input_tokens_seen": 42806280, + "step": 21867 + }, + { + "epoch": 2.8983432736911863, + "grad_norm": 0.02829921618103981, + "learning_rate": 1.4164875728208682e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42808760, + "step": 21868 + }, + { + "epoch": 2.8984758117958913, + "grad_norm": 12.499402046203613, + "learning_rate": 1.41279988209303e-08, + "loss": 0.0787, + "num_input_tokens_seen": 42812128, + "step": 21869 + }, + { + "epoch": 2.8986083499005963, + "grad_norm": 10.971367835998535, + "learning_rate": 1.4091169842979368e-08, + "loss": 0.0779, + "num_input_tokens_seen": 42813664, + "step": 21870 + }, + { + "epoch": 2.8987408880053014, + "grad_norm": 1.0918554067611694, + "learning_rate": 1.4054388795065321e-08, + "loss": 0.0054, + "num_input_tokens_seen": 42815192, + "step": 21871 + }, + { + "epoch": 2.898873426110007, + "grad_norm": 14.199649810791016, + "learning_rate": 1.401765567789759e-08, + "loss": 0.1929, + "num_input_tokens_seen": 42817328, + "step": 21872 + }, + { + "epoch": 2.8990059642147115, + "grad_norm": 0.014770323410630226, + "learning_rate": 1.3980970492184498e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42818776, + "step": 21873 + }, + { + "epoch": 2.899138502319417, + "grad_norm": 0.003389711258932948, + "learning_rate": 1.3944333238633534e-08, + "loss": 0.0, + "num_input_tokens_seen": 42820960, + "step": 21874 + }, + { + "epoch": 2.899271040424122, + "grad_norm": 32.26531982421875, + "learning_rate": 1.3907743917950523e-08, + "loss": 0.0435, + "num_input_tokens_seen": 42823688, + "step": 21875 + }, + { + "epoch": 2.899403578528827, + "grad_norm": 0.23862943053245544, + "learning_rate": 1.3871202530841287e-08, + "loss": 0.001, + "num_input_tokens_seen": 42826656, + "step": 21876 + }, + { + "epoch": 2.899536116633532, + "grad_norm": 3.6919634342193604, + "learning_rate": 1.3834709078010544e-08, + "loss": 0.0155, + "num_input_tokens_seen": 42828528, + "step": 21877 + }, + { + "epoch": 2.899668654738237, + "grad_norm": 0.6939218044281006, + "learning_rate": 1.3798263560161617e-08, + "loss": 0.0038, + "num_input_tokens_seen": 42830376, + "step": 21878 + }, + { + "epoch": 2.8998011928429426, + "grad_norm": 0.005500981118530035, + "learning_rate": 1.3761865977997558e-08, + "loss": 0.0, + "num_input_tokens_seen": 42832032, + "step": 21879 + }, + { + "epoch": 2.8999337309476476, + "grad_norm": 0.7904248833656311, + "learning_rate": 1.3725516332219747e-08, + "loss": 0.0017, + "num_input_tokens_seen": 42833872, + "step": 21880 + }, + { + "epoch": 2.9000662690523527, + "grad_norm": 6.683494567871094, + "learning_rate": 1.3689214623529012e-08, + "loss": 0.0693, + "num_input_tokens_seen": 42835912, + "step": 21881 + }, + { + "epoch": 2.9001988071570577, + "grad_norm": 0.08821951597929001, + "learning_rate": 1.365296085262563e-08, + "loss": 0.0009, + "num_input_tokens_seen": 42838616, + "step": 21882 + }, + { + "epoch": 2.9003313452617627, + "grad_norm": 0.02909945882856846, + "learning_rate": 1.3616755020208206e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42840728, + "step": 21883 + }, + { + "epoch": 2.9004638833664678, + "grad_norm": 0.05134959891438484, + "learning_rate": 1.3580597126975347e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42842856, + "step": 21884 + }, + { + "epoch": 2.900596421471173, + "grad_norm": 0.9567461609840393, + "learning_rate": 1.3544487173623444e-08, + "loss": 0.0063, + "num_input_tokens_seen": 42844680, + "step": 21885 + }, + { + "epoch": 2.9007289595758783, + "grad_norm": 12.670013427734375, + "learning_rate": 1.3508425160849437e-08, + "loss": 0.186, + "num_input_tokens_seen": 42847536, + "step": 21886 + }, + { + "epoch": 2.9008614976805833, + "grad_norm": 0.8445271849632263, + "learning_rate": 1.3472411089348048e-08, + "loss": 0.005, + "num_input_tokens_seen": 42849088, + "step": 21887 + }, + { + "epoch": 2.9009940357852884, + "grad_norm": 0.014230293221771717, + "learning_rate": 1.3436444959814277e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42850432, + "step": 21888 + }, + { + "epoch": 2.9011265738899934, + "grad_norm": 0.6979024410247803, + "learning_rate": 1.340052677294118e-08, + "loss": 0.0055, + "num_input_tokens_seen": 42852552, + "step": 21889 + }, + { + "epoch": 2.9012591119946984, + "grad_norm": 0.020527120679616928, + "learning_rate": 1.336465652942126e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42854128, + "step": 21890 + }, + { + "epoch": 2.9013916500994035, + "grad_norm": 4.5034499168396, + "learning_rate": 1.3328834229946464e-08, + "loss": 0.048, + "num_input_tokens_seen": 42856336, + "step": 21891 + }, + { + "epoch": 2.9015241882041085, + "grad_norm": 0.21529565751552582, + "learning_rate": 1.3293059875206793e-08, + "loss": 0.0009, + "num_input_tokens_seen": 42859056, + "step": 21892 + }, + { + "epoch": 2.901656726308814, + "grad_norm": 0.006279053632169962, + "learning_rate": 1.325733346589253e-08, + "loss": 0.0, + "num_input_tokens_seen": 42860576, + "step": 21893 + }, + { + "epoch": 2.901789264413519, + "grad_norm": 0.15658703446388245, + "learning_rate": 1.3221655002692569e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42861864, + "step": 21894 + }, + { + "epoch": 2.901921802518224, + "grad_norm": 0.13757097721099854, + "learning_rate": 1.3186024486294414e-08, + "loss": 0.0005, + "num_input_tokens_seen": 42863744, + "step": 21895 + }, + { + "epoch": 2.902054340622929, + "grad_norm": 14.049901962280273, + "learning_rate": 1.315044191738557e-08, + "loss": 0.1589, + "num_input_tokens_seen": 42865576, + "step": 21896 + }, + { + "epoch": 2.902186878727634, + "grad_norm": 0.003701205365359783, + "learning_rate": 1.3114907296651325e-08, + "loss": 0.0, + "num_input_tokens_seen": 42866736, + "step": 21897 + }, + { + "epoch": 2.902319416832339, + "grad_norm": 12.016429901123047, + "learning_rate": 1.3079420624777794e-08, + "loss": 0.1122, + "num_input_tokens_seen": 42869520, + "step": 21898 + }, + { + "epoch": 2.902451954937044, + "grad_norm": 0.46425268054008484, + "learning_rate": 1.3043981902448322e-08, + "loss": 0.0019, + "num_input_tokens_seen": 42871544, + "step": 21899 + }, + { + "epoch": 2.9025844930417497, + "grad_norm": 0.14612586796283722, + "learning_rate": 1.3008591130346804e-08, + "loss": 0.0005, + "num_input_tokens_seen": 42873448, + "step": 21900 + }, + { + "epoch": 2.9027170311464547, + "grad_norm": 0.012677825056016445, + "learning_rate": 1.2973248309155195e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42874936, + "step": 21901 + }, + { + "epoch": 2.90284956925116, + "grad_norm": 0.06714598089456558, + "learning_rate": 1.2937953439554895e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42876760, + "step": 21902 + }, + { + "epoch": 2.902982107355865, + "grad_norm": 0.002638844307512045, + "learning_rate": 1.2902706522226749e-08, + "loss": 0.0, + "num_input_tokens_seen": 42878016, + "step": 21903 + }, + { + "epoch": 2.90311464546057, + "grad_norm": 0.012105557136237621, + "learning_rate": 1.2867507557850211e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42880648, + "step": 21904 + }, + { + "epoch": 2.903247183565275, + "grad_norm": 3.1819558143615723, + "learning_rate": 1.2832356547103907e-08, + "loss": 0.0161, + "num_input_tokens_seen": 42881896, + "step": 21905 + }, + { + "epoch": 2.90337972166998, + "grad_norm": 7.1907806396484375, + "learning_rate": 1.279725349066563e-08, + "loss": 0.0687, + "num_input_tokens_seen": 42883400, + "step": 21906 + }, + { + "epoch": 2.9035122597746854, + "grad_norm": 11.399194717407227, + "learning_rate": 1.2762198389212055e-08, + "loss": 0.1373, + "num_input_tokens_seen": 42886496, + "step": 21907 + }, + { + "epoch": 2.9036447978793904, + "grad_norm": 3.959879159927368, + "learning_rate": 1.2727191243419313e-08, + "loss": 0.0157, + "num_input_tokens_seen": 42888232, + "step": 21908 + }, + { + "epoch": 2.9037773359840955, + "grad_norm": 0.022855183109641075, + "learning_rate": 1.269223205396214e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42890248, + "step": 21909 + }, + { + "epoch": 2.9039098740888005, + "grad_norm": 0.04584252089262009, + "learning_rate": 1.265732082151444e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42891888, + "step": 21910 + }, + { + "epoch": 2.9040424121935056, + "grad_norm": 0.006457976531237364, + "learning_rate": 1.2622457546749567e-08, + "loss": 0.0, + "num_input_tokens_seen": 42893736, + "step": 21911 + }, + { + "epoch": 2.9041749502982106, + "grad_norm": 1.5480132102966309, + "learning_rate": 1.2587642230339759e-08, + "loss": 0.0089, + "num_input_tokens_seen": 42896448, + "step": 21912 + }, + { + "epoch": 2.9043074884029156, + "grad_norm": 0.03996611014008522, + "learning_rate": 1.2552874872956145e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42897784, + "step": 21913 + }, + { + "epoch": 2.904440026507621, + "grad_norm": 0.005707287695258856, + "learning_rate": 1.25181554752693e-08, + "loss": 0.0, + "num_input_tokens_seen": 42900160, + "step": 21914 + }, + { + "epoch": 2.904572564612326, + "grad_norm": 5.709275722503662, + "learning_rate": 1.2483484037948135e-08, + "loss": 0.0924, + "num_input_tokens_seen": 42902296, + "step": 21915 + }, + { + "epoch": 2.904705102717031, + "grad_norm": 15.23489761352539, + "learning_rate": 1.2448860561661558e-08, + "loss": 0.1701, + "num_input_tokens_seen": 42904200, + "step": 21916 + }, + { + "epoch": 2.9048376408217362, + "grad_norm": 0.0033224602229893208, + "learning_rate": 1.241428504707709e-08, + "loss": 0.0, + "num_input_tokens_seen": 42905488, + "step": 21917 + }, + { + "epoch": 2.9049701789264413, + "grad_norm": 0.0005927822785452008, + "learning_rate": 1.2379757494861421e-08, + "loss": 0.0, + "num_input_tokens_seen": 42907000, + "step": 21918 + }, + { + "epoch": 2.9051027170311463, + "grad_norm": 0.382308691740036, + "learning_rate": 1.2345277905679854e-08, + "loss": 0.0014, + "num_input_tokens_seen": 42908568, + "step": 21919 + }, + { + "epoch": 2.9052352551358513, + "grad_norm": 0.001161378575488925, + "learning_rate": 1.2310846280197686e-08, + "loss": 0.0, + "num_input_tokens_seen": 42909960, + "step": 21920 + }, + { + "epoch": 2.905367793240557, + "grad_norm": 4.968019008636475, + "learning_rate": 1.2276462619078556e-08, + "loss": 0.0569, + "num_input_tokens_seen": 42911976, + "step": 21921 + }, + { + "epoch": 2.905500331345262, + "grad_norm": 0.05725034698843956, + "learning_rate": 1.2242126922985265e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42913928, + "step": 21922 + }, + { + "epoch": 2.905632869449967, + "grad_norm": 3.928481340408325, + "learning_rate": 1.2207839192580062e-08, + "loss": 0.0035, + "num_input_tokens_seen": 42917288, + "step": 21923 + }, + { + "epoch": 2.905765407554672, + "grad_norm": 12.014176368713379, + "learning_rate": 1.2173599428524086e-08, + "loss": 0.3217, + "num_input_tokens_seen": 42918896, + "step": 21924 + }, + { + "epoch": 2.905897945659377, + "grad_norm": 0.029615573585033417, + "learning_rate": 1.2139407631477362e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42920016, + "step": 21925 + }, + { + "epoch": 2.906030483764082, + "grad_norm": 11.053147315979004, + "learning_rate": 1.2105263802099088e-08, + "loss": 0.3689, + "num_input_tokens_seen": 42922944, + "step": 21926 + }, + { + "epoch": 2.906163021868787, + "grad_norm": 0.017649173736572266, + "learning_rate": 1.2071167941047624e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42924336, + "step": 21927 + }, + { + "epoch": 2.9062955599734925, + "grad_norm": 0.00022445783542934805, + "learning_rate": 1.2037120048980222e-08, + "loss": 0.0, + "num_input_tokens_seen": 42925328, + "step": 21928 + }, + { + "epoch": 2.9064280980781976, + "grad_norm": 2.2284164428710938, + "learning_rate": 1.200312012655358e-08, + "loss": 0.004, + "num_input_tokens_seen": 42926536, + "step": 21929 + }, + { + "epoch": 2.9065606361829026, + "grad_norm": 1.1413416862487793, + "learning_rate": 1.1969168174423284e-08, + "loss": 0.0076, + "num_input_tokens_seen": 42928416, + "step": 21930 + }, + { + "epoch": 2.9066931742876077, + "grad_norm": 0.018550578504800797, + "learning_rate": 1.1935264193243813e-08, + "loss": 0.0, + "num_input_tokens_seen": 42929912, + "step": 21931 + }, + { + "epoch": 2.9068257123923127, + "grad_norm": 0.004678811877965927, + "learning_rate": 1.1901408183668806e-08, + "loss": 0.0, + "num_input_tokens_seen": 42931816, + "step": 21932 + }, + { + "epoch": 2.9069582504970177, + "grad_norm": 0.009757760912179947, + "learning_rate": 1.1867600146351076e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42933576, + "step": 21933 + }, + { + "epoch": 2.9070907886017228, + "grad_norm": 0.013394701294600964, + "learning_rate": 1.1833840081942605e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42935560, + "step": 21934 + }, + { + "epoch": 2.9072233267064282, + "grad_norm": 3.623309850692749, + "learning_rate": 1.1800127991093979e-08, + "loss": 0.0154, + "num_input_tokens_seen": 42937992, + "step": 21935 + }, + { + "epoch": 2.9073558648111333, + "grad_norm": 0.1036694347858429, + "learning_rate": 1.176646387445579e-08, + "loss": 0.0004, + "num_input_tokens_seen": 42940808, + "step": 21936 + }, + { + "epoch": 2.9074884029158383, + "grad_norm": 0.018242409452795982, + "learning_rate": 1.1732847732676411e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42942192, + "step": 21937 + }, + { + "epoch": 2.9076209410205434, + "grad_norm": 0.09169457852840424, + "learning_rate": 1.1699279566404487e-08, + "loss": 0.0002, + "num_input_tokens_seen": 42943832, + "step": 21938 + }, + { + "epoch": 2.9077534791252484, + "grad_norm": 0.002927461639046669, + "learning_rate": 1.1665759376287001e-08, + "loss": 0.0, + "num_input_tokens_seen": 42945064, + "step": 21939 + }, + { + "epoch": 2.907886017229954, + "grad_norm": 11.159092903137207, + "learning_rate": 1.1632287162970102e-08, + "loss": 0.0303, + "num_input_tokens_seen": 42946600, + "step": 21940 + }, + { + "epoch": 2.9080185553346585, + "grad_norm": 2.7261157035827637, + "learning_rate": 1.1598862927099386e-08, + "loss": 0.0065, + "num_input_tokens_seen": 42948056, + "step": 21941 + }, + { + "epoch": 2.908151093439364, + "grad_norm": 0.028012461960315704, + "learning_rate": 1.1565486669319336e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42950328, + "step": 21942 + }, + { + "epoch": 2.908283631544069, + "grad_norm": 2.4944710731506348, + "learning_rate": 1.1532158390273606e-08, + "loss": 0.0085, + "num_input_tokens_seen": 42952568, + "step": 21943 + }, + { + "epoch": 2.908416169648774, + "grad_norm": 0.02497400902211666, + "learning_rate": 1.1498878090604181e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42953912, + "step": 21944 + }, + { + "epoch": 2.908548707753479, + "grad_norm": 0.5636566281318665, + "learning_rate": 1.1465645770953326e-08, + "loss": 0.0018, + "num_input_tokens_seen": 42956840, + "step": 21945 + }, + { + "epoch": 2.908681245858184, + "grad_norm": 0.010956265963613987, + "learning_rate": 1.1432461431961361e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42958840, + "step": 21946 + }, + { + "epoch": 2.9088137839628896, + "grad_norm": 6.515491008758545, + "learning_rate": 1.1399325074268608e-08, + "loss": 0.06, + "num_input_tokens_seen": 42960592, + "step": 21947 + }, + { + "epoch": 2.908946322067594, + "grad_norm": 0.011846862733364105, + "learning_rate": 1.1366236698513444e-08, + "loss": 0.0, + "num_input_tokens_seen": 42961760, + "step": 21948 + }, + { + "epoch": 2.9090788601722997, + "grad_norm": 9.874768257141113, + "learning_rate": 1.1333196305334249e-08, + "loss": 0.061, + "num_input_tokens_seen": 42964240, + "step": 21949 + }, + { + "epoch": 2.9092113982770047, + "grad_norm": 4.613884449005127, + "learning_rate": 1.1300203895367735e-08, + "loss": 0.0471, + "num_input_tokens_seen": 42966784, + "step": 21950 + }, + { + "epoch": 2.9093439363817097, + "grad_norm": 3.0748848915100098, + "learning_rate": 1.1267259469250336e-08, + "loss": 0.0392, + "num_input_tokens_seen": 42969256, + "step": 21951 + }, + { + "epoch": 2.9094764744864148, + "grad_norm": 0.7355468273162842, + "learning_rate": 1.1234363027616824e-08, + "loss": 0.0034, + "num_input_tokens_seen": 42970768, + "step": 21952 + }, + { + "epoch": 2.90960901259112, + "grad_norm": 0.006784104276448488, + "learning_rate": 1.120151457110169e-08, + "loss": 0.0, + "num_input_tokens_seen": 42972144, + "step": 21953 + }, + { + "epoch": 2.9097415506958253, + "grad_norm": 0.8847712278366089, + "learning_rate": 1.1168714100338317e-08, + "loss": 0.0017, + "num_input_tokens_seen": 42973688, + "step": 21954 + }, + { + "epoch": 2.90987408880053, + "grad_norm": 0.1828608512878418, + "learning_rate": 1.1135961615959256e-08, + "loss": 0.0009, + "num_input_tokens_seen": 42975536, + "step": 21955 + }, + { + "epoch": 2.9100066269052354, + "grad_norm": 6.940862655639648, + "learning_rate": 1.1103257118595666e-08, + "loss": 0.1052, + "num_input_tokens_seen": 42977992, + "step": 21956 + }, + { + "epoch": 2.9101391650099404, + "grad_norm": 7.962825298309326, + "learning_rate": 1.1070600608878434e-08, + "loss": 0.079, + "num_input_tokens_seen": 42979584, + "step": 21957 + }, + { + "epoch": 2.9102717031146454, + "grad_norm": 0.5806574821472168, + "learning_rate": 1.1037992087437056e-08, + "loss": 0.0016, + "num_input_tokens_seen": 42981104, + "step": 21958 + }, + { + "epoch": 2.9104042412193505, + "grad_norm": 10.117878913879395, + "learning_rate": 1.1005431554899915e-08, + "loss": 0.0726, + "num_input_tokens_seen": 42983464, + "step": 21959 + }, + { + "epoch": 2.9105367793240555, + "grad_norm": 3.5498950481414795, + "learning_rate": 1.09729190118954e-08, + "loss": 0.0422, + "num_input_tokens_seen": 42985944, + "step": 21960 + }, + { + "epoch": 2.910669317428761, + "grad_norm": 0.008531622588634491, + "learning_rate": 1.0940454459050232e-08, + "loss": 0.0, + "num_input_tokens_seen": 42987264, + "step": 21961 + }, + { + "epoch": 2.9108018555334656, + "grad_norm": 0.027749083936214447, + "learning_rate": 1.090803789699002e-08, + "loss": 0.0003, + "num_input_tokens_seen": 42989112, + "step": 21962 + }, + { + "epoch": 2.910934393638171, + "grad_norm": 2.5667002201080322, + "learning_rate": 1.0875669326340099e-08, + "loss": 0.0108, + "num_input_tokens_seen": 42991168, + "step": 21963 + }, + { + "epoch": 2.911066931742876, + "grad_norm": 0.6141388416290283, + "learning_rate": 1.0843348747724136e-08, + "loss": 0.0007, + "num_input_tokens_seen": 42993008, + "step": 21964 + }, + { + "epoch": 2.911199469847581, + "grad_norm": 0.022096291184425354, + "learning_rate": 1.0811076161765799e-08, + "loss": 0.0001, + "num_input_tokens_seen": 42994480, + "step": 21965 + }, + { + "epoch": 2.911332007952286, + "grad_norm": 0.006765156984329224, + "learning_rate": 1.077885156908709e-08, + "loss": 0.0, + "num_input_tokens_seen": 42997648, + "step": 21966 + }, + { + "epoch": 2.9114645460569912, + "grad_norm": 6.568155765533447, + "learning_rate": 1.0746674970309456e-08, + "loss": 0.0646, + "num_input_tokens_seen": 43000128, + "step": 21967 + }, + { + "epoch": 2.9115970841616967, + "grad_norm": 4.378548622131348, + "learning_rate": 1.0714546366053235e-08, + "loss": 0.0398, + "num_input_tokens_seen": 43002016, + "step": 21968 + }, + { + "epoch": 2.9117296222664018, + "grad_norm": 5.3975090980529785, + "learning_rate": 1.0682465756937654e-08, + "loss": 0.055, + "num_input_tokens_seen": 43003424, + "step": 21969 + }, + { + "epoch": 2.911862160371107, + "grad_norm": 0.0014704520581290126, + "learning_rate": 1.0650433143581384e-08, + "loss": 0.0, + "num_input_tokens_seen": 43004712, + "step": 21970 + }, + { + "epoch": 2.911994698475812, + "grad_norm": 0.6462353467941284, + "learning_rate": 1.0618448526601988e-08, + "loss": 0.0033, + "num_input_tokens_seen": 43006928, + "step": 21971 + }, + { + "epoch": 2.912127236580517, + "grad_norm": 2.3356761932373047, + "learning_rate": 1.058651190661647e-08, + "loss": 0.0629, + "num_input_tokens_seen": 43009312, + "step": 21972 + }, + { + "epoch": 2.912259774685222, + "grad_norm": 0.0044085378758609295, + "learning_rate": 1.055462328424045e-08, + "loss": 0.0, + "num_input_tokens_seen": 43010776, + "step": 21973 + }, + { + "epoch": 2.912392312789927, + "grad_norm": 0.117121122777462, + "learning_rate": 1.0522782660088437e-08, + "loss": 0.0006, + "num_input_tokens_seen": 43013696, + "step": 21974 + }, + { + "epoch": 2.9125248508946324, + "grad_norm": 11.036858558654785, + "learning_rate": 1.0490990034774663e-08, + "loss": 0.1612, + "num_input_tokens_seen": 43016976, + "step": 21975 + }, + { + "epoch": 2.9126573889993375, + "grad_norm": 6.189133644104004, + "learning_rate": 1.0459245408911966e-08, + "loss": 0.0174, + "num_input_tokens_seen": 43019552, + "step": 21976 + }, + { + "epoch": 2.9127899271040425, + "grad_norm": 3.0199368000030518, + "learning_rate": 1.042754878311264e-08, + "loss": 0.0049, + "num_input_tokens_seen": 43021192, + "step": 21977 + }, + { + "epoch": 2.9129224652087475, + "grad_norm": 4.815978050231934, + "learning_rate": 1.0395900157987304e-08, + "loss": 0.0425, + "num_input_tokens_seen": 43023048, + "step": 21978 + }, + { + "epoch": 2.9130550033134526, + "grad_norm": 4.038658142089844, + "learning_rate": 1.036429953414686e-08, + "loss": 0.0161, + "num_input_tokens_seen": 43024752, + "step": 21979 + }, + { + "epoch": 2.9131875414181576, + "grad_norm": 9.997885704040527, + "learning_rate": 1.0332746912199987e-08, + "loss": 0.1582, + "num_input_tokens_seen": 43027264, + "step": 21980 + }, + { + "epoch": 2.9133200795228626, + "grad_norm": 9.532692909240723, + "learning_rate": 1.0301242292755364e-08, + "loss": 0.0373, + "num_input_tokens_seen": 43028512, + "step": 21981 + }, + { + "epoch": 2.913452617627568, + "grad_norm": 0.6367543339729309, + "learning_rate": 1.0269785676420285e-08, + "loss": 0.0038, + "num_input_tokens_seen": 43030088, + "step": 21982 + }, + { + "epoch": 2.913585155732273, + "grad_norm": 0.008711429312825203, + "learning_rate": 1.0238377063801486e-08, + "loss": 0.0, + "num_input_tokens_seen": 43031768, + "step": 21983 + }, + { + "epoch": 2.913717693836978, + "grad_norm": 2.50688099861145, + "learning_rate": 1.0207016455504316e-08, + "loss": 0.0097, + "num_input_tokens_seen": 43035576, + "step": 21984 + }, + { + "epoch": 2.9138502319416832, + "grad_norm": 0.8532982468605042, + "learning_rate": 1.0175703852133567e-08, + "loss": 0.0078, + "num_input_tokens_seen": 43037608, + "step": 21985 + }, + { + "epoch": 2.9139827700463883, + "grad_norm": 3.4834022521972656, + "learning_rate": 1.0144439254292649e-08, + "loss": 0.0202, + "num_input_tokens_seen": 43039520, + "step": 21986 + }, + { + "epoch": 2.9141153081510933, + "grad_norm": 0.03925604373216629, + "learning_rate": 1.0113222662584966e-08, + "loss": 0.0002, + "num_input_tokens_seen": 43042080, + "step": 21987 + }, + { + "epoch": 2.9142478462557984, + "grad_norm": 11.040791511535645, + "learning_rate": 1.0082054077611703e-08, + "loss": 0.1806, + "num_input_tokens_seen": 43043760, + "step": 21988 + }, + { + "epoch": 2.914380384360504, + "grad_norm": 0.021875865757465363, + "learning_rate": 1.0050933499974325e-08, + "loss": 0.0001, + "num_input_tokens_seen": 43045920, + "step": 21989 + }, + { + "epoch": 2.914512922465209, + "grad_norm": 14.984296798706055, + "learning_rate": 1.0019860930272352e-08, + "loss": 0.2619, + "num_input_tokens_seen": 43047240, + "step": 21990 + }, + { + "epoch": 2.914645460569914, + "grad_norm": 0.18902479112148285, + "learning_rate": 9.988836369105582e-09, + "loss": 0.0008, + "num_input_tokens_seen": 43050416, + "step": 21991 + }, + { + "epoch": 2.914777998674619, + "grad_norm": 7.049749374389648, + "learning_rate": 9.95785981707159e-09, + "loss": 0.1074, + "num_input_tokens_seen": 43052240, + "step": 21992 + }, + { + "epoch": 2.914910536779324, + "grad_norm": 0.6406229734420776, + "learning_rate": 9.926931274767959e-09, + "loss": 0.0028, + "num_input_tokens_seen": 43054920, + "step": 21993 + }, + { + "epoch": 2.915043074884029, + "grad_norm": 0.06147696450352669, + "learning_rate": 9.896050742790875e-09, + "loss": 0.0003, + "num_input_tokens_seen": 43056472, + "step": 21994 + }, + { + "epoch": 2.915175612988734, + "grad_norm": 0.05534979701042175, + "learning_rate": 9.865218221735973e-09, + "loss": 0.0003, + "num_input_tokens_seen": 43058184, + "step": 21995 + }, + { + "epoch": 2.9153081510934395, + "grad_norm": 13.993824005126953, + "learning_rate": 9.834433712197223e-09, + "loss": 0.0456, + "num_input_tokens_seen": 43060016, + "step": 21996 + }, + { + "epoch": 2.9154406891981446, + "grad_norm": 0.11426708102226257, + "learning_rate": 9.803697214768593e-09, + "loss": 0.0008, + "num_input_tokens_seen": 43062056, + "step": 21997 + }, + { + "epoch": 2.9155732273028496, + "grad_norm": 0.0015103942714631557, + "learning_rate": 9.773008730042666e-09, + "loss": 0.0, + "num_input_tokens_seen": 43064688, + "step": 21998 + }, + { + "epoch": 2.9157057654075547, + "grad_norm": 8.768732070922852, + "learning_rate": 9.742368258610912e-09, + "loss": 0.0597, + "num_input_tokens_seen": 43067832, + "step": 21999 + }, + { + "epoch": 2.9158383035122597, + "grad_norm": 0.08799783885478973, + "learning_rate": 9.71177580106425e-09, + "loss": 0.0005, + "num_input_tokens_seen": 43069616, + "step": 22000 + }, + { + "epoch": 2.9159708416169647, + "grad_norm": 0.16143526136875153, + "learning_rate": 9.681231357992481e-09, + "loss": 0.0006, + "num_input_tokens_seen": 43071320, + "step": 22001 + }, + { + "epoch": 2.9161033797216698, + "grad_norm": 0.6979979872703552, + "learning_rate": 9.650734929984584e-09, + "loss": 0.0023, + "num_input_tokens_seen": 43072640, + "step": 22002 + }, + { + "epoch": 2.9162359178263753, + "grad_norm": 0.00871757697314024, + "learning_rate": 9.620286517628696e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43074552, + "step": 22003 + }, + { + "epoch": 2.9163684559310803, + "grad_norm": 3.8733153343200684, + "learning_rate": 9.589886121511572e-09, + "loss": 0.0328, + "num_input_tokens_seen": 43077056, + "step": 22004 + }, + { + "epoch": 2.9165009940357853, + "grad_norm": 0.16095322370529175, + "learning_rate": 9.559533742219407e-09, + "loss": 0.0006, + "num_input_tokens_seen": 43078560, + "step": 22005 + }, + { + "epoch": 2.9166335321404904, + "grad_norm": 0.015874097123742104, + "learning_rate": 9.529229380337569e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43080056, + "step": 22006 + }, + { + "epoch": 2.9167660702451954, + "grad_norm": 9.986083984375, + "learning_rate": 9.49897303645031e-09, + "loss": 0.1752, + "num_input_tokens_seen": 43082744, + "step": 22007 + }, + { + "epoch": 2.9168986083499004, + "grad_norm": 0.014211458154022694, + "learning_rate": 9.468764711141054e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43084776, + "step": 22008 + }, + { + "epoch": 2.9170311464546055, + "grad_norm": 3.568197250366211, + "learning_rate": 9.438604404991835e-09, + "loss": 0.0172, + "num_input_tokens_seen": 43086784, + "step": 22009 + }, + { + "epoch": 2.917163684559311, + "grad_norm": 6.67360258102417, + "learning_rate": 9.408492118584689e-09, + "loss": 0.0694, + "num_input_tokens_seen": 43088824, + "step": 22010 + }, + { + "epoch": 2.917296222664016, + "grad_norm": 0.007190667558461428, + "learning_rate": 9.378427852499983e-09, + "loss": 0.0, + "num_input_tokens_seen": 43091248, + "step": 22011 + }, + { + "epoch": 2.917428760768721, + "grad_norm": 0.37744802236557007, + "learning_rate": 9.348411607317254e-09, + "loss": 0.001, + "num_input_tokens_seen": 43093168, + "step": 22012 + }, + { + "epoch": 2.917561298873426, + "grad_norm": 0.0015196386957541108, + "learning_rate": 9.318443383615483e-09, + "loss": 0.0, + "num_input_tokens_seen": 43095504, + "step": 22013 + }, + { + "epoch": 2.917693836978131, + "grad_norm": 0.03816041350364685, + "learning_rate": 9.288523181972541e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43097048, + "step": 22014 + }, + { + "epoch": 2.917826375082836, + "grad_norm": 15.683837890625, + "learning_rate": 9.258651002964913e-09, + "loss": 0.0953, + "num_input_tokens_seen": 43098768, + "step": 22015 + }, + { + "epoch": 2.917958913187541, + "grad_norm": 0.7249067425727844, + "learning_rate": 9.228826847168804e-09, + "loss": 0.0017, + "num_input_tokens_seen": 43101632, + "step": 22016 + }, + { + "epoch": 2.9180914512922467, + "grad_norm": 0.018533840775489807, + "learning_rate": 9.19905071515903e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43105176, + "step": 22017 + }, + { + "epoch": 2.9182239893969517, + "grad_norm": 19.001815795898438, + "learning_rate": 9.169322607510134e-09, + "loss": 0.2542, + "num_input_tokens_seen": 43106752, + "step": 22018 + }, + { + "epoch": 2.9183565275016567, + "grad_norm": 0.008577525615692139, + "learning_rate": 9.139642524794712e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43108664, + "step": 22019 + }, + { + "epoch": 2.918489065606362, + "grad_norm": 0.0036467225290834904, + "learning_rate": 9.110010467585639e-09, + "loss": 0.0, + "num_input_tokens_seen": 43111024, + "step": 22020 + }, + { + "epoch": 2.918621603711067, + "grad_norm": 4.102461338043213, + "learning_rate": 9.080426436453849e-09, + "loss": 0.0158, + "num_input_tokens_seen": 43112784, + "step": 22021 + }, + { + "epoch": 2.9187541418157723, + "grad_norm": 0.08176983892917633, + "learning_rate": 9.050890431969994e-09, + "loss": 0.0004, + "num_input_tokens_seen": 43115360, + "step": 22022 + }, + { + "epoch": 2.918886679920477, + "grad_norm": 0.01711432635784149, + "learning_rate": 9.021402454703065e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43117640, + "step": 22023 + }, + { + "epoch": 2.9190192180251824, + "grad_norm": 8.551813125610352, + "learning_rate": 8.991962505222052e-09, + "loss": 0.0542, + "num_input_tokens_seen": 43119024, + "step": 22024 + }, + { + "epoch": 2.9191517561298874, + "grad_norm": 3.147411346435547, + "learning_rate": 8.962570584094554e-09, + "loss": 0.0185, + "num_input_tokens_seen": 43120968, + "step": 22025 + }, + { + "epoch": 2.9192842942345925, + "grad_norm": 21.117834091186523, + "learning_rate": 8.933226691887343e-09, + "loss": 0.17, + "num_input_tokens_seen": 43122920, + "step": 22026 + }, + { + "epoch": 2.9194168323392975, + "grad_norm": 15.083314895629883, + "learning_rate": 8.903930829165519e-09, + "loss": 0.2737, + "num_input_tokens_seen": 43125112, + "step": 22027 + }, + { + "epoch": 2.9195493704440025, + "grad_norm": 0.10068300366401672, + "learning_rate": 8.874682996494743e-09, + "loss": 0.0007, + "num_input_tokens_seen": 43126536, + "step": 22028 + }, + { + "epoch": 2.919681908548708, + "grad_norm": 12.075376510620117, + "learning_rate": 8.845483194438732e-09, + "loss": 0.0499, + "num_input_tokens_seen": 43128488, + "step": 22029 + }, + { + "epoch": 2.9198144466534126, + "grad_norm": 1.986091136932373, + "learning_rate": 8.816331423560087e-09, + "loss": 0.0091, + "num_input_tokens_seen": 43130376, + "step": 22030 + }, + { + "epoch": 2.919946984758118, + "grad_norm": 0.00848328322172165, + "learning_rate": 8.787227684421417e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43132560, + "step": 22031 + }, + { + "epoch": 2.920079522862823, + "grad_norm": 2.598376989364624, + "learning_rate": 8.758171977583386e-09, + "loss": 0.0245, + "num_input_tokens_seen": 43135240, + "step": 22032 + }, + { + "epoch": 2.920212060967528, + "grad_norm": 0.0036690249107778072, + "learning_rate": 8.729164303606652e-09, + "loss": 0.0, + "num_input_tokens_seen": 43136640, + "step": 22033 + }, + { + "epoch": 2.920344599072233, + "grad_norm": 0.016206033527851105, + "learning_rate": 8.700204663049939e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43139072, + "step": 22034 + }, + { + "epoch": 2.9204771371769382, + "grad_norm": 0.8606730699539185, + "learning_rate": 8.67129305647224e-09, + "loss": 0.0041, + "num_input_tokens_seen": 43141088, + "step": 22035 + }, + { + "epoch": 2.9206096752816437, + "grad_norm": 0.06360591948032379, + "learning_rate": 8.642429484430615e-09, + "loss": 0.0004, + "num_input_tokens_seen": 43143576, + "step": 22036 + }, + { + "epoch": 2.9207422133863483, + "grad_norm": 4.6535844802856445, + "learning_rate": 8.613613947481558e-09, + "loss": 0.0354, + "num_input_tokens_seen": 43144856, + "step": 22037 + }, + { + "epoch": 2.920874751491054, + "grad_norm": 10.734253883361816, + "learning_rate": 8.584846446180739e-09, + "loss": 0.1165, + "num_input_tokens_seen": 43146888, + "step": 22038 + }, + { + "epoch": 2.921007289595759, + "grad_norm": 0.0016261411365121603, + "learning_rate": 8.556126981082991e-09, + "loss": 0.0, + "num_input_tokens_seen": 43148384, + "step": 22039 + }, + { + "epoch": 2.921139827700464, + "grad_norm": 0.0018114912090823054, + "learning_rate": 8.52745555274176e-09, + "loss": 0.0, + "num_input_tokens_seen": 43149816, + "step": 22040 + }, + { + "epoch": 2.921272365805169, + "grad_norm": 0.08052407205104828, + "learning_rate": 8.498832161709935e-09, + "loss": 0.0004, + "num_input_tokens_seen": 43150952, + "step": 22041 + }, + { + "epoch": 2.921404903909874, + "grad_norm": 0.012318454682826996, + "learning_rate": 8.470256808539301e-09, + "loss": 0.0, + "num_input_tokens_seen": 43152136, + "step": 22042 + }, + { + "epoch": 2.9215374420145794, + "grad_norm": 0.053288139402866364, + "learning_rate": 8.441729493781081e-09, + "loss": 0.0003, + "num_input_tokens_seen": 43154864, + "step": 22043 + }, + { + "epoch": 2.921669980119284, + "grad_norm": 2.100132465362549, + "learning_rate": 8.413250217985114e-09, + "loss": 0.0146, + "num_input_tokens_seen": 43157304, + "step": 22044 + }, + { + "epoch": 2.9218025182239895, + "grad_norm": 6.771554946899414, + "learning_rate": 8.384818981700683e-09, + "loss": 0.069, + "num_input_tokens_seen": 43159224, + "step": 22045 + }, + { + "epoch": 2.9219350563286945, + "grad_norm": 0.018923955038189888, + "learning_rate": 8.35643578547568e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43161008, + "step": 22046 + }, + { + "epoch": 2.9220675944333996, + "grad_norm": 0.0010692656505852938, + "learning_rate": 8.328100629857449e-09, + "loss": 0.0, + "num_input_tokens_seen": 43162384, + "step": 22047 + }, + { + "epoch": 2.9222001325381046, + "grad_norm": 3.4071366786956787, + "learning_rate": 8.299813515392496e-09, + "loss": 0.0061, + "num_input_tokens_seen": 43164576, + "step": 22048 + }, + { + "epoch": 2.9223326706428097, + "grad_norm": 0.02812289632856846, + "learning_rate": 8.27157444262594e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43166288, + "step": 22049 + }, + { + "epoch": 2.922465208747515, + "grad_norm": 0.2410457879304886, + "learning_rate": 8.243383412102623e-09, + "loss": 0.0016, + "num_input_tokens_seen": 43168120, + "step": 22050 + }, + { + "epoch": 2.92259774685222, + "grad_norm": 0.010149221867322922, + "learning_rate": 8.215240424365723e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43170488, + "step": 22051 + }, + { + "epoch": 2.922730284956925, + "grad_norm": 3.368898391723633, + "learning_rate": 8.187145479957858e-09, + "loss": 0.0166, + "num_input_tokens_seen": 43172424, + "step": 22052 + }, + { + "epoch": 2.9228628230616303, + "grad_norm": 8.495651245117188, + "learning_rate": 8.15909857942082e-09, + "loss": 0.0831, + "num_input_tokens_seen": 43174240, + "step": 22053 + }, + { + "epoch": 2.9229953611663353, + "grad_norm": 0.1026514545083046, + "learning_rate": 8.131099723295565e-09, + "loss": 0.0005, + "num_input_tokens_seen": 43176376, + "step": 22054 + }, + { + "epoch": 2.9231278992710403, + "grad_norm": 1.635671615600586, + "learning_rate": 8.103148912121661e-09, + "loss": 0.004, + "num_input_tokens_seen": 43178048, + "step": 22055 + }, + { + "epoch": 2.9232604373757454, + "grad_norm": 0.002948101842775941, + "learning_rate": 8.07524614643812e-09, + "loss": 0.0, + "num_input_tokens_seen": 43180392, + "step": 22056 + }, + { + "epoch": 2.923392975480451, + "grad_norm": 11.108196258544922, + "learning_rate": 8.047391426783124e-09, + "loss": 0.0662, + "num_input_tokens_seen": 43182856, + "step": 22057 + }, + { + "epoch": 2.923525513585156, + "grad_norm": 0.1270332783460617, + "learning_rate": 8.019584753693189e-09, + "loss": 0.0004, + "num_input_tokens_seen": 43185008, + "step": 22058 + }, + { + "epoch": 2.923658051689861, + "grad_norm": 0.05493020638823509, + "learning_rate": 7.991826127705105e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43186440, + "step": 22059 + }, + { + "epoch": 2.923790589794566, + "grad_norm": 7.82921028137207, + "learning_rate": 7.964115549353447e-09, + "loss": 0.1504, + "num_input_tokens_seen": 43190024, + "step": 22060 + }, + { + "epoch": 2.923923127899271, + "grad_norm": 8.035615921020508, + "learning_rate": 7.936453019173063e-09, + "loss": 0.1009, + "num_input_tokens_seen": 43192320, + "step": 22061 + }, + { + "epoch": 2.924055666003976, + "grad_norm": 0.05808660387992859, + "learning_rate": 7.908838537697138e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43195456, + "step": 22062 + }, + { + "epoch": 2.924188204108681, + "grad_norm": 9.034675598144531, + "learning_rate": 7.88127210545775e-09, + "loss": 0.1319, + "num_input_tokens_seen": 43197872, + "step": 22063 + }, + { + "epoch": 2.9243207422133866, + "grad_norm": 2.089660167694092, + "learning_rate": 7.85375372298669e-09, + "loss": 0.0054, + "num_input_tokens_seen": 43200832, + "step": 22064 + }, + { + "epoch": 2.9244532803180916, + "grad_norm": 8.944283485412598, + "learning_rate": 7.826283390814648e-09, + "loss": 0.1389, + "num_input_tokens_seen": 43202624, + "step": 22065 + }, + { + "epoch": 2.9245858184227966, + "grad_norm": 3.8535234928131104, + "learning_rate": 7.798861109470923e-09, + "loss": 0.0113, + "num_input_tokens_seen": 43204856, + "step": 22066 + }, + { + "epoch": 2.9247183565275017, + "grad_norm": 0.016583627089858055, + "learning_rate": 7.771486879484535e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43206936, + "step": 22067 + }, + { + "epoch": 2.9248508946322067, + "grad_norm": 6.6017937660217285, + "learning_rate": 7.744160701383118e-09, + "loss": 0.0577, + "num_input_tokens_seen": 43208464, + "step": 22068 + }, + { + "epoch": 2.9249834327369117, + "grad_norm": 0.008596937172114849, + "learning_rate": 7.716882575693752e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43210432, + "step": 22069 + }, + { + "epoch": 2.925115970841617, + "grad_norm": 0.6601366400718689, + "learning_rate": 7.689652502942124e-09, + "loss": 0.002, + "num_input_tokens_seen": 43212288, + "step": 22070 + }, + { + "epoch": 2.9252485089463223, + "grad_norm": 7.792044162750244, + "learning_rate": 7.662470483653373e-09, + "loss": 0.1357, + "num_input_tokens_seen": 43213976, + "step": 22071 + }, + { + "epoch": 2.9253810470510273, + "grad_norm": 0.0022907129023224115, + "learning_rate": 7.6353365183518e-09, + "loss": 0.0, + "num_input_tokens_seen": 43215176, + "step": 22072 + }, + { + "epoch": 2.9255135851557323, + "grad_norm": 0.2264651656150818, + "learning_rate": 7.608250607560042e-09, + "loss": 0.0008, + "num_input_tokens_seen": 43216872, + "step": 22073 + }, + { + "epoch": 2.9256461232604374, + "grad_norm": 0.009814048185944557, + "learning_rate": 7.581212751800737e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43218544, + "step": 22074 + }, + { + "epoch": 2.9257786613651424, + "grad_norm": 0.14068816602230072, + "learning_rate": 7.554222951594859e-09, + "loss": 0.0006, + "num_input_tokens_seen": 43220128, + "step": 22075 + }, + { + "epoch": 2.9259111994698475, + "grad_norm": 3.8782832622528076, + "learning_rate": 7.527281207463377e-09, + "loss": 0.0329, + "num_input_tokens_seen": 43221744, + "step": 22076 + }, + { + "epoch": 2.9260437375745525, + "grad_norm": 8.24238395690918, + "learning_rate": 7.500387519925046e-09, + "loss": 0.0139, + "num_input_tokens_seen": 43222888, + "step": 22077 + }, + { + "epoch": 2.926176275679258, + "grad_norm": 6.80824613571167, + "learning_rate": 7.473541889498892e-09, + "loss": 0.048, + "num_input_tokens_seen": 43225336, + "step": 22078 + }, + { + "epoch": 2.926308813783963, + "grad_norm": 2.690898895263672, + "learning_rate": 7.446744316702281e-09, + "loss": 0.0139, + "num_input_tokens_seen": 43226992, + "step": 22079 + }, + { + "epoch": 2.926441351888668, + "grad_norm": 0.4056514501571655, + "learning_rate": 7.419994802052022e-09, + "loss": 0.0005, + "num_input_tokens_seen": 43228504, + "step": 22080 + }, + { + "epoch": 2.926573889993373, + "grad_norm": 0.05016142874956131, + "learning_rate": 7.39329334606409e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43230224, + "step": 22081 + }, + { + "epoch": 2.926706428098078, + "grad_norm": 1.9146174192428589, + "learning_rate": 7.366639949252518e-09, + "loss": 0.0027, + "num_input_tokens_seen": 43232416, + "step": 22082 + }, + { + "epoch": 2.926838966202783, + "grad_norm": 0.005480424966663122, + "learning_rate": 7.340034612132174e-09, + "loss": 0.0, + "num_input_tokens_seen": 43234000, + "step": 22083 + }, + { + "epoch": 2.926971504307488, + "grad_norm": 1.8438187837600708, + "learning_rate": 7.3134773352151464e-09, + "loss": 0.0052, + "num_input_tokens_seen": 43236248, + "step": 22084 + }, + { + "epoch": 2.9271040424121937, + "grad_norm": 8.427469253540039, + "learning_rate": 7.286968119014081e-09, + "loss": 0.0715, + "num_input_tokens_seen": 43238664, + "step": 22085 + }, + { + "epoch": 2.9272365805168987, + "grad_norm": 6.219549179077148, + "learning_rate": 7.2605069640399595e-09, + "loss": 0.0726, + "num_input_tokens_seen": 43240680, + "step": 22086 + }, + { + "epoch": 2.9273691186216038, + "grad_norm": 22.618549346923828, + "learning_rate": 7.234093870802927e-09, + "loss": 0.0383, + "num_input_tokens_seen": 43243240, + "step": 22087 + }, + { + "epoch": 2.927501656726309, + "grad_norm": 0.016974564641714096, + "learning_rate": 7.207728839812023e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43245736, + "step": 22088 + }, + { + "epoch": 2.927634194831014, + "grad_norm": 11.528935432434082, + "learning_rate": 7.181411871576005e-09, + "loss": 0.0586, + "num_input_tokens_seen": 43247808, + "step": 22089 + }, + { + "epoch": 2.927766732935719, + "grad_norm": 6.52501106262207, + "learning_rate": 7.155142966601969e-09, + "loss": 0.0536, + "num_input_tokens_seen": 43249328, + "step": 22090 + }, + { + "epoch": 2.927899271040424, + "grad_norm": 4.307399272918701, + "learning_rate": 7.128922125396454e-09, + "loss": 0.0384, + "num_input_tokens_seen": 43251272, + "step": 22091 + }, + { + "epoch": 2.9280318091451294, + "grad_norm": 10.056846618652344, + "learning_rate": 7.102749348465166e-09, + "loss": 0.0423, + "num_input_tokens_seen": 43253688, + "step": 22092 + }, + { + "epoch": 2.9281643472498344, + "grad_norm": 2.213285207748413, + "learning_rate": 7.076624636312701e-09, + "loss": 0.0033, + "num_input_tokens_seen": 43256056, + "step": 22093 + }, + { + "epoch": 2.9282968853545395, + "grad_norm": 0.6583153605461121, + "learning_rate": 7.050547989442546e-09, + "loss": 0.0018, + "num_input_tokens_seen": 43257832, + "step": 22094 + }, + { + "epoch": 2.9284294234592445, + "grad_norm": 0.015409549698233604, + "learning_rate": 7.024519408357633e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43260384, + "step": 22095 + }, + { + "epoch": 2.9285619615639495, + "grad_norm": 0.9824042320251465, + "learning_rate": 6.998538893559781e-09, + "loss": 0.004, + "num_input_tokens_seen": 43262664, + "step": 22096 + }, + { + "epoch": 2.9286944996686546, + "grad_norm": 0.0654655396938324, + "learning_rate": 6.9726064455502564e-09, + "loss": 0.0003, + "num_input_tokens_seen": 43264256, + "step": 22097 + }, + { + "epoch": 2.9288270377733596, + "grad_norm": 2.583324670791626, + "learning_rate": 6.946722064828382e-09, + "loss": 0.0159, + "num_input_tokens_seen": 43266320, + "step": 22098 + }, + { + "epoch": 2.928959575878065, + "grad_norm": 10.743614196777344, + "learning_rate": 6.9208857518934805e-09, + "loss": 0.1085, + "num_input_tokens_seen": 43268592, + "step": 22099 + }, + { + "epoch": 2.92909211398277, + "grad_norm": 0.002843249123543501, + "learning_rate": 6.895097507244042e-09, + "loss": 0.0, + "num_input_tokens_seen": 43270008, + "step": 22100 + }, + { + "epoch": 2.929224652087475, + "grad_norm": 5.35045862197876, + "learning_rate": 6.869357331376891e-09, + "loss": 0.0668, + "num_input_tokens_seen": 43271480, + "step": 22101 + }, + { + "epoch": 2.92935719019218, + "grad_norm": 30.963146209716797, + "learning_rate": 6.843665224788576e-09, + "loss": 0.2139, + "num_input_tokens_seen": 43273000, + "step": 22102 + }, + { + "epoch": 2.9294897282968853, + "grad_norm": 7.191398620605469, + "learning_rate": 6.818021187974533e-09, + "loss": 0.0695, + "num_input_tokens_seen": 43275728, + "step": 22103 + }, + { + "epoch": 2.9296222664015903, + "grad_norm": 6.045431613922119, + "learning_rate": 6.7924252214285336e-09, + "loss": 0.0272, + "num_input_tokens_seen": 43277720, + "step": 22104 + }, + { + "epoch": 2.9297548045062953, + "grad_norm": 0.005579132121056318, + "learning_rate": 6.766877325644905e-09, + "loss": 0.0, + "num_input_tokens_seen": 43279304, + "step": 22105 + }, + { + "epoch": 2.929887342611001, + "grad_norm": 6.941376209259033, + "learning_rate": 6.7413775011154756e-09, + "loss": 0.0479, + "num_input_tokens_seen": 43280944, + "step": 22106 + }, + { + "epoch": 2.930019880715706, + "grad_norm": 9.945730209350586, + "learning_rate": 6.71592574833263e-09, + "loss": 0.0831, + "num_input_tokens_seen": 43283592, + "step": 22107 + }, + { + "epoch": 2.930152418820411, + "grad_norm": 0.029946571215987206, + "learning_rate": 6.6905220677868085e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43285352, + "step": 22108 + }, + { + "epoch": 2.930284956925116, + "grad_norm": 2.69757342338562, + "learning_rate": 6.6651664599676204e-09, + "loss": 0.0137, + "num_input_tokens_seen": 43287736, + "step": 22109 + }, + { + "epoch": 2.930417495029821, + "grad_norm": 8.09864330291748, + "learning_rate": 6.639858925364118e-09, + "loss": 0.1031, + "num_input_tokens_seen": 43289640, + "step": 22110 + }, + { + "epoch": 2.9305500331345264, + "grad_norm": 14.94510555267334, + "learning_rate": 6.6145994644642445e-09, + "loss": 0.0743, + "num_input_tokens_seen": 43292136, + "step": 22111 + }, + { + "epoch": 2.930682571239231, + "grad_norm": 3.3469364643096924, + "learning_rate": 6.589388077754833e-09, + "loss": 0.0344, + "num_input_tokens_seen": 43294640, + "step": 22112 + }, + { + "epoch": 2.9308151093439365, + "grad_norm": 0.3360113799571991, + "learning_rate": 6.5642247657224376e-09, + "loss": 0.0012, + "num_input_tokens_seen": 43296968, + "step": 22113 + }, + { + "epoch": 2.9309476474486416, + "grad_norm": 0.016380127519369125, + "learning_rate": 6.539109528851673e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43298568, + "step": 22114 + }, + { + "epoch": 2.9310801855533466, + "grad_norm": 3.8965156078338623, + "learning_rate": 6.514042367627149e-09, + "loss": 0.0435, + "num_input_tokens_seen": 43300112, + "step": 22115 + }, + { + "epoch": 2.9312127236580516, + "grad_norm": 2.0914340019226074, + "learning_rate": 6.489023282532092e-09, + "loss": 0.0083, + "num_input_tokens_seen": 43301824, + "step": 22116 + }, + { + "epoch": 2.9313452617627567, + "grad_norm": 2.017146110534668, + "learning_rate": 6.4640522740486155e-09, + "loss": 0.031, + "num_input_tokens_seen": 43303640, + "step": 22117 + }, + { + "epoch": 2.931477799867462, + "grad_norm": 1.4194573163986206, + "learning_rate": 6.439129342658557e-09, + "loss": 0.0081, + "num_input_tokens_seen": 43305520, + "step": 22118 + }, + { + "epoch": 2.9316103379721667, + "grad_norm": 0.0023405931424349546, + "learning_rate": 6.414254488842364e-09, + "loss": 0.0, + "num_input_tokens_seen": 43306952, + "step": 22119 + }, + { + "epoch": 2.9317428760768722, + "grad_norm": 2.6504392623901367, + "learning_rate": 6.3894277130796545e-09, + "loss": 0.0102, + "num_input_tokens_seen": 43308784, + "step": 22120 + }, + { + "epoch": 2.9318754141815773, + "grad_norm": 2.050424575805664, + "learning_rate": 6.364649015848934e-09, + "loss": 0.0242, + "num_input_tokens_seen": 43310560, + "step": 22121 + }, + { + "epoch": 2.9320079522862823, + "grad_norm": 0.023232610896229744, + "learning_rate": 6.339918397627876e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43312104, + "step": 22122 + }, + { + "epoch": 2.9321404903909873, + "grad_norm": 0.11372028291225433, + "learning_rate": 6.315235858893598e-09, + "loss": 0.0005, + "num_input_tokens_seen": 43313664, + "step": 22123 + }, + { + "epoch": 2.9322730284956924, + "grad_norm": 8.348061561584473, + "learning_rate": 6.2906014001221094e-09, + "loss": 0.0924, + "num_input_tokens_seen": 43315784, + "step": 22124 + }, + { + "epoch": 2.932405566600398, + "grad_norm": 0.014346299692988396, + "learning_rate": 6.266015021787752e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43317440, + "step": 22125 + }, + { + "epoch": 2.9325381047051025, + "grad_norm": 0.03806941956281662, + "learning_rate": 6.241476724365148e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43318760, + "step": 22126 + }, + { + "epoch": 2.932670642809808, + "grad_norm": 0.30412936210632324, + "learning_rate": 6.216986508327527e-09, + "loss": 0.0011, + "num_input_tokens_seen": 43320680, + "step": 22127 + }, + { + "epoch": 2.932803180914513, + "grad_norm": 0.7329542636871338, + "learning_rate": 6.192544374146458e-09, + "loss": 0.0034, + "num_input_tokens_seen": 43322992, + "step": 22128 + }, + { + "epoch": 2.932935719019218, + "grad_norm": 1.9308414459228516, + "learning_rate": 6.1681503222935094e-09, + "loss": 0.0091, + "num_input_tokens_seen": 43326168, + "step": 22129 + }, + { + "epoch": 2.933068257123923, + "grad_norm": 2.010263681411743, + "learning_rate": 6.143804353238858e-09, + "loss": 0.0076, + "num_input_tokens_seen": 43329000, + "step": 22130 + }, + { + "epoch": 2.933200795228628, + "grad_norm": 0.009435562416911125, + "learning_rate": 6.119506467452408e-09, + "loss": 0.0, + "num_input_tokens_seen": 43330568, + "step": 22131 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 10.957011222839355, + "learning_rate": 6.0952566654023934e-09, + "loss": 0.0802, + "num_input_tokens_seen": 43332560, + "step": 22132 + }, + { + "epoch": 2.933465871438038, + "grad_norm": 0.014063953422009945, + "learning_rate": 6.071054947555943e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43333912, + "step": 22133 + }, + { + "epoch": 2.9335984095427436, + "grad_norm": 14.409337043762207, + "learning_rate": 6.046901314380183e-09, + "loss": 0.2201, + "num_input_tokens_seen": 43336160, + "step": 22134 + }, + { + "epoch": 2.9337309476474487, + "grad_norm": 0.01625487580895424, + "learning_rate": 6.022795766340572e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43338160, + "step": 22135 + }, + { + "epoch": 2.9338634857521537, + "grad_norm": 7.523234844207764, + "learning_rate": 5.998738303902018e-09, + "loss": 0.0902, + "num_input_tokens_seen": 43341040, + "step": 22136 + }, + { + "epoch": 2.9339960238568588, + "grad_norm": 0.002209529746323824, + "learning_rate": 5.974728927528595e-09, + "loss": 0.0, + "num_input_tokens_seen": 43342176, + "step": 22137 + }, + { + "epoch": 2.934128561961564, + "grad_norm": 5.534908771514893, + "learning_rate": 5.9507676376827085e-09, + "loss": 0.0838, + "num_input_tokens_seen": 43344416, + "step": 22138 + }, + { + "epoch": 2.9342611000662693, + "grad_norm": 4.271093845367432, + "learning_rate": 5.92685443482649e-09, + "loss": 0.0197, + "num_input_tokens_seen": 43347416, + "step": 22139 + }, + { + "epoch": 2.9343936381709743, + "grad_norm": 0.03302037715911865, + "learning_rate": 5.902989319421237e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43349656, + "step": 22140 + }, + { + "epoch": 2.9345261762756794, + "grad_norm": 5.448806285858154, + "learning_rate": 5.87917229192686e-09, + "loss": 0.0332, + "num_input_tokens_seen": 43351768, + "step": 22141 + }, + { + "epoch": 2.9346587143803844, + "grad_norm": 0.008495145477354527, + "learning_rate": 5.8554033528024355e-09, + "loss": 0.0, + "num_input_tokens_seen": 43353656, + "step": 22142 + }, + { + "epoch": 2.9347912524850894, + "grad_norm": 0.012205966748297215, + "learning_rate": 5.831682502506764e-09, + "loss": 0.0, + "num_input_tokens_seen": 43356000, + "step": 22143 + }, + { + "epoch": 2.9349237905897945, + "grad_norm": 0.0032853432931005955, + "learning_rate": 5.8080097414967e-09, + "loss": 0.0, + "num_input_tokens_seen": 43357568, + "step": 22144 + }, + { + "epoch": 2.9350563286944995, + "grad_norm": 0.006267149467021227, + "learning_rate": 5.784385070229104e-09, + "loss": 0.0, + "num_input_tokens_seen": 43359344, + "step": 22145 + }, + { + "epoch": 2.935188866799205, + "grad_norm": 10.19390869140625, + "learning_rate": 5.760808489159164e-09, + "loss": 0.1342, + "num_input_tokens_seen": 43361912, + "step": 22146 + }, + { + "epoch": 2.93532140490391, + "grad_norm": 0.03335731104016304, + "learning_rate": 5.737279998741241e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43364440, + "step": 22147 + }, + { + "epoch": 2.935453943008615, + "grad_norm": 0.43734079599380493, + "learning_rate": 5.713799599429693e-09, + "loss": 0.0012, + "num_input_tokens_seen": 43365896, + "step": 22148 + }, + { + "epoch": 2.93558648111332, + "grad_norm": 0.08593138307332993, + "learning_rate": 5.69036729167638e-09, + "loss": 0.0004, + "num_input_tokens_seen": 43367896, + "step": 22149 + }, + { + "epoch": 2.935719019218025, + "grad_norm": 0.009426298551261425, + "learning_rate": 5.6669830759339965e-09, + "loss": 0.0, + "num_input_tokens_seen": 43369072, + "step": 22150 + }, + { + "epoch": 2.93585155732273, + "grad_norm": 6.8926239013671875, + "learning_rate": 5.64364695265246e-09, + "loss": 0.0327, + "num_input_tokens_seen": 43371120, + "step": 22151 + }, + { + "epoch": 2.935984095427435, + "grad_norm": 0.005670865066349506, + "learning_rate": 5.6203589222825205e-09, + "loss": 0.0, + "num_input_tokens_seen": 43372152, + "step": 22152 + }, + { + "epoch": 2.9361166335321407, + "grad_norm": 0.8136738538742065, + "learning_rate": 5.59711898527271e-09, + "loss": 0.0039, + "num_input_tokens_seen": 43375024, + "step": 22153 + }, + { + "epoch": 2.9362491716368457, + "grad_norm": 5.071423530578613, + "learning_rate": 5.5739271420712785e-09, + "loss": 0.0506, + "num_input_tokens_seen": 43376624, + "step": 22154 + }, + { + "epoch": 2.9363817097415508, + "grad_norm": 24.13330078125, + "learning_rate": 5.550783393125092e-09, + "loss": 0.1184, + "num_input_tokens_seen": 43378248, + "step": 22155 + }, + { + "epoch": 2.936514247846256, + "grad_norm": 0.002653921488672495, + "learning_rate": 5.5276877388810155e-09, + "loss": 0.0, + "num_input_tokens_seen": 43380880, + "step": 22156 + }, + { + "epoch": 2.936646785950961, + "grad_norm": 0.011623536236584187, + "learning_rate": 5.504640179783693e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43382688, + "step": 22157 + }, + { + "epoch": 2.936779324055666, + "grad_norm": 0.046251118183135986, + "learning_rate": 5.481640716278047e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43383696, + "step": 22158 + }, + { + "epoch": 2.936911862160371, + "grad_norm": 0.0017245863564312458, + "learning_rate": 5.4586893488070555e-09, + "loss": 0.0, + "num_input_tokens_seen": 43384928, + "step": 22159 + }, + { + "epoch": 2.9370444002650764, + "grad_norm": 0.03072090446949005, + "learning_rate": 5.435786077813421e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43386896, + "step": 22160 + }, + { + "epoch": 2.9371769383697814, + "grad_norm": 2.2341530323028564, + "learning_rate": 5.412930903738734e-09, + "loss": 0.0122, + "num_input_tokens_seen": 43388824, + "step": 22161 + }, + { + "epoch": 2.9373094764744865, + "grad_norm": 10.868108749389648, + "learning_rate": 5.390123827023752e-09, + "loss": 0.0932, + "num_input_tokens_seen": 43390600, + "step": 22162 + }, + { + "epoch": 2.9374420145791915, + "grad_norm": 0.0027919115964323282, + "learning_rate": 5.367364848108125e-09, + "loss": 0.0, + "num_input_tokens_seen": 43391528, + "step": 22163 + }, + { + "epoch": 2.9375745526838966, + "grad_norm": 0.06976006925106049, + "learning_rate": 5.344653967430669e-09, + "loss": 0.0004, + "num_input_tokens_seen": 43392984, + "step": 22164 + }, + { + "epoch": 2.9377070907886016, + "grad_norm": 2.8172214031219482, + "learning_rate": 5.321991185429087e-09, + "loss": 0.012, + "num_input_tokens_seen": 43395120, + "step": 22165 + }, + { + "epoch": 2.9378396288933066, + "grad_norm": 2.775331974029541, + "learning_rate": 5.299376502540809e-09, + "loss": 0.0324, + "num_input_tokens_seen": 43397072, + "step": 22166 + }, + { + "epoch": 2.937972166998012, + "grad_norm": 0.09739279001951218, + "learning_rate": 5.27680991920132e-09, + "loss": 0.0005, + "num_input_tokens_seen": 43398416, + "step": 22167 + }, + { + "epoch": 2.938104705102717, + "grad_norm": 8.417837142944336, + "learning_rate": 5.254291435846104e-09, + "loss": 0.1836, + "num_input_tokens_seen": 43401040, + "step": 22168 + }, + { + "epoch": 2.938237243207422, + "grad_norm": 7.034914493560791, + "learning_rate": 5.231821052908981e-09, + "loss": 0.0269, + "num_input_tokens_seen": 43403104, + "step": 22169 + }, + { + "epoch": 2.9383697813121272, + "grad_norm": 17.984704971313477, + "learning_rate": 5.209398770823493e-09, + "loss": 0.2277, + "num_input_tokens_seen": 43404984, + "step": 22170 + }, + { + "epoch": 2.9385023194168323, + "grad_norm": 4.409945964813232, + "learning_rate": 5.187024590021794e-09, + "loss": 0.0273, + "num_input_tokens_seen": 43406632, + "step": 22171 + }, + { + "epoch": 2.9386348575215373, + "grad_norm": 0.01551005057990551, + "learning_rate": 5.164698510935484e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43408688, + "step": 22172 + }, + { + "epoch": 2.9387673956262423, + "grad_norm": 0.00377973192371428, + "learning_rate": 5.142420533995052e-09, + "loss": 0.0, + "num_input_tokens_seen": 43409912, + "step": 22173 + }, + { + "epoch": 2.938899933730948, + "grad_norm": 0.007364246528595686, + "learning_rate": 5.120190659629598e-09, + "loss": 0.0, + "num_input_tokens_seen": 43412008, + "step": 22174 + }, + { + "epoch": 2.939032471835653, + "grad_norm": 12.22370719909668, + "learning_rate": 5.098008888267947e-09, + "loss": 0.2074, + "num_input_tokens_seen": 43414400, + "step": 22175 + }, + { + "epoch": 2.939165009940358, + "grad_norm": 0.0016230563633143902, + "learning_rate": 5.075875220338089e-09, + "loss": 0.0, + "num_input_tokens_seen": 43415832, + "step": 22176 + }, + { + "epoch": 2.939297548045063, + "grad_norm": 4.920330047607422, + "learning_rate": 5.05378965626635e-09, + "loss": 0.041, + "num_input_tokens_seen": 43417576, + "step": 22177 + }, + { + "epoch": 2.939430086149768, + "grad_norm": 2.4089043140411377, + "learning_rate": 5.0317521964787786e-09, + "loss": 0.0204, + "num_input_tokens_seen": 43420160, + "step": 22178 + }, + { + "epoch": 2.939562624254473, + "grad_norm": 4.749571800231934, + "learning_rate": 5.009762841400034e-09, + "loss": 0.0695, + "num_input_tokens_seen": 43422056, + "step": 22179 + }, + { + "epoch": 2.939695162359178, + "grad_norm": 0.0010952723678201437, + "learning_rate": 4.9878215914545004e-09, + "loss": 0.0, + "num_input_tokens_seen": 43423408, + "step": 22180 + }, + { + "epoch": 2.9398277004638835, + "grad_norm": 0.22786812484264374, + "learning_rate": 4.9659284470648935e-09, + "loss": 0.0009, + "num_input_tokens_seen": 43425184, + "step": 22181 + }, + { + "epoch": 2.9399602385685886, + "grad_norm": 14.400517463684082, + "learning_rate": 4.9440834086533775e-09, + "loss": 0.3224, + "num_input_tokens_seen": 43427024, + "step": 22182 + }, + { + "epoch": 2.9400927766732936, + "grad_norm": 0.012812037952244282, + "learning_rate": 4.92228647664128e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43428888, + "step": 22183 + }, + { + "epoch": 2.9402253147779986, + "grad_norm": 0.008589325472712517, + "learning_rate": 4.900537651448545e-09, + "loss": 0.0, + "num_input_tokens_seen": 43430088, + "step": 22184 + }, + { + "epoch": 2.9403578528827037, + "grad_norm": 5.465794086456299, + "learning_rate": 4.8788369334951126e-09, + "loss": 0.0433, + "num_input_tokens_seen": 43433848, + "step": 22185 + }, + { + "epoch": 2.9404903909874087, + "grad_norm": 0.04985884577035904, + "learning_rate": 4.857184323198705e-09, + "loss": 0.0003, + "num_input_tokens_seen": 43436752, + "step": 22186 + }, + { + "epoch": 2.9406229290921138, + "grad_norm": 0.1549299955368042, + "learning_rate": 4.835579820977043e-09, + "loss": 0.0011, + "num_input_tokens_seen": 43438840, + "step": 22187 + }, + { + "epoch": 2.9407554671968192, + "grad_norm": 13.72365665435791, + "learning_rate": 4.814023427247017e-09, + "loss": 0.3077, + "num_input_tokens_seen": 43441104, + "step": 22188 + }, + { + "epoch": 2.9408880053015243, + "grad_norm": 0.02718745730817318, + "learning_rate": 4.792515142423848e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43444024, + "step": 22189 + }, + { + "epoch": 2.9410205434062293, + "grad_norm": 2.9345405101776123, + "learning_rate": 4.7710549669222065e-09, + "loss": 0.0165, + "num_input_tokens_seen": 43445672, + "step": 22190 + }, + { + "epoch": 2.9411530815109344, + "grad_norm": 6.276704788208008, + "learning_rate": 4.749642901156204e-09, + "loss": 0.0132, + "num_input_tokens_seen": 43447584, + "step": 22191 + }, + { + "epoch": 2.9412856196156394, + "grad_norm": 0.008495446294546127, + "learning_rate": 4.728278945538289e-09, + "loss": 0.0, + "num_input_tokens_seen": 43448784, + "step": 22192 + }, + { + "epoch": 2.941418157720345, + "grad_norm": 9.648369789123535, + "learning_rate": 4.706963100480633e-09, + "loss": 0.0192, + "num_input_tokens_seen": 43451120, + "step": 22193 + }, + { + "epoch": 2.9415506958250495, + "grad_norm": 0.2611417770385742, + "learning_rate": 4.685695366394016e-09, + "loss": 0.0018, + "num_input_tokens_seen": 43453992, + "step": 22194 + }, + { + "epoch": 2.941683233929755, + "grad_norm": 14.668900489807129, + "learning_rate": 4.664475743688668e-09, + "loss": 0.2126, + "num_input_tokens_seen": 43456448, + "step": 22195 + }, + { + "epoch": 2.94181577203446, + "grad_norm": 0.08406632393598557, + "learning_rate": 4.643304232773982e-09, + "loss": 0.0003, + "num_input_tokens_seen": 43459360, + "step": 22196 + }, + { + "epoch": 2.941948310139165, + "grad_norm": 0.3660418689250946, + "learning_rate": 4.62218083405741e-09, + "loss": 0.0016, + "num_input_tokens_seen": 43461160, + "step": 22197 + }, + { + "epoch": 2.94208084824387, + "grad_norm": 0.5038986206054688, + "learning_rate": 4.60110554794696e-09, + "loss": 0.0036, + "num_input_tokens_seen": 43464288, + "step": 22198 + }, + { + "epoch": 2.942213386348575, + "grad_norm": 12.731736183166504, + "learning_rate": 4.5800783748486955e-09, + "loss": 0.1609, + "num_input_tokens_seen": 43467112, + "step": 22199 + }, + { + "epoch": 2.9423459244532806, + "grad_norm": 0.003218056634068489, + "learning_rate": 4.559099315167848e-09, + "loss": 0.0, + "num_input_tokens_seen": 43468408, + "step": 22200 + }, + { + "epoch": 2.942478462557985, + "grad_norm": 6.425694465637207, + "learning_rate": 4.538168369309093e-09, + "loss": 0.0485, + "num_input_tokens_seen": 43470848, + "step": 22201 + }, + { + "epoch": 2.9426110006626907, + "grad_norm": 2.1874780654907227, + "learning_rate": 4.517285537675997e-09, + "loss": 0.0075, + "num_input_tokens_seen": 43472400, + "step": 22202 + }, + { + "epoch": 2.9427435387673957, + "grad_norm": 7.34139347076416, + "learning_rate": 4.496450820671294e-09, + "loss": 0.1725, + "num_input_tokens_seen": 43474648, + "step": 22203 + }, + { + "epoch": 2.9428760768721007, + "grad_norm": 0.006921963766217232, + "learning_rate": 4.4756642186963295e-09, + "loss": 0.0, + "num_input_tokens_seen": 43476512, + "step": 22204 + }, + { + "epoch": 2.9430086149768058, + "grad_norm": 11.82992935180664, + "learning_rate": 4.454925732152449e-09, + "loss": 0.1437, + "num_input_tokens_seen": 43478288, + "step": 22205 + }, + { + "epoch": 2.943141153081511, + "grad_norm": 5.393231391906738, + "learning_rate": 4.434235361439054e-09, + "loss": 0.0232, + "num_input_tokens_seen": 43480312, + "step": 22206 + }, + { + "epoch": 2.9432736911862163, + "grad_norm": 10.708026885986328, + "learning_rate": 4.413593106955272e-09, + "loss": 0.2381, + "num_input_tokens_seen": 43481856, + "step": 22207 + }, + { + "epoch": 2.943406229290921, + "grad_norm": 6.932523250579834, + "learning_rate": 4.392998969098838e-09, + "loss": 0.0521, + "num_input_tokens_seen": 43483912, + "step": 22208 + }, + { + "epoch": 2.9435387673956264, + "grad_norm": 2.2146403789520264, + "learning_rate": 4.372452948267214e-09, + "loss": 0.0143, + "num_input_tokens_seen": 43485280, + "step": 22209 + }, + { + "epoch": 2.9436713055003314, + "grad_norm": 0.24946315586566925, + "learning_rate": 4.3519550448561935e-09, + "loss": 0.0007, + "num_input_tokens_seen": 43488680, + "step": 22210 + }, + { + "epoch": 2.9438038436050364, + "grad_norm": 0.6993536949157715, + "learning_rate": 4.331505259261293e-09, + "loss": 0.0056, + "num_input_tokens_seen": 43490576, + "step": 22211 + }, + { + "epoch": 2.9439363817097415, + "grad_norm": 14.776391983032227, + "learning_rate": 4.311103591876365e-09, + "loss": 0.2447, + "num_input_tokens_seen": 43494168, + "step": 22212 + }, + { + "epoch": 2.9440689198144465, + "grad_norm": 7.063780784606934, + "learning_rate": 4.2907500430952595e-09, + "loss": 0.0752, + "num_input_tokens_seen": 43497616, + "step": 22213 + }, + { + "epoch": 2.944201457919152, + "grad_norm": 0.0016312790103256702, + "learning_rate": 4.2704446133101655e-09, + "loss": 0.0, + "num_input_tokens_seen": 43499184, + "step": 22214 + }, + { + "epoch": 2.9443339960238566, + "grad_norm": 11.208063125610352, + "learning_rate": 4.2501873029127115e-09, + "loss": 0.0647, + "num_input_tokens_seen": 43501440, + "step": 22215 + }, + { + "epoch": 2.944466534128562, + "grad_norm": 0.06850332021713257, + "learning_rate": 4.22997811229342e-09, + "loss": 0.0005, + "num_input_tokens_seen": 43503512, + "step": 22216 + }, + { + "epoch": 2.944599072233267, + "grad_norm": 0.010271979495882988, + "learning_rate": 4.209817041841702e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43504464, + "step": 22217 + }, + { + "epoch": 2.944731610337972, + "grad_norm": 0.16310912370681763, + "learning_rate": 4.189704091946411e-09, + "loss": 0.0003, + "num_input_tokens_seen": 43506520, + "step": 22218 + }, + { + "epoch": 2.944864148442677, + "grad_norm": 0.6446613669395447, + "learning_rate": 4.169639262995573e-09, + "loss": 0.0024, + "num_input_tokens_seen": 43509936, + "step": 22219 + }, + { + "epoch": 2.9449966865473822, + "grad_norm": 0.03429146856069565, + "learning_rate": 4.149622555375821e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43511656, + "step": 22220 + }, + { + "epoch": 2.9451292246520877, + "grad_norm": 5.493871688842773, + "learning_rate": 4.1296539694732355e-09, + "loss": 0.0597, + "num_input_tokens_seen": 43513872, + "step": 22221 + }, + { + "epoch": 2.9452617627567923, + "grad_norm": 0.004199087154120207, + "learning_rate": 4.109733505672509e-09, + "loss": 0.0, + "num_input_tokens_seen": 43515952, + "step": 22222 + }, + { + "epoch": 2.945394300861498, + "grad_norm": 0.35710567235946655, + "learning_rate": 4.089861164358056e-09, + "loss": 0.0007, + "num_input_tokens_seen": 43517608, + "step": 22223 + }, + { + "epoch": 2.945526838966203, + "grad_norm": 0.03404812514781952, + "learning_rate": 4.070036945912903e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43518848, + "step": 22224 + }, + { + "epoch": 2.945659377070908, + "grad_norm": 0.5212902426719666, + "learning_rate": 4.0502608507192454e-09, + "loss": 0.0007, + "num_input_tokens_seen": 43521176, + "step": 22225 + }, + { + "epoch": 2.945791915175613, + "grad_norm": 0.12299291789531708, + "learning_rate": 4.030532879158722e-09, + "loss": 0.0007, + "num_input_tokens_seen": 43523376, + "step": 22226 + }, + { + "epoch": 2.945924453280318, + "grad_norm": 3.2495760917663574, + "learning_rate": 4.010853031611029e-09, + "loss": 0.0426, + "num_input_tokens_seen": 43525152, + "step": 22227 + }, + { + "epoch": 2.9460569913850234, + "grad_norm": 0.0035223516169935465, + "learning_rate": 3.991221308456139e-09, + "loss": 0.0, + "num_input_tokens_seen": 43526520, + "step": 22228 + }, + { + "epoch": 2.9461895294897285, + "grad_norm": 1.9483466148376465, + "learning_rate": 3.971637710072363e-09, + "loss": 0.0149, + "num_input_tokens_seen": 43528272, + "step": 22229 + }, + { + "epoch": 2.9463220675944335, + "grad_norm": 0.03639443218708038, + "learning_rate": 3.952102236837452e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43530632, + "step": 22230 + }, + { + "epoch": 2.9464546056991385, + "grad_norm": 6.411543846130371, + "learning_rate": 3.93261488912805e-09, + "loss": 0.0262, + "num_input_tokens_seen": 43533008, + "step": 22231 + }, + { + "epoch": 2.9465871438038436, + "grad_norm": 4.313691139221191, + "learning_rate": 3.913175667319413e-09, + "loss": 0.0317, + "num_input_tokens_seen": 43534960, + "step": 22232 + }, + { + "epoch": 2.9467196819085486, + "grad_norm": 5.763996601104736, + "learning_rate": 3.893784571787074e-09, + "loss": 0.0392, + "num_input_tokens_seen": 43536904, + "step": 22233 + }, + { + "epoch": 2.9468522200132536, + "grad_norm": 0.23853237926959991, + "learning_rate": 3.874441602904344e-09, + "loss": 0.0008, + "num_input_tokens_seen": 43537968, + "step": 22234 + }, + { + "epoch": 2.946984758117959, + "grad_norm": 0.0018829447217285633, + "learning_rate": 3.855146761044259e-09, + "loss": 0.0, + "num_input_tokens_seen": 43539104, + "step": 22235 + }, + { + "epoch": 2.947117296222664, + "grad_norm": 0.03619382530450821, + "learning_rate": 3.8359000465793e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43541744, + "step": 22236 + }, + { + "epoch": 2.947249834327369, + "grad_norm": 0.0006016155239194632, + "learning_rate": 3.816701459880001e-09, + "loss": 0.0, + "num_input_tokens_seen": 43543160, + "step": 22237 + }, + { + "epoch": 2.9473823724320742, + "grad_norm": 4.760390281677246, + "learning_rate": 3.797551001316624e-09, + "loss": 0.0733, + "num_input_tokens_seen": 43546168, + "step": 22238 + }, + { + "epoch": 2.9475149105367793, + "grad_norm": 0.00532658351585269, + "learning_rate": 3.7784486712585946e-09, + "loss": 0.0, + "num_input_tokens_seen": 43547784, + "step": 22239 + }, + { + "epoch": 2.9476474486414843, + "grad_norm": 0.11292336881160736, + "learning_rate": 3.759394470074229e-09, + "loss": 0.0003, + "num_input_tokens_seen": 43549192, + "step": 22240 + }, + { + "epoch": 2.9477799867461894, + "grad_norm": 2.300103187561035, + "learning_rate": 3.740388398130734e-09, + "loss": 0.0075, + "num_input_tokens_seen": 43550616, + "step": 22241 + }, + { + "epoch": 2.947912524850895, + "grad_norm": 14.270966529846191, + "learning_rate": 3.7214304557944835e-09, + "loss": 0.0856, + "num_input_tokens_seen": 43552808, + "step": 22242 + }, + { + "epoch": 2.9480450629556, + "grad_norm": 12.490785598754883, + "learning_rate": 3.7025206434312954e-09, + "loss": 0.0888, + "num_input_tokens_seen": 43554880, + "step": 22243 + }, + { + "epoch": 2.948177601060305, + "grad_norm": 5.024721622467041, + "learning_rate": 3.6836589614056007e-09, + "loss": 0.0841, + "num_input_tokens_seen": 43556528, + "step": 22244 + }, + { + "epoch": 2.94831013916501, + "grad_norm": 10.244945526123047, + "learning_rate": 3.664845410080997e-09, + "loss": 0.0604, + "num_input_tokens_seen": 43558416, + "step": 22245 + }, + { + "epoch": 2.948442677269715, + "grad_norm": 0.7364751100540161, + "learning_rate": 3.646079989820528e-09, + "loss": 0.0017, + "num_input_tokens_seen": 43560624, + "step": 22246 + }, + { + "epoch": 2.94857521537442, + "grad_norm": 7.57244873046875, + "learning_rate": 3.6273627009855706e-09, + "loss": 0.0554, + "num_input_tokens_seen": 43563104, + "step": 22247 + }, + { + "epoch": 2.948707753479125, + "grad_norm": 7.464260101318359, + "learning_rate": 3.608693543937225e-09, + "loss": 0.0753, + "num_input_tokens_seen": 43565280, + "step": 22248 + }, + { + "epoch": 2.9488402915838305, + "grad_norm": 8.373775482177734, + "learning_rate": 3.5900725190357587e-09, + "loss": 0.1406, + "num_input_tokens_seen": 43566992, + "step": 22249 + }, + { + "epoch": 2.9489728296885356, + "grad_norm": 0.002331664552912116, + "learning_rate": 3.571499626639496e-09, + "loss": 0.0, + "num_input_tokens_seen": 43568344, + "step": 22250 + }, + { + "epoch": 2.9491053677932406, + "grad_norm": 0.7943106889724731, + "learning_rate": 3.5529748671070396e-09, + "loss": 0.0037, + "num_input_tokens_seen": 43571104, + "step": 22251 + }, + { + "epoch": 2.9492379058979457, + "grad_norm": 0.027450673282146454, + "learning_rate": 3.534498240795603e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43572840, + "step": 22252 + }, + { + "epoch": 2.9493704440026507, + "grad_norm": 0.0012822701828554273, + "learning_rate": 3.5160697480610127e-09, + "loss": 0.0, + "num_input_tokens_seen": 43574528, + "step": 22253 + }, + { + "epoch": 2.9495029821073557, + "grad_norm": 0.05112503841519356, + "learning_rate": 3.497689389258818e-09, + "loss": 0.0003, + "num_input_tokens_seen": 43576272, + "step": 22254 + }, + { + "epoch": 2.9496355202120608, + "grad_norm": 10.213202476501465, + "learning_rate": 3.4793571647434577e-09, + "loss": 0.1349, + "num_input_tokens_seen": 43578112, + "step": 22255 + }, + { + "epoch": 2.9497680583167663, + "grad_norm": 1.7686731815338135, + "learning_rate": 3.461073074868537e-09, + "loss": 0.0048, + "num_input_tokens_seen": 43580344, + "step": 22256 + }, + { + "epoch": 2.9499005964214713, + "grad_norm": 3.5689826011657715, + "learning_rate": 3.4428371199862755e-09, + "loss": 0.0103, + "num_input_tokens_seen": 43582160, + "step": 22257 + }, + { + "epoch": 2.9500331345261763, + "grad_norm": 0.048518940806388855, + "learning_rate": 3.4246493004486127e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43584136, + "step": 22258 + }, + { + "epoch": 2.9501656726308814, + "grad_norm": 0.6350401043891907, + "learning_rate": 3.4065096166055465e-09, + "loss": 0.0027, + "num_input_tokens_seen": 43586048, + "step": 22259 + }, + { + "epoch": 2.9502982107355864, + "grad_norm": 0.01981395110487938, + "learning_rate": 3.3884180688076308e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43587912, + "step": 22260 + }, + { + "epoch": 2.9504307488402914, + "grad_norm": 4.001582145690918, + "learning_rate": 3.370374657403197e-09, + "loss": 0.0267, + "num_input_tokens_seen": 43589856, + "step": 22261 + }, + { + "epoch": 2.9505632869449965, + "grad_norm": 6.8600687980651855, + "learning_rate": 3.352379382740578e-09, + "loss": 0.0568, + "num_input_tokens_seen": 43591248, + "step": 22262 + }, + { + "epoch": 2.950695825049702, + "grad_norm": 0.006536876782774925, + "learning_rate": 3.3344322451661636e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43592904, + "step": 22263 + }, + { + "epoch": 2.950828363154407, + "grad_norm": 0.25441277027130127, + "learning_rate": 3.316533245026343e-09, + "loss": 0.0011, + "num_input_tokens_seen": 43594672, + "step": 22264 + }, + { + "epoch": 2.950960901259112, + "grad_norm": 0.5788674354553223, + "learning_rate": 3.298682382665841e-09, + "loss": 0.003, + "num_input_tokens_seen": 43596576, + "step": 22265 + }, + { + "epoch": 2.951093439363817, + "grad_norm": 0.01784122921526432, + "learning_rate": 3.280879658429381e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43599136, + "step": 22266 + }, + { + "epoch": 2.951225977468522, + "grad_norm": 0.005329910200089216, + "learning_rate": 3.263125072660023e-09, + "loss": 0.0, + "num_input_tokens_seen": 43601392, + "step": 22267 + }, + { + "epoch": 2.951358515573227, + "grad_norm": 15.184979438781738, + "learning_rate": 3.245418625699992e-09, + "loss": 0.1412, + "num_input_tokens_seen": 43603408, + "step": 22268 + }, + { + "epoch": 2.951491053677932, + "grad_norm": 0.030959444120526314, + "learning_rate": 3.2277603178906826e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43605240, + "step": 22269 + }, + { + "epoch": 2.9516235917826377, + "grad_norm": 10.12108325958252, + "learning_rate": 3.210150149572655e-09, + "loss": 0.1279, + "num_input_tokens_seen": 43608064, + "step": 22270 + }, + { + "epoch": 2.9517561298873427, + "grad_norm": 0.0018814560025930405, + "learning_rate": 3.1925881210853606e-09, + "loss": 0.0, + "num_input_tokens_seen": 43609416, + "step": 22271 + }, + { + "epoch": 2.9518886679920477, + "grad_norm": 0.002194805070757866, + "learning_rate": 3.175074232767139e-09, + "loss": 0.0, + "num_input_tokens_seen": 43610984, + "step": 22272 + }, + { + "epoch": 2.952021206096753, + "grad_norm": 2.2957074642181396, + "learning_rate": 3.1576084849563315e-09, + "loss": 0.0123, + "num_input_tokens_seen": 43613936, + "step": 22273 + }, + { + "epoch": 2.952153744201458, + "grad_norm": 5.1449360847473145, + "learning_rate": 3.1401908779890576e-09, + "loss": 0.0935, + "num_input_tokens_seen": 43615304, + "step": 22274 + }, + { + "epoch": 2.952286282306163, + "grad_norm": 3.04215669631958, + "learning_rate": 3.122821412201438e-09, + "loss": 0.027, + "num_input_tokens_seen": 43617168, + "step": 22275 + }, + { + "epoch": 2.952418820410868, + "grad_norm": 0.05018129572272301, + "learning_rate": 3.1055000879284815e-09, + "loss": 0.0003, + "num_input_tokens_seen": 43619016, + "step": 22276 + }, + { + "epoch": 2.9525513585155734, + "grad_norm": 0.24577747285366058, + "learning_rate": 3.088226905504088e-09, + "loss": 0.0012, + "num_input_tokens_seen": 43620576, + "step": 22277 + }, + { + "epoch": 2.9526838966202784, + "grad_norm": 0.007668768987059593, + "learning_rate": 3.0710018652610472e-09, + "loss": 0.0, + "num_input_tokens_seen": 43622936, + "step": 22278 + }, + { + "epoch": 2.9528164347249835, + "grad_norm": 0.026538889855146408, + "learning_rate": 3.0538249675315935e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43625712, + "step": 22279 + }, + { + "epoch": 2.9529489728296885, + "grad_norm": 4.520501613616943, + "learning_rate": 3.03669621264685e-09, + "loss": 0.0218, + "num_input_tokens_seen": 43627824, + "step": 22280 + }, + { + "epoch": 2.9530815109343935, + "grad_norm": 8.250967979431152, + "learning_rate": 3.0196156009373865e-09, + "loss": 0.0393, + "num_input_tokens_seen": 43629976, + "step": 22281 + }, + { + "epoch": 2.953214049039099, + "grad_norm": 0.13710132241249084, + "learning_rate": 3.0025831327321063e-09, + "loss": 0.0003, + "num_input_tokens_seen": 43631288, + "step": 22282 + }, + { + "epoch": 2.9533465871438036, + "grad_norm": 4.798572540283203, + "learning_rate": 2.985598808359913e-09, + "loss": 0.0291, + "num_input_tokens_seen": 43633784, + "step": 22283 + }, + { + "epoch": 2.953479125248509, + "grad_norm": 0.04624618589878082, + "learning_rate": 2.968662628147767e-09, + "loss": 0.0003, + "num_input_tokens_seen": 43635528, + "step": 22284 + }, + { + "epoch": 2.953611663353214, + "grad_norm": 0.008210758678615093, + "learning_rate": 2.9517745924223517e-09, + "loss": 0.0, + "num_input_tokens_seen": 43637408, + "step": 22285 + }, + { + "epoch": 2.953744201457919, + "grad_norm": 0.01774030737578869, + "learning_rate": 2.9349347015095175e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43639840, + "step": 22286 + }, + { + "epoch": 2.953876739562624, + "grad_norm": 0.10867586731910706, + "learning_rate": 2.918142955733727e-09, + "loss": 0.0004, + "num_input_tokens_seen": 43642440, + "step": 22287 + }, + { + "epoch": 2.9540092776673292, + "grad_norm": 0.003712777979671955, + "learning_rate": 2.901399355418888e-09, + "loss": 0.0, + "num_input_tokens_seen": 43644424, + "step": 22288 + }, + { + "epoch": 2.9541418157720347, + "grad_norm": 0.008637025021016598, + "learning_rate": 2.88470390088752e-09, + "loss": 0.0, + "num_input_tokens_seen": 43645864, + "step": 22289 + }, + { + "epoch": 2.9542743538767393, + "grad_norm": 4.615153789520264, + "learning_rate": 2.8680565924618653e-09, + "loss": 0.0238, + "num_input_tokens_seen": 43648088, + "step": 22290 + }, + { + "epoch": 2.954406891981445, + "grad_norm": 0.05423276126384735, + "learning_rate": 2.8514574304627785e-09, + "loss": 0.0003, + "num_input_tokens_seen": 43649544, + "step": 22291 + }, + { + "epoch": 2.95453943008615, + "grad_norm": 0.005249144975095987, + "learning_rate": 2.8349064152102812e-09, + "loss": 0.0, + "num_input_tokens_seen": 43650736, + "step": 22292 + }, + { + "epoch": 2.954671968190855, + "grad_norm": 0.9359302520751953, + "learning_rate": 2.818403547023285e-09, + "loss": 0.0048, + "num_input_tokens_seen": 43652608, + "step": 22293 + }, + { + "epoch": 2.95480450629556, + "grad_norm": 4.693918704986572, + "learning_rate": 2.801948826220424e-09, + "loss": 0.0437, + "num_input_tokens_seen": 43654536, + "step": 22294 + }, + { + "epoch": 2.954937044400265, + "grad_norm": 0.018951239064335823, + "learning_rate": 2.785542253118667e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43656384, + "step": 22295 + }, + { + "epoch": 2.9550695825049704, + "grad_norm": 9.953413009643555, + "learning_rate": 2.76918382803415e-09, + "loss": 0.0716, + "num_input_tokens_seen": 43658232, + "step": 22296 + }, + { + "epoch": 2.955202120609675, + "grad_norm": 8.604491233825684, + "learning_rate": 2.7528735512830085e-09, + "loss": 0.0523, + "num_input_tokens_seen": 43659984, + "step": 22297 + }, + { + "epoch": 2.9553346587143805, + "grad_norm": 11.26625919342041, + "learning_rate": 2.736611423178881e-09, + "loss": 0.249, + "num_input_tokens_seen": 43661808, + "step": 22298 + }, + { + "epoch": 2.9554671968190855, + "grad_norm": 0.10009501874446869, + "learning_rate": 2.7203974440356827e-09, + "loss": 0.0003, + "num_input_tokens_seen": 43663512, + "step": 22299 + }, + { + "epoch": 2.9555997349237906, + "grad_norm": 3.3866069316864014, + "learning_rate": 2.7042316141662195e-09, + "loss": 0.0184, + "num_input_tokens_seen": 43666384, + "step": 22300 + }, + { + "epoch": 2.9557322730284956, + "grad_norm": 0.009596092626452446, + "learning_rate": 2.6881139338816307e-09, + "loss": 0.0, + "num_input_tokens_seen": 43668424, + "step": 22301 + }, + { + "epoch": 2.9558648111332007, + "grad_norm": 0.004632224328815937, + "learning_rate": 2.6720444034933347e-09, + "loss": 0.0, + "num_input_tokens_seen": 43669440, + "step": 22302 + }, + { + "epoch": 2.955997349237906, + "grad_norm": 4.62300968170166, + "learning_rate": 2.6560230233105277e-09, + "loss": 0.0389, + "num_input_tokens_seen": 43672344, + "step": 22303 + }, + { + "epoch": 2.9561298873426107, + "grad_norm": 18.987808227539062, + "learning_rate": 2.640049793642685e-09, + "loss": 0.0542, + "num_input_tokens_seen": 43675760, + "step": 22304 + }, + { + "epoch": 2.956262425447316, + "grad_norm": 13.165765762329102, + "learning_rate": 2.624124714797338e-09, + "loss": 0.0775, + "num_input_tokens_seen": 43678248, + "step": 22305 + }, + { + "epoch": 2.9563949635520212, + "grad_norm": 15.375874519348145, + "learning_rate": 2.608247787082019e-09, + "loss": 0.2101, + "num_input_tokens_seen": 43680072, + "step": 22306 + }, + { + "epoch": 2.9565275016567263, + "grad_norm": 7.167605400085449, + "learning_rate": 2.5924190108023164e-09, + "loss": 0.0641, + "num_input_tokens_seen": 43682152, + "step": 22307 + }, + { + "epoch": 2.9566600397614313, + "grad_norm": 0.015214335173368454, + "learning_rate": 2.576638386263541e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43683896, + "step": 22308 + }, + { + "epoch": 2.9567925778661364, + "grad_norm": 2.014601707458496, + "learning_rate": 2.560905913770173e-09, + "loss": 0.0166, + "num_input_tokens_seen": 43686896, + "step": 22309 + }, + { + "epoch": 2.956925115970842, + "grad_norm": 5.356799125671387, + "learning_rate": 2.5452215936255796e-09, + "loss": 0.0239, + "num_input_tokens_seen": 43689000, + "step": 22310 + }, + { + "epoch": 2.957057654075547, + "grad_norm": 0.09059500694274902, + "learning_rate": 2.5295854261317422e-09, + "loss": 0.0005, + "num_input_tokens_seen": 43691888, + "step": 22311 + }, + { + "epoch": 2.957190192180252, + "grad_norm": 3.72603178024292, + "learning_rate": 2.5139974115906407e-09, + "loss": 0.0411, + "num_input_tokens_seen": 43694016, + "step": 22312 + }, + { + "epoch": 2.957322730284957, + "grad_norm": 0.004890168085694313, + "learning_rate": 2.498457550302591e-09, + "loss": 0.0, + "num_input_tokens_seen": 43695776, + "step": 22313 + }, + { + "epoch": 2.957455268389662, + "grad_norm": 6.017292022705078, + "learning_rate": 2.4829658425670755e-09, + "loss": 0.0281, + "num_input_tokens_seen": 43697736, + "step": 22314 + }, + { + "epoch": 2.957587806494367, + "grad_norm": 0.03325015306472778, + "learning_rate": 2.467522288683022e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43699432, + "step": 22315 + }, + { + "epoch": 2.957720344599072, + "grad_norm": 3.026155948638916, + "learning_rate": 2.45212688894797e-09, + "loss": 0.0235, + "num_input_tokens_seen": 43701200, + "step": 22316 + }, + { + "epoch": 2.9578528827037776, + "grad_norm": 0.0852789431810379, + "learning_rate": 2.436779643659182e-09, + "loss": 0.0004, + "num_input_tokens_seen": 43703584, + "step": 22317 + }, + { + "epoch": 2.9579854208084826, + "grad_norm": 0.04867374151945114, + "learning_rate": 2.4214805531119767e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43705704, + "step": 22318 + }, + { + "epoch": 2.9581179589131876, + "grad_norm": 0.11292661726474762, + "learning_rate": 2.406229617601674e-09, + "loss": 0.0006, + "num_input_tokens_seen": 43707848, + "step": 22319 + }, + { + "epoch": 2.9582504970178927, + "grad_norm": 0.024184376001358032, + "learning_rate": 2.3910268374222055e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43709784, + "step": 22320 + }, + { + "epoch": 2.9583830351225977, + "grad_norm": 0.0027894002851098776, + "learning_rate": 2.3758722128669477e-09, + "loss": 0.0, + "num_input_tokens_seen": 43711184, + "step": 22321 + }, + { + "epoch": 2.9585155732273027, + "grad_norm": 0.0005841794773004949, + "learning_rate": 2.360765744227889e-09, + "loss": 0.0, + "num_input_tokens_seen": 43712456, + "step": 22322 + }, + { + "epoch": 2.958648111332008, + "grad_norm": 0.09099941700696945, + "learning_rate": 2.3457074317961847e-09, + "loss": 0.0005, + "num_input_tokens_seen": 43714544, + "step": 22323 + }, + { + "epoch": 2.9587806494367133, + "grad_norm": 0.9017846584320068, + "learning_rate": 2.3306972758621594e-09, + "loss": 0.0026, + "num_input_tokens_seen": 43716648, + "step": 22324 + }, + { + "epoch": 2.9589131875414183, + "grad_norm": 16.434083938598633, + "learning_rate": 2.3157352767155807e-09, + "loss": 0.0761, + "num_input_tokens_seen": 43718560, + "step": 22325 + }, + { + "epoch": 2.9590457256461233, + "grad_norm": 6.358641147613525, + "learning_rate": 2.300821434644551e-09, + "loss": 0.0549, + "num_input_tokens_seen": 43720544, + "step": 22326 + }, + { + "epoch": 2.9591782637508284, + "grad_norm": 0.013488908298313618, + "learning_rate": 2.2859557499368966e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43722256, + "step": 22327 + }, + { + "epoch": 2.9593108018555334, + "grad_norm": 0.0003470120718702674, + "learning_rate": 2.2711382228790547e-09, + "loss": 0.0, + "num_input_tokens_seen": 43723568, + "step": 22328 + }, + { + "epoch": 2.9594433399602385, + "grad_norm": 9.97823715209961, + "learning_rate": 2.2563688537566298e-09, + "loss": 0.027, + "num_input_tokens_seen": 43725224, + "step": 22329 + }, + { + "epoch": 2.9595758780649435, + "grad_norm": 12.326144218444824, + "learning_rate": 2.2416476428546717e-09, + "loss": 0.0796, + "num_input_tokens_seen": 43727288, + "step": 22330 + }, + { + "epoch": 2.959708416169649, + "grad_norm": 8.557718276977539, + "learning_rate": 2.226974590456843e-09, + "loss": 0.1543, + "num_input_tokens_seen": 43729784, + "step": 22331 + }, + { + "epoch": 2.959840954274354, + "grad_norm": 0.7549514770507812, + "learning_rate": 2.2123496968459723e-09, + "loss": 0.0043, + "num_input_tokens_seen": 43732112, + "step": 22332 + }, + { + "epoch": 2.959973492379059, + "grad_norm": 4.409550189971924, + "learning_rate": 2.1977729623040565e-09, + "loss": 0.0172, + "num_input_tokens_seen": 43733752, + "step": 22333 + }, + { + "epoch": 2.960106030483764, + "grad_norm": 1.5269321203231812, + "learning_rate": 2.1832443871125374e-09, + "loss": 0.0066, + "num_input_tokens_seen": 43736048, + "step": 22334 + }, + { + "epoch": 2.960238568588469, + "grad_norm": 0.010068727657198906, + "learning_rate": 2.1687639715509136e-09, + "loss": 0.0, + "num_input_tokens_seen": 43737608, + "step": 22335 + }, + { + "epoch": 2.960371106693174, + "grad_norm": 5.898979663848877, + "learning_rate": 2.154331715898683e-09, + "loss": 0.0427, + "num_input_tokens_seen": 43739568, + "step": 22336 + }, + { + "epoch": 2.960503644797879, + "grad_norm": 0.0009231868316419423, + "learning_rate": 2.1399476204345126e-09, + "loss": 0.0, + "num_input_tokens_seen": 43740744, + "step": 22337 + }, + { + "epoch": 2.9606361829025847, + "grad_norm": 0.0025715793017297983, + "learning_rate": 2.1256116854348472e-09, + "loss": 0.0, + "num_input_tokens_seen": 43742320, + "step": 22338 + }, + { + "epoch": 2.9607687210072897, + "grad_norm": 8.020583152770996, + "learning_rate": 2.1113239111769655e-09, + "loss": 0.1574, + "num_input_tokens_seen": 43744944, + "step": 22339 + }, + { + "epoch": 2.9609012591119948, + "grad_norm": 0.0890028178691864, + "learning_rate": 2.0970842979356476e-09, + "loss": 0.0004, + "num_input_tokens_seen": 43747768, + "step": 22340 + }, + { + "epoch": 2.9610337972167, + "grad_norm": 12.441834449768066, + "learning_rate": 2.082892845985951e-09, + "loss": 0.1038, + "num_input_tokens_seen": 43750176, + "step": 22341 + }, + { + "epoch": 2.961166335321405, + "grad_norm": 21.394775390625, + "learning_rate": 2.068749555601268e-09, + "loss": 0.1581, + "num_input_tokens_seen": 43752368, + "step": 22342 + }, + { + "epoch": 2.96129887342611, + "grad_norm": 0.08104350417852402, + "learning_rate": 2.0546544270544365e-09, + "loss": 0.0003, + "num_input_tokens_seen": 43753536, + "step": 22343 + }, + { + "epoch": 2.961431411530815, + "grad_norm": 0.021069826558232307, + "learning_rate": 2.040607460616906e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43754912, + "step": 22344 + }, + { + "epoch": 2.9615639496355204, + "grad_norm": 0.0027820041868835688, + "learning_rate": 2.0266086565598474e-09, + "loss": 0.0, + "num_input_tokens_seen": 43757088, + "step": 22345 + }, + { + "epoch": 2.9616964877402254, + "grad_norm": 0.015443779528141022, + "learning_rate": 2.012658015153046e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43758616, + "step": 22346 + }, + { + "epoch": 2.9618290258449305, + "grad_norm": 0.001395764178596437, + "learning_rate": 1.998755536665453e-09, + "loss": 0.0, + "num_input_tokens_seen": 43760328, + "step": 22347 + }, + { + "epoch": 2.9619615639496355, + "grad_norm": 8.027894020080566, + "learning_rate": 1.9849012213649097e-09, + "loss": 0.0304, + "num_input_tokens_seen": 43762528, + "step": 22348 + }, + { + "epoch": 2.9620941020543405, + "grad_norm": 0.005758329294621944, + "learning_rate": 1.97109506951898e-09, + "loss": 0.0, + "num_input_tokens_seen": 43764144, + "step": 22349 + }, + { + "epoch": 2.9622266401590456, + "grad_norm": 7.215410232543945, + "learning_rate": 1.9573370813935623e-09, + "loss": 0.0195, + "num_input_tokens_seen": 43766400, + "step": 22350 + }, + { + "epoch": 2.9623591782637506, + "grad_norm": 3.4682328701019287, + "learning_rate": 1.943627257254277e-09, + "loss": 0.0299, + "num_input_tokens_seen": 43768440, + "step": 22351 + }, + { + "epoch": 2.962491716368456, + "grad_norm": 0.0022721586283296347, + "learning_rate": 1.9299655973648023e-09, + "loss": 0.0, + "num_input_tokens_seen": 43771040, + "step": 22352 + }, + { + "epoch": 2.962624254473161, + "grad_norm": 3.0962419509887695, + "learning_rate": 1.916352101989094e-09, + "loss": 0.0175, + "num_input_tokens_seen": 43773704, + "step": 22353 + }, + { + "epoch": 2.962756792577866, + "grad_norm": 0.007636042311787605, + "learning_rate": 1.9027867713894422e-09, + "loss": 0.0, + "num_input_tokens_seen": 43775384, + "step": 22354 + }, + { + "epoch": 2.962889330682571, + "grad_norm": 7.977814197540283, + "learning_rate": 1.889269605827304e-09, + "loss": 0.0983, + "num_input_tokens_seen": 43777928, + "step": 22355 + }, + { + "epoch": 2.9630218687872762, + "grad_norm": 0.032154861837625504, + "learning_rate": 1.8758006055635823e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43780304, + "step": 22356 + }, + { + "epoch": 2.9631544068919813, + "grad_norm": 0.03983138129115105, + "learning_rate": 1.8623797708577918e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43781864, + "step": 22357 + }, + { + "epoch": 2.9632869449966863, + "grad_norm": 0.005701002664864063, + "learning_rate": 1.849007101968614e-09, + "loss": 0.0, + "num_input_tokens_seen": 43783296, + "step": 22358 + }, + { + "epoch": 2.963419483101392, + "grad_norm": 1.1917616128921509, + "learning_rate": 1.8356825991538985e-09, + "loss": 0.0043, + "num_input_tokens_seen": 43785504, + "step": 22359 + }, + { + "epoch": 2.963552021206097, + "grad_norm": 8.105329513549805, + "learning_rate": 1.8224062626706618e-09, + "loss": 0.102, + "num_input_tokens_seen": 43787680, + "step": 22360 + }, + { + "epoch": 2.963684559310802, + "grad_norm": 0.19337278604507446, + "learning_rate": 1.8091780927750879e-09, + "loss": 0.0007, + "num_input_tokens_seen": 43789504, + "step": 22361 + }, + { + "epoch": 2.963817097415507, + "grad_norm": 2.403597116470337, + "learning_rate": 1.7959980897216955e-09, + "loss": 0.0112, + "num_input_tokens_seen": 43791336, + "step": 22362 + }, + { + "epoch": 2.963949635520212, + "grad_norm": 4.3618245124816895, + "learning_rate": 1.7828662537650032e-09, + "loss": 0.0126, + "num_input_tokens_seen": 43792976, + "step": 22363 + }, + { + "epoch": 2.9640821736249174, + "grad_norm": 0.025620654225349426, + "learning_rate": 1.7697825851578642e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43795952, + "step": 22364 + }, + { + "epoch": 2.964214711729622, + "grad_norm": 0.010524922050535679, + "learning_rate": 1.7567470841528545e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43797608, + "step": 22365 + }, + { + "epoch": 2.9643472498343275, + "grad_norm": 0.4931896924972534, + "learning_rate": 1.7437597510014391e-09, + "loss": 0.0014, + "num_input_tokens_seen": 43800080, + "step": 22366 + }, + { + "epoch": 2.9644797879390326, + "grad_norm": 0.008791539818048477, + "learning_rate": 1.7308205859536965e-09, + "loss": 0.0, + "num_input_tokens_seen": 43802880, + "step": 22367 + }, + { + "epoch": 2.9646123260437376, + "grad_norm": 0.01890513114631176, + "learning_rate": 1.717929589258871e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43804032, + "step": 22368 + }, + { + "epoch": 2.9647448641484426, + "grad_norm": 8.224040031433105, + "learning_rate": 1.705086761166208e-09, + "loss": 0.0619, + "num_input_tokens_seen": 43805592, + "step": 22369 + }, + { + "epoch": 2.9648774022531477, + "grad_norm": 0.0042699105106294155, + "learning_rate": 1.6922921019227323e-09, + "loss": 0.0, + "num_input_tokens_seen": 43807192, + "step": 22370 + }, + { + "epoch": 2.965009940357853, + "grad_norm": 0.0141218900680542, + "learning_rate": 1.6795456117757458e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43808696, + "step": 22371 + }, + { + "epoch": 2.9651424784625577, + "grad_norm": 9.345793724060059, + "learning_rate": 1.6668472909703304e-09, + "loss": 0.0384, + "num_input_tokens_seen": 43811288, + "step": 22372 + }, + { + "epoch": 2.9652750165672632, + "grad_norm": 7.4340081214904785, + "learning_rate": 1.6541971397515677e-09, + "loss": 0.0337, + "num_input_tokens_seen": 43813256, + "step": 22373 + }, + { + "epoch": 2.9654075546719683, + "grad_norm": 1.4314326047897339, + "learning_rate": 1.6415951583634293e-09, + "loss": 0.0213, + "num_input_tokens_seen": 43814656, + "step": 22374 + }, + { + "epoch": 2.9655400927766733, + "grad_norm": 1.2346673011779785, + "learning_rate": 1.6290413470487765e-09, + "loss": 0.0059, + "num_input_tokens_seen": 43816456, + "step": 22375 + }, + { + "epoch": 2.9656726308813783, + "grad_norm": 8.996658325195312, + "learning_rate": 1.6165357060499154e-09, + "loss": 0.152, + "num_input_tokens_seen": 43818984, + "step": 22376 + }, + { + "epoch": 2.9658051689860834, + "grad_norm": 3.562676429748535, + "learning_rate": 1.6040782356077643e-09, + "loss": 0.0384, + "num_input_tokens_seen": 43821472, + "step": 22377 + }, + { + "epoch": 2.965937707090789, + "grad_norm": 8.882349014282227, + "learning_rate": 1.591668935962687e-09, + "loss": 0.0211, + "num_input_tokens_seen": 43823736, + "step": 22378 + }, + { + "epoch": 2.9660702451954934, + "grad_norm": 17.305540084838867, + "learning_rate": 1.5793078073533808e-09, + "loss": 0.0941, + "num_input_tokens_seen": 43825648, + "step": 22379 + }, + { + "epoch": 2.966202783300199, + "grad_norm": 0.07756513357162476, + "learning_rate": 1.5669948500188215e-09, + "loss": 0.0004, + "num_input_tokens_seen": 43827624, + "step": 22380 + }, + { + "epoch": 2.966335321404904, + "grad_norm": 11.90113353729248, + "learning_rate": 1.5547300641963192e-09, + "loss": 0.2061, + "num_input_tokens_seen": 43830744, + "step": 22381 + }, + { + "epoch": 2.966467859509609, + "grad_norm": 0.01230569463223219, + "learning_rate": 1.542513450122074e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43833536, + "step": 22382 + }, + { + "epoch": 2.966600397614314, + "grad_norm": 1.1517153978347778, + "learning_rate": 1.5303450080320082e-09, + "loss": 0.0031, + "num_input_tokens_seen": 43835136, + "step": 22383 + }, + { + "epoch": 2.966732935719019, + "grad_norm": 1.1581480503082275, + "learning_rate": 1.5182247381601011e-09, + "loss": 0.0101, + "num_input_tokens_seen": 43837672, + "step": 22384 + }, + { + "epoch": 2.9668654738237246, + "grad_norm": 3.0572574138641357, + "learning_rate": 1.5061526407406102e-09, + "loss": 0.0364, + "num_input_tokens_seen": 43839368, + "step": 22385 + }, + { + "epoch": 2.966998011928429, + "grad_norm": 0.01920732669532299, + "learning_rate": 1.494128716006127e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43841112, + "step": 22386 + }, + { + "epoch": 2.9671305500331346, + "grad_norm": 0.0026114489883184433, + "learning_rate": 1.4821529641884103e-09, + "loss": 0.0, + "num_input_tokens_seen": 43843904, + "step": 22387 + }, + { + "epoch": 2.9672630881378397, + "grad_norm": 6.04542350769043, + "learning_rate": 1.4702253855186644e-09, + "loss": 0.1423, + "num_input_tokens_seen": 43845392, + "step": 22388 + }, + { + "epoch": 2.9673956262425447, + "grad_norm": 0.014861948788166046, + "learning_rate": 1.4583459802264278e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43846792, + "step": 22389 + }, + { + "epoch": 2.9675281643472498, + "grad_norm": 5.665372371673584, + "learning_rate": 1.4465147485406839e-09, + "loss": 0.0361, + "num_input_tokens_seen": 43848232, + "step": 22390 + }, + { + "epoch": 2.967660702451955, + "grad_norm": 0.017988082021474838, + "learning_rate": 1.4347316906901387e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43849744, + "step": 22391 + }, + { + "epoch": 2.9677932405566603, + "grad_norm": 0.26280468702316284, + "learning_rate": 1.4229968069015553e-09, + "loss": 0.0008, + "num_input_tokens_seen": 43852096, + "step": 22392 + }, + { + "epoch": 2.967925778661365, + "grad_norm": 14.28861141204834, + "learning_rate": 1.4113100974014193e-09, + "loss": 0.0911, + "num_input_tokens_seen": 43854728, + "step": 22393 + }, + { + "epoch": 2.9680583167660703, + "grad_norm": 5.226220607757568, + "learning_rate": 1.3996715624148282e-09, + "loss": 0.0997, + "num_input_tokens_seen": 43857104, + "step": 22394 + }, + { + "epoch": 2.9681908548707754, + "grad_norm": 0.05354820564389229, + "learning_rate": 1.3880812021660473e-09, + "loss": 0.0003, + "num_input_tokens_seen": 43858864, + "step": 22395 + }, + { + "epoch": 2.9683233929754804, + "grad_norm": 14.342785835266113, + "learning_rate": 1.3765390168790637e-09, + "loss": 0.1293, + "num_input_tokens_seen": 43861424, + "step": 22396 + }, + { + "epoch": 2.9684559310801855, + "grad_norm": 0.04000642150640488, + "learning_rate": 1.3650450067759224e-09, + "loss": 0.0002, + "num_input_tokens_seen": 43863568, + "step": 22397 + }, + { + "epoch": 2.9685884691848905, + "grad_norm": 0.0033949394710361958, + "learning_rate": 1.3535991720783903e-09, + "loss": 0.0, + "num_input_tokens_seen": 43865104, + "step": 22398 + }, + { + "epoch": 2.968721007289596, + "grad_norm": 0.28686603903770447, + "learning_rate": 1.3422015130074017e-09, + "loss": 0.0006, + "num_input_tokens_seen": 43867616, + "step": 22399 + }, + { + "epoch": 2.968853545394301, + "grad_norm": 0.5833792090415955, + "learning_rate": 1.3308520297822258e-09, + "loss": 0.0022, + "num_input_tokens_seen": 43869520, + "step": 22400 + }, + { + "epoch": 2.968986083499006, + "grad_norm": 7.897106647491455, + "learning_rate": 1.3195507226221315e-09, + "loss": 0.1092, + "num_input_tokens_seen": 43871736, + "step": 22401 + }, + { + "epoch": 2.969118621603711, + "grad_norm": 5.990362644195557, + "learning_rate": 1.3082975917450003e-09, + "loss": 0.0722, + "num_input_tokens_seen": 43874032, + "step": 22402 + }, + { + "epoch": 2.969251159708416, + "grad_norm": 3.174128770828247, + "learning_rate": 1.2970926373676028e-09, + "loss": 0.0058, + "num_input_tokens_seen": 43875632, + "step": 22403 + }, + { + "epoch": 2.969383697813121, + "grad_norm": 0.0164306852966547, + "learning_rate": 1.285935859705878e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43877552, + "step": 22404 + }, + { + "epoch": 2.969516235917826, + "grad_norm": 0.08274020999670029, + "learning_rate": 1.2748272589749311e-09, + "loss": 0.0004, + "num_input_tokens_seen": 43880000, + "step": 22405 + }, + { + "epoch": 2.9696487740225317, + "grad_norm": 4.216237545013428, + "learning_rate": 1.263766835389313e-09, + "loss": 0.0159, + "num_input_tokens_seen": 43881992, + "step": 22406 + }, + { + "epoch": 2.9697813121272367, + "grad_norm": 0.00994815956801176, + "learning_rate": 1.2527545891621862e-09, + "loss": 0.0, + "num_input_tokens_seen": 43883624, + "step": 22407 + }, + { + "epoch": 2.9699138502319418, + "grad_norm": 0.5836260914802551, + "learning_rate": 1.2417905205056036e-09, + "loss": 0.0019, + "num_input_tokens_seen": 43885576, + "step": 22408 + }, + { + "epoch": 2.970046388336647, + "grad_norm": 0.012959951534867287, + "learning_rate": 1.2308746296310626e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43887304, + "step": 22409 + }, + { + "epoch": 2.970178926441352, + "grad_norm": 0.002590167336165905, + "learning_rate": 1.220006916749228e-09, + "loss": 0.0, + "num_input_tokens_seen": 43889432, + "step": 22410 + }, + { + "epoch": 2.970311464546057, + "grad_norm": 0.008520927280187607, + "learning_rate": 1.2091873820693767e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43891000, + "step": 22411 + }, + { + "epoch": 2.970444002650762, + "grad_norm": 8.114709854125977, + "learning_rate": 1.198416025800231e-09, + "loss": 0.0892, + "num_input_tokens_seen": 43892912, + "step": 22412 + }, + { + "epoch": 2.9705765407554674, + "grad_norm": 0.005912213586270809, + "learning_rate": 1.1876928481494021e-09, + "loss": 0.0, + "num_input_tokens_seen": 43894584, + "step": 22413 + }, + { + "epoch": 2.9707090788601724, + "grad_norm": 0.3754428029060364, + "learning_rate": 1.1770178493236695e-09, + "loss": 0.0021, + "num_input_tokens_seen": 43896224, + "step": 22414 + }, + { + "epoch": 2.9708416169648775, + "grad_norm": 0.5932297706604004, + "learning_rate": 1.1663910295289793e-09, + "loss": 0.0021, + "num_input_tokens_seen": 43898016, + "step": 22415 + }, + { + "epoch": 2.9709741550695825, + "grad_norm": 4.471325874328613, + "learning_rate": 1.1558123889701679e-09, + "loss": 0.0255, + "num_input_tokens_seen": 43899608, + "step": 22416 + }, + { + "epoch": 2.9711066931742875, + "grad_norm": 0.011148176155984402, + "learning_rate": 1.145281927850961e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43901712, + "step": 22417 + }, + { + "epoch": 2.9712392312789926, + "grad_norm": 0.0071175298653542995, + "learning_rate": 1.1347996463748069e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43903440, + "step": 22418 + }, + { + "epoch": 2.9713717693836976, + "grad_norm": 2.276231527328491, + "learning_rate": 1.1243655447434886e-09, + "loss": 0.0064, + "num_input_tokens_seen": 43905992, + "step": 22419 + }, + { + "epoch": 2.971504307488403, + "grad_norm": 13.152486801147461, + "learning_rate": 1.1139796231582346e-09, + "loss": 0.1398, + "num_input_tokens_seen": 43908416, + "step": 22420 + }, + { + "epoch": 2.971636845593108, + "grad_norm": 0.008481040596961975, + "learning_rate": 1.1036418818194394e-09, + "loss": 0.0, + "num_input_tokens_seen": 43910544, + "step": 22421 + }, + { + "epoch": 2.971769383697813, + "grad_norm": 0.011056071147322655, + "learning_rate": 1.0933523209263885e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43912336, + "step": 22422 + }, + { + "epoch": 2.971901921802518, + "grad_norm": 2.5400233268737793, + "learning_rate": 1.0831109406772567e-09, + "loss": 0.0241, + "num_input_tokens_seen": 43914016, + "step": 22423 + }, + { + "epoch": 2.9720344599072233, + "grad_norm": 0.032160475850105286, + "learning_rate": 1.0729177412696634e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43917192, + "step": 22424 + }, + { + "epoch": 2.9721669980119283, + "grad_norm": 4.5012102127075195, + "learning_rate": 1.0627727229003959e-09, + "loss": 0.108, + "num_input_tokens_seen": 43919968, + "step": 22425 + }, + { + "epoch": 2.9722995361166333, + "grad_norm": 6.585148811340332, + "learning_rate": 1.0526758857645758e-09, + "loss": 0.111, + "num_input_tokens_seen": 43922920, + "step": 22426 + }, + { + "epoch": 2.972432074221339, + "grad_norm": 5.541047096252441, + "learning_rate": 1.0426272300570471e-09, + "loss": 0.0991, + "num_input_tokens_seen": 43924928, + "step": 22427 + }, + { + "epoch": 2.972564612326044, + "grad_norm": 12.602825164794922, + "learning_rate": 1.0326267559718217e-09, + "loss": 0.0647, + "num_input_tokens_seen": 43927560, + "step": 22428 + }, + { + "epoch": 2.972697150430749, + "grad_norm": 9.28304386138916, + "learning_rate": 1.0226744637012453e-09, + "loss": 0.1327, + "num_input_tokens_seen": 43929600, + "step": 22429 + }, + { + "epoch": 2.972829688535454, + "grad_norm": 0.011068164370954037, + "learning_rate": 1.0127703534376643e-09, + "loss": 0.0001, + "num_input_tokens_seen": 43931776, + "step": 22430 + }, + { + "epoch": 2.972962226640159, + "grad_norm": 0.05678822472691536, + "learning_rate": 1.0029144253717593e-09, + "loss": 0.0006, + "num_input_tokens_seen": 43933528, + "step": 22431 + }, + { + "epoch": 2.973094764744864, + "grad_norm": 11.85108757019043, + "learning_rate": 9.931066796936561e-10, + "loss": 0.0431, + "num_input_tokens_seen": 43934928, + "step": 22432 + }, + { + "epoch": 2.973227302849569, + "grad_norm": 25.780689239501953, + "learning_rate": 9.833471165923702e-10, + "loss": 0.2594, + "num_input_tokens_seen": 43937064, + "step": 22433 + }, + { + "epoch": 2.9733598409542745, + "grad_norm": 1.2261377573013306, + "learning_rate": 9.736357362560844e-10, + "loss": 0.0051, + "num_input_tokens_seen": 43938656, + "step": 22434 + }, + { + "epoch": 2.9734923790589796, + "grad_norm": 11.480022430419922, + "learning_rate": 9.639725388721488e-10, + "loss": 0.0515, + "num_input_tokens_seen": 43940976, + "step": 22435 + }, + { + "epoch": 2.9736249171636846, + "grad_norm": 9.060023307800293, + "learning_rate": 9.54357524627081e-10, + "loss": 0.1407, + "num_input_tokens_seen": 43942992, + "step": 22436 + }, + { + "epoch": 2.9737574552683896, + "grad_norm": 10.538068771362305, + "learning_rate": 9.447906937057327e-10, + "loss": 0.1695, + "num_input_tokens_seen": 43946656, + "step": 22437 + }, + { + "epoch": 2.9738899933730947, + "grad_norm": 1.1125993728637695, + "learning_rate": 9.35272046292679e-10, + "loss": 0.0046, + "num_input_tokens_seen": 43948272, + "step": 22438 + }, + { + "epoch": 2.9740225314777997, + "grad_norm": 0.006630345713347197, + "learning_rate": 9.258015825719391e-10, + "loss": 0.0, + "num_input_tokens_seen": 43949776, + "step": 22439 + }, + { + "epoch": 2.9741550695825048, + "grad_norm": 5.808896064758301, + "learning_rate": 9.163793027255896e-10, + "loss": 0.0596, + "num_input_tokens_seen": 43951544, + "step": 22440 + }, + { + "epoch": 2.9742876076872102, + "grad_norm": 5.979946136474609, + "learning_rate": 9.070052069354296e-10, + "loss": 0.0613, + "num_input_tokens_seen": 43953264, + "step": 22441 + }, + { + "epoch": 2.9744201457919153, + "grad_norm": 0.09204027056694031, + "learning_rate": 8.976792953824254e-10, + "loss": 0.0003, + "num_input_tokens_seen": 43954656, + "step": 22442 + }, + { + "epoch": 2.9745526838966203, + "grad_norm": 15.386218070983887, + "learning_rate": 8.884015682461555e-10, + "loss": 0.1559, + "num_input_tokens_seen": 43957136, + "step": 22443 + }, + { + "epoch": 2.9746852220013253, + "grad_norm": 1.254500150680542, + "learning_rate": 8.79172025705366e-10, + "loss": 0.0074, + "num_input_tokens_seen": 43958768, + "step": 22444 + }, + { + "epoch": 2.9748177601060304, + "grad_norm": 5.828731536865234, + "learning_rate": 8.699906679382475e-10, + "loss": 0.0635, + "num_input_tokens_seen": 43960568, + "step": 22445 + }, + { + "epoch": 2.9749502982107354, + "grad_norm": 0.5708998441696167, + "learning_rate": 8.608574951218807e-10, + "loss": 0.002, + "num_input_tokens_seen": 43963160, + "step": 22446 + }, + { + "epoch": 2.9750828363154405, + "grad_norm": 11.24099349975586, + "learning_rate": 8.517725074322358e-10, + "loss": 0.2347, + "num_input_tokens_seen": 43964840, + "step": 22447 + }, + { + "epoch": 2.975215374420146, + "grad_norm": 0.007305128499865532, + "learning_rate": 8.427357050444506e-10, + "loss": 0.0, + "num_input_tokens_seen": 43966160, + "step": 22448 + }, + { + "epoch": 2.975347912524851, + "grad_norm": 0.38213807344436646, + "learning_rate": 8.337470881328302e-10, + "loss": 0.0012, + "num_input_tokens_seen": 43967760, + "step": 22449 + }, + { + "epoch": 2.975480450629556, + "grad_norm": 12.351956367492676, + "learning_rate": 8.248066568708469e-10, + "loss": 0.2888, + "num_input_tokens_seen": 43969400, + "step": 22450 + }, + { + "epoch": 2.975612988734261, + "grad_norm": 4.127171516418457, + "learning_rate": 8.159144114303075e-10, + "loss": 0.019, + "num_input_tokens_seen": 43972136, + "step": 22451 + }, + { + "epoch": 2.975745526838966, + "grad_norm": 0.01602078601717949, + "learning_rate": 8.070703519832967e-10, + "loss": 0.0001, + "num_input_tokens_seen": 43973480, + "step": 22452 + }, + { + "epoch": 2.9758780649436716, + "grad_norm": 4.002138614654541, + "learning_rate": 7.982744786999563e-10, + "loss": 0.0436, + "num_input_tokens_seen": 43975848, + "step": 22453 + }, + { + "epoch": 2.976010603048376, + "grad_norm": 4.734738349914551, + "learning_rate": 7.895267917501503e-10, + "loss": 0.0334, + "num_input_tokens_seen": 43977696, + "step": 22454 + }, + { + "epoch": 2.9761431411530817, + "grad_norm": 4.16130256652832, + "learning_rate": 7.808272913020776e-10, + "loss": 0.0158, + "num_input_tokens_seen": 43979456, + "step": 22455 + }, + { + "epoch": 2.9762756792577867, + "grad_norm": 0.09407790750265121, + "learning_rate": 7.721759775239368e-10, + "loss": 0.0003, + "num_input_tokens_seen": 43982024, + "step": 22456 + }, + { + "epoch": 2.9764082173624917, + "grad_norm": 0.002813558094203472, + "learning_rate": 7.63572850582539e-10, + "loss": 0.0, + "num_input_tokens_seen": 43983552, + "step": 22457 + }, + { + "epoch": 2.9765407554671968, + "grad_norm": 0.01731523871421814, + "learning_rate": 7.550179106433075e-10, + "loss": 0.0001, + "num_input_tokens_seen": 43985136, + "step": 22458 + }, + { + "epoch": 2.976673293571902, + "grad_norm": 4.934605121612549, + "learning_rate": 7.465111578716655e-10, + "loss": 0.0429, + "num_input_tokens_seen": 43987672, + "step": 22459 + }, + { + "epoch": 2.9768058316766073, + "grad_norm": 1.0088497400283813, + "learning_rate": 7.380525924313709e-10, + "loss": 0.0062, + "num_input_tokens_seen": 43990008, + "step": 22460 + }, + { + "epoch": 2.976938369781312, + "grad_norm": 13.008758544921875, + "learning_rate": 7.296422144856263e-10, + "loss": 0.1086, + "num_input_tokens_seen": 43992056, + "step": 22461 + }, + { + "epoch": 2.9770709078860174, + "grad_norm": 0.08129282295703888, + "learning_rate": 7.212800241965245e-10, + "loss": 0.0002, + "num_input_tokens_seen": 43993640, + "step": 22462 + }, + { + "epoch": 2.9772034459907224, + "grad_norm": 0.557762086391449, + "learning_rate": 7.129660217253254e-10, + "loss": 0.0018, + "num_input_tokens_seen": 43996384, + "step": 22463 + }, + { + "epoch": 2.9773359840954274, + "grad_norm": 4.946264743804932, + "learning_rate": 7.047002072321785e-10, + "loss": 0.0773, + "num_input_tokens_seen": 43999336, + "step": 22464 + }, + { + "epoch": 2.9774685222001325, + "grad_norm": 0.06037917733192444, + "learning_rate": 6.964825808766785e-10, + "loss": 0.0002, + "num_input_tokens_seen": 44001816, + "step": 22465 + }, + { + "epoch": 2.9776010603048375, + "grad_norm": 0.40494289994239807, + "learning_rate": 6.883131428173095e-10, + "loss": 0.0013, + "num_input_tokens_seen": 44003464, + "step": 22466 + }, + { + "epoch": 2.977733598409543, + "grad_norm": 0.004346027504652739, + "learning_rate": 6.801918932111684e-10, + "loss": 0.0, + "num_input_tokens_seen": 44005768, + "step": 22467 + }, + { + "epoch": 2.9778661365142476, + "grad_norm": 0.001597817288711667, + "learning_rate": 6.721188322153516e-10, + "loss": 0.0, + "num_input_tokens_seen": 44007080, + "step": 22468 + }, + { + "epoch": 2.977998674618953, + "grad_norm": 10.53309154510498, + "learning_rate": 6.640939599852902e-10, + "loss": 0.248, + "num_input_tokens_seen": 44009144, + "step": 22469 + }, + { + "epoch": 2.978131212723658, + "grad_norm": 0.00469657639041543, + "learning_rate": 6.56117276675583e-10, + "loss": 0.0, + "num_input_tokens_seen": 44010720, + "step": 22470 + }, + { + "epoch": 2.978263750828363, + "grad_norm": 0.07269161194562912, + "learning_rate": 6.481887824399957e-10, + "loss": 0.0003, + "num_input_tokens_seen": 44012648, + "step": 22471 + }, + { + "epoch": 2.978396288933068, + "grad_norm": 4.372887134552002, + "learning_rate": 6.403084774317392e-10, + "loss": 0.0262, + "num_input_tokens_seen": 44014176, + "step": 22472 + }, + { + "epoch": 2.978528827037773, + "grad_norm": 10.702216148376465, + "learning_rate": 6.324763618026363e-10, + "loss": 0.0834, + "num_input_tokens_seen": 44015704, + "step": 22473 + }, + { + "epoch": 2.9786613651424787, + "grad_norm": 0.007718060631304979, + "learning_rate": 6.246924357034001e-10, + "loss": 0.0, + "num_input_tokens_seen": 44017280, + "step": 22474 + }, + { + "epoch": 2.9787939032471833, + "grad_norm": 0.2517492175102234, + "learning_rate": 6.169566992844656e-10, + "loss": 0.0005, + "num_input_tokens_seen": 44019264, + "step": 22475 + }, + { + "epoch": 2.9789264413518888, + "grad_norm": 0.33524131774902344, + "learning_rate": 6.092691526948801e-10, + "loss": 0.0006, + "num_input_tokens_seen": 44021336, + "step": 22476 + }, + { + "epoch": 2.979058979456594, + "grad_norm": 0.09051000326871872, + "learning_rate": 6.016297960828588e-10, + "loss": 0.0004, + "num_input_tokens_seen": 44023696, + "step": 22477 + }, + { + "epoch": 2.979191517561299, + "grad_norm": 0.5951526165008545, + "learning_rate": 5.940386295955058e-10, + "loss": 0.004, + "num_input_tokens_seen": 44025456, + "step": 22478 + }, + { + "epoch": 2.979324055666004, + "grad_norm": 3.501281976699829, + "learning_rate": 5.864956533793709e-10, + "loss": 0.0158, + "num_input_tokens_seen": 44027192, + "step": 22479 + }, + { + "epoch": 2.979456593770709, + "grad_norm": 6.802612781524658, + "learning_rate": 5.790008675798931e-10, + "loss": 0.0358, + "num_input_tokens_seen": 44029272, + "step": 22480 + }, + { + "epoch": 2.9795891318754144, + "grad_norm": 0.10742016136646271, + "learning_rate": 5.715542723414014e-10, + "loss": 0.0005, + "num_input_tokens_seen": 44030920, + "step": 22481 + }, + { + "epoch": 2.9797216699801194, + "grad_norm": 8.273235321044922, + "learning_rate": 5.641558678079473e-10, + "loss": 0.0778, + "num_input_tokens_seen": 44033720, + "step": 22482 + }, + { + "epoch": 2.9798542080848245, + "grad_norm": 0.005858945660293102, + "learning_rate": 5.568056541213618e-10, + "loss": 0.0, + "num_input_tokens_seen": 44035200, + "step": 22483 + }, + { + "epoch": 2.9799867461895295, + "grad_norm": 0.23889635503292084, + "learning_rate": 5.495036314243085e-10, + "loss": 0.0007, + "num_input_tokens_seen": 44037048, + "step": 22484 + }, + { + "epoch": 2.9801192842942346, + "grad_norm": 1.9455432891845703, + "learning_rate": 5.422497998569532e-10, + "loss": 0.0062, + "num_input_tokens_seen": 44038696, + "step": 22485 + }, + { + "epoch": 2.9802518223989396, + "grad_norm": 0.01828172616660595, + "learning_rate": 5.350441595591837e-10, + "loss": 0.0001, + "num_input_tokens_seen": 44040680, + "step": 22486 + }, + { + "epoch": 2.9803843605036446, + "grad_norm": 0.0012640360509976745, + "learning_rate": 5.278867106700558e-10, + "loss": 0.0, + "num_input_tokens_seen": 44042072, + "step": 22487 + }, + { + "epoch": 2.98051689860835, + "grad_norm": 0.5344723463058472, + "learning_rate": 5.20777453327792e-10, + "loss": 0.0019, + "num_input_tokens_seen": 44043920, + "step": 22488 + }, + { + "epoch": 2.980649436713055, + "grad_norm": 14.965801239013672, + "learning_rate": 5.137163876689499e-10, + "loss": 0.3176, + "num_input_tokens_seen": 44045968, + "step": 22489 + }, + { + "epoch": 2.98078197481776, + "grad_norm": 1.2716041803359985, + "learning_rate": 5.067035138300869e-10, + "loss": 0.0064, + "num_input_tokens_seen": 44048048, + "step": 22490 + }, + { + "epoch": 2.9809145129224652, + "grad_norm": 0.29344311356544495, + "learning_rate": 4.997388319463726e-10, + "loss": 0.0015, + "num_input_tokens_seen": 44049928, + "step": 22491 + }, + { + "epoch": 2.9810470510271703, + "grad_norm": 0.014174328185617924, + "learning_rate": 4.928223421518663e-10, + "loss": 0.0001, + "num_input_tokens_seen": 44051712, + "step": 22492 + }, + { + "epoch": 2.9811795891318753, + "grad_norm": 3.300236940383911, + "learning_rate": 4.859540445803501e-10, + "loss": 0.0228, + "num_input_tokens_seen": 44054040, + "step": 22493 + }, + { + "epoch": 2.9813121272365803, + "grad_norm": 0.04087667539715767, + "learning_rate": 4.791339393636629e-10, + "loss": 0.0003, + "num_input_tokens_seen": 44056880, + "step": 22494 + }, + { + "epoch": 2.981444665341286, + "grad_norm": 0.0296641793102026, + "learning_rate": 4.723620266336437e-10, + "loss": 0.0001, + "num_input_tokens_seen": 44058200, + "step": 22495 + }, + { + "epoch": 2.981577203445991, + "grad_norm": 11.552082061767578, + "learning_rate": 4.6563830652102127e-10, + "loss": 0.102, + "num_input_tokens_seen": 44060352, + "step": 22496 + }, + { + "epoch": 2.981709741550696, + "grad_norm": 0.003502878127619624, + "learning_rate": 4.5896277915513655e-10, + "loss": 0.0, + "num_input_tokens_seen": 44061784, + "step": 22497 + }, + { + "epoch": 2.981842279655401, + "grad_norm": 1.863431692123413, + "learning_rate": 4.5233544466449786e-10, + "loss": 0.0207, + "num_input_tokens_seen": 44064000, + "step": 22498 + }, + { + "epoch": 2.981974817760106, + "grad_norm": 3.064136266708374, + "learning_rate": 4.457563031776135e-10, + "loss": 0.0189, + "num_input_tokens_seen": 44066064, + "step": 22499 + }, + { + "epoch": 2.982107355864811, + "grad_norm": 0.005188322626054287, + "learning_rate": 4.3922535482077145e-10, + "loss": 0.0, + "num_input_tokens_seen": 44067472, + "step": 22500 + }, + { + "epoch": 2.982239893969516, + "grad_norm": 0.11821889132261276, + "learning_rate": 4.327425997199819e-10, + "loss": 0.0003, + "num_input_tokens_seen": 44069032, + "step": 22501 + }, + { + "epoch": 2.9823724320742215, + "grad_norm": 3.676971673965454, + "learning_rate": 4.26308038000145e-10, + "loss": 0.0209, + "num_input_tokens_seen": 44070760, + "step": 22502 + }, + { + "epoch": 2.9825049701789266, + "grad_norm": 0.012415418401360512, + "learning_rate": 4.1992166978560567e-10, + "loss": 0.0001, + "num_input_tokens_seen": 44072320, + "step": 22503 + }, + { + "epoch": 2.9826375082836316, + "grad_norm": 3.4719927310943604, + "learning_rate": 4.1358349519932115e-10, + "loss": 0.0084, + "num_input_tokens_seen": 44074304, + "step": 22504 + }, + { + "epoch": 2.9827700463883366, + "grad_norm": 4.517679691314697, + "learning_rate": 4.072935143636936e-10, + "loss": 0.0361, + "num_input_tokens_seen": 44075760, + "step": 22505 + }, + { + "epoch": 2.9829025844930417, + "grad_norm": 0.04147573933005333, + "learning_rate": 4.0105172739945963e-10, + "loss": 0.0002, + "num_input_tokens_seen": 44077208, + "step": 22506 + }, + { + "epoch": 2.9830351225977467, + "grad_norm": 8.671402931213379, + "learning_rate": 3.9485813442763365e-10, + "loss": 0.06, + "num_input_tokens_seen": 44078920, + "step": 22507 + }, + { + "epoch": 2.9831676607024518, + "grad_norm": 1.8591692447662354, + "learning_rate": 3.8871273556728706e-10, + "loss": 0.0085, + "num_input_tokens_seen": 44080912, + "step": 22508 + }, + { + "epoch": 2.9833001988071572, + "grad_norm": 0.035686902701854706, + "learning_rate": 3.8261553093693617e-10, + "loss": 0.0001, + "num_input_tokens_seen": 44082384, + "step": 22509 + }, + { + "epoch": 2.9834327369118623, + "grad_norm": 6.499451160430908, + "learning_rate": 3.76566520653987e-10, + "loss": 0.0249, + "num_input_tokens_seen": 44085016, + "step": 22510 + }, + { + "epoch": 2.9835652750165673, + "grad_norm": 3.103484630584717, + "learning_rate": 3.705657048355682e-10, + "loss": 0.0412, + "num_input_tokens_seen": 44086904, + "step": 22511 + }, + { + "epoch": 2.9836978131212724, + "grad_norm": 7.763436317443848, + "learning_rate": 3.646130835968653e-10, + "loss": 0.0919, + "num_input_tokens_seen": 44089464, + "step": 22512 + }, + { + "epoch": 2.9838303512259774, + "grad_norm": 8.521469116210938, + "learning_rate": 3.5870865705278647e-10, + "loss": 0.1375, + "num_input_tokens_seen": 44093072, + "step": 22513 + }, + { + "epoch": 2.9839628893306824, + "grad_norm": 12.874971389770508, + "learning_rate": 3.528524253171295e-10, + "loss": 0.2204, + "num_input_tokens_seen": 44095064, + "step": 22514 + }, + { + "epoch": 2.9840954274353875, + "grad_norm": 0.02091335505247116, + "learning_rate": 3.4704438850313715e-10, + "loss": 0.0002, + "num_input_tokens_seen": 44096400, + "step": 22515 + }, + { + "epoch": 2.984227965540093, + "grad_norm": 11.117433547973633, + "learning_rate": 3.4128454672238687e-10, + "loss": 0.0713, + "num_input_tokens_seen": 44098248, + "step": 22516 + }, + { + "epoch": 2.984360503644798, + "grad_norm": 2.2027745246887207, + "learning_rate": 3.3557290008617847e-10, + "loss": 0.0082, + "num_input_tokens_seen": 44100040, + "step": 22517 + }, + { + "epoch": 2.984493041749503, + "grad_norm": 0.24879984557628632, + "learning_rate": 3.299094487044241e-10, + "loss": 0.0015, + "num_input_tokens_seen": 44101360, + "step": 22518 + }, + { + "epoch": 2.984625579854208, + "grad_norm": 5.670065879821777, + "learning_rate": 3.2429419268648067e-10, + "loss": 0.0451, + "num_input_tokens_seen": 44103216, + "step": 22519 + }, + { + "epoch": 2.984758117958913, + "grad_norm": 19.086732864379883, + "learning_rate": 3.187271321405949e-10, + "loss": 0.2993, + "num_input_tokens_seen": 44104920, + "step": 22520 + }, + { + "epoch": 2.984890656063618, + "grad_norm": 7.6504435539245605, + "learning_rate": 3.1320826717418093e-10, + "loss": 0.0263, + "num_input_tokens_seen": 44107680, + "step": 22521 + }, + { + "epoch": 2.985023194168323, + "grad_norm": 4.6792402267456055, + "learning_rate": 3.077375978935426e-10, + "loss": 0.0732, + "num_input_tokens_seen": 44109800, + "step": 22522 + }, + { + "epoch": 2.9851557322730287, + "grad_norm": 2.2937140464782715, + "learning_rate": 3.0231512440415114e-10, + "loss": 0.0123, + "num_input_tokens_seen": 44111960, + "step": 22523 + }, + { + "epoch": 2.9852882703777337, + "grad_norm": 4.477351665496826, + "learning_rate": 2.9694084681064496e-10, + "loss": 0.0527, + "num_input_tokens_seen": 44113680, + "step": 22524 + }, + { + "epoch": 2.9854208084824387, + "grad_norm": 0.008505197241902351, + "learning_rate": 2.916147652165524e-10, + "loss": 0.0001, + "num_input_tokens_seen": 44115888, + "step": 22525 + }, + { + "epoch": 2.9855533465871438, + "grad_norm": 0.00716635026037693, + "learning_rate": 2.8633687972456915e-10, + "loss": 0.0, + "num_input_tokens_seen": 44117384, + "step": 22526 + }, + { + "epoch": 2.985685884691849, + "grad_norm": 7.6649699211120605, + "learning_rate": 2.8110719043628056e-10, + "loss": 0.0321, + "num_input_tokens_seen": 44118960, + "step": 22527 + }, + { + "epoch": 2.985818422796554, + "grad_norm": 21.992382049560547, + "learning_rate": 2.759256974529945e-10, + "loss": 0.1726, + "num_input_tokens_seen": 44120472, + "step": 22528 + }, + { + "epoch": 2.985950960901259, + "grad_norm": 0.8311477899551392, + "learning_rate": 2.707924008740759e-10, + "loss": 0.0026, + "num_input_tokens_seen": 44122768, + "step": 22529 + }, + { + "epoch": 2.9860834990059644, + "grad_norm": 0.6071740388870239, + "learning_rate": 2.657073007988897e-10, + "loss": 0.0018, + "num_input_tokens_seen": 44124360, + "step": 22530 + }, + { + "epoch": 2.9862160371106694, + "grad_norm": 14.059874534606934, + "learning_rate": 2.606703973254132e-10, + "loss": 0.1835, + "num_input_tokens_seen": 44126384, + "step": 22531 + }, + { + "epoch": 2.9863485752153744, + "grad_norm": 0.002456819172948599, + "learning_rate": 2.5568169055079085e-10, + "loss": 0.0, + "num_input_tokens_seen": 44127824, + "step": 22532 + }, + { + "epoch": 2.9864811133200795, + "grad_norm": 24.06719970703125, + "learning_rate": 2.507411805707793e-10, + "loss": 0.2299, + "num_input_tokens_seen": 44129832, + "step": 22533 + }, + { + "epoch": 2.9866136514247845, + "grad_norm": 1.5957129001617432, + "learning_rate": 2.458488674811355e-10, + "loss": 0.0087, + "num_input_tokens_seen": 44132280, + "step": 22534 + }, + { + "epoch": 2.98674618952949, + "grad_norm": 1.1026782989501953, + "learning_rate": 2.4100475137622816e-10, + "loss": 0.0028, + "num_input_tokens_seen": 44134200, + "step": 22535 + }, + { + "epoch": 2.9868787276341946, + "grad_norm": 0.047960489988327026, + "learning_rate": 2.362088323490386e-10, + "loss": 0.0003, + "num_input_tokens_seen": 44136240, + "step": 22536 + }, + { + "epoch": 2.9870112657389, + "grad_norm": 4.072607040405273, + "learning_rate": 2.3146111049199281e-10, + "loss": 0.0072, + "num_input_tokens_seen": 44137928, + "step": 22537 + }, + { + "epoch": 2.987143803843605, + "grad_norm": 0.01629113033413887, + "learning_rate": 2.267615858972394e-10, + "loss": 0.0001, + "num_input_tokens_seen": 44139584, + "step": 22538 + }, + { + "epoch": 2.98727634194831, + "grad_norm": 0.016805415973067284, + "learning_rate": 2.2211025865498392e-10, + "loss": 0.0001, + "num_input_tokens_seen": 44141152, + "step": 22539 + }, + { + "epoch": 2.987408880053015, + "grad_norm": 0.003400669898837805, + "learning_rate": 2.175071288548769e-10, + "loss": 0.0, + "num_input_tokens_seen": 44143464, + "step": 22540 + }, + { + "epoch": 2.9875414181577202, + "grad_norm": 3.5520541667938232, + "learning_rate": 2.1295219658573618e-10, + "loss": 0.0159, + "num_input_tokens_seen": 44145512, + "step": 22541 + }, + { + "epoch": 2.9876739562624257, + "grad_norm": 0.9891106486320496, + "learning_rate": 2.084454619352694e-10, + "loss": 0.0038, + "num_input_tokens_seen": 44147056, + "step": 22542 + }, + { + "epoch": 2.9878064943671303, + "grad_norm": 0.6001947522163391, + "learning_rate": 2.03986924990629e-10, + "loss": 0.0016, + "num_input_tokens_seen": 44149904, + "step": 22543 + }, + { + "epoch": 2.987939032471836, + "grad_norm": 5.142385482788086, + "learning_rate": 1.9957658583785733e-10, + "loss": 0.029, + "num_input_tokens_seen": 44152776, + "step": 22544 + }, + { + "epoch": 2.988071570576541, + "grad_norm": 8.903682708740234, + "learning_rate": 1.9521444456133133e-10, + "loss": 0.1973, + "num_input_tokens_seen": 44155600, + "step": 22545 + }, + { + "epoch": 2.988204108681246, + "grad_norm": 3.744699716567993, + "learning_rate": 1.9090050124598302e-10, + "loss": 0.0165, + "num_input_tokens_seen": 44157136, + "step": 22546 + }, + { + "epoch": 2.988336646785951, + "grad_norm": 9.884406089782715, + "learning_rate": 1.8663475597424652e-10, + "loss": 0.0885, + "num_input_tokens_seen": 44158640, + "step": 22547 + }, + { + "epoch": 2.988469184890656, + "grad_norm": 0.5473750233650208, + "learning_rate": 1.8241720882911096e-10, + "loss": 0.0012, + "num_input_tokens_seen": 44160152, + "step": 22548 + }, + { + "epoch": 2.9886017229953614, + "grad_norm": 16.15572166442871, + "learning_rate": 1.7824785989134508e-10, + "loss": 0.1119, + "num_input_tokens_seen": 44162104, + "step": 22549 + }, + { + "epoch": 2.988734261100066, + "grad_norm": 0.017358332872390747, + "learning_rate": 1.7412670924144004e-10, + "loss": 0.0001, + "num_input_tokens_seen": 44164312, + "step": 22550 + }, + { + "epoch": 2.9888667992047715, + "grad_norm": 0.05452156439423561, + "learning_rate": 1.700537569587768e-10, + "loss": 0.0004, + "num_input_tokens_seen": 44166440, + "step": 22551 + }, + { + "epoch": 2.9889993373094765, + "grad_norm": 0.5943291187286377, + "learning_rate": 1.660290031221812e-10, + "loss": 0.0027, + "num_input_tokens_seen": 44167944, + "step": 22552 + }, + { + "epoch": 2.9891318754141816, + "grad_norm": 1.6413581371307373, + "learning_rate": 1.6205244780909123e-10, + "loss": 0.0138, + "num_input_tokens_seen": 44170032, + "step": 22553 + }, + { + "epoch": 2.9892644135188866, + "grad_norm": 9.999971389770508, + "learning_rate": 1.5812409109638994e-10, + "loss": 0.1212, + "num_input_tokens_seen": 44171528, + "step": 22554 + }, + { + "epoch": 2.9893969516235916, + "grad_norm": 2.721292018890381, + "learning_rate": 1.5424393305929486e-10, + "loss": 0.0301, + "num_input_tokens_seen": 44172624, + "step": 22555 + }, + { + "epoch": 2.989529489728297, + "grad_norm": 0.2592402398586273, + "learning_rate": 1.5041197377302365e-10, + "loss": 0.0013, + "num_input_tokens_seen": 44174992, + "step": 22556 + }, + { + "epoch": 2.9896620278330017, + "grad_norm": 3.2145674228668213, + "learning_rate": 1.4662821331140609e-10, + "loss": 0.0217, + "num_input_tokens_seen": 44176512, + "step": 22557 + }, + { + "epoch": 2.989794565937707, + "grad_norm": 0.0013348259963095188, + "learning_rate": 1.428926517474394e-10, + "loss": 0.0, + "num_input_tokens_seen": 44177672, + "step": 22558 + }, + { + "epoch": 2.9899271040424122, + "grad_norm": 1.804548740386963, + "learning_rate": 1.3920528915301047e-10, + "loss": 0.0118, + "num_input_tokens_seen": 44179824, + "step": 22559 + }, + { + "epoch": 2.9900596421471173, + "grad_norm": 4.515546798706055, + "learning_rate": 1.3556612559917358e-10, + "loss": 0.0456, + "num_input_tokens_seen": 44182488, + "step": 22560 + }, + { + "epoch": 2.9901921802518223, + "grad_norm": 3.5564351081848145, + "learning_rate": 1.3197516115615038e-10, + "loss": 0.0196, + "num_input_tokens_seen": 44184392, + "step": 22561 + }, + { + "epoch": 2.9903247183565274, + "grad_norm": 11.872001647949219, + "learning_rate": 1.284323958933298e-10, + "loss": 0.2508, + "num_input_tokens_seen": 44186488, + "step": 22562 + }, + { + "epoch": 2.990457256461233, + "grad_norm": 0.06354028731584549, + "learning_rate": 1.249378298789905e-10, + "loss": 0.0005, + "num_input_tokens_seen": 44188768, + "step": 22563 + }, + { + "epoch": 2.9905897945659374, + "grad_norm": 0.009352052584290504, + "learning_rate": 1.2149146318030103e-10, + "loss": 0.0001, + "num_input_tokens_seen": 44190824, + "step": 22564 + }, + { + "epoch": 2.990722332670643, + "grad_norm": 0.004185742232948542, + "learning_rate": 1.1809329586387476e-10, + "loss": 0.0, + "num_input_tokens_seen": 44192504, + "step": 22565 + }, + { + "epoch": 2.990854870775348, + "grad_norm": 0.02018456719815731, + "learning_rate": 1.1474332799493726e-10, + "loss": 0.0001, + "num_input_tokens_seen": 44195280, + "step": 22566 + }, + { + "epoch": 2.990987408880053, + "grad_norm": 1.6735354661941528, + "learning_rate": 1.1144155963871417e-10, + "loss": 0.0052, + "num_input_tokens_seen": 44197072, + "step": 22567 + }, + { + "epoch": 2.991119946984758, + "grad_norm": 0.3514725863933563, + "learning_rate": 1.0818799085821063e-10, + "loss": 0.001, + "num_input_tokens_seen": 44198536, + "step": 22568 + }, + { + "epoch": 2.991252485089463, + "grad_norm": 0.026112373918294907, + "learning_rate": 1.0498262171670937e-10, + "loss": 0.0001, + "num_input_tokens_seen": 44200768, + "step": 22569 + }, + { + "epoch": 2.9913850231941685, + "grad_norm": 0.005100660491734743, + "learning_rate": 1.0182545227555018e-10, + "loss": 0.0, + "num_input_tokens_seen": 44202896, + "step": 22570 + }, + { + "epoch": 2.9915175612988736, + "grad_norm": 1.691542387008667, + "learning_rate": 9.871648259579536e-11, + "loss": 0.0086, + "num_input_tokens_seen": 44204488, + "step": 22571 + }, + { + "epoch": 2.9916500994035786, + "grad_norm": 0.0029435441829264164, + "learning_rate": 9.565571273739693e-11, + "loss": 0.0, + "num_input_tokens_seen": 44206608, + "step": 22572 + }, + { + "epoch": 2.9917826375082837, + "grad_norm": 0.04887622594833374, + "learning_rate": 9.264314275919673e-11, + "loss": 0.0005, + "num_input_tokens_seen": 44209008, + "step": 22573 + }, + { + "epoch": 2.9919151756129887, + "grad_norm": 6.049227237701416, + "learning_rate": 8.9678772719759e-11, + "loss": 0.0492, + "num_input_tokens_seen": 44211152, + "step": 22574 + }, + { + "epoch": 2.9920477137176937, + "grad_norm": 0.1889122724533081, + "learning_rate": 8.676260267570514e-11, + "loss": 0.0009, + "num_input_tokens_seen": 44213528, + "step": 22575 + }, + { + "epoch": 2.9921802518223988, + "grad_norm": 0.0025527379475533962, + "learning_rate": 8.389463268337894e-11, + "loss": 0.0, + "num_input_tokens_seen": 44215128, + "step": 22576 + }, + { + "epoch": 2.9923127899271043, + "grad_norm": 0.031091947108507156, + "learning_rate": 8.107486279829157e-11, + "loss": 0.0002, + "num_input_tokens_seen": 44216504, + "step": 22577 + }, + { + "epoch": 2.9924453280318093, + "grad_norm": 0.058732688426971436, + "learning_rate": 7.830329307484397e-11, + "loss": 0.0002, + "num_input_tokens_seen": 44218464, + "step": 22578 + }, + { + "epoch": 2.9925778661365143, + "grad_norm": 0.08767752349376678, + "learning_rate": 7.557992356604926e-11, + "loss": 0.0005, + "num_input_tokens_seen": 44219944, + "step": 22579 + }, + { + "epoch": 2.9927104042412194, + "grad_norm": 6.561985015869141, + "learning_rate": 7.290475432464305e-11, + "loss": 0.0376, + "num_input_tokens_seen": 44221752, + "step": 22580 + }, + { + "epoch": 2.9928429423459244, + "grad_norm": 0.0009230970172211528, + "learning_rate": 7.027778540252827e-11, + "loss": 0.0, + "num_input_tokens_seen": 44223144, + "step": 22581 + }, + { + "epoch": 2.9929754804506294, + "grad_norm": 0.04239053651690483, + "learning_rate": 6.769901684994251e-11, + "loss": 0.0003, + "num_input_tokens_seen": 44224520, + "step": 22582 + }, + { + "epoch": 2.9931080185553345, + "grad_norm": 8.81534481048584, + "learning_rate": 6.51684487168458e-11, + "loss": 0.0337, + "num_input_tokens_seen": 44226248, + "step": 22583 + }, + { + "epoch": 2.99324055666004, + "grad_norm": 6.648081302642822, + "learning_rate": 6.268608105153285e-11, + "loss": 0.0174, + "num_input_tokens_seen": 44228128, + "step": 22584 + }, + { + "epoch": 2.993373094764745, + "grad_norm": 5.100999355316162, + "learning_rate": 6.02519139025759e-11, + "loss": 0.0402, + "num_input_tokens_seen": 44230344, + "step": 22585 + }, + { + "epoch": 2.99350563286945, + "grad_norm": 0.0026779421605169773, + "learning_rate": 5.786594731660433e-11, + "loss": 0.0, + "num_input_tokens_seen": 44232328, + "step": 22586 + }, + { + "epoch": 2.993638170974155, + "grad_norm": 11.860074996948242, + "learning_rate": 5.552818133941484e-11, + "loss": 0.0903, + "num_input_tokens_seen": 44233584, + "step": 22587 + }, + { + "epoch": 2.99377070907886, + "grad_norm": 0.14862053096294403, + "learning_rate": 5.323861601652658e-11, + "loss": 0.0011, + "num_input_tokens_seen": 44235288, + "step": 22588 + }, + { + "epoch": 2.993903247183565, + "grad_norm": 8.668434143066406, + "learning_rate": 5.099725139151579e-11, + "loss": 0.0902, + "num_input_tokens_seen": 44236944, + "step": 22589 + }, + { + "epoch": 2.99403578528827, + "grad_norm": 0.0036657059099525213, + "learning_rate": 4.880408750823629e-11, + "loss": 0.0, + "num_input_tokens_seen": 44238264, + "step": 22590 + }, + { + "epoch": 2.9941683233929757, + "grad_norm": 0.005773690063506365, + "learning_rate": 4.665912440832143e-11, + "loss": 0.0, + "num_input_tokens_seen": 44240160, + "step": 22591 + }, + { + "epoch": 2.9943008614976807, + "grad_norm": 12.544565200805664, + "learning_rate": 4.4562362133682144e-11, + "loss": 0.1789, + "num_input_tokens_seen": 44241752, + "step": 22592 + }, + { + "epoch": 2.9944333996023857, + "grad_norm": 1.4391350746154785, + "learning_rate": 4.251380072428646e-11, + "loss": 0.0072, + "num_input_tokens_seen": 44243072, + "step": 22593 + }, + { + "epoch": 2.994565937707091, + "grad_norm": 12.043181419372559, + "learning_rate": 4.0513440220102395e-11, + "loss": 0.0887, + "num_input_tokens_seen": 44245064, + "step": 22594 + }, + { + "epoch": 2.994698475811796, + "grad_norm": 0.003705475479364395, + "learning_rate": 3.856128065943265e-11, + "loss": 0.0, + "num_input_tokens_seen": 44247000, + "step": 22595 + }, + { + "epoch": 2.994831013916501, + "grad_norm": 0.6523666977882385, + "learning_rate": 3.665732207974726e-11, + "loss": 0.003, + "num_input_tokens_seen": 44248984, + "step": 22596 + }, + { + "epoch": 2.994963552021206, + "grad_norm": 0.020966388285160065, + "learning_rate": 3.480156451796113e-11, + "loss": 0.0001, + "num_input_tokens_seen": 44250576, + "step": 22597 + }, + { + "epoch": 2.9950960901259114, + "grad_norm": 5.690288543701172, + "learning_rate": 3.2994008009878954e-11, + "loss": 0.0901, + "num_input_tokens_seen": 44253224, + "step": 22598 + }, + { + "epoch": 2.9952286282306164, + "grad_norm": 6.685713768005371, + "learning_rate": 3.1234652590472756e-11, + "loss": 0.0525, + "num_input_tokens_seen": 44255072, + "step": 22599 + }, + { + "epoch": 2.9953611663353215, + "grad_norm": 3.420220375061035, + "learning_rate": 2.952349829332679e-11, + "loss": 0.0296, + "num_input_tokens_seen": 44257408, + "step": 22600 + }, + { + "epoch": 2.9954937044400265, + "grad_norm": 0.003240854712203145, + "learning_rate": 2.7860545151747743e-11, + "loss": 0.0, + "num_input_tokens_seen": 44259456, + "step": 22601 + }, + { + "epoch": 2.9956262425447315, + "grad_norm": 4.917562484741211, + "learning_rate": 2.6245793197376966e-11, + "loss": 0.1933, + "num_input_tokens_seen": 44261464, + "step": 22602 + }, + { + "epoch": 2.9957587806494366, + "grad_norm": 0.0024403154384344816, + "learning_rate": 2.4679242461855824e-11, + "loss": 0.0, + "num_input_tokens_seen": 44262992, + "step": 22603 + }, + { + "epoch": 2.9958913187541416, + "grad_norm": 0.19025234878063202, + "learning_rate": 2.316089297516033e-11, + "loss": 0.0009, + "num_input_tokens_seen": 44264936, + "step": 22604 + }, + { + "epoch": 2.996023856858847, + "grad_norm": 3.1127891540527344, + "learning_rate": 2.1690744766711404e-11, + "loss": 0.0103, + "num_input_tokens_seen": 44266608, + "step": 22605 + }, + { + "epoch": 2.996156394963552, + "grad_norm": 0.020279603078961372, + "learning_rate": 2.0268797864542165e-11, + "loss": 0.0001, + "num_input_tokens_seen": 44268088, + "step": 22606 + }, + { + "epoch": 2.996288933068257, + "grad_norm": 14.725159645080566, + "learning_rate": 1.8895052296130644e-11, + "loss": 0.2132, + "num_input_tokens_seen": 44270304, + "step": 22607 + }, + { + "epoch": 2.996421471172962, + "grad_norm": 1.5043866634368896, + "learning_rate": 1.7569508088122188e-11, + "loss": 0.0072, + "num_input_tokens_seen": 44273064, + "step": 22608 + }, + { + "epoch": 2.9965540092776672, + "grad_norm": 9.167046546936035, + "learning_rate": 1.6292165266051928e-11, + "loss": 0.0885, + "num_input_tokens_seen": 44275352, + "step": 22609 + }, + { + "epoch": 2.9966865473823723, + "grad_norm": 0.012785854749381542, + "learning_rate": 1.5063023854622327e-11, + "loss": 0.0, + "num_input_tokens_seen": 44277384, + "step": 22610 + }, + { + "epoch": 2.9968190854870773, + "grad_norm": 5.353344440460205, + "learning_rate": 1.3882083877425623e-11, + "loss": 0.0449, + "num_input_tokens_seen": 44279248, + "step": 22611 + }, + { + "epoch": 2.996951623591783, + "grad_norm": 5.932577133178711, + "learning_rate": 1.2749345356943832e-11, + "loss": 0.0851, + "num_input_tokens_seen": 44281792, + "step": 22612 + }, + { + "epoch": 2.997084161696488, + "grad_norm": 0.02163410186767578, + "learning_rate": 1.166480831565897e-11, + "loss": 0.0001, + "num_input_tokens_seen": 44283144, + "step": 22613 + }, + { + "epoch": 2.997216699801193, + "grad_norm": 0.025362735614180565, + "learning_rate": 1.0628472774110166e-11, + "loss": 0.0001, + "num_input_tokens_seen": 44285152, + "step": 22614 + }, + { + "epoch": 2.997349237905898, + "grad_norm": 0.006480336654931307, + "learning_rate": 9.640338752003874e-12, + "loss": 0.0, + "num_input_tokens_seen": 44286680, + "step": 22615 + }, + { + "epoch": 2.997481776010603, + "grad_norm": 0.2815084457397461, + "learning_rate": 8.700406269046557e-12, + "loss": 0.0014, + "num_input_tokens_seen": 44288672, + "step": 22616 + }, + { + "epoch": 2.997614314115308, + "grad_norm": 0.770248532295227, + "learning_rate": 7.808675342724226e-12, + "loss": 0.0045, + "num_input_tokens_seen": 44290808, + "step": 22617 + }, + { + "epoch": 2.997746852220013, + "grad_norm": 6.655081272125244, + "learning_rate": 6.965145990522892e-12, + "loss": 0.067, + "num_input_tokens_seen": 44294128, + "step": 22618 + }, + { + "epoch": 2.9978793903247185, + "grad_norm": 9.320930480957031, + "learning_rate": 6.169818228818347e-12, + "loss": 0.2274, + "num_input_tokens_seen": 44296600, + "step": 22619 + }, + { + "epoch": 2.9980119284294235, + "grad_norm": 0.0037947280798107386, + "learning_rate": 5.422692072876157e-12, + "loss": 0.0, + "num_input_tokens_seen": 44297888, + "step": 22620 + }, + { + "epoch": 2.9981444665341286, + "grad_norm": 8.301339149475098, + "learning_rate": 4.723767536851665e-12, + "loss": 0.202, + "num_input_tokens_seen": 44299320, + "step": 22621 + }, + { + "epoch": 2.9982770046388336, + "grad_norm": 0.0010123125975951552, + "learning_rate": 4.073044634345102e-12, + "loss": 0.0, + "num_input_tokens_seen": 44300664, + "step": 22622 + }, + { + "epoch": 2.9984095427435387, + "grad_norm": 0.012102228589355946, + "learning_rate": 3.4705233781240354e-12, + "loss": 0.0001, + "num_input_tokens_seen": 44303000, + "step": 22623 + }, + { + "epoch": 2.998542080848244, + "grad_norm": 0.06029818207025528, + "learning_rate": 2.9162037798458056e-12, + "loss": 0.0003, + "num_input_tokens_seen": 44304840, + "step": 22624 + }, + { + "epoch": 2.9986746189529487, + "grad_norm": 0.025797370821237564, + "learning_rate": 2.410085849779975e-12, + "loss": 0.0, + "num_input_tokens_seen": 44306280, + "step": 22625 + }, + { + "epoch": 2.998807157057654, + "grad_norm": 0.030400771647691727, + "learning_rate": 1.9521695979185518e-12, + "loss": 0.0002, + "num_input_tokens_seen": 44308704, + "step": 22626 + }, + { + "epoch": 2.9989396951623593, + "grad_norm": 0.15217946469783783, + "learning_rate": 1.5424550331433197e-12, + "loss": 0.0007, + "num_input_tokens_seen": 44311464, + "step": 22627 + }, + { + "epoch": 2.9990722332670643, + "grad_norm": 2.7459113597869873, + "learning_rate": 1.18094216322584e-12, + "loss": 0.0105, + "num_input_tokens_seen": 44313784, + "step": 22628 + }, + { + "epoch": 2.9992047713717693, + "grad_norm": 0.0004933705204166472, + "learning_rate": 8.676309951050066e-13, + "loss": 0.0, + "num_input_tokens_seen": 44315000, + "step": 22629 + }, + { + "epoch": 2.9993373094764744, + "grad_norm": 6.124077796936035, + "learning_rate": 6.025215351646019e-13, + "loss": 0.1049, + "num_input_tokens_seen": 44316504, + "step": 22630 + }, + { + "epoch": 2.99946984758118, + "grad_norm": 0.4443013072013855, + "learning_rate": 3.856137881230737e-13, + "loss": 0.0017, + "num_input_tokens_seen": 44318792, + "step": 22631 + }, + { + "epoch": 2.9996023856858844, + "grad_norm": 8.954855918884277, + "learning_rate": 2.1690775814375842e-13, + "loss": 0.0622, + "num_input_tokens_seen": 44320520, + "step": 22632 + }, + { + "epoch": 2.99973492379059, + "grad_norm": 5.586569309234619, + "learning_rate": 9.640344883488084e-14, + "loss": 0.0457, + "num_input_tokens_seen": 44322392, + "step": 22633 + }, + { + "epoch": 2.999867461895295, + "grad_norm": 0.010882946662604809, + "learning_rate": 2.4100862416887028e-14, + "loss": 0.0001, + "num_input_tokens_seen": 44324256, + "step": 22634 + }, + { + "epoch": 3.0, + "grad_norm": 0.3562160134315491, + "learning_rate": 0.0, + "loss": 0.0017, + "num_input_tokens_seen": 44325984, + "step": 22635 + }, + { + "epoch": 3.0, + "num_input_tokens_seen": 44325984, + "step": 22635, + "total_flos": 1.8804729049283297e+18, + "train_loss": 0.11007208550827785, + "train_runtime": 220429.5839, + "train_samples_per_second": 0.205, + "train_steps_per_second": 0.103 + } + ], + "logging_steps": 1, + "max_steps": 22635, + "num_input_tokens_seen": 44325984, + "num_train_epochs": 3, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.8804729049283297e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}