{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 201, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014925373134328358, "grad_norm": 1.690108060836792, "learning_rate": 3.3333333333333335e-05, "loss": 2.0929, "step": 1 }, { "epoch": 0.029850746268656716, "grad_norm": 1.8146634101867676, "learning_rate": 6.666666666666667e-05, "loss": 2.2396, "step": 2 }, { "epoch": 0.04477611940298507, "grad_norm": 1.9881230592727661, "learning_rate": 0.0001, "loss": 2.2041, "step": 3 }, { "epoch": 0.05970149253731343, "grad_norm": 2.016158103942871, "learning_rate": 0.00013333333333333334, "loss": 2.176, "step": 4 }, { "epoch": 0.07462686567164178, "grad_norm": 1.7050724029541016, "learning_rate": 0.0001666666666666667, "loss": 2.0313, "step": 5 }, { "epoch": 0.08955223880597014, "grad_norm": 1.3839370012283325, "learning_rate": 0.0002, "loss": 1.818, "step": 6 }, { "epoch": 0.1044776119402985, "grad_norm": 1.2064529657363892, "learning_rate": 0.00019998832024742372, "loss": 1.6223, "step": 7 }, { "epoch": 0.11940298507462686, "grad_norm": 1.4260923862457275, "learning_rate": 0.00019995328402117537, "loss": 1.5066, "step": 8 }, { "epoch": 0.13432835820895522, "grad_norm": 1.4008389711380005, "learning_rate": 0.0001998949004149094, "loss": 1.2986, "step": 9 }, { "epoch": 0.14925373134328357, "grad_norm": 1.0532835721969604, "learning_rate": 0.00019981318458209423, "loss": 1.2209, "step": 10 }, { "epoch": 0.16417910447761194, "grad_norm": 1.0162363052368164, "learning_rate": 0.0001997081577320789, "loss": 1.1007, "step": 11 }, { "epoch": 0.1791044776119403, "grad_norm": 1.0316762924194336, "learning_rate": 0.00019957984712458838, "loss": 1.0087, "step": 12 }, { "epoch": 0.19402985074626866, "grad_norm": 1.4354606866836548, "learning_rate": 0.00019942828606264818, "loss": 1.2594, "step": 13 }, { "epoch": 0.208955223880597, "grad_norm": 2.1245648860931396, "learning_rate": 0.0001992535138839406, "loss": 1.4087, "step": 14 }, { "epoch": 0.22388059701492538, "grad_norm": 1.2807283401489258, "learning_rate": 0.0001990555759505946, "loss": 1.3497, "step": 15 }, { "epoch": 0.23880597014925373, "grad_norm": 0.7393147349357605, "learning_rate": 0.00019883452363741216, "loss": 1.263, "step": 16 }, { "epoch": 0.2537313432835821, "grad_norm": 0.9481880068778992, "learning_rate": 0.0001985904143185338, "loss": 1.2411, "step": 17 }, { "epoch": 0.26865671641791045, "grad_norm": 0.8055089116096497, "learning_rate": 0.00019832331135254724, "loss": 1.1185, "step": 18 }, { "epoch": 0.2835820895522388, "grad_norm": 0.5172561407089233, "learning_rate": 0.00019803328406604252, "loss": 1.1209, "step": 19 }, { "epoch": 0.29850746268656714, "grad_norm": 0.49716681241989136, "learning_rate": 0.00019772040773561854, "loss": 1.0457, "step": 20 }, { "epoch": 0.31343283582089554, "grad_norm": 0.6212261915206909, "learning_rate": 0.0001973847635683447, "loss": 1.0181, "step": 21 }, { "epoch": 0.3283582089552239, "grad_norm": 0.5520172119140625, "learning_rate": 0.0001970264386806839, "loss": 1.0484, "step": 22 }, { "epoch": 0.34328358208955223, "grad_norm": 0.4611297845840454, "learning_rate": 0.00019664552607588117, "loss": 0.9581, "step": 23 }, { "epoch": 0.3582089552238806, "grad_norm": 0.4654655158519745, "learning_rate": 0.00019624212461982497, "loss": 0.9156, "step": 24 }, { "epoch": 0.373134328358209, "grad_norm": 0.6210163831710815, "learning_rate": 0.00019581633901538626, "loss": 0.8496, "step": 25 }, { "epoch": 0.3880597014925373, "grad_norm": 0.8395450711250305, "learning_rate": 0.0001953682797752431, "loss": 1.3176, "step": 26 }, { "epoch": 0.40298507462686567, "grad_norm": 0.695261538028717, "learning_rate": 0.00019489806319319687, "loss": 1.1809, "step": 27 }, { "epoch": 0.417910447761194, "grad_norm": 0.48507070541381836, "learning_rate": 0.0001944058113139884, "loss": 1.166, "step": 28 }, { "epoch": 0.43283582089552236, "grad_norm": 0.4849218726158142, "learning_rate": 0.00019389165190162114, "loss": 1.1374, "step": 29 }, { "epoch": 0.44776119402985076, "grad_norm": 0.5968400835990906, "learning_rate": 0.00019335571840619988, "loss": 1.1094, "step": 30 }, { "epoch": 0.4626865671641791, "grad_norm": 0.7121988534927368, "learning_rate": 0.00019279814992929418, "loss": 1.0958, "step": 31 }, { "epoch": 0.47761194029850745, "grad_norm": 0.5847436189651489, "learning_rate": 0.00019221909118783407, "loss": 0.9935, "step": 32 }, { "epoch": 0.4925373134328358, "grad_norm": 0.4457413852214813, "learning_rate": 0.0001916186924765491, "loss": 0.9889, "step": 33 }, { "epoch": 0.5074626865671642, "grad_norm": 0.3817737102508545, "learning_rate": 0.00019099710962895908, "loss": 0.9586, "step": 34 }, { "epoch": 0.5223880597014925, "grad_norm": 0.4596458077430725, "learning_rate": 0.0001903545039769278, "loss": 0.9506, "step": 35 }, { "epoch": 0.5373134328358209, "grad_norm": 0.45744407176971436, "learning_rate": 0.0001896910423087889, "loss": 0.8788, "step": 36 }, { "epoch": 0.5522388059701493, "grad_norm": 0.4180223345756531, "learning_rate": 0.00018900689682605642, "loss": 0.8059, "step": 37 }, { "epoch": 0.5671641791044776, "grad_norm": 0.5419154167175293, "learning_rate": 0.00018830224509872953, "loss": 1.0703, "step": 38 }, { "epoch": 0.582089552238806, "grad_norm": 0.8379240036010742, "learning_rate": 0.00018757727001920445, "loss": 1.2347, "step": 39 }, { "epoch": 0.5970149253731343, "grad_norm": 0.7408146262168884, "learning_rate": 0.00018683215975480452, "loss": 1.2051, "step": 40 }, { "epoch": 0.6119402985074627, "grad_norm": 0.4733333885669708, "learning_rate": 0.00018606710769894153, "loss": 1.112, "step": 41 }, { "epoch": 0.6268656716417911, "grad_norm": 0.3522099554538727, "learning_rate": 0.0001852823124209204, "loss": 1.0547, "step": 42 }, { "epoch": 0.6417910447761194, "grad_norm": 0.4999963939189911, "learning_rate": 0.00018447797761440051, "loss": 1.0523, "step": 43 }, { "epoch": 0.6567164179104478, "grad_norm": 0.6665084362030029, "learning_rate": 0.00018365431204452683, "loss": 1.0204, "step": 44 }, { "epoch": 0.6716417910447762, "grad_norm": 0.6554574370384216, "learning_rate": 0.00018281152949374527, "loss": 1.0063, "step": 45 }, { "epoch": 0.6865671641791045, "grad_norm": 0.6276643872261047, "learning_rate": 0.00018194984870631512, "loss": 0.9716, "step": 46 }, { "epoch": 0.7014925373134329, "grad_norm": 0.49094897508621216, "learning_rate": 0.00018106949333153405, "loss": 0.934, "step": 47 }, { "epoch": 0.7164179104477612, "grad_norm": 0.4001779556274414, "learning_rate": 0.00018017069186569001, "loss": 0.9578, "step": 48 }, { "epoch": 0.7313432835820896, "grad_norm": 0.40681931376457214, "learning_rate": 0.00017925367759275495, "loss": 0.8893, "step": 49 }, { "epoch": 0.746268656716418, "grad_norm": 0.47569212317466736, "learning_rate": 0.00017831868852383583, "loss": 0.7641, "step": 50 }, { "epoch": 0.7611940298507462, "grad_norm": 0.6968297362327576, "learning_rate": 0.00017736596733539909, "loss": 1.2385, "step": 51 }, { "epoch": 0.7761194029850746, "grad_norm": 0.7256303429603577, "learning_rate": 0.00017639576130628376, "loss": 1.2058, "step": 52 }, { "epoch": 0.7910447761194029, "grad_norm": 0.6893110275268555, "learning_rate": 0.00017540832225352012, "loss": 1.2222, "step": 53 }, { "epoch": 0.8059701492537313, "grad_norm": 0.4945465326309204, "learning_rate": 0.0001744039064669709, "loss": 1.1218, "step": 54 }, { "epoch": 0.8208955223880597, "grad_norm": 0.4019485414028168, "learning_rate": 0.00017338277464281108, "loss": 1.0477, "step": 55 }, { "epoch": 0.835820895522388, "grad_norm": 0.3658827245235443, "learning_rate": 0.00017234519181586396, "loss": 1.0071, "step": 56 }, { "epoch": 0.8507462686567164, "grad_norm": 0.3937952220439911, "learning_rate": 0.00017129142729081177, "loss": 0.9923, "step": 57 }, { "epoch": 0.8656716417910447, "grad_norm": 0.4745718836784363, "learning_rate": 0.00017022175457229725, "loss": 0.9878, "step": 58 }, { "epoch": 0.8805970149253731, "grad_norm": 0.5112557411193848, "learning_rate": 0.00016913645129393578, "loss": 0.9179, "step": 59 }, { "epoch": 0.8955223880597015, "grad_norm": 0.5845867991447449, "learning_rate": 0.00016803579914625535, "loss": 0.9084, "step": 60 }, { "epoch": 0.9104477611940298, "grad_norm": 0.5979167222976685, "learning_rate": 0.00016692008380358395, "loss": 0.953, "step": 61 }, { "epoch": 0.9253731343283582, "grad_norm": 0.5538232922554016, "learning_rate": 0.00016578959484990263, "loss": 0.8545, "step": 62 }, { "epoch": 0.9402985074626866, "grad_norm": 0.3931979835033417, "learning_rate": 0.00016464462570368402, "loss": 0.9686, "step": 63 }, { "epoch": 0.9552238805970149, "grad_norm": 0.36252066493034363, "learning_rate": 0.00016348547354173558, "loss": 1.1047, "step": 64 }, { "epoch": 0.9701492537313433, "grad_norm": 0.4850124418735504, "learning_rate": 0.0001623124392220673, "loss": 1.0296, "step": 65 }, { "epoch": 0.9850746268656716, "grad_norm": 0.4648304879665375, "learning_rate": 0.00016112582720580402, "loss": 0.9287, "step": 66 }, { "epoch": 1.0, "grad_norm": 0.3998699486255646, "learning_rate": 0.0001599259454781625, "loss": 0.9156, "step": 67 }, { "epoch": 1.0149253731343284, "grad_norm": 0.421875536441803, "learning_rate": 0.00015871310546851383, "loss": 1.1974, "step": 68 }, { "epoch": 1.0298507462686568, "grad_norm": 0.39584335684776306, "learning_rate": 0.00015748762196955197, "loss": 1.1015, "step": 69 }, { "epoch": 1.044776119402985, "grad_norm": 0.37235844135284424, "learning_rate": 0.00015624981305558918, "loss": 1.0992, "step": 70 }, { "epoch": 1.0597014925373134, "grad_norm": 0.3330038785934448, "learning_rate": 0.000155, "loss": 1.0614, "step": 71 }, { "epoch": 1.0746268656716418, "grad_norm": 0.3231871724128723, "learning_rate": 0.00015373850719183454, "loss": 0.9913, "step": 72 }, { "epoch": 1.0895522388059702, "grad_norm": 0.3235574960708618, "learning_rate": 0.0001524656620516234, "loss": 0.9477, "step": 73 }, { "epoch": 1.1044776119402986, "grad_norm": 0.37417852878570557, "learning_rate": 0.0001511817949463956, "loss": 0.9286, "step": 74 }, { "epoch": 1.1194029850746268, "grad_norm": 0.39474931359291077, "learning_rate": 0.00014988723910393175, "loss": 0.9042, "step": 75 }, { "epoch": 1.1343283582089552, "grad_norm": 0.485606849193573, "learning_rate": 0.00014858233052627488, "loss": 0.8646, "step": 76 }, { "epoch": 1.1492537313432836, "grad_norm": 0.49617916345596313, "learning_rate": 0.00014726740790252108, "loss": 0.8742, "step": 77 }, { "epoch": 1.164179104477612, "grad_norm": 0.46328800916671753, "learning_rate": 0.0001459428125209126, "loss": 0.8256, "step": 78 }, { "epoch": 1.1791044776119404, "grad_norm": 0.41910260915756226, "learning_rate": 0.0001446088881802566, "loss": 0.7242, "step": 79 }, { "epoch": 1.1940298507462686, "grad_norm": 0.3721103072166443, "learning_rate": 0.000143265981100692, "loss": 0.9349, "step": 80 }, { "epoch": 1.208955223880597, "grad_norm": 0.35940876603126526, "learning_rate": 0.00014191443983382822, "loss": 1.1032, "step": 81 }, { "epoch": 1.2238805970149254, "grad_norm": 0.4461018145084381, "learning_rate": 0.00014055461517227847, "loss": 1.0979, "step": 82 }, { "epoch": 1.2388059701492538, "grad_norm": 0.5155666470527649, "learning_rate": 0.00013918686005861145, "loss": 1.0231, "step": 83 }, { "epoch": 1.2537313432835822, "grad_norm": 0.5260865092277527, "learning_rate": 0.00013781152949374526, "loss": 1.0228, "step": 84 }, { "epoch": 1.2686567164179103, "grad_norm": 0.47857630252838135, "learning_rate": 0.0001364289804448068, "loss": 0.9576, "step": 85 }, { "epoch": 1.2835820895522387, "grad_norm": 0.4128516912460327, "learning_rate": 0.00013503957175248075, "loss": 0.9763, "step": 86 }, { "epoch": 1.2985074626865671, "grad_norm": 0.3800605535507202, "learning_rate": 0.00013364366403787283, "loss": 0.9272, "step": 87 }, { "epoch": 1.3134328358208955, "grad_norm": 0.3839597702026367, "learning_rate": 0.00013224161960891025, "loss": 0.8632, "step": 88 }, { "epoch": 1.328358208955224, "grad_norm": 0.4173763394355774, "learning_rate": 0.0001308338023663049, "loss": 0.8838, "step": 89 }, { "epoch": 1.3432835820895521, "grad_norm": 0.5093969106674194, "learning_rate": 0.00012942057770910255, "loss": 0.8444, "step": 90 }, { "epoch": 1.3582089552238805, "grad_norm": 0.5481818914413452, "learning_rate": 0.00012800231243984401, "loss": 0.7714, "step": 91 }, { "epoch": 1.373134328358209, "grad_norm": 0.6201927065849304, "learning_rate": 0.00012657937466936106, "loss": 0.707, "step": 92 }, { "epoch": 1.3880597014925373, "grad_norm": 0.5079281330108643, "learning_rate": 0.00012515213372123345, "loss": 1.1623, "step": 93 }, { "epoch": 1.4029850746268657, "grad_norm": 0.4763680398464203, "learning_rate": 0.0001237209600359311, "loss": 1.1181, "step": 94 }, { "epoch": 1.417910447761194, "grad_norm": 0.38111981749534607, "learning_rate": 0.00012228622507466587, "loss": 0.9978, "step": 95 }, { "epoch": 1.4328358208955223, "grad_norm": 0.38934579491615295, "learning_rate": 0.00012084830122297907, "loss": 1.0207, "step": 96 }, { "epoch": 1.4477611940298507, "grad_norm": 0.5346177816390991, "learning_rate": 0.00011940756169408881, "loss": 1.0358, "step": 97 }, { "epoch": 1.462686567164179, "grad_norm": 0.5502073764801025, "learning_rate": 0.00011796438043202227, "loss": 0.919, "step": 98 }, { "epoch": 1.4776119402985075, "grad_norm": 0.5650199055671692, "learning_rate": 0.00011651913201455864, "loss": 0.9101, "step": 99 }, { "epoch": 1.4925373134328357, "grad_norm": 0.46825388073921204, "learning_rate": 0.00011507219155600737, "loss": 0.9228, "step": 100 }, { "epoch": 1.5074626865671643, "grad_norm": 0.42388054728507996, "learning_rate": 0.00011362393460984737, "loss": 0.8535, "step": 101 }, { "epoch": 1.5223880597014925, "grad_norm": 0.39270514249801636, "learning_rate": 0.00011217473707125192, "loss": 0.8353, "step": 102 }, { "epoch": 1.537313432835821, "grad_norm": 0.42990821599960327, "learning_rate": 0.0001107249750795251, "loss": 0.7997, "step": 103 }, { "epoch": 1.5522388059701493, "grad_norm": 0.4879288375377655, "learning_rate": 0.00010927502492047492, "loss": 0.7087, "step": 104 }, { "epoch": 1.5671641791044775, "grad_norm": 0.4773794114589691, "learning_rate": 0.00010782526292874813, "loss": 0.9446, "step": 105 }, { "epoch": 1.582089552238806, "grad_norm": 0.3559342622756958, "learning_rate": 0.00010637606539015268, "loss": 1.0926, "step": 106 }, { "epoch": 1.5970149253731343, "grad_norm": 0.38176605105400085, "learning_rate": 0.00010492780844399264, "loss": 1.0961, "step": 107 }, { "epoch": 1.6119402985074627, "grad_norm": 0.37334877252578735, "learning_rate": 0.00010348086798544141, "loss": 1.0699, "step": 108 }, { "epoch": 1.626865671641791, "grad_norm": 0.34911951422691345, "learning_rate": 0.00010203561956797775, "loss": 0.9812, "step": 109 }, { "epoch": 1.6417910447761193, "grad_norm": 0.35348981618881226, "learning_rate": 0.00010059243830591121, "loss": 0.9875, "step": 110 }, { "epoch": 1.6567164179104479, "grad_norm": 0.36440154910087585, "learning_rate": 9.915169877702095e-05, "loss": 0.9164, "step": 111 }, { "epoch": 1.671641791044776, "grad_norm": 0.3861341178417206, "learning_rate": 9.771377492533418e-05, "loss": 0.865, "step": 112 }, { "epoch": 1.6865671641791045, "grad_norm": 0.4054892957210541, "learning_rate": 9.627903996406892e-05, "loss": 0.8351, "step": 113 }, { "epoch": 1.7014925373134329, "grad_norm": 0.4275088906288147, "learning_rate": 9.484786627876654e-05, "loss": 0.8281, "step": 114 }, { "epoch": 1.716417910447761, "grad_norm": 0.420304536819458, "learning_rate": 9.342062533063898e-05, "loss": 0.8573, "step": 115 }, { "epoch": 1.7313432835820897, "grad_norm": 0.3914477527141571, "learning_rate": 9.199768756015603e-05, "loss": 0.7773, "step": 116 }, { "epoch": 1.7462686567164178, "grad_norm": 0.43206727504730225, "learning_rate": 9.057942229089747e-05, "loss": 0.7021, "step": 117 }, { "epoch": 1.7611940298507462, "grad_norm": 0.43827471137046814, "learning_rate": 8.916619763369516e-05, "loss": 1.1523, "step": 118 }, { "epoch": 1.7761194029850746, "grad_norm": 0.3688608407974243, "learning_rate": 8.775838039108974e-05, "loss": 1.0733, "step": 119 }, { "epoch": 1.7910447761194028, "grad_norm": 0.35826849937438965, "learning_rate": 8.635633596212718e-05, "loss": 1.0442, "step": 120 }, { "epoch": 1.8059701492537314, "grad_norm": 0.35874509811401367, "learning_rate": 8.496042824751926e-05, "loss": 1.0133, "step": 121 }, { "epoch": 1.8208955223880596, "grad_norm": 0.3662901818752289, "learning_rate": 8.357101955519324e-05, "loss": 0.9586, "step": 122 }, { "epoch": 1.835820895522388, "grad_norm": 0.3741875886917114, "learning_rate": 8.218847050625476e-05, "loss": 0.9409, "step": 123 }, { "epoch": 1.8507462686567164, "grad_norm": 0.378964364528656, "learning_rate": 8.081313994138857e-05, "loss": 0.8868, "step": 124 }, { "epoch": 1.8656716417910446, "grad_norm": 0.3909272849559784, "learning_rate": 7.944538482772156e-05, "loss": 0.8837, "step": 125 }, { "epoch": 1.8805970149253732, "grad_norm": 0.41188931465148926, "learning_rate": 7.808556016617178e-05, "loss": 0.8514, "step": 126 }, { "epoch": 1.8955223880597014, "grad_norm": 0.4490172863006592, "learning_rate": 7.673401889930802e-05, "loss": 0.8751, "step": 127 }, { "epoch": 1.9104477611940298, "grad_norm": 0.47430258989334106, "learning_rate": 7.539111181974343e-05, "loss": 0.8221, "step": 128 }, { "epoch": 1.9253731343283582, "grad_norm": 0.4374563992023468, "learning_rate": 7.405718747908743e-05, "loss": 0.7239, "step": 129 }, { "epoch": 1.9402985074626866, "grad_norm": 0.4078747034072876, "learning_rate": 7.273259209747896e-05, "loss": 0.8874, "step": 130 }, { "epoch": 1.955223880597015, "grad_norm": 0.42563360929489136, "learning_rate": 7.141766947372512e-05, "loss": 1.0419, "step": 131 }, { "epoch": 1.9701492537313432, "grad_norm": 0.4324108362197876, "learning_rate": 7.011276089606829e-05, "loss": 0.948, "step": 132 }, { "epoch": 1.9850746268656716, "grad_norm": 0.42789846658706665, "learning_rate": 6.881820505360443e-05, "loss": 0.852, "step": 133 }, { "epoch": 2.0, "grad_norm": 0.39731353521347046, "learning_rate": 6.753433794837662e-05, "loss": 0.8727, "step": 134 }, { "epoch": 2.014925373134328, "grad_norm": 0.35240477323532104, "learning_rate": 6.626149280816546e-05, "loss": 1.0679, "step": 135 }, { "epoch": 2.029850746268657, "grad_norm": 0.33283913135528564, "learning_rate": 6.500000000000002e-05, "loss": 1.0181, "step": 136 }, { "epoch": 2.044776119402985, "grad_norm": 0.3615310788154602, "learning_rate": 6.375018694441084e-05, "loss": 0.9978, "step": 137 }, { "epoch": 2.0597014925373136, "grad_norm": 0.36938410997390747, "learning_rate": 6.251237803044805e-05, "loss": 0.9883, "step": 138 }, { "epoch": 2.074626865671642, "grad_norm": 0.3807987570762634, "learning_rate": 6.128689453148619e-05, "loss": 0.9572, "step": 139 }, { "epoch": 2.08955223880597, "grad_norm": 0.37807610630989075, "learning_rate": 6.00740545218375e-05, "loss": 0.9174, "step": 140 }, { "epoch": 2.1044776119402986, "grad_norm": 0.37602487206459045, "learning_rate": 5.887417279419599e-05, "loss": 0.8229, "step": 141 }, { "epoch": 2.1194029850746268, "grad_norm": 0.35350197553634644, "learning_rate": 5.7687560777932735e-05, "loss": 0.8076, "step": 142 }, { "epoch": 2.1343283582089554, "grad_norm": 0.3940332233905792, "learning_rate": 5.651452645826445e-05, "loss": 0.788, "step": 143 }, { "epoch": 2.1492537313432836, "grad_norm": 0.46034398674964905, "learning_rate": 5.5355374296315995e-05, "loss": 0.7882, "step": 144 }, { "epoch": 2.1641791044776117, "grad_norm": 0.4225603938102722, "learning_rate": 5.421040515009737e-05, "loss": 0.7197, "step": 145 }, { "epoch": 2.1791044776119404, "grad_norm": 0.46008700132369995, "learning_rate": 5.3079916196416055e-05, "loss": 0.6569, "step": 146 }, { "epoch": 2.1940298507462686, "grad_norm": 0.41973477602005005, "learning_rate": 5.196420085374467e-05, "loss": 0.8682, "step": 147 }, { "epoch": 2.208955223880597, "grad_norm": 0.3677213191986084, "learning_rate": 5.0863548706064245e-05, "loss": 1.0353, "step": 148 }, { "epoch": 2.2238805970149254, "grad_norm": 0.37162861227989197, "learning_rate": 4.977824542770279e-05, "loss": 1.001, "step": 149 }, { "epoch": 2.2388059701492535, "grad_norm": 0.39737215638160706, "learning_rate": 4.870857270918825e-05, "loss": 0.9846, "step": 150 }, { "epoch": 2.253731343283582, "grad_norm": 0.38380125164985657, "learning_rate": 4.7654808184136064e-05, "loss": 0.9606, "step": 151 }, { "epoch": 2.2686567164179103, "grad_norm": 0.40244144201278687, "learning_rate": 4.6617225357188976e-05, "loss": 0.8571, "step": 152 }, { "epoch": 2.283582089552239, "grad_norm": 0.4329751431941986, "learning_rate": 4.5596093533029116e-05, "loss": 0.8531, "step": 153 }, { "epoch": 2.298507462686567, "grad_norm": 0.45405519008636475, "learning_rate": 4.459167774647993e-05, "loss": 0.8512, "step": 154 }, { "epoch": 2.3134328358208958, "grad_norm": 0.45590460300445557, "learning_rate": 4.360423869371629e-05, "loss": 0.8208, "step": 155 }, { "epoch": 2.328358208955224, "grad_norm": 0.4376915395259857, "learning_rate": 4.2634032664600895e-05, "loss": 0.7654, "step": 156 }, { "epoch": 2.343283582089552, "grad_norm": 0.45759543776512146, "learning_rate": 4.168131147616417e-05, "loss": 0.7857, "step": 157 }, { "epoch": 2.3582089552238807, "grad_norm": 0.4490528702735901, "learning_rate": 4.0746322407245066e-05, "loss": 0.7051, "step": 158 }, { "epoch": 2.373134328358209, "grad_norm": 0.4924563765525818, "learning_rate": 3.982930813430999e-05, "loss": 0.6348, "step": 159 }, { "epoch": 2.388059701492537, "grad_norm": 0.35502833127975464, "learning_rate": 3.893050666846596e-05, "loss": 1.1142, "step": 160 }, { "epoch": 2.4029850746268657, "grad_norm": 0.3795003890991211, "learning_rate": 3.805015129368492e-05, "loss": 1.0387, "step": 161 }, { "epoch": 2.417910447761194, "grad_norm": 0.3922593593597412, "learning_rate": 3.718847050625475e-05, "loss": 1.0402, "step": 162 }, { "epoch": 2.4328358208955225, "grad_norm": 0.4245050251483917, "learning_rate": 3.6345687955473166e-05, "loss": 0.9854, "step": 163 }, { "epoch": 2.4477611940298507, "grad_norm": 0.39441049098968506, "learning_rate": 3.552202238559953e-05, "loss": 0.9561, "step": 164 }, { "epoch": 2.4626865671641793, "grad_norm": 0.39788442850112915, "learning_rate": 3.4717687579079596e-05, "loss": 0.9104, "step": 165 }, { "epoch": 2.4776119402985075, "grad_norm": 0.4182056784629822, "learning_rate": 3.393289230105849e-05, "loss": 0.8841, "step": 166 }, { "epoch": 2.4925373134328357, "grad_norm": 0.42861151695251465, "learning_rate": 3.316784024519553e-05, "loss": 0.8055, "step": 167 }, { "epoch": 2.5074626865671643, "grad_norm": 0.42246565222740173, "learning_rate": 3.242272998079557e-05, "loss": 0.7947, "step": 168 }, { "epoch": 2.5223880597014925, "grad_norm": 0.46474263072013855, "learning_rate": 3.1697754901270473e-05, "loss": 0.8153, "step": 169 }, { "epoch": 2.5373134328358207, "grad_norm": 0.4996289312839508, "learning_rate": 3.099310317394359e-05, "loss": 0.7579, "step": 170 }, { "epoch": 2.5522388059701493, "grad_norm": 0.47399628162384033, "learning_rate": 3.030895769121112e-05, "loss": 0.6813, "step": 171 }, { "epoch": 2.5671641791044775, "grad_norm": 0.4417833983898163, "learning_rate": 2.9645496023072244e-05, "loss": 0.8971, "step": 172 }, { "epoch": 2.582089552238806, "grad_norm": 0.3691651225090027, "learning_rate": 2.9002890371040918e-05, "loss": 1.0862, "step": 173 }, { "epoch": 2.5970149253731343, "grad_norm": 0.4065288007259369, "learning_rate": 2.8381307523450916e-05, "loss": 1.031, "step": 174 }, { "epoch": 2.611940298507463, "grad_norm": 0.3905118405818939, "learning_rate": 2.778090881216592e-05, "loss": 0.9701, "step": 175 }, { "epoch": 2.626865671641791, "grad_norm": 0.39984792470932007, "learning_rate": 2.7201850070705826e-05, "loss": 0.9493, "step": 176 }, { "epoch": 2.6417910447761193, "grad_norm": 0.415585994720459, "learning_rate": 2.664428159380013e-05, "loss": 0.9129, "step": 177 }, { "epoch": 2.656716417910448, "grad_norm": 0.42336076498031616, "learning_rate": 2.610834809837891e-05, "loss": 0.8791, "step": 178 }, { "epoch": 2.671641791044776, "grad_norm": 0.45662274956703186, "learning_rate": 2.5594188686011615e-05, "loss": 0.871, "step": 179 }, { "epoch": 2.6865671641791042, "grad_norm": 0.4160149395465851, "learning_rate": 2.5101936806803117e-05, "loss": 0.7626, "step": 180 }, { "epoch": 2.701492537313433, "grad_norm": 0.43893417716026306, "learning_rate": 2.463172022475691e-05, "loss": 0.8046, "step": 181 }, { "epoch": 2.716417910447761, "grad_norm": 0.4579525291919708, "learning_rate": 2.418366098461374e-05, "loss": 0.7713, "step": 182 }, { "epoch": 2.7313432835820897, "grad_norm": 0.4761490523815155, "learning_rate": 2.3757875380175044e-05, "loss": 0.69, "step": 183 }, { "epoch": 2.746268656716418, "grad_norm": 0.5591773986816406, "learning_rate": 2.3354473924118842e-05, "loss": 0.6075, "step": 184 }, { "epoch": 2.7611940298507465, "grad_norm": 0.3821795880794525, "learning_rate": 2.297356131931614e-05, "loss": 1.0839, "step": 185 }, { "epoch": 2.7761194029850746, "grad_norm": 0.37466174364089966, "learning_rate": 2.261523643165532e-05, "loss": 1.0221, "step": 186 }, { "epoch": 2.791044776119403, "grad_norm": 0.3825508654117584, "learning_rate": 2.22795922643815e-05, "loss": 1.0, "step": 187 }, { "epoch": 2.8059701492537314, "grad_norm": 0.41949060559272766, "learning_rate": 2.196671593395749e-05, "loss": 0.9473, "step": 188 }, { "epoch": 2.8208955223880596, "grad_norm": 0.42044076323509216, "learning_rate": 2.167668864745279e-05, "loss": 0.8887, "step": 189 }, { "epoch": 2.835820895522388, "grad_norm": 0.4112393856048584, "learning_rate": 2.1409585681466204e-05, "loss": 0.8724, "step": 190 }, { "epoch": 2.8507462686567164, "grad_norm": 0.45745235681533813, "learning_rate": 2.1165476362587846e-05, "loss": 0.8562, "step": 191 }, { "epoch": 2.8656716417910446, "grad_norm": 0.4491675794124603, "learning_rate": 2.09444240494054e-05, "loss": 0.8593, "step": 192 }, { "epoch": 2.8805970149253732, "grad_norm": 0.4317816197872162, "learning_rate": 2.0746486116059418e-05, "loss": 0.7933, "step": 193 }, { "epoch": 2.8955223880597014, "grad_norm": 0.46604618430137634, "learning_rate": 2.0571713937351834e-05, "loss": 0.7903, "step": 194 }, { "epoch": 2.91044776119403, "grad_norm": 0.48169732093811035, "learning_rate": 2.0420152875411624e-05, "loss": 0.7668, "step": 195 }, { "epoch": 2.925373134328358, "grad_norm": 0.4627380073070526, "learning_rate": 2.0291842267921108e-05, "loss": 0.6404, "step": 196 }, { "epoch": 2.9402985074626864, "grad_norm": 0.44821980595588684, "learning_rate": 2.0186815417905787e-05, "loss": 0.8672, "step": 197 }, { "epoch": 2.955223880597015, "grad_norm": 0.3909732699394226, "learning_rate": 2.0105099585090603e-05, "loss": 0.9487, "step": 198 }, { "epoch": 2.970149253731343, "grad_norm": 0.4137306213378906, "learning_rate": 2.0046715978824664e-05, "loss": 0.8438, "step": 199 }, { "epoch": 2.9850746268656714, "grad_norm": 0.4548170864582062, "learning_rate": 2.001167975257628e-05, "loss": 0.8052, "step": 200 }, { "epoch": 3.0, "grad_norm": 0.4302070140838623, "learning_rate": 2e-05, "loss": 0.8199, "step": 201 } ], "logging_steps": 1, "max_steps": 201, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.25183025013588e+17, "train_batch_size": 18, "trial_name": null, "trial_params": null }