|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.7414272474513438, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0037071362372567192, |
|
"grad_norm": 167957.515625, |
|
"learning_rate": 0.0003992, |
|
"loss": 10.791, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0074142724745134385, |
|
"grad_norm": 109901.1953125, |
|
"learning_rate": 0.00039840000000000003, |
|
"loss": 10.4195, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.011121408711770158, |
|
"grad_norm": 101352.265625, |
|
"learning_rate": 0.0003976, |
|
"loss": 10.0242, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.014828544949026877, |
|
"grad_norm": 105144.921875, |
|
"learning_rate": 0.0003968, |
|
"loss": 9.5684, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.018535681186283594, |
|
"grad_norm": 107496.6640625, |
|
"learning_rate": 0.00039600000000000003, |
|
"loss": 9.1775, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.022242817423540315, |
|
"grad_norm": 99089.9453125, |
|
"learning_rate": 0.0003952, |
|
"loss": 8.9319, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.025949953660797033, |
|
"grad_norm": 91537.4765625, |
|
"learning_rate": 0.0003944, |
|
"loss": 8.4907, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.029657089898053754, |
|
"grad_norm": 113517.7265625, |
|
"learning_rate": 0.0003936, |
|
"loss": 8.1206, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.033364226135310475, |
|
"grad_norm": 120697.0546875, |
|
"learning_rate": 0.0003928, |
|
"loss": 8.0377, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.03707136237256719, |
|
"grad_norm": 115909.0546875, |
|
"learning_rate": 0.000392, |
|
"loss": 7.9754, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04077849860982391, |
|
"grad_norm": 116857.3359375, |
|
"learning_rate": 0.0003912, |
|
"loss": 7.8788, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.04448563484708063, |
|
"grad_norm": 101768.7109375, |
|
"learning_rate": 0.0003904, |
|
"loss": 7.8914, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.04819277108433735, |
|
"grad_norm": 101978.6015625, |
|
"learning_rate": 0.0003896, |
|
"loss": 7.8178, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.051899907321594066, |
|
"grad_norm": 90827.578125, |
|
"learning_rate": 0.0003888, |
|
"loss": 7.85, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.05560704355885079, |
|
"grad_norm": 82672.1640625, |
|
"learning_rate": 0.000388, |
|
"loss": 7.8152, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.05931417979610751, |
|
"grad_norm": 65482.09375, |
|
"learning_rate": 0.00038720000000000003, |
|
"loss": 7.8131, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.06302131603336422, |
|
"grad_norm": 55323.29296875, |
|
"learning_rate": 0.0003864, |
|
"loss": 7.6994, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.06672845227062095, |
|
"grad_norm": 94588.7109375, |
|
"learning_rate": 0.0003856, |
|
"loss": 7.8545, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.07043558850787766, |
|
"grad_norm": 50202.546875, |
|
"learning_rate": 0.00038480000000000003, |
|
"loss": 7.75, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.07414272474513438, |
|
"grad_norm": 48727.80859375, |
|
"learning_rate": 0.000384, |
|
"loss": 7.7449, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0778498609823911, |
|
"grad_norm": 53795.23046875, |
|
"learning_rate": 0.0003832, |
|
"loss": 7.702, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.08155699721964782, |
|
"grad_norm": 55052.234375, |
|
"learning_rate": 0.0003824, |
|
"loss": 7.7048, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.08526413345690455, |
|
"grad_norm": 35977.5625, |
|
"learning_rate": 0.0003816, |
|
"loss": 7.7986, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.08897126969416126, |
|
"grad_norm": 55099.60546875, |
|
"learning_rate": 0.0003808, |
|
"loss": 7.8071, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.09267840593141798, |
|
"grad_norm": 34977.36328125, |
|
"learning_rate": 0.00038, |
|
"loss": 7.8473, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0963855421686747, |
|
"grad_norm": 35271.6640625, |
|
"learning_rate": 0.0003792, |
|
"loss": 7.7099, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.10009267840593142, |
|
"grad_norm": 44887.39453125, |
|
"learning_rate": 0.0003784, |
|
"loss": 7.617, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.10379981464318813, |
|
"grad_norm": 41191.33203125, |
|
"learning_rate": 0.0003776, |
|
"loss": 7.697, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.10750695088044486, |
|
"grad_norm": 45202.97265625, |
|
"learning_rate": 0.0003768, |
|
"loss": 7.7264, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.11121408711770157, |
|
"grad_norm": 44944.65234375, |
|
"learning_rate": 0.000376, |
|
"loss": 7.7159, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11492122335495829, |
|
"grad_norm": 34502.83203125, |
|
"learning_rate": 0.0003752, |
|
"loss": 7.7213, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.11862835959221502, |
|
"grad_norm": 38415.63671875, |
|
"learning_rate": 0.00037440000000000005, |
|
"loss": 7.6674, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.12233549582947173, |
|
"grad_norm": 34140.18359375, |
|
"learning_rate": 0.00037360000000000003, |
|
"loss": 7.6627, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.12604263206672844, |
|
"grad_norm": 27067.009765625, |
|
"learning_rate": 0.00037280000000000006, |
|
"loss": 7.6678, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.12974976830398516, |
|
"grad_norm": 34192.23828125, |
|
"learning_rate": 0.00037200000000000004, |
|
"loss": 7.7438, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.1334569045412419, |
|
"grad_norm": 42940.6953125, |
|
"learning_rate": 0.0003712, |
|
"loss": 7.6559, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.1371640407784986, |
|
"grad_norm": 28908.26171875, |
|
"learning_rate": 0.00037040000000000006, |
|
"loss": 7.6763, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.14087117701575533, |
|
"grad_norm": 46989.23046875, |
|
"learning_rate": 0.00036960000000000004, |
|
"loss": 7.6483, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.14457831325301204, |
|
"grad_norm": 38628.00390625, |
|
"learning_rate": 0.0003688, |
|
"loss": 7.5813, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.14828544949026876, |
|
"grad_norm": 26901.994140625, |
|
"learning_rate": 0.00036800000000000005, |
|
"loss": 7.7328, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1519925857275255, |
|
"grad_norm": 55413.51953125, |
|
"learning_rate": 0.00036720000000000004, |
|
"loss": 7.5977, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.1556997219647822, |
|
"grad_norm": 38922.68359375, |
|
"learning_rate": 0.0003664, |
|
"loss": 7.6575, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.15940685820203893, |
|
"grad_norm": 49835.87109375, |
|
"learning_rate": 0.00036560000000000005, |
|
"loss": 7.7382, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.16311399443929564, |
|
"grad_norm": 41342.8515625, |
|
"learning_rate": 0.00036480000000000003, |
|
"loss": 7.7068, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.16682113067655235, |
|
"grad_norm": 38896.15625, |
|
"learning_rate": 0.000364, |
|
"loss": 7.6614, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1705282669138091, |
|
"grad_norm": 29027.955078125, |
|
"learning_rate": 0.00036320000000000005, |
|
"loss": 7.728, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.1742354031510658, |
|
"grad_norm": 33758.0859375, |
|
"learning_rate": 0.0003624, |
|
"loss": 7.7392, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.17794253938832252, |
|
"grad_norm": 29002.869140625, |
|
"learning_rate": 0.0003616, |
|
"loss": 7.666, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.18164967562557924, |
|
"grad_norm": 33393.12890625, |
|
"learning_rate": 0.00036080000000000004, |
|
"loss": 7.6067, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.18535681186283595, |
|
"grad_norm": 39193.51171875, |
|
"learning_rate": 0.00036, |
|
"loss": 7.7868, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.18906394810009267, |
|
"grad_norm": 25982.78125, |
|
"learning_rate": 0.0003592, |
|
"loss": 7.7189, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.1927710843373494, |
|
"grad_norm": 28694.505859375, |
|
"learning_rate": 0.00035840000000000004, |
|
"loss": 7.6999, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.19647822057460612, |
|
"grad_norm": 26356.8828125, |
|
"learning_rate": 0.0003576, |
|
"loss": 7.712, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.20018535681186284, |
|
"grad_norm": 25880.298828125, |
|
"learning_rate": 0.0003568, |
|
"loss": 7.7015, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.20389249304911955, |
|
"grad_norm": 23557.111328125, |
|
"learning_rate": 0.00035600000000000003, |
|
"loss": 7.6849, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.20759962928637626, |
|
"grad_norm": 31365.33203125, |
|
"learning_rate": 0.0003552, |
|
"loss": 7.7333, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.211306765523633, |
|
"grad_norm": 31506.552734375, |
|
"learning_rate": 0.0003544, |
|
"loss": 7.7317, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.21501390176088972, |
|
"grad_norm": 22261.244140625, |
|
"learning_rate": 0.00035360000000000003, |
|
"loss": 7.6978, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.21872103799814643, |
|
"grad_norm": 36267.4921875, |
|
"learning_rate": 0.0003528, |
|
"loss": 7.7125, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.22242817423540315, |
|
"grad_norm": 29624.087890625, |
|
"learning_rate": 0.00035200000000000005, |
|
"loss": 7.734, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.22613531047265986, |
|
"grad_norm": 25301.228515625, |
|
"learning_rate": 0.0003512, |
|
"loss": 7.7287, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.22984244670991658, |
|
"grad_norm": 26147.228515625, |
|
"learning_rate": 0.0003504, |
|
"loss": 7.7059, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.23354958294717332, |
|
"grad_norm": 27329.443359375, |
|
"learning_rate": 0.00034960000000000004, |
|
"loss": 7.6798, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.23725671918443003, |
|
"grad_norm": 23415.9609375, |
|
"learning_rate": 0.0003488, |
|
"loss": 7.6968, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.24096385542168675, |
|
"grad_norm": 23625.1171875, |
|
"learning_rate": 0.000348, |
|
"loss": 7.7119, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.24467099165894346, |
|
"grad_norm": 23805.42578125, |
|
"learning_rate": 0.00034720000000000004, |
|
"loss": 7.6473, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.24837812789620017, |
|
"grad_norm": 47364.8203125, |
|
"learning_rate": 0.0003464, |
|
"loss": 7.7921, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.2520852641334569, |
|
"grad_norm": 29178.279296875, |
|
"learning_rate": 0.0003456, |
|
"loss": 7.6958, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.2557924003707136, |
|
"grad_norm": 26202.958984375, |
|
"learning_rate": 0.00034480000000000003, |
|
"loss": 7.7765, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.2594995366079703, |
|
"grad_norm": 48753.58203125, |
|
"learning_rate": 0.000344, |
|
"loss": 7.6496, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2632066728452271, |
|
"grad_norm": 24508.509765625, |
|
"learning_rate": 0.0003432, |
|
"loss": 7.7125, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.2669138090824838, |
|
"grad_norm": 33996.55078125, |
|
"learning_rate": 0.00034240000000000003, |
|
"loss": 7.6635, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.2706209453197405, |
|
"grad_norm": 32989.36328125, |
|
"learning_rate": 0.0003416, |
|
"loss": 7.6893, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.2743280815569972, |
|
"grad_norm": 32296.1796875, |
|
"learning_rate": 0.0003408, |
|
"loss": 7.6696, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.27803521779425394, |
|
"grad_norm": 35698.16015625, |
|
"learning_rate": 0.00034, |
|
"loss": 7.6713, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.28174235403151066, |
|
"grad_norm": 25034.283203125, |
|
"learning_rate": 0.0003392, |
|
"loss": 7.6629, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.28544949026876737, |
|
"grad_norm": 36568.65625, |
|
"learning_rate": 0.0003384, |
|
"loss": 7.7075, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.2891566265060241, |
|
"grad_norm": 25048.875, |
|
"learning_rate": 0.0003376, |
|
"loss": 7.6727, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.2928637627432808, |
|
"grad_norm": 25438.61328125, |
|
"learning_rate": 0.0003368, |
|
"loss": 7.7028, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.2965708989805375, |
|
"grad_norm": 27428.9453125, |
|
"learning_rate": 0.000336, |
|
"loss": 7.6516, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.3002780352177943, |
|
"grad_norm": 32185.8125, |
|
"learning_rate": 0.0003352, |
|
"loss": 7.7127, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.303985171455051, |
|
"grad_norm": 28342.439453125, |
|
"learning_rate": 0.0003344, |
|
"loss": 7.6461, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 22977.4140625, |
|
"learning_rate": 0.0003336, |
|
"loss": 7.6348, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.3113994439295644, |
|
"grad_norm": 28778.767578125, |
|
"learning_rate": 0.0003328, |
|
"loss": 7.6299, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.31510658016682114, |
|
"grad_norm": 21658.966796875, |
|
"learning_rate": 0.000332, |
|
"loss": 7.633, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.31881371640407785, |
|
"grad_norm": 22994.66796875, |
|
"learning_rate": 0.0003312, |
|
"loss": 7.648, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.32252085264133457, |
|
"grad_norm": 23064.05078125, |
|
"learning_rate": 0.0003304, |
|
"loss": 7.712, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.3262279888785913, |
|
"grad_norm": 34689.19140625, |
|
"learning_rate": 0.0003296, |
|
"loss": 7.6168, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.329935125115848, |
|
"grad_norm": 26677.1328125, |
|
"learning_rate": 0.0003288, |
|
"loss": 7.6226, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.3336422613531047, |
|
"grad_norm": 39699.62109375, |
|
"learning_rate": 0.000328, |
|
"loss": 7.6465, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3373493975903614, |
|
"grad_norm": 47106.6640625, |
|
"learning_rate": 0.0003272, |
|
"loss": 7.6884, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.3410565338276182, |
|
"grad_norm": 30162.638671875, |
|
"learning_rate": 0.0003264, |
|
"loss": 7.7695, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.3447636700648749, |
|
"grad_norm": 40879.01953125, |
|
"learning_rate": 0.0003256, |
|
"loss": 7.7253, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.3484708063021316, |
|
"grad_norm": 56518.4921875, |
|
"learning_rate": 0.00032480000000000003, |
|
"loss": 7.6734, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.35217794253938833, |
|
"grad_norm": 37450.08203125, |
|
"learning_rate": 0.000324, |
|
"loss": 7.6897, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.35588507877664505, |
|
"grad_norm": 28603.978515625, |
|
"learning_rate": 0.00032320000000000005, |
|
"loss": 7.7346, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.35959221501390176, |
|
"grad_norm": 45344.12109375, |
|
"learning_rate": 0.00032240000000000003, |
|
"loss": 7.7564, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.3632993512511585, |
|
"grad_norm": 20206.189453125, |
|
"learning_rate": 0.0003216, |
|
"loss": 7.6465, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.3670064874884152, |
|
"grad_norm": 29952.62890625, |
|
"learning_rate": 0.00032080000000000005, |
|
"loss": 7.6581, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.3707136237256719, |
|
"grad_norm": 24017.02734375, |
|
"learning_rate": 0.00032, |
|
"loss": 7.7068, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3744207599629286, |
|
"grad_norm": 21995.66796875, |
|
"learning_rate": 0.0003192, |
|
"loss": 7.7306, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.37812789620018533, |
|
"grad_norm": 22698.15625, |
|
"learning_rate": 0.00031840000000000004, |
|
"loss": 7.6167, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.3818350324374421, |
|
"grad_norm": 19390.587890625, |
|
"learning_rate": 0.0003176, |
|
"loss": 7.6298, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.3855421686746988, |
|
"grad_norm": 23548.39453125, |
|
"learning_rate": 0.00031680000000000006, |
|
"loss": 7.7148, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.38924930491195553, |
|
"grad_norm": 25070.564453125, |
|
"learning_rate": 0.00031600000000000004, |
|
"loss": 7.8045, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.39295644114921224, |
|
"grad_norm": 39852.94921875, |
|
"learning_rate": 0.0003152, |
|
"loss": 7.6813, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.39666357738646896, |
|
"grad_norm": 30994.017578125, |
|
"learning_rate": 0.00031440000000000005, |
|
"loss": 7.6801, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.40037071362372567, |
|
"grad_norm": 35010.94140625, |
|
"learning_rate": 0.00031360000000000003, |
|
"loss": 7.7625, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.4040778498609824, |
|
"grad_norm": 32364.001953125, |
|
"learning_rate": 0.0003128, |
|
"loss": 7.682, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.4077849860982391, |
|
"grad_norm": 24475.48828125, |
|
"learning_rate": 0.00031200000000000005, |
|
"loss": 7.6953, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.4114921223354958, |
|
"grad_norm": 28467.2890625, |
|
"learning_rate": 0.00031120000000000003, |
|
"loss": 7.7112, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.4151992585727525, |
|
"grad_norm": 46241.89453125, |
|
"learning_rate": 0.0003104, |
|
"loss": 7.625, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.41890639481000924, |
|
"grad_norm": 25736.814453125, |
|
"learning_rate": 0.00030960000000000004, |
|
"loss": 7.6842, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.422613531047266, |
|
"grad_norm": 25479.744140625, |
|
"learning_rate": 0.0003088, |
|
"loss": 7.7131, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.4263206672845227, |
|
"grad_norm": 32374.447265625, |
|
"learning_rate": 0.000308, |
|
"loss": 7.7209, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.43002780352177944, |
|
"grad_norm": 21930.126953125, |
|
"learning_rate": 0.00030720000000000004, |
|
"loss": 7.6593, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.43373493975903615, |
|
"grad_norm": 22632.013671875, |
|
"learning_rate": 0.0003064, |
|
"loss": 7.7121, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.43744207599629287, |
|
"grad_norm": 21551.6328125, |
|
"learning_rate": 0.0003056, |
|
"loss": 7.6504, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.4411492122335496, |
|
"grad_norm": 24234.326171875, |
|
"learning_rate": 0.00030480000000000004, |
|
"loss": 7.7, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.4448563484708063, |
|
"grad_norm": 27236.205078125, |
|
"learning_rate": 0.000304, |
|
"loss": 7.7073, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.448563484708063, |
|
"grad_norm": 20109.84765625, |
|
"learning_rate": 0.0003032, |
|
"loss": 7.642, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.4522706209453197, |
|
"grad_norm": 20982.546875, |
|
"learning_rate": 0.00030240000000000003, |
|
"loss": 7.7092, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.45597775718257644, |
|
"grad_norm": 30563.40625, |
|
"learning_rate": 0.0003016, |
|
"loss": 7.6086, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.45968489341983315, |
|
"grad_norm": 26537.8828125, |
|
"learning_rate": 0.0003008, |
|
"loss": 7.711, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.4633920296570899, |
|
"grad_norm": 26180.9765625, |
|
"learning_rate": 0.00030000000000000003, |
|
"loss": 7.6946, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.46709916589434664, |
|
"grad_norm": 25894.8828125, |
|
"learning_rate": 0.0002992, |
|
"loss": 7.6252, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.47080630213160335, |
|
"grad_norm": 17775.234375, |
|
"learning_rate": 0.0002984, |
|
"loss": 7.7064, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.47451343836886006, |
|
"grad_norm": 23387.5625, |
|
"learning_rate": 0.0002976, |
|
"loss": 7.606, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.4782205746061168, |
|
"grad_norm": 26294.63671875, |
|
"learning_rate": 0.0002968, |
|
"loss": 7.6753, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.4819277108433735, |
|
"grad_norm": 22350.404296875, |
|
"learning_rate": 0.000296, |
|
"loss": 7.6926, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4856348470806302, |
|
"grad_norm": 23048.61328125, |
|
"learning_rate": 0.0002952, |
|
"loss": 7.6476, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.4893419833178869, |
|
"grad_norm": 26630.447265625, |
|
"learning_rate": 0.0002944, |
|
"loss": 7.7831, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.49304911955514363, |
|
"grad_norm": 34660.65234375, |
|
"learning_rate": 0.00029360000000000003, |
|
"loss": 7.5954, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.49675625579240035, |
|
"grad_norm": 19611.568359375, |
|
"learning_rate": 0.0002928, |
|
"loss": 7.6305, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.5004633920296571, |
|
"grad_norm": 38032.05078125, |
|
"learning_rate": 0.000292, |
|
"loss": 7.725, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.5041705282669138, |
|
"grad_norm": 26124.802734375, |
|
"learning_rate": 0.00029120000000000003, |
|
"loss": 7.6547, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.5078776645041705, |
|
"grad_norm": 22567.94921875, |
|
"learning_rate": 0.0002904, |
|
"loss": 7.7534, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.5115848007414272, |
|
"grad_norm": 37485.49609375, |
|
"learning_rate": 0.0002896, |
|
"loss": 7.6795, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.5152919369786839, |
|
"grad_norm": 32182.43359375, |
|
"learning_rate": 0.0002888, |
|
"loss": 7.7417, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.5189990732159406, |
|
"grad_norm": 24093.3125, |
|
"learning_rate": 0.000288, |
|
"loss": 7.6875, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5227062094531975, |
|
"grad_norm": 23480.59765625, |
|
"learning_rate": 0.0002872, |
|
"loss": 7.6571, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.5264133456904542, |
|
"grad_norm": 34477.796875, |
|
"learning_rate": 0.0002864, |
|
"loss": 7.6389, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.5301204819277109, |
|
"grad_norm": 32023.896484375, |
|
"learning_rate": 0.0002856, |
|
"loss": 7.7501, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.5338276181649676, |
|
"grad_norm": 21589.513671875, |
|
"learning_rate": 0.0002848, |
|
"loss": 7.6895, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.5375347544022243, |
|
"grad_norm": 31786.94921875, |
|
"learning_rate": 0.000284, |
|
"loss": 7.7106, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.541241890639481, |
|
"grad_norm": 31673.8359375, |
|
"learning_rate": 0.0002832, |
|
"loss": 7.6815, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.5449490268767377, |
|
"grad_norm": 17670.734375, |
|
"learning_rate": 0.0002824, |
|
"loss": 7.6869, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.5486561631139945, |
|
"grad_norm": 34063.0703125, |
|
"learning_rate": 0.0002816, |
|
"loss": 7.7108, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.5523632993512512, |
|
"grad_norm": 36702.2734375, |
|
"learning_rate": 0.0002808, |
|
"loss": 7.7124, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.5560704355885079, |
|
"grad_norm": 22709.572265625, |
|
"learning_rate": 0.00028, |
|
"loss": 7.7326, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5597775718257646, |
|
"grad_norm": 36804.21484375, |
|
"learning_rate": 0.0002792, |
|
"loss": 7.6414, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.5634847080630213, |
|
"grad_norm": 30339.912109375, |
|
"learning_rate": 0.0002784, |
|
"loss": 7.7337, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.567191844300278, |
|
"grad_norm": 31866.80859375, |
|
"learning_rate": 0.00027759999999999997, |
|
"loss": 7.6208, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.5708989805375347, |
|
"grad_norm": 23864.302734375, |
|
"learning_rate": 0.0002768, |
|
"loss": 7.7083, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.5746061167747915, |
|
"grad_norm": 29230.330078125, |
|
"learning_rate": 0.000276, |
|
"loss": 7.6914, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5783132530120482, |
|
"grad_norm": 21988.8046875, |
|
"learning_rate": 0.00027519999999999997, |
|
"loss": 7.7157, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.5820203892493049, |
|
"grad_norm": 21070.361328125, |
|
"learning_rate": 0.00027440000000000006, |
|
"loss": 7.6987, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.5857275254865616, |
|
"grad_norm": 39177.30859375, |
|
"learning_rate": 0.00027360000000000004, |
|
"loss": 7.5922, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.5894346617238183, |
|
"grad_norm": 20961.755859375, |
|
"learning_rate": 0.0002728, |
|
"loss": 7.7621, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.593141797961075, |
|
"grad_norm": 24547.12890625, |
|
"learning_rate": 0.00027200000000000005, |
|
"loss": 7.7387, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5968489341983317, |
|
"grad_norm": 17789.8125, |
|
"learning_rate": 0.00027120000000000003, |
|
"loss": 7.6818, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.6005560704355886, |
|
"grad_norm": 21633.90625, |
|
"learning_rate": 0.0002704, |
|
"loss": 7.6545, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.6042632066728453, |
|
"grad_norm": 17543.3046875, |
|
"learning_rate": 0.00026960000000000005, |
|
"loss": 7.6662, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.607970342910102, |
|
"grad_norm": 18747.458984375, |
|
"learning_rate": 0.00026880000000000003, |
|
"loss": 7.6227, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.6116774791473587, |
|
"grad_norm": 22172.224609375, |
|
"learning_rate": 0.000268, |
|
"loss": 7.6899, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 19154.330078125, |
|
"learning_rate": 0.00026720000000000004, |
|
"loss": 7.6195, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.6190917516218721, |
|
"grad_norm": 20868.43359375, |
|
"learning_rate": 0.0002664, |
|
"loss": 7.677, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.6227988878591288, |
|
"grad_norm": 18564.533203125, |
|
"learning_rate": 0.0002656, |
|
"loss": 7.696, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.6265060240963856, |
|
"grad_norm": 22970.892578125, |
|
"learning_rate": 0.00026480000000000004, |
|
"loss": 7.6589, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.6302131603336423, |
|
"grad_norm": 18157.03515625, |
|
"learning_rate": 0.000264, |
|
"loss": 7.7017, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.633920296570899, |
|
"grad_norm": 20085.443359375, |
|
"learning_rate": 0.0002632, |
|
"loss": 7.7293, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.6376274328081557, |
|
"grad_norm": 26864.5390625, |
|
"learning_rate": 0.00026240000000000004, |
|
"loss": 7.5853, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.6413345690454124, |
|
"grad_norm": 21249.70703125, |
|
"learning_rate": 0.0002616, |
|
"loss": 7.7276, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.6450417052826691, |
|
"grad_norm": 17884.49609375, |
|
"learning_rate": 0.0002608, |
|
"loss": 7.7034, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.6487488415199258, |
|
"grad_norm": 19097.380859375, |
|
"learning_rate": 0.00026000000000000003, |
|
"loss": 7.7472, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.6524559777571826, |
|
"grad_norm": 21432.216796875, |
|
"learning_rate": 0.0002592, |
|
"loss": 7.7052, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.6561631139944393, |
|
"grad_norm": 17022.677734375, |
|
"learning_rate": 0.00025840000000000005, |
|
"loss": 7.7127, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.659870250231696, |
|
"grad_norm": 21216.20703125, |
|
"learning_rate": 0.00025760000000000003, |
|
"loss": 7.5911, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.6635773864689527, |
|
"grad_norm": 21638.240234375, |
|
"learning_rate": 0.0002568, |
|
"loss": 7.5969, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.6672845227062094, |
|
"grad_norm": 27894.361328125, |
|
"learning_rate": 0.00025600000000000004, |
|
"loss": 7.6331, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6709916589434661, |
|
"grad_norm": 21034.33984375, |
|
"learning_rate": 0.0002552, |
|
"loss": 7.6371, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.6746987951807228, |
|
"grad_norm": 25746.513671875, |
|
"learning_rate": 0.0002544, |
|
"loss": 7.6462, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.6784059314179796, |
|
"grad_norm": 23690.24609375, |
|
"learning_rate": 0.00025360000000000004, |
|
"loss": 7.6662, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.6821130676552364, |
|
"grad_norm": 19138.052734375, |
|
"learning_rate": 0.0002528, |
|
"loss": 7.74, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.6858202038924931, |
|
"grad_norm": 20391.046875, |
|
"learning_rate": 0.000252, |
|
"loss": 7.7163, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6895273401297498, |
|
"grad_norm": 17356.830078125, |
|
"learning_rate": 0.00025120000000000003, |
|
"loss": 7.6277, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.6932344763670065, |
|
"grad_norm": 27145.943359375, |
|
"learning_rate": 0.0002504, |
|
"loss": 7.7351, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.6969416126042632, |
|
"grad_norm": 18061.5703125, |
|
"learning_rate": 0.0002496, |
|
"loss": 7.6895, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.70064874884152, |
|
"grad_norm": 17943.388671875, |
|
"learning_rate": 0.00024880000000000003, |
|
"loss": 7.7073, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.7043558850787767, |
|
"grad_norm": 19911.068359375, |
|
"learning_rate": 0.000248, |
|
"loss": 7.7247, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.7080630213160334, |
|
"grad_norm": 23313.1328125, |
|
"learning_rate": 0.0002472, |
|
"loss": 7.6459, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.7117701575532901, |
|
"grad_norm": 18374.34375, |
|
"learning_rate": 0.0002464, |
|
"loss": 7.6853, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.7154772937905468, |
|
"grad_norm": 18763.783203125, |
|
"learning_rate": 0.0002456, |
|
"loss": 7.6097, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.7191844300278035, |
|
"grad_norm": 18051.265625, |
|
"learning_rate": 0.0002448, |
|
"loss": 7.6265, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.7228915662650602, |
|
"grad_norm": 21930.23828125, |
|
"learning_rate": 0.000244, |
|
"loss": 7.7064, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.726598702502317, |
|
"grad_norm": 21661.873046875, |
|
"learning_rate": 0.0002432, |
|
"loss": 7.7374, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.7303058387395737, |
|
"grad_norm": 26628.837890625, |
|
"learning_rate": 0.0002424, |
|
"loss": 7.6806, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.7340129749768304, |
|
"grad_norm": 24882.0234375, |
|
"learning_rate": 0.0002416, |
|
"loss": 7.6327, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.7377201112140871, |
|
"grad_norm": 25492.328125, |
|
"learning_rate": 0.0002408, |
|
"loss": 7.6956, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.7414272474513438, |
|
"grad_norm": 27734.201171875, |
|
"learning_rate": 0.00024, |
|
"loss": 7.6169, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.78927800680448e+16, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|