|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.996415770609319, |
|
"eval_steps": 500, |
|
"global_step": 1254, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.023894862604540025, |
|
"grad_norm": 6.081180104447192, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9213, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04778972520908005, |
|
"grad_norm": 2.5675420707703682, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8064, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07168458781362007, |
|
"grad_norm": 1.116884956350582, |
|
"learning_rate": 5e-06, |
|
"loss": 0.764, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0955794504181601, |
|
"grad_norm": 0.9827896973879405, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7456, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11947431302270012, |
|
"grad_norm": 1.1716177346699552, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7252, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.14336917562724014, |
|
"grad_norm": 1.054189475268262, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7144, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.16726403823178015, |
|
"grad_norm": 0.7494472231038515, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7153, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1911589008363202, |
|
"grad_norm": 0.5903583553666529, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6941, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.21505376344086022, |
|
"grad_norm": 0.7850320924725688, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6917, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.23894862604540024, |
|
"grad_norm": 0.8082497951696401, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6931, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2628434886499403, |
|
"grad_norm": 0.5831514385960807, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6808, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2867383512544803, |
|
"grad_norm": 0.7344040626713287, |
|
"learning_rate": 5e-06, |
|
"loss": 0.689, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3106332138590203, |
|
"grad_norm": 0.8291631762782786, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6852, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3345280764635603, |
|
"grad_norm": 0.555446198309624, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6746, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.35842293906810035, |
|
"grad_norm": 0.8214482724693175, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6789, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3823178016726404, |
|
"grad_norm": 0.5332479678739207, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6676, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4062126642771804, |
|
"grad_norm": 1.0446840993388027, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6669, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.43010752688172044, |
|
"grad_norm": 0.5354819297917649, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6671, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4540023894862604, |
|
"grad_norm": 0.5790753975231967, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6637, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4778972520908005, |
|
"grad_norm": 0.5611754139446838, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6706, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5017921146953405, |
|
"grad_norm": 0.541229197735182, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6678, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5256869772998806, |
|
"grad_norm": 0.6290230152316767, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6615, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5495818399044206, |
|
"grad_norm": 0.5103134958958712, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6621, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5734767025089605, |
|
"grad_norm": 0.4963914880777678, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6587, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5973715651135006, |
|
"grad_norm": 0.7047770736230026, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6633, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6212664277180406, |
|
"grad_norm": 0.6875405023947134, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6602, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6451612903225806, |
|
"grad_norm": 0.5469403807072362, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6649, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6690561529271206, |
|
"grad_norm": 0.6301316104025243, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6605, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6929510155316607, |
|
"grad_norm": 0.7436109186331767, |
|
"learning_rate": 5e-06, |
|
"loss": 0.657, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7168458781362007, |
|
"grad_norm": 0.6316182942840975, |
|
"learning_rate": 5e-06, |
|
"loss": 0.659, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7407407407407407, |
|
"grad_norm": 0.5862276185836299, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6543, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7646356033452808, |
|
"grad_norm": 0.5727274506679324, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6578, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.7885304659498208, |
|
"grad_norm": 0.8637237307062305, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6572, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8124253285543608, |
|
"grad_norm": 0.5716637099962937, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6544, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8363201911589009, |
|
"grad_norm": 0.5267829079741267, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6542, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8602150537634409, |
|
"grad_norm": 0.5819870889688735, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6524, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8841099163679809, |
|
"grad_norm": 0.5230379233220511, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6561, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9080047789725209, |
|
"grad_norm": 0.5405733925967506, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6576, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.931899641577061, |
|
"grad_norm": 0.6252081499402665, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6431, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.955794504181601, |
|
"grad_norm": 0.5241989931128205, |
|
"learning_rate": 5e-06, |
|
"loss": 0.645, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9796893667861409, |
|
"grad_norm": 0.5917706193349264, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6548, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.998805256869773, |
|
"eval_loss": 0.6481666564941406, |
|
"eval_runtime": 225.2584, |
|
"eval_samples_per_second": 50.045, |
|
"eval_steps_per_second": 0.395, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.003584229390681, |
|
"grad_norm": 0.8892508812440383, |
|
"learning_rate": 5e-06, |
|
"loss": 0.645, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.027479091995221, |
|
"grad_norm": 0.6856782905740205, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6088, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.0513739545997611, |
|
"grad_norm": 0.5419930096362386, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6083, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.075268817204301, |
|
"grad_norm": 0.6254120282769089, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6102, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.099163679808841, |
|
"grad_norm": 0.7886162221301777, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6022, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.1230585424133812, |
|
"grad_norm": 0.5575253634257799, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6008, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.146953405017921, |
|
"grad_norm": 0.5457321558023005, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6076, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.1708482676224612, |
|
"grad_norm": 0.4910125465988992, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5998, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.194743130227001, |
|
"grad_norm": 0.5851438416753514, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6095, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.2186379928315412, |
|
"grad_norm": 0.5544196352892787, |
|
"learning_rate": 5e-06, |
|
"loss": 0.608, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.2425328554360813, |
|
"grad_norm": 0.5835218258508235, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6053, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.2664277180406214, |
|
"grad_norm": 0.6402155111012049, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6086, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.2903225806451613, |
|
"grad_norm": 0.5274545363597922, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6082, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.3142174432497014, |
|
"grad_norm": 0.5023370306863523, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6097, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.3381123058542412, |
|
"grad_norm": 0.5336199850801069, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6037, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.3620071684587813, |
|
"grad_norm": 0.7734469958692578, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6066, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.3859020310633214, |
|
"grad_norm": 0.8452375931165987, |
|
"learning_rate": 5e-06, |
|
"loss": 0.604, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.4097968936678615, |
|
"grad_norm": 0.5907657942794627, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6042, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.4336917562724014, |
|
"grad_norm": 0.6105214261701881, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6077, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.4575866188769415, |
|
"grad_norm": 0.6803070474017702, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6145, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.4814814814814814, |
|
"grad_norm": 0.5137151572440128, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6068, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.5053763440860215, |
|
"grad_norm": 0.5526611853120886, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6023, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.5292712066905616, |
|
"grad_norm": 0.5276740317417068, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6066, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.5531660692951017, |
|
"grad_norm": 0.5266173782221237, |
|
"learning_rate": 5e-06, |
|
"loss": 0.604, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.5770609318996416, |
|
"grad_norm": 0.5879899658739348, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6062, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.6009557945041815, |
|
"grad_norm": 0.6426153237314072, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6044, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.6248506571087216, |
|
"grad_norm": 0.5633170243940351, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5975, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.6487455197132617, |
|
"grad_norm": 0.5448798915341956, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5969, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.6726403823178018, |
|
"grad_norm": 0.484774901310647, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6069, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.6965352449223416, |
|
"grad_norm": 0.5257867856081395, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5985, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.7204301075268817, |
|
"grad_norm": 0.5602525718442715, |
|
"learning_rate": 5e-06, |
|
"loss": 0.612, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.7443249701314216, |
|
"grad_norm": 0.5040592279383703, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6021, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.7682198327359617, |
|
"grad_norm": 0.6140508371910811, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6029, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.7921146953405018, |
|
"grad_norm": 0.526723228546995, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6051, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.816009557945042, |
|
"grad_norm": 0.5485611376004595, |
|
"learning_rate": 5e-06, |
|
"loss": 0.602, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.8399044205495818, |
|
"grad_norm": 0.4925549773889819, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5976, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.863799283154122, |
|
"grad_norm": 0.5127568984869073, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6015, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.8876941457586618, |
|
"grad_norm": 0.7953660178238419, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5983, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.911589008363202, |
|
"grad_norm": 0.4644171655680959, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6043, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.935483870967742, |
|
"grad_norm": 0.5504656432596945, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5991, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.959378733572282, |
|
"grad_norm": 0.5014583862445193, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6027, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.983273596176822, |
|
"grad_norm": 0.5169701196971361, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6065, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.6383097767829895, |
|
"eval_runtime": 225.1334, |
|
"eval_samples_per_second": 50.073, |
|
"eval_steps_per_second": 0.395, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 2.007168458781362, |
|
"grad_norm": 0.9522384301889181, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5892, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.031063321385902, |
|
"grad_norm": 0.5604355482368693, |
|
"learning_rate": 5e-06, |
|
"loss": 0.555, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.054958183990442, |
|
"grad_norm": 0.8454377137422782, |
|
"learning_rate": 5e-06, |
|
"loss": 0.559, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.078853046594982, |
|
"grad_norm": 0.6125609351007736, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5588, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.1027479091995223, |
|
"grad_norm": 0.550684661961655, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5566, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.126642771804062, |
|
"grad_norm": 0.5268020228547002, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5509, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.150537634408602, |
|
"grad_norm": 0.5374686724727457, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5558, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.174432497013142, |
|
"grad_norm": 0.6591728468868026, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5523, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.198327359617682, |
|
"grad_norm": 0.5758411262414109, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5626, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 0.5974631029863997, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5633, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.2461170848267624, |
|
"grad_norm": 0.6693430289411901, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5569, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.270011947431302, |
|
"grad_norm": 0.5235798291837075, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5592, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.293906810035842, |
|
"grad_norm": 0.5951063252000355, |
|
"learning_rate": 5e-06, |
|
"loss": 0.558, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.3178016726403823, |
|
"grad_norm": 0.7007904171286725, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5585, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.3416965352449224, |
|
"grad_norm": 0.6157005538099132, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5584, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.3655913978494625, |
|
"grad_norm": 0.5560793638442904, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5536, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.389486260454002, |
|
"grad_norm": 0.5994284875647052, |
|
"learning_rate": 5e-06, |
|
"loss": 0.559, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.4133811230585422, |
|
"grad_norm": 0.6178447526276027, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5622, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.4372759856630823, |
|
"grad_norm": 0.598268249438012, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5617, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.4611708482676224, |
|
"grad_norm": 0.6456807780700198, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5598, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.4850657108721625, |
|
"grad_norm": 0.5838521623874042, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5565, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.5089605734767026, |
|
"grad_norm": 0.6160425605583054, |
|
"learning_rate": 5e-06, |
|
"loss": 0.561, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.5328554360812428, |
|
"grad_norm": 0.6471012332418825, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5609, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.5567502986857824, |
|
"grad_norm": 0.7438495174826055, |
|
"learning_rate": 5e-06, |
|
"loss": 0.559, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.5806451612903225, |
|
"grad_norm": 0.5965448247540126, |
|
"learning_rate": 5e-06, |
|
"loss": 0.57, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.6045400238948626, |
|
"grad_norm": 0.6030850533182174, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5619, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.6284348864994027, |
|
"grad_norm": 0.5627845167167422, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5615, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.652329749103943, |
|
"grad_norm": 0.5586383353478368, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5567, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.6762246117084825, |
|
"grad_norm": 0.6169226364241693, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5614, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.7001194743130226, |
|
"grad_norm": 0.5655962840268283, |
|
"learning_rate": 5e-06, |
|
"loss": 0.566, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.7240143369175627, |
|
"grad_norm": 0.5379947967296507, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5585, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.7479091995221028, |
|
"grad_norm": 0.6022462071356245, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5647, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.771804062126643, |
|
"grad_norm": 0.5630793813942363, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5701, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.795698924731183, |
|
"grad_norm": 0.7669976902930744, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5584, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.819593787335723, |
|
"grad_norm": 0.6081830710715922, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5619, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.8434886499402627, |
|
"grad_norm": 0.5471146843592755, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5607, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.867383512544803, |
|
"grad_norm": 0.500432815577127, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5605, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.891278375149343, |
|
"grad_norm": 0.5513789797790463, |
|
"learning_rate": 5e-06, |
|
"loss": 0.558, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.915173237753883, |
|
"grad_norm": 0.5699534515350526, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5626, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.9390681003584227, |
|
"grad_norm": 0.6140827402540434, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5648, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.962962962962963, |
|
"grad_norm": 0.6486122276400309, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5627, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.986857825567503, |
|
"grad_norm": 0.5448840278284515, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5672, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.996415770609319, |
|
"eval_loss": 0.6404265761375427, |
|
"eval_runtime": 226.6238, |
|
"eval_samples_per_second": 49.743, |
|
"eval_steps_per_second": 0.393, |
|
"step": 1254 |
|
}, |
|
{ |
|
"epoch": 2.996415770609319, |
|
"step": 1254, |
|
"total_flos": 2100077946470400.0, |
|
"train_loss": 0.6158634758832162, |
|
"train_runtime": 37532.7652, |
|
"train_samples_per_second": 17.12, |
|
"train_steps_per_second": 0.033 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1254, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2100077946470400.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|