|
{ |
|
"best_metric": 0.5220404863357544, |
|
"best_model_checkpoint": "vit_epochs5_batch64_lr0.001_size224_tiles1_seed1_vit_old_transform_old_hp/checkpoint-1175", |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 1175, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02127659574468085, |
|
"grad_norm": 1.5236927270889282, |
|
"learning_rate": 0.000995744680851064, |
|
"loss": 0.8552, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0425531914893617, |
|
"grad_norm": 0.1933981478214264, |
|
"learning_rate": 0.0009914893617021276, |
|
"loss": 0.6984, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06382978723404255, |
|
"grad_norm": 0.2786834239959717, |
|
"learning_rate": 0.0009872340425531915, |
|
"loss": 0.684, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0851063829787234, |
|
"grad_norm": 0.3437531888484955, |
|
"learning_rate": 0.0009829787234042554, |
|
"loss": 0.699, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.10638297872340426, |
|
"grad_norm": 0.17921054363250732, |
|
"learning_rate": 0.0009787234042553192, |
|
"loss": 0.6876, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.1276595744680851, |
|
"grad_norm": 0.2969794273376465, |
|
"learning_rate": 0.0009744680851063829, |
|
"loss": 0.7084, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.14893617021276595, |
|
"grad_norm": 0.2975955307483673, |
|
"learning_rate": 0.0009702127659574468, |
|
"loss": 0.6938, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.1702127659574468, |
|
"grad_norm": 0.049827929586172104, |
|
"learning_rate": 0.0009659574468085106, |
|
"loss": 0.6834, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.19148936170212766, |
|
"grad_norm": 0.6071491837501526, |
|
"learning_rate": 0.0009617021276595745, |
|
"loss": 0.6737, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2127659574468085, |
|
"grad_norm": 0.1733636111021042, |
|
"learning_rate": 0.0009574468085106384, |
|
"loss": 0.6401, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.23404255319148937, |
|
"grad_norm": 0.6925361752510071, |
|
"learning_rate": 0.0009531914893617022, |
|
"loss": 0.6786, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.2553191489361702, |
|
"grad_norm": 1.0148730278015137, |
|
"learning_rate": 0.000948936170212766, |
|
"loss": 0.6925, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2765957446808511, |
|
"grad_norm": 0.4391551911830902, |
|
"learning_rate": 0.0009446808510638298, |
|
"loss": 0.7001, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.2978723404255319, |
|
"grad_norm": 0.10365554690361023, |
|
"learning_rate": 0.0009404255319148937, |
|
"loss": 0.661, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.3191489361702128, |
|
"grad_norm": 0.5373475551605225, |
|
"learning_rate": 0.0009361702127659575, |
|
"loss": 0.6646, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.3404255319148936, |
|
"grad_norm": 0.26909396052360535, |
|
"learning_rate": 0.0009319148936170214, |
|
"loss": 0.6496, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.3617021276595745, |
|
"grad_norm": 0.7345396876335144, |
|
"learning_rate": 0.0009276595744680851, |
|
"loss": 0.6809, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.3829787234042553, |
|
"grad_norm": 0.17642471194267273, |
|
"learning_rate": 0.0009234042553191489, |
|
"loss": 0.6689, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.40425531914893614, |
|
"grad_norm": 0.24865615367889404, |
|
"learning_rate": 0.0009191489361702128, |
|
"loss": 0.6668, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.425531914893617, |
|
"grad_norm": 0.0725848600268364, |
|
"learning_rate": 0.0009148936170212766, |
|
"loss": 0.6955, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.44680851063829785, |
|
"grad_norm": 0.6779701113700867, |
|
"learning_rate": 0.0009106382978723405, |
|
"loss": 0.6643, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.46808510638297873, |
|
"grad_norm": 0.2594638466835022, |
|
"learning_rate": 0.0009063829787234043, |
|
"loss": 0.6774, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.48936170212765956, |
|
"grad_norm": 0.41974830627441406, |
|
"learning_rate": 0.000902127659574468, |
|
"loss": 0.6632, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.5106382978723404, |
|
"grad_norm": 0.2086678445339203, |
|
"learning_rate": 0.0008978723404255319, |
|
"loss": 0.6264, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5319148936170213, |
|
"grad_norm": 0.45617616176605225, |
|
"learning_rate": 0.0008936170212765957, |
|
"loss": 0.6538, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.5531914893617021, |
|
"grad_norm": 0.32972219586372375, |
|
"learning_rate": 0.0008893617021276596, |
|
"loss": 0.6471, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.574468085106383, |
|
"grad_norm": 0.5587528347969055, |
|
"learning_rate": 0.0008851063829787234, |
|
"loss": 0.624, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.5957446808510638, |
|
"grad_norm": 0.5918276906013489, |
|
"learning_rate": 0.0008808510638297873, |
|
"loss": 0.6576, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.6170212765957447, |
|
"grad_norm": 0.35423263907432556, |
|
"learning_rate": 0.0008765957446808511, |
|
"loss": 0.6376, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.6382978723404256, |
|
"grad_norm": 0.49659672379493713, |
|
"learning_rate": 0.0008723404255319149, |
|
"loss": 0.6555, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6595744680851063, |
|
"grad_norm": 0.26542067527770996, |
|
"learning_rate": 0.0008680851063829788, |
|
"loss": 0.6457, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.6808510638297872, |
|
"grad_norm": 0.5932815670967102, |
|
"learning_rate": 0.0008638297872340426, |
|
"loss": 0.6706, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.7021276595744681, |
|
"grad_norm": 0.18936298787593842, |
|
"learning_rate": 0.0008595744680851064, |
|
"loss": 0.6923, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.723404255319149, |
|
"grad_norm": 0.2216617614030838, |
|
"learning_rate": 0.0008553191489361703, |
|
"loss": 0.6805, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.7446808510638298, |
|
"grad_norm": 0.2572282552719116, |
|
"learning_rate": 0.000851063829787234, |
|
"loss": 0.6803, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.7659574468085106, |
|
"grad_norm": 0.2624934911727905, |
|
"learning_rate": 0.0008468085106382979, |
|
"loss": 0.6796, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.7872340425531915, |
|
"grad_norm": 0.3983383774757385, |
|
"learning_rate": 0.0008425531914893617, |
|
"loss": 0.652, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.8085106382978723, |
|
"grad_norm": 0.7851768136024475, |
|
"learning_rate": 0.0008382978723404256, |
|
"loss": 0.6972, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.8297872340425532, |
|
"grad_norm": 0.08407687395811081, |
|
"learning_rate": 0.0008340425531914894, |
|
"loss": 0.7127, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.851063829787234, |
|
"grad_norm": 0.2317022830247879, |
|
"learning_rate": 0.0008297872340425531, |
|
"loss": 0.6879, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8723404255319149, |
|
"grad_norm": 0.10921870172023773, |
|
"learning_rate": 0.000825531914893617, |
|
"loss": 0.6909, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.8936170212765957, |
|
"grad_norm": 0.06697387248277664, |
|
"learning_rate": 0.0008212765957446808, |
|
"loss": 0.6858, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.9148936170212766, |
|
"grad_norm": 0.16396264731884003, |
|
"learning_rate": 0.0008170212765957447, |
|
"loss": 0.6836, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.9361702127659575, |
|
"grad_norm": 0.07334744930267334, |
|
"learning_rate": 0.0008127659574468085, |
|
"loss": 0.6835, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.9574468085106383, |
|
"grad_norm": 0.28075695037841797, |
|
"learning_rate": 0.0008085106382978723, |
|
"loss": 0.6616, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.9787234042553191, |
|
"grad_norm": 0.32385650277137756, |
|
"learning_rate": 0.0008042553191489363, |
|
"loss": 0.6763, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.6150110960006714, |
|
"learning_rate": 0.0008, |
|
"loss": 0.6668, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.5725333333333333, |
|
"eval_loss": 0.6652818918228149, |
|
"eval_runtime": 52.8415, |
|
"eval_samples_per_second": 70.967, |
|
"eval_steps_per_second": 1.117, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.0212765957446808, |
|
"grad_norm": 0.36133354902267456, |
|
"learning_rate": 0.0007957446808510639, |
|
"loss": 0.6505, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.0425531914893618, |
|
"grad_norm": 0.2631653845310211, |
|
"learning_rate": 0.0007914893617021277, |
|
"loss": 0.6666, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.0638297872340425, |
|
"grad_norm": 0.40402382612228394, |
|
"learning_rate": 0.0007872340425531915, |
|
"loss": 0.6406, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.0851063829787233, |
|
"grad_norm": 0.22335675358772278, |
|
"learning_rate": 0.0007829787234042554, |
|
"loss": 0.6584, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.1063829787234043, |
|
"grad_norm": 0.38019102811813354, |
|
"learning_rate": 0.0007787234042553192, |
|
"loss": 0.6773, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.127659574468085, |
|
"grad_norm": 0.6945547461509705, |
|
"learning_rate": 0.000774468085106383, |
|
"loss": 0.66, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.148936170212766, |
|
"grad_norm": 0.2084246724843979, |
|
"learning_rate": 0.0007702127659574468, |
|
"loss": 0.6512, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.1702127659574468, |
|
"grad_norm": 0.1295584738254547, |
|
"learning_rate": 0.0007659574468085106, |
|
"loss": 0.6521, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.1914893617021276, |
|
"grad_norm": 0.12610581517219543, |
|
"learning_rate": 0.0007617021276595745, |
|
"loss": 0.6281, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.2127659574468086, |
|
"grad_norm": 0.5777516961097717, |
|
"learning_rate": 0.0007574468085106383, |
|
"loss": 0.6315, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.2340425531914894, |
|
"grad_norm": 0.4698016047477722, |
|
"learning_rate": 0.0007531914893617022, |
|
"loss": 0.6736, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.2553191489361701, |
|
"grad_norm": 0.306220680475235, |
|
"learning_rate": 0.0007489361702127659, |
|
"loss": 0.6616, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.2765957446808511, |
|
"grad_norm": 0.1651347577571869, |
|
"learning_rate": 0.0007446808510638298, |
|
"loss": 0.6624, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.297872340425532, |
|
"grad_norm": 0.1671248823404312, |
|
"learning_rate": 0.0007404255319148936, |
|
"loss": 0.6537, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.3191489361702127, |
|
"grad_norm": 0.5579215288162231, |
|
"learning_rate": 0.0007361702127659574, |
|
"loss": 0.6547, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.3404255319148937, |
|
"grad_norm": 0.20245681703090668, |
|
"learning_rate": 0.0007319148936170213, |
|
"loss": 0.6477, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.3617021276595744, |
|
"grad_norm": 0.1913478672504425, |
|
"learning_rate": 0.0007276595744680852, |
|
"loss": 0.6311, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.3829787234042552, |
|
"grad_norm": 0.4945693016052246, |
|
"learning_rate": 0.000723404255319149, |
|
"loss": 0.5979, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.4042553191489362, |
|
"grad_norm": 0.1921028196811676, |
|
"learning_rate": 0.0007191489361702128, |
|
"loss": 0.7027, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.425531914893617, |
|
"grad_norm": 0.26029083132743835, |
|
"learning_rate": 0.0007148936170212766, |
|
"loss": 0.6733, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.4468085106382977, |
|
"grad_norm": 0.3045407831668854, |
|
"learning_rate": 0.0007106382978723405, |
|
"loss": 0.6619, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.4680851063829787, |
|
"grad_norm": 0.12488707154989243, |
|
"learning_rate": 0.0007063829787234043, |
|
"loss": 0.666, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.4893617021276595, |
|
"grad_norm": 0.15467241406440735, |
|
"learning_rate": 0.0007021276595744682, |
|
"loss": 0.634, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.5106382978723403, |
|
"grad_norm": 0.23499886691570282, |
|
"learning_rate": 0.0006978723404255319, |
|
"loss": 0.6257, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.5319148936170213, |
|
"grad_norm": 0.48748576641082764, |
|
"learning_rate": 0.0006936170212765957, |
|
"loss": 0.6369, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.5531914893617023, |
|
"grad_norm": 0.3014831244945526, |
|
"learning_rate": 0.0006893617021276596, |
|
"loss": 0.6274, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.574468085106383, |
|
"grad_norm": 0.12689495086669922, |
|
"learning_rate": 0.0006851063829787234, |
|
"loss": 0.6427, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.5957446808510638, |
|
"grad_norm": 0.3490160405635834, |
|
"learning_rate": 0.0006808510638297873, |
|
"loss": 0.6885, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.6170212765957448, |
|
"grad_norm": 0.2676607370376587, |
|
"learning_rate": 0.000676595744680851, |
|
"loss": 0.6436, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.6382978723404256, |
|
"grad_norm": 0.26951488852500916, |
|
"learning_rate": 0.0006723404255319148, |
|
"loss": 0.6387, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.6595744680851063, |
|
"grad_norm": 0.3769073784351349, |
|
"learning_rate": 0.0006680851063829787, |
|
"loss": 0.6003, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.6808510638297873, |
|
"grad_norm": 0.43915122747421265, |
|
"learning_rate": 0.0006638297872340425, |
|
"loss": 0.6477, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.702127659574468, |
|
"grad_norm": 0.2419726401567459, |
|
"learning_rate": 0.0006595744680851064, |
|
"loss": 0.6174, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.7234042553191489, |
|
"grad_norm": 0.5210821628570557, |
|
"learning_rate": 0.0006553191489361702, |
|
"loss": 0.625, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.7446808510638299, |
|
"grad_norm": 0.5546556115150452, |
|
"learning_rate": 0.0006510638297872342, |
|
"loss": 0.604, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.7659574468085106, |
|
"grad_norm": 0.5459072589874268, |
|
"learning_rate": 0.0006468085106382979, |
|
"loss": 0.6322, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.7872340425531914, |
|
"grad_norm": 0.28615137934684753, |
|
"learning_rate": 0.0006425531914893617, |
|
"loss": 0.6288, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.8085106382978724, |
|
"grad_norm": 0.25826430320739746, |
|
"learning_rate": 0.0006382978723404256, |
|
"loss": 0.6377, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.8297872340425532, |
|
"grad_norm": 0.27113598585128784, |
|
"learning_rate": 0.0006340425531914894, |
|
"loss": 0.6155, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.851063829787234, |
|
"grad_norm": 0.3145448565483093, |
|
"learning_rate": 0.0006297872340425533, |
|
"loss": 0.6258, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.872340425531915, |
|
"grad_norm": 0.221902996301651, |
|
"learning_rate": 0.000625531914893617, |
|
"loss": 0.6133, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.8936170212765957, |
|
"grad_norm": 0.2308581918478012, |
|
"learning_rate": 0.0006212765957446808, |
|
"loss": 0.5883, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.9148936170212765, |
|
"grad_norm": 0.2169838696718216, |
|
"learning_rate": 0.0006170212765957447, |
|
"loss": 0.6219, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.9361702127659575, |
|
"grad_norm": 0.32386860251426697, |
|
"learning_rate": 0.0006127659574468085, |
|
"loss": 0.6102, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.9574468085106385, |
|
"grad_norm": 0.13700896501541138, |
|
"learning_rate": 0.0006085106382978724, |
|
"loss": 0.6436, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.978723404255319, |
|
"grad_norm": 0.18552586436271667, |
|
"learning_rate": 0.0006042553191489362, |
|
"loss": 0.6524, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.5744425058364868, |
|
"learning_rate": 0.0006, |
|
"loss": 0.6527, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.6528, |
|
"eval_loss": 0.6233171224594116, |
|
"eval_runtime": 52.0761, |
|
"eval_samples_per_second": 72.01, |
|
"eval_steps_per_second": 1.133, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.021276595744681, |
|
"grad_norm": 0.39053860306739807, |
|
"learning_rate": 0.0005957446808510638, |
|
"loss": 0.5832, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.0425531914893615, |
|
"grad_norm": 0.2939192056655884, |
|
"learning_rate": 0.0005914893617021276, |
|
"loss": 0.5808, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.0638297872340425, |
|
"grad_norm": 0.5998929142951965, |
|
"learning_rate": 0.0005872340425531915, |
|
"loss": 0.6119, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.0851063829787235, |
|
"grad_norm": 0.48165130615234375, |
|
"learning_rate": 0.0005829787234042553, |
|
"loss": 0.5868, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.106382978723404, |
|
"grad_norm": 0.2857578694820404, |
|
"learning_rate": 0.0005787234042553191, |
|
"loss": 0.5843, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 2.127659574468085, |
|
"grad_norm": 0.28461429476737976, |
|
"learning_rate": 0.0005744680851063831, |
|
"loss": 0.5843, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.148936170212766, |
|
"grad_norm": 0.30877211689949036, |
|
"learning_rate": 0.0005702127659574468, |
|
"loss": 0.5652, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.1702127659574466, |
|
"grad_norm": 0.7491441369056702, |
|
"learning_rate": 0.0005659574468085107, |
|
"loss": 0.5687, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.1914893617021276, |
|
"grad_norm": 0.29466772079467773, |
|
"learning_rate": 0.0005617021276595745, |
|
"loss": 0.6339, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.2127659574468086, |
|
"grad_norm": 0.44021138548851013, |
|
"learning_rate": 0.0005574468085106383, |
|
"loss": 0.5629, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.2340425531914896, |
|
"grad_norm": 0.19135086238384247, |
|
"learning_rate": 0.0005531914893617022, |
|
"loss": 0.6169, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.25531914893617, |
|
"grad_norm": 0.6730530858039856, |
|
"learning_rate": 0.000548936170212766, |
|
"loss": 0.6063, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.276595744680851, |
|
"grad_norm": 0.4451698362827301, |
|
"learning_rate": 0.0005446808510638298, |
|
"loss": 0.614, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 2.297872340425532, |
|
"grad_norm": 0.19956566393375397, |
|
"learning_rate": 0.0005404255319148936, |
|
"loss": 0.5848, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.3191489361702127, |
|
"grad_norm": 0.3573627471923828, |
|
"learning_rate": 0.0005361702127659575, |
|
"loss": 0.5963, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 2.3404255319148937, |
|
"grad_norm": 0.22617582976818085, |
|
"learning_rate": 0.0005319148936170213, |
|
"loss": 0.5512, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.3617021276595747, |
|
"grad_norm": 0.2276870310306549, |
|
"learning_rate": 0.0005276595744680851, |
|
"loss": 0.5801, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 2.382978723404255, |
|
"grad_norm": 0.3912278413772583, |
|
"learning_rate": 0.000523404255319149, |
|
"loss": 0.6101, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.404255319148936, |
|
"grad_norm": 0.20038598775863647, |
|
"learning_rate": 0.0005191489361702127, |
|
"loss": 0.5842, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 2.425531914893617, |
|
"grad_norm": 0.27847474813461304, |
|
"learning_rate": 0.0005148936170212766, |
|
"loss": 0.5597, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.4468085106382977, |
|
"grad_norm": 0.49357470870018005, |
|
"learning_rate": 0.0005106382978723404, |
|
"loss": 0.5374, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 2.4680851063829787, |
|
"grad_norm": 0.22584182024002075, |
|
"learning_rate": 0.0005063829787234042, |
|
"loss": 0.6416, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.4893617021276597, |
|
"grad_norm": 0.4970340430736542, |
|
"learning_rate": 0.0005021276595744681, |
|
"loss": 0.6101, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.5106382978723403, |
|
"grad_norm": 0.23562884330749512, |
|
"learning_rate": 0.000497872340425532, |
|
"loss": 0.5728, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.5319148936170213, |
|
"grad_norm": 0.2772935926914215, |
|
"learning_rate": 0.0004936170212765957, |
|
"loss": 0.5969, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.5531914893617023, |
|
"grad_norm": 0.466553658246994, |
|
"learning_rate": 0.0004893617021276596, |
|
"loss": 0.5722, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.574468085106383, |
|
"grad_norm": 0.1931866854429245, |
|
"learning_rate": 0.0004851063829787234, |
|
"loss": 0.5947, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.595744680851064, |
|
"grad_norm": 0.3345823884010315, |
|
"learning_rate": 0.00048085106382978723, |
|
"loss": 0.5464, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.617021276595745, |
|
"grad_norm": 0.8605038523674011, |
|
"learning_rate": 0.0004765957446808511, |
|
"loss": 0.616, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.6382978723404253, |
|
"grad_norm": 0.467629611492157, |
|
"learning_rate": 0.0004723404255319149, |
|
"loss": 0.5997, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.6595744680851063, |
|
"grad_norm": 0.30429497361183167, |
|
"learning_rate": 0.00046808510638297874, |
|
"loss": 0.5498, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.6808510638297873, |
|
"grad_norm": 0.2898688316345215, |
|
"learning_rate": 0.00046382978723404257, |
|
"loss": 0.5526, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.702127659574468, |
|
"grad_norm": 0.24966174364089966, |
|
"learning_rate": 0.0004595744680851064, |
|
"loss": 0.568, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 2.723404255319149, |
|
"grad_norm": 0.31960707902908325, |
|
"learning_rate": 0.00045531914893617024, |
|
"loss": 0.5573, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.74468085106383, |
|
"grad_norm": 0.17629045248031616, |
|
"learning_rate": 0.000451063829787234, |
|
"loss": 0.5793, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.7659574468085104, |
|
"grad_norm": 0.3344897925853729, |
|
"learning_rate": 0.00044680851063829785, |
|
"loss": 0.5782, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.7872340425531914, |
|
"grad_norm": 0.6426132917404175, |
|
"learning_rate": 0.0004425531914893617, |
|
"loss": 0.6065, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 2.8085106382978724, |
|
"grad_norm": 0.4149859547615051, |
|
"learning_rate": 0.00043829787234042557, |
|
"loss": 0.6095, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.829787234042553, |
|
"grad_norm": 0.2638397812843323, |
|
"learning_rate": 0.0004340425531914894, |
|
"loss": 0.5651, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 2.851063829787234, |
|
"grad_norm": 0.47826263308525085, |
|
"learning_rate": 0.0004297872340425532, |
|
"loss": 0.6366, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.872340425531915, |
|
"grad_norm": 0.47488388419151306, |
|
"learning_rate": 0.000425531914893617, |
|
"loss": 0.5498, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.8936170212765955, |
|
"grad_norm": 0.29856908321380615, |
|
"learning_rate": 0.00042127659574468085, |
|
"loss": 0.5576, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.9148936170212765, |
|
"grad_norm": 0.3228590488433838, |
|
"learning_rate": 0.0004170212765957447, |
|
"loss": 0.5527, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 2.9361702127659575, |
|
"grad_norm": 0.28109100461006165, |
|
"learning_rate": 0.0004127659574468085, |
|
"loss": 0.5421, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.9574468085106385, |
|
"grad_norm": 0.43624716997146606, |
|
"learning_rate": 0.00040851063829787235, |
|
"loss": 0.5419, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 2.978723404255319, |
|
"grad_norm": 0.33003830909729004, |
|
"learning_rate": 0.00040425531914893613, |
|
"loss": 0.5614, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.8071190118789673, |
|
"learning_rate": 0.0004, |
|
"loss": 0.5628, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.7048, |
|
"eval_loss": 0.5658010244369507, |
|
"eval_runtime": 52.264, |
|
"eval_samples_per_second": 71.751, |
|
"eval_steps_per_second": 1.129, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 3.021276595744681, |
|
"grad_norm": 0.34832167625427246, |
|
"learning_rate": 0.00039574468085106385, |
|
"loss": 0.5412, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 3.0425531914893615, |
|
"grad_norm": 0.3105883002281189, |
|
"learning_rate": 0.0003914893617021277, |
|
"loss": 0.5428, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 3.0638297872340425, |
|
"grad_norm": 0.48978525400161743, |
|
"learning_rate": 0.0003872340425531915, |
|
"loss": 0.5074, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.0851063829787235, |
|
"grad_norm": 0.3323807120323181, |
|
"learning_rate": 0.0003829787234042553, |
|
"loss": 0.5388, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 3.106382978723404, |
|
"grad_norm": 0.23931725323200226, |
|
"learning_rate": 0.00037872340425531913, |
|
"loss": 0.5329, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 3.127659574468085, |
|
"grad_norm": 0.4094422459602356, |
|
"learning_rate": 0.00037446808510638297, |
|
"loss": 0.5256, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 3.148936170212766, |
|
"grad_norm": 0.2427910566329956, |
|
"learning_rate": 0.0003702127659574468, |
|
"loss": 0.4994, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.1702127659574466, |
|
"grad_norm": 0.46753978729248047, |
|
"learning_rate": 0.00036595744680851063, |
|
"loss": 0.5841, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 3.1914893617021276, |
|
"grad_norm": 0.60309898853302, |
|
"learning_rate": 0.0003617021276595745, |
|
"loss": 0.5018, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.2127659574468086, |
|
"grad_norm": 0.32367798686027527, |
|
"learning_rate": 0.0003574468085106383, |
|
"loss": 0.5112, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 3.2340425531914896, |
|
"grad_norm": 0.31850096583366394, |
|
"learning_rate": 0.00035319148936170213, |
|
"loss": 0.5197, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.25531914893617, |
|
"grad_norm": 0.40993842482566833, |
|
"learning_rate": 0.00034893617021276597, |
|
"loss": 0.491, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 3.276595744680851, |
|
"grad_norm": 0.31502920389175415, |
|
"learning_rate": 0.0003446808510638298, |
|
"loss": 0.5134, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.297872340425532, |
|
"grad_norm": 0.34986236691474915, |
|
"learning_rate": 0.00034042553191489364, |
|
"loss": 0.5093, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 3.3191489361702127, |
|
"grad_norm": 0.30818113684654236, |
|
"learning_rate": 0.0003361702127659574, |
|
"loss": 0.4668, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.3404255319148937, |
|
"grad_norm": 0.45690372586250305, |
|
"learning_rate": 0.00033191489361702125, |
|
"loss": 0.4793, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 3.3617021276595747, |
|
"grad_norm": 0.431671142578125, |
|
"learning_rate": 0.0003276595744680851, |
|
"loss": 0.5449, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.382978723404255, |
|
"grad_norm": 0.6079233288764954, |
|
"learning_rate": 0.00032340425531914897, |
|
"loss": 0.5055, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 3.404255319148936, |
|
"grad_norm": 0.25394123792648315, |
|
"learning_rate": 0.0003191489361702128, |
|
"loss": 0.5137, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.425531914893617, |
|
"grad_norm": 0.2768719494342804, |
|
"learning_rate": 0.00031489361702127664, |
|
"loss": 0.5378, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 3.4468085106382977, |
|
"grad_norm": 0.33412039279937744, |
|
"learning_rate": 0.0003106382978723404, |
|
"loss": 0.5529, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.4680851063829787, |
|
"grad_norm": 0.45218709111213684, |
|
"learning_rate": 0.00030638297872340425, |
|
"loss": 0.514, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 3.4893617021276597, |
|
"grad_norm": 0.29416921734809875, |
|
"learning_rate": 0.0003021276595744681, |
|
"loss": 0.471, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.5106382978723403, |
|
"grad_norm": 0.4108869433403015, |
|
"learning_rate": 0.0002978723404255319, |
|
"loss": 0.5222, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 3.5319148936170213, |
|
"grad_norm": 0.5049691200256348, |
|
"learning_rate": 0.00029361702127659575, |
|
"loss": 0.5103, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.5531914893617023, |
|
"grad_norm": 0.37521079182624817, |
|
"learning_rate": 0.00028936170212765953, |
|
"loss": 0.5088, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 3.574468085106383, |
|
"grad_norm": 0.6042494177818298, |
|
"learning_rate": 0.0002851063829787234, |
|
"loss": 0.4886, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.595744680851064, |
|
"grad_norm": 0.3379281163215637, |
|
"learning_rate": 0.00028085106382978725, |
|
"loss": 0.4878, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 3.617021276595745, |
|
"grad_norm": 0.42538291215896606, |
|
"learning_rate": 0.0002765957446808511, |
|
"loss": 0.5241, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.6382978723404253, |
|
"grad_norm": 0.34973302483558655, |
|
"learning_rate": 0.0002723404255319149, |
|
"loss": 0.497, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 3.6595744680851063, |
|
"grad_norm": 0.5937588214874268, |
|
"learning_rate": 0.00026808510638297875, |
|
"loss": 0.5004, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.6808510638297873, |
|
"grad_norm": 0.3566235601902008, |
|
"learning_rate": 0.00026382978723404253, |
|
"loss": 0.5192, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 3.702127659574468, |
|
"grad_norm": 0.7297813296318054, |
|
"learning_rate": 0.00025957446808510637, |
|
"loss": 0.5313, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.723404255319149, |
|
"grad_norm": 0.3060586452484131, |
|
"learning_rate": 0.0002553191489361702, |
|
"loss": 0.5057, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 3.74468085106383, |
|
"grad_norm": 0.3572905361652374, |
|
"learning_rate": 0.00025106382978723403, |
|
"loss": 0.5078, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.7659574468085104, |
|
"grad_norm": 0.5359181761741638, |
|
"learning_rate": 0.00024680851063829787, |
|
"loss": 0.4953, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 3.7872340425531914, |
|
"grad_norm": 0.676404595375061, |
|
"learning_rate": 0.0002425531914893617, |
|
"loss": 0.4878, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.8085106382978724, |
|
"grad_norm": 0.7736416459083557, |
|
"learning_rate": 0.00023829787234042556, |
|
"loss": 0.4897, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 3.829787234042553, |
|
"grad_norm": 0.6416388154029846, |
|
"learning_rate": 0.00023404255319148937, |
|
"loss": 0.5031, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.851063829787234, |
|
"grad_norm": 1.1011937856674194, |
|
"learning_rate": 0.0002297872340425532, |
|
"loss": 0.4563, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 3.872340425531915, |
|
"grad_norm": 0.4412100613117218, |
|
"learning_rate": 0.000225531914893617, |
|
"loss": 0.525, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.8936170212765955, |
|
"grad_norm": 0.6614885926246643, |
|
"learning_rate": 0.00022127659574468084, |
|
"loss": 0.5163, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 3.9148936170212765, |
|
"grad_norm": 0.38106369972229004, |
|
"learning_rate": 0.0002170212765957447, |
|
"loss": 0.5182, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.9361702127659575, |
|
"grad_norm": 0.44818058609962463, |
|
"learning_rate": 0.0002127659574468085, |
|
"loss": 0.4875, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 3.9574468085106385, |
|
"grad_norm": 0.3101024925708771, |
|
"learning_rate": 0.00020851063829787234, |
|
"loss": 0.5498, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.978723404255319, |
|
"grad_norm": 0.4035079777240753, |
|
"learning_rate": 0.00020425531914893618, |
|
"loss": 0.5139, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.4626338481903076, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4683, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.7290666666666666, |
|
"eval_loss": 0.5313977003097534, |
|
"eval_runtime": 52.259, |
|
"eval_samples_per_second": 71.758, |
|
"eval_steps_per_second": 1.129, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 4.0212765957446805, |
|
"grad_norm": 0.3656058609485626, |
|
"learning_rate": 0.00019574468085106384, |
|
"loss": 0.4576, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 4.042553191489362, |
|
"grad_norm": 0.6442248225212097, |
|
"learning_rate": 0.00019148936170212765, |
|
"loss": 0.4869, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 4.0638297872340425, |
|
"grad_norm": 0.8725343942642212, |
|
"learning_rate": 0.00018723404255319148, |
|
"loss": 0.4081, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 4.085106382978723, |
|
"grad_norm": 0.5488789677619934, |
|
"learning_rate": 0.00018297872340425532, |
|
"loss": 0.3774, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 4.1063829787234045, |
|
"grad_norm": 0.45871075987815857, |
|
"learning_rate": 0.00017872340425531915, |
|
"loss": 0.3895, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 4.127659574468085, |
|
"grad_norm": 0.7183250784873962, |
|
"learning_rate": 0.00017446808510638298, |
|
"loss": 0.4216, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 4.148936170212766, |
|
"grad_norm": 0.43252503871917725, |
|
"learning_rate": 0.00017021276595744682, |
|
"loss": 0.4306, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 4.170212765957447, |
|
"grad_norm": 0.5714681148529053, |
|
"learning_rate": 0.00016595744680851062, |
|
"loss": 0.4607, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 4.191489361702128, |
|
"grad_norm": 0.5099291801452637, |
|
"learning_rate": 0.00016170212765957449, |
|
"loss": 0.372, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 4.212765957446808, |
|
"grad_norm": 0.5010551810264587, |
|
"learning_rate": 0.00015744680851063832, |
|
"loss": 0.4414, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 4.23404255319149, |
|
"grad_norm": 0.6585486531257629, |
|
"learning_rate": 0.00015319148936170213, |
|
"loss": 0.4191, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 4.25531914893617, |
|
"grad_norm": 0.5043871402740479, |
|
"learning_rate": 0.00014893617021276596, |
|
"loss": 0.4273, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.276595744680851, |
|
"grad_norm": 0.4368508756160736, |
|
"learning_rate": 0.00014468085106382977, |
|
"loss": 0.4329, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 4.297872340425532, |
|
"grad_norm": 0.5174155235290527, |
|
"learning_rate": 0.00014042553191489363, |
|
"loss": 0.4256, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 4.319148936170213, |
|
"grad_norm": 0.7088821530342102, |
|
"learning_rate": 0.00013617021276595746, |
|
"loss": 0.4025, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 4.340425531914893, |
|
"grad_norm": 0.41731932759284973, |
|
"learning_rate": 0.00013191489361702127, |
|
"loss": 0.4018, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 4.361702127659575, |
|
"grad_norm": 0.47780218720436096, |
|
"learning_rate": 0.0001276595744680851, |
|
"loss": 0.4683, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 4.382978723404255, |
|
"grad_norm": 0.49915027618408203, |
|
"learning_rate": 0.00012340425531914893, |
|
"loss": 0.4643, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 4.404255319148936, |
|
"grad_norm": 0.5682059526443481, |
|
"learning_rate": 0.00011914893617021278, |
|
"loss": 0.4259, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 4.425531914893617, |
|
"grad_norm": 0.36220914125442505, |
|
"learning_rate": 0.0001148936170212766, |
|
"loss": 0.4116, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 4.446808510638298, |
|
"grad_norm": 0.5478158593177795, |
|
"learning_rate": 0.00011063829787234042, |
|
"loss": 0.4299, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 4.468085106382979, |
|
"grad_norm": 0.5897641181945801, |
|
"learning_rate": 0.00010638297872340425, |
|
"loss": 0.3619, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 4.48936170212766, |
|
"grad_norm": 1.084243893623352, |
|
"learning_rate": 0.00010212765957446809, |
|
"loss": 0.4211, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 4.51063829787234, |
|
"grad_norm": 0.7980880737304688, |
|
"learning_rate": 9.787234042553192e-05, |
|
"loss": 0.4053, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 4.531914893617021, |
|
"grad_norm": 0.9330500364303589, |
|
"learning_rate": 9.361702127659574e-05, |
|
"loss": 0.4183, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 4.553191489361702, |
|
"grad_norm": 0.40023094415664673, |
|
"learning_rate": 8.936170212765958e-05, |
|
"loss": 0.4343, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 4.574468085106383, |
|
"grad_norm": 0.6411470770835876, |
|
"learning_rate": 8.510638297872341e-05, |
|
"loss": 0.4096, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 4.595744680851064, |
|
"grad_norm": 0.4613640308380127, |
|
"learning_rate": 8.085106382978724e-05, |
|
"loss": 0.4089, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 4.617021276595745, |
|
"grad_norm": 0.5364215970039368, |
|
"learning_rate": 7.659574468085106e-05, |
|
"loss": 0.406, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 4.638297872340425, |
|
"grad_norm": 0.7170926928520203, |
|
"learning_rate": 7.234042553191488e-05, |
|
"loss": 0.3827, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 4.659574468085106, |
|
"grad_norm": 0.5427092909812927, |
|
"learning_rate": 6.808510638297873e-05, |
|
"loss": 0.4274, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 4.680851063829787, |
|
"grad_norm": 0.44160687923431396, |
|
"learning_rate": 6.382978723404255e-05, |
|
"loss": 0.4182, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.702127659574468, |
|
"grad_norm": 0.5841237902641296, |
|
"learning_rate": 5.957446808510639e-05, |
|
"loss": 0.4459, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 4.723404255319149, |
|
"grad_norm": 0.6145776510238647, |
|
"learning_rate": 5.531914893617021e-05, |
|
"loss": 0.4415, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 4.74468085106383, |
|
"grad_norm": 0.44807735085487366, |
|
"learning_rate": 5.1063829787234044e-05, |
|
"loss": 0.4248, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 4.76595744680851, |
|
"grad_norm": 0.7016127109527588, |
|
"learning_rate": 4.680851063829787e-05, |
|
"loss": 0.3613, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 4.787234042553192, |
|
"grad_norm": 0.5572742819786072, |
|
"learning_rate": 4.2553191489361704e-05, |
|
"loss": 0.4027, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 4.808510638297872, |
|
"grad_norm": 0.5368435978889465, |
|
"learning_rate": 3.829787234042553e-05, |
|
"loss": 0.4111, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 4.829787234042553, |
|
"grad_norm": 0.4862489700317383, |
|
"learning_rate": 3.4042553191489365e-05, |
|
"loss": 0.3663, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 4.851063829787234, |
|
"grad_norm": 0.6198825240135193, |
|
"learning_rate": 2.9787234042553195e-05, |
|
"loss": 0.3791, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 4.872340425531915, |
|
"grad_norm": 0.5688868165016174, |
|
"learning_rate": 2.5531914893617022e-05, |
|
"loss": 0.393, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 4.8936170212765955, |
|
"grad_norm": 0.44319066405296326, |
|
"learning_rate": 2.1276595744680852e-05, |
|
"loss": 0.3728, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 4.914893617021277, |
|
"grad_norm": 0.6268962621688843, |
|
"learning_rate": 1.7021276595744682e-05, |
|
"loss": 0.3832, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 4.9361702127659575, |
|
"grad_norm": 0.41478314995765686, |
|
"learning_rate": 1.2765957446808511e-05, |
|
"loss": 0.3647, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 4.957446808510638, |
|
"grad_norm": 0.8043156266212463, |
|
"learning_rate": 8.510638297872341e-06, |
|
"loss": 0.3705, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 4.9787234042553195, |
|
"grad_norm": 0.5503541827201843, |
|
"learning_rate": 4.255319148936171e-06, |
|
"loss": 0.3862, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.6788877844810486, |
|
"learning_rate": 0.0, |
|
"loss": 0.3694, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.7538666666666667, |
|
"eval_loss": 0.5220404863357544, |
|
"eval_runtime": 52.803, |
|
"eval_samples_per_second": 71.019, |
|
"eval_steps_per_second": 1.117, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 1175, |
|
"total_flos": 5.8118992210944e+18, |
|
"train_loss": 0.5645027552259729, |
|
"train_runtime": 2886.1261, |
|
"train_samples_per_second": 25.986, |
|
"train_steps_per_second": 0.407 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1175, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.8118992210944e+18, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|