|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.396887159533074, |
|
"eval_steps": 8, |
|
"global_step": 77, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0311284046692607, |
|
"grad_norm": 0.5707955360412598, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8935, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0311284046692607, |
|
"eval_loss": 1.8884226083755493, |
|
"eval_runtime": 34.2567, |
|
"eval_samples_per_second": 29.104, |
|
"eval_steps_per_second": 0.321, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0622568093385214, |
|
"grad_norm": 0.5781293511390686, |
|
"learning_rate": 2e-05, |
|
"loss": 1.879, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0933852140077821, |
|
"grad_norm": 0.5720934271812439, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8848, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.1245136186770428, |
|
"grad_norm": 0.580179750919342, |
|
"learning_rate": 4e-05, |
|
"loss": 1.8845, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.1556420233463035, |
|
"grad_norm": 0.6264262795448303, |
|
"learning_rate": 5e-05, |
|
"loss": 1.8758, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.1867704280155642, |
|
"grad_norm": 0.643973708152771, |
|
"learning_rate": 6e-05, |
|
"loss": 1.8309, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.2178988326848249, |
|
"grad_norm": 0.6367993354797363, |
|
"learning_rate": 7e-05, |
|
"loss": 1.7743, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.2490272373540856, |
|
"grad_norm": 0.5833392143249512, |
|
"learning_rate": 8e-05, |
|
"loss": 1.6965, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.2490272373540856, |
|
"eval_loss": 1.5970573425292969, |
|
"eval_runtime": 34.32, |
|
"eval_samples_per_second": 29.05, |
|
"eval_steps_per_second": 0.321, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.2801556420233463, |
|
"grad_norm": 0.5133880972862244, |
|
"learning_rate": 9e-05, |
|
"loss": 1.5915, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.311284046692607, |
|
"grad_norm": 0.42409589886665344, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5128, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3424124513618677, |
|
"grad_norm": 0.3264746069908142, |
|
"learning_rate": 0.00011000000000000002, |
|
"loss": 1.4567, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.3735408560311284, |
|
"grad_norm": 0.2589164078235626, |
|
"learning_rate": 0.00012, |
|
"loss": 1.4249, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.4046692607003891, |
|
"grad_norm": 0.3931436538696289, |
|
"learning_rate": 0.00013000000000000002, |
|
"loss": 1.4125, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.4357976653696498, |
|
"grad_norm": 0.5455179810523987, |
|
"learning_rate": 0.00014, |
|
"loss": 1.4079, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.4669260700389105, |
|
"grad_norm": 0.5418187379837036, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 1.4031, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.4980544747081712, |
|
"grad_norm": 0.42387455701828003, |
|
"learning_rate": 0.00016, |
|
"loss": 1.3733, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.4980544747081712, |
|
"eval_loss": 1.3445571660995483, |
|
"eval_runtime": 34.3152, |
|
"eval_samples_per_second": 29.054, |
|
"eval_steps_per_second": 0.321, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.5291828793774319, |
|
"grad_norm": 0.2986687123775482, |
|
"learning_rate": 0.00017, |
|
"loss": 1.3505, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.5603112840466926, |
|
"grad_norm": 0.2099975347518921, |
|
"learning_rate": 0.00018, |
|
"loss": 1.3243, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.5914396887159533, |
|
"grad_norm": 0.16759291291236877, |
|
"learning_rate": 0.00019, |
|
"loss": 1.3056, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.622568093385214, |
|
"grad_norm": 0.16132138669490814, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3014, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.6536964980544747, |
|
"grad_norm": 0.17767557501792908, |
|
"learning_rate": 0.0001999145758387301, |
|
"loss": 1.2932, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.6848249027237354, |
|
"grad_norm": 0.19573098421096802, |
|
"learning_rate": 0.000199658449300667, |
|
"loss": 1.2771, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.7159533073929961, |
|
"grad_norm": 0.19915379583835602, |
|
"learning_rate": 0.0001992320579737045, |
|
"loss": 1.2762, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.7470817120622568, |
|
"grad_norm": 0.17230945825576782, |
|
"learning_rate": 0.00019863613034027224, |
|
"loss": 1.2466, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.7470817120622568, |
|
"eval_loss": 1.2462533712387085, |
|
"eval_runtime": 34.3129, |
|
"eval_samples_per_second": 29.056, |
|
"eval_steps_per_second": 0.321, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.7782101167315175, |
|
"grad_norm": 0.13044685125350952, |
|
"learning_rate": 0.00019787168453273544, |
|
"loss": 1.2402, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8093385214007782, |
|
"grad_norm": 0.09282781183719635, |
|
"learning_rate": 0.00019694002659393305, |
|
"loss": 1.234, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.8404669260700389, |
|
"grad_norm": 0.10575597733259201, |
|
"learning_rate": 0.0001958427482458253, |
|
"loss": 1.2214, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.8715953307392996, |
|
"grad_norm": 0.14210504293441772, |
|
"learning_rate": 0.00019458172417006347, |
|
"loss": 1.2185, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.9027237354085603, |
|
"grad_norm": 0.17919066548347473, |
|
"learning_rate": 0.0001931591088051279, |
|
"loss": 1.2025, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.933852140077821, |
|
"grad_norm": 0.16358336806297302, |
|
"learning_rate": 0.00019157733266550575, |
|
"loss": 1.2032, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.9649805447470817, |
|
"grad_norm": 0.13862887024879456, |
|
"learning_rate": 0.0001898390981891979, |
|
"loss": 1.197, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.9961089494163424, |
|
"grad_norm": 0.11003394424915314, |
|
"learning_rate": 0.0001879473751206489, |
|
"loss": 1.1852, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.9961089494163424, |
|
"eval_loss": 1.1821681261062622, |
|
"eval_runtime": 34.3117, |
|
"eval_samples_per_second": 29.057, |
|
"eval_steps_per_second": 0.321, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.027237354085603, |
|
"grad_norm": 0.08200129121541977, |
|
"learning_rate": 0.00018590539543698854, |
|
"loss": 1.178, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.0583657587548638, |
|
"grad_norm": 0.07455576211214066, |
|
"learning_rate": 0.00018371664782625287, |
|
"loss": 1.1725, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.0894941634241244, |
|
"grad_norm": 0.08433058857917786, |
|
"learning_rate": 0.0001813848717270195, |
|
"loss": 1.1569, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.1206225680933852, |
|
"grad_norm": 0.09246356040239334, |
|
"learning_rate": 0.00017891405093963938, |
|
"loss": 1.1627, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.1517509727626458, |
|
"grad_norm": 0.09312273561954498, |
|
"learning_rate": 0.00017630840681998066, |
|
"loss": 1.1526, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.1828793774319066, |
|
"grad_norm": 0.08373520523309708, |
|
"learning_rate": 0.00017357239106731317, |
|
"loss": 1.1456, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.2140077821011672, |
|
"grad_norm": 0.07111110538244247, |
|
"learning_rate": 0.00017071067811865476, |
|
"loss": 1.1531, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.245136186770428, |
|
"grad_norm": 0.06889671832323074, |
|
"learning_rate": 0.00016772815716257412, |
|
"loss": 1.1444, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.245136186770428, |
|
"eval_loss": 1.1379262208938599, |
|
"eval_runtime": 34.3236, |
|
"eval_samples_per_second": 29.047, |
|
"eval_steps_per_second": 0.32, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.2762645914396886, |
|
"grad_norm": 0.06582967936992645, |
|
"learning_rate": 0.00016462992378609407, |
|
"loss": 1.1335, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.3073929961089494, |
|
"grad_norm": 0.07529184967279434, |
|
"learning_rate": 0.0001614212712689668, |
|
"loss": 1.1292, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.3385214007782102, |
|
"grad_norm": 0.07816017419099808, |
|
"learning_rate": 0.00015810768154019385, |
|
"loss": 1.1293, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.3696498054474708, |
|
"grad_norm": 0.08063483238220215, |
|
"learning_rate": 0.00015469481581224272, |
|
"loss": 1.1161, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.4007782101167314, |
|
"grad_norm": 0.06947366893291473, |
|
"learning_rate": 0.00015118850490896012, |
|
"loss": 1.1168, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.4319066147859922, |
|
"grad_norm": 0.05603436380624771, |
|
"learning_rate": 0.00014759473930370736, |
|
"loss": 1.1147, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.463035019455253, |
|
"grad_norm": 0.055858004838228226, |
|
"learning_rate": 0.00014391965888473703, |
|
"loss": 1.1123, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.4941634241245136, |
|
"grad_norm": 0.0600324422121048, |
|
"learning_rate": 0.00014016954246529696, |
|
"loss": 1.0986, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.4941634241245136, |
|
"eval_loss": 1.1052128076553345, |
|
"eval_runtime": 34.2952, |
|
"eval_samples_per_second": 29.071, |
|
"eval_steps_per_second": 0.321, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.5252918287937742, |
|
"grad_norm": 0.0596173070371151, |
|
"learning_rate": 0.00013635079705638298, |
|
"loss": 1.0949, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.556420233463035, |
|
"grad_norm": 0.06981530040502548, |
|
"learning_rate": 0.00013246994692046836, |
|
"loss": 1.1, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.5875486381322959, |
|
"grad_norm": 0.058555856347084045, |
|
"learning_rate": 0.00012853362242491053, |
|
"loss": 1.0946, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.6186770428015564, |
|
"grad_norm": 0.052131447941064835, |
|
"learning_rate": 0.00012454854871407994, |
|
"loss": 1.096, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.649805447470817, |
|
"grad_norm": 0.05138020217418671, |
|
"learning_rate": 0.00012052153421956342, |
|
"loss": 1.0948, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.6809338521400778, |
|
"grad_norm": 0.055884215980768204, |
|
"learning_rate": 0.00011645945902807341, |
|
"loss": 1.0868, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.7120622568093387, |
|
"grad_norm": 0.056635960936546326, |
|
"learning_rate": 0.00011236926312693479, |
|
"loss": 1.0782, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.7431906614785992, |
|
"grad_norm": 0.05791952833533287, |
|
"learning_rate": 0.00010825793454723325, |
|
"loss": 1.0774, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.7431906614785992, |
|
"eval_loss": 1.0816473960876465, |
|
"eval_runtime": 34.308, |
|
"eval_samples_per_second": 29.06, |
|
"eval_steps_per_second": 0.321, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.7743190661478598, |
|
"grad_norm": 0.05655137449502945, |
|
"learning_rate": 0.00010413249742488131, |
|
"loss": 1.0793, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.8054474708171206, |
|
"grad_norm": 0.05930772423744202, |
|
"learning_rate": 0.0001, |
|
"loss": 1.0765, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.8365758754863815, |
|
"grad_norm": 0.056934159249067307, |
|
"learning_rate": 9.586750257511867e-05, |
|
"loss": 1.0825, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.867704280155642, |
|
"grad_norm": 0.05056174844503403, |
|
"learning_rate": 9.174206545276677e-05, |
|
"loss": 1.074, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.8988326848249026, |
|
"grad_norm": 0.05416735261678696, |
|
"learning_rate": 8.763073687306524e-05, |
|
"loss": 1.0731, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.9299610894941635, |
|
"grad_norm": 0.05306009575724602, |
|
"learning_rate": 8.35405409719266e-05, |
|
"loss": 1.0646, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.9610894941634243, |
|
"grad_norm": 0.054572440683841705, |
|
"learning_rate": 7.947846578043659e-05, |
|
"loss": 1.0697, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.9922178988326849, |
|
"grad_norm": 0.051973506808280945, |
|
"learning_rate": 7.54514512859201e-05, |
|
"loss": 1.065, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.9922178988326849, |
|
"eval_loss": 1.0657449960708618, |
|
"eval_runtime": 34.2892, |
|
"eval_samples_per_second": 29.076, |
|
"eval_steps_per_second": 0.321, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.0233463035019454, |
|
"grad_norm": 0.048152584582567215, |
|
"learning_rate": 7.146637757508949e-05, |
|
"loss": 1.0629, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.054474708171206, |
|
"grad_norm": 0.04994530603289604, |
|
"learning_rate": 6.753005307953167e-05, |
|
"loss": 1.0516, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.085603112840467, |
|
"grad_norm": 0.05009295791387558, |
|
"learning_rate": 6.3649202943617e-05, |
|
"loss": 1.0526, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.1167315175097277, |
|
"grad_norm": 0.05345555767416954, |
|
"learning_rate": 5.983045753470308e-05, |
|
"loss": 1.0553, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.1478599221789882, |
|
"grad_norm": 0.04756650701165199, |
|
"learning_rate": 5.608034111526298e-05, |
|
"loss": 1.059, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.178988326848249, |
|
"grad_norm": 0.04925397038459778, |
|
"learning_rate": 5.240526069629265e-05, |
|
"loss": 1.0508, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.21011673151751, |
|
"grad_norm": 0.05096421390771866, |
|
"learning_rate": 4.8811495091039926e-05, |
|
"loss": 1.0472, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.2412451361867705, |
|
"grad_norm": 0.047330863773822784, |
|
"learning_rate": 4.530518418775733e-05, |
|
"loss": 1.055, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.2412451361867705, |
|
"eval_loss": 1.0550851821899414, |
|
"eval_runtime": 34.2738, |
|
"eval_samples_per_second": 29.089, |
|
"eval_steps_per_second": 0.321, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.272373540856031, |
|
"grad_norm": 0.04690932855010033, |
|
"learning_rate": 4.189231845980618e-05, |
|
"loss": 1.0495, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.3035019455252916, |
|
"grad_norm": 0.04692551866173744, |
|
"learning_rate": 3.857872873103322e-05, |
|
"loss": 1.0561, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.3346303501945527, |
|
"grad_norm": 0.04910856485366821, |
|
"learning_rate": 3.53700762139059e-05, |
|
"loss": 1.0459, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.3657587548638133, |
|
"grad_norm": 0.04869484528899193, |
|
"learning_rate": 3.227184283742591e-05, |
|
"loss": 1.0373, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.396887159533074, |
|
"grad_norm": 0.045992154628038406, |
|
"learning_rate": 2.9289321881345254e-05, |
|
"loss": 1.0306, |
|
"step": 77 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 96, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 11, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.2048140774938247e+19, |
|
"train_batch_size": 12, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|