|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.964622641509434,
  "eval_steps": 500,
  "global_step": 424,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009433962264150943,
      "grad_norm": 0.1552734375,
      "learning_rate": 2e-05,
      "loss": 1.6292,
      "step": 1
    },
    {
      "epoch": 0.018867924528301886,
      "grad_norm": 0.1416015625,
      "learning_rate": 4e-05,
      "loss": 1.8541,
      "step": 2
    },
    {
      "epoch": 0.02830188679245283,
      "grad_norm": 0.162109375,
      "learning_rate": 6e-05,
      "loss": 1.8592,
      "step": 3
    },
    {
      "epoch": 0.03773584905660377,
      "grad_norm": 0.1572265625,
      "learning_rate": 8e-05,
      "loss": 1.8298,
      "step": 4
    },
    {
      "epoch": 0.04716981132075472,
      "grad_norm": 0.1884765625,
      "learning_rate": 0.0001,
      "loss": 1.801,
      "step": 5
    },
    {
      "epoch": 0.05660377358490566,
      "grad_norm": 0.1953125,
      "learning_rate": 0.00012,
      "loss": 1.7895,
      "step": 6
    },
    {
      "epoch": 0.0660377358490566,
      "grad_norm": 0.2001953125,
      "learning_rate": 0.00014,
      "loss": 1.7302,
      "step": 7
    },
    {
      "epoch": 0.07547169811320754,
      "grad_norm": 0.1787109375,
      "learning_rate": 0.00016,
      "loss": 1.7972,
      "step": 8
    },
    {
      "epoch": 0.08490566037735849,
      "grad_norm": 0.185546875,
      "learning_rate": 0.00018,
      "loss": 1.5813,
      "step": 9
    },
    {
      "epoch": 0.09433962264150944,
      "grad_norm": 0.296875,
      "learning_rate": 0.0002,
      "loss": 1.6154,
      "step": 10
    },
    {
      "epoch": 0.10377358490566038,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00019999712083215463,
      "loss": 1.7572,
      "step": 11
    },
    {
      "epoch": 0.11320754716981132,
      "grad_norm": 0.1669921875,
      "learning_rate": 0.00019998848349441062,
      "loss": 1.7039,
      "step": 12
    },
    {
      "epoch": 0.12264150943396226,
      "grad_norm": 0.13671875,
      "learning_rate": 0.00019997408848413493,
      "loss": 1.541,
      "step": 13
    },
    {
      "epoch": 0.1320754716981132,
      "grad_norm": 0.16015625,
      "learning_rate": 0.00019995393663024054,
      "loss": 1.6942,
      "step": 14
    },
    {
      "epoch": 0.14150943396226415,
      "grad_norm": 0.1396484375,
      "learning_rate": 0.0001999280290931388,
      "loss": 1.6445,
      "step": 15
    },
    {
      "epoch": 0.1509433962264151,
      "grad_norm": 0.138671875,
      "learning_rate": 0.00019989636736467278,
      "loss": 1.6148,
      "step": 16
    },
    {
      "epoch": 0.16037735849056603,
      "grad_norm": 0.1357421875,
      "learning_rate": 0.00019985895326803097,
      "loss": 1.5653,
      "step": 17
    },
    {
      "epoch": 0.16981132075471697,
      "grad_norm": 0.1279296875,
      "learning_rate": 0.00019981578895764273,
      "loss": 1.5683,
      "step": 18
    },
    {
      "epoch": 0.1792452830188679,
      "grad_norm": 0.1494140625,
      "learning_rate": 0.00019976687691905393,
      "loss": 1.4737,
      "step": 19
    },
    {
      "epoch": 0.18867924528301888,
      "grad_norm": 0.169921875,
      "learning_rate": 0.00019971221996878394,
      "loss": 1.595,
      "step": 20
    },
    {
      "epoch": 0.19811320754716982,
      "grad_norm": 0.1533203125,
      "learning_rate": 0.0001996518212541634,
      "loss": 1.5053,
      "step": 21
    },
    {
      "epoch": 0.20754716981132076,
      "grad_norm": 0.146484375,
      "learning_rate": 0.00019958568425315314,
      "loss": 1.5134,
      "step": 22
    },
    {
      "epoch": 0.2169811320754717,
      "grad_norm": 0.1494140625,
      "learning_rate": 0.0001995138127741436,
      "loss": 1.5173,
      "step": 23
    },
    {
      "epoch": 0.22641509433962265,
      "grad_norm": 0.146484375,
      "learning_rate": 0.00019943621095573586,
      "loss": 1.5506,
      "step": 24
    },
    {
      "epoch": 0.2358490566037736,
      "grad_norm": 0.134765625,
      "learning_rate": 0.00019935288326650312,
      "loss": 1.5697,
      "step": 25
    },
    {
      "epoch": 0.24528301886792453,
      "grad_norm": 0.14453125,
      "learning_rate": 0.00019926383450473344,
      "loss": 1.4924,
      "step": 26
    },
    {
      "epoch": 0.25471698113207547,
      "grad_norm": 0.134765625,
      "learning_rate": 0.00019916906979815347,
      "loss": 1.5674,
      "step": 27
    },
    {
      "epoch": 0.2641509433962264,
      "grad_norm": 0.1474609375,
      "learning_rate": 0.00019906859460363307,
      "loss": 1.5699,
      "step": 28
    },
    {
      "epoch": 0.27358490566037735,
      "grad_norm": 0.1396484375,
      "learning_rate": 0.0001989624147068713,
      "loss": 1.5094,
      "step": 29
    },
    {
      "epoch": 0.2830188679245283,
      "grad_norm": 0.142578125,
      "learning_rate": 0.00019885053622206304,
      "loss": 1.5293,
      "step": 30
    },
    {
      "epoch": 0.29245283018867924,
      "grad_norm": 0.15234375,
      "learning_rate": 0.00019873296559154698,
      "loss": 1.5266,
      "step": 31
    },
    {
      "epoch": 0.3018867924528302,
      "grad_norm": 0.1416015625,
      "learning_rate": 0.0001986097095854347,
      "loss": 1.5474,
      "step": 32
    },
    {
      "epoch": 0.3113207547169811,
      "grad_norm": 0.1552734375,
      "learning_rate": 0.00019848077530122083,
      "loss": 1.5738,
      "step": 33
    },
    {
      "epoch": 0.32075471698113206,
      "grad_norm": 0.1630859375,
      "learning_rate": 0.0001983461701633742,
      "loss": 1.5296,
      "step": 34
    },
    {
      "epoch": 0.330188679245283,
      "grad_norm": 0.1533203125,
      "learning_rate": 0.0001982059019229106,
      "loss": 1.5545,
      "step": 35
    },
    {
      "epoch": 0.33962264150943394,
      "grad_norm": 0.1728515625,
      "learning_rate": 0.00019805997865694614,
      "loss": 1.5179,
      "step": 36
    },
    {
      "epoch": 0.3490566037735849,
      "grad_norm": 0.1708984375,
      "learning_rate": 0.00019790840876823232,
      "loss": 1.4534,
      "step": 37
    },
    {
      "epoch": 0.3584905660377358,
      "grad_norm": 0.162109375,
      "learning_rate": 0.0001977512009846721,
      "loss": 1.4827,
      "step": 38
    },
    {
      "epoch": 0.36792452830188677,
      "grad_norm": 0.1689453125,
      "learning_rate": 0.00019758836435881746,
      "loss": 1.4441,
      "step": 39
    },
    {
      "epoch": 0.37735849056603776,
      "grad_norm": 0.169921875,
      "learning_rate": 0.00019741990826734794,
      "loss": 1.5114,
      "step": 40
    },
    {
      "epoch": 0.3867924528301887,
      "grad_norm": 0.1552734375,
      "learning_rate": 0.0001972458424105307,
      "loss": 1.5024,
      "step": 41
    },
    {
      "epoch": 0.39622641509433965,
      "grad_norm": 0.162109375,
      "learning_rate": 0.00019706617681166218,
      "loss": 1.5126,
      "step": 42
    },
    {
      "epoch": 0.4056603773584906,
      "grad_norm": 0.1650390625,
      "learning_rate": 0.00019688092181649065,
      "loss": 1.5009,
      "step": 43
    },
    {
      "epoch": 0.41509433962264153,
      "grad_norm": 0.171875,
      "learning_rate": 0.00019669008809262062,
      "loss": 1.5124,
      "step": 44
    },
    {
      "epoch": 0.42452830188679247,
      "grad_norm": 0.1611328125,
      "learning_rate": 0.00019649368662889855,
      "loss": 1.4302,
      "step": 45
    },
    {
      "epoch": 0.4339622641509434,
      "grad_norm": 0.169921875,
      "learning_rate": 0.00019629172873477995,
      "loss": 1.4321,
      "step": 46
    },
    {
      "epoch": 0.44339622641509435,
      "grad_norm": 0.1708984375,
      "learning_rate": 0.00019608422603967836,
      "loss": 1.4764,
      "step": 47
    },
    {
      "epoch": 0.4528301886792453,
      "grad_norm": 0.1591796875,
      "learning_rate": 0.00019587119049229557,
      "loss": 1.5262,
      "step": 48
    },
    {
      "epoch": 0.46226415094339623,
      "grad_norm": 0.1591796875,
      "learning_rate": 0.0001956526343599335,
      "loss": 1.3968,
      "step": 49
    },
    {
      "epoch": 0.4716981132075472,
      "grad_norm": 0.177734375,
      "learning_rate": 0.0001954285702277879,
      "loss": 1.4958,
      "step": 50
    },
    {
      "epoch": 0.4811320754716981,
      "grad_norm": 0.169921875,
      "learning_rate": 0.00019519901099822372,
      "loss": 1.472,
      "step": 51
    },
    {
      "epoch": 0.49056603773584906,
      "grad_norm": 0.1630859375,
      "learning_rate": 0.00019496396989003193,
      "loss": 1.3241,
      "step": 52
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.1640625,
      "learning_rate": 0.00019472346043766865,
      "loss": 1.4498,
      "step": 53
    },
    {
      "epoch": 0.5094339622641509,
      "grad_norm": 0.1552734375,
      "learning_rate": 0.00019447749649047542,
      "loss": 1.4044,
      "step": 54
    },
    {
      "epoch": 0.5188679245283019,
      "grad_norm": 0.1767578125,
      "learning_rate": 0.00019422609221188207,
      "loss": 1.4725,
      "step": 55
    },
    {
      "epoch": 0.5283018867924528,
      "grad_norm": 0.1689453125,
      "learning_rate": 0.00019396926207859084,
      "loss": 1.3035,
      "step": 56
    },
    {
      "epoch": 0.5377358490566038,
      "grad_norm": 0.1669921875,
      "learning_rate": 0.00019370702087974302,
      "loss": 1.4331,
      "step": 57
    },
    {
      "epoch": 0.5471698113207547,
      "grad_norm": 0.185546875,
      "learning_rate": 0.00019343938371606712,
      "loss": 1.4268,
      "step": 58
    },
    {
      "epoch": 0.5566037735849056,
      "grad_norm": 0.1748046875,
      "learning_rate": 0.00019316636599900946,
      "loss": 1.4898,
      "step": 59
    },
    {
      "epoch": 0.5660377358490566,
      "grad_norm": 0.1748046875,
      "learning_rate": 0.00019288798344984672,
      "loss": 1.4587,
      "step": 60
    },
    {
      "epoch": 0.5754716981132075,
      "grad_norm": 0.1796875,
      "learning_rate": 0.00019260425209878052,
      "loss": 1.2718,
      "step": 61
    },
    {
      "epoch": 0.5849056603773585,
      "grad_norm": 0.1708984375,
      "learning_rate": 0.00019231518828401458,
      "loss": 1.4943,
      "step": 62
    },
    {
      "epoch": 0.5943396226415094,
      "grad_norm": 0.1708984375,
      "learning_rate": 0.00019202080865081368,
      "loss": 1.4563,
      "step": 63
    },
    {
      "epoch": 0.6037735849056604,
      "grad_norm": 0.1767578125,
      "learning_rate": 0.00019172113015054532,
      "loss": 1.5265,
      "step": 64
    },
    {
      "epoch": 0.6132075471698113,
      "grad_norm": 0.1806640625,
      "learning_rate": 0.0001914161700397035,
      "loss": 1.3905,
      "step": 65
    },
    {
      "epoch": 0.6226415094339622,
      "grad_norm": 0.1962890625,
      "learning_rate": 0.00019110594587891519,
      "loss": 1.4768,
      "step": 66
    },
    {
      "epoch": 0.6320754716981132,
      "grad_norm": 0.1767578125,
      "learning_rate": 0.0001907904755319289,
      "loss": 1.3191,
      "step": 67
    },
    {
      "epoch": 0.6415094339622641,
      "grad_norm": 0.17578125,
      "learning_rate": 0.00019046977716458626,
      "loss": 1.3206,
      "step": 68
    },
    {
      "epoch": 0.6509433962264151,
      "grad_norm": 0.1748046875,
      "learning_rate": 0.00019014386924377582,
      "loss": 1.331,
      "step": 69
    },
    {
      "epoch": 0.660377358490566,
      "grad_norm": 0.17578125,
      "learning_rate": 0.0001898127705363696,
      "loss": 1.373,
      "step": 70
    },
    {
      "epoch": 0.6698113207547169,
      "grad_norm": 0.1796875,
      "learning_rate": 0.0001894765001081428,
      "loss": 1.4195,
      "step": 71
    },
    {
      "epoch": 0.6792452830188679,
      "grad_norm": 0.1845703125,
      "learning_rate": 0.0001891350773226754,
      "loss": 1.3853,
      "step": 72
    },
    {
      "epoch": 0.6886792452830188,
      "grad_norm": 0.1796875,
      "learning_rate": 0.0001887885218402375,
      "loss": 1.496,
      "step": 73
    },
    {
      "epoch": 0.6981132075471698,
      "grad_norm": 0.1806640625,
      "learning_rate": 0.00018843685361665723,
      "loss": 1.4916,
      "step": 74
    },
    {
      "epoch": 0.7075471698113207,
      "grad_norm": 0.1767578125,
      "learning_rate": 0.00018808009290217136,
      "loss": 1.2506,
      "step": 75
    },
    {
      "epoch": 0.7169811320754716,
      "grad_norm": 0.1826171875,
      "learning_rate": 0.00018771826024025946,
      "loss": 1.3273,
      "step": 76
    },
    {
      "epoch": 0.7264150943396226,
      "grad_norm": 0.1875,
      "learning_rate": 0.00018735137646646078,
      "loss": 1.343,
      "step": 77
    },
    {
      "epoch": 0.7358490566037735,
      "grad_norm": 0.17578125,
      "learning_rate": 0.00018697946270717467,
      "loss": 1.4744,
      "step": 78
    },
    {
      "epoch": 0.7452830188679245,
      "grad_norm": 0.1748046875,
      "learning_rate": 0.00018660254037844388,
      "loss": 1.3758,
      "step": 79
    },
    {
      "epoch": 0.7547169811320755,
      "grad_norm": 0.1728515625,
      "learning_rate": 0.00018622063118472134,
      "loss": 1.3836,
      "step": 80
    },
    {
      "epoch": 0.7641509433962265,
      "grad_norm": 0.1923828125,
      "learning_rate": 0.00018583375711762052,
      "loss": 1.347,
      "step": 81
    },
    {
      "epoch": 0.7735849056603774,
      "grad_norm": 0.1865234375,
      "learning_rate": 0.00018544194045464886,
      "loss": 1.4363,
      "step": 82
    },
    {
      "epoch": 0.7830188679245284,
      "grad_norm": 0.181640625,
      "learning_rate": 0.0001850452037579251,
      "loss": 1.4564,
      "step": 83
    },
    {
      "epoch": 0.7924528301886793,
      "grad_norm": 0.1845703125,
      "learning_rate": 0.00018464356987288013,
      "loss": 1.4342,
      "step": 84
    },
    {
      "epoch": 0.8018867924528302,
      "grad_norm": 0.1875,
      "learning_rate": 0.00018423706192694116,
      "loss": 1.4234,
      "step": 85
    },
    {
      "epoch": 0.8113207547169812,
      "grad_norm": 0.17578125,
      "learning_rate": 0.00018382570332820043,
      "loss": 1.3817,
      "step": 86
    },
    {
      "epoch": 0.8207547169811321,
      "grad_norm": 0.1875,
      "learning_rate": 0.00018340951776406694,
      "loss": 1.4327,
      "step": 87
    },
    {
      "epoch": 0.8301886792452831,
      "grad_norm": 0.17578125,
      "learning_rate": 0.00018298852919990252,
      "loss": 1.3911,
      "step": 88
    },
    {
      "epoch": 0.839622641509434,
      "grad_norm": 0.1796875,
      "learning_rate": 0.00018256276187764197,
      "loss": 1.4534,
      "step": 89
    },
    {
      "epoch": 0.8490566037735849,
      "grad_norm": 0.1787109375,
      "learning_rate": 0.0001821322403143969,
      "loss": 1.3534,
      "step": 90
    },
    {
      "epoch": 0.8584905660377359,
      "grad_norm": 0.185546875,
      "learning_rate": 0.0001816969893010442,
      "loss": 1.4886,
      "step": 91
    },
    {
      "epoch": 0.8679245283018868,
      "grad_norm": 0.1806640625,
      "learning_rate": 0.0001812570339007983,
      "loss": 1.3435,
      "step": 92
    },
    {
      "epoch": 0.8773584905660378,
      "grad_norm": 0.185546875,
      "learning_rate": 0.00018081239944776805,
      "loss": 1.2901,
      "step": 93
    },
    {
      "epoch": 0.8867924528301887,
      "grad_norm": 0.1845703125,
      "learning_rate": 0.00018036311154549784,
      "loss": 1.4074,
      "step": 94
    },
    {
      "epoch": 0.8962264150943396,
      "grad_norm": 0.181640625,
      "learning_rate": 0.00017990919606549328,
      "loss": 1.3902,
      "step": 95
    },
    {
      "epoch": 0.9056603773584906,
      "grad_norm": 0.181640625,
      "learning_rate": 0.00017945067914573146,
      "loss": 1.4234,
      "step": 96
    },
    {
      "epoch": 0.9150943396226415,
      "grad_norm": 0.1826171875,
      "learning_rate": 0.00017898758718915586,
      "loss": 1.2415,
      "step": 97
    },
    {
      "epoch": 0.9245283018867925,
      "grad_norm": 0.1826171875,
      "learning_rate": 0.0001785199468621559,
      "loss": 1.4503,
      "step": 98
    },
    {
      "epoch": 0.9339622641509434,
      "grad_norm": 0.18359375,
      "learning_rate": 0.00017804778509303138,
      "loss": 1.415,
      "step": 99
    },
    {
      "epoch": 0.9433962264150944,
      "grad_norm": 0.18359375,
      "learning_rate": 0.000177571129070442,
      "loss": 1.3419,
      "step": 100
    },
    {
      "epoch": 0.9528301886792453,
      "grad_norm": 0.1787109375,
      "learning_rate": 0.00017709000624184162,
      "loss": 1.4011,
      "step": 101
    },
    {
      "epoch": 0.9622641509433962,
      "grad_norm": 0.1796875,
      "learning_rate": 0.0001766044443118978,
      "loss": 1.3546,
      "step": 102
    },
    {
      "epoch": 0.9716981132075472,
      "grad_norm": 0.1865234375,
      "learning_rate": 0.00017611447124089649,
      "loss": 1.3989,
      "step": 103
    },
    {
      "epoch": 0.9811320754716981,
      "grad_norm": 0.1865234375,
      "learning_rate": 0.00017562011524313185,
      "loss": 1.3564,
      "step": 104
    },
    {
      "epoch": 0.9905660377358491,
      "grad_norm": 0.18359375,
      "learning_rate": 0.0001751214047852818,
      "loss": 1.3605,
      "step": 105
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.181640625,
      "learning_rate": 0.00017461836858476856,
      "loss": 1.4459,
      "step": 106
    },
    {
      "epoch": 1.009433962264151,
      "grad_norm": 0.189453125,
      "learning_rate": 0.00017411103560810526,
      "loss": 1.3842,
      "step": 107
    },
    {
      "epoch": 1.0070754716981132,
      "grad_norm": 0.2255859375,
      "learning_rate": 0.00017359943506922774,
      "loss": 1.3091,
      "step": 108
    },
    {
      "epoch": 1.0165094339622642,
      "grad_norm": 0.1865234375,
      "learning_rate": 0.00017308359642781242,
      "loss": 1.3549,
      "step": 109
    },
    {
      "epoch": 1.025943396226415,
      "grad_norm": 0.185546875,
      "learning_rate": 0.0001725635493875799,
      "loss": 1.2693,
      "step": 110
    },
    {
      "epoch": 1.0353773584905661,
      "grad_norm": 0.201171875,
      "learning_rate": 0.00017203932389458454,
      "loss": 1.2991,
      "step": 111
    },
    {
      "epoch": 1.044811320754717,
      "grad_norm": 0.2001953125,
      "learning_rate": 0.00017151095013548994,
      "loss": 1.379,
      "step": 112
    },
    {
      "epoch": 1.054245283018868,
      "grad_norm": 0.1982421875,
      "learning_rate": 0.0001709784585358309,
      "loss": 1.2804,
      "step": 113
    },
    {
      "epoch": 1.0636792452830188,
      "grad_norm": 0.1962890625,
      "learning_rate": 0.00017044187975826124,
      "loss": 1.2645,
      "step": 114
    },
    {
      "epoch": 1.0731132075471699,
      "grad_norm": 0.2001953125,
      "learning_rate": 0.00016990124470078822,
      "loss": 1.1872,
      "step": 115
    },
    {
      "epoch": 1.0825471698113207,
      "grad_norm": 0.2060546875,
      "learning_rate": 0.0001693565844949933,
      "loss": 1.2613,
      "step": 116
    },
    {
      "epoch": 1.0919811320754718,
      "grad_norm": 0.2109375,
      "learning_rate": 0.0001688079305042395,
      "loss": 1.2765,
      "step": 117
    },
    {
      "epoch": 1.1014150943396226,
      "grad_norm": 0.216796875,
      "learning_rate": 0.00016825531432186543,
      "loss": 1.2478,
      "step": 118
    },
    {
      "epoch": 1.1108490566037736,
      "grad_norm": 0.2119140625,
      "learning_rate": 0.0001676987677693659,
      "loss": 1.3338,
      "step": 119
    },
    {
      "epoch": 1.1202830188679245,
      "grad_norm": 0.2353515625,
      "learning_rate": 0.0001671383228945597,
      "loss": 1.2246,
      "step": 120
    },
    {
      "epoch": 1.1297169811320755,
      "grad_norm": 0.2197265625,
      "learning_rate": 0.00016657401196974405,
      "loss": 1.2763,
      "step": 121
    },
    {
      "epoch": 1.1391509433962264,
      "grad_norm": 0.220703125,
      "learning_rate": 0.00016600586748983641,
      "loss": 1.1468,
      "step": 122
    },
    {
      "epoch": 1.1485849056603774,
      "grad_norm": 0.2236328125,
      "learning_rate": 0.00016543392217050314,
      "loss": 1.3413,
      "step": 123
    },
    {
      "epoch": 1.1580188679245282,
      "grad_norm": 0.2314453125,
      "learning_rate": 0.0001648582089462756,
      "loss": 1.418,
      "step": 124
    },
    {
      "epoch": 1.1674528301886793,
      "grad_norm": 0.2353515625,
      "learning_rate": 0.00016427876096865394,
      "loss": 1.2445,
      "step": 125
    },
    {
      "epoch": 1.1768867924528301,
      "grad_norm": 0.2353515625,
      "learning_rate": 0.00016369561160419784,
      "loss": 1.23,
      "step": 126
    },
    {
      "epoch": 1.1863207547169812,
      "grad_norm": 0.232421875,
      "learning_rate": 0.00016310879443260528,
      "loss": 1.3247,
      "step": 127
    },
    {
      "epoch": 1.195754716981132,
      "grad_norm": 0.2265625,
      "learning_rate": 0.0001625183432447789,
      "loss": 1.2278,
      "step": 128
    },
    {
      "epoch": 1.205188679245283,
      "grad_norm": 0.2353515625,
      "learning_rate": 0.0001619242920408802,
      "loss": 1.1869,
      "step": 129
    },
    {
      "epoch": 1.2146226415094339,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.00016132667502837165,
      "loss": 1.2633,
      "step": 130
    },
    {
      "epoch": 1.224056603773585,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.00016072552662004696,
      "loss": 1.2912,
      "step": 131
    },
    {
      "epoch": 1.2334905660377358,
      "grad_norm": 0.236328125,
      "learning_rate": 0.00016012088143204953,
      "loss": 1.3428,
      "step": 132
    },
    {
      "epoch": 1.2429245283018868,
      "grad_norm": 0.2392578125,
      "learning_rate": 0.00015951277428187898,
      "loss": 1.2032,
      "step": 133
    },
    {
      "epoch": 1.2523584905660377,
      "grad_norm": 0.234375,
      "learning_rate": 0.00015890124018638638,
      "loss": 1.2246,
      "step": 134
    },
    {
      "epoch": 1.2617924528301887,
      "grad_norm": 0.2421875,
      "learning_rate": 0.00015828631435975784,
      "loss": 1.2724,
      "step": 135
    },
    {
      "epoch": 1.2712264150943398,
      "grad_norm": 0.2314453125,
      "learning_rate": 0.00015766803221148673,
      "loss": 1.2384,
      "step": 136
    },
    {
      "epoch": 1.2806603773584906,
      "grad_norm": 0.24609375,
      "learning_rate": 0.0001570464293443346,
      "loss": 1.2844,
      "step": 137
    },
    {
      "epoch": 1.2900943396226414,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.00015642154155228122,
      "loss": 1.2727,
      "step": 138
    },
    {
      "epoch": 1.2995283018867925,
      "grad_norm": 0.244140625,
      "learning_rate": 0.00015579340481846336,
      "loss": 1.2884,
      "step": 139
    },
    {
      "epoch": 1.3089622641509435,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.00015516205531310273,
      "loss": 1.3399,
      "step": 140
    },
    {
      "epoch": 1.3183962264150944,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.00015452752939142328,
      "loss": 1.3155,
      "step": 141
    },
    {
      "epoch": 1.3278301886792452,
      "grad_norm": 0.23046875,
      "learning_rate": 0.00015388986359155758,
      "loss": 1.1984,
      "step": 142
    },
    {
      "epoch": 1.3372641509433962,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00015324909463244296,
      "loss": 1.1836,
      "step": 143
    },
    {
      "epoch": 1.3466981132075473,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.00015260525941170712,
      "loss": 1.27,
      "step": 144
    },
    {
      "epoch": 1.3561320754716981,
      "grad_norm": 0.234375,
      "learning_rate": 0.00015195839500354335,
      "loss": 1.3095,
      "step": 145
    },
    {
      "epoch": 1.365566037735849,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.0001513085386565758,
      "loss": 1.168,
      "step": 146
    },
    {
      "epoch": 1.375,
      "grad_norm": 0.2333984375,
      "learning_rate": 0.00015065572779171432,
      "loss": 1.2784,
      "step": 147
    },
    {
      "epoch": 1.384433962264151,
      "grad_norm": 0.232421875,
      "learning_rate": 0.00015000000000000001,
      "loss": 1.2809,
      "step": 148
    },
    {
      "epoch": 1.3938679245283019,
      "grad_norm": 0.2431640625,
      "learning_rate": 0.00014934139304044033,
      "loss": 1.3085,
      "step": 149
    },
    {
      "epoch": 1.4033018867924527,
      "grad_norm": 0.228515625,
      "learning_rate": 0.00014867994483783485,
      "loss": 1.214,
      "step": 150
    },
    {
      "epoch": 1.4127358490566038,
      "grad_norm": 0.234375,
      "learning_rate": 0.00014801569348059157,
      "loss": 1.2022,
      "step": 151
    },
    {
      "epoch": 1.4221698113207548,
      "grad_norm": 0.2314453125,
      "learning_rate": 0.0001473486772185334,
      "loss": 1.2022,
      "step": 152
    },
    {
      "epoch": 1.4316037735849056,
      "grad_norm": 0.23828125,
      "learning_rate": 0.00014667893446069588,
      "loss": 1.3266,
      "step": 153
    },
    {
      "epoch": 1.4410377358490565,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.00014600650377311522,
      "loss": 1.2686,
      "step": 154
    },
    {
      "epoch": 1.4504716981132075,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.00014533142387660773,
      "loss": 1.2602,
      "step": 155
    },
    {
      "epoch": 1.4599056603773586,
      "grad_norm": 0.23828125,
      "learning_rate": 0.00014465373364454001,
      "loss": 1.2541,
      "step": 156
    },
    {
      "epoch": 1.4693396226415094,
      "grad_norm": 0.2421875,
      "learning_rate": 0.00014397347210059057,
      "loss": 1.1867,
      "step": 157
    },
    {
      "epoch": 1.4787735849056602,
      "grad_norm": 0.2353515625,
      "learning_rate": 0.00014329067841650274,
      "loss": 1.3164,
      "step": 158
    },
    {
      "epoch": 1.4882075471698113,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00014260539190982886,
      "loss": 1.3074,
      "step": 159
    },
    {
      "epoch": 1.4976415094339623,
      "grad_norm": 0.244140625,
      "learning_rate": 0.00014191765204166643,
      "loss": 1.2422,
      "step": 160
    },
    {
      "epoch": 1.5070754716981132,
      "grad_norm": 0.23828125,
      "learning_rate": 0.00014122749841438575,
      "loss": 1.2188,
      "step": 161
    },
    {
      "epoch": 1.516509433962264,
      "grad_norm": 0.2373046875,
      "learning_rate": 0.00014053497076934948,
      "loss": 1.2709,
      "step": 162
    },
    {
      "epoch": 1.525943396226415,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.00013984010898462416,
      "loss": 1.2864,
      "step": 163
    },
    {
      "epoch": 1.5353773584905661,
      "grad_norm": 0.23828125,
      "learning_rate": 0.00013914295307268396,
      "loss": 1.2356,
      "step": 164
    },
    {
      "epoch": 1.544811320754717,
      "grad_norm": 0.2373046875,
      "learning_rate": 0.0001384435431781065,
      "loss": 1.2602,
      "step": 165
    },
    {
      "epoch": 1.5542452830188678,
      "grad_norm": 0.2373046875,
      "learning_rate": 0.00013774191957526143,
      "loss": 1.266,
      "step": 166
    },
    {
      "epoch": 1.5636792452830188,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.00013703812266599113,
      "loss": 1.2911,
      "step": 167
    },
    {
      "epoch": 1.5731132075471699,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.00013633219297728416,
      "loss": 1.3344,
      "step": 168
    },
    {
      "epoch": 1.5825471698113207,
      "grad_norm": 0.2431640625,
      "learning_rate": 0.00013562417115894172,
      "loss": 1.1809,
      "step": 169
    },
    {
      "epoch": 1.5919811320754715,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00013491409798123687,
      "loss": 1.2117,
      "step": 170
    },
    {
      "epoch": 1.6014150943396226,
      "grad_norm": 0.234375,
      "learning_rate": 0.00013420201433256689,
      "loss": 1.1622,
      "step": 171
    },
    {
      "epoch": 1.6108490566037736,
      "grad_norm": 0.2373046875,
      "learning_rate": 0.00013348796121709862,
      "loss": 1.1991,
      "step": 172
    },
    {
      "epoch": 1.6202830188679245,
      "grad_norm": 0.25,
      "learning_rate": 0.0001327719797524075,
      "loss": 1.0869,
      "step": 173
    },
    {
      "epoch": 1.6297169811320755,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.00013205411116710972,
      "loss": 1.2271,
      "step": 174
    },
    {
      "epoch": 1.6391509433962264,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.00013133439679848823,
      "loss": 1.3048,
      "step": 175
    },
    {
      "epoch": 1.6485849056603774,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00013061287809011242,
      "loss": 1.1293,
      "step": 176
    },
    {
      "epoch": 1.6580188679245285,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.0001298895965894516,
      "loss": 1.2416,
      "step": 177
    },
    {
      "epoch": 1.6674528301886793,
      "grad_norm": 0.2392578125,
      "learning_rate": 0.0001291645939454825,
      "loss": 1.2559,
      "step": 178
    },
    {
      "epoch": 1.6768867924528301,
      "grad_norm": 0.26171875,
      "learning_rate": 0.0001284379119062912,
      "loss": 1.2819,
      "step": 179
    },
    {
      "epoch": 1.6863207547169812,
      "grad_norm": 0.2421875,
      "learning_rate": 0.0001277095923166689,
      "loss": 1.2085,
      "step": 180
    },
    {
      "epoch": 1.6957547169811322,
      "grad_norm": 0.25,
      "learning_rate": 0.00012697967711570242,
      "loss": 1.2698,
      "step": 181
    },
    {
      "epoch": 1.705188679245283,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00012624820833435937,
      "loss": 1.2076,
      "step": 182
    },
    {
      "epoch": 1.7146226415094339,
      "grad_norm": 0.25390625,
      "learning_rate": 0.0001255152280930676,
      "loss": 1.2315,
      "step": 183
    },
    {
      "epoch": 1.724056603773585,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00012478077859929,
      "loss": 1.2389,
      "step": 184
    },
    {
      "epoch": 1.733490566037736,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00012404490214509386,
      "loss": 1.2874,
      "step": 185
    },
    {
      "epoch": 1.7429245283018868,
      "grad_norm": 0.234375,
      "learning_rate": 0.00012330764110471566,
      "loss": 1.1518,
      "step": 186
    },
    {
      "epoch": 1.7523584905660377,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00012256903793212107,
      "loss": 1.2779,
      "step": 187
    },
    {
      "epoch": 1.7617924528301887,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.00012182913515856015,
      "loss": 1.1925,
      "step": 188
    },
    {
      "epoch": 1.7712264150943398,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00012108797539011847,
      "loss": 1.2957,
      "step": 189
    },
    {
      "epoch": 1.7806603773584906,
      "grad_norm": 0.255859375,
      "learning_rate": 0.0001203456013052634,
      "loss": 1.2427,
      "step": 190
    },
    {
      "epoch": 1.7900943396226414,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.00011960205565238684,
      "loss": 1.0626,
      "step": 191
    },
    {
      "epoch": 1.7995283018867925,
      "grad_norm": 0.265625,
      "learning_rate": 0.00011885738124734358,
      "loss": 1.2465,
      "step": 192
    },
    {
      "epoch": 1.8089622641509435,
      "grad_norm": 0.2421875,
      "learning_rate": 0.00011811162097098558,
      "loss": 1.1583,
      "step": 193
    },
    {
      "epoch": 1.8183962264150944,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00011736481776669306,
      "loss": 1.2873,
      "step": 194
    },
    {
      "epoch": 1.8278301886792452,
      "grad_norm": 0.25,
      "learning_rate": 0.00011661701463790142,
      "loss": 1.2776,
      "step": 195
    },
    {
      "epoch": 1.8372641509433962,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00011586825464562514,
      "loss": 1.1212,
      "step": 196
    },
    {
      "epoch": 1.8466981132075473,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.0001151185809059781,
      "loss": 1.1373,
      "step": 197
    },
    {
      "epoch": 1.8561320754716981,
      "grad_norm": 0.244140625,
      "learning_rate": 0.00011436803658769082,
      "loss": 1.2672,
      "step": 198
    },
    {
      "epoch": 1.865566037735849,
      "grad_norm": 0.25,
      "learning_rate": 0.00011361666490962468,
      "loss": 1.2042,
      "step": 199
    },
    {
      "epoch": 1.875,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00011286450913828312,
      "loss": 1.2579,
      "step": 200
    },
    {
      "epoch": 1.884433962264151,
      "grad_norm": 0.240234375,
      "learning_rate": 0.00011211161258532041,
      "loss": 1.2116,
      "step": 201
    },
    {
      "epoch": 1.8938679245283019,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00011135801860504749,
      "loss": 1.2045,
      "step": 202
    },
    {
      "epoch": 1.9033018867924527,
      "grad_norm": 0.248046875,
      "learning_rate": 0.00011060377059193547,
      "loss": 1.1928,
      "step": 203
    },
    {
      "epoch": 1.9127358490566038,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00010984891197811687,
      "loss": 1.3457,
      "step": 204
    },
    {
      "epoch": 1.9221698113207548,
      "grad_norm": 0.25390625,
      "learning_rate": 0.0001090934862308847,
      "loss": 1.2525,
      "step": 205
    },
    {
      "epoch": 1.9316037735849056,
      "grad_norm": 0.240234375,
      "learning_rate": 0.00010833753685018935,
      "loss": 1.2322,
      "step": 206
    },
    {
      "epoch": 1.9410377358490565,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.00010758110736613385,
      "loss": 1.2157,
      "step": 207
    },
    {
      "epoch": 1.9504716981132075,
      "grad_norm": 0.251953125,
      "learning_rate": 0.0001068242413364671,
      "loss": 1.1082,
      "step": 208
    },
    {
      "epoch": 1.9599056603773586,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00010606698234407586,
      "loss": 1.2336,
      "step": 209
    },
    {
      "epoch": 1.9693396226415094,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00010530937399447496,
      "loss": 1.1619,
      "step": 210
    },
    {
      "epoch": 1.9787735849056602,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00010455145991329638,
      "loss": 1.2344,
      "step": 211
    },
    {
      "epoch": 1.9882075471698113,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00010379328374377715,
      "loss": 1.2143,
      "step": 212
    },
    {
      "epoch": 1.9976415094339623,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00010303488914424624,
      "loss": 1.231,
      "step": 213
    },
    {
      "epoch": 2.007075471698113,
      "grad_norm": 0.2392578125,
      "learning_rate": 0.00010227631978561056,
      "loss": 1.1562,
      "step": 214
    },
    {
      "epoch": 2.0023584905660377,
      "grad_norm": 0.302734375,
      "learning_rate": 0.00010151761934884028,
      "loss": 1.1339,
      "step": 215
    },
    {
      "epoch": 2.0117924528301887,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00010075883152245334,
      "loss": 1.1717,
      "step": 216
    },
    {
      "epoch": 2.0212264150943398,
      "grad_norm": 0.27734375,
      "learning_rate": 0.0001,
      "loss": 1.1575,
      "step": 217
    },
    {
      "epoch": 2.0306603773584904,
      "grad_norm": 0.265625,
      "learning_rate": 9.92411684775467e-05,
      "loss": 1.0906,
      "step": 218
    },
    {
      "epoch": 2.0400943396226414,
      "grad_norm": 0.27734375,
      "learning_rate": 9.848238065115975e-05,
      "loss": 1.1255,
      "step": 219
    },
    {
      "epoch": 2.0495283018867925,
      "grad_norm": 0.314453125,
      "learning_rate": 9.772368021438943e-05,
      "loss": 1.0733,
      "step": 220
    },
    {
      "epoch": 2.0589622641509435,
      "grad_norm": 0.328125,
      "learning_rate": 9.696511085575377e-05,
      "loss": 1.1415,
      "step": 221
    },
    {
      "epoch": 2.068396226415094,
      "grad_norm": 0.2890625,
      "learning_rate": 9.620671625622288e-05,
      "loss": 1.0607,
      "step": 222
    },
    {
      "epoch": 2.077830188679245,
      "grad_norm": 0.26953125,
      "learning_rate": 9.544854008670367e-05,
      "loss": 1.0438,
      "step": 223
    },
    {
      "epoch": 2.0872641509433962,
      "grad_norm": 0.298828125,
      "learning_rate": 9.469062600552509e-05,
      "loss": 1.1176,
      "step": 224
    },
    {
      "epoch": 2.0966981132075473,
      "grad_norm": 0.296875,
      "learning_rate": 9.393301765592415e-05,
      "loss": 1.0588,
      "step": 225
    },
    {
      "epoch": 2.106132075471698,
      "grad_norm": 0.291015625,
      "learning_rate": 9.317575866353292e-05,
      "loss": 1.0319,
      "step": 226
    },
    {
      "epoch": 2.115566037735849,
      "grad_norm": 0.279296875,
      "learning_rate": 9.241889263386618e-05,
      "loss": 1.1098,
      "step": 227
    },
    {
      "epoch": 2.125,
      "grad_norm": 0.298828125,
      "learning_rate": 9.166246314981066e-05,
      "loss": 1.1213,
      "step": 228
    },
    {
      "epoch": 2.134433962264151,
      "grad_norm": 0.298828125,
      "learning_rate": 9.09065137691153e-05,
      "loss": 1.0372,
      "step": 229
    },
    {
      "epoch": 2.143867924528302,
      "grad_norm": 0.318359375,
      "learning_rate": 9.015108802188313e-05,
      "loss": 1.1706,
      "step": 230
    },
    {
      "epoch": 2.1533018867924527,
      "grad_norm": 0.3125,
      "learning_rate": 8.939622940806455e-05,
      "loss": 1.0834,
      "step": 231
    },
    {
      "epoch": 2.1627358490566038,
      "grad_norm": 0.302734375,
      "learning_rate": 8.86419813949525e-05,
      "loss": 1.044,
      "step": 232
    },
    {
      "epoch": 2.172169811320755,
      "grad_norm": 0.2890625,
      "learning_rate": 8.788838741467962e-05,
      "loss": 1.0498,
      "step": 233
    },
    {
      "epoch": 2.1816037735849054,
      "grad_norm": 0.294921875,
      "learning_rate": 8.713549086171691e-05,
      "loss": 1.0347,
      "step": 234
    },
    {
      "epoch": 2.1910377358490565,
      "grad_norm": 0.28125,
      "learning_rate": 8.638333509037536e-05,
      "loss": 1.0902,
      "step": 235
    },
    {
      "epoch": 2.2004716981132075,
      "grad_norm": 0.29296875,
      "learning_rate": 8.563196341230919e-05,
      "loss": 0.9978,
      "step": 236
    },
    {
      "epoch": 2.2099056603773586,
      "grad_norm": 0.302734375,
      "learning_rate": 8.488141909402191e-05,
      "loss": 1.1299,
      "step": 237
    },
    {
      "epoch": 2.2193396226415096,
      "grad_norm": 0.287109375,
      "learning_rate": 8.413174535437487e-05,
      "loss": 0.9632,
      "step": 238
    },
    {
      "epoch": 2.2287735849056602,
      "grad_norm": 0.29296875,
      "learning_rate": 8.33829853620986e-05,
      "loss": 1.029,
      "step": 239
    },
    {
      "epoch": 2.2382075471698113,
      "grad_norm": 0.31640625,
      "learning_rate": 8.263518223330697e-05,
      "loss": 1.0729,
      "step": 240
    },
    {
      "epoch": 2.2476415094339623,
      "grad_norm": 0.298828125,
      "learning_rate": 8.188837902901442e-05,
      "loss": 1.0296,
      "step": 241
    },
    {
      "epoch": 2.2570754716981134,
      "grad_norm": 0.3125,
      "learning_rate": 8.114261875265643e-05,
      "loss": 1.0439,
      "step": 242
    },
    {
      "epoch": 2.266509433962264,
      "grad_norm": 0.302734375,
      "learning_rate": 8.039794434761318e-05,
      "loss": 1.0191,
      "step": 243
    },
    {
      "epoch": 2.275943396226415,
      "grad_norm": 0.328125,
      "learning_rate": 7.965439869473664e-05,
      "loss": 1.1443,
      "step": 244
    },
    {
      "epoch": 2.285377358490566,
      "grad_norm": 0.310546875,
      "learning_rate": 7.891202460988158e-05,
      "loss": 1.1637,
      "step": 245
    },
    {
      "epoch": 2.294811320754717,
      "grad_norm": 0.298828125,
      "learning_rate": 7.817086484143986e-05,
      "loss": 1.1807,
      "step": 246
    },
    {
      "epoch": 2.3042452830188678,
      "grad_norm": 0.322265625,
      "learning_rate": 7.743096206787894e-05,
      "loss": 1.0508,
      "step": 247
    },
    {
      "epoch": 2.313679245283019,
      "grad_norm": 0.310546875,
      "learning_rate": 7.669235889528436e-05,
      "loss": 1.0764,
      "step": 248
    },
    {
      "epoch": 2.32311320754717,
      "grad_norm": 0.298828125,
      "learning_rate": 7.595509785490617e-05,
      "loss": 1.0247,
      "step": 249
    },
    {
      "epoch": 2.332547169811321,
      "grad_norm": 0.310546875,
      "learning_rate": 7.521922140071002e-05,
      "loss": 1.0341,
      "step": 250
    },
    {
      "epoch": 2.3419811320754715,
      "grad_norm": 0.306640625,
      "learning_rate": 7.448477190693238e-05,
      "loss": 1.0841,
      "step": 251
    },
    {
      "epoch": 2.3514150943396226,
      "grad_norm": 0.302734375,
      "learning_rate": 7.375179166564063e-05,
      "loss": 1.001,
      "step": 252
    },
    {
      "epoch": 2.3608490566037736,
      "grad_norm": 0.3125,
      "learning_rate": 7.302032288429756e-05,
      "loss": 1.073,
      "step": 253
    },
    {
      "epoch": 2.3702830188679247,
      "grad_norm": 0.318359375,
      "learning_rate": 7.229040768333115e-05,
      "loss": 1.139,
      "step": 254
    },
    {
      "epoch": 2.3797169811320753,
      "grad_norm": 0.296875,
      "learning_rate": 7.156208809370883e-05,
      "loss": 1.035,
      "step": 255
    },
    {
      "epoch": 2.3891509433962264,
      "grad_norm": 0.31640625,
      "learning_rate": 7.08354060545175e-05,
      "loss": 1.0059,
      "step": 256
    },
    {
      "epoch": 2.3985849056603774,
      "grad_norm": 0.3203125,
      "learning_rate": 7.011040341054845e-05,
      "loss": 1.0855,
      "step": 257
    },
    {
      "epoch": 2.4080188679245285,
      "grad_norm": 0.3046875,
      "learning_rate": 6.93871219098876e-05,
      "loss": 1.0351,
      "step": 258
    },
    {
      "epoch": 2.417452830188679,
      "grad_norm": 0.3125,
      "learning_rate": 6.866560320151179e-05,
      "loss": 1.0143,
      "step": 259
    },
    {
      "epoch": 2.42688679245283,
      "grad_norm": 0.30859375,
      "learning_rate": 6.79458888328903e-05,
      "loss": 1.106,
      "step": 260
    },
    {
      "epoch": 2.436320754716981,
      "grad_norm": 0.314453125,
      "learning_rate": 6.722802024759252e-05,
      "loss": 1.0725,
      "step": 261
    },
    {
      "epoch": 2.4457547169811322,
      "grad_norm": 0.31640625,
      "learning_rate": 6.651203878290139e-05,
      "loss": 1.0353,
      "step": 262
    },
    {
      "epoch": 2.455188679245283,
      "grad_norm": 0.306640625,
      "learning_rate": 6.579798566743314e-05,
      "loss": 1.0543,
      "step": 263
    },
    {
      "epoch": 2.464622641509434,
      "grad_norm": 0.30859375,
      "learning_rate": 6.508590201876317e-05,
      "loss": 1.135,
      "step": 264
    },
    {
      "epoch": 2.474056603773585,
      "grad_norm": 0.322265625,
      "learning_rate": 6.437582884105835e-05,
      "loss": 1.0466,
      "step": 265
    },
    {
      "epoch": 2.483490566037736,
      "grad_norm": 0.3359375,
      "learning_rate": 6.366780702271589e-05,
      "loss": 1.0234,
      "step": 266
    },
    {
      "epoch": 2.4929245283018866,
      "grad_norm": 0.32421875,
      "learning_rate": 6.29618773340089e-05,
      "loss": 1.0979,
      "step": 267
    },
    {
      "epoch": 2.5023584905660377,
      "grad_norm": 0.314453125,
      "learning_rate": 6.225808042473858e-05,
      "loss": 1.066,
      "step": 268
    },
    {
      "epoch": 2.5117924528301887,
      "grad_norm": 0.359375,
      "learning_rate": 6.155645682189351e-05,
      "loss": 1.1694,
      "step": 269
    },
    {
      "epoch": 2.5212264150943398,
      "grad_norm": 0.3125,
      "learning_rate": 6.085704692731609e-05,
      "loss": 0.9722,
      "step": 270
    },
    {
      "epoch": 2.5306603773584904,
      "grad_norm": 0.341796875,
      "learning_rate": 6.015989101537586e-05,
      "loss": 1.0155,
      "step": 271
    },
    {
      "epoch": 2.5400943396226414,
      "grad_norm": 0.310546875,
      "learning_rate": 5.9465029230650534e-05,
      "loss": 1.063,
      "step": 272
    },
    {
      "epoch": 2.5495283018867925,
      "grad_norm": 0.322265625,
      "learning_rate": 5.877250158561425e-05,
      "loss": 1.0669,
      "step": 273
    },
    {
      "epoch": 2.5589622641509435,
      "grad_norm": 0.314453125,
      "learning_rate": 5.8082347958333625e-05,
      "loss": 1.0505,
      "step": 274
    },
    {
      "epoch": 2.5683962264150946,
      "grad_norm": 0.3359375,
      "learning_rate": 5.73946080901712e-05,
      "loss": 1.177,
      "step": 275
    },
    {
      "epoch": 2.577830188679245,
      "grad_norm": 0.322265625,
      "learning_rate": 5.670932158349731e-05,
      "loss": 1.0335,
      "step": 276
    },
    {
      "epoch": 2.5872641509433962,
      "grad_norm": 0.294921875,
      "learning_rate": 5.602652789940941e-05,
      "loss": 1.0383,
      "step": 277
    },
    {
      "epoch": 2.5966981132075473,
      "grad_norm": 0.3203125,
      "learning_rate": 5.5346266355459995e-05,
      "loss": 0.9778,
      "step": 278
    },
    {
      "epoch": 2.606132075471698,
      "grad_norm": 0.314453125,
      "learning_rate": 5.466857612339229e-05,
      "loss": 0.9718,
      "step": 279
    },
    {
      "epoch": 2.615566037735849,
      "grad_norm": 0.353515625,
      "learning_rate": 5.399349622688479e-05,
      "loss": 1.1602,
      "step": 280
    },
    {
      "epoch": 2.625,
      "grad_norm": 0.33203125,
      "learning_rate": 5.332106553930414e-05,
      "loss": 1.0645,
      "step": 281
    },
    {
      "epoch": 2.634433962264151,
      "grad_norm": 0.302734375,
      "learning_rate": 5.26513227814666e-05,
      "loss": 0.9681,
      "step": 282
    },
    {
      "epoch": 2.643867924528302,
      "grad_norm": 0.3125,
      "learning_rate": 5.1984306519408456e-05,
      "loss": 1.0741,
      "step": 283
    },
    {
      "epoch": 2.6533018867924527,
      "grad_norm": 0.3125,
      "learning_rate": 5.1320055162165115e-05,
      "loss": 1.0404,
      "step": 284
    },
    {
      "epoch": 2.6627358490566038,
      "grad_norm": 0.326171875,
      "learning_rate": 5.065860695955971e-05,
      "loss": 1.114,
      "step": 285
    },
    {
      "epoch": 2.672169811320755,
      "grad_norm": 0.302734375,
      "learning_rate": 5.000000000000002e-05,
      "loss": 1.0966,
      "step": 286
    },
    {
      "epoch": 2.6816037735849054,
      "grad_norm": 0.314453125,
      "learning_rate": 4.934427220828571e-05,
      "loss": 1.0322,
      "step": 287
    },
    {
      "epoch": 2.6910377358490565,
      "grad_norm": 0.3203125,
      "learning_rate": 4.869146134342426e-05,
      "loss": 1.0087,
      "step": 288
    },
    {
      "epoch": 2.7004716981132075,
      "grad_norm": 0.31640625,
      "learning_rate": 4.804160499645667e-05,
      "loss": 1.1284,
      "step": 289
    },
    {
      "epoch": 2.7099056603773586,
      "grad_norm": 0.330078125,
      "learning_rate": 4.739474058829289e-05,
      "loss": 0.9876,
      "step": 290
    },
    {
      "epoch": 2.7193396226415096,
      "grad_norm": 0.333984375,
      "learning_rate": 4.675090536755705e-05,
      "loss": 1.1121,
      "step": 291
    },
    {
      "epoch": 2.7287735849056602,
      "grad_norm": 0.3125,
      "learning_rate": 4.611013640844245e-05,
      "loss": 1.0433,
      "step": 292
    },
    {
      "epoch": 2.7382075471698113,
      "grad_norm": 0.32421875,
      "learning_rate": 4.547247060857675e-05,
      "loss": 0.9529,
      "step": 293
    },
    {
      "epoch": 2.7476415094339623,
      "grad_norm": 0.318359375,
      "learning_rate": 4.483794468689728e-05,
      "loss": 1.0347,
      "step": 294
    },
    {
      "epoch": 2.757075471698113,
      "grad_norm": 0.314453125,
      "learning_rate": 4.420659518153667e-05,
      "loss": 1.0604,
      "step": 295
    },
    {
      "epoch": 2.766509433962264,
      "grad_norm": 0.333984375,
      "learning_rate": 4.357845844771881e-05,
      "loss": 1.1131,
      "step": 296
    },
    {
      "epoch": 2.775943396226415,
      "grad_norm": 0.314453125,
      "learning_rate": 4.295357065566543e-05,
      "loss": 1.0776,
      "step": 297
    },
    {
      "epoch": 2.785377358490566,
      "grad_norm": 0.322265625,
      "learning_rate": 4.2331967788513295e-05,
      "loss": 1.0204,
      "step": 298
    },
    {
      "epoch": 2.794811320754717,
      "grad_norm": 0.314453125,
      "learning_rate": 4.1713685640242165e-05,
      "loss": 1.0161,
      "step": 299
    },
    {
      "epoch": 2.8042452830188678,
      "grad_norm": 0.32421875,
      "learning_rate": 4.109875981361363e-05,
      "loss": 1.1091,
      "step": 300
    },
    {
      "epoch": 2.813679245283019,
      "grad_norm": 0.318359375,
      "learning_rate": 4.048722571812105e-05,
      "loss": 1.0495,
      "step": 301
    },
    {
      "epoch": 2.82311320754717,
      "grad_norm": 0.314453125,
      "learning_rate": 3.987911856795047e-05,
      "loss": 1.0086,
      "step": 302
    },
    {
      "epoch": 2.8325471698113205,
      "grad_norm": 0.314453125,
      "learning_rate": 3.927447337995304e-05,
      "loss": 0.9878,
      "step": 303
    },
    {
      "epoch": 2.8419811320754715,
      "grad_norm": 0.326171875,
      "learning_rate": 3.8673324971628357e-05,
      "loss": 0.9856,
      "step": 304
    },
    {
      "epoch": 2.8514150943396226,
      "grad_norm": 0.328125,
      "learning_rate": 3.8075707959119846e-05,
      "loss": 1.0848,
      "step": 305
    },
    {
      "epoch": 2.8608490566037736,
      "grad_norm": 0.349609375,
      "learning_rate": 3.7481656755221125e-05,
      "loss": 1.0856,
      "step": 306
    },
    {
      "epoch": 2.8702830188679247,
      "grad_norm": 0.326171875,
      "learning_rate": 3.689120556739475e-05,
      "loss": 0.9755,
      "step": 307
    },
    {
      "epoch": 2.8797169811320753,
      "grad_norm": 0.328125,
      "learning_rate": 3.630438839580217e-05,
      "loss": 1.0983,
      "step": 308
    },
    {
      "epoch": 2.8891509433962264,
      "grad_norm": 0.32421875,
      "learning_rate": 3.5721239031346066e-05,
      "loss": 1.0877,
      "step": 309
    },
    {
      "epoch": 2.8985849056603774,
      "grad_norm": 0.330078125,
      "learning_rate": 3.5141791053724405e-05,
      "loss": 1.1485,
      "step": 310
    },
    {
      "epoch": 2.9080188679245285,
      "grad_norm": 0.31640625,
      "learning_rate": 3.456607782949689e-05,
      "loss": 1.0692,
      "step": 311
    },
    {
      "epoch": 2.9174528301886795,
      "grad_norm": 0.326171875,
      "learning_rate": 3.399413251016359e-05,
      "loss": 1.0783,
      "step": 312
    },
    {
      "epoch": 2.92688679245283,
      "grad_norm": 0.322265625,
      "learning_rate": 3.342598803025595e-05,
      "loss": 1.1049,
      "step": 313
    },
    {
      "epoch": 2.936320754716981,
      "grad_norm": 0.341796875,
      "learning_rate": 3.2861677105440336e-05,
      "loss": 1.143,
      "step": 314
    },
    {
      "epoch": 2.9457547169811322,
      "grad_norm": 0.326171875,
      "learning_rate": 3.2301232230634104e-05,
      "loss": 1.0474,
      "step": 315
    },
    {
      "epoch": 2.955188679245283,
      "grad_norm": 0.3125,
      "learning_rate": 3.174468567813461e-05,
      "loss": 1.078,
      "step": 316
    },
    {
      "epoch": 2.964622641509434,
      "grad_norm": 0.31640625,
      "learning_rate": 3.119206949576052e-05,
      "loss": 1.0219,
      "step": 317
    },
    {
      "epoch": 2.974056603773585,
      "grad_norm": 0.318359375,
      "learning_rate": 3.0643415505006735e-05,
      "loss": 1.061,
      "step": 318
    },
    {
      "epoch": 2.983490566037736,
      "grad_norm": 0.32421875,
      "learning_rate": 3.009875529921181e-05,
      "loss": 1.0573,
      "step": 319
    },
    {
      "epoch": 2.992924528301887,
      "grad_norm": 0.322265625,
      "learning_rate": 2.9558120241738784e-05,
      "loss": 1.0131,
      "step": 320
    },
    {
      "epoch": 3.0023584905660377,
      "grad_norm": 0.33203125,
      "learning_rate": 2.90215414641691e-05,
      "loss": 1.0359,
      "step": 321
    },
    {
      "epoch": 3.0023584905660377,
      "grad_norm": 0.333984375,
      "learning_rate": 2.8489049864510054e-05,
      "loss": 1.07,
      "step": 322
    },
    {
      "epoch": 3.0117924528301887,
      "grad_norm": 0.337890625,
      "learning_rate": 2.7960676105415472e-05,
      "loss": 1.0113,
      "step": 323
    },
    {
      "epoch": 3.0212264150943398,
      "grad_norm": 0.3359375,
      "learning_rate": 2.7436450612420095e-05,
      "loss": 0.9147,
      "step": 324
    },
    {
      "epoch": 3.0306603773584904,
      "grad_norm": 0.328125,
      "learning_rate": 2.691640357218759e-05,
      "loss": 0.8318,
      "step": 325
    },
    {
      "epoch": 3.0400943396226414,
      "grad_norm": 0.31640625,
      "learning_rate": 2.640056493077231e-05,
      "loss": 0.9546,
      "step": 326
    },
    {
      "epoch": 3.0495283018867925,
      "grad_norm": 0.333984375,
      "learning_rate": 2.5888964391894766e-05,
      "loss": 1.0179,
      "step": 327
    },
    {
      "epoch": 3.0589622641509435,
      "grad_norm": 0.353515625,
      "learning_rate": 2.5381631415231454e-05,
      "loss": 0.8373,
      "step": 328
    },
    {
      "epoch": 3.068396226415094,
      "grad_norm": 0.359375,
      "learning_rate": 2.4878595214718236e-05,
      "loss": 0.8973,
      "step": 329
    },
    {
      "epoch": 3.077830188679245,
      "grad_norm": 0.353515625,
      "learning_rate": 2.4379884756868167e-05,
      "loss": 0.9058,
      "step": 330
    },
    {
      "epoch": 3.0872641509433962,
      "grad_norm": 0.419921875,
      "learning_rate": 2.3885528759103538e-05,
      "loss": 0.9879,
      "step": 331
    },
    {
      "epoch": 3.0966981132075473,
      "grad_norm": 0.384765625,
      "learning_rate": 2.339555568810221e-05,
      "loss": 0.9923,
      "step": 332
    },
    {
      "epoch": 3.106132075471698,
      "grad_norm": 0.361328125,
      "learning_rate": 2.2909993758158412e-05,
      "loss": 0.9279,
      "step": 333
    },
    {
      "epoch": 3.115566037735849,
      "grad_norm": 0.35546875,
      "learning_rate": 2.242887092955801e-05,
      "loss": 0.9026,
      "step": 334
    },
    {
      "epoch": 3.125,
      "grad_norm": 0.359375,
      "learning_rate": 2.1952214906968627e-05,
      "loss": 0.9476,
      "step": 335
    },
    {
      "epoch": 3.134433962264151,
      "grad_norm": 0.34765625,
      "learning_rate": 2.1480053137844115e-05,
      "loss": 0.9352,
      "step": 336
    },
    {
      "epoch": 3.143867924528302,
      "grad_norm": 0.328125,
      "learning_rate": 2.101241281084416e-05,
      "loss": 0.9964,
      "step": 337
    },
    {
      "epoch": 3.1533018867924527,
      "grad_norm": 0.33984375,
      "learning_rate": 2.054932085426856e-05,
      "loss": 1.0295,
      "step": 338
    },
    {
      "epoch": 3.1627358490566038,
      "grad_norm": 0.365234375,
      "learning_rate": 2.0090803934506764e-05,
      "loss": 0.9968,
      "step": 339
    },
    {
      "epoch": 3.172169811320755,
      "grad_norm": 0.341796875,
      "learning_rate": 1.9636888454502178e-05,
      "loss": 0.8935,
      "step": 340
    },
    {
      "epoch": 3.1816037735849054,
      "grad_norm": 0.322265625,
      "learning_rate": 1.9187600552231955e-05,
      "loss": 0.9172,
      "step": 341
    },
    {
      "epoch": 3.1910377358490565,
      "grad_norm": 0.359375,
      "learning_rate": 1.8742966099201697e-05,
      "loss": 0.9573,
      "step": 342
    },
    {
      "epoch": 3.2004716981132075,
      "grad_norm": 0.333984375,
      "learning_rate": 1.8303010698955804e-05,
      "loss": 0.9661,
      "step": 343
    },
    {
      "epoch": 3.2099056603773586,
      "grad_norm": 0.33203125,
      "learning_rate": 1.7867759685603114e-05,
      "loss": 0.9576,
      "step": 344
    },
    {
      "epoch": 3.2193396226415096,
      "grad_norm": 0.349609375,
      "learning_rate": 1.7437238122358057e-05,
      "loss": 1.0133,
      "step": 345
    },
    {
      "epoch": 3.2287735849056602,
      "grad_norm": 0.357421875,
      "learning_rate": 1.7011470800097496e-05,
      "loss": 0.9558,
      "step": 346
    },
    {
      "epoch": 3.2382075471698113,
      "grad_norm": 0.3671875,
      "learning_rate": 1.659048223593308e-05,
      "loss": 0.8893,
      "step": 347
    },
    {
      "epoch": 3.2476415094339623,
      "grad_norm": 0.34765625,
      "learning_rate": 1.6174296671799572e-05,
      "loss": 1.0161,
      "step": 348
    },
    {
      "epoch": 3.2570754716981134,
      "grad_norm": 0.341796875,
      "learning_rate": 1.5762938073058853e-05,
      "loss": 1.0262,
      "step": 349
    },
    {
      "epoch": 3.266509433962264,
      "grad_norm": 0.333984375,
      "learning_rate": 1.5356430127119913e-05,
      "loss": 0.9567,
      "step": 350
    },
    {
      "epoch": 3.275943396226415,
      "grad_norm": 0.3515625,
      "learning_rate": 1.4954796242074898e-05,
      "loss": 0.9701,
      "step": 351
    },
    {
      "epoch": 3.285377358490566,
      "grad_norm": 0.34765625,
      "learning_rate": 1.4558059545351143e-05,
      "loss": 0.9455,
      "step": 352
    },
    {
      "epoch": 3.294811320754717,
      "grad_norm": 0.357421875,
      "learning_rate": 1.4166242882379476e-05,
      "loss": 0.9922,
      "step": 353
    },
    {
      "epoch": 3.3042452830188678,
      "grad_norm": 0.365234375,
      "learning_rate": 1.3779368815278647e-05,
      "loss": 0.9102,
      "step": 354
    },
    {
      "epoch": 3.313679245283019,
      "grad_norm": 0.345703125,
      "learning_rate": 1.339745962155613e-05,
      "loss": 0.9873,
      "step": 355
    },
    {
      "epoch": 3.32311320754717,
      "grad_norm": 0.337890625,
      "learning_rate": 1.302053729282533e-05,
      "loss": 0.9758,
      "step": 356
    },
    {
      "epoch": 3.332547169811321,
      "grad_norm": 0.369140625,
      "learning_rate": 1.2648623533539261e-05,
      "loss": 0.851,
      "step": 357
    },
    {
      "epoch": 3.3419811320754715,
      "grad_norm": 0.365234375,
      "learning_rate": 1.2281739759740574e-05,
      "loss": 1.0054,
      "step": 358
    },
    {
      "epoch": 3.3514150943396226,
      "grad_norm": 0.33203125,
      "learning_rate": 1.1919907097828653e-05,
      "loss": 0.8892,
      "step": 359
    },
    {
      "epoch": 3.3608490566037736,
      "grad_norm": 0.36328125,
      "learning_rate": 1.1563146383342772e-05,
      "loss": 0.9823,
      "step": 360
    },
    {
      "epoch": 3.3702830188679247,
      "grad_norm": 0.3359375,
      "learning_rate": 1.1211478159762478e-05,
      "loss": 0.8435,
      "step": 361
    },
    {
      "epoch": 3.3797169811320753,
      "grad_norm": 0.3671875,
      "learning_rate": 1.0864922677324618e-05,
      "loss": 1.0188,
      "step": 362
    },
    {
      "epoch": 3.3891509433962264,
      "grad_norm": 0.34765625,
      "learning_rate": 1.0523499891857225e-05,
      "loss": 1.0157,
      "step": 363
    },
    {
      "epoch": 3.3985849056603774,
      "grad_norm": 0.35546875,
      "learning_rate": 1.01872294636304e-05,
      "loss": 0.9674,
      "step": 364
    },
    {
      "epoch": 3.4080188679245285,
      "grad_norm": 0.34375,
      "learning_rate": 9.856130756224213e-06,
      "loss": 0.9332,
      "step": 365
    },
    {
      "epoch": 3.417452830188679,
      "grad_norm": 0.375,
      "learning_rate": 9.530222835413738e-06,
      "loss": 0.9148,
      "step": 366
    },
    {
      "epoch": 3.42688679245283,
      "grad_norm": 0.337890625,
      "learning_rate": 9.209524468071096e-06,
      "loss": 1.0542,
      "step": 367
    },
    {
      "epoch": 3.436320754716981,
      "grad_norm": 0.34375,
      "learning_rate": 8.894054121084838e-06,
      "loss": 0.9474,
      "step": 368
    },
    {
      "epoch": 3.4457547169811322,
      "grad_norm": 0.357421875,
      "learning_rate": 8.58382996029652e-06,
      "loss": 0.9664,
      "step": 369
    },
    {
      "epoch": 3.455188679245283,
      "grad_norm": 0.3203125,
      "learning_rate": 8.278869849454718e-06,
      "loss": 0.9389,
      "step": 370
    },
    {
      "epoch": 3.464622641509434,
      "grad_norm": 0.337890625,
      "learning_rate": 7.97919134918632e-06,
      "loss": 0.9574,
      "step": 371
    },
    {
      "epoch": 3.474056603773585,
      "grad_norm": 0.353515625,
      "learning_rate": 7.684811715985429e-06,
      "loss": 0.9318,
      "step": 372
    },
    {
      "epoch": 3.483490566037736,
      "grad_norm": 0.33203125,
      "learning_rate": 7.395747901219474e-06,
      "loss": 0.8918,
      "step": 373
    },
    {
      "epoch": 3.4929245283018866,
      "grad_norm": 0.369140625,
      "learning_rate": 7.1120165501533e-06,
      "loss": 1.0351,
      "step": 374
    },
    {
      "epoch": 3.5023584905660377,
      "grad_norm": 0.361328125,
      "learning_rate": 6.833634000990541e-06,
      "loss": 1.0396,
      "step": 375
    },
    {
      "epoch": 3.5117924528301887,
      "grad_norm": 0.349609375,
      "learning_rate": 6.560616283932897e-06,
      "loss": 0.9884,
      "step": 376
    },
    {
      "epoch": 3.5212264150943398,
      "grad_norm": 0.328125,
      "learning_rate": 6.292979120256992e-06,
      "loss": 0.8938,
      "step": 377
    },
    {
      "epoch": 3.5306603773584904,
      "grad_norm": 0.3359375,
      "learning_rate": 6.030737921409169e-06,
      "loss": 0.9728,
      "step": 378
    },
    {
      "epoch": 3.5400943396226414,
      "grad_norm": 0.357421875,
      "learning_rate": 5.77390778811796e-06,
      "loss": 1.0086,
      "step": 379
    },
    {
      "epoch": 3.5495283018867925,
      "grad_norm": 0.3671875,
      "learning_rate": 5.52250350952459e-06,
      "loss": 1.0073,
      "step": 380
    },
    {
      "epoch": 3.5589622641509435,
      "grad_norm": 0.34375,
      "learning_rate": 5.276539562331384e-06,
      "loss": 0.9953,
      "step": 381
    },
    {
      "epoch": 3.5683962264150946,
      "grad_norm": 0.341796875,
      "learning_rate": 5.036030109968082e-06,
      "loss": 0.9377,
      "step": 382
    },
    {
      "epoch": 3.577830188679245,
      "grad_norm": 0.33984375,
      "learning_rate": 4.800989001776324e-06,
      "loss": 0.9637,
      "step": 383
    },
    {
      "epoch": 3.5872641509433962,
      "grad_norm": 0.33984375,
      "learning_rate": 4.5714297722121106e-06,
      "loss": 0.8614,
      "step": 384
    },
    {
      "epoch": 3.5966981132075473,
      "grad_norm": 0.333984375,
      "learning_rate": 4.347365640066525e-06,
      "loss": 0.9701,
      "step": 385
    },
    {
      "epoch": 3.606132075471698,
      "grad_norm": 0.35546875,
      "learning_rate": 4.128809507704445e-06,
      "loss": 0.9451,
      "step": 386
    },
    {
      "epoch": 3.615566037735849,
      "grad_norm": 0.337890625,
      "learning_rate": 3.915773960321634e-06,
      "loss": 0.9175,
      "step": 387
    },
    {
      "epoch": 3.625,
      "grad_norm": 0.37109375,
      "learning_rate": 3.7082712652200867e-06,
      "loss": 0.8858,
      "step": 388
    },
    {
      "epoch": 3.634433962264151,
      "grad_norm": 0.357421875,
      "learning_rate": 3.5063133711014882e-06,
      "loss": 0.9181,
      "step": 389
|
}, |
|
{ |
|
"epoch": 3.643867924528302, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 3.3099119073793928e-06, |
|
"loss": 0.9641, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.6533018867924527, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 3.119078183509372e-06, |
|
"loss": 1.0059, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 3.6627358490566038, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 2.9338231883378366e-06, |
|
"loss": 0.9977, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 3.672169811320755, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 2.7541575894693194e-06, |
|
"loss": 0.9899, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 3.6816037735849054, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 2.580091732652101e-06, |
|
"loss": 0.9088, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 3.6910377358490565, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 2.4116356411825525e-06, |
|
"loss": 0.9484, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.7004716981132075, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 2.248799015327907e-06, |
|
"loss": 0.9853, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 3.7099056603773586, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 2.091591231767709e-06, |
|
"loss": 0.9616, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 3.7193396226415096, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 1.9400213430538773e-06, |
|
"loss": 0.9615, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 3.7287735849056602, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.7940980770894122e-06, |
|
"loss": 1.003, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 3.7382075471698113, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 1.6538298366257976e-06, |
|
"loss": 0.9496, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.7476415094339623, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 1.5192246987791981e-06, |
|
"loss": 0.9563, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 3.757075471698113, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 1.3902904145653096e-06, |
|
"loss": 0.9478, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 3.766509433962264, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 1.2670344084530383e-06, |
|
"loss": 0.9733, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 3.775943396226415, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 1.1494637779369766e-06, |
|
"loss": 0.8937, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 3.785377358490566, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.0375852931286956e-06, |
|
"loss": 0.9241, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.794811320754717, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 9.314053963669245e-07, |
|
"loss": 0.9335, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 3.8042452830188678, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 8.309302018465581e-07, |
|
"loss": 0.9593, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 3.813679245283019, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 7.361654952665609e-07, |
|
"loss": 0.9529, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 3.82311320754717, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 6.471167334968886e-07, |
|
"loss": 0.9608, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 3.8325471698113205, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 5.637890442641402e-07, |
|
"loss": 0.9223, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.8419811320754715, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 4.861872258564049e-07, |
|
"loss": 1.0127, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 3.8514150943396226, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 4.143157468468717e-07, |
|
"loss": 0.9592, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 3.8608490566037736, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 3.481787458365915e-07, |
|
"loss": 0.9963, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 3.8702830188679247, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.877800312160783e-07, |
|
"loss": 0.9346, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 3.8797169811320753, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 2.3312308094607382e-07, |
|
"loss": 0.8629, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.8891509433962264, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.8421104235727405e-07, |
|
"loss": 0.9351, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 3.8985849056603774, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.4104673196903005e-07, |
|
"loss": 1.0274, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 3.9080188679245285, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 1.0363263532724432e-07, |
|
"loss": 0.9699, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 3.9174528301886795, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 7.197090686119623e-08, |
|
"loss": 0.9988, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 3.92688679245283, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 4.606336975948589e-08, |
|
"loss": 0.9501, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.936320754716981, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.5911515865084667e-08, |
|
"loss": 0.9581, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 3.9457547169811322, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.1516505589381776e-08, |
|
"loss": 0.9865, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 3.955188679245283, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 2.8791678453821135e-09, |
|
"loss": 0.9402, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 3.964622641509434, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.0, |
|
"loss": 0.9966, |
|
"step": 424 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 424, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 106, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.322438582095053e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|