kanishka committed on
Commit 6e48671
1 Parent(s): bc2576c

End of training

README.md CHANGED
@@ -1,11 +1,23 @@
 ---
 tags:
 - generated_from_trainer
+datasets:
+- kanishka/babylm2-subset
 metrics:
 - accuracy
 model-index:
 - name: cria-babylm2-subset-default-1e-3
-  results: []
+  results:
+  - task:
+      name: Causal Language Modeling
+      type: text-generation
+    dataset:
+      name: kanishka/babylm2-subset
+      type: kanishka/babylm2-subset
+    metrics:
+    - name: Accuracy
+      type: accuracy
+      value: 0.5203706477236009
 ---
 
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -13,7 +25,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # cria-babylm2-subset-default-1e-3
 
-This model was trained from scratch on an unknown dataset.
+This model was trained from scratch on the kanishka/babylm2-subset dataset.
 It achieves the following results on the evaluation set:
 - Loss: 2.6626
 - Accuracy: 0.5204
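
The card stops at the headline metrics and does not include a usage snippet. As a minimal sketch, the checkpoint should load with the standard transformers API; the repository id below is an assumption pieced together from the committer's namespace and the model name above, not something stated in the card.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumed repository id: committer namespace + model name from the card.
repo_id = "kanishka/cria-babylm2-subset-default-1e-3"

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

# Quick smoke test: greedy generation from a short prompt.
inputs = tokenizer("The child picked up the", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```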
all_results.json ADDED
@@ -0,0 +1,16 @@
+{
+    "epoch": 10.0,
+    "eval_accuracy": 0.5203706477236009,
+    "eval_loss": 2.6626083850860596,
+    "eval_runtime": 101.9412,
+    "eval_samples": 46868,
+    "eval_samples_per_second": 459.755,
+    "eval_steps_per_second": 7.19,
+    "perplexity": 14.333628001897594,
+    "total_flos": 6.171008476428288e+17,
+    "train_loss": 2.0168114449748256,
+    "train_runtime": 24063.5613,
+    "train_samples": 452524,
+    "train_samples_per_second": 188.054,
+    "train_steps_per_second": 5.877
+}
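
The "perplexity" entry is not an independent measurement: it is consistent with being the exponential of the mean token-level cross-entropy, i.e. of "eval_loss", which is the usual convention in the HF causal-LM example scripts. A quick check of the numbers above:

```python
import math

# Value reported in all_results.json
eval_loss = 2.6626083850860596

# Perplexity as the exponential of the mean cross-entropy loss.
perplexity = math.exp(eval_loss)
print(perplexity)  # ~14.3336, matching the reported "perplexity" field
```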
eval_results.json ADDED
@@ -0,0 +1,10 @@
+{
+    "epoch": 10.0,
+    "eval_accuracy": 0.5203706477236009,
+    "eval_loss": 2.6626083850860596,
+    "eval_runtime": 101.9412,
+    "eval_samples": 46868,
+    "eval_samples_per_second": 459.755,
+    "eval_steps_per_second": 7.19,
+    "perplexity": 14.333628001897594
+}
runs/Jul24_20-10-48_phyl-ling-p01.la.utexas.edu/events.out.tfevents.1721894287.phyl-ling-p01.la.utexas.edu.130506.1 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0fea37449358621fdf730edb3521f6f9e1b3d2a0b86759c96a3aa248d1ba938
+size 417
train_results.json ADDED
@@ -0,0 +1,9 @@
+{
+    "epoch": 10.0,
+    "total_flos": 6.171008476428288e+17,
+    "train_loss": 2.0168114449748256,
+    "train_runtime": 24063.5613,
+    "train_samples": 452524,
+    "train_samples_per_second": 188.054,
+    "train_steps_per_second": 5.877
+}
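
The throughput figures in train_results.json follow from the sample count, the number of epochs, the step count, and the wall-clock runtime. A small sanity-check sketch (the step count is taken from trainer_state.json below):

```python
train_samples = 452_524      # "train_samples"
num_epochs = 10.0            # "epoch"
train_runtime = 24_063.5613  # seconds
total_steps = 141_420        # "global_step" / "max_steps" in trainer_state.json

print(train_samples * num_epochs / train_runtime)  # ~188.05 samples/s
print(total_steps / train_runtime)                 # ~5.877 steps/s
```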
trainer_state.json ADDED
@@ -0,0 +1,1119 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 141420,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.07071135624381275,
13
+ "grad_norm": 1.5066859722137451,
14
+ "learning_rate": 3.125e-05,
15
+ "loss": 4.9729,
16
+ "step": 1000
17
+ },
18
+ {
19
+ "epoch": 0.1414227124876255,
20
+ "grad_norm": 0.9705550670623779,
21
+ "learning_rate": 6.25e-05,
22
+ "loss": 3.374,
23
+ "step": 2000
24
+ },
25
+ {
26
+ "epoch": 0.21213406873143828,
27
+ "grad_norm": 0.8380181789398193,
28
+ "learning_rate": 9.375e-05,
29
+ "loss": 3.0791,
30
+ "step": 3000
31
+ },
32
+ {
33
+ "epoch": 0.282845424975251,
34
+ "grad_norm": 0.7704618573188782,
35
+ "learning_rate": 0.000125,
36
+ "loss": 2.9038,
37
+ "step": 4000
38
+ },
39
+ {
40
+ "epoch": 0.3535567812190638,
41
+ "grad_norm": 0.675819993019104,
42
+ "learning_rate": 0.00015625,
43
+ "loss": 2.7896,
44
+ "step": 5000
45
+ },
46
+ {
47
+ "epoch": 0.42426813746287656,
48
+ "grad_norm": 0.6245794892311096,
49
+ "learning_rate": 0.0001875,
50
+ "loss": 2.6919,
51
+ "step": 6000
52
+ },
53
+ {
54
+ "epoch": 0.4949794937066893,
55
+ "grad_norm": 0.5709772109985352,
56
+ "learning_rate": 0.00021875,
57
+ "loss": 2.6272,
58
+ "step": 7000
59
+ },
60
+ {
61
+ "epoch": 0.565690849950502,
62
+ "grad_norm": 0.5335475206375122,
63
+ "learning_rate": 0.00025,
64
+ "loss": 2.5715,
65
+ "step": 8000
66
+ },
67
+ {
68
+ "epoch": 0.6364022061943148,
69
+ "grad_norm": 0.4924204349517822,
70
+ "learning_rate": 0.00028125000000000003,
71
+ "loss": 2.5384,
72
+ "step": 9000
73
+ },
74
+ {
75
+ "epoch": 0.7071135624381276,
76
+ "grad_norm": 0.49077439308166504,
77
+ "learning_rate": 0.0003125,
78
+ "loss": 2.5154,
79
+ "step": 10000
80
+ },
81
+ {
82
+ "epoch": 0.7778249186819404,
83
+ "grad_norm": 0.4304497241973877,
84
+ "learning_rate": 0.00034371875,
85
+ "loss": 2.4784,
86
+ "step": 11000
87
+ },
88
+ {
89
+ "epoch": 0.8485362749257531,
90
+ "grad_norm": 0.45384302735328674,
91
+ "learning_rate": 0.00037496875000000003,
92
+ "loss": 2.458,
93
+ "step": 12000
94
+ },
95
+ {
96
+ "epoch": 0.9192476311695659,
97
+ "grad_norm": 0.3978016674518585,
98
+ "learning_rate": 0.0004061875,
99
+ "loss": 2.4536,
100
+ "step": 13000
101
+ },
102
+ {
103
+ "epoch": 0.9899589874133786,
104
+ "grad_norm": 0.3981296718120575,
105
+ "learning_rate": 0.0004374375,
106
+ "loss": 2.4397,
107
+ "step": 14000
108
+ },
109
+ {
110
+ "epoch": 1.0,
111
+ "eval_accuracy": 0.49545158536152034,
112
+ "eval_loss": 2.6684181690216064,
113
+ "eval_runtime": 101.2526,
114
+ "eval_samples_per_second": 462.882,
115
+ "eval_steps_per_second": 7.239,
116
+ "step": 14142
117
+ },
118
+ {
119
+ "epoch": 1.0606703436571914,
120
+ "grad_norm": 0.36285269260406494,
121
+ "learning_rate": 0.00046865625,
122
+ "loss": 2.3887,
123
+ "step": 15000
124
+ },
125
+ {
126
+ "epoch": 1.131381699901004,
127
+ "grad_norm": 0.3568965494632721,
128
+ "learning_rate": 0.00049990625,
129
+ "loss": 2.3848,
130
+ "step": 16000
131
+ },
132
+ {
133
+ "epoch": 1.2020930561448169,
134
+ "grad_norm": 0.32918983697891235,
135
+ "learning_rate": 0.00053109375,
136
+ "loss": 2.3763,
137
+ "step": 17000
138
+ },
139
+ {
140
+ "epoch": 1.2728044123886297,
141
+ "grad_norm": 0.3192691206932068,
142
+ "learning_rate": 0.00056234375,
143
+ "loss": 2.3757,
144
+ "step": 18000
145
+ },
146
+ {
147
+ "epoch": 1.3435157686324424,
148
+ "grad_norm": 0.29217350482940674,
149
+ "learning_rate": 0.00059359375,
150
+ "loss": 2.3728,
151
+ "step": 19000
152
+ },
153
+ {
154
+ "epoch": 1.414227124876255,
155
+ "grad_norm": 0.2726396918296814,
156
+ "learning_rate": 0.0006248437500000001,
157
+ "loss": 2.3482,
158
+ "step": 20000
159
+ },
160
+ {
161
+ "epoch": 1.4849384811200679,
162
+ "grad_norm": 0.2647142708301544,
163
+ "learning_rate": 0.0006560625,
164
+ "loss": 2.361,
165
+ "step": 21000
166
+ },
167
+ {
168
+ "epoch": 1.5556498373638807,
169
+ "grad_norm": 0.24640022218227386,
170
+ "learning_rate": 0.00068728125,
171
+ "loss": 2.3414,
172
+ "step": 22000
173
+ },
174
+ {
175
+ "epoch": 1.6263611936076934,
176
+ "grad_norm": 0.2376652956008911,
177
+ "learning_rate": 0.00071853125,
178
+ "loss": 2.3469,
179
+ "step": 23000
180
+ },
181
+ {
182
+ "epoch": 1.697072549851506,
183
+ "grad_norm": 0.20667687058448792,
184
+ "learning_rate": 0.00074978125,
185
+ "loss": 2.3334,
186
+ "step": 24000
187
+ },
188
+ {
189
+ "epoch": 1.7677839060953189,
190
+ "grad_norm": 0.21862906217575073,
191
+ "learning_rate": 0.0007810312499999999,
192
+ "loss": 2.325,
193
+ "step": 25000
194
+ },
195
+ {
196
+ "epoch": 1.8384952623391317,
197
+ "grad_norm": 0.19700638949871063,
198
+ "learning_rate": 0.00081225,
199
+ "loss": 2.3169,
200
+ "step": 26000
201
+ },
202
+ {
203
+ "epoch": 1.9092066185829444,
204
+ "grad_norm": 0.19530941545963287,
205
+ "learning_rate": 0.00084346875,
206
+ "loss": 2.3085,
207
+ "step": 27000
208
+ },
209
+ {
210
+ "epoch": 1.979917974826757,
211
+ "grad_norm": 0.18496540188789368,
212
+ "learning_rate": 0.00087471875,
213
+ "loss": 2.3085,
214
+ "step": 28000
215
+ },
216
+ {
217
+ "epoch": 2.0,
218
+ "eval_accuracy": 0.5092843747934841,
219
+ "eval_loss": 2.5420279502868652,
220
+ "eval_runtime": 102.0041,
221
+ "eval_samples_per_second": 459.472,
222
+ "eval_steps_per_second": 7.186,
223
+ "step": 28284
224
+ },
225
+ {
226
+ "epoch": 2.05062933107057,
227
+ "grad_norm": 0.18070034682750702,
228
+ "learning_rate": 0.00090596875,
229
+ "loss": 2.2557,
230
+ "step": 29000
231
+ },
232
+ {
233
+ "epoch": 2.1213406873143827,
234
+ "grad_norm": 0.17614798247814178,
235
+ "learning_rate": 0.0009371875,
236
+ "loss": 2.248,
237
+ "step": 30000
238
+ },
239
+ {
240
+ "epoch": 2.1920520435581956,
241
+ "grad_norm": 0.18162938952445984,
242
+ "learning_rate": 0.0009684375,
243
+ "loss": 2.246,
244
+ "step": 31000
245
+ },
246
+ {
247
+ "epoch": 2.262763399802008,
248
+ "grad_norm": 0.16680462658405304,
249
+ "learning_rate": 0.0009996875,
250
+ "loss": 2.2398,
251
+ "step": 32000
252
+ },
253
+ {
254
+ "epoch": 2.333474756045821,
255
+ "grad_norm": 0.17343448102474213,
256
+ "learning_rate": 0.0009909614330104186,
257
+ "loss": 2.2332,
258
+ "step": 33000
259
+ },
260
+ {
261
+ "epoch": 2.4041861122896337,
262
+ "grad_norm": 0.15368333458900452,
263
+ "learning_rate": 0.0009818223359532078,
264
+ "loss": 2.2367,
265
+ "step": 34000
266
+ },
267
+ {
268
+ "epoch": 2.4748974685334466,
269
+ "grad_norm": 0.14444677531719208,
270
+ "learning_rate": 0.0009726832388959971,
271
+ "loss": 2.2277,
272
+ "step": 35000
273
+ },
274
+ {
275
+ "epoch": 2.5456088247772595,
276
+ "grad_norm": 0.16958372294902802,
277
+ "learning_rate": 0.0009635441418387864,
278
+ "loss": 2.2136,
279
+ "step": 36000
280
+ },
281
+ {
282
+ "epoch": 2.616320181021072,
283
+ "grad_norm": 0.15171754360198975,
284
+ "learning_rate": 0.0009544141838786328,
285
+ "loss": 2.2105,
286
+ "step": 37000
287
+ },
288
+ {
289
+ "epoch": 2.6870315372648848,
290
+ "grad_norm": 0.13588131964206696,
291
+ "learning_rate": 0.0009452750868214221,
292
+ "loss": 2.2056,
293
+ "step": 38000
294
+ },
295
+ {
296
+ "epoch": 2.7577428935086976,
297
+ "grad_norm": 0.13553854823112488,
298
+ "learning_rate": 0.0009361359897642113,
299
+ "loss": 2.1988,
300
+ "step": 39000
301
+ },
302
+ {
303
+ "epoch": 2.82845424975251,
304
+ "grad_norm": 0.15744280815124512,
305
+ "learning_rate": 0.0009269968927070006,
306
+ "loss": 2.1949,
307
+ "step": 40000
308
+ },
309
+ {
310
+ "epoch": 2.899165605996323,
311
+ "grad_norm": 0.1427813470363617,
312
+ "learning_rate": 0.000917866934746847,
313
+ "loss": 2.1875,
314
+ "step": 41000
315
+ },
316
+ {
317
+ "epoch": 2.9698769622401358,
318
+ "grad_norm": 0.14179003238677979,
319
+ "learning_rate": 0.0009087278376896363,
320
+ "loss": 2.19,
321
+ "step": 42000
322
+ },
323
+ {
324
+ "epoch": 3.0,
325
+ "eval_accuracy": 0.5214661161125094,
326
+ "eval_loss": 2.439739942550659,
327
+ "eval_runtime": 102.0725,
328
+ "eval_samples_per_second": 459.164,
329
+ "eval_steps_per_second": 7.181,
330
+ "step": 42426
331
+ },
332
+ {
333
+ "epoch": 3.0405883184839486,
334
+ "grad_norm": 0.1600356101989746,
335
+ "learning_rate": 0.0008995978797294828,
336
+ "loss": 2.13,
337
+ "step": 43000
338
+ },
339
+ {
340
+ "epoch": 3.1112996747277615,
341
+ "grad_norm": 0.16733036935329437,
342
+ "learning_rate": 0.0008904587826722719,
343
+ "loss": 2.0964,
344
+ "step": 44000
345
+ },
346
+ {
347
+ "epoch": 3.182011030971574,
348
+ "grad_norm": 0.15149937570095062,
349
+ "learning_rate": 0.0008813379638091756,
350
+ "loss": 2.0964,
351
+ "step": 45000
352
+ },
353
+ {
354
+ "epoch": 3.2527223872153868,
355
+ "grad_norm": 0.1375265121459961,
356
+ "learning_rate": 0.0008721988667519649,
357
+ "loss": 2.1021,
358
+ "step": 46000
359
+ },
360
+ {
361
+ "epoch": 3.3234337434591996,
362
+ "grad_norm": 0.13642068207263947,
363
+ "learning_rate": 0.0008630597696947542,
364
+ "loss": 2.1062,
365
+ "step": 47000
366
+ },
367
+ {
368
+ "epoch": 3.3941450997030125,
369
+ "grad_norm": 0.15942348539829254,
370
+ "learning_rate": 0.0008539206726375435,
371
+ "loss": 2.0943,
372
+ "step": 48000
373
+ },
374
+ {
375
+ "epoch": 3.464856455946825,
376
+ "grad_norm": 0.14231225848197937,
377
+ "learning_rate": 0.0008447815755803326,
378
+ "loss": 2.0968,
379
+ "step": 49000
380
+ },
381
+ {
382
+ "epoch": 3.5355678121906378,
383
+ "grad_norm": 0.13483628630638123,
384
+ "learning_rate": 0.0008356516176201791,
385
+ "loss": 2.0923,
386
+ "step": 50000
387
+ },
388
+ {
389
+ "epoch": 3.6062791684344506,
390
+ "grad_norm": 0.15377779304981232,
391
+ "learning_rate": 0.0008265125205629684,
392
+ "loss": 2.0929,
393
+ "step": 51000
394
+ },
395
+ {
396
+ "epoch": 3.6769905246782635,
397
+ "grad_norm": 0.13733841478824615,
398
+ "learning_rate": 0.0008173825626028149,
399
+ "loss": 2.0929,
400
+ "step": 52000
401
+ },
402
+ {
403
+ "epoch": 3.747701880922076,
404
+ "grad_norm": 0.13640180230140686,
405
+ "learning_rate": 0.0008082434655456042,
406
+ "loss": 2.0938,
407
+ "step": 53000
408
+ },
409
+ {
410
+ "epoch": 3.8184132371658888,
411
+ "grad_norm": 0.13909070193767548,
412
+ "learning_rate": 0.0007991135075854505,
413
+ "loss": 2.0907,
414
+ "step": 54000
415
+ },
416
+ {
417
+ "epoch": 3.8891245934097016,
418
+ "grad_norm": 0.1521981954574585,
419
+ "learning_rate": 0.0007899744105282398,
420
+ "loss": 2.0816,
421
+ "step": 55000
422
+ },
423
+ {
424
+ "epoch": 3.9598359496535145,
425
+ "grad_norm": 0.12255113571882248,
426
+ "learning_rate": 0.0007808444525680864,
427
+ "loss": 2.0865,
428
+ "step": 56000
429
+ },
430
+ {
431
+ "epoch": 4.0,
432
+ "eval_accuracy": 0.5276129432475146,
433
+ "eval_loss": 2.3943161964416504,
434
+ "eval_runtime": 104.7687,
435
+ "eval_samples_per_second": 447.347,
436
+ "eval_steps_per_second": 6.996,
437
+ "step": 56568
438
+ },
439
+ {
440
+ "epoch": 4.030547305897327,
441
+ "grad_norm": 0.1423817127943039,
442
+ "learning_rate": 0.0007717053555108755,
443
+ "loss": 2.0304,
444
+ "step": 57000
445
+ },
446
+ {
447
+ "epoch": 4.10125866214114,
448
+ "grad_norm": 0.13736553490161896,
449
+ "learning_rate": 0.0007625662584536648,
450
+ "loss": 1.9815,
451
+ "step": 58000
452
+ },
453
+ {
454
+ "epoch": 4.171970018384952,
455
+ "grad_norm": 0.1411396712064743,
456
+ "learning_rate": 0.0007534363004935113,
457
+ "loss": 1.9919,
458
+ "step": 59000
459
+ },
460
+ {
461
+ "epoch": 4.2426813746287655,
462
+ "grad_norm": 0.14484618604183197,
463
+ "learning_rate": 0.0007442972034363005,
464
+ "loss": 1.9915,
465
+ "step": 60000
466
+ },
467
+ {
468
+ "epoch": 4.313392730872578,
469
+ "grad_norm": 0.1606305092573166,
470
+ "learning_rate": 0.000735167245476147,
471
+ "loss": 1.9925,
472
+ "step": 61000
473
+ },
474
+ {
475
+ "epoch": 4.384104087116391,
476
+ "grad_norm": 0.15816234052181244,
477
+ "learning_rate": 0.0007260281484189363,
478
+ "loss": 1.9963,
479
+ "step": 62000
480
+ },
481
+ {
482
+ "epoch": 4.454815443360204,
483
+ "grad_norm": 0.14397823810577393,
484
+ "learning_rate": 0.0007168981904587826,
485
+ "loss": 1.9989,
486
+ "step": 63000
487
+ },
488
+ {
489
+ "epoch": 4.525526799604016,
490
+ "grad_norm": 0.15473702549934387,
491
+ "learning_rate": 0.0007077590934015719,
492
+ "loss": 1.9965,
493
+ "step": 64000
494
+ },
495
+ {
496
+ "epoch": 4.596238155847829,
497
+ "grad_norm": 0.14191265404224396,
498
+ "learning_rate": 0.0006986291354414184,
499
+ "loss": 2.0005,
500
+ "step": 65000
501
+ },
502
+ {
503
+ "epoch": 4.666949512091642,
504
+ "grad_norm": 0.15206751227378845,
505
+ "learning_rate": 0.0006894900383842077,
506
+ "loss": 2.0114,
507
+ "step": 66000
508
+ },
509
+ {
510
+ "epoch": 4.737660868335455,
511
+ "grad_norm": 0.18548937141895294,
512
+ "learning_rate": 0.0006803600804240542,
513
+ "loss": 2.0021,
514
+ "step": 67000
515
+ },
516
+ {
517
+ "epoch": 4.8083722245792675,
518
+ "grad_norm": 0.16364724934101105,
519
+ "learning_rate": 0.0006712209833668433,
520
+ "loss": 2.0093,
521
+ "step": 68000
522
+ },
523
+ {
524
+ "epoch": 4.87908358082308,
525
+ "grad_norm": 0.1373205929994583,
526
+ "learning_rate": 0.0006620818863096326,
527
+ "loss": 2.0073,
528
+ "step": 69000
529
+ },
530
+ {
531
+ "epoch": 4.949794937066893,
532
+ "grad_norm": 0.15305304527282715,
533
+ "learning_rate": 0.000652951928349479,
534
+ "loss": 1.9957,
535
+ "step": 70000
536
+ },
537
+ {
538
+ "epoch": 5.0,
539
+ "eval_accuracy": 0.5305311481637808,
540
+ "eval_loss": 2.3786160945892334,
541
+ "eval_runtime": 102.0738,
542
+ "eval_samples_per_second": 459.158,
543
+ "eval_steps_per_second": 7.181,
544
+ "step": 70710
545
+ },
546
+ {
547
+ "epoch": 5.020506293310706,
548
+ "grad_norm": 0.17954622209072113,
549
+ "learning_rate": 0.0006438128312922683,
550
+ "loss": 1.963,
551
+ "step": 71000
552
+ },
553
+ {
554
+ "epoch": 5.091217649554518,
555
+ "grad_norm": 0.17249706387519836,
556
+ "learning_rate": 0.0006346828733321149,
557
+ "loss": 1.8814,
558
+ "step": 72000
559
+ },
560
+ {
561
+ "epoch": 5.161929005798331,
562
+ "grad_norm": 0.16035763919353485,
563
+ "learning_rate": 0.000625543776274904,
564
+ "loss": 1.8888,
565
+ "step": 73000
566
+ },
567
+ {
568
+ "epoch": 5.232640362042144,
569
+ "grad_norm": 0.16601450741291046,
570
+ "learning_rate": 0.0006164046792176932,
571
+ "loss": 1.8945,
572
+ "step": 74000
573
+ },
574
+ {
575
+ "epoch": 5.303351718285957,
576
+ "grad_norm": 0.1559607982635498,
577
+ "learning_rate": 0.0006072747212575398,
578
+ "loss": 1.9005,
579
+ "step": 75000
580
+ },
581
+ {
582
+ "epoch": 5.3740630745297695,
583
+ "grad_norm": 0.1599714308977127,
584
+ "learning_rate": 0.000598135624200329,
585
+ "loss": 1.9056,
586
+ "step": 76000
587
+ },
588
+ {
589
+ "epoch": 5.444774430773582,
590
+ "grad_norm": 0.15538254380226135,
591
+ "learning_rate": 0.0005890056662401755,
592
+ "loss": 1.9091,
593
+ "step": 77000
594
+ },
595
+ {
596
+ "epoch": 5.515485787017395,
597
+ "grad_norm": 0.1645193099975586,
598
+ "learning_rate": 0.0005798665691829647,
599
+ "loss": 1.9138,
600
+ "step": 78000
601
+ },
602
+ {
603
+ "epoch": 5.586197143261208,
604
+ "grad_norm": 0.1560288369655609,
605
+ "learning_rate": 0.0005707366112228112,
606
+ "loss": 1.9276,
607
+ "step": 79000
608
+ },
609
+ {
610
+ "epoch": 5.65690849950502,
611
+ "grad_norm": 0.169467955827713,
612
+ "learning_rate": 0.0005615975141656004,
613
+ "loss": 1.9167,
614
+ "step": 80000
615
+ },
616
+ {
617
+ "epoch": 5.727619855748833,
618
+ "grad_norm": 0.18090558052062988,
619
+ "learning_rate": 0.0005524675562054469,
620
+ "loss": 1.9289,
621
+ "step": 81000
622
+ },
623
+ {
624
+ "epoch": 5.798331211992646,
625
+ "grad_norm": 0.16788819432258606,
626
+ "learning_rate": 0.0005433284591482362,
627
+ "loss": 1.9228,
628
+ "step": 82000
629
+ },
630
+ {
631
+ "epoch": 5.869042568236459,
632
+ "grad_norm": 0.15961690247058868,
633
+ "learning_rate": 0.0005341893620910255,
634
+ "loss": 1.9178,
635
+ "step": 83000
636
+ },
637
+ {
638
+ "epoch": 5.9397539244802715,
639
+ "grad_norm": 0.15657977759838104,
640
+ "learning_rate": 0.0005250594041308718,
641
+ "loss": 1.9161,
642
+ "step": 84000
643
+ },
644
+ {
645
+ "epoch": 6.0,
646
+ "eval_accuracy": 0.5312578351518911,
647
+ "eval_loss": 2.3910350799560547,
648
+ "eval_runtime": 102.0407,
649
+ "eval_samples_per_second": 459.307,
650
+ "eval_steps_per_second": 7.183,
651
+ "step": 84852
652
+ },
653
+ {
654
+ "epoch": 6.010465280724084,
655
+ "grad_norm": 0.15551112592220306,
656
+ "learning_rate": 0.0005159203070736611,
657
+ "loss": 1.9123,
658
+ "step": 85000
659
+ },
660
+ {
661
+ "epoch": 6.081176636967897,
662
+ "grad_norm": 0.18589554727077484,
663
+ "learning_rate": 0.0005067812100164504,
664
+ "loss": 1.7906,
665
+ "step": 86000
666
+ },
667
+ {
668
+ "epoch": 6.15188799321171,
669
+ "grad_norm": 0.16240116953849792,
670
+ "learning_rate": 0.0004976512520562968,
671
+ "loss": 1.805,
672
+ "step": 87000
673
+ },
674
+ {
675
+ "epoch": 6.222599349455523,
676
+ "grad_norm": 0.1752467155456543,
677
+ "learning_rate": 0.0004885121549990861,
678
+ "loss": 1.8147,
679
+ "step": 88000
680
+ },
681
+ {
682
+ "epoch": 6.293310705699335,
683
+ "grad_norm": 0.15973269939422607,
684
+ "learning_rate": 0.00047937305794187537,
685
+ "loss": 1.8063,
686
+ "step": 89000
687
+ },
688
+ {
689
+ "epoch": 6.364022061943148,
690
+ "grad_norm": 0.18358197808265686,
691
+ "learning_rate": 0.0004702430999817218,
692
+ "loss": 1.8182,
693
+ "step": 90000
694
+ },
695
+ {
696
+ "epoch": 6.434733418186961,
697
+ "grad_norm": 0.20550867915153503,
698
+ "learning_rate": 0.00046110400292451105,
699
+ "loss": 1.8251,
700
+ "step": 91000
701
+ },
702
+ {
703
+ "epoch": 6.5054447744307735,
704
+ "grad_norm": 0.18148034811019897,
705
+ "learning_rate": 0.0004519740449643575,
706
+ "loss": 1.8283,
707
+ "step": 92000
708
+ },
709
+ {
710
+ "epoch": 6.576156130674587,
711
+ "grad_norm": 0.1863207072019577,
712
+ "learning_rate": 0.0004428349479071468,
713
+ "loss": 1.834,
714
+ "step": 93000
715
+ },
716
+ {
717
+ "epoch": 6.646867486918399,
718
+ "grad_norm": 0.1836949586868286,
719
+ "learning_rate": 0.000433695850849936,
720
+ "loss": 1.8257,
721
+ "step": 94000
722
+ },
723
+ {
724
+ "epoch": 6.717578843162212,
725
+ "grad_norm": 0.18851223587989807,
726
+ "learning_rate": 0.00042456589288978247,
727
+ "loss": 1.8291,
728
+ "step": 95000
729
+ },
730
+ {
731
+ "epoch": 6.788290199406025,
732
+ "grad_norm": 0.16575908660888672,
733
+ "learning_rate": 0.00041542679583257176,
734
+ "loss": 1.8412,
735
+ "step": 96000
736
+ },
737
+ {
738
+ "epoch": 6.859001555649837,
739
+ "grad_norm": 0.1861979216337204,
740
+ "learning_rate": 0.000406287698775361,
741
+ "loss": 1.848,
742
+ "step": 97000
743
+ },
744
+ {
745
+ "epoch": 6.92971291189365,
746
+ "grad_norm": 0.1783532202243805,
747
+ "learning_rate": 0.00039714860171815024,
748
+ "loss": 1.8361,
749
+ "step": 98000
750
+ },
751
+ {
752
+ "epoch": 7.0,
753
+ "eval_accuracy": 0.5303533991815411,
754
+ "eval_loss": 2.4205334186553955,
755
+ "eval_runtime": 102.4141,
756
+ "eval_samples_per_second": 457.632,
757
+ "eval_steps_per_second": 7.157,
758
+ "step": 98994
759
+ },
760
+ {
761
+ "epoch": 7.000424268137463,
762
+ "grad_norm": 0.1907605677843094,
763
+ "learning_rate": 0.00038801864375799674,
764
+ "loss": 1.8413,
765
+ "step": 99000
766
+ },
767
+ {
768
+ "epoch": 7.0711356243812755,
769
+ "grad_norm": 0.21442489326000214,
770
+ "learning_rate": 0.0003788795467007859,
771
+ "loss": 1.6956,
772
+ "step": 100000
773
+ },
774
+ {
775
+ "epoch": 7.141846980625088,
776
+ "grad_norm": 0.19562986493110657,
777
+ "learning_rate": 0.0003697404496435752,
778
+ "loss": 1.7053,
779
+ "step": 101000
780
+ },
781
+ {
782
+ "epoch": 7.212558336868901,
783
+ "grad_norm": 0.23670311272144318,
784
+ "learning_rate": 0.00036060135258636445,
785
+ "loss": 1.7196,
786
+ "step": 102000
787
+ },
788
+ {
789
+ "epoch": 7.283269693112714,
790
+ "grad_norm": 0.19641369581222534,
791
+ "learning_rate": 0.00035148053372326815,
792
+ "loss": 1.719,
793
+ "step": 103000
794
+ },
795
+ {
796
+ "epoch": 7.353981049356527,
797
+ "grad_norm": 0.2086309790611267,
798
+ "learning_rate": 0.0003423414366660574,
799
+ "loss": 1.7279,
800
+ "step": 104000
801
+ },
802
+ {
803
+ "epoch": 7.424692405600339,
804
+ "grad_norm": 0.1947568953037262,
805
+ "learning_rate": 0.0003332023396088467,
806
+ "loss": 1.7389,
807
+ "step": 105000
808
+ },
809
+ {
810
+ "epoch": 7.495403761844152,
811
+ "grad_norm": 0.19536983966827393,
812
+ "learning_rate": 0.00032407238164869313,
813
+ "loss": 1.7428,
814
+ "step": 106000
815
+ },
816
+ {
817
+ "epoch": 7.566115118087965,
818
+ "grad_norm": 0.1872589886188507,
819
+ "learning_rate": 0.00031493328459148237,
820
+ "loss": 1.7463,
821
+ "step": 107000
822
+ },
823
+ {
824
+ "epoch": 7.6368264743317775,
825
+ "grad_norm": 0.22906361520290375,
826
+ "learning_rate": 0.0003057941875342716,
827
+ "loss": 1.7479,
828
+ "step": 108000
829
+ },
830
+ {
831
+ "epoch": 7.707537830575591,
832
+ "grad_norm": 0.19299902021884918,
833
+ "learning_rate": 0.0002966642295741181,
834
+ "loss": 1.7514,
835
+ "step": 109000
836
+ },
837
+ {
838
+ "epoch": 7.778249186819403,
839
+ "grad_norm": 0.19876809418201447,
840
+ "learning_rate": 0.00028752513251690734,
841
+ "loss": 1.7467,
842
+ "step": 110000
843
+ },
844
+ {
845
+ "epoch": 7.848960543063216,
846
+ "grad_norm": 0.22273430228233337,
847
+ "learning_rate": 0.0002783860354596966,
848
+ "loss": 1.76,
849
+ "step": 111000
850
+ },
851
+ {
852
+ "epoch": 7.919671899307029,
853
+ "grad_norm": 0.1979241669178009,
854
+ "learning_rate": 0.0002692560774995431,
855
+ "loss": 1.7547,
856
+ "step": 112000
857
+ },
858
+ {
859
+ "epoch": 7.990383255550841,
860
+ "grad_norm": 0.2099294811487198,
861
+ "learning_rate": 0.00026011698044233226,
862
+ "loss": 1.7477,
863
+ "step": 113000
864
+ },
865
+ {
866
+ "epoch": 8.0,
867
+ "eval_accuracy": 0.5282502161049046,
868
+ "eval_loss": 2.474827289581299,
869
+ "eval_runtime": 102.4954,
870
+ "eval_samples_per_second": 457.269,
871
+ "eval_steps_per_second": 7.152,
872
+ "step": 113136
873
+ },
874
+ {
875
+ "epoch": 8.061094611794655,
876
+ "grad_norm": 0.24672599136829376,
877
+ "learning_rate": 0.00025097788338512156,
878
+ "loss": 1.6197,
879
+ "step": 114000
880
+ },
881
+ {
882
+ "epoch": 8.131805968038467,
883
+ "grad_norm": 0.21202607452869415,
884
+ "learning_rate": 0.00024183878632791082,
885
+ "loss": 1.6192,
886
+ "step": 115000
887
+ },
888
+ {
889
+ "epoch": 8.20251732428228,
890
+ "grad_norm": 0.24981403350830078,
891
+ "learning_rate": 0.00023271796746481447,
892
+ "loss": 1.6329,
893
+ "step": 116000
894
+ },
895
+ {
896
+ "epoch": 8.273228680526092,
897
+ "grad_norm": 0.25290995836257935,
898
+ "learning_rate": 0.00022357887040760373,
899
+ "loss": 1.6386,
900
+ "step": 117000
901
+ },
902
+ {
903
+ "epoch": 8.343940036769904,
904
+ "grad_norm": 0.2473640739917755,
905
+ "learning_rate": 0.000214439773350393,
906
+ "loss": 1.6414,
907
+ "step": 118000
908
+ },
909
+ {
910
+ "epoch": 8.414651393013719,
911
+ "grad_norm": 0.20307676494121552,
912
+ "learning_rate": 0.00020530981539023944,
913
+ "loss": 1.6458,
914
+ "step": 119000
915
+ },
916
+ {
917
+ "epoch": 8.485362749257531,
918
+ "grad_norm": 0.21696613729000092,
919
+ "learning_rate": 0.0001961707183330287,
920
+ "loss": 1.6473,
921
+ "step": 120000
922
+ },
923
+ {
924
+ "epoch": 8.556074105501343,
925
+ "grad_norm": 0.23408186435699463,
926
+ "learning_rate": 0.00018703162127581797,
927
+ "loss": 1.656,
928
+ "step": 121000
929
+ },
930
+ {
931
+ "epoch": 8.626785461745156,
932
+ "grad_norm": 0.23058977723121643,
933
+ "learning_rate": 0.0001778925242186072,
934
+ "loss": 1.6578,
935
+ "step": 122000
936
+ },
937
+ {
938
+ "epoch": 8.697496817988968,
939
+ "grad_norm": 0.23317036032676697,
940
+ "learning_rate": 0.00016877170535551086,
941
+ "loss": 1.6516,
942
+ "step": 123000
943
+ },
944
+ {
945
+ "epoch": 8.768208174232782,
946
+ "grad_norm": 0.2361781746149063,
947
+ "learning_rate": 0.00015963260829830012,
948
+ "loss": 1.6525,
949
+ "step": 124000
950
+ },
951
+ {
952
+ "epoch": 8.838919530476595,
953
+ "grad_norm": 0.260776549577713,
954
+ "learning_rate": 0.00015049351124108936,
955
+ "loss": 1.6547,
956
+ "step": 125000
957
+ },
958
+ {
959
+ "epoch": 8.909630886720407,
960
+ "grad_norm": 0.2507932186126709,
961
+ "learning_rate": 0.00014136355328093583,
962
+ "loss": 1.6556,
963
+ "step": 126000
964
+ },
965
+ {
966
+ "epoch": 8.98034224296422,
967
+ "grad_norm": 0.2422228902578354,
968
+ "learning_rate": 0.0001322244562237251,
969
+ "loss": 1.6549,
970
+ "step": 127000
971
+ },
972
+ {
973
+ "epoch": 9.0,
974
+ "eval_accuracy": 0.5249380742803117,
975
+ "eval_loss": 2.5581541061401367,
976
+ "eval_runtime": 102.2383,
977
+ "eval_samples_per_second": 458.419,
978
+ "eval_steps_per_second": 7.17,
979
+ "step": 127278
980
+ },
981
+ {
982
+ "epoch": 9.051053599208032,
983
+ "grad_norm": 0.2604562044143677,
984
+ "learning_rate": 0.00012308535916651437,
985
+ "loss": 1.5675,
986
+ "step": 128000
987
+ },
988
+ {
989
+ "epoch": 9.121764955451846,
990
+ "grad_norm": 0.22102615237236023,
991
+ "learning_rate": 0.0001139462621093036,
992
+ "loss": 1.5337,
993
+ "step": 129000
994
+ },
995
+ {
996
+ "epoch": 9.192476311695659,
997
+ "grad_norm": 0.2960878014564514,
998
+ "learning_rate": 0.00010481630414915007,
999
+ "loss": 1.5556,
1000
+ "step": 130000
1001
+ },
1002
+ {
1003
+ "epoch": 9.263187667939471,
1004
+ "grad_norm": 0.22400617599487305,
1005
+ "learning_rate": 9.567720709193931e-05,
1006
+ "loss": 1.5491,
1007
+ "step": 131000
1008
+ },
1009
+ {
1010
+ "epoch": 9.333899024183284,
1011
+ "grad_norm": 0.24257275462150574,
1012
+ "learning_rate": 8.655638822884298e-05,
1013
+ "loss": 1.5502,
1014
+ "step": 132000
1015
+ },
1016
+ {
1017
+ "epoch": 9.404610380427096,
1018
+ "grad_norm": 0.24599485099315643,
1019
+ "learning_rate": 7.741729117163225e-05,
1020
+ "loss": 1.552,
1021
+ "step": 133000
1022
+ },
1023
+ {
1024
+ "epoch": 9.47532173667091,
1025
+ "grad_norm": 0.25757452845573425,
1026
+ "learning_rate": 6.82781941144215e-05,
1027
+ "loss": 1.5576,
1028
+ "step": 134000
1029
+ },
1030
+ {
1031
+ "epoch": 9.546033092914723,
1032
+ "grad_norm": 0.28276532888412476,
1033
+ "learning_rate": 5.914823615426796e-05,
1034
+ "loss": 1.5529,
1035
+ "step": 135000
1036
+ },
1037
+ {
1038
+ "epoch": 9.616744449158535,
1039
+ "grad_norm": 0.2369563728570938,
1040
+ "learning_rate": 5.000913909705721e-05,
1041
+ "loss": 1.5548,
1042
+ "step": 136000
1043
+ },
1044
+ {
1045
+ "epoch": 9.687455805402347,
1046
+ "grad_norm": 0.25778231024742126,
1047
+ "learning_rate": 4.0870042039846464e-05,
1048
+ "loss": 1.55,
1049
+ "step": 137000
1050
+ },
1051
+ {
1052
+ "epoch": 9.75816716164616,
1053
+ "grad_norm": 0.2770988941192627,
1054
+ "learning_rate": 3.173094498263571e-05,
1055
+ "loss": 1.559,
1056
+ "step": 138000
1057
+ },
1058
+ {
1059
+ "epoch": 9.828878517889972,
1060
+ "grad_norm": 0.2701665163040161,
1061
+ "learning_rate": 2.261012611953939e-05,
1062
+ "loss": 1.5579,
1063
+ "step": 139000
1064
+ },
1065
+ {
1066
+ "epoch": 9.899589874133786,
1067
+ "grad_norm": 0.2540683448314667,
1068
+ "learning_rate": 1.3471029062328641e-05,
1069
+ "loss": 1.5563,
1070
+ "step": 140000
1071
+ },
1072
+ {
1073
+ "epoch": 9.970301230377599,
1074
+ "grad_norm": 0.26811909675598145,
1075
+ "learning_rate": 4.341071102175106e-06,
1076
+ "loss": 1.5611,
1077
+ "step": 141000
1078
+ },
1079
+ {
1080
+ "epoch": 10.0,
1081
+ "eval_accuracy": 0.5203706477236009,
1082
+ "eval_loss": 2.6626083850860596,
1083
+ "eval_runtime": 102.3167,
1084
+ "eval_samples_per_second": 458.068,
1085
+ "eval_steps_per_second": 7.164,
1086
+ "step": 141420
1087
+ },
1088
+ {
1089
+ "epoch": 10.0,
1090
+ "step": 141420,
1091
+ "total_flos": 6.171008476428288e+17,
1092
+ "train_loss": 2.0168114449748256,
1093
+ "train_runtime": 24063.5613,
1094
+ "train_samples_per_second": 188.054,
1095
+ "train_steps_per_second": 5.877
1096
+ }
1097
+ ],
1098
+ "logging_steps": 1000,
1099
+ "max_steps": 141420,
1100
+ "num_input_tokens_seen": 0,
1101
+ "num_train_epochs": 10,
1102
+ "save_steps": 5000,
1103
+ "stateful_callbacks": {
1104
+ "TrainerControl": {
1105
+ "args": {
1106
+ "should_epoch_stop": false,
1107
+ "should_evaluate": false,
1108
+ "should_log": false,
1109
+ "should_save": true,
1110
+ "should_training_stop": true
1111
+ },
1112
+ "attributes": {}
1113
+ }
1114
+ },
1115
+ "total_flos": 6.171008476428288e+17,
1116
+ "train_batch_size": 32,
1117
+ "trial_name": null,
1118
+ "trial_params": null
1119
+ }
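
trainer_state.json carries the full step-level log above, which is unwieldy to read by hand. A short sketch of pulling out just the end-of-epoch evaluation rows after downloading the file (the local path is assumed):

```python
import json

# Assumed local path to the file added in this commit.
with open("trainer_state.json") as f:
    state = json.load(f)

# Entries containing "eval_loss" are the end-of-epoch evaluations; the
# remaining entries are step-level training logs (loss, learning_rate, grad_norm).
for entry in state["log_history"]:
    if "eval_loss" in entry:
        print(f'epoch {entry["epoch"]:>4.1f}  '
              f'eval_loss={entry["eval_loss"]:.4f}  '
              f'eval_accuracy={entry["eval_accuracy"]:.4f}')
```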