Gaphilly committed
Commit 4a68883
1 Parent(s): 0d12beb

commit from $USER

config.json ADDED
@@ -0,0 +1,39 @@
+ {
+ "_name_or_path": "./gpt2-shakespeare\\checkpoint-3900",
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2LMHeadModel"
+ ],
+ "attn_pdrop": 0.1,
+ "bos_token_id": 50256,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 50256,
+ "initializer_range": 0.02,
+ "layer_norm_epsilon": 1e-05,
+ "model_type": "gpt2",
+ "n_ctx": 1024,
+ "n_embd": 768,
+ "n_head": 12,
+ "n_inner": null,
+ "n_layer": 12,
+ "n_positions": 1024,
+ "reorder_and_upcast_attn": false,
+ "resid_pdrop": 0.1,
+ "scale_attn_by_inverse_layer_idx": false,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "task_specific_params": {
+ "text-generation": {
+ "do_sample": true,
+ "max_length": 50
+ }
+ },
+ "torch_dtype": "float32",
+ "transformers_version": "4.40.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
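The config above describes a standard 12-layer, 768-dim GPT-2 (the ~124M-parameter base architecture) fine-tuned from a local checkpoint. A minimal loading sketch, assuming the files in this commit sit in a local directory named ./gpt2-shakespeare (that path is an assumption taken from the `_name_or_path` field, not confirmed by the commit):

```python
# Minimal loading sketch; "./gpt2-shakespeare" is an assumed local path.
from transformers import GPT2LMHeadModel, AutoTokenizer

model = GPT2LMHeadModel.from_pretrained("./gpt2-shakespeare")
tokenizer = AutoTokenizer.from_pretrained("./gpt2-shakespeare")
print(model.config.n_layer, model.config.n_embd)  # 12, 768 per the config above
```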
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "transformers_version": "4.40.1"
+ }
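generation_config.json only pins the BOS/EOS token ids; the sampling defaults (do_sample=true, max_length=50) live under task_specific_params in config.json. A sketch of generating with those settings passed explicitly, where the path and prompt are illustrative assumptions:

```python
# Sampling sketch using the defaults recorded in config.json's
# task_specific_params; path and prompt are assumptions.
from transformers import GPT2LMHeadModel, AutoTokenizer

model = GPT2LMHeadModel.from_pretrained("./gpt2-shakespeare")
tokenizer = AutoTokenizer.from_pretrained("./gpt2-shakespeare")

inputs = tokenizer("ROMEO:", return_tensors="pt")
out = model.generate(**inputs, do_sample=True, max_length=50,
                     pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```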
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5154c6ca38077b04d8f4173c74b2da3d46a04580fc6c8fe1432a08b733160735
+ size 497774208
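The three lines above are a Git LFS pointer, not the weights themselves: `oid` is the SHA-256 of the real file and `size` its byte length (~498 MB, consistent with ~124M float32 parameters). A sketch for verifying a downloaded copy against the pointer:

```python
# Verify a downloaded model.safetensors against the LFS pointer above.
import hashlib

def sha256_of(path: str, chunk: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

expected = "5154c6ca38077b04d8f4173c74b2da3d46a04580fc6c8fe1432a08b733160735"
assert sha256_of("model.safetensors") == expected
```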
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:36bc7b32830f2fadb2a016745d520bddc6aecee7360385d143eeaf0c65cbff20
+ size 995638202
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:61c89b91383aabdf3598cc50da4f10b585def58e826bb18972264e57b5b9ca5d
+ size 13990
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa53da977e9f71f807e5e42d8f652a9a84db1910f5ed30f09f7564220a0a6e4c
+ size 1064
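optimizer.pt, scheduler.pt, and rng_state.pth are the pieces that let transformers.Trainer resume this run mid-training rather than restart it; they are plain torch serializations (note optimizer.pt is ~2x the model size, as expected for AdamW's two moment buffers per parameter). A quick inspection sketch, assuming the files are local; the keys shown are the usual torch layout, not guaranteed by this commit:

```python
# Peek at the serialized training state.
import torch

opt = torch.load("optimizer.pt", map_location="cpu")
sched = torch.load("scheduler.pt", map_location="cpu")
print(opt.keys())  # typically dict_keys(['state', 'param_groups'])
print(sched)       # LR scheduler state dict (e.g. last_epoch, _last_lr)
```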
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "add_bos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "50256": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|endoftext|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "model_max_length": 1024,
+ "pad_token": "<|endoftext|>",
+ "tokenizer_class": "GPT2Tokenizer",
+ "unk_token": "<|endoftext|>"
+ }
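Together with special_tokens_map.json above and the vocab.json/merges.txt files in this commit, this reconstructs the stock GPT-2 BPE tokenizer, with <|endoftext|> (id 50256) doing quadruple duty as BOS, EOS, PAD, and UNK. A round-trip sketch, again assuming a local path:

```python
# Tokenizer round-trip sketch; "./gpt2-shakespeare" is an assumed path.
from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("./gpt2-shakespeare")
ids = tok("To be, or not to be")["input_ids"]
print(ids)
print(tok.special_tokens_map)  # bos/eos/pad/unk all map to <|endoftext|>
print(tok.decode(ids))
```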
trainer_state.json ADDED
@@ -0,0 +1,1036 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.3169845594913716,
+ "eval_steps": 500,
+ "global_step": 4350,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.009082652134423252,
+ "grad_norm": 2.5449585914611816,
+ "learning_rate": 4.9848622464426284e-05,
+ "loss": 8.7191,
+ "step": 30
+ },
+ {
+ "epoch": 0.018165304268846504,
+ "grad_norm": 2.3144371509552,
+ "learning_rate": 4.969724492885256e-05,
+ "loss": 7.4698,
+ "step": 60
+ },
+ {
+ "epoch": 0.027247956403269755,
+ "grad_norm": 2.304499626159668,
+ "learning_rate": 4.954586739327884e-05,
+ "loss": 6.5589,
+ "step": 90
+ },
+ {
+ "epoch": 0.03633060853769301,
+ "grad_norm": 2.272608757019043,
+ "learning_rate": 4.9394489857705115e-05,
+ "loss": 6.2425,
+ "step": 120
+ },
+ {
+ "epoch": 0.045413260672116255,
+ "grad_norm": 2.46329402923584,
+ "learning_rate": 4.9243112322131396e-05,
+ "loss": 6.1459,
+ "step": 150
+ },
+ {
+ "epoch": 0.05449591280653951,
+ "grad_norm": 1.8283530473709106,
+ "learning_rate": 4.909173478655768e-05,
+ "loss": 5.969,
+ "step": 180
+ },
+ {
+ "epoch": 0.06357856494096276,
+ "grad_norm": 2.1723110675811768,
+ "learning_rate": 4.894035725098395e-05,
+ "loss": 6.008,
+ "step": 210
+ },
+ {
+ "epoch": 0.07266121707538602,
+ "grad_norm": 2.5368807315826416,
+ "learning_rate": 4.878897971541024e-05,
+ "loss": 5.8783,
+ "step": 240
+ },
+ {
+ "epoch": 0.08174386920980926,
+ "grad_norm": 2.3222858905792236,
+ "learning_rate": 4.8637602179836515e-05,
+ "loss": 5.825,
+ "step": 270
+ },
+ {
+ "epoch": 0.09082652134423251,
+ "grad_norm": 2.557065010070801,
+ "learning_rate": 4.8486224644262796e-05,
+ "loss": 5.76,
+ "step": 300
+ },
+ {
+ "epoch": 0.09990917347865577,
+ "grad_norm": 2.4016597270965576,
+ "learning_rate": 4.833484710868907e-05,
+ "loss": 5.7039,
+ "step": 330
+ },
+ {
+ "epoch": 0.10899182561307902,
+ "grad_norm": 2.6895477771759033,
+ "learning_rate": 4.818346957311535e-05,
+ "loss": 5.5843,
+ "step": 360
+ },
+ {
+ "epoch": 0.11807447774750227,
+ "grad_norm": 2.741234064102173,
+ "learning_rate": 4.8032092037541634e-05,
+ "loss": 5.6376,
+ "step": 390
+ },
+ {
+ "epoch": 0.1271571298819255,
+ "grad_norm": 2.8266804218292236,
+ "learning_rate": 4.788071450196791e-05,
+ "loss": 5.5649,
+ "step": 420
+ },
+ {
+ "epoch": 0.1362397820163488,
+ "grad_norm": 2.792654275894165,
+ "learning_rate": 4.772933696639419e-05,
+ "loss": 5.3651,
+ "step": 450
+ },
+ {
+ "epoch": 0.14532243415077203,
+ "grad_norm": 2.7088894844055176,
+ "learning_rate": 4.757795943082047e-05,
+ "loss": 5.4921,
+ "step": 480
+ },
+ {
+ "epoch": 0.15440508628519528,
+ "grad_norm": 2.627201795578003,
+ "learning_rate": 4.7426581895246746e-05,
+ "loss": 5.461,
+ "step": 510
+ },
+ {
+ "epoch": 0.16348773841961853,
+ "grad_norm": 2.6373610496520996,
+ "learning_rate": 4.727520435967303e-05,
+ "loss": 5.3973,
+ "step": 540
+ },
+ {
+ "epoch": 0.17257039055404177,
+ "grad_norm": 2.772226095199585,
+ "learning_rate": 4.71238268240993e-05,
+ "loss": 5.3618,
+ "step": 570
+ },
+ {
+ "epoch": 0.18165304268846502,
+ "grad_norm": 2.6005172729492188,
+ "learning_rate": 4.6972449288525583e-05,
+ "loss": 5.4365,
+ "step": 600
+ },
+ {
+ "epoch": 0.1907356948228883,
+ "grad_norm": 4.7815260887146,
+ "learning_rate": 4.6821071752951865e-05,
+ "loss": 5.3225,
+ "step": 630
+ },
+ {
+ "epoch": 0.19981834695731154,
+ "grad_norm": 2.5871763229370117,
+ "learning_rate": 4.6669694217378146e-05,
+ "loss": 5.3615,
+ "step": 660
+ },
+ {
+ "epoch": 0.2089009990917348,
+ "grad_norm": 2.686840534210205,
+ "learning_rate": 4.651831668180443e-05,
+ "loss": 5.3201,
+ "step": 690
+ },
+ {
+ "epoch": 0.21798365122615804,
+ "grad_norm": 2.6963067054748535,
+ "learning_rate": 4.63669391462307e-05,
+ "loss": 5.1972,
+ "step": 720
+ },
+ {
+ "epoch": 0.22706630336058128,
+ "grad_norm": 2.9284744262695312,
+ "learning_rate": 4.6215561610656984e-05,
+ "loss": 5.3031,
+ "step": 750
+ },
+ {
+ "epoch": 0.23614895549500453,
+ "grad_norm": 2.7302122116088867,
+ "learning_rate": 4.606418407508326e-05,
+ "loss": 5.2057,
+ "step": 780
+ },
+ {
+ "epoch": 0.2452316076294278,
+ "grad_norm": 2.5760107040405273,
+ "learning_rate": 4.591280653950954e-05,
+ "loss": 5.1767,
+ "step": 810
+ },
+ {
+ "epoch": 0.254314259763851,
+ "grad_norm": 2.9804234504699707,
+ "learning_rate": 4.576142900393582e-05,
+ "loss": 5.1875,
+ "step": 840
+ },
+ {
+ "epoch": 0.2633969118982743,
+ "grad_norm": 3.311448812484741,
+ "learning_rate": 4.5610051468362096e-05,
+ "loss": 5.0712,
+ "step": 870
+ },
+ {
+ "epoch": 0.2724795640326976,
+ "grad_norm": 2.67448091506958,
+ "learning_rate": 4.545867393278838e-05,
+ "loss": 5.1241,
+ "step": 900
+ },
+ {
+ "epoch": 0.2815622161671208,
+ "grad_norm": 2.8352444171905518,
+ "learning_rate": 4.530729639721465e-05,
+ "loss": 5.1732,
+ "step": 930
+ },
+ {
+ "epoch": 0.29064486830154407,
+ "grad_norm": 2.5969910621643066,
+ "learning_rate": 4.515591886164093e-05,
+ "loss": 5.0828,
+ "step": 960
+ },
+ {
+ "epoch": 0.2997275204359673,
+ "grad_norm": 2.8792121410369873,
+ "learning_rate": 4.5004541326067215e-05,
+ "loss": 5.0844,
+ "step": 990
+ },
+ {
+ "epoch": 0.30881017257039056,
+ "grad_norm": 2.9506993293762207,
+ "learning_rate": 4.485316379049349e-05,
+ "loss": 5.1764,
+ "step": 1020
+ },
+ {
+ "epoch": 0.3178928247048138,
+ "grad_norm": 2.8818390369415283,
+ "learning_rate": 4.470178625491977e-05,
+ "loss": 5.0663,
+ "step": 1050
+ },
+ {
+ "epoch": 0.32697547683923706,
+ "grad_norm": 3.128511667251587,
+ "learning_rate": 4.4550408719346046e-05,
+ "loss": 5.1026,
+ "step": 1080
+ },
+ {
+ "epoch": 0.33605812897366033,
+ "grad_norm": 3.0155856609344482,
+ "learning_rate": 4.4399031183772334e-05,
+ "loss": 5.0686,
+ "step": 1110
+ },
+ {
+ "epoch": 0.34514078110808355,
+ "grad_norm": 2.811448097229004,
+ "learning_rate": 4.424765364819861e-05,
+ "loss": 5.0351,
+ "step": 1140
+ },
+ {
+ "epoch": 0.3542234332425068,
+ "grad_norm": 2.9916000366210938,
+ "learning_rate": 4.409627611262489e-05,
+ "loss": 5.1651,
+ "step": 1170
+ },
+ {
+ "epoch": 0.36330608537693004,
+ "grad_norm": 2.9689950942993164,
+ "learning_rate": 4.394489857705117e-05,
+ "loss": 5.1457,
+ "step": 1200
+ },
+ {
+ "epoch": 0.3723887375113533,
+ "grad_norm": 2.7896862030029297,
+ "learning_rate": 4.3793521041477446e-05,
+ "loss": 5.0049,
+ "step": 1230
+ },
+ {
+ "epoch": 0.3814713896457766,
+ "grad_norm": 2.790712833404541,
+ "learning_rate": 4.364214350590373e-05,
+ "loss": 4.9943,
+ "step": 1260
+ },
+ {
+ "epoch": 0.3905540417801998,
+ "grad_norm": 2.9977900981903076,
+ "learning_rate": 4.349076597033e-05,
+ "loss": 4.996,
+ "step": 1290
+ },
+ {
+ "epoch": 0.3996366939146231,
+ "grad_norm": 3.504183530807495,
+ "learning_rate": 4.333938843475628e-05,
+ "loss": 4.9611,
+ "step": 1320
+ },
+ {
+ "epoch": 0.4087193460490463,
+ "grad_norm": 2.737821578979492,
+ "learning_rate": 4.3188010899182565e-05,
+ "loss": 4.9541,
+ "step": 1350
+ },
+ {
+ "epoch": 0.4178019981834696,
+ "grad_norm": 3.0585217475891113,
+ "learning_rate": 4.303663336360884e-05,
+ "loss": 4.9014,
+ "step": 1380
+ },
+ {
+ "epoch": 0.4268846503178928,
+ "grad_norm": 3.004413604736328,
+ "learning_rate": 4.288525582803512e-05,
+ "loss": 4.9703,
+ "step": 1410
+ },
+ {
+ "epoch": 0.4359673024523161,
+ "grad_norm": 2.9328274726867676,
+ "learning_rate": 4.27338782924614e-05,
+ "loss": 4.9637,
+ "step": 1440
+ },
+ {
+ "epoch": 0.44504995458673935,
+ "grad_norm": 2.93721604347229,
+ "learning_rate": 4.258250075688768e-05,
+ "loss": 4.8024,
+ "step": 1470
+ },
+ {
+ "epoch": 0.45413260672116257,
+ "grad_norm": 3.0333001613616943,
+ "learning_rate": 4.243112322131396e-05,
+ "loss": 4.8555,
+ "step": 1500
+ },
+ {
+ "epoch": 0.46321525885558584,
+ "grad_norm": 3.3445775508880615,
+ "learning_rate": 4.227974568574024e-05,
+ "loss": 4.8035,
+ "step": 1530
+ },
+ {
+ "epoch": 0.47229791099000906,
+ "grad_norm": 2.9364359378814697,
+ "learning_rate": 4.212836815016652e-05,
+ "loss": 4.9296,
+ "step": 1560
+ },
+ {
+ "epoch": 0.48138056312443234,
+ "grad_norm": 2.755453586578369,
+ "learning_rate": 4.1976990614592796e-05,
+ "loss": 4.8051,
+ "step": 1590
+ },
+ {
+ "epoch": 0.4904632152588556,
+ "grad_norm": 3.0365066528320312,
+ "learning_rate": 4.182561307901908e-05,
+ "loss": 4.7833,
+ "step": 1620
+ },
+ {
+ "epoch": 0.49954586739327883,
+ "grad_norm": 3.2632575035095215,
+ "learning_rate": 4.167423554344536e-05,
+ "loss": 4.837,
+ "step": 1650
+ },
+ {
+ "epoch": 0.508628519527702,
+ "grad_norm": 3.310817003250122,
+ "learning_rate": 4.152285800787163e-05,
+ "loss": 4.7417,
+ "step": 1680
+ },
+ {
+ "epoch": 0.5177111716621253,
+ "grad_norm": 3.121156692504883,
+ "learning_rate": 4.1371480472297915e-05,
+ "loss": 4.7791,
+ "step": 1710
+ },
+ {
+ "epoch": 0.5267938237965486,
+ "grad_norm": 3.200591564178467,
+ "learning_rate": 4.122010293672419e-05,
+ "loss": 4.8619,
+ "step": 1740
+ },
+ {
+ "epoch": 0.5358764759309719,
+ "grad_norm": 3.1420202255249023,
+ "learning_rate": 4.106872540115047e-05,
+ "loss": 4.7576,
+ "step": 1770
+ },
+ {
+ "epoch": 0.5449591280653951,
+ "grad_norm": 3.2239160537719727,
+ "learning_rate": 4.091734786557675e-05,
+ "loss": 4.7767,
+ "step": 1800
+ },
+ {
+ "epoch": 0.5540417801998183,
+ "grad_norm": 2.9624414443969727,
+ "learning_rate": 4.076597033000303e-05,
+ "loss": 4.8608,
+ "step": 1830
+ },
+ {
+ "epoch": 0.5631244323342416,
+ "grad_norm": 3.14367938041687,
+ "learning_rate": 4.061459279442931e-05,
+ "loss": 4.7909,
+ "step": 1860
+ },
+ {
+ "epoch": 0.5722070844686649,
+ "grad_norm": 3.664564371109009,
+ "learning_rate": 4.046321525885558e-05,
+ "loss": 4.7325,
+ "step": 1890
+ },
+ {
+ "epoch": 0.5812897366030881,
+ "grad_norm": 2.9251296520233154,
+ "learning_rate": 4.0311837723281864e-05,
+ "loss": 4.8017,
+ "step": 1920
+ },
+ {
+ "epoch": 0.5903723887375113,
+ "grad_norm": 2.8796215057373047,
+ "learning_rate": 4.0160460187708146e-05,
+ "loss": 4.7124,
+ "step": 1950
+ },
+ {
+ "epoch": 0.5994550408719346,
+ "grad_norm": 3.0257513523101807,
+ "learning_rate": 4.000908265213443e-05,
+ "loss": 4.7311,
+ "step": 1980
+ },
+ {
+ "epoch": 0.6085376930063578,
+ "grad_norm": 3.096799612045288,
+ "learning_rate": 3.985770511656071e-05,
+ "loss": 4.6568,
+ "step": 2010
+ },
+ {
+ "epoch": 0.6176203451407811,
+ "grad_norm": 3.1430232524871826,
+ "learning_rate": 3.970632758098698e-05,
+ "loss": 4.6451,
+ "step": 2040
+ },
+ {
+ "epoch": 0.6267029972752044,
+ "grad_norm": 3.0216684341430664,
+ "learning_rate": 3.9554950045413265e-05,
+ "loss": 4.6565,
+ "step": 2070
+ },
+ {
+ "epoch": 0.6357856494096276,
+ "grad_norm": 3.0199525356292725,
+ "learning_rate": 3.940357250983954e-05,
+ "loss": 4.6988,
+ "step": 2100
+ },
+ {
+ "epoch": 0.6448683015440508,
+ "grad_norm": 2.9998953342437744,
+ "learning_rate": 3.925219497426582e-05,
+ "loss": 4.6654,
+ "step": 2130
+ },
+ {
+ "epoch": 0.6539509536784741,
+ "grad_norm": 3.15533447265625,
+ "learning_rate": 3.91008174386921e-05,
+ "loss": 4.616,
+ "step": 2160
+ },
+ {
+ "epoch": 0.6630336058128974,
+ "grad_norm": 2.8745930194854736,
+ "learning_rate": 3.894943990311838e-05,
+ "loss": 4.649,
+ "step": 2190
+ },
+ {
+ "epoch": 0.6721162579473207,
+ "grad_norm": 3.0759665966033936,
+ "learning_rate": 3.879806236754466e-05,
+ "loss": 4.6054,
+ "step": 2220
+ },
+ {
+ "epoch": 0.6811989100817438,
+ "grad_norm": 3.0508482456207275,
+ "learning_rate": 3.864668483197093e-05,
+ "loss": 4.4922,
+ "step": 2250
+ },
+ {
+ "epoch": 0.6902815622161671,
+ "grad_norm": 2.9260127544403076,
+ "learning_rate": 3.8495307296397214e-05,
+ "loss": 4.6469,
+ "step": 2280
+ },
+ {
+ "epoch": 0.6993642143505904,
+ "grad_norm": 2.924952268600464,
+ "learning_rate": 3.8343929760823496e-05,
+ "loss": 4.6164,
+ "step": 2310
+ },
+ {
+ "epoch": 0.7084468664850136,
+ "grad_norm": 3.056288480758667,
+ "learning_rate": 3.819255222524977e-05,
+ "loss": 4.5877,
+ "step": 2340
+ },
+ {
+ "epoch": 0.7175295186194369,
+ "grad_norm": 4.257227420806885,
+ "learning_rate": 3.804117468967605e-05,
+ "loss": 4.6301,
+ "step": 2370
+ },
+ {
+ "epoch": 0.7266121707538601,
+ "grad_norm": 3.282137155532837,
+ "learning_rate": 3.788979715410233e-05,
+ "loss": 4.4623,
+ "step": 2400
+ },
+ {
+ "epoch": 0.7356948228882834,
+ "grad_norm": 2.945059299468994,
+ "learning_rate": 3.7738419618528615e-05,
+ "loss": 4.6267,
+ "step": 2430
+ },
+ {
+ "epoch": 0.7447774750227066,
+ "grad_norm": 3.1374645233154297,
+ "learning_rate": 3.7587042082954896e-05,
+ "loss": 4.6835,
+ "step": 2460
+ },
+ {
+ "epoch": 0.7538601271571299,
+ "grad_norm": 3.21016001701355,
+ "learning_rate": 3.743566454738117e-05,
+ "loss": 4.5581,
+ "step": 2490
+ },
+ {
+ "epoch": 0.7629427792915532,
+ "grad_norm": 2.8072383403778076,
+ "learning_rate": 3.728428701180745e-05,
+ "loss": 4.571,
+ "step": 2520
+ },
+ {
+ "epoch": 0.7720254314259763,
+ "grad_norm": 2.9735002517700195,
+ "learning_rate": 3.713290947623373e-05,
+ "loss": 4.5013,
+ "step": 2550
+ },
+ {
+ "epoch": 0.7811080835603996,
+ "grad_norm": 3.182706832885742,
+ "learning_rate": 3.698153194066001e-05,
+ "loss": 4.534,
+ "step": 2580
+ },
+ {
+ "epoch": 0.7901907356948229,
+ "grad_norm": 2.958193778991699,
+ "learning_rate": 3.683015440508629e-05,
+ "loss": 4.5697,
+ "step": 2610
+ },
+ {
+ "epoch": 0.7992733878292462,
+ "grad_norm": 2.950946569442749,
+ "learning_rate": 3.6678776869512564e-05,
+ "loss": 4.6066,
+ "step": 2640
+ },
+ {
+ "epoch": 0.8083560399636693,
+ "grad_norm": 2.9701859951019287,
+ "learning_rate": 3.6527399333938846e-05,
+ "loss": 4.5934,
+ "step": 2670
+ },
+ {
+ "epoch": 0.8174386920980926,
+ "grad_norm": 3.2177681922912598,
+ "learning_rate": 3.637602179836512e-05,
+ "loss": 4.5418,
+ "step": 2700
+ },
+ {
+ "epoch": 0.8265213442325159,
+ "grad_norm": 2.7435505390167236,
+ "learning_rate": 3.62246442627914e-05,
+ "loss": 4.5485,
+ "step": 2730
+ },
+ {
+ "epoch": 0.8356039963669392,
+ "grad_norm": 3.4409849643707275,
+ "learning_rate": 3.607326672721768e-05,
+ "loss": 4.4268,
+ "step": 2760
+ },
+ {
+ "epoch": 0.8446866485013624,
+ "grad_norm": 3.803256034851074,
+ "learning_rate": 3.592188919164396e-05,
+ "loss": 4.5643,
+ "step": 2790
+ },
+ {
+ "epoch": 0.8537693006357856,
+ "grad_norm": 3.0399341583251953,
+ "learning_rate": 3.5770511656070246e-05,
+ "loss": 4.4783,
+ "step": 2820
+ },
+ {
+ "epoch": 0.8628519527702089,
+ "grad_norm": 2.9948980808258057,
+ "learning_rate": 3.561913412049652e-05,
+ "loss": 4.4929,
+ "step": 2850
+ },
+ {
+ "epoch": 0.8719346049046321,
+ "grad_norm": 3.400299549102783,
+ "learning_rate": 3.54677565849228e-05,
+ "loss": 4.4803,
+ "step": 2880
+ },
+ {
+ "epoch": 0.8810172570390554,
+ "grad_norm": 2.9282257556915283,
+ "learning_rate": 3.531637904934908e-05,
+ "loss": 4.4554,
+ "step": 2910
+ },
+ {
+ "epoch": 0.8900999091734787,
+ "grad_norm": 2.957598924636841,
+ "learning_rate": 3.516500151377536e-05,
+ "loss": 4.5324,
+ "step": 2940
+ },
+ {
+ "epoch": 0.8991825613079019,
+ "grad_norm": 2.9992153644561768,
+ "learning_rate": 3.501362397820164e-05,
+ "loss": 4.508,
+ "step": 2970
+ },
+ {
+ "epoch": 0.9082652134423251,
+ "grad_norm": 3.1509618759155273,
+ "learning_rate": 3.4862246442627914e-05,
+ "loss": 4.4265,
+ "step": 3000
+ },
+ {
+ "epoch": 0.9173478655767484,
+ "grad_norm": 3.027726888656616,
+ "learning_rate": 3.4710868907054196e-05,
+ "loss": 4.4979,
+ "step": 3030
+ },
+ {
+ "epoch": 0.9264305177111717,
+ "grad_norm": 3.0711803436279297,
+ "learning_rate": 3.455949137148047e-05,
+ "loss": 4.4946,
+ "step": 3060
+ },
+ {
+ "epoch": 0.935513169845595,
+ "grad_norm": 2.982269287109375,
+ "learning_rate": 3.440811383590675e-05,
+ "loss": 4.3433,
+ "step": 3090
+ },
+ {
+ "epoch": 0.9445958219800181,
+ "grad_norm": 2.9734480381011963,
+ "learning_rate": 3.425673630033303e-05,
+ "loss": 4.453,
+ "step": 3120
+ },
+ {
+ "epoch": 0.9536784741144414,
+ "grad_norm": 2.985030174255371,
+ "learning_rate": 3.410535876475931e-05,
+ "loss": 4.3705,
+ "step": 3150
+ },
+ {
+ "epoch": 0.9627611262488647,
+ "grad_norm": 3.1812829971313477,
+ "learning_rate": 3.395398122918559e-05,
+ "loss": 4.3414,
+ "step": 3180
+ },
+ {
+ "epoch": 0.971843778383288,
+ "grad_norm": 3.415923595428467,
+ "learning_rate": 3.380260369361187e-05,
+ "loss": 4.522,
+ "step": 3210
+ },
+ {
+ "epoch": 0.9809264305177112,
+ "grad_norm": 3.176737070083618,
+ "learning_rate": 3.3651226158038145e-05,
+ "loss": 4.4112,
+ "step": 3240
+ },
+ {
+ "epoch": 0.9900090826521344,
+ "grad_norm": 3.1306254863739014,
+ "learning_rate": 3.3499848622464433e-05,
+ "loss": 4.5104,
+ "step": 3270
+ },
+ {
+ "epoch": 0.9990917347865577,
+ "grad_norm": 3.216395616531372,
+ "learning_rate": 3.334847108689071e-05,
+ "loss": 4.3244,
+ "step": 3300
+ },
+ {
+ "epoch": 1.008174386920981,
+ "grad_norm": 3.1889307498931885,
+ "learning_rate": 3.319709355131699e-05,
+ "loss": 4.3521,
+ "step": 3330
+ },
+ {
+ "epoch": 1.017257039055404,
+ "grad_norm": 2.8001787662506104,
+ "learning_rate": 3.3045716015743264e-05,
+ "loss": 4.3047,
+ "step": 3360
+ },
+ {
+ "epoch": 1.0263396911898275,
+ "grad_norm": 3.5796685218811035,
+ "learning_rate": 3.2894338480169546e-05,
+ "loss": 4.1921,
+ "step": 3390
+ },
+ {
+ "epoch": 1.0354223433242506,
+ "grad_norm": 3.725538730621338,
+ "learning_rate": 3.274296094459583e-05,
+ "loss": 4.3203,
+ "step": 3420
+ },
+ {
+ "epoch": 1.044504995458674,
+ "grad_norm": 2.9058167934417725,
+ "learning_rate": 3.25915834090221e-05,
+ "loss": 4.385,
+ "step": 3450
+ },
+ {
+ "epoch": 1.0535876475930972,
+ "grad_norm": 3.120119333267212,
+ "learning_rate": 3.244020587344838e-05,
+ "loss": 4.2883,
+ "step": 3480
+ },
+ {
+ "epoch": 1.0626702997275204,
+ "grad_norm": 3.230036735534668,
+ "learning_rate": 3.228882833787466e-05,
+ "loss": 4.3602,
+ "step": 3510
+ },
+ {
+ "epoch": 1.0717529518619437,
+ "grad_norm": 3.482921600341797,
+ "learning_rate": 3.213745080230094e-05,
+ "loss": 4.3984,
+ "step": 3540
+ },
+ {
+ "epoch": 1.080835603996367,
+ "grad_norm": 3.0121572017669678,
+ "learning_rate": 3.198607326672722e-05,
+ "loss": 4.3864,
+ "step": 3570
+ },
+ {
+ "epoch": 1.0899182561307903,
+ "grad_norm": 3.277411460876465,
+ "learning_rate": 3.1834695731153495e-05,
+ "loss": 4.2294,
+ "step": 3600
+ },
+ {
+ "epoch": 1.0990009082652135,
+ "grad_norm": 3.0383167266845703,
+ "learning_rate": 3.168331819557978e-05,
+ "loss": 4.2759,
+ "step": 3630
+ },
+ {
+ "epoch": 1.1080835603996366,
+ "grad_norm": 3.3026745319366455,
+ "learning_rate": 3.153194066000605e-05,
+ "loss": 4.3093,
+ "step": 3660
+ },
+ {
+ "epoch": 1.11716621253406,
+ "grad_norm": 2.954747200012207,
+ "learning_rate": 3.138056312443234e-05,
+ "loss": 4.2476,
+ "step": 3690
+ },
+ {
+ "epoch": 1.1262488646684832,
+ "grad_norm": 3.2137765884399414,
+ "learning_rate": 3.1229185588858614e-05,
+ "loss": 4.2858,
+ "step": 3720
+ },
+ {
+ "epoch": 1.1353315168029066,
+ "grad_norm": 3.4028799533843994,
+ "learning_rate": 3.1077808053284896e-05,
+ "loss": 4.3652,
+ "step": 3750
+ },
+ {
+ "epoch": 1.1444141689373297,
+ "grad_norm": 3.0039563179016113,
+ "learning_rate": 3.092643051771118e-05,
+ "loss": 4.4106,
+ "step": 3780
+ },
+ {
+ "epoch": 1.1534968210717529,
+ "grad_norm": 2.973820209503174,
+ "learning_rate": 3.077505298213745e-05,
+ "loss": 4.1827,
+ "step": 3810
+ },
+ {
+ "epoch": 1.1625794732061763,
+ "grad_norm": 2.99037766456604,
+ "learning_rate": 3.062367544656373e-05,
+ "loss": 4.3092,
+ "step": 3840
+ },
+ {
+ "epoch": 1.1716621253405994,
+ "grad_norm": 3.181398391723633,
+ "learning_rate": 3.047229791099001e-05,
+ "loss": 4.417,
+ "step": 3870
+ },
+ {
+ "epoch": 1.1807447774750228,
+ "grad_norm": 3.1933484077453613,
+ "learning_rate": 3.032092037541629e-05,
+ "loss": 4.2361,
+ "step": 3900
+ },
+ {
+ "epoch": 1.189827429609446,
+ "grad_norm": 3.4427855014801025,
+ "learning_rate": 3.0169542839842567e-05,
+ "loss": 4.2687,
+ "step": 3930
+ },
+ {
+ "epoch": 1.1989100817438691,
+ "grad_norm": 3.0683298110961914,
+ "learning_rate": 3.001816530426885e-05,
+ "loss": 4.2748,
+ "step": 3960
+ },
+ {
+ "epoch": 1.2079927338782925,
+ "grad_norm": 3.044698715209961,
+ "learning_rate": 2.9866787768695127e-05,
+ "loss": 4.2671,
+ "step": 3990
+ },
+ {
+ "epoch": 1.2170753860127157,
+ "grad_norm": 3.1354904174804688,
+ "learning_rate": 2.9715410233121405e-05,
+ "loss": 4.2635,
+ "step": 4020
+ },
+ {
+ "epoch": 1.226158038147139,
+ "grad_norm": 3.282745361328125,
+ "learning_rate": 2.9564032697547683e-05,
+ "loss": 4.3544,
+ "step": 4050
+ },
+ {
+ "epoch": 1.2352406902815622,
+ "grad_norm": 3.369798183441162,
+ "learning_rate": 2.941265516197396e-05,
+ "loss": 4.1993,
+ "step": 4080
+ },
+ {
+ "epoch": 1.2443233424159854,
+ "grad_norm": 3.395785331726074,
+ "learning_rate": 2.9261277626400242e-05,
+ "loss": 4.1131,
+ "step": 4110
+ },
+ {
+ "epoch": 1.2534059945504088,
+ "grad_norm": 3.500697135925293,
+ "learning_rate": 2.9109900090826524e-05,
+ "loss": 4.192,
+ "step": 4140
+ },
+ {
+ "epoch": 1.262488646684832,
+ "grad_norm": 2.94278621673584,
+ "learning_rate": 2.8958522555252805e-05,
+ "loss": 4.2863,
+ "step": 4170
+ },
+ {
+ "epoch": 1.2715712988192553,
+ "grad_norm": 3.3217315673828125,
+ "learning_rate": 2.8807145019679083e-05,
+ "loss": 4.1763,
+ "step": 4200
+ },
+ {
+ "epoch": 1.2806539509536785,
+ "grad_norm": 3.232830762863159,
+ "learning_rate": 2.865576748410536e-05,
+ "loss": 4.2595,
+ "step": 4230
+ },
+ {
+ "epoch": 1.2897366030881017,
+ "grad_norm": 3.3042378425598145,
+ "learning_rate": 2.850438994853164e-05,
+ "loss": 4.2393,
+ "step": 4260
+ },
+ {
+ "epoch": 1.298819255222525,
+ "grad_norm": 3.83151912689209,
+ "learning_rate": 2.835301241295792e-05,
+ "loss": 4.3005,
+ "step": 4290
+ },
+ {
+ "epoch": 1.3079019073569482,
+ "grad_norm": 3.245086431503296,
+ "learning_rate": 2.82016348773842e-05,
+ "loss": 4.205,
+ "step": 4320
+ },
+ {
+ "epoch": 1.3169845594913716,
+ "grad_norm": 3.4392285346984863,
+ "learning_rate": 2.8050257341810477e-05,
+ "loss": 4.1964,
+ "step": 4350
+ }
+ ],
+ "logging_steps": 30,
+ "max_steps": 9909,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 3,
+ "save_steps": 30,
+ "total_flos": 1136555016192000.0,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+ }
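trainer_state.json is the Trainer's own log of this run: loss falls from 8.72 at step 30 to about 4.20 by step 4350 (epoch 1.32 of 3, max_steps 9909) under a linearly decaying learning rate, logged every 30 steps. A sketch for pulling the curve back out of the file, assuming it is local:

```python
# Extract the loss curve logged above from trainer_state.json.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    print(f"step {entry['step']:>5}  loss {entry['loss']:.4f}  "
          f"lr {entry['learning_rate']:.2e}")
```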
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51073e9acaf31db2dd04a38614739acc9b8272913419c8770c1a7b85bb4facb3
+ size 4920
vocab.json ADDED
The diff for this file is too large to render. See raw diff