Upload 11 files

Browse files

Files changed (11) hide show

config.json +31 -0
generation_config.json +7 -0
model.safetensors +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +24 -0
tokenizer.json +0 -0
tokenizer_config.json +81 -0
trainer_state.json +1433 -0
training_args.bin +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "_name_or_path": "opt-350m",
+  "_remove_final_layer_norm": false,
+  "activation_dropout": 0.0,
+  "activation_function": "relu",
+  "architectures": [
+    "OPTForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 2,
+  "do_layer_norm_before": false,
+  "dropout": 0.1,
+  "enable_bias": true,
+  "eos_token_id": 2,
+  "ffn_dim": 4096,
+  "hidden_size": 1024,
+  "init_std": 0.02,
+  "layer_norm_elementwise_affine": true,
+  "layerdrop": 0.0,
+  "max_position_embeddings": 1024,
+  "model_type": "opt",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "pad_token_id": 1,
+  "prefix": "</s>",
+  "torch_dtype": "float32",
+  "transformers_version": "4.42.4",
+  "use_cache": true,
+  "vocab_size": 46336,
+  "word_embed_proj_dim": 512
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 2,
+  "eos_token_id": 2,
+  "pad_token_id": 1,
+  "transformers_version": "4.42.4"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2b0ccadb04bccada14a3fc0ccd3d5f3814c53bc945670a8982a20c385ac795
+size 1312575624

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:de8914dcd73f007ad7a3ac7da15a66d4ef7eb5e4462c7688e39ffe2f1d227ca5
+size 2625385566

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d304891207a541ed4fa42ff5b57c1ff31488f6e09c0af457af0f4cad3215934b
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f0fbc8b0570d3fe9a0f03c82d4b4cdc650cfc0d5b5cfc547f4c98ab4966663d0
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<unk>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,81 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "46331": {
+      "content": "<|sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "46332": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "46333": {
+      "content": "<|acc|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "46334": {
+      "content": "<|rrn|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "46335": {
+      "content": "<|tel|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<unk>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1433 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.7414272474513438,
+  "eval_steps": 500,
+  "global_step": 200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0037071362372567192,
+      "grad_norm": 167957.515625,
+      "learning_rate": 0.0003992,
+      "loss": 10.791,
+      "step": 1
+    },
+    {
+      "epoch": 0.0074142724745134385,
+      "grad_norm": 109901.1953125,
+      "learning_rate": 0.00039840000000000003,
+      "loss": 10.4195,
+      "step": 2
+    },
+    {
+      "epoch": 0.011121408711770158,
+      "grad_norm": 101352.265625,
+      "learning_rate": 0.0003976,
+      "loss": 10.0242,
+      "step": 3
+    },
+    {
+      "epoch": 0.014828544949026877,
+      "grad_norm": 105144.921875,
+      "learning_rate": 0.0003968,
+      "loss": 9.5684,
+      "step": 4
+    },
+    {
+      "epoch": 0.018535681186283594,
+      "grad_norm": 107496.6640625,
+      "learning_rate": 0.00039600000000000003,
+      "loss": 9.1775,
+      "step": 5
+    },
+    {
+      "epoch": 0.022242817423540315,
+      "grad_norm": 99089.9453125,
+      "learning_rate": 0.0003952,
+      "loss": 8.9319,
+      "step": 6
+    },
+    {
+      "epoch": 0.025949953660797033,
+      "grad_norm": 91537.4765625,
+      "learning_rate": 0.0003944,
+      "loss": 8.4907,
+      "step": 7
+    },
+    {
+      "epoch": 0.029657089898053754,
+      "grad_norm": 113517.7265625,
+      "learning_rate": 0.0003936,
+      "loss": 8.1206,
+      "step": 8
+    },
+    {
+      "epoch": 0.033364226135310475,
+      "grad_norm": 120697.0546875,
+      "learning_rate": 0.0003928,
+      "loss": 8.0377,
+      "step": 9
+    },
+    {
+      "epoch": 0.03707136237256719,
+      "grad_norm": 115909.0546875,
+      "learning_rate": 0.000392,
+      "loss": 7.9754,
+      "step": 10
+    },
+    {
+      "epoch": 0.04077849860982391,
+      "grad_norm": 116857.3359375,
+      "learning_rate": 0.0003912,
+      "loss": 7.8788,
+      "step": 11
+    },
+    {
+      "epoch": 0.04448563484708063,
+      "grad_norm": 101768.7109375,
+      "learning_rate": 0.0003904,
+      "loss": 7.8914,
+      "step": 12
+    },
+    {
+      "epoch": 0.04819277108433735,
+      "grad_norm": 101978.6015625,
+      "learning_rate": 0.0003896,
+      "loss": 7.8178,
+      "step": 13
+    },
+    {
+      "epoch": 0.051899907321594066,
+      "grad_norm": 90827.578125,
+      "learning_rate": 0.0003888,
+      "loss": 7.85,
+      "step": 14
+    },
+    {
+      "epoch": 0.05560704355885079,
+      "grad_norm": 82672.1640625,
+      "learning_rate": 0.000388,
+      "loss": 7.8152,
+      "step": 15
+    },
+    {
+      "epoch": 0.05931417979610751,
+      "grad_norm": 65482.09375,
+      "learning_rate": 0.00038720000000000003,
+      "loss": 7.8131,
+      "step": 16
+    },
+    {
+      "epoch": 0.06302131603336422,
+      "grad_norm": 55323.29296875,
+      "learning_rate": 0.0003864,
+      "loss": 7.6994,
+      "step": 17
+    },
+    {
+      "epoch": 0.06672845227062095,
+      "grad_norm": 94588.7109375,
+      "learning_rate": 0.0003856,
+      "loss": 7.8545,
+      "step": 18
+    },
+    {
+      "epoch": 0.07043558850787766,
+      "grad_norm": 50202.546875,
+      "learning_rate": 0.00038480000000000003,
+      "loss": 7.75,
+      "step": 19
+    },
+    {
+      "epoch": 0.07414272474513438,
+      "grad_norm": 48727.80859375,
+      "learning_rate": 0.000384,
+      "loss": 7.7449,
+      "step": 20
+    },
+    {
+      "epoch": 0.0778498609823911,
+      "grad_norm": 53795.23046875,
+      "learning_rate": 0.0003832,
+      "loss": 7.702,
+      "step": 21
+    },
+    {
+      "epoch": 0.08155699721964782,
+      "grad_norm": 55052.234375,
+      "learning_rate": 0.0003824,
+      "loss": 7.7048,
+      "step": 22
+    },
+    {
+      "epoch": 0.08526413345690455,
+      "grad_norm": 35977.5625,
+      "learning_rate": 0.0003816,
+      "loss": 7.7986,
+      "step": 23
+    },
+    {
+      "epoch": 0.08897126969416126,
+      "grad_norm": 55099.60546875,
+      "learning_rate": 0.0003808,
+      "loss": 7.8071,
+      "step": 24
+    },
+    {
+      "epoch": 0.09267840593141798,
+      "grad_norm": 34977.36328125,
+      "learning_rate": 0.00038,
+      "loss": 7.8473,
+      "step": 25
+    },
+    {
+      "epoch": 0.0963855421686747,
+      "grad_norm": 35271.6640625,
+      "learning_rate": 0.0003792,
+      "loss": 7.7099,
+      "step": 26
+    },
+    {
+      "epoch": 0.10009267840593142,
+      "grad_norm": 44887.39453125,
+      "learning_rate": 0.0003784,
+      "loss": 7.617,
+      "step": 27
+    },
+    {
+      "epoch": 0.10379981464318813,
+      "grad_norm": 41191.33203125,
+      "learning_rate": 0.0003776,
+      "loss": 7.697,
+      "step": 28
+    },
+    {
+      "epoch": 0.10750695088044486,
+      "grad_norm": 45202.97265625,
+      "learning_rate": 0.0003768,
+      "loss": 7.7264,
+      "step": 29
+    },
+    {
+      "epoch": 0.11121408711770157,
+      "grad_norm": 44944.65234375,
+      "learning_rate": 0.000376,
+      "loss": 7.7159,
+      "step": 30
+    },
+    {
+      "epoch": 0.11492122335495829,
+      "grad_norm": 34502.83203125,
+      "learning_rate": 0.0003752,
+      "loss": 7.7213,
+      "step": 31
+    },
+    {
+      "epoch": 0.11862835959221502,
+      "grad_norm": 38415.63671875,
+      "learning_rate": 0.00037440000000000005,
+      "loss": 7.6674,
+      "step": 32
+    },
+    {
+      "epoch": 0.12233549582947173,
+      "grad_norm": 34140.18359375,
+      "learning_rate": 0.00037360000000000003,
+      "loss": 7.6627,
+      "step": 33
+    },
+    {
+      "epoch": 0.12604263206672844,
+      "grad_norm": 27067.009765625,
+      "learning_rate": 0.00037280000000000006,
+      "loss": 7.6678,
+      "step": 34
+    },
+    {
+      "epoch": 0.12974976830398516,
+      "grad_norm": 34192.23828125,
+      "learning_rate": 0.00037200000000000004,
+      "loss": 7.7438,
+      "step": 35
+    },
+    {
+      "epoch": 0.1334569045412419,
+      "grad_norm": 42940.6953125,
+      "learning_rate": 0.0003712,
+      "loss": 7.6559,
+      "step": 36
+    },
+    {
+      "epoch": 0.1371640407784986,
+      "grad_norm": 28908.26171875,
+      "learning_rate": 0.00037040000000000006,
+      "loss": 7.6763,
+      "step": 37
+    },
+    {
+      "epoch": 0.14087117701575533,
+      "grad_norm": 46989.23046875,
+      "learning_rate": 0.00036960000000000004,
+      "loss": 7.6483,
+      "step": 38
+    },
+    {
+      "epoch": 0.14457831325301204,
+      "grad_norm": 38628.00390625,
+      "learning_rate": 0.0003688,
+      "loss": 7.5813,
+      "step": 39
+    },
+    {
+      "epoch": 0.14828544949026876,
+      "grad_norm": 26901.994140625,
+      "learning_rate": 0.00036800000000000005,
+      "loss": 7.7328,
+      "step": 40
+    },
+    {
+      "epoch": 0.1519925857275255,
+      "grad_norm": 55413.51953125,
+      "learning_rate": 0.00036720000000000004,
+      "loss": 7.5977,
+      "step": 41
+    },
+    {
+      "epoch": 0.1556997219647822,
+      "grad_norm": 38922.68359375,
+      "learning_rate": 0.0003664,
+      "loss": 7.6575,
+      "step": 42
+    },
+    {
+      "epoch": 0.15940685820203893,
+      "grad_norm": 49835.87109375,
+      "learning_rate": 0.00036560000000000005,
+      "loss": 7.7382,
+      "step": 43
+    },
+    {
+      "epoch": 0.16311399443929564,
+      "grad_norm": 41342.8515625,
+      "learning_rate": 0.00036480000000000003,
+      "loss": 7.7068,
+      "step": 44
+    },
+    {
+      "epoch": 0.16682113067655235,
+      "grad_norm": 38896.15625,
+      "learning_rate": 0.000364,
+      "loss": 7.6614,
+      "step": 45
+    },
+    {
+      "epoch": 0.1705282669138091,
+      "grad_norm": 29027.955078125,
+      "learning_rate": 0.00036320000000000005,
+      "loss": 7.728,
+      "step": 46
+    },
+    {
+      "epoch": 0.1742354031510658,
+      "grad_norm": 33758.0859375,
+      "learning_rate": 0.0003624,
+      "loss": 7.7392,
+      "step": 47
+    },
+    {
+      "epoch": 0.17794253938832252,
+      "grad_norm": 29002.869140625,
+      "learning_rate": 0.0003616,
+      "loss": 7.666,
+      "step": 48
+    },
+    {
+      "epoch": 0.18164967562557924,
+      "grad_norm": 33393.12890625,
+      "learning_rate": 0.00036080000000000004,
+      "loss": 7.6067,
+      "step": 49
+    },
+    {
+      "epoch": 0.18535681186283595,
+      "grad_norm": 39193.51171875,
+      "learning_rate": 0.00036,
+      "loss": 7.7868,
+      "step": 50
+    },
+    {
+      "epoch": 0.18906394810009267,
+      "grad_norm": 25982.78125,
+      "learning_rate": 0.0003592,
+      "loss": 7.7189,
+      "step": 51
+    },
+    {
+      "epoch": 0.1927710843373494,
+      "grad_norm": 28694.505859375,
+      "learning_rate": 0.00035840000000000004,
+      "loss": 7.6999,
+      "step": 52
+    },
+    {
+      "epoch": 0.19647822057460612,
+      "grad_norm": 26356.8828125,
+      "learning_rate": 0.0003576,
+      "loss": 7.712,
+      "step": 53
+    },
+    {
+      "epoch": 0.20018535681186284,
+      "grad_norm": 25880.298828125,
+      "learning_rate": 0.0003568,
+      "loss": 7.7015,
+      "step": 54
+    },
+    {
+      "epoch": 0.20389249304911955,
+      "grad_norm": 23557.111328125,
+      "learning_rate": 0.00035600000000000003,
+      "loss": 7.6849,
+      "step": 55
+    },
+    {
+      "epoch": 0.20759962928637626,
+      "grad_norm": 31365.33203125,
+      "learning_rate": 0.0003552,
+      "loss": 7.7333,
+      "step": 56
+    },
+    {
+      "epoch": 0.211306765523633,
+      "grad_norm": 31506.552734375,
+      "learning_rate": 0.0003544,
+      "loss": 7.7317,
+      "step": 57
+    },
+    {
+      "epoch": 0.21501390176088972,
+      "grad_norm": 22261.244140625,
+      "learning_rate": 0.00035360000000000003,
+      "loss": 7.6978,
+      "step": 58
+    },
+    {
+      "epoch": 0.21872103799814643,
+      "grad_norm": 36267.4921875,
+      "learning_rate": 0.0003528,
+      "loss": 7.7125,
+      "step": 59
+    },
+    {
+      "epoch": 0.22242817423540315,
+      "grad_norm": 29624.087890625,
+      "learning_rate": 0.00035200000000000005,
+      "loss": 7.734,
+      "step": 60
+    },
+    {
+      "epoch": 0.22613531047265986,
+      "grad_norm": 25301.228515625,
+      "learning_rate": 0.0003512,
+      "loss": 7.7287,
+      "step": 61
+    },
+    {
+      "epoch": 0.22984244670991658,
+      "grad_norm": 26147.228515625,
+      "learning_rate": 0.0003504,
+      "loss": 7.7059,
+      "step": 62
+    },
+    {
+      "epoch": 0.23354958294717332,
+      "grad_norm": 27329.443359375,
+      "learning_rate": 0.00034960000000000004,
+      "loss": 7.6798,
+      "step": 63
+    },
+    {
+      "epoch": 0.23725671918443003,
+      "grad_norm": 23415.9609375,
+      "learning_rate": 0.0003488,
+      "loss": 7.6968,
+      "step": 64
+    },
+    {
+      "epoch": 0.24096385542168675,
+      "grad_norm": 23625.1171875,
+      "learning_rate": 0.000348,
+      "loss": 7.7119,
+      "step": 65
+    },
+    {
+      "epoch": 0.24467099165894346,
+      "grad_norm": 23805.42578125,
+      "learning_rate": 0.00034720000000000004,
+      "loss": 7.6473,
+      "step": 66
+    },
+    {
+      "epoch": 0.24837812789620017,
+      "grad_norm": 47364.8203125,
+      "learning_rate": 0.0003464,
+      "loss": 7.7921,
+      "step": 67
+    },
+    {
+      "epoch": 0.2520852641334569,
+      "grad_norm": 29178.279296875,
+      "learning_rate": 0.0003456,
+      "loss": 7.6958,
+      "step": 68
+    },
+    {
+      "epoch": 0.2557924003707136,
+      "grad_norm": 26202.958984375,
+      "learning_rate": 0.00034480000000000003,
+      "loss": 7.7765,
+      "step": 69
+    },
+    {
+      "epoch": 0.2594995366079703,
+      "grad_norm": 48753.58203125,
+      "learning_rate": 0.000344,
+      "loss": 7.6496,
+      "step": 70
+    },
+    {
+      "epoch": 0.2632066728452271,
+      "grad_norm": 24508.509765625,
+      "learning_rate": 0.0003432,
+      "loss": 7.7125,
+      "step": 71
+    },
+    {
+      "epoch": 0.2669138090824838,
+      "grad_norm": 33996.55078125,
+      "learning_rate": 0.00034240000000000003,
+      "loss": 7.6635,
+      "step": 72
+    },
+    {
+      "epoch": 0.2706209453197405,
+      "grad_norm": 32989.36328125,
+      "learning_rate": 0.0003416,
+      "loss": 7.6893,
+      "step": 73
+    },
+    {
+      "epoch": 0.2743280815569972,
+      "grad_norm": 32296.1796875,
+      "learning_rate": 0.0003408,
+      "loss": 7.6696,
+      "step": 74
+    },
+    {
+      "epoch": 0.27803521779425394,
+      "grad_norm": 35698.16015625,
+      "learning_rate": 0.00034,
+      "loss": 7.6713,
+      "step": 75
+    },
+    {
+      "epoch": 0.28174235403151066,
+      "grad_norm": 25034.283203125,
+      "learning_rate": 0.0003392,
+      "loss": 7.6629,
+      "step": 76
+    },
+    {
+      "epoch": 0.28544949026876737,
+      "grad_norm": 36568.65625,
+      "learning_rate": 0.0003384,
+      "loss": 7.7075,
+      "step": 77
+    },
+    {
+      "epoch": 0.2891566265060241,
+      "grad_norm": 25048.875,
+      "learning_rate": 0.0003376,
+      "loss": 7.6727,
+      "step": 78
+    },
+    {
+      "epoch": 0.2928637627432808,
+      "grad_norm": 25438.61328125,
+      "learning_rate": 0.0003368,
+      "loss": 7.7028,
+      "step": 79
+    },
+    {
+      "epoch": 0.2965708989805375,
+      "grad_norm": 27428.9453125,
+      "learning_rate": 0.000336,
+      "loss": 7.6516,
+      "step": 80
+    },
+    {
+      "epoch": 0.3002780352177943,
+      "grad_norm": 32185.8125,
+      "learning_rate": 0.0003352,
+      "loss": 7.7127,
+      "step": 81
+    },
+    {
+      "epoch": 0.303985171455051,
+      "grad_norm": 28342.439453125,
+      "learning_rate": 0.0003344,
+      "loss": 7.6461,
+      "step": 82
+    },
+    {
+      "epoch": 0.3076923076923077,
+      "grad_norm": 22977.4140625,
+      "learning_rate": 0.0003336,
+      "loss": 7.6348,
+      "step": 83
+    },
+    {
+      "epoch": 0.3113994439295644,
+      "grad_norm": 28778.767578125,
+      "learning_rate": 0.0003328,
+      "loss": 7.6299,
+      "step": 84
+    },
+    {
+      "epoch": 0.31510658016682114,
+      "grad_norm": 21658.966796875,
+      "learning_rate": 0.000332,
+      "loss": 7.633,
+      "step": 85
+    },
+    {
+      "epoch": 0.31881371640407785,
+      "grad_norm": 22994.66796875,
+      "learning_rate": 0.0003312,
+      "loss": 7.648,
+      "step": 86
+    },
+    {
+      "epoch": 0.32252085264133457,
+      "grad_norm": 23064.05078125,
+      "learning_rate": 0.0003304,
+      "loss": 7.712,
+      "step": 87
+    },
+    {
+      "epoch": 0.3262279888785913,
+      "grad_norm": 34689.19140625,
+      "learning_rate": 0.0003296,
+      "loss": 7.6168,
+      "step": 88
+    },
+    {
+      "epoch": 0.329935125115848,
+      "grad_norm": 26677.1328125,
+      "learning_rate": 0.0003288,
+      "loss": 7.6226,
+      "step": 89
+    },
+    {
+      "epoch": 0.3336422613531047,
+      "grad_norm": 39699.62109375,
+      "learning_rate": 0.000328,
+      "loss": 7.6465,
+      "step": 90
+    },
+    {
+      "epoch": 0.3373493975903614,
+      "grad_norm": 47106.6640625,
+      "learning_rate": 0.0003272,
+      "loss": 7.6884,
+      "step": 91
+    },
+    {
+      "epoch": 0.3410565338276182,
+      "grad_norm": 30162.638671875,
+      "learning_rate": 0.0003264,
+      "loss": 7.7695,
+      "step": 92
+    },
+    {
+      "epoch": 0.3447636700648749,
+      "grad_norm": 40879.01953125,
+      "learning_rate": 0.0003256,
+      "loss": 7.7253,
+      "step": 93
+    },
+    {
+      "epoch": 0.3484708063021316,
+      "grad_norm": 56518.4921875,
+      "learning_rate": 0.00032480000000000003,
+      "loss": 7.6734,
+      "step": 94
+    },
+    {
+      "epoch": 0.35217794253938833,
+      "grad_norm": 37450.08203125,
+      "learning_rate": 0.000324,
+      "loss": 7.6897,
+      "step": 95
+    },
+    {
+      "epoch": 0.35588507877664505,
+      "grad_norm": 28603.978515625,
+      "learning_rate": 0.00032320000000000005,
+      "loss": 7.7346,
+      "step": 96
+    },
+    {
+      "epoch": 0.35959221501390176,
+      "grad_norm": 45344.12109375,
+      "learning_rate": 0.00032240000000000003,
+      "loss": 7.7564,
+      "step": 97
+    },
+    {
+      "epoch": 0.3632993512511585,
+      "grad_norm": 20206.189453125,
+      "learning_rate": 0.0003216,
+      "loss": 7.6465,
+      "step": 98
+    },
+    {
+      "epoch": 0.3670064874884152,
+      "grad_norm": 29952.62890625,
+      "learning_rate": 0.00032080000000000005,
+      "loss": 7.6581,
+      "step": 99
+    },
+    {
+      "epoch": 0.3707136237256719,
+      "grad_norm": 24017.02734375,
+      "learning_rate": 0.00032,
+      "loss": 7.7068,
+      "step": 100
+    },
+    {
+      "epoch": 0.3744207599629286,
+      "grad_norm": 21995.66796875,
+      "learning_rate": 0.0003192,
+      "loss": 7.7306,
+      "step": 101
+    },
+    {
+      "epoch": 0.37812789620018533,
+      "grad_norm": 22698.15625,
+      "learning_rate": 0.00031840000000000004,
+      "loss": 7.6167,
+      "step": 102
+    },
+    {
+      "epoch": 0.3818350324374421,
+      "grad_norm": 19390.587890625,
+      "learning_rate": 0.0003176,
+      "loss": 7.6298,
+      "step": 103
+    },
+    {
+      "epoch": 0.3855421686746988,
+      "grad_norm": 23548.39453125,
+      "learning_rate": 0.00031680000000000006,
+      "loss": 7.7148,
+      "step": 104
+    },
+    {
+      "epoch": 0.38924930491195553,
+      "grad_norm": 25070.564453125,
+      "learning_rate": 0.00031600000000000004,
+      "loss": 7.8045,
+      "step": 105
+    },
+    {
+      "epoch": 0.39295644114921224,
+      "grad_norm": 39852.94921875,
+      "learning_rate": 0.0003152,
+      "loss": 7.6813,
+      "step": 106
+    },
+    {
+      "epoch": 0.39666357738646896,
+      "grad_norm": 30994.017578125,
+      "learning_rate": 0.00031440000000000005,
+      "loss": 7.6801,
+      "step": 107
+    },
+    {
+      "epoch": 0.40037071362372567,
+      "grad_norm": 35010.94140625,
+      "learning_rate": 0.00031360000000000003,
+      "loss": 7.7625,
+      "step": 108
+    },
+    {
+      "epoch": 0.4040778498609824,
+      "grad_norm": 32364.001953125,
+      "learning_rate": 0.0003128,
+      "loss": 7.682,
+      "step": 109
+    },
+    {
+      "epoch": 0.4077849860982391,
+      "grad_norm": 24475.48828125,
+      "learning_rate": 0.00031200000000000005,
+      "loss": 7.6953,
+      "step": 110
+    },
+    {
+      "epoch": 0.4114921223354958,
+      "grad_norm": 28467.2890625,
+      "learning_rate": 0.00031120000000000003,
+      "loss": 7.7112,
+      "step": 111
+    },
+    {
+      "epoch": 0.4151992585727525,
+      "grad_norm": 46241.89453125,
+      "learning_rate": 0.0003104,
+      "loss": 7.625,
+      "step": 112
+    },
+    {
+      "epoch": 0.41890639481000924,
+      "grad_norm": 25736.814453125,
+      "learning_rate": 0.00030960000000000004,
+      "loss": 7.6842,
+      "step": 113
+    },
+    {
+      "epoch": 0.422613531047266,
+      "grad_norm": 25479.744140625,
+      "learning_rate": 0.0003088,
+      "loss": 7.7131,
+      "step": 114
+    },
+    {
+      "epoch": 0.4263206672845227,
+      "grad_norm": 32374.447265625,
+      "learning_rate": 0.000308,
+      "loss": 7.7209,
+      "step": 115
+    },
+    {
+      "epoch": 0.43002780352177944,
+      "grad_norm": 21930.126953125,
+      "learning_rate": 0.00030720000000000004,
+      "loss": 7.6593,
+      "step": 116
+    },
+    {
+      "epoch": 0.43373493975903615,
+      "grad_norm": 22632.013671875,
+      "learning_rate": 0.0003064,
+      "loss": 7.7121,
+      "step": 117
+    },
+    {
+      "epoch": 0.43744207599629287,
+      "grad_norm": 21551.6328125,
+      "learning_rate": 0.0003056,
+      "loss": 7.6504,
+      "step": 118
+    },
+    {
+      "epoch": 0.4411492122335496,
+      "grad_norm": 24234.326171875,
+      "learning_rate": 0.00030480000000000004,
+      "loss": 7.7,
+      "step": 119
+    },
+    {
+      "epoch": 0.4448563484708063,
+      "grad_norm": 27236.205078125,
+      "learning_rate": 0.000304,
+      "loss": 7.7073,
+      "step": 120
+    },
+    {
+      "epoch": 0.448563484708063,
+      "grad_norm": 20109.84765625,
+      "learning_rate": 0.0003032,
+      "loss": 7.642,
+      "step": 121
+    },
+    {
+      "epoch": 0.4522706209453197,
+      "grad_norm": 20982.546875,
+      "learning_rate": 0.00030240000000000003,
+      "loss": 7.7092,
+      "step": 122
+    },
+    {
+      "epoch": 0.45597775718257644,
+      "grad_norm": 30563.40625,
+      "learning_rate": 0.0003016,
+      "loss": 7.6086,
+      "step": 123
+    },
+    {
+      "epoch": 0.45968489341983315,
+      "grad_norm": 26537.8828125,
+      "learning_rate": 0.0003008,
+      "loss": 7.711,
+      "step": 124
+    },
+    {
+      "epoch": 0.4633920296570899,
+      "grad_norm": 26180.9765625,
+      "learning_rate": 0.00030000000000000003,
+      "loss": 7.6946,
+      "step": 125
+    },
+    {
+      "epoch": 0.46709916589434664,
+      "grad_norm": 25894.8828125,
+      "learning_rate": 0.0002992,
+      "loss": 7.6252,
+      "step": 126
+    },
+    {
+      "epoch": 0.47080630213160335,
+      "grad_norm": 17775.234375,
+      "learning_rate": 0.0002984,
+      "loss": 7.7064,
+      "step": 127
+    },
+    {
+      "epoch": 0.47451343836886006,
+      "grad_norm": 23387.5625,
+      "learning_rate": 0.0002976,
+      "loss": 7.606,
+      "step": 128
+    },
+    {
+      "epoch": 0.4782205746061168,
+      "grad_norm": 26294.63671875,
+      "learning_rate": 0.0002968,
+      "loss": 7.6753,
+      "step": 129
+    },
+    {
+      "epoch": 0.4819277108433735,
+      "grad_norm": 22350.404296875,
+      "learning_rate": 0.000296,
+      "loss": 7.6926,
+      "step": 130
+    },
+    {
+      "epoch": 0.4856348470806302,
+      "grad_norm": 23048.61328125,
+      "learning_rate": 0.0002952,
+      "loss": 7.6476,
+      "step": 131
+    },
+    {
+      "epoch": 0.4893419833178869,
+      "grad_norm": 26630.447265625,
+      "learning_rate": 0.0002944,
+      "loss": 7.7831,
+      "step": 132
+    },
+    {
+      "epoch": 0.49304911955514363,
+      "grad_norm": 34660.65234375,
+      "learning_rate": 0.00029360000000000003,
+      "loss": 7.5954,
+      "step": 133
+    },
+    {
+      "epoch": 0.49675625579240035,
+      "grad_norm": 19611.568359375,
+      "learning_rate": 0.0002928,
+      "loss": 7.6305,
+      "step": 134
+    },
+    {
+      "epoch": 0.5004633920296571,
+      "grad_norm": 38032.05078125,
+      "learning_rate": 0.000292,
+      "loss": 7.725,
+      "step": 135
+    },
+    {
+      "epoch": 0.5041705282669138,
+      "grad_norm": 26124.802734375,
+      "learning_rate": 0.00029120000000000003,
+      "loss": 7.6547,
+      "step": 136
+    },
+    {
+      "epoch": 0.5078776645041705,
+      "grad_norm": 22567.94921875,
+      "learning_rate": 0.0002904,
+      "loss": 7.7534,
+      "step": 137
+    },
+    {
+      "epoch": 0.5115848007414272,
+      "grad_norm": 37485.49609375,
+      "learning_rate": 0.0002896,
+      "loss": 7.6795,
+      "step": 138
+    },
+    {
+      "epoch": 0.5152919369786839,
+      "grad_norm": 32182.43359375,
+      "learning_rate": 0.0002888,
+      "loss": 7.7417,
+      "step": 139
+    },
+    {
+      "epoch": 0.5189990732159406,
+      "grad_norm": 24093.3125,
+      "learning_rate": 0.000288,
+      "loss": 7.6875,
+      "step": 140
+    },
+    {
+      "epoch": 0.5227062094531975,
+      "grad_norm": 23480.59765625,
+      "learning_rate": 0.0002872,
+      "loss": 7.6571,
+      "step": 141
+    },
+    {
+      "epoch": 0.5264133456904542,
+      "grad_norm": 34477.796875,
+      "learning_rate": 0.0002864,
+      "loss": 7.6389,
+      "step": 142
+    },
+    {
+      "epoch": 0.5301204819277109,
+      "grad_norm": 32023.896484375,
+      "learning_rate": 0.0002856,
+      "loss": 7.7501,
+      "step": 143
+    },
+    {
+      "epoch": 0.5338276181649676,
+      "grad_norm": 21589.513671875,
+      "learning_rate": 0.0002848,
+      "loss": 7.6895,
+      "step": 144
+    },
+    {
+      "epoch": 0.5375347544022243,
+      "grad_norm": 31786.94921875,
+      "learning_rate": 0.000284,
+      "loss": 7.7106,
+      "step": 145
+    },
+    {
+      "epoch": 0.541241890639481,
+      "grad_norm": 31673.8359375,
+      "learning_rate": 0.0002832,
+      "loss": 7.6815,
+      "step": 146
+    },
+    {
+      "epoch": 0.5449490268767377,
+      "grad_norm": 17670.734375,
+      "learning_rate": 0.0002824,
+      "loss": 7.6869,
+      "step": 147
+    },
+    {
+      "epoch": 0.5486561631139945,
+      "grad_norm": 34063.0703125,
+      "learning_rate": 0.0002816,
+      "loss": 7.7108,
+      "step": 148
+    },
+    {
+      "epoch": 0.5523632993512512,
+      "grad_norm": 36702.2734375,
+      "learning_rate": 0.0002808,
+      "loss": 7.7124,
+      "step": 149
+    },
+    {
+      "epoch": 0.5560704355885079,
+      "grad_norm": 22709.572265625,
+      "learning_rate": 0.00028,
+      "loss": 7.7326,
+      "step": 150
+    },
+    {
+      "epoch": 0.5597775718257646,
+      "grad_norm": 36804.21484375,
+      "learning_rate": 0.0002792,
+      "loss": 7.6414,
+      "step": 151
+    },
+    {
+      "epoch": 0.5634847080630213,
+      "grad_norm": 30339.912109375,
+      "learning_rate": 0.0002784,
+      "loss": 7.7337,
+      "step": 152
+    },
+    {
+      "epoch": 0.567191844300278,
+      "grad_norm": 31866.80859375,
+      "learning_rate": 0.00027759999999999997,
+      "loss": 7.6208,
+      "step": 153
+    },
+    {
+      "epoch": 0.5708989805375347,
+      "grad_norm": 23864.302734375,
+      "learning_rate": 0.0002768,
+      "loss": 7.7083,
+      "step": 154
+    },
+    {
+      "epoch": 0.5746061167747915,
+      "grad_norm": 29230.330078125,
+      "learning_rate": 0.000276,
+      "loss": 7.6914,
+      "step": 155
+    },
+    {
+      "epoch": 0.5783132530120482,
+      "grad_norm": 21988.8046875,
+      "learning_rate": 0.00027519999999999997,
+      "loss": 7.7157,
+      "step": 156
+    },
+    {
+      "epoch": 0.5820203892493049,
+      "grad_norm": 21070.361328125,
+      "learning_rate": 0.00027440000000000006,
+      "loss": 7.6987,
+      "step": 157
+    },
+    {
+      "epoch": 0.5857275254865616,
+      "grad_norm": 39177.30859375,
+      "learning_rate": 0.00027360000000000004,
+      "loss": 7.5922,
+      "step": 158
+    },
+    {
+      "epoch": 0.5894346617238183,
+      "grad_norm": 20961.755859375,
+      "learning_rate": 0.0002728,
+      "loss": 7.7621,
+      "step": 159
+    },
+    {
+      "epoch": 0.593141797961075,
+      "grad_norm": 24547.12890625,
+      "learning_rate": 0.00027200000000000005,
+      "loss": 7.7387,
+      "step": 160
+    },
+    {
+      "epoch": 0.5968489341983317,
+      "grad_norm": 17789.8125,
+      "learning_rate": 0.00027120000000000003,
+      "loss": 7.6818,
+      "step": 161
+    },
+    {
+      "epoch": 0.6005560704355886,
+      "grad_norm": 21633.90625,
+      "learning_rate": 0.0002704,
+      "loss": 7.6545,
+      "step": 162
+    },
+    {
+      "epoch": 0.6042632066728453,
+      "grad_norm": 17543.3046875,
+      "learning_rate": 0.00026960000000000005,
+      "loss": 7.6662,
+      "step": 163
+    },
+    {
+      "epoch": 0.607970342910102,
+      "grad_norm": 18747.458984375,
+      "learning_rate": 0.00026880000000000003,
+      "loss": 7.6227,
+      "step": 164
+    },
+    {
+      "epoch": 0.6116774791473587,
+      "grad_norm": 22172.224609375,
+      "learning_rate": 0.000268,
+      "loss": 7.6899,
+      "step": 165
+    },
+    {
+      "epoch": 0.6153846153846154,
+      "grad_norm": 19154.330078125,
+      "learning_rate": 0.00026720000000000004,
+      "loss": 7.6195,
+      "step": 166
+    },
+    {
+      "epoch": 0.6190917516218721,
+      "grad_norm": 20868.43359375,
+      "learning_rate": 0.0002664,
+      "loss": 7.677,
+      "step": 167
+    },
+    {
+      "epoch": 0.6227988878591288,
+      "grad_norm": 18564.533203125,
+      "learning_rate": 0.0002656,
+      "loss": 7.696,
+      "step": 168
+    },
+    {
+      "epoch": 0.6265060240963856,
+      "grad_norm": 22970.892578125,
+      "learning_rate": 0.00026480000000000004,
+      "loss": 7.6589,
+      "step": 169
+    },
+    {
+      "epoch": 0.6302131603336423,
+      "grad_norm": 18157.03515625,
+      "learning_rate": 0.000264,
+      "loss": 7.7017,
+      "step": 170
+    },
+    {
+      "epoch": 0.633920296570899,
+      "grad_norm": 20085.443359375,
+      "learning_rate": 0.0002632,
+      "loss": 7.7293,
+      "step": 171
+    },
+    {
+      "epoch": 0.6376274328081557,
+      "grad_norm": 26864.5390625,
+      "learning_rate": 0.00026240000000000004,
+      "loss": 7.5853,
+      "step": 172
+    },
+    {
+      "epoch": 0.6413345690454124,
+      "grad_norm": 21249.70703125,
+      "learning_rate": 0.0002616,
+      "loss": 7.7276,
+      "step": 173
+    },
+    {
+      "epoch": 0.6450417052826691,
+      "grad_norm": 17884.49609375,
+      "learning_rate": 0.0002608,
+      "loss": 7.7034,
+      "step": 174
+    },
+    {
+      "epoch": 0.6487488415199258,
+      "grad_norm": 19097.380859375,
+      "learning_rate": 0.00026000000000000003,
+      "loss": 7.7472,
+      "step": 175
+    },
+    {
+      "epoch": 0.6524559777571826,
+      "grad_norm": 21432.216796875,
+      "learning_rate": 0.0002592,
+      "loss": 7.7052,
+      "step": 176
+    },
+    {
+      "epoch": 0.6561631139944393,
+      "grad_norm": 17022.677734375,
+      "learning_rate": 0.00025840000000000005,
+      "loss": 7.7127,
+      "step": 177
+    },
+    {
+      "epoch": 0.659870250231696,
+      "grad_norm": 21216.20703125,
+      "learning_rate": 0.00025760000000000003,
+      "loss": 7.5911,
+      "step": 178
+    },
+    {
+      "epoch": 0.6635773864689527,
+      "grad_norm": 21638.240234375,
+      "learning_rate": 0.0002568,
+      "loss": 7.5969,
+      "step": 179
+    },
+    {
+      "epoch": 0.6672845227062094,
+      "grad_norm": 27894.361328125,
+      "learning_rate": 0.00025600000000000004,
+      "loss": 7.6331,
+      "step": 180
+    },
+    {
+      "epoch": 0.6709916589434661,
+      "grad_norm": 21034.33984375,
+      "learning_rate": 0.0002552,
+      "loss": 7.6371,
+      "step": 181
+    },
+    {
+      "epoch": 0.6746987951807228,
+      "grad_norm": 25746.513671875,
+      "learning_rate": 0.0002544,
+      "loss": 7.6462,
+      "step": 182
+    },
+    {
+      "epoch": 0.6784059314179796,
+      "grad_norm": 23690.24609375,
+      "learning_rate": 0.00025360000000000004,
+      "loss": 7.6662,
+      "step": 183
+    },
+    {
+      "epoch": 0.6821130676552364,
+      "grad_norm": 19138.052734375,
+      "learning_rate": 0.0002528,
+      "loss": 7.74,
+      "step": 184
+    },
+    {
+      "epoch": 0.6858202038924931,
+      "grad_norm": 20391.046875,
+      "learning_rate": 0.000252,
+      "loss": 7.7163,
+      "step": 185
+    },
+    {
+      "epoch": 0.6895273401297498,
+      "grad_norm": 17356.830078125,
+      "learning_rate": 0.00025120000000000003,
+      "loss": 7.6277,
+      "step": 186
+    },
+    {
+      "epoch": 0.6932344763670065,
+      "grad_norm": 27145.943359375,
+      "learning_rate": 0.0002504,
+      "loss": 7.7351,
+      "step": 187
+    },
+    {
+      "epoch": 0.6969416126042632,
+      "grad_norm": 18061.5703125,
+      "learning_rate": 0.0002496,
+      "loss": 7.6895,
+      "step": 188
+    },
+    {
+      "epoch": 0.70064874884152,
+      "grad_norm": 17943.388671875,
+      "learning_rate": 0.00024880000000000003,
+      "loss": 7.7073,
+      "step": 189
+    },
+    {
+      "epoch": 0.7043558850787767,
+      "grad_norm": 19911.068359375,
+      "learning_rate": 0.000248,
+      "loss": 7.7247,
+      "step": 190
+    },
+    {
+      "epoch": 0.7080630213160334,
+      "grad_norm": 23313.1328125,
+      "learning_rate": 0.0002472,
+      "loss": 7.6459,
+      "step": 191
+    },
+    {
+      "epoch": 0.7117701575532901,
+      "grad_norm": 18374.34375,
+      "learning_rate": 0.0002464,
+      "loss": 7.6853,
+      "step": 192
+    },
+    {
+      "epoch": 0.7154772937905468,
+      "grad_norm": 18763.783203125,
+      "learning_rate": 0.0002456,
+      "loss": 7.6097,
+      "step": 193
+    },
+    {
+      "epoch": 0.7191844300278035,
+      "grad_norm": 18051.265625,
+      "learning_rate": 0.0002448,
+      "loss": 7.6265,
+      "step": 194
+    },
+    {
+      "epoch": 0.7228915662650602,
+      "grad_norm": 21930.23828125,
+      "learning_rate": 0.000244,
+      "loss": 7.7064,
+      "step": 195
+    },
+    {
+      "epoch": 0.726598702502317,
+      "grad_norm": 21661.873046875,
+      "learning_rate": 0.0002432,
+      "loss": 7.7374,
+      "step": 196
+    },
+    {
+      "epoch": 0.7303058387395737,
+      "grad_norm": 26628.837890625,
+      "learning_rate": 0.0002424,
+      "loss": 7.6806,
+      "step": 197
+    },
+    {
+      "epoch": 0.7340129749768304,
+      "grad_norm": 24882.0234375,
+      "learning_rate": 0.0002416,
+      "loss": 7.6327,
+      "step": 198
+    },
+    {
+      "epoch": 0.7377201112140871,
+      "grad_norm": 25492.328125,
+      "learning_rate": 0.0002408,
+      "loss": 7.6956,
+      "step": 199
+    },
+    {
+      "epoch": 0.7414272474513438,
+      "grad_norm": 27734.201171875,
+      "learning_rate": 0.00024,
+      "loss": 7.6169,
+      "step": 200
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.78927800680448e+16,
+  "train_batch_size": 6,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2b0885aabf5062bce5be067bc22e23dfeed6563307742b230545b8e8cbab6b6c
+size 5112