Upload 16 files

Browse files

Files changed (16) hide show

added_tokens.json +5 -0
config.json +36 -0
generation_config.json +7 -0
merges.txt +0 -0
model-00001-of-00002.safetensors +3 -0
model-00002-of-00002.safetensors +3 -0
model.safetensors.index.json +297 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +35 -0
tokenizer.json +0 -0
tokenizer_config.json +200 -0
trainer_state.json +345 -0
training_args.bin +3 -0
vocab.json +0 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "<|end_of_role|>": 49153,
+  "<|start_of_role|>": 49152,
+  "<|tool_call|>": 49154
+}

config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "_name_or_path": "E:/text-generation-webui-1.14/models/granite-3.1-3b-a800m-instruct",
+  "architectures": [
+    "GraniteMoeForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attention_multiplier": 0.015625,
+  "bos_token_id": 0,
+  "embedding_multiplier": 12.0,
+  "eos_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 512,
+  "logits_scaling": 6.0,
+  "max_position_embeddings": 131072,
+  "model_type": "granitemoe",
+  "num_attention_heads": 24,
+  "num_experts_per_tok": 8,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "num_local_experts": 40,
+  "output_router_logits": false,
+  "pad_token_id": 0,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "router_aux_loss_coef": 0.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.48.3",
+  "use_cache": false,
+  "vocab_size": 49155
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "pad_token_id": 0,
+  "transformers_version": "4.48.3"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c4467e3124323e483c31ef2ab8afed70d2734bdb2bb03a5b05da4d4a529735b1
+size 4998548704

model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:16232d1b115b7e9ac1ee42af3c7da2520fefa82d155016a3162fe19aa6334361
+size 2001984712

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,297 @@

+{
+  "metadata": {
+    "total_size": 7000498176
+  },
+  "weight_map": {
+    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.block_sparse_moe.input_linear.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.block_sparse_moe.output_linear.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.block_sparse_moe.router.layer.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.block_sparse_moe.input_linear.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.block_sparse_moe.output_linear.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.block_sparse_moe.router.layer.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.block_sparse_moe.input_linear.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.block_sparse_moe.output_linear.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.block_sparse_moe.router.layer.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.block_sparse_moe.input_linear.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.block_sparse_moe.output_linear.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.block_sparse_moe.router.layer.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.block_sparse_moe.input_linear.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.block_sparse_moe.output_linear.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.block_sparse_moe.router.layer.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.block_sparse_moe.input_linear.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.block_sparse_moe.output_linear.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.block_sparse_moe.router.layer.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.3.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.30.block_sparse_moe.input_linear.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.block_sparse_moe.output_linear.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.block_sparse_moe.router.layer.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.block_sparse_moe.input_linear.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.block_sparse_moe.output_linear.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.block_sparse_moe.router.layer.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.4.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.block_sparse_moe.input_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.block_sparse_moe.output_linear.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.block_sparse_moe.router.layer.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.norm.weight": "model-00002-of-00002.safetensors"
+  }
+}

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d5350c6acb49e8819864396e8403d377b0fb60d0752623d6be45d0806e0405a9
+size 1611660904

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1a5b5e911d73b506345e399f41700d2610af622150ae128a8d07653de423dec7
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1186f7221b751ec9a0bdbdc1e10b890ebd2828e6bf9d87eda330b10e78ef66b
+size 1192

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>"
+  ],
+  "bos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,200 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49152": {
+      "content": "<|start_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49153": {
+      "content": "<|end_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49154": {
+      "content": "<|tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|start_of_role|>",
+    "<|end_of_role|>",
+    "<|tool_call|>"
+  ],
+  "bos_token": "<|end_of_text|>",
+  "chat_template": "{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content'] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = \"Knowledge Cutoff Date: April 2024.\nToday's Date: \" + strftime_now('%B %d, %Y') + \".\nYou are Granite, developed by IBM.\" %}\n    {%- if tools and documents %}\n        {%- set system_message = system_message + \" You are a helpful AI assistant with access to the following tools. When a tool is required to answer the user's query, respond with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.\n\nWrite the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.\" %}\n    {%- elif tools %}\n        {%- set system_message = system_message + \" You are a helpful AI assistant with access to the following tools. When a tool is required to answer the user's query, respond with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.\" %}\n    {%- elif documents %}\n        {%- set system_message = system_message + \" Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.\" %}\n    {%- else %}\n        {%- set system_message = system_message + \" You are a helpful AI assistant.\" %}    \n    {%- endif %}\n    {%- if 'citations' in controls and documents %}\n        {%- set system_message = system_message + '\n\nIn your response, use the symbols <co> and </co> to indicate when a fact comes from a document in the search result, e.g <co>0</co> for a fact from document 0. Afterwards, list all the citations with their corresponding documents in an ordered list.' %}\n    {%- endif %}\n    {%- if 'hallucinations' in controls and documents %}\n        {%- set system_message = system_message + '\n\nFinally, after the response is written, include a numbered list of sentences from the response that are potentially hallucinated and not based in the documents.' %}\n    {%- endif %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n{{- '<|start_of_role|>system<|end_of_role|>' + system_message + '<|end_of_text|>\n' }}\n{%- if tools %}\n    {{- '<|start_of_role|>tools<|end_of_role|>' }}\n    {{- tools | tojson(indent=4) }}\n    {{- '<|end_of_text|>\n' }}\n{%- endif %}\n{%- if documents %}\n    {{- '<|start_of_role|>documents<|end_of_role|>' }}\n    {%- for document in documents %}\n        {{- 'Document ' + loop.index0 | string + '\n' }}\n        {{- document['text'] }}\n        {%- if not loop.last %}\n            {{- '\n\n'}}\n        {%- endif%}\n    {%- endfor %}\n    {{- '<|end_of_text|>\n' }}\n{%- endif %}\n{%- for message in loop_messages %}\n    {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- if loop.last and add_generation_prompt %}\n        {{- '<|start_of_role|>assistant' }}\n            {%- if controls %}\n                {{- ' ' + controls | tojson()}}\n            {%- endif %}\n        {{- '<|end_of_role|>' }}\n    {%- endif %}\n{%- endfor %}",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_text|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 2048,
+  "pad_token": "<|end_of_text|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|end_of_text|>",
+  "vocab_size": 49152
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,345 @@

+{
+  "best_metric": 0.6242462992668152,
+  "best_model_checkpoint": "saves/granite-3.1-3b-a800m-instruct\\checkpoint-400",
+  "epoch": 0.6676403087836428,
+  "eval_steps": 100,
+  "global_step": 400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01669100771959107,
+      "grad_norm": 2.9202163219451904,
+      "learning_rate": 0.0004,
+      "loss": 1.278,
+      "step": 10
+    },
+    {
+      "epoch": 0.03338201543918214,
+      "grad_norm": 4.189798355102539,
+      "learning_rate": 0.0004,
+      "loss": 0.9626,
+      "step": 20
+    },
+    {
+      "epoch": 0.05007302315877321,
+      "grad_norm": 2.869903087615967,
+      "learning_rate": 0.0004,
+      "loss": 0.993,
+      "step": 30
+    },
+    {
+      "epoch": 0.06676403087836429,
+      "grad_norm": 2.438555955886841,
+      "learning_rate": 0.0004,
+      "loss": 0.8561,
+      "step": 40
+    },
+    {
+      "epoch": 0.08345503859795535,
+      "grad_norm": 2.5792276859283447,
+      "learning_rate": 0.0004,
+      "loss": 1.008,
+      "step": 50
+    },
+    {
+      "epoch": 0.10014604631754642,
+      "grad_norm": 2.982292890548706,
+      "learning_rate": 0.0004,
+      "loss": 0.7934,
+      "step": 60
+    },
+    {
+      "epoch": 0.1168370540371375,
+      "grad_norm": 1.8241057395935059,
+      "learning_rate": 0.0004,
+      "loss": 0.7811,
+      "step": 70
+    },
+    {
+      "epoch": 0.13352806175672857,
+      "grad_norm": 1.7634323835372925,
+      "learning_rate": 0.0004,
+      "loss": 0.8709,
+      "step": 80
+    },
+    {
+      "epoch": 0.15021906947631963,
+      "grad_norm": 2.9324874877929688,
+      "learning_rate": 0.0004,
+      "loss": 0.8429,
+      "step": 90
+    },
+    {
+      "epoch": 0.1669100771959107,
+      "grad_norm": 1.6204209327697754,
+      "learning_rate": 0.0004,
+      "loss": 0.7855,
+      "step": 100
+    },
+    {
+      "epoch": 0.1669100771959107,
+      "eval_loss": 0.6945974230766296,
+      "eval_runtime": 172.0416,
+      "eval_samples_per_second": 6.196,
+      "eval_steps_per_second": 3.098,
+      "step": 100
+    },
+    {
+      "epoch": 0.18360108491550178,
+      "grad_norm": 3.2142117023468018,
+      "learning_rate": 0.0004,
+      "loss": 0.802,
+      "step": 110
+    },
+    {
+      "epoch": 0.20029209263509284,
+      "grad_norm": 2.2638301849365234,
+      "learning_rate": 0.0004,
+      "loss": 0.7556,
+      "step": 120
+    },
+    {
+      "epoch": 0.2169831003546839,
+      "grad_norm": 1.815718412399292,
+      "learning_rate": 0.0004,
+      "loss": 0.8419,
+      "step": 130
+    },
+    {
+      "epoch": 0.233674108074275,
+      "grad_norm": 2.0672075748443604,
+      "learning_rate": 0.0004,
+      "loss": 0.9097,
+      "step": 140
+    },
+    {
+      "epoch": 0.25036511579386606,
+      "grad_norm": 1.7800904512405396,
+      "learning_rate": 0.0004,
+      "loss": 0.8887,
+      "step": 150
+    },
+    {
+      "epoch": 0.26705612351345714,
+      "grad_norm": 2.7289271354675293,
+      "learning_rate": 0.0004,
+      "loss": 0.8078,
+      "step": 160
+    },
+    {
+      "epoch": 0.2837471312330482,
+      "grad_norm": 1.8588060140609741,
+      "learning_rate": 0.0004,
+      "loss": 0.824,
+      "step": 170
+    },
+    {
+      "epoch": 0.30043813895263927,
+      "grad_norm": 3.1786084175109863,
+      "learning_rate": 0.0004,
+      "loss": 0.7997,
+      "step": 180
+    },
+    {
+      "epoch": 0.31712914667223036,
+      "grad_norm": 1.992241382598877,
+      "learning_rate": 0.0004,
+      "loss": 0.7627,
+      "step": 190
+    },
+    {
+      "epoch": 0.3338201543918214,
+      "grad_norm": 1.9906195402145386,
+      "learning_rate": 0.0004,
+      "loss": 0.7525,
+      "step": 200
+    },
+    {
+      "epoch": 0.3338201543918214,
+      "eval_loss": 0.6729084849357605,
+      "eval_runtime": 168.6462,
+      "eval_samples_per_second": 6.321,
+      "eval_steps_per_second": 3.16,
+      "step": 200
+    },
+    {
+      "epoch": 0.3505111621114125,
+      "grad_norm": 1.408159613609314,
+      "learning_rate": 0.0004,
+      "loss": 0.803,
+      "step": 210
+    },
+    {
+      "epoch": 0.36720216983100357,
+      "grad_norm": 2.2278130054473877,
+      "learning_rate": 0.0004,
+      "loss": 0.8844,
+      "step": 220
+    },
+    {
+      "epoch": 0.3838931775505946,
+      "grad_norm": 2.3945512771606445,
+      "learning_rate": 0.0004,
+      "loss": 0.9442,
+      "step": 230
+    },
+    {
+      "epoch": 0.4005841852701857,
+      "grad_norm": 1.1758439540863037,
+      "learning_rate": 0.0004,
+      "loss": 0.8379,
+      "step": 240
+    },
+    {
+      "epoch": 0.4172751929897768,
+      "grad_norm": 2.483109951019287,
+      "learning_rate": 0.0004,
+      "loss": 0.7718,
+      "step": 250
+    },
+    {
+      "epoch": 0.4339662007093678,
+      "grad_norm": 1.4695591926574707,
+      "learning_rate": 0.0004,
+      "loss": 0.7664,
+      "step": 260
+    },
+    {
+      "epoch": 0.4506572084289589,
+      "grad_norm": 1.5021220445632935,
+      "learning_rate": 0.0004,
+      "loss": 0.92,
+      "step": 270
+    },
+    {
+      "epoch": 0.46734821614855,
+      "grad_norm": 1.426329255104065,
+      "learning_rate": 0.0004,
+      "loss": 0.9022,
+      "step": 280
+    },
+    {
+      "epoch": 0.484039223868141,
+      "grad_norm": 1.4195940494537354,
+      "learning_rate": 0.0004,
+      "loss": 0.7857,
+      "step": 290
+    },
+    {
+      "epoch": 0.5007302315877321,
+      "grad_norm": 1.7680014371871948,
+      "learning_rate": 0.0004,
+      "loss": 0.8381,
+      "step": 300
+    },
+    {
+      "epoch": 0.5007302315877321,
+      "eval_loss": 0.6377778649330139,
+      "eval_runtime": 168.5375,
+      "eval_samples_per_second": 6.325,
+      "eval_steps_per_second": 3.163,
+      "step": 300
+    },
+    {
+      "epoch": 0.5174212393073232,
+      "grad_norm": 1.8080698251724243,
+      "learning_rate": 0.0004,
+      "loss": 0.8346,
+      "step": 310
+    },
+    {
+      "epoch": 0.5341122470269143,
+      "grad_norm": 7.522341728210449,
+      "learning_rate": 0.0004,
+      "loss": 0.7861,
+      "step": 320
+    },
+    {
+      "epoch": 0.5508032547465053,
+      "grad_norm": 1.397352933883667,
+      "learning_rate": 0.0004,
+      "loss": 0.8113,
+      "step": 330
+    },
+    {
+      "epoch": 0.5674942624660964,
+      "grad_norm": 1.4032018184661865,
+      "learning_rate": 0.0004,
+      "loss": 0.7936,
+      "step": 340
+    },
+    {
+      "epoch": 0.5841852701856874,
+      "grad_norm": 1.6604522466659546,
+      "learning_rate": 0.0004,
+      "loss": 0.7278,
+      "step": 350
+    },
+    {
+      "epoch": 0.6008762779052785,
+      "grad_norm": 1.8743423223495483,
+      "learning_rate": 0.0004,
+      "loss": 0.8652,
+      "step": 360
+    },
+    {
+      "epoch": 0.6175672856248696,
+      "grad_norm": 3.0662145614624023,
+      "learning_rate": 0.0004,
+      "loss": 0.8746,
+      "step": 370
+    },
+    {
+      "epoch": 0.6342582933444607,
+      "grad_norm": 1.4264730215072632,
+      "learning_rate": 0.0004,
+      "loss": 0.7431,
+      "step": 380
+    },
+    {
+      "epoch": 0.6509493010640517,
+      "grad_norm": 1.9815279245376587,
+      "learning_rate": 0.0004,
+      "loss": 0.828,
+      "step": 390
+    },
+    {
+      "epoch": 0.6676403087836428,
+      "grad_norm": 1.3689788579940796,
+      "learning_rate": 0.0004,
+      "loss": 0.8564,
+      "step": 400
+    },
+    {
+      "epoch": 0.6676403087836428,
+      "eval_loss": 0.6242462992668152,
+      "eval_runtime": 168.5455,
+      "eval_samples_per_second": 6.325,
+      "eval_steps_per_second": 3.162,
+      "step": 400
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 10000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 17,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.420426751907922e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5fa63d0db02ddd4ae27e9456bba55f591922f2a5853634d8ab48d95127679a85
+size 5688

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff