Initial commit

Browse files

Files changed (9) hide show

config.json +30 -0
generation_config.json +10 -0
model.safetensors +3 -0
modeling_llama_nugptq.py +36 -0
quantize_config.json +13 -0
special_tokens_map.json +23 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +35 -0

config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "architectures": [
+    "LLamaNuGPTQForCausalLM"
+  ],
+  "auto_map": {
+    "AutoModelForCausalLM": "modeling_llama_nugptq.LLamaNuGPTQForCausalLM"
+  },
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 4096,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.38.2",
+  "use_cache": true,
+  "vocab_size": 32000
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "bos_token_id": 1,
+  "do_sample": true,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "temperature": 0.6,
+  "max_length": 4096,
+  "top_p": 0.9,
+  "transformers_version": "4.31.0.dev0"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6c5c61af2c480c4de3d110490a0440f951c629a5326a0b9b88737ab8380cb380
+size 13479611720

modeling_llama_nugptq.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from transformers import LlamaForCausalLM
+import torch
+from torch import nn
+class ScaledLinear(nn.Linear):  # nn.Linear whose output is multiplied by an extra `output_scales` nn.Parameter.
+    def __init__(self, in_features, out_features, bias=True):  # NOTE: the default bias=True fails the assert below; callers must pass bias=False.
+        super().__init__(in_features, out_features, bias=bias)
+        self.output_scales = nn.Parameter(torch.ones((1, out_features)))  # shape (1, out_features), initialised to ones so scaling starts as identity.
+        assert bias == False, "bias not supported yet" # need to divide bias by scales. NOTE(review): asserts are stripped under `python -O`, so this guard is not enforced there.
+    def forward(self, x):  # Regular nn.Linear forward, then multiply by output_scales (broadcast from its (1, out_features) shape).
+        return super().forward(x) * self.output_scales
+    # Works for CPU but not CUDA.
+    # Starting point if you need to add support for bias.
+    # def _load_from_state_dict(self, *args, **kwargs):
+    #     # Seems like transformers doesn't call load_state_dict.
+    #     # args[0] - state_dict
+    #     # args[1] - prefix
+    #     args[0][f"{args[1]}output_scales"] = args[0][f"{args[1]}output_scales"].t()
+    #     super()._load_from_state_dict(*args, **kwargs)
+    #     if self.bias is not None:
+    #         self.bias.data = self.bias.data / self.output_scales
+class LLamaNuGPTQForCausalLM(LlamaForCausalLM):  # LlamaForCausalLM with its listed *_proj nn.Linear children swapped for ScaledLinear.
+    def __init__(self, *args, **kwargs):  # All arguments are forwarded unchanged to LlamaForCausalLM.__init__.
+        super().__init__(*args, **kwargs)
+        def replace_linear_modules(module):  # Recursively walks `module`'s children, replacing matching ones in place.
+            for name, mod in module.named_children():
+                if isinstance(mod, nn.Linear) and name in ["gate_proj", "up_proj", "down_proj", "q_proj", "k_proj", "v_proj", "o_proj"]:
+                    setattr(module, name, ScaledLinear(mod.in_features, mod.out_features, mod.bias is not None))  # same in/out sizes, freshly initialised (old weights are not copied); a layer that has a bias trips ScaledLinear's assert.
+                else:
+                    replace_linear_modules(mod)  # not a replacement target: descend into its own children.
+        replace_linear_modules(self)

quantize_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "bits": 4,
+  "group_size": -1,
+  "damp_percent": 0.01,
+  "desc_act": true,
+  "static_groups": false,
+  "sym": true,
+  "true_sequential": true,
+  "model_name_or_path": "meta-llama/Llama-2-7b-hf-4bit-nuq",
+  "model_file_base_name": "gptq_model-4bit--1g",
+  "is_marlin_format": false,
+  "quant_method": "gptq"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "clean_up_tokenization_spaces": false,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}