LoneStriker committed on
Commit
750220f
•
1 Parent(s): 7928b8c

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Trinity.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: llama2
3
+ ---
4
+
5
+ # Trinity
6
+
7
+
8
+ ![Trinity](https://huggingface.co/migtissera/Trinity-13B-v1.0/resolve/main/Trinity.png)
9
+
10
+
11
+ Trinity is a coding-specific model series that can be used to create autonomous agents. In the future, we will be releasing agent software that uses this model.
12
+
13
+
14
+ # Our Offensive Cybersecurity Model, WhiteRabbitNeo-33B, is now in beta!
15
+ Access at: https://www.whiterabbitneo.com/
16
+
17
+ # Join Our Discord Server
18
+ Join us at: https://discord.gg/8Ynkrcbk92 (Updated on Dec 29th. Now permanent link to join)
19
+
20
+ # Sample Inference Code
21
+ ```
22
+ import torch, json
23
+ from transformers import AutoModelForCausalLM, AutoTokenizer
24
+
25
+ model_path = "/home/migel/models/WhiteRabbitNeo"
26
+
27
+ model = AutoModelForCausalLM.from_pretrained(
28
+ model_path,
29
+ torch_dtype=torch.float16,
30
+ device_map="auto",
31
+ load_in_4bit=False,
32
+ load_in_8bit=True,
33
+ trust_remote_code=True,
34
+ )
35
+
36
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
37
+
38
+
39
+ def generate_text(instruction):
40
+ tokens = tokenizer.encode(instruction)
41
+ tokens = torch.LongTensor(tokens).unsqueeze(0)
42
+ tokens = tokens.to("cuda")
43
+
44
+ instance = {
45
+ "input_ids": tokens,
46
+ "top_p": 1.0,
47
+ "temperature": 0.5,
48
+ "generate_len": 1024,
49
+ "top_k": 50,
50
+ }
51
+
52
+ length = len(tokens[0])
53
+ with torch.no_grad():
54
+ rest = model.generate(
55
+ input_ids=tokens,
56
+ max_length=length + instance["generate_len"],
57
+ use_cache=True,
58
+ do_sample=True,
59
+ top_p=instance["top_p"],
60
+ temperature=instance["temperature"],
61
+ top_k=instance["top_k"],
62
+ num_return_sequences=1,
63
+ )
64
+ output = rest[0][length:]
65
+ string = tokenizer.decode(output, skip_special_tokens=True)
66
+ answer = string.split("USER:")[0].strip()
67
+ return f"{answer}"
68
+
69
+
70
+ tot_system_prompt = """
71
+ Answer the Question by exploring multiple reasoning paths as follows:
72
+ - First, carefully analyze the question to extract the key information components and break it down into logical sub-questions. This helps set up the framework for reasoning. The goal is to construct an internal search tree.
73
+ - For each sub-question, leverage your knowledge to generate 2-3 intermediate thoughts that represent steps towards an answer. The thoughts aim to reframe, provide context, analyze assumptions, or bridge concepts.
74
+ - Evaluate the clarity, relevance, logical flow and coverage of concepts for each thought option. Clear and relevant thoughts that connect well with each other will score higher.
75
+ - Based on the thought evaluations, deliberate to construct a chain of reasoning that stitches together the strongest thoughts in a natural order.
76
+ - If the current chain is determined to not fully answer the question, backtrack and explore alternative paths by substituting different high-scoring thoughts.
77
+ - Throughout the reasoning process, aim to provide explanatory details on thought process rather than just state conclusions, including briefly noting why some thoughts were deemed less ideal.
78
+ - Once a reasoning chain is constructed that thoroughly answers all sub-questions in a clear, logical manner, synthesize the key insights into a final concise answer.
79
+ - Please note that while the focus is on the final answer in the response, it should also include intermediate thoughts inline to illustrate the deliberative reasoning process.
80
+ In summary, leverage a Tree of Thoughts approach to actively explore multiple reasoning paths, evaluate thoughts heuristically, and explain the process - with the goal of producing insightful answers.
81
+ """
82
+
83
+ conversation = f"SYSTEM: {tot_system_prompt} Always answer without hesitation."
84
+
85
+
86
+ while True:
87
+ user_input = input("You: ")
88
+ llm_prompt = f"{conversation} \nUSER: {user_input} \nASSISTANT: "
89
+ answer = generate_text(llm_prompt)
90
+ print(answer)
91
+ conversation = f"{llm_prompt}{answer}"
92
+ # print(conversation)
93
+ json_data = {"prompt": user_input, "answer": answer}
94
+
95
+ # print(json_data)
96
+ # with open(output_file_path, "a") as output_file:
97
+ # output_file.write(json.dumps(json_data) + "\n")
98
+
99
+ ```
100
+
101
+
102
+ # Sample Conversation:
103
+
104
+
105
+ You: Code me a Fast API server, with one endpoint. This endpoint returns files from a S3 bucket.
106
+
107
+ Here is how to code a Fast API server with one endpoint that returns files from an S3 bucket:
108
+
109
+ ```python
110
+ from fastapi import FastAPI
111
+ from starlette.responses import StreamingResponse
112
+ import boto3
113
+ import botocore.exceptions
114
+
115
+ app = FastAPI()
116
+ s3_client = boto3.client('s3')
117
+
118
+ @app.get("/s3files")
119
+ async def get_s3_files(bucket_name: str, key: str):
120
+ try:
121
+ obj = s3_client.get_object(Bucket=bucket_name, Key=key)
122
+ return StreamingResponse(obj['Body'], media_type=obj['ContentType'])
123
+ except botocore.exceptions.ClientError as e:
124
+ return {"error": str(e)}, 404
125
+ ```
126
+
127
+ To explain:
128
+ - We initialize a FastAPI app and an S3 client
129
+ - The `/s3files` endpoint takes the bucket name and object key as parameters
130
+ - We try to get the object from S3 using the S3 client
131
+ - If successful, we return the object's Body wrapped in a StreamingResponse, setting the ContentType
132
+ - If the client raises an error, we return a 404 with the error message
133
+
134
+ This allows clients to request files from an S3 bucket by specifying the bucket name and object key. The endpoint streams the file content back to the client.
135
+
136
+ To elaborate further:
137
+ - The endpoint can be extended to validate inputs, handle authentication, logging, etc.
138
+ - We could make this endpoint more generic by accepting a list of object keys instead of requiring a single key.
139
+ - There are several ways to optimize performance, like caching responses.
140
+
141
+ Let me know if you have any other questions!
142
+
143
+ You: How do I authenticate s3?
144
+
145
+ Here are a few ways to authenticate with S3:
146
+
147
+ 1. Credentials - You can provide AWS access key ID and secret access key to boto3. This will sign requests automatically.
148
+
149
+ ```python
150
+ import boto3
151
+
152
+ s3 = boto3.client('s3',
153
+ aws_access_key_id='YOUR_ACCESS_KEY',
154
+ aws_secret_access_key='YOUR_SECRET_KEY')
155
+ ```
156
+
157
+ 2. IAM roles - If you are running on EC2 with an IAM role, you don't need to provide credentials. Boto3 will automatically use the role.
158
+
159
+ 3. Environment variables - Store credentials in environment variables and read them in boto3.
160
+
161
+ 4. AWS credentials file - Store credentials in ~/.aws/credentials. Boto3 will automatically use them.
162
+
163
+ 5. STS Assume roles - Get temporary credentials from STS and assume a role to access S3.
164
+
165
+ 6. AWS SDK credential providers - Use environment variables, configuration files, etc to load credentials.
166
+
167
+ So in summary, the best way depends on your deployment environment. The best practice is to use IAM roles when possible. Other options like environment variables and credential files can be useful for testing.
168
+
169
+ Let me know if you have any other questions!
Trinity.png ADDED

Git LFS Details

  • SHA256: d737b133f13fe8e647e16a13c810e86d098cf345fc01aad8e075af7b67d96799
  • Pointer size: 132 Bytes
  • Size of remote file: 1 MB
config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/migel/CodeLlama-13B-fp16",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "configuration_llama.LlamaConfig",
10
+ "AutoModel": "modeling_llama.LlamaModel",
11
+ "AutoModelForCausalLM": "modeling_llama.LlamaForCausalLM",
12
+ "AutoModelForSequenceClassification": "modeling_llama.LlamaForSequenceClassification"
13
+ },
14
+ "bos_token_id": 1,
15
+ "eos_token_id": 2,
16
+ "hidden_act": "silu",
17
+ "hidden_size": 5120,
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 13824,
20
+ "max_position_embeddings": 16384,
21
+ "model_type": "llama",
22
+ "num_attention_heads": 40,
23
+ "num_hidden_layers": 40,
24
+ "num_key_value_heads": 40,
25
+ "pad_token_id": 0,
26
+ "pretraining_tp": 1,
27
+ "rms_norm_eps": 1e-05,
28
+ "rope_scaling": null,
29
+ "rope_theta": 1000000,
30
+ "tie_word_embeddings": false,
31
+ "torch_dtype": "bfloat16",
32
+ "transformers_version": "4.36.2",
33
+ "use_cache": false,
34
+ "vocab_size": 32016
35
+ }
configuration_llama.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """ LLaMA model configuration"""
21
+
22
+ from transformers.configuration_utils import PretrainedConfig
23
+ from transformers.utils import logging
24
+
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+ LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
29
+
30
+
31
class LlamaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the LLaMA-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`LlamaModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        pretraining_tp (`int`, *optional*, defaults to `1`):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id, forwarded to [`PretrainedConfig`].
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning-of-sequence token id, forwarded to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 2):
            End-of-sequence token id, forwarded to [`PretrainedConfig`].
        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
            these scaling strategies behave:
            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
            experimental feature, subject to breaking API changes in future versions.
        rope_theta (`float`, *optional*, defaults to 10000):
            Value stored as `config.rope_theta`; the attention layers pass it as the `base` of the rotary embeddings.

    Example:

    ```python
    >>> from transformers import LlamaModel, LlamaConfig

    >>> # Initializing a LLaMA llama-7b style configuration
    >>> configuration = LlamaConfig()

    >>> # Initializing a model from the llama-7b style configuration
    >>> model = LlamaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "llama"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_scaling=None,
        rope_theta=10000,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility: configs without `num_key_value_heads` fall back to plain MHA
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_scaling = rope_scaling
        # Fail fast on a malformed `rope_scaling` before the base class sees the kwargs.
        self._rope_scaling_validation()
        self.rope_theta = rope_theta

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Raises:
            ValueError: if `rope_scaling` is set but is not a two-entry dict with a `type` of `"linear"` or
                `"dynamic"` and a float `factor` strictly greater than 1.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
                f"got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )
        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.36.2"
7
+ }
modeling_llama.py ADDED
@@ -0,0 +1,1020 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """ PyTorch LLaMA model."""
21
+ import math
22
+ from typing import List, Optional, Tuple, Union
23
+
24
+ import torch
25
+ import torch.nn.functional as F
26
+ import torch.utils.checkpoint
27
+ from torch import nn
28
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
29
+
30
+ from transformers.activations import ACT2FN
31
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
32
+ from transformers.modeling_utils import PreTrainedModel
33
+ from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
34
+ from .configuration_llama import LlamaConfig
35
+
36
+
37
+ logger = logging.get_logger(__name__)
38
+
39
+ _CONFIG_FOR_DOC = "LlamaConfig"
40
+
41
+
42
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
43
def _make_causal_mask(
    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
):
    """
    Make the additive causal (uni-directional) mask used for decoder self-attention.

    Args:
        input_ids_shape: `(bsz, tgt_len)` of the current input.
        dtype: floating dtype of the returned mask.
        device: device on which the mask is created.
        past_key_values_length: number of cached positions; they are prepended as fully visible columns.

    Returns:
        A `(bsz, 1, tgt_len, tgt_len + past_key_values_length)` tensor holding `0` where a query position may
        attend and `torch.finfo(dtype).min` where it may not (keys after the query).
    """
    bsz, tgt_len = input_ids_shape
    # Allocate directly in `dtype`: filling a default-dtype tensor first cannot represent
    # `finfo(dtype).min` for wider dtypes such as float64, and needs an extra cast otherwise.
    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, dtype=dtype, device=device)
    mask_cond = torch.arange(mask.size(-1), device=device)
    # Zero the lower triangle including the diagonal: column j is visible to row i when j <= i.
    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)

    if past_key_values_length > 0:
        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
58
+
59
+
60
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
61
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Turn a `[bsz, seq_len]` attention mask into an additive `[bsz, 1, tgt_seq_len, src_seq_len]` mask.

    Entries that are `1` in `mask` become `0`; entries that are `0` become `torch.finfo(dtype).min`.
    `tgt_len` defaults to the source length when not given.
    """
    batch, source_len = mask.size()
    target_len = source_len if tgt_len is None else tgt_len

    # Broadcast the per-key mask over every query position.
    keep = mask[:, None, None, :].expand(batch, 1, target_len, source_len).to(dtype)
    blocked = 1.0 - keep

    return blocked.masked_fill(blocked.to(torch.bool), torch.finfo(dtype).min)
73
+
74
+
75
class LlamaRMSNorm(nn.Module):
    """Root-mean-square layer norm with a learned per-feature scale (equivalent to T5LayerNorm)."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        Args:
            hidden_size: size of the last (normalized) dimension; the scale is initialized to ones.
            eps: constant added to the mean square before taking the reciprocal square root.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalize over the last dimension in float32, cast back to the input dtype, then apply the scale."""
        original_dtype = hidden_states.dtype
        upcast = hidden_states.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalized.to(original_dtype)
90
+
91
+
92
class LlamaRotaryEmbedding(torch.nn.Module):
    """Precomputes and serves the cos/sin tables used by rotary position embeddings (RoPE)."""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        """
        Args:
            dim: size of the rotated dimension (the attention layer passes its head dimension).
            max_position_embeddings: number of positions to precompute up front.
            base: base of the geometric frequency progression.
            device: device on which the inverse frequencies are created.
        """
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the `[1, 1, seq_len, dim]` cos/sin buffers for positions `0..seq_len-1`."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """
        Return `(cos, sin)` tables for the first `seq_len` positions, cast to `x.dtype`.

        Args:
            x: `[bs, num_attention_heads, seq_len, head_size]` tensor; only its dtype, device and (when
                `seq_len` is omitted) its sequence dimension are used.
            seq_len: number of positions required. Defaults to the sequence dimension of `x`.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len is None:
            # The signature advertises `seq_len` as optional; comparing None with an int would raise TypeError.
            seq_len = x.shape[-2]

        # Grow the cache lazily when a longer sequence than any seen so far is requested.
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
        )
126
+
127
+
128
class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
    """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # Must be assigned before the base constructor runs, because that constructor
        # calls `_set_cos_sin_cache`, which reads `scaling_factor`.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild the cos/sin buffers with position indices divided by `scaling_factor`."""
        self.max_seq_len_cached = seq_len
        positions = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
        positions = positions / self.scaling_factor

        angles = torch.einsum("i,j->ij", positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        cos_table = table.cos()[None, None, :, :].to(dtype)
        sin_table = table.sin()[None, None, :, :].to(dtype)
        self.register_buffer("cos_cached", cos_table, persistent=False)
        self.register_buffer("sin_cached", sin_table, persistent=False)
145
+
146
+
147
class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
    """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # Must be assigned before the base constructor runs, because that constructor
        # calls `_set_cos_sin_cache`, which reads `scaling_factor`.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild the cos/sin buffers, enlarging the frequency base once `seq_len` exceeds the configured maximum."""
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            # Only sequences longer than the configured maximum get a rescaled base.
            stretch = (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
            base = self.base * stretch ** (self.dim / (self.dim - 2))
            exponents = torch.arange(0, self.dim, 2).float().to(device) / self.dim
            self.register_buffer("inv_freq", 1.0 / (base**exponents), persistent=False)

        positions = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)

        angles = torch.einsum("i,j->ij", positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        cos_table = table.cos()[None, None, :, :].to(dtype)
        sin_table = table.sin()[None, None, :, :].to(dtype)
        self.register_buffer("cos_cached", cos_table, persistent=False)
        self.register_buffer("sin_cached", sin_table, persistent=False)
171
+
172
+
173
def rotate_half(x):
    """Split the last dimension in two halves `(a, b)` and return `(-b, a)` concatenated."""
    midpoint = x.shape[-1] // 2
    front, back = x[..., :midpoint], x[..., midpoint:]
    return torch.cat((-back, front), dim=-1)
178
+
179
+
180
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    """
    Apply rotary position embeddings to the query and key tensors.

    `cos` and `sin` are the cached tables whose first two dimensions are 1; the rows for each token are
    selected with `position_ids` and broadcast over the head dimension. Returns `(q_embed, k_embed)`.
    """

    def _select(table):
        # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
        flat = table.squeeze(1).squeeze(0)  # [seq_len, dim]
        return flat[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]

    cos_sel = _select(cos)
    sin_sel = _select(sin)
    q_embed = (q * cos_sel) + (rotate_half(q) * sin_sel)
    k_embed = (k * cos_sel) + (rotate_half(k) * sin_sel)
    return q_embed, k_embed
189
+
190
+
191
class LlamaMLP(nn.Module):
    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections bias-free."""

    def __init__(self, config):
        """
        Args:
            config: object providing `hidden_size`, `intermediate_size`, `hidden_act` (a key of `ACT2FN`)
                and `pretraining_tp` (read in `forward`).
        """
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """
        Run the MLP on `x`.

        When `config.pretraining_tp > 1` the weight matrices are split into chunks and the projections are
        computed chunk by chunk and recombined; otherwise the three linear layers are applied directly.
        The chunked path splits the intermediate activations along dimension 2, so it expects a 3-D input.
        """
        tp = self.config.pretraining_tp
        if tp <= 1:
            return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

        # `chunk` rather than `slice`, so the builtin is not shadowed.
        chunk = self.intermediate_size // tp
        gate_chunks = self.gate_proj.weight.split(chunk, dim=0)
        up_chunks = self.up_proj.weight.split(chunk, dim=0)
        down_chunks = self.down_proj.weight.split(chunk, dim=1)

        # Iterate over every chunk `split` produced instead of `range(tp)`: when `intermediate_size` is not
        # divisible by `tp`, `split` yields extra trailing chunks that indexing by `range(tp)` would drop.
        gate_proj = torch.cat([F.linear(x, weight) for weight in gate_chunks], dim=-1)
        up_proj = torch.cat([F.linear(x, weight) for weight in up_chunks], dim=-1)

        intermediate_states = (self.act_fn(gate_proj) * up_proj).split(chunk, dim=2)
        down_proj = sum(F.linear(state, weight) for state, weight in zip(intermediate_states, down_chunks))

        return down_proj
223
+
224
+
225
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times along the head dimension.

    Equivalent to `torch.repeat_interleave(x, dim=1, repeats=n_rep)`: the input goes from
    (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads * n_rep, seqlen, head_dim).
    With `n_rep == 1` the input tensor is returned unchanged.
    """
    batch, kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis after the head axis, broadcast it, then fold it into the head axis.
    tiled = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
    return tiled.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
235
+
236
+
237
class LlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper

    Queries are projected to `config.num_attention_heads` heads, keys and values to `config.num_key_value_heads`
    heads; key/value heads are repeated with `repeat_kv` before the attention product. Rotary position embeddings
    are applied to the queries and keys.
    """

    def __init__(self, config: LlamaConfig):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
        self._init_rope()

    def _init_rope(self):
        """Create `self.rotary_emb` according to `config.rope_scaling` (`None`, `"linear"` or `"dynamic"`).

        Raises:
            ValueError: if `config.rope_scaling["type"]` is neither `"linear"` nor `"dynamic"`.
        """
        if self.config.rope_scaling is None:
            self.rotary_emb = LlamaRotaryEmbedding(
                self.head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    base=self.rope_theta,
                    scaling_factor=scaling_factor,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    base=self.rope_theta,
                    scaling_factor=scaling_factor,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Reshape `tensor` to `(bsz, num_heads, seq_len, head_dim)` and make it contiguous."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    @staticmethod
    def _sliced_projection(hidden_states: torch.Tensor, weight: torch.Tensor, slice_size: int) -> torch.Tensor:
        """Apply `weight` in row slices of `slice_size` and concatenate the partial outputs on the last dim."""
        # Iterating over every slice (instead of indexing `range(pretraining_tp)`) keeps the remainder slice
        # when the projection size is not an exact multiple of the slice size.
        partial_outputs = [F.linear(hidden_states, weight_slice) for weight_slice in weight.split(slice_size, dim=0)]
        return torch.cat(partial_outputs, dim=-1)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Args:
            hidden_states (`torch.Tensor`): input of shape `(batch, seq_len, hidden_size)`.
            attention_mask (`torch.Tensor`, *optional*): additive mask of shape
                `(batch, 1, seq_len, kv_seq_len)`; it is summed with the attention scores before the softmax.
            position_ids (`torch.LongTensor`, *optional*): forwarded to `apply_rotary_pos_emb`.
            past_key_value (`Tuple(torch.Tensor)`, *optional*): cached `(key, value)` states that are prepended
                to the freshly computed ones along the sequence dimension.
            output_attentions (`bool`): whether to return the attention probabilities.
            use_cache (`bool`): whether to return the updated `(key, value)` cache.

        Returns:
            `(attn_output, attn_weights, past_key_value)`; `attn_weights` is `None` unless `output_attentions`
            is set and `past_key_value` is `None` unless `use_cache` is set.

        Raises:
            ValueError: if the attention scores, the attention mask or the attention output do not have the
                expected shape.
        """
        bsz, q_len, _ = hidden_states.size()
        pretraining_tp = self.config.pretraining_tp

        if pretraining_tp > 1:
            # Replay the projection slice by slice, the way it was laid out during tensor-parallel pretraining.
            query_slicing = (self.num_heads * self.head_dim) // pretraining_tp
            key_value_slicing = (self.num_key_value_heads * self.head_dim) // pretraining_tp
            query_states = self._sliced_projection(hidden_states, self.q_proj.weight, query_slicing)
            key_states = self._sliced_projection(hidden_states, self.k_proj.weight, key_value_slicing)
            value_states = self._sliced_projection(hidden_states, self.v_proj.weight, key_value_slicing)
        else:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)

        # (bsz, q_len, heads * head_dim) -> (bsz, heads, q_len, head_dim)
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value[0].shape[-2]
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            # reuse k, v, self_attention
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)

        # The cache stores the states before the heads are repeated.
        past_key_value = (key_states, value_states) if use_cache else None

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        if pretraining_tp > 1:
            output_slicing = self.hidden_size // pretraining_tp
            attn_output_slices = attn_output.split(output_slicing, dim=2)
            o_proj_slices = self.o_proj.weight.split(output_slicing, dim=1)
            # Both splits use the same slice size, so the slices pair up one-to-one, remainder included.
            attn_output = sum(
                F.linear(output_slice, weight_slice)
                for output_slice, weight_slice in zip(attn_output_slices, o_proj_slices)
            )
        else:
            attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
380
+
381
+
382
class LlamaDecoderLayer(nn.Module):
    """One decoder block: normalized self-attention and a normalized MLP, each wrapped in a residual connection."""

    def __init__(self, config: LlamaConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = LlamaAttention(config=config)
        self.mlp = LlamaMLP(config)
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Run the block on `hidden_states`.

        Args:
            hidden_states (`torch.FloatTensor`): layer input of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`, *optional*): mask of size `(batch, 1, tgt_len, src_len)` in which
                padding elements are marked by very large negative values.
            position_ids (`torch.LongTensor`, *optional*): passed through to the self-attention module.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached key and value projection states.
            output_attentions (`bool`, *optional*): when truthy, the attention weights are appended to the
                returned tuple.
            use_cache (`bool`, *optional*): when truthy, the present key/value states are appended to the
                returned tuple so they can be reused to speed up decoding.

        Returns:
            A tuple starting with the new hidden states, followed by the attention weights (only with
            `output_attentions`) and the present key/value states (only with `use_cache`), in that order.
        """
        # Self Attention
        attn_output, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        hidden_states = hidden_states + attn_output

        # Fully Connected
        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = hidden_states + mlp_output

        optional_outputs = []
        if output_attentions:
            optional_outputs.append(self_attn_weights)
        if use_cache:
            optional_outputs.append(present_key_value)

        return (hidden_states, *optional_outputs)
444
+
445
+
446
# Shared introduction that `add_start_docstrings` prepends to the docstring of every LLaMA model class.
LLAMA_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.

    Parameters:
        config ([`LlamaConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
461
+
462
+
463
@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
class LlamaPreTrainedModel(PreTrainedModel):
    # Class-level settings read by the `PreTrainedModel` machinery.
    config_class = LlamaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LlamaDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        """Draw `nn.Linear` / `nn.Embedding` weights from N(0, `config.initializer_range`).

        Linear biases and the embedding row at `padding_idx` are zeroed when present; other modules are left as is.
        """
        std = self.config.initializer_range
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return

        module.weight.data.normal_(mean=0.0, std=std)
        if isinstance(module, nn.Linear):
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        """Set the `gradient_checkpointing` flag on `module` when it is a `LlamaModel`."""
        if not isinstance(module, LlamaModel):
            return
        module.gradient_checkpointing = value
488
+
489
+
490
# Argument documentation that `add_start_docstrings_to_model_forward` attaches to the `forward` methods below.
LLAMA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`LlamaModel._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.num_hidden_layers`, with each tuple having 2 tensors
            (key and value) of shape `(batch_size, num_key_value_heads, sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
552
+
553
+
554
@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
class LlamaModel(LlamaPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]

    Args:
        config: LlamaConfig
    """

    def __init__(self, config: LlamaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with `value`."""
        self.embed_tokens = value

    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
        # create causal mask
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        combined_attention_mask = None
        if input_shape[-1] > 1:
            combined_attention_mask = _make_causal_mask(
                input_shape,
                inputs_embeds.dtype,
                device=inputs_embeds.device,
                past_key_values_length=past_key_values_length,
            )

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
                inputs_embeds.device
            )
            combined_attention_mask = (
                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
            )

        return combined_attention_mask

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        # Flags left as `None` fall back to the values stored on the config.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        seq_length_with_past = seq_length
        past_key_values_length = 0

        if past_key_values is not None:
            # Cached keys are laid out with the sequence length on dim 2.
            past_key_values_length = past_key_values[0][0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            # New tokens continue counting where the cache stops.
            position_ids = torch.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
        else:
            position_ids = position_ids.view(-1, seq_length).long()

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)
        # embed positions
        if attention_mask is None:
            attention_mask = torch.ones(
                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
            )
        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
        )

        hidden_states = inputs_embeds

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module, layer_past):
                    # `layer_past` is bound per layer here: the checkpointed function runs again during the
                    # backward pass, after the loop variable has moved on to later layers.
                    def custom_forward(*inputs):
                        return module(*inputs, layer_past, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer, past_key_value),
                    hidden_states,
                    attention_mask,
                    position_ids,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache entry comes after the attention weights when those are returned too.
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
735
+
736
+
737
class LlamaForCausalLM(LlamaPreTrainedModel):
    """LLaMA decoder (`LlamaModel`) with a bias-free linear language-modeling head on top."""

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = LlamaModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module of the wrapped decoder."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module of the wrapped decoder."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped decoder."""
        return self.model

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size - 1]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
                ignored (masked), the loss is only computed for the tokens with labels in `[0, ...,
                config.vocab_size - 1]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LlamaForCausalLM

        >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        if self.config.pretraining_tp > 1:
            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
            # Use every slice: when the vocabulary is not an exact multiple of `pretraining_tp`, `split` returns
            # an extra remainder slice that must not be dropped, or the logits would lose vocabulary entries.
            logits = [F.linear(hidden_states, lm_head_slice) for lm_head_slice in lm_head_slices]
            logits = torch.cat(logits, dim=-1)
        else:
            logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        """Assemble the keyword arguments for one `forward` call during generation.

        Returns:
            A dict with `position_ids`, `past_key_values`, `use_cache`, `attention_mask` and either
            `inputs_embeds` (only when it is given and `past_key_values` is `None`) or `input_ids`.
        """
        if past_key_values:
            # With a cache present only the newest token is fed to the model.
            input_ids = input_ids[:, -1:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            # Masked slots get position 1 as a placeholder value.
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Select the batch entries `beam_idx` (dim 0) from every cached tensor of every layer."""
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )
899
+
900
+
901
@add_start_docstrings(
    """
    The LLaMa Model transformer with a sequence classification head on top (linear layer).

    [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    LLAMA_START_DOCSTRING,
)
class LlamaForSequenceClassification(LlamaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = LlamaModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module of the wrapped decoder."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module of the wrapped decoder."""
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # Index of the token right before the first pad token of each row. A row without any pad token
                # yields `0 - 1 = -1`; the modulo maps that explicitly onto the last position instead of
                # relying on negative-index wraparound.
                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1
                sequence_lengths = sequence_lengths % input_ids.shape[-1]
                sequence_lengths = sequence_lengths.to(logits.device)
            else:
                # Padding cannot be detected from embeddings, so the last position is used.
                sequence_lengths = -1

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                # Infer the problem type once from the label count and dtype and store it on the config.
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
output.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52fca37b0644034c97866966310cbe186ddfe356886e782f161524ebcc7cec97
3
+ size 6809306014
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 26032056320
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "pytorch_model-00006-of-00006.bin",
7
+ "model.embed_tokens.weight": "pytorch_model-00001-of-00006.bin",
8
+ "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
9
+ "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
10
+ "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
11
+ "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
12
+ "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
13
+ "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
14
+ "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
15
+ "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
16
+ "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
17
+ "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
18
+ "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
19
+ "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
20
+ "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
21
+ "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
22
+ "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
23
+ "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
24
+ "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
25
+ "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
26
+ "model.layers.10.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
27
+ "model.layers.10.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
28
+ "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
29
+ "model.layers.10.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
30
+ "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
31
+ "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
32
+ "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
33
+ "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
34
+ "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
35
+ "model.layers.11.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
36
+ "model.layers.11.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
37
+ "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
38
+ "model.layers.11.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
39
+ "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
40
+ "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
41
+ "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
42
+ "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
43
+ "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
44
+ "model.layers.12.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
45
+ "model.layers.12.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
46
+ "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
47
+ "model.layers.12.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
48
+ "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
49
+ "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
50
+ "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
51
+ "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
52
+ "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
53
+ "model.layers.13.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
54
+ "model.layers.13.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
55
+ "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
56
+ "model.layers.13.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
57
+ "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
58
+ "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
59
+ "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
60
+ "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
61
+ "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
62
+ "model.layers.14.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
63
+ "model.layers.14.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
64
+ "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
65
+ "model.layers.14.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
66
+ "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
67
+ "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
68
+ "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
69
+ "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
70
+ "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
71
+ "model.layers.15.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
72
+ "model.layers.15.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
73
+ "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
74
+ "model.layers.15.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
75
+ "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
76
+ "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
77
+ "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
78
+ "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
79
+ "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
80
+ "model.layers.16.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
81
+ "model.layers.16.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
82
+ "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
83
+ "model.layers.16.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
84
+ "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
85
+ "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
86
+ "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
87
+ "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
88
+ "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
89
+ "model.layers.17.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
90
+ "model.layers.17.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
91
+ "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
92
+ "model.layers.17.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
93
+ "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
94
+ "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
95
+ "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
96
+ "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
97
+ "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
98
+ "model.layers.18.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
99
+ "model.layers.18.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
100
+ "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
101
+ "model.layers.18.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
102
+ "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
103
+ "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
104
+ "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
105
+ "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
106
+ "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
107
+ "model.layers.19.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
108
+ "model.layers.19.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
109
+ "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
110
+ "model.layers.19.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
111
+ "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
112
+ "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
113
+ "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
114
+ "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
115
+ "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
116
+ "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
117
+ "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
118
+ "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
119
+ "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
120
+ "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
121
+ "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
122
+ "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
123
+ "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
124
+ "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
125
+ "model.layers.20.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
126
+ "model.layers.20.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
127
+ "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
128
+ "model.layers.20.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
129
+ "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
130
+ "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
131
+ "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
132
+ "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
133
+ "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
134
+ "model.layers.21.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
135
+ "model.layers.21.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
136
+ "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
137
+ "model.layers.21.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
138
+ "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
139
+ "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
140
+ "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
141
+ "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
142
+ "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
143
+ "model.layers.22.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
144
+ "model.layers.22.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
145
+ "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
146
+ "model.layers.22.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
147
+ "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
148
+ "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
149
+ "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
150
+ "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
151
+ "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
152
+ "model.layers.23.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
153
+ "model.layers.23.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
154
+ "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
155
+ "model.layers.23.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
156
+ "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
157
+ "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
158
+ "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
159
+ "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
160
+ "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
161
+ "model.layers.24.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
162
+ "model.layers.24.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
163
+ "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
164
+ "model.layers.24.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
165
+ "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
166
+ "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
167
+ "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
168
+ "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
169
+ "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
170
+ "model.layers.25.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
171
+ "model.layers.25.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
172
+ "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
173
+ "model.layers.25.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
174
+ "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
175
+ "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
176
+ "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
177
+ "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
178
+ "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
179
+ "model.layers.26.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
180
+ "model.layers.26.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
181
+ "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
182
+ "model.layers.26.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
183
+ "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
184
+ "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
185
+ "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
186
+ "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
187
+ "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
188
+ "model.layers.27.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
189
+ "model.layers.27.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
190
+ "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
191
+ "model.layers.27.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
192
+ "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
193
+ "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
194
+ "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
195
+ "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
196
+ "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
197
+ "model.layers.28.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
198
+ "model.layers.28.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
199
+ "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
200
+ "model.layers.28.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
201
+ "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
202
+ "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
203
+ "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
204
+ "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
205
+ "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
206
+ "model.layers.29.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
207
+ "model.layers.29.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
208
+ "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
209
+ "model.layers.29.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
210
+ "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
211
+ "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
212
+ "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
213
+ "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
214
+ "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
215
+ "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
216
+ "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
217
+ "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
218
+ "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
219
+ "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
220
+ "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
221
+ "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
222
+ "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
223
+ "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
224
+ "model.layers.30.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
225
+ "model.layers.30.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
226
+ "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
227
+ "model.layers.30.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
228
+ "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
229
+ "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
230
+ "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
231
+ "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
232
+ "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
233
+ "model.layers.31.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
234
+ "model.layers.31.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
235
+ "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
236
+ "model.layers.31.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
237
+ "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
238
+ "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
239
+ "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
240
+ "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
241
+ "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
242
+ "model.layers.32.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
243
+ "model.layers.32.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
244
+ "model.layers.32.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
245
+ "model.layers.32.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
246
+ "model.layers.32.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
247
+ "model.layers.32.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
248
+ "model.layers.32.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
249
+ "model.layers.32.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
250
+ "model.layers.32.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
251
+ "model.layers.33.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
252
+ "model.layers.33.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
253
+ "model.layers.33.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
254
+ "model.layers.33.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
255
+ "model.layers.33.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
256
+ "model.layers.33.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
257
+ "model.layers.33.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
258
+ "model.layers.33.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
259
+ "model.layers.33.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
260
+ "model.layers.34.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
261
+ "model.layers.34.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
262
+ "model.layers.34.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
263
+ "model.layers.34.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
264
+ "model.layers.34.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
265
+ "model.layers.34.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
266
+ "model.layers.34.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
267
+ "model.layers.34.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
268
+ "model.layers.34.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
269
+ "model.layers.35.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
270
+ "model.layers.35.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
271
+ "model.layers.35.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
272
+ "model.layers.35.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
273
+ "model.layers.35.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
274
+ "model.layers.35.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
275
+ "model.layers.35.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
276
+ "model.layers.35.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
277
+ "model.layers.35.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
278
+ "model.layers.36.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
279
+ "model.layers.36.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
280
+ "model.layers.36.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
281
+ "model.layers.36.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
282
+ "model.layers.36.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
283
+ "model.layers.36.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
284
+ "model.layers.36.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
285
+ "model.layers.36.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
286
+ "model.layers.36.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
287
+ "model.layers.37.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
288
+ "model.layers.37.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
289
+ "model.layers.37.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
290
+ "model.layers.37.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
291
+ "model.layers.37.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
292
+ "model.layers.37.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
293
+ "model.layers.37.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
294
+ "model.layers.37.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
295
+ "model.layers.37.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
296
+ "model.layers.38.input_layernorm.weight": "pytorch_model-00006-of-00006.bin",
297
+ "model.layers.38.mlp.down_proj.weight": "pytorch_model-00006-of-00006.bin",
298
+ "model.layers.38.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
299
+ "model.layers.38.mlp.up_proj.weight": "pytorch_model-00006-of-00006.bin",
300
+ "model.layers.38.post_attention_layernorm.weight": "pytorch_model-00006-of-00006.bin",
301
+ "model.layers.38.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
302
+ "model.layers.38.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
303
+ "model.layers.38.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
304
+ "model.layers.38.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
305
+ "model.layers.39.input_layernorm.weight": "pytorch_model-00006-of-00006.bin",
306
+ "model.layers.39.mlp.down_proj.weight": "pytorch_model-00006-of-00006.bin",
307
+ "model.layers.39.mlp.gate_proj.weight": "pytorch_model-00006-of-00006.bin",
308
+ "model.layers.39.mlp.up_proj.weight": "pytorch_model-00006-of-00006.bin",
309
+ "model.layers.39.post_attention_layernorm.weight": "pytorch_model-00006-of-00006.bin",
310
+ "model.layers.39.self_attn.k_proj.weight": "pytorch_model-00006-of-00006.bin",
311
+ "model.layers.39.self_attn.o_proj.weight": "pytorch_model-00006-of-00006.bin",
312
+ "model.layers.39.self_attn.q_proj.weight": "pytorch_model-00006-of-00006.bin",
313
+ "model.layers.39.self_attn.v_proj.weight": "pytorch_model-00006-of-00006.bin",
314
+ "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
315
+ "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
316
+ "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
317
+ "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
318
+ "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
319
+ "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
320
+ "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
321
+ "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
322
+ "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
323
+ "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
324
+ "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
325
+ "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
326
+ "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
327
+ "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
328
+ "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
329
+ "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
330
+ "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
331
+ "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
332
+ "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
333
+ "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
334
+ "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
335
+ "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
336
+ "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
337
+ "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
338
+ "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
339
+ "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
340
+ "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
341
+ "model.layers.7.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
342
+ "model.layers.7.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
343
+ "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
344
+ "model.layers.7.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
345
+ "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
346
+ "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
347
+ "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
348
+ "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
349
+ "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
350
+ "model.layers.8.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
351
+ "model.layers.8.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
352
+ "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
353
+ "model.layers.8.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
354
+ "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
355
+ "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
356
+ "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
357
+ "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
358
+ "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
359
+ "model.layers.9.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
360
+ "model.layers.9.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
361
+ "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
362
+ "model.layers.9.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
363
+ "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
364
+ "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
365
+ "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
366
+ "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
367
+ "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
368
+ "model.norm.weight": "pytorch_model-00006-of-00006.bin"
369
+ }
370
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "▁<PRE>",
4
+ "▁<MID>",
5
+ "▁<SUF>",
6
+ "▁<EOT>"
7
+ ],
8
+ "bos_token": {
9
+ "content": "<s>",
10
+ "lstrip": false,
11
+ "normalized": false,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ },
15
+ "eos_token": {
16
+ "content": "</s>",
17
+ "lstrip": false,
18
+ "normalized": false,
19
+ "rstrip": false,
20
+ "single_word": false
21
+ },
22
+ "pad_token": "</s>",
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
3
+ size 500058
tokenizer_config.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "32007": {
30
+ "content": "▁<PRE>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "32008": {
38
+ "content": "▁<SUF>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "32009": {
46
+ "content": "▁<MID>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "32010": {
54
+ "content": "▁<EOT>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ }
61
+ },
62
+ "additional_special_tokens": [
63
+ "▁<PRE>",
64
+ "▁<MID>",
65
+ "▁<SUF>",
66
+ "▁<EOT>"
67
+ ],
68
+ "bos_token": "<s>",
69
+ "clean_up_tokenization_spaces": false,
70
+ "eos_token": "</s>",
71
+ "eot_token": "▁<EOT>",
72
+ "fill_token": "<FILL_ME>",
73
+ "legacy": null,
74
+ "middle_token": "▁<MID>",
75
+ "model_max_length": 1000000000000000019884624838656,
76
+ "pad_token": "</s>",
77
+ "prefix_token": "▁<PRE>",
78
+ "sp_model_kwargs": {},
79
+ "spaces_between_special_tokens": false,
80
+ "suffix_first": false,
81
+ "suffix_token": "▁<SUF>",
82
+ "tokenizer_class": "CodeLlamaTokenizer",
83
+ "trust_remote_code": false,
84
+ "unk_token": "<unk>",
85
+ "use_default_system_prompt": true,
86
+ "use_fast": true
87
+ }