shalomma committed on
Commit
a4cfa39
1 Parent(s): 23f852d
llama/__init__.py ADDED
@@ -0,0 +1,57 @@
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from typing import TYPE_CHECKING
+
+ from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+
+
+ _import_structure = {
+     "configuration_llama": ["LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LLaMAConfig"],
+     "tokenization_llama": ["LLaMATokenizer"],
+ }
+
+ try:
+     if not is_torch_available():
+         raise OptionalDependencyNotAvailable()
+ except OptionalDependencyNotAvailable:
+     pass
+ else:
+     _import_structure["modeling_llama"] = [
+         "LLaMAForCausalLM",
+         "LLaMAModel",
+         "LLaMAPreTrainedModel",
+     ]
+
+
+ if TYPE_CHECKING:
+     from .configuration_llama import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, LLaMAConfig
+     from .tokenization_llama import LLaMATokenizer
+
+     try:
+         if not is_torch_available():
+             raise OptionalDependencyNotAvailable()
+     except OptionalDependencyNotAvailable:
+         pass
+     else:
+         from .modeling_llama import (
+             LLaMAForCausalLM,
+             LLaMAModel,
+             LLaMAPreTrainedModel,
+         )
+
+
+ else:
+     import sys
+
+     sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
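For orientation, here is a minimal usage sketch of what this lazy `__init__.py` exposes, assuming the directory is importable as a `llama` package, torch is installed, and the weights/tokenizer have already been written by `convert_llama_weights_to_hf.py` (the `/output/path/...` locations are the placeholder paths used in that script, not real directories):

```python
# Hypothetical usage sketch; the paths are the conversion script's placeholder
# output locations and must be replaced with real directories.
from llama import LLaMAConfig, LLaMATokenizer, LLaMAForCausalLM

config = LLaMAConfig()  # defaults mirror LLaMA-7B (hidden_size=4096, 32 layers, 32 heads)
tokenizer = LLaMATokenizer.from_pretrained("/output/path/tokenizer/")
model = LLaMAForCausalLM.from_pretrained("/output/path/llama-7b/")
```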
llama/configuration_llama.py ADDED
@@ -0,0 +1,111 @@
+ # coding=utf-8
+ # Copyright 2022 The FAIR team of Meta AI and The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ LLaMA model configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+ class LLaMAConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`~LLaMAModel`]. It is used to instantiate a LLaMA
+     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+     defaults will yield a configuration similar to that of LLaMA-7B.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 32000):
+             Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`~LLaMAModel`] or [`~TFLLaMAModel`].
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 11008):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the Transformer decoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_sequence_length (`int`, *optional*, defaults to 2048):
+             Maximum sequence length the model supports (used for the RoPE computation).
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-6):
+             The epsilon used by the RMS normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether to tie the input and output word embeddings.
+     Example:
+
+     ```python
+     >>> from llama import LLaMAModel, LLaMAConfig
+
+     >>> # Initializing a LLaMA llama-7b style configuration
+     >>> configuration = LLaMAConfig()
+
+     >>> # Initializing a model from the llama-7b style configuration
+     >>> model = LLaMAModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+     model_type = "llama"
+
+     def __init__(
+         self,
+         vocab_size=32000,
+         hidden_size=4096,
+         intermediate_size=11008,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         hidden_act="silu",
+         max_sequence_length=2048,
+         initializer_range=0.02,
+         rms_norm_eps=1e-6,
+         use_cache=True,
+         pad_token_id=-1,
+         bos_token_id=0,
+         eos_token_id=1,
+         tie_word_embeddings=False,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.hidden_act = hidden_act
+         self.max_sequence_length = max_sequence_length
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
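As a rough sanity check on these defaults: the hyperparameters above imply a parameter count just under 7B, consistent with the LLaMA-7B sizing the docstring mentions. A back-of-the-envelope sketch (illustrative only; it ignores small terms such as the RMSNorm weights):

```python
# Approximate parameter count from the LLaMAConfig defaults above (illustration, not exact).
vocab_size, hidden, intermediate, layers = 32000, 4096, 11008, 32

embeddings = vocab_size * hidden          # input embedding matrix
lm_head = vocab_size * hidden             # untied output projection (tie_word_embeddings=False)
attention = 4 * hidden * hidden           # q_proj, k_proj, v_proj, o_proj per layer
mlp = 3 * hidden * intermediate           # w1, w2, w3 per layer

total = embeddings + lm_head + layers * (attention + mlp)
print(f"~{total / 1e9:.2f}B parameters")  # ~6.74B
```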
llama/convert_llama_weights_to_hf.py ADDED
@@ -0,0 +1,264 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ import shutil
5
+
6
+ import torch
7
+
8
+
9
+ """
10
+ Sample usage:
11
+
12
+ ```
13
+ python src/transformers/models/llama/convert_llama_weights_to_hf.py \
14
+ --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
15
+ ```
16
+
17
+ Thereafter, models can be loaded via:
18
+
19
+ ```
20
+ tokenizer = transformers.LLaMATokenizer.from_pretrained("/output/path/tokenizer/")
21
+
22
+ model = transformers.LLaMAForCausalLM.from_pretrained("/output/path/llama-7b/")
23
+ ```
24
+ """
25
+
26
+ INTERMEDIATE_SIZE_MAP = {
27
+ "7B": 11008,
28
+ "13B": 13824,
29
+ "30B": 17920,
30
+ "65B": 22016,
31
+ }
32
+ NUM_SHARDS = {
33
+ "7B": 1,
34
+ "13B": 2,
35
+ "30B": 4,
36
+ "65B": 8,
37
+ }
38
+
39
+
40
+ def read_json(path):
41
+ with open(path, "r") as f:
42
+ return json.loads(f.read())
43
+
44
+
45
+ def write_json(text, path):
46
+ with open(path, "w") as f:
47
+ f.write(json.dumps(text))
48
+
49
+
50
+ def write_model(model_path, input_base_path, model_size):
51
+ assert model_size in INTERMEDIATE_SIZE_MAP
52
+ os.makedirs(model_path, exist_ok=True)
53
+
54
+ params = read_json(os.path.join(input_base_path, "params.json"))
55
+ num_shards = NUM_SHARDS[model_size]
56
+ n_layers = params["n_layers"]
57
+ n_heads = params["n_heads"]
58
+ n_heads_per_shard = n_heads // num_shards
59
+ dim = params["dim"]
60
+ dims_per_head = dim // n_heads
61
+
62
+ # Load weights
63
+ if model_size == "7B":
64
+ # Not shared
65
+ # (The sharded implementation would also work, but this is simpler.)
66
+ loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu")
67
+ else:
68
+ # Sharded
69
+ loaded = [
70
+ torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu")
71
+ for i in range(num_shards)
72
+ ]
73
+ param_count = 0
74
+ index_dict = {"weight_map": {}}
75
+ for layer_i in range(n_layers):
76
+ filename = "pytorch_model-{:05d}-of-{:05d}.bin".format(
77
+ layer_i,
78
+ n_layers + 1,
79
+ )
80
+ if model_size == "7B":
81
+ # Unsharded
82
+ state_dict = {
83
+ f"model.decoder.layers.{layer_i}.self_attn.q_proj.weight": loaded[
84
+ f"layers.{layer_i}.attention.wq.weight"
85
+ ],
86
+ f"model.decoder.layers.{layer_i}.self_attn.k_proj.weight": loaded[
87
+ f"layers.{layer_i}.attention.wk.weight"
88
+ ],
89
+ f"model.decoder.layers.{layer_i}.self_attn.v_proj.weight": loaded[
90
+ f"layers.{layer_i}.attention.wv.weight"
91
+ ],
92
+ f"model.decoder.layers.{layer_i}.self_attn.o_proj.weight": loaded[
93
+ f"layers.{layer_i}.attention.wo.weight"
94
+ ],
95
+ f"model.decoder.layers.{layer_i}.feed_forward.w1.weight": loaded[
96
+ f"layers.{layer_i}.feed_forward.w1.weight"
97
+ ],
98
+ f"model.decoder.layers.{layer_i}.feed_forward.w2.weight": loaded[
99
+ f"layers.{layer_i}.feed_forward.w2.weight"
100
+ ],
101
+ f"model.decoder.layers.{layer_i}.feed_forward.w3.weight": loaded[
102
+ f"layers.{layer_i}.feed_forward.w3.weight"
103
+ ],
104
+ f"model.decoder.layers.{layer_i}.attention_norm.weight": loaded[
105
+ f"layers.{layer_i}.attention_norm.weight"
106
+ ],
107
+ f"model.decoder.layers.{layer_i}.ffn_norm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"],
108
+ }
109
+ else:
110
+ # Sharded
111
+ state_dict = {
112
+ f"model.decoder.layers.{layer_i}.attention_norm.weight": loaded[0][
113
+ f"layers.{layer_i}.attention_norm.weight"
114
+ ],
115
+ f"model.decoder.layers.{layer_i}.ffn_norm.weight": loaded[0][f"layers.{layer_i}.ffn_norm.weight"],
116
+ }
117
+ state_dict[f"model.decoder.layers.{layer_i}.self_attn.q_proj.weight"] = torch.cat(
118
+ [
119
+ loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim)
120
+ for i in range(num_shards)
121
+ ],
122
+ dim=0,
123
+ ).reshape(dim, dim)
124
+ state_dict[f"model.decoder.layers.{layer_i}.self_attn.k_proj.weight"] = torch.cat(
125
+ [
126
+ loaded[i][f"layers.{layer_i}.attention.wk.weight"].view(n_heads_per_shard, dims_per_head, dim)
127
+ for i in range(num_shards)
128
+ ],
129
+ dim=0,
130
+ ).reshape(dim, dim)
131
+ state_dict[f"model.decoder.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat(
132
+ [
133
+ loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(n_heads_per_shard, dims_per_head, dim)
134
+ for i in range(num_shards)
135
+ ],
136
+ dim=0,
137
+ ).reshape(dim, dim)
138
+
139
+ state_dict[f"model.decoder.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat(
140
+ [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1
141
+ )
142
+ state_dict[f"model.decoder.layers.{layer_i}.feed_forward.w1.weight"] = torch.cat(
143
+ [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0
144
+ )
145
+ state_dict[f"model.decoder.layers.{layer_i}.feed_forward.w2.weight"] = torch.cat(
146
+ [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1
147
+ )
148
+ state_dict[f"model.decoder.layers.{layer_i}.feed_forward.w3.weight"] = torch.cat(
149
+ [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0
150
+ )
151
+
152
+ for k, v in state_dict.items():
153
+ index_dict["weight_map"][k] = filename
154
+ param_count += v.numel()
155
+ torch.save(state_dict, os.path.join(model_path, filename))
156
+
157
+ filename = "pytorch_model-{:05d}-of-{:05d}.bin".format(
158
+ n_layers,
159
+ n_layers + 1,
160
+ )
161
+ if model_size == "7B":
162
+ # Unsharded
163
+ state_dict = {
164
+ "model.decoder.embed_tokens.weight": loaded["tok_embeddings.weight"],
165
+ "model.decoder.norm.weight": loaded["norm.weight"],
166
+ "lm_head.weight": loaded["output.weight"],
167
+ }
168
+ else:
169
+ state_dict = {
170
+ "model.decoder.norm.weight": loaded[0]["norm.weight"],
171
+ "model.decoder.embed_tokens.weight": torch.cat(
172
+ [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1
173
+ ),
174
+ "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0),
175
+ }
176
+
177
+ for k, v in state_dict.items():
178
+ index_dict["weight_map"][k] = filename
179
+ param_count += v.numel()
180
+ torch.save(state_dict, os.path.join(model_path, filename))
181
+
182
+ # Write configs
183
+ index_dict["metadata"] = {"total_size": param_count * 2}
184
+ write_json(index_dict, os.path.join(model_path, "pytorch_model.bin.index.json"))
185
+ config_out = {
186
+ "architectures": ["LLaMAForCausalLM"],
187
+ "bos_token_id": 0,
188
+ "eos_token_id": 1,
189
+ "hidden_act": "silu",
190
+ "hidden_size": params["dim"],
191
+ "intermediate_size": INTERMEDIATE_SIZE_MAP[model_size],
192
+ "initializer_range": 0.02,
193
+ "max_sequence_length": 2048,
194
+ "model_type": "llama",
195
+ "num_attention_heads": params["n_heads"],
196
+ "num_hidden_layers": params["n_layers"],
197
+ "pad_token_id": -1,
198
+ "rms_norm_eps": params["norm_eps"],
199
+ "torch_dtype": "float16",
200
+ "transformers_version": "4.27.0.dev0",
201
+ "use_cache": True,
202
+ "vocab_size": 32000,
203
+ }
204
+ write_json(
205
+ config_out,
206
+ os.path.join(model_path, "config.json"),
207
+ )
208
+ generation_config = {
209
+ "_from_model_config": True,
210
+ "bos_token_id": 0,
211
+ "eos_token_id": 1,
212
+ "pad_token_id": -1,
213
+ "transformers_version": "4.27.0.dev0",
214
+ }
215
+ write_json(
216
+ generation_config,
217
+ os.path.join(model_path, "generation_config.json"),
218
+ )
219
+
220
+
221
+ def write_tokenizer(tokenizer_path, input_tokenizer_path):
222
+ os.makedirs(tokenizer_path, exist_ok=True)
223
+ write_json({}, os.path.join(tokenizer_path, "special_tokens_map.json"))
224
+ write_json(
225
+ {
226
+ "bos_token": "",
227
+ "eos_token": "",
228
+ "model_max_length": int(1e30),
229
+ "tokenizer_class": "LLaMATokenizer",
230
+ "unk_token": "",
231
+ },
232
+ os.path.join(tokenizer_path, "tokenizer_config.json"),
233
+ )
234
+ shutil.copyfile(input_tokenizer_path, os.path.join(tokenizer_path, "tokenizer.model"))
235
+
236
+
237
+ def main():
238
+ parser = argparse.ArgumentParser()
239
+ parser.add_argument(
240
+ "--input_dir",
241
+ help="Location of LLaMA weights, which contains tokenizer.model and model folders",
242
+ )
243
+ parser.add_argument(
244
+ "--model_size",
245
+ choices=["7B", "13B", "30B", "65B"],
246
+ )
247
+ parser.add_argument(
248
+ "--output_dir",
249
+ help="Location to write HF model and tokenizer",
250
+ )
251
+ args = parser.parse_args()
252
+ write_model(
253
+ model_path=os.path.join(args.output_dir, "llama-{}".format(args.model_size).lower()),
254
+ input_base_path=os.path.join(args.input_dir, args.model_size),
255
+ model_size=args.model_size,
256
+ )
257
+ write_tokenizer(
258
+ tokenizer_path=os.path.join(args.output_dir, "tokenizer"),
259
+ input_tokenizer_path=os.path.join(args.input_dir, "tokenizer.model"),
260
+ )
261
+
262
+
263
+ if __name__ == "__main__":
264
+ main()
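The least obvious part of `write_model` above is how the sharded checkpoints are re-assembled: projections that the original tensor-parallel checkpoints split along the output dimension (`wq`, `wk`, `wv`, `w1`, `w3`, `output`) are concatenated on `dim=0`, while those split along the input dimension (`wo`, `w2`, and `tok_embeddings`) are concatenated on `dim=1`. A toy sketch of why those dims recover the full matrices (sizes are arbitrary illustration values, not the real 7B/13B shapes):

```python
import torch

# Toy illustration of the shard re-assembly in write_model (made-up sizes).
dim, num_shards = 8, 2

# Column-parallel weight: split on the output dimension, so rejoin on dim=0.
full_wq = torch.randn(dim, dim)
wq_shards = full_wq.chunk(num_shards, dim=0)   # what each consolidated.XX.pth would hold
assert torch.equal(torch.cat(wq_shards, dim=0), full_wq)

# Row-parallel weight: split on the input dimension, so rejoin on dim=1.
full_wo = torch.randn(dim, dim)
wo_shards = full_wo.chunk(num_shards, dim=1)
assert torch.equal(torch.cat(wo_shards, dim=1), full_wo)
```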
llama/modeling_llama.py ADDED
@@ -0,0 +1,941 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch LLaMA model."""
16
+ import math
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.utils.checkpoint
21
+ from torch import nn
22
+ from torch.nn import CrossEntropyLoss
23
+
24
+ from transformers.activations import ACT2FN
25
+ from transformers.modeling_outputs import (
26
+ BaseModelOutputWithPast,
27
+ CausalLMOutputWithPast,
28
+ )
29
+ from transformers.modeling_utils import PreTrainedModel
30
+ from transformers.utils import (
31
+ add_code_sample_docstrings,
32
+ add_start_docstrings,
33
+ add_start_docstrings_to_model_forward,
34
+ logging,
35
+ replace_return_docstrings,
36
+ )
37
+ from .configuration_llama import LLaMAConfig
38
+
39
+
40
+ logger = logging.get_logger(__name__)
41
+
42
+ _CHECKPOINT_FOR_DOC = "llama-7b"
43
+ _CONFIG_FOR_DOC = "LLaMAConfig"
44
+
45
+
46
+ def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
47
+ """
48
+ Make causal mask used for uni-directional (decoder) self-attention.
49
+ """
50
+ bsz, tgt_len = input_ids_shape
51
+ mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
52
+ mask_cond = torch.arange(mask.size(-1))
53
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
54
+ mask = mask.to(dtype)
55
+
56
+ if past_key_values_length > 0:
57
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
58
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
59
+
60
+
61
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
62
+ """
63
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
64
+ """
65
+ bsz, src_len = mask.size()
66
+ tgt_len = tgt_len if tgt_len is not None else src_len
67
+
68
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
69
+
70
+ inverted_mask = 1.0 - expanded_mask
71
+
72
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
73
+
74
+
75
+ class RMSNorm(torch.nn.Module):
76
+ def __init__(self, dim: int, eps: float = 1e-6):
77
+ super().__init__()
78
+ self.eps = eps
79
+ self.weight = nn.Parameter(torch.ones(dim))
80
+
81
+ def _norm(self, x):
82
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
83
+
84
+ def forward(self, x):
85
+ output = self._norm(x.float()).type_as(x)
86
+ return output * self.weight
87
+
88
+
89
+ class LLaMAFeedForward(nn.Module):
90
+ def __init__(
91
+ self,
92
+ hidden_size: int,
93
+ intermediate_size: int,
94
+ hidden_act: str,
95
+ ):
96
+ super().__init__()
97
+ self.w1 = nn.Linear(hidden_size, intermediate_size, bias=False)
98
+ self.w2 = nn.Linear(intermediate_size, hidden_size, bias=False)
99
+ self.w3 = nn.Linear(hidden_size, intermediate_size, bias=False)
100
+ self.act_fn = ACT2FN[hidden_act]
101
+
102
+ def forward(self, x):
103
+ return self.w2(self.act_fn(self.w1(x)) * self.w3(x))
104
+
105
+
106
+ class LLaMAAttention(nn.Module):
107
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
108
+
109
+ def __init__(
110
+ self,
111
+ hidden_size: int,
112
+ num_heads: int,
113
+ complex_frequencies: torch.Tensor,
114
+ ):
115
+ super().__init__()
116
+ self.hidden_size = hidden_size
117
+ self.num_heads = num_heads
118
+ self.head_dim = hidden_size // num_heads
119
+
120
+ if (self.head_dim * num_heads) != self.hidden_size:
121
+ raise ValueError(
122
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
123
+ f" and `num_heads`: {num_heads})."
124
+ )
125
+ self.q_proj = nn.Linear(
126
+ hidden_size,
127
+ num_heads * self.head_dim,
128
+ bias=False,
129
+ )
130
+ self.k_proj = nn.Linear(
131
+ hidden_size,
132
+ num_heads * self.head_dim,
133
+ bias=False,
134
+ )
135
+ self.v_proj = nn.Linear(
136
+ hidden_size,
137
+ num_heads * self.head_dim,
138
+ bias=False,
139
+ )
140
+ self.o_proj = nn.Linear(
141
+ num_heads * self.head_dim,
142
+ hidden_size,
143
+ bias=False,
144
+ )
145
+ self.complex_frequencies = complex_frequencies
146
+
147
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
148
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
149
+
150
+ def forward(
151
+ self,
152
+ hidden_states: torch.Tensor,
153
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
154
+ attention_mask: Optional[torch.Tensor] = None,
155
+ layer_head_mask: Optional[torch.Tensor] = None,
156
+ output_attentions: bool = False,
157
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
158
+ """Input shape: Batch x Time x Channel"""
159
+
160
+ self.complex_frequencies = self.complex_frequencies.to(hidden_states.device)
161
+
162
+ bsz, tgt_len, _ = hidden_states.size()
163
+
164
+ # get query proj
165
+ query_states = self.q_proj(hidden_states).view(bsz, tgt_len, self.num_heads, self.head_dim)
166
+ key_states = self.k_proj(hidden_states).view(bsz, tgt_len, self.num_heads, self.head_dim)
167
+ value_states = self.v_proj(hidden_states).view(bsz, tgt_len, self.num_heads, self.head_dim)
168
+
169
+ if past_key_value is not None:
170
+ start = past_key_value[0].shape[2]
171
+ else:
172
+ start = 0
173
+
174
+ sliced_complex_frequencies = self.complex_frequencies[start : start + tgt_len]
175
+ query_states, key_states = apply_rotary_emb(
176
+ query_states=query_states, key_states=key_states, complex_frequencies=sliced_complex_frequencies
177
+ )
178
+
179
+ # get key, value proj
180
+ key_states = self._shape(key_states, -1, bsz)
181
+ value_states = self._shape(value_states, -1, bsz)
182
+ if past_key_value is not None:
183
+ # reuse k, v, self_attention
184
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
185
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
186
+
187
+ past_key_value = (key_states, value_states)
188
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
189
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
190
+ key_states = key_states.view(*proj_shape)
191
+ value_states = value_states.view(*proj_shape)
192
+
193
+ src_len = key_states.size(1)
194
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) / math.sqrt(self.head_dim)
195
+
196
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
197
+ raise ValueError(
198
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
199
+ f" {attn_weights.size()}"
200
+ )
201
+
202
+ if attention_mask is not None:
203
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
204
+ raise ValueError(
205
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
206
+ )
207
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
208
+ attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
209
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
210
+
211
+ # upcast to fp32 if the weights are in fp16. Please see https://github.com/huggingface/transformers/pull/17437
212
+ if attn_weights.dtype == torch.float16:
213
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(torch.float16)
214
+ else:
215
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
216
+
217
+ if layer_head_mask is not None:
218
+ if layer_head_mask.size() != (self.num_heads,):
219
+ raise ValueError(
220
+ f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
221
+ f" {layer_head_mask.size()}"
222
+ )
223
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
224
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
225
+
226
+ if output_attentions:
227
+ # this operation is a bit awkward, but it's required to
228
+ # make sure that attn_weights keeps its gradient.
229
+ # In order to do so, attn_weights have to be reshaped
230
+ # twice and have to be reused in the following
231
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
232
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
233
+ else:
234
+ attn_weights_reshaped = None
235
+
236
+ attn_output = torch.bmm(attn_weights, value_states)
237
+
238
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
239
+ raise ValueError(
240
+ f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
241
+ f" {attn_output.size()}"
242
+ )
243
+
244
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
245
+ attn_output = attn_output.transpose(1, 2)
246
+
247
+ attn_output = attn_output.reshape(bsz, tgt_len, self.hidden_size)
248
+
249
+ attn_output = self.o_proj(attn_output)
250
+
251
+ return attn_output, attn_weights_reshaped, past_key_value
252
+
253
+
254
+ class LLaMADecoderLayer(nn.Module):
255
+ def __init__(self, config: LLaMAConfig):
256
+ super().__init__()
257
+ self.hidden_size = config.hidden_size
258
+ complex_frequencies = precompute_complex_frequencies(
259
+ head_dim=self.hidden_size // config.num_attention_heads,
260
+ length=config.max_sequence_length * 2,
261
+ )
262
+ self.self_attn = LLaMAAttention(
263
+ hidden_size=self.hidden_size,
264
+ num_heads=config.num_attention_heads,
265
+ complex_frequencies=complex_frequencies,
266
+ )
267
+ self.feed_forward = LLaMAFeedForward(
268
+ hidden_size=self.hidden_size,
269
+ intermediate_size=config.intermediate_size,
270
+ hidden_act=config.hidden_act,
271
+ )
272
+ self.attention_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
273
+ self.ffn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
274
+
275
+ def forward(
276
+ self,
277
+ hidden_states: torch.Tensor,
278
+ attention_mask: Optional[torch.Tensor] = None,
279
+ layer_head_mask: Optional[torch.Tensor] = None,
280
+ output_attentions: Optional[bool] = False,
281
+ use_cache: Optional[bool] = False,
282
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
283
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
284
+ """
285
+ Args:
286
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
287
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
288
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
289
+ layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size
290
+ `(encoder_attention_heads,)`.
291
+ output_attentions (`bool`, *optional*):
292
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
293
+ returned tensors for more detail.
294
+ use_cache (`bool`, *optional*):
295
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
296
+ (see `past_key_values`).
297
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
298
+ """
299
+
300
+ residual = hidden_states
301
+
302
+ hidden_states = self.attention_norm(hidden_states)
303
+
304
+ # Self Attention
305
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
306
+ hidden_states=hidden_states,
307
+ past_key_value=past_key_value,
308
+ attention_mask=attention_mask,
309
+ layer_head_mask=layer_head_mask,
310
+ output_attentions=output_attentions,
311
+ )
312
+ hidden_states = residual + hidden_states
313
+
314
+ # Fully Connected
315
+ residual = hidden_states
316
+ hidden_states = self.ffn_norm(hidden_states)
317
+ hidden_states = self.feed_forward(hidden_states)
318
+ hidden_states = residual + hidden_states
319
+
320
+ outputs = (hidden_states,)
321
+
322
+ if output_attentions:
323
+ outputs += (self_attn_weights,)
324
+
325
+ if use_cache:
326
+ outputs += (present_key_value,)
327
+
328
+ return outputs
329
+
330
+
331
+ LLAMA_START_DOCSTRING = r"""
332
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
333
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
334
+ etc.)
335
+
336
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
337
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
338
+ and behavior.
339
+
340
+ Parameters:
341
+ config ([`LLaMAConfig`]):
342
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
343
+ load the weights associated with the model, only the configuration. Check out the
344
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
345
+ """
346
+
347
+
348
+ @add_start_docstrings(
349
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
350
+ LLAMA_START_DOCSTRING,
351
+ )
352
+ class LLaMAPreTrainedModel(PreTrainedModel):
353
+ config_class = LLaMAConfig
354
+ base_model_prefix = "model"
355
+ supports_gradient_checkpointing = True
356
+ _no_split_modules = ["LLaMADecoderLayer"]
357
+ _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
358
+
359
+ def _init_weights(self, module):
360
+ std = self.config.initializer_range
361
+ if isinstance(module, nn.Linear):
362
+ module.weight.data.normal_(mean=0.0, std=std)
363
+ if module.bias is not None:
364
+ module.bias.data.zero_()
365
+ elif isinstance(module, nn.Embedding):
366
+ module.weight.data.normal_(mean=0.0, std=std)
367
+ if module.padding_idx is not None:
368
+ module.weight.data[module.padding_idx].zero_()
369
+
370
+ def _set_gradient_checkpointing(self, module, value=False):
371
+ if isinstance(module, (LLaMADecoder)):
372
+ module.gradient_checkpointing = value
373
+
374
+
375
+ LLAMA_INPUTS_DOCSTRING = r"""
376
+ Args:
377
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
378
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
379
+ it.
380
+
381
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
382
+ [`PreTrainedTokenizer.__call__`] for details.
383
+
384
+ [What are input IDs?](../glossary#input-ids)
385
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
386
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
387
+
388
+ - 1 for tokens that are **not masked**,
389
+ - 0 for tokens that are **masked**.
390
+
391
+ [What are attention masks?](../glossary#attention-mask)
392
+
393
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
394
+ [`PreTrainedTokenizer.__call__`] for details.
395
+
396
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
397
+ `past_key_values`).
398
+
399
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
400
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
401
+ information on the default strategy.
402
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
403
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
404
+
405
+ - 1 indicates the head is **not masked**,
406
+ - 0 indicates the head is **masked**.
407
+
408
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
409
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
410
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
411
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
412
+
413
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
414
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
415
+
416
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
417
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
418
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
419
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
420
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
421
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
422
+ model's internal embedding lookup matrix.
423
+ use_cache (`bool`, *optional*):
424
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
425
+ `past_key_values`).
426
+ output_attentions (`bool`, *optional*):
427
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
428
+ tensors for more detail.
429
+ output_hidden_states (`bool`, *optional*):
430
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
431
+ more detail.
432
+ return_dict (`bool`, *optional*):
433
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
434
+ """
435
+
436
+
437
+ class LLaMADecoder(LLaMAPreTrainedModel):
438
+ """
439
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LLaMADecoderLayer`]
440
+
441
+ Args:
442
+ config: LLaMAConfig
443
+ """
444
+
445
+ def __init__(self, config: LLaMAConfig):
446
+ super().__init__(config)
447
+ self.padding_idx = config.pad_token_id
448
+
449
+ self.vocab_size = config.vocab_size
450
+
451
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
452
+
453
+ self.layers = nn.ModuleList([LLaMADecoderLayer(config) for _ in range(config.num_hidden_layers)])
454
+ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
455
+
456
+ self.gradient_checkpointing = False
457
+ # Initialize weights and apply final processing
458
+ self.post_init()
459
+
460
+ def get_input_embeddings(self):
461
+ return self.embed_tokens
462
+
463
+ def set_input_embeddings(self, value):
464
+ self.embed_tokens = value
465
+
466
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
467
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
468
+ # create causal mask
469
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
470
+ combined_attention_mask = None
471
+ if input_shape[-1] > 1:
472
+ combined_attention_mask = _make_causal_mask(
473
+ input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
474
+ ).to(inputs_embeds.device)
475
+
476
+ if attention_mask is not None:
477
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
478
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
479
+ inputs_embeds.device
480
+ )
481
+ combined_attention_mask = (
482
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
483
+ )
484
+
485
+ return combined_attention_mask
486
+
487
+ def forward(
488
+ self,
489
+ input_ids: torch.LongTensor = None,
490
+ attention_mask: Optional[torch.Tensor] = None,
491
+ head_mask: Optional[torch.Tensor] = None,
492
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
493
+ inputs_embeds: Optional[torch.FloatTensor] = None,
494
+ use_cache: Optional[bool] = None,
495
+ output_attentions: Optional[bool] = None,
496
+ output_hidden_states: Optional[bool] = None,
497
+ return_dict: Optional[bool] = None,
498
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
499
+ r"""
500
+ Args:
501
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
502
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
503
+ provide it.
504
+
505
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
506
+ [`PreTrainedTokenizer.__call__`] for details.
507
+
508
+ [What are input IDs?](../glossary#input-ids)
509
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
510
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
511
+
512
+ - 1 for tokens that are **not masked**,
513
+ - 0 for tokens that are **masked**.
514
+
515
+ [What are attention masks?](../glossary#attention-mask)
516
+ head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
517
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
518
+
519
+ - 1 indicates the head is **not masked**,
520
+ - 0 indicates the head is **masked**.
521
+
522
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
523
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
524
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
525
+
526
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
527
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
528
+
529
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
530
+ that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
531
+ all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
532
+
533
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
534
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
535
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
536
+ than the model's internal embedding lookup matrix.
537
+ output_attentions (`bool`, *optional*):
538
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
539
+ returned tensors for more detail.
540
+ output_hidden_states (`bool`, *optional*):
541
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
542
+ for more detail.
543
+ return_dict (`bool`, *optional*):
544
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
545
+ """
546
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
547
+ output_hidden_states = (
548
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
549
+ )
550
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
551
+
552
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
553
+
554
+ # retrieve input_ids and inputs_embeds
555
+ if input_ids is not None and inputs_embeds is not None:
556
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
557
+ elif input_ids is not None:
558
+ input_shape = input_ids.size()
559
+ input_ids = input_ids.view(-1, input_shape[-1])
560
+ elif inputs_embeds is not None:
561
+ input_shape = inputs_embeds.size()[:-1]
562
+ else:
563
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
564
+
565
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
566
+
567
+ if inputs_embeds is None:
568
+ inputs_embeds = self.embed_tokens(input_ids)
569
+
570
+ # embed positions
571
+ if attention_mask is None:
572
+ attention_mask = torch.ones(inputs_embeds.shape[:2], dtype=torch.bool, device=inputs_embeds.device)
573
+
574
+ attention_mask = self._prepare_decoder_attention_mask(
575
+ attention_mask, input_shape, inputs_embeds, past_key_values_length
576
+ )
577
+
578
+ hidden_states = inputs_embeds
579
+
580
+ if self.gradient_checkpointing and self.training:
581
+ if use_cache:
582
+ logger.warning_once(
583
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
584
+ )
585
+ use_cache = False
586
+
587
+ # decoder layers
588
+ all_hidden_states = () if output_hidden_states else None
589
+ all_self_attns = () if output_attentions else None
590
+ next_decoder_cache = () if use_cache else None
591
+
592
+ # check if head_mask has a correct number of layers specified if desired
593
+ for attn_mask, mask_name in zip([head_mask], ["head_mask"]):
594
+ if attn_mask is not None:
595
+ if attn_mask.size()[0] != (len(self.layers)):
596
+ raise ValueError(
597
+ f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
598
+ f" {head_mask.size()[0]}."
599
+ )
600
+
601
+ for idx, decoder_layer in enumerate(self.layers):
602
+ if output_hidden_states:
603
+ all_hidden_states += (hidden_states,)
604
+
605
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
606
+
607
+ if self.gradient_checkpointing and self.training:
608
+
609
+ def create_custom_forward(module):
610
+ def custom_forward(*inputs):
611
+ # None for past_key_value
612
+ return module(*inputs, output_attentions, None)
613
+
614
+ return custom_forward
615
+
616
+ layer_outputs = torch.utils.checkpoint.checkpoint(
617
+ create_custom_forward(decoder_layer),
618
+ hidden_states,
619
+ attention_mask,
620
+ head_mask[idx] if head_mask is not None else None,
621
+ None,
622
+ )
623
+ else:
624
+ layer_outputs = decoder_layer(
625
+ hidden_states,
626
+ attention_mask=attention_mask,
627
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
628
+ past_key_value=past_key_value,
629
+ output_attentions=output_attentions,
630
+ use_cache=use_cache,
631
+ )
632
+
633
+ hidden_states = layer_outputs[0]
634
+
635
+ if use_cache:
636
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
637
+
638
+ if output_attentions:
639
+ all_self_attns += (layer_outputs[1],)
640
+
641
+ hidden_states = self.norm(hidden_states)
642
+
643
+ # add hidden states from the last decoder layer
644
+ if output_hidden_states:
645
+ all_hidden_states += (hidden_states,)
646
+
647
+ next_cache = next_decoder_cache if use_cache else None
648
+ if not return_dict:
649
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
650
+ return BaseModelOutputWithPast(
651
+ last_hidden_state=hidden_states,
652
+ past_key_values=next_cache,
653
+ hidden_states=all_hidden_states,
654
+ attentions=all_self_attns,
655
+ )
656
+
657
+
658
+ @add_start_docstrings(
659
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
660
+ LLAMA_START_DOCSTRING,
661
+ )
662
+ class LLaMAModel(LLaMAPreTrainedModel):
663
+ def __init__(self, config: LLaMAConfig):
664
+ super().__init__(config)
665
+ self.decoder = LLaMADecoder(config)
666
+ # Initialize weights and apply final processing
667
+ self.post_init()
668
+
669
+ def get_input_embeddings(self):
670
+ return self.decoder.embed_tokens
671
+
672
+ def set_input_embeddings(self, value):
673
+ self.decoder.embed_tokens = value
674
+
675
+ def get_decoder(self):
676
+ return self.decoder
677
+
678
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
679
+ @add_code_sample_docstrings(
680
+ checkpoint=_CHECKPOINT_FOR_DOC,
681
+ output_type=BaseModelOutputWithPast,
682
+ config_class=_CONFIG_FOR_DOC,
683
+ )
684
+ def forward(
685
+ self,
686
+ input_ids: torch.LongTensor = None,
687
+ attention_mask: Optional[torch.Tensor] = None,
688
+ head_mask: Optional[torch.Tensor] = None,
689
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
690
+ inputs_embeds: Optional[torch.FloatTensor] = None,
691
+ use_cache: Optional[bool] = None,
692
+ output_attentions: Optional[bool] = None,
693
+ output_hidden_states: Optional[bool] = None,
694
+ return_dict: Optional[bool] = None,
695
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
696
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
697
+ output_hidden_states = (
698
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
699
+ )
700
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
701
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
702
+
703
+ # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
704
+ decoder_outputs = self.decoder(
705
+ input_ids=input_ids,
706
+ attention_mask=attention_mask,
707
+ head_mask=head_mask,
708
+ past_key_values=past_key_values,
709
+ inputs_embeds=inputs_embeds,
710
+ use_cache=use_cache,
711
+ output_attentions=output_attentions,
712
+ output_hidden_states=output_hidden_states,
713
+ return_dict=return_dict,
714
+ )
715
+
716
+ if not return_dict:
717
+ return decoder_outputs
718
+
719
+ return BaseModelOutputWithPast(
720
+ last_hidden_state=decoder_outputs.last_hidden_state,
721
+ past_key_values=decoder_outputs.past_key_values,
722
+ hidden_states=decoder_outputs.hidden_states,
723
+ attentions=decoder_outputs.attentions,
724
+ )
725
+
726
+
727
+ class LLaMAForCausalLM(LLaMAPreTrainedModel):
728
+ _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
729
+
730
+ def __init__(self, config):
731
+ super().__init__(config)
732
+ self.model = LLaMAModel(config)
733
+
734
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
735
+
736
+ # Initialize weights and apply final processing
737
+ self.post_init()
738
+
739
+ def get_input_embeddings(self):
740
+ return self.model.decoder.embed_tokens
741
+
742
+ def set_input_embeddings(self, value):
743
+ self.model.decoder.embed_tokens = value
744
+
745
+ def get_output_embeddings(self):
746
+ return self.lm_head
747
+
748
+ def set_output_embeddings(self, new_embeddings):
749
+ self.lm_head = new_embeddings
750
+
751
+ def set_decoder(self, decoder):
752
+ self.model.decoder = decoder
753
+
754
+ def get_decoder(self):
755
+ return self.model.decoder
756
+
757
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
758
+ def forward(
759
+ self,
760
+ input_ids: torch.LongTensor = None,
761
+ attention_mask: Optional[torch.Tensor] = None,
762
+ head_mask: Optional[torch.Tensor] = None,
763
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
764
+ inputs_embeds: Optional[torch.FloatTensor] = None,
765
+ labels: Optional[torch.LongTensor] = None,
766
+ use_cache: Optional[bool] = None,
767
+ output_attentions: Optional[bool] = None,
768
+ output_hidden_states: Optional[bool] = None,
769
+ return_dict: Optional[bool] = None,
770
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
771
+ r"""
772
+ Args:
773
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
774
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
775
+ provide it.
776
+
777
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
778
+ [`PreTrainedTokenizer.__call__`] for details.
779
+
780
+ [What are input IDs?](../glossary#input-ids)
781
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
782
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
783
+
784
+ - 1 for tokens that are **not masked**,
785
+ - 0 for tokens that are **masked**.
786
+
787
+ [What are attention masks?](../glossary#attention-mask)
788
+ head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
789
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
790
+
791
+ - 1 indicates the head is **not masked**,
792
+ - 0 indicates the head is **masked**.
793
+
794
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
795
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
796
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
797
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
798
+ tensors are only required when the model is used as a decoder in a Sequence to Sequence model.
799
+
800
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
801
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
802
+
803
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
804
+ that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
805
+ all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
806
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
807
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
808
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
809
+ than the model's internal embedding lookup matrix.
810
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
811
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
812
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
813
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
814
+ use_cache (`bool`, *optional*):
815
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
816
+ (see `past_key_values`).
817
+ output_attentions (`bool`, *optional*):
818
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
819
+ returned tensors for more detail.
820
+ output_hidden_states (`bool`, *optional*):
821
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
822
+ for more detail.
823
+ return_dict (`bool`, *optional*):
824
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
825
+
826
+ Returns:
827
+
828
+ Example:
829
+
830
+ ```python
831
+ >>> from transformers import AutoTokenizer, LLaMAForCausalLM
832
+
833
+ >>> model = LLaMAForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
834
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
835
+
836
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
837
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
838
+
839
+ >>> # Generate
840
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
841
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
842
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
843
+ ```"""
844
+
845
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
846
+ output_hidden_states = (
847
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
848
+ )
849
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
850
+
851
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
852
+ outputs = self.model.decoder(
853
+ input_ids=input_ids,
854
+ attention_mask=attention_mask,
855
+ head_mask=head_mask,
856
+ past_key_values=past_key_values,
857
+ inputs_embeds=inputs_embeds,
858
+ use_cache=use_cache,
859
+ output_attentions=output_attentions,
860
+ output_hidden_states=output_hidden_states,
861
+ return_dict=return_dict,
862
+ )
863
+
864
+ logits = self.lm_head(outputs[0]).contiguous()
865
+
866
+ loss = None
867
+ if labels is not None:
868
+ # Shift so that tokens < n predict n
869
+ shift_logits = logits[..., :-1, :].contiguous()
870
+ shift_labels = labels[..., 1:].contiguous()
871
+ # Flatten the tokens
872
+ loss_fct = CrossEntropyLoss()
873
+ loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
874
+
875
+ if not return_dict:
876
+ output = (logits,) + outputs[1:]
877
+ return (loss,) + output if loss is not None else output
878
+
879
+ return CausalLMOutputWithPast(
880
+ loss=loss,
881
+ logits=logits,
882
+ past_key_values=outputs.past_key_values,
883
+ hidden_states=outputs.hidden_states,
884
+ attentions=outputs.attentions,
885
+ )
886
+
887
+ def prepare_inputs_for_generation(
888
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
889
+ ):
890
+ if past_key_values:
891
+ input_ids = input_ids[:, -1:]
892
+
893
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
894
+ if inputs_embeds is not None and past_key_values is None:
895
+ model_inputs = {"inputs_embeds": inputs_embeds}
896
+ else:
897
+ model_inputs = {"input_ids": input_ids}
898
+
899
+ model_inputs.update(
900
+ {
901
+ "past_key_values": past_key_values,
902
+ "use_cache": kwargs.get("use_cache"),
903
+ "attention_mask": attention_mask,
904
+ }
905
+ )
906
+ return model_inputs
907
+
908
+ @staticmethod
909
+ def _reorder_cache(past_key_values, beam_idx):
910
+ reordered_past = ()
911
+ for layer_past in past_key_values:
912
+ reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
913
+ return reordered_past
914
+
915
+
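Taken together, `prepare_inputs_for_generation` and `_reorder_cache` support incremental decoding: once `past_key_values` exists, only the newest token id is fed to the model, and during beam search the cache is re-indexed along the batch dimension so it stays aligned with the surviving beams. A rough illustration of the reordering on a dummy cache (the shapes are made up for the example; the real entries are `(key, value)` tensors of shape `(batch, num_heads, seq_len, head_dim)`):

```python
import torch

# Dummy cache: 2 layers, each holding (key, value) tensors of shape (batch=3, heads=2, seq=4, dim=8).
past_key_values = tuple(
    (torch.randn(3, 2, 4, 8), torch.randn(3, 2, 4, 8)) for _ in range(2)
)
beam_idx = torch.tensor([2, 2, 0])  # beams 0 and 1 both continue from old beam 2

reordered = tuple(
    tuple(state.index_select(0, beam_idx) for state in layer) for layer in past_key_values
)
assert torch.equal(reordered[0][0][0], past_key_values[0][0][2])
```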
916
+ def precompute_complex_frequencies(head_dim: int, length: int, theta: float = 10000.0):
917
+ frequencies = 1.0 / (theta ** (torch.arange(0, head_dim, 2)[: (head_dim // 2)].float() / head_dim))
918
+ t = torch.arange(length, device=frequencies.device)
919
+ frequencies = torch.outer(t, frequencies).float()
920
+ return torch.polar(torch.ones_like(frequencies), frequencies) # complex64
921
+
922
+
923
+ def apply_rotary_emb(
924
+ query_states: torch.Tensor,
925
+ key_states: torch.Tensor,
926
+ complex_frequencies: torch.Tensor,
927
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
928
+ query_states_complex = torch.view_as_complex(query_states.float().reshape(*query_states.shape[:-1], -1, 2))
929
+ key_states_complex = torch.view_as_complex(key_states.float().reshape(*key_states.shape[:-1], -1, 2))
930
+ complex_frequencies = reshape_for_broadcast(complex_frequencies, query_states_complex)
931
+ output_query_states = torch.view_as_real(query_states_complex * complex_frequencies).flatten(3)
932
+ output_key_states = torch.view_as_real(key_states_complex * complex_frequencies).flatten(3)
933
+ return output_query_states.type_as(query_states), output_key_states.type_as(key_states)
934
+
935
+
936
+ def reshape_for_broadcast(complex_frequencies: torch.Tensor, x: torch.Tensor):
937
+ ndim = x.ndim
938
+ assert 0 <= 1 < ndim
939
+ assert complex_frequencies.shape == (x.shape[1], x.shape[-1])
940
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
941
+ return complex_frequencies.view(*shape)
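The three module-level helpers above implement rotary position embeddings in complex form, following Meta's reference code: `precompute_complex_frequencies` builds a `(length, head_dim // 2)` table of unit complex numbers, `apply_rotary_emb` rotates each query/key pair by the angle for its position, and `reshape_for_broadcast` aligns the table with the sequence and head dimensions. A quick shape check, assuming queries and keys laid out as `(batch, seq_len, num_heads, head_dim)` and that the helpers are importable from this package's `modeling_llama` module:

```python
import torch

from llama.modeling_llama import apply_rotary_emb, precompute_complex_frequencies

batch, seq_len, num_heads, head_dim = 2, 6, 4, 16
query_states = torch.randn(batch, seq_len, num_heads, head_dim)
key_states = torch.randn(batch, seq_len, num_heads, head_dim)

# (seq_len, head_dim // 2) table of complex64 rotation factors.
complex_frequencies = precompute_complex_frequencies(head_dim, seq_len)
rotated_q, rotated_k = apply_rotary_emb(query_states, key_states, complex_frequencies)

assert rotated_q.shape == query_states.shape
assert rotated_k.shape == key_states.shape
```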
llama/tokenization_llama.py ADDED
@@ -0,0 +1,209 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 The FAIR team of Meta AI and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for LLaMA."""
16
+ import os
17
+ import re
18
+ from shutil import copyfile
19
+ from typing import Any, Dict, List, Optional, Tuple
20
+
21
+ import sentencepiece as spm
22
+
23
+ from transformers.tokenization_utils import PreTrainedTokenizer
24
+ from transformers.utils import logging
25
+
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+ VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
30
+
31
+ PRETRAINED_VOCAB_FILES_MAP = {}
32
+
33
+
34
+ class LLaMATokenizer(PreTrainedTokenizer):
35
+ """
36
+ Construct a LLaMA tokenizer, based on a SentencePiece BPE model.
37
+
38
+ Args:
39
+ vocab_file (`str`):
40
+ Path to the vocabulary file.
41
+ """
42
+
43
+ vocab_files_names = VOCAB_FILES_NAMES
44
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
45
+ model_input_names = ["input_ids", "attention_mask"]
46
+
47
+ def __init__(
48
+ self,
49
+ vocab_file,
50
+ unk_token="<unk>",
51
+ bos_token="<s>",
52
+ eos_token="</s>",
53
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
54
+ add_bos_token=False,
55
+ add_eos_token=False,
56
+ **kwargs,
57
+ ):
58
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
59
+ super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
60
+ self.vocab_file = vocab_file
61
+ self.add_bos_token = add_bos_token
62
+ self.add_eos_token = add_eos_token
63
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
64
+ self.sp_model.Load(vocab_file)
65
+
66
+ """ Initialisation"""
67
+
68
+ @property
69
+ def vocab_size(self):
70
+ """Returns vocab size"""
71
+ return self.sp_model.get_piece_size()
72
+
73
+ @property
74
+ def bos_token_id(self) -> Optional[int]:
75
+ return self.sp_model.bos_id()
76
+
77
+ @property
78
+ def eos_token_id(self) -> Optional[int]:
79
+ return self.sp_model.eos_id()
80
+
81
+ def get_vocab(self):
82
+ """Returns vocab as a dict"""
83
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
84
+ vocab.update(self.added_tokens_encoder)
85
+ return vocab
86
+
87
+ def _tokenize(self, text):
88
+ """Returns a tokenized string."""
89
+ return self.sp_model.encode(text, out_type=str)
90
+
91
+ def _convert_token_to_id(self, token):
92
+ """Converts a token (str) in an id using the vocab."""
93
+ return self.sp_model.piece_to_id(token)
94
+
95
+ def _convert_id_to_token(self, index):
96
+ """Converts an index (integer) in a token (str) using the vocab."""
97
+ token = self.sp_model.IdToPiece(index)
98
+ return token
99
+
100
+ def convert_tokens_to_string(self, tokens):
101
+ """Converts a sequence of tokens (string) in a single string."""
102
+ current_sub_tokens = []
103
+ out_string = ""
104
+ prev_is_special = False
105
+ for token in tokens:
106
+ # make sure that special tokens are not decoded using sentencepiece model
107
+ if token in self.all_special_tokens:
108
+ if not prev_is_special:
109
+ out_string += " "
110
+ out_string += self.sp_model.decode(current_sub_tokens) + token
111
+ prev_is_special = True
112
+ current_sub_tokens = []
113
+ else:
114
+ current_sub_tokens.append(token)
115
+ prev_is_special = False
116
+ out_string += self.sp_model.decode(current_sub_tokens)
117
+ return out_string.strip()
118
+
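`LLaMATokenizer` is a thin wrapper around a SentencePiece model, so basic use is load, encode, decode. A minimal sketch, assuming a converted `tokenizer.model` file at a placeholder path:

```python
from llama import LLaMATokenizer

# Placeholder path to the SentencePiece model produced by checkpoint conversion.
tokenizer = LLaMATokenizer(vocab_file="PATH_TO_CONVERTED_TOKENIZER/tokenizer.model")

ids = tokenizer("Hello, world!").input_ids   # pieces mapped to ids via _tokenize / _convert_token_to_id
text = tokenizer.decode(ids)                 # rebuilt through convert_tokens_to_string
print(tokenizer.vocab_size, ids, text)
```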
119
+ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
120
+ """
121
+ Save the vocabulary and special tokens file to a directory.
122
+
123
+ Args:
124
+ save_directory (`str`):
125
+ The directory in which to save the vocabulary.
126
+
127
+ Returns:
128
+ `Tuple(str)`: Paths to the files saved.
129
+ """
130
+ if not os.path.isdir(save_directory):
131
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
132
+ return
133
+ out_vocab_file = os.path.join(
134
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
135
+ )
136
+
137
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
138
+ copyfile(self.vocab_file, out_vocab_file)
139
+ elif not os.path.isfile(self.vocab_file):
140
+ with open(out_vocab_file, "wb") as fi:
141
+ content_spiece_model = self.sp_model.serialized_model_proto()
142
+ fi.write(content_spiece_model)
143
+
144
+ return (out_vocab_file,)
145
+
146
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
147
+ if self.add_bos_token:
148
+ bos_token_ids = [self.bos_token_id]
149
+ else:
150
+ bos_token_ids = []
151
+
152
+ output = bos_token_ids + token_ids_0
153
+
154
+ if token_ids_1 is not None:
155
+ output = output + token_ids_1
156
+
157
+ if self.add_eos_token:
158
+ output = output + [self.eos_token_id]
159
+
160
+ return output
161
+
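Because `add_bos_token` and `add_eos_token` default to `False`, `build_inputs_with_special_tokens` passes the ids through unchanged unless those flags are enabled. A small sketch of the behaviour with both flags on (again using a placeholder path for the SentencePiece model):

```python
from llama import LLaMATokenizer

tokenizer = LLaMATokenizer(
    vocab_file="PATH_TO_CONVERTED_TOKENIZER/tokenizer.model",
    add_bos_token=True,
    add_eos_token=True,
)

ids = tokenizer.encode("Hello, world!")  # encode() routes through build_inputs_with_special_tokens
assert ids[0] == tokenizer.bos_token_id
assert ids[-1] == tokenizer.eos_token_id
```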
162
+ def get_special_tokens_mask(
163
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
164
+ ) -> List[int]:
165
+ """
166
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
167
+ special tokens using the tokenizer `prepare_for_model` method.
168
+
169
+ Args:
170
+ token_ids_0 (`List[int]`):
171
+ List of IDs.
172
+ token_ids_1 (`List[int]`, *optional*):
173
+ Optional second list of IDs for sequence pairs.
174
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
175
+ Whether or not the token list is already formatted with special tokens for the model.
176
+
177
+ Returns:
178
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
179
+ """
180
+ if already_has_special_tokens:
181
+ return super().get_special_tokens_mask(
182
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
183
+ )
184
+
185
+ if token_ids_1 is None:
186
+ return [1] + ([0] * len(token_ids_0)) + [1]
187
+ return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
188
+
189
+ def create_token_type_ids_from_sequences(
190
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
191
+ ) -> List[int]:
192
+ """
193
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. LLaMA does not make
194
+ use of token type ids, therefore a list of zeros is returned.
195
+
196
+ Args:
197
+ token_ids_0 (`List[int]`):
198
+ List of IDs.
199
+ token_ids_1 (`List[int]`, *optional*):
200
+ Optional second list of IDs for sequence pairs.
201
+
202
+ Returns:
203
+ `List[int]`: List of zeros.
204
+ """
205
+ eos = [self.eos_token_id]
206
+
207
+ if token_ids_1 is None:
208
+ return len(token_ids_0 + eos) * [0]
209
+ return len(token_ids_0 + eos + token_ids_1 + eos) * [0]