visheratin committed
Commit 1ceb68b
1 Parent(s): 413f0fc

Upload folder using huggingface_hub
added_tokens.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "\t\t": 50294,
+   "\t\t\t": 50293,
+   "\t\t\t\t": 50292,
+   "\t\t\t\t\t": 50291,
+   "\t\t\t\t\t\t": 50290,
+   "\t\t\t\t\t\t\t": 50289,
+   "\t\t\t\t\t\t\t\t": 50288,
+   "\t\t\t\t\t\t\t\t\t": 50287,
+   "  ": 50286,
+   "   ": 50285,
+   "    ": 50284,
+   "     ": 50283,
+   "      ": 50282,
+   "       ": 50281,
+   "        ": 50280,
+   "         ": 50279,
+   "          ": 50278,
+   "           ": 50277,
+   "            ": 50276,
+   "             ": 50275,
+   "              ": 50274,
+   "               ": 50273,
+   "                ": 50272,
+   "                 ": 50271,
+   "                  ": 50270,
+   "                   ": 50269,
+   "                    ": 50268,
+   "                     ": 50267,
+   "                      ": 50266,
+   "                       ": 50265,
+   "                        ": 50264,
+   "                         ": 50263,
+   "                          ": 50262,
+   "                           ": 50261,
+   "                            ": 50260,
+   "                             ": 50259,
+   "                              ": 50258,
+   "                               ": 50257,
+   "<image>": 50297,
+   "<pad>": 50298,
+   "<|im_end|>": 50295,
+   "<|im_start|>": 50296
+ }
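Ids 50257-50294 are the standard CodeGen/Phi whitespace-run tokens (runs of 2-31 spaces and 2-9 tabs); the remaining four are the chat and multimodal specials. A minimal sketch, assuming this commit is the visheratin/LLaVA-3b repo named in config.json's auto_map, to confirm the ids line up:

from transformers import AutoTokenizer

# Sketch: check the special-token ids recorded in added_tokens.json.
tok = AutoTokenizer.from_pretrained("visheratin/LLaVA-3b")
assert tok.convert_tokens_to_ids("<image>") == 50297
assert tok.convert_tokens_to_ids("<pad>") == 50298
assert tok.convert_tokens_to_ids("<|im_start|>") == 50296
assert tok.convert_tokens_to_ids("<|im_end|>") == 50295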
config.json ADDED
@@ -0,0 +1,122 @@
+ {
+   "architectures": [
+     "LlavaForConditionalGeneration"
+   ],
+   "ignore_index": -100,
+   "image_token_index": 50297,
+   "model_type": "llava",
+   "projector_hidden_act": "gelu",
+   "projector_tokens_num": 5,
+   "auto_map": {
+     "AutoConfig": "visheratin/LLaVA-3b--configuration_llava.LlavaConfig",
+     "AutoModelForConditionalGeneration": "visheratin/LLaVA-3b--modeling_llava.LlavaForConditionalGeneration"
+   },
+   "text_config": {
+     "_name_or_path": "cognitivecomputations/dolphin-2_6-phi-2",
+     "activation_function": "gelu_new",
+     "add_cross_attention": false,
+     "architectures": [
+       "PhiForCausalLM"
+     ],
+     "attn_pdrop": 0.0,
+     "auto_map": {
+       "AutoConfig": "cognitivecomputations/dolphin-2_6-phi-2--configuration_phi.PhiConfig",
+       "AutoModelForCausalLM": "cognitivecomputations/dolphin-2_6-phi-2--modeling_phi.PhiForCausalLM"
+     },
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "embd_pdrop": 0.0,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "flash_attn": false,
+     "flash_rotary": false,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "fused_dense": false,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "img_processor": null,
+     "initializer_range": 0.02,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_epsilon": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "phi-msft",
+     "n_embd": 2560,
+     "n_head": 32,
+     "n_head_kv": null,
+     "n_inner": null,
+     "n_layer": 32,
+     "n_positions": 2048,
+     "no_repeat_ngram_size": 0,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "resid_pdrop": 0.1,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "rotary_dim": 32,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": false,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": "float16",
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "use_cache": false,
+     "vocab_size": 51200
+   },
+   "preprocess_config": {
+     "mean": [
+       0.5,
+       0.5,
+       0.5
+     ],
+     "std": [
+       0.5,
+       0.5,
+       0.5
+     ],
+     "interpolation": "bicubic",
+     "resize_mode": "squash",
+     "size": 384
+   },
+   "torch_dtype": "float16",
+   "transformers_version": "4.36.2",
+   "vision_embed_dim": 1152,
+   "vision_tower_name": "ViT-SO400M-14-SigLIP-384",
+   "vocab_size": 51200
+ }
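The auto_map block routes the generic Auto classes to the custom code shipped with this repo, so loading the config requires trust_remote_code. A minimal sketch (repo id taken from the auto_map entries above):

from transformers import AutoConfig

config = AutoConfig.from_pretrained("visheratin/LLaVA-3b", trust_remote_code=True)
print(config.model_type)          # "llava"
print(config.text_config.n_embd)  # 2560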
configuration_llava.py ADDED
@@ -0,0 +1,41 @@
+ # coding=utf-8
+
+ from transformers.configuration_utils import PretrainedConfig
+ from open_clip import get_model_config
+ from configuration_phi import PhiConfig
+
+
+ class LlavaConfig(PretrainedConfig):
+     model_type = "llava"
+     is_composition = False
+
+     def __init__(
+         self,
+         text_config=None,
+         vision_tower_name="ViT-SO400M-14-SigLIP-384",
+         ignore_index=-100,
+         image_token_index=50297,
+         projector_hidden_act="gelu",
+         projector_tokens_num=1,
+         vocab_size=51200,
+         **kwargs,
+     ):
+         self.ignore_index = ignore_index
+         self.image_token_index = image_token_index
+         self.projector_hidden_act = projector_hidden_act
+         self.projector_tokens_num = projector_tokens_num
+         self.vocab_size = vocab_size
+
+         self.vision_tower_name = vision_tower_name
+         vision_config = get_model_config(vision_tower_name)
+         self.vision_embed_dim = vision_config["embed_dim"]
+
+         self.vocab_size = self.vocab_size
+
+         self.text_config = text_config
+         if isinstance(self.text_config, dict):
+             text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
+             self.text_config = PhiConfig(**text_config)
+             self.vocab_size = self.text_config.vocab_size
+
+         super().__init__(**kwargs)
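Note that LlavaConfig resolves the vision tower's embedding width at construction time via open_clip's get_model_config, so open_clip must be installed even to instantiate the config. A small usage sketch with this checkpoint's settings (projector_tokens_num=5 matches config.json):

from configuration_llava import LlavaConfig

config = LlavaConfig(projector_tokens_num=5)
print(config.vision_embed_dim)  # 1152 for ViT-SO400M-14-SigLIP-384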
configuration_phi.py ADDED
@@ -0,0 +1,62 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT license.
+
+ import math
+ from typing import Optional
+
+ from transformers import PretrainedConfig
+
+
+ class PhiConfig(PretrainedConfig):
+     """Phi configuration."""
+
+     model_type = "phi-msft"
+     attribute_map = {
+         "max_position_embeddings": "n_positions",
+         "hidden_size": "n_embd",
+         "num_attention_heads": "n_head",
+         "num_hidden_layers": "n_layer",
+     }
+
+     def __init__(
+         self,
+         vocab_size: int = 51200,
+         n_positions: int = 2048,
+         n_embd: int = 1024,
+         n_layer: int = 20,
+         n_inner: Optional[int] = None,
+         n_head: int = 16,
+         n_head_kv: Optional[int] = None,
+         rotary_dim: Optional[int] = 32,
+         activation_function: Optional[str] = "gelu_new",
+         flash_attn: bool = False,
+         flash_rotary: bool = False,
+         fused_dense: bool = False,
+         attn_pdrop: float = 0.0,
+         embd_pdrop: float = 0.0,
+         resid_pdrop: float = 0.0,
+         layer_norm_epsilon: float = 1e-5,
+         initializer_range: float = 0.02,
+         tie_word_embeddings: bool = False,
+         pad_vocab_size_multiple: int = 64,
+         **kwargs
+     ) -> None:
+         self.vocab_size = int(math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple)
+         self.n_positions = n_positions
+         self.n_embd = n_embd
+         self.n_layer = n_layer
+         self.n_inner = n_inner
+         self.n_head = n_head
+         self.n_head_kv = n_head_kv
+         self.rotary_dim = min(rotary_dim, n_embd // n_head)
+         self.activation_function = activation_function
+         self.flash_attn = flash_attn
+         self.flash_rotary = flash_rotary
+         self.fused_dense = fused_dense
+         self.attn_pdrop = attn_pdrop
+         self.embd_pdrop = embd_pdrop
+         self.resid_pdrop = resid_pdrop
+         self.layer_norm_epsilon = layer_norm_epsilon
+         self.initializer_range = initializer_range
+
+         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
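Two details worth noting: attribute_map lets the HF-standard names (hidden_size, num_hidden_layers, ...) alias Phi's field names, and vocab_size is rounded up to a multiple of pad_vocab_size_multiple. A quick sketch:

from configuration_phi import PhiConfig

cfg = PhiConfig(n_embd=2560, n_layer=32, n_head=32)
assert cfg.hidden_size == 2560      # aliased onto n_embd via attribute_map
assert cfg.num_hidden_layers == 32  # aliased onto n_layer
assert PhiConfig(vocab_size=51000).vocab_size == 51008  # padded up to a multiple of 64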
convert_model.py ADDED
@@ -0,0 +1,102 @@
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import argparse
+
+ import torch
+
+ from transformers import (
+     AddedToken,
+     AutoConfig,
+     AutoTokenizer,
+ )
+ from configuration_llava import LlavaConfig
+ from modeling_llava import LlavaForConditionalGeneration
+
+
+ KEYS_TO_MODIFY_MAPPING = {
+     "transformer.vision_tower.vision_tower": "vision_model",
+     "transformer.mm_projector": "multi_modal_projector",
+     "transformer": "language_model.transformer",
+     "lm_head": "language_model.lm_head",
+     "model.model": "language_model.transformer",
+     "multi_modal_projector.0": "multi_modal_projector.linear_1",
+     "multi_modal_projector.2": "multi_modal_projector.linear_2",
+ }
+
+
+ def convert_state_dict_to_hf(state_dict):
+     new_state_dict = {}
+     for key, value in state_dict.items():
+         for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
+             if key_to_modify in key:
+                 key = key.replace(key_to_modify, new_key)
+
+         new_state_dict[key] = value
+     return new_state_dict
+
+
+ def convert_llava_llama_to_hf(text_model_id, vision_model_id, projector_tokens_num, output_path, old_state_dict_path):
+     torch.set_default_dtype(torch.float16)
+     text_config = AutoConfig.from_pretrained(text_model_id, trust_remote_code=True)
+
+     tokenizer = AutoTokenizer.from_pretrained(text_model_id)
+     tokenizer.add_tokens(AddedToken("<image>", special=True, normalized=False), special_tokens=True)
+     tokenizer.add_special_tokens({"pad_token": "<pad>"})
+
+     config = LlavaConfig(text_config=text_config, vocab_size=51200, vision_tower_name=vision_model_id, projector_tokens_num=projector_tokens_num)
+     config.text_config.vocab_size = config.vocab_size
+
+     with torch.device("cuda"):
+         model = LlavaForConditionalGeneration(config)
+
+     state_dict = torch.load(old_state_dict_path, map_location="cpu")
+     state_dict = convert_state_dict_to_hf(state_dict)
+     model.load_state_dict(state_dict, strict=True, assign=True)
+
+     model.config.vocab_size = model.config.vocab_size
+     model.config.text_config.vocab_size = model.config.text_config.vocab_size
+
+     model.save_pretrained(output_path)
+     tokenizer.save_pretrained(output_path)
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--text_model_id",
+         help="Hub location of the text model",
+     )
+     parser.add_argument(
+         "--vision_model_id",
+         help="Hub location of the vision model",
+     )
+     parser.add_argument(
+         "--output_path",
+         help="Location of the converted model",
+     )
+     parser.add_argument(
+         "--old_state_dict_path",
+         help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`",
+     )
+     parser.add_argument(
+         "--tokens_num",
+         type=int,
+         default=1
+     )
+     args = parser.parse_args()
+     convert_llava_llama_to_hf(args.text_model_id, args.vision_model_id, args.tokens_num, args.output_path, args.old_state_dict_path)
+
+
+ if __name__ == "__main__":
+     main()
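For reference, a hypothetical invocation of the conversion function with values matching config.json above; the paths are illustrative, and the raw checkpoint is the model_state_dict.bin mentioned in the argument help:

from convert_model import convert_llava_llama_to_hf

convert_llava_llama_to_hf(
    text_model_id="cognitivecomputations/dolphin-2_6-phi-2",
    vision_model_id="ViT-SO400M-14-SigLIP-384",
    projector_tokens_num=5,
    output_path="./LLaVA-3b",                   # illustrative output directory
    old_state_dict_path="model_state_dict.bin", # raw state dict of the original model
)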
generation_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "_from_model_config": true,
+   "transformers_version": "4.36.2",
+   "use_cache": false
+ }
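This file is consumed by transformers' GenerationConfig; a one-line sanity check, again assuming the repo id from config.json's auto_map:

from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained("visheratin/LLaVA-3b")
assert gen.use_cache is False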
merges.txt ADDED
The diff for this file is too large to render. See raw diff
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:23192ed248c081cc405cb02d63f3e450c244d9de8ba2a165ed7158acf4ea409c
+ size 4989884440
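The .safetensors entries are git-lfs pointer files: only the spec version, the sha256 of the payload, and its byte size live in the repo, while the weights themselves are stored in LFS. A sketch for verifying a downloaded shard against this pointer (the local path is illustrative):

import hashlib
import os

path = "model-00001-of-00002.safetensors"
h = hashlib.sha256()
with open(path, "rb") as f:
    # Hash in 1 MiB chunks to avoid loading the ~5 GB shard into memory.
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert os.path.getsize(path) == 4989884440
assert h.hexdigest() == "23192ed248c081cc405cb02d63f3e450c244d9de8ba2a165ed7158acf4ea409c"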
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:00d4516be6a5011addb1337b1e76c29816dfdcabfff84796eeb60f30f2972ec2
+ size 1783236080
model.safetensors.index.json ADDED
@@ -0,0 +1,678 @@
+ {
+   "metadata": {
+     "total_size": 6773041280
+   },
+   "weight_map": {
+     "language_model.lm_head.linear.bias": "model-00002-of-00002.safetensors",
+     "language_model.lm_head.linear.weight": "model-00002-of-00002.safetensors",
+     "language_model.lm_head.ln.bias": "model-00002-of-00002.safetensors",
+     "language_model.lm_head.ln.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.embd.wte.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.0.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.0.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.0.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.0.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.0.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.0.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.0.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.0.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.1.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.1.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.1.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.1.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.1.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.1.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.1.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.1.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.10.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.10.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.10.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.10.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.10.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.10.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.10.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.10.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.11.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.11.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.11.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.11.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.11.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.11.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.11.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.11.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.12.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.12.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.12.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.12.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.12.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.12.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.12.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.12.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.13.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.13.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.13.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.13.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.13.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.13.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.13.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.13.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.13.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.14.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.14.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.14.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.14.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.14.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.14.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.14.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.14.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.15.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.15.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.15.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.15.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.15.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.15.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.15.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.15.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.16.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.16.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.16.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.16.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.16.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.16.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.16.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.16.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.17.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.17.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.17.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.17.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.17.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.17.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.17.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.17.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.18.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.18.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.18.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.18.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.18.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.18.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.18.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.18.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.19.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.19.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.19.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.19.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.19.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.19.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.19.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.19.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.19.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.2.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.2.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.2.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.2.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.2.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.2.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.2.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.2.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.20.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.20.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.20.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.20.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.20.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.20.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.20.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.20.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.21.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.21.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.21.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.21.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.21.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.21.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.21.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.21.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.22.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.22.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.22.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.22.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.22.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.22.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.22.mlp.fc1.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.22.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.22.mlp.fc2.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.22.mlp.fc2.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.23.ln.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.23.ln.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.23.mixer.Wqkv.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.23.mixer.Wqkv.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.23.mixer.out_proj.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.23.mixer.out_proj.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.23.mlp.fc1.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.23.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.23.mlp.fc2.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.23.mlp.fc2.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.24.ln.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.24.ln.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.24.mixer.Wqkv.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.24.mixer.Wqkv.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.24.mixer.out_proj.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.24.mixer.out_proj.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.24.mlp.fc1.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.24.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.24.mlp.fc2.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.24.mlp.fc2.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.25.ln.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.25.ln.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.25.mixer.Wqkv.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.25.mixer.Wqkv.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.25.mixer.out_proj.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.25.mixer.out_proj.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.25.mlp.fc1.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.25.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.25.mlp.fc2.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.25.mlp.fc2.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.26.ln.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.26.ln.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.26.mixer.Wqkv.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.26.mixer.Wqkv.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.26.mixer.out_proj.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.26.mixer.out_proj.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.26.mlp.fc1.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.26.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.26.mlp.fc2.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.26.mlp.fc2.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.27.ln.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.27.ln.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.27.mixer.Wqkv.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.27.mixer.Wqkv.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.27.mixer.out_proj.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.27.mixer.out_proj.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.27.mlp.fc1.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.27.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.27.mlp.fc2.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.27.mlp.fc2.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.28.ln.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.28.ln.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.28.mixer.Wqkv.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.28.mixer.Wqkv.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.28.mixer.out_proj.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.28.mixer.out_proj.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.28.mlp.fc1.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.28.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.28.mlp.fc2.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.28.mlp.fc2.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.29.ln.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.29.ln.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.29.mixer.Wqkv.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.29.mixer.Wqkv.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.29.mixer.out_proj.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.29.mixer.out_proj.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.29.mlp.fc1.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.29.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.29.mlp.fc2.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.29.mlp.fc2.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.3.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.3.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.3.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.3.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.3.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.3.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.3.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.3.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.3.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.30.ln.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.30.ln.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.30.mixer.Wqkv.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.30.mixer.Wqkv.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.30.mixer.out_proj.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.30.mixer.out_proj.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.30.mlp.fc1.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.30.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.30.mlp.fc2.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.30.mlp.fc2.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.31.ln.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.31.ln.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.31.mixer.Wqkv.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.31.mixer.Wqkv.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.31.mixer.out_proj.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.31.mixer.out_proj.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.31.mlp.fc1.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.31.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.31.mlp.fc2.bias": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.31.mlp.fc2.weight": "model-00002-of-00002.safetensors",
+     "language_model.transformer.h.4.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.4.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.4.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.4.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.4.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.4.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.4.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.4.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.5.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.5.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.5.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.5.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.5.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.5.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.5.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.5.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.6.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.6.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.6.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.6.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.6.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.6.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.6.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.6.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.7.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.7.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.7.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.7.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.7.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.7.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.7.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.7.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.8.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.8.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.8.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.8.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.8.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.8.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.8.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.8.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.9.ln.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.9.ln.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.9.mixer.Wqkv.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.9.mixer.Wqkv.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.9.mixer.out_proj.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.9.mixer.out_proj.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.9.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.9.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "language_model.transformer.h.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "multi_modal_projector.linear_1.bias": "model-00001-of-00002.safetensors",
+     "multi_modal_projector.linear_1.weight": "model-00001-of-00002.safetensors",
+     "multi_modal_projector.linear_2.bias": "model-00001-of-00002.safetensors",
+     "multi_modal_projector.linear_2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.attn_pool.kv.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.attn_pool.kv.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.attn_pool.latent": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.attn_pool.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.attn_pool.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.attn_pool.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.attn_pool.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.attn_pool.norm.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.attn_pool.norm.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.attn_pool.proj.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.attn_pool.proj.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.attn_pool.q.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.attn_pool.q.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.0.attn.proj.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.0.attn.qkv.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.0.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.0.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.0.norm1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.0.norm2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.1.attn.proj.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.1.attn.qkv.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.1.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.1.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.1.norm1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.1.norm1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.1.norm2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.10.attn.proj.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.10.attn.qkv.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.10.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.10.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.10.norm1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.10.norm2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.11.attn.proj.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.11.attn.qkv.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.11.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.11.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.11.norm1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.11.norm1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.11.norm2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.12.attn.proj.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.12.attn.qkv.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.12.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.12.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.12.norm1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.12.norm1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.12.norm2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.13.attn.proj.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.13.attn.qkv.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.13.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.13.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.13.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.13.norm1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.13.norm2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.14.attn.proj.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.14.attn.qkv.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.14.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.14.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.14.norm1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.14.norm2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.15.attn.proj.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.15.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.15.attn.qkv.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.15.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.15.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.15.norm1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.15.norm1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.15.norm2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.16.attn.proj.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.16.attn.qkv.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.16.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.16.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.16.norm1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.16.norm2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.17.attn.proj.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.17.attn.qkv.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.17.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.17.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.17.norm1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.17.norm2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.18.attn.proj.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.18.attn.qkv.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.18.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.18.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.18.norm1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.18.norm2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.19.attn.proj.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.19.attn.qkv.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.19.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.19.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.19.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.19.norm1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.19.norm1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.19.norm2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.19.norm2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.2.attn.proj.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.2.attn.qkv.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.2.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.2.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.2.norm1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.2.norm2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.2.norm2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.20.attn.proj.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.20.attn.qkv.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.20.mlp.fc1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.20.mlp.fc2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.20.norm1.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.20.norm1.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.20.norm2.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.20.norm2.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.21.attn.proj.bias": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "vision_model.trunk.blocks.21.attn.qkv.bias": "model-00001-of-00002.safetensors",
519
+ "vision_model.trunk.blocks.21.attn.qkv.weight": "model-00001-of-00002.safetensors",
520
+ "vision_model.trunk.blocks.21.mlp.fc1.bias": "model-00001-of-00002.safetensors",
521
+ "vision_model.trunk.blocks.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
522
+ "vision_model.trunk.blocks.21.mlp.fc2.bias": "model-00001-of-00002.safetensors",
523
+ "vision_model.trunk.blocks.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
524
+ "vision_model.trunk.blocks.21.norm1.bias": "model-00001-of-00002.safetensors",
525
+ "vision_model.trunk.blocks.21.norm1.weight": "model-00001-of-00002.safetensors",
526
+ "vision_model.trunk.blocks.21.norm2.bias": "model-00001-of-00002.safetensors",
527
+ "vision_model.trunk.blocks.21.norm2.weight": "model-00001-of-00002.safetensors",
528
+ "vision_model.trunk.blocks.22.attn.proj.bias": "model-00001-of-00002.safetensors",
529
+ "vision_model.trunk.blocks.22.attn.proj.weight": "model-00001-of-00002.safetensors",
530
+ "vision_model.trunk.blocks.22.attn.qkv.bias": "model-00001-of-00002.safetensors",
531
+ "vision_model.trunk.blocks.22.attn.qkv.weight": "model-00001-of-00002.safetensors",
532
+ "vision_model.trunk.blocks.22.mlp.fc1.bias": "model-00001-of-00002.safetensors",
533
+ "vision_model.trunk.blocks.22.mlp.fc1.weight": "model-00001-of-00002.safetensors",
534
+ "vision_model.trunk.blocks.22.mlp.fc2.bias": "model-00001-of-00002.safetensors",
535
+ "vision_model.trunk.blocks.22.mlp.fc2.weight": "model-00001-of-00002.safetensors",
536
+ "vision_model.trunk.blocks.22.norm1.bias": "model-00001-of-00002.safetensors",
537
+ "vision_model.trunk.blocks.22.norm1.weight": "model-00001-of-00002.safetensors",
538
+ "vision_model.trunk.blocks.22.norm2.bias": "model-00001-of-00002.safetensors",
539
+ "vision_model.trunk.blocks.22.norm2.weight": "model-00001-of-00002.safetensors",
540
+ "vision_model.trunk.blocks.23.attn.proj.bias": "model-00001-of-00002.safetensors",
541
+ "vision_model.trunk.blocks.23.attn.proj.weight": "model-00001-of-00002.safetensors",
542
+ "vision_model.trunk.blocks.23.attn.qkv.bias": "model-00001-of-00002.safetensors",
543
+ "vision_model.trunk.blocks.23.attn.qkv.weight": "model-00001-of-00002.safetensors",
544
+ "vision_model.trunk.blocks.23.mlp.fc1.bias": "model-00001-of-00002.safetensors",
545
+ "vision_model.trunk.blocks.23.mlp.fc1.weight": "model-00001-of-00002.safetensors",
546
+ "vision_model.trunk.blocks.23.mlp.fc2.bias": "model-00001-of-00002.safetensors",
547
+ "vision_model.trunk.blocks.23.mlp.fc2.weight": "model-00001-of-00002.safetensors",
548
+ "vision_model.trunk.blocks.23.norm1.bias": "model-00001-of-00002.safetensors",
549
+ "vision_model.trunk.blocks.23.norm1.weight": "model-00001-of-00002.safetensors",
550
+ "vision_model.trunk.blocks.23.norm2.bias": "model-00001-of-00002.safetensors",
551
+ "vision_model.trunk.blocks.23.norm2.weight": "model-00001-of-00002.safetensors",
552
+ "vision_model.trunk.blocks.24.attn.proj.bias": "model-00001-of-00002.safetensors",
553
+ "vision_model.trunk.blocks.24.attn.proj.weight": "model-00001-of-00002.safetensors",
554
+ "vision_model.trunk.blocks.24.attn.qkv.bias": "model-00001-of-00002.safetensors",
555
+ "vision_model.trunk.blocks.24.attn.qkv.weight": "model-00001-of-00002.safetensors",
556
+ "vision_model.trunk.blocks.24.mlp.fc1.bias": "model-00001-of-00002.safetensors",
557
+ "vision_model.trunk.blocks.24.mlp.fc1.weight": "model-00001-of-00002.safetensors",
558
+ "vision_model.trunk.blocks.24.mlp.fc2.bias": "model-00001-of-00002.safetensors",
559
+ "vision_model.trunk.blocks.24.mlp.fc2.weight": "model-00001-of-00002.safetensors",
560
+ "vision_model.trunk.blocks.24.norm1.bias": "model-00001-of-00002.safetensors",
561
+ "vision_model.trunk.blocks.24.norm1.weight": "model-00001-of-00002.safetensors",
562
+ "vision_model.trunk.blocks.24.norm2.bias": "model-00001-of-00002.safetensors",
563
+ "vision_model.trunk.blocks.24.norm2.weight": "model-00001-of-00002.safetensors",
564
+ "vision_model.trunk.blocks.25.attn.proj.bias": "model-00001-of-00002.safetensors",
565
+ "vision_model.trunk.blocks.25.attn.proj.weight": "model-00001-of-00002.safetensors",
566
+ "vision_model.trunk.blocks.25.attn.qkv.bias": "model-00001-of-00002.safetensors",
567
+ "vision_model.trunk.blocks.25.attn.qkv.weight": "model-00001-of-00002.safetensors",
568
+ "vision_model.trunk.blocks.25.mlp.fc1.bias": "model-00001-of-00002.safetensors",
569
+ "vision_model.trunk.blocks.25.mlp.fc1.weight": "model-00001-of-00002.safetensors",
570
+ "vision_model.trunk.blocks.25.mlp.fc2.bias": "model-00001-of-00002.safetensors",
571
+ "vision_model.trunk.blocks.25.mlp.fc2.weight": "model-00001-of-00002.safetensors",
572
+ "vision_model.trunk.blocks.25.norm1.bias": "model-00001-of-00002.safetensors",
573
+ "vision_model.trunk.blocks.25.norm1.weight": "model-00001-of-00002.safetensors",
574
+ "vision_model.trunk.blocks.25.norm2.bias": "model-00001-of-00002.safetensors",
575
+ "vision_model.trunk.blocks.25.norm2.weight": "model-00001-of-00002.safetensors",
576
+ "vision_model.trunk.blocks.26.attn.proj.bias": "model-00001-of-00002.safetensors",
577
+ "vision_model.trunk.blocks.26.attn.proj.weight": "model-00001-of-00002.safetensors",
578
+ "vision_model.trunk.blocks.26.attn.qkv.bias": "model-00001-of-00002.safetensors",
579
+ "vision_model.trunk.blocks.26.attn.qkv.weight": "model-00001-of-00002.safetensors",
580
+ "vision_model.trunk.blocks.26.mlp.fc1.bias": "model-00001-of-00002.safetensors",
581
+ "vision_model.trunk.blocks.26.mlp.fc1.weight": "model-00001-of-00002.safetensors",
582
+ "vision_model.trunk.blocks.26.mlp.fc2.bias": "model-00001-of-00002.safetensors",
583
+ "vision_model.trunk.blocks.26.mlp.fc2.weight": "model-00001-of-00002.safetensors",
584
+ "vision_model.trunk.blocks.26.norm1.bias": "model-00001-of-00002.safetensors",
585
+ "vision_model.trunk.blocks.26.norm1.weight": "model-00001-of-00002.safetensors",
586
+ "vision_model.trunk.blocks.26.norm2.bias": "model-00001-of-00002.safetensors",
587
+ "vision_model.trunk.blocks.26.norm2.weight": "model-00001-of-00002.safetensors",
588
+ "vision_model.trunk.blocks.3.attn.proj.bias": "model-00001-of-00002.safetensors",
589
+ "vision_model.trunk.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
590
+ "vision_model.trunk.blocks.3.attn.qkv.bias": "model-00001-of-00002.safetensors",
591
+ "vision_model.trunk.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
592
+ "vision_model.trunk.blocks.3.mlp.fc1.bias": "model-00001-of-00002.safetensors",
593
+ "vision_model.trunk.blocks.3.mlp.fc1.weight": "model-00001-of-00002.safetensors",
594
+ "vision_model.trunk.blocks.3.mlp.fc2.bias": "model-00001-of-00002.safetensors",
595
+ "vision_model.trunk.blocks.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
596
+ "vision_model.trunk.blocks.3.norm1.bias": "model-00001-of-00002.safetensors",
597
+ "vision_model.trunk.blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
598
+ "vision_model.trunk.blocks.3.norm2.bias": "model-00001-of-00002.safetensors",
599
+ "vision_model.trunk.blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
600
+ "vision_model.trunk.blocks.4.attn.proj.bias": "model-00001-of-00002.safetensors",
601
+ "vision_model.trunk.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
602
+ "vision_model.trunk.blocks.4.attn.qkv.bias": "model-00001-of-00002.safetensors",
603
+ "vision_model.trunk.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
604
+ "vision_model.trunk.blocks.4.mlp.fc1.bias": "model-00001-of-00002.safetensors",
605
+ "vision_model.trunk.blocks.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
606
+ "vision_model.trunk.blocks.4.mlp.fc2.bias": "model-00001-of-00002.safetensors",
607
+ "vision_model.trunk.blocks.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
608
+ "vision_model.trunk.blocks.4.norm1.bias": "model-00001-of-00002.safetensors",
609
+ "vision_model.trunk.blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
610
+ "vision_model.trunk.blocks.4.norm2.bias": "model-00001-of-00002.safetensors",
611
+ "vision_model.trunk.blocks.4.norm2.weight": "model-00001-of-00002.safetensors",
612
+ "vision_model.trunk.blocks.5.attn.proj.bias": "model-00001-of-00002.safetensors",
613
+ "vision_model.trunk.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
614
+ "vision_model.trunk.blocks.5.attn.qkv.bias": "model-00001-of-00002.safetensors",
615
+ "vision_model.trunk.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
616
+ "vision_model.trunk.blocks.5.mlp.fc1.bias": "model-00001-of-00002.safetensors",
617
+ "vision_model.trunk.blocks.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
618
+ "vision_model.trunk.blocks.5.mlp.fc2.bias": "model-00001-of-00002.safetensors",
619
+ "vision_model.trunk.blocks.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
620
+ "vision_model.trunk.blocks.5.norm1.bias": "model-00001-of-00002.safetensors",
621
+ "vision_model.trunk.blocks.5.norm1.weight": "model-00001-of-00002.safetensors",
622
+ "vision_model.trunk.blocks.5.norm2.bias": "model-00001-of-00002.safetensors",
623
+ "vision_model.trunk.blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
624
+ "vision_model.trunk.blocks.6.attn.proj.bias": "model-00001-of-00002.safetensors",
625
+ "vision_model.trunk.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
626
+ "vision_model.trunk.blocks.6.attn.qkv.bias": "model-00001-of-00002.safetensors",
627
+ "vision_model.trunk.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
628
+ "vision_model.trunk.blocks.6.mlp.fc1.bias": "model-00001-of-00002.safetensors",
629
+ "vision_model.trunk.blocks.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
630
+ "vision_model.trunk.blocks.6.mlp.fc2.bias": "model-00001-of-00002.safetensors",
631
+ "vision_model.trunk.blocks.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
632
+ "vision_model.trunk.blocks.6.norm1.bias": "model-00001-of-00002.safetensors",
633
+ "vision_model.trunk.blocks.6.norm1.weight": "model-00001-of-00002.safetensors",
634
+ "vision_model.trunk.blocks.6.norm2.bias": "model-00001-of-00002.safetensors",
635
+ "vision_model.trunk.blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
636
+ "vision_model.trunk.blocks.7.attn.proj.bias": "model-00001-of-00002.safetensors",
637
+ "vision_model.trunk.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
638
+ "vision_model.trunk.blocks.7.attn.qkv.bias": "model-00001-of-00002.safetensors",
639
+ "vision_model.trunk.blocks.7.attn.qkv.weight": "model-00001-of-00002.safetensors",
640
+ "vision_model.trunk.blocks.7.mlp.fc1.bias": "model-00001-of-00002.safetensors",
641
+ "vision_model.trunk.blocks.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
642
+ "vision_model.trunk.blocks.7.mlp.fc2.bias": "model-00001-of-00002.safetensors",
643
+ "vision_model.trunk.blocks.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
644
+ "vision_model.trunk.blocks.7.norm1.bias": "model-00001-of-00002.safetensors",
645
+ "vision_model.trunk.blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
646
+ "vision_model.trunk.blocks.7.norm2.bias": "model-00001-of-00002.safetensors",
647
+ "vision_model.trunk.blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
648
+ "vision_model.trunk.blocks.8.attn.proj.bias": "model-00001-of-00002.safetensors",
649
+ "vision_model.trunk.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
650
+ "vision_model.trunk.blocks.8.attn.qkv.bias": "model-00001-of-00002.safetensors",
651
+ "vision_model.trunk.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
652
+ "vision_model.trunk.blocks.8.mlp.fc1.bias": "model-00001-of-00002.safetensors",
653
+ "vision_model.trunk.blocks.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
654
+ "vision_model.trunk.blocks.8.mlp.fc2.bias": "model-00001-of-00002.safetensors",
655
+ "vision_model.trunk.blocks.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
656
+ "vision_model.trunk.blocks.8.norm1.bias": "model-00001-of-00002.safetensors",
657
+ "vision_model.trunk.blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
658
+ "vision_model.trunk.blocks.8.norm2.bias": "model-00001-of-00002.safetensors",
659
+ "vision_model.trunk.blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
660
+ "vision_model.trunk.blocks.9.attn.proj.bias": "model-00001-of-00002.safetensors",
661
+ "vision_model.trunk.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
662
+ "vision_model.trunk.blocks.9.attn.qkv.bias": "model-00001-of-00002.safetensors",
663
+ "vision_model.trunk.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
664
+ "vision_model.trunk.blocks.9.mlp.fc1.bias": "model-00001-of-00002.safetensors",
665
+ "vision_model.trunk.blocks.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
666
+ "vision_model.trunk.blocks.9.mlp.fc2.bias": "model-00001-of-00002.safetensors",
667
+ "vision_model.trunk.blocks.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
668
+ "vision_model.trunk.blocks.9.norm1.bias": "model-00001-of-00002.safetensors",
669
+ "vision_model.trunk.blocks.9.norm1.weight": "model-00001-of-00002.safetensors",
670
+ "vision_model.trunk.blocks.9.norm2.bias": "model-00001-of-00002.safetensors",
671
+ "vision_model.trunk.blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
672
+ "vision_model.trunk.norm.bias": "model-00001-of-00002.safetensors",
673
+ "vision_model.trunk.norm.weight": "model-00001-of-00002.safetensors",
674
+ "vision_model.trunk.patch_embed.proj.bias": "model-00001-of-00002.safetensors",
675
+ "vision_model.trunk.patch_embed.proj.weight": "model-00001-of-00002.safetensors",
676
+ "vision_model.trunk.pos_embed": "model-00001-of-00002.safetensors"
677
+ }
678
+ }
modeling_llava.py ADDED
@@ -0,0 +1,417 @@
1
+ # coding=utf-8
2
+ from dataclasses import dataclass
3
+ from typing import List, Optional, Tuple, Union
4
+
5
+ import torch
6
+ import torch.utils.checkpoint
7
+ from torch import nn
8
+
9
+ from transformers import PreTrainedModel
10
+ from transformers.modeling_outputs import ModelOutput
11
+
12
+ from modeling_phi import PhiForCausalLM, InferenceParams
13
+ from processing_llava import OpenCLIPImageProcessor
14
+ from configuration_llava import LlavaConfig
15
+ from open_clip import create_model
16
+
17
+
18
+ @dataclass
19
+ class LlavaCausalLMOutputWithPast(ModelOutput):
20
+ loss: Optional[torch.FloatTensor] = None
21
+ logits: torch.FloatTensor = None
22
+ past_key_values: Optional[List[torch.FloatTensor]] = None
23
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
24
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
25
+ image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
26
+
27
+
28
+ class LlavaMultiModalProjector(nn.Module):
29
+ def __init__(self, config: LlavaConfig):
30
+ super().__init__()
31
+
32
+ self.linear_1 = nn.Linear(
33
+ config.vision_embed_dim,
34
+ config.text_config.n_embd * config.projector_tokens_num,
35
+ bias=True,
36
+ )
37
+ self.act = nn.GELU()
38
+ self.linear_2 = nn.Linear(
39
+ config.text_config.n_embd * config.projector_tokens_num,
40
+ config.text_config.n_embd * config.projector_tokens_num,
41
+ bias=True,
42
+ )
43
+ self.projector_tokens_num = config.projector_tokens_num
44
+
45
+ def forward(self, image_features):
46
+ hidden_states = self.linear_1(image_features)
47
+ hidden_states = self.act(hidden_states)
48
+ hidden_states = self.linear_2(hidden_states)
49
+ hidden_states = hidden_states.reshape(
50
+ hidden_states.shape[0],
51
+ self.projector_tokens_num,
52
+ int(hidden_states.shape[1] / self.projector_tokens_num),
53
+ )
54
+ return hidden_states
55
+
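+ # Shape sketch (illustrative; the vision dim below is an assumption, while
+ # n_embd=2560 and projector_tokens_num=5 come from the shipped config):
+ # assuming vision_embed_dim=1152, an input of shape (batch, 1152) is
+ # projected to (batch, 5 * 2560) and reshaped to (batch, 5, 2560), i.e.
+ # each image becomes 5 pseudo-token embeddings for the language model.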
56
+
57
+ class LlavaPreTrainedModel(PreTrainedModel):
58
+ config_class = LlavaConfig
59
+ base_model_prefix = "model"
60
+ supports_gradient_checkpointing = True
61
+ _no_split_modules = ["LlavaVisionAttention"]
62
+ _skip_keys_device_placement = "past_key_values"
63
+ _supports_flash_attn_2 = True
64
+
65
+ def __init__(self, config):
66
+ super().__init__(config)
67
+
68
+ def _init_weights(self, module):
69
+ return
70
+
71
+ @property
72
+ def _supports_sdpa(self):
73
+ """
74
+ Retrieve language_model's attribute to check whether the model supports
75
+ SDPA or not.
76
+ """
77
+ return self.language_model._supports_sdpa
78
+
79
+
80
+ class LlavaForConditionalGeneration(LlavaPreTrainedModel):
81
+ def __init__(self, config: LlavaConfig):
82
+ super().__init__(config)
83
+ clip_model = create_model(config.vision_tower_name)
84
+ self.vision_model = clip_model.visual
85
+
86
+ self.multi_modal_projector = LlavaMultiModalProjector(config)
87
+ self.vocab_size = config.vocab_size
88
+ self.language_model = PhiForCausalLM(config.text_config)
89
+ self.pad_token_id = (
90
+ self.config.pad_token_id if self.config.pad_token_id is not None else -1
91
+ )
92
+ self.post_init()
93
+
94
+ def get_input_embeddings(self):
95
+ return self.language_model.get_input_embeddings()
96
+
97
+ def set_input_embeddings(self, value):
98
+ self.language_model.set_input_embeddings(value)
99
+
100
+ def get_output_embeddings(self):
101
+ return self.language_model.get_output_embeddings()
102
+
103
+ def set_output_embeddings(self, new_embeddings):
104
+ self.language_model.set_output_embeddings(new_embeddings)
105
+
106
+ def set_decoder(self, decoder):
107
+ self.language_model.transformer = decoder
108
+
109
+ def get_decoder(self):
110
+ return self.language_model.transformer
111
+
112
+ def tie_weights(self):
113
+ return self.language_model.tie_weights()
114
+
115
+ def resize_token_embeddings(
116
+ self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None
117
+ ) -> nn.Embedding:
118
+ model_embeds = self.language_model.resize_token_embeddings(
119
+ new_num_tokens, pad_to_multiple_of
120
+ )
121
+ # update vocab size
122
+ self.config.text_config.vocab_size = model_embeds.num_embeddings
123
+ self.config.vocab_size = model_embeds.num_embeddings
124
+ self.vocab_size = model_embeds.num_embeddings
125
+ return model_embeds
126
+
127
+ def _merge_input_ids_with_image_features(
128
+ self, image_features, inputs_embeds, input_ids, attention_mask, position_ids
129
+ ):
130
+ num_images, num_image_patches, embed_dim = image_features.shape
131
+ batch_size, sequence_length = input_ids.shape
132
+ left_padding = not torch.sum(
133
+ input_ids[:, -1] == torch.tensor(self.pad_token_id)
134
+ )
135
+ # 1. Create a mask to know where special image tokens are
136
+ special_image_token_mask = input_ids == self.config.image_token_index
137
+ num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
138
+ # Compute the maximum embed dimension
139
+ max_embed_dim = (
140
+ num_special_image_tokens.max() * (num_image_patches - 1)
141
+ ) + sequence_length
142
+ batch_indices, non_image_indices = torch.where(
143
+ input_ids != self.config.image_token_index
144
+ )
145
+
146
+ # 2. Compute the positions where text should be written
147
+ # Calculate new positions for text tokens in merged image-text sequence.
148
+ # `special_image_token_mask` identifies image tokens. Each image token is replaced by `num_image_patches` image features, expanding the sequence by `num_image_patches - 1` positions.
149
+ # `torch.cumsum` computes how each image token shifts subsequent text token positions.
150
+ # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
151
+ new_token_positions = (
152
+ torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1)
153
+ - 1
154
+ )
155
+ nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
156
+ if left_padding:
157
+ new_token_positions += nb_image_pad[:, None] # offset for left padding
158
+ text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
159
+
160
+ # 3. Create the full embedding, already padded to the maximum position
161
+ final_embedding = torch.zeros(
162
+ batch_size,
163
+ max_embed_dim,
164
+ embed_dim,
165
+ dtype=inputs_embeds.dtype,
166
+ device=inputs_embeds.device,
167
+ )
168
+ final_attention_mask = torch.zeros(
169
+ batch_size,
170
+ max_embed_dim,
171
+ dtype=attention_mask.dtype,
172
+ device=inputs_embeds.device,
173
+ )
174
+ # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
175
+ # set the corresponding tensors into their correct target device.
176
+ target_device = inputs_embeds.device
177
+ batch_indices, non_image_indices, text_to_overwrite = (
178
+ batch_indices.to(target_device),
179
+ non_image_indices.to(target_device),
180
+ text_to_overwrite.to(target_device),
181
+ )
182
+ attention_mask = attention_mask.to(target_device)
183
+
184
+ # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"]
185
+ # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
186
+ final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[
187
+ batch_indices, non_image_indices
188
+ ]
189
+ final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[
190
+ batch_indices, non_image_indices
191
+ ]
192
+
193
+ # 5. Fill the embeddings corresponding to the images. Anything that is still zeros needs filling
194
+ image_to_overwrite = torch.all(final_embedding == 0, dim=-1)
195
+ image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[
196
+ :, None
197
+ ].to(target_device)
198
+
199
+ if image_to_overwrite.sum() != image_features.shape[:-1].numel():
200
+ raise ValueError(
201
+ f"The inputs provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
202
+ f" the number of images given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
203
+ )
204
+
205
+ final_embedding[image_to_overwrite] = (
206
+ image_features.contiguous().reshape(-1, embed_dim).to(target_device)
207
+ )
208
+ final_attention_mask |= image_to_overwrite
209
+ position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_(
210
+ (final_attention_mask == 0), 1
211
+ )
212
+ return final_embedding, final_attention_mask, position_ids
213
+
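+ # Worked example (illustrative): with num_image_patches=5 features per image
+ # and input_ids of length 10 containing one <image> token,
+ # max_embed_dim = 1 * (5 - 1) + 10 = 14. The single <image> position expands
+ # into 5 slots for image features while the 9 text tokens keep their
+ # relative order around it.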
214
+ def forward(
215
+ self,
216
+ input_ids: torch.LongTensor = None,
217
+ pixel_values: torch.FloatTensor = None,
218
+ attention_mask: Optional[torch.Tensor] = None,
219
+ position_ids: Optional[torch.LongTensor] = None,
220
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
221
+ inputs_embeds: Optional[torch.FloatTensor] = None,
222
+ vision_feature_layer: Optional[int] = None,
223
+ vision_feature_select_strategy: Optional[str] = None,
224
+ labels: Optional[torch.LongTensor] = None,
225
+ use_cache: Optional[bool] = None,
226
+ output_attentions: Optional[bool] = None,
227
+ output_hidden_states: Optional[bool] = None,
228
+ return_dict: Optional[bool] = None,
229
+ ) -> Union[Tuple, LlavaCausalLMOutputWithPast]:
230
+ output_attentions = (
231
+ output_attentions
232
+ if output_attentions is not None
233
+ else self.config.output_attentions
234
+ )
235
+ output_hidden_states = (
236
+ output_hidden_states
237
+ if output_hidden_states is not None
238
+ else self.config.output_hidden_states
239
+ )
240
+ return_dict = (
241
+ return_dict if return_dict is not None else self.config.use_return_dict
242
+ )
243
+
244
+ if inputs_embeds is None:
245
+ # 1. Extract the input embeddings
246
+ inputs_embeds = self.get_input_embeddings()(input_ids)
247
+
248
+ # 2. Merge text and images
249
+ if pixel_values is not None and input_ids.shape[1] != 1:
250
+ image_outputs = self.vision_model(pixel_values)
251
+
252
+ image_features = self.multi_modal_projector(image_outputs)
253
+ (
254
+ inputs_embeds,
255
+ attention_mask,
256
+ position_ids,
257
+ ) = self._merge_input_ids_with_image_features(
258
+ image_features,
259
+ inputs_embeds,
260
+ input_ids,
261
+ attention_mask,
262
+ position_ids,
263
+ )
264
+ # if labels is None:
265
+ # labels = torch.full_like(
266
+ # attention_mask, self.config.ignore_index
267
+ # ).to(torch.long)
268
+ else:
269
+ # In case input_ids.shape[1] == 1 & pixel_values != None & past_key_values != None, we are in the case of
270
+ # generation with cache
271
+ if (
272
+ past_key_values is not None
273
+ and pixel_values is not None
274
+ and input_ids.shape[1] == 1
275
+ ):
276
+ # Retrieve the first layer to inspect the logits and mask out the hidden states
277
+ # that are set to 0
278
+ first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
279
+
280
+ # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
281
+ batch_index, non_attended_tokens = torch.where(
282
+ first_layer_past_key_value.float().sum(-2) == 0
283
+ )
284
+
285
+ # Get the target length
286
+ target_seqlen = first_layer_past_key_value.shape[-1] + 1
287
+
288
+ extended_attention_mask = torch.ones(
289
+ (
290
+ attention_mask.shape[0],
291
+ target_seqlen - attention_mask.shape[1],
292
+ ),
293
+ dtype=attention_mask.dtype,
294
+ device=attention_mask.device,
295
+ )
296
+
297
+ # Zero-out the places where we don't need to attend
298
+ extended_attention_mask[batch_index, non_attended_tokens] = 0
299
+
300
+ attention_mask = torch.cat(
301
+ (attention_mask, extended_attention_mask), dim=1
302
+ )
303
+ position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
304
+
305
+ outputs = self.language_model(
306
+ input_ids=None,
307
+ attention_mask=attention_mask,
308
+ position_ids=position_ids,
309
+ past_key_values=past_key_values,
310
+ inputs_embeds=inputs_embeds,
311
+ use_cache=use_cache,
312
+ output_attentions=output_attentions,
313
+ output_hidden_states=output_hidden_states,
314
+ return_dict=return_dict,
315
+ )
316
+
317
+ logits = outputs[0]
318
+
319
+ loss = None
320
+ if labels is not None:
321
+ # Shift so that tokens < n predict n
322
+ if attention_mask is not None:
323
+ shift_attention_mask = attention_mask[..., 1:]
324
+ shift_logits = logits[..., :-1, :][
325
+ shift_attention_mask.to(logits.device) != 0
326
+ ].contiguous()
327
+ shift_labels = labels[..., 1:][
328
+ shift_attention_mask.to(labels.device) != 0
329
+ ].contiguous()
330
+ else:
331
+ shift_logits = logits[..., :-1, :].contiguous()
332
+ shift_labels = labels[..., 1:].contiguous()
333
+ # Flatten the tokens
334
+ loss_fct = nn.CrossEntropyLoss()
335
+ loss = loss_fct(
336
+ shift_logits.view(-1, shift_logits.size(-1)),
337
+ shift_labels.view(-1).to(shift_logits.device),
338
+ )
339
+
340
+ if not return_dict:
341
+ output = (logits,) + outputs[1:]
342
+ return (loss,) + output if loss is not None else output
343
+
344
+ return LlavaCausalLMOutputWithPast(
345
+ loss=loss,
346
+ logits=logits,
347
+ past_key_values=outputs.past_key_values,
348
+ hidden_states=outputs.hidden_states,
349
+ attentions=outputs.attentions,
350
+ )
351
+
352
+ def prepare_inputs_for_generation(
353
+ self,
354
+ input_ids,
355
+ past_key_values=None,
356
+ inputs_embeds=None,
357
+ pixel_values=None,
358
+ attention_mask=None,
359
+ **kwargs,
360
+ ):
361
+ if past_key_values is not None:
362
+ if isinstance(past_key_values, InferenceParams):
363
+ cache_length = past_key_values.max_seqlen
364
+ past_length = past_key_values.seqlen_offset
365
+ else:
366
+ cache_length = past_length = past_key_values[0][0].shape[2]
367
+
368
+ # Keep only the unprocessed tokens:
369
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
370
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
371
+ # input)
372
+ if (
373
+ attention_mask is not None
374
+ and attention_mask.shape[1] > input_ids.shape[1]
375
+ ):
376
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
377
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
378
+ # input_ids based on the past_length.
379
+ elif past_length < input_ids.shape[1]:
380
+ input_ids = input_ids[:, past_length:]
381
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
382
+ elif self.config.image_token_index in input_ids:
383
+ input_ids = input_ids[:, input_ids.shape[1] - 1 :]
384
+ # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
385
+ # older attention values, as their corresponding values are not part of the input.
386
+ if cache_length < past_length and attention_mask is not None:
387
+ attention_mask = attention_mask[
388
+ :, -(cache_length + input_ids.shape[1]) :
389
+ ]
390
+
391
+ position_ids = kwargs.get("position_ids", None)
392
+ if attention_mask is not None and position_ids is None:
393
+ # create position_ids on the fly for batch generation
394
+ position_ids = attention_mask.long().cumsum(-1) - 1
395
+ position_ids.masked_fill_(attention_mask == 0, 1)
396
+ if past_key_values:
397
+ position_ids = position_ids[:, -input_ids.shape[1] :]
398
+
399
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
400
+ if inputs_embeds is not None and past_key_values is None:
401
+ model_inputs = {"inputs_embeds": inputs_embeds}
402
+ else:
403
+ model_inputs = {"input_ids": input_ids}
404
+
405
+ model_inputs.update(
406
+ {
407
+ "position_ids": position_ids,
408
+ "past_key_values": past_key_values,
409
+ "use_cache": kwargs.get("use_cache"),
410
+ "attention_mask": attention_mask,
411
+ "pixel_values": pixel_values,
412
+ }
413
+ )
414
+ return model_inputs
415
+
416
+ def _reorder_cache(self, *args, **kwargs):
417
+ return self.language_model._reorder_cache(*args, **kwargs)
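+
+ # Usage sketch (hypothetical, not part of the original file): one way to load
+ # this class with the weights in this repository; the repo id and dtype below
+ # are assumptions.
+ #
+ #     import torch
+ #     from modeling_llava import LlavaForConditionalGeneration
+ #
+ #     model = LlavaForConditionalGeneration.from_pretrained(
+ #         "visheratin/LLaVA-3b",  # assumed repository id
+ #         torch_dtype=torch.float16,
+ #     )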
modeling_phi.py ADDED
@@ -0,0 +1,972 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+ #
4
+ # Copyright (c) 2022, Tri Dao, trid@cs.stanford.edu.
5
+ # Licensed under the BSD 3-Clause License.
6
+
7
+ from __future__ import annotations
8
+
9
+ import math
10
+ from dataclasses import dataclass, field
11
+ from typing import Any, Dict, Optional, Tuple, Union
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+ from einops import rearrange, repeat
16
+ from transformers import PretrainedConfig, PreTrainedModel
17
+ from transformers.activations import ACT2FN
18
+ from transformers.modeling_outputs import CausalLMOutputWithPast
19
+
20
+ from configuration_phi import PhiConfig
21
+
22
+ try:
23
+ from flash_attn.bert_padding import pad_input, unpad_input
24
+ from flash_attn.layers.rotary import RotaryEmbedding as FlashRotaryEmbedding
25
+ from flash_attn.modules.mha import FlashCrossAttention, FlashSelfAttention
26
+ from flash_attn.ops.fused_dense import FusedDense
27
+ except ImportError:
28
+ pad_input, unpad_input = None, None
29
+ FlashRotaryEmbedding = None
30
+ FlashSelfAttention, FlashCrossAttention = None, None
31
+ FusedDense = None
32
+
33
+
34
+ @dataclass
35
+ class InferenceParams:
36
+ """Inference parameters passed to model to efficiently calculate
37
+ and store context during inference.
38
+
39
+ Reference:
40
+ https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/utils/generation.py.
41
+
42
+ Args:
43
+ max_seqlen: Maximum sequence length.
44
+ max_batch_size: Maximum batch size.
45
+ seqlen_offset: Sequence length offset.
46
+ batch_size_offset: Batch size offset.
47
+ key_value_memory_dict: Key value memory dictionary.
48
+ lengths_per_sample: Lengths per sample.
49
+
50
+ """
51
+
52
+ max_seqlen: int = field(metadata={"help": "Maximum sequence length."})
53
+
54
+ max_batch_size: int = field(metadata={"help": "Maximum batch size."})
55
+
56
+ seqlen_offset: int = field(default=0, metadata={"help": "Sequence length offset."})
57
+
58
+ batch_size_offset: int = field(default=0, metadata={"help": "Batch size offset."})
59
+
60
+ key_value_memory_dict: Dict[str, Any] = field(
61
+ default_factory=dict, metadata={"help": "Key value memory dictionary."}
62
+ )
63
+
64
+ lengths_per_sample: Optional[torch.Tensor] = field(default=None, metadata={"help": "Lengths per sample."})
65
+
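+ # Construction sketch (illustrative; values are assumptions): a cache sized
+ # for greedy decoding might be created as
+ #     params = InferenceParams(max_seqlen=2048, max_batch_size=1)
+ # and `seqlen_offset` is advanced by the number of tokens processed so far.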
66
+
67
+ class Embedding(nn.Module):
68
+ """Token embedding with dropout."""
69
+
70
+ def __init__(self, config: PretrainedConfig) -> None:
71
+ super().__init__()
72
+
73
+ self.wte = nn.Embedding(config.vocab_size, config.n_embd)
74
+ self.drop = nn.Dropout(config.embd_pdrop)
75
+
76
+ def forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
77
+ input_shape = input_ids.size()
78
+ input_ids = input_ids.view(-1, input_shape[-1])
79
+
80
+ hidden_states = self.wte(input_ids)
81
+ hidden_states = self.drop(hidden_states)
82
+
83
+ return hidden_states
84
+
85
+
86
+ def _apply_rotary_emb(
87
+ x: torch.FloatTensor,
88
+ cos: torch.FloatTensor,
89
+ sin: torch.FloatTensor,
90
+ ) -> torch.FloatTensor:
91
+ _, seqlen, _, _ = x.shape
92
+ _, rotary_dim = cos.shape
93
+ rotary_dim *= 2
94
+
95
+ x_rot = x[:, :, :, :rotary_dim]
96
+ x_pass = x[:, :, :, rotary_dim:]
97
+
98
+ x1, x2 = x_rot.chunk(2, dim=-1)
99
+ c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(sin[:seqlen], "s d -> s 1 d")
100
+ x1, x2, c, s = [t.to(dtype=torch.float32) for t in [x1, x2, c, s]]
101
+
102
+ x_rot = torch.cat([x1 * c - x2 * s, x1 * s + x2 * c], axis=-1).to(x.dtype)
103
+
104
+ return torch.cat([x_rot, x_pass], axis=-1)
105
+
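+ # Math note (restating the code above): splitting the rotary slice into
+ # halves (x1, x2), each position applies a 2D rotation by angle theta:
+ #     x1' = x1 * cos(theta) - x2 * sin(theta)
+ #     x2' = x1 * sin(theta) + x2 * cos(theta)
+ # which is the RoPE formulation from the RoFormer paper cited below.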
106
+
107
+ def _apply_rotary_emb_kv(
108
+ kv: torch.FloatTensor,
109
+ cos: torch.FloatTensor,
110
+ sin: torch.FloatTensor,
111
+ cos_k: Optional[torch.FloatTensor] = None,
112
+ sin_k: Optional[torch.FloatTensor] = None,
113
+ ) -> torch.FloatTensor:
114
+ _, seqlen, _, _, _ = kv.shape
115
+ _, rotary_dim = cos.shape
116
+ rotary_dim *= 2
117
+
118
+ k_rot = kv[:, :, 0, :, :rotary_dim]
119
+ k_pass = kv[:, :, 0, :, rotary_dim:]
120
+
121
+ k1, k2 = k_rot.chunk(2, dim=-1)
122
+ c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(sin[:seqlen], "s d -> s 1 d")
123
+ k1, k2, c, s = [t.to(dtype=torch.float32) for t in [k1, k2, c, s]]
124
+
125
+ k_rot = torch.cat([k1 * c - k2 * s, k1 * s + k2 * c], axis=-1).to(kv.dtype)
126
+
127
+ return torch.cat(
128
+ [
129
+ torch.cat([k_rot, k_pass], axis=-1).unsqueeze(2),
130
+ kv[:, :, 1:2, :, :],
131
+ ],
132
+ axis=2,
133
+ )
134
+
135
+
136
+ def _apply_rotary_emb_qkv(
137
+ qkv: torch.FloatTensor,
138
+ cos: torch.FloatTensor,
139
+ sin: torch.FloatTensor,
140
+ cos_k: Optional[torch.FloatTensor] = None,
141
+ sin_k: Optional[torch.FloatTensor] = None,
142
+ ) -> torch.FloatTensor:
143
+ _, seqlen, _, _, _ = qkv.shape
144
+ _, rotary_dim = cos.shape
145
+ rotary_dim *= 2
146
+
147
+ q_rot = qkv[:, :, 0, :, :rotary_dim]
148
+ q_pass = qkv[:, :, 0, :, rotary_dim:]
149
+
150
+ k_rot = qkv[:, :, 1, :, :rotary_dim]
151
+ k_pass = qkv[:, :, 1, :, rotary_dim:]
152
+
153
+ q1, q2 = q_rot.chunk(2, dim=-1)
154
+ k1, k2 = k_rot.chunk(2, dim=-1)
155
+ c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(sin[:seqlen], "s d -> s 1 d")
156
+ q1, q2, k1, k2, c, s = [t.to(dtype=torch.float32) for t in [q1, q2, k1, k2, c, s]]
157
+
158
+ q_rot = torch.cat([q1 * c - q2 * s, q1 * s + q2 * c], axis=-1).to(qkv.dtype)
159
+ k_rot = torch.cat([k1 * c - k2 * s, k1 * s + k2 * c], axis=-1).to(qkv.dtype)
160
+
161
+ return torch.cat(
162
+ [
163
+ torch.cat([q_rot, q_pass], axis=-1).unsqueeze(2),
164
+ torch.cat([k_rot, k_pass], axis=-1).unsqueeze(2),
165
+ qkv[:, :, 2:3, :, :],
166
+ ],
167
+ axis=2,
168
+ )
169
+
170
+
171
+ class RotaryEmbedding(nn.Module):
172
+ """Rotary positional embedding (RoPE).
173
+
174
+ Reference:
175
+ RoFormer: Enhanced Transformer with Rotary Position Embedding.
176
+ https://arxiv.org/pdf/2104.09864.pdf.
177
+
178
+ """
179
+
180
+ def __init__(
181
+ self,
182
+ dim: int,
183
+ base: int = 10000,
184
+ scale_base: Optional[float] = None,
185
+ pos_idx_in_fp32: bool = True,
186
+ max_position_embeddings: int = 2048,
187
+ device: Optional[str] = None,
188
+ **kwargs,
189
+ ) -> None:
190
+ super().__init__()
191
+
192
+ if scale_base is not None:
193
+ raise NotImplementedError
194
+
195
+ self.dim = dim
196
+ self.base = float(base)
197
+ self.scale_base = scale_base
198
+ self.pos_idx_in_fp32 = pos_idx_in_fp32
199
+ self.max_position_embeddings = max_position_embeddings
200
+ self.device = device
201
+
202
+ # Generate and save the inverse frequency buffer (non-trainable)
203
+ inv_freq = self._compute_inv_freq(device)
204
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
205
+
206
+ # Generate and save the scale buffer (non-trainable)
207
+ scale = (
208
+ (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
209
+ if scale_base is not None
210
+ else None
211
+ )
212
+ self.register_buffer("scale", scale, persistent=False)
213
+
214
+ # Initialize cached attributes since ONNX can't rely on dynamic initialization
215
+ self._update_cos_sin_cache(max_position_embeddings, device=device, dtype=torch.float32)
216
+
217
+ def _compute_inv_freq(self, device: Optional[str] = None) -> torch.FloatTensor:
218
+ return 1.0 / (self.base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim))
219
+
220
+ def _update_cos_sin_cache(
221
+ self,
222
+ seqlen: int,
223
+ device: Optional[str] = None,
224
+ dtype: Optional[torch.dtype] = None,
225
+ ) -> None:
226
+ self._seq_len_cached = seqlen
227
+
228
+ # fp32 is preferred since the output of `torch.arange` can be quite large
229
+ # and bf16 would lose a lot of precision
230
+ if self.pos_idx_in_fp32:
231
+ t = torch.arange(seqlen, device=device, dtype=torch.float32)
232
+ if self.inv_freq.dtype != torch.float32:
233
+ inv_freq = self._compute_inv_freq(device=device)
234
+ else:
235
+ inv_freq = self.inv_freq
236
+ else:
237
+ t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
238
+ inv_freq = self.inv_freq
239
+
240
+ # `torch.outer` is preferred since `torch.einsum` converts from fp32 to fp16 if used with AMP
241
+ freqs = torch.outer(t, inv_freq)
242
+ if self.scale is None:
243
+ self._cos_cached = torch.cos(freqs).to(dtype)
244
+ self._sin_cached = torch.sin(freqs).to(dtype)
245
+ else:
246
+ power = (
247
+ torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device) - seqlen // 2
248
+ ) / self.scale_base
249
+ scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
250
+
251
+ # Force the scale multiplication to happen in fp32
252
+ self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
253
+ self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
254
+ self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
255
+ self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
256
+
257
+ def forward(
258
+ self,
259
+ qkv: torch.Tensor,
260
+ kv: Optional[torch.Tensor] = None,
261
+ seqlen_offset: int = 0,
262
+ **kwargs,
263
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
264
+ if (
265
+ self._seq_len_cached < qkv.shape[1] + seqlen_offset
266
+ or self._cos_cached.device != qkv.device
267
+ or self._cos_cached.dtype != qkv.dtype
268
+ or (self.training and self._cos_cached.is_inference())
269
+ ):
270
+ self._update_cos_sin_cache(qkv.shape[1] + seqlen_offset, device=qkv.device, dtype=qkv.dtype)
271
+
272
+ if kv is None:
273
+ return _apply_rotary_emb_qkv(
274
+ qkv,
275
+ self._cos_cached[seqlen_offset:],
276
+ self._sin_cached[seqlen_offset:],
277
+ )
278
+ else:
279
+ q = _apply_rotary_emb(
280
+ qkv,
281
+ self._cos_cached[seqlen_offset:],
282
+ self._sin_cached[seqlen_offset:],
283
+ )
284
+ kv = _apply_rotary_emb_kv(
285
+ kv,
286
+ self._cos_cached[seqlen_offset:],
287
+ self._sin_cached[seqlen_offset:],
288
+ )
289
+
290
+ return q, kv
291
+
292
+
293
+ class MLP(nn.Module):
294
+ """Multi-Layer Perceptron.
295
+
296
+ Reference:
297
+ Attention Is All You Need.
298
+ https://arxiv.org/pdf/1706.03762.pdf.
299
+
300
+ """
301
+
302
+ def __init__(
303
+ self,
304
+ config: PretrainedConfig,
305
+ n_inner: Optional[int] = None,
306
+ act_fn: Optional[str] = None,
307
+ ) -> None:
308
+ super().__init__()
309
+
310
+ act_fn = config.activation_function if act_fn is None else act_fn
311
+
312
+ n_inner = getattr(config, "n_inner", None) if n_inner is None else n_inner
313
+ n_inner = n_inner if n_inner is not None else 4 * config.n_embd
314
+
315
+ self.fc1 = nn.Linear(config.n_embd, n_inner)
316
+ self.fc2 = nn.Linear(n_inner, config.n_embd)
317
+ self.act = ACT2FN[act_fn]
318
+
319
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
320
+ hidden_states = self.fc1(hidden_states)
321
+ hidden_states = self.act(hidden_states)
322
+ hidden_states = self.fc2(hidden_states)
323
+
324
+ return hidden_states
325
+
326
+
327
+ class SelfAttention(nn.Module):
328
+ """Self-attention layer (compatible with PyTorch).
329
+
330
+ Reference:
331
+ https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py.
332
+
333
+ """
334
+
335
+ def __init__(
336
+ self,
337
+ causal: bool = True,
338
+ softmax_scale: Optional[float] = None,
339
+ attention_dropout: float = 0.0,
340
+ ) -> None:
341
+ super().__init__()
342
+
343
+ self.causal = causal
344
+ self.softmax_scale = softmax_scale
345
+ self.drop = nn.Dropout(attention_dropout)
346
+
347
+ @torch.autocast("cpu", enabled=False)
348
+ @torch.autocast("cuda", enabled=False)
349
+ def forward(
350
+ self,
351
+ qkv: torch.FloatTensor,
352
+ causal: Optional[bool] = None,
353
+ key_padding_mask: Optional[torch.BoolTensor] = None,
354
+ **kwargs,
355
+ ) -> torch.FloatTensor:
356
+ batch_size, seqlen = qkv.shape[0], qkv.shape[1]
357
+ q, k, v = qkv.unbind(dim=2)
358
+
359
+ q = q.to(torch.float32)
360
+ k = k.to(torch.float32)
361
+
362
+ causal = self.causal if causal is None else causal
363
+ softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
364
+
365
+ # Autocast is manually disabled to avoid `torch.einsum` performing the operation
366
+ # using float16, which might lead to overflow
367
+ scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
368
+
369
+ if key_padding_mask is not None:
370
+ padding_mask = torch.full((batch_size, seqlen), -10000.0, dtype=scores.dtype, device=scores.device)
371
+ padding_mask.masked_fill_(key_padding_mask, 0.0)
372
+
373
+ scores = scores + rearrange(padding_mask, "b s -> b 1 1 s")
374
+
375
+ if causal:
376
+ causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
377
+ scores = scores + causal_mask.to(dtype=scores.dtype)
378
+
379
+ attention = torch.softmax(scores, dim=-1).to(v.dtype)
380
+ attention = self.drop(attention)
381
+
382
+ output = torch.einsum("bhts,bshd->bthd", attention, v)
383
+
384
+ return output
385
+
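+ # Shape note (restating the einsums above): with qkv of shape
+ # (batch, seqlen, 3, n_head, head_dim), `scores` has shape
+ # (batch, n_head, seqlen, seqlen) and `output` comes back out as
+ # (batch, seqlen, n_head, head_dim).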
386
+
387
+ class CrossAttention(nn.Module):
388
+ """Cross-attention layer (compatible with PyTorch).
389
+
390
+ Reference:
391
+ https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py.
392
+
393
+ """
394
+
395
+ def __init__(
396
+ self,
397
+ causal: bool = True,
398
+ softmax_scale: Optional[float] = None,
399
+ attention_dropout: float = 0.0,
400
+ ) -> None:
401
+ super().__init__()
402
+
403
+ self.causal = causal
404
+ self.softmax_scale = softmax_scale
405
+ self.drop = nn.Dropout(attention_dropout)
406
+
407
+ @torch.autocast("cpu", enabled=False)
408
+ @torch.autocast("cuda", enabled=False)
409
+ def forward(
410
+ self,
411
+ q: torch.FloatTensor,
412
+ kv: torch.FloatTensor,
413
+ causal: Optional[bool] = None,
414
+ key_padding_mask: Optional[torch.BoolTensor] = None,
415
+ **kwargs,
416
+ ) -> torch.FloatTensor:
417
+ batch_size, seqlen_q = q.shape[0], q.shape[1]
418
+ seqlen_k = kv.shape[1]
419
+
420
+ if kv.shape[3] != q.shape[2]:
421
+ kv = repeat(kv, "... hkv d -> ... (hkv g) d", g=q.shape[2] // kv.shape[3])
422
+ k, v = kv.unbind(dim=2)
423
+
424
+ q = q.to(torch.float32)
425
+ k = k.to(torch.float32)
426
+
427
+ causal = self.causal if causal is None else causal
428
+ softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
429
+
430
+ # Autocast is manually disabled to avoid `torch.einsum` performing the operation
431
+ # using float16, which might lead to overflow
432
+ scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
433
+
434
+ if key_padding_mask is not None:
435
+ padding_mask = torch.full(
436
+ (batch_size, seqlen_k),
437
+ -10000.0,
438
+ dtype=scores.dtype,
439
+ device=scores.device,
440
+ )
441
+ padding_mask.masked_fill_(key_padding_mask, 0.0)
442
+
443
+ scores = scores + rearrange(padding_mask, "b s -> b 1 1 s")
444
+
445
+ if causal:
446
+ rows = rearrange(torch.arange(seqlen_q, device=q.device, dtype=torch.long), "s -> s 1")
447
+ cols = torch.arange(seqlen_k, device=k.device, dtype=torch.long)
448
+ causal_mask = cols > rows + seqlen_k - seqlen_q
449
+
450
+ scores = scores.masked_fill(causal_mask, -10000.0)
451
+
452
+ attention = torch.softmax(scores, dim=-1).to(v.dtype)
453
+ attention = self.drop(attention)
454
+
455
+ output = torch.einsum("bhts,bshd->bthd", attention, v)
456
+
457
+ return output
458
+
459
+
460
+ def _find_mha_dims(
461
+ config: PretrainedConfig,
462
+ n_head: Optional[int] = None,
463
+ n_head_kv: Optional[int] = None,
464
+ head_dim: Optional[int] = None,
465
+ ) -> Tuple[int, int, int]:
466
+ if n_head is None and head_dim is None:
467
+ head_dim = config.n_embd // config.n_head
468
+ n_head = config.n_head
469
+ elif n_head is None or head_dim is None:
470
+ raise ValueError("`n_head` and `head_dim` must be both specified or `None`.")
471
+
472
+ if n_head_kv is None:
473
+ n_head_kv = getattr(config, "n_head_kv", None) or n_head
474
+
475
+ return n_head, n_head_kv, head_dim
476
+
477
+
478
+ def _update_kv_cache(kv: torch.FloatTensor, inference_params: InferenceParams, layer_idx: int) -> torch.FloatTensor:
479
+ num_heads, head_dim = kv.shape[-2:]
480
+
481
+ if layer_idx not in inference_params.key_value_memory_dict:
482
+ inference_params.key_value_memory_dict[layer_idx] = torch.empty(
483
+ inference_params.max_batch_size,
484
+ inference_params.max_seqlen,
485
+ 2,
486
+ num_heads,
487
+ head_dim,
488
+ dtype=kv.dtype,
489
+ device=kv.device,
490
+ )
491
+
492
+ batch_start = inference_params.batch_size_offset
493
+ batch_end = batch_start + kv.shape[0]
494
+
495
+ sequence_start = inference_params.seqlen_offset
496
+ sequence_end = sequence_start + kv.shape[1]
497
+
498
+ # When the current sequence length is equal to or larger than the maximum sequence length,
499
+ # we need to concatenate the current `kv` with the cached `kv` to expand its length
500
+ if sequence_end >= inference_params.max_seqlen:
501
+ inference_params.key_value_memory_dict[layer_idx] = torch.concatenate((inference_params.key_value_memory_dict[layer_idx], kv), dim=1)
502
+
503
+ inference_params.key_value_memory_dict[layer_idx][batch_start:batch_end, sequence_start:sequence_end, ...] = kv
504
+ kv = inference_params.key_value_memory_dict[layer_idx][batch_start:batch_end, :sequence_end, ...]
505
+
506
+ return kv
507
+
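+ # Cache layout note (restating the allocation above): each layer owns one
+ # pre-allocated buffer of shape
+ #     (max_batch_size, max_seqlen, 2, num_heads, head_dim)
+ # where the size-2 axis holds keys and values; each decoding step writes the
+ # new tokens into the [seqlen_offset : seqlen_offset + new_len] slice.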
508
+
509
+ class MHA(nn.Module):
510
+ """Multi-head attention layer."""
511
+
512
+ def __init__(
513
+ self,
514
+ config: PretrainedConfig,
515
+ dtype: Optional[torch.dtype] = None,
516
+         device: Optional[str] = None,
+         rotary_dim: Optional[int] = None,
+         rotary_base: float = 10000.0,
+         rotary_scale_base: Optional[float] = None,
+         n_head: Optional[int] = None,
+         n_head_kv: Optional[int] = None,
+         head_dim: Optional[int] = None,
+         bias: bool = True,
+         causal: bool = True,
+         softmax_scale: Optional[float] = None,
+         layer_idx: Optional[int] = None,
+         return_residual: bool = False,
+         checkpointing: bool = False,
+     ) -> None:
+         super().__init__()
+
+         # Rotary embedding
+         self.rotary_dim = rotary_dim if rotary_dim is not None else getattr(config, "rotary_dim", 0)
+         if self.rotary_dim > 0:
+             rotary_cls = FlashRotaryEmbedding if config.flash_rotary else RotaryEmbedding
+             if rotary_cls is None:
+                 rotary_cls = RotaryEmbedding
+
+             rotary_kwargs = {}
+             if rotary_cls is RotaryEmbedding:
+                 rotary_kwargs["max_position_embeddings"] = config.n_positions
+
+             self.rotary_emb = rotary_cls(
+                 self.rotary_dim,
+                 base=rotary_base,
+                 scale_base=rotary_scale_base,
+                 device=device,
+                 **rotary_kwargs,
+             )
+
+         # QKV and output projections (`op_size` packs Q for `n_head` heads plus K/V for `n_head_kv` heads)
+         self.n_head, self.n_head_kv, self.head_dim = _find_mha_dims(
+             config, n_head=n_head, n_head_kv=n_head_kv, head_dim=head_dim
+         )
+         op_size = self.head_dim * (self.n_head + 2 * self.n_head_kv)
+         hidden_size = config.n_embd
+
+         linear_cls = FusedDense if config.fused_dense else nn.Linear
+         if linear_cls is None:
+             linear_cls = nn.Linear
+
+         self.Wqkv = linear_cls(hidden_size, op_size, bias=bias, device=device, dtype=dtype)
+         self.out_proj = linear_cls(hidden_size, hidden_size, bias=bias, device=device, dtype=dtype)
+
+         # Attention
+         attn_cls = FlashSelfAttention if config.flash_attn else SelfAttention
+         if attn_cls is None:
+             attn_cls = SelfAttention
+
+         cross_attn_cls = FlashCrossAttention if config.flash_attn else CrossAttention
+         if cross_attn_cls is None:
+             cross_attn_cls = CrossAttention
+
+         self.inner_attn = attn_cls(
+             causal=causal,
+             softmax_scale=softmax_scale,
+             attention_dropout=config.attn_pdrop,
+         )
+         self.inner_cross_attn = cross_attn_cls(
+             causal=causal,
+             softmax_scale=softmax_scale,
+             attention_dropout=config.attn_pdrop,
+         )
+
+         self.flash_attn = config.flash_attn and attn_cls is FlashSelfAttention
+         self.layer_idx = layer_idx
+         self.return_residual = return_residual
+         self.checkpointing = checkpointing
+
+     def _forward_self_attn(
+         self, x: torch.FloatTensor, key_padding_mask: Optional[torch.BoolTensor]
+     ) -> torch.FloatTensor:
+         qkv = self.Wqkv(x)
+         qkv = rearrange(qkv, "... (three h d) -> ... three h d", three=3, d=self.head_dim)
+
+         if self.rotary_dim > 0:
+             qkv = self.rotary_emb(qkv)
+
+         if self.flash_attn:
+             batch_size, seqlen = qkv.shape[0], qkv.shape[1]
+
+             cu_seqlens, max_seqlen = None, None
+             if key_padding_mask is not None:
+                 # If `key_padding_mask` is supplied, unpad the input and retrieve
+                 # the `cu_seqlens` and `max_seqlen` to be used by `flash-attn`
+                 qkv, indices, cu_seqlens, max_seqlen = unpad_input(qkv, key_padding_mask)
+
+             if self.checkpointing:
+                 attn_output = torch.utils.checkpoint.checkpoint(
+                     self.inner_attn, qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen
+                 )
+             else:
+                 attn_output = self.inner_attn(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen).to(qkv.device)
+
+             # If `key_padding_mask` is supplied, pad the output back to the original shape
+             return pad_input(attn_output, indices, batch_size, seqlen) if key_padding_mask is not None else attn_output
+
+         if self.checkpointing:
+             return torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, key_padding_mask=key_padding_mask)
+
+         return self.inner_attn(qkv, key_padding_mask=key_padding_mask)
+
+     def _forward_cross_attn(
+         self,
+         x: torch.FloatTensor,
+         past_key_values: Optional[InferenceParams],
+         key_padding_mask: Optional[torch.BoolTensor],
+     ) -> torch.FloatTensor:
+         batch_size = x.shape[0]
+
+         qkv = self.Wqkv(x)
+
+         q = qkv[..., : self.n_head * self.head_dim]
+         q = rearrange(q, "... (h d) -> ... h d", d=self.head_dim)
+
+         kv = qkv[..., self.n_head * self.head_dim :]
+         kv = rearrange(kv, "... (two hkv d) -> ... two hkv d", two=2, d=self.head_dim)
+
+         seqlen_offset = past_key_values.seqlen_offset if past_key_values is not None else 0
+         causal = None if seqlen_offset == 0 else False
+         if self.rotary_dim > 0:
+             q, kv = self.rotary_emb(q, kv=kv, seqlen_offset=seqlen_offset)
+
+         if past_key_values is not None:
+             kv = _update_kv_cache(kv, past_key_values, self.layer_idx)
+
+         if self.flash_attn:
+             batch_size, seqlen_q = q.shape[0], q.shape[1]
+             seqlen_k = kv.shape[1]
+
+             cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k = (
+                 None,
+                 None,
+                 None,
+                 None,
+             )
+             if key_padding_mask is not None:
+                 kv, _, cu_seqlens_k, max_seqlen_k = unpad_input(kv, key_padding_mask)
+
+                 if seqlen_q == 1:
+                     key_padding_mask = torch.ones(batch_size, 1, device=q.device)
+                 elif seqlen_q != seqlen_k:
+                     key_padding_mask = key_padding_mask[:, -seqlen_q:]
+
+                 q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, key_padding_mask)
+
+             if self.checkpointing:
+                 attn_output = torch.utils.checkpoint.checkpoint(
+                     self.inner_cross_attn,
+                     q,
+                     kv,
+                     causal=causal,
+                     cu_seqlens=cu_seqlens_q,
+                     max_seqlen=max_seqlen_q,
+                     cu_seqlens_k=cu_seqlens_k,
+                     max_seqlen_k=max_seqlen_k,
+                 )
+             else:
+                 attn_output = self.inner_cross_attn(
+                     q,
+                     kv,
+                     causal=causal,
+                     cu_seqlens=cu_seqlens_q,
+                     max_seqlen=max_seqlen_q,
+                     cu_seqlens_k=cu_seqlens_k,
+                     max_seqlen_k=max_seqlen_k,
+                 )
+
+             return (
+                 pad_input(attn_output, indices_q, batch_size, max_seqlen_q)
+                 if key_padding_mask is not None
+                 else attn_output
+             )
+
+         if self.checkpointing:
+             return torch.utils.checkpoint.checkpoint(
+                 self.inner_cross_attn,
+                 q,
+                 kv,
+                 key_padding_mask=key_padding_mask,
+                 causal=causal,
+             )
+
+         return self.inner_cross_attn(q, kv, key_padding_mask=key_padding_mask, causal=causal)
+
+     def forward(
+         self,
+         x: torch.FloatTensor,
+         past_key_values: Optional[InferenceParams] = None,
+         attention_mask: Optional[Union[torch.LongTensor, torch.BoolTensor]] = None,
+         **kwargs,
+     ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, torch.FloatTensor]]:
+         if attention_mask is not None:
+             attention_mask = attention_mask.bool()
+
+         # MHA
+         if self.n_head == self.n_head_kv:
+             if past_key_values is None:
+                 # If `past_key_values` are not supplied, we run self-attention
+                 attn_output = self._forward_self_attn(x, attention_mask)
+             else:
+                 # If `past_key_values` are supplied, we might have cached values and
+                 # can take advantage of cross-attention
+                 attn_output = self._forward_cross_attn(x, past_key_values, attention_mask)
+         # MQA / GQA
+         else:
+             # Regardless of whether `past_key_values` are supplied, MQA/GQA always uses
+             # cross-attention because `q` and `kv` may have different numbers of heads
+             attn_output = self._forward_cross_attn(x, past_key_values, attention_mask)
+
+         output = rearrange(attn_output, "... h d -> ... (h d)")
+         output = self.out_proj(output)
+
+         return output if not self.return_residual else (output, x)
+
+
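For intuition about the packed `Wqkv` projection, a small arithmetic sketch (illustrative only, not part of the committed file; the MHA numbers match this model's text config, while the GQA setting is hypothetical since `n_head_kv` is null here):

import torch  # not needed for the arithmetic; shown for context

n_embd, n_head = 2560, 32          # values from this model's text_config
head_dim = n_embd // n_head        # 80
n_head_kv = n_head                 # MHA: every query head has its own K/V head
op_size = head_dim * (n_head + 2 * n_head_kv)
assert op_size == 3 * n_embd       # 7680: packed Q, K, V of equal width

n_head_kv = 8                      # hypothetical GQA setting
op_size = head_dim * (n_head + 2 * n_head_kv)
assert op_size == 2560 + 2 * 8 * 80  # Q keeps full width; K/V shrink to 8 heads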
+ class ParallelBlock(nn.Module):
+     """Parallel block.
+
+     This block applies parallel mixer and MLP layers to the input (used in GPT-J and CodeGen).
+
+     """
+
+     def __init__(
+         self,
+         config: PretrainedConfig,
+         block_idx: Optional[int] = None,
+     ) -> None:
+         super().__init__()
+
+         self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+         self.resid_dropout = nn.Dropout(config.resid_pdrop)
+         self.block_idx = block_idx
+
+         self.mixer = MHA(config, layer_idx=block_idx)
+         self.mlp = MLP(config)
+
+     def forward(
+         self,
+         hidden_states: torch.FloatTensor,
+         past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
+         attention_mask: Optional[torch.BoolTensor] = None,
+         **kwargs,
+     ) -> torch.FloatTensor:
+         residual = hidden_states
+         hidden_states = self.ln(hidden_states)
+
+         attn_outputs = self.mixer(
+             hidden_states,
+             past_key_values=past_key_values,
+             attention_mask=attention_mask,
+         )
+         if isinstance(attn_outputs, tuple):
+             attn_outputs = attn_outputs[0]
+
+         attn_outputs = self.resid_dropout(attn_outputs)
+         feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states))
+
+         hidden_states = attn_outputs + feed_forward_hidden_states + residual
+
+         return hidden_states
+
+
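The block wires attention and the MLP in parallel off a single pre-norm, rather than sequentially. A minimal runnable sketch of the same layout, with a plain `nn.Linear` standing in for the mixer (toy sizes, no dropout):

import torch
import torch.nn as nn

d = 16
ln = nn.LayerNorm(d)
attn = nn.Linear(d, d)   # stand-in for the MHA mixer
mlp = nn.Sequential(nn.Linear(d, 4 * d), nn.GELU(), nn.Linear(4 * d, d))

h = torch.randn(2, 5, d)
z = ln(h)                       # one shared pre-norm feeds both branches
out = attn(z) + mlp(z) + h      # parallel residual, as in ParallelBlock.forward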
+ class CausalLMHead(nn.Module):
+     """Causal Language Modeling head.
+
+     Reference:
+         Improving Language Understanding by Generative Pre-Training.
+         https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf
+
+     """
+
+     def __init__(self, config: PretrainedConfig) -> None:
+         super().__init__()
+
+         self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+         self.linear = nn.Linear(config.n_embd, config.vocab_size)
+
+     def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+         hidden_states = self.ln(hidden_states)
+         logits = self.linear(hidden_states).to(torch.float32)
+
+         return logits
+
+
+ class CausalLMLoss(nn.Module):
+     """Causal Language Modeling loss.
+
+     Reference:
+         Improving Language Understanding by Generative Pre-Training.
+         https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf
+
+     """
+
+     def __init__(self, shift_labels: bool = True) -> None:
+         super().__init__()
+
+         self.shift_labels = shift_labels
+         self.loss_fct = nn.CrossEntropyLoss()
+
+     def forward(self, logits: torch.FloatTensor, labels: torch.LongTensor) -> torch.FloatTensor:
+         if self.shift_labels:
+             logits = logits[..., :-1, :].contiguous()
+             labels = labels[..., 1:].contiguous()
+
+         loss = self.loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
+
+         return loss
+
+
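With `shift_labels=True`, position t's logits are scored against token t+1. A toy check of that alignment (illustrative, not part of the file):

import torch

logits = torch.randn(1, 4, 10)          # (batch, seq, vocab)
labels = torch.tensor([[3, 1, 4, 1]])   # (batch, seq)

shifted_logits = logits[..., :-1, :]    # predictions for positions 0..2
shifted_labels = labels[..., 1:]        # targets are tokens 1..3
loss = torch.nn.CrossEntropyLoss()(
    shifted_logits.reshape(-1, 10), shifted_labels.reshape(-1)
)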
+ class PhiPreTrainedModel(PreTrainedModel):
+     """Phi pre-trained model."""
+
+     config_class = PhiConfig
+     base_model_prefix = "transformer"
+     supports_gradient_checkpointing = False
+     _no_split_modules = ["ParallelBlock"]
+
+     def __init__(self, *inputs, **kwargs) -> None:
+         super().__init__(*inputs, **kwargs)
+
+     def _init_weights(self, module: nn.Module) -> None:
+         if isinstance(module, (nn.Linear,)):
+             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+             if module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.Embedding):
+             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+             if module.padding_idx is not None:
+                 module.weight.data[module.padding_idx].zero_()
+         elif isinstance(module, nn.LayerNorm):
+             if module.bias is not None:
+                 module.bias.data.zero_()
+             module.weight.data.fill_(1.0)
+
+     def prepare_inputs_for_generation(
+         self,
+         input_ids: torch.LongTensor,
+         past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
+         attention_mask: Optional[Union[torch.LongTensor, torch.BoolTensor]] = None,
+         **kwargs,
+     ) -> Dict[str, Any]:
+         if past_key_values is None or not isinstance(past_key_values, InferenceParams):
+             past_key_values = InferenceParams(
+                 max_seqlen=self.config.n_positions,
+                 max_batch_size=input_ids.shape[0],
+                 seqlen_offset=0,
+                 batch_size_offset=0,
+                 key_value_memory_dict={},
+                 lengths_per_sample=None,
+             )
+         else:
+             # Assume that `past_key_values` has cached all tokens up to the last token in `input_ids`
+             past_key_values.seqlen_offset = input_ids.shape[1] - 1
+             input_ids = input_ids[:, -1].unsqueeze(-1)
+             attention_mask = attention_mask[:, -1].unsqueeze(-1)
+
+         return {
+             "input_ids": input_ids,
+             "past_key_values": past_key_values,
+             "attention_mask": attention_mask,
+         }
+
+
+ class PhiModel(PhiPreTrainedModel):
+     """Phi model."""
+
+     _keys_to_ignore_on_load_missing = [""]
+     _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.mlp\.(fc_in|fc_out)\.(weight|bias)"]
+
+     def __init__(self, config: PhiConfig) -> None:
+         super().__init__(config)
+
+         self.embd = Embedding(config)
+         self.h = nn.ModuleList([ParallelBlock(config, block_idx=i) for i in range(config.n_layer)])
+         self.gradient_checkpointing = False
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.embd
+
+     def set_input_embeddings(self, new_embeddings) -> None:
+         self.embd.wte = new_embeddings
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
+         attention_mask: Optional[torch.BoolTensor] = None,
+     ) -> torch.FloatTensor:
+         if input_ids is not None:
+             hidden_states = self.embd(input_ids)
+         elif inputs_embeds is not None:
+             hidden_states = inputs_embeds
+         else:
+             raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+         for layer in self.h:
+             hidden_states = layer(
+                 hidden_states,
+                 past_key_values=past_key_values,
+                 attention_mask=attention_mask,
+             )
+
+         return hidden_states
+
+
+ class PhiForCausalLM(PhiPreTrainedModel):
+     """Phi for Causal Language Modeling."""
+
+     _keys_to_ignore_on_load_missing = [""]
+     _keys_to_ignore_on_load_unexpected = [r"transformer\.h\.\d+\.mlp\.(fc_in|fc_out)\.(weight|bias)"]
+
+     supports_gradient_checkpointing = True
+     _no_split_modules = ["ParallelBlock"]
+     _skip_keys_device_placement = "past_key_values"
+
+     def __init__(self, config: PhiConfig) -> None:
+         super().__init__(config)
+
+         self.transformer = PhiModel(config)
+         self.lm_head = CausalLMHead(config)
+         self.loss = CausalLMLoss()
+
+         self.post_init()
+
+     def get_output_embeddings(self):
+         return self.lm_head
+
+     def set_output_embeddings(self, new_embeddings) -> None:
+         self.lm_head.linear = new_embeddings
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
+         attention_mask: Optional[torch.BoolTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         **kwargs,
+     ) -> CausalLMOutputWithPast:
+         hidden_states = self.transformer(
+             input_ids,
+             inputs_embeds=inputs_embeds,
+             past_key_values=past_key_values,
+             attention_mask=attention_mask,
+         )
+         lm_logits = self.lm_head(hidden_states)
+
+         loss = None
+         if labels is not None:
+             loss = self.loss(lm_logits, labels)
+
+         return CausalLMOutputWithPast(loss=loss, logits=lm_logits, past_key_values=past_key_values)
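Generation works through `InferenceParams` as a KV cache: the first call runs the whole prompt through self-attention, after which `prepare_inputs_for_generation` advances `seqlen_offset` and feeds only the newest token, routing `MHA.forward` through the cross-attention path against the cached keys/values. A hedged greedy-decoding sketch of that loop (in practice `generate()` does this for you; `model` is a `PhiForCausalLM`, `input_ids` a (1, seq_len) LongTensor):

import torch

def greedy_decode(model, input_ids, max_new_tokens=16):
    past_key_values = None
    attention_mask = torch.ones_like(input_ids)
    for _ in range(max_new_tokens):
        inputs = model.prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, attention_mask=attention_mask
        )
        out = model(**inputs)
        # Pick the highest-probability next token and append it
        next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
        input_ids = torch.cat([input_ids, next_token], dim=-1)
        attention_mask = torch.cat([attention_mask, torch.ones_like(next_token)], dim=-1)
        past_key_values = out.past_key_values  # reuse the cache on the next step
    return input_ids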
processing_llava.py ADDED
@@ -0,0 +1,101 @@
+ # coding=utf-8
+ # Copyright 2023 The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ Processor class for Llava.
+ """
+
+ from typing import List, Optional, Union
+
+ import torch
+ from open_clip.transform import PreprocessCfg, image_transform_v2
+ from transformers.feature_extraction_utils import BatchFeature
+ from transformers.image_utils import ImageInput
+ from transformers.tokenization_utils_base import (
+     PaddingStrategy,
+     PreTokenizedInput,
+     TextInput,
+     TruncationStrategy,
+ )
+ from transformers.utils import TensorType
+
+
+ class OpenCLIPImageProcessor:
+     def __init__(self, config):
+         cfg = PreprocessCfg(**config)
+         self.transform = image_transform_v2(cfg=cfg, is_train=False)
+
+     def __call__(self, image, return_tensors):
+         if isinstance(image, list):
+             # Stack the per-image tensors; `torch.tensor(...)` does not accept
+             # a list of tensors.
+             outputs = [self.transform(item) for item in image]
+             return {
+                 "pixel_values": torch.stack(outputs),
+             }
+         output = self.transform(image)
+         return {
+             "pixel_values": output.unsqueeze(0),
+         }
+
+     @property
+     def model_input_names(self):
+         return ["pixel_values"]
+
+
+ class LlavaProcessor:
+     def __init__(self, image_processor: OpenCLIPImageProcessor, tokenizer):
+         self.image_processor = image_processor
+         self.tokenizer = tokenizer
+
+     def __call__(
+         self,
+         text: Union[
+             TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
+         ] = None,
+         images: ImageInput = None,
+         padding: Union[bool, str, PaddingStrategy] = False,
+         truncation: Union[bool, str, TruncationStrategy] = None,
+         max_length=None,
+         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+     ) -> BatchFeature:
+         if images is not None:
+             pixel_values = self.image_processor(images, return_tensors=return_tensors)[
+                 "pixel_values"
+             ]
+         else:
+             pixel_values = None
+         text_inputs = self.tokenizer(
+             text,
+             return_tensors=return_tensors,
+             padding=padding,
+             truncation=truncation,
+             max_length=max_length,
+         )
+
+         return BatchFeature(data={**text_inputs, "pixel_values": pixel_values})
+
+     def batch_decode(self, *args, **kwargs):
+         return self.tokenizer.batch_decode(*args, **kwargs)
+
+     def decode(self, *args, **kwargs):
+         return self.tokenizer.decode(*args, **kwargs)
+
+     @property
+     def model_input_names(self):
+         tokenizer_input_names = self.tokenizer.model_input_names
+         image_processor_input_names = self.image_processor.model_input_names
+         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
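A hedged usage sketch for the processor; `vision_preprocess_cfg` is an assumed OpenCLIP preprocessing dict for the model's vision tower (keys as in `open_clip`'s `PreprocessCfg`), and `<image>` is the placeholder token registered at id 50297:

from PIL import Image
from transformers import AutoTokenizer

# Assumption: vision_preprocess_cfg matches the vision tower's OpenCLIP config.
tokenizer = AutoTokenizer.from_pretrained("visheratin/LLaVA-3b")
image_processor = OpenCLIPImageProcessor(vision_preprocess_cfg)
processor = LlavaProcessor(image_processor, tokenizer)

image = Image.open("example.jpg")
inputs = processor(text="<image> Describe this picture.", images=image)
# inputs now carries input_ids, attention_mask, and pixel_values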
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,357 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "50256": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50257": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50258": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50259": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50260": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50261": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50262": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50263": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50264": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50265": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50266": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50267": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50268": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50269": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50270": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50271": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50272": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50273": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50274": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50275": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50276": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50277": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50278": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50279": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50280": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50281": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50282": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50283": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50284": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50285": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50286": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50287": {
+       "content": "\t\t\t\t\t\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50288": {
+       "content": "\t\t\t\t\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50289": {
+       "content": "\t\t\t\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50290": {
+       "content": "\t\t\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50291": {
+       "content": "\t\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50292": {
+       "content": "\t\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50293": {
+       "content": "\t\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50294": {
+       "content": "\t\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50295": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50296": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50297": {
+       "content": "<image>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50298": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|endoftext|>",
+   "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<|im_end|>",
+   "model_max_length": 2048,
+   "pad_token": "<pad>",
+   "tokenizer_class": "CodeGenTokenizer",
+   "unk_token": "<|endoftext|>"
+ }
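The `chat_template` above is the ChatML format. A short sketch of what it renders to (assuming the tokenizer is loaded from this repo, as in the earlier example):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("visheratin/LLaVA-3b")
messages = [{"role": "user", "content": "<image> What is in this picture?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# prompt == "<|im_start|>user\n<image> What is in this picture?<|im_end|>\n<|im_start|>assistant\n"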
vocab.json ADDED
The diff for this file is too large to render. See raw diff