winglian committed on
Commit
65e027e
1 Parent(s): b3fa02a

checkpoint 494

config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "architectures": [
+     "MightyLlamaForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "transformers.models.llama.configuration_llama.LlamaConfig",
+     "AutoModelForCausalLM": "openaccess-ai-collective/mighty-llama-1b--modeling_llama.MightyLlamaForCausalLM"
+   },
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": 5632,
+   "max_position_embeddings": 2048,
+   "model_type": "mighty-llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 22,
+   "num_key_value_heads": 4,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.37.0.dev0",
+   "use_cache": false,
+   "vocab_size": 32000
+ }
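
The "auto_map" entries above point AutoModelForCausalLM at the custom MightyLlamaForCausalLM class shipped in modeling_llama.py (added further down in this commit), so the checkpoint has to be loaded with trust_remote_code=True. A minimal loading sketch, not part of the commit, assuming the repo id openaccess-ai-collective/mighty-llama-1b taken from the auto_map value:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "openaccess-ai-collective/mighty-llama-1b"  # inferred from auto_map above; adjust if the checkpoint lives elsewhere

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype" in config.json
    trust_remote_code=True,      # lets auto_map resolve MightyLlamaForCausalLM from modeling_llama.py
)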
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "max_length": 2048,
+   "pad_token_id": 0,
+   "transformers_version": "4.37.0.dev0"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f0af19aa16c4d604f1a927c759de87ccb5fb0f003611998f1bd86294fc263132
+ size 2200119864
modeling_llama.py ADDED
@@ -0,0 +1,159 @@
+ from typing import Optional, List, Union, Tuple
+
+ import torch
+ from torch import nn
+ from transformers import LlamaModel, LlamaForCausalLM, Cache, DynamicCache
+ from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa, \
+     _prepare_4d_causal_attention_mask
+ from transformers.modeling_outputs import BaseModelOutputWithPast
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+ class MightyLlamaModel(LlamaModel):
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, BaseModelOutputWithPast]:
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         # retrieve input_ids and inputs_embeds
+         if input_ids is not None and inputs_embeds is not None:
+             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+         elif input_ids is not None:
+             batch_size, seq_length = input_ids.shape[:2]
+         elif inputs_embeds is not None:
+             batch_size, seq_length = inputs_embeds.shape[:2]
+         else:
+             raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+         if self.gradient_checkpointing and self.training:
+             if use_cache:
+                 logger.warning_once(
+                     "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                 )
+                 use_cache = False
+
+         past_key_values_length = 0
+         if use_cache:
+             use_legacy_cache = not isinstance(past_key_values, Cache)
+             if use_legacy_cache:
+                 past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+             past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+         if position_ids is None:
+             device = input_ids.device if input_ids is not None else inputs_embeds.device
+             position_ids = torch.arange(
+                 past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+             )
+             position_ids = position_ids.unsqueeze(0)
+
+         if inputs_embeds is None:
+             inputs_embeds = self.embed_tokens(input_ids)
+
+         if self._use_flash_attention_2:
+             # 2d mask is passed through the layers
+             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+         elif self._use_sdpa and not output_attentions:
+             # output_attentions=True can not be supported when using SDPA, and we fall back on
+             # the manual implementation that requires a 4D causal mask in all cases.
+             attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                 attention_mask,
+                 (batch_size, seq_length),
+                 inputs_embeds,
+                 past_key_values_length,
+             )
+         else:
+             # 4d mask is passed through the layers
+             attention_mask = _prepare_4d_causal_attention_mask(
+                 attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+             )
+
+         # embed positions
+         hidden_states = inputs_embeds
+
+         # decoder layers
+         all_hidden_states = () if output_hidden_states else None
+         all_self_attns = () if output_attentions else None
+         next_decoder_cache = None
+
+         # Custom layer schedule: run layers 0-2, 4-16, and 18 onward, skipping layers 3 and 17.
+         layer_order = list(range(0, 3)) + [x for _ in range(1) for x in range(4, 17)] + list(range(18, len(self.layers)))
+         for idx, layer_idx in enumerate(layer_order):
+             decoder_layer = self.layers[layer_idx]
+             if output_hidden_states:
+                 all_hidden_states += (hidden_states,)
+
+             if self.gradient_checkpointing and self.training:
+                 layer_outputs = self._gradient_checkpointing_func(
+                     decoder_layer.__call__,
+                     hidden_states,
+                     attention_mask,
+                     position_ids,
+                     past_key_values,
+                     output_attentions,
+                     use_cache,
+                 )
+             else:
+                 layer_outputs = decoder_layer(
+                     hidden_states,
+                     attention_mask=attention_mask,
+                     position_ids=position_ids,
+                     past_key_value=past_key_values,
+                     output_attentions=output_attentions,
+                     use_cache=use_cache,
+                 )
+
+             hidden_states = layer_outputs[0]
+
+             if use_cache:
+                 next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+             if output_attentions:
+                 all_self_attns += (layer_outputs[1],)
+
+         hidden_states = self.norm(hidden_states)
+
+         # add hidden states from the last decoder layer
+         if output_hidden_states:
+             all_hidden_states += (hidden_states,)
+
+         next_cache = None
+         if use_cache:
+             next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+         if not return_dict:
+             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+         return BaseModelOutputWithPast(
+             last_hidden_state=hidden_states,
+             past_key_values=next_cache,
+             hidden_states=all_hidden_states,
+             attentions=all_self_attns,
+         )
+
+
+ class MightyLlamaForCausalLM(LlamaForCausalLM):
+     def __init__(self, config):
+         super().__init__(config)
+         self.model = MightyLlamaModel(config)
+
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+         self.mem_id = None
+         self.mem_freq = None
+         self.top_k = None
+         self.max_seq_len = None
+
+         # Initialize weights and apply final processing
+         self.post_init()
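
For reference, a small sketch (not part of the commit) of what the custom layer schedule in MightyLlamaModel.forward evaluates to with the "num_hidden_layers": 22 setting from config.json; the inner comprehension reduces to a plain range(4, 17), so two layers are skipped entirely:

# Reproduces the layer_order expression from MightyLlamaModel.forward for a 22-layer model.
num_layers = 22  # "num_hidden_layers" in config.json
layer_order = list(range(0, 3)) + [x for _ in range(1) for x in range(4, 17)] + list(range(18, num_layers))
print(layer_order)       # [0, 1, 2, 4, 5, ..., 15, 16, 18, 19, 20, 21]
print(len(layer_order))  # 20 -- layers 3 and 17 are never executed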
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "</s>",
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": false,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "</s>",
+   "padding_side": "right",
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "tokenizer_class": "LlamaTokenizer",
+   "trust_remote_code": false,
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false,
+   "use_fast": true
+ }