Florian Valade committed
Commit 24bcb4b · Parent: 0fd92ab

modify forward to accept batch

.gitattributes DELETED
@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
BranchyModel.py CHANGED
@@ -303,6 +303,15 @@ class BranchyCausalModel(PreTrainedModel):
         is_early_exited = False
         next_decoder_cache = None
 
+        batch_size = hidden_states.shape[0]
+        seq_length = hidden_states.shape[1]
+        device = hidden_states.device
+
+        # Track which samples have exited early
+        early_exit_mask = torch.zeros(batch_size, dtype=torch.bool, device=device)
+        exit_layer = torch.full((batch_size,), self.num_layers, dtype=torch.long, device=device)
+        final_logits = torch.zeros((batch_size, seq_length, self.vocab_size), device=device)
+
         for layer, decoder_layer in enumerate(self.model.layers):
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
@@ -326,32 +335,36 @@ class BranchyCausalModel(PreTrainedModel):
                 output_attentions=output_attentions,
                 use_cache=use_cache,
             )
-            hidden_states = layer_outputs[0]
+
             if layer in self.config.branch_locations:
-                logits = self.branches[self.config.branch_locations.index(layer)](layer_outputs[0])
+                branch_logits = self.branches[self.config.branch_locations.index(layer)](layer_outputs[0])
                 if not self.training:
                     # During inference, calculate score on the fly to decide if we should early exit
-                    score = self.confidence_metric_fn(logits)[..., -1]  # score for the classified token TODO migth be interesting to take score from whole vector ?
-                    if score > self.head_thresholds[self.config.branch_locations.index(layer)]:
-                        is_early_exited = True
-                        logger.debug(f"Early exit at layer {layer} with score {score}")
-                        break
+                    scores = self.confidence_metric_fn(branch_logits)[..., -1]
+                    exit_samples = (scores > self.head_thresholds[self.config.branch_locations.index(layer)]) & ~early_exit_mask
+                    early_exit_mask |= exit_samples
+                    exit_layer[exit_samples] = layer
+                    final_logits[exit_samples] = branch_logits[exit_samples]
+
+                    if early_exit_mask.all():
+                        break  # All samples have exited early
                 else:
                     # if in training we return full logits
-                    all_logits += (logits,)
+                    all_logits += (branch_logits,)
 
-
+            hidden_states = layer_outputs[0]
 
             if use_cache:
                 next_decoder_cache = layer_outputs[2 if output_attentions else 1]
 
             if output_attentions:
                 all_self_attns += (layer_outputs[1],)
-        if not is_early_exited:
-            logger.debug(f"No early exit")
-            hidden_states = self.model.final_layernorm(hidden_states)
-            logits = self.lm_head(hidden_states)
-            logits = logits.float()
+
+        if not early_exit_mask.all():
+            remaining_hidden_states = hidden_states[~early_exit_mask]
+            remaining_hidden_states = self.model.final_layernorm(remaining_hidden_states)
+            remaining_logits = self.lm_head(remaining_hidden_states)
+            final_logits[~early_exit_mask] = remaining_logits
 
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
@@ -366,17 +379,18 @@ class BranchyCausalModel(PreTrainedModel):
         )
         if not return_dict:
             raise NotImplementedError("return_dict=False is not implemented")
+
         return CausalBranchyLLMOutputWithPast(
             loss=loss[0],
             head_loss=loss[1],
             entropies=loss[2],
             entropy=loss[3],
-            logits=logits,  # shape (batch_size, seq_len, vocab_size)
-            head_logits=all_logits,  # shape (num_branches, batch_size, seq_len, vocab_size)
+            logits=final_logits,
+            head_logits=all_logits,
             past_key_values=next_cache,
             hidden_states=all_hidden_states,
             attentions=all_self_attns,
-            head_indices=layer,
+            head_indices=exit_layer,
         )
 
     def compute_self_supervision_loss(self,
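
The substance of this commit is visible in the diff above: the scalar `is_early_exited` flag and unconditional `break` are replaced by per-sample tensors (`early_exit_mask`, `exit_layer`, `final_logits`), so each sequence in a batch can leave at its own branch. The sketch below is not the repository's code; it is a minimal, self-contained rendering of that masking pattern, with a plausible `breaking_ties` confidence function (the metric named in the config; its exact definition here is an assumption) and stand-in arguments for the layer stack, branch heads, and thresholds.

```python
import torch
from torch import nn


def breaking_ties(logits: torch.Tensor) -> torch.Tensor:
    """One plausible "breaking ties" confidence score (assumed, not taken
    from the repo): the margin between the two largest logits per position."""
    top2 = logits.topk(2, dim=-1).values
    return top2[..., 0] - top2[..., 1]  # (batch, seq)


def batched_early_exit(
    hidden: torch.Tensor,                 # (batch, seq, dim)
    layers: nn.ModuleList,                # decoder layers
    branch_heads: dict[int, nn.Module],   # layer index -> exit head
    thresholds: dict[int, float],         # layer index -> exit threshold
    final_norm: nn.Module,
    final_head: nn.Linear,
):
    batch_size, seq_length, _ = hidden.shape
    device = hidden.device
    vocab_size = final_head.out_features

    exited = torch.zeros(batch_size, dtype=torch.bool, device=device)
    exit_layer = torch.full((batch_size,), len(layers), dtype=torch.long, device=device)
    final_logits = torch.zeros(batch_size, seq_length, vocab_size, device=device)

    for i, layer in enumerate(layers):
        hidden = layer(hidden)
        if i in branch_heads:
            logits = branch_heads[i](hidden)
            scores = breaking_ties(logits)[:, -1]       # last-token confidence, per sample
            newly = (scores > thresholds[i]) & ~exited  # only still-running samples may exit
            exited |= newly
            exit_layer[newly] = i
            final_logits[newly] = logits[newly]
            if exited.all():
                break  # every sample has exited early
    if not exited.all():
        # Samples that never exited use the full model's head.
        final_logits[~exited] = final_head(final_norm(hidden[~exited]))
    return final_logits, exit_layer
```

As in the commit, samples that have already exited keep flowing through later layers until every sample has exited; the mask only protects their stored logits from being overwritten, so realizing actual compute savings would additionally require compacting the still-running rows out of the batch.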
BranchyModelConfig.py DELETED
@@ -1,78 +0,0 @@
-from typing import List, Optional
-from transformers import PretrainedConfig
-import logging
-
-logger = logging.getLogger(__name__)
-
-class BranchyModelConfig(PretrainedConfig):
-    """
-    Configuration class for BranchyModel. This class extends the PretrainedConfig class from the Transformers
-    library, providing configuration specific to models with branch functionality.
-
-    Attributes:
-        branch_locations (List[int]): Specifies the indices of layers after which branches are added. These indices
-            start from 0, and each index represents a layer in the underlying transformer model.
-        penalty_weight (Optional[float]): The weight of the penalty term used in the "penalized_cross_entropy" loss.
-            This parameter is required and must be greater than 0.
-        window_size (int): Determines the number of tokens each branch considers from the input sequence. This allows
-            for reducing the computational load by limiting the context size each branch processes.
-
-    Example:
-        config = BranchyModelConfig(
-            branch_locations=[2, 4, 6],
-            window_size=512
-        )
-
-    Note:
-        This configuration class is specifically designed for use with the BranchyModel class, enabling flexible
-        and customizable branching within transformer models.
-    """
-    model_type = "branchy"  # Optional, but useful for identifying the model type in the Transformers library
-
-    def __init__(
-        self,
-        model_str: str = None,
-        head_thresholds: Optional[List[float]] = None,
-        confidence_metric: Optional[str] = "breaking_ties",
-        branch_locations: Optional[List[int]] = None,
-        branch_number: Optional[int] = 3,
-        penalty_weight: Optional[float] = 0,
-        head_window_size: int = 512,
-        copy_lm_head: Optional[bool] = False,
-        **kwargs
-    ):
-        """
-        Initializes the BranchyModelConfig.
-
-        Args:
-            model_str (str): The model string to be used for the model. From Huggingface's model hub.
-            branch_locations (List[int], optional): Locations of the branches. Defaults to None, indicating no branches.
-            branch_number (Optional[int], optional): Number of branches if branch_locations is not provided. Defaults to 3.
-            penalty_weight (Optional[float], optional): Weight for the penalty in loss calculation.
-                Defaults to None.
-            head_window_size (int, optional): Number of tokens each branch can see. Defaults to 512.
-        """
-        self.model_str = model_str
-        self.head_thresholds = head_thresholds
-        self.confidence_metric = confidence_metric
-        assert self.confidence_metric in ["breaking_ties", "max"], "confidence_metric must be 'breaking_ties' or 'max'. It should depend on how you found the thresholds."
-        self.branch_locations = branch_locations
-        self.penalty_weight = penalty_weight
-        self.head_window_size = head_window_size
-        if branch_locations is not None and branch_number is not None:
-            logger.warning("Both branch_locations and branch_number are provided. Using branch_locations.")
-        self.branch_number = branch_number if branch_locations is None else len(branch_locations)
-        self.copy_lm_head = copy_lm_head
-        #assert self.model_str is not None, "model_str must be provided."
-        assert self.branch_number > 0, "branch_number must be a positive integer."
-        assert isinstance(self.penalty_weight, float) or isinstance(self.penalty_weight, int), "penalty_weight must be a float or an integer."
-        assert self.penalty_weight >= 0 and self.penalty_weight <= 1, "penalty_weight must be in the range [0, 1]."
-        if branch_locations is not None:
-            assert all([isinstance(loc, int) for loc in self.branch_locations]), "Branch locations must be integers."
-            assert all([loc >= 0 for loc in self.branch_locations]), "Branch locations must be non-negative."
-        if self.head_window_size is not None:
-            assert self.head_window_size > 0, "head_window_size must be a positive integer or None."
-        if type(self.head_thresholds) == list:
-            assert len(self.head_thresholds) == self.branch_number, "Number of thresholds must match number of branches."
-            assert all([isinstance(threshold, float) for threshold in self.head_thresholds]), "Thresholds must be floats."
-        super().__init__(**kwargs)  # Initialize with base class parameters
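
As a quick reference for the class removed above, here is a hypothetical usage sketch (it assumes `BranchyModelConfig.py` is importable from the working directory; nothing in it is shipped by the repo itself). It shows `branch_number` being derived from `branch_locations` and the constructor's assertion rejecting a threshold list whose length does not match the number of branches.

```python
# Hypothetical usage sketch of the deleted BranchyModelConfig class.
from BranchyModelConfig import BranchyModelConfig

# Four branches at layers 6/12/18/24, mirroring the config.json below.
config = BranchyModelConfig(
    model_str="microsoft/phi-2",
    branch_locations=[6, 12, 18, 24],
    head_thresholds=[10.0, 10.0, 10.0, 10.0],  # one float per branch
    penalty_weight=0.9,
    head_window_size=512,
)
assert config.branch_number == 4  # derived from len(branch_locations)

# A threshold list whose length does not match the number of branches
# trips the constructor's assertion:
try:
    BranchyModelConfig(branch_locations=[6, 12], head_thresholds=[10.0])
except AssertionError as err:
    print(err)  # Number of thresholds must match number of branches.
```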
README.md DELETED
@@ -1,82 +0,0 @@
----
-language:
-- en
-license: mit
-library_name: transformers
-pipeline_tag: text-generation
----
-
-# Model Card for Model ID
-
-Phi-2 is a Transformer with **2.7 billion** parameters. It was trained using the same data sources as [Phi-1.5](https://huggingface.co/microsoft/phi-1.5), augmented with a new data source that consists of various NLP synthetic texts and filtered websites (for safety and educational value). When assessed against benchmarks testing common sense, language understanding, and logical reasoning, Phi-2 showcased nearly state-of-the-art performance among models with less than 13 billion parameters.
-
-This version of Phi-2 has added Early Exits in order to accelerate inference. Each Early Exit head was trained with a self-supervised technique on the model's outputs.
-
-### Model Description
-
-This model provides trained heads that make Phi-2 an Early Exit model.
-
-- **Developed by:** Florian Valade
-- **Shared by:** Florian Valade
-- **Model type:** Text generation
-- **License:** MIT
-- **Finetuned from model:** https://huggingface.co/microsoft/phi-2
-
-### Model Sources
-
-- **Repository:** [TBD]
-- **Paper:** [TBD]
-- **Demo:** [TBD]
-
-## Uses
-
-When used as provided, the model does not use Early Exits. One needs to set head_thresholds in the configuration in order to use inference acceleration.
-
-Different head_thresholds for different ε:
-
-| ε   | head_thresholds                                                                  |
-| --- | -------------------------------------------------------------------------------- |
-| 0.4 | [1.0307843685150146, 0.8693032264709473, 0.6637287139892578, 0.3111608028411865] |
-| 0.5 | [1.505380630493164, 1.5712471008300781, 1.1971790790557861, 0.6908178329467773]  |
-| 0.6 | [2.0270779132843018, 1.8969502449035645, 1.4789371490478516, 0.9875392913818359] |
-| 0.7 | [2.506962537765503, 2.656052589416504, 1.924393653869629, 1.4434680938720703]    |
-| 0.8 | [3.3786778450012207, 2.568857192993164, 2.5665550231933594, 2.006620407104492]   |
-| 0.9 | [3.187114715576172, 3.442272663116455, 2.636230945587158, 2.460529088973999]     |
-
-Once you have selected the thresholds, you can use:
-
-```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model = AutoModelForCausalLM.from_pretrained("valcore/branchy_phi-2_base", trust_remote_code=True, device_map="cpu")
-tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-2")
-
-model.eval()
-
-inputs = tokenizer('''def print_prime(n):
-    """
-    Print all primes between 1 and n
-    """''', return_tensors="pt", return_attention_mask=False)
-# Put the selected thresholds here:
-model.head_thresholds = torch.tensor([3.187114715576172, 3.442272663116455, 2.636230945587158, 2.460529088973999])
-
-outputs = model.generate(**inputs, max_length=200)
-text = tokenizer.batch_decode(outputs)[0]
-print(text)
-```
-
-## Citation [optional]
-
-<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
-
-**BibTeX:**
-
-TBD
-
-
-## Model Card Contact
-
-Florian Valade
config.json DELETED
@@ -1,30 +0,0 @@
-{
-  "architectures": [
-    "BranchyCausalModel"
-  ],
-  "auto_map": {
-    "AutoConfig": "BranchyModelConfig.BranchyModelConfig",
-    "AutoModelForCausalLM": "BranchyModel.BranchyCausalModel"
-  },
-  "branch_locations": [
-    6,
-    12,
-    18,
-    24
-  ],
-  "branch_number": 4,
-  "confidence_metric": "breaking_ties",
-  "copy_lm_head": false,
-  "head_thresholds": [
-    10.0,
-    10.0,
-    10.0,
-    10.0
-  ],
-  "head_window_size": 512,
-  "model_str": "microsoft/phi-2",
-  "model_type": "branchy",
-  "penalty_weight": 0.9,
-  "torch_dtype": "float32",
-  "transformers_version": "4.40.2"
-}
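
This shipped configuration matches the README above: the `head_thresholds` of 10.0 sit far above anything the calibrated thresholds suggest a branch's confidence score reaches (the README's values top out around 3.4), so by default no branch fires and the model behaves like plain Phi-2. To enable acceleration, override the thresholds after loading, as the README shows; a sketch using the ε = 0.7 row:

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "valcore/branchy_phi-2_base", trust_remote_code=True, device_map="cpu"
)
# ε = 0.7 thresholds from the README table; a lower ε gives lower thresholds,
# hence more frequent early exits.
model.head_thresholds = torch.tensor(
    [2.506962537765503, 2.656052589416504, 1.924393653869629, 1.4434680938720703]
)
```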
generation_config.json DELETED
@@ -1,4 +0,0 @@
-{
-  "_from_model_config": true,
-  "transformers_version": "4.40.2"
-}
model-00001-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:de9690424cd10d30cc5bbbf31b5ba7149fe2d1b4d1c9b3e28378c37496dfddcc
-size 4982355512
model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:526f616bb5753775548b200b2d5afffa862bf3a17cf53c004b1ba8d702fb5890
-size 4982541984
model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:963120cc9ecfbdd250e505d8a33ef881aa1cc393b06fe3bc9a7b7be286c3c242
-size 3251942344
model.safetensors.index.json DELETED
@@ -1,476 +0,0 @@
-{
-  "metadata": {
-    "total_size": 13216788480
-  },
-  "weight_map": {
-    "branches.0.layernorm.bias": "model-00003-of-00003.safetensors",
-    "branches.0.layernorm.weight": "model-00003-of-00003.safetensors",
-    "branches.0.lm_head.bias": "model-00003-of-00003.safetensors",
-    "branches.0.lm_head.weight": "model-00003-of-00003.safetensors",
-    "branches.1.layernorm.bias": "model-00003-of-00003.safetensors",
-    "branches.1.layernorm.weight": "model-00003-of-00003.safetensors",
-    "branches.1.lm_head.bias": "model-00003-of-00003.safetensors",
-    "branches.1.lm_head.weight": "model-00003-of-00003.safetensors",
-    "branches.2.layernorm.bias": "model-00003-of-00003.safetensors",
-    "branches.2.layernorm.weight": "model-00003-of-00003.safetensors",
-    "branches.2.lm_head.bias": "model-00003-of-00003.safetensors",
-    "branches.2.lm_head.weight": "model-00003-of-00003.safetensors",
-    "branches.3.layernorm.bias": "model-00003-of-00003.safetensors",
-    "branches.3.layernorm.weight": "model-00003-of-00003.safetensors",
-    "branches.3.lm_head.bias": "model-00003-of-00003.safetensors",
-    "branches.3.lm_head.weight": "model-00003-of-00003.safetensors",
-    "lm_head.bias": "model-00003-of-00003.safetensors",
-    "lm_head.weight": "model-00003-of-00003.safetensors",
-    "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
-    "model.final_layernorm.bias": "model-00003-of-00003.safetensors",
-    "model.final_layernorm.weight": "model-00003-of-00003.safetensors",
-    "model.layers.0.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.layers.0.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.layers.0.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.1.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.layers.1.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.layers.1.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.10.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.layers.10.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.layers.10.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.11.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.layers.11.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.layers.11.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.12.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.layers.12.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.layers.12.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.13.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.layers.13.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.layers.13.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.14.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.layers.14.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.layers.14.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.layers.14.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.layers.14.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.layers.14.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.layers.14.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.15.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.layers.15.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.layers.15.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.layers.15.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.layers.15.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.layers.15.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.layers.15.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.16.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.layers.16.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.layers.16.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.layers.16.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.layers.16.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.layers.16.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.layers.16.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.17.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.layers.17.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.layers.17.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.layers.17.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.layers.17.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.layers.17.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.layers.17.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.18.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.layers.18.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.layers.18.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.layers.18.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.layers.18.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.layers.18.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.layers.18.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.19.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.layers.19.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.layers.19.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.layers.19.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.layers.19.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.layers.19.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.layers.19.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.2.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.layers.2.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.layers.2.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.20.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.layers.20.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.layers.20.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.layers.20.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.layers.20.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.layers.20.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.layers.20.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.21.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.layers.21.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.layers.21.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.layers.21.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.layers.21.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.layers.21.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.layers.21.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.22.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.layers.22.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.layers.22.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.layers.22.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.layers.22.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.layers.22.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.layers.22.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.23.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.layers.23.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.layers.23.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.layers.23.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.layers.23.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.layers.23.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.layers.23.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.24.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.layers.24.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.layers.24.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.layers.24.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.layers.24.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.layers.24.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.layers.24.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.25.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.layers.25.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.layers.25.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.layers.25.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.layers.25.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.layers.25.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.layers.25.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.26.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.layers.26.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.layers.26.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.layers.26.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.layers.26.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.layers.26.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.layers.26.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.27.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.layers.27.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.layers.27.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.layers.27.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.layers.27.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.layers.27.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.layers.27.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.28.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.layers.28.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.layers.28.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.layers.28.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.layers.28.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.layers.28.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.layers.28.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.layers.28.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.28.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.28.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.29.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.layers.29.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.layers.29.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.layers.29.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.layers.29.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.layers.29.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.layers.29.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.layers.29.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.29.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.29.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.3.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.layers.3.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.layers.3.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.30.input_layernorm.bias": "model-00003-of-00003.safetensors",
-    "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
-    "model.layers.30.mlp.fc1.bias": "model-00003-of-00003.safetensors",
-    "model.layers.30.mlp.fc1.weight": "model-00003-of-00003.safetensors",
-    "model.layers.30.mlp.fc2.bias": "model-00003-of-00003.safetensors",
-    "model.layers.30.mlp.fc2.weight": "model-00003-of-00003.safetensors",
-    "model.layers.30.self_attn.dense.bias": "model-00003-of-00003.safetensors",
-    "model.layers.30.self_attn.dense.weight": "model-00003-of-00003.safetensors",
-    "model.layers.30.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
-    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
-    "model.layers.30.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
-    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
-    "model.layers.30.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
-    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
-    "model.layers.31.input_layernorm.bias": "model-00003-of-00003.safetensors",
-    "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
-    "model.layers.31.mlp.fc1.bias": "model-00003-of-00003.safetensors",
-    "model.layers.31.mlp.fc1.weight": "model-00003-of-00003.safetensors",
-    "model.layers.31.mlp.fc2.bias": "model-00003-of-00003.safetensors",
-    "model.layers.31.mlp.fc2.weight": "model-00003-of-00003.safetensors",
-    "model.layers.31.self_attn.dense.bias": "model-00003-of-00003.safetensors",
-    "model.layers.31.self_attn.dense.weight": "model-00003-of-00003.safetensors",
-    "model.layers.31.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
-    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
-    "model.layers.31.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
-    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
-    "model.layers.31.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
-    "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
-    "model.layers.4.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.layers.4.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.layers.4.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.5.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.layers.5.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.layers.5.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.6.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.layers.6.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.layers.6.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.7.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.layers.7.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.layers.7.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.8.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.layers.8.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.layers.8.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.9.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.layers.9.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.layers.9.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors"
-  }
-}