InternLM-Math commited on
Commit
08ad504
·
verified ·
1 Parent(s): a7e1823

Upload folder using huggingface_hub

Browse files
added_tokens.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[UNUSED_TOKEN_141]": 92544,
3
+ "[UNUSED_TOKEN_142]": 92545,
4
+ "[UNUSED_TOKEN_143]": 92546,
5
+ "[UNUSED_TOKEN_144]": 92547,
6
+ "[UNUSED_TOKEN_145]": 92548,
7
+ "[UNUSED_TOKEN_146]": 92549
8
+ }
config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "internlm/internlm2_5-step-prover-critic",
3
+ "architectures": [
4
+ "InternLM2ForRewardModel"
5
+ ],
6
+ "attn_implementation": "flash_attention_2",
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_internlm2.InternLM2Config",
9
+ "AutoModel": "modeling_internlm2.InternLM2ForRewardModel"
10
+ },
11
+ "bias": false,
12
+ "bos_token_id": 1,
13
+ "eos_token_id": 2,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 2048,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 8192,
18
+ "max_position_embeddings": 32768,
19
+ "model_type": "internlm2",
20
+ "num_attention_heads": 16,
21
+ "num_hidden_layers": 24,
22
+ "num_key_value_heads": 8,
23
+ "pad_token_id": 2,
24
+ "pretraining_tp": 1,
25
+ "reward_token_id": 92527,
26
+ "rms_norm_eps": 1e-05,
27
+ "rope_scaling": {
28
+ "factor": 2.0,
29
+ "type": "dynamic"
30
+ },
31
+ "rope_theta": 1000000,
32
+ "tie_word_embeddings": false,
33
+ "torch_dtype": "float16",
34
+ "transformers_version": "4.42.3",
35
+ "use_cache": true,
36
+ "vocab_size": 92544
37
+ }
configuration_internlm2.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on transformers/src/transformers/models/llama/configuration_llama.py
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ """ InternLM2 model configuration"""
18
+
19
+ from transformers.configuration_utils import PretrainedConfig
20
+ from transformers.utils import logging
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+ INTERNLM2_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
25
+
26
+
27
+ # Modified from transformers.model.llama.configuration_llama.LlamaConfig
28
+ class InternLM2Config(PretrainedConfig):
29
+ r"""
30
+ This is the configuration class to store the configuration of a [`InternLM2Model`]. It is used to instantiate
31
+ an InternLM2 model according to the specified arguments, defining the model architecture. Instantiating a
32
+ configuration with the defaults will yield a similar configuration to that of the InternLM2-7B.
33
+
34
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
35
+ documentation from [`PretrainedConfig`] for more information.
36
+
37
+
38
+ Args:
39
+ vocab_size (`int`, *optional*, defaults to 32000):
40
+ Vocabulary size of the InternLM2 model. Defines the number of different tokens that can be represented by the
41
+ `inputs_ids` passed when calling [`InternLM2Model`]
42
+ hidden_size (`int`, *optional*, defaults to 4096):
43
+ Dimension of the hidden representations.
44
+ intermediate_size (`int`, *optional*, defaults to 11008):
45
+ Dimension of the MLP representations.
46
+ num_hidden_layers (`int`, *optional*, defaults to 32):
47
+ Number of hidden layers in the Transformer decoder.
48
+ num_attention_heads (`int`, *optional*, defaults to 32):
49
+ Number of attention heads for each attention layer in the Transformer decoder.
50
+ num_key_value_heads (`int`, *optional*):
51
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
52
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
53
+ `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
54
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
55
+ by meanpooling all the original heads within that group. For more details checkout [this
56
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
57
+ `num_attention_heads`.
58
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
59
+ The non-linear activation function (function or string) in the decoder.
60
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
61
+ The maximum sequence length that this model might ever be used with. InternLM2 supports up to 32768 tokens.
62
+ initializer_range (`float`, *optional*, defaults to 0.02):
63
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
64
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
65
+ The epsilon used by the rms normalization layers.
66
+ use_cache (`bool`, *optional*, defaults to `True`):
67
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
68
+ relevant if `config.is_decoder=True`.
69
+ pad_token_id (`int`, *optional*):
70
+ Padding token id.
71
+ bos_token_id (`int`, *optional*, defaults to 1):
72
+ Beginning of stream token id.
73
+ eos_token_id (`int`, *optional*, defaults to 2):
74
+ End of stream token id.
75
+ pretraining_tp (`int`, *optional*, defaults to 1):
76
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
77
+ document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism)
78
+ to understand more about it. This value is necessary to ensure exact reproducibility
79
+ of the pretraining results. Please refer to [this
80
+ issue](https://github.com/pytorch/pytorch/issues/76232).
81
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
82
+ Whether to tie weight embeddings
83
+ rope_theta (`float`, *optional*, defaults to 10000.0):
84
+ The base period of the RoPE embeddings.
85
+ rope_scaling (`Dict`, *optional*):
86
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
87
+ strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
88
+ `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
89
+ `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
90
+ these scaling strategies behave:
91
+ https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
92
+ experimental feature, subject to breaking API changes in future versions.
93
+ """
94
+ _auto_class = "AutoConfig"
95
+ model_type = "internlm2"
96
+ keys_to_ignore_at_inference = ["past_key_values"]
97
+
98
+ def __init__( # pylint: disable=W0102
99
+ self,
100
+ vocab_size=103168,
101
+ hidden_size=4096,
102
+ intermediate_size=11008,
103
+ num_hidden_layers=32,
104
+ num_attention_heads=32,
105
+ num_key_value_heads=None,
106
+ hidden_act="silu",
107
+ max_position_embeddings=2048,
108
+ initializer_range=0.02,
109
+ rms_norm_eps=1e-6,
110
+ use_cache=True,
111
+ pad_token_id=0,
112
+ bos_token_id=1,
113
+ eos_token_id=2,
114
+ pretraining_tp=1,
115
+ tie_word_embeddings=False,
116
+ bias=True,
117
+ rope_theta=10000,
118
+ rope_scaling=None,
119
+ attn_implementation=None,
120
+ **kwargs,
121
+ ):
122
+ self.vocab_size = vocab_size
123
+ self.max_position_embeddings = max_position_embeddings
124
+ self.hidden_size = hidden_size
125
+ self.intermediate_size = intermediate_size
126
+ self.num_hidden_layers = num_hidden_layers
127
+ self.num_attention_heads = num_attention_heads
128
+ self.bias = bias
129
+
130
+ if num_key_value_heads is None:
131
+ num_key_value_heads = num_attention_heads
132
+ self.num_key_value_heads = num_key_value_heads
133
+
134
+ self.hidden_act = hidden_act
135
+ self.initializer_range = initializer_range
136
+ self.rms_norm_eps = rms_norm_eps
137
+ self.pretraining_tp = pretraining_tp
138
+ self.use_cache = use_cache
139
+ self.rope_theta = rope_theta
140
+ self.rope_scaling = rope_scaling
141
+ self._rope_scaling_validation()
142
+ self.attn_implementation = attn_implementation
143
+ if self.attn_implementation is None:
144
+ self.attn_implementation = "eager"
145
+
146
+ super().__init__(
147
+ pad_token_id=pad_token_id,
148
+ bos_token_id=bos_token_id,
149
+ eos_token_id=eos_token_id,
150
+ tie_word_embeddings=tie_word_embeddings,
151
+ **kwargs,
152
+ )
153
+
154
+ def _rope_scaling_validation(self):
155
+ """
156
+ Validate the `rope_scaling` configuration.
157
+ """
158
+ if self.rope_scaling is None:
159
+ return
160
+
161
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
162
+ raise ValueError(
163
+ "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
164
+ f"got {self.rope_scaling}"
165
+ )
166
+ rope_scaling_type = self.rope_scaling.get("type", None)
167
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
168
+ if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
169
+ raise ValueError(
170
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
171
+ )
172
+ if (
173
+ rope_scaling_factor is None
174
+ or not isinstance(rope_scaling_factor, (float, int))
175
+ or rope_scaling_factor < 1.0
176
+ ):
177
+ raise ValueError(
178
+ f"`rope_scaling`'s factor field must be a number >= 1, got {rope_scaling_factor} "
179
+ f"of type {type(rope_scaling_factor)}"
180
+ )
modeling_internlm2.py ADDED
@@ -0,0 +1,1578 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # This code is based on transformers/src/transformers/models/llama/modeling_llama.py
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ PyTorch InternLM2 model."""
17
+ import math
18
+ import queue
19
+ import threading
20
+ import warnings
21
+ from typing import List, Optional, Tuple, Union
22
+
23
+ import torch
24
+ import torch.nn.functional as F
25
+ import torch.utils.checkpoint
26
+ from einops import rearrange
27
+ from torch import nn
28
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
29
+ from transformers.activations import ACT2FN
30
+ from transformers.modeling_outputs import (
31
+ BaseModelOutputWithPast,
32
+ CausalLMOutputWithPast,
33
+ SequenceClassifierOutputWithPast,
34
+ )
35
+ from transformers.modeling_utils import PreTrainedModel
36
+ from transformers.utils import (
37
+ add_start_docstrings,
38
+ add_start_docstrings_to_model_forward,
39
+ logging,
40
+ replace_return_docstrings,
41
+ )
42
+
43
+ try:
44
+ from transformers.generation.streamers import BaseStreamer
45
+ except: # noqa # pylint: disable=bare-except
46
+ BaseStreamer = None
47
+
48
+ from .configuration_internlm2 import InternLM2Config
49
+
50
+ logger = logging.get_logger(__name__)
51
+
52
+ _CONFIG_FOR_DOC = "InternLM2Config"
53
+
54
+ flash_attn_func, flash_attn_varlen_func = None, None
55
+ pad_input, index_first_axis, unpad_input = None, None, None
56
+ def _import_flash_attn():
57
+ global flash_attn_func, flash_attn_varlen_func
58
+ global pad_input, index_first_axis, unpad_input
59
+ try:
60
+ from flash_attn import flash_attn_func as _flash_attn_func, flash_attn_varlen_func as _flash_attn_varlen_func
61
+ from flash_attn.bert_padding import pad_input as _pad_input, index_first_axis as _index_first_axis, unpad_input as _unpad_input
62
+ flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func
63
+ pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input
64
+ except ImportError:
65
+ raise ImportError("flash_attn is not installed.")
66
+
67
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
68
+ def _get_unpad_data(attention_mask):
69
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
70
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
71
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
72
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
73
+ return (
74
+ indices,
75
+ cu_seqlens,
76
+ max_seqlen_in_batch,
77
+ )
78
+
79
+
80
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
81
+ def _make_causal_mask(
82
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
83
+ ):
84
+ """
85
+ Make causal mask used for bi-directional self-attention.
86
+ """
87
+ bsz, tgt_len = input_ids_shape
88
+ mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
89
+ mask_cond = torch.arange(mask.size(-1), device=device)
90
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
91
+ mask = mask.to(dtype)
92
+
93
+ if past_key_values_length > 0:
94
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
95
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
96
+
97
+
98
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
99
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
100
+ """
101
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
102
+ """
103
+ bsz, src_len = mask.size()
104
+ tgt_len = tgt_len if tgt_len is not None else src_len
105
+
106
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
107
+
108
+ inverted_mask = 1.0 - expanded_mask
109
+
110
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
111
+
112
+
113
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM2
114
+ class InternLM2RMSNorm(nn.Module):
115
+ def __init__(self, hidden_size, eps=1e-6):
116
+ """
117
+ InternLM2RMSNorm is equivalent to T5LayerNorm
118
+ """
119
+ super().__init__()
120
+ self.weight = nn.Parameter(torch.ones(hidden_size))
121
+ self.variance_epsilon = eps
122
+
123
+ def forward(self, hidden_states):
124
+ input_dtype = hidden_states.dtype
125
+ hidden_states = hidden_states.to(torch.float32)
126
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
127
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
128
+ return self.weight * hidden_states.to(input_dtype)
129
+
130
+
131
+ # Copied from transformers.model.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM2
132
+ class InternLM2RotaryEmbedding(nn.Module):
133
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
134
+ super().__init__()
135
+
136
+ self.dim = dim
137
+ self.max_position_embeddings = max_position_embeddings
138
+ self.base = base
139
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
140
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
141
+
142
+ # Build here to make `torch.jit.trace` work.
143
+ self._set_cos_sin_cache(
144
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
145
+ )
146
+
147
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
148
+ self.max_seq_len_cached = seq_len
149
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
150
+
151
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
152
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
153
+ emb = torch.cat((freqs, freqs), dim=-1)
154
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
155
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
156
+
157
+ def forward(self, x, seq_len=None):
158
+ # x: [bs, num_attention_heads, seq_len, head_size]
159
+ if seq_len > self.max_seq_len_cached:
160
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.float32)
161
+
162
+ return (
163
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
164
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
165
+ )
166
+
167
+
168
+ # Copied from transformers.model.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->InternLM2
169
+ class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding):
170
+ """InternLM2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
171
+
172
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
173
+ self.scaling_factor = scaling_factor
174
+ super().__init__(dim, max_position_embeddings, base, device)
175
+
176
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
177
+ self.max_seq_len_cached = seq_len
178
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
179
+ t = t / self.scaling_factor
180
+
181
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
182
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
183
+ emb = torch.cat((freqs, freqs), dim=-1)
184
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
185
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
186
+
187
+
188
+ # Copied from transformers.model.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM2
189
+ class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding):
190
+ """InternLM2RotaryEmbedding extended with Dynamic NTK scaling.
191
+ Credits to the Reddit users /u/bloc97 and /u/emozilla.
192
+ """
193
+
194
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
195
+ self.scaling_factor = scaling_factor
196
+ super().__init__(dim, max_position_embeddings, base, device)
197
+
198
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
199
+ self.max_seq_len_cached = seq_len
200
+
201
+ if seq_len > self.max_position_embeddings:
202
+ base = self.base * (
203
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
204
+ ) ** (self.dim / (self.dim - 2))
205
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
206
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
207
+
208
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
209
+
210
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
211
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
212
+ emb = torch.cat((freqs, freqs), dim=-1)
213
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
214
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
215
+
216
+
217
+ # Copied from transformers.model.llama.modeling_llama.rotate_half
218
+ def rotate_half(x):
219
+ """Rotates half the hidden dims of the input."""
220
+ x1 = x[..., : x.shape[-1] // 2]
221
+ x2 = x[..., x.shape[-1] // 2 :]
222
+ return torch.cat((-x2, x1), dim=-1)
223
+
224
+
225
+ # Copied from transformers.model.llama.modeling_llama.apply_rotary_pos_emb
226
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
227
+ """Applies Rotary Position Embedding to the query and key tensors."""
228
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
229
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
230
+ q_embed = (q * cos) + (rotate_half(q) * sin)
231
+ k_embed = (k * cos) + (rotate_half(k) * sin)
232
+ return q_embed, k_embed
233
+
234
+
235
+ class InternLM2MLP(nn.Module):
236
+ def __init__(self, config):
237
+ super().__init__()
238
+ self.config = config
239
+ self.hidden_size = config.hidden_size
240
+ self.intermediate_size = config.intermediate_size
241
+ self.w1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
242
+ self.w3 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
243
+ self.w2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
244
+ self.act_fn = ACT2FN[config.hidden_act]
245
+
246
+ def forward(self, x):
247
+ down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x))
248
+
249
+ return down_proj
250
+
251
+
252
+ # Copied from transformers.model.llama.modeling_llama.repeat_kv
253
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
254
+ """
255
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
256
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
257
+ """
258
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
259
+ if n_rep == 1:
260
+ return hidden_states
261
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
262
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
263
+
264
+
265
+ # Modified from transformers.model.llama.modeling_llama.LlamaAttention
266
+ class InternLM2Attention(nn.Module):
267
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
268
+
269
+ def __init__(self, config: InternLM2Config):
270
+ super().__init__()
271
+ self.config = config
272
+ self.hidden_size = config.hidden_size
273
+ self.num_heads = config.num_attention_heads
274
+ self.head_dim = self.hidden_size // self.num_heads
275
+ self.num_key_value_heads = config.num_key_value_heads
276
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
277
+ self.max_position_embeddings = config.max_position_embeddings
278
+ self.is_causal = True
279
+
280
+ if (self.head_dim * self.num_heads) != self.hidden_size:
281
+ raise ValueError(
282
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
283
+ f" and `num_heads`: {self.num_heads})."
284
+ )
285
+
286
+ self.wqkv = nn.Linear(
287
+ self.hidden_size,
288
+ (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim,
289
+ bias=config.bias,
290
+ )
291
+
292
+ self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias)
293
+ self._init_rope()
294
+
295
+ def _init_rope(self):
296
+ if self.config.rope_scaling is None:
297
+ self.rotary_emb = InternLM2RotaryEmbedding(
298
+ self.head_dim,
299
+ max_position_embeddings=self.max_position_embeddings,
300
+ base=self.config.rope_theta,
301
+ )
302
+ else:
303
+ scaling_type = self.config.rope_scaling["type"]
304
+ scaling_factor = self.config.rope_scaling["factor"]
305
+ if scaling_type == "dynamic":
306
+ self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding(
307
+ self.head_dim,
308
+ max_position_embeddings=self.max_position_embeddings,
309
+ base=self.config.rope_theta,
310
+ scaling_factor=scaling_factor,
311
+ )
312
+ elif scaling_type == "linear":
313
+ self.rotary_emb = InternLM2LinearScalingRotaryEmbedding(
314
+ self.head_dim,
315
+ max_position_embeddings=self.max_position_embeddings,
316
+ base=self.config.rope_theta,
317
+ scaling_factor=scaling_factor,
318
+ )
319
+ else:
320
+ raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.")
321
+ return self.rotary_emb
322
+
323
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
324
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
325
+
326
+ def forward(
327
+ self,
328
+ hidden_states: torch.Tensor,
329
+ attention_mask: Optional[torch.Tensor] = None,
330
+ position_ids: Optional[torch.LongTensor] = None,
331
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
332
+ output_attentions: bool = False,
333
+ use_cache: bool = False,
334
+ **kwargs,
335
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
336
+ if "padding_mask" in kwargs:
337
+ warnings.warn(
338
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. "
339
+ "Please make sure use `attention_mask` instead.`"
340
+ )
341
+
342
+ bsz, q_len, _ = hidden_states.size()
343
+
344
+ qkv_states = self.wqkv(hidden_states)
345
+
346
+ qkv_states = rearrange(
347
+ qkv_states,
348
+ "b q (h gs d) -> b q h gs d",
349
+ gs=2 + self.num_key_value_groups,
350
+ d=self.head_dim,
351
+ )
352
+
353
+ query_states = qkv_states[..., : self.num_key_value_groups, :]
354
+ query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d")
355
+ key_states = qkv_states[..., -2, :]
356
+ value_states = qkv_states[..., -1, :]
357
+
358
+ query_states = query_states.transpose(1, 2)
359
+ key_states = key_states.transpose(1, 2)
360
+ value_states = value_states.transpose(1, 2)
361
+
362
+ kv_seq_len = key_states.shape[-2]
363
+ if past_key_value is not None:
364
+ kv_seq_len += past_key_value[0].shape[-2]
365
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
366
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
367
+
368
+ if past_key_value is not None:
369
+ # reuse k, v, self_attention
370
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
371
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
372
+
373
+ past_key_value = (key_states, value_states) if use_cache else None
374
+
375
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
376
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
377
+
378
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
379
+
380
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
381
+ raise ValueError(
382
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
383
+ f" {attn_weights.size()}"
384
+ )
385
+
386
+ if attention_mask is not None:
387
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
388
+ raise ValueError(
389
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
390
+ )
391
+ attn_weights = attn_weights + attention_mask
392
+
393
+ # upcast attention to fp32
394
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
395
+ attn_output = torch.matmul(attn_weights, value_states)
396
+
397
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
398
+ raise ValueError(
399
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
400
+ f" {attn_output.size()}"
401
+ )
402
+
403
+ attn_output = attn_output.transpose(1, 2).contiguous()
404
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
405
+
406
+ attn_output = self.wo(attn_output)
407
+
408
+ if not output_attentions:
409
+ attn_weights = None
410
+
411
+ return attn_output, attn_weights, past_key_value
412
+
413
+
414
+ # Modified from transformers.model.llama.modeling_llama.InternLM2FlashAttention2
415
+ class InternLM2FlashAttention2(InternLM2Attention):
416
+ """
417
+ InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays
418
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
419
+ flash attention and deal with padding tokens in case the input contains any of them.
420
+ """
421
+
422
+ def forward(
423
+ self,
424
+ hidden_states: torch.Tensor,
425
+ attention_mask: Optional[torch.LongTensor] = None,
426
+ position_ids: Optional[torch.LongTensor] = None,
427
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
428
+ output_attentions: bool = False,
429
+ use_cache: bool = False,
430
+ **kwargs,
431
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
432
+ # InternLM2FlashAttention2 attention does not support output_attentions
433
+ if "padding_mask" in kwargs:
434
+ warnings.warn(
435
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. "
436
+ "Please make sure use `attention_mask` instead.`"
437
+ )
438
+
439
+ # overwrite attention_mask with padding_mask
440
+ attention_mask = kwargs.pop("padding_mask")
441
+
442
+ output_attentions = False
443
+
444
+ bsz, q_len, _ = hidden_states.size()
445
+
446
+ qkv_states = self.wqkv(hidden_states)
447
+
448
+ qkv_states = rearrange(
449
+ qkv_states,
450
+ "b q (h gs d) -> b q h gs d",
451
+ gs=2 + self.num_key_value_groups,
452
+ d=self.head_dim,
453
+ )
454
+
455
+ query_states = qkv_states[..., : self.num_key_value_groups, :]
456
+ query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d")
457
+ key_states = qkv_states[..., -2, :]
458
+ value_states = qkv_states[..., -1, :]
459
+
460
+ query_states = query_states.transpose(1, 2)
461
+ key_states = key_states.transpose(1, 2)
462
+ value_states = value_states.transpose(1, 2)
463
+
464
+ kv_seq_len = key_states.shape[-2]
465
+ if past_key_value is not None:
466
+ kv_seq_len += past_key_value[0].shape[-2]
467
+
468
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
469
+
470
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
471
+
472
+ if past_key_value is not None:
473
+ # reuse k, v, self_attention
474
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
475
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
476
+
477
+ past_key_value = (key_states, value_states) if use_cache else None
478
+
479
+ query_states = query_states.transpose(1, 2)
480
+ key_states = key_states.transpose(1, 2)
481
+ value_states = value_states.transpose(1, 2)
482
+
483
+ attn_output = self._flash_attention_forward(
484
+ query_states, key_states, value_states, attention_mask, q_len
485
+ )
486
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
487
+ attn_output = self.wo(attn_output)
488
+
489
+ if not output_attentions:
490
+ attn_weights = None
491
+
492
+ return attn_output, attn_weights, past_key_value
493
+
494
+ def _flash_attention_forward(
495
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
496
+ ):
497
+ """
498
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
499
+ first unpad the input, then computes the attention scores and pad the final attention scores.
500
+
501
+ Args:
502
+ query_states (`torch.Tensor`):
503
+ Input query states to be passed to Flash Attention API
504
+ key_states (`torch.Tensor`):
505
+ Input key states to be passed to Flash Attention API
506
+ value_states (`torch.Tensor`):
507
+ Input value states to be passed to Flash Attention API
508
+ attention_mask (`torch.Tensor`):
509
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
510
+ position of padding tokens and 1 for the position of non-padding tokens.
511
+ dropout (`int`, *optional*):
512
+ Attention dropout
513
+ softmax_scale (`float`, *optional*):
514
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
515
+ """
516
+ # Contains at least one padding token in the sequence
517
+ causal = self.is_causal and query_length != 1
518
+ if attention_mask is not None:
519
+ batch_size = query_states.shape[0]
520
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input(
521
+ query_states, key_states, value_states, attention_mask, query_length
522
+ )
523
+
524
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
525
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
526
+
527
+ attn_output_unpad = flash_attn_varlen_func(
528
+ query_states,
529
+ key_states,
530
+ value_states,
531
+ cu_seqlens_q=cu_seqlens_q,
532
+ cu_seqlens_k=cu_seqlens_k,
533
+ max_seqlen_q=max_seqlen_in_batch_q,
534
+ max_seqlen_k=max_seqlen_in_batch_k,
535
+ dropout_p=dropout,
536
+ softmax_scale=softmax_scale,
537
+ causal=causal,
538
+ )
539
+
540
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
541
+ else:
542
+ attn_output = flash_attn_func(
543
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
544
+ )
545
+
546
+ return attn_output
547
+
548
+ def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
549
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
550
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
551
+
552
+ key_layer = index_first_axis(
553
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
554
+ )
555
+ value_layer = index_first_axis(
556
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
557
+ )
558
+
559
+ if query_length == kv_seq_len:
560
+ query_layer = index_first_axis(
561
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
562
+ )
563
+ cu_seqlens_q = cu_seqlens_k
564
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
565
+ indices_q = indices_k
566
+ elif query_length == 1:
567
+ max_seqlen_in_batch_q = 1
568
+ cu_seqlens_q = torch.arange(
569
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
570
+ ) # There is a memcpy here, that is very bad.
571
+ indices_q = cu_seqlens_q[:-1]
572
+ query_layer = query_layer.squeeze(1)
573
+ else:
574
+ # The -q_len: slice assumes left padding.
575
+ attention_mask = attention_mask[:, -query_length:]
576
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
577
+
578
+ return (
579
+ query_layer,
580
+ key_layer,
581
+ value_layer,
582
+ indices_q.to(torch.int64),
583
+ (cu_seqlens_q, cu_seqlens_k),
584
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
585
+ )
586
+
587
+ INTERNLM2_ATTENTION_CLASSES = {
588
+ "eager": InternLM2Attention,
589
+ "flash_attention_2": InternLM2FlashAttention2,
590
+ }
591
+
592
+ # Modified from transformers.model.llama.modeling_llama.LlamaDecoderLayer
593
+ class InternLM2DecoderLayer(nn.Module):
594
+ def __init__(self, config: InternLM2Config):
595
+ super().__init__()
596
+ self.hidden_size = config.hidden_size
597
+
598
+ self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config)
599
+
600
+ self.feed_forward = InternLM2MLP(config)
601
+ self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
602
+ self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
603
+
604
+ def forward(
605
+ self,
606
+ hidden_states: torch.Tensor,
607
+ attention_mask: Optional[torch.Tensor] = None,
608
+ position_ids: Optional[torch.LongTensor] = None,
609
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
610
+ output_attentions: Optional[bool] = False,
611
+ use_cache: Optional[bool] = False,
612
+ **kwargs,
613
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
614
+ """
615
+ Args:
616
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
617
+ attention_mask (`torch.FloatTensor`, *optional*):
618
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
619
+ query_sequence_length, key_sequence_length)` if default attention is used.
620
+ output_attentions (`bool`, *optional*):
621
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
622
+ returned tensors for more detail.
623
+ use_cache (`bool`, *optional*):
624
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
625
+ (see `past_key_values`).
626
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
627
+ """
628
+ if "padding_mask" in kwargs:
629
+ warnings.warn(
630
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. "
631
+ "Please make sure use `attention_mask` instead.`"
632
+ )
633
+
634
+ residual = hidden_states
635
+
636
+ hidden_states = self.attention_norm(hidden_states)
637
+
638
+ # Self Attention
639
+ hidden_states, self_attn_weights, present_key_value = self.attention(
640
+ hidden_states=hidden_states,
641
+ attention_mask=attention_mask,
642
+ position_ids=position_ids,
643