remove useless code
modeling_baichuan.py (+0 -7)
```diff
@@ -171,9 +171,6 @@ class Attention(nn.Module):
                 f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                 f" and `num_heads`: {self.num_heads})."
             )
-        # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
-        # self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
-        # self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
         self.W_pack = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
         self.rotary_emb = RotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
@@ -201,10 +198,6 @@
         value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1,
                                                                                           2)  # batch_size x source_len x hidden_size
 
-        # query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        # key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        # value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
             kv_seq_len += past_key_value[0].shape[-2]
```
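The lines deleted above were leftover comments from an earlier layout that used three separate projection layers (q_proj, k_proj, v_proj); the module already does the same work with the single fused W_pack projection. Below is a minimal, self-contained sketch of that pattern, not taken from the file itself: the hidden_size/num_heads values are illustrative, and the chunk-based split is one plausible way to recover the proj[0]/proj[1]/proj[2] tensors with the shapes used in the second hunk.

```python
import torch
import torch.nn as nn

# Illustrative hyperparameters (assumptions, not the model's actual config).
hidden_size, num_heads = 4096, 32
head_dim = hidden_size // num_heads          # 128
bsz, q_len = 2, 16

# One fused projection producing Q, K and V concatenated along the last dim,
# mirroring self.W_pack in the diff above.
W_pack = nn.Linear(hidden_size, 3 * hidden_size, bias=False)

hidden_states = torch.randn(bsz, q_len, hidden_size)

# Split the packed output into three hidden_size-wide chunks; proj[0]/[1]/[2]
# play the roles of the old q_proj/k_proj/v_proj outputs.
proj = W_pack(hidden_states).chunk(3, dim=-1)

query_states = proj[0].view(bsz, q_len, num_heads, head_dim).transpose(1, 2)
key_states   = proj[1].view(bsz, q_len, num_heads, head_dim).transpose(1, 2)
value_states = proj[2].view(bsz, q_len, num_heads, head_dim).transpose(1, 2)

print(query_states.shape)  # torch.Size([2, 32, 16, 128])
```

Packing Q, K and V into one weight matrix is mathematically equivalent to applying three separate projections; it just issues a single larger matmul, which is why the commented-out per-projection layers were safe to drop.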