FIX autogptq compat
We have a pending autogptq PR that will allow GPTQ quantization of GLM. For that autogptq PR to work, we need this simple method def/typing fix to resolve compat issues between transformers and autogptq.
GPTQ quants ready for testing:
https://huggingface.co/LnL-AI/glm-4-9b-gptq-4bit-qubitium-r1
https://huggingface.co/LnL-AI/glm-4-9b-chat-gptq-4bit-qubitium-r1
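A minimal smoke test for the quants above, assuming they load through transformers' GPTQ integration with auto-gptq installed; the prompt and generation settings are illustrative only, not taken from the repos:

```python
# Hypothetical smoke test: load one of the GPTQ quants and generate a few tokens.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "LnL-AI/glm-4-9b-chat-gptq-4bit-qubitium-r1"

# trust_remote_code is needed because GLM-4 ships its own modeling_chatglm.py
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",          # place the quantized weights on available GPUs
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

inputs = tokenizer("Hello, GLM-4!", return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```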
- modeling_chatglm.py (+10, -10)
@@ -603,17 +603,17 @@ class GLMTransformer(torch.nn.Module):
                 layer_ret = torch.utils.checkpoint.checkpoint(
                     layer,
                     hidden_states,
-                    attention_mask,
-                    rotary_pos_emb,
-                    kv_caches[index],
-                    use_cache,
+                    attention_mask=attention_mask,
+                    rotary_pos_emb=rotary_pos_emb,
+                    kv_cache=kv_caches[index],
+                    use_cache=use_cache,
                     use_reentrant=False
                 )
             else:
                 layer_ret = layer(
                     hidden_states,
-                    attention_mask,
-                    rotary_pos_emb,
+                    attention_mask=attention_mask,
+                    rotary_pos_emb=rotary_pos_emb,
                     kv_cache=kv_caches[index],
                     use_cache=use_cache
                 )
@@ -722,7 +722,7 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
             config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
         )
 
-        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio, original_impl=config.original_rope,
+        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio, original_impl=config.original_rope,
                                               device=device, dtype=config.torch_dtype)
         self.encoder = init_method(GLMTransformer, config, **init_kwargs)
         self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
@@ -738,8 +738,8 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
             self,
             input_ids,
             position_ids: Optional[torch.Tensor] = None,
-            attention_mask: Optional[torch.BoolTensor] = None,
-            full_attention_mask: Optional[torch.BoolTensor] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            full_attention_mask: Optional[torch.Tensor] = None,
             past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
             inputs_embeds: Optional[torch.Tensor] = None,
             use_cache: Optional[bool] = None,
@@ -1204,4 +1204,4 @@ class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
             past_key_values=transformer_outputs.past_key_values,
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
-        )
+        )
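For context on the first hunk: PyTorch's non-reentrant checkpointing (use_reentrant=False) forwards keyword arguments to the wrapped module, so passing the layer inputs by name keeps the argument binding stable even if a wrapper (for example, a quantized replacement layer) reorders or extends the positional signature. A toy sketch of that call pattern, not GLM-4's actual block:

```python
# Toy illustration of checkpointing a module with keyword arguments.
# The block and tensor shapes are stand-ins, not GLM-4's real transformer layer.
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

class ToyBlock(nn.Module):
    def forward(self, hidden_states, attention_mask=None, use_cache=True):
        # A wrapper with a different positional order still receives the
        # right values because the caller binds arguments by name.
        if attention_mask is not None:
            hidden_states = hidden_states.masked_fill(~attention_mask, 0.0)
        return hidden_states * 2

block = ToyBlock()
hidden = torch.randn(2, 4, requires_grad=True)
mask = torch.ones(2, 4, dtype=torch.bool)

# use_reentrant=False is what allows keyword arguments to be forwarded.
out = checkpoint(block, hidden, attention_mask=mask, use_cache=True, use_reentrant=False)
out.sum().backward()
print(hidden.grad.shape)  # torch.Size([2, 4])
```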