Qubitium committed
Commit 5a0fc97 · verified · 1 Parent(s): 911e84c

FIX autogptq compat


We have a pending AutoGPTQ PR that will allow GPTQ quantization of GLM-4. For that AutoGPTQ PR to work, we need this simple method def/typing fix to resolve compat issues between transformers and AutoGPTQ.
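For context, the usual AutoGPTQ quantization flow looks roughly like the sketch below. This is illustrative only: it assumes the pending AutoGPTQ PR with GLM support is installed, and the model id, calibration text, and output directory are placeholders.

from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

model_id = "THUDM/glm-4-9b"  # placeholder: the checkpoint to quantize
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Standard 4-bit GPTQ settings; values are illustrative, not a recommendation.
quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)

model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config, trust_remote_code=True)

# A real run needs a proper calibration dataset; one sentence is only for illustration.
examples = [tokenizer("GLM-4 is a family of open large language models.")]
model.quantize(examples)
model.save_quantized("glm-4-9b-gptq-4bit")  # placeholder output dir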

GPTQ quants ready for testing (a quick load sketch follows the links):

https://huggingface.co/LnL-AI/glm-4-9b-gptq-4bit-qubitium-r1
https://huggingface.co/LnL-AI/glm-4-9b-chat-gptq-4bit-qubitium-r1
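A minimal load-and-generate sketch with transformers, assuming the GPTQ kernels are available (optimum and auto-gptq installed) and that the quant repo ships the patched modeling code via trust_remote_code:

from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "LnL-AI/glm-4-9b-chat-gptq-4bit-qubitium-r1"
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    device_map="auto",
    trust_remote_code=True,  # loads the repo's own modeling_chatglm.py
)

inputs = tokenizer("Hello, who are you?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))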

Files changed (1): modeling_chatglm.py (+10 -10)
@@ -603,17 +603,17 @@ class GLMTransformer(torch.nn.Module):
                 layer_ret = torch.utils.checkpoint.checkpoint(
                     layer,
                     hidden_states,
-                    attention_mask,
-                    rotary_pos_emb,
-                    kv_caches[index],
-                    use_cache,
+                    attention_mask=attention_mask,
+                    rotary_pos_emb=rotary_pos_emb,
+                    kv_cache=kv_caches[index],
+                    use_cache=use_cache,
                     use_reentrant=False
                 )
             else:
                 layer_ret = layer(
                     hidden_states,
-                    attention_mask,
-                    rotary_pos_emb,
+                    attention_mask=attention_mask,
+                    rotary_pos_emb=rotary_pos_emb,
                     kv_cache=kv_caches[index],
                     use_cache=use_cache
                 )
@@ -722,7 +722,7 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
             config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
         )

-        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio, original_impl=config.original_rope,
+        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio, original_impl=config.original_rope,
                                               device=device, dtype=config.torch_dtype)
         self.encoder = init_method(GLMTransformer, config, **init_kwargs)
         self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
@@ -738,8 +738,8 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
             self,
             input_ids,
             position_ids: Optional[torch.Tensor] = None,
-            attention_mask: Optional[torch.BoolTensor] = None,
-            full_attention_mask: Optional[torch.BoolTensor] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            full_attention_mask: Optional[torch.Tensor] = None,
             past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
             inputs_embeds: Optional[torch.Tensor] = None,
             use_cache: Optional[bool] = None,
@@ -1204,4 +1204,4 @@ class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
             past_key_values=transformer_outputs.past_key_values,
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
-        )
+        )