Qubitium committed
Commit 5a0fc97 · verified · 1 Parent(s): 911e84c

FIX autogptq compat


We have a pending AutoGPTQ PR that will allow GPTQ quantization of GLM-4. For that AutoGPTQ PR to work, we need this simple method def/typing fix to resolve compat issues between transformers and AutoGPTQ.
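For context, the usual AutoGPTQ quantization flow looks roughly like the sketch below. This is illustrative only: it assumes the pending AutoGPTQ PR with GLM support is installed, and the model id, calibration text, and output directory are placeholders.

from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

model_id = "THUDM/glm-4-9b"  # placeholder: the checkpoint to quantize
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Standard 4-bit GPTQ settings; values are illustrative, not a recommendation.
quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)

model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config, trust_remote_code=True)

# A real run needs a proper calibration dataset; one sentence is only for illustration.
examples = [tokenizer("GLM-4 is a family of open large language models.")]
model.quantize(examples)
model.save_quantized("glm-4-9b-gptq-4bit")  # placeholder output dir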

GPTQ quants ready for testing (a quick load sketch follows the links):

https://huggingface.co/LnL-AI/glm-4-9b-gptq-4bit-qubitium-r1
https://huggingface.co/LnL-AI/glm-4-9b-chat-gptq-4bit-qubitium-r1
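A minimal load-and-generate sketch with transformers, assuming the GPTQ kernels are available (optimum and auto-gptq installed) and that the quant repo ships the patched modeling code via trust_remote_code:

from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "LnL-AI/glm-4-9b-chat-gptq-4bit-qubitium-r1"
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    device_map="auto",
    trust_remote_code=True,  # loads the repo's own modeling_chatglm.py
)

inputs = tokenizer("Hello, who are you?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))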

Files changed (1): modeling_chatglm.py (+10 -10)
@@ -603,17 +603,17 @@ class GLMTransformer(torch.nn.Module):
                 layer_ret = torch.utils.checkpoint.checkpoint(
                     layer,
                     hidden_states,
-                    attention_mask,
-                    rotary_pos_emb,
-                    kv_caches[index],
-                    use_cache,
+                    attention_mask=attention_mask,
+                    rotary_pos_emb=rotary_pos_emb,
+                    kv_cache=kv_caches[index],
+                    use_cache=use_cache,
                     use_reentrant=False
                 )
             else:
                 layer_ret = layer(
                     hidden_states,
-                    attention_mask,
-                    rotary_pos_emb,
+                    attention_mask=attention_mask,
+                    rotary_pos_emb=rotary_pos_emb,
                     kv_cache=kv_caches[index],
                     use_cache=use_cache
                 )
@@ -722,7 +722,7 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
             config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
         )

-        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio, original_impl=config.original_rope,
+        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio, original_impl=config.original_rope,
                                               device=device, dtype=config.torch_dtype)
         self.encoder = init_method(GLMTransformer, config, **init_kwargs)
         self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
@@ -738,8 +738,8 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
             self,
             input_ids,
             position_ids: Optional[torch.Tensor] = None,
-            attention_mask: Optional[torch.BoolTensor] = None,
-            full_attention_mask: Optional[torch.BoolTensor] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            full_attention_mask: Optional[torch.Tensor] = None,
             past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
             inputs_embeds: Optional[torch.Tensor] = None,
             use_cache: Optional[bool] = None,
@@ -1204,4 +1204,4 @@ class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
             past_key_values=transformer_outputs.past_key_values,
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
-        )
+        )