add get_output_embeddings() #64
by ranchlai - opened
README.md CHANGED
@@ -15,9 +15,6 @@ tags:
 <p align="center">
 👋 Join our <a href="https://join.slack.com/t/chatglm/shared_invite/zt-1y7pqoloy-9b1g6T6JjA8J0KxvUjbwJw" target="_blank">Slack</a> and <a href="https://github.com/THUDM/ChatGLM-6B/blob/main/resources/WECHAT.md" target="_blank">WeChat</a>
 </p>
-<p align="center">
-📍Experience the larger-scale ChatGLM model at <a href="https://www.chatglm.cn">chatglm.cn</a>
-</p>
 
 ## 介绍
 ChatGLM**2**-6B 是开源中英双语对话模型 [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B) 的第二代版本,在保留了初代模型对话流畅、部署门槛较低等众多优秀特性的基础之上,ChatGLM**2**-6B 引入了如下新特性:
@@ -79,17 +76,22 @@ For more instructions, including how to run CLI and web demos, and model quantiz
 
 ## 引用
 
-如果你觉得我们的工作有帮助的话,请考虑引用下列论文。
-
-If you find our work helpful, please consider citing the following paper.
+如果你觉得我们的工作有帮助的话,请考虑引用下列论文,ChatGLM2-6B 的论文会在近期公布,敬请期待~
 
 ```
-@misc{glm2024chatglm,
-      title={ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools},
-      author={Team GLM and Aohan Zeng and Bin Xu and Bowen Wang and Chenhui Zhang and Da Yin and Diego Rojas and Guanyu Feng and Hanlin Zhao and Hanyu Lai and Hao Yu and Hongning Wang and Jiadai Sun and Jiajie Zhang and Jiale Cheng and Jiayi Gui and Jie Tang and Jing Zhang and Juanzi Li and Lei Zhao and Lindong Wu and Lucen Zhong and Mingdao Liu and Minlie Huang and Peng Zhang and Qinkai Zheng and Rui Lu and Shuaiqi Duan and Shudan Zhang and Shulin Cao and Shuxun Yang and Weng Lam Tam and Wenyi Zhao and Xiao Liu and Xiao Xia and Xiaohan Zhang and Xiaotao Gu and Xin Lv and Xinghan Liu and Xinyi Liu and Xinyue Yang and Xixuan Song and Xunkai Zhang and Yifan An and Yifan Xu and Yilin Niu and Yuantao Yang and Yueyan Li and Yushi Bai and Yuxiao Dong and Zehan Qi and Zhaoyu Wang and Zhen Yang and Zhengxiao Du and Zhenyu Hou and Zihan Wang},
-      year={2024},
-      eprint={2406.12793},
-      archivePrefix={arXiv},
-      primaryClass={cs.CL}
+@article{zeng2022glm,
+  title={Glm-130b: An open bilingual pre-trained model},
+  author={Zeng, Aohan and Liu, Xiao and Du, Zhengxiao and Wang, Zihan and Lai, Hanyu and Ding, Ming and Yang, Zhuoyi and Xu, Yifan and Zheng, Wendi and Xia, Xiao and others},
+  journal={arXiv preprint arXiv:2210.02414},
+  year={2022}
+}
+```
+```
+@inproceedings{du2022glm,
+  title={GLM: General Language Model Pretraining with Autoregressive Blank Infilling},
+  author={Du, Zhengxiao and Qian, Yujie and Liu, Xiao and Ding, Ming and Qiu, Jiezhong and Yang, Zhilin and Tang, Jie},
+  booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
+  pages={320--335},
+  year={2022}
 }
 ```
config.json CHANGED
@@ -8,8 +8,7 @@
     "AutoConfig": "configuration_chatglm.ChatGLMConfig",
     "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
     "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
-    "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
-    "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification"
+    "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration"
   },
   "add_bias_linear": false,
   "add_qkv_bias": true,
configuration_chatglm.py CHANGED
@@ -13,7 +13,6 @@ class ChatGLMConfig(PretrainedConfig):
        num_attention_heads=32,
        seq_length=2048,
        hidden_dropout=0.0,
-       classifier_dropout=None,
        attention_dropout=0.0,
        layernorm_epsilon=1e-5,
        rmsnorm=True,
@@ -41,7 +40,6 @@
         self.num_attention_heads = num_attention_heads
         self.seq_length = seq_length
         self.hidden_dropout = hidden_dropout
-        self.classifier_dropout = classifier_dropout
         self.attention_dropout = attention_dropout
         self.layernorm_epsilon = layernorm_epsilon
         self.rmsnorm = rmsnorm
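Since `classifier_dropout` disappears from both the constructor signature and the instance attributes, external code that still reads it should probe defensively. A minimal sketch, assuming a loaded `config` object (hypothetical downstream code, not part of this repository):

```python
import torch

# ChatGLMConfig no longer defines classifier_dropout, so look it up with a
# fallback instead of reading the attribute directly.
classifier_dropout = getattr(config, "classifier_dropout", None)
dropout = torch.nn.Dropout(classifier_dropout) if classifier_dropout is not None else None
```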
modeling_chatglm.py CHANGED
@@ -11,14 +11,12 @@ import torch.utils.checkpoint
 import torch.nn.functional as F
 from torch import nn
 from torch.nn import CrossEntropyLoss, LayerNorm
-from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss
 from torch.nn.utils import skip_init
 from typing import Optional, Tuple, Union, List, Callable, Dict, Any
 
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
-    SequenceClassifierOutputWithPast,
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
@@ -897,7 +895,6 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
             past_key_values: Optional[torch.Tensor] = None,
             attention_mask: Optional[torch.Tensor] = None,
             position_ids: Optional[torch.Tensor] = None,
-            use_cache: Optional[bool] = None,
             is_first_forward: bool = True,
             **kwargs
     ) -> dict:
@@ -905,16 +902,14 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         if position_ids is None:
             position_ids = self.get_position_ids(input_ids, device=input_ids.device)
         if not is_first_forward:
-            if past_key_values is not None:
-                position_ids = position_ids[..., -1:]
-                input_ids = input_ids[:, -1:]
+            position_ids = position_ids[..., -1:]
+            input_ids = input_ids[:, -1:]
         return {
             "input_ids": input_ids,
             "past_key_values": past_key_values,
             "position_ids": position_ids,
             "attention_mask": attention_mask,
-            "return_last_logit": True,
-            "use_cache": use_cache
+            "return_last_logit": True
         }
 
     def forward(
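Both sides of this hunk serve incremental decoding: after the first forward pass the key/value cache already covers the prompt, so only the newest token and its position are fed back in (hence the `[:, -1:]` and `[..., -1:]` trims), and `return_last_logit` restricts the LM head to the final step. A minimal sketch of that loop, assuming a loaded `model` and `tokenizer`; illustrative only, not the repository's `generate()` path:

```python
import torch

inputs = tokenizer("Hello", return_tensors="pt")
input_ids = inputs["input_ids"]
past_key_values = None

for _ in range(16):
    # First step feeds the whole prompt; later steps feed only the last token,
    # mirroring the input_ids[:, -1:] trimming in prepare_inputs_for_generation.
    step_ids = input_ids if past_key_values is None else input_ids[:, -1:]
    out = model(input_ids=step_ids, past_key_values=past_key_values, use_cache=True)
    past_key_values = out.past_key_values
    next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
    input_ids = torch.cat([input_ids, next_token], dim=-1)

print(tokenizer.decode(input_ids[0]))
```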
@@ -1091,7 +1086,6 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         generation_config = self.generation_config
         generation_config = copy.deepcopy(generation_config)
         model_kwargs = generation_config.update(**kwargs)
-        model_kwargs["use_cache"] = generation_config.use_cache
         bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
 
         if isinstance(eos_token_id, int):
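Without the explicit copy into `model_kwargs`, the cache flag during streaming generation falls back to the model's own defaults; callers who care can still pin it per call. A hypothetical invocation (`use_cache` is a standard `transformers` generation kwarg):

```python
# Passing use_cache explicitly keeps KV-cache decoding enabled regardless of the
# removed model_kwargs["use_cache"] assignment above.
outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
```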
@@ -1197,89 +1191,3 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device,
                                             **kwargs)
         return self
-
-
-class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
-    def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
-        super().__init__(config)
-
-        self.num_labels = config.num_labels
-        self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
-
-        self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half)
-        if config.classifier_dropout is not None:
-            self.dropout = nn.Dropout(config.classifier_dropout)
-        else:
-            self.dropout = None
-        self.config = config
-
-        if self.config.quantization_bit:
-            self.quantize(self.config.quantization_bit, empty_init=True)
-
-    def forward(
-            self,
-            input_ids: Optional[torch.LongTensor] = None,
-            position_ids: Optional[torch.LongTensor] = None,
-            attention_mask: Optional[torch.Tensor] = None,
-            full_attention_mask: Optional[torch.Tensor] = None,
-            past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
-            inputs_embeds: Optional[torch.LongTensor] = None,
-            labels: Optional[torch.LongTensor] = None,
-            use_cache: Optional[bool] = None,
-            output_hidden_states: Optional[bool] = None,
-            return_dict: Optional[bool] = None,
-    ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]:
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        transformer_outputs = self.transformer(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            attention_mask=attention_mask,
-            full_attention_mask=full_attention_mask,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        hidden_states = transformer_outputs[0]
-        pooled_hidden_states = hidden_states[-1]
-        if self.dropout is not None:
-            pooled_hidden_states = self.dropout(pooled_hidden_states)
-        logits = self.classifier_head(pooled_hidden_states)
-
-        loss = None
-        if labels is not None:
-            if self.config.problem_type is None:
-                if self.num_labels == 1:
-                    self.config.problem_type = "regression"
-                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
-                    self.config.problem_type = "single_label_classification"
-                else:
-                    self.config.problem_type = "multi_label_classification"
-
-            if self.config.problem_type == "regression":
-                loss_fct = MSELoss()
-                if self.num_labels == 1:
-                    loss = loss_fct(logits.squeeze().float(), labels.squeeze())
-                else:
-                    loss = loss_fct(logits.float(), labels)
-            elif self.config.problem_type == "single_label_classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1))
-            elif self.config.problem_type == "multi_label_classification":
-                loss_fct = BCEWithLogitsLoss()
-                loss = loss_fct(logits.float(), labels.view(-1, self.num_labels))
-
-        if not return_dict:
-            output = (logits,) + transformer_outputs[1:]
-            return ((loss,) + output) if loss is not None else output
-
-        return SequenceClassifierOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=transformer_outputs.past_key_values,
-            hidden_states=transformer_outputs.hidden_states,
-            attentions=transformer_outputs.attentions,
-        )
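The discussion title is "add get_output_embeddings()", but the hunks above show the opposite side of the comparison, so the accessor itself never appears here. For reference, on a `transformers` causal-LM wrapper such an accessor conventionally just returns the output projection; a minimal sketch of what it could look like in modeling_chatglm.py, under the assumption that the LM head lives at `self.transformer.output_layer` (the attribute the forward pass uses to produce `lm_logits`):

```python
class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
    # ... existing __init__, forward, and generation helpers ...

    def get_output_embeddings(self):
        # transformers convention: expose the LM head so PreTrainedModel utilities
        # (weight tying, resize_token_embeddings, some generation helpers) can find it.
        return self.transformer.output_layer
```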
 
tokenization_chatglm.py CHANGED
@@ -66,6 +66,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
     model_input_names = ["input_ids", "attention_mask", "position_ids"]
 
     def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
+        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
         self.name = "GLMTokenizer"
 
         self.vocab_file = vocab_file
@@ -75,7 +76,6 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             "<eos>": self.tokenizer.eos_id,
             "<pad>": self.tokenizer.pad_id
         }
-        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
 
     def get_command(self, token):
         if token in self.special_tokens:
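The only functional change in this file is when `super().__init__` runs relative to the subclass state (the SentencePiece wrapper and the special-token table). Whether that ordering matters depends on whether the base constructor calls back into methods the subclass overrides; a toy illustration of the failure mode (generic Python, not the repository's classes):

```python
class Base:
    def __init__(self):
        # The base constructor probes a method that the subclass overrides.
        self.size = self.vocab_size()


class Tok(Base):
    def __init__(self, base_first=False):
        if base_first:
            super().__init__()           # AttributeError: _vocab is not set yet
            self._vocab = {"<bos>": 0}
        else:
            self._vocab = {"<bos>": 0}   # set subclass state first ...
            super().__init__()           # ... then let the base constructor probe it

    def vocab_size(self):
        return len(self._vocab)


Tok()                     # works
# Tok(base_first=True)    # would raise AttributeError
```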
 