myownskyW7 committed on
Commit aa00681
1 Parent(s): b06eb0c

Speed up chat

Files changed (2):
  1. modeling_InternLM_XComposer.py +34 -19
  2. modeling_utils.py +36 -25
modeling_InternLM_XComposer.py CHANGED
@@ -26,6 +26,13 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
     config_class = InternLMXComposerConfig
     _auto_class = "AutoModelForCausalLM"
 
+    meta_instruction = """meta instruction
+You are an AI assistant whose name is 浦语.
+- 浦语 is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
+- 浦语 can understand and communicate fluently in the language chosen by the user such as English and 中文.
+conversation
+"""
+
     gen_config = dict(
         num_beams=5,
         do_sample=False,
@@ -33,7 +40,7 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
         repetition_penalty=1.5,
         length_penalty=1.0,
         temperature=1.0,
-        max_new_tokens=200,
+        max_new_tokens=500,
     )
 
     def __init__(self, config):
@@ -74,13 +81,14 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
         # speed up init llm
         with torch.device('meta'):
             self.internlm_model = InternLMForCausalLM._from_config(config)
-        self.internlm_model.to_empty(device=config.device).to(torch.float16)
+        self.internlm_model.to_empty(device=config.device).to(
+            torch.float16)
         for n, m in self.internlm_model.named_modules():
             if 'lora' in n:
                 m.float()
 
         self.internlm_proj = nn.Linear(self.Qformer.config.hidden_size,
-                                       self.internlm_model.config.hidden_size)
+                                        self.internlm_model.config.hidden_size)
         print('Done')
 
         self.vis_processor = transforms.Compose([
@@ -93,15 +101,15 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
 
         self.tokenizer = None
 
-    @property
-    def eoh(self):
-        return self.tokenizer.decode(torch.Tensor([103027]),
-                                     skip_special_tokens=True)
-
-    @property
-    def eoa(self):
-        return self.tokenizer.decode(torch.Tensor([103028]),
-                                     skip_special_tokens=True)
+        self.eoh = '<TOKENS_UNUSED_0>'  # end of human
+        self.eoa = '<TOKENS_UNUSED_1>'  # end of assistant
+        stop_words_ids = [
+            torch.tensor([103027]).to(config.device),
+            torch.tensor([103028]).to(config.device),
+        ]
+        stopping_criteria = StoppingCriteriaList(
+            [StoppingCriteriaSub(stops=stop_words_ids)])
+        self.gen_config['stopping_criteria'] = stopping_criteria
 
     def maybe_autocast(self, dtype=torch.float16):
         # if on cpu, don't use autocast
@@ -154,13 +162,14 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
             encoder_attention_mask=image_atts,
             return_dict=True,
         )
-        inputs_internlm = self.internlm_proj(query_output.last_hidden_state)
+        inputs_internlm = self.internlm_proj(
+            query_output.last_hidden_state)
         inputs_internlm = torch.cat([
             self.flag_image_start.expand(inputs_internlm.shape[0], -1, -1),
             inputs_internlm,
             self.flag_image_end.expand(inputs_internlm.shape[0], -1, -1)
         ],
-                                    dim=1)
+                                   dim=1)
         return inputs_internlm
 
     def encode_text(self, text, add_special_tokens=False):
@@ -195,8 +204,8 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
         text_embeds = self.encode_text(text)
         img_embeds = self.encode_img(image)
         prompt_embeds = self.wrap_prompt(text_embeds, img_embeds)
-        out_embeds = self.internlm_model.generate(inputs_embeds=prompt_embeds,
-                                                  **self.get_gen_args(**kwargs))
+        out_embeds = self.internlm_model.generate(
+            inputs_embeds=prompt_embeds, **self.get_gen_args(**kwargs))
         out_text = self.decode_text(out_embeds)
         return out_text
 
@@ -206,8 +215,8 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
         prompt_embeds = self.wrap_prompt(text_embeds,
                                          img_embeds,
                                          history=history)
-        out_embeds = self.internlm_model.generate(inputs_embeds=prompt_embeds,
-                                                  **self.get_gen_args(**kwargs))
+        out_embeds = self.internlm_model.generate(
+            inputs_embeds=prompt_embeds, **self.get_gen_args(**kwargs))
         out_text = self.decode_text(out_embeds)
 
         # trunc at eoh and eoa
@@ -231,7 +240,13 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
                     history=None,
                     add_special=True):
         if add_special:
-            prompt_segs = [' <|User|>:', f'\n{self.eoh} <|Bot|>:']
+            if history is None:
+                prompt_segs = [
+                    self.meta_instruction + ' <|User|>:',
+                    f'\n{self.eoh} <|Bot|>:'
+                ]
+            else:
+                prompt_segs = [' <|User|>:', f'\n{self.eoh} <|Bot|>:']
         else:
             prompt_segs = [' <|User|>:', ' <|Bot|>:']  # used in wrap history
         prompt_seg_embeds = []
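
For reference, a minimal sketch (not part of the commit) of the two prompt_segs layouts the new wrap_prompt branch selects between: the meta_instruction is prepended only on the first turn, i.e. when history is None. How the user text and image embeddings are spliced between the two segments happens elsewhere in wrap_prompt and is elided here.

# Illustrative only: eoh and meta_instruction are the values added in this
# commit (meta_instruction truncated); the embedding splicing is not shown.
eoh = '<TOKENS_UNUSED_0>'  # end-of-human marker
meta_instruction = 'meta instruction\n...\nconversation\n'  # truncated

first_turn_segs = [meta_instruction + ' <|User|>:', f'\n{eoh} <|Bot|>:']
later_turn_segs = [' <|User|>:', f'\n{eoh} <|Bot|>:']  # when history is given
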
modeling_utils.py CHANGED
@@ -2,6 +2,7 @@ import logging
 import math
 import os
 from contextlib import contextmanager
+from transformers import StoppingCriteria, StoppingCriteriaList
 
 import timm.models.hub as timm_hub
 import torch
@@ -32,6 +33,7 @@ def download_cached_file(url, check_hash=True, progress=False):
     Download a file from a URL and cache it locally. If the file already exists, it is not downloaded again.
     If distributed, only the main process downloads the file, and the other processes wait for the file to be downloaded.
     """
+
     def get_cached_file_path():
         # a hack to sync the file path across processes
         parts = torch.hub.urlparse(url)
@@ -74,49 +76,58 @@ def all_logging_disabled(highest_level=logging.CRITICAL):
 
 
 class LoRALinear(nn.Linear):
-    def __init__(self,
-                 in_features: int,
-                 out_features: int,
-                 bias: bool = True,
-                 device=None,
-                 dtype=None,
-                 lora_r=8,
-                 lora_alpha=16,
-                 lora_dropout=0.05,
-                 **kwargs) -> None:
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+        lora_r=8,
+        lora_alpha=16,
+        lora_dropout=0.05,
+        **kwargs
+    ) -> None:
         super().__init__(in_features, out_features, bias, device, dtype)
         self.lora_r = lora_r
         self.lora_alpha = lora_alpha
-        if lora_dropout > 0.:
+        if lora_dropout > 0.0:
             self.lora_dropout = nn.Dropout(p=lora_dropout)
         else:
             self.lora_dropout = lambda x: x
         self.lora_scaling = self.lora_alpha / self.lora_r
 
-        self.lora_A = nn.Linear(in_features,
-                                self.lora_r,
-                                bias=False,
-                                device=device,
-                                dtype=dtype)
-        self.lora_B = nn.Linear(self.lora_r,
-                                out_features,
-                                bias=False,
-                                device=device,
-                                dtype=dtype)
+        self.lora_A = nn.Linear(
+            in_features, self.lora_r, bias=False, device=device, dtype=dtype
+        )
+        self.lora_B = nn.Linear(
+            self.lora_r, out_features, bias=False, device=device, dtype=dtype
+        )
 
         self.reset_parameters()
 
     def reset_parameters(self):
-        if hasattr(self, 'lora_A'):
+        if hasattr(self, "lora_A"):
             # initialize A the same way as the default for nn.Linear and B to zero
             nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))
             nn.init.zeros_(self.lora_B.weight)
-        #print ("lora weight init {} {}".format(torch.mean(self.lora_A.weight), torch.mean(self.lora_B.weight)))
 
     def forward(self, x):
         orig_type = x.dtype
         res = super().forward(x)
         x = x.float()
-        res += self.lora_B(self.lora_A(
-            self.lora_dropout(x))) * self.lora_scaling
+        res += self.lora_B(self.lora_A(self.lora_dropout(x))) * self.lora_scaling
         return res.to(orig_type)
+
+
+class StoppingCriteriaSub(StoppingCriteria):
+    def __init__(self, stops=[], encounters=1):
+        super().__init__()
+        self.stops = stops
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
+        for stop in self.stops:
+            if torch.all((stop == input_ids[:, -len(stop) :])).item():
+                return True
+
+        return False
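
Taken together, StoppingCriteriaSub and the stop_words_ids registered in modeling_InternLM_XComposer.py are what deliver the speed-up: decoding can halt as soon as an end-of-turn token (103027 or 103028) is emitted, rather than running to the raised max_new_tokens cap. Below is a minimal, self-contained sketch of the same mechanism; gpt2 and its eos token are stand-ins for InternLM-XComposer and its stop ids, not the repo's actual code.

import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          StoppingCriteria, StoppingCriteriaList)


class StopOnTokens(StoppingCriteria):
    # mirrors StoppingCriteriaSub above: stop once the generated sequence
    # ends with any of the given stop token sequences
    def __init__(self, stops):
        super().__init__()
        self.stops = stops  # list of 1-D LongTensors of token ids

    def __call__(self, input_ids, scores, **kwargs):
        return any(
            torch.all(stop == input_ids[0, -len(stop):]).item()
            for stop in self.stops)


tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')

stops = [torch.tensor([tokenizer.eos_token_id])]  # stand-in for 103027/103028
inputs = tokenizer('Hello, how are', return_tensors='pt')
out = model.generate(
    **inputs,
    max_new_tokens=500,  # large cap is safe: the criterion ends decoding early
    stopping_criteria=StoppingCriteriaList([StopOnTokens(stops)]))
print(tokenizer.decode(out[0]))
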