myownskyW7 committed
Commit aa00681 • Parent(s): b06eb0c

Speed up chat

Files changed:
- modeling_InternLM_XComposer.py (+34 -19)
- modeling_utils.py (+36 -25)
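The speedup has two parts, visible in the diffs below: max_new_tokens is pinned to 500, and generation registers stopping criteria on the two <TOKENS_UNUSED_*> end-of-turn token ids (103027 and 103028), so beam search returns as soon as the assistant finishes a reply instead of always decoding the full token budget. A minimal self-contained sketch of that mechanism; StopOnTokens simply mirrors the StoppingCriteriaSub class added in modeling_utils.py, and model, input_ids are hypothetical placeholders:

import torch
from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnTokens(StoppingCriteria):
    """Ends generation once the sequence ends with any of the stop ids."""

    def __init__(self, stops):
        super().__init__()
        self.stops = stops  # list of 1-D LongTensors holding stop token ids

    def __call__(self, input_ids, scores, **kwargs):
        return any(
            torch.all(stop == input_ids[0, -len(stop):]).item()
            for stop in self.stops
        )

# 103027 / 103028 are the ids this commit uses for <TOKENS_UNUSED_0> / <TOKENS_UNUSED_1>.
stops = [torch.tensor([103027]), torch.tensor([103028])]
criteria = StoppingCriteriaList([StopOnTokens(stops)])

# Hypothetical call: with the criteria registered, beam search stops at the
# end-of-assistant token rather than decoding all 500 new tokens.
# out = model.generate(input_ids, num_beams=5, max_new_tokens=500,
#                      stopping_criteria=criteria)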
modeling_InternLM_XComposer.py
CHANGED
@@ -26,6 +26,13 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
     config_class = InternLMXComposerConfig
     _auto_class = "AutoModelForCausalLM"
 
+    meta_instruction = """meta instruction
+You are an AI assistant whose name is 浦语.
+- 浦语 is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
+- 浦语 can understand and communicate fluently in the language chosen by the user such as English and 中文.
+conversation
+"""
+
     gen_config = dict(
         num_beams=5,
         do_sample=False,
@@ -33,7 +40,7 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
         repetition_penalty=1.5,
         length_penalty=1.0,
         temperature=1.0,
-        max_new_tokens=…
+        max_new_tokens=500,
     )
 
     def __init__(self, config):
@@ -74,13 +81,14 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
         # speed up init llm
         with torch.device('meta'):
             self.internlm_model = InternLMForCausalLM._from_config(config)
-        self.internlm_model.to_empty(device=config.device).to(…
+        self.internlm_model.to_empty(device=config.device).to(
+            torch.float16)
         for n, m in self.internlm_model.named_modules():
             if 'lora' in n:
                 m.float()
 
         self.internlm_proj = nn.Linear(self.Qformer.config.hidden_size,
-                                       …
+                                       self.internlm_model.config.hidden_size)
         print('Done')
 
         self.vis_processor = transforms.Compose([
@@ -93,15 +101,15 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
 
         self.tokenizer = None
 
-        … (nine removed lines not recovered)
+        self.eoh = '<TOKENS_UNUSED_0>'  # end of human
+        self.eoa = '<TOKENS_UNUSED_1>'  # end of assistant
+        stop_words_ids = [
+            torch.tensor([103027]).to(config.device),
+            torch.tensor([103028]).to(config.device),
+        ]
+        stopping_criteria = StoppingCriteriaList(
+            [StoppingCriteriaSub(stops=stop_words_ids)])
+        self.gen_config['stopping_criteria'] = stopping_criteria
 
     def maybe_autocast(self, dtype=torch.float16):
         # if on cpu, don't use autocast
@@ -154,13 +162,14 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
             encoder_attention_mask=image_atts,
             return_dict=True,
         )
-        inputs_internlm = self.internlm_proj(…
+        inputs_internlm = self.internlm_proj(
+            query_output.last_hidden_state)
         inputs_internlm = torch.cat([
             self.flag_image_start.expand(inputs_internlm.shape[0], -1, -1),
             inputs_internlm,
             self.flag_image_end.expand(inputs_internlm.shape[0], -1, -1)
         ],
-            …
+                                    dim=1)
         return inputs_internlm
 
     def encode_text(self, text, add_special_tokens=False):
@@ -195,8 +204,8 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
         text_embeds = self.encode_text(text)
         img_embeds = self.encode_img(image)
         prompt_embeds = self.wrap_prompt(text_embeds, img_embeds)
-        out_embeds = self.internlm_model.generate(
-            …
+        out_embeds = self.internlm_model.generate(
+            inputs_embeds=prompt_embeds, **self.get_gen_args(**kwargs))
         out_text = self.decode_text(out_embeds)
         return out_text
 
@@ -206,8 +215,8 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
         prompt_embeds = self.wrap_prompt(text_embeds,
                                          img_embeds,
                                          history=history)
-        out_embeds = self.internlm_model.generate(
-            …
+        out_embeds = self.internlm_model.generate(
+            inputs_embeds=prompt_embeds, **self.get_gen_args(**kwargs))
         out_text = self.decode_text(out_embeds)
 
         # trunc at eoh and eoa
@@ -231,7 +240,13 @@ class InternLMXComposerForCausalLM(PreTrainedModel):
                     history=None,
                     add_special=True):
         if add_special:
-            …
+            if history is None:
+                prompt_segs = [
+                    self.meta_instruction + ' <|User|>:',
+                    f'\n{self.eoh} <|Bot|>:'
+                ]
+            else:
+                prompt_segs = [' <|User|>:', f'\n{self.eoh} <|Bot|>:']
         else:
             prompt_segs = [' <|User|>:', ' <|Bot|>:']  # used in wrap history
         prompt_seg_embeds = []
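For reference, the wrap_prompt change above prepends meta_instruction only when there is no history yet, and closes the user segment with the end-of-human token before the bot tag. A string-level sketch of that layout; the real method works on embeddings and splices image embeddings between the segments, and sketch_prompt / user_text are illustrative names, not part of the commit:

meta_instruction = "..."   # stands in for the class attribute shown in the diff
eoh = '<TOKENS_UNUSED_0>'  # end-of-human marker set in __init__

def sketch_prompt(user_text: str, first_turn: bool = True) -> str:
    # Only the first turn carries the meta instruction; follow-ups reuse the role tags.
    prefix = (meta_instruction if first_turn else '') + ' <|User|>:'
    return prefix + user_text + f'\n{eoh} <|Bot|>:'

print(sketch_prompt('Describe the image.'))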
modeling_utils.py
CHANGED
@@ -2,6 +2,7 @@ import logging
 import math
 import os
 from contextlib import contextmanager
+from transformers import StoppingCriteria, StoppingCriteriaList
 
 import timm.models.hub as timm_hub
 import torch
@@ -32,6 +33,7 @@ def download_cached_file(url, check_hash=True, progress=False):
     Download a file from a URL and cache it locally. If the file already exists, it is not downloaded again.
     If distributed, only the main process downloads the file, and the other processes wait for the file to be downloaded.
     """
+
     def get_cached_file_path():
         # a hack to sync the file path across processes
         parts = torch.hub.urlparse(url)
@@ -74,49 +76,58 @@ def all_logging_disabled(highest_level=logging.CRITICAL):
 
 
 class LoRALinear(nn.Linear):
-    def __init__(
-        … (nine removed signature lines not recovered)
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+        lora_r=8,
+        lora_alpha=16,
+        lora_dropout=0.05,
+        **kwargs
+    ) -> None:
         super().__init__(in_features, out_features, bias, device, dtype)
         self.lora_r = lora_r
         self.lora_alpha = lora_alpha
-        if lora_dropout > 0…
+        if lora_dropout > 0.0:
             self.lora_dropout = nn.Dropout(p=lora_dropout)
         else:
             self.lora_dropout = lambda x: x
         self.lora_scaling = self.lora_alpha / self.lora_r
 
-        self.lora_A = nn.Linear(
-            … (five removed lines not recovered)
-            out_features,
-            bias=False,
-            device=device,
-            dtype=dtype)
+        self.lora_A = nn.Linear(
+            in_features, self.lora_r, bias=False, device=device, dtype=dtype
+        )
+        self.lora_B = nn.Linear(
+            self.lora_r, out_features, bias=False, device=device, dtype=dtype
+        )
 
         self.reset_parameters()
 
     def reset_parameters(self):
-        if hasattr(self, …
+        if hasattr(self, "lora_A"):
             # initialize A the same way as the default for nn.Linear and B to zero
             nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))
             nn.init.zeros_(self.lora_B.weight)
-            #print ("lora weight init {} {}".format(torch.mean(self.lora_A.weight), torch.mean(self.lora_B.weight)))
 
     def forward(self, x):
         orig_type = x.dtype
         res = super().forward(x)
         x = x.float()
-        res += self.lora_B(self.lora_A(
-            self.lora_dropout(x))) * self.lora_scaling
+        res += self.lora_B(self.lora_A(self.lora_dropout(x))) * self.lora_scaling
         return res.to(orig_type)
+
+
+class StoppingCriteriaSub(StoppingCriteria):
+    def __init__(self, stops=[], encounters=1):
+        super().__init__()
+        self.stops = stops
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
+        for stop in self.stops:
+            if torch.all((stop == input_ids[:, -len(stop):])).item():
+                return True
+
+        return False