Upload 2 files
Browse files
- mm_utils.py +0 -31
- mmalaya_arch.py +44 -1
mm_utils.py
CHANGED
@@ -21,15 +21,6 @@ DEFAULT_IM_START_TOKEN = "<im_start>"
|
|
21 |
DEFAULT_IM_END_TOKEN = "<im_end>"
|
22 |
|
23 |
|
24 |
-
def disable_torch_init():
|
25 |
-
"""
|
26 |
-
Disable the redundant torch default initialization to accelerate model creation.
|
27 |
-
"""
|
28 |
-
import torch
|
29 |
-
setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
|
30 |
-
setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
|
31 |
-
|
32 |
-
|
33 |
def load_image_from_base64(image):
|
34 |
return Image.open(BytesIO(base64.b64decode(image)))
|
35 |
|
@@ -63,28 +54,6 @@ def process_images(images, image_processor, model_cfg):
|
|
63 |
return new_images
|
64 |
|
65 |
|
66 |
-
def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
|
67 |
-
prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
|
68 |
-
|
69 |
-
def insert_separator(X, sep):
|
70 |
-
return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
|
71 |
-
|
72 |
-
input_ids = []
|
73 |
-
offset = 0
|
74 |
-
if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
|
75 |
-
offset = 1
|
76 |
-
input_ids.append(prompt_chunks[0][0])
|
77 |
-
|
78 |
-
for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
|
79 |
-
input_ids.extend(x[offset:])
|
80 |
-
|
81 |
-
if return_tensors is not None:
|
82 |
-
if return_tensors == 'pt':
|
83 |
-
return torch.tensor(input_ids, dtype=torch.long)
|
84 |
-
raise ValueError(f'Unsupported tensor type: {return_tensors}')
|
85 |
-
return input_ids
|
86 |
-
|
87 |
-
|
88 |
def get_model_name_from_path(model_path):
|
89 |
model_path = model_path.strip("/")
|
90 |
model_paths = model_path.split("/")
|
|
|
21 |
DEFAULT_IM_END_TOKEN = "<im_end>"
|
22 |
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
def load_image_from_base64(image):
|
25 |
return Image.open(BytesIO(base64.b64decode(image)))
|
26 |
|
|
|
54 |
return new_images
|
55 |
|
56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
def get_model_name_from_path(model_path):
|
58 |
model_path = model_path.strip("/")
|
59 |
model_paths = model_path.split("/")
|
mmalaya_arch.py
CHANGED
@@ -4,6 +4,7 @@ import torch
|
|
4 |
import torch.nn as nn
|
5 |
from transformers import Blip2Model, Blip2Processor, Blip2Config
|
6 |
from .mm_utils import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
|
|
|
7 |
|
8 |
|
9 |
class BLIP2VisionTower(nn.Module):
|
@@ -265,6 +266,48 @@ class MMAlayaMetaForCausalLM(ABC):
|
|
265 |
|
266 |
return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
|
267 |
|
268 |
-
def
|
269 |
tokenizer.add_tokens([DEFAULT_IMAGE_TOKEN], special_tokens=True)
|
270 |
self.resize_token_embeddings(len(tokenizer))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
import torch.nn as nn
|
5 |
from transformers import Blip2Model, Blip2Processor, Blip2Config
|
6 |
from .mm_utils import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
|
7 |
+
from .mm_utils import conv_templates
|
8 |
|
9 |
|
10 |
class BLIP2VisionTower(nn.Module):
|
|
|
266 |
|
267 |
return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
|
268 |
|
269 |
+
def initialize_tokenizer(self, tokenizer):
    """Register the image placeholder token and resize the embeddings.

    Adds ``DEFAULT_IMAGE_TOKEN`` to ``tokenizer`` as a special token, then
    resizes this model's token embeddings to the tokenizer's new length.

    Args:
        tokenizer: The tokenizer to extend; it is modified in place.
    """
    image_tokens = [DEFAULT_IMAGE_TOKEN]
    tokenizer.add_tokens(image_tokens, special_tokens=True)

    # Keep the embedding matrix in sync with the (possibly grown) vocabulary.
    vocab_size = len(tokenizer)
    self.resize_token_embeddings(vocab_size)
|
272 |
+
|
273 |
+
def prepare_for_inference(
    self,
    prompt,
    tokenizer,
    image,
    image_token_index=IMAGE_TOKEN_INDEX,
    return_tensors=None
):
    """Build the model inputs for a single image + prompt generation call.

    Wraps ``prompt`` in the ``"mmalaya_llama"`` conversation template with a
    leading image placeholder, tokenizes it while replacing each ``<image>``
    marker with ``image_token_index``, preprocesses ``image`` with the vision
    tower's image processor, and creates a keyword stopping criterion on the
    template's ``sep2`` string.

    Args:
        prompt: The user's text prompt (without the image placeholder).
        tokenizer: Tokenizer callable returning an object with ``input_ids``
            and exposing ``bos_token_id``.
        image: The image passed to the vision tower's image processor.
        image_token_index: Token id inserted where ``<image>`` occurs.
        return_tensors: ``None`` to keep ``input_ids`` as a list of ints, or
            ``'pt'`` to get a 1-D ``torch.long`` tensor.

    Returns:
        A tuple ``(input_ids, image_tensor, stopping_criteria)``.
        ``image_tensor`` is the processor's ``pixel_values`` cast to half
        precision and moved to CUDA.

    Raises:
        ValueError: If ``return_tensors`` is neither ``None`` nor ``'pt'``.
    """
    # Reject unsupported tensor types before doing any tokenization work.
    if return_tensors is not None and return_tensors != 'pt':
        raise ValueError(f'Unsupported tensor type: {return_tensors}')

    # Load the conversation template and add the user turn (image + prompt)
    # followed by an empty assistant turn for the model to complete.
    conv = conv_templates["mmalaya_llama"].copy()
    inp = DEFAULT_IMAGE_TOKEN + '\n' + prompt
    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    # Tokenize the text around every image placeholder separately.
    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]

    def insert_separator(X, sep):
        # Interleave ``sep`` between consecutive elements of ``X``.
        return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]

    input_ids = []
    offset = 0
    first_chunk = prompt_chunks[0]
    if len(first_chunk) > 0 and first_chunk[0] == tokenizer.bos_token_id:
        # The first chunk starts with BOS: keep a single BOS at the front and
        # drop the first token of every chunk in the loop below.
        offset = 1
        input_ids.append(first_chunk[0])

    # The separator is ``offset + 1`` image tokens long so that slicing with
    # ``[offset:]`` leaves exactly one image token between text chunks.
    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
        input_ids.extend(x[offset:])

    if return_tensors == 'pt':
        # Convert and keep going: the original returned here, which skipped
        # the image tensor and stopping criteria this method must produce.
        input_ids = torch.tensor(input_ids, dtype=torch.long)

    # Set up the stop condition for generation.
    # NOTE(review): KeywordsStoppingCriteria is not among the imports visible
    # in this excerpt, and it may expect a batched tensor rather than a list
    # or 1-D tensor -- confirm against its definition.
    stopping_criteria = KeywordsStoppingCriteria([conv.sep2], tokenizer, input_ids)

    # Preprocess the image. The original referenced an undefined name
    # ``model``; the vision tower belongs to this model instance.
    image_processor = self.get_vision_tower().image_processor
    image_tensor = image_processor(image, return_tensors='pt')['pixel_values'].half().cuda()

    return input_ids, image_tensor, stopping_criteria
|