andrewqian123
committed on
Update modeling_minicpmv.py
modeling_minicpmv.py  CHANGED  (+12 -12)
@@ -102,7 +102,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
                                                                        padding_value=0.0)
                     B, L, _ = all_pixel_values.shape
                     all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
-                    print(B, "BATCH")
+                    # print(B, "BATCH")
                     patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool, device=device)
                     for i in range(B):
                         patch_attn_mask[i, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True
@@ -111,7 +111,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
                     vision_embedding = self.resampler(vision_embedding, tgt_sizes)
                 else:
                     # get vision_embedding foreach
-                    print("HERE, NOT BATCH")
+                    # print("HERE, NOT BATCH")
                     vision_embedding = []
                     for single_tgt_size, single_pixel_values in zip(tgt_sizes, all_pixel_values):
                         single_pixel_values = single_pixel_values.unsqueeze(0)
@@ -170,7 +170,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
                     elif self.training:
                         cur_vllm_emb += cur_vs_hs[0].mean() * 0

-        print(vllm_embedding.shape)
+        # print(vllm_embedding.shape)
         return vllm_embedding, vision_hidden_states

     def forward(self, data, **kwargs):
@@ -194,7 +194,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
                 result = result[1:]
             if result[-1] == tokenizer.eos_id or result[-1] == tokenizer.eot_id:
                 result = result[:-1]
-            print(result)
+            # print(result)
             result_text.append(tokenizer.decode(result).strip())
         return result_text

@@ -279,7 +279,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
                 input_embeds,
                 vision_hidden_states_dummy,
             ) = self.get_vllm_embedding(model_inputs)
-            print(input_embeds.shape, f"INPUT_EMBEDS {counter}")
+            # print(input_embeds.shape, f"INPUT_EMBEDS {counter}")
             counter += 1
             batch.append(input_embeds)

@@ -309,7 +309,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
             # Concatenate the padding tensor to the left of the original tensor
             tensor = torch.cat((padding_tensor, tensor), dim=1)

-            print(tensor.shape, "UPDATED_SHAPE")
+            # print(tensor.shape, "UPDATED_SHAPE")

             # Update the batch with the padded tensor
             batch[place] = tensor
@@ -318,17 +318,17 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
             attention_mask.append(to_add)

         attention_mask = torch.tensor(attention_mask)
-        print(attention_mask.shape)
-        print(attention_mask, "ATTENTION")
+        # print(attention_mask.shape)
+        # print(attention_mask, "ATTENTION")
         # attention_mask = attention_mask.to(self.device)
         # padded_tensors = [torch.nn.functional.pad(tensor, (0, 0, 0, max_x - tensor.shape[1])) for tensor in batch]

         # Step 3: Stack the padded tensors into a single batch
-        for stuff in batch:
-            print(stuff.shape, "SHAPE")
+        # for stuff in batch:
+        #     print(stuff.shape, "SHAPE")
         batch = torch.cat(batch, dim=0)
-        print(batch.shape)
-        print(batch)
+        # print(batch.shape)
+        # print(batch)
         # output_ids = self._decode(input_embeds, tokenizer, **kwargs)
         if stream:
             kwargs.pop("decode_text")
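For reference, the first hunk sits in the batched vision path, where padded pixel values get a boolean patch mask marking which positions are real patches (the quantity the removed print(B, "BATCH") was inspecting). A minimal standalone sketch of that masking step, with made-up tgt_sizes values, explicit indexing, and no device handling, not the repo's exact code:

import torch

# Illustrative target sizes: (patches_high, patches_wide) per image; values are invented.
tgt_sizes = torch.tensor([[8, 12], [6, 10]])
B = tgt_sizes.shape[0]
max_patches = int(torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1]))  # 96 here

# Mark the first h*w positions of each row as valid patches; the padded tail stays False.
patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool)
for i in range(B):
    n_valid = int(tgt_sizes[i][0] * tgt_sizes[i][1])
    patch_attn_mask[i, 0, :n_valid] = True

print(patch_attn_mask.sum(dim=-1))  # tensor([[96], [60]]) -> valid patches per image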
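The later hunks (old lines 279-334) belong to the fork's batched generate path: each sample's input_embeds is left-padded to the longest sequence, a matching 0/1 attention mask is recorded, and the padded tensors are concatenated along the batch dimension, which is what the commented-out shape prints were checking. A minimal sketch of that pattern with invented names and shapes (embeds, hidden size 4), not the repo's exact code:

import torch

# Two samples of input embeddings with different sequence lengths: (1, seq_len, hidden).
embeds = [torch.randn(1, 7, 4), torch.randn(1, 5, 4)]
max_len = max(t.shape[1] for t in embeds)

batch, attention_mask = [], []
for t in embeds:
    pad_len = max_len - t.shape[1]
    padding = torch.zeros(1, pad_len, t.shape[2], dtype=t.dtype)
    batch.append(torch.cat((padding, t), dim=1))           # pad on the left
    attention_mask.append([0] * pad_len + [1] * t.shape[1])

batch = torch.cat(batch, dim=0)                # (2, max_len, hidden)
attention_mask = torch.tensor(attention_mask)  # (2, max_len)
print(batch.shape, attention_mask.shape)

Padding on the left keeps each sample's last real token adjacent to the tokens that will be generated next, which is why the padding tensor is concatenated in front of the embeddings rather than behind them.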