andrewqian123 committed
Commit b0d30e3 (verified) · 1 Parent(s): 4ca6160

Update modeling_minicpmv.py

Files changed (1)
  1. modeling_minicpmv.py +12 -12
modeling_minicpmv.py CHANGED
@@ -102,7 +102,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
                                                                    padding_value=0.0)
                 B, L, _ = all_pixel_values.shape
                 all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
-                print(B, "BATCH")
+                # print(B, "BATCH")
                 patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool, device=device)
                 for i in range(B):
                     patch_attn_mask[i, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True
@@ -111,7 +111,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
                 vision_embedding = self.resampler(vision_embedding, tgt_sizes)
             else:
                 # get vision_embedding foreach
-                print("HERE, NOT BATCH")
+                # print("HERE, NOT BATCH")
                 vision_embedding = []
                 for single_tgt_size, single_pixel_values in zip(tgt_sizes, all_pixel_values):
                     single_pixel_values = single_pixel_values.unsqueeze(0)
@@ -170,7 +170,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
             elif self.training:
                 cur_vllm_emb += cur_vs_hs[0].mean() * 0
 
-        print(vllm_embedding.shape)
+        # print(vllm_embedding.shape)
         return vllm_embedding, vision_hidden_states
 
     def forward(self, data, **kwargs):
@@ -194,7 +194,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
                 result = result[1:]
             if result[-1] == tokenizer.eos_id or result[-1] == tokenizer.eot_id:
                 result = result[:-1]
-            print(result)
+            # print(result)
             result_text.append(tokenizer.decode(result).strip())
         return result_text
 
@@ -279,7 +279,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
                 input_embeds,
                 vision_hidden_states_dummy,
             ) = self.get_vllm_embedding(model_inputs)
-            print(input_embeds.shape, f"INPUT_EMBEDS {counter}")
+            # print(input_embeds.shape, f"INPUT_EMBEDS {counter}")
             counter += 1
             batch.append(input_embeds)
 
@@ -309,7 +309,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
                     # Concatenate the padding tensor to the left of the original tensor
                     tensor = torch.cat((padding_tensor, tensor), dim=1)
 
-                    print(tensor.shape, "UPDATED_SHAPE")
+                    # print(tensor.shape, "UPDATED_SHAPE")
 
                     # Update the batch with the padded tensor
                     batch[place] = tensor
@@ -318,17 +318,17 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
                 attention_mask.append(to_add)
 
             attention_mask = torch.tensor(attention_mask)
-            print(attention_mask.shape)
-            print(attention_mask, "ATTENTION")
+            # print(attention_mask.shape)
+            # print(attention_mask, "ATTENTION")
             # attention_mask = attention_mask.to(self.device)
             # padded_tensors = [torch.nn.functional.pad(tensor, (0, 0, 0, max_x - tensor.shape[1])) for tensor in batch]
 
             # Step 3: Stack the padded tensors into a single batch
-            for stuff in batch:
-                print(stuff.shape, "SHAPE")
+            # for stuff in batch:
+            #     print(stuff.shape, "SHAPE")
             batch = torch.cat(batch, dim=0)
-            print(batch.shape)
-            print(batch)
+            # print(batch.shape)
+            # print(batch)
             # output_ids = self._decode(input_embeds, tokenizer, **kwargs)
             if stream:
                 kwargs.pop("decode_text")