Spaces · Runtime error
Commit cfcddc7 · Parent(s): b916070
Update vcoder_llava/model/vcoder_ds_llava_arch.py
vcoder_llava/model/vcoder_ds_llava_arch.py (CHANGED)
```diff
@@ -158,26 +158,17 @@ class VCoderDSLlavaMetaForCausalLM(ABC):
             seg_features = self.encode_seg_images(seg_images)
 
         if depth_images is not None:
-            try:
-                if type(depth_images) is list or depth_images.ndim == 5:
-                    concat_depth_images = torch.cat([image for image in depth_images], dim=0)
-                    depth_features = self.encode_depth_images(concat_depth_images)
-                    split_sizes = [image.shape[0] for image in depth_images]
-                    depth_features = torch.split(depth_features, split_sizes, dim=0)
-                    depth_features = [x.flatten(0, 1) for x in depth_features]
-                else:
-                    depth_features = self.encode_depth_images(depth_images)
-            except:
-                depth_images = None
-                mask = input_ids != DEPTH_TOKEN_INDEX # drop depth indices
-                input_ids = input_ids[mask]
-                for p in self.get_model().depth_mm_projector.parameters():
-                    p.requires_grad = False
+            is_depth_zero = [torch.mean(d) == 0 for d in depth_images]
+            if type(depth_images) is list or depth_images.ndim == 5:
+                concat_depth_images = torch.cat([image for image in depth_images], dim=0)
+                depth_features = self.encode_depth_images(concat_depth_images)
+                split_sizes = [image.shape[0] for image in depth_images]
+                depth_features = torch.split(depth_features, split_sizes, dim=0)
+                depth_features = [x.flatten(0, 1) for x in depth_features]
+            else:
+                depth_features = self.encode_depth_images(depth_images)
         else:
-            for p in self.get_model().depth_mm_projector.parameters():
-                p.requires_grad = False
+            is_depth_zero = [True] * input_ids.shape[0]
 
         self.get_model().vcoder_lm_emb.weight.data = self.get_model().get_input_embeddings().weight.data.clone()
```
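This hunk drops the old try/except fallback, which on any encoding failure silently dropped the depth tokens and froze `depth_mm_projector`, in favor of an explicit per-sample flag: an all-zero depth map is treated as "no depth input for this sample". A minimal sketch of that sentinel convention; the tensor shapes below are illustrative assumptions, not taken from the repo:

```python
import torch

# All-zero depth maps act as placeholders for "no depth input", so a
# mean of exactly 0 flags the sample as depth-free.
depth_images = [
    torch.zeros(1, 3, 336, 336),       # placeholder: no real depth map
    torch.rand(1, 3, 336, 336) + 0.1,  # real depth map (strictly positive)
]
is_depth_zero = [torch.mean(d) == 0 for d in depth_images]
print(is_depth_zero)  # [tensor(True), tensor(False)]
```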
```diff
@@ -187,13 +178,15 @@ class VCoderDSLlavaMetaForCausalLM(ABC):
         cur_seg_idx = 0
         cur_depth_idx = 0
         for batch_idx, cur_input_ids in enumerate(input_ids):
+            print(cur_input_ids)
             if (cur_input_ids == IMAGE_TOKEN_INDEX).sum() == 0 and (cur_input_ids == SEG_TOKEN_INDEX).sum() == 0:
                 # FIXME: this is a hacky fix, for deepspeed zero3 to work
                 cur_image_features = image_features[cur_image_idx]
                 half_len = cur_input_ids.shape[0] // 2
                 if seg_images is not None:
                     cur_seg_features = seg_features[cur_seg_idx]
-                    if depth_images is not None:
+                    is_cur_depth_zero = is_depth_zero[cur_depth_idx]
+                    if not is_cur_depth_zero:
                         cur_depth_features = depth_features[cur_depth_idx]
                     cur_input_embeds_1 = self.get_model().vcoder_lm_emb(cur_input_ids[:half_len])
                     cur_input_embeds_2 = self.get_model().vcoder_lm_emb(cur_input_ids[half_len:])
```
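The batch-level `depth_images is not None` guard becomes a per-sample lookup into `is_depth_zero`, so a mixed batch can attach depth features to some rows while skipping others. A schematic sketch of the new control flow (names reused from the diff, loop body reduced to prints):

```python
is_depth_zero = [False, True, False]  # mixed batch: middle sample is a placeholder

for cur_depth_idx, is_cur_depth_zero in enumerate(is_depth_zero):
    if not is_cur_depth_zero:
        print(f"sample {cur_depth_idx}: splice in depth features")
    else:
        print(f"sample {cur_depth_idx}: text/seg only")
```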
```diff
@@ -201,7 +194,7 @@ class VCoderDSLlavaMetaForCausalLM(ABC):
                     cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids[:half_len])
                     cur_input_embeds_2 = self.get_model().embed_tokens(cur_input_ids[half_len:])
                 if seg_images is not None:
-                    if depth_images is not None:
+                    if not is_cur_depth_zero:
                         cur_input_embeds = torch.cat([cur_input_embeds_1, cur_depth_features[0:0], cur_seg_features[0:0], cur_image_features[0:0], cur_input_embeds_2], dim=0)
                     else:
                         cur_input_embeds = torch.cat([cur_input_embeds_1, cur_seg_features[0:0], cur_image_features[0:0], cur_input_embeds_2], dim=0)
```
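The `[0:0]` slices here are what the `# FIXME ... deepspeed zero3` comment refers to: a zero-length slice contributes no embedding rows, but keeps the feature tensor (and the vision/seg/depth modules that produced it) connected to the computation graph, so every rank executes the same operations. A toy demonstration with arbitrary sizes:

```python
import torch

emb_1 = torch.randn(5, 8)  # first half of the text embeddings (toy sizes)
emb_2 = torch.randn(5, 8)  # second half
feats = torch.randn(3, 8)  # stand-in for image/seg/depth features

# feats[0:0] has shape (0, 8): it adds no rows to the result, yet keeps
# feats in the autograd graph, which is what the ZeRO-3 workaround needs.
merged = torch.cat([emb_1, feats[0:0], emb_2], dim=0)
print(merged.shape)  # torch.Size([10, 8])
```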
```diff
@@ -243,19 +236,16 @@ class VCoderDSLlavaMetaForCausalLM(ABC):
                 while seg_token_indices.numel() > 0:
                     cur_seg_features = seg_features[cur_seg_idx]
                     seg_token_start = seg_token_indices[0]
-                    if depth_images is None:
-                        cur_new_input_embeds.append(self.get_model().vcoder_lm_emb(cur_input_ids[:seg_token_start]))
                     cur_new_input_embeds.append(cur_seg_features)
                     if labels is not None:
-                        if depth_images is None:
-                            cur_new_labels.append(cur_labels[:seg_token_start])
                         cur_new_labels.append(torch.full((cur_seg_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
                         cur_labels = cur_labels[seg_token_start+1:]
                     cur_seg_idx += 1
                     cur_input_ids = cur_input_ids[seg_token_start+1:]
                     seg_token_indices = torch.where(cur_input_ids == SEG_TOKEN_INDEX)[0]
 
-                if depth_images is not None:
+                is_cur_depth_zero = is_depth_zero[cur_depth_idx]
+                if not is_cur_depth_zero:
                     depth_token_indices = torch.where(cur_input_ids == DEPTH_TOKEN_INDEX)[0]
                     while depth_token_indices.numel() > 0:
                         cur_depth_features = depth_features[cur_depth_idx]
```
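The `torch.full(...)` context line shows the labeling rule for spliced-in features: every inserted feature position is labeled `IGNORE_INDEX` so the language-model loss only covers real text tokens. A compact sketch, assuming the usual Hugging Face value `IGNORE_INDEX = -100` (the diff does not show the constant's definition):

```python
import torch

IGNORE_INDEX = -100                     # assumed: standard HF ignore id
cur_seg_features = torch.randn(4, 8)    # 4 inserted seg-feature positions (toy)
cur_labels = torch.tensor([42, 7, 99])  # labels following the insertion point

masked = torch.full((cur_seg_features.shape[0],), IGNORE_INDEX,
                    dtype=cur_labels.dtype)
print(torch.cat([masked, cur_labels]))
# tensor([-100, -100, -100, -100,   42,    7,   99])
```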
```diff
@@ -269,6 +259,8 @@ class VCoderDSLlavaMetaForCausalLM(ABC):
                         cur_depth_idx += 1
                         cur_input_ids = cur_input_ids[depth_token_start+1:]
                         depth_token_indices = torch.where(cur_input_ids == DEPTH_TOKEN_INDEX)[0]
+                else:
+                    cur_depth_idx += 1
 
             if cur_input_ids.numel() > 0:
                 if seg_images is None:
```
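The added `else: cur_depth_idx += 1` is the subtle part of this hunk: depth features are packed one entry per sample (placeholders get encoded too), so a sample whose depth was flagged as zero must still consume its slot, otherwise every later sample would read the wrong features. A stand-alone illustration:

```python
depth_features = ["featA", "featB", "featC"]  # stand-ins, one entry per sample
is_depth_zero = [False, True, False]          # sample 1 carries a placeholder

cur_depth_idx = 0
used = []
for zero in is_depth_zero:
    if not zero:
        used.append(depth_features[cur_depth_idx])
        cur_depth_idx += 1
    else:
        cur_depth_idx += 1  # skip the slot, mirroring the new else branch
print(used)  # ['featA', 'featC'] -- featB's slot consumed but unused
```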