Commit f5ae994 by ga89tiy
Parent(s): 9de5cd4

last update

Files changed:
- LLAVA_Biovil/biovil_t/transformer.py +1 -7
- LLAVA_Biovil/llava/model/builder.py +0 -13
- LLAVA_Biovil/llava/model/language_model/llava_llama.py +5 -8
- LLAVA_Biovil/llava/model/llava_arch.py +9 -111
- __pycache__/utils.cpython-310.pyc +0 -0
- example_code.py +3 -25
- findings_classifier/__pycache__/chexpert_train.cpython-310.pyc +0 -0
LLAVA_Biovil/biovil_t/transformer.py
CHANGED
@@ -93,17 +93,11 @@ def forward(self, current_image: torch.Tensor, previous_image: Optional[torch.Tensor]

     def forward_after_reshape(self,
                               x: torch.Tensor,
-                              pos_embed: torch.Tensor,
-                              x_previous: Optional[torch.Tensor] = None) -> torch.Tensor:
+                              pos_embed: torch.Tensor) -> torch.Tensor:
         B, L, _ = x.shape  # Batch, Sequence length, Feature dimension

         # Positional and type embeddings
         type_embed = self.type_embed[0].expand(B, L, -1)
-        if x_previous is not None:
-            x = torch.cat((x, x_previous), dim=1)
-            pos_embed = torch.cat((pos_embed, pos_embed), dim=1)
-            prev_type_embed = self.type_embed[1].expand(B, L, -1)
-            type_embed = torch.cat((type_embed, prev_type_embed), dim=1)

         # Add positional and type embeddings (used in query and key matching)
         pos_and_type_embed = pos_embed + type_embed
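Note on this change: forward_after_reshape no longer accepts x_previous, so it only adds positional and type embeddings for the current image. A minimal standalone sketch of the remaining logic, with made-up shapes and a stand-in for self.type_embed (not the repo's actual module):

import torch
import torch.nn as nn

B, L, D = 2, 196, 128                                      # assumed batch size, patch count, feature dim
x = torch.randn(B, L, D)                                   # current-image patch features
pos_embed = torch.randn(1, L, D).expand(B, L, D)           # learned positional embeddings in the real model

type_embed_table = nn.Parameter(torch.zeros(2, 1, 1, D))   # stand-in for self.type_embed
type_embed = type_embed_table[0].expand(B, L, -1)          # only the "current image" type is used now

# the combination the simplified method keeps (used in query and key matching)
pos_and_type_embed = pos_embed + type_embed
print(pos_and_type_embed.shape)                            # torch.Size([2, 196, 128])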
LLAVA_Biovil/llava/model/builder.py
CHANGED
@@ -184,19 +184,6 @@ def load_from_hf(repo_id, filename, subfolder=None):
                 new_vision_tower_state_dict[new_k] = v
             print('Loaded additional vision tower weights...')
             vision_tower.load_state_dict(new_vision_tower_state_dict, strict=False)
-            # weight difference sum([torch.norm(value-vision_tower.state_dict()[key].cpu()) for key,value in new_vision_tower_state_dict.items()])
-
-        image_pooler = model.get_image_pooler()
-        if image_pooler is not None:
-            image_pooler.to(device=device, dtype=torch.float16)
-            if non_lora_trainables is not None and any(k.startswith('model.image_pooler.') for k in non_lora_trainables):
-                new_image_pooler_state_dict = {}
-                for k, v in non_lora_trainables.items(): # we need remapping, because state_dict from model is always like model.vision_tower. It should be vision_tower.
-                    if 'model.image_pooler.' in k:
-                        new_k = k.replace('model.image_pooler.', '')
-                        new_image_pooler_state_dict[new_k] = v
-                print('Loading additional image pooler weights...')
-                image_pooler.load_state_dict(new_image_pooler_state_dict, strict=True)

     if hasattr(model.config, "max_sequence_length"):
         context_len = model.config.max_sequence_length
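Note on this change: the block that loaded optional image-pooler weights from non_lora_trainables is gone; only the vision-tower key remapping above survives. A small sketch of that remapping pattern with a dummy state dict (placeholder keys, not the real checkpoint):

import torch

# hypothetical extra weights keyed with the 'model.vision_tower.' prefix, as saved from the full model
non_lora_trainables = {
    'model.vision_tower.encoder.weight': torch.zeros(4, 4),
    'model.mm_projector.bias': torch.zeros(4),
}

# keep only vision-tower entries and strip the prefix so the bare vision tower can load them
new_vision_tower_state_dict = {
    k.replace('model.vision_tower.', ''): v
    for k, v in non_lora_trainables.items()
    if 'model.vision_tower.' in k
}
print(list(new_vision_tower_state_dict))    # ['encoder.weight']
# vision_tower.load_state_dict(new_vision_tower_state_dict, strict=False)  # as in the builder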
LLAVA_Biovil/llava/model/language_model/llava_llama.py
CHANGED
@@ -35,20 +35,19 @@ class LlavaConfig(LlamaConfig):
 class LlavaLlamaModel(LlavaMetaModel, LlamaModel):
     config_class = LlavaConfig

-    def __init__(self, config: LlamaConfig
-        super(LlavaLlamaModel, self).__init__(config
+    def __init__(self, config: LlamaConfig):
+        super(LlavaLlamaModel, self).__init__(config)


 class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
     config_class = LlavaConfig

-    def __init__(self, config
+    def __init__(self, config):
         super(LlamaForCausalLM, self).__init__(config)
-        self.model = LlavaLlamaModel(config
+        self.model = LlavaLlamaModel(config)
         self.pretraining_tp = config.pretraining_tp
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        self.mv_type = mv_type

         # Initialize weights and apply final processing
         self.post_init()

@@ -68,7 +67,6 @@ def forward(
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         images: Optional[torch.FloatTensor] = None,
-        prev_images: Optional[torch.FloatTensor] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         if inputs_embeds is None:

@@ -85,8 +83,7 @@ def forward(
                 attention_mask,
                 past_key_values,
                 labels,
-                images,
-                prev_images
+                images
             )
             output = super().forward(
                 input_ids=input_ids,
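Note on this change: both constructors drop the multi-view argument, and prev_images disappears from forward and from the prepare_inputs_labels_for_multimodal call. A sketch of how the slimmed-down model would be constructed; the tiny config values are placeholders and the import assumes this repo is on the Python path:

from transformers import LlamaConfig
from LLAVA_Biovil.llava.model.language_model.llava_llama import LlavaLlamaForCausalLM

config = LlamaConfig(hidden_size=512, intermediate_size=1024, num_hidden_layers=2,
                     num_attention_heads=8, vocab_size=32000)   # placeholder sizes only
model = LlavaLlamaForCausalLM(config)   # no mv_type argument anymore
# model(..., images=...) is now called without prev_images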
LLAVA_Biovil/llava/model/llava_arch.py
CHANGED
@@ -27,13 +27,12 @@

 class LlavaMetaModel:

-    def __init__(self, config
+    def __init__(self, config):
         super(LlavaMetaModel, self).__init__(config)

         if hasattr(config, "mm_vision_tower"):
             self.vision_tower = build_vision_tower(config, delay_load=True)
             self.mm_projector = build_vision_projector(config)
-            self.image_pooler = build_image_pooler(config) if "pool" in mv_type else None

     def get_vision_tower(self):
         vision_tower = getattr(self, 'vision_tower', None)

@@ -51,7 +50,6 @@ def initialize_vision_modules(self, model_args, fsdp=None):
         pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter

         self.config.mm_vision_tower = vision_tower
-        self.config.mv_type = getattr(model_args, 'mv_type', False)

         if self.get_vision_tower() is None:
             if self.config.mm_vision_tower == 'biovil':

@@ -188,87 +186,8 @@ def pad_embeddings_mv(self, embeddings, padding_value=0):

         return padded_embeddings.flatten(1,2), mask

-    def encode_images_pooled(self, images, split_sizes, num_imgs_present, num_imgs_past, mv_type="pool_all"):
-        image_pooler = self.get_image_pooler()
-        image_features = self.get_model().get_vision_tower()(images)
-        if self.get_model().config.mm_vision_tower == 'biovil':
-            image_features = image_features.patch_embeddings
-            # flatten
-            image_features = image_features.flatten(2).transpose(1,2)
-        if split_sizes is not None:
-            image_features = torch.split(image_features, split_sizes, dim=0)
-
-            if mv_type == "pool_all":
-                # merge present and past per batch
-                present_features = [image_features[i] for i in range(len(num_imgs_present))]
-                past_features = []
-                i = 0
-                for num_imgs_elem in num_imgs_past:
-                    if num_imgs_elem != 0:
-                        past_features.append(image_features[i+len(num_imgs_present)])
-                        i += 1
-                    else:
-                        past_features.append(None)
-
-                all_img_features = []
-                for idx, (batch_num_present, batch_num_past) in enumerate(zip(num_imgs_present, num_imgs_past)):
-                    if batch_num_past == 0:
-                        all_img_features.append(present_features[idx])
-                    else:
-                        all_img_features.append(torch.cat((present_features[idx], past_features[idx]), dim=0))
-
-                all_img_features, mask, token_type_ids = self.pad_embeddings(all_img_features, num_imgs_present, num_imgs_past)
-                all_img_features = image_pooler(all_img_features, mask, token_type_ids)
-
-            elif mv_type == "pool_concat":
-                present_features = [image_features[i] for i in range(len(num_imgs_present))]
-                past_features = [image_features[i+len(num_imgs_present)] for i in range(len(image_features)-len(num_imgs_present))]
-                present_features, mask_present, _ = self.pad_embeddings(present_features)
-                past_features, mask_past, _ = self.pad_embeddings(past_features)
-                present_features = image_pooler(present_features, mask_present)
-                past_features = image_pooler(past_features, mask_past)
-                # TODO maybe max pool on past features to save tokens
-                # concat present and past per batch if past is not empty
-                all_img_features = []
-                idx_present = 0
-                idx_past = 0
-                for batch_num_present, batch_num_past in zip(num_imgs_present, num_imgs_past):
-                    if batch_num_past == 0:
-                        all_img_features.append(present_features[idx_present])
-                        idx_present += 1
-                    else:
-                        all_img_features.append(torch.cat((present_features[idx_present], past_features[idx_past]), dim=0))
-                        idx_present += 1
-                        idx_past += 1
-            else:
-                raise NotImplementedError
-            if type(all_img_features) is list:
-                split_sizes = [image.shape[0] for image in all_img_features]
-                all_img_features = self.get_model().mm_projector(torch.cat(all_img_features, dim=0))
-                all_img_features = torch.split(all_img_features, split_sizes, dim=0)
-
-            else:
-                all_img_features = self.get_model().mm_projector(all_img_features)
-            return all_img_features
-
-    def encode_images_pooled_mv(self, images, split_sizes):
-        image_pooler = self.get_image_pooler()
-        image_features = self.get_model().get_vision_tower()(images)
-        if split_sizes is not None:
-            image_features = torch.split(image_features, split_sizes, dim=0)
-            image_features, mask = self.pad_embeddings_mv(image_features)
-            image_features = image_pooler(image_features, mask)
-        else:
-            mask = torch.ones((image_features.shape[0], image_features.shape[1]), dtype=torch.bool, device=image_features[0].device)
-            image_features = image_pooler(image_features, mask)
-        image_features = self.get_model().mm_projector(image_features)
-        return image_features
-
-    def get_image_pooler(self):
-        return self.get_model().get_image_pooler()
-
     def prepare_inputs_labels_for_multimodal(
-        self, input_ids, position_ids, attention_mask, past_key_values, labels, images
+        self, input_ids, position_ids, attention_mask, past_key_values, labels, images
     ):
         vision_tower = self.get_vision_tower()
         if vision_tower is None or images is None or input_ids.shape[1] == 1:

@@ -283,35 +202,14 @@ def prepare_inputs_labels_for_multimodal(
             return input_ids, position_ids, attention_mask, past_key_values, None, labels

         if type(images) is list or images.ndim == 5:
-
-
-
-
-
-
-            if getattr(self.config, 'mv_type') == "pool_all":
-                concat_images = torch.cat((torch.cat([image for image in images], dim=0), torch.cat([image for image in prev_images if image is not None], dim=0))) # first present, then past, all will be merged
-                split_sizes = [image.shape[0] for image in images] + [image.shape[0] for image in prev_images if image is not None]
-                num_imgs_present = [image.shape[0] if image is not None else 0 for image in images]
-                num_imgs_past = [image.shape[0] if image is not None else 0 for image in prev_images]
-                image_features = self.encode_images_pooled(concat_images, split_sizes, num_imgs_present, num_imgs_past, "pool_all")
-            if getattr(self.config, 'mv_type') == "pool_concat": # TODO make sure to allow empty past -> shorter sequence
-                concat_images = torch.cat((torch.cat([image for image in images], dim=0), torch.cat([image for image in prev_images if image is not None], dim=0))) # first present, then past, all will be merged
-                split_sizes = [image.shape[0] for image in images] + [image.shape[0] for image in prev_images if image is not None]
-                num_imgs_present = [image.shape[0] if image is not None else 0 for image in images]
-                num_imgs_past = [image.shape[0] if image is not None else 0 for image in prev_images]
-                image_features = self.encode_images_pooled(concat_images, split_sizes, num_imgs_present, num_imgs_past, "pool_concat")
-            if getattr(self.config, 'mv_type') == "pool": #no past images
-                concat_images = torch.cat([image for image in images], dim=0)
-                split_sizes = [image.shape[0] for image in images]
-                image_features = self.encode_images_pooled_mv(concat_images, split_sizes)
+            concat_images = torch.cat([image for image in images], dim=0)
+            image_features = self.encode_images(concat_images)
+            split_sizes = [image.shape[0] for image in images]
+            image_features = torch.split(image_features, split_sizes, dim=0)
+            image_features = [x.flatten(0, 1).to(self.device) for x in image_features]
+
         else:
-
-            image_features = self.encode_images_pooled(images, None).to(self.device)
-        elif hasattr(self.config, 'mv_type') and getattr(self.config, 'mv_type') == "pool":
-            image_features = self.encode_images_pooled_mv(images, None).to(self.device)
-        else:
-            image_features = self.encode_images(images).to(self.device)
+            image_features = self.encode_images(images).to(self.device)

         # TODO: image start / end is not implemented here to support pretraining.
         if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
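Note on this change: the multi-view pooling paths (encode_images_pooled, encode_images_pooled_mv, get_image_pooler) are removed, and the list/5-D branch now simply concatenates all images, encodes them once, and splits the features back per sample. A minimal sketch of that concat/split/flatten pattern, with a dummy encoder standing in for the model's encode_images and made-up feature sizes:

import torch

def encode_images(batch):                       # stand-in for vision tower + projector
    n = batch.shape[0]
    return torch.randn(n, 16, 64)               # (num_images, patches, hidden), sizes assumed

# two samples with different numbers of images each (the list / ndim == 5 case)
images = [torch.randn(2, 3, 224, 224), torch.randn(1, 3, 224, 224)]

concat_images = torch.cat([image for image in images], dim=0)      # (3, 3, 224, 224)
image_features = encode_images(concat_images)                      # (3, 16, 64)
split_sizes = [image.shape[0] for image in images]                 # [2, 1]
image_features = torch.split(image_features, split_sizes, dim=0)
image_features = [x.flatten(0, 1) for x in image_features]         # per-sample (num_images*16, 64)
print([f.shape for f in image_features])    # [torch.Size([32, 64]), torch.Size([16, 64])]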
__pycache__/utils.cpython-310.pyc
ADDED
Binary file (3.68 kB).
example_code.py
CHANGED
@@ -6,7 +6,7 @@ import requests
 import torch
 from PIL import Image
 import numpy as np
-from huggingface_hub import snapshot_download
+from huggingface_hub import snapshot_download, hf_hub_download

 from LLAVA_Biovil.llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria, remap_to_uint8
 from LLAVA_Biovil.llava.model.builder import load_pretrained_model

@@ -18,13 +18,12 @@ from utils import create_chest_xray_transform_for_inference, init_chexpert_predictor

 def load_model_from_huggingface(repo_id):
     # Download model files
-    model_path = snapshot_download(repo_id=repo_id, revision="main"
+    model_path = snapshot_download(repo_id=repo_id, revision="main")
     model_path = Path(model_path)

     tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, model_base='liuhaotian/llava-v1.5-7b',
                                                                            model_name="llava-v1.5-7b-task-lora_radialog_instruct_llava_biovil_unfrozen_2e-5_5epochs_v5_checkpoint-21000", load_8bit=False, load_4bit=False)

-
     return tokenizer, model, image_processor, context_len


@@ -37,7 +36,7 @@ if __name__ == '__main__':
     image = remap_to_uint8(np.array(image))
     image = Image.fromarray(image).convert("L")

-    tokenizer, model, image_processor, context_len = load_model_from_huggingface(repo_id="
+    tokenizer, model, image_processor, context_len = load_model_from_huggingface(repo_id="ChantalPellegrini/RaDialog-interactive-radiology-report-generation")
     cp_model, cp_class_names, cp_transforms = init_chexpert_predictor()

     model.config.tokenizer_padding_side = "left"

@@ -82,27 +81,6 @@ if __name__ == '__main__':
     pred = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip().replace("</s>", "")
     print("ASSISTANT: ", pred)

-    # add prediction to conversation
-    conv.messages.pop()
-    conv.append_message("ASSISTANT", pred)
-    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
-    stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
-
-    # generate a report
-    with torch.inference_mode():
-        output_ids = model.generate(
-            input_ids,
-            images=image_tensor,
-            do_sample=False,
-            use_cache=True,
-            max_new_tokens=300,
-            stopping_criteria=[stopping_criteria],
-            pad_token_id=tokenizer.pad_token_id
-        )
-
-    pred = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip().replace("</s>", "")
-    print("ASSISTANT: ", pred)
-
     # add prediction to conversation
     conv.messages.pop()
     conv.append_message("ASSISTANT", pred)
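Note on this change: the accidentally duplicated second report-generation block is removed, and the snapshot_download call is properly closed. A short sketch of the download step that remains (requires huggingface_hub; the repo id matches the updated script):

from pathlib import Path
from huggingface_hub import snapshot_download

# fetch the model repo once; subsequent calls reuse the local cache
model_path = Path(snapshot_download(
    repo_id="ChantalPellegrini/RaDialog-interactive-radiology-report-generation",
    revision="main",
))
print(model_path)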
findings_classifier/__pycache__/chexpert_train.cpython-310.pyc
CHANGED
Binary files a/findings_classifier/__pycache__/chexpert_train.cpython-310.pyc and b/findings_classifier/__pycache__/chexpert_train.cpython-310.pyc differ