Upload model
- CXR_LLAVA_HF.py +12 -4
- config.json +3 -3
CXR_LLAVA_HF.py
CHANGED
@@ -8,7 +8,8 @@ from transformers import TextIteratorStreamer
 from transformers import StoppingCriteria, GenerationConfig
 from threading import Thread
 from dataclasses import dataclass
-
+import numpy as np
+from PIL import Image
 # Model Constants
 IGNORE_INDEX = -100
 IMAGE_TOKEN_INDEX = -200
@@ -596,8 +597,16 @@ class CXRLLAVAModel(PreTrainedModel):
     def generate_cxr_repsonse(self, chat, image, temperature=0.2, top_p=0.8):
         with torch.no_grad():
             streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)
-
-
+
+            if np.array(image).max() > 255:
+                raise Exception("WARNING. 16-bit image is not supported.")
+
+            image = image.convert('L')  # convert to grayscale
+            image = np.array(image)
+
+            if len(image.shape) == 2:
+                image = np.expand_dims(image, axis=-1)  # (width, height) --> (width, height, 1)
+
             prompt = self.apply_chat_template(chat)
             images = self.vision_tower.image_processor(image, return_tensors='pt')['pixel_values']
             images = images.to(self.device)
@@ -610,7 +619,6 @@ class CXRLLAVAModel(PreTrainedModel):
             max_context_length = getattr(self.config, 'max_position_embeddings', 2048)

             max_new_tokens = min(512, max_context_length - input_ids.shape[-1] - num_image_tokens)
-
             thread = Thread(target=self.generate, kwargs=dict(
                 inputs=input_ids,
                 do_sample=do_sample,
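For context, here is a minimal standalone sketch of what the added preprocessing does to a PIL input before it reaches the image processor. It simply mirrors the added lines above; the file name is hypothetical.

import numpy as np
from PIL import Image

image = Image.open("chest_xray.png")  # hypothetical 8-bit input file
# 16-bit images carry pixel values above 255 and are now rejected up front.
if np.array(image).max() > 255:
    raise Exception("WARNING. 16-bit image is not supported.")
image = image.convert('L')   # collapse RGB/RGBA to single-channel grayscale
image = np.array(image)      # PIL -> numpy, shape (height, width)
if len(image.shape) == 2:
    image = np.expand_dims(image, axis=-1)  # (height, width) -> (height, width, 1)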
config.json
CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "G:\\Temp\\finetune_result\\LLAMA2-7B-CHAT_ViT-L-16-512_MOREKEYWORD_LN_PATCH_FINETUNE_ChexpertJSON_POSTTRAIN_25000_DIST",
   "architectures": [
     "CXRLLAVAModel"
   ],
@@ -26,7 +26,7 @@
     "std": 0.3821719215686275
   },
   "llama": {
-    "_name_or_path": "
+    "_name_or_path": "/home/jovyan/llava/SW_LLAVA/LLAMA2-7B-CHAT_ViT-L-16-512_MOREKEYWORD_LN_PATCH_FINETUNE_ChexpertJSON_POSTTRAIN",
     "add_cross_attention": false,
     "architectures": [
       "LlamaForCausalLM"
@@ -105,7 +105,7 @@
     "vocab_size": 32000
   },
   "llama_model_dtype": "bf16",
-  "llama_model_path": "
+  "llama_model_path": "/home/jovyan/llava/SW_LLAVA/LLAMA2-7B-CHAT_ViT-L-16-512_MOREKEYWORD_LN_PATCH_FINETUNE_ChexpertJSON_POSTTRAIN",
   "mm_projector_dim": 1024,
   "mm_projector_dtype": "fp32",
   "mm_projector_path": null,
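Taken together, the commit lets callers hand generate_cxr_repsonse a PIL image directly. A hypothetical invocation, assuming model is a loaded CXRLLAVAModel and that the chat schema below matches what apply_chat_template expects (neither is defined by this commit):

from PIL import Image

chat = [{"role": "user", "content": "Write a radiologic report on the given chest radiograph."}]  # assumed schema
image = Image.open("chest_xray.png")  # must be 8-bit; a 16-bit image now raises an Exception
response = model.generate_cxr_repsonse(chat, image, temperature=0.2, top_p=0.8)
# The method's return value is not visible in these hunks; generation itself
# runs on a background Thread and streams tokens through TextIteratorStreamer.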