shikunl committed
Commit ef365f5 • 1 Parent(s): fb94f78
Files changed (3)
  1. app_caption.py +1 -1
  2. app_vqa.py +1 -1
  3. prismer_model.py +13 -9
app_caption.py CHANGED
@@ -28,7 +28,7 @@ def create_demo():
     object_detection = gr.Image(label='Object Detection')
     ocr = gr.Image(label='OCR Detection')
 
-    inputs = [image, model_name, 'caption']
+    inputs = [image, model_name]
     outputs = [caption, depth, edge, normals, segmentation, object_detection, ocr]
 
     # paths = sorted(pathlib.Path('prismer/images').glob('*'))
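For context: inputs is the list of Gradio components the demo forwards to Model.run_caption, and the 'caption' literal can be dropped because the mode is now fixed inside run_caption_model (see prismer_model.py below). A minimal sketch of the hookup, where the button name is an assumption rather than something shown in this diff:

    # Hypothetical wiring; `run_button` is assumed, not part of this diff.
    run_button.click(fn=model.run_caption, inputs=inputs, outputs=outputs)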
app_vqa.py CHANGED
@@ -28,7 +28,7 @@ def create_demo():
     object_detection = gr.Image(label='Object Detection')
     ocr = gr.Image(label='OCR Detection')
 
-    inputs = [image, model_name, 'vqa', question]
+    inputs = [image, model_name, question]
    outputs = [answer, depth, edge, normals, segmentation, object_detection, ocr]
 
     # paths = sorted(pathlib.Path('prismer/images').glob('*'))
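Same pattern for VQA: only the 'vqa' literal goes away, while the question textbox stays in the inputs, since run_vqa still takes the question string. A sketch under the same assumptions:

    # Hypothetical wiring; `run_button` is assumed, not part of this diff.
    run_button.click(fn=model.run_vqa, inputs=inputs, outputs=outputs)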
prismer_model.py CHANGED
@@ -19,6 +19,7 @@ from dataset import create_dataset, create_loader
 from dataset.utils import pre_question
 from model.prismer_caption import PrismerCaption
 from model.prismer_vqa import PrismerVQA
+from model.modules.utils import interpolate_pos_embed
 
 
 def download_models() -> None:
@@ -91,7 +92,8 @@ class Model:
             }
             model = PrismerCaption(config)
             state_dict = torch.load(f'prismer/logging/pretrain_{model_name}/pytorch_model.bin', map_location='cuda:0')
-
+            state_dict['expert_encoder.positional_embedding'] = interpolate_pos_embed(state_dict['expert_encoder.positional_embedding'],
+                                                                                      len(model.expert_encoder.positional_embedding))
         elif self.mode == 'vqa':
             config = {
                 'dataset': 'demo',
@@ -105,6 +107,8 @@ class Model:
 
             model = PrismerVQA(config)
             state_dict = torch.load(f'prismer/logging/vqa_{model_name}/pytorch_model.bin', map_location='cuda:0')
+            state_dict['expert_encoder.positional_embedding'] = interpolate_pos_embed(state_dict['expert_encoder.positional_embedding'],
+                                                                                      len(model.expert_encoder.positional_embedding))
 
         model.load_state_dict(state_dict)
         model.eval()
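Both branches now resize the checkpoint's expert positional embedding to the freshly built model's length before load_state_dict, so checkpoints trained with a different sequence length still load. The actual helper lives in model/modules/utils.py; a minimal sketch of what such an interpolation typically looks like (an assumption, not the repository's code):

    import torch
    import torch.nn.functional as F

    def interpolate_pos_embed(pos_embed: torch.Tensor, target_len: int) -> torch.Tensor:
        """Linearly resize a [seq_len, dim] positional embedding to [target_len, dim]."""
        if pos_embed.shape[0] == target_len:
            return pos_embed
        # F.interpolate expects [batch, channels, length], so move dim to channels.
        resized = F.interpolate(pos_embed.t().unsqueeze(0), size=target_len,
                                mode='linear', align_corners=False)
        return resized.squeeze(0).t()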
@@ -116,8 +120,8 @@ class Model:
         self.mode = mode
 
     @torch.inference_mode()
-    def run_caption_model(self, exp_name: str, mode: str) -> str:
-        self.set_model(exp_name, mode)
+    def run_caption_model(self, exp_name: str) -> str:
+        self.set_model(exp_name, 'caption')
         _, test_dataset = create_dataset('caption', self.config)
         test_loader = create_loader(test_dataset, batch_size=1, num_workers=4, train=False)
         experts, _ = next(iter(test_loader))
@@ -128,15 +132,15 @@ class Model:
         caption = caption.capitalize() + '.'
         return caption
 
-    def run_caption(self, image_path: str, model_name: str, mode: str) -> tuple[str | None, ...]:
+    def run_caption(self, image_path: str, model_name: str) -> tuple[str | None, ...]:
         out_paths = run_experts(image_path)
-        caption = self.run_caption_model(model_name, mode)
+        caption = self.run_caption_model(model_name)
         label_prettify(image_path, out_paths)
         return caption, *out_paths
 
     @torch.inference_mode()
-    def run_vqa_model(self, exp_name: str, mode: str, question: str) -> str:
-        self.set_model(exp_name, mode)
+    def run_vqa_model(self, exp_name: str, question: str) -> str:
+        self.set_model(exp_name, 'vqa')
         _, test_dataset = create_dataset('caption', self.config)
         test_loader = create_loader(test_dataset, batch_size=1, num_workers=4, train=False)
         experts, _ = next(iter(test_loader))
@@ -148,8 +152,8 @@ class Model:
         answer = answer.capitalize() + '.'
         return answer
 
-    def run_vqa(self, image_path: str, model_name: str, mode: str, question: str) -> tuple[str | None, ...]:
+    def run_vqa(self, image_path: str, model_name: str, question: str) -> tuple[str | None, ...]:
         out_paths = run_experts(image_path)
-        answer = self.run_vqa_model(model_name, mode, question)
+        answer = self.run_vqa_model(model_name, question)
         label_prettify(image_path, out_paths)
         return answer, *out_paths
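Net effect of the signature changes: callers no longer thread a mode string through, because each public entry point pins its own mode ('caption' or 'vqa'). Hypothetical usage, with the image path and model name invented for illustration:

    model = Model()
    caption, *expert_maps = model.run_caption('prismer/images/example.jpg', 'prismer_base')
    answer, *expert_maps = model.run_vqa('prismer/images/example.jpg', 'prismer_base',
                                         'What is the person holding?')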
 