shikunl committed
Commit 806eb00
1 parent: 5a56ebb

Final test

Files changed (3)
  1. app_caption.py +2 -9
  2. app_vqa.py +8 -16
  3. prismer/model/modules/vit.py +11 -42
app_caption.py CHANGED
@@ -31,20 +31,13 @@ def create_demo():
         inputs = [image, model_name]
         outputs = [caption, depth, edge, normals, segmentation, object_detection, ocr]
 
-        # paths = sorted(pathlib.Path('prismer/images').glob('*'))
-        # examples = [[path.as_posix(), 'prismer_base'] for path in paths]
-        # gr.Examples(examples=examples,
-        #             inputs=inputs,
-        #             outputs=outputs,
-        #             fn=model.run_caption,
-        #             cache_examples=os.getenv('SYSTEM') == 'spaces')
-
         paths = sorted(pathlib.Path('prismer/images').glob('*'))
         examples = [[path.as_posix(), 'Prismer-Base'] for path in paths]
         gr.Examples(examples=examples,
                     inputs=inputs,
                     outputs=outputs,
-                    fn=model.run_caption)
+                    fn=model.run_caption,
+                    cache_examples=os.getenv('SYSTEM') == 'spaces')
 
         run_button.click(fn=model.run_caption, inputs=inputs, outputs=outputs)
 
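Note on the change: cache_examples is only truthy when the SYSTEM environment variable equals 'spaces', so example outputs are pre-computed on Hugging Face Spaces and skipped when the demo runs locally. A minimal, self-contained sketch of the same pattern, using a placeholder predict function and example image instead of the repo's model.run_caption and prismer/images:

import os
import gradio as gr

def predict(image_path: str, model_name: str) -> str:
    # Stand-in for model.run_caption: return a caption string for the image.
    return f'{model_name} caption for {image_path}'

with gr.Blocks() as demo:
    image = gr.Image(label='Input', type='filepath')
    model_name = gr.Dropdown(label='Model', choices=['Prismer-Base'], value='Prismer-Base')
    run_button = gr.Button('Run')
    caption = gr.Text(label='Caption')

    # With cache_examples=True, Gradio runs fn on each example row at startup and
    # serves the cached outputs when an example is clicked, instead of re-running the model.
    gr.Examples(examples=[['example.jpg', 'Prismer-Base']],
                inputs=[image, model_name],
                outputs=[caption],
                fn=predict,
                cache_examples=os.getenv('SYSTEM') == 'spaces')

    run_button.click(fn=predict, inputs=[image, model_name], outputs=[caption])

demo.launch()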
app_vqa.py CHANGED
@@ -31,26 +31,18 @@ def create_demo():
         inputs = [image, model_name, question]
         outputs = [answer, depth, edge, normals, segmentation, object_detection, ocr]
 
-        # paths = sorted(pathlib.Path('prismer/images').glob('*'))
-        # ex_questions = ['What is the man on the right doing?',
-        #                 'What is this person playing?',
-        #                 'How many cows in this image?',
-        #                 'What is the type of animal in this image?',
-        #                 'What toy is it?']
-        #
-        # examples = [[path.as_posix(), 'Prismer-Base', ex_questions[i]] for i, path in enumerate(paths)]
-        # gr.Examples(examples=examples,
-        #             inputs=inputs,
-        #             outputs=outputs,
-        #             fn=model.run_vqa,
-        #             cache_examples=os.getenv('SYSTEM') == 'spaces')
-
         paths = sorted(pathlib.Path('prismer/images').glob('*'))
-        examples = [[path.as_posix(), 'Prismer-Base'] for path in paths]
+        ex_questions = ['What is the man on the left doing?',
+                        'What is this person doing?',
+                        'How many cows in this image?',
+                        'What is the type of animal in this image?',
+                        'What toy is it?']
+        examples = [[path.as_posix(), 'Prismer-Base', ex_questions[i]] for i, path in enumerate(paths)]
         gr.Examples(examples=examples,
                     inputs=inputs,
                     outputs=outputs,
-                    fn=model.run_vqa)
+                    fn=model.run_vqa,
+                    cache_examples=os.getenv('SYSTEM') == 'spaces')
 
         run_button.click(fn=model.run_vqa, inputs=inputs, outputs=outputs)
 
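Note on the change: each VQA example now carries a question; ex_questions is indexed in step with the sorted image paths, so every row is [image, model, question] and matches the three declared inputs, whereas the previous live examples list had only two fields per row. A short sketch of that pairing with placeholder paths and questions (not the repo's actual files); the zip(..., strict=True) variant is an optional defensive alternative that fails loudly if the two lists drift out of sync:

import pathlib

# Hypothetical example images and matching questions (placeholders, not the repo's files).
paths = [pathlib.Path('images/man.jpg'), pathlib.Path('images/cows.jpg')]
ex_questions = ['What is the man on the left doing?',
                'How many cows in this image?']

# One [image, model, question] row per example; the i-th sorted path must match the i-th question.
examples = [[path.as_posix(), 'Prismer-Base', ex_questions[i]] for i, path in enumerate(paths)]

# Equivalent pairing that raises immediately on a length mismatch (Python 3.10+).
examples_checked = [[path.as_posix(), 'Prismer-Base', q] for path, q in zip(paths, ex_questions, strict=True)]

assert examples == examples_checked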
prismer/model/modules/vit.py CHANGED
@@ -173,45 +173,17 @@ class VisionTransformer(nn.Module):
 
 
 def load_encoder(name: str, experts: dict, image_resolution: int):
-    # load pre-trained model file
-    if name in _MODELS:
-        if name != 'ViT-H/14':
-            model_path = _download(_MODELS[name], os.path.expanduser("cache/clip"))
-            model = torch.jit.load(model_path, map_location="cpu")
-            state_dict = model.state_dict()
-        else:
-            model_path = hf_hub_download(_MODELS[name], 'open_clip_pytorch_model.bin', revision=None, cache_dir="cache/clip")
-            state_dict = torch.load(model_path, map_location="cpu")
-    else:
-        raise RuntimeError(f"Model {name} not found")
-
-    # modify keys (we only need Vision Transformer)
-    for key in list(state_dict.keys()):
-        if not key.startswith('visual'):
-            del state_dict[key]
-
-    for key in list(state_dict.keys()):
-        new_key = key.replace('visual.', '')
-        if 'proj' in new_key and 'transformer' not in new_key:
-            del state_dict[key]
-        elif 'conv1' in new_key:
-            new_key_ = new_key.replace('conv1', 'conv1.rgb')
-            state_dict[new_key_] = state_dict.pop(key)
-        elif 'positional_embedding' in new_key:
-            state_dict[new_key] = state_dict.pop(key)[1:]
-        elif 'transformer.resblocks' in new_key:
-            new_key_ = re.sub(".mlp", ".0.mlp", new_key)
-            new_key_ = re.sub(".attn", ".0.attn", new_key_)
-            new_key_ = re.sub(".ln", ".0.ln", new_key_)
-            state_dict[new_key_] = state_dict.pop(key)
-        else:
-            state_dict[new_key] = state_dict.pop(key)
-
-    # load pre-trained weights
-    vision_width = state_dict["conv1.rgb.weight"].shape[0]
-    vision_patch_size = state_dict["conv1.rgb.weight"].shape[-1]
-    vision_layers = len([k for k in state_dict.keys() if k.endswith(".attn.in_proj_weight")])
-    vision_heads = vision_width // 64
+    if name == 'ViT-B/16':
+        vision_width = 768
+        vision_patch_size = 16
+        vision_layers = 12
+        vision_heads = 12
+
+    elif name == 'ViT-L/14' or name == 'ViT-L/14@336px':
+        vision_width = 1024
+        vision_patch_size = 14
+        vision_layers = 24
+        vision_heads = 16
 
     ViT = VisionTransformer(input_resolution=image_resolution,
                             patch_size=vision_patch_size,
@@ -219,9 +191,6 @@ def load_encoder(name: str, experts: dict, image_resolution: int):
                             layers=vision_layers,
                             heads=vision_heads,
                             experts=experts)
-
-    state_dict['positional_embedding'] = interpolate_pos_embed(state_dict['positional_embedding'], len(ViT.positional_embedding))
-    ViT.load_state_dict(state_dict, strict=False)
     return ViT
 
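Note on the change: load_encoder no longer downloads a CLIP checkpoint, remaps its state_dict keys, or calls load_state_dict; the ViT hyperparameters are hard-coded per backbone name and the encoder is built from the architecture definition alone. The hard-coded values agree with the rule the deleted code derived from the checkpoint (heads = width // 64): 768 // 64 = 12 for ViT-B/16 and 1024 // 64 = 16 for ViT-L/14. A table-driven sketch of the same selection, offered as an illustration rather than the repo's code (the KeyError handling is an assumption; the new function body above simply has no else branch):

# Per-backbone ViT hyperparameters: (width, patch_size, layers, heads).
_VIT_CONFIGS = {
    'ViT-B/16':       (768, 16, 12, 12),
    'ViT-L/14':       (1024, 14, 24, 16),
    'ViT-L/14@336px': (1024, 14, 24, 16),
}

def vit_config(name: str) -> tuple:
    try:
        width, patch_size, layers, heads = _VIT_CONFIGS[name]
    except KeyError:
        raise RuntimeError(f'Model {name} not found')  # mirrors the error message the removed code raised
    assert heads == width // 64  # same ratio the removed state_dict-based code computed
    return width, patch_size, layers, heads

print(vit_config('ViT-B/16'))  # (768, 16, 12, 12)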