sneha committed on
Commit c54235c
1 Parent(s): 1df99f6

allow switching between models

Files changed (1)
  1. app.py +44 -20
app.py CHANGED
@@ -20,32 +20,53 @@ if not os.path.isdir(MODEL_DIR):
 
 REPO_ID = "facebook/vc1-base"
 FILENAME = "config.yaml"
-MODEL_TUPLE = None
-
-def get_model():
-    global MODEL_TUPLE
-    download_bin()
-    if MODEL_TUPLE is None:
+BASE_MODEL_TUPLE = None
+LARGE_MODEL_TUPLE = None
+def get_model(model_name):
+    global BASE_MODEL_TUPLE,LARGE_MODEL_TUPLE
+    download_bin(model_name)
+    model = None
+    if BASE_MODEL_TUPLE is None and model_name == 'vc1-base':
+        repo_name = "facebook/" + model_name
+        model_cfg = omegaconf.OmegaConf.load(
+            hf_hub_download(repo_id=repo_name, filename=FILENAME,token=HF_TOKEN)
+        )
+        # model_cfg['model']['checkpoint_path'] = None
+        # model_cfg['model']['checkpoint_path'] = 'model_ckpts/vc1_vitb.pth'
+        BASE_MODEL_TUPLE = utils.instantiate(model_cfg)
+        BASE_MODEL_TUPLE[0].eval()
+        model = BASE_MODEL_TUPLE
+    elif LARGE_MODEL_TUPLE is None and model_name == 'vc1-large':
+        repo_name = "facebook/" + model_name
         model_cfg = omegaconf.OmegaConf.load(
-            hf_hub_download(repo_id=REPO_ID, filename=FILENAME,token=HF_TOKEN)
+            hf_hub_download(repo_id=repo_name, filename=FILENAME,token=HF_TOKEN)
         )
-        model_cfg['model']['checkpoint_path'] = None
-        model_cfg['model']['checkpoint_path'] = 'model_ckpts/vc1_vitb.pth'
-        MODEL_TUPLE = utils.instantiate(model_cfg)
-        MODEL_TUPLE[0].eval()
-    return MODEL_TUPLE#model,embedding_dim,transform,metadata
+        # model_cfg['model']['checkpoint_path'] = None
+        # model_cfg['model']['checkpoint_path'] = 'model_ckpts/vc1_vitb.pth'
+        LARGE_MODEL_TUPLE = utils.instantiate(model_cfg)
+        LARGE_MODEL_TUPLE[0].eval()
+        model = LARGE_MODEL_TUPLE
+    elif model_name == 'vc1-base':
+        model = BASE_MODEL_TUPLE
+    elif model_name == 'vc1-large':
+        model = LARGE_MODEL_TUPLE
+
+    return model #model,embedding_dim,transform,metadata
 
-def download_bin():
-    bin_file = 'vc1_vitb.pth'
+def download_bin(model):
+    if model == "vc1-large":
+        bin_file = 'vc1_vitl.pth'
+    elif model == "vc1-base":
+        bin_file = 'vc1_vitb.pth'
     bin_path = os.path.join(MODEL_DIR,bin_file)
     if not os.path.isfile(bin_path):
         model_bin = hf_hub_download(repo_id=REPO_ID, filename='pytorch_model.bin',local_dir=MODEL_DIR,local_dir_use_symlinks=True,token=HF_TOKEN)
         os.rename(model_bin, bin_path)
 
 
-def run_attn(input_img,fusion="min"):
-    download_bin()
-    model, embedding_dim, transform, metadata = get_model()
+def run_attn(model, input_img,fusion="min"):
+    download_bin(model)
+    model, embedding_dim, transform, metadata = get_model(model)
     if input_img.shape[0] != 3:
         input_img = input_img.transpose(2, 0, 1)
     if(len(input_img.shape)== 3):
@@ -63,11 +84,14 @@ def run_attn(input_img,fusion="min"):
 
     fig = plt.figure()
     ax = fig.subplots()
+    print(y.shape)
    im = ax.matshow(y.detach().numpy().reshape(16,-1))
    plt.colorbar(im)
 
    return attn_img, fig
 
+model_type = gr.Dropdown(
+    ["vc1-base", "vc1-large"], label="Model Size", value="vc1-large")
 input_img = gr.Image(shape=(250,250))
 input_button = gr.Radio(["min", "max", "mean"], value="min",label="Attention Head Fusion", info="How to combine the last layer attention across all 12 heads of the transformer.")
 output_img = gr.Image(shape=(250,250))
@@ -75,8 +99,8 @@ output_plot = gr.Plot()
 
 markdown ="This is a demo for the Visual Cortex (Base) model. When passed an image input, it displays the attention of the last layer of the transformer.\n \
 The user can decide how the attention heads will be combined. \
-Along with the attention heatmap, it also displays the embedding values reshaped to a 16x48 grid."
+Along with the attention heatmap, it also displays the embedding values reshaped to a 16x48 or 16x64 grid."
 demo = gr.Interface(fn=run_attn, title="Visual Cortex Base Model", description=markdown,
-examples=[[os.path.join('./imgs',x),None]for x in os.listdir(os.path.join(os.getcwd(),'imgs')) if 'jpg' in x],
-inputs=[input_img,input_button],outputs=[output_img,output_plot])
+examples=[[None, os.path.join('./imgs',x),None]for x in os.listdir(os.path.join(os.getcwd(),'imgs')) if 'jpg' in x],
+inputs=[model_type,input_img,input_button],outputs=[output_img,output_plot])
  demo.launch()
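
The new get_model() keeps one cached (model, embedding_dim, transform, metadata) tuple per model size in a pair of module-level globals, so each checkpoint is instantiated at most once per process; run_attn() then receives the dropdown value as its first argument, and each example gains a leading None so the dropdown keeps its default. Below is a minimal sketch of the same load-once-and-reuse idea using a dict cache instead of two globals. It is not part of the commit: get_model_cached and load_model are hypothetical names, and load_model stands in for the OmegaConf.load + utils.instantiate sequence that app.py performs.

# Sketch only: dict-based version of the per-size caching pattern in get_model().
_MODEL_CACHE = {}

def get_model_cached(model_name, load_model):
    # Return the (model, embedding_dim, transform, metadata) tuple for model_name,
    # instantiating it on first use and reusing the cached tuple afterwards.
    if model_name not in _MODEL_CACHE:
        model_tuple = load_model(model_name)  # stand-in for utils.instantiate(model_cfg)
        model_tuple[0].eval()                 # switch the ViT to inference mode once, as app.py does
        _MODEL_CACHE[model_name] = model_tuple
    return _MODEL_CACHE[model_name]

With a cache keyed by name, supporting a further size would only require the matching checkpoint filename in download_bin(), rather than another global and elif branch in get_model().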