Spaces:

osbm
/

token_merger_demo

Running

App Files Files Community

osbm committed on Apr 3, 2023

Commit

3a50a96

•

1 Parent(s): dce5d16

initial commit

Browse files

Files changed (8) hide show

.gitattributes +1 -0
.gitignore +2 -0
README.md +14 -1
app.py +75 -0
images/concept_figure.png +3 -0
images/husky.png +3 -0
images/image_vis.png +3 -0
requirements.txt +7 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ .venv/
2	+ *.pyc

README.md CHANGED Viewed

@@ -9,4 +9,17 @@ app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 pinned: false
 ---
+# Token Merging: Your ViT but Faster
+github: https://github.com/facebookresearch/tome
+paper: https://arxiv.org/abs/2210.09461
+# Citation
+```bibtex
+@inproceedings{bolya2022tome,
+  title={Token Merging: Your {ViT} but Faster},
+  author={Bolya, Daniel and Fu, Cheng-Yang and Dai, Xiaoliang and Zhang, Peizhao and Feichtenhofer, Christoph and Hoffman, Judy},
+  booktitle={International Conference on Learning Representations},
+  year={2023}
+}
+```

app.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import tome
+import timm
+import gradio as gr
+from PIL import Image
+from torchvision import transforms
+from torchvision.transforms.functional import InterpolationMode
+# Name of the timm checkpoint this demo loads and visualizes.
+model_name = "vit_large_patch16_384"
+print("Started Downloading:", model_name)
+model = timm.create_model(model_name, pretrained=True)
+print("Finished Downloading:", model_name)
+# The app only ever runs inference. Freshly constructed torch modules start in
+# training mode, so switch to eval mode explicitly.
+model.eval()
+# Apply the ToMe patch in place. NOTE(review): trace_source=True is presumably
+# what populates model._tome_info["source"], which process_image reads - confirm.
+tome.patch.timm(model, trace_source=True)
+# Side length used for the center crop below.
+# NOTE(review): assumes default_cfg["input_size"] is (channels, height, width) - confirm.
+input_size = model.default_cfg["input_size"][1]
+# Make sure the transform is correct for your model!
+transform_list = [
+    transforms.Resize(int((256 / 224) * input_size), interpolation=InterpolationMode.BICUBIC),
+    transforms.CenterCrop(input_size)
+]
+# The visualization and model need different transforms:
+# the visualization keeps the resized/cropped PIL image, while the model
+# additionally gets tensor conversion and mean/std normalization.
+transform_vis  = transforms.Compose(transform_list)
+transform_norm = transforms.Compose(transform_list + [
+    transforms.ToTensor(),
+    transforms.Normalize(model.default_cfg["mean"], model.default_cfg["std"]),
+])
+def process_image(img, r=25, layers=1):
+    """Run the ToMe-patched model on one image and visualize the merged tokens.
+
+    Args:
+        img: RGB image as an array; it is cast to uint8 and wrapped in a PIL image.
+        r: Reduction amount assigned to ``model.r`` (see the notes below).
+        layers: When 1, ``r`` is assigned unchanged. Otherwise ``r`` is repeated
+            ``layers`` times and assigned as a list.
+
+    Returns:
+        Whatever ``tome.make_visualization`` produces for the resized and
+        center-cropped image and the model's recorded token sources.
+    """
+    # Function-scope import: the module's import block does not import torch.
+    import torch
+
+    # UI sliders may hand over floats; the list repetition below needs real ints.
+    # Only floats are coerced so tuple/list forms of r keep working.
+    if isinstance(r, float):
+        r = int(r)
+    layers = int(layers)
+
+    img = Image.fromarray(img.astype('uint8'), 'RGB')
+    img_vis = transform_vis(img)
+    img_norm = transform_norm(img)
+    # from the paper:
+    # r can take the following forms:
+    #  - int: A constant number of tokens per layer.
+    #  - Tuple[int, float]: A pair of r, inflection.
+    #    Inflection describes whether the reduction / layer should trend
+    #    upward (+1), downward (-1), or stay constant (0). A value of (r, 0)
+    #    is the same as providing a constant r. (r, -1) is what we describe in
+    #    the paper as "decreasing schedule". Any value between -1 and +1 is accepted.
+    #  - List[int]: A specific number of tokens per layer. For extreme granularity.
+    if layers != 1:
+        r = [r] * layers
+    print(r)
+    model.r = r
+    # Inference only: skip autograd bookkeeping. The output itself is not
+    # needed; the forward pass is run for the source info read on the next line.
+    with torch.no_grad():
+        model(img_norm[None, ...])  # [None, ...] adds the leading batch dimension
+    source = model._tome_info["source"]
+    return tome.make_visualization(img_vis, source, patch_size=16, class_token=True)
+# Build the demo UI: one image input plus two sliders. Gradio passes the three
+# input values positionally to process_image as (img, r, layers).
+iface = gr.Interface(
+    fn=process_image,
+    inputs=[
+        "image",
+        # Use the top-level gr.Slider component: the legacy gr.inputs namespace
+        # is deprecated in Gradio 3 and missing from later releases, and
+        # requirements.txt does not pin a Gradio version.
+        gr.Slider(minimum=0, maximum=50, step=1, label="r value (the amount of reduction. See paper for details.)"),
+        gr.Slider(minimum=1, maximum=50, step=1, label="layers (1 means r is applied to all layers)"),
+    ],
+    outputs="image",
+    # Each example row is [image path, r, layers].
+    examples=[
+        ["images/husky.png", 25, 1],
+        ["images/husky.png", 25, 8],
+        ["images/husky.png", 25, 16],
+        ["images/husky.png", 25, 22],
+    ]
+)
+iface.launch()

images/concept_figure.png ADDED Viewed

Git LFS Details

SHA256: 535d645011be6021705eba4b8a2b48a43a1c5fad0afddb2ea76bafd31cbcd2b6
Pointer size: 131 Bytes
Size of remote file: 339 kB

images/husky.png ADDED Viewed

Git LFS Details

SHA256: 79699ff62ea6595a273ea54fb136a4a68edd580fad1a5225d54e15e67b613f4c
Pointer size: 131 Bytes
Size of remote file: 455 kB

images/image_vis.png ADDED Viewed

Git LFS Details

SHA256: 0bd8d344b2fac00867ff9e14cef362477cfa8f64aa8c4fc7354313d40dc69b6c
Pointer size: 132 Bytes
Size of remote file: 1.45 MB

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+gradio
+timm==0.4.12
+torchvision
+torch
+pillow
+tqdm
+git+https://github.com/facebookresearch/tome