update model version, add examples
- .DS_Store +0 -0
- app.py +13 -19
- assets/.DS_Store +0 -0
- assets/IMG_4484.jpeg +0 -0
- assets/IMG_4737.jpeg +0 -0
- assets/IMG_4740.jpeg +0 -0
- assets/halloween-gaf8ad7ebc_1920.jpeg +0 -0
- weights/.DS_Store +0 -0
- weights/vit_l16_384/.DS_Store +0 -0
- weights/{vit_l16_384 → vit_l16_512}/model-weights.data-00000-of-00001 +2 -2
- weights/{vit_l16_384 → vit_l16_512}/model-weights.index +2 -2
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
app.py
CHANGED
@@ -9,31 +9,31 @@ from backbone import ClassificationModel
 
 
 
-vit_l16_384 = {
+vit_l16_512 = {
     "backbone_name": "vit-l/16",
     "backbone_params": {
-        "image_size": 384,
+        "image_size": 512,
         "representation_size": 0,
         "attention_dropout_rate": 0.,
         "dropout_rate": 0.,
         "channels": 3
     },
     "dropout_rate": 0.,
-    "pretrained": "./weights/vit_l16_384/model-weights"
+    "pretrained": "./weights/vit_l16_512/model-weights"
 }
 
 # Init backbone
-backbone = create_name_vit(vit_l16_384["backbone_name"], **vit_l16_384["backbone_params"])
+backbone = create_name_vit(vit_l16_512["backbone_name"], **vit_l16_512["backbone_params"])
 
 # Init classification model
 model = ClassificationModel(
     backbone=backbone,
-    dropout_rate=vit_l16_384["dropout_rate"],
+    dropout_rate=vit_l16_512["dropout_rate"],
     num_classes=1000
 )
 
 # Load weights
-model.load_weights(vit_l16_384["pretrained"])
+model.load_weights(vit_l16_512["pretrained"])
 model.trainable = False
 
 # Load ImageNet idx to label mapping
@@ -41,7 +41,7 @@ with open("assets/imagenet_1000_idx2labels.json") as f:
     idx_to_label = json.load(f)
 
 
-def resize_with_normalization(image, size=[384, 384]):
+def resize_with_normalization(image, size=[512, 512]):
     image = tf.cast(image, tf.float32)
     image = tf.image.resize(image, size)
     image -= tf.constant(127.5, shape=(1, 1, 3), dtype=tf.float32)
@@ -63,22 +63,16 @@ def classify_image(img, top_k):
     return {idx_to_label[str(idx)] : round(float(pred_probs[idx]), 4) for idx in top_k_labels}
 
 
-description = """
-Gradio demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/owlvit">ViT released by Kakao Lab</a>,
-introduced in <a href="https://arxiv.org/abs/2205.06230">Simple Open-Vocabulary Object Detection
-with Vision Transformers</a>.
-\n\nYou can use OWL-ViT to query images with text descriptions of any object.
-To use it, simply upload an image and enter comma separated text descriptions of objects you want to query the image for. You
-can also use the score threshold slider to set a threshold to filter out low probability predictions.
-\n\nOWL-ViT is trained on text templates,
-hence you can get better predictions by querying the image with text templates used in training the original model: *"photo of a star-spangled banner"*,
-*"image of a shoe"*. Refer to the <a href="https://arxiv.org/abs/2103.00020">CLIP</a> paper to see the full list of text templates used to augment the training data.
-"""
 demo = gr.Interface(
     classify_image,
     inputs=[gr.Image(), gr.Slider(0, 1000, value=5)],
     outputs=gr.outputs.Label(),
     title="Image Classification with Kakao Brain ViT",
-    description=description,
+    examples=[
+        ["assets/halloween-gaf8ad7ebc_1920.jpeg", 5],
+        ["assets/IMG_4484.jpeg", 5],
+        ["assets/IMG_4737.jpeg", 5],
+        ["assets/IMG_4740.jpeg", 5],
+    ],
 )
 demo.launch()
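For reference, `resize_with_normalization` is what maps uploaded uint8 pixels into the range the ViT backbone expects; the hunk above shows the resize and the mean shift but is cut off before any scaling step. A minimal sketch of the full transform, assuming a standard divide-by-127.5 follows the visible subtraction (that line sits outside the diff context):

import tensorflow as tf

def resize_with_normalization(image, size=[512, 512]):
    # Cast so resize and arithmetic happen in float, not uint8
    image = tf.cast(image, tf.float32)
    # Match the 512x512 input resolution of the new vit_l16_512 config
    image = tf.image.resize(image, size)
    # Shift [0, 255] to [-127.5, 127.5] (this line is in the hunk) ...
    image -= tf.constant(127.5, shape=(1, 1, 3), dtype=tf.float32)
    # ... then scale to roughly [-1, 1] (assumed; not visible in the diff)
    image /= tf.constant(127.5, shape=(1, 1, 3), dtype=tf.float32)
    return image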
assets/.DS_Store
CHANGED
Binary files a/assets/.DS_Store and b/assets/.DS_Store differ
assets/IMG_4484.jpeg
ADDED
assets/IMG_4737.jpeg
ADDED
assets/IMG_4740.jpeg
ADDED
assets/halloween-gaf8ad7ebc_1920.jpeg
ADDED
weights/.DS_Store
CHANGED
Binary files a/weights/.DS_Store and b/weights/.DS_Store differ
weights/vit_l16_384/.DS_Store
DELETED
Binary file (6.15 kB)
weights/{vit_l16_384 → vit_l16_512}/model-weights.data-00000-of-00001
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:164eb26c694a610ea5d55099f2005da682a056d3f54f2895cb75763d2ba01f29
+size 3662286224
weights/{vit_l16_384 → vit_l16_512}/model-weights.index
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:d0102fedd4707740d5703aaf720e9fb9363d37651e2791a9ccb944ef88409fbe
+size 19191
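The checkpoints themselves live in Git LFS, so the repo only versions small pointer files; the rename plus the changed oid/size above is what swapping in the 512-resolution weights looks like at the pointer level. As a hedged sketch (not part of the Space's code), a fetched blob can be checked against a pointer's sha256 with stdlib Python:

import hashlib

def lfs_sha256(path, chunk_size=1 << 20):
    # Stream in 1 MiB chunks so the 3.6 GB data shard never sits fully in memory
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Expected value taken from the model-weights.index pointer above
assert lfs_sha256("weights/vit_l16_512/model-weights.index") == (
    "d0102fedd4707740d5703aaf720e9fb9363d37651e2791a9ccb944ef88409fbe"
)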