adirik committed
Commit 5b7f9a4 • Parent: 2cb3c00

update model version, add examples

.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -9,31 +9,31 @@ from backbone import ClassificationModel
 
 
 
-vit_l16_384 = {
+vit_l16_512 = {
     "backbone_name": "vit-l/16",
     "backbone_params": {
-        "image_size": 384,
+        "image_size": 512,
         "representation_size": 0,
         "attention_dropout_rate": 0.,
         "dropout_rate": 0.,
         "channels": 3
     },
     "dropout_rate": 0.,
-    "pretrained": "./weights/vit_l16_384/model-weights"
+    "pretrained": "./weights/vit_l16_512/model-weights"
 }
 
 # Init backbone
-backbone = create_name_vit(vit_l16_384["backbone_name"], **vit_l16_384["backbone_params"])
+backbone = create_name_vit(vit_l16_512["backbone_name"], **vit_l16_512["backbone_params"])
 
 # Init classification model
 model = ClassificationModel(
     backbone=backbone,
-    dropout_rate=vit_l16_384["dropout_rate"],
+    dropout_rate=vit_l16_512["dropout_rate"],
     num_classes=1000
 )
 
 # Load weights
-model.load_weights(vit_l16_384["pretrained"])
+model.load_weights(vit_l16_512["pretrained"])
 model.trainable = False
 
 # Load ImageNet idx to label mapping
@@ -41,7 +41,7 @@ with open("assets/imagenet_1000_idx2labels.json") as f:
     idx_to_label = json.load(f)
 
 
-def resize_with_normalization(image, size=[384, 384]):
+def resize_with_normalization(image, size=[512, 512]):
     image = tf.cast(image, tf.float32)
     image = tf.image.resize(image, size)
     image -= tf.constant(127.5, shape=(1, 1, 3), dtype=tf.float32)
@@ -63,22 +63,16 @@ def classify_image(img, top_k):
     return {idx_to_label[str(idx)] : round(float(pred_probs[idx]), 4) for idx in top_k_labels}
 
 
-description = """
-Gradio demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/owlvit">ViT released by Kakao Lab</a>,
-introduced in <a href="https://arxiv.org/abs/2205.06230">Simple Open-Vocabulary Object Detection
-with Vision Transformers</a>.
-\n\nYou can use OWL-ViT to query images with text descriptions of any object.
-To use it, simply upload an image and enter comma separated text descriptions of objects you want to query the image for. You
-can also use the score threshold slider to set a threshold to filter out low probability predictions.
-\n\nOWL-ViT is trained on text templates,
-hence you can get better predictions by querying the image with text templates used in training the original model: *"photo of a star-spangled banner"*,
-*"image of a shoe"*. Refer to the <a href="https://arxiv.org/abs/2103.00020">CLIP</a> paper to see the full list of text templates used to augment the training data.
-"""
 demo = gr.Interface(
     classify_image,
     inputs=[gr.Image(), gr.Slider(0, 1000, value=5)],
     outputs=gr.outputs.Label(),
     title="Image Classification with Kakao Brain ViT",
-    #description=description,
+    examples=[
+        ["assets/halloween-gaf8ad7ebc_1920.jpeg", 5],
+        ["assets/IMG_4484.jpeg", 5],
+        ["assets/IMG_4737.jpeg", 5],
+        ["assets/IMG_4740.jpeg", 5],
+    ],
 )
 demo.launch()
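
The change above pins the demo to the 512 x 512 variant of the checkpoint. Below is a minimal sketch of how the updated preprocessing and top-k lookup could be exercised outside Gradio; only the resize and the 127.5 subtraction are visible in the hunks, so the final scaling to [-1, 1], the added batch dimension, and the softmax are assumptions rather than a copy of classify_image.

import numpy as np
import tensorflow as tf

def preprocess(image, size=(512, 512)):
    # Mirrors the visible part of resize_with_normalization: cast, resize, shift by 127.5.
    image = tf.cast(image, tf.float32)
    image = tf.image.resize(image, size)
    image = (image - 127.5) / 127.5   # scaling to [-1, 1] is an assumption, not shown in the diff
    return tf.expand_dims(image, 0)   # batch dimension for the forward pass (assumption)

# `model` and `idx_to_label` are the objects built earlier in app.py
img = np.zeros((600, 800, 3), dtype=np.uint8)  # stand-in for an uploaded image
pred_probs = tf.nn.softmax(model(preprocess(img)), axis=-1)[0].numpy()
top_k = np.argsort(pred_probs)[::-1][:5]
print({idx_to_label[str(i)]: round(float(pred_probs[i]), 4) for i in top_k})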
assets/.DS_Store CHANGED
Binary files a/assets/.DS_Store and b/assets/.DS_Store differ
 
assets/IMG_4484.jpeg ADDED
assets/IMG_4737.jpeg ADDED
assets/IMG_4740.jpeg ADDED
assets/halloween-gaf8ad7ebc_1920.jpeg ADDED
weights/.DS_Store CHANGED
Binary files a/weights/.DS_Store and b/weights/.DS_Store differ
 
weights/vit_l16_384/.DS_Store DELETED
Binary file (6.15 kB)
 
weights/{vit_l16_384 → vit_l16_512}/model-weights.data-00000-of-00001 RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3be15c56904f8b443bbd8eb07e0a4723227b88fde194b6a4ec0831506046c145
-size 2437847991
+oid sha256:164eb26c694a610ea5d55099f2005da682a056d3f54f2895cb75763d2ba01f29
+size 3662286224
weights/{vit_l16_384 → vit_l16_512}/model-weights.index RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4948ae4dc542254d66fe6d6e71bbb2b504dc253057cb92f9f097a8f78b3013db
-size 13703
+oid sha256:d0102fedd4707740d5703aaf720e9fb9363d37651e2791a9ccb944ef88409fbe
+size 19191
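
The LFS pointers above only record the new object hashes and sizes. A hypothetical local check (not part of the commit) for confirming that the renamed TensorFlow checkpoint resolves at its new path could look like this:

import tensorflow as tf

# Path matches the new "pretrained" entry in app.py after the rename.
ckpt_path = "./weights/vit_l16_512/model-weights"
# List the first few variables stored in the checkpoint index as a sanity check.
for name, shape in tf.train.list_variables(ckpt_path)[:10]:
    print(name, shape)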