update model version, add examples
- .DS_Store +0 -0
- app.py +13 -19
- assets/.DS_Store +0 -0
- assets/IMG_4484.jpeg +0 -0
- assets/IMG_4737.jpeg +0 -0
- assets/IMG_4740.jpeg +0 -0
- assets/halloween-gaf8ad7ebc_1920.jpeg +0 -0
- weights/.DS_Store +0 -0
- weights/vit_l16_384/.DS_Store +0 -0
- weights/{vit_l16_384 → vit_l16_512}/model-weights.data-00000-of-00001 +2 -2
- weights/{vit_l16_384 → vit_l16_512}/model-weights.index +2 -2
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
app.py
CHANGED
@@ -9,31 +9,31 @@ from backbone import ClassificationModel
 
 
 
-vit_l16_384 = {
+vit_l16_512 = {
     "backbone_name": "vit-l/16",
     "backbone_params": {
-        "image_size": 384,
+        "image_size": 512,
         "representation_size": 0,
         "attention_dropout_rate": 0.,
         "dropout_rate": 0.,
         "channels": 3
     },
     "dropout_rate": 0.,
-    "pretrained": "./weights/vit_l16_384/model-weights"
+    "pretrained": "./weights/vit_l16_512/model-weights"
 }
 
 # Init backbone
-backbone = create_name_vit(vit_l16_384["backbone_name"], **vit_l16_384["backbone_params"])
+backbone = create_name_vit(vit_l16_512["backbone_name"], **vit_l16_512["backbone_params"])
 
 # Init classification model
 model = ClassificationModel(
     backbone=backbone,
-    dropout_rate=vit_l16_384["dropout_rate"],
+    dropout_rate=vit_l16_512["dropout_rate"],
     num_classes=1000
 )
 
 # Load weights
-model.load_weights(vit_l16_384["pretrained"])
+model.load_weights(vit_l16_512["pretrained"])
 model.trainable = False
 
 # Load ImageNet idx to label mapping
@@ -41,7 +41,7 @@ with open("assets/imagenet_1000_idx2labels.json") as f:
     idx_to_label = json.load(f)
 
 
-def resize_with_normalization(image, size=[384, 384]):
+def resize_with_normalization(image, size=[512, 512]):
     image = tf.cast(image, tf.float32)
     image = tf.image.resize(image, size)
     image -= tf.constant(127.5, shape=(1, 1, 3), dtype=tf.float32)
@@ -63,22 +63,16 @@ def classify_image(img, top_k):
     return {idx_to_label[str(idx)] : round(float(pred_probs[idx]), 4) for idx in top_k_labels}
 
 
-description = """
-Gradio demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/owlvit">ViT released by Kakao Lab</a>,
-introduced in <a href="https://arxiv.org/abs/2205.06230">Simple Open-Vocabulary Object Detection
-with Vision Transformers</a>.
-\n\nYou can use OWL-ViT to query images with text descriptions of any object.
-To use it, simply upload an image and enter comma separated text descriptions of objects you want to query the image for. You
-can also use the score threshold slider to set a threshold to filter out low probability predictions.
-\n\nOWL-ViT is trained on text templates,
-hence you can get better predictions by querying the image with text templates used in training the original model: *"photo of a star-spangled banner"*,
-*"image of a shoe"*. Refer to the <a href="https://arxiv.org/abs/2103.00020">CLIP</a> paper to see the full list of text templates used to augment the training data.
-"""
 demo = gr.Interface(
     classify_image,
     inputs=[gr.Image(), gr.Slider(0, 1000, value=5)],
     outputs=gr.outputs.Label(),
     title="Image Classification with Kakao Brain ViT",
-    description=description,
+    examples=[
+        ["assets/halloween-gaf8ad7ebc_1920.jpeg", 5],
+        ["assets/IMG_4484.jpeg", 5],
+        ["assets/IMG_4737.jpeg", 5],
+        ["assets/IMG_4740.jpeg", 5],
+    ],
 )
 demo.launch()
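For reference, `resize_with_normalization` is what maps uploaded uint8 pixels into the range the ViT backbone expects; the hunk above shows the resize and the mean shift but is cut off before any scaling step. A minimal sketch of the full transform, assuming a standard divide-by-127.5 follows the visible subtraction (that line sits outside the diff context):

import tensorflow as tf

def resize_with_normalization(image, size=[512, 512]):
    # Cast so resize and arithmetic happen in float, not uint8
    image = tf.cast(image, tf.float32)
    # Match the 512x512 input resolution of the new vit_l16_512 config
    image = tf.image.resize(image, size)
    # Shift [0, 255] to [-127.5, 127.5] (this line is in the hunk) ...
    image -= tf.constant(127.5, shape=(1, 1, 3), dtype=tf.float32)
    # ... then scale to roughly [-1, 1] (assumed; not visible in the diff)
    image /= tf.constant(127.5, shape=(1, 1, 3), dtype=tf.float32)
    return image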
assets/.DS_Store
CHANGED
Binary files a/assets/.DS_Store and b/assets/.DS_Store differ
assets/IMG_4484.jpeg
ADDED
assets/IMG_4737.jpeg
ADDED
assets/IMG_4740.jpeg
ADDED
assets/halloween-gaf8ad7ebc_1920.jpeg
ADDED
weights/.DS_Store
CHANGED
Binary files a/weights/.DS_Store and b/weights/.DS_Store differ
weights/vit_l16_384/.DS_Store
DELETED
Binary file (6.15 kB)
weights/{vit_l16_384 → vit_l16_512}/model-weights.data-00000-of-00001
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:164eb26c694a610ea5d55099f2005da682a056d3f54f2895cb75763d2ba01f29
+size 3662286224
weights/{vit_l16_384 → vit_l16_512}/model-weights.index
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:d0102fedd4707740d5703aaf720e9fb9363d37651e2791a9ccb944ef88409fbe
+size 19191
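The checkpoints themselves live in Git LFS, so the repo only versions small pointer files; the rename plus the changed oid/size above is what swapping in the 512-resolution weights looks like at the pointer level. As a hedged sketch (not part of the Space's code), a fetched blob can be checked against a pointer's sha256 with stdlib Python:

import hashlib

def lfs_sha256(path, chunk_size=1 << 20):
    # Stream in 1 MiB chunks so the 3.6 GB data shard never sits fully in memory
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Expected value taken from the model-weights.index pointer above
assert lfs_sha256("weights/vit_l16_512/model-weights.index") == (
    "d0102fedd4707740d5703aaf720e9fb9363d37651e2791a9ccb944ef88409fbe"
)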