adirik committed
Commit ba97523
1 Parent(s): 9808945

add slider

Files changed (1)
  1. app.py +7 -6
app.py CHANGED
@@ -16,7 +16,8 @@ model.eval()
 processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
 
 
-def query_image(img, text_queries):
+def query_image(img, text_queries, score_threshold):
+    text_queries = text_queries
     text_queries = text_queries.split(",")
     inputs = processor(text=text_queries, images=img, return_tensors="pt").to(device)
 
@@ -30,8 +31,6 @@ def query_image(img, text_queries):
     boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
 
     img = cv2.resize(img, (768, 768), interpolation = cv2.INTER_AREA)
-    score_threshold = 0.11
-
     font = cv2.FONT_HERSHEY_SIMPLEX
 
     for box, score, label in zip(boxes, scores, labels):
@@ -55,15 +54,17 @@ Gradio demo for <a href="https://huggingface.co/docs/transformers/main/en/model_
 introduced in <a href="https://arxiv.org/abs/2205.06230">Simple Open-Vocabulary Object Detection
 with Vision Transformers</a>.
 \n\nYou can use OWL-ViT to query images with text descriptions of any object.
-To use it, simply upload an image and enter comma separated text descriptions of objects you want to query the image for.
+To use it, simply upload an image and enter comma separated text descriptions of objects you want to query the image for. You
+can also use the score threshold slider to set a threshold to filter out low probability predictions.
 \n\n<a href="https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb">Colab demo</a>
 """
 demo = gr.Interface(
     query_image,
-    inputs=[gr.Image(shape=(768, 768)), "text"],
+    inputs=[gr.Image(shape=(768, 768)), "text", gr.Slider(0, 1, value=0.1)],
     outputs="image",
     title="Zero-Shot Object Detection with OWL-ViT",
     description=description,
-    examples=[["assets/astronaut.png", "human face, rocket, flag, nasa badge"], ["assets/coffee.png", "coffee mug, spoon, plate"]]
+    examples=[["assets/astronaut.png", "human face, rocket, flag, nasa badge"], ["assets/coffee.png", "coffee mug, spoon, plate"]],
+    live=True
 )
 demo.launch(debug=True)
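For context, here is a minimal sketch of how the new score_threshold argument presumably flows from the gr.Slider into the drawing loop. The diff only shows the changed lines; the model forward pass, the post-processing call, and the body of the for box, score, label loop are reconstructed assumptions around those lines, not the Space's exact app.py.

# Minimal sketch, assuming the unshown parts of app.py: the forward pass,
# post-processing, and drawing-loop body below are reconstructions, not the
# committed code.
import cv2
import torch
import gradio as gr
from transformers import OwlViTProcessor, OwlViTForObjectDetection

device = "cuda" if torch.cuda.is_available() else "cpu"
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to(device)
model.eval()
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")


def query_image(img, text_queries, score_threshold):
    text_queries = text_queries.split(",")
    inputs = processor(text=text_queries, images=img, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Assumed post-processing: move predictions to CPU and rescale boxes to the
    # input image size. Newer transformers releases expose
    # post_process_object_detection for the same purpose.
    outputs.logits = outputs.logits.cpu()
    outputs.pred_boxes = outputs.pred_boxes.cpu()
    target_sizes = torch.Tensor([img.shape[:2]])
    results = processor.post_process(outputs=outputs, target_sizes=target_sizes)
    boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]

    img = cv2.resize(img, (768, 768), interpolation=cv2.INTER_AREA)
    font = cv2.FONT_HERSHEY_SIMPLEX

    for box, score, label in zip(boxes, scores, labels):
        # The slider value replaces the old hard-coded score_threshold = 0.11.
        if score < score_threshold:
            continue
        x1, y1, x2, y2 = [int(v) for v in box.tolist()]
        img = cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2)
        img = cv2.putText(img, text_queries[int(label)].strip(), (x1, y1 - 10),
                          font, 0.6, (255, 0, 0), 2)
    return img


demo = gr.Interface(
    query_image,
    inputs=[gr.Image(shape=(768, 768)), "text", gr.Slider(0, 1, value=0.1)],
    outputs="image",
    live=True,
)
demo.launch()

With live=True the interface re-runs query_image whenever an input changes, so dragging the threshold slider redraws the boxes immediately instead of waiting for a Submit click.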