shengqiangShi committed on
Commit e4dee6a · 1 Parent(s): 15450dc

Application file

app.py CHANGED
@@ -2,6 +2,29 @@ import torch
 import gradio as gr
 from transformers import Owlv2Processor, Owlv2ForObjectDetection
 import spaces
+import numpy as np
+from PIL import Image
+import io
+import random
+from transformers import SamModel, SamProcessor
+
+
+def apply_colored_masks_on_image(image, masks):
+    if not isinstance(image, Image.Image):
+        image = Image.fromarray(image.astype('uint8'), 'RGB')
+
+    image_rgba = image.convert("RGBA")
+
+    for i in range(masks.shape[0]):
+        mask = masks[i].squeeze().cpu().numpy()
+        mask_image = Image.fromarray((mask * 255).astype(np.uint8), 'L')
+        color = tuple([random.randint(0, 255) for _ in range(3)] + [128])
+        colored_mask = Image.new("RGBA", image.size, color)
+        colored_mask.putalpha(mask_image)
+        image_rgba = Image.alpha_composite(image_rgba, colored_mask)
+
+    return image_rgba
+
+

 # Use GPU if available
 if torch.cuda.is_available():
@@ -11,55 +34,82 @@ else:

 model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble").to(device)
 processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
+model_sam = SamModel.from_pretrained("facebook/sam-vit-huge").to(device)
+processor_sam = SamProcessor.from_pretrained("facebook/sam-vit-huge")
+

 @spaces.GPU
-def query_image(img, text_queries, score_threshold):
-    text_queries = text_queries
+def query_image(img, text_queries, score_threshold=0.5):
     text_queries = text_queries.split(",")
-
     size = max(img.shape[:2])
     target_sizes = torch.Tensor([[size, size]])
     inputs = processor(text=text_queries, images=img, return_tensors="pt").to(device)

     with torch.no_grad():
-        outputs = model(**inputs)
-
-    outputs.logits = outputs.logits.cpu()
-    outputs.pred_boxes = outputs.pred_boxes.cpu()
-    results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes)
+        model_outputs = model(**inputs)
+    model_outputs.logits = model_outputs.logits.cpu()
+    model_outputs.pred_boxes = model_outputs.pred_boxes.cpu()
+    results = processor.post_process_object_detection(outputs=model_outputs, target_sizes=target_sizes)
+
     boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]

+
+    img_pil = Image.fromarray(img.astype('uint8'), 'RGB')
+
     result_labels = []
+    result_boxes = []
     for box, score, label in zip(boxes, scores, labels):
-        box = [int(i) for i in box.tolist()]
-        if score < score_threshold:
-            continue
-        result_labels.append((box, text_queries[label.item()]))
-    return img, result_labels
+        if score >= score_threshold:
+            box = [int(i) for i in box.tolist()]
+            label_text = text_queries[label.item()]
+            result_labels.append((box, label_text))
+            result_boxes.append(box)
+
+    input_boxes_for_sam = [result_boxes]
+    sam_image = generate_image_with_sam(np.array(img_pil), input_boxes_for_sam)
+
+    return sam_image, result_labels
+
+
+def generate_image_with_sam(img, boxes):
+    img_pil = Image.fromarray(img.astype('uint8'), 'RGB')
+    inputs = processor_sam(img_pil, return_tensors="pt").to(device)
+
+    image_embeddings = model_sam.get_image_embeddings(inputs["pixel_values"])
+
+    inputs = processor_sam(img_pil, input_boxes=boxes, return_tensors="pt").to(device)
+    inputs.pop("pixel_values", None)
+    inputs.update({"image_embeddings": image_embeddings})
+
+    with torch.no_grad():
+        outputs = model_sam(**inputs, multimask_output=False)
+
+    masks = processor_sam.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu())
+    scores = outputs.iou_scores
+    print(type(scores))
+    print(scores.shape if hasattr(scores, 'shape') else scores)
+
+
+    SAM_image = apply_colored_masks_on_image(img_pil, masks[0])
+    return SAM_image


 description = """
-Try this demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/owlv2">OWLv2</a>,
-introduced in <a href="https://arxiv.org/abs/2306.09683">Scaling Open-Vocabulary Object Detection</a>.
-\n\n Compared to OWLVIT, OWLv2 performs better both in yield and performance (average precision).
-You can use OWLv2 to query images with text descriptions of any object.
-To use it, simply upload an image and enter comma separated text descriptions of objects you want to query the image for. You
-can also use the score threshold slider to set a threshold to filter out low probability predictions.
-\n\nOWL-ViT is trained on text templates,
-hence you can get better predictions by querying the image with text templates used in training the original model: e.g. *"photo of a star-spangled banner"*,
-*"image of a shoe"*. Refer to the <a href="https://arxiv.org/abs/2103.00020">CLIP</a> paper to see the full list of text templates used to augment the training data.
-\n\n<a href="https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb">Colab demo</a>
+Split anything
 """
 demo = gr.Interface(
-    query_image,
-    inputs=[gr.Image(), "text", gr.Slider(0, 1, value=0.1)],
-    outputs="annotatedimage",
-    title="Zero-Shot Object Detection with OWLv2",
-    description=description,
+    fn=query_image,
+    inputs=[gr.Image(), gr.Textbox(label="Query Text"), gr.Slider(0, 1, value=0.5, label="Score Threshold")],
+    outputs=gr.AnnotatedImage(),
+    title="Zero-Shot Object Detection SV3",
+    description="This interface demonstrates object detection using zero-shot object detection and SAM for image segmentation.",
     examples=[
-        ["assets/astronaut.png", "human face, rocket, star-spangled banner, nasa badge", 0.11],
-        ["assets/coffee.png", "coffee mug, spoon, plate", 0.1],
-        ["assets/butterflies.jpeg", "orange butterfly", 0.3],
+        ["images/purple cell.png", "purple cells", 0.11],
+        ["images/dark_cell.png", "gray cells", 0.1],
+        ["images/animals.png", "Rabbit,Squirrel,Parrot,Hedgehog,Turtle,Ladybug,Chick,Frog,Butterfly,Snail,Mouse", 0.1],
+
     ],
 )
-demo.launch()
+
+demo.launch()
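The new code chains OWLv2 zero-shot detection into SAM: boxes that clear the score threshold become SAM box prompts (note the extra batch nesting in input_boxes_for_sam = [result_boxes]), and the returned masks are composited onto the image by apply_colored_masks_on_image. Below is a minimal local sketch of exercising query_image outside the Gradio UI, run in a session where app.py's definitions have already been executed, and assuming the spaces.GPU decorator falls back to a plain function call when not running on a ZeroGPU Space; the query string and threshold are illustrative, and images/animals.png is one of the files added in this commit.

import numpy as np
from PIL import Image

# One of the example images added in this commit.
img = np.array(Image.open("images/animals.png").convert("RGB"))

# The comma-separated query string is split inside query_image; boxes above the
# threshold are forwarded to SAM as box prompts and the masks are overlaid.
overlay, labelled_boxes = query_image(img, "Rabbit,Squirrel,Parrot", score_threshold=0.3)

overlay.save("overlay.png")   # RGBA image with translucent per-mask colours
print(labelled_boxes)         # e.g. [([x0, y0, x1, y1], "Rabbit"), ...]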
images/animals.png ADDED
images/dark_cell.png ADDED
images/purple cell.png ADDED
requirements.txt CHANGED
@@ -3,4 +3,6 @@ torch>=1.7.0
 torchvision>=0.8.1
 git+https://github.com/huggingface/transformers.git
 scipy
-spaces
+spaces
+matplotlib
+pillow
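One optional tightening of requirements.txt, not part of this commit: the git+ line tracks the transformers main branch, so Space rebuilds are unpinned. Assuming a released transformers version that already ships both Owlv2 and SAM (4.35.0 or later, to the best of my knowledge), a pinned sketch would be:

torch>=1.7.0
torchvision>=0.8.1
transformers>=4.35.0
scipy
spaces
matplotlib
pillow

matplotlib is kept here because the commit adds it, although app.py as committed does not import it; pillow is used via PIL for the mask overlays.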