Spaces:
Runtime error
Runtime error
from turtle import title | |
import os | |
import gradio as gr | |
from transformers import pipeline | |
import numpy as np | |
from PIL import Image | |
import torch | |
import cv2 | |
from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation,AutoProcessor,AutoConfig | |
from skimage.measure import label, regionprops | |
# The processor and the segmentation model are both loaded from the same
# pretrained checkpoint, once, when this module is imported.
_CLIPSEG_CHECKPOINT = "CIDAS/clipseg-rd64-refined"
processor = CLIPSegProcessor.from_pretrained(_CLIPSEG_CHECKPOINT)
model = CLIPSegForImageSegmentation.from_pretrained(_CLIPSEG_CHECKPOINT)
def rescale_bbox(bbox, orig_image_shape=(1024, 1024), model_shape=352):
    """Map a bounding box from model-output pixels to original-image pixels.

    Args:
        bbox: Four values laid out as ``(y1, x1, y2, x2)``; even positions
            are scaled by the image height, odd positions by its width.
        orig_image_shape: ``(height, width)`` of the target image.
        model_shape: Side length the box coordinates are currently
            expressed in; every coordinate is divided by this value.

    Returns:
        ``[y1, x1, y2, x2]`` as Python ints (truncated toward zero).
    """
    # Normalise first, then scale, so the float arithmetic order is fixed.
    normalized = np.asarray(bbox) / model_shape
    rows = normalized[0::2] * orig_image_shape[0]
    cols = normalized[1::2] * orig_image_shape[1]
    (top, bottom), (left, right) = rows, cols
    return [int(value) for value in (top, left, bottom, right)]
def detect_using_clip(image, prompts=None, threshould=0.4):
    """Locate regions of ``image`` that match each text prompt using CLIPSeg.

    For every prompt the model logits are passed through a sigmoid,
    thresholded into a binary mask, split into connected components, and
    each component's bounding box is rescaled from mask resolution to the
    resolution of ``image``.

    Args:
        image: Image object exposing ``.shape`` whose first two entries are
            height and width; it is handed to the processor once per prompt.
        prompts: Text prompts, one segmentation per prompt. ``None`` or an
            empty sequence yields an empty result.
        threshould: Probability above which a mask pixel counts as a match.
            The spelling is kept so existing keyword callers keep working.

    Returns:
        Dict mapping each prompt to a list of ``[y1, x1, y2, x2]`` boxes
        expressed in ``image`` pixel coordinates.
    """
    prompts = [] if prompts is None else list(prompts)
    if not prompts:
        # Nothing to segment; skip the processor/model call entirely.
        return {}

    inputs = processor(
        text=prompts,
        images=[image] * len(prompts),
        padding="max_length",
        return_tensors="pt",
    )
    with torch.no_grad():  # inference only, gradients are not needed
        outputs = model(**inputs)

    logits = outputs.logits
    if logits.dim() == 2:
        # With a single prompt the logits can arrive without a batch axis;
        # restore it so iterating yields one 2-D map per prompt.
        logits = logits.unsqueeze(0)

    orig_shape = image.shape[:2]
    model_detections = {}
    for prompt, logit_map in zip(prompts, logits):
        probabilities = torch.sigmoid(logit_map).detach().cpu().numpy()
        mask = np.where(probabilities > threshould, 255, 0)
        # Connected components of the mask; each one becomes a detection.
        regions = regionprops(label(mask))
        model_detections[prompt] = [
            rescale_bbox(
                region.bbox,
                orig_image_shape=orig_shape,
                model_shape=mask.shape[0],
            )
            for region in regions
        ]
    return model_detections
def display_images(image, detections, prompt='traffic light'):
    """Return a copy of ``image`` with the boxes stored for ``prompt`` drawn on it.

    Args:
        image: Image array; it is copied and the original is left untouched.
        detections: Mapping of prompt -> list of ``[y1, x1, y2, x2]`` boxes.
        prompt: Key in ``detections`` whose boxes should be drawn.

    Returns:
        The annotated copy. When ``prompt`` is not a key of ``detections`` a
        message is printed and the unmodified copy is returned.
    """
    image_copy = image.copy()
    if prompt not in detections:
        print("prompt not in query ..")
        return image_copy
    for bbox in detections[prompt]:
        y1, x1, y2, x2 = bbox[:4]
        # cv2.rectangle takes (x, y) corner points, while boxes are row-first.
        cv2.rectangle(image_copy, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 0), 2)
    return image_copy
def shot(image, labels_text):
    """Interface callback: detect every comma-separated label in ``image``.

    Args:
        image: Input image, forwarded unchanged to ``detect_using_clip``.
        labels_text: Comma-separated prompts, e.g. ``"bed,table,plant"``.

    Returns:
        The dict produced by ``detect_using_clip`` (prompt -> boxes), or an
        empty dict when ``labels_text`` contains no usable prompt.
    """
    global classes
    print(labels_text)
    # Trim whitespace and drop empty entries so "bed, table," becomes
    # ["bed", "table"] instead of ["bed", " table", ""].
    prompts = [part.strip() for part in (labels_text or "").split(',') if part.strip()]
    # Kept for backward compatibility: the parsed prompts are published in
    # the module-level ``classes`` variable.
    classes = prompts
    if not prompts:
        return {}
    return detect_using_clip(image, prompts=prompts)
def add_text(text):
    """Split a comma-separated string into a list of labels.

    The pieces are returned exactly as they appear between commas: no
    whitespace trimming and no removal of empty entries.
    """
    return text.split(',')
# Standalone image input component configured with type="numpy".
inputt = gr.Image(
    type="numpy",
    label="Input Image for Classification",
)
# with gr.Blocks(title="Zero Shot Object ddetection using Text Prompts") as demo : | |
# gr.Markdown( | |
# """ | |
# <center> | |
# <h1> | |
# The CLIP Model | |
# </h1> | |
# A neural network called CLIP which efficiently learns visual concepts from natural language supervision. CLIP can be applied to any visual classification benchmark by simply providing the names of the visual categories to be recognized, similar to the “zero-shot” capabilities of GPT-2 and GPT-3. | |
# </center> | |
# """ | |
# ) | |
# with gr.Row(): | |
# with gr.Column(): | |
# inputt = gr.Image(type="numpy", label="Input Image for Classification") | |
# labels = gr.Textbox(label="Enter Label/ labels",placeholder="ex. car,person",scale=4) | |
# button = gr.Button(value="Locate objects") | |
# with gr.Column(): | |
# outputs = gr.Image(type="numpy", label="Detected Objects with Selected Category") | |
# # dropdown = gr.Dropdown(labels,label="Select the category",info='Label selection panel') | |
# # labels.submit(add_text, inputs=labels) | |
# button.click(fn=shot,inputs=[inputt,labels],api_name='Get labels') | |
# demo.launch() | |
# shot() returns a dict mapping each prompt to a list of bounding boxes.
# The "label" output expects a class name or a {label: confidence} mapping,
# so the detections are rendered with the "json" output instead.
iface = gr.Interface(
    fn=shot,
    inputs=["image", "text"],
    outputs="json",
    # examples=[["images/room.jpg","bed,table,plant"]],
    # allow_flagging=False,
    # analytics_enabled=False,
)
iface.launch()