Spaces:
Runtime error
Runtime error
import os
import subprocess
import sys

# Install the Gradio release this demo is pinned to. Running pip as
# `sys.executable -m pip` targets the interpreter executing this script
# (a bare `pip` on PATH may belong to another Python), and the argument
# list avoids spawning a shell. check=False keeps this best-effort: a
# non-zero exit status from pip is ignored here.
subprocess.run([sys.executable, '-m', 'pip', 'install', 'gradio==2.3.0a0'], check=False)
# Print the installed package versions to the process output.
subprocess.run([sys.executable, '-m', 'pip', 'freeze'], check=False)
| import torch | |
| from PIL import Image | |
| import requests | |
| import torchvision.transforms as T | |
| import matplotlib.pyplot as plt | |
| from collections import defaultdict | |
| import torch.nn.functional as F | |
| import numpy as np | |
| from skimage.measure import find_contours | |
| from matplotlib import patches, lines | |
| from matplotlib.patches import Polygon | |
| import gradio as gr | |
# Download the image that the demo's `examples` entry points at.
torch.hub.download_url_to_file(
    'https://cdn.pixabay.com/photo/2014/03/04/15/10/elephants-279505_1280.jpg',
    'elephant.jpg',
)

# This script only runs inference, so gradient tracking is switched off
# globally.
torch.set_grad_enabled(False)

# standard PyTorch mean-std input image normalization, applied after the
# image has been resized and converted to a tensor
transform = T.Compose([
    T.Resize(800),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
# for output bounding box post-processing
def box_cxcywh_to_xyxy(x):
    """Convert boxes from center/size form to corner form.

    Args:
        x: Tensor whose last dimension has size 4 and holds
            ``(center_x, center_y, width, height)`` per box. Any number
            of leading dimensions is accepted.

    Returns:
        Tensor of the same shape holding
        ``(x_min, y_min, x_max, y_max)`` per box.
    """
    # Work on the last axis so batched inputs are handled too; for the
    # (N, 4) layout, dim=-1 is the same axis as dim=1.
    x_c, y_c, w, h = x.unbind(-1)
    half_w, half_h = 0.5 * w, 0.5 * h
    corners = [x_c - half_w, y_c - half_h, x_c + half_w, y_c + half_h]
    return torch.stack(corners, dim=-1)
def rescale_bboxes(out_bbox, size):
    """Convert relative center/size boxes to corner boxes at image scale.

    Args:
        out_bbox: Tensor of ``(center_x, center_y, width, height)`` boxes;
            the caller passes coordinates in the [0, 1] range.
        size: ``(width, height)`` of the image the boxes refer to.

    Returns:
        Tensor of ``(x_min, y_min, x_max, y_max)`` boxes multiplied by the
        image width/height.
    """
    img_w, img_h = size
    # Create the scale on the same device as the boxes so that inputs
    # which do not live on the CPU multiply without a device mismatch.
    scale = torch.tensor([img_w, img_h, img_w, img_h],
                         dtype=torch.float32, device=out_bbox.device)
    return box_cxcywh_to_xyxy(out_bbox) * scale
# colors for visualization: RGB triples with components in the 0-1 range
COLORS = [
    [0.000, 0.447, 0.741],
    [0.850, 0.325, 0.098],
    [0.929, 0.694, 0.125],
    [0.494, 0.184, 0.556],
    [0.466, 0.674, 0.188],
    [0.301, 0.745, 0.933],
]
def apply_mask(image, mask, color, alpha=0.5):
    """Blend ``color`` into ``image`` wherever ``mask`` equals 1.

    The first three channels of ``image`` are overwritten in place; the
    same array is returned for convenience.

    Args:
        image: Array indexed as ``[row, col, channel]``.
        mask: Array compared against 1 to select the pixels to tint.
        color: Sequence of three components, each scaled by 255 before
            blending.
        alpha: Weight of the color in the blend (the image keeps
            ``1 - alpha``).

    Returns:
        ``image``, modified in place.
    """
    selected = mask == 1
    for channel in range(3):
        plane = image[:, :, channel]
        tinted = plane * (1 - alpha) + alpha * color[channel] * 255
        # Pixels outside the mask keep their original value.
        image[:, :, channel] = np.where(selected, tinted, plane)
    return image
def plot_results(pil_img, scores, boxes, labels, masks=None):
    """Draw detections on an image and save the rendering to ``foo.png``.

    Args:
        pil_img: Image to annotate; converted with ``np.array``.
        scores: One confidence value per detection.
        boxes: Object whose ``tolist()`` yields one
            ``(xmin, ymin, xmax, ymax)`` row per detection.
        labels: One text label per detection.
        masks: Optional per-detection masks (individual entries may be
            ``None``). When omitted, no masks are drawn.

    Returns:
        The path of the written file, always ``'foo.png'``.

    Raises:
        AssertionError: If ``scores``, ``boxes``, ``labels`` and ``masks``
            differ in length.
    """
    fig = plt.figure(figsize=(16, 10))
    try:
        np_image = np.array(pil_img)
        ax = fig.gca()
        # Repeat the palette so zip() does not stop early for long inputs.
        colors = COLORS * 100
        if masks is None:
            masks = [None] * len(scores)
        assert len(scores) == len(boxes) == len(labels) == len(masks)
        for score, (xmin, ymin, xmax, ymax), label, mask, c in zip(
                scores, boxes.tolist(), labels, masks, colors):
            ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                       fill=False, color=c, linewidth=3))
            text = f'{label}: {score:0.2f}'
            ax.text(xmin, ymin, text, fontsize=15, bbox=dict(facecolor='white', alpha=0.8))
            if mask is None:
                continue
            np_image = apply_mask(np_image, mask, c)
            # Pad the mask by one pixel on every side before tracing
            # contours; the offset is removed again below.
            padded_mask = np.zeros((mask.shape[0] + 2, mask.shape[1] + 2), dtype=np.uint8)
            padded_mask[1:-1, 1:-1] = mask
            contours = find_contours(padded_mask, 0.5)
            for verts in contours:
                # Subtract the padding and flip (y, x) to (x, y)
                verts = np.fliplr(verts) - 1
                ax.add_patch(Polygon(verts, facecolor="none", edgecolor=c))
        ax.imshow(np_image)
        ax.axis('off')
        fig.savefig('foo.png', bbox_inches='tight')
    finally:
        # pyplot keeps a reference to every figure until it is closed;
        # without this, each call would leave another figure in memory.
        plt.close(fig)
    return 'foo.png'
def add_res(results, ax, color='green'):
    """Draw the boxes in ``results`` on ``ax`` and print each caption.

    Args:
        results: Mapping with ``'boxes'``, ``'labels'`` and ``'scores'``
            entries, iterated in lockstep. Each box is indexed as
            ``(xmin, ymin, xmax, ymax)``.
        ax: Axes object that receives the rectangles and text.
        color: Unused; kept so existing callers remain valid.
    """
    palette = ['purple', 'yellow', 'red', 'green', 'orange', 'pink']
    detections = zip(results['boxes'], results['labels'], results['scores'])
    for i, (b, ll, ss) in enumerate(detections):
        # Cycle through the palette so more than len(palette) detections
        # do not raise IndexError.
        edge_color = palette[i % len(palette)]
        ax.add_patch(plt.Rectangle((b[0], b[1]), b[2] - b[0], b[3] - b[1],
                                   fill=False, color=edge_color, linewidth=3))
        # NOTE(review): CLASSES is not defined in the visible part of this
        # file; a non-string label raises NameError unless it is provided
        # elsewhere.
        cls_name = ll if isinstance(ll, str) else CLASSES[ll]
        text = f'{cls_name}: {ss:.2f}'
        print(text)
        ax.text(b[0], b[1], text, fontsize=15, bbox=dict(facecolor='white', alpha=0.8))
# Load the pretrained MDETR checkpoint (and its postprocessor) from
# torch.hub, then keep the model on the CPU in evaluation mode.
model, postprocessor = torch.hub.load(
    'ashkamath/mdetr:main',
    'mdetr_efficientnetB5',
    pretrained=True,
    return_postprocessor=True,
)
model = model.cpu()
model.eval()
def plot_inference(im, caption):
    """Run the model on an image/caption pair and render the detections.

    Args:
        im: PIL image to analyse (its ``size`` is used to rescale boxes).
        caption: Text whose spans are matched to the predicted boxes.

    Returns:
        Whatever ``plot_results`` returns (the path of the rendered image).
    """
    # mean-std normalize the input image (batch-size: 1)
    img = transform(im).unsqueeze(0).cpu()

    # propagate through the model: the first call produces the cache that
    # the second call consumes
    memory_cache = model(img, [caption], encode_and_save=True)
    outputs = model(img, [caption], encode_and_save=False, memory_cache=memory_cache)

    # keep only predictions with 0.7+ confidence, where confidence is one
    # minus the softmax mass on the last logit slot
    probas = 1 - outputs['pred_logits'].softmax(-1)[0, :, -1].cpu()
    keep = (probas > 0.7).cpu()

    # convert boxes from [0; 1] to image scales
    bboxes_scaled = rescale_bboxes(outputs['pred_boxes'].cpu()[0, keep], im.size)

    # Extract the text spans predicted by each box
    positive_tokens = (outputs["pred_logits"].cpu()[0, keep].softmax(-1) > 0.1).nonzero().tolist()
    predicted_spans = defaultdict(str)
    for item, pos in positive_tokens:
        # NOTE(review): 255 presumably excludes the last logit slot, which
        # is not a caption token; confirm against the model's logit width.
        if pos < 255:
            span = memory_cache["tokenized"].token_to_chars(0, pos)
            predicted_spans[item] += " " + caption[span.start:span.end]

    # Emit exactly one label per kept box, in box order. A kept box with no
    # token above the threshold gets an empty label instead of being
    # skipped; skipping would shift the labels of later boxes and break
    # the length check in plot_results.
    labels = [predicted_spans[k] for k in range(len(bboxes_scaled))]
    return plot_results(im, probas[keep], bboxes_scaled, labels)
# Text shown on the demo page.
title = "MDETR"
description = "Gradio demo for MDETR: Modulated Detection for End-to-End Multi-Modal Understanding. To use it, simply upload your image and add text, or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2104.12763'>MDETR: Modulated Detection for End-to-End Multi-Modal Understanding</a> | <a href='https://github.com/ashkamath/mdetr'>Github Repo</a></p>"

# One clickable example: the downloaded image plus a caption.
examples = [['elephant.jpg', 'baby elephant']]

# Input/output widgets: an image and a caption in, a rendered image file out.
demo_inputs = [
    gr.inputs.Image(type="pil", label="Input"),
    gr.inputs.Textbox(label="input text"),
]
demo_output = gr.outputs.Image(type="file", label="Output")

demo = gr.Interface(
    fn=plot_inference,
    inputs=demo_inputs,
    outputs=demo_output,
    title=title,
    description=description,
    article=article,
    examples=examples,
    enable_queue=True,
)
demo.launch(debug=True)