# Source: Hugging Face Space "ysalaun/Toolbox" (commit a36cede, file size 7,501 bytes).
# Space status at capture time: Runtime error.

import os

# gradio for visual demo
import gradio as gr

# Install the runtime dependencies with pip before they are imported below;
# transformers is installed from its GitHub repository for easy access to nnet.
for _install_command in (
    "pip install git+https://github.com/huggingface/transformers.git",
    "pip install datasets",
    "pip install scipy",
    "pip install torch",
):
    os.system(_install_command)

from transformers import AutoImageProcessor, AutoModelForImageClassification, DPTForDepthEstimation, Mask2FormerForUniversalSegmentation
import torch
import numpy as np
from PIL import Image
import requests
from collections import defaultdict

# Color lookup table used by seg2sem / seg2pano: row i is the [R, G, B] color
# painted for label id (semantic) or segment id (panoptic) i. The table has
# 151 rows and row 0 is black.
# NOTE(review): the values look like the ADE20K palette with a leading black
# entry - TODO confirm the origin.
palette = np.asarray([
    [0, 0, 0],
    [120, 120, 120],
    [180, 120, 120],
    [6, 230, 230],
    [80, 50, 50],
    [4, 200, 3],
    [120, 120, 80],
    [140, 140, 140],
    [204, 5, 255],
    [230, 230, 230],
    [4, 250, 7],
    [224, 5, 255],
    [235, 255, 7],
    [150, 5, 61],
    [120, 120, 70],
    [8, 255, 51],
    [255, 6, 82],
    [143, 255, 140],
    [204, 255, 4],
    [255, 51, 7],
    [204, 70, 3],
    [0, 102, 200],
    [61, 230, 250],
    [255, 6, 51],
    [11, 102, 255],
    [255, 7, 71],
    [255, 9, 224],
    [9, 7, 230],
    [220, 220, 220],
    [255, 9, 92],
    [112, 9, 255],
    [8, 255, 214],
    [7, 255, 224],
    [255, 184, 6],
    [10, 255, 71],
    [255, 41, 10],
    [7, 255, 255],
    [224, 255, 8],
    [102, 8, 255],
    [255, 61, 6],
    [255, 194, 7],
    [255, 122, 8],
    [0, 255, 20],
    [255, 8, 41],
    [255, 5, 153],
    [6, 51, 255],
    [235, 12, 255],
    [160, 150, 20],
    [0, 163, 255],
    [140, 140, 140],
    [250, 10, 15],
    [20, 255, 0],
    [31, 255, 0],
    [255, 31, 0],
    [255, 224, 0],
    [153, 255, 0],
    [0, 0, 255],
    [255, 71, 0],
    [0, 235, 255],
    [0, 173, 255],
    [31, 0, 255],
    [11, 200, 200],
    [255, 82, 0],
    [0, 255, 245],
    [0, 61, 255],
    [0, 255, 112],
    [0, 255, 133],
    [255, 0, 0],
    [255, 163, 0],
    [255, 102, 0],
    [194, 255, 0],
    [0, 143, 255],
    [51, 255, 0],
    [0, 82, 255],
    [0, 255, 41],
    [0, 255, 173],
    [10, 0, 255],
    [173, 255, 0],
    [0, 255, 153],
    [255, 92, 0],
    [255, 0, 255],
    [255, 0, 245],
    [255, 0, 102],
    [255, 173, 0],
    [255, 0, 20],
    [255, 184, 184],
    [0, 31, 255],
    [0, 255, 61],
    [0, 71, 255],
    [255, 0, 204],
    [0, 255, 194],
    [0, 255, 82],
    [0, 10, 255],
    [0, 112, 255],
    [51, 0, 255],
    [0, 194, 255],
    [0, 122, 255],
    [0, 255, 163],
    [255, 153, 0],
    [0, 255, 10],
    [255, 112, 0],
    [143, 255, 0],
    [82, 0, 255],
    [163, 255, 0],
    [255, 235, 0],
    [8, 184, 170],
    [133, 0, 255],
    [0, 255, 92],
    [184, 0, 255],
    [255, 0, 31],
    [0, 184, 255],
    [0, 214, 255],
    [255, 0, 112],
    [92, 255, 0],
    [0, 224, 255],
    [112, 224, 255],
    [70, 184, 160],
    [163, 0, 255],
    [153, 0, 255],
    [71, 255, 0],
    [255, 0, 163],
    [255, 204, 0],
    [255, 0, 143],
    [0, 255, 235],
    [133, 255, 0],
    [255, 0, 235],
    [245, 0, 255],
    [255, 0, 122],
    [255, 245, 0],
    [10, 190, 212],
    [214, 255, 0],
    [0, 204, 255],
    [20, 0, 255],
    [255, 255, 0],
    [0, 153, 255],
    [0, 41, 255],
    [0, 255, 204],
    [41, 0, 255],
    [41, 255, 0],
    [173, 0, 255],
    [0, 245, 255],
    [71, 0, 255],
    [122, 0, 255],
    [0, 255, 184],
    [0, 92, 255],
    [184, 255, 0],
    [0, 133, 255],
    [255, 214, 0],
    [25, 194, 194],
    [102, 255, 0],
    [92, 0, 255],
])

# Checkpoint shared by the depth pre-processor and the depth-estimation model.
_DEPTH_CHECKPOINT = "facebook/dpt-dinov2-small-nyu"
depth_image_processor = AutoImageProcessor.from_pretrained(_DEPTH_CHECKPOINT)
depth_model = DPTForDepthEstimation.from_pretrained(_DEPTH_CHECKPOINT)

def compute_depth(img):
    """Estimate a depth map for ``img`` and render it as an 8-bit image.

    Args:
        img: PIL image; ``img.size`` is read as ``(width, height)``.

    Returns:
        A two-item list ``[depth, "depth"]`` where ``depth`` is a PIL image
        built from a 2-D uint8 array with the same height and width as
        ``img``, scaled so that the largest predicted value maps to 255.
    """
    # prepare image for the model
    inputs = depth_image_processor(images=img, return_tensors="pt")

    with torch.no_grad():
        predicted_depth = depth_model(**inputs).predicted_depth

    # interpolate to original size; img.size is reversed to get (height, width)
    prediction = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(1),
        size=img.size[::-1],
        mode="bicubic",
        align_corners=False,
    )

    # Take the single (batch, channel) plane explicitly: squeeze() would also
    # drop a genuine size-1 height or width and break Image.fromarray.
    output = prediction[0, 0].cpu().numpy()

    # visualize the prediction
    peak = float(np.max(output))
    if peak > 0:
        scaled = output * 255 / peak
    else:
        # Flat (all-zero) prediction: avoid dividing by zero.
        scaled = np.zeros_like(output)
    # Bicubic interpolation can overshoot below zero; clip before the cast so
    # negative values do not wrap around to bright pixels in uint8.
    formatted = np.clip(scaled, 0, 255).astype("uint8")
    depth = Image.fromarray(formatted)
    return [depth, "depth"]

# Checkpoint shared by the classification pre-processor and the classifier.
_CLAS_CHECKPOINT = "facebook/dinov2-small-imagenet1k-1-layer"
clas_processor = AutoImageProcessor.from_pretrained(_CLAS_CHECKPOINT)
clas_model = AutoModelForImageClassification.from_pretrained(_CLAS_CHECKPOINT)

def compute_clas(img):
    """Classify ``img`` and return it together with the predicted class name.

    Args:
        img: image passed to ``clas_processor`` (a PIL image in this demo).

    Returns:
        A two-item list ``[img, label]`` where ``label`` is the entry of
        ``clas_model.config.id2label`` for the highest-scoring logit.
    """
    inputs = clas_processor(images=img, return_tensors="pt")
    # Inference only: skip autograd bookkeeping, as the depth and
    # segmentation functions in this file already do.
    with torch.no_grad():
        logits = clas_model(**inputs).logits
    predicted_class_idx = logits.argmax(-1).item()
    return [img, clas_model.config.id2label[predicted_class_idx]]

# Checkpoint shared by the Mask2Former pre-processor and segmentation model.
_M2F_CHECKPOINT = "facebook/mask2former-swin-base-coco-panoptic"
m2f_processor = AutoImageProcessor.from_pretrained(_M2F_CHECKPOINT)
m2f_model = Mask2FormerForUniversalSegmentation.from_pretrained(_M2F_CHECKPOINT)

def seg2sem(seg):
    """Colorize a semantic segmentation map and list the classes it contains.

    Args:
        seg: 2-D map of label ids indexed ``(height, width)``; a torch tensor
            or anything ``np.asarray`` accepts.

    Returns:
        A two-item list ``[image, handles]``. ``image`` is an RGB PIL image
        in which every pixel whose label id indexes ``palette`` is painted
        with that palette row (other pixels stay black). ``handles`` is a
        flat list alternating class name and color, one pair per label id
        present in ``seg``, in ascending label order.
    """
    # Work on a NumPy array so the mask logic does not depend on torch-only
    # tensor methods.
    if isinstance(seg, torch.Tensor):
        seg = seg.detach().cpu().numpy()
    else:
        seg = np.asarray(seg)

    color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)  # height, width, 3

    handles = []
    for label, color in enumerate(palette):
        # Build the mask once and reuse it for the presence test and the paint.
        mask = seg == label
        if not mask.any():
            continue
        color_seg[mask, :] = color
        handles.append(m2f_model.config.id2label[label])
        handles.append(color)

    image = Image.fromarray(color_seg)

    return [image, handles]
    
def seg2pano(seg, segments_info):
    """Colorize a panoptic segmentation map and build its instance legend.

    Args:
        seg: 2-D map of segment ids indexed ``(height, width)``; a torch
            tensor or anything ``np.asarray`` accepts.
        segments_info: iterable of dicts, each providing at least the keys
            ``'id'`` (segment id used in ``seg``) and ``'label_id'`` (class id
            looked up in ``m2f_model.config.id2label``).

    Returns:
        A two-item list ``[image, handles]``. ``image`` is an RGB PIL image
        with each non-negative segment id painted in its palette color;
        negative ids stay black. ``handles`` is a flat list alternating
        ``"<class name>-<instance index>"`` and the segment's color, in the
        order of ``segments_info``.
    """
    if isinstance(seg, torch.Tensor):
        seg = seg.detach().cpu().numpy()
    else:
        seg = np.asarray(seg)

    palette_size = len(palette)
    color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)  # height, width, 3

    # Paint only the ids that actually occur instead of scanning every palette
    # row. Ids are wrapped modulo the palette size so that ids past the end of
    # the palette still get a color.
    for raw_id in np.unique(seg):
        segment_id = int(raw_id)
        if segment_id < 0:
            continue
        color_seg[seg == raw_id, :] = palette[segment_id % palette_size]

    image = Image.fromarray(color_seg)

    instances_counter = defaultdict(int)
    handles = []
    for segment in segments_info:
        segment_id = segment['id']
        segment_label_id = segment['label_id']
        segment_label = m2f_model.config.id2label[segment_label_id]
        label = f"{segment_label}-{instances_counter[segment_label_id]}"
        instances_counter[segment_label_id] += 1
        # Same wrap-around as the painting loop so the legend color matches
        # the map and large ids do not raise IndexError.
        color = palette[segment_id % palette_size]
        handles.append(label)
        handles.append(color)
    return [image, handles]

def compute_m2f_sem_seg(img):
    """Run Mask2Former on ``img`` and return ``seg2sem`` of its semantic map.

    The map is post-processed to the reversed ``img.size`` and the first
    (only) batch entry is handed to ``seg2sem``.
    """
    model_inputs = m2f_processor(images=img, return_tensors="pt")

    with torch.no_grad():
        model_outputs = m2f_model(**model_inputs)

    # img.size is reversed to form the target size for post-processing.
    target_size = img.size[::-1]
    semantic_maps = m2f_processor.post_process_semantic_segmentation(
        model_outputs, target_sizes=[target_size]
    )

    return seg2sem(semantic_maps[0])

def compute_m2f_pano_seg(img):
    """Run Mask2Former on ``img`` and return ``seg2pano`` of its panoptic result.

    The result is post-processed to the reversed ``img.size``; the first
    (only) batch entry supplies the ``"segmentation"`` map and the
    ``"segments_info"`` list passed to ``seg2pano``.
    """
    model_inputs = m2f_processor(images=img, return_tensors="pt")

    with torch.no_grad():
        model_outputs = m2f_model(**model_inputs)

    # img.size is reversed to form the target size for post-processing.
    target_size = img.size[::-1]
    panoptic_results = m2f_processor.post_process_panoptic_segmentation(
        model_outputs, target_sizes=[target_size]
    )
    panoptic = panoptic_results[0]

    return seg2pano(panoptic["segmentation"], panoptic["segments_info"])

# Radio-button choices; detect() dispatches on the position within this list.
labels = [
    "Dinov2 - Depth",
    "Dinov2 - Classification",
    "M2F - Semantic Segmentation",
    "M2F - Panoptic Segmentation",
]

# main function
def detect(img, application):
    """Run the selected application on ``img``.

    Args:
        img: input image forwarded unchanged to the selected compute function.
        application: one of the strings in ``labels``; anything else
            (including ``None`` when nothing is selected) runs no model.

    Returns:
        A two-item list ``[image, text]``, one value per output component of
        the interface. When no known application is selected the input image
        is returned with an empty text.
    """
    # Handlers are listed in the same order as the entries of ``labels``.
    handlers = dict(zip(labels, (
        compute_depth,
        compute_clas,
        compute_m2f_sem_seg,
        compute_m2f_pano_seg,
    )))
    handler = handlers.get(application)
    if handler is None:
        # The interface declares two outputs, so the fallback must also
        # return two values rather than the bare image.
        return [img, ""]
    return handler(img)
    
# visual gradio interface: an image plus an application selector in,
# an image plus a text box out
iface = gr.Interface(
    fn=detect,
    inputs=[
        gr.Image(type="pil"),
        gr.Radio(labels, label="Application"),
    ],
    outputs=[
        gr.Image(type="pil"),
        gr.Textbox(),
    ],
)
iface.launch(debug=True)