import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from sklearn.cluster import KMeans
from transformers import DPTFeatureExtractor, DPTForDepthEstimation

# Download an example image from the COCO validation set
torch.hub.download_url_to_file('http://images.cocodataset.org/val2017/000000039769.jpg', 'cats.jpg')

feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large")
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

def process_image(image):
    # Prepare image for the model
    encoding = feature_extractor(image, return_tensors="pt")

    # Forward pass
    with torch.no_grad():
        outputs = model(**encoding)
        predicted_depth = outputs.predicted_depth

    # Interpolate the predicted depth back to the original image size
    # (PIL's image.size is (width, height), so reverse it for torch)
    prediction = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(1),
        size=image.size[::-1],
        mode="bicubic",
        align_corners=False,
    ).squeeze()

    # Normalize the raw depth values to [0, 255] before casting to uint8;
    # casting the unnormalized values would overflow
    output = prediction.cpu().numpy()
    depth_map_gray = (output * 255 / np.max(output)).astype('uint8')

    # Perform feature segmentation: cluster grayscale pixel intensities with k-means
    rgb_image = np.array(image)
    gray_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2GRAY)
    pixels = gray_image.reshape((-1, 1))

    num_clusters = 3
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
    kmeans.fit(pixels)
    labels = kmeans.labels_.reshape(gray_image.shape)

    # Mask the original image with each cluster to produce one view per cluster
    cluster_features = []
    for i in range(num_clusters):
        mask = np.where(labels == i, 255, 0).astype(np.uint8)
        cluster_image = cv2.bitwise_and(rgb_image, rgb_image, mask=mask)
        cluster_features.append(cluster_image)

    # Prepare output images
    depth_image = Image.fromarray(depth_map_gray, mode='L')
    cluster_images = [Image.fromarray(cluster) for cluster in cluster_features]

    # Gradio expects one return value per declared output component,
    # so unpack the cluster images rather than returning a list
    return depth_image, *cluster_images

title = "Demo: zero-shot depth estimation with DPT and feature segmentation"
description = (
    "Demo for Intel's DPT, a Dense Prediction Transformer for state-of-the-art dense "
    "prediction tasks such as semantic segmentation and depth estimation, combined with "
    "k-means feature segmentation."
)
examples = [['cats.jpg']]

iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Image(type="pil", label="predicted depth"),
        gr.Image(type="pil", label="cluster 1"),
        gr.Image(type="pil", label="cluster 2"),
        gr.Image(type="pil", label="cluster 3"),
    ],
    title=title,
    description=description,
    examples=examples,
)

iface.queue().launch(debug=True)