import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from sklearn.cluster import KMeans
from transformers import DPTFeatureExtractor, DPTForDepthEstimation

# Download an example image from the COCO validation set
torch.hub.download_url_to_file('http://images.cocodataset.org/val2017/000000039769.jpg', 'cats.jpg')

feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large")
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

def process_image(image):
    # Prepare image for the model
    encoding = feature_extractor(image, return_tensors="pt")

    # Forward pass
    with torch.no_grad():
        outputs = model(**encoding)
        predicted_depth = outputs.predicted_depth

    # Interpolate the predicted depth back to the original image size
    # (PIL's image.size is (width, height), so reverse it for torch)
    prediction = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(1),
        size=image.size[::-1],
        mode="bicubic",
        align_corners=False,
    ).squeeze()

    # Normalize the raw depth values to [0, 255] before casting to uint8;
    # casting the unnormalized values would overflow
    output = prediction.cpu().numpy()
    depth_map_gray = (output * 255 / np.max(output)).astype('uint8')

    # Perform feature segmentation: cluster grayscale pixel intensities with k-means
    rgb_image = np.array(image)
    gray_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2GRAY)
    pixels = gray_image.reshape((-1, 1))

    num_clusters = 3
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
    kmeans.fit(pixels)
    labels = kmeans.labels_.reshape(gray_image.shape)

    # Mask the original image with each cluster to produce one view per cluster
    cluster_features = []
    for i in range(num_clusters):
        mask = np.where(labels == i, 255, 0).astype(np.uint8)
        cluster_image = cv2.bitwise_and(rgb_image, rgb_image, mask=mask)
        cluster_features.append(cluster_image)

    # Prepare output images
    depth_image = Image.fromarray(depth_map_gray, mode='L')
    cluster_images = [Image.fromarray(cluster) for cluster in cluster_features]

    # Gradio expects one return value per declared output component,
    # so unpack the cluster images rather than returning a list
    return depth_image, *cluster_images

title = "Demo: zero-shot depth estimation with DPT and feature segmentation"
description = (
    "Demo for Intel's DPT, a Dense Prediction Transformer for state-of-the-art dense "
    "prediction tasks such as semantic segmentation and depth estimation, combined with "
    "k-means feature segmentation."
)
examples = [['cats.jpg']]

iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Image(type="pil", label="predicted depth"),
        gr.Image(type="pil", label="cluster 1"),
        gr.Image(type="pil", label="cluster 2"),
        gr.Image(type="pil", label="cluster 3"),
    ],
    title=title,
    description=description,
    examples=examples,
)

iface.queue().launch(debug=True)