import spaces  # Import spaces first
import cv2
import torch
import numpy as np
import gradio as gr
from depth_anything_v2.dpt import DepthAnythingV2

# Model initialization
model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}
}


class NormalMapSimple:
    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "images": ("IMAGE",),
                "scale_XY": ("FLOAT", {"default": 1, "min": 0, "max": 100, "step": 0.001}),
            },
        }

    RETURN_TYPES = ("IMAGE",)
    FUNCTION = "normal_map"
    CATEGORY = "image/filters"

    def normal_map(self, images, scale_XY):
        t = images.detach().clone().cpu().numpy().astype(np.float32)
        L = np.mean(t[:,:,:,:3], axis=3)
        for i in range(t.shape[0]):
            # Scharr gradients of the luminance give the X/Y normal components
            # (borderType must be passed as a keyword; positionally it would be taken as dst)
            t[i,:,:,0] = cv2.Scharr(L[i], -1, 1, 0, borderType=cv2.BORDER_REFLECT) * -1
            t[i,:,:,1] = cv2.Scharr(L[i], -1, 0, 1, borderType=cv2.BORDER_REFLECT)
        t[:,:,:,2] = 1
        t = torch.from_numpy(t)
        t[:,:,:,:2] *= scale_XY
        t[:,:,:,:3] = torch.nn.functional.normalize(t[:,:,:,:3], dim=3) / 2 + 0.5
        return (t,)


class ConvertNormals:
    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "normals": ("IMAGE",),
                "input_mode": (["BAE", "MiDaS", "Standard", "DirectX"],),
                "output_mode": (["BAE", "MiDaS", "Standard", "DirectX"],),
                "scale_XY": ("FLOAT", {"default": 1, "min": 0, "max": 100, "step": 0.001}),
                "normalize": ("BOOLEAN", {"default": True}),
                "fix_black": ("BOOLEAN", {"default": True}),
            },
            "optional": {
                "optional_fill": ("IMAGE",),
            },
        }

    RETURN_TYPES = ("IMAGE",)
    FUNCTION = "convert_normals"
    CATEGORY = "image/filters"

    def convert_normals(self, normals, input_mode, output_mode, scale_XY, normalize, fix_black, optional_fill=None):
        try:
            t = normals.detach().clone()

            if input_mode == "BAE":
                t[:,:,:,0] = 1 - t[:,:,:,0]  # invert R
            elif input_mode == "MiDaS":
                t[:,:,:,:3] = torch.stack([1 - t[:,:,:,2], t[:,:,:,1], t[:,:,:,0]], dim=3)  # BGR -> RGB and invert R
            elif input_mode == "DirectX":
                t[:,:,:,1] = 1 - t[:,:,:,1]  # invert G

            if fix_black:
                key = torch.clamp(1 - t[:,:,:,2] * 2, min=0, max=1)
                if optional_fill is None:
                    t[:,:,:,0] += key * 0.5
                    t[:,:,:,1] += key * 0.5
                    t[:,:,:,2] += key
                else:
                    fill = optional_fill.detach().clone()
                    if fill.shape[1:3] != t.shape[1:3]:
                        fill = torch.nn.functional.interpolate(fill.movedim(-1, 1), size=(t.shape[1], t.shape[2]), mode='bilinear').movedim(1, -1)
                    if fill.shape[0] != t.shape[0]:
                        fill = fill[0].unsqueeze(0).expand(t.shape[0], -1, -1, -1)
                    t[:,:,:,:3] += fill[:,:,:,:3] * key.unsqueeze(3).expand(-1, -1, -1, 3)

            t[:,:,:,:2] = (t[:,:,:,:2] - 0.5) * scale_XY + 0.5

            if normalize:
                # Transform to [-1, 1] range
                t_norm = t[:,:,:,:3] * 2 - 1
                # Calculate the length of each vector
                lengths = torch.sqrt(torch.sum(t_norm ** 2, dim=3, keepdim=True))
                # Avoid division by zero
                lengths = torch.clamp(lengths, min=1e-6)
                # Normalize each vector to unit length
                t_norm = t_norm / lengths
                # Transform back to [0, 1] range
                t[:,:,:,:3] = (t_norm + 1) / 2

            if output_mode == "BAE":
                t[:,:,:,0] = 1 - t[:,:,:,0]  # invert R
            elif output_mode == "MiDaS":
                t[:,:,:,:3] = torch.stack([t[:,:,:,2], t[:,:,:,1], 1 - t[:,:,:,0]], dim=3)  # invert R and BGR -> RGB
            elif output_mode == "DirectX":
                t[:,:,:,1] = 1 - t[:,:,:,1]  # invert G

            return (t,)
        except Exception as e:
            print(f"Error in convert_normals: {str(e)}")
            return (normals,)


def get_image_intensity(img, gamma_correction=1.0):
    """
    Extract intensity map from an image using HSV color space
    """
    # Convert to HSV color space
    result = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
    # Extract Value channel (intensity)
    result = result[:, :, 2].astype(np.float32) / 255.0
    # Apply gamma correction
    result = result ** gamma_correction
    # Convert back to 0-255 range
    result = (result * 255.0).clip(0, 255).astype(np.uint8)
    # Convert to RGB (still grayscale but in RGB format)
    result = cv2.cvtColor(result, cv2.COLOR_GRAY2RGB)
    return result


def blend_numpy_images(image1, image2, blend_factor=0.4, mode="normal"):
    """
    Blend two numpy images using normal mode
    """
    # Convert to float32 and normalize to 0-1
    img1 = image1.astype(np.float32) / 255.0
    img2 = image2.astype(np.float32) / 255.0
    # Normal blend mode
    blended = img1 * (1 - blend_factor) + img2 * blend_factor
    # Convert back to uint8
    blended = (blended * 255.0).clip(0, 255).astype(np.uint8)
    return blended


def process_normal_map(image):
    """
    Process image through NormalMapSimple and ConvertNormals
    """
    # Convert numpy image to torch tensor with batch dimension
    image_tensor = torch.from_numpy(image).unsqueeze(0).float() / 255.0

    # Create instances of the classes
    normal_map_generator = NormalMapSimple()
    normal_converter = ConvertNormals()

    # Generate initial normal map
    normal_map = normal_map_generator.normal_map(image_tensor, scale_XY=1.0)[0]

    # Convert normal map from Standard to DirectX
    converted_normal = normal_converter.convert_normals(
        normal_map,
        input_mode="Standard",
        output_mode="DirectX",
        scale_XY=1.0,
        normalize=True,
        fix_black=True
    )[0]

    # Convert back to numpy array
    result = (converted_normal.squeeze(0).numpy() * 255).astype(np.uint8)
    return result


# Initialize model globally
def initialize_model():
    encoder = 'vitl'
    max_depth = 1
    model = DepthAnythingV2(**{**model_configs[encoder], 'max_depth': max_depth})

    # Load checkpoint
    checkpoint = torch.load('checkpoints/model2.pth', map_location='cpu')

    # Get state dict: skip optimizer/bookkeeping entries, keep the remaining entry (the model weights)
    state_dict = {}
    for key in checkpoint.keys():
        if key not in ['optimizer', 'epoch', 'previous_best']:
            state_dict = checkpoint[key]

    # Handle module prefix
    my_state_dict = {}
    for key in state_dict.keys():
        new_key = key.replace('module.', '')
        my_state_dict[new_key] = state_dict[key]

    model.load_state_dict(my_state_dict)
    return model


MODEL = initialize_model()


@spaces.GPU
def process_image(input_image):
    """
    Process the input image and return depth maps, intensity map, blended result, and normal map
    """
    if input_image is None:
        return None, None, None, None, None

    # Move model to GPU for processing
    MODEL.to('cuda')
    MODEL.eval()

    # Convert from RGB to BGR for depth processing
    input_bgr = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)

    with torch.no_grad():
        # Get depth map
        depth = MODEL.infer_image(input_bgr)

        # Normalize depth for visualization (0-255)
        depth_normalized = ((depth - depth.min()) / (depth.max() - depth.min()) * 255).astype(np.uint8)

        # Apply colormap for better visualization
        depth_colormap = cv2.applyColorMap(depth_normalized, cv2.COLORMAP_INFERNO)
        depth_colormap = cv2.cvtColor(depth_colormap, cv2.COLOR_BGR2RGB)

    # Move model back to CPU after processing
    MODEL.to('cpu')

    # Get intensity map
    intensity_map = get_image_intensity(np.array(input_image), gamma_correction=1.0)

    # Blend depth raw with intensity map
    blended_result = blend_numpy_images(
        cv2.cvtColor(depth_normalized, cv2.COLOR_GRAY2RGB),  # Convert depth to RGB
        intensity_map,
        blend_factor=0.4,
        mode="normal"
    )

    # Generate normal map from blended result
    normal_map = process_normal_map(blended_result)

    return depth_normalized, depth_colormap, intensity_map, blended_result, normal_map


@spaces.GPU
def gradio_interface(input_img):
    try:
        depth_raw, depth_colored, intensity, blended, normal = process_image(input_img)
        return [input_img, depth_raw, depth_colored, intensity, blended, normal]
    except Exception as e:
        print(f"Error processing image: {str(e)}")
        return [input_img, None, None, None, None, None]


# Define interface (one output component per item returned by gradio_interface)
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(label="Input Image"),
    outputs=[
        gr.Image(label="Input Image"),
        gr.Image(label="Raw Depth Map"),
        gr.Image(label="Colored Depth Map"),
        gr.Image(label="Intensity Map"),
        gr.Image(label="Blended Result"),
        gr.Image(label="Normal Map")
    ],
    title="Depth, Intensity, and Normal Map Estimation",
    description="Upload an image to generate its depth map, intensity map, blended result, and normal map.",
    examples=["image.jpg"]
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
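
# Optional standalone sanity check (a sketch, not part of the app): exercises the
# intensity and normal-map helpers defined above without the depth model or Gradio.
# Assumes a local test file "image.jpg" and output name "normal_preview.png"
# (both illustrative); uncomment to try it.
#
#   img = cv2.cvtColor(cv2.imread("image.jpg"), cv2.COLOR_BGR2RGB)
#   intensity = get_image_intensity(img, gamma_correction=1.0)
#   normal = process_normal_map(intensity)
#   cv2.imwrite("normal_preview.png", cv2.cvtColor(normal, cv2.COLOR_RGB2BGR))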