Spaces:
Running
on
Zero
Running
on
Zero
import functools | |
import os | |
import shutil | |
import sys | |
import git | |
import gradio as gr | |
import numpy as np | |
import torch as torch | |
from PIL import Image | |
print(torch.version.cuda) | |
os.system('libcusolver.so.11') | |
from gradio_imageslider import ImageSlider | |
from bilateral_normal_integration.bilateral_normal_integration_cupy import bilateral_normal_integration_function | |
import spaces | |
import fire | |
import argparse | |
import os | |
import logging | |
import numpy as np | |
import torch | |
from PIL import Image | |
from tqdm.auto import tqdm | |
import glob | |
import json | |
import cv2 | |
from rembg import remove | |
from segment_anything import sam_model_registry, SamPredictor | |
from datetime import datetime | |
import time | |
import sys | |
sys.path.append("../") | |
from models.geowizard_pipeline import DepthNormalEstimationPipeline | |
from utils.seed_all import seed_all | |
import matplotlib.pyplot as plt | |
from utils.de_normalized import align_scale_shift | |
from utils.depth2normal import * | |
from diffusers import DiffusionPipeline, DDIMScheduler, AutoencoderKL | |
from models.unet_2d_condition import UNet2DConditionModel | |
from transformers import CLIPTextModel, CLIPTokenizer | |
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection | |
import torchvision.transforms.functional as TF | |
from torchvision.transforms import InterpolationMode | |
# ---------------------------------------------------------------------------
# Module-level model setup: every pretrained component is loaded once at
# import time and assembled into a single depth/normal estimation pipeline.
# ---------------------------------------------------------------------------

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The VAE and the DDIM scheduler are taken from the Stable Diffusion unCLIP repo.
stable_diffusion_repo_path = "stabilityai/stable-diffusion-2-1-unclip"
vae = AutoencoderKL.from_pretrained(stable_diffusion_repo_path, subfolder='vae')
scheduler = DDIMScheduler.from_pretrained(stable_diffusion_repo_path, subfolder='scheduler')

# The CLIP image encoder and its preprocessor come from the image-variations repo.
sd_image_variations_diffusers_path = 'lambdalabs/sd-image-variations-diffusers'
image_encoder = CLIPVisionModelWithProjection.from_pretrained(sd_image_variations_diffusers_path, subfolder="image_encoder")
feature_extractor = CLIPImageProcessor.from_pretrained(sd_image_variations_diffusers_path, subfolder="feature_extractor")

# The UNet weights are read from the "unet" subfolder of the current directory.
unet = UNet2DConditionModel.from_pretrained('.', subfolder="unet")

pipe = DepthNormalEstimationPipeline(vae=vae,
                                     image_encoder=image_encoder,
                                     feature_extractor=feature_extractor,
                                     unet=unet,
                                     scheduler=scheduler)

# xformers is an optional speed-up. Activation stays best-effort, but only
# ordinary exceptions are swallowed (a bare ``except`` would also trap
# KeyboardInterrupt/SystemExit) and the reason is logged instead of hidden.
try:
    import xformers  # noqa: F401  (imported only to check that it is installed)
    pipe.enable_xformers_memory_efficient_attention()
except Exception as exc:
    logging.warning("Running without xformers memory-efficient attention: %s", exc)

pipe = pipe.to(device)
def sam_init():
    """Create a Segment-Anything predictor backed by the ViT-L checkpoint.

    The checkpoint is expected at ``sam_pt/sam_vit_l_0b3195.pth`` next to this
    file. The model is always moved to the ``"cuda"`` device.

    Returns:
        SamPredictor: predictor wrapping the loaded SAM model.
    """
    checkpoint_path = os.path.join(os.path.dirname(__file__), "sam_pt", "sam_vit_l_0b3195.pth")
    build_sam = sam_model_registry["vit_l"]
    sam_model = build_sam(checkpoint=checkpoint_path)
    sam_model = sam_model.to(device="cuda")
    return SamPredictor(sam_model)


# Single predictor instance shared by every reconstruction request.
sam_predictor = sam_init()
def sam_segment(predictor, input_image, *bbox_coords):
    """Segment the content of a bounding box with SAM.

    Args:
        predictor: object exposing ``set_image`` and ``predict`` (the caller in
            this file passes a ``SamPredictor``).
        input_image: image convertible with ``np.asarray``; it is copied into
            the three colour channels of the output, so it is presumably an
            H x W x 3 RGB image - confirm against callers.
        *bbox_coords: box coordinates forwarded to the predictor as one array
            (the caller in this file passes ``x_min, y_min, x_max, y_max``).

    Returns:
        tuple: ``(rgba_image, masks)`` where ``rgba_image`` is a PIL RGBA image
        whose alpha channel is the last predicted mask scaled to 0/255, and
        ``masks`` is the full mask array returned by the predictor.
    """
    pixels = np.asarray(input_image)
    box = np.array(bbox_coords)

    start_time = time.time()
    predictor.set_image(pixels)
    # Scores and logits are returned by the predictor but not used here.
    masks, _scores, _logits = predictor.predict(box=box, multimask_output=True)
    print(f"SAM Time: {time.time() - start_time:.3f}s")

    # Assemble the RGBA result: colour from the input, alpha from the last mask.
    height, width = pixels.shape[0], pixels.shape[1]
    rgba = np.zeros((height, width, 4), dtype=np.uint8)
    rgba[:, :, :3] = pixels
    rgba[:, :, 3] = masks[-1].astype(np.uint8) * 255

    torch.cuda.empty_cache()
    return Image.fromarray(rgba, mode='RGBA'), masks
def depth_normal(img_path,
                 denoising_steps,
                 ensemble_size,
                 processing_res,
                 seed,
                 domain):
    """Estimate depth and normals for one image and save the raw arrays.

    Args:
        img_path: path of the input image (Gradio passes ``None`` when no
            image has been provided).
        denoising_steps: forwarded to the pipeline as ``denoising_steps``.
        ensemble_size: forwarded to the pipeline as ``ensemble_size``.
        processing_res: forwarded to the pipeline as ``processing_res``.
        seed: random seed; a negative or empty value leaves torch's RNG
            unseeded.
        domain: forwarded to the pipeline as ``domain``.

    Returns:
        tuple: ``(depth_colored, normal_colored, [depth_path, normal_path])``,
        the two colourised pipeline outputs plus the paths of the saved
        ``.npy`` files (depth first, normal second).

    Raises:
        gr.Error: if no input image was supplied.
    """
    if not img_path:
        # Without this guard a click on "Generate" with no image fails deep
        # inside PIL with an unhelpful traceback.
        raise gr.Error("Please provide an input image before generating.")

    # A cleared number field arrives as None; treat it like "no seed given".
    seed = -1 if seed is None else int(seed)
    if seed >= 0:
        torch.manual_seed(seed)

    # The context manager guarantees the image file handle is released.
    with Image.open(img_path) as img:
        pipe_out = pipe(
            img,
            denoising_steps=denoising_steps,
            ensemble_size=ensemble_size,
            processing_res=processing_res,
            batch_size=0,  # NOTE(review): presumably "let the pipeline choose" - confirm
            domain=domain,
            show_progress_bar=True,
        )

    depth_colored = pipe_out.depth_colored
    normal_colored = pipe_out.normal_colored
    depth_np = pipe_out.depth_np
    normal_np = pipe_out.normal_np

    # Results go to "<image name><timestamp>/" relative to the working directory.
    name_base = os.path.splitext(os.path.basename(img_path))[0]
    path_output_dir = name_base + datetime.now().strftime('%Y%m%d-%H%M%S')
    os.makedirs(path_output_dir, exist_ok=True)

    depth_path = os.path.join(path_output_dir, f"{name_base}_depth.npy")
    normal_path = os.path.join(path_output_dir, f"{name_base}_normal.npy")
    np.save(normal_path, normal_np)
    np.save(depth_path, depth_np)

    return depth_colored, normal_colored, [depth_path, normal_path]
def reconstruction(image, files):
    """Reconstruct a surface of the foreground object from saved normals.

    The foreground bounding box is taken from the alpha channel produced by
    ``rembg.remove``, refined into a mask with SAM, and the normal map is then
    integrated with bilateral normal integration. The result is written as an
    ASCII ``.ply`` next to the input ``.npy`` files.

    Args:
        image: path of the input image.
        files: sequence whose first entry is the ``*_depth.npy`` path and whose
            second entry is the ``*_normal.npy`` path (the order produced by
            ``depth_normal``).

    Returns:
        list: single-element list holding the path of the written ``.ply``.

    Raises:
        gr.Error: if the image or the depth/normal files are missing, or if no
            foreground pixels are found.
    """
    if not image:
        raise gr.Error("Please provide an input image first.")
    if not files or len(files) < 2:
        # Happens when "Reconstruct" is clicked before "Generate".
        raise gr.Error("Please generate depth and normal before reconstructing.")

    torch.cuda.empty_cache()

    # Close the image file as soon as both derived images exist.
    with Image.open(image) as img:
        image_rgb = img.convert('RGB')
        image_nobg = remove(img.convert('RGBA'), alpha_matting=True)

    # Bounding box of all pixels with non-zero alpha.
    alpha = np.asarray(image_nobg)[:, :, -1]
    cols = np.nonzero(alpha.sum(axis=0))[0]
    rows = np.nonzero(alpha.sum(axis=1))[0]
    if cols.size == 0 or rows.size == 0:
        # min()/max() on an empty array would raise an opaque ValueError.
        raise gr.Error("No salient foreground object was found in the image.")
    x_min, x_max = int(cols.min()), int(cols.max())
    y_min, y_max = int(rows.min()), int(rows.max())

    _, mask = sam_segment(sam_predictor, image_rgb, x_min, y_min, x_max, y_max)
    mask_output_temp = mask[-1]

    depth_file, normal_file = files[0], files[1]
    normal_np = np.load(normal_file)
    dir_name = os.path.dirname(os.path.realpath(depth_file))
    # Drop the trailing "_depth" (6 characters) to recover the image base name.
    name_base = os.path.splitext(os.path.basename(depth_file))[0][:-6]

    # Flip the sign of the first normal component before integration.
    # NOTE(review): presumably a coordinate-convention change - confirm.
    normal_np[:, :, 0] *= -1
    _, surface, _, _, _ = bilateral_normal_integration_function(
        normal_np, mask_output_temp,
        k=2, K=None, max_iter=100, tol=1e-4, cg_max_iter=5000, cg_tol=1e-3)

    ply_path = os.path.join(dir_name, f"{name_base}_recon.ply")
    surface.save(ply_path, binary=False)
    return [ply_path]
def run_demo():
    """Build the Gradio interface, wire its two buttons and launch it.

    "Generate" calls ``depth_normal`` and fills the two image panes plus the
    downloadable ``.npy`` files; the reconstruction button calls
    ``reconstruction`` and offers the resulting ``.ply`` for download. The app
    is launched with a queue, ``share=True`` and ``max_threads=80``.

    NOTE(review): the container nesting below was reconstructed from the
    statement order - confirm it against the intended layout.
    """
    theme = gr.themes.Soft(primary_hue="blue").set(
        button_secondary_background_fill="*neutral_100",
        button_secondary_background_fill_hover="*neutral_200")
    css = '''#disp_image {
text-align: center; /* Horizontally center the content */
}'''

    title = '''GeoWizard: Unleashing the Diffusion Priors for 3D Geometry Estimation from a Single Image'''
    description = '''
<div>
Generate consistent depth and normal from single image. High quality and rich details. (PS: We find the demo running on ZeroGPU output slightly inferior results compared to A100 or 3060 with everything exactly the same.)
<a style="display:inline-block; margin-left: .5em" href='https://github.com/fuxiao0719/GeoWizard/'><img src='https://img.shields.io/github/stars/fuxiao0719/GeoWizard?style=social' /></a>
</div>
'''

    with gr.Blocks(title=title, theme=theme, css=css) as demo:
        # --- Header -------------------------------------------------------
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown('# ' + title)
        gr.Markdown(description)

        # --- Input image (left) and options (right) -----------------------
        with gr.Row(variant='panel'):
            with gr.Column(scale=1):
                input_image = gr.Image(type='filepath', height=320, label='Input image')

                # Every entry of the bundled "files" folder becomes an example.
                examples_dir = os.path.join(os.path.dirname(__file__), "./files")
                example_paths = [
                    os.path.join(examples_dir, entry)
                    for entry in os.listdir(examples_dir)
                ]
                gr.Examples(
                    examples=example_paths,
                    inputs=[input_image],
                    cache_examples=False,
                    label='Examples (click one of the images below to start)',
                    examples_per_page=30,
                )

            with gr.Column(scale=1):
                with gr.Accordion('Advanced options', open=True):
                    with gr.Column():
                        domain = gr.Radio(
                            [
                                ("Outdoor", "outdoor"),
                                ("Indoor", "indoor"),
                                ("Object", "object"),
                            ],
                            label="Data Type (Must Select One matches your image)",
                            value="indoor",
                        )
                        denoising_steps = gr.Slider(
                            label="Number of denoising steps (More steps, better quality)",
                            minimum=1,
                            maximum=50,
                            step=1,
                            value=10,
                        )
                        ensemble_size = gr.Slider(
                            label="Ensemble size (More steps, higher accuracy)",
                            minimum=1,
                            maximum=15,
                            step=1,
                            value=3,
                        )
                        seed = gr.Number(0, label='Random Seed. Negative values for not specifying')
                        processing_res = gr.Radio(
                            [
                                ("Native", 0),
                                ("Recommended", 768),
                            ],
                            label="Processing resolution",
                            value=768,
                        )

                run_btn = gr.Button('Generate', variant='primary', interactive=True)

        # --- Outputs ------------------------------------------------------
        with gr.Row():
            with gr.Column():
                depth_view = gr.Image(interactive=False, show_label=False)
            with gr.Column():
                normal_view = gr.Image(interactive=False, show_label=False)

        with gr.Row():
            npy_files = gr.Files(
                label="Depth and Normal (numpy)",
                elem_id="download",
                interactive=False,
            )

        with gr.Row():
            recon_btn = gr.Button('Is there a salient foreground object? If yes, Click here to Reconstruct its 3D model.', variant='primary', interactive=True)

        with gr.Row():
            # The mesh is offered as a file download rather than a 3D viewer.
            ply_files = gr.Files(
                label="Bini post-processed 3D model (plyfile)",
                elem_id="download",
                interactive=False,
            )

        # --- Event wiring -------------------------------------------------
        generate_inputs = [
            input_image,
            denoising_steps,
            ensemble_size,
            processing_res,
            seed,
            domain,
        ]
        run_btn.click(fn=depth_normal,
                      inputs=generate_inputs,
                      outputs=[depth_view, normal_view, npy_files])
        recon_btn.click(fn=reconstruction,
                        inputs=[input_image, npy_files],
                        outputs=[ply_files])

    demo.queue().launch(share=True, max_threads=80)
# Script entry point: expose run_demo through the python-fire command line.
if __name__ == "__main__":
    fire.Fire(run_demo)