liuyuan-pal committed
Commit 36a325d
1 Parent(s): ab287b7
Files changed (4)
  1. app.py +7 -9
  2. ckpt/sam_vit_h_4b8939.pth +3 -0
  3. requirements.txt +1 -0
  4. sam_utils.py +50 -0
app.py CHANGED
@@ -9,6 +9,7 @@ import fire
 from omegaconf import OmegaConf
 
 from ldm.util import add_margin, instantiate_from_config
+from sam_utils import sam_init, sam_out_nosave
 
 _TITLE = '''SyncDreamer: Generating Multiview-consistent Images from a Single-view Image'''
 _DESCRIPTION = '''
@@ -31,12 +32,6 @@ _USER_GUIDE3 = "Generated multiview images are shown below!"
 
 deployed = True
 
-def mask_prediction(mask_predictor, image_in: Image.Image):
-    if image_in.mode=='RGBA':
-        return image_in
-    else:
-        raise NotImplementedError
-
 def resize_inputs(image_input, crop_size):
     alpha_np = np.asarray(image_input)[:, :, 3]
     coords = np.stack(np.nonzero(alpha_np), 1)[:, (1, 0)]
@@ -58,6 +53,8 @@ def generate(model, batch_view_num, sample_num, cfg_scale, seed, image_input, el
     # prepare data
     image_input = np.asarray(image_input)
     image_input = image_input.astype(np.float32) / 255.0
+    alpha_values = image_input[:, :, 3:]
+    image_input[:, :, :3] = alpha_values * image_input[:, :, :3] + 1 - alpha_values # white background
     image_input = image_input[:, :, :3] * 2.0 - 1.0
     image_input = torch.from_numpy(image_input.astype(np.float32))
     elevation_input = torch.from_numpy(np.asarray([np.deg2rad(elevation_input)], np.float32))
@@ -103,7 +100,8 @@ def run_demo():
         model = None
 
     # init sam model
-    mask_predictor = None # sam_init(device_idx)
+    mask_predictor = sam_init()
+    mask_predict_fn = lambda x: sam_out_nosave(mask_predictor, x)
 
     # with open('instructions_12345.md', 'r') as f:
     #     article = f.read()
@@ -144,7 +142,7 @@
                 fig0 = gr.Image(value=Image.open('assets/crop_size.jpg'), type='pil', image_mode='RGB', height=256, show_label=False, tool=None, interactive=False)
 
             with gr.Column(scale=1):
-                input_block = gr.Image(type='pil', image_mode='RGB', label="Input to SyncDreamer", height=256, interactive=False)
+                input_block = gr.Image(type='pil', image_mode='RGBA', label="Input to SyncDreamer", height=256, interactive=False)
                 elevation = gr.Slider(-10, 40, 30, step=5, label='Elevation angle', interactive=True)
                 cfg_scale = gr.Slider(1.0, 5.0, 2.0, step=0.1, label='Classifier free guidance', interactive=True)
                 # sample_num = gr.Slider(1, 2, 2, step=1, label='Sample Num', interactive=True, info='How many instance (16 images per instance)')
@@ -156,7 +154,7 @@
             output_block = gr.Image(type='pil', image_mode='RGB', label="Outputs of SyncDreamer", height=256, interactive=False)
 
         update_guide = lambda GUIDE_TEXT: gr.update(value=GUIDE_TEXT)
-        image_block.change(fn=partial(mask_prediction, mask_predictor), inputs=[image_block], outputs=[sam_block], queue=False)\
+        image_block.change(fn=mask_predict_fn, inputs=[image_block], outputs=[sam_block], queue=False)\
                    .success(fn=partial(update_guide, _USER_GUIDE1), outputs=[guide_text], queue=False)
 
         crop_size_slider.change(fn=resize_inputs, inputs=[sam_block, crop_size_slider], outputs=[input_block], queue=False)\
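
For readers skimming the diff: the two lines added to generate composite the RGBA input onto a white background before normalising to [-1, 1], so transparent pixels become white rather than black. A minimal standalone sketch of that step (the helper name composite_on_white is illustrative, not part of the repo; it assumes an RGBA image such as the one produced by sam_out_nosave):

import numpy as np
from PIL import Image

def composite_on_white(image_rgba):
    # Blend the RGBA image onto a white background, then scale RGB to [-1, 1].
    img = np.asarray(image_rgba).astype(np.float32) / 255.0  # H x W x 4, values in [0, 1]
    alpha = img[:, :, 3:]                                     # trailing axis kept for broadcasting
    rgb = alpha * img[:, :, :3] + (1.0 - alpha)               # fully transparent pixels -> pure white
    return rgb * 2.0 - 1.0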
ckpt/sam_vit_h_4b8939.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7bf3b02f3ebf1267aba913ff637d9a2d5c33d3173bb679e46d9f338c26f262e
+size 2564550879
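
The checkpoint itself is stored in Git LFS, so the repository only tracks the pointer above (spec version, SHA-256 of the blob, byte size). As an aside, a downloaded copy can be checked against those values with a small illustrative script:

import hashlib
import os

def sha256_of(path, chunk_size=1 << 20):
    # Hash the file in chunks so the ~2.5 GB checkpoint is never held in memory at once.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

path = "ckpt/sam_vit_h_4b8939.pth"  # path as laid out in this commit
assert os.path.getsize(path) == 2564550879
assert sha256_of(path) == "a7bf3b02f3ebf1267aba913ff637d9a2d5c33d3173bb679e46d9f338c26f262e"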
requirements.txt CHANGED
@@ -20,4 +20,5 @@ easydict
 nerfacc
 imageio-ffmpeg==0.4.7
 fire
+segment_anything
 git+https://github.com/openai/CLIP.git
sam_utils.py ADDED
@@ -0,0 +1,50 @@
+import os
+import numpy as np
+import torch
+from PIL import Image
+import time
+
+from segment_anything import sam_model_registry, SamPredictor
+
+def sam_init(device_id=0):
+    sam_checkpoint = os.path.join(os.path.dirname(__file__), "ckpt/sam_vit_h_4b8939.pth")
+    model_type = "vit_h"
+
+    device = "cuda:{}".format(device_id) if torch.cuda.is_available() else "cpu"
+
+    sam = sam_model_registry[model_type](checkpoint=sam_checkpoint).to(device=device)
+    predictor = SamPredictor(sam)
+    return predictor
+
+def sam_out_nosave(predictor, input_image, bbox_sliders=(0,0,255,255)):
+    bbox = np.array(bbox_sliders)
+    image = np.asarray(input_image)
+
+    start_time = time.time()
+    predictor.set_image(image)
+
+    h, w, _ = image.shape
+    input_point = np.array([[h//2, w//2]])
+    input_label = np.array([1])
+
+    masks, scores, logits = predictor.predict(
+        point_coords=input_point,
+        point_labels=input_label,
+        multimask_output=True,
+    )
+
+    masks_bbox, scores_bbox, logits_bbox = predictor.predict(
+        box=bbox,
+        multimask_output=True
+    )
+
+    print(f"SAM Time: {time.time() - start_time:.3f}s")
+    opt_idx = np.argmax(scores)
+    mask = masks[opt_idx]
+    out_image = np.zeros((image.shape[0], image.shape[1], 4), dtype=np.uint8)
+    out_image[:, :, :3] = image
+    out_image_bbox = out_image.copy()
+    out_image[:, :, 3] = mask.astype(np.uint8) * 255
+    out_image_bbox[:, :, 3] = masks_bbox[-1].astype(np.uint8) * 255 # np.argmax(scores_bbox)
+    torch.cuda.empty_cache()
+    return Image.fromarray(out_image_bbox, mode='RGBA')
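
For orientation, this is roughly how the two helpers are wired together in app.py above. The file name example.png is a placeholder, and note that the default bbox_sliders=(0, 0, 255, 255) implies an input of roughly 256x256 pixels:

from PIL import Image
from sam_utils import sam_init, sam_out_nosave

predictor = sam_init()                               # loads ckpt/sam_vit_h_4b8939.pth onto cuda:0 if available
image = Image.open("example.png").convert("RGB")     # placeholder input; any RGB object photo
rgba = sam_out_nosave(predictor, image)              # RGBA output; alpha channel is the box-prompted SAM mask
rgba.save("example_rgba.png")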