import spaces  # kept as the very first import, exactly as in the original file

import argparse
import logging
import os
import random
import shlex
import subprocess
import tempfile
import time
from functools import partial

import gradio as gr
import numpy as np
import rembg
import torch
from PIL import Image

# The torchmcubes wheel is installed at start-up, before the `tsr` imports
# below (presumably because `tsr` depends on it -- confirm). A failed install
# is only logged, not raised, so an environment that already has the package
# keeps working as before.
_wheel_install = subprocess.run(
    shlex.split('pip install wheel/torchmcubes-0.1.0-cp310-cp310-linux_x86_64.whl')
)
if _wheel_install.returncode != 0:
    logging.warning(
        "pip install of the torchmcubes wheel exited with code %s",
        _wheel_install.returncode,
    )

from tsr.system import TSR
from tsr.utils import remove_background, resize_foreground, to_gradio_3d_orientation
from src.scheduler_perflow import PeRFlowScheduler
from diffusers import StableDiffusionPipeline, UNet2DConditionModel


def fill_background(img):
    """Composite an RGBA image onto a uniform 0.5 (mid-grey) background.

    Args:
        img: Image convertible with ``np.array`` to an ``(H, W, 4)`` array of
            0-255 values; the fourth channel is used as the alpha mask.

    Returns:
        A ``PIL.Image`` with three channels where transparent regions have
        been replaced by grey.
    """
    img = np.array(img).astype(np.float32) / 255.0
    alpha = img[:, :, 3:4]
    img = img[:, :, :3] * alpha + (1 - alpha) * 0.5
    return Image.fromarray((img * 255.0).astype(np.uint8))


def merge_delta_weights_into_unet(pipe, delta_weights, org_alpha=1.0):
    """Add ``delta_weights`` onto the UNet weights of ``pipe`` in place.

    For every key in ``delta_weights`` the new weight is
    ``org_alpha * original + delta``, computed in the delta's dtype and cast
    back to the original weight's dtype.

    Args:
        pipe: Pipeline object exposing a ``unet`` module.
        delta_weights: Mapping from UNet state-dict keys to delta tensors.
        org_alpha: Scale applied to the original weight before adding the delta.

    Returns:
        The same ``pipe`` object, with the merged weights loaded into its UNet.

    Raises:
        KeyError: If a key of ``delta_weights`` is absent from the UNet
            state dict.
    """
    unet_weights = pipe.unet.state_dict()
    for key, delta in delta_weights.items():
        base = unet_weights[key]
        original_dtype = base.dtype
        try:
            merged = org_alpha * base.to(dtype=delta.dtype) + delta.to(device=base.device)
        except (RuntimeError, TypeError) as err:
            # Best effort, as before: a delta that cannot be added (e.g. shape
            # mismatch) leaves the weight effectively unchanged. Only tensor
            # errors are caught so Ctrl-C and real bugs still propagate.
            logging.warning("Could not merge delta weight %r: %s", key, err)
            merged = base.to(dtype=delta.dtype)
        unet_weights[key] = merged.to(original_dtype)
    pipe.unet.load_state_dict(unet_weights, strict=True)
    return pipe


def setup_seed(seed):
    """Seed Python, NumPy and torch (CPU and all CUDA devices) RNGs.

    Also switches cuDNN to deterministic mode.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True


device = "cuda:0" if torch.cuda.is_available() else "cpu"

### TripoSR
model = TSR.from_pretrained(
    "stabilityai/TripoSR",
    config_name="config.yaml",
    weight_name="model.ckpt",
)
# adjust the chunk size to balance between speed and memory usage
model.renderer.set_chunk_size(8192)
model.to(device)

### PeRFlow-T2I
# pipe_t2i = StableDiffusionPipeline.from_pretrained("Lykon/dreamshaper-8", torch_dtype=torch.float16, safety_checker=None)
# pipe_t2i = StableDiffusionPipeline.from_pretrained("stablediffusionapi/disney-pixar-cartoon", torch_dtype=torch.float16, safety_checker=None)
# delta_weights = UNet2DConditionModel.from_pretrained("hansyan/piecewise-rectified-flow-delta-weights", torch_dtype=torch.float16, variant="v0-1",).state_dict()
# pipe_t2i = merge_delta_weights_into_unet(pipe_t2i, delta_weights)

pipe_t2i = StableDiffusionPipeline.from_pretrained("hansyan/perflow-sd15-disney", torch_dtype=torch.float16, safety_checker=None)
pipe_t2i.scheduler = PeRFlowScheduler.from_config(pipe_t2i.scheduler.config, prediction_type="epsilon", num_time_windows=4)
# NOTE(review): the pipeline is always moved to 'cuda:0', independent of the
# module-level `device` used for the TripoSR model -- confirm this is intended.
pipe_t2i.to('cuda:0', torch.float16)


### gradio
rembg_session = rembg.new_session()


@spaces.GPU
def generate(text, seed):
    """Generate one 512x512 image for ``text`` with the PeRFlow T2I pipeline.

    Args:
        text: User prompt; a fixed quality prefix is prepended to it.
        seed: Random seed, given as anything ``int()`` accepts (the UI passes
            the Textbox string).

    Returns:
        A ``PIL.Image`` built from the first three channels of the sample.

    Raises:
        gr.Error: If ``seed`` cannot be converted to an integer.
    """
    try:
        seed_value = int(seed)
    except (TypeError, ValueError):
        # The seed comes from a free-text box; report bad input readably.
        raise gr.Error("Random Seed must be an integer.") from None
    setup_seed(seed_value)

    prompt_prefix = "high quality, highly detailed, (best quality, masterpiece), "
    neg_prompt = "EasyNegative, drawn by bad-artist, sketch by bad-artist-anime, (bad_prompt:0.8), (artist name, signature, watermark:1.4), (ugly:1.2), (worst quality, poor details:1.4), bad-hands-5, badhandv4, blurry"
    text = prompt_prefix + text

    samples = pipe_t2i(
        prompt=[text],
        negative_prompt=[neg_prompt],
        height=512,
        width=512,
        # num_inference_steps = 6,
        # guidance_scale = 7.5,
        num_inference_steps=8,
        guidance_scale=7.5,
        output_type='pt',
    ).images

    # Drop the batch axis and move channels last before scaling to 0-255.
    pixels = samples.squeeze(0).permute(1, 2, 0).cpu().numpy() * 255.
    pixels = pixels.astype(np.uint8)
    return Image.fromarray(pixels[:, :, :3])


@spaces.GPU
def render(image, mc_resolution=256, formats=("obj",)):
    """Reconstruct a 3D mesh from ``image`` and export it to temporary files.

    The image is resized to 768x768, its background removed and replaced by
    grey, then passed to the TripoSR model; the extracted mesh is written
    once per entry in ``formats``.

    Args:
        image: Array accepted by ``Image.fromarray`` (the UI passes the
            generated image).
        mc_resolution: Resolution forwarded to ``model.extract_mesh``.
        formats: File extensions (without dot) to export the mesh to.

    Returns:
        Path of the file exported for the first entry of ``formats``.
    """
    image = Image.fromarray(image)
    image = image.resize((768, 768))
    image = remove_background(image, rembg_session)
    image = resize_foreground(image, 0.85)
    image = fill_background(image)

    scene_codes = model(image, device=device)
    mesh = model.extract_mesh(scene_codes, resolution=mc_resolution)[0]
    mesh = to_gradio_3d_orientation(mesh)

    exported_paths = []
    for fmt in formats:
        # Only the unique file name is needed; close the handle right away so
        # it is not leaked. delete=False keeps the file for Gradio to serve.
        with tempfile.NamedTemporaryFile(suffix=f".{fmt}", delete=False) as tmp:
            mesh_path = tmp.name
        mesh.export(mesh_path)
        exported_paths.append(mesh_path)
    return exported_paths[0]


# layout
css = """
h1 {
    text-align: center;
    display:block;
}
h2 {
    text-align: center;
    display:block;
}
h3 {
    text-align: center;
    display:block;
}
"""

# Markdown shown above the demo.
HEADER_MD = """
# Instant Text-to-3D Mesh Demo

### [PeRFlow](https://github.com/magic-research/piecewise-rectified-flow)-T2I + [TripoSR](https://github.com/VAST-AI-Research/TripoSR)

Two-stage synthesis: 1) generating images by PeRFlow-T2I; 2) rendering 3D assests. Here, we plug the PeRFlow-delta-weights of SD-v1.5 into the Disney-Pixar-Cartoon dreambooth.
"""

# Markdown shown below the input boxes.
EXAMPLES_MD = """
Images should be generated within 1 second normally, sometimes, it could a bit slow due to warm-up of the program.

Here are some examples provided:
- a policeman
- a robot, close-up
- a red car, side view
- a blue mug
- a burger
- a tea pot
- a wooden chair
- a unicorn
"""

with gr.Blocks(title="TripoSR", css=css) as interface:
    gr.Markdown(HEADER_MD)

    with gr.Column():
        with gr.Row():
            output_image = gr.Image(label='Generated Image', height=384,)
            output_model_obj = gr.Model3D(
                label="Output 3D Model (OBJ Format)",
                interactive=False,
                height=384,
            )

        with gr.Row():
            textbox = gr.Textbox(label="Input Prompt", value="a husky dog")
            seed = gr.Textbox(label="Random Seed", value=42)

        gr.Markdown(EXAMPLES_MD)

    # activate: submitting either box generates the image, then (only if that
    # succeeded) renders the mesh from it.
    for submit_event in (textbox.submit, seed.submit):
        submit_event(
            fn=generate,
            inputs=[textbox, seed],
            outputs=[output_image],
        ).success(
            fn=render,
            inputs=[output_image],
            outputs=[output_model_obj],
        )


if __name__ == '__main__':
    interface.queue(max_size=10)
    interface.launch()