import sys
import os

# Fetch the StyleSDF code and install its remaining dependencies at runtime
os.system("git clone https://github.com/royorel/StyleSDF.git")
sys.path.append("StyleSDF")
os.system(f"{sys.executable} -m pip install -U fvcore")

import torch

# Build the PyTorch3D wheel tag that matches the installed Python/CUDA/PyTorch versions
pyt_version_str = torch.__version__.split("+")[0].replace(".", "")
version_str = "".join([
    f"py3{sys.version_info.minor}_cu",
    torch.version.cuda.replace(".", ""),
    f"_pyt{pyt_version_str}"
])
os.system(f"{sys.executable} -m pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html")

from download_models import download_pretrained_models

download_pretrained_models()

import trimesh
import numpy as np
from munch import *
from PIL import Image
from tqdm import tqdm
from torch.nn import functional as F
from torch.utils import data
from torchvision import utils
from torchvision import transforms
from skimage.measure import marching_cubes
from scipy.spatial import Delaunay
from options import BaseOptions
from model import Generator
from utils import (
    generate_camera_params,
    align_volume,
    extract_mesh_with_marching_cubes,
    xyz2mesh,
    create_cameras,
    create_mesh_renderer,
    add_textures,
)
from pytorch3d.structures import Meshes
from pdb import set_trace as st
import skvideo.io


def generate(opt, g_ema, surface_g_ema, device, mean_latent, surface_mean_latent):
    g_ema.eval()
    if not opt.no_surface_renderings:
        surface_g_ema.eval()

    # set camera angles
    if opt.fixed_camera_angles:
        # These can be changed to any other specific viewpoints.
        # You can add or remove viewpoints as you wish
        locations = torch.tensor([[0, 0],
                                  [-1.5 * opt.camera.azim, 0],
                                  [-1 * opt.camera.azim, 0],
                                  [-0.5 * opt.camera.azim, 0],
                                  [0.5 * opt.camera.azim, 0],
                                  [1 * opt.camera.azim, 0],
                                  [1.5 * opt.camera.azim, 0],
                                  [0, -1.5 * opt.camera.elev],
                                  [0, -1 * opt.camera.elev],
                                  [0, -0.5 * opt.camera.elev],
                                  [0, 0.5 * opt.camera.elev],
                                  [0, 1 * opt.camera.elev],
                                  [0, 1.5 * opt.camera.elev]], device=device)

        # For zooming in/out change the values of fov
        # (This can be defined for each view separately via a custom tensor,
        # like the locations tensor above.
        # Tensor shape should be [locations.shape[0], 1])
        # reasonable values are [0.75 * opt.camera.fov, 1.25 * opt.camera.fov]
        fov = opt.camera.fov * torch.ones((locations.shape[0], 1), device=device)
        num_viewdirs = locations.shape[0]
    else:  # draw random camera angles
        locations = None
        # fov = None
        fov = opt.camera.fov
        num_viewdirs = opt.num_views_per_id

    # default return values (only populated when surface renderings are enabled)
    depth_mesh, marching_cubes_mesh = None, None

    # generate images
    for i in tqdm(range(opt.identities)):
        with torch.no_grad():
            chunk = 8
            sample_z = torch.randn(1, opt.style_dim, device=device).repeat(num_viewdirs, 1)
            sample_cam_extrinsics, sample_focals, sample_near, sample_far, sample_locations = \
                generate_camera_params(opt.renderer_output_size, device, batch=num_viewdirs,
                                       locations=locations,
                                       # input_fov=fov,
                                       uniform=opt.camera.uniform,
                                       azim_range=opt.camera.azim,
                                       elev_range=opt.camera.elev,
                                       fov_ang=fov,
                                       dist_radius=opt.camera.dist_radius)
            rgb_images = torch.Tensor(0, 3, opt.size, opt.size)
            rgb_images_thumbs = torch.Tensor(0, 3, opt.renderer_output_size, opt.renderer_output_size)
            for j in range(0, num_viewdirs, chunk):
                out = g_ema([sample_z[j:j+chunk]],
                            sample_cam_extrinsics[j:j+chunk],
                            sample_focals[j:j+chunk],
                            sample_near[j:j+chunk],
                            sample_far[j:j+chunk],
                            truncation=opt.truncation_ratio,
                            truncation_latent=mean_latent)

                rgb_images = torch.cat([rgb_images, out[0].cpu()], 0)
                rgb_images_thumbs = torch.cat([rgb_images_thumbs, out[1].cpu()], 0)

            utils.save_image(rgb_images,
                             os.path.join(opt.results_dst_dir, 'images', '{}.png'.format(str(i).zfill(7))),
                             nrow=num_viewdirs,
                             normalize=True,
                             padding=0,
                             value_range=(-1, 1))

            utils.save_image(rgb_images_thumbs,
                             os.path.join(opt.results_dst_dir, 'images', '{}_thumb.png'.format(str(i).zfill(7))),
                             nrow=num_viewdirs,
                             normalize=True,
                             padding=0,
                             value_range=(-1, 1))

            # this is done to fit to RTX2080 RAM size (11GB)
            del out
            torch.cuda.empty_cache()

            if not opt.no_surface_renderings:
                surface_chunk = 1
                scale = surface_g_ema.renderer.out_im_res / g_ema.renderer.out_im_res
                surface_sample_focals = sample_focals * scale
                for j in range(0, num_viewdirs, surface_chunk):
                    surface_out = surface_g_ema([sample_z[j:j+surface_chunk]],
                                                sample_cam_extrinsics[j:j+surface_chunk],
                                                surface_sample_focals[j:j+surface_chunk],
                                                sample_near[j:j+surface_chunk],
                                                sample_far[j:j+surface_chunk],
                                                truncation=opt.truncation_ratio,
                                                truncation_latent=surface_mean_latent,
                                                return_sdf=True,
                                                return_xyz=True)
                    xyz = surface_out[2].cpu()
                    sdf = surface_out[3].cpu()

                    # this is done to fit to RTX2080 RAM size (11GB)
                    del surface_out
                    torch.cuda.empty_cache()

                    # mesh extractions are done one at a time
                    for k in range(surface_chunk):
                        curr_locations = sample_locations[j:j+surface_chunk]
                        loc_str = '_azim{}_elev{}'.format(int(curr_locations[k, 0] * 180 / np.pi),
                                                          int(curr_locations[k, 1] * 180 / np.pi))

                        # Save depth outputs as meshes
                        depth_mesh_filename = os.path.join(opt.results_dst_dir, 'depth_map_meshes',
                                                           'sample_{}_depth_mesh{}.obj'.format(i, loc_str))
                        depth_mesh = xyz2mesh(xyz[k:k+surface_chunk])
                        if depth_mesh is not None:
                            with open(depth_mesh_filename, 'w') as f:
                                depth_mesh.export(f, file_type='obj')

                        # extract full geometry with marching cubes
                        if j == 0:
                            try:
                                frustum_aligned_sdf = align_volume(sdf)
                                marching_cubes_mesh = extract_mesh_with_marching_cubes(frustum_aligned_sdf[k:k+surface_chunk])
                            except ValueError:
                                marching_cubes_mesh = None
                                print('Marching cubes extraction failed.')
                                print('Please check whether the SDF values are all larger (or all smaller) than 0.')

    return depth_mesh, marching_cubes_mesh


# User options
def get_generate_vars(model_type):
    opt = BaseOptions().parse()
    opt.camera.uniform = True
    opt.model.is_test = True
    opt.model.freeze_renderer = False
    opt.rendering.offset_sampling = True
    opt.rendering.static_viewdirs = True
    opt.rendering.force_background = True
    opt.rendering.perturb = 0
    opt.inference.renderer_output_size = opt.model.renderer_spatial_output_dim
    opt.inference.style_dim = opt.model.style_dim
    opt.inference.project_noise = opt.model.project_noise

    # User options
    opt.inference.no_surface_renderings = False  # When true, only RGB images will be created
    opt.inference.fixed_camera_angles = False    # When true, each identity will be rendered from a specific set of 13 viewpoints. Otherwise, random views are generated
    opt.inference.identities = 1                 # Number of identities to generate
    opt.inference.num_views_per_id = 1           # Number of viewpoints generated per identity. This option is ignored if opt.inference.fixed_camera_angles is true.
    opt.inference.camera = opt.camera

    # Load saved model
    if model_type == 'ffhq':
        model_path = 'ffhq1024x1024.pt'
        opt.model.size = 1024
        opt.experiment.expname = 'ffhq1024x1024'
    else:
        opt.inference.camera.azim = 0.15
        model_path = 'afhq512x512.pt'
        opt.model.size = 512
        opt.experiment.expname = 'afhq512x512'

    # Create results directory
    result_model_dir = 'final_model'
    results_dir_basename = os.path.join(opt.inference.results_dir, opt.experiment.expname)
    opt.inference.results_dst_dir = os.path.join(results_dir_basename, result_model_dir)
    if opt.inference.fixed_camera_angles:
        opt.inference.results_dst_dir = os.path.join(opt.inference.results_dst_dir, 'fixed_angles')
    else:
        opt.inference.results_dst_dir = os.path.join(opt.inference.results_dst_dir, 'random_angles')
    os.makedirs(opt.inference.results_dst_dir, exist_ok=True)
    os.makedirs(os.path.join(opt.inference.results_dst_dir, 'images'), exist_ok=True)
    if not opt.inference.no_surface_renderings:
        os.makedirs(os.path.join(opt.inference.results_dst_dir, 'depth_map_meshes'), exist_ok=True)
        os.makedirs(os.path.join(opt.inference.results_dst_dir, 'marching_cubes_meshes'), exist_ok=True)

    opt.inference.size = opt.model.size
    checkpoint_path = os.path.join('full_models', model_path)
    checkpoint = torch.load(checkpoint_path)

    # Load image generation model
    g_ema = Generator(opt.model, opt.rendering).to(device)
    pretrained_weights_dict = checkpoint["g_ema"]
    model_dict = g_ema.state_dict()
    for k, v in pretrained_weights_dict.items():
        if v.size() == model_dict[k].size():
            model_dict[k] = v
    g_ema.load_state_dict(model_dict)

    # Load a second volume renderer that extracts surfaces at 128x128x128 (or higher) for better surface resolution
    if not opt.inference.no_surface_renderings:
        opt['surf_extraction'] = Munch()
        opt.surf_extraction.rendering = opt.rendering
        opt.surf_extraction.model = opt.model.copy()
        opt.surf_extraction.model.renderer_spatial_output_dim = 128
        opt.surf_extraction.rendering.N_samples = opt.surf_extraction.model.renderer_spatial_output_dim
        opt.surf_extraction.rendering.return_xyz = True
        opt.surf_extraction.rendering.return_sdf = True
        surface_g_ema = Generator(opt.surf_extraction.model, opt.surf_extraction.rendering, full_pipeline=False).to(device)

        # Load weights to surface extractor
        surface_extractor_dict = surface_g_ema.state_dict()
        for k, v in pretrained_weights_dict.items():
            if k in surface_extractor_dict.keys() and v.size() == surface_extractor_dict[k].size():
                surface_extractor_dict[k] = v
        surface_g_ema.load_state_dict(surface_extractor_dict)
    else:
        surface_g_ema = None

    # Get the mean latent vector for g_ema
    if opt.inference.truncation_ratio < 1:
        with torch.no_grad():
            mean_latent = g_ema.mean_latent(opt.inference.truncation_mean, device)
    else:
        mean_latent = None

    # Get the mean latent vector for surface_g_ema
    if not opt.inference.no_surface_renderings:
        surface_mean_latent = mean_latent[0]
    else:
        surface_mean_latent = None

    return opt.inference, g_ema, surface_g_ema, mean_latent, surface_mean_latent, opt.inference.results_dst_dir


def get_rendervideo_vars(model_type, number_frames):
    opt = BaseOptions().parse()
    opt.model.is_test = True
    opt.model.style_dim = 256
    opt.model.freeze_renderer = False
    opt.inference.size = opt.model.size
    opt.inference.camera = opt.camera
    opt.inference.renderer_output_size = opt.model.renderer_spatial_output_dim
    opt.inference.style_dim = opt.model.style_dim
    opt.inference.project_noise = opt.model.project_noise
    opt.rendering.perturb = 0
    opt.rendering.force_background = True
    opt.rendering.static_viewdirs = True
    opt.rendering.return_sdf = True
    opt.rendering.N_samples = 64
    opt.inference.identities = 1

    # Load saved model
    if model_type == 'ffhq':
        model_path = 'ffhq1024x1024.pt'
        opt.model.size = 1024
        opt.experiment.expname = 'ffhq1024x1024'
    else:
        opt.inference.camera.azim = 0.15
        model_path = 'afhq512x512.pt'
        opt.model.size = 512
        opt.experiment.expname = 'afhq512x512'

    opt.inference.size = opt.model.size

    # Create results directory
    result_model_dir = 'final_model'
    results_dir_basename = os.path.join(opt.inference.results_dir, opt.experiment.expname)
    opt.inference.results_dst_dir = os.path.join(results_dir_basename, result_model_dir)
    os.makedirs(opt.inference.results_dst_dir, exist_ok=True)
    os.makedirs(os.path.join(opt.inference.results_dst_dir, 'videos'), exist_ok=True)

    checkpoints_dir = './full_models'
    checkpoint_path = os.path.join('full_models', model_path)
    if os.path.isfile(checkpoint_path):
        # define results directory name
        result_model_dir = 'final_model'
        results_dir_basename = os.path.join(opt.inference.results_dir, opt.experiment.expname)
        opt.inference.results_dst_dir = os.path.join(results_dir_basename, result_model_dir, 'videos')
        if opt.model.project_noise:
            opt.inference.results_dst_dir = os.path.join(opt.inference.results_dst_dir, 'with_noise_projection')
        os.makedirs(opt.inference.results_dst_dir, exist_ok=True)
        print(checkpoint_path)

        # load saved model
        checkpoint = torch.load(checkpoint_path)

        # load image generation model
        g_ema = Generator(opt.model, opt.rendering).to(device)

        # temp fix because of wrong noise sizes
        pretrained_weights_dict = checkpoint["g_ema"]
        model_dict = g_ema.state_dict()
        for k, v in pretrained_weights_dict.items():
            if v.size() == model_dict[k].size():
                model_dict[k] = v
        g_ema.load_state_dict(model_dict)

        # load a second volume renderer that extracts surfaces at 128x128x128
        if not opt.inference.no_surface_renderings or opt.model.project_noise:
            opt['surf_extraction'] = Munch()
            opt.surf_extraction.rendering = opt.rendering
            opt.surf_extraction.model = opt.model.copy()
            opt.surf_extraction.model.renderer_spatial_output_dim = 128
            opt.surf_extraction.rendering.N_samples = opt.surf_extraction.model.renderer_spatial_output_dim
            opt.surf_extraction.rendering.return_xyz = True
            opt.surf_extraction.rendering.return_sdf = True
            opt.inference.surf_extraction_output_size = opt.surf_extraction.model.renderer_spatial_output_dim
            surface_g_ema = Generator(opt.surf_extraction.model, opt.surf_extraction.rendering, full_pipeline=False).to(device)

            # Load weights to surface extractor
            surface_extractor_dict = surface_g_ema.state_dict()
            for k, v in pretrained_weights_dict.items():
                if k in surface_extractor_dict.keys() and v.size() == surface_extractor_dict[k].size():
                    surface_extractor_dict[k] = v
            surface_g_ema.load_state_dict(surface_extractor_dict)
        else:
            surface_g_ema = None

        # get the mean latent vector for g_ema
        if opt.inference.truncation_ratio < 1:
            with torch.no_grad():
                mean_latent = g_ema.mean_latent(opt.inference.truncation_mean, device)
        else:
            mean_latent = None

        # get the mean latent vector for surface_g_ema
        if not opt.inference.no_surface_renderings or opt.model.project_noise:
            surface_mean_latent = mean_latent[0]
        else:
            surface_mean_latent = None

        return opt.inference, g_ema, surface_g_ema, mean_latent, surface_mean_latent, opt.inference.results_dst_dir


def render_video(opt, g_ema, surface_g_ema, device, mean_latent, surface_mean_latent, numberofframes):
    g_ema.eval()
    if not opt.no_surface_renderings or opt.project_noise:
        surface_g_ema.eval()

    images = torch.Tensor(0, 3, opt.size, opt.size)
    num_frames = numberofframes

    # Generate video trajectory
    trajectory = np.zeros((num_frames, 3), dtype=np.float32)

    # set camera trajectory
    # sweep azimuth angles (4 seconds)
    if opt.azim_video:
        t = np.linspace(0, 1, num_frames)
        elev = 0
        fov = opt.camera.fov
        if opt.camera.uniform:
            azim = opt.camera.azim * np.cos(t * 2 * np.pi)
        else:
            azim = 1.5 * opt.camera.azim * np.cos(t * 2 * np.pi)

        trajectory[:num_frames, 0] = azim
        trajectory[:num_frames, 1] = elev
        trajectory[:num_frames, 2] = fov

    # ellipsoid sweep (4 seconds)
    else:
        t = np.linspace(0, 1, num_frames)
        fov = opt.camera.fov  # + 1 * np.sin(t * 2 * np.pi)
        if opt.camera.uniform:
            elev = opt.camera.elev / 2 + opt.camera.elev / 2 * np.sin(t * 2 * np.pi)
            azim = opt.camera.azim * np.cos(t * 2 * np.pi)
        else:
            elev = 1.5 * opt.camera.elev * np.sin(t * 2 * np.pi)
            azim = 1.5 * opt.camera.azim * np.cos(t * 2 * np.pi)

        trajectory[:num_frames, 0] = azim
        trajectory[:num_frames, 1] = elev
        trajectory[:num_frames, 2] = fov

    trajectory = torch.from_numpy(trajectory).to(device)

    # generate input parameters for the camera trajectory
    # sample_cam_poses, sample_focals, sample_near, sample_far = \
    #     generate_camera_params(trajectory, opt.renderer_output_size, device, dist_radius=opt.camera.dist_radius)
    sample_cam_extrinsics, sample_focals, sample_near, sample_far, _ = \
        generate_camera_params(opt.renderer_output_size, device, locations=trajectory[:, :2],
                               fov_ang=trajectory[:, 2:], dist_radius=opt.camera.dist_radius)

    # In case of noise projection, generate input parameters for the frontal position.
    # The reference mesh for the noise projection is extracted from the frontal position.
    # For more details see section C.1 in the supplementary material.
    if opt.project_noise:
        frontal_pose = torch.tensor([[0.0, 0.0, opt.camera.fov]]).to(device)
        # frontal_cam_pose, frontal_focals, frontal_near, frontal_far = \
        #     generate_camera_params(frontal_pose, opt.surf_extraction_output_size, device, dist_radius=opt.camera.dist_radius)
        frontal_cam_pose, frontal_focals, frontal_near, frontal_far, _ = \
            generate_camera_params(opt.surf_extraction_output_size, device, locations=frontal_pose[:, :2],
                                   fov_ang=frontal_pose[:, 2:], dist_radius=opt.camera.dist_radius)

    # create geometry renderer (renders the depth maps)
    cameras = create_cameras(azim=np.rad2deg(trajectory[0, 0].cpu().numpy()),
                             elev=np.rad2deg(trajectory[0, 1].cpu().numpy()),
                             dist=1, device=device)
    renderer = create_mesh_renderer(cameras, image_size=512,
                                    specular_color=((0, 0, 0),),
                                    ambient_color=((0.1, 0.1, 0.1),),
                                    diffuse_color=((0.75, 0.75, 0.75),),
                                    device=device)

    suffix = '_azim' if opt.azim_video else '_elipsoid'

    # generate videos
    for i in range(opt.identities):
        print('Processing identity {}/{}...'.format(i + 1, opt.identities))
        chunk = 1
        sample_z = torch.randn(1, opt.style_dim, device=device).repeat(chunk, 1)

        video_filename = 'sample_video_{}{}.mp4'.format(i, suffix)
        writer = skvideo.io.FFmpegWriter(os.path.join(opt.results_dst_dir, video_filename),
                                         outputdict={'-pix_fmt': 'yuv420p', '-crf': '10'})
        if not opt.no_surface_renderings:
            depth_video_filename = 'sample_depth_video_{}{}.mp4'.format(i, suffix)
            depth_writer = skvideo.io.FFmpegWriter(os.path.join(opt.results_dst_dir, depth_video_filename),
                                                   outputdict={'-pix_fmt': 'yuv420p', '-crf': '1'})

        ####################### Extract initial surface mesh from the frontal viewpoint #############
        # For more details see section C.1 in the supplementary material.
        if opt.project_noise:
            with torch.no_grad():
                frontal_surface_out = surface_g_ema([sample_z],
                                                    frontal_cam_pose,
                                                    frontal_focals,
                                                    frontal_near,
                                                    frontal_far,
                                                    truncation=opt.truncation_ratio,
                                                    truncation_latent=surface_mean_latent,
                                                    return_sdf=True)
                frontal_sdf = frontal_surface_out[2].cpu()

            print('Extracting Identity {} Frontal view Marching Cubes for consistent video rendering'.format(i))
            frustum_aligned_frontal_sdf = align_volume(frontal_sdf)
            del frontal_sdf

            try:
                frontal_marching_cubes_mesh = extract_mesh_with_marching_cubes(frustum_aligned_frontal_sdf)
            except ValueError:
                frontal_marching_cubes_mesh = None

            if frontal_marching_cubes_mesh is not None:
                frontal_marching_cubes_mesh_filename = os.path.join(opt.results_dst_dir, 'sample_{}_frontal_marching_cubes_mesh{}.obj'.format(i, suffix))
                with open(frontal_marching_cubes_mesh_filename, 'w') as f:
                    frontal_marching_cubes_mesh.export(f, file_type='obj')

            del frontal_surface_out
            torch.cuda.empty_cache()
        #############################################################################################

        for j in tqdm(range(0, num_frames, chunk)):
            with torch.no_grad():
                out = g_ema([sample_z],
                            sample_cam_extrinsics[j:j+chunk],
                            sample_focals[j:j+chunk],
                            sample_near[j:j+chunk],
                            sample_far[j:j+chunk],
                            truncation=opt.truncation_ratio,
                            truncation_latent=mean_latent,
                            randomize_noise=False,
                            project_noise=opt.project_noise,
                            mesh_path=frontal_marching_cubes_mesh_filename if opt.project_noise else None)

                rgb = out[0].cpu()
                utils.save_image(rgb,
                                 os.path.join(opt.results_dst_dir, '{}.png'.format(str(i).zfill(7))),
                                 nrow=trajectory[:, :2].shape[0],
                                 normalize=True,
                                 padding=0,
                                 value_range=(-1, 1))

                # this is done to fit to RTX2080 RAM size (11GB)
                del out
                torch.cuda.empty_cache()

                # Convert RGB from [-1, 1] to [0, 255]
                rgb = 127.5 * (rgb.clamp(-1, 1).permute(0, 2, 3, 1).cpu().numpy() + 1)

                # Add RGB frame to video
                for k in range(chunk):
                    writer.writeFrame(rgb[k])
                ########## Extract surface ##########
                if not opt.no_surface_renderings:
                    scale = surface_g_ema.renderer.out_im_res / g_ema.renderer.out_im_res
                    surface_sample_focals = sample_focals * scale
                    surface_out = surface_g_ema([sample_z],
                                                sample_cam_extrinsics[j:j+chunk],
                                                surface_sample_focals[j:j+chunk],
                                                sample_near[j:j+chunk],
                                                sample_far[j:j+chunk],
                                                truncation=opt.truncation_ratio,
                                                truncation_latent=surface_mean_latent,
                                                return_xyz=True)
                    xyz = surface_out[2].cpu()

                    # this is done to fit to RTX2080 RAM size (11GB)
                    del surface_out
                    torch.cuda.empty_cache()

                    # Render mesh for video
                    depth_mesh = xyz2mesh(xyz)
                    mesh = Meshes(
                        verts=[torch.from_numpy(np.asarray(depth_mesh.vertices)).to(torch.float32).to(device)],
                        faces=[torch.from_numpy(np.asarray(depth_mesh.faces)).to(torch.float32).to(device)],
                        textures=None,
                        verts_normals=[torch.from_numpy(np.copy(np.asarray(depth_mesh.vertex_normals))).to(torch.float32).to(device)],
                    )
                    mesh = add_textures(mesh)
                    cameras = create_cameras(azim=np.rad2deg(trajectory[j, 0].cpu().numpy()),
                                             elev=np.rad2deg(trajectory[j, 1].cpu().numpy()),
                                             fov=2 * trajectory[j, 2].cpu().numpy(),
                                             dist=1, device=device)
                    renderer = create_mesh_renderer(cameras, image_size=512,
                                                    light_location=((0.0, 1.0, 5.0),),
                                                    specular_color=((0.2, 0.2, 0.2),),
                                                    ambient_color=((0.1, 0.1, 0.1),),
                                                    diffuse_color=((0.65, 0.65, 0.65),),
                                                    device=device)

                    mesh_image = 255 * renderer(mesh).cpu().numpy()
                    mesh_image = mesh_image[..., :3]

                    # Add depth frame to video
                    for k in range(chunk):
                        depth_writer.writeFrame(mesh_image[k])

        # Close video writers
        writer.close()
        if not opt.no_surface_renderings:
            depth_writer.close()

    return video_filename


import gradio as gr
import plotly.graph_objects as go
from PIL import Image

device = 'cuda' if torch.cuda.is_available() else 'cpu'


def get_video(model_type, numberofframes, mesh_type):
    options, g_ema, surface_g_ema, mean_latent, surface_mean_latent, result_filename = get_rendervideo_vars(model_type, numberofframes)
    render_video(options, g_ema, surface_g_ema, device, mean_latent, surface_mean_latent, numberofframes)
    torch.cuda.empty_cache()
    del options, g_ema, surface_g_ema, mean_latent, surface_mean_latent

    path_img = os.path.join(result_filename, "0000000.png")
    image = Image.open(path_img)
    if mesh_type == "DepthMesh":
        path = os.path.join(result_filename, "sample_depth_video_0_elipsoid.mp4")
    else:
        path = os.path.join(result_filename, "sample_video_0_elipsoid.mp4")
    return path, image


def get_mesh(model_type, mesh_type):
    options, g_ema, surface_g_ema, mean_latent, surface_mean_latent, result_filename = get_generate_vars(model_type)
    depth_mesh, mc_mesh = generate(options, g_ema, surface_g_ema, device, mean_latent, surface_mean_latent)
    torch.cuda.empty_cache()
    del options, g_ema, surface_g_ema, mean_latent, surface_mean_latent

    if mesh_type == "DepthMesh":
        mesh = depth_mesh
    else:
        mesh = mc_mesh

    x = np.asarray(mesh.vertices).T[0]
    y = np.asarray(mesh.vertices).T[1]
    z = np.asarray(mesh.vertices).T[2]
    i = np.asarray(mesh.faces).T[0]
    j = np.asarray(mesh.faces).T[1]
    k = np.asarray(mesh.faces).T[2]

    fig = go.Figure(go.Mesh3d(x=x, y=y, z=z,
                              i=i, j=j, k=k,
                              colorscale="Viridis",
                              colorbar_len=0.75,
                              flatshading=True,
                              lighting=dict(ambient=0.5,
                                            diffuse=1,
                                            fresnel=4,
                                            specular=0.5,
                                            roughness=0.05,
                                            facenormalsepsilon=0,
                                            vertexnormalsepsilon=0),
                              lightposition=dict(x=100, y=100, z=1000)))

    path = os.path.join(result_filename, "images/0000000.png")
    image = Image.open(path)
    return fig, image

markdown = f'''
# StyleSDF: High-Resolution 3D-Consistent Image and Geometry Generation

[Space demo for the CVPR 2022 paper "StyleSDF: High-Resolution 3D-Consistent Image and Geometry Generation".](https://arxiv.org/abs/2112.11427)

[Official implementation.](https://github.com/royorel/StyleSDF)

### Future work, based on interest
- Adding new models for new object types
- New customization options

The demo is running on {device}.

Generation can take a long time, especially for videos; the runtime depends on the number of frames and on the device it runs on.

Note: to get the RGB video, choose the "Marching Cubes" mesh type.
'''

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    gr.Markdown(markdown)
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    image = gr.Image(type="pil", shape=(512, 512))
                with gr.Column():
                    mesh = gr.Plot()
                with gr.Column():
                    video = gr.Video()
            with gr.Row():
                numberoframes = gr.Slider(minimum=30, maximum=250, label='Number Of Frames For Video Generation')
                model_name = gr.Dropdown(choices=["ffhq", "afhq"], label="Choose Model Type")
                mesh_type = gr.Dropdown(choices=["DepthMesh", "Marching Cubes"], label="Choose Mesh Type")
            with gr.Row():
                btn = gr.Button(value="Generate Mesh")
                btn_2 = gr.Button(value="Generate Video")

    btn.click(get_mesh, [model_name, mesh_type], [mesh, image])
    btn_2.click(get_video, [model_name, numberoframes, mesh_type], [video, image])

demo.launch(debug=True)
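
# Optional: programmatic usage sketch (not part of the original demo; it assumes the
# pretrained checkpoints have already been downloaded by download_pretrained_models() above).
# The same helpers driven by the Gradio callbacks can be called directly, e.g.:
#
#     opts, g_ema, surface_g_ema, mean_latent, surface_mean_latent, out_dir = get_generate_vars("ffhq")
#     depth_mesh, mc_mesh = generate(opts, g_ema, surface_g_ema, device, mean_latent, surface_mean_latent)
#     # depth_mesh / mc_mesh expose .vertices and .faces (trimesh-style), and the
#     # rendered RGB images are written under os.path.join(out_dir, 'images').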