Spaces:

Pie31415
/

rome

Runtime error

App Files Files Community

rome / app.py

Pie31415

updated app

c6d7c40 over 1 year ago

raw history blame contribute delete

No virus

8.83 kB

	import sys
	import torch
	import pickle
	import cv2
	import gradio as gr
	import numpy as np

	from PIL import Image
	from collections import defaultdict
	from glob import glob

	from matplotlib import pyplot as plt
	from matplotlib import animation

	from easydict import EasyDict as edict
	from huggingface_hub import hf_hub_download

	sys.path.append("./rome/")
	sys.path.append('./DECA')

	from rome.infer import Infer
	from rome.src.utils.processing import process_black_shape, tensor2image
	from rome.src.utils.visuals import mask_errosion

	# Fetch the two checkpoints from the 'Pie31415/rome' hub repo; each call
	# yields the path that is later stored in the inference configuration.
	default_modnet_path = hf_hub_download(
	    repo_id='Pie31415/rome',
	    filename='modnet_photographic_portrait_matting.ckpt',
	)
	default_model_path = hf_hub_download(
	    repo_id='Pie31415/rome',
	    filename='rome.pth',
	)

	# parser configurations
	# Attribute-style dict handed to ``Infer``; it stands in for the parsed
	# command-line arguments. Entries marked "N/A" are kept as they were.
	args = edict({
	    "save_dir": ".",
	    "save_render": True,
	    "model_checkpoint": default_model_path,
	    "modnet_path": default_modnet_path,
	    "random_seed": 0,
	    "debug": False,
	    "verbose": False,
	    "model_image_size": 256,
	    "align_source": True,
	    "align_target": False,
	    "align_scale": 1.25,
	    "use_mesh_deformations": False,
	    "subdivide_mesh": False,
	    "renderer_sigma": 1e-08,
	    "renderer_zfar": 100.0,
	    "renderer_type": "soft_mesh",
	    "renderer_texture_type": "texture_uv",
	    "renderer_normalized_alphas": False,
	    "deca_path": "DECA",
	    "rome_data_dir": "rome/data",
	    "autoenc_cat_alphas": False,
	    "autoenc_align_inputs": False,
	    "autoenc_use_warp": False,
	    "autoenc_num_channels": 64,
	    "autoenc_max_channels": 512,
	    "autoenc_num_groups": 4,
	    "autoenc_num_bottleneck_groups": 0,
	    "autoenc_num_blocks": 2,
	    "autoenc_num_layers": 4,
	    "autoenc_block_type": "bottleneck",
	    "neural_texture_channels": 8,
	    "num_harmonic_encoding_funcs": 6,
	    "unet_num_channels": 64,
	    "unet_max_channels": 512,
	    "unet_num_groups": 4,
	    "unet_num_blocks": 1,
	    "unet_num_layers": 2,
	    "unet_block_type": "conv",
	    "unet_skip_connection_type": "cat",
	    "unet_use_normals_cond": True,
	    "unet_use_vertex_cond": False,
	    "unet_use_uvs_cond": False,
	    "unet_pred_mask": False,
	    "use_separate_seg_unet": True,
	    "norm_layer_type": "gn",
	    "activation_type": "relu",
	    "conv_layer_type": "ws_conv",
	    "deform_norm_layer_type": "gn",
	    "deform_activation_type": "relu",
	    "deform_conv_layer_type": "ws_conv",
	    "unet_seg_weight": 0.0,
	    "unet_seg_type": "bce_with_logits",
	    "deform_face_tightness": 0.0001,
	    "use_whole_segmentation": False,
	    "mask_hair_for_neck": False,
	    "use_hair_from_avatar": False,
	    "use_scalp_deforms": True,
	    "use_neck_deforms": True,
	    "use_basis_deformer": False,
	    "use_unet_deformer": True,
	    "pretrained_encoder_basis_path": "",
	    "pretrained_vertex_basis_path": "",
	    "num_basis": 50,
	    "basis_init": "pca",
	    "num_vertex": 5023,
	    "train_basis": True,
	    "path_to_deca": "DECA",
	    "path_to_linear_hair_model": "data/linear_hair.pth", # N/A
	    "path_to_mobile_model": "data/disp_model.pth", # N/A
	    "n_scalp": 60,
	    "use_distill": False,
	    "use_mobile_version": False,
	    "deformer_path": "data/rome.pth",
	    "output_unet_deformer_feats": 32,
	    "use_deca_details": False,
	    "use_flametex": False,
	    "upsample_type": "nearest",
	    "num_frequencies": 6,
	    "deform_face_scale_coef": 0.0,
	    # Pick CUDA only when a device is actually visible; a hard-coded
	    # "cuda" is wrong on CPU-only hosts.
	    # NOTE(review): confirm that Infer honours args.device when it is "cpu".
	    "device": "cuda" if torch.cuda.is_available() else "cpu",
	})

	# download FLAME and DECA pretrained
	generic_model_path = hf_hub_download('Pie31415/rome', 'generic_model.pkl')
	deca_model_path = hf_hub_download('Pie31415/rome', 'deca_model.tar')

	# Load the generic model pickle with latin1 decoding and write it back out
	# under ./DECA/data with the running interpreter's pickle format.
	# NOTE(security): pickle.load can execute arbitrary code; this is only
	# acceptable while the file comes from a trusted hub repository.
	with open(generic_model_path, 'rb') as f:
	    ss = pickle.load(f, encoding='latin1')

	with open('./DECA/data/generic_model.pkl', 'wb') as out:
	    pickle.dump(ss, out)

	# Copy the DECA archive byte-for-byte. The data is binary, so it is read
	# in fixed-size chunks rather than "lines", and the source handle is not
	# called `input` so the builtin is not shadowed at module level.
	with open(deca_model_path, "rb") as src, open('./DECA/data/deca_model.tar', "wb") as out:
	    for chunk in iter(lambda: src.read(1024 * 1024), b""):
	        out.write(chunk)

	# load ROME inference model
	infer = Infer(args)

	def image_inference(
	    source_img: gr.inputs.Image = None,
	    driver_img: gr.inputs.Image = None
	):
	    """Render the source avatar driven by a single driver image.

	    Calls ``infer.evaluate`` with ``crop_center=False`` and returns one
	    image built from four tensors concatenated along ``dim=2``: the source
	    image, the target image, the masked render and the predicted target
	    shape image. The last axis of the converted image is reversed before
	    it is returned.
	    """
	    result = infer.evaluate(source_img, driver_img, crop_center=False)

	    panels = [
	        result['source_information']['data_dict']['source_img'][0].cpu(),
	        result['source_information']['data_dict']['target_img'][0].cpu(),
	        result['render_masked'].cpu(),
	        result['pred_target_shape_img'][0].cpu(),
	    ]
	    strip = tensor2image(torch.cat(panels, dim=2))

	    # Flip the last axis (presumably BGR -> RGB channel order; confirm
	    # against tensor2image).
	    return strip[..., ::-1]

	def extract_frames(
	    driver_vid: gr.inputs.Video = None
	):
	    """Decode every frame of a video into a list of RGB PIL images.

	    Args:
	        driver_vid: value passed straight to ``cv2.VideoCapture``
	            (a path to the video file).

	    Returns:
	        A list of ``PIL.Image`` frames in reading order. The list is empty
	        when the first read already fails.
	    """
	    image_frames = []
	    vid = cv2.VideoCapture(driver_vid) # path to mp4

	    try:
	        while True:
	            success, img = vid.read()
	            if not success:
	                break

	            # OpenCV decodes to BGR; convert so PIL receives RGB data.
	            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
	            image_frames.append(Image.fromarray(img))
	    finally:
	        # Always free the capture handle, even if decoding raises.
	        vid.release()

	    return image_frames

	def video_inference(
	    source_img: gr.inputs.Image = None,
	    driver_vid: gr.inputs.Video = None
	):
	    """Animate the source avatar with a driver video and save it as a GIF.

	    Every 4th frame of the driver video is passed to ``infer.evaluate``
	    together with ``source_img``; the masked renders are assembled into a
	    matplotlib animation and written to ``avatar.gif``.

	    Args:
	        source_img: source image forwarded to ``infer.evaluate``.
	        driver_vid: video forwarded to ``extract_frames``.

	    Returns:
	        The string ``"avatar.gif"``, the path of the written animation.

	    Raises:
	        ValueError: if no frame could be read from ``driver_vid``.
	    """
	    image_frames = extract_frames(driver_vid)
	    if not image_frames:
	        # Without this guard the empty render list only fails later, at
	        # video[0], with an IndexError that says nothing about the cause.
	        raise ValueError("No frames could be read from the driver video.")

	    resulted_imgs = defaultdict(list)

	    mask_hard_threshold = 0.5
	    for frame in image_frames[::4]: # frame limits
	        new_out = infer.evaluate(source_img, frame)

	        mask_pred = (new_out['pred_target_unet_mask'].cpu() > mask_hard_threshold).float()
	        mask_pred = mask_errosion(mask_pred[0].float().numpy() * 255)
	        # Keep the render inside the mask and fill everything else with 1.
	        render = new_out['pred_target_img'].cpu() * (mask_pred) + (1 - mask_pred)

	        normals = process_black_shape(((new_out['pred_target_normal'][0].cpu() + 1) / 2 * mask_pred + (1 - mask_pred) ) )
	        normals[normals==0.5]=1.

	        # NOTE(review): 'res_normal' and 'res_mesh_images' are collected
	        # but never read in this function; only 'res_renders' is used.
	        resulted_imgs['res_normal'].append(tensor2image(normals))
	        resulted_imgs['res_mesh_images'].append(tensor2image(new_out['pred_target_shape_img'][0]))
	        resulted_imgs['res_renders'].append(tensor2image(render[0]))

	    video = np.array(resulted_imgs['res_renders'])

	    fig = plt.figure()
	    im = plt.imshow(video[0,:,:,::-1])
	    plt.axis('off')
	    plt.close() # this is required to not display the generated image

	    def init():
	        im.set_data(video[0,:,:,::-1])

	    def animate(i):
	        im.set_data(video[i,:,:,::-1])
	        return im

	    anim = animation.FuncAnimation(fig, animate, init_func=init, frames=video.shape[0], interval=30)
	    anim.save("avatar.gif", dpi=300, writer = animation.PillowWriter(fps=24))

	    return "avatar.gif"

	# Intro HTML shown under the page title: one-line summary plus links to
	# the paper, project page and GitHub repository.
	# The backslash before each pipe is spelled "\\" so the string value is
	# unchanged while avoiding the invalid escape sequence "\|".
	# NOTE(review): confirm whether a plain "|" separator was intended here.
	description = """<p style='text-align: center'> Create a personal avatar from just a single image using ROME. <br> <a href='https://arxiv.org/abs/2206.08343' target='_blank'>Paper</a> \\| <a href='https://samsunglabs.github.io/rome' target='_blank'>Project Page</a> \\| <a href='https://github.com/SamsungLabs/rome' target='_blank'>Github</a> </p>"""
	quote = """
	> <p style='text-align: center'> [The] system creates realistic mesh-based avatars from a single <strong>source</strong> photo. These avatars are rigged, i.e., they can be driven by the animation parameters from a different <strong>driving</strong> frame. </p>"""

	# Gradio UI: a title, teaser image and intro text, followed by three tabs
	# (image, uploaded video, webcam video). Each tab has its own source image,
	# driver input, output image and "Predict" button.
	# NOTE(review): indentation was lost in this copy of the file, so the
	# nesting of widgets under each Row/Tab cannot be read from here; restore
	# it from the original before running.
	with gr.Blocks() as demo:
	gr.Markdown("# <p align='center'>ROME: Realistic one-shot mesh-based head avatars</p>")
	gr.HTML(value="<img src='file/media/tease.gif' alt='Teaser' style='display: block; margin: auto;'>")
	gr.Markdown(description)
	gr.Markdown(quote)

	# Tab 1: single driver image.
	with gr.Tab("Image Inference"):
	with gr.Row():
	source_img = gr.Image(type="pil", label="Source image", show_label=True)
	driver_img = gr.Image(type="pil", label="Driver image", show_label=True)
	image_output = gr.Image(label="Rendered avatar")
	image_button = gr.Button("Predict")
	# Tab 2: uploaded driver video; the result is shown in an Image component.
	with gr.Tab("Video Inference"):
	with gr.Row():
	source_img2 = gr.Image(type="pil", label="Source image", show_label=True)
	driver_vid = gr.Video(label="Driver video", source="upload")
	video_output = gr.Image(label="Rendered GIF avatar")
	video_button = gr.Button("Predict")
	# Tab 3: same as tab 2, but the driver video comes from the webcam.
	with gr.Tab("Webcam Inference"):
	with gr.Row():
	source_img3 = gr.Image(type="pil", label="Source image", show_label=True)
	driver_cam = gr.Video(label="Driver video", source="webcam")
	cam_output = gr.Image(label="Rendered GIF avatar")
	cam_button = gr.Button("Predict")

	# Example (source, driver) image pairs for the image tab, bound to
	# image_inference with example caching enabled.
	gr.Examples(
	examples=[
	["./examples/lincoln.jpg", "./examples/taras2.jpg"],
	["./examples/lincoln.jpg", "./examples/taras1.jpg"]
	],
	inputs=[source_img, driver_img],
	outputs=[image_output],
	fn=image_inference,
	cache_examples=True
	)

	# Wire each button to its handler; the upload and webcam tabs both use
	# video_inference.
	image_button.click(image_inference, inputs=[source_img, driver_img], outputs=image_output)
	video_button.click(video_inference, inputs=[source_img2, driver_vid], outputs=video_output)
	cam_button.click(video_inference, inputs=[source_img3, driver_cam], outputs=cam_output)

	# Start the app.
	demo.launch()