"""run bash scripts/download_models.sh first to prepare the weights file""" import os import shutil from argparse import Namespace from src.utils.preprocess import CropAndExtract from src.test_audio2coeff import Audio2Coeff from src.facerender.animate import AnimateFromCoeff from src.generate_batch import get_data from src.generate_facerender_batch import get_facerender_data from src.utils.init_path import init_path from cog import BasePredictor, Input, Path checkpoints = "checkpoints" class Predictor(BasePredictor): def setup(self): """Load the model into memory to make running multiple predictions efficient""" device = "cuda" sadtalker_paths = init_path(checkpoints,os.path.join("src","config")) # init model self.preprocess_model = CropAndExtract(sadtalker_paths, device ) self.audio_to_coeff = Audio2Coeff( sadtalker_paths, device, ) self.animate_from_coeff = { "full": AnimateFromCoeff( sadtalker_paths, device, ), "others": AnimateFromCoeff( sadtalker_paths, device, ), } def predict( self, source_image: Path = Input( description="Upload the source image, it can be video.mp4 or picture.png", ), driven_audio: Path = Input( description="Upload the driven audio, accepts .wav and .mp4 file", ), enhancer: str = Input( description="Choose a face enhancer", choices=["gfpgan", "RestoreFormer"], default="gfpgan", ), preprocess: str = Input( description="how to preprocess the images", choices=["crop", "resize", "full"], default="full", ), ref_eyeblink: Path = Input( description="path to reference video providing eye blinking", default=None, ), ref_pose: Path = Input( description="path to reference video providing pose", default=None, ), still: bool = Input( description="can crop back to the original videos for the full body aniamtion when preprocess is full", default=True, ), ) -> Path: """Run a single prediction on the model""" animate_from_coeff = ( self.animate_from_coeff["full"] if preprocess == "full" else self.animate_from_coeff["others"] ) args = load_default() args.pic_path = str(source_image) args.audio_path = str(driven_audio) device = "cuda" args.still = still args.ref_eyeblink = None if ref_eyeblink is None else str(ref_eyeblink) args.ref_pose = None if ref_pose is None else str(ref_pose) # crop image and extract 3dmm from image results_dir = "results" if os.path.exists(results_dir): shutil.rmtree(results_dir) os.makedirs(results_dir) first_frame_dir = os.path.join(results_dir, "first_frame_dir") os.makedirs(first_frame_dir) print("3DMM Extraction for source image") first_coeff_path, crop_pic_path, crop_info = self.preprocess_model.generate( args.pic_path, first_frame_dir, preprocess, source_image_flag=True ) if first_coeff_path is None: print("Can't get the coeffs of the input") return if ref_eyeblink is not None: ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[ 0 ] ref_eyeblink_frame_dir = os.path.join(results_dir, ref_eyeblink_videoname) os.makedirs(ref_eyeblink_frame_dir, exist_ok=True) print("3DMM Extraction for the reference video providing eye blinking") ref_eyeblink_coeff_path, _, _ = self.preprocess_model.generate( ref_eyeblink, ref_eyeblink_frame_dir ) else: ref_eyeblink_coeff_path = None if ref_pose is not None: if ref_pose == ref_eyeblink: ref_pose_coeff_path = ref_eyeblink_coeff_path else: ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0] ref_pose_frame_dir = os.path.join(results_dir, ref_pose_videoname) os.makedirs(ref_pose_frame_dir, exist_ok=True) print("3DMM Extraction for the reference video providing pose") ref_pose_coeff_path, _, _ = self.preprocess_model.generate( ref_pose, ref_pose_frame_dir ) else: ref_pose_coeff_path = None # audio2ceoff batch = get_data( first_coeff_path, args.audio_path, device, ref_eyeblink_coeff_path, still=still, ) coeff_path = self.audio_to_coeff.generate( batch, results_dir, args.pose_style, ref_pose_coeff_path ) # coeff2video print("coeff2video") data = get_facerender_data( coeff_path, crop_pic_path, first_coeff_path, args.audio_path, args.batch_size, args.input_yaw, args.input_pitch, args.input_roll, expression_scale=args.expression_scale, still_mode=still, preprocess=preprocess, ) animate_from_coeff.generate( data, results_dir, args.pic_path, crop_info, enhancer=enhancer, background_enhancer=args.background_enhancer, preprocess=preprocess) output = "/tmp/out.mp4" mp4_path = os.path.join(results_dir, [f for f in os.listdir(results_dir) if "enhanced.mp4" in f][0]) shutil.copy(mp4_path, output) return Path(output) def load_default(): return Namespace( pose_style=0, batch_size=2, expression_scale=1.0, input_yaw=None, input_pitch=None, input_roll=None, background_enhancer=None, face3dvis=False, net_recon="resnet50", init_path=None, use_last_fc=False, bfm_folder="./src/config/", bfm_model="BFM_model_front.mat", focal=1015.0, center=112.0, camera_d=10.0, z_near=5.0, z_far=15.0, )