cocktailpeanut committed
Commit 7f1584d
1 Parent(s): fa7d98a
Files changed (2)
  1. app.py +15 -8
  2. requirements.txt +7 -6
app.py CHANGED
@@ -31,6 +31,13 @@ from src.utils.crop_face_single import crop_face
 from src.audio2vid import get_headpose_temp, smooth_pose_seq
 from src.utils.frame_interpolation import init_frame_interpolation_model, batch_images_interpolation_tool
 
+if torch.backends.mps.is_available():
+    device = "mps"
+elif torch.cuda.is_available():
+    device = "cuda"
+else:
+    device = "cpu"
+
 
 config = OmegaConf.load('./configs/prompts/animation_audio.yaml')
 if config.weight_dtype == "fp16":
@@ -42,16 +49,16 @@ audio_infer_config = OmegaConf.load(config.audio_inference_config)
 # prepare model
 a2m_model = Audio2MeshModel(audio_infer_config['a2m_model'])
 a2m_model.load_state_dict(torch.load(audio_infer_config['pretrained_model']['a2m_ckpt'], map_location="cpu"), strict=False)
-a2m_model.cuda().eval()
+a2m_model.to(device).eval()
 
 vae = AutoencoderKL.from_pretrained(
     config.pretrained_vae_path,
-).to("cuda", dtype=weight_dtype)
+).to(device, dtype=weight_dtype)
 
 reference_unet = UNet2DConditionModel.from_pretrained(
     config.pretrained_base_model_path,
     subfolder="unet",
-).to(dtype=weight_dtype, device="cuda")
+).to(dtype=weight_dtype, device=device)
 
 inference_config_path = config.inference_config
 infer_config = OmegaConf.load(inference_config_path)
@@ -60,13 +67,13 @@ denoising_unet = UNet3DConditionModel.from_pretrained_2d(
     config.motion_module_path,
     subfolder="unet",
     unet_additional_kwargs=infer_config.unet_additional_kwargs,
-).to(dtype=weight_dtype, device="cuda")
+).to(dtype=weight_dtype, device=device)
 
-pose_guider = PoseGuider(noise_latent_channels=320, use_ca=True).to(device="cuda", dtype=weight_dtype) # not use cross attention
+pose_guider = PoseGuider(noise_latent_channels=320, use_ca=True).to(device=device, dtype=weight_dtype) # not use cross attention
 
 image_enc = CLIPVisionModelWithProjection.from_pretrained(
     config.image_encoder_path
-).to(dtype=weight_dtype, device="cuda")
+).to(dtype=weight_dtype, device=device)
 
 sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
 scheduler = DDIMScheduler(**sched_kwargs)
@@ -91,7 +98,7 @@ pipe = Pose2VideoPipeline(
     pose_guider=pose_guider,
     scheduler=scheduler,
 )
-pipe = pipe.to("cuda", dtype=weight_dtype)
+pipe = pipe.to(device, dtype=weight_dtype)
 
 lmk_extractor = LMKExtractor()
 vis = FaceMeshVisualizer()
@@ -130,7 +137,7 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     ref_pose = vis.draw_landmarks((ref_image_np.shape[1], ref_image_np.shape[0]), lmks, normed=True)
 
     sample = prepare_audio_feature(input_audio, wav2vec_model_path=audio_infer_config['a2m_model']['model_path'])
-    sample['audio_feature'] = torch.from_numpy(sample['audio_feature']).float().cuda()
+    sample['audio_feature'] = torch.from_numpy(sample['audio_feature']).float().to(device)
     sample['audio_feature'] = sample['audio_feature'].unsqueeze(0)
 
     # inference
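Taken together, these hunks swap every hardcoded .cuda() / device="cuda" placement for a single device string picked once at startup, which lets the Space run on Apple-silicon (MPS) and CPU hosts as well as CUDA GPUs. Below is a minimal, self-contained sketch of the same pattern, runnable outside this repo. The fp32-on-CPU fallback is an assumption added for illustration; the commit itself keeps whatever weight_dtype the YAML config specifies.

# Minimal sketch of the device-selection pattern introduced above.
import torch

if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Assumption for illustration only: fall back to fp32 on CPU, where fp16
# inference is typically unsupported or very slow.
weight_dtype = torch.float16 if device != "cpu" else torch.float32

model = torch.nn.Linear(4, 4)  # stand-in for the real AniPortrait modules
model = model.to(device=device, dtype=weight_dtype).eval()

x = torch.randn(1, 4, device=device, dtype=weight_dtype)
with torch.no_grad():
    print(model(x).dtype, "on", device)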
requirements.txt CHANGED
@@ -1,7 +1,7 @@
 accelerate==0.21.0
 av==11.0.0
 clip @ https://github.com/openai/CLIP/archive/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1.zip#sha256=b5842c25da441d6c581b53a5c60e0c2127ebafe0f746f8e15561a006c6c3be6a
-decord==0.6.0
+#decord==0.6.0
 diffusers==0.24.0
 einops==0.4.1
 gradio==4.24.0
@@ -10,7 +10,7 @@ imageio==2.33.0
 imageio-ffmpeg==0.4.9
 numpy==1.24.4
 omegaconf==2.2.3
-onnxruntime-gpu==1.16.3
+#onnxruntime-gpu==1.16.3
 open-clip-torch==2.20.0
 opencv-contrib-python==4.8.1.78
 opencv-python==4.8.1.78
@@ -18,15 +18,16 @@ Pillow==9.5.0
 scikit-image==0.21.0
 scikit-learn==1.3.2
 scipy==1.11.4
-torch==2.0.1
+#torch==2.0.1
 torchdiffeq==0.2.3
 torchmetrics==1.2.1
 torchsde==0.2.5
-torchvision==0.15.2
+#torchvision==0.15.2
 tqdm==4.66.1
 transformers==4.30.2
-xformers==0.0.22
+#xformers==0.0.22
 controlnet-aux==0.0.7
-mediapipe==0.10.11
+#mediapipe==0.10.11
+mediapipe==0.10.9
 librosa==0.9.2
 ffmpeg-python==0.2.0
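The requirements changes follow the same theme: decord, onnxruntime-gpu, and xformers are CUDA/x86-centric packages (xformers in particular has no MPS support), and unpinning torch and torchvision presumably leaves it to the launcher to install a build matching the host platform; mediapipe is moved from 0.10.11 down to 0.10.9, likely for platform compatibility. A hypothetical pre-flight check, not part of this commit, that could confirm the environment before launching app.py:

# Hypothetical pre-flight check (assumed helper, not part of this commit).
import importlib.util
import torch

missing = [m for m in ("torchvision", "mediapipe")
           if importlib.util.find_spec(m) is None]
if missing:
    raise SystemExit("install platform-appropriate builds of: " + ", ".join(missing))

backend = ("mps" if torch.backends.mps.is_available()
           else "cuda" if torch.cuda.is_available()
           else "cpu")
print(f"torch {torch.__version__} ready, backend: {backend}")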