musepose

Running

musepose / pose_align.py

jhj0517

add progress for pose alignment

da162f8 5 months ago

23.1 kB

	import numpy as np
	import torch
	import copy
	import cv2
	import os
	import moviepy.video.io.ImageSequenceClip
	from datetime import datetime
	import gc
	import gradio as gr

	from pose.script.dwpose import DWposeDetector, draw_pose
	from pose.script.util import size_calculate, warpAffine_kps
	from downloading_weights import download_models

	# ZeroGPU
	import spaces


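	'''
	Pose alignment: detect DWPose keypoints in each frame of the pose video,
	then rescale them with per-body-part scale parameters so that they match
	the reference image.
	img: frame from the pose video
	detector: DWPose detector
	scales: per-body-part scale parameters
	'''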
	class PoseAlignmentInference:
	    """Align the poses of a video to the body proportions of a reference image.

	    The DWPose detector is created on demand by :meth:`align_pose` and
	    released again by :meth:`release_vram` once alignment has finished.
	    """

	    def __init__(self,
	                 model_dir,
	                 output_dir):
	        """Record model/config locations and create the output directory.

	        Args:
	            model_dir: Directory containing the ``dwpose`` checkpoints.
	            output_dir: Base output directory; results are written to its
	                ``pose_alignment`` sub-directory.
	        """
	        self.detector = None
	        self.model_paths = {
	            "det_ckpt": os.path.join(model_dir, "dwpose", "yolox_l_8x8_300e_coco.pth"),
	            "pose_ckpt": os.path.join(model_dir, "dwpose", "dw-ll_ucoco_384.pth")
	        }
	        self.config_paths = {
	            "pose_config": os.path.join("pose", "config", "dwpose-l_384x288.py"),
	            "det_config": os.path.join("pose", "config", "yolox_l_8xb8-300e_coco.py"),
	        }
	        self.model_dir = model_dir
	        self.output_dir = os.path.join(output_dir, "pose_alignment")
	        # exist_ok avoids the check-then-create race of os.path.exists().
	        os.makedirs(self.output_dir, exist_ok=True)

	    @spaces.GPU(duration=120)
	    def align_pose(
	        self,
	        vidfn: str,
	        imgfn_refer: str,
	        detect_resolution: int,
	        image_resolution: int,
	        align_frame: int,
	        max_frame: int,
	        gradio_progress=gr.Progress()
	    ):
	        """Align every pose of a video to the proportions of a reference image.

	        Scale factors are estimated once, from frame ``align_frame`` of the
	        video and from the reference image, and are then applied to every
	        processed frame. Two videos are written to ``self.output_dir``: the
	        aligned pose on its own, and a side-by-side demo (reference image,
	        reference pose, aligned pose, video frame, detected video pose).

	        Args:
	            vidfn: Path of the pose video.
	            imgfn_refer: Path of the reference image.
	            detect_resolution: Forwarded to the DWPose detector.
	            image_resolution: Forwarded to the DWPose detector.
	            align_frame: Index of the frame used to estimate the scales;
	                earlier frames are skipped.
	            max_frame: Exclusive upper bound on the frame indices read.
	            gradio_progress: Progress callback, invoked as
	                ``gradio_progress(fraction, description)``.

	        Returns:
	            ``(pose_only_video_path, demo_video_path)`` as absolute paths.

	        Raises:
	            ValueError: If the video or the reference image cannot be read,
	                or if there is no frame at or after ``align_frame``.
	        """
	        download_models(model_dir=self.model_dir)
	        output_filename = "pose_temp"
	        outfn = os.path.abspath(os.path.join(self.output_dir, f'{output_filename}_demo.mp4'))
	        outfn_align_pose_video = os.path.abspath(os.path.join(self.output_dir, f'{output_filename}.mp4'))

	        video = cv2.VideoCapture(vidfn)
	        try:
	            if not video.isOpened():
	                raise ValueError(f"Could not open video: {vidfn}")

	            width = video.get(cv2.CAP_PROP_FRAME_WIDTH)
	            height = video.get(cv2.CAP_PROP_FRAME_HEIGHT)
	            fps = video.get(cv2.CAP_PROP_FPS)

	            print("height:", height)
	            print("width:", width)
	            print("fps:", fps)

	            refer_img = cv2.imread(imgfn_refer)
	            if refer_img is None:
	                # cv2.imread signals failure by returning None, not by raising.
	                raise ValueError(f"Could not read reference image: {imgfn_refer}")

	            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	            self.detector = DWposeDetector(
	                det_config=self.config_paths["det_config"],
	                det_ckpt=self.model_paths["det_ckpt"],
	                pose_config=self.config_paths["pose_config"],
	                pose_ckpt=self.model_paths["pose_ckpt"],
	                keypoints_only=False
	            )
	            detector = self.detector.to(device)

	            output_refer, pose_refer = detector(
	                refer_img,
	                detect_resolution=detect_resolution,
	                image_resolution=image_resolution,
	                output_type='cv2',
	                return_pose_dict=True,
	            )
	            body_ref_img = pose_refer['bodies']['candidate']
	            hands_ref_img = pose_refer['hands']
	            output_refer = cv2.cvtColor(output_refer, cv2.COLOR_RGB2BGR)

	            # Computation logic:
	            # 1. Resize ref and pose to the same height, each keeping its own
	            #    aspect ratio.
	            # 2. Work with the points' actual coordinates in the image.
	            # 3. In practice h is normalised to [0, 1] and w to [0, W/H].
	            # 4. dwpose output is already normalised, so h is left as is and
	            #    w is multiplied by W/H.
	            # Note: dwpose output is ordered (w, h).
	            ref_H, ref_W = refer_img.shape[0], refer_img.shape[1]
	            ref_ratio = ref_W / ref_H
	            video_ratio = width / height

	            skip_frames = align_frame
	            pose_list, video_frame_buffer, video_pose_buffer = [], [], []

	            for frame_idx in range(max_frame):
	                _, img = video.read()
	                if img is None:
	                    break
	                if frame_idx < skip_frames:
	                    continue
	                video_frame_buffer.append(img)

	                # Estimate the scale parameters from the first kept frame.
	                # This branch always runs before align_args/offset are used.
	                if frame_idx == skip_frames:
	                    _, pose_1st_img = detector(img, detect_resolution, image_resolution, output_type='cv2', return_pose_dict=True)
	                    body_1st_img = pose_1st_img['bodies']['candidate']
	                    hands_1st_img = pose_1st_img['hands']

	                    # h unchanged, w scaled to the true aspect ratio.
	                    body_ref_img[:, 0] = body_ref_img[:, 0] * ref_ratio
	                    hands_ref_img[:, :, 0] = hands_ref_img[:, :, 0] * ref_ratio
	                    body_1st_img[:, 0] = body_1st_img[:, 0] * video_ratio
	                    hands_1st_img[:, :, 0] = hands_1st_img[:, :, 0] * video_ratio

	                    align_args = self._estimate_align_args(
	                        body_ref_img, hands_ref_img, body_1st_img, hands_1st_img
	                    )

	                    # Centre offset (the offset of key point 1).
	                    offset = body_ref_img[1] - body_1st_img[1]

	                # Pose align. The detector is run again on purpose: the
	                # arrays used for estimation above were rescaled in place.
	                pose_img, pose_ori = detector(img, detect_resolution, image_resolution, output_type='cv2', return_pose_dict=True)
	                video_pose_buffer.append(pose_img)
	                pose = self.align_img(img, pose_ori, align_args, detect_resolution, image_resolution)

	                # Add centre offset.
	                # NOTE(review): this also shifts the -1 "missing" markers set
	                # by align_img -- confirm draw_pose tolerates that.
	                pose['bodies']['candidate'] = pose['bodies']['candidate'] + offset
	                pose['hands'] = pose['hands'] + offset
	                pose['faces'] = pose['faces'] + offset

	                # h unchanged, w scaled back from absolute coordinates to
	                # 0-1; this must use the reference's coordinate system.
	                pose['bodies']['candidate'][:, 0] = pose['bodies']['candidate'][:, 0] / ref_ratio
	                pose['hands'][:, :, 0] = pose['hands'][:, :, 0] / ref_ratio
	                pose['faces'][:, :, 0] = pose['faces'][:, :, 0] / ref_ratio
	                pose_list.append(pose)

	            if not pose_list:
	                raise ValueError(
	                    f"No frames to align: the video yielded no frame at or after "
	                    f"align_frame={align_frame} (max_frame={max_frame})"
	                )

	            # Stack the per-frame results.
	            body_seq = np.stack([p['bodies']['candidate'][:18] for p in pose_list], axis=0)
	            body_seq_subset = np.stack([p['bodies']['subset'][:1] for p in pose_list], axis=0)
	            hands_seq = np.stack([p['hands'][:2] for p in pose_list], axis=0)
	            faces_seq = np.stack([p['faces'][:1] for p in pose_list], axis=0)

	            # Concatenate and paint results.
	            H = 768  # paint height
	            # "// 2 * 2" rounds the widths down to an even number.
	            W1 = int((H / ref_H * ref_W) // 2 * 2)
	            W2 = int((H / height * width) // 2 * 2)

	            # The reference panels are identical for every frame, so they
	            # are built once instead of once per frame.
	            ref_img = cv2.resize(cv2.cvtColor(refer_img, cv2.COLOR_RGB2BGR), (W1, H))
	            ref_pose = cv2.resize(output_refer, (W1, H))
	            canvas_height = int(height * 1024 / width)

	            frame_count = len(body_seq)
	            result_demo = []
	            result_pose_only = []
	            per_frame = zip(body_seq, body_seq_subset, hands_seq, faces_seq,
	                            video_frame_buffer, video_pose_buffer)
	            for idx, (body, subset, hand_kps, face_kps, frame, frame_pose) in enumerate(per_frame):
	                gradio_progress(idx / frame_count, "Aligning Pose.... After this, go to Step 2.")

	                pose_t = {
	                    "bodies": {"candidate": body, "subset": subset},
	                    "hands": hand_kps,
	                    "faces": face_kps,
	                }
	                output_transformed = draw_pose(
	                    pose_t,
	                    canvas_height,
	                    1024,
	                    draw_face=False,
	                )
	                output_transformed = cv2.cvtColor(output_transformed, cv2.COLOR_BGR2RGB)
	                output_transformed = cv2.resize(output_transformed, (W1, H))

	                video_frame = cv2.resize(frame, (W2, H))
	                video_pose = cv2.resize(frame_pose, (W2, H))

	                res = np.concatenate([ref_img, ref_pose, output_transformed, video_frame, video_pose], axis=1)
	                result_demo.append(res)
	                result_pose_only.append(output_transformed)

	            print(f"pose_list len: {len(pose_list)}")
	            clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(result_demo, fps=fps)
	            clip.write_videofile(outfn, fps=fps)
	            clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(result_pose_only, fps=fps)
	            clip.write_videofile(outfn_align_pose_video, fps=fps)
	            print('pose align done')
	            return outfn_align_pose_video, outfn
	        finally:
	            # Free the capture handle and the detector on failure as well.
	            video.release()
	            self.release_vram()

	    @staticmethod
	    def _estimate_align_args(body_ref, hands_ref, body_1st, hands_1st):
	        """Estimate per-body-part scale factors as reference / video ratios.

	        All inputs are expected to have their w coordinate already stretched
	        to the true aspect ratio of their image.

	        Args:
	            body_ref: Body keypoints of the reference image.
	            hands_ref: Hand keypoints of the reference image.
	            body_1st: Body keypoints of the video frame used for alignment.
	            hands_1st: Hand keypoints of that video frame.

	        Returns:
	            dict mapping ``scale_*`` names to scale factors. NaN values
	            (0/0) are replaced by 1; infinite values are left in place and
	            are resolved by :meth:`align_img`.
	        """
	        def bone_ratio(a, b):
	            # Reference length over video length of the segment a-b.
	            return (np.linalg.norm(body_ref[a] - body_ref[b])
	                    / np.linalg.norm(body_1st[a] - body_1st[b]))

	        def mean_ratio(first, second):
	            # Average of two segments' ratios (used for paired limbs).
	            return (bone_ratio(*first) + bone_ratio(*second)) / 2

	        align_args = dict()
	        align_args["scale_neck"] = bone_ratio(0, 1)  # align / pose = ref / 1st
	        align_args["scale_face"] = bone_ratio(16, 17)
	        align_args["scale_shoulder"] = bone_ratio(2, 5)
	        align_args["scale_arm_upper"] = mean_ratio((2, 3), (5, 6))
	        align_args["scale_arm_lower"] = mean_ratio((3, 4), (6, 7))

	        # Hand: distances from hand keypoint 0 to keypoints 1, 5, 9, 13, 17
	        # of both hands; segments of zero length in the video are ignored.
	        hand_pairs = [(h, j) for h in (0, 1) for j in (1, 5, 9, 13, 17)]
	        dist_1st = np.array(
	            [np.linalg.norm(hands_1st[h, 0] - hands_1st[h, j]) for h, j in hand_pairs],
	            dtype=np.float64,
	        )
	        dist_ref = np.array(
	            [np.linalg.norm(hands_ref[h, 0] - hands_ref[h, j]) for h, j in hand_pairs],
	            dtype=np.float64,
	        )
	        usable = dist_1st != 0
	        count = int(np.count_nonzero(usable))
	        if count != 0:
	            ratio = sum(dist_ref[usable] / dist_1st[usable])
	            align_args["scale_hand"] = (ratio / count + align_args["scale_arm_upper"] + align_args["scale_arm_lower"]) / 3
	        else:
	            align_args["scale_hand"] = (align_args["scale_arm_upper"] + align_args["scale_arm_lower"]) / 2

	        # Body: key point 1 to the midpoint of key points 8 and 11.
	        dist_1st_body = np.linalg.norm(body_1st[1] - (body_1st[8] + body_1st[11]) / 2)
	        dist_ref_body = np.linalg.norm(body_ref[1] - (body_ref[8] + body_ref[11]) / 2)
	        align_args["scale_body_len"] = dist_ref_body / dist_1st_body

	        align_args["scale_leg_upper"] = mean_ratio((8, 9), (11, 12))
	        align_args["scale_leg_lower"] = mean_ratio((9, 10), (12, 13))

	        # A NaN scale carries no information; leave that part unscaled.
	        return {key: (1 if np.isnan(value) else value) for key, value in align_args.items()}

	    def release_vram(self):
	        """Drop the detector and hand cached CUDA memory back to the driver."""
	        self.detector = None
	        # Collect first so that memory held by unreachable objects is
	        # actually free by the time the CUDA cache is emptied.
	        gc.collect()
	        if torch.cuda.is_available():
	            torch.cuda.empty_cache()

	    @staticmethod
	    def align_img(img, pose_ori, scales, detect_resolution, image_resolution):
	        """Rescale the body parts of one detected pose.

	        Computation logic:
	        0. The transform is absolute and always keeps the body centre
	           point ``body_pose[1]`` fixed.
	        1. Resize ref and pose to the same height, each keeping its own
	           aspect ratio.
	        2. Work with the points' actual coordinates in the image.
	        3. In practice h is normalised to [0, 1] and w to [0, W/H].
	        4. dwpose output is already normalised, so h is left as is and w is
	           multiplied by W/H.
	        Note: dwpose output is ordered (w, h).

	        Args:
	            img: The video frame; only its shape is used.
	            pose_ori: Pose dict with ``['bodies']['candidate']``, ``['hands']``
	                and ``['faces']`` arrays. It is not modified.
	            scales: dict with the nine ``scale_*`` factors. Infinite factors
	                are replaced by the mean of the non-infinite ones (or by 1.0
	                when there are none).
	            detect_resolution: Unused; kept for interface compatibility.
	            image_resolution: Unused; kept for interface compatibility.

	        Returns:
	            A deep copy of ``pose_ori`` whose body and hand keypoints are
	            rescaled. Face keypoints only get the aspect-ratio correction.
	            Points that were -1 in ``pose_ori`` stay -1, and NaNs become -1.
	        """
	        body_pose = copy.deepcopy(pose_ori['bodies']['candidate'])
	        hands = copy.deepcopy(pose_ori['hands'])
	        faces = copy.deepcopy(pose_ori['faces'])

	        # h unchanged, w scaled to the true aspect ratio.
	        H_in, W_in, _ = img.shape
	        video_ratio = W_in / H_in
	        body_pose[:, 0] = body_pose[:, 0] * video_ratio
	        hands[:, :, 0] = hands[:, :, 0] * video_ratio
	        faces[:, :, 0] = faces[:, :, 0] * video_ratio

	        # Scales of the body parts. An infinite scale (zero-length segment
	        # in the video) is replaced by the mean of the usable ones, and the
	        # corrected values are the ones applied below.
	        scale_keys = (
	            "scale_neck", "scale_face", "scale_shoulder",
	            "scale_arm_upper", "scale_arm_lower", "scale_hand",
	            "scale_body_len", "scale_leg_upper", "scale_leg_lower",
	        )
	        scale_list = [scales[key] for key in scale_keys]
	        usable = [value for value in scale_list if not np.isinf(value)]
	        fallback = sum(usable) / len(usable) if usable else 1.0
	        (scale_neck, scale_face, scale_shoulder,
	         scale_arm_upper, scale_arm_lower, scale_hand,
	         scale_body_len, scale_leg_upper, scale_leg_lower) = [
	            fallback if np.isinf(value) else value for value in scale_list
	        ]

	        def scale_about(points, center, scale):
	            # Scale `points` about `center`, with no rotation.
	            matrix = cv2.getRotationMatrix2D((center[0], center[1]), 0, scale)
	            return warpAffine_kps(points, matrix)

	        # (parent, child, scale) segments. Each child is rebuilt from its
	        # already-moved parent, so a parent must come before its children.
	        limb_chain = (
	            (2, 3, scale_arm_upper), (3, 4, scale_arm_lower),      # arm left
	            (5, 6, scale_arm_upper), (6, 7, scale_arm_lower),      # arm right
	            (8, 9, scale_leg_upper), (9, 10, scale_leg_lower),     # leg left
	            (11, 12, scale_leg_upper), (12, 13, scale_leg_lower),  # leg right
	        )

	        # Offsets of each part, taken before anything is moved.
	        head_offset = body_pose[[14, 15, 16, 17], :] - body_pose[[0], :]
	        limb_offsets = [
	            body_pose[[child], :] - body_pose[[parent], :]
	            for parent, child, _ in limb_chain
	        ]
	        hand_left_offset = hands[1, :, :] - body_pose[[4], :]
	        hand_right_offset = hands[0, :, :] - body_pose[[7], :]

	        # Neck, about the body centre.
	        body_pose[[0], :] = scale_about(body_pose[[0], :], body_pose[1], scale_neck)

	        # Key points 14-17, about the moved key point 0.
	        body_pose[[14, 15, 16, 17], :] = scale_about(
	            head_offset + body_pose[[0], :], body_pose[0], scale_face
	        )

	        # Shoulders and body length, both about the body centre.
	        body_pose[[2, 5], :] = scale_about(body_pose[[2, 5], :], body_pose[1], scale_shoulder)
	        body_pose[[8, 11], :] = scale_about(body_pose[[8, 11], :], body_pose[1], scale_body_len)

	        # Arms and legs.
	        for (parent, child, scale), rel in zip(limb_chain, limb_offsets):
	            body_pose[[child], :] = scale_about(
	                rel + body_pose[[parent], :], body_pose[parent], scale
	            )

	        # Hands, about their moved wrists.
	        hands[1, :, :] = scale_about(hand_left_offset + body_pose[[4], :], body_pose[4], scale_hand)
	        hands[0, :, :] = scale_about(hand_right_offset + body_pose[[7], :], body_pose[7], scale_hand)

	        # None part: points marked -1 in the input stay -1.
	        body_pose[pose_ori['bodies']['candidate'] == -1.] = -1.
	        hands[pose_ori['hands'] == -1.] = -1.
	        if np.isnan(hands).any():
	            print('nan')
	        faces[pose_ori['faces'] == -1.] = -1.

	        # Last check: nan -> -1.
	        body_pose = np.nan_to_num(body_pose, nan=-1.)
	        hands = np.nan_to_num(hands, nan=-1.)
	        faces = np.nan_to_num(faces, nan=-1.)

	        pose_align = copy.deepcopy(pose_ori)
	        pose_align['bodies']['candidate'] = body_pose
	        pose_align['hands'] = hands
	        pose_align['faces'] = faces

	        return pose_align