| | import os |
| |
|
| | from skimage import io,img_as_float32 |
| | import cv2 |
| | import torch |
| | import numpy as np |
| | import subprocess |
| | import pandas |
| | from models.audio2pose import audio2poseLSTM |
| | from scipy.io import wavfile |
| | import python_speech_features |
| | import pyworld |
| | import config |
| | import json |
| | from scipy.interpolate import interp1d |
| |
|
def inter_pitch(y,y_flag):
    """Fill the unvoiced (zero-pitch) gaps of a pitch contour by linear interpolation.

    Args:
        y: 1-D array of per-frame pitch values; modified in place and returned.
        y_flag: 1-D array of the same length; non-zero marks a voiced frame,
            0 marks an unvoiced frame whose pitch value should be filled in.

    Returns:
        The same array ``y`` with unvoiced runs replaced: interior gaps are
        linearly interpolated between the surrounding voiced values, a leading
        gap is back-filled with the first voiced value, and a trailing gap is
        extended with the last voiced value.
    """
    frame_num = y.shape[0]
    i = 0
    last = -1  # index of the most recent voiced frame seen so far (-1 = none yet)
    while(i<frame_num):
        if y_flag[i] == 0:
            # Scan forward to the end of this unvoiced run.
            while True:
                if y_flag[i]==0:
                    if i == frame_num-1:
                        # Run reaches the end of the signal: extend the last
                        # voiced value (if any) over the trailing gap.
                        if last !=-1:
                            y[last+1:] = y[last]
                        i+=1
                        break
                    i+=1
                else:
                    break
            if i >= frame_num:
                break
            elif last == -1:
                # Leading gap with no voiced frame before it: back-fill the
                # whole prefix with the first voiced value.
                y[:i] = y[i]
            else:
                # Interior gap: linearly interpolate between the voiced
                # endpoints y[last] and y[i]; only the strictly interior
                # samples fy_new[1:-1] are written back.
                inter_num = i-last+1
                fy = np.array([y[last],y[i]])
                fx = np.linspace(0, 1, num=2)
                f = interp1d(fx,fy)
                fx_new = np.linspace(0,1,inter_num)
                fy_new = f(fx_new)
                y[last+1:i] = fy_new[1:-1]
                last = i
                i+=1

        else:
            last = i
            i+=1
    return y
| |
|
| |
|
def load_ckpt(checkpoint_path, generator = None, kp_detector = None, ph2kp = None):
    """Restore model weights from a checkpoint file.

    For each optional model argument that is not None, its state dict is
    loaded from the matching key of the checkpoint dictionary.

    Args:
        checkpoint_path: path to a torch checkpoint containing some of the
            keys 'ph2kp', 'generator', 'kp_detector'.
        generator, kp_detector, ph2kp: optional torch modules to populate.
    """
    state = torch.load(checkpoint_path)
    for key, model in (("ph2kp", ph2kp),
                       ("generator", generator),
                       ("kp_detector", kp_detector)):
        if model is not None:
            model.load_state_dict(state[key])
| |
|
def get_img_pose(img_path):
    """Extract the head pose of a face image using the OpenFace CLI extractor.

    Runs the external OpenFace pose extractor on ``img_path``, then parses
    the CSV it writes and returns a float32 array of six values:
    [pose_Rx, pose_Ry, pose_Rz, pose_Tx, pose_Ty, pose_Tz]
    (rotation then translation; exact units depend on the OpenFace build —
    TODO confirm against the installed OpenFace version).
    """
    processor = config.OPENFACE_POSE_EXTRACTOR_PATH

    tmp_dir = "samples/tmp_dir"
    os.makedirs(tmp_dir, exist_ok=True)
    # List-form argv (shell=False): img_path cannot be shell-injected.
    subprocess.call([processor, "-f", img_path, "-out_dir", tmp_dir, "-pose"])

    # OpenFace names its output CSV after the input file. Use splitext instead
    # of the original "[:-4]" slice so extensions of any length work
    # (e.g. ".jpeg" would previously leave a stray character in the name).
    base = os.path.splitext(os.path.basename(img_path))[0]
    csv_file = os.path.join(tmp_dir, base + ".csv")
    pos_data = pandas.read_csv(csv_file)
    # A single input image yields a single row; read row 0.
    i = 0
    pose = [pos_data["pose_Rx"][i], pos_data["pose_Ry"][i], pos_data["pose_Rz"][i],
            pos_data["pose_Tx"][i], pos_data["pose_Ty"][i], pos_data["pose_Tz"][i]]

    pose = np.array(pose, dtype=np.float32)
    return pose
| |
|
def read_img(path):
    """Load an image as a (1, 3, 256, 256) float32 torch tensor in [0, 1].

    The image is read as RGB (any alpha channel is dropped), resized to
    256x256, converted to float32, and reordered to channel-first layout
    with a leading batch dimension.
    """
    rgb = io.imread(path)[:, :, :3]          # keep only the RGB channels
    rgb = cv2.resize(rgb, (256, 256))

    chw = np.array(img_as_float32(rgb)).transpose((2, 0, 1))
    return torch.from_numpy(chw).unsqueeze(0)
| |
|
| |
|
def parse_phoneme_file(phoneme_path,use_index = True):
    """Expand a forced-alignment JSON file into one phoneme per video frame.

    Args:
        phoneme_path: JSON list of words, each with a "phones" list whose
            entries carry "ph" (phoneme label) and "ed" (end time; the
            ed/100*25 conversion below implies units of 10 ms — TODO confirm
            against the aligner that produced the file).
        use_index: when True, map phoneme labels to integer ids via the
            "phindex.json" table in the current working directory.

    Returns:
        {"phone_list": [...]} with one entry per 25-fps video frame.
    """
    with open(phoneme_path,'r') as f:
        result_text = json.load(f)
    # Total frame count: last phoneme end time (10 ms units) -> seconds -> 25 fps.
    frame_num = int(result_text[-1]['phones'][-1]['ed']/100*25)
    phoneset_list = []
    index = 0

    word_len = len(result_text)
    word_index = 0
    phone_index = 0
    cur_phone_list = result_text[0]["phones"]
    phone_len = len(cur_phone_list)
    cur_end = cur_phone_list[0]["ed"]

    phone_list = []

    phoneset_list.append(cur_phone_list[0]["ph"])
    i = 0
    while i < frame_num:
        # One 25-fps frame spans 4 ticks of 10 ms; frame i belongs to the
        # current phoneme while its start tick i*4 precedes the phoneme's end.
        if i * 4 < cur_end:
            phone_list.append(cur_phone_list[phone_index]["ph"])
            i += 1
        else:
            # Current phoneme exhausted: advance to the next phoneme.
            phone_index += 1
            if phone_index >= phone_len:
                # Current word exhausted: advance to the next word.
                word_index += 1
                if word_index >= word_len:
                    # Past the last word: pad the remaining frames with the
                    # final phoneme of the final word.
                    phone_list.append(cur_phone_list[-1]["ph"])
                    i += 1
                else:
                    phone_index = 0
                    cur_phone_list = result_text[word_index]["phones"]
                    phone_len = len(cur_phone_list)
                    cur_end = cur_phone_list[phone_index]["ed"]
                    phoneset_list.append(cur_phone_list[phone_index]["ph"])
                    index += 1
            else:
                # Next phoneme within the same word.
                cur_end = cur_phone_list[phone_index]["ed"]
                phoneset_list.append(cur_phone_list[phone_index]["ph"])
                index += 1

    # Map phoneme labels to integer ids using the project's lookup table.
    with open("phindex.json") as f:
        ph2index = json.load(f)
    if use_index:
        phone_list = [ph2index[p] for p in phone_list]
    saves = {"phone_list": phone_list}

    return saves
| |
|
def get_audio_feature_from_audio(audio_path):
    """Extract per-frame acoustic features from a wav file.

    Returns a 2-D array whose columns are, in order: MFCC coefficients,
    log filterbank energies, the interpolated pitch contour, and a
    voiced/unvoiced flag column. All streams are truncated to the length
    of the shortest one before concatenation.
    """
    sample_rate, audio = wavfile.read(audio_path)
    # Collapse stereo to mono by picking one channel (original heuristic
    # kept as-is: channel 1 when channel 0 dips to zero or below).
    if len(audio.shape) == 2:
        channel = 1 if np.min(audio[:, 0]) <= 0 else 0
        audio = audio[:, channel]

    # Remove DC offset, then peak-normalize to [-1, 1].
    audio = audio - np.mean(audio)
    audio = audio / np.max(np.abs(audio))

    mfcc_feat = python_speech_features.mfcc(audio, sample_rate)
    fbank_feat = python_speech_features.logfbank(audio, sample_rate)
    pitch, _ = pyworld.harvest(audio, sample_rate, frame_period=10)
    voiced = (pitch == 0.0) ^ 1          # 1 on voiced frames, 0 on unvoiced
    pitch = inter_pitch(pitch, voiced)   # fill unvoiced gaps by interpolation
    pitch = np.expand_dims(pitch, axis=1)
    voiced = np.expand_dims(voiced, axis=1)

    n = np.min([mfcc_feat.shape[0], fbank_feat.shape[0], pitch.shape[0]])
    return np.concatenate([mfcc_feat[:n], fbank_feat[:n], pitch[:n], voiced[:n]], axis=1)
| |
|
def get_pose_from_audio(img, audio, audio2pose):
    """Predict a per-frame head-pose sequence from audio features.

    Args:
        img: source image tensor fed to the pose generator.
        audio: per-frame audio feature array; consumed in groups of 4
            feature frames per video frame.
        audio2pose: path to the checkpoint holding the generator weights.

    Returns:
        numpy array of poses, rescaled from the network's [-1, 1] output
        range to the hard-coded value ranges below.
    """
    num_frame = len(audio) // 4

    # Bounds used to undo the network's normalization: first three entries
    # are rotations, last three translations (values taken from training).
    minv = np.array([-0.6, -0.6, -0.6, -128.0, -128.0, 128.0], dtype=np.float32)
    maxv = np.array([0.6, 0.6, 0.6, 128.0, 128.0, 384.0], dtype=np.float32)

    generator = audio2poseLSTM().cuda().eval()
    ckpt_para = torch.load(audio2pose)
    generator.load_state_dict(ckpt_para["generator"])
    generator.eval()

    # Group feature frames into chunks of 4 and add a batch dimension.
    audio_seq = [audio[4 * k:4 * k + 4] for k in range(num_frame)]
    audio = torch.from_numpy(np.array(audio_seq, dtype=np.float32)).unsqueeze(0).cuda()

    x = {"img": img, "audio": audio}
    poses = generator(x)

    poses = poses.cpu().data.numpy()[0]
    # Map [-1, 1] back to [minv, maxv] per pose dimension.
    poses = (poses + 1) / 2 * (maxv - minv) + minv

    return poses
| |
|
| |
|