saksham209 committed
Commit 397bbeb
1 Parent(s): 6eab4f7

Upload 6 files
binarizer.py ADDED
@@ -0,0 +1,91 @@
+ import os
+ import numpy as np
+ import torch
+ from tqdm import trange
+ import pickle
+ from copy import deepcopy
+
+ from data_util.face3d_helper import Face3DHelper
+ from utils.commons.indexed_datasets import IndexedDataset, IndexedDatasetBuilder
+
+
+ def load_video_npy(fn):
+     assert fn.endswith(".npy")
+     ret_dict = np.load(fn, allow_pickle=True).item()
+     video_dict = {
+         'coeff': ret_dict['coeff'],  # [T, h]
+         'lm68': ret_dict['lm68'],  # [T, 68, 2]
+         'lm5': ret_dict['lm5'],  # [T, 5, 2]
+     }
+     return video_dict
+
+ def cal_lm3d_in_video_dict(video_dict, face3d_helper):
+     coeff = torch.from_numpy(video_dict['coeff']).float()
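+     # Deep3DRecon-style 3DMM coefficients are typically laid out as
+     # [id(80), exp(64), tex(80), angles(3), gamma(27), trans(3)] = 257 dims;
+     # the slices below assume that layout.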
+     identity = coeff[:, 0:80]
+     exp = coeff[:, 80:144]
+     idexp_lm3d = face3d_helper.reconstruct_idexp_lm3d(identity, exp).cpu().numpy()
+     video_dict['idexp_lm3d'] = idexp_lm3d
+
+ def load_audio_npy(fn):
+     assert fn.endswith(".npy")
+     ret_dict = np.load(fn, allow_pickle=True).item()
+     audio_dict = {
+         "mel": ret_dict['mel'],  # [T, 80]
+         "f0": ret_dict['f0'],  # [T,]
+     }
+     return audio_dict
+
+
+ if __name__ == '__main__':
+     face3d_helper = Face3DHelper(use_gpu=False)
+
+     import glob, tqdm
+     prefixs = ['val', 'train']
+     binarized_ds_path = "data/binary/lrs3"
+     os.makedirs(binarized_ds_path, exist_ok=True)
+     for prefix in prefixs:
+         databuilder = IndexedDatasetBuilder(os.path.join(binarized_ds_path, prefix), gzip=False)
+         raw_base_dir = '/home/yezhenhui/datasets/raw/lrs3_raw'
+         spk_ids = sorted([dir_name.split("/")[-1] for dir_name in glob.glob(raw_base_dir + "/*")])
+         spk_id2spk_idx = {spk_id: i for i, spk_id in enumerate(spk_ids)}
+         np.save(os.path.join(binarized_ds_path, "spk_id2spk_idx.npy"), spk_id2spk_idx, allow_pickle=True)
+         mp4_names = glob.glob(raw_base_dir + "/*/*.mp4")
+         cnt = 0
+         for i, mp4_name in tqdm.tqdm(enumerate(mp4_names), total=len(mp4_names)):
+             # every 100th clip (i % 100 == 0) goes to the val set, the rest to train
+             if prefix == 'train':
+                 if i % 100 == 0:
+                     continue
+             else:
+                 if i % 100 != 0:
+                     continue
+             lst = mp4_name.split("/")
+             spk_id = lst[-2]
+             clip_id = lst[-1][:-4]
+             audio_npy_name = os.path.join(raw_base_dir, spk_id, clip_id + "_audio.npy")
+             hubert_npy_name = os.path.join(raw_base_dir, spk_id, clip_id + "_hubert.npy")
+             video_npy_name = os.path.join(raw_base_dir, spk_id, clip_id + "_coeff_pt.npy")
+             if (not os.path.exists(audio_npy_name)) or (not os.path.exists(video_npy_name)):
+                 print("Skip item: audio or video npy not found.")
+                 continue
+             if not os.path.exists(hubert_npy_name):
+                 print("Skip item: hubert npy not found.")
+                 continue
+             audio_dict = load_audio_npy(audio_npy_name)
+             hubert = np.load(hubert_npy_name)
+             video_dict = load_video_npy(video_npy_name)
+             cal_lm3d_in_video_dict(video_dict, face3d_helper)
+             mel = audio_dict['mel']
+             if mel.shape[0] < 64:  # the clip is shorter than 1.28 s (64 frames x 20 ms)
+                 print("Skip item: too short.")
+                 continue
+             audio_dict.update(video_dict)
+             audio_dict['spk_id'] = spk_id
+             audio_dict['spk_idx'] = spk_id2spk_idx[spk_id]
+             audio_dict['item_id'] = spk_id + "_" + clip_id
+
+             audio_dict['hubert'] = hubert  # [T_x, hid=1024]
+             databuilder.add_item(audio_dict)
+             cnt += 1
+         databuilder.finalize()
+         print(f"{prefix} set has {cnt} samples!")
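+ # Reading the binarized set back (a sketch, assuming IndexedDataset mirrors the
+ # builder's API in utils.commons.indexed_datasets):
+ #   ds = IndexedDataset(os.path.join(binarized_ds_path, 'train'))
+ #   sample = ds[0]  # dict with keys: mel, f0, coeff, lm68, lm5, idexp_lm3d, hubert, spk_id, ...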
process_audio_hubert.py ADDED
@@ -0,0 +1,87 @@
+ from transformers import Wav2Vec2Processor, HubertModel
+ import soundfile as sf
+ import numpy as np
+ import torch
+
+ print("Loading the Wav2Vec2 Processor...")
+ wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
+ print("Loading the HuBERT Model...")
+ hubert_model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
+
+
+ def get_hubert_from_16k_wav(wav_16k_name):
+     speech_16k, _ = sf.read(wav_16k_name)
+     hubert = get_hubert_from_16k_speech(speech_16k)
+     return hubert
+
+ @torch.no_grad()
+ def get_hubert_from_16k_speech(speech, device="cuda:0"):
+     global hubert_model
+     hubert_model = hubert_model.to(device)
+     if speech.ndim == 2:
+         speech = speech[:, 0]  # [T, 2] ==> [T,]
+     input_values_all = wav2vec2_processor(speech, return_tensors="pt", sampling_rate=16000).input_values  # [1, T]
+     input_values_all = input_values_all.to(device)
+     # For a long audio sequence, memory limits keep us from processing it in one run.
+     # HuBERT's CNN front-end has strides [5,2,2,2,2,2,2] (overall stride 320) and
+     # kernels [10,3,3,3,3,2,2], so 400 samples are needed to produce one time step;
+     # the CNN is equivalent to one big Conv1D with kernel k=400 and stride s=320.
+     # The number of output time steps is therefore T = floor((t - k) / s) + 1.
+     # To avoid double-counting frames, each clip is given length k + s*(N-1), where N
+     # is the number of steps expected from that clip; the next clip then starts at
+     # stride * N, i.e. it rolls back (k - s) samples from the previous clip's end.
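+     # Example: 1 s of 16 kHz audio (t=16000) gives
+     # T = floor((16000 - 400) / 320) + 1 = 49 steps, i.e. ~50 feature frames per second.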
32
+ kernel = 400
33
+ stride = 320
34
+ clip_length = stride * 1000
35
+ num_iter = input_values_all.shape[1] // clip_length
36
+ expected_T = (input_values_all.shape[1] - (kernel-stride)) // stride
37
+ res_lst = []
38
+ for i in range(num_iter):
39
+ if i == 0:
40
+ start_idx = 0
41
+ end_idx = clip_length - stride + kernel
42
+ else:
43
+ start_idx = clip_length * i
44
+ end_idx = start_idx + (clip_length - stride + kernel)
45
+ input_values = input_values_all[:, start_idx: end_idx]
46
+ hidden_states = hubert_model.forward(input_values).last_hidden_state # [B=1, T=pts//320, hid=1024]
47
+ res_lst.append(hidden_states[0])
48
+ if num_iter > 0:
49
+ input_values = input_values_all[:, clip_length * num_iter:]
50
+ else:
51
+ input_values = input_values_all
52
+ # if input_values.shape[1] != 0:
53
+ if input_values.shape[1] >= kernel: # if the last batch is shorter than kernel_size, skip it
54
+ hidden_states = hubert_model(input_values).last_hidden_state # [B=1, T=pts//320, hid=1024]
55
+ res_lst.append(hidden_states[0])
56
+ ret = torch.cat(res_lst, dim=0).cpu() # [T, 1024]
57
+ # assert ret.shape[0] == expected_T
58
+ assert abs(ret.shape[0] - expected_T) <= 1
59
+ if ret.shape[0] < expected_T:
60
+ ret = torch.nn.functional.pad(ret, (0,0,0,expected_T-ret.shape[0]))
61
+ else:
62
+ ret = ret[:expected_T]
63
+ return ret
64
+
65
+
66
+ if __name__ == '__main__':
67
+ ### Process Single Long Audio for NeRF dataset
68
+ # person_id = 'May'
69
+ # wav_16k_name = f"data/processed/videos/{person_id}/aud.wav"
70
+ # hubert_npy_name = f"data/processed/videos/{person_id}/hubert.npy"
71
+ # speech_16k, _ = sf.read(wav_16k_name)
72
+ # hubert_hidden = get_hubert_from_16k_speech(speech_16k)
73
+ # np.save(hubert_npy_name, hubert_hidden.detach().numpy())
74
+
75
+ ### Process short audio clips for LRS3 dataset
76
+ import glob, os, tqdm
77
+ lrs3_dir = '/home/yezhenhui/datasets/raw/lrs3_raw/'
78
+ wav_16k_names = glob.glob(os.path.join(lrs3_dir, '*/*.wav'))
79
+ for wav_16k_name in tqdm.tqdm(wav_16k_names, total=len(wav_16k_names)):
80
+ spk_id = wav_16k_name.split("/")[-2]
81
+ clip_id = wav_16k_name.split("/")[-1][:-4]
82
+ out_name = os.path.join(lrs3_dir, spk_id, clip_id+'_hubert.npy')
83
+ if os.path.exists(out_name):
84
+ continue
85
+ speech_16k, _ = sf.read(wav_16k_name)
86
+ hubert_hidden = get_hubert_from_16k_speech(speech_16k)
87
+ np.save(out_name, hubert_hidden.detach().numpy())
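+ # Note: with the 320-sample stride at 16 kHz, these HuBERT features come out at
+ # ~50 frames/s, matching the mel/f0 frames (hop_size=320) of process_audio_mel_f0.py 1:1.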
process_audio_mel_f0.py ADDED
@@ -0,0 +1,98 @@
+ import numpy as np
+ import torch
+ import glob
+ import os
+ import tqdm
+ import librosa
+ import parselmouth
+ from utils.commons.pitch_utils import f0_to_coarse
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
+
+
+ def librosa_pad_lr(x, fsize, fshift, pad_sides=1):
+     '''compute right padding (final frame) or both-sides padding (first and final frames)
+     '''
+     assert pad_sides in (1, 2)
+     pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0]
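+     # i.e. pad the signal up to the next multiple of fshift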
+     if pad_sides == 1:
+         return 0, pad
+     else:
+         return pad // 2, pad // 2 + pad % 2
+
+ def extract_mel_from_fname(wav_path,
+                            fft_size=512,
+                            hop_size=320,
+                            win_length=512,
+                            window="hann",
+                            num_mels=80,
+                            fmin=80,
+                            fmax=7600,
+                            eps=1e-6,
+                            sample_rate=16000,
+                            min_level_db=-100):
+     if isinstance(wav_path, str):
+         wav, _ = librosa.core.load(wav_path, sr=sample_rate)
+     else:
+         wav = wav_path
+
+     # get amplitude spectrogram
+     x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
+                           win_length=win_length, window=window, center=False)
+     spc = np.abs(x_stft)  # (n_bins, T)
+
+     # get mel basis
+     fmin = 0 if fmin == -1 else fmin
+     fmax = sample_rate / 2 if fmax == -1 else fmax
+     mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels, fmin=fmin, fmax=fmax)
+     mel = mel_basis @ spc
+
+     mel = np.log10(np.maximum(eps, mel))  # (n_mel_bins, T)
+     mel = mel.T  # -> (T, n_mel_bins)
+
+     l_pad, r_pad = librosa_pad_lr(wav, fft_size, hop_size, 1)
+     wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
+
+     return wav.T, mel
+
+ def extract_f0_from_wav_and_mel(wav, mel,
+                                 hop_size=320,
+                                 audio_sample_rate=16000,
+                                 ):
+     time_step = hop_size / audio_sample_rate * 1000
+     f0_min = 80
+     f0_max = 750
+     f0 = parselmouth.Sound(wav, audio_sample_rate).to_pitch_ac(
+         time_step=time_step / 1000, voicing_threshold=0.6,
+         pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
+
+     # align the f0 track to the mel length (their frame counts can differ slightly)
+     delta_l = len(mel) - len(f0)
+     assert np.abs(delta_l) <= 8
+     if delta_l > 0:
+         f0 = np.concatenate([f0, [f0[-1]] * delta_l], 0)
+     f0 = f0[:len(mel)]
+     pitch_coarse = f0_to_coarse(f0)
+     return f0, pitch_coarse
+
+ def extract_mel_f0_from_fname(fname, out_name=None):
+     assert fname.endswith(".wav")
+     if out_name is None:
+         out_name = fname[:-4] + '_audio.npy'
+
+     wav, mel = extract_mel_from_fname(fname)
+     f0, f0_coarse = extract_f0_from_wav_and_mel(wav, mel)
+     out_dict = {
+         "mel": mel,  # [T, 80]
+         "f0": f0,  # [T,]
+     }
+     np.save(out_name, out_dict)
+     return True
+
+ if __name__ == '__main__':
+     import os, glob
+     lrs3_dir = "/home/yezhenhui/datasets/raw/lrs3_raw"
+     wav_name_pattern = os.path.join(lrs3_dir, "*/*.wav")
+     wav_names = glob.glob(wav_name_pattern)
+     wav_names = sorted(wav_names)
+     for _ in multiprocess_run_tqdm(extract_mel_f0_from_fname, args=wav_names, num_workers=32, desc='extracting mel and f0'):
+         pass
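+ # Each <clip>_audio.npy stores {'mel': [T, 80], 'f0': [T,]}; binarizer.py reads it
+ # back with np.load(fname, allow_pickle=True).item().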
process_video_3dmm.py ADDED
@@ -0,0 +1,133 @@
+ import os, sys
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))  # make repo-local packages importable
+
+ import cv2
+ import numpy as np
+ from time import time
+ from scipy.io import savemat
+ import argparse
+ from tqdm import tqdm, trange
+ import torch
+ import face_alignment
+ import deep_3drecon
+ from moviepy.editor import VideoFileClip
+ import copy
+
+ fa = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D, network_size=4, device='cuda')
+ face_reconstructor = deep_3drecon.Reconstructor()
+
+ # landmark selection used by Deep3DRecon
+ def lm68_2_lm5(in_lm):
+     # in_lm: shape=[68,2]
+     lm_idx = np.array([31,37,40,43,46,49,55]) - 1
+     # pick the key landmarks (nose tip, eye corners, mouth corners) and average the
+     # eye corners, yielding 5 new points, then stack them
+     lm = np.stack([in_lm[lm_idx[0],:], np.mean(in_lm[lm_idx[[1,2]],:],0), np.mean(in_lm[lm_idx[[3,4]],:],0), in_lm[lm_idx[5],:], in_lm[lm_idx[6],:]], axis=0)
+     # reorder so the first point (nose) moves to the third position
+     lm = lm[[1,2,0,3,4],:2]
+     return lm
+
+ def process_video(fname, out_name=None):
+     assert fname.endswith(".mp4")
+     if out_name is None:
+         out_name = fname[:-4] + '.npy'
+     tmp_name = out_name[:-4] + '.doi'
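+     # assumption: the '.doi' file acts as an in-progress marker so parallel runs or
+     # restarts can skip clips still being processed (see the commented-out checks below)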
+     # if os.path.exists(tmp_name):
+     #     print("tmp exists, skip")
+     #     return
+     # if os.path.exists(out_name):
+     #     print("out exists, skip")
+     #     return
+     os.system(f"touch {tmp_name}")
+     cap = cv2.VideoCapture(fname)
+     lm68_lst = []
+     lm5_lst = []
+     frame_rgb_lst = []
+     cnt = 0
+     while cap.isOpened():
+         ret, frame_bgr = cap.read()
+         if frame_bgr is None:
+             break
+         frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
+         try:
+             lm68 = fa.get_landmarks(frame_rgb)[0]  # detect the face and get 68 landmarks, shape=[68,2]
+         except Exception:
+             print(f"Skip item: fa.get_landmarks failed, possibly no face detected in some frames of {fname}!")
+             return None
+         lm5 = lm68_2_lm5(lm68)
+         lm68_lst.append(lm68)
+         lm5_lst.append(lm5)
+         frame_rgb_lst.append(frame_rgb)
+         cnt += 1
+     video_rgb = np.stack(frame_rgb_lst)  # [t, 224, 224, 3]
+     lm68_arr = np.stack(lm68_lst).reshape([cnt, 68, 2])
+     lm5_arr = np.stack(lm5_lst).reshape([cnt, 5, 2])
+     num_frames = cnt
+     batch_size = 32
+     iter_times = num_frames // batch_size
+     last_bs = num_frames % batch_size
+     coeff_lst = []
+     for i_iter in range(iter_times):
+         start_idx = i_iter * batch_size
+         batched_images = video_rgb[start_idx: start_idx + batch_size]
+         batched_lm5 = lm5_arr[start_idx: start_idx + batch_size]
+         coeff, align_img = face_reconstructor.recon_coeff(batched_images, batched_lm5, return_image=True)
+         coeff_lst.append(coeff)
+     if last_bs != 0:
+         batched_images = video_rgb[-last_bs:]
+         batched_lm5 = lm5_arr[-last_bs:]
+         coeff, align_img = face_reconstructor.recon_coeff(batched_images, batched_lm5, return_image=True)
+         coeff_lst.append(coeff)
+     coeff_arr = np.concatenate(coeff_lst, axis=0)
+     result_dict = {
+         'coeff': coeff_arr.reshape([cnt, -1]),
+         'lm68': lm68_arr.reshape([cnt, 68, 2]),
+         'lm5': lm5_arr.reshape([cnt, 5, 2]),
+     }
+     np.save(out_name, result_dict)
+     os.system(f"rm {tmp_name}")
+
+
+ def split_wav(mp4_name):
+     wav_name = mp4_name[:-4] + '.wav'
+     if os.path.exists(wav_name):
+         return
+     video = VideoFileClip(mp4_name, verbose=False)
+     dur = video.duration
+     audio = video.audio
+     assert audio is not None
+     audio.write_audiofile(wav_name, fps=16000, verbose=False, logger=None)
+
+ if __name__ == '__main__':
+     ### Process a single long video for the NeRF dataset
+     # video_id = 'May'
+     # video_fname = f"data/raw/videos/{video_id}.mp4"
+     # out_fname = f"data/processed/videos/{video_id}/coeff.npy"
+     # process_video(video_fname, out_fname)
+
+     ### Process short video clips for the LRS3 dataset
+     from argparse import ArgumentParser
+     parser = ArgumentParser()
+     parser.add_argument('--lrs3_path', type=str, default='/home/yezhenhui/datasets/raw/lrs3_raw', help='')
+     parser.add_argument('--process_id', type=int, default=0, help='')
+     parser.add_argument('--total_process', type=int, default=1, help='')
+     args = parser.parse_args()
+
+     import os, glob
+     lrs3_dir = args.lrs3_path
+     mp4_name_pattern = os.path.join(lrs3_dir, "*/*.mp4")
+     mp4_names = glob.glob(mp4_name_pattern)
+     mp4_names = sorted(mp4_names)
+     if args.total_process > 1:
+         assert args.process_id <= args.total_process - 1
+         num_samples_per_process = len(mp4_names) // args.total_process
+         if args.process_id == args.total_process - 1:
+             mp4_names = mp4_names[args.process_id * num_samples_per_process:]
+         else:
+             mp4_names = mp4_names[args.process_id * num_samples_per_process: (args.process_id + 1) * num_samples_per_process]
+     for mp4_name in tqdm(mp4_names, desc='extracting 3DMM...'):
+         split_wav(mp4_name)
+         process_video(mp4_name, out_name=mp4_name.replace(".mp4", "_coeff_pt.npy"))
+
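+ # Usage sketch (assuming one process per shard/GPU):
+ #   CUDA_VISIBLE_DEVICES=0 python process_video_3dmm.py --process_id 0 --total_process 4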
process_video_3dmm_th1kh.py ADDED
@@ -0,0 +1,209 @@
+ import os, sys
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))  # make repo-local packages importable
+
+ import cv2
+ import numpy as np
+ from time import time
+ from scipy.io import savemat
+ import argparse
+ from tqdm import tqdm, trange
+ import torch
+ import face_alignment
+ import deep_3drecon
+ from moviepy.editor import VideoFileClip
+ import copy
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm, multiprocess_run
+ from utils.commons.meters import Timer
+ from decord import VideoReader
+ from decord import cpu, gpu
+ from utils.commons.face_alignment_utils import mediapipe_lm478_to_face_alignment_lm68
+ import mediapipe
+
+ # fa = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D, network_size=4, device='cuda')
+ mp_face_mesh = mediapipe.solutions.face_mesh
+ face_reconstructor = deep_3drecon.Reconstructor()
+
+
+ def chunk(iterable, chunk_size):
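+     # e.g. chunk(range(5), 2) -> [[0, 1], [2, 3], [4]]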
+     final_ret = []
+     ret = []
+     for record in iterable:
+         ret.append(record)
+         if len(ret) == chunk_size:
+             final_ret.append(ret)
+             ret = []
+     if len(ret) > 0:  # keep the trailing batch that is shorter than chunk_size
+         final_ret.append(ret)
+     return final_ret
+
+ # landmark selection used by Deep3DRecon
+ def lm68_2_lm5(in_lm):
+     assert in_lm.ndim == 2
+     # in_lm: shape=[68,2]
+     lm_idx = np.array([31,37,40,43,46,49,55]) - 1
+     # pick the key landmarks (nose tip, eye corners, mouth corners) and average the
+     # eye corners, yielding 5 new points, then stack them
+     lm = np.stack([in_lm[lm_idx[0],:], np.mean(in_lm[lm_idx[[1,2]],:],0), np.mean(in_lm[lm_idx[[3,4]],:],0), in_lm[lm_idx[5],:], in_lm[lm_idx[6],:]], axis=0)
+     # reorder so the first point (nose) moves to the third position
+     lm = lm[[1,2,0,3,4],:2]
+     return lm
+
+ def extract_frames_job(fname):
+     out_name = fname.replace(".mp4", "_coeff_pt.npy").replace("datasets/raw/cropped_clips", "datasets/processed/coeff")
+     if os.path.exists(out_name):
+         return None
+     video_reader = VideoReader(fname, ctx=cpu(0))
+     frame_rgb_lst = video_reader.get_batch(list(range(0, len(video_reader)))).asnumpy()
+     return frame_rgb_lst
+
+ def extract_lms_mediapipe_job(frames):
+     if frames is None:
+         return None
+     with mp_face_mesh.FaceMesh(
+             static_image_mode=False,
+             max_num_faces=1,
+             refine_landmarks=True,
+             min_detection_confidence=0.5) as face_mesh:
+         ldms_normed = []
+         frame_i = 0
+         frame_ids = []
+         for i in range(len(frames)):
+             # frames from decord are already RGB, which is what mediapipe expects
+             ret = face_mesh.process(frames[i])
+             if not ret.multi_face_landmarks:
+                 print("Skip item: mediapipe failed to get the face mesh, possibly no face detected in some frames!")
+                 return None
+             else:
+                 myFaceLandmarks = []
+                 lms = ret.multi_face_landmarks[0]
+                 for lm in lms.landmark:
+                     myFaceLandmarks.append([lm.x, lm.y, lm.z])
+                 ldms_normed.append(myFaceLandmarks)
+                 frame_ids.append(frame_i)
+                 frame_i += 1
+         bs, H, W, _ = frames.shape
+         ldms478 = np.array(ldms_normed)
+         lm68 = mediapipe_lm478_to_face_alignment_lm68(ldms478, H, W, return_2d=True)
+         lm5_lst = [lm68_2_lm5(lm68[i]) for i in range(lm68.shape[0])]
+         lm5 = np.stack(lm5_lst)
+     return ldms478, lm68, lm5
+
+ def process_video_batch(fname_lst, out_name_lst=None):
+     frames_lst = []
+     with Timer("load_frames", True):
+         for (i, res) in multiprocess_run_tqdm(extract_frames_job, fname_lst, num_workers=2, desc="decord is loading frames of the batched videos..."):
+             frames_lst.append(res)
+
+     lm478s_lst = []
+     lm68s_lst = []
+     lm5s_lst = []
+     with Timer("mediapipe_faceAlign", True):
+         for (i, res) in multiprocess_run_tqdm(extract_lms_mediapipe_job, frames_lst, num_workers=2, desc="mediapipe is predicting face meshes of the batched videos..."):
+             if res is None:
+                 res = (None, None, None)
+             lm478s, lm68s, lm5s = res
+             lm478s_lst.append(lm478s)
+             lm68s_lst.append(lm68s)
+             lm5s_lst.append(lm5s)
+
+     processed_cnt_in_this_batch = 0
+     with Timer("deep_3drecon_pytorch", True):
+         for i, fname in tqdm(enumerate(fname_lst), total=len(fname_lst), desc="extracting 3DMM of the batched videos..."):
+             video_rgb = frames_lst[i]  # [t, 224, 224, 3]
+             lm478_arr = lm478s_lst[i]
+             lm68_arr = lm68s_lst[i]
+             lm5_arr = lm5s_lst[i]
+             if lm5_arr is None:
+                 continue
+             num_frames = len(video_rgb)
+             batch_size = 32
+             iter_times = num_frames // batch_size
+             last_bs = num_frames % batch_size
+
+             coeff_lst = []
+             for i_iter in range(iter_times):
+                 start_idx = i_iter * batch_size
+                 batched_images = video_rgb[start_idx: start_idx + batch_size]
+                 batched_lm5 = lm5_arr[start_idx: start_idx + batch_size]
+                 coeff, align_img = face_reconstructor.recon_coeff(batched_images, batched_lm5, return_image=True)
+                 coeff_lst.append(coeff)
+             if last_bs != 0:
+                 batched_images = video_rgb[-last_bs:]
+                 batched_lm5 = lm5_arr[-last_bs:]
+                 coeff, align_img = face_reconstructor.recon_coeff(batched_images, batched_lm5, return_image=True)
+                 coeff_lst.append(coeff)
+             coeff_arr = np.concatenate(coeff_lst, axis=0)
+             result_dict = {
+                 'coeff': coeff_arr.reshape([num_frames, -1]).astype(np.float32),
+                 'lm478': lm478_arr.reshape([num_frames, 478, 3]).astype(np.float32),
+                 'lm68': lm68_arr.reshape([num_frames, 68, 2]).astype(np.int16),
+                 'lm5': lm5_arr.reshape([num_frames, 5, 2]).astype(np.int16),
+             }
+             np.save(out_name_lst[i], result_dict)
+             processed_cnt_in_this_batch += 1
+
+     print(f"In this batch, {processed_cnt_in_this_batch} files were processed")
+
+
+ def split_wav(mp4_name):
+     wav_name = mp4_name[:-4] + '.wav'
+     if os.path.exists(wav_name):
+         return
+     video = VideoFileClip(mp4_name, verbose=False)
+     dur = video.duration
+     audio = video.audio
+     assert audio is not None
+     audio.write_audiofile(wav_name, fps=16000, verbose=False, logger=None)
+
+ if __name__ == '__main__':
+     ### Process a single long video for the NeRF dataset
+     # video_id = 'May'
+     # video_fname = f"data/raw/videos/{video_id}.mp4"
+     # out_fname = f"data/processed/videos/{video_id}/coeff.npy"
+     # process_video(video_fname, out_fname)
+
+     ### Process short video clips for the TalkingHead-1KH dataset
+     import random
+
+     from argparse import ArgumentParser
+     parser = ArgumentParser()
+     parser.add_argument('--lrs3_path', type=str, default='/home/yezhenhui/projects/TalkingHead-1KH/datasets/raw/cropped_clips', help='')
+     parser.add_argument('--process_id', type=int, default=0, help='')
+     parser.add_argument('--total_process', type=int, default=1, help='')
+     args = parser.parse_args()
+
+     import os, glob
+     lrs3_dir = args.lrs3_path
+     out_dir = lrs3_dir.replace("raw/cropped_clips", "processed/coeff")
+     os.makedirs(out_dir, exist_ok=True)
+     # mp4_name_pattern = os.path.join(lrs3_dir, "*.mp4")
+     # mp4_names = glob.glob(mp4_name_pattern)
+     with open('/home/yezhenhui/projects/LDMAvatar/clean.txt', 'r') as f:
+         txt = f.read()
+     mp4_names = txt.split("\n")
+     mp4_names = sorted(mp4_names)
+     if args.total_process > 1:
+         assert args.process_id <= args.total_process - 1
+         num_samples_per_process = len(mp4_names) // args.total_process
+         if args.process_id == args.total_process - 1:
+             mp4_names = mp4_names[args.process_id * num_samples_per_process:]
+         else:
+             mp4_names = mp4_names[args.process_id * num_samples_per_process: (args.process_id + 1) * num_samples_per_process]
+     random.seed(111)
+     random.shuffle(mp4_names)
+     batched_mp4_names_lst = chunk(mp4_names, chunk_size=8)
+     for batch_mp4_names in tqdm(batched_mp4_names_lst, desc='[ROOT]: extracting face mesh and 3DMM in batches...'):
+         try:
+             for mp4_name in batch_mp4_names:
+                 split_wav(mp4_name)
+             out_names = [mp4_name.replace(".mp4", "_coeff_pt.npy").replace("datasets/raw/cropped_clips", "datasets/processed/coeff") for mp4_name in batch_mp4_names]
+             process_video_batch(batch_mp4_names, out_names)
+         except Exception as e:
+             print(e)
+             continue
process_video_3dmm_vox2.py ADDED
@@ -0,0 +1,227 @@
+ import os, sys
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))  # make repo-local packages importable
+
+ import numpy as np
+ from tqdm import tqdm, trange
+ import deep_3drecon
+ from moviepy.editor import VideoFileClip
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm, multiprocess_run
+ from utils.commons.meters import Timer
+ from decord import VideoReader
+ from decord import cpu, gpu
+ from utils.commons.face_alignment_utils import mediapipe_lm478_to_face_alignment_lm68
+ import mediapipe
+ import cv2
+
+ # fa = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D, network_size=4, device='cuda')
+ mp_face_mesh = mediapipe.solutions.face_mesh
+ face_reconstructor = deep_3drecon.Reconstructor()
+
+
+ def chunk(iterable, chunk_size):
+     final_ret = []
+     ret = []
+     for record in iterable:
+         ret.append(record)
+         if len(ret) == chunk_size:
+             final_ret.append(ret)
+             ret = []
+     if len(ret) > 0:  # keep the trailing batch that is shorter than chunk_size
+         final_ret.append(ret)
+     return final_ret
+
+ # landmark selection used by Deep3DRecon
+ def lm68_2_lm5(in_lm):
+     assert in_lm.ndim == 2
+     # in_lm: shape=[68,2]
+     lm_idx = np.array([31,37,40,43,46,49,55]) - 1
+     # pick the key landmarks (nose tip, eye corners, mouth corners) and average the
+     # eye corners, yielding 5 new points, then stack them
+     lm = np.stack([in_lm[lm_idx[0],:], np.mean(in_lm[lm_idx[[1,2]],:],0), np.mean(in_lm[lm_idx[[3,4]],:],0), in_lm[lm_idx[5],:], in_lm[lm_idx[6],:]], axis=0)
+     # reorder so the first point (nose) moves to the third position
+     lm = lm[[1,2,0,3,4],:2]
+     return lm
+
+ def extract_frames_job(fname):
+     try:
+         out_name = fname.replace(".mp4", "_coeff_pt.npy").replace("/dev/", "/coeff/")
+         if os.path.exists(out_name):
+             return None
+         cap = cv2.VideoCapture(fname)
+         frames = []
+         while cap.isOpened():
+             ret, frame_bgr = cap.read()
+             if frame_bgr is None:
+                 break
+             frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
+             frames.append(frame_rgb)
+         return np.stack(frames)
+         # decord-based alternative:
+         # video_reader = VideoReader(fname, ctx=cpu(0))
+         # frame_rgb_lst = video_reader.get_batch(list(range(0, len(video_reader)))).asnumpy()
+         # return frame_rgb_lst
+     except Exception as e:
+         print(e)
+         return None
+
+ def extract_lms_mediapipe_job(frames):
+     try:
+         if frames is None:
+             return None
+         with mp_face_mesh.FaceMesh(
+                 static_image_mode=False,
+                 max_num_faces=1,
+                 refine_landmarks=True,
+                 min_detection_confidence=0.5) as face_mesh:
+             ldms_normed = []
+             frame_i = 0
+             frame_ids = []
+             for i in range(len(frames)):
+                 # frames are already RGB, which is what mediapipe expects
+                 ret = face_mesh.process(frames[i])
+                 if not ret.multi_face_landmarks:
+                     print("Skip item: mediapipe failed to get the face mesh, possibly no face detected in some frames!")
+                     return None
+                 else:
+                     myFaceLandmarks = []
+                     lms = ret.multi_face_landmarks[0]
+                     for lm in lms.landmark:
+                         myFaceLandmarks.append([lm.x, lm.y, lm.z])
+                     ldms_normed.append(myFaceLandmarks)
+                     frame_ids.append(frame_i)
+                     frame_i += 1
+             bs, H, W, _ = frames.shape
+             ldms478 = np.array(ldms_normed)
+             lm68 = mediapipe_lm478_to_face_alignment_lm68(ldms478, H, W, return_2d=True)
+             lm5_lst = [lm68_2_lm5(lm68[i]) for i in range(lm68.shape[0])]
+             lm5 = np.stack(lm5_lst)
+         return ldms478, lm68, lm5
+     except Exception as e:
+         print(e)
+         return None
+
+ def process_video_batch(fname_lst, out_name_lst=None):
+     frames_lst = []
+     with Timer("load_frames", True):
+         for fname in tqdm(fname_lst, desc="opencv is loading frames of the batched videos..."):
+             res = extract_frames_job(fname)
+             frames_lst.append(res)
+
+     lm478s_lst = []
+     lm68s_lst = []
+     lm5s_lst = []
+     with Timer("mediapipe_faceAlign", True):
+         for i, frames in tqdm(enumerate(frames_lst), total=len(fname_lst), desc="mediapipe is predicting face meshes of the batched videos..."):
+             res = extract_lms_mediapipe_job(frames)
+             if res is None:
+                 res = (None, None, None)
+             lm478s, lm68s, lm5s = res
+             lm478s_lst.append(lm478s)
+             lm68s_lst.append(lm68s)
+             lm5s_lst.append(lm5s)
+
+     processed_cnt_in_this_batch = 0
+     with Timer("deep_3drecon_pytorch", True):
+         for i, fname in tqdm(enumerate(fname_lst), total=len(fname_lst), desc="extracting 3DMM of the batched videos..."):
+             video_rgb = frames_lst[i]  # [t, 224, 224, 3]
+             lm478_arr = lm478s_lst[i]
+             lm68_arr = lm68s_lst[i]
+             lm5_arr = lm5s_lst[i]
+             if lm5_arr is None:
+                 continue
+             num_frames = len(video_rgb)
+             batch_size = 32
+             iter_times = num_frames // batch_size
+             last_bs = num_frames % batch_size
+
+             coeff_lst = []
+             for i_iter in range(iter_times):
+                 start_idx = i_iter * batch_size
+                 batched_images = video_rgb[start_idx: start_idx + batch_size]
+                 batched_lm5 = lm5_arr[start_idx: start_idx + batch_size]
+                 coeff, align_img = face_reconstructor.recon_coeff(batched_images, batched_lm5, return_image=True)
+                 coeff_lst.append(coeff)
+             if last_bs != 0:
+                 batched_images = video_rgb[-last_bs:]
+                 batched_lm5 = lm5_arr[-last_bs:]
+                 coeff, align_img = face_reconstructor.recon_coeff(batched_images, batched_lm5, return_image=True)
+                 coeff_lst.append(coeff)
+             coeff_arr = np.concatenate(coeff_lst, axis=0)
+             result_dict = {
+                 'coeff': coeff_arr.reshape([num_frames, -1]).astype(np.float32),
+                 'lm478': lm478_arr.reshape([num_frames, 478, 3]).astype(np.float32),
+                 'lm68': lm68_arr.reshape([num_frames, 68, 2]).astype(np.int16),
+                 'lm5': lm5_arr.reshape([num_frames, 5, 2]).astype(np.int16),
+             }
+             os.makedirs(os.path.dirname(out_name_lst[i]), exist_ok=True)
+             np.save(out_name_lst[i], result_dict)
+             processed_cnt_in_this_batch += 1
+
+     print(f"In this batch, {processed_cnt_in_this_batch} files were processed")
+
+
+ def split_wav(mp4_name):
+     try:
+         wav_name = mp4_name[:-4] + '.wav'
+         if os.path.exists(wav_name):
+             return
+         video = VideoFileClip(mp4_name, verbose=False)
+         dur = video.duration
+         audio = video.audio
+         assert audio is not None
+         audio.write_audiofile(wav_name, fps=16000, verbose=False, logger=None)
+     except Exception as e:
+         print(e)
+         return None
+
+ if __name__ == '__main__':
+     ### Process a single long video for the NeRF dataset
+     # video_id = 'May'
+     # video_fname = f"data/raw/videos/{video_id}.mp4"
+     # out_fname = f"data/processed/videos/{video_id}/coeff.npy"
+     # process_video(video_fname, out_fname)
+
+     ### Process short video clips for the VoxCeleb2 dataset
+     import random
+
+     from argparse import ArgumentParser
+     parser = ArgumentParser()
+     parser.add_argument('--lrs3_path', type=str, default='/mnt/sda/yezhenhui/datasets/voxceleb2', help='')
+     parser.add_argument('--process_id', type=int, default=0, help='')
+     parser.add_argument('--total_process', type=int, default=1, help='')
+     args = parser.parse_args()
+
+     import os, glob
+     lrs3_dir = args.lrs3_path
+     mp4_name_pattern = os.path.join(lrs3_dir, "dev/id*/*/*.mp4")
+     mp4_names = glob.glob(mp4_name_pattern)
+
+     if args.total_process > 1:
+         assert args.process_id <= args.total_process - 1
+         num_samples_per_process = len(mp4_names) // args.total_process
+         if args.process_id == args.total_process - 1:
+             mp4_names = mp4_names[args.process_id * num_samples_per_process:]
+         else:
+             mp4_names = mp4_names[args.process_id * num_samples_per_process: (args.process_id + 1) * num_samples_per_process]
+     random.seed(111)
+     random.shuffle(mp4_names)
+     batched_mp4_names_lst = chunk(mp4_names, chunk_size=1)
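+     # chunk_size=1: frame loading and mediapipe run sequentially inside
+     # process_video_batch here, so each "batch" is effectively a single video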
+     for batch_mp4_names in tqdm(batched_mp4_names_lst, desc='[ROOT]: extracting face mesh and 3DMM in batches...'):
+         try:
+             for mp4_name in batch_mp4_names:
+                 split_wav(mp4_name)
+             out_names = [mp4_name.replace(".mp4", "_coeff_pt.npy").replace("/dev/", "/coeff/") for mp4_name in batch_mp4_names]
+             process_video_batch(batch_mp4_names, out_names)
+         except Exception as e:
+             print(e)
+             continue