feifeifeiliu committed on
Commit
865fd8a
1 Parent(s): d8f41ae

first version

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +7 -0
  2. README.md +2 -2
  3. __init__.py +0 -0
  4. app.py +282 -0
  5. config/LS3DCG.json +60 -0
  6. config/body_pixel.json +63 -0
  7. config/body_vq.json +62 -0
  8. config/face.json +59 -0
  9. data_utils/__init__.py +3 -0
  10. data_utils/__pycache__/__init__.cpython-37.pyc +0 -0
  11. data_utils/__pycache__/consts.cpython-37.pyc +0 -0
  12. data_utils/__pycache__/dataloader_torch.cpython-37.pyc +0 -0
  13. data_utils/__pycache__/lower_body.cpython-37.pyc +0 -0
  14. data_utils/__pycache__/mesh_dataset.cpython-37.pyc +0 -0
  15. data_utils/__pycache__/rotation_conversion.cpython-37.pyc +0 -0
  16. data_utils/__pycache__/utils.cpython-37.pyc +0 -0
  17. data_utils/axis2matrix.py +29 -0
  18. data_utils/consts.py +0 -0
  19. data_utils/dataloader_torch.py +279 -0
  20. data_utils/dataset_preprocess.py +170 -0
  21. data_utils/get_j.py +51 -0
  22. data_utils/hand_component.json +0 -0
  23. data_utils/lower_body.py +143 -0
  24. data_utils/mesh_dataset.py +348 -0
  25. data_utils/rotation_conversion.py +551 -0
  26. data_utils/utils.py +333 -0
  27. demo/1st-page/1st-page-upper.mp4 +0 -0
  28. demo/1st-page/1st-page-upper.npy +3 -0
  29. demo/french/french.mp4 +0 -0
  30. demo/french/french.npy +3 -0
  31. demo/rich/rich.mp4 +3 -0
  32. demo/rich/rich.npy +3 -0
  33. demo/song/cut.mp4 +0 -0
  34. demo/song/song.mp4 +3 -0
  35. demo/song/song.npy +3 -0
  36. demo/style/chemistry.mp4 +0 -0
  37. demo/style/chemistry.npy +3 -0
  38. demo/style/conan.mp4 +0 -0
  39. demo/style/conan.npy +3 -0
  40. demo/style/diversity.mp4 +3 -0
  41. demo/style/diversity.npy +3 -0
  42. demo/style/face.mp4 +0 -0
  43. demo/style/face.npy +3 -0
  44. demo/style/oliver.mp4 +0 -0
  45. demo/style/oliver.npy +3 -0
  46. demo/style/seth.mp4 +0 -0
  47. demo/style/seth.npy +3 -0
  48. demo_audio/1st-page.wav +0 -0
  49. demo_audio/french.wav +0 -0
  50. demo_audio/rich.wav +3 -0
.gitattributes CHANGED
@@ -32,3 +32,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ demo_audio/rich_short.wav filter=lfs diff=lfs merge=lfs -text
+ demo_audio/rich.wav filter=lfs diff=lfs merge=lfs -text
+ demo_audio/song.wav filter=lfs diff=lfs merge=lfs -text
+ demo/rich/rich.mp4 filter=lfs diff=lfs merge=lfs -text
+ demo/song/song.mp4 filter=lfs diff=lfs merge=lfs -text
+ demo/style/diversity.mp4 filter=lfs diff=lfs merge=lfs -text
+ visualise/teaser_01.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
 title: TalkSHOW
- emoji: 🏃
+ emoji: 🌍
 colorFrom: pink
- colorTo: green
+ colorTo: red
 sdk: gradio
 sdk_version: 3.23.0
 app_file: app.py
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,282 @@
import gradio as gr
import os
import sys
sys.path.append(os.getcwd())
os.system(r"cd mesh-master")
os.system(r"make all")
os.system(r"cd ..")

from transformers import Wav2Vec2Processor

import numpy as np
import json
import smplx as smpl

from nets import *
from trainer.options import parse_args
from data_utils import torch_data
from trainer.config import load_JsonConfig

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from data_utils.rotation_conversion import rotation_6d_to_matrix, matrix_to_axis_angle
from data_utils.lower_body import part2full, pred2poses, poses2pred, poses2poses
from visualise.rendering import RenderTool

global device
device = 'cpu'


def init_model(model_name, model_path, args, config):
    if model_name == 's2g_face':
        generator = s2g_face(
            args,
            config,
        )
    elif model_name == 's2g_body_vq':
        generator = s2g_body_vq(
            args,
            config,
        )
    elif model_name == 's2g_body_pixel':
        generator = s2g_body_pixel(
            args,
            config,
        )
    elif model_name == 's2g_LS3DCG':
        generator = LS3DCG(
            args,
            config,
        )
    else:
        raise NotImplementedError

    model_ckpt = torch.load(model_path, map_location=torch.device('cpu'))
    if model_name == 'smplx_S2G':
        generator.generator.load_state_dict(model_ckpt['generator']['generator'])

    elif 'generator' in list(model_ckpt.keys()):
        generator.load_state_dict(model_ckpt['generator'])
    else:
        model_ckpt = {'generator': model_ckpt}
        generator.load_state_dict(model_ckpt)

    return generator


def get_vertices(smplx_model, betas, result_list, exp, require_pose=False):
    vertices_list = []
    poses_list = []
    expression = torch.zeros([1, 100])

    for i in result_list:
        vertices = []
        poses = []
        for j in range(i.shape[0]):
            output = smplx_model(betas=betas,
                                 expression=i[j][165:265].unsqueeze_(dim=0) if exp else expression,
                                 jaw_pose=i[j][0:3].unsqueeze_(dim=0),
                                 leye_pose=i[j][3:6].unsqueeze_(dim=0),
                                 reye_pose=i[j][6:9].unsqueeze_(dim=0),
                                 global_orient=i[j][9:12].unsqueeze_(dim=0),
                                 body_pose=i[j][12:75].unsqueeze_(dim=0),
                                 left_hand_pose=i[j][75:120].unsqueeze_(dim=0),
                                 right_hand_pose=i[j][120:165].unsqueeze_(dim=0),
                                 return_verts=True)
            vertices.append(output.vertices.detach().cpu().numpy().squeeze())
            # pose = torch.cat([output.body_pose, output.left_hand_pose, output.right_hand_pose], dim=1)
            pose = output.body_pose
            poses.append(pose.detach().cpu())
        vertices = np.asarray(vertices)
        vertices_list.append(vertices)
        poses = torch.cat(poses, dim=0)
        poses_list.append(poses)
    if require_pose:
        return vertices_list, poses_list
    else:
        return vertices_list, None


global_orient = torch.tensor([3.0747, -0.0158, -0.0152])

parser = parse_args()
args = parser.parse_args()

RUN_MODE = "local"
if RUN_MODE != "local":
    os.system("wget -P experiments/2022-10-15-smplx_S2G-face-3d/ "
              "https://huggingface.co/feifeifeiliu/TalkSHOW/resolve/main/2022-10-15-smplx_S2G-face-3d/ckpt-99.pth")
    os.system("wget -P experiments/2022-10-31-smplx_S2G-body-vq-3d/ "
              "https://huggingface.co/feifeifeiliu/TalkSHOW/resolve/main/2022-10-31-smplx_S2G-body-vq-3d/ckpt-99.pth")
    os.system("wget -P experiments/2022-11-02-smplx_S2G-body-pixel-3d/ "
              "https://huggingface.co/feifeifeiliu/TalkSHOW/resolve/main/2022-11-02-smplx_S2G-body-pixel-3d/ckpt-99.pth")
    os.system("wget -P visualise/smplx/ "
              "https://huggingface.co/feifeifeiliu/TalkSHOW/resolve/main/smplx/SMPLX_NEUTRAL.npz")

config = load_JsonConfig("config/body_pixel.json")

face_model_name = args.face_model_name
face_model_path = args.face_model_path
body_model_name = args.body_model_name
body_model_path = args.body_model_path
smplx_path = './visualise/'

os.environ['smplx_npz_path'] = config.smplx_npz_path
os.environ['extra_joint_path'] = config.extra_joint_path
os.environ['j14_regressor_path'] = config.j14_regressor_path

print('init model...')
g_body = init_model(body_model_name, body_model_path, args, config)
generator2 = None
g_face = init_model(face_model_name, face_model_path, args, config)

print('init smlpx model...')
dtype = torch.float64
model_params = dict(model_path=smplx_path,
                    model_type='smplx',
                    create_global_orient=True,
                    create_body_pose=True,
                    create_betas=True,
                    num_betas=300,
                    create_left_hand_pose=True,
                    create_right_hand_pose=True,
                    use_pca=False,
                    flat_hand_mean=False,
                    create_expression=True,
                    num_expression_coeffs=100,
                    num_pca_comps=12,
                    create_jaw_pose=True,
                    create_leye_pose=True,
                    create_reye_pose=True,
                    create_transl=False,
                    # gender='ne',
                    dtype=dtype, )
smplx_model = smpl.create(**model_params).to(device)
print('init rendertool...')
rendertool = RenderTool('visualise/video/' + config.Log.name)


def infer(wav, identity, pose):
    betas = torch.zeros([1, 300], dtype=torch.float64).to(device)
    am = Wav2Vec2Processor.from_pretrained("vitouphy/wav2vec2-xls-r-300m-phoneme")
    am_sr = 16000
    num_sample = args.num_sample
    cur_wav_file = wav

    if pose == 'Stand':
        stand = True
        face = False
    elif pose == 'Sit':
        stand = False
        face = False
    else:
        stand = False
        face = True

    if face:
        body_static = torch.zeros([1, 162], device=device)
        body_static[:, 6:9] = torch.tensor([3.0747, -0.0158, -0.0152]).reshape(1, 3).repeat(body_static.shape[0], 1)

    if identity == 'Oliver':
        id = 0
    elif identity == 'Chemistry':
        id = 1
    elif identity == 'Seth':
        id = 2
    elif identity == 'Conan':
        id = 3

    result_list = []

    pred_face = g_face.infer_on_audio(cur_wav_file,
                                      initial_pose=None,
                                      norm_stats=None,
                                      w_pre=False,
                                      # id=id,
                                      frame=None,
                                      am=am,
                                      am_sr=am_sr
                                      )
    pred_face = torch.tensor(pred_face).squeeze().to(device)
    # pred_face = torch.zeros([gt.shape[0], 105])

    if config.Data.pose.convert_to_6d:
        pred_jaw = pred_face[:, :6].reshape(pred_face.shape[0], -1, 6)
        pred_jaw = matrix_to_axis_angle(rotation_6d_to_matrix(pred_jaw)).reshape(pred_face.shape[0], -1)
        pred_face = pred_face[:, 6:]
    else:
        pred_jaw = pred_face[:, :3]
        pred_face = pred_face[:, 3:]

    id = torch.tensor([id], device=device)

    for i in range(num_sample):
        pred_res = g_body.infer_on_audio(cur_wav_file,
                                         initial_pose=None,
                                         norm_stats=None,
                                         txgfile=None,
                                         id=id,
                                         var=None,
                                         fps=30,
                                         w_pre=False
                                         )
        pred = torch.tensor(pred_res).squeeze().to(device)

        if pred.shape[0] < pred_face.shape[0]:
            repeat_frame = pred[-1].unsqueeze(dim=0).repeat(pred_face.shape[0] - pred.shape[0], 1)
            pred = torch.cat([pred, repeat_frame], dim=0)
        else:
            pred = pred[:pred_face.shape[0], :]

        body_or_face = False
        if pred.shape[1] < 275:
            body_or_face = True
        if config.Data.pose.convert_to_6d:
            pred = pred.reshape(pred.shape[0], -1, 6)
            pred = matrix_to_axis_angle(rotation_6d_to_matrix(pred))
            pred = pred.reshape(pred.shape[0], -1)

        if config.Model.model_name == 's2g_LS3DCG':
            pred = torch.cat([pred[:, :3], pred[:, 103:], pred[:, 3:103]], dim=-1)
        else:
            pred = torch.cat([pred_jaw, pred, pred_face], dim=-1)

        # pred[:, 9:12] = global_orient
        pred = part2full(pred, stand)
        if face:
            pred = torch.cat([pred[:, :3], body_static.repeat(pred.shape[0], 1), pred[:, -100:]], dim=-1)
        # result_list[0] = poses2pred(result_list[0], stand)
        # if gt_0 is None:
        #     gt_0 = gt
        # pred = pred2poses(pred, gt_0)
        # result_list[0] = poses2poses(result_list[0], gt_0)

        result_list.append(pred)


    vertices_list, _ = get_vertices(smplx_model, betas, result_list, config.Data.pose.expression)

    result_list = [res.to('cpu') for res in result_list]
    dict = np.concatenate(result_list[:], axis=0)

    rendertool._render_sequences(cur_wav_file, vertices_list, stand=stand, face=face, whole_body=args.whole_body)
    return "result.mp4"


def main():

    iface = gr.Interface(fn=infer, inputs=["audio",
                                           gr.Radio(["Oliver", "Chemistry", "Seth", "Conan"]),
                                           gr.Radio(["Stand", "Sit", "Only Face"]),
                                           ],
                         outputs="video",
                         examples=[[os.path.join(os.path.dirname(__file__), "demo_audio/style.wav"), "Oliver", "Sit"]])
    iface.launch(debug=True)


if __name__ == '__main__':
    main()
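For a quick smoke test outside the Gradio UI, the `infer` entry point above can be called directly; a minimal sketch, assuming app.py imports cleanly (i.e. the checkpoints and SMPL-X archive it expects are already in place) and using one of the demo audio files added in this commit:

    from app import infer

    # identity is one of "Oliver", "Chemistry", "Seth", "Conan";
    # pose is one of "Stand", "Sit", "Only Face".
    video_path = infer("demo_audio/rich.wav", "Oliver", "Sit")
    print(video_path)  # infer() returns "result.mp4" once rendering has finished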
config/LS3DCG.json ADDED
@@ -0,0 +1,60 @@
{
  "config_root_path": "/is/cluster/scratch/hyi/ExpressiveBody/SMPLifyX4/scripts",
  "dataset_load_mode": "pickle",
  "store_file_path": "store.pkl",
  "smplx_npz_path": "visualise/smplx_model/SMPLX_NEUTRAL_2020.npz",
  "extra_joint_path": "visualise/smplx_model/smplx_extra_joints.yaml",
  "j14_regressor_path": "visualise/smplx_model/SMPLX_to_J14.pkl",
  "param": {
    "w_j": 1,
    "w_b": 1,
    "w_h": 1
  },
  "Data": {
    "data_root": "../ExpressiveWholeBodyDatasetv1.0/",
    "pklname": "_3d_mfcc.pkl",
    "whole_video": false,
    "pose": {
      "normalization": false,
      "convert_to_6d": false,
      "norm_method": "all",
      "augmentation": false,
      "generate_length": 88,
      "pre_pose_length": 0,
      "pose_dim": 99,
      "expression": true
    },
    "aud": {
      "feat_method": "mfcc",
      "aud_feat_dim": 64,
      "aud_feat_win_size": null,
      "context_info": false
    }
  },
  "Model": {
    "model_type": "body",
    "model_name": "s2g_LS3DCG",
    "code_num": 2048,
    "AudioOpt": "Adam",
    "encoder_choice": "mfcc",
    "gan": false
  },
  "DataLoader": {
    "batch_size": 128,
    "num_workers": 0
  },
  "Train": {
    "epochs": 100,
    "max_gradient_norm": 5,
    "learning_rate": {
      "generator_learning_rate": 1e-4,
      "discriminator_learning_rate": 1e-4
    }
  },
  "Log": {
    "save_every": 50,
    "print_every": 200,
    "name": "LS3DCG"
  }
}
config/body_pixel.json ADDED
@@ -0,0 +1,63 @@
{
  "config_root_path": "/is/cluster/scratch/hyi/ExpressiveBody/SMPLifyX4/scripts",
  "dataset_load_mode": "pickle",
  "store_file_path": "store.pkl",
  "smplx_npz_path": "visualise/smplx_model/SMPLX_NEUTRAL_2020.npz",
  "extra_joint_path": "visualise/smplx_model/smplx_extra_joints.yaml",
  "j14_regressor_path": "visualise/smplx_model/SMPLX_to_J14.pkl",
  "param": {
    "w_j": 1,
    "w_b": 1,
    "w_h": 1
  },
  "Data": {
    "data_root": "../ExpressiveWholeBodyDatasetv1.0/",
    "pklname": "_3d_mfcc.pkl",
    "whole_video": false,
    "pose": {
      "normalization": false,
      "convert_to_6d": false,
      "norm_method": "all",
      "augmentation": false,
      "generate_length": 88,
      "pre_pose_length": 0,
      "pose_dim": 99,
      "expression": true
    },
    "aud": {
      "feat_method": "mfcc",
      "aud_feat_dim": 64,
      "aud_feat_win_size": null,
      "context_info": false
    }
  },
  "Model": {
    "model_type": "body",
    "model_name": "s2g_body_pixel",
    "composition": true,
    "code_num": 2048,
    "bh_model": true,
    "AudioOpt": "Adam",
    "encoder_choice": "mfcc",
    "gan": false,
    "vq_path": "./experiments/2022-10-31-smplx_S2G-body-vq-3d/ckpt-99.pth"
  },
  "DataLoader": {
    "batch_size": 128,
    "num_workers": 0
  },
  "Train": {
    "epochs": 100,
    "max_gradient_norm": 5,
    "learning_rate": {
      "generator_learning_rate": 1e-4,
      "discriminator_learning_rate": 1e-4
    }
  },
  "Log": {
    "save_every": 50,
    "print_every": 200,
    "name": "body-pixel2"
  }
}
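app.py above loads this file with `load_JsonConfig("config/body_pixel.json")` and reads the keys as nested attributes; a minimal sketch of that access pattern, with the expected values taken from the JSON above:

    from trainer.config import load_JsonConfig

    config = load_JsonConfig("config/body_pixel.json")
    print(config.Model.model_name)          # "s2g_body_pixel"
    print(config.Data.pose.convert_to_6d)   # False
    print(config.Log.name)                  # "body-pixel2", used for the visualise/video/ output dir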
config/body_vq.json ADDED
@@ -0,0 +1,62 @@
{
  "config_root_path": "/is/cluster/scratch/hyi/ExpressiveBody/SMPLifyX4/scripts",
  "dataset_load_mode": "pickle",
  "store_file_path": "store.pkl",
  "smplx_npz_path": "visualise/smplx_model/SMPLX_NEUTRAL_2020.npz",
  "extra_joint_path": "visualise/smplx_model/smplx_extra_joints.yaml",
  "j14_regressor_path": "visualise/smplx_model/SMPLX_to_J14.pkl",
  "param": {
    "w_j": 1,
    "w_b": 1,
    "w_h": 1
  },
  "Data": {
    "data_root": "../expressive_body-V0.7/",
    "pklname": "_3d_mfcc.pkl",
    "whole_video": false,
    "pose": {
      "normalization": false,
      "convert_to_6d": false,
      "norm_method": "all",
      "augmentation": false,
      "generate_length": 88,
      "pre_pose_length": 0,
      "pose_dim": 99,
      "expression": true
    },
    "aud": {
      "feat_method": "mfcc",
      "aud_feat_dim": 64,
      "aud_feat_win_size": null,
      "context_info": false
    }
  },
  "Model": {
    "model_type": "body",
    "model_name": "s2g_body_vq",
    "composition": false,
    "code_num": 2048,
    "bh_model": true,
    "AudioOpt": "Adam",
    "encoder_choice": "mfcc",
    "gan": false
  },
  "DataLoader": {
    "batch_size": 128,
    "num_workers": 0
  },
  "Train": {
    "epochs": 100,
    "max_gradient_norm": 5,
    "learning_rate": {
      "generator_learning_rate": 1e-4,
      "discriminator_learning_rate": 1e-4
    }
  },
  "Log": {
    "save_every": 50,
    "print_every": 200,
    "name": "test"
  }
}
config/face.json ADDED
@@ -0,0 +1,59 @@
{
  "config_root_path": "/is/cluster/scratch/hyi/ExpressiveBody/SMPLifyX4/scripts",
  "dataset_load_mode": "json",
  "store_file_path": "store.pkl",
  "smplx_npz_path": "visualise/smplx_model/SMPLX_NEUTRAL_2020.npz",
  "extra_joint_path": "visualise/smplx_model/smplx_extra_joints.yaml",
  "j14_regressor_path": "visualise/smplx_model/SMPLX_to_J14.pkl",
  "param": {
    "w_j": 1,
    "w_b": 1,
    "w_h": 1
  },
  "Data": {
    "data_root": "../ExpressiveWholeBodyDatasetv1.0/",
    "pklname": "_3d_wv2.pkl",
    "whole_video": true,
    "pose": {
      "normalization": false,
      "convert_to_6d": false,
      "norm_method": "all",
      "augmentation": false,
      "generate_length": 88,
      "pre_pose_length": 0,
      "pose_dim": 99,
      "expression": true
    },
    "aud": {
      "feat_method": "mfcc",
      "aud_feat_dim": 64,
      "aud_feat_win_size": null,
      "context_info": false
    }
  },
  "Model": {
    "model_type": "face",
    "model_name": "s2g_face",
    "AudioOpt": "SGD",
    "encoder_choice": "faceformer",
    "gan": false
  },
  "DataLoader": {
    "batch_size": 1,
    "num_workers": 0
  },
  "Train": {
    "epochs": 100,
    "max_gradient_norm": 5,
    "learning_rate": {
      "generator_learning_rate": 1e-4,
      "discriminator_learning_rate": 1e-4
    }
  },
  "Log": {
    "save_every": 50,
    "print_every": 1000,
    "name": "face"
  }
}
data_utils/__init__.py ADDED
@@ -0,0 +1,3 @@
# from .dataloader_csv import MultiVidData as csv_data
from .dataloader_torch import MultiVidData as torch_data
from .utils import get_melspec, get_mfcc, get_mfcc_old, get_mfcc_psf, get_mfcc_psf_min, get_mfcc_ta
data_utils/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (375 Bytes).
data_utils/__pycache__/consts.cpython-37.pyc ADDED
Binary file (92.7 kB).
data_utils/__pycache__/dataloader_torch.cpython-37.pyc ADDED
Binary file (5.31 kB).
data_utils/__pycache__/lower_body.cpython-37.pyc ADDED
Binary file (3.91 kB).
data_utils/__pycache__/mesh_dataset.cpython-37.pyc ADDED
Binary file (7.9 kB).
data_utils/__pycache__/rotation_conversion.cpython-37.pyc ADDED
Binary file (16.4 kB).
data_utils/__pycache__/utils.cpython-37.pyc ADDED
Binary file (7.77 kB).
 
data_utils/axis2matrix.py ADDED
@@ -0,0 +1,29 @@
import numpy as np
import math
import scipy.linalg as linalg


def rotate_mat(axis, radian):

    a = np.cross(np.eye(3), axis / linalg.norm(axis) * radian)

    rot_matrix = linalg.expm(a)

    return rot_matrix


def aaa2mat(axis, sin, cos):
    i = np.eye(3)
    nnt = np.dot(axis.T, axis)
    s = np.asarray([[0, -axis[0, 2], axis[0, 1]],
                    [axis[0, 2], 0, -axis[0, 0]],
                    [-axis[0, 1], axis[0, 0], 0]])
    r = cos * i + (1 - cos) * nnt + sin * s
    return r


rand_axis = np.asarray([[1, 0, 0]])
# rotation angle
r = math.pi / 2
# return the rotation matrix
rot_matrix = rotate_mat(rand_axis, r)
r2 = aaa2mat(rand_axis, np.sin(r), np.cos(r))
print(rot_matrix)
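For reference, aaa2mat above is the explicit form of Rodrigues' rotation formula for a unit axis n and angle θ:

    R = \cos\theta \, I + (1 - \cos\theta)\, n n^\top + \sin\theta \, [n]_\times

where [n]_× is the skew-symmetric cross-product matrix built in `s`, and rotate_mat reaches the same rotation through the matrix exponential R = expm(θ [n]_×); for the unit axis used in the script, `rot_matrix` and `r2` should therefore agree up to floating-point error.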
data_utils/consts.py ADDED
The diff for this file is too large to render.
 
data_utils/dataloader_torch.py ADDED
@@ -0,0 +1,279 @@
import sys
import os
sys.path.append(os.getcwd())
from tqdm import tqdm
from data_utils.utils import *
import torch.utils.data as data
from data_utils.mesh_dataset import SmplxDataset
from transformers import Wav2Vec2Processor


class MultiVidData():
    def __init__(self,
                 data_root,
                 speakers,
                 split='train',
                 limbscaling=False,
                 normalization=False,
                 norm_method='new',
                 split_trans_zero=False,
                 num_frames=25,
                 num_pre_frames=25,
                 num_generate_length=None,
                 aud_feat_win_size=None,
                 aud_feat_dim=64,
                 feat_method='mel_spec',
                 context_info=False,
                 smplx=False,
                 audio_sr=16000,
                 convert_to_6d=False,
                 expression=False,
                 config=None
                 ):
        self.data_root = data_root
        self.speakers = speakers
        self.split = split
        if split == 'pre':
            self.split = 'train'
        self.norm_method = norm_method
        self.normalization = normalization
        self.limbscaling = limbscaling
        self.convert_to_6d = convert_to_6d
        self.num_frames = num_frames
        self.num_pre_frames = num_pre_frames
        if num_generate_length is None:
            self.num_generate_length = num_frames
        else:
            self.num_generate_length = num_generate_length
        self.split_trans_zero = split_trans_zero

        dataset = SmplxDataset

        if self.split_trans_zero:
            self.trans_dataset_list = []
            self.zero_dataset_list = []
        else:
            self.all_dataset_list = []
        self.dataset = {}
        self.complete_data = []
        self.config = config
        load_mode = self.config.dataset_load_mode

        ###################### load with pickle file
        if load_mode == 'pickle':
            import pickle
            import subprocess

            # store_file_path='/tmp/store.pkl'
            # cp /is/cluster/scratch/hyi/ExpressiveBody/SMPLifyX4/scripts/store.pkl /tmp/store.pkl
            # subprocess.run(f'cp /is/cluster/scratch/hyi/ExpressiveBody/SMPLifyX4/scripts/store.pkl {store_file_path}',shell=True)

            # f = open(self.config.store_file_path, 'rb+')
            f = open(self.split + config.Data.pklname, 'rb+')
            self.dataset = pickle.load(f)
            f.close()
            for key in self.dataset:
                self.complete_data.append(self.dataset[key].complete_data)
        ###################### load with pickle file

        ###################### load with a csv file
        elif load_mode == 'csv':

            # imported from a separate code folder; to be integrated here properly later
            try:
                sys.path.append(self.config.config_root_path)
                from config import config_path
                from csv_parser import csv_parse

            except ImportError as e:
                print(f'err: {e}')
                raise ImportError('config root path error...')

            for speaker_name in self.speakers:
                # df_intervals=pd.read_csv(self.config.voca_csv_file_path)
                df_intervals = None
                df_intervals = df_intervals[df_intervals['speaker'] == speaker_name]
                df_intervals = df_intervals[df_intervals['dataset'] == self.split]

                print(f'speaker {speaker_name} train interval length: {len(df_intervals)}')
                for iter_index, (_, interval) in tqdm(
                    (enumerate(df_intervals.iterrows())), desc=f'load {speaker_name}'
                ):

                    (
                        interval_index,
                        interval_speaker,
                        interval_video_fn,
                        interval_id,

                        start_time,
                        end_time,
                        duration_time,
                        start_time_10,
                        over_flow_flag,
                        short_dur_flag,

                        big_video_dir,
                        small_video_dir_name,
                        speaker_video_path,

                        voca_basename,
                        json_basename,
                        wav_basename,
                        voca_top_clip_path,
                        voca_json_clip_path,
                        voca_wav_clip_path,

                        audio_output_fn,
                        image_output_path,
                        pifpaf_output_path,
                        mp_output_path,
                        op_output_path,
                        deca_output_path,
                        pixie_output_path,
                        cam_output_path,
                        ours_output_path,
                        merge_output_path,
                        multi_output_path,
                        gt_output_path,
                        ours_images_path,
                        pkl_fil_path,
                    ) = csv_parse(interval)

                    if not os.path.exists(pkl_fil_path) or not os.path.exists(audio_output_fn):
                        continue

                    key = f'{interval_video_fn}/{small_video_dir_name}'
                    self.dataset[key] = dataset(
                        data_root=pkl_fil_path,
                        speaker=speaker_name,
                        audio_fn=audio_output_fn,
                        audio_sr=audio_sr,
                        fps=num_frames,
                        feat_method=feat_method,
                        audio_feat_dim=aud_feat_dim,
                        train=(self.split == 'train'),
                        load_all=True,
                        split_trans_zero=self.split_trans_zero,
                        limbscaling=self.limbscaling,
                        num_frames=self.num_frames,
                        num_pre_frames=self.num_pre_frames,
                        num_generate_length=self.num_generate_length,
                        audio_feat_win_size=aud_feat_win_size,
                        context_info=context_info,
                        convert_to_6d=convert_to_6d,
                        expression=expression,
                        config=self.config
                    )
                    self.complete_data.append(self.dataset[key].complete_data)
        ###################### load with a csv file

        ###################### origin load method
        elif load_mode == 'json':

            # if self.split == 'train':
            #     import pickle
            #     f = open('store.pkl', 'rb+')
            #     self.dataset=pickle.load(f)
            #     f.close()
            #     for key in self.dataset:
            #         self.complete_data.append(self.dataset[key].complete_data)
            # else:https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav
            # if config.Model.model_type == 'face':
            am = Wav2Vec2Processor.from_pretrained("vitouphy/wav2vec2-xls-r-300m-phoneme")
            am_sr = 16000
            # else:
            #     am, am_sr = None, None
            for speaker_name in self.speakers:
                speaker_root = os.path.join(self.data_root, speaker_name)

                videos = [v for v in os.listdir(speaker_root)]
                print(videos)

                haode = huaide = 0

                for vid in tqdm(videos, desc="Processing training data of {}......".format(speaker_name)):
                    source_vid = vid
                    # vid_pth=os.path.join(speaker_root, source_vid, 'images/half', self.split)
                    vid_pth = os.path.join(speaker_root, source_vid, self.split)
                    if smplx == 'pose':
                        seqs = [s for s in os.listdir(vid_pth) if (s.startswith('clip'))]
                    else:
                        try:
                            seqs = [s for s in os.listdir(vid_pth)]
                        except:
                            continue

                    for s in seqs:
                        seq_root = os.path.join(vid_pth, s)
                        key = seq_root  # correspond to clip******
                        audio_fname = os.path.join(speaker_root, source_vid, self.split, s, '%s.wav' % (s))
                        motion_fname = os.path.join(speaker_root, source_vid, self.split, s, '%s.pkl' % (s))
                        if not os.path.isfile(audio_fname) or not os.path.isfile(motion_fname):
                            huaide = huaide + 1
                            continue

                        self.dataset[key] = dataset(
                            data_root=seq_root,
                            speaker=speaker_name,
                            motion_fn=motion_fname,
                            audio_fn=audio_fname,
                            audio_sr=audio_sr,
                            fps=num_frames,
                            feat_method=feat_method,
                            audio_feat_dim=aud_feat_dim,
                            train=(self.split == 'train'),
                            load_all=True,
                            split_trans_zero=self.split_trans_zero,
                            limbscaling=self.limbscaling,
                            num_frames=self.num_frames,
                            num_pre_frames=self.num_pre_frames,
                            num_generate_length=self.num_generate_length,
                            audio_feat_win_size=aud_feat_win_size,
                            context_info=context_info,
                            convert_to_6d=convert_to_6d,
                            expression=expression,
                            config=self.config,
                            am=am,
                            am_sr=am_sr,
                            whole_video=config.Data.whole_video
                        )
                        self.complete_data.append(self.dataset[key].complete_data)
                        haode = haode + 1
                print("huaide:{}, haode:{}".format(huaide, haode))
            import pickle

            f = open(self.split + config.Data.pklname, 'wb')
            pickle.dump(self.dataset, f)
            f.close()
        ###################### origin load method

        self.complete_data = np.concatenate(self.complete_data, axis=0)

        # assert self.complete_data.shape[-1] == (12+21+21)*2
        self.normalize_stats = {}

        self.data_mean = None
        self.data_std = None

    def get_dataset(self):
        self.normalize_stats['mean'] = self.data_mean
        self.normalize_stats['std'] = self.data_std

        for key in list(self.dataset.keys()):
            if self.dataset[key].complete_data.shape[0] < self.num_generate_length:
                continue
            self.dataset[key].num_generate_length = self.num_generate_length
            self.dataset[key].get_dataset(self.normalization, self.normalize_stats, self.split)
            self.all_dataset_list.append(self.dataset[key].all_dataset)

        if self.split_trans_zero:
            self.trans_dataset = data.ConcatDataset(self.trans_dataset_list)
            self.zero_dataset = data.ConcatDataset(self.zero_dataset_list)
        else:
            self.all_dataset = data.ConcatDataset(self.all_dataset_list)
data_utils/dataset_preprocess.py ADDED
@@ -0,0 +1,170 @@
import os
import pickle
from tqdm import tqdm
import shutil
import torch
import numpy as np
import librosa
import random

speakers = ['seth', 'conan', 'oliver', 'chemistry']
data_root = "../ExpressiveWholeBodyDatasetv1.0/"
split = 'train'


def split_list(full_list, shuffle=False, ratio=0.2):
    n_total = len(full_list)
    offset_0 = int(n_total * ratio)
    offset_1 = int(n_total * ratio * 2)
    if n_total == 0 or offset_1 < 1:
        return [], [], full_list  # keep a consistent 3-tuple even for empty/tiny inputs
    if shuffle:
        random.shuffle(full_list)
    sublist_0 = full_list[:offset_0]
    sublist_1 = full_list[offset_0:offset_1]
    sublist_2 = full_list[offset_1:]
    return sublist_0, sublist_1, sublist_2


def moveto(list, file):
    for f in list:
        before, after = '/'.join(f.split('/')[:-1]), f.split('/')[-1]
        new_path = os.path.join(before, file)
        new_path = os.path.join(new_path, after)
        # os.makedirs(new_path)
        # os.path.isdir(new_path)
        # shutil.move(f, new_path)

        # copy to the new split directory
        shutil.copytree(f, new_path)
        # delete the original files under train
        shutil.rmtree(f)
    return None


def read_pkl(data):
    betas = np.array(data['betas'])

    jaw_pose = np.array(data['jaw_pose'])
    leye_pose = np.array(data['leye_pose'])
    reye_pose = np.array(data['reye_pose'])
    global_orient = np.array(data['global_orient']).squeeze()
    body_pose = np.array(data['body_pose_axis'])
    left_hand_pose = np.array(data['left_hand_pose'])
    right_hand_pose = np.array(data['right_hand_pose'])

    full_body = np.concatenate(
        (jaw_pose, leye_pose, reye_pose, global_orient, body_pose, left_hand_pose, right_hand_pose), axis=1)

    expression = np.array(data['expression'])
    full_body = np.concatenate((full_body, expression), axis=1)

    if (full_body.shape[0] < 90) or (torch.isnan(torch.from_numpy(full_body)).sum() > 0):
        return 1
    else:
        return 0


for speaker_name in speakers:
    speaker_root = os.path.join(data_root, speaker_name)

    videos = [v for v in os.listdir(speaker_root)]
    print(videos)

    haode = huaide = 0
    total_seqs = []

    for vid in tqdm(videos, desc="Processing training data of {}......".format(speaker_name)):
        # for vid in videos:
        source_vid = vid
        vid_pth = os.path.join(speaker_root, source_vid)
        # vid_pth = os.path.join(speaker_root, source_vid, 'images/half', split)
        t = os.path.join(speaker_root, source_vid, 'test')
        v = os.path.join(speaker_root, source_vid, 'val')

        # if os.path.exists(t):
        #     shutil.rmtree(t)
        # if os.path.exists(v):
        #     shutil.rmtree(v)
        try:
            seqs = [s for s in os.listdir(vid_pth)]
        except:
            continue
        # if len(seqs) == 0:
        #     shutil.rmtree(os.path.join(speaker_root, source_vid))
        #     None
        for s in seqs:
            quality = 0
            total_seqs.append(os.path.join(vid_pth, s))
            seq_root = os.path.join(vid_pth, s)
            key = seq_root  # correspond to clip******
            audio_fname = os.path.join(speaker_root, source_vid, s, '%s.wav' % (s))

            # delete the data without audio or whose audio file could not be read
            if os.path.isfile(audio_fname):
                try:
                    audio = librosa.load(audio_fname)
                except:
                    # print(key)
                    shutil.rmtree(key)
                    huaide = huaide + 1
                    continue
            else:
                huaide = huaide + 1
                # print(key)
                shutil.rmtree(key)
                continue

            # check motion file
            motion_fname = os.path.join(speaker_root, source_vid, s, '%s.pkl' % (s))
            try:
                f = open(motion_fname, 'rb+')
            except:
                shutil.rmtree(key)
                huaide = huaide + 1
                continue

            data = pickle.load(f)
            w = read_pkl(data)
            f.close()
            quality = quality + w

            if w == 1:
                shutil.rmtree(key)
                # print(key)
                huaide = huaide + 1
                continue

            haode = haode + 1

    print("huaide:{}, haode:{}, total_seqs:{}".format(huaide, haode, total_seqs.__len__()))

for speaker_name in speakers:
    speaker_root = os.path.join(data_root, speaker_name)

    videos = [v for v in os.listdir(speaker_root)]
    print(videos)

    haode = huaide = 0
    total_seqs = []

    for vid in tqdm(videos, desc="Processing training data of {}......".format(speaker_name)):
        # for vid in videos:
        source_vid = vid
        vid_pth = os.path.join(speaker_root, source_vid)
        try:
            seqs = [s for s in os.listdir(vid_pth)]
        except:
            continue
        for s in seqs:
            quality = 0
            total_seqs.append(os.path.join(vid_pth, s))
    print("total_seqs:{}".format(total_seqs.__len__()))
    # split the dataset
    test_list, val_list, train_list = split_list(total_seqs, True, 0.1)
    print(len(test_list), len(val_list), len(train_list))
    moveto(train_list, 'train')
    moveto(test_list, 'test')
    moveto(val_list, 'val')
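As a concrete illustration of the split performed above (ratio=0.1, i.e. roughly 10% test, 10% val, 80% train), a minimal sketch with shuffling disabled so the result is deterministic:

    seqs = ['clip%d' % i for i in range(10)]
    test, val, train = split_list(seqs, shuffle=False, ratio=0.1)
    # offset_0 = int(10 * 0.1) = 1, offset_1 = int(10 * 0.2) = 2
    # test  -> ['clip0']
    # val   -> ['clip1']
    # train -> ['clip2', ..., 'clip9']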
data_utils/get_j.py ADDED
@@ -0,0 +1,51 @@
import torch


def to3d(poses, config):
    if config.Data.pose.convert_to_6d:
        if config.Data.pose.expression:
            poses_exp = poses[:, -100:]
            poses = poses[:, :-100]

        poses = poses.reshape(poses.shape[0], -1, 5)
        sin, cos = poses[:, :, 3], poses[:, :, 4]
        pose_angle = torch.atan2(sin, cos)
        poses = (poses[:, :, :3] * pose_angle.unsqueeze(dim=-1)).reshape(poses.shape[0], -1)

        if config.Data.pose.expression:
            poses = torch.cat([poses, poses_exp], dim=-1)
    return poses


def get_joint(smplx_model, betas, pred):
    joint = smplx_model(betas=betas.repeat(pred.shape[0], 1),
                        expression=pred[:, 165:265],
                        jaw_pose=pred[:, 0:3],
                        leye_pose=pred[:, 3:6],
                        reye_pose=pred[:, 6:9],
                        global_orient=pred[:, 9:12],
                        body_pose=pred[:, 12:75],
                        left_hand_pose=pred[:, 75:120],
                        right_hand_pose=pred[:, 120:165],
                        return_verts=True)['joints']
    return joint


def get_joints(smplx_model, betas, pred):
    if len(pred.shape) == 3:
        B = pred.shape[0]
        x = 4 if B >= 4 else B
        T = pred.shape[1]
        pred = pred.reshape(-1, 265)
        smplx_model.batch_size = L = T * x

        times = pred.shape[0] // smplx_model.batch_size
        joints = []
        for i in range(times):
            joints.append(get_joint(smplx_model, betas, pred[i*L:(i+1)*L]))
        joints = torch.cat(joints, dim=0)
        joints = joints.reshape(B, T, -1, 3)
    else:
        smplx_model.batch_size = pred.shape[0]
        joints = get_joint(smplx_model, betas, pred)
    return joints
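get_joints above regresses SMPL-X joints for a whole sequence by pushing fixed-size batches through the model; a minimal usage sketch, assuming an SMPL-X model built with the same key options as in app.py (300 betas, 100 expression coefficients, SMPLX_NEUTRAL.npz available under ./visualise/smplx/) and pred shaped [T, 265] as produced there:

    import torch
    import smplx as smpl
    from data_utils.get_j import get_joints

    # SMPL-X model with the same key options as in app.py (assumes the .npz is downloaded)
    smplx_model = smpl.create(model_path='./visualise/', model_type='smplx',
                              num_betas=300, num_expression_coeffs=100,
                              use_pca=False, flat_hand_mean=False,
                              create_transl=False, dtype=torch.float64)
    betas = torch.zeros([1, 300], dtype=torch.float64)
    pred = torch.zeros([88, 265], dtype=torch.float64)   # 88 frames: 165 pose dims + 100 expression dims
    joints = get_joints(smplx_model, betas, pred)
    print(joints.shape)                                  # roughly (88, num_joints, 3)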
data_utils/hand_component.json ADDED
The diff for this file is too large to render.
 
data_utils/lower_body.py ADDED
@@ -0,0 +1,143 @@
import numpy as np
import torch

lower_pose = torch.tensor(
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0747, -0.0158, -0.0152, -1.1826512813568115, 0.23866955935955048,
     0.15146760642528534, -1.2604516744613647, -0.3160211145877838,
     -0.1603458970785141, 1.1654603481292725, 0.0, 0.0, 1.2521806955337524, 0.041598282754421234, -0.06312154978513718,
     0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
lower_pose_stand = torch.tensor([
    8.9759e-04, 7.1074e-04, -5.9163e-06, 8.9759e-04, 7.1074e-04, -5.9163e-06,
    3.0747, -0.0158, -0.0152,
    -3.6665e-01, -8.8455e-03, 1.6113e-01, -3.6665e-01, -8.8455e-03, 1.6113e-01,
    -3.9716e-01, -4.0229e-02, -1.2637e-01,
    7.9163e-01, 6.8519e-02, -1.5091e-01, 7.9163e-01, 6.8519e-02, -1.5091e-01,
    7.8632e-01, -4.3810e-02, 1.4375e-02,
    -1.0675e-01, 1.2635e-01, 1.6711e-02, -1.0675e-01, 1.2635e-01, 1.6711e-02, ])
# lower_pose_stand = torch.tensor(
#     [6.4919e-02, 3.3018e-02, 1.7485e-02, 8.9759e-04, 7.1074e-04, -5.9163e-06,
#      3.0747, -0.0158, -0.0152,
#      -3.3633e+00, -9.3915e-02, 3.0996e-01, -3.6665e-01, -8.8455e-03, 1.6113e-01,
#      1.1654603481292725, 0.0, 0.0,
#      4.4167e-01, 6.7183e-03, -3.6379e-03, 7.9163e-01, 6.8519e-02, -1.5091e-01,
#      0.0, 0.0, 0.0,
#      2.2910e-02, -2.4797e-02, -5.5657e-03, -1.0675e-01, 1.2635e-01, 1.6711e-02,])
lower_body = [0, 1, 3, 4, 6, 7, 9, 10]
count_part = [6, 9, 12, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
              29, 30, 31, 32, 33, 34, 35, 36, 37,
              38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]
fix_index = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
             29,
             35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
             50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
             65, 66, 67, 68, 69, 70, 71, 72, 73, 74]
all_index = np.ones(275)
all_index[fix_index] = 0
c_index = []
i = 0
for num in all_index:
    if num == 1:
        c_index.append(i)
    i = i + 1
c_index = np.asarray(c_index)

fix_index_3d = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
                21, 22, 23, 24, 25, 26,
                30, 31, 32, 33, 34, 35,
                45, 46, 47, 48, 49, 50]
all_index_3d = np.ones(165)
all_index_3d[fix_index_3d] = 0
c_index_3d = []
i = 0
for num in all_index_3d:
    if num == 1:
        c_index_3d.append(i)
    i = i + 1
c_index_3d = np.asarray(c_index_3d)

c_index_6d = []
i = 0
for num in all_index_3d:
    if num == 1:
        c_index_6d.append(2*i)
        c_index_6d.append(2 * i + 1)
    i = i + 1
c_index_6d = np.asarray(c_index_6d)


def part2full(input, stand=False):
    if stand:
        # lp = lower_pose_stand.unsqueeze(dim=0).repeat(input.shape[0], 1).to(input.device)
        lp = torch.zeros_like(lower_pose)
        lp[6:9] = torch.tensor([3.0747, -0.0158, -0.0152])
        lp = lp.unsqueeze(dim=0).repeat(input.shape[0], 1).to(input.device)
    else:
        lp = lower_pose.unsqueeze(dim=0).repeat(input.shape[0], 1).to(input.device)

    input = torch.cat([input[:, :3],
                       lp[:, :15],
                       input[:, 3:6],
                       lp[:, 15:21],
                       input[:, 6:9],
                       lp[:, 21:27],
                       input[:, 9:12],
                       lp[:, 27:],
                       input[:, 12:]]
                      , dim=1)
    return input


def pred2poses(input, gt):
    input = torch.cat([input[:, :3],
                       gt[0:1, 3:18].repeat(input.shape[0], 1),
                       input[:, 3:6],
                       gt[0:1, 21:27].repeat(input.shape[0], 1),
                       input[:, 6:9],
                       gt[0:1, 30:36].repeat(input.shape[0], 1),
                       input[:, 9:12],
                       gt[0:1, 39:45].repeat(input.shape[0], 1),
                       input[:, 12:]]
                      , dim=1)
    return input


def poses2poses(input, gt):
    input = torch.cat([input[:, :3],
                       gt[0:1, 3:18].repeat(input.shape[0], 1),
                       input[:, 18:21],
                       gt[0:1, 21:27].repeat(input.shape[0], 1),
                       input[:, 27:30],
                       gt[0:1, 30:36].repeat(input.shape[0], 1),
                       input[:, 36:39],
                       gt[0:1, 39:45].repeat(input.shape[0], 1),
                       input[:, 45:]]
                      , dim=1)
    return input


def poses2pred(input, stand=False):
    if stand:
        lp = lower_pose_stand.unsqueeze(dim=0).repeat(input.shape[0], 1).to(input.device)
        # lp = torch.zeros_like(lower_pose).unsqueeze(dim=0).repeat(input.shape[0], 1).to(input.device)
    else:
        lp = lower_pose.unsqueeze(dim=0).repeat(input.shape[0], 1).to(input.device)
    input = torch.cat([input[:, :3],
                       lp[:, :15],
                       input[:, 18:21],
                       lp[:, 15:21],
                       input[:, 27:30],
                       lp[:, 21:27],
                       input[:, 36:39],
                       lp[:, 27:],
                       input[:, 45:]]
                      , dim=1)
    return input


rearrange = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]\
    # , 22, 23, 24, 25, 40, 26, 41,
    # 27, 42, 28, 43, 29, 44, 30, 45, 31, 46, 32, 47, 33, 48, 34, 49, 35, 50, 36, 51, 37, 52, 38, 53, 39, 54, 55,
    # 57, 56, 59, 58, 60, 63, 61, 64, 62, 65, 66, 71, 67, 72, 68, 73, 69, 74, 70, 75]

symmetry = [0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1]  # , 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
# 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
# 1, 1, 1, 1, 1, 1]
data_utils/mesh_dataset.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import sys
3
+ import os
4
+
5
+ sys.path.append(os.getcwd())
6
+
7
+ import json
8
+ from glob import glob
9
+ from data_utils.utils import *
10
+ import torch.utils.data as data
11
+ from data_utils.consts import speaker_id
12
+ from data_utils.lower_body import count_part
13
+ import random
14
+ from data_utils.rotation_conversion import axis_angle_to_matrix, matrix_to_rotation_6d
15
+
16
+ with open('data_utils/hand_component.json') as file_obj:
17
+ comp = json.load(file_obj)
18
+ left_hand_c = np.asarray(comp['left'])
19
+ right_hand_c = np.asarray(comp['right'])
20
+
21
+
22
+ def to3d(data):
23
+ left_hand_pose = np.einsum('bi,ij->bj', data[:, 75:87], left_hand_c[:12, :])
24
+ right_hand_pose = np.einsum('bi,ij->bj', data[:, 87:99], right_hand_c[:12, :])
25
+ data = np.concatenate((data[:, :75], left_hand_pose, right_hand_pose), axis=-1)
26
+ return data
27
+
28
+
29
+ class SmplxDataset():
30
+ '''
31
+ creat a dataset for every segment and concat.
32
+ '''
33
+
34
+ def __init__(self,
35
+ data_root,
36
+ speaker,
37
+ motion_fn,
38
+ audio_fn,
39
+ audio_sr,
40
+ fps,
41
+ feat_method='mel_spec',
42
+ audio_feat_dim=64,
43
+ audio_feat_win_size=None,
44
+
45
+ train=True,
46
+ load_all=False,
47
+ split_trans_zero=False,
48
+ limbscaling=False,
49
+ num_frames=25,
50
+ num_pre_frames=25,
51
+ num_generate_length=25,
52
+ context_info=False,
53
+ convert_to_6d=False,
54
+ expression=False,
55
+ config=None,
56
+ am=None,
57
+ am_sr=None,
58
+ whole_video=False
59
+ ):
60
+
61
+ self.data_root = data_root
62
+ self.speaker = speaker
63
+
64
+ self.feat_method = feat_method
65
+ self.audio_fn = audio_fn
66
+ self.audio_sr = audio_sr
67
+ self.fps = fps
68
+ self.audio_feat_dim = audio_feat_dim
69
+ self.audio_feat_win_size = audio_feat_win_size
70
+ self.context_info = context_info # for aud feat
71
+ self.convert_to_6d = convert_to_6d
72
+ self.expression = expression
73
+
74
+ self.train = train
75
+ self.load_all = load_all
76
+ self.split_trans_zero = split_trans_zero
77
+ self.limbscaling = limbscaling
78
+ self.num_frames = num_frames
79
+ self.num_pre_frames = num_pre_frames
80
+ self.num_generate_length = num_generate_length
81
+ # print('num_generate_length ', self.num_generate_length)
82
+
83
+ self.config = config
84
+ self.am_sr = am_sr
85
+ self.whole_video = whole_video
86
+ load_mode = self.config.dataset_load_mode
87
+
88
+ if load_mode == 'pickle':
89
+ raise NotImplementedError
90
+
91
+ elif load_mode == 'csv':
92
+ import pickle
93
+ with open(data_root, 'rb') as f:
94
+ u = pickle._Unpickler(f)
95
+ data = u.load()
96
+ self.data = data[0]
97
+ if self.load_all:
98
+ self._load_npz_all()
99
+
100
+ elif load_mode == 'json':
101
+ self.annotations = glob(data_root + '/*pkl')
102
+ if len(self.annotations) == 0:
103
+ raise FileNotFoundError(data_root + ' are empty')
104
+ self.annotations = sorted(self.annotations)
105
+ self.img_name_list = self.annotations
106
+
107
+ if self.load_all:
108
+ self._load_them_all(am, am_sr, motion_fn)
109
+
110
+ def _load_npz_all(self):
111
+ self.loaded_data = {}
112
+ self.complete_data = []
113
+ data = self.data
114
+ shape = data['body_pose_axis'].shape[0]
115
+ self.betas = data['betas']
116
+ self.img_name_list = []
117
+ for index in range(shape):
118
+ img_name = f'{index:6d}'
119
+ self.img_name_list.append(img_name)
120
+
121
+ jaw_pose = data['jaw_pose'][index]
122
+ leye_pose = data['leye_pose'][index]
123
+ reye_pose = data['reye_pose'][index]
124
+ global_orient = data['global_orient'][index]
125
+ body_pose = data['body_pose_axis'][index]
126
+ left_hand_pose = data['left_hand_pose'][index]
127
+ right_hand_pose = data['right_hand_pose'][index]
128
+
129
+ full_body = np.concatenate(
130
+ (jaw_pose, leye_pose, reye_pose, global_orient, body_pose, left_hand_pose, right_hand_pose))
131
+ assert full_body.shape[0] == 99
132
+ if self.convert_to_6d:
133
+ full_body = to3d(full_body)
134
+ full_body = torch.from_numpy(full_body)
135
+ full_body = matrix_to_rotation_6d(axis_angle_to_matrix(full_body))
136
+ full_body = np.asarray(full_body)
137
+ if self.expression:
138
+ expression = data['expression'][index]
139
+ full_body = np.concatenate((full_body, expression))
140
+ # full_body = np.concatenate((full_body, non_zero))
141
+ else:
142
+ full_body = to3d(full_body)
143
+ if self.expression:
144
+ expression = data['expression'][index]
145
+ full_body = np.concatenate((full_body, expression))
146
+
147
+ self.loaded_data[img_name] = full_body.reshape(-1)
148
+ self.complete_data.append(full_body.reshape(-1))
149
+
150
+ self.complete_data = np.array(self.complete_data)
151
+
152
+ if self.audio_feat_win_size is not None:
153
+ self.audio_feat = get_mfcc_old(self.audio_fn).transpose(1, 0)
154
+ # print(self.audio_feat.shape)
155
+ else:
156
+ if self.feat_method == 'mel_spec':
157
+ self.audio_feat = get_melspec(self.audio_fn, fps=self.fps, sr=self.audio_sr, n_mels=self.audio_feat_dim)
158
+ elif self.feat_method == 'mfcc':
159
+ self.audio_feat = get_mfcc(self.audio_fn,
160
+ smlpx=True,
161
+ sr=self.audio_sr,
162
+ n_mfcc=self.audio_feat_dim,
163
+ win_size=self.audio_feat_win_size
164
+ )
165
+
166
+ def _load_them_all(self, am, am_sr, motion_fn):
167
+ self.loaded_data = {}
168
+ self.complete_data = []
169
+ f = open(motion_fn, 'rb+')
170
+ data = pickle.load(f)
171
+
172
+ self.betas = np.array(data['betas'])
173
+
174
+ jaw_pose = np.array(data['jaw_pose'])
175
+ leye_pose = np.array(data['leye_pose'])
176
+ reye_pose = np.array(data['reye_pose'])
177
+ global_orient = np.array(data['global_orient']).squeeze()
178
+ body_pose = np.array(data['body_pose_axis'])
179
+ left_hand_pose = np.array(data['left_hand_pose'])
180
+ right_hand_pose = np.array(data['right_hand_pose'])
181
+
182
+ full_body = np.concatenate(
183
+ (jaw_pose, leye_pose, reye_pose, global_orient, body_pose, left_hand_pose, right_hand_pose), axis=1)
184
+ assert full_body.shape[1] == 99
185
+
186
+
187
+ if self.convert_to_6d:
188
+ full_body = to3d(full_body)
189
+ full_body = torch.from_numpy(full_body)
190
+ full_body = matrix_to_rotation_6d(axis_angle_to_matrix(full_body.reshape(-1, 55, 3))).reshape(-1, 330)
191
+ full_body = np.asarray(full_body)
192
+ if self.expression:
193
+ expression = np.array(data['expression'])
194
+ full_body = np.concatenate((full_body, expression), axis=1)
195
+
196
+ else:
197
+ full_body = to3d(full_body)
198
+ expression = np.array(data['expression'])
199
+ full_body = np.concatenate((full_body, expression), axis=1)
200
+
201
+ self.complete_data = full_body
202
+ self.complete_data = np.array(self.complete_data)
203
+
204
+ if self.audio_feat_win_size is not None:
205
+ self.audio_feat = get_mfcc_old(self.audio_fn).transpose(1, 0)
206
+ else:
207
+ # if self.feat_method == 'mel_spec':
208
+ # self.audio_feat = get_melspec(self.audio_fn, fps=self.fps, sr=self.audio_sr, n_mels=self.audio_feat_dim)
209
+ # elif self.feat_method == 'mfcc':
210
+ self.audio_feat = get_mfcc_ta(self.audio_fn,
211
+ smlpx=True,
212
+ fps=30,
213
+ sr=self.audio_sr,
214
+ n_mfcc=self.audio_feat_dim,
215
+ win_size=self.audio_feat_win_size,
216
+ type=self.feat_method,
217
+ am=am,
218
+ am_sr=am_sr,
219
+ encoder_choice=self.config.Model.encoder_choice,
220
+ )
221
+ # with open(audio_file, 'w', encoding='utf-8') as file:
222
+ # file.write(json.dumps(self.audio_feat.__array__().tolist(), indent=0, ensure_ascii=False))
223
+
224
+ def get_dataset(self, normalization=False, normalize_stats=None, split='train'):
225
+
226
+ class __Worker__(data.Dataset):
227
+ def __init__(child, index_list, normalization, normalize_stats, split='train') -> None:
228
+ super().__init__()
229
+ child.index_list = index_list
230
+ child.normalization = normalization
231
+ child.normalize_stats = normalize_stats
232
+ child.split = split
233
+
234
+ def __getitem__(child, index):
235
+ num_generate_length = self.num_generate_length
236
+ num_pre_frames = self.num_pre_frames
237
+ seq_len = num_generate_length + num_pre_frames
238
+ # print(num_generate_length)
239
+
240
+ index = child.index_list[index]
241
+ index_new = index + random.randrange(0, 5, 3)
242
+ if index_new + seq_len > self.complete_data.shape[0]:
243
+ index_new = index
244
+ index = index_new
245
+
246
+ if child.split in ['val', 'pre', 'test'] or self.whole_video:
247
+ index = 0
248
+ seq_len = self.complete_data.shape[0]
249
+ seq_data = []
250
+ assert index + seq_len <= self.complete_data.shape[0]
251
+ # print(seq_len)
252
+ seq_data = self.complete_data[index:(index + seq_len), :]
253
+ seq_data = np.array(seq_data)
254
+
255
+ '''
256
+ audio feature,
257
+ '''
258
+ if not self.context_info:
259
+ if not self.whole_video:
260
+ audio_feat = self.audio_feat[index:index + seq_len, ...]
261
+ if audio_feat.shape[0] < seq_len:
262
+ audio_feat = np.pad(audio_feat, [[0, seq_len - audio_feat.shape[0]], [0, 0]],
263
+ mode='reflect')
264
+
265
+ assert audio_feat.shape[0] == seq_len and audio_feat.shape[1] == self.audio_feat_dim
266
+ else:
267
+ audio_feat = self.audio_feat
268
+
269
+ else: # including feature and history
270
+ if self.audio_feat_win_size is None:
271
+ audio_feat = self.audio_feat[index:index + seq_len + num_pre_frames, ...]
272
+ if audio_feat.shape[0] < seq_len + num_pre_frames:
273
+ audio_feat = np.pad(audio_feat,
274
+ [[0, seq_len + self.num_frames - audio_feat.shape[0]], [0, 0]],
275
+ mode='constant')
276
+
277
+ assert audio_feat.shape[0] == self.num_frames + seq_len and audio_feat.shape[
278
+ 1] == self.audio_feat_dim
279
+
280
+ if child.normalization:
281
+ data_mean = child.normalize_stats['mean'].reshape(1, -1)
282
+ data_std = child.normalize_stats['std'].reshape(1, -1)
283
+ seq_data[:, :330] = (seq_data[:, :330] - data_mean) / data_std
284
+ if child.split in['train', 'test']:
285
+ if self.convert_to_6d:
286
+ if self.expression:
287
+ data_sample = {
288
+ 'poses': seq_data[:, :330].astype(np.float).transpose(1, 0),
289
+ 'expression': seq_data[:, 330:].astype(np.float).transpose(1, 0),
290
+ # 'nzero': seq_data[:, 375:].astype(np.float).transpose(1, 0),
291
+ 'aud_feat': audio_feat.astype(np.float).transpose(1, 0),
292
+ 'speaker': speaker_id[self.speaker],
293
+ 'betas': self.betas,
294
+ 'aud_file': self.audio_fn,
295
+ }
296
+ else:
297
+ data_sample = {
298
+ 'poses': seq_data[:, :330].astype(np.float).transpose(1, 0),
299
+ 'nzero': seq_data[:, 330:].astype(np.float).transpose(1, 0),
300
+ 'aud_feat': audio_feat.astype(np.float).transpose(1, 0),
301
+ 'speaker': speaker_id[self.speaker],
302
+ 'betas': self.betas
303
+ }
304
+ else:
305
+ if self.expression:
306
+ data_sample = {
307
+ 'poses': seq_data[:, :165].astype(np.float).transpose(1, 0),
308
+ 'expression': seq_data[:, 165:].astype(np.float).transpose(1, 0),
309
+ 'aud_feat': audio_feat.astype(np.float).transpose(1, 0),
310
+ # 'wv2_feat': wv2_feat.astype(np.float).transpose(1, 0),
311
+ 'speaker': speaker_id[self.speaker],
312
+ 'aud_file': self.audio_fn,
313
+ 'betas': self.betas
314
+ }
315
+ else:
316
+ data_sample = {
317
+ 'poses': seq_data.astype(np.float).transpose(1, 0),
318
+ 'aud_feat': audio_feat.astype(np.float).transpose(1, 0),
319
+ 'speaker': speaker_id[self.speaker],
320
+ 'betas': self.betas
321
+ }
322
+ return data_sample
323
+ else:
324
+ data_sample = {
325
+ 'poses': seq_data[:, :330].astype(np.float).transpose(1, 0),
326
+ 'expression': seq_data[:, 330:].astype(np.float).transpose(1, 0),
327
+ # 'nzero': seq_data[:, 325:].astype(np.float).transpose(1, 0),
328
+ 'aud_feat': audio_feat.astype(np.float).transpose(1, 0),
329
+ 'aud_file': self.audio_fn,
330
+ 'speaker': speaker_id[self.speaker],
331
+ 'betas': self.betas
332
+ }
333
+ return data_sample
334
+ def __len__(child):
335
+ return len(child.index_list)
336
+
337
+ if split == 'train':
338
+ index_list = list(
339
+ range(0, min(self.complete_data.shape[0], self.audio_feat.shape[0]) - self.num_generate_length - self.num_pre_frames,
340
+ 6))
341
+ elif split in ['val', 'test']:
342
+ index_list = list([0])
343
+ if self.whole_video:
344
+ index_list = list([0])
345
+ self.all_dataset = __Worker__(index_list, normalization, normalize_stats, split)
346
+
347
+ def __len__(self):
348
+ return len(self.img_name_list)
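
A toy, self-contained sketch of how the data_sample dictionaries built in __Worker__.__getitem__ above collate under PyTorch's default DataLoader. Only the dictionary keys and the 330-dim pose width come from this diff; the sequence length, expression width, audio feature width and betas size below are illustrative assumptions.

    import numpy as np
    from torch.utils.data import DataLoader, Dataset

    class ToySamples(Dataset):
        # stand-in that mimics the (channels, seq_len) layout produced by transpose(1, 0) above
        def __len__(self):
            return 8

        def __getitem__(self, i):
            return {
                'poses': np.zeros((330, 88), dtype=np.float32),       # (pose_dims, seq_len); 88 is an assumed seq_len
                'expression': np.zeros((100, 88), dtype=np.float32),  # assumed expression width
                'aud_feat': np.zeros((64, 88), dtype=np.float32),     # assumed audio_feat_dim
                'speaker': 0,
                'betas': np.zeros(300, dtype=np.float32),             # assumed betas width
            }

    batch = next(iter(DataLoader(ToySamples(), batch_size=4)))
    print(batch['poses'].shape, batch['aud_feat'].shape)  # torch.Size([4, 330, 88]) torch.Size([4, 64, 88])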
data_utils/rotation_conversion.py ADDED
@@ -0,0 +1,551 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
2
+ # Check PYTORCH3D_LICENCE before use
3
+
4
+ import functools
5
+ from typing import Optional
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+
11
+ """
12
+ The transformation matrices returned from the functions in this file assume
13
+ the points on which the transformation will be applied are column vectors.
14
+ i.e. the R matrix is structured as
15
+
16
+ R = [
17
+ [Rxx, Rxy, Rxz],
18
+ [Ryx, Ryy, Ryz],
19
+ [Rzx, Rzy, Rzz],
20
+ ] # (3, 3)
21
+
22
+ This matrix can be applied to column vectors by post multiplication
23
+ by the points e.g.
24
+
25
+ points = [[0], [1], [2]] # (3 x 1) xyz coordinates of a point
26
+ transformed_points = R * points
27
+
28
+ To apply the same matrix to points which are row vectors, the R matrix
29
+ can be transposed and pre multiplied by the points:
30
+
31
+ e.g.
32
+ points = [[0, 1, 2]] # (1 x 3) xyz coordinates of a point
33
+ transformed_points = points * R.transpose(1, 0)
34
+ """
35
+
36
+
37
+ def quaternion_to_matrix(quaternions):
38
+ """
39
+ Convert rotations given as quaternions to rotation matrices.
40
+
41
+ Args:
42
+ quaternions: quaternions with real part first,
43
+ as tensor of shape (..., 4).
44
+
45
+ Returns:
46
+ Rotation matrices as tensor of shape (..., 3, 3).
47
+ """
48
+ r, i, j, k = torch.unbind(quaternions, -1)
49
+ two_s = 2.0 / (quaternions * quaternions).sum(-1)
50
+
51
+ o = torch.stack(
52
+ (
53
+ 1 - two_s * (j * j + k * k),
54
+ two_s * (i * j - k * r),
55
+ two_s * (i * k + j * r),
56
+ two_s * (i * j + k * r),
57
+ 1 - two_s * (i * i + k * k),
58
+ two_s * (j * k - i * r),
59
+ two_s * (i * k - j * r),
60
+ two_s * (j * k + i * r),
61
+ 1 - two_s * (i * i + j * j),
62
+ ),
63
+ -1,
64
+ )
65
+ return o.reshape(quaternions.shape[:-1] + (3, 3))
66
+
67
+
68
+ def _copysign(a, b):
69
+ """
70
+ Return a tensor where each element has the absolute value taken from the
71
+ corresponding element of a, with sign taken from the corresponding
72
+ element of b. This is like the standard copysign floating-point operation,
73
+ but is not careful about negative 0 and NaN.
74
+
75
+ Args:
76
+ a: source tensor.
77
+ b: tensor whose signs will be used, of the same shape as a.
78
+
79
+ Returns:
80
+ Tensor of the same shape as a with the signs of b.
81
+ """
82
+ signs_differ = (a < 0) != (b < 0)
83
+ return torch.where(signs_differ, -a, a)
84
+
85
+
86
+ def _sqrt_positive_part(x):
87
+ """
88
+ Returns torch.sqrt(torch.max(0, x))
89
+ but with a zero subgradient where x is 0.
90
+ """
91
+ ret = torch.zeros_like(x)
92
+ positive_mask = x > 0
93
+ ret[positive_mask] = torch.sqrt(x[positive_mask])
94
+ return ret
95
+
96
+
97
+ def matrix_to_quaternion(matrix):
98
+ """
99
+ Convert rotations given as rotation matrices to quaternions.
100
+
101
+ Args:
102
+ matrix: Rotation matrices as tensor of shape (..., 3, 3).
103
+
104
+ Returns:
105
+ quaternions with real part first, as tensor of shape (..., 4).
106
+ """
107
+ if matrix.size(-1) != 3 or matrix.size(-2) != 3:
108
+ raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
109
+ m00 = matrix[..., 0, 0]
110
+ m11 = matrix[..., 1, 1]
111
+ m22 = matrix[..., 2, 2]
112
+ o0 = 0.5 * _sqrt_positive_part(1 + m00 + m11 + m22)
113
+ x = 0.5 * _sqrt_positive_part(1 + m00 - m11 - m22)
114
+ y = 0.5 * _sqrt_positive_part(1 - m00 + m11 - m22)
115
+ z = 0.5 * _sqrt_positive_part(1 - m00 - m11 + m22)
116
+ o1 = _copysign(x, matrix[..., 2, 1] - matrix[..., 1, 2])
117
+ o2 = _copysign(y, matrix[..., 0, 2] - matrix[..., 2, 0])
118
+ o3 = _copysign(z, matrix[..., 1, 0] - matrix[..., 0, 1])
119
+ return torch.stack((o0, o1, o2, o3), -1)
120
+
121
+
122
+ def _axis_angle_rotation(axis: str, angle):
123
+ """
124
+ Return the rotation matrices for rotations about one of the coordinate
124
+ axes used in Euler-angle conventions, for each value of the angle given.
126
+
127
+ Args:
128
+ axis: Axis label "X" or "Y" or "Z".
129
+ angle: any shape tensor of Euler angles in radians
130
+
131
+ Returns:
132
+ Rotation matrices as tensor of shape (..., 3, 3).
133
+ """
134
+
135
+ cos = torch.cos(angle)
136
+ sin = torch.sin(angle)
137
+ one = torch.ones_like(angle)
138
+ zero = torch.zeros_like(angle)
139
+
140
+ if axis == "X":
141
+ R_flat = (one, zero, zero, zero, cos, -sin, zero, sin, cos)
142
+ if axis == "Y":
143
+ R_flat = (cos, zero, sin, zero, one, zero, -sin, zero, cos)
144
+ if axis == "Z":
145
+ R_flat = (cos, -sin, zero, sin, cos, zero, zero, zero, one)
146
+
147
+ return torch.stack(R_flat, -1).reshape(angle.shape + (3, 3))
148
+
149
+
150
+ def euler_angles_to_matrix(euler_angles, convention: str):
151
+ """
152
+ Convert rotations given as Euler angles in radians to rotation matrices.
153
+
154
+ Args:
155
+ euler_angles: Euler angles in radians as tensor of shape (..., 3).
156
+ convention: Convention string of three uppercase letters from
157
+ {"X", "Y", and "Z"}.
158
+
159
+ Returns:
160
+ Rotation matrices as tensor of shape (..., 3, 3).
161
+ """
162
+ if euler_angles.dim() == 0 or euler_angles.shape[-1] != 3:
163
+ raise ValueError("Invalid input euler angles.")
164
+ if len(convention) != 3:
165
+ raise ValueError("Convention must have 3 letters.")
166
+ if convention[1] in (convention[0], convention[2]):
167
+ raise ValueError(f"Invalid convention {convention}.")
168
+ for letter in convention:
169
+ if letter not in ("X", "Y", "Z"):
170
+ raise ValueError(f"Invalid letter {letter} in convention string.")
171
+ matrices = map(_axis_angle_rotation, convention, torch.unbind(euler_angles, -1))
172
+ return functools.reduce(torch.matmul, matrices)
173
+
174
+
175
+ def _angle_from_tan(
176
+ axis: str, other_axis: str, data, horizontal: bool, tait_bryan: bool
177
+ ):
178
+ """
179
+ Extract the first or third Euler angle from the two members of
180
+ the matrix which are positive constant times its sine and cosine.
181
+
182
+ Args:
183
+ axis: Axis label "X" or "Y" or "Z" for the angle we are finding.
184
+ other_axis: Axis label "X" or "Y" or "Z" for the middle axis in the
185
+ convention.
186
+ data: Rotation matrices as tensor of shape (..., 3, 3).
187
+ horizontal: Whether we are looking for the angle for the third axis,
188
+ which means the relevant entries are in the same row of the
189
+ rotation matrix. If not, they are in the same column.
190
+ tait_bryan: Whether the first and third axes in the convention differ.
191
+
192
+ Returns:
193
+ Euler Angles in radians for each matrix in data as a tensor
194
+ of shape (...).
195
+ """
196
+
197
+ i1, i2 = {"X": (2, 1), "Y": (0, 2), "Z": (1, 0)}[axis]
198
+ if horizontal:
199
+ i2, i1 = i1, i2
200
+ even = (axis + other_axis) in ["XY", "YZ", "ZX"]
201
+ if horizontal == even:
202
+ return torch.atan2(data[..., i1], data[..., i2])
203
+ if tait_bryan:
204
+ return torch.atan2(-data[..., i2], data[..., i1])
205
+ return torch.atan2(data[..., i2], -data[..., i1])
206
+
207
+
208
+ def _index_from_letter(letter: str):
209
+ if letter == "X":
210
+ return 0
211
+ if letter == "Y":
212
+ return 1
213
+ if letter == "Z":
214
+ return 2
215
+
216
+
217
+ def matrix_to_euler_angles(matrix, convention: str):
218
+ """
219
+ Convert rotations given as rotation matrices to Euler angles in radians.
220
+
221
+ Args:
222
+ matrix: Rotation matrices as tensor of shape (..., 3, 3).
223
+ convention: Convention string of three uppercase letters.
224
+
225
+ Returns:
226
+ Euler angles in radians as tensor of shape (..., 3).
227
+ """
228
+ if len(convention) != 3:
229
+ raise ValueError("Convention must have 3 letters.")
230
+ if convention[1] in (convention[0], convention[2]):
231
+ raise ValueError(f"Invalid convention {convention}.")
232
+ for letter in convention:
233
+ if letter not in ("X", "Y", "Z"):
234
+ raise ValueError(f"Invalid letter {letter} in convention string.")
235
+ if matrix.size(-1) != 3 or matrix.size(-2) != 3:
236
+ raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
237
+ i0 = _index_from_letter(convention[0])
238
+ i2 = _index_from_letter(convention[2])
239
+ tait_bryan = i0 != i2
240
+ if tait_bryan:
241
+ central_angle = torch.asin(
242
+ matrix[..., i0, i2] * (-1.0 if i0 - i2 in [-1, 2] else 1.0)
243
+ )
244
+ else:
245
+ central_angle = torch.acos(matrix[..., i0, i0])
246
+
247
+ o = (
248
+ _angle_from_tan(
249
+ convention[0], convention[1], matrix[..., i2], False, tait_bryan
250
+ ),
251
+ central_angle,
252
+ _angle_from_tan(
253
+ convention[2], convention[1], matrix[..., i0, :], True, tait_bryan
254
+ ),
255
+ )
256
+ return torch.stack(o, -1)
257
+
258
+
259
+ def random_quaternions(
260
+ n: int, dtype: Optional[torch.dtype] = None, device=None, requires_grad=False
261
+ ):
262
+ """
263
+ Generate random quaternions representing rotations,
264
+ i.e. versors with nonnegative real part.
265
+
266
+ Args:
267
+ n: Number of quaternions in a batch to return.
268
+ dtype: Type to return.
269
+ device: Desired device of returned tensor. Default:
270
+ uses the current device for the default tensor type.
271
+ requires_grad: Whether the resulting tensor should have the gradient
272
+ flag set.
273
+
274
+ Returns:
275
+ Quaternions as tensor of shape (N, 4).
276
+ """
277
+ o = torch.randn((n, 4), dtype=dtype, device=device, requires_grad=requires_grad)
278
+ s = (o * o).sum(1)
279
+ o = o / _copysign(torch.sqrt(s), o[:, 0])[:, None]
280
+ return o
281
+
282
+
283
+ def random_rotations(
284
+ n: int, dtype: Optional[torch.dtype] = None, device=None, requires_grad=False
285
+ ):
286
+ """
287
+ Generate random rotations as 3x3 rotation matrices.
288
+
289
+ Args:
290
+ n: Number of rotation matrices in a batch to return.
291
+ dtype: Type to return.
292
+ device: Device of returned tensor. Default: if None,
293
+ uses the current device for the default tensor type.
294
+ requires_grad: Whether the resulting tensor should have the gradient
295
+ flag set.
296
+
297
+ Returns:
298
+ Rotation matrices as tensor of shape (n, 3, 3).
299
+ """
300
+ quaternions = random_quaternions(
301
+ n, dtype=dtype, device=device, requires_grad=requires_grad
302
+ )
303
+ return quaternion_to_matrix(quaternions)
304
+
305
+
306
+ def random_rotation(
307
+ dtype: Optional[torch.dtype] = None, device=None, requires_grad=False
308
+ ):
309
+ """
310
+ Generate a single random 3x3 rotation matrix.
311
+
312
+ Args:
313
+ dtype: Type to return
314
+ device: Device of returned tensor. Default: if None,
315
+ uses the current device for the default tensor type
316
+ requires_grad: Whether the resulting tensor should have the gradient
317
+ flag set
318
+
319
+ Returns:
320
+ Rotation matrix as tensor of shape (3, 3).
321
+ """
322
+ return random_rotations(1, dtype, device, requires_grad)[0]
323
+
324
+
325
+ def standardize_quaternion(quaternions):
326
+ """
327
+ Convert a unit quaternion to a standard form: one in which the real
328
+ part is non negative.
329
+
330
+ Args:
331
+ quaternions: Quaternions with real part first,
332
+ as tensor of shape (..., 4).
333
+
334
+ Returns:
335
+ Standardized quaternions as tensor of shape (..., 4).
336
+ """
337
+ return torch.where(quaternions[..., 0:1] < 0, -quaternions, quaternions)
338
+
339
+
340
+ def quaternion_raw_multiply(a, b):
341
+ """
342
+ Multiply two quaternions.
343
+ Usual torch rules for broadcasting apply.
344
+
345
+ Args:
346
+ a: Quaternions as tensor of shape (..., 4), real part first.
347
+ b: Quaternions as tensor of shape (..., 4), real part first.
348
+
349
+ Returns:
350
+ The product of a and b, a tensor of quaternions shape (..., 4).
351
+ """
352
+ aw, ax, ay, az = torch.unbind(a, -1)
353
+ bw, bx, by, bz = torch.unbind(b, -1)
354
+ ow = aw * bw - ax * bx - ay * by - az * bz
355
+ ox = aw * bx + ax * bw + ay * bz - az * by
356
+ oy = aw * by - ax * bz + ay * bw + az * bx
357
+ oz = aw * bz + ax * by - ay * bx + az * bw
358
+ return torch.stack((ow, ox, oy, oz), -1)
359
+
360
+
361
+ def quaternion_multiply(a, b):
362
+ """
363
+ Multiply two quaternions representing rotations, returning the quaternion
364
+ representing their composition, i.e. the versor with nonnegative real part.
365
+ Usual torch rules for broadcasting apply.
366
+
367
+ Args:
368
+ a: Quaternions as tensor of shape (..., 4), real part first.
369
+ b: Quaternions as tensor of shape (..., 4), real part first.
370
+
371
+ Returns:
372
+ The product of a and b, a tensor of quaternions of shape (..., 4).
373
+ """
374
+ ab = quaternion_raw_multiply(a, b)
375
+ return standardize_quaternion(ab)
376
+
377
+
378
+ def quaternion_invert(quaternion):
379
+ """
380
+ Given a quaternion representing rotation, get the quaternion representing
381
+ its inverse.
382
+
383
+ Args:
384
+ quaternion: Quaternions as tensor of shape (..., 4), with real part
385
+ first, which must be versors (unit quaternions).
386
+
387
+ Returns:
388
+ The inverse, a tensor of quaternions of shape (..., 4).
389
+ """
390
+
391
+ return quaternion * quaternion.new_tensor([1, -1, -1, -1])
392
+
393
+
394
+ def quaternion_apply(quaternion, point):
395
+ """
396
+ Apply the rotation given by a quaternion to a 3D point.
397
+ Usual torch rules for broadcasting apply.
398
+
399
+ Args:
400
+ quaternion: Tensor of quaternions, real part first, of shape (..., 4).
401
+ point: Tensor of 3D points of shape (..., 3).
402
+
403
+ Returns:
404
+ Tensor of rotated points of shape (..., 3).
405
+ """
406
+ if point.size(-1) != 3:
407
+ raise ValueError(f"Points are not in 3D, {point.shape}.")
408
+ real_parts = point.new_zeros(point.shape[:-1] + (1,))
409
+ point_as_quaternion = torch.cat((real_parts, point), -1)
410
+ out = quaternion_raw_multiply(
411
+ quaternion_raw_multiply(quaternion, point_as_quaternion),
412
+ quaternion_invert(quaternion),
413
+ )
414
+ return out[..., 1:]
415
+
416
+
417
+ def axis_angle_to_matrix(axis_angle):
418
+ """
419
+ Convert rotations given as axis/angle to rotation matrices.
420
+
421
+ Args:
422
+ axis_angle: Rotations given as a vector in axis angle form,
423
+ as a tensor of shape (..., 3), where the magnitude is
424
+ the angle turned anticlockwise in radians around the
425
+ vector's direction.
426
+
427
+ Returns:
428
+ Rotation matrices as tensor of shape (..., 3, 3).
429
+ """
430
+ return quaternion_to_matrix(axis_angle_to_quaternion(axis_angle))
431
+
432
+
433
+ def matrix_to_axis_angle(matrix):
434
+ """
435
+ Convert rotations given as rotation matrices to axis/angle.
436
+
437
+ Args:
438
+ matrix: Rotation matrices as tensor of shape (..., 3, 3).
439
+
440
+ Returns:
441
+ Rotations given as a vector in axis angle form, as a tensor
442
+ of shape (..., 3), where the magnitude is the angle
443
+ turned anticlockwise in radians around the vector's
444
+ direction.
445
+ """
446
+ return quaternion_to_axis_angle(matrix_to_quaternion(matrix))
447
+
448
+
449
+ def axis_angle_to_quaternion(axis_angle):
450
+ """
451
+ Convert rotations given as axis/angle to quaternions.
452
+
453
+ Args:
454
+ axis_angle: Rotations given as a vector in axis angle form,
455
+ as a tensor of shape (..., 3), where the magnitude is
456
+ the angle turned anticlockwise in radians around the
457
+ vector's direction.
458
+
459
+ Returns:
460
+ quaternions with real part first, as tensor of shape (..., 4).
461
+ """
462
+ angles = torch.norm(axis_angle, p=2, dim=-1, keepdim=True)
463
+ half_angles = 0.5 * angles
464
+ eps = 1e-6
465
+ small_angles = angles.abs() < eps
466
+ sin_half_angles_over_angles = torch.empty_like(angles)
467
+ sin_half_angles_over_angles[~small_angles] = (
468
+ torch.sin(half_angles[~small_angles]) / angles[~small_angles]
469
+ )
470
+ # for x small, sin(x/2) is about x/2 - (x/2)^3/6
471
+ # so sin(x/2)/x is about 1/2 - (x*x)/48
472
+ sin_half_angles_over_angles[small_angles] = (
473
+ 0.5 - (angles[small_angles] * angles[small_angles]) / 48
474
+ )
475
+ quaternions = torch.cat(
476
+ [torch.cos(half_angles), axis_angle * sin_half_angles_over_angles], dim=-1
477
+ )
478
+ return quaternions
479
+
480
+
481
+ def quaternion_to_axis_angle(quaternions):
482
+ """
483
+ Convert rotations given as quaternions to axis/angle.
484
+
485
+ Args:
486
+ quaternions: quaternions with real part first,
487
+ as tensor of shape (..., 4).
488
+
489
+ Returns:
490
+ Rotations given as a vector in axis angle form, as a tensor
491
+ of shape (..., 3), where the magnitude is the angle
492
+ turned anticlockwise in radians around the vector's
493
+ direction.
494
+ """
495
+ norms = torch.norm(quaternions[..., 1:], p=2, dim=-1, keepdim=True)
496
+ half_angles = torch.atan2(norms, quaternions[..., :1])
497
+ angles = 2 * half_angles
498
+ eps = 1e-6
499
+ small_angles = angles.abs() < eps
500
+ sin_half_angles_over_angles = torch.empty_like(angles)
501
+ sin_half_angles_over_angles[~small_angles] = (
502
+ torch.sin(half_angles[~small_angles]) / angles[~small_angles]
503
+ )
504
+ # for x small, sin(x/2) is about x/2 - (x/2)^3/6
505
+ # so sin(x/2)/x is about 1/2 - (x*x)/48
506
+ sin_half_angles_over_angles[small_angles] = (
507
+ 0.5 - (angles[small_angles] * angles[small_angles]) / 48
508
+ )
509
+ return quaternions[..., 1:] / sin_half_angles_over_angles
510
+
511
+
512
+ def rotation_6d_to_matrix(d6: torch.Tensor) -> torch.Tensor:
513
+ """
514
+ Converts 6D rotation representation by Zhou et al. [1] to rotation matrix
515
+ using Gram--Schmidt orthogonalisation per Section B of [1].
516
+ Args:
517
+ d6: 6D rotation representation, of size (*, 6)
518
+
519
+ Returns:
520
+ batch of rotation matrices of size (*, 3, 3)
521
+
522
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
523
+ On the Continuity of Rotation Representations in Neural Networks.
524
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
525
+ Retrieved from http://arxiv.org/abs/1812.07035
526
+ """
527
+
528
+ a1, a2 = d6[..., :3], d6[..., 3:]
529
+ b1 = F.normalize(a1, dim=-1)
530
+ b2 = a2 - (b1 * a2).sum(-1, keepdim=True) * b1
531
+ b2 = F.normalize(b2, dim=-1)
532
+ b3 = torch.cross(b1, b2, dim=-1)
533
+ return torch.stack((b1, b2, b3), dim=-2)
534
+
535
+
536
+ def matrix_to_rotation_6d(matrix: torch.Tensor) -> torch.Tensor:
537
+ """
538
+ Converts rotation matrices to 6D rotation representation by Zhou et al. [1]
539
+ by dropping the last row. Note that 6D representation is not unique.
540
+ Args:
541
+ matrix: batch of rotation matrices of size (*, 3, 3)
542
+
543
+ Returns:
544
+ 6D rotation representation, of size (*, 6)
545
+
546
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
547
+ On the Continuity of Rotation Representations in Neural Networks.
548
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
549
+ Retrieved from http://arxiv.org/abs/1812.07035
550
+ """
551
+ return matrix[..., :2, :].clone().reshape(*matrix.size()[:-2], 6)
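
A small hedged sanity-check sketch for the conversion helpers defined above; the import path follows this repo's layout and the batch size and tolerances are illustrative.

    import torch
    from data_utils.rotation_conversion import (
        axis_angle_to_matrix, matrix_to_axis_angle,
        matrix_to_rotation_6d, rotation_6d_to_matrix,
    )

    aa = torch.randn(8, 3) * 0.3                  # batch of small axis-angle rotations (radians)
    R = axis_angle_to_matrix(aa)                  # (8, 3, 3)
    d6 = matrix_to_rotation_6d(R)                 # (8, 6): first two rows of each matrix
    R_back = rotation_6d_to_matrix(d6)            # (8, 3, 3), re-orthogonalised via Gram-Schmidt
    aa_back = matrix_to_axis_angle(R_back)        # (8, 3)

    print(torch.allclose(R, R_back, atol=1e-5))   # True: the 6D form keeps the full rotation
    print(torch.allclose(aa, aa_back, atol=1e-4)) # True for rotations well below pi radians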
data_utils/utils.py ADDED
@@ -0,0 +1,333 @@
1
+ import numpy as np
2
+ # import librosa  # had to do this because librosa is not supported on my server
3
+ import python_speech_features
4
+ from scipy.io import wavfile
5
+ from scipy import signal
6
+ import librosa
7
+ import torch
8
+ import torchaudio as ta
9
+ import torchaudio.functional as ta_F
10
+ import torchaudio.transforms as ta_T
11
+ # import pyloudnorm as pyln
12
+
13
+
14
+ def load_wav_old(audio_fn, sr = 16000):
15
+ sample_rate, sig = wavfile.read(audio_fn)
16
+ if sample_rate != sr:
17
+ result = int((sig.shape[0]) / sample_rate * sr)
18
+ x_resampled = signal.resample(sig, result)
19
+ x_resampled = x_resampled.astype(np.float64)
20
+ return x_resampled, sr
21
+
22
+ sig = sig / (2**15)
23
+ return sig, sample_rate
24
+
25
+
26
+ def get_mfcc(audio_fn, eps=1e-6, fps=25, smlpx=False, sr=16000, n_mfcc=64, win_size=None):
27
+
28
+ y, sr = librosa.load(audio_fn, sr=sr, mono=True)
29
+
30
+ if win_size is None:
31
+ hop_len=int(sr / fps)
32
+ else:
33
+ hop_len=int(sr / win_size)
34
+
35
+ n_fft=2048
36
+
37
+ C = librosa.feature.mfcc(
38
+ y = y,
39
+ sr = sr,
40
+ n_mfcc = n_mfcc,
41
+ hop_length = hop_len,
42
+ n_fft = n_fft
43
+ )
44
+
45
+ if C.shape[0] == n_mfcc:
46
+ C = C.transpose(1, 0)
47
+
48
+ return C
49
+
50
+
51
+ def get_melspec(audio_fn, eps=1e-6, fps = 25, sr=16000, n_mels=64):
52
+ raise NotImplementedError
53
+ '''
54
+ # y, sr = load_wav(audio_fn=audio_fn, sr=sr)
55
+
56
+ # hop_len = int(sr / fps)
57
+ # n_fft = 2048
58
+
59
+ # C = librosa.feature.melspectrogram(
60
+ # y = y,
61
+ # sr = sr,
62
+ # n_fft=n_fft,
63
+ # hop_length=hop_len,
64
+ # n_mels = n_mels,
65
+ # fmin=0,
66
+ # fmax=8000)
67
+
68
+
69
+ # mask = (C == 0).astype(np.float)
70
+ # C = mask * eps + (1-mask) * C
71
+
72
+ # C = np.log(C)
73
+ # #wierd error may occur here
74
+ # assert not (np.isnan(C).any()), audio_fn
75
+ # if C.shape[0] == n_mels:
76
+ # C = C.transpose(1, 0)
77
+
78
+ # return C
79
+ '''
80
+
81
+ def extract_mfcc(audio,sample_rate=16000):
82
+ mfcc = zip(*python_speech_features.mfcc(audio,sample_rate, numcep=64, nfilt=64, nfft=2048, winstep=0.04))
83
+ mfcc = np.stack([np.array(i) for i in mfcc])
84
+ return mfcc
85
+
86
+ def get_mfcc_psf(audio_fn, eps=1e-6, fps=25, smlpx=False, sr=16000, n_mfcc=64, win_size=None):
87
+ y, sr = load_wav_old(audio_fn, sr=sr)
88
+
89
+ if y.shape.__len__() > 1:
90
+ y = (y[:,0]+y[:,1])/2
91
+
92
+ if win_size is None:
93
+ hop_len=int(sr / fps)
94
+ else:
95
+ hop_len=int(sr/ win_size)
96
+
97
+ n_fft=2048
98
+
99
+ #hard coded for 25 fps
100
+ if not smlpx:
101
+ C = python_speech_features.mfcc(y, sr, numcep=n_mfcc, nfilt=n_mfcc, nfft=n_fft, winstep=0.04)
102
+ else:
103
+ C = python_speech_features.mfcc(y, sr, numcep=n_mfcc, nfilt=n_mfcc, nfft=n_fft, winstep=1.01/15)
104
+ # if C.shape[0] == n_mfcc:
105
+ # C = C.transpose(1, 0)
106
+
107
+ return C
108
+
109
+
110
+ def get_mfcc_psf_min(audio_fn, eps=1e-6, fps=25, smlpx=False, sr=16000, n_mfcc=64, win_size=None):
111
+ y, sr = load_wav_old(audio_fn, sr=sr)
112
+
113
+ if y.shape.__len__() > 1:
114
+ y = (y[:, 0] + y[:, 1]) / 2
115
+ n_fft = 2048
116
+
117
+ slice_len = 22000 * 5
118
+ slice = y.size // slice_len
119
+
120
+ C = []
121
+
122
+ for i in range(slice):
123
+ if i != (slice - 1):
124
+ feat = python_speech_features.mfcc(y[i*slice_len:(i+1)*slice_len], sr, numcep=n_mfcc, nfilt=n_mfcc, nfft=n_fft, winstep=1.01 / 15)
125
+ else:
126
+ feat = python_speech_features.mfcc(y[i * slice_len:], sr, numcep=n_mfcc, nfilt=n_mfcc, nfft=n_fft, winstep=1.01 / 15)
127
+
128
+ C.append(feat)
129
+
130
+ return C
131
+
132
+
133
+ def audio_chunking(audio: torch.Tensor, frame_rate: int = 30, chunk_size: int = 16000):
134
+ """
135
+ :param audio: 1 x T tensor containing a 16kHz audio signal
136
+ :param frame_rate: frame rate for video (we need one audio chunk per video frame)
137
+ :param chunk_size: number of audio samples per chunk
138
+ :return: num_chunks x chunk_size tensor containing sliced audio
139
+ """
140
+ samples_per_frame = chunk_size // frame_rate
141
+ padding = (chunk_size - samples_per_frame) // 2
142
+ audio = torch.nn.functional.pad(audio.unsqueeze(0), pad=[padding, padding]).squeeze(0)
143
+ anchor_points = list(range(chunk_size//2, audio.shape[-1]-chunk_size//2, samples_per_frame))
144
+ audio = torch.cat([audio[:, i-chunk_size//2:i+chunk_size//2] for i in anchor_points], dim=0)
145
+ return audio
146
+
147
+
148
+ def get_mfcc_ta(audio_fn, eps=1e-6, fps=15, smlpx=False, sr=16000, n_mfcc=64, win_size=None, type='mfcc', am=None, am_sr=None, encoder_choice='mfcc'):
149
+ if am is None:
150
+ sr_0, audio = audio_fn
151
+ audio = torch.tensor(audio)/32767
152
+ if len(audio.shape) == 1:
153
+ audio.unsqueeze_(dim=0)
154
+ elif audio.shape[1] == 1 or audio.shape[1] == 2:
155
+ audio.transpose_(0, 1)
156
+
157
+ if sr != sr_0:
158
+ audio = ta.transforms.Resample(sr_0, sr)(audio)
159
+ if audio.shape[0] > 1:
160
+ audio = torch.mean(audio, dim=0, keepdim=True)
161
+
162
+ n_fft = 2048
163
+ if fps == 15:
164
+ hop_length = 1467
165
+ elif fps == 30:
166
+ hop_length = 734
167
+ win_length = hop_length * 2
168
+ n_mels = 256
169
+ n_mfcc = 64
170
+
171
+ if type == 'mfcc':
172
+ mfcc_transform = ta_T.MFCC(
173
+ sample_rate=sr,
174
+ n_mfcc=n_mfcc,
175
+ melkwargs={
176
+ "n_fft": n_fft,
177
+ "n_mels": n_mels,
178
+ # "win_length": win_length,
179
+ "hop_length": hop_length,
180
+ "mel_scale": "htk",
181
+ },
182
+ )
183
+ audio_ft = mfcc_transform(audio).squeeze(dim=0).transpose(0,1).numpy()
184
+ elif type == 'mel':
185
+ # audio = 0.01 * audio / torch.mean(torch.abs(audio))
186
+ mel_transform = ta_T.MelSpectrogram(
187
+ sample_rate=sr, n_fft=n_fft, win_length=None, hop_length=hop_length, n_mels=n_mels
188
+ )
189
+ audio_ft = mel_transform(audio).squeeze(0).transpose(0,1).numpy()
190
+ # audio_ft = torch.log(audio_ft.clamp(min=1e-10, max=None)).transpose(0,1).numpy()
191
+ elif type == 'mel_mul':
192
+ audio = 0.01 * audio / torch.mean(torch.abs(audio))
193
+ audio = audio_chunking(audio, frame_rate=fps, chunk_size=sr)
194
+ mel_transform = ta_T.MelSpectrogram(
195
+ sample_rate=sr, n_fft=n_fft, win_length=int(sr/20), hop_length=int(sr/100), n_mels=n_mels
196
+ )
197
+ audio_ft = mel_transform(audio).squeeze(1)
198
+ audio_ft = torch.log(audio_ft.clamp(min=1e-10, max=None)).numpy()
199
+ else:
200
+ sampling_rate, speech_array = audio_fn
201
+ speech_array = torch.tensor(speech_array) / 32767
202
+ if len(speech_array.shape) == 1:
203
+ speech_array.unsqueeze_(0)
204
+ elif speech_array.shape[1] == 1 or speech_array.shape[1] == 2:
205
+ speech_array.transpose_(0, 1)
206
+ if sr != sampling_rate:
207
+ speech_array = ta.transforms.Resample(sampling_rate, sr)(speech_array)
208
+ speech_array = torch.mean(speech_array, dim=0, keepdim=True)
209
+ speech_array = speech_array.numpy()
210
+
211
+ if encoder_choice == 'faceformer':
212
+ # audio_ft = np.squeeze(am(speech_array, sampling_rate=16000).input_values).reshape(-1, 1)
213
+ audio_ft = speech_array.reshape(-1, 1)
214
+ elif encoder_choice == 'meshtalk':
215
+ audio_ft = 0.01 * speech_array / np.mean(np.abs(speech_array))
216
+ elif encoder_choice == 'onset':
217
+ audio_ft = librosa.onset.onset_detect(y=speech_array, sr=16000, units='time').reshape(-1, 1)
218
+ else:
219
+ audio, sr_0 = ta.load(audio_fn)
220
+ if sr != sr_0:
221
+ audio = ta.transforms.Resample(sr_0, sr)(audio)
222
+ if audio.shape[0] > 1:
223
+ audio = torch.mean(audio, dim=0, keepdim=True)
224
+
225
+ n_fft = 2048
226
+ if fps == 15:
227
+ hop_length = 1467
228
+ elif fps == 30:
229
+ hop_length = 734
230
+ win_length = hop_length * 2
231
+ n_mels = 256
232
+ n_mfcc = 64
233
+
234
+ mfcc_transform = ta_T.MFCC(
235
+ sample_rate=sr,
236
+ n_mfcc=n_mfcc,
237
+ melkwargs={
238
+ "n_fft": n_fft,
239
+ "n_mels": n_mels,
240
+ # "win_length": win_length,
241
+ "hop_length": hop_length,
242
+ "mel_scale": "htk",
243
+ },
244
+ )
245
+ audio_ft = mfcc_transform(audio).squeeze(dim=0).transpose(0, 1).numpy()
246
+ return audio_ft
247
+
248
+
249
+ def get_mfcc_sepa(audio_fn, fps=15, sr=16000):
250
+ audio, sr_0 = ta.load(audio_fn)
251
+ if sr != sr_0:
252
+ audio = ta.transforms.Resample(sr_0, sr)(audio)
253
+ if audio.shape[0] > 1:
254
+ audio = torch.mean(audio, dim=0, keepdim=True)
255
+
256
+ n_fft = 2048
257
+ if fps == 15:
258
+ hop_length = 1467
259
+ elif fps == 30:
260
+ hop_length = 734
261
+ n_mels = 256
262
+ n_mfcc = 64
263
+
264
+ mfcc_transform = ta_T.MFCC(
265
+ sample_rate=sr,
266
+ n_mfcc=n_mfcc,
267
+ melkwargs={
268
+ "n_fft": n_fft,
269
+ "n_mels": n_mels,
270
+ # "win_length": win_length,
271
+ "hop_length": hop_length,
272
+ "mel_scale": "htk",
273
+ },
274
+ )
275
+ audio_ft_0 = mfcc_transform(audio[0, :sr*2]).squeeze(dim=0).transpose(0,1).numpy()
276
+ audio_ft_1 = mfcc_transform(audio[0, sr*2:]).squeeze(dim=0).transpose(0,1).numpy()
277
+ audio_ft = np.concatenate((audio_ft_0, audio_ft_1), axis=0)
278
+ return audio_ft, audio_ft_0.shape[0]
279
+
280
+
281
+ def get_mfcc_old(wav_file):
282
+ sig, sample_rate = load_wav_old(wav_file)
283
+ mfcc = extract_mfcc(sig)
284
+ return mfcc
285
+
286
+
287
+ def smooth_geom(geom, mask: torch.Tensor = None, filter_size: int = 9, sigma: float = 2.0):
288
+ """
289
+ :param geom: T x V x 3 tensor containing a temporal sequence of length T with V vertices in each frame
290
+ :param mask: V-dimensional Tensor containing a mask with vertices to be smoothed
291
+ :param filter_size: size of the Gaussian filter
292
+ :param sigma: standard deviation of the Gaussian filter
293
+ :return: T x V x 3 tensor containing smoothed geometry (i.e., smoothed in the area indicated by the mask)
294
+ """
295
+ assert filter_size % 2 == 1, f"filter size must be odd but is {filter_size}"
296
+ # Gaussian smoothing (low-pass filtering)
297
+ fltr = np.arange(-(filter_size // 2), filter_size // 2 + 1)
298
+ fltr = np.exp(-0.5 * fltr ** 2 / sigma ** 2)
299
+ fltr = torch.Tensor(fltr) / np.sum(fltr)
300
+ # apply fltr
301
+ fltr = fltr.view(1, 1, -1).to(device=geom.device)
302
+ T, V = geom.shape[1], geom.shape[2]
303
+ g = torch.nn.functional.pad(
304
+ geom.permute(2, 0, 1).view(V, 1, T),
305
+ pad=[filter_size // 2, filter_size // 2], mode='replicate'
306
+ )
307
+ g = torch.nn.functional.conv1d(g, fltr).view(V, 1, T)
308
+ smoothed = g.permute(1, 2, 0).contiguous()
309
+ # blend smoothed signal with original signal
310
+ if mask is None:
311
+ return smoothed
312
+ else:
313
+ return smoothed * mask[None, :, None] + geom * (-mask[None, :, None] + 1)
314
+
315
+ if __name__ == '__main__':
316
+ audio_fn = '../sample_audio/clip000028_tCAkv4ggPgI.wav'
317
+
318
+ C = get_mfcc_psf(audio_fn)
319
+ print(C.shape)
320
+
321
+ C_2 = get_mfcc(audio_fn)  # librosa-based MFCC (no get_mfcc_librosa is defined in this file)
322
+ print(C_2.shape)
323
+
324
+ print(C)
325
+ print(C_2)
326
+ print((C == C_2).all())
327
+ # print(y.shape, sr)
328
+ # mel_spec = get_melspec(audio_fn)
329
+ # print(mel_spec.shape)
330
+ # mfcc = get_mfcc(audio_fn, sr = 16000)
331
+ # print(mfcc.shape)
332
+ # print(mel_spec.max(), mel_spec.min())
333
+ # print(mfcc.max(), mfcc.min())
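
A hedged usage sketch for the audio helpers above: get_mfcc_ta accepts a (sample_rate, waveform) tuple when am is None (the format a Gradio audio input provides), and audio_chunking slices one fixed-size window per video frame. The dummy waveform and its duration are illustrative assumptions, not values from the repo.

    import numpy as np
    import torch
    from data_utils.utils import get_mfcc_ta, audio_chunking

    sr = 16000
    wav = (np.random.randn(sr * 2) * 3000).astype(np.int16)        # 2 s of dummy int16 audio
    feat = get_mfcc_ta((sr, wav), fps=30, sr=sr, type='mfcc')      # (num_frames, 64) MFCC matrix
    print(feat.shape)

    audio = torch.from_numpy(wav.astype(np.float32) / 32767).unsqueeze(0)  # 1 x T float tensor
    chunks = audio_chunking(audio, frame_rate=30, chunk_size=sr)           # one 1 s chunk per video frame
    print(chunks.shape)                                                    # (num_video_frames, 16000)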
demo/1st-page/1st-page-upper.mp4 ADDED
Binary file (837 kB).
 
demo/1st-page/1st-page-upper.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:159eefc300544ea95d919b49707afa466e6246135da9a986b4abbc55bbc54850
3
+ size 407168
demo/french/french.mp4 ADDED
Binary file (592 kB).
 
demo/french/french.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:289d7a2abb18efa495587a4c4b094a109bdb7d3efd779800f028708bde4d1477
3
+ size 305408
demo/rich/rich.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bc50b66c7df10233191921a6a3f19c2895249997206f30e5e099cc10b90903a
3
+ size 3608757
demo/rich/rich.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d03c956ed3992980fe37581019ec12350531489b12b46a55cfc4c562f7bd8ddb
3
+ size 1908128
demo/song/cut.mp4 ADDED
Binary file (655 kB).
 
demo/song/song.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8827d6daaec213bee7bd32af68a0cf8ea83d154f32d006bd7f38120e2c282045
3
+ size 3178290
demo/song/song.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:157bfbde5a1b15ac812e52d8b08997be1a41fae93b3a7fe613b897d1ff5d8996
3
+ size 1707788
demo/style/chemistry.mp4 ADDED
Binary file (670 kB).
 
demo/style/chemistry.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a8dc42938343bc10b149a6a74d43d5a4cef010c6f2a0c58bffee7f48b2a1e81
3
+ size 318128
demo/style/conan.mp4 ADDED
Binary file (610 kB).
 
demo/style/conan.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:350ca76806d86ff7b36fbfeaec219d7c0cf515c3c23dfe6791143b82e7ec3327
3
+ size 318128
demo/style/diversity.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09fd9e6330ced1ecbf10a6e7e0a4f6ebad098eb44115a2ee35a070d02e522ec8
3
+ size 5882474
demo/style/diversity.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e4c37f510943dad934da97a8eade5ddce25165df20419e74606fb0160b4ce07
3
+ size 3816128
demo/style/face.mp4 ADDED
Binary file (687 kB).
 
demo/style/face.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b51d0d309e92449323ab481a3cc16c88d2b04f6f487eb366720a9ad7f8754f03
3
+ size 318128
demo/style/oliver.mp4 ADDED
Binary file (589 kB).
 
demo/style/oliver.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:149258f227975e1f07b449f0ab5e4c3e3e1458f97fa646360eac3f1428c52f5a
3
+ size 318128
demo/style/seth.mp4 ADDED
Binary file (558 kB).
 
demo/style/seth.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6132a40d60ee8cf954d74293ea46e75cd4b4d2001ca96fc4713abe06a34b5a3c
3
+ size 318128
demo_audio/1st-page.wav ADDED
Binary file (410 kB).
 
demo_audio/french.wav ADDED
Binary file (461 kB).
 
demo_audio/rich.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db9c793b66a64ffb11f0f673e70f9e0188bfa1ce95a391cb9af7d9c7ccf92597
3
+ size 10584078