Elle McFarlane committed
Commit fa0aa6d
1 Parent(s): 02729fd

add placeholder huggingface space app, gradio version == 4.15.0

.gitattributes CHANGED
@@ -1,2 +1,3 @@
  *.tar filter=lfs diff=lfs merge=lfs -text
  *.npy filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,13 +1,14 @@
  ---
  title: Text2EMotionDiffuse
- emoji: 🏢
+ emoji: 🧠
  colorFrom: blue
  colorTo: red
  sdk: gradio
  sdk_version: 3.44.1
- app_file: app.py
+ app_file: text2motion/app.py
  pinned: false
  license: mit
+ tags: diffusion, motiondiffuse, text2motion, smplx, smpl, smpl-x, smplify-x
  ---
  <div align="center">

text2motion/Makefile CHANGED
@@ -12,6 +12,11 @@ GT_FILE=s2/cubesmall_lift # ground-truth
  FRAMES=60
  MEAN_EMOTION=surprise

+ # to push changes to space, run 'git push space main'
+ # make sure to do 'git remote add space https://huggingface.co/spaces/ellemac/Text2EMotionDiffuse'
+ huggingface:
+ cd ${ROOT_DIR} && ${PYTHON_BIN} -m app
+
  mean-mesh:
  cd $(ROOT_DIR) && vglrun ${PYTHON_BIN} -m datasets.mean_mesh \
  --emotion ${MEAN_EMOTION} \
@@ -42,9 +47,8 @@ train: w_stats
  --seed ${SEED} \
  --use_wandb \

- # get makes model generate seq according to text and writes result to npy file
+ # get-npy makes model generate seq according to text and writes result to npy file
  gen-npy:
- # checkpoints/t2m/t2m_motiondiffuse/opt.txt
  cd ${ROOT_DIR} && ${PYTHON_BIN} -m tools.inference \
  --opt_path ${MODEL_DIR}/opt.txt \
  --which_epoch ${EPOCH} \
text2motion/app.py ADDED
@@ -0,0 +1,27 @@
+ import os
+ import sys
+ import gradio as gr
+ import logging
+
+ os.makedirs("outputs", exist_ok=True)
+ sys.path.insert(0, ".")
+
+ def generate(prompt, length):
+     logging.warning("NOT generating per the prompt [TODO], just returning a denoising gif.")
+     result_path = "outputs/denoising_grab_model.gif"
+     return result_path
+
+
+ demo = gr.Interface(
+     fn=generate,
+     inputs=["text", gr.Slider(5, 30, value=10)],
+     examples=[
+         ["happily flying airplane", 10],
+     ],
+     outputs="image",
+     title="COMING SOON: Text2EMotionDiffuse Demo. Currently: shows denoising gif for any prompt.",
+     description="COMING SOON, SPACE NOT CURRENTLY CONFIGURED TO HANDLE PROMPTS, but please Github: https://github.com/ellemcfarlane/Text2EMotionDiffuse",
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
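For reference, a minimal sketch of how the placeholder Space above could be queried programmatically once it is live. This is not part of the commit; it assumes the Space is public under ellemac/Text2EMotionDiffuse, that the separate gradio_client package is installed, and that the gr.Interface exposes Gradio's default /predict endpoint.

```python
# Hypothetical usage sketch (not in this commit): call the placeholder Space
# with the same inputs the gr.Interface above declares (text prompt + slider).
from gradio_client import Client

client = Client("ellemac/Text2EMotionDiffuse")  # assumed Space id
result_path = client.predict(
    "happily flying airplane",  # text prompt
    10,                         # slider value (currently ignored by the app)
    api_name="/predict",        # default endpoint name for a single gr.Interface
)
print(result_path)  # local path to the downloaded gif/image returned by generate()
```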
text2motion/datasets/motionx_explorer.py CHANGED
@@ -1,19 +1,21 @@
  import argparse
  import logging as log
  import os
- import time
  from collections import defaultdict
  from os.path import join as pjoin
  from typing import Dict, Optional, Tuple
-
+ import pyrender
+ from tqdm import tqdm
+ import trimesh
+ import numpy as np
+ import os
+ import imageio
  import numpy as np
  import smplx
  import torch
  from numpy.typing import ArrayLike
  from torch import Tensor

- from .rendering import render_meshes
-
  log.basicConfig(
  level=log.INFO,
  format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -23,7 +25,9 @@ log.basicConfig(
  MOCAP_DATASETS = {"egobody", "grab", "humanml", "grab_motion"}
  DATA_DIR = "data"
  MODELS_DIR = "models"
- MOCAP_FACE_DIR = f"{DATA_DIR}/face_motion_data/smplx_322" # contains face motion data only
+ MOCAP_FACE_DIR = (
+ f"{DATA_DIR}/face_motion_data/smplx_322" # contains face motion data only
+ )
  MOTION_DIR = f"{DATA_DIR}/motion_data/smplx_322"
  ACTION_LABEL_DIR = f"{DATA_DIR}/semantic_labels"
  EMOTION_LABEL_DIR = f"{DATA_DIR}/face_texts"
@@ -40,20 +44,24 @@ NOTE: I think they are wrong about n_body_joints though, data indicates it's act

  MY_REPO = os.path.abspath("")
  log.info(f"MY_REPO: {MY_REPO}")
- NUM_BODY_JOINTS = 23 - 2 # SMPL has hand joints but we're replacing them with more detailed ones by SMLP-X, paper: 22x3 total body dims * not sure why paper says 22
- NUM_JAW_JOINTS = 1 # 1x3 total jaw dims
+ NUM_BODY_JOINTS = (
+ 23 - 2
+ ) # SMPL has hand joints but we're replacing them with more detailed ones by SMLP-X, paper: 22x3 total body dims * not sure why paper says 22
+ NUM_JAW_JOINTS = 1 # 1x3 total jaw dims
  # Motion-X paper says there
- NUM_HAND_JOINTS = 15 # x2 for each hand -> 30x3 total hand dims
- NUM_JOINTS = NUM_BODY_JOINTS + NUM_HAND_JOINTS * 2 + NUM_JAW_JOINTS # 21 + 30 + 1 = 52
- NUM_FACIAL_EXPRESSION_DIMS = 50 # as per Motion-X paper, but why is default 10 in smplx code then?
+ NUM_HAND_JOINTS = 15 # x2 for each hand -> 30x3 total hand dims
+ NUM_JOINTS = NUM_BODY_JOINTS + NUM_HAND_JOINTS * 2 + NUM_JAW_JOINTS # 21 + 30 + 1 = 52
+ NUM_FACIAL_EXPRESSION_DIMS = (
+ 50 # as per Motion-X paper, but why is default 10 in smplx code then?
+ )
  FACE_SHAPE_DIMS = 100
- BODY_SHAPE_DIMS = 10 # betas
+ BODY_SHAPE_DIMS = 10 # betas
  ROOT_DIMS = 3
- TRANS_DIMS = 3 # same as root, no?
+ TRANS_DIMS = 3 # same as root, no?

  pose_type_to_dims = {
  "pose_body": NUM_BODY_JOINTS * 3,
- "pose_hand": NUM_HAND_JOINTS * 2 * 3, # both hands
+ "pose_hand": NUM_HAND_JOINTS * 2 * 3, # both hands
  "pose_jaw": NUM_JAW_JOINTS * 3,
  "face_expr": NUM_FACIAL_EXPRESSION_DIMS * 1, # double check
  "face_shape": FACE_SHAPE_DIMS * 1, # double check
@@ -62,6 +70,7 @@ pose_type_to_dims = {
  "trans": TRANS_DIMS * 1,
  }

+
  def names_to_arrays(root_dir, names, drop_shapes=True):
  all_arrays = []
  for name in names:
@@ -73,12 +82,14 @@ def names_to_arrays(root_dir, names, drop_shapes=True):
  all_arrays.append(array)
  return all_arrays

+
  def get_seq_names(file_path):
  with open(file_path, "r") as f:
  names = f.readlines()
  names = [name.strip() for name in names]
  return names

+
  def get_data_path(dataset_dir: str, seq: str, file: str) -> str:
  # MY_REPO/face_motion_data/smplx_322/GRAB/s1/airplane_fly_1.npy
  top_dir = MOCAP_FACE_DIR if dataset_dir.lower() in MOCAP_DATASETS else MOTION_DIR
@@ -88,11 +99,16 @@ def get_data_path(dataset_dir: str, seq: str, file: str) -> str:

  def get_label_paths(dataset_dir: str, seq: str, file: str) -> Dict[str, str]:
  # MY_REPO/MotionDiffuse/face_texts/GRAB/s1/airplane_fly_1.txt
- action_path = f"{os.path.join(MY_REPO, ACTION_LABEL_DIR, dataset_dir, seq, file)}.txt"
- emotion_path = f"{os.path.join(MY_REPO, EMOTION_LABEL_DIR, dataset_dir, seq, file)}.txt"
+ action_path = (
+ f"{os.path.join(MY_REPO, ACTION_LABEL_DIR, dataset_dir, seq, file)}.txt"
+ )
+ emotion_path = (
+ f"{os.path.join(MY_REPO, EMOTION_LABEL_DIR, dataset_dir, seq, file)}.txt"
+ )
  paths = {"action": action_path, "emotion": emotion_path}
  return paths

+
  def load_data_as_dict(dataset_dir: str, seq: str, file: str) -> Dict[str, Tensor]:
  path = get_data_path(dataset_dir, seq, file)
  motion = np.load(path)
@@ -108,7 +124,10 @@ def load_data_as_dict(dataset_dir: str, seq: str, file: str) -> Dict[str, Tensor
  "betas": motion[:, 312:], # controls the body shape. Body shape is static
  }

- def motion_arr_to_dict(motion_arr: ArrayLike, shapes_dropped=False) -> Dict[str, Tensor]:
+
+ def motion_arr_to_dict(
+ motion_arr: ArrayLike, shapes_dropped=False
+ ) -> Dict[str, Tensor]:
  # TODO (elmc): why did I need to convert to tensor again???
  motion_arr = torch.tensor(motion_arr).float()
  motion_dict = {
@@ -119,30 +138,40 @@ def motion_arr_to_dict(motion_arr: ArrayLike, shapes_dropped=False) -> Dict[str,
  "face_expr": motion_arr[:, 159 : 159 + 50], # controls the face expression
  }
  if not shapes_dropped:
- motion_dict["face_shape"] = motion_arr[:, 209 : 209 + 100] # controls the face shape
- motion_dict["trans"] = motion_arr[:, 309 : 309 + 3] # controls the global body position
- motion_dict["betas"] = motion_arr[:, 312:] # controls the body shape. Body shape is static
+ motion_dict["face_shape"] = motion_arr[
+ :, 209 : 209 + 100
+ ] # controls the face shape
+ motion_dict["trans"] = motion_arr[
+ :, 309 : 309 + 3
+ ] # controls the global body position
+ motion_dict["betas"] = motion_arr[
+ :, 312:
+ ] # controls the body shape. Body shape is static
  else:
- motion_dict["trans"] = motion_arr[:, 209:] # controls the global body position
-
+ motion_dict["trans"] = motion_arr[:, 209:] # controls the global body position
+
  return motion_dict
-
+

  def drop_shapes_from_motion_arr(motion_arr: ArrayLike) -> ArrayLike:
  if isinstance(motion_arr, torch.Tensor):
  new_motion_arr = motion_arr.numpy()
-
+
  # Slice the array to exclude 'face_shape' and 'betas'
- new_motion_arr = np.concatenate((motion_arr[:, :209], motion_arr[:, 309:312]), axis=1)
-
+ new_motion_arr = np.concatenate(
+ (motion_arr[:, :209], motion_arr[:, 309:312]), axis=1
+ )
+
  return new_motion_arr

+
  def load_label_from_file(file_path: str) -> str:
  with open(file_path, "r") as file:
  # Read the contents of the file into a string
  label = file.read()
  return label

+
  def load_label(dataset_dir: str, seq: str, file_path: str) -> Dict[str, str]:
  paths = get_label_paths(dataset_dir, seq, file_path)
  action_path, emotion_path = paths["action"], paths["emotion"]
@@ -163,28 +192,31 @@ def label_code(full_label):
  # airplane -> air
  return full_label[:3]

+
  def get_seq_type(motion_label_dir, file_name):
  # e.g. s5/airplane_fly_1 -> airplane fly (motion label)
  seq_type_path = pjoin(motion_label_dir, f"{file_name}.txt")
- with open(seq_type_path, 'r') as f:
+ with open(seq_type_path, "r") as f:
  seq_type = f.readline().strip()
  return seq_type

+
  def calc_mean_stddev_pose(arrays):
  # all_arrays = []
  # for file_path in file_list:
  # # Load each NumPy array and add it to the list
  # array = np.load(file_path)
  # all_arrays.append(array)
-
+
  # Concatenate all arrays along the first axis (stacking them on top of each other)
  concatenated_arrays = np.concatenate(arrays, axis=0)
  # Calculate the mean and standard deviation across all arrays
  mean = np.mean(concatenated_arrays, axis=0)
  stddev = np.std(concatenated_arrays, axis=0)
-
+
  return mean, stddev

+
  def get_info_from_file(file_path, emotions_label_dir, motion_label_dir):
  # train_names = get_seq_names(pjoin(data_dir, "train.txt"))
  names = get_seq_names(file_path)
@@ -220,9 +252,12 @@ def get_info_from_file(file_path, emotions_label_dir, motion_label_dir):
  "n_seq": n_seq,
  "code_to_label": code_to_label,
  }
- return info_dict
+ return info_dict

- def to_smplx_dict(motion_dict: Dict[str, Tensor], timestep_range: Optional[Tuple[int, int]] = None) -> Dict[str, Tensor]:
+
+ def to_smplx_dict(
+ motion_dict: Dict[str, Tensor], timestep_range: Optional[Tuple[int, int]] = None
+ ) -> Dict[str, Tensor]:
  if timestep_range is None:
  # get all timesteps
  timestep_range = (0, len(motion_dict["pose_body"]))
@@ -230,31 +265,51 @@ def to_smplx_dict(motion_dict: Dict[str, Tensor], timestep_range: Optional[Tuple
  "global_orient": motion_dict["root_orient"][
  timestep_range[0] : timestep_range[1]
  ], # controls the global root orientation
- "body_pose": motion_dict["pose_body"][timestep_range[0] : timestep_range[1]], # controls the body
- "left_hand_pose": motion_dict["pose_hand"][timestep_range[0] : timestep_range[1]][
- :, : NUM_HAND_JOINTS * 3
- ], # controls the finger articulation
- "right_hand_pose": motion_dict["pose_hand"][timestep_range[0] : timestep_range[1]][:, NUM_HAND_JOINTS * 3 :],
- "expression": motion_dict["face_expr"][timestep_range[0] : timestep_range[1]], # controls the face expression
- "jaw_pose": motion_dict["pose_jaw"][timestep_range[0] : timestep_range[1]], # controls the jaw pose
+ "body_pose": motion_dict["pose_body"][
+ timestep_range[0] : timestep_range[1]
+ ], # controls the body
+ "left_hand_pose": motion_dict["pose_hand"][
+ timestep_range[0] : timestep_range[1]
+ ][:, : NUM_HAND_JOINTS * 3], # controls the finger articulation
+ "right_hand_pose": motion_dict["pose_hand"][
+ timestep_range[0] : timestep_range[1]
+ ][:, NUM_HAND_JOINTS * 3 :],
+ "expression": motion_dict["face_expr"][
+ timestep_range[0] : timestep_range[1]
+ ], # controls the face expression
+ "jaw_pose": motion_dict["pose_jaw"][
+ timestep_range[0] : timestep_range[1]
+ ], # controls the jaw pose
  # 'face_shape': motion_dict['face_shape'][timestep], # controls the face shape, drop since we don't care to train on this
- "transl": motion_dict["trans"][timestep_range[0] : timestep_range[1]], # controls the global body position
+ "transl": motion_dict["trans"][
+ timestep_range[0] : timestep_range[1]
+ ], # controls the global body position
  # "betas": motion["betas"][
  # timestep_range[0] : timestep_range[1]
  # ], # controls the body shape. Body shape is static, drop since we don't care to train on this
  }
  return smplx_params

+
  def smplx_dict_to_array(smplx_dict):
  # convert smplx dict to array
  # list keys to ensure known order when iterating over dict
- keys = ["global_orient", "body_pose", "left_hand_pose", "right_hand_pose", "expression", "jaw_pose", "transl"]
+ keys = [
+ "global_orient",
+ "body_pose",
+ "left_hand_pose",
+ "right_hand_pose",
+ "expression",
+ "jaw_pose",
+ "transl",
+ ]
  smplx_array = []
  for key in keys:
  smplx_array.append(smplx_dict[key])
  smplx_array = torch.cat(smplx_array, dim=1)
  return smplx_array

+
  def save_gif(gif_path, gif_frames, duration=0.01):
  if gif_frames:
  print(f"Saving GIF with {len(gif_frames)} frames to {gif_path}")
@@ -262,6 +317,7 @@ def save_gif(gif_path, gif_frames, duration=0.01):
  else:
  print("No frames to save.")

+
  # based on https://github.com/vchoutas/smplx/blob/main/examples/demo.py
  def render_meshes(output, should_save_gif=False, gif_path=None):
  should_display = not should_save_gif
@@ -294,7 +350,9 @@ def render_meshes(output, should_save_gif=False, gif_path=None):
  plot_joints = False
  if plotting_module == "pyrender":
  vertex_colors = np.ones([vertices.shape[0], 4]) * [0.3, 0.3, 0.3, 0.8]
- tri_mesh = trimesh.Trimesh(vertices, model.faces, vertex_colors=vertex_colors)
+ tri_mesh = trimesh.Trimesh(
+ vertices, model.faces, vertex_colors=vertex_colors
+ )

  # Apply rotation
  tri_mesh.apply_transform(rot)
@@ -322,7 +380,7 @@ def render_meshes(output, should_save_gif=False, gif_path=None):
  cam_pose = np.array(
  [
  [1.0, 0, 0, center[0]],
- [0, 1.0, 0, center[1]-1.0],
+ [0, 1.0, 0, center[1] - 1.0],
  [0, 0, 1.0, center[2] + distance + 0.5],
  [0, 0, 0, 1],
  ]
@@ -332,12 +390,14 @@ def render_meshes(output, should_save_gif=False, gif_path=None):
  angle = np.radians(90)
  cos_angle = np.cos(angle)
  sin_angle = np.sin(angle)
- rot_x = np.array([
- [1, 0, 0, 0],
- [0, cos_angle, -sin_angle, 0],
- [0, sin_angle, cos_angle, 0],
- [0, 0, 0, 1]
- ])
+ rot_x = np.array(
+ [
+ [1, 0, 0, 0],
+ [0, cos_angle, -sin_angle, 0],
+ [0, sin_angle, cos_angle, 0],
+ [0, 0, 0, 1],
+ ]
+ )
  cam_pose = np.matmul(cam_pose, rot_x)
  cam_pose[:3, 3] += np.array([0, -2.5, -3.5])

@@ -361,7 +421,9 @@ def render_meshes(output, should_save_gif=False, gif_path=None):
  scene.remove_node(joints_node)
  joints_node = scene.add(joints_pcl)
  if should_save_gif:
- r = pyrender.OffscreenRenderer(viewport_width=640, viewport_height=480)
+ r = pyrender.OffscreenRenderer(
+ viewport_width=640, viewport_height=480
+ )
  color, _ = r.render(scene)
  gif_frames.append(color)
  r.delete() # Free up the resources
@@ -375,11 +437,13 @@ def render_meshes(output, should_save_gif=False, gif_path=None):
  finally:
  save_gif(gif_path, gif_frames)

+
  def get_numpy_file_path(prompt, epoch, n_frames):
  # e.g. "airplane_fly_1_1000_60f.npy"
- prompt_no_spaces = prompt.replace(' ', '_')
+ prompt_no_spaces = prompt.replace(" ", "_")
  return f"{prompt_no_spaces}_{epoch}_{n_frames}f"

+
  if __name__ == "__main__":
  parser = argparse.ArgumentParser()

@@ -401,10 +465,10 @@ if __name__ == "__main__":
  parser.add_argument(
  "-dm",
  "--display_mesh",
- action='store_true',
+ action="store_true",
  required=False,
  default=False,
- help="Display mesh if this flag is present"
+ help="Display mesh if this flag is present",
  )
  # for now just specifies file name (with spaces) made by inference
  parser.add_argument(
@@ -435,10 +499,10 @@ if __name__ == "__main__":
  parser.add_argument(
  "-sg",
  "--save_gif",
- action='store_true',
+ action="store_true",
  required=False,
  default=False,
- help="Save gif if this flag is present"
+ help="Save gif if this flag is present",
  )
  # add which_epoch
  parser.add_argument(
@@ -453,26 +517,30 @@ if __name__ == "__main__":
  prompt = args.prompt
  is_inference = len(prompt) > 0
  if args.seq_file != "" and args.prompt != "":
- log.error("cannot provide both prompt and seq_file; if trying to verify model inference, use --prompt, otherwise specify numpy --seq_file name to display")
+ log.error(
+ "cannot provide both prompt and seq_file; if trying to verify model inference, use --prompt, otherwise specify numpy --seq_file name to display"
+ )
  exit(1)
  elif args.seq_file == "" and args.prompt == "":
- log.error("must provide either prompt or seq_file; if trying to verify model inference, use --prompt, otherwise specify numpy --seq_file name to display")
+ log.error(
+ "must provide either prompt or seq_file; if trying to verify model inference, use --prompt, otherwise specify numpy --seq_file name to display"
+ )
  exit(1)
  if not is_inference:
  name = args.seq_file
- data_root = './data/GRAB'
- motion_dir = pjoin(data_root, 'joints')
+ data_root = "./data/GRAB"
+ motion_dir = pjoin(data_root, "joints")
  else:
- log.info(f"converting prompt into file name")
+ log.info("converting prompt into file name")
  name = get_numpy_file_path(prompt, args.which_epoch, args.max_t - args.min_t)
  model_type = args.model_path
- motion_dir = pjoin(model_type, 'outputs')
- motion_path = pjoin(motion_dir, name + '.npy')
+ motion_dir = pjoin(model_type, "outputs")
+ motion_path = pjoin(motion_dir, name + ".npy")
  log.info(f"loading motion from {motion_path}")
  motion_arr = np.load(motion_path)
  t = 999
- mean_path = '/work3/s222376/MotionDiffuse2/text2motion/checkpoints/grab/md_fulem_2g_excl_196_seed42/meta/mean.npy'
- std_path = '/work3/s222376/MotionDiffuse2/text2motion/checkpoints/grab/md_fulem_2g_excl_196_seed42/meta/std.npy'
+ mean_path = "/work3/s222376/MotionDiffuse2/text2motion/checkpoints/grab/md_fulem_2g_excl_196_seed42/meta/mean.npy"
+ std_path = "/work3/s222376/MotionDiffuse2/text2motion/checkpoints/grab/md_fulem_2g_excl_196_seed42/meta/std.npy"
  mean = np.load(mean_path)
  std = np.load(std_path)
  # do range skipping by 100
@@ -481,7 +549,9 @@ if __name__ == "__main__":
  for t in list_:
  name = f"sample_tensor([{t}])"
  # breakpoint()
- motion_arr = np.load(f"/work3/s222376/MotionDiffuse2/text2motion/generation_samples/{name}.npy")
+ motion_arr = np.load(
+ f"/work3/s222376/MotionDiffuse2/text2motion/generation_samples/{name}.npy"
+ )
  motion_arr = np.squeeze(motion_arr)

  motion_arr = motion_arr * std + mean
@@ -491,7 +561,9 @@ if __name__ == "__main__":
  # directly get smplx dimensionality by dropping body and face shape data
  print("warning, dropping body and face shape data")
  motion_arr = drop_shapes_from_motion_arr(motion_arr)
- assert motion_arr.shape[1] == 212, f"expected 212 dims, got {motion_arr.shape[1]}"
+ assert (
+ motion_arr.shape[1] == 212
+ ), f"expected 212 dims, got {motion_arr.shape[1]}"

  # our MotionDiffuse predicts motion data that doesn't include face and body shape
  motion_dict = motion_arr_to_dict(motion_arr, shapes_dropped=True)
@@ -522,23 +594,23 @@ if __name__ == "__main__":
  log.info(f"TOTAL SMPLX dims: {tot_smplx_dims}\n")

  if not is_inference:
- action_label_path = pjoin(data_root, 'texts', name + '.txt')
+ action_label_path = pjoin(data_root, "texts", name + ".txt")
  action_label = load_label_from_file(action_label_path)
- emotion_label_path = pjoin(data_root, 'face_texts', name + '.txt')
+ emotion_label_path = pjoin(data_root, "face_texts", name + ".txt")
  emotion_label = load_label_from_file(emotion_label_path)
  log.info(f"action: {action_label}")
  log.info(f"emotion: {emotion_label}")

  if is_inference:
- emotion_label = args.prompt.split(' ')[0]
-
+ emotion_label = args.prompt.split(" ")[0]
+
  if args.display_mesh:
  model_folder = os.path.join(MY_REPO, MODELS_DIR, "smplx")
  batch_size = max_t - min_t
  log.info(f"calculating mesh with batch size {batch_size}")
  model = smplx.SMPLX(
  model_folder,
- use_pca=False, # our joints are not in pca space
+ use_pca=False, # our joints are not in pca space
  num_expression_coeffs=NUM_FACIAL_EXPRESSION_DIMS,
  batch_size=batch_size,
  )
@@ -546,7 +618,9 @@ if __name__ == "__main__":
  log.info(f"output size {output.vertices.shape}")
  log.info(f"output size {output.joints.shape}")
  log.info("rendering mesh")
- model_name = args.model_path.split('/')[-1] if args.model_path else "ground_truth"
+ model_name = (
+ args.model_path.split("/")[-1] if args.model_path else "ground_truth"
+ )
  gif_path = f"gifs/{model_name}/{name}_{emotion_label}.gif"
  render_meshes(output, gif_path=gif_path, should_save_gif=args.save_gif)
  log.warning(
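As an aside on the slicing constants that recur in this file (159, 209, 309, 312), here is a small sanity-check sketch of the 322-dim "smplx_322" feature layout implied by the constants and slices above. Only the 159/209/309/312 bounds are visible in this diff; the first four offsets are inferred from ROOT_DIMS and the joint-count constants and should be treated as assumptions, not part of the commit.

```python
# Sanity check of the smplx_322 layout implied by motionx_explorer.py.
ROOT_DIMS = 3                 # root_orient (global root orientation)
BODY_DIMS = (23 - 2) * 3      # pose_body -> 63
HAND_DIMS = 15 * 2 * 3        # pose_hand, both hands -> 90
JAW_DIMS = 1 * 3              # pose_jaw
FACE_EXPR_DIMS = 50           # face_expr, slice 159:209 in motion_arr_to_dict
FACE_SHAPE_DIMS = 100         # face_shape, slice 209:309
TRANS_DIMS = 3                # trans, slice 309:312
BETAS_DIMS = 10               # betas, slice 312:322

total = (ROOT_DIMS + BODY_DIMS + HAND_DIMS + JAW_DIMS
         + FACE_EXPR_DIMS + FACE_SHAPE_DIMS + TRANS_DIMS + BETAS_DIMS)
assert total == 322  # matches the smplx_322 directory naming

# drop_shapes_from_motion_arr keeps [:, :209] plus [:, 309:312],
# i.e. everything except face_shape and betas:
assert total - FACE_SHAPE_DIMS - BETAS_DIMS == 212  # matches the assert in __main__
```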
text2motion/tools/inference.py CHANGED
@@ -14,6 +14,24 @@ from utils.plot_script import *
  from utils.utils import *
  from utils.word_vectorizer import POS_enumerator

+ # def plot_t2m(opt, data, result_path, caption):
+ # joint = recover_from_ric(torch.from_numpy(data).float(), opt.joints_num).numpy()
+ # # joint = motion_temporal_filter(joint, sigma=1)
+ # plot_3d_motion(result_path, paramUtil.t2m_kinematic_chain, joint, title=caption, fps=20)
+
+
+ # def process(trainer, opt, device, mean, std, text, motion_length, result_path):
+
+ # result_dict = {}
+ # with torch.no_grad():
+ # if motion_length != -1:
+ # caption = [text]
+ # m_lens = torch.LongTensor([motion_length]).to(device)
+ # pred_motions = trainer.generate(caption, m_lens, opt.dim_pose)
+ # motion = pred_motions[0].cpu().numpy()
+ # motion = motion * std + mean
+ # title = text + " #%d" % motion.shape[0]
+ # plot_t2m(opt, motion, result_path, title

  def plot_t2m(data, result_path, npy_path, caption, joints_n):
  joint = recover_from_ric(torch.from_numpy(data).float(), joints_n).numpy()