Elle McFarlane committed
Commit 15d6c34 · 1 Parent(s): 98de0be

add gitignore etc

Files changed (50)
  1. .gitattributes +2 -0
  2. .gitignore +6 -0
  3. .python-version +1 -0
  4. text2motion/Makefile +131 -0
  5. text2motion/data/GRAB/grab_all.txt +1335 -0
  6. text2motion/data/GRAB/grab_test.txt +201 -0
  7. text2motion/data/GRAB/grab_train.txt +1068 -0
  8. text2motion/data/GRAB/grab_val.txt +66 -0
  9. text2motion/data/GRAB/test.txt +1 -0
  10. text2motion/data/GRAB/train.txt +1068 -0
  11. text2motion/data/GRAB/train_long.txt +8 -0
  12. text2motion/data/GRAB/train_short.txt +1 -0
  13. text2motion/data/GRAB/train_val.txt +1 -0
  14. text2motion/datasets/__init__.py +16 -0
  15. text2motion/datasets/combine_gifs.py +87 -0
  16. text2motion/datasets/dataloader.py +128 -0
  17. text2motion/datasets/dataset.py +194 -0
  18. text2motion/datasets/evaluator.py +469 -0
  19. text2motion/datasets/evaluator_models.py +438 -0
  20. text2motion/datasets/mean_mesh.py +231 -0
  21. text2motion/datasets/motionx_explorer.py +554 -0
  22. text2motion/datasets/rendering.py +126 -0
  23. text2motion/datasets/statistics_writer.py +56 -0
  24. text2motion/datasets/train_explorer.ipynb +0 -0
  25. text2motion/datasets/utils.py +18 -0
  26. text2motion/dtu_README.md +58 -0
  27. text2motion/install.md +154 -0
  28. text2motion/jobscript.sh +40 -0
  29. text2motion/models/__init__.py +4 -0
  30. text2motion/models/gaussian_diffusion.py +1147 -0
  31. text2motion/models/transformer.py +429 -0
  32. text2motion/options/base_options.py +91 -0
  33. text2motion/options/evaluate_options.py +27 -0
  34. text2motion/options/train_options.py +32 -0
  35. text2motion/requirements.txt +11 -0
  36. text2motion/tools/__init__.py +0 -0
  37. text2motion/tools/arguments.py +37 -0
  38. text2motion/tools/evaluation.py +278 -0
  39. text2motion/tools/inference.py +105 -0
  40. text2motion/tools/train.py +134 -0
  41. text2motion/utils/__init__.py +0 -0
  42. text2motion/utils/get_opt.py +102 -0
  43. text2motion/utils/metrics.py +146 -0
  44. text2motion/utils/motion_process.py +515 -0
  45. text2motion/utils/paramUtil.py +63 -0
  46. text2motion/utils/plot_script.py +115 -0
  47. text2motion/utils/quaternion.py +423 -0
  48. text2motion/utils/skeleton.py +199 -0
  49. text2motion/utils/utils.py +156 -0
  50. text2motion/utils/word_vectorizer.py +80 -0
.gitattributes ADDED
@@ -0,0 +1,2 @@
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
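
These two rules route tarballs and NumPy arrays through Git LFS. A minimal sketch of how such rules are usually produced (assuming git-lfs is installed; git lfs track appends the matching filter line to .gitattributes):

    git lfs install
    git lfs track "*.tar"
    git lfs track "*.npy"
    git add .gitattributes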
.gitignore ADDED
@@ -0,0 +1,6 @@
+ **.gif
+ **.pyc
+ **.npz
+ .vscode/
+ demo*
+ .ipynb
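
To see which of these patterns catches a given file, git check-ignore can be used (the path below is a hypothetical example, not a file from this commit):

    git check-ignore -v outputs/demo_walk.gif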
.python-version ADDED
@@ -0,0 +1 @@
+ 3.7
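
A .python-version file with this content is what pyenv writes when an interpreter is pinned for the project directory (a sketch, assuming pyenv or a compatible version manager is in use):

    pyenv local 3.7    # records "3.7" in .python-version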
text2motion/Makefile ADDED
@@ -0,0 +1,131 @@
+ # get git root
+ ROOT_DIR:=$(shell git rev-parse --show-toplevel)/text2motion
+ PYTHON_BIN:=python3
+ EXP:=motiondiffuse
+ SEED = 42
+ MODEL_DIR:=checkpoints/grab/demo
+ # MODEL_DIR:=checkpoints/grab/md_fulem_2g_excl_196_seed42
+ EPOCH:=latest
+ PROMPT:=a person walking happily
+ # PROMPT:=happiness airplane pass
+ GT_FILE=s2/cubesmall_lift # ground-truth
+ FRAMES=60
+ MEAN_EMOTION=surprise
+
+ mean-mesh:
+     cd $(ROOT_DIR) && vglrun ${PYTHON_BIN} -m datasets.mean_mesh \
+         --emotion ${MEAN_EMOTION} \
+         --file train.txt \
+
+ expl-train:
+     cd ${ROOT_DIR} && ${PYTHON_BIN} -m datasets.train_explorer \
+
+ eval:
+     cd ${ROOT_DIR} && ${PYTHON_BIN} -m tools.evaluation ${MODEL_DIR}/opt.txt \
+
+ # TODO (elmc): increase batch_size from 1 when not debugging!!
+ train: w_stats
+     echo "experiment name md_${EXP}_seed${SEED}"
+     cd ${ROOT_DIR} && ${PYTHON_BIN} -m tools.train \
+         --name md_${EXP}_seed${SEED} \
+         --batch_size 128 \
+         --times 50 \
+         --num_epochs 50 \
+         --dataset_name grab \
+         --num_layers 8 \
+         --diffusion_steps 1000 \
+         --data_parallel \
+         --gpu_id 0 1 \
+         --wandb_user "elles" \
+         --experiment_name md_${EXP}_seed${SEED} \
+         --log_every 50 \
+         --seed ${SEED} \
+         --use_wandb \
+
+ # gen-npy makes the model generate a seq according to text and writes the result to an npy file
+ gen-npy:
+     # checkpoints/t2m/t2m_motiondiffuse/opt.txt
+     cd ${ROOT_DIR} && ${PYTHON_BIN} -m tools.inference \
+         --opt_path ${MODEL_DIR}/opt.txt \
+         --which_epoch ${EPOCH} \
+         --text "${PROMPT}" \
+         --npy_path ${MODEL_DIR}/outputs \
+         --seed 42 \
+         --motion_length ${FRAMES} \
+
+ # put the model you trained in MODEL_DIR (set at top of file) and generate poses with it conditioned on prompt
+ # smpl-x model then displays poses as meshes
+ # WARNING: make sure to run 'make gen' first to generate the npy files for make gen
+ play-gen-gif:
+     echo "make sure to run on hpc dtu gui with graphics support and that you use 'vglrun' before python3 call!"
+     echo "WARNING: run 'make gen' first to generate the npy files for make gen"
+     cd $(ROOT_DIR) && vglrun ${PYTHON_BIN} -m datasets.motionx_explorer \
+         --model_path ${MODEL_DIR} \
+         --which_epoch ${EPOCH} \
+         --prompt "${PROMPT}" \
+         --display_mesh \
+         --save_gif \
+         --max_t ${FRAMES} \
+
+ play-gen-gui:
+     echo "make sure to run on hpc dtu gui with graphics support and that you use 'vglrun' before python3 call!"
+     echo "WARNING: run 'make infer' first to generate the npy files for make gen"
+     cd $(ROOT_DIR) && ${PYTHON_BIN} -m datasets.motionx_explorer \
+         --model_path ${MODEL_DIR} \
+         --which_epoch ${EPOCH} \
+         --prompt "${PROMPT}" \
+         --display_mesh \
+         --max_t ${FRAMES} \
+
+ # smpl-x displays poses from seq_file as meshes
+ play-gt-gif:
+     cd $(ROOT_DIR) && vglrun ${PYTHON_BIN} -m datasets.motionx_explorer \
+         --seq_file ${GT_FILE} \
+         --display_mesh \
+         --save_gif \
+         --max_t ${FRAMES} \
+
+ play-gt-gui:
+     cd $(ROOT_DIR) && vglrun ${PYTHON_BIN} -m datasets.motionx_explorer \
+         --seq_file ${GT_FILE} \
+         --display_mesh \
+         --max_t ${FRAMES} \
+
+ gen: gen-npy play-gen-gif
+
+ aug:
+     cd $(ROOT_DIR) && ${PYTHON_BIN} -m Motion-X.mocap-dataset-process.face_motion_augmentation \
+
+ print-data:
+     cd $(ROOT_DIR) && ${PYTHON_BIN} -m datasets.motionx_explorer \
+
+ queue:
+     cd ${ROOT_DIR} && bsub < jobscript.sh
+
+ w_stats:
+     cd ${ROOT_DIR} && ${PYTHON_BIN} -m datasets.statistics_writer \
+
+ w_custom:
+     cd ${ROOT_DIR} && ${PYTHON_BIN} -m datasets.custom_data_writer \
+
+ stat:
+     @err_file=$$(ls -v gpu_*.err | tail -n 1); \
+     out_file=$$(ls -v gpu_*.out | tail -n 1); \
+     echo "Latest .err file: $$err_file"; \
+     echo "Latest .out file: $$out_file"; \
+
+ # checks gpu utilization of latest job
+ gpu:
+     @err_file=$$(ls -v gpu_*.err | tail -n 1); \
+     err_number=$$(echo $$err_file | grep -oP 'gpu_\K\d+(?=\.err)'); \
+     echo "Latest .err file: $$err_file with number $$err_number"; \
+     bnvtop $$err_number; \
+
+ space:
+     getquota_work3.sh
+
+ hog:
+     du -h --max-depth=1 --apparent /work3/s222376/
+
+ env_setup:
+     @echo "module load cuda/10.1 cudnn/v7.6.5.32-prod-cuda-10.1 gcc/5.4.0"
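
A sketch of how these targets would typically be driven from the repo root; overriding variables on the make command line is standard Make behaviour, and the override values shown here are only illustrative:

    make train EXP=motiondiffuse SEED=7          # logs the run as md_motiondiffuse_seed7
    make gen PROMPT="a person waving" FRAMES=120
    make queue                                   # submits jobscript.sh to the LSF cluster via bsub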
text2motion/data/GRAB/grab_all.txt ADDED
@@ -0,0 +1,1335 @@
1
+ s8/mug_toast_1
2
+ s3/airplane_pass_1
3
+ s1/lightbulb_pass_1
4
+ s10/camera_takepicture_3
5
+ s1/wineglass_toast_1
6
+ s1/phone_lift
7
+ s4/binoculars_see_1
8
+ s8/teapot_pass_1
9
+ s9/cubemedium_pass_1
10
+ s8/elephant_lift
11
+ s2/gamecontroller_play_1
12
+ s6/duck_lift
13
+ s4/stapler_staple_1
14
+ s8/mug_drink_1
15
+ s8/train_lift
16
+ s1/cubesmall_offhand_1
17
+ s6/cylindermedium_inspect_1
18
+ s8/cylindersmall_lift
19
+ s3/waterbottle_drink_1
20
+ s1/wineglass_offhand_1
21
+ s5/scissors_pass_1
22
+ s1/cup_lift
23
+ s8/binoculars_pass_1
24
+ s9/hand_pass_1
25
+ s7/doorknob_lift
26
+ s8/spherelarge_pass_1
27
+ s10/alarmclock_see_1
28
+ s5/waterbottle_shake_1
29
+ s5/mouse_use_1
30
+ s2/cubesmall_lift
31
+ s4/cubelarge_inspect_1
32
+ s8/apple_eat_1
33
+ s3/cylinderlarge_lift_Retake
34
+ s7/mug_drink_1
35
+ s8/knife_pass_1
36
+ s1/flute_offhand_1
37
+ s7/stapler_pass_1
38
+ s6/flute_play_1
39
+ s6/stapler_staple_1
40
+ s3/gamecontroller_lift
41
+ s9/spheresmall_pass_1
42
+ s9/camera_takepicture_1
43
+ s3/apple_offhand_1
44
+ s4/binoculars_lift
45
+ s8/fryingpan_lift
46
+ s7/camera_pass_1
47
+ s10/hand_shake_1
48
+ s3/stamp_stamp_1
49
+ s6/watch_set_2
50
+ s6/wineglass_drink_1
51
+ s1/cubelarge_pass_1
52
+ s6/hand_inspect_1
53
+ s2/torussmall_lift
54
+ s1/torussmall_pass_1
55
+ s3/mug_offhand_1
56
+ s1/cylinderlarge_offhand_1
57
+ s2/cylindermedium_inspect_1
58
+ s4/stamp_stamp_1
59
+ s4/mouse_use_1
60
+ s8/spheremedium_lift
61
+ s8/headphones_pass_1
62
+ s3/gamecontroller_play_1
63
+ s2/stanfordbunny_lift
64
+ s10/fryingpan_cook_2
65
+ s3/wineglass_pass_1
66
+ s1/pyramidlarge_offhand_1
67
+ s6/toothpaste_pass_1
68
+ s1/mug_lift
69
+ s10/waterbottle_pass_1
70
+ s1/flashlight_pass_1
71
+ s7/banana_lift
72
+ s6/waterbottle_drink_1
73
+ s1/stamp_offhand_1
74
+ s1/camera_takepicture_1
75
+ s10/cubesmall_inspect_1
76
+ s7/alarmclock_see_1
77
+ s7/gamecontroller_pass_1
78
+ s8/banana_peel_1
79
+ s2/camera_takepicture_2
80
+ s2/cubemedium_lift
81
+ s10/train_pass_1
82
+ s10/cylindermedium_pass_1
83
+ s6/spherelarge_lift
84
+ s8/duck_inspect_1
85
+ s3/mug_drink_3
86
+ s6/hand_pass_1
87
+ s3/headphones_use_1
88
+ s5/wineglass_drink_1
89
+ s10/doorknob_use_2
90
+ s6/spheremedium_pass_1
91
+ s5/wineglass_toast_1
92
+ s5/pyramidmedium_inspect_1
93
+ s10/spheremedium_inspect_1
94
+ s5/bowl_drink_2
95
+ s10/pyramidmedium_pass_1
96
+ s9/watch_set_1
97
+ s7/pyramidmedium_pass_1
98
+ s5/stanfordbunny_pass_1
99
+ s8/stanfordbunny_lift
100
+ s7/waterbottle_open_1_Retake
101
+ s9/pyramidmedium_lift_Retake
102
+ s5/cubelarge_pass_1
103
+ s2/stapler_staple_1
104
+ s2/knife_pass_1
105
+ s5/duck_inspect_1
106
+ s8/wineglass_pass_1
107
+ s5/spherelarge_pass_1
108
+ s2/elephant_pass_1
109
+ s1/spherelarge_offhand_1
110
+ s4/doorknob_use_1
111
+ s4/stanfordbunny_pass_1
112
+ s5/piggybank_pass_1
113
+ s5/banana_peel_2
114
+ s10/spheremedium_lift
115
+ s9/gamecontroller_pass_1
116
+ s6/alarmclock_see_1
117
+ s9/toruslarge_inspect_1
118
+ s1/waterbottle_pass_1
119
+ s7/piggybank_use_1
120
+ s8/toruslarge_pass_1
121
+ s1/hammer_use_1
122
+ s6/stapler_offhand_1_Retake
123
+ s8/toothpaste_pass_1
124
+ s10/knife_peel_1
125
+ s4/cylindersmall_pass_1
126
+ s10/cylinderlarge_inspect_1
127
+ s7/flashlight_on_2
128
+ s2/cubelarge_lift
129
+ s8/fryingpan_cook_1
130
+ s6/wineglass_lift
131
+ s1/stapler_lift
132
+ s4/cup_pass_1
133
+ s10/piggybank_use_1
134
+ s10/binoculars_see_1
135
+ s4/spheremedium_pass_1
136
+ s9/bowl_drink_1
137
+ s4/piggybank_use_1
138
+ s5/cubesmall_inspect_1
139
+ s7/stamp_stamp_1
140
+ s3/scissors_use_2
141
+ s10/torusmedium_inspect_1
142
+ s7/hand_inspect_1
143
+ s5/cylindermedium_inspect_1
144
+ s8/banana_eat_1
145
+ s7/binoculars_see_1
146
+ s4/spheresmall_pass_1
147
+ s4/cylinderlarge_lift
148
+ s9/waterbottle_shake_1
149
+ s4/hand_inspect_1
150
+ s7/spherelarge_lift
151
+ s2/flashlight_on_1
152
+ s10/duck_pass_1
153
+ s9/bowl_pass_1
154
+ s1/mouse_lift
155
+ s9/watch_pass_1
156
+ s2/phone_call_1
157
+ s6/cylinderlarge_inspect_1
158
+ s3/wineglass_drink_2
159
+ s10/toothpaste_pass_1
160
+ s10/fryingpan_cook_1
161
+ s8/watch_lift
162
+ s9/pyramidlarge_inspect_1
163
+ s2/mouse_use_1
164
+ s6/pyramidmedium_pass_1
165
+ s4/mouse_pass_1
166
+ s6/wineglass_toast_1
167
+ s1/watch_lift
168
+ s9/hand_inspect_1
169
+ s1/airplane_lift
170
+ s9/mouse_lift
171
+ s6/eyeglasses_clean_1
172
+ s4/waterbottle_drink_1
173
+ s4/torussmall_lift
174
+ s6/binoculars_lift
175
+ s8/bowl_offhand_1
176
+ s2/eyeglasses_pass_1
177
+ s8/fryingpan_cook_2
178
+ s8/spherelarge_lift
179
+ s1/cylindermedium_pass_1
180
+ s1/banana_eat_1
181
+ s10/gamecontroller_pass_1
182
+ s9/camera_browse_1
183
+ s6/eyeglasses_wear_1
184
+ s1/stanfordbunny_lift
185
+ s6/cylindersmall_inspect_1
186
+ s6/elephant_pass_1
187
+ s3/torusmedium_inspect_1
188
+ s8/spheresmall_pass_1
189
+ s6/apple_offhand_1_Retake
190
+ s6/stanfordbunny_pass_1
191
+ s6/mug_drink_2
192
+ s8/mug_lift
193
+ s1/binoculars_see_1
194
+ s1/torussmall_offhand_1
195
+ s9/duck_pass_1
196
+ s8/cubemedium_offhand_1
197
+ s8/spherelarge_inspect_1
198
+ s7/wineglass_pass_1
199
+ s9/toothpaste_squeeze_1
200
+ s10/scissors_pass_1
201
+ s10/torussmall_inspect_1
202
+ s4/wineglass_lift
203
+ s5/binoculars_pass_1
204
+ s7/stanfordbunny_pass_1
205
+ s1/mug_drink_1
206
+ s1/stamp_pass_1
207
+ s8/piggybank_use_1
208
+ s10/spheremedium_pass_1
209
+ s9/torusmedium_pass_1
210
+ s3/mouse_lift
211
+ s1/cylindermedium_inspect_1
212
+ s8/cylindermedium_pass_1
213
+ s7/cubesmall_pass_1
214
+ s6/cylindermedium_lift
215
+ s7/cup_pour_1
216
+ s9/banana_pass_1
217
+ s1/camera_browse_1
218
+ s3/eyeglasses_clean_1
219
+ s1/doorknob_use_fun_1
220
+ s1/banana_lift
221
+ s6/cylindersmall_pass_1
222
+ s5/elephant_lift
223
+ s5/elephant_inspect_1
224
+ s9/waterbottle_lift
225
+ s6/piggybank_lift_Retake
226
+ s5/camera_takepicture_3
227
+ s4/stapler_staple_2
228
+ s1/cylindersmall_lift
229
+ s1/airplane_offhand_1
230
+ s9/apple_pass_1
231
+ s9/cylindermedium_pass_1
232
+ s9/hammer_use_3
233
+ s7/cubemedium_pass_1
234
+ s6/cubemedium_inspect_1
235
+ s2/mug_lift
236
+ s6/phone_lift
237
+ s4/spherelarge_inspect_1
238
+ s1/elephant_inspect_1
239
+ s4/headphones_pass_1
240
+ s8/binoculars_see_1
241
+ s7/pyramidmedium_inspect_1
242
+ s1/toothpaste_lift
243
+ s4/stamp_lift
244
+ s8/eyeglasses_pass_1
245
+ s7/hand_pass_1
246
+ s10/spherelarge_lift
247
+ s2/stapler_pass_1
248
+ s9/banana_peel_1
249
+ s2/mug_pass_1
250
+ s6/stapler_pass_1
251
+ s10/wineglass_drink_1
252
+ s10/lightbulb_screw_1
253
+ s7/spheresmall_pass_1
254
+ s2/stanfordbunny_inspect_1
255
+ s7/piggybank_pass_1
256
+ s8/mug_pass_1
257
+ s10/torussmall_lift
258
+ s8/cup_pour_1
259
+ s5/pyramidlarge_pass_1
260
+ s1/binoculars_pass_1
261
+ s5/hammer_pass_1
262
+ s1/flashlight_offhand_1
263
+ s7/elephant_pass_1
264
+ s6/hand_lift
265
+ s1/cubelarge_offhand_1
266
+ s9/elephant_lift
267
+ s8/banana_peel_2
268
+ s8/knife_peel_1
269
+ s1/teapot_pass_1
270
+ s10/headphones_pass_1
271
+ s2/toothbrush_lift
272
+ s9/knife_pass_1
273
+ s2/cubemedium_inspect_1
274
+ s8/teapot_lift
275
+ s3/pyramidmedium_pass_1
276
+ s2/cylindermedium_lift
277
+ s4/mug_pass_1
278
+ s10/stamp_lift
279
+ s7/airplane_lift_Retake
280
+ s10/toruslarge_pass_1
281
+ s6/fryingpan_pass_1
282
+ s4/train_play_1
283
+ s5/pyramidmedium_pass_1
284
+ s7/binoculars_pass_1
285
+ s1/phone_call_1
286
+ s2/spheresmall_inspect_1
287
+ s3/apple_lift
288
+ s6/hammer_use_3
289
+ s8/toothbrush_pass_1
290
+ s7/spheresmall_inspect_1
291
+ s10/flashlight_on_1
292
+ s9/elephant_inspect_1
293
+ s4/elephant_pass_1
294
+ s1/bowl_drink_1
295
+ s6/cubemedium_pass_1
296
+ s7/eyeglasses_pass_1
297
+ s8/airplane_fly_1
298
+ s3/alarmclock_lift
299
+ s4/doorknob_lift
300
+ s4/fryingpan_cook_2
301
+ s4/toothbrush_pass_1
302
+ s5/cylinderlarge_pass_1
303
+ s2/banana_eat_1
304
+ s5/stamp_stamp_1
305
+ s1/knife_lift
306
+ s10/hand_pass_1
307
+ s4/bowl_drink_2
308
+ s8/cup_pass_1
309
+ s5/hammer_use_1
310
+ s3/train_lift
311
+ s2/cup_pass_1
312
+ s7/torussmall_pass_1
313
+ s8/hand_pass_1
314
+ s10/cup_drink_2
315
+ s3/toothpaste_lift
316
+ s2/fryingpan_pass_1
317
+ s3/cylindersmall_pass_1
318
+ s5/spheremedium_lift
319
+ s10/apple_eat_1
320
+ s4/teapot_pour_2
321
+ s4/torussmall_pass_1
322
+ s5/toothpaste_pass_1
323
+ s7/headphones_pass_1
324
+ s5/camera_pass_1
325
+ s9/elephant_pass_1
326
+ s6/toothpaste_lift
327
+ s8/flute_pass_1
328
+ s4/spheremedium_lift
329
+ s3/binoculars_see_1
330
+ s10/camera_browse_1
331
+ s1/banana_pass_1
332
+ s1/toruslarge_lift
333
+ s6/cubelarge_inspect_1
334
+ s1/pyramidlarge_pass_1
335
+ s2/teapot_pour_1
336
+ s3/waterbottle_pass_1
337
+ s2/hammer_lift
338
+ s9/apple_lift
339
+ s9/waterbottle_pour_1
340
+ s4/cubemedium_lift
341
+ s9/phone_pass_1
342
+ s5/cubemedium_lift
343
+ s4/gamecontroller_pass_1
344
+ s9/cubemedium_inspect_1
345
+ s3/cup_drink_1
346
+ s4/stapler_lift
347
+ s2/apple_pass_1
348
+ s5/mug_toast_1
349
+ s5/cylindersmall_inspect_1
350
+ s3/cup_lift
351
+ s1/hammer_pass_1
352
+ s10/hammer_lift
353
+ s5/elephant_pass_1
354
+ s1/spheresmall_inspect_1
355
+ s3/toothbrush_brush_1
356
+ s7/apple_eat_1
357
+ s7/pyramidlarge_lift
358
+ s7/wineglass_drink_1
359
+ s7/mug_lift
360
+ s6/cubesmall_offhand_1
361
+ s2/camera_lift
362
+ s1/cubemedium_lift
363
+ s10/gamecontroller_lift
364
+ s9/cylinderlarge_pass_1
365
+ s8/cubelarge_inspect_1
366
+ s1/scissors_pass_1
367
+ s1/train_play_1
368
+ s5/stanfordbunny_inspect_1
369
+ s4/spheresmall_lift
370
+ s4/watch_lift
371
+ s9/cylinderlarge_inspect_1
372
+ s2/hand_inspect_1
373
+ s8/toothpaste_squeeze_1
374
+ s8/toothpaste_squeeze_2
375
+ s8/mouse_use_1
376
+ s2/teapot_pass_1
377
+ s7/cup_pass_1
378
+ s9/spheremedium_inspect_1
379
+ s10/gamecontroller_play_1
380
+ s7/apple_lift
381
+ s6/eyeglasses_lift
382
+ s10/flute_pass_1
383
+ s9/airplane_fly_1
384
+ s5/stamp_pass_1
385
+ s5/alarmclock_lift
386
+ s10/hammer_use_3
387
+ s3/cylindersmall_inspect_1
388
+ s10/alarmclock_pass_1
389
+ s9/waterbottle_drink_1
390
+ s10/banana_pass_1
391
+ s6/watch_pass_1
392
+ s1/banana_peel_2
393
+ s8/pyramidmedium_pass_1
394
+ s7/cubemedium_lift
395
+ s6/cup_drink_2
396
+ s4/cubelarge_pass_1
397
+ s1/bowl_pass_1
398
+ s8/wineglass_drink_2
399
+ s8/phone_call_1
400
+ s8/torusmedium_lift
401
+ s4/piggybank_pass_1
402
+ s9/stamp_pass_1
403
+ s10/hammer_use_2
404
+ s9/wineglass_pass_1
405
+ s8/camera_browse_1
406
+ s7/airplane_pass_1
407
+ s6/cup_pass_1
408
+ s6/airplane_pass_1
409
+ s9/torussmall_pass_1
410
+ s8/teapot_pour_2
411
+ s4/mouse_use_2
412
+ s10/flashlight_on_2
413
+ s7/pyramidsmall_pass_1
414
+ s3/cup_pour_1
415
+ s1/piggybank_pass_1
416
+ s5/cubesmall_pass_1
417
+ s10/airplane_fly_1
418
+ s8/cylinderlarge_offhand_1
419
+ s6/gamecontroller_play_1
420
+ s3/stanfordbunny_pass_1
421
+ s7/stanfordbunny_lift
422
+ s9/banana_peel_2
423
+ s2/apple_lift
424
+ s10/waterbottle_pour_1
425
+ s1/alarmclock_pass_1
426
+ s1/hammer_use_3
427
+ s1/eyeglasses_clean_1
428
+ s6/phone_call_1
429
+ s7/banana_eat_1
430
+ s4/waterbottle_shake_1
431
+ s8/waterbottle_pass_1
432
+ s6/cylinderlarge_lift
433
+ s9/hand_shake_1
434
+ s6/camera_browse_1
435
+ s2/camera_browse_1
436
+ s4/phone_pass_1
437
+ s1/stamp_lift
438
+ s7/torussmall_inspect_1
439
+ s1/cylindersmall_offhand_1
440
+ s7/cylinderlarge_lift
441
+ s2/duck_inspect_1
442
+ s7/spheremedium_pass_1
443
+ s8/toruslarge_lift
444
+ s3/hammer_pass_1
445
+ s3/cup_offhand_1
446
+ s2/headphones_use_1
447
+ s2/train_play_1
448
+ s10/scissors_use_1
449
+ s5/cubesmall_lift
450
+ s10/pyramidsmall_inspect_1
451
+ s3/mug_drink_4
452
+ s1/cubemedium_offhand_1
453
+ s6/apple_eat_1
454
+ s1/doorknob_use_2
455
+ s1/cylinderlarge_lift
456
+ s2/hand_lift
457
+ s5/gamecontroller_lift
458
+ s3/waterbottle_lift
459
+ s3/apple_pass_1
460
+ s10/flute_play_1
461
+ s5/spherelarge_inspect_1
462
+ s7/doorknob_use_1
463
+ s8/torusmedium_pass_1
464
+ s4/bowl_pass_1
465
+ s10/headphones_lift
466
+ s6/pyramidlarge_pass_1
467
+ s2/mug_toast_1
468
+ s4/hammer_use_3
469
+ s1/stanfordbunny_inspect_1
470
+ s3/alarmclock_pass_1
471
+ s8/cylindermedium_inspect_1
472
+ s1/elephant_lift
473
+ s6/waterbottle_pass_1
474
+ s8/cubelarge_lift
475
+ s3/camera_takepicture_1
476
+ s5/torussmall_inspect_1
477
+ s2/airplane_fly_1
478
+ s7/hammer_use_1
479
+ s1/spherelarge_lift
480
+ s1/cubemedium_pass_1
481
+ s2/headphones_lift
482
+ s8/piggybank_lift
483
+ s1/apple_lift
484
+ s3/spherelarge_inspect_1
485
+ s8/banana_pass_1
486
+ s8/stanfordbunny_pass_1
487
+ s4/flute_play_1
488
+ s6/banana_eat_1
489
+ s1/scissors_offhand_1
490
+ s1/binoculars_lift
491
+ s10/cup_lift
492
+ s7/eyeglasses_lift
493
+ s3/stapler_lift
494
+ s7/bowl_drink_1
495
+ s1/cup_drink_1
496
+ s4/alarmclock_lift
497
+ s6/headphones_use_1
498
+ s4/apple_lift
499
+ s6/mouse_pass_1
500
+ s7/flashlight_lift
501
+ s5/torusmedium_inspect_1
502
+ s10/cup_pour_1
503
+ s5/toruslarge_lift
504
+ s10/cup_drink_1
505
+ s7/cylinderlarge_pass_1
506
+ s4/wineglass_drink_1
507
+ s1/stapler_staple_2
508
+ s10/mug_drink_1
509
+ s1/flute_pass_1
510
+ s8/cylinderlarge_lift
511
+ s10/piggybank_pass_1
512
+ s7/duck_inspect_1
513
+ s9/stamp_lift
514
+ s6/bowl_lift
515
+ s6/banana_pass_1
516
+ s9/cubemedium_lift
517
+ s7/flute_pass_1
518
+ s8/eyeglasses_wear_1
519
+ s9/flashlight_lift
520
+ s9/hammer_use_2
521
+ s1/cup_pour_1
522
+ s2/piggybank_lift
523
+ s8/pyramidlarge_inspect_1
524
+ s2/teapot_pour_2
525
+ s6/pyramidsmall_inspect_2
526
+ s1/toothpaste_pass_1
527
+ s10/banana_peel_2
528
+ s1/wineglass_pass_1
529
+ s5/pyramidlarge_inspect_1
530
+ s5/cubemedium_inspect_1
531
+ s7/knife_chop_1
532
+ s6/mug_toast_1
533
+ s5/mug_drink_1
534
+ s6/banana_peel_1
535
+ s7/cylindermedium_pass_1
536
+ s10/mug_drink_2
537
+ s3/elephant_offhand_1
538
+ s4/stapler_pass_1
539
+ s1/torussmall_lift
540
+ s1/duck_lift
541
+ s5/flute_play_1
542
+ s1/airplane_fly_1
543
+ s2/headphones_pass_1
544
+ s3/lightbulb_screw_1
545
+ s8/toothbrush_brush_1
546
+ s10/mouse_use_1
547
+ s5/flute_play_2
548
+ s3/waterbottle_pour_2
549
+ s10/airplane_lift
550
+ s7/eyeglasses_wear_1
551
+ s10/stamp_stamp_1
552
+ s8/cup_drink_1
553
+ s4/alarmclock_pass_1
554
+ s3/train_pass_1
555
+ s2/hammer_use_2
556
+ s1/pyramidsmall_inspect_1
557
+ s9/cubesmall_inspect_1
558
+ s7/camera_browse_1
559
+ s10/spheresmall_pass_1
560
+ s1/phone_offhand_1
561
+ s6/waterbottle_shake_1
562
+ s1/alarmclock_offhand_1
563
+ s8/spheresmall_inspect_1
564
+ s9/cylindermedium_inspect_1
565
+ s6/knife_lift
566
+ s6/mouse_lift
567
+ s1/flashlight_lift
568
+ s9/cubelarge_pass_1
569
+ s2/pyramidlarge_pass_1_Retake
570
+ s7/stanfordbunny_inspect_1
571
+ s9/gamecontroller_play_1
572
+ s4/hand_pass_1
573
+ s3/toothpaste_squeeze_1
574
+ s9/duck_lift
575
+ s5/flashlight_on_1
576
+ s5/cup_lift
577
+ s10/waterbottle_drink_1
578
+ s8/cubesmall_pass_1
579
+ s10/train_lift
580
+ s10/spheresmall_inspect_1
581
+ s10/cup_pass_1
582
+ s1/cubesmall_pass_1
583
+ s4/fryingpan_cook_3
584
+ s6/elephant_inspect_1
585
+ s9/stapler_lift
586
+ s1/pyramidmedium_offhand_1
587
+ s9/pyramidmedium_pass_1
588
+ s9/teapot_pour_2
589
+ s4/fryingpan_cook_1
590
+ s1/eyeglasses_wear_1
591
+ s1/gamecontroller_play_1
592
+ s3/binoculars_offhand_1
593
+ s6/waterbottle_pour_1
594
+ s3/cubesmall_inspect_1
595
+ s8/pyramidlarge_pass_1
596
+ s2/train_pass_1
597
+ s10/mouse_lift
598
+ s10/torusmedium_pass_1
599
+ s8/waterbottle_drink_1
600
+ s7/cubelarge_lift
601
+ s5/duck_pass_1
602
+ s1/gamecontroller_offhand_1
603
+ s2/camera_pass_1
604
+ s7/gamecontroller_play_1
605
+ s7/toothbrush_pass_1
606
+ s8/phone_pass_1
607
+ s3/bowl_drink_1
608
+ s6/toruslarge_pass_1
609
+ s5/spheresmall_inspect_1
610
+ s8/flashlight_pass_1
611
+ s1/flashlight_on_2
612
+ s8/gamecontroller_lift
613
+ s3/stanfordbunny_lift
614
+ s8/bowl_pass_1
615
+ s4/banana_pass_1
616
+ s6/stapler_lift
617
+ s2/teapot_lift
618
+ s3/camera_takepicture_2
619
+ s8/torussmall_pass_1
620
+ s9/camera_pass_1
621
+ s4/apple_eat_1
622
+ s1/watch_set_1
623
+ s5/stanfordbunny_lift
624
+ s6/spheremedium_inspect_1
625
+ s1/eyeglasses_lift
626
+ s10/headphones_use_1
627
+ s3/doorknob_use_1
628
+ s10/apple_pass_1
629
+ s9/flute_pass_1
630
+ s1/toruslarge_inspect_1
631
+ s5/duck_lift
632
+ s9/doorknob_use_2
633
+ s10/torussmall_pass_1
634
+ s2/doorknob_use_2
635
+ s1/pyramidlarge_lift
636
+ s6/stamp_lift
637
+ s3/elephant_pass_1
638
+ s7/headphones_use_1
639
+ s6/cylindermedium_pass_1
640
+ s6/stapler_staple_2
641
+ s6/piggybank_pass_1
642
+ s5/spheremedium_inspect_1
643
+ s1/cubemedium_inspect_1
644
+ s4/pyramidmedium_pass_1
645
+ s7/cylinderlarge_inspect_1
646
+ s1/cylindermedium_offhand_1
647
+ s6/toothpaste_squeeze_1_Retake
648
+ s9/stapler_pass_1
649
+ s8/flashlight_on_1
650
+ s5/cup_drink_2
651
+ s8/apple_lift
652
+ s8/airplane_pass_1
653
+ s6/wineglass_pass_1
654
+ s4/cylinderlarge_inspect_1
655
+ s4/phone_call_1
656
+ s5/stapler_pass_1
657
+ s8/camera_takepicture_2
658
+ s3/mouse_pass_1
659
+ s7/pyramidlarge_inspect_1
660
+ s10/stapler_staple_2
661
+ s9/teapot_pass_1
662
+ s5/gamecontroller_play_1
663
+ s4/train_lift
664
+ s8/doorknob_lift
665
+ s1/watch_offhand_1
666
+ s7/mug_drink_2
667
+ s4/flashlight_pass_1
668
+ s1/mug_offhand_1
669
+ s4/camera_takepicture_3
670
+ s8/train_pass_1
671
+ s7/train_pass_1
672
+ s8/gamecontroller_play_1
673
+ s1/scissors_use_2
674
+ s9/piggybank_pass_1
675
+ s6/flute_lift
676
+ s1/banana_peel_1
677
+ s6/bowl_pass_1
678
+ s1/mouse_pass_1
679
+ s6/cubesmall_lift
680
+ s5/airplane_lift
681
+ s3/scissors_pass_1
682
+ s2/cylinderlarge_pass_1
683
+ s6/watch_set_1
684
+ s10/stamp_pass_1
685
+ s9/banana_lift
686
+ s3/toruslarge_lift
687
+ s10/hammer_use_1
688
+ s10/cubesmall_lift
689
+ s6/teapot_pass_1
690
+ s3/pyramidlarge_offhand_1
691
+ s8/cylindersmall_inspect_1
692
+ s3/cylinderlarge_inspect_1
693
+ s7/teapot_pass_1
694
+ s9/mouse_pass_1
695
+ s3/mug_drink_1
696
+ s2/spheremedium_pass_1
697
+ s9/waterbottle_pass_1
698
+ s2/flute_pass_1
699
+ s9/lightbulb_screw_1
700
+ s8/cubemedium_pass_1
701
+ s7/flashlight_pass_1
702
+ s9/eyeglasses_lift
703
+ s7/cubemedium_inspect_1
704
+ s5/waterbottle_lift
705
+ s6/cup_drink_1
706
+ s1/pyramidlarge_inspect_1
707
+ s3/airplane_fly_1
708
+ s10/pyramidmedium_lift
709
+ s10/bowl_lift
710
+ s2/cubelarge_pass_1
711
+ s7/stapler_staple_1
712
+ s8/flashlight_on_2
713
+ s4/cylindersmall_lift
714
+ s10/stanfordbunny_pass_1
715
+ s6/cubemedium_lift
716
+ s10/stapler_pass_1
717
+ s5/apple_pass_1
718
+ s7/doorknob_use_2
719
+ s4/watch_pass_1
720
+ s1/elephant_pass_1
721
+ s1/watch_set_2
722
+ s2/cylinderlarge_inspect_1
723
+ s2/pyramidlarge_inspect_1
724
+ s9/eyeglasses_wear_1
725
+ s4/phone_lift
726
+ s1/spheremedium_lift
727
+ s3/airplane_offhand_1
728
+ s6/spheresmall_pass_1
729
+ s9/piggybank_lift_Retake
730
+ s8/cylinderlarge_inspect_1
731
+ s5/wineglass_lift
732
+ s3/train_play_1
733
+ s10/knife_chop_1
734
+ s9/wineglass_drink_2
735
+ s6/hammer_lift
736
+ s9/cylindersmall_pass_1
737
+ s4/bowl_drink_1
738
+ s1/mug_toast_1
739
+ s1/pyramidmedium_inspect_1
740
+ s9/stamp_stamp_1
741
+ s10/toothpaste_squeeze_1
742
+ s1/spheresmall_lift
743
+ s3/doorknob_use_2
744
+ s7/fryingpan_cook_1
745
+ s5/teapot_pass_1
746
+ s8/phone_lift
747
+ s2/cubesmall_inspect_1
748
+ s1/toruslarge_pass_1
749
+ s6/headphones_pass_1
750
+ s8/cylinderlarge_pass_1
751
+ s4/toothpaste_pass_1
752
+ s4/camera_takepicture_2
753
+ s3/camera_offhand_1
754
+ s8/cup_offhand_1
755
+ s9/spherelarge_pass_1
756
+ s10/stanfordbunny_lift
757
+ s1/spheremedium_offhand_1
758
+ s4/torusmedium_lift
759
+ s9/cylinderlarge_lift
760
+ s6/hammer_use_1
761
+ s4/wineglass_toast_1
762
+ s2/cylinderlarge_lift
763
+ s6/cubesmall_inspect_1
764
+ s1/camera_takepicture_2
765
+ s3/phone_pass_1
766
+ s1/hand_inspect_1
767
+ s8/bowl_drink_2
768
+ s1/stapler_offhand_1
769
+ s1/piggybank_use_1
770
+ s1/apple_pass_1
771
+ s9/wineglass_drink_1
772
+ s10/mug_lift
773
+ s10/cylinderlarge_lift
774
+ s2/stamp_stamp_1
775
+ s10/stanfordbunny_inspect_1
776
+ s4/cubelarge_lift
777
+ s8/alarmclock_see_1
778
+ s5/cubelarge_lift
779
+ s3/eyeglasses_pass_1
780
+ s1/pyramidsmall_pass_1
781
+ s2/cylindermedium_pass_1
782
+ s8/fryingpan_offhand_1
783
+ s1/mug_drink_3
784
+ s2/piggybank_pass_1_Retake
785
+ s8/headphones_lift
786
+ s9/alarmclock_see_1
787
+ s5/toruslarge_pass_1
788
+ s1/stapler_staple_1
789
+ s1/stanfordbunny_offhand_1
790
+ s3/airplane_lift
791
+ s8/cubemedium_inspect_1
792
+ s9/stapler_staple_2
793
+ s3/fryingpan_lift
794
+ s10/spherelarge_inspect_1
795
+ s7/pyramidlarge_pass_1
796
+ s5/mouse_lift
797
+ s8/watch_set_2
798
+ s3/wineglass_drink_1
799
+ s6/spheremedium_lift
800
+ s2/stapler_staple_2
801
+ s9/airplane_pass_1
802
+ s6/duck_inspect_1
803
+ s10/lightbulb_pass_1
804
+ s8/stapler_pass_1
805
+ s6/camera_takepicture_3
806
+ s10/camera_takepicture_2
807
+ s10/mouse_pass_1
808
+ s3/cup_drink_2
809
+ s5/alarmclock_see_1
810
+ s10/elephant_inspect_1
811
+ s6/apple_pass_1
812
+ s3/gamecontroller_pass_1
813
+ s10/cubemedium_pass_1
814
+ s5/wineglass_pass_1
815
+ s5/waterbottle_pass_1
816
+ s10/teapot_pour_2
817
+ s6/train_pass_1
818
+ s7/hammer_use_2
819
+ s8/apple_pass_1
820
+ s6/cup_pour_1
821
+ s7/wineglass_lift
822
+ s9/toothbrush_pass_1
823
+ s6/doorknob_lift
824
+ s9/banana_eat_1
825
+ s4/cylinderlarge_pass_1
826
+ s10/cubelarge_lift
827
+ s1/camera_lift
828
+ s1/stanfordbunny_pass_1
829
+ s7/watch_pass_1
830
+ s9/cup_lift
831
+ s7/apple_pass_1
832
+ s3/piggybank_pass_1
833
+ s1/cylindersmall_inspect_1
834
+ s4/cup_drink_2
835
+ s8/spheremedium_pass_1
836
+ s6/waterbottle_open_1
837
+ s3/watch_set_1
838
+ s8/cubesmall_inspect_1
839
+ s3/duck_inspect_1
840
+ s7/cylindermedium_inspect_1
841
+ s9/cylindersmall_inspect_1
842
+ s1/spherelarge_inspect_1
843
+ s1/spheresmall_offhand_1
844
+ s8/cylindermedium_lift
845
+ s1/bowl_drink_2
846
+ s5/train_pass_1
847
+ s4/alarmclock_see_1
848
+ s8/flashlight_offhand_1
849
+ s2/cup_drink_1
850
+ s8/duck_lift
851
+ s6/cubelarge_pass_1
852
+ s6/flute_pass_1
853
+ s2/toruslarge_inspect_1
854
+ s4/camera_pass_1
855
+ s7/train_play_1
856
+ s9/mouse_use_1
857
+ s5/cylindermedium_pass_1
858
+ s3/pyramidsmall_pass_1
859
+ s3/eyeglasses_offhand_1
860
+ s8/wineglass_toast_1
861
+ s7/eyeglasses_clean_1
862
+ s5/toothbrush_brush_1
863
+ s10/toothpaste_squeeze_2
864
+ s1/flute_lift
865
+ s3/toothbrush_pass_1
866
+ s1/bowl_lift
867
+ s9/bowl_drink_2
868
+ s7/banana_peel_2
869
+ s8/doorknob_use_2
870
+ s8/torussmall_inspect_1
871
+ s1/camera_pass_1
872
+ s8/hand_inspect_1
873
+ s1/watch_pass_1
874
+ s10/watch_pass_1
875
+ s3/toruslarge_inspect_1
876
+ s9/torusmedium_inspect_1
877
+ s9/cylindermedium_lift
878
+ s10/teapot_pass_1
879
+ s6/wineglass_drink_2
880
+ s7/headphones_lift
881
+ s10/stapler_staple_1
882
+ s2/pyramidmedium_inspect_1
883
+ s3/flashlight_pass_1
884
+ s1/torusmedium_pass_1
885
+ s6/spherelarge_pass_1
886
+ s2/phone_pass_1
887
+ s2/pyramidmedium_pass_1
888
+ s9/stanfordbunny_pass_1
889
+ s8/pyramidmedium_inspect_1
890
+ s1/toothbrush_lift
891
+ s7/hammer_pass_1
892
+ s8/hand_lift
893
+ s8/camera_takepicture_3
894
+ s8/stamp_lift
895
+ s8/torusmedium_inspect_1
896
+ s7/phone_call_1
897
+ s3/pyramidlarge_inspect_1
898
+ s6/torussmall_pass_1
899
+ s3/mouse_use_1
900
+ s6/toruslarge_inspect_1
901
+ s3/elephant_lift
902
+ s1/wineglass_drink_2
903
+ s7/lightbulb_pass_1
904
+ s8/hammer_pass_1
905
+ s6/pyramidmedium_inspect_1
906
+ s8/gamecontroller_pass_1
907
+ s1/waterbottle_pour_1
908
+ s9/toothpaste_lift
909
+ s9/spherelarge_inspect_1
910
+ s8/teapot_pour_1
911
+ s4/spheremedium_inspect_1
912
+ s6/stamp_stamp_1
913
+ s9/hammer_use_1
914
+ s1/torusmedium_offhand_1
915
+ s1/alarmclock_see_1
916
+ s6/phone_pass_1
917
+ s5/waterbottle_open_1
918
+ s4/cubemedium_pass_1
919
+ s6/apple_lift
920
+ s1/headphones_pass_1
921
+ s8/hammer_use_3
922
+ s3/piggybank_use_1
923
+ s3/wineglass_toast_1
924
+ s6/spheresmall_lift
925
+ s1/camera_takepicture_3_Retake
926
+ s1/cylindermedium_lift
927
+ s6/teapot_pour_1
928
+ s4/train_pass_1
929
+ s7/toruslarge_inspect_1_Retake
930
+ s8/cubelarge_pass_1
931
+ s3/alarmclock_offhand_1
932
+ s4/hand_lift
933
+ s8/mug_drink_2
934
+ s5/cup_pass_1
935
+ s3/stapler_staple_1
936
+ s10/phone_lift
937
+ s6/torusmedium_inspect_1
938
+ s4/cup_drink_1
939
+ s2/train_lift
940
+ s8/stapler_staple_1
941
+ s2/apple_eat_1
942
+ s6/torusmedium_pass_1
943
+ s4/watch_set_1
944
+ s3/camera_takepicture_3
945
+ s9/spheresmall_inspect_1
946
+ s4/cubesmall_pass_1
947
+ s6/alarmclock_lift
948
+ s2/toruslarge_pass_1
949
+ s9/pyramidmedium_inspect_1
950
+ s3/stanfordbunny_inspect_1
951
+ s6/scissors_use_2
952
+ s1/stamp_stamp_1
953
+ s3/torusmedium_pass_1
954
+ s6/eyeglasses_pass_1
955
+ s7/cubelarge_inspect_1
956
+ s3/stamp_lift
957
+ s8/mouse_pass_1
958
+ s7/elephant_inspect_1
959
+ s8/camera_pass_1
960
+ s10/bowl_pass_1
961
+ s1/waterbottle_lift
962
+ s8/bowl_lift
963
+ s3/phone_call_1
964
+ s8/phone_offhand_1
965
+ s3/eyeglasses_wear_1
966
+ s6/bowl_drink_1_Retake
967
+ s9/torussmall_inspect_1
968
+ s1/piggybank_offhand_1
969
+ s1/mug_drink_2
970
+ s4/airplane_fly_1
971
+ s8/cubelarge_offhand_1
972
+ s10/camera_pass_1
973
+ s4/stanfordbunny_inspect_1
974
+ s6/pyramidlarge_inspect_1
975
+ s10/wineglass_drink_2
976
+ s1/duck_inspect_1
977
+ s4/duck_inspect_1
978
+ s1/mouse_use_1
979
+ s5/eyeglasses_clean_2
980
+ s10/camera_takepicture_1
981
+ s8/fryingpan_pass_1
982
+ s10/cubelarge_inspect_1
983
+ s7/hammer_use_3
984
+ s5/spherelarge_lift
985
+ s1/fryingpan_cook_1
986
+ s5/cubemedium_pass_1
987
+ s3/flute_play_1
988
+ s9/headphones_use_1
989
+ s1/cylinderlarge_inspect_1
990
+ s7/hammer_lift
991
+ s7/wineglass_toast_1
992
+ s4/pyramidmedium_inspect_1
993
+ s10/bowl_drink_1_Retake
994
+ s9/piggybank_use_1
995
+ s1/cubesmall_lift
996
+ s1/fryingpan_cook_2
997
+ s1/apple_eat_1
998
+ s9/watch_set_2
999
+ s9/bowl_lift
1000
+ s10/hand_lift
1001
+ s10/cylindersmall_inspect_1
1002
+ s1/piggybank_lift
1003
+ s2/toothpaste_squeeze_1
1004
+ s6/binoculars_see_1
1005
+ s6/spherelarge_inspect_1
1006
+ s8/alarmclock_lift
1007
+ s1/cylindersmall_pass_1
1008
+ s6/pyramidlarge_lift
1009
+ s4/binoculars_pass_1
1010
+ s10/bowl_drink_2
1011
+ s5/flute_pass_1
1012
+ s10/torusmedium_lift
1013
+ s1/duck_pass_1
1014
+ s2/cubelarge_inspect_1
1015
+ s8/watch_pass_1
1016
+ s1/waterbottle_open_1
1017
+ s9/flute_play_1
1018
+ s9/airplane_lift
1019
+ s1/mug_drink_4
1020
+ s4/wineglass_pass_1
1021
+ s6/camera_pass_1
1022
+ s4/duck_pass_1
1023
+ s10/cylindersmall_pass_1
1024
+ s10/alarmclock_lift_Retake
1025
+ s4/toruslarge_pass_1
1026
+ s10/wineglass_pass_1
1027
+ s5/cylindermedium_lift
1028
+ s2/pyramidlarge_lift
1029
+ s3/flute_offhand_1
1030
+ s8/camera_offhand_1
1031
+ s2/elephant_inspect_1
1032
+ s5/mug_pass_1
1033
+ s3/hammer_use_1
1034
+ s9/mug_drink_2
1035
+ s5/wineglass_drink_2
1036
+ s5/piggybank_lift
1037
+ s9/waterbottle_open_1
1038
+ s7/alarmclock_lift
1039
+ s1/eyeglasses_offhand_1
1040
+ s1/flute_play_1
1041
+ s8/wineglass_lift
1042
+ s7/waterbottle_shake_1
1043
+ s1/cubelarge_inspect_1
1044
+ s1/hammer_use_2
1045
+ s5/doorknob_lift
1046
+ s1/spheremedium_inspect_1
1047
+ s9/spheremedium_pass_1
1048
+ s6/knife_peel_1
1049
+ s4/fryingpan_pass_1
1050
+ s3/binoculars_lift
1051
+ s7/toruslarge_lift
1052
+ s4/piggybank_lift
1053
+ s3/cubelarge_pass_1
1054
+ s2/banana_peel_1
1055
+ s1/alarmclock_lift
1056
+ s5/train_lift
1057
+ s1/cup_pass_1
1058
+ s1/waterbottle_drink_1
1059
+ s3/bowl_drink_2
1060
+ s5/cylinderlarge_inspect_1
1061
+ s4/headphones_use_1
1062
+ s3/hand_pass_1
1063
+ s3/flashlight_on_1
1064
+ s3/toruslarge_pass_1
1065
+ s9/flashlight_on_1
1066
+ s8/mug_offhand_1
1067
+ s8/cubesmall_lift
1068
+ s10/airplane_pass_1
1069
+ s4/stanfordbunny_lift
1070
+ s3/cubelarge_offhand_1
1071
+ s2/stanfordbunny_pass_1
1072
+ s5/bowl_drink_1
1073
+ s3/knife_lift
1074
+ s8/fryingpan_cook_3
1075
+ s7/stamp_pass_1
1076
+ s10/hammer_pass_1
1077
+ s1/phone_pass_1
1078
+ s6/teapot_pour_2
1079
+ s2/mug_drink_2
1080
+ s8/spheresmall_lift
1081
+ s7/banana_pass_1
1082
+ s4/cup_lift
1083
+ s5/train_play_1
1084
+ s6/airplane_lift
1085
+ s3/spheremedium_inspect_1
1086
+ s1/hand_offhand_1
1087
+ s3/camera_pass_1
1088
+ s7/bowl_pass_1
1089
+ s6/mug_pass_1
1090
+ s7/binoculars_lift
1091
+ s8/hammer_lift
1092
+ s1/waterbottle_offhand_1
1093
+ s3/cylindermedium_pass_1
1094
+ s7/waterbottle_drink_1
1095
+ s7/spheremedium_inspect_1
1096
+ s10/banana_eat_1
1097
+ s2/spherelarge_lift
1098
+ s1/duck_offhand_2
1099
+ s6/doorknob_use_2
1100
+ s6/cylinderlarge_pass_1
1101
+ s9/camera_lift
1102
+ s1/eyeglasses_pass_1
1103
+ s3/cylinderlarge_pass_1
1104
+ s10/elephant_pass_1
1105
+ s4/scissors_pass_1
1106
+ s3/cubemedium_inspect_1
1107
+ s6/stamp_pass_1
1108
+ s9/mug_pass_1
1109
+ s10/phone_pass_1
1110
+ s7/mouse_pass_1
1111
+ s3/scissors_use_1
1112
+ s6/airplane_fly_1
1113
+ s8/lightbulb_pass_1
1114
+ s2/toruslarge_lift
1115
+ s8/stapler_lift
1116
+ s5/piggybank_use_1
1117
+ s6/banana_peel_2
1118
+ s10/doorknob_lift
1119
+ s8/stamp_pass_1
1120
+ s5/hand_pass_1
1121
+ s5/bowl_pass_1
1122
+ s3/pyramidlarge_pass_1
1123
+ s6/camera_lift
1124
+ s9/duck_inspect_1
1125
+ s3/cubelarge_lift
1126
+ s4/watch_set_2
1127
+ s5/cylinderlarge_lift
1128
+ s2/toothpaste_lift
1129
+ s8/knife_chop_1
1130
+ s1/flashlight_on_1
1131
+ s10/mug_pass_1
1132
+ s1/gamecontroller_lift
1133
+ s1/airplane_pass_1
1134
+ s7/cylindersmall_pass_1
1135
+ s7/cup_drink_1
1136
+ s7/piggybank_lift
1137
+ s9/lightbulb_pass_1
1138
+ s7/flashlight_on_1
1139
+ s5/binoculars_see_1
1140
+ s1/headphones_use_1
1141
+ s5/cubelarge_inspect_1
1142
+ s7/cubelarge_pass_1
1143
+ s9/toothpaste_pass_1
1144
+ s7/spherelarge_pass_1
1145
+ s10/banana_peel_1
1146
+ s6/stapler_staple_1_Retake
1147
+ s8/wineglass_drink_1
1148
+ s9/eyeglasses_pass_1
1149
+ s6/mug_lift
1150
+ s5/phone_pass_1
1151
+ s9/torussmall_lift
1152
+ s5/hand_inspect_1
1153
+ s1/cubelarge_lift
1154
+ s10/cubesmall_pass_1
1155
+ s7/banana_peel_1
1156
+ s3/watch_pass_1
1157
+ s2/banana_pass_1
1158
+ s8/duck_pass_1
1159
+ s7/waterbottle_pass_1
1160
+ s6/bowl_drink_1
1161
+ s1/teapot_pour_1
1162
+ s9/pyramidlarge_pass_1
1163
+ s3/mug_drink_2
1164
+ s10/spherelarge_pass_1
1165
+ s6/alarmclock_pass_1
1166
+ s6/hammer_use_2
1167
+ s9/cubesmall_pass_1
1168
+ s7/mouse_use_1
1169
+ s10/wineglass_lift
1170
+ s3/alarmclock_see_1
1171
+ s5/mouse_pass_1
1172
+ s8/doorknob_use_1
1173
+ s8/stamp_stamp_1
1174
+ s2/mug_drink_1
1175
+ s8/hammer_use_2
1176
+ s3/spherelarge_pass_1
1177
+ s2/flashlight_on_2
1178
+ s4/elephant_inspect_1
1179
+ s3/wineglass_lift
1180
+ s3/cup_pass_1
1181
+ s6/doorknob_use_1
1182
+ s1/torussmall_inspect_1
1183
+ s10/toruslarge_lift
1184
+ s6/cup_lift
1185
+ s9/alarmclock_lift
1186
+ s9/apple_eat_1
1187
+ s3/spheresmall_pass_1
1188
+ s6/flashlight_on_1
1189
+ s10/binoculars_lift
1190
+ s9/hammer_pass_1
1191
+ s1/doorknob_use_1
1192
+ s3/mug_pass_1
1193
+ s1/apple_offhand_1
1194
+ s9/doorknob_lift
1195
+ s7/gamecontroller_lift
1196
+ s9/cubelarge_lift
1197
+ s5/alarmclock_pass_1
1198
+ s9/cup_pass_1
1199
+ s4/apple_pass_1
1200
+ s8/elephant_inspect_1
1201
+ s6/toruslarge_lift
1202
+ s4/airplane_pass_1
1203
+ s8/bowl_drink_1
1204
+ s9/flashlight_pass_1
1205
+ s2/spherelarge_inspect_1
1206
+ s6/toothpaste_squeeze_2
1207
+ s8/stanfordbunny_inspect_1
1208
+ s7/cylindersmall_inspect_1
1209
+ s1/spheremedium_pass_1
1210
+ s8/train_play_1
1211
+ s8/elephant_pass_1
1212
+ s9/binoculars_pass_1
1213
+ s7/spherelarge_inspect_1
1214
+ s4/hammer_use_2
1215
+ s3/stapler_pass_1
1216
+ s3/cylindermedium_lift
1217
+ s7/camera_takepicture_3
1218
+ s9/stanfordbunny_inspect_1
1219
+ s3/mouse_offhand_1
1220
+ s8/cup_lift
1221
+ s1/pyramidmedium_lift
1222
+ s3/pyramidmedium_inspect_1
1223
+ s4/mug_drink_2
1224
+ s1/camera_offhand_1
1225
+ s8/cubesmall_offhand_1
1226
+ s1/pyramidmedium_pass_1
1227
+ s6/flashlight_on_2
1228
+ s1/torusmedium_inspect_1
1229
+ s8/stapler_staple_2
1230
+ s8/alarmclock_pass_1
1231
+ s8/mouse_lift
1232
+ s8/piggybank_pass_1
1233
+ s10/mug_toast_1
1234
+ s6/hammer_pass_1
1235
+ s2/hammer_use_1
1236
+ s4/waterbottle_pour_1
1237
+ s5/flashlight_on_2
1238
+ s8/hammer_use_1
1239
+ s7/train_lift
1240
+ s8/waterbottle_lift
1241
+ s1/wineglass_lift
1242
+ s5/gamecontroller_pass_1
1243
+ s3/toothpaste_pass_1
1244
+ s10/train_play_1
1245
+ s4/hammer_use_1
1246
+ s1/spherelarge_pass_1
1247
+ s10/wineglass_toast_1
1248
+ s9/cup_drink_1
1249
+ s1/torusmedium_lift
1250
+ s7/pyramidsmall_inspect_1
1251
+ s5/banana_pass_1
1252
+ s6/duck_pass_1
1253
+ s2/spherelarge_pass_1
1254
+ s10/cubemedium_inspect_1
1255
+ s1/binoculars_offhand_1
1256
+ s1/elephant_offhand_1
1257
+ s1/mug_pass_1
1258
+ s1/toothpaste_squeeze_1
1259
+ s5/airplane_pass_1
1260
+ s1/hand_pass_1
1261
+ s2/eyeglasses_clean_1
1262
+ s9/cubelarge_inspect_1
1263
+ s10/hand_inspect_1
1264
+ s7/phone_pass_1
1265
+ s6/gamecontroller_lift
1266
+ s8/toothpaste_lift
1267
+ s1/banana_offhand_1
1268
+ s4/spherelarge_lift
1269
+ s5/cylindersmall_pass_1
1270
+ s4/pyramidlarge_pass_1
1271
+ s10/cylindermedium_inspect_1
1272
+ s10/phone_call_1
1273
+ s6/flashlight_pass_1
1274
+ s6/train_play_1
1275
+ s10/binoculars_pass_1
1276
+ s10/eyeglasses_wear_1
1277
+ s1/mouse_offhand_1
1278
+ s4/camera_lift
1279
+ s6/gamecontroller_pass_1
1280
+ s2/scissors_use_1
1281
+ s6/stanfordbunny_inspect_1
1282
+ s2/cubesmall_pass_1
1283
+ s5/binoculars_lift
1284
+ s5/torusmedium_pass_1
1285
+ s3/stamp_pass_1
1286
+ s5/spheremedium_pass_1
1287
+ s8/flute_play_1
1288
+ s5/watch_pass_1
1289
+ s6/cubesmall_pass_1
1290
+ s5/cup_pour_1
1291
+ s4/stamp_pass_1
1292
+ s3/hammer_use_2
1293
+ s10/cubemedium_lift
1294
+ s10/pyramidmedium_inspect_1
1295
+ s8/cubemedium_lift
1296
+ s1/train_offhand_1
1297
+ s4/wineglass_drink_2
1298
+ s10/stapler_lift
1299
+ s4/torusmedium_inspect_1
1300
+ s2/piggybank_use_1
1301
+ s10/toothpaste_lift
1302
+ s2/gamecontroller_pass_1
1303
+ s8/banana_lift
1304
+ s10/cubelarge_pass_1
1305
+ s9/eyeglasses_clean_1
1306
+ s10/knife_pass_1
1307
+ s6/camera_takepicture_1
1308
+ s9/wineglass_toast_1_Retake
1309
+ s1/cubesmall_inspect_1
1310
+ s4/cup_pour_1
1311
+ s10/teapot_pour_1
1312
+ s1/stapler_pass_1
1313
+ s3/apple_eat_1
1314
+ s9/alarmclock_pass_1
1315
+ s7/wineglass_drink_2
1316
+ s1/cylinderlarge_pass_1
1317
+ s1/spheresmall_pass_1
1318
+ s10/banana_lift
1319
+ s7/camera_takepicture_2
1320
+ s8/scissors_pass_1
1321
+ s6/binoculars_pass_1
1322
+ s4/spherelarge_pass_1
1323
+ s1/headphones_offhand_1
1324
+ s10/toruslarge_inspect_1
1325
+ s10/cylinderlarge_pass_1
1326
+ s1/duck_offhand_1
1327
+ s10/doorknob_use_1
1328
+ s7/duck_pass_1
1329
+ s2/mouse_pass_1
1330
+ s10/pyramidlarge_pass_1
1331
+ s4/cubemedium_inspect_1
1332
+ s8/toruslarge_inspect_1
1333
+ s1/cup_drink_2
1334
+ s2/cylindersmall_pass_1
1335
+ s6/camera_takepicture_2
text2motion/data/GRAB/grab_test.txt ADDED
@@ -0,0 +1,201 @@
1
+ s7/cup_drink_1
2
+ s7/piggybank_lift
3
+ s9/lightbulb_pass_1
4
+ s7/flashlight_on_1
5
+ s5/binoculars_see_1
6
+ s1/headphones_use_1
7
+ s5/cubelarge_inspect_1
8
+ s7/cubelarge_pass_1
9
+ s9/toothpaste_pass_1
10
+ s7/spherelarge_pass_1
11
+ s10/banana_peel_1
12
+ s6/stapler_staple_1_Retake
13
+ s8/wineglass_drink_1
14
+ s9/eyeglasses_pass_1
15
+ s6/mug_lift
16
+ s5/phone_pass_1
17
+ s9/torussmall_lift
18
+ s5/hand_inspect_1
19
+ s1/cubelarge_lift
20
+ s10/cubesmall_pass_1
21
+ s7/banana_peel_1
22
+ s3/watch_pass_1
23
+ s2/banana_pass_1
24
+ s8/duck_pass_1
25
+ s7/waterbottle_pass_1
26
+ s6/bowl_drink_1
27
+ s1/teapot_pour_1
28
+ s9/pyramidlarge_pass_1
29
+ s3/mug_drink_2
30
+ s10/spherelarge_pass_1
31
+ s6/alarmclock_pass_1
32
+ s6/hammer_use_2
33
+ s9/cubesmall_pass_1
34
+ s7/mouse_use_1
35
+ s10/wineglass_lift
36
+ s3/alarmclock_see_1
37
+ s5/mouse_pass_1
38
+ s8/doorknob_use_1
39
+ s8/stamp_stamp_1
40
+ s2/mug_drink_1
41
+ s8/hammer_use_2
42
+ s3/spherelarge_pass_1
43
+ s2/flashlight_on_2
44
+ s4/elephant_inspect_1
45
+ s3/wineglass_lift
46
+ s3/cup_pass_1
47
+ s6/doorknob_use_1
48
+ s1/torussmall_inspect_1
49
+ s10/toruslarge_lift
50
+ s6/cup_lift
51
+ s9/alarmclock_lift
52
+ s9/apple_eat_1
53
+ s3/spheresmall_pass_1
54
+ s6/flashlight_on_1
55
+ s10/binoculars_lift
56
+ s9/hammer_pass_1
57
+ s1/doorknob_use_1
58
+ s3/mug_pass_1
59
+ s1/apple_offhand_1
60
+ s9/doorknob_lift
61
+ s7/gamecontroller_lift
62
+ s9/cubelarge_lift
63
+ s5/alarmclock_pass_1
64
+ s9/cup_pass_1
65
+ s4/apple_pass_1
66
+ s8/elephant_inspect_1
67
+ s6/toruslarge_lift
68
+ s4/airplane_pass_1
69
+ s8/bowl_drink_1
70
+ s9/flashlight_pass_1
71
+ s2/spherelarge_inspect_1
72
+ s6/toothpaste_squeeze_2
73
+ s8/stanfordbunny_inspect_1
74
+ s7/cylindersmall_inspect_1
75
+ s1/spheremedium_pass_1
76
+ s8/train_play_1
77
+ s8/elephant_pass_1
78
+ s9/binoculars_pass_1
79
+ s7/spherelarge_inspect_1
80
+ s4/hammer_use_2
81
+ s3/stapler_pass_1
82
+ s3/cylindermedium_lift
83
+ s7/camera_takepicture_3
84
+ s9/stanfordbunny_inspect_1
85
+ s3/mouse_offhand_1
86
+ s8/cup_lift
87
+ s1/pyramidmedium_lift
88
+ s3/pyramidmedium_inspect_1
89
+ s4/mug_drink_2
90
+ s1/camera_offhand_1
91
+ s8/cubesmall_offhand_1
92
+ s1/pyramidmedium_pass_1
93
+ s6/flashlight_on_2
94
+ s1/torusmedium_inspect_1
95
+ s8/stapler_staple_2
96
+ s8/alarmclock_pass_1
97
+ s8/mouse_lift
98
+ s8/piggybank_pass_1
99
+ s10/mug_toast_1
100
+ s6/hammer_pass_1
101
+ s2/hammer_use_1
102
+ s4/waterbottle_pour_1
103
+ s5/flashlight_on_2
104
+ s8/hammer_use_1
105
+ s7/train_lift
106
+ s8/waterbottle_lift
107
+ s1/wineglass_lift
108
+ s5/gamecontroller_pass_1
109
+ s3/toothpaste_pass_1
110
+ s10/train_play_1
111
+ s4/hammer_use_1
112
+ s1/spherelarge_pass_1
113
+ s10/wineglass_toast_1
114
+ s9/cup_drink_1
115
+ s1/torusmedium_lift
116
+ s7/pyramidsmall_inspect_1
117
+ s5/banana_pass_1
118
+ s6/duck_pass_1
119
+ s2/spherelarge_pass_1
120
+ s10/cubemedium_inspect_1
121
+ s1/binoculars_offhand_1
122
+ s1/elephant_offhand_1
123
+ s1/mug_pass_1
124
+ s1/toothpaste_squeeze_1
125
+ s5/airplane_pass_1
126
+ s1/hand_pass_1
127
+ s2/eyeglasses_clean_1
128
+ s9/cubelarge_inspect_1
129
+ s10/hand_inspect_1
130
+ s7/phone_pass_1
131
+ s6/gamecontroller_lift
132
+ s8/toothpaste_lift
133
+ s1/banana_offhand_1
134
+ s4/spherelarge_lift
135
+ s5/cylindersmall_pass_1
136
+ s4/pyramidlarge_pass_1
137
+ s10/cylindermedium_inspect_1
138
+ s10/phone_call_1
139
+ s6/flashlight_pass_1
140
+ s6/train_play_1
141
+ s10/binoculars_pass_1
142
+ s10/eyeglasses_wear_1
143
+ s1/mouse_offhand_1
144
+ s4/camera_lift
145
+ s6/gamecontroller_pass_1
146
+ s2/scissors_use_1
147
+ s6/stanfordbunny_inspect_1
148
+ s2/cubesmall_pass_1
149
+ s5/binoculars_lift
150
+ s5/torusmedium_pass_1
151
+ s3/stamp_pass_1
152
+ s5/spheremedium_pass_1
153
+ s8/flute_play_1
154
+ s5/watch_pass_1
155
+ s6/cubesmall_pass_1
156
+ s5/cup_pour_1
157
+ s4/stamp_pass_1
158
+ s3/hammer_use_2
159
+ s10/cubemedium_lift
160
+ s10/pyramidmedium_inspect_1
161
+ s8/cubemedium_lift
162
+ s1/train_offhand_1
163
+ s4/wineglass_drink_2
164
+ s10/stapler_lift
165
+ s4/torusmedium_inspect_1
166
+ s2/piggybank_use_1
167
+ s10/toothpaste_lift
168
+ s2/gamecontroller_pass_1
169
+ s8/banana_lift
170
+ s10/cubelarge_pass_1
171
+ s9/eyeglasses_clean_1
172
+ s10/knife_pass_1
173
+ s6/camera_takepicture_1
174
+ s9/wineglass_toast_1_Retake
175
+ s1/cubesmall_inspect_1
176
+ s4/cup_pour_1
177
+ s10/teapot_pour_1
178
+ s1/stapler_pass_1
179
+ s3/apple_eat_1
180
+ s9/alarmclock_pass_1
181
+ s7/wineglass_drink_2
182
+ s1/cylinderlarge_pass_1
183
+ s1/spheresmall_pass_1
184
+ s10/banana_lift
185
+ s7/camera_takepicture_2
186
+ s8/scissors_pass_1
187
+ s6/binoculars_pass_1
188
+ s4/spherelarge_pass_1
189
+ s1/headphones_offhand_1
190
+ s10/toruslarge_inspect_1
191
+ s10/cylinderlarge_pass_1
192
+ s1/duck_offhand_1
193
+ s10/doorknob_use_1
194
+ s7/duck_pass_1
195
+ s2/mouse_pass_1
196
+ s10/pyramidlarge_pass_1
197
+ s4/cubemedium_inspect_1
198
+ s8/toruslarge_inspect_1
199
+ s1/cup_drink_2
200
+ s2/cylindersmall_pass_1
201
+ s6/camera_takepicture_2
text2motion/data/GRAB/grab_train.txt ADDED
@@ -0,0 +1,1068 @@
1
+ s8/mug_toast_1
2
+ s3/airplane_pass_1
3
+ s1/lightbulb_pass_1
4
+ s10/camera_takepicture_3
5
+ s1/wineglass_toast_1
6
+ s1/phone_lift
7
+ s4/binoculars_see_1
8
+ s8/teapot_pass_1
9
+ s9/cubemedium_pass_1
10
+ s8/elephant_lift
11
+ s2/gamecontroller_play_1
12
+ s6/duck_lift
13
+ s4/stapler_staple_1
14
+ s8/mug_drink_1
15
+ s8/train_lift
16
+ s1/cubesmall_offhand_1
17
+ s6/cylindermedium_inspect_1
18
+ s8/cylindersmall_lift
19
+ s3/waterbottle_drink_1
20
+ s1/wineglass_offhand_1
21
+ s5/scissors_pass_1
22
+ s1/cup_lift
23
+ s8/binoculars_pass_1
24
+ s9/hand_pass_1
25
+ s7/doorknob_lift
26
+ s8/spherelarge_pass_1
27
+ s10/alarmclock_see_1
28
+ s5/waterbottle_shake_1
29
+ s5/mouse_use_1
30
+ s2/cubesmall_lift
31
+ s4/cubelarge_inspect_1
32
+ s8/apple_eat_1
33
+ s3/cylinderlarge_lift_Retake
34
+ s7/mug_drink_1
35
+ s8/knife_pass_1
36
+ s1/flute_offhand_1
37
+ s7/stapler_pass_1
38
+ s6/flute_play_1
39
+ s6/stapler_staple_1
40
+ s3/gamecontroller_lift
41
+ s9/spheresmall_pass_1
42
+ s9/camera_takepicture_1
43
+ s3/apple_offhand_1
44
+ s4/binoculars_lift
45
+ s8/fryingpan_lift
46
+ s7/camera_pass_1
47
+ s10/hand_shake_1
48
+ s3/stamp_stamp_1
49
+ s6/watch_set_2
50
+ s6/wineglass_drink_1
51
+ s1/cubelarge_pass_1
52
+ s6/hand_inspect_1
53
+ s2/torussmall_lift
54
+ s1/torussmall_pass_1
55
+ s3/mug_offhand_1
56
+ s1/cylinderlarge_offhand_1
57
+ s2/cylindermedium_inspect_1
58
+ s4/stamp_stamp_1
59
+ s4/mouse_use_1
60
+ s8/spheremedium_lift
61
+ s8/headphones_pass_1
62
+ s3/gamecontroller_play_1
63
+ s2/stanfordbunny_lift
64
+ s10/fryingpan_cook_2
65
+ s3/wineglass_pass_1
66
+ s1/pyramidlarge_offhand_1
67
+ s6/toothpaste_pass_1
68
+ s1/mug_lift
69
+ s10/waterbottle_pass_1
70
+ s1/flashlight_pass_1
71
+ s7/banana_lift
72
+ s6/waterbottle_drink_1
73
+ s1/stamp_offhand_1
74
+ s1/camera_takepicture_1
75
+ s10/cubesmall_inspect_1
76
+ s7/alarmclock_see_1
77
+ s7/gamecontroller_pass_1
78
+ s8/banana_peel_1
79
+ s2/camera_takepicture_2
80
+ s2/cubemedium_lift
81
+ s10/train_pass_1
82
+ s10/cylindermedium_pass_1
83
+ s6/spherelarge_lift
84
+ s8/duck_inspect_1
85
+ s3/mug_drink_3
86
+ s6/hand_pass_1
87
+ s3/headphones_use_1
88
+ s5/wineglass_drink_1
89
+ s10/doorknob_use_2
90
+ s6/spheremedium_pass_1
91
+ s5/wineglass_toast_1
92
+ s5/pyramidmedium_inspect_1
93
+ s10/spheremedium_inspect_1
94
+ s5/bowl_drink_2
95
+ s10/pyramidmedium_pass_1
96
+ s9/watch_set_1
97
+ s7/pyramidmedium_pass_1
98
+ s5/stanfordbunny_pass_1
99
+ s8/stanfordbunny_lift
100
+ s7/waterbottle_open_1_Retake
101
+ s9/pyramidmedium_lift_Retake
102
+ s5/cubelarge_pass_1
103
+ s2/stapler_staple_1
104
+ s2/knife_pass_1
105
+ s5/duck_inspect_1
106
+ s8/wineglass_pass_1
107
+ s5/spherelarge_pass_1
108
+ s2/elephant_pass_1
109
+ s1/spherelarge_offhand_1
110
+ s4/doorknob_use_1
111
+ s4/stanfordbunny_pass_1
112
+ s5/piggybank_pass_1
113
+ s5/banana_peel_2
114
+ s10/spheremedium_lift
115
+ s9/gamecontroller_pass_1
116
+ s6/alarmclock_see_1
117
+ s9/toruslarge_inspect_1
118
+ s1/waterbottle_pass_1
119
+ s7/piggybank_use_1
120
+ s8/toruslarge_pass_1
121
+ s1/hammer_use_1
122
+ s6/stapler_offhand_1_Retake
123
+ s8/toothpaste_pass_1
124
+ s10/knife_peel_1
125
+ s4/cylindersmall_pass_1
126
+ s10/cylinderlarge_inspect_1
127
+ s7/flashlight_on_2
128
+ s2/cubelarge_lift
129
+ s8/fryingpan_cook_1
130
+ s6/wineglass_lift
131
+ s1/stapler_lift
132
+ s4/cup_pass_1
133
+ s10/piggybank_use_1
134
+ s10/binoculars_see_1
135
+ s4/spheremedium_pass_1
136
+ s9/bowl_drink_1
137
+ s4/piggybank_use_1
138
+ s5/cubesmall_inspect_1
139
+ s7/stamp_stamp_1
140
+ s3/scissors_use_2
141
+ s10/torusmedium_inspect_1
142
+ s7/hand_inspect_1
143
+ s5/cylindermedium_inspect_1
144
+ s8/banana_eat_1
145
+ s7/binoculars_see_1
146
+ s4/spheresmall_pass_1
147
+ s4/cylinderlarge_lift
148
+ s9/waterbottle_shake_1
149
+ s4/hand_inspect_1
150
+ s7/spherelarge_lift
151
+ s2/flashlight_on_1
152
+ s10/duck_pass_1
153
+ s9/bowl_pass_1
154
+ s1/mouse_lift
155
+ s9/watch_pass_1
156
+ s2/phone_call_1
157
+ s6/cylinderlarge_inspect_1
158
+ s3/wineglass_drink_2
159
+ s10/toothpaste_pass_1
160
+ s10/fryingpan_cook_1
161
+ s8/watch_lift
162
+ s9/pyramidlarge_inspect_1
163
+ s2/mouse_use_1
164
+ s6/pyramidmedium_pass_1
165
+ s4/mouse_pass_1
166
+ s6/wineglass_toast_1
167
+ s1/watch_lift
168
+ s9/hand_inspect_1
169
+ s1/airplane_lift
170
+ s9/mouse_lift
171
+ s6/eyeglasses_clean_1
172
+ s4/waterbottle_drink_1
173
+ s4/torussmall_lift
174
+ s6/binoculars_lift
175
+ s8/bowl_offhand_1
176
+ s2/eyeglasses_pass_1
177
+ s8/fryingpan_cook_2
178
+ s8/spherelarge_lift
179
+ s1/cylindermedium_pass_1
180
+ s1/banana_eat_1
181
+ s10/gamecontroller_pass_1
182
+ s9/camera_browse_1
183
+ s6/eyeglasses_wear_1
184
+ s1/stanfordbunny_lift
185
+ s6/cylindersmall_inspect_1
186
+ s6/elephant_pass_1
187
+ s3/torusmedium_inspect_1
188
+ s8/spheresmall_pass_1
189
+ s6/apple_offhand_1_Retake
190
+ s6/stanfordbunny_pass_1
191
+ s6/mug_drink_2
192
+ s8/mug_lift
193
+ s1/binoculars_see_1
194
+ s1/torussmall_offhand_1
195
+ s9/duck_pass_1
196
+ s8/cubemedium_offhand_1
197
+ s8/spherelarge_inspect_1
198
+ s7/wineglass_pass_1
199
+ s9/toothpaste_squeeze_1
200
+ s10/scissors_pass_1
201
+ s10/torussmall_inspect_1
202
+ s4/wineglass_lift
203
+ s5/binoculars_pass_1
204
+ s7/stanfordbunny_pass_1
205
+ s1/mug_drink_1
206
+ s1/stamp_pass_1
207
+ s8/piggybank_use_1
208
+ s10/spheremedium_pass_1
209
+ s9/torusmedium_pass_1
210
+ s3/mouse_lift
211
+ s1/cylindermedium_inspect_1
212
+ s8/cylindermedium_pass_1
213
+ s7/cubesmall_pass_1
214
+ s6/cylindermedium_lift
215
+ s7/cup_pour_1
216
+ s9/banana_pass_1
217
+ s1/camera_browse_1
218
+ s3/eyeglasses_clean_1
219
+ s1/doorknob_use_fun_1
220
+ s1/banana_lift
221
+ s6/cylindersmall_pass_1
222
+ s5/elephant_lift
223
+ s5/elephant_inspect_1
224
+ s9/waterbottle_lift
225
+ s6/piggybank_lift_Retake
226
+ s5/camera_takepicture_3
227
+ s4/stapler_staple_2
228
+ s1/cylindersmall_lift
229
+ s1/airplane_offhand_1
230
+ s9/apple_pass_1
231
+ s9/cylindermedium_pass_1
232
+ s9/hammer_use_3
233
+ s7/cubemedium_pass_1
234
+ s6/cubemedium_inspect_1
235
+ s2/mug_lift
236
+ s6/phone_lift
237
+ s4/spherelarge_inspect_1
238
+ s1/elephant_inspect_1
239
+ s4/headphones_pass_1
240
+ s8/binoculars_see_1
241
+ s7/pyramidmedium_inspect_1
242
+ s1/toothpaste_lift
243
+ s4/stamp_lift
244
+ s8/eyeglasses_pass_1
245
+ s7/hand_pass_1
246
+ s10/spherelarge_lift
247
+ s2/stapler_pass_1
248
+ s9/banana_peel_1
249
+ s2/mug_pass_1
250
+ s6/stapler_pass_1
251
+ s10/wineglass_drink_1
252
+ s10/lightbulb_screw_1
253
+ s7/spheresmall_pass_1
254
+ s2/stanfordbunny_inspect_1
255
+ s7/piggybank_pass_1
256
+ s8/mug_pass_1
257
+ s10/torussmall_lift
258
+ s8/cup_pour_1
259
+ s5/pyramidlarge_pass_1
260
+ s1/binoculars_pass_1
261
+ s5/hammer_pass_1
262
+ s1/flashlight_offhand_1
263
+ s7/elephant_pass_1
264
+ s6/hand_lift
265
+ s1/cubelarge_offhand_1
266
+ s9/elephant_lift
267
+ s8/banana_peel_2
268
+ s8/knife_peel_1
269
+ s1/teapot_pass_1
270
+ s10/headphones_pass_1
271
+ s2/toothbrush_lift
272
+ s9/knife_pass_1
273
+ s2/cubemedium_inspect_1
274
+ s8/teapot_lift
275
+ s3/pyramidmedium_pass_1
276
+ s2/cylindermedium_lift
277
+ s4/mug_pass_1
278
+ s10/stamp_lift
279
+ s7/airplane_lift_Retake
280
+ s10/toruslarge_pass_1
281
+ s6/fryingpan_pass_1
282
+ s4/train_play_1
283
+ s5/pyramidmedium_pass_1
284
+ s7/binoculars_pass_1
285
+ s1/phone_call_1
286
+ s2/spheresmall_inspect_1
287
+ s3/apple_lift
288
+ s6/hammer_use_3
289
+ s8/toothbrush_pass_1
290
+ s7/spheresmall_inspect_1
291
+ s10/flashlight_on_1
292
+ s9/elephant_inspect_1
293
+ s4/elephant_pass_1
294
+ s1/bowl_drink_1
295
+ s6/cubemedium_pass_1
296
+ s7/eyeglasses_pass_1
297
+ s8/airplane_fly_1
298
+ s3/alarmclock_lift
299
+ s4/doorknob_lift
300
+ s4/fryingpan_cook_2
301
+ s4/toothbrush_pass_1
302
+ s5/cylinderlarge_pass_1
303
+ s2/banana_eat_1
304
+ s5/stamp_stamp_1
305
+ s1/knife_lift
306
+ s10/hand_pass_1
307
+ s4/bowl_drink_2
308
+ s8/cup_pass_1
309
+ s5/hammer_use_1
310
+ s3/train_lift
311
+ s2/cup_pass_1
312
+ s7/torussmall_pass_1
313
+ s8/hand_pass_1
314
+ s10/cup_drink_2
315
+ s3/toothpaste_lift
316
+ s2/fryingpan_pass_1
317
+ s3/cylindersmall_pass_1
318
+ s5/spheremedium_lift
319
+ s10/apple_eat_1
320
+ s4/teapot_pour_2
321
+ s4/torussmall_pass_1
322
+ s5/toothpaste_pass_1
323
+ s7/headphones_pass_1
324
+ s5/camera_pass_1
325
+ s9/elephant_pass_1
326
+ s6/toothpaste_lift
327
+ s8/flute_pass_1
328
+ s4/spheremedium_lift
329
+ s3/binoculars_see_1
330
+ s10/camera_browse_1
331
+ s1/banana_pass_1
332
+ s1/toruslarge_lift
333
+ s6/cubelarge_inspect_1
334
+ s1/pyramidlarge_pass_1
335
+ s2/teapot_pour_1
336
+ s3/waterbottle_pass_1
337
+ s2/hammer_lift
338
+ s9/apple_lift
339
+ s9/waterbottle_pour_1
340
+ s4/cubemedium_lift
341
+ s9/phone_pass_1
342
+ s5/cubemedium_lift
343
+ s4/gamecontroller_pass_1
344
+ s9/cubemedium_inspect_1
345
+ s3/cup_drink_1
346
+ s4/stapler_lift
347
+ s2/apple_pass_1
348
+ s5/mug_toast_1
349
+ s5/cylindersmall_inspect_1
350
+ s3/cup_lift
351
+ s1/hammer_pass_1
352
+ s10/hammer_lift
353
+ s5/elephant_pass_1
354
+ s1/spheresmall_inspect_1
355
+ s3/toothbrush_brush_1
356
+ s7/apple_eat_1
357
+ s7/pyramidlarge_lift
358
+ s7/wineglass_drink_1
359
+ s7/mug_lift
360
+ s6/cubesmall_offhand_1
361
+ s2/camera_lift
362
+ s1/cubemedium_lift
363
+ s10/gamecontroller_lift
364
+ s9/cylinderlarge_pass_1
365
+ s8/cubelarge_inspect_1
366
+ s1/scissors_pass_1
367
+ s1/train_play_1
368
+ s5/stanfordbunny_inspect_1
369
+ s4/spheresmall_lift
370
+ s4/watch_lift
371
+ s9/cylinderlarge_inspect_1
372
+ s2/hand_inspect_1
373
+ s8/toothpaste_squeeze_1
374
+ s8/toothpaste_squeeze_2
375
+ s8/mouse_use_1
376
+ s2/teapot_pass_1
377
+ s7/cup_pass_1
378
+ s9/spheremedium_inspect_1
379
+ s10/gamecontroller_play_1
380
+ s7/apple_lift
381
+ s6/eyeglasses_lift
382
+ s10/flute_pass_1
383
+ s9/airplane_fly_1
384
+ s5/stamp_pass_1
385
+ s5/alarmclock_lift
386
+ s10/hammer_use_3
387
+ s3/cylindersmall_inspect_1
388
+ s10/alarmclock_pass_1
389
+ s9/waterbottle_drink_1
390
+ s10/banana_pass_1
391
+ s6/watch_pass_1
392
+ s1/banana_peel_2
393
+ s8/pyramidmedium_pass_1
394
+ s7/cubemedium_lift
395
+ s6/cup_drink_2
396
+ s4/cubelarge_pass_1
397
+ s1/bowl_pass_1
398
+ s8/wineglass_drink_2
399
+ s8/phone_call_1
400
+ s8/torusmedium_lift
401
+ s4/piggybank_pass_1
402
+ s9/stamp_pass_1
403
+ s10/hammer_use_2
404
+ s9/wineglass_pass_1
405
+ s8/camera_browse_1
406
+ s7/airplane_pass_1
407
+ s6/cup_pass_1
408
+ s6/airplane_pass_1
409
+ s9/torussmall_pass_1
410
+ s8/teapot_pour_2
411
+ s4/mouse_use_2
412
+ s10/flashlight_on_2
413
+ s7/pyramidsmall_pass_1
414
+ s3/cup_pour_1
415
+ s1/piggybank_pass_1
416
+ s5/cubesmall_pass_1
417
+ s10/airplane_fly_1
418
+ s8/cylinderlarge_offhand_1
419
+ s6/gamecontroller_play_1
420
+ s3/stanfordbunny_pass_1
421
+ s7/stanfordbunny_lift
422
+ s9/banana_peel_2
423
+ s2/apple_lift
424
+ s10/waterbottle_pour_1
425
+ s1/alarmclock_pass_1
426
+ s1/hammer_use_3
427
+ s1/eyeglasses_clean_1
428
+ s6/phone_call_1
429
+ s7/banana_eat_1
430
+ s4/waterbottle_shake_1
431
+ s8/waterbottle_pass_1
432
+ s6/cylinderlarge_lift
433
+ s9/hand_shake_1
434
+ s6/camera_browse_1
435
+ s2/camera_browse_1
436
+ s4/phone_pass_1
437
+ s1/stamp_lift
438
+ s7/torussmall_inspect_1
439
+ s1/cylindersmall_offhand_1
440
+ s7/cylinderlarge_lift
441
+ s2/duck_inspect_1
442
+ s7/spheremedium_pass_1
443
+ s8/toruslarge_lift
444
+ s3/hammer_pass_1
445
+ s3/cup_offhand_1
446
+ s2/headphones_use_1
447
+ s2/train_play_1
448
+ s10/scissors_use_1
449
+ s5/cubesmall_lift
450
+ s10/pyramidsmall_inspect_1
451
+ s3/mug_drink_4
452
+ s1/cubemedium_offhand_1
453
+ s6/apple_eat_1
454
+ s1/doorknob_use_2
455
+ s1/cylinderlarge_lift
456
+ s2/hand_lift
457
+ s5/gamecontroller_lift
458
+ s3/waterbottle_lift
459
+ s3/apple_pass_1
460
+ s10/flute_play_1
461
+ s5/spherelarge_inspect_1
462
+ s7/doorknob_use_1
463
+ s8/torusmedium_pass_1
464
+ s4/bowl_pass_1
465
+ s10/headphones_lift
466
+ s6/pyramidlarge_pass_1
467
+ s2/mug_toast_1
468
+ s4/hammer_use_3
469
+ s1/stanfordbunny_inspect_1
470
+ s3/alarmclock_pass_1
471
+ s8/cylindermedium_inspect_1
472
+ s1/elephant_lift
473
+ s6/waterbottle_pass_1
474
+ s8/cubelarge_lift
475
+ s3/camera_takepicture_1
476
+ s5/torussmall_inspect_1
477
+ s2/airplane_fly_1
478
+ s7/hammer_use_1
479
+ s1/spherelarge_lift
480
+ s1/cubemedium_pass_1
481
+ s2/headphones_lift
482
+ s8/piggybank_lift
483
+ s1/apple_lift
484
+ s3/spherelarge_inspect_1
485
+ s8/banana_pass_1
486
+ s8/stanfordbunny_pass_1
487
+ s4/flute_play_1
488
+ s6/banana_eat_1
489
+ s1/scissors_offhand_1
490
+ s1/binoculars_lift
491
+ s10/cup_lift
492
+ s7/eyeglasses_lift
493
+ s3/stapler_lift
494
+ s7/bowl_drink_1
495
+ s1/cup_drink_1
496
+ s4/alarmclock_lift
497
+ s6/headphones_use_1
498
+ s4/apple_lift
499
+ s6/mouse_pass_1
500
+ s7/flashlight_lift
501
+ s5/torusmedium_inspect_1
502
+ s10/cup_pour_1
503
+ s5/toruslarge_lift
504
+ s10/cup_drink_1
505
+ s7/cylinderlarge_pass_1
506
+ s4/wineglass_drink_1
507
+ s1/stapler_staple_2
508
+ s10/mug_drink_1
509
+ s1/flute_pass_1
510
+ s8/cylinderlarge_lift
511
+ s10/piggybank_pass_1
512
+ s7/duck_inspect_1
513
+ s9/stamp_lift
514
+ s6/bowl_lift
515
+ s6/banana_pass_1
516
+ s9/cubemedium_lift
517
+ s7/flute_pass_1
518
+ s8/eyeglasses_wear_1
519
+ s9/flashlight_lift
520
+ s9/hammer_use_2
521
+ s1/cup_pour_1
522
+ s2/piggybank_lift
523
+ s8/pyramidlarge_inspect_1
524
+ s2/teapot_pour_2
525
+ s6/pyramidsmall_inspect_2
526
+ s1/toothpaste_pass_1
527
+ s10/banana_peel_2
528
+ s1/wineglass_pass_1
529
+ s5/pyramidlarge_inspect_1
530
+ s5/cubemedium_inspect_1
531
+ s7/knife_chop_1
532
+ s6/mug_toast_1
533
+ s5/mug_drink_1
534
+ s6/banana_peel_1
535
+ s7/cylindermedium_pass_1
536
+ s10/mug_drink_2
537
+ s3/elephant_offhand_1
538
+ s4/stapler_pass_1
539
+ s1/torussmall_lift
540
+ s1/duck_lift
541
+ s5/flute_play_1
542
+ s1/airplane_fly_1
543
+ s2/headphones_pass_1
544
+ s3/lightbulb_screw_1
545
+ s8/toothbrush_brush_1
546
+ s10/mouse_use_1
547
+ s5/flute_play_2
548
+ s3/waterbottle_pour_2
549
+ s10/airplane_lift
550
+ s7/eyeglasses_wear_1
551
+ s10/stamp_stamp_1
552
+ s8/cup_drink_1
553
+ s4/alarmclock_pass_1
554
+ s3/train_pass_1
555
+ s2/hammer_use_2
556
+ s1/pyramidsmall_inspect_1
557
+ s9/cubesmall_inspect_1
558
+ s7/camera_browse_1
559
+ s10/spheresmall_pass_1
560
+ s1/phone_offhand_1
561
+ s6/waterbottle_shake_1
562
+ s1/alarmclock_offhand_1
563
+ s8/spheresmall_inspect_1
564
+ s9/cylindermedium_inspect_1
565
+ s6/knife_lift
566
+ s6/mouse_lift
567
+ s1/flashlight_lift
568
+ s9/cubelarge_pass_1
569
+ s2/pyramidlarge_pass_1_Retake
570
+ s7/stanfordbunny_inspect_1
571
+ s9/gamecontroller_play_1
572
+ s4/hand_pass_1
573
+ s3/toothpaste_squeeze_1
574
+ s9/duck_lift
575
+ s5/flashlight_on_1
576
+ s5/cup_lift
577
+ s10/waterbottle_drink_1
578
+ s8/cubesmall_pass_1
579
+ s10/train_lift
580
+ s10/spheresmall_inspect_1
581
+ s10/cup_pass_1
582
+ s1/cubesmall_pass_1
583
+ s4/fryingpan_cook_3
584
+ s6/elephant_inspect_1
585
+ s9/stapler_lift
586
+ s1/pyramidmedium_offhand_1
587
+ s9/pyramidmedium_pass_1
588
+ s9/teapot_pour_2
589
+ s4/fryingpan_cook_1
590
+ s1/eyeglasses_wear_1
591
+ s1/gamecontroller_play_1
592
+ s3/binoculars_offhand_1
593
+ s6/waterbottle_pour_1
594
+ s3/cubesmall_inspect_1
595
+ s8/pyramidlarge_pass_1
596
+ s2/train_pass_1
597
+ s10/mouse_lift
598
+ s10/torusmedium_pass_1
599
+ s8/waterbottle_drink_1
600
+ s7/cubelarge_lift
601
+ s5/duck_pass_1
602
+ s1/gamecontroller_offhand_1
603
+ s2/camera_pass_1
604
+ s7/gamecontroller_play_1
605
+ s7/toothbrush_pass_1
606
+ s8/phone_pass_1
607
+ s3/bowl_drink_1
608
+ s6/toruslarge_pass_1
609
+ s5/spheresmall_inspect_1
610
+ s8/flashlight_pass_1
611
+ s1/flashlight_on_2
612
+ s8/gamecontroller_lift
613
+ s3/stanfordbunny_lift
614
+ s8/bowl_pass_1
615
+ s4/banana_pass_1
616
+ s6/stapler_lift
617
+ s2/teapot_lift
618
+ s3/camera_takepicture_2
619
+ s8/torussmall_pass_1
620
+ s9/camera_pass_1
621
+ s4/apple_eat_1
622
+ s1/watch_set_1
623
+ s5/stanfordbunny_lift
624
+ s6/spheremedium_inspect_1
625
+ s1/eyeglasses_lift
626
+ s10/headphones_use_1
627
+ s3/doorknob_use_1
628
+ s10/apple_pass_1
629
+ s9/flute_pass_1
630
+ s1/toruslarge_inspect_1
631
+ s5/duck_lift
632
+ s9/doorknob_use_2
633
+ s10/torussmall_pass_1
634
+ s2/doorknob_use_2
635
+ s1/pyramidlarge_lift
636
+ s6/stamp_lift
637
+ s3/elephant_pass_1
638
+ s7/headphones_use_1
639
+ s6/cylindermedium_pass_1
640
+ s6/stapler_staple_2
641
+ s6/piggybank_pass_1
642
+ s5/spheremedium_inspect_1
643
+ s1/cubemedium_inspect_1
644
+ s4/pyramidmedium_pass_1
645
+ s7/cylinderlarge_inspect_1
646
+ s1/cylindermedium_offhand_1
647
+ s6/toothpaste_squeeze_1_Retake
648
+ s9/stapler_pass_1
649
+ s8/flashlight_on_1
650
+ s5/cup_drink_2
651
+ s8/apple_lift
652
+ s8/airplane_pass_1
653
+ s6/wineglass_pass_1
654
+ s4/cylinderlarge_inspect_1
655
+ s4/phone_call_1
656
+ s5/stapler_pass_1
657
+ s8/camera_takepicture_2
658
+ s3/mouse_pass_1
659
+ s7/pyramidlarge_inspect_1
660
+ s10/stapler_staple_2
661
+ s9/teapot_pass_1
662
+ s5/gamecontroller_play_1
663
+ s4/train_lift
664
+ s8/doorknob_lift
665
+ s1/watch_offhand_1
666
+ s7/mug_drink_2
667
+ s4/flashlight_pass_1
668
+ s1/mug_offhand_1
669
+ s4/camera_takepicture_3
670
+ s8/train_pass_1
671
+ s7/train_pass_1
672
+ s8/gamecontroller_play_1
673
+ s1/scissors_use_2
674
+ s9/piggybank_pass_1
675
+ s6/flute_lift
676
+ s1/banana_peel_1
677
+ s6/bowl_pass_1
678
+ s1/mouse_pass_1
679
+ s6/cubesmall_lift
680
+ s5/airplane_lift
681
+ s3/scissors_pass_1
682
+ s2/cylinderlarge_pass_1
683
+ s6/watch_set_1
684
+ s10/stamp_pass_1
685
+ s9/banana_lift
686
+ s3/toruslarge_lift
687
+ s10/hammer_use_1
688
+ s10/cubesmall_lift
689
+ s6/teapot_pass_1
690
+ s3/pyramidlarge_offhand_1
691
+ s8/cylindersmall_inspect_1
692
+ s3/cylinderlarge_inspect_1
693
+ s7/teapot_pass_1
694
+ s9/mouse_pass_1
695
+ s3/mug_drink_1
696
+ s2/spheremedium_pass_1
697
+ s9/waterbottle_pass_1
698
+ s2/flute_pass_1
699
+ s9/lightbulb_screw_1
700
+ s8/cubemedium_pass_1
701
+ s7/flashlight_pass_1
702
+ s9/eyeglasses_lift
703
+ s7/cubemedium_inspect_1
704
+ s5/waterbottle_lift
705
+ s6/cup_drink_1
706
+ s1/pyramidlarge_inspect_1
707
+ s3/airplane_fly_1
708
+ s10/pyramidmedium_lift
709
+ s10/bowl_lift
710
+ s2/cubelarge_pass_1
711
+ s7/stapler_staple_1
712
+ s8/flashlight_on_2
713
+ s4/cylindersmall_lift
714
+ s10/stanfordbunny_pass_1
715
+ s6/cubemedium_lift
716
+ s10/stapler_pass_1
717
+ s5/apple_pass_1
718
+ s7/doorknob_use_2
719
+ s4/watch_pass_1
720
+ s1/elephant_pass_1
721
+ s1/watch_set_2
722
+ s2/cylinderlarge_inspect_1
723
+ s2/pyramidlarge_inspect_1
724
+ s9/eyeglasses_wear_1
725
+ s4/phone_lift
726
+ s1/spheremedium_lift
727
+ s3/airplane_offhand_1
728
+ s6/spheresmall_pass_1
729
+ s9/piggybank_lift_Retake
730
+ s8/cylinderlarge_inspect_1
731
+ s5/wineglass_lift
732
+ s3/train_play_1
733
+ s10/knife_chop_1
734
+ s9/wineglass_drink_2
735
+ s6/hammer_lift
736
+ s9/cylindersmall_pass_1
737
+ s4/bowl_drink_1
738
+ s1/mug_toast_1
739
+ s1/pyramidmedium_inspect_1
740
+ s9/stamp_stamp_1
741
+ s10/toothpaste_squeeze_1
742
+ s1/spheresmall_lift
743
+ s3/doorknob_use_2
744
+ s7/fryingpan_cook_1
745
+ s5/teapot_pass_1
746
+ s8/phone_lift
747
+ s2/cubesmall_inspect_1
748
+ s1/toruslarge_pass_1
749
+ s6/headphones_pass_1
750
+ s8/cylinderlarge_pass_1
751
+ s4/toothpaste_pass_1
752
+ s4/camera_takepicture_2
753
+ s3/camera_offhand_1
754
+ s8/cup_offhand_1
755
+ s9/spherelarge_pass_1
756
+ s10/stanfordbunny_lift
757
+ s1/spheremedium_offhand_1
758
+ s4/torusmedium_lift
759
+ s9/cylinderlarge_lift
760
+ s6/hammer_use_1
761
+ s4/wineglass_toast_1
762
+ s2/cylinderlarge_lift
763
+ s6/cubesmall_inspect_1
764
+ s1/camera_takepicture_2
765
+ s3/phone_pass_1
766
+ s1/hand_inspect_1
767
+ s8/bowl_drink_2
768
+ s1/stapler_offhand_1
769
+ s1/piggybank_use_1
770
+ s1/apple_pass_1
771
+ s9/wineglass_drink_1
772
+ s10/mug_lift
773
+ s10/cylinderlarge_lift
774
+ s2/stamp_stamp_1
775
+ s10/stanfordbunny_inspect_1
776
+ s4/cubelarge_lift
777
+ s8/alarmclock_see_1
778
+ s5/cubelarge_lift
779
+ s3/eyeglasses_pass_1
780
+ s1/pyramidsmall_pass_1
781
+ s2/cylindermedium_pass_1
782
+ s8/fryingpan_offhand_1
783
+ s1/mug_drink_3
784
+ s2/piggybank_pass_1_Retake
785
+ s8/headphones_lift
786
+ s9/alarmclock_see_1
787
+ s5/toruslarge_pass_1
788
+ s1/stapler_staple_1
789
+ s1/stanfordbunny_offhand_1
790
+ s3/airplane_lift
791
+ s8/cubemedium_inspect_1
792
+ s9/stapler_staple_2
793
+ s3/fryingpan_lift
794
+ s10/spherelarge_inspect_1
795
+ s7/pyramidlarge_pass_1
796
+ s5/mouse_lift
797
+ s8/watch_set_2
798
+ s3/wineglass_drink_1
799
+ s6/spheremedium_lift
800
+ s2/stapler_staple_2
801
+ s9/airplane_pass_1
802
+ s6/duck_inspect_1
803
+ s10/lightbulb_pass_1
804
+ s8/stapler_pass_1
805
+ s6/camera_takepicture_3
806
+ s10/camera_takepicture_2
807
+ s10/mouse_pass_1
808
+ s3/cup_drink_2
809
+ s5/alarmclock_see_1
810
+ s10/elephant_inspect_1
811
+ s6/apple_pass_1
812
+ s3/gamecontroller_pass_1
813
+ s10/cubemedium_pass_1
814
+ s5/wineglass_pass_1
815
+ s5/waterbottle_pass_1
816
+ s10/teapot_pour_2
817
+ s6/train_pass_1
818
+ s7/hammer_use_2
819
+ s8/apple_pass_1
820
+ s6/cup_pour_1
821
+ s7/wineglass_lift
822
+ s9/toothbrush_pass_1
823
+ s6/doorknob_lift
824
+ s9/banana_eat_1
825
+ s4/cylinderlarge_pass_1
826
+ s10/cubelarge_lift
827
+ s1/camera_lift
828
+ s1/stanfordbunny_pass_1
829
+ s7/watch_pass_1
830
+ s9/cup_lift
831
+ s7/apple_pass_1
832
+ s3/piggybank_pass_1
833
+ s1/cylindersmall_inspect_1
834
+ s4/cup_drink_2
835
+ s8/spheremedium_pass_1
836
+ s6/waterbottle_open_1
837
+ s3/watch_set_1
838
+ s8/cubesmall_inspect_1
839
+ s3/duck_inspect_1
840
+ s7/cylindermedium_inspect_1
841
+ s9/cylindersmall_inspect_1
842
+ s1/spherelarge_inspect_1
843
+ s1/spheresmall_offhand_1
844
+ s8/cylindermedium_lift
845
+ s1/bowl_drink_2
846
+ s5/train_pass_1
847
+ s4/alarmclock_see_1
848
+ s8/flashlight_offhand_1
849
+ s2/cup_drink_1
850
+ s8/duck_lift
851
+ s6/cubelarge_pass_1
852
+ s6/flute_pass_1
853
+ s2/toruslarge_inspect_1
854
+ s4/camera_pass_1
855
+ s7/train_play_1
856
+ s9/mouse_use_1
857
+ s5/cylindermedium_pass_1
858
+ s3/pyramidsmall_pass_1
859
+ s3/eyeglasses_offhand_1
860
+ s8/wineglass_toast_1
861
+ s7/eyeglasses_clean_1
862
+ s5/toothbrush_brush_1
863
+ s10/toothpaste_squeeze_2
864
+ s1/flute_lift
865
+ s3/toothbrush_pass_1
866
+ s1/bowl_lift
867
+ s9/bowl_drink_2
868
+ s7/banana_peel_2
869
+ s8/doorknob_use_2
870
+ s8/torussmall_inspect_1
871
+ s1/camera_pass_1
872
+ s8/hand_inspect_1
873
+ s1/watch_pass_1
874
+ s10/watch_pass_1
875
+ s3/toruslarge_inspect_1
876
+ s9/torusmedium_inspect_1
877
+ s9/cylindermedium_lift
878
+ s10/teapot_pass_1
879
+ s6/wineglass_drink_2
880
+ s7/headphones_lift
881
+ s10/stapler_staple_1
882
+ s2/pyramidmedium_inspect_1
883
+ s3/flashlight_pass_1
884
+ s1/torusmedium_pass_1
885
+ s6/spherelarge_pass_1
886
+ s2/phone_pass_1
887
+ s2/pyramidmedium_pass_1
888
+ s9/stanfordbunny_pass_1
889
+ s8/pyramidmedium_inspect_1
890
+ s1/toothbrush_lift
891
+ s7/hammer_pass_1
892
+ s8/hand_lift
893
+ s8/camera_takepicture_3
894
+ s8/stamp_lift
895
+ s8/torusmedium_inspect_1
896
+ s7/phone_call_1
897
+ s3/pyramidlarge_inspect_1
898
+ s6/torussmall_pass_1
899
+ s3/mouse_use_1
900
+ s6/toruslarge_inspect_1
901
+ s3/elephant_lift
902
+ s1/wineglass_drink_2
903
+ s7/lightbulb_pass_1
904
+ s8/hammer_pass_1
905
+ s6/pyramidmedium_inspect_1
906
+ s8/gamecontroller_pass_1
907
+ s1/waterbottle_pour_1
908
+ s9/toothpaste_lift
909
+ s9/spherelarge_inspect_1
910
+ s8/teapot_pour_1
911
+ s4/spheremedium_inspect_1
912
+ s6/stamp_stamp_1
913
+ s9/hammer_use_1
914
+ s1/torusmedium_offhand_1
915
+ s1/alarmclock_see_1
916
+ s6/phone_pass_1
917
+ s5/waterbottle_open_1
918
+ s4/cubemedium_pass_1
919
+ s6/apple_lift
920
+ s1/headphones_pass_1
921
+ s8/hammer_use_3
922
+ s3/piggybank_use_1
923
+ s3/wineglass_toast_1
924
+ s6/spheresmall_lift
925
+ s1/camera_takepicture_3_Retake
926
+ s1/cylindermedium_lift
927
+ s6/teapot_pour_1
928
+ s4/train_pass_1
929
+ s7/toruslarge_inspect_1_Retake
930
+ s8/cubelarge_pass_1
931
+ s3/alarmclock_offhand_1
932
+ s4/hand_lift
933
+ s8/mug_drink_2
934
+ s5/cup_pass_1
935
+ s3/stapler_staple_1
936
+ s10/phone_lift
937
+ s6/torusmedium_inspect_1
938
+ s4/cup_drink_1
939
+ s2/train_lift
940
+ s8/stapler_staple_1
941
+ s2/apple_eat_1
942
+ s6/torusmedium_pass_1
943
+ s4/watch_set_1
944
+ s3/camera_takepicture_3
945
+ s9/spheresmall_inspect_1
946
+ s4/cubesmall_pass_1
947
+ s6/alarmclock_lift
948
+ s2/toruslarge_pass_1
949
+ s9/pyramidmedium_inspect_1
950
+ s3/stanfordbunny_inspect_1
951
+ s6/scissors_use_2
952
+ s1/stamp_stamp_1
953
+ s3/torusmedium_pass_1
954
+ s6/eyeglasses_pass_1
955
+ s7/cubelarge_inspect_1
956
+ s3/stamp_lift
957
+ s8/mouse_pass_1
958
+ s7/elephant_inspect_1
959
+ s8/camera_pass_1
960
+ s10/bowl_pass_1
961
+ s1/waterbottle_lift
962
+ s8/bowl_lift
963
+ s3/phone_call_1
964
+ s8/phone_offhand_1
965
+ s3/eyeglasses_wear_1
966
+ s6/bowl_drink_1_Retake
967
+ s9/torussmall_inspect_1
968
+ s1/piggybank_offhand_1
969
+ s1/mug_drink_2
970
+ s4/airplane_fly_1
971
+ s8/cubelarge_offhand_1
972
+ s10/camera_pass_1
973
+ s4/stanfordbunny_inspect_1
974
+ s6/pyramidlarge_inspect_1
975
+ s10/wineglass_drink_2
976
+ s1/duck_inspect_1
977
+ s4/duck_inspect_1
978
+ s1/mouse_use_1
979
+ s5/eyeglasses_clean_2
980
+ s10/camera_takepicture_1
981
+ s8/fryingpan_pass_1
982
+ s10/cubelarge_inspect_1
983
+ s7/hammer_use_3
984
+ s5/spherelarge_lift
985
+ s1/fryingpan_cook_1
986
+ s5/cubemedium_pass_1
987
+ s3/flute_play_1
988
+ s9/headphones_use_1
989
+ s1/cylinderlarge_inspect_1
990
+ s7/hammer_lift
991
+ s7/wineglass_toast_1
992
+ s4/pyramidmedium_inspect_1
993
+ s10/bowl_drink_1_Retake
994
+ s9/piggybank_use_1
995
+ s1/cubesmall_lift
996
+ s1/fryingpan_cook_2
997
+ s1/apple_eat_1
998
+ s9/watch_set_2
999
+ s9/bowl_lift
1000
+ s10/hand_lift
1001
+ s10/cylindersmall_inspect_1
1002
+ s1/piggybank_lift
1003
+ s2/toothpaste_squeeze_1
1004
+ s6/binoculars_see_1
1005
+ s6/spherelarge_inspect_1
1006
+ s8/alarmclock_lift
1007
+ s1/cylindersmall_pass_1
1008
+ s6/pyramidlarge_lift
1009
+ s4/binoculars_pass_1
1010
+ s10/bowl_drink_2
1011
+ s5/flute_pass_1
1012
+ s10/torusmedium_lift
1013
+ s1/duck_pass_1
1014
+ s2/cubelarge_inspect_1
1015
+ s8/watch_pass_1
1016
+ s1/waterbottle_open_1
1017
+ s9/flute_play_1
1018
+ s9/airplane_lift
1019
+ s1/mug_drink_4
1020
+ s4/wineglass_pass_1
1021
+ s6/camera_pass_1
1022
+ s4/duck_pass_1
1023
+ s10/cylindersmall_pass_1
1024
+ s10/alarmclock_lift_Retake
1025
+ s4/toruslarge_pass_1
1026
+ s10/wineglass_pass_1
1027
+ s5/cylindermedium_lift
1028
+ s2/pyramidlarge_lift
1029
+ s3/flute_offhand_1
1030
+ s8/camera_offhand_1
1031
+ s2/elephant_inspect_1
1032
+ s5/mug_pass_1
1033
+ s3/hammer_use_1
1034
+ s9/mug_drink_2
1035
+ s5/wineglass_drink_2
1036
+ s5/piggybank_lift
1037
+ s9/waterbottle_open_1
1038
+ s7/alarmclock_lift
1039
+ s1/eyeglasses_offhand_1
1040
+ s1/flute_play_1
1041
+ s8/wineglass_lift
1042
+ s7/waterbottle_shake_1
1043
+ s1/cubelarge_inspect_1
1044
+ s1/hammer_use_2
1045
+ s5/doorknob_lift
1046
+ s1/spheremedium_inspect_1
1047
+ s9/spheremedium_pass_1
1048
+ s6/knife_peel_1
1049
+ s4/fryingpan_pass_1
1050
+ s3/binoculars_lift
1051
+ s7/toruslarge_lift
1052
+ s4/piggybank_lift
1053
+ s3/cubelarge_pass_1
1054
+ s2/banana_peel_1
1055
+ s1/alarmclock_lift
1056
+ s5/train_lift
1057
+ s1/cup_pass_1
1058
+ s1/waterbottle_drink_1
1059
+ s3/bowl_drink_2
1060
+ s5/cylinderlarge_inspect_1
1061
+ s4/headphones_use_1
1062
+ s3/hand_pass_1
1063
+ s3/flashlight_on_1
1064
+ s3/toruslarge_pass_1
1065
+ s9/flashlight_on_1
1066
+ s8/mug_offhand_1
1067
+ s8/cubesmall_lift
1068
+ s10/airplane_pass_1
text2motion/data/GRAB/grab_val.txt ADDED
@@ -0,0 +1,66 @@
1
+ s4/stanfordbunny_lift
2
+ s3/cubelarge_offhand_1
3
+ s2/stanfordbunny_pass_1
4
+ s5/bowl_drink_1
5
+ s3/knife_lift
6
+ s8/fryingpan_cook_3
7
+ s7/stamp_pass_1
8
+ s10/hammer_pass_1
9
+ s1/phone_pass_1
10
+ s6/teapot_pour_2
11
+ s2/mug_drink_2
12
+ s8/spheresmall_lift
13
+ s7/banana_pass_1
14
+ s4/cup_lift
15
+ s5/train_play_1
16
+ s6/airplane_lift
17
+ s3/spheremedium_inspect_1
18
+ s1/hand_offhand_1
19
+ s3/camera_pass_1
20
+ s7/bowl_pass_1
21
+ s6/mug_pass_1
22
+ s7/binoculars_lift
23
+ s8/hammer_lift
24
+ s1/waterbottle_offhand_1
25
+ s3/cylindermedium_pass_1
26
+ s7/waterbottle_drink_1
27
+ s7/spheremedium_inspect_1
28
+ s10/banana_eat_1
29
+ s2/spherelarge_lift
30
+ s1/duck_offhand_2
31
+ s6/doorknob_use_2
32
+ s6/cylinderlarge_pass_1
33
+ s9/camera_lift
34
+ s1/eyeglasses_pass_1
35
+ s3/cylinderlarge_pass_1
36
+ s10/elephant_pass_1
37
+ s4/scissors_pass_1
38
+ s3/cubemedium_inspect_1
39
+ s6/stamp_pass_1
40
+ s9/mug_pass_1
41
+ s10/phone_pass_1
42
+ s7/mouse_pass_1
43
+ s3/scissors_use_1
44
+ s6/airplane_fly_1
45
+ s8/lightbulb_pass_1
46
+ s2/toruslarge_lift
47
+ s8/stapler_lift
48
+ s5/piggybank_use_1
49
+ s6/banana_peel_2
50
+ s10/doorknob_lift
51
+ s8/stamp_pass_1
52
+ s5/hand_pass_1
53
+ s5/bowl_pass_1
54
+ s3/pyramidlarge_pass_1
55
+ s6/camera_lift
56
+ s9/duck_inspect_1
57
+ s3/cubelarge_lift
58
+ s4/watch_set_2
59
+ s5/cylinderlarge_lift
60
+ s2/toothpaste_lift
61
+ s8/knife_chop_1
62
+ s1/flashlight_on_1
63
+ s10/mug_pass_1
64
+ s1/gamecontroller_lift
65
+ s1/airplane_pass_1
66
+ s7/cylindersmall_pass_1
text2motion/data/GRAB/test.txt ADDED
@@ -0,0 +1 @@
1
+ s1/airplane_pass_1
text2motion/data/GRAB/train.txt ADDED
@@ -0,0 +1,1068 @@
1
+ s8/mug_toast_1
2
+ s3/airplane_pass_1
3
+ s1/lightbulb_pass_1
4
+ s10/camera_takepicture_3
5
+ s1/wineglass_toast_1
6
+ s1/phone_lift
7
+ s4/binoculars_see_1
8
+ s8/teapot_pass_1
9
+ s9/cubemedium_pass_1
10
+ s8/elephant_lift
11
+ s2/gamecontroller_play_1
12
+ s6/duck_lift
13
+ s4/stapler_staple_1
14
+ s8/mug_drink_1
15
+ s8/train_lift
16
+ s1/cubesmall_offhand_1
17
+ s6/cylindermedium_inspect_1
18
+ s8/cylindersmall_lift
19
+ s3/waterbottle_drink_1
20
+ s1/wineglass_offhand_1
21
+ s5/scissors_pass_1
22
+ s1/cup_lift
23
+ s8/binoculars_pass_1
24
+ s9/hand_pass_1
25
+ s7/doorknob_lift
26
+ s8/spherelarge_pass_1
27
+ s10/alarmclock_see_1
28
+ s5/waterbottle_shake_1
29
+ s5/mouse_use_1
30
+ s2/cubesmall_lift
31
+ s4/cubelarge_inspect_1
32
+ s8/apple_eat_1
33
+ s3/cylinderlarge_lift_Retake
34
+ s7/mug_drink_1
35
+ s8/knife_pass_1
36
+ s1/flute_offhand_1
37
+ s7/stapler_pass_1
38
+ s6/flute_play_1
39
+ s6/stapler_staple_1
40
+ s3/gamecontroller_lift
41
+ s9/spheresmall_pass_1
42
+ s9/camera_takepicture_1
43
+ s3/apple_offhand_1
44
+ s4/binoculars_lift
45
+ s8/fryingpan_lift
46
+ s7/camera_pass_1
47
+ s10/hand_shake_1
48
+ s3/stamp_stamp_1
49
+ s6/watch_set_2
50
+ s6/wineglass_drink_1
51
+ s1/cubelarge_pass_1
52
+ s6/hand_inspect_1
53
+ s2/torussmall_lift
54
+ s1/torussmall_pass_1
55
+ s3/mug_offhand_1
56
+ s1/cylinderlarge_offhand_1
57
+ s2/cylindermedium_inspect_1
58
+ s4/stamp_stamp_1
59
+ s4/mouse_use_1
60
+ s8/spheremedium_lift
61
+ s8/headphones_pass_1
62
+ s3/gamecontroller_play_1
63
+ s2/stanfordbunny_lift
64
+ s10/fryingpan_cook_2
65
+ s3/wineglass_pass_1
66
+ s1/pyramidlarge_offhand_1
67
+ s6/toothpaste_pass_1
68
+ s1/mug_lift
69
+ s10/waterbottle_pass_1
70
+ s1/flashlight_pass_1
71
+ s7/banana_lift
72
+ s6/waterbottle_drink_1
73
+ s1/stamp_offhand_1
74
+ s1/camera_takepicture_1
75
+ s10/cubesmall_inspect_1
76
+ s7/alarmclock_see_1
77
+ s7/gamecontroller_pass_1
78
+ s8/banana_peel_1
79
+ s2/camera_takepicture_2
80
+ s2/cubemedium_lift
81
+ s10/train_pass_1
82
+ s10/cylindermedium_pass_1
83
+ s6/spherelarge_lift
84
+ s8/duck_inspect_1
85
+ s3/mug_drink_3
86
+ s6/hand_pass_1
87
+ s3/headphones_use_1
88
+ s5/wineglass_drink_1
89
+ s10/doorknob_use_2
90
+ s6/spheremedium_pass_1
91
+ s5/wineglass_toast_1
92
+ s5/pyramidmedium_inspect_1
93
+ s10/spheremedium_inspect_1
94
+ s5/bowl_drink_2
95
+ s10/pyramidmedium_pass_1
96
+ s9/watch_set_1
97
+ s7/pyramidmedium_pass_1
98
+ s5/stanfordbunny_pass_1
99
+ s8/stanfordbunny_lift
100
+ s7/waterbottle_open_1_Retake
101
+ s9/pyramidmedium_lift_Retake
102
+ s5/cubelarge_pass_1
103
+ s2/stapler_staple_1
104
+ s2/knife_pass_1
105
+ s5/duck_inspect_1
106
+ s8/wineglass_pass_1
107
+ s5/spherelarge_pass_1
108
+ s2/elephant_pass_1
109
+ s1/spherelarge_offhand_1
110
+ s4/doorknob_use_1
111
+ s4/stanfordbunny_pass_1
112
+ s5/piggybank_pass_1
113
+ s5/banana_peel_2
114
+ s10/spheremedium_lift
115
+ s9/gamecontroller_pass_1
116
+ s6/alarmclock_see_1
117
+ s9/toruslarge_inspect_1
118
+ s1/waterbottle_pass_1
119
+ s7/piggybank_use_1
120
+ s8/toruslarge_pass_1
121
+ s1/hammer_use_1
122
+ s6/stapler_offhand_1_Retake
123
+ s8/toothpaste_pass_1
124
+ s10/knife_peel_1
125
+ s4/cylindersmall_pass_1
126
+ s10/cylinderlarge_inspect_1
127
+ s7/flashlight_on_2
128
+ s2/cubelarge_lift
129
+ s8/fryingpan_cook_1
130
+ s6/wineglass_lift
131
+ s1/stapler_lift
132
+ s4/cup_pass_1
133
+ s10/piggybank_use_1
134
+ s10/binoculars_see_1
135
+ s4/spheremedium_pass_1
136
+ s9/bowl_drink_1
137
+ s4/piggybank_use_1
138
+ s5/cubesmall_inspect_1
139
+ s7/stamp_stamp_1
140
+ s3/scissors_use_2
141
+ s10/torusmedium_inspect_1
142
+ s7/hand_inspect_1
143
+ s5/cylindermedium_inspect_1
144
+ s8/banana_eat_1
145
+ s7/binoculars_see_1
146
+ s4/spheresmall_pass_1
147
+ s4/cylinderlarge_lift
148
+ s9/waterbottle_shake_1
149
+ s4/hand_inspect_1
150
+ s7/spherelarge_lift
151
+ s2/flashlight_on_1
152
+ s10/duck_pass_1
153
+ s9/bowl_pass_1
154
+ s1/mouse_lift
155
+ s9/watch_pass_1
156
+ s2/phone_call_1
157
+ s6/cylinderlarge_inspect_1
158
+ s3/wineglass_drink_2
159
+ s10/toothpaste_pass_1
160
+ s10/fryingpan_cook_1
161
+ s8/watch_lift
162
+ s9/pyramidlarge_inspect_1
163
+ s2/mouse_use_1
164
+ s6/pyramidmedium_pass_1
165
+ s4/mouse_pass_1
166
+ s6/wineglass_toast_1
167
+ s1/watch_lift
168
+ s9/hand_inspect_1
169
+ s1/airplane_lift
170
+ s9/mouse_lift
171
+ s6/eyeglasses_clean_1
172
+ s4/waterbottle_drink_1
173
+ s4/torussmall_lift
174
+ s6/binoculars_lift
175
+ s8/bowl_offhand_1
176
+ s2/eyeglasses_pass_1
177
+ s8/fryingpan_cook_2
178
+ s8/spherelarge_lift
179
+ s1/cylindermedium_pass_1
180
+ s1/banana_eat_1
181
+ s10/gamecontroller_pass_1
182
+ s9/camera_browse_1
183
+ s6/eyeglasses_wear_1
184
+ s1/stanfordbunny_lift
185
+ s6/cylindersmall_inspect_1
186
+ s6/elephant_pass_1
187
+ s3/torusmedium_inspect_1
188
+ s8/spheresmall_pass_1
189
+ s6/apple_offhand_1_Retake
190
+ s6/stanfordbunny_pass_1
191
+ s6/mug_drink_2
192
+ s8/mug_lift
193
+ s1/binoculars_see_1
194
+ s1/torussmall_offhand_1
195
+ s9/duck_pass_1
196
+ s8/cubemedium_offhand_1
197
+ s8/spherelarge_inspect_1
198
+ s7/wineglass_pass_1
199
+ s9/toothpaste_squeeze_1
200
+ s10/scissors_pass_1
201
+ s10/torussmall_inspect_1
202
+ s4/wineglass_lift
203
+ s5/binoculars_pass_1
204
+ s7/stanfordbunny_pass_1
205
+ s1/mug_drink_1
206
+ s1/stamp_pass_1
207
+ s8/piggybank_use_1
208
+ s10/spheremedium_pass_1
209
+ s9/torusmedium_pass_1
210
+ s3/mouse_lift
211
+ s1/cylindermedium_inspect_1
212
+ s8/cylindermedium_pass_1
213
+ s7/cubesmall_pass_1
214
+ s6/cylindermedium_lift
215
+ s7/cup_pour_1
216
+ s9/banana_pass_1
217
+ s1/camera_browse_1
218
+ s3/eyeglasses_clean_1
219
+ s1/doorknob_use_fun_1
220
+ s1/banana_lift
221
+ s6/cylindersmall_pass_1
222
+ s5/elephant_lift
223
+ s5/elephant_inspect_1
224
+ s9/waterbottle_lift
225
+ s6/piggybank_lift_Retake
226
+ s5/camera_takepicture_3
227
+ s4/stapler_staple_2
228
+ s1/cylindersmall_lift
229
+ s1/airplane_offhand_1
230
+ s9/apple_pass_1
231
+ s9/cylindermedium_pass_1
232
+ s9/hammer_use_3
233
+ s7/cubemedium_pass_1
234
+ s6/cubemedium_inspect_1
235
+ s2/mug_lift
236
+ s6/phone_lift
237
+ s4/spherelarge_inspect_1
238
+ s1/elephant_inspect_1
239
+ s4/headphones_pass_1
240
+ s8/binoculars_see_1
241
+ s7/pyramidmedium_inspect_1
242
+ s1/toothpaste_lift
243
+ s4/stamp_lift
244
+ s8/eyeglasses_pass_1
245
+ s7/hand_pass_1
246
+ s10/spherelarge_lift
247
+ s2/stapler_pass_1
248
+ s9/banana_peel_1
249
+ s2/mug_pass_1
250
+ s6/stapler_pass_1
251
+ s10/wineglass_drink_1
252
+ s10/lightbulb_screw_1
253
+ s7/spheresmall_pass_1
254
+ s2/stanfordbunny_inspect_1
255
+ s7/piggybank_pass_1
256
+ s8/mug_pass_1
257
+ s10/torussmall_lift
258
+ s8/cup_pour_1
259
+ s5/pyramidlarge_pass_1
260
+ s1/binoculars_pass_1
261
+ s5/hammer_pass_1
262
+ s1/flashlight_offhand_1
263
+ s7/elephant_pass_1
264
+ s6/hand_lift
265
+ s1/cubelarge_offhand_1
266
+ s9/elephant_lift
267
+ s8/banana_peel_2
268
+ s8/knife_peel_1
269
+ s1/teapot_pass_1
270
+ s10/headphones_pass_1
271
+ s2/toothbrush_lift
272
+ s9/knife_pass_1
273
+ s2/cubemedium_inspect_1
274
+ s8/teapot_lift
275
+ s3/pyramidmedium_pass_1
276
+ s2/cylindermedium_lift
277
+ s4/mug_pass_1
278
+ s10/stamp_lift
279
+ s7/airplane_lift_Retake
280
+ s10/toruslarge_pass_1
281
+ s6/fryingpan_pass_1
282
+ s4/train_play_1
283
+ s5/pyramidmedium_pass_1
284
+ s7/binoculars_pass_1
285
+ s1/phone_call_1
286
+ s2/spheresmall_inspect_1
287
+ s3/apple_lift
288
+ s6/hammer_use_3
289
+ s8/toothbrush_pass_1
290
+ s7/spheresmall_inspect_1
291
+ s10/flashlight_on_1
292
+ s9/elephant_inspect_1
293
+ s4/elephant_pass_1
294
+ s1/bowl_drink_1
295
+ s6/cubemedium_pass_1
296
+ s7/eyeglasses_pass_1
297
+ s8/airplane_fly_1
298
+ s3/alarmclock_lift
299
+ s4/doorknob_lift
300
+ s4/fryingpan_cook_2
301
+ s4/toothbrush_pass_1
302
+ s5/cylinderlarge_pass_1
303
+ s2/banana_eat_1
304
+ s5/stamp_stamp_1
305
+ s1/knife_lift
306
+ s10/hand_pass_1
307
+ s4/bowl_drink_2
308
+ s8/cup_pass_1
309
+ s5/hammer_use_1
310
+ s3/train_lift
311
+ s2/cup_pass_1
312
+ s7/torussmall_pass_1
313
+ s8/hand_pass_1
314
+ s10/cup_drink_2
315
+ s3/toothpaste_lift
316
+ s2/fryingpan_pass_1
317
+ s3/cylindersmall_pass_1
318
+ s5/spheremedium_lift
319
+ s10/apple_eat_1
320
+ s4/teapot_pour_2
321
+ s4/torussmall_pass_1
322
+ s5/toothpaste_pass_1
323
+ s7/headphones_pass_1
324
+ s5/camera_pass_1
325
+ s9/elephant_pass_1
326
+ s6/toothpaste_lift
327
+ s8/flute_pass_1
328
+ s4/spheremedium_lift
329
+ s3/binoculars_see_1
330
+ s10/camera_browse_1
331
+ s1/banana_pass_1
332
+ s1/toruslarge_lift
333
+ s6/cubelarge_inspect_1
334
+ s1/pyramidlarge_pass_1
335
+ s2/teapot_pour_1
336
+ s3/waterbottle_pass_1
337
+ s2/hammer_lift
338
+ s9/apple_lift
339
+ s9/waterbottle_pour_1
340
+ s4/cubemedium_lift
341
+ s9/phone_pass_1
342
+ s5/cubemedium_lift
343
+ s4/gamecontroller_pass_1
344
+ s9/cubemedium_inspect_1
345
+ s3/cup_drink_1
346
+ s4/stapler_lift
347
+ s2/apple_pass_1
348
+ s5/mug_toast_1
349
+ s5/cylindersmall_inspect_1
350
+ s3/cup_lift
351
+ s1/hammer_pass_1
352
+ s10/hammer_lift
353
+ s5/elephant_pass_1
354
+ s1/spheresmall_inspect_1
355
+ s3/toothbrush_brush_1
356
+ s7/apple_eat_1
357
+ s7/pyramidlarge_lift
358
+ s7/wineglass_drink_1
359
+ s7/mug_lift
360
+ s6/cubesmall_offhand_1
361
+ s2/camera_lift
362
+ s1/cubemedium_lift
363
+ s10/gamecontroller_lift
364
+ s9/cylinderlarge_pass_1
365
+ s8/cubelarge_inspect_1
366
+ s1/scissors_pass_1
367
+ s1/train_play_1
368
+ s5/stanfordbunny_inspect_1
369
+ s4/spheresmall_lift
370
+ s4/watch_lift
371
+ s9/cylinderlarge_inspect_1
372
+ s2/hand_inspect_1
373
+ s8/toothpaste_squeeze_1
374
+ s8/toothpaste_squeeze_2
375
+ s8/mouse_use_1
376
+ s2/teapot_pass_1
377
+ s7/cup_pass_1
378
+ s9/spheremedium_inspect_1
379
+ s10/gamecontroller_play_1
380
+ s7/apple_lift
381
+ s6/eyeglasses_lift
382
+ s10/flute_pass_1
383
+ s9/airplane_fly_1
384
+ s5/stamp_pass_1
385
+ s5/alarmclock_lift
386
+ s10/hammer_use_3
387
+ s3/cylindersmall_inspect_1
388
+ s10/alarmclock_pass_1
389
+ s9/waterbottle_drink_1
390
+ s10/banana_pass_1
391
+ s6/watch_pass_1
392
+ s1/banana_peel_2
393
+ s8/pyramidmedium_pass_1
394
+ s7/cubemedium_lift
395
+ s6/cup_drink_2
396
+ s4/cubelarge_pass_1
397
+ s1/bowl_pass_1
398
+ s8/wineglass_drink_2
399
+ s8/phone_call_1
400
+ s8/torusmedium_lift
401
+ s4/piggybank_pass_1
402
+ s9/stamp_pass_1
403
+ s10/hammer_use_2
404
+ s9/wineglass_pass_1
405
+ s8/camera_browse_1
406
+ s7/airplane_pass_1
407
+ s6/cup_pass_1
408
+ s6/airplane_pass_1
409
+ s9/torussmall_pass_1
410
+ s8/teapot_pour_2
411
+ s4/mouse_use_2
412
+ s10/flashlight_on_2
413
+ s7/pyramidsmall_pass_1
414
+ s3/cup_pour_1
415
+ s1/piggybank_pass_1
416
+ s5/cubesmall_pass_1
417
+ s10/airplane_fly_1
418
+ s8/cylinderlarge_offhand_1
419
+ s6/gamecontroller_play_1
420
+ s3/stanfordbunny_pass_1
421
+ s7/stanfordbunny_lift
422
+ s9/banana_peel_2
423
+ s2/apple_lift
424
+ s10/waterbottle_pour_1
425
+ s1/alarmclock_pass_1
426
+ s1/hammer_use_3
427
+ s1/eyeglasses_clean_1
428
+ s6/phone_call_1
429
+ s7/banana_eat_1
430
+ s4/waterbottle_shake_1
431
+ s8/waterbottle_pass_1
432
+ s6/cylinderlarge_lift
433
+ s9/hand_shake_1
434
+ s6/camera_browse_1
435
+ s2/camera_browse_1
436
+ s4/phone_pass_1
437
+ s1/stamp_lift
438
+ s7/torussmall_inspect_1
439
+ s1/cylindersmall_offhand_1
440
+ s7/cylinderlarge_lift
441
+ s2/duck_inspect_1
442
+ s7/spheremedium_pass_1
443
+ s8/toruslarge_lift
444
+ s3/hammer_pass_1
445
+ s3/cup_offhand_1
446
+ s2/headphones_use_1
447
+ s2/train_play_1
448
+ s10/scissors_use_1
449
+ s5/cubesmall_lift
450
+ s10/pyramidsmall_inspect_1
451
+ s3/mug_drink_4
452
+ s1/cubemedium_offhand_1
453
+ s6/apple_eat_1
454
+ s1/doorknob_use_2
455
+ s1/cylinderlarge_lift
456
+ s2/hand_lift
457
+ s5/gamecontroller_lift
458
+ s3/waterbottle_lift
459
+ s3/apple_pass_1
460
+ s10/flute_play_1
461
+ s5/spherelarge_inspect_1
462
+ s7/doorknob_use_1
463
+ s8/torusmedium_pass_1
464
+ s4/bowl_pass_1
465
+ s10/headphones_lift
466
+ s6/pyramidlarge_pass_1
467
+ s2/mug_toast_1
468
+ s4/hammer_use_3
469
+ s1/stanfordbunny_inspect_1
470
+ s3/alarmclock_pass_1
471
+ s8/cylindermedium_inspect_1
472
+ s1/elephant_lift
473
+ s6/waterbottle_pass_1
474
+ s8/cubelarge_lift
475
+ s3/camera_takepicture_1
476
+ s5/torussmall_inspect_1
477
+ s2/airplane_fly_1
478
+ s7/hammer_use_1
479
+ s1/spherelarge_lift
480
+ s1/cubemedium_pass_1
481
+ s2/headphones_lift
482
+ s8/piggybank_lift
483
+ s1/apple_lift
484
+ s3/spherelarge_inspect_1
485
+ s8/banana_pass_1
486
+ s8/stanfordbunny_pass_1
487
+ s4/flute_play_1
488
+ s6/banana_eat_1
489
+ s1/scissors_offhand_1
490
+ s1/binoculars_lift
491
+ s10/cup_lift
492
+ s7/eyeglasses_lift
493
+ s3/stapler_lift
494
+ s7/bowl_drink_1
495
+ s1/cup_drink_1
496
+ s4/alarmclock_lift
497
+ s6/headphones_use_1
498
+ s4/apple_lift
499
+ s6/mouse_pass_1
500
+ s7/flashlight_lift
501
+ s5/torusmedium_inspect_1
502
+ s10/cup_pour_1
503
+ s5/toruslarge_lift
504
+ s10/cup_drink_1
505
+ s7/cylinderlarge_pass_1
506
+ s4/wineglass_drink_1
507
+ s1/stapler_staple_2
508
+ s10/mug_drink_1
509
+ s1/flute_pass_1
510
+ s8/cylinderlarge_lift
511
+ s10/piggybank_pass_1
512
+ s7/duck_inspect_1
513
+ s9/stamp_lift
514
+ s6/bowl_lift
515
+ s6/banana_pass_1
516
+ s9/cubemedium_lift
517
+ s7/flute_pass_1
518
+ s8/eyeglasses_wear_1
519
+ s9/flashlight_lift
520
+ s9/hammer_use_2
521
+ s1/cup_pour_1
522
+ s2/piggybank_lift
523
+ s8/pyramidlarge_inspect_1
524
+ s2/teapot_pour_2
525
+ s6/pyramidsmall_inspect_2
526
+ s1/toothpaste_pass_1
527
+ s10/banana_peel_2
528
+ s1/wineglass_pass_1
529
+ s5/pyramidlarge_inspect_1
530
+ s5/cubemedium_inspect_1
531
+ s7/knife_chop_1
532
+ s6/mug_toast_1
533
+ s5/mug_drink_1
534
+ s6/banana_peel_1
535
+ s7/cylindermedium_pass_1
536
+ s10/mug_drink_2
537
+ s3/elephant_offhand_1
538
+ s4/stapler_pass_1
539
+ s1/torussmall_lift
540
+ s1/duck_lift
541
+ s5/flute_play_1
542
+ s1/airplane_fly_1
543
+ s2/headphones_pass_1
544
+ s3/lightbulb_screw_1
545
+ s8/toothbrush_brush_1
546
+ s10/mouse_use_1
547
+ s5/flute_play_2
548
+ s3/waterbottle_pour_2
549
+ s10/airplane_lift
550
+ s7/eyeglasses_wear_1
551
+ s10/stamp_stamp_1
552
+ s8/cup_drink_1
553
+ s4/alarmclock_pass_1
554
+ s3/train_pass_1
555
+ s2/hammer_use_2
556
+ s1/pyramidsmall_inspect_1
557
+ s9/cubesmall_inspect_1
558
+ s7/camera_browse_1
559
+ s10/spheresmall_pass_1
560
+ s1/phone_offhand_1
561
+ s6/waterbottle_shake_1
562
+ s1/alarmclock_offhand_1
563
+ s8/spheresmall_inspect_1
564
+ s9/cylindermedium_inspect_1
565
+ s6/knife_lift
566
+ s6/mouse_lift
567
+ s1/flashlight_lift
568
+ s9/cubelarge_pass_1
569
+ s2/pyramidlarge_pass_1_Retake
570
+ s7/stanfordbunny_inspect_1
571
+ s9/gamecontroller_play_1
572
+ s4/hand_pass_1
573
+ s3/toothpaste_squeeze_1
574
+ s9/duck_lift
575
+ s5/flashlight_on_1
576
+ s5/cup_lift
577
+ s10/waterbottle_drink_1
578
+ s8/cubesmall_pass_1
579
+ s10/train_lift
580
+ s10/spheresmall_inspect_1
581
+ s10/cup_pass_1
582
+ s1/cubesmall_pass_1
583
+ s4/fryingpan_cook_3
584
+ s6/elephant_inspect_1
585
+ s9/stapler_lift
586
+ s1/pyramidmedium_offhand_1
587
+ s9/pyramidmedium_pass_1
588
+ s9/teapot_pour_2
589
+ s4/fryingpan_cook_1
590
+ s1/eyeglasses_wear_1
591
+ s1/gamecontroller_play_1
592
+ s3/binoculars_offhand_1
593
+ s6/waterbottle_pour_1
594
+ s3/cubesmall_inspect_1
595
+ s8/pyramidlarge_pass_1
596
+ s2/train_pass_1
597
+ s10/mouse_lift
598
+ s10/torusmedium_pass_1
599
+ s8/waterbottle_drink_1
600
+ s7/cubelarge_lift
601
+ s5/duck_pass_1
602
+ s1/gamecontroller_offhand_1
603
+ s2/camera_pass_1
604
+ s7/gamecontroller_play_1
605
+ s7/toothbrush_pass_1
606
+ s8/phone_pass_1
607
+ s3/bowl_drink_1
608
+ s6/toruslarge_pass_1
609
+ s5/spheresmall_inspect_1
610
+ s8/flashlight_pass_1
611
+ s1/flashlight_on_2
612
+ s8/gamecontroller_lift
613
+ s3/stanfordbunny_lift
614
+ s8/bowl_pass_1
615
+ s4/banana_pass_1
616
+ s6/stapler_lift
617
+ s2/teapot_lift
618
+ s3/camera_takepicture_2
619
+ s8/torussmall_pass_1
620
+ s9/camera_pass_1
621
+ s4/apple_eat_1
622
+ s1/watch_set_1
623
+ s5/stanfordbunny_lift
624
+ s6/spheremedium_inspect_1
625
+ s1/eyeglasses_lift
626
+ s10/headphones_use_1
627
+ s3/doorknob_use_1
628
+ s10/apple_pass_1
629
+ s9/flute_pass_1
630
+ s1/toruslarge_inspect_1
631
+ s5/duck_lift
632
+ s9/doorknob_use_2
633
+ s10/torussmall_pass_1
634
+ s2/doorknob_use_2
635
+ s1/pyramidlarge_lift
636
+ s6/stamp_lift
637
+ s3/elephant_pass_1
638
+ s7/headphones_use_1
639
+ s6/cylindermedium_pass_1
640
+ s6/stapler_staple_2
641
+ s6/piggybank_pass_1
642
+ s5/spheremedium_inspect_1
643
+ s1/cubemedium_inspect_1
644
+ s4/pyramidmedium_pass_1
645
+ s7/cylinderlarge_inspect_1
646
+ s1/cylindermedium_offhand_1
647
+ s6/toothpaste_squeeze_1_Retake
648
+ s9/stapler_pass_1
649
+ s8/flashlight_on_1
650
+ s5/cup_drink_2
651
+ s8/apple_lift
652
+ s8/airplane_pass_1
653
+ s6/wineglass_pass_1
654
+ s4/cylinderlarge_inspect_1
655
+ s4/phone_call_1
656
+ s5/stapler_pass_1
657
+ s8/camera_takepicture_2
658
+ s3/mouse_pass_1
659
+ s7/pyramidlarge_inspect_1
660
+ s10/stapler_staple_2
661
+ s9/teapot_pass_1
662
+ s5/gamecontroller_play_1
663
+ s4/train_lift
664
+ s8/doorknob_lift
665
+ s1/watch_offhand_1
666
+ s7/mug_drink_2
667
+ s4/flashlight_pass_1
668
+ s1/mug_offhand_1
669
+ s4/camera_takepicture_3
670
+ s8/train_pass_1
671
+ s7/train_pass_1
672
+ s8/gamecontroller_play_1
673
+ s1/scissors_use_2
674
+ s9/piggybank_pass_1
675
+ s6/flute_lift
676
+ s1/banana_peel_1
677
+ s6/bowl_pass_1
678
+ s1/mouse_pass_1
679
+ s6/cubesmall_lift
680
+ s5/airplane_lift
681
+ s3/scissors_pass_1
682
+ s2/cylinderlarge_pass_1
683
+ s6/watch_set_1
684
+ s10/stamp_pass_1
685
+ s9/banana_lift
686
+ s3/toruslarge_lift
687
+ s10/hammer_use_1
688
+ s10/cubesmall_lift
689
+ s6/teapot_pass_1
690
+ s3/pyramidlarge_offhand_1
691
+ s8/cylindersmall_inspect_1
692
+ s3/cylinderlarge_inspect_1
693
+ s7/teapot_pass_1
694
+ s9/mouse_pass_1
695
+ s3/mug_drink_1
696
+ s2/spheremedium_pass_1
697
+ s9/waterbottle_pass_1
698
+ s2/flute_pass_1
699
+ s9/lightbulb_screw_1
700
+ s8/cubemedium_pass_1
701
+ s7/flashlight_pass_1
702
+ s9/eyeglasses_lift
703
+ s7/cubemedium_inspect_1
704
+ s5/waterbottle_lift
705
+ s6/cup_drink_1
706
+ s1/pyramidlarge_inspect_1
707
+ s3/airplane_fly_1
708
+ s10/pyramidmedium_lift
709
+ s10/bowl_lift
710
+ s2/cubelarge_pass_1
711
+ s7/stapler_staple_1
712
+ s8/flashlight_on_2
713
+ s4/cylindersmall_lift
714
+ s10/stanfordbunny_pass_1
715
+ s6/cubemedium_lift
716
+ s10/stapler_pass_1
717
+ s5/apple_pass_1
718
+ s7/doorknob_use_2
719
+ s4/watch_pass_1
720
+ s1/elephant_pass_1
721
+ s1/watch_set_2
722
+ s2/cylinderlarge_inspect_1
723
+ s2/pyramidlarge_inspect_1
724
+ s9/eyeglasses_wear_1
725
+ s4/phone_lift
726
+ s1/spheremedium_lift
727
+ s3/airplane_offhand_1
728
+ s6/spheresmall_pass_1
729
+ s9/piggybank_lift_Retake
730
+ s8/cylinderlarge_inspect_1
731
+ s5/wineglass_lift
732
+ s3/train_play_1
733
+ s10/knife_chop_1
734
+ s9/wineglass_drink_2
735
+ s6/hammer_lift
736
+ s9/cylindersmall_pass_1
737
+ s4/bowl_drink_1
738
+ s1/mug_toast_1
739
+ s1/pyramidmedium_inspect_1
740
+ s9/stamp_stamp_1
741
+ s10/toothpaste_squeeze_1
742
+ s1/spheresmall_lift
743
+ s3/doorknob_use_2
744
+ s7/fryingpan_cook_1
745
+ s5/teapot_pass_1
746
+ s8/phone_lift
747
+ s2/cubesmall_inspect_1
748
+ s1/toruslarge_pass_1
749
+ s6/headphones_pass_1
750
+ s8/cylinderlarge_pass_1
751
+ s4/toothpaste_pass_1
752
+ s4/camera_takepicture_2
753
+ s3/camera_offhand_1
754
+ s8/cup_offhand_1
755
+ s9/spherelarge_pass_1
756
+ s10/stanfordbunny_lift
757
+ s1/spheremedium_offhand_1
758
+ s4/torusmedium_lift
759
+ s9/cylinderlarge_lift
760
+ s6/hammer_use_1
761
+ s4/wineglass_toast_1
762
+ s2/cylinderlarge_lift
763
+ s6/cubesmall_inspect_1
764
+ s1/camera_takepicture_2
765
+ s3/phone_pass_1
766
+ s1/hand_inspect_1
767
+ s8/bowl_drink_2
768
+ s1/stapler_offhand_1
769
+ s1/piggybank_use_1
770
+ s1/apple_pass_1
771
+ s9/wineglass_drink_1
772
+ s10/mug_lift
773
+ s10/cylinderlarge_lift
774
+ s2/stamp_stamp_1
775
+ s10/stanfordbunny_inspect_1
776
+ s4/cubelarge_lift
777
+ s8/alarmclock_see_1
778
+ s5/cubelarge_lift
779
+ s3/eyeglasses_pass_1
780
+ s1/pyramidsmall_pass_1
781
+ s2/cylindermedium_pass_1
782
+ s8/fryingpan_offhand_1
783
+ s1/mug_drink_3
784
+ s2/piggybank_pass_1_Retake
785
+ s8/headphones_lift
786
+ s9/alarmclock_see_1
787
+ s5/toruslarge_pass_1
788
+ s1/stapler_staple_1
789
+ s1/stanfordbunny_offhand_1
790
+ s3/airplane_lift
791
+ s8/cubemedium_inspect_1
792
+ s9/stapler_staple_2
793
+ s3/fryingpan_lift
794
+ s10/spherelarge_inspect_1
795
+ s7/pyramidlarge_pass_1
796
+ s5/mouse_lift
797
+ s8/watch_set_2
798
+ s3/wineglass_drink_1
799
+ s6/spheremedium_lift
800
+ s2/stapler_staple_2
801
+ s9/airplane_pass_1
802
+ s6/duck_inspect_1
803
+ s10/lightbulb_pass_1
804
+ s8/stapler_pass_1
805
+ s6/camera_takepicture_3
806
+ s10/camera_takepicture_2
807
+ s10/mouse_pass_1
808
+ s3/cup_drink_2
809
+ s5/alarmclock_see_1
810
+ s10/elephant_inspect_1
811
+ s6/apple_pass_1
812
+ s3/gamecontroller_pass_1
813
+ s10/cubemedium_pass_1
814
+ s5/wineglass_pass_1
815
+ s5/waterbottle_pass_1
816
+ s10/teapot_pour_2
817
+ s6/train_pass_1
818
+ s7/hammer_use_2
819
+ s8/apple_pass_1
820
+ s6/cup_pour_1
821
+ s7/wineglass_lift
822
+ s9/toothbrush_pass_1
823
+ s6/doorknob_lift
824
+ s9/banana_eat_1
825
+ s4/cylinderlarge_pass_1
826
+ s10/cubelarge_lift
827
+ s1/camera_lift
828
+ s1/stanfordbunny_pass_1
829
+ s7/watch_pass_1
830
+ s9/cup_lift
831
+ s7/apple_pass_1
832
+ s3/piggybank_pass_1
833
+ s1/cylindersmall_inspect_1
834
+ s4/cup_drink_2
835
+ s8/spheremedium_pass_1
836
+ s6/waterbottle_open_1
837
+ s3/watch_set_1
838
+ s8/cubesmall_inspect_1
839
+ s3/duck_inspect_1
840
+ s7/cylindermedium_inspect_1
841
+ s9/cylindersmall_inspect_1
842
+ s1/spherelarge_inspect_1
843
+ s1/spheresmall_offhand_1
844
+ s8/cylindermedium_lift
845
+ s1/bowl_drink_2
846
+ s5/train_pass_1
847
+ s4/alarmclock_see_1
848
+ s8/flashlight_offhand_1
849
+ s2/cup_drink_1
850
+ s8/duck_lift
851
+ s6/cubelarge_pass_1
852
+ s6/flute_pass_1
853
+ s2/toruslarge_inspect_1
854
+ s4/camera_pass_1
855
+ s7/train_play_1
856
+ s9/mouse_use_1
857
+ s5/cylindermedium_pass_1
858
+ s3/pyramidsmall_pass_1
859
+ s3/eyeglasses_offhand_1
860
+ s8/wineglass_toast_1
861
+ s7/eyeglasses_clean_1
862
+ s5/toothbrush_brush_1
863
+ s10/toothpaste_squeeze_2
864
+ s1/flute_lift
865
+ s3/toothbrush_pass_1
866
+ s1/bowl_lift
867
+ s9/bowl_drink_2
868
+ s7/banana_peel_2
869
+ s8/doorknob_use_2
870
+ s8/torussmall_inspect_1
871
+ s1/camera_pass_1
872
+ s8/hand_inspect_1
873
+ s1/watch_pass_1
874
+ s10/watch_pass_1
875
+ s3/toruslarge_inspect_1
876
+ s9/torusmedium_inspect_1
877
+ s9/cylindermedium_lift
878
+ s10/teapot_pass_1
879
+ s6/wineglass_drink_2
880
+ s7/headphones_lift
881
+ s10/stapler_staple_1
882
+ s2/pyramidmedium_inspect_1
883
+ s3/flashlight_pass_1
884
+ s1/torusmedium_pass_1
885
+ s6/spherelarge_pass_1
886
+ s2/phone_pass_1
887
+ s2/pyramidmedium_pass_1
888
+ s9/stanfordbunny_pass_1
889
+ s8/pyramidmedium_inspect_1
890
+ s1/toothbrush_lift
891
+ s7/hammer_pass_1
892
+ s8/hand_lift
893
+ s8/camera_takepicture_3
894
+ s8/stamp_lift
895
+ s8/torusmedium_inspect_1
896
+ s7/phone_call_1
897
+ s3/pyramidlarge_inspect_1
898
+ s6/torussmall_pass_1
899
+ s3/mouse_use_1
900
+ s6/toruslarge_inspect_1
901
+ s3/elephant_lift
902
+ s1/wineglass_drink_2
903
+ s7/lightbulb_pass_1
904
+ s8/hammer_pass_1
905
+ s6/pyramidmedium_inspect_1
906
+ s8/gamecontroller_pass_1
907
+ s1/waterbottle_pour_1
908
+ s9/toothpaste_lift
909
+ s9/spherelarge_inspect_1
910
+ s8/teapot_pour_1
911
+ s4/spheremedium_inspect_1
912
+ s6/stamp_stamp_1
913
+ s9/hammer_use_1
914
+ s1/torusmedium_offhand_1
915
+ s1/alarmclock_see_1
916
+ s6/phone_pass_1
917
+ s5/waterbottle_open_1
918
+ s4/cubemedium_pass_1
919
+ s6/apple_lift
920
+ s1/headphones_pass_1
921
+ s8/hammer_use_3
922
+ s3/piggybank_use_1
923
+ s3/wineglass_toast_1
924
+ s6/spheresmall_lift
925
+ s1/camera_takepicture_3_Retake
926
+ s1/cylindermedium_lift
927
+ s6/teapot_pour_1
928
+ s4/train_pass_1
929
+ s7/toruslarge_inspect_1_Retake
930
+ s8/cubelarge_pass_1
931
+ s3/alarmclock_offhand_1
932
+ s4/hand_lift
933
+ s8/mug_drink_2
934
+ s5/cup_pass_1
935
+ s3/stapler_staple_1
936
+ s10/phone_lift
937
+ s6/torusmedium_inspect_1
938
+ s4/cup_drink_1
939
+ s2/train_lift
940
+ s8/stapler_staple_1
941
+ s2/apple_eat_1
942
+ s6/torusmedium_pass_1
943
+ s4/watch_set_1
944
+ s3/camera_takepicture_3
945
+ s9/spheresmall_inspect_1
946
+ s4/cubesmall_pass_1
947
+ s6/alarmclock_lift
948
+ s2/toruslarge_pass_1
949
+ s9/pyramidmedium_inspect_1
950
+ s3/stanfordbunny_inspect_1
951
+ s6/scissors_use_2
952
+ s1/stamp_stamp_1
953
+ s3/torusmedium_pass_1
954
+ s6/eyeglasses_pass_1
955
+ s7/cubelarge_inspect_1
956
+ s3/stamp_lift
957
+ s8/mouse_pass_1
958
+ s7/elephant_inspect_1
959
+ s8/camera_pass_1
960
+ s10/bowl_pass_1
961
+ s1/waterbottle_lift
962
+ s8/bowl_lift
963
+ s3/phone_call_1
964
+ s8/phone_offhand_1
965
+ s3/eyeglasses_wear_1
966
+ s6/bowl_drink_1_Retake
967
+ s9/torussmall_inspect_1
968
+ s1/piggybank_offhand_1
969
+ s1/mug_drink_2
970
+ s4/airplane_fly_1
971
+ s8/cubelarge_offhand_1
972
+ s10/camera_pass_1
973
+ s4/stanfordbunny_inspect_1
974
+ s6/pyramidlarge_inspect_1
975
+ s10/wineglass_drink_2
976
+ s1/duck_inspect_1
977
+ s4/duck_inspect_1
978
+ s1/mouse_use_1
979
+ s5/eyeglasses_clean_2
980
+ s10/camera_takepicture_1
981
+ s8/fryingpan_pass_1
982
+ s10/cubelarge_inspect_1
983
+ s7/hammer_use_3
984
+ s5/spherelarge_lift
985
+ s1/fryingpan_cook_1
986
+ s5/cubemedium_pass_1
987
+ s3/flute_play_1
988
+ s9/headphones_use_1
989
+ s1/cylinderlarge_inspect_1
990
+ s7/hammer_lift
991
+ s7/wineglass_toast_1
992
+ s4/pyramidmedium_inspect_1
993
+ s10/bowl_drink_1_Retake
994
+ s9/piggybank_use_1
995
+ s1/cubesmall_lift
996
+ s1/fryingpan_cook_2
997
+ s1/apple_eat_1
998
+ s9/watch_set_2
999
+ s9/bowl_lift
1000
+ s10/hand_lift
1001
+ s10/cylindersmall_inspect_1
1002
+ s1/piggybank_lift
1003
+ s2/toothpaste_squeeze_1
1004
+ s6/binoculars_see_1
1005
+ s6/spherelarge_inspect_1
1006
+ s8/alarmclock_lift
1007
+ s1/cylindersmall_pass_1
1008
+ s6/pyramidlarge_lift
1009
+ s4/binoculars_pass_1
1010
+ s10/bowl_drink_2
1011
+ s5/flute_pass_1
1012
+ s10/torusmedium_lift
1013
+ s1/duck_pass_1
1014
+ s2/cubelarge_inspect_1
1015
+ s8/watch_pass_1
1016
+ s1/waterbottle_open_1
1017
+ s9/flute_play_1
1018
+ s9/airplane_lift
1019
+ s1/mug_drink_4
1020
+ s4/wineglass_pass_1
1021
+ s6/camera_pass_1
1022
+ s4/duck_pass_1
1023
+ s10/cylindersmall_pass_1
1024
+ s10/alarmclock_lift_Retake
1025
+ s4/toruslarge_pass_1
1026
+ s10/wineglass_pass_1
1027
+ s5/cylindermedium_lift
1028
+ s2/pyramidlarge_lift
1029
+ s3/flute_offhand_1
1030
+ s8/camera_offhand_1
1031
+ s2/elephant_inspect_1
1032
+ s5/mug_pass_1
1033
+ s3/hammer_use_1
1034
+ s9/mug_drink_2
1035
+ s5/wineglass_drink_2
1036
+ s5/piggybank_lift
1037
+ s9/waterbottle_open_1
1038
+ s7/alarmclock_lift
1039
+ s1/eyeglasses_offhand_1
1040
+ s1/flute_play_1
1041
+ s8/wineglass_lift
1042
+ s7/waterbottle_shake_1
1043
+ s1/cubelarge_inspect_1
1044
+ s1/hammer_use_2
1045
+ s5/doorknob_lift
1046
+ s1/spheremedium_inspect_1
1047
+ s9/spheremedium_pass_1
1048
+ s6/knife_peel_1
1049
+ s4/fryingpan_pass_1
1050
+ s3/binoculars_lift
1051
+ s7/toruslarge_lift
1052
+ s4/piggybank_lift
1053
+ s3/cubelarge_pass_1
1054
+ s2/banana_peel_1
1055
+ s1/alarmclock_lift
1056
+ s5/train_lift
1057
+ s1/cup_pass_1
1058
+ s1/waterbottle_drink_1
1059
+ s3/bowl_drink_2
1060
+ s5/cylinderlarge_inspect_1
1061
+ s4/headphones_use_1
1062
+ s3/hand_pass_1
1063
+ s3/flashlight_on_1
1064
+ s3/toruslarge_pass_1
1065
+ s9/flashlight_on_1
1066
+ s8/mug_offhand_1
1067
+ s8/cubesmall_lift
1068
+ s10/airplane_pass_1
text2motion/data/GRAB/train_long.txt ADDED
@@ -0,0 +1,8 @@
1
+ s1/airplane_fly_1
2
+ s1/airplane_lift
3
+ s1/airplane_offhand_1
4
+ s1/airplane_pass_1
5
+ s1/alarmclock_offhand_1
6
+ s1/alarmclock_pass_1
7
+ s1/alarmclock_see_1
8
+ s1/apple_eat_1
text2motion/data/GRAB/train_short.txt ADDED
@@ -0,0 +1 @@
1
+ custom/s5/airplane_pass_1
text2motion/data/GRAB/train_val.txt ADDED
@@ -0,0 +1 @@
1
+ s3/airplane_pass_1
text2motion/datasets/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ from .dataloader import build_dataloader
2
+ from .dataset import Text2MotionDataset
3
+ from .evaluator import (EvaluationDataset, EvaluatorModelWrapper,
4
+ get_dataset_motion_loader, get_motion_loader)
5
+
6
+ from .utils import drop_shapes_from_motion_arr
7
+ # from .rendering import render_meshes
8
+ from .motionx_explorer import (load_data_as_dict,
9
+ motion_arr_to_dict, smplx_dict_to_array, to_smplx_dict)
10
+
11
+ __all__ = [
12
+ 'Text2MotionDataset', 'EvaluationDataset', 'build_dataloader',
13
+ 'get_dataset_motion_loader', 'get_motion_loader', 'EvaluatorModelWrapper',
14
+ 'load_data_as_dict', 'motion_arr_to_dict', 'smplx_dict_to_array',
15
+ 'drop_shapes_from_motion_arr', 'to_smplx_dict'  # 'render_meshes' omitted while its import is commented out above
16
+ ]
text2motion/datasets/combine_gifs.py ADDED
@@ -0,0 +1,87 @@
1
+ import os
2
+ import re
3
+
4
+ from PIL import Image, ImageDraw, ImageFont, ImageSequence
5
+
6
+
7
+ def combine_gifs_with_timestep(gif_paths, output_path):
8
+ """
9
+ Combines multiple GIF files into a single GIF, adding a timestep extracted from each file's name.
10
+
11
+ :param gif_paths: List of paths to the GIF files.
12
+ :param output_path: Path where the combined GIF will be saved.
13
+ """
14
+ frames = []
15
+ for gif_path in gif_paths:
16
+ # Extract timestep from filename using regular expressions
17
+ timestep = re.search(r'sample_tensor\(\[(\d+)\]\)', gif_path)
18
+ if timestep:
19
+ timestep = timestep.group(1)
20
+ else:
21
+ print(f"error: Timestep not found in filename {gif_path}. Aborting.")
22
+ exit(1)
23
+
24
+ start_i = 0
25
+ n_frames_keep = 30
26
+ tot_frames = 30
27
+ min_frames = 20
28
+ # Open the GIF
29
+ with Image.open(gif_path) as img:
30
+ # Loop over each frame in the GIF
31
+ # so the lower the timestep, the more frames we keep
32
+ n_frames_keep = (999-int(timestep))/999 * tot_frames + min_frames
33
+ for i, frame in enumerate(ImageSequence.Iterator(img)):
34
+ if i >= start_i:
35
+ # if int(timestep) >= 50 and i >= (n_frames_keep + start_i):
36
+ # break # Stop after 10 frames
37
+ # elif int(timestep) < 50 and int(timestep) >= 20 and i >= n_frames_keep + start_i:
38
+ # break
39
+ # elif int(timestep) < 20 and int(timestep) > 0 and i >= n_frames_keep + start_i:
40
+ # break
41
+ if int(timestep) > 0 and i >= n_frames_keep + start_i:
42
+ break
43
+ # elif int(timestep) == 999 and i >= n_frames_keep + start_i:
44
+ # break
45
+ # Convert the frame to RGB mode and draw the timestep on it
46
+ frame = frame.convert("RGBA")
47
+ d = ImageDraw.Draw(frame)
48
+
49
+ # Load a font - you can adjust the size and font type as needed
50
+ font_path = '/work3/s222376/MotionDiffuse2/text2motion/datasets/arial.ttf'
51
+ font_large = ImageFont.truetype(font_path, 20) # Large font for the timestep number
52
+ font_small = ImageFont.truetype(font_path, 20) # Small font for the word 'timestep'
53
+
54
+ # Calculate text size and position
55
+ number_size = d.textsize(timestep, font=font_large)
56
+ # label_size = d.textsize("timestep", font=font_small)
57
+ # total_width = number_size[0] + label_size[0]
58
+ # x_number = (frame.width - total_width) // 2
59
+ # y = 10 # 10 pixels from the top
60
+ y_offset = 100
61
+ x_offset = 20
62
+ center = (frame.width // 2 - x_offset, frame.height // 2 - y_offset)
63
+ # Draw the timestep number and label
64
+ color = (0,0,0)
65
+ d.text((10,10), f"t={timestep}", font=font_large, fill=color)
66
+ # d.text((x_number + number_size[0], y + (number_size[1] - label_size[1]) // 2), "timestep", font=font_small, fill=color)
67
+
68
+ frames.append(frame)
69
+
70
+ # Save the frames as a new GIF
71
+ frames[0].save(output_path, save_all=True, append_images=frames[1:], loop=0)
72
+
73
+ # Example usage
74
+ dir = "/work3/s222376/MotionDiffuse2/text2motion/gifs/md_fulem_2g_excl_196_seed42/"
75
+ # list all files in dir
76
+ # gif_paths = [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]
77
+ # # filter to those containing 'sample_tensor' and 'happiness'
78
+ # gif_paths = [f for f in gif_paths if 'sample_tensor' in f and 'happiness' in f]
79
+ # # reverse sort by timestep
80
+ # gif_paths.sort(key=lambda f: int(re.search(r'sample_tensor\(\[(\d+)\]\)', f).group(1)), reverse=True)
81
+ # print(gif_paths)
82
+ times = [999, 80, 10, 0]
83
+ gif_paths = [f'sample_tensor([{t}])_happiness.gif' for t in times]
84
+ full_gif_paths = [os.path.join(dir, gif_path) for gif_path in gif_paths]
85
+ output_path = f'{dir}combined_gif.gif' # Replace with your desired output file path
86
+ combine_gifs_with_timestep(full_gif_paths, output_path)
87
+ print(f"Combined GIF saved to {output_path}")
text2motion/datasets/dataloader.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from functools import partial
3
+ from typing import Optional, Union
4
+
5
+ import numpy as np
6
+ from mmcv.runner import get_dist_info
7
+ from torch.utils.data import DataLoader
8
+ from torch.utils.data.dataset import Dataset
9
+
10
+ import torch
11
+ from torch.utils.data import DistributedSampler as _DistributedSampler
12
+
13
+
14
+ class DistributedSampler(_DistributedSampler):
15
+
16
+ def __init__(self,
17
+ dataset,
18
+ num_replicas=None,
19
+ rank=None,
20
+ shuffle=True,
21
+ round_up=True):
22
+ super().__init__(dataset, num_replicas=num_replicas, rank=rank)
23
+ self.shuffle = shuffle
24
+ self.round_up = round_up
25
+ if self.round_up:
26
+ self.total_size = self.num_samples * self.num_replicas
27
+ else:
28
+ self.total_size = len(self.dataset)
29
+
30
+ def __iter__(self):
31
+ # deterministically shuffle based on epoch
32
+ if self.shuffle:
33
+ g = torch.Generator()
34
+ g.manual_seed(self.epoch)
35
+ indices = torch.randperm(len(self.dataset), generator=g).tolist()
36
+ else:
37
+ indices = torch.arange(len(self.dataset)).tolist()
38
+
39
+ # add extra samples to make it evenly divisible
40
+ if self.round_up:
41
+ indices = (
42
+ indices *
43
+ int(self.total_size / len(indices) + 1))[:self.total_size]
44
+ assert len(indices) == self.total_size
45
+
46
+ # subsample
47
+ indices = indices[self.rank:self.total_size:self.num_replicas]
48
+ if self.round_up:
49
+ assert len(indices) == self.num_samples
50
+
51
+ return iter(indices)
52
+
53
+
54
+ def build_dataloader(dataset: Dataset,
55
+ samples_per_gpu: int,
56
+ workers_per_gpu: int,
57
+ num_gpus: Optional[int] = 1,
58
+ dist: Optional[bool] = True,
59
+ shuffle: Optional[bool] = True,
60
+ round_up: Optional[bool] = True,
61
+ seed: Optional[Union[int, None]] = None,
62
+ persistent_workers: Optional[bool] = True,
63
+ **kwargs):
64
+ """Build PyTorch DataLoader.
65
+
66
+ In distributed training, each GPU/process has a dataloader.
67
+ In non-distributed training, there is only one dataloader for all GPUs.
68
+
69
+ Args:
70
+ dataset (:obj:`Dataset`): A PyTorch dataset.
71
+ samples_per_gpu (int): Number of training samples on each GPU, i.e.,
72
+ batch size of each GPU.
73
+ workers_per_gpu (int): How many subprocesses to use for data loading
74
+ for each GPU.
75
+ num_gpus (int, optional): Number of GPUs. Only used in non-distributed
76
+ training.
77
+ dist (bool, optional): Distributed training/test or not. Default: True.
78
+ shuffle (bool, optional): Whether to shuffle the data at every epoch.
79
+ Default: True.
80
+ round_up (bool, optional): Whether to round up the length of dataset by
81
+ adding extra samples to make it evenly divisible. Default: True.
82
+ persistent_workers (bool): If True, the data loader will not shutdown
83
+ the worker processes after a dataset has been consumed once.
84
+ This allows to maintain the workers Dataset instances alive.
85
+ The argument also has effect in PyTorch>=1.7.0.
86
+ Default: True
87
+ kwargs: any keyword argument to be used to initialize DataLoader
88
+
89
+ Returns:
90
+ DataLoader: A PyTorch dataloader.
91
+ """
92
+ rank, world_size = get_dist_info()
93
+ if dist:
94
+ sampler = DistributedSampler(
95
+ dataset, world_size, rank, shuffle=shuffle, round_up=round_up)
96
+ shuffle = False
97
+ batch_size = samples_per_gpu
98
+ num_workers = workers_per_gpu
99
+ else:
100
+ sampler = None
101
+ batch_size = num_gpus * samples_per_gpu
102
+ num_workers = num_gpus * workers_per_gpu
103
+
104
+ init_fn = partial(
105
+ worker_init_fn, num_workers=num_workers, rank=rank,
106
+ seed=seed) if seed is not None else None
107
+
108
+ data_loader = DataLoader(
109
+ dataset,
110
+ batch_size=batch_size,
111
+ sampler=sampler,
112
+ num_workers=num_workers,
113
+ pin_memory=False,
114
+ shuffle=shuffle,
115
+ worker_init_fn=init_fn,
116
+ persistent_workers=persistent_workers,
117
+ **kwargs)
118
+
119
+ return data_loader
120
+
121
+
122
+ def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int):
123
+ """Init random seed for each worker."""
124
+ # The seed of each worker equals to
125
+ # num_worker * rank + worker_id + user_seed
126
+ worker_seed = num_workers * rank + worker_id + seed
127
+ np.random.seed(worker_seed)
128
+ random.seed(worker_seed)
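A minimal, non-distributed usage sketch for build_dataloader (the toy tensors and the batch/worker counts are illustrative assumptions; run from the text2motion directory so the datasets package resolves):

import torch
from torch.utils.data import TensorDataset
from datasets.dataloader import build_dataloader

# 100 fake motion clips of shape (196, 212) with dummy lengths
toy = TensorDataset(torch.randn(100, 196, 212), torch.full((100,), 196, dtype=torch.long))
loader = build_dataloader(toy, samples_per_gpu=8, workers_per_gpu=2,
                          dist=False, shuffle=True, seed=42)
motions, lengths = next(iter(loader))
print(motions.shape)  # torch.Size([8, 196, 212])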
text2motion/datasets/dataset.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import codecs as cs
2
+ import os
3
+ import random
4
+ from os.path import join as pjoin
5
+
6
+ import numpy as np
7
+ import torch
8
+ from torch.utils import data
9
+ from tqdm import tqdm
10
+
11
+ from .utils import drop_shapes_from_motion_arr, load_label_from_file
12
+
13
+ class Text2MotionDataset(data.Dataset):
14
+ """Dataset for Text2Motion generation task.
15
+
16
+ """
17
+ def __init__(self, opt, mean, std, split_file, times=1, w_vectorizer=None, eval_mode=False):
18
+ self.opt = opt
19
+ self.max_length = 20
20
+ self.times = times
21
+ self.w_vectorizer = w_vectorizer
22
+ self.eval_mode = eval_mode
23
+ min_motion_len = 40 if self.opt.dataset_name =='t2m' else 24
24
+
25
+ joints_num = opt.joints_num
26
+
27
+ data_dict = {}
28
+ id_list = []
29
+ print(f"split file: {split_file}")
30
+ with cs.open(split_file, 'r') as f:
31
+ for line in f.readlines():
32
+ id_list.append(line.strip())
33
+
34
+ new_name_list = []
35
+ length_list = []
36
+ print(f"id-list length: {len(id_list)}")
37
+ for name in tqdm(id_list):
38
+ try:
39
+ print(f"attempting to load motion for {name} at {pjoin(opt.motion_dir, name + '.npy')}")
40
+ motion = np.load(pjoin(opt.motion_dir, name + '.npy'))
41
+ if self.opt.dataset_name.lower() == 'grab':
42
+ motion = drop_shapes_from_motion_arr(motion)
43
+ assert motion.shape[-1] == opt.dim_pose, f"motion shape {motion.shape} does not match dim_pose {opt.dim_pose}"
44
+ print(f"grab motion shape: {motion.shape}")
45
+ print(f"len of motion: {len(motion)}")
46
+ # TODO (elmc): verify we don't need this for GRAB data
47
+ # if (len(motion)) < min_motion_len or (len(motion) >= 200):
48
+ # continue
49
+ text_data = []
50
+ flag = False
51
+ with cs.open(pjoin(opt.text_dir, name + '.txt')) as f:
52
+ for line in f.readlines():
53
+ text_dict = {}
54
+ line_split = line.strip().split('#')
55
+ caption = line_split[0]
56
+ # append face_text to caption
57
+ emotion_label = load_label_from_file(pjoin(opt.face_text_dir, name + '.txt'))
58
+ caption = f"{emotion_label} {caption}"
59
+ f_tag = 0.0
60
+ to_tag = 0.0
61
+ # TODO (elmc): add actual tokens back for grab
62
+ tokens = []
63
+ if self.opt.dataset_name.lower() != 'grab':
64
+ tokens = line_split[1].split(' ')
65
+ f_tag = float(line_split[2])
66
+ to_tag = float(line_split[3])
67
+ f_tag = 0.0 if np.isnan(f_tag) else f_tag
68
+ to_tag = 0.0 if np.isnan(to_tag) else to_tag
69
+
70
+ text_dict['caption'] = caption
71
+ text_dict['tokens'] = tokens
72
+ if f_tag == 0.0 and to_tag == 0.0:
73
+ flag = True
74
+ text_data.append(text_dict)
75
+ else:
76
+ n_motion = motion[int(f_tag*20) : int(to_tag*20)]
77
+ if (len(n_motion)) < min_motion_len or (len(n_motion) >= 200):
78
+ continue
79
+ new_name = random.choice('ABCDEFGHIJKLMNOPQRSTUVW') + '_' + name
80
+ while new_name in data_dict:
81
+ new_name = random.choice('ABCDEFGHIJKLMNOPQRSTUVW') + '_' + name
82
+ data_dict[new_name] = {'motion': n_motion,
83
+ 'length': len(n_motion),
84
+ 'text':[text_dict]}
85
+ new_name_list.append(new_name)
86
+ length_list.append(len(n_motion))
87
+
88
+ if flag:
89
+ data_dict[name] = {'motion': motion,
90
+ 'length': len(motion),
91
+ 'text':text_data}
92
+ new_name_list.append(name)
93
+ length_list.append(len(motion))
94
+ except Exception as e:
95
+ # Some motion may not exist in KIT dataset
96
+ print(f"failed to load motion for {name} at {pjoin(opt.motion_dir, name + '.npy')} due to {e}")
97
+
98
+ if not new_name_list or not length_list:
99
+ raise ValueError(f'No data loaded, new_name_list has len {len(new_name_list)} and length_list has len {len(length_list)}')
100
+ name_list, length_list = zip(*sorted(zip(new_name_list, length_list), key=lambda x: x[1]))
101
+ print(f"LOADED length of name_list: {len(name_list)}")
102
+ # TODO (elmc): calculate mean and std and save to load here?
103
+ if opt.is_train:
104
+ # # TODO (elle): how best to standardize the data?
105
+
106
+ # # root_rot_velocity (B, seq_len, 1)
107
+ # std[0:1] = std[0:1] / opt.feat_bias
108
+ # # root_linear_velocity (B, seq_len, 2)
109
+ # std[1:3] = std[1:3] / opt.feat_bias
110
+ # # root_y (B, seq_len, 1)
111
+ # std[3:4] = std[3:4] / opt.feat_bias
112
+ # # ric_data (B, seq_len, (joint_num - 1)*3)
113
+ # std[4: 4 + (joints_num - 1) * 3] = std[4: 4 + (joints_num - 1) * 3] / 1.0
114
+ # # rot_data (B, seq_len, (joint_num - 1)*6)
115
+ # std[4 + (joints_num - 1) * 3: 4 + (joints_num - 1) * 9] = std[4 + (joints_num - 1) * 3: 4 + (
116
+ # joints_num - 1) * 9] / 1.0
117
+ # # local_velocity (B, seq_len, joint_num*3)
118
+ # std[4 + (joints_num - 1) * 9: 4 + (joints_num - 1) * 9 + joints_num * 3] = std[
119
+ # 4 + (joints_num - 1) * 9: 4 + (
120
+ # joints_num - 1) * 9 + joints_num * 3] / 1.0
121
+ # # foot contact (B, seq_len, 4)
122
+ # std[4 + (joints_num - 1) * 9 + joints_num * 3:] = std[
123
+ # 4 + (joints_num - 1) * 9 + joints_num * 3:] / opt.feat_bias
124
+
125
+ # assert 4 + (joints_num - 1) * 9 + joints_num * 3 + 4 == mean.shape[-1]
126
+ # TODO (elmc): add back in
127
+ np.save(pjoin(opt.meta_dir, 'mean.npy'), mean)
128
+ np.save(pjoin(opt.meta_dir, 'std.npy'), std)
129
+
130
+ self.mean = mean
131
+ self.std = std
132
+ self.length_arr = np.array(length_list)
133
+ self.data_dict = data_dict
134
+ self.name_list = name_list
135
+
136
+ def inv_transform(self, data):
137
+ return data * self.std + self.mean
138
+
139
+ def real_len(self):
140
+ return len(self.data_dict)
141
+
142
+ def __len__(self):
143
+ # authors explain why they multiply by self.times here instead of increasing epochs
144
+ # https://github.com/mingyuan-zhang/MotionDiffuse/issues/12
145
+ # also say it's not necessary to set persistent_workers = True in build_dataloader
146
+ return self.real_len() * self.times
147
+
148
+ def __getitem__(self, item):
149
+ idx = item % self.real_len()
150
+ data = self.data_dict[self.name_list[idx]]
151
+ motion, m_length, text_list = data['motion'], data['length'], data['text']
152
+ # Randomly select a caption
153
+ text_data = random.choice(text_list)
154
+ caption = text_data['caption']
155
+ max_motion_length = self.opt.max_motion_length
156
+ # TODO (elmc): delete this and replace with if m_length >= self..etc
157
+ # motion = motion[:max_motion_length]
158
+ # TODO (elmc): add back in
159
+ if m_length >= self.opt.max_motion_length:
160
+ idx = random.randint(0, len(motion) - max_motion_length)
161
+ motion = motion[idx: idx + max_motion_length]
162
+ else:
163
+ padding_len = max_motion_length - m_length
164
+ D = motion.shape[1]
165
+ padding_zeros = np.zeros((padding_len, D))
166
+ motion = np.concatenate((motion, padding_zeros), axis=0)
167
+
168
+ assert len(motion) == max_motion_length
169
+ "Z Normalization"
170
+ # TODO (elmc): add standardization back in
171
+ motion = (motion - self.mean) / self.std
172
+
173
+ if self.eval_mode:
174
+ tokens = text_data['tokens']
175
+ if len(tokens) < self.opt.max_text_len:
176
+ # pad with "unk"
177
+ tokens = ['sos/OTHER'] + tokens + ['eos/OTHER']
178
+ sent_len = len(tokens)
179
+ tokens = tokens + ['unk/OTHER'] * (self.opt.max_text_len + 2 - sent_len)
180
+ else:
181
+ # crop
182
+ tokens = tokens[:self.opt.max_text_len]
183
+ tokens = ['sos/OTHER'] + tokens + ['eos/OTHER']
184
+ sent_len = len(tokens)
185
+ pos_one_hots = []
186
+ word_embeddings = []
187
+ for token in tokens:
188
+ word_emb, pos_oh = self.w_vectorizer[token]
189
+ pos_one_hots.append(pos_oh[None, :])
190
+ word_embeddings.append(word_emb[None, :])
191
+ pos_one_hots = np.concatenate(pos_one_hots, axis=0)
192
+ word_embeddings = np.concatenate(word_embeddings, axis=0)
193
+ return word_embeddings, pos_one_hots, caption, sent_len, motion, m_length
194
+ return caption, motion, m_length
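The pad-then-normalise step at the end of Text2MotionDataset.__getitem__ can be exercised in isolation (a sketch with random data; max_motion_length=196 and the 212-dim GRAB pose match the settings used elsewhere in this commit, and the mean/std stand-ins replace the saved meta statistics):

import numpy as np

max_motion_length, D = 196, 212
motion = np.random.randn(120, D)                    # a clip shorter than the window
mean, std = np.zeros(D), np.ones(D)                 # stand-ins for meta/mean.npy and meta/std.npy
padding = np.zeros((max_motion_length - len(motion), D))
motion = np.concatenate((motion, padding), axis=0)  # (196, 212), zero-padded at the end
motion = (motion - mean) / std                      # same z-normalisation as __getitem__
print(motion.shape)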
text2motion/datasets/evaluator.py ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import codecs as cs
2
+ import os
3
+ import random
4
+ from os.path import join as pjoin
5
+
6
+ import numpy as np
7
+ import torch
8
+ from torch.utils.data import DataLoader, Dataset
9
+ from torch.utils.data._utils.collate import default_collate
10
+ from tqdm import tqdm
11
+
12
+ from models import MotionTransformer
13
+ from utils.get_opt import get_opt
14
+ from utils.word_vectorizer import POS_enumerator, WordVectorizer
15
+
16
+ from .evaluator_models import *
17
+ from .utils import drop_shapes_from_motion_arr
18
+
19
+
20
+ class EvaluationDataset(Dataset):
21
+
22
+ def __init__(self, opt, trainer, dataset, w_vectorizer, mm_num_samples, mm_num_repeats):
23
+ assert mm_num_samples < len(dataset)
24
+ print(opt.model_dir)
25
+
26
+ dataloader = DataLoader(dataset, batch_size=1, num_workers=1, shuffle=True)
27
+ epoch, it = trainer.load(pjoin(opt.model_dir, opt.which_epoch + '.tar'))
28
+
29
+ generated_motion = []
30
+ min_mov_length = 10 if opt.dataset_name == 't2m' else 6
31
+
32
+ trainer.eval_mode()
33
+ trainer.to(opt.device)
34
+
35
+ # Pre-process all target captions
36
+ mm_generated_motions = []
37
+ mm_idxs = np.random.choice(len(dataset), mm_num_samples, replace=False)
38
+ mm_idxs = np.sort(mm_idxs)
39
+ all_caption = []
40
+ all_m_lens = []
41
+ all_data = []
42
+ with torch.no_grad():
43
+ for i, data in tqdm(enumerate(dataloader)):
44
+ word_emb, pos_ohot, caption, cap_lens, motions, m_lens, tokens = data
45
+ all_data.append(data)
46
+ tokens = tokens[0].split('_')
47
+ mm_num_now = len(mm_generated_motions)
48
+ is_mm = True if ((mm_num_now < mm_num_samples) and (i == mm_idxs[mm_num_now])) else False
49
+ repeat_times = mm_num_repeats if is_mm else 1
50
+ m_lens = max(m_lens // opt.unit_length * opt.unit_length, min_mov_length * opt.unit_length)
51
+ m_lens = min(m_lens, opt.max_motion_length)
52
+ if isinstance(m_lens, int):
53
+ m_lens = torch.LongTensor([m_lens]).to(opt.device)
54
+ else:
55
+ m_lens = m_lens.to(opt.device)
56
+ for t in range(repeat_times):
57
+ all_m_lens.append(m_lens)
58
+ all_caption.extend(caption)
59
+ if is_mm:
60
+ mm_generated_motions.append(0)
61
+ all_m_lens = torch.stack(all_m_lens)
62
+
63
+ # Generate all sequences
64
+ with torch.no_grad():
65
+ all_pred_motions = trainer.generate(all_caption, all_m_lens, opt.dim_pose)
66
+
67
+ cur_idx = 0
68
+ mm_generated_motions = []
69
+ with torch.no_grad():
70
+ for i, data_dummy in tqdm(enumerate(dataloader)):
71
+ data = all_data[i]
72
+ word_emb, pos_ohot, caption, cap_lens, motions, m_lens, tokens = data
73
+ tokens = tokens[0].split('_')
74
+ mm_num_now = len(mm_generated_motions)
75
+ is_mm = True if ((mm_num_now < mm_num_samples) and (i == mm_idxs[mm_num_now])) else False
76
+ repeat_times = mm_num_repeats if is_mm else 1
77
+ mm_motions = []
78
+ m_lens = max(m_lens // opt.unit_length * opt.unit_length, min_mov_length * opt.unit_length)
79
+ m_lens = min(m_lens, opt.max_motion_length)
80
+ if isinstance(m_lens, int):
81
+ m_lens = torch.LongTensor([m_lens]).to(opt.device)
82
+ else:
83
+ m_lens = m_lens.to(opt.device)
84
+ for t in range(repeat_times):
85
+ m_len = m_lens[0].item()
86
+ pred_motions = all_pred_motions[cur_idx][:m_lens[0].item()]
87
+ assert pred_motions.shape[0] == m_lens[0].item()
88
+ cur_idx += 1
89
+ if t == 0:
90
+ sub_dict = {'motion': pred_motions.cpu().numpy(),
91
+ 'length': pred_motions.shape[0],
92
+ 'caption': caption[0],
93
+ 'cap_len': cap_lens[0].item(),
94
+ 'tokens': tokens}
95
+ generated_motion.append(sub_dict)
96
+
97
+ if is_mm:
98
+ mm_motions.append({
99
+ 'motion': pred_motions.cpu().numpy(),
100
+ 'length': m_lens[0].item()
101
+ })
102
+ if is_mm:
103
+ mm_generated_motions.append({'caption': caption[0],
104
+ 'tokens': tokens,
105
+ 'cap_len': cap_lens[0].item(),
106
+ 'mm_motions': mm_motions})
107
+ self.generated_motion = generated_motion
108
+ self.mm_generated_motion = mm_generated_motions
109
+ self.opt = opt
110
+ self.w_vectorizer = w_vectorizer
111
+
112
+
113
+ def __len__(self):
114
+ return len(self.generated_motion)
115
+
116
+
117
+ def __getitem__(self, item):
118
+ data = self.generated_motion[item]
119
+ motion, m_length, caption, tokens = data['motion'], data['length'], data['caption'], data['tokens']
120
+ sent_len = data['cap_len']
121
+ pos_one_hots = []
122
+ word_embeddings = []
123
+ for token in tokens:
124
+ word_emb, pos_oh = self.w_vectorizer[token]
125
+ pos_one_hots.append(pos_oh[None, :])
126
+ word_embeddings.append(word_emb[None, :])
127
+ pos_one_hots = np.concatenate(pos_one_hots, axis=0)
128
+ word_embeddings = np.concatenate(word_embeddings, axis=0)
129
+
130
+ if m_length < self.opt.max_motion_length:
131
+ motion = np.concatenate([motion,
132
+ np.zeros((self.opt.max_motion_length - m_length, motion.shape[1]))
133
+ ], axis=0)
134
+ return word_embeddings, pos_one_hots, caption, sent_len, motion, m_length, '_'.join(tokens)
135
+
136
+
137
+ def collate_fn(batch):
138
+ batch.sort(key=lambda x: x[3], reverse=True)
139
+ return default_collate(batch)
140
+
141
+
142
+ '''For training the text-motion matching model, and for evaluations'''
143
+ class Text2MotionDatasetV2(Dataset):
144
+ def __init__(self, opt, mean, std, split_file, w_vectorizer):
145
+ self.opt = opt
146
+ self.w_vectorizer = w_vectorizer
147
+ self.max_length = 20
148
+ self.pointer = 0
149
+ self.max_motion_length = opt.max_motion_length
150
+ min_motion_len = 40 if self.opt.dataset_name =='t2m' else 24
151
+
152
+ data_dict = {}
153
+ id_list = []
154
+ with cs.open(split_file, 'r') as f:
155
+ for line in f.readlines():
156
+ id_list.append(line.strip())
157
+
158
+ new_name_list = []
159
+ length_list = []
160
+ for name in tqdm(id_list):
161
+ try:
162
+ print(f"attempting to load motion for {name} at {pjoin(opt.motion_dir, name + '.npy')}")
163
+ motion = np.load(pjoin(opt.motion_dir, name + '.npy'))
164
+ if self.opt.dataset_name.lower() == 'grab':
165
+ motion = drop_shapes_from_motion_arr(motion)
166
+ assert motion.shape[-1] == opt.dim_pose, f"motion shape {motion.shape} does not match dim_pose {opt.dim_pose}"
167
+ print(f"grab motion shape: {motion.shape}")
168
+ print(f"len of motion: {len(motion)}")
169
+ # TODO (elmc): verify we don't need this for GRAB data
170
+ # if (len(motion)) < min_motion_len or (len(motion) >= 200):
171
+ # continue
172
+ text_data = []
173
+ flag = False
174
+ with cs.open(pjoin(opt.text_dir, name + '.txt')) as f:
175
+ for line in f.readlines():
176
+ text_dict = {}
177
+ line_split = line.strip().split('#')
178
+ caption = line_split[0]
179
+ f_tag = 0.0
180
+ to_tag = 0.0
181
+ # TODO (elmc): add actual tokens back for grab
182
+ tokens = []
183
+ if self.opt.dataset_name.lower() != 'grab':
184
+ tokens = line_split[1].split(' ')
185
+ f_tag = float(line_split[2])
186
+ to_tag = float(line_split[3])
187
+ f_tag = 0.0 if np.isnan(f_tag) else f_tag
188
+ to_tag = 0.0 if np.isnan(to_tag) else to_tag
189
+
190
+ text_dict['caption'] = caption
191
+ text_dict['tokens'] = tokens
192
+ if f_tag == 0.0 and to_tag == 0.0:
193
+ flag = True
194
+ text_data.append(text_dict)
195
+ else:
196
+ n_motion = motion[int(f_tag*20) : int(to_tag*20)]
197
+ if (len(n_motion)) < min_motion_len or (len(n_motion) >= 200):
198
+ continue
199
+ new_name = random.choice('ABCDEFGHIJKLMNOPQRSTUVW') + '_' + name
200
+ while new_name in data_dict:
201
+ new_name = random.choice('ABCDEFGHIJKLMNOPQRSTUVW') + '_' + name
202
+ data_dict[new_name] = {'motion': n_motion,
203
+ 'length': len(n_motion),
204
+ 'text':[text_dict]}
205
+ new_name_list.append(new_name)
206
+ length_list.append(len(n_motion))
207
+
208
+ if flag:
209
+ data_dict[name] = {'motion': motion,
210
+ 'length': len(motion),
211
+ 'text':text_data}
212
+ new_name_list.append(name)
213
+ length_list.append(len(motion))
214
+ except Exception as e:
215
+ # Some motion may not exist in KIT dataset
216
+ print(f"failed to load motion for {name} at {pjoin(opt.motion_dir, name + '.npy')} due to {e}")
217
+ pass
218
+
219
+ if not new_name_list or not length_list:
220
+ raise ValueError(f'No data loaded, new_name_list has len {len(new_name_list)} and length_list has len {len(length_list)}')
221
+ name_list, length_list = zip(*sorted(zip(new_name_list, length_list), key=lambda x: x[1]))
222
+ print(f"LOADED length of name_list: {len(name_list)}")
223
+ self.mean = mean
224
+ self.std = std
225
+ self.length_arr = np.array(length_list)
226
+ self.data_dict = data_dict
227
+ self.name_list = name_list
228
+ # TODO (elmc): so.... V2 is same as V1 but has reset_max_len??
229
+ self.reset_max_len(self.max_length)
230
+
231
+ def reset_max_len(self, length):
232
+ assert length <= self.max_motion_length
233
+ self.pointer = np.searchsorted(self.length_arr, length)
234
+ print("Pointer Pointing at %d"%self.pointer)
235
+ self.max_length = length
236
+
237
+ def inv_transform(self, data):
238
+ return data * self.std + self.mean
239
+
240
+ def __len__(self):
241
+ return len(self.data_dict) - self.pointer
242
+
243
+ def __getitem__(self, item):
244
+ idx = self.pointer + item
245
+ data = self.data_dict[self.name_list[idx]]
246
+ motion, m_length, text_list = data['motion'], data['length'], data['text']
247
+ # Randomly select a caption
248
+ text_data = random.choice(text_list)
249
+ caption, tokens = text_data['caption'], text_data['tokens']
250
+
251
+ if len(tokens) < self.opt.max_text_len:
252
+ # pad with "unk"
253
+ tokens = ['sos/OTHER'] + tokens + ['eos/OTHER']
254
+ sent_len = len(tokens)
255
+ tokens = tokens + ['unk/OTHER'] * (self.opt.max_text_len + 2 - sent_len)
256
+ else:
257
+ # crop
258
+ tokens = tokens[:self.opt.max_text_len]
259
+ tokens = ['sos/OTHER'] + tokens + ['eos/OTHER']
260
+ sent_len = len(tokens)
261
+ pos_one_hots = []
262
+ word_embeddings = []
263
+ for token in tokens:
264
+ word_emb, pos_oh = self.w_vectorizer[token]
265
+ pos_one_hots.append(pos_oh[None, :])
266
+ word_embeddings.append(word_emb[None, :])
267
+ pos_one_hots = np.concatenate(pos_one_hots, axis=0)
268
+ word_embeddings = np.concatenate(word_embeddings, axis=0)
269
+
270
+ # Crop the motions into multiples of 4, and introduce small variations
271
+ if self.opt.unit_length < 10:
272
+ coin2 = np.random.choice(['single', 'single', 'double'])
273
+ else:
274
+ coin2 = 'single'
275
+
276
+ if coin2 == 'double':
277
+ m_length = (m_length // self.opt.unit_length - 1) * self.opt.unit_length
278
+ elif coin2 == 'single':
279
+ m_length = (m_length // self.opt.unit_length) * self.opt.unit_length
280
+ idx = random.randint(0, len(motion) - m_length)
281
+ motion = motion[idx:idx+m_length]
282
+
283
+ "Z Normalization"
284
+ motion = (motion - self.mean) / self.std
285
+
286
+ if m_length < self.max_motion_length:
287
+ motion = np.concatenate([motion,
288
+ np.zeros((self.max_motion_length - m_length, motion.shape[1]))
289
+ ], axis=0)
290
+ return word_embeddings, pos_one_hots, caption, sent_len, motion, m_length, '_'.join(tokens)
291
+
292
+
293
+ def get_dataset_motion_loader(opt_path, batch_size, device):
294
+ opt = get_opt(opt_path, device)
295
+
296
+ # Configurations of the T2M and KIT datasets are almost the same
297
+ if opt.dataset_name == 't2m' or opt.dataset_name == 'kit' or opt.dataset_name == 'grab':
298
+ print('Loading dataset %s ...' % opt.dataset_name)
299
+
300
+ mean_path = pjoin(opt.meta_dir, 'mean.npy')
301
+ std_path = pjoin(opt.meta_dir, 'std.npy')
302
+ if not os.path.exists(mean_path):
303
+ mean = np.zeros(opt.dim_pose)
304
+ else:
305
+ mean = np.load(pjoin(opt.meta_dir, 'mean.npy'))
306
+ if not os.path.exists(std_path):
307
+ std = np.ones(opt.dim_pose)
308
+ else:
309
+ std = np.load(pjoin(opt.meta_dir, 'std.npy'))
310
+
311
+ # get GloVe data by following the instructions here
312
+ # https://github.com/mingyuan-zhang/MotionDiffuse/blob/main/text2motion/install.md#data-preparation
313
+ w_vectorizer = WordVectorizer('./data/glove', 'our_vab')
314
+ split_file = pjoin(opt.data_root, 'test.txt')
315
+ dataset = Text2MotionDatasetV2(opt, mean, std, split_file, w_vectorizer)
316
+ dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=4, drop_last=True,
317
+ collate_fn=collate_fn, shuffle=True)
318
+ else:
319
+ raise KeyError('Dataset not Recognized !!')
320
+
321
+ print('Ground Truth Dataset Loading Completed!!!')
322
+ return dataloader, dataset
323
+
324
+
325
+ class MMGeneratedDataset(Dataset):
326
+ def __init__(self, opt, motion_dataset, w_vectorizer):
327
+ self.opt = opt
328
+ self.dataset = motion_dataset.mm_generated_motion
329
+ self.w_vectorizer = w_vectorizer
330
+
331
+ def __len__(self):
332
+ return len(self.dataset)
333
+
334
+ def __getitem__(self, item):
335
+ data = self.dataset[item]
336
+ mm_motions = data['mm_motions']
337
+ m_lens = []
338
+ motions = []
339
+ for mm_motion in mm_motions:
340
+ m_lens.append(mm_motion['length'])
341
+ motion = mm_motion['motion']
342
+ if len(motion) < self.opt.max_motion_length:
343
+ motion = np.concatenate([motion,
344
+ np.zeros((self.opt.max_motion_length - len(motion), motion.shape[1]))
345
+ ], axis=0)
346
+ motion = motion[None, :]
347
+ motions.append(motion)
348
+ m_lens = np.array(m_lens, dtype=int)
349
+ motions = np.concatenate(motions, axis=0)
350
+ sort_indx = np.argsort(m_lens)[::-1].copy()
351
+ # print(m_lens)
352
+ # print(sort_indx)
353
+ # print(m_lens[sort_indx])
354
+ m_lens = m_lens[sort_indx]
355
+ motions = motions[sort_indx]
356
+ return motions, m_lens
357
+
358
+
359
+
360
+ def get_motion_loader(opt, batch_size, trainer, ground_truth_dataset, mm_num_samples, mm_num_repeats):
361
+
362
+ # Currently the configurations of two datasets are almost the same
363
+ if opt.dataset_name == 't2m' or opt.dataset_name == 'kit':
364
+ w_vectorizer = WordVectorizer('./data/glove', 'our_vab')
365
+ else:
366
+ raise KeyError('Dataset not recognized!!')
367
+ print('Generating %s ...' % opt.name)
368
+
369
+ dataset = EvaluationDataset(opt, trainer, ground_truth_dataset, w_vectorizer, mm_num_samples, mm_num_repeats)
370
+ mm_dataset = MMGeneratedDataset(opt, dataset, w_vectorizer)
371
+
372
+ motion_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn, drop_last=True, num_workers=4)
373
+ mm_motion_loader = DataLoader(mm_dataset, batch_size=1, num_workers=1)
374
+
375
+ print('Generated Dataset Loading Completed!!!')
376
+
377
+ return motion_loader, mm_motion_loader
378
+
379
+
380
+ def build_models(opt):
381
+ movement_enc = MovementConvEncoder(opt.dim_pose-4, opt.dim_movement_enc_hidden, opt.dim_movement_latent)
382
+ text_enc = TextEncoderBiGRUCo(word_size=opt.dim_word,
383
+ pos_size=opt.dim_pos_ohot,
384
+ hidden_size=opt.dim_text_hidden,
385
+ output_size=opt.dim_coemb_hidden,
386
+ device=opt.device)
387
+
388
+ motion_enc = MotionEncoderBiGRUCo(input_size=opt.dim_movement_latent,
389
+ hidden_size=opt.dim_motion_hidden,
390
+ output_size=opt.dim_coemb_hidden,
391
+ device=opt.device)
392
+
393
+ checkpoint = torch.load(pjoin('data/pretrained_models', opt.dataset_name, 'text_mot_match', 'model', 'finest.tar'),
394
+ map_location=opt.device)
395
+ movement_enc.load_state_dict(checkpoint['movement_encoder'])
396
+ text_enc.load_state_dict(checkpoint['text_encoder'])
397
+ motion_enc.load_state_dict(checkpoint['motion_encoder'])
398
+ print('Loading Evaluation Model Wrapper (Epoch %d) Completed!!' % (checkpoint['epoch']))
399
+ return text_enc, motion_enc, movement_enc
400
+
401
+
402
+ class EvaluatorModelWrapper(object):
403
+
404
+ def __init__(self, opt):
405
+
406
+ if opt.dataset_name == 't2m':
407
+ opt.dim_pose = 263
408
+ elif opt.dataset_name == 'kit':
409
+ opt.dim_pose = 251
410
+ elif opt.dataset_name == 'grab':
411
+ opt.dim_pose = 212
412
+ else:
413
+ raise KeyError('Dataset not Recognized!!!')
414
+
415
+ opt.dim_word = 300
416
+ opt.max_motion_length = 196
417
+ opt.dim_pos_ohot = len(POS_enumerator)
418
+ opt.dim_motion_hidden = 1024
419
+ opt.max_text_len = 20
420
+ opt.dim_text_hidden = 512
421
+ opt.dim_coemb_hidden = 512
422
+
423
+ self.text_encoder, self.motion_encoder, self.movement_encoder = build_models(opt)
424
+ self.opt = opt
425
+ self.device = opt.device
426
+
427
+ self.text_encoder.to(opt.device)
428
+ self.motion_encoder.to(opt.device)
429
+ self.movement_encoder.to(opt.device)
430
+
431
+ self.text_encoder.eval()
432
+ self.motion_encoder.eval()
433
+ self.movement_encoder.eval()
434
+
435
+ # Please note that the results do not follow the order of the inputs
436
+ def get_co_embeddings(self, word_embs, pos_ohot, cap_lens, motions, m_lens):
437
+ with torch.no_grad():
438
+ word_embs = word_embs.detach().to(self.device).float()
439
+ pos_ohot = pos_ohot.detach().to(self.device).float()
440
+ motions = motions.detach().to(self.device).float()
441
+
442
+ align_idx = np.argsort(m_lens.data.tolist())[::-1].copy()
443
+ motions = motions[align_idx]
444
+ m_lens = m_lens[align_idx]
445
+
446
+ '''Movement Encoding'''
447
+ movements = self.movement_encoder(motions[..., :-4]).detach()
448
+ m_lens = m_lens // self.opt.unit_length
449
+ motion_embedding = self.motion_encoder(movements, m_lens)
450
+
451
+ '''Text Encoding'''
452
+ text_embedding = self.text_encoder(word_embs, pos_ohot, cap_lens)
453
+ text_embedding = text_embedding[align_idx]
454
+ return text_embedding, motion_embedding
455
+
456
+ # Please note that the results do not follow the order of the inputs
457
+ def get_motion_embeddings(self, motions, m_lens):
458
+ with torch.no_grad():
459
+ motions = motions.detach().to(self.device).float()
460
+
461
+ align_idx = np.argsort(m_lens.data.tolist())[::-1].copy()
462
+ motions = motions[align_idx]
463
+ m_lens = m_lens[align_idx]
464
+
465
+ '''Movement Encoding'''
466
+ movements = self.movement_encoder(motions[..., :-4]).detach()
467
+ m_lens = m_lens // self.opt.unit_length
468
+ motion_embedding = self.motion_encoder(movements, m_lens)
469
+ return motion_embedding
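get_co_embeddings and get_motion_embeddings above reorder each batch by descending length before encoding, since pack_padded_sequence inside the GRU encoders expects that order; a small self-contained illustration of the index trick (the shapes are arbitrary):

import numpy as np
import torch

m_lens = torch.tensor([40, 196, 88])
motions = torch.randn(3, 196, 212)
align_idx = np.argsort(m_lens.data.tolist())[::-1].copy()  # descending by length: [1, 2, 0]
motions, m_lens = motions[align_idx], m_lens[align_idx]
print(m_lens.tolist())  # [196, 88, 40] -- hence the results do not follow the order of the inputs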
text2motion/datasets/evaluator_models.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ import time
5
+ import math
6
+ from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
7
+ # from networks.layers import *
8
+ import torch.nn.functional as F
9
+
10
+
11
+ class ContrastiveLoss(torch.nn.Module):
12
+ """
13
+ Contrastive loss function.
14
+ Based on: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
15
+ """
16
+ def __init__(self, margin=3.0):
17
+ super(ContrastiveLoss, self).__init__()
18
+ self.margin = margin
19
+
20
+ def forward(self, output1, output2, label):
21
+ euclidean_distance = F.pairwise_distance(output1, output2, keepdim=True)
22
+ loss_contrastive = torch.mean((1-label) * torch.pow(euclidean_distance, 2) +
23
+ (label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
24
+ return loss_contrastive
25
+
26
+
27
+ def init_weight(m):
28
+ if isinstance(m, nn.Conv1d) or isinstance(m, nn.Linear) or isinstance(m, nn.ConvTranspose1d):
29
+ nn.init.xavier_normal_(m.weight)
30
+ # m.bias.data.fill_(0.01)
31
+ if m.bias is not None:
32
+ nn.init.constant_(m.bias, 0)
33
+
34
+
35
+ def reparameterize(mu, logvar):
36
+ s_var = logvar.mul(0.5).exp_()
37
+ eps = s_var.data.new(s_var.size()).normal_()
38
+ return eps.mul(s_var).add_(mu)
39
+
40
+
41
+ # batch_size, dimension and position
42
+ # output: (batch_size, dim)
43
+ def positional_encoding(batch_size, dim, pos):
44
+ assert batch_size == pos.shape[0]
45
+ positions_enc = np.array([
46
+ [pos[j] / np.power(10000, (i-i%2)/dim) for i in range(dim)]
47
+ for j in range(batch_size)
48
+ ], dtype=np.float32)
49
+ positions_enc[:, 0::2] = np.sin(positions_enc[:, 0::2])
50
+ positions_enc[:, 1::2] = np.cos(positions_enc[:, 1::2])
51
+ return torch.from_numpy(positions_enc).float()
52
+
53
+
54
+ def get_padding_mask(batch_size, seq_len, cap_lens):
55
+ cap_lens = cap_lens.data.tolist()
56
+ mask_2d = torch.ones((batch_size, seq_len, seq_len), dtype=torch.float32)
57
+ for i, cap_len in enumerate(cap_lens):
58
+ mask_2d[i, :, :cap_len] = 0
59
+ return mask_2d.bool(), 1 - mask_2d[:, :, 0].clone()
60
+
61
+
62
+ class PositionalEncoding(nn.Module):
63
+
64
+ def __init__(self, d_model, max_len=300):
65
+ super(PositionalEncoding, self).__init__()
66
+
67
+ pe = torch.zeros(max_len, d_model)
68
+ position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
69
+ div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
70
+ pe[:, 0::2] = torch.sin(position * div_term)
71
+ pe[:, 1::2] = torch.cos(position * div_term)
72
+ # pe = pe.unsqueeze(0).transpose(0, 1)
73
+ self.register_buffer('pe', pe)
74
+
75
+ def forward(self, pos):
76
+ return self.pe[pos]
77
+
78
+
79
+ class MovementConvEncoder(nn.Module):
80
+ def __init__(self, input_size, hidden_size, output_size):
81
+ super(MovementConvEncoder, self).__init__()
82
+ self.main = nn.Sequential(
83
+ nn.Conv1d(input_size, hidden_size, 4, 2, 1),
84
+ nn.Dropout(0.2, inplace=True),
85
+ nn.LeakyReLU(0.2, inplace=True),
86
+ nn.Conv1d(hidden_size, output_size, 4, 2, 1),
87
+ nn.Dropout(0.2, inplace=True),
88
+ nn.LeakyReLU(0.2, inplace=True),
89
+ )
90
+ self.out_net = nn.Linear(output_size, output_size)
91
+ self.main.apply(init_weight)
92
+ self.out_net.apply(init_weight)
93
+
94
+ def forward(self, inputs):
95
+ inputs = inputs.permute(0, 2, 1)
96
+ outputs = self.main(inputs).permute(0, 2, 1)
97
+ # print(outputs.shape)
98
+ return self.out_net(outputs)
99
+
100
+
101
+ class MovementConvDecoder(nn.Module):
102
+ def __init__(self, input_size, hidden_size, output_size):
103
+ super(MovementConvDecoder, self).__init__()
104
+ self.main = nn.Sequential(
105
+ nn.ConvTranspose1d(input_size, hidden_size, 4, 2, 1),
106
+ # nn.Dropout(0.2, inplace=True),
107
+ nn.LeakyReLU(0.2, inplace=True),
108
+ nn.ConvTranspose1d(hidden_size, output_size, 4, 2, 1),
109
+ # nn.Dropout(0.2, inplace=True),
110
+ nn.LeakyReLU(0.2, inplace=True),
111
+ )
112
+ self.out_net = nn.Linear(output_size, output_size)
113
+
114
+ self.main.apply(init_weight)
115
+ self.out_net.apply(init_weight)
116
+
117
+ def forward(self, inputs):
118
+ inputs = inputs.permute(0, 2, 1)
119
+ outputs = self.main(inputs).permute(0, 2, 1)
120
+ return self.out_net(outputs)
121
+
122
+
123
+ class TextVAEDecoder(nn.Module):
124
+ def __init__(self, text_size, input_size, output_size, hidden_size, n_layers):
125
+ super(TextVAEDecoder, self).__init__()
126
+ self.input_size = input_size
127
+ self.output_size = output_size
128
+ self.hidden_size = hidden_size
129
+ self.n_layers = n_layers
130
+ self.emb = nn.Sequential(
131
+ nn.Linear(input_size, hidden_size),
132
+ nn.LayerNorm(hidden_size),
133
+ nn.LeakyReLU(0.2, inplace=True))
134
+
135
+ self.z2init = nn.Linear(text_size, hidden_size * n_layers)
136
+ self.gru = nn.ModuleList([nn.GRUCell(hidden_size, hidden_size) for i in range(self.n_layers)])
137
+ self.positional_encoder = PositionalEncoding(hidden_size)
138
+
139
+
140
+ self.output = nn.Sequential(
141
+ nn.Linear(hidden_size, hidden_size),
142
+ nn.LayerNorm(hidden_size),
143
+ nn.LeakyReLU(0.2, inplace=True),
144
+ nn.Linear(hidden_size, output_size)
145
+ )
146
+
147
+ #
148
+ # self.output = nn.Sequential(
149
+ # nn.Linear(hidden_size, hidden_size),
150
+ # nn.LayerNorm(hidden_size),
151
+ # nn.LeakyReLU(0.2, inplace=True),
152
+ # nn.Linear(hidden_size, output_size-4)
153
+ # )
154
+
155
+ # self.contact_net = nn.Sequential(
156
+ # nn.Linear(output_size-4, 64),
157
+ # nn.LayerNorm(64),
158
+ # nn.LeakyReLU(0.2, inplace=True),
159
+ # nn.Linear(64, 4)
160
+ # )
161
+
162
+ self.output.apply(init_weight)
163
+ self.emb.apply(init_weight)
164
+ self.z2init.apply(init_weight)
165
+ # self.contact_net.apply(init_weight)
166
+
167
+ def get_init_hidden(self, latent):
168
+ hidden = self.z2init(latent)
169
+ hidden = torch.split(hidden, self.hidden_size, dim=-1)
170
+ return list(hidden)
171
+
172
+ def forward(self, inputs, last_pred, hidden, p):
173
+ h_in = self.emb(inputs)
174
+ pos_enc = self.positional_encoder(p).to(inputs.device).detach()
175
+ h_in = h_in + pos_enc
176
+ for i in range(self.n_layers):
177
+ # print(h_in.shape)
178
+ hidden[i] = self.gru[i](h_in, hidden[i])
179
+ h_in = hidden[i]
180
+ pose_pred = self.output(h_in)
181
+ # pose_pred = self.output(h_in) + last_pred.detach()
182
+ # contact = self.contact_net(pose_pred)
183
+ # return torch.cat([pose_pred, contact], dim=-1), hidden
184
+ return pose_pred, hidden
185
+
186
+
187
+ class TextDecoder(nn.Module):
188
+ def __init__(self, text_size, input_size, output_size, hidden_size, n_layers):
189
+ super(TextDecoder, self).__init__()
190
+ self.input_size = input_size
191
+ self.output_size = output_size
192
+ self.hidden_size = hidden_size
193
+ self.n_layers = n_layers
194
+ self.emb = nn.Sequential(
195
+ nn.Linear(input_size, hidden_size),
196
+ nn.LayerNorm(hidden_size),
197
+ nn.LeakyReLU(0.2, inplace=True))
198
+
199
+ self.gru = nn.ModuleList([nn.GRUCell(hidden_size, hidden_size) for i in range(self.n_layers)])
200
+ self.z2init = nn.Linear(text_size, hidden_size * n_layers)
201
+ self.positional_encoder = PositionalEncoding(hidden_size)
202
+
203
+ self.mu_net = nn.Linear(hidden_size, output_size)
204
+ self.logvar_net = nn.Linear(hidden_size, output_size)
205
+
206
+ self.emb.apply(init_weight)
207
+ self.z2init.apply(init_weight)
208
+ self.mu_net.apply(init_weight)
209
+ self.logvar_net.apply(init_weight)
210
+
211
+ def get_init_hidden(self, latent):
212
+
213
+ hidden = self.z2init(latent)
214
+ hidden = torch.split(hidden, self.hidden_size, dim=-1)
215
+
216
+ return list(hidden)
217
+
218
+ def forward(self, inputs, hidden, p):
219
+ # print(inputs.shape)
220
+ x_in = self.emb(inputs)
221
+ pos_enc = self.positional_encoder(p).to(inputs.device).detach()
222
+ x_in = x_in + pos_enc
223
+
224
+ for i in range(self.n_layers):
225
+ hidden[i] = self.gru[i](x_in, hidden[i])
226
+ h_in = hidden[i]
227
+ mu = self.mu_net(h_in)
228
+ logvar = self.logvar_net(h_in)
229
+ z = reparameterize(mu, logvar)
230
+ return z, mu, logvar, hidden
231
+
232
+ class AttLayer(nn.Module):
233
+ def __init__(self, query_dim, key_dim, value_dim):
234
+ super(AttLayer, self).__init__()
235
+ self.W_q = nn.Linear(query_dim, value_dim)
236
+ self.W_k = nn.Linear(key_dim, value_dim, bias=False)
237
+ self.W_v = nn.Linear(key_dim, value_dim)
238
+
239
+ self.softmax = nn.Softmax(dim=1)
240
+ self.dim = value_dim
241
+
242
+ self.W_q.apply(init_weight)
243
+ self.W_k.apply(init_weight)
244
+ self.W_v.apply(init_weight)
245
+
246
+ def forward(self, query, key_mat):
247
+ '''
248
+ query (batch, query_dim)
249
+ key (batch, seq_len, key_dim)
250
+ '''
251
+ # print(query.shape)
252
+ query_vec = self.W_q(query).unsqueeze(-1) # (batch, value_dim, 1)
253
+ val_set = self.W_v(key_mat) # (batch, seq_len, value_dim)
254
+ key_set = self.W_k(key_mat) # (batch, seq_len, value_dim)
255
+
256
+ weights = torch.matmul(key_set, query_vec) / np.sqrt(self.dim)
257
+
258
+ co_weights = self.softmax(weights) # (batch, seq_len, 1)
259
+ values = val_set * co_weights # (batch, seq_len, value_dim)
260
+ pred = values.sum(dim=1) # (batch, value_dim)
261
+ return pred, co_weights
262
+
263
+ def short_cut(self, querys, keys):
264
+ return self.W_q(querys), self.W_k(keys)
265
+
266
+
267
+ class TextEncoderBiGRU(nn.Module):
268
+ def __init__(self, word_size, pos_size, hidden_size, device):
269
+ super(TextEncoderBiGRU, self).__init__()
270
+ self.device = device
271
+
272
+ self.pos_emb = nn.Linear(pos_size, word_size)
273
+ self.input_emb = nn.Linear(word_size, hidden_size)
274
+ self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True)
275
+ # self.linear2 = nn.Linear(hidden_size, output_size)
276
+
277
+ self.input_emb.apply(init_weight)
278
+ self.pos_emb.apply(init_weight)
279
+ # self.linear2.apply(init_weight)
280
+ # self.batch_size = batch_size
281
+ self.hidden_size = hidden_size
282
+ self.hidden = nn.Parameter(torch.randn((2, 1, self.hidden_size), requires_grad=True))
283
+
284
+ # input(batch_size, seq_len, dim)
285
+ def forward(self, word_embs, pos_onehot, cap_lens):
286
+ num_samples = word_embs.shape[0]
287
+
288
+ pos_embs = self.pos_emb(pos_onehot)
289
+ inputs = word_embs + pos_embs
290
+ input_embs = self.input_emb(inputs)
291
+ hidden = self.hidden.repeat(1, num_samples, 1)
292
+
293
+ cap_lens = cap_lens.data.tolist()
294
+ emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True)
295
+
296
+ gru_seq, gru_last = self.gru(emb, hidden)
297
+
298
+ gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1)
299
+ gru_seq = pad_packed_sequence(gru_seq, batch_first=True)[0]
300
+ forward_seq = gru_seq[..., :self.hidden_size]
301
+ backward_seq = gru_seq[..., self.hidden_size:].clone()
302
+
303
+ # Concate the forward and backward word embeddings
304
+ for i, length in enumerate(cap_lens):
305
+ backward_seq[i:i+1, :length] = torch.flip(backward_seq[i:i+1, :length].clone(), dims=[1])
306
+ gru_seq = torch.cat([forward_seq, backward_seq], dim=-1)
307
+
308
+ return gru_seq, gru_last
309
+
310
+
311
+ class TextEncoderBiGRUCo(nn.Module):
312
+ def __init__(self, word_size, pos_size, hidden_size, output_size, device):
313
+ super(TextEncoderBiGRUCo, self).__init__()
314
+ self.device = device
315
+
316
+ self.pos_emb = nn.Linear(pos_size, word_size)
317
+ self.input_emb = nn.Linear(word_size, hidden_size)
318
+ self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True)
319
+ self.output_net = nn.Sequential(
320
+ nn.Linear(hidden_size * 2, hidden_size),
321
+ nn.LayerNorm(hidden_size),
322
+ nn.LeakyReLU(0.2, inplace=True),
323
+ nn.Linear(hidden_size, output_size)
324
+ )
325
+
326
+ self.input_emb.apply(init_weight)
327
+ self.pos_emb.apply(init_weight)
328
+ self.output_net.apply(init_weight)
329
+ # self.linear2.apply(init_weight)
330
+ # self.batch_size = batch_size
331
+ self.hidden_size = hidden_size
332
+ self.hidden = nn.Parameter(torch.randn((2, 1, self.hidden_size), requires_grad=True))
333
+
334
+ # input(batch_size, seq_len, dim)
335
+ def forward(self, word_embs, pos_onehot, cap_lens):
336
+ num_samples = word_embs.shape[0]
337
+
338
+ pos_embs = self.pos_emb(pos_onehot)
339
+ inputs = word_embs + pos_embs
340
+ input_embs = self.input_emb(inputs)
341
+ hidden = self.hidden.repeat(1, num_samples, 1)
342
+
343
+ cap_lens = cap_lens.data.tolist()
344
+ emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True)
345
+
346
+ gru_seq, gru_last = self.gru(emb, hidden)
347
+
348
+ gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1)
349
+
350
+ return self.output_net(gru_last)
351
+
352
+
353
+ class MotionEncoderBiGRUCo(nn.Module):
354
+ def __init__(self, input_size, hidden_size, output_size, device):
355
+ super(MotionEncoderBiGRUCo, self).__init__()
356
+ self.device = device
357
+
358
+ self.input_emb = nn.Linear(input_size, hidden_size)
359
+ self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True)
360
+ self.output_net = nn.Sequential(
361
+ nn.Linear(hidden_size*2, hidden_size),
362
+ nn.LayerNorm(hidden_size),
363
+ nn.LeakyReLU(0.2, inplace=True),
364
+ nn.Linear(hidden_size, output_size)
365
+ )
366
+
367
+ self.input_emb.apply(init_weight)
368
+ self.output_net.apply(init_weight)
369
+ self.hidden_size = hidden_size
370
+ self.hidden = nn.Parameter(torch.randn((2, 1, self.hidden_size), requires_grad=True))
371
+
372
+ # input(batch_size, seq_len, dim)
373
+ def forward(self, inputs, m_lens):
374
+ num_samples = inputs.shape[0]
375
+
376
+ input_embs = self.input_emb(inputs)
377
+ hidden = self.hidden.repeat(1, num_samples, 1)
378
+
379
+ cap_lens = m_lens.data.tolist()
380
+ emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True)
381
+
382
+ gru_seq, gru_last = self.gru(emb, hidden)
383
+
384
+ gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1)
385
+
386
+ return self.output_net(gru_last)
387
+
388
+
389
+ class MotionLenEstimatorBiGRU(nn.Module):
390
+ def __init__(self, word_size, pos_size, hidden_size, output_size):
391
+ super(MotionLenEstimatorBiGRU, self).__init__()
392
+
393
+ self.pos_emb = nn.Linear(pos_size, word_size)
394
+ self.input_emb = nn.Linear(word_size, hidden_size)
395
+ self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True)
396
+ nd = 512
397
+ self.output = nn.Sequential(
398
+ nn.Linear(hidden_size*2, nd),
399
+ nn.LayerNorm(nd),
400
+ nn.LeakyReLU(0.2, inplace=True),
401
+
402
+ nn.Linear(nd, nd // 2),
403
+ nn.LayerNorm(nd // 2),
404
+ nn.LeakyReLU(0.2, inplace=True),
405
+
406
+ nn.Linear(nd // 2, nd // 4),
407
+ nn.LayerNorm(nd // 4),
408
+ nn.LeakyReLU(0.2, inplace=True),
409
+
410
+ nn.Linear(nd // 4, output_size)
411
+ )
412
+ # self.linear2 = nn.Linear(hidden_size, output_size)
413
+
414
+ self.input_emb.apply(init_weight)
415
+ self.pos_emb.apply(init_weight)
416
+ self.output.apply(init_weight)
417
+ # self.linear2.apply(init_weight)
418
+ # self.batch_size = batch_size
419
+ self.hidden_size = hidden_size
420
+ self.hidden = nn.Parameter(torch.randn((2, 1, self.hidden_size), requires_grad=True))
421
+
422
+ # input(batch_size, seq_len, dim)
423
+ def forward(self, word_embs, pos_onehot, cap_lens):
424
+ num_samples = word_embs.shape[0]
425
+
426
+ pos_embs = self.pos_emb(pos_onehot)
427
+ inputs = word_embs + pos_embs
428
+ input_embs = self.input_emb(inputs)
429
+ hidden = self.hidden.repeat(1, num_samples, 1)
430
+
431
+ cap_lens = cap_lens.data.tolist()
432
+ emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True)
433
+
434
+ gru_seq, gru_last = self.gru(emb, hidden)
435
+
436
+ gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1)
437
+
438
+ return self.output(gru_last)
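A quick standalone check of the sinusoidal lookup table built by PositionalEncoding above (d_model=8 is just a small illustrative size; run from the text2motion directory so the import resolves):

import torch
from datasets.evaluator_models import PositionalEncoding

pe = PositionalEncoding(d_model=8, max_len=300)
enc = pe(torch.tensor([0, 1, 5]))  # forward is a plain index into the (max_len, d_model) table
print(enc.shape)                   # torch.Size([3, 8])
print(enc[0])                      # position 0 -> [0, 1, 0, 1, ...] since sin(0)=0 and cos(0)=1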
text2motion/datasets/mean_mesh.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import logging as log
4
+ import os
5
+ import time
6
+ from collections import defaultdict
7
+ from os.path import join as pjoin
8
+ from typing import Dict, Optional, Tuple
9
+
10
+ import imageio
11
+ import numpy as np
12
+ import pyrender
13
+ import smplx
14
+ import torch
15
+ import trimesh
16
+ from numpy.typing import ArrayLike
17
+ from torch import Tensor
18
+ from tqdm import tqdm
19
+
20
+ from .motionx_explorer import (NUM_FACIAL_EXPRESSION_DIMS,
21
+ calc_mean_stddev_pose, get_info_from_file,
22
+ label_code, motion_arr_to_dict, names_to_arrays,
23
+ to_smplx_dict)
24
+
25
+ log.basicConfig(
26
+ level=log.INFO,
27
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
28
+ )
29
+
30
+ def save_img(img, save_path):
31
+ imageio.imwrite(save_path, img)
32
+
33
+ # based on https://github.com/vchoutas/smplx/blob/main/examples/demo.py
34
+ # used to render one pose (not sequence of poses) e.g. to see the mean pose
35
+ def render_mesh(model, output, should_save=False, save_path=None):
36
+ should_display = not should_save
37
+ vertices = output.vertices.detach().cpu().numpy().squeeze()
38
+ # joint points not visualized for now
39
+ # joints = output.joints.detach().cpu().numpy().squeeze()
40
+ scene = pyrender.Scene()
41
+ if should_display:
42
+ viewer = pyrender.Viewer(scene, run_in_thread=True)
43
+
44
+ mesh_node = None
45
+ joints_node = None
46
+ # Rotation matrix (90 degrees around the X-axis)
47
+ rot = trimesh.transformations.rotation_matrix(np.radians(90), [1, 0, 0])
48
+ if should_save:
49
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
50
+ try:
51
+ # print("Vertices shape =", vertices.shape)
52
+ # print("Joints shape =", joints.shape)
53
+
54
+ # from their demo script
55
+ plotting_module = "pyrender"
56
+ if plotting_module == "pyrender":
57
+ vertex_colors = np.ones([vertices.shape[0], 4]) * [0.3, 0.3, 0.3, 0.8]
58
+ tri_mesh = trimesh.Trimesh(vertices, model.faces, vertex_colors=vertex_colors)
59
+
60
+ # Apply rotation
61
+ tri_mesh.apply_transform(rot)
62
+ ##### RENDER LOCK #####
63
+ if should_display:
64
+ viewer.render_lock.acquire()
65
+ if mesh_node:
66
+ scene.remove_node(mesh_node)
67
+ mesh = pyrender.Mesh.from_trimesh(tri_mesh)
68
+ mesh_node = scene.add(mesh)
69
+
70
+ camera = pyrender.PerspectiveCamera(yfov=np.pi / 3.0, aspectRatio=1.0)
71
+ min_bound, max_bound = mesh.bounds
72
+
73
+ # Calculate the center of the bounding box
74
+ center = (min_bound + max_bound) / 2
75
+
76
+ # Calculate the extents (the dimensions of the bounding box)
77
+ extents = max_bound - min_bound
78
+
79
+ # Estimate a suitable distance
80
+ distance = max(extents) * 2 # Adjust the multiplier as needed
81
+
82
+ # Create a camera pose matrix
83
+ cam_pose = np.array(
84
+ [
85
+ [1.0, 0, 0, center[0]],
86
+ [0, 1.0, 0, center[1]-1.0],
87
+ [0, 0, 1.0, center[2] + distance + 0.5],
88
+ [0, 0, 0, 1],
89
+ ]
90
+ )
91
+ # Rotate around X-axis
92
+ angle = np.radians(90)
93
+ cos_angle = np.cos(angle)
94
+ sin_angle = np.sin(angle)
95
+ rot_x = np.array([
96
+ [1, 0, 0, 0],
97
+ [0, cos_angle, -sin_angle, 0],
98
+ [0, sin_angle, cos_angle, 0],
99
+ [0, 0, 0, 1]
100
+ ])
101
+ cam_pose = np.matmul(cam_pose, rot_x)
102
+ # this is great pose, head on, but a bit far from face
103
+ # cam_pose[:3, 3] += np.array([0, 0, -3.5])
104
+ cam_pose[:3, 3] += np.array([-.01, 0.65, -3.3])
105
+
106
+ scene.add(camera, pose=cam_pose)
107
+
108
+ # Add light for better visualization
109
+ light = pyrender.DirectionalLight(color=np.ones(3), intensity=2.0)
110
+ scene.add(light, pose=cam_pose)
111
+
112
+ if should_save:
113
+ r = pyrender.OffscreenRenderer(viewport_width=640, viewport_height=480)
114
+ col_img, _ = r.render(scene)
115
+ save_img(col_img, save_path)
116
+ r.delete() # Free up the resources
117
+ ###### RENDER LOCK RELEASE #####
118
+ if should_display:
119
+ viewer.render_lock.release()
120
+ except KeyboardInterrupt:
121
+ if should_display:
122
+ viewer.close_external()
123
+
124
+ # motion_arr is 212 dims (no shapes: aka no betas and no face shapes)
125
+ def mesh_and_save(args, motion_arr, seq_name, model_name, emotion):
126
+ motion_dict = motion_arr_to_dict(motion_arr, shapes_dropped=True)
127
+ smplx_params = to_smplx_dict(motion_dict)
128
+ model_folder = "./models/smplx"
129
+ batch_size = 1
130
+ model = smplx.SMPLX(
131
+ model_folder,
132
+ use_pca=False, # our joints are not in pca space
133
+ num_expression_coeffs=NUM_FACIAL_EXPRESSION_DIMS,
134
+ batch_size=batch_size,
135
+ )
136
+ output = model.forward(**smplx_params, return_verts=True)
137
+ log.info(f"output size {output.vertices.shape}")
138
+ log.info(f"output size {output.joints.shape}")
139
+ log.info("rendering mesh")
140
+ base_file = args.file.split('.')[0]
141
+ # add {emotion}_{base_file} as a subfolder if it doesn't exist
142
+ subfolder = f"single_pose_imgs/{model_name}/{emotion}_{base_file}"
143
+ if not os.path.exists(subfolder):
144
+ os.makedirs(subfolder)
145
+ save_path = f"{subfolder}/{seq_name}_pose.png"
146
+ render_mesh(model, output, should_save=True, save_path=save_path)
147
+ log.warning(
148
+ "if you don't see the mesh animation, make sure you are running on graphics compatible DTU machine (vgl xterm)."
149
+ )
150
+ return subfolder
151
+
152
+ if __name__ == "__main__":
153
+ parser = argparse.ArgumentParser()
154
+
155
+ parser.add_argument(
156
+ "-e",
157
+ "--emotion",
158
+ type=str,
159
+ required=True,
160
+ help="emotion to calculate mean, std for",
161
+ )
162
+ parser.add_argument(
163
+ "-f",
164
+ "--file",
165
+ type=str,
166
+ required=True,
167
+ help="file to filter for emotion",
168
+ )
169
+ parser.add_argument(
170
+ "-m",
171
+ "--model_path",
172
+ type=str,
173
+ required=False,
174
+ default="",
175
+ help="Path to model directory e.g. ./checkpoints/grab/grab_baseline_dp_2gpu_8layers_1000",
176
+ )
177
+ args = parser.parse_args()
178
+ data_root = './data/GRAB'
179
+ motion_label_dir = pjoin(data_root, 'texts')
180
+ emotions_label_dir = pjoin(data_root, 'face_texts')
181
+ args = parser.parse_args()
182
+ seq_list_file = pjoin(data_root, args.file)
183
+ logging.info("aggregating info about sequences...")
184
+ info_dict = get_info_from_file(seq_list_file, emotions_label_dir, motion_label_dir)
185
+
186
+ # get all files with args.emotion_code
187
+ logging.info("calculating mean pose statistics...")
188
+ emotions = info_dict["unique_emotions"]
189
+ # emotions = [args.emotion]
190
+ for emotion in emotions:
191
+ logging.info(f"render mean mesh for {emotion} in {args.file}...")
192
+ emo_code = label_code(emotion)
193
+ names_with_emo = info_dict["emotion_to_names"][emo_code]
194
+ arrays = names_to_arrays(data_root, names_with_emo)
195
+
196
+ mean, std = calc_mean_stddev_pose(arrays)
197
+ # add 1 dimension to mean and std
198
+ mean = mean.reshape(1, -1)
199
+ std = std.reshape(1, -1)
200
+
201
+ mean_dict = motion_arr_to_dict(mean, shapes_dropped=True)
202
+ std_dict = motion_arr_to_dict(std, shapes_dropped=True)
203
+ logging.info(f"{emotion} mean: {mean_dict['face_expr']}")
204
+ logging.info(f"{emotion} std: {std_dict['face_expr']}")
205
+
206
+ logging.info(f"rendering mean mesh for {emotion} in {args.file}...")
207
+ subfolder = mesh_and_save(args, mean, "mean", args.model_path, emotion)
208
+
209
+ model_name = args.model_path.split('/')[-1] if args.model_path else "ground_truth"
210
+ # write the sequence names in a metadata folder at subfolder
211
+ metadata_folder = f"{subfolder}/metadata"
212
+ if not os.path.exists(metadata_folder):
213
+ os.makedirs(metadata_folder)
214
+ metadata_path = f"{metadata_folder}/metadata.txt"
215
+ with open(metadata_path, 'w') as f:
216
+ f.write(f"model: {model_name}\n")
217
+ f.write(f"emotion: {emotion}\n")
218
+ f.write(f"file: {args.file}\n")
219
+ f.write(f"mean: {mean_dict}\n")
220
+ f.write(f"std: {std_dict}\n")
221
+ for name in names_with_emo:
222
+ f.write(f"{name}\n")
223
+
224
+ # now plot mesh for each of the sequences
225
+ for i, arr in enumerate(arrays):
226
+ one_pose = arr[0]
227
+ one_pose = one_pose.reshape(1, -1)
228
+ name = names_with_emo[i]
229
+ # replace / with _
230
+ name = name.replace("/", "_")
231
+ subfolder = mesh_and_save(args, one_pose, name, args.model_path, emotion)
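The mean pose rendered above is just the per-dimension average over every frame of the emotion-filtered sequences, with the standard deviation computed the same way. A minimal sanity check of that behaviour on toy data (shapes and values are illustrative only; they mirror what `calc_mean_stddev_pose` from `motionx_explorer` does by concatenating frames along axis 0):

```python
# Toy check of the per-dimension statistics used for the mean mesh
# (212 dims = the shape-dropped motion vector used throughout this repo).
import numpy as np

pose_dim = 212
seq_a = np.full((4, pose_dim), 3.0)  # 4 frames of a constant pose
seq_b = np.full((2, pose_dim), 2.0)  # 2 frames of another constant pose

frames = np.concatenate([seq_a, seq_b], axis=0)
mean, std = frames.mean(axis=0), frames.std(axis=0)

assert mean.shape == (pose_dim,) and std.shape == (pose_dim,)
assert np.allclose(mean, (4 * 3.0 + 2 * 2.0) / 6)  # 2.666... per dimension
```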
text2motion/datasets/motionx_explorer.py ADDED
@@ -0,0 +1,554 @@
1
+ import argparse
2
+ import logging as log
3
+ import os
4
+ import time
5
+ from collections import defaultdict
6
+ from os.path import join as pjoin
7
+ from typing import Dict, Optional, Tuple
8
+
9
+ import numpy as np
10
+ import smplx
11
+ import torch
12
+ from numpy.typing import ArrayLike
13
+ from torch import Tensor
14
+
15
+ # extra third-party imports needed by the local save_gif / render_meshes defined below
+ import imageio
+ import pyrender
+ import trimesh
+ from tqdm import tqdm
+
+ from .rendering import render_meshes
16
+
17
+ log.basicConfig(
18
+ level=log.INFO,
19
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
20
+ )
21
+
22
+
23
+ MOCAP_DATASETS = {"egobody", "grab", "humanml", "grab_motion"}
24
+ DATA_DIR = "data"
25
+ MODELS_DIR = "models"
26
+ MOCAP_FACE_DIR = f"{DATA_DIR}/face_motion_data/smplx_322" # contains face motion data only
27
+ MOTION_DIR = f"{DATA_DIR}/motion_data/smplx_322"
28
+ ACTION_LABEL_DIR = f"{DATA_DIR}/semantic_labels"
29
+ EMOTION_LABEL_DIR = f"{DATA_DIR}/face_texts"
30
+
31
+
32
+ """
33
+ Page 12 of https://arxiv.org/pdf/2307.00818.pdf shows:
34
+
35
+ smpl-x = {θb, θh, θf , ψ, r} = 3D body pose, 3D hand pose, jaw pose, facial expression, global root orientation, global translation
36
+ dims: (22x3, 30x3, 1x3, 1x50, 1x3, 1x3) = (66, 90, 3, 50, 3, 3)
37
+
38
+ NOTE: I think they are wrong about n_body_joints though, data indicates it's actually 21x3 = 63, not 22x3 = 66
39
+ """
40
+
41
+ MY_REPO = os.path.abspath("")
42
+ log.info(f"MY_REPO: {MY_REPO}")
43
+ NUM_BODY_JOINTS = 23 - 2 # SMPL has hand joints, but we replace them with the more detailed SMPL-X ones; the paper says 22x3 body dims, though the data indicates 21x3
44
+ NUM_JAW_JOINTS = 1 # 1x3 total jaw dims
45
+ # Motion-X paper says there are 15 joints per hand
46
+ NUM_HAND_JOINTS = 15 # x2 for each hand -> 30x3 total hand dims
47
+ NUM_JOINTS = NUM_BODY_JOINTS + NUM_HAND_JOINTS * 2 + NUM_JAW_JOINTS # 21 + 30 + 1 = 52
48
+ NUM_FACIAL_EXPRESSION_DIMS = 50 # as per Motion-X paper, but why is default 10 in smplx code then?
49
+ FACE_SHAPE_DIMS = 100
50
+ BODY_SHAPE_DIMS = 10 # betas
51
+ ROOT_DIMS = 3
52
+ TRANS_DIMS = 3 # same as root, no?
53
+
54
+ pose_type_to_dims = {
55
+ "pose_body": NUM_BODY_JOINTS * 3,
56
+ "pose_hand": NUM_HAND_JOINTS * 2 * 3, # both hands
57
+ "pose_jaw": NUM_JAW_JOINTS * 3,
58
+ "face_expr": NUM_FACIAL_EXPRESSION_DIMS * 1, # double check
59
+ "face_shape": FACE_SHAPE_DIMS * 1, # double check
60
+ "root_orient": ROOT_DIMS * 1,
61
+ "betas": BODY_SHAPE_DIMS * 1,
62
+ "trans": TRANS_DIMS * 1,
63
+ }
64
+
65
+ def names_to_arrays(root_dir, names, drop_shapes=True):
66
+ all_arrays = []
67
+ for name in names:
68
+ # Load each NumPy array and add it to the list
69
+ array = np.load(pjoin(f"{root_dir}/joints", f"{name}.npy"))
70
+ # drop shapes -> 212 dims
71
+ if drop_shapes:
72
+ array = drop_shapes_from_motion_arr(array)
73
+ all_arrays.append(array)
74
+ return all_arrays
75
+
76
+ def get_seq_names(file_path):
77
+ with open(file_path, "r") as f:
78
+ names = f.readlines()
79
+ names = [name.strip() for name in names]
80
+ return names
81
+
82
+ def get_data_path(dataset_dir: str, seq: str, file: str) -> str:
83
+ # MY_REPO/face_motion_data/smplx_322/GRAB/s1/airplane_fly_1.npy
84
+ top_dir = MOCAP_FACE_DIR if dataset_dir.lower() in MOCAP_DATASETS else MOTION_DIR
85
+ path = f"{os.path.join(MY_REPO, top_dir, dataset_dir, seq, file)}.npy"
86
+ return path
87
+
88
+
89
+ def get_label_paths(dataset_dir: str, seq: str, file: str) -> Dict[str, str]:
90
+ # MY_REPO/MotionDiffuse/face_texts/GRAB/s1/airplane_fly_1.txt
91
+ action_path = f"{os.path.join(MY_REPO, ACTION_LABEL_DIR, dataset_dir, seq, file)}.txt"
92
+ emotion_path = f"{os.path.join(MY_REPO, EMOTION_LABEL_DIR, dataset_dir, seq, file)}.txt"
93
+ paths = {"action": action_path, "emotion": emotion_path}
94
+ return paths
95
+
96
+ def load_data_as_dict(dataset_dir: str, seq: str, file: str) -> Dict[str, Tensor]:
97
+ path = get_data_path(dataset_dir, seq, file)
98
+ motion = np.load(path)
99
+ motion = torch.tensor(motion).float()
100
+ return {
101
+ "root_orient": motion[:, :3], # controls the global root orientation
102
+ "pose_body": motion[:, 3 : 3 + 63], # controls the body
103
+ "pose_hand": motion[:, 66 : 66 + 90], # controls the finger articulation
104
+ "pose_jaw": motion[:, 66 + 90 : 66 + 93], # controls the jaw pose
105
+ "face_expr": motion[:, 159 : 159 + 50], # controls the face expression
106
+ "face_shape": motion[:, 209 : 209 + 100], # controls the face shape
107
+ "trans": motion[:, 309 : 309 + 3], # controls the global body position
108
+ "betas": motion[:, 312:], # controls the body shape. Body shape is static
109
+ }
110
+
111
+ def motion_arr_to_dict(motion_arr: ArrayLike, shapes_dropped=False) -> Dict[str, Tensor]:
112
+ # TODO (elmc): why did I need to convert to tensor again???
113
+ motion_arr = torch.tensor(motion_arr).float()
114
+ motion_dict = {
115
+ "root_orient": motion_arr[:, :3], # controls the global root orientation
116
+ "pose_body": motion_arr[:, 3 : 3 + 63], # controls the body
117
+ "pose_hand": motion_arr[:, 66 : 66 + 90], # controls the finger articulation
118
+ "pose_jaw": motion_arr[:, 66 + 90 : 66 + 93], # controls the jaw pose
119
+ "face_expr": motion_arr[:, 159 : 159 + 50], # controls the face expression
120
+ }
121
+ if not shapes_dropped:
122
+ motion_dict["face_shape"] = motion_arr[:, 209 : 209 + 100] # controls the face shape
123
+ motion_dict["trans"] = motion_arr[:, 309 : 309 + 3] # controls the global body position
124
+ motion_dict["betas"] = motion_arr[:, 312:] # controls the body shape. Body shape is static
125
+ else:
126
+ motion_dict["trans"] = motion_arr[:, 209:] # controls the global body position
127
+
128
+ return motion_dict
129
+
130
+
131
+ def drop_shapes_from_motion_arr(motion_arr: ArrayLike) -> ArrayLike:
132
+ if isinstance(motion_arr, torch.Tensor):
133
+ motion_arr = motion_arr.numpy()
134
+
135
+ # Slice the array to exclude 'face_shape' and 'betas'
136
+ new_motion_arr = np.concatenate((motion_arr[:, :209], motion_arr[:, 309:312]), axis=1)
137
+
138
+ return new_motion_arr
139
+
140
+ def load_label_from_file(file_path: str) -> str:
141
+ with open(file_path, "r") as file:
142
+ # Read the contents of the file into a string
143
+ label = file.read()
144
+ return label
145
+
146
+ def load_label(dataset_dir: str, seq: str, file_path: str) -> Dict[str, str]:
147
+ paths = get_label_paths(dataset_dir, seq, file_path)
148
+ action_path, emotion_path = paths["action"], paths["emotion"]
149
+ log.info(f"loading labels from {action_path} and {emotion_path}")
150
+ paths = {}
151
+ with open(action_path, "r") as file:
152
+ # Read the contents of the file into a string
153
+ action_label = file.read()
154
+ with open(emotion_path, "r") as file:
155
+ # Read the contents of the file into a string
156
+ emotion_label = file.read()
157
+ return {"action": action_label, "emotion": emotion_label}
158
+
159
+
160
+ def label_code(full_label):
161
+ # take first 3 letters of label
162
+ # surprise -> sur
163
+ # airplane -> air
164
+ return full_label[:3]
165
+
166
+ def get_seq_type(motion_label_dir, file_name):
167
+ # e.g. s5/airplane_fly_1 -> airplane fly (motion label)
168
+ seq_type_path = pjoin(motion_label_dir, f"{file_name}.txt")
169
+ with open(seq_type_path, 'r') as f:
170
+ seq_type = f.readline().strip()
171
+ return seq_type
172
+
173
+ def calc_mean_stddev_pose(arrays):
174
+ # all_arrays = []
175
+ # for file_path in file_list:
176
+ # # Load each NumPy array and add it to the list
177
+ # array = np.load(file_path)
178
+ # all_arrays.append(array)
179
+
180
+ # Concatenate all arrays along the first axis (stacking them on top of each other)
181
+ concatenated_arrays = np.concatenate(arrays, axis=0)
182
+ # Calculate the mean and standard deviation across all arrays
183
+ mean = np.mean(concatenated_arrays, axis=0)
184
+ stddev = np.std(concatenated_arrays, axis=0)
185
+
186
+ return mean, stddev
187
+
188
+ def get_info_from_file(file_path, emotions_label_dir, motion_label_dir):
189
+ # train_names = get_seq_names(pjoin(data_dir, "train.txt"))
190
+ names = get_seq_names(file_path)
191
+ seq_type_to_emotions = defaultdict(set)
192
+ emotions_count = defaultdict(int)
193
+ seq_type_count = defaultdict(int)
194
+ obj_count = defaultdict(int)
195
+ code_to_label = {}
196
+ emotion_to_names = defaultdict(list)
197
+ n_seq = len(names)
198
+ for name in names:
199
+ seq_type = get_seq_type(motion_label_dir, name)
200
+ emotion = load_label_from_file(pjoin(emotions_label_dir, f"{name}.txt"))
201
+ object_ = seq_type.split(" ")[0]
202
+ seq_type_to_emotions[seq_type].add(emotion)
203
+ emo_code = label_code(emotion)
204
+ emotions_count[emo_code] += 1
205
+ seq_type_count[seq_type] += 1
206
+ obj_code = label_code(object_)
207
+ obj_count[label_code(object_)] += 1
208
+ code_to_label[emo_code] = emotion
209
+ code_to_label[obj_code] = object_
210
+ emotion_to_names[emo_code].append(name)
211
+ unique_emotions = set([code_to_label[code] for code in emotions_count])
212
+ info_dict = {
213
+ "seq_type_to_emotions": seq_type_to_emotions,
214
+ "emotions_count": emotions_count,
215
+ "seq_type_count": seq_type_count,
216
+ "obj_count": obj_count,
217
+ "code_to_label": code_to_label,
218
+ "emotion_to_names": emotion_to_names,
219
+ "unique_emotions": unique_emotions,
220
+ "n_seq": n_seq,
221
+ "code_to_label": code_to_label,
222
+ }
223
+ return info_dict
224
+
225
+ def to_smplx_dict(motion_dict: Dict[str, Tensor], timestep_range: Optional[Tuple[int, int]] = None) -> Dict[str, Tensor]:
226
+ if timestep_range is None:
227
+ # get all timesteps
228
+ timestep_range = (0, len(motion_dict["pose_body"]))
229
+ smplx_params = {
230
+ "global_orient": motion_dict["root_orient"][
231
+ timestep_range[0] : timestep_range[1]
232
+ ], # controls the global root orientation
233
+ "body_pose": motion_dict["pose_body"][timestep_range[0] : timestep_range[1]], # controls the body
234
+ "left_hand_pose": motion_dict["pose_hand"][timestep_range[0] : timestep_range[1]][
235
+ :, : NUM_HAND_JOINTS * 3
236
+ ], # controls the finger articulation
237
+ "right_hand_pose": motion_dict["pose_hand"][timestep_range[0] : timestep_range[1]][:, NUM_HAND_JOINTS * 3 :],
238
+ "expression": motion_dict["face_expr"][timestep_range[0] : timestep_range[1]], # controls the face expression
239
+ "jaw_pose": motion_dict["pose_jaw"][timestep_range[0] : timestep_range[1]], # controls the jaw pose
240
+ # 'face_shape': motion_dict['face_shape'][timestep], # controls the face shape, drop since we don't care to train on this
241
+ "transl": motion_dict["trans"][timestep_range[0] : timestep_range[1]], # controls the global body position
242
+ # "betas": motion["betas"][
243
+ # timestep_range[0] : timestep_range[1]
244
+ # ], # controls the body shape. Body shape is static, drop since we don't care to train on this
245
+ }
246
+ return smplx_params
247
+
248
+ def smplx_dict_to_array(smplx_dict):
249
+ # convert smplx dict to array
250
+ # list keys to ensure known order when iterating over dict
251
+ keys = ["global_orient", "body_pose", "left_hand_pose", "right_hand_pose", "expression", "jaw_pose", "transl"]
252
+ smplx_array = []
253
+ for key in keys:
254
+ smplx_array.append(smplx_dict[key])
255
+ smplx_array = torch.cat(smplx_array, dim=1)
256
+ return smplx_array
257
+
258
+ def save_gif(gif_path, gif_frames, duration=0.01):
259
+ if gif_frames:
260
+ print(f"Saving GIF with {len(gif_frames)} frames to {gif_path}")
261
+ imageio.mimsave(uri=gif_path, ims=gif_frames, duration=duration)
262
+ else:
263
+ print("No frames to save.")
264
+
265
+ # based on https://github.com/vchoutas/smplx/blob/main/examples/demo.py
266
+ def render_meshes(output, should_save_gif=False, gif_path=None):
267
+ should_display = not should_save_gif
268
+ vertices_list = output.vertices.detach().cpu().numpy().squeeze()
269
+ joints_list = output.joints.detach().cpu().numpy().squeeze()
270
+ # TODO (elmc): why do I wrap these in a list again?
271
+ if len(vertices_list.shape) == 2:
272
+ vertices_list = [vertices_list]
273
+ joints_list = [joints_list]
274
+ scene = pyrender.Scene()
275
+ if should_display:
276
+ viewer = pyrender.Viewer(scene, run_in_thread=True)
277
+
278
+ mesh_node = None
279
+ joints_node = None
280
+ # Rotation matrix (90 degrees around the X-axis)
281
+ rot = trimesh.transformations.rotation_matrix(np.radians(90), [1, 0, 0])
282
+ gif_frames = []
283
+ if should_save_gif:
284
+ os.makedirs(os.path.dirname(gif_path), exist_ok=True)
285
+ try:
286
+ for i in tqdm(range(len(vertices_list))):
287
+ vertices = vertices_list[i]
288
+ joints = joints_list[i]
289
+ # print("Vertices shape =", vertices.shape)
290
+ # print("Joints shape =", joints.shape)
291
+
292
+ # from their demo script
293
+ plotting_module = "pyrender"
294
+ plot_joints = False
295
+ if plotting_module == "pyrender":
296
+ vertex_colors = np.ones([vertices.shape[0], 4]) * [0.3, 0.3, 0.3, 0.8]
297
+ tri_mesh = trimesh.Trimesh(vertices, model.faces, vertex_colors=vertex_colors)
298
+
299
+ # Apply rotation
300
+ tri_mesh.apply_transform(rot)
301
+ ##### RENDER LOCK #####
302
+ if should_display:
303
+ viewer.render_lock.acquire()
304
+ if mesh_node:
305
+ scene.remove_node(mesh_node)
306
+ mesh = pyrender.Mesh.from_trimesh(tri_mesh)
307
+ mesh_node = scene.add(mesh)
308
+
309
+ camera = pyrender.PerspectiveCamera(yfov=np.pi / 3.0, aspectRatio=1.0)
310
+ min_bound, max_bound = mesh.bounds
311
+
312
+ # Calculate the center of the bounding box
313
+ center = (min_bound + max_bound) / 2
314
+
315
+ # Calculate the extents (the dimensions of the bounding box)
316
+ extents = max_bound - min_bound
317
+
318
+ # Estimate a suitable distance
319
+ distance = max(extents) * 2 # Adjust the multiplier as needed
320
+
321
+ # Create a camera pose matrix
322
+ cam_pose = np.array(
323
+ [
324
+ [1.0, 0, 0, center[0]],
325
+ [0, 1.0, 0, center[1]-1.0],
326
+ [0, 0, 1.0, center[2] + distance + 0.5],
327
+ [0, 0, 0, 1],
328
+ ]
329
+ )
330
+ # Rotate around X-axis
331
+ # Rotate around X-axis
332
+ angle = np.radians(90)
333
+ cos_angle = np.cos(angle)
334
+ sin_angle = np.sin(angle)
335
+ rot_x = np.array([
336
+ [1, 0, 0, 0],
337
+ [0, cos_angle, -sin_angle, 0],
338
+ [0, sin_angle, cos_angle, 0],
339
+ [0, 0, 0, 1]
340
+ ])
341
+ cam_pose = np.matmul(cam_pose, rot_x)
342
+ cam_pose[:3, 3] += np.array([0, -2.5, -3.5])
343
+
344
+ scene.add(camera, pose=cam_pose)
345
+
346
+ # Add light for better visualization
347
+ light = pyrender.DirectionalLight(color=np.ones(3), intensity=2.0)
348
+ scene.add(light, pose=cam_pose)
349
+
350
+ # TODO: rotation doesn't work here, so appears sideways
351
+ if plot_joints:
352
+ sm = trimesh.creation.uv_sphere(radius=0.005)
353
+ sm.visual.vertex_colors = [0.9, 0.1, 0.1, 1.0]
354
+ tfs = np.tile(np.eye(4), (len(joints), 1, 1))
355
+ # tfs[:, :3, 3] = joints
356
+ for i, joint in enumerate(joints):
357
+ tfs[i, :3, :3] = rot[:3, :3]
358
+ tfs[i, :3, 3] = joint
359
+ joints_pcl = pyrender.Mesh.from_trimesh(sm, poses=tfs)
360
+ if joints_node:
361
+ scene.remove_node(joints_node)
362
+ joints_node = scene.add(joints_pcl)
363
+ if should_save_gif:
364
+ r = pyrender.OffscreenRenderer(viewport_width=640, viewport_height=480)
365
+ color, _ = r.render(scene)
366
+ gif_frames.append(color)
367
+ r.delete() # Free up the resources
368
+ ###### RENDER LOCK RELEASE #####
369
+ if should_display:
370
+ viewer.render_lock.release()
371
+ except KeyboardInterrupt:
372
+ if should_display:
373
+ viewer.close_external()
374
+ save_gif(gif_path, gif_frames)
375
+ finally:
376
+ save_gif(gif_path, gif_frames)
377
+
378
+ def get_numpy_file_path(prompt, epoch, n_frames):
379
+ # e.g. "airplane_fly_1_1000_60f.npy"
380
+ prompt_no_spaces = prompt.replace(' ', '_')
381
+ return f"{prompt_no_spaces}_{epoch}_{n_frames}f"
382
+
383
+ if __name__ == "__main__":
384
+ parser = argparse.ArgumentParser()
385
+
386
+ parser.add_argument(
387
+ "-mn",
388
+ "--min_t",
389
+ type=int,
390
+ required=False,
391
+ default=0,
392
+ help="Minimum number of timesteps to render",
393
+ )
394
+ parser.add_argument(
395
+ "-mx",
396
+ "--max_t",
397
+ type=int,
398
+ required=False,
399
+ help="Maximum number of timesteps to render",
400
+ )
401
+ parser.add_argument(
402
+ "-dm",
403
+ "--display_mesh",
404
+ action='store_true',
405
+ required=False,
406
+ default=False,
407
+ help="Display mesh if this flag is present"
408
+ )
409
+ # for now just specifies file name (with spaces) made by inference
410
+ parser.add_argument(
411
+ "-p",
412
+ "--prompt",
413
+ type=str,
414
+ required=False,
415
+ default="",
416
+ help="Prompt for inference display",
417
+ )
418
+ parser.add_argument(
419
+ "-sf",
420
+ "--seq_file",
421
+ type=str,
422
+ required=False,
423
+ default="",
424
+ help="file for non-inference display",
425
+ )
426
+ # add model_path arg
427
+ parser.add_argument(
428
+ "-m",
429
+ "--model_path",
430
+ type=str,
431
+ required=False,
432
+ default="",
433
+ help="Path to model directory e.g. ./checkpoints/grab/grab_baseline_dp_2gpu_8layers_1000",
434
+ )
435
+ parser.add_argument(
436
+ "-sg",
437
+ "--save_gif",
438
+ action='store_true',
439
+ required=False,
440
+ default=False,
441
+ help="Save gif if this flag is present"
442
+ )
443
+ # add which_epoch
444
+ parser.add_argument(
445
+ "-we",
446
+ "--which_epoch",
447
+ type=str,
448
+ required=True,
449
+ help="which epoch to load",
450
+ )
451
+ args = parser.parse_args()
452
+
453
+ prompt = args.prompt
454
+ is_inference = len(prompt) > 0
455
+ if args.seq_file != "" and args.prompt != "":
456
+ log.error("cannot provide both prompt and seq_file; if trying to verify model inference, use --prompt, otherwise specify numpy --seq_file name to display")
457
+ exit(1)
458
+ elif args.seq_file == "" and args.prompt == "":
459
+ log.error("must provide either prompt or seq_file; if trying to verify model inference, use --prompt, otherwise specify numpy --seq_file name to display")
460
+ exit(1)
461
+ if not is_inference:
462
+ name = args.seq_file
463
+ data_root = './data/GRAB'
464
+ motion_dir = pjoin(data_root, 'joints')
465
+ else:
466
+ log.info(f"converting prompt into file name")
467
+ name = get_numpy_file_path(prompt, args.which_epoch, args.max_t - args.min_t)
468
+ model_type = args.model_path
469
+ motion_dir = pjoin(model_type, 'outputs')
470
+ motion_path = pjoin(motion_dir, name + '.npy')
471
+ log.info(f"loading motion from {motion_path}")
472
+ motion_arr = np.load(motion_path)
473
+ t = 999
474
+ mean_path = '/work3/s222376/MotionDiffuse2/text2motion/checkpoints/grab/md_fulem_2g_excl_196_seed42/meta/mean.npy'
475
+ std_path = '/work3/s222376/MotionDiffuse2/text2motion/checkpoints/grab/md_fulem_2g_excl_196_seed42/meta/std.npy'
476
+ mean = np.load(mean_path)
477
+ std = np.load(std_path)
478
+ # do range skipping by 100
479
+ list_ = [t for t in range(10, 91, 10)]
480
+ list_ += [t for t in range(100, 200, 30)]
481
+ for t in list_:
482
+ name = f"sample_tensor([{t}])"
483
+ # breakpoint()
484
+ motion_arr = np.load(f"/work3/s222376/MotionDiffuse2/text2motion/generation_samples/{name}.npy")
485
+ motion_arr = np.squeeze(motion_arr)
486
+
487
+ motion_arr = motion_arr * std + mean
488
+ # drop shapes for ground-truth to have same dimensionality as inference
489
+ # for fair comparisons and reducing bugs
490
+ if not is_inference:
491
+ # directly get smplx dimensionality by dropping body and face shape data
492
+ print("warning, dropping body and face shape data")
493
+ motion_arr = drop_shapes_from_motion_arr(motion_arr)
494
+ assert motion_arr.shape[1] == 212, f"expected 212 dims, got {motion_arr.shape[1]}"
495
+
496
+ # our MotionDiffuse predicts motion data that doesn't include face and body shape
497
+ motion_dict = motion_arr_to_dict(motion_arr, shapes_dropped=True)
498
+ n_points = len(motion_dict["pose_body"])
499
+
500
+ min_t = args.min_t
501
+ max_t = args.max_t or n_points
502
+ if max_t > n_points:
503
+ max_t = n_points
504
+
505
+ timestep_range = (min_t, max_t)
506
+ frames = max_t - min_t
507
+ log.info(f"POSES: {n_points}")
508
+ # checks data has expected shape
509
+ tot_dims = 0
510
+ for key in motion_dict:
511
+ dims = motion_dict[key].shape[1]
512
+ exp_dims = pose_type_to_dims.get(key)
513
+ tot_dims += motion_dict[key].shape[1]
514
+ log.info(f"{key}: {motion_dict[key].shape}, dims {dims}, exp: {exp_dims}")
515
+ log.info(f"total MOTION-X dims: {tot_dims}\n")
516
+
517
+ smplx_params = to_smplx_dict(motion_dict, timestep_range)
518
+ tot_smplx_dims = 0
519
+ for key in smplx_params:
520
+ tot_smplx_dims += smplx_params[key].shape[1]
521
+ log.info(f"{key}: {smplx_params[key].shape}")
522
+ log.info(f"TOTAL SMPLX dims: {tot_smplx_dims}\n")
523
+
524
+ if not is_inference:
525
+ action_label_path = pjoin(data_root, 'texts', name + '.txt')
526
+ action_label = load_label_from_file(action_label_path)
527
+ emotion_label_path = pjoin(data_root, 'face_texts', name + '.txt')
528
+ emotion_label = load_label_from_file(emotion_label_path)
529
+ log.info(f"action: {action_label}")
530
+ log.info(f"emotion: {emotion_label}")
531
+
532
+ if is_inference:
533
+ emotion_label = args.prompt.split(' ')[0]
534
+
535
+ if args.display_mesh:
536
+ model_folder = os.path.join(MY_REPO, MODELS_DIR, "smplx")
537
+ batch_size = max_t - min_t
538
+ log.info(f"calculating mesh with batch size {batch_size}")
539
+ model = smplx.SMPLX(
540
+ model_folder,
541
+ use_pca=False, # our joints are not in pca space
542
+ num_expression_coeffs=NUM_FACIAL_EXPRESSION_DIMS,
543
+ batch_size=batch_size,
544
+ )
545
+ output = model.forward(**smplx_params, return_verts=True)
546
+ log.info(f"output size {output.vertices.shape}")
547
+ log.info(f"output size {output.joints.shape}")
548
+ log.info("rendering mesh")
549
+ model_name = args.model_path.split('/')[-1] if args.model_path else "ground_truth"
550
+ gif_path = f"gifs/{model_name}/{name}_{emotion_label}.gif"
551
+ render_meshes(output, gif_path=gif_path, should_save_gif=args.save_gif)
552
+ log.warning(
553
+ "if you don't see the mesh animation, make sure you are running on graphics compatible DTU machine (vgl xterm)."
554
+ )
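For reference, the slicing used by `load_data_as_dict` / `motion_arr_to_dict` above implies the following layout of the 322-dimensional Motion-X vector; a small sketch that only checks the offsets add up (numbers copied from the code, nothing new):

```python
# Layout implied by the slicing in motion_arr_to_dict (322 dims total).
layout = {
    "root_orient": 3,    # [0:3)     global root orientation
    "pose_body": 63,     # [3:66)    body pose
    "pose_hand": 90,     # [66:156)  both hands
    "pose_jaw": 3,       # [156:159) jaw
    "face_expr": 50,     # [159:209) facial expression
    "face_shape": 100,   # [209:309) face shape (dropped for training)
    "trans": 3,          # [309:312) global translation
    "betas": 10,         # [312:322) body shape (dropped for training)
}
assert sum(layout.values()) == 322
# dropping face_shape and betas leaves the 212-dim vector the model predicts
assert sum(layout.values()) - layout["face_shape"] - layout["betas"] == 212
```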
text2motion/datasets/rendering.py ADDED
@@ -0,0 +1,126 @@
1
+ import pyrender
2
+ from tqdm import tqdm
3
+ import trimesh
4
+ import numpy as np
5
+ import os
6
+ import imageio
7
+
8
+
9
+ def save_gif(gif_path, gif_frames, duration=0.01):
10
+ if gif_frames:
11
+ print(f"Saving GIF with {len(gif_frames)} frames to {gif_path}")
12
+ imageio.mimsave(uri=gif_path, ims=gif_frames, duration=duration)
13
+ else:
14
+ print("No frames to save.")
15
+
16
+
17
+ def render_meshes(model, output, should_save_gif=False, gif_path=None):
18
+ should_display = not should_save_gif
19
+ vertices_list = output.vertices.detach().cpu().numpy().squeeze()
20
+ joints_list = output.joints.detach().cpu().numpy().squeeze()
21
+ if len(vertices_list.shape) == 2:
22
+ vertices_list = [vertices_list]
23
+ joints_list = [joints_list]
24
+ scene = pyrender.Scene()
25
+ if should_display:
26
+ viewer = pyrender.Viewer(scene, run_in_thread=True)
27
+
28
+ mesh_node = None
29
+ joints_node = None
30
+ # Rotation matrix (90 degrees around the X-axis)
31
+ rot = trimesh.transformations.rotation_matrix(np.radians(90), [1, 0, 0])
32
+ gif_frames = []
33
+ if should_save_gif:
34
+ os.makedirs(os.path.dirname(gif_path), exist_ok=True)
35
+ try:
36
+ for i in tqdm(range(len(vertices_list))):
37
+ vertices = vertices_list[i]
38
+ joints = joints_list[i]
39
+ # print("Vertices shape =", vertices.shape)
40
+ # print("Joints shape =", joints.shape)
41
+
42
+ # from their demo script
43
+ plotting_module = "pyrender"
44
+ plot_joints = False
45
+ if plotting_module == "pyrender":
46
+ vertex_colors = np.ones([vertices.shape[0], 4]) * [0.3, 0.3, 0.3, 0.8]
47
+ tri_mesh = trimesh.Trimesh(vertices, model.faces, vertex_colors=vertex_colors)
48
+
49
+ # Apply rotation
50
+ tri_mesh.apply_transform(rot)
51
+ ##### RENDER LOCK #####
52
+ if should_display:
53
+ viewer.render_lock.acquire()
54
+ if mesh_node:
55
+ scene.remove_node(mesh_node)
56
+ mesh = pyrender.Mesh.from_trimesh(tri_mesh)
57
+ mesh_node = scene.add(mesh)
58
+
59
+ camera = pyrender.PerspectiveCamera(yfov=np.pi / 3.0, aspectRatio=1.0)
60
+ min_bound, max_bound = mesh.bounds
61
+
62
+ # Calculate the center of the bounding box
63
+ center = (min_bound + max_bound) / 2
64
+
65
+ # Calculate the extents (the dimensions of the bounding box)
66
+ extents = max_bound - min_bound
67
+
68
+ # Estimate a suitable distance
69
+ distance = max(extents) * 2 # Adjust the multiplier as needed
70
+
71
+ # Create a camera pose matrix
72
+ cam_pose = np.array(
73
+ [
74
+ [1.0, 0, 0, center[0]],
75
+ [0, 1.0, 0, center[1]-1.0],
76
+ [0, 0, 1.0, center[2] + distance + 0.5],
77
+ [0, 0, 0, 1],
78
+ ]
79
+ )
80
+ # Rotate around X-axis
81
+ angle = np.radians(80)
82
+ cos_angle = np.cos(angle)
83
+ sin_angle = np.sin(angle)
84
+ rot_x_10_deg = np.array([
85
+ [1, 0, 0, 0],
86
+ [0, cos_angle, -sin_angle, 0],
87
+ [0, sin_angle, cos_angle, 0],
88
+ [0, 0, 0, 1]
89
+ ])
90
+ # rotate cam_pose with rot_x_10_deg
91
+ cam_pose = np.matmul(cam_pose, rot_x_10_deg)
92
+ cam_pose[:3, 3] += np.array([0, -2.2, -3.0])
93
+
94
+ scene.add(camera, pose=cam_pose)
95
+
96
+ # Add light for better visualization
97
+ light = pyrender.DirectionalLight(color=np.ones(3), intensity=2.0)
98
+ scene.add(light, pose=cam_pose)
99
+
100
+ # TODO: rotation doesn't work here, so appears sideways
101
+ if plot_joints:
102
+ sm = trimesh.creation.uv_sphere(radius=0.005)
103
+ sm.visual.vertex_colors = [0.9, 0.1, 0.1, 1.0]
104
+ tfs = np.tile(np.eye(4), (len(joints), 1, 1))
105
+ # tfs[:, :3, 3] = joints
106
+ for i, joint in enumerate(joints):
107
+ tfs[i, :3, :3] = rot[:3, :3]
108
+ tfs[i, :3, 3] = joint
109
+ joints_pcl = pyrender.Mesh.from_trimesh(sm, poses=tfs)
110
+ if joints_node:
111
+ scene.remove_node(joints_node)
112
+ joints_node = scene.add(joints_pcl)
113
+ if should_save_gif:
114
+ r = pyrender.OffscreenRenderer(viewport_width=640, viewport_height=480)
115
+ color, _ = r.render(scene)
116
+ gif_frames.append(color)
117
+ r.delete() # Free up the resources
118
+ ###### RENDER LOCK RELEASE #####
119
+ if should_display:
120
+ viewer.render_lock.release()
121
+ except KeyboardInterrupt:
122
+ if should_display:
123
+ viewer.close_external()
124
+ save_gif(gif_path, gif_frames)
125
+ finally:
126
+ save_gif(gif_path, gif_frames)
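A typical offscreen use of `render_meshes`, assuming the script is run from `text2motion/` so that `datasets` is importable, an SMPL-X model is available under `models/smplx`, and the GRAB joints path below exists (it is only an example sequence, not a required file):

```python
# Hedged usage sketch: render a ground-truth GRAB sequence to a GIF offscreen.
import numpy as np
import smplx

from datasets.motionx_explorer import (NUM_FACIAL_EXPRESSION_DIMS,
                                        drop_shapes_from_motion_arr,
                                        motion_arr_to_dict, to_smplx_dict)
from datasets.rendering import render_meshes

# load a raw 322-dim sequence and drop the shape dims -> 212 dims
motion = drop_shapes_from_motion_arr(np.load("data/GRAB/joints/s2/cubesmall_lift.npy"))
smplx_params = to_smplx_dict(motion_arr_to_dict(motion, shapes_dropped=True))

model = smplx.SMPLX(
    "models/smplx",
    use_pca=False,
    num_expression_coeffs=NUM_FACIAL_EXPRESSION_DIMS,
    batch_size=len(motion),
)
output = model.forward(**smplx_params, return_verts=True)
render_meshes(model, output, should_save_gif=True,
              gif_path="gifs/ground_truth/cubesmall_lift.gif")
```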
text2motion/datasets/statistics_writer.py ADDED
@@ -0,0 +1,56 @@
1
+ from os.path import join as pjoin
2
+
3
+ import numpy as np
4
+ from motionx_explorer import (calc_mean_stddev_pose,
5
+ drop_shapes_from_motion_arr, get_seq_names)
6
+
7
+ if __name__ == "__main__":
8
+ # read names from ./data/GRAB/train.txt
9
+ with open(pjoin("./data/GRAB", "train.txt"), "r") as f:
10
+ names = f.readlines()
11
+ names = [name.strip() for name in names]
12
+ print(f"names: {names}")
13
+ all_arrays = []
14
+ for name in names:
15
+ # Load each NumPy array and add it to the list
16
+ array = np.load(pjoin("./data/GRAB/joints", f"{name}.npy"))
17
+ # drop shapes -> 212 dims
18
+ array = drop_shapes_from_motion_arr(array)
19
+ print(f"shape of {name}: {array.shape}")
20
+ all_arrays.append(array)
21
+ mean, stddev = calc_mean_stddev_pose(all_arrays)
22
+ pose_dims = 212
23
+ assert mean.shape[0] == pose_dims
24
+ assert stddev.shape[0] == pose_dims
25
+ # check if stddev has 0's
26
+ stdev_zeros = np.where(stddev == 0)
27
+ n_zeros = len(stdev_zeros[0])
28
+ print(f"idx of stddev where 0: {stdev_zeros}")
29
+ assert n_zeros == 0, "stddev has 0's, but it should not..."
30
+ # save to ./data/GRAB/Mean.npy and ./data/GRAB/Std.npy
31
+ mean_write_path = pjoin("./data/GRAB", "Mean.npy")
32
+ stddev_write_path = pjoin("./data/GRAB", "Std.npy")
33
+ with open(mean_write_path, "wb") as f:
34
+ print(f"saving mean to {mean_write_path}")
35
+ np.save(f, mean)
36
+ with open(stddev_write_path, "wb") as f:
37
+ print(f"saving stddev to {stddev_write_path}")
38
+ np.save(f, stddev)
39
+
40
+
41
+ # test calculate_mean_stddev
42
+ # pose_dim = 3
43
+ # arrays_1s = np.full((4, pose_dim), 3)
44
+ # arrays_2s = np.full((2, pose_dim), 2)
45
+ # single_mean = (4*3 + 2*2) / (4+2)
46
+ # std_dev_single = np.sqrt((4*(3-single_mean)**2 + 2*(2-single_mean)**2) / (4+2))
47
+ # exp_mean = np.full((pose_dim), single_mean)
48
+ # exp_stddev = np.full((pose_dim), std_dev_single)
49
+ # all_arrays = [arrays_1s, arrays_2s]
50
+ # mean, stddev = calc_mean_stddev_pose(all_arrays)
51
+ # print(f"mean: {mean}, exp mean: {exp_mean}")
52
+ # print(f"stddev: {stddev}, exp stddev: {exp_stddev}")
53
+ # assert mean.shape == (3,)
54
+ # assert np.all(mean == exp_mean)
55
+ # assert stddev.shape == (3,)
56
+ # assert np.all(stddev == exp_stddev)
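The Mean.npy / Std.npy written here are the statistics the rest of the pipeline uses to standardize the 212-dim motions; the explorer script undoes it with `motion_arr * std + mean`. A minimal sketch of that round trip (the random motion is a stand-in for a real sequence, and the forward normalization `(x - mean) / std` is the usual convention assumed from the inverse used above):

```python
# Sketch: standardize / de-standardize a motion with the saved statistics.
from os.path import join as pjoin

import numpy as np

mean = np.load(pjoin("./data/GRAB", "Mean.npy"))  # shape (212,)
std = np.load(pjoin("./data/GRAB", "Std.npy"))    # shape (212,)

motion = np.random.randn(60, 212)                 # stand-in for a real motion array
normed = (motion - mean) / std                    # assumed dataset-side normalization
restored = normed * std + mean                    # inverse used before rendering
assert np.allclose(restored, motion)
```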
text2motion/datasets/train_explorer.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
text2motion/datasets/utils.py ADDED
@@ -0,0 +1,18 @@
1
+ import torch
2
+ import numpy as np
3
+ from numpy.typing import ArrayLike
4
+
5
+ def drop_shapes_from_motion_arr(motion_arr: ArrayLike) -> ArrayLike:
6
+ if isinstance(motion_arr, torch.Tensor):
7
+ motion_arr = motion_arr.numpy()
8
+
9
+ # Slice the array to exclude 'face_shape' and 'betas'
10
+ new_motion_arr = np.concatenate((motion_arr[:, :209], motion_arr[:, 309:312]), axis=1)
11
+
12
+ return new_motion_arr
13
+
14
+ def load_label_from_file(file_path: str) -> str:
15
+ with open(file_path, "r") as file:
16
+ # Read the contents of the file into a string
17
+ label = file.read()
18
+ return label
text2motion/dtu_README.md ADDED
@@ -0,0 +1,58 @@
1
+ ## INSTALLATION
2
+ if you've noticed your python3 bin doesn't point to your conda env when using --prefix to point to your scratch dir, then you need to do the following:
3
+ * conda config --set always_copy True
4
+ * conda config --show | grep always_copy
5
+ now continue as normal:
6
+ * conda create --prefix <your-scratch-path>/MotionDiffuse/env python=3.7
7
+ * conda activate <your-scratch-path>/MotionDiffuse/env
8
+ * double check your GCC is 5+ by running `gcc --version`; if not, do module load gcc/5.4.0
9
+ * module load cuda/10.1 # you must run these cuda module commands before installing torch, otherwise it will say version not found!!
10
+ * module load cudnn/v7.6.5.32-prod-cuda-10.1
11
+ * conda install pytorch=1.7.1 torchvision=0.8.2 cudatoolkit=10.1 -c pytorch
12
+ * python3 -m pip install "mmcv-full>=1.3.17,<=1.5.3" -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.1/index.html
13
+ * python3 -m pip install -r requirements.txt
14
+ * python3 -m pip install --upgrade protobuf==3.20.0
15
+
16
+ fyi there is an annoying warning in the logs (https://stackoverflow.com/questions/57381430/synonym-of-type-is-deprecated-in-a-future-version-of-numpy-it-will-be-underst) that can be silenced by downgrading numpy to 1.16.4, BUT this is incompatible with the other package versions, so don't do it
17
+
18
+ fyi:
19
+ (/work3/s222376/MotionDiffuseNew) s222376@n-62-20-1 /work3/s222376/MotionDiffuse/text2motion (train_baseline)$ module list
20
+ Currently Loaded Modulefiles:
21
+ 1) latex/TeXLive19(default) 3) cudnn/v7.6.5.32-prod-cuda-10.1 5) gcc/5.4.0
22
+ 2) cuda/10.1 4) binutils/2.29(default)
23
+
24
+ ## TRAINING
25
+ * download KIT-ML data from <> and put the zip for it in text2motion/data/
26
+ * cd text2motion/data && unzip KIT-ML-20231122T121619Z-001.zip
27
+ * cd KIT-ML && unrar x new_joint_vecs.rar
28
+ * unrar x new_joints.rar
29
+ * unrar x texts.rar
30
+ * dirs should look like
31
+ ```
32
+ text2motion/data/KIT-ML
33
+ ├── new_joint_vecs
34
+ │   ├── ...
35
+ ├── new_joints
36
+ │   ├── ...
37
+ └── texts
38
+ ├── ...
39
+ --all.txt
40
+ --<etc>
41
+ ```
42
+ * voltash (dtu hpc command to go to interactive gpu node)
43
+ * make train
44
+ * verify the above works without errors, then kill the training since you're on an interactive gpu; you will likely run out of memory anyway (you can decrease --batchsize, but then it's slow)
45
+ * to do full training, edit jobscript.sh to use your email and submit job via "make queue"
46
+
47
+ ## INFERENCE with pretrained model
48
+ * download...checkpoints?? idk look at their README.md
49
+
50
+ ## Changes I made
51
+ * ignore standardization
52
+ * tokens are [] empty...
53
+ * reusing kit_chain thing lol
54
+ * only training on one sequence from grab
55
+
56
+ TO KEEP IN MIND:
57
+ * they specify best way to train in readme somewhere -- follow this when doing real training!
58
+ * need to add the emotion text to the caption!!
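Before queueing a real job, it can save a failed run to confirm that the environment built above actually sees the GPU toolkit; a generic check from an interactive node, not specific to this repo:

```python
# Quick sanity check of the torch/CUDA install before submitting the job.
import torch

print("torch:", torch.__version__)                 # expect 1.7.1
print("cuda available:", torch.cuda.is_available())
print("built against cuda:", torch.version.cuda)   # expect 10.1
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))
```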
text2motion/install.md ADDED
@@ -0,0 +1,154 @@
1
+ # Installation
2
+
3
+ <!-- TOC -->
4
+
5
+ - [Requirements](#requirements)
6
+ - [Prepare environment](#prepare-environment)
7
+ - [Data Preparation](#data-preparation)
8
+
9
+ <!-- TOC -->
10
+
11
+ ## Requirements
12
+
13
+ - Linux
14
+ - Python 3.7+
15
+ - PyTorch 1.6.0, 1.7.0, 1.7.1, 1.8.0, 1.8.1, 1.9.0 or 1.9.1.
16
+ - CUDA 9.2+
17
+ - GCC 5+
18
+ - [MMCV](https://github.com/open-mmlab/mmcv) (Please install mmcv-full>=1.3.17,<1.6.0 for GPU)
19
+
20
+ ## Prepare environment
21
+
22
+ a. Create a conda virtual environment and activate it.
23
+
24
+ ```shell
25
+ conda create -n motiondiffuse python=3.7 -y
26
+ conda activate motiondiffuse
27
+ ```
28
+
29
+ b. Install PyTorch and torchvision following the [official instructions](https://pytorch.org/).
30
+ ```shell
31
+ conda install pytorch={torch_version} torchvision cudatoolkit={cu_version} -c pytorch
32
+ ```
33
+
34
+ E.g., install PyTorch 1.7.1 & CUDA 10.1.
35
+ ```shell
36
+ conda install pytorch=1.7.1 torchvision cudatoolkit=10.1 -c pytorch
37
+ ```
38
+
39
+ **Important:** Make sure that your compilation CUDA version and runtime CUDA version match.
40
+
41
+ c. Build mmcv-full
42
+
43
+ - mmcv-full
44
+
45
+ We recommend installing the pre-built package as below.
46
+
47
+ For CPU:
48
+ ```shell
49
+ pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cpu/{torch_version}/index.html
50
+ ```
51
+ Please replace `{torch_version}` in the url with your desired version.
52
+
53
+ For GPU:
54
+ ```shell
55
+ pip install "mmcv-full>=1.3.17,<=1.5.3" -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html
56
+ ```
57
+ Please replace `{cu_version}` and `{torch_version}` in the url with your desired versions.
58
+
59
+ For example, to install mmcv-full with CUDA 10.1 and PyTorch 1.7.1, use the following command:
60
+ ```shell
61
+ pip install "mmcv-full>=1.3.17,<=1.5.3" -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.1/index.html
62
+ ```
63
+
64
+ See [here](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) for different versions of MMCV compatible to different PyTorch and CUDA versions.
65
+ For more version download link, refer to [openmmlab-download](https://download.openmmlab.com/mmcv/dist/index.html).
66
+
67
+
68
+ d. Install other requirements
69
+
70
+ ```shell
71
+ pip install -r requirements.txt
72
+ ```
73
+
74
+ ## Data Preparation
75
+
76
+ a. Download datasets
77
+
78
+ For both the HumanML3D dataset and the KIT-ML dataset, you can find the details as well as download links [[here]](https://github.com/EricGuo5513/HumanML3D).
79
+
80
+ b. Download pretrained weights for evaluation
81
+
82
+ We use the same evaluation protocol as [this repo](https://github.com/EricGuo5513/text-to-motion). You should download pretrained weights of the contrastive models in [t2m](https://drive.google.com/file/d/1DSaKqWX2HlwBtVH5l7DdW96jeYUIXsOP/view) and [kit](https://drive.google.com/file/d/1tX79xk0fflp07EZ660Xz1RAFE33iEyJR/view) for calculating FID and precisions. To dynamically estimate the length of the target motion, `length_est_bigru` and [Glove data](https://drive.google.com/drive/folders/1qxHtwffhfI4qMwptNW6KJEDuT6bduqO7?usp=sharing) are required.
83
+
84
+ c. Download pretrained weights for **MotionDiffuse**
85
+
86
+ The pretrained weights for our proposed MotionDiffuse can be downloaded from [here](https://drive.google.com/drive/folders/1qxHtwffhfI4qMwptNW6KJEDuT6bduqO7?usp=sharing)
87
+
88
+
89
+ Download the above resources and arrange them in the following file structure:
90
+
91
+ ```text
92
+ MotionDiffuse
93
+ └── text2motion
94
+ ├── checkpoints
95
+ │ ├── kit
96
+ │ │ └── kit_motiondiffuse
97
+ │ │ ├── meta
98
+ │ │ │ ├── mean.npy
99
+ │ │ │ └── std.npy
100
+ │ │ ├── model
101
+ │ │ │ └── latest.tar
102
+ │ │ └── opt.txt
103
+ │ └── t2m
104
+ │ └── t2m_motiondiffuse
105
+ │ ├── meta
106
+ │ │ ├── mean.npy
107
+ │ │ └── std.npy
108
+ │ ├── model
109
+ │ │ └── latest.tar
110
+ │ └── opt.txt
111
+ └── data
112
+ ├── glove
113
+ │ ├── our_vab_data.npy
114
+ │ ├── our_vab_idx.pkl
115
+ │ └── out_vab_words.pkl
116
+ ├── pretrained_models
117
+ │ ├── kit
118
+ │ │ └── text_mot_match
119
+ │ │ └── model
120
+ │ │ └── finest.tar
121
+ │ └── t2m
122
+ │ │ ├── text_mot_match
123
+ │ │ │ └── model
124
+ │ │ │ └── finest.tar
125
+ │ │ └── length_est_bigru
126
+ │ │ └── model
127
+ │ │ └── finest.tar
128
+ ├── HumanML3D
129
+ │ ├── new_joint_vecs
130
+ │ │ └── ...
131
+ │ ├── new_joints
132
+ │ │ └── ...
133
+ │ ├── texts
134
+ │ │ └── ...
135
+ │ ├── Mean.npy
136
+ │ ├── Std.npy
137
+ │ ├── test.txt
138
+ │ ├── train_val.txt
139
+ │ ├── train.txt
140
+ │ └── val.txt
141
+ └── KIT-ML
142
+ ├── new_joint_vecs
143
+ │ └── ...
144
+ ├── new_joints
145
+ │ └── ...
146
+ ├── texts
147
+ │ └── ...
148
+ ├── Mean.npy
149
+ ├── Std.npy
150
+ ├── test.txt
151
+ ├── train_val.txt
152
+ ├── train.txt
153
+ └── val.txt
154
+ ```
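A small helper to confirm the layout above is in place before training or evaluation (paths are copied from the tree; the root is an assumption, adjust it to your checkout):

```python
# Sketch: verify the expected checkpoint/data files from the tree above exist.
from pathlib import Path

root = Path("MotionDiffuse/text2motion")  # adjust to your checkout
required = [
    "checkpoints/t2m/t2m_motiondiffuse/meta/mean.npy",
    "checkpoints/t2m/t2m_motiondiffuse/meta/std.npy",
    "checkpoints/t2m/t2m_motiondiffuse/model/latest.tar",
    "data/pretrained_models/t2m/text_mot_match/model/finest.tar",
    "data/glove/our_vab_data.npy",
    "data/HumanML3D/Mean.npy",
]
missing = [p for p in required if not (root / p).exists()]
print("all present" if not missing else f"missing: {missing}")
```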
text2motion/jobscript.sh ADDED
@@ -0,0 +1,40 @@
1
+ #!/bin/sh
2
+ ### General options
3
+ ### –- specify queue --
4
+ #BSUB -q gpuv100
5
+ ### -- set the job Name --
6
+ #BSUB -J fulem_2g_excl_196
7
+ ### -- ask for number of cores (default: 1) --
8
+ #BSUB -n 8
9
+ ### -- specify that the cores must be on the same host --
10
+ #BSUB -R "span[hosts=1]"
11
+ ### -- Select the resources: 1 gpu in exclusive process mode --
12
+ #BSUB -gpu "num=2:mode=exclusive_process"
13
+ ### -- set walltime limit: hh:mm -- maximum 24 hours for GPU-queues right now
14
+ #BSUB -W 15:00
15
+ # request 5GB of system-memory
16
+ #BSUB -R "rusage[mem=5GB]"
17
+ ### -- set the email address --
18
+ # please uncomment the following line and put in your e-mail address,
19
+ # if you want to receive e-mail notifications on a non-default address
20
+ #BSUB -u s222376@dtu.dk
21
+ ### -- send notification at start --
22
+ #BSUB -B
23
+ ### -- send notification at completion--
24
+ #BSUB -N
25
+ ### -- Specify the output and error file. %J is the job-id --
26
+ ### -- -o and -e mean append, -oo and -eo mean overwrite --
27
+ #BSUB -o gpu_%J.out
28
+ #BSUB -e gpu_%J.err
29
+ # -- end of LSF options --
30
+
31
+ nvidia-smi
32
+ # Load the cuda module
33
+
34
+ module load cuda/10.1
35
+ module load cudnn/v7.6.5.32-prod-cuda-10.1
36
+
37
+ echo "checking python bin location"
38
+ which python3
39
+ echo "training txt2motion diffusion model..."
40
+ make train EXP=fulem_2g_excl_196
text2motion/models/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .transformer import MotionTransformer
2
+ from .gaussian_diffusion import GaussianDiffusion
3
+
4
+ __all__ = ['MotionTransformer', 'GaussianDiffusion']
text2motion/models/gaussian_diffusion.py ADDED
@@ -0,0 +1,1147 @@
1
+ """
2
+ This code is borrowed from https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/gaussian_diffusion.py
3
+ """
4
+
5
+ import enum
6
+ import math
7
+ from abc import ABC, abstractmethod
8
+
9
+ import numpy as np
10
+ import torch as th
11
+ import torch.distributed as dist
12
+
13
+
14
+ def create_named_schedule_sampler(name, diffusion):
15
+ """
16
+ Create a ScheduleSampler from a library of pre-defined samplers.
17
+ :param name: the name of the sampler.
18
+ :param diffusion: the diffusion object to sample for.
19
+ """
20
+ if name == "uniform":
21
+ return UniformSampler(diffusion)
22
+ elif name == "loss-second-moment":
23
+ return LossSecondMomentResampler(diffusion)
24
+ else:
25
+ raise NotImplementedError(f"unknown schedule sampler: {name}")
26
+
27
+
28
+ class ScheduleSampler(ABC):
29
+ """
30
+ A distribution over timesteps in the diffusion process, intended to reduce
31
+ variance of the objective.
32
+ By default, samplers perform unbiased importance sampling, in which the
33
+ objective's mean is unchanged.
34
+ However, subclasses may override sample() to change how the resampled
35
+ terms are reweighted, allowing for actual changes in the objective.
36
+ """
37
+
38
+ @abstractmethod
39
+ def weights(self):
40
+ """
41
+ Get a numpy array of weights, one per diffusion step.
42
+ The weights needn't be normalized, but must be positive.
43
+ """
44
+
45
+ def sample(self, batch_size, device):
46
+ """
47
+ Importance-sample timesteps for a batch.
48
+ :param batch_size: the number of timesteps.
49
+ :param device: the torch device to save to.
50
+ :return: a tuple (timesteps, weights):
51
+ - timesteps: a tensor of timestep indices.
52
+ - weights: a tensor of weights to scale the resulting losses.
53
+ """
54
+ w = self.weights()
55
+ p = w / np.sum(w)
56
+ indices_np = np.random.choice(len(p), size=(batch_size,), p=p)
57
+ indices = th.from_numpy(indices_np).long().to(device)
58
+ weights_np = 1 / (len(p) * p[indices_np])
59
+ weights = th.from_numpy(weights_np).float().to(device)
60
+ return indices, weights
61
+
62
+
63
+ class UniformSampler(ScheduleSampler):
64
+ def __init__(self, diffusion):
65
+ self.diffusion = diffusion
66
+ self._weights = np.ones([diffusion.num_timesteps])
67
+
68
+ def weights(self):
69
+ return self._weights
70
+
71
+
72
+ class LossAwareSampler(ScheduleSampler):
73
+ def update_with_local_losses(self, local_ts, local_losses):
74
+ """
75
+ Update the reweighting using losses from a model.
76
+ Call this method from each rank with a batch of timesteps and the
77
+ corresponding losses for each of those timesteps.
78
+ This method will perform synchronization to make sure all of the ranks
79
+ maintain the exact same reweighting.
80
+ :param local_ts: an integer Tensor of timesteps.
81
+ :param local_losses: a 1D Tensor of losses.
82
+ """
83
+ batch_sizes = [
84
+ th.tensor([0], dtype=th.int32, device=local_ts.device)
85
+ for _ in range(dist.get_world_size())
86
+ ]
87
+ dist.all_gather(
88
+ batch_sizes,
89
+ th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
90
+ )
91
+
92
+ # Pad all_gather batches to be the maximum batch size.
93
+ batch_sizes = [x.item() for x in batch_sizes]
94
+ max_bs = max(batch_sizes)
95
+
96
+ timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes]
97
+ loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes]
98
+ dist.all_gather(timestep_batches, local_ts)
99
+ dist.all_gather(loss_batches, local_losses)
100
+ timesteps = [
101
+ x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
102
+ ]
103
+ losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
104
+ self.update_with_all_losses(timesteps, losses)
105
+
106
+ @abstractmethod
107
+ def update_with_all_losses(self, ts, losses):
108
+ """
109
+ Update the reweighting using losses from a model.
110
+ Sub-classes should override this method to update the reweighting
111
+ using losses from the model.
112
+ This method directly updates the reweighting without synchronizing
113
+ between workers. It is called by update_with_local_losses from all
114
+ ranks with identical arguments. Thus, it should have deterministic
115
+ behavior to maintain state across workers.
116
+ :param ts: a list of int timesteps.
117
+ :param losses: a list of float losses, one per timestep.
118
+ """
119
+
120
+
121
+ class LossSecondMomentResampler(LossAwareSampler):
122
+ def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
123
+ self.diffusion = diffusion
124
+ self.history_per_term = history_per_term
125
+ self.uniform_prob = uniform_prob
126
+ self._loss_history = np.zeros(
127
+ [diffusion.num_timesteps, history_per_term], dtype=np.float64
128
+ )
129
+ self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int)
130
+
131
+ def weights(self):
132
+ if not self._warmed_up():
133
+ return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
134
+ weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
135
+ weights /= np.sum(weights)
136
+ weights *= 1 - self.uniform_prob
137
+ weights += self.uniform_prob / len(weights)
138
+ return weights
139
+
140
+ def update_with_all_losses(self, ts, losses):
141
+ for t, loss in zip(ts, losses):
142
+ if self._loss_counts[t] == self.history_per_term:
143
+ # Shift out the oldest loss term.
144
+ self._loss_history[t, :-1] = self._loss_history[t, 1:]
145
+ self._loss_history[t, -1] = loss
146
+ else:
147
+ self._loss_history[t, self._loss_counts[t]] = loss
148
+ self._loss_counts[t] += 1
149
+
150
+ def _warmed_up(self):
151
+ return (self._loss_counts == self.history_per_term).all()
152
+
153
+
154
+ def mean_flat(tensor):
155
+ """
156
+ Take the mean over all non-batch dimensions.
157
+ """
158
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
159
+
160
+
161
+ def normal_kl(mean1, logvar1, mean2, logvar2):
162
+ """
163
+ Compute the KL divergence between two gaussians.
164
+ Shapes are automatically broadcasted, so batches can be compared to
165
+ scalars, among other use cases.
166
+ """
167
+ tensor = None
168
+ for obj in (mean1, logvar1, mean2, logvar2):
169
+ if isinstance(obj, th.Tensor):
170
+ tensor = obj
171
+ break
172
+ assert tensor is not None, "at least one argument must be a Tensor"
173
+
174
+ # Force variances to be Tensors. Broadcasting helps convert scalars to
175
+ # Tensors, but it does not work for th.exp().
176
+ logvar1, logvar2 = [
177
+ x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor)
178
+ for x in (logvar1, logvar2)
179
+ ]
180
+
181
+ return 0.5 * (
182
+ -1.0
183
+ + logvar2
184
+ - logvar1
185
+ + th.exp(logvar1 - logvar2)
186
+ + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
187
+ )
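The return expression above is the closed-form KL divergence between two diagonal Gaussians, written in terms of log-variances:

```latex
D_{\mathrm{KL}}\big(\mathcal{N}(\mu_1,\sigma_1^2)\,\|\,\mathcal{N}(\mu_2,\sigma_2^2)\big)
= \tfrac{1}{2}\Big(-1 + \log\sigma_2^2 - \log\sigma_1^2
+ e^{\log\sigma_1^2-\log\sigma_2^2}
+ (\mu_1-\mu_2)^2\, e^{-\log\sigma_2^2}\Big)
```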
188
+
189
+
190
+ def approx_standard_normal_cdf(x):
191
+ """
192
+ A fast approximation of the cumulative distribution function of the
193
+ standard normal.
194
+ """
195
+ return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))
196
+
197
+
198
+ def discretized_gaussian_log_likelihood(x, *, means, log_scales):
199
+ """
200
+ Compute the log-likelihood of a Gaussian distribution discretizing to a
201
+ given image.
202
+ :param x: the target images. It is assumed that this was uint8 values,
203
+ rescaled to the range [-1, 1].
204
+ :param means: the Gaussian mean Tensor.
205
+ :param log_scales: the Gaussian log stddev Tensor.
206
+ :return: a tensor like x of log probabilities (in nats).
207
+ """
208
+ assert x.shape == means.shape == log_scales.shape
209
+ centered_x = x - means
210
+ inv_stdv = th.exp(-log_scales)
211
+ plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
212
+ cdf_plus = approx_standard_normal_cdf(plus_in)
213
+ min_in = inv_stdv * (centered_x - 1.0 / 255.0)
214
+ cdf_min = approx_standard_normal_cdf(min_in)
215
+ log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
216
+ log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
217
+ cdf_delta = cdf_plus - cdf_min
218
+ log_probs = th.where(
219
+ x < -0.999,
220
+ log_cdf_plus,
221
+ th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
222
+ )
223
+ assert log_probs.shape == x.shape
224
+ return log_probs
225
+
226
+
227
+ def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
228
+ """
229
+ Get a pre-defined beta schedule for the given name.
230
+
231
+ The beta schedule library consists of beta schedules which remain similar
232
+ in the limit of num_diffusion_timesteps.
233
+ Beta schedules may be added, but should not be removed or changed once
234
+ they are committed to maintain backwards compatibility.
235
+ """
236
+ if schedule_name == "linear":
237
+ # Linear schedule from Ho et al, extended to work for any number of
238
+ # diffusion steps.
239
+ scale = 1000 / num_diffusion_timesteps
240
+ beta_start = scale * 0.0001
241
+ beta_end = scale * 0.02
242
+ return np.linspace(
243
+ beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64
244
+ )
245
+ elif schedule_name == "cosine":
246
+ return betas_for_alpha_bar(
247
+ num_diffusion_timesteps,
248
+ lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
249
+ )
250
+ else:
251
+ raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
252
+
253
+
254
+ def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
255
+ """
256
+ Create a beta schedule that discretizes the given alpha_t_bar function,
257
+ which defines the cumulative product of (1-beta) over time from t = [0,1].
258
+
259
+ :param num_diffusion_timesteps: the number of betas to produce.
260
+ :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
261
+ produces the cumulative product of (1-beta) up to that
262
+ part of the diffusion process.
263
+ :param max_beta: the maximum beta to use; use values lower than 1 to
264
+ prevent singularities.
265
+ """
266
+ betas = []
267
+ for i in range(num_diffusion_timesteps):
268
+ t1 = i / num_diffusion_timesteps
269
+ t2 = (i + 1) / num_diffusion_timesteps
270
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
271
+ return np.array(betas)
272
+
273
+
274
+ class ModelMeanType(enum.Enum):
275
+ """
276
+ Which type of output the model predicts.
277
+ """
278
+
279
+ PREVIOUS_X = enum.auto() # the model predicts x_{t-1}
280
+ START_X = enum.auto() # the model predicts x_0
281
+ EPSILON = enum.auto() # the model predicts epsilon
282
+
283
+
284
+ class ModelVarType(enum.Enum):
285
+ """
286
+ What is used as the model's output variance.
287
+
288
+ The LEARNED_RANGE option has been added to allow the model to predict
289
+ values between FIXED_SMALL and FIXED_LARGE, making its job easier.
290
+ """
291
+
292
+ LEARNED = enum.auto()
293
+ FIXED_SMALL = enum.auto()
294
+ FIXED_LARGE = enum.auto()
295
+ LEARNED_RANGE = enum.auto()
296
+
297
+
298
+ class LossType(enum.Enum):
299
+ MSE = enum.auto() # use raw MSE loss (and KL when learning variances)
300
+ RESCALED_MSE = (
301
+ enum.auto()
302
+ ) # use raw MSE loss (with RESCALED_KL when learning variances)
303
+ KL = enum.auto() # use the variational lower-bound
304
+ RESCALED_KL = enum.auto() # like KL, but rescale to estimate the full VLB
305
+
306
+ def is_vb(self):
307
+ return self == LossType.KL or self == LossType.RESCALED_KL
308
+
309
+
310
+ class GaussianDiffusion:
311
+ """
312
+ Utilities for training and sampling diffusion models.
313
+
314
+ Ported directly from here, and then adapted over time to further experimentation.
315
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
316
+
317
+ :param betas: a 1-D numpy array of betas for each diffusion timestep,
318
+ starting at T and going to 1.
319
+ :param model_mean_type: a ModelMeanType determining what the model outputs.
320
+ :param model_var_type: a ModelVarType determining how variance is output.
321
+ :param loss_type: a LossType determining the loss function to use.
322
+ :param rescale_timesteps: if True, pass floating point timesteps into the
323
+ model so that they are always scaled like in the
324
+ original paper (0 to 1000).
325
+ """
326
+
327
+ def __init__(
328
+ self,
329
+ *,
330
+ betas,
331
+ model_mean_type,
332
+ model_var_type,
333
+ loss_type,
334
+ rescale_timesteps=False,
335
+ ):
336
+ self.model_mean_type = model_mean_type
337
+ self.model_var_type = model_var_type
338
+ self.loss_type = loss_type
339
+ self.rescale_timesteps = rescale_timesteps
340
+
341
+ # Use float64 for accuracy.
342
+ betas = np.array(betas, dtype=np.float64)
343
+ self.betas = betas
344
+ assert len(betas.shape) == 1, "betas must be 1-D"
345
+ assert (betas > 0).all() and (betas <= 1).all()
346
+
347
+ self.num_timesteps = int(betas.shape[0])
348
+
349
+ alphas = 1.0 - betas
350
+ self.alphas_cumprod = np.cumprod(alphas, axis=0)
351
+ self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
352
+ self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
353
+ assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
354
+
355
+ # calculations for diffusion q(x_t | x_{t-1}) and others
356
+ self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
357
+ self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
358
+ self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
359
+ self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
360
+ self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
361
+
362
+ # calculations for posterior q(x_{t-1} | x_t, x_0)
363
+ self.posterior_variance = (
364
+ betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
365
+ )
366
+ # log calculation clipped because the posterior variance is 0 at the
367
+ # beginning of the diffusion chain.
368
+ self.posterior_log_variance_clipped = np.log(
369
+ np.append(self.posterior_variance[1], self.posterior_variance[1:])
370
+ )
371
+ self.posterior_mean_coef1 = (
372
+ betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
373
+ )
374
+ self.posterior_mean_coef2 = (
375
+ (1.0 - self.alphas_cumprod_prev)
376
+ * np.sqrt(alphas)
377
+ / (1.0 - self.alphas_cumprod)
378
+ )
379
+
380
+ def q_mean_variance(self, x_start, t):
381
+ """
382
+ Get the distribution q(x_t | x_0).
383
+
384
+ :param x_start: the [N x C x ...] tensor of noiseless inputs.
385
+ :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
386
+ :return: A tuple (mean, variance, log_variance), all of x_start's shape.
387
+ """
388
+ mean = (
389
+ _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
390
+ )
391
+ variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
392
+ log_variance = _extract_into_tensor(
393
+ self.log_one_minus_alphas_cumprod, t, x_start.shape
394
+ )
395
+ return mean, variance, log_variance
396
+
397
+ def q_sample(self, x_start, t, noise=None):
398
+ """
399
+ Diffuse the data for a given number of diffusion steps.
400
+
401
+ In other words, sample from q(x_t | x_0).
402
+
403
+ :param x_start: the initial data batch.
404
+ :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
405
+ :param noise: if specified, the split-out normal noise.
406
+ :return: A noisy version of x_start.
407
+ """
408
+ if noise is None:
409
+ noise = th.randn_like(x_start)
410
+ assert noise.shape == x_start.shape
411
+ return (
412
+ _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
413
+ + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
414
+ * noise
415
+ )
416
+
417
+ def q_posterior_mean_variance(self, x_start, x_t, t):
418
+ """
419
+ Compute the mean and variance of the diffusion posterior:
420
+
421
+ q(x_{t-1} | x_t, x_0)
422
+
423
+ """
424
+ assert x_start.shape == x_t.shape
425
+ posterior_mean = (
426
+ _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
427
+ + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
428
+ )
429
+ posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
430
+ posterior_log_variance_clipped = _extract_into_tensor(
431
+ self.posterior_log_variance_clipped, t, x_t.shape
432
+ )
433
+ assert (
434
+ posterior_mean.shape[0]
435
+ == posterior_variance.shape[0]
436
+ == posterior_log_variance_clipped.shape[0]
437
+ == x_start.shape[0]
438
+ )
439
+ return posterior_mean, posterior_variance, posterior_log_variance_clipped
440
+
441
+ def p_mean_variance(
442
+ self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None
443
+ ):
444
+ """
445
+ Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
446
+ the initial x, x_0.
447
+
448
+ :param model: the model, which takes a signal and a batch of timesteps
449
+ as input.
450
+ :param x: the [N x C x ...] tensor at time t.
451
+ :param t: a 1-D Tensor of timesteps.
452
+ :param clip_denoised: if True, clip the denoised signal into [-1, 1].
453
+ :param denoised_fn: if not None, a function which applies to the
454
+ x_start prediction before it is used to sample. Applies before
455
+ clip_denoised.
456
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
457
+ pass to the model. This can be used for conditioning.
458
+ :return: a dict with the following keys:
459
+ - 'mean': the model mean output.
460
+ - 'variance': the model variance output.
461
+ - 'log_variance': the log of 'variance'.
462
+ - 'pred_xstart': the prediction for x_0.
463
+ """
464
+ if model_kwargs is None:
465
+ model_kwargs = {}
466
+
467
+ B, C = x.shape[:2]
468
+ assert t.shape == (B,)
469
+ model_output = model(x, self._scale_timesteps(t), **model_kwargs)
470
+
471
+ if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
472
+ assert model_output.shape == (B, 2 * C, *x.shape[2:])
473
+ model_output, model_var_values = th.split(model_output, C, dim=1)
474
+ if self.model_var_type == ModelVarType.LEARNED:
475
+ model_log_variance = model_var_values
476
+ model_variance = th.exp(model_log_variance)
477
+ else:
478
+ min_log = _extract_into_tensor(
479
+ self.posterior_log_variance_clipped, t, x.shape
480
+ )
481
+ max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
482
+ # The model_var_values is [-1, 1] for [min_var, max_var].
483
+ frac = (model_var_values + 1) / 2
484
+ model_log_variance = frac * max_log + (1 - frac) * min_log
485
+ model_variance = th.exp(model_log_variance)
486
+ else:
487
+ model_variance, model_log_variance = {
488
+ # for fixedlarge, we set the initial (log-)variance like so
489
+ # to get a better decoder log likelihood.
490
+ ModelVarType.FIXED_LARGE: (
491
+ np.append(self.posterior_variance[1], self.betas[1:]),
492
+ np.log(np.append(self.posterior_variance[1], self.betas[1:])),
493
+ ),
494
+ ModelVarType.FIXED_SMALL: (
495
+ self.posterior_variance,
496
+ self.posterior_log_variance_clipped,
497
+ ),
498
+ }[self.model_var_type]
499
+ model_variance = _extract_into_tensor(model_variance, t, x.shape)
500
+ model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
501
+
502
+ def process_xstart(x):
503
+ if denoised_fn is not None:
504
+ x = denoised_fn(x)
505
+ if clip_denoised:
506
+ return x.clamp(-1, 1)
507
+ return x
508
+
509
+ if self.model_mean_type == ModelMeanType.PREVIOUS_X:
510
+ pred_xstart = process_xstart(
511
+ self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output)
512
+ )
513
+ model_mean = model_output
514
+ elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]:
515
+ if self.model_mean_type == ModelMeanType.START_X:
516
+ pred_xstart = process_xstart(model_output)
517
+ else:
518
+ pred_xstart = process_xstart(
519
+ self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
520
+ )
521
+ model_mean, _, _ = self.q_posterior_mean_variance(
522
+ x_start=pred_xstart, x_t=x, t=t
523
+ )
524
+ else:
525
+ raise NotImplementedError(self.model_mean_type)
526
+
527
+ assert (
528
+ model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
529
+ )
530
+ return {
531
+ "mean": model_mean,
532
+ "variance": model_variance,
533
+ "log_variance": model_log_variance,
534
+ "pred_xstart": pred_xstart,
535
+ }
536
+
537
+ def _predict_xstart_from_eps(self, x_t, t, eps):
538
+ assert x_t.shape == eps.shape
539
+ return (
540
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
541
+ - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
542
+ )
543
+
544
+ def _predict_xstart_from_xprev(self, x_t, t, xprev):
545
+ assert x_t.shape == xprev.shape
546
+ return ( # (xprev - coef2*x_t) / coef1
547
+ _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev
548
+ - _extract_into_tensor(
549
+ self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape
550
+ )
551
+ * x_t
552
+ )
553
+
554
+ def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
555
+ return (
556
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
557
+ - pred_xstart
558
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
559
+
560
+ def _scale_timesteps(self, t):
561
+ if self.rescale_timesteps:
562
+ return t.float() * (1000.0 / self.num_timesteps)
563
+ return t
564
+
565
+ def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
566
+ """
567
+ Compute the mean for the previous step, given a function cond_fn that
568
+ computes the gradient of a conditional log probability with respect to
569
+ x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
570
+ condition on y.
571
+
572
+ This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
573
+ """
574
+ gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs)
575
+ new_mean = (
576
+ p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
577
+ )
578
+ return new_mean
579
+
580
+ def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
581
+ """
582
+ Compute what the p_mean_variance output would have been, should the
583
+ model's score function be conditioned by cond_fn.
584
+
585
+ See condition_mean() for details on cond_fn.
586
+
587
+ Unlike condition_mean(), this instead uses the conditioning strategy
588
+ from Song et al (2020).
589
+ """
590
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
591
+
592
+ eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
593
+ eps = eps - (1 - alpha_bar).sqrt() * cond_fn(
594
+ x, self._scale_timesteps(t), **model_kwargs
595
+ )
596
+
597
+ out = p_mean_var.copy()
598
+ out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
599
+ out["mean"], _, _ = self.q_posterior_mean_variance(
600
+ x_start=out["pred_xstart"], x_t=x, t=t
601
+ )
602
+ return out
603
+
604
+ def p_sample(
605
+ self,
606
+ model,
607
+ x,
608
+ t,
609
+ clip_denoised=True,
610
+ denoised_fn=None,
611
+ cond_fn=None,
612
+ pre_seq=None,
613
+ transl_req=None,
614
+ model_kwargs=None,
615
+ ):
616
+ """
617
+ Sample x_{t-1} from the model at the given timestep.
618
+
619
+ :param model: the model to sample from.
620
+ :param x: the current tensor at x_{t-1}.
621
+ :param t: the value of t, starting at 0 for the first diffusion step.
622
+ :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
623
+ :param denoised_fn: if not None, a function which applies to the
624
+ x_start prediction before it is used to sample.
625
+ :param cond_fn: if not None, this is a gradient function that acts
626
+ similarly to the model.
627
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
628
+ pass to the model. This can be used for conditioning.
629
+ :return: a dict containing the following keys:
630
+ - 'sample': a random sample from the model.
631
+ - 'pred_xstart': a prediction of x_0.
632
+ """
633
+ # concat seq
634
+ if pre_seq is not None:
635
+ T = pre_seq.shape[2]
636
+ noise = th.randn_like(pre_seq)
637
+ x_t = self.q_sample(pre_seq, t, noise=noise)
638
+ x[:, :, :T] = x_t
639
+
640
+ if transl_req is not None:
641
+ for item in transl_req:
642
+ noise = th.randn(2).type_as(x)
643
+ transl = th.Tensor(item[1:]).type_as(x)
644
+ x_t = self.q_sample(transl, t, noise=noise)
645
+ x[:, :2, item[0]] = x_t
646
+
647
+ out = self.p_mean_variance(
648
+ model,
649
+ x,
650
+ t,
651
+ clip_denoised=clip_denoised,
652
+ denoised_fn=denoised_fn,
653
+ model_kwargs=model_kwargs,
654
+ )
655
+ noise = th.randn_like(x)
656
+ nonzero_mask = (
657
+ (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
658
+ ) # no noise when t == 0
659
+ if cond_fn is not None:
660
+ out["mean"] = self.condition_mean(
661
+ cond_fn, out, x, t, model_kwargs=model_kwargs
662
+ )
663
+ sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
664
+ return {"sample": sample, "pred_xstart": out["pred_xstart"]}
665
+
666
+ def p_sample_loop(
667
+ self,
668
+ model,
669
+ shape,
670
+ noise=None,
671
+ clip_denoised=True,
672
+ denoised_fn=None,
673
+ cond_fn=None,
674
+ model_kwargs=None,
675
+ device=None,
676
+ pre_seq=None,
677
+ transl_req=None,
678
+ progress=False,
679
+ ):
680
+ """
681
+ Generate samples from the model.
682
+
683
+ :param model: the model module.
684
+ :param shape: the shape of the samples, (N, C, H, W).
685
+ :param noise: if specified, the noise from the encoder to sample.
686
+ Should be of the same shape as `shape`.
687
+ :param clip_denoised: if True, clip x_start predictions to [-1, 1].
688
+ :param denoised_fn: if not None, a function which applies to the
689
+ x_start prediction before it is used to sample.
690
+ :param cond_fn: if not None, this is a gradient function that acts
691
+ similarly to the model.
692
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
693
+ pass to the model. This can be used for conditioning.
694
+ :param device: if specified, the device to create the samples on.
695
+ If not specified, use a model parameter's device.
696
+ :param progress: if True, show a tqdm progress bar.
697
+ :return: a non-differentiable batch of samples.
698
+ """
699
+ final = None
700
+ for sample in self.p_sample_loop_progressive(
701
+ model,
702
+ shape,
703
+ noise=noise,
704
+ clip_denoised=clip_denoised,
705
+ denoised_fn=denoised_fn,
706
+ cond_fn=cond_fn,
707
+ model_kwargs=model_kwargs,
708
+ device=device,
709
+ pre_seq=pre_seq,
710
+ transl_req=transl_req,
711
+ progress=progress,
712
+ ):
713
+ final = sample
714
+ return final["sample"]
715
+
716
+ def p_sample_loop_progressive(
717
+ self,
718
+ model,
719
+ shape,
720
+ noise=None,
721
+ clip_denoised=True,
722
+ denoised_fn=None,
723
+ cond_fn=None,
724
+ model_kwargs=None,
725
+ device=None,
726
+ pre_seq=None,
727
+ transl_req=None,
728
+ progress=False,
729
+ ):
730
+ """
731
+ Generate samples from the model and yield intermediate samples from
732
+ each timestep of diffusion.
733
+
734
+ Arguments are the same as p_sample_loop().
735
+ Returns a generator over dicts, where each dict is the return value of
736
+ p_sample().
737
+ """
738
+ if device is None:
739
+ device = next(model.parameters()).device
740
+ assert isinstance(shape, (tuple, list))
741
+ if noise is not None:
742
+ img = noise
743
+ else:
744
+ img = th.randn(*shape, device=device)
745
+ indices = list(range(self.num_timesteps))[::-1]
746
+ if progress:
747
+ # Lazy import so that we don't depend on tqdm.
748
+ from tqdm.auto import tqdm
749
+
750
+ indices = tqdm(indices)
751
+
752
+ for i in indices:
753
+ t = th.tensor([i] * shape[0], device=device)
754
+ with th.no_grad():
755
+ out = self.p_sample(
756
+ model,
757
+ img,
758
+ t,
759
+ clip_denoised=clip_denoised,
760
+ denoised_fn=denoised_fn,
761
+ cond_fn=cond_fn,
762
+ model_kwargs=model_kwargs,
763
+ pre_seq=pre_seq,
764
+ transl_req=transl_req
765
+ )
766
+ yield out
767
+ img = out["sample"]
768
+ # write sample to /work3/s222376/MotionDiffuse2/text2motion/generation_samples
769
+ # write torch tensor to numpy array
770
+ with open(f'/work3/s222376/MotionDiffuse2/text2motion/generation_samples/sample_{t}.npy', 'wb') as f:
771
+ np.save(f, img.cpu().numpy())
772
+
773
+ def ddim_sample(
774
+ self,
775
+ model,
776
+ x,
777
+ t,
778
+ clip_denoised=True,
779
+ denoised_fn=None,
780
+ cond_fn=None,
781
+ model_kwargs=None,
782
+ eta=0.0,
783
+ ):
784
+ """
785
+ Sample x_{t-1} from the model using DDIM.
786
+
787
+ Same usage as p_sample().
788
+ """
789
+ out = self.p_mean_variance(
790
+ model,
791
+ x,
792
+ t,
793
+ clip_denoised=clip_denoised,
794
+ denoised_fn=denoised_fn,
795
+ model_kwargs=model_kwargs,
796
+ )
797
+ if cond_fn is not None:
798
+ out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
799
+
800
+ # Usually our model outputs epsilon, but we re-derive it
801
+ # in case we used x_start or x_prev prediction.
802
+ eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
803
+
804
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
805
+ alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
806
+ sigma = (
807
+ eta
808
+ * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
809
+ * th.sqrt(1 - alpha_bar / alpha_bar_prev)
810
+ )
811
+ # Equation 12.
812
+ noise = th.randn_like(x)
813
+ mean_pred = (
814
+ out["pred_xstart"] * th.sqrt(alpha_bar_prev)
815
+ + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
816
+ )
817
+ nonzero_mask = (
818
+ (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
819
+ ) # no noise when t == 0
820
+ sample = mean_pred + nonzero_mask * sigma * noise
821
+ return {"sample": sample, "pred_xstart": out["pred_xstart"]}
822
+
823
+ def ddim_reverse_sample(
824
+ self,
825
+ model,
826
+ x,
827
+ t,
828
+ clip_denoised=True,
829
+ denoised_fn=None,
830
+ model_kwargs=None,
831
+ eta=0.0,
832
+ ):
833
+ """
834
+ Sample x_{t+1} from the model using DDIM reverse ODE.
835
+ """
836
+ assert eta == 0.0, "Reverse ODE only for deterministic path"
837
+ out = self.p_mean_variance(
838
+ model,
839
+ x,
840
+ t,
841
+ clip_denoised=clip_denoised,
842
+ denoised_fn=denoised_fn,
843
+ model_kwargs=model_kwargs,
844
+ )
845
+ # Usually our model outputs epsilon, but we re-derive it
846
+ # in case we used x_start or x_prev prediction.
847
+ eps = (
848
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
849
+ - out["pred_xstart"]
850
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
851
+ alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
852
+
853
+ # Equation 12. reversed
854
+ mean_pred = (
855
+ out["pred_xstart"] * th.sqrt(alpha_bar_next)
856
+ + th.sqrt(1 - alpha_bar_next) * eps
857
+ )
858
+
859
+ return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
860
+
861
+ def ddim_sample_loop(
862
+ self,
863
+ model,
864
+ shape,
865
+ noise=None,
866
+ clip_denoised=True,
867
+ denoised_fn=None,
868
+ cond_fn=None,
869
+ model_kwargs=None,
870
+ device=None,
871
+ progress=False,
872
+ eta=0.0,
873
+ ):
874
+ """
875
+ Generate samples from the model using DDIM.
876
+
877
+ Same usage as p_sample_loop().
878
+ """
879
+ final = None
880
+ for sample in self.ddim_sample_loop_progressive(
881
+ model,
882
+ shape,
883
+ noise=noise,
884
+ clip_denoised=clip_denoised,
885
+ denoised_fn=denoised_fn,
886
+ cond_fn=cond_fn,
887
+ model_kwargs=model_kwargs,
888
+ device=device,
889
+ progress=progress,
890
+ eta=eta,
891
+ ):
892
+ final = sample
893
+ return final["sample"]
894
+
895
+ def ddim_sample_loop_progressive(
896
+ self,
897
+ model,
898
+ shape,
899
+ noise=None,
900
+ clip_denoised=True,
901
+ denoised_fn=None,
902
+ cond_fn=None,
903
+ model_kwargs=None,
904
+ device=None,
905
+ progress=False,
906
+ eta=0.0,
907
+ ):
908
+ """
909
+ Use DDIM to sample from the model and yield intermediate samples from
910
+ each timestep of DDIM.
911
+
912
+ Same usage as p_sample_loop_progressive().
913
+ """
914
+ if device is None:
915
+ device = next(model.parameters()).device
916
+ assert isinstance(shape, (tuple, list))
917
+ if noise is not None:
918
+ img = noise
919
+ else:
920
+ img = th.randn(*shape, device=device)
921
+ indices = list(range(self.num_timesteps))[::-1]
922
+
923
+ if progress:
924
+ # Lazy import so that we don't depend on tqdm.
925
+ from tqdm.auto import tqdm
926
+
927
+ indices = tqdm(indices)
928
+
929
+ for i in indices:
930
+ t = th.tensor([i] * shape[0], device=device)
931
+ with th.no_grad():
932
+ out = self.ddim_sample(
933
+ model,
934
+ img,
935
+ t,
936
+ clip_denoised=clip_denoised,
937
+ denoised_fn=denoised_fn,
938
+ cond_fn=cond_fn,
939
+ model_kwargs=model_kwargs,
940
+ eta=eta,
941
+ )
942
+ yield out
943
+ img = out["sample"]
944
+
945
+ def _vb_terms_bpd(
946
+ self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
947
+ ):
948
+ """
949
+ Get a term for the variational lower-bound.
950
+
951
+ The resulting units are bits (rather than nats, as one might expect).
952
+ This allows for comparison to other papers.
953
+
954
+ :return: a dict with the following keys:
955
+ - 'output': a shape [N] tensor of NLLs or KLs.
956
+ - 'pred_xstart': the x_0 predictions.
957
+ """
958
+ true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
959
+ x_start=x_start, x_t=x_t, t=t
960
+ )
961
+ out = self.p_mean_variance(
962
+ model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
963
+ )
964
+ kl = normal_kl(
965
+ true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
966
+ )
967
+ kl = mean_flat(kl) / np.log(2.0)
968
+
969
+ decoder_nll = -discretized_gaussian_log_likelihood(
970
+ x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
971
+ )
972
+ assert decoder_nll.shape == x_start.shape
973
+ decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
974
+
975
+ # At the first timestep return the decoder NLL,
976
+ # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
977
+ output = th.where((t == 0), decoder_nll, kl)
978
+ return {"output": output, "pred_xstart": out["pred_xstart"]}
979
+
980
+ def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
981
+ """
982
+ Compute training losses for a single timestep.
983
+
984
+ :param model: the model to evaluate loss on.
985
+ :param x_start: the [N x C x ...] tensor of inputs.
986
+ :param t: a batch of timestep indices.
987
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
988
+ pass to the model. This can be used for conditioning.
989
+ :param noise: if specified, the specific Gaussian noise to try to remove.
990
+ :return: a dict with the key "loss" containing a tensor of shape [N].
991
+ Some mean or variance settings may also have other keys.
992
+ """
993
+ if model_kwargs is None:
994
+ model_kwargs = {}
995
+ if noise is None:
996
+ noise = th.randn_like(x_start)
997
+ x_t = self.q_sample(x_start, t, noise=noise)
998
+
999
+ terms = {}
1000
+
1001
+ if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
1002
+ terms["loss"] = self._vb_terms_bpd(
1003
+ model=model,
1004
+ x_start=x_start,
1005
+ x_t=x_t,
1006
+ t=t,
1007
+ clip_denoised=False,
1008
+ model_kwargs=model_kwargs,
1009
+ )["output"]
1010
+ if self.loss_type == LossType.RESCALED_KL:
1011
+ terms["loss"] *= self.num_timesteps
1012
+ elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
1013
+ model_output = model(x_t, self._scale_timesteps(t), **model_kwargs)
1014
+
1015
+ if self.model_var_type in [
1016
+ ModelVarType.LEARNED,
1017
+ ModelVarType.LEARNED_RANGE,
1018
+ ]:
1019
+ B, C = x_t.shape[:2]
1020
+ assert model_output.shape == (B, C * 2, *x_t.shape[2:])
1021
+ model_output, model_var_values = th.split(model_output, C, dim=1)
1022
+ # Learn the variance using the variational bound, but don't let
1023
+ # it affect our mean prediction.
1024
+ frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
1025
+ terms["vb"] = self._vb_terms_bpd(
1026
+ model=lambda *args, r=frozen_out: r,
1027
+ x_start=x_start,
1028
+ x_t=x_t,
1029
+ t=t,
1030
+ clip_denoised=False,
1031
+ )["output"]
1032
+ if self.loss_type == LossType.RESCALED_MSE:
1033
+ # Divide by 1000 for equivalence with initial implementation.
1034
+ # Without a factor of 1/1000, the VB term hurts the MSE term.
1035
+ terms["vb"] *= self.num_timesteps / 1000.0
1036
+
1037
+ target = {
1038
+ ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
1039
+ x_start=x_start, x_t=x_t, t=t
1040
+ )[0],
1041
+ ModelMeanType.START_X: x_start,
1042
+ ModelMeanType.EPSILON: noise,
1043
+ }[self.model_mean_type]
1044
+ assert model_output.shape == target.shape == x_start.shape
1045
+ terms["mse"] = mean_flat((target - model_output) ** 2).view(-1, 1).mean(-1)
1046
+ # if "vb" in terms:
1047
+ # terms["loss"] = terms["mse"] + terms["vb"]
1048
+ # else:
1049
+ # terms["loss"] = terms["mse"]
1050
+ terms["target"] = target
1051
+ terms["pred"] = model_output
1052
+ else:
1053
+ raise NotImplementedError(self.loss_type)
1054
+
1055
+ return terms
1056
+
1057
+ def _prior_bpd(self, x_start):
1058
+ """
1059
+ Get the prior KL term for the variational lower-bound, measured in
1060
+ bits-per-dim.
1061
+
1062
+ This term can't be optimized, as it only depends on the encoder.
1063
+
1064
+ :param x_start: the [N x C x ...] tensor of inputs.
1065
+ :return: a batch of [N] KL values (in bits), one per batch element.
1066
+ """
1067
+ batch_size = x_start.shape[0]
1068
+ t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
1069
+ qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
1070
+ kl_prior = normal_kl(
1071
+ mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
1072
+ )
1073
+ return mean_flat(kl_prior) / np.log(2.0)
1074
+
1075
+ def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
1076
+ """
1077
+ Compute the entire variational lower-bound, measured in bits-per-dim,
1078
+ as well as other related quantities.
1079
+
1080
+ :param model: the model to evaluate loss on.
1081
+ :param x_start: the [N x C x ...] tensor of inputs.
1082
+ :param clip_denoised: if True, clip denoised samples.
1083
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
1084
+ pass to the model. This can be used for conditioning.
1085
+
1086
+ :return: a dict containing the following keys:
1087
+ - total_bpd: the total variational lower-bound, per batch element.
1088
+ - prior_bpd: the prior term in the lower-bound.
1089
+ - vb: an [N x T] tensor of terms in the lower-bound.
1090
+ - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
1091
+ - mse: an [N x T] tensor of epsilon MSEs for each timestep.
1092
+ """
1093
+ device = x_start.device
1094
+ batch_size = x_start.shape[0]
1095
+
1096
+ vb = []
1097
+ xstart_mse = []
1098
+ mse = []
1099
+ for t in list(range(self.num_timesteps))[::-1]:
1100
+ t_batch = th.tensor([t] * batch_size, device=device)
1101
+ noise = th.randn_like(x_start)
1102
+ x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
1103
+ # Calculate VLB term at the current timestep
1104
+ with th.no_grad():
1105
+ out = self._vb_terms_bpd(
1106
+ model,
1107
+ x_start=x_start,
1108
+ x_t=x_t,
1109
+ t=t_batch,
1110
+ clip_denoised=clip_denoised,
1111
+ model_kwargs=model_kwargs,
1112
+ )
1113
+ vb.append(out["output"])
1114
+ xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
1115
+ eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
1116
+ mse.append(mean_flat((eps - noise) ** 2))
1117
+
1118
+ vb = th.stack(vb, dim=1)
1119
+ xstart_mse = th.stack(xstart_mse, dim=1)
1120
+ mse = th.stack(mse, dim=1)
1121
+
1122
+ prior_bpd = self._prior_bpd(x_start)
1123
+ total_bpd = vb.sum(dim=1) + prior_bpd
1124
+ return {
1125
+ "total_bpd": total_bpd,
1126
+ "prior_bpd": prior_bpd,
1127
+ "vb": vb,
1128
+ "xstart_mse": xstart_mse,
1129
+ "mse": mse,
1130
+ }
1131
+
1132
+
1133
+ def _extract_into_tensor(arr, timesteps, broadcast_shape):
1134
+ """
1135
+ Extract values from a 1-D numpy array for a batch of indices.
1136
+
1137
+ :param arr: the 1-D numpy array.
1138
+ :param timesteps: a tensor of indices into the array to extract.
1139
+ :param broadcast_shape: a larger shape of K dimensions with the batch
1140
+ dimension equal to the length of timesteps.
1141
+ :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
1142
+ """
1143
+ res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
1144
+ while len(res.shape) < len(broadcast_shape):
1145
+ res = res[..., None]
1146
+ return res.expand(broadcast_shape)
1147
+
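
Illustrative usage (not part of this commit): a minimal sketch of how the utilities above fit together, assuming it runs with this file importable as models.gaussian_diffusion and that mean_flat is defined earlier in the file, as in the original guided-diffusion port. The toy model stands in for MotionTransformer, and the sizes and schedule choice are made up. Note that in this fork training_losses returns "mse"/"target"/"pred" rather than a combined "loss", and p_sample_loop_progressive writes intermediate samples to a hard-coded /work3/... path, so sampling is not exercised here.

import torch as th
from models.gaussian_diffusion import (
    GaussianDiffusion, ModelMeanType, ModelVarType, LossType, get_named_beta_schedule,
)

betas = get_named_beta_schedule("cosine", num_diffusion_timesteps=1000)
diffusion = GaussianDiffusion(
    betas=betas,
    model_mean_type=ModelMeanType.EPSILON,   # the model predicts the added noise
    model_var_type=ModelVarType.FIXED_SMALL,
    loss_type=LossType.MSE,
)

x_start = th.randn(4, 60, 8)                          # (batch, frames, features), toy sizes
t = th.randint(0, diffusion.num_timesteps, (4,))      # one random timestep per element
x_t = diffusion.q_sample(x_start, t)                  # forward-noise the clean motion

toy_model = lambda x, t_scaled, **kw: th.zeros_like(x)  # stand-in for MotionTransformer
terms = diffusion.training_losses(toy_model, x_start, t)
print(terms["mse"].shape)                              # per-sample epsilon MSE, shape (4,)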
text2motion/models/transformer.py ADDED
@@ -0,0 +1,429 @@
1
+ """
2
+ Copyright 2021 S-Lab
3
+ """
4
+
5
+ from cv2 import norm
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from torch import layer_norm, nn
9
+ import numpy as np
10
+ import clip
11
+
12
+ import math
13
+
14
+
15
+ def timestep_embedding(timesteps, dim, max_period=10000):
16
+ """
17
+ Create sinusoidal timestep embeddings.
18
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
19
+ These may be fractional.
20
+ :param dim: the dimension of the output.
21
+ :param max_period: controls the minimum frequency of the embeddings.
22
+ :return: an [N x dim] Tensor of positional embeddings.
23
+ """
24
+ half = dim // 2
25
+ freqs = torch.exp(
26
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
27
+ ).to(device=timesteps.device)
28
+ args = timesteps[:, None].float() * freqs[None]
29
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
30
+ if dim % 2:
31
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
32
+ return embedding
33
+
34
+
35
+ def set_requires_grad(nets, requires_grad=False):
36
+ """Set requires_grad for all the networks.
37
+
38
+ Args:
39
+ nets (nn.Module | list[nn.Module]): A list of networks or a single
40
+ network.
41
+ requires_grad (bool): Whether the networks require gradients or not
42
+ """
43
+ if not isinstance(nets, list):
44
+ nets = [nets]
45
+ for net in nets:
46
+ if net is not None:
47
+ for param in net.parameters():
48
+ param.requires_grad = requires_grad
49
+
50
+
51
+ def zero_module(module):
52
+ """
53
+ Zero out the parameters of a module and return it.
54
+ """
55
+ for p in module.parameters():
56
+ p.detach().zero_()
57
+ return module
58
+
59
+
60
+ class StylizationBlock(nn.Module):
61
+
62
+ def __init__(self, latent_dim, time_embed_dim, dropout):
63
+ super().__init__()
64
+ self.emb_layers = nn.Sequential(
65
+ nn.SiLU(),
66
+ nn.Linear(time_embed_dim, 2 * latent_dim),
67
+ )
68
+ self.norm = nn.LayerNorm(latent_dim)
69
+ self.out_layers = nn.Sequential(
70
+ nn.SiLU(),
71
+ nn.Dropout(p=dropout),
72
+ zero_module(nn.Linear(latent_dim, latent_dim)),
73
+ )
74
+
75
+ def forward(self, h, emb):
76
+ """
77
+ h: B, T, D
78
+ emb: B, D
79
+ """
80
+ # B, 1, 2D
81
+ emb_out = self.emb_layers(emb).unsqueeze(1)
82
+ # scale: B, 1, D / shift: B, 1, D
83
+ scale, shift = torch.chunk(emb_out, 2, dim=2)
84
+ h = self.norm(h) * (1 + scale) + shift
85
+ h = self.out_layers(h)
86
+ return h
87
+
88
+
89
+ class LinearTemporalSelfAttention(nn.Module):
90
+
91
+ def __init__(self, seq_len, latent_dim, num_head, dropout, time_embed_dim):
92
+ super().__init__()
93
+ self.num_head = num_head
94
+ self.norm = nn.LayerNorm(latent_dim)
95
+ self.query = nn.Linear(latent_dim, latent_dim)
96
+ self.key = nn.Linear(latent_dim, latent_dim)
97
+ self.value = nn.Linear(latent_dim, latent_dim)
98
+ self.dropout = nn.Dropout(dropout)
99
+ self.proj_out = StylizationBlock(latent_dim, time_embed_dim, dropout)
100
+
101
+ def forward(self, x, emb, src_mask):
102
+ """
103
+ x: B, T, D
104
+ """
105
+ B, T, D = x.shape
106
+ H = self.num_head
107
+ # B, T, D
108
+ query = self.query(self.norm(x))
109
+ # B, T, D
110
+ key = (self.key(self.norm(x)) + (1 - src_mask) * -1000000)
111
+ query = F.softmax(query.view(B, T, H, -1), dim=-1)
112
+ key = F.softmax(key.view(B, T, H, -1), dim=1)
113
+ # B, T, H, HD
114
+ value = (self.value(self.norm(x)) * src_mask).view(B, T, H, -1)
115
+ # B, H, HD, HD
116
+ attention = torch.einsum('bnhd,bnhl->bhdl', key, value)
117
+ y = torch.einsum('bnhd,bhdl->bnhl', query, attention).reshape(B, T, D)
118
+ y = x + self.proj_out(y, emb)
119
+ return y
120
+
121
+
122
+ class LinearTemporalCrossAttention(nn.Module):
123
+
124
+ def __init__(self, seq_len, latent_dim, text_latent_dim, num_head, dropout, time_embed_dim):
125
+ super().__init__()
126
+ self.num_head = num_head
127
+ self.norm = nn.LayerNorm(latent_dim)
128
+ self.text_norm = nn.LayerNorm(text_latent_dim)
129
+ self.query = nn.Linear(latent_dim, latent_dim)
130
+ self.key = nn.Linear(text_latent_dim, latent_dim)
131
+ self.value = nn.Linear(text_latent_dim, latent_dim)
132
+ self.dropout = nn.Dropout(dropout)
133
+ self.proj_out = StylizationBlock(latent_dim, time_embed_dim, dropout)
134
+
135
+ def forward(self, x, xf, emb):
136
+ """
137
+ x: B, T, D
138
+ xf: B, N, L
139
+ """
140
+ B, T, D = x.shape
141
+ N = xf.shape[1]
142
+ H = self.num_head
143
+ # B, T, D
144
+ query = self.query(self.norm(x))
145
+ # B, N, D
146
+ key = self.key(self.text_norm(xf))
147
+ query = F.softmax(query.view(B, T, H, -1), dim=-1)
148
+ key = F.softmax(key.view(B, N, H, -1), dim=1)
149
+ # B, N, H, HD
150
+ value = self.value(self.text_norm(xf)).view(B, N, H, -1)
151
+ # B, H, HD, HD
152
+ attention = torch.einsum('bnhd,bnhl->bhdl', key, value)
153
+ y = torch.einsum('bnhd,bhdl->bnhl', query, attention).reshape(B, T, D)
154
+ y = x + self.proj_out(y, emb)
155
+ return y
156
+
157
+ class FFN(nn.Module):
158
+
159
+ def __init__(self, latent_dim, ffn_dim, dropout, time_embed_dim):
160
+ super().__init__()
161
+ self.linear1 = nn.Linear(latent_dim, ffn_dim)
162
+ self.linear2 = zero_module(nn.Linear(ffn_dim, latent_dim))
163
+ self.activation = nn.GELU()
164
+ self.dropout = nn.Dropout(dropout)
165
+ self.proj_out = StylizationBlock(latent_dim, time_embed_dim, dropout)
166
+
167
+ def forward(self, x, emb):
168
+ y = self.linear2(self.dropout(self.activation(self.linear1(x))))
169
+ y = x + self.proj_out(y, emb)
170
+ return y
171
+
172
+
173
+ class LinearTemporalDiffusionTransformerDecoderLayer(nn.Module):
174
+
175
+ def __init__(self,
176
+ seq_len=60,
177
+ latent_dim=32,
178
+ text_latent_dim=512,
179
+ time_embed_dim=128,
180
+ ffn_dim=256,
181
+ num_head=4,
182
+ dropout=0.1):
183
+ super().__init__()
184
+ self.sa_block = LinearTemporalSelfAttention(
185
+ seq_len, latent_dim, num_head, dropout, time_embed_dim)
186
+ self.ca_block = LinearTemporalCrossAttention(
187
+ seq_len, latent_dim, text_latent_dim, num_head, dropout, time_embed_dim)
188
+ self.ffn = FFN(latent_dim, ffn_dim, dropout, time_embed_dim)
189
+
190
+ def forward(self, x, xf, emb, src_mask):
191
+ x = self.sa_block(x, emb, src_mask)
192
+ x = self.ca_block(x, xf, emb)
193
+ x = self.ffn(x, emb)
194
+ return x
195
+
196
+ class TemporalSelfAttention(nn.Module):
197
+
198
+ def __init__(self, seq_len, latent_dim, num_head, dropout, time_embed_dim):
199
+ super().__init__()
200
+ self.num_head = num_head
201
+ self.norm = nn.LayerNorm(latent_dim)
202
+ self.query = nn.Linear(latent_dim, latent_dim)
203
+ self.key = nn.Linear(latent_dim, latent_dim)
204
+ self.value = nn.Linear(latent_dim, latent_dim)
205
+ self.dropout = nn.Dropout(dropout)
206
+ self.proj_out = StylizationBlock(latent_dim, time_embed_dim, dropout)
207
+
208
+ def forward(self, x, emb, src_mask):
209
+ """
210
+ x: B, T, D
211
+ """
212
+ B, T, D = x.shape
213
+ H = self.num_head
214
+ # B, T, 1, D
215
+ query = self.query(self.norm(x)).unsqueeze(2)
216
+ # B, 1, T, D
217
+ key = self.key(self.norm(x)).unsqueeze(1)
218
+ query = query.view(B, T, H, -1)
219
+ key = key.view(B, T, H, -1)
220
+ # B, T, T, H
221
+ attention = torch.einsum('bnhd,bmhd->bnmh', query, key) / math.sqrt(D // H)
222
+ attention = attention + (1 - src_mask.unsqueeze(-1)) * -100000
223
+ weight = self.dropout(F.softmax(attention, dim=2))
224
+ value = self.value(self.norm(x)).view(B, T, H, -1)
225
+ y = torch.einsum('bnmh,bmhd->bnhd', weight, value).reshape(B, T, D)
226
+ y = x + self.proj_out(y, emb)
227
+ return y
228
+
229
+ class TemporalCrossAttention(nn.Module):
230
+
231
+ def __init__(self, seq_len, latent_dim, text_latent_dim, num_head, dropout, time_embed_dim):
232
+ super().__init__()
233
+ self.num_head = num_head
234
+ self.norm = nn.LayerNorm(latent_dim)
235
+ self.text_norm = nn.LayerNorm(text_latent_dim)
236
+ self.query = nn.Linear(latent_dim, latent_dim)
237
+ self.key = nn.Linear(text_latent_dim, latent_dim)
238
+ self.value = nn.Linear(text_latent_dim, latent_dim)
239
+ self.dropout = nn.Dropout(dropout)
240
+ self.proj_out = StylizationBlock(latent_dim, time_embed_dim, dropout)
241
+
242
+ def forward(self, x, xf, emb):
243
+ """
244
+ x: B, T, D
245
+ xf: B, N, L
246
+ """
247
+ B, T, D = x.shape
248
+ N = xf.shape[1]
249
+ H = self.num_head
250
+ # B, T, 1, D
251
+ query = self.query(self.norm(x)).unsqueeze(2)
252
+ # B, 1, N, D
253
+ key = self.key(self.text_norm(xf)).unsqueeze(1)
254
+ query = query.view(B, T, H, -1)
255
+ key = key.view(B, N, H, -1)
256
+ # B, T, N, H
257
+ attention = torch.einsum('bnhd,bmhd->bnmh', query, key) / math.sqrt(D // H)
258
+ weight = self.dropout(F.softmax(attention, dim=2))
259
+ value = self.value(self.text_norm(xf)).view(B, N, H, -1)
260
+ y = torch.einsum('bnmh,bmhd->bnhd', weight, value).reshape(B, T, D)
261
+ y = x + self.proj_out(y, emb)
262
+ return y
263
+
264
+ class TemporalDiffusionTransformerDecoderLayer(nn.Module):
265
+
266
+ def __init__(self,
267
+ seq_len=60,
268
+ latent_dim=32,
269
+ text_latent_dim=512,
270
+ time_embed_dim=128,
271
+ ffn_dim=256,
272
+ num_head=4,
273
+ dropout=0.1):
274
+ super().__init__()
275
+ self.sa_block = TemporalSelfAttention(
276
+ seq_len, latent_dim, num_head, dropout, time_embed_dim)
277
+ self.ca_block = TemporalCrossAttention(
278
+ seq_len, latent_dim, text_latent_dim, num_head, dropout, time_embed_dim)
279
+ self.ffn = FFN(latent_dim, ffn_dim, dropout, time_embed_dim)
280
+
281
+ def forward(self, x, xf, emb, src_mask):
282
+ x = self.sa_block(x, emb, src_mask)
283
+ x = self.ca_block(x, xf, emb)
284
+ x = self.ffn(x, emb)
285
+ return x
286
+
287
+
288
+ class MotionTransformer(nn.Module):
289
+ def __init__(self,
290
+ input_feats,
291
+ num_frames=240,
292
+ latent_dim=512,
293
+ ff_size=1024,
294
+ num_layers=8,
295
+ num_heads=8,
296
+ dropout=0,
297
+ activation="gelu",
298
+ num_text_layers=4,
299
+ text_latent_dim=256,
300
+ text_ff_size=2048,
301
+ text_num_heads=4,
302
+ no_clip=False,
303
+ no_eff=False,
304
+ **kargs):
305
+ super().__init__()
306
+
307
+ self.num_frames = num_frames
308
+ self.latent_dim = latent_dim
309
+ self.ff_size = ff_size
310
+ self.num_layers = num_layers
311
+ self.num_heads = num_heads
312
+ self.dropout = dropout
313
+ self.activation = activation
314
+ self.input_feats = input_feats
315
+ self.time_embed_dim = latent_dim * 4
316
+ self.sequence_embedding = nn.Parameter(torch.randn(num_frames, latent_dim))
317
+
318
+ # Text Transformer
319
+ self.clip, _ = clip.load('ViT-B/32', "cpu")
320
+ if no_clip:
321
+ self.clip.initialize_parameters()
322
+ else:
323
+ set_requires_grad(self.clip, False)
324
+ if text_latent_dim != 512:
325
+ self.text_pre_proj = nn.Linear(512, text_latent_dim)
326
+ else:
327
+ self.text_pre_proj = nn.Identity()
328
+ textTransEncoderLayer = nn.TransformerEncoderLayer(
329
+ d_model=text_latent_dim,
330
+ nhead=text_num_heads,
331
+ dim_feedforward=text_ff_size,
332
+ dropout=dropout,
333
+ activation=activation)
334
+ self.textTransEncoder = nn.TransformerEncoder(
335
+ textTransEncoderLayer,
336
+ num_layers=num_text_layers)
337
+ self.text_ln = nn.LayerNorm(text_latent_dim)
338
+ self.text_proj = nn.Sequential(
339
+ nn.Linear(text_latent_dim, self.time_embed_dim)
340
+ )
341
+
342
+ # Input Embedding
343
+ self.joint_embed = nn.Linear(self.input_feats, self.latent_dim)
344
+
345
+ self.time_embed = nn.Sequential(
346
+ nn.Linear(self.latent_dim, self.time_embed_dim),
347
+ nn.SiLU(),
348
+ nn.Linear(self.time_embed_dim, self.time_embed_dim),
349
+ )
350
+ self.temporal_decoder_blocks = nn.ModuleList()
351
+ for i in range(num_layers):
352
+ if no_eff:
353
+ self.temporal_decoder_blocks.append(
354
+ TemporalDiffusionTransformerDecoderLayer(
355
+ seq_len=num_frames,
356
+ latent_dim=latent_dim,
357
+ text_latent_dim=text_latent_dim,
358
+ time_embed_dim=self.time_embed_dim,
359
+ ffn_dim=ff_size,
360
+ num_head=num_heads,
361
+ dropout=dropout
362
+ )
363
+ )
364
+ else:
365
+ self.temporal_decoder_blocks.append(
366
+ LinearTemporalDiffusionTransformerDecoderLayer(
367
+ seq_len=num_frames,
368
+ latent_dim=latent_dim,
369
+ text_latent_dim=text_latent_dim,
370
+ time_embed_dim=self.time_embed_dim,
371
+ ffn_dim=ff_size,
372
+ num_head=num_heads,
373
+ dropout=dropout
374
+ )
375
+ )
376
+
377
+ # Output Module
378
+ self.out = zero_module(nn.Linear(self.latent_dim, self.input_feats))
379
+
380
+ def encode_text(self, text, device):
381
+ with torch.no_grad():
382
+ text = clip.tokenize(text, truncate=True).to(device)
383
+ x = self.clip.token_embedding(text).type(self.clip.dtype) # [batch_size, n_ctx, d_model]
384
+
385
+ x = x + self.clip.positional_embedding.type(self.clip.dtype)
386
+ x = x.permute(1, 0, 2) # NLD -> LND
387
+ x = self.clip.transformer(x)
388
+ x = self.clip.ln_final(x).type(self.clip.dtype)
389
+
390
+ # T, B, D
391
+ x = self.text_pre_proj(x)
392
+ xf_out = self.textTransEncoder(x)
393
+ xf_out = self.text_ln(xf_out)
394
+ xf_proj = self.text_proj(xf_out[text.argmax(dim=-1), torch.arange(xf_out.shape[1])])
395
+ # B, T, D
396
+ xf_out = xf_out.permute(1, 0, 2)
397
+ return xf_proj, xf_out
398
+
399
+ def generate_src_mask(self, T, length):
400
+ B = len(length)
401
+ src_mask = torch.ones(B, T)
402
+ for i in range(B):
403
+ for j in range(length[i], T):
404
+ src_mask[i, j] = 0
405
+ return src_mask
406
+
407
+ def forward(self, x, timesteps, length=None, text=None, xf_proj=None, xf_out=None):
408
+ """
409
+ x: B, T, D
410
+ """
411
+ B, T = x.shape[0], x.shape[1]
412
+ if text is not None and len(text) != B:
413
+ index = x.device.index
414
+ text = text[index * B: index * B + B]
415
+ if xf_proj is None or xf_out is None:
416
+ xf_proj, xf_out = self.encode_text(text, x.device)
417
+
418
+ emb = self.time_embed(timestep_embedding(timesteps, self.latent_dim)) + xf_proj
419
+
420
+ # B, T, latent_dim
421
+ h = self.joint_embed(x)
422
+ h = h + self.sequence_embedding.unsqueeze(0)[:, :T, :]
423
+
424
+ src_mask = self.generate_src_mask(T, length).to(x.device).unsqueeze(-1)
425
+ for module in self.temporal_decoder_blocks:
426
+ h = module(h, xf_out, emb, src_mask)
427
+
428
+ output = self.out(h).view(B, T, -1).contiguous()
429
+ return output
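
Illustrative usage (not part of this commit): a minimal forward pass through MotionTransformer, assuming the CLIP ViT-B/32 weights can be downloaded and the module is importable as models.transformer. The feature dimension, lengths, and prompts below are made up; during diffusion training and sampling the output is interpreted as the predicted noise with the same shape as the input.

import torch
from models.transformer import MotionTransformer

model = MotionTransformer(input_feats=212, num_frames=196, latent_dim=512, num_layers=8)
model.eval()

B, T, D = 2, 60, 212                       # batch, frames, pose features (illustrative)
x = torch.randn(B, T, D)                   # noisy motion x_t
timesteps = torch.randint(0, 1000, (B,))
length = [60, 45]                          # valid frames per sample; the rest is masked out
text = ["a person walking happily", "a person lifting a small cube"]

with torch.no_grad():
    eps_pred = model(x, timesteps, length=length, text=text)
print(eps_pred.shape)                      # torch.Size([2, 60, 212])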
text2motion/options/base_options.py ADDED
@@ -0,0 +1,91 @@
1
+ import argparse
2
+ import os
3
+
4
+ import torch
5
+ import torch.distributed as dist
6
+ from mmcv.runner import get_dist_info, init_dist
7
+
8
+
9
+ class BaseOptions():
10
+ def __init__(self):
11
+ self.parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
12
+ self.initialized = False
13
+
14
+ def initialize(self):
15
+ self.parser.add_argument('--name', type=str, default="test", help='Name of this trial')
16
+ self.parser.add_argument('--decomp_name', type=str, default="Decomp_SP001_SM001_H512", help='Name of autoencoder model')
17
+
18
+ self.parser.add_argument("--gpu_id", type=int, nargs='+', default=(-1), help='GPU id')
19
+ self.parser.add_argument("--distributed", action="store_true", help='Whether to use DDP training')
20
+ self.parser.add_argument("--data_parallel", action="store_true", help="Whether to use DP training")
21
+
22
+ self.parser.add_argument('--dataset_name', type=str, default='t2m', help='Dataset Name')
23
+ self.parser.add_argument('--checkpoints_dir', type=str, default='./checkpoints', help='models are saved here')
24
+
25
+ self.parser.add_argument("--unit_length", type=int, default=4, help="Motions are cropped to the maximum times of unit_length")
26
+ self.parser.add_argument("--max_text_len", type=int, default=20, help="Maximum length of text description")
27
+
28
+ self.parser.add_argument('--text_enc_mod', type=str, default='bigru')
29
+ self.parser.add_argument('--estimator_mod', type=str, default='bigru')
30
+
31
+ self.parser.add_argument('--dim_text_hidden', type=int, default=512, help='Dimension of hidden unit in text encoder')
32
+ self.parser.add_argument('--dim_att_vec', type=int, default=512, help='Dimension of attention vector')
33
+ self.parser.add_argument('--dim_z', type=int, default=128, help='Dimension of latent Gaussian vector')
34
+
35
+ self.parser.add_argument('--n_layers_pri', type=int, default=1, help='Number of layers in prior network')
36
+ self.parser.add_argument('--n_layers_pos', type=int, default=1, help='Number of layers in posterior network')
37
+ self.parser.add_argument('--n_layers_dec', type=int, default=1, help='Number of layers in generator')
38
+
39
+ self.parser.add_argument('--dim_pri_hidden', type=int, default=1024, help='Dimension of hidden unit in prior network')
40
+ self.parser.add_argument('--dim_pos_hidden', type=int, default=1024, help='Dimension of hidden unit in posterior network')
41
+ self.parser.add_argument('--dim_dec_hidden', type=int, default=1024, help='Dimension of hidden unit in generator')
42
+
43
+ self.parser.add_argument('--dim_movement_enc_hidden', type=int, default=512,
44
+ help='Dimension of hidden in AutoEncoder(encoder)')
45
+ self.parser.add_argument('--dim_movement_dec_hidden', type=int, default=512,
46
+ help='Dimension of hidden in AutoEncoder(decoder)')
47
+ self.parser.add_argument('--dim_movement_latent', type=int, default=512, help='Dimension of motion snippet')
48
+
49
+ self.initialized = True
50
+
51
+
52
+
53
+ def parse(self):
54
+ if not self.initialized:
55
+ self.initialize()
56
+
57
+ self.opt = self.parser.parse_args()
58
+
59
+ self.opt.is_train = self.is_train
60
+
61
+ args = vars(self.opt)
62
+ if args["distributed"]:
63
+ init_dist('slurm')
64
+ rank, world_size = get_dist_info()
65
+ if args["distributed"]:
66
+ self.opt.gpu_id = range(world_size)
67
+ elif self.opt.gpu_id != (-1):
68
+ if len(self.opt.gpu_id) == 1:
69
+ torch.cuda.set_device(self.opt.gpu_id[0])
70
+ else:
71
+ assert args["data_parallel"] == False
72
+
73
+ if rank == 0:
74
+ print('------------ Options -------------')
75
+ for k, v in sorted(args.items()):
76
+ print('%s: %s' % (str(k), str(v)))
77
+ print('-------------- End ----------------')
78
+ if self.is_train:
79
+ # save to the disk
80
+ expr_dir = os.path.join(self.opt.checkpoints_dir, self.opt.dataset_name, self.opt.name)
81
+ if not os.path.exists(expr_dir):
82
+ os.makedirs(expr_dir)
83
+ file_name = os.path.join(expr_dir, 'opt.txt')
84
+ with open(file_name, 'wt') as opt_file:
85
+ opt_file.write('------------ Options -------------\n')
86
+ for k, v in sorted(args.items()):
87
+ opt_file.write('%s: %s\n' % (str(k), str(v)))
88
+ opt_file.write('-------------- End ----------------\n')
89
+ if world_size > 1:
90
+ dist.barrier()
91
+ return self.opt
text2motion/options/evaluate_options.py ADDED
@@ -0,0 +1,27 @@
1
+ from options.base_options import BaseOptions
2
+
3
+
4
+ class TestOptions(BaseOptions):
5
+ def initialize(self):
6
+ BaseOptions.initialize(self)
7
+ self.parser.add_argument('--batch_size', type=int, default=1, help='Batch size')
8
+ self.parser.add_argument('--start_mov_len', type=int, default=10)
9
+ self.parser.add_argument('--est_length', action="store_true", help="Whether to use sampled motion length")
10
+ self.parser.add_argument('--num_layers', type=int, default=8, help='num_layers of transformer')
11
+ self.parser.add_argument('--latent_dim', type=int, default=512, help='latent_dim of transformer')
12
+ self.parser.add_argument('--diffusion_steps', type=int, default=1000, help='diffusion_steps of transformer')
13
+ self.parser.add_argument('--no_clip', action='store_true', help='whether use clip pretrain')
14
+ self.parser.add_argument('--no_eff', action='store_true', help='whether use efficient attention')
15
+
16
+
17
+ self.parser.add_argument('--repeat_times', type=int, default=3, help="Number of generation rounds for each text description")
18
+ self.parser.add_argument('--split_file', type=str, default='test.txt')
19
+ self.parser.add_argument('--text', type=str, default="", help='Text description for motion generation')
20
+ self.parser.add_argument('--motion_length', type=int, default=0, help='Number of frames for motion generation')
21
+ self.parser.add_argument('--text_file', type=str, default="", help='Path of text description for motion generation')
22
+ self.parser.add_argument('--which_epoch', type=str, default="latest", help='Checkpoint that will be used')
23
+ self.parser.add_argument('--result_path', type=str, default="./eval_results/", help='Path to save generation results')
24
+ self.parser.add_argument('--num_results', type=int, default=40, help='Number of descriptions that will be used')
25
+ self.parser.add_argument('--ext', type=str, default='default', help='Save file path extension')
26
+
27
+ self.is_train = False
text2motion/options/train_options.py ADDED
@@ -0,0 +1,32 @@
1
+ import argparse
2
+
3
+ from options.base_options import BaseOptions
4
+
5
+
6
+ class TrainCompOptions(BaseOptions):
7
+ def initialize(self):
8
+ BaseOptions.initialize(self)
9
+ self.parser.add_argument('--num_layers', type=int, default=8, help='num_layers of transformer')
10
+ self.parser.add_argument('--latent_dim', type=int, default=512, help='latent_dim of transformer')
11
+ self.parser.add_argument('--diffusion_steps', type=int, default=1000, help='diffusion_steps of transformer')
12
+ self.parser.add_argument('--no_clip', action='store_true', help='whether use clip pretrain')
13
+ self.parser.add_argument('--no_eff', action='store_true', help='whether use efficient attention')
14
+
15
+ self.parser.add_argument('--num_epochs', type=int, default=50, help='Number of epochs')
16
+ self.parser.add_argument('--lr', type=float, default=2e-4, help='Learning rate')
17
+ self.parser.add_argument('--batch_size', type=int, default=32, help='Batch size per GPU')
18
+ self.parser.add_argument('--times', type=int, default=1, help='times of dataset')
19
+
20
+ self.parser.add_argument('--feat_bias', type=float, default=25, help='Scales for global motion features and foot contact')
21
+
22
+ self.parser.add_argument('--is_continue', action="store_true", help='Is this trial continued from a previous trial?')
23
+
24
+ self.parser.add_argument('--log_every', type=int, default=50, help='Frequency of printing training progress (by iteration)')
25
+ self.parser.add_argument('--save_every_e', type=int, default=5, help='Frequency of saving models (by epoch)')
26
+ self.parser.add_argument('--eval_every_e', type=int, default=5, help='Frequency of animation results (by epoch)')
27
+ self.parser.add_argument('--save_latest', type=int, default=500, help='Frequency of saving models (by iteration)')
28
+ self.parser.add_argument('--use_wandb', action='store_true', help='whether to log with wandb')
29
+ self.parser.add_argument('--wandb_user', type=str, default='text2motion', help='wandb user name')
30
+ self.parser.add_argument('--experiment_name', type=str, default='motiondiffuse', help='experiment name')
31
+ self.parser.add_argument('--seed', type=int, default=0, help='random seed')
32
+ self.is_train = True
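
Illustrative usage (not part of this commit): the flags above are normally supplied on the command line via the Makefile or jobscript; this sketch parses an equivalent argument list with the underlying argparse parser, skipping the side effects of .parse() (distributed init, writing opt.txt). It assumes mmcv is installed, since base_options imports it at module level, and the values shown are made up.

from options.train_options import TrainCompOptions

opts = TrainCompOptions()
opts.initialize()                          # registers the base + training arguments
opt = opts.parser.parse_args([
    "--name", "md_grab_demo",
    "--dataset_name", "grab",
    "--num_epochs", "50",
    "--batch_size", "32",
    "--seed", "42",
])
print(opt.name, opt.lr, opt.num_epochs)    # md_grab_demo 0.0002 50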
text2motion/requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ scipy
2
+ smplx
3
+ opencv-python
4
+ tqdm
5
+ matplotlib==3.3.1
6
+ spacy
7
+ pyglet==1.5.0
8
+ imageio
9
+ wandb
10
+ pyrender
11
+ git+https://github.com/openai/CLIP.git
text2motion/tools/__init__.py ADDED
File without changes
text2motion/tools/arguments.py ADDED
@@ -0,0 +1,37 @@
1
+ from argparse import ArgumentParser, Namespace
2
+
3
+ GEN_NS = Namespace()
4
+ GEN_NS
5
+
6
+ GEN_PARSER = ArgumentParser()
7
+ GEN_PARSER.add_argument('--opt_path', type=str, help='Opt path')
8
+ GEN_PARSER.add_argument('--text', type=str, default="", help='Text description for motion generation')
9
+ GEN_PARSER.add_argument('--motion_length', type=int, default=60, help='Number of frames for motion generation')
10
+ GEN_PARSER.add_argument('--result_path', type=str, default="test_sample.gif", help='Path to save generation result')
11
+ GEN_PARSER.add_argument('--npy_path', type=str, default="", help='Path to save 3D keypoints sequence')
12
+ GEN_PARSER.add_argument('--gpu_id', type=int, default=-1, help="which gpu to use")
13
+ GEN_PARSER.add_argument('--seed', type=int, default=0, help="random seed")
14
+
15
+
16
+ GEN_PARSER.add_argument("-dm", "--display_mesh", action='store_true', required=False, default=False, help="Display mesh if this flag is present")
17
+ # for now just specifies file name (with spaces) made by inference
18
+ GEN_PARSER.add_argument("-p", "--prompt", type=str, required=False, default="", help="Prompt for inference display",)
19
+ GEN_PARSER.add_argument("-sf", "--seq_file", type=str, required=False, default="", help="file for non-inference display",)
20
+ # add model_path arg
21
+ GEN_PARSER.add_argument("-m", "--model_path", type=str, required=False, default="", help="Path to model directory e.g. ./checkpoints/grab/grab_baseline_dp_2gpu_8layers_1000",)
22
+ GEN_PARSER.add_argument("-sg", "--save_gif", action='store_true', required=False, default=False, help="Saves gif")
23
+
24
+ def get_case_arguments(case_:str):
25
+ """
26
+ Returns the parsed GEN_PARSER for the indicated case
27
+ cases can be: 'train','evaluation' and 'generation'
28
+ """
29
+ args = {
30
+ 'generation': GEN_PARSER.parse_args("")
31
+ }
32
+
33
+ if case_ in ('train','evaluation'):
34
+ raise NotImplementedError(f'{case_} arguments have not been included yet, \
35
+ you can find them in their script file in tools/{case_}.py')
36
+
37
+ return args.get(case_, None)
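A minimal usage sketch for the helper above, assuming the text2motion directory is on the Python path so that tools.arguments is importable; only the 'generation' case is implemented, and parse_args("") means no command-line tokens are read:

    from tools.arguments import get_case_arguments

    # Returns the defaults declared on GEN_PARSER, e.g. motion_length=60, result_path="test_sample.gif"
    gen_args = get_case_arguments('generation')
    print(gen_args.motion_length, gen_args.result_path)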
text2motion/tools/evaluation.py ADDED
@@ -0,0 +1,278 @@
1
+ from datetime import datetime
2
+ import numpy as np
3
+ import torch
4
+ from datasets import get_dataset_motion_loader, get_motion_loader
5
+ from models import MotionTransformer
6
+ from utils.get_opt import get_opt
7
+ from utils.metrics import *
8
+ from datasets import EvaluatorModelWrapper
9
+ from collections import OrderedDict
10
+ from utils.plot_script import *
11
+ from utils import paramUtil
12
+ from utils.utils import *
13
+ from trainers import DDPMTrainer
14
+
15
+ from os.path import join as pjoin
16
+ import sys
17
+
18
+
19
+ def build_models(opt, dim_pose):
20
+ encoder = MotionTransformer(
21
+ input_feats=dim_pose,
22
+ num_frames=opt.max_motion_length,
23
+ num_layers=opt.num_layers,
24
+ latent_dim=opt.latent_dim,
25
+ no_clip=opt.no_clip,
26
+ no_eff=opt.no_eff)
27
+ return encoder
28
+
29
+
30
+ torch.multiprocessing.set_sharing_strategy('file_system')
31
+
32
+
33
+ def evaluate_matching_score(motion_loaders, file):
34
+ match_score_dict = OrderedDict({})
35
+ R_precision_dict = OrderedDict({})
36
+ activation_dict = OrderedDict({})
37
+ # print(motion_loaders.keys())
38
+ print('========== Evaluating Matching Score ==========')
39
+ for motion_loader_name, motion_loader in motion_loaders.items():
40
+ all_motion_embeddings = []
41
+ score_list = []
42
+ all_size = 0
43
+ matching_score_sum = 0
44
+ top_k_count = 0
45
+ # print(motion_loader_name)
46
+ with torch.no_grad():
47
+ for idx, batch in enumerate(motion_loader):
48
+ word_embeddings, pos_one_hots, _, sent_lens, motions, m_lens, _ = batch
49
+ text_embeddings, motion_embeddings = eval_wrapper.get_co_embeddings(
50
+ word_embs=word_embeddings,
51
+ pos_ohot=pos_one_hots,
52
+ cap_lens=sent_lens,
53
+ motions=motions,
54
+ m_lens=m_lens
55
+ )
56
+ dist_mat = euclidean_distance_matrix(text_embeddings.cpu().numpy(),
57
+ motion_embeddings.cpu().numpy())
58
+ matching_score_sum += dist_mat.trace()
59
+
60
+ argsmax = np.argsort(dist_mat, axis=1)
61
+ top_k_mat = calculate_top_k(argsmax, top_k=3)
62
+ top_k_count += top_k_mat.sum(axis=0)
63
+
64
+ all_size += text_embeddings.shape[0]
65
+
66
+ all_motion_embeddings.append(motion_embeddings.cpu().numpy())
67
+
68
+ all_motion_embeddings = np.concatenate(all_motion_embeddings, axis=0)
69
+ matching_score = matching_score_sum / all_size
70
+ R_precision = top_k_count / all_size
71
+ match_score_dict[motion_loader_name] = matching_score
72
+ R_precision_dict[motion_loader_name] = R_precision
73
+ activation_dict[motion_loader_name] = all_motion_embeddings
74
+
75
+ print(f'---> [{motion_loader_name}] Matching Score: {matching_score:.4f}')
76
+ print(f'---> [{motion_loader_name}] Matching Score: {matching_score:.4f}', file=file, flush=True)
77
+
78
+ line = f'---> [{motion_loader_name}] R_precision: '
79
+ for i in range(len(R_precision)):
80
+ line += '(top %d): %.4f ' % (i+1, R_precision[i])
81
+ print(line)
82
+ print(line, file=file, flush=True)
83
+
84
+ return match_score_dict, R_precision_dict, activation_dict
85
+
86
+
87
+ def evaluate_fid(groundtruth_loader, activation_dict, file):
88
+ eval_dict = OrderedDict({})
89
+ gt_motion_embeddings = []
90
+ print('========== Evaluating FID ==========')
91
+ with torch.no_grad():
92
+ for idx, batch in enumerate(groundtruth_loader):
93
+ _, _, _, sent_lens, motions, m_lens, _ = batch
94
+ motion_embeddings = eval_wrapper.get_motion_embeddings(
95
+ motions=motions,
96
+ m_lens=m_lens
97
+ )
98
+ gt_motion_embeddings.append(motion_embeddings.cpu().numpy())
99
+ gt_motion_embeddings = np.concatenate(gt_motion_embeddings, axis=0)
100
+ gt_mu, gt_cov = calculate_activation_statistics(gt_motion_embeddings)
101
+
102
+ # print(gt_mu)
103
+ for model_name, motion_embeddings in activation_dict.items():
104
+ mu, cov = calculate_activation_statistics(motion_embeddings)
105
+ # print(mu)
106
+ fid = calculate_frechet_distance(gt_mu, gt_cov, mu, cov)
107
+ print(f'---> [{model_name}] FID: {fid:.4f}')
108
+ print(f'---> [{model_name}] FID: {fid:.4f}', file=file, flush=True)
109
+ eval_dict[model_name] = fid
110
+ return eval_dict
111
+
112
+
113
+ def evaluate_diversity(activation_dict, file):
114
+ eval_dict = OrderedDict({})
115
+ print('========== Evaluating Diversity ==========')
116
+ for model_name, motion_embeddings in activation_dict.items():
117
+ diversity = calculate_diversity(motion_embeddings, diversity_times)
118
+ eval_dict[model_name] = diversity
119
+ print(f'---> [{model_name}] Diversity: {diversity:.4f}')
120
+ print(f'---> [{model_name}] Diversity: {diversity:.4f}', file=file, flush=True)
121
+ return eval_dict
122
+
123
+
124
+ def evaluate_multimodality(mm_motion_loaders, file):
125
+ eval_dict = OrderedDict({})
126
+ print('========== Evaluating MultiModality ==========')
127
+ for model_name, mm_motion_loader in mm_motion_loaders.items():
128
+ mm_motion_embeddings = []
129
+ with torch.no_grad():
130
+ for idx, batch in enumerate(mm_motion_loader):
131
+ # (1, mm_replications, dim_pos)
132
+ motions, m_lens = batch
133
+ motion_embeddings = eval_wrapper.get_motion_embeddings(motions[0], m_lens[0])
134
+ mm_motion_embeddings.append(motion_embeddings.unsqueeze(0))
135
+ if len(mm_motion_embeddings) == 0:
136
+ multimodality = 0
137
+ else:
138
+ mm_motion_embeddings = torch.cat(mm_motion_embeddings, dim=0).cpu().numpy()
139
+ multimodality = calculate_multimodality(mm_motion_embeddings, mm_num_times)
140
+ print(f'---> [{model_name}] Multimodality: {multimodality:.4f}')
141
+ print(f'---> [{model_name}] Multimodality: {multimodality:.4f}', file=file, flush=True)
142
+ eval_dict[model_name] = multimodality
143
+ return eval_dict
144
+
145
+
146
+ def get_metric_statistics(values):
147
+ mean = np.mean(values, axis=0)
148
+ std = np.std(values, axis=0)
149
+ conf_interval = 1.96 * std / np.sqrt(replication_times)
150
+ return mean, conf_interval
151
+
152
+
153
+ def evaluation(log_file):
154
+ with open(log_file, 'w') as f:
155
+ all_metrics = OrderedDict({'Matching Score': OrderedDict({}),
156
+ 'R_precision': OrderedDict({}),
157
+ 'FID': OrderedDict({}),
158
+ 'Diversity': OrderedDict({}),
159
+ 'MultiModality': OrderedDict({})})
160
+ for replication in range(replication_times):
161
+ motion_loaders = {}
162
+ mm_motion_loaders = {}
163
+ motion_loaders['ground truth'] = gt_loader
164
+ for motion_loader_name, motion_loader_getter in eval_motion_loaders.items():
165
+ motion_loader, mm_motion_loader = motion_loader_getter()
166
+ motion_loaders[motion_loader_name] = motion_loader
167
+ mm_motion_loaders[motion_loader_name] = mm_motion_loader
168
+
169
+ print(f'==================== Replication {replication} ====================')
170
+ print(f'==================== Replication {replication} ====================', file=f, flush=True)
171
+ print(f'Time: {datetime.now()}')
172
+ print(f'Time: {datetime.now()}', file=f, flush=True)
173
+ mat_score_dict, R_precision_dict, acti_dict = evaluate_matching_score(motion_loaders, f)
174
+
175
+ print(f'Time: {datetime.now()}')
176
+ print(f'Time: {datetime.now()}', file=f, flush=True)
177
+ fid_score_dict = evaluate_fid(gt_loader, acti_dict, f)
178
+
179
+ print(f'Time: {datetime.now()}')
180
+ print(f'Time: {datetime.now()}', file=f, flush=True)
181
+ div_score_dict = evaluate_diversity(acti_dict, f)
182
+
183
+ print(f'Time: {datetime.now()}')
184
+ print(f'Time: {datetime.now()}', file=f, flush=True)
185
+ mm_score_dict = evaluate_multimodality(mm_motion_loaders, f)
186
+
187
+ print(f'!!! DONE !!!')
188
+ print(f'!!! DONE !!!', file=f, flush=True)
189
+
190
+ for key, item in mat_score_dict.items():
191
+ if key not in all_metrics['Matching Score']:
192
+ all_metrics['Matching Score'][key] = [item]
193
+ else:
194
+ all_metrics['Matching Score'][key] += [item]
195
+
196
+ for key, item in R_precision_dict.items():
197
+ if key not in all_metrics['R_precision']:
198
+ all_metrics['R_precision'][key] = [item]
199
+ else:
200
+ all_metrics['R_precision'][key] += [item]
201
+
202
+ for key, item in fid_score_dict.items():
203
+ if key not in all_metrics['FID']:
204
+ all_metrics['FID'][key] = [item]
205
+ else:
206
+ all_metrics['FID'][key] += [item]
207
+
208
+ for key, item in div_score_dict.items():
209
+ if key not in all_metrics['Diversity']:
210
+ all_metrics['Diversity'][key] = [item]
211
+ else:
212
+ all_metrics['Diversity'][key] += [item]
213
+
214
+ for key, item in mm_score_dict.items():
215
+ if key not in all_metrics['MultiModality']:
216
+ all_metrics['MultiModality'][key] = [item]
217
+ else:
218
+ all_metrics['MultiModality'][key] += [item]
219
+
220
+
221
+ # print(all_metrics['Diversity'])
222
+ for metric_name, metric_dict in all_metrics.items():
223
+ print('========== %s Summary ==========' % metric_name)
224
+ print('========== %s Summary ==========' % metric_name, file=f, flush=True)
225
+
226
+ for model_name, values in metric_dict.items():
227
+ # print(metric_name, model_name)
228
+ mean, conf_interval = get_metric_statistics(np.array(values))
229
+ # print(mean, mean.dtype)
230
+ if isinstance(mean, np.float64) or isinstance(mean, np.float32):
231
+ print(f'---> [{model_name}] Mean: {mean:.4f} CInterval: {conf_interval:.4f}')
232
+ print(f'---> [{model_name}] Mean: {mean:.4f} CInterval: {conf_interval:.4f}', file=f, flush=True)
233
+ elif isinstance(mean, np.ndarray):
234
+ line = f'---> [{model_name}]'
235
+ for i in range(len(mean)):
236
+ line += '(top %d) Mean: %.4f CInt: %.4f;' % (i+1, mean[i], conf_interval[i])
237
+ print(line)
238
+ print(line, file=f, flush=True)
239
+
240
+
241
+ if __name__ == '__main__':
242
+ mm_num_samples = 100
243
+ mm_num_repeats = 30
244
+ mm_num_times = 10
245
+
246
+ diversity_times = 300
247
+ replication_times = 1
248
+ batch_size = 32
249
+ opt_path = sys.argv[1]
250
+ dataset_opt_path = opt_path
251
+
252
+ try:
253
+ device_id = int(sys.argv[2])
254
+ except:
255
+ device_id = 0
256
+ device = torch.device('cuda:%d' % device_id if torch.cuda.is_available() else 'cpu')
257
+ torch.cuda.set_device(device_id)
258
+
259
+ gt_loader, gt_dataset = get_dataset_motion_loader(dataset_opt_path, batch_size, device)
260
+ wrapper_opt = get_opt(dataset_opt_path, device)
261
+ eval_wrapper = EvaluatorModelWrapper(wrapper_opt)
262
+
263
+ opt = get_opt(opt_path, device)
264
+ encoder = build_models(opt, opt.dim_pose)
265
+ trainer = DDPMTrainer(opt, encoder)
266
+ eval_motion_loaders = {
267
+ 'text2motion': lambda: get_motion_loader(
268
+ opt,
269
+ batch_size,
270
+ trainer,
271
+ gt_dataset,
272
+ mm_num_samples,
273
+ mm_num_repeats
274
+ )
275
+ }
276
+
277
+ log_file = './t2m_evaluation.log'
278
+ evaluation(log_file)
text2motion/tools/inference.py ADDED
@@ -0,0 +1,105 @@
1
+ import argparse
2
+ import os
3
+ from os.path import join as pjoin
4
+
5
+ import numpy as np
6
+ import torch
7
+ import utils.paramUtil as paramUtil
8
+ from models import MotionTransformer
9
+ from torch.utils.data import DataLoader
10
+ from trainers import DDPMTrainer
11
+ from utils.get_opt import get_opt
12
+ from utils.motion_process import recover_from_ric
13
+ from utils.plot_script import *
14
+ from utils.utils import *
15
+ from utils.word_vectorizer import POS_enumerator
16
+
17
+
18
+ def plot_t2m(data, result_path, npy_path, caption, joints_n):
19
+ joint = recover_from_ric(torch.from_numpy(data).float(), joints_n).numpy()
+ return joint  # recovered 3D joint positions; rendering/saving is currently left out here
20
+
21
+ def get_numpy_file_path(prompt, epoch, n_frames):
22
+ # e.g. "airplane_fly_1_1000_60f.npy"
23
+ prompt_no_spaces = prompt.replace(' ', '_')
24
+ return f"{prompt_no_spaces}_{epoch}_{n_frames}f"
25
+
26
+ def get_wordvec_model(opt):
27
+ encoder = MotionTransformer(
28
+ input_feats=opt.dim_pose,
29
+ num_frames=opt.max_motion_length,
30
+ num_layers=opt.num_layers,
31
+ latent_dim=opt.latent_dim,
32
+ no_clip=opt.no_clip,
33
+ no_eff=opt.no_eff)
34
+ return encoder
35
+
36
+
37
+ if __name__ == '__main__':
38
+ print("inferencemake started")
39
+ parser = argparse.ArgumentParser()
40
+ parser.add_argument('--opt_path', type=str, help='Opt path')
41
+ parser.add_argument('--text', type=str, default="", help='Text description for motion generation')
42
+ parser.add_argument('--motion_length', type=int, default=60, help='Number of frames for motion generation')
43
+ parser.add_argument('--result_path', type=str, default="test_sample.gif", help='Path to save generation result')
44
+ parser.add_argument('--npy_path', type=str, default="", help='Path to save 3D keypoints sequence')
45
+ parser.add_argument('--gpu_id', type=int, default=-1, help="which gpu to use")
46
+ parser.add_argument('--seed', type=int, default=0, help="random seed")
47
+ # add which_epoch
48
+ parser.add_argument('--which_epoch', type=str, default="latest", help="which epoch to load")
49
+ args = parser.parse_args()
50
+
51
+ set_random_seed(args.seed)
52
+ print(f"set random seed to {args.seed}")
53
+ device = torch.device('cuda:%d' % args.gpu_id if args.gpu_id != -1 else 'cpu')
54
+ opt = get_opt(args.opt_path, device)
55
+ opt.do_denoise = True
56
+ opt.which_epoch = args.which_epoch
57
+
58
+ # TODO (elmc): re-enable this
59
+ # assert opt.dataset_name == "t2m"
60
+ # assert args.motion_length <= 196
61
+ # opt.data_root = './dataset/HumanML3D'
62
+ opt.data_root = './data/GRAB'
63
+ # opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs')
64
+ opt.text_dir = pjoin(opt.data_root, 'texts')
65
+ # TODO (elmc): re-enable this
66
+ # opt.joints_num = 22
67
+ # opt.dim_pose = 263
68
+ opt.dim_pose = 212
69
+ dim_word = 300
70
+ dim_pos_ohot = len(POS_enumerator)
71
+ # TODO (elmc): re-enable this
72
+ # num_classes = 200 // opt.unit_length
73
+
74
+ mean = np.load(pjoin(opt.meta_dir, 'mean.npy'))
75
+ std = np.load(pjoin(opt.meta_dir, 'std.npy'))
76
+
77
+ print("Loading word vectorizer...")
78
+ encoder = get_wordvec_model(opt).to(device)
79
+ print("Loading model...")
80
+ trainer = DDPMTrainer(opt, encoder)
81
+ trainer.load(pjoin(opt.model_dir, opt.which_epoch + '.tar'))
82
+
83
+ trainer.eval_mode()
84
+ trainer.to(opt.device)
85
+
86
+ result_dict = {}
87
+ with torch.no_grad():
88
+ if args.motion_length != -1:
89
+ caption = [args.text]
90
+ file_name = f"{opt.which_epoch}_{args.motion_length}f.npy"
91
+ m_lens = torch.LongTensor([args.motion_length]).to(device)
92
+ pred_motions = trainer.generate(caption, m_lens, opt.dim_pose)
93
+ motion = pred_motions[0].cpu().numpy()
94
+ motion = motion * std + mean
95
+ title = args.text + " #%d" % motion.shape[0]
96
+ print(f"trying to plot {title}")
97
+ # write motion to numpy file
98
+ if not os.path.exists(args.npy_path):
99
+ os.makedirs(args.npy_path)
100
+ full_npy_path = f"{args.npy_path}/{get_numpy_file_path(caption[0], opt.which_epoch, args.motion_length)}.npy"
101
+ with open(full_npy_path, 'wb') as f:
102
+ print(f"saving output to {full_npy_path}")
103
+ np.save(f, motion)
104
+
105
+ # plot_t2m(motion, args.result_path, args.npy_path, title)
text2motion/tools/train.py ADDED
@@ -0,0 +1,134 @@
1
+ import os
2
+ # print cwd
3
+ import sys
4
+ from os.path import join as pjoin
5
+
6
+ sys.path.append(os.getcwd())
7
+ import torch
8
+ import torch.distributed as dist
9
+ import utils.paramUtil as paramUtil
10
+ import wandb
11
+ from datasets import Text2MotionDataset
12
+ from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
13
+ from mmcv.runner import get_dist_info, init_dist
14
+ from models import MotionTransformer
15
+ from options.train_options import TrainCompOptions
16
+ from trainers import DDPMTrainer
17
+ from utils.plot_script import *
18
+ from utils.utils import *
19
+
20
+
21
+ def build_models(opt, dim_pose):
22
+ encoder = MotionTransformer(
23
+ input_feats=dim_pose,
24
+ num_frames=opt.max_motion_length,
25
+ num_layers=opt.num_layers,
26
+ latent_dim=opt.latent_dim,
27
+ no_clip=opt.no_clip,
28
+ no_eff=opt.no_eff)
29
+ return encoder
30
+
31
+
32
+ if __name__ == '__main__':
33
+
34
+ parser = TrainCompOptions()
35
+ opt = parser.parse()
36
+ rank, world_size = get_dist_info()
37
+
38
+ print(f"setting random seed to {opt.seed}")
39
+ set_random_seed(opt.seed)
40
+ opt.device = torch.device("cuda")
41
+ torch.autograd.set_detect_anomaly(True)
42
+ print(f"device id: {torch.cuda.current_device()}")
43
+ print(f"selected device ids: {opt.gpu_id}")
44
+ opt.save_root = pjoin(opt.checkpoints_dir, opt.dataset_name, opt.name)
45
+ opt.model_dir = pjoin(opt.save_root, 'model')
46
+ opt.meta_dir = pjoin(opt.save_root, 'meta')
47
+ opt.noise_dir = pjoin(opt.save_root, 'noise')
48
+
49
+ if rank == 0:
50
+ os.makedirs(opt.model_dir, exist_ok=True)
51
+ os.makedirs(opt.meta_dir, exist_ok=True)
52
+ os.makedirs(opt.noise_dir, exist_ok=True)
53
+ if world_size > 1:
54
+ dist.barrier()
55
+ if opt.use_wandb:
56
+ wandb_id = wandb.util.generate_id()
57
+ wandb.init(
58
+ project="text2motion",
59
+ name=f"{opt.experiment_name}",
60
+ entity=opt.wandb_user,
61
+ # notes=opt.EXPERIMENT_NOTE,
62
+ config=opt,
63
+ id=wandb_id,
64
+ resume="allow",
65
+ # monitor_gym=True,
66
+ sync_tensorboard=True,
67
+ )
68
+ # opt.wandb = wandb
69
+ if opt.dataset_name == 't2m':
70
+ opt.data_root = './data/HumanML3D'
71
+ opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs')
72
+ opt.text_dir = pjoin(opt.data_root, 'texts')
73
+ opt.joints_num = 22
74
+ radius = 4
75
+ fps = 20
76
+ opt.max_motion_length = 196
77
+ dim_pose = 263
78
+ kinematic_chain = paramUtil.t2m_kinematic_chain
79
+ elif opt.dataset_name == 'grab':
80
+ opt.data_root = './data/GRAB'
81
+ opt.motion_dir = pjoin(opt.data_root, 'joints')
82
+ opt.text_dir = pjoin(opt.data_root, 'texts')
83
+ opt.face_text_dir = pjoin(opt.data_root, 'face_texts')
84
+ opt.joints_num = 72 # TODO (elmc): verify this BUT ALSO I'M NOT USING IT FOR NOW!
85
+ # radius = 4 # TODO (elmc): verify this, think it's only for visualization purposes
86
+ # fps = 20 # TODO (elmc): verify this, also for visualization I think
87
+ dim_pose = 212 # drop betas (body shape) and face-shape from Motion data (via to_smplx_params & smplx_dict_to_array method)
88
+ opt.dim_pose = dim_pose
89
+ opt.max_motion_length = 196 # TODO (elmc): verify this; do this dynamically..??
90
+ # TODO (elmc): verify what this does and if we can use the t2m one
91
+ # NOTE: think, again, it's only for visualization
92
+ # kinematic_chain = paramUtil.t2m_kinematic_chain
93
+ # kinematic_chain = paramUtil.grab_kinematic_chain
94
+ print(f"loading data root: {opt.data_root}")
95
+ # print(f"kinematic chain: {kinematic_chain}")
96
+ elif opt.dataset_name == 'kit':
97
+ opt.data_root = './data/KIT-ML'
98
+ opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs')
99
+ opt.text_dir = pjoin(opt.data_root, 'texts')
100
+ opt.joints_num = 21
101
+ radius = 240 * 8
102
+ fps = 12.5
103
+ dim_pose = 251
104
+ opt.max_motion_length = 196
105
+ kinematic_chain = paramUtil.kit_kinematic_chain
106
+
107
+ else:
108
+ raise KeyError('Dataset Does Not Exist')
109
+
110
+ # TODO (elmc): check dim_word and add back in???
111
+ # dim_word = 300
112
+ mean = np.load(pjoin(opt.data_root, 'Mean.npy'))
113
+ std = np.load(pjoin(opt.data_root, 'Std.npy'))
114
+
115
+ train_split_file = pjoin(opt.data_root, 'train.txt')
116
+ print(f"cwd is {os.getcwd()}")
117
+ print(f"train_split_file: {train_split_file}")
118
+ encoder = build_models(opt, dim_pose)
119
+ if world_size > 1:
120
+ encoder = MMDistributedDataParallel(
121
+ encoder.cuda(),
122
+ device_ids=[torch.cuda.current_device()],
123
+ broadcast_buffers=False,
124
+ find_unused_parameters=True)
125
+ elif opt.data_parallel:
126
+ encoder = MMDataParallel(
127
+ encoder.cuda(opt.gpu_id[0]), device_ids=opt.gpu_id)
128
+ else:
129
+ encoder = encoder.cuda()
130
+
131
+ trainer = DDPMTrainer(opt, encoder)
132
+ train_dataset = Text2MotionDataset(opt, mean, std, train_split_file, opt.times)
133
+ print(f"loaded data, now training")
134
+ trainer.train(train_dataset)
text2motion/utils/__init__.py ADDED
File without changes
text2motion/utils/get_opt.py ADDED
@@ -0,0 +1,102 @@
1
+ import os
2
+ import re
3
+ from argparse import Namespace
4
+ from os.path import join as pjoin
5
+
6
+ from utils.word_vectorizer import POS_enumerator
7
+
8
+
9
+ def is_float(numStr):
10
+ flag = False
11
+ numStr = str(numStr).strip().lstrip('-').lstrip('+')
12
+ try:
13
+ reg = re.compile(r'^[-+]?[0-9]+\.[0-9]+$')
14
+ res = reg.match(str(numStr))
15
+ if res:
16
+ flag = True
17
+ except Exception as ex:
18
+ print("is_float() - error: " + str(ex))
19
+ return flag
20
+
21
+
22
+ def is_number(numStr):
23
+ flag = False
24
+ numStr = str(numStr).strip().lstrip('-').lstrip('+')
25
+ if str(numStr).isdigit():
26
+ flag = True
27
+ return flag
28
+
29
+
30
+ def get_opt(opt_path, device):
31
+ opt = Namespace()
32
+ opt_dict = vars(opt)
33
+
34
+ skip = ('-------------- End ----------------',
35
+ '------------ Options -------------',
36
+ '\n')
37
+ print('Reading', opt_path)
38
+ with open(opt_path) as f:
39
+ for line in f:
40
+ if line.strip() not in skip:
41
+ # print(line.strip())
42
+ key, value = line.strip().split(': ')
43
+ if value in ('True', 'False'):
44
+ opt_dict[key] = True if value == 'True' else False
45
+ elif is_float(value):
46
+ opt_dict[key] = float(value)
47
+ elif is_number(value):
48
+ opt_dict[key] = int(value)
49
+ else:
50
+ opt_dict[key] = str(value)
51
+
52
+ # opt_dict['which_epoch'] = 'latest'
53
+ if 'num_layers' not in opt_dict:
54
+ opt_dict['num_layers'] = 8
55
+ if 'latent_dim' not in opt_dict:
56
+ opt_dict['latent_dim'] = 512
57
+ if 'diffusion_steps' not in opt_dict:
58
+ opt_dict['diffusion_steps'] = 1000
59
+ if 'no_clip' not in opt_dict:
60
+ opt_dict['no_clip'] = False
61
+ if 'no_eff' not in opt_dict:
62
+ opt_dict['no_eff'] = False
63
+
64
+ opt.save_root = pjoin(opt.checkpoints_dir, opt.dataset_name, opt.name)
65
+ opt.model_dir = pjoin(opt.save_root, 'model')
66
+ opt.meta_dir = pjoin(opt.save_root, 'meta')
67
+
68
+ if opt.dataset_name == 't2m':
69
+ opt.data_root = './data/HumanML3D'
70
+ opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs')
71
+ opt.text_dir = pjoin(opt.data_root, 'texts')
72
+ opt.joints_num = 22
73
+ opt.dim_pose = 263
74
+ opt.max_motion_length = 196
75
+ elif opt.dataset_name == 'kit':
76
+ opt.data_root = './data/KIT-ML'
77
+ opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs')
78
+ opt.text_dir = pjoin(opt.data_root, 'texts')
79
+ opt.joints_num = 21
80
+ opt.dim_pose = 251
81
+ opt.max_motion_length = 196
82
+ elif opt.dataset_name == 'grab':
83
+ opt.data_root = './data/GRAB'
84
+ opt.motion_dir = pjoin(opt.data_root, 'joints')
85
+ opt.text_dir = pjoin(opt.data_root, 'texts')
86
+ # opt.joints_num = 72 # TODO (elmc): verify this BUT ALSO I'M NOT USING IT FOR NOW!
87
+ opt.dim_pose = 212 # drop betas (body shape) and face-shape from Motion data (via to_smplx_params & smplx_dict_to_array method)
88
+ # TODO (elmc): verify this
89
+ opt.max_motion_length = 196
90
+ else:
91
+ raise KeyError('Dataset not recognized')
92
+
93
+ # TODO (elmc): is dim_word ever actually used?
94
+ # opt.dim_word = 300
95
+ # TODO (elmc): what is num classes for GRAB?
96
+ # opt.num_classes = 200 // opt.unit_length
97
+ opt.dim_pos_ohot = len(POS_enumerator)
98
+ opt.is_train = False
99
+ opt.is_continue = False
100
+ opt.device = device
101
+
102
+ return opt
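A brief sketch of how get_opt is typically consumed; the opt.txt path below is hypothetical and stands for whichever options file a training run wrote into its checkpoint directory:

    import torch
    from utils.get_opt import get_opt

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # hypothetical path to a saved options file from a GRAB run
    opt = get_opt('./checkpoints/grab/demo/opt.txt', device)
    print(opt.dataset_name, opt.dim_pose, opt.max_motion_length)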
text2motion/utils/metrics.py ADDED
@@ -0,0 +1,146 @@
1
+ import numpy as np
2
+ from scipy import linalg
3
+
4
+
5
+ # (X - X_train)*(X - X_train) = -2X*X_train + X*X + X_train*X_train
6
+ def euclidean_distance_matrix(matrix1, matrix2):
7
+ """
8
+ Params:
9
+ -- matrix1: N1 x D
10
+ -- matrix2: N2 x D
11
+ Returns:
12
+ -- dist: N1 x N2
13
+ dist[i, j] == distance(matrix1[i], matrix2[j])
14
+ """
15
+ assert matrix1.shape[1] == matrix2.shape[1]
16
+ d1 = -2 * np.dot(matrix1, matrix2.T) # shape (num_test, num_train)
17
+ d2 = np.sum(np.square(matrix1), axis=1, keepdims=True) # shape (num_test, 1)
18
+ d3 = np.sum(np.square(matrix2), axis=1) # shape (num_train, )
19
+ dists = np.sqrt(d1 + d2 + d3) # broadcasting
20
+ return dists
21
+
22
+ def calculate_top_k(mat, top_k):
23
+ size = mat.shape[0]
24
+ gt_mat = np.expand_dims(np.arange(size), 1).repeat(size, 1)
25
+ bool_mat = (mat == gt_mat)
26
+ correct_vec = False
27
+ top_k_list = []
28
+ for i in range(top_k):
29
+ # print(correct_vec, bool_mat[:, i])
30
+ correct_vec = (correct_vec | bool_mat[:, i])
31
+ # print(correct_vec)
32
+ top_k_list.append(correct_vec[:, None])
33
+ top_k_mat = np.concatenate(top_k_list, axis=1)
34
+ return top_k_mat
35
+
36
+
37
+ def calculate_R_precision(embedding1, embedding2, top_k, sum_all=False):
38
+ dist_mat = euclidean_distance_matrix(embedding1, embedding2)
39
+ argmax = np.argsort(dist_mat, axis=1)
40
+ top_k_mat = calculate_top_k(argmax, top_k)
41
+ if sum_all:
42
+ return top_k_mat.sum(axis=0)
43
+ else:
44
+ return top_k_mat
45
+
46
+
47
+ def calculate_matching_score(embedding1, embedding2, sum_all=False):
48
+ assert len(embedding1.shape) == 2
49
+ assert embedding1.shape[0] == embedding2.shape[0]
50
+ assert embedding1.shape[1] == embedding2.shape[1]
51
+
52
+ dist = linalg.norm(embedding1 - embedding2, axis=1)
53
+ if sum_all:
54
+ return dist.sum(axis=0)
55
+ else:
56
+ return dist
57
+
58
+
59
+
60
+ def calculate_activation_statistics(activations):
61
+ """
62
+ Params:
63
+ -- activation: num_samples x dim_feat
64
+ Returns:
65
+ -- mu: dim_feat
66
+ -- sigma: dim_feat x dim_feat
67
+ """
68
+ mu = np.mean(activations, axis=0)
69
+ cov = np.cov(activations, rowvar=False)
70
+ return mu, cov
71
+
72
+
73
+ def calculate_diversity(activation, diversity_times):
74
+ assert len(activation.shape) == 2
75
+ assert activation.shape[0] > diversity_times
76
+ num_samples = activation.shape[0]
77
+
78
+ first_indices = np.random.choice(num_samples, diversity_times, replace=False)
79
+ second_indices = np.random.choice(num_samples, diversity_times, replace=False)
80
+ dist = linalg.norm(activation[first_indices] - activation[second_indices], axis=1)
81
+ return dist.mean()
82
+
83
+
84
+ def calculate_multimodality(activation, multimodality_times):
85
+ assert len(activation.shape) == 3
86
+ assert activation.shape[1] > multimodality_times
87
+ num_per_sent = activation.shape[1]
88
+
89
+ first_dices = np.random.choice(num_per_sent, multimodality_times, replace=False)
90
+ second_dices = np.random.choice(num_per_sent, multimodality_times, replace=False)
91
+ dist = linalg.norm(activation[:, first_dices] - activation[:, second_dices], axis=2)
92
+ return dist.mean()
93
+
94
+
95
+ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
96
+ """Numpy implementation of the Frechet Distance.
97
+ The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
98
+ and X_2 ~ N(mu_2, C_2) is
99
+ d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
100
+ Stable version by Dougal J. Sutherland.
101
+ Params:
102
+ -- mu1 : Numpy array containing the activations of a layer of the
103
+ inception net (like returned by the function 'get_predictions')
104
+ for generated samples.
105
+ -- mu2 : The sample mean over activations, precalculated on an
106
+ representative data set.
107
+ -- sigma1: The covariance matrix over activations for generated samples.
108
+ -- sigma2: The covariance matrix over activations, precalculated on an
109
+ representative data set.
110
+ Returns:
111
+ -- : The Frechet Distance.
112
+ """
113
+
114
+ mu1 = np.atleast_1d(mu1)
115
+ mu2 = np.atleast_1d(mu2)
116
+
117
+ sigma1 = np.atleast_2d(sigma1)
118
+ sigma2 = np.atleast_2d(sigma2)
119
+
120
+ assert mu1.shape == mu2.shape, \
121
+ 'Training and test mean vectors have different lengths'
122
+ assert sigma1.shape == sigma2.shape, \
123
+ 'Training and test covariances have different dimensions'
124
+
125
+ diff = mu1 - mu2
126
+
127
+ # Product might be almost singular
128
+ covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
129
+ if not np.isfinite(covmean).all():
130
+ msg = ('fid calculation produces singular product; '
131
+ 'adding %s to diagonal of cov estimates') % eps
132
+ print(msg)
133
+ offset = np.eye(sigma1.shape[0]) * eps
134
+ covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
135
+
136
+ # Numerical error might give slight imaginary component
137
+ if np.iscomplexobj(covmean):
138
+ if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
139
+ m = np.max(np.abs(covmean.imag))
140
+ raise ValueError('Imaginary component {}'.format(m))
141
+ covmean = covmean.real
142
+
143
+ tr_covmean = np.trace(covmean)
144
+
145
+ return (diff.dot(diff) + np.trace(sigma1) +
146
+ np.trace(sigma2) - 2 * tr_covmean)
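A small self-contained sketch of how the two FID helpers above fit together (the embeddings are random stand-ins for real motion-encoder activations, used only to show the call pattern):

    import numpy as np
    from utils.metrics import calculate_activation_statistics, calculate_frechet_distance

    gt_embeddings = np.random.randn(300, 512)         # stand-in for ground-truth motion embeddings
    gen_embeddings = np.random.randn(300, 512) + 0.1  # stand-in for generated motion embeddings

    gt_mu, gt_cov = calculate_activation_statistics(gt_embeddings)
    mu, cov = calculate_activation_statistics(gen_embeddings)
    fid = calculate_frechet_distance(gt_mu, gt_cov, mu, cov)  # lower means the distributions are closer
    print('FID: %.4f' % fid)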
text2motion/utils/motion_process.py ADDED
@@ -0,0 +1,515 @@
1
+ from os.path import join as pjoin
2
+
3
+ import numpy as np
4
+ import os
5
+ from utils.quaternion import *
6
+ from utils.skeleton import Skeleton
7
+ from utils.paramUtil import *
8
+
9
+ import torch
10
+ from tqdm import tqdm
11
+
12
+ # positions (batch, joint_num, 3)
13
+ def uniform_skeleton(positions, target_offset):
14
+ src_skel = Skeleton(n_raw_offsets, kinematic_chain, 'cpu')
15
+ src_offset = src_skel.get_offsets_joints(torch.from_numpy(positions[0]))
16
+ src_offset = src_offset.numpy()
17
+ tgt_offset = target_offset.numpy()
18
+ # print(src_offset)
19
+ # print(tgt_offset)
20
+ '''Calculate Scale Ratio as the ratio of legs'''
21
+ src_leg_len = np.abs(src_offset[l_idx1]).max() + np.abs(src_offset[l_idx2]).max()
22
+ tgt_leg_len = np.abs(tgt_offset[l_idx1]).max() + np.abs(tgt_offset[l_idx2]).max()
23
+
24
+ scale_rt = tgt_leg_len / src_leg_len
25
+ # print(scale_rt)
26
+ src_root_pos = positions[:, 0]
27
+ tgt_root_pos = src_root_pos * scale_rt
28
+
29
+ '''Inverse Kinematics'''
30
+ quat_params = src_skel.inverse_kinematics_np(positions, face_joint_indx)
31
+ # print(quat_params.shape)
32
+
33
+ '''Forward Kinematics'''
34
+ src_skel.set_offset(target_offset)
35
+ new_joints = src_skel.forward_kinematics_np(quat_params, tgt_root_pos)
36
+ return new_joints
37
+
38
+
39
+ def extract_features(positions, feet_thre, n_raw_offsets, kinematic_chain, face_joint_indx, fid_r, fid_l):
40
+ global_positions = positions.copy()
41
+ """ Get Foot Contacts """
42
+
43
+ def foot_detect(positions, thres):
44
+ velfactor, heightfactor = np.array([thres, thres]), np.array([3.0, 2.0])
45
+
46
+ feet_l_x = (positions[1:, fid_l, 0] - positions[:-1, fid_l, 0]) ** 2
47
+ feet_l_y = (positions[1:, fid_l, 1] - positions[:-1, fid_l, 1]) ** 2
48
+ feet_l_z = (positions[1:, fid_l, 2] - positions[:-1, fid_l, 2]) ** 2
49
+ # feet_l_h = positions[:-1,fid_l,1]
50
+ # feet_l = (((feet_l_x + feet_l_y + feet_l_z) < velfactor) & (feet_l_h < heightfactor)).astype(np.float)
51
+ feet_l = ((feet_l_x + feet_l_y + feet_l_z) < velfactor).astype(np.float)
52
+
53
+ feet_r_x = (positions[1:, fid_r, 0] - positions[:-1, fid_r, 0]) ** 2
54
+ feet_r_y = (positions[1:, fid_r, 1] - positions[:-1, fid_r, 1]) ** 2
55
+ feet_r_z = (positions[1:, fid_r, 2] - positions[:-1, fid_r, 2]) ** 2
56
+ # feet_r_h = positions[:-1,fid_r,1]
57
+ # feet_r = (((feet_r_x + feet_r_y + feet_r_z) < velfactor) & (feet_r_h < heightfactor)).astype(np.float)
58
+ feet_r = (((feet_r_x + feet_r_y + feet_r_z) < velfactor)).astype(np.float)
59
+ return feet_l, feet_r
60
+
61
+ #
62
+ feet_l, feet_r = foot_detect(positions, feet_thre)
63
+ # feet_l, feet_r = foot_detect(positions, 0.002)
64
+
65
+ '''Quaternion and Cartesian representation'''
66
+ r_rot = None
67
+
68
+ def get_rifke(positions):
69
+ '''Local pose'''
70
+ positions[..., 0] -= positions[:, 0:1, 0]
71
+ positions[..., 2] -= positions[:, 0:1, 2]
72
+ '''All pose face Z+'''
73
+ positions = qrot_np(np.repeat(r_rot[:, None], positions.shape[1], axis=1), positions)
74
+ return positions
75
+
76
+ def get_quaternion(positions):
77
+ skel = Skeleton(n_raw_offsets, kinematic_chain, "cpu")
78
+ # (seq_len, joints_num, 4)
79
+ quat_params = skel.inverse_kinematics_np(positions, face_joint_indx, smooth_forward=False)
80
+
81
+ '''Fix Quaternion Discontinuity'''
82
+ quat_params = qfix(quat_params)
83
+ # (seq_len, 4)
84
+ r_rot = quat_params[:, 0].copy()
85
+ # print(r_rot[0])
86
+ '''Root Linear Velocity'''
87
+ # (seq_len - 1, 3)
88
+ velocity = (positions[1:, 0] - positions[:-1, 0]).copy()
89
+ # print(r_rot.shape, velocity.shape)
90
+ velocity = qrot_np(r_rot[1:], velocity)
91
+ '''Root Angular Velocity'''
92
+ # (seq_len - 1, 4)
93
+ r_velocity = qmul_np(r_rot[1:], qinv_np(r_rot[:-1]))
94
+ quat_params[1:, 0] = r_velocity
95
+ # (seq_len, joints_num, 4)
96
+ return quat_params, r_velocity, velocity, r_rot
97
+
98
+ def get_cont6d_params(positions):
99
+ skel = Skeleton(n_raw_offsets, kinematic_chain, "cpu")
100
+ # (seq_len, joints_num, 4)
101
+ quat_params = skel.inverse_kinematics_np(positions, face_joint_indx, smooth_forward=True)
102
+
103
+ '''Quaternion to continuous 6D'''
104
+ cont_6d_params = quaternion_to_cont6d_np(quat_params)
105
+ # (seq_len, 4)
106
+ r_rot = quat_params[:, 0].copy()
107
+ # print(r_rot[0])
108
+ '''Root Linear Velocity'''
109
+ # (seq_len - 1, 3)
110
+ velocity = (positions[1:, 0] - positions[:-1, 0]).copy()
111
+ # print(r_rot.shape, velocity.shape)
112
+ velocity = qrot_np(r_rot[1:], velocity)
113
+ '''Root Angular Velocity'''
114
+ # (seq_len - 1, 4)
115
+ r_velocity = qmul_np(r_rot[1:], qinv_np(r_rot[:-1]))
116
+ # (seq_len, joints_num, 4)
117
+ return cont_6d_params, r_velocity, velocity, r_rot
118
+
119
+ cont_6d_params, r_velocity, velocity, r_rot = get_cont6d_params(positions)
120
+ positions = get_rifke(positions)
121
+
122
+ # trejec = np.cumsum(np.concatenate([np.array([[0, 0, 0]]), velocity], axis=0), axis=0)
123
+ # r_rotations, r_pos = recover_ric_glo_np(r_velocity, velocity[:, [0, 2]])
124
+
125
+ # plt.plot(positions_b[:, 0, 0], positions_b[:, 0, 2], marker='*')
126
+ # plt.plot(ground_positions[:, 0, 0], ground_positions[:, 0, 2], marker='o', color='r')
127
+ # plt.plot(trejec[:, 0], trejec[:, 2], marker='^', color='g')
128
+ # plt.plot(r_pos[:, 0], r_pos[:, 2], marker='s', color='y')
129
+ # plt.xlabel('x')
130
+ # plt.ylabel('z')
131
+ # plt.axis('equal')
132
+ # plt.show()
133
+
134
+ '''Root height'''
135
+ root_y = positions[:, 0, 1:2]
136
+
137
+ '''Root rotation and linear velocity'''
138
+ # (seq_len-1, 1) rotation velocity along y-axis
139
+ # (seq_len-1, 2) linear velovity on xz plane
140
+ r_velocity = np.arcsin(r_velocity[:, 2:3])
141
+ l_velocity = velocity[:, [0, 2]]
142
+ # print(r_velocity.shape, l_velocity.shape, root_y.shape)
143
+ root_data = np.concatenate([r_velocity, l_velocity, root_y[:-1]], axis=-1)
144
+
145
+ '''Get Joint Rotation Representation'''
146
+ # (seq_len, (joints_num-1) *6) quaternion for skeleton joints
147
+ rot_data = cont_6d_params[:, 1:].reshape(len(cont_6d_params), -1)
148
+
149
+ '''Get Joint Rotation Invariant Position Represention'''
150
+ # (seq_len, (joints_num-1)*3) local joint position
151
+ ric_data = positions[:, 1:].reshape(len(positions), -1)
152
+
153
+ '''Get Joint Velocity Representation'''
154
+ # (seq_len-1, joints_num*3)
155
+ local_vel = qrot_np(np.repeat(r_rot[:-1, None], global_positions.shape[1], axis=1),
156
+ global_positions[1:] - global_positions[:-1])
157
+ local_vel = local_vel.reshape(len(local_vel), -1)
158
+
159
+ data = root_data
160
+ data = np.concatenate([data, ric_data[:-1]], axis=-1)
161
+ data = np.concatenate([data, rot_data[:-1]], axis=-1)
162
+ # print(data.shape, local_vel.shape)
163
+ data = np.concatenate([data, local_vel], axis=-1)
164
+ data = np.concatenate([data, feet_l, feet_r], axis=-1)
165
+
166
+ return data
167
+
168
+
169
+ def process_file(positions, feet_thre):
170
+ # (seq_len, joints_num, 3)
171
+ # '''Down Sample'''
172
+ # positions = positions[::ds_num]
173
+
174
+ '''Uniform Skeleton'''
175
+ positions = uniform_skeleton(positions, tgt_offsets)
176
+
177
+ '''Put on Floor'''
178
+ floor_height = positions.min(axis=0).min(axis=0)[1]
179
+ positions[:, :, 1] -= floor_height
180
+ # print(floor_height)
181
+
182
+ # plot_3d_motion("./positions_1.mp4", kinematic_chain, positions, 'title', fps=20)
183
+
184
+ '''XZ at origin'''
185
+ root_pos_init = positions[0]
186
+ root_pose_init_xz = root_pos_init[0] * np.array([1, 0, 1])
187
+ positions = positions - root_pose_init_xz
188
+
189
+ # '''Move the first pose to origin '''
190
+ # root_pos_init = positions[0]
191
+ # positions = positions - root_pos_init[0]
192
+
193
+ '''All initially face Z+'''
194
+ r_hip, l_hip, sdr_r, sdr_l = face_joint_indx
195
+ across1 = root_pos_init[r_hip] - root_pos_init[l_hip]
196
+ across2 = root_pos_init[sdr_r] - root_pos_init[sdr_l]
197
+ across = across1 + across2
198
+ across = across / np.sqrt((across ** 2).sum(axis=-1))[..., np.newaxis]
199
+
200
+ # forward (3,), rotate around y-axis
201
+ forward_init = np.cross(np.array([[0, 1, 0]]), across, axis=-1)
202
+ # forward (3,)
203
+ forward_init = forward_init / np.sqrt((forward_init ** 2).sum(axis=-1))[..., np.newaxis]
204
+
205
+ # print(forward_init)
206
+
207
+ target = np.array([[0, 0, 1]])
208
+ root_quat_init = qbetween_np(forward_init, target)
209
+ root_quat_init = np.ones(positions.shape[:-1] + (4,)) * root_quat_init
210
+
211
+ positions_b = positions.copy()
212
+
213
+ positions = qrot_np(root_quat_init, positions)
214
+
215
+ # plot_3d_motion("./positions_2.mp4", kinematic_chain, positions, 'title', fps=20)
216
+
217
+ '''New ground truth positions'''
218
+ global_positions = positions.copy()
219
+
220
+ # plt.plot(positions_b[:, 0, 0], positions_b[:, 0, 2], marker='*')
221
+ # plt.plot(positions[:, 0, 0], positions[:, 0, 2], marker='o', color='r')
222
+ # plt.xlabel('x')
223
+ # plt.ylabel('z')
224
+ # plt.axis('equal')
225
+ # plt.show()
226
+
227
+ """ Get Foot Contacts """
228
+
229
+ def foot_detect(positions, thres):
230
+ velfactor, heightfactor = np.array([thres, thres]), np.array([3.0, 2.0])
231
+
232
+ feet_l_x = (positions[1:, fid_l, 0] - positions[:-1, fid_l, 0]) ** 2
233
+ feet_l_y = (positions[1:, fid_l, 1] - positions[:-1, fid_l, 1]) ** 2
234
+ feet_l_z = (positions[1:, fid_l, 2] - positions[:-1, fid_l, 2]) ** 2
235
+ # feet_l_h = positions[:-1,fid_l,1]
236
+ # feet_l = (((feet_l_x + feet_l_y + feet_l_z) < velfactor) & (feet_l_h < heightfactor)).astype(np.float)
237
+ feet_l = ((feet_l_x + feet_l_y + feet_l_z) < velfactor).astype(np.float)
238
+
239
+ feet_r_x = (positions[1:, fid_r, 0] - positions[:-1, fid_r, 0]) ** 2
240
+ feet_r_y = (positions[1:, fid_r, 1] - positions[:-1, fid_r, 1]) ** 2
241
+ feet_r_z = (positions[1:, fid_r, 2] - positions[:-1, fid_r, 2]) ** 2
242
+ # feet_r_h = positions[:-1,fid_r,1]
243
+ # feet_r = (((feet_r_x + feet_r_y + feet_r_z) < velfactor) & (feet_r_h < heightfactor)).astype(np.float)
244
+ feet_r = (((feet_r_x + feet_r_y + feet_r_z) < velfactor)).astype(np.float)
245
+ return feet_l, feet_r
246
+ #
247
+ feet_l, feet_r = foot_detect(positions, feet_thre)
248
+ # feet_l, feet_r = foot_detect(positions, 0.002)
249
+
250
+ '''Quaternion and Cartesian representation'''
251
+ r_rot = None
252
+
253
+ def get_rifke(positions):
254
+ '''Local pose'''
255
+ positions[..., 0] -= positions[:, 0:1, 0]
256
+ positions[..., 2] -= positions[:, 0:1, 2]
257
+ '''All pose face Z+'''
258
+ positions = qrot_np(np.repeat(r_rot[:, None], positions.shape[1], axis=1), positions)
259
+ return positions
260
+
261
+ def get_quaternion(positions):
262
+ skel = Skeleton(n_raw_offsets, kinematic_chain, "cpu")
263
+ # (seq_len, joints_num, 4)
264
+ quat_params = skel.inverse_kinematics_np(positions, face_joint_indx, smooth_forward=False)
265
+
266
+ '''Fix Quaternion Discontinuity'''
267
+ quat_params = qfix(quat_params)
268
+ # (seq_len, 4)
269
+ r_rot = quat_params[:, 0].copy()
270
+ # print(r_rot[0])
271
+ '''Root Linear Velocity'''
272
+ # (seq_len - 1, 3)
273
+ velocity = (positions[1:, 0] - positions[:-1, 0]).copy()
274
+ # print(r_rot.shape, velocity.shape)
275
+ velocity = qrot_np(r_rot[1:], velocity)
276
+ '''Root Angular Velocity'''
277
+ # (seq_len - 1, 4)
278
+ r_velocity = qmul_np(r_rot[1:], qinv_np(r_rot[:-1]))
279
+ quat_params[1:, 0] = r_velocity
280
+ # (seq_len, joints_num, 4)
281
+ return quat_params, r_velocity, velocity, r_rot
282
+
283
+ def get_cont6d_params(positions):
284
+ skel = Skeleton(n_raw_offsets, kinematic_chain, "cpu")
285
+ # (seq_len, joints_num, 4)
286
+ quat_params = skel.inverse_kinematics_np(positions, face_joint_indx, smooth_forward=True)
287
+
288
+ '''Quaternion to continuous 6D'''
289
+ cont_6d_params = quaternion_to_cont6d_np(quat_params)
290
+ # (seq_len, 4)
291
+ r_rot = quat_params[:, 0].copy()
292
+ # print(r_rot[0])
293
+ '''Root Linear Velocity'''
294
+ # (seq_len - 1, 3)
295
+ velocity = (positions[1:, 0] - positions[:-1, 0]).copy()
296
+ # print(r_rot.shape, velocity.shape)
297
+ velocity = qrot_np(r_rot[1:], velocity)
298
+ '''Root Angular Velocity'''
299
+ # (seq_len - 1, 4)
300
+ r_velocity = qmul_np(r_rot[1:], qinv_np(r_rot[:-1]))
301
+ # (seq_len, joints_num, 4)
302
+ return cont_6d_params, r_velocity, velocity, r_rot
303
+
304
+ cont_6d_params, r_velocity, velocity, r_rot = get_cont6d_params(positions)
305
+ positions = get_rifke(positions)
306
+
307
+ # trejec = np.cumsum(np.concatenate([np.array([[0, 0, 0]]), velocity], axis=0), axis=0)
308
+ # r_rotations, r_pos = recover_ric_glo_np(r_velocity, velocity[:, [0, 2]])
309
+
310
+ # plt.plot(positions_b[:, 0, 0], positions_b[:, 0, 2], marker='*')
311
+ # plt.plot(ground_positions[:, 0, 0], ground_positions[:, 0, 2], marker='o', color='r')
312
+ # plt.plot(trejec[:, 0], trejec[:, 2], marker='^', color='g')
313
+ # plt.plot(r_pos[:, 0], r_pos[:, 2], marker='s', color='y')
314
+ # plt.xlabel('x')
315
+ # plt.ylabel('z')
316
+ # plt.axis('equal')
317
+ # plt.show()
318
+
319
+ '''Root height'''
320
+ root_y = positions[:, 0, 1:2]
321
+
322
+ '''Root rotation and linear velocity'''
323
+ # (seq_len-1, 1) rotation velocity along y-axis
324
+ # (seq_len-1, 2) linear velovity on xz plane
325
+ r_velocity = np.arcsin(r_velocity[:, 2:3])
326
+ l_velocity = velocity[:, [0, 2]]
327
+ # print(r_velocity.shape, l_velocity.shape, root_y.shape)
328
+ root_data = np.concatenate([r_velocity, l_velocity, root_y[:-1]], axis=-1)
329
+
330
+ '''Get Joint Rotation Representation'''
331
+ # (seq_len, (joints_num-1) *6) quaternion for skeleton joints
332
+ rot_data = cont_6d_params[:, 1:].reshape(len(cont_6d_params), -1)
333
+
334
+ '''Get Joint Rotation Invariant Position Represention'''
335
+ # (seq_len, (joints_num-1)*3) local joint position
336
+ ric_data = positions[:, 1:].reshape(len(positions), -1)
337
+
338
+ '''Get Joint Velocity Representation'''
339
+ # (seq_len-1, joints_num*3)
340
+ local_vel = qrot_np(np.repeat(r_rot[:-1, None], global_positions.shape[1], axis=1),
341
+ global_positions[1:] - global_positions[:-1])
342
+ local_vel = local_vel.reshape(len(local_vel), -1)
343
+
344
+ data = root_data
345
+ data = np.concatenate([data, ric_data[:-1]], axis=-1)
346
+ data = np.concatenate([data, rot_data[:-1]], axis=-1)
347
+ # print(data.shape, local_vel.shape)
348
+ data = np.concatenate([data, local_vel], axis=-1)
349
+ data = np.concatenate([data, feet_l, feet_r], axis=-1)
350
+
351
+ return data, global_positions, positions, l_velocity
352
+
353
+
354
+ # Recover global angle and positions for rotation data
355
+ # root_rot_velocity (B, seq_len, 1)
356
+ # root_linear_velocity (B, seq_len, 2)
357
+ # root_y (B, seq_len, 1)
358
+ # ric_data (B, seq_len, (joint_num - 1)*3)
359
+ # rot_data (B, seq_len, (joint_num - 1)*6)
360
+ # local_velocity (B, seq_len, joint_num*3)
361
+ # foot contact (B, seq_len, 4)
362
+ def recover_root_rot_pos(data):
363
+ rot_vel = data[..., 0]
364
+ r_rot_ang = torch.zeros_like(rot_vel).to(data.device)
365
+ '''Get Y-axis rotation from rotation velocity'''
366
+ r_rot_ang[..., 1:] = rot_vel[..., :-1]
367
+ r_rot_ang = torch.cumsum(r_rot_ang, dim=-1)
368
+
369
+ r_rot_quat = torch.zeros(data.shape[:-1] + (4,)).to(data.device)
370
+ r_rot_quat[..., 0] = torch.cos(r_rot_ang)
371
+ r_rot_quat[..., 2] = torch.sin(r_rot_ang)
372
+
373
+ r_pos = torch.zeros(data.shape[:-1] + (3,)).to(data.device)
374
+ r_pos[..., 1:, [0, 2]] = data[..., :-1, 1:3]
375
+ '''Add Y-axis rotation to root position'''
376
+ r_pos = qrot(qinv(r_rot_quat), r_pos)
377
+
378
+ r_pos = torch.cumsum(r_pos, dim=-2)
379
+
380
+ r_pos[..., 1] = data[..., 3]
381
+ return r_rot_quat, r_pos
382
+
383
+
384
+ def recover_from_rot(data, joints_num, skeleton):
385
+ r_rot_quat, r_pos = recover_root_rot_pos(data)
386
+
387
+ r_rot_cont6d = quaternion_to_cont6d(r_rot_quat)
388
+
389
+ start_indx = 1 + 2 + 1 + (joints_num - 1) * 3
390
+ end_indx = start_indx + (joints_num - 1) * 6
391
+ cont6d_params = data[..., start_indx:end_indx]
392
+ # print(r_rot_cont6d.shape, cont6d_params.shape, r_pos.shape)
393
+ cont6d_params = torch.cat([r_rot_cont6d, cont6d_params], dim=-1)
394
+ cont6d_params = cont6d_params.view(-1, joints_num, 6)
395
+
396
+ positions = skeleton.forward_kinematics_cont6d(cont6d_params, r_pos)
397
+
398
+ return positions
399
+
400
+
401
+ def recover_from_ric(data, joints_num):
402
+ r_rot_quat, r_pos = recover_root_rot_pos(data)
403
+ positions = data[..., 4:(joints_num - 1) * 3 + 4]
404
+ positions = positions.view(positions.shape[:-1] + (-1, 3))
405
+
406
+ '''Add Y-axis rotation to local joints'''
407
+ positions = qrot(qinv(r_rot_quat[..., None, :]).expand(positions.shape[:-1] + (4,)), positions)
408
+
409
+ '''Add root XZ to joints'''
410
+ positions[..., 0] += r_pos[..., 0:1]
411
+ positions[..., 2] += r_pos[..., 2:3]
412
+
413
+ '''Concate root and joints'''
414
+ positions = torch.cat([r_pos.unsqueeze(-2), positions], dim=-2)
415
+
416
+ return positions
417
+ '''
418
+ For Text2Motion Dataset
419
+ '''
420
+ '''
421
+ if __name__ == "__main__":
422
+ example_id = "000021"
423
+ # Lower legs
424
+ l_idx1, l_idx2 = 5, 8
425
+ # Right/Left foot
426
+ fid_r, fid_l = [8, 11], [7, 10]
427
+ # Face direction, r_hip, l_hip, sdr_r, sdr_l
428
+ face_joint_indx = [2, 1, 17, 16]
429
+ # l_hip, r_hip
430
+ r_hip, l_hip = 2, 1
431
+ joints_num = 22
432
+ # ds_num = 8
433
+ data_dir = '../dataset/pose_data_raw/joints/'
434
+ save_dir1 = '../dataset/pose_data_raw/new_joints/'
435
+ save_dir2 = '../dataset/pose_data_raw/new_joint_vecs/'
436
+
437
+ n_raw_offsets = torch.from_numpy(t2m_raw_offsets)
438
+ kinematic_chain = t2m_kinematic_chain
439
+
440
+ # Get offsets of target skeleton
441
+ example_data = np.load(os.path.join(data_dir, example_id + '.npy'))
442
+ example_data = example_data.reshape(len(example_data), -1, 3)
443
+ example_data = torch.from_numpy(example_data)
444
+ tgt_skel = Skeleton(n_raw_offsets, kinematic_chain, 'cpu')
445
+ # (joints_num, 3)
446
+ tgt_offsets = tgt_skel.get_offsets_joints(example_data[0])
447
+ # print(tgt_offsets)
448
+
449
+ source_list = os.listdir(data_dir)
450
+ frame_num = 0
451
+ for source_file in tqdm(source_list):
452
+ source_data = np.load(os.path.join(data_dir, source_file))[:, :joints_num]
453
+ try:
454
+ data, ground_positions, positions, l_velocity = process_file(source_data, 0.002)
455
+ rec_ric_data = recover_from_ric(torch.from_numpy(data).unsqueeze(0).float(), joints_num)
456
+ np.save(pjoin(save_dir1, source_file), rec_ric_data.squeeze().numpy())
457
+ np.save(pjoin(save_dir2, source_file), data)
458
+ frame_num += data.shape[0]
459
+ except Exception as e:
460
+ print(source_file)
461
+ print(e)
462
+
463
+ print('Total clips: %d, Frames: %d, Duration: %fm' %
464
+ (len(source_list), frame_num, frame_num / 20 / 60))
465
+ '''
466
+
467
+ if __name__ == "__main__":
468
+ example_id = "03950_gt"
469
+ # Lower legs
470
+ l_idx1, l_idx2 = 17, 18
471
+ # Right/Left foot
472
+ fid_r, fid_l = [14, 15], [19, 20]
473
+ # Face direction, r_hip, l_hip, sdr_r, sdr_l
474
+ face_joint_indx = [11, 16, 5, 8]
475
+ # l_hip, r_hip
476
+ r_hip, l_hip = 11, 16
477
+ joints_num = 21
478
+ # ds_num = 8
479
+ data_dir = '../dataset/kit_mocap_dataset/joints/'
480
+ save_dir1 = '../dataset/kit_mocap_dataset/new_joints/'
481
+ save_dir2 = '../dataset/kit_mocap_dataset/new_joint_vecs/'
482
+
483
+ n_raw_offsets = torch.from_numpy(kit_raw_offsets)
484
+ kinematic_chain = kit_kinematic_chain
485
+
486
+ '''Get offsets of target skeleton'''
487
+ example_data = np.load(os.path.join(data_dir, example_id + '.npy'))
488
+ example_data = example_data.reshape(len(example_data), -1, 3)
489
+ example_data = torch.from_numpy(example_data)
490
+ tgt_skel = Skeleton(n_raw_offsets, kinematic_chain, 'cpu')
491
+ # (joints_num, 3)
492
+ tgt_offsets = tgt_skel.get_offsets_joints(example_data[0])
493
+ # print(tgt_offsets)
494
+
495
+ source_list = os.listdir(data_dir)
496
+ frame_num = 0
497
+ '''Read source data'''
498
+ for source_file in tqdm(source_list):
499
+ source_data = np.load(os.path.join(data_dir, source_file))[:, :joints_num]
500
+ try:
501
+ name = ''.join(source_file[:-7].split('_')) + '.npy'
502
+ data, ground_positions, positions, l_velocity = process_file(source_data, 0.05)
503
+ rec_ric_data = recover_from_ric(torch.from_numpy(data).unsqueeze(0).float(), joints_num)
504
+ if np.isnan(rec_ric_data.numpy()).any():
505
+ print(source_file)
506
+ continue
507
+ np.save(pjoin(save_dir1, name), rec_ric_data.squeeze().numpy())
508
+ np.save(pjoin(save_dir2, name), data)
509
+ frame_num += data.shape[0]
510
+ except Exception as e:
511
+ print(source_file)
512
+ print(e)
513
+
514
+ print('Total clips: %d, Frames: %d, Duration: %fm' %
515
+ (len(source_list), frame_num, frame_num / 12.5 / 60))
text2motion/utils/paramUtil.py ADDED
@@ -0,0 +1,63 @@
1
+ import numpy as np
2
+
3
+ # Define a kinematic tree for the skeletal structure
4
+ kit_kinematic_chain = [[0, 11, 12, 13, 14, 15], [0, 16, 17, 18, 19, 20], [0, 1, 2, 3, 4], [3, 5, 6, 7], [3, 8, 9, 10]]
5
+
6
+ kit_raw_offsets = np.array(
7
+ [
8
+ [0, 0, 0],
9
+ [0, 1, 0],
10
+ [0, 1, 0],
11
+ [0, 1, 0],
12
+ [0, 1, 0],
13
+ [1, 0, 0],
14
+ [0, -1, 0],
15
+ [0, -1, 0],
16
+ [-1, 0, 0],
17
+ [0, -1, 0],
18
+ [0, -1, 0],
19
+ [1, 0, 0],
20
+ [0, -1, 0],
21
+ [0, -1, 0],
22
+ [0, 0, 1],
23
+ [0, 0, 1],
24
+ [-1, 0, 0],
25
+ [0, -1, 0],
26
+ [0, -1, 0],
27
+ [0, 0, 1],
28
+ [0, 0, 1]
29
+ ]
30
+ )
31
+
32
+ t2m_raw_offsets = np.array([[0,0,0],
33
+ [1,0,0],
34
+ [-1,0,0],
35
+ [0,1,0],
36
+ [0,-1,0],
37
+ [0,-1,0],
38
+ [0,1,0],
39
+ [0,-1,0],
40
+ [0,-1,0],
41
+ [0,1,0],
42
+ [0,0,1],
43
+ [0,0,1],
44
+ [0,1,0],
45
+ [1,0,0],
46
+ [-1,0,0],
47
+ [0,0,1],
48
+ [0,-1,0],
49
+ [0,-1,0],
50
+ [0,-1,0],
51
+ [0,-1,0],
52
+ [0,-1,0],
53
+ [0,-1,0]])
54
+
55
+ t2m_kinematic_chain = [[0, 2, 5, 8, 11], [0, 1, 4, 7, 10], [0, 3, 6, 9, 12, 15], [9, 14, 17, 19, 21], [9, 13, 16, 18, 20]]
56
+ t2m_left_hand_chain = [[20, 22, 23, 24], [20, 34, 35, 36], [20, 25, 26, 27], [20, 31, 32, 33], [20, 28, 29, 30]]
57
+ t2m_right_hand_chain = [[21, 43, 44, 45], [21, 46, 47, 48], [21, 40, 41, 42], [21, 37, 38, 39], [21, 49, 50, 51]]
58
+
59
+
60
+ kit_tgt_skel_id = '03950'
61
+
62
+ t2m_tgt_skel_id = '000021'
63
+
text2motion/utils/plot_script.py ADDED
@@ -0,0 +1,115 @@
1
+ import math
2
+ import numpy as np
3
+ import matplotlib
4
+ import matplotlib.pyplot as plt
5
+ from mpl_toolkits.mplot3d import Axes3D
6
+ from matplotlib.animation import FuncAnimation, FFMpegFileWriter
7
+ from mpl_toolkits.mplot3d.art3d import Poly3DCollection
8
+ import mpl_toolkits.mplot3d.axes3d as p3
9
+ # import cv2
10
+
11
+
12
+ def list_cut_average(ll, intervals):
13
+ if intervals == 1:
14
+ return ll
15
+
16
+ bins = math.ceil(len(ll) * 1.0 / intervals)
17
+ ll_new = []
18
+ for i in range(bins):
19
+ l_low = intervals * i
20
+ l_high = l_low + intervals
21
+ l_high = l_high if l_high < len(ll) else len(ll)
22
+ ll_new.append(np.mean(ll[l_low:l_high]))
23
+ return ll_new
24
+
25
+
26
+ def plot_3d_motion(save_path, kinematic_tree, joints, title, figsize=(10, 10), fps=120, radius=4):
27
+ matplotlib.use('Agg')
28
+
29
+ title_sp = title.split(' ')
30
+ if len(title_sp) > 20:
31
+ title = '\n'.join([' '.join(title_sp[:10]), ' '.join(title_sp[10:20]), ' '.join(title_sp[20:])])
32
+ elif len(title_sp) > 10:
33
+ title = '\n'.join([' '.join(title_sp[:10]), ' '.join(title_sp[10:])])
34
+
35
+ def init():
36
+ ax.set_xlim3d([-radius / 4, radius / 4])
37
+ ax.set_ylim3d([0, radius / 2])
38
+ ax.set_zlim3d([0, radius / 2])
39
+ # print(title)
40
+ fig.suptitle(title, fontsize=20)
41
+ ax.grid(b=False)
42
+
43
+ def plot_xzPlane(minx, maxx, miny, minz, maxz):
44
+ ## Plot a plane XZ
45
+ verts = [
46
+ [minx, miny, minz],
47
+ [minx, miny, maxz],
48
+ [maxx, miny, maxz],
49
+ [maxx, miny, minz]
50
+ ]
51
+ xz_plane = Poly3DCollection([verts])
52
+ xz_plane.set_facecolor((0.5, 0.5, 0.5, 0.5))
53
+ ax.add_collection3d(xz_plane)
54
+
55
+ # return ax
56
+
57
+ # (seq_len, joints_num, 3)
58
+ data = joints.copy().reshape(len(joints), -1, 3)
59
+ fig = plt.figure(figsize=figsize)
60
+ ax = p3.Axes3D(fig)
61
+ init()
62
+ MINS = data.min(axis=0).min(axis=0)
63
+ MAXS = data.max(axis=0).max(axis=0)
64
+ colors = ['red', 'blue', 'black', 'red', 'blue',
65
+ 'darkblue', 'darkblue', 'darkblue', 'darkblue', 'darkblue',
66
+ 'darkred', 'darkred', 'darkred', 'darkred', 'darkred']
67
+ frame_number = data.shape[0]
68
+ # print(data.shape)
69
+
70
+ height_offset = MINS[1]
71
+ data[:, :, 1] -= height_offset
72
+ trajec = data[:, 0, [0, 2]]
73
+
74
+ data[..., 0] -= data[:, 0:1, 0]
75
+ data[..., 2] -= data[:, 0:1, 2]
76
+
77
+ # print(trajec.shape)
78
+
79
+ def update(index):
80
+ # print(index)
81
+ ax.lines = []
82
+ ax.collections = []
83
+ ax.view_init(elev=120, azim=-90)
84
+ ax.dist = 7.5
85
+ # ax =
86
+ plot_xzPlane(MINS[0] - trajec[index, 0], MAXS[0] - trajec[index, 0], 0, MINS[2] - trajec[index, 1],
87
+ MAXS[2] - trajec[index, 1])
88
+ # ax.scatter(data[index, :22, 0], data[index, :22, 1], data[index, :22, 2], color='black', s=3)
89
+
90
+ if index > 1:
91
+ ax.plot3D(trajec[:index, 0] - trajec[index, 0], np.zeros_like(trajec[:index, 0]),
92
+ trajec[:index, 1] - trajec[index, 1], linewidth=1.0,
93
+ color='blue')
94
+ # ax = plot_xzPlane(ax, MINS[0], MAXS[0], 0, MINS[2], MAXS[2])
95
+
96
+ for i, (chain, color) in enumerate(zip(kinematic_tree, colors)):
97
+ # print(color)
98
+ if i < 5:
99
+ linewidth = 4.0
100
+ else:
101
+ linewidth = 2.0
102
+ ax.plot3D(data[index, chain, 0], data[index, chain, 1], data[index, chain, 2], linewidth=linewidth,
103
+ color=color)
104
+ # print(trajec[:index, 0].shape)
105
+
106
+ plt.axis('off')
107
+ ax.set_xticklabels([])
108
+ ax.set_yticklabels([])
109
+ ax.set_zticklabels([])
110
+
111
+ ani = FuncAnimation(fig, update, frames=frame_number, interval=1000 / fps, repeat=False)
112
+
113
+ # writer = FFMpegFileWriter(fps=fps)
114
+ ani.save(save_path, fps=fps)
115
+ plt.close()
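
A minimal usage sketch for plot_3d_motion (not part of the commit). The random joints and the output filename are placeholders, and the import paths assume the script is run from the text2motion/ root:

    # Hypothetical example: render a dummy 60-frame, 22-joint motion to a GIF.
    import numpy as np
    from utils.paramUtil import t2m_kinematic_chain
    from utils.plot_script import plot_3d_motion

    joints = np.random.rand(60, 22, 3)  # (seq_len, joints_num, 3)
    plot_3d_motion("demo.gif", t2m_kinematic_chain, joints,
                   title="a person walking happily", fps=20)
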
text2motion/utils/quaternion.py ADDED
@@ -0,0 +1,423 @@
1
+ # Copyright (c) 2018-present, Facebook, Inc.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+
8
+ import torch
9
+ import numpy as np
10
+
11
+ _EPS4 = np.finfo(float).eps * 4.0
12
+
13
+ _FLOAT_EPS = np.finfo(float).eps
14
+
15
+ # PyTorch-backed implementations
16
+ def qinv(q):
17
+ assert q.shape[-1] == 4, 'q must be a tensor of shape (*, 4)'
18
+ mask = torch.ones_like(q)
19
+ mask[..., 1:] = -mask[..., 1:]
20
+ return q * mask
21
+
22
+
23
+ def qinv_np(q):
24
+ assert q.shape[-1] == 4, 'q must be a tensor of shape (*, 4)'
25
+ return qinv(torch.from_numpy(q).float()).numpy()
26
+
27
+
28
+ def qnormalize(q):
29
+ assert q.shape[-1] == 4, 'q must be a tensor of shape (*, 4)'
30
+ return q / torch.norm(q, dim=-1, keepdim=True)
31
+
32
+
33
+ def qmul(q, r):
34
+ """
35
+ Multiply quaternion(s) q with quaternion(s) r.
36
+ Expects two equally-sized tensors of shape (*, 4), where * denotes any number of dimensions.
37
+ Returns q*r as a tensor of shape (*, 4).
38
+ """
39
+ assert q.shape[-1] == 4
40
+ assert r.shape[-1] == 4
41
+
42
+ original_shape = q.shape
43
+
44
+ # Compute outer product
45
+ terms = torch.bmm(r.view(-1, 4, 1), q.view(-1, 1, 4))
46
+
47
+ w = terms[:, 0, 0] - terms[:, 1, 1] - terms[:, 2, 2] - terms[:, 3, 3]
48
+ x = terms[:, 0, 1] + terms[:, 1, 0] - terms[:, 2, 3] + terms[:, 3, 2]
49
+ y = terms[:, 0, 2] + terms[:, 1, 3] + terms[:, 2, 0] - terms[:, 3, 1]
50
+ z = terms[:, 0, 3] - terms[:, 1, 2] + terms[:, 2, 1] + terms[:, 3, 0]
51
+ return torch.stack((w, x, y, z), dim=1).view(original_shape)
52
+
53
+
54
+ def qrot(q, v):
55
+ """
56
+ Rotate vector(s) v about the rotation described by quaternion(s) q.
57
+ Expects a tensor of shape (*, 4) for q and a tensor of shape (*, 3) for v,
58
+ where * denotes any number of dimensions.
59
+ Returns a tensor of shape (*, 3).
60
+ """
61
+ assert q.shape[-1] == 4
62
+ assert v.shape[-1] == 3
63
+ assert q.shape[:-1] == v.shape[:-1]
64
+
65
+ original_shape = list(v.shape)
66
+ # print(q.shape)
67
+ q = q.contiguous().view(-1, 4)
68
+ v = v.contiguous().view(-1, 3)
69
+
70
+ qvec = q[:, 1:]
71
+ uv = torch.cross(qvec, v, dim=1)
72
+ uuv = torch.cross(qvec, uv, dim=1)
73
+ return (v + 2 * (q[:, :1] * uv + uuv)).view(original_shape)
74
+
75
+
76
+ def qeuler(q, order, epsilon=0, deg=True):
77
+ """
78
+ Convert quaternion(s) q to Euler angles.
79
+ Expects a tensor of shape (*, 4), where * denotes any number of dimensions.
80
+ Returns a tensor of shape (*, 3).
81
+ """
82
+ assert q.shape[-1] == 4
83
+
84
+ original_shape = list(q.shape)
85
+ original_shape[-1] = 3
86
+ q = q.view(-1, 4)
87
+
88
+ q0 = q[:, 0]
89
+ q1 = q[:, 1]
90
+ q2 = q[:, 2]
91
+ q3 = q[:, 3]
92
+
93
+ if order == 'xyz':
94
+ x = torch.atan2(2 * (q0 * q1 - q2 * q3), 1 - 2 * (q1 * q1 + q2 * q2))
95
+ y = torch.asin(torch.clamp(2 * (q1 * q3 + q0 * q2), -1 + epsilon, 1 - epsilon))
96
+ z = torch.atan2(2 * (q0 * q3 - q1 * q2), 1 - 2 * (q2 * q2 + q3 * q3))
97
+ elif order == 'yzx':
98
+ x = torch.atan2(2 * (q0 * q1 - q2 * q3), 1 - 2 * (q1 * q1 + q3 * q3))
99
+ y = torch.atan2(2 * (q0 * q2 - q1 * q3), 1 - 2 * (q2 * q2 + q3 * q3))
100
+ z = torch.asin(torch.clamp(2 * (q1 * q2 + q0 * q3), -1 + epsilon, 1 - epsilon))
101
+ elif order == 'zxy':
102
+ x = torch.asin(torch.clamp(2 * (q0 * q1 + q2 * q3), -1 + epsilon, 1 - epsilon))
103
+ y = torch.atan2(2 * (q0 * q2 - q1 * q3), 1 - 2 * (q1 * q1 + q2 * q2))
104
+ z = torch.atan2(2 * (q0 * q3 - q1 * q2), 1 - 2 * (q1 * q1 + q3 * q3))
105
+ elif order == 'xzy':
106
+ x = torch.atan2(2 * (q0 * q1 + q2 * q3), 1 - 2 * (q1 * q1 + q3 * q3))
107
+ y = torch.atan2(2 * (q0 * q2 + q1 * q3), 1 - 2 * (q2 * q2 + q3 * q3))
108
+ z = torch.asin(torch.clamp(2 * (q0 * q3 - q1 * q2), -1 + epsilon, 1 - epsilon))
109
+ elif order == 'yxz':
110
+ x = torch.asin(torch.clamp(2 * (q0 * q1 - q2 * q3), -1 + epsilon, 1 - epsilon))
111
+ y = torch.atan2(2 * (q1 * q3 + q0 * q2), 1 - 2 * (q1 * q1 + q2 * q2))
112
+ z = torch.atan2(2 * (q1 * q2 + q0 * q3), 1 - 2 * (q1 * q1 + q3 * q3))
113
+ elif order == 'zyx':
114
+ x = torch.atan2(2 * (q0 * q1 + q2 * q3), 1 - 2 * (q1 * q1 + q2 * q2))
115
+ y = torch.asin(torch.clamp(2 * (q0 * q2 - q1 * q3), -1 + epsilon, 1 - epsilon))
116
+ z = torch.atan2(2 * (q0 * q3 + q1 * q2), 1 - 2 * (q2 * q2 + q3 * q3))
117
+ else:
118
+ raise ValueError("unsupported Euler order: %s" % order)
119
+
120
+ if deg:
121
+ return torch.stack((x, y, z), dim=1).view(original_shape) * 180 / np.pi
122
+ else:
123
+ return torch.stack((x, y, z), dim=1).view(original_shape)
124
+
125
+
126
+ # Numpy-backed implementations
127
+
128
+ def qmul_np(q, r):
129
+ q = torch.from_numpy(q).contiguous().float()
130
+ r = torch.from_numpy(r).contiguous().float()
131
+ return qmul(q, r).numpy()
132
+
133
+
134
+ def qrot_np(q, v):
135
+ q = torch.from_numpy(q).contiguous().float()
136
+ v = torch.from_numpy(v).contiguous().float()
137
+ return qrot(q, v).numpy()
138
+
139
+
140
+ def qeuler_np(q, order, epsilon=0, use_gpu=False):
141
+ if use_gpu:
142
+ q = torch.from_numpy(q).cuda().float()
143
+ return qeuler(q, order, epsilon).cpu().numpy()
144
+ else:
145
+ q = torch.from_numpy(q).contiguous().float()
146
+ return qeuler(q, order, epsilon).numpy()
147
+
148
+
149
+ def qfix(q):
150
+ """
151
+ Enforce quaternion continuity across the time dimension by selecting
152
+ the representation (q or -q) with minimal distance (or, equivalently, maximal dot product)
153
+ between two consecutive frames.
154
+
155
+ Expects a tensor of shape (L, J, 4), where L is the sequence length and J is the number of joints.
156
+ Returns a tensor of the same shape.
157
+ """
158
+ assert len(q.shape) == 3
159
+ assert q.shape[-1] == 4
160
+
161
+ result = q.copy()
162
+ dot_products = np.sum(q[1:] * q[:-1], axis=2)
163
+ mask = dot_products < 0
164
+ mask = (np.cumsum(mask, axis=0) % 2).astype(bool)
165
+ result[1:][mask] *= -1
166
+ return result
167
+
168
+
169
+ def euler2quat(e, order, deg=True):
170
+ """
171
+ Convert Euler angles to quaternions.
172
+ """
173
+ assert e.shape[-1] == 3
174
+
175
+ original_shape = list(e.shape)
176
+ original_shape[-1] = 4
177
+
178
+ e = e.view(-1, 3)
179
+
180
+ ## if euler angles in degrees
181
+ if deg:
182
+ e = e * np.pi / 180.
183
+
184
+ x = e[:, 0]
185
+ y = e[:, 1]
186
+ z = e[:, 2]
187
+
188
+ rx = torch.stack((torch.cos(x / 2), torch.sin(x / 2), torch.zeros_like(x), torch.zeros_like(x)), dim=1)
189
+ ry = torch.stack((torch.cos(y / 2), torch.zeros_like(y), torch.sin(y / 2), torch.zeros_like(y)), dim=1)
190
+ rz = torch.stack((torch.cos(z / 2), torch.zeros_like(z), torch.zeros_like(z), torch.sin(z / 2)), dim=1)
191
+
192
+ result = None
193
+ for coord in order:
194
+ if coord == 'x':
195
+ r = rx
196
+ elif coord == 'y':
197
+ r = ry
198
+ elif coord == 'z':
199
+ r = rz
200
+ else:
201
+ raise ValueError("order may only contain 'x', 'y', 'z'")
202
+ if result is None:
203
+ result = r
204
+ else:
205
+ result = qmul(result, r)
206
+
207
+ # Reverse antipodal representation to have a non-negative "w"
208
+ if order in ['xyz', 'yzx', 'zxy']:
209
+ result *= -1
210
+
211
+ return result.view(original_shape)
212
+
213
+
214
+ def expmap_to_quaternion(e):
215
+ """
216
+ Convert axis-angle rotations (aka exponential maps) to quaternions.
217
+ Stable formula from "Practical Parameterization of Rotations Using the Exponential Map".
218
+ Expects a tensor of shape (*, 3), where * denotes any number of dimensions.
219
+ Returns a tensor of shape (*, 4).
220
+ """
221
+ assert e.shape[-1] == 3
222
+
223
+ original_shape = list(e.shape)
224
+ original_shape[-1] = 4
225
+ e = e.reshape(-1, 3)
226
+
227
+ theta = np.linalg.norm(e, axis=1).reshape(-1, 1)
228
+ w = np.cos(0.5 * theta).reshape(-1, 1)
229
+ xyz = 0.5 * np.sinc(0.5 * theta / np.pi) * e
230
+ return np.concatenate((w, xyz), axis=1).reshape(original_shape)
231
+
232
+
233
+ def euler_to_quaternion(e, order):
234
+ """
235
+ Convert Euler angles to quaternions.
236
+ """
237
+ assert e.shape[-1] == 3
238
+
239
+ original_shape = list(e.shape)
240
+ original_shape[-1] = 4
241
+
242
+ e = e.reshape(-1, 3)
243
+
244
+ x = e[:, 0]
245
+ y = e[:, 1]
246
+ z = e[:, 2]
247
+
248
+ rx = np.stack((np.cos(x / 2), np.sin(x / 2), np.zeros_like(x), np.zeros_like(x)), axis=1)
249
+ ry = np.stack((np.cos(y / 2), np.zeros_like(y), np.sin(y / 2), np.zeros_like(y)), axis=1)
250
+ rz = np.stack((np.cos(z / 2), np.zeros_like(z), np.zeros_like(z), np.sin(z / 2)), axis=1)
251
+
252
+ result = None
253
+ for coord in order:
254
+ if coord == 'x':
255
+ r = rx
256
+ elif coord == 'y':
257
+ r = ry
258
+ elif coord == 'z':
259
+ r = rz
260
+ else:
261
+ raise ValueError("order may only contain 'x', 'y', 'z'")
262
+ if result is None:
263
+ result = r
264
+ else:
265
+ result = qmul_np(result, r)
266
+
267
+ # Reverse antipodal representation to have a non-negative "w"
268
+ if order in ['xyz', 'yzx', 'zxy']:
269
+ result *= -1
270
+
271
+ return result.reshape(original_shape)
272
+
273
+
274
+ def quaternion_to_matrix(quaternions):
275
+ """
276
+ Convert rotations given as quaternions to rotation matrices.
277
+ Args:
278
+ quaternions: quaternions with real part first,
279
+ as tensor of shape (..., 4).
280
+ Returns:
281
+ Rotation matrices as tensor of shape (..., 3, 3).
282
+ """
283
+ r, i, j, k = torch.unbind(quaternions, -1)
284
+ two_s = 2.0 / (quaternions * quaternions).sum(-1)
285
+
286
+ o = torch.stack(
287
+ (
288
+ 1 - two_s * (j * j + k * k),
289
+ two_s * (i * j - k * r),
290
+ two_s * (i * k + j * r),
291
+ two_s * (i * j + k * r),
292
+ 1 - two_s * (i * i + k * k),
293
+ two_s * (j * k - i * r),
294
+ two_s * (i * k - j * r),
295
+ two_s * (j * k + i * r),
296
+ 1 - two_s * (i * i + j * j),
297
+ ),
298
+ -1,
299
+ )
300
+ return o.reshape(quaternions.shape[:-1] + (3, 3))
301
+
302
+
303
+ def quaternion_to_matrix_np(quaternions):
304
+ q = torch.from_numpy(quaternions).contiguous().float()
305
+ return quaternion_to_matrix(q).numpy()
306
+
307
+
308
+ def quaternion_to_cont6d_np(quaternions):
309
+ rotation_mat = quaternion_to_matrix_np(quaternions)
310
+ cont_6d = np.concatenate([rotation_mat[..., 0], rotation_mat[..., 1]], axis=-1)
311
+ return cont_6d
312
+
313
+
314
+ def quaternion_to_cont6d(quaternions):
315
+ rotation_mat = quaternion_to_matrix(quaternions)
316
+ cont_6d = torch.cat([rotation_mat[..., 0], rotation_mat[..., 1]], dim=-1)
317
+ return cont_6d
318
+
319
+
320
+ def cont6d_to_matrix(cont6d):
321
+ assert cont6d.shape[-1] == 6, "The last dimension must be 6"
322
+ x_raw = cont6d[..., 0:3]
323
+ y_raw = cont6d[..., 3:6]
324
+
325
+ x = x_raw / torch.norm(x_raw, dim=-1, keepdim=True)
326
+ z = torch.cross(x, y_raw, dim=-1)
327
+ z = z / torch.norm(z, dim=-1, keepdim=True)
328
+
329
+ y = torch.cross(z, x, dim=-1)
330
+
331
+ x = x[..., None]
332
+ y = y[..., None]
333
+ z = z[..., None]
334
+
335
+ mat = torch.cat([x, y, z], dim=-1)
336
+ return mat
337
+
338
+
339
+ def cont6d_to_matrix_np(cont6d):
340
+ q = torch.from_numpy(cont6d).contiguous().float()
341
+ return cont6d_to_matrix(q).numpy()
342
+
343
+
344
+ def qpow(q0, t, dtype=torch.float):
345
+ ''' q0 : tensor of quaternions
346
+ t: tensor of powers
347
+ '''
348
+ q0 = qnormalize(q0)
349
+ theta0 = torch.acos(q0[..., 0])
350
+
351
+ ## if theta0 is close to zero, add epsilon to avoid NaNs
352
+ mask = (theta0 <= 10e-10) * (theta0 >= -10e-10)
353
+ theta0 = (1 - mask) * theta0 + mask * 10e-10
354
+ v0 = q0[..., 1:] / torch.sin(theta0).view(-1, 1)
355
+
356
+ if isinstance(t, torch.Tensor):
357
+ q = torch.zeros(t.shape + q0.shape)
358
+ theta = t.view(-1, 1) * theta0.view(1, -1)
359
+ else: ## if t is a number
360
+ q = torch.zeros(q0.shape)
361
+ theta = t * theta0
362
+
363
+ q[..., 0] = torch.cos(theta)
364
+ q[..., 1:] = v0 * torch.sin(theta).unsqueeze(-1)
365
+
366
+ return q.to(dtype)
367
+
368
+
369
+ def qslerp(q0, q1, t):
370
+ '''
371
+ q0: starting quaternion
372
+ q1: ending quaternion
373
+ t: array of points along the way
374
+
375
+ Returns:
376
+ Tensor of Slerps: t.shape + q0.shape
377
+ '''
378
+
379
+ q0 = qnormalize(q0)
380
+ q1 = qnormalize(q1)
381
+ q_ = qpow(qmul(q1, qinv(q0)), t)
382
+
383
+ return qmul(q_,
384
+ q0.contiguous().view(torch.Size([1] * len(t.shape)) + q0.shape).expand(t.shape + q0.shape).contiguous())
385
+
386
+
387
+ def qbetween(v0, v1):
388
+ '''
389
+ find the quaternion used to rotate v0 to v1
390
+ '''
391
+ assert v0.shape[-1] == 3, 'v0 must be of the shape (*, 3)'
392
+ assert v1.shape[-1] == 3, 'v1 must be of the shape (*, 3)'
393
+
394
+ v = torch.cross(v0, v1)
395
+ w = torch.sqrt((v0 ** 2).sum(dim=-1, keepdim=True) * (v1 ** 2).sum(dim=-1, keepdim=True)) + (v0 * v1).sum(dim=-1,
396
+ keepdim=True)
397
+ return qnormalize(torch.cat([w, v], dim=-1))
398
+
399
+
400
+ def qbetween_np(v0, v1):
401
+ '''
402
+ find the quaternion used to rotate v0 to v1
403
+ '''
404
+ assert v0.shape[-1] == 3, 'v0 must be of the shape (*, 3)'
405
+ assert v1.shape[-1] == 3, 'v1 must be of the shape (*, 3)'
406
+
407
+ v0 = torch.from_numpy(v0).float()
408
+ v1 = torch.from_numpy(v1).float()
409
+ return qbetween(v0, v1).numpy()
410
+
411
+
412
+ def lerp(p0, p1, t):
413
+ if not isinstance(t, torch.Tensor):
414
+ t = torch.Tensor([t])
415
+
416
+ new_shape = t.shape + p0.shape
417
+ new_view_t = t.shape + torch.Size([1] * len(p0.shape))
418
+ new_view_p = torch.Size([1] * len(t.shape)) + p0.shape
419
+ p0 = p0.view(new_view_p).expand(new_shape)
420
+ p1 = p1.view(new_view_p).expand(new_shape)
421
+ t = t.view(new_view_t).expand(new_shape)
422
+
423
+ return p0 + t * (p1 - p0)
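
A short sketch (not part of the commit) exercising the PyTorch-backed helpers; the example rotation is 90 degrees about the z-axis and the test vector is arbitrary:

    # Hypothetical example: build a quaternion from Euler angles and rotate a vector.
    import torch
    from utils.quaternion import euler2quat, qrot, qmul, qinv

    q = euler2quat(torch.tensor([[0.0, 0.0, 90.0]]), 'xyz')  # angles in degrees by default
    v = torch.tensor([[1.0, 0.0, 0.0]])
    print(qrot(q, v))        # ~[0, 1, 0]: the x-axis rotated 90 degrees about z
    print(qmul(q, qinv(q)))  # ~[1, 0, 0, 0]: q * q^-1 is the identity rotation
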
text2motion/utils/skeleton.py ADDED
@@ -0,0 +1,199 @@
1
+ from utils.quaternion import *
2
+ import scipy.ndimage.filters as filters
3
+
4
+ class Skeleton(object):
5
+ def __init__(self, offset, kinematic_tree, device):
6
+ self.device = device
7
+ self._raw_offset_np = offset.numpy()
8
+ self._raw_offset = offset.clone().detach().to(device).float()
9
+ self._kinematic_tree = kinematic_tree
10
+ self._offset = None
11
+ self._parents = [0] * len(self._raw_offset)
12
+ self._parents[0] = -1
13
+ for chain in self._kinematic_tree:
14
+ for j in range(1, len(chain)):
15
+ self._parents[chain[j]] = chain[j-1]
16
+
17
+ def njoints(self):
18
+ return len(self._raw_offset)
19
+
20
+ def offset(self):
21
+ return self._offset
22
+
23
+ def set_offset(self, offsets):
24
+ self._offset = offsets.clone().detach().to(self.device).float()
25
+
26
+ def kinematic_tree(self):
27
+ return self._kinematic_tree
28
+
29
+ def parents(self):
30
+ return self._parents
31
+
32
+ # joints (batch_size, joints_num, 3)
33
+ def get_offsets_joints_batch(self, joints):
34
+ assert len(joints.shape) == 3
35
+ _offsets = self._raw_offset.expand(joints.shape[0], -1, -1).clone()
36
+ for i in range(1, self._raw_offset.shape[0]):
37
+ _offsets[:, i] = torch.norm(joints[:, i] - joints[:, self._parents[i]], p=2, dim=1)[:, None] * _offsets[:, i]
38
+
39
+ self._offset = _offsets.detach()
40
+ return _offsets
41
+
42
+ # joints (joints_num, 3)
43
+ def get_offsets_joints(self, joints):
44
+ assert len(joints.shape) == 2
45
+ _offsets = self._raw_offset.clone()
46
+ for i in range(1, self._raw_offset.shape[0]):
47
+ # print(joints.shape)
48
+ _offsets[i] = torch.norm(joints[i] - joints[self._parents[i]], p=2, dim=0) * _offsets[i]
49
+
50
+ self._offset = _offsets.detach()
51
+ return _offsets
52
+
53
+ # face_joint_idx should follow the order of right hip, left hip, right shoulder, left shoulder
54
+ # joints (batch_size, joints_num, 3)
55
+ def inverse_kinematics_np(self, joints, face_joint_idx, smooth_forward=False):
56
+ assert len(face_joint_idx) == 4
57
+ '''Get Forward Direction'''
58
+ l_hip, r_hip, sdr_r, sdr_l = face_joint_idx
59
+ across1 = joints[:, r_hip] - joints[:, l_hip]
60
+ across2 = joints[:, sdr_r] - joints[:, sdr_l]
61
+ across = across1 + across2
62
+ across = across / np.sqrt((across**2).sum(axis=-1))[:, np.newaxis]
63
+ # print(across1.shape, across2.shape)
64
+
65
+ # forward (batch_size, 3)
66
+ forward = np.cross(np.array([[0, 1, 0]]), across, axis=-1)
67
+ if smooth_forward:
68
+ forward = filters.gaussian_filter1d(forward, 20, axis=0, mode='nearest')
69
+ # forward (batch_size, 3)
70
+ forward = forward / np.sqrt((forward**2).sum(axis=-1))[..., np.newaxis]
71
+
72
+ '''Get Root Rotation'''
73
+ target = np.array([[0,0,1]]).repeat(len(forward), axis=0)
74
+ root_quat = qbetween_np(forward, target)
75
+
76
+ '''Inverse Kinematics'''
77
+ # quat_params (batch_size, joints_num, 4)
78
+ # print(joints.shape[:-1])
79
+ quat_params = np.zeros(joints.shape[:-1] + (4,))
80
+ # print(quat_params.shape)
81
+ root_quat[0] = np.array([[1.0, 0.0, 0.0, 0.0]])
82
+ quat_params[:, 0] = root_quat
83
+ # quat_params[0, 0] = np.array([[1.0, 0.0, 0.0, 0.0]])
84
+ for chain in self._kinematic_tree:
85
+ R = root_quat
86
+ for j in range(len(chain) - 1):
87
+ # (batch, 3)
88
+ u = self._raw_offset_np[chain[j+1]][np.newaxis,...].repeat(len(joints), axis=0)
89
+ # print(u.shape)
90
+ # (batch, 3)
91
+ v = joints[:, chain[j+1]] - joints[:, chain[j]]
92
+ v = v / np.sqrt((v**2).sum(axis=-1))[:, np.newaxis]
93
+ # print(u.shape, v.shape)
94
+ rot_u_v = qbetween_np(u, v)
95
+
96
+ R_loc = qmul_np(qinv_np(R), rot_u_v)
97
+
98
+ quat_params[:,chain[j + 1], :] = R_loc
99
+ R = qmul_np(R, R_loc)
100
+
101
+ return quat_params
102
+
103
+ # Be sure root joint is at the beginning of kinematic chains
104
+ def forward_kinematics(self, quat_params, root_pos, skel_joints=None, do_root_R=True):
105
+ # quat_params (batch_size, joints_num, 4)
106
+ # joints (batch_size, joints_num, 3)
107
+ # root_pos (batch_size, 3)
108
+ if skel_joints is not None:
109
+ offsets = self.get_offsets_joints_batch(skel_joints)
110
+ if len(self._offset.shape) == 2:
111
+ offsets = self._offset.expand(quat_params.shape[0], -1, -1)
112
+ joints = torch.zeros(quat_params.shape[:-1] + (3,)).to(self.device)
113
+ joints[:, 0] = root_pos
114
+ for chain in self._kinematic_tree:
115
+ if do_root_R:
116
+ R = quat_params[:, 0]
117
+ else:
118
+ R = torch.tensor([[1.0, 0.0, 0.0, 0.0]]).expand(len(quat_params), -1).detach().to(self.device)
119
+ for i in range(1, len(chain)):
120
+ R = qmul(R, quat_params[:, chain[i]])
121
+ offset_vec = offsets[:, chain[i]]
122
+ joints[:, chain[i]] = qrot(R, offset_vec) + joints[:, chain[i-1]]
123
+ return joints
124
+
125
+ # Be sure root joint is at the beginning of kinematic chains
126
+ def forward_kinematics_np(self, quat_params, root_pos, skel_joints=None, do_root_R=True):
127
+ # quat_params (batch_size, joints_num, 4)
128
+ # joints (batch_size, joints_num, 3)
129
+ # root_pos (batch_size, 3)
130
+ if skel_joints is not None:
131
+ skel_joints = torch.from_numpy(skel_joints)
132
+ offsets = self.get_offsets_joints_batch(skel_joints)
133
+ if len(self._offset.shape) == 2:
134
+ offsets = self._offset.expand(quat_params.shape[0], -1, -1)
135
+ offsets = offsets.numpy()
136
+ joints = np.zeros(quat_params.shape[:-1] + (3,))
137
+ joints[:, 0] = root_pos
138
+ for chain in self._kinematic_tree:
139
+ if do_root_R:
140
+ R = quat_params[:, 0]
141
+ else:
142
+ R = np.array([[1.0, 0.0, 0.0, 0.0]]).repeat(len(quat_params), axis=0)
143
+ for i in range(1, len(chain)):
144
+ R = qmul_np(R, quat_params[:, chain[i]])
145
+ offset_vec = offsets[:, chain[i]]
146
+ joints[:, chain[i]] = qrot_np(R, offset_vec) + joints[:, chain[i - 1]]
147
+ return joints
148
+
149
+ def forward_kinematics_cont6d_np(self, cont6d_params, root_pos, skel_joints=None, do_root_R=True):
150
+ # cont6d_params (batch_size, joints_num, 6)
151
+ # joints (batch_size, joints_num, 3)
152
+ # root_pos (batch_size, 3)
153
+ if skel_joints is not None:
154
+ skel_joints = torch.from_numpy(skel_joints)
155
+ offsets = self.get_offsets_joints_batch(skel_joints)
156
+ if len(self._offset.shape) == 2:
157
+ offsets = self._offset.expand(cont6d_params.shape[0], -1, -1)
158
+ offsets = offsets.numpy()
159
+ joints = np.zeros(cont6d_params.shape[:-1] + (3,))
160
+ joints[:, 0] = root_pos
161
+ for chain in self._kinematic_tree:
162
+ if do_root_R:
163
+ matR = cont6d_to_matrix_np(cont6d_params[:, 0])
164
+ else:
165
+ matR = np.eye(3)[np.newaxis, :].repeat(len(cont6d_params), axis=0)
166
+ for i in range(1, len(chain)):
167
+ matR = np.matmul(matR, cont6d_to_matrix_np(cont6d_params[:, chain[i]]))
168
+ offset_vec = offsets[:, chain[i]][..., np.newaxis]
169
+ # print(matR.shape, offset_vec.shape)
170
+ joints[:, chain[i]] = np.matmul(matR, offset_vec).squeeze(-1) + joints[:, chain[i-1]]
171
+ return joints
172
+
173
+ def forward_kinematics_cont6d(self, cont6d_params, root_pos, skel_joints=None, do_root_R=True):
174
+ # cont6d_params (batch_size, joints_num, 6)
175
+ # joints (batch_size, joints_num, 3)
176
+ # root_pos (batch_size, 3)
177
+ if skel_joints is not None:
178
+ # skel_joints = torch.from_numpy(skel_joints)
179
+ offsets = self.get_offsets_joints_batch(skel_joints)
180
+ if len(self._offset.shape) == 2:
181
+ offsets = self._offset.expand(cont6d_params.shape[0], -1, -1)
182
+ joints = torch.zeros(cont6d_params.shape[:-1] + (3,)).to(cont6d_params.device)
183
+ joints[..., 0, :] = root_pos
184
+ for chain in self._kinematic_tree:
185
+ if do_root_R:
186
+ matR = cont6d_to_matrix(cont6d_params[:, 0])
187
+ else:
188
+ matR = torch.eye(3).expand((len(cont6d_params), -1, -1)).detach().to(cont6d_params.device)
189
+ for i in range(1, len(chain)):
190
+ matR = torch.matmul(matR, cont6d_to_matrix(cont6d_params[:, chain[i]]))
191
+ offset_vec = offsets[:, chain[i]].unsqueeze(-1)
192
+ # print(matR.shape, offset_vec.shape)
193
+ joints[:, chain[i]] = torch.matmul(matR, offset_vec).squeeze(-1) + joints[:, chain[i-1]]
194
+ return joints
195
+
196
+
197
+
198
+
199
+
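
A sketch (not part of the commit) of the intended round trip: fit per-joint quaternions to raw joint positions, then reconstruct the positions with forward kinematics. It assumes the offsets array above is exposed as paramUtil.t2m_raw_offsets (the name used in the upstream MotionDiffuse/HumanML3D paramUtil), and the face-joint indices [2, 1, 17, 16] follow the HumanML3D convention; the joints themselves are dummy data:

    # Hypothetical example: inverse then forward kinematics on dummy joints.
    import numpy as np
    import torch
    from utils import paramUtil
    from utils.skeleton import Skeleton

    offsets = torch.from_numpy(paramUtil.t2m_raw_offsets)  # (22, 3) unit offsets -- assumed name
    skel = Skeleton(offsets, paramUtil.t2m_kinematic_chain, 'cpu')

    joints = np.random.rand(60, 22, 3).astype(np.float32)  # dummy (seq_len, joints, 3)
    quats = skel.inverse_kinematics_np(joints, face_joint_idx=[2, 1, 17, 16])
    rebuilt = skel.forward_kinematics_np(quats, joints[:, 0], skel_joints=joints)
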
text2motion/utils/utils.py ADDED
@@ -0,0 +1,156 @@
1
+ import math
2
+ import os
3
+ import random
4
+ import time
5
+ import torch
6
+
7
+ import numpy as np
8
+ import torch as th
9
+ from PIL import Image
10
+ from scipy.ndimage import gaussian_filter
11
+
12
+ from utils import paramUtil
13
+
14
+
15
+ def set_random_seed(seed: int, using_cuda: bool = False) -> None:
16
+ """Seed the different random generators.
17
+
18
+ :param seed:
19
+ :param using_cuda:
20
+ """
21
+ # Seed python RNG
22
+ random.seed(seed)
23
+ # Seed numpy RNG
24
+ np.random.seed(seed)
25
+ # seed the RNG for all devices (both CPU and CUDA)
26
+ th.manual_seed(seed)
27
+
28
+ if using_cuda:
29
+ # Deterministic operations for CuDNN, it may impact performances
30
+ th.backends.cudnn.deterministic = True
31
+ th.backends.cudnn.benchmark = False
32
+
33
+ def mkdir(path):
34
+ if not os.path.exists(path):
35
+ os.makedirs(path)
36
+
37
+ COLORS = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0],
38
+ [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255],
39
+ [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
40
+
41
+ MISSING_VALUE = -1
42
+
43
+ def save_image(image_numpy, image_path):
44
+ img_pil = Image.fromarray(image_numpy)
45
+ img_pil.save(image_path)
46
+
47
+
48
+ def save_logfile(log_loss, save_path):
49
+ with open(save_path, 'wt') as f:
50
+ for k, v in log_loss.items():
51
+ w_line = k
52
+ for digit in v:
53
+ w_line += ' %.3f' % digit
54
+ f.write(w_line + '\n')
55
+
56
+
57
+ def print_current_loss(start_time, niter_state, losses, epoch=None, inner_iter=None):
58
+
59
+ def as_minutes(s):
60
+ m = math.floor(s / 60)
61
+ s -= m * 60
62
+ return '%dm %ds' % (m, s)
63
+
64
+ def time_since(since, percent):
65
+ now = time.time()
66
+ s = now - since
67
+ es = s / percent
68
+ rs = es - s
69
+ return '%s (- %s)' % (as_minutes(s), as_minutes(rs))
70
+
71
+ if epoch is not None:
72
+ print('epoch: %3d niter: %6d inner_iter: %4d' % (epoch, niter_state, inner_iter), end=" ")
73
+
74
+ now = time.time()
75
+ message = '%s'%(as_minutes(now - start_time))
76
+
77
+ for k, v in losses.items():
78
+ message += ' %s: %.4f ' % (k, v)
79
+ print(message)
80
+
81
+
82
+ def compose_gif_img_list(img_list, fp_out, duration):
83
+ img, *imgs = [Image.fromarray(np.array(image)) for image in img_list]
84
+ img.save(fp=fp_out, format='GIF', append_images=imgs, optimize=False,
85
+ save_all=True, loop=0, duration=duration)
86
+
87
+
88
+ def save_images(visuals, image_path):
89
+ if not os.path.exists(image_path):
90
+ os.makedirs(image_path)
91
+
92
+ for i, (label, img_numpy) in enumerate(visuals.items()):
93
+ img_name = '%d_%s.jpg' % (i, label)
94
+ save_path = os.path.join(image_path, img_name)
95
+ save_image(img_numpy, save_path)
96
+
97
+
98
+ def save_images_test(visuals, image_path, from_name, to_name):
99
+ if not os.path.exists(image_path):
100
+ os.makedirs(image_path)
101
+
102
+ for i, (label, img_numpy) in enumerate(visuals.items()):
103
+ img_name = "%s_%s_%s" % (from_name, to_name, label)
104
+ save_path = os.path.join(image_path, img_name)
105
+ save_image(img_numpy, save_path)
106
+
107
+
108
+ def compose_and_save_img(img_list, save_dir, img_name, col=4, row=1, img_size=(256, 200)):
109
+ # print(col, row)
110
+ compose_img = compose_image(img_list, col, row, img_size)
111
+ if not os.path.exists(save_dir):
112
+ os.makedirs(save_dir)
113
+ img_path = os.path.join(save_dir, img_name)
114
+ # print(img_path)
115
+ compose_img.save(img_path)
116
+
117
+
118
+ def compose_image(img_list, col, row, img_size):
119
+ to_image = Image.new('RGB', (col * img_size[0], row * img_size[1]))
120
+ for y in range(0, row):
121
+ for x in range(0, col):
122
+ from_img = Image.fromarray(img_list[y * col + x])
123
+ # print((x * img_size[0], y*img_size[1],
124
+ # (x + 1) * img_size[0], (y + 1) * img_size[1]))
125
+ paste_area = (x * img_size[0], y*img_size[1],
126
+ (x + 1) * img_size[0], (y + 1) * img_size[1])
127
+ to_image.paste(from_img, paste_area)
128
+ # to_image[y*img_size[1]:(y + 1) * img_size[1], x * img_size[0] :(x + 1) * img_size[0]] = from_img
129
+ return to_image
130
+
131
+
132
+ def list_cut_average(ll, intervals):
133
+ if intervals == 1:
134
+ return ll
135
+
136
+ bins = math.ceil(len(ll) * 1.0 / intervals)
137
+ ll_new = []
138
+ for i in range(bins):
139
+ l_low = intervals * i
140
+ l_high = l_low + intervals
141
+ l_high = l_high if l_high < len(ll) else len(ll)
142
+ ll_new.append(np.mean(ll[l_low:l_high]))
143
+ return ll_new
144
+
145
+
146
+ def motion_temporal_filter(motion, sigma=1):
147
+ motion = motion.reshape(motion.shape[0], -1)
148
+ # print(motion.shape)
149
+ for i in range(motion.shape[1]):
150
+ motion[:, i] = gaussian_filter(motion[:, i], sigma=sigma, mode="nearest")
151
+ return motion.reshape(motion.shape[0], -1, 3)
152
+
153
+
154
+ def get_device(args):
155
+ return torch.device('cuda:%d' % args.gpu_id if args.gpu_id != -1 else 'cpu')
156
+
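
A brief usage sketch (not part of the commit); the seed, sigma, and dummy motion shape are arbitrary:

    # Hypothetical example: seed all RNGs, then temporally smooth a joint sequence.
    import numpy as np
    from utils.utils import set_random_seed, motion_temporal_filter

    set_random_seed(42, using_cuda=False)
    motion = np.random.rand(60, 22, 3)                    # (seq_len, joints, 3)
    smoothed = motion_temporal_filter(motion, sigma=2.5)
    print(smoothed.shape)                                 # (60, 22, 3)
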
text2motion/utils/word_vectorizer.py ADDED
@@ -0,0 +1,80 @@
1
+ import numpy as np
2
+ import pickle
3
+ from os.path import join as pjoin
4
+
5
+ POS_enumerator = {
6
+ 'VERB': 0,
7
+ 'NOUN': 1,
8
+ 'DET': 2,
9
+ 'ADP': 3,
10
+ 'NUM': 4,
11
+ 'AUX': 5,
12
+ 'PRON': 6,
13
+ 'ADJ': 7,
14
+ 'ADV': 8,
15
+ 'Loc_VIP': 9,
16
+ 'Body_VIP': 10,
17
+ 'Obj_VIP': 11,
18
+ 'Act_VIP': 12,
19
+ 'Desc_VIP': 13,
20
+ 'OTHER': 14,
21
+ }
22
+
23
+ Loc_list = ('left', 'right', 'clockwise', 'counterclockwise', 'anticlockwise', 'forward', 'back', 'backward',
24
+ 'up', 'down', 'straight', 'curve')
25
+
26
+ Body_list = ('arm', 'chin', 'foot', 'feet', 'face', 'hand', 'mouth', 'leg', 'waist', 'eye', 'knee', 'shoulder', 'thigh')
27
+
28
+ Obj_List = ('stair', 'dumbbell', 'chair', 'window', 'floor', 'car', 'ball', 'handrail', 'baseball', 'basketball')
29
+
30
+ Act_list = ('walk', 'run', 'swing', 'pick', 'bring', 'kick', 'put', 'squat', 'throw', 'hop', 'dance', 'jump', 'turn',
31
+ 'stumble', 'dance', 'stop', 'sit', 'lift', 'lower', 'raise', 'wash', 'stand', 'kneel', 'stroll',
32
+ 'rub', 'bend', 'balance', 'flap', 'jog', 'shuffle', 'lean', 'rotate', 'spin', 'spread', 'climb')
33
+
34
+ Desc_list = ('slowly', 'carefully', 'fast', 'careful', 'slow', 'quickly', 'happy', 'angry', 'sad', 'happily',
35
+ 'angrily', 'sadly')
36
+
37
+ VIP_dict = {
38
+ 'Loc_VIP': Loc_list,
39
+ 'Body_VIP': Body_list,
40
+ 'Obj_VIP': Obj_List,
41
+ 'Act_VIP': Act_list,
42
+ 'Desc_VIP': Desc_list,
43
+ }
44
+
45
+
46
+ class WordVectorizer(object):
47
+ def __init__(self, meta_root, prefix):
48
+ vectors = np.load(pjoin(meta_root, '%s_data.npy'%prefix))
49
+ words = pickle.load(open(pjoin(meta_root, '%s_words.pkl'%prefix), 'rb'))
50
+ word2idx = pickle.load(open(pjoin(meta_root, '%s_idx.pkl'%prefix), 'rb'))
51
+ self.word2vec = {w: vectors[word2idx[w]] for w in words}
52
+
53
+ def _get_pos_ohot(self, pos):
54
+ pos_vec = np.zeros(len(POS_enumerator))
55
+ if pos in POS_enumerator:
56
+ pos_vec[POS_enumerator[pos]] = 1
57
+ else:
58
+ pos_vec[POS_enumerator['OTHER']] = 1
59
+ return pos_vec
60
+
61
+ def __len__(self):
62
+ return len(self.word2vec)
63
+
64
+ def __getitem__(self, item):
65
+ word, pos = item.split('/')
66
+ if word in self.word2vec:
67
+ word_vec = self.word2vec[word]
68
+ vip_pos = None
69
+ for key, values in VIP_dict.items():
70
+ if word in values:
71
+ vip_pos = key
72
+ break
73
+ if vip_pos is not None:
74
+ pos_vec = self._get_pos_ohot(vip_pos)
75
+ else:
76
+ pos_vec = self._get_pos_ohot(pos)
77
+ else:
78
+ word_vec = self.word2vec['unk']
79
+ pos_vec = self._get_pos_ohot('OTHER')
80
+ return word_vec, pos_vec
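
A usage sketch (not part of the commit). The 'glove' metadata directory and the 'our_vab' prefix follow the HumanML3D/MotionDiffuse setup and are assumptions here; the lookup key is a word/POS pair:

    # Hypothetical example: fetch a GloVe embedding and a POS one-hot for a token.
    from utils.word_vectorizer import WordVectorizer

    w_vectorizer = WordVectorizer('glove', 'our_vab')  # expects glove/our_vab_data.npy, _words.pkl, _idx.pkl
    word_emb, pos_onehot = w_vectorizer['walk/VERB']   # 'walk' is in Act_list, so the POS maps to Act_VIP
    print(word_emb.shape, pos_onehot.shape)            # e.g. (300,), (15,)
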