Commit 801501a • DveloperY0115 committed
Parent(s): 7d3169e
init repo

This view is limited to 50 files because the commit contains too many changes.
- .gitattributes +2 -0
- README.md +5 -4
- app.py +116 -0
- checkpoints/lang_phase1/hparams.yaml +48 -0
- checkpoints/lang_phase1/state_only.ckpt +3 -0
- checkpoints/lang_phase2/hparams.yaml +47 -0
- checkpoints/lang_phase2/state_only.ckpt +3 -0
- checkpoints/phase1/hparams.yaml +39 -0
- checkpoints/phase1/state_only.ckpt +3 -0
- checkpoints/phase2/hparams.yaml +41 -0
- checkpoints/phase2/state_only.ckpt +3 -0
- custom_wheels/salad-0.1-py3-none-any.whl +0 -0
- data/autosdf_spaghetti_intersec_game_data.csv +0 -0
- data/spaghetti_airplane_latents.hdf5 +3 -0
- data/spaghetti_airplane_latents_mean_std.hdf5 +3 -0
- data/spaghetti_chair_latents.hdf5 +3 -0
- data/spaghetti_chair_latents_mean_std.hdf5 +3 -0
- data/spaghetti_table_latents.hdf5 +3 -0
- data/spaghetti_table_latents_mean_std.hdf5 +3 -0
- requirements.txt +1 -0
- salad.egg-info/PKG-INFO +5 -0
- salad.egg-info/SOURCES.txt +7 -0
- salad.egg-info/dependency_links.txt +1 -0
- salad.egg-info/not-zip-safe +1 -0
- salad.egg-info/top_level.txt +1 -0
- salad/data/__pycache__/dataset.cpython-39.pyc +0 -0
- salad/data/dataset.py +149 -0
- salad/model_components/__pycache__/lstm.cpython-39.pyc +0 -0
- salad/model_components/__pycache__/network.cpython-39.pyc +0 -0
- salad/model_components/__pycache__/simple_module.cpython-39.pyc +0 -0
- salad/model_components/__pycache__/transformer.cpython-39.pyc +0 -0
- salad/model_components/__pycache__/variance_schedule.cpython-39.pyc +0 -0
- salad/model_components/lstm.py +56 -0
- salad/model_components/network.py +229 -0
- salad/model_components/simple_module.py +125 -0
- salad/model_components/transformer.py +308 -0
- salad/model_components/variance_schedule.py +57 -0
- salad/models/__init__.py +0 -0
- salad/models/__pycache__/__init__.cpython-39.pyc +0 -0
- salad/models/__pycache__/base_model.cpython-39.pyc +0 -0
- salad/models/__pycache__/language_phase1.cpython-39.pyc +0 -0
- salad/models/__pycache__/language_phase2.cpython-39.pyc +0 -0
- salad/models/__pycache__/phase1.cpython-39.pyc +0 -0
- salad/models/__pycache__/phase2.cpython-39.pyc +0 -0
- salad/models/base_model.py +147 -0
- salad/models/language_phase1.py +340 -0
- salad/models/language_phase2.py +201 -0
- salad/models/phase1.py +65 -0
- salad/models/phase2.py +183 -0
- salad/spaghetti/.gitignore +9 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.hdf5 filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,12 +1,13 @@
 ---
-title:
-emoji:
+title: Test
+emoji: 🦀
 colorFrom: blue
-colorTo:
+colorTo: red
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.36.1
 app_file: app.py
 pinned: false
+license: mit
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,116 @@
"""
app.py

An interactive demo of text-guided shape generation.
"""

from pathlib import Path
from typing import Literal

import gradio as gr
import plotly.graph_objects as go

from salad.utils.spaghetti_util import (
    get_mesh_from_spaghetti,
    generate_zc_from_sj_gaus,
    load_mesher,
    load_spaghetti,
)
import hydra
from omegaconf import OmegaConf
import torch
from pytorch_lightning import seed_everything


def load_model(
    model_class: Literal["phase1", "phase2", "lang_phase1", "lang_phase2"],
    device,
):
    checkpoint_dir = Path(__file__).parent / "checkpoints"
    c = OmegaConf.load(checkpoint_dir / f"{model_class}/hparams.yaml")
    model = hydra.utils.instantiate(c)
    ckpt = torch.load(checkpoint_dir / f"{model_class}/state_only.ckpt")
    model.load_state_dict(ckpt)
    model.eval()
    for p in model.parameters(): p.requires_grad_(False)
    model = model.to(device)
    return model


def run_inference(prompt: str):
    """The entry point of the demo."""

    device: torch.device = torch.device("cuda")
    """Device to run the demo on."""
    seed: int = 63
    """Random seed for reproducibility."""

    # set random seed
    seed_everything(seed)

    # load SPAGHETTI and mesher
    spaghetti = load_spaghetti(device)
    mesher = load_mesher(device)

    # load SALAD
    lang_phase1_model = load_model("lang_phase1", device)
    lang_phase2_model = load_model("phase2", device)
    lang_phase1_model._build_dataset("val")

    # run phase 1
    extrinsics = lang_phase1_model.sampling_gaussians([prompt])

    # run phase 2
    intrinsics = lang_phase2_model.sample(extrinsics)

    # generate mesh
    zcs = generate_zc_from_sj_gaus(spaghetti, intrinsics, extrinsics)
    vertices, faces = get_mesh_from_spaghetti(
        spaghetti,
        mesher,
        zcs[0],
        res=256,
    )

    # plot
    figure = go.Figure(
        data=[
            go.Mesh3d(
                x=vertices[:, 0],  # flip front-back
                y=-vertices[:, 2],
                z=vertices[:, 1],
                i=faces[:, 0],
                j=faces[:, 1],
                k=faces[:, 2],
                color="gray",
            )
        ],
        layout=dict(
            scene=dict(
                xaxis=dict(visible=False),
                yaxis=dict(visible=False),
                zaxis=dict(visible=False),
            )
        ),
    )

    return figure

if __name__ == "__main__":

    # create UI
    demo = gr.Interface(
        fn=run_inference,
        inputs="text",
        outputs=gr.Plot(),
        title="SALAD: Text-Guided Shape Generation",
        description="Describe a chair",
        examples=[
            "an office chair",
            "a chair with armrests",
            "a chair without armrests",
        ]
    )
    # initiate
    demo.queue(max_size=30)
    demo.launch()
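Since app.py hard-codes a CUDA device and run_inference returns a Plotly figure, the demo can also be exercised outside the Gradio UI. A minimal sketch, assuming the checkpoints/ and data/ folders from this commit are in place and the bundled salad wheel from requirements.txt is installed:

# Sketch: call the demo entry point directly (CUDA GPU assumed, as in app.py).
from app import run_inference

fig = run_inference("a chair with armrests")  # returns a plotly Figure
fig.write_html("chair.html")                  # inspect the generated mesh in a browser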
checkpoints/lang_phase1/hparams.yaml
ADDED
@@ -0,0 +1,48 @@
_target_: salad.models.language_phase1.LangPhase1Model

network:
  _target_: salad.model_components.network.CondDiffNetwork
  input_dim: 16
  residual: true
  context_dim: 768
  context_embedding_dim: 1024
  embedding_dim: 512
  encoder_use_time: false
  encoder_type: pointwise
  decoder_type: transformer_encoder
  enc_num_layers: 2
  dec_num_layers: 6
  use_timestep_embedder: true
  timestep_embedder_dim: 128

variance_schedule:
  _target_: salad.model_components.variance_schedule.VarianceSchedule
  num_steps: &time_steps 1000
  beta_1: 1e-4
  beta_T: 0.05
  mode: linear

# optimizer
lr: 1e-4
batch_size: 64

# dataset
dataset_kwargs:
  data_path: spaghetti_chair_latents.hdf5
  repeat: 1
  data_keys: ["g_js_affine"]
  only_easy_context: false
  global_normalization: &normalization partial

global_normalization: *normalization
num_timesteps: *time_steps
faster: true
validation_step: 10
no_run_validation: false
spaghetti_tag: "chairs_large" # or airplanes, tables

text_encoder_freeze: false
use_lstm: true
classifier_free_guidance: true
conditioning_dropout_prob: 0.2
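The `_target_` keys make this file directly instantiable with Hydra; app.py loads it with OmegaConf and builds the model, its nested network, and the variance schedule in one call. A minimal sketch of that pattern, with paths as laid out in this commit:

# Sketch: turn this YAML into a LangPhase1Model (pattern taken from app.py).
import hydra
import torch
from omegaconf import OmegaConf

cfg = OmegaConf.load("checkpoints/lang_phase1/hparams.yaml")
model = hydra.utils.instantiate(cfg)   # also builds network and variance_schedule
state = torch.load("checkpoints/lang_phase1/state_only.ckpt")
model.load_state_dict(state)
model.eval()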
checkpoints/lang_phase1/state_only.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bf46454eaaabbb7f3008c51beaae5b16794b189f3cae48f79db70fcdf5413380
size 318782397
checkpoints/lang_phase2/hparams.yaml
ADDED
@@ -0,0 +1,47 @@
_target_: salad.models.language_phase2.LangPhase2Model
network:
  _target_: salad.model_components.network.CondDiffNetwork
  input_dim: 512
  residual: true
  context_dim: 784 # concat of 768 lang feat and gaussian.
  context_embedding_dim: 1024
  embedding_dim: 512
  encoder_use_time: false
  encoder_type: transformer
  decoder_type: transformer_encoder
  enc_num_layers: 6
  dec_num_layers: 6
  use_timestep_embedder: true
  timestep_embedder_dim: 128

variance_schedule:
  _target_: salad.model_components.variance_schedule.VarianceSchedule
  num_steps: &time_steps 1000
  beta_1: 1e-4
  beta_T: 0.05
  mode: linear

# optimizer
lr: 1e-4
batch_size: 64

# dataset
dataset_kwargs:
  data_path: spaghetti_chair_latents.hdf5
  repeat: 1
  data_keys: ["s_j_affine", "g_js_affine"]
  only_easy_context: false
  global_normalization: &normalization false

global_normalization: *normalization
num_timesteps: *time_steps
faster: true
validation_step: 10
no_run_validation: false
spaghetti_tag: "chairs_large" # or airplanes, tables

text_encoder_freeze: false
use_lstm: true
classifier_free_guidance: true
conditioning_dropout_prob: 0.2
checkpoints/lang_phase2/state_only.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4105dd24201fa8aad3fc4db2f74376f98f4df53b38ae749d944bfdb6552ea40f
size 455307461
checkpoints/phase1/hparams.yaml
ADDED
@@ -0,0 +1,39 @@
_target_: salad.models.phase1.Phase1Model

network:
  _target_: salad.model_components.network.UnCondDiffNetwork
  input_dim: 16
  embedding_dim: 512
  num_heads: 4
  use_timestep_embedder: true
  timestep_embedder_dim: 128
  enc_num_layers: 6
  residual: true
  encoder_type: transformer
  attn_dropout: 0.0

variance_schedule:
  _target_: salad.model_components.variance_schedule.VarianceSchedule
  num_steps: &time_steps 1000
  beta_1: 1e-4
  beta_T: 0.05
  mode: linear

# optimizer
lr: 1e-4
batch_size: 64

# dataset
dataset_kwargs:
  data_path: spaghetti_chair_latents.hdf5
  repeat: 3
  data_keys: ["g_js_affine"]
  global_normalization: &normalization partial

global_normalization: *normalization # normalize pi, eigenvalues.
num_timesteps: *time_steps
faster: true
validation_step: 10
no_run_validation: false
spaghetti_tag: "chairs_large" # or airplanes, tables
checkpoints/phase1/state_only.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5f616fa657723de4855e8571f3ef828ff25221b86cb516a755aaa93538b0c7de
size 60275831
checkpoints/phase2/hparams.yaml
ADDED
@@ -0,0 +1,41 @@
_target_: salad.models.phase2.Phase2Model

network:
  _target_: salad.model_components.network.CondDiffNetwork
  input_dim: 512
  residual: true
  context_dim: 16 # gaussian condition dim.
  context_embedding_dim: 512
  embedding_dim: 512
  encoder_use_time: false
  encoder_type: transformer
  decoder_type: transformer_encoder # we don't use cross attention.
  enc_num_layers: 6
  dec_num_layers: 6
  use_timestep_embedder: true
  timestep_embedder_dim: 128

variance_schedule:
  _target_: salad.model_components.variance_schedule.VarianceSchedule
  num_steps: &time_steps 1000
  beta_1: 1e-4
  beta_T: 0.05
  mode: linear

# optimizer
lr: 1e-4
batch_size: 64

# dataset
dataset_kwargs:
  data_path: spaghetti_chair_latents.hdf5
  repeat: 3
  data_keys: ["s_j_affine", "g_js_affine"]
  global_normalization: &normalization null

global_normalization: *normalization # normalize pi, eigenvalues.
num_timesteps: *time_steps
faster: true
validation_step: 10
no_run_validation: false
spaghetti_tag: "chairs_large" # or airplanes, tables
checkpoints/phase2/state_only.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aed08103f6eebbd84fac523affaab2cd493f8f2a1d5e81e9d298cc0a7a807ed2
size 150592331
custom_wheels/salad-0.1-py3-none-any.whl
ADDED
Binary file (994 Bytes).
data/autosdf_spaghetti_intersec_game_data.csv
ADDED
The diff for this file is too large to render.
data/spaghetti_airplane_latents.hdf5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c242271687d13159b0df44a3179a0d460c9e87c577851d8d0282f0369a529f46
size 222017536
data/spaghetti_airplane_latents_mean_std.hdf5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c32e24c7786593ffdd918e05fdd1148634c3c707a4948e0a3bf6a6c002b540e1
size 12544
data/spaghetti_chair_latents.hdf5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7bfa1533a0366e9271f6bf96d4f7a135f8763ba66ce26b5cc952e6af14e5bfe4
size 1255457792
data/spaghetti_chair_latents_mean_std.hdf5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f6be55ae235fe77aa821146122f0e911e9593e78a001b8fa63dea041c49095fa
size 8320
data/spaghetti_table_latents.hdf5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:20ef1da19e47e2c23782defa2c5d2172d7322c5476c92f7ed3fee271e3893f91
size 1127843840
data/spaghetti_table_latents_mean_std.hdf5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:718825073ede0c52ccd5c277b1114425558a2a45258dd952a4707fecb3dc5d57
size 8320
requirements.txt
ADDED
@@ -0,0 +1 @@
./custom_wheels/salad-0.1-py3-none-any.whl
salad.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,5 @@
Metadata-Version: 2.1
Name: salad
Version: 0.1
Summary: SALAD: Part-Level Latent Diffusion for 3D Shape Generation and Manipulation
Home-page: https://github.com/63days/SALAD
salad.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,7 @@
README.md
setup.py
salad.egg-info/PKG-INFO
salad.egg-info/SOURCES.txt
salad.egg-info/dependency_links.txt
salad.egg-info/not-zip-safe
salad.egg-info/top_level.txt
salad.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@

salad.egg-info/not-zip-safe
ADDED
@@ -0,0 +1 @@

salad.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
salad
salad/data/__pycache__/dataset.cpython-39.pyc
ADDED
Binary file (4.61 kB).
salad/data/dataset.py
ADDED
@@ -0,0 +1,149 @@
import h5py
import numpy as np
import pandas as pd
import torch
from dotmap import DotMap

from salad.utils.paths import DATA_DIR
from salad.utils import thutil


class SALADDataset(torch.utils.data.Dataset):
    def __init__(self, data_path, repeat=None, **kwargs):
        super().__init__()
        self.data_path = str(DATA_DIR / data_path)
        self.repeat = repeat
        self.__dict__.update(kwargs)
        self.hparams = DotMap(self.__dict__)

        """
        Global Data statistics.
        """
        if self.hparams.get("global_normalization"):
            with h5py.File(self.data_path.replace(".hdf5", "_mean_std.hdf5")) as f:
                self.global_mean = f["mean"][:].astype(np.float32)
                self.global_std = f["std"][:].astype(np.float32)

        self.data = dict()
        with h5py.File(self.data_path) as f:
            for k in self.hparams.data_keys:
                self.data[k] = f[k][:].astype(np.float32)

                """
                global_normalization arg is for gaussians only.
                """
                if k == "g_js_affine":
                    if self.hparams.get("global_normalization") == "partial":
                        assert k == "g_js_affine"
                        if self.hparams.get("verbose"):
                            print("[*] Normalize data only for pi and eigenvalues.")
                        # 3: mu, 9: eigvec, 1: pi, 3: eigval
                        self.data[k] = self.normalize_global_static(
                            self.data[k], slice(12, None)
                        )
                    elif self.hparams.get("global_normalization") == "all":
                        assert k == "g_js_affine"
                        if self.hparams.get("verbose"):
                            print("[*] Normalize data for all elements.")
                        self.data[k] = self.normalize_global_static(
                            self.data[k], slice(None)
                        )

    def __getitem__(self, idx):
        if self.repeat is not None and self.repeat > 1:
            idx = int(idx / self.repeat)

        items = []
        for k in self.hparams.data_keys:
            data = torch.from_numpy(self.data[k][idx])
            items.append(data)

        if self.hparams.get("concat_data"):
            return torch.cat(items, -1)  # [16,528]
        if len(items) == 1:
            return items[0]
        return items

    def __len__(self):
        k = self.hparams.data_keys[0]
        if self.repeat is not None and self.repeat > 1:
            return len(self.data[k]) * self.repeat
        return len(self.data[k])

    def get_other_latents(self, key):
        with h5py.File(self.data_path) as f:
            return f[key][:].astype(np.float32)

    def normalize_global_static(self, data: np.ndarray, normalize_indices=slice(None)):
        """
        Input:
            np.ndarray or torch.Tensor. [16,16] or [B,16,16]
            slice(None) -> full
            slice(12, None) -> partial
        Output:
            [16,16] or [B,16,16]
        """
        assert normalize_indices == slice(None) or normalize_indices == slice(
            12, None
        ), print(f"{normalize_indices} is wrong.")
        data = thutil.th2np(data).copy()
        data[..., normalize_indices] = (
            data[..., normalize_indices] - self.global_mean[normalize_indices]
        ) / self.global_std[normalize_indices]
        return data

    def unnormalize_global_static(
        self, data: np.ndarray, unnormalize_indices=slice(None)
    ):
        """
        Input:
            np.ndarray or torch.Tensor. [16,16] or [B,16,16]
            slice(None) -> full
            slice(12, None) -> partial
        Output:
            [16,16] or [B,16,16]
        """
        assert unnormalize_indices == slice(None) or unnormalize_indices == slice(
            12, None
        ), print(f"{unnormalize_indices} is wrong.")
        data = thutil.th2np(data).copy()
        data[..., unnormalize_indices] = (
            data[..., unnormalize_indices]
        ) * self.global_std[unnormalize_indices] + self.global_mean[unnormalize_indices]
        return data


class LangSALADDataset(SALADDataset):
    def __init__(self, data_path, repeat=None, **kwargs):
        super().__init__(data_path, repeat, **kwargs)

        # self.game_data = pd.read_csv(self.hparams.lang_data_path)
        self.game_data = pd.read_csv(DATA_DIR / "autosdf_spaghetti_intersec_game_data.csv")
        self.shapenet_ids = np.array(self.game_data["sn"])
        self.spaghetti_indices = np.array(self.game_data["spaghetti_idx"])  # for 5401
        self.texts = np.array(self.game_data["text"])

        assert len(self.shapenet_ids) == len(self.spaghetti_indices) == len(self.texts)

    def __getitem__(self, idx):
        if self.repeat is not None and self.repeat > 1:
            idx = int(idx / self.repeat)

        spa_idx = self.spaghetti_indices[idx]
        text = self.texts[idx]
        latents = []
        for k in self.hparams.data_keys:
            data = torch.from_numpy(self.data[k][spa_idx])
            latents.append(data)

        item = latents + [text]
        if self.hparams.get("concat_data"):
            latents = torch.cat(latents, -1)
            return latents, text

        return item

    def __len__(self):
        if self.repeat is not None and self.repeat > 1:
            return len(self.texts) * self.repeat
        return len(self.texts)
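A small usage sketch of SALADDataset with the dataset_kwargs from checkpoints/phase1/hparams.yaml; it assumes the chair HDF5 files from data/ are reachable under salad.utils.paths.DATA_DIR.

# Sketch: build the phase-1 training dataset (kwargs mirror checkpoints/phase1/hparams.yaml).
from salad.data.dataset import SALADDataset

ds = SALADDataset(
    data_path="spaghetti_chair_latents.hdf5",
    repeat=3,
    data_keys=["g_js_affine"],
    global_normalization="partial",   # normalize only pi and eigenvalues
)
print(len(ds), ds[0].shape)           # one item is a [16, 16] per-part Gaussian tensor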
salad/model_components/__pycache__/lstm.cpython-39.pyc
ADDED
Binary file (2.38 kB).
salad/model_components/__pycache__/network.cpython-39.pyc
ADDED
Binary file (4.73 kB).
salad/model_components/__pycache__/simple_module.cpython-39.pyc
ADDED
Binary file (3.9 kB).
salad/model_components/__pycache__/transformer.cpython-39.pyc
ADDED
Binary file (8.63 kB).
salad/model_components/__pycache__/variance_schedule.cpython-39.pyc
ADDED
Binary file (1.99 kB).
salad/model_components/lstm.py
ADDED
@@ -0,0 +1,56 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class LSTM(nn.Module):
    def __init__(self, text_dim, embedding_dim, vocab_size, padding_idx=0):
        super().__init__()
        self.padding_idx = padding_idx
        self.word_embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=padding_idx
        )
        self.rnn = nn.LSTM(embedding_dim, text_dim, batch_first=True)
        self.w_attn = nn.Parameter(torch.Tensor(1, text_dim))
        nn.init.xavier_uniform_(self.w_attn)

    def forward(self, padded_tokens, dropout=0.5):
        w_emb = self.word_embedding(padded_tokens)
        w_emb = F.dropout(w_emb, dropout, self.training)
        len_seq = (padded_tokens != self.padding_idx).sum(dim=1).cpu()
        x_packed = pack_padded_sequence(
            w_emb, len_seq, enforce_sorted=False, batch_first=True
        )
        B = padded_tokens.shape[0]
        rnn_out, _ = self.rnn(x_packed)
        rnn_out, dummy = pad_packed_sequence(rnn_out, batch_first=True)
        h = rnn_out[torch.arange(B), len_seq - 1]
        final_feat, attn = self.word_attention(rnn_out, h, len_seq)
        return final_feat, attn

    def word_attention(self, R, h, len_seq):
        """
        Input:
            R: hidden states of the entire words
            h: the final hidden state after processing the entire words
            len_seq: the length of the sequence
        Output:
            final_feat: the final feature after the bilinear attention
            attn: word attention weights
        """
        B, N, D = R.shape
        device = R.device
        len_seq = len_seq.to(device)

        W_attn = (self.w_attn * torch.eye(D).to(device))[None].repeat(B, 1, 1)
        score = torch.bmm(torch.bmm(R, W_attn), h.unsqueeze(-1))

        mask = torch.arange(N).reshape(1, N, 1).repeat(B, 1, 1).to(device)
        mask = mask < len_seq.reshape(B, 1, 1)

        score = score.masked_fill(mask == 0, -1e9)
        attn = F.softmax(score, 1)
        final_feat = torch.bmm(R.transpose(1, 2), attn).squeeze(-1)

        return final_feat, attn.squeeze(-1)
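A shape-level sketch of this text encoder as it is wired up in language_phase1.py (BERT tokenizer vocabulary, 768-d features, bilinear word attention); the weights here are random, so only the shapes are meaningful.

# Sketch: encode a prompt with the LSTM + word-attention module (settings from language_phase1.py).
import torch
from transformers import BertTokenizer
from salad.model_components.lstm import LSTM

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
encoder = LSTM(text_dim=768, embedding_dim=768, vocab_size=30522, padding_idx=0)

tokens = tokenizer(["a chair with armrests"], return_tensors="pt", padding=True)
feat, attn = encoder(tokens["input_ids"])   # feat: [1, 768], attn: [1, seq_len]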
salad/model_components/network.py
ADDED
@@ -0,0 +1,229 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from dotmap import DotMap
from salad.model_components.simple_module import TimePointWiseEncoder, TimestepEmbedder


from salad.model_components.transformer import (
    PositionalEncoding,
    TimeTransformerDecoder,
    TimeTransformerEncoder,
)

class UnCondDiffNetwork(nn.Module):
    def __init__(self, input_dim, residual, **kwargs):
        """
        Transformer Encoder.
        """
        super().__init__()
        self.input_dim = input_dim
        self.residual = residual
        self.__dict__.update(kwargs)
        self.hparams = DotMap(self.__dict__)

        self._build_model()

    def _build_model(self):
        self.act = F.leaky_relu
        if self.hparams.get("use_timestep_embedder"):
            self.time_embedder = TimestepEmbedder(self.hparams.timestep_embedder_dim)
            dim_ctx = self.hparams.timestep_embedder_dim
        else:
            dim_ctx = 3

        """
        Encoder part
        """
        enc_dim = self.hparams.embedding_dim
        self.embedding = nn.Linear(self.hparams.input_dim, enc_dim)
        if not self.hparams.get("encoder_type"):
            self.encoder = TimeTransformerEncoder(
                enc_dim,
                dim_ctx=dim_ctx,
                num_heads=self.hparams.num_heads
                if self.hparams.get("num_heads")
                else 4,
                use_time=True,
                num_layers=self.hparams.enc_num_layers,
                last_fc=True,
                last_fc_dim_out=self.hparams.input_dim,
            )
        else:
            if self.hparams.encoder_type == "transformer":
                self.encoder = TimeTransformerEncoder(
                    enc_dim,
                    dim_ctx=dim_ctx,
                    num_heads=self.hparams.num_heads
                    if self.hparams.get("num_heads")
                    else 4,
                    use_time=True,
                    num_layers=self.hparams.enc_num_layers,
                    last_fc=True,
                    last_fc_dim_out=self.hparams.input_dim,
                    dropout=self.hparams.get("attn_dropout", 0.0)
                )
            else:
                raise ValueError

    def forward(self, x, beta):
        """
        Input:
            x: [B,G,D] latent
            beta: B
        Output:
            eta: [B,G,D]
        """
        B, G = x.shape[:2]
        if self.hparams.get("use_timestep_embedder"):
            time_emb = self.time_embedder(beta).unsqueeze(1)
        else:
            beta = beta.view(B, 1, 1)
            time_emb = torch.cat(
                [beta, torch.sin(beta), torch.cos(beta)], dim=-1
            )  # [B,1,3]

        ctx = time_emb
        x_emb = self.embedding(x)

        out = self.encoder(x_emb, ctx=ctx)

        if self.hparams.residual:
            out = out + x
        return out


class CondDiffNetwork(nn.Module):
    def __init__(self, input_dim, residual, **kwargs):
        """
        Transformer Encoder + Decoder.
        """
        super().__init__()
        self.input_dim = input_dim
        self.residual = residual
        self.__dict__.update(kwargs)
        self.hparams = DotMap(self.__dict__)

        self._build_model()

    def _build_model(self):
        self.act = F.leaky_relu
        if self.hparams.get("use_timestep_embedder"):
            self.time_embedder = TimestepEmbedder(self.hparams.timestep_embedder_dim)
            dim_ctx = self.hparams.timestep_embedder_dim
        else:
            dim_ctx = 3
        """
        Encoder part
        """
        enc_dim = self.hparams.context_embedding_dim
        self.context_embedding = nn.Linear(self.hparams.context_dim, enc_dim)
        if self.hparams.encoder_type == "transformer":
            self.encoder = TimeTransformerEncoder(
                enc_dim,
                3,
                num_heads=4,
                use_time=self.hparams.encoder_use_time,
                num_layers=self.hparams.enc_num_layers
                if self.hparams.get("enc_num_layers")
                else 3,
                last_fc=False,
            )

        elif self.hparams.encoder_type == "pointwise":
            self.encoder = TimePointWiseEncoder(
                enc_dim,
                dim_ctx=None,
                use_time=self.hparams.encoder_use_time,
                num_layers=self.hparams.enc_num_layers,
            )
        else:
            raise ValueError

        """
        Decoder part
        """
        dec_dim = self.hparams.embedding_dim
        input_dim = self.hparams.input_dim
        self.query_embedding = nn.Linear(self.hparams.input_dim, dec_dim)
        if self.hparams.decoder_type == "transformer_decoder":
            self.decoder = TimeTransformerDecoder(
                dec_dim,
                enc_dim,
                dim_ctx=dim_ctx,
                num_heads=4,
                last_fc=True,
                last_fc_dim_out=input_dim,
                num_layers=self.hparams.dec_num_layers
                if self.hparams.get("dec_num_layers")
                else 3,
            )
        elif self.hparams.decoder_type == "transformer_encoder":
            self.decoder = TimeTransformerEncoder(
                dec_dim,
                dim_ctx=enc_dim + dim_ctx,
                num_heads=4,
                last_fc=True,
                last_fc_dim_out=input_dim,
                num_layers=self.hparams.dec_num_layers
                if self.hparams.get("dec_num_layers")
                else 3,
            )
        else:
            raise ValueError

    def forward(self, x, beta, context):
        """
        Input:
            x: [B,G,D] intrinsic
            beta: B
            context: [B,G,D2] or [B, D2] condition
        Output:
            eta: [B,G,D]
        """
        # print(f"x: {x.shape} context: {context.shape} beta: {beta.shape}")
        B, G = x.shape[:2]

        if self.hparams.get("use_timestep_embedder"):
            time_emb = self.time_embedder(beta).unsqueeze(1)
        else:
            beta = beta.view(B, 1, 1)
            time_emb = torch.cat(
                [beta, torch.sin(beta), torch.cos(beta)], dim=-1
            )  # [B,1,3]
        ctx = time_emb
        """
        Encoding
        """
        cout = self.context_embedding(context)
        cout = self.encoder(cout, ctx=ctx if self.hparams.encoder_use_time else None)

        if cout.ndim == 2:
            cout = cout.unsqueeze(1).expand(-1, G, -1)

        """
        Decoding
        """
        out = self.query_embedding(x)
        if self.hparams.get("use_pos_encoding"):
            out = self.pos_encoding(out)

        if self.hparams.decoder_type == "transformer_encoder":
            try:
                ctx = ctx.expand(-1, G, -1)
                if cout.ndim == 2:
                    cout = cout.unsqueeze(1)
                cout = cout.expand(-1, G, -1)
                ctx = torch.cat([ctx, cout], -1)
            except Exception as e:
                print(e, G, ctx.shape, cout.shape)
            out = self.decoder(out, ctx=ctx)
        else:
            out = self.decoder(out, cout, ctx=ctx)

        # if hasattr(self, "last_fc"):
        #     out = self.last_fc(out)

        if self.hparams.residual:
            out = out + x
        return out
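A shape-level sketch of CondDiffNetwork under the phase-2 settings (512-d intrinsics conditioned on 16-d per-part Gaussians); dummy tensors and random weights, purely to show the expected input and output shapes.

# Sketch: forward pass of the phase-2 denoiser with dummy tensors
# (hyperparameters copied from checkpoints/phase2/hparams.yaml).
import torch
from salad.model_components.network import CondDiffNetwork

net = CondDiffNetwork(
    input_dim=512, residual=True,
    context_dim=16, context_embedding_dim=512, embedding_dim=512,
    encoder_use_time=False, encoder_type="transformer",
    decoder_type="transformer_encoder",
    enc_num_layers=6, dec_num_layers=6,
    use_timestep_embedder=True, timestep_embedder_dim=128,
)

x = torch.randn(2, 16, 512)       # noisy intrinsics: [B, G, D]
gaus = torch.randn(2, 16, 16)     # per-part Gaussian condition: [B, G, 16]
beta = torch.rand(2)              # one diffusion beta value per sample
eps = net(x, beta=beta, context=gaus)   # predicted noise, same shape as x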
salad/model_components/simple_module.py
ADDED
@@ -0,0 +1,125 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

from salad.model_components.transformer import TimeMLP


class TimePointwiseLayer(nn.Module):
    def __init__(
        self,
        dim_in,
        dim_ctx,
        mlp_ratio=2,
        act=F.leaky_relu,
        dropout=0.0,
        use_time=False,
    ):
        super().__init__()
        self.use_time = use_time
        self.act = act
        self.mlp1 = TimeMLP(
            dim_in, dim_in * mlp_ratio, dim_in, dim_ctx, use_time=use_time
        )
        self.norm1 = nn.LayerNorm(dim_in)

        self.mlp2 = TimeMLP(
            dim_in, dim_in * mlp_ratio, dim_in, dim_ctx, use_time=use_time
        )
        self.norm2 = nn.LayerNorm(dim_in)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, ctx=None):
        res = x
        x = self.mlp1(x, ctx=ctx)
        x = self.norm1(x + res)

        res = x
        x = self.mlp2(x, ctx=ctx)
        x = self.norm2(x + res)
        return x


class TimePointWiseEncoder(nn.Module):
    def __init__(
        self,
        dim_in,
        dim_ctx=None,
        mlp_ratio=2,
        act=F.leaky_relu,
        dropout=0.0,
        use_time=True,
        num_layers=6,
        last_fc=False,
        last_fc_dim_out=None,
    ):
        super().__init__()
        self.last_fc = last_fc
        if last_fc:
            self.fc = nn.Linear(dim_in, last_fc_dim_out)
        self.layers = nn.ModuleList(
            [
                TimePointwiseLayer(
                    dim_in,
                    dim_ctx=dim_ctx,
                    mlp_ratio=mlp_ratio,
                    act=act,
                    dropout=dropout,
                    use_time=use_time,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(self, x, ctx=None):
        for i, layer in enumerate(self.layers):
            x = layer(x, ctx=ctx)
        if self.last_fc:
            x = self.fc(x)
        return x


class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                  These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        freqs = torch.exp(
            -math.log(max_period)
            * torch.arange(start=0, end=half, dtype=torch.float32)
            / half
        ).to(device=t.device)
        args = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            embedding = torch.cat(
                [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
            )
        return embedding

    def forward(self, t):
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
        t_emb = self.mlp(t_freq)
        return t_emb
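A small sketch of TimestepEmbedder, which both diffusion networks use to turn the per-sample beta scalar into a conditioning vector (128-d in the shipped configs).

# Sketch: embed a batch of scalar betas/timesteps (hidden_size 128 as in the shipped hparams).
import torch
from salad.model_components.simple_module import TimestepEmbedder

embedder = TimestepEmbedder(hidden_size=128)
beta = torch.tensor([0.0001, 0.0105, 0.05])   # three example beta values
emb = embedder(beta)                          # [3, 128] conditioning vectors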
salad/model_components/transformer.py
ADDED
@@ -0,0 +1,308 @@
"""
Implementation of time conditioned Transformer.
"""
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class PositionalEncoding(nn.Module):
    def __init__(self, d_hid, n_position=200):
        super(PositionalEncoding, self).__init__()

        # Not a parameter
        self.register_buffer(
            "pos_table", self._get_sinusoid_encoding_table(n_position, d_hid)
        )

    def _get_sinusoid_encoding_table(self, n_position, d_hid):
        """Sinusoid position encoding table"""
        # TODO: make it with torch instead of numpy

        def get_position_angle_vec(position):
            return [
                position / np.power(10000, 2 * (hid_j // 2) / d_hid)
                for hid_j in range(d_hid)
            ]

        sinusoid_table = np.array(
            [get_position_angle_vec(pos_i) for pos_i in range(n_position)]
        )
        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

        return torch.FloatTensor(sinusoid_table).unsqueeze(0)

    def forward(self, x):
        """
        Input:
            x: [B,N,D]
        """
        return x + self.pos_table[:, : x.size(1)].clone().detach()


class ConcatSquashLinear(nn.Module):
    def __init__(self, dim_in, dim_out, dim_ctx):
        super(ConcatSquashLinear, self).__init__()
        self._layer = nn.Linear(dim_in, dim_out)
        self._hyper_bias = nn.Linear(dim_ctx, dim_out, bias=False)
        self._hyper_gate = nn.Linear(dim_ctx, dim_out)

    def forward(self, ctx, x):
        assert ctx.dim() == x.dim()
        gate = torch.sigmoid(self._hyper_gate(ctx))
        bias = self._hyper_bias(ctx)
        ret = self._layer(x) * gate + bias
        return ret


class TimeMLP(nn.Module):
    def __init__(
        self,
        dim_in,
        dim_h,
        dim_out,
        dim_ctx=None,
        act=F.relu,
        dropout=0.0,
        use_time=False,
    ):
        super().__init__()
        self.act = act
        self.use_time = use_time

        dim_h = int(dim_h)
        if use_time:
            self.fc1 = ConcatSquashLinear(dim_in, dim_h, dim_ctx)
            self.fc2 = ConcatSquashLinear(dim_h, dim_out, dim_ctx)
        else:
            self.fc1 = nn.Linear(dim_in, dim_h)
            self.fc2 = nn.Linear(dim_h, dim_out)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, ctx=None):
        if self.use_time:
            x = self.fc1(x=x, ctx=ctx)
        else:
            x = self.fc1(x)

        x = self.act(x)
        x = self.dropout(x)
        if self.use_time:
            x = self.fc2(x=x, ctx=ctx)
        else:
            x = self.fc2(x)

        x = self.dropout(x)
        return x


class MultiHeadAttention(nn.Module):
    def __init__(self, dim_self, dim_ref, num_heads, dropout=0.0):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim_self // num_heads
        self.scale = head_dim**-0.5
        self.to_queries = nn.Linear(dim_self, dim_self)
        self.to_keys_values = nn.Linear(dim_ref, dim_self * 2)
        self.project = nn.Linear(dim_self, dim_self)
        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        x,
        y=None,
        mask=None,
        alpha=None,
    ):
        y = y if y is not None else x
        b_a, n, c = x.shape
        b, m, d = y.shape
        # b n h dh
        queries = self.to_queries(x).reshape(
            b_a, n, self.num_heads, c // self.num_heads
        )
        # b m 2 h dh
        keys_values = self.to_keys_values(y).reshape(
            b, m, 2, self.num_heads, c // self.num_heads
        )
        keys, values = keys_values[:, :, 0], keys_values[:, :, 1]
        if alpha is not None:
            out, attention = self.forward_interpolation(
                queries, keys, values, alpha, mask
            )
        else:
            attention = torch.einsum("bnhd,bmhd->bnmh", queries, keys) * self.scale
            if mask is not None:
                if mask.dim() == 2:
                    mask = mask.unsqueeze(1)
                attention = attention.masked_fill(mask.unsqueeze(3), float("-inf"))
            attention = attention.softmax(dim=2)
            attention = self.dropout(attention)
            out = torch.einsum("bnmh,bmhd->bnhd", attention, values).reshape(b, n, c)
        out = self.project(out)
        return out, attention


class TimeTransformerEncoderLayer(nn.Module):
    def __init__(
        self,
        dim_self,
        dim_ctx=None,
        num_heads=1,
        mlp_ratio=2.0,
        act=F.leaky_relu,
        dropout=0.0,
        use_time=True,
    ):
        super().__init__()
        self.use_time = use_time
        self.act = act
        self.attn = MultiHeadAttention(dim_self, dim_self, num_heads, dropout)
        self.attn_norm = nn.LayerNorm(dim_self)

        mlp_ratio = int(mlp_ratio)
        self.mlp = TimeMLP(
            dim_self, dim_self * mlp_ratio, dim_self, dim_ctx, use_time=use_time
        )
        self.norm = nn.LayerNorm(dim_self)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, ctx=None):
        res = x
        x, attn = self.attn(x)
        x = self.attn_norm(x + res)

        res = x
        x = self.mlp(x, ctx=ctx)
        x = self.norm(x + res)

        return x, attn


class TimeTransformerDecoderLayer(TimeTransformerEncoderLayer):
    def __init__(
        self,
        dim_self,
        dim_ref,
        dim_ctx=None,
        num_heads=1,
        mlp_ratio=2,
        act=F.leaky_relu,
        dropout=0.0,
        use_time=True,
    ):
        super().__init__(
            dim_self=dim_self,
            dim_ctx=dim_ctx,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            act=act,
            dropout=dropout,
            use_time=use_time,
        )
        self.cross_attn = MultiHeadAttention(dim_self, dim_ref, num_heads, dropout)
        self.cross_attn_norm = nn.LayerNorm(dim_self)

    def forward(self, x, y, ctx=None):
        res = x
        x, attn = self.attn(x)
        x = self.attn_norm(x + res)

        res = x
        x, attn = self.cross_attn(x, y)
        x = self.cross_attn_norm(x + res)

        res = x
        x = self.mlp(x, ctx=ctx)
        x = self.norm(x + res)

        return x, attn


class TimeTransformerEncoder(nn.Module):
    def __init__(
        self,
        dim_self,
        dim_ctx=None,
        num_heads=1,
        mlp_ratio=2.0,
        act=F.leaky_relu,
        dropout=0.0,
        use_time=True,
        num_layers=3,
        last_fc=False,
        last_fc_dim_out=None,
    ):
        super().__init__()
        self.last_fc = last_fc
        if last_fc:
            self.fc = nn.Linear(dim_self, last_fc_dim_out)
        self.layers = nn.ModuleList(
            [
                TimeTransformerEncoderLayer(
                    dim_self,
                    dim_ctx=dim_ctx,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    act=act,
                    dropout=dropout,
                    use_time=use_time,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(self, x, ctx=None):
        for i, layer in enumerate(self.layers):
            x, attn = layer(x, ctx=ctx)

        if self.last_fc:
            x = self.fc(x)
        return x


class TimeTransformerDecoder(nn.Module):
    def __init__(
        self,
        dim_self,
        dim_ref,
        dim_ctx=None,
        num_heads=1,
        mlp_ratio=2.0,
        act=F.leaky_relu,
        dropout=0.0,
        use_time=True,
        num_layers=3,
        last_fc=True,
        last_fc_dim_out=None,
    ):
        super().__init__()
        self.last_fc = last_fc
        if last_fc:
            self.fc = nn.Linear(dim_self, last_fc_dim_out)

        self.layers = nn.ModuleList(
            [
                TimeTransformerDecoderLayer(
                    dim_self,
                    dim_ref,
                    dim_ctx,
                    num_heads,
                    mlp_ratio,
                    act,
                    dropout,
                    use_time,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(self, x, y, ctx=None):
        for i, layer in enumerate(self.layers):
            x, attn = layer(x, y=y, ctx=ctx)
        if self.last_fc:
            x = self.fc(x)

        return x
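The time conditioning throughout this file comes from ConcatSquashLinear: the context vector produces a sigmoid gate and a bias for an ordinary linear layer, which is what makes every TimeMLP time dependent. A tiny sketch with illustrative sizes:

# Sketch: ConcatSquashLinear applies a context-dependent gate and bias,
# out = Linear(x) * sigmoid(W_g ctx) + W_b ctx  (sizes here are illustrative).
import torch
from salad.model_components.transformer import ConcatSquashLinear

layer = ConcatSquashLinear(dim_in=512, dim_out=512, dim_ctx=128)
x = torch.randn(2, 16, 512)      # token features
ctx = torch.randn(2, 1, 128)     # time embedding, broadcast over the 16 tokens
y = layer(ctx=ctx, x=x)          # [2, 16, 512]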
salad/model_components/variance_schedule.py
ADDED
@@ -0,0 +1,57 @@
import torch
import numpy as np
from torch.nn import Linear, Module

class VarianceSchedule(Module):
    def __init__(self, num_steps, beta_1, beta_T, mode="linear"):
        super().__init__()
        # assert mode in ("linear",)
        self.num_steps = num_steps
        self.beta_1 = beta_1
        self.beta_T = beta_T
        self.mode = mode

        if mode == "linear":
            betas = torch.linspace(beta_1, beta_T, steps=num_steps)
        elif mode == "quad":
            betas = torch.linspace(beta_1 ** 0.5, beta_T ** 0.5, num_steps) ** 2
        elif mode == "cosine":
            cosine_s = 8e-3
            timesteps = torch.arange(num_steps + 1) / num_steps + cosine_s
            alphas = timesteps / (1 + cosine_s) * np.pi / 2
            alphas = torch.cos(alphas).pow(2)
            betas = 1 - alphas[1:] / alphas[:-1]
            betas = betas.clamp(max=0.999)

        betas = torch.cat([torch.zeros([1]), betas], dim=0)  # Padding

        alphas = 1 - betas
        log_alphas = torch.log(alphas)
        for i in range(1, log_alphas.size(0)):  # 1 to T
            log_alphas[i] += log_alphas[i - 1]
        alpha_bars = log_alphas.exp()

        sigmas_flex = torch.sqrt(betas)
        sigmas_inflex = torch.zeros_like(sigmas_flex)
        for i in range(1, sigmas_flex.size(0)):
            sigmas_inflex[i] = ((1 - alpha_bars[i - 1]) / (1 - alpha_bars[i])) * betas[
                i
            ]
        sigmas_inflex = torch.sqrt(sigmas_inflex)

        self.register_buffer("betas", betas)
        self.register_buffer("alphas", alphas)
        self.register_buffer("alpha_bars", alpha_bars)
        self.register_buffer("sigmas_flex", sigmas_flex)
        self.register_buffer("sigmas_inflex", sigmas_inflex)

    def uniform_sample_t(self, batch_size):
        ts = np.random.choice(np.arange(1, self.num_steps + 1), batch_size)
        return ts.tolist()

    def get_sigmas(self, t, flexibility):
        assert 0 <= flexibility and flexibility <= 1
        sigmas = self.sigmas_flex[t] * flexibility + self.sigmas_inflex[t] * (
            1 - flexibility
        )
        return sigmas
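A short sketch of the schedule as configured in all four hparams.yaml files above (1000 linear steps from 1e-4 to 0.05); alpha_bars is what BaseModel.add_noise below consumes.

# Sketch: the linear schedule shipped with all four configs (1000 steps, 1e-4 -> 0.05).
from salad.model_components.variance_schedule import VarianceSchedule

sched = VarianceSchedule(num_steps=1000, beta_1=1e-4, beta_T=0.05, mode="linear")
t = sched.uniform_sample_t(batch_size=4)    # e.g. [412, 77, 903, 250]
print(sched.betas[t], sched.alpha_bars[t])  # per-sample beta and cumulative alpha-bar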
salad/models/__init__.py
ADDED
File without changes
salad/models/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (163 Bytes).
salad/models/__pycache__/base_model.cpython-39.pyc
ADDED
Binary file (4.6 kB).
salad/models/__pycache__/language_phase1.cpython-39.pyc
ADDED
Binary file (8.83 kB).
salad/models/__pycache__/language_phase2.cpython-39.pyc
ADDED
Binary file (6.12 kB).
salad/models/__pycache__/phase1.cpython-39.pyc
ADDED
Binary file (2.12 kB).
salad/models/__pycache__/phase2.cpython-39.pyc
ADDED
Binary file (5.37 kB).
salad/models/base_model.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from salad.data.dataset import SALADDataset
from salad.utils.train_util import PolyDecayScheduler


class BaseModel(pl.LightningModule):
    def __init__(
        self,
        network,
        variance_schedule,
        **kwargs,
    ):
        super().__init__()
        self.save_hyperparameters(logger=False)
        self.net = network
        self.var_sched = variance_schedule

    def forward(self, x):
        return self.get_loss(x)

    def step(self, x, stage: str):
        loss = self(x)
        self.log(
            f"{stage}/loss",
            loss,
            on_step=stage == "train",
            prog_bar=True,
        )
        return loss

    def training_step(self, batch, batch_idx):
        x = batch
        return self.step(x, "train")

    def add_noise(self, x, t):
        """
        Input:
            x: [B,D] or [B,G,D]
            t: list of size B
        Output:
            x_noisy: [B,D]
            beta: [B]
            e_rand: [B,D]
        """
        alpha_bar = self.var_sched.alpha_bars[t]
        beta = self.var_sched.betas[t]

        c0 = torch.sqrt(alpha_bar).view(-1, 1)  # [B,1]
        c1 = torch.sqrt(1 - alpha_bar).view(-1, 1)

        e_rand = torch.randn_like(x)
        if e_rand.dim() == 3:
            c0 = c0.unsqueeze(1)
            c1 = c1.unsqueeze(1)

        x_noisy = c0 * x + c1 * e_rand

        return x_noisy, beta, e_rand

    def get_loss(
        self,
        x0,
        t=None,
        noisy_in=False,
        beta_in=None,
        e_rand_in=None,
    ):
        if x0.dim() == 2:
            B, D = x0.shape
        else:
            B, G, D = x0.shape
        if not noisy_in:
            if t is None:
                t = self.var_sched.uniform_sample_t(B)
            x_noisy, beta, e_rand = self.add_noise(x0, t)
        else:
            x_noisy = x0
            beta = beta_in
            e_rand = e_rand_in

        e_theta = self.net(x_noisy, beta=beta)
        loss = F.mse_loss(e_theta.flatten(), e_rand.flatten(), reduction="mean")
        return loss

    @torch.no_grad()
    def sample(
        self,
        batch_size=0,
        return_traj=False,
    ):
        raise NotImplementedError

    def validation_epoch_end(self, outputs):
        if self.hparams.no_run_validation:
            return
        if not self.trainer.sanity_checking:
            if (self.current_epoch) % self.hparams.validation_step == 0:
                self.validation()

    def _build_dataset(self, stage):
        if hasattr(self, f"data_{stage}"):
            return getattr(self, f"data_{stage}")
        if stage == "train":
            ds = SALADDataset(**self.hparams.dataset_kwargs)
        else:
            dataset_kwargs = self.hparams.dataset_kwargs.copy()
            dataset_kwargs["repeat"] = 1
            ds = SALADDataset(**dataset_kwargs)
        setattr(self, f"data_{stage}", ds)
        return ds

    def _build_dataloader(self, stage):
        try:
            ds = getattr(self, f"data_{stage}")
        except AttributeError:
            ds = self._build_dataset(stage)

        return torch.utils.data.DataLoader(
            ds,
            batch_size=self.hparams.batch_size,
            shuffle=stage == "train",
            drop_last=stage == "train",
            num_workers=4,
        )

    def train_dataloader(self):
        return self._build_dataloader("train")

    def val_dataloader(self):
        return self._build_dataloader("val")

    def test_dataloader(self):
        return self._build_dataloader("test")

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
        scheduler = PolyDecayScheduler(optimizer, self.hparams.lr, power=0.999)
        return [optimizer], [scheduler]

    # TODO: move get_wandb_logger to logutil.py
    def get_wandb_logger(self):
        for logger in self.logger:
            if isinstance(logger, pl.loggers.wandb.WandbLogger):
                return logger
        return None
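`BaseModel.add_noise` implements the closed-form DDPM forward process x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps, and `get_loss` trains `self.net` to regress the injected noise with an MSE. A minimal standalone sketch of the same objective, assuming a toy linear beta schedule and a zero placeholder in place of the network (neither is the repo's actual VarianceSchedule or denoiser):

# Standalone sketch of the epsilon-prediction objective used by BaseModel (toy values).
import torch
import torch.nn.functional as F

T = 1000
betas = torch.linspace(1e-4, 0.02, T + 1)        # toy linear schedule; index 0 unused
alpha_bars = torch.cumprod(1.0 - betas, dim=0)   # cumulative products of (1 - beta)

x0 = torch.randn(8, 16, 16)                      # e.g. 16 Gaussians x 16 params per shape
t = torch.randint(1, T + 1, (8,))                # one random timestep per sample
c0 = alpha_bars[t].sqrt().view(-1, 1, 1)
c1 = (1 - alpha_bars[t]).sqrt().view(-1, 1, 1)
eps = torch.randn_like(x0)
x_noisy = c0 * x0 + c1 * eps                     # same algebra as BaseModel.add_noise

e_theta = torch.zeros_like(x_noisy)              # placeholder for self.net(x_noisy, beta=...)
loss = F.mse_loss(e_theta.flatten(), eps.flatten())
print(loss.item())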
salad/models/language_phase1.py
ADDED
@@ -0,0 +1,340 @@
import numpy as np
import torch
import torch.nn.functional as F
from transformers import BertModel, BertTokenizer

from salad.model_components.lstm import LSTM
from salad.models.phase1 import Phase1Model
from salad.utils import imageutil, nputil, visutil
from salad.utils.spaghetti_util import (clip_eigenvalues,
                                        generate_zc_from_sj_gaus,
                                        get_mesh_from_spaghetti, load_mesher,
                                        load_spaghetti, project_eigenvectors)
from salad.utils.train_util import get_dropout_mask
from salad.data.dataset import LangSALADDataset


class LangPhase1Model(Phase1Model):
    def __init__(self, network, variance_schedule, **kwargs):
        super().__init__(network, variance_schedule, **kwargs)
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        if self.hparams.get("use_lstm"):
            self.bertmodel = LSTM(
                text_dim=768, embedding_dim=768, vocab_size=30522, padding_idx=0
            )
        else:
            self.bertmodel = BertModel.from_pretrained("bert-base-uncased")
        if self.hparams.get("text_encoder_freeze"):
            for p in self.bertmodel.parameters():
                p.requires_grad_(False)

    def forward(self, x, text):
        """
        Input:
            x: [B,G,16]
            text: list of length [B]
        """
        B, G = x.shape[:2]
        text = self.random_mask_text(text)
        lang_emb = self.text_to_embedding(text)
        return self.get_loss(x, lang_emb)

    def tokenizing(self, text):
        tokenized = self.tokenizer(
            text, return_tensors="pt", padding=True, truncation=True
        ).to(self.device)
        return tokenized

    def text_to_embedding(self, text):
        """
        text: list of length [B]
        return [B,768]
        """
        tokenized = self.tokenizing(text)
        if self.hparams.get("use_lstm"):
            lang_emb, _ = self.bertmodel(tokenized.input_ids)
        else:
            if self.hparams.get("text_encoder_return_seq"):
                lang_emb = self.bertmodel(**tokenized).last_hidden_state
            else:
                lang_emb = self.bertmodel(**tokenized).pooler_output
        if lang_emb.ndim == 2:
            lang_emb = lang_emb.unsqueeze(1)
        return lang_emb

    def random_mask_text(self, text):
        text = list(text)
        B = len(text)
        if self.hparams.get("classifier_free_guidance"):
            random_dp_mask = get_dropout_mask(
                B, self.hparams.conditioning_dropout_prob, self.device
            )
            for i in range(B):
                if random_dp_mask[i] == 0:
                    text[i] = ""
        return text

    def get_loss(self, x0, cond, t=None, noisy_in=False, beta_in=None, e_rand_in=None):
        B, G, D = x0.shape

        if not noisy_in:
            if t is None:
                t = self.var_sched.uniform_sample_t(B)
            x_noisy, beta, e_rand = self.add_noise(x0, t)
        else:
            x_noisy = x0
            beta = beta_in
            e_rand = e_rand_in

        e_theta = self.net(x_noisy, beta, cond)
        loss = F.mse_loss(e_theta.flatten(), e_rand.flatten(), reduction="mean")
        return loss

    def step(self, batch, stage: str):
        x, text = batch
        loss = self(x, text)
        self.log(f"{stage}/loss", loss, on_step=stage == "train", prog_bar=True)
        return loss

    @torch.no_grad()
    def sample(
        self,
        num_samples_or_text,
        return_traj=False,
        return_cond=False,
        classifier_free_guidance=True,
        free_guidance_weight=2.0,
    ):
        if isinstance(num_samples_or_text, str):
            num_samples_or_text = [num_samples_or_text]
        if isinstance(num_samples_or_text, int):
            batch_size = num_samples_or_text
            ds = self._build_dataset("val")
            texts = [ds[i][1] for i in range(batch_size)]
        elif isinstance(num_samples_or_text, list):
            texts = num_samples_or_text
            batch_size = len(num_samples_or_text)
        if self.hparams.get("use_zc"):
            x_T = torch.randn([batch_size, 16, 512]).to(self.device)
        else:
            x_T = torch.randn([batch_size, 16, 16]).to(self.device)
        G = x_T.shape[1]
        lang_emb = self.text_to_embedding(texts)

        if classifier_free_guidance:
            null_texts = ["" for _ in range(batch_size)]
            null_lang_emb = self.text_to_embedding(null_texts)

        traj = {self.var_sched.num_steps: x_T}
        for t in range(self.var_sched.num_steps, 0, -1):
            z = torch.randn_like(x_T) if t > 1 else torch.zeros_like(x_T)
            alpha = self.var_sched.alphas[t]
            alpha_bar = self.var_sched.alpha_bars[t]
            sigma = self.var_sched.get_sigmas(t, flexibility=0)

            c0 = 1.0 / torch.sqrt(alpha)
            c1 = (1 - alpha) / torch.sqrt(1 - alpha_bar)

            x_t = traj[t]

            beta = self.var_sched.betas[[t] * batch_size]
            e_theta = self.net(x_t, beta=beta, context=lang_emb)

            if classifier_free_guidance:
                null_e_theta = self.net(x_t, beta=beta, context=null_lang_emb)
                w = free_guidance_weight
                e_theta = (1 + w) * e_theta - w * null_e_theta

            x_next = c0 * (x_t - c1 * e_theta) + sigma * z
            traj[t - 1] = x_next.detach()

            traj[t] = traj[t].cpu()

            if not return_traj:
                del traj[t]

        if return_traj:
            if return_cond:
                return traj, lang_emb
            return traj
        else:
            if return_cond:
                return traj[0], lang_emb
            return traj[0]

    def sampling_gaussians(
        self,
        num_samples_or_text,
        classifier_free_guidance=True,
        free_guidance_weight=2.0,
        return_cond=False,
    ):
        gaus = self.sample(
            num_samples_or_text,
            classifier_free_guidance=classifier_free_guidance,
            free_guidance_weight=free_guidance_weight,
            return_cond=return_cond,
        )
        if isinstance(gaus, tuple):
            text = gaus[1]
            gaus = gaus[0]
        # gaus = reflect_and_concat_gmms(raw_gaus)
        if self.hparams.get("global_normalization"):
            if not hasattr(self, "data_val"):
                self._build_dataset("val")
            if self.hparams.get("global_normalization") == "partial":
                gaus = self.data_val.unnormalize_global_static(gaus, slice(12, None))
            elif self.hparams.get("global_normalization") == "all":
                gaus = self.data_val.unnormalize_global_static(gaus, slice(None))

        gaus = project_eigenvectors(clip_eigenvalues(gaus))
        if return_cond:
            return gaus, text
        return gaus

    def _build_dataset(self, stage):
        if hasattr(self, f"data_{stage}"):
            return getattr(self, f"data_{stage}")

        ds_class = LangSALADDataset
        if stage == "train":
            ds = ds_class(**self.hparams.dataset_kwargs)
        else:
            dataset_kwargs = self.hparams.dataset_kwargs.copy()
            dataset_kwargs["repeat"] = 1
            ds = ds_class(**dataset_kwargs)
        setattr(self, f"data_{stage}", ds)
        return ds

    def validation_zc(self):
        vis_num_shapes = 4
        vis_zcs = []
        vis_texts = []
        ds = self._build_dataset("val")
        for i in [0, 1, 2, 3]:
            zcs, text = ds[i]
            vis_zcs.append(zcs)
            vis_texts.append(text)
        vis_zcs = torch.stack(vis_zcs, 0)
        ldm_zcs = self.sample(vis_texts)

        if not hasattr(self, "spaghetti"):
            self.spaghetti = load_spaghetti(self.device, self.hparams.spaghetti_tag)
        spaghetti = self.spaghetti

        if not hasattr(self, "mesher"):
            self.mesher = load_mesher(self.device)
        mesher = self.mesher

        wandb_logger = self.get_wandb_logger()
        images = []
        for i in range(vis_num_shapes):
            try:
                v, f = get_mesh_from_spaghetti(spaghetti, mesher, vis_zcs[i], res=128)
                gt_img = visutil.render_mesh(v, f, resolution=(256, 256))
            except Exception:
                pass
            try:
                v, f = get_mesh_from_spaghetti(spaghetti, mesher, ldm_zcs[i], res=128)
                pred_img = visutil.render_mesh(v, f, resolution=(256, 256))
            except Exception:
                pass

            img = imageutil.merge_images([gt_img, pred_img])
            img = imageutil.draw_text(
                img,
                f"Left: GT | Right: Pred \n{vis_texts[i]}",
                font_size=14,
                max_seq_length=50,
            )
            images.append([img])

        images = imageutil.merge_images(images)
        wandb_logger.log_image("vis", [images])

    def validation(self):
        if self.hparams.get("use_zc"):
            self.validation_zc()
            return

        vis_num_shapes = 4
        vis_gaus = []
        vis_texts = []
        ds = self._build_dataset("val")
        vis_indices = [18453, 13036, 13204, 48244]
        for i in vis_indices:
            gaus, text = ds[i]
            vis_gaus.append(gaus)
            vis_texts.append(text)

        vis_gaus = torch.stack(vis_gaus, 0)
        if self.hparams.get("global_normalization"):
            if self.hparams.get("global_normalization") == "partial":
                vis_gaus = self.data_val.unnormalize_global_static(
                    vis_gaus, slice(12, None)
                )
            elif self.hparams.get("global_normalization") == "all":
                vis_gaus = self.data_val.unnormalize_global_static(vis_gaus, slice(None))

        # vis_gaus = reflect_and_concat_gmms(vis_gaus)
        pred_gaus = self.sampling_gaussians(vis_texts)

        if not hasattr(self, "spaghetti"):
            self.spaghetti = load_spaghetti(self.device, self.hparams.spaghetti_tag)
        spaghetti = self.spaghetti

        if not hasattr(self, "mesher"):
            self.mesher = load_mesher(self.device)
        mesher = self.mesher

        """ get intrinsics """
        # TODO: change the ckpt path (dev-only absolute path).
        # NOTE: SpaghettiConditionSALDM (the phase-2 model class) is not imported in this file.
        if not hasattr(self, "phase2_model"):
            phase2_ckpt = "/home/juil/pvddir/results/phase2/augment_final_0214/0214_202607/checkpoints/epoch=4999-val_loss=0.0000.ckpt"
            self.phase2_model = SpaghettiConditionSALDM.load_from_checkpoint(
                phase2_ckpt, strict=False
            ).to(self.device)
            self.phase2_model.eval()
            for p in self.phase2_model.parameters():
                p.requires_grad_(False)

        phase2_model = self.phase2_model

        gt_sj = phase2_model.sample(vis_gaus)
        pred_sj = phase2_model.sample(pred_gaus)

        gt_zcs = generate_zc_from_sj_gaus(spaghetti, gt_sj, vis_gaus)
        pred_zcs = generate_zc_from_sj_gaus(spaghetti, pred_sj, pred_gaus)

        wandb_logger = self.get_wandb_logger()
        images = []
        for i in range(vis_num_shapes):
            gt_img = visutil.render_gaussians(vis_gaus[i], resolution=(256, 256))
            try:
                v, f = get_mesh_from_spaghetti(spaghetti, mesher, gt_zcs[i], res=128)
                gt_mesh_img = visutil.render_mesh(v, f, resolution=(256, 256))
                gt_img = imageutil.merge_images([gt_img, gt_mesh_img])
            except Exception:
                pass

            pred_img = visutil.render_gaussians(pred_gaus[i], resolution=(256, 256))
            try:
                v, f = get_mesh_from_spaghetti(spaghetti, mesher, pred_zcs[i], res=128)
                pred_mesh_img = visutil.render_mesh(v, f, resolution=(256, 256))
                pred_img = imageutil.merge_images([pred_img, pred_mesh_img])
            except Exception:
                pass

            img = imageutil.merge_images([gt_img, pred_img])
            img = imageutil.draw_text(
                img,
                f"Left: GT | Right: Pred \n{vis_texts[i]}",
                font_size=14,
                max_seq_length=50,
            )
            images.append([img])

        images = imageutil.merge_images(images)
        wandb_logger.log_image("vis", [images])
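`LangPhase1Model.sample` applies classifier-free guidance: at every denoising step the network is evaluated twice, once with the BERT text embedding and once with the embedding of the empty string, and the two predictions are extrapolated with weight `w`. The combination step in isolation, with random tensors standing in for the two network outputs (placeholders, not the repo's denoiser):

# Guidance combination as used inside LangPhase1Model.sample (placeholder tensors).
import torch

w = 2.0                            # matches the free_guidance_weight default above
e_cond = torch.randn(4, 16, 16)    # stands in for self.net(x_t, beta, context=lang_emb)
e_uncond = torch.randn(4, 16, 16)  # stands in for self.net(x_t, beta, context=null_lang_emb)

e_guided = (1 + w) * e_cond - w * e_uncond
print(e_guided.shape)              # torch.Size([4, 16, 16])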
salad/models/language_phase2.py
ADDED
@@ -0,0 +1,201 @@
import numpy as np
import torch
import torch.nn.functional as F
from transformers import BertModel, BertTokenizer

from salad.model_components.lstm import LSTM
from salad.models.language_phase1 import LangPhase1Model
from salad.utils import imageutil, nputil, visutil
from salad.utils.spaghetti_util import (generate_zc_from_sj_gaus,
                                        get_mesh_from_spaghetti, load_mesher,
                                        load_spaghetti)
from salad.utils.train_util import get_dropout_mask


class LangPhase2Model(LangPhase1Model):
    def __init__(self, network, variance_schedule, **kwargs):
        super().__init__(network, variance_schedule, **kwargs)

    def random_mask_gaus_text(self, gaus, text):
        if self.hparams.get("classifier_free_guidance"):
            text = list(text)
            B = gaus.shape[0]
            random_dp_mask = get_dropout_mask(
                B, self.hparams.conditioning_dropout_prob, self.device
            )
            gaus = gaus * random_dp_mask.unsqueeze(1).unsqueeze(2)
            for i in range(B):
                if random_dp_mask[i] == 0:
                    text[i] = ""

        return gaus, text

    def forward(self, x, gaus, text):
        """
        Input:
            x: [B,G,512]
            gaus: [B,G,16]
            text: list of [B]
        """
        B, G = x.shape[:2]
        gaus, text = self.random_mask_gaus_text(gaus, text)
        lang_emb = self.text_to_embedding(text)
        cond = self.cond_from_gaus_lang_f(gaus, lang_emb)

        return self.get_loss(x, cond)

    def step(self, batch, stage):
        x, gaus, text = batch
        loss = self(x, gaus, text)
        self.log(f"{stage}/loss", loss, on_step=stage == "train", prog_bar=True)
        return loss

    def get_loss(self, x0, cond, t=None, noisy_in=False, beta_in=None, e_rand_in=None):
        B, G, D = x0.shape
        if not noisy_in:
            if t is None:
                t = self.var_sched.uniform_sample_t(B)
            x_noisy, beta, e_rand = self.add_noise(x0, t)
        else:
            x_noisy = x0
            beta = beta_in
            e_rand = e_rand_in
        e_theta = self.net(x_noisy, beta, cond)
        loss = F.mse_loss(e_theta.flatten(), e_rand.flatten(), reduction="mean")
        return loss

    def cond_from_gaus_lang_f(self, gaus, lang_f):
        gaus = nputil.np2th(gaus).to(self.device)
        G = gaus.shape[1]
        lang_f = nputil.np2th(lang_f).to(self.device)
        assert gaus.ndim == 3
        if lang_f.ndim == 2:
            lang_f = lang_f.unsqueeze(1)
        lang_f = lang_f.expand(-1, G, -1)
        return torch.cat([gaus, lang_f], -1)

    def generate_null_cond(self, B, G):
        text = ["" for _ in range(B)]
        lang_emb = self.text_to_embedding(text)
        gaus = torch.zeros(B, G, 16, dtype=torch.float, device=self.device)
        return self.cond_from_gaus_lang_f(gaus, lang_emb)

    @torch.no_grad()
    def sample(
        self,
        num_samples_or_cond,
        return_traj=False,
        return_cond=False,
        classifier_free_guidance=False,
        free_guidance_weight=0.7,
    ):

        if isinstance(num_samples_or_cond, int):
            batch_size = num_samples_or_cond
            ds = self._build_dataset("val")
            batch_gaus = []
            batch_text = []
            for i in range(batch_size):
                _, gaus, text = ds[i]
                batch_gaus.append(gaus)
                batch_text.append(text)

            batch_gaus = torch.stack(batch_gaus, 0)
            lang_emb = self.text_to_embedding(batch_text)
            cond = self.cond_from_gaus_lang_f(batch_gaus, lang_emb).to(self.device)

        elif isinstance(num_samples_or_cond, np.ndarray) or isinstance(
            num_samples_or_cond, torch.Tensor
        ):
            cond = nputil.np2th(num_samples_or_cond).to(self.device)
            batch_size = len(cond)

        G = cond.shape[1]
        if classifier_free_guidance:
            null_cond = self.generate_null_cond(batch_size, G)

        x_T = torch.randn([batch_size, 16, 512]).to(self.device)
        traj = {self.var_sched.num_steps: x_T}
        for t in range(self.var_sched.num_steps, 0, -1):
            z = torch.randn_like(x_T) if t > 1 else torch.zeros_like(x_T)
            alpha = self.var_sched.alphas[t]
            alpha_bar = self.var_sched.alpha_bars[t]
            sigma = self.var_sched.get_sigmas(t, flexibility=0)

            c0 = 1.0 / torch.sqrt(alpha)
            c1 = (1 - alpha) / torch.sqrt(1 - alpha_bar)

            x_t = traj[t]

            beta = self.var_sched.betas[[t] * batch_size]
            e_theta = self.net(x_t, beta=beta, context=cond)

            if classifier_free_guidance:
                null_e_theta = self.net(x_t, beta=beta, context=null_cond)
                w = free_guidance_weight
                e_theta = (1 + w) * e_theta - w * null_e_theta

            x_next = c0 * (x_t - c1 * e_theta) + sigma * z
            traj[t - 1] = x_next.detach()

            traj[t] = traj[t].cpu()

            if not return_traj:
                del traj[t]

        if return_traj:
            if return_cond:
                return traj, cond
            return traj
        else:
            if return_cond:
                return traj[0], cond
            return traj[0]

    def validation(self):
        vis_num_shapes = 4
        vis_gt_sj = []
        vis_gaus = []
        vis_texts = []
        ds = self._build_dataset("val")
        vis_indices = [18453, 13036, 13204, 48244]
        for i in vis_indices:
            sj, gaus, text = ds[i]
            vis_gt_sj.append(sj)
            vis_gaus.append(gaus)
            vis_texts.append(text)

        vis_gt_sj = torch.stack(vis_gt_sj, 0)
        vis_gaus = torch.stack(vis_gaus, 0).to(self.device)
        vis_lang_f = self.text_to_embedding(vis_texts)
        vis_cond = self.cond_from_gaus_lang_f(vis_gaus, vis_lang_f)
        pred_sj = self.sample(vis_cond)

        if not hasattr(self, "spaghetti"):
            self.spaghetti = load_spaghetti(self.device, self.hparams.spaghetti_tag)
        spaghetti = self.spaghetti

        if not hasattr(self, "mesher"):
            self.mesher = load_mesher(self.device)
        mesher = self.mesher

        gt_zcs = generate_zc_from_sj_gaus(spaghetti, vis_gt_sj, vis_gaus)
        pred_zcs = generate_zc_from_sj_gaus(spaghetti, pred_sj, vis_gaus)

        wandb_logger = self.get_wandb_logger()
        for i in range(vis_num_shapes):
            gaus_img = visutil.render_gaussians(vis_gaus[i], resolution=(256, 256))
            vert, face = get_mesh_from_spaghetti(spaghetti, mesher, gt_zcs[i], res=128)
            gt_mesh_img = visutil.render_mesh(vert, face, resolution=(256, 256))
            img = [gaus_img, gt_mesh_img]
            try:
                vert, face = get_mesh_from_spaghetti(spaghetti, mesher, pred_zcs[i])
                pred_mesh_img = visutil.render_mesh(vert, face, resolution=(256, 256))
                img.append(pred_mesh_img)
            except Exception as e:
                print(e)
            img = imageutil.merge_images(img)
            img = imageutil.draw_text(
                img, vis_texts[i], font_size=14, max_seq_length=50
            )
            wandb_logger.log_image("vis", [img])
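`cond_from_gaus_lang_f` builds the conditioning tensor for this model by broadcasting the pooled sentence embedding across the 16 parts and concatenating it with the per-part Gaussian parameters, yielding a [B, G, 16 + 768] tensor. A shape-only sketch with dummy tensors:

# Shape-only sketch of LangPhase2Model.cond_from_gaus_lang_f (dummy inputs).
import torch

B, G = 2, 16
gaus = torch.randn(B, G, 16)     # per-part Gaussian parameters
lang_f = torch.randn(B, 768)     # pooled BERT embedding, one vector per shape

lang_f = lang_f.unsqueeze(1).expand(-1, G, -1)  # [B, G, 768], repeated for every part
cond = torch.cat([gaus, lang_f], dim=-1)        # [B, G, 784]
print(cond.shape)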
salad/models/phase1.py
ADDED
@@ -0,0 +1,65 @@
import torch
import numpy as np
from salad.models.base_model import BaseModel
from salad.utils import nputil, thutil
from salad.utils.spaghetti_util import clip_eigenvalues, project_eigenvectors


class Phase1Model(BaseModel):
    def __init__(self, network, variance_schedule, **kwargs):
        super().__init__(network, variance_schedule, **kwargs)

    @torch.no_grad()
    def sample(
        self,
        batch_size=0,
        return_traj=False,
    ):
        x_T = torch.randn([batch_size, 16, 16]).to(self.device)

        traj = {self.var_sched.num_steps: x_T}
        for t in range(self.var_sched.num_steps, 0, -1):
            z = torch.randn_like(x_T) if t > 1 else torch.zeros_like(x_T)
            alpha = self.var_sched.alphas[t]
            alpha_bar = self.var_sched.alpha_bars[t]
            sigma = self.var_sched.get_sigmas(t, flexibility=0)

            c0 = 1.0 / torch.sqrt(alpha)
            c1 = (1 - alpha) / torch.sqrt(1 - alpha_bar)

            x_t = traj[t]

            beta = self.var_sched.betas[[t] * batch_size]
            e_theta = self.net(x_t, beta=beta)
            # print(e_theta.norm(-1).mean())

            x_next = c0 * (x_t - c1 * e_theta) + sigma * z
            traj[t - 1] = x_next.detach()

            traj[t] = traj[t].cpu()

            if not return_traj:
                del traj[t]
        if return_traj:
            return traj
        else:
            return traj[0]

    def sampling_gaussians(self, num_shapes):
        """
        Return:
            ldm_gaus: np.ndarray
            gt_gaus: np.ndarray
        """
        ldm_gaus = self.sample(num_shapes)

        if self.hparams.get("global_normalization"):
            if not hasattr(self, "data_val"):
                self._build_dataset("val")
            if self.hparams.get("global_normalization") == "partial":
                ldm_gaus = self.data_val.unnormalize_global_static(ldm_gaus, slice(12, None))
            elif self.hparams.get("global_normalization") == "all":
                ldm_gaus = self.data_val.unnormalize_global_static(ldm_gaus, slice(None))

        ldm_gaus = clip_eigenvalues(ldm_gaus)
        ldm_gaus = project_eigenvectors(ldm_gaus)
        return ldm_gaus
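The loop in `Phase1Model.sample` is the standard DDPM ancestral update x_{t-1} = (x_t - (1 - alpha_t) / sqrt(1 - alpha_bar_t) * eps_theta) / sqrt(alpha_t) + sigma_t * z. One step of that update in isolation, using a toy schedule and a zero placeholder for the noise prediction (in the repo, sigma comes from VarianceSchedule.get_sigmas):

# One isolated reverse-diffusion step, mirroring the loop body of Phase1Model.sample.
import torch

T = 1000
betas = torch.linspace(1e-4, 0.02, T + 1)   # toy schedule; index 0 unused
alphas = 1.0 - betas
alpha_bars = torch.cumprod(alphas, dim=0)

t = 500
x_t = torch.randn(4, 16, 16)
z = torch.randn_like(x_t)                   # sampled only while t > 1
e_theta = torch.zeros_like(x_t)             # placeholder for self.net(x_t, beta=...)

c0 = 1.0 / torch.sqrt(alphas[t])
c1 = (1 - alphas[t]) / torch.sqrt(1 - alpha_bars[t])
sigma = betas[t].sqrt()                     # toy choice; the repo derives it from get_sigmas()
x_prev = c0 * (x_t - c1 * e_theta) + sigma * z
print(x_prev.shape)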
salad/models/phase2.py
ADDED
@@ -0,0 +1,183 @@
from typing import Union

import numpy as np
import torch
import torch.nn.functional as F

from salad.models.base_model import BaseModel
from salad.utils import imageutil, nputil, sysutil, thutil, visutil
from salad.utils.spaghetti_util import (clip_eigenvalues,
                                        generate_zc_from_sj_gaus,
                                        get_mesh_from_spaghetti, load_mesher,
                                        load_spaghetti, project_eigenvectors)


class Phase2Model(BaseModel):
    def __init__(self, network, variance_schedule, **kwargs):
        super().__init__(network, variance_schedule, **kwargs)

    def forward(self, x, cond):
        return self.get_loss(x, cond)

    def step(self, batch, stage: str):
        x, cond = batch
        loss = self(x, cond)
        self.log(f"{stage}/loss", loss, on_step=stage == "train", prog_bar=True)
        return loss

    def get_loss(self, x0, cond, t=None, noisy_in=False, beta_in=None, e_rand_in=None):
        B, G, D = x0.shape

        if not noisy_in:
            if t is None:
                t = self.var_sched.uniform_sample_t(B)
            x_noisy, beta, e_rand = self.add_noise(x0, t)
        else:
            x_noisy = x0
            beta = beta_in
            e_rand = e_rand_in

        e_theta = self.net(x_noisy, beta, cond)
        loss = F.mse_loss(e_theta.flatten(), e_rand.flatten(), reduction="mean")
        return loss

    @torch.no_grad()
    def sample(
        self,
        num_samples_or_gaus: Union[torch.Tensor, np.ndarray, int],
        return_traj=False,
        classifier_free_guidance=None,
        free_guidance_weight=-0.7,
        augment_condition_in_test=False,
        return_cond=False,
    ):
        if isinstance(num_samples_or_gaus, int):
            batch_size = num_samples_or_gaus
            ds = self._build_dataset("val")
            cond = torch.stack([ds[i][1] for i in range(batch_size)], 0)

        elif isinstance(num_samples_or_gaus, np.ndarray) or isinstance(
            num_samples_or_gaus, torch.Tensor
        ):
            cond = nputil.np2th(num_samples_or_gaus)
            if cond.dim() == 2:
                cond = cond[None]
            batch_size = len(cond)
        else:
            raise ValueError(
                "'num_samples_or_gaus' should be int, torch.Tensor or np.ndarray."
            )

        x_T = torch.randn([batch_size, 16, 512]).to(self.device)
        cond = cond.to(self.device)

        traj = {self.var_sched.num_steps: x_T}
        for t in range(self.var_sched.num_steps, 0, -1):
            z = torch.randn_like(x_T) if t > 1 else torch.zeros_like(x_T)
            alpha = self.var_sched.alphas[t]
            alpha_bar = self.var_sched.alpha_bars[t]
            sigma = self.var_sched.get_sigmas(t, flexibility=0)

            c0 = 1.0 / torch.sqrt(alpha)
            c1 = (1 - alpha) / torch.sqrt(1 - alpha_bar)

            x_t = traj[t]

            beta = self.var_sched.betas[[t] * batch_size]
            e_theta = self.net(x_t, beta=beta, context=cond)

            x_next = c0 * (x_t - c1 * e_theta) + sigma * z
            traj[t - 1] = x_next.detach()

            traj[t] = traj[t].cpu()

            if not return_traj:
                del traj[t]

        if return_traj:
            if return_cond:
                return traj, cond
            return traj
        else:
            if return_cond:
                return traj[0], cond
            return traj[0]

    def validation(self):
        latent_ds = self._build_dataset("val")
        vis_num_shapes = 3
        num_variations = 3
        sysutil.clean_gpu()

        if not hasattr(self, "spaghetti"):
            spaghetti = load_spaghetti(
                self.device,
                self.hparams.spaghetti_tag
                if self.hparams.get("spaghetti_tag")
                else "chairs_large",
            )
            self.spaghetti = spaghetti
        else:
            spaghetti = self.spaghetti

        if not hasattr(self, "mesher"):
            mesher = load_mesher(self.device)
            self.mesher = mesher
        else:
            mesher = self.mesher

        """======== Sampling ========"""
        gt_zs = []
        gt_gaus = []

        gt_zs, gt_gaus = zip(*[latent_ds[i + 3] for i in range(vis_num_shapes)])
        gt_zs, gt_gaus = list(map(lambda x: torch.stack(x), [gt_zs, gt_gaus]))
        if self.hparams.get("sj_global_normalization"):
            gt_zs = thutil.th2np(gt_zs)
            gt_zs = latent_ds.unnormalize_sj_global_static(gt_zs)
            gt_zs = nputil.np2th(gt_zs).to(self.device)

        gt_gaus_repeated = gt_gaus.repeat_interleave(num_variations, 0)
        clean_ldm_zs, clean_gaus = self.sample(gt_gaus_repeated, return_cond=True)
        clean_gaus = project_eigenvectors(clip_eigenvalues(clean_gaus))
        clean_zcs = generate_zc_from_sj_gaus(spaghetti, clean_ldm_zs, clean_gaus)
        gt_zcs = generate_zc_from_sj_gaus(spaghetti, gt_zs, gt_gaus)
        sysutil.clean_gpu()

        """=========================="""

        """ Spaghetti Decoding """
        wandb_logger = self.get_wandb_logger()
        resolution = (256, 256)
        for i in range(vis_num_shapes):
            img_per_shape = []
            gaus_img = visutil.render_gaussians(gt_gaus[i], resolution=resolution)
            vert, face = get_mesh_from_spaghetti(spaghetti, mesher, gt_zcs[i], res=128)
            gt_mesh_img = visutil.render_mesh(vert, face, resolution=resolution)
            gt_img = imageutil.merge_images([gaus_img, gt_mesh_img])
            gt_img = imageutil.draw_text(gt_img, "GT", font_size=24)
            img_per_shape.append(gt_img)
            for j in range(num_variations):
                try:
                    gaus_img = visutil.render_gaussians(
                        clean_gaus[i * num_variations + j], resolution=resolution
                    )
                    vert, face = get_mesh_from_spaghetti(
                        spaghetti, mesher, clean_zcs[i * num_variations + j], res=128
                    )
                    mesh_img = visutil.render_mesh(vert, face, resolution=resolution)
                    pred_img = imageutil.merge_images([gaus_img, mesh_img])
                    pred_img = imageutil.draw_text(
                        pred_img, f"{j}-th clean gaus", font_size=24
                    )
                    img_per_shape.append(pred_img)
                except Exception as e:
                    print(e)

            try:
                image = imageutil.merge_images(img_per_shape)
                wandb_logger.log_image("visualization", [image])
            except Exception as e:
                print(e)

        """ ================== """
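Together the two stages form a cascade: `Phase1Model` samples the 16 per-part Gaussians (extrinsics) and `Phase2Model` samples the 512-d per-part intrinsics conditioned on them, which SPAGHETTI then decodes into a mesh. A hypothetical end-to-end sketch; the checkpoint paths are illustrative only, and loading relies on the hyperparameters stored by `save_hyperparameters`:

# Hypothetical two-phase sampling sketch (checkpoint paths are illustrative only).
import torch
from salad.models.phase1 import Phase1Model
from salad.models.phase2 import Phase2Model

device = "cuda" if torch.cuda.is_available() else "cpu"
phase1 = Phase1Model.load_from_checkpoint("path/to/phase1.ckpt").to(device).eval()
phase2 = Phase2Model.load_from_checkpoint("path/to/phase2.ckpt").to(device).eval()

with torch.no_grad():
    gaus = phase1.sampling_gaussians(4)   # [4, 16, 16] part extrinsics
    intrinsics = phase2.sample(gaus)      # [4, 16, 512] per-part latents
print(gaus.shape, intrinsics.shape)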
salad/spaghetti/.gitignore
ADDED
@@ -0,0 +1,9 @@
/assets/*
!/assets/readme_resources/
!/assets/ui_resources/
!/assets/splits/
!/assets/mesh/
*.vtk
.idea/
__pycache__/
**_ig_**