diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..cc2d809b1415492e465f617a3d6f2ae74de53006 Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..496ee2ca6a2f08396a4076fe43dedf3dc0da8b6d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.DS_Store \ No newline at end of file diff --git a/CCD/.DS_Store b/CCD/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..2ece3d4715e377dac5ba70492f5d585572287de6 Binary files /dev/null and b/CCD/.DS_Store differ diff --git a/CCD/checkpoints/Mean_Std.npy b/CCD/checkpoints/Mean_Std.npy new file mode 100644 index 0000000000000000000000000000000000000000..c0ff08a8294fe5b01a97e3aeaaa091b7a7b6e849 --- /dev/null +++ b/CCD/checkpoints/Mean_Std.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:332dacb3ed6e6862c11b0b5f33a469ad3b715e15b7100219408188726ebb3ce7 +size 502 diff --git a/CCD/checkpoints/latest.pth b/CCD/checkpoints/latest.pth new file mode 100644 index 0000000000000000000000000000000000000000..8aa6710e99e86d82ab3ee1d2ae9bc3d397c6be7f --- /dev/null +++ b/CCD/checkpoints/latest.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd4c2981c5ba8807797c38c649da0111a0bc9fb4846f9d24fe8ca459a2fefc0a +size 18868379 diff --git a/CCD/src/.DS_Store b/CCD/src/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..8faa6e10e917ec4dd1fc626a43d15a96f5b17d5c Binary files /dev/null and b/CCD/src/.DS_Store differ diff --git a/CCD/src/LSTM.py b/CCD/src/LSTM.py new file mode 100644 index 0000000000000000000000000000000000000000..cc265c02ba07a875891a864ac60b3659f9a49e57 --- /dev/null +++ b/CCD/src/LSTM.py @@ -0,0 +1,202 @@ +''' +This script does conditional image generation on MNIST, using a diffusion model + +This code is modified from, +https://github.com/cloneofsimo/minDiffusion + +Diffusion model is based on DDPM, +https://arxiv.org/abs/2006.11239 + +The conditioning idea is taken from 'Classifier-Free Diffusion Guidance', +https://arxiv.org/abs/2207.12598 + +This technique also features in ImageGen 'Photorealistic Text-to-Image Diffusion Modelswith Deep Language Understanding', +https://arxiv.org/abs/2205.11487 + +''' + +from typing import Dict, Tuple +from tqdm import tqdm +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader +from torchvision import models, transforms +from torchvision.datasets import MNIST +from torchvision.utils import save_image, make_grid +import matplotlib.pyplot as plt +from matplotlib.animation import FuncAnimation, PillowWriter +import numpy as np +import os +import clip + +class LSTM(nn.Module): + def __init__(self, input_size, hidden_size, output_size, embed_size=512, n_layer=1, bidirectional=False): + super(LSTM, self).__init__() + self.n_layer = n_layer + self.bidirectional = bidirectional + self.hidden_size = hidden_size + + self.num_directions = 2 if bidirectional else 1 + self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=n_layer, batch_first=True, bidirectional=bidirectional) + + self.encoder = nn.Sequential(nn.Linear(embed_size, hidden_size)) + + self.decoder = nn.Sequential(nn.Linear(hidden_size, output_size)) + + self.embed = nn.Sequential(nn.Linear(embed_size, embed_size)) + + + def initHidden(self, batch_size=1): + h0 = torch.zeros(self.n_layer, batch_size, self.hidden_size, requires_grad=False).cuda() + c0 = 
torch.zeros(self.n_layer, batch_size, self.hidden_size, requires_grad=False).cuda() + return (h0, c0) + + def forward(self, input, embed): + bs, length, n_feat = input.shape + + embed = self.embed(embed).unsqueeze(1).repeat(1, length, 1) + + hidden = self.initHidden(bs) + output, hidden = self.lstm(embed, hidden) + + return self.decoder(output) + +import torch.utils.data as data +class camdataset(data.Dataset): + def __init__(self, data, label): + self.data = data + self.label = label + + def __getitem__(self, index): + text = np.random.choice(self.label[index], np.random.randint(1, len(self.label[index])+1), replace=False) + + d = self.data[index] + d = np.concatenate((d, d[-1:].repeat(300-len(d), 0)), 0) + + return np.array(d, dtype="float32"), " ".join(text) + + def __len__(self): + return len(self.data) + + +def train(): + data = np.load("data.npy", allow_pickle=True)[()] + + d = np.concatenate(data["cam"], 0) + Mean, Std = np.mean(d, 0), np.std(d, 0) + + for i in range(len(data["cam"])): + data["cam"][i] = (data["cam"][i] - Mean[None, :]) / (Std[None, :] + 1e-8) + + # hardcoding these here + n_epoch = 1000 + batch_size = 128 + device = "cuda:0" + n_feature = 5 + lrate = 1e-4 + save_model = True + save_dir = './result/' + if not os.path.exists(save_dir): + os.mkdir(save_dir) + + criterion = torch.nn.MSELoss() + trans = LSTM(input_size=n_feature, hidden_size=512, output_size=n_feature) + trans.to(device) + + optim = torch.optim.Adam(trans.parameters(), lr=lrate) + + dataloader = DataLoader(camdataset(data['cam'], data['info']), batch_size=batch_size, shuffle=True, num_workers=5) + + if not os.path.exists("result"): + os.mkdir("result") + + device = "cuda" if torch.cuda.is_available() else "cpu" + model, preprocess = clip.load("ViT-B/32", device=device) + + for ep in range(n_epoch): + print(f'epoch {ep}') + trans.train() + + # linear lrate decay + optim.param_groups[0]['lr'] = lrate * (1 - ep / n_epoch) + + pbar = tqdm(dataloader) + loss_ema = None + for x, c in pbar: + optim.zero_grad() + x = x.to(device) + with torch.no_grad(): + c = clip.tokenize(c, truncate=True).to(device) + c = model.encode_text(c).float().detach() + + loss = criterion(trans(x, c), x) + loss.backward() + if loss_ema is None: + loss_ema = loss.item() + else: + loss_ema = 0.95 * loss_ema + 0.05 * loss.item() + pbar.set_description(f"loss: {loss_ema:.4f}") + optim.step() + + torch.save(trans.state_dict(), save_dir + f"latest.pth") + if save_model and ep % 100 == 0: + + torch.save(trans.state_dict(), save_dir + f"model_{ep}.pth") + print('saved model at ' + save_dir + f"model_{ep}.pth") + +def eval(): + if not os.path.exists("Mean_Std.npy"): + data = np.load("data.npy", allow_pickle=True)[()] + + d = np.concatenate(data["cam"], 0) + Mean, Std = np.mean(d, 0), np.std(d, 0) + np.save("Mean_Std", {"Mean": Mean, "Std": Std}) + d = np.load("Mean_Std.npy", allow_pickle=True)[()] + Mean, Std = d["Mean"], d["Std"] + + device = "cuda:0" + n_feature = 5 + + trans = LSTM(input_size=n_feature, hidden_size=512, output_size=n_feature) + trans.to(device) + + # optionally load a model + trans.load_state_dict(torch.load("./result/latest.pth")) + + if not os.path.exists("viz"): + os.mkdir("viz") + + device = "cuda" if torch.cuda.is_available() else "cpu" + model, preprocess = clip.load("ViT-B/32", device=device) + + d = np.load("test_prompt.npy", allow_pickle=True)[()] + + result = [] + for i in tqdm(range(0, len(d['info']), 100)): + txt = d['info'][i:i + 100] + text = [" ".join(v) for v in txt] + + with torch.no_grad(): + c = 
clip.tokenize(text, truncate=True).to(device) + c = model.encode_text(c).float().detach() + + sample = trans(torch.zeros(len(c), 300, n_feature), c) + sample = sample.detach().cpu().numpy() + + for j in range(len(text)): + s = sample[j] * Std[None, :] + Mean[None, :] + result.append(s) + + np.save("LSTM_test", {"result": result, "label": d["label"]}) + +if __name__ == "__main__": + import sys + mode = sys.argv[1] + + if mode == 'train': + train() + elif mode == 'eval': + eval() + else: + print('Error, instruction {} is not in {{train, eval}}'.format(mode)) diff --git a/CCD/src/README.md b/CCD/src/README.md new file mode 100644 index 0000000000000000000000000000000000000000..79c4e8e8b93198aa19687b83cf2c44a6ebff25b2 --- /dev/null +++ b/CCD/src/README.md @@ -0,0 +1,51 @@ +# Cinematographic Camera Diffusion Model + +This repo provides the PyTorch implementation of our paper: + +*Cinematographic Camera Diffusion Model* + +[Hongda Jiang](https://jianghd1996.github.io/), [Xi Wang](https://triocrossing.github.io/), [Marc Christie](http://people.irisa.fr/Marc.Christie/), [Libin Liu](http://libliu.info/), [Baoquan Chen](https://baoquanchen.info/) + +Eurographics 2024 + +The homepage and paper will be released after publication. + +## Prerequisites + +The environment requirements for this repo are simple. + +- Linux +- NVIDIA GPU + CUDA CuDNN +- Python 3.8 +- PyTorch, torchvision, tqdm, matplotlib, numpy, [CLIP](https://github.com/openai/CLIP) + +## Dataset + +We provide the dataset at this [link](https://drive.google.com/file/d/1VxmGy9szWShOKzWvIxrmgaNEkeqGPLJU/view?usp=sharing). The dataset is a numpy dict, where the key 'cam' contains the camera trajectories and 'info' contains the text descriptions. + +## Pretrained Model + +We provide [weights](https://drive.google.com/file/d/136IZeL4PSf9L6FJ4n_jFM6QFLTDjbvr1/view?usp=sharing) trained with text conditioning only. Please create an empty folder `weight` and put the weight file into the folder. + +Tip: +If you use the pretrained weights, write *zooms in* and *zooms out* instead of *pushes in* and *pulls out* when generating sequences, since those are the prompts used during training. + +## Inference + +Simply run ```python main.py gen``` and the generated sequences will be placed in the folder `gen`. + +We provide a Unity scene for visualizing the results [link](https://drive.google.com/file/d/1zAOJ8zN2hYO-dlQJSNl5uR_JtKapjpM8/view?usp=sharing); the project version is 2018.2.13f1. You need to set the file path, the shooting target (head), and the shooting character. Here we provide an example of a 'pan' motion with the prompt 'The camera pans to the character. The camera switches from right front view to right back view. The character is at the middle center of the screen. The camera shoots at close shot.'. + + + + + +## Evaluation + +We provide the classifier `classify.py`, the metrics `metric.py`, and the LSTM-based camera motion generator `LSTM.py`. The training and testing datasets are split randomly with a 9:1 ratio. + +## Acknowledgement + +This code stands on the shoulders of giants. We want to thank the following projects that our code is based on: + +[Conditional Diffusion MNIST](https://github.com/TeaPearce/Conditional_Diffusion_MNIST), [MDM: Human Motion Diffusion Model](https://github.com/GuyTevet/motion-diffusion-model).
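Besides the `python main.py gen` command described in the README above, the generation entry point can also be called from Python through `generate_CCD_sample`, which is defined in `CCD/src/main.py` later in this patch. The snippet below is only a minimal illustrative sketch, not part of the patch itself; it assumes the repo root is on `PYTHONPATH`, the pretrained weights and `Mean_Std.npy` sit in `CCD/checkpoints/`, a CUDA device is available, and CLIP is installed. The prompt is just an example caption.

```python
from CCD.src.main import generate_CCD_sample

# Example caption taken from the README; any supported camera-motion prompt works.
prompt = ("The camera pans to the character. "
          "The camera switches from right front view to right back view. "
          "The character is at the middle center of the screen. "
          "The camera shoots at close shot.")

# gen() samples 10 trajectories of shape (300, 5), de-normalizes them with the
# stored Mean/Std, and smooths them before returning the list.
trajectories = generate_CCD_sample(prompt)
print(len(trajectories), trajectories[0].shape)
```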
\ No newline at end of file diff --git a/CCD/src/classify.py b/CCD/src/classify.py new file mode 100644 index 0000000000000000000000000000000000000000..1d569394b43e26ca4b7f4aa58c57cbacadcdcf2a --- /dev/null +++ b/CCD/src/classify.py @@ -0,0 +1,368 @@ +''' +This script does conditional image generation on MNIST, using a diffusion model + +This code is modified from, +https://github.com/cloneofsimo/minDiffusion + +Diffusion model is based on DDPM, +https://arxiv.org/abs/2006.11239 + +The conditioning idea is taken from 'Classifier-Free Diffusion Guidance', +https://arxiv.org/abs/2207.12598 + +This technique also features in ImageGen 'Photorealistic Text-to-Image Diffusion Modelswith Deep Language Understanding', +https://arxiv.org/abs/2205.11487 + +''' + +from typing import Dict, Tuple +from tqdm import tqdm +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader +from torchvision import models, transforms +from torchvision.datasets import MNIST +from torchvision.utils import save_image, make_grid +import matplotlib.pyplot as plt +from matplotlib.animation import FuncAnimation, PillowWriter +import numpy as np +import os +import clip + +class PositionalEncoding(nn.Module): + def __init__(self, d_model, dropout=0.1, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer('pe', pe) + + def forward(self, x): + # not used in the final model + x = x + self.pe[:x.shape[0], :] + return self.dropout(x) + +class TimestepEmbedder(nn.Module): + def __init__(self, latent_dim, sequence_pos_encoder): + super().__init__() + self.latent_dim = latent_dim + self.sequence_pos_encoder = sequence_pos_encoder + + time_embed_dim = self.latent_dim + self.time_embed = nn.Sequential( + nn.Linear(self.latent_dim, time_embed_dim), + nn.SiLU(), + nn.Linear(time_embed_dim, time_embed_dim), + ) + + def forward(self, timesteps): + return self.time_embed(self.sequence_pos_encoder.pe[timesteps]).permute(1, 0, 2) + +class Transformer(nn.Module): + def __init__(self, n_feature, n_label, latent_dim=256, + num_heads=4, ff_size=1024, dropout=0.1, activation='gelu', + num_layers=4, sliding_wind=300): + super(Transformer, self).__init__() + + self.n_feature = n_feature + self.n_label = n_label + self.num_heads = num_heads + self.ff_size = ff_size + self.dropout = dropout + self.activation = activation + self.num_layers = num_layers + self.latent_dim = latent_dim + + self.input_process = nn.Linear(self.n_feature, self.latent_dim) + + seqTransEncoderlayer = nn.TransformerEncoderLayer(d_model=self.latent_dim, + nhead = self.num_heads, + dim_feedforward = self.ff_size, + dropout = self.dropout, + activation=self.activation) + + self.seqTransEncoder = nn.TransformerEncoder(seqTransEncoderlayer, + num_layers = self.num_layers) + + self.sequence_pos_encoder = PositionalEncoding(self.latent_dim, self.dropout) + self.embed_timestep = TimestepEmbedder(self.latent_dim, self.sequence_pos_encoder) + + self.output_process = nn.Sequential( + nn.Linear(self.latent_dim, 1), + nn.ReLU() + ) + self.pred = nn.Sequential( + nn.Linear(sliding_wind, n_label), + # nn.Softmax(dim=1), + ) + + + def forward(self, 
x): + bs = len(x) + x = self.input_process(x.permute(1, 0, 2)) + + xseq = self.sequence_pos_encoder(x) + xseq = self.seqTransEncoder(xseq) + xseq = self.output_process(xseq).permute(1, 0, 2) + + xseq = xseq.view(bs, -1) + + return self.pred(xseq) + + def forward_feature(self, x): + bs = len(x) + x = self.input_process(x.permute(1, 0, 2)) + + xseq = self.sequence_pos_encoder(x) + xseq = self.seqTransEncoder(xseq) + xseq = self.output_process(xseq).permute(1, 0, 2) + + return xseq.view(bs, -1) + +import torch.utils.data as data +class camdataset(data.Dataset): + def __init__(self, cam, label): + self.cam = cam + self.label = label + + def __getitem__(self, index): + d = self.cam[index] + data = np.concatenate((d, d[-1:].repeat(300-len(d), 0)), 0) + return np.array(data, dtype="float32"), self.label[index] + + def __len__(self): + return len(self.cam) + + +def train_mnist(): + data = np.load("data.npy", allow_pickle=True)[()] + + d = np.concatenate(data["train_cam"]+data["test_cam"], 0) + Mean, Std = np.mean(d, 0), np.std(d, 0) + + np.save("Mean_Std", {"Mean": Mean, "Std": Std}) + + for i in range(len(data["train_cam"])): + data["train_cam"][i] = (data["train_cam"][i] - Mean[None, :]) / (Std[None, :]+1e-8) + + for i in range(len(data["test_cam"])): + data["test_cam"][i] = (data["test_cam"][i] - Mean[None, :]) / (Std[None, :]+1e-8) + + # hardcoding these here + n_epoch = 1000 + batch_size = 128 + device = "cuda:0" + n_feature = 5 + n_label = 6 + lrate = 1e-4 + save_model = True + save_dir = './result/' + if not os.path.exists(save_dir): + os.mkdir(save_dir) + + criterion = torch.nn.CrossEntropyLoss() + trans = Transformer(n_feature=n_feature, n_label=n_label) + trans.to(device) + + optim = torch.optim.Adam(trans.parameters(), lr=lrate) + + dataloader = DataLoader(camdataset(data['train_cam'], data['train_label']), batch_size=batch_size, shuffle=True, num_workers=5) + testloader = DataLoader(camdataset(data['test_cam'], data['test_label']), batch_size=batch_size, shuffle=False, num_workers=5) + + if not os.path.exists("result"): + os.mkdir("result") + + for ep in range(n_epoch): + print(f'epoch {ep}') + + # linear lrate decay + optim.param_groups[0]['lr'] = lrate*(1-ep/n_epoch) + + pbar = tqdm(dataloader) + + trans.train() + correct = 0 + total = 0 + for cam, label in pbar: + cam = cam.to(device) + label = label.to(device) + + pred_v = trans(cam) + + predictions = torch.argmax(pred_v, dim=1) + correct += torch.sum(predictions == label).item() + total += len(predictions) + + optim.zero_grad() + loss = criterion(pred_v, label) + loss.backward() + + pbar.set_description(f"training acc: {100.0 * correct/total:.4f}") + optim.step() + + trans.eval() + correct = 0 + total = 0 + for cam, label in testloader: + cam = cam.to(device) + label = label.to(device) + + pred_v = trans(cam) + predictions = torch.argmax(pred_v, dim=1) + + correct += torch.sum(predictions == label) + total += len(predictions) + print("evaluation accuracy : {}".format(1.0 * correct / total)) + + torch.save(trans.state_dict(), save_dir + f"latest.pth") + if save_model and ep % 100 == 0: + torch.save(trans.state_dict(), save_dir + f"model_{ep}.pth") + print('saved model at ' + save_dir + f"model_{ep}.pth") + +def eval_mnist(file_name): + if not os.path.exists("Mean_Std.npy"): + data = np.load("data.npy", allow_pickle=True)[()] + + d = np.concatenate(data["train_cam"] + data["test_cam"], 0) + Mean, Std = np.mean(d, 0), np.std(d, 0) + np.save("Mean_Std", {"Mean":Mean, "Std":Std}) + + d = np.load("Mean_Std.npy", allow_pickle=True)[()] + 
Mean, Std = d["Mean"], d["Std"] + + data = np.load(file_name+".npy", allow_pickle=True)[()] + + for i in range(len(data["result"])): + data["result"][i] = (data["result"][i] - Mean[None, :]) / (Std[None, :]+1e-8) + + device = "cuda:0" + n_feature = 5 + n_label = 6 + + trans = Transformer(n_feature=n_feature, n_label=n_label) + trans.to(device) + + # optionally load a model + trans.load_state_dict(torch.load("./result/latest.pth")) + + testloader = DataLoader(camdataset(data['result'], data['label']), batch_size=8, num_workers=5) + + correct = 0 + total = 0 + t = [0] * 10 + f = [0] * 10 + trans.eval() + with torch.no_grad(): + for cam, label in tqdm(testloader): + cam = cam.to(device) + label = label.to(device) + + pred_v = trans(cam) + predictions = torch.argmax(pred_v, dim=1) + + correct += torch.sum(predictions == label) + total += len(predictions) + + for i in range(len(predictions)): + if predictions[i] == label[i]: + t[label[i]] += 1 + else: + f[label[i]] += 1 + + print("gen accuracy : {}/{}={} ".format(correct, total, 1.0 * correct / total)) + for i in range(n_label): + print("{} {} {}".format(i, t[i], t[i]+f[i])) + +def process_feature(file_list): + data = np.load("data.npy", allow_pickle=True)[()] + + d = np.concatenate(data["train_cam"] + data["test_cam"], 0) + Mean, Std = np.mean(d, 0), np.std(d, 0) + + for i in range(len(data["train_cam"])): + data["train_cam"][i] = (data["train_cam"][i] - Mean[None, :]) / (Std[None, :]+1e-8) + + for i in range(len(data["test_cam"])): + data["test_cam"][i] = (data["test_cam"][i] - Mean[None, :]) / (Std[None, :]+1e-8) + + device = "cuda:0" + n_feature = 5 + n_label = 6 + + trans = Transformer(n_feature=n_feature, n_label=n_label) + trans.to(device) + + # optionally load a model + trans.load_state_dict(torch.load("./result/latest.pth")) + + trans.eval() + + d = dict() + + testloader = DataLoader(camdataset(data['train_cam'], data['train_label']), batch_size=8, num_workers=5) + + feature = [] + + with torch.no_grad(): + for cam, label in tqdm(testloader): + cam = cam.to(device) + + pred_v = trans.forward_feature(cam).detach().cpu().numpy() + + for v in pred_v: + feature.append(v) + + d["train_data"] = feature + + testloader = DataLoader(camdataset(data['test_cam'], data['test_label']), batch_size=8, num_workers=5) + + feature = [] + + with torch.no_grad(): + for cam, label in tqdm(testloader): + cam = cam.to(device) + + pred_v = trans.forward_feature(cam).detach().cpu().numpy() + + for v in pred_v: + feature.append(v) + + d["test_data"] = feature + + + for file in file_list: + data = np.load(file+".npy", allow_pickle=True)[()] + + for i in range(len(data["result"])): + data["result"][i] = (data["result"][i] - Mean[None, :]) / (Std[None, :] + 1e-8) + + testloader = DataLoader(camdataset(data['result'], data['label']), batch_size=8, num_workers=5) + + feature = [] + + with torch.no_grad(): + for cam, label in tqdm(testloader): + cam = cam.to(device) + + pred_v = trans.forward_feature(cam).detach().cpu().numpy() + + for v in pred_v: + feature.append(v) + + d[file] = feature + + np.save("feature", d) + + +if __name__ == "__main__": + train_mnist() + # + # eval_mnist() + + # process_feature() diff --git a/CCD/src/main.py b/CCD/src/main.py new file mode 100644 index 0000000000000000000000000000000000000000..f28d26aa40e5e6e55fefe0bdba0204e1173ac1aa --- /dev/null +++ b/CCD/src/main.py @@ -0,0 +1,389 @@ +''' +This script does conditional image generation on MNIST, using a diffusion model + +This code is modified from, 
+https://github.com/cloneofsimo/minDiffusion + +Diffusion model is based on DDPM, +https://arxiv.org/abs/2006.11239 + +The conditioning idea is taken from 'Classifier-Free Diffusion Guidance', +https://arxiv.org/abs/2207.12598 + +This technique also features in ImageGen 'Photorealistic Text-to-Image Diffusion Modelswith Deep Language Understanding', +https://arxiv.org/abs/2205.11487 + +''' +import random +from typing import Dict, Tuple +from tqdm import tqdm +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader +from torchvision import models, transforms +from torchvision.datasets import MNIST +from torchvision.utils import save_image, make_grid +import matplotlib.pyplot as plt +from matplotlib.animation import FuncAnimation, PillowWriter +import numpy as np +import os +import clip + +class PositionalEncoding(nn.Module): + def __init__(self, d_model, dropout=0.1, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer('pe', pe) + + def forward(self, x): + # not used in the final model + x = x + self.pe[:x.shape[0], :] + return self.dropout(x) + +class TimestepEmbedder(nn.Module): + def __init__(self, latent_dim, sequence_pos_encoder): + super().__init__() + self.latent_dim = latent_dim + self.sequence_pos_encoder = sequence_pos_encoder + + time_embed_dim = self.latent_dim + self.time_embed = nn.Sequential( + nn.Linear(self.latent_dim, time_embed_dim), + nn.SiLU(), + nn.Linear(time_embed_dim, time_embed_dim), + ) + + def forward(self, timesteps): + return self.time_embed(self.sequence_pos_encoder.pe[timesteps]).permute(1, 0, 2) + +class Transformer(nn.Module): + def __init__(self, n_feature, n_textemb, latent_dim=256, + num_heads=4, ff_size=1024, dropout=0.1, activation='gelu', + num_layers=4, cond_mask_prob=0.1): + super(Transformer, self).__init__() + + self.n_feature = n_feature + self.n_textemb = n_textemb + self.num_heads = num_heads + self.ff_size = ff_size + self.dropout = dropout + self.activation = activation + self.num_layers = num_layers + self.latent_dim = latent_dim + self.cond_mask_prob = cond_mask_prob + + self.embed_text = nn.Linear(self.n_textemb, self.latent_dim) + + self.input_process = nn.Linear(self.n_feature, self.latent_dim) + + seqTransEncoderlayer = nn.TransformerEncoderLayer(d_model=self.latent_dim, + nhead = self.num_heads, + dim_feedforward = self.ff_size, + dropout = self.dropout, + activation=self.activation) + + self.seqTransEncoder = nn.TransformerEncoder(seqTransEncoderlayer, + num_layers = self.num_layers) + + self.sequence_pos_encoder = PositionalEncoding(self.latent_dim, self.dropout) + self.embed_timestep = TimestepEmbedder(self.latent_dim, self.sequence_pos_encoder) + + self.output_process = nn.Linear(self.latent_dim, self.n_feature) + + def mask_cond(self, cond, force_mask=False): + bs, d = cond.shape + if force_mask: + return torch.zeros_like(cond) + elif self.training and self.cond_mask_prob > 0.: + mask = torch.bernoulli(torch.ones(bs, device=cond.device) * self.cond_mask_prob).view(bs, 1) # 1-> use null_cond, 0-> use real cond + return cond * (1. 
- mask) + else: + return cond + + def forward(self, x, emb_text, timesteps, force_mask=False): + emb_time = self.embed_timestep(timesteps) + + emb_text = self.embed_text(self.mask_cond(emb_text, force_mask=force_mask)) + emb = (emb_time + emb_text) + + x = self.input_process(x.permute(1, 0, 2)) + + xseq = torch.cat((emb, x), axis=0) + xseq = self.sequence_pos_encoder(xseq) + output = self.seqTransEncoder(xseq)[1:] + + return self.output_process(output).permute(1, 0, 2) + + + +def ddpm_schedules(beta1, beta2, T): + """ + Returns pre-computed schedules for DDPM sampling, training process. + """ + assert beta1 < beta2 < 1.0, "beta1 and beta2 must be in (0, 1)" + + beta_t = (beta2 - beta1) * torch.arange(0, T + 1, dtype=torch.float32) / T + beta1 + sqrt_beta_t = torch.sqrt(beta_t) + alpha_t = 1 - beta_t + log_alpha_t = torch.log(alpha_t) + alphabar_t = torch.cumsum(log_alpha_t, dim=0).exp() + + sqrtab = torch.sqrt(alphabar_t) + oneover_sqrta = 1 / torch.sqrt(alpha_t) + + sqrtmab = torch.sqrt(1 - alphabar_t) + mab_over_sqrtmab_inv = (1 - alpha_t) / sqrtmab + + return { + "alpha_t": alpha_t, # \alpha_t + "oneover_sqrta": oneover_sqrta, # 1/\sqrt{\alpha_t} + "sqrt_beta_t": sqrt_beta_t, # \sqrt{\beta_t} + "alphabar_t": alphabar_t, # \bar{\alpha_t} + "sqrtab": sqrtab, # \sqrt{\bar{\alpha_t}} + "sqrtmab": sqrtmab, # \sqrt{1-\bar{\alpha_t}} + "mab_over_sqrtmab": mab_over_sqrtmab_inv, # (1-\alpha_t)/\sqrt{1-\bar{\alpha_t}} + } + + +class DDPM(nn.Module): + def __init__(self, nn_model, betas, n_T, device): + super(DDPM, self).__init__() + self.nn_model = nn_model.to(device) + + # register_buffer allows accessing dictionary produced by ddpm_schedules + # e.g. can access self.sqrtab later + for k, v in ddpm_schedules(betas[0], betas[1], n_T).items(): + self.register_buffer(k, v) + + self.n_T = n_T + self.device = device + self.loss_mse = nn.MSELoss() + + self.count = [0] * n_T + + def forward(self, x, c): + """ + this method is used in training, so samples t and noise randomly + """ + + _ts = torch.randint(1, self.n_T, (x.shape[0],)).to(self.device) # t ~ Uniform(0, n_T) + noise = torch.randn_like(x) # eps ~ N(0, 1) + + for t in _ts: + self.count[t] += 1 + + x_t = ( + self.sqrtab[_ts, None, None] * x + + self.sqrtmab[_ts, None, None] * noise + ) # This is the x_t, which is sqrt(alphabar) x_0 + sqrt(1-alphabar) * eps + # We should predict the "error term" from this x_t. Loss is what we return. 
+ + # return MSE between added noise, and our predicted noise + return self.loss_mse(noise, self.nn_model(x_t, c, _ts)) + + def sample(self, n_sample, c, size, device, guide_w): + # we follow the guidance sampling scheme described in 'Classifier-Free Diffusion Guidance' + # to make the fwd passes efficient, we concat two versions of the dataset, + # one with context_mask=0 and the other context_mask=1 + # we then mix the outputs with the guidance scale, w + # where w>0 means more guidance + + x_i = torch.randn(n_sample, *size).to(device) # x_T ~ N(0, 1), sample initial noise + + if c.shape[0] == 1: + c_i = c.repeat(n_sample, 1).float() + else: + c_i = c.float() + + for i in tqdm(range(self.n_T, 0, -1)): + t_is = torch.tensor(i).to(device).repeat(n_sample) + + # split predictions and compute weighting + eps1 = self.nn_model(x_i, c_i, t_is) + eps2 = self.nn_model(x_i, c_i, t_is, force_mask=True) + eps = eps2 + guide_w * (eps1 - eps2) + + z = torch.randn(n_sample, *size).to(device) if i > 1 else 0 + + + x_i = ( + self.oneover_sqrta[i] * (x_i - eps * self.mab_over_sqrtmab[i]) + + self.sqrt_beta_t[i] * z + ) + + return x_i + + + +import torch.utils.data as data +class camdataset(data.Dataset): + def __init__(self, data, label): + self.data = data + self.label = label + + def __getitem__(self, index): + text = np.random.choice(self.label[index], np.random.randint(1, len(self.label[index])+1), replace=False) + + d = self.data[index] + d = np.concatenate((d, d[-1:].repeat(300-len(d), 0)), 0) + + return np.array(d, dtype="float32"), " ".join(text) + + def __len__(self): + return len(self.data) + + +def train(): + data = np.load("data.npy", allow_pickle=True)[()] + + d = np.concatenate(data["cam"], 0) + Mean, Std = np.mean(d, 0), np.std(d, 0) + + for i in range(len(data["cam"])): + data["cam"][i] = (data["cam"][i] - Mean[None, :]) / (Std[None, :]+1e-8) + + # hardcoding these here + n_epoch = 20000 + batch_size = 256 + n_T = 1000 # 500 + device = "cuda:0" + n_feature = 5 + n_textemb = 512 + lrate = 1e-4 + save_model = True + save_dir = './weight/' + if not os.path.exists(save_dir): + os.mkdir(save_dir) + + ddpm = DDPM(nn_model=Transformer(n_feature=n_feature, n_textemb=n_textemb), betas=(1e-4, 0.02), n_T=n_T, device=device) + ddpm.to(device) + + optim = torch.optim.Adam(ddpm.parameters(), lr=lrate) + + dataloader = DataLoader(camdataset(data['cam'], data['info']), batch_size=batch_size, shuffle=True, num_workers=5) + + if not os.path.exists("result"): + os.mkdir("result") + + device = "cuda" if torch.cuda.is_available() else "cpu" + model, preprocess = clip.load("ViT-B/32", device=device) + + for ep in range(n_epoch): + print(f'epoch {ep}') + ddpm.train() + + # linear lrate decay + optim.param_groups[0]['lr'] = lrate*(1-ep/n_epoch) + + pbar = tqdm(dataloader) + loss_ema = None + for x, c in pbar: + optim.zero_grad() + x = x.to(device) + with torch.no_grad(): + c = clip.tokenize(c, truncate=True).to(device) + c = model.encode_text(c).detach() + + loss = ddpm(x, c) + loss.backward() + if loss_ema is None: + loss_ema = loss.item() + else: + loss_ema = 0.95 * loss_ema + 0.05 * loss.item() + pbar.set_description(f"loss: {loss_ema:.4f}") + optim.step() + + torch.save(ddpm.state_dict(), save_dir + f"latest.pth") + if save_model and ep % 100 == 0: + torch.save(ddpm.state_dict(), save_dir + f"model_{ep}.pth") + print('saved model at ' + save_dir + f"model_{ep}.pth") + + +def gen(text: str): + script_dir = os.path.dirname(os.path.abspath(__file__)) + + mean_std_path = os.path.join(script_dir, "..", 
"checkpoints", "Mean_Std.npy") + + latest_path = os.path.join(script_dir, "..", "checkpoints", "latest.pth") + + if not os.path.exists(mean_std_path): + data = np.load("data.npy", allow_pickle=True)[()] + + d = np.concatenate(data["cam"], 0) + Mean, Std = np.mean(d, 0), np.std(d, 0) + np.save("Mean_Std", {"Mean": Mean, "Std": Std}) + + d = np.load(mean_std_path, allow_pickle=True)[()] + Mean, Std = d["Mean"], d["Std"] + + n_T = 1000 # 500 + device = "cuda:0" + n_feature = 5 + n_textemb = 512 + + ddpm = DDPM(nn_model=Transformer(n_feature=n_feature, n_textemb=n_textemb), betas=(1e-4, 0.02), n_T=n_T, + device=device) + ddpm.to(device) + + # optionally load a model + ddpm.load_state_dict(torch.load(latest_path)) + + if not os.path.exists("gen"): + os.mkdir("gen") + + device = "cuda" if torch.cuda.is_available() else "cpu" + model, preprocess = clip.load("ViT-B/32", device=device) + + #text = ["The camera pans to the character. The camera switches from right front view to right back view. The character is at the middle center of the screen. The camera shoots at close shot."] + + result = [] + + def smooth(x, winds=10, T=4): + if T == 0: + return x + n_x = np.array(x) + for i in range(len(x)): + n_x[i] = np.mean(x[max(0, i - winds):min(len(x), i + winds), :], 0) + return smooth(n_x, T=T - 1) + + with torch.no_grad(): + c = clip.tokenize(text, truncate=True).to(device) + c = model.encode_text(c) + + sample = ddpm.sample(10, c, (300, n_feature), device, guide_w=2.0) + sample = sample.detach().cpu().numpy() + + for j in range(len(sample)): + s = smooth(sample[j] * Std[None, :] + Mean[None, :]) + result.append(s) + return result + # with open("gen/{}.txt".format(j), "w") as f: + # for i in range(len(s)): + # txt = "" + # for k in range(5): + # txt += str(s[i][k]) + " " + # f.write(txt+"\n") + + +def generate_CCD_sample(text: str): + return gen(text) + +if __name__ == "__main__": + import sys + mode = sys.argv[1] + + if mode == 'train': + train() + elif mode == 'gen': + gen() + else: + print('Error, instruction {} is not in {train, gen}') + diff --git a/CCD/src/metric.py b/CCD/src/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..80d24d619af7816224577c9faa0a920704a8202f --- /dev/null +++ b/CCD/src/metric.py @@ -0,0 +1,178 @@ +import numpy as np +import torch +import matplotlib.pyplot as plt +from scipy import linalg +import os +from tqdm import tqdm + +def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): + """Numpy implementation of the Frechet Distance. + The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1) + and X_2 ~ N(mu_2, C_2) is + d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)). + Stable version by Dougal J. Sutherland. + Params: + -- mu1 : Numpy array containing the activations of a layer of the + inception net (like returned by the function 'get_predictions') + for generated samples. + -- mu2 : The sample mean over activations, precalculated on an + representative data set. + -- sigma1: The covariance matrix over activations for generated samples. + -- sigma2: The covariance matrix over activations, precalculated on an + representative data set. + Returns: + -- : The Frechet Distance. 
+ """ + + mu1 = np.atleast_1d(mu1) + mu2 = np.atleast_1d(mu2) + + sigma1 = np.atleast_2d(sigma1) + sigma2 = np.atleast_2d(sigma2) + + assert mu1.shape == mu2.shape, \ + 'Training and test mean vectors have different lengths' + assert sigma1.shape == sigma2.shape, \ + 'Training and test covariances have different dimensions' + + diff = mu1 - mu2 + + # Product might be almost singular + covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) + if not np.isfinite(covmean).all(): + msg = ('fid calculation produces singular product; ' + 'adding %s to diagonal of cov estimates') % eps + print(msg) + offset = np.eye(sigma1.shape[0]) * eps + covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) + + # Numerical error might give slight imaginary component + if np.iscomplexobj(covmean): + if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): + m = np.max(np.abs(covmean.imag)) + raise ValueError('Imaginary component {}'.format(m)) + covmean = covmean.real + + tr_covmean = np.trace(covmean) + + return (diff.dot(diff) + np.trace(sigma1) + + np.trace(sigma2) - 2 * tr_covmean) + +def calculate_activation_statistics(data): + """Calculation of the statistics used by the FID. + Params: + -- files : List of image files paths + -- model : Instance of inception model + -- batch_size : The images numpy array is split into batches with + batch size batch_size. A reasonable batch size + depends on the hardware. + -- dims : Dimensionality of features returned by Inception + -- device : Device to run calculations + -- num_workers : Number of parallel dataloader workers + Returns: + -- mu : The mean over samples of the activations of the pool_3 layer of + the inception model. + -- sigma : The covariance matrix of the activations of the pool_3 layer of + the inception model. 
+ """ + mu = np.mean(data, axis=0) + sigma = np.cov(data, rowvar=False) + return mu, sigma + +def calculate_diversity(data, first_indices, second_indices): + diversity = 0 + + d = torch.FloatTensor(data) + + for first_idx, second_idx in zip(first_indices, second_indices): + diversity += torch.dist(d[first_idx, :], d[second_idx, :]) + + diversity /= len(first_indices) + return diversity + +d = np.load("feature.npy", allow_pickle=True)[()] + +d0 = d["train_data"] +d1 = d["test_data"] +d2 = d["gen_T5"] +d3 = d["gen_GRU_T5"] +d4 = d["LSTM_Des"] +d5 = d["gen"] + +Mean, Std = np.mean(d0, 0), np.std(d0, 0) +d0 = [(v - Mean[None, :]) / Std[None, :] for v in d0] +d1 = [(v - Mean[None, :]) / Std[None, :] for v in d1] +d2 = [(v - Mean[None, :]) / Std[None, :] for v in d2] +d3 = [(v - Mean[None, :]) / Std[None, :] for v in d3] +d4 = [(v - Mean[None, :]) / Std[None, :] for v in d4] +d5 = [(v - Mean[None, :]) / Std[None, :] for v in d5] + +if not os.path.exists("viz"): + os.mkdir("viz") + + +d0 = np.array([v.flatten() for v in d0]) +d1 = np.array([v.flatten() for v in d1]) +d2 = np.array([v.flatten() for v in d2]) +d3 = np.array([v.flatten() for v in d3]) +d4 = np.array([v.flatten() for v in d4]) +d5 = np.array([v.flatten() for v in d5]) + +print("Diversity") + +diversity_times = 10000 +num_motions = len(d1) +first_indices = np.random.randint(0, num_motions, diversity_times) +second_indices = np.random.randint(0, num_motions, diversity_times) + +print(calculate_diversity(d1, first_indices, second_indices)) +print(calculate_diversity(d2, first_indices, second_indices)) +print(calculate_diversity(d3, first_indices, second_indices)) +print(calculate_diversity(d4, first_indices, second_indices)) +print(calculate_diversity(d5, first_indices, second_indices)) + +print("Diversity with action label") + +d = np.load("data.npy", allow_pickle=True)[()] + +label = dict() +for i in range(6): + label[i] = [] +for i in range(len(d['test_label'])): + label[d['test_label'][i]].append(i) + +diversity_times = 1000 +first_indices = [] +second_indices = [] +for i in range(6): + idx = np.random.randint(0, len(label[i]), diversity_times) + for j in idx: + first_indices.append(label[i][j]) + idx = np.random.randint(0, len(label[i]), diversity_times) + for j in idx: + second_indices.append(label[i][j]) + +import random +print(random.shuffle(second_indices)) + +print(calculate_diversity(d1, first_indices, second_indices)) +print(calculate_diversity(d2, first_indices, second_indices)) +print(calculate_diversity(d3, first_indices, second_indices)) +print(calculate_diversity(d4, first_indices, second_indices)) +print(calculate_diversity(d5, first_indices, second_indices)) + + +print("FID with training") + +mu0, sigma0 = calculate_activation_statistics(d0) +mu1, sigma1 = calculate_activation_statistics(d1) +mu2, sigma2 = calculate_activation_statistics(d2) +mu3, sigma3 = calculate_activation_statistics(d3) +mu4, sigma4 = calculate_activation_statistics(d4) +mu5, sigma5 = calculate_activation_statistics(d5) + +print(calculate_frechet_distance(mu0, sigma0, mu1, sigma1)) +print(calculate_frechet_distance(mu0, sigma0, mu2, sigma2)) +print(calculate_frechet_distance(mu0, sigma0, mu3, sigma3)) +print(calculate_frechet_distance(mu0, sigma0, mu4, sigma4)) +print(calculate_frechet_distance(mu0, sigma0, mu5, sigma5)) \ No newline at end of file diff --git a/CCD/utils/rerun.py b/CCD/utils/rerun.py new file mode 100644 index 0000000000000000000000000000000000000000..ee94134195051e5a359ac9f6530bcb7a6afa385e --- /dev/null +++ b/CCD/utils/rerun.py 
@@ -0,0 +1,102 @@ +import numpy as np +from matplotlib import colormaps +import rerun as rr +from rerun.components import Material +from scipy.spatial import transform + + +def color_fn(x, cmap="tab10"): + return colormaps[cmap](x % colormaps[cmap].N) + + +def ccd_log_sample( + root_name: str, + traj: np.ndarray, +): + + traj = traj[0] + num_cameras = traj.shape[0] + + rr.log(root_name, rr.ViewCoordinates.RIGHT_HAND_Y_DOWN, timeless=True) + + rr.log( + f"{root_name}/trajectory/points", + rr.Points3D(traj[:, :3]), + timeless=True, + ) + + rr.log( + f"{root_name}/trajectory/line", + rr.LineStrips3D( + np.stack((traj[:, :3][:-1], traj[:, :3][1:]), axis=1), + colors=[(1.0, 0.0, 1.0, 1.0)], # Purple color + ), + timeless=True, + ) + + + for k in range(num_cameras): + rr.set_time_sequence("frame_idx", k) + + translation = traj[k][:3] + + fx = 955.02 # Focal length in X + fy = 955.02 # Focal length in Y (same as fx for 1:1 aspect ratio) + cx = 256 # Principal point X (center of 512x512 image) + cy = 256 # Principal point Y (center of 512x512 image) + K = np.array([ + [fx, 0, cx], + [0, fy, cy], + [0, 0, 1] + ]) + + rr.log( + f"{root_name}/camera/image", + rr.Pinhole( + image_from_camera=K, + width=K[0, -1] * 2, + height=K[1, -1] * 2, + ), + ) + + width = K[0, -1] * 2 + height = K[1, -1] * 2 + fov_x = 2 * np.arctan(width / (2 * K[0, 0])) + fov_y = 2 * np.arctan(height / (2 * K[1, 1])) + + horizontal_angle = np.arctan(traj[k][3] * np.tan(fov_x / 2)) + vertical_angle = np.arctan(traj[k][4] * np.tan(fov_y / 2)) + + direction = -translation + direction /= np.linalg.norm(direction) + + up = np.array([0, 1, 0]) + + right = np.cross(up, direction) + right /= np.linalg.norm(right) + up = np.cross(direction, right) + + rotation_matrix = np.vstack([right, up, direction]).T + + rotation_x = transform.Rotation.from_rotvec(vertical_angle * np.array([1, 0, 0])) + rotation_y = transform.Rotation.from_rotvec(-horizontal_angle * np.array([0, 1, 0])) + + rotation_combined = rotation_y * transform.Rotation.from_matrix(rotation_matrix) * rotation_x + + rotation_q = rotation_combined.as_quat() + + rr.log( + f"{root_name}/camera", + rr.Transform3D( + translation=translation, + rotation=rr.Quaternion(xyzw=rotation_q), + ), + ) + + rr.set_time_sequence("image", k) + + rr.log( + f"{root_name}/char_traj/points", + rr.Points3D([[0, 0, 0]], colors=[(1.0, 0.0, 0.0, 1.0)]), + timeless=True, + ) diff --git a/checkpoints/ca-mixed-e449.ckpt b/ET/checkpoints/ca-mixed-e449.ckpt similarity index 100% rename from checkpoints/ca-mixed-e449.ckpt rename to ET/checkpoints/ca-mixed-e449.ckpt diff --git a/configs/compnode/cpu.yaml b/ET/configs/compnode/cpu.yaml similarity index 100% rename from configs/compnode/cpu.yaml rename to ET/configs/compnode/cpu.yaml diff --git a/configs/compnode/gpu.yaml b/ET/configs/compnode/gpu.yaml similarity index 100% rename from configs/compnode/gpu.yaml rename to ET/configs/compnode/gpu.yaml diff --git a/configs/config.yaml b/ET/configs/config.yaml similarity index 75% rename from configs/config.yaml rename to ET/configs/config.yaml index ad2522b9a7a3da0fa57763e646ae33fedb974168..01a3e30a436b2542c95e5cb9e0984fb2b32b85fe 100644 --- a/configs/config.yaml +++ b/ET/configs/config.yaml @@ -8,9 +8,9 @@ dataset: char: load_vertices: true -checkpoint_path: 'checkpoints/ca-mixed-e449.ckpt' +checkpoint_path: 'ET/checkpoints/ca-mixed-e449.ckpt' batch_size: 128 -data_dir: data +data_dir: ET/data hydra: run: diff --git a/configs/dataset/caption/caption.yaml b/ET/configs/dataset/caption/caption.yaml similarity index 100% 
rename from configs/dataset/caption/caption.yaml rename to ET/configs/dataset/caption/caption.yaml diff --git a/configs/dataset/char/char.yaml b/ET/configs/dataset/char/char.yaml similarity index 100% rename from configs/dataset/char/char.yaml rename to ET/configs/dataset/char/char.yaml diff --git a/configs/dataset/standardization/0300.yaml b/ET/configs/dataset/standardization/0300.yaml similarity index 100% rename from configs/dataset/standardization/0300.yaml rename to ET/configs/dataset/standardization/0300.yaml diff --git a/configs/dataset/traj+caption+char.yaml b/ET/configs/dataset/traj+caption+char.yaml similarity index 100% rename from configs/dataset/traj+caption+char.yaml rename to ET/configs/dataset/traj+caption+char.yaml diff --git a/configs/dataset/trajectory/rot6d_trajectory.yaml b/ET/configs/dataset/trajectory/rot6d_trajectory.yaml similarity index 100% rename from configs/dataset/trajectory/rot6d_trajectory.yaml rename to ET/configs/dataset/trajectory/rot6d_trajectory.yaml diff --git a/configs/diffuser/network/module/ca_director.yaml b/ET/configs/diffuser/network/module/ca_director.yaml similarity index 100% rename from configs/diffuser/network/module/ca_director.yaml rename to ET/configs/diffuser/network/module/ca_director.yaml diff --git a/configs/diffuser/network/rn_director.yaml b/ET/configs/diffuser/network/rn_director.yaml similarity index 100% rename from configs/diffuser/network/rn_director.yaml rename to ET/configs/diffuser/network/rn_director.yaml diff --git a/configs/diffuser/rn_director_edm.yaml b/ET/configs/diffuser/rn_director_edm.yaml similarity index 100% rename from configs/diffuser/rn_director_edm.yaml rename to ET/configs/diffuser/rn_director_edm.yaml diff --git a/data/cam_segments/2011_F_EuMeT2wBo_00014_00001.npy b/ET/data/cam_segments/2011_F_EuMeT2wBo_00014_00001.npy similarity index 100% rename from data/cam_segments/2011_F_EuMeT2wBo_00014_00001.npy rename to ET/data/cam_segments/2011_F_EuMeT2wBo_00014_00001.npy diff --git a/data/cam_segments/2011_KAeAqaA0Llg_00005_00001.npy b/ET/data/cam_segments/2011_KAeAqaA0Llg_00005_00001.npy similarity index 100% rename from data/cam_segments/2011_KAeAqaA0Llg_00005_00001.npy rename to ET/data/cam_segments/2011_KAeAqaA0Llg_00005_00001.npy diff --git a/data/cam_segments/2011_MCkKihQrNA4_00014_00000.npy b/ET/data/cam_segments/2011_MCkKihQrNA4_00014_00000.npy similarity index 100% rename from data/cam_segments/2011_MCkKihQrNA4_00014_00000.npy rename to ET/data/cam_segments/2011_MCkKihQrNA4_00014_00000.npy diff --git a/data/caption/2011_F_EuMeT2wBo_00014_00001.txt b/ET/data/caption/2011_F_EuMeT2wBo_00014_00001.txt similarity index 100% rename from data/caption/2011_F_EuMeT2wBo_00014_00001.txt rename to ET/data/caption/2011_F_EuMeT2wBo_00014_00001.txt diff --git a/data/caption/2011_KAeAqaA0Llg_00005_00001.txt b/ET/data/caption/2011_KAeAqaA0Llg_00005_00001.txt similarity index 100% rename from data/caption/2011_KAeAqaA0Llg_00005_00001.txt rename to ET/data/caption/2011_KAeAqaA0Llg_00005_00001.txt diff --git a/data/caption/2011_MCkKihQrNA4_00014_00000.txt b/ET/data/caption/2011_MCkKihQrNA4_00014_00000.txt similarity index 100% rename from data/caption/2011_MCkKihQrNA4_00014_00000.txt rename to ET/data/caption/2011_MCkKihQrNA4_00014_00000.txt diff --git a/data/caption_clip/seq/2011_F_EuMeT2wBo_00014_00001.npy b/ET/data/caption_clip/seq/2011_F_EuMeT2wBo_00014_00001.npy similarity index 100% rename from data/caption_clip/seq/2011_F_EuMeT2wBo_00014_00001.npy rename to ET/data/caption_clip/seq/2011_F_EuMeT2wBo_00014_00001.npy 
diff --git a/data/caption_clip/seq/2011_KAeAqaA0Llg_00005_00001.npy b/ET/data/caption_clip/seq/2011_KAeAqaA0Llg_00005_00001.npy similarity index 100% rename from data/caption_clip/seq/2011_KAeAqaA0Llg_00005_00001.npy rename to ET/data/caption_clip/seq/2011_KAeAqaA0Llg_00005_00001.npy diff --git a/data/caption_clip/seq/2011_MCkKihQrNA4_00014_00000.npy b/ET/data/caption_clip/seq/2011_MCkKihQrNA4_00014_00000.npy similarity index 100% rename from data/caption_clip/seq/2011_MCkKihQrNA4_00014_00000.npy rename to ET/data/caption_clip/seq/2011_MCkKihQrNA4_00014_00000.npy diff --git a/data/caption_clip/token/2011_F_EuMeT2wBo_00014_00001.npy b/ET/data/caption_clip/token/2011_F_EuMeT2wBo_00014_00001.npy similarity index 100% rename from data/caption_clip/token/2011_F_EuMeT2wBo_00014_00001.npy rename to ET/data/caption_clip/token/2011_F_EuMeT2wBo_00014_00001.npy diff --git a/data/caption_clip/token/2011_KAeAqaA0Llg_00005_00001.npy b/ET/data/caption_clip/token/2011_KAeAqaA0Llg_00005_00001.npy similarity index 100% rename from data/caption_clip/token/2011_KAeAqaA0Llg_00005_00001.npy rename to ET/data/caption_clip/token/2011_KAeAqaA0Llg_00005_00001.npy diff --git a/data/caption_clip/token/2011_MCkKihQrNA4_00014_00000.npy b/ET/data/caption_clip/token/2011_MCkKihQrNA4_00014_00000.npy similarity index 100% rename from data/caption_clip/token/2011_MCkKihQrNA4_00014_00000.npy rename to ET/data/caption_clip/token/2011_MCkKihQrNA4_00014_00000.npy diff --git a/data/char/2011_F_EuMeT2wBo_00014_00001.npy b/ET/data/char/2011_F_EuMeT2wBo_00014_00001.npy similarity index 100% rename from data/char/2011_F_EuMeT2wBo_00014_00001.npy rename to ET/data/char/2011_F_EuMeT2wBo_00014_00001.npy diff --git a/data/char/2011_KAeAqaA0Llg_00005_00001.npy b/ET/data/char/2011_KAeAqaA0Llg_00005_00001.npy similarity index 100% rename from data/char/2011_KAeAqaA0Llg_00005_00001.npy rename to ET/data/char/2011_KAeAqaA0Llg_00005_00001.npy diff --git a/data/char/2011_MCkKihQrNA4_00014_00000.npy b/ET/data/char/2011_MCkKihQrNA4_00014_00000.npy similarity index 100% rename from data/char/2011_MCkKihQrNA4_00014_00000.npy rename to ET/data/char/2011_MCkKihQrNA4_00014_00000.npy diff --git a/data/char_raw/2011_F_EuMeT2wBo_00014_00001.npy b/ET/data/char_raw/2011_F_EuMeT2wBo_00014_00001.npy similarity index 100% rename from data/char_raw/2011_F_EuMeT2wBo_00014_00001.npy rename to ET/data/char_raw/2011_F_EuMeT2wBo_00014_00001.npy diff --git a/data/char_raw/2011_KAeAqaA0Llg_00005_00001.npy b/ET/data/char_raw/2011_KAeAqaA0Llg_00005_00001.npy similarity index 100% rename from data/char_raw/2011_KAeAqaA0Llg_00005_00001.npy rename to ET/data/char_raw/2011_KAeAqaA0Llg_00005_00001.npy diff --git a/data/char_raw/2011_MCkKihQrNA4_00014_00000.npy b/ET/data/char_raw/2011_MCkKihQrNA4_00014_00000.npy similarity index 100% rename from data/char_raw/2011_MCkKihQrNA4_00014_00000.npy rename to ET/data/char_raw/2011_MCkKihQrNA4_00014_00000.npy diff --git a/data/char_segments/2011_F_EuMeT2wBo_00014_00001.npy b/ET/data/char_segments/2011_F_EuMeT2wBo_00014_00001.npy similarity index 100% rename from data/char_segments/2011_F_EuMeT2wBo_00014_00001.npy rename to ET/data/char_segments/2011_F_EuMeT2wBo_00014_00001.npy diff --git a/data/char_segments/2011_KAeAqaA0Llg_00005_00001.npy b/ET/data/char_segments/2011_KAeAqaA0Llg_00005_00001.npy similarity index 100% rename from data/char_segments/2011_KAeAqaA0Llg_00005_00001.npy rename to ET/data/char_segments/2011_KAeAqaA0Llg_00005_00001.npy diff --git a/data/char_segments/2011_MCkKihQrNA4_00014_00000.npy 
b/ET/data/char_segments/2011_MCkKihQrNA4_00014_00000.npy similarity index 100% rename from data/char_segments/2011_MCkKihQrNA4_00014_00000.npy rename to ET/data/char_segments/2011_MCkKihQrNA4_00014_00000.npy diff --git a/data/demo_split.txt b/ET/data/demo_split.txt similarity index 100% rename from data/demo_split.txt rename to ET/data/demo_split.txt diff --git a/data/intrinsics/2011_F_EuMeT2wBo_00014_00001.npy b/ET/data/intrinsics/2011_F_EuMeT2wBo_00014_00001.npy similarity index 100% rename from data/intrinsics/2011_F_EuMeT2wBo_00014_00001.npy rename to ET/data/intrinsics/2011_F_EuMeT2wBo_00014_00001.npy diff --git a/data/intrinsics/2011_KAeAqaA0Llg_00005_00001.npy b/ET/data/intrinsics/2011_KAeAqaA0Llg_00005_00001.npy similarity index 100% rename from data/intrinsics/2011_KAeAqaA0Llg_00005_00001.npy rename to ET/data/intrinsics/2011_KAeAqaA0Llg_00005_00001.npy diff --git a/data/intrinsics/2011_MCkKihQrNA4_00014_00000.npy b/ET/data/intrinsics/2011_MCkKihQrNA4_00014_00000.npy similarity index 100% rename from data/intrinsics/2011_MCkKihQrNA4_00014_00000.npy rename to ET/data/intrinsics/2011_MCkKihQrNA4_00014_00000.npy diff --git a/data/traj/2011_F_EuMeT2wBo_00014_00001.txt b/ET/data/traj/2011_F_EuMeT2wBo_00014_00001.txt similarity index 100% rename from data/traj/2011_F_EuMeT2wBo_00014_00001.txt rename to ET/data/traj/2011_F_EuMeT2wBo_00014_00001.txt diff --git a/data/traj/2011_KAeAqaA0Llg_00005_00001.txt b/ET/data/traj/2011_KAeAqaA0Llg_00005_00001.txt similarity index 100% rename from data/traj/2011_KAeAqaA0Llg_00005_00001.txt rename to ET/data/traj/2011_KAeAqaA0Llg_00005_00001.txt diff --git a/data/traj/2011_MCkKihQrNA4_00014_00000.txt b/ET/data/traj/2011_MCkKihQrNA4_00014_00000.txt similarity index 100% rename from data/traj/2011_MCkKihQrNA4_00014_00000.txt rename to ET/data/traj/2011_MCkKihQrNA4_00014_00000.txt diff --git a/data/traj_raw/2011_F_EuMeT2wBo_00014_00001.txt b/ET/data/traj_raw/2011_F_EuMeT2wBo_00014_00001.txt similarity index 100% rename from data/traj_raw/2011_F_EuMeT2wBo_00014_00001.txt rename to ET/data/traj_raw/2011_F_EuMeT2wBo_00014_00001.txt diff --git a/data/traj_raw/2011_KAeAqaA0Llg_00005_00001.txt b/ET/data/traj_raw/2011_KAeAqaA0Llg_00005_00001.txt similarity index 100% rename from data/traj_raw/2011_KAeAqaA0Llg_00005_00001.txt rename to ET/data/traj_raw/2011_KAeAqaA0Llg_00005_00001.txt diff --git a/data/traj_raw/2011_MCkKihQrNA4_00014_00000.txt b/ET/data/traj_raw/2011_MCkKihQrNA4_00014_00000.txt similarity index 100% rename from data/traj_raw/2011_MCkKihQrNA4_00014_00000.txt rename to ET/data/traj_raw/2011_MCkKihQrNA4_00014_00000.txt diff --git a/data/vert/2011_F_EuMeT2wBo_00014_00001.npy b/ET/data/vert/2011_F_EuMeT2wBo_00014_00001.npy similarity index 100% rename from data/vert/2011_F_EuMeT2wBo_00014_00001.npy rename to ET/data/vert/2011_F_EuMeT2wBo_00014_00001.npy diff --git a/data/vert/2011_KAeAqaA0Llg_00005_00001.npy b/ET/data/vert/2011_KAeAqaA0Llg_00005_00001.npy similarity index 100% rename from data/vert/2011_KAeAqaA0Llg_00005_00001.npy rename to ET/data/vert/2011_KAeAqaA0Llg_00005_00001.npy diff --git a/data/vert/2011_MCkKihQrNA4_00014_00000.npy b/ET/data/vert/2011_MCkKihQrNA4_00014_00000.npy similarity index 100% rename from data/vert/2011_MCkKihQrNA4_00014_00000.npy rename to ET/data/vert/2011_MCkKihQrNA4_00014_00000.npy diff --git a/data/vert_raw/2011_F_EuMeT2wBo_00014_00001.npy b/ET/data/vert_raw/2011_F_EuMeT2wBo_00014_00001.npy similarity index 100% rename from data/vert_raw/2011_F_EuMeT2wBo_00014_00001.npy rename to 
ET/data/vert_raw/2011_F_EuMeT2wBo_00014_00001.npy diff --git a/data/vert_raw/2011_KAeAqaA0Llg_00005_00001.npy b/ET/data/vert_raw/2011_KAeAqaA0Llg_00005_00001.npy similarity index 100% rename from data/vert_raw/2011_KAeAqaA0Llg_00005_00001.npy rename to ET/data/vert_raw/2011_KAeAqaA0Llg_00005_00001.npy diff --git a/data/vert_raw/2011_MCkKihQrNA4_00014_00000.npy b/ET/data/vert_raw/2011_MCkKihQrNA4_00014_00000.npy similarity index 100% rename from data/vert_raw/2011_MCkKihQrNA4_00014_00000.npy rename to ET/data/vert_raw/2011_MCkKihQrNA4_00014_00000.npy diff --git a/src/datasets/datamodule.py b/ET/src/datasets/datamodule.py similarity index 100% rename from src/datasets/datamodule.py rename to ET/src/datasets/datamodule.py diff --git a/src/datasets/modalities/caption_dataset.py b/ET/src/datasets/modalities/caption_dataset.py similarity index 100% rename from src/datasets/modalities/caption_dataset.py rename to ET/src/datasets/modalities/caption_dataset.py diff --git a/src/datasets/modalities/char_dataset.py b/ET/src/datasets/modalities/char_dataset.py similarity index 100% rename from src/datasets/modalities/char_dataset.py rename to ET/src/datasets/modalities/char_dataset.py diff --git a/src/datasets/modalities/trajectory_dataset.py b/ET/src/datasets/modalities/trajectory_dataset.py similarity index 100% rename from src/datasets/modalities/trajectory_dataset.py rename to ET/src/datasets/modalities/trajectory_dataset.py diff --git a/src/datasets/multimodal_dataset.py b/ET/src/datasets/multimodal_dataset.py similarity index 100% rename from src/datasets/multimodal_dataset.py rename to ET/src/datasets/multimodal_dataset.py diff --git a/src/diffuser.py b/ET/src/diffuser.py similarity index 100% rename from src/diffuser.py rename to ET/src/diffuser.py diff --git a/src/models/modules/director.py b/ET/src/models/modules/director.py similarity index 100% rename from src/models/modules/director.py rename to ET/src/models/modules/director.py diff --git a/src/models/networks.py b/ET/src/models/networks.py similarity index 100% rename from src/models/networks.py rename to ET/src/models/networks.py diff --git a/utils/common_viz.py b/ET/utils/common_viz.py similarity index 100% rename from utils/common_viz.py rename to ET/utils/common_viz.py diff --git a/utils/file_utils.py b/ET/utils/file_utils.py similarity index 100% rename from utils/file_utils.py rename to ET/utils/file_utils.py diff --git a/utils/random_utils.py b/ET/utils/random_utils.py similarity index 100% rename from utils/random_utils.py rename to ET/utils/random_utils.py diff --git a/utils/rerun.py b/ET/utils/rerun.py similarity index 98% rename from utils/rerun.py rename to ET/utils/rerun.py index b1091cab23516fd9bbec6c8d7165a2fcbf71962c..097ddf04782c834f154777fb8a997574313ce673 100644 --- a/utils/rerun.py +++ b/ET/utils/rerun.py @@ -9,7 +9,7 @@ def color_fn(x, cmap="tab10"): return colormaps[cmap](x % colormaps[cmap].N) -def log_sample( +def et_log_sample( root_name: str, traj: np.ndarray, char_traj: np.ndarray, diff --git a/utils/rotation_utils.py b/ET/utils/rotation_utils.py similarity index 100% rename from utils/rotation_utils.py rename to ET/utils/rotation_utils.py diff --git a/app.py b/app.py index c90199e169ab6cb1f30f316fe56a7b14a682832e..f13986beecbb0d6a0c1c6c0d038042a0db1e23ab 100644 --- a/app.py +++ b/app.py @@ -1,5 +1,14 @@ import spaces +import sys +import os + +current_dir = os.path.dirname(os.path.abspath(__file__)) +et_dir = os.path.join(current_dir, 'ET') +ccd_dir = os.path.join(current_dir, 'CCD') +sys.path.append(et_dir) 
+sys.path.append(ccd_dir) + from functools import partial from typing import Any, Callable, Dict @@ -11,11 +20,16 @@ import trimesh import rerun as rr import torch -from utils.common_viz import init, get_batch -from utils.random_utils import set_random_seed -from utils.rerun import log_sample -from src.diffuser import Diffuser -from src.datasets.multimodal_dataset import MultimodalDataset +from ET.utils.common_viz import init, get_batch +from ET.utils.random_utils import set_random_seed +from ET.utils.rerun import et_log_sample +from ET.src.diffuser import Diffuser +from ET.src.datasets.multimodal_dataset import MultimodalDataset + +from CCD.utils.rerun import ccd_log_sample +from CCD.src.main import generate_CCD_sample + + # ------------------------------------------------------------------------------------- # @@ -35,7 +49,7 @@ EXAMPLES = [ "While the character moves right, the camera trucks right.", "While the character moves right, the camera performs a push in.", "While the character moves right, the camera performs a pull out.", - "Movement: shortArcShotRight Easing: easeInOutQuad Frames: 30 Camera Angle: birdsEyeView Shot Type: mediumShot Subject Index: 0", + "The camera pans to the character. The camera switches from right front view to right back view. The character is at the middle center of the screen. The camera shoots at close shot.", "Movement: fullZoomIn Easing: easeInOutSine Frames: 30 Camera Angle: highAngle Shot Type: closeUp", "Movement: pedestalDown Easing: easeOutExpo Frames: 30 Camera Angle: mediumAngle Shot Type: longShot", # noqa "Movement: dollyIn Easing: easeOutBounce Frames: 30 Camera Angle: mediumAngle Shot Type: longShot", # noqa @@ -50,7 +64,7 @@ DEFAULT_TEXT = [ HEADER = """