diff --git a/.gitattributes b/.gitattributes index 4552ec6db93419ef7aae141c26af4eec7e00962a..aba0213ba9b0fdae865cf6ec046c4e3100072bcc 100644 --- a/.gitattributes +++ b/.gitattributes @@ -35,3 +35,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text Text2LIVE-main/Text2LIVE-main/data/data/pretrained_nla_models/blackswan/checkpoint filter=lfs diff=lfs merge=lfs -text Text2LIVE-main/Text2LIVE-main/data/data/pretrained_nla_models/car-turn/checkpoint filter=lfs diff=lfs merge=lfs -text Text2LIVE-main/Text2LIVE-main/data/data/pretrained_nla_models/libby/checkpoint filter=lfs diff=lfs merge=lfs -text +Text2LIVE-main/data/data/pretrained_nla_models/blackswan/checkpoint filter=lfs diff=lfs merge=lfs -text +Text2LIVE-main/data/data/pretrained_nla_models/car-turn/checkpoint filter=lfs diff=lfs merge=lfs -text +Text2LIVE-main/data/data/pretrained_nla_models/libby/checkpoint filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..3458ddd086b34ff9346c3653bfd49f124597eaf4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/wandb/ +__pycache__/ +/idea \ No newline at end of file diff --git a/CLIP/CLIP.png b/CLIP/CLIP.png new file mode 100644 index 0000000000000000000000000000000000000000..a1b5ec9171fd7a51e36e845a02304eb837142ba1 Binary files /dev/null and b/CLIP/CLIP.png differ diff --git a/CLIP/LICENSE b/CLIP/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..4e97f0b45803b7c04ae89548934af4f257a97501 --- /dev/null +++ b/CLIP/LICENSE @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) 2021 OpenAI + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is 
+furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/CLIP/MANIFEST.in b/CLIP/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..effd8d995ff1842a48c69d2a0f7c8dce4423d7a2 --- /dev/null +++ b/CLIP/MANIFEST.in @@ -0,0 +1 @@ +include clip/bpe_simple_vocab_16e6.txt.gz diff --git a/CLIP/README.md b/CLIP/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d2d20cd9e1cafcdf8bd8dfd83a0a9c47a884a39 --- /dev/null +++ b/CLIP/README.md @@ -0,0 +1,193 @@ +# CLIP + +[[Blog]](https://openai.com/blog/clip/) [[Paper]](https://arxiv.org/abs/2103.00020) [[Model Card]](model-card.md) [[Colab]](https://colab.research.google.com/github/openai/clip/blob/master/notebooks/Interacting_with_CLIP.ipynb) + +CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. It can be instructed in natural language to predict the most relevant text snippet, given an image, without directly optimizing for the task, similarly to the zero-shot capabilities of GPT-2 and 3. We found CLIP matches the performance of the original ResNet50 on ImageNet “zero-shot” without using any of the original 1.28M labeled examples, overcoming several major challenges in computer vision. 
+ + + +## Approach + +![CLIP](CLIP.png) + + + +## Usage + +First, [install PyTorch 1.7.1](https://pytorch.org/get-started/locally/) and torchvision, as well as small additional dependencies, and then install this repo as a Python package. On a CUDA GPU machine, the following will do the trick: + +```bash +$ conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0 +$ pip install ftfy regex tqdm +$ pip install git+https://github.com/openai/CLIP.git +``` + +Replace `cudatoolkit=11.0` above with the appropriate CUDA version on your machine or `cpuonly` when installing on a machine without a GPU. + +```python +import torch +import clip +from PIL import Image + +device = "cuda" if torch.cuda.is_available() else "cpu" +model, preprocess = clip.load("ViT-B/32", device=device) + +image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device) +text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) + +with torch.no_grad(): + image_features = model.encode_image(image) + text_features = model.encode_text(text) + + logits_per_image, logits_per_text = model(image, text) + probs = logits_per_image.softmax(dim=-1).cpu().numpy() + +print("Label probs:", probs) # prints: [[0.9927937 0.00421068 0.00299572]] +``` + + +## API + +The CLIP module `clip` provides the following methods: + +#### `clip.available_models()` + +Returns the names of the available CLIP models. + +#### `clip.load(name, device=..., jit=False)` + +Returns the model and the TorchVision transform needed by the model, specified by the model name returned by `clip.available_models()`. It will download the model as necessary. The `name` argument can also be a path to a local checkpoint. + +The device to run the model can be optionally specified, and the default is to use the first CUDA device if there is any, otherwise the CPU. When `jit` is `False`, a non-JIT version of the model will be loaded. 
+ +#### `clip.tokenize(text: Union[str, List[str]], context_length=77)` + +Returns a LongTensor containing tokenized sequences of given text input(s). This can be used as the input to the model. + +--- + +The model returned by `clip.load()` supports the following methods: + +#### `model.encode_image(image: Tensor)` + +Given a batch of images, returns the image features encoded by the vision portion of the CLIP model. + +#### `model.encode_text(text: Tensor)` + +Given a batch of text tokens, returns the text features encoded by the language portion of the CLIP model. + +#### `model(image: Tensor, text: Tensor)` + +Given a batch of images and a batch of text tokens, returns two Tensors, containing the logit scores corresponding to each image and text input. The values are cosine similarities between the corresponding image and text features, times 100. + + + +## More Examples + +### Zero-Shot Prediction + +The code below performs zero-shot prediction using CLIP, as shown in Appendix B in the paper. This example takes an image from the [CIFAR-100 dataset](https://www.cs.toronto.edu/~kriz/cifar.html), and predicts the most likely labels among the 100 textual labels from the dataset.
+ +```python +import os +import clip +import torch +from torchvision.datasets import CIFAR100 + +# Load the model +device = "cuda" if torch.cuda.is_available() else "cpu" +model, preprocess = clip.load('ViT-B/32', device) + +# Download the dataset +cifar100 = CIFAR100(root=os.path.expanduser("~/.cache"), download=True, train=False) + +# Prepare the inputs +image, class_id = cifar100[3637] +image_input = preprocess(image).unsqueeze(0).to(device) +text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in cifar100.classes]).to(device) + +# Calculate features +with torch.no_grad(): + image_features = model.encode_image(image_input) + text_features = model.encode_text(text_inputs) + +# Pick the top 5 most similar labels for the image +image_features /= image_features.norm(dim=-1, keepdim=True) +text_features /= text_features.norm(dim=-1, keepdim=True) +similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1) +values, indices = similarity[0].topk(5) + +# Print the result +print("\nTop predictions:\n") +for value, index in zip(values, indices): + print(f"{cifar100.classes[index]:>16s}: {100 * value.item():.2f}%") +``` + +The output will look like the following (the exact numbers may be slightly different depending on the compute device): + +``` +Top predictions: + + snake: 65.31% + turtle: 12.29% + sweet_pepper: 3.83% + lizard: 1.88% + crocodile: 1.75% +``` + +Note that this example uses the `encode_image()` and `encode_text()` methods that return the encoded features of given inputs. + + +### Linear-probe evaluation + +The example below uses [scikit-learn](https://scikit-learn.org/) to perform logistic regression on image features. 
+ +```python +import os +import clip +import torch + +import numpy as np +from sklearn.linear_model import LogisticRegression +from torch.utils.data import DataLoader +from torchvision.datasets import CIFAR100 +from tqdm import tqdm + +# Load the model +device = "cuda" if torch.cuda.is_available() else "cpu" +model, preprocess = clip.load('ViT-B/32', device) + +# Load the dataset +root = os.path.expanduser("~/.cache") +train = CIFAR100(root, download=True, train=True, transform=preprocess) +test = CIFAR100(root, download=True, train=False, transform=preprocess) + + +def get_features(dataset): + all_features = [] + all_labels = [] + + with torch.no_grad(): + for images, labels in tqdm(DataLoader(dataset, batch_size=100)): + features = model.encode_image(images.to(device)) + + all_features.append(features) + all_labels.append(labels) + + return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy() + +# Calculate the image features +train_features, train_labels = get_features(train) +test_features, test_labels = get_features(test) + +# Perform logistic regression +classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1) +classifier.fit(train_features, train_labels) + +# Evaluate using the logistic regression classifier +predictions = classifier.predict(test_features) +accuracy = np.mean((test_labels == predictions).astype(float)) * 100. +print(f"Accuracy = {accuracy:.3f}") +``` + +Note that the `C` value should be determined via a hyperparameter sweep using a validation split.
diff --git a/CLIP/__init__.py b/CLIP/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/CLIP/clip/__init__.py b/CLIP/clip/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dcc5619538c0f7c782508bdbd9587259d805e0d9 --- /dev/null +++ b/CLIP/clip/__init__.py @@ -0,0 +1 @@ +from .clip import * diff --git a/CLIP/clip/bpe_simple_vocab_16e6.txt.gz b/CLIP/clip/bpe_simple_vocab_16e6.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..36a15856e00a06a9fbed8cdd34d2393fea4a3113 --- /dev/null +++ b/CLIP/clip/bpe_simple_vocab_16e6.txt.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a +size 1356917 diff --git a/CLIP/clip/clip.py b/CLIP/clip/clip.py new file mode 100644 index 0000000000000000000000000000000000000000..2c911d0606ad63efd9e3a65582b4fefb695791fc --- /dev/null +++ b/CLIP/clip/clip.py @@ -0,0 +1,231 @@ +import hashlib +import os +import urllib +import warnings +from typing import Any, Union, List +from pkg_resources import packaging + +import torch +from PIL import Image +from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize +from tqdm import tqdm + +from .model import build_model +from .simple_tokenizer import SimpleTokenizer as _Tokenizer + +try: + from torchvision.transforms import InterpolationMode + BICUBIC = InterpolationMode.BICUBIC +except ImportError: + BICUBIC = Image.BICUBIC + + +if packaging.version.parse(torch.__version__) < packaging.version.parse("1.7.1"): + warnings.warn("PyTorch version 1.7.1 or higher is recommended") + + +__all__ = ["available_models", "load", "tokenize"] +_tokenizer = _Tokenizer() + +_MODELS = { + "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt", + "RN101": 
def _sha256_of_file(path: str, chunk_size: int = 1 << 20) -> str:
    """Return the hex SHA256 digest of the file at *path*, read in chunks."""
    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        # iter() with a sentinel stops at the first empty read (end of file)
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _download(url: str, root: str):
    """Download a model file into *root* and return its local path.

    The second-to-last component of *url* is taken as the expected SHA256
    digest of the file. A file that is already present in *root* with a
    matching digest is returned as-is, without any network access.

    Parameters
    ----------
    url : str
        Address of the file, laid out as ``.../<sha256>/<filename>``

    root : str
        Directory to store the file in; created when it does not exist

    Returns
    -------
    download_target : str
        Path of the checksum-verified file inside ``root``

    Raises
    ------
    RuntimeError
        If the target path exists but is not a regular file, or if the
        freshly downloaded file does not match the expected checksum
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so it
    # is imported explicitly where it is needed.
    import urllib.request

    os.makedirs(root, exist_ok=True)
    filename = os.path.basename(url)

    expected_sha256 = url.split("/")[-2]
    download_target = os.path.join(root, filename)

    if os.path.exists(download_target) and not os.path.isfile(download_target):
        raise RuntimeError(f"{download_target} exists and is not a regular file")

    if os.path.isfile(download_target):
        if _sha256_of_file(download_target) == expected_sha256:
            return download_target
        warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")

    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
        # The header may be missing; tqdm accepts total=None and then shows
        # a progress bar without a known end.
        content_length = source.info().get("Content-Length")
        total = int(content_length) if content_length is not None else None
        with tqdm(total=total, ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break

                output.write(buffer)
                loop.update(len(buffer))

    if _sha256_of_file(download_target) != expected_sha256:
        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match")

    return download_target
def tokenize(texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False) -> torch.LongTensor:
    """
    Turn one string or a list of strings into a zero-padded tensor of token ids

    Every text is encoded with the module-level tokenizer and wrapped in its
    start-of-text and end-of-text ids before being written into its row.

    Parameters
    ----------
    texts : Union[str, List[str]]
        A single input string or a list of input strings

    context_length : int
        Number of columns of the result; all CLIP models use 77

    truncate: bool
        When True, an over-long encoding is cut to ``context_length`` and its
        last id is replaced by the end-of-text id; when False it is an error

    Returns
    -------
    A two-dimensional long tensor, shape = [number of input strings, context_length]

    Raises
    ------
    RuntimeError
        If an encoding is longer than ``context_length`` and ``truncate`` is False
    """
    if isinstance(texts, str):
        texts = [texts]

    start_id = _tokenizer.encoder["<|startoftext|>"]
    eot_token = _tokenizer.encoder["<|endoftext|>"]

    # all inputs are encoded up front, before any length check happens
    encoded = []
    for text in texts:
        encoded.append([start_id] + _tokenizer.encode(text) + [eot_token])

    result = torch.zeros(len(encoded), context_length, dtype=torch.long)

    for i, ids in enumerate(encoded):
        overflow = len(ids) > context_length
        if overflow and not truncate:
            raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
        if overflow:
            ids = ids[:context_length]
            ids[-1] = eot_token
        result[i, :len(ids)] = torch.tensor(ids)

    return result
class AttentionPool2d(nn.Module):
    """Pool an NCHW feature map into one vector per image with QKV attention.

    The mean over all spatial positions is prepended as an extra token, a
    learned positional embedding is added, and multi-head attention is run
    with the spatial tokens (plus the mean token) as keys and values. The
    attention output for the mean token is the pooled result.
    """

    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
        super().__init__()
        # one embedding per spatial position, plus one for the prepended mean token
        self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        # the output projection falls back to embed_dim when output_dim is not given
        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
        self.num_heads = num_heads

    def forward(self, x):
        """Return the pooled features, shape [N, output_dim], for an NCHW input."""
        x = x.flatten(start_dim=2).permute(2, 0, 1)  # NCHW -> (HW)NC
        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (HW+1)NC
        x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (HW+1)NC
        # Only the mean token's output is returned, so it is the only query;
        # attending from every position would compute rows that are discarded.
        x, _ = F.multi_head_attention_forward(
            query=x[:1], key=x, value=x,
            embed_dim_to_check=x.shape[-1],
            num_heads=self.num_heads,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
            in_proj_weight=None,
            in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=0,
            out_proj_weight=self.c_proj.weight,
            out_proj_bias=self.c_proj.bias,
            use_separate_proj_weight=True,
            training=self.training,
            need_weights=False
        )

        return x.squeeze(0)
class LayerNorm(nn.LayerNorm):
    """LayerNorm that computes in float32 and hands back the caller's dtype (fp16-safe)."""

    def forward(self, x: torch.Tensor):
        incoming_dtype = x.dtype
        normalized = super().forward(x.type(torch.float32))
        return normalized.type(incoming_dtype)


class QuickGELU(nn.Module):
    """Sigmoid-gated activation: the input multiplied by ``sigmoid(1.702 * input)``."""

    def forward(self, x: torch.Tensor):
        gate = torch.sigmoid(1.702 * x)
        return x * gate


class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: self-attention, then an MLP, each with a residual add."""

    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
        super().__init__()

        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)
        # the MLP widens to 4 * d_model and projects back
        hidden = d_model * 4
        mlp_layers = OrderedDict()
        mlp_layers["c_fc"] = nn.Linear(d_model, hidden)
        mlp_layers["gelu"] = QuickGELU()
        mlp_layers["c_proj"] = nn.Linear(hidden, d_model)
        self.mlp = nn.Sequential(mlp_layers)
        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x: torch.Tensor):
        """Self-attend over ``x``, using the stored mask when there is one."""
        if self.attn_mask is not None:
            # the mask is moved to the input's dtype/device and kept that way
            self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device)
        attended, _ = self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)
        return attended

    def forward(self, x: torch.Tensor):
        after_attention = x + self.attention(self.ln_1(x))
        return after_attention + self.mlp(self.ln_2(after_attention))
class VisionTransformer(nn.Module):
    """Vision Transformer image encoder that tolerates non-native input sizes.

    A strided convolution cuts the image into patches, a class token is
    prepended, positional embeddings are added (bicubically resized when the
    patch grid differs from the one the embedding was created for) and the
    sequence is run through the transformer. The class token's final state
    is layer-normalized and projected with ``proj``.
    """

    def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
        super().__init__()
        self.input_resolution = input_resolution
        self.output_dim = output_dim
        # kernel size == stride, so the convolution sees non-overlapping patches
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)

        scale = width ** -0.5
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        # one embedding per patch of the native grid, plus one for the class token
        self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
        self.ln_pre = LayerNorm(width)

        self.transformer = Transformer(width, layers, heads)

        self.ln_post = LayerNorm(width)
        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

    # https://github.com/facebookresearch/dino
    def interpolate_pos_encoding(self, x, w, h):
        """Return positional embeddings resized to the patch grid of a ``w`` x ``h`` image.

        Parameters
        ----------
        x : torch.Tensor
            Token sequence of shape [batch, 1 + n_patches, width]
        w : int
            Width of the input image in pixels
        h : int
            Height of the input image in pixels

        Returns
        -------
        torch.Tensor
            Embeddings of shape [1, 1 + n_patches, width]; the stored
            embedding is returned unchanged when the image is square and the
            patch count already matches
        """
        positional_embedding = self.positional_embedding.unsqueeze(0)
        patch_size = self.conv1.kernel_size[0]

        npatch = x.shape[1] - 1
        N = positional_embedding.shape[1] - 1
        if npatch == N and w == h:
            return positional_embedding
        class_pos_embed = positional_embedding[:, 0]
        patch_pos_embed = positional_embedding[:, 1:]
        dim = x.shape[-1]
        side = int(math.sqrt(N))

        w0 = w // patch_size
        h0 = h // patch_size

        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        w0, h0 = w0 + 0.1, h0 + 0.1
        # Patch tokens are flattened row-major from a [grid_h, grid_w] map, so
        # after the permute the first spatial axis is rows (height) and the
        # second is columns (width); the scale factors must follow that order.
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, side, side, dim).permute(0, 3, 1, 2),
            scale_factor=(h0 / math.sqrt(N), w0 / math.sqrt(N)),
            mode='bicubic',
        )
        assert int(h0) == patch_pos_embed.shape[-2] and int(w0) == patch_pos_embed.shape[-1]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def forward(self, x: torch.Tensor):
        """Encode a batch of images into projected class-token features."""
        x = self.transformer_first_blocks_forward(x)
        x = self.transformer.resblocks[-1](x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_post(x[:, 0, :])  # keep only the class token

        if self.proj is not None:
            x = x @ self.proj

        return x

    def transformer_first_blocks_forward(self, x):
        """Run patch embedding and every transformer block except the last.

        Returns the token sequence in LND layout (sequence, batch, width),
        class token first.
        """
        h, w = x.shape[-2:]
        x = self.conv1(x)  # shape = [*, width, grid, grid]
        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
        # adding the class embedding to zeros broadcasts it to one token per batch item
        x = torch.cat(
            [self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
             x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        # resized embeddings are used instead of the raw ones so that inputs
        # of other resolutions are accepted
        positional_embedding = self.interpolate_pos_encoding(x, w, h)
        x = x + positional_embedding.to(x.dtype)
        x = self.ln_pre(x)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer.resblocks[:-1](x)
        return x

    @staticmethod
    def attn_cosine_sim(x, eps=1e-08):
        """Return the pairwise cosine similarity of the vectors along dim 1.

        ``x`` is indexed as [batch, tokens, dim]; the product of norms is
        clamped to at least ``eps`` so zero vectors do not divide by zero.
        """
        norm = x.norm(dim=2, keepdim=True)
        factor = torch.clamp(norm @ norm.permute(0, 2, 1), min=eps)  # shape [batch, t, t]
        sim_matrix = (x @ x.permute(0, 2, 1)) / factor  # shape [batch, t, t]
        return sim_matrix
output_dim=embed_dim, + heads=vision_heads, + input_resolution=image_resolution, + width=vision_width + ) + else: + vision_heads = vision_width // 64 + self.visual = VisionTransformer( + input_resolution=image_resolution, + patch_size=vision_patch_size, + width=vision_width, + layers=vision_layers, + heads=vision_heads, + output_dim=embed_dim + ) + + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask() + ) + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width)) + self.ln_final = LayerNorm(transformer_width) + + self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim)) + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + + self.initialize_parameters() + + def initialize_parameters(self): + nn.init.normal_(self.token_embedding.weight, std=0.02) + nn.init.normal_(self.positional_embedding, std=0.01) + + if isinstance(self.visual, ModifiedResNet): + if self.visual.attnpool is not None: + std = self.visual.attnpool.c_proj.in_features ** -0.5 + nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std) + + for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]: + for name, param in resnet_block.named_parameters(): + if name.endswith("bn3.weight"): + nn.init.zeros_(param) + + proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5) + attn_std = self.transformer.width ** -0.5 + fc_std = (2 * self.transformer.width) ** -0.5 + for block in self.transformer.resblocks: + nn.init.normal_(block.attn.in_proj_weight, std=attn_std) + 
nn.init.normal_(block.attn.out_proj.weight, std=proj_std) + nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) + nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) + + if self.text_projection is not None: + nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5) + + def build_attention_mask(self): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float("-inf")) + mask.triu_(1) # zero out the lower diagonal + return mask + + @property + def dtype(self): + return self.visual.conv1.weight.dtype + + def calculate_self_sim(self, x: torch.Tensor): + tokens = self.visual.transformer_first_blocks_forward( + x.type(self.dtype)) # shape = [batch, tokens, emb_dim] tokens include class token + tokens = tokens.permute(1, 0, 2) + ssim = self.visual.attn_cosine_sim(tokens) # shape = [batch, tokens, tokens] + return ssim + + def encode_image(self, image): + return self.visual(image.type(self.dtype)) + + def encode_text(self, text): + x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model] + x = x + self.positional_embedding.type(self.dtype) + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_final(x).type(self.dtype) + + # x.shape = [batch_size, n_ctx, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection + + return x + + def forward(self, image, text): + image_features = self.encode_image(image) + text_features = self.encode_text(text) + + # normalized features + image_features = image_features / image_features.norm(dim=-1, keepdim=True) + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = 
self.logit_scale.exp() + logits_per_image = logit_scale * image_features @ text_features.t() + logits_per_text = logits_per_image.t() + + # shape = [global_batch_size, global_batch_size] + return logits_per_image, logits_per_text + + +def convert_weights(model: nn.Module): + """Convert applicable model parameters to fp16""" + + def _convert_weights_to_fp16(l): + if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)): + l.weight.data = l.weight.data.half() + if l.bias is not None: + l.bias.data = l.bias.data.half() + + if isinstance(l, nn.MultiheadAttention): + for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]: + tensor = getattr(l, attr) + if tensor is not None: + tensor.data = tensor.data.half() + + for name in ["text_projection", "proj"]: + if hasattr(l, name): + attr = getattr(l, name) + if attr is not None: + attr.data = attr.data.half() + + model.apply(_convert_weights_to_fp16) + + +def build_model(state_dict: dict): + vit = "visual.proj" in state_dict + + if vit: + vision_width = state_dict["visual.conv1.weight"].shape[0] + vision_layers = len( + [k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")]) + vision_patch_size = state_dict["visual.conv1.weight"].shape[-1] + grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5) + image_resolution = vision_patch_size * grid_size + else: + counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in + [1, 2, 3, 4]] + vision_layers = tuple(counts) + vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0] + output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5) + vision_patch_size = None + assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0] + image_resolution = output_width * 32 + + embed_dim = state_dict["text_projection"].shape[1] + context_length = 
state_dict["positional_embedding"].shape[0] + vocab_size = state_dict["token_embedding.weight"].shape[0] + transformer_width = state_dict["ln_final.weight"].shape[0] + transformer_heads = transformer_width // 64 + transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks"))) + + model = CLIP( + embed_dim, + image_resolution, vision_layers, vision_width, vision_patch_size, + context_length, vocab_size, transformer_width, transformer_heads, transformer_layers + ) + + for key in ["input_resolution", "context_length", "vocab_size"]: + if key in state_dict: + del state_dict[key] + + convert_weights(model) + model.load_state_dict(state_dict) + return model.eval() diff --git a/CLIP/clip/simple_tokenizer.py b/CLIP/clip/simple_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..0a66286b7d5019c6e221932a813768038f839c91 --- /dev/null +++ b/CLIP/clip/simple_tokenizer.py @@ -0,0 +1,132 @@ +import gzip +import html +import os +from functools import lru_cache + +import ftfy +import regex as re + + +@lru_cache() +def default_bpe(): + return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. 
+ """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r'\s+', ' ', text) + text = text.strip() + return text + + +class SimpleTokenizer(object): + def __init__(self, bpe_path: str = default_bpe()): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') + merges = merges[1:49152-256-2+1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v+'</w>' for v in vocab] # '</w>' variants mark a symbol that ends a word + for merge in merges: + vocab.append(''.join(merge)) + vocab.extend(['<|startoftext|>', '<|endoftext|>']) + self.encoder = dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} + self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + ( token[-1] + '</w>',) # tag the last symbol as end-of-word + pairs = get_pairs(word) + + if not pairs: + return token+'</w>' + + while True: + bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, 
float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except ValueError: # `first` does not occur again; copy the remainder + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word)-1 and word[i+1] == second: + new_word.append(first+second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ') # end-of-word markers become spaces + return text diff --git a/CLIP/clip_explainability/__init__.py b/CLIP/clip_explainability/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dcc5619538c0f7c782508bdbd9587259d805e0d9 --- /dev/null +++ b/CLIP/clip_explainability/__init__.py @@ -0,0 +1 @@ +from .clip import * diff --git a/CLIP/clip_explainability/auxilary.py b/CLIP/clip_explainability/auxilary.py new file mode 100644 index 0000000000000000000000000000000000000000..843b3e88c6d88146497e352e9fa924c89d9cc96e --- /dev/null +++ b/CLIP/clip_explainability/auxilary.py @@ -0,0 +1,422 @@ +import torch +import warnings +from typing import Tuple, Optional + +import torch +from torch import Tensor +from torch.nn.init import xavier_uniform_ +from torch.nn.init import constant_ +from torch.nn.init import xavier_normal_ +from torch.nn.parameter import Parameter +from torch.nn import functional as F + +# We 
define this function as _pad because it takes an argument +# named pad, which clobbers the recursive reference to the pad +# function needed for __torch_function__ support +pad = F._pad + +# This class exists solely for Transformer; it has an annotation stating +# that bias is never None, which appeases TorchScript +class _LinearWithBias(torch.nn.Linear): + bias: Tensor + + def __init__(self, in_features: int, out_features: int) -> None: + super().__init__(in_features, out_features, bias=True) + +def multi_head_attention_forward(query: Tensor, + key: Tensor, + value: Tensor, + embed_dim_to_check: int, + num_heads: int, + in_proj_weight: Tensor, + in_proj_bias: Tensor, + bias_k: Optional[Tensor], + bias_v: Optional[Tensor], + add_zero_attn: bool, + dropout_p: float, + out_proj_weight: Tensor, + out_proj_bias: Tensor, + training: bool = True, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + use_separate_proj_weight: bool = False, + q_proj_weight: Optional[Tensor] = None, + k_proj_weight: Optional[Tensor] = None, + v_proj_weight: Optional[Tensor] = None, + static_k: Optional[Tensor] = None, + static_v: Optional[Tensor] = None, + attention_probs_forward_hook = None, + attention_probs_backwards_hook = None, + ) -> Tuple[Tensor, Optional[Tensor]]: + if not torch.jit.is_scripting(): + tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v, + out_proj_weight, out_proj_bias) + if any([type(t) is not Tensor for t in tens_ops]) and F.has_torch_function(tens_ops): + return F.handle_torch_function( + multi_head_attention_forward, tens_ops, query, key, value, + embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, + bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight, + out_proj_bias, training=training, key_padding_mask=key_padding_mask, + need_weights=need_weights, attn_mask=attn_mask, + use_separate_proj_weight=use_separate_proj_weight, + q_proj_weight=q_proj_weight, 
k_proj_weight=k_proj_weight, + v_proj_weight=v_proj_weight, static_k=static_k, static_v=static_v) + tgt_len, bsz, embed_dim = query.size() + assert embed_dim == embed_dim_to_check + # allow MHA to have different sizes for the feature dimension + assert key.size(0) == value.size(0) and key.size(1) == value.size(1) + + head_dim = embed_dim // num_heads + assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" + scaling = float(head_dim) ** -0.5 + + if not use_separate_proj_weight: + if torch.equal(query, key) and torch.equal(key, value): + # self-attention + q, k, v = F.linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1) + + elif torch.equal(key, value): + # encoder-decoder attention + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = 0 + _end = embed_dim + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + q = F.linear(query, _w, _b) + + if key is None: + assert value is None + k = None + v = None + else: + + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim + _end = None + _w = in_proj_weight[_start:, :] + if _b is not None: + _b = _b[_start:] + k, v = F.linear(key, _w, _b).chunk(2, dim=-1) + + else: + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = 0 + _end = embed_dim + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + q = F.linear(query, _w, _b) + + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim + _end = embed_dim * 2 + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + k = F.linear(key, _w, _b) + + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim * 2 + _end = None + _w = in_proj_weight[_start:, :] + if _b is not None: + _b = _b[_start:] + v 
= F.linear(value, _w, _b) + else: + q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight) + len1, len2 = q_proj_weight_non_opt.size() + assert len1 == embed_dim and len2 == query.size(-1) + + k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight) + len1, len2 = k_proj_weight_non_opt.size() + assert len1 == embed_dim and len2 == key.size(-1) + + v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight) + len1, len2 = v_proj_weight_non_opt.size() + assert len1 == embed_dim and len2 == value.size(-1) + + if in_proj_bias is not None: + q = F.linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim]) + k = F.linear(key, k_proj_weight_non_opt, in_proj_bias[embed_dim:(embed_dim * 2)]) + v = F.linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2):]) + else: + q = F.linear(query, q_proj_weight_non_opt, in_proj_bias) + k = F.linear(key, k_proj_weight_non_opt, in_proj_bias) + v = F.linear(value, v_proj_weight_non_opt, in_proj_bias) + q = q * scaling + + if attn_mask is not None: + assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \ + attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \ + 'Only float, byte, and bool types are supported for attn_mask, not {}'.format(attn_mask.dtype) + if attn_mask.dtype == torch.uint8: + warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. 
Use bool tensor instead.") + attn_mask = attn_mask.to(torch.bool) + + if attn_mask.dim() == 2: + attn_mask = attn_mask.unsqueeze(0) + if list(attn_mask.size()) != [1, query.size(0), key.size(0)]: + raise RuntimeError('The size of the 2D attn_mask is not correct.') + elif attn_mask.dim() == 3: + if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]: + raise RuntimeError('The size of the 3D attn_mask is not correct.') + else: + raise RuntimeError("attn_mask's dimension {} is not supported".format(attn_mask.dim())) + # attn_mask's dim is 3 now. + + # convert ByteTensor key_padding_mask to bool + if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8: + warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.") + key_padding_mask = key_padding_mask.to(torch.bool) + + if bias_k is not None and bias_v is not None: + if static_k is None and static_v is None: + k = torch.cat([k, bias_k.repeat(1, bsz, 1)]) + v = torch.cat([v, bias_v.repeat(1, bsz, 1)]) + if attn_mask is not None: + attn_mask = pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = pad(key_padding_mask, (0, 1)) + else: + assert static_k is None, "bias cannot be added to static key." + assert static_v is None, "bias cannot be added to static value." 
+ else: + assert bias_k is None + assert bias_v is None + + q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) + if k is not None: + k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) + if v is not None: + v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) + + if static_k is not None: + assert static_k.size(0) == bsz * num_heads + assert static_k.size(2) == head_dim + k = static_k + + if static_v is not None: + assert static_v.size(0) == bsz * num_heads + assert static_v.size(2) == head_dim + v = static_v + + src_len = k.size(1) + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz + assert key_padding_mask.size(1) == src_len + + if add_zero_attn: + src_len += 1 + k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1) + v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1) + if attn_mask is not None: + attn_mask = pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = pad(key_padding_mask, (0, 1)) + + attn_output_weights = torch.bmm(q, k.transpose(1, 2)) + assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len] + + if attn_mask is not None: + if attn_mask.dtype == torch.bool: + attn_output_weights.masked_fill_(attn_mask, float('-inf')) + else: + attn_output_weights += attn_mask + + + if key_padding_mask is not None: + attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len) + attn_output_weights = attn_output_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2), + float('-inf'), + ) + attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len) + + attn_output_weights = F.softmax( + attn_output_weights, dim=-1) + attn_output_weights = F.dropout(attn_output_weights, p=dropout_p, training=training) + + # use hooks for the attention weights if necessary + if 
attention_probs_forward_hook is not None and attention_probs_backwards_hook is not None: + attention_probs_forward_hook(attn_output_weights) + attn_output_weights.register_hook(attention_probs_backwards_hook) + + attn_output = torch.bmm(attn_output_weights, v) + assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim] + attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) + attn_output = F.linear(attn_output, out_proj_weight, out_proj_bias) + + if need_weights: + # average attention weights over heads + attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len) + return attn_output, attn_output_weights.sum(dim=1) / num_heads + else: + return attn_output, None + + +class MultiheadAttention(torch.nn.Module): + r"""Allows the model to jointly attend to information + from different representation subspaces. + See reference: Attention Is All You Need + + .. math:: + \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O + \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) + + Args: + embed_dim: total dimension of the model. + num_heads: parallel attention heads. + dropout: a Dropout layer on attn_output_weights. Default: 0.0. + bias: add bias as module parameter. Default: True. + add_bias_kv: add bias to the key and value sequences at dim=0. + add_zero_attn: add a new batch of zeros to the key and + value sequences at dim=1. + kdim: total number of features in key. Default: None. + vdim: total number of features in value. Default: None. + + Note: if kdim and vdim are None, they will be set to embed_dim such that + query, key, and value have the same number of features. 
+ + Examples:: + + >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) + >>> attn_output, attn_output_weights = multihead_attn(query, key, value) + """ + bias_k: Optional[torch.Tensor] + bias_v: Optional[torch.Tensor] + + def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None): + super(MultiheadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + if self._qkv_same_embed_dim is False: + self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim)) + self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim)) + self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim)) + self.register_parameter('in_proj_weight', None) + else: + self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim)) + self.register_parameter('q_proj_weight', None) + self.register_parameter('k_proj_weight', None) + self.register_parameter('v_proj_weight', None) + + if bias: + self.in_proj_bias = Parameter(torch.empty(3 * embed_dim)) + else: + self.register_parameter('in_proj_bias', None) + self.out_proj = _LinearWithBias(embed_dim, embed_dim) + + if add_bias_kv: + self.bias_k = Parameter(torch.empty(1, 1, embed_dim)) + self.bias_v = Parameter(torch.empty(1, 1, embed_dim)) + else: + self.bias_k = self.bias_v = None + + self.add_zero_attn = add_zero_attn + + self._reset_parameters() + + def _reset_parameters(self): + if self._qkv_same_embed_dim: + xavier_uniform_(self.in_proj_weight) + else: + xavier_uniform_(self.q_proj_weight) + xavier_uniform_(self.k_proj_weight) + xavier_uniform_(self.v_proj_weight) + 
+ if self.in_proj_bias is not None: + constant_(self.in_proj_bias, 0.) + constant_(self.out_proj.bias, 0.) + if self.bias_k is not None: + xavier_normal_(self.bias_k) + if self.bias_v is not None: + xavier_normal_(self.bias_v) + + def __setstate__(self, state): + # Support loading old MultiheadAttention checkpoints generated by v1.1.0 + if '_qkv_same_embed_dim' not in state: + state['_qkv_same_embed_dim'] = True + + super(MultiheadAttention, self).__setstate__(state) + + def forward(self, query, key, value, key_padding_mask=None, + need_weights=True, attn_mask=None, attention_probs_forward_hook=None, attention_probs_backwards_hook=None): + r""" + Args: + query, key, value: map a query and a set of key-value pairs to an output. + See "Attention Is All You Need" for more details. + key_padding_mask: if provided, specified padding elements in the key will + be ignored by the attention. When given a binary mask and a value is True, + the corresponding value on the attention layer will be ignored. When given + a byte mask and a value is non-zero, the corresponding value on the attention + layer will be ignored + need_weights: output attn_output_weights. + attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all + the batches while a 3D mask allows to specify a different mask for the entries of each batch. + + Shape: + - Inputs: + - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is + the embedding dimension. + - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is + the embedding dimension. + - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is + the embedding dimension. + - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length. + If a ByteTensor is provided, the non-zero positions will be ignored while the position + with the zero positions will be unchanged. 
If a BoolTensor is provided, the positions with the + value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. + - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. + 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length, + S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked + positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend + while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True`` + is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor + is provided, it will be added to the attention weight. + + - Outputs: + - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, + E is the embedding dimension. + - attn_output_weights: :math:`(N, L, S)` where N is the batch size, + L is the target sequence length, S is the source sequence length. 
+ """ + if not self._qkv_same_embed_dim: + return multi_head_attention_forward( + query, key, value, self.embed_dim, self.num_heads, + self.in_proj_weight, self.in_proj_bias, + self.bias_k, self.bias_v, self.add_zero_attn, + self.dropout, self.out_proj.weight, self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, need_weights=need_weights, + attn_mask=attn_mask, use_separate_proj_weight=True, + q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight, + v_proj_weight=self.v_proj_weight, + attention_probs_forward_hook=attention_probs_forward_hook, + attention_probs_backwards_hook=attention_probs_backwards_hook) + else: + return multi_head_attention_forward( + query, key, value, self.embed_dim, self.num_heads, + self.in_proj_weight, self.in_proj_bias, + self.bias_k, self.bias_v, self.add_zero_attn, + self.dropout, self.out_proj.weight, self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, need_weights=need_weights, + attn_mask=attn_mask, + attention_probs_forward_hook=attention_probs_forward_hook, + attention_probs_backwards_hook=attention_probs_backwards_hook) diff --git a/CLIP/clip_explainability/bpe_simple_vocab_16e6.txt.gz b/CLIP/clip_explainability/bpe_simple_vocab_16e6.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..36a15856e00a06a9fbed8cdd34d2393fea4a3113 --- /dev/null +++ b/CLIP/clip_explainability/bpe_simple_vocab_16e6.txt.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a +size 1356917 diff --git a/CLIP/clip_explainability/clip.py b/CLIP/clip_explainability/clip.py new file mode 100644 index 0000000000000000000000000000000000000000..de901e3f27462e77254b171bac8349b94e3c081f --- /dev/null +++ b/CLIP/clip_explainability/clip.py @@ -0,0 +1,196 @@ +import hashlib +import os +import urllib +import warnings +from typing import Union, List + +import torch +from PIL import Image +from 
torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize +from tqdm import tqdm + +from .model import build_model +from .simple_tokenizer import SimpleTokenizer as _Tokenizer + +__all__ = ["available_models", "load", "tokenize"] +_tokenizer = _Tokenizer() + +_MODELS = { + "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt", + "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt", + "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt", + "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt", + "RN50x64": "https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt", + "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt", + "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt", + "ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt", +} + +def _download(url: str, root: str = os.path.expanduser("~/.cache/clip")): + os.makedirs(root, exist_ok=True) + filename = os.path.basename(url) + + expected_sha256 = url.split("/")[-2] + download_target = os.path.join(root, filename) + + if os.path.exists(download_target) and not os.path.isfile(download_target): + raise RuntimeError(f"{download_target} exists and is not a regular file") + + if os.path.isfile(download_target): + if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256: + return download_target + else: + warnings.warn(f"{download_target} exists, 
but the SHA256 checksum does not match; re-downloading the file") + + with urllib.request.urlopen(url) as source, open(download_target, "wb") as output: + with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop: + while True: + buffer = source.read(8192) + if not buffer: + break + + output.write(buffer) + loop.update(len(buffer)) + + if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256: + raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not not match") + + return download_target + + +def _transform(n_px): + return Compose([ + Resize(n_px, interpolation=Image.BICUBIC), + CenterCrop(n_px), + lambda image: image.convert("RGB"), + ToTensor(), + Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), + ]) + + +def available_models() -> List[str]: + """Returns the names of available CLIP models""" + return list(_MODELS.keys()) + + +def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit=True): + """Load a CLIP model + + Parameters + ---------- + name : str + A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict + + device : Union[str, torch.device] + The device to put the loaded model + + jit : bool + Whether to load the optimized JIT model (default) or more hackable non-JIT model. 
+ + Returns + ------- + model : torch.nn.Module + The CLIP model + + preprocess : Callable[[PIL.Image], torch.Tensor] + A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input + """ + if name in _MODELS: + model_path = _download(_MODELS[name]) + elif os.path.isfile(name): + model_path = name + else: + raise RuntimeError(f"Model {name} not found; available models = {available_models()}") + + try: + # loading JIT archive + model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval() + state_dict = None + except RuntimeError: + # loading saved state dict + if jit: + warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead") + jit = False + state_dict = torch.load(model_path, map_location="cpu") + + if not jit: + model = build_model(state_dict or model.state_dict()).to(device) + if str(device) == "cpu": + model.float() + return model, _transform(model.visual.input_resolution) + + # patch the device names + device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) + device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1] + + def patch_device(module): + graphs = [module.graph] if hasattr(module, "graph") else [] + if hasattr(module, "forward1"): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes("prim::Constant"): + if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"): + node.copyAttributes(device_node) + + model.apply(patch_device) + patch_device(model.encode_image) + patch_device(model.encode_text) + + # patch dtype to float32 on CPU + if str(device) == "cpu": + float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[]) + float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] + float_node = float_input.node() + + def patch_float(module): + graphs = [module.graph] if 
hasattr(module, "graph") else [] + if hasattr(module, "forward1"): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes("aten::to"): + inputs = list(node.inputs()) + for i in [1, 2]: # dtype can be the second or third argument to aten::to() + if inputs[i].node()["value"] == 5: + inputs[i].node().copyAttributes(float_node) + + model.apply(patch_float) + patch_float(model.encode_image) + patch_float(model.encode_text) + + model.float() + + return model, _transform(model.input_resolution.item()) + + +def tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor: + """ + Returns the tokenized representation of given input string(s) + + Parameters + ---------- + texts : Union[str, List[str]] + An input string or a list of input strings to tokenize + + context_length : int + The context length to use; all CLIP models use 77 as the context length + + Returns + ------- + A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] + """ + if isinstance(texts, str): + texts = [texts] + + sot_token = _tokenizer.encoder["<|startoftext|>"] + eot_token = _tokenizer.encoder["<|endoftext|>"] + all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + + for i, tokens in enumerate(all_tokens): + if len(tokens) > context_length: + raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}") + result[i, :len(tokens)] = torch.tensor(tokens) + + return result diff --git a/CLIP/clip_explainability/model.py b/CLIP/clip_explainability/model.py new file mode 100644 index 0000000000000000000000000000000000000000..7e2c21b2083f12522faa175b95d37f1d99f28bb6 --- /dev/null +++ b/CLIP/clip_explainability/model.py @@ -0,0 +1,442 @@ +from collections import OrderedDict +from typing import Tuple, Union + +import numpy as np +import torch +import 
torch.nn.functional as F +from torch import nn +from .auxilary import * + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1): + super().__init__() + + # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() + + self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + + self.relu = nn.ReLU(inplace=True) + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 + self.downsample = nn.Sequential(OrderedDict([ + ("-1", nn.AvgPool2d(stride)), + ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)), + ("1", nn.BatchNorm2d(planes * self.expansion)) + ])) + + def forward(self, x: torch.Tensor): + identity = x + + out = self.relu(self.bn1(self.conv1(x))) + out = self.relu(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + return out + + +class AttentionPool2d(nn.Module): + def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None): + super().__init__() + self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5) + self.k_proj = nn.Linear(embed_dim, embed_dim) + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.v_proj = nn.Linear(embed_dim, embed_dim) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) + self.num_heads = num_heads + + def 
forward(self, x): + x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC + x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC + x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC + x, _ = multi_head_attention_forward( + query=x, key=x, value=x, + embed_dim_to_check=x.shape[-1], + num_heads=self.num_heads, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + in_proj_weight=None, + in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), + bias_k=None, + bias_v=None, + add_zero_attn=False, + dropout_p=0, + out_proj_weight=self.c_proj.weight, + out_proj_bias=self.c_proj.bias, + use_separate_proj_weight=True, + training=self.training, + need_weights=False + ) + + return x[0] + + +class ModifiedResNet(nn.Module): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, layers, output_dim, heads, input_resolution=224, width=64): + super().__init__() + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(width // 2) + self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(width // 2) + self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.avgpool = nn.AvgPool2d(2) + self.relu = nn.ReLU(inplace=True) + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim) + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + def stem(x): + for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]: + x = self.relu(bn(conv(x))) + x = self.avgpool(x) + return x + + x = x.type(self.conv1.weight.dtype) + x = stem(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.attnpool(x) + + return x + + +class 
LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential(OrderedDict([ + ("c_fc", nn.Linear(d_model, d_model * 4)), + ("gelu", QuickGELU()), + ("c_proj", nn.Linear(d_model * 4, d_model)) + ])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + self.attn_probs = None + self.attn_grad = None + + def set_attn_probs(self, attn_probs): + self.attn_probs = attn_probs + + def set_attn_grad(self, attn_grad): + self.attn_grad = attn_grad + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None + return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask, attention_probs_forward_hook=self.set_attn_probs, + attention_probs_backwards_hook=self.set_attn_grad)[0] + + def forward(self, x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]) + + def forward(self, x: torch.Tensor): + return self.resblocks(x) + + +class VisualTransformer(nn.Module): + def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int): + super().__init__() + self.input_resolution = 
input_resolution + self.output_dim = output_dim + self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False) + + scale = width ** -0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width)) + self.ln_pre = LayerNorm(width) + + self.transformer = Transformer(width, layers, heads) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x + + +class CLIP(nn.Module): + def __init__(self, + embed_dim: int, + # vision + image_resolution: int, + vision_layers: Union[Tuple[int, int, int, int], int], + vision_width: int, + vision_patch_size: int, + # text + context_length: int, + vocab_size: int, + transformer_width: int, + transformer_heads: int, + transformer_layers: int + ): + super().__init__() + + self.context_length = context_length + + if isinstance(vision_layers, (tuple, list)): + vision_heads = vision_width * 32 // 64 + self.visual = ModifiedResNet( + layers=vision_layers, + output_dim=embed_dim, + heads=vision_heads, + input_resolution=image_resolution, + width=vision_width + ) + else: + vision_heads = vision_width // 64 + self.visual = VisualTransformer( + 
input_resolution=image_resolution, + patch_size=vision_patch_size, + width=vision_width, + layers=vision_layers, + heads=vision_heads, + output_dim=embed_dim + ) + + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask() + ) + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width)) + self.ln_final = LayerNorm(transformer_width) + + self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim)) + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + + self.initialize_parameters() + + def initialize_parameters(self): + nn.init.normal_(self.token_embedding.weight, std=0.02) + nn.init.normal_(self.positional_embedding, std=0.01) + + if isinstance(self.visual, ModifiedResNet): + if self.visual.attnpool is not None: + std = self.visual.attnpool.c_proj.in_features ** -0.5 + nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std) + + for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]: + for name, param in resnet_block.named_parameters(): + if name.endswith("bn3.weight"): + nn.init.zeros_(param) + + proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5) + attn_std = self.transformer.width ** -0.5 + fc_std = (2 * self.transformer.width) ** -0.5 + for block in self.transformer.resblocks: + nn.init.normal_(block.attn.in_proj_weight, std=attn_std) + nn.init.normal_(block.attn.out_proj.weight, std=proj_std) + nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) + nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) + + if self.text_projection 
is not None: + nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5) + + def build_attention_mask(self): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float("-inf")) + mask.triu_(1) # zero out the lower diagonal + return mask + + @property + def dtype(self): + return self.visual.conv1.weight.dtype + + def encode_image(self, image): + return self.visual(image.type(self.dtype)) + + def encode_text(self, text): + x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model] + + x = x + self.positional_embedding.type(self.dtype) + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_final(x).type(self.dtype) + + # x.shape = [batch_size, n_ctx, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection + + return x + + def forward(self, image, text): + image_features = self.encode_image(image) + text_features = self.encode_text(text) + + # normalized features + image_features = image_features / image_features.norm(dim=-1, keepdim=True) + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_image = logit_scale * image_features @ text_features.t() + logits_per_text = logit_scale * text_features @ image_features.t() + + # shape = [global_batch_size, global_batch_size] + return logits_per_image, logits_per_text + + +def convert_weights(model: nn.Module): + """Convert applicable model parameters to fp16""" + + def _convert_weights_to_fp16(l): + if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)): + l.weight.data = l.weight.data.half() + if l.bias is not None: + l.bias.data = 
l.bias.data.half() + + if isinstance(l, MultiheadAttention): + for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]: + tensor = getattr(l, attr) + if tensor is not None: + tensor.data = tensor.data.half() + + for name in ["text_projection", "proj"]: + if hasattr(l, name): + attr = getattr(l, name) + if attr is not None: + attr.data = attr.data.half() + + model.apply(_convert_weights_to_fp16) + + +def build_model(state_dict: dict): + vit = "visual.proj" in state_dict + + if vit: + vision_width = state_dict["visual.conv1.weight"].shape[0] + vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")]) + vision_patch_size = state_dict["visual.conv1.weight"].shape[-1] + grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5) + image_resolution = vision_patch_size * grid_size + else: + counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]] + vision_layers = tuple(counts) + vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0] + output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5) + vision_patch_size = None + assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0] + image_resolution = output_width * 32 + + embed_dim = state_dict["text_projection"].shape[1] + context_length = state_dict["positional_embedding"].shape[0] + vocab_size = state_dict["token_embedding.weight"].shape[0] + transformer_width = state_dict["ln_final.weight"].shape[0] + transformer_heads = transformer_width // 64 + transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks"))) + + model = CLIP( + embed_dim, + image_resolution, vision_layers, vision_width, vision_patch_size, + context_length, vocab_size, transformer_width, transformer_heads, transformer_layers + ) + + for 
key in ["input_resolution", "context_length", "vocab_size"]: + if key in state_dict: + del state_dict[key] + + convert_weights(model) + model.load_state_dict(state_dict) + return model.eval() diff --git a/CLIP/clip_explainability/simple_tokenizer.py b/CLIP/clip_explainability/simple_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..0a66286b7d5019c6e221932a813768038f839c91 --- /dev/null +++ b/CLIP/clip_explainability/simple_tokenizer.py @@ -0,0 +1,132 @@ +import gzip +import html +import os +from functools import lru_cache + +import ftfy +import regex as re + + +@lru_cache() +def default_bpe(): + return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). 
+ """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r'\s+', ' ', text) + text = text.strip() + return text + + +class SimpleTokenizer(object): + def __init__(self, bpe_path: str = default_bpe()): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') + merges = merges[1:49152-256-2+1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v+'' for v in vocab] + for merge in merges: + vocab.append(''.join(merge)) + vocab.extend(['<|startoftext|>', '<|endoftext|>']) + self.encoder = dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} + self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + ( token[-1] + '',) + pairs = get_pairs(word) + + if not pairs: + return token+'' + + while True: + bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word)-1 and word[i+1] == second: + new_word.append(first+second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + 
word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') + return text diff --git a/CLIP/data/country211.md b/CLIP/data/country211.md new file mode 100644 index 0000000000000000000000000000000000000000..4cd096005c8e5777e0706d97d182c3bd87b651a9 --- /dev/null +++ b/CLIP/data/country211.md @@ -0,0 +1,12 @@ +# The Country211 Dataset + +In the paper, we used an image classification dataset called Country211, to evaluate the model's capability on geolocation. To do so, we filtered the YFCC100m dataset that have GPS coordinate corresponding to a [ISO-3166 country code](https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes) and created a balanced dataset by sampling 150 train images, 50 validation images, and 100 test images images for each country. + +The following command will download an 11GB archive countaining the images and extract into a subdirectory `country211`: + +```bash +wget https://openaipublic.azureedge.net/clip/data/country211.tgz +tar zxvf country211.tgz +``` + +These images are a subset of the YFCC100m dataset. Use of the underlying media files is subject to the Creative Commons licenses chosen by their creators/uploaders. For more information about the YFCC100M dataset, visit [the official website](https://multimediacommons.wordpress.com/yfcc100m-core-dataset/). 
\ No newline at end of file diff --git a/CLIP/data/prompts.md b/CLIP/data/prompts.md new file mode 100644 index 0000000000000000000000000000000000000000..6d8aaf7b13f04031e7ea00d58a1c131b98bdfe20 --- /dev/null +++ b/CLIP/data/prompts.md @@ -0,0 +1,3401 @@ +# Prompts for Image Classification + +Below are the class names and templates that are used for collecting the zero-shot classification scores in the paper. Each dataset has two lists `classes` and `templates`, where the string `{}` in the template is to be replaced with the corresponding class names. For the Facial Emotion Recognition 2013 dataset specifically, we used multiple class names for certain classes. + +This file contains prompt data for 26 of the 27 datasets shown in Table 9 of the paper; the text prompts for ImageNet (as well as other [ImageNet Testbed](https://modestyachts.github.io/imagenet-testbed/) datasets in Figure 13) can be found in [this notebook](https://github.com/openai/CLIP/blob/main/notebooks/Prompt_Engineering_for_ImageNet.ipynb), as well as how to ensemble predictions from multiple prompts using these templates. + +If you are viewing this document on GitHub, use the table of contents icon at the upper left to browse the datasets. 
+ + +## Birdsnap + +```bash +classes = [ + 'Acadian Flycatcher', + 'Acorn Woodpecker', + 'Alder Flycatcher', + 'Allens Hummingbird', + 'Altamira Oriole', + 'American Avocet', + 'American Bittern', + 'American Black Duck', + 'American Coot', + 'American Crow', + 'American Dipper', + 'American Golden Plover', + 'American Goldfinch', + 'American Kestrel', + 'American Oystercatcher', + 'American Pipit', + 'American Redstart', + 'American Robin', + 'American Three toed Woodpecker', + 'American Tree Sparrow', + 'American White Pelican', + 'American Wigeon', + 'American Woodcock', + 'Anhinga', + 'Annas Hummingbird', + 'Arctic Tern', + 'Ash throated Flycatcher', + 'Audubons Oriole', + 'Bairds Sandpiper', + 'Bald Eagle', + 'Baltimore Oriole', + 'Band tailed Pigeon', + 'Barn Swallow', + 'Barred Owl', + 'Barrows Goldeneye', + 'Bay breasted Warbler', + 'Bells Vireo', + 'Belted Kingfisher', + 'Bewicks Wren', + 'Black Guillemot', + 'Black Oystercatcher', + 'Black Phoebe', + 'Black Rosy Finch', + 'Black Scoter', + 'Black Skimmer', + 'Black Tern', + 'Black Turnstone', + 'Black Vulture', + 'Black and white Warbler', + 'Black backed Woodpecker', + 'Black bellied Plover', + 'Black billed Cuckoo', + 'Black billed Magpie', + 'Black capped Chickadee', + 'Black chinned Hummingbird', + 'Black chinned Sparrow', + 'Black crested Titmouse', + 'Black crowned Night Heron', + 'Black headed Grosbeak', + 'Black legged Kittiwake', + 'Black necked Stilt', + 'Black throated Blue Warbler', + 'Black throated Gray Warbler', + 'Black throated Green Warbler', + 'Black throated Sparrow', + 'Blackburnian Warbler', + 'Blackpoll Warbler', + 'Blue Grosbeak', + 'Blue Jay', + 'Blue gray Gnatcatcher', + 'Blue headed Vireo', + 'Blue winged Teal', + 'Blue winged Warbler', + 'Boat tailed Grackle', + 'Bobolink', + 'Bohemian Waxwing', + 'Bonapartes Gull', + 'Boreal Chickadee', + 'Brandts Cormorant', + 'Brant', + 'Brewers Blackbird', + 'Brewers Sparrow', + 'Bridled Titmouse', + 'Broad billed Hummingbird', + 'Broad 
tailed Hummingbird', + 'Broad winged Hawk', + 'Bronzed Cowbird', + 'Brown Creeper', + 'Brown Pelican', + 'Brown Thrasher', + 'Brown capped Rosy Finch', + 'Brown crested Flycatcher', + 'Brown headed Cowbird', + 'Brown headed Nuthatch', + 'Bufflehead', + 'Bullocks Oriole', + 'Burrowing Owl', + 'Bushtit', + 'Cackling Goose', + 'Cactus Wren', + 'California Gull', + 'California Quail', + 'California Thrasher', + 'California Towhee', + 'Calliope Hummingbird', + 'Canada Goose', + 'Canada Warbler', + 'Canvasback', + 'Canyon Towhee', + 'Canyon Wren', + 'Cape May Warbler', + 'Carolina Chickadee', + 'Carolina Wren', + 'Caspian Tern', + 'Cassins Finch', + 'Cassins Kingbird', + 'Cassins Sparrow', + 'Cassins Vireo', + 'Cattle Egret', + 'Cave Swallow', + 'Cedar Waxwing', + 'Cerulean Warbler', + 'Chestnut backed Chickadee', + 'Chestnut collared Longspur', + 'Chestnut sided Warbler', + 'Chihuahuan Raven', + 'Chimney Swift', + 'Chipping Sparrow', + 'Cinnamon Teal', + 'Clapper Rail', + 'Clarks Grebe', + 'Clarks Nutcracker', + 'Clay colored Sparrow', + 'Cliff Swallow', + 'Common Black Hawk', + 'Common Eider', + 'Common Gallinule', + 'Common Goldeneye', + 'Common Grackle', + 'Common Ground Dove', + 'Common Loon', + 'Common Merganser', + 'Common Murre', + 'Common Nighthawk', + 'Common Raven', + 'Common Redpoll', + 'Common Tern', + 'Common Yellowthroat', + 'Connecticut Warbler', + 'Coopers Hawk', + 'Cordilleran Flycatcher', + 'Costas Hummingbird', + 'Couchs Kingbird', + 'Crested Caracara', + 'Curve billed Thrasher', + 'Dark eyed Junco', + 'Dickcissel', + 'Double crested Cormorant', + 'Downy Woodpecker', + 'Dunlin', + 'Dusky Flycatcher', + 'Dusky Grouse', + 'Eared Grebe', + 'Eastern Bluebird', + 'Eastern Kingbird', + 'Eastern Meadowlark', + 'Eastern Phoebe', + 'Eastern Screech Owl', + 'Eastern Towhee', + 'Eastern Wood Pewee', + 'Elegant Trogon', + 'Elf Owl', + 'Eurasian Collared Dove', + 'Eurasian Wigeon', + 'European Starling', + 'Evening Grosbeak', + 'Ferruginous Hawk', + 'Ferruginous 
Pygmy Owl', + 'Field Sparrow', + 'Fish Crow', + 'Florida Scrub Jay', + 'Forsters Tern', + 'Fox Sparrow', + 'Franklins Gull', + 'Fulvous Whistling Duck', + 'Gadwall', + 'Gambels Quail', + 'Gila Woodpecker', + 'Glaucous Gull', + 'Glaucous winged Gull', + 'Glossy Ibis', + 'Golden Eagle', + 'Golden crowned Kinglet', + 'Golden crowned Sparrow', + 'Golden fronted Woodpecker', + 'Golden winged Warbler', + 'Grasshopper Sparrow', + 'Gray Catbird', + 'Gray Flycatcher', + 'Gray Jay', + 'Gray Kingbird', + 'Gray cheeked Thrush', + 'Gray crowned Rosy Finch', + 'Great Black backed Gull', + 'Great Blue Heron', + 'Great Cormorant', + 'Great Crested Flycatcher', + 'Great Egret', + 'Great Gray Owl', + 'Great Horned Owl', + 'Great Kiskadee', + 'Great tailed Grackle', + 'Greater Prairie Chicken', + 'Greater Roadrunner', + 'Greater Sage Grouse', + 'Greater Scaup', + 'Greater White fronted Goose', + 'Greater Yellowlegs', + 'Green Jay', + 'Green tailed Towhee', + 'Green winged Teal', + 'Groove billed Ani', + 'Gull billed Tern', + 'Hairy Woodpecker', + 'Hammonds Flycatcher', + 'Harlequin Duck', + 'Harriss Hawk', + 'Harriss Sparrow', + 'Heermanns Gull', + 'Henslows Sparrow', + 'Hepatic Tanager', + 'Hermit Thrush', + 'Herring Gull', + 'Hoary Redpoll', + 'Hooded Merganser', + 'Hooded Oriole', + 'Hooded Warbler', + 'Horned Grebe', + 'Horned Lark', + 'House Finch', + 'House Sparrow', + 'House Wren', + 'Huttons Vireo', + 'Iceland Gull', + 'Inca Dove', + 'Indigo Bunting', + 'Killdeer', + 'King Rail', + 'Ladder backed Woodpecker', + 'Lapland Longspur', + 'Lark Bunting', + 'Lark Sparrow', + 'Laughing Gull', + 'Lazuli Bunting', + 'Le Contes Sparrow', + 'Least Bittern', + 'Least Flycatcher', + 'Least Grebe', + 'Least Sandpiper', + 'Least Tern', + 'Lesser Goldfinch', + 'Lesser Nighthawk', + 'Lesser Scaup', + 'Lesser Yellowlegs', + 'Lewiss Woodpecker', + 'Limpkin', + 'Lincolns Sparrow', + 'Little Blue Heron', + 'Loggerhead Shrike', + 'Long billed Curlew', + 'Long billed Dowitcher', + 'Long billed 
Thrasher', + 'Long eared Owl', + 'Long tailed Duck', + 'Louisiana Waterthrush', + 'Magnificent Frigatebird', + 'Magnolia Warbler', + 'Mallard', + 'Marbled Godwit', + 'Marsh Wren', + 'Merlin', + 'Mew Gull', + 'Mexican Jay', + 'Mississippi Kite', + 'Monk Parakeet', + 'Mottled Duck', + 'Mountain Bluebird', + 'Mountain Chickadee', + 'Mountain Plover', + 'Mourning Dove', + 'Mourning Warbler', + 'Muscovy Duck', + 'Mute Swan', + 'Nashville Warbler', + 'Nelsons Sparrow', + 'Neotropic Cormorant', + 'Northern Bobwhite', + 'Northern Cardinal', + 'Northern Flicker', + 'Northern Gannet', + 'Northern Goshawk', + 'Northern Harrier', + 'Northern Hawk Owl', + 'Northern Mockingbird', + 'Northern Parula', + 'Northern Pintail', + 'Northern Rough winged Swallow', + 'Northern Saw whet Owl', + 'Northern Shrike', + 'Northern Waterthrush', + 'Nuttalls Woodpecker', + 'Oak Titmouse', + 'Olive Sparrow', + 'Olive sided Flycatcher', + 'Orange crowned Warbler', + 'Orchard Oriole', + 'Osprey', + 'Ovenbird', + 'Pacific Golden Plover', + 'Pacific Loon', + 'Pacific Wren', + 'Pacific slope Flycatcher', + 'Painted Bunting', + 'Painted Redstart', + 'Palm Warbler', + 'Pectoral Sandpiper', + 'Peregrine Falcon', + 'Phainopepla', + 'Philadelphia Vireo', + 'Pied billed Grebe', + 'Pigeon Guillemot', + 'Pileated Woodpecker', + 'Pine Grosbeak', + 'Pine Siskin', + 'Pine Warbler', + 'Piping Plover', + 'Plumbeous Vireo', + 'Prairie Falcon', + 'Prairie Warbler', + 'Prothonotary Warbler', + 'Purple Finch', + 'Purple Gallinule', + 'Purple Martin', + 'Purple Sandpiper', + 'Pygmy Nuthatch', + 'Pyrrhuloxia', + 'Red Crossbill', + 'Red Knot', + 'Red Phalarope', + 'Red bellied Woodpecker', + 'Red breasted Merganser', + 'Red breasted Nuthatch', + 'Red breasted Sapsucker', + 'Red cockaded Woodpecker', + 'Red eyed Vireo', + 'Red headed Woodpecker', + 'Red naped Sapsucker', + 'Red necked Grebe', + 'Red necked Phalarope', + 'Red shouldered Hawk', + 'Red tailed Hawk', + 'Red throated Loon', + 'Red winged Blackbird', + 'Reddish 
Egret', + 'Redhead', + 'Ring billed Gull', + 'Ring necked Duck', + 'Ring necked Pheasant', + 'Rock Pigeon', + 'Rock Ptarmigan', + 'Rock Sandpiper', + 'Rock Wren', + 'Rose breasted Grosbeak', + 'Roseate Tern', + 'Rosss Goose', + 'Rough legged Hawk', + 'Royal Tern', + 'Ruby crowned Kinglet', + 'Ruby throated Hummingbird', + 'Ruddy Duck', + 'Ruddy Turnstone', + 'Ruffed Grouse', + 'Rufous Hummingbird', + 'Rufous crowned Sparrow', + 'Rusty Blackbird', + 'Sage Thrasher', + 'Saltmarsh Sparrow', + 'Sanderling', + 'Sandhill Crane', + 'Sandwich Tern', + 'Says Phoebe', + 'Scaled Quail', + 'Scarlet Tanager', + 'Scissor tailed Flycatcher', + 'Scotts Oriole', + 'Seaside Sparrow', + 'Sedge Wren', + 'Semipalmated Plover', + 'Semipalmated Sandpiper', + 'Sharp shinned Hawk', + 'Sharp tailed Grouse', + 'Short billed Dowitcher', + 'Short eared Owl', + 'Snail Kite', + 'Snow Bunting', + 'Snow Goose', + 'Snowy Egret', + 'Snowy Owl', + 'Snowy Plover', + 'Solitary Sandpiper', + 'Song Sparrow', + 'Sooty Grouse', + 'Sora', + 'Spotted Owl', + 'Spotted Sandpiper', + 'Spotted Towhee', + 'Spruce Grouse', + 'Stellers Jay', + 'Stilt Sandpiper', + 'Summer Tanager', + 'Surf Scoter', + 'Surfbird', + 'Swainsons Hawk', + 'Swainsons Thrush', + 'Swallow tailed Kite', + 'Swamp Sparrow', + 'Tennessee Warbler', + 'Thayers Gull', + 'Townsends Solitaire', + 'Townsends Warbler', + 'Tree Swallow', + 'Tricolored Heron', + 'Tropical Kingbird', + 'Trumpeter Swan', + 'Tufted Titmouse', + 'Tundra Swan', + 'Turkey Vulture', + 'Upland Sandpiper', + 'Varied Thrush', + 'Veery', + 'Verdin', + 'Vermilion Flycatcher', + 'Vesper Sparrow', + 'Violet green Swallow', + 'Virginia Rail', + 'Wandering Tattler', + 'Warbling Vireo', + 'Western Bluebird', + 'Western Grebe', + 'Western Gull', + 'Western Kingbird', + 'Western Meadowlark', + 'Western Sandpiper', + 'Western Screech Owl', + 'Western Scrub Jay', + 'Western Tanager', + 'Western Wood Pewee', + 'Whimbrel', + 'White Ibis', + 'White breasted Nuthatch', + 'White crowned 
Sparrow', + 'White eyed Vireo', + 'White faced Ibis', + 'White headed Woodpecker', + 'White rumped Sandpiper', + 'White tailed Hawk', + 'White tailed Kite', + 'White tailed Ptarmigan', + 'White throated Sparrow', + 'White throated Swift', + 'White winged Crossbill', + 'White winged Dove', + 'White winged Scoter', + 'Wild Turkey', + 'Willet', + 'Williamsons Sapsucker', + 'Willow Flycatcher', + 'Willow Ptarmigan', + 'Wilsons Phalarope', + 'Wilsons Plover', + 'Wilsons Snipe', + 'Wilsons Warbler', + 'Winter Wren', + 'Wood Stork', + 'Wood Thrush', + 'Worm eating Warbler', + 'Wrentit', + 'Yellow Warbler', + 'Yellow bellied Flycatcher', + 'Yellow bellied Sapsucker', + 'Yellow billed Cuckoo', + 'Yellow billed Magpie', + 'Yellow breasted Chat', + 'Yellow crowned Night Heron', + 'Yellow eyed Junco', + 'Yellow headed Blackbird', + 'Yellow rumped Warbler', + 'Yellow throated Vireo', + 'Yellow throated Warbler', + 'Zone tailed Hawk', +] + +templates = [ + 'a photo of a {}, a type of bird.', +] +``` + + + +## CIFAR10 + +```bash +classes = [ + 'airplane', + 'automobile', + 'bird', + 'cat', + 'deer', + 'dog', + 'frog', + 'horse', + 'ship', + 'truck', +] + +templates = [ + 'a photo of a {}.', + 'a blurry photo of a {}.', + 'a black and white photo of a {}.', + 'a low contrast photo of a {}.', + 'a high contrast photo of a {}.', + 'a bad photo of a {}.', + 'a good photo of a {}.', + 'a photo of a small {}.', + 'a photo of a big {}.', + 'a photo of the {}.', + 'a blurry photo of the {}.', + 'a black and white photo of the {}.', + 'a low contrast photo of the {}.', + 'a high contrast photo of the {}.', + 'a bad photo of the {}.', + 'a good photo of the {}.', + 'a photo of the small {}.', + 'a photo of the big {}.', +] +``` + + + +## CIFAR100 + +```bash +classes = [ + 'apple', + 'aquarium fish', + 'baby', + 'bear', + 'beaver', + 'bed', + 'bee', + 'beetle', + 'bicycle', + 'bottle', + 'bowl', + 'boy', + 'bridge', + 'bus', + 'butterfly', + 'camel', + 'can', + 'castle', + 'caterpillar', + 
'cattle', + 'chair', + 'chimpanzee', + 'clock', + 'cloud', + 'cockroach', + 'couch', + 'crab', + 'crocodile', + 'cup', + 'dinosaur', + 'dolphin', + 'elephant', + 'flatfish', + 'forest', + 'fox', + 'girl', + 'hamster', + 'house', + 'kangaroo', + 'keyboard', + 'lamp', + 'lawn mower', + 'leopard', + 'lion', + 'lizard', + 'lobster', + 'man', + 'maple tree', + 'motorcycle', + 'mountain', + 'mouse', + 'mushroom', + 'oak tree', + 'orange', + 'orchid', + 'otter', + 'palm tree', + 'pear', + 'pickup truck', + 'pine tree', + 'plain', + 'plate', + 'poppy', + 'porcupine', + 'possum', + 'rabbit', + 'raccoon', + 'ray', + 'road', + 'rocket', + 'rose', + 'sea', + 'seal', + 'shark', + 'shrew', + 'skunk', + 'skyscraper', + 'snail', + 'snake', + 'spider', + 'squirrel', + 'streetcar', + 'sunflower', + 'sweet pepper', + 'table', + 'tank', + 'telephone', + 'television', + 'tiger', + 'tractor', + 'train', + 'trout', + 'tulip', + 'turtle', + 'wardrobe', + 'whale', + 'willow tree', + 'wolf', + 'woman', + 'worm', +] + +templates = [ + 'a photo of a {}.', + 'a blurry photo of a {}.', + 'a black and white photo of a {}.', + 'a low contrast photo of a {}.', + 'a high contrast photo of a {}.', + 'a bad photo of a {}.', + 'a good photo of a {}.', + 'a photo of a small {}.', + 'a photo of a big {}.', + 'a photo of the {}.', + 'a blurry photo of the {}.', + 'a black and white photo of the {}.', + 'a low contrast photo of the {}.', + 'a high contrast photo of the {}.', + 'a bad photo of the {}.', + 'a good photo of the {}.', + 'a photo of the small {}.', + 'a photo of the big {}.', +] +``` + + + +## CLEVRCounts + +```bash +classes = [ + '10', + '3', + '4', + '5', + '6', + '7', + '8', + '9', +] + +templates = [ + 'a photo of {} objects.', +] +``` + + + +## Caltech101 + +```bash +classes = [ + 'background', + 'off-center face', + 'centered face', + 'leopard', + 'motorbike', + 'accordion', + 'airplane', + 'anchor', + 'ant', + 'barrel', + 'bass', + 'beaver', + 'binocular', + 'bonsai', + 'brain', + 
'brontosaurus', + 'buddha', + 'butterfly', + 'camera', + 'cannon', + 'side of a car', + 'ceiling fan', + 'cellphone', + 'chair', + 'chandelier', + 'body of a cougar cat', + 'face of a cougar cat', + 'crab', + 'crayfish', + 'crocodile', + 'head of a crocodile', + 'cup', + 'dalmatian', + 'dollar bill', + 'dolphin', + 'dragonfly', + 'electric guitar', + 'elephant', + 'emu', + 'euphonium', + 'ewer', + 'ferry', + 'flamingo', + 'head of a flamingo', + 'garfield', + 'gerenuk', + 'gramophone', + 'grand piano', + 'hawksbill', + 'headphone', + 'hedgehog', + 'helicopter', + 'ibis', + 'inline skate', + 'joshua tree', + 'kangaroo', + 'ketch', + 'lamp', + 'laptop', + 'llama', + 'lobster', + 'lotus', + 'mandolin', + 'mayfly', + 'menorah', + 'metronome', + 'minaret', + 'nautilus', + 'octopus', + 'okapi', + 'pagoda', + 'panda', + 'pigeon', + 'pizza', + 'platypus', + 'pyramid', + 'revolver', + 'rhino', + 'rooster', + 'saxophone', + 'schooner', + 'scissors', + 'scorpion', + 'sea horse', + 'snoopy (cartoon beagle)', + 'soccer ball', + 'stapler', + 'starfish', + 'stegosaurus', + 'stop sign', + 'strawberry', + 'sunflower', + 'tick', + 'trilobite', + 'umbrella', + 'watch', + 'water lilly', + 'wheelchair', + 'wild cat', + 'windsor chair', + 'wrench', + 'yin and yang symbol', +] + +templates = [ + 'a photo of a {}.', + 'a painting of a {}.', + 'a plastic {}.', + 'a sculpture of a {}.', + 'a sketch of a {}.', + 'a tattoo of a {}.', + 'a toy {}.', + 'a rendition of a {}.', + 'a embroidered {}.', + 'a cartoon {}.', + 'a {} in a video game.', + 'a plushie {}.', + 'a origami {}.', + 'art of a {}.', + 'graffiti of a {}.', + 'a drawing of a {}.', + 'a doodle of a {}.', + 'a photo of the {}.', + 'a painting of the {}.', + 'the plastic {}.', + 'a sculpture of the {}.', + 'a sketch of the {}.', + 'a tattoo of the {}.', + 'the toy {}.', + 'a rendition of the {}.', + 'the embroidered {}.', + 'the cartoon {}.', + 'the {} in a video game.', + 'the plushie {}.', + 'the origami {}.', + 'art of the {}.', + 
'graffiti of the {}.', + 'a drawing of the {}.', + 'a doodle of the {}.', +] +``` + + + +## Country211 + +```bash +classes = [ + 'Andorra', + 'United Arab Emirates', + 'Afghanistan', + 'Antigua and Barbuda', + 'Anguilla', + 'Albania', + 'Armenia', + 'Angola', + 'Antarctica', + 'Argentina', + 'Austria', + 'Australia', + 'Aruba', + 'Aland Islands', + 'Azerbaijan', + 'Bosnia and Herzegovina', + 'Barbados', + 'Bangladesh', + 'Belgium', + 'Burkina Faso', + 'Bulgaria', + 'Bahrain', + 'Benin', + 'Bermuda', + 'Brunei Darussalam', + 'Bolivia', + 'Bonaire, Saint Eustatius and Saba', + 'Brazil', + 'Bahamas', + 'Bhutan', + 'Botswana', + 'Belarus', + 'Belize', + 'Canada', + 'DR Congo', + 'Central African Republic', + 'Switzerland', + "Cote d'Ivoire", + 'Cook Islands', + 'Chile', + 'Cameroon', + 'China', + 'Colombia', + 'Costa Rica', + 'Cuba', + 'Cabo Verde', + 'Curacao', + 'Cyprus', + 'Czech Republic', + 'Germany', + 'Denmark', + 'Dominica', + 'Dominican Republic', + 'Algeria', + 'Ecuador', + 'Estonia', + 'Egypt', + 'Spain', + 'Ethiopia', + 'Finland', + 'Fiji', + 'Falkland Islands', + 'Faeroe Islands', + 'France', + 'Gabon', + 'United Kingdom', + 'Grenada', + 'Georgia', + 'French Guiana', + 'Guernsey', + 'Ghana', + 'Gibraltar', + 'Greenland', + 'Gambia', + 'Guadeloupe', + 'Greece', + 'South Georgia and South Sandwich Is.', + 'Guatemala', + 'Guam', + 'Guyana', + 'Hong Kong', + 'Honduras', + 'Croatia', + 'Haiti', + 'Hungary', + 'Indonesia', + 'Ireland', + 'Israel', + 'Isle of Man', + 'India', + 'Iraq', + 'Iran', + 'Iceland', + 'Italy', + 'Jersey', + 'Jamaica', + 'Jordan', + 'Japan', + 'Kenya', + 'Kyrgyz Republic', + 'Cambodia', + 'St. Kitts and Nevis', + 'North Korea', + 'South Korea', + 'Kuwait', + 'Cayman Islands', + 'Kazakhstan', + 'Laos', + 'Lebanon', + 'St. 
Lucia', + 'Liechtenstein', + 'Sri Lanka', + 'Liberia', + 'Lithuania', + 'Luxembourg', + 'Latvia', + 'Libya', + 'Morocco', + 'Monaco', + 'Moldova', + 'Montenegro', + 'Saint-Martin', + 'Madagascar', + 'Macedonia', + 'Mali', + 'Myanmar', + 'Mongolia', + 'Macau', + 'Martinique', + 'Mauritania', + 'Malta', + 'Mauritius', + 'Maldives', + 'Malawi', + 'Mexico', + 'Malaysia', + 'Mozambique', + 'Namibia', + 'New Caledonia', + 'Nigeria', + 'Nicaragua', + 'Netherlands', + 'Norway', + 'Nepal', + 'New Zealand', + 'Oman', + 'Panama', + 'Peru', + 'French Polynesia', + 'Papua New Guinea', + 'Philippines', + 'Pakistan', + 'Poland', + 'Puerto Rico', + 'Palestine', + 'Portugal', + 'Palau', + 'Paraguay', + 'Qatar', + 'Reunion', + 'Romania', + 'Serbia', + 'Russia', + 'Rwanda', + 'Saudi Arabia', + 'Solomon Islands', + 'Seychelles', + 'Sudan', + 'Sweden', + 'Singapore', + 'St. Helena', + 'Slovenia', + 'Svalbard and Jan Mayen Islands', + 'Slovakia', + 'Sierra Leone', + 'San Marino', + 'Senegal', + 'Somalia', + 'South Sudan', + 'El Salvador', + 'Sint Maarten', + 'Syria', + 'Eswatini', + 'Togo', + 'Thailand', + 'Tajikistan', + 'Timor-Leste', + 'Turkmenistan', + 'Tunisia', + 'Tonga', + 'Turkey', + 'Trinidad and Tobago', + 'Taiwan', + 'Tanzania', + 'Ukraine', + 'Uganda', + 'United States', + 'Uruguay', + 'Uzbekistan', + 'Vatican', + 'Venezuela', + 'British Virgin Islands', + 'United States Virgin Islands', + 'Vietnam', + 'Vanuatu', + 'Samoa', + 'Kosovo', + 'Yemen', + 'South Africa', + 'Zambia', + 'Zimbabwe', +] + +templates = [ + 'a photo i took in {}.', + 'a photo i took while visiting {}.', + 'a photo from my home country of {}.', + 'a photo from my visit to {}.', + 'a photo showing the country of {}.', +] +``` + + + +## DescribableTextures + +```bash +classes = [ + 'banded', + 'blotchy', + 'braided', + 'bubbly', + 'bumpy', + 'chequered', + 'cobwebbed', + 'cracked', + 'crosshatched', + 'crystalline', + 'dotted', + 'fibrous', + 'flecked', + 'freckled', + 'frilly', + 'gauzy', + 'grid', + 
'grooved', + 'honeycombed', + 'interlaced', + 'knitted', + 'lacelike', + 'lined', + 'marbled', + 'matted', + 'meshed', + 'paisley', + 'perforated', + 'pitted', + 'pleated', + 'polka-dotted', + 'porous', + 'potholed', + 'scaly', + 'smeared', + 'spiralled', + 'sprinkled', + 'stained', + 'stratified', + 'striped', + 'studded', + 'swirly', + 'veined', + 'waffled', + 'woven', + 'wrinkled', + 'zigzagged', +] + +templates = [ + 'a photo of a {} texture.', + 'a photo of a {} pattern.', + 'a photo of a {} thing.', + 'a photo of a {} object.', + 'a photo of the {} texture.', + 'a photo of the {} pattern.', + 'a photo of the {} thing.', + 'a photo of the {} object.', +] +``` + + + +## EuroSAT + +```bash +classes = [ + 'forest', + 'permanent crop land', + 'residential buildings or homes or apartments', + 'river', + 'pasture land', + 'lake or sea', + 'brushland or shrubland', + 'annual crop land', + 'industrial buildings or commercial buildings', + 'highway or road', +] + +templates = [ + 'a centered satellite photo of {}.', + 'a centered satellite photo of a {}.', + 'a centered satellite photo of the {}.', +] +``` + + + +## FGVCAircraft + +```bash +classes = [ + '707-320', + '727-200', + '737-200', + '737-300', + '737-400', + '737-500', + '737-600', + '737-700', + '737-800', + '737-900', + '747-100', + '747-200', + '747-300', + '747-400', + '757-200', + '757-300', + '767-200', + '767-300', + '767-400', + '777-200', + '777-300', + 'A300B4', + 'A310', + 'A318', + 'A319', + 'A320', + 'A321', + 'A330-200', + 'A330-300', + 'A340-200', + 'A340-300', + 'A340-500', + 'A340-600', + 'A380', + 'ATR-42', + 'ATR-72', + 'An-12', + 'BAE 146-200', + 'BAE 146-300', + 'BAE-125', + 'Beechcraft 1900', + 'Boeing 717', + 'C-130', + 'C-47', + 'CRJ-200', + 'CRJ-700', + 'CRJ-900', + 'Cessna 172', + 'Cessna 208', + 'Cessna 525', + 'Cessna 560', + 'Challenger 600', + 'DC-10', + 'DC-3', + 'DC-6', + 'DC-8', + 'DC-9-30', + 'DH-82', + 'DHC-1', + 'DHC-6', + 'DHC-8-100', + 'DHC-8-300', + 'DR-400', + 'Dornier 
328', + 'E-170', + 'E-190', + 'E-195', + 'EMB-120', + 'ERJ 135', + 'ERJ 145', + 'Embraer Legacy 600', + 'Eurofighter Typhoon', + 'F-16A/B', + 'F/A-18', + 'Falcon 2000', + 'Falcon 900', + 'Fokker 100', + 'Fokker 50', + 'Fokker 70', + 'Global Express', + 'Gulfstream IV', + 'Gulfstream V', + 'Hawk T1', + 'Il-76', + 'L-1011', + 'MD-11', + 'MD-80', + 'MD-87', + 'MD-90', + 'Metroliner', + 'Model B200', + 'PA-28', + 'SR-20', + 'Saab 2000', + 'Saab 340', + 'Spitfire', + 'Tornado', + 'Tu-134', + 'Tu-154', + 'Yak-42', +] + +templates = [ + 'a photo of a {}, a type of aircraft.', + 'a photo of the {}, a type of aircraft.', +] +``` + + + +## FacialEmotionRecognition2013 + +```bash +classes = [ + ['angry'], + ['disgusted'], + ['fearful'], + ['happy', 'smiling'], + ['sad', 'depressed'], + ['surprised', 'shocked', 'spooked'], + ['neutral', 'bored'], +] + +templates = [ + 'a photo of a {} looking face.', + 'a photo of a face showing the emotion: {}.', + 'a photo of a face looking {}.', + 'a face that looks {}.', + 'they look {}.', + 'look at how {} they are.', +] +``` + + + +## Flowers102 + +```bash +classes = [ + 'pink primrose', + 'hard-leaved pocket orchid', + 'canterbury bells', + 'sweet pea', + 'english marigold', + 'tiger lily', + 'moon orchid', + 'bird of paradise', + 'monkshood', + 'globe thistle', + 'snapdragon', + "colt's foot", + 'king protea', + 'spear thistle', + 'yellow iris', + 'globe flower', + 'purple coneflower', + 'peruvian lily', + 'balloon flower', + 'giant white arum lily', + 'fire lily', + 'pincushion flower', + 'fritillary', + 'red ginger', + 'grape hyacinth', + 'corn poppy', + 'prince of wales feathers', + 'stemless gentian', + 'artichoke', + 'sweet william', + 'carnation', + 'garden phlox', + 'love in the mist', + 'mexican aster', + 'alpine sea holly', + 'ruby-lipped cattleya', + 'cape flower', + 'great masterwort', + 'siam tulip', + 'lenten rose', + 'barbeton daisy', + 'daffodil', + 'sword lily', + 'poinsettia', + 'bolero deep blue', + 'wallflower', + 
'marigold', + 'buttercup', + 'oxeye daisy', + 'common dandelion', + 'petunia', + 'wild pansy', + 'primula', + 'sunflower', + 'pelargonium', + 'bishop of llandaff', + 'gaura', + 'geranium', + 'orange dahlia', + 'pink and yellow dahlia', + 'cautleya spicata', + 'japanese anemone', + 'black-eyed susan', + 'silverbush', + 'californian poppy', + 'osteospermum', + 'spring crocus', + 'bearded iris', + 'windflower', + 'tree poppy', + 'gazania', + 'azalea', + 'water lily', + 'rose', + 'thorn apple', + 'morning glory', + 'passion flower', + 'lotus', + 'toad lily', + 'anthurium', + 'frangipani', + 'clematis', + 'hibiscus', + 'columbine', + 'desert-rose', + 'tree mallow', + 'magnolia', + 'cyclamen', + 'watercress', + 'canna lily', + 'hippeastrum', + 'bee balm', + 'air plant', + 'foxglove', + 'bougainvillea', + 'camellia', + 'mallow', + 'mexican petunia', + 'bromelia', + 'blanket flower', + 'trumpet creeper', + 'blackberry lily', +] + +templates = [ + 'a photo of a {}, a type of flower.', +] +``` + + + +## Food101 + +```bash +classes = [ + 'apple pie', + 'baby back ribs', + 'baklava', + 'beef carpaccio', + 'beef tartare', + 'beet salad', + 'beignets', + 'bibimbap', + 'bread pudding', + 'breakfast burrito', + 'bruschetta', + 'caesar salad', + 'cannoli', + 'caprese salad', + 'carrot cake', + 'ceviche', + 'cheese plate', + 'cheesecake', + 'chicken curry', + 'chicken quesadilla', + 'chicken wings', + 'chocolate cake', + 'chocolate mousse', + 'churros', + 'clam chowder', + 'club sandwich', + 'crab cakes', + 'creme brulee', + 'croque madame', + 'cup cakes', + 'deviled eggs', + 'donuts', + 'dumplings', + 'edamame', + 'eggs benedict', + 'escargots', + 'falafel', + 'filet mignon', + 'fish and chips', + 'foie gras', + 'french fries', + 'french onion soup', + 'french toast', + 'fried calamari', + 'fried rice', + 'frozen yogurt', + 'garlic bread', + 'gnocchi', + 'greek salad', + 'grilled cheese sandwich', + 'grilled salmon', + 'guacamole', + 'gyoza', + 'hamburger', + 'hot and sour soup', + 
'hot dog', + 'huevos rancheros', + 'hummus', + 'ice cream', + 'lasagna', + 'lobster bisque', + 'lobster roll sandwich', + 'macaroni and cheese', + 'macarons', + 'miso soup', + 'mussels', + 'nachos', + 'omelette', + 'onion rings', + 'oysters', + 'pad thai', + 'paella', + 'pancakes', + 'panna cotta', + 'peking duck', + 'pho', + 'pizza', + 'pork chop', + 'poutine', + 'prime rib', + 'pulled pork sandwich', + 'ramen', + 'ravioli', + 'red velvet cake', + 'risotto', + 'samosa', + 'sashimi', + 'scallops', + 'seaweed salad', + 'shrimp and grits', + 'spaghetti bolognese', + 'spaghetti carbonara', + 'spring rolls', + 'steak', + 'strawberry shortcake', + 'sushi', + 'tacos', + 'takoyaki', + 'tiramisu', + 'tuna tartare', + 'waffles', +] + +templates = [ + 'a photo of {}, a type of food.', +] +``` + + + +## GTSRB + +```bash +classes = [ + 'red and white circle 20 kph speed limit', + 'red and white circle 30 kph speed limit', + 'red and white circle 50 kph speed limit', + 'red and white circle 60 kph speed limit', + 'red and white circle 70 kph speed limit', + 'red and white circle 80 kph speed limit', + 'end / de-restriction of 80 kph speed limit', + 'red and white circle 100 kph speed limit', + 'red and white circle 120 kph speed limit', + 'red and white circle red car and black car no passing', + 'red and white circle red truck and black car no passing', + 'red and white triangle road intersection warning', + 'white and yellow diamond priority road', + 'red and white upside down triangle yield right-of-way', + 'stop', + 'empty red and white circle', + 'red and white circle no truck entry', + 'red circle with white horizonal stripe no entry', + 'red and white triangle with exclamation mark warning', + 'red and white triangle with black left curve approaching warning', + 'red and white triangle with black right curve approaching warning', + 'red and white triangle with black double curve approaching warning', + 'red and white triangle rough / bumpy road warning', + 'red and white 
triangle car skidding / slipping warning', + 'red and white triangle with merging / narrow lanes warning', + 'red and white triangle with person digging / construction / road work warning', + 'red and white triangle with traffic light approaching warning', + 'red and white triangle with person walking warning', + 'red and white triangle with child and person walking warning', + 'red and white triangle with bicyle warning', + 'red and white triangle with snowflake / ice warning', + 'red and white triangle with deer warning', + 'white circle with gray strike bar no speed limit', + 'blue circle with white right turn arrow mandatory', + 'blue circle with white left turn arrow mandatory', + 'blue circle with white forward arrow mandatory', + 'blue circle with white forward or right turn arrow mandatory', + 'blue circle with white forward or left turn arrow mandatory', + 'blue circle with white keep right arrow mandatory', + 'blue circle with white keep left arrow mandatory', + 'blue circle with white arrows indicating a traffic circle', + 'white circle with gray strike bar indicating no passing for cars has ended', + 'white circle with gray strike bar indicating no passing for trucks has ended', +] + +templates = [ + 'a zoomed in photo of a "{}" traffic sign.', + 'a centered photo of a "{}" traffic sign.', + 'a close up photo of a "{}" traffic sign.', +] +``` + + + +## HatefulMemes + +```bash +classes = [ + 'meme', + 'hatespeech meme', +] + +templates = [ + 'a {}.', +] +``` + + + +## KITTI + +```bash +classes = [ + 'a photo i took of a car on my left or right side.', + 'a photo i took with a car nearby.', + 'a photo i took with a car in the distance.', + 'a photo i took with no car.', +] + +templates = [ + '{}', +] +``` + + + +## Kinetics700 + +```bash +classes = [ + 'abseiling', + 'acting in play', + 'adjusting glasses', + 'air drumming', + 'alligator wrestling', + 'answering questions', + 'applauding', + 'applying cream', + 'archaeological excavation', + 'archery', + 
'arguing', + 'arm wrestling', + 'arranging flowers', + 'arresting', + 'assembling bicycle', + 'assembling computer', + 'attending conference', + 'auctioning', + 'baby waking up', + 'backflip (human)', + 'baking cookies', + 'bandaging', + 'barbequing', + 'bartending', + 'base jumping', + 'bathing dog', + 'battle rope training', + 'beatboxing', + 'bee keeping', + 'being excited', + 'being in zero gravity', + 'belly dancing', + 'bench pressing', + 'bending back', + 'bending metal', + 'biking through snow', + 'blasting sand', + 'blending fruit', + 'blowdrying hair', + 'blowing bubble gum', + 'blowing glass', + 'blowing leaves', + 'blowing nose', + 'blowing out candles', + 'bobsledding', + 'bodysurfing', + 'bookbinding', + 'bottling', + 'bouncing ball (not juggling)', + 'bouncing on bouncy castle', + 'bouncing on trampoline', + 'bowling', + 'braiding hair', + 'breading or breadcrumbing', + 'breakdancing', + 'breaking boards', + 'breaking glass', + 'breathing fire', + 'brush painting', + 'brushing floor', + 'brushing hair', + 'brushing teeth', + 'building cabinet', + 'building lego', + 'building sandcastle', + 'building shed', + 'bulldozing', + 'bungee jumping', + 'burping', + 'busking', + 'calculating', + 'calligraphy', + 'canoeing or kayaking', + 'capoeira', + 'capsizing', + 'card stacking', + 'card throwing', + 'carrying baby', + 'carrying weight', + 'cartwheeling', + 'carving ice', + 'carving marble', + 'carving pumpkin', + 'carving wood with a knife', + 'casting fishing line', + 'catching fish', + 'catching or throwing baseball', + 'catching or throwing frisbee', + 'catching or throwing softball', + 'celebrating', + 'changing gear in car', + 'changing oil', + 'changing wheel (not on bike)', + 'chasing', + 'checking tires', + 'checking watch', + 'cheerleading', + 'chewing gum', + 'chiseling stone', + 'chiseling wood', + 'chopping meat', + 'chopping wood', + 'clam digging', + 'clapping', + 'clay pottery making', + 'clean and jerk', + 'cleaning gutters', + 'cleaning 
pool', + 'cleaning shoes', + 'cleaning toilet', + 'cleaning windows', + 'climbing a rope', + 'climbing ladder', + 'climbing tree', + 'closing door', + 'coloring in', + 'combing hair', + 'contact juggling', + 'contorting', + 'cooking chicken', + 'cooking egg', + 'cooking on campfire', + 'cooking sausages (not on barbeque)', + 'cooking scallops', + 'cosplaying', + 'coughing', + 'counting money', + 'country line dancing', + 'cracking back', + 'cracking knuckles', + 'cracking neck', + 'crawling baby', + 'crocheting', + 'crossing eyes', + 'crossing river', + 'crying', + 'cumbia', + 'curling (sport)', + 'curling eyelashes', + 'curling hair', + 'cutting apple', + 'cutting cake', + 'cutting nails', + 'cutting orange', + 'cutting pineapple', + 'cutting watermelon', + 'dancing ballet', + 'dancing charleston', + 'dancing gangnam style', + 'dancing macarena', + 'deadlifting', + 'dealing cards', + 'decorating the christmas tree', + 'decoupage', + 'delivering mail', + 'digging', + 'dining', + 'directing traffic', + 'disc golfing', + 'diving cliff', + 'docking boat', + 'dodgeball', + 'doing aerobics', + 'doing jigsaw puzzle', + 'doing laundry', + 'doing nails', + 'doing sudoku', + 'drawing', + 'dribbling basketball', + 'drinking shots', + 'driving car', + 'driving tractor', + 'drooling', + 'drop kicking', + 'drumming fingers', + 'dumpster diving', + 'dunking basketball', + 'dyeing eyebrows', + 'dyeing hair', + 'eating burger', + 'eating cake', + 'eating carrots', + 'eating chips', + 'eating doughnuts', + 'eating hotdog', + 'eating ice cream', + 'eating nachos', + 'eating spaghetti', + 'eating watermelon', + 'egg hunting', + 'embroidering', + 'entering church', + 'exercising arm', + 'exercising with an exercise ball', + 'extinguishing fire', + 'faceplanting', + 'falling off bike', + 'falling off chair', + 'feeding birds', + 'feeding fish', + 'feeding goats', + 'fencing (sport)', + 'fidgeting', + 'filling cake', + 'filling eyebrows', + 'finger snapping', + 'fixing bicycle', + 
'fixing hair', + 'flint knapping', + 'flipping bottle', + 'flipping pancake', + 'fly tying', + 'flying kite', + 'folding clothes', + 'folding napkins', + 'folding paper', + 'front raises', + 'frying vegetables', + 'gargling', + 'geocaching', + 'getting a haircut', + 'getting a piercing', + 'getting a tattoo', + 'giving or receiving award', + 'gold panning', + 'golf chipping', + 'golf driving', + 'golf putting', + 'gospel singing in church', + 'grinding meat', + 'grooming cat', + 'grooming dog', + 'grooming horse', + 'gymnastics tumbling', + 'hammer throw', + 'hand washing clothes', + 'head stand', + 'headbanging', + 'headbutting', + 'helmet diving', + 'herding cattle', + 'high fiving', + 'high jump', + 'high kick', + 'historical reenactment', + 'hitting baseball', + 'hockey stop', + 'holding snake', + 'home roasting coffee', + 'hopscotch', + 'hoverboarding', + 'huddling', + 'hugging (not baby)', + 'hugging baby', + 'hula hooping', + 'hurdling', + 'hurling (sport)', + 'ice climbing', + 'ice fishing', + 'ice skating', + 'ice swimming', + 'inflating balloons', + 'installing carpet', + 'ironing', + 'ironing hair', + 'javelin throw', + 'jaywalking', + 'jetskiing', + 'jogging', + 'juggling balls', + 'juggling fire', + 'juggling soccer ball', + 'jumping bicycle', + 'jumping into pool', + 'jumping jacks', + 'jumping sofa', + 'jumpstyle dancing', + 'karaoke', + 'kicking field goal', + 'kicking soccer ball', + 'kissing', + 'kitesurfing', + 'knitting', + 'krumping', + 'land sailing', + 'laughing', + 'lawn mower racing', + 'laying bricks', + 'laying concrete', + 'laying decking', + 'laying stone', + 'laying tiles', + 'leatherworking', + 'letting go of balloon', + 'licking', + 'lifting hat', + 'lighting candle', + 'lighting fire', + 'listening with headphones', + 'lock picking', + 'long jump', + 'longboarding', + 'looking at phone', + 'looking in mirror', + 'luge', + 'lunge', + 'making a cake', + 'making a sandwich', + 'making balloon shapes', + 'making bubbles', + 'making 
cheese', + 'making horseshoes', + 'making jewelry', + 'making latte art', + 'making paper aeroplanes', + 'making pizza', + 'making slime', + 'making snowman', + 'making sushi', + 'making tea', + 'making the bed', + 'marching', + 'marriage proposal', + 'massaging back', + 'massaging feet', + 'massaging legs', + 'massaging neck', + "massaging person's head", + 'metal detecting', + 'milking cow', + 'milking goat', + 'mixing colours', + 'moon walking', + 'mopping floor', + 'mosh pit dancing', + 'motorcycling', + 'mountain climber (exercise)', + 'moving baby', + 'moving child', + 'moving furniture', + 'mowing lawn', + 'mushroom foraging', + 'needle felting', + 'news anchoring', + 'opening bottle (not wine)', + 'opening coconuts', + 'opening door', + 'opening present', + 'opening refrigerator', + 'opening wine bottle', + 'packing', + 'paragliding', + 'parasailing', + 'parkour', + 'passing American football (in game)', + 'passing American football (not in game)', + 'passing soccer ball', + 'peeling apples', + 'peeling banana', + 'peeling potatoes', + 'person collecting garbage', + 'petting animal (not cat)', + 'petting cat', + 'petting horse', + 'photobombing', + 'photocopying', + 'picking apples', + 'picking blueberries', + 'pillow fight', + 'pinching', + 'pirouetting', + 'planing wood', + 'planting trees', + 'plastering', + 'playing accordion', + 'playing american football', + 'playing badminton', + 'playing bagpipes', + 'playing basketball', + 'playing bass guitar', + 'playing beer pong', + 'playing billiards', + 'playing blackjack', + 'playing cards', + 'playing cello', + 'playing checkers', + 'playing chess', + 'playing clarinet', + 'playing controller', + 'playing cricket', + 'playing cymbals', + 'playing darts', + 'playing didgeridoo', + 'playing dominoes', + 'playing drums', + 'playing field hockey', + 'playing flute', + 'playing gong', + 'playing guitar', + 'playing hand clapping games', + 'playing harmonica', + 'playing harp', + 'playing ice hockey', + 'playing 
keyboard', + 'playing kickball', + 'playing laser tag', + 'playing lute', + 'playing mahjong', + 'playing maracas', + 'playing marbles', + 'playing monopoly', + 'playing netball', + 'playing nose flute', + 'playing oboe', + 'playing ocarina', + 'playing organ', + 'playing paintball', + 'playing pan pipes', + 'playing piano', + 'playing piccolo', + 'playing pinball', + 'playing ping pong', + 'playing poker', + 'playing polo', + 'playing recorder', + 'playing road hockey', + 'playing rounders', + 'playing rubiks cube', + 'playing saxophone', + 'playing scrabble', + 'playing shuffleboard', + 'playing slot machine', + 'playing squash or racquetball', + 'playing tennis', + 'playing trombone', + 'playing trumpet', + 'playing ukulele', + 'playing violin', + 'playing volleyball', + 'playing with trains', + 'playing xylophone', + 'poaching eggs', + 'poking bellybutton', + 'pole vault', + 'polishing furniture', + 'polishing metal', + 'popping balloons', + 'pouring beer', + 'pouring milk', + 'pouring wine', + 'preparing salad', + 'presenting weather forecast', + 'pretending to be a statue', + 'pull ups', + 'pulling espresso shot', + 'pulling rope (game)', + 'pumping fist', + 'pumping gas', + 'punching bag', + 'punching person (boxing)', + 'push up', + 'pushing car', + 'pushing cart', + 'pushing wheelbarrow', + 'pushing wheelchair', + 'putting in contact lenses', + 'putting on eyeliner', + 'putting on foundation', + 'putting on lipstick', + 'putting on mascara', + 'putting on sari', + 'putting on shoes', + 'putting wallpaper on wall', + 'raising eyebrows', + 'reading book', + 'reading newspaper', + 'recording music', + 'repairing puncture', + 'riding a bike', + 'riding camel', + 'riding elephant', + 'riding mechanical bull', + 'riding mule', + 'riding or walking with horse', + 'riding scooter', + 'riding snow blower', + 'riding unicycle', + 'ripping paper', + 'roasting marshmallows', + 'roasting pig', + 'robot dancing', + 'rock climbing', + 'rock scissors paper', + 'roller 
skating', + 'rolling eyes', + 'rolling pastry', + 'rope pushdown', + 'running on treadmill', + 'sailing', + 'salsa dancing', + 'saluting', + 'sanding floor', + 'sanding wood', + 'sausage making', + 'sawing wood', + 'scrambling eggs', + 'scrapbooking', + 'scrubbing face', + 'scuba diving', + 'seasoning food', + 'separating eggs', + 'setting table', + 'sewing', + 'shaking hands', + 'shaking head', + 'shaping bread dough', + 'sharpening knives', + 'sharpening pencil', + 'shaving head', + 'shaving legs', + 'shearing sheep', + 'shining flashlight', + 'shining shoes', + 'shoot dance', + 'shooting basketball', + 'shooting goal (soccer)', + 'shooting off fireworks', + 'shopping', + 'shot put', + 'shouting', + 'shoveling snow', + 'shredding paper', + 'shucking oysters', + 'shuffling cards', + 'shuffling feet', + 'side kick', + 'sieving', + 'sign language interpreting', + 'silent disco', + 'singing', + 'sipping cup', + 'situp', + 'skateboarding', + 'ski ballet', + 'ski jumping', + 'skiing crosscountry', + 'skiing mono', + 'skiing slalom', + 'skipping rope', + 'skipping stone', + 'skydiving', + 'slacklining', + 'slapping', + 'sled dog racing', + 'sleeping', + 'slicing onion', + 'smashing', + 'smelling feet', + 'smoking', + 'smoking hookah', + 'smoking pipe', + 'snatch weight lifting', + 'sneezing', + 'snorkeling', + 'snowboarding', + 'snowkiting', + 'snowmobiling', + 'somersaulting', + 'spelunking', + 'spinning plates', + 'spinning poi', + 'splashing water', + 'spray painting', + 'spraying', + 'springboard diving', + 'square dancing', + 'squat', + 'squeezing orange', + 'stacking cups', + 'stacking dice', + 'standing on hands', + 'staring', + 'steer roping', + 'steering car', + 'sticking tongue out', + 'stomping grapes', + 'stretching arm', + 'stretching leg', + 'sucking lolly', + 'surfing crowd', + 'surfing water', + 'surveying', + 'sweeping floor', + 'swimming backstroke', + 'swimming breast stroke', + 'swimming butterfly stroke', + 'swimming front crawl', + 'swimming with 
dolphins', + 'swimming with sharks', + 'swing dancing', + 'swinging baseball bat', + 'swinging on something', + 'sword fighting', + 'sword swallowing', + 'tackling', + 'tagging graffiti', + 'tai chi', + 'taking photo', + 'talking on cell phone', + 'tango dancing', + 'tap dancing', + 'tapping guitar', + 'tapping pen', + 'tasting beer', + 'tasting food', + 'tasting wine', + 'testifying', + 'texting', + 'threading needle', + 'throwing axe', + 'throwing ball (not baseball or American football)', + 'throwing discus', + 'throwing knife', + 'throwing snowballs', + 'throwing tantrum', + 'throwing water balloon', + 'tickling', + 'tie dying', + 'tightrope walking', + 'tiptoeing', + 'tobogganing', + 'tossing coin', + 'tossing salad', + 'training dog', + 'trapezing', + 'treating wood', + 'trimming or shaving beard', + 'trimming shrubs', + 'trimming trees', + 'triple jump', + 'twiddling fingers', + 'tying bow tie', + 'tying knot (not on a tie)', + 'tying necktie', + 'tying shoe laces', + 'unboxing', + 'uncorking champagne', + 'unloading truck', + 'using a microscope', + 'using a paint roller', + 'using a power drill', + 'using a sledge hammer', + 'using a wrench', + 'using atm', + 'using bagging machine', + 'using circular saw', + 'using inhaler', + 'using megaphone', + 'using puppets', + 'using remote controller (not gaming)', + 'using segway', + 'vacuuming car', + 'vacuuming floor', + 'visiting the zoo', + 'wading through mud', + 'wading through water', + 'waiting in line', + 'waking up', + 'walking on stilts', + 'walking the dog', + 'walking through snow', + 'walking with crutches', + 'washing dishes', + 'washing feet', + 'washing hair', + 'washing hands', + 'watching tv', + 'water skiing', + 'water sliding', + 'watering plants', + 'waving hand', + 'waxing armpits', + 'waxing back', + 'waxing chest', + 'waxing eyebrows', + 'waxing legs', + 'weaving basket', + 'weaving fabric', + 'welding', + 'whistling', + 'windsurfing', + 'winking', + 'wood burning (art)', + 'wrapping 
present', + 'wrestling', + 'writing', + 'yarn spinning', + 'yawning', + 'yoga', + 'zumba' +] + +templates = [ + 'a photo of {}.', + 'a photo of a person {}.', + 'a photo of a person using {}.', + 'a photo of a person doing {}.', + 'a photo of a person during {}.', + 'a photo of a person performing {}.', + 'a photo of a person practicing {}.', + 'a video of {}.', + 'a video of a person {}.', + 'a video of a person using {}.', + 'a video of a person doing {}.', + 'a video of a person during {}.', + 'a video of a person performing {}.', + 'a video of a person practicing {}.', + 'a example of {}.', + 'a example of a person {}.', + 'a example of a person using {}.', + 'a example of a person doing {}.', + 'a example of a person during {}.', + 'a example of a person performing {}.', + 'a example of a person practicing {}.', + 'a demonstration of {}.', + 'a demonstration of a person {}.', + 'a demonstration of a person using {}.', + 'a demonstration of a person doing {}.', + 'a demonstration of a person during {}.', + 'a demonstration of a person performing {}.', + 'a demonstration of a person practicing {}.', +] +``` + + + +## MNIST + +```bash +classes = [ + '0', + '1', + '2', + '3', + '4', + '5', + '6', + '7', + '8', + '9', +] + +templates = [ + 'a photo of the number: "{}".', +] +``` + + + +## OxfordPets + +```bash +classes = [ + 'Abyssinian', + 'Bengal', + 'Birman', + 'Bombay', + 'British Shorthair', + 'Egyptian Mau', + 'Maine Coon', + 'Persian', + 'Ragdoll', + 'Russian Blue', + 'Siamese', + 'Sphynx', + 'american bulldog', + 'american pit bull terrier', + 'basset hound', + 'beagle', + 'boxer', + 'chihuahua', + 'english cocker spaniel', + 'english setter', + 'german shorthaired', + 'great pyrenees', + 'havanese', + 'japanese chin', + 'keeshond', + 'leonberger', + 'miniature pinscher', + 'newfoundland', + 'pomeranian', + 'pug', + 'saint bernard', + 'samoyed', + 'scottish terrier', + 'shiba inu', + 'staffordshire bull terrier', + 'wheaten terrier', + 'yorkshire terrier', 
+] + +templates = [ + 'a photo of a {}, a type of pet.', +] +``` + + + +## PascalVOC2007 + +```bash +classes = [ + 'aeroplane', + 'bicycle', + 'bird', + 'boat', + 'bottle', + 'bus', + 'car', + 'cat', + 'chair', + 'cow', + 'dog', + 'horse', + 'motorbike', + 'person', + 'sheep', + 'sofa', + 'diningtable', + 'pottedplant', + 'train', + 'tvmonitor', +] + +templates = [ + 'a photo of a {}.', +] +``` + + + +## PatchCamelyon + +```bash +classes = [ + 'lymph node', + 'lymph node containing metastatic tumor tissue', +] + +templates = [ + 'this is a photo of {}', +] +``` + + + +## RESISC45 + +```bash +classes = [ + 'airplane', + 'airport', + 'baseball diamond', + 'basketball court', + 'beach', + 'bridge', + 'chaparral', + 'church', + 'circular farmland', + 'cloud', + 'commercial area', + 'dense residential', + 'desert', + 'forest', + 'freeway', + 'golf course', + 'ground track field', + 'harbor', + 'industrial area', + 'intersection', + 'island', + 'lake', + 'meadow', + 'medium residential', + 'mobile home park', + 'mountain', + 'overpass', + 'palace', + 'parking lot', + 'railway', + 'railway station', + 'rectangular farmland', + 'river', + 'roundabout', + 'runway', + 'sea ice', + 'ship', + 'snowberg', + 'sparse residential', + 'stadium', + 'storage tank', + 'tennis court', + 'terrace', + 'thermal power station', + 'wetland', +] + +templates = [ + 'satellite imagery of {}.', + 'aerial imagery of {}.', + 'satellite photo of {}.', + 'aerial photo of {}.', + 'satellite view of {}.', + 'aerial view of {}.', + 'satellite imagery of a {}.', + 'aerial imagery of a {}.', + 'satellite photo of a {}.', + 'aerial photo of a {}.', + 'satellite view of a {}.', + 'aerial view of a {}.', + 'satellite imagery of the {}.', + 'aerial imagery of the {}.', + 'satellite photo of the {}.', + 'aerial photo of the {}.', + 'satellite view of the {}.', + 'aerial view of the {}.', +] +``` + + + +## SST2 + +```bash +classes = [ + 'negative', + 'positive', +] + +templates = [ + 'a {} review of a 
movie.', +] +``` + + + +## STL10 + +```bash +classes = [ + 'airplane', + 'bird', + 'car', + 'cat', + 'deer', + 'dog', + 'horse', + 'monkey', + 'ship', + 'truck', +] + +templates = [ + 'a photo of a {}.', + 'a photo of the {}.', +] +``` + + + +## SUN397 + +```bash +classes = [ + 'abbey', + 'airplane cabin', + 'airport terminal', + 'alley', + 'amphitheater', + 'amusement arcade', + 'amusement park', + 'anechoic chamber', + 'apartment building outdoor', + 'apse indoor', + 'aquarium', + 'aqueduct', + 'arch', + 'archive', + 'arrival gate outdoor', + 'art gallery', + 'art school', + 'art studio', + 'assembly line', + 'athletic field outdoor', + 'atrium public', + 'attic', + 'auditorium', + 'auto factory', + 'badlands', + 'badminton court indoor', + 'baggage claim', + 'bakery shop', + 'balcony exterior', + 'balcony interior', + 'ball pit', + 'ballroom', + 'bamboo forest', + 'banquet hall', + 'bar', + 'barn', + 'barndoor', + 'baseball field', + 'basement', + 'basilica', + 'basketball court outdoor', + 'bathroom', + 'batters box', + 'bayou', + 'bazaar indoor', + 'bazaar outdoor', + 'beach', + 'beauty salon', + 'bedroom', + 'berth', + 'biology laboratory', + 'bistro indoor', + 'boardwalk', + 'boat deck', + 'boathouse', + 'bookstore', + 'booth indoor', + 'botanical garden', + 'bow window indoor', + 'bow window outdoor', + 'bowling alley', + 'boxing ring', + 'brewery indoor', + 'bridge', + 'building facade', + 'bullring', + 'burial chamber', + 'bus interior', + 'butchers shop', + 'butte', + 'cabin outdoor', + 'cafeteria', + 'campsite', + 'campus', + 'canal natural', + 'canal urban', + 'candy store', + 'canyon', + 'car interior backseat', + 'car interior frontseat', + 'carrousel', + 'casino indoor', + 'castle', + 'catacomb', + 'cathedral indoor', + 'cathedral outdoor', + 'cavern indoor', + 'cemetery', + 'chalet', + 'cheese factory', + 'chemistry lab', + 'chicken coop indoor', + 'chicken coop outdoor', + 'childs room', + 'church indoor', + 'church outdoor', + 'classroom', + 
'clean room', + 'cliff', + 'cloister indoor', + 'closet', + 'clothing store', + 'coast', + 'cockpit', + 'coffee shop', + 'computer room', + 'conference center', + 'conference room', + 'construction site', + 'control room', + 'control tower outdoor', + 'corn field', + 'corral', + 'corridor', + 'cottage garden', + 'courthouse', + 'courtroom', + 'courtyard', + 'covered bridge exterior', + 'creek', + 'crevasse', + 'crosswalk', + 'cubicle office', + 'dam', + 'delicatessen', + 'dentists office', + 'desert sand', + 'desert vegetation', + 'diner indoor', + 'diner outdoor', + 'dinette home', + 'dinette vehicle', + 'dining car', + 'dining room', + 'discotheque', + 'dock', + 'doorway outdoor', + 'dorm room', + 'driveway', + 'driving range outdoor', + 'drugstore', + 'electrical substation', + 'elevator door', + 'elevator interior', + 'elevator shaft', + 'engine room', + 'escalator indoor', + 'excavation', + 'factory indoor', + 'fairway', + 'fastfood restaurant', + 'field cultivated', + 'field wild', + 'fire escape', + 'fire station', + 'firing range indoor', + 'fishpond', + 'florist shop indoor', + 'food court', + 'forest broadleaf', + 'forest needleleaf', + 'forest path', + 'forest road', + 'formal garden', + 'fountain', + 'galley', + 'game room', + 'garage indoor', + 'garbage dump', + 'gas station', + 'gazebo exterior', + 'general store indoor', + 'general store outdoor', + 'gift shop', + 'golf course', + 'greenhouse indoor', + 'greenhouse outdoor', + 'gymnasium indoor', + 'hangar indoor', + 'hangar outdoor', + 'harbor', + 'hayfield', + 'heliport', + 'herb garden', + 'highway', + 'hill', + 'home office', + 'hospital', + 'hospital room', + 'hot spring', + 'hot tub outdoor', + 'hotel outdoor', + 'hotel room', + 'house', + 'hunting lodge outdoor', + 'ice cream parlor', + 'ice floe', + 'ice shelf', + 'ice skating rink indoor', + 'ice skating rink outdoor', + 'iceberg', + 'igloo', + 'industrial area', + 'inn outdoor', + 'islet', + 'jacuzzi indoor', + 'jail cell', + 'jail indoor', 
+ 'jewelry shop', + 'kasbah', + 'kennel indoor', + 'kennel outdoor', + 'kindergarden classroom', + 'kitchen', + 'kitchenette', + 'labyrinth outdoor', + 'lake natural', + 'landfill', + 'landing deck', + 'laundromat', + 'lecture room', + 'library indoor', + 'library outdoor', + 'lido deck outdoor', + 'lift bridge', + 'lighthouse', + 'limousine interior', + 'living room', + 'lobby', + 'lock chamber', + 'locker room', + 'mansion', + 'manufactured home', + 'market indoor', + 'market outdoor', + 'marsh', + 'martial arts gym', + 'mausoleum', + 'medina', + 'moat water', + 'monastery outdoor', + 'mosque indoor', + 'mosque outdoor', + 'motel', + 'mountain', + 'mountain snowy', + 'movie theater indoor', + 'museum indoor', + 'music store', + 'music studio', + 'nuclear power plant outdoor', + 'nursery', + 'oast house', + 'observatory outdoor', + 'ocean', + 'office', + 'office building', + 'oil refinery outdoor', + 'oilrig', + 'operating room', + 'orchard', + 'outhouse outdoor', + 'pagoda', + 'palace', + 'pantry', + 'park', + 'parking garage indoor', + 'parking garage outdoor', + 'parking lot', + 'parlor', + 'pasture', + 'patio', + 'pavilion', + 'pharmacy', + 'phone booth', + 'physics laboratory', + 'picnic area', + 'pilothouse indoor', + 'planetarium outdoor', + 'playground', + 'playroom', + 'plaza', + 'podium indoor', + 'podium outdoor', + 'pond', + 'poolroom establishment', + 'poolroom home', + 'power plant outdoor', + 'promenade deck', + 'pub indoor', + 'pulpit', + 'putting green', + 'racecourse', + 'raceway', + 'raft', + 'railroad track', + 'rainforest', + 'reception', + 'recreation room', + 'residential neighborhood', + 'restaurant', + 'restaurant kitchen', + 'restaurant patio', + 'rice paddy', + 'riding arena', + 'river', + 'rock arch', + 'rope bridge', + 'ruin', + 'runway', + 'sandbar', + 'sandbox', + 'sauna', + 'schoolhouse', + 'sea cliff', + 'server room', + 'shed', + 'shoe shop', + 'shopfront', + 'shopping mall indoor', + 'shower', + 'skatepark', + 'ski lodge', + 'ski 
resort', + 'ski slope', + 'sky', + 'skyscraper', + 'slum', + 'snowfield', + 'squash court', + 'stable', + 'stadium baseball', + 'stadium football', + 'stage indoor', + 'staircase', + 'street', + 'subway interior', + 'subway station platform', + 'supermarket', + 'sushi bar', + 'swamp', + 'swimming pool indoor', + 'swimming pool outdoor', + 'synagogue indoor', + 'synagogue outdoor', + 'television studio', + 'temple east asia', + 'temple south asia', + 'tennis court indoor', + 'tennis court outdoor', + 'tent outdoor', + 'theater indoor procenium', + 'theater indoor seats', + 'thriftshop', + 'throne room', + 'ticket booth', + 'toll plaza', + 'topiary garden', + 'tower', + 'toyshop', + 'track outdoor', + 'train railway', + 'train station platform', + 'tree farm', + 'tree house', + 'trench', + 'underwater coral reef', + 'utility room', + 'valley', + 'van interior', + 'vegetable garden', + 'veranda', + 'veterinarians office', + 'viaduct', + 'videostore', + 'village', + 'vineyard', + 'volcano', + 'volleyball court indoor', + 'volleyball court outdoor', + 'waiting room', + 'warehouse indoor', + 'water tower', + 'waterfall block', + 'waterfall fan', + 'waterfall plunge', + 'watering hole', + 'wave', + 'wet bar', + 'wheat field', + 'wind farm', + 'windmill', + 'wine cellar barrel storage', + 'wine cellar bottle storage', + 'wrestling ring indoor', + 'yard', + 'youth hostel', +] + +templates = [ + 'a photo of a {}.', + 'a photo of the {}.', +] +``` + + + +## StanfordCars + +```bash +classes = [ + 'AM General Hummer SUV 2000', + 'Acura RL Sedan 2012', + 'Acura TL Sedan 2012', + 'Acura TL Type-S 2008', + 'Acura TSX Sedan 2012', + 'Acura Integra Type R 2001', + 'Acura ZDX Hatchback 2012', + 'Aston Martin V8 Vantage Convertible 2012', + 'Aston Martin V8 Vantage Coupe 2012', + 'Aston Martin Virage Convertible 2012', + 'Aston Martin Virage Coupe 2012', + 'Audi RS 4 Convertible 2008', + 'Audi A5 Coupe 2012', + 'Audi TTS Coupe 2012', + 'Audi R8 Coupe 2012', + 'Audi V8 Sedan 1994', + 
'Audi 100 Sedan 1994', + 'Audi 100 Wagon 1994', + 'Audi TT Hatchback 2011', + 'Audi S6 Sedan 2011', + 'Audi S5 Convertible 2012', + 'Audi S5 Coupe 2012', + 'Audi S4 Sedan 2012', + 'Audi S4 Sedan 2007', + 'Audi TT RS Coupe 2012', + 'BMW ActiveHybrid 5 Sedan 2012', + 'BMW 1 Series Convertible 2012', + 'BMW 1 Series Coupe 2012', + 'BMW 3 Series Sedan 2012', + 'BMW 3 Series Wagon 2012', + 'BMW 6 Series Convertible 2007', + 'BMW X5 SUV 2007', + 'BMW X6 SUV 2012', + 'BMW M3 Coupe 2012', + 'BMW M5 Sedan 2010', + 'BMW M6 Convertible 2010', + 'BMW X3 SUV 2012', + 'BMW Z4 Convertible 2012', + 'Bentley Continental Supersports Conv. Convertible 2012', + 'Bentley Arnage Sedan 2009', + 'Bentley Mulsanne Sedan 2011', + 'Bentley Continental GT Coupe 2012', + 'Bentley Continental GT Coupe 2007', + 'Bentley Continental Flying Spur Sedan 2007', + 'Bugatti Veyron 16.4 Convertible 2009', + 'Bugatti Veyron 16.4 Coupe 2009', + 'Buick Regal GS 2012', + 'Buick Rainier SUV 2007', + 'Buick Verano Sedan 2012', + 'Buick Enclave SUV 2012', + 'Cadillac CTS-V Sedan 2012', + 'Cadillac SRX SUV 2012', + 'Cadillac Escalade EXT Crew Cab 2007', + 'Chevrolet Silverado 1500 Hybrid Crew Cab 2012', + 'Chevrolet Corvette Convertible 2012', + 'Chevrolet Corvette ZR1 2012', + 'Chevrolet Corvette Ron Fellows Edition Z06 2007', + 'Chevrolet Traverse SUV 2012', + 'Chevrolet Camaro Convertible 2012', + 'Chevrolet HHR SS 2010', + 'Chevrolet Impala Sedan 2007', + 'Chevrolet Tahoe Hybrid SUV 2012', + 'Chevrolet Sonic Sedan 2012', + 'Chevrolet Express Cargo Van 2007', + 'Chevrolet Avalanche Crew Cab 2012', + 'Chevrolet Cobalt SS 2010', + 'Chevrolet Malibu Hybrid Sedan 2010', + 'Chevrolet TrailBlazer SS 2009', + 'Chevrolet Silverado 2500HD Regular Cab 2012', + 'Chevrolet Silverado 1500 Classic Extended Cab 2007', + 'Chevrolet Express Van 2007', + 'Chevrolet Monte Carlo Coupe 2007', + 'Chevrolet Malibu Sedan 2007', + 'Chevrolet Silverado 1500 Extended Cab 2012', + 'Chevrolet Silverado 1500 Regular Cab 2012', + 
'Chrysler Aspen SUV 2009', + 'Chrysler Sebring Convertible 2010', + 'Chrysler Town and Country Minivan 2012', + 'Chrysler 300 SRT-8 2010', + 'Chrysler Crossfire Convertible 2008', + 'Chrysler PT Cruiser Convertible 2008', + 'Daewoo Nubira Wagon 2002', + 'Dodge Caliber Wagon 2012', + 'Dodge Caliber Wagon 2007', + 'Dodge Caravan Minivan 1997', + 'Dodge Ram Pickup 3500 Crew Cab 2010', + 'Dodge Ram Pickup 3500 Quad Cab 2009', + 'Dodge Sprinter Cargo Van 2009', + 'Dodge Journey SUV 2012', + 'Dodge Dakota Crew Cab 2010', + 'Dodge Dakota Club Cab 2007', + 'Dodge Magnum Wagon 2008', + 'Dodge Challenger SRT8 2011', + 'Dodge Durango SUV 2012', + 'Dodge Durango SUV 2007', + 'Dodge Charger Sedan 2012', + 'Dodge Charger SRT-8 2009', + 'Eagle Talon Hatchback 1998', + 'FIAT 500 Abarth 2012', + 'FIAT 500 Convertible 2012', + 'Ferrari FF Coupe 2012', + 'Ferrari California Convertible 2012', + 'Ferrari 458 Italia Convertible 2012', + 'Ferrari 458 Italia Coupe 2012', + 'Fisker Karma Sedan 2012', + 'Ford F-450 Super Duty Crew Cab 2012', + 'Ford Mustang Convertible 2007', + 'Ford Freestar Minivan 2007', + 'Ford Expedition EL SUV 2009', + 'Ford Edge SUV 2012', + 'Ford Ranger SuperCab 2011', + 'Ford GT Coupe 2006', + 'Ford F-150 Regular Cab 2012', + 'Ford F-150 Regular Cab 2007', + 'Ford Focus Sedan 2007', + 'Ford E-Series Wagon Van 2012', + 'Ford Fiesta Sedan 2012', + 'GMC Terrain SUV 2012', + 'GMC Savana Van 2012', + 'GMC Yukon Hybrid SUV 2012', + 'GMC Acadia SUV 2012', + 'GMC Canyon Extended Cab 2012', + 'Geo Metro Convertible 1993', + 'HUMMER H3T Crew Cab 2010', + 'HUMMER H2 SUT Crew Cab 2009', + 'Honda Odyssey Minivan 2012', + 'Honda Odyssey Minivan 2007', + 'Honda Accord Coupe 2012', + 'Honda Accord Sedan 2012', + 'Hyundai Veloster Hatchback 2012', + 'Hyundai Santa Fe SUV 2012', + 'Hyundai Tucson SUV 2012', + 'Hyundai Veracruz SUV 2012', + 'Hyundai Sonata Hybrid Sedan 2012', + 'Hyundai Elantra Sedan 2007', + 'Hyundai Accent Sedan 2012', + 'Hyundai Genesis Sedan 2012', + 'Hyundai 
Sonata Sedan 2012', + 'Hyundai Elantra Touring Hatchback 2012', + 'Hyundai Azera Sedan 2012', + 'Infiniti G Coupe IPL 2012', + 'Infiniti QX56 SUV 2011', + 'Isuzu Ascender SUV 2008', + 'Jaguar XK XKR 2012', + 'Jeep Patriot SUV 2012', + 'Jeep Wrangler SUV 2012', + 'Jeep Liberty SUV 2012', + 'Jeep Grand Cherokee SUV 2012', + 'Jeep Compass SUV 2012', + 'Lamborghini Reventon Coupe 2008', + 'Lamborghini Aventador Coupe 2012', + 'Lamborghini Gallardo LP 570-4 Superleggera 2012', + 'Lamborghini Diablo Coupe 2001', + 'Land Rover Range Rover SUV 2012', + 'Land Rover LR2 SUV 2012', + 'Lincoln Town Car Sedan 2011', + 'MINI Cooper Roadster Convertible 2012', + 'Maybach Landaulet Convertible 2012', + 'Mazda Tribute SUV 2011', + 'McLaren MP4-12C Coupe 2012', + 'Mercedes-Benz 300-Class Convertible 1993', + 'Mercedes-Benz C-Class Sedan 2012', + 'Mercedes-Benz SL-Class Coupe 2009', + 'Mercedes-Benz E-Class Sedan 2012', + 'Mercedes-Benz S-Class Sedan 2012', + 'Mercedes-Benz Sprinter Van 2012', + 'Mitsubishi Lancer Sedan 2012', + 'Nissan Leaf Hatchback 2012', + 'Nissan NV Passenger Van 2012', + 'Nissan Juke Hatchback 2012', + 'Nissan 240SX Coupe 1998', + 'Plymouth Neon Coupe 1999', + 'Porsche Panamera Sedan 2012', + 'Ram C/V Cargo Van Minivan 2012', + 'Rolls-Royce Phantom Drophead Coupe Convertible 2012', + 'Rolls-Royce Ghost Sedan 2012', + 'Rolls-Royce Phantom Sedan 2012', + 'Scion xD Hatchback 2012', + 'Spyker C8 Convertible 2009', + 'Spyker C8 Coupe 2009', + 'Suzuki Aerio Sedan 2007', + 'Suzuki Kizashi Sedan 2012', + 'Suzuki SX4 Hatchback 2012', + 'Suzuki SX4 Sedan 2012', + 'Tesla Model S Sedan 2012', + 'Toyota Sequoia SUV 2012', + 'Toyota Camry Sedan 2012', + 'Toyota Corolla Sedan 2012', + 'Toyota 4Runner SUV 2012', + 'Volkswagen Golf Hatchback 2012', + 'Volkswagen Golf Hatchback 1991', + 'Volkswagen Beetle Hatchback 2012', + 'Volvo C30 Hatchback 2012', + 'Volvo 240 Sedan 1993', + 'Volvo XC90 SUV 2007', + 'smart fortwo Convertible 2012', +] + +templates = [ + 'a photo of a {}.', + 
'a photo of the {}.', + 'a photo of my {}.', + 'i love my {}!', + 'a photo of my dirty {}.', + 'a photo of my clean {}.', + 'a photo of my new {}.', + 'a photo of my old {}.', +] +``` + + + +## UCF101 + +```bash +classes = [ + 'Apply Eye Makeup', + 'Apply Lipstick', + 'Archery', + 'Baby Crawling', + 'Balance Beam', + 'Band Marching', + 'Baseball Pitch', + 'Basketball', + 'Basketball Dunk', + 'Bench Press', + 'Biking', + 'Billiards', + 'Blow Dry Hair', + 'Blowing Candles', + 'Body Weight Squats', + 'Bowling', + 'Boxing Punching Bag', + 'Boxing Speed Bag', + 'Breast Stroke', + 'Brushing Teeth', + 'Clean And Jerk', + 'Cliff Diving', + 'Cricket Bowling', + 'Cricket Shot', + 'Cutting In Kitchen', + 'Diving', + 'Drumming', + 'Fencing', + 'Field Hockey Penalty', + 'Floor Gymnastics', + 'Frisbee Catch', + 'Front Crawl', + 'Golf Swing', + 'Haircut', + 'Hammer Throw', + 'Hammering', + 'Hand Stand Pushups', + 'Handstand Walking', + 'Head Massage', + 'High Jump', + 'Horse Race', + 'Horse Riding', + 'Hula Hoop', + 'Ice Dancing', + 'Javelin Throw', + 'Juggling Balls', + 'Jump Rope', + 'Jumping Jack', + 'Kayaking', + 'Knitting', + 'Long Jump', + 'Lunges', + 'Military Parade', + 'Mixing', + 'Mopping Floor', + 'Nunchucks', + 'Parallel Bars', + 'Pizza Tossing', + 'Playing Cello', + 'Playing Daf', + 'Playing Dhol', + 'Playing Flute', + 'Playing Guitar', + 'Playing Piano', + 'Playing Sitar', + 'Playing Tabla', + 'Playing Violin', + 'Pole Vault', + 'Pommel Horse', + 'Pull Ups', + 'Punch', + 'Push Ups', + 'Rafting', + 'Rock Climbing Indoor', + 'Rope Climbing', + 'Rowing', + 'Salsa Spin', + 'Shaving Beard', + 'Shotput', + 'Skate Boarding', + 'Skiing', + 'Skijet', + 'Sky Diving', + 'Soccer Juggling', + 'Soccer Penalty', + 'Still Rings', + 'Sumo Wrestling', + 'Surfing', + 'Swing', + 'Table Tennis Shot', + 'Tai Chi', + 'Tennis Swing', + 'Throw Discus', + 'Trampoline Jumping', + 'Typing', + 'Uneven Bars', + 'Volleyball Spiking', + 'Walking With Dog', + 'Wall Pushups', + 'Writing On Board', + 
'Yo Yo', +] + +templates = [ + 'a photo of a person {}.', + 'a video of a person {}.', + 'a example of a person {}.', + 'a demonstration of a person {}.', + 'a photo of the person {}.', + 'a video of the person {}.', + 'a example of the person {}.', + 'a demonstration of the person {}.', + 'a photo of a person using {}.', + 'a video of a person using {}.', + 'a example of a person using {}.', + 'a demonstration of a person using {}.', + 'a photo of the person using {}.', + 'a video of the person using {}.', + 'a example of the person using {}.', + 'a demonstration of the person using {}.', + 'a photo of a person doing {}.', + 'a video of a person doing {}.', + 'a example of a person doing {}.', + 'a demonstration of a person doing {}.', + 'a photo of the person doing {}.', + 'a video of the person doing {}.', + 'a example of the person doing {}.', + 'a demonstration of the person doing {}.', + 'a photo of a person during {}.', + 'a video of a person during {}.', + 'a example of a person during {}.', + 'a demonstration of a person during {}.', + 'a photo of the person during {}.', + 'a video of the person during {}.', + 'a example of the person during {}.', + 'a demonstration of the person during {}.', + 'a photo of a person performing {}.', + 'a video of a person performing {}.', + 'a example of a person performing {}.', + 'a demonstration of a person performing {}.', + 'a photo of the person performing {}.', + 'a video of the person performing {}.', + 'a example of the person performing {}.', + 'a demonstration of the person performing {}.', + 'a photo of a person practicing {}.', + 'a video of a person practicing {}.', + 'a example of a person practicing {}.', + 'a demonstration of a person practicing {}.', + 'a photo of the person practicing {}.', + 'a video of the person practicing {}.', + 'a example of the person practicing {}.', + 'a demonstration of the person practicing {}.', +] +``` + + diff --git a/CLIP/data/rendered-sst2.md b/CLIP/data/rendered-sst2.md 
new file mode 100644 index 0000000000000000000000000000000000000000..d27454caf046d8d699f87b8cb8404a05e488dbef --- /dev/null +++ b/CLIP/data/rendered-sst2.md @@ -0,0 +1,11 @@ +# The Rendered SST2 Dataset + +In the paper, we used an image classification dataset called Rendered SST2, to evaluate the model's capability on optical character recognition. To do so, we rendered the sentences in the [Stanford Sentiment Treebank v2](https://nlp.stanford.edu/sentiment/treebank.html) dataset and used those as the input to the CLIP image encoder. + +The following command will download a 131MB archive containing the images and extract into a subdirectory `rendered-sst2`: + +```bash +wget https://openaipublic.azureedge.net/clip/data/rendered-sst2.tgz +tar zxvf rendered-sst2.tgz +``` + diff --git a/CLIP/data/yfcc100m.md b/CLIP/data/yfcc100m.md new file mode 100644 index 0000000000000000000000000000000000000000..06083ef9a613b5d360e87c3f395c2a16c6e9208e --- /dev/null +++ b/CLIP/data/yfcc100m.md @@ -0,0 +1,14 @@ +# The YFCC100M Subset + +In the paper, we performed a dataset ablation using a subset of the YFCC100M dataset and showed that the performance remained largely similar. + +The subset contains 14,829,396 images, about 15% of the full dataset, which have been filtered to only keep those with natural language titles and/or descriptions in English. + +We provide the list of (line number, photo identifier, photo hash) of each image contained in this subset. These correspond to the first three columns in the dataset's metadata TSV file. + +```bash +wget https://openaipublic.azureedge.net/clip/data/yfcc100m_subset_data.tsv.bz2 +bunzip2 yfcc100m_subset_data.tsv.bz2 +``` + +Use of the underlying media files is subject to the Creative Commons licenses chosen by their creators/uploaders. For more information about the YFCC100M dataset, visit [the official website](https://multimediacommons.wordpress.com/yfcc100m-core-dataset/).
\ No newline at end of file diff --git a/CLIP/model-card.md b/CLIP/model-card.md new file mode 100644 index 0000000000000000000000000000000000000000..126d84511377c66b5bf16480bacca78e89938227 --- /dev/null +++ b/CLIP/model-card.md @@ -0,0 +1,120 @@ +# Model Card: CLIP + +Inspired by [Model Cards for Model Reporting (Mitchell et al.)](https://arxiv.org/abs/1810.03993) and [Lessons from Archives (Jo & Gebru)](https://arxiv.org/pdf/1912.10389.pdf), we’re providing some accompanying information about the multimodal model. + +## Model Details + +The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner. It was not developed for general model deployment - to deploy models like CLIP, researchers will first need to carefully study their capabilities in relation to the specific context they’re being deployed within. + +### Model Date + +January 2021 + +### Model Type + +The base model uses a ResNet50 with several modifications as an image encoder and uses a masked self-attention Transformer as a text encoder. These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss. There is also a variant of the model where the ResNet image encoder is replaced with a Vision Transformer. + +### Model Versions + +Initially, we’ve released one CLIP model based on the Vision Transformer architecture equivalent to ViT-B/32, along with the RN50 model, using the architecture equivalent to ResNet-50. + +As part of the staged release process, we have also released the RN101 model, as well as RN50x4, a RN50 scaled up 4x according to the [EfficientNet](https://arxiv.org/abs/1905.11946) scaling rule. In July 2021, we additionally released the RN50x16 and ViT-B/16 models, and in January 2022, the RN50x64 and ViT-L/14 models were released.
+ +Please see the paper linked below for further details about their specification. + +### Documents + +- [Blog Post](https://openai.com/blog/clip/) +- [CLIP Paper](https://arxiv.org/abs/2103.00020) + + + +## Model Use + +### Intended Use + +The model is intended as a research output for research communities. We hope that this model will enable researchers to better understand and explore zero-shot, arbitrary image classification. We also hope it can be used for interdisciplinary studies of the potential impact of such models - the CLIP paper includes a discussion of potential downstream impacts to provide an example for this sort of analysis. + +#### Primary intended uses + +The primary intended users of these models are AI researchers. + +We primarily imagine the model will be used by researchers to better understand robustness, generalization, and other capabilities, biases, and constraints of computer vision models. + +### Out-of-Scope Use Cases + +**Any** deployed use case of the model - whether commercial or not - is currently out of scope. Non-deployed use cases such as image search in a constrained environment, are also not recommended unless there is thorough in-domain testing of the model with a specific, fixed class taxonomy. This is because our safety assessment demonstrated a high need for task specific testing especially given the variability of CLIP’s performance with different class taxonomies. This makes untested and unconstrained deployment of the model in any use case currently potentially harmful. + +Certain use cases which would fall under the domain of surveillance and facial recognition are always out-of-scope regardless of performance of the model. This is because the use of artificial intelligence for tasks such as these can be premature currently given the lack of testing norms and checks to ensure its fair use. 
+ +Since the model has not been purposefully trained in or evaluated on any languages other than English, its use should be limited to English language use cases. + + + +## Data + +The model was trained on publicly available image-caption data. This was done through a combination of crawling a handful of websites and using commonly-used pre-existing image datasets such as [YFCC100M](http://projects.dfki.uni-kl.de/yfcc100m/). A large portion of the data comes from our crawling of the internet. This means that the data is more representative of people and societies most connected to the internet which tend to skew towards more developed nations, and younger, male users. + +### Data Mission Statement + +Our goal with building this dataset was to test out robustness and generalizability in computer vision tasks. As a result, the focus was on gathering large quantities of data from different publicly-available internet data sources. The data was gathered in a mostly non-interventionist manner. However, we only crawled websites that had policies against excessively violent and adult images and allowed us to filter out such content. We do not intend for this dataset to be used as the basis for any commercial or deployed model and will not be releasing the dataset. + + + +## Performance and Limitations + +### Performance + +We have evaluated the performance of CLIP on a wide range of benchmarks across a variety of computer vision datasets, ranging from OCR to texture recognition to fine-grained classification.
The paper describes model performance on the following datasets: + +- Food101 +- CIFAR10 +- CIFAR100 +- Birdsnap +- SUN397 +- Stanford Cars +- FGVC Aircraft +- VOC2007 +- DTD +- Oxford-IIIT Pet dataset +- Caltech101 +- Flowers102 +- MNIST +- SVHN +- IIIT5K +- Hateful Memes +- SST-2 +- UCF101 +- Kinetics700 +- Country211 +- CLEVR Counting +- KITTI Distance +- STL-10 +- RareAct +- Flickr30 +- MSCOCO +- ImageNet +- ImageNet-A +- ImageNet-R +- ImageNet Sketch +- ObjectNet (ImageNet Overlap) +- Youtube-BB +- ImageNet-Vid + +## Limitations + +CLIP and our analysis of it have a number of limitations. CLIP currently struggles with respect to certain tasks such as fine-grained classification and counting objects. CLIP also poses issues with regard to fairness and bias which we discuss in the paper and briefly in the next section. Additionally, our approach to testing CLIP also has an important limitation: in many cases we have used linear probes to evaluate the performance of CLIP and there is evidence suggesting that linear probes can underestimate model performance. + +### Bias and Fairness + +We find that the performance of CLIP - and the specific biases it exhibits - can depend significantly on class design and the choices one makes for categories to include and exclude. We tested the risk of certain kinds of denigration with CLIP by classifying images of people from [Fairface](https://arxiv.org/abs/1908.04913) into crime-related and non-human animal categories. We found significant disparities with respect to race and gender. Additionally, we found that these disparities could shift based on how the classes were constructed. (Details captured in the Broader Impacts Section in the paper). + +We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics.
We found accuracy >96% across all races for gender classification with ‘Middle Eastern’ having the highest accuracy (98.4%) and ‘White’ having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. + + + +## Feedback + +### Where to send questions or comments about the model + +Please use [this Google Form](https://forms.gle/Uv7afRH5dvY34ZEs9) diff --git a/CLIP/requirements.txt b/CLIP/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b98c33f3a0e09ddf982606430472de3061c6e9f --- /dev/null +++ b/CLIP/requirements.txt @@ -0,0 +1,5 @@ +ftfy +regex +tqdm +torch +torchvision diff --git a/CLIP/setup.py b/CLIP/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..c9ea7d0d2f3d2fcf66d6f6e2aa0eb1a97a524bb6 --- /dev/null +++ b/CLIP/setup.py @@ -0,0 +1,21 @@ +import os + +import pkg_resources +from setuptools import setup, find_packages + +setup( + name="clip", + py_modules=["clip"], + version="1.0", + description="", + author="OpenAI", + packages=find_packages(exclude=["tests*"]), + install_requires=[ + str(r) + for r in pkg_resources.parse_requirements( + open(os.path.join(os.path.dirname(__file__), "requirements.txt")) + ) + ], + include_package_data=True, + extras_require={'dev': ['pytest']}, +) diff --git a/CLIP/tests/test_consistency.py b/CLIP/tests/test_consistency.py new file mode 100644 index 0000000000000000000000000000000000000000..f2c6fd4fe9074143803e0eb6c99fa02a47632094 --- /dev/null +++ b/CLIP/tests/test_consistency.py @@ -0,0 +1,25 @@ +import numpy as np +import pytest +import torch +from PIL import Image + +import clip + + +@pytest.mark.parametrize('model_name', clip.available_models()) +def 
test_consistency(model_name): + device = "cpu" + jit_model, transform = clip.load(model_name, device=device, jit=True) + py_model, _ = clip.load(model_name, device=device, jit=False) + + image = transform(Image.open("CLIP.png")).unsqueeze(0).to(device) + text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) + + with torch.no_grad(): + logits_per_image, _ = jit_model(image, text) + jit_probs = logits_per_image.softmax(dim=-1).cpu().numpy() + + logits_per_image, _ = py_model(image, text) + py_probs = logits_per_image.softmax(dim=-1).cpu().numpy() + + assert np.allclose(jit_probs, py_probs, atol=0.01, rtol=0.1) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..a24ee28d19663bdc064955f8daea4335bf1c6a58 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Omer Bar-Tal, Dolev Ofri-Amar, Rafail Fridman + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md index b345613bec50a132caba19839ab1610e86a61fb1..67d60d6f09ef7392ca0dab94a1051b8583d6a369 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,86 @@ ---- -title: Text2live -emoji: 🚀 -colorFrom: yellow -colorTo: green -sdk: streamlit -sdk_version: 1.17.0 -app_file: app.py -pinned: false -license: unknown ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# Text2LIVE: Text-Driven Layered Image and Video Editing (ECCV 2022 - Oral) +## [[Project Page]](https://text2live.github.io/) + +[![arXiv](https://img.shields.io/badge/arXiv-Text2LIVE-b31b1b.svg)](https://arxiv.org/abs/2204.02491) +![Pytorch](https://img.shields.io/badge/PyTorch->=1.10.0-Red?logo=pytorch) +[![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/weizmannscience/text2live) + +![teaser](https://user-images.githubusercontent.com/22198039/179798581-ca6f6652-600a-400a-b21b-713fc5c15d56.png) + +**Text2LIVE** is a method for text-driven editing of real-world images and videos, as described in [the paper](https://arxiv.org/abs/2204.02491). + +[//]: # (. It can be used for localized and global edits that change the texture of existing objects or augment the scene with semi-transparent effects (e.g. smoke, fire, snow).) + +[//]: # (### Abstract) +>We present a method for zero-shot, text-driven appearance manipulation in natural images and videos. Specifically, given an input image or video and a target text prompt, our goal is to edit the appearance of existing objects (e.g., object's texture) or augment the scene with new visual effects (e.g., smoke, fire) in a semantically meaningful manner. Our framework trains a generator using an internal dataset of training examples, extracted from a single input (image or video and target text prompt), while leveraging an external pre-trained CLIP model to establish our losses. 
Rather than directly generating the edited output, our key idea is to generate an edit layer (color+opacity) that is composited over the original input. This allows us to constrain the generation process and maintain high fidelity to the original input via novel text-driven losses that are applied directly to the edit layer. Our method neither relies on a pre-trained generator nor requires user-provided edit masks. Thus, it can perform localized, semantic edits on high-resolution natural images and videos across a variety of objects and scenes. + + +## Getting Started +### Installation + +``` +git clone https://github.com/omerbt/Text2LIVE.git +conda create --name text2live python=3.9 +conda activate text2live +pip install -r requirements.txt +``` + +### Download sample images and videos +Download sample images and videos from the DAVIS dataset: +``` +cd Text2LIVE +gdown "https://drive.google.com/uc?id=1osN4PlPkY9uk6pFqJZo8lhJUjTIpa80J&export=download" +unzip data.zip +``` +It will create a folder `data`: +``` +Text2LIVE +├── ... +├── data +│ ├── pretrained_nla_models # NLA models are stored here +│ ├── images # sample images +│ └── videos # sample videos from DAVIS dataset +│ ├── car-turn # contains video frames +│ ├── ... +└── ... +``` +To enforce temporal consistency in video edits, we utilize the Neural Layered Atlases (NLA). Pretrained NLA models are taken from here, and are already inside the `data` folder. + +### Run examples +* Our method is designed to change textures of existing objects / augment the scene with semi-transparent effects (e.g., smoke, fire). It is not designed for adding new objects or significantly deviating from the original spatial layout. +* Training **Text2LIVE** multiple times with the same inputs can lead to slightly different results. +* CLIP sometimes exhibits bias towards specific solutions (see figure 9 in the paper), thus slightly different text prompts may lead to different flavors of edits. 
+ + +The required GPU memory depends on the input image/video size, but you should be good with a Tesla V100 32GB :). +Currently mixed precision introduces some instability in the training process, but it could be added later. + +#### Video Editing +Run the following command to start training +``` +python train_video.py --example_config car-turn_winter.yaml +``` +#### Image Editing +Run the following command to start training +``` +python train_image.py --example_config golden_horse.yaml +``` +Intermediate results will be saved to `results` during optimization. The frequency of saving intermediate results is indicated in the `log_images_freq` flag of the configuration. + +## Sample Results +https://user-images.githubusercontent.com/22198039/179797381-983e0453-2e5d-40e8-983d-578217b358e4.mov + +For more see the [supplementary material](https://text2live.github.io/sm/index.html). + + +## Citation +``` +@inproceedings{bar2022text2live, + title={Text2live: Text-driven layered image and video editing}, + author={Bar-Tal, Omer and Ofri-Amar, Dolev and Fridman, Rafail and Kasten, Yoni and Dekel, Tali}, + booktitle={European Conference on Computer Vision}, + pages={707--723}, + year={2022}, + organization={Springer} +} +``` diff --git a/Text2LIVE-main/CLIP/__pycache__/__init__.cpython-37.pyc b/Text2LIVE-main/CLIP/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d95a06a678a32e3cf1e9ff299d0da28c66d3d2ae Binary files /dev/null and b/Text2LIVE-main/CLIP/__pycache__/__init__.cpython-37.pyc differ diff --git a/Text2LIVE-main/CLIP/clip/__pycache__/__init__.cpython-37.pyc b/Text2LIVE-main/CLIP/clip/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2851ac29176cd88af1e7a0722fa1063f9b048a98 Binary files /dev/null and b/Text2LIVE-main/CLIP/clip/__pycache__/__init__.cpython-37.pyc differ diff --git a/Text2LIVE-main/CLIP/clip/__pycache__/clip.cpython-37.pyc 
b/Text2LIVE-main/CLIP/clip/__pycache__/clip.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa9937d9edfd5239051852abfb4d05f876b8e4ea Binary files /dev/null and b/Text2LIVE-main/CLIP/clip/__pycache__/clip.cpython-37.pyc differ diff --git a/Text2LIVE-main/CLIP/clip/__pycache__/model.cpython-37.pyc b/Text2LIVE-main/CLIP/clip/__pycache__/model.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72a3550449907dc066f615164cd565705e6d4cad Binary files /dev/null and b/Text2LIVE-main/CLIP/clip/__pycache__/model.cpython-37.pyc differ diff --git a/Text2LIVE-main/CLIP/clip/__pycache__/simple_tokenizer.cpython-37.pyc b/Text2LIVE-main/CLIP/clip/__pycache__/simple_tokenizer.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec21247f4b40e22a56c84e9b75b2be1455c0b743 Binary files /dev/null and b/Text2LIVE-main/CLIP/clip/__pycache__/simple_tokenizer.cpython-37.pyc differ diff --git a/Text2LIVE-main/CLIP/clip_explainability/__pycache__/__init__.cpython-37.pyc b/Text2LIVE-main/CLIP/clip_explainability/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d9c5a76b249a763878daec50833f1da786ae27b8 Binary files /dev/null and b/Text2LIVE-main/CLIP/clip_explainability/__pycache__/__init__.cpython-37.pyc differ diff --git a/Text2LIVE-main/CLIP/clip_explainability/__pycache__/auxilary.cpython-37.pyc b/Text2LIVE-main/CLIP/clip_explainability/__pycache__/auxilary.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed93db1d68aba1091285e0d5fef0f1d7176dfc31 Binary files /dev/null and b/Text2LIVE-main/CLIP/clip_explainability/__pycache__/auxilary.cpython-37.pyc differ diff --git a/Text2LIVE-main/CLIP/clip_explainability/__pycache__/clip.cpython-37.pyc b/Text2LIVE-main/CLIP/clip_explainability/__pycache__/clip.cpython-37.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..3b22db6bd7e436b1e91729635cc955e768aac9a6 Binary files /dev/null and b/Text2LIVE-main/CLIP/clip_explainability/__pycache__/clip.cpython-37.pyc differ diff --git a/Text2LIVE-main/CLIP/clip_explainability/__pycache__/model.cpython-37.pyc b/Text2LIVE-main/CLIP/clip_explainability/__pycache__/model.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61271189b14ddb88e3d4d2462acd1f108560312c Binary files /dev/null and b/Text2LIVE-main/CLIP/clip_explainability/__pycache__/model.cpython-37.pyc differ diff --git a/Text2LIVE-main/CLIP/clip_explainability/__pycache__/simple_tokenizer.cpython-37.pyc b/Text2LIVE-main/CLIP/clip_explainability/__pycache__/simple_tokenizer.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94d9a100508c317efff57c71806f5497a0737090 Binary files /dev/null and b/Text2LIVE-main/CLIP/clip_explainability/__pycache__/simple_tokenizer.cpython-37.pyc differ diff --git a/Text2LIVE-main/README.md b/Text2LIVE-main/README.md index 67d60d6f09ef7392ca0dab94a1051b8583d6a369..4f17bd82855082e75e8e25fbe8aa25e1f77d27fa 100644 --- a/Text2LIVE-main/README.md +++ b/Text2LIVE-main/README.md @@ -75,12 +75,10 @@ For more see the [supplementary material](https://text2live.github.io/sm/index.h ## Citation ``` -@inproceedings{bar2022text2live, - title={Text2live: Text-driven layered image and video editing}, - author={Bar-Tal, Omer and Ofri-Amar, Dolev and Fridman, Rafail and Kasten, Yoni and Dekel, Tali}, - booktitle={European Conference on Computer Vision}, - pages={707--723}, - year={2022}, - organization={Springer} +@article{bar2022text2live, + title = {Text2LIVE: Text-Driven Layered Image and Video Editing}, + author = {Bar-Tal, Omer and Ofri-Amar, Dolev and Fridman, Rafail and Kasten, Yoni and Dekel, Tali}, + journal = {arXiv preprint arXiv:2204.02491}, + year = {2022} } ``` diff --git a/Text2LIVE-main/data/data/images/Thumbs.db 
b/Text2LIVE-main/data/data/images/Thumbs.db new file mode 100644 index 0000000000000000000000000000000000000000..ab616b900f90eea83fc0b73d58ea34d73d15780b Binary files /dev/null and b/Text2LIVE-main/data/data/images/Thumbs.db differ diff --git a/Text2LIVE-main/data/data/images/cake.jpeg b/Text2LIVE-main/data/data/images/cake.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..8cce6274941d1715b76777ec42afe75317ada5ff Binary files /dev/null and b/Text2LIVE-main/data/data/images/cake.jpeg differ diff --git a/Text2LIVE-main/data/data/images/horse.jpg b/Text2LIVE-main/data/data/images/horse.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e40f79fcc61360f3fd74e3db9b6d79d2edfe162e Binary files /dev/null and b/Text2LIVE-main/data/data/images/horse.jpg differ diff --git a/Text2LIVE-main/data/data/pretrained_nla_models/blackswan/checkpoint b/Text2LIVE-main/data/data/pretrained_nla_models/blackswan/checkpoint new file mode 100644 index 0000000000000000000000000000000000000000..bf2b12cccf3d155e8044d590f9e9a1a81908103d --- /dev/null +++ b/Text2LIVE-main/data/data/pretrained_nla_models/blackswan/checkpoint @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f50895f39815de243cb8166001771260d9720e6d1bda6289088a0366c7c70f2 +size 14657387 diff --git a/Text2LIVE-main/data/data/pretrained_nla_models/car-turn/checkpoint b/Text2LIVE-main/data/data/pretrained_nla_models/car-turn/checkpoint new file mode 100644 index 0000000000000000000000000000000000000000..5d6281cf9dbe93cda39b960129343bbf99e84fc3 --- /dev/null +++ b/Text2LIVE-main/data/data/pretrained_nla_models/car-turn/checkpoint @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:973953ed6f0f742df9ab3fd21e7369db541689c40a8cd22ddb12f912c2e84b95 +size 14657387 diff --git a/Text2LIVE-main/data/data/pretrained_nla_models/libby/checkpoint b/Text2LIVE-main/data/data/pretrained_nla_models/libby/checkpoint new file mode 100644 index 
0000000000000000000000000000000000000000..74579d6a74972d2511c4611756ecaa36d4643362 --- /dev/null +++ b/Text2LIVE-main/data/data/pretrained_nla_models/libby/checkpoint @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2095f38eacee144175b08fdaaffd52e97991c08f0825be0d8cf836a5297ae535 +size 14657387 diff --git a/Text2LIVE-main/data/data/videos/blackswan/00000.jpg b/Text2LIVE-main/data/data/videos/blackswan/00000.jpg new file mode 100644 index 0000000000000000000000000000000000000000..480ebca5c2bb6c3b3bfe720d0bb06c51fbaf6cd2 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00000.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00001.jpg b/Text2LIVE-main/data/data/videos/blackswan/00001.jpg new file mode 100644 index 0000000000000000000000000000000000000000..19129ecbd0c7114bcb2c4df036bf69d04ddc905c Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00001.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00002.jpg b/Text2LIVE-main/data/data/videos/blackswan/00002.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ae9073304c83ee17a95667e2195406681caf126e Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00002.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00003.jpg b/Text2LIVE-main/data/data/videos/blackswan/00003.jpg new file mode 100644 index 0000000000000000000000000000000000000000..51ce78002ddd67ac653f051341e0c2891dd2bd1b Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00003.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00004.jpg b/Text2LIVE-main/data/data/videos/blackswan/00004.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1c83c2c176d083744c90468978826cbda0989bd1 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00004.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00005.jpg 
b/Text2LIVE-main/data/data/videos/blackswan/00005.jpg new file mode 100644 index 0000000000000000000000000000000000000000..79c7fa9536c0ec7d725b9507158902385bca1828 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00005.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00006.jpg b/Text2LIVE-main/data/data/videos/blackswan/00006.jpg new file mode 100644 index 0000000000000000000000000000000000000000..869d7f1a8173a5be4bf26fbc9be5a924f9841174 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00006.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00007.jpg b/Text2LIVE-main/data/data/videos/blackswan/00007.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f62c3596539b7c15bd54c58dac9c719e1bcc15fa Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00007.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00008.jpg b/Text2LIVE-main/data/data/videos/blackswan/00008.jpg new file mode 100644 index 0000000000000000000000000000000000000000..203cd1e8d3c9ea6cb13b51709c7df164ae661bfd Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00008.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00009.jpg b/Text2LIVE-main/data/data/videos/blackswan/00009.jpg new file mode 100644 index 0000000000000000000000000000000000000000..456ea4ee757f91887b04ac741283503512cb7200 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00009.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00010.jpg b/Text2LIVE-main/data/data/videos/blackswan/00010.jpg new file mode 100644 index 0000000000000000000000000000000000000000..10aec6bee6ed4e15f727a1304bc16e2c1c4b8a31 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00010.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00011.jpg b/Text2LIVE-main/data/data/videos/blackswan/00011.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..7f43134d747c6e8857bac235c19434c60e536b1a Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00011.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00012.jpg b/Text2LIVE-main/data/data/videos/blackswan/00012.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c76854f99e4f16064de7d84431a43608f7fe98cc Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00012.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00013.jpg b/Text2LIVE-main/data/data/videos/blackswan/00013.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7644aec96e80eaa38c7476969c03e7185eebf215 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00013.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00014.jpg b/Text2LIVE-main/data/data/videos/blackswan/00014.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a20409159f966301a35abab5a3aa4c5f9d8293c2 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00014.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00015.jpg b/Text2LIVE-main/data/data/videos/blackswan/00015.jpg new file mode 100644 index 0000000000000000000000000000000000000000..72e24e74c6f5891338383b27d041f5de934f77c9 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00015.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00016.jpg b/Text2LIVE-main/data/data/videos/blackswan/00016.jpg new file mode 100644 index 0000000000000000000000000000000000000000..36e9fe43b77fbde76eb9cafde90ea8c4b2444ff7 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00016.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00017.jpg b/Text2LIVE-main/data/data/videos/blackswan/00017.jpg new file mode 100644 index 0000000000000000000000000000000000000000..89debac0ae909ab11974da3097d0cf5eef72e45d Binary files 
/dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00017.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00018.jpg b/Text2LIVE-main/data/data/videos/blackswan/00018.jpg new file mode 100644 index 0000000000000000000000000000000000000000..78a929c87cf8b09d7fd2c047ac39f3801874eab7 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00018.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00019.jpg b/Text2LIVE-main/data/data/videos/blackswan/00019.jpg new file mode 100644 index 0000000000000000000000000000000000000000..852d83dd1314134ff4e56965d3663020b424c505 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00019.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00020.jpg b/Text2LIVE-main/data/data/videos/blackswan/00020.jpg new file mode 100644 index 0000000000000000000000000000000000000000..649ad9f17a284c8888cdf72c75462d692ddb07c1 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00020.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00021.jpg b/Text2LIVE-main/data/data/videos/blackswan/00021.jpg new file mode 100644 index 0000000000000000000000000000000000000000..201bf0fdc88c8c10afacbfad9ae72afe313bec47 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00021.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00022.jpg b/Text2LIVE-main/data/data/videos/blackswan/00022.jpg new file mode 100644 index 0000000000000000000000000000000000000000..64cc2abdcdda32e7ed3a12c0088c2851ab46ea00 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00022.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00023.jpg b/Text2LIVE-main/data/data/videos/blackswan/00023.jpg new file mode 100644 index 0000000000000000000000000000000000000000..23e2bd5c34128a0de58dabd3c07aa7a88d78b97f Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00023.jpg differ diff --git 
a/Text2LIVE-main/data/data/videos/blackswan/00024.jpg b/Text2LIVE-main/data/data/videos/blackswan/00024.jpg new file mode 100644 index 0000000000000000000000000000000000000000..96906d4df20e0059bd0fb775410ae52981975e1b Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00024.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00025.jpg b/Text2LIVE-main/data/data/videos/blackswan/00025.jpg new file mode 100644 index 0000000000000000000000000000000000000000..05bc0328a652f3a9f57a8da95663a3cfdd3a0c43 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00025.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00026.jpg b/Text2LIVE-main/data/data/videos/blackswan/00026.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6372af9fc7dfd53c969276bf72c1ad8dbe0427c0 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00026.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00027.jpg b/Text2LIVE-main/data/data/videos/blackswan/00027.jpg new file mode 100644 index 0000000000000000000000000000000000000000..90003ecdfbe10232d1afe7bea09157de4828cc28 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00027.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00028.jpg b/Text2LIVE-main/data/data/videos/blackswan/00028.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0d76faebc21479839bcd14889553257fccbd9e3d Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00028.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00029.jpg b/Text2LIVE-main/data/data/videos/blackswan/00029.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8fb266f51c791bfc1ba240af6f238e9f20ee6908 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00029.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00030.jpg 
b/Text2LIVE-main/data/data/videos/blackswan/00030.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a80b463cdd60bbe635e30cb09cbee6a306b4ca6f Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00030.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00031.jpg b/Text2LIVE-main/data/data/videos/blackswan/00031.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2b14e9d508b6c27c08467b3a30fd71e44e7e2372 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00031.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00032.jpg b/Text2LIVE-main/data/data/videos/blackswan/00032.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8bd17e9ca056a7c4b48e8af13c73046805e28294 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00032.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00033.jpg b/Text2LIVE-main/data/data/videos/blackswan/00033.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3973d23c41059e2762b74e5a0dab6ac619efe86d Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00033.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00034.jpg b/Text2LIVE-main/data/data/videos/blackswan/00034.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3f992386d1b475f7b3cb4b7221e5e7704f2f3c64 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00034.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00035.jpg b/Text2LIVE-main/data/data/videos/blackswan/00035.jpg new file mode 100644 index 0000000000000000000000000000000000000000..45bf8db10a4a5270d49ec28136d4243d6207b3a4 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00035.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00036.jpg b/Text2LIVE-main/data/data/videos/blackswan/00036.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..38711dcada724f4cd8ad15eda8a90ced5fd411b5 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00036.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00037.jpg b/Text2LIVE-main/data/data/videos/blackswan/00037.jpg new file mode 100644 index 0000000000000000000000000000000000000000..49658f2ce1b3656732bd7448318d73f9147ee3c5 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00037.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00038.jpg b/Text2LIVE-main/data/data/videos/blackswan/00038.jpg new file mode 100644 index 0000000000000000000000000000000000000000..395df1207af70732578fcba06c85a43543092147 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00038.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00039.jpg b/Text2LIVE-main/data/data/videos/blackswan/00039.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cce617f0d6846c55c5a79191f79d67807e876a63 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00039.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00040.jpg b/Text2LIVE-main/data/data/videos/blackswan/00040.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9ec0ee4bf5ed1b06ccb5e690b2d1ce128ec069d2 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00040.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00041.jpg b/Text2LIVE-main/data/data/videos/blackswan/00041.jpg new file mode 100644 index 0000000000000000000000000000000000000000..eb2362607e98b97eaf39069d962131e4828cd284 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00041.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00042.jpg b/Text2LIVE-main/data/data/videos/blackswan/00042.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ef6555d2428032b74155df334a9bb4345d38f75d Binary files 
/dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00042.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00043.jpg b/Text2LIVE-main/data/data/videos/blackswan/00043.jpg new file mode 100644 index 0000000000000000000000000000000000000000..49a6647feb55d48cb2e7eafced83a8f12d98e494 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00043.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00044.jpg b/Text2LIVE-main/data/data/videos/blackswan/00044.jpg new file mode 100644 index 0000000000000000000000000000000000000000..eafe23a396cbf230f0754c31b89d848794c3cf6a Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00044.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00045.jpg b/Text2LIVE-main/data/data/videos/blackswan/00045.jpg new file mode 100644 index 0000000000000000000000000000000000000000..41934a12a889a4af92fc538c5c523fdb60b04049 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00045.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00046.jpg b/Text2LIVE-main/data/data/videos/blackswan/00046.jpg new file mode 100644 index 0000000000000000000000000000000000000000..89cdb8a90857662639e7d6bcf0eaf35f2f2715e1 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00046.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00047.jpg b/Text2LIVE-main/data/data/videos/blackswan/00047.jpg new file mode 100644 index 0000000000000000000000000000000000000000..41b24f76f2ffb68771bae6cf1f0562de281ae619 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00047.jpg differ diff --git a/Text2LIVE-main/data/data/videos/blackswan/00048.jpg b/Text2LIVE-main/data/data/videos/blackswan/00048.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3564e55654914c5627be7ccbf0f87f837b564a25 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00048.jpg differ diff --git 
a/Text2LIVE-main/data/data/videos/blackswan/00049.jpg b/Text2LIVE-main/data/data/videos/blackswan/00049.jpg new file mode 100644 index 0000000000000000000000000000000000000000..eae5de17160d791d79607b8c5953d273715653eb Binary files /dev/null and b/Text2LIVE-main/data/data/videos/blackswan/00049.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00000.jpg b/Text2LIVE-main/data/data/videos/car-turn/00000.jpg new file mode 100644 index 0000000000000000000000000000000000000000..da7747b86f5ac48f8def2ff5b1e5ba167639cd0e Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00000.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00001.jpg b/Text2LIVE-main/data/data/videos/car-turn/00001.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0714ed5951af3de691d996301766735934695700 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00001.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00002.jpg b/Text2LIVE-main/data/data/videos/car-turn/00002.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c649da56a12084b539e523e5e141bb1274cb6856 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00002.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00003.jpg b/Text2LIVE-main/data/data/videos/car-turn/00003.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a31c6b87f6e08bfd85762de0d89924436967d74c Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00003.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00004.jpg b/Text2LIVE-main/data/data/videos/car-turn/00004.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5f6a45952c930e562522f50d568074c03e42c726 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00004.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00005.jpg b/Text2LIVE-main/data/data/videos/car-turn/00005.jpg new 
file mode 100644 index 0000000000000000000000000000000000000000..f8960bf92266d4d73d29e1251dab89131c0a87ba Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00005.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00006.jpg b/Text2LIVE-main/data/data/videos/car-turn/00006.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f0383450ff251e315fdf49022e160846d30e0653 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00006.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00007.jpg b/Text2LIVE-main/data/data/videos/car-turn/00007.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5e12714d28be493c93827a08eb97eaea01316c3d Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00007.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00008.jpg b/Text2LIVE-main/data/data/videos/car-turn/00008.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8503f027d2f4697731a7f5789ea901e5189be9c8 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00008.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00009.jpg b/Text2LIVE-main/data/data/videos/car-turn/00009.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fca81eb8aef57da21675c959c267f9a72aaa851c Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00009.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00010.jpg b/Text2LIVE-main/data/data/videos/car-turn/00010.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4d83ec00faa83428848ec0ca9513bf4c5ce143eb Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00010.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00011.jpg b/Text2LIVE-main/data/data/videos/car-turn/00011.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b06f7dc38c6ea49c202a88ae2ced748f60b33192 Binary 
files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00011.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00012.jpg b/Text2LIVE-main/data/data/videos/car-turn/00012.jpg new file mode 100644 index 0000000000000000000000000000000000000000..556e853c979b77ac85cf79b4e8767f15e13df94b Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00012.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00013.jpg b/Text2LIVE-main/data/data/videos/car-turn/00013.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8e56fd219435ebee92cf73b94bb4b9c9785acb4a Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00013.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00014.jpg b/Text2LIVE-main/data/data/videos/car-turn/00014.jpg new file mode 100644 index 0000000000000000000000000000000000000000..afa3c581340c8bf628dc793c27915cb41274611f Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00014.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00015.jpg b/Text2LIVE-main/data/data/videos/car-turn/00015.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5fc831b464a3f3fc3eacdbfb427e17d5bb45857b Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00015.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00016.jpg b/Text2LIVE-main/data/data/videos/car-turn/00016.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6b5a1432589cc335d5c7605b041093dcdf2b8611 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00016.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00017.jpg b/Text2LIVE-main/data/data/videos/car-turn/00017.jpg new file mode 100644 index 0000000000000000000000000000000000000000..60b5b07d72be303fc5643c424e5cac04a94cb5bd Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00017.jpg differ diff --git 
a/Text2LIVE-main/data/data/videos/car-turn/00018.jpg b/Text2LIVE-main/data/data/videos/car-turn/00018.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1f0352f296c616b58a1b066fbffc00df713d9d53 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00018.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00019.jpg b/Text2LIVE-main/data/data/videos/car-turn/00019.jpg new file mode 100644 index 0000000000000000000000000000000000000000..099a2e4ea817b8ea3f23ec1294d89d958cdc2324 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00019.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00020.jpg b/Text2LIVE-main/data/data/videos/car-turn/00020.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f1e4353ddf1d95f6af7e3e8709313066f1236b10 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00020.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00021.jpg b/Text2LIVE-main/data/data/videos/car-turn/00021.jpg new file mode 100644 index 0000000000000000000000000000000000000000..56fc13f578b200280c990bc5170e7efc0c2ae4d9 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00021.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00022.jpg b/Text2LIVE-main/data/data/videos/car-turn/00022.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b9593eb4ec46c20c55511a7fbb602617acb0d93b Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00022.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00023.jpg b/Text2LIVE-main/data/data/videos/car-turn/00023.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ba2008e9b262788e65015c91a503eecb38fcd6ce Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00023.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00024.jpg b/Text2LIVE-main/data/data/videos/car-turn/00024.jpg new file 
mode 100644 index 0000000000000000000000000000000000000000..e5cfd8dfbcdb4c91aee98b5e4a8b0a16747d7a57 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00024.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00025.jpg b/Text2LIVE-main/data/data/videos/car-turn/00025.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7e742284486e91d5434ada42ce75ef9cfdeb752c Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00025.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00026.jpg b/Text2LIVE-main/data/data/videos/car-turn/00026.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e3403a63a8d3e05acf5c6b172c1628e8b4ef4252 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00026.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00027.jpg b/Text2LIVE-main/data/data/videos/car-turn/00027.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2c7daa028ab13d9d6b61f926273365b177eb8f4c Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00027.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00028.jpg b/Text2LIVE-main/data/data/videos/car-turn/00028.jpg new file mode 100644 index 0000000000000000000000000000000000000000..088d5125ad32cae1d87273ac0da795c40fa9af46 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00028.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00029.jpg b/Text2LIVE-main/data/data/videos/car-turn/00029.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f804eb0a23b8e5b0031146f7554de573dec5df5f Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00029.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00030.jpg b/Text2LIVE-main/data/data/videos/car-turn/00030.jpg new file mode 100644 index 0000000000000000000000000000000000000000..79351aa7a6eff8a274b6b93ba0617d03dade2838 Binary files 
/dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00030.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00031.jpg b/Text2LIVE-main/data/data/videos/car-turn/00031.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1d6021cb91c39177b35b5ddf298d3c82a3e7c462 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00031.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00032.jpg b/Text2LIVE-main/data/data/videos/car-turn/00032.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4747d0cb7ee962a2c63914745d52c00754699946 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00032.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00033.jpg b/Text2LIVE-main/data/data/videos/car-turn/00033.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d15abf543d9e43b68ab0f7655ba801017ed07f07 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00033.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00034.jpg b/Text2LIVE-main/data/data/videos/car-turn/00034.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4eee5f3c156c18ee43688fe372307153b3e1551c Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00034.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00035.jpg b/Text2LIVE-main/data/data/videos/car-turn/00035.jpg new file mode 100644 index 0000000000000000000000000000000000000000..11f9e19e0d72dcde88ec72b34f5a2eab3dfbbe02 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00035.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00036.jpg b/Text2LIVE-main/data/data/videos/car-turn/00036.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3786da3962662702932e878e12ed8b7a1705a4f4 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00036.jpg differ diff --git 
a/Text2LIVE-main/data/data/videos/car-turn/00037.jpg b/Text2LIVE-main/data/data/videos/car-turn/00037.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2565a6636f240e598af37efee55ef1e9ceec7dcd Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00037.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00038.jpg b/Text2LIVE-main/data/data/videos/car-turn/00038.jpg new file mode 100644 index 0000000000000000000000000000000000000000..811791b27dbd2e805bc0eb26bd11857dd2fbee9e Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00038.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00039.jpg b/Text2LIVE-main/data/data/videos/car-turn/00039.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bcf17ec7eaf2da9c3758a88039029195c96f010a Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00039.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00040.jpg b/Text2LIVE-main/data/data/videos/car-turn/00040.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7bc9464fd1893d8bd6d35cf2016a84a10c451ac4 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00040.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00041.jpg b/Text2LIVE-main/data/data/videos/car-turn/00041.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5767940f8da637b81466d6d102e4b5806baa02dc Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00041.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00042.jpg b/Text2LIVE-main/data/data/videos/car-turn/00042.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5ef85ba9e94dbf9b8a2405d9065c49bcd7db8865 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00042.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00043.jpg b/Text2LIVE-main/data/data/videos/car-turn/00043.jpg new file 
mode 100644 index 0000000000000000000000000000000000000000..eb11e30965b01b389335ec31e15101928c81f49c Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00043.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00044.jpg b/Text2LIVE-main/data/data/videos/car-turn/00044.jpg new file mode 100644 index 0000000000000000000000000000000000000000..92f6191e9d311b5f9d70993dc75545884159de6c Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00044.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00045.jpg b/Text2LIVE-main/data/data/videos/car-turn/00045.jpg new file mode 100644 index 0000000000000000000000000000000000000000..36cb05749f1ade43939ba2818f0a4b2155074325 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00045.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00046.jpg b/Text2LIVE-main/data/data/videos/car-turn/00046.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b8d6dcbfc743c4d27c39e39ad00795f8bf11e63c Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00046.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00047.jpg b/Text2LIVE-main/data/data/videos/car-turn/00047.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e713b076fc01409180f0cb6f0bddd118cc7d2a2a Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00047.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00048.jpg b/Text2LIVE-main/data/data/videos/car-turn/00048.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6865f7a3319d20b61f364e61dbf5c38bfee058b9 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00048.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00049.jpg b/Text2LIVE-main/data/data/videos/car-turn/00049.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c97c150b9f96a6ed23851eacb247f2a7e71f95be Binary files 
/dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00049.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00050.jpg b/Text2LIVE-main/data/data/videos/car-turn/00050.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8ab3239e8db68d5e44caac2c8cd9a6e983a19cb5 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00050.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00051.jpg b/Text2LIVE-main/data/data/videos/car-turn/00051.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f22892560420a721f6d2e70da7260e2604d4c686 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00051.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00052.jpg b/Text2LIVE-main/data/data/videos/car-turn/00052.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0ca96b32f497e28060b47f4c934ef0dd5c2ee575 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00052.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00053.jpg b/Text2LIVE-main/data/data/videos/car-turn/00053.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fa489119a539560239cebbfa57ffadb36fdd1a68 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00053.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00054.jpg b/Text2LIVE-main/data/data/videos/car-turn/00054.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0cb1d726297b69f25aa240b389cfb23ed9806609 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00054.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00055.jpg b/Text2LIVE-main/data/data/videos/car-turn/00055.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b009c8e0725bdfea5379acb30225bc2d400041a0 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00055.jpg differ diff --git 
a/Text2LIVE-main/data/data/videos/car-turn/00056.jpg b/Text2LIVE-main/data/data/videos/car-turn/00056.jpg new file mode 100644 index 0000000000000000000000000000000000000000..249adcc735d87613d24b08fa926f527346c1135d Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00056.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00057.jpg b/Text2LIVE-main/data/data/videos/car-turn/00057.jpg new file mode 100644 index 0000000000000000000000000000000000000000..027569c9125bd704837d1cc0ffd661e7e55687c5 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00057.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00058.jpg b/Text2LIVE-main/data/data/videos/car-turn/00058.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5ab5fce583f3bde86f3ef3c8e7c1ee4a5ae00fb4 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00058.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00059.jpg b/Text2LIVE-main/data/data/videos/car-turn/00059.jpg new file mode 100644 index 0000000000000000000000000000000000000000..848f299ae1008f8852176d9be82d0f34ff15f3b4 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00059.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00060.jpg b/Text2LIVE-main/data/data/videos/car-turn/00060.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fa8e759a83ddbf2864c54330f1513f2f5b5d23a0 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00060.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00061.jpg b/Text2LIVE-main/data/data/videos/car-turn/00061.jpg new file mode 100644 index 0000000000000000000000000000000000000000..05cde86260c3ccd11b5d857b0f8d39f298e142a1 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00061.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00062.jpg b/Text2LIVE-main/data/data/videos/car-turn/00062.jpg new file 
mode 100644 index 0000000000000000000000000000000000000000..b7083517c0e745491b9bf5900aeefb89883c755a Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00062.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00063.jpg b/Text2LIVE-main/data/data/videos/car-turn/00063.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2856d3a55586acdf2c487839ebd007a26ad2a094 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00063.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00064.jpg b/Text2LIVE-main/data/data/videos/car-turn/00064.jpg new file mode 100644 index 0000000000000000000000000000000000000000..48294e110282c36e32e9cec3f3c655ad5b37c068 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00064.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00065.jpg b/Text2LIVE-main/data/data/videos/car-turn/00065.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c4fdfbe78edb0ec39c7f0acf21531d7d70f3a58d Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00065.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00066.jpg b/Text2LIVE-main/data/data/videos/car-turn/00066.jpg new file mode 100644 index 0000000000000000000000000000000000000000..96b8b6c68106f986357966f9cd2a1184728b8e6b Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00066.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00067.jpg b/Text2LIVE-main/data/data/videos/car-turn/00067.jpg new file mode 100644 index 0000000000000000000000000000000000000000..79ef9fd7ea7a080ac1b1224830fd1de9faa2e3f5 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00067.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00068.jpg b/Text2LIVE-main/data/data/videos/car-turn/00068.jpg new file mode 100644 index 0000000000000000000000000000000000000000..842506eed296873488310ba87cea0686c4fa2fd5 Binary files 
/dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00068.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00069.jpg b/Text2LIVE-main/data/data/videos/car-turn/00069.jpg new file mode 100644 index 0000000000000000000000000000000000000000..98229051dba1faa40e5f3f383ac1b6280de8556c Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00069.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00070.jpg b/Text2LIVE-main/data/data/videos/car-turn/00070.jpg new file mode 100644 index 0000000000000000000000000000000000000000..dcf020a7f205591311ad4d8bae9ba012e3027649 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00070.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00071.jpg b/Text2LIVE-main/data/data/videos/car-turn/00071.jpg new file mode 100644 index 0000000000000000000000000000000000000000..475d2d1cfd20ca6a96c39d82c7a8171f2c8ac8ab Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00071.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00072.jpg b/Text2LIVE-main/data/data/videos/car-turn/00072.jpg new file mode 100644 index 0000000000000000000000000000000000000000..42339a5787a3707f24994601d6c1528adb2c73a8 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00072.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00073.jpg b/Text2LIVE-main/data/data/videos/car-turn/00073.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3ab25a0657801bf0131a8374010b69576e85f947 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00073.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00074.jpg b/Text2LIVE-main/data/data/videos/car-turn/00074.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c0f184f0cd94ef0444a99a15a03c8bfb44e8c060 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00074.jpg differ diff --git 
a/Text2LIVE-main/data/data/videos/car-turn/00075.jpg b/Text2LIVE-main/data/data/videos/car-turn/00075.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fa9551372baa78c373201b75fa4e830e204316ec Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00075.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00076.jpg b/Text2LIVE-main/data/data/videos/car-turn/00076.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7174579d71c8beb24d26e51c8ea8fa22e6b1b31c Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00076.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00077.jpg b/Text2LIVE-main/data/data/videos/car-turn/00077.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ecd9daa16f4321f0ed32f24560e17f618c35c1e4 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00077.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00078.jpg b/Text2LIVE-main/data/data/videos/car-turn/00078.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5685fd74247680efd1e53b825e798ae0884a73c9 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00078.jpg differ diff --git a/Text2LIVE-main/data/data/videos/car-turn/00079.jpg b/Text2LIVE-main/data/data/videos/car-turn/00079.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0c7d9c916ecc92a5526b4f120c6edc4b0fc66478 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/car-turn/00079.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00000.jpg b/Text2LIVE-main/data/data/videos/libby/00000.jpg new file mode 100644 index 0000000000000000000000000000000000000000..262fbc11f186e88612579006487af2603bd7cfcf Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00000.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00001.jpg b/Text2LIVE-main/data/data/videos/libby/00001.jpg new file mode 100644 
index 0000000000000000000000000000000000000000..b0b783c148cf5a09165a91dc6f93e693e4ce36b4 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00001.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00002.jpg b/Text2LIVE-main/data/data/videos/libby/00002.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d89e2693d6047b60937a8a54adbd22ddd2cc624f Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00002.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00003.jpg b/Text2LIVE-main/data/data/videos/libby/00003.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bc4b9aab8dbe978eb94769df632a551a0b2a10e5 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00003.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00004.jpg b/Text2LIVE-main/data/data/videos/libby/00004.jpg new file mode 100644 index 0000000000000000000000000000000000000000..66d4bd9691e11b9a57eb25b33b62bce3383d0b68 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00004.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00005.jpg b/Text2LIVE-main/data/data/videos/libby/00005.jpg new file mode 100644 index 0000000000000000000000000000000000000000..90f771de33adec8fffbef77450ceda4cc04eccae Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00005.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00006.jpg b/Text2LIVE-main/data/data/videos/libby/00006.jpg new file mode 100644 index 0000000000000000000000000000000000000000..87119ba51eff6498821c8d101603b815951eeeec Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00006.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00007.jpg b/Text2LIVE-main/data/data/videos/libby/00007.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cdc374c82e56d7ece2434209b956832735bc483d Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00007.jpg 
differ diff --git a/Text2LIVE-main/data/data/videos/libby/00008.jpg b/Text2LIVE-main/data/data/videos/libby/00008.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a2f7d1ddbcc97a405565e64e7f214bf820f55818 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00008.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00009.jpg b/Text2LIVE-main/data/data/videos/libby/00009.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e0237059fae445885051876d561cb7384f9fd82f Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00009.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00010.jpg b/Text2LIVE-main/data/data/videos/libby/00010.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b4485f6751a473ebcb3a0dfdb9ed5a20b17a6222 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00010.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00011.jpg b/Text2LIVE-main/data/data/videos/libby/00011.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2e1202e8794e275ae5ff536dc444c6df4ef8726c Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00011.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00012.jpg b/Text2LIVE-main/data/data/videos/libby/00012.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c68a998d26a24b093f16699b30c64c889a67f3ac Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00012.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00013.jpg b/Text2LIVE-main/data/data/videos/libby/00013.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1740dbbf0c53148fa76f2ca99c2a3b35ea6daf2e Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00013.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00014.jpg b/Text2LIVE-main/data/data/videos/libby/00014.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..6b6457afaa433ba7fa90ab1f4ea4ac657e118362 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00014.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00015.jpg b/Text2LIVE-main/data/data/videos/libby/00015.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e62b81acda832093a868ce8a169f05787de0c439 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00015.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00016.jpg b/Text2LIVE-main/data/data/videos/libby/00016.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6f455f2ca6a748959de9f90e388c932e060e4083 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00016.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00017.jpg b/Text2LIVE-main/data/data/videos/libby/00017.jpg new file mode 100644 index 0000000000000000000000000000000000000000..afdf5b0309fc47d263fe7773b37567bdb6658448 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00017.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00018.jpg b/Text2LIVE-main/data/data/videos/libby/00018.jpg new file mode 100644 index 0000000000000000000000000000000000000000..048532d03780b3818e134de01c7d28da210b1374 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00018.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00019.jpg b/Text2LIVE-main/data/data/videos/libby/00019.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e4fe7ce4c8748c4698196c9c0f79cf3b874fde67 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00019.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00020.jpg b/Text2LIVE-main/data/data/videos/libby/00020.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8760c647d28327321ead49ea754931f573604655 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00020.jpg differ 
diff --git a/Text2LIVE-main/data/data/videos/libby/00021.jpg b/Text2LIVE-main/data/data/videos/libby/00021.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8d86fb68d46fb3cb15ad831f9938b849ef0b0009 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00021.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00022.jpg b/Text2LIVE-main/data/data/videos/libby/00022.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a042f77161d42027cdd8af4a8fb3ef06b9f8cba2 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00022.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00023.jpg b/Text2LIVE-main/data/data/videos/libby/00023.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0c0af8bce10aec7dabbedaebfee66e40576a61c1 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00023.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00024.jpg b/Text2LIVE-main/data/data/videos/libby/00024.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9f1476a71b3401103870490056a0cc88a20cb4ed Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00024.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00025.jpg b/Text2LIVE-main/data/data/videos/libby/00025.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7e3b4f61e6d8551204cebfbd527648ee4eccb69c Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00025.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00026.jpg b/Text2LIVE-main/data/data/videos/libby/00026.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d39ce8c1274c39446a4677043f1ef4763cd35684 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00026.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00027.jpg b/Text2LIVE-main/data/data/videos/libby/00027.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..c16c56c5ab71244902e2466e01ee996bbfc5cd49 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00027.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00028.jpg b/Text2LIVE-main/data/data/videos/libby/00028.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a9bd4d6d6727fd3a45f17a479680fb62d93bd4cb Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00028.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00029.jpg b/Text2LIVE-main/data/data/videos/libby/00029.jpg new file mode 100644 index 0000000000000000000000000000000000000000..36dd5bc53ce919b8cf02c6024b4daa456c47f253 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00029.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00030.jpg b/Text2LIVE-main/data/data/videos/libby/00030.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5def59446602155a294bbdab4634694635ef1265 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00030.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00031.jpg b/Text2LIVE-main/data/data/videos/libby/00031.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8fb9fe13d16f65176ad26c78c59df2432cc14535 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00031.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00032.jpg b/Text2LIVE-main/data/data/videos/libby/00032.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2d24a89caf06da8ca1ed8613b5a81d25b032d7c6 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00032.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00033.jpg b/Text2LIVE-main/data/data/videos/libby/00033.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9ac5e7fe7289c7ca886b6bd5ce6c7df0329169a7 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00033.jpg differ 
diff --git a/Text2LIVE-main/data/data/videos/libby/00034.jpg b/Text2LIVE-main/data/data/videos/libby/00034.jpg new file mode 100644 index 0000000000000000000000000000000000000000..44a90d8f631a7b6857c30026a6a305236d94687d Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00034.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00035.jpg b/Text2LIVE-main/data/data/videos/libby/00035.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8664afe8bcbcd3881451e21962f5d2ed7e5ce334 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00035.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00036.jpg b/Text2LIVE-main/data/data/videos/libby/00036.jpg new file mode 100644 index 0000000000000000000000000000000000000000..106b6a63efa01caefdf1b57bd1f8f91eb93090ba Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00036.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00037.jpg b/Text2LIVE-main/data/data/videos/libby/00037.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e03441d2d141ffacc9c6ae5e97933b3fd94cbd85 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00037.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00038.jpg b/Text2LIVE-main/data/data/videos/libby/00038.jpg new file mode 100644 index 0000000000000000000000000000000000000000..80a2f86fc13459129a65cd58a1dc5b4809468f8f Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00038.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00039.jpg b/Text2LIVE-main/data/data/videos/libby/00039.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6154ba5d5a17b309f6c515ec089293f2b2eebc1e Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00039.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00040.jpg b/Text2LIVE-main/data/data/videos/libby/00040.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..d04f2d1ade517c6caee295a89e092a0f5b7ff6e7 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00040.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00041.jpg b/Text2LIVE-main/data/data/videos/libby/00041.jpg new file mode 100644 index 0000000000000000000000000000000000000000..108a30c14062287878038f72631268c21ea5e74a Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00041.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00042.jpg b/Text2LIVE-main/data/data/videos/libby/00042.jpg new file mode 100644 index 0000000000000000000000000000000000000000..35f00fa08cc970141dddaa50107ac0bdb87728f8 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00042.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00043.jpg b/Text2LIVE-main/data/data/videos/libby/00043.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6dea3f1e2b2d1af91b703a66228d0fc7a5826f40 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00043.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00044.jpg b/Text2LIVE-main/data/data/videos/libby/00044.jpg new file mode 100644 index 0000000000000000000000000000000000000000..732d7effeb858d6a0e6ee8151e149711a2905e05 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00044.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00045.jpg b/Text2LIVE-main/data/data/videos/libby/00045.jpg new file mode 100644 index 0000000000000000000000000000000000000000..61490e7498b8bae2c2bac5f9249341ae730205bf Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00045.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00046.jpg b/Text2LIVE-main/data/data/videos/libby/00046.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1f01109451cadc24753cad5296a0b781c756a2dd Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00046.jpg differ 
diff --git a/Text2LIVE-main/data/data/videos/libby/00047.jpg b/Text2LIVE-main/data/data/videos/libby/00047.jpg new file mode 100644 index 0000000000000000000000000000000000000000..010d90e39d631f8f6a7273ebf48199ad52c263d5 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00047.jpg differ diff --git a/Text2LIVE-main/data/data/videos/libby/00048.jpg b/Text2LIVE-main/data/data/videos/libby/00048.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b97d0b084825e2d2924f40d661fb729c9fad8aa5 Binary files /dev/null and b/Text2LIVE-main/data/data/videos/libby/00048.jpg differ diff --git a/Text2LIVE-main/datasets/__pycache__/__init__.cpython-37.pyc b/Text2LIVE-main/datasets/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f5cf9683c9f87c24ec5a14b5a3dbbcbfd917dbb Binary files /dev/null and b/Text2LIVE-main/datasets/__pycache__/__init__.cpython-37.pyc differ diff --git a/Text2LIVE-main/datasets/__pycache__/video_dataset.cpython-37.pyc b/Text2LIVE-main/datasets/__pycache__/video_dataset.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67a402baf6dd590b722a9b58db93ab8795740195 Binary files /dev/null and b/Text2LIVE-main/datasets/__pycache__/video_dataset.cpython-37.pyc differ diff --git a/Text2LIVE-main/models/__pycache__/__init__.cpython-37.pyc b/Text2LIVE-main/models/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..543703474b350456e696a83610bdbc640f8c5fc4 Binary files /dev/null and b/Text2LIVE-main/models/__pycache__/__init__.cpython-37.pyc differ diff --git a/Text2LIVE-main/models/__pycache__/clip_extractor.cpython-37.pyc b/Text2LIVE-main/models/__pycache__/clip_extractor.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9ad1a1cf9919a2597e9b5274775857d693e064e Binary files /dev/null and b/Text2LIVE-main/models/__pycache__/clip_extractor.cpython-37.pyc differ diff 
--git a/Text2LIVE-main/models/__pycache__/clip_relevancy.cpython-37.pyc b/Text2LIVE-main/models/__pycache__/clip_relevancy.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6fea2f90fb4699f1c50db1c7000404b3bf7b4da Binary files /dev/null and b/Text2LIVE-main/models/__pycache__/clip_relevancy.cpython-37.pyc differ diff --git a/Text2LIVE-main/models/__pycache__/image_model.cpython-37.pyc b/Text2LIVE-main/models/__pycache__/image_model.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7150f052b2003249211dfef4d34812285859e9b5 Binary files /dev/null and b/Text2LIVE-main/models/__pycache__/image_model.cpython-37.pyc differ diff --git a/Text2LIVE-main/models/__pycache__/implicit_neural_networks.cpython-37.pyc b/Text2LIVE-main/models/__pycache__/implicit_neural_networks.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..10a777f640e2a2f31fcf6909849ca181e434a38d Binary files /dev/null and b/Text2LIVE-main/models/__pycache__/implicit_neural_networks.cpython-37.pyc differ diff --git a/Text2LIVE-main/models/__pycache__/networks.cpython-37.pyc b/Text2LIVE-main/models/__pycache__/networks.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fed2410ac7c3b77dce04db91a791bd21210515ea Binary files /dev/null and b/Text2LIVE-main/models/__pycache__/networks.cpython-37.pyc differ diff --git a/Text2LIVE-main/models/__pycache__/video_model.cpython-37.pyc b/Text2LIVE-main/models/__pycache__/video_model.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b558eb4605e1629c611fb1027126dca236a5ba21 Binary files /dev/null and b/Text2LIVE-main/models/__pycache__/video_model.cpython-37.pyc differ diff --git a/Text2LIVE-main/models/backbone/__pycache__/__init__.cpython-37.pyc b/Text2LIVE-main/models/backbone/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..b9666747520096ab0ed0a5a5cc73259535d95248 Binary files /dev/null and b/Text2LIVE-main/models/backbone/__pycache__/__init__.cpython-37.pyc differ diff --git a/Text2LIVE-main/models/backbone/__pycache__/common.cpython-37.pyc b/Text2LIVE-main/models/backbone/__pycache__/common.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cec73f4e512381fc276ec3178700b1d42d0e5c17 Binary files /dev/null and b/Text2LIVE-main/models/backbone/__pycache__/common.cpython-37.pyc differ diff --git a/Text2LIVE-main/models/backbone/__pycache__/downsampler.cpython-37.pyc b/Text2LIVE-main/models/backbone/__pycache__/downsampler.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..058c4105a602423fbe8c5764934891341b6f684f Binary files /dev/null and b/Text2LIVE-main/models/backbone/__pycache__/downsampler.cpython-37.pyc differ diff --git a/Text2LIVE-main/models/backbone/__pycache__/skip.cpython-37.pyc b/Text2LIVE-main/models/backbone/__pycache__/skip.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..800677f429e3430f3bd703aed8de82065a4584ec Binary files /dev/null and b/Text2LIVE-main/models/backbone/__pycache__/skip.cpython-37.pyc differ diff --git a/Text2LIVE-main/results/2022-08-22_20-50-50-car-turn/config.yaml b/Text2LIVE-main/results/2022-08-22_20-50-50-car-turn/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..542a0a56e0a2482c95a59dfe7fe9aae123edce6c --- /dev/null +++ b/Text2LIVE-main/results/2022-08-22_20-50-50-car-turn/config.yaml @@ -0,0 +1,68 @@ +align_corners: false +bootstrap_epoch: -1 +bootstrap_negative_map_threshold: 0.6 +bootstrap_negative_text: [] +bootstrap_scheduler: none +bootstrap_text: '' +bootstrapping_min_cover: 1 +center_frame_distance: 2 +checkpoint_path: data/pretrained_nla_models/car-turn/checkpoint +clip_affine_transform_fill: 0 +clip_model_name: ViT-B/32 +comp_text: +- winter countryside scene +- 
snowy countryside scene +- beautiful snow +crops_min_cover: 0.95 +device: cuda +entire_atlas_every: 75 +example_config: car-turn_winter.yaml +finetune_background: true +finetune_foreground: false +flip_p: 0.5 +gamma: 0.999 +grid_atlas_resolution: 2000 +input_entire_atlas: true +lambda_alpha_l0: 0.005 +lambda_alpha_l1: 0.01 +lambda_bootstrap: 10 +lambda_bootstrap_min: 0 +lambda_clip: 1 +lambda_composition: 1 +lambda_screen: 1 +lambda_sparsity: 0.1 +lambda_structure: 3 +log_images_freq: 500 +lr: 0.0025 +mask_alpha_threshold: 0.95 +masks_border_expansion: 30 +maximum_number_of_frames: 70 +min_lr: 1.0e-05 +multiply_foreground_alpha: true +n_aug: 6 +n_epochs: 3000 +num_scales: 7 +optimizer: madgrad +relevancy_num_layers: 10 +results_folder: results +resx: 768 +resy: 432 +return_atlas_alpha: false +save_model_starting_epoch: 900 +screen_text: +- winter scene +- snow everywhere +- beautiful snow +seed: -1 +skip_n11: 4 +skip_n33d: 128 +skip_n33u: 128 +src_text: +- countryside +- countryside with trees +- countryside +text_criterion: spherical +use_negative_bootstrap: false +use_wandb: false +wandb_entity: project_entity +wandb_project: project_name diff --git a/Text2LIVE-main/results/2022-08-22_20-51-42-car-turn/config.yaml b/Text2LIVE-main/results/2022-08-22_20-51-42-car-turn/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..542a0a56e0a2482c95a59dfe7fe9aae123edce6c --- /dev/null +++ b/Text2LIVE-main/results/2022-08-22_20-51-42-car-turn/config.yaml @@ -0,0 +1,68 @@ +align_corners: false +bootstrap_epoch: -1 +bootstrap_negative_map_threshold: 0.6 +bootstrap_negative_text: [] +bootstrap_scheduler: none +bootstrap_text: '' +bootstrapping_min_cover: 1 +center_frame_distance: 2 +checkpoint_path: data/pretrained_nla_models/car-turn/checkpoint +clip_affine_transform_fill: 0 +clip_model_name: ViT-B/32 +comp_text: +- winter countryside scene +- snowy countryside scene +- beautiful snow +crops_min_cover: 0.95 +device: cuda +entire_atlas_every: 75 
+example_config: car-turn_winter.yaml +finetune_background: true +finetune_foreground: false +flip_p: 0.5 +gamma: 0.999 +grid_atlas_resolution: 2000 +input_entire_atlas: true +lambda_alpha_l0: 0.005 +lambda_alpha_l1: 0.01 +lambda_bootstrap: 10 +lambda_bootstrap_min: 0 +lambda_clip: 1 +lambda_composition: 1 +lambda_screen: 1 +lambda_sparsity: 0.1 +lambda_structure: 3 +log_images_freq: 500 +lr: 0.0025 +mask_alpha_threshold: 0.95 +masks_border_expansion: 30 +maximum_number_of_frames: 70 +min_lr: 1.0e-05 +multiply_foreground_alpha: true +n_aug: 6 +n_epochs: 3000 +num_scales: 7 +optimizer: madgrad +relevancy_num_layers: 10 +results_folder: results +resx: 768 +resy: 432 +return_atlas_alpha: false +save_model_starting_epoch: 900 +screen_text: +- winter scene +- snow everywhere +- beautiful snow +seed: -1 +skip_n11: 4 +skip_n33d: 128 +skip_n33u: 128 +src_text: +- countryside +- countryside with trees +- countryside +text_criterion: spherical +use_negative_bootstrap: false +use_wandb: false +wandb_entity: project_entity +wandb_project: project_name diff --git a/Text2LIVE-main/results/2022-08-22_20-52-21-car-turn/config.yaml b/Text2LIVE-main/results/2022-08-22_20-52-21-car-turn/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..542a0a56e0a2482c95a59dfe7fe9aae123edce6c --- /dev/null +++ b/Text2LIVE-main/results/2022-08-22_20-52-21-car-turn/config.yaml @@ -0,0 +1,68 @@ +align_corners: false +bootstrap_epoch: -1 +bootstrap_negative_map_threshold: 0.6 +bootstrap_negative_text: [] +bootstrap_scheduler: none +bootstrap_text: '' +bootstrapping_min_cover: 1 +center_frame_distance: 2 +checkpoint_path: data/pretrained_nla_models/car-turn/checkpoint +clip_affine_transform_fill: 0 +clip_model_name: ViT-B/32 +comp_text: +- winter countryside scene +- snowy countryside scene +- beautiful snow +crops_min_cover: 0.95 +device: cuda +entire_atlas_every: 75 +example_config: car-turn_winter.yaml +finetune_background: true +finetune_foreground: false +flip_p: 0.5 
+gamma: 0.999 +grid_atlas_resolution: 2000 +input_entire_atlas: true +lambda_alpha_l0: 0.005 +lambda_alpha_l1: 0.01 +lambda_bootstrap: 10 +lambda_bootstrap_min: 0 +lambda_clip: 1 +lambda_composition: 1 +lambda_screen: 1 +lambda_sparsity: 0.1 +lambda_structure: 3 +log_images_freq: 500 +lr: 0.0025 +mask_alpha_threshold: 0.95 +masks_border_expansion: 30 +maximum_number_of_frames: 70 +min_lr: 1.0e-05 +multiply_foreground_alpha: true +n_aug: 6 +n_epochs: 3000 +num_scales: 7 +optimizer: madgrad +relevancy_num_layers: 10 +results_folder: results +resx: 768 +resy: 432 +return_atlas_alpha: false +save_model_starting_epoch: 900 +screen_text: +- winter scene +- snow everywhere +- beautiful snow +seed: -1 +skip_n11: 4 +skip_n33d: 128 +skip_n33u: 128 +src_text: +- countryside +- countryside with trees +- countryside +text_criterion: spherical +use_negative_bootstrap: false +use_wandb: false +wandb_entity: project_entity +wandb_project: project_name diff --git a/Text2LIVE-main/util/__pycache__/__init__.cpython-37.pyc b/Text2LIVE-main/util/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a191e9f552f2233808e5aef17a2886263a66a424 Binary files /dev/null and b/Text2LIVE-main/util/__pycache__/__init__.cpython-37.pyc differ diff --git a/Text2LIVE-main/util/__pycache__/atlas_loss.cpython-37.pyc b/Text2LIVE-main/util/__pycache__/atlas_loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf6a688a1b2c5ae28ff696dcaae0c08ffb3cdea0 Binary files /dev/null and b/Text2LIVE-main/util/__pycache__/atlas_loss.cpython-37.pyc differ diff --git a/Text2LIVE-main/util/__pycache__/atlas_utils.cpython-37.pyc b/Text2LIVE-main/util/__pycache__/atlas_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7c4b6dc86986f19e13d46234e42dbfba9adf011 Binary files /dev/null and b/Text2LIVE-main/util/__pycache__/atlas_utils.cpython-37.pyc differ diff --git 
a/Text2LIVE-main/util/__pycache__/aug_utils.cpython-37.pyc b/Text2LIVE-main/util/__pycache__/aug_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ac946719bf7b295fdccc562648500a1c6ac3295 Binary files /dev/null and b/Text2LIVE-main/util/__pycache__/aug_utils.cpython-37.pyc differ diff --git a/Text2LIVE-main/util/__pycache__/losses.cpython-37.pyc b/Text2LIVE-main/util/__pycache__/losses.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e92b0c8e55d626a90dc74912a426c6b8a89937c0 Binary files /dev/null and b/Text2LIVE-main/util/__pycache__/losses.cpython-37.pyc differ diff --git a/Text2LIVE-main/util/__pycache__/util.cpython-37.pyc b/Text2LIVE-main/util/__pycache__/util.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af48d5b1ba6123720d22da256f15c69046581947 Binary files /dev/null and b/Text2LIVE-main/util/__pycache__/util.cpython-37.pyc differ diff --git a/Text2LIVE-main/util/__pycache__/video_logger.cpython-37.pyc b/Text2LIVE-main/util/__pycache__/video_logger.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d4fc595f2167e99930ee79cb4b6e8b7f08631af Binary files /dev/null and b/Text2LIVE-main/util/__pycache__/video_logger.cpython-37.pyc differ diff --git a/configs/image_config.yaml b/configs/image_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..98dff3221f873772f441a3360aa64d87407d4e56 --- /dev/null +++ b/configs/image_config.yaml @@ -0,0 +1,60 @@ +seed: -1 +use_wandb: False +wandb_project: project_name # enter your project name here +wandb_entity: project_entity # enter your entity here +device: cuda +log_images_freq: 50 +# dataset configuration +results_folder: "results" +resize_input: 512 +d_divisible_crops: 1 +source_image_every: 75 +crops_min_cover: 0.85 + +# input augmentations +flip_p: 0.5 +jitter_p: 0.1 +scale_min: 0.8 +scale_max: 1.2 + +# text configuration +#bootstrap_text: horse # this 
prompt is not composed with any augmentations at the moment +bootstrap_scheduler: linear # linear | exponential | none +text_criterion: spherical # spherical | cosine + +# loss configuration +bootstrap_epoch: -1 +lambda_bootstrap: 10 +lambda_bootstrap_min: 0 +bootstrapping_min_cover: 1 +lambda_structure: 2 +lambda_screen: 1 +lambda_sparsity: 6 # lambda_sparsity * ( lambda_alpha_l0 * L0_loss + lambda_alpha_l1 * L1_loss ) +lambda_alpha_l0: 0.01 +lambda_alpha_l1: 0.01 +lambda_composition: 1 +use_negative_bootstrap: False + +# CLIP handling configuration +n_aug: 16 # number of augmentations to be applied before CLIP +clip_model_name: "ViT-B/32" # ViT-B/16 | ViT-B/32 | ViT-L/14 +relevancy_num_layers: 10 + +# CLIP augmentations +clip_affine_transform_fill: 1 + +# training configuration +n_epochs: 1000 +scheduler_policy: exponential # [exponential| linear | step | plateau | cosine | none] +gamma: 0.99 +min_lr: 0.00001 +lr: 0.0025 +optimizer: madgrad # [madgrad | adam | radam | rmsprop | sgd] + +# CNN backbone configuration +skip_n33d: 128 +skip_n33u: 128 +skip_n11: 4 +num_scales: 7 + + diff --git a/configs/image_example_configs/golden_horse.yaml b/configs/image_example_configs/golden_horse.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2839735d0ffd1fb957d79f4bb10526378bdbe68 --- /dev/null +++ b/configs/image_example_configs/golden_horse.yaml @@ -0,0 +1,8 @@ +image_path: "./data/images/horse.jpg" # path to the input image + +screen_text: "golden horse" # texts, describing the edit layer +comp_text: "golden horse" # texts, describing the full edited image +src_text: "horse" # texts, describing the input image + +bootstrap_text: "horse" # text, describing the region of interest in the input image +bootstrap_epoch: 1000 # number of epochs for bootstrapping (it is annealed during training) \ No newline at end of file diff --git a/configs/image_example_configs/ice_cake.yaml b/configs/image_example_configs/ice_cake.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..832a08702292d83981c6697a26fb970bbd477bc8 --- /dev/null +++ b/configs/image_example_configs/ice_cake.yaml @@ -0,0 +1,5 @@ +image_path: "./data/images/cake.jpeg" + +screen_text: "ice" # texts, describing the edit layer +comp_text: "ice" # texts, describing the full edited image +src_text: "cake" # texts, describing the input image \ No newline at end of file diff --git a/configs/video_config.yaml b/configs/video_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0eb2501b537365dbb609007f521ca560b199580c --- /dev/null +++ b/configs/video_config.yaml @@ -0,0 +1,66 @@ +seed: -1 +device: cuda +use_wandb: False +wandb_project: project_name # enter your project name here +wandb_entity: project_entity # enter your entity here +results_folder: results # if not use wandb, this is the folder where the results will be saved + +resx: 768 +resy: 432 +example_config: "car-turn_winter.yaml" + +save_model_starting_epoch: 900 +multiply_foreground_alpha: True + +flip_p: 0.5 # probability of applying flip before cnn +n_aug: 6 # set to -1 to disable augmentation before CLIP +clip_affine_transform_fill: 0 # 0 for black, 1 for white +clip_model_name: "ViT-B/32" # ViT-B/16 | ViT-B/32 | ViT-L/14 + +text_criterion: spherical # spherical | cosine | scaled_cosine (*1.2) + +bootstrap_text: "" +bootstrap_scheduler: none +bootstrap_epoch: -1 # epoch to stop penalizing sparsity +use_negative_bootstrap: False # whether to use negative relevance +lambda_bootstrap_min: 0 +bootstrap_negative_text: [] # negative alpha - will ignore this +bootstrap_negative_map_threshold: 0.6 # penalizing only locations with high values in relevancy +bootstrapping_min_cover: 1 +relevancy_num_layers: 10 + +lambda_screen: 1 +lambda_sparsity: 0.1 # lambda_sparsity * ( lambda_alpha_l0 * L0_loss + lambda_alpha_l1 * L1_loss ) +lambda_alpha_l0: 0.005 +lambda_alpha_l1: 0.01 +lambda_structure: 3 +lambda_bootstrap: 10 +lambda_clip: 1 # lambda_clip * ( 
lambda_comp_clip * L_comp + lambda_layer_clip * L_layer ) +lambda_composition: 1 + +n_epochs: 3000 +gamma: 0.999 +min_lr: 0.00001 +lr: 0.0025 +optimizer: madgrad # [adam | radam | rmsprop | sgd] + +# the following is relevant only for dip_backbones backbone +skip_n33d: 128 +skip_n33u: 128 +skip_n11: 4 +num_scales: 7 + +log_images_freq: 500 +center_frame_distance: 2 + +input_entire_atlas: True +entire_atlas_every: 75 +return_atlas_alpha: False + +grid_atlas_resolution: 2000 +align_corners: False + +crops_min_cover: 0.95 # 0.95 for foreground, 0.8 for background + +masks_border_expansion: 30 +mask_alpha_threshold: 0.95 diff --git a/configs/video_example_configs/blackswan_crochet.yaml b/configs/video_example_configs/blackswan_crochet.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0560f881970c96c29d7f94c9590cf83e101b876f --- /dev/null +++ b/configs/video_example_configs/blackswan_crochet.yaml @@ -0,0 +1,9 @@ +finetune_background: False +finetune_foreground: True +checkpoint_path: data/pretrained_nla_models/blackswan/checkpoint # path to the checkpoint file +maximum_number_of_frames: 50 # maximum number of frames to use in the video + +screen_text: ["crochet swan","knitted swan"] # texts, describing the edit layer +comp_text: ["crochet swan","knitted swan"] # texts, describing the full edited video +src_text: ["black swan", "swan"] # texts, describing the input video + diff --git a/configs/video_example_configs/blackswan_crystal.yaml b/configs/video_example_configs/blackswan_crystal.yaml new file mode 100644 index 0000000000000000000000000000000000000000..298911a2992506974dfb45054f1a3a17d3a6b1c8 --- /dev/null +++ b/configs/video_example_configs/blackswan_crystal.yaml @@ -0,0 +1,9 @@ +finetune_background: False +finetune_foreground: True +checkpoint_path: data/pretrained_nla_models/blackswan/checkpoint # path to the checkpoint file +maximum_number_of_frames: 50 # maximum number of frames to use in the video + +screen_text: ["swarovski blue 
crystal swan", "swarovski blue crystal swan"] # texts, describing the edit layer +comp_text: ["swarovski blue crystal swan", "swarovski blue crystal swan"] # texts, describing the full edited video +src_text: ["black swan", "swan"] # texts, describing the input video + diff --git a/configs/video_example_configs/blackswan_cube.yaml b/configs/video_example_configs/blackswan_cube.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa876cea2fbb54edf5e8abcad631d77eed70ee88 --- /dev/null +++ b/configs/video_example_configs/blackswan_cube.yaml @@ -0,0 +1,9 @@ +finetune_background: False +finetune_foreground: True +checkpoint_path: data/pretrained_nla_models/blackswan/checkpoint # path to the checkpoint file +maximum_number_of_frames: 50 # maximum number of frames to use in the video + +screen_text: ["swan rubik's cube", "swan rubik's cube"] # texts, describing the edit layer +comp_text: ["swan rubik's cube", "swan rubik's cube"] # texts, describing the full edited video +src_text: ["black swan", "swan"] # texts, describing the input video + diff --git a/configs/video_example_configs/blackswan_gold.yaml b/configs/video_example_configs/blackswan_gold.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf560bc6f56a430e7b0b6f13cc5deb9e5a5d3689 --- /dev/null +++ b/configs/video_example_configs/blackswan_gold.yaml @@ -0,0 +1,9 @@ +finetune_background: False +finetune_foreground: True +checkpoint_path: data/pretrained_nla_models/blackswan/checkpoint # path to the checkpoint file +maximum_number_of_frames: 50 # maximum number of frames to use in the video + +screen_text: ["golden swan", "golden swan"] # texts, describing the edit layer +comp_text: ["golden swan", "golden swan"] # texts, describing the full edited video +src_text: ["black swan", "swan"] # texts, describing the input video + diff --git a/configs/video_example_configs/car-turn_nighttime.yaml b/configs/video_example_configs/car-turn_nighttime.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..012b537405fb24b828b7dfeccf0ebeb06c254505 --- /dev/null +++ b/configs/video_example_configs/car-turn_nighttime.yaml @@ -0,0 +1,10 @@ +finetune_background: True +finetune_foreground: False +checkpoint_path: data/pretrained_nla_models/car-turn/checkpoint # path to the checkpoint file +maximum_number_of_frames: 70 # maximum number of frames to use in the video + +screen_text: ["nighttime","night"] # texts, describing the edit layer +comp_text: ["countryside at nighttime","countryside at night"] # texts, describing the full edited video +src_text: ["countryside","countryside with trees"] # texts, describing the input video + +n_epochs: 1500 # number of epochs to train diff --git a/configs/video_example_configs/car-turn_rusty.yaml b/configs/video_example_configs/car-turn_rusty.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b18cd05536fd2ea798771b2e74f162dfd1768a1d --- /dev/null +++ b/configs/video_example_configs/car-turn_rusty.yaml @@ -0,0 +1,11 @@ +finetune_background: False +finetune_foreground: True +checkpoint_path: data/pretrained_nla_models/car-turn/checkpoint # path to the checkpoint file +maximum_number_of_frames: 70 # maximum number of frames to use in the video + +screen_text: ["rusty jeep"] # texts, describing the edit layer +comp_text: ["rusty jeep"] # texts, describing the full edited video +src_text: ["jeep"] # texts, describing the input video + +bootstrap_epoch: 3000 # number of epochs for bootstrapping +bootstrap_text: "car" # text, describing the region of interest in the input video diff --git a/configs/video_example_configs/car-turn_winter.yaml b/configs/video_example_configs/car-turn_winter.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4dea6d385e789e0e8776c03c01cbae0a4f13d22b --- /dev/null +++ b/configs/video_example_configs/car-turn_winter.yaml @@ -0,0 +1,10 @@ +finetune_background: True +finetune_foreground: False +checkpoint_path: 
data/pretrained_nla_models/car-turn/checkpoint # path to the checkpoint file +maximum_number_of_frames: 70 # maximum number of frames to use in the video + +screen_text: ["winter scene", "snow everywhere", "beautiful snow"] # texts, describing the edit layer +comp_text: ["winter countryside scene", "snowy countryside scene", "beautiful snow"] # texts, describing the full edited video +src_text: ["countryside", "countryside with trees", "countryside"] # texts, describing the input video + +n_epochs: 3000 # number of epochs to train diff --git a/configs/video_example_configs/libby_diamond.yaml b/configs/video_example_configs/libby_diamond.yaml new file mode 100644 index 0000000000000000000000000000000000000000..caa8d10b570d075e498b2e3c1a8d53246fb73b87 --- /dev/null +++ b/configs/video_example_configs/libby_diamond.yaml @@ -0,0 +1,8 @@ +finetune_background: False +finetune_foreground: True +checkpoint_path: data/pretrained_nla_models/libby/checkpoint # path to the checkpoint file +maximum_number_of_frames: 49 # maximum number of frames to use in the video + +screen_text: ["dog made out of diamonds"] # texts, describing the edit layer +comp_text: ["dog made out of diamonds"] # texts, describing the full edited video +src_text: ["dog"] # texts, describing the input video diff --git a/configs/video_example_configs/libby_giraffe.yaml b/configs/video_example_configs/libby_giraffe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b0cc8abdd7627c579f110fb5a39993a372bd770 --- /dev/null +++ b/configs/video_example_configs/libby_giraffe.yaml @@ -0,0 +1,8 @@ +finetune_background: False +finetune_foreground: True +checkpoint_path: data/pretrained_nla_models/libby/checkpoint # path to the checkpoint file +maximum_number_of_frames: 49 # maximum number of frames to use in the video + +screen_text: ["dog with giraffe texture"] # texts, describing the edit layer +comp_text: ["dog with giraffe texture"] # texts, describing the full edited video +src_text: 
["dog"] # texts, describing the input video diff --git a/configs/video_example_configs/libby_gold.yaml b/configs/video_example_configs/libby_gold.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7fb8ac2e7141c93580c4e4fea553cd213b5f5de5 --- /dev/null +++ b/configs/video_example_configs/libby_gold.yaml @@ -0,0 +1,8 @@ +finetune_background: False +finetune_foreground: True +checkpoint_path: data/pretrained_nla_models/libby/checkpoint # path to the checkpoint file +maximum_number_of_frames: 49 # maximum number of frames to use in the video + +screen_text: ["golden dog"] # texts, describing the edit layer +comp_text: ["golden dog"] # texts, describing the full edited video +src_text: ["dog"] # texts, describing the input video diff --git a/configs/video_example_configs/libby_leopard.yaml b/configs/video_example_configs/libby_leopard.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1bfa1079e0b3f5f35c781163c286f844f0fbaccb --- /dev/null +++ b/configs/video_example_configs/libby_leopard.yaml @@ -0,0 +1,8 @@ +finetune_background: False +finetune_foreground: True +checkpoint_path: data/pretrained_nla_models/libby/checkpoint # path to the checkpoint file +maximum_number_of_frames: 49 # maximum number of frames to use in the video + +screen_text: ["dog with leopard texture"] # texts, describing the edit layer +comp_text: ["dog with leopard texture"] # texts, describing the full edited video +src_text: ["dog"] # texts, describing the input video diff --git a/configs/video_example_configs/lucia_fog.yaml b/configs/video_example_configs/lucia_fog.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2878f47a7162e0c37889a23e7f8dbfaeabaeabd --- /dev/null +++ b/configs/video_example_configs/lucia_fog.yaml @@ -0,0 +1,10 @@ +finetune_background: True +finetune_foreground: False +checkpoint_path: data/pretrained_nla_models/lucia/checkpoint # path to the checkpoint file +maximum_number_of_frames: 70 # maximum number 
of frames to use in the video + +screen_text: ["fog","foggy"] # texts, describing the edit layer +comp_text: ["foggy park","foggy city park"] # texts, describing the full edited video +src_text: ["park","city park"] # texts, describing the input video + +n_epochs: 3000 # number of epochs to train diff --git a/configs/video_example_configs/lucia_winter.yaml b/configs/video_example_configs/lucia_winter.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74ceb6b3781fccd58ec1b4a5951eb4abcedbcda6 --- /dev/null +++ b/configs/video_example_configs/lucia_winter.yaml @@ -0,0 +1,10 @@ +finetune_background: True +finetune_foreground: False +checkpoint_path: data/pretrained_nla_models/lucia/checkpoint # path to the checkpoint file +maximum_number_of_frames: 70 # maximum number of frames to use in the video + +screen_text: ["winter scene", "snow everywhere", "beautiful snow"] # texts, describing the edit layer +comp_text: ["winter in park", "snowy park", "beautiful snow at city park"] # texts, describing the full edited video +src_text: ["park","city park"] # texts, describing the input video + +n_epochs: 3000 # number of epochs to train diff --git a/configs/video_example_configs/lucia_wonderland.yaml b/configs/video_example_configs/lucia_wonderland.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9fbccb762e569f8d6f2c19e23a14dc2724df111e --- /dev/null +++ b/configs/video_example_configs/lucia_wonderland.yaml @@ -0,0 +1,10 @@ +finetune_background: True +finetune_foreground: False +checkpoint_path: data/pretrained_nla_models/lucia/checkpoint # path to the checkpoint file +maximum_number_of_frames: 70 # maximum number of frames to use in the video + +screen_text: ["wonderland","wonderland"] # texts, describing the edit layer +comp_text: ["wonderland park","wonderland city park"] # texts, describing the full edited video +src_text: ["park","city park"] # texts, describing the input video + +n_epochs: 3000 # number of epochs to train 
diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/backbone/__init__.py b/models/backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b49b5cf2a013677a1bb59947663ac224de228982 --- /dev/null +++ b/models/backbone/__init__.py @@ -0,0 +1,35 @@ +from .skip import skip + + +def get_net( + input_depth, + pad, + upsample_mode, + n_channels=3, + act_fun="LeakyReLU", + skip_n33d=128, + skip_n33u=128, + skip_n11=4, + num_scales=5, + downsample_mode="stride", + need_sigmoid=True, + need_tanh=False, + decorr_rgb=False, +): + assert need_sigmoid != need_tanh + net = skip( + input_depth, + n_channels, + num_channels_down=[skip_n33d] * num_scales if isinstance(skip_n33d, int) else skip_n33d, + num_channels_up=[skip_n33u] * num_scales if isinstance(skip_n33u, int) else skip_n33u, + num_channels_skip=[skip_n11] * num_scales if isinstance(skip_n11, int) else skip_n11, + upsample_mode=upsample_mode, + downsample_mode=downsample_mode, + need_sigmoid=need_sigmoid, + need_tanh=need_tanh, + need_bias=True, + pad=pad, + act_fun=act_fun, + decorr_rgb=decorr_rgb, + ) + return net diff --git a/models/backbone/common.py b/models/backbone/common.py new file mode 100644 index 0000000000000000000000000000000000000000..bc70b367242e7beaaaf6d5a43ab92d0554fed5f5 --- /dev/null +++ b/models/backbone/common.py @@ -0,0 +1,179 @@ +import torch +import torch.nn as nn +import numpy as np +from .downsampler import Downsampler + + +def add_module(self, module): + self.add_module(str(len(self) + 1), module) + + +torch.nn.Module.add = add_module + + +class Concat(nn.Module): + def __init__(self, dim, *args): + super(Concat, self).__init__() + self.dim = dim + + for idx, module in enumerate(args): + self.add_module(str(idx), module) + + def forward(self, input): + inputs = [] + for module in self._modules.values(): + inputs.append(module(input)) 
class PixelNormLayer(nn.Module):
    """Pixelwise feature vector normalization.

    Every position is divided by the root-mean-square of its values along
    dim 1, i.e. `x / sqrt(mean(x ** 2, dim=1) + eps)`.

    Args:
        eps: Constant added under the square root to avoid division by zero.
    """

    def __init__(self, eps=1e-8):
        super(PixelNormLayer, self).__init__()
        self.eps = eps

    def forward(self, x):
        # Use the configured epsilon; the value reported by __repr__ must be
        # the one that is actually applied.
        return x / torch.sqrt(torch.mean(x ** 2, dim=1, keepdim=True) + self.eps)

    def __repr__(self):
        return self.__class__.__name__ + "(eps = %s)" % (self.eps)
+ """ + + def __init__(self, inv_color_scale=1.6): + super().__init__() + color_correlation_svd_sqrt = torch.tensor([[0.26, 0.09, 0.02], [0.27, 0.00, -0.05], [0.27, -0.09, 0.03]]) + color_correlation_svd_sqrt /= torch.tensor([inv_color_scale, 1.0, 1.0]) # saturate, empirical + max_norm_svd_sqrt = color_correlation_svd_sqrt.norm(dim=0).max() + color_correlation_normalized = color_correlation_svd_sqrt / max_norm_svd_sqrt + self.register_buffer("colcorr_t", color_correlation_normalized.T) + + def inverse(self, image): + colcorr_t_inv = torch.linalg.inv(self.colcorr_t) + return torch.einsum("nchw,cd->ndhw", image, colcorr_t_inv) + + def forward(self, image): + if image.dim() == 4: + image_rgb, alpha = image[:, :3], image[:, 3].unsqueeze(1) + image_rgb = torch.einsum("nchw,cd->ndhw", image_rgb, self.colcorr_t) + image = torch.cat([image_rgb, alpha], dim=1) + else: + image = torch.einsum("nchw,cd->ndhw", image, self.colcorr_t) + return image diff --git a/models/backbone/downsampler.py b/models/backbone/downsampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c10b799f40b58b258d91a9aa7de076415908b7e --- /dev/null +++ b/models/backbone/downsampler.py @@ -0,0 +1,166 @@ +import numpy as np +import torch +import torch.nn as nn + + +class Downsampler(nn.Module): + """ + http://www.realitypixels.com/turk/computergraphics/ResamplingFilters.pdf + """ + + def __init__( + self, n_planes, factor, kernel_type, phase=0, kernel_width=None, support=None, sigma=None, preserve_size=False + ): + super(Downsampler, self).__init__() + + assert phase in [0, 0.5], "phase should be 0 or 0.5" + + if kernel_type == "lanczos2": + support = 2 + kernel_width = 4 * factor + 1 + kernel_type_ = "lanczos" + + elif kernel_type == "lanczos3": + support = 3 + kernel_width = 6 * factor + 1 + kernel_type_ = "lanczos" + + elif kernel_type == "gauss12": + kernel_width = 7 + sigma = 1 / 2 + kernel_type_ = "gauss" + + elif kernel_type == "gauss1sq2": + kernel_width = 9 + sigma = 1.0 / 
def get_kernel(factor, kernel_type, phase, kernel_width, support=None, sigma=None):
    """Build a square 2-D resampling kernel whose entries sum to 1.

    Args:
        factor: Downsampling factor; divides the sample distances of the
            Lanczos kernel (the other kernels do not read it).
        kernel_type: One of "lanczos", "gauss" or "box".
        phase: 0 or 0.5. With 0.5 and a non-box kernel the array is one
            sample smaller per side than `kernel_width`.
        kernel_width: Nominal side length of the kernel.
        support: Lanczos window size; required for "lanczos".
        sigma: Gaussian standard deviation; required for "gauss".

    Returns:
        A NumPy array of shape (k, k), normalised by its sum.

    Raises:
        AssertionError: on an unknown `kernel_type`, a box kernel with
            phase other than 0.5, a gauss kernel with phase 0.5 or without
            `sigma`, or a lanczos kernel without `support`.
    """
    assert kernel_type in ["lanczos", "gauss", "box"]

    if phase == 0.5 and kernel_type != "box":
        kernel = np.zeros([kernel_width - 1, kernel_width - 1])
    else:
        kernel = np.zeros([kernel_width, kernel_width])

    if kernel_type == "box":
        assert phase == 0.5, "Box filter is always half-phased"
        kernel[:] = 1.0 / (kernel_width * kernel_width)

    elif kernel_type == "gauss":
        assert sigma, "sigma is not specified"
        assert phase != 0.5, "phase 1/2 for gauss not implemented"

        center = (kernel_width + 1.0) / 2.0
        sigma_sq = sigma * sigma

        for i in range(1, kernel.shape[0] + 1):
            for j in range(1, kernel.shape[1] + 1):
                # NOTE(review): distances are halved with a literal 2.0, not
                # `factor`; kept exactly as the original computes it.
                di = (i - center) / 2.0
                dj = (j - center) / 2.0
                kernel[i - 1][j - 1] = np.exp(-(di * di + dj * dj) / (2 * sigma_sq))
                kernel[i - 1][j - 1] = kernel[i - 1][j - 1] / (2.0 * np.pi * sigma_sq)

    elif kernel_type == "lanczos":
        assert support, "support is not specified"
        center = (kernel_width + 1) / 2.0

        for i in range(1, kernel.shape[0] + 1):
            for j in range(1, kernel.shape[1] + 1):
                if phase == 0.5:
                    di = abs(i + 0.5 - center) / factor
                    dj = abs(j + 0.5 - center) / factor
                else:
                    di = abs(i - center) / factor
                    dj = abs(j - center) / factor

                # Separable weight: each axis contributes its own factor; a
                # zero distance contributes 1 (skips the 0/0 form).
                val = 1
                if di != 0:
                    val = val * support * np.sin(np.pi * di) * np.sin(np.pi * di / support)
                    val = val / (np.pi * np.pi * di * di)

                if dj != 0:
                    val = val * support * np.sin(np.pi * dj) * np.sin(np.pi * dj / support)
                    val = val / (np.pi * np.pi * dj * dj)

                kernel[i - 1][j - 1] = val

    else:
        assert False, "wrong method name"

    kernel /= kernel.sum()

    return kernel
128, 128], + num_channels_up=[16, 32, 64, 128, 128], + num_channels_skip=[4, 4, 4, 4, 4], + filter_size_down=3, + filter_size_up=3, + filter_skip_size=1, + need_sigmoid=True, + need_tanh=False, + need_bias=True, + pad="reflection", + upsample_mode="bilinear", + downsample_mode="stride", + act_fun="LeakyReLU", + need1x1_up=True, + decorr_rgb=True, +): + """Assembles encoder-decoder with skip connections. + + Arguments: + act_fun: Either string 'LeakyReLU|Swish|ELU|none' or module (e.g. nn.ReLU) + pad (string): zero|reflection (default: 'zero') + upsample_mode (string): 'nearest|bilinear' (default: 'nearest') + downsample_mode (string): 'stride|avg|max|lanczos2' (default: 'stride') + + """ + assert len(num_channels_down) == len(num_channels_up) == len(num_channels_skip) + + n_scales = len(num_channels_down) + + if not (isinstance(upsample_mode, list) or isinstance(upsample_mode, tuple)): + upsample_mode = [upsample_mode] * n_scales + + if not (isinstance(downsample_mode, list) or isinstance(downsample_mode, tuple)): + downsample_mode = [downsample_mode] * n_scales + + if not (isinstance(filter_size_down, list) or isinstance(filter_size_down, tuple)): + filter_size_down = [filter_size_down] * n_scales + + if not (isinstance(filter_size_up, list) or isinstance(filter_size_up, tuple)): + filter_size_up = [filter_size_up] * n_scales + + last_scale = n_scales - 1 + + cur_depth = None + + model = nn.Sequential() + # model.add(transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])) + model_tmp = model + + input_depth = num_input_channels + for i in range(len(num_channels_down)): + + deeper = nn.Sequential() + skip = nn.Sequential() + + if num_channels_skip[i] != 0: + model_tmp.add(Concat(1, skip, deeper)) + else: + model_tmp.add(deeper) + + model_tmp.add(norm(num_channels_skip[i] + (num_channels_up[i + 1] if i < last_scale else num_channels_down[i]))) + + if num_channels_skip[i] != 0: + skip.add(conv(input_depth, num_channels_skip[i], filter_skip_size, 
class ClipExtractor(torch.nn.Module):
    """Frozen CLIP wrapper that produces image and text embeddings.

    Reads from `cfg`: "clip_model_name" (passed to `clip.load`),
    "clip_affine_transform_fill" (fill value of the affine and perspective
    augmentations) and "n_aug" (default number of augmented views).
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        model = clip.load(cfg["clip_model_name"], device=device)[0]
        # The CLIP model is only used as a fixed feature extractor.
        self.model = model.eval().requires_grad_(False)

        self.clip_input_size = 224
        self.clip_normalize = T.Normalize(
            mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]
        )
        self.basic_transform = T.Compose(
            [
                # we added interpolation to CLIP positional embedding, allowing to work with arbitrary resolution.
                T.Resize(self.clip_input_size, max_size=380),
                self.clip_normalize,
            ]
        )
        # list of augmentations we apply before calculating the CLIP losses
        self.augs = T.Compose(
            [
                T.RandomHorizontalFlip(p=0.5),
                T.RandomApply(
                    [
                        T.RandomAffine(
                            degrees=15,
                            translate=(0.1, 0.1),
                            fill=cfg["clip_affine_transform_fill"],
                            interpolation=InterpolationMode.BILINEAR,
                        )
                    ],
                    p=0.8,
                ),
                T.RandomPerspective(
                    distortion_scale=0.4,
                    p=0.5,
                    interpolation=InterpolationMode.BILINEAR,
                    fill=cfg["clip_affine_transform_fill"],
                ),
                T.RandomApply([T.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1)], p=0.7),
                T.RandomGrayscale(p=0.15),
            ]
        )

        self.n_aug = cfg["n_aug"]

    def augment_input(self, input, n_aug=None, clip_input_size=None):
        """Return `n_aug` augmented views of `input`, concatenated along dim 0.

        The first view is the whole input resized (max side 320); the other
        `n_aug - 1` views are random crops covering a uniformly drawn
        fraction in [0.6, 1) of each side, resized to the first view's
        height and width. Every view goes through `self.augs`. The views are
        NOT normalised here.
        """
        if n_aug is None:
            n_aug = self.n_aug
        if clip_input_size is None:
            clip_input_size = self.clip_input_size

        cutouts = []
        cutout = T.Resize(clip_input_size, max_size=320)(input)
        cutout_h, cutout_w = cutout.shape[-2:]
        cutout = self.augs(cutout)
        cutouts.append(cutout)
        sideY, sideX = input.shape[2:4]
        for _ in range(n_aug - 1):
            s = torch.zeros(1).uniform_(0.6, 1).item()
            h = int(sideY * s)
            w = int(sideX * s)
            cutout = T.RandomCrop(size=(h, w))(input)
            cutout = T.Resize((cutout_h, cutout_w))(cutout)
            cutout = self.augs(cutout)
            cutouts.append(cutout)

        cutouts = torch.cat(cutouts)
        return cutouts

    def get_image_embedding(self, x, aug=True):
        """Encode `x` with CLIP, on augmented views (`aug=True`) or on the resized input."""
        if aug:
            views = self.augment_input(x)
        else:
            # NOTE(review): basic_transform already ends with clip_normalize and
            # the result is normalised again below, so the aug=False path is
            # normalised twice. Left unchanged because it alters embeddings;
            # confirm whether this is intended.
            views = self.basic_transform(x)
        if isinstance(views, list):
            image_embeds = []
            for view in views:
                image_embeds.append(self.encode_image(self.clip_normalize(view)))
            image_embeds = torch.cat(image_embeds)
        else:
            image_embeds = self.encode_image(self.clip_normalize(views))
        return image_embeds

    def encode_image(self, x):
        """Forward `x` to the wrapped model's `encode_image`."""
        return self.model.encode_image(x)

    def get_text_embedding(self, text, template, average_embeddings=False):
        """Encode one prompt or a list of prompts, each expanded with `template`.

        Each prompt is passed through `compose_text_with_templates`, tokenized
        and encoded under `torch.no_grad()`; the per-prompt results are
        concatenated along dim 0. With `average_embeddings=True` the result
        is averaged over dim 0 (kept as a size-1 dim).
        """
        if isinstance(text, str):
            text = [text]
        embeddings = []
        for prompt in text:
            with torch.no_grad():
                embedding = self.model.encode_text(
                    clip.tokenize(compose_text_with_templates(prompt, template)).to(device)
                )
            embeddings.append(embedding)
        embeddings = torch.cat(embeddings)
        if average_embeddings:
            embeddings = embeddings.mean(dim=0, keepdim=True)
        return embeddings

    def get_self_sim(self, x):
        """Apply `basic_transform` to `x` and return the model's `calculate_self_sim` result."""
        x = self.basic_transform(x)
        return self.model.calculate_self_sim(x)
T.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]), + ] + ) + input_prompts = cfg["bootstrap_text"] + if type(input_prompts) == str: + input_prompts = [input_prompts] + self.text = clip.tokenize(input_prompts).to(cfg["device"]) + + if self.cfg["use_negative_bootstrap"]: + input_negative_prompts = cfg["bootstrap_negative_text"] + if type(input_negative_prompts) == str: + input_negative_prompts = [input_negative_prompts] + self.bootstrap_negative_text = clip.tokenize(input_negative_prompts).to(cfg["device"]) + + def image_relevance(self, image_relevance): + patch_size = 32 # hardcoded for ViT-B/32 which we use + h = w = 224 + image_relevance = image_relevance.reshape(1, 1, h // patch_size, w // patch_size) + image_relevance = torch.nn.functional.interpolate(image_relevance, size=(h, w), mode="bilinear") + image_relevance = image_relevance.reshape(h, w).to(device) + image_relevance = (image_relevance - image_relevance.min()) / (image_relevance.max() - image_relevance.min()) + return image_relevance + + def interpret(self, image, negative=False): + text = self.text if not negative else self.bootstrap_negative_text + batch_size = text.shape[0] + images = image.repeat(batch_size, 1, 1, 1) + # TODO this is pretty inefficient, we can calculate the text embeddings instead of recomputing at each call + logits_per_image, logits_per_text = self.model(images, text) + probs = logits_per_image.softmax(dim=-1).detach().cpu().numpy() + index = [i for i in range(batch_size)] + one_hot = np.zeros((logits_per_image.shape[0], logits_per_image.shape[1]), dtype=np.float32) + one_hot[torch.arange(logits_per_image.shape[0]), index] = 1 + one_hot = torch.from_numpy(one_hot).requires_grad_(True) + one_hot = torch.sum(one_hot.to(device) * logits_per_image) + self.model.zero_grad() + + image_attn_blocks = list(dict(self.model.visual.transformer.resblocks.named_children()).values()) + num_tokens = image_attn_blocks[0].attn_probs.shape[-1] + R = 
class Model(torch.nn.Module):
    """Generator wrapper that turns a 4-channel network output into edit layers."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.netG = define_G(cfg).to(device)

    def render(self, net_output, bg_image=None):
        """Split `net_output` into colour and opacity and build the output layers.

        Channels 0-2 are the edit colours and channel 3 is the opacity, which
        is repeated to three channels. Returns a dict with "edit", "alpha"
        and "edit_on_greenscreen", plus "composite" when `bg_image` is given.
        Values of `net_output` must lie in [0, 1].
        """
        assert net_output.min() >= 0 and net_output.max() <= 1
        rgb = net_output[:, :3]
        opacity = net_output[:, 3].unsqueeze(1).repeat(1, 3, 1, 1)

        # Solid backdrop: only the second and third channels are non-zero.
        backdrop = torch.zeros_like(rgb).to(rgb.device)
        backdrop[:, 1, :, :] = 177 / 255
        backdrop[:, 2, :, :] = 64 / 255

        layers = {
            "edit": rgb,
            "alpha": opacity,
            "edit_on_greenscreen": opacity * rgb + (1 - opacity) * backdrop,
        }
        if bg_image is None:
            return layers
        layers["composite"] = (1 - opacity) * bg_image + opacity * rgb
        return layers

    def forward(self, input):
        """Render the crop and/or the full image found in `input`.

        "input_crop" produces "output_crop" and "input_image" produces
        "output_image". Every rendered layer is reduced to a one-element list
        holding the first item of the batch.
        """
        outputs = {}
        routes = (
            ("input_crop", "output_crop"),  # augmented examples
            ("input_image", "output_image"),  # the entire image (w/o augmentations)
        )
        for source_key, target_key in routes:
            if source_key not in input:
                continue
            rendered = self.render(self.netG(input[source_key]), bg_image=input[source_key])
            # move outputs to list
            outputs[target_key] = {name: [layer[0]] for name, layer in rendered.items()}
        return outputs
def get_scheduler(optimizer, opt):
    """Create the learning-rate scheduler selected by `opt.lr_policy`.

    Supported policies and the attributes of `opt` each one reads:
        "linear":  `epoch_count`, `n_epochs`, `n_epochs_decay` (LambdaLR)
        "step":    `lr_decay_iters` (StepLR, gamma 0.1)
        "plateau": none (ReduceLROnPlateau)
        "cosine":  `n_epochs` (CosineAnnealingLR)

    Returns:
        The scheduler attached to `optimizer`.

    Raises:
        NotImplementedError: if `opt.lr_policy` is not one of the above.
    """
    if opt.lr_policy == "linear":

        def lambda_rule(epoch):
            # Factor stays at 1.0 until `n_epochs`, then decreases linearly.
            lr_l = 1.0 - max(0, epoch + opt.epoch_count - opt.n_epochs) / float(opt.n_epochs_decay + 1)
            return lr_l

        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    elif opt.lr_policy == "step":
        scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_iters, gamma=0.1)
    elif opt.lr_policy == "plateau":
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.2, threshold=0.01, patience=5)
    elif opt.lr_policy == "cosine":
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=opt.n_epochs, eta_min=0)
    else:
        # Raise (not return) so a misconfigured policy fails here and the
        # caller never receives an exception object in place of a scheduler.
        raise NotImplementedError("learning rate policy [%s] is not implemented" % opt.lr_policy)
    return scheduler
align_corners=self.config["align_corners"], + ).clamp(min=0.0, max=1.0) + resized_crops.append(grid_sampled_atlas_crop) + cnn_output = self.netG(atlas_crop) + cnn_output_crops.append(cnn_output[:, :3]) + rendered_atlas_crops = self.render(cnn_output, bg_image=atlas_crop) + for key, value in rendered_atlas_crops.items(): + for i in range(3): + sampled_frame_crop = F.grid_sample( + value, + uv_values[i], + mode="bilinear", + align_corners=self.config["align_corners"], + ).clamp(min=0.0, max=1.0) + if alpha is not None: + sampled_frame_crop = sampled_frame_crop * alpha[i] + if key == "edit_on_greenscreen": + greenscreen = torch.zeros_like(sampled_frame_crop).to(sampled_frame_crop.device) + greenscreen[:, 1, :, :] = 177 / 255 + greenscreen[:, 2, :, :] = 64 / 255 + sampled_frame_crop += (1 - alpha[i]) * greenscreen + render_dict[key].append(sampled_frame_crop.squeeze(0)) + + # passing a random frame to the network + frame_index = random.randint(0, 2) # randomly sample one of three frames + rec_crop = original_crops[frame_index] + resized_crops.append(rec_crop) + cnn_output = self.netG(rec_crop) + if alpha is not None: + alpha_crop = alpha[frame_index] + cnn_output = cnn_output * alpha_crop + cnn_output_crops.append(cnn_output[:, :3]) + + rendered_frame_crop = self.render(cnn_output, bg_image=original_crops[frame_index]) + for key, value in rendered_frame_crop.items(): + render_dict[key].append(value.squeeze(0)) + + return render_dict, resized_crops, cnn_output_crops + + def process_atlas(self, atlas): + atlas_edit = self.netG(atlas) + rendered_atlas = self.render(atlas_edit, bg_image=atlas) + return rendered_atlas + + def forward(self, input_dict): + inputs = input_dict["global_crops"] + outputs = {"background": {}, "foreground": {}} + + if self.config["finetune_foreground"]: + if self.config["multiply_foreground_alpha"]: + alpha = inputs["foreground_alpha"] + else: + alpha = None + foreground_outputs, resized_crops, cnn_output_crops = self.process_crops( + 
inputs["foreground_uvs"], + inputs["foreground_atlas_crops"], + inputs["original_foreground_crops"], + alpha=alpha, + ) + outputs["foreground"]["output_crop"] = foreground_outputs + outputs["foreground"]["cnn_inputs"] = resized_crops + outputs["foreground"]["cnn_outputs"] = cnn_output_crops + if "input_image" in input_dict.keys(): + outputs["foreground"]["output_image"] = self.process_atlas(input_dict["input_image"]) + elif self.config["finetune_background"]: + background_outputs, resized_crops, cnn_output_crops = self.process_crops( + inputs["background_uvs"], + inputs["background_atlas_crops"], + inputs["original_background_crops"], + ) + outputs["background"]["output_crop"] = background_outputs + outputs["background"]["cnn_inputs"] = resized_crops + outputs["background"]["cnn_outputs"] = cnn_output_crops + if "input_image" in input_dict.keys(): + outputs["background"]["output_image"] = self.process_atlas(input_dict["input_image"]) + return outputs diff --git a/train_image.py b/train_image.py new file mode 100644 index 0000000000000000000000000000000000000000..aaf570e11bee314e5be9ac5a7821047bf0009b4f --- /dev/null +++ b/train_image.py @@ -0,0 +1,131 @@ +import random +from argparse import ArgumentParser +import datetime +from pathlib import Path + +import imageio +import numpy as np +import torch +import yaml +from tqdm import tqdm + +from datasets.image_dataset import SingleImageDataset +from models.clip_extractor import ClipExtractor +from models.image_model import Model +from util.losses import LossG +from util.util import tensor2im, get_optimizer + + +def train_model(config): + + # set seed + seed = config["seed"] + if seed == -1: + seed = np.random.randint(2 ** 32) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + print(f"running with seed: {seed}.") + + # create dataset, loader + dataset = SingleImageDataset(config) + + # define model + model = Model(config) + + # define loss function + clip_extractor = ClipExtractor(config) + criterion 
def save_locally(results_folder, log_data):
    """Write the image layers found in `log_data` to `<results_folder>/<epoch>/`.

    The directory is named after `log_data["epoch"]` and is created if needed.
    Only the entries "composite", "alpha", "edit_on_greenscreen" and "edit"
    are written, each as `<key>.png`; all other entries are ignored.
    """
    image_layers = ("composite", "alpha", "edit_on_greenscreen", "edit")
    epoch_dir = Path(results_folder, str(log_data["epoch"]))
    epoch_dir.mkdir(parents=True, exist_ok=True)
    for name, payload in log_data.items():
        if name not in image_layers:
            continue
        imageio.imwrite(f"{epoch_dir}/{name}.png", payload)
class AtlasLoss(torch.nn.Module):
    """Loss wrapper that applies `LossG` to the layer currently being finetuned.

    `LossG` receives only a fixed subset of `config` (loss weights, bootstrap
    settings, the three prompt entries and "device") together with a
    `ClipExtractor` built from the full config.
    """

    def __init__(self, config):
        super().__init__()

        self.clip_extractor = ClipExtractor(config)
        common_config = {
            key: config[key]
            for key in [
                "lambda_composition",
                "lambda_sparsity",
                "lambda_screen",
                "lambda_alpha_l1",
                "lambda_alpha_l0",
                "text_criterion",
                "clip_model_name",
                "bootstrap_epoch",
                "lambda_bootstrap",
                "relevancy_num_layers",
                "lambda_structure",
                "bootstrap_text",
                "bootstrap_scheduler",
                "bootstrapping_min_cover",
                "use_negative_bootstrap",
                "bootstrap_negative_text",
                "bootstrap_negative_map_threshold",
                "lambda_bootstrap_min",
                "device",
            ]
        }
        texts_config = {
            "screen_text": config["screen_text"],
            "comp_text": config["comp_text"],
            "src_text": config["src_text"],
        }
        common_config.update(texts_config)
        self.loss = LossG(common_config, self.clip_extractor)

        self.config = config

    def forward(self, outputs, inputs):
        """Compute the loss for the finetuned layer.

        `inputs` is modified in place: "input_crop" is set to the layer's
        "cnn_inputs" with the leading dim squeezed. Returns a dict with a
        single key, "foreground" or "background", or an empty dict when
        neither finetune flag is set.
        """
        losses = {}
        # Foreground takes precedence, matching VideoModel.forward: when both
        # flags are set the model only fills outputs["foreground"], so looking
        # for background outputs first would fail with a KeyError.
        if self.config["finetune_foreground"]:
            layer = "foreground"
        elif self.config["finetune_background"]:
            layer = "background"
        else:
            return losses
        inputs["input_crop"] = [el.squeeze(0) for el in outputs[layer]["cnn_inputs"]]
        losses[layer] = self.loss(outputs[layer], inputs)
        return losses
@torch.no_grad()
def get_frames_data(config, foreground_mapping, background_mapping, alpha_model):
    """Evaluate the mapping and alpha networks on every pixel of every frame.

    Args:
        config: Mapping that must provide ``resx``, ``resy``,
            ``maximum_number_of_frames``, ``device``, ``return_atlas_alpha``
            and, when that flag is set, ``grid_atlas_resolution``.
        foreground_mapping: Callable mapping normalized (x, y, t) rows to
            foreground atlas UV coordinates.
        background_mapping: Same, for the background atlas.
        alpha_model: Callable mapping normalized (x, y, t) rows to one opacity
            value per row.

    Returns:
        A 4-tuple ``(background_uv, foreground_uv, alpha, foreground_atlas_alpha)``:
        the two UV tensors have shape ``(frames, resy, resx, 2)``, ``alpha`` is
        permuted to ``(frames, 1, resy, resx)``, and ``foreground_atlas_alpha``
        is either ``None`` or the per-pixel median over frames of the alpha
        interpolated onto the atlas grid, shaped
        ``(1, 1, grid_atlas_resolution, grid_atlas_resolution)``.
    """
    max_size = max(config["resx"], config["resy"])
    # Dividing (x, y, t) by this factor and subtracting 1 centres the coordinates around 0.
    normalizing_factor = torch.tensor([max_size / 2, max_size / 2, config["maximum_number_of_frames"] / 2])
    background_uv_values = torch.zeros(
        size=(config["maximum_number_of_frames"], config["resy"], config["resx"], 2), device=config["device"]
    )
    foreground_uv_values = torch.zeros(
        size=(config["maximum_number_of_frames"], config["resy"], config["resx"], 2), device=config["device"]
    )
    alpha = torch.zeros(
        size=(config["maximum_number_of_frames"], config["resy"], config["resx"], 1), device=config["device"]
    )

    for frame in tqdm(range(config["maximum_number_of_frames"]), leave=False):
        # One row per pixel of the frame; columns are (x, y, frame index).
        indices = get_grid_indices(0, 0, config["resy"], config["resx"], t=torch.tensor(frame))

        normalized_chunk = (indices / normalizing_factor - 1).to(config["device"])

        # get the atlas UV coordinates from the two mapping networks;
        # (the inner no_grad is redundant with the decorator but harmless)
        with torch.no_grad():
            current_background_uv_values = background_mapping(normalized_chunk)
            current_foreground_uv_values = foreground_mapping(normalized_chunk)
            current_alpha = alpha_model(normalized_chunk)

        # Scatter back into image layout: column 1 is y (row), column 0 is x (col).
        # The two layers are shifted into different UV ranges (-0.5 vs +0.5 offset);
        # presumably the networks output values in [-1, 1] -- TODO confirm.
        background_uv_values[frame, indices[:, 1], indices[:, 0]] = current_background_uv_values * 0.5 - 0.5
        foreground_uv_values[frame, indices[:, 1], indices[:, 0]] = current_foreground_uv_values * 0.5 + 0.5
        # Affine remap of the raw alpha: first (a + 1) / 2, then 0.99 * a + 0.001.
        current_alpha = 0.5 * (current_alpha + 1.0)
        current_alpha = 0.99 * current_alpha + 0.001
        alpha[frame, indices[:, 1], indices[:, 0]] = current_alpha

    if config["return_atlas_alpha"]:  # this should take a few minutes
        # Allocated without a device argument; moved to config["device"] after the median.
        foreground_atlas_alpha = torch.zeros(
            size=(
                config["maximum_number_of_frames"],
                config["grid_atlas_resolution"],
                config["grid_atlas_resolution"],
                1,
            ),
        )
        # Scale the foreground UVs to atlas-grid pixel units.
        foreground_uv_values_grid = foreground_uv_values * config["grid_atlas_resolution"]
        indices = get_grid_indices(0, 0, config["grid_atlas_resolution"], config["grid_atlas_resolution"])
        for frame in tqdm(range(config["maximum_number_of_frames"]), leave=False):
            # Resample this frame's scattered alpha samples onto the regular atlas grid.
            interpolated = scipy.interpolate.griddata(
                foreground_uv_values_grid[frame].reshape(-1, 2).cpu().numpy(),
                alpha[frame]
                .reshape(
                    -1,
                )
                .cpu()
                .numpy(),
                indices.reshape(-1, 2).cpu().numpy(),
                method="linear",
            ).reshape(config["grid_atlas_resolution"], config["grid_atlas_resolution"], 1)
            foreground_atlas_alpha[frame] = torch.from_numpy(interpolated)
        # NaN entries produced by the interpolation are treated as fully transparent.
        foreground_atlas_alpha[foreground_atlas_alpha.isnan()] = 0.0
        # Median over frames; the permute also swaps the two spatial axes.
        foreground_atlas_alpha = (
            torch.median(foreground_atlas_alpha, dim=0, keepdim=True).values.to(config["device"]).permute(0, 3, 2, 1)
        )
    else:
        foreground_atlas_alpha = None
    return background_uv_values, foreground_uv_values, alpha.permute(0, 3, 1, 2), foreground_atlas_alpha
@torch.no_grad()
def create_uv_mask(config, mapping_model, min_u, min_v, max_u, max_v, uv_shift=-0.5, resolution_shift=1):
    """Mark which atlas-grid cells are hit by any video pixel, then crop the mask.

    Every pixel of every frame is pushed through ``mapping_model``; the UV
    result is scaled by 0.5, offset by ``uv_shift`` and ``resolution_shift``,
    multiplied by ``grid_atlas_resolution`` and clipped to the grid. The four
    integer neighbours (floor/ceil in each axis) of each mapped point are set
    to 1.

    Args:
        config: Provides ``resx``, ``resy``, ``maximum_number_of_frames``,
            ``grid_atlas_resolution`` and ``device``.
        mapping_model: Callable mapping normalized (x, y, t) rows to UV rows.
        min_u, min_v, max_u, max_v: Forwarded to ``crop`` as
            ``(top=min_v, left=min_u, height=max_v, width=max_u)``.
            NOTE(review): the ``max_*`` names suggest coordinates, but they
            are used as the crop height/width -- confirm against callers.
        uv_shift: Offset added after halving the network output.
        resolution_shift: Offset added before scaling to grid units.

    Returns:
        The cropped mask as a detached CPU tensor with two leading singleton
        dimensions.
    """
    num_frames = config["maximum_number_of_frames"]
    frame_h, frame_w = config["resy"], config["resx"]
    device = config["device"]
    grid_size = config["grid_atlas_resolution"]

    half_extent = max(frame_w, frame_h) / 2
    scale = torch.tensor([half_extent, half_extent, num_frames / 2])
    mask = torch.zeros(size=(grid_size, grid_size), device=device)

    for frame_idx in tqdm(range(num_frames), leave=False):
        pixel_coords = get_grid_indices(0, 0, frame_h, frame_w, t=torch.tensor(frame_idx))
        # Process the frame in batches of 50000 pixels to bound memory use.
        for batch in pixel_coords.split(50000, dim=0):
            uv = mapping_model((batch / scale - 1).to(device))
            uv = uv * 0.5 + uv_shift
            uv = ((uv + resolution_shift) * grid_size).clip(0, grid_size - 1)

            u_low, u_high = uv[:, 0].floor().long(), uv[:, 0].ceil().long()
            v_low, v_high = uv[:, 1].floor().long(), uv[:, 1].ceil().long()
            for rows in (v_low, v_high):
                for cols in (u_low, u_high):
                    mask[rows, cols] = 1

    cropped = crop(mask.unsqueeze(0).unsqueeze(0), min_v, min_u, max_v, max_u)
    return cropped.detach().cpu()
def get_grid_indices(x_start, y_start, h_crop, w_crop, t=None):
    """Enumerate the integer pixel coordinates of a ``h_crop`` x ``w_crop`` window.

    Args:
        x_start: x coordinate of the window's first column.
        y_start: y coordinate of the window's first row.
        h_crop: Window height (number of y values).
        w_crop: Window width (number of x values).
        t: Optional tensor appended to every row as a third column.

    Returns:
        An integer tensor with ``h_crop * w_crop`` rows of ``(x, y)`` -- or
        ``(x, y, t)`` when ``t`` is given -- ordered with x varying slowest.
    """
    num_points = h_crop * w_crop
    xs = torch.arange(w_crop) + x_start
    ys = torch.arange(h_crop) + y_start
    x_grid, y_grid = torch.meshgrid(xs, ys)
    coords = torch.stack((x_grid, y_grid), dim=-1).reshape(num_points, 2)
    if t is None:
        return coords
    time_column = t.repeat(num_points, 1)
    return torch.cat([coords, time_column], dim=1)
def _grow_to_min_extent(start, end, limit, min_size):
    """Widen ``[start, end]`` to at least ``min_size`` and return ``(start, size)``.

    The interval grows at its end when that still fits below ``limit``;
    otherwise it grows at its start. If that would push ``start`` below 0 the
    interval is shifted back to begin at 0 and capped at ``limit``.
    """
    size = end - start
    if size >= min_size:
        return start, size
    pad = min_size - size
    if end + pad > limit:
        start -= pad
        if start < 0:
            # Shift the window back inside the frame instead of returning a negative origin.
            end = min(end - start, limit)
            start = 0
    else:
        end += pad
    return start, end - start


def get_masks_boundaries(alpha_video, border=20, threshold=0.95, min_crop_size=2 ** 7 + 1):
    """Compute one foreground bounding box per frame of an alpha video.

    Args:
        alpha_video: Tensor whose first dimension indexes frames and whose
            last two dimensions are (height, width). It is only read; the
            thresholding no longer overwrites the caller's tensor.
        border: Margin added around the thresholded region, clamped to the frame.
        threshold: Pixels with alpha ``>= threshold`` count as foreground.
        min_crop_size: Minimum height and width of each returned box.

    Returns:
        An int64 tensor of shape ``(num_frames, 4)`` whose rows are
        ``(min_y, min_x, height, width)``. A frame with no foreground pixel
        yields the full frame instead of raising.
    """
    resy, resx = alpha_video.shape[-2:]
    num_frames = alpha_video.shape[0]
    masks_borders = torch.zeros((num_frames, 4), dtype=torch.int64)
    for i in range(num_frames):
        # Build a fresh boolean mask so alpha_video itself is left untouched.
        foreground = (alpha_video[i] >= threshold).squeeze().nonzero()
        if foreground.shape[0] == 0:
            min_y, min_x, max_y, max_x = 0, 0, resy, resx
        else:
            first_y, first_x = foreground.min(dim=0).values.tolist()
            last_y, last_x = foreground.max(dim=0).values.tolist()
            min_y, min_x = max(first_y - border, 0), max(first_x - border, 0)
            max_y, max_x = min(last_y + border, resy), min(last_x + border, resx)
        min_y, h = _grow_to_min_extent(min_y, max_y, resy, min_crop_size)
        min_x, w = _grow_to_min_extent(min_x, max_x, resx, min_crop_size)
        masks_borders[i] = torch.tensor([min_y, min_x, h, w])
    return masks_borders
class BorderlessRandomPerspective(object):
    """Random perspective warp followed by a crop that removes the fill border.

    With probability ``p`` the image is warped, cropped to the axis-aligned
    rectangle inscribed in the warped corners, resized to 224 on its shorter
    side and center-cropped to 224 x 224. Otherwise it is returned unchanged.

    Args:
        distortion_scale (float): degree of distortion passed to
            ``RandomPerspective.get_params``. Default is 0.5.
        p (float): probability of applying the warp. Default is 0.5.
        interpolation (InterpolationMode): interpolation used for the warp.
            Default is ``InterpolationMode.BILINEAR``. Integer values are
            still accepted for backward compatibility and trigger a warning.
        fill (sequence or number): fill value for pixels outside the warped
            image. Default is ``0``; ``None`` is treated as ``0``.
    """

    def __init__(self, distortion_scale=0.5, p=0.5, interpolation=InterpolationMode.BILINEAR, fill=0):
        super().__init__()

        if isinstance(interpolation, int):
            # Legacy callers may still pass integer interpolation codes.
            warnings.warn(
                "Argument interpolation should be of type InterpolationMode instead of int. "
                "Please, use InterpolationMode enum."
            )
            interpolation = _interpolation_modes_from_int(interpolation)

        if fill is None:
            fill = 0
        elif not isinstance(fill, (Sequence, numbers.Number)):
            raise TypeError("Fill should be either a sequence or a number.")

        self.p = p
        self.interpolation = interpolation
        self.distortion_scale = distortion_scale
        self.fill = fill

    @staticmethod
    def get_crop_endpoints(endpoints):
        """Return ``(top, left, height, width)`` of the box inscribed in the warped corners.

        ``endpoints`` holds the four ``(x, y)`` corners in the order
        top-left, top-right, bottom-right, bottom-left.
        """
        top_left, top_right, bottom_right, bottom_left = endpoints
        top = max(top_left[1], top_right[1])
        left = max(top_left[0], bottom_left[0])
        bottom = min(bottom_left[1], bottom_right[1])
        right = min(top_right[0], bottom_right[0])
        return top, left, bottom - top, right - left

    def __call__(self, img):
        fill = self.fill
        if isinstance(img, torch.Tensor):
            # Tensor inputs get one float fill value per channel.
            if isinstance(fill, (int, float)):
                fill = [float(fill)] * get_image_num_channels(img)
            else:
                fill = [float(value) for value in fill]

        apply_warp = bool(torch.rand(1) < self.p)
        if not apply_warp:
            return img

        width, height = get_image_size(img)
        startpoints, endpoints = T.RandomPerspective.get_params(width, height, self.distortion_scale)
        warped = perspective(img, startpoints, endpoints, self.interpolation, fill)
        top, left, crop_h, crop_w = self.get_crop_endpoints(endpoints)
        borderless = crop(warped, top, left, crop_h, crop_w)
        return T.CenterCrop(224)(T.Resize(224)(borderless))
def forward(self, outputs, inputs):
    """Combine the enabled loss terms into one weighted generator loss.

    Args:
        outputs: Mapping that may contain ``output_crop`` and/or
            ``output_image``; each holds the iterables ``composite``,
            ``edit_on_greenscreen``, ``edit`` and ``alpha``.
        inputs: Mapping with ``step`` and, for every group present in
            ``outputs``, the matching ``input_crop`` / ``input_image``.

    Returns:
        Dict holding each computed term under its ``loss_*`` key and the
        weighted sum under ``"loss"``.

    Raises:
        ValueError: if bootstrapping is active and ``bootstrap_scheduler`` is
            not ``linear``, ``exponential`` or ``none``.
    """
    cfg = self.cfg
    losses = {}
    total = 0

    # Gather the crop-level and image-level results into flat lists.
    composites, greenscreens, edits, alphas, sources = [], [], [], [], []
    for out_key, in_key in (("output_crop", "input_crop"), ("output_image", "input_image")):
        if out_key not in outputs:
            continue
        rendered = outputs[out_key]
        composites += rendered["composite"]
        greenscreens += rendered["edit_on_greenscreen"]
        edits += rendered["edit"]
        alphas += rendered["alpha"]
        sources += inputs[in_key]

    # Alpha bootstrapping: only during the first `bootstrap_epoch` steps.
    if inputs["step"] < cfg["bootstrap_epoch"] and cfg["lambda_bootstrap"] > 0:
        losses["loss_bootstrap"] = self.calculate_relevancy_loss(alphas, sources)

        scheduler = cfg["bootstrap_scheduler"]
        if scheduler == "linear":
            weight = cfg["lambda_bootstrap"] * (1 - (inputs["step"] + 1) / cfg["bootstrap_epoch"])
        elif scheduler == "exponential":
            # The decayed weight is stored so it keeps shrinking across calls.
            weight = self.lambda_bootstrap * 0.99
            self.lambda_bootstrap = weight
        elif scheduler == "none":
            weight = self.lambda_bootstrap
        else:
            raise ValueError("Unknown bootstrap scheduler")
        weight = max(weight, cfg["lambda_bootstrap_min"])
        total = total + losses["loss_bootstrap"] * weight

    # Structure term.
    if cfg["lambda_structure"] > 0:
        losses["loss_structure"] = self.calculate_structure_loss(composites, sources)
        total = total + losses["loss_structure"] * cfg["lambda_structure"]

    # Composition terms: CLIP similarity plus CLIP direction.
    if cfg["lambda_composition"] > 0:
        losses["loss_comp_clip"] = self.calculate_clip_loss(composites, self.target_comp_e)
        losses["loss_comp_dir"] = self.calculate_clip_dir_loss(sources, composites, self.target_comp_e)
        total = total + (losses["loss_comp_clip"] + losses["loss_comp_dir"]) * cfg["lambda_composition"]

    # Alpha sparsity term.
    if cfg["lambda_sparsity"] > 0:
        sparsity, sparsity_l0, sparsity_l1 = self.calculate_alpha_reg(alphas)
        losses["loss_sparsity"] = sparsity
        losses["loss_sparsity_l0"] = sparsity_l0
        losses["loss_sparsity_l1"] = sparsity_l1
        total = total + sparsity * cfg["lambda_sparsity"]

    # Green-screen term.
    if cfg["lambda_screen"] > 0:
        losses["loss_screen"] = self.calculate_clip_loss(greenscreens, self.target_greenscreen_e)
        total = total + losses["loss_screen"] * cfg["lambda_screen"]

    losses["loss"] = total
    return losses
def calculate_clip_loss(self, outputs, target_embeddings):
    """Average text criterion between each image and a random subset of text embeddings.

    Args:
        outputs: Iterable of image tensors; each is given a leading dimension
            via ``unsqueeze(0)`` before being embedded.
        target_embeddings: Tensor of text embeddings indexed along dim 0.

    Returns:
        The sum of ``self.text_criterion`` over all (image, sampled embedding)
        pairs divided by the number of pairs.
    """
    num_available = len(target_embeddings)
    # Draw the subset size first (NumPy RNG), then the indices (torch RNG, with replacement).
    subset_size = np.random.randint(1, num_available + 1)
    picked = torch.randint(num_available, (subset_size,))
    sampled_targets = target_embeddings[picked]

    total = 0.0
    for image in outputs:
        # Embed one image at a time to keep peak memory low.
        image_embedding = self.clip_extractor.get_image_embedding(image.unsqueeze(0))
        for text_embedding in sampled_targets:
            total = total + self.text_criterion(image_embedding, text_embedding.unsqueeze(0))

    return total / (len(outputs) * len(sampled_targets))
def calculate_relevancy_loss(self, alpha, input_img):
    """Bootstrapping loss tying the predicted alpha to CLIP relevancy maps.

    Each (alpha, image) pair is randomly cropped with the same window and
    resized to 224 x 224; ``self.relevancy_criterion`` then compares the alpha
    with the relevancy map from ``self.relevancy_extractor``. When
    ``use_negative_bootstrap`` is set, a second term compares ``1 - alpha``
    with the negative relevancy map where that map exceeds
    ``bootstrap_negative_map_threshold``.

    Args:
        alpha: Sequence of alpha tensors (stacked with the matching image,
            so both must share a shape).
        input_img: Sequence of source images, aligned with ``alpha``.

    Returns:
        The mean over all pairs of the per-pair loss. The original code
        overwrote the running value on every iteration, so only the last
        pair contributed while the result was still divided by
        ``len(alpha)``; the per-pair losses are now accumulated.
    """
    augment = T.Compose(
        [
            RandomSizeCrop(min_cover=self.cfg["bootstrapping_min_cover"]),
            T.Resize((224, 224)),
        ]
    )
    total_loss = 0.0
    for curr_alpha, curr_img in zip(alpha, input_img):
        # Stack so that alpha and image receive the identical random crop.
        x = augment(torch.stack([curr_alpha, curr_img], dim=0))  # [2, 3, H, W]
        curr_alpha, curr_img = x[0].unsqueeze(0), x[1].unsqueeze(0)

        positive_relevance = self.relevancy_extractor(curr_img)
        pair_loss = self.relevancy_criterion(curr_alpha[0], positive_relevance.repeat(3, 1, 1))

        if self.cfg["use_negative_bootstrap"]:
            negative_relevance = self.relevancy_extractor(curr_img, negative=True)
            relevant_values = negative_relevance > self.cfg["bootstrap_negative_map_threshold"]
            negative_alpha_local = (1 - curr_alpha) * relevant_values.unsqueeze(1)
            negative_relevance_local = negative_relevance * relevant_values
            pair_loss = pair_loss + self.relevancy_criterion(
                negative_alpha_local,
                negative_relevance_local.unsqueeze(1).repeat(1, 3, 1, 1),
            )

        total_loss = total_loss + pair_loss
    return total_loss / len(alpha)
def get_text_criterion(cfg):
    """Return the text/image embedding loss selected by ``cfg["text_criterion"]``.

    Args:
        cfg: Mapping with a ``text_criterion`` entry, either ``"spherical"``
            or ``"cosine"``.

    Returns:
        ``spherical_dist_loss`` or ``cosine_loss``.

    Raises:
        NotImplementedError: for any other value. The original code returned
            the exception object instead of raising it, so a bad name only
            failed later when the "criterion" was called.
    """
    name = cfg["text_criterion"]
    if name == "spherical":
        return spherical_dist_loss
    if name == "cosine":
        return cosine_loss
    raise NotImplementedError(f"text criterion [{name}] is not implemented")
class DataLogger:
    """Collects losses, atlas images, videos and checkpoints for one training run.

    The most recent edited video of each layer is cached so that a full
    composite can be rebuilt from ``dataset.all_alpha``.
    """

    def __init__(self, config, dataset):
        # Until a layer has been edited, both layers fall back to the original video.
        self.layers_edits = {
            "background": dataset.original_video.detach().cpu(),
            "foreground": dataset.original_video.detach().cpu(),
        }
        self.alpha_video = dataset.all_alpha.detach().cpu()
        self.config = config
        self.layer_name = "foreground" if config["finetune_foreground"] else "background"

    @torch.no_grad()
    def log_data(self, epoch, lr, losses, model, dataset):
        """Build the log dictionary for ``epoch``.

        Args:
            epoch: Current epoch number.
            lr: Current learning rate, stored as-is.
            losses: ``{layer: {loss_name: tensor}}``; each tensor is detached
                and stored under ``Loss/<layer>_<loss_name>``.
            model: Model passed to ``dataset.render_video_from_atlas`` and
                saved in checkpoints.
            dataset: Object providing ``render_video_from_atlas``.

        Returns:
            Dict with the losses, ``epoch`` and ``lr``. Every
            ``log_images_freq`` epochs it also holds the masked atlases, the
            RGBA edit layer, the per-layer videos and the full composite
            (wrapped in wandb objects when ``use_wandb`` is set), and a
            checkpoint is written once ``epoch`` exceeds
            ``save_model_starting_epoch``.
        """
        use_wandb = self.config["use_wandb"]
        if use_wandb:
            # This module never imported wandb at file level, so every wandb.* call
            # raised NameError; import it lazily to keep it optional.
            import wandb

        log_data = {}
        for layer, layer_losses in losses.items():
            for key in layer_losses:
                log_data[f"Loss/{layer}_{key}"] = layer_losses[key].detach()
        log_data["epoch"] = epoch
        log_data["lr"] = lr

        if epoch % self.config["log_images_freq"] == 0:
            layer = self.layer_name
            edited_atlas_dict, edit_dict, uv_mask = dataset.render_video_from_atlas(model, layer=layer)
            alpha_of_edit = None
            edit_only = None
            for key, atlas in edited_atlas_dict.items():
                masked_atlas = atlas.detach().cpu() * uv_mask
                if key == "edit":
                    # The raw edit is only logged as part of the RGBA layer below.
                    edit_only = masked_atlas
                    continue
                masked = tensor2im(masked_atlas)
                log_data[f"Atlases/{layer}_masked_{key}"] = wandb.Image(masked) if use_wandb else masked
                if key == "alpha":
                    alpha_of_edit = masked_atlas
            # RGB from the edit plus the first alpha channel.
            rgba_edit = tensor2im(torch.cat((edit_only, alpha_of_edit[:, [0]]), dim=1))
            log_data[f"Atlases/{layer}_rgba_layer"] = wandb.Image(rgba_edit) if use_wandb else rgba_edit

            for key in edit_dict:
                if key != "composite" and key != "edit":
                    video = (255 * edit_dict[key].detach().cpu()).to(torch.uint8)
                    log_data[f"Videos/{layer}_{key}"] = (
                        wandb.Video(video, fps=10, format="mp4") if use_wandb else video
                    )

            if self.config[f"finetune_{layer}"]:
                self.layers_edits[layer] = edit_dict["composite"].detach().cpu()

            # Alpha-blend the latest foreground and background edits.
            full_video = (
                self.alpha_video * self.layers_edits["foreground"]
                + (1 - self.alpha_video) * self.layers_edits["background"]
            )
            full_video = (255 * full_video.detach().cpu()).to(torch.uint8)
            log_data["Videos/full_video"] = (
                wandb.Video(full_video, fps=10, format="mp4") if use_wandb else full_video
            )

            # save model checkpoint
            if epoch > self.config["save_model_starting_epoch"]:
                filename = f"checkpoint_epoch_{epoch}.pt"
                dict_to_save = {
                    "model": model.state_dict(),
                }
                if use_wandb:
                    checkpoint_path = f"{wandb.run.dir}/{filename}"
                else:
                    checkpoint_path = f"{self.config['results_folder']}/{filename}"
                torch.save(dict_to_save, checkpoint_path)
        return log_data

    def save_locally(self, log_data):
        """Write the ``Videos/*`` and ``Atlases/*`` entries to ``<results_folder>/<epoch>/``.

        Videos are permuted with ``(0, 2, 3, 1)`` and saved as ``.mp4``,
        atlases as ``.png``; ``/`` in a key becomes ``_`` in the file name.
        Other entries are ignored.
        """
        path = Path(self.config["results_folder"], str(log_data["epoch"]))
        path.mkdir(parents=True, exist_ok=True)
        for key, value in log_data.items():
            save_name = key.replace("/", "_")
            if key.startswith("Videos"):
                imageio.mimwrite(f"{path}/{save_name}.mp4", value.permute(0, 2, 3, 1))
            elif key.startswith("Atlases"):
                imageio.imwrite(f"{path}/{save_name}.png", value)