stellaathena committed on
Commit
bb5cd12
1 Parent(s): 23ee17f

This should work

LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Aleph Alpha GmbH
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,54 @@
1
+ import gradio as gr
2
+ import re
3
+ from magma import Magma
4
+ from magma.image_input import ImageInput
5
+
6
+ model = Magma.from_checkpoint(
7
+ config_path = "configs/MAGMA_v1.yml",
8
+ checkpoint_path = "./mp_rank_00_model_states.pt",
9
+ device = 'cuda:0'
10
+ )
11
+
12
+ def generate(context, length, temperature, top_k):
13
+ context = context.strip()
14
+
15
+ url_regex = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
16
+ lines = context.split('\n')
17
+ inputs = []
18
+ for line in lines:
19
+ if re.match(url_regex, line):
20
+ try:
21
+ inputs.append(ImageInput(line))
22
+ except Exception as e:
23
+ return str(e)
24
+ else:
25
+ inputs.append(line)
26
+
27
+ ## returns a tensor of shape: (1, 149, 4096)
28
+ embeddings = model.preprocess_inputs(inputs)
29
+
30
+ ## returns a list of length embeddings.shape[0] (batch size)
31
+ output = model.generate(
32
+ embeddings = embeddings,
33
+ max_steps = length,
34
+ temperature = (0.01 if temperature == 0 else temperature),
35
+ top_k = top_k
36
+ )
37
+
38
+ return context + output[0]
39
+
40
+ iface = gr.Interface(
41
+ fn=generate,
42
+ inputs=[
43
+ gr.inputs.Textbox(
44
+ label="Prompt (image URLs need to be on their own lines):",
45
+ default="https://www.art-prints-on-demand.com/kunst/thomas_cole/woods_hi.jpg\nDescribe the painting:",
46
+ lines=7),
47
+ gr.inputs.Slider(minimum=1, maximum=100, default=15, step=1, label="Output tokens:"),
48
+ gr.inputs.Slider(minimum=0.0, maximum=1.0, default=0.7, label='Temperature'),
49
+ gr.inputs.Slider(minimum=0, maximum=100, default=0, step=1, label='Top K')
50
+ ],
51
+ outputs=["textbox"]
52
+ ).launch(share=True)
53
+
54
+
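For reference, the URL-vs-text routing that generate() applies to each prompt line can be exercised without loading the model. A small standalone sketch (mine, not part of this commit; the example URL is illustrative):

import re

URL_REGEX = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'

def split_prompt(context: str):
    # classify each line the same way generate() does: URL lines become image inputs, the rest stays text
    pieces = []
    for line in context.strip().split('\n'):
        pieces.append(("image", line) if re.match(URL_REGEX, line) else ("text", line))
    return pieces

print(split_prompt("https://example.com/painting.jpg\nDescribe the painting:"))
# [('image', 'https://example.com/painting.jpg'), ('text', 'Describe the painting:')]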
configs/MAGMA_v1.yml ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ # image encoder settings
3
+ encoder_name: 'clip_resnet_large',
4
+ adapter_config: {"mlp": {"adapter_type": "normal", "downsample_factor": 4}},
5
+ freeze_img_encoder: false,
6
+
7
+ # train settings
8
+ batch_size: 256,
9
+ train_steps: 150000,
10
+ lr: 8.0e-4,
11
+ min_lr: 0.0,
12
+ lr_decay_iters: 300000,
13
+ image_enc_lr: 2.0e-6,
14
+ use_image_embed_layernorm: true,
15
+ image_embed_dropout_prob: 0.1,
16
+ image_size: 384,
17
+
18
+ gradient_accumulation_steps: 8,
19
+ zero_stage: 2,
20
+ gradient_clipping: 1.0,
21
+
22
+ # dataset / save / load settings
23
+ train_dataset_name: 'conceptual_captions',
24
+ train_dataset_dir: '/mnt/localdisk/conceptual_captions',
25
+ eval_dataset_name: 'coco',
26
+ eval_dataset_dir: '/mnt/localdisk/coco_data',
27
+
28
+ save: "/mnt/shared_vol/checkpoints/multimodal_transformer_rn50x16",
29
+ load: "/mnt/shared_vol/checkpoints/multimodal_transformer_rn50x16",
30
+
31
+ eval_every: 100,
32
+
33
+ }
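DeepSpeed treats batch_size here as the global train_batch_size (see the deepspeed_config_params assembled in magma/config.py further down in this diff), so the per-GPU micro-batch follows from the GPU count, which this config does not fix. A small arithmetic sketch under an assumed 8-GPU setup:

train_batch_size = 256                 # batch_size above
gradient_accumulation_steps = 8
world_size = 8                         # assumption, not stated in the config
micro_batch_per_gpu = train_batch_size // (gradient_accumulation_steps * world_size)
print(micro_batch_per_gpu)             # 4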
configs/MAGMA_v2.yml ADDED
@@ -0,0 +1,36 @@
1
+ {
2
+ # image encoder settings
3
+ encoder_name: 'clip_resnet_large',
4
+ adapter_config: {"mlp": {"adapter_type": "normal", "downsample_factor": 8}, "attention": {"adapter_type": "normal", "downsample_factor": 8}},
5
+ freeze_img_encoder: false,
6
+
7
+ # train settings
8
+ batch_size: 256,
9
+ train_steps: 150000,
10
+ lr: 8.0e-4,
11
+ min_lr: 0.0,
12
+ lr_decay_iters: 300000,
13
+ image_enc_lr: 2.0e-6,
14
+ use_image_embed_layernorm: true,
15
+ image_embed_dropout_prob: 0.1,
16
+ image_size: 384,
17
+
18
+ gradient_accumulation_steps: 4,
19
+ zero_stage: 2,
20
+ gradient_clipping: 1.0,
21
+
22
+ # dataset / save / load settings
23
+ dataset_type: 'new',
24
+ train_dataset_dir: ['/mnt/localdisk/laion', '/mnt/brick/CC3M_converted', '/mnt/localdisk/localized_narratives', '/mnt/localdisk/visual_genome_converted', '/mnt/localdisk/hateful_memes_converted', '/mnt/localdisk/coco_converted', '/mnt/brick/wit_converted', '/mnt/localdisk/gqa_train_converted', '/mnt/localdisk/vqa_train_converted', '/mnt/localdisk/okvqa_train_converted'], #'/mnt/brick/wit_converted'
25
+
26
+ eval_dataset_dir: null, # if this is none, train dataset will be split
27
+ vqa_dir: "/mnt/localdisk/vqa_val_converted",
28
+ gqa_dir: "/mnt/localdisk/gqa_val_converted",
29
+
30
+ save: "/mnt/shared_vol/checkpoints/MAGMA_RN50x16",
31
+ load: "/mnt/shared_vol/checkpoints/MAGMA_RN50x16",
32
+
33
+ eval_every: 250,
34
+ wandb_project: "MAGMA_training",
35
+ name: "MAGMA_RN50x16_v1"
36
+ }
example_inference.py ADDED
@@ -0,0 +1,27 @@
1
+ from magma import Magma
2
+ from magma.image_input import ImageInput
3
+
4
+ model = Magma.from_checkpoint(
5
+ config_path = "configs/MAGMA_v1.yml",
6
+ checkpoint_path = "./mp_rank_00_model_states.pt",
7
+ device = 'cuda:0'
8
+ )
9
+
10
+ inputs =[
11
+ ## supports urls and path/to/image
12
+ ImageInput('https://www.art-prints-on-demand.com/kunst/thomas_cole/woods_hi.jpg'),
13
+ 'Describe the painting:'
14
+ ]
15
+
16
+ ## returns a tensor of shape: (1, 149, 4096)
17
+ embeddings = model.preprocess_inputs(inputs)
18
+
19
+ ## returns a list of length embeddings.shape[0] (batch size)
20
+ output = model.generate(
21
+ embeddings = embeddings,
22
+ max_steps = 6,
23
+ temperature = 0.7,
24
+ top_k = 0,
25
+ )
26
+
27
+ print(output[0]) ## A cabin on a lake
examples/magma_oracle.png ADDED
examples/magma_present.jpg ADDED
examples/magma_social.png ADDED
examples/magma_treasure.png ADDED
examples/magma_tree.jpg ADDED
examples/model.jpg ADDED
magma/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ from .config import MultimodalConfig
2
+ from .magma import Magma
3
+ from .language_model import get_gptj
4
+ from .transforms import get_transforms
5
+ from .utils import (
6
+ count_parameters,
7
+ is_main,
8
+ cycle,
9
+ get_tokenizer,
10
+ parse_args,
11
+ wandb_log,
12
+ wandb_init,
13
+ save_model,
14
+ load_model,
15
+ print_main,
16
+ configure_param_groups,
17
+ log_table,
18
+ )
19
+ from .train_loop import eval_step, inference_step, train_step
20
+ from .datasets import collate_fn
magma/adapters.py ADDED
@@ -0,0 +1,116 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torchtyping import TensorType
4
+
5
+
6
+ class Adapter(nn.Module):
7
+ def __init__(
8
+ self,
9
+ dim: int,
10
+ downsample_factor: int = 4,
11
+ activation: nn.Module = nn.ReLU,
12
+ add_layernorm: bool = False,
13
+ ):
14
+ super().__init__()
15
+ layers = []
16
+ if add_layernorm:
17
+ layers.append(nn.LayerNorm(dim))
18
+ layers.extend(
19
+ [
20
+ nn.Linear(dim, dim // downsample_factor),
21
+ activation(),
22
+ nn.Linear(dim // downsample_factor, dim),
23
+ ]
24
+ )
25
+ self.adapter = nn.Sequential(*layers)
26
+ self.adapter.apply(self.init_weights)
27
+
28
+ def init_weights(self, m: nn.Module, std=1e-3):
29
+ if isinstance(m, nn.Linear):
30
+ torch.nn.init.normal_(m.weight, std=std)
31
+ torch.nn.init.normal_(m.bias, std=std)
32
+ m.weight.data = torch.clamp(m.weight.data, min=-2 * std, max=2 * std)
33
+ m.bias.data = torch.clamp(m.bias.data, min=-2 * std, max=2 * std)
34
+ elif isinstance(m, nn.LayerNorm):
35
+ m.bias.data.zero_()
36
+ m.weight.data.fill_(1.0)
37
+
38
+ def forward(self, x: TensorType["b", "s", "d"]) -> TensorType["b", "s", "d"]:
39
+ return self.adapter(x) + x
40
+
41
+
42
+ class ParallelAdapter(Adapter):
43
+ def __init__(
44
+ self,
45
+ module: nn.Module,
46
+ dim: int,
47
+ downsample_factor: int = 4,
48
+ scaled: bool = False,
49
+ add_layernorm: bool = False,
50
+ activation: nn.Module = nn.ReLU,
51
+ ):
52
+ super().__init__(
53
+ dim, downsample_factor, add_layernorm=add_layernorm, activation=activation
54
+ )
55
+ self.module = module
56
+
57
+ if scaled:
58
+ # init scaling param
59
+ self.adapter_scale = nn.Parameter(torch.ones(1))
60
+ else:
61
+ self.adapter_scale = 1
62
+
63
+ def forward(self, x: TensorType["b", "s", "d"], **module_kwargs):
64
+ y = self.module(x, **module_kwargs)
65
+ z = self.adapter(x)
66
+ return y + (z * self.adapter_scale)
67
+
68
+
69
+ class ParallelAdapterWrapper(ParallelAdapter):
70
+ # used to add an adapter to the attention block
71
+
72
+ def __init__(
73
+ self,
74
+ module: nn.Module,
75
+ dim: int,
76
+ downsample_factor: int = 4,
77
+ scaled: bool = False,
78
+ add_layernorm: bool = False,
79
+ activation: nn.Module = nn.ReLU,
80
+ ):
81
+ super().__init__(
82
+ module, dim, downsample_factor, scaled, add_layernorm, activation
83
+ )
84
+
85
+ def forward(self, x: TensorType["b", "s", "d"], *attn_args, **attn_kwargs):
86
+ attn_outputs = self.module(x, *attn_args, **attn_kwargs)
87
+ attn_output, outputs = (
88
+ attn_outputs[0],
89
+ attn_outputs[1:],
90
+ ) # output_attn: a, present, (attentions)
91
+ hidden_states = attn_output + (self.adapter(x) * self.adapter_scale)
92
+ return (hidden_states,) + outputs
93
+
94
+
95
+ class AdapterWrapper(Adapter):
96
+ # used to add an adapter to the attention block
97
+
98
+ def __init__(
99
+ self,
100
+ attn_block: nn.Module,
101
+ dim: int,
102
+ downsample_factor: int = 4,
103
+ activation: nn.Module = nn.ReLU,
104
+ add_layernorm: bool = False,
105
+ ):
106
+ super().__init__(dim, downsample_factor, activation, add_layernorm)
107
+ self.attn_block = attn_block
108
+
109
+ def forward(self, x: TensorType["b", "s", "d"], *attn_args, **attn_kwargs):
110
+ attn_outputs = self.attn_block(x, *attn_args, **attn_kwargs)
111
+ attn_output, outputs = (
112
+ attn_outputs[0],
113
+ attn_outputs[1:],
114
+ ) # output_attn: a, present, (attentions)
115
+ hidden_states = self.adapter(attn_output) + attn_output
116
+ return (hidden_states,) + outputs
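For orientation, a short sketch (mine, not part of this commit) that runs random activations through the two basic adapter variants defined above; it assumes the magma package from this diff plus torch/torchtyping are installed, and uses 4096 for the hidden size to match GPT-J as used elsewhere in the repo. Both variants are residual, so the output shape equals the input shape.

import torch
import torch.nn as nn
from magma.adapters import Adapter, ParallelAdapter

x = torch.randn(1, 8, 4096)                      # (batch, seq, hidden)

bottleneck = Adapter(dim=4096, downsample_factor=4)
print(bottleneck(x).shape)                       # torch.Size([1, 8, 4096]) -- adapter(x) + x

mlp = nn.Sequential(nn.Linear(4096, 16384), nn.GELU(), nn.Linear(16384, 4096))
parallel = ParallelAdapter(module=mlp, dim=4096, downsample_factor=4, scaled=True)
print(parallel(x).shape)                         # torch.Size([1, 8, 4096]) -- module(x) + scale * adapter(x)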
magma/config.py ADDED
@@ -0,0 +1,144 @@
1
+ from dataclasses import dataclass, asdict
2
+ import yaml
3
+ from pprint import pprint
4
+ from .utils import is_main
5
+ import os
6
+ from pathlib import Path
7
+ import uuid
8
+
9
+
10
+ def load_config(path, config_dir=Path("configs")):
11
+ if not path.endswith(".yml"):
12
+ path += ".yml"
13
+ if not os.path.exists(path):
14
+ path = config_dir / path
15
+ with open(path, "r") as stream:
16
+ config = yaml.safe_load(stream)
17
+ return config
18
+
19
+
20
+ @dataclass
21
+ class MultimodalConfig:
22
+
23
+ # Training:
24
+ # ------------------------------------------------------------
25
+
26
+ batch_size: int
27
+ train_steps: int
28
+ optimizer_name: str = "AdamW"
29
+ lr: float = 8.0e-4
30
+ image_enc_lr: float = None
31
+ min_lr: float = 0.0
32
+ lr_decay_iters: int = None
33
+ gradient_accumulation_steps: int = 1
34
+ image_size: int = 256
35
+ eval_every: int = 250
36
+ eval_steps: int = 25
37
+ zero_stage: int = 2
38
+ gradient_clipping: float = 1.0
39
+ warmup_num_steps: int = 100
40
+ weight_decay: float = 0.00
41
+ run_blind: bool = False
42
+ fine_tune: bool = False
43
+ load_optimizer: bool = True
44
+
45
+ # Checkpointing:
46
+ # ------------------------------------------------------------
47
+ save_every: int = 2500
48
+ save: str = None
49
+ load: str = None
50
+
51
+ # Data:
52
+ # ------------------------------------------------------------
53
+ train_dataset_name: str = "conceptual_captions"
54
+ eval_dataset_name: str = "/data/conceptual_captions"
55
+ train_dataset_dir: str = "/data/coco_data"
56
+ eval_dataset_dir: str = "/data/coco_data"
57
+ eval_dataset_pct: float = 0.1
58
+
59
+ # Model architecture:
60
+ # ------------------------------------------------------------
61
+ encoder_name: str = "clip"
62
+ tokenizer_name: str = "gpt2"
63
+ lm_name: str = "EleutherAI/gpt-j-6B"
64
+ image_seq_len: int = 2
65
+ pretrained_img_encoder: bool = False
66
+ seq_len: int = None
67
+
68
+ # Layer Freezing settings:
69
+ # ------------------------------------------------------------
70
+ freeze_lm: bool = True
71
+ freeze_img_encoder: bool = True
72
+
73
+ image_embed_dropout_prob: float = 0.0
74
+ use_image_embed_layernorm: bool = False
75
+
76
+ # Adapter settings:
77
+ # ------------------------------------------------------------
78
+ adapter_config: dict = None
79
+
80
+ # Classification Finetuning settings:
81
+ # ------------------------------------------------------------
82
+ class_dict: dict = None # {num_classes: .., ckpt_path: .., classifier_type:, .., interface_type: .., interface_position: .., freeze_model: ..}
83
+
84
+ # Logging settings:
85
+ # ------------------------------------------------------------
86
+ name: str = None # name, just used for wandb logging
87
+ log_every: int = 1
88
+ wandb_project: str = "magma"
89
+
90
+ def print(self):
91
+ if is_main():
92
+ print("-" * 100)
93
+ pprint(self.__dict__, indent=4)
94
+ print("-" * 100)
95
+
96
+ def __post_init__(self):
97
+ self.is_classifier = self.class_dict is not None
98
+ if self.adapter_config is None:
99
+ self.adapter_config = {}
100
+
101
+ # Deepspeed Settings:
102
+ # ------------------------------------------------------------
103
+ if self.lr_decay_iters is None:
104
+ self.lr_scheduler = "WarmupLR"
105
+ self.scheduler_dict = {
106
+ "type": self.lr_scheduler,
107
+ "params": {
108
+ "warmup_min_lr": self.min_lr,
109
+ "warmup_max_lr": self.lr,
110
+ "warmup_num_steps": self.warmup_num_steps,
111
+ },
112
+ }
113
+ else:
114
+ self.lr_scheduler = "WarmupDecayLR"
115
+ self.scheduler_dict = {
116
+ "type": self.lr_scheduler,
117
+ "params": {
118
+ "total_num_steps": self.lr_decay_iters,
119
+ "warmup_min_lr": self.min_lr,
120
+ "warmup_max_lr": self.lr,
121
+ "warmup_num_steps": self.warmup_num_steps,
122
+ },
123
+ }
124
+ self.deepspeed_config_params = {
125
+ "train_batch_size": self.batch_size,
126
+ "gradient_accumulation_steps": self.gradient_accumulation_steps,
127
+ "gradient_clipping": self.gradient_clipping,
128
+ "fp16": {"enabled": True, "loss_scale_window": 250},
129
+ "scheduler": self.scheduler_dict,
130
+ "zero_optimization": {
131
+ "stage": self.zero_stage,
132
+ "load_from_fp32_weights": False,
133
+ },
134
+ }
135
+
136
+ if self.name is None:
137
+ self.name = str(uuid.uuid4())[:8]
138
+
139
+ @classmethod
140
+ def from_yml(cls, path):
141
+ return cls(**load_config(path))
142
+
143
+ def to_dict(self):
144
+ return asdict(self)
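A usage sketch (not part of the commit): loading one of the YAML files above via MultimodalConfig.from_yml and inspecting what __post_init__ derives from it. It assumes the repository's dependencies are installed and that you run from the repository root so that configs/MAGMA_v1.yml resolves.

from magma.config import MultimodalConfig

config = MultimodalConfig.from_yml("configs/MAGMA_v1.yml")

print(config.lr_scheduler)                                   # WarmupDecayLR, since lr_decay_iters is set
print(config.scheduler_dict["params"]["total_num_steps"])    # 300000
print(config.deepspeed_config_params["train_batch_size"])    # 256
print(config.name)                                           # random 8-character name (the v1 config sets no name)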
magma/datasets/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .dataset import (
2
+ ImgCptDataset,
3
+ collate_fn,
4
+ )
5
+
magma/datasets/convert_datasets.py ADDED
@@ -0,0 +1,118 @@
1
+ from PIL import Image
2
+ from PIL import UnidentifiedImageError
3
+ import os
4
+ import json
5
+ from pathlib import Path
6
+ from tqdm import tqdm
7
+ import shutil
8
+
9
+
10
+ def save_to_jsons(data_list, target_dir, starting_idx=0):
11
+ pbar = tqdm(
12
+ enumerate(data_list), desc=f"saving {len(data_list)} jsons to {str(target_dir)}"
13
+ )
14
+ for k, data in pbar:
15
+ filename = Path(target_dir) / Path(f"{k+starting_idx}.json")
16
+ with open(filename, "w") as f:
17
+ json.dump(data, f)
18
+
19
+ return None
20
+
21
+
22
+ def save_images(img_list, target_dir, mode="mv"):
23
+ for img_path in tqdm(
24
+ img_list,
25
+ desc=f"saving {len(img_list)} images (mode={mode}) to {str(target_dir)}",
26
+ ):
27
+ if mode == "mv":
28
+ shutil.move(img_path, target_dir)
29
+ elif mode == "cp":
30
+ shutil.copy(img_path, target_dir)
31
+
32
+
33
+ def convert_dataset(
34
+ data_dir,
35
+ dir_size=10000,
36
+ hash_fn=None,
37
+ mode="mv",
38
+ ds_iterator=None,
39
+ ):
40
+ """
41
+ Builds a dataset directory in our standard format. ds_iterator should return data of the form
42
+ image_path, {"captions": [...], "metadata": {...}, }, where image_path should be a Path object, captions should map to a list of strings
43
+ and metadata can contain any custom data about the image. If a hash_fn is specified (such as phash), the image hash gets saved in metadata.
44
+ """
45
+
46
+ data_dir = Path(data_dir)
47
+
48
+ # folders for images and corresponding data which is stored in a json file for each image
49
+ os.makedirs(data_dir / "images", exist_ok=True)
50
+ os.makedirs(data_dir / "image_data", exist_ok=True)
51
+
52
+ img_data_list = []
53
+ img_path_list = []
54
+ save_img_dir = data_dir / "images" / "0"
55
+ save_data_dir = data_dir / "image_data" / "0"
56
+ num_img_dirs = 0
57
+
58
+ # save the new locations of all img files in case some datafiles point to the same image
59
+ new_img_locations = {}
60
+
61
+ pbar = tqdm(
62
+ enumerate(ds_iterator),
63
+ desc="converting dataset to standard format...",
64
+ )
65
+
66
+ for k, (img_path, data) in pbar:
67
+ img_cpt_data = {}
68
+ # get img data
69
+ img_cpt_data.update(data)
70
+
71
+ if str(img_path) in new_img_locations.keys():
72
+ # if filename is in the dictionary, it already has a new location
73
+ new_img_path = new_img_locations[str(img_path)]["new_img_path"]
74
+ img_cpt_data["image_path"] = new_img_path
75
+ if hash_fn is not None:
76
+ img_cpt_data["metadata"]["image_hash"] = new_img_locations[
77
+ str(img_path)
78
+ ]["hash"]
79
+ else:
80
+ # if file exists in the old location, it will get moved to a new directory
81
+ new_img_path = f"images/{save_img_dir.name}/{img_path.name}"
82
+ img_cpt_data["image_path"] = new_img_path
83
+ new_img_locations[str(img_path)] = {"new_img_path": new_img_path}
84
+ # original location is saved and the file is later moved to the new directory
85
+ img_path_list.append(img_path)
86
+
87
+ # if given, apply hash fn
88
+ if hash_fn is not None:
89
+ try:
90
+ img = Image.open(img_path).convert("RGB")
91
+ hash_str = str(hash_fn(img))
92
+ img_cpt_data["metadata"]["image_hash"] = hash_str
93
+ # save hash so it does not have to be recomputed
94
+ new_img_locations[str(img_path)]["hash"] = hash_str
95
+ except (UnidentifiedImageError, FileNotFoundError):
96
+ print("Warning: corrupted or non-existent Image")
97
+
98
+ img_data_list.append(img_cpt_data)
99
+
100
+ # save images in specified images folder (maximum of dir_size images per folder)
101
+ if (len(img_path_list) % dir_size == 0 and len(img_path_list) > 0) or (
102
+ k == len(ds_iterator) - 1
103
+ ):
104
+ os.makedirs(save_img_dir, exist_ok=True)
105
+ save_images(img_path_list, save_img_dir, mode=mode)
106
+ img_path_list = []
107
+ num_img_dirs += 1
108
+ save_img_dir = data_dir / "images" / f"{num_img_dirs}/"
109
+
110
+ # save json data in the specified image_data folder with consecutive labeling of the json files
111
+ if ((k + 1) % dir_size == 0) or (k == len(ds_iterator) - 1):
112
+ os.makedirs(save_data_dir, exist_ok=True)
113
+ save_to_jsons(
114
+ img_data_list, save_data_dir, starting_idx=max(k + 1 - dir_size, 0)
115
+ )
116
+ # empty path and data lists and update save directories for next saving step
117
+ img_data_list = []
118
+ save_data_dir = data_dir / "image_data" / f"{int((k+1)/dir_size)}/"
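The docstring above fixes the contract for ds_iterator. A hedged sketch of a minimal iterator over a hypothetical captions.tsv file ("image_path<TAB>caption" per line; the file name, output path and metadata fields are assumptions, not part of this commit):

from pathlib import Path
from magma.datasets.convert_datasets import convert_dataset

def iter_my_dataset(tsv_path="captions.tsv"):
    # yields (image_path, data) pairs in the expected format:
    # image_path is a Path, data carries a list of captions plus free-form metadata
    with open(tsv_path) as f:
        for line in f:
            img_path, caption = line.rstrip("\n").split("\t")
            yield Path(img_path), {"captions": [caption], "metadata": {"source": "my_dataset"}}

# convert_dataset calls len() on ds_iterator, so pass a sized iterable such as a list
convert_dataset(
    data_dir="/tmp/my_dataset_converted",   # output directory (assumption)
    mode="cp",                              # copy the source images instead of moving them
    ds_iterator=list(iter_my_dataset()),
)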
magma/datasets/dataset.py ADDED
@@ -0,0 +1,160 @@
1
+ import torch
2
+ from torch.utils.data import Dataset
3
+ from PIL import Image
4
+ from PIL.Image import Image as img
5
+ from PIL.Image import DecompressionBombError
6
+ from PIL import UnidentifiedImageError
7
+ import json
8
+ from pathlib import Path
9
+
10
+ from tqdm import tqdm
11
+ from typing import List, Tuple, Generator
12
+ import random
13
+ from multiprocessing import Pool, cpu_count
14
+
15
+ from PIL import Image
16
+ from torch.utils.data import Dataset
17
+ from typing import Tuple
18
+ from torchtyping import TensorType
19
+ import traceback
20
+
21
+
22
+ def read_jsonl(filename: str) -> Generator[List, None, None]:
23
+ """
24
+ Iterator over data from a jsonl file
25
+ """
26
+ with open(filename) as file:
27
+ for line in file:
28
+ yield json.loads(line.rstrip("\n|\r"))
29
+
30
+
31
+ def read_img_captions(filename: str) -> List[Tuple[str, str]]:
32
+ """
33
+ Returns (image_path, image_caption) pairs from cc jsonl files
34
+ """
35
+ img_captions = []
36
+ for item in read_jsonl(filename):
37
+ if "N/A" not in item[-2:]:
38
+ img_captions.append((item[-1], item[-2]))
39
+ return img_captions
40
+
41
+
42
+ def load_json(filename):
43
+ try:
44
+ with open(filename) as f:
45
+ return json.load(f)
46
+ except Exception:
47
+ print(f"ERROR: Error loading json file {filename}")
48
+ traceback.print_exc()
49
+
50
+
51
+ def _read_image_data(data_dir):
52
+ image_data = []
53
+ img_data_dir = data_dir / "image_data"
54
+ paths = _load_paths(data_dir)
55
+ pbar = tqdm(
56
+ paths,
57
+ desc=f"loading dataset from {str(data_dir)}",
58
+ )
59
+ # read data with multiprocessing
60
+ with Pool(cpu_count()) as pool:
61
+ for img_data in pool.imap(load_json, pbar):
62
+ if img_data is not None:
63
+ image_data.append(img_data)
64
+ return image_data
65
+
66
+
67
+ def _load_paths(data_dir, sort=True):
68
+ paths = []
69
+ img_data_dir = data_dir / "image_data"
70
+ for p in tqdm(
71
+ Path(img_data_dir).glob("*/*.json"),
72
+ desc=f"loading dataset paths from {str(data_dir)}",
73
+ ):
74
+ paths.append(p)
75
+ return sorted(paths)
76
+
77
+
78
+ class LazyLoader:
79
+ def __init__(self, data_dir):
80
+ self.paths = _load_paths(data_dir)
81
+
82
+ def __len__(self):
83
+ return len(self.paths)
84
+
85
+ def __getitem__(self, idx):
86
+ data = load_json(self.paths[idx])
87
+ if data is None:
88
+ return self[random.randint(0, len(self) - 1)]
89
+ return data
90
+
91
+
92
+ class ImgCptDataset(Dataset):
93
+ """
94
+ Dataset which loads image caption data from our standard format and transforms them into tensors that can be input to the model.
95
+ Images are expected to be stored in data_dir/images, image data in data_dir/image_data and each data item is a json file with format {"image_path": img_path, "captions": [caption1, caption2,...], "metadata":{...}}
96
+ """
97
+
98
+ def __init__(
99
+ self, data_dir, tokenizer, transforms, seq_len=2048, load_data_in_memory=False
100
+ ):
101
+ self.data_dir = Path(data_dir)
102
+ self.tokenizer = tokenizer
103
+ self.transforms = transforms
104
+ self.seq_len = seq_len
105
+ self.load_data_in_memory = load_data_in_memory
106
+ if self.load_data_in_memory:
107
+ self.data = _read_image_data(self.data_dir)
108
+ else:
109
+ self.data = LazyLoader(self.data_dir)
110
+
111
+ def __len__(self):
112
+ return len(self.data)
113
+
114
+ def __getitem__(
115
+ self, idx
116
+ ) -> Tuple[TensorType["b", "c", "h", "w"], TensorType["b", "s"]]:
117
+ img_data = self.data[idx]
118
+ try:
119
+ try:
120
+ img_path = self.data_dir / img_data["image_path"]
121
+ except KeyError as e:
122
+ # if no image path is found, assume path is same as .json, but .jpg
123
+ if not self.load_data_in_memory:
124
+ p = self.data.paths[idx]
125
+ img_path = (
126
+ self.data_dir
127
+ / "images"
128
+ / Path(p.parent).name
129
+ / Path(p.name).with_suffix(".jpg")
130
+ )
131
+ else:
132
+ raise e
133
+ img = Image.open(img_path)
134
+ img_tensor = self.transforms(img)
135
+ caption = random.choice(img_data["captions"])
136
+ caption_tensor = self.tokenizer.encode(
137
+ caption,
138
+ return_tensors="pt",
139
+ max_length=self.seq_len,
140
+ padding="max_length",
141
+ truncation=True,
142
+ )
143
+ return img_tensor, caption_tensor
144
+ except (
145
+ UnidentifiedImageError,
146
+ OSError,
147
+ DecompressionBombError,
148
+ IndexError,
149
+ ) as e:
150
+ # return random index if image is corrupt
151
+ print(f"Warning: Could not load image {str(img_path)}")
152
+ return self[random.randint(0, len(self) - 1)]
153
+
154
+
155
+ def collate_fn(batch_data: List[Tuple[torch.Tensor, torch.Tensor]], seq_len=2048):
156
+
157
+ all_images, all_captions = list(
158
+ zip(*batch_data)
159
+ ) # [(img1, caption1), (img2, caption2), ... ] -> [(img1, img2, ... ), (caption1, caption2, ... )]
160
+ return torch.cat(all_images), torch.cat([i[:, :seq_len] for i in all_captions])
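A sketch (mine, not in the commit) wiring ImgCptDataset and collate_fn into a PyTorch DataLoader, reusing the tokenizer and transform helpers defined elsewhere in this diff. It assumes the repository's dependencies are installed, that the directory below already holds data in the converted format described above, and that 384 matches the RN50x16 input resolution used by the clip_resnet_large encoder.

from functools import partial
from torch.utils.data import DataLoader

from magma.datasets import ImgCptDataset, collate_fn
from magma.transforms import get_transforms
from magma.utils import get_tokenizer

tokenizer = get_tokenizer("gpt2", sequence_length=2048)
transforms = get_transforms(image_size=384, encoder_name="clip_resnet_large", input_resolution=384)

dataset = ImgCptDataset(
    "/mnt/localdisk/coco_converted",        # assumed path, already in the converted format
    tokenizer=tokenizer,
    transforms=transforms,
    seq_len=2048,
)

loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=partial(collate_fn, seq_len=2048))
images, captions = next(iter(loader))
print(images.shape, captions.shape)         # e.g. (4, 3, 384, 384) and (4, 2048)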
magma/image_encoders.py ADDED
@@ -0,0 +1,91 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from typing import Callable, Union
4
+ from torchtyping import patch_typeguard
5
+ from einops import rearrange
6
+ import timm
7
+ import clip
8
+ from functools import partial
9
+
10
+ # ----------------------------- Utils --------------------------------------
11
+
12
+ clip.model.LayerNorm = (
13
+ nn.LayerNorm
14
+ ) # we need to patch this for clip to work with deepspeed
15
+ patch_typeguard() # needed for torchtyping typechecks to work
16
+
17
+
18
+ class Lambda(torch.nn.Module):
19
+ def __init__(self, fn: Callable):
20
+ super().__init__()
21
+ assert hasattr(fn, "__call__")
22
+ self.fn = fn
23
+
24
+ def forward(self, x):
25
+ return self.fn(x)
26
+
27
+
28
+ # ------------------------- Image encoders ----------------------------------
29
+
30
+
31
+ def nfresnet50(
32
+ device: Union[torch.device, str] = None, pretrained: bool = True
33
+ ) -> nn.Module:
34
+ """
35
+ Loads nfresnet50 model, removing the pooling layer and replacing it with
36
+ an adaptive pooling layer.
37
+ """
38
+ encoder = torch.nn.Sequential(
39
+ *list(timm.create_model("nf_resnet50", pretrained=pretrained).children())[:-1]
40
+ )
41
+ pooling = torch.nn.AdaptiveAvgPool2d((1, 1))
42
+ encoder = torch.nn.Sequential(encoder, pooling)
43
+ if device is not None:
44
+ encoder = encoder.to(device)
45
+ return encoder
46
+
47
+
48
+ def clip_encoder(
49
+ device: Union[torch.device, str] = None, name: str = "clip",
50
+ ) -> nn.Module:
51
+ """
52
+ Loads clip's image encoder module, discarding the lm component.
53
+
54
+ If the variant is a resnet model, we also remove the attention pooling.
55
+ """
56
+ if name in ["clip", "ViT-B/32"]:
57
+ name = "ViT-B/32"
58
+ elif name in ["clip_resnet", "RN50x4"]:
59
+ name = "RN50x4"
60
+ elif name in ["clip_resnet_large", "RN50x16"]:
61
+ name = "RN50x16"
62
+ else:
63
+ raise ValueError(f"encoder {name} not recognized")
64
+
65
+ encoder = clip.load(name, device=device)[0].visual
66
+
67
+ if device is not None:
68
+ encoder = encoder.to(device)
69
+
70
+ if "RN" in name:
71
+ # remove attention pooling
72
+ encoder.attnpool = Lambda(
73
+ partial(rearrange, pattern="b d h w -> b (h w) d")
74
+ ) # remove attn pooling, just use reshaped features
75
+
76
+ return encoder
77
+
78
+
79
+ def get_image_encoder(
80
+ name: str, device: Union[torch.device, str] = None, pretrained: bool = False
81
+ ) -> torch.nn.Module:
82
+ """
83
+ Loads image encoder module
84
+ """
85
+ if name == "nfresnet50":
86
+ encoder = nfresnet50(device=device, pretrained=pretrained)
87
+ elif "clip" in name:
88
+ encoder = clip_encoder(device=device, name=name)
89
+ else:
90
+ raise ValueError(f"image encoder {name} not recognized")
91
+ return encoder
magma/image_input.py ADDED
@@ -0,0 +1,24 @@
1
+ import requests
2
+ from io import BytesIO
3
+ import PIL.Image as PilImage
4
+ from typing import Callable
5
+
6
+ class ImageInput():
7
+ """Wrapper to handle image inputs both from local paths and urls
8
+ Args:
9
+ path_or_url (str): path or link to image.
10
+ """
11
+ def __init__(self, path_or_url):
12
+
13
+ self.path_or_url = path_or_url
14
+ if self.path_or_url.startswith("http://") or self.path_or_url.startswith("https://"):
15
+ try:
16
+ response = requests.get(path_or_url)
17
+ self.pil_image = PilImage.open(BytesIO(response.content))
18
+ except Exception:
19
+ raise Exception(f'Could not retrieve image from url:\n{self.path_or_url}')
20
+ else:
21
+ self.pil_image = PilImage.open(path_or_url)
22
+
23
+ def get_transformed_image(self, transform_fn: Callable): ## to be called internally
24
+ return transform_fn(self.pil_image)
magma/image_prefix.py ADDED
@@ -0,0 +1,109 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torchtyping import TensorType
4
+ from einops import rearrange
5
+ from .image_encoders import get_image_encoder
6
+ from .config import MultimodalConfig
7
+
8
+ # ------------------------- Image prefix ----------------------------------
9
+
10
+ # for models that are fixed to a specific sequence length (i.e. CLIP models with no pooling), the sequence lengths are below
11
+ ENCODER_SEQ_LENS = {
12
+ "clip_resnet": 49,
13
+ "clip_resnet_large": 144,
14
+ }
15
+
16
+ ENCODER_OUT_DIMS = {
17
+ "nfresnet50": 2048,
18
+ "clip": 512,
19
+ "clip_resnet": 2560,
20
+ "clip_resnet_large": 3072,
21
+ }
22
+
23
+
24
+ class ImagePrefix(nn.Module):
25
+
26
+ """
27
+ Takes in a batch of images and returns a batch of embeddings of the
28
+ same dimensions as the LM's word embeddings.
29
+
30
+ :param config: MultimodalConfig object
31
+ :param out_dim: output dimension of the embedding
32
+ :param device: device to run the model on
33
+ """
34
+
35
+ def __init__(
36
+ self,
37
+ config: MultimodalConfig,
38
+ out_dim: int = 2048,
39
+ device=None,
40
+ ):
41
+ super().__init__()
42
+ self.device = device or torch.device(
43
+ "cuda" if torch.cuda.is_available() else "cpu"
44
+ )
45
+ self.config = config
46
+ self.encoder_type = config.encoder_name
47
+
48
+ # get image encoder backbone
49
+ self.enc = get_image_encoder(
50
+ config.encoder_name,
51
+ pretrained=config.pretrained_img_encoder,
52
+ )
53
+ self.encoder_out_dim = ENCODER_OUT_DIMS[
54
+ self.encoder_type
55
+ ] # out dim for image encoder
56
+
57
+ self.out_dim = out_dim # out dim for lm
58
+
59
+ # set the out seq len to that specified in the config, or for some models, the hardcoded value
60
+ self.out_seq_len = (
61
+ config.image_seq_len
62
+ if config.encoder_name not in ENCODER_SEQ_LENS
63
+ else ENCODER_SEQ_LENS[config.encoder_name]
64
+ )
65
+
66
+ # get the output projection
67
+ proj_out_dim = (
68
+ (self.out_dim * self.out_seq_len)
69
+ if self.encoder_type not in ENCODER_SEQ_LENS
70
+ else self.out_dim
71
+ )
72
+ self.proj = nn.Linear(self.encoder_out_dim, proj_out_dim)
73
+ self.dropout = nn.Dropout(config.image_embed_dropout_prob)
74
+ self.use_layernorm = config.use_image_embed_layernorm
75
+ if self.use_layernorm:
76
+ self.ln = nn.LayerNorm(self.out_dim)
77
+
78
+ def forward(
79
+ self, x: TensorType["b", "c", "h", "w"]
80
+ ) -> TensorType["b", "seq", "out_dim"]:
81
+
82
+ # pass through image encoder
83
+ logits = self.enc(x)
84
+
85
+ # remove trailing dimensions of size 1 + pass through linear
86
+ if logits.ndim == 4:
87
+ logits = rearrange(logits, "b d 1 1 -> b d")
88
+ elif logits.ndim == 3:
89
+ assert self.encoder_type in ENCODER_SEQ_LENS
90
+ else:
91
+ assert logits.ndim == 2
92
+
93
+ logits = self.proj(logits)
94
+
95
+ # reshape to desired output shape
96
+ if (
97
+ self.encoder_type not in ENCODER_SEQ_LENS
98
+ ): # don't need to reshape those with fixed seq lens / no pooling
99
+ logits = rearrange(
100
+ logits, "b (s d) -> b s d", d=self.out_dim, s=self.out_seq_len
101
+ )
102
+
103
+ # pass through dropout and layer norm
104
+ logits = self.dropout(logits)
105
+
106
+ if self.use_layernorm:
107
+ logits = self.ln(logits)
108
+
109
+ return logits
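To make the two projection branches above concrete, here is the shape bookkeeping only (plain Python; values copied from ENCODER_OUT_DIMS / ENCODER_SEQ_LENS, and 4096 for the LM hidden size is an assumption consistent with GPT-J as used in this repo):

batch, out_dim = 2, 4096

# fixed-seq-len encoder (clip_resnet_large): each of the 144 feature vectors is projected per token
enc_dim, seq_len = 3072, 144
print((batch, seq_len, enc_dim), "->", (batch, seq_len, out_dim))      # (2, 144, 3072) -> (2, 144, 4096)

# pooled encoder (nfresnet50 with image_seq_len = 2): one 2048-d vector is projected to
# image_seq_len * out_dim and then reshaped
enc_dim, image_seq_len = 2048, 2
print((batch, enc_dim), "->", (batch, image_seq_len * out_dim), "->", (batch, image_seq_len, out_dim))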
magma/language_model.py ADDED
@@ -0,0 +1,45 @@
1
+ import torch
2
+ from transformers import GPTNeoForCausalLM, AutoConfig, GPT2LMHeadModel
3
+ from .utils import print_main
4
+ from pathlib import Path
5
+ from transformers.modeling_utils import no_init_weights
6
+
7
+ LANGUAGE_MODELS = [
8
+ "gptj",
9
+ ]
10
+
11
+
12
+ def gptj_config():
13
+ config = AutoConfig.from_pretrained("EleutherAI/gpt-neo-2.7B")
14
+ config.attention_layers = ["global"] * 28
15
+ config.attention_types = [["global"], 28]
16
+ config.num_layers = 28
17
+ config.num_heads = 16
18
+ config.hidden_size = 256 * config.num_heads
19
+ config.vocab_size = 50400
20
+ config.rotary = True
21
+ config.rotary_dim = 64
22
+ config.jax = True
23
+ config.gradient_checkpointing = True
24
+ return config
25
+
26
+
27
+ def get_gptj(
28
+ gradient_checkpointing: bool = True,
29
+ from_pretrained=False,
30
+ ) -> torch.nn.Module:
31
+ """
32
+ Loads GPTJ language model from HF
33
+ """
34
+ print_main("Loading GPTJ language model...")
35
+ config = gptj_config()
36
+ config.gradient_checkpointing = gradient_checkpointing
37
+ if gradient_checkpointing:
38
+ config.use_cache = False
39
+ config.model_device = "cpu"
40
+ if from_pretrained:
41
+ raise NotImplementedError("GPTJ pretrained not implemented")
42
+ else:
43
+ with no_init_weights():
44
+ model = GPTNeoForCausalLM(config=config)
45
+ return model
magma/magma.py ADDED
@@ -0,0 +1,301 @@
1
+ from pathlib import Path
2
+ from os.path import exists
3
+ import torch
4
+ import torch.nn as nn
5
+ from copy import deepcopy
6
+ from typing import Literal, Optional, List
7
+ from torchtyping import TensorType
8
+ from transformers.file_utils import ModelOutput
9
+ from magma.config import MultimodalConfig
10
+
11
+ from magma.utils import get_tokenizer
12
+ from .language_model import get_gptj
13
+ from .adapters import (
14
+ Adapter,
15
+ ParallelAdapter,
16
+ AdapterWrapper,
17
+ ParallelAdapterWrapper,
18
+ )
19
+ from .image_prefix import ImagePrefix
20
+ from .sampling import generate
21
+ from .utils import build_labels, is_url, print_main, download_checkpoint
22
+ from .image_input import ImageInput
23
+ from .transforms import get_transforms
24
+
25
+ # ------------------------- Magma main class ----------------------------------
26
+
27
+
28
+ class Magma(nn.Module):
29
+ def __init__(self, config, device=None):
30
+ super().__init__()
31
+
32
+ if isinstance(config, (str, Path)):
33
+ config = MultimodalConfig.from_yml(
34
+ config
35
+ ) # load config from yml file if config is a string
36
+ else:
37
+ assert isinstance(config, MultimodalConfig)
38
+
39
+ self.device = device or torch.device(
40
+ "cuda" if torch.cuda.is_available() else "cpu"
41
+ )
42
+ self.config = config
43
+ self.lm = get_gptj().to(self.device)
44
+ self.seq_len = self.lm.config.max_position_embeddings
45
+
46
+ self.tokenizer = get_tokenizer("gpt2", sequence_length=self.seq_len)
47
+
48
+ self.image_token = self.tokenizer.cls_token_id
49
+ self.eos_token = self.tokenizer.eos_token_id
50
+ self.lm.resize_token_embeddings(len(self.tokenizer))
51
+ self.lm.config.pad_token_id = self.tokenizer.eos_token_id
52
+ self.word_embedding = self.lm.transformer.wte.to(device)
53
+ self.transformer = self.lm.transformer.h
54
+
55
+ # adapter settings
56
+ self.mlp_adapter_added, self.attn_adapter_added = False, False
57
+
58
+ self.image_prefix = ImagePrefix(
59
+ config=config,
60
+ out_dim=self.lm.config.hidden_size,
61
+ ).to(self.device)
62
+
63
+ # might change based on the type of image encoder, so get from prefix instead of config
64
+ self.image_prefix_seq_len = self.image_prefix.out_seq_len
65
+
66
+ self.transforms = get_transforms(
67
+ config.image_size,
68
+ config.encoder_name,
69
+ input_resolution=self.image_prefix.enc.input_resolution,
70
+ )
71
+
72
+ # add adapters
73
+ if config.adapter_config:
74
+ mlp_config = deepcopy(config.adapter_config.get("mlp", None))
75
+ if mlp_config:
76
+ assert mlp_config.get("adapter_type") is not None
77
+ self.add_adapters(
78
+ location="mlp",
79
+ adapter_type=mlp_config.pop("adapter_type"),
80
+ downsample_factor=mlp_config.pop("downsample_factor", 4),
81
+ **mlp_config,
82
+ )
83
+ attn_config = deepcopy(config.adapter_config.get("attention", None))
84
+ if attn_config:
85
+ assert attn_config.get("adapter_type") is not None
86
+ self.add_adapters(
87
+ location="attention",
88
+ adapter_type=attn_config.pop("adapter_type"),
89
+ **attn_config,
90
+ )
91
+
92
+ # freeze parameters
93
+ if config.freeze_lm:
94
+ for name, param in self.lm.named_parameters(): # freeze lm weights
95
+ if config.adapter_config and "adapter" in name:
96
+ param.requires_grad = True
97
+
98
+ if config.freeze_img_encoder:
99
+ for param in self.image_prefix.enc.parameters():
100
+ param.requires_grad = False
101
+
102
+ def add_adapters(
103
+ self,
104
+ downsample_factor: int = 4,
105
+ adapter_type: Literal["normal", "parallel", "scaled_parallel"] = "normal",
106
+ location: Literal["mlp", "attention"] = "mlp",
107
+ ff_attr: str = "mlp",
108
+ attn_attr: str = "attn",
109
+ **adapter_kwargs,
110
+ ):
111
+ """
112
+ Adds an adapter layer to `self` at the specified location
113
+ """
114
+ assert adapter_type in [
115
+ "normal",
116
+ "parallel",
117
+ "scaled_parallel",
118
+ ], "adapter_type must be one of 'normal', 'parallel', or 'scaled_parallel'"
119
+ assert location in [
120
+ "mlp",
121
+ "attention",
122
+ ], "location must be one of 'mlp' or 'attention'"
123
+
124
+ for l in range(len(self.transformer)):
125
+ if location == "mlp":
126
+ if self.mlp_adapter_added:
127
+ raise ValueError("Adapter layer already added")
128
+ mlp = getattr(self.transformer[l], ff_attr)
129
+ if adapter_type in ["parallel", "scaled_parallel"]:
130
+ adapter_layer = ParallelAdapter(
131
+ module=mlp,
132
+ dim=self.lm.config.hidden_size,
133
+ downsample_factor=downsample_factor,
134
+ scaled=adapter_type == "scaled_parallel",
135
+ **adapter_kwargs,
136
+ )
137
+ else:
138
+ adpt = Adapter(
139
+ dim=self.lm.config.hidden_size,
140
+ downsample_factor=downsample_factor,
141
+ **adapter_kwargs,
142
+ )
143
+ adapter_layer = nn.Sequential(
144
+ *[
145
+ mlp,
146
+ adpt,
147
+ ]
148
+ )
149
+ setattr(self.transformer[l], ff_attr, adapter_layer)
150
+ else:
151
+ if self.attn_adapter_added:
152
+ raise ValueError("Adapter layer already added")
153
+ attn = getattr(self.transformer[l], attn_attr)
154
+ if adapter_type in ["parallel", "scaled_parallel"]:
155
+ adapter_layer = ParallelAdapterWrapper(
156
+ module=attn,
157
+ dim=self.lm.config.hidden_size,
158
+ downsample_factor=downsample_factor,
159
+ scaled="scaled" in adapter_type,
160
+ **adapter_kwargs,
161
+ )
162
+ else:
163
+ adapter_layer = AdapterWrapper(
164
+ attn_block=attn,
165
+ dim=self.lm.config.hidden_size,
166
+ downsample_factor=downsample_factor,
167
+ **adapter_kwargs,
168
+ )
169
+ setattr(self.transformer[l], attn_attr, adapter_layer)
170
+
171
+ if location == "mlp":
172
+ self.mlp_adapter_added = True
173
+ else:
174
+ self.attn_adapter_added = True
175
+
176
+ def preprocess_inputs(self, input_list: list, embed = True) -> List[torch.Tensor]:
177
+ """
178
+ Expects a list of strings and instances of ImageInput
179
+ Converts them into a list of tensors and then optionally runs self.embed over it
180
+ """
181
+ for i in range(len(input_list)):
182
+ inp = input_list[i]
183
+ if isinstance(inp, str):
184
+ input_list[i] = self.tokenizer.encode(inp, return_tensors="pt")
185
+ elif isinstance(inp, ImageInput):
186
+ input_list[i] = inp.get_transformed_image(transform_fn = self.transforms)
187
+ else:
188
+ raise Exception(f'Invalid input type:{type(inp)}')
189
+
190
+ if embed:
191
+ return self.embed(input_list)
192
+ else:
193
+ return input_list
194
+
195
+ def embed(self, inputs: List[torch.Tensor]) -> TensorType["b", "s", "d"]:
196
+ """
197
+ Embeds a list of tensors in the correct format to input into the LM (b, s, d).
198
+ For each tensor, if it's 2d assume it's text and use word embedding,
199
+ if it's 4d, assume it's an image, and use image_prefix to embed.
200
+ """
201
+ emb_list = []
202
+ for x in inputs:
203
+ if x.ndim == 2:
204
+ x = x.to(self.device)
205
+ emb_list.append(self.word_embedding(x))
206
+ elif x.ndim == 4:
207
+ x = x.to(self.device).half()
208
+ image_embeddings = self.image_prefix(x)
209
+ emb_list.append(image_embeddings)
210
+ else:
211
+ raise ValueError(f"Expected 2d or 4d tensor, got {x.ndim}d")
212
+ return torch.cat(emb_list, dim=1)
213
+
214
+ @torch.no_grad()
215
+ def generate(
216
+ self,
217
+ embeddings: TensorType["b", "s", "d"],
218
+ max_steps: int = 100,
219
+ temperature: float = 0.7,
220
+ top_k: int = 0,
221
+ top_p: float = 0.9,
222
+ decode: bool = True,
223
+ ):
224
+ """
225
+ Generates captions for a batch of embeddings.
226
+ """
227
+
228
+ return generate(
229
+ self,
230
+ embeddings=embeddings,
231
+ max_steps=max_steps,
232
+ temperature=temperature,
233
+ top_k=top_k,
234
+ top_p=top_p,
235
+ decode=decode,
236
+ )
237
+
238
+ def forward(
239
+ self,
240
+ images: TensorType["b", "c", "h", "w"] = None,
241
+ captions: Optional[TensorType["b", "seq"]] = None,
242
+ output_hidden_states: bool = False,
243
+ input_embeddings: TensorType["b", "s", "d"] = None,
244
+ ) -> ModelOutput:
245
+ assert captions is not None, "Must provide captions in training"
246
+ assert any([i is not None for i in [images, input_embeddings]]) and not all(
247
+ [i is not None for i in [images, input_embeddings]]
248
+ ), "Pass in either images, or input embeddings, not both."
249
+ assert (
250
+ captions.shape[1] == self.seq_len
251
+ ), f"in training, captions should be padded to sequence length ({self.seq_len}), but are length {captions.shape[1]}"
252
+
253
+ if input_embeddings is None:
254
+ input_embeddings = self.image_prefix(images)
255
+ labels = build_labels(
256
+ input_embeddings, captions, self.eos_token, self.device
257
+ ) # build labels from input_embeddings
258
+ word_embeddings = self.word_embedding(captions)
259
+
260
+ # join together
261
+ input_embeddings = torch.cat(
262
+ (
263
+ input_embeddings,
264
+ word_embeddings[:, : -input_embeddings.shape[1], :],
265
+ ), # remove padding in the word embedding before concatenating
266
+ dim=1,
267
+ )
268
+
269
+ # forward joined embeddings through lm
270
+ lm_outputs = self.lm(
271
+ inputs_embeds=input_embeddings,
272
+ labels=labels,
273
+ output_hidden_states=output_hidden_states,
274
+ )
275
+
276
+ return lm_outputs
277
+
278
+ @classmethod
279
+ def from_checkpoint(cls, config_path, checkpoint_path, device = 'cpu'):
280
+ """
281
+ Loads a model checkpoint from disk / downloads it from a url if not present
282
+ """
283
+
284
+ checkpoint_url = 'https://drive.google.com/u/0/uc?id=1EiAY3IcKWmGADaLDzdG25ykQghUwza6L&export=download'
285
+
286
+ if not exists(checkpoint_path):
287
+ print_main(f'checkpoint: {checkpoint_path} does not exist, downloading model')
288
+ download_checkpoint(checkpoint_url = checkpoint_url, save_as = checkpoint_path)
289
+
290
+ model = cls(config = config_path)
291
+
292
+ sd = torch.load(checkpoint_path, map_location=torch.device("cpu"))
293
+ if "module" in sd.keys():
294
+ sd = sd["module"]
295
+
296
+ print_main('loading checkpoint magma')
297
+ model.load_state_dict(sd, strict=False)
298
+ print_main("magma model successfully loaded")
299
+
300
+ model.half().to(device)
301
+ return model
magma/sampling.py ADDED
@@ -0,0 +1,121 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torchtyping import TensorType
4
+ from typing import Union, List
5
+
6
+
7
+ def top_p_filter(logits: TensorType[..., "vocab"], threshold: float = 0.9):
8
+ """
9
+ Nucleus sampling
10
+ """
11
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
12
+ cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
13
+
14
+ sorted_indices_to_remove = cum_probs > (1 - threshold)
15
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
16
+ sorted_indices_to_remove[..., 0] = 0
17
+
18
+ sorted_logits[sorted_indices_to_remove] = float("-inf")
19
+ return sorted_logits.scatter(1, sorted_indices, sorted_logits)
20
+
21
+
22
+ def top_k_filter(logits, k):
23
+ """
24
+ Top K sampling
25
+ """
26
+ assert k > 0
27
+ val, ind = torch.topk(logits, k)
28
+ probs = torch.full_like(logits, float("-inf"))
29
+ probs.scatter_(1, ind, val)
30
+ return probs
31
+
32
+
33
+ def remove_tokens_after_eos(tensor, eos_token, image_token):
34
+ # any tokens produced after an end-of-sequence token are also set to the eos token, then removed
35
+ eos_index = (tensor == eos_token).nonzero()
36
+ if eos_index.any():
37
+ tensor[eos_index[0] :] = eos_token
38
+
39
+ tensor = tensor.tolist()
40
+ return [i for i in tensor if (not i == image_token) and (not i == eos_token)]
41
+
42
+
43
+ @torch.no_grad()
44
+ def generate(
45
+ model: "Magma",
46
+ embeddings: TensorType["b", "s", "d"],
47
+ max_steps: int = 100,
48
+ temperature: float = 0.7,
49
+ top_k: int = 0,
50
+ top_p: float = 0.9,
51
+ eos_token: int = None,
52
+ decode: bool = True,
53
+ ) -> Union[List[str], TensorType["b", "s"]]:
54
+ """
55
+ Generates captions for a batch of embeddings.
56
+
57
+ :param model: The model to use for generation.
58
+ :param embeddings: The embeddings to generate captions for.
59
+ :param max_steps: The maximum number of steps to generate captions for.
60
+ :param temperature: The temperature to use for sampling.
61
+ :param top_k: value for top k sampling. If 0, no sampling will be used.
62
+ :param top_p: value for top p sampling. If 0, no sampling will be used.
63
+ :param eos_token: The token to use for end of sequence.
64
+ :param decode: Whether to decode the output into text, or return the raw tokens.
65
+ """
66
+
67
+ # init values
68
+ eos_token = eos_token or model.eos_token
69
+ was_training = model.training
70
+ model.eval()
71
+ b, s, _ = embeddings.shape
72
+ past_key_values = None
73
+
74
+ # init output with image tokens
75
+ out = torch.zeros((b, s), dtype=torch.long).to(model.device) + model.image_token
76
+
77
+ # do sampling
78
+ for i in range(max_steps):
79
+ if i == 0:
80
+ # initial input
81
+ outputs = model.lm(
82
+ inputs_embeds=embeddings,
83
+ use_cache=True,
84
+ past_key_values=past_key_values,
85
+ )
86
+ else:
87
+ # now caching past k/v so we can use only the last token
88
+ outputs = model.lm(
89
+ input_ids=out[:, -1:], use_cache=True, past_key_values=past_key_values
90
+ )
91
+
92
+ logits = outputs.logits[:, -1, :].float()
93
+ past_key_values = outputs.past_key_values
94
+
95
+ # filter / temperature sample
96
+ if temperature == 0.0:
97
+ next_token = torch.argmax(logits, dim=-1)
98
+ else:
99
+ if top_k > 0:
100
+ logits = top_k_filter(logits, k=top_k)
101
+ if top_p > 0:
102
+ logits = top_p_filter(logits, threshold=top_p)
103
+
104
+ probs = F.softmax(logits / temperature, dim=-1)
105
+ next_token = torch.multinomial(probs, num_samples=1)
106
+
107
+ out = torch.cat((out, next_token), dim=-1)
108
+
109
+ if eos_token is not None and (next_token == eos_token).all():
110
+ break
111
+
112
+ if decode:
113
+ captions = []
114
+ for b in out:
115
+ b = remove_tokens_after_eos(b, eos_token, model.image_token)
116
+ caption = model.tokenizer.decode(b)
117
+ captions.append(caption)
118
+ out = captions
119
+
120
+ model.train(was_training)
121
+ return out
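The two filters above can be exercised in isolation on random logits; a sketch (not part of the commit), assuming torch and torchtyping are installed:

import torch
import torch.nn.functional as F
from magma.sampling import top_k_filter, top_p_filter

torch.manual_seed(0)
logits = torch.randn(1, 50400)                    # (batch, vocab), GPT-J-sized vocabulary

top_k = top_k_filter(logits, k=10)
print((top_k > float("-inf")).sum().item())       # 10 -- everything outside the top k is masked to -inf

filtered = top_p_filter(logits, threshold=0.9)
probs = F.softmax(filtered / 0.7, dim=-1)         # temperature-scaled softmax, as in generate()
print(torch.multinomial(probs, num_samples=1).shape)   # torch.Size([1, 1])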
magma/train_loop.py ADDED
@@ -0,0 +1,98 @@
1
+ import torch
2
+ from tqdm import tqdm
3
+ from .utils import reduce_losses, to_cuda_half
4
+ from torchvision.utils import make_grid
5
+
6
+
7
+ def train_step(config, train_loader, model_engine):
8
+ losses = []
9
+
10
+ for _ in range(config.gradient_accumulation_steps):
11
+ images, captions = next(train_loader)
12
+ images, captions = images.half().cuda(), captions.cuda()
13
+ if config.run_blind:
14
+ images = torch.zeros_like(images)
15
+ outputs = model_engine(images, captions)
16
+ loss = outputs.loss
17
+ losses.append(loss)
18
+ model_engine.backward(loss)
19
+ model_engine.step()
20
+
21
+ return reduce_losses(torch.mean(torch.stack(losses))).item()
22
+
23
+
24
+ def train_step_classification(config, train_loader, model_engine, return_accuracy=True):
25
+ losses = []
26
+ if return_accuracy:
27
+ accuracies = []
28
+ for _ in range(config.gradient_accumulation_steps):
29
+ images, captions, class_labels = next(train_loader)
30
+ images, captions, class_labels = to_cuda_half(images, captions, class_labels)
31
+ if config.run_blind:
32
+ images = torch.zeros_like(images)
33
+ loss, logits = model_engine(images, captions, class_labels)
34
+ losses.append(loss)
35
+ if return_accuracy:
36
+ argmax_pred = logits.argmax(dim=-1)
37
+ accuracies.append((argmax_pred == class_labels).float().mean())
38
+ model_engine.backward(loss)
39
+ model_engine.step()
40
+
41
+ loss_reduced = reduce_losses(torch.mean(torch.stack(losses))).item()
42
+ if return_accuracy:
43
+ accuracy_reduced = reduce_losses(torch.mean(torch.stack(accuracies))).item()
44
+ return loss_reduced, accuracy_reduced
45
+ return loss_reduced
46
+
47
+
48
+ def eval_step(config, eval_loader, model_engine):
49
+ losses = []
50
+
51
+ for i in tqdm(range(config.eval_steps), "evaluating..."):
52
+ images, captions = next(eval_loader)
53
+ images, captions = images.half().cuda(), captions.cuda()
54
+ if config.run_blind:
55
+ images = torch.zeros_like(images)
56
+ outputs = model_engine(images, captions)
57
+ loss = outputs.loss
58
+ losses.append(loss)
59
+
60
+ return reduce_losses(torch.mean(torch.stack(losses))).item()
61
+
62
+
63
+ def eval_step_classification(config, train_loader, model_engine, return_accuracy=True):
64
+ losses = []
65
+ if return_accuracy:
66
+ accuracies = []
67
+ for _ in range(config.gradient_accumulation_steps):
68
+ images, captions, class_labels = next(train_loader)
69
+ images, captions, class_labels = to_cuda_half(images, captions, class_labels)
70
+ if config.run_blind:
71
+ images = torch.zeros_like(images)
72
+ loss, logits = model_engine(images, captions, class_labels)
73
+ losses.append(loss)
74
+ if return_accuracy:
75
+ argmax_pred = logits.argmax(dim=-1)
76
+ accuracies.append((argmax_pred == class_labels).float().mean())
77
+
78
+ loss_reduced = reduce_losses(torch.mean(torch.stack(losses))).item()
79
+ if return_accuracy:
80
+ accuracy_reduced = reduce_losses(torch.mean(torch.stack(accuracies))).item()
81
+ return loss_reduced, accuracy_reduced
82
+ return loss_reduced
83
+
84
+
85
+ def inference_step(config, eval_loader, model_engine):
86
+ images, _ = next(eval_loader)
87
+ images = images.half().cuda()
88
+ if config.run_blind:
89
+ images = torch.zeros_like(images)
90
+ captions = model_engine(
91
+ images, captions=None, inference=True
92
+ ) # [caption1, caption2, ... b]
93
+ width = min(2, images.shape[0])
94
+ image_grid = make_grid(images[:width])
95
+ caption = ""
96
+ for i in range(width):
97
+ caption += f"Caption {i}: \n{captions[i]}\n"
98
+ return image_grid, caption
magma/transforms.py ADDED
@@ -0,0 +1,134 @@
1
+ from torchvision import transforms as T
2
+ import torch.nn.functional as F
3
+ from PIL import ImageOps
4
+ import PIL
5
+ import random
6
+
7
+
8
+ def pad_to_size(x, size=256):
9
+ delta_w = size - x.size[0]
10
+ delta_h = size - x.size[1]
11
+ padding = (
12
+ delta_w // 2,
13
+ delta_h // 2,
14
+ delta_w - (delta_w // 2),
15
+ delta_h - (delta_h // 2),
16
+ )
17
+ new_im = ImageOps.expand(x, padding)
18
+ return new_im
19
+
20
+
21
+ def pad_to_size_tensor(x, size=256):
22
+ offset_dim_1 = size - x.shape[1]
23
+ offset_dim_2 = size - x.shape[2]
24
+
25
+ padding_dim_1 = max(offset_dim_1 // 2, 0)
26
+ padding_dim_2 = max(offset_dim_2 // 2, 0)
27
+
28
+ if offset_dim_1 % 2 == 0:
29
+ pad_tuple_1 = (padding_dim_1, padding_dim_1)
30
+ else:
31
+ pad_tuple_1 = (padding_dim_1 + 1, padding_dim_1)
32
+
33
+ if offset_dim_2 % 2 == 0:
34
+ pad_tuple_2 = (padding_dim_2, padding_dim_2)
35
+ else:
36
+ pad_tuple_2 = (padding_dim_2 + 1, padding_dim_2)
37
+
38
+ padded = F.pad(x, pad=(*pad_tuple_2, *pad_tuple_1, 0, 0))
39
+ return padded
40
+
41
+
42
+ class RandCropResize(object):
43
+
44
+ """
45
+ Randomly crops, then randomly resizes, then randomly crops again, an image. Mirroring the augmentations from https://arxiv.org/abs/2102.12092
46
+ """
47
+
48
+ def __init__(self, target_size):
49
+ self.target_size = target_size
50
+
51
+ def __call__(self, img):
52
+ img = pad_to_size(img, self.target_size)
53
+ d_min = min(img.size)
54
+ img = T.RandomCrop(size=d_min)(img)
55
+ t_min = min(d_min, round(9 / 8 * self.target_size))
56
+ t_max = min(d_min, round(12 / 8 * self.target_size))
57
+ t = random.randint(t_min, t_max + 1)
58
+ img = T.Resize(t)(img)
59
+ if min(img.size) < 256:
60
+ img = T.Resize(256)(img)
61
+ return T.RandomCrop(size=self.target_size)(img)
62
+
63
+
64
+ def get_transforms(
65
+ image_size, encoder_name, input_resolution=None, use_extra_transforms=False
66
+ ):
67
+ if "clip" in encoder_name:
68
+ assert input_resolution is not None
69
+ return clip_preprocess(input_resolution)
70
+
71
+ base_transforms = [
72
+ T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
73
+ RandCropResize(image_size),
74
+ T.RandomHorizontalFlip(p=0.5),
75
+ ]
76
+ if use_extra_transforms:
77
+ extra_transforms = [T.ColorJitter(0.1, 0.1, 0.1, 0.05)]
78
+ base_transforms += extra_transforms
79
+ base_transforms += [
80
+ T.ToTensor(),
81
+ maybe_add_batch_dim,
82
+ ]
83
+ base_transforms = T.Compose(base_transforms)
84
+ return base_transforms
85
+
86
+
87
+ def maybe_add_batch_dim(t):
88
+ if t.ndim == 3:
89
+ return t.unsqueeze(0)
90
+ else:
91
+ return t
92
+
93
+
94
+ def pad_img(desired_size):
95
+ def fn(im):
96
+ old_size = im.size # old_size[0] is in (width, height) format
97
+
98
+ ratio = float(desired_size) / max(old_size)
99
+ new_size = tuple([int(x * ratio) for x in old_size])
100
+
101
+ im = im.resize(new_size, PIL.Image.ANTIALIAS)
102
+ # create a new image and paste the resized on it
103
+
104
+ new_im = PIL.Image.new("RGB", (desired_size, desired_size))
105
+ new_im.paste(
106
+ im, ((desired_size - new_size[0]) // 2, (desired_size - new_size[1]) // 2)
107
+ )
108
+
109
+ return new_im
110
+
111
+ return fn
112
+
113
+
114
+ def crop_or_pad(n_px, pad=False):
115
+ if pad:
116
+ return pad_img(n_px)
117
+ else:
118
+ return T.CenterCrop(n_px)
119
+
120
+
121
+ def clip_preprocess(n_px, use_pad=False):
122
+ return T.Compose(
123
+ [
124
+ T.Resize(n_px, interpolation=T.InterpolationMode.BICUBIC),
125
+ crop_or_pad(n_px, pad=use_pad),
126
+ lambda image: image.convert("RGB"),
127
+ T.ToTensor(),
128
+ maybe_add_batch_dim,
129
+ T.Normalize(
130
+ (0.48145466, 0.4578275, 0.40821073),
131
+ (0.26862954, 0.26130258, 0.27577711),
132
+ ),
133
+ ]
134
+ )
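For orientation, here is a minimal usage sketch of the CLIP preprocessing defined above (not part of the commit). The 384-pixel resolution is an assumption that mirrors the dummy input used in test.py further down, and "example.jpg" is a placeholder path:

    # Illustrative only: assumed 384-px resolution and placeholder image path.
    from PIL import Image
    from magma.transforms import clip_preprocess

    transform = clip_preprocess(384)   # Resize, center crop, RGB convert, ToTensor, add batch dim, Normalize
    img = Image.open("example.jpg")
    pixel_values = transform(img)      # maybe_add_batch_dim gives shape (1, 3, 384, 384)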
magma/utils.py ADDED
@@ -0,0 +1,372 @@
1
+ import argparse
2
+ import torch.distributed as dist
3
+ from transformers import GPT2TokenizerFast
4
+ import deepspeed
5
+ from pathlib import Path
6
+ import wandb
7
+ import os
8
+ import yaml
9
+ import torch
10
+ from collections import defaultdict
11
+ from torchtyping import TensorType
12
+ import gdown
13
+
14
+
15
+ def is_main():
16
+ if dist.is_initialized():
17
+ return dist.get_rank() == 0
18
+ return True
19
+
20
+
21
+ def print_main(*msg):
22
+ if is_main():
23
+ print(*msg)
24
+
25
+
26
+ def reduce_losses(losses):
27
+ """Reduce a tensor of losses across all GPUs."""
28
+ if dist.is_initialized():
29
+ losses = losses.detach().clone()
30
+ # We use `all_reduce` because it is better supported than `reduce`
31
+ dist.all_reduce(losses, dist.ReduceOp.SUM)
32
+ return losses / dist.get_world_size()
33
+ else:
34
+ return losses
35
+
36
+
37
+ def cycle(loader):
38
+ while True:
39
+ for data in loader:
40
+ yield data
41
+
42
+
43
+ def get_tokenizer(name="gpt2", sequence_length=2048):
44
+ """
45
+ Gets tokenizer for LM
46
+ """
47
+ if name == "gpt2":
48
+ tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
49
+ tokenizer.pad_token_id = tokenizer.eos_token_id
50
+ tokenizer.padding_side = "right"
51
+ tokenizer.model_max_length = sequence_length
52
+ # setup lm settings
53
+ tokenizer.add_special_tokens(
54
+ {"cls_token": "<|image|>"}
55
+ ) # add special image token to tokenizer
56
+ else:
57
+ raise ValueError(f"Tokenizer {name} not recognized")
58
+ return tokenizer
59
+
60
+
61
+ def parse_args():
62
+ parser = argparse.ArgumentParser()
63
+ parser.add_argument(
64
+ "--config", type=str, required=False, help="path to your training config"
65
+ )
66
+ parser.add_argument(
67
+ "--local_rank",
68
+ type=int,
69
+ default=-1,
70
+ help="local rank passed from distributed launcher",
71
+ )
72
+ deepspeed.add_config_arguments(parser)
73
+
74
+ args = parser.parse_args()
75
+ args.deepspeed = True
76
+ return args
77
+
78
+
79
+ def wandb_log(*args, **kwargs):
80
+ if is_main():
81
+ wandb.log(*args, **kwargs)
82
+
83
+
84
+ def wandb_init(*args, **kwargs):
85
+ if is_main():
86
+ wandb.init(*args, **kwargs)
87
+
88
+
89
+ def save_model(model_engine, save_dir, global_step, config=None):
90
+ os.makedirs(save_dir, exist_ok=True)
91
+ if config is not None:
92
+ config = config.to_dict()
93
+ with open(str(Path(save_dir) / "config.yml"), "w") as f:
94
+ yaml.dump(config, f, default_flow_style=False)
95
+ sd = {"global_step": global_step, "config": config}
96
+ model_engine.save_checkpoint(save_dir, client_state=sd)
97
+
98
+
99
+ def load_model(
100
+ model_engine, load_dir, load_optimizer_states=True, load_lr_scheduler_states=True
101
+ ):
102
+ """
103
+ Loads a model from disk and returns the global step to resume from if loading was successful; otherwise returns 0.
104
+ """
105
+ try:
106
+ load_path, sd = model_engine.load_checkpoint(
107
+ load_dir,
108
+ load_optimizer_states=load_optimizer_states,
109
+ load_lr_scheduler_states=load_lr_scheduler_states,
110
+ )
111
+ except AssertionError as e:
112
+ load_path = None
113
+ print(e)
114
+ if load_path is None:
115
+ print("Model loading failed - starting from global step 0")
116
+ return 0
117
+ return sd["global_step"]
118
+
119
+
120
+ def get_params_for_weight_decay_optimization(module, config):
121
+ """
122
+ Divide params into with-weight-decay and without-weight-decay groups.
123
+ Layernorms and biases will have no weight decay but the rest will.
124
+ """
125
+ weight_decay_params = {"params": []}
126
+ no_weight_decay_params = {"params": [], "weight_decay": 0.0}
127
+ blacklist_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
128
+
129
+ for module_ in module.modules():
130
+ if isinstance(module_, blacklist_modules) or (
131
+ config.weight_decay == 0.0
132
+ ): # also include all parameters here if no weight decay is being done
133
+ no_weight_decay_params["params"].extend(
134
+ [
135
+ p
136
+ for p in list(module_._parameters.values())
137
+ if (p is not None) and p.requires_grad
138
+ ]
139
+ )
140
+ else:
141
+ for n, p in list(module_._parameters.items()):
142
+ if p is not None and p.requires_grad:
143
+ if n != "bias":
144
+ weight_decay_params["params"].append(p)
145
+ else:
146
+ no_weight_decay_params["params"].append(p)
147
+
148
+ param_dict = {
149
+ pn: p
150
+ for pn, p in module.named_parameters()
151
+ if p is not None and p.requires_grad
152
+ }
153
+ assert len(no_weight_decay_params["params"]) + len(
154
+ weight_decay_params["params"]
155
+ ) == len(
156
+ param_dict.keys()
157
+ ), "Number of params in both groups != total number of trainable params"
158
+ if config.weight_decay == 0.0:
159
+ # only return a single param group if no weight decay is being used anyway
160
+ return [no_weight_decay_params]
161
+ return [weight_decay_params, no_weight_decay_params]
162
+
163
+
164
+ def configure_param_groups(model, config):
165
+ """
166
+ Configures the different parameter groups in the model for training.
167
+ If a separate learning rate for the image prefix is provided, we separate out the groups here.
168
+ Additionally, parameters to which weight decay shouldn't be applied (layernorms / biases) are separated.
169
+ """
170
+ if config.image_enc_lr is not None:
171
+
172
+ # get the params for the image prefix / proj
173
+ image_enc_params = get_params_for_weight_decay_optimization(
174
+ model.image_prefix.enc, config
175
+ )
176
+ for pdict in image_enc_params:
177
+ pdict["lr"] = config.image_enc_lr
178
+ image_proj_params = get_params_for_weight_decay_optimization(
179
+ model.image_prefix.proj, config
180
+ )
181
+
182
+ # get the params for layernorm if it exists
183
+ if config.use_image_embed_layernorm:
184
+ image_ln_params = get_params_for_weight_decay_optimization(
185
+ model.image_prefix.ln, config
186
+ )
187
+ image_proj_params += image_ln_params
188
+
189
+ # get the params for the lm
190
+ lm_params = get_params_for_weight_decay_optimization(model.lm, config)
191
+
192
+ # get params for class head if it exists
193
+ class_params = []
194
+ if hasattr(model, "class_head") and model.class_head is not None:
195
+ class_params = get_params_for_weight_decay_optimization(
196
+ model.class_head, config
197
+ )
198
+
199
+ all_params = []
200
+ for p in image_enc_params + lm_params + image_proj_params + class_params:
201
+ if p["params"]:
202
+ all_params.append(p)
203
+ else:
204
+ all_params = get_params_for_weight_decay_optimization(model, config)
205
+
206
+ # merge param dicts with shared lr / wd values
207
+ d = defaultdict(dict)
208
+ for param_group in all_params:
209
+ lr = param_group.get("lr", None)
210
+ wd = param_group.get("weight_decay", None)
211
+ key = f"lr_{lr}_wd_{wd}"
212
+ if d[key].get("params") is None:
213
+ d[key]["params"] = []
214
+ d[key]["params"].extend(param_group["params"])
215
+ if lr is not None:
216
+ d[key]["lr"] = lr
217
+ if wd is not None:
218
+ d[key]["weight_decay"] = wd
219
+ all_params = list(d.values())
220
+
221
+ n_params = sum([len(d["params"]) for d in all_params])
222
+ param_dict = {
223
+ pn: p for pn, p in model.named_parameters() if p is not None and p.requires_grad
224
+ }
225
+ assert n_params == len(
226
+ param_dict
227
+ ), f"Some parameters are missing from param groups ({n_params} | {len(param_dict)})"
228
+
229
+ # if we're using multiple param groups, set the min / max lr for each one
230
+ # appropriately in deepspeed's scheduler
231
+ config.deepspeed_config_params["scheduler"]["params"]["warmup_min_lr"] = [
232
+ config.min_lr for _ in all_params
233
+ ]
234
+ config.deepspeed_config_params["scheduler"]["params"]["warmup_max_lr"] = [
235
+ d.get("lr", config.lr) for d in all_params
236
+ ]
237
+
238
+ return all_params
239
+
240
+
241
+ def count_parameters(model):
242
+ """
243
+ Counts the number of trainable parameters in a model
244
+ """
245
+ return sum(p.numel() for p in model.parameters() if p.requires_grad)
246
+
247
+
248
+ def log_table(name, model_outputs, gt_answers_list, global_step):
249
+ results_table = wandb.Table(columns=["model output", "ground truth(s)"])
250
+ for o, gt in zip(model_outputs, gt_answers_list):
251
+ results_table.add_data(o, gt)
252
+ wandb_log({f"eval/{name}": results_table}, step=global_step)
253
+
254
+
255
+ def get_world_info():
256
+ local_rank = int(os.environ["LOCAL_RANK"])
257
+ rank = int(os.environ["RANK"])
258
+ world_size = int(os.environ["WORLD_SIZE"])
259
+ return local_rank, rank, world_size
260
+
261
+
262
+ def init_distributed(backend="nccl"):
263
+ if not torch.distributed.is_initialized():
264
+ deepspeed.init_distributed(
265
+ dist_backend=backend, verbose=True, auto_mpi_discovery=True
266
+ )
267
+ local_rank, rank, world_size = get_world_info()
268
+ torch.cuda.set_device(local_rank)
269
+ return local_rank, rank, world_size
270
+
271
+
272
+ def collate_fn_classification(batch_data, seq_len=2048):
273
+
274
+ # for nlvr2: list(zip(*batch_data)) = [l_images, r_images, captions, class_labels]
275
+ image_list = list(zip(*batch_data))[:-2]
276
+ captions, class_labels = list(zip(*batch_data))[-2:]
277
+
278
+ # images, captions, class_labels = list(zip(*batch_data))
279
+ images_list = [torch.cat(image) for image in image_list]
280
+ captions = torch.cat([i[:, :seq_len] for i in captions])
281
+ class_labels = torch.stack(class_labels)
282
+ return images_list, captions, class_labels
283
+
284
+
285
+ def infer_checkpoint_path_from_config(config):
286
+ checkpoint_folder = config.save
287
+ if checkpoint_folder is None:
288
+ raise ValueError(
289
+ "No checkpoint folder specified in config. Please provide a checkpoint."
290
+ )
291
+
292
+ # check for 'latest' tag in checkpoint folder
293
+ if (Path(checkpoint_folder) / "latest").exists():
294
+ latest_ckpt = (Path(checkpoint_folder) / "latest").read_text().strip()
295
+ else:
296
+ raise ValueError(
297
+ f"No checkpoint found in {checkpoint_folder}. Please provide a checkpoint."
298
+ )
299
+
300
+ checkpoint_path = str(
301
+ Path(checkpoint_folder) / latest_ckpt / "mp_rank_00_model_states.pt"
302
+ )
303
+ if not Path(checkpoint_path).exists():
304
+ raise ValueError(
305
+ f"No checkpoint found in {checkpoint_path}. Please provide a checkpoint."
306
+ )
307
+
308
+ return checkpoint_path
309
+
310
+
311
+ # [tensor_1, tensor_2], tensor_3, tensor_4 = to_cuda_half([tensor_1, tensor_2], tensor_3, tensor_4)
312
+ # probably not working yet
313
+ def to_cuda_half(*args):
314
+ cuda_half_args = []
315
+ for x in args:
316
+ if isinstance(x, list):
317
+ x_cuda_half = to_cuda_half(*x)
318
+ cuda_half_args.append(x_cuda_half)
319
+ elif isinstance(x, tuple):
320
+ x_cuda_half = to_cuda_half(*x)
321
+ cuda_half_args.append(x_cuda_half)
322
+ else:
323
+ if x.dtype in [torch.float32, torch.float16]:
324
+ cuda_half_args.append(x.cuda().half())
325
+ elif x.dtype == torch.long:
326
+ cuda_half_args.append(x.cuda())
327
+
328
+ if len(cuda_half_args) == 1:
329
+ return cuda_half_args[0]
330
+ else:
331
+ return cuda_half_args
332
+
333
+
334
+ def build_labels(
335
+ input_embeddings: TensorType["b", "s", "d"],
336
+ captions: TensorType["b", "s"],
337
+ eos_token,
338
+ device,
339
+ ) -> TensorType["b", "s"]:
340
+ """
341
+ Builds labels from input embeddings.
342
+
343
+ Masks out the labels with -100 in positions up to the seq length of the embeddings, so loss is only computed for captions,
344
+ and not for image tokens.
345
+ Additionally, masks out everything *after* the first eos token.
346
+ """
347
+ shape = input_embeddings.shape[:2] # b, s
348
+
349
+ assert captions.shape[1] >= shape[1]
350
+
351
+ # make sure to add masked embedding tokens in the appropriate locations in the labels
352
+ embedding_tokens = torch.zeros(shape, dtype=torch.int64).to(device) - 100
353
+ labels = torch.cat(
354
+ (embedding_tokens, captions[:, : -shape[1]]), dim=1
355
+ ) # we truncate the sequence length of the captions, as they are always padded to the full sequence length
356
+
357
+ # mask out repeating eos tokens
358
+ for label in labels:
359
+ for k, token in enumerate(label):
360
+ if token == eos_token:
361
+ label[k + 1 :] = -100
362
+ break
363
+
364
+ return labels
365
+
366
+
367
+ def is_url(string):
368
+ return string.startswith("http://") or string.startswith("https://")
369
+
370
+ def download_checkpoint(checkpoint_url, save_as):
371
+
372
+ gdown.download(url = checkpoint_url, output = save_as, quiet=False)
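As a quick illustration of the weight-decay grouping above, the sketch below (not part of the commit; the toy module and the SimpleNamespace config stand-in are assumptions) shows how get_params_for_weight_decay_optimization splits trainable parameters into a decayed and a non-decayed group:

    # Illustrative only: toy module and minimal config stand-in.
    import torch
    from types import SimpleNamespace
    from magma.utils import get_params_for_weight_decay_optimization

    toy = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.LayerNorm(4))
    cfg = SimpleNamespace(weight_decay=0.01)

    decay, no_decay = get_params_for_weight_decay_optimization(toy, cfg)
    # decay["params"]    -> the Linear weight only
    # no_decay["params"] -> the Linear bias plus both LayerNorm parameters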
requirements.txt ADDED
@@ -0,0 +1,9 @@
1
+ torchtyping
2
+ typeguard
3
+ git+https://github.com/finetuneanon/transformers.git#egg=transformers
4
+ gdown
5
+ tqdm
6
+ timm
7
+ git+https://github.com/openai/CLIP.git
8
+ deepspeed
9
+ wandb
test.py ADDED
@@ -0,0 +1,43 @@
1
+ import torch
2
+ import numpy as np
3
+ from magma import Magma
4
+ from magma.language_model import get_language_model
5
+ from magma.utils import get_tokenizer
6
+
7
+ if __name__ == "__main__":
8
+ # model = Magma.from_checkpoint(
9
+ # "configs/MAGMA_v1.yml",
10
+ # "/mnt/localdisk/mp_rank_00_model_states.pt",
11
+ # model_dir="/mnt/localdisk/gptj",
12
+ # lm_from_pretrained=True,
13
+ # )
14
+ # gptj_model = model.lm
15
+ # model.half().cuda().eval()
16
+ tokenizer = get_tokenizer()
17
+ input_text = tokenizer.encode("this is a test", return_tensors="pt").cuda()
18
+ input_img = torch.ones(1, 3, 384, 384).half().cuda()
19
+
20
+ # input = model.embed([input_img, input_text])
21
+ # logits = gptj_model(inputs_embeds=input).logits
22
+ # logits = logits.detach().cpu().numpy()
23
+ # np.save("/mnt/localdisk/logits_new.npy", logits)
24
+
25
+ from transformers import GPTJForCausalLM
26
+ import torch
27
+
28
+ # load new model
29
+ model = GPTJForCausalLM.from_pretrained(
30
+ "EleutherAI/gpt-j-6B",
31
+ revision="float16",
32
+ torch_dtype=torch.float16,
33
+ low_cpu_mem_usage=True,
34
+ )
35
+ model.cuda()
36
+
37
+ model.eval()
38
+
39
+ logits = model(input_text).logits
40
+ logits = logits.detach().cpu().numpy()
41
+ np.save("/mnt/localdisk/gptj_logits_new.npy", logits)
42
+
43
+ print("test")
train.py ADDED
@@ -0,0 +1,192 @@
1
+ import torch
2
+ import os
3
+ import deepspeed
4
+ import wandb
5
+ from torch.utils.data import random_split, ConcatDataset
6
+ from torch.optim import AdamW
7
+ from tqdm import tqdm
8
+ from functools import partial
9
+ from magma.datasets import (
10
+ collate_fn,
11
+ ImgCptDataset,
12
+ )
13
+ from magma.magma import (
14
+ Magma,
15
+ )
16
+ from magma.utils import (
17
+ is_main,
18
+ cycle,
19
+ parse_args,
20
+ wandb_log,
21
+ wandb_init,
22
+ save_model,
23
+ load_model,
24
+ print_main,
25
+ configure_param_groups,
26
+ )
27
+ from magma.train_loop import (
28
+ eval_step,
29
+ inference_step,
30
+ train_step,
31
+ )
32
+
33
+
34
+ def _load_img_cpt_datasets(dataset_dir, tokenizer, transforms):
35
+ if isinstance(dataset_dir, (list, tuple)):
36
+ return ConcatDataset(
37
+ [_load_img_cpt_datasets(d, tokenizer, transforms) for d in dataset_dir]
38
+ )
39
+ elif isinstance(dataset_dir, str):
40
+ return ImgCptDataset(dataset_dir, tokenizer=tokenizer, transforms=transforms)
41
+ else:
42
+ raise TypeError("dataset dir wrong type")
43
+
44
+
45
+ def get_pretraining_datasets(config, tokenizer, transforms):
46
+ # if config.train_dataset_dir is a list, load all datasets + join together
47
+ train_dataset = _load_img_cpt_datasets(
48
+ config.train_dataset_dir, tokenizer, transforms
49
+ )
50
+ # if no dedicated eval sets are given, use a percentage of the train dataset
51
+ if config.eval_dataset_dir is None:
52
+ eval_len = int(len(train_dataset) * config.eval_dataset_pct)
53
+ train_len = len(train_dataset) - eval_len
54
+ print(
55
+ f"Randomly splitting train_dataset into two datasets of length {train_len} and {eval_len}"
56
+ )
57
+ train_dataset, eval_dataset = random_split(train_dataset, [train_len, eval_len])
58
+ else:
59
+ eval_dataset = _load_img_cpt_datasets(
60
+ config.eval_dataset_dir, tokenizer, transforms
61
+ )
62
+
63
+ print_main(f"Loaded train dataset with {len(train_dataset)} samples")
64
+ print_main(f"Loaded eval dataset with {len(eval_dataset)} samples")
65
+
66
+ return train_dataset, eval_dataset
67
+
68
+
69
+ # tell tokenizers not to do parallelism
70
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
71
+
72
+ if __name__ == "__main__":
73
+
74
+ # parse command line arguments:
75
+ args = parse_args()
76
+ deepspeed.init_distributed()
77
+
78
+ # load model + tokenizer:
79
+ model = Magma(
80
+ args.config
81
+ ) # for finetuning one might want to load the model via Magma.from_checkpoint(...) here
82
+ tokenizer, config, transforms = model.tokenizer, model.config, model.transforms
83
+
84
+ # filter frozen from trainable parameters:
85
+ trainable_parameters = configure_param_groups(model, config)
86
+
87
+ # load data:
88
+ train_dataset, eval_dataset = get_pretraining_datasets(
89
+ config, tokenizer, transforms
90
+ )
91
+
92
+ print_main(f"Loaded train dataset with {len(train_dataset)} samples")
93
+ print_main(f"Loaded eval dataset with {len(eval_dataset)} samples")
94
+
95
+ opt = AdamW(
96
+ trainable_parameters,
97
+ config.lr,
98
+ betas=(0.9, 0.95),
99
+ weight_decay=config.weight_decay,
100
+ )
101
+
102
+ model_engine, opt, train_loader, lr_scheduler = deepspeed.initialize(
103
+ args=args,
104
+ model=model,
105
+ optimizer=opt,
106
+ model_parameters=trainable_parameters,
107
+ training_data=train_dataset,
108
+ collate_fn=partial(collate_fn, seq_len=model.seq_len),
109
+ config_params=config.deepspeed_config_params,
110
+ )
111
+ eval_loader = cycle(model_engine.deepspeed_io(eval_dataset))
112
+ train_loader = cycle(train_loader)
113
+
114
+ # initialize training
115
+ global_step = 0
116
+ if config.load:
117
+ # loads a deepspeed checkpoint if provided. For finetuning, set load_optimizer to false
118
+ previous_global_step = load_model(
119
+ model_engine,
120
+ config.load,
121
+ load_optimizer_states=config.load_optimizer,
122
+ load_lr_scheduler_states=config.load_optimizer,
123
+ )
124
+
125
+ if config.load_optimizer:
126
+ global_step = previous_global_step
127
+
128
+ pbar = tqdm(
129
+ range(0, config.train_steps),
130
+ desc="training...",
131
+ initial=global_step,
132
+ total=config.train_steps,
133
+ disable=not is_main(),
134
+ )
135
+ wandb_init(
136
+ project=config.wandb_project,
137
+ name=config.name or wandb.util.generate_id(),
138
+ config=config,
139
+ )
140
+
141
+ # training loop
142
+ for i in pbar:
143
+ if global_step >= config.train_steps:
144
+ break
145
+
146
+ ##### train step
147
+ loss = train_step(config, train_loader, model_engine)
148
+
149
+ global_step += 1
150
+
151
+ if global_step % config.log_every == 0:
152
+ pbar.set_description(f"training... Step: {global_step} Loss: {loss}")
153
+ current_lr = (
154
+ [lr for lr in lr_scheduler.get_lr()]
155
+ if lr_scheduler is not None
156
+ else config.lr
157
+ )
158
+ to_log = {"train/loss": loss, "train/lr": current_lr}
159
+ wandb_log(to_log, step=global_step)
160
+
161
+ ##### Evaluation phase
162
+ if global_step % config.eval_every == 0:
163
+ model_engine.eval()
164
+ with torch.no_grad():
165
+
166
+ ##### eval step:
167
+ eval_loss = eval_step(config, eval_loader, model_engine)
168
+
169
+ wandb_log({"eval/loss": eval_loss}, step=global_step)
170
+ pbar.set_description(
171
+ f"evaluating... Step: {global_step} Eval Loss: {eval_loss}"
172
+ )
173
+
174
+ ##### inference:
175
+ image_grid, caption = inference_step(config, eval_loader, model_engine)
176
+ wandb_log(
177
+ {"inference/image": wandb.Image(image_grid, caption=caption)},
178
+ step=global_step,
179
+ )
180
+
181
+ model_engine.train()
182
+
183
+ ##### Save model
184
+ if global_step % config.save_every == 0:
185
+ if config.save is not None:
186
+ save_model(model_engine, config.save, global_step)
187
+ print_main(f"saving model at step {global_step}")
188
+
189
+ ##### Save model after training is finished
190
+ if config.save is not None:
191
+ save_model(model_engine, config.save, global_step)
192
+ print_main(f"saving model at end of training (step {global_step})")
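For reference, these are the config attributes that train.py and configure_param_groups read at runtime, collected into a plain dict for readability. The real config object exposes them as attributes (config.lr, config.save, ...), and the values below are placeholders, not the settings shipped in configs/MAGMA_v1.yml:

    # Placeholder values for illustration only.
    assumed_training_config = {
        "train_dataset_dir": "/path/to/train_data",  # str or list of dataset dirs
        "eval_dataset_dir": None,       # if None, split off eval_dataset_pct of the train set
        "eval_dataset_pct": 0.05,
        "lr": 2e-4,
        "min_lr": 0.0,
        "image_enc_lr": None,           # optional separate lr for the image encoder
        "weight_decay": 0.0,
        "train_steps": 15000,
        "log_every": 10,
        "eval_every": 250,
        "save_every": 2500,
        "save": "/path/to/checkpoints",  # also read by infer_checkpoint_path_from_config
        "load": None,                    # deepspeed checkpoint dir to resume from
        "load_optimizer": True,
        "wandb_project": "magma",
        "name": None,                    # falls back to wandb.util.generate_id()
        "deepspeed_config_params": {},   # passed straight to deepspeed.initialize
    }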