boris committed
Commit
9c0e5c9
2 Parent(s): 3f0364c 86ba774

Merge pull request #8 from pcuenca/main

.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__
dalle_mini/__init__.py ADDED
@@ -0,0 +1 @@
+ __version__ = "0.0.1"
dalle_mini/dataset.py ADDED
@@ -0,0 +1,122 @@
+ """
+ An image-caption dataset dataloader.
+ Luke Melas-Kyriazi, 2021
+ """
+ import warnings
+ from typing import Optional, Callable
+ from pathlib import Path
+ import numpy as np
+ import torch
+ import pandas as pd
+ from torch.utils.data import Dataset
+ from torchvision.datasets.folder import default_loader
+ from PIL import ImageFile
+ from PIL.Image import DecompressionBombWarning
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
+ warnings.filterwarnings("ignore", category=UserWarning)
+ warnings.filterwarnings("ignore", category=DecompressionBombWarning)
+
+
+ class CaptionDataset(Dataset):
+     """
+     A PyTorch Dataset class for (image, texts) tasks. Note that this dataset
+     returns the raw text rather than tokens. This is done on purpose, because
+     it's easy to tokenize a batch of text after loading it from this dataset.
+     """
+
+     def __init__(self, *, images_root: str, captions_path: str, text_transform: Optional[Callable] = None,
+                  image_transform: Optional[Callable] = None, image_transform_type: str = 'torchvision',
+                  include_captions: bool = True):
+         """
+         :param images_root: folder where images are stored
+         :param captions_path: path to csv that maps image filenames to captions
+         :param image_transform: image transform pipeline
+         :param text_transform: text transform pipeline
+         :param image_transform_type: image transform type, either `torchvision` or `albumentations`
+         :param include_captions: Returns a dictionary with `image`, `text` if `True`; otherwise returns just the images.
+         """
+
+         # Base path for images
+         self.images_root = Path(images_root)
+
+         # Load captions as DataFrame
+         self.captions = pd.read_csv(captions_path, delimiter='\t', header=0)
+         self.captions['image_file'] = self.captions['image_file'].astype(str)
+
+         # PyTorch transformation pipeline for the image (normalizing, etc.)
+         self.text_transform = text_transform
+         self.image_transform = image_transform
+         self.image_transform_type = image_transform_type.lower()
+         assert self.image_transform_type in ['torchvision', 'albumentations']
+
+         # Total number of datapoints
+         self.size = len(self.captions)
+
+         # Return image+captions or just images
+         self.include_captions = include_captions
+
+     def verify_that_all_images_exist(self):
+         for image_file in self.captions['image_file']:
+             p = self.images_root / image_file
+             if not p.is_file():
+                 print(f'file does not exist: {p}')
+
+     def _get_raw_image(self, i):
+         image_file = self.captions.iloc[i]['image_file']
+         image_path = self.images_root / image_file
+         image = default_loader(image_path)
+         return image
+
+     def _get_raw_text(self, i):
+         return self.captions.iloc[i]['caption']
+
+     def __getitem__(self, i):
+         image = self._get_raw_image(i)
+         caption = self._get_raw_text(i)
+         if self.image_transform is not None:
+             if self.image_transform_type == 'torchvision':
+                 image = self.image_transform(image)
+             elif self.image_transform_type == 'albumentations':
+                 image = self.image_transform(image=np.array(image))['image']
+             else:
+                 raise NotImplementedError(f"{self.image_transform_type=}")
+         return {'image': image, 'text': caption} if self.include_captions else image
+
+     def __len__(self):
+         return self.size
+
+
+ if __name__ == "__main__":
+     import albumentations as A
+     from albumentations.pytorch import ToTensorV2
+     from transformers import AutoTokenizer
+
+     # Paths
+     images_root = './images'
+     captions_path = './images-list-clean.tsv'
+
+     # Create transforms
+     tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
+     def tokenize(text):
+         return tokenizer(text, max_length=32, truncation=True, return_tensors='pt', padding='max_length')
+     image_transform = A.Compose([
+         A.Resize(256, 256), A.CenterCrop(256, 256),
+         A.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), ToTensorV2()])
+
+     # Create dataset
+     dataset = CaptionDataset(
+         images_root=images_root,
+         captions_path=captions_path,
+         image_transform=image_transform,
+         text_transform=tokenize,
+         image_transform_type='albumentations')
+
+     # Create dataloader
+     dataloader = torch.utils.data.DataLoader(dataset, batch_size=2)
+     batch = next(iter(dataloader))
+     print({k: (v.shape if isinstance(v, torch.Tensor) else v) for k, v in batch.items()})
+
+     # # (Optional) Check that all the images exist
+     # dataset = CaptionDataset(images_root=images_root, captions_path=captions_path)
+     # dataset.verify_that_all_images_exist()
+     # print('Done')
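Since CaptionDataset deliberately returns raw caption strings, tokenization happens after batching. A minimal usage sketch, not part of this commit (the paths and the distilroberta-base tokenizer are placeholders), showing the torchvision code path plus post-hoc tokenization:

    import torch
    import torchvision.transforms as T
    from transformers import AutoTokenizer

    from dalle_mini.dataset import CaptionDataset

    # Simple torchvision pipeline; any transform returning a tensor works here.
    image_transform = T.Compose([T.Resize(256), T.CenterCrop(256), T.ToTensor()])

    dataset = CaptionDataset(
        images_root='./images',                   # placeholder image folder
        captions_path='./images-list-clean.tsv',  # placeholder tsv with image_file/caption columns
        image_transform=image_transform,
        image_transform_type='torchvision',
    )
    loader = torch.utils.data.DataLoader(dataset, batch_size=4)
    batch = next(iter(loader))                    # {'image': tensor, 'text': list of raw captions}

    # Tokenize the raw captions after loading, as the class docstring suggests.
    tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
    tokens = tokenizer(list(batch['text']), padding=True, truncation=True, return_tensors='pt')
    print(batch['image'].shape, tokens['input_ids'].shape)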
dalle_mini/vqgan_jax/__init__.py ADDED
File without changes
dalle_mini/vqgan_jax/configuration_vqgan.py ADDED
@@ -0,0 +1,40 @@
+ from typing import Tuple
+
+ from transformers import PretrainedConfig
+
+
+ class VQGANConfig(PretrainedConfig):
+     def __init__(
+         self,
+         ch: int = 128,
+         out_ch: int = 3,
+         in_channels: int = 3,
+         num_res_blocks: int = 2,
+         resolution: int = 256,
+         z_channels: int = 256,
+         ch_mult: Tuple = (1, 1, 2, 2, 4),
+         attn_resolutions: Tuple = (16,),
+         n_embed: int = 1024,
+         embed_dim: int = 256,
+         dropout: float = 0.0,
+         double_z: bool = False,
+         resamp_with_conv: bool = True,
+         give_pre_end: bool = False,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.ch = ch
+         self.out_ch = out_ch
+         self.in_channels = in_channels
+         self.num_res_blocks = num_res_blocks
+         self.resolution = resolution
+         self.z_channels = z_channels
+         self.ch_mult = list(ch_mult)
+         self.attn_resolutions = list(attn_resolutions)
+         self.n_embed = n_embed
+         self.embed_dim = embed_dim
+         self.dropout = dropout
+         self.double_z = double_z
+         self.resamp_with_conv = resamp_with_conv
+         self.give_pre_end = give_pre_end
+         self.num_resolutions = len(ch_mult)
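VQGANConfig inherits the usual PretrainedConfig serialization, so it can be written out and reloaded like any transformers config. A small sketch, assuming the default values above and a placeholder output directory:

    from dalle_mini.vqgan_jax.configuration_vqgan import VQGANConfig

    config = VQGANConfig()                      # defaults: ch=128, ch_mult=(1, 1, 2, 2, 4), n_embed=1024, ...
    print(config.num_resolutions)               # 5, derived from len(ch_mult)

    config.save_pretrained("./vqgan-config")    # writes config.json to a placeholder directory
    reloaded = VQGANConfig.from_pretrained("./vqgan-config")
    assert reloaded.embed_dim == config.embed_dim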
dalle_mini/vqgan_jax/convert_pt_model_to_jax.py ADDED
@@ -0,0 +1,109 @@
+ import re
+
+ import jax.numpy as jnp
+ from flax.traverse_util import flatten_dict, unflatten_dict
+
+ import torch
+
+ from modeling_flax_vqgan import VQModel
+ from configuration_vqgan import VQGANConfig
+
+
+ regex = r"\w+[.]\d+"
+
+
+ def rename_key(key):
+     pats = re.findall(regex, key)
+     for pat in pats:
+         key = key.replace(pat, "_".join(pat.split(".")))
+     return key
+
+
+ # Adapted from https://github.com/huggingface/transformers/blob/ff5cdc086be1e0c3e2bbad8e3469b34cffb55a85/src/transformers/modeling_flax_pytorch_utils.py#L61
+ def convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model):
+     # convert pytorch tensor to numpy
+     pt_state_dict = {k: v.numpy() for k, v in pt_state_dict.items()}
+
+     random_flax_state_dict = flatten_dict(flax_model.params)
+     flax_state_dict = {}
+
+     remove_base_model_prefix = (flax_model.base_model_prefix not in flax_model.params) and (
+         flax_model.base_model_prefix in set([k.split(".")[0] for k in pt_state_dict.keys()])
+     )
+     add_base_model_prefix = (flax_model.base_model_prefix in flax_model.params) and (
+         flax_model.base_model_prefix not in set([k.split(".")[0] for k in pt_state_dict.keys()])
+     )
+
+     # Need to change some parameters name to match Flax names so that we don't have to fork any layer
+     for pt_key, pt_tensor in pt_state_dict.items():
+         pt_tuple_key = tuple(pt_key.split("."))
+
+         has_base_model_prefix = pt_tuple_key[0] == flax_model.base_model_prefix
+         require_base_model_prefix = (flax_model.base_model_prefix,) + pt_tuple_key in random_flax_state_dict
+
+         if remove_base_model_prefix and has_base_model_prefix:
+             pt_tuple_key = pt_tuple_key[1:]
+         elif add_base_model_prefix and require_base_model_prefix:
+             pt_tuple_key = (flax_model.base_model_prefix,) + pt_tuple_key
+
+         # Correctly rename weight parameters
+         if (
+             "norm" in pt_key
+             and (pt_tuple_key[-1] == "bias")
+             and (pt_tuple_key[:-1] + ("bias",) in random_flax_state_dict)
+         ):
+             pt_tensor = pt_tensor[None, None, None, :]
+         elif (
+             "norm" in pt_key
+             and (pt_tuple_key[-1] == "bias")
+             and (pt_tuple_key[:-1] + ("scale",) in random_flax_state_dict)
+         ):
+             pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
+             pt_tensor = pt_tensor[None, None, None, :]
+         elif pt_tuple_key[-1] in ["weight", "gamma"] and pt_tuple_key[:-1] + ("scale",) in random_flax_state_dict:
+             pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
+             pt_tensor = pt_tensor[None, None, None, :]
+         if pt_tuple_key[-1] == "weight" and pt_tuple_key[:-1] + ("embedding",) in random_flax_state_dict:
+             pt_tuple_key = pt_tuple_key[:-1] + ("embedding",)
+         elif pt_tuple_key[-1] == "weight" and pt_tensor.ndim == 4 and pt_tuple_key not in random_flax_state_dict:
+             # conv layer
+             pt_tuple_key = pt_tuple_key[:-1] + ("kernel",)
+             pt_tensor = pt_tensor.transpose(2, 3, 1, 0)
+         elif pt_tuple_key[-1] == "weight" and pt_tuple_key not in random_flax_state_dict:
+             # linear layer
+             pt_tuple_key = pt_tuple_key[:-1] + ("kernel",)
+             pt_tensor = pt_tensor.T
+         elif pt_tuple_key[-1] == "gamma":
+             pt_tuple_key = pt_tuple_key[:-1] + ("weight",)
+         elif pt_tuple_key[-1] == "beta":
+             pt_tuple_key = pt_tuple_key[:-1] + ("bias",)
+
+         if pt_tuple_key in random_flax_state_dict:
+             if pt_tensor.shape != random_flax_state_dict[pt_tuple_key].shape:
+                 raise ValueError(
+                     f"PyTorch checkpoint seems to be incorrect. Weight {pt_key} was expected to be of shape "
+                     f"{random_flax_state_dict[pt_tuple_key].shape}, but is {pt_tensor.shape}."
+                 )
+
+         # also add unexpected weight so that warning is thrown
+         flax_state_dict[pt_tuple_key] = jnp.asarray(pt_tensor)
+
+     return unflatten_dict(flax_state_dict)
+
+
+ def convert_model(config_path, pt_state_dict_path, save_path):
+     config = VQGANConfig.from_pretrained(config_path)
+     model = VQModel(config)
+
+     state_dict = torch.load(pt_state_dict_path, map_location="cpu")["state_dict"]
+     keys = list(state_dict.keys())
+     for key in keys:
+         if key.startswith("loss"):
+             state_dict.pop(key)
+             continue
+         renamed_key = rename_key(key)
+         state_dict[renamed_key] = state_dict.pop(key)
+
+     state = convert_pytorch_state_dict_to_flax(state_dict, model)
+     model.params = unflatten_dict(state)
+     model.save_pretrained(save_path)
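The script has no command-line interface; convert_model is called directly from Python (it also imports modeling_flax_vqgan without a package prefix, so it is meant to run from inside this directory). A hedged invocation sketch with placeholder paths:

    from convert_pt_model_to_jax import convert_model

    convert_model(
        config_path="./vqgan_config",      # anything VQGANConfig.from_pretrained accepts (placeholder)
        pt_state_dict_path="./last.ckpt",  # Lightning checkpoint containing a "state_dict" entry (placeholder)
        save_path="./vqgan-jax",           # output directory for the converted Flax weights (placeholder)
    )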
dalle_mini/vqgan_jax/modeling_flax_vqgan.py ADDED
@@ -0,0 +1,609 @@
+ # JAX implementation of VQGAN from taming-transformers https://github.com/CompVis/taming-transformers
+
+ from functools import partial
+ from typing import Tuple
+ import math
+
+ import jax
+ import jax.numpy as jnp
+ import numpy as np
+ import flax.linen as nn
+ from flax.core.frozen_dict import FrozenDict
+
+ from transformers.modeling_flax_utils import FlaxPreTrainedModel
+
+ from .configuration_vqgan import VQGANConfig
+
+
+ class Upsample(nn.Module):
+     in_channels: int
+     with_conv: bool
+     dtype: jnp.dtype = jnp.float32
+
+     def setup(self):
+         if self.with_conv:
+             self.conv = nn.Conv(
+                 self.in_channels,
+                 kernel_size=(3, 3),
+                 strides=(1, 1),
+                 padding=((1, 1), (1, 1)),
+                 dtype=self.dtype,
+             )
+
+     def __call__(self, hidden_states):
+         batch, height, width, channels = hidden_states.shape
+         hidden_states = jax.image.resize(
+             hidden_states,
+             shape=(batch, height * 2, width * 2, channels),
+             method="nearest",
+         )
+         if self.with_conv:
+             hidden_states = self.conv(hidden_states)
+         return hidden_states
+
+
+ class Downsample(nn.Module):
+     in_channels: int
+     with_conv: bool
+     dtype: jnp.dtype = jnp.float32
+
+     def setup(self):
+         if self.with_conv:
+             self.conv = nn.Conv(
+                 self.in_channels,
+                 kernel_size=(3, 3),
+                 strides=(2, 2),
+                 padding="VALID",
+                 dtype=self.dtype,
+             )
+
+     def __call__(self, hidden_states):
+         if self.with_conv:
+             pad = ((0, 0), (0, 1), (0, 1), (0, 0))  # pad height and width dim
+             hidden_states = jnp.pad(hidden_states, pad_width=pad)
+             hidden_states = self.conv(hidden_states)
+         else:
+             hidden_states = nn.avg_pool(hidden_states, window_shape=(2, 2), strides=(2, 2), padding="VALID")
+         return hidden_states
+
+
+ class ResnetBlock(nn.Module):
+     in_channels: int
+     out_channels: int = None
+     use_conv_shortcut: bool = False
+     temb_channels: int = 512
+     dropout_prob: float = 0.0
+     dtype: jnp.dtype = jnp.float32
+
+     def setup(self):
+         self.out_channels_ = self.in_channels if self.out_channels is None else self.out_channels
+
+         self.norm1 = nn.GroupNorm(num_groups=32, epsilon=1e-6)
+         self.conv1 = nn.Conv(
+             self.out_channels_,
+             kernel_size=(3, 3),
+             strides=(1, 1),
+             padding=((1, 1), (1, 1)),
+             dtype=self.dtype,
+         )
+
+         if self.temb_channels:
+             self.temb_proj = nn.Dense(self.out_channels_, dtype=self.dtype)
+
+         self.norm2 = nn.GroupNorm(num_groups=32, epsilon=1e-6)
+         self.dropout = nn.Dropout(self.dropout_prob)
+         self.conv2 = nn.Conv(
+             self.out_channels_,
+             kernel_size=(3, 3),
+             strides=(1, 1),
+             padding=((1, 1), (1, 1)),
+             dtype=self.dtype,
+         )
+
+         if self.in_channels != self.out_channels_:
+             if self.use_conv_shortcut:
+                 self.conv_shortcut = nn.Conv(
+                     self.out_channels_,
+                     kernel_size=(3, 3),
+                     strides=(1, 1),
+                     padding=((1, 1), (1, 1)),
+                     dtype=self.dtype,
+                 )
+             else:
+                 self.nin_shortcut = nn.Conv(
+                     self.out_channels_,
+                     kernel_size=(1, 1),
+                     strides=(1, 1),
+                     padding="VALID",
+                     dtype=self.dtype,
+                 )
+
+     def __call__(self, hidden_states, temb=None, deterministic: bool = True):
+         residual = hidden_states
+         hidden_states = self.norm1(hidden_states)
+         hidden_states = nn.swish(hidden_states)
+         hidden_states = self.conv1(hidden_states)
+
+         if temb is not None:
+             hidden_states = hidden_states + self.temb_proj(nn.swish(temb))[:, :, None, None]  # TODO: check shapes
+
+         hidden_states = self.norm2(hidden_states)
+         hidden_states = nn.swish(hidden_states)
+         hidden_states = self.dropout(hidden_states, deterministic)
+         hidden_states = self.conv2(hidden_states)
+
+         if self.in_channels != self.out_channels_:
+             if self.use_conv_shortcut:
+                 residual = self.conv_shortcut(residual)
+             else:
+                 residual = self.nin_shortcut(residual)
+
+         return hidden_states + residual
+
+
+ class AttnBlock(nn.Module):
+     in_channels: int
+     dtype: jnp.dtype = jnp.float32
+
+     def setup(self):
+         conv = partial(
+             nn.Conv, self.in_channels, kernel_size=(1, 1), strides=(1, 1), padding="VALID", dtype=self.dtype
+         )
+
+         self.norm = nn.GroupNorm(num_groups=32, epsilon=1e-6)
+         self.q, self.k, self.v = conv(), conv(), conv()
+         self.proj_out = conv()
+
+     def __call__(self, hidden_states):
+         residual = hidden_states
+         hidden_states = self.norm(hidden_states)
+
+         query = self.q(hidden_states)
+         key = self.k(hidden_states)
+         value = self.v(hidden_states)
+
+         # compute attentions
+         batch, height, width, channels = query.shape
+         query = query.reshape((batch, height * width, channels))
+         key = key.reshape((batch, height * width, channels))
+         attn_weights = jnp.einsum("...qc,...kc->...qk", query, key)
+         attn_weights = attn_weights * (int(channels) ** -0.5)
+         attn_weights = nn.softmax(attn_weights, axis=2)
+
+         ## attend to values
+         value = value.reshape((batch, height * width, channels))
+         hidden_states = jnp.einsum("...kc,...qk->...qc", value, attn_weights)
+         hidden_states = hidden_states.reshape((batch, height, width, channels))
+
+         hidden_states = self.proj_out(hidden_states)
+         hidden_states = hidden_states + residual
+         return hidden_states
+
+
+ class UpsamplingBlock(nn.Module):
+     config: VQGANConfig
+     curr_res: int
+     block_idx: int
+     dtype: jnp.dtype = jnp.float32
+
+     def setup(self):
+         if self.block_idx == self.config.num_resolutions - 1:
+             block_in = self.config.ch * self.config.ch_mult[-1]
+         else:
+             block_in = self.config.ch * self.config.ch_mult[self.block_idx + 1]
+
+         block_out = self.config.ch * self.config.ch_mult[self.block_idx]
+         self.temb_ch = 0
+
+         res_blocks = []
+         attn_blocks = []
+         for _ in range(self.config.num_res_blocks + 1):
+             res_blocks.append(
+                 ResnetBlock(
+                     block_in, block_out, temb_channels=self.temb_ch, dropout_prob=self.config.dropout, dtype=self.dtype
+                 )
+             )
+             block_in = block_out
+             if self.curr_res in self.config.attn_resolutions:
+                 attn_blocks.append(AttnBlock(block_in, dtype=self.dtype))
+
+         self.block = res_blocks
+         self.attn = attn_blocks
+
+         self.upsample = None
+         if self.block_idx != 0:
+             self.upsample = Upsample(block_in, self.config.resamp_with_conv, dtype=self.dtype)
+
+     def __call__(self, hidden_states, temb=None, deterministic: bool = True):
+         for res_block in self.block:
+             hidden_states = res_block(hidden_states, temb, deterministic=deterministic)
+             for attn_block in self.attn:
+                 hidden_states = attn_block(hidden_states)
+
+         if self.upsample is not None:
+             hidden_states = self.upsample(hidden_states)
+
+         return hidden_states
+
+
+ class DownsamplingBlock(nn.Module):
+     config: VQGANConfig
+     curr_res: int
+     block_idx: int
+     dtype: jnp.dtype = jnp.float32
+
+     def setup(self):
+         in_ch_mult = (1,) + tuple(self.config.ch_mult)
+         block_in = self.config.ch * in_ch_mult[self.block_idx]
+         block_out = self.config.ch * self.config.ch_mult[self.block_idx]
+         self.temb_ch = 0
+
+         res_blocks = []
+         attn_blocks = []
+         for _ in range(self.config.num_res_blocks):
+             res_blocks.append(
+                 ResnetBlock(
+                     block_in, block_out, temb_channels=self.temb_ch, dropout_prob=self.config.dropout, dtype=self.dtype
+                 )
+             )
+             block_in = block_out
+             if self.curr_res in self.config.attn_resolutions:
+                 attn_blocks.append(AttnBlock(block_in, dtype=self.dtype))
+
+         self.block = res_blocks
+         self.attn = attn_blocks
+
+         self.downsample = None
+         if self.block_idx != self.config.num_resolutions - 1:
+             self.downsample = Downsample(block_in, self.config.resamp_with_conv, dtype=self.dtype)
+
+     def __call__(self, hidden_states, temb=None, deterministic: bool = True):
+         for res_block in self.block:
+             hidden_states = res_block(hidden_states, temb, deterministic=deterministic)
+             for attn_block in self.attn:
+                 hidden_states = attn_block(hidden_states)
+
+         if self.downsample is not None:
+             hidden_states = self.downsample(hidden_states)
+
+         return hidden_states
+
+
+ class MidBlock(nn.Module):
+     in_channels: int
+     temb_channels: int
+     dropout: float
+     dtype: jnp.dtype = jnp.float32
+
+     def setup(self):
+         self.block_1 = ResnetBlock(
+             self.in_channels,
+             self.in_channels,
+             temb_channels=self.temb_channels,
+             dropout_prob=self.dropout,
+             dtype=self.dtype,
+         )
+         self.attn_1 = AttnBlock(self.in_channels, dtype=self.dtype)
+         self.block_2 = ResnetBlock(
+             self.in_channels,
+             self.in_channels,
+             temb_channels=self.temb_channels,
+             dropout_prob=self.dropout,
+             dtype=self.dtype,
+         )
+
+     def __call__(self, hidden_states, temb=None, deterministic: bool = True):
+         hidden_states = self.block_1(hidden_states, temb, deterministic=deterministic)
+         hidden_states = self.attn_1(hidden_states)
+         hidden_states = self.block_2(hidden_states, temb, deterministic=deterministic)
+         return hidden_states
+
+
+ class Encoder(nn.Module):
+     config: VQGANConfig
+     dtype: jnp.dtype = jnp.float32
+
+     def setup(self):
+         self.temb_ch = 0
+
+         # downsampling
+         self.conv_in = nn.Conv(
+             self.config.ch,
+             kernel_size=(3, 3),
+             strides=(1, 1),
+             padding=((1, 1), (1, 1)),
+             dtype=self.dtype,
+         )
+
+         curr_res = self.config.resolution
+         downsample_blocks = []
+         for i_level in range(self.config.num_resolutions):
+             downsample_blocks.append(DownsamplingBlock(self.config, curr_res, block_idx=i_level, dtype=self.dtype))
+
+             if i_level != self.config.num_resolutions - 1:
+                 curr_res = curr_res // 2
+         self.down = downsample_blocks
+
+         # middle
+         mid_channels = self.config.ch * self.config.ch_mult[-1]
+         self.mid = MidBlock(mid_channels, self.temb_ch, self.config.dropout, dtype=self.dtype)
+
+         # end
+         self.norm_out = nn.GroupNorm(num_groups=32, epsilon=1e-6)
+         self.conv_out = nn.Conv(
+             2 * self.config.z_channels if self.config.double_z else self.config.z_channels,
+             kernel_size=(3, 3),
+             strides=(1, 1),
+             padding=((1, 1), (1, 1)),
+             dtype=self.dtype,
+         )
+
+     def __call__(self, pixel_values, deterministic: bool = True):
+         # timestep embedding
+         temb = None
+
+         # downsampling
+         hidden_states = self.conv_in(pixel_values)
+         for block in self.down:
+             hidden_states = block(hidden_states, temb, deterministic=deterministic)
+
+         # middle
+         hidden_states = self.mid(hidden_states, temb, deterministic=deterministic)
+
+         # end
+         hidden_states = self.norm_out(hidden_states)
+         hidden_states = nn.swish(hidden_states)
+         hidden_states = self.conv_out(hidden_states)
+
+         return hidden_states
+
+
+ class Decoder(nn.Module):
+     config: VQGANConfig
+     dtype: jnp.dtype = jnp.float32
+
+     def setup(self):
+         self.temb_ch = 0
+
+         # compute in_ch_mult, block_in and curr_res at lowest res
+         block_in = self.config.ch * self.config.ch_mult[self.config.num_resolutions - 1]
+         curr_res = self.config.resolution // 2 ** (self.config.num_resolutions - 1)
+         self.z_shape = (1, self.config.z_channels, curr_res, curr_res)
+         print("Working with z of shape {} = {} dimensions.".format(self.z_shape, np.prod(self.z_shape)))
+
+         # z to block_in
+         self.conv_in = nn.Conv(
+             block_in,
+             kernel_size=(3, 3),
+             strides=(1, 1),
+             padding=((1, 1), (1, 1)),
+             dtype=self.dtype,
+         )
+
+         # middle
+         self.mid = MidBlock(block_in, self.temb_ch, self.config.dropout, dtype=self.dtype)
+
+         # upsampling
+         upsample_blocks = []
+         for i_level in reversed(range(self.config.num_resolutions)):
+             upsample_blocks.append(UpsamplingBlock(self.config, curr_res, block_idx=i_level, dtype=self.dtype))
+             if i_level != 0:
+                 curr_res = curr_res * 2
+         self.up = list(reversed(upsample_blocks))  # reverse to get consistent order
+
+         # end
+         self.norm_out = nn.GroupNorm(num_groups=32, epsilon=1e-6)
+         self.conv_out = nn.Conv(
+             self.config.out_ch,
+             kernel_size=(3, 3),
+             strides=(1, 1),
+             padding=((1, 1), (1, 1)),
+             dtype=self.dtype,
+         )
+
+     def __call__(self, hidden_states, deterministic: bool = True):
+         # timestep embedding
+         temb = None
+
+         # z to block_in
+         hidden_states = self.conv_in(hidden_states)
+
+         # middle
+         hidden_states = self.mid(hidden_states, temb, deterministic=deterministic)
+
+         # upsampling
+         for block in reversed(self.up):
+             hidden_states = block(hidden_states, temb, deterministic=deterministic)
+
+         # end
+         if self.config.give_pre_end:
+             return hidden_states
+
+         hidden_states = self.norm_out(hidden_states)
+         hidden_states = nn.swish(hidden_states)
+         hidden_states = self.conv_out(hidden_states)
+
+         return hidden_states
+
+
+ class VectorQuantizer(nn.Module):
+     """
+     see https://github.com/MishaLaskin/vqvae/blob/d761a999e2267766400dc646d82d3ac3657771d4/models/quantizer.py
+     ____________________________________________
+     Discretization bottleneck part of the VQ-VAE.
+     Inputs:
+     - n_e : number of embeddings
+     - e_dim : dimension of embedding
+     - beta : commitment cost used in loss term, beta * ||z_e(x)-sg[e]||^2
+     _____________________________________________
+     """
+
+     config: VQGANConfig
+     dtype: jnp.dtype = jnp.float32
+
+     def setup(self):
+         self.embedding = nn.Embed(self.config.n_embed, self.config.embed_dim, dtype=self.dtype)  # TODO: init
+
+     def __call__(self, hidden_states):
+         """
+         Inputs the output of the encoder network z and maps it to a discrete
+         one-hot vector that is the index of the closest embedding vector e_j
+         z (continuous) -> z_q (discrete)
+         z.shape = (batch, channel, height, width)
+         quantization pipeline:
+             1. get encoder input (B,C,H,W)
+             2. flatten input to (B*H*W,C)
+         """
+         # flatten
+         hidden_states_flattended = hidden_states.reshape((-1, self.config.embed_dim))
+
+         # dummy op to init the weights, so we can access them below
+         self.embedding(jnp.ones((1, 1), dtype="i4"))
+
+         # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
+         emb_weights = self.variables["params"]["embedding"]["embedding"]
+         distance = (
+             jnp.sum(hidden_states_flattended ** 2, axis=1, keepdims=True)
+             + jnp.sum(emb_weights ** 2, axis=1)
+             - 2 * jnp.dot(hidden_states_flattended, emb_weights.T)
+         )
+
+         # get quantized latent vectors
+         min_encoding_indices = jnp.argmin(distance, axis=1)
+         z_q = self.embedding(min_encoding_indices).reshape(hidden_states.shape)
+
+         # reshape to (batch, num_tokens)
+         min_encoding_indices = min_encoding_indices.reshape(hidden_states.shape[0], -1)
+
+         # compute the codebook_loss (q_loss) outside the model
+         # here we return the embeddings and indices
+         return z_q, min_encoding_indices
+
+     def get_codebook_entry(self, indices, shape=None):
+         # indices are expected to be of shape (batch, num_tokens)
+         # get quantized latent vectors
+         batch, num_tokens = indices.shape
+         z_q = self.embedding(indices)
+         z_q = z_q.reshape(batch, int(math.sqrt(num_tokens)), int(math.sqrt(num_tokens)), -1)
+         return z_q
+
+
+ class VQModule(nn.Module):
+     config: VQGANConfig
+     dtype: jnp.dtype = jnp.float32
+
+     def setup(self):
+         self.encoder = Encoder(self.config, dtype=self.dtype)
+         self.decoder = Decoder(self.config, dtype=self.dtype)
+         self.quantize = VectorQuantizer(self.config, dtype=self.dtype)
+         self.quant_conv = nn.Conv(
+             self.config.embed_dim,
+             kernel_size=(1, 1),
+             strides=(1, 1),
+             padding="VALID",
+             dtype=self.dtype,
+         )
+         self.post_quant_conv = nn.Conv(
+             self.config.z_channels,
+             kernel_size=(1, 1),
+             strides=(1, 1),
+             padding="VALID",
+             dtype=self.dtype,
+         )
+
+     def encode(self, pixel_values, deterministic: bool = True):
+         hidden_states = self.encoder(pixel_values, deterministic=deterministic)
+         hidden_states = self.quant_conv(hidden_states)
+         quant_states, indices = self.quantize(hidden_states)
+         return quant_states, indices
+
+     def decode(self, hidden_states, deterministic: bool = True):
+         hidden_states = self.post_quant_conv(hidden_states)
+         hidden_states = self.decoder(hidden_states, deterministic=deterministic)
+         return hidden_states
+
+     def decode_code(self, code_b):
+         hidden_states = self.quantize.get_codebook_entry(code_b)
+         hidden_states = self.decode(hidden_states)
+         return hidden_states
+
+     def __call__(self, pixel_values, deterministic: bool = True):
+         quant_states, indices = self.encode(pixel_values, deterministic)
+         hidden_states = self.decode(quant_states, deterministic)
+         return hidden_states, indices
+
+
+ class VQGANPreTrainedModel(FlaxPreTrainedModel):
+     """
+     An abstract class to handle weights initialization and a simple interface
+     for downloading and loading pretrained models.
+     """
+
+     config_class = VQGANConfig
+     base_model_prefix = "model"
+     module_class: nn.Module = None
+
+     def __init__(
+         self,
+         config: VQGANConfig,
+         input_shape: Tuple = (1, 256, 256, 3),
+         seed: int = 0,
+         dtype: jnp.dtype = jnp.float32,
+         **kwargs,
+     ):
+         module = self.module_class(config=config, dtype=dtype, **kwargs)
+         super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype)
+
+     def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple) -> FrozenDict:
+         # init input tensors
+         pixel_values = jnp.zeros(input_shape, dtype=jnp.float32)
+         params_rng, dropout_rng = jax.random.split(rng)
+         rngs = {"params": params_rng, "dropout": dropout_rng}
+
+         return self.module.init(rngs, pixel_values)["params"]
+
+     def encode(self, pixel_values, params: dict = None, dropout_rng: jax.random.PRNGKey = None, train: bool = False):
+         # Handle any PRNG if needed
+         rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
+
+         return self.module.apply(
+             {"params": params or self.params}, jnp.array(pixel_values), not train, rngs=rngs, method=self.module.encode
+         )
+
+     def decode(self, hidden_states, params: dict = None, dropout_rng: jax.random.PRNGKey = None, train: bool = False):
+         # Handle any PRNG if needed
+         rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
+
+         return self.module.apply(
+             {"params": params or self.params},
+             jnp.array(hidden_states),
+             not train,
+             rngs=rngs,
+             method=self.module.decode,
+         )
+
+     def decode_code(self, indices, params: dict = None):
+         return self.module.apply(
+             {"params": params or self.params}, jnp.array(indices, dtype="i4"), method=self.module.decode_code
+         )
+
+     def __call__(
+         self,
+         pixel_values,
+         params: dict = None,
+         dropout_rng: jax.random.PRNGKey = None,
+         train: bool = False,
+     ):
+         # Handle any PRNG if needed
+         rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
+
+         return self.module.apply(
+             {"params": params or self.params},
+             jnp.array(pixel_values),
+             not train,
+             rngs=rngs,
+         )
+
+
+ class VQModel(VQGANPreTrainedModel):
+     module_class = VQModule
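A minimal round-trip sketch for this model, assuming the flax-community/vqgan_f16_16384 checkpoint used in the notebooks below; inputs are channels-last (NHWC) 256x256 batches:

    import jax.numpy as jnp
    from dalle_mini.vqgan_jax.modeling_flax_vqgan import VQModel

    model = VQModel.from_pretrained("flax-community/vqgan_f16_16384")

    pixel_values = jnp.zeros((1, 256, 256, 3), dtype=jnp.float32)  # NHWC, as expected by the Flax convolutions
    quant_states, indices = model.encode(pixel_values)
    print(indices.shape)         # (1, 256): a 16x16 grid of codebook indices for a 256x256 input

    reconstruction = model.decode_code(indices)
    print(reconstruction.shape)  # (1, 256, 256, 3)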
encoding/vqgan-jax-encoding-with-captions.ipynb ADDED
@@ -0,0 +1,363 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "d0b72877",
6
+ "metadata": {},
7
+ "source": [
8
+ "# vqgan-jax-encoding-with-captions"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "875c82b3",
14
+ "metadata": {},
15
+ "source": [
16
+ "Notebook based on [vqgan-jax-reconstruction](https://colab.research.google.com/drive/1mdXXsMbV6K_LTvCh3IImRsFIWcKU5m1w?usp=sharing) by @surajpatil.\n",
17
+ "\n",
18
+ "We process a `tsv` file with `image_file` and `caption` fields, and add a `vqgan_indices` column with indices extracted from a VQGAN-JAX model."
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 1,
24
+ "id": "3b59489e",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "import io\n",
29
+ "\n",
30
+ "import requests\n",
31
+ "from PIL import Image\n",
32
+ "import numpy as np\n",
33
+ "from tqdm import tqdm\n",
34
+ "\n",
35
+ "import torch\n",
36
+ "import torchvision.transforms as T\n",
37
+ "import torchvision.transforms.functional as TF\n",
38
+ "from torchvision.transforms import InterpolationMode\n",
39
+ "from torch.utils.data import Dataset, DataLoader\n",
40
+ "\n",
41
+ "import jax\n",
42
+ "from jax import pmap"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "markdown",
47
+ "id": "511c3b9e",
48
+ "metadata": {},
49
+ "source": [
50
+ "## VQGAN-JAX model"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "markdown",
55
+ "id": "bb408f6c",
56
+ "metadata": {},
57
+ "source": [
58
+ "`dalle_mini` is a local package that contains the VQGAN-JAX model and other utilities."
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 2,
64
+ "id": "2ca50dc7",
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "from dalle_mini.vqgan_jax.modeling_flax_vqgan import VQModel"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "markdown",
73
+ "id": "7b60da9a",
74
+ "metadata": {},
75
+ "source": [
76
+ "We'll use a VQGAN trained by using Taming Transformers and converted to a JAX model."
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": 3,
82
+ "id": "29ce8b15",
83
+ "metadata": {},
84
+ "outputs": [
85
+ {
86
+ "data": {
87
+ "application/vnd.jupyter.widget-view+json": {
88
+ "model_id": "db406bdfc5d5428eaeae1631a04989dd",
89
+ "version_major": 2,
90
+ "version_minor": 0
91
+ },
92
+ "text/plain": [
93
+ "Downloading: 0%| | 0.00/433 [00:00<?, ?B/s]"
94
+ ]
95
+ },
96
+ "metadata": {},
97
+ "output_type": "display_data"
98
+ },
99
+ {
100
+ "data": {
101
+ "application/vnd.jupyter.widget-view+json": {
102
+ "model_id": "3e37f07fba6d48fca70313ae1fa8cc32",
103
+ "version_major": 2,
104
+ "version_minor": 0
105
+ },
106
+ "text/plain": [
107
+ "Downloading: 0%| | 0.00/304M [00:00<?, ?B/s]"
108
+ ]
109
+ },
110
+ "metadata": {},
111
+ "output_type": "display_data"
112
+ },
113
+ {
114
+ "name": "stderr",
115
+ "output_type": "stream",
116
+ "text": [
117
+ "INFO:absl:Starting the local TPU driver.\n",
118
+ "INFO:absl:Unable to initialize backend 'tpu_driver': Not found: Unable to find driver in registry given worker: local://\n",
119
+ "INFO:absl:Unable to initialize backend 'gpu': Not found: Could not find registered platform with name: \"cuda\". Available platform names are: Interpreter Host TPU\n"
120
+ ]
121
+ },
122
+ {
123
+ "name": "stdout",
124
+ "output_type": "stream",
125
+ "text": [
126
+ "Working with z of shape (1, 256, 16, 16) = 65536 dimensions.\n"
127
+ ]
128
+ }
129
+ ],
130
+ "source": [
131
+ "model = VQModel.from_pretrained(\"flax-community/vqgan_f16_16384\")"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "markdown",
136
+ "id": "c7c4c1e6",
137
+ "metadata": {},
138
+ "source": [
139
+ "## Dataset"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "markdown",
144
+ "id": "7014a7ce",
145
+ "metadata": {},
146
+ "source": [
147
+ "We use Luke Melas-Kyriazi's `dataset.py` which reads image paths and captions from a tsv file that contains both. We only need the images for encoding."
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "code",
152
+ "execution_count": 4,
153
+ "id": "85832702",
154
+ "metadata": {},
155
+ "outputs": [],
156
+ "source": [
157
+ "from dalle_mini.dataset import *"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 5,
163
+ "id": "81b19eca",
164
+ "metadata": {},
165
+ "outputs": [],
166
+ "source": [
167
+ "cc12m_images = '/data/CC12M/images'\n",
168
+ "cc12m_list = '/data/CC12M/images-list-clean.tsv'\n",
169
+ "# cc12m_list = '/data/CC12M/images-10000.tsv'\n",
170
+ "cc12m_output = '/data/CC12M/images-encoded.tsv'"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 6,
176
+ "id": "fecc9a00",
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "image_size = 256\n",
181
+ "def image_transform(image):\n",
182
+ " s = min(image.size)\n",
183
+ " r = image_size / s\n",
184
+ " s = (round(r * image.size[1]), round(r * image.size[0]))\n",
185
+ " image = TF.resize(image, s, interpolation=InterpolationMode.LANCZOS)\n",
186
+ " image = TF.center_crop(image, output_size = 2 * [image_size])\n",
187
+ " image = torch.unsqueeze(T.ToTensor()(image), 0)\n",
188
+ " image = image.permute(0, 2, 3, 1).numpy()\n",
189
+ " return image"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 7,
195
+ "id": "4ce2211f",
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "dataset = CaptionDataset(\n",
200
+ " images_root=cc12m_images,\n",
201
+ " captions_path=cc12m_list,\n",
202
+ " image_transform=image_transform,\n",
203
+ " image_transform_type='torchvision',\n",
204
+ " include_captions=False\n",
205
+ ")"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "execution_count": 8,
211
+ "id": "cc922704",
212
+ "metadata": {},
213
+ "outputs": [
214
+ {
215
+ "data": {
216
+ "text/plain": [
217
+ "8592141"
218
+ ]
219
+ },
220
+ "execution_count": 8,
221
+ "metadata": {},
222
+ "output_type": "execute_result"
223
+ }
224
+ ],
225
+ "source": [
226
+ "len(dataset)"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "markdown",
231
+ "id": "62ad01c3",
232
+ "metadata": {},
233
+ "source": [
234
+ "## Encoding"
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "execution_count": 9,
240
+ "id": "88f36d0b",
241
+ "metadata": {},
242
+ "outputs": [],
243
+ "source": [
244
+ "def encode(model, batch):\n",
245
+ "# print(\"jitting encode function\")\n",
246
+ " _, indices = model.encode(batch)\n",
247
+ " return indices"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "execution_count": 10,
253
+ "id": "1f35f0cb",
254
+ "metadata": {},
255
+ "outputs": [],
256
+ "source": [
257
+ "def superbatch_generator(dataloader, num_tpus):\n",
258
+ " iter_loader = iter(dataloader)\n",
259
+ " for batch in iter_loader:\n",
260
+ " superbatch = [batch.squeeze(1)]\n",
261
+ " try:\n",
262
+ " for b in range(num_tpus-1):\n",
263
+ " batch = next(iter_loader)\n",
264
+ " if batch is None:\n",
265
+ " break\n",
266
+ " # Skip incomplete last batch\n",
267
+ " if batch.shape[0] == dataloader.batch_size:\n",
268
+ " superbatch.append(batch.squeeze(1))\n",
269
+ " except StopIteration:\n",
270
+ " pass\n",
271
+ " superbatch = torch.stack(superbatch, axis=0)\n",
272
+ " yield superbatch"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "code",
277
+ "execution_count": 11,
278
+ "id": "2210705b",
279
+ "metadata": {},
280
+ "outputs": [],
281
+ "source": [
282
+ "import os\n",
283
+ "\n",
284
+ "def encode_captioned_dataset(dataset, output_tsv, batch_size=32, num_workers=16):\n",
285
+ " if os.path.isfile(output_tsv):\n",
286
+ " print(f\"Destination file {output_tsv} already exists, please move away.\")\n",
287
+ " return\n",
288
+ " \n",
289
+ " num_tpus = 8 \n",
290
+ " dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)\n",
291
+ " superbatches = superbatch_generator(dataloader, num_tpus=num_tpus)\n",
292
+ " \n",
293
+ " p_encoder = pmap(lambda batch: encode(model, batch))\n",
294
+ "\n",
295
+ " # We save each superbatch to avoid reallocation of buffers as we process them.\n",
296
+ " # We keep the file open to prevent excessive file seeks.\n",
297
+ " with open(output_tsv, \"w\") as file:\n",
298
+ " iterations = len(dataset) // (batch_size * num_tpus)\n",
299
+ " for n in tqdm(range(iterations)):\n",
300
+ " superbatch = next(superbatches)\n",
301
+ " encoded = p_encoder(superbatch.numpy())\n",
302
+ " encoded = encoded.reshape(-1, encoded.shape[-1])\n",
303
+ "\n",
304
+ " # Extract fields from the dataset internal `captions` property, and save to disk\n",
305
+ " start_index = n * batch_size * num_tpus\n",
306
+ " end_index = (n+1) * batch_size * num_tpus\n",
307
+ " paths = dataset.captions[\"image_file\"][start_index:end_index].values\n",
308
+ " captions = dataset.captions[\"caption\"][start_index:end_index].values\n",
309
+ " encoded_as_string = list(map(lambda item: np.array2string(item, separator=',', max_line_width=50000, formatter={'int':lambda x: str(x)}), encoded))\n",
310
+ " batch_df = pd.DataFrame.from_dict({\"image_file\": paths, \"caption\": captions, \"encoding\": encoded_as_string})\n",
311
+ " batch_df.to_csv(file, sep='\\t', header=(n==0), index=None)\n",
312
+ " "
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": null,
318
+ "id": "7704863d",
319
+ "metadata": {},
320
+ "outputs": [
321
+ {
322
+ "name": "stderr",
323
+ "output_type": "stream",
324
+ "text": [
325
+ " 4%|██▋ | 621/16781 [07:09<3:02:46, 1.47it/s]"
326
+ ]
327
+ }
328
+ ],
329
+ "source": [
330
+ "encode_captioned_dataset(dataset, cc12m_output, batch_size=64, num_workers=16)"
331
+ ]
332
+ },
333
+ {
334
+ "cell_type": "markdown",
335
+ "id": "8953dd84",
336
+ "metadata": {},
337
+ "source": [
338
+ "----"
339
+ ]
340
+ }
341
+ ],
342
+ "metadata": {
343
+ "kernelspec": {
344
+ "display_name": "Python 3 (ipykernel)",
345
+ "language": "python",
346
+ "name": "python3"
347
+ },
348
+ "language_info": {
349
+ "codemirror_mode": {
350
+ "name": "ipython",
351
+ "version": 3
352
+ },
353
+ "file_extension": ".py",
354
+ "mimetype": "text/x-python",
355
+ "name": "python",
356
+ "nbconvert_exporter": "python",
357
+ "pygments_lexer": "ipython3",
358
+ "version": "3.8.10"
359
+ }
360
+ },
361
+ "nbformat": 4,
362
+ "nbformat_minor": 5
363
+ }
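The superbatch_generator above stacks one batch per TPU core so that pmap can run the encode step on all devices in parallel. A standalone sketch of that pattern with dummy data and a trivial per-device function (device count taken from jax.local_device_count(), so it also runs on CPU with a single device):

    import numpy as np
    import jax
    from jax import pmap

    num_devices = jax.local_device_count()   # 8 on a TPU v3-8, 1 on CPU
    batch_size = 4

    # Superbatch shaped (devices, per-device batch, H, W, C), like the stacked batches in the notebook.
    superbatch = np.zeros((num_devices, batch_size, 256, 256, 3), dtype=np.float32)

    # pmap maps the function over the leading device axis; here a stand-in for the VQGAN encode call.
    p_fn = pmap(lambda batch: batch.mean(axis=(1, 2, 3)))
    out = p_fn(superbatch)
    print(out.shape)                         # (num_devices, batch_size)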
encoding/vqgan-jax-encoding.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
model/data-pipeline.ipynb ADDED
@@ -0,0 +1,385 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "bf8fb38a",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Data Pipeline"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "id": "9b83dcb9",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "from dataclasses import dataclass, field\n",
19
+ "from pathlib import Path\n",
20
+ "\n",
21
+ "import datasets\n",
22
+ "from datasets import Dataset, load_dataset\n",
23
+ "import numpy as np\n",
24
+ "\n",
25
+ "from transformers import BartTokenizer\n",
26
+ "\n",
27
+ "from tqdm import tqdm\n",
28
+ "\n",
29
+ "import jax\n",
30
+ "import jax.numpy as jnp\n",
31
+ "\n",
32
+ "from flax.training.common_utils import shard"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "markdown",
37
+ "id": "a661a89e",
38
+ "metadata": {},
39
+ "source": [
40
+ "File containing image paths, captions and VQGAN-encoded indices."
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 2,
46
+ "id": "0e84e889",
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "datafile = '/data/CC12M/images-encoded-10000.tsv' # 9999 encoded images from CC12M"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "markdown",
55
+ "id": "7fdc640b",
56
+ "metadata": {},
57
+ "source": [
58
+ "TODO: generate train/test splits if necessary."
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 3,
64
+ "id": "cc6789b4",
65
+ "metadata": {},
66
+ "outputs": [
67
+ {
68
+ "name": "stderr",
69
+ "output_type": "stream",
70
+ "text": [
71
+ "Using custom data configuration default-91833df78e844785\n",
72
+ "Reusing dataset csv (/home/pedro/.cache/huggingface/datasets/csv/default-91833df78e844785/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23)\n"
73
+ ]
74
+ }
75
+ ],
76
+ "source": [
77
+ "dataset = load_dataset('csv', delimiter='\\t', data_files=[datafile])"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": 4,
83
+ "id": "f3ed4919",
84
+ "metadata": {},
85
+ "outputs": [
86
+ {
87
+ "data": {
88
+ "text/plain": [
89
+ "DatasetDict({\n",
90
+ " train: Dataset({\n",
91
+ " features: ['image_file', 'caption', 'encoding'],\n",
92
+ " num_rows: 9999\n",
93
+ " })\n",
94
+ "})"
95
+ ]
96
+ },
97
+ "execution_count": 4,
98
+ "metadata": {},
99
+ "output_type": "execute_result"
100
+ }
101
+ ],
102
+ "source": [
103
+ "dataset"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 5,
109
+ "id": "a70c7354",
110
+ "metadata": {},
111
+ "outputs": [
112
+ {
113
+ "data": {
114
+ "text/plain": [
115
+ "Dataset({\n",
116
+ " features: ['image_file', 'caption', 'encoding'],\n",
117
+ " num_rows: 9999\n",
118
+ "})"
119
+ ]
120
+ },
121
+ "execution_count": 5,
122
+ "metadata": {},
123
+ "output_type": "execute_result"
124
+ }
125
+ ],
126
+ "source": [
127
+ "dataset = dataset[\"train\"]\n",
128
+ "dataset"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "markdown",
133
+ "id": "a73454cf",
134
+ "metadata": {},
135
+ "source": [
136
+ "We don't really need the `image_file` field for training. We'll drop it during pre-processing because we won't be able to numericalize it to a `jnp.array`, which would be required in JAX."
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "markdown",
141
+ "id": "7c0fa992",
142
+ "metadata": {},
143
+ "source": [
144
+ "## Preprocessing"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "markdown",
149
+ "id": "a0e36582",
150
+ "metadata": {},
151
+ "source": [
152
+ "The `encoding` field contains a string representation of the encoded indices. We'll convert them to numbers. We also need to tokenize the captions."
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 6,
158
+ "id": "d46f6ac5",
159
+ "metadata": {},
160
+ "outputs": [],
161
+ "source": [
162
+ "# Setting padding=\"max_length\" as we need fixed length inputs for jitted functions\n",
163
+ "max_length = 256 # Read from data_args.max_source_length\n",
164
+ "tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')\n",
165
+ "image_bos = 16384 # Max token is 16383 in our VQGAN configuration"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": 7,
171
+ "id": "4cac6643",
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "def preprocess_function(examples):\n",
176
+ " inputs = examples[\"caption\"]\n",
177
+ "# inputs = [prefix + inp for inp in inputs] # Do we need this?\n",
178
+ " model_inputs = tokenizer(\n",
179
+ " inputs, max_length=max_length, padding=\"max_length\", truncation=True, return_tensors=\"np\"\n",
180
+ " )\n",
181
+ "\n",
182
+ " model_inputs[\"labels\"] = [[image_bos] + eval(indices) for indices in examples['encoding']]\n",
183
+ "\n",
184
+ " return model_inputs"
185
+ ]
186
+ },
187
+ {
188
+ "cell_type": "code",
189
+ "execution_count": 8,
190
+ "id": "e6a4cb91",
191
+ "metadata": {},
192
+ "outputs": [],
193
+ "source": [
194
+ "num_workers = 48 # We have 96 processors in the TPU\n",
195
+ "column_names = dataset.column_names\n",
196
+ "input_dataset = dataset.map(preprocess_function,\n",
197
+ " remove_columns=column_names,\n",
198
+ " batched=True,\n",
199
+ " num_proc=48\n",
200
+ ")"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": 9,
206
+ "id": "a9b1b467",
207
+ "metadata": {},
208
+ "outputs": [],
209
+ "source": [
210
+ "def data_loader(rng: jax.random.PRNGKey, dataset: Dataset, batch_size: int, shuffle: bool = False):\n",
211
+ " \"\"\"\n",
212
+ " Returns batches of size `batch_size` from truncated `dataset`, sharded over all local devices.\n",
213
+ " Shuffle batches if `shuffle` is `True`.\n",
214
+ " \"\"\"\n",
215
+ " steps_per_epoch = len(dataset) // batch_size\n",
216
+ "\n",
217
+ " if shuffle:\n",
218
+ " batch_idx = jax.random.permutation(rng, len(dataset))\n",
219
+ " else:\n",
220
+ " batch_idx = jnp.arange(len(dataset))\n",
221
+ "\n",
222
+ " batch_idx = batch_idx[: steps_per_epoch * batch_size] # Skip incomplete batch.\n",
223
+ " batch_idx = batch_idx.reshape((steps_per_epoch, batch_size))\n",
224
+ "\n",
225
+ " for idx in batch_idx:\n",
226
+ " batch = dataset[idx] \n",
227
+ " batch = {k: jnp.array(v) for k, v in batch.items()}\n",
228
+ " batch = shard(batch)\n",
229
+ " yield batch"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "code",
234
+ "execution_count": 10,
235
+ "id": "0a628505",
236
+ "metadata": {},
237
+ "outputs": [
238
+ {
239
+ "name": "stderr",
240
+ "output_type": "stream",
241
+ "text": [
242
+ "INFO:absl:Starting the local TPU driver.\n",
243
+ "INFO:absl:Unable to initialize backend 'tpu_driver': Not found: Unable to find driver in registry given worker: local://\n",
244
+ "INFO:absl:Unable to initialize backend 'gpu': Not found: Could not find registered platform with name: \"cuda\". Available platform names are: Host TPU Interpreter\n"
245
+ ]
246
+ }
247
+ ],
248
+ "source": [
249
+ "rng = jax.random.PRNGKey(23) # Use training_args.seed\n",
250
+ "batch_size = 64 # Per device\n",
251
+ "super_batch_size = batch_size * jax.device_count()"
252
+ ]
253
+ },
254
+ {
255
+ "cell_type": "code",
256
+ "execution_count": 11,
257
+ "id": "b3a5ce7d",
258
+ "metadata": {},
259
+ "outputs": [],
260
+ "source": [
261
+ "loader = data_loader(rng, input_dataset, batch_size=super_batch_size)"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "code",
266
+ "execution_count": 12,
267
+ "id": "67aa8f9c",
268
+ "metadata": {},
269
+ "outputs": [],
270
+ "source": [
271
+ "superbatch = next(iter(loader))"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": 13,
277
+ "id": "7cd99402",
278
+ "metadata": {},
279
+ "outputs": [
280
+ {
281
+ "data": {
282
+ "text/plain": [
283
+ "dict_keys(['attention_mask', 'input_ids', 'labels'])"
284
+ ]
285
+ },
286
+ "execution_count": 13,
287
+ "metadata": {},
288
+ "output_type": "execute_result"
289
+ }
290
+ ],
291
+ "source": [
292
+ "superbatch.keys()"
293
+ ]
294
+ },
295
+ {
296
+ "cell_type": "code",
297
+ "execution_count": 14,
298
+ "id": "652a4a9e",
299
+ "metadata": {},
300
+ "outputs": [
301
+ {
302
+ "data": {
303
+ "text/plain": [
304
+ "8"
305
+ ]
306
+ },
307
+ "execution_count": 14,
308
+ "metadata": {},
309
+ "output_type": "execute_result"
310
+ }
311
+ ],
312
+ "source": [
313
+ "len(superbatch[\"labels\"])"
314
+ ]
315
+ },
316
+ {
317
+ "cell_type": "code",
318
+ "execution_count": 15,
319
+ "id": "de7de4e8",
320
+ "metadata": {},
321
+ "outputs": [
322
+ {
323
+ "data": {
324
+ "text/plain": [
325
+ "(8, 64, 257)"
326
+ ]
327
+ },
328
+ "execution_count": 15,
329
+ "metadata": {},
330
+ "output_type": "execute_result"
331
+ }
332
+ ],
333
+ "source": [
334
+ "superbatch[\"labels\"].shape"
335
+ ]
336
+ },
337
+ {
338
+ "cell_type": "markdown",
339
+ "id": "6800153b",
340
+ "metadata": {},
341
+ "source": [
342
+ "Any image sequence should begin with `image_bos`:"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": 16,
348
+ "id": "cfe23a71",
349
+ "metadata": {},
350
+ "outputs": [],
351
+ "source": [
352
+ "assert superbatch[\"labels\"][1][5][0].item() == image_bos"
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "code",
357
+ "execution_count": null,
358
+ "id": "0fb899b4",
359
+ "metadata": {},
360
+ "outputs": [],
361
+ "source": []
362
+ }
363
+ ],
364
+ "metadata": {
365
+ "kernelspec": {
366
+ "display_name": "Python 3 (ipykernel)",
367
+ "language": "python",
368
+ "name": "python3"
369
+ },
370
+ "language_info": {
371
+ "codemirror_mode": {
372
+ "name": "ipython",
373
+ "version": 3
374
+ },
375
+ "file_extension": ".py",
376
+ "mimetype": "text/x-python",
377
+ "name": "python",
378
+ "nbconvert_exporter": "python",
379
+ "pygments_lexer": "ipython3",
380
+ "version": "3.8.10"
381
+ }
382
+ },
383
+ "nbformat": 4,
384
+ "nbformat_minor": 5
385
+ }
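For reference, the (8, 64, 257) shape of superbatch["labels"] above comes from flax's shard, which splits the leading batch dimension of every array into (local devices, per-device batch). A small sketch with dummy arrays, assuming the same 257-token labels (image_bos plus 256 VQGAN indices):

    import numpy as np
    import jax
    from flax.training.common_utils import shard

    num_devices = jax.local_device_count()   # 8 on the TPU used here, 1 on CPU
    per_device = 4
    seq_len = 257                            # image_bos + 256 VQGAN indices

    batch = {
        "input_ids": np.zeros((num_devices * per_device, 256), dtype=np.int32),
        "labels": np.zeros((num_devices * per_device, seq_len), dtype=np.int32),
    }
    sharded = shard(batch)
    print(sharded["labels"].shape)           # (num_devices, per_device, 257)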