liuhaogeng committed on
Commit
b0b3b00
1 Parent(s): 6906069

first commit

README.md CHANGED
@@ -26,7 +26,7 @@ import torch
 from transformers import AutoModelForCausalLM, AutoProcessor
 from open_flamingo.eval.models.cruise_model import EvalModel
 
-processor = AutoProcessor.from_pretrained("/mnt/bn/bohanzhainas1/haogeng/infimm-HD", trust_remote_code=True)
+processor = AutoProcessor.from_pretrained("infimm/infimm-hd", trust_remote_code=True)
 
 prompts = [
     {
@@ -40,7 +40,7 @@ prompts = [
 inputs = processor(prompts)
 # use bf16 and gpu 0
 model = AutoModelForCausalLM.from_pretrained(
-    "/opt/tiger/mlm-tool/infimm-HD",
+    "infimm/infimm-hd",
     local_files_only=True,
     torch_dtype=torch.bfloat16,
     trust_remote_code=True,
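
To round out the README snippet above, a hedged continuation is sketched below. It assumes the processor output is a dict of tensors whose keys line up with the model's generate() arguments (the underlying Flamingo.generate() in flamingo.py takes vision_x, lang_x and attention_mask plus standard Hugging Face generation kwargs), and that the processor exposes a tokenizer for decoding; neither detail is confirmed by this commit.

import torch

model = model.eval().to("cuda:0")
inputs = {k: v.to("cuda:0") if torch.is_tensor(v) else v for k, v in inputs.items()}

with torch.no_grad():
    generated_ids = model.generate(
        **inputs,            # assumed to map onto vision_x / lang_x / attention_mask
        max_new_tokens=64,   # ordinary generation kwargs are forwarded to the language model
        num_beams=1,
    )
print(processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True))  # assumes a .tokenizer attribute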
added_tokens.json ADDED
@@ -0,0 +1,4 @@
+{
+  "<image>": 32001,
+  "<|endofchunk|>": 32000
+}
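
added_tokens.json registers the two special tokens the model depends on: <image> (id 32001) marks where visual features are injected and <|endofchunk|> (id 32000) closes an image-text chunk, which is why config.json below reports vocab_size 32002. As a hedged illustration of how such tokens end up with those ids, the sketch below adds them to the base Vicuna tokenizer named in config.json; the repository's own processor presumably does the equivalent internally.

from transformers import AutoTokenizer

# Illustrative only: the bundled processor is expected to handle special tokens itself.
tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-13b-v1.5")  # base LLM from config.json (vocab size 32000)
tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "<image>"]})
print(tokenizer.convert_tokens_to_ids(["<|endofchunk|>", "<image>"]))  # [32000, 32001], matching this file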
config.json ADDED
@@ -0,0 +1,62 @@
+{
+  "_name_or_path": "./",
+  "architectures": [
+    "InfiMMHDModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_infimm_hd.InfiMMHDConfig",
+    "AutoModelForCausalLM": "modeling_infimm_hd.InfiMMHDModel"
+  },
+  "model_type": "infimm-hd",
+  "seq_length": 4096,
+  "tokenizer_type": "LlamaTokenizer",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.35.2",
+  "use_cache": true,
+  "use_flash_attn": false,
+  "cross_attn_every_n_layers": 4,
+  "use_grad_checkpoint": false,
+  "freeze_llm": true,
+  "image_token_id": 32001,
+  "eoc_token_id": 32000,
+  "visual": {
+    "image_size": 448,
+    "layers": 64,
+    "width": 1792,
+    "head_width": 112,
+    "patch_size": 14,
+    "mlp_ratio": 8.571428571428571,
+    "eva_model_name": "eva-clip-4b-14-x",
+    "drop_path_rate": 0.0,
+    "xattn": false,
+    "postnorm": true,
+    "fusedLN": false,
+    "embed_dim": 1024
+  },
+  "language": {
+    "_name_or_path": "lmsys/vicuna-13b-v1.5",
+    "architectures": [
+      "LlamaForCausalLM"
+    ],
+    "bos_token_id": 1,
+    "eos_token_id": 2,
+    "hidden_act": "silu",
+    "hidden_size": 5120,
+    "initializer_range": 0.02,
+    "intermediate_size": 13824,
+    "max_position_embeddings": 4096,
+    "model_type": "llama",
+    "num_attention_heads": 40,
+    "num_hidden_layers": 40,
+    "num_key_value_heads": 40,
+    "pad_token_id": 0,
+    "pretraining_tp": 1,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": null,
+    "tie_word_embeddings": false,
+    "torch_dtype": "bfloat16",
+    "transformers_version": "4.32.0.dev0",
+    "use_cache": true,
+    "vocab_size": 32002
+  }
+}
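
Since config.json wires the model to its custom classes through auto_map, it can be inspected with the standard AutoConfig API as long as trust_remote_code=True is passed. A short sketch, assuming the checkpoint is available under the hub id used in the README; the nested "visual" and "language" blocks come back as plain dictionaries on the config object.

from transformers import AutoConfig

config = AutoConfig.from_pretrained("infimm/infimm-hd", trust_remote_code=True)
print(config.model_type)                  # "infimm-hd"
print(config.cross_attn_every_n_layers)   # 4: gated cross-attention after every 4th decoder layer
print(config.visual["image_size"], config.visual["patch_size"])     # 448, 14 -> a 32x32 grid of patch tokens
print(config.language["hidden_size"], config.language["num_hidden_layers"])  # 5120, 40 (Vicuna-13B)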
configuration_infimm_hd.py ADDED
@@ -0,0 +1,42 @@
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from transformers import PretrainedConfig
+
+
+class InfiMMHDConfig(PretrainedConfig):
+    model_type = "infimmhd"
+
+    def __init__(
+        self,
+        model_type="infimm-hd",
+        seq_length=1024,
+        tokenizer_type="LlamaTokenizer",
+        torch_dtype="bfloat16",
+        transformers_version="4.28.2",
+        use_cache=True,
+        use_flash_attn=False,
+        cross_attn_every_n_layers=4,
+        use_grad_checkpoint=False,
+        freeze_llm=True,
+        visual=None,
+        language=None,
+        image_token_id=None,
+        eoc_token_id=None,
+        **kwargs,
+    ):
+        self.model_type = model_type
+        self.seq_length = seq_length
+        self.tokenizer_type = tokenizer_type
+        self.torch_dtype = torch_dtype
+        self.transformers_version = transformers_version
+        self.use_cache = use_cache
+        self.use_flash_attn = use_flash_attn
+        self.cross_attn_every_n_layers = cross_attn_every_n_layers
+        self.use_grad_checkpoint = use_grad_checkpoint
+        self.freeze_llm = freeze_llm
+        self.visual = visual
+        self.language = language
+        self.image_token_id = image_token_id
+        self.eoc_token_id = eoc_token_id
+        super().__init__(**kwargs)
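
configuration_infimm_hd.py is a thin PretrainedConfig subclass: each argument is stored verbatim and all serialization comes from the base class. A minimal round-trip sketch, assuming the file is importable from the working directory; the field values are copied from config.json above.

from configuration_infimm_hd import InfiMMHDConfig

config = InfiMMHDConfig(
    seq_length=4096,
    cross_attn_every_n_layers=4,
    image_token_id=32001,
    eoc_token_id=32000,
    visual={"image_size": 448, "patch_size": 14, "width": 1792, "layers": 64},
    language={"_name_or_path": "lmsys/vicuna-13b-v1.5", "hidden_size": 5120},
)
config.save_pretrained("./infimm-hd-config")   # PretrainedConfig writes config.json
print(InfiMMHDConfig.from_pretrained("./infimm-hd-config").seq_length)  # 4096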
eva_vit_model.py ADDED
@@ -0,0 +1,837 @@
1
+ # --------------------------------------------------------
2
+ # Adapted from https://github.com/baaivision/EVA/blob/master/EVA-CLIP/rei/eva_clip/eva_vit_model.py
3
+ # --------------------------------------------------------
4
+ import math
5
+ import os
6
+ import tempfile
7
+ from dataclasses import dataclass
8
+ from functools import partial
9
+ from typing import Optional, Tuple, Union
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ import yaml
14
+ from open_clip.transform import image_transform
15
+ from timm.models.layers import drop_path, to_2tuple, trunc_normal_
16
+
17
+ from open_flamingo.src.util.download_utils import download_pretrained_weights_from_hdfs
18
+ from open_flamingo.src.visual_encoder.rope import VisionRotaryEmbeddingFast
19
+ from open_flamingo.src.visual_encoder.transformer import Attention, PatchDropout
20
+ from open_flamingo.src.xperf_training import FTFlashAttention, FTLayerNorm, FTLinear
21
+
22
+ if os.getenv("ENV_TYPE") == "deepspeed":
23
+ try:
24
+ from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint
25
+ except ImportError:
26
+ from torch.utils.checkpoint import checkpoint
27
+ else:
28
+ from torch.utils.checkpoint import checkpoint
29
+
30
+ from .utils import resize_eva_pos_embed
31
+
32
+ class DropPath(nn.Module):
33
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
34
+
35
+ def __init__(self, drop_prob=None):
36
+ super(DropPath, self).__init__()
37
+ self.drop_prob = drop_prob
38
+
39
+ def forward(self, x):
40
+ return drop_path(x, self.drop_prob, self.training)
41
+
42
+ def extra_repr(self) -> str:
43
+ return "p={}".format(self.drop_prob)
44
+
45
+
46
+ class Mlp(nn.Module):
47
+ def __init__(
48
+ self,
49
+ in_features,
50
+ hidden_features=None,
51
+ out_features=None,
52
+ act_layer=nn.GELU,
53
+ norm_layer=nn.LayerNorm,
54
+ drop=0.0,
55
+ subln=False,
56
+ ):
57
+ super().__init__()
58
+ out_features = out_features or in_features
59
+ hidden_features = hidden_features or in_features
60
+
61
+
62
+ use_ft_linear = False
63
+
64
+ if use_ft_linear:
65
+ self.fc1 = FTLinear(in_features, hidden_features)
66
+ else:
67
+ self.fc1 = nn.Linear(in_features, hidden_features)
68
+ self.act = act_layer()
69
+
70
+ self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity()
71
+
72
+ if use_ft_linear:
73
+ self.fc2 = FTLinear(hidden_features, out_features)
74
+ else:
75
+ self.fc2 = nn.Linear(hidden_features, out_features)
76
+ self.drop = nn.Dropout(drop)
77
+
78
+ def forward(self, x):
79
+ x = self.fc1(x)
80
+ x = self.act(x)
81
+ # x = self.drop(x)
82
+ # commented out to match the original BERT implementation
83
+ x = self.ffn_ln(x)
84
+
85
+ x = self.fc2(x)
86
+ x = self.drop(x)
87
+ return x
88
+
89
+
90
+ class SwiGLU(nn.Module):
91
+ def __init__(
92
+ self,
93
+ in_features,
94
+ hidden_features=None,
95
+ out_features=None,
96
+ act_layer=nn.SiLU,
97
+ drop=0.0,
98
+ norm_layer=nn.LayerNorm,
99
+ subln=False,
100
+ ):
101
+ super().__init__()
102
+ out_features = out_features or in_features
103
+ hidden_features = hidden_features or in_features
104
+
105
+ use_ft_linear = False
106
+
107
+ if use_ft_linear:
108
+ self.w1 = FTLinear(in_features, hidden_features)
109
+ self.w2 = FTLinear(in_features, hidden_features)
110
+ else:
111
+ self.w1 = nn.Linear(in_features, hidden_features)
112
+ self.w2 = nn.Linear(in_features, hidden_features)
113
+
114
+ self.act = act_layer()
115
+ self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity()
116
+
117
+ if use_ft_linear:
118
+ self.w3 = FTLinear(hidden_features, out_features)
119
+ else:
120
+ self.w3 = nn.Linear(hidden_features, out_features)
121
+
122
+ self.drop = nn.Dropout(drop)
123
+
124
+ def forward(self, x):
125
+ x1 = self.w1(x)
126
+ x2 = self.w2(x)
127
+ hidden = self.act(x1) * x2
128
+ x = self.ffn_ln(hidden)
129
+ x = self.w3(x)
130
+ x = self.drop(x)
131
+ return x
132
+
133
+
134
+ class Attention(nn.Module):
135
+ def __init__(
136
+ self,
137
+ dim,
138
+ num_heads=8,
139
+ qkv_bias=False,
140
+ qk_scale=None,
141
+ attn_drop=0.0,
142
+ proj_drop=0.0,
143
+ window_size=None,
144
+ attn_head_dim=None,
145
+ xattn=False,
146
+ rope=None,
147
+ subln=False,
148
+ norm_layer=nn.LayerNorm,
149
+ ):
150
+ super().__init__()
151
+ self.num_heads = num_heads
152
+ head_dim = dim // num_heads
153
+ if attn_head_dim is not None:
154
+ head_dim = attn_head_dim
155
+ all_head_dim = head_dim * self.num_heads
156
+ self.scale = qk_scale or head_dim**-0.5
157
+
158
+
159
+ self.use_ft_flash_attention = False
160
+
161
+ self.subln = subln
162
+ if self.subln:
163
+ self.q_proj = nn.Linear(dim, all_head_dim, bias=False)
164
+ self.k_proj = nn.Linear(dim, all_head_dim, bias=False)
165
+ self.v_proj = nn.Linear(dim, all_head_dim, bias=False)
166
+
167
+ else:
168
+ self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
169
+
170
+ if qkv_bias:
171
+ self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
172
+ self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
173
+ else:
174
+ self.q_bias = None
175
+ self.v_bias = None
176
+
177
+ if window_size:
178
+ self.window_size = window_size
179
+ self.num_relative_distance = (2 * window_size[0] - 1) * (
180
+ 2 * window_size[1] - 1
181
+ ) + 3
182
+ self.relative_position_bias_table = nn.Parameter(
183
+ torch.zeros(self.num_relative_distance, num_heads)
184
+ ) # 2*Wh-1 * 2*Ww-1, nH
185
+ # cls to token & token 2 cls & cls to cls
186
+
187
+ # get pair-wise relative position index for each token inside the window
188
+ coords_h = torch.arange(window_size[0])
189
+ coords_w = torch.arange(window_size[1])
190
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
191
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
192
+ relative_coords = (
193
+ coords_flatten[:, :, None] - coords_flatten[:, None, :]
194
+ ) # 2, Wh*Ww, Wh*Ww
195
+ relative_coords = relative_coords.permute(
196
+ 1, 2, 0
197
+ ).contiguous() # Wh*Ww, Wh*Ww, 2
198
+ relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
199
+ relative_coords[:, :, 1] += window_size[1] - 1
200
+ relative_coords[:, :, 0] *= 2 * window_size[1] - 1
201
+ relative_position_index = torch.zeros(
202
+ size=(window_size[0] * window_size[1] + 1,) * 2,
203
+ dtype=relative_coords.dtype,
204
+ )
205
+ relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
206
+ relative_position_index[0, 0:] = self.num_relative_distance - 3
207
+ relative_position_index[0:, 0] = self.num_relative_distance - 2
208
+ relative_position_index[0, 0] = self.num_relative_distance - 1
209
+
210
+ self.register_buffer("relative_position_index", relative_position_index)
211
+ else:
212
+ self.window_size = None
213
+ self.relative_position_bias_table = None
214
+ self.relative_position_index = None
215
+
216
+ self.attn_drop = nn.Dropout(attn_drop)
217
+ self.inner_attn_ln = norm_layer(all_head_dim) if subln else nn.Identity()
218
+ # self.proj = nn.Linear(all_head_dim, all_head_dim)
219
+ self.proj = nn.Linear(all_head_dim, dim)
220
+ self.proj_drop = nn.Dropout(proj_drop)
221
+ self.xattn = xattn
222
+ self.xattn_drop = attn_drop
223
+
224
+ if self.use_ft_flash_attention:
225
+ assert FTFlashAttention is not None
226
+ self.ft_flash_attn = FTFlashAttention()
227
+
228
+ self.rope = rope
229
+
230
+ def forward(self, x, rel_pos_bias=None, attn_mask=None):
231
+ B, N, C = x.shape
232
+ if self.subln:
233
+ q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias)
234
+ k = F.linear(input=x, weight=self.k_proj.weight, bias=None)
235
+ v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias)
236
+
237
+ q = q.reshape(B, N, self.num_heads, -1).permute(
238
+ 0, 2, 1, 3
239
+ ) # B, num_heads, N, C
240
+ k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
241
+ v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
242
+ else:
243
+ qkv_bias = None
244
+ if self.q_bias is not None:
245
+ qkv_bias = torch.cat(
246
+ (
247
+ self.q_bias,
248
+ torch.zeros_like(self.v_bias, requires_grad=False),
249
+ self.v_bias,
250
+ )
251
+ )
252
+
253
+ qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
254
+ qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(
255
+ 2, 0, 3, 1, 4
256
+ ) # 3, B, num_heads, N, C
257
+ q, k, v = qkv[0], qkv[1], qkv[2]
258
+
259
+ if self.rope:
260
+ # slightly fast impl
261
+ q_t = q[:, :, 1:, :]
262
+ ro_q_t = self.rope(q_t)
263
+ q = torch.cat((q[:, :, :1, :], ro_q_t), -2).type_as(v)
264
+
265
+ k_t = k[:, :, 1:, :]
266
+ ro_k_t = self.rope(k_t)
267
+ k = torch.cat((k[:, :, :1, :], ro_k_t), -2).type_as(v)
268
+
269
+ if self.use_ft_flash_attention:
270
+ q = q.permute(0, 2, 1, 3).contiguous()
271
+ q = q.view(
272
+ q.shape[0], q.shape[1], -1
273
+ ) # B, num_heads, N, C -> B, N, num_heads, C
274
+ k = k.permute(0, 2, 1, 3).contiguous()
275
+ k = k.view(k.shape[0], k.shape[1], -1)
276
+ v = v.permute(0, 2, 1, 3).contiguous()
277
+ v = v.view(v.shape[0], v.shape[1], -1)
278
+ x = self.ft_flash_attn(
279
+ [q, k, v],
280
+ self.num_heads,
281
+ attn_mask=None,
282
+ causal=False,
283
+ attention_dropout=self.xattn_drop if self.training else 0.0,
284
+ softmax_scale=self.scale,
285
+ use_rmpad_attn=False,
286
+ )
287
+
288
+ x = self.inner_attn_ln(x)
289
+ x = self.proj(x)
290
+ x = self.proj_drop(x)
291
+
292
+ else:
293
+ q = q * self.scale
294
+ attn = q @ k.transpose(-2, -1)
295
+
296
+ if self.relative_position_bias_table is not None:
297
+ relative_position_bias = self.relative_position_bias_table[
298
+ self.relative_position_index.view(-1)
299
+ ].view(
300
+ self.window_size[0] * self.window_size[1] + 1,
301
+ self.window_size[0] * self.window_size[1] + 1,
302
+ -1,
303
+ ) # Wh*Ww,Wh*Ww,nH
304
+ relative_position_bias = relative_position_bias.permute(
305
+ 2, 0, 1
306
+ ).contiguous() # nH, Wh*Ww, Wh*Ww
307
+ attn = attn + relative_position_bias.unsqueeze(0).type_as(attn)
308
+
309
+ if rel_pos_bias is not None:
310
+ attn = attn + rel_pos_bias.type_as(attn)
311
+
312
+ if attn_mask is not None:
313
+ attn_mask = attn_mask.bool()
314
+ attn = attn.masked_fill(~attn_mask[:, None, None, :], float("-inf"))
315
+
316
+ attn = attn.softmax(dim=-1)
317
+ attn = self.attn_drop(attn)
318
+
319
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
320
+ x = self.inner_attn_ln(x)
321
+ x = self.proj(x)
322
+ x = self.proj_drop(x)
323
+ return x
324
+
325
+
326
+ class Block(nn.Module):
327
+ def __init__(
328
+ self,
329
+ dim,
330
+ num_heads,
331
+ mlp_ratio=4.0,
332
+ qkv_bias=False,
333
+ qk_scale=None,
334
+ drop=0.0,
335
+ attn_drop=0.0,
336
+ drop_path=0.0,
337
+ init_values=None,
338
+ act_layer=nn.GELU,
339
+ norm_layer=nn.LayerNorm,
340
+ window_size=None,
341
+ attn_head_dim=None,
342
+ xattn=False,
343
+ rope=None,
344
+ postnorm=False,
345
+ subln=False,
346
+ naiveswiglu=False,
347
+ ):
348
+ super().__init__()
349
+ self.norm1 = norm_layer(dim)
350
+ self.attn = Attention(
351
+ dim,
352
+ num_heads=num_heads,
353
+ qkv_bias=qkv_bias,
354
+ qk_scale=qk_scale,
355
+ attn_drop=attn_drop,
356
+ proj_drop=drop,
357
+ window_size=window_size,
358
+ attn_head_dim=attn_head_dim,
359
+ xattn=xattn,
360
+ rope=rope,
361
+ subln=subln,
362
+ norm_layer=norm_layer,
363
+ )
364
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
365
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
366
+ self.norm2 = norm_layer(dim)
367
+ mlp_hidden_dim = int(dim * mlp_ratio)
368
+
369
+ if naiveswiglu:
370
+ self.mlp = SwiGLU(
371
+ in_features=dim,
372
+ hidden_features=mlp_hidden_dim,
373
+ subln=subln,
374
+ norm_layer=norm_layer,
375
+ )
376
+ else:
377
+ self.mlp = Mlp(
378
+ in_features=dim,
379
+ hidden_features=mlp_hidden_dim,
380
+ act_layer=act_layer,
381
+ subln=subln,
382
+ drop=drop,
383
+ )
384
+
385
+ if init_values is not None and init_values > 0:
386
+ self.gamma_1 = nn.Parameter(
387
+ init_values * torch.ones((dim)), requires_grad=True
388
+ )
389
+ self.gamma_2 = nn.Parameter(
390
+ init_values * torch.ones((dim)), requires_grad=True
391
+ )
392
+ else:
393
+ self.gamma_1, self.gamma_2 = None, None
394
+
395
+ self.postnorm = postnorm
396
+
397
+ def forward(self, x, rel_pos_bias=None, attn_mask=None):
398
+ if self.gamma_1 is None:
399
+ if self.postnorm:
400
+ x = x + self.drop_path(
401
+ self.norm1(
402
+ self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)
403
+ )
404
+ )
405
+ x = x + self.drop_path(self.norm2(self.mlp(x)))
406
+ else:
407
+ x = x + self.drop_path(
408
+ self.attn(
409
+ self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask
410
+ )
411
+ )
412
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
413
+ else:
414
+ if self.postnorm:
415
+ x = x + self.drop_path(
416
+ self.gamma_1
417
+ * self.norm1(
418
+ self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)
419
+ )
420
+ )
421
+ x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x)))
422
+ else:
423
+ x = x + self.drop_path(
424
+ self.gamma_1
425
+ * self.attn(
426
+ self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask
427
+ )
428
+ )
429
+ x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
430
+ return x
431
+
432
+
433
+ class PatchEmbed(nn.Module):
434
+ """Image to Patch Embedding"""
435
+
436
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
437
+ super().__init__()
438
+ img_size = to_2tuple(img_size)
439
+ patch_size = to_2tuple(patch_size)
440
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
441
+ self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
442
+ self.img_size = img_size
443
+ self.patch_size = patch_size
444
+ self.num_patches = num_patches
445
+
446
+ self.proj = nn.Conv2d(
447
+ in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
448
+ )
449
+
450
+ def forward(self, x, **kwargs):
451
+ B, C, H, W = x.shape
452
+ # FIXME look at relaxing size constraints
453
+ assert H == self.img_size[0] and W == self.img_size[1], (
454
+ f"Input image size ({H}*{W}) doesn't match model"
455
+ f" ({self.img_size[0]}*{self.img_size[1]})."
456
+ )
457
+ x = self.proj(x).flatten(2).transpose(1, 2)
458
+ return x
459
+
460
+
461
+ class RelativePositionBias(nn.Module):
462
+ def __init__(self, window_size, num_heads):
463
+ super().__init__()
464
+ self.window_size = window_size
465
+ self.num_relative_distance = (2 * window_size[0] - 1) * (
466
+ 2 * window_size[1] - 1
467
+ ) + 3
468
+ self.relative_position_bias_table = nn.Parameter(
469
+ torch.zeros(self.num_relative_distance, num_heads)
470
+ ) # 2*Wh-1 * 2*Ww-1, nH
471
+ # cls to token & token 2 cls & cls to cls
472
+
473
+ # get pair-wise relative position index for each token inside the window
474
+ coords_h = torch.arange(window_size[0])
475
+ coords_w = torch.arange(window_size[1])
476
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
477
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
478
+ relative_coords = (
479
+ coords_flatten[:, :, None] - coords_flatten[:, None, :]
480
+ ) # 2, Wh*Ww, Wh*Ww
481
+ relative_coords = relative_coords.permute(
482
+ 1, 2, 0
483
+ ).contiguous() # Wh*Ww, Wh*Ww, 2
484
+ relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
485
+ relative_coords[:, :, 1] += window_size[1] - 1
486
+ relative_coords[:, :, 0] *= 2 * window_size[1] - 1
487
+ relative_position_index = torch.zeros(
488
+ size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype
489
+ )
490
+ relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
491
+ relative_position_index[0, 0:] = self.num_relative_distance - 3
492
+ relative_position_index[0:, 0] = self.num_relative_distance - 2
493
+ relative_position_index[0, 0] = self.num_relative_distance - 1
494
+
495
+ self.register_buffer("relative_position_index", relative_position_index)
496
+
497
+ def forward(self):
498
+ relative_position_bias = self.relative_position_bias_table[
499
+ self.relative_position_index.view(-1)
500
+ ].view(
501
+ self.window_size[0] * self.window_size[1] + 1,
502
+ self.window_size[0] * self.window_size[1] + 1,
503
+ -1,
504
+ ) # Wh*Ww,Wh*Ww,nH
505
+ return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
506
+
507
+
508
+ class EVAVisionTransformer(nn.Module):
509
+ """Vision Transformer with support for patch or hybrid CNN input stage"""
510
+
511
+ def __init__(
512
+ self,
513
+ img_size=224,
514
+ patch_size=16,
515
+ in_chans=3,
516
+ num_classes=1000,
517
+ embed_dim=768,
518
+ depth=12,
519
+ num_heads=12,
520
+ mlp_ratio=4.0,
521
+ qkv_bias=False,
522
+ qk_scale=None,
523
+ drop_rate=0.0,
524
+ attn_drop_rate=0.0,
525
+ drop_path_rate=0.0,
526
+ norm_layer=nn.LayerNorm,
527
+ init_values=None,
528
+ patch_dropout=0.0,
529
+ use_abs_pos_emb=True,
530
+ use_rel_pos_bias=False,
531
+ use_shared_rel_pos_bias=False,
532
+ rope=False,
533
+ use_mean_pooling=True,
534
+ init_scale=0.001,
535
+ grad_checkpointing=False,
536
+ xattn=False,
537
+ postnorm=False,
538
+ pt_hw_seq_len=16,
539
+ intp_freq=False,
540
+ naiveswiglu=False,
541
+ subln=False,
542
+ ):
543
+ super().__init__()
544
+ self.image_size = img_size
545
+ self.num_classes = num_classes
546
+ self.num_features = (
547
+ self.embed_dim
548
+ ) = embed_dim # num_features for consistency with other models
549
+
550
+ self.patch_embed = PatchEmbed(
551
+ img_size=img_size,
552
+ patch_size=patch_size,
553
+ in_chans=in_chans,
554
+ embed_dim=embed_dim,
555
+ )
556
+ num_patches = self.patch_embed.num_patches
557
+
558
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
559
+ # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
560
+ if use_abs_pos_emb:
561
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
562
+ else:
563
+ self.pos_embed = None
564
+ self.pos_drop = nn.Dropout(p=drop_rate)
565
+
566
+ if use_shared_rel_pos_bias:
567
+ self.rel_pos_bias = RelativePositionBias(
568
+ window_size=self.patch_embed.patch_shape, num_heads=num_heads
569
+ )
570
+ else:
571
+ self.rel_pos_bias = None
572
+
573
+ if rope:
574
+ half_head_dim = embed_dim // num_heads // 2
575
+ hw_seq_len = img_size // patch_size
576
+ self.rope = VisionRotaryEmbeddingFast(
577
+ dim=half_head_dim,
578
+ pt_seq_len=pt_hw_seq_len,
579
+ ft_seq_len=hw_seq_len if intp_freq else None,
580
+ # patch_dropout=patch_dropout
581
+ )
582
+ else:
583
+ self.rope = None
584
+
585
+ self.naiveswiglu = naiveswiglu
586
+
587
+ dpr = [
588
+ x.item() for x in torch.linspace(0, drop_path_rate, depth)
589
+ ] # stochastic depth decay rule
590
+ self.use_rel_pos_bias = use_rel_pos_bias
591
+ self.blocks = nn.ModuleList(
592
+ [
593
+ Block(
594
+ dim=embed_dim,
595
+ num_heads=num_heads,
596
+ mlp_ratio=mlp_ratio,
597
+ qkv_bias=qkv_bias,
598
+ qk_scale=qk_scale,
599
+ drop=drop_rate,
600
+ attn_drop=attn_drop_rate,
601
+ drop_path=dpr[i],
602
+ norm_layer=norm_layer,
603
+ init_values=init_values,
604
+ window_size=(
605
+ self.patch_embed.patch_shape if use_rel_pos_bias else None
606
+ ),
607
+ xattn=xattn,
608
+ rope=self.rope,
609
+ postnorm=postnorm,
610
+ subln=subln,
611
+ naiveswiglu=naiveswiglu,
612
+ )
613
+ for i in range(depth)
614
+ ]
615
+ )
616
+ self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim)
617
+ self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
618
+ self.head = (
619
+ nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
620
+ )
621
+
622
+ if self.pos_embed is not None:
623
+ trunc_normal_(self.pos_embed, std=0.02)
624
+
625
+ trunc_normal_(self.cls_token, std=0.02)
626
+ # trunc_normal_(self.mask_token, std=.02)
627
+
628
+ self.apply(self._init_weights)
629
+ self.fix_init_weight()
630
+
631
+ if isinstance(self.head, nn.Linear):
632
+ trunc_normal_(self.head.weight, std=0.02)
633
+ self.head.weight.data.mul_(init_scale)
634
+ self.head.bias.data.mul_(init_scale)
635
+
636
+ # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn
637
+ self.patch_dropout = (
638
+ PatchDropout(patch_dropout) if patch_dropout > 0.0 else nn.Identity()
639
+ )
640
+
641
+ self.grad_checkpointing = grad_checkpointing
642
+
643
+ def fix_init_weight(self):
644
+ def rescale(param, layer_id):
645
+ param.div_(math.sqrt(2.0 * layer_id))
646
+
647
+ for layer_id, layer in enumerate(self.blocks):
648
+ rescale(layer.attn.proj.weight.data, layer_id + 1)
649
+ if self.naiveswiglu:
650
+ rescale(layer.mlp.w3.weight.data, layer_id + 1)
651
+ else:
652
+ rescale(layer.mlp.fc2.weight.data, layer_id + 1)
653
+
654
+ def get_cast_dtype(self) -> torch.dtype:
655
+ return self.blocks[0].mlp.fc2.weight.dtype
656
+
657
+ def _init_weights(self, m):
658
+ if isinstance(m, nn.Linear):
659
+ trunc_normal_(m.weight, std=0.02)
660
+ if m.bias is not None:
661
+ nn.init.constant_(m.bias, 0)
662
+ elif isinstance(m, nn.LayerNorm):
663
+ nn.init.constant_(m.bias, 0)
664
+ nn.init.constant_(m.weight, 1.0)
665
+
666
+ def get_num_layers(self):
667
+ return len(self.blocks)
668
+
669
+ def lock(self, unlocked_groups=0, freeze_bn_stats=False):
670
+ assert (
671
+ unlocked_groups == 0
672
+ ), "partial locking not currently supported for this model"
673
+ for param in self.parameters():
674
+ param.requires_grad = False
675
+
676
+ @torch.jit.ignore
677
+ def set_grad_checkpointing(self, enable=True):
678
+ self.grad_checkpointing = enable
679
+
680
+ @torch.jit.ignore
681
+ def no_weight_decay(self):
682
+ return {"pos_embed", "cls_token"}
683
+
684
+ def get_classifier(self):
685
+ return self.head
686
+
687
+ def reset_classifier(self, num_classes, global_pool=""):
688
+ self.num_classes = num_classes
689
+ self.head = (
690
+ nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
691
+ )
692
+
693
+ def forward_features(self, x, return_all_features=False):
694
+ x = self.patch_embed(x)
695
+ batch_size, seq_len, _ = x.size()
696
+
697
+ cls_tokens = self.cls_token.expand(
698
+ batch_size, -1, -1
699
+ ) # stole cls_tokens impl from Phil Wang, thanks
700
+ x = torch.cat((cls_tokens, x), dim=1)
701
+ if self.pos_embed is not None:
702
+ x = x + self.pos_embed
703
+ x = self.pos_drop(x)
704
+
705
+ # a patch_dropout of 0. would mean it is disabled and this function would do nothing but return what was passed in
706
+ if os.getenv("RoPE") == "1":
707
+ if self.training and not isinstance(self.patch_dropout, nn.Identity):
708
+ x, patch_indices_keep = self.patch_dropout(x)
709
+ self.rope.forward = partial(
710
+ self.rope.forward, patch_indices_keep=patch_indices_keep
711
+ )
712
+ else:
713
+ self.rope.forward = partial(self.rope.forward, patch_indices_keep=None)
714
+ x = self.patch_dropout(x)
715
+ else:
716
+ x = self.patch_dropout(x)
717
+
718
+ rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
719
+ for blk in self.blocks:
720
+ if self.grad_checkpointing:
721
+ x = checkpoint(blk, x, (rel_pos_bias,))
722
+ else:
723
+ x = blk(x, rel_pos_bias=rel_pos_bias)
724
+
725
+ if not return_all_features:
726
+ x = self.norm(x)
727
+ if self.fc_norm is not None:
728
+ return self.fc_norm(x.mean(1))
729
+ else:
730
+ return x[:, 0]
731
+ return x
732
+
733
+ def forward(self, x, return_all_features=False):
734
+ if return_all_features:
735
+ return self.forward_features(x, return_all_features)
736
+ x = self.forward_features(x)
737
+ x = self.head(x)
738
+ return x
739
+
740
+
741
+ @dataclass
742
+ class CLIPVisionCfg:
743
+ layers: Union[Tuple[int, int, int, int], int] = 12
744
+ width: int = 768
745
+ head_width: int = 64
746
+ mlp_ratio: float = 4.0
747
+ patch_size: int = 16
748
+ image_size: Union[Tuple[int, int], int] = 224
749
+ ls_init_value: Optional[float] = None # layer scale initial value
750
+ patch_dropout: float = 0.0 # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
751
+ global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580)
752
+ drop_path_rate: Optional[float] = None # drop path rate
753
+ timm_model_name: str = (
754
+ None # a valid model name overrides layers, width, patch_size
755
+ )
756
+ timm_model_pretrained: bool = (
757
+ False # use (imagenet) pretrained weights for named model
758
+ )
759
+ timm_pool: str = ( # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
760
+ "avg"
761
+ )
762
+ timm_proj: str = ( # linear projection for timm model output ('linear', 'mlp', '')
763
+ "linear"
764
+ )
765
+ timm_proj_bias: bool = False # enable bias final projection
766
+ eva_model_name: str = (
767
+ None # a valid eva model name overrides layers, width, patch_size
768
+ )
769
+ qkv_bias: bool = True
770
+ fusedLN: bool = False
771
+ embed_dim: int = 1024
772
+ xattn: bool = False
773
+ postnorm: bool = False
774
+ rope: bool = False
775
+ pt_hw_seq_len: int = 16 # 224/14
776
+ intp_freq: bool = False
777
+ naiveswiglu: bool = False
778
+ subln: bool = False
779
+
780
+
781
+ def load_state_dict(
782
+ checkpoint_path: str,
783
+ map_location: str = "cpu",
784
+ model_key: str = "model|module|state_dict",
785
+ is_openai: bool = False,
786
+ skip_list: list = [],
787
+ ):
788
+ if is_openai:
789
+ model = torch.jit.load(checkpoint_path, map_location="cpu").eval()
790
+ state_dict = model.state_dict()
791
+ for key in ["input_resolution", "context_length", "vocab_size"]:
792
+ state_dict.pop(key, None)
793
+ else:
794
+ checkpoint = torch.load(checkpoint_path, map_location=map_location)
795
+ for mk in model_key.split("|"):
796
+ if isinstance(checkpoint, dict) and mk in checkpoint:
797
+ state_dict = checkpoint[mk]
798
+ break
799
+ else:
800
+ state_dict = checkpoint
801
+ if next(iter(state_dict.items()))[0].startswith("module"):
802
+ state_dict = {k[7:]: v for k, v in state_dict.items()}
803
+
804
+ for k in skip_list:
805
+ if k in list(state_dict.keys()):
806
+ print(f"Removing key {k} from pretrained checkpoint")
807
+ del state_dict[k]
808
+
809
+ if os.getenv("RoPE") == "1":
810
+ for k in list(state_dict.keys()):
811
+ if "freqs_cos" in k or "freqs_sin" in k:
812
+ del state_dict[k]
813
+ return state_dict
814
+
815
+
816
+ def load_clip_visual_state_dict(
817
+ checkpoint_path: str,
818
+ map_location: str = "cpu",
819
+ is_openai: bool = False,
820
+ skip_list: list = [],
821
+ ):
822
+ state_dict = load_state_dict(
823
+ checkpoint_path,
824
+ map_location=map_location,
825
+ is_openai=is_openai,
826
+ skip_list=skip_list,
827
+ )
828
+
829
+ for k in list(state_dict.keys()):
830
+ if not k.startswith("visual."):
831
+ del state_dict[k]
832
+ for k in list(state_dict.keys()):
833
+ if k.startswith("visual."):
834
+ new_k = k[7:]
835
+ state_dict[new_k] = state_dict[k]
836
+ del state_dict[k]
837
+ return state_dict
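
eva_vit_model.py only imports inside the full open_flamingo package (it pulls in relative and open_flamingo modules), so the sketch below is hedged: it shows how the "visual" block of config.json maps onto the EVAVisionTransformer constructor, but uses a deliberately shrunken width and depth so the toy forward pass stays cheap; the real checkpoint is EVA-CLIP-4B sized (width 1792, 64 layers).

import torch
from functools import partial
from torch import nn
from eva_vit_model import EVAVisionTransformer  # assumption: the file's package imports resolve

vit = EVAVisionTransformer(
    img_size=448,                 # config.visual["image_size"]
    patch_size=14,                # config.visual["patch_size"] -> 448 / 14 = 32 patches per side
    embed_dim=256,                # real checkpoint: width = 1792
    depth=2,                      # real checkpoint: layers = 64
    num_heads=4,                  # real checkpoint: width // head_width = 1792 // 112 = 16
    mlp_ratio=8.571428571428571,  # config.visual["mlp_ratio"]
    qkv_bias=True,
    postnorm=True,                # config.visual["postnorm"]
    norm_layer=partial(nn.LayerNorm, eps=1e-6),
)

with torch.no_grad():
    tokens = vit(torch.zeros(1, 3, 448, 448), return_all_features=True)
print(tokens.shape)               # (1, 1025, 256): CLS token + 32*32 patch tokens, as consumed by flamingo.py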
flamingo.py ADDED
@@ -0,0 +1,319 @@
1
+ import inspect
2
+ import torch
3
+ import numpy as np
4
+ from einops import rearrange
5
+ from torch import nn
6
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
7
+ try:
8
+ from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint
9
+ except ImportError:
10
+ from torch.utils.checkpoint import checkpoint
11
+
12
+ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
13
+ """ Sinusoid position encoding table """
14
+
15
+ def cal_angle(position, hid_idx):
16
+ return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)
17
+
18
+ def get_posi_angle_vec(position):
19
+ return [cal_angle(position, hid_j) for hid_j in range(d_hid)]
20
+
21
+ sinusoid_table = np.array(
22
+ [get_posi_angle_vec(pos_i) for pos_i in range(n_position)]
23
+ )
24
+
25
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
26
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
27
+
28
+ if padding_idx is not None:
29
+ # zero vector for padding dimension
30
+ sinusoid_table[padding_idx] = 0.0
31
+
32
+ return torch.FloatTensor(sinusoid_table)
33
+
34
+
35
+ def construct_position_encoding(vis_dim, max_pos, rows, cols):
36
+ seq = get_sinusoid_encoding_table(max_pos, int(vis_dim/2))
37
+ y_coords, x_coords = torch.meshgrid(torch.arange(rows), torch.arange(cols), indexing='ij')
38
+
39
+ row_positions = seq[y_coords.flatten(), :]
40
+ col_positions = seq[x_coords.flatten(), :]
41
+
42
+ position_encoding = torch.cat((col_positions, row_positions), dim=-1)
43
+
44
+ return position_encoding
45
+ def unwrap_fsdp(m):
46
+ if isinstance(m, FSDP):
47
+ return unwrap_fsdp(m.module)
48
+ return m
49
+
50
+
51
+ def accepts_parameter(func, parameter_name):
52
+ signature = inspect.signature(func)
53
+ return parameter_name in signature.parameters
54
+
55
+
56
+ class Flamingo(nn.Module):
57
+ def __init__(
58
+ self,
59
+ vision_encoder: nn.Module,
60
+ lang_encoder: nn.Module,
61
+ eoc_token_id: int,
62
+ media_token_id: int,
63
+ vis_dim: int,
64
+ cross_attn_every_n_layers: int = 1,
65
+ gradient_checkpointing: bool = False,
66
+ use_ft_layernorm: bool = False,
67
+ use_ft_flash_attention: bool = False,
68
+ enable_init_network_params: bool = False,
69
+ initializer_range: float = 0.02,
70
+ ):
71
+ """
72
+ Args:
73
+ vision_encoder (nn.Module): HF CLIPModel
74
+ lang_encoder (nn.Module): HF causal language model
75
+ eoc_token_id (int): Token id for <|endofchunk|>
76
+ media_token_id (int): Token id for <image>
77
+ vis_dim (int): Dimension of the visual features.
78
+ Visual features are projected to match this shape along the last dimension.
79
+ cross_attn_every_n_layers (int, optional): How often to apply cross attention after transformer layer. Defaults to 1.
80
+ """
81
+ super().__init__()
82
+ self.vit_use_grad = False
83
+ self.eoc_token_id = eoc_token_id
84
+ self.media_token_id = media_token_id
85
+ self.vis_dim = vis_dim
86
+ if hasattr(lang_encoder.config, "d_model"):
87
+ self.lang_dim = lang_encoder.config.d_model # mpt uses d_model
88
+ else:
89
+ self.lang_dim = lang_encoder.config.hidden_size
90
+
91
+ self.vision_encoder = (
92
+ vision_encoder.visual
93
+ if hasattr(vision_encoder, "visual")
94
+ else vision_encoder
95
+ )
96
+
97
+ self.lang_encoder = lang_encoder
98
+ self.lang_encoder.init_flamingo(
99
+ media_token_id=media_token_id,
100
+ lang_hidden_size=self.lang_dim,
101
+ vis_hidden_size=self.vis_dim,
102
+ cross_attn_every_n_layers=cross_attn_every_n_layers,
103
+ gradient_checkpointing=gradient_checkpointing,
104
+ use_ft_layernorm=use_ft_layernorm,
105
+ use_ft_flash_attention=use_ft_flash_attention,
106
+ enable_init_network_params=enable_init_network_params,
107
+ initializer_range=initializer_range,
108
+ )
109
+ self._use_gradient_checkpointing = gradient_checkpointing
110
+
111
+ def forward(
112
+ self,
113
+ vision_x: torch.Tensor,
114
+ lang_x: torch.Tensor,
115
+ attention_mask: torch.Tensor = None,
116
+ labels: torch.Tensor = None,
117
+ image_mask: torch.Tensor = None,
118
+ subimage_shape: torch.Tensor = None,
119
+ clear_conditioned_layers: bool = True,
120
+ past_key_values=None,
121
+ use_cache: bool = False,
122
+ ):
123
+ """
124
+ Forward pass of Flamingo.
125
+
126
+ Args:
127
+ vision_x (torch.Tensor): Vision input
128
+ shape (B, T_img, F, C, H, W) with F=1
129
+ lang_x (torch.Tensor): Language input ids
130
+ shape (B, T_txt)
131
+ attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
132
+ labels (torch.Tensor, optional): Labels. Defaults to None.
133
+ clear_conditioned_layers: if True, clear the conditioned layers
134
+ once the forward pass is completed. Set this to false if the
135
+ same set of images will be reused in another subsequent
136
+ forward pass.
137
+ past_key_values: pre-computed values to pass to language model.
138
+ See past_key_values documentation in Hugging Face
139
+ CausalLM models.
140
+ use_cache: whether to use cached key values. See use_cache
141
+ documentation in Hugging Face CausalLM models.
142
+ """
143
+ assert (
144
+ self.lang_encoder.initialized_flamingo
145
+ ), "Flamingo layers are not initialized. Please call `init_flamingo` first."
146
+
147
+ assert (
148
+ self.lang_encoder._use_cached_vision_x or vision_x is not None
149
+ ), "Must provide either vision_x or have precached media using cache_media()."
150
+
151
+ if self.lang_encoder._use_cached_vision_x:
152
+ # Case: use cached; vision_x should be cached and other
153
+ # vision-related inputs should not be provided.
154
+ assert vision_x is None, (
155
+ "Expect vision_x to be None when media has been cached using"
156
+ " cache_media(). Try uncache_media() first."
157
+ )
158
+ assert self.lang_encoder.is_conditioned()
159
+
160
+ else:
161
+ # Case: do not use caching (i.e. this is a standard forward pass);
162
+ self._encode_vision_x(vision_x=vision_x, image_mask=image_mask, subimage_shape=subimage_shape)
163
+ self._condition_media_locations(input_ids=lang_x)
164
+
165
+ output = self.lang_encoder(
166
+ input_ids=lang_x,
167
+ attention_mask=attention_mask,
168
+ labels=labels,
169
+ past_key_values=past_key_values,
170
+ use_cache=use_cache,
171
+ )
172
+
173
+ if clear_conditioned_layers:
174
+ self.lang_encoder.clear_conditioned_layers()
175
+
176
+ return output
177
+
178
+ def generate(
179
+ self,
180
+ vision_x: torch.Tensor,
181
+ lang_x: torch.Tensor,
182
+ attention_mask: torch.Tensor = None,
183
+ **kwargs,
184
+ ):
185
+ """
186
+ Generate text conditioned on vision and language inputs.
187
+
188
+ Args:
189
+ vision_x (torch.Tensor): Vision input
190
+ shape (B, T_img, F, C, H, W)
191
+ images in the same chunk are collated along T_img, and frames are collated along F
192
+ currently only F=1 is supported (single-frame videos)
193
+ lang_x (torch.Tensor): Language input
194
+ shape (B, T_txt)
195
+ **kwargs: see generate documentation in Hugging Face CausalLM models. Some notable kwargs:
196
+ max_length (int, optional): Maximum length of the output. Defaults to None.
197
+ attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
198
+ num_beams (int, optional): Number of beams. Defaults to 1.
199
+ max_new_tokens (int, optional): Maximum new tokens. Defaults to None.
200
+ temperature (float, optional): Temperature. Defaults to 1.0.
201
+ top_k (int, optional): Top k. Defaults to 50.
202
+ top_p (float, optional): Top p. Defaults to 1.0.
203
+ no_repeat_ngram_size (int, optional): No repeat ngram size. Defaults to 0.
204
+ length_penalty (float, optional): Length penalty. Defaults to 1.0.
205
+ num_return_sequences (int, optional): Number of return sequences. Defaults to 1.
206
+ do_sample (bool, optional): Do sample. Defaults to False.
207
+ early_stopping (bool, optional): Early stopping. Defaults to False.
208
+ Returns:
209
+ torch.Tensor: lang_x with generated tokens appended to it
210
+ """
211
+ subimage_shape = kwargs.pop("subimage_shape", None)
212
+ image_mask = kwargs.pop("image_mask", None)
213
+ num_beams = kwargs.pop("num_beams", 1)
214
+ if num_beams > 1:
215
+ vision_x = vision_x.repeat_interleave(num_beams, dim=0)
216
+ if image_mask is not None:
217
+ image_mask = image_mask.repeat_interleave(num_beams, dim=0)
218
+ if subimage_shape is not None:
219
+ subimage_shape = subimage_shape.repeat_interleave(num_beams, dim=0)
220
+ self.lang_encoder._use_cached_vision_x = True
221
+ self._encode_vision_x(vision_x=vision_x, image_mask=image_mask, subimage_shape=subimage_shape)
222
+
223
+ eos_token_id = kwargs.pop("eos_token_id", self.eoc_token_id)
224
+ output = self.lang_encoder.generate(
225
+ input_ids=lang_x,
226
+ attention_mask=attention_mask,
227
+ eos_token_id=eos_token_id,
228
+ num_beams=num_beams,
229
+ **kwargs,
230
+ )
231
+
232
+ self.lang_encoder.clear_conditioned_layers()
233
+ self.lang_encoder._use_cached_vision_x = False
234
+ return output
235
+
236
+ def _encode_vision_x(self, vision_x: torch.Tensor, image_mask: torch.Tensor=None, subimage_shape: torch.Tensor=None):
237
+ """
238
+ Compute media tokens from vision input by passing it through vision encoder and conditioning language model.
239
+ Args:
240
+ vision_x (torch.Tensor): Vision input
241
+ shape (B, T_img, F, C, H, W)
242
+ Images in the same chunk are collated along T_img, and frames are collated along F
243
+ Currently only F=1 is supported (single-frame videos)
244
+
245
+ rearrange code based on https://github.com/dhansmair/flamingo-mini
246
+ """
247
+
248
+ assert vision_x.ndim == 6, "vision_x should be of shape (b, T_img, F, C, H, W)"
249
+ b, T, F = vision_x.shape[:3]
250
+ assert F == 1, "Only single frame supported"
251
+
252
+ vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
253
+
254
+ if not self.vit_use_grad:
255
+ with torch.no_grad():
256
+ module_to_inspect = unwrap_fsdp(self.vision_encoder)
257
+ if accepts_parameter(module_to_inspect.forward, "return_all_features"):
258
+ vision_x = self.vision_encoder(vision_x, return_all_features=True)
259
+ else:
260
+ vision_x = self.vision_encoder(vision_x)[1]
261
+ else:
262
+ module_to_inspect = unwrap_fsdp(self.vision_encoder)
263
+ if accepts_parameter(module_to_inspect.forward, "return_all_features"):
264
+ if self.training:
265
+ vision_x = checkpoint(self.vision_encoder, vision_x, True)
266
+ else:
267
+ vision_x = self.vision_encoder(vision_x, return_all_features=True)
268
+
269
+ else:
270
+ vision_x = self.vision_encoder(vision_x)[1]
271
+
272
+ vision_x = rearrange(vision_x, "(b T F) v d -> b (T F) v d", b=b, T=T, F=F)
273
+ pos_emb = torch.zeros((T,self.vis_dim)).to(vision_x.dtype).to(vision_x.device)
274
+ for i in range(subimage_shape.shape[0]):
275
+ cols, rows = int(subimage_shape[i,0]), int(subimage_shape[i,1])
276
+ tmp_pos_emb = construct_position_encoding(vision_x.shape[-1], 20, rows, cols).to(vision_x.dtype).to(vision_x.device)
277
+ pos_emb[1:int(cols*rows)+1,:] = tmp_pos_emb
278
+ vision_x = vision_x + pos_emb.unsqueeze(1).unsqueeze(0).detach()
279
+ for layer in self.lang_encoder._get_decoder_layers():
280
+ layer.condition_vis_x((vision_x, image_mask))
281
+
282
+ def _condition_media_locations(self, input_ids: torch.Tensor):
283
+ """
284
+ Compute the media token locations from lang_x and condition the language model on these.
285
+ Args:
286
+ input_ids (torch.Tensor): Language input
287
+ shape (B, T_txt)
288
+ """
290
+ media_locations = input_ids == self.media_token_id
291
+ # make all of the seq focus on the first fake image to avoid nan
292
+ # media_locations = torch.where(tmp_mask==False, tmp_mask, media_locations)
293
+ for layer in self.lang_encoder._get_decoder_layers():
294
+ layer.condition_media_locations(media_locations)
295
+
296
+ def cache_media(self, input_ids: torch.Tensor, vision_x: torch.Tensor):
297
+ """
298
+ Pre-cache a prompt/sequence of images / text for log-likelihood evaluations.
299
+ All subsequent calls to forward() will generate attending to the LAST
300
+ image in vision_x.
301
+ This is not meant to be used to cache things for generate().
302
+ Args:
303
+ input_ids (torch.Tensor): Language input
304
+ shape (B, T_txt)
305
+ vision_x (torch.Tensor): Vision input
306
+ shape (B, T_img, F, C, H, W)
307
+ Images in the same chunk are collated along T_img, and frames are collated along F
308
+ Currently only F=1 is supported (single-frame videos)
309
+ """
310
+ self._encode_vision_x(vision_x=vision_x)
311
+ self._condition_media_locations(input_ids=input_ids)
312
+ self.lang_encoder._use_cached_vision_x = True
313
+
314
+ def uncache_media(self):
315
+ """
316
+ Clear all conditioning.
317
+ """
318
+ self.lang_encoder.clear_conditioned_layers()
319
+ self.lang_encoder._use_cached_vision_x = False
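
The HD-specific piece of flamingo.py is the 2-D sinusoidal position encoding that _encode_vision_x adds to the features of each high-resolution sub-image. A small self-contained check of construct_position_encoding, assuming flamingo.py is importable; the grid size is illustrative and vis_dim matches the visual width in config.json.

import torch
from flamingo import construct_position_encoding, get_sinusoid_encoding_table

vis_dim, max_pos = 1792, 20   # max_pos=20 mirrors the table size hard-coded in _encode_vision_x
rows, cols = 2, 3             # a hypothetical 2x3 layout of sub-images

pos = construct_position_encoding(vis_dim, max_pos, rows, cols)
print(pos.shape)              # torch.Size([6, 1792]): one vector per sub-image, scanned row by row

# Each vector concatenates the column (x) encoding with the row (y) encoding, each of
# size vis_dim // 2, so crops in the same column share their first half.
table = get_sinusoid_encoding_table(max_pos, vis_dim // 2)
assert torch.allclose(pos[1, : vis_dim // 2], table[1])   # second crop sits in column 1 ...
assert torch.allclose(pos[1, vis_dim // 2 :], table[0])   # ... of row 0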
flamingo_lm.py ADDED
@@ -0,0 +1,414 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from einops import rearrange, repeat
5
+ from torch import einsum, nn
6
+ from einops_exts import rearrange_many
7
+ # from .modules import GatedCrossAttentionBlock
8
+ from .utils import getattr_recursive, setattr_recursive
9
+
10
+ def exists(val):
11
+ return val is not None
12
+
13
+
14
+ def FeedForward(
15
+ dim,
16
+ mult=4,
17
+ use_ft_layernorm=False,
18
+ enable_init_network_params=False,
19
+ initializer_range=0.02,
20
+ ):
21
+ inner_dim = int(dim * mult)
22
+ net = nn.Sequential(
23
+ nn.LayerNorm(dim),
24
+ nn.Linear(dim, inner_dim, bias=False),
25
+ nn.GELU(),
26
+ nn.Linear(inner_dim, dim, bias=False),
27
+ )
28
+
29
+ if use_ft_layernorm and enable_init_network_params:
30
+ # only if use_ft_layernorm is on and enable_init_network_params is set
31
+ # then start the initialization
32
+ net[0].weight.data.normal_(mean=0.0, std=initializer_range)
33
+ net[0].bias.data.zero_()
34
+ net[1].weight.data.normal_(mean=0.0, std=initializer_range)
35
+ net[3].weight.data.normal_(mean=0.0, std=initializer_range)
36
+ return net
37
+
38
+
39
+ # gated cross attention
40
+ class MaskedCrossAttention(nn.Module):
41
+ def __init__(
42
+ self,
43
+ *,
44
+ dim,
45
+ dim_visual,
46
+ dim_head=64,
47
+ heads=8,
48
+ only_attend_immediate_media=True,
49
+ use_ft_layernorm=False,
50
+ use_ft_flash_attention=False,
51
+ enable_init_network_params=False,
52
+ initializer_range=0.02,
53
+ ):
54
+ super().__init__()
55
+ self.scale = dim_head**-0.5
56
+ self.heads = heads
57
+ self.use_ft_flash_attention = False
58
+ self.initializer_range = initializer_range
59
+ inner_dim = dim_head * heads
60
+
61
+ self.norm = nn.LayerNorm(dim)
62
+
63
+ self.to_q = nn.Linear(dim, inner_dim, bias=False)
64
+ self.to_kv = nn.Linear(dim_visual, inner_dim * 2, bias=False)
65
+ self.to_out = nn.Linear(inner_dim, dim, bias=False)
66
+
67
+ # whether text attends only to the immediately preceding image, or to all previous images
68
+ self.only_attend_immediate_media = only_attend_immediate_media
69
+
70
+ if enable_init_network_params:
71
+ self.apply(self._init_weights)
72
+
73
+ def _init_weights(self, module):
74
+ if isinstance(module, nn.Linear):
75
+ # Slightly different from the TF version which uses truncated_normal for initialization
76
+ # cf https://github.com/pytorch/pytorch/pull/5617
77
+ module.weight.data.normal_(mean=0.0, std=self.initializer_range)
78
+ if module.bias is not None:
79
+ module.bias.data.zero_()
80
+
81
+ elif isinstance(module, nn.LayerNorm):
82
+ module.bias.data.zero_()
83
+ module.weight.data.fill_(1.0)
84
+
85
+ def forward(self, x, media, media_locations=None, use_cached_media=False, image_mask=None):
86
+ """
87
+ Args:
88
+ x (torch.Tensor): text features
89
+ shape (B, T_txt, D_txt)
90
+ media (torch.Tensor): image features
91
+ shape (B, T_img, n, D_img) where n is the dim of the latents
92
+ media_locations: boolean mask identifying the media tokens in x
93
+ shape (B, T_txt)
94
+ use_cached_media: bool
95
+ If true, treat all of x as if they occur after the last media
96
+ registered in media_locations. T_txt does not need to exactly
97
+ equal media_locations.shape[1] in this case
98
+ """
99
+
100
+ if not use_cached_media:
101
+ assert media_locations.shape[1] == x.shape[1], (
102
+ f"media_location.shape is {media_locations.shape} but x.shape is"
103
+ f" {x.shape}"
104
+ )
105
+
106
+ T_txt = x.shape[1]
107
+ _, T_img, n = media.shape[:3]
108
+ h = self.heads
109
+
110
+ x = self.norm(x.contiguous())
111
+ q = self.to_q(x)
112
+ media = rearrange(media, "b t n d -> b (t n) d")
113
+
114
+ k, v = self.to_kv(media).chunk(2, dim=-1)
115
+
116
+ if exists(media_locations):
117
+ media_time = torch.arange(T_img, device=x.device) + 1
118
+
119
+ if use_cached_media:
120
+ # text time is set to the last cached media location
121
+ text_time = repeat(
122
+ torch.count_nonzero(media_locations, dim=1),
123
+ "b -> b i",
124
+ i=T_txt,
125
+ )
126
+ else:
127
+ # at each boolean of True, increment the time counter (relative to media time)
128
+ text_time = media_locations.cumsum(dim=-1)
129
+
130
+ # text time must equal media time if only attending to most immediate image
131
+ # otherwise, as long as text time is greater than media time (if attending to all previous images / media)
132
+ mask_op = torch.eq if self.only_attend_immediate_media else torch.ge
133
+ text_to_media_mask = mask_op(
134
+ rearrange(text_time, "b i -> b 1 i 1"),
135
+ repeat(media_time, "j -> 1 1 1 (j n)", n=n),
136
+ )
137
+
138
+ if self.only_attend_immediate_media:
139
+ # any text without a preceding media needs to have attention zeroed out
140
+ text_without_media_mask = text_time == 0
141
+ text_without_media_mask = rearrange(
142
+ text_without_media_mask, "b i -> b 1 i 1"
143
+ )
144
+
145
+
146
+ q, k, v = rearrange_many((q, k, v), "b n (h d) -> b h n d", h=h)
147
+ q = q * self.scale
148
+ sim = einsum("... i d, ... j d -> ... i j", q, k)
149
+
150
+ if exists(image_mask):
151
+ image_mask = image_mask.unsqueeze(1).unsqueeze(1).bool()
152
+ image_mask = image_mask.repeat_interleave(int(sim.shape[3] / image_mask.shape[3]), dim=-1)
153
+ sim = sim.masked_fill(~image_mask, -torch.finfo(sim.dtype).max)
154
+ # if exists(media_locations):
155
+ # sim = sim.masked_fill(~text_to_media_mask, -torch.finfo(sim.dtype).max)
156
+
157
+ sim = sim - sim.amax(dim=-1, keepdim=True).detach()
158
+ attn = sim.softmax(dim=-1)
159
+
160
+ if exists(media_locations) and self.only_attend_immediate_media:
161
+ # any text without a preceding media needs to have attention zeroed out
162
+ attn = attn.masked_fill(text_without_media_mask, 0.0)
163
+
164
+ out = einsum("... i j, ... j d -> ... i d", attn, v)
165
+ out = rearrange(out, "b h n d -> b n (h d)")
166
+
167
+ return self.to_out(out)
168
+
169
+
170
+
171
+ class GatedCrossAttentionBlock(nn.Module):
172
+ def __init__(
173
+ self,
174
+ *,
175
+ dim,
176
+ dim_visual,
177
+ dim_head=64,
178
+ heads=12,
179
+ ff_mult=1,
180
+ only_attend_immediate_media=True,
181
+ use_ft_layernorm=False,
182
+ use_ft_flash_attention=False,
183
+ enable_init_network_params=False,
184
+ initializer_range=0.02,
185
+ gradient_checkpointing=False,
186
+ ):
187
+ super().__init__()
188
+ self.attn = MaskedCrossAttention(
189
+ dim=dim,
190
+ dim_visual=dim_visual,
191
+ dim_head=dim_head,
192
+ heads=heads,
193
+ only_attend_immediate_media=only_attend_immediate_media,
194
+ use_ft_flash_attention=use_ft_flash_attention,
195
+ use_ft_layernorm=use_ft_layernorm,
196
+ enable_init_network_params=enable_init_network_params,
197
+ initializer_range=initializer_range,
198
+ )
199
+ self.attn_gate = nn.Parameter(torch.zeros(dim))
200
+
201
+ self.ff = FeedForward(dim, mult=ff_mult)
202
+ self.ff_gate = nn.Parameter(torch.zeros(dim))
203
+
204
+ self.gradient_checkpointing = gradient_checkpointing
205
+
206
+ def forward(
207
+ self,
208
+ x,
209
+ media,
210
+ media_locations=None,
211
+ use_cached_media=False,
212
+ image_mask=None,
213
+ ):
214
+
215
+ flag = torch.sum(media_locations, dim=-1)
216
+ flag = torch.where(flag > 0.0, 1.0, 0.0)
217
+ flag = flag.unsqueeze(1).unsqueeze(1).to(torch.bfloat16)
218
+ x = (
219
+ flag
220
+ * self.attn(
221
+ x,
222
+ media,
223
+ media_locations=media_locations,
224
+ use_cached_media=use_cached_media,
225
+ image_mask=image_mask,
226
+ )
227
+ * self.attn_gate.tanh()
228
+ + x
229
+ )
230
+
231
+ x = flag * self.ff(x) * self.ff_gate.tanh() + x
232
+
233
+ return x
234
+
235
+
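# Illustrative check (hedged, not part of this file): because attn_gate and ff_gate above are
# zero-initialized, a GatedCrossAttentionBlock is an exact identity function at initialization,
# so inserting it into a frozen decoder cannot change the language model's outputs before
# training. A toy check with made-up sizes, assuming this module's imports resolve:
import torch
block = GatedCrossAttentionBlock(dim=64, dim_visual=32, dim_head=16, heads=4)
x = torch.randn(2, 7, 64)                              # (B, T_txt, dim)
media = torch.randn(2, 1, 5, 32)                       # (B, T_img, n, dim_visual)
media_locations = torch.zeros(2, 7, dtype=torch.bool)
media_locations[:, 0] = True                           # a single <image> token at position 0
assert torch.allclose(block(x, media, media_locations=media_locations), x)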
236
+ class FlamingoLayer(nn.Module):
237
+ """
238
+ FlamingoLayer is a wrapper around the GatedCrossAttentionBlock and DecoderLayer.
239
+ """
240
+
241
+ def __init__(
242
+ self, gated_cross_attn_layer, decoder_layer, gradient_checkpointing=False
243
+ ):
244
+ super().__init__()
245
+ self.gated_cross_attn_layer = gated_cross_attn_layer
246
+ self.decoder_layer = decoder_layer
247
+ self.vis_x = None
248
+ self.media_locations = None
249
+ if self.gated_cross_attn_layer is not None:
250
+ self.gated_cross_attn_layer._use_gradient_checkpointing = (
251
+ gradient_checkpointing
252
+ )
253
+ self.decoder_layer._use_gradient_checkpointing = gradient_checkpointing
254
+
255
+ def is_conditioned(self) -> bool:
256
+ """Check whether the layer is conditioned."""
257
+ return self.vis_x is not None and self.media_locations is not None
258
+
259
+ # Used this great idea from this implementation of Flamingo (https://github.com/dhansmair/flamingo-mini/)
260
+ def condition_vis_x(self, vis_x):
261
+ if vis_x is not None:
262
+ self.vis_x, self.image_mask = vis_x
263
+ else:
264
+ self.vis_x, self.image_mask = None, None
265
+
266
+ def condition_media_locations(self, media_locations):
267
+ self.media_locations = media_locations
268
+
269
+ def condition_use_cached_media(self, use_cached_media):
270
+ self.use_cached_media = use_cached_media
271
+
272
+ def forward(
273
+ self,
274
+ lang_x,
275
+ attention_mask=None,
276
+ **decoder_layer_kwargs,
277
+ ):
278
+ # Cross attention
279
+ if self.gated_cross_attn_layer is not None:
280
+ if self.vis_x is None:
281
+ raise ValueError("vis_x must be conditioned before forward pass")
282
+
283
+ if self.media_locations is None:
284
+ raise ValueError(
285
+ "media_locations must be conditioned before forward pass"
286
+ )
287
+
288
+ lang_x = self.gated_cross_attn_layer(
289
+ lang_x,
290
+ self.vis_x,
291
+ media_locations=self.media_locations,
292
+ use_cached_media=self.use_cached_media,
293
+ image_mask=self.image_mask,
294
+ )
295
+
296
+ # Normal decoder layer
297
+ lang_x = self.decoder_layer(
298
+ lang_x, attention_mask=attention_mask, **decoder_layer_kwargs
299
+ )
300
+ return lang_x
301
+
302
+
303
+ class FlamingoLMMixin(nn.Module):
304
+ """
305
+ Mixin to add cross-attention layers to a language model.
306
+ """
307
+
308
+ def set_decoder_layers_attr_name(self, decoder_layers_attr_name):
309
+ self.decoder_layers_attr_name = decoder_layers_attr_name
310
+
311
+ def _get_decoder_layers(self):
312
+ return getattr_recursive(self, self.decoder_layers_attr_name)
313
+
314
+ def _set_decoder_layers(self, value):
315
+ setattr_recursive(self, self.decoder_layers_attr_name, value)
316
+
317
+ def init_flamingo(
318
+ self,
319
+ media_token_id,
320
+ lang_hidden_size,
321
+ vis_hidden_size,
322
+ cross_attn_every_n_layers,
323
+ *,
324
+ use_ft_layernorm=False,
325
+ use_ft_flash_attention=False,
326
+ enable_init_network_params=False,
327
+ initializer_range=0.02,
328
+ gradient_checkpointing=False,
329
+ ):
330
+ """
331
+ Initialize Flamingo by adding gated cross-attention blocks to the decoder. Store the media token id for computing the media locations.
332
+ """
333
+ self.old_decoder_blocks = self._get_decoder_layers()
334
+ self.gated_cross_attn_layers = nn.ModuleList(
335
+ [
336
+ (
337
+ GatedCrossAttentionBlock(
338
+ dim=lang_hidden_size,
339
+ dim_visual=vis_hidden_size,
340
+ use_ft_layernorm=use_ft_layernorm,
341
+ use_ft_flash_attention=use_ft_flash_attention,
342
+ enable_init_network_params=enable_init_network_params,
343
+ initializer_range=initializer_range,
344
+ gradient_checkpointing=gradient_checkpointing,
345
+ )
346
+ if (layer_idx + 1) % cross_attn_every_n_layers == 0
347
+ else None
348
+ )
349
+ for layer_idx, _ in enumerate(self._get_decoder_layers())
350
+ ]
351
+ )
352
+ self.init_flamingo_layers(gradient_checkpointing)
353
+ self.media_token_id = media_token_id
354
+ self.initialized_flamingo = True
355
+ self._use_cached_vision_x = False
356
+
357
+ def init_flamingo_layers(self, gradient_checkpointing):
358
+ """
359
+ Re-initializes the FlamingoLayers.
360
+ Propagates any changes made to self.gated_cross_attn_layers or self.old_decoder_blocks.
361
+ """
362
+ self._set_decoder_layers(
363
+ nn.ModuleList(
364
+ [
365
+ FlamingoLayer(
366
+ gated_cross_attn_layer, decoder_layer, gradient_checkpointing
367
+ )
368
+ for gated_cross_attn_layer, decoder_layer in zip(
369
+ self.gated_cross_attn_layers, self.old_decoder_blocks
370
+ )
371
+ ]
372
+ )
373
+ )
374
+
375
+ def forward(self, input_ids, attention_mask, **kwargs):
376
+ """Condition the Flamingo layers on the media locations before forward()"""
377
+ if not self.initialized_flamingo:
378
+ raise ValueError(
379
+ "Flamingo layers are not initialized. Please call `init_flamingo`"
380
+ " first."
381
+ )
382
+ media_locations = input_ids == self.media_token_id
383
+ # make all of the seq focus on the first fake image to avoid nan
384
+ # if there are media already cached and we're generating and there are no media tokens in the input,
385
+ # we'll assume that ALL input tokens should attend to the last previous media that is cached.
386
+ # this is especially important for HF generate() compatibility, since generate() calls forward()
387
+ # repeatedly one token at a time (with no media tokens).
388
+ # without this check, the model would not attend to any images when generating (after the first token)
389
+ use_cached_media_locations = (
390
+ self._use_cached_vision_x
391
+ and self.is_conditioned()
392
+ and not media_locations.any()
393
+ )
394
+
395
+ for layer in self._get_decoder_layers():
396
+ if not use_cached_media_locations:
397
+ layer.condition_media_locations(media_locations)
398
+ layer.condition_use_cached_media(use_cached_media_locations)
399
+
400
+ # package arguments for the other parent's forward. since we don't know the order of the arguments,
401
+ # make them all kwargs
402
+ kwargs["input_ids"] = input_ids
403
+ kwargs["attention_mask"] = attention_mask
404
+ return super().forward(**kwargs) # Call the other parent's forward method
405
+
406
+ def is_conditioned(self) -> bool:
407
+ """Check whether all decoder layers are already conditioned."""
408
+ return all(l.is_conditioned() for l in self._get_decoder_layers())
409
+
410
+ def clear_conditioned_layers(self):
411
+ for layer in self._get_decoder_layers():
412
+ layer.condition_vis_x(None)
413
+ layer.condition_media_locations(None)
414
+ layer.condition_use_cached_media(None)
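Note: in `init_flamingo` above, only every `cross_attn_every_n_layers`-th decoder layer receives a `GatedCrossAttentionBlock`; the other entries are `None`, so those `FlamingoLayer`s run the original decoder block unchanged. A minimal standalone sketch of that interleaving check (toy layer count and interval, not the shipped configuration):

# which decoder layers get a gated cross-attention block
cross_attn_every_n_layers = 4   # hypothetical interval for illustration
num_decoder_layers = 12         # toy depth

layout = [
    "gated-xattn + decoder" if (layer_idx + 1) % cross_attn_every_n_layers == 0 else "decoder only"
    for layer_idx in range(num_decoder_layers)
]
print(layout)  # layers at indices 3, 7, 11 carry the cross-attention block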
modeling_infimm_hd.py ADDED
@@ -0,0 +1,134 @@
1
+ import importlib
2
+ import math
3
+ from functools import partial
4
+ from typing import TYPE_CHECKING, Any, Callable, Generator, List, Optional, Tuple, Union
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torch.utils.checkpoint
8
+ from torch.cuda.amp import autocast
9
+
10
+ from transformers import GenerationConfig, PreTrainedTokenizer, StoppingCriteriaList
11
+ from transformers.generation.logits_process import LogitsProcessorList
12
+
13
+ if TYPE_CHECKING:
14
+ from transformers.generation.streamers import BaseStreamer
15
+
16
+ from transformers.generation.utils import GenerateOutput
17
+ from transformers.modeling_outputs import (
18
+ BaseModelOutputWithPast,
19
+ CausalLMOutputWithPast,
20
+ )
21
+ from transformers.modeling_utils import PreTrainedModel
22
+ from transformers import AutoModelForCausalLM, AutoTokenizer
23
+ from transformers.utils import logging
24
+
25
+ try:
26
+ from einops import rearrange
27
+ except ImportError:
28
+ rearrange = None
29
+ from torch import nn
30
+
31
+ from .configuration_infimm_hd import InfiMMHDConfig
32
+ from .eva_vit_model import CLIPVisionCfg, EVAVisionTransformer
33
+ from .flamingo import Flamingo
34
+ from .flamingo_lm import FlamingoLMMixin
35
+ from .utils import _infer_decoder_layers_attr_name, extend_instance
36
+
37
+ SUPPORT_CUDA = torch.cuda.is_available()
38
+ SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
39
+ SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7
40
+
41
+
42
+ class InfiMMPreTrainedModel(PreTrainedModel):
43
+ config_class = InfiMMHDConfig
44
+ base_model_prefix = "transformer"
45
+ is_parallelizable = False
46
+ supports_gradient_checkpointing = True
47
+
48
+ def __init__(self, *inputs, **kwargs):
49
+ super().__init__(*inputs, **kwargs)
50
+
51
+
52
+ class InfiMMHDModel(InfiMMPreTrainedModel):
53
+ def __init__(self, config):
54
+ super().__init__(config)
55
+
56
+ self.vision_config = config.visual
57
+ vision_encoder = self.build_vision_encoder()
58
+ self.language_config = config.language
59
+ language_encoder = self.build_language_encoder()
60
+
61
+ self.model = self.build_flamingo(vision_encoder, language_encoder)
62
+
63
+ def build_vision_encoder(self, image_size=448):
64
+ vision_cfg = CLIPVisionCfg(**self.vision_config)
65
+
66
+ if image_size:
67
+ vision_cfg.image_size = image_size
68
+ vision_encoder = EVAVisionTransformer(
69
+ img_size=vision_cfg.image_size,
70
+ patch_size=vision_cfg.patch_size,
71
+ num_classes=vision_cfg.embed_dim,
72
+ use_mean_pooling=vision_cfg.global_average_pool, # False
73
+ init_values=vision_cfg.ls_init_value,
74
+ patch_dropout=vision_cfg.patch_dropout,
75
+ embed_dim=vision_cfg.width,
76
+ depth=vision_cfg.layers,
77
+ num_heads=vision_cfg.width // vision_cfg.head_width,
78
+ mlp_ratio=vision_cfg.mlp_ratio,
79
+ qkv_bias=vision_cfg.qkv_bias,
80
+ drop_path_rate=vision_cfg.drop_path_rate,
81
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
82
+ xattn=vision_cfg.xattn,
83
+ rope=vision_cfg.rope,
84
+ postnorm=vision_cfg.postnorm,
85
+ pt_hw_seq_len=vision_cfg.pt_hw_seq_len, # 224/14
86
+ intp_freq=vision_cfg.intp_freq,
87
+ naiveswiglu=vision_cfg.naiveswiglu,
88
+ subln=vision_cfg.subln,
89
+ )
90
+
91
+ return vision_encoder
92
+
93
+ def build_language_encoder(self):
94
+ lang_encoder = AutoModelForCausalLM.from_pretrained(
95
+ self.language_config["_name_or_path"]
96
+ )
97
+ lang_encoder.resize_token_embeddings(self.language_config["vocab_size"])
98
+ return lang_encoder
99
+
100
+ def build_flamingo(self, vision_encoder, lang_encoder):
101
+ extend_instance(lang_encoder, FlamingoLMMixin)
102
+ decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
103
+ lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
104
+ model = Flamingo(
105
+ vision_encoder,
106
+ lang_encoder,
107
+ self.config.eoc_token_id,
108
+ self.config.image_token_id,
109
+ vis_dim=self.vision_config["width"],
110
+ cross_attn_every_n_layers=self.config.cross_attn_every_n_layers,
111
+ gradient_checkpointing=self.config.use_grad_checkpoint,
112
+ )
113
+
114
+ return model
115
+
116
+ def generate(
117
+ self,
118
+ batch_images,
119
+ input_ids,
120
+ attention_mask,
121
+ **kwargs,
122
+ ):
123
+
124
+ with torch.inference_mode():
125
+ outputs = self.model.generate(
126
+ batch_images,
127
+ input_ids,
128
+ attention_mask,
129
+ **kwargs,
130
+ )
131
+
132
+ # Extract only the newly generated tokens
133
+ outputs = outputs[:, len(input_ids[0]) :]
134
+ return outputs
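`InfiMMHDModel.generate` returns only the continuation: it slices the prompt tokens off the front of the model output. A small illustration of that trimming with made-up token ids:

import torch

input_ids = torch.tensor([[1, 319, 13563, 1546, 263]])                  # hypothetical 5-token prompt
full_output = torch.tensor([[1, 319, 13563, 1546, 263, 450, 4996, 2]])  # prompt + 3 generated tokens

new_tokens = full_output[:, len(input_ids[0]):]  # same slice as in generate()
print(new_tokens)  # tensor([[ 450, 4996,    2]])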
modules.py ADDED
@@ -0,0 +1,233 @@
1
+ """
2
+ Based on: https://github.com/lucidrains/flamingo-pytorch
3
+ """
4
+
5
+ import torch
6
+ from einops import rearrange, repeat
7
+ from torch import einsum, nn
8
+ from einops_exts import rearrange_many
9
+
10
+ def exists(val):
11
+ return val is not None
12
+
13
+
14
+ def FeedForward(
15
+ dim,
16
+ mult=4,
17
+ use_ft_layernorm=False,
18
+ enable_init_network_params=False,
19
+ initializer_range=0.02,
20
+ ):
21
+ inner_dim = int(dim * mult)
22
+ net = nn.Sequential(
23
+ nn.LayerNorm(dim),
24
+ nn.Linear(dim, inner_dim, bias=False),
25
+ nn.GELU(),
26
+ nn.Linear(inner_dim, dim, bias=False),
27
+ )
28
+
29
+ if use_ft_layernorm and enable_init_network_params:
30
+ # initialize the weights here only when both use_ft_layernorm
31
+ # and enable_init_network_params are enabled
32
+ net[0].weight.data.normal_(mean=0.0, std=initializer_range)
33
+ net[0].bias.data.zero_()
34
+ net[1].weight.data.normal_(mean=0.0, std=initializer_range)
35
+ net[3].weight.data.normal_(mean=0.0, std=initializer_range)
36
+ return net
37
+
38
+
39
+ # gated cross attention
40
+ class MaskedCrossAttention(nn.Module):
41
+ def __init__(
42
+ self,
43
+ *,
44
+ dim,
45
+ dim_visual,
46
+ dim_head=64,
47
+ heads=8,
48
+ only_attend_immediate_media=True,
49
+ use_ft_layernorm=False,
50
+ use_ft_flash_attention=False,
51
+ enable_init_network_params=False,
52
+ initializer_range=0.02,
53
+ ):
54
+ super().__init__()
55
+ self.scale = dim_head**-0.5
56
+ self.heads = heads
57
+ self.use_ft_flash_attention = False
58
+ self.initializer_range = initializer_range
59
+ inner_dim = dim_head * heads
60
+
61
+ self.norm = nn.LayerNorm(dim)
62
+
63
+ self.to_q = nn.Linear(dim, inner_dim, bias=False)
64
+ self.to_kv = nn.Linear(dim_visual, inner_dim * 2, bias=False)
65
+ self.to_out = nn.Linear(inner_dim, dim, bias=False)
66
+
67
+ # whether for text to only attend to immediate preceding image, or all previous images
68
+ self.only_attend_immediate_media = only_attend_immediate_media
69
+
70
+ if enable_init_network_params:
71
+ self.apply(self._init_weights)
72
+
73
+ def _init_weights(self, module):
74
+ if isinstance(module, nn.Linear):
75
+ # Slightly different from the TF version which uses truncated_normal for initialization
76
+ # cf https://github.com/pytorch/pytorch/pull/5617
77
+ module.weight.data.normal_(mean=0.0, std=self.initializer_range)
78
+ if module.bias is not None:
79
+ module.bias.data.zero_()
80
+
81
+ elif isinstance(module, nn.LayerNorm):
82
+ module.bias.data.zero_()
83
+ module.weight.data.fill_(1.0)
84
+
85
+ def forward(self, x, media, media_locations=None, use_cached_media=False, image_mask=None):
86
+ """
87
+ Args:
88
+ x (torch.Tensor): text features
89
+ shape (B, T_txt, D_txt)
90
+ media (torch.Tensor): image features
91
+ shape (B, T_img, n, D_img) where n is the dim of the latents
92
+ media_locations: boolean mask identifying the media tokens in x
93
+ shape (B, T_txt)
94
+ use_cached_media: bool
95
+ If true, treat all of x as if they occur after the last media
96
+ registered in media_locations. T_txt does not need to exactly
97
+ equal media_locations.shape[1] in this case
98
+ """
99
+
100
+ if not use_cached_media:
101
+ assert media_locations.shape[1] == x.shape[1], (
102
+ f"media_location.shape is {media_locations.shape} but x.shape is"
103
+ f" {x.shape}"
104
+ )
105
+
106
+ T_txt = x.shape[1]
107
+ _, T_img, n = media.shape[:3]
108
+ h = self.heads
109
+
110
+ x = self.norm(x.contiguous())
111
+ q = self.to_q(x)
112
+ media = rearrange(media, "b t n d -> b (t n) d")
113
+
114
+ k, v = self.to_kv(media).chunk(2, dim=-1)
115
+
116
+ if exists(media_locations):
117
+ media_time = torch.arange(T_img, device=x.device) + 1
118
+
119
+ if use_cached_media:
120
+ # text time is set to the last cached media location
121
+ text_time = repeat(
122
+ torch.count_nonzero(media_locations, dim=1),
123
+ "b -> b i",
124
+ i=T_txt,
125
+ )
126
+ else:
127
+ # at each boolean of True, increment the time counter (relative to media time)
128
+ text_time = media_locations.cumsum(dim=-1)
129
+
130
+ # text time must equal media time if only attending to most immediate image
131
+ # otherwise, as long as text time is greater than media time (if attending to all previous images / media)
132
+ mask_op = torch.eq if self.only_attend_immediate_media else torch.ge
133
+ text_to_media_mask = mask_op(
134
+ rearrange(text_time, "b i -> b 1 i 1"),
135
+ repeat(media_time, "j -> 1 1 1 (j n)", n=n),
136
+ )
137
+
138
+ if self.only_attend_immediate_media:
139
+ # any text without a preceding media needs to have attention zeroed out
140
+ text_without_media_mask = text_time == 0
141
+ text_without_media_mask = rearrange(
142
+ text_without_media_mask, "b i -> b 1 i 1"
143
+ )
144
+
145
+
146
+ q, k, v = rearrange_many((q, k, v), "b n (h d) -> b h n d", h=h)
147
+ q = q * self.scale
148
+ sim = einsum("... i d, ... j d -> ... i j", q, k)
149
+
150
+ if exists(image_mask):
151
+ image_mask = image_mask.unsqueeze(1).unsqueeze(1).bool()
152
+ image_mask = image_mask.repeat_interleave(int(sim.shape[3] / image_mask.shape[3]), dim=-1)
153
+ sim = sim.masked_fill(~image_mask, -torch.finfo(sim.dtype).max)
154
+ # if exists(media_locations):
155
+ # sim = sim.masked_fill(~text_to_media_mask, -torch.finfo(sim.dtype).max)
156
+
157
+ sim = sim - sim.amax(dim=-1, keepdim=True).detach()
158
+ attn = sim.softmax(dim=-1)
159
+
160
+ if exists(media_locations) and self.only_attend_immediate_media:
161
+ # any text without a preceding media needs to have attention zeroed out
162
+ attn = attn.masked_fill(text_without_media_mask, 0.0)
163
+
164
+ out = einsum("... i j, ... j d -> ... i d", attn, v)
165
+ out = rearrange(out, "b h n d -> b n (h d)")
166
+
167
+ return self.to_out(out)
168
+
169
+
170
+
171
+ class GatedCrossAttentionBlock(nn.Module):
172
+ def __init__(
173
+ self,
174
+ *,
175
+ dim,
176
+ dim_visual,
177
+ dim_head=64,
178
+ heads=12,
179
+ ff_mult=1,
180
+ only_attend_immediate_media=True,
181
+ use_ft_layernorm=False,
182
+ use_ft_flash_attention=False,
183
+ enable_init_network_params=False,
184
+ initializer_range=0.02,
185
+ gradient_checkpointing=False,
186
+ ):
187
+ super().__init__()
188
+ self.attn = MaskedCrossAttention(
189
+ dim=dim,
190
+ dim_visual=dim_visual,
191
+ dim_head=dim_head,
192
+ heads=heads,
193
+ only_attend_immediate_media=only_attend_immediate_media,
194
+ use_ft_flash_attention=use_ft_flash_attention,
195
+ use_ft_layernorm=use_ft_layernorm,
196
+ enable_init_network_params=enable_init_network_params,
197
+ initializer_range=initializer_range,
198
+ )
199
+ self.attn_gate = nn.Parameter(torch.zeros(dim))
200
+
201
+ self.ff = FeedForward(dim, mult=ff_mult)
202
+ self.ff_gate = nn.Parameter(torch.zeros(dim))
203
+
204
+ self.gradient_checkpointing = gradient_checkpointing
205
+
206
+ def forward(
207
+ self,
208
+ x,
209
+ media,
210
+ media_locations=None,
211
+ use_cached_media=False,
212
+ image_mask=None,
213
+ ):
214
+
215
+ flag = torch.sum(media_locations, dim=-1)
216
+ flag = torch.where(flag > 0.0, 1.0, 0.0)
217
+ flag = flag.unsqueeze(1).unsqueeze(1).to(torch.bfloat16)
218
+ x = (
219
+ flag
220
+ * self.attn(
221
+ x,
222
+ media,
223
+ media_locations=media_locations,
224
+ use_cached_media=use_cached_media,
225
+ image_mask=image_mask,
226
+ )
227
+ * self.attn_gate.tanh()
228
+ + x
229
+ )
230
+
231
+ x = flag * self.ff(x) * self.ff_gate.tanh() + x
232
+
233
+ return x
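A toy sketch of the time alignment computed inside `MaskedCrossAttention`: `media_locations.cumsum` tags each text position with the index of the most recent preceding image, and with `only_attend_immediate_media=True` a token may only attend to that image (hypothetical two-image sequence, standalone):

import torch

# <image> a b <image> c d  ->  True marks the <image> positions
media_locations = torch.tensor([[True, False, False, True, False, False]])

text_time = media_locations.cumsum(dim=-1)  # tensor([[1, 1, 1, 2, 2, 2]])
media_time = torch.arange(2) + 1            # tensor([1, 2])

# only_attend_immediate_media=True -> equality check, as in the module above
text_to_media_mask = text_time.unsqueeze(-1) == media_time
print(text_to_media_mask[0].int())
# rows: [1, 0], [1, 0], [1, 0], [0, 1], [0, 1], [0, 1]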
preprocessor_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "_name_or_path": "./",
3
+ "auto_map": {
4
+ "AutoProcessor": "processing_infimm_hd.InfiMMHDProcessor"
5
+ },
6
+ "image_size": 224
7
+ }
processing_infimm_hd.py ADDED
@@ -0,0 +1,422 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for InfiMMHD.
17
+ """
18
+
19
+ import random
20
+ from typing import List, Optional, Tuple, Union
21
+ import torch
22
+ import torchvision.transforms.functional as F
23
+ from PIL import Image
24
+ from torchvision.transforms import (
25
+ CenterCrop,
26
+ Compose,
27
+ InterpolationMode,
28
+ Normalize,
29
+ Resize,
30
+ ToTensor,
31
+ )
32
+
33
+ from transformers import AutoTokenizer
34
+ from transformers.image_processing_utils import ImageProcessingMixin
35
+ from transformers.processing_utils import ProcessorMixin
36
+ from transformers.tokenization_utils_base import BatchEncoding
37
+
38
+ IMAGE_TOKEN = "<image>"
39
+ END_OF_CHUNK_TOKEN = "<|endofchunk|>"
40
+ PAD_TOKEN = "<PAD>"
41
+
42
+ OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
43
+ OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
44
+
45
+
46
+ def _convert_to_rgb(image):
47
+ return image.convert("RGB")
48
+
49
+
50
+ class ResizeKeepRatio:
51
+ """Resize and Keep Ratio
52
+
53
+ Copy & paste from `timm`
54
+ """
55
+
56
+ def __init__(
57
+ self,
58
+ size,
59
+ longest=0.0,
60
+ interpolation=InterpolationMode.BICUBIC,
61
+ random_scale_prob=0.0,
62
+ random_scale_range=(0.85, 1.05),
63
+ random_aspect_prob=0.0,
64
+ random_aspect_range=(0.9, 1.11),
65
+ ):
66
+ if isinstance(size, (list, tuple)):
67
+ self.size = tuple(size)
68
+ else:
69
+ self.size = (size, size)
70
+ self.interpolation = interpolation
71
+ self.longest = float(longest) # [0, 1] where 0 == shortest edge, 1 == longest
72
+ self.random_scale_prob = random_scale_prob
73
+ self.random_scale_range = random_scale_range
74
+ self.random_aspect_prob = random_aspect_prob
75
+ self.random_aspect_range = random_aspect_range
76
+
77
+ @staticmethod
78
+ def get_params(
79
+ img,
80
+ target_size,
81
+ longest,
82
+ random_scale_prob=0.0,
83
+ random_scale_range=(0.85, 1.05),
84
+ random_aspect_prob=0.0,
85
+ random_aspect_range=(0.9, 1.11),
86
+ ):
87
+ """Get parameters"""
88
+ source_size = img.size[::-1] # h, w
89
+ h, w = source_size
90
+ target_h, target_w = target_size
91
+ ratio_h = h / target_h
92
+ ratio_w = w / target_w
93
+ ratio = max(ratio_h, ratio_w) * longest + min(ratio_h, ratio_w) * (
94
+ 1.0 - longest
95
+ )
96
+ if random_scale_prob > 0 and random.random() < random_scale_prob:
97
+ ratio_factor = random.uniform(random_scale_range[0], random_scale_range[1])
98
+ ratio_factor = (ratio_factor, ratio_factor)
99
+ else:
100
+ ratio_factor = (1.0, 1.0)
101
+ if random_aspect_prob > 0 and random.random() < random_aspect_prob:
102
+ aspect_factor = random.uniform(
103
+ random_aspect_range[0], random_aspect_range[1]
104
+ )
105
+ ratio_factor = (
106
+ ratio_factor[0] / aspect_factor,
107
+ ratio_factor[1] * aspect_factor,
108
+ )
109
+ size = [round(x * f / ratio) for x, f in zip(source_size, ratio_factor)]
110
+ return size
111
+
112
+ def __call__(self, img):
113
+ """
114
+ Args:
115
+ img (PIL Image): Image to be cropped and resized.
116
+
117
+ Returns:
118
+ PIL Image: Resized, padded to at least target size, possibly cropped to exactly target size
119
+ """
120
+ size = self.get_params(
121
+ img,
122
+ self.size,
123
+ self.longest,
124
+ self.random_scale_prob,
125
+ self.random_scale_range,
126
+ self.random_aspect_prob,
127
+ self.random_aspect_range,
128
+ )
129
+ img = F.resize(img, size, self.interpolation)
130
+ return img
131
+
132
+ def __repr__(self):
133
+ format_string = self.__class__.__name__ + "(size={0}".format(self.size)
134
+ format_string += f", interpolation={self.interpolation})"
135
+ format_string += f", longest={self.longest:.3f})"
136
+ return format_string
137
+
138
+
139
+ def image_transform(
140
+ image_size: Union[int, Tuple[int, int]],
141
+ mean: Optional[Tuple[float, ...]] = None,
142
+ std: Optional[Tuple[float, ...]] = None,
143
+ resize_mode: Optional[str] = None,
144
+ interpolation: Optional[str] = None,
145
+ ):
146
+ mean = mean or OPENAI_DATASET_MEAN
147
+ if not isinstance(mean, (list, tuple)):
148
+ mean = (mean,) * 3
149
+
150
+ std = std or OPENAI_DATASET_STD
151
+ if not isinstance(std, (list, tuple)):
152
+ std = (std,) * 3
153
+
154
+ interpolation = interpolation or "bicubic"
155
+ assert interpolation in ["bicubic", "bilinear", "random"]
156
+ # NOTE random is ignored for interpolation_mode, so defaults to BICUBIC for inference if set
157
+ interpolation_mode = (
158
+ InterpolationMode.BILINEAR
159
+ if interpolation == "bilinear"
160
+ else InterpolationMode.BICUBIC
161
+ )
162
+
163
+ resize_mode = resize_mode or "shortest"
164
+ assert resize_mode in ("shortest", "longest", "squash")
165
+
166
+ normalize = Normalize(mean=mean, std=std)
167
+
168
+ assert resize_mode == "shortest"
169
+ if not isinstance(image_size, (tuple, list)):
170
+ image_size = (image_size, image_size)
171
+ if image_size[0] == image_size[1]:
172
+ # simple case, use torchvision built-in Resize w/ shortest edge mode (scalar size arg)
173
+ transforms = [Resize(image_size[0], interpolation=interpolation_mode)]
174
+ else:
175
+ # resize shortest edge to matching target dim for non-square target
176
+ transforms = [ResizeKeepRatio(image_size)]
177
+ transforms += [CenterCrop(image_size)]
178
+
179
+ transforms.extend(
180
+ [
181
+ _convert_to_rgb,
182
+ ToTensor(),
183
+ normalize,
184
+ ]
185
+ )
186
+ return Compose(transforms)
187
+
188
+
189
+ def get_target_size(width, height, max_image_size, min_image_size):
190
+ target_width = 0
191
+ target_height = 0
192
+ if width < min_image_size:
193
+ target_width = min_image_size
194
+ elif width > max_image_size:
195
+ target_width = max_image_size
196
+
197
+ if height < min_image_size:
198
+ target_height = min_image_size
199
+ elif height > max_image_size:
200
+ target_height = max_image_size
201
+
202
+ if target_width == 0:
203
+ ratio = ((width - min_image_size) + int(0.5*min_image_size))//min_image_size
204
+ target_width = ratio * min_image_size + min_image_size
205
+
206
+ if target_height == 0:
207
+ ratio = ((height - min_image_size) + int(0.5*min_image_size))//min_image_size
208
+ target_height = ratio * min_image_size + min_image_size
209
+
210
+ return target_width, target_height
211
+
212
+ class EVAClipImageProcessor(ImageProcessingMixin):
213
+ def __init__(self, **kwargs) -> None:
214
+ super().__init__(**kwargs)
215
+ self.image_processor = image_transform(image_size=448)
216
+ self.img_size = 448
217
+
218
+ def _prepare_images(self, batch: List[List[Image.Image]]) -> torch.Tensor:
219
+ """
220
+ Convert images to tensors, reshape them, and stack them.
221
+ Args:
222
+ batch: A list of lists of images.
223
+ Returns:
224
+ preprocessed images (tensors) or None
225
+ shape (B, T_img, F, C, H, W)
226
+ None if no images in batch
227
+ """
228
+
229
+ target_image_num = []
230
+ target_shape = []
231
+ for x in batch:
232
+ width, height = x[0].size
233
+ tar_wid, tar_hei = get_target_size(width, height, 1344, self.img_size)
234
+ target_shape.append((tar_wid, tar_hei))
235
+ target_image_num.append(int(tar_wid/self.img_size*tar_hei/self.img_size))
236
+
237
+ images_per_example = max(target_image_num)
238
+ batch_images = None
239
+ image_mask = None
240
+ sub_image_shape = None
241
+ for iexample, example in enumerate(batch):
242
+ for img in example:
243
+ img_ori = img
244
+ tar_wid, tar_hei = target_shape[iexample]
245
+ img_new = img.resize((tar_wid, tar_hei), Image.BILINEAR)
246
+ sub_images = [img_ori]
247
+
248
+ for y in range(0, tar_hei, self.img_size):
249
+ for x in range(0, tar_wid, self.img_size):
250
+ sub_img = img_new.crop((x, y, x + self.img_size, y + self.img_size))
251
+ sub_images.append(sub_img)
252
+
253
+ for iimage, image in enumerate(sub_images):
254
+ preprocessed = self.image_processor(image)
255
+ if batch_images is None:
256
+ batch_images = torch.zeros(
257
+ (len(batch), images_per_example+1, 1) + preprocessed.shape,
258
+ dtype=preprocessed.dtype,
259
+ )
260
+ batch_images[iexample, iimage, 0] = preprocessed
261
+ if not torch.is_tensor(image_mask):
262
+ image_mask = torch.zeros((len(batch), images_per_example+1), dtype=preprocessed.dtype)
263
+ image_mask[iexample,:target_image_num[iexample]+1] = 1.0
264
+ if not torch.is_tensor(sub_image_shape):
265
+ sub_image_shape = torch.zeros((len(batch), 2), dtype=preprocessed.dtype)
266
+ sub_image_shape[iexample, 0], sub_image_shape[iexample, 1] = tar_wid/self.img_size, tar_hei/self.img_size
267
+
268
+ # if batch_images is not None:
269
+ # batch_images = batch_images.to(
270
+ # self.device, dtype=self.cast_dtype, non_blocking=True
271
+ # )
272
+
273
+ # if image_mask is not None:
274
+ # image_mask = image_mask.to(
275
+ # self.device, dtype=self.cast_dtype, non_blocking=True
276
+ # )
277
+
278
+ # if sub_image_shape is not None:
279
+ # sub_image_shape = sub_image_shape.to(
280
+ # self.device, dtype=self.cast_dtype, non_blocking=True
281
+ # )
282
+ return batch_images, image_mask, sub_image_shape
283
+
284
+ def preprocess(self, imgpaths=None):
285
+ if imgpaths is None or len(imgpaths) == 0:
286
+ images = [(Image.new("RGB", (224, 224), color="black"))]
287
+ else:
288
+ images = [Image.open(fp) for fp in imgpaths]
289
+ return self._prepare_images([images])
290
+
291
+
292
+ class InfiMMHDProcessor(ProcessorMixin):
293
+ r"""
294
+ Constructs an InfiMM-HD processor which wraps a tokenizer and an image processor into a single processor.
295
+
296
+ Args:
297
+ image_processor (`EVAClipImageProcessor`):
298
+ An instance of [`EVAClipImageProcessor`]. The image processor is a required input.
299
+ tokenizer (`LlamaTokenizer`):
300
+ An instance of [`LlamaTokenizer`]. The tokenizer is a required input.
301
+ image_size (`int`, *optional*, defaults to 336): Image size (assuming a square image)
302
+ """
303
+
304
+ attributes = ["tokenizer"]
305
+ tokenizer_class = "LlamaTokenizer"
306
+
307
+ def __init__(self, tokenizer=None, **kwargs):
308
+ self.image_processor = EVAClipImageProcessor()
309
+ if tokenizer is None:
310
+ tokenizer = AutoTokenizer.from_pretrained("infimm-hd", verbose=False)
311
+
312
+ super().__init__(tokenizer)
313
+
314
+ def _prepare_text(
315
+ self,
316
+ batch: List[List[str]],
317
+ padding="longest",
318
+ truncation=True,
319
+ max_length=2048,
320
+ ):
321
+ """
322
+ Tokenize the text and stack them.
323
+ Args:
324
+ batch: A list of lists of strings.
325
+ Returns:
326
+ input_ids (tensor)
327
+ shape (B, T_txt)
328
+ attention_mask (tensor)
329
+ shape (B, T_txt)
330
+ """
331
+ batch = [b.strip() for b in batch]
332
+ encodings = self.tokenizer(
333
+ batch,
334
+ padding=padding,
335
+ truncation=truncation,
336
+ return_tensors="pt",
337
+ max_length=max_length,
338
+ )
339
+ input_ids, attention_mask = encodings["input_ids"], encodings["attention_mask"]
340
+ # print(self.tokenizer.convert_ids_to_tokens(input_ids[]))
341
+ return input_ids, attention_mask
342
+
343
+ def __call__(
344
+ self,
345
+ prompts,
346
+ ) -> BatchEncoding:
347
+ """This method takes batched or non-batched prompts made of text and images and converts them into prompts that
348
+ the model was trained on and prepares the image pixel values for the model to process.
349
+ """
350
+ image_paths = self._extract_image_paths(prompts)
351
+ images, image_mask, sub_image_shape = self.image_processor.preprocess(image_paths)
352
+ prompts = self._replace_with_media_tokens(prompts)
353
+ final_prompt = self.apply_template(prompts)
354
+ # system_prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
355
+ # final_prompt = f"{system_prompt} USER: <image>" + prompts + " ASSISTANT:"
356
+ input_ids, attention_mask = self._prepare_text([final_prompt])
357
+ return BatchEncoding(
358
+ data={
359
+ "input_ids": input_ids,
360
+ "attention_mask": attention_mask,
361
+ "batch_images": images,
362
+ "image_mask": image_mask,
363
+ "subimage_shape": sub_image_shape,
364
+ }
365
+ )
366
+
367
+ def _extract_image_paths(self, prompts):
368
+ image_paths = []
369
+ for round in prompts:
370
+ if round["role"] != "user":
371
+ continue
372
+ for piece in round["content"]:
373
+ if isinstance(piece, dict):
374
+ image_paths.append(piece["image"])
375
+ return image_paths
376
+
377
+ def _replace_with_media_tokens(self, prompts):
378
+ new_prompts = []
379
+ is_first_img = True
380
+ for round in prompts:
381
+ if round["role"] != "user":
382
+ new_prompts.append(round)
+ continue  # non-user rounds are passed through unchanged
383
+ new_content = []
384
+ for piece in round["content"]:
385
+ if isinstance(piece, dict):
386
+ new_content.append(
387
+ f"{IMAGE_TOKEN}" if is_first_img
388
+ else f"{END_OF_CHUNK_TOKEN}{IMAGE_TOKEN}"
389
+ )
390
+ is_first_img = False
391
+ else:
392
+ new_content.append(piece)
393
+ new_prompts.append({"role": "user", "content": "".join(new_content)})
394
+ return new_prompts
395
+
396
+ def apply_template(self, messages, task="generation"):
397
+ prompt = self.tokenizer.apply_chat_template(
398
+ messages,
399
+ tokenize=False,
400
+ add_generation_prompt=True if task == "generation" else False,
401
+ )
402
+ return prompt
403
+
404
+ def batch_decode(self, *args, **kwargs):
405
+ """
406
+ This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
407
+ refer to the docstring of this method for more information.
408
+ """
409
+ return self.tokenizer.batch_decode(*args, **kwargs)
410
+
411
+ def decode(self, *args, **kwargs):
412
+ """
413
+ This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
414
+ the docstring of this method for more information.
415
+ """
416
+ return self.tokenizer.decode(*args, **kwargs)
417
+
418
+ @property
419
+ def model_input_names(self):
420
+ tokenizer_input_names = self.tokenizer.model_input_names
421
+ image_processor_input_names = self.image_processor.model_input_names
422
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
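For reference, a quick check of how `get_target_size` snaps an input resolution onto the 448-pixel sub-image grid used by `EVAClipImageProcessor` (capped at 1344 per side, as in `_prepare_images`), and how many crops that yields on top of the global view. This assumes the repo files are importable locally with their dependencies installed:

from processing_infimm_hd import get_target_size

for width, height in [(600, 500), (1300, 800), (2000, 300)]:
    tw, th = get_target_size(width, height, max_image_size=1344, min_image_size=448)
    n_tiles = (tw // 448) * (th // 448)
    print(f"{width}x{height} -> resized to {tw}x{th}: {n_tiles} sub-images + 1 global view")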
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:688e7927fe4f8a80b8d6905d77fdb0922b53f61ed5f7345749408a8654bca4fa
3
+ size 35997587561
special_tokens_map.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|endofchunk|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<image>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": {
19
+ "content": "<s>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "eos_token": {
26
+ "content": "</s>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "pad_token": {
33
+ "content": "<unk>",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ },
39
+ "unk_token": {
40
+ "content": "<unk>",
41
+ "lstrip": false,
42
+ "normalized": false,
43
+ "rstrip": false,
44
+ "single_word": false
45
+ }
46
+ }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "32000": {
30
+ "content": "<|endofchunk|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "32001": {
38
+ "content": "<image>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ }
45
+ },
46
+ "additional_special_tokens": [
47
+ "<|endofchunk|>",
48
+ "<image>"
49
+ ],
50
+ "bos_token": "<s>",
51
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ system_message }}{% endif %}{% if message['role'] == 'user' %}{{ ' USER: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' ASSISTANT:' }}{% endif %}",
52
+ "clean_up_tokenization_spaces": false,
53
+ "eos_token": "</s>",
54
+ "legacy": false,
55
+ "model_max_length": 4096,
56
+ "pad_token": "<unk>",
57
+ "padding_side": "left",
58
+ "sp_model_kwargs": {},
59
+ "spaces_between_special_tokens": false,
60
+ "tokenizer_class": "LlamaTokenizer",
61
+ "unk_token": "<unk>",
62
+ "use_default_system_prompt": false
63
+ }
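The `chat_template` above prefixes a Vicuna-style system message, wraps each user turn as ` USER: ...`, and appends ` ASSISTANT:` when a generation prompt is requested. A rough sketch of the rendered prompt for a single-image query (assuming the repository has been cloned into the current directory so its tokenizer files can be loaded):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")  # load the tokenizer shipped in this repo
messages = [{"role": "user", "content": "<image>What is in this picture?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# prompt should look roughly like:
# "A chat between a curious user and an artificial intelligence assistant. The assistant gives
#  helpful, detailed, and polite answers to the user's questions. USER: <image>What is in this
#  picture? ASSISTANT:"
print(prompt)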
utils.py ADDED
@@ -0,0 +1,98 @@
1
+ import torch
+
+
+ def extend_instance(obj, mixin):
2
+ """Apply mixins to a class instance after creation"""
3
+ base_cls = obj.__class__
4
+ base_cls_name = obj.__class__.__name__
5
+ obj.__class__ = type(
6
+ base_cls_name, (mixin, base_cls), {}
7
+ ) # mixin needs to go first for our forward() logic to work
8
+
9
+
10
+ def getattr_recursive(obj, att):
11
+ """
12
+ Return nested attribute of obj
13
+ Example: getattr_recursive(obj, 'a.b.c') is equivalent to obj.a.b.c
14
+ """
15
+ if att == "":
16
+ return obj
17
+ i = att.find(".")
18
+ if i < 0:
19
+ return getattr(obj, att)
20
+ else:
21
+ return getattr_recursive(getattr(obj, att[:i]), att[i + 1 :])
22
+
23
+
24
+ def setattr_recursive(obj, att, val):
25
+ """
26
+ Set nested attribute of obj
27
+ Example: setattr_recursive(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val
28
+ """
29
+ if "." in att:
30
+ obj = getattr_recursive(obj, ".".join(att.split(".")[:-1]))
31
+ setattr(obj, att.split(".")[-1], val)
32
+
33
+
34
+ def _infer_decoder_layers_attr_name(model):
35
+ for k in __KNOWN_DECODER_LAYERS_ATTR_NAMES:
36
+ if k.lower() in model.__class__.__name__.lower():
37
+ return __KNOWN_DECODER_LAYERS_ATTR_NAMES[k]
38
+
39
+ raise ValueError(
40
+ "We require the attribute name for the nn.ModuleList in the decoder storing"
41
+ " the transformer block layers. Please supply this string manually."
42
+ )
43
+
44
+
45
+ __KNOWN_DECODER_LAYERS_ATTR_NAMES = {
46
+ "llama": "model.layers",
47
+ "mistral": "model.layers",
48
+ }
49
+
50
+ def resize_eva_pos_embed(state_dict, model, interpolation: str = "bicubic", seq_dim=1):
51
+ # interpolate position embedding
52
+ if "pos_embed" in state_dict:
53
+ pos_embed_checkpoint = state_dict["pos_embed"]
54
+ embedding_size = pos_embed_checkpoint.shape[-1]
55
+ num_patches = model.patch_embed.num_patches
56
+ num_extra_tokens = model.pos_embed.shape[-2] - num_patches
57
+ # height (== width) for the checkpoint position embedding
58
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
59
+ # height (== width) for the new position embedding
60
+ new_size = int(num_patches**0.5)
61
+ # class_token and dist_token are kept unchanged
62
+ if orig_size != new_size:
63
+ print(
64
+ "Position interpolate from %dx%d to %dx%d"
65
+ % (orig_size, orig_size, new_size, new_size)
66
+ )
67
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
68
+ # only the position tokens are interpolated
69
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
70
+ pos_tokens = pos_tokens.reshape(
71
+ -1, orig_size, orig_size, embedding_size
72
+ ).permute(0, 3, 1, 2)
73
+ # Convert to float for interpolation
74
+ pos_tokens = pos_tokens.float()
75
+
76
+ pos_tokens = torch.nn.functional.interpolate(
77
+ pos_tokens,
78
+ size=(new_size, new_size),
79
+ mode="bicubic",
80
+ align_corners=False,
81
+ )
82
+ # Convert back to Half if needed
83
+ pos_tokens = pos_tokens.half()
84
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
85
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
86
+ state_dict["pos_embed"] = new_pos_embed
87
+
88
+ patch_embed_proj = state_dict["patch_embed.proj.weight"]
89
+ patch_size = model.patch_embed.patch_size
90
+ # Convert to float for interpolation
91
+ patch_embed_proj = patch_embed_proj.float()
92
+ state_dict["patch_embed.proj.weight"] = torch.nn.functional.interpolate(
93
+ patch_embed_proj.float(),
94
+ size=patch_size,
95
+ mode="bicubic",
96
+ align_corners=False,
97
+ )
98
+ state_dict["patch_embed.proj.weight"] = state_dict["patch_embed.proj.weight"].half()
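A small standalone demonstration of `extend_instance` and `getattr_recursive` (assuming they are importable, e.g. `from utils import ...`): `extend_instance` places the mixin ahead of the original class in the MRO, which is what lets `FlamingoLMMixin.forward` wrap the language model's own `forward`:

from utils import extend_instance, getattr_recursive

class Base:
    def forward(self):
        return "base forward"

class Mixin:
    def forward(self):
        return "mixin first, then " + super().forward()

obj = Base()
extend_instance(obj, Mixin)        # Mixin now precedes Base in type(obj).__mro__
print(obj.forward())               # -> "mixin first, then base forward"
print(getattr_recursive(obj, ""))  # an empty attribute path returns the object itself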