Upload 14 files
- llava/constants.py +9 -0
- llava/conversation.py +213 -0
- llava/model/clip_encoder.py +132 -0
- llava/model/llava_arch.py +250 -0
- llava/model/llava_gpt2.py +111 -0
- llava/model/llava_gpt_neox.py +112 -0
- llava/model/llava_llama.py +110 -0
- llava/model/vision_projector.py +106 -0
- llava/s2wrapper/__init__.py +2 -0
- llava/s2wrapper/core.py +74 -0
- llava/s2wrapper/utils.py +32 -0
- llava/train/arguments_dataclass.py +88 -0
- llava/train/dataset.py +306 -0
- llava/train/llava_trainer.py +237 -0
llava/constants.py
ADDED
@@ -0,0 +1,9 @@
+CONTROLLER_HEART_BEAT_EXPIRATION = 30
+WORKER_HEART_BEAT_INTERVAL = 15
+
+LOGDIR = "."
+
+# Model Constants
+IGNORE_INDEX = -100
+IMAGE_TOKEN_INDEX = -200
+DEFAULT_IMAGE_TOKEN = "<image>"
llava/conversation.py
ADDED
@@ -0,0 +1,213 @@
+import dataclasses
+from enum import auto, Enum
+from typing import List, Tuple
+
+
+class SeparatorStyle(Enum):
+    """Different separator style."""
+    SINGLE = auto()
+    PLAIN = auto()
+    TWO = auto()
+
+
+@dataclasses.dataclass
+class Conversation:
+    """A class that keeps all conversation history."""
+    system: str
+    roles: List[str]
+    messages: List[List[str]]
+    offset: int
+    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+    sep: str = "###"
+    sep2: str = None
+    version: str = "Unknown"
+
+    skip_next: bool = False
+
+    def get_prompt(self):
+        messages = self.messages
+        if len(messages) > 0 and type(messages[0][1]) is tuple:
+            messages = self.messages.copy()
+            init_role, init_msg = messages[0].copy()
+            init_msg = init_msg[0].replace("<image>", "").strip()
+            messages[0] = (init_role, "<image>\n" + init_msg)
+
+        if self.sep_style == SeparatorStyle.SINGLE:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"
+        elif self.sep_style == SeparatorStyle.TWO:
+            seps = [self.sep, self.sep2]
+            ret = self.system + seps[0]
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ": "
+        elif self.sep_style == SeparatorStyle.PLAIN:
+            seps = [self.sep, self.sep2]
+            ret = self.system
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += message + seps[i % 2]
+                else:
+                    ret += ""
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+
+        return ret
+
+    def append_message(self, role, message):
+        self.messages.append([role, message])
+
+    def get_images(self, return_pil=False):
+        images = []
+        for i, (role, msg) in enumerate(self.messages[self.offset:]):
+            if i % 2 == 0:
+                if type(msg) is tuple:
+                    import base64
+                    from io import BytesIO
+                    from PIL import Image
+                    msg, image, image_process_mode = msg
+                    if image_process_mode == "Pad":
+                        def expand2square(pil_img, background_color=(122, 116, 104)):
+                            width, height = pil_img.size
+                            if width == height:
+                                return pil_img
+                            elif width > height:
+                                result = Image.new(pil_img.mode, (width, width), background_color)
+                                result.paste(pil_img, (0, (width - height) // 2))
+                                return result
+                            else:
+                                result = Image.new(pil_img.mode, (height, height), background_color)
+                                result.paste(pil_img, ((height - width) // 2, 0))
+                                return result
+                        image = expand2square(image)
+                    elif image_process_mode in ["Default", "Crop"]:
+                        pass
+                    elif image_process_mode == "Resize":
+                        image = image.resize((336, 336))
+                    else:
+                        raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
+                    max_hw, min_hw = max(image.size), min(image.size)
+                    aspect_ratio = max_hw / min_hw
+                    max_len, min_len = 800, 400
+                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+                    longest_edge = int(shortest_edge * aspect_ratio)
+                    W, H = image.size
+                    if longest_edge != max(image.size):
+                        if H > W:
+                            H, W = longest_edge, shortest_edge
+                        else:
+                            H, W = shortest_edge, longest_edge
+                        image = image.resize((W, H))
+                    if return_pil:
+                        images.append(image)
+                    else:
+                        buffered = BytesIO()
+                        image.save(buffered, format="PNG")
+                        img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+                        images.append(img_b64_str)
+        return images
+
+    def to_gradio_chatbot(self):
+        ret = []
+        for i, (role, msg) in enumerate(self.messages[self.offset:]):
+            if i % 2 == 0:
+                if type(msg) is tuple:
+                    import base64
+                    from io import BytesIO
+                    msg, image, image_process_mode = msg
+                    max_hw, min_hw = max(image.size), min(image.size)
+                    aspect_ratio = max_hw / min_hw
+                    max_len, min_len = 800, 400
+                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+                    longest_edge = int(shortest_edge * aspect_ratio)
+                    W, H = image.size
+                    if H > W:
+                        H, W = longest_edge, shortest_edge
+                    else:
+                        H, W = shortest_edge, longest_edge
+                    image = image.resize((W, H))
+                    buffered = BytesIO()
+                    image.save(buffered, format="JPEG")
+                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+                    img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
+                    msg = img_str + msg.replace('<image>', '').strip()
+                    ret.append([msg, None])
+                else:
+                    ret.append([msg, None])
+            else:
+                ret[-1][-1] = msg
+        return ret
+
+    def copy(self):
+        return Conversation(
+            system=self.system,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            version=self.version)
+
+    def dict(self):
+        if len(self.get_images()) > 0:
+            return {
+                "system": self.system,
+                "roles": self.roles,
+                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
+                "offset": self.offset,
+                "sep": self.sep,
+                "sep2": self.sep2,
+            }
+        return {
+            "system": self.system,
+            "roles": self.roles,
+            "messages": self.messages,
+            "offset": self.offset,
+            "sep": self.sep,
+            "sep2": self.sep2,
+        }
+
+conv_vicuna_v1 = Conversation(
+    system="これは好奇心旺盛なユーザーと人工知能システムのチャットです。"
+           "システムはユーザーの質問に親切、詳細、丁寧に答える。",
+    roles=("ユーザー", "システム"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="<EOD|LLM-jp>",  # if you use llm-jp : <EOD|LLM-jp>, gpt2 and gpt_neox: </s>
+)
+
+conv_llava_plain = Conversation(
+    system="",
+    roles=("", ""),
+    messages=(
+    ),
+    offset=0,
+    sep_style=SeparatorStyle.PLAIN,
+    sep="\n",
+)
+
+default_conversation = conv_llava_plain
+conv_templates = {
+    "v1": conv_vicuna_v1,
+    "plain": conv_llava_plain,
+}
+
+
+if __name__ == "__main__":
+    print(default_conversation.get_prompt())
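
The Conversation dataclass above renders a prompt according to its SeparatorStyle. A minimal usage sketch for the registered "v1" template (illustrative only, not part of the uploaded files):

from llava.conversation import conv_templates

conv = conv_templates["v1"].copy()
conv.append_message(conv.roles[0], "<image>\nこの画像について説明してください。")
conv.append_message(conv.roles[1], None)  # empty slot for the model's reply
# SeparatorStyle.TWO alternates sep / sep2 and leaves the prompt ending with "システム: "
print(conv.get_prompt())
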
llava/model/clip_encoder.py
ADDED
@@ -0,0 +1,132 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+from transformers import (
+    CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig,
+    SiglipVisionModel, SiglipImageProcessor, SiglipVisionConfig
+)
+from llava.s2wrapper import forward as multiscale_forward
+
+
+class CLIPVisionTower(nn.Module):
+    def __init__(
+        self,
+        vision_tower_name: str="openai/clip-vit-large-patch14-336",
+        mm_vision_select_layer: int=-2,  # v1.5 is -2
+        mm_vision_select_feature: str="patch",
+        delay_load: bool=False,
+        requires_grad: bool=False,
+        scales: Optional[float] = None
+    ):
+        super().__init__()
+
+        self.is_loaded = False
+        self.requires_grad = requires_grad
+        self.scales = scales
+
+        self.vision_tower_name = vision_tower_name
+        self.select_layer = mm_vision_select_layer
+        self.select_feature = mm_vision_select_feature
+
+        self.image_processor = None
+        self.vision_tower = None
+
+        if not delay_load:
+            self.load_model()
+        else:
+            if "clip" in self.vision_tower_name:
+                self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
+            elif "siglip" in self.vision_tower_name:
+                self.cfg_only = SiglipVisionConfig.from_pretrained(self.vision_tower_name)
+            else:
+                raise ValueError(f'Unsupported vision_tower_name: {self.vision_tower_name}')
+
+    def load_model(self):
+        if "clip" in self.vision_tower_name:
+            self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
+            self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
+        elif "siglip" in self.vision_tower_name:
+            self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)
+            self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name)
+        else:
+            raise ValueError(f'Unsupported vision_tower_name: {self.vision_tower_name}')
+        self.vision_tower.requires_grad_(self.requires_grad)
+
+        self.is_loaded = True
+
+    def feature_select(self, image_forward_outs):
+        image_features = image_forward_outs.hidden_states[self.select_layer]
+        if self.select_feature == 'patch':
+            image_features = image_features[:, 1:]
+        elif self.select_feature == 'cls_patch':
+            image_features = image_features
+        else:
+            raise ValueError(f'Unexpected select feature: {self.select_feature}')
+        return image_features
+
+    @torch.no_grad()
+    def forward(self, images):
+        if type(images) is list:
+            image_features = []
+            for image in images:
+                if self.scales is None:
+                    image_feature = self._forward_feature(image.unsqueeze(0))
+                else:
+                    image_feature = multiscale_forward(
+                        self._forward_feature,
+                        image.unsqueeze(0),
+                        scales=self.scales,
+                        num_prefix_token=0,
+                        max_split_size=self.image_processor.size["height"]
+                    )
+                #image_feature = self.feature_select(image_forward_out).to(image.dtype)
+                image_features.append(image_feature)
+        else:
+            if self.scales is None:
+                image_features = self._forward_feature(images)
+            else:
+                image_features = multiscale_forward(
+                    self._forward_feature,
+                    images,
+                    scales=self.scales,
+                    num_prefix_token=0,
+                    max_split_size=self.image_processor.size["height"]
+                )
+            #image_features = self.feature_select(image_forward_outs).to(images.dtype)
+
+        return image_features
+
+    def _forward_feature(self, inputs):
+        return self.feature_select(self.vision_tower(inputs.to(device=self.device, dtype=self.dtype), output_hidden_states=True))
+
+    @property
+    def dummy_feature(self):
+        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+    @property
+    def dtype(self):
+        return self.vision_tower.dtype
+
+    @property
+    def device(self):
+        return self.vision_tower.device
+
+    @property
+    def config(self):
+        if self.is_loaded:
+            return self.vision_tower.config
+        else:
+            return self.cfg_only
+
+    @property
+    def hidden_size(self):
+        if self.scales is None:
+            return self.config.hidden_size
+
+        return self.config.hidden_size*len(self.scales)
+
+    @property
+    def num_patches(self):
+        return (self.config.image_size // self.config.patch_size) ** 2
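
A rough sketch of exercising the tower on its own (illustrative, not part of the upload; it downloads the default openai/clip-vit-large-patch14-336 weights):

import torch
from PIL import Image
from llava.model.clip_encoder import CLIPVisionTower

tower = CLIPVisionTower()  # defaults: CLIP ViT-L/14-336, patch features from layer -2
dummy = Image.new("RGB", (336, 336))
pixel_values = tower.image_processor(dummy, return_tensors="pt")["pixel_values"]
feats = tower(pixel_values)  # (1, 576, 1024) when no S2 scales are set
print(feats.shape, tower.hidden_size, tower.num_patches)
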
llava/model/llava_arch.py
ADDED
@@ -0,0 +1,250 @@
+# Copyright 2023 Haotian Liu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from abc import ABC, abstractmethod
+
+import torch
+
+from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX
+from llava.model.clip_encoder import CLIPVisionTower
+from llava.model.vision_projector import get_vision_projector
+
+
+class LlavaMetaModel:
+
+    def __init__(self, config):
+        super(LlavaMetaModel, self).__init__(config)
+        #self.config = config
+
+        if hasattr(config, "mm_vision_tower"):
+            self.initialize_vision_modules(config)
+        else:
+            self.vision_tower = None
+            self.mm_projector = None
+
+    def get_vision_tower(self):
+        vision_tower = getattr(self, 'vision_tower', None)
+        if type(vision_tower) is list:
+            vision_tower = vision_tower[0]
+        return vision_tower
+
+    def initialize_vision_modules(self, model_args):
+        vision_tower = model_args.vision_tower if hasattr(model_args, "vision_tower") else model_args.mm_vision_tower
+        mm_vision_select_layer = model_args.mm_vision_select_layer
+        mm_vision_select_feature = model_args.mm_vision_select_feature
+        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter if hasattr(model_args, "pretrain_mm_mlp_adapter") else None
+
+        self.config.mm_vision_tower = vision_tower
+        self.config.scales = model_args.scales if hasattr(model_args, 'scales') else None
+
+        self.vision_tower = CLIPVisionTower(
+            vision_tower,
+            mm_vision_select_layer,
+            mm_vision_select_feature,
+            delay_load=True,
+            scales=model_args.scales,
+        )
+        self.vision_tower.load_model()
+
+        self.config.use_mm_proj = True
+        self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
+        self.config.mm_hidden_size = self.vision_tower.hidden_size
+        self.config.mm_vision_select_layer = mm_vision_select_layer
+        self.config.mm_vision_select_feature = mm_vision_select_feature
+
+        self.mm_projector = get_vision_projector(self.config)
+
+        # In case it is frozen by LoRA
+        for p in self.mm_projector.parameters():
+            p.requires_grad = True
+
+        if pretrain_mm_mlp_adapter is not None:
+            mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
+            def get_w(weights, keyword):
+                return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
+
+            self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))
+
+
+class LlavaMetaForCausalLM(ABC):
+    base_model = ""  # gpt2 or llama or gptneox
+
+    @abstractmethod
+    def get_model(self):
+        pass
+
+    def get_vision_tower(self):
+        return self.get_model().get_vision_tower()
+
+    def encode_images(self, images):
+        image_features = self.get_model().get_vision_tower()(images)
+        image_features = self.get_model().mm_projector(image_features)
+        return image_features
+
+    def embed(self, input_ids):
+        if self.base_model == "gpt2":
+            return self.transformer.wte(input_ids)
+        elif self.base_model == "gpt_neox":
+            return self.embed_in(input_ids)  # NeoX
+        elif self.base_model == "llama":
+            return self.get_model().embed_tokens(input_ids)  # Llama
+
+    def prepare_inputs_labels_for_multimodal(
+        self, input_ids, position_ids, attention_mask, past_key_values, labels, images
+    ):
+        vision_tower = self.get_vision_tower()
+        if vision_tower is None or images is None or input_ids.shape[1] == 1:
+            if past_key_values is not None and vision_tower is not None and images is not None and input_ids.shape[1] == 1:
+                target_shape = past_key_values[-1][-1].shape[-2] + 1
+                attention_mask = torch.cat((attention_mask, torch.ones(
+                    (attention_mask.shape[0], target_shape - attention_mask.shape[1]),
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device
+                )), dim=1)
+                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+            return input_ids, position_ids, attention_mask, past_key_values, None, labels
+
+        if type(images) is list or images.ndim == 5:
+            # Handling for the video case (a list of image tensors)
+            concat_images = torch.cat([image for image in images], dim=0)
+            image_features = self.encode_images(concat_images)
+            split_sizes = [image.shape[0] for image in images]
+            image_features = torch.split(image_features, split_sizes, dim=0)
+            image_features = [x.flatten(0, 1).to(self.device) for x in image_features]
+        else:
+            image_features = self.encode_images(images).to(self.device)
+
+        # Let's just add dummy tensors if they do not exist,
+        # it is a headache to deal with None all the time.
+        # But it is not ideal, and if you have a better idea,
+        # please open an issue / submit a PR, thanks.
+        _labels = labels
+        _position_ids = position_ids
+        _attention_mask = attention_mask
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
+        else:
+            attention_mask = attention_mask.bool()
+        if position_ids is None:
+            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
+        if labels is None:
+            labels = torch.full_like(input_ids, IGNORE_INDEX)
+
+        # remove the padding using attention_mask -- TODO: double check
+        input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
+        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
+
+        new_input_embeds = []
+        new_labels = []
+        cur_image_idx = 0
+        for batch_idx, cur_input_ids in enumerate(input_ids):
+            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
+            if num_images == 0:
+                cur_image_features = image_features[cur_image_idx]
+                cur_input_embeds_1 = self.embed(cur_input_ids)
+                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
+                new_input_embeds.append(cur_input_embeds)
+                new_labels.append(labels[batch_idx])
+                cur_image_idx += 1
+                continue
+
+            image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
+            cur_input_ids_noim = []
+            cur_labels = labels[batch_idx]
+            cur_labels_noim = []
+
+            # Split the tokens before and after each IMAGE_TOKEN_INDEX
+            # ex. input_ids -> cur_input_ids_noim
+            # [1 2 3 -200 4 5 6] -> [1 2 3], [4 5 6]
+            for i in range(len(image_token_indices) - 1):
+                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
+                cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
+            split_sizes = [x.shape[0] for x in cur_labels_noim]
+
+            # cur_input_embeds_no_im[0].size() (27, 768)
+            # cur_input_embeds_no_im[1].size() (xxx, 768)
+            cur_input_embeds = self.embed(torch.cat(cur_input_ids_noim))
+            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
+            cur_new_input_embeds = []
+            cur_new_labels = []
+
+            # Replace the IMAGE_TOKEN_INDEX positions with the image features
+            # cur_image_features.size() (576, 768)
+            for i in range(num_images + 1):
+                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
+                cur_new_labels.append(cur_labels_noim[i])
+                if i < num_images:
+                    cur_image_features = image_features[cur_image_idx]
+                    cur_image_idx += 1
+                    cur_new_input_embeds.append(cur_image_features)
+                    cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
+
+            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
+            cur_new_labels = torch.cat(cur_new_labels)
+            new_input_embeds.append(cur_new_input_embeds)
+            new_labels.append(cur_new_labels)
+
+        # Truncate sequences to max length as image embeddings can make the sequence longer
+        tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
+        if tokenizer_model_max_length is not None:
+            new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
+            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
+
+        # Combine them
+        max_len = max(x.shape[0] for x in new_input_embeds)
+        batch_size = len(new_input_embeds)
+
+        new_input_embeds_padded = []
+        new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
+        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
+        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
+
+        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
+            cur_len = cur_new_embed.shape[0]
+            if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
+                new_input_embeds_padded.append(torch.cat((
+                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
+                    cur_new_embed
+                ), dim=0))
+                if cur_len > 0:
+                    new_labels_padded[i, -cur_len:] = cur_new_labels
+                    attention_mask[i, -cur_len:] = True
+                    position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
+            else:
+                new_input_embeds_padded.append(torch.cat((
+                    cur_new_embed,
+                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
+                ), dim=0))
+                if cur_len > 0:
+                    new_labels_padded[i, :cur_len] = cur_new_labels
+                    attention_mask[i, :cur_len] = True
+                    position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
+
+        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
+
+        if _labels is None:
+            new_labels = None
+        else:
+            new_labels = new_labels_padded
+
+        if _attention_mask is None:
+            attention_mask = None
+        else:
+            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
+
+        if _position_ids is None:
+            position_ids = None
+
+        return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
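
The heart of prepare_inputs_labels_for_multimodal is splitting each sequence at IMAGE_TOKEN_INDEX (-200) and splicing the projected image features into the embedding sequence. A self-contained sketch of just the splitting step on a toy tensor:

import torch

IMAGE_TOKEN_INDEX = -200
cur_input_ids = torch.tensor([1, 2, 3, IMAGE_TOKEN_INDEX, 4, 5, 6])
image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
chunks = [cur_input_ids[image_token_indices[i] + 1:image_token_indices[i + 1]]
          for i in range(len(image_token_indices) - 1)]
print([c.tolist() for c in chunks])  # [[1, 2, 3], [4, 5, 6]]
# The image features (e.g. 576 x hidden_size) are concatenated between these chunks'
# embeddings, and the corresponding label positions are filled with IGNORE_INDEX (-100).
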
llava/model/llava_gpt2.py
ADDED
@@ -0,0 +1,111 @@
+# Copyright 2023 Haotian Liu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from transformers import AutoConfig, AutoModelForCausalLM, \
+    GPT2LMHeadModel, GPT2Config, PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
+
+
+class LlavaConfig(GPT2Config):
+    model_type = "llava-jp"
+
+
+class LlavaGpt2Model(LlavaMetaModel, PreTrainedModel):
+    config_class = LlavaConfig
+
+    def __init__(self, config: GPT2Config):
+        super(LlavaGpt2Model, self).__init__(config)
+
+
+class LlavaGpt2ForCausalLM(GPT2LMHeadModel, LlavaMetaForCausalLM):
+    config_class = LlavaConfig
+    base_model = "gpt2"
+
+    def __init__(self, config):
+        super(LlavaGpt2ForCausalLM, self).__init__(config)
+        self.model = LlavaGpt2Model(config)
+        #self.model = LlavaMetaModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_model(self):
+        return self.model
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        images: Optional[torch.FloatTensor] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+        if inputs_embeds is None:
+            (
+                input_ids,
+                position_ids,
+                attention_mask,
+                past_key_values,
+                inputs_embeds,
+                labels
+            ) = self.prepare_inputs_labels_for_multimodal(
+                input_ids,
+                position_ids,
+                attention_mask,
+                past_key_values,
+                labels,
+                images
+            )
+
+        return super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+        images = kwargs.pop("images", None)
+        _inputs = super().prepare_inputs_for_generation(
+            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
+        )
+        if images is not None:
+            _inputs['images'] = images
+        return _inputs
+
+AutoConfig.register("llava-jp", LlavaConfig)
+AutoModelForCausalLM.register(LlavaConfig, LlavaGpt2ForCausalLM)
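
Because the config/model pair is registered with the Auto classes at import time, a saved checkpoint can be reloaded through AutoModelForCausalLM. A sketch with a placeholder checkpoint path (illustrative, not part of the upload):

import torch
from transformers import AutoModelForCausalLM

import llava.model.llava_gpt2  # noqa: F401  (runs the AutoConfig/AutoModelForCausalLM registration)

# "path/to/llava-jp-checkpoint" is a placeholder for a directory written by save_pretrained()
model = AutoModelForCausalLM.from_pretrained("path/to/llava-jp-checkpoint", torch_dtype=torch.float16)
model.eval()
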
llava/model/llava_gpt_neox.py
ADDED
@@ -0,0 +1,112 @@
+# Copyright 2023 Haotian Liu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from transformers import AutoConfig, AutoModelForCausalLM, \
+    GPTNeoXModel, GPTNeoXForCausalLM, GPTNeoXConfig, PreTrainedModel
+
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
+from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
+
+
+class LlavaConfig(GPTNeoXConfig):
+    model_type = "llava-jp"
+
+
+class LlavaGptNeoxModel(LlavaMetaModel, GPTNeoXModel):
+    config_class = LlavaConfig
+
+    def __init__(self, config: GPTNeoXConfig):
+        super(LlavaGptNeoxModel, self).__init__(config)
+
+
+class LlavaGptNeoxForCausalLM(PreTrainedModel, LlavaMetaForCausalLM):
+    config_class = LlavaConfig
+    base_model = "gpt_neox"
+
+    def __init__(self, config):
+        super(LlavaGptNeoxForCausalLM, self).__init__(config)
+        self.model = LlavaGptNeoxModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_model(self):
+        return self.model
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        images: Optional[torch.FloatTensor] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+        if inputs_embeds is None:
+            (
+                input_ids,
+                position_ids,
+                attention_mask,
+                past_key_values,
+                inputs_embeds,
+                labels
+            ) = self.prepare_inputs_labels_for_multimodal(
+                input_ids,
+                position_ids,
+                attention_mask,
+                past_key_values,
+                labels,
+                images
+            )
+            # print(inputs_embeds.size())
+
+        return super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+        images = kwargs.pop("images", None)
+        _inputs = super().prepare_inputs_for_generation(
+            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
+        )
+        if images is not None:
+            _inputs['images'] = images
+        return _inputs
+
+AutoConfig.register("llava-jp", LlavaConfig)
+AutoModelForCausalLM.register(LlavaConfig, LlavaGptNeoxForCausalLM)
llava/model/llava_llama.py
ADDED
@@ -0,0 +1,110 @@
+# Copyright 2023 Haotian Liu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from transformers import AutoConfig, AutoModelForCausalLM, LlamaForCausalLM, \
+    LlamaModel, LlamaConfig
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
+
+
+class LlavaConfig(LlamaConfig):
+    model_type = "llava-jp"
+
+
+class LlavaLlamaModel(LlavaMetaModel, LlamaModel):
+    config_class = LlavaConfig
+
+    def __init__(self, config: LlamaConfig):
+        super(LlavaLlamaModel, self).__init__(config)
+
+
+class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
+    config_class = LlavaConfig
+    base_model = "llama"
+
+    def __init__(self, config):
+        super(LlavaLlamaForCausalLM, self).__init__(config)
+        self.model = LlavaLlamaModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_model(self):
+        return self.model
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        images: Optional[torch.FloatTensor] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+        if inputs_embeds is None:
+            (
+                input_ids,
+                position_ids,
+                attention_mask,
+                past_key_values,
+                inputs_embeds,
+                labels
+            ) = self.prepare_inputs_labels_for_multimodal(
+                input_ids,
+                position_ids,
+                attention_mask,
+                past_key_values,
+                labels,
+                images
+            )
+
+        return super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+        images = kwargs.pop("images", None)
+        _inputs = super().prepare_inputs_for_generation(
+            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
+        )
+        if images is not None:
+            _inputs['images'] = images
+        return _inputs
+
+AutoConfig.register("llava-jp", LlavaConfig)
+AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM)
llava/model/vision_projector.py
ADDED
@@ -0,0 +1,106 @@
+import math
+import re
+
+import torch
+import torch.nn as nn
+
+
+class IdentityMap(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, *args, **kwargs):
+        return x
+
+    @property
+    def config(self):
+        return {"mm_projector_type": 'identity'}
+
+
+class FeatureIRLayer(nn.Module):
+    def __init__(self, in_dim: int, out_dim: int) -> None:
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(in_dim, out_dim), nn.GELU(), nn.Linear(out_dim, out_dim)
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.mlp(x)
+
+class TokenDownLayer(nn.Module):
+    def __init__(self, shape) -> None:
+        super().__init__()
+        self.dwn = nn.Sequential(
+            nn.AdaptiveAvgPool2d(shape)
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        b, num_tokens, c = x.shape
+        h = int(math.sqrt(num_tokens))
+        if h * h == num_tokens:
+            x = x.permute(0, 2, 1).reshape(b, -1, h, h)
+        else:
+            # FIXME: can fail depending on the token-grid size
+            w = int(num_tokens/h)
+            assert w*h == num_tokens
+            x = x.permute(0, 2, 1).reshape(b, -1, w, h)
+
+        x = self.dwn(x)
+        x = x.flatten(2).transpose(1, 2)
+        return x
+
+
+class PosInjectLayer(nn.Module):
+    # https://github.com/Meituan-AutoML/Twins/blob/main/gvt.py
+    def __init__(self, in_dim: int, out_dim: int, stride: int = 1) -> None:
+        super().__init__()
+        self.peg = nn.Sequential(
+            nn.Conv2d(in_dim, out_dim, 3, stride, 1, bias=True, groups=out_dim)
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        b, num_tokens, c = x.shape
+        h = int(math.sqrt(num_tokens))
+        assert h * h == num_tokens
+        cnn_feat = x.transpose(1, 2).view(b, c, h, h)
+        x = self.peg(cnn_feat) + cnn_feat
+        x = x.flatten(2).transpose(1, 2)
+        return x
+
+
+class LDPNetV2Projector(nn.Module):
+    # https://github.com/Meituan-AutoML/MobileVLM/blob/main/mobilevlm/model/vision_projector.py
+    def __init__(self, config=None):
+        super().__init__()
+        inc, ouc = config.mm_hidden_size, config.hidden_size
+        self.mlp = FeatureIRLayer(inc, ouc)
+        self.dwn = TokenDownLayer((12, 12))
+        self.peg = PosInjectLayer(ouc, ouc, stride=1)
+
+    def forward(self, x):
+        x = self.mlp(x)
+        x = self.dwn(x)
+        x = self.peg(x)
+        return x
+
+
+def get_vision_projector(config, delay_load=False, **kwargs):
+    projector_type = getattr(config, 'mm_projector_type', 'linear')
+
+    if projector_type == 'linear':
+        return nn.Linear(config.mm_hidden_size, config.hidden_size)
+    elif projector_type == 'identity':
+        return IdentityMap()
+    elif projector_type == 'ldpnetv2':
+        return LDPNetV2Projector(config)
+
+    mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
+    if mlp_gelu_match:
+        mlp_depth = int(mlp_gelu_match.group(1))
+        modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
+        for _ in range(1, mlp_depth):
+            modules.append(nn.GELU())
+            modules.append(nn.Linear(config.hidden_size, config.hidden_size))
+        return nn.Sequential(*modules)
+
+    raise ValueError(f'Unknown projector type: {projector_type}')
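
The default in ModelArguments is mm_projector_type='mlp2x_gelu', i.e. Linear -> GELU -> Linear. A quick shape check with a dummy config (SimpleNamespace stands in for the real model config; illustrative only):

import torch
from types import SimpleNamespace
from llava.model.vision_projector import get_vision_projector

cfg = SimpleNamespace(mm_projector_type="mlp2x_gelu", mm_hidden_size=1024, hidden_size=768)
projector = get_vision_projector(cfg)  # Sequential(Linear(1024, 768), GELU(), Linear(768, 768))
x = torch.randn(1, 576, 1024)          # e.g. 576 CLIP patch features
print(projector(x).shape)              # torch.Size([1, 576, 768])
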
llava/s2wrapper/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .core import *
+from .utils import *
llava/s2wrapper/core.py
ADDED
@@ -0,0 +1,74 @@
+# ------------------------------------------------------------------------------------------
+# Copyright (c) 2024 Baifeng Shi.
+# All rights reserved.
+#
+# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
+# ------------------------------------------------------------------------------------------
+
+import math
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from .utils import split_chessboard, merge_chessboard
+
+def forward(model, input, scales=None, img_sizes=None, max_split_size=None, resize_output_to_idx=0, num_prefix_token=0,
+            output_shape='bnc'):
+
+    assert input.dim() == 4, "Input image must be in the shape of BxCxHxW."
+    assert input.shape[2] == input.shape[3], "Currently only square images are supported."
+    assert output_shape in ['bnc', 'bchw'], "Output shape should be either BxNxC (e.g., ViT) or BxCxHxW (e.g., ConvNet)."
+    assert output_shape == 'bnc' or num_prefix_token == 0, "For ConvNet there shouldn't be any prefix token."
+
+    b, c, input_size, _ = input.shape
+
+    # image size for each scale
+    assert scales is not None or img_sizes is not None, "Please assign either scales or img_sizes."
+    img_sizes = img_sizes or [int(input_size * scale) for scale in scales]
+
+    # prepare multiscale inputs
+    max_split_size = max_split_size or input_size  # The maximum size of each split of image. Set as the input size by default
+    num_splits = [math.ceil(size / max_split_size) for size in img_sizes]  # number of splits each scale
+    input_multiscale = []
+    for size, num_split in zip(img_sizes, num_splits):
+        x = F.interpolate(input.to(torch.float32), size=size, mode='bicubic').to(input.dtype)
+        x = split_chessboard(x, num_split=num_split)
+        input_multiscale.append(x)
+
+    # run feedforward on each scale
+    outs_multiscale = [model(x) for x in input_multiscale]
+    if num_prefix_token > 0:
+        outs_prefix_multiscale = [out[:, :num_prefix_token] for out in outs_multiscale]
+        outs_multiscale = [out[:, num_prefix_token:] for out in outs_multiscale]
+    if output_shape == 'bnc':
+        height = int(outs_multiscale[0].shape[1] ** 0.5)
+        if height**2 == outs_multiscale[0].shape[1]:
+            width = height
+        else:
+            width = int(outs_multiscale[0].shape[1]/height)
+            assert width*height == outs_multiscale[0].shape[1]
+        #print(height, width, outs_multiscale[0].shape[1])
+
+        # available by siglip
+        #outs_multiscale = [rearrange(out, 'b (h w) c -> b c h w', h=int(out.shape[1] ** 0.5), w=int(out.shape[1] ** 0.5))
+        #                   for out in outs_multiscale]
+        outs_multiscale = [rearrange(out, 'b (h w) c -> b c h w', h=height, w=width)
+                           for out in outs_multiscale]
+
+    # merge outputs of different splits for each scale separately
+    outs_multiscale = [merge_chessboard(out, num_split=num_split) for num_split, out in zip(num_splits, outs_multiscale)]
+
+    # interpolate outputs from different scales and concat together
+    #output_size = outs_multiscale[resize_output_to_idx].shape[-2]
+    output_size = [height, width]
+    out = torch.cat([F.interpolate(outs_multiscale[i].to(torch.float32), size=output_size,
+                                   mode='area').to(outs_multiscale[i].dtype)
+                     for i in range(len(outs_multiscale))], dim=1)
+    if output_shape == 'bnc':
+        out = rearrange(out, 'b c h w -> b (h w) c')
+    if num_prefix_token > 0:
+        # take the mean of prefix tokens from different splits for each scale
+        outs_prefix_multiscale = [torch.stack(out.split(b, dim=0), dim=0).mean(dim=0) for out in outs_prefix_multiscale]
+        out_prefix_multiscale = torch.cat(outs_prefix_multiscale, dim=-1)
+        out = torch.cat([out_prefix_multiscale, out], dim=1)
+
+    return out
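
forward() resizes the input to each scale, runs the wrapped model on chessboard splits, merges them back, and concatenates the per-scale features along the channel dimension. A toy sketch with a stand-in "ViT" (a 16x16 mean-pool patchifier) so the shapes can be checked without loading CLIP (illustrative only):

import torch
from llava.s2wrapper import forward as multiscale_forward

def toy_vit(x):
    # stand-in for a ViT: 16x16 patches, mean-pooled to (B, N, C)
    b, c, h, w = x.shape
    x = x.unfold(2, 16, 16).unfold(3, 16, 16)               # (b, c, h/16, w/16, 16, 16)
    return x.mean(dim=(-1, -2)).flatten(2).transpose(1, 2)  # (b, (h/16)*(w/16), c)

img = torch.randn(1, 3, 224, 224)
out = multiscale_forward(toy_vit, img, scales=[1, 2], max_split_size=224)
print(out.shape)  # torch.Size([1, 196, 6]): 196 tokens, channels concatenated across the two scales
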
llava/s2wrapper/utils.py
ADDED
@@ -0,0 +1,32 @@
+# ------------------------------------------------------------------------------------------
+# Copyright (c) 2024 Baifeng Shi.
+# All rights reserved.
+#
+# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
+# ------------------------------------------------------------------------------------------
+
+import torch
+
+def split_chessboard(x, num_split):
+    """
+    x: b * c * h * w
+    Dividing x into num_split**2 sub-squares, and concatenate all the sub-squares on the batch dimension
+    """
+    B, C, H, W = x.shape
+    assert H % num_split == 0 and W % num_split == 0
+    h, w = H // num_split, W // num_split
+    x_split = torch.cat([x[:, :, i*h:(i+1)*h, j*w:(j+1)*w] for i in range(num_split) for j in range(num_split)], dim=0)
+    return x_split
+
+def merge_chessboard(x, num_split):
+    """
+    x: b * c * h * w
+    Assuming x contains num_split**2 sub-squares concatenated along batch dimension, merge the sub-squares back to the original whole square.
+    (inverse of split_chessboard)
+    """
+    B, C, H, W = x.shape
+    assert B % (num_split**2) == 0
+    b = B // (num_split**2)
+    x_merge = torch.cat([torch.cat([x[(i*num_split + j)*b:(i*num_split + j + 1)*b] for j in range(num_split)], dim=-1)
+                         for i in range(num_split)], dim=-2)
+    return x_merge
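
split_chessboard and merge_chessboard are inverses of each other; a quick round-trip check (illustrative only):

import torch
from llava.s2wrapper.utils import split_chessboard, merge_chessboard

x = torch.randn(2, 3, 8, 8)
tiles = split_chessboard(x, num_split=2)         # (8, 3, 4, 4): the four tiles stacked on the batch dim
restored = merge_chessboard(tiles, num_split=2)
assert torch.equal(restored, x)
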
llava/train/arguments_dataclass.py
ADDED
@@ -0,0 +1,88 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Optional
+
+import transformers
+
+
+@dataclass
+class ModelArguments:
+    base_model: Optional[str] = field(default="gpt2",
+                                      metadata={"help": "gpt2 or gpt_neox or llama"})
+    model_name_or_path: Optional[str] = field(default="rinna/japanese-gpt2-xsmall")
+    version: Optional[str] = field(default="plain")
+    freeze_backbone: bool = field(default=False)  # Whether to freeze the LLM
+    tune_mm_mlp_adapter: bool = field(default=False)  # During pre-training, save only the mm_mlp_adapter.
+    vision_tower: Optional[str] = field(default="openai/clip-vit-large-patch14-336")
+    mm_vision_select_layer: Optional[int] = field(default=-2)  # default to the second-to-last layer
+    pretrain_mm_mlp_adapter: Optional[str] = field(default=None)  # set this when fine-tuning
+    mm_projector_type: Optional[str] = field(default='mlp2x_gelu')  # two linear layers with GELU
+    mm_vision_select_feature: Optional[str] = field(default="patch")
+    scales: Optional[list[float]] = field(default=None)
+
+
+@dataclass
+class DataArguments:
+    data_path: str = field(default="",
+                           metadata={"help": "Path to the training data."})
+    lazy_preprocess: bool = False
+    is_multimodal: bool = False
+    image_folder: Optional[str] = field(default="/home/toshi/work/llava_jp/input/LLaVA-CC3M-Pretrain-595K/images",
+                                        metadata={"help": "Path to image data."})
+    image_aspect_ratio: str = 'square'
+    image_size: Optional[int] = None
+
+
+@dataclass
+class TrainingArguments(transformers.TrainingArguments):
+    cache_dir: Optional[str] = field(default=None)
+    optim: str = field(default="adamw_torch")
+    model_max_length: int = field(
+        default=1024,
+        metadata={
+            "help":
+            "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
+        },
+    )
+    double_quant: bool = field(
+        default=True,
+        metadata={"help": "Compress the quantization statistics through double quantization."}
+    )
+    quant_type: str = field(
+        default="nf4",
+        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
+    )
+    bits: int = field(
+        default=16,
+        metadata={"help": "How many bits to use."}
+    )
+    lora_enable: bool = False
+    lora_r: int = 64
+    lora_alpha: int = 16
+    lora_dropout: float = 0.05
+    lora_weight_path: str = ""
+    lora_bias: str = "none"
+    mm_projector_lr: Optional[float] = None
+    group_by_modality_length: bool = field(default=False)  # dataset sampler option
+
+    fp16: bool = field(default=False)
+    bf16: bool = field(default=False)
+    output_dir: str = field(default="./output_llava/checkpoints/llava-v1.5-japanese-gpt2-xsmall")
+    num_train_epochs: int = field(default=1)
+    per_device_train_batch_size: int = field(default=32)
+    per_device_eval_batch_size: int = field(default=4)
+    gradient_accumulation_steps: int = field(default=1)
+    evaluation_strategy: str = field(default="no")
+    save_strategy: str = field(default="steps")
+    save_steps: int = field(default=24000)
+    save_total_limit: int = field(default=1)
+    learning_rate: float = field(default=1e-3)
+    weight_decay: float = field(default=0.)
+    warmup_ratio: float = field(default=0.03)
+    logging_steps: int = field(default=1)
+    model_max_length: int = field(default=1024)
+    gradient_checkpointing: bool = field(default=True)
+    dataloader_num_workers: int = field(default=16)
+    lr_scheduler_type: str = field(default="cosine")
+    seed: int = field(default=42)
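
These dataclasses are meant to be fed to transformers.HfArgumentParser; the training entry point itself is not part of this upload, so the following is only a sketch, and exact option handling depends on the installed transformers version:

import transformers
from llava.train.arguments_dataclass import ModelArguments, DataArguments, TrainingArguments

parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses(args=[
    "--base_model", "gpt2",
    "--model_name_or_path", "rinna/japanese-gpt2-xsmall",
    "--data_path", "dataset.json",  # placeholder path
    "--output_dir", "./output_llava/checkpoints/example",
])
print(model_args.vision_tower, training_args.learning_rate)
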
llava/train/dataset.py
ADDED
@@ -0,0 +1,306 @@
import copy
import json
import os

from dataclasses import dataclass
from typing import Dict, Sequence

import torch
import transformers

from PIL import Image
from torch.utils.data import Dataset

from llava import conversation as conversation_lib
from llava.constants import DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from llava.train.arguments_dataclass import DataArguments


def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]

    def insert_separator(X, sep):
        return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]

    input_ids = []
    offset = 0
    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
        offset = 1
        input_ids.append(prompt_chunks[0][0])

    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
        input_ids.extend(x[offset:])

    if return_tensors is not None:
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long)
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
    return input_ids
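As a quick illustration of what `tokenizer_image_token` returns (the tokenizer name is only an example and is not pinned down by this upload; any `transformers` tokenizer behaves the same way here):

from transformers import AutoTokenizer

# Hypothetical tokenizer choice: every "<image>" placeholder in the prompt is
# replaced by a single IMAGE_TOKEN_INDEX (-200) entry, which the model later
# swaps for the projected vision features.
tokenizer = AutoTokenizer.from_pretrained("rinna/japanese-gpt2-xsmall", use_fast=False)
ids = tokenizer_image_token("<image>\nこの画像を説明してください。", tokenizer)
assert ids.count(IMAGE_TOKEN_INDEX) == 1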


def preprocess_multimodal(
    sources: Sequence[str],
    data_args: DataArguments
) -> Dict:
    is_multimodal = data_args.is_multimodal
    if not is_multimodal:
        return sources

    for source in sources:
        for sentence in source:
            if DEFAULT_IMAGE_TOKEN in sentence['value']:
                sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip()
                sentence['value'] = DEFAULT_IMAGE_TOKEN + '\n' + sentence['value']
                sentence['value'] = sentence['value'].strip()
            replace_token = DEFAULT_IMAGE_TOKEN
            sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token)

    return sources


def preprocess_plain(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    # add end signal and concatenate together
    conversations = []
    for source in sources:
        assert len(source) == 2
        assert DEFAULT_IMAGE_TOKEN in source[0]['value']
        source[0]['value'] = DEFAULT_IMAGE_TOKEN
        conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep
        conversations.append(conversation)
    # tokenize conversations
    input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations]
    targets = copy.deepcopy(input_ids)
    for target, source in zip(targets, sources):
        tokenized_len = len(tokenizer_image_token(source[0]['value'], tokenizer))
        target[:tokenized_len] = IGNORE_INDEX

    return dict(input_ids=input_ids, labels=targets)
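For pretrain-style data, `preprocess_plain` keeps only the image placeholder plus the caption and masks everything up to the caption. A toy illustration (the caption is made up, `tokenizer` is the one from the sketch above, and it assumes the training script has already set `conversation_lib.default_conversation` to the PLAIN-style template):

sample = [[
    {"from": "ユーザー", "value": "<image>\n"},
    {"from": "システム", "value": "犬がボールで遊んでいる。"},
]]
out = preprocess_plain(copy.deepcopy(sample), tokenizer)
# out["input_ids"][0] contains a single IMAGE_TOKEN_INDEX entry for the image;
# out["labels"][0] masks that prefix with IGNORE_INDEX, so the loss is computed
# only on the caption and the trailing separator.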


def preprocess_v1(
    sources,
    tokenizer: transformers.PreTrainedTokenizer,
    has_image: bool = False
) -> Dict:
    conv = conversation_lib.default_conversation.copy()
    roles = {"ユーザー": conv.roles[0], "システム": conv.roles[1]}

    # Apply prompt templates
    conversations = []
    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != conv.roles[0]:
            # Skip the first one if it is not from human
            source = source[1:]

        conv.messages = []
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            assert role == conv.roles[j % 2], f"{i}"
            conv.append_message(role, sentence["value"])
        conversations.append(conv.get_prompt())

    # Tokenize conversations
    if has_image:
        input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
    else:
        input_ids = tokenizer(
            conversations,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        ).input_ids

    targets = input_ids.clone()

    assert conv.sep_style == conversation_lib.SeparatorStyle.TWO

    # Mask targets
    sep = conv.sep + conv.roles[1] + ": "
    for conversation, target in zip(conversations, targets):
        total_len = int(target.ne(tokenizer.pad_token_id).sum())  # currently unused

        rounds = conversation.split(conv.sep2)
        cur_len = 0  # upstream LLaVA starts at 1 here to skip the BOS token
        target[:cur_len] = IGNORE_INDEX
        for i, rou in enumerate(rounds):
            if rou == "":
                break

            parts = rou.split(sep)
            if len(parts) != 2:
                break
            parts[0] += sep

            if has_image:
                round_len = len(tokenizer_image_token(rou, tokenizer))
                instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
            else:
                round_len = len(tokenizer(rou).input_ids)
                instruction_len = len(tokenizer(parts[0]).input_ids) - 2

            target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
            cur_len += round_len
        target[cur_len:] = IGNORE_INDEX

    return dict(
        input_ids=input_ids,
        labels=targets,
    )


def preprocess(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    has_image: bool = False
) -> Dict:
    """
    Given a list of sources, each is a conversation list. This transform:
    1. Adds the signal '### ' at the beginning of each sentence, with end signal '\n';
    2. Concatenates conversations together;
    3. Tokenizes the concatenated conversation;
    4. Makes a deepcopy as the target. Masks human words with IGNORE_INDEX.
    """
    if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN:
        return preprocess_plain(sources, tokenizer)
    elif conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.TWO:
        return preprocess_v1(sources, tokenizer, has_image)
    else:
        raise ValueError(f"Invalid style: {conversation_lib.default_conversation.sep_style}")


class LazySupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(
        self, data_path: str,
        tokenizer: transformers.PreTrainedTokenizer,
        data_args: DataArguments,
    ):
        super(LazySupervisedDataset, self).__init__()

        list_data_dict = json.load(open(data_path, "r"))

        from pathlib import Path

        print("Formatting inputs...Skip in lazy mode")
        self.tokenizer = tokenizer
        self.list_data_dict = [i for i in list_data_dict if Path(data_args.image_folder, i['image']).is_file()]
        self.data_args = data_args

    def __len__(self):
        return len(self.list_data_dict)

    @property
    def lengths(self):
        length_list = []
        for sample in self.list_data_dict:
            img_tokens = 128 if 'image' in sample else 0
            length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens)
        return length_list

    @property
    def modality_lengths(self):
        length_list = []
        for sample in self.list_data_dict:
            cur_len = sum(len(conv['value'].split()) for conv in sample['conversations'])
            cur_len = cur_len if 'image' in sample else -cur_len  # positive = has an image, negative = text-only
            length_list.append(cur_len)
        return length_list

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        sources = self.list_data_dict[i]
        if isinstance(i, int):
            sources = [sources]
        assert len(sources) == 1, "Don't know why it is wrapped to a list"  # FIXME
        if 'image' in sources[0]:
            image_file = self.list_data_dict[i]['image']
            image_folder = self.data_args.image_folder
            processor = self.data_args.image_processor  # attached by the training script, not a declared field
            image = Image.open(os.path.join(image_folder, image_file)).convert('RGB')
            if self.data_args.image_aspect_ratio == 'pad':
                def expand2square(pil_img, background_color):
                    width, height = pil_img.size
                    if width == height:
                        return pil_img
                    elif width > height:
                        result = Image.new(pil_img.mode, (width, width), background_color)
                        result.paste(pil_img, (0, (width - height) // 2))
                        return result
                    else:
                        result = Image.new(pil_img.mode, (height, height), background_color)
                        result.paste(pil_img, ((height - width) // 2, 0))
                        return result
                image = expand2square(image, tuple(int(x*255) for x in processor.image_mean))
                image = processor.preprocess(
                    image,
                    return_tensors='pt',
                    size={"height": self.data_args.image_size, "width": self.data_args.image_size}
                )['pixel_values'][0]
            else:
                image = processor.preprocess(
                    image,
                    return_tensors='pt',
                    size={"height": self.data_args.image_size, "width": self.data_args.image_size}
                )['pixel_values'][0]
            sources = preprocess_multimodal(
                copy.deepcopy([e["conversations"] for e in sources]),
                self.data_args
            )
        else:
            sources = copy.deepcopy([e["conversations"] for e in sources])
        data_dict = preprocess(
            sources,
            self.tokenizer,
            has_image=('image' in self.list_data_dict[i]))
        if isinstance(i, int):
            data_dict = dict(input_ids=data_dict["input_ids"][0],
                             labels=data_dict["labels"][0])

        # an image exists in this sample
        if 'image' in self.list_data_dict[i]:
            data_dict['images'] = image
        elif self.data_args.is_multimodal:
            # no image in this sample, but the model is multimodal
            crop_size = self.data_args.image_processor.crop_size
            data_dict['images'] = torch.zeros(3, crop_size['height'], crop_size['width'])
        return data_dict


@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple([instance[key] for instance in instances]
                                  for key in ("input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id)
        labels = torch.nn.utils.rnn.pad_sequence(labels,
                                                 batch_first=True,
                                                 padding_value=IGNORE_INDEX)
        input_ids = input_ids[:, :self.tokenizer.model_max_length]
        labels = labels[:, :self.tokenizer.model_max_length]
        batch = dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

        if 'images' in instances[0]:
            images = [instance['images'] for instance in instances]
            if all(x is not None and x.shape == images[0].shape for x in images):
                batch['images'] = torch.stack(images)
            else:
                batch['images'] = images

        return batch
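The dataset and collator are usually bundled into a dict that is splatted into the Trainer call. A minimal wiring sketch (the helper name `make_supervised_data_module`, and the assumption that `image_processor`/`image_size` have already been attached to `data_args`, are illustrative rather than part of this upload):

def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
                                data_args: DataArguments) -> Dict:
    """Build the lazy dataset and collator used for supervised fine-tuning."""
    train_dataset = LazySupervisedDataset(
        data_path=data_args.data_path,
        tokenizer=tokenizer,
        data_args=data_args,
    )
    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return dict(train_dataset=train_dataset,
                eval_dataset=None,
                data_collator=data_collator)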
llava/train/llava_trainer.py
ADDED
@@ -0,0 +1,237 @@
import os
from typing import List, Optional

import torch
import torch.nn as nn

from torch.utils.data import Sampler
from transformers import Trainer
from transformers.trainer import (
    get_parameter_names,
    has_length,
    ALL_LAYERNORM_LAYERS,
    logger,
)


def split_to_even_chunks(indices, lengths, num_chunks):
    """
    Split a list of indices into `num_chunks` chunks of roughly equal total length.
    """

    if len(indices) % num_chunks != 0:
        return [indices[i::num_chunks] for i in range(num_chunks)]

    num_indices_per_chunk = len(indices) // num_chunks

    chunks = [[] for _ in range(num_chunks)]
    chunks_lengths = [0 for _ in range(num_chunks)]
    for index in indices:
        shortest_chunk = chunks_lengths.index(min(chunks_lengths))
        chunks[shortest_chunk].append(index)
        chunks_lengths[shortest_chunk] += lengths[index]
        if len(chunks[shortest_chunk]) == num_indices_per_chunk:
            chunks_lengths[shortest_chunk] = float("inf")

    return chunks


def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None):
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    assert all(l != 0 for l in lengths), "Should not have zero length."
    if all(l > 0 for l in lengths) or all(l < 0 for l in lengths):
        # all samples are in the same modality
        return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator)
    mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0])
    lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0])

    mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)]
    lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)]
    megabatch_size = world_size * batch_size
    mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)]
    lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)]

    last_mm = mm_megabatches[-1]
    last_lang = lang_megabatches[-1]
    additional_batch = last_mm + last_lang
    megabatches = mm_megabatches[:-1] + lang_megabatches[:-1]
    megabatch_indices = torch.randperm(len(megabatches), generator=generator)
    megabatches = [megabatches[i] for i in megabatch_indices]

    if len(additional_batch) > 0:
        megabatches.append(sorted(additional_batch))

    return [i for megabatch in megabatches for i in megabatch]


def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True):
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    indices = torch.randperm(len(lengths), generator=generator)
    megabatch_size = world_size * batch_size
    megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)]
    megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches]
    megabatches = [split_to_even_chunks(megabatch, lengths, world_size) for megabatch in megabatches]

    return [i for megabatch in megabatches for batch in megabatch for i in batch]
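In `modality_lengths` (dataset.py above), positive values mark samples with an image and negative values mark text-only samples; the helpers here keep each full megabatch within one modality and length-sorted inside it. A toy run with made-up numbers:

lengths = [120, 45, -30, 200, -80, 64, -15, 90]   # hypothetical sample lengths
idx = get_modality_length_grouped_indices(lengths, batch_size=2, world_size=2)
# idx is a permutation of range(len(lengths)); each full megabatch of
# batch_size * world_size indices shares one modality, and the leftovers from
# both modalities are merged into a final catch-up batch.
print(idx)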


class LengthGroupedSampler(Sampler):
    # Apparently only used during fine-tuning (when group_by_modality_length is enabled).
    r"""
    Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while
    keeping a bit of randomness.
    """

    def __init__(
        self,
        batch_size: int,
        world_size: int,
        lengths: Optional[List[int]] = None,
        generator=None,
        group_by_modality: bool = False,
    ):
        if lengths is None:
            raise ValueError("Lengths must be provided.")

        self.batch_size = batch_size
        self.world_size = world_size
        self.lengths = lengths
        self.generator = generator
        self.group_by_modality = group_by_modality

    def __len__(self):
        return len(self.lengths)

    def __iter__(self):
        if self.group_by_modality:
            indices = get_modality_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator)
        else:
            indices = get_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator)
        return iter(indices)


def get_mm_adapter_state(named_params, keys_to_match):
    to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)}
    to_return = {k: v.detach().cpu().clone() for k, v in to_return.items()}
    return to_return


class LLaVATrainer(Trainer):

    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
        if self.train_dataset is None or not has_length(self.train_dataset):
            return None

        if self.args.group_by_modality_length:
            lengths = self.train_dataset.modality_lengths
            return LengthGroupedSampler(
                self.args.train_batch_size,
                world_size=self.args.world_size * self.args.gradient_accumulation_steps,
                lengths=lengths,
                group_by_modality=True,
            )
        else:
            return super()._get_train_sampler()

    def create_optimizer(self):
        """
        Setup the optimizer.

        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
        Trainer's init through `optimizers`, or subclass and override this method in a subclass.
        """
        opt_model = self.model

        if self.optimizer is None:
            decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
            decay_parameters = [name for name in decay_parameters if "bias" not in name]
            if self.args.mm_projector_lr is not None:
                projector_parameters = [name for name, _ in opt_model.named_parameters() if "mm_projector" in name]
                optimizer_grouped_parameters = [
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in projector_parameters and p.requires_grad)
                        ],
                        "weight_decay": self.args.weight_decay,
                    },
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in projector_parameters and p.requires_grad)
                        ],
                        "weight_decay": 0.0,
                    },
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in projector_parameters and p.requires_grad)
                        ],
                        "weight_decay": self.args.weight_decay,
                        "lr": self.args.mm_projector_lr,
                    },
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in projector_parameters and p.requires_grad)
                        ],
                        "weight_decay": 0.0,
                        "lr": self.args.mm_projector_lr,
                    },
                ]
            else:
                optimizer_grouped_parameters = [
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad)
                        ],
                        "weight_decay": self.args.weight_decay,
                    },
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad)
                        ],
                        "weight_decay": 0.0,
                    },
                ]

            optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)

            self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
            if optimizer_cls.__name__ == "Adam8bit":
                import bitsandbytes

                manager = bitsandbytes.optim.GlobalOptimManager.get_instance()

                skipped = 0
                for module in opt_model.modules():
                    if isinstance(module, nn.Embedding):
                        skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
                        logger.info(f"skipped {module}: {skipped/2**20}M params")
                        manager.register_module_override(module, "weight", {"optim_bits": 32})
                        logger.debug(f"bitsandbytes: will optimize {module} in fp32")
                logger.info(f"skipped: {skipped/2**20}M params")

        return self.optimizer

    def _save_checkpoint(self, model, trial, metrics=None):
        if getattr(self.args, 'tune_mm_mlp_adapter', False):
            from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
            checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"

            run_dir = self._get_output_dir(trial=trial)
            output_dir = os.path.join(run_dir, checkpoint_folder)

            # Only save the adapter
            # keys_to_match = ['mm_projector', 'vision_resampler']
            keys_to_match = ['mm_projector']
            weight_to_save = get_mm_adapter_state(self.model.named_parameters(), keys_to_match)
            # weight_to_save = self.model.named_parameters().detach().cpu().clone()

            if self.args.local_rank == 0 or self.args.local_rank == -1:
                self.model.config.save_pretrained(output_dir)
                torch.save(weight_to_save, os.path.join(output_dir, 'mm_projector.bin'))
        else:
            super(LLaVATrainer, self)._save_checkpoint(model, trial, metrics)

    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        if getattr(self.args, 'tune_mm_mlp_adapter', False):
            pass
        else:
            super(LLaVATrainer, self)._save(output_dir, state_dict)
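Putting the pieces together, a hedged glue sketch (the function name and the assumption that the caller has already built the model, the tokenizer, and the data module from dataset.py are illustrative; only `LLaVATrainer` and the classes shown above are real here):

def run_training(model, tokenizer, training_args, data_module):
    """Hypothetical driver: hand the prepared pieces to LLaVATrainer and train."""
    trainer = LLaVATrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,   # TrainingArguments from arguments_dataclass.py
        **data_module,        # train_dataset / eval_dataset / data_collator
    )
    trainer.train()
    trainer.save_state()
    return trainer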