VictorSanh committed
Commit b1b2476
1 Parent(s): 7515eca

big renaming

README.md CHANGED
@@ -19,6 +19,74 @@ It is based on a very early checkpoint of our forthcoming vision-language founda
 
 This is very much an alpha version. The goal is to kick off an effort to develop improved models capable of converting a website screenshot into actual code.
 
+# Code snippet
+
+```python
+import os
+
+import torch
+from PIL import Image
+from transformers import AutoModelForCausalLM, AutoProcessor
+from transformers.image_utils import to_numpy_array, PILImageResampling, ChannelDimension
+from transformers.image_transforms import resize, to_channel_dimension_format
+
+# Hub access token; illustrative setup reading the HF_TOKEN environment variable.
+# Adjust to your own setup, or drop `token=` entirely for public checkpoints.
+API_TOKEN = os.environ.get("HF_TOKEN")
+
+DEVICE = torch.device("cuda")
+PROCESSOR = AutoProcessor.from_pretrained(
+    "HuggingFaceM4/VLM_WebSight_finetuned",
+    token=API_TOKEN,
+)
+MODEL = AutoModelForCausalLM.from_pretrained(
+    "HuggingFaceM4/VLM_WebSight_finetuned",
+    token=API_TOKEN,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+).to(DEVICE)
+image_seq_len = MODEL.config.perceiver_config.resampler_n_latents
+BOS_TOKEN = PROCESSOR.tokenizer.bos_token
+BAD_WORDS_IDS = PROCESSOR.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+
+def convert_to_rgb(image):
+    # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
+    # for transparent images. The call to `alpha_composite` handles this case
+    if image.mode == "RGB":
+        return image
+
+    image_rgba = image.convert("RGBA")
+    background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
+    alpha_composite = Image.alpha_composite(background, image_rgba)
+    alpha_composite = alpha_composite.convert("RGB")
+    return alpha_composite
+
+
+# The processor is the same as the Idefics processor except for the BILINEAR interpolation,
+# so this is a hack in order to redefine ONLY the transform method
+def custom_transform(x):
+    x = convert_to_rgb(x)
+    x = to_numpy_array(x)
+    x = resize(x, (960, 960), resample=PILImageResampling.BILINEAR)
+    x = PROCESSOR.image_processor.rescale(x, scale=1 / 255)
+    x = PROCESSOR.image_processor.normalize(
+        x,
+        mean=PROCESSOR.image_processor.image_mean,
+        std=PROCESSOR.image_processor.image_std,
+    )
+    x = to_channel_dimension_format(x, ChannelDimension.FIRST)
+    x = torch.tensor(x)
+    return x
+
+
+# The screenshot to convert; the path is illustrative, any RGB/RGBA PIL image works.
+image = Image.open("website_screenshot.png")
+
+inputs = PROCESSOR.tokenizer(
+    f"{BOS_TOKEN}<fake_token_around_image>{'<image>' * image_seq_len}<fake_token_around_image>",
+    return_tensors="pt",
+    add_special_tokens=False,
+)
+inputs["pixel_values"] = PROCESSOR.image_processor([image], transform=custom_transform)
+inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+# `bad_words_ids` keeps the model from emitting the image placeholder tokens in its output
+generated_ids = MODEL.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_length=4096)
+generated_text = PROCESSOR.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+print(generated_text)
+```
+
 # Model Details
 
 - **Developed by:** Hugging Face
config.json CHANGED
@@ -6,12 +6,12 @@
   "alpha_type": "float",
   "alphas_initializer_range": 0.0,
   "architectures": [
-    "Img2HTMLForVisionText2Text"
+    "VMistralForVisionText2Text"
   ],
   "attention_dropout": 0.0,
   "auto_map": {
-    "AutoConfig": "configuration_img2html.Img2HTMLConfig",
-    "AutoModelForCausalLM": "modeling_img2html.Img2HTMLForVisionText2Text"
+    "AutoConfig": "configuration_vmistral.VMistralConfig",
+    "AutoModelForCausalLM": "modeling_vmistral.VMistralForVisionText2Text"
   },
   "bos_token_id": 1,
   "cross_layer_interval": 1,
@@ -27,7 +27,7 @@
   "initializer_range": 0.02,
   "intermediate_size": 14336,
   "max_position_embeddings": 32768,
-  "model_type": "img2html",
+  "model_type": "vmistral",
   "num_attention_heads": 32,
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
@@ -52,7 +52,7 @@
     "hidden_size": 1152,
     "image_size": 960,
     "intermediate_size": 4304,
-    "model_type": "img2html",
+    "model_type": "vmistral",
     "num_attention_heads": 16,
     "num_hidden_layers": 27,
     "patch_size": 14
configuration_img2html.py → configuration_vmistral.py RENAMED
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Img2HTML model configuration"""
+""" VMistral model configuration"""
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 
@@ -20,14 +20,14 @@ from transformers.utils import logging
 logger = logging.get_logger(__name__)
 
 MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "HuggingFaceM4/Img2HTML": "https://huggingface.co/HuggingFaceM4/Img2HTML/resolve/main/config.json",
+    "HuggingFaceM4/VLM_WebSight_finetuned": "https://huggingface.co/HuggingFaceM4/VLM_WebSight_finetuned/resolve/main/config.json",
 }
 
 
-class Img2HTMLVisionConfig(PretrainedConfig):
+class VMistralVisionConfig(PretrainedConfig):
     r"""
     """
-    model_type = "img2html"
+    model_type = "vmistral"
 
     def __init__(
         self,
@@ -63,7 +63,7 @@ class Img2HTMLVisionConfig(PretrainedConfig):
         self._flash_attn_2_enabled = _flash_attn_2_enabled
 
 
-class Img2HTMLPerceiverConfig(PretrainedConfig):
+class VMistralPerceiverConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an
     Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
@@ -89,7 +89,7 @@ class Img2HTMLPerceiverConfig(PretrainedConfig):
         qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`):
             Whether or not to use qk layer norms in perceiver
     """
-    model_type = "img2html"
+    model_type = "vmistral"
 
     def __init__(
         self,
@@ -109,7 +109,7 @@ class Img2HTMLPerceiverConfig(PretrainedConfig):
         super().__init__(**kwargs)
 
 
-class Img2HTMLConfig(PretrainedConfig):
+class VMistralConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an
     Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
@@ -201,7 +201,7 @@ class Img2HTMLConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
-    model_type = "img2html"
+    model_type = "vmistral"
     is_composition = False
 
     def __init__(
@@ -280,17 +280,17 @@ class Img2HTMLConfig(PretrainedConfig):
         self.attention_dropout = attention_dropout
 
         if perceiver_config is None:
-            self.perceiver_config = Img2HTMLPerceiverConfig()
+            self.perceiver_config = VMistralPerceiverConfig()
         elif isinstance(perceiver_config, dict):
-            self.perceiver_config = Img2HTMLPerceiverConfig(**perceiver_config)
-        elif isinstance(perceiver_config, Img2HTMLPerceiverConfig):
+            self.perceiver_config = VMistralPerceiverConfig(**perceiver_config)
+        elif isinstance(perceiver_config, VMistralPerceiverConfig):
             self.perceiver_config = perceiver_config
 
         if vision_config is None:
-            self.vision_config = Img2HTMLVisionConfig()
+            self.vision_config = VMistralVisionConfig()
         elif isinstance(vision_config, dict):
-            self.vision_config = Img2HTMLVisionConfig(**vision_config)
-        elif isinstance(vision_config, Img2HTMLVisionConfig):
+            self.vision_config = VMistralVisionConfig(**vision_config)
+        elif isinstance(vision_config, VMistralVisionConfig):
            self.vision_config = vision_config
 
         super().__init__(
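
As the `__init__` logic above shows, the nested configs accept a dict, a config instance, or `None`. A small illustrative sketch, assuming the checkpoint files are importable locally (e.g. on your `PYTHONPATH`) and that `resampler_n_latents` is among the perceiver parameters, as the README snippet's use of `config.perceiver_config.resampler_n_latents` suggests:

```python
from configuration_vmistral import VMistralConfig, VMistralPerceiverConfig, VMistralVisionConfig

# A dict is coerced into the corresponding config class; None falls back to defaults.
config = VMistralConfig(
    perceiver_config={"resampler_n_latents": 64},  # example value only
    vision_config=None,
)
assert isinstance(config.perceiver_config, VMistralPerceiverConfig)
assert isinstance(config.vision_config, VMistralVisionConfig)
print(config.model_type)  # "vmistral"
```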
image_processing_idefics.py ADDED
@@ -0,0 +1,168 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Idefics."""
+
+from typing import Callable, Dict, List, Optional, Union
+
+from PIL import Image
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature
+from ...image_transforms import resize, to_channel_dimension_format
+from ...image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+)
+from ...utils import TensorType, is_torch_available
+
+
+IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073]
+IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711]
+
+
+def convert_to_rgb(image):
+    # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
+    # for transparent images. The call to `alpha_composite` handles this case
+    if image.mode == "RGB":
+        return image
+
+    image_rgba = image.convert("RGBA")
+    background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
+    alpha_composite = Image.alpha_composite(background, image_rgba)
+    alpha_composite = alpha_composite.convert("RGB")
+    return alpha_composite
+
+
+class IdeficsImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs an Idefics image processor.
+
+    Args:
+        image_size (`int`, *optional*, defaults to 224):
+            Resize to image size
+        image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
+            method.
+        image_num_channels (`int`, *optional*, defaults to 3):
+            Number of image channels.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        image_size: int = 224,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        image_num_channels: Optional[int] = 3,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+
+        self.image_size = image_size
+        self.image_num_channels = image_num_channels
+        self.image_mean = image_mean
+        self.image_std = image_std
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        image_num_channels: Optional[int] = 3,
+        image_size: Optional[Dict[str, int]] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        transform: Callable = None,
+        **kwargs,
+    ) -> TensorType.PYTORCH:
+        """
+        Preprocess a batch of images.
+
+        Args:
+            images (`ImageInput`):
+                A list of images to preprocess.
+            image_size (`int`, *optional*, defaults to `self.image_size`):
+                Resize to image size
+            image_num_channels (`int`, *optional*, defaults to `self.image_num_channels`):
+                Number of image channels.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
+                Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+                channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+            image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
+                Standard deviation to use if normalizing the image. This is a float or list of floats the length of
+                the number of channels in the image. Can be overridden by the `image_std` parameter in the
+                `preprocess` method.
+            transform (`Callable`, *optional*, defaults to `None`):
+                A custom transform function that accepts a single image can be passed for training. For example,
+                `torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is
+                assumed - and then a preset of inference-specific transforms will be applied to the images
+
+        Returns:
+            a PyTorch tensor of the processed images
+        """
+        image_size = image_size if image_size is not None else self.image_size
+        image_num_channels = image_num_channels if image_num_channels is not None else self.image_num_channels
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        size = (image_size, image_size)
+
+        if isinstance(images, list) and len(images) == 0:
+            return []
+
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        # For training a user needs to pass their own set of transforms as a Callable.
+        # For reference this is what was used in the original IDEFICS training:
+        # transform = transforms.Compose([
+        #     convert_to_rgb,
+        #     transforms.RandomResizedCrop((size, size), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC),
+        #     transforms.ToTensor(),
+        #     transforms.Normalize(mean=image_mean, std=image_std),
+        # ])
+        if transform is not None:
+            if not is_torch_available():
+                raise ImportError("To pass in `transform` torch must be installed")
+            import torch
+
+            images = [transform(x) for x in images]
+            return torch.stack(images)
+
+        # for inference we do the exact transforms that were used to train IDEFICS
+        images = [convert_to_rgb(x) for x in images]
+        # further transforms expect numpy arrays
+        images = [to_numpy_array(x) for x in images]
+        images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images]
+        images = [self.rescale(image=image, scale=1 / 255) for image in images]
+        images = [self.normalize(x, mean=image_mean, std=image_std) for x in images]
+        images = [to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images]
+        # TODO: this converts to torch tensors - switch to convert_to_tensors once it becomes available
+        images = BatchFeature(data={"pixel_values": images}, tensor_type=TensorType.PYTORCH)["pixel_values"]
+
+        return images
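
A minimal inference sketch for the processor added above. The path is hypothetical, and it assumes the file sits inside the `transformers` package so its relative imports resolve, as in upstream IDEFICS:

```python
from PIL import Image
from transformers.models.idefics.image_processing_idefics import IdeficsImageProcessor

processor = IdeficsImageProcessor(
    image_size=224,
    image_mean=[0.48145466, 0.4578275, 0.40821073],  # IDEFICS_STANDARD_MEAN
    image_std=[0.26862954, 0.26130258, 0.27577711],  # IDEFICS_STANDARD_STD
)
# Inference path: convert_to_rgb -> resize (BICUBIC) -> rescale -> normalize -> channels-first
pixel_values = processor.preprocess([Image.open("screenshot.png")])
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])
```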
modeling_img2html.py → modeling_vmistral.py RENAMED
@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Mistral model."""
+""" PyTorch VMistral model."""
 from dataclasses import dataclass
 import inspect
 import math
@@ -43,7 +43,7 @@ from transformers import PreTrainedModel
 from transformers.utils import logging
 from transformers.modeling_outputs import ModelOutput
 
-from .configuration_img2html import Img2HTMLConfig
+from .configuration_vmistral import VMistralConfig
 from .vision import SiglipVisionModel
 
 
@@ -55,16 +55,16 @@ if is_flash_attn_2_available():
 
 logger = logging.get_logger(__name__)
 
-_CONFIG_FOR_DOC = "Img2HTMLConfig"
+_CONFIG_FOR_DOC = "VMistralConfig"
 
-IMG2HTML_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "HuggingFaceM4/Img2HTML"
+VMistral_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "HuggingFaceM4/VLM_WebSight_finetuned"
 ]
 
 @dataclass
-class Img2HTMLBaseModelOutputWithPast(ModelOutput):
+class VMistralBaseModelOutputWithPast(ModelOutput):
     """
-    Base class for Img2HTML model's outputs that may also contain a past key/values (to speed up sequential decoding).
+    Base class for VMistral model's outputs that may also contain a past key/values (to speed up sequential decoding).
 
     Args:
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
@@ -107,9 +107,9 @@ class Img2HTMLBaseModelOutputWithPast(ModelOutput):
 
 
 @dataclass
-class Img2HTMLCausalLMOutputWithPast(ModelOutput):
+class VMistralCausalLMOutputWithPast(ModelOutput):
     """
-    Base class for Img2HTML causal language model (or autoregressive) outputs.
+    Base class for VMistral causal language model (or autoregressive) outputs.
 
     Args:
         loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
@@ -162,7 +162,6 @@ def expand_inputs_for_generation(
     input_ids = input_ids.index_select(0, expanded_return_idx)
     model_kwargs["pixel_values"] = model_kwargs.get("pixel_values", None)
     model_kwargs["image_hidden_states"] = model_kwargs.get("image_hidden_states", None)
-    # model_kwargs["image_attention_mask"] = model_kwargs.get("image_attention_mask", None)
 
     if "token_type_ids" in model_kwargs:
         token_type_ids = model_kwargs["token_type_ids"]
@@ -171,11 +170,6 @@
     if attention_mask is not None:
         model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx)
 
-    # if model_kwargs["image_attention_mask"] is not None:
-    #     model_kwargs["image_attention_mask"] = model_kwargs["image_attention_mask"].index_select(
-    #         0, expanded_return_idx
-    #     )
-
     if model_kwargs["pixel_values"] is not None:
         model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx)
 
@@ -203,10 +197,6 @@ def update_model_kwargs_for_generation(outputs, model_kwargs):
     model_kwargs["attention_mask"] = torch.cat(
         [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
     )
-    # if "image_attention_mask" in model_kwargs:
-    #     image_attention_mask = model_kwargs["image_attention_mask"]
-    #     last_mask = image_attention_mask[:, -1, :].unsqueeze(1)
-    #     model_kwargs["image_attention_mask"] = last_mask
 
     # Get the precomputed image_hidden_states
     model_kwargs["image_hidden_states"] = outputs.image_hidden_states
@@ -234,7 +224,6 @@ def prepare_inputs_for_generation(input_ids, past_key_values=None, **kwargs):
 
     pixel_values = kwargs.get("pixel_values", None)
     image_hidden_states = kwargs.get("image_hidden_states", None)
-    # image_attention_mask = kwargs.get("image_attention_mask", None)
 
     return {
         "input_ids": input_ids,
@@ -245,7 +234,6 @@ def prepare_inputs_for_generation(input_ids, past_key_values=None, **kwargs):
         "token_type_ids": token_type_ids,
         "pixel_values": pixel_values,
         "image_hidden_states": image_hidden_states,
-        # "image_attention_mask": image_attention_mask,
     }
 
 
@@ -696,7 +684,7 @@ class MistralAttention(nn.Module):
     and "Generating Long Sequences with Sparse Transformers".
     """
 
-    def __init__(self, config: Img2HTMLConfig, qk_layer_norms: bool = False):
+    def __init__(self, config: VMistralConfig, qk_layer_norms: bool = False):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -1091,7 +1079,7 @@ class MistralFlashAttention2(MistralAttention):
 
 
 class MistralDecoderLayer(nn.Module):
-    def __init__(self, config: Img2HTMLConfig):
+    def __init__(self, config: VMistralConfig):
         super().__init__()
         self.hidden_size = config.hidden_size
         self.self_attn = (
@@ -1174,7 +1162,7 @@ MISTRAL_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`Img2HTMLConfig`]):
+        config ([`VMistralConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1186,7 +1174,7 @@ MISTRAL_START_DOCSTRING = r"""
     MISTRAL_START_DOCSTRING,
 )
 class VMistralPreTrainedModel(PreTrainedModel):
-    config_class = Img2HTMLConfig
+    config_class = VMistralConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["MistralDecoderLayer"]
@@ -1288,10 +1276,10 @@ class VMistralModel(VMistralPreTrainedModel):
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`]
 
     Args:
-        config: Img2HTMLConfig
+        config: VMistralConfig
     """
 
-    def __init__(self, config: Img2HTMLConfig, vision_model=None):
+    def __init__(self, config: VMistralConfig, vision_model=None):
         super().__init__(config)
         self.config = config
         self.padding_idx = config.pad_token_id
@@ -1435,7 +1423,7 @@ class VMistralModel(VMistralPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, Img2HTMLBaseModelOutputWithPast]:
+    ) -> Union[Tuple, VMistralBaseModelOutputWithPast]:
         device = input_ids.device if input_ids is not None else inputs_embeds.device
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1599,7 +1587,7 @@ class VMistralModel(VMistralPreTrainedModel):
                 for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, image_hidden_states]
                 if v is not None
             )
-        return Img2HTMLBaseModelOutputWithPast(
+        return VMistralBaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
@@ -1608,7 +1596,7 @@ class VMistralModel(VMistralPreTrainedModel):
        )
 
 
-class Img2HTMLForVisionText2Text(VMistralPreTrainedModel):
+class VMistralForVisionText2Text(VMistralPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
     def __init__(self, config, vision_model=None):
@@ -1665,7 +1653,7 @@ class Img2HTMLForVisionText2Text(VMistralPreTrainedModel):
         output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings
 
     @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=Img2HTMLCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    @replace_return_docstrings(output_type=VMistralCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1680,7 +1668,7 @@ class Img2HTMLForVisionText2Text(VMistralPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, Img2HTMLCausalLMOutputWithPast]:
+    ) -> Union[Tuple, VMistralCausalLMOutputWithPast]:
         r"""
         Args:
             labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1736,7 +1724,7 @@ class Img2HTMLForVisionText2Text(VMistralPreTrainedModel):
             output = (logits,) + outputs[1:]
             return (loss,) + output if loss is not None else output
 
-        return Img2HTMLCausalLMOutputWithPast(
+        return VMistralCausalLMOutputWithPast(
             loss=loss,
             logits=logits,
             past_key_values=outputs.past_key_values,
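
Continuing from the README snippet above (reusing its `MODEL` and `inputs`), a short sketch of what the renamed output classes look like in practice; exact shapes depend on the checkpoint:

```python
import torch

with torch.no_grad():
    outputs = MODEL(**inputs)

print(type(outputs).__name__)             # VMistralCausalLMOutputWithPast
print(outputs.logits.shape)               # (batch, sequence_length, vocab_size)
# image_hidden_states is returned so generate() can reuse it across decoding steps
print(outputs.image_hidden_states.shape)
```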
processing_idefics.py ADDED
@@ -0,0 +1,414 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for IDEFICS.
+"""
+
+from typing import Callable, List, Optional, Union
+from urllib.parse import urlparse
+
+from ...feature_extraction_utils import BatchFeature
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
+from ...utils import TensorType, is_torch_available
+
+
+if is_torch_available():
+    import torch
+
+
+IMAGE_TOKEN = "<image>"
+
+
+# copied from m4.training.packing
+def incremental_to_binary_attention_mask(incremental_mask, num_classes=-1):
+    # This function converts: [-1, 0, 1] => [[0, 0], [1, 0], [0, 1]]
+
+    # If any of the image indices are more than num_classes, set them to -1.
+    # Words after the max number of images allowed have been seen don't attend on anything
+    if num_classes != -1:
+        incremental_mask[incremental_mask >= num_classes] = -1
+
+    negatives = incremental_mask == -1
+    incremental_mask[negatives] = 0
+    attn_mask = torch.nn.functional.one_hot(incremental_mask, num_classes=num_classes)
+    attn_mask[negatives, :] = 0
+    return attn_mask
+
+
+# copied from m4.training.packing
+def image_attention_mask_for_packed_input_ids(input_ids, tokenizer):
+    image_attention_mask = torch.full_like(input_ids, fill_value=-1)
+    next_image_attention_mask = torch.full_like(input_ids, fill_value=-1)
+    image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
+    eod_token_id = tokenizer.eos_token_id
+    for batch_idx in range(input_ids.size(0)):
+        count = -1
+        seen_eod = False
+        for idx, token_id in enumerate(input_ids[batch_idx]):
+            if token_id == image_token_id:
+                count += 1
+                image_attention_mask[batch_idx][idx] = count
+                seen_eod = False
+            else:
+                image_attention_mask[batch_idx][idx] = count
+
+            if seen_eod:
+                image_attention_mask[batch_idx][idx] = -1
+
+            if token_id == eod_token_id:
+                seen_eod = True
+
+    for batch_idx in range(input_ids.size(0)):
+        count = -1
+        seen_eod = False
+        for idx in range(input_ids[batch_idx].size(0) - 1, -1, -1):
+            token_id = input_ids[batch_idx][idx]
+            if token_id == image_token_id:
+                count += 1
+                next_image_attention_mask[batch_idx][idx] = count
+                seen_eod = False
+            else:
+                next_image_attention_mask[batch_idx][idx] = count
+
+            if token_id == eod_token_id:
+                seen_eod = True
+
+            if seen_eod:
+                next_image_attention_mask[batch_idx][idx] = -1
+
+        non_negative_indices = next_image_attention_mask[batch_idx] != -1
+        next_image_attention_mask[batch_idx][non_negative_indices] -= count
+        next_image_attention_mask[batch_idx][non_negative_indices] *= -1
+
+    return image_attention_mask, next_image_attention_mask
+
+
+def is_url(string):
+    """Checks if the passed string contains a valid URL and nothing else, e.g. if a space is included the URL is
+    immediately invalidated."""
+    if " " in string:
+        return False
+    result = urlparse(string)
+    return all([result.scheme, result.netloc])
+
+
+class IdeficsProcessor(ProcessorMixin):
+    r"""
+    Constructs an IDEFICS processor which wraps a LLaMA tokenizer and IDEFICS image processor into a single processor.
+
+    [`IdeficsProcessor`] offers all the functionalities of [`IdeficsImageProcessor`] and [`LlamaTokenizerFast`]. See
+    the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information.
+
+    Args:
+        image_processor (`IdeficsImageProcessor`):
+            An instance of [`IdeficsImageProcessor`]. The image processor is a required input.
+        tokenizer (`LlamaTokenizerFast`):
+            An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
+        image_size (`int`, *optional*, defaults to 224): Image size (assuming a square image)
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "IdeficsImageProcessor"
+    tokenizer_class = "LlamaTokenizerFast"
+
+    def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs):
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+        self.current_processor = self.image_processor
+        self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
+
+        self.default_image_dims = (
+            self.image_processor.image_num_channels,
+            self.image_processor.image_size,
+            self.image_processor.image_size,
+        )
+
+        self.tokenizer_was_trained_with_end_of_utterance_token = (
+            True
+            if "<end_of_utterance>" in self.tokenizer.special_tokens_map.get("additional_special_tokens", [])
+            else False
+        )
+
+    def __call__(
+        self,
+        prompts: Union[List[TextInput], List[List[TextInput]]],
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        transform: Callable = None,
+        add_eos_token=False,
+        add_end_of_utterance_token=None,
+        debug=False,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+    ) -> BatchEncoding:
+        """This method takes batched or non-batched prompts made of text and images and converts them into prompts
+        that the model was trained on and prepares the image pixel values for the model to process.
+
+        Args:
+            prompts (`Union[List[TextInput], List[List[TextInput]]]`):
+                either a single prompt or a batched list of prompts - see the detailed description immediately after
+                the end of the arguments doc section.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence is provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                  different lengths).
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            truncation (`bool`, *optional*):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            transform (`Callable`, *optional*):
+                A custom transform function that accepts a single image can be passed for training. For example,
+                `torchvision.Compose` can be used to compose multiple functions. If `None` a preset inference-specific
+                set of transforms will be applied to the images.
+            add_eos_token (`bool`, *optional*, defaults to `False`):
+                Adds `eos_token` at the end of the final prompt if `True`.
+            add_end_of_utterance_token (`bool`, *optional*):
+                Whether to automatically add `<end_of_utterance>` after each prompt's text input (unless followed by
+                an image). If `None` the tokenizer will be checked instead and if this token is found in
+                `additional_special_tokens` then the value will be `True`.
+            debug (`bool`, *optional*, defaults to `False`):
+                A `True` value will help debug prompt generation by dumping useful information.
+            return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
+                The type of tensors to return. Can be one of:
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+
+        Returns:
+            a dict with entries: `input_ids`, `attention_mask`, `pixel_values`, `image_attention_mask` which can be
+            directly passed to `model.generate`
+
+        Detailed explanation:
+
+        Each entry in `prompts` is either a text to be passed as is or an image that will be processed.
+
+        An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved.
+
+        When the processor encounters an image it'll inject a
+        `<fake_token_around_image><image><fake_token_around_image>` entry into the prompt.
+
+        Example:
+
+        ```python
+        checkpoint = "HuggingFaceM4/idefics-9b"
+        processor = AutoProcessor.from_pretrained(checkpoint)
+        url = "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg"
+        img = processor.image_processor.fetch_images([url])[0]
+
+        prompts = [
+            "User:",
+            img,
+            "Describe this image.\nAssistant: An image of two kittens in grass.\n",
+            "User:",
+            "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
+            "Describe this image.\nAssistant:",
+        ]
+
+        inputs = processor(prompts, return_tensors="pt")
+        generated_ids = model.generate(**inputs, max_length=100)
+        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        ```
+
+        In this example the `prompts` will be converted into:
+
+        ```
+        <s>User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
+        Assistant: An image of two kittens in grass.
+        User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
+        Assistant:
+        ```
+
+        and the two images will be massaged using the [`IdeficsImageProcessor.__call__`] method and placed inside the
+        `pixel_values` dict entry of the return value.
+
+        This example also shows that images can be passed as objects or as text URLs: the first image is passed as an
+        object and the second one as a URL.
+
+        For training, do:
+
+        ```python
+        image_transform = transforms.Compose(
+            [
+                transforms.RandomResizedCrop(
+                    (w, h), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC
+                ),
+                transforms.ToTensor(),
+                transforms.Normalize(mean=self.image_mean, std=self.image_std),
+            ]
+        )
+        inputs = processor(prompts, transform=image_transform, return_tensors="pt")
+        ```
+
+        In order to help debug prompt generation enable `debug=True` which will show you what's happening.
+
+        """
+
+        # if the value isn't overridden by the user, check if the tokenizer was trained with this token and then use it
+        if add_end_of_utterance_token is None:
+            add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token
+
+        # turn non-batched prompts into batched
+        if not any(isinstance(i, list) for i in prompts):
+            prompts = [prompts]
+
+        fake_token = "<fake_token_around_image>"
+        image_token = "<image>"
+        end_of_utterance_token = "<end_of_utterance>"
+
+        def image_tokens(last_was_image):
+            if last_was_image:
+                return image_token + fake_token
+            else:
+                return fake_token + image_token + fake_token
+
+        all_prompts = []
+        all_images = []
+        for sample in prompts:
+            # the model was trained on samples starting with <s>
+            full_text = f"{self.tokenizer.bos_token}"
+
+            # an image can either be an image object in the item or the url, everything else is a verbatim prompt text
+            image_objects = []
+            last_was_image = False
+            last_was_text = False
+            for i, item in enumerate(sample):
+                if i > 0:
+                    last_was_text = True if not last_was_image else False
+
+                if isinstance(item, str):
+                    item = item.strip(" ")
+                    if is_url(item):
+                        image = self.image_processor.fetch_images(item)
+                        full_text += image_tokens(last_was_image)
+                        image_objects.append(image)
+                        last_was_image = True
+                    else:
+                        # we add end_of_utterance_token between subsequent text prompts (but not after the last one!)
+                        if add_end_of_utterance_token and last_was_text:
+                            full_text += end_of_utterance_token
+                        full_text += item
+                        last_was_image = False
+                else:
+                    # must be an image obj
+                    full_text += image_tokens(last_was_image)
+                    image_objects.append(item)
+                    last_was_image = True
+
+            if add_eos_token:
+                full_text += self.tokenizer.eos_token
+
+            if debug is True:
+                print(f"{full_text=}")
+
+            image_objects = self.image_processor(image_objects, transform=transform)
+
+            all_prompts.append(full_text)
+            all_images.append(image_objects)
+
+        text_encoding = self.tokenizer(
+            text=all_prompts,
+            add_special_tokens=False,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+        )
+        all_texts = text_encoding["input_ids"]
+
+        max_seq_len = max(len(x) for x in all_texts)
+
+        # max_num_images has to be at least 1 even when there are no images
+        max_num_images = max(len(x) for x in all_images)
+        max_num_images = max(1, max_num_images)
+
+        at_least_one_image = sum(len(x) for x in all_images) > 0
+        output_input_ids = []
+        output_images = []
+        output_attention_masks = []
+        for text, images in zip(all_texts, all_images):
+            padded_input_ids = [self.tokenizer.pad_token_id] * max_seq_len
+            unpadded_seq_len = len(text)
+            start = max_seq_len - unpadded_seq_len
+            padded_input_ids[start:] = text[:max_seq_len]
+
+            attention_mask = torch.zeros((max_seq_len,), dtype=torch.long)
+            attention_mask[start:] = 1
+
+            image_count = padded_input_ids.count(self.image_token_id)
+            local_max_num_images = min(image_count, max_num_images)
+
+            current_images = images[:local_max_num_images]
+
+            if len(current_images) > 0:
+                padded_image_tensor = torch.zeros(max_num_images, *current_images.size()[1:])
+                padded_image_tensor[: current_images.size(0)] = current_images
+            else:
+                padded_image_tensor = torch.zeros(max_num_images, *self.default_image_dims)
+
+            output_images.append(padded_image_tensor)
+            output_input_ids.append(torch.tensor(padded_input_ids))
+
+            output_attention_masks.append(attention_mask)
+
+        output_input_ids = torch.stack(output_input_ids)
+        output_images = torch.stack(output_images)
+        output_attention_masks = torch.stack(output_attention_masks)
+
+        if at_least_one_image:
+            image_attention_mask, _ = image_attention_mask_for_packed_input_ids(output_input_ids, self.tokenizer)
+            image_attention_mask = incremental_to_binary_attention_mask(
+                image_attention_mask, num_classes=max_num_images
+            )
+        else:
+            # in full language mode we set the image mask to all-0s
+            image_attention_mask = torch.zeros(
+                output_input_ids.shape[0], output_input_ids.shape[1], 1, dtype=torch.bool
+            )
+
+        return BatchFeature(
+            data={
+                "input_ids": output_input_ids,
+                "attention_mask": output_attention_masks,
+                "pixel_values": output_images,
+                "image_attention_mask": image_attention_mask,
+            }
+        )
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer
+        to the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
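
A quick check of the mask helper defined above, matching its inline comment that `[-1, 0, 1]` maps to `[[0, 0], [1, 0], [0, 1]]` (this assumes the module's relative imports resolve, i.e. the file lives inside `transformers` as in upstream IDEFICS):

```python
import torch
from transformers.models.idefics.processing_idefics import incremental_to_binary_attention_mask

# -1 means "no image seen yet"; 0 and 1 are incremental image indices.
incremental = torch.tensor([[-1, 0, 1]])
binary = incremental_to_binary_attention_mask(incremental, num_classes=2)
print(binary)
# tensor([[[0, 0],
#          [1, 0],
#          [0, 1]]])
```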