Upload 6 files

Browse files

Files changed (6) hide show

config.json +31 -0
configuration_emu3visionvq.py +106 -0
image_processing_emu3visionvq.py +442 -0
model.safetensors +3 -0
modeling_emu3visionvq.py +822 -0
preprocessor_config.json +29 -0

config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "architectures": [
+    "Emu3VisionVQModel"
+  ],
+  "attn_resolutions": [
+    3
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_emu3visionvq.Emu3VisionVQConfig",
+    "AutoModel": "modeling_emu3visionvq.Emu3VisionVQModel"
+  },
+  "ch": 256,
+  "ch_mult": [
+    1,
+    2,
+    2,
+    4
+  ],
+  "codebook_size": 32768,
+  "double_z": false,
+  "dropout": 0.0,
+  "embed_dim": 4,
+  "in_channels": 3,
+  "model_type": "Emu3VisionVQ",
+  "num_res_blocks": 2,
+  "out_channels": 3,
+  "temporal_downsample_factor": 4,
+  "torch_dtype": "float32",
+  "transformers_version": "4.44.0",
+  "z_channels": 4
+}

configuration_emu3visionvq.py ADDED Viewed

	@@ -0,0 +1,106 @@

+# coding=utf-8
+# Copyright 2024 The Emu team, BAAI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Emu3VisionVQ model configuration """
+from typing import List
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class Emu3VisionVQConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Emu3VisionVQ`]. It is used to instantiate an video movq
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a configuration to the VQ model presented in Emu3 paper.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        codebook_size (`int`, *optional*, defaults to 32768):
+            Codebook size of the VQ model.
+        embed_dim (`int`, *optional*, defaults to 4):
+            Dimension of the quantized vector in codebook.
+        z_channels (`int`, *optional*, defaults to 4):
+            Dimension of the output channel of encoder and the input channel of decoder
+        double_z (`bool`, *optional*, defaults to False):
+            Whether double the output dim of the encoder.
+        in_channels (`int`, *optional*, defaults to 3):
+            Input channel of encoder.
+        out_channels (`int`, *optional*, defaults to 3):
+            Output channel of decoder.
+        temporal_downsample_factor (`int`, *optional*, defaults to 4):
+            Temporal downsample factor.
+        ch (`int`, *optional*, defaults to 256):
+            Basic channel number of the intermediate blocks.
+        ch_mult (`List[int]`, *optional*, defaults to `[1, 2, 2, 4]`):
+            Channel scaling factor of the intermediate blocks.
+        num_res_blocks (`int`, *optional*, defaults to 2):
+            Residual block number in each stage.
+        attn_resolutions (`List[int]`, *optional*, defaults to 3):
+            Stage indices to apply attention.
+        dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability.
+    ```python
+    >>> from transformers import Emu3VisionVQ, Emu3VisionVQConfig
+    >>> # Initializing a video VQ model of Emu3 configuration
+    >>> configuration = Emu3VisionVQConfig()
+    >>> # Initializing a model from the Emu3 VQ model style configuration
+    >>> model = Emu3VisionVQModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "Emu3VisionVQ"
+    def __init__(
+        self,
+        codebook_size: int = 32768,
+        embed_dim: int = 4,
+        z_channels: int = 4,
+        double_z: bool = False,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        temporal_downsample_factor: int = 4,
+        ch: int = 256,
+        ch_mult: List[int] = [1, 2, 2, 4],
+        num_res_blocks: int = 2,
+        attn_resolutions: List[int] = [3],
+        dropout: float = 0.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.codebook_size = codebook_size
+        self.embed_dim = embed_dim
+        self.z_channels = z_channels
+        self.double_z = double_z
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.temporal_downsample_factor = temporal_downsample_factor
+        self.ch = ch
+        self.ch_mult = ch_mult
+        self.num_res_blocks = num_res_blocks
+        self.attn_resolutions = attn_resolutions
+        self.dropout = dropout

image_processing_emu3visionvq.py ADDED Viewed

	@@ -0,0 +1,442 @@

+# coding=utf-8
+# Copyright 2024 The Emu team, BAAI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Emu3VisionVQ."""
+import math
+from typing import Dict, List, Optional, Union
+import numpy as np
+from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
+from transformers.image_transforms import (
+    convert_to_rgb,
+    resize,
+    to_channel_dimension_format,
+)
+from transformers.image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from transformers.utils import TensorType, is_vision_available, logging
+logger = logging.get_logger(__name__)
+if is_vision_available():
+    from PIL import Image
+def smart_resize(
+    height: int, width: int, factor: int = 8, min_pixels: int = 512 * 512, max_pixels: int = 1024 * 1024
+):
+    """Rescales the image so that the following conditions are met:
+    1. Both dimensions (height and width) are divisible by 'factor'.
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+    3. The aspect ratio of the image is maintained as closely as possible.
+    """
+    if height < factor or width < factor:
+        raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
+    elif max(height, width) / min(height, width) > 5:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than 5, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = round(height / factor) * factor
+    w_bar = round(width / factor) * factor
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = math.floor(height / beta / factor) * factor
+        w_bar = math.floor(width / beta / factor) * factor
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = math.ceil(height * beta / factor) * factor
+        w_bar = math.ceil(width * beta / factor) * factor
+    return h_bar, w_bar
+class Emu3VisionVQImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Emu3VisionVQ image processor that dynamically resizes images based on the original images.
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use when resizing the image.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
+            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
+        image_std (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+        min_pixels (`int`, *optional*, defaults to `512 * 512`):
+            The min pixels of the image to resize the image.
+        max_pixels (`int`, *optional*, defaults to `1024 * 1024`):
+            The max pixels of the image to resize the image.
+        spatial_factor (`int`, *optional*, defautls to 8):
+            The spatial downsample factor the image will be downsampled in feature extracting phase
+    """
+    model_input_names = ["pixel_values"]
+    def __init__(
+        self,
+        do_resize: bool = True,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = True,
+        min_pixels: int = 512 * 512,
+        max_pixels: int = 1024 * 1024,
+        spatial_factor: int = 8,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.do_resize = do_resize
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
+        self.do_convert_rgb = do_convert_rgb
+        self.spatial_factor = spatial_factor
+    def _preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        resample: PILImageResampling = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        spatial_factor: Optional[int] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        output_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
+    ):
+        """
+        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
+        Args:
+            images (`ImageInput`):
+                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Scale factor to use if rescaling the image.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            spatial_factor (`int`, *optional*, defaults to `self.spatial_factor`):
+                The spatial downsample factor the image will be downsampled in feature extracting phase
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.   - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            output_data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+        """
+        spatial_factor = spatial_factor if spatial_factor is not None else self.spatial_factor
+        images = make_list_of_images(images)
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                "pixel_values.append()images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+        height, width = get_image_size(images[0], channel_dim=input_data_format)
+        resized_height, resized_width = height, width
+        processed_images = []
+        for image in images:
+            if do_resize:
+                resized_height, resized_width = smart_resize(
+                    height,
+                    width,
+                    factor=spatial_factor,
+                    min_pixels=self.min_pixels,
+                    max_pixels=self.max_pixels,
+                )
+                image = resize(
+                    image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
+                )
+            if do_rescale:
+                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
+            if do_normalize:
+                image = self.normalize(
+                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
+                )
+            image = to_channel_dimension_format(image, output_data_format, input_channel_dim=input_data_format)
+            processed_images.append(image)
+        image = np.array(processed_images)
+        return image
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        resample: PILImageResampling = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        spatial_factor: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        output_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
+    ):
+        """
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            spatial_factor (`int`, *optional*, defaults to `self.spatial_factor`):
+                The spatial downsample factor the image will be downsampled in feature extracting phase
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            output_data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+        spatial_factor = spatial_factor if spatial_factor is not None else self.spatial_factor
+        images = make_list_of_images(images)
+        if images is None or not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        validate_preprocess_arguments(
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=self.size,
+            resample=resample,
+        )
+        pixel_values = []
+        for image in images:
+            norm_image = self._preprocess(
+                image,
+                do_resize=do_resize,
+                resample=resample,
+                do_rescale=do_rescale,
+                rescale_factor=rescale_factor,
+                do_normalize=do_normalize,
+                image_mean=image_mean,
+                image_std=image_std,
+                do_convert_rgb=do_convert_rgb,
+                spatial_factor=spatial_factor,
+                input_data_format=input_data_format,
+                output_data_format=output_data_format,
+            )
+            pixel_values.extend(norm_image)
+        pixel_values = np.array(pixel_values)
+        data = {"pixel_values": pixel_values}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+    def postprocess(
+        self,
+        images: ImageInput,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        return_tensors: str | TensorType = "PIL.Image.Image",
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Postprocess an image or batch of images tensor. Postprocess is the reverse process of preprocess.
+        The parameters should be same as in preprocess.
+        Args:
+            images (`ImageInput`):
+                Image to postprocess. Expects a single or batch of images with pixel values ranging from -1 to 1.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        rescale_factor = 1 / rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        image_mean, image_std = self.inverse_meanstd(image_mean, image_std)
+        images = make_list_of_images(images)
+        if isinstance(images[0], Image.Image):
+            return images if len(images) > 1 else images[0]
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+        pixel_values = []
+        for image in images:
+            image = to_numpy_array(image)
+            if do_normalize:
+                image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+            if do_rescale:
+                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
+                image = image.clip(0, 255).astype(np.uint8)
+            if do_normalize and do_rescale and return_tensors == "PIL.Image.Image":
+                image = to_channel_dimension_format(image, ChannelDimension.LAST, input_channel_dim=input_data_format)
+                pixel_values.append(Image.fromarray(image))
+            else:
+                pixel_values.extend(image)
+        data = {"pixel_values": pixel_values}
+        return_tensors = return_tensors if return_tensors != "PIL.Image.Image" else None
+        return BatchFeature(data=data, tensor_type=return_tensors)
+    def inverse_meanstd(self, image_mean, image_std):
+        image_mean = self.to_tuple(image_mean)
+        image_std = self.to_tuple(image_std)
+        rev_image_mean = tuple(-m / s for m, s in zip(image_mean, image_std))
+        rev_image_std = tuple(1 / s for s in image_std)
+        return rev_image_mean, rev_image_std
+    def to_tuple(self, value, dim=3):
+        if isinstance(value, int | float):
+            return (value,) * dim
+        return tuple(value)

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89536431c69b08b10b449ec309f52dcea22f14b7647317f30f5715273392bbf1
+size 1083015124

modeling_emu3visionvq.py ADDED Viewed

	@@ -0,0 +1,822 @@

+# coding=utf-8
+# Copyright 2024 The Emu team, BAAI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Emu3VisionVQ model """
+import math
+from typing import Optional, Tuple, Union
+import torch
+from torch import nn
+from torch.nn import functional as F
+from transformers.modeling_utils import PreTrainedModel
+from .configuration_emu3visionvq import Emu3VisionVQConfig
+class Emu3VisionVQActivation(nn.Module):
+    def __init__(self):
+        super().__init__()
+    def __call__(self, x: torch.Tensor):
+        return x * torch.sigmoid(x)
+class Emu3VisionVQUpsample(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+    def forward(self, x: torch.Tensor):
+        x = F.interpolate(x, scale_factor=2.0, mode="nearest")
+        x = self.conv(x)
+        return x
+class Emu3VisionVQDownsample(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size=3,
+            stride=2,
+            padding=0,
+        )
+    def forward(self, x: torch.Tensor):
+        pad = (0, 1, 0, 1)
+        x = F.pad(x, pad, mode="constant", value=0)
+        x = self.conv(x)
+        return x
+class Emu3VisionVQCausalConv3d(nn.Module):
+    def __init__(
+        self,
+        in_channel: int,
+        out_channel: int,
+        kernel_size: Union[int, Tuple[int, ...]] = (3, 1, 1),
+        stride: Union[int, Tuple[int, ...]] = (1, 1, 1),
+    ):
+        super().__init__()
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size,) * 3
+        if isinstance(stride, int):
+            stride = (stride,) * 3
+        hw_pad = [k - s for k, s in zip(kernel_size[1:], stride[1:])]
+        self.padding = tuple()
+        for p in hw_pad[::-1]:
+            self.padding += (p // 2 + p % 2, p // 2)
+        self.padding += (2, 0)
+        self.conv = nn.Conv3d(
+            in_channel,
+            out_channel,
+            kernel_size,
+            stride=stride,
+        )
+    def forward(self, x: torch.Tensor):
+        x = F.pad(x, self.padding)
+        x = self.conv(x)
+        return x
+class Emu3VisionVQResnetTemporalBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        conv_shortcut: bool = False,
+        dropout: float = 0.0,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        stride = (1, 1, 1)
+        kernel_size = (3, 3, 3)
+        self.norm1 = nn.BatchNorm3d(in_channels)
+        self.conv1 = Emu3VisionVQCausalConv3d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+        )
+        self.norm2 = nn.BatchNorm3d(out_channels)
+        self.dropout = nn.Dropout(dropout)
+        self.conv2 = Emu3VisionVQCausalConv3d(
+            out_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+        )
+        self.act = Emu3VisionVQActivation()
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = Emu3VisionVQCausalConv3d(
+                    in_channels,
+                    out_channels,
+                    kernel_size=kernel_size,
+                    stride=stride,
+                )
+            else:
+                self.nin_shortcut = nn.Conv3d(
+                    in_channels,
+                    out_channels,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                )
+    def forward(self, x: torch.Tensor):
+        h = self.norm1(x)
+        h = self.act(h)
+        h = self.conv1(h)
+        h = self.norm2(h)
+        h = self.act(h)
+        h = self.dropout(h)
+        h = self.conv2(h)
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                x = self.conv_shortcut(x)
+            else:
+                x = self.nin_shortcut(x)
+        return x + h
+class Emu3VisionVQSpatialNorm(nn.Module):
+    def __init__(
+        self,
+        f_channels: int,
+        zq_channels: int,
+        norm_layer: nn.Module = nn.GroupNorm,
+        add_conv: bool = False,
+        num_groups: int = 32,
+        eps: float = 1e-6,
+        affine: bool = True,
+    ):
+        super().__init__()
+        self.norm_layer = norm_layer(
+            num_channels=f_channels,
+            num_groups=num_groups,
+            eps=eps,
+            affine=affine,
+        )
+        self.add_conv = add_conv
+        if self.add_conv:
+            self.conv = nn.Conv2d(
+                zq_channels,
+                zq_channels,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+            )
+        self.conv_y = nn.Conv2d(
+            zq_channels,
+            f_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+        self.conv_b = nn.Conv2d(
+            zq_channels,
+            f_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+    def forward(self, x: torch.Tensor, zq: torch.Tensor):
+        zq = F.interpolate(zq, size=x.shape[-2:], mode="nearest")
+        if self.add_conv:
+            zq = self.conv(zq)
+        x = self.norm_layer(x)
+        x = x * self.conv_y(zq) + self.conv_b(zq)
+        return x
+class Emu3VisionVQResnetBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        conv_shortcut: bool = False,
+        dropout: float = 0.0,
+        zq_ch: Optional[int] = None,
+        add_conv: bool = False,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.zq_ch = zq_ch
+        if zq_ch is None:
+            norm_kwargs = dict(num_groups=32, eps=1e-6, affine=True)
+            self.norm1 = nn.GroupNorm(num_channels=in_channels, **norm_kwargs)
+            self.norm2 = nn.GroupNorm(num_channels=out_channels, **norm_kwargs)
+        else:
+            self.norm1 = Emu3VisionVQSpatialNorm(in_channels, zq_ch, add_conv=add_conv)
+            self.norm2 = Emu3VisionVQSpatialNorm(out_channels, zq_ch, add_conv=add_conv)
+        self.conv1 = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+        self.dropout = nn.Dropout(dropout)
+        self.conv2 = nn.Conv2d(
+            out_channels,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+        self.act = Emu3VisionVQActivation()
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = nn.Conv2d(
+                    in_channels,
+                    out_channels,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                )
+            else:
+                self.nin_shortcut = nn.Conv2d(
+                    in_channels,
+                    out_channels,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                )
+    def forward(self, x: torch.Tensor, zq: Optional[torch.Tensor] = None):
+        norm_args = tuple() if self.zq_ch is None else (zq, )
+        h = self.norm1(x, *norm_args)
+        h = self.act(h)
+        h = self.conv1(h)
+        h = self.norm2(h, *norm_args)
+        h = self.act(h)
+        h = self.dropout(h)
+        h = self.conv2(h)
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                x = self.conv_shortcut(x)
+            else:
+                x = self.nin_shortcut(x)
+        return x + h
+class Emu3VisionVQAttnBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        zq_ch: Optional[int] = None,
+        add_conv: bool = False
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.zq_ch = zq_ch
+        if zq_ch is None:
+            norm_kwargs = dict(num_groups=32, eps=1e-6, affine=True)
+            self.norm = nn.GroupNorm(num_channels=in_channels, **norm_kwargs)
+        else:
+            self.norm = Emu3VisionVQSpatialNorm(in_channels, zq_ch, add_conv=add_conv)
+        self.q = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+        self.k = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+        self.v = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+        self.proj_out = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+    def forward(self, x: torch.Tensor, zq: Optional[torch.Tensor] = None):
+        norm_args = tuple() if self.zq_ch is None else (zq, )
+        nx = self.norm(x, *norm_args)
+        q = self.q(nx)
+        k = self.k(nx)
+        v = self.v(nx)
+        # compute attention
+        b, c, h, w = q.shape
+        q = q.reshape(b, c, h * w)
+        k = k.reshape(b, c, h * w)
+        score = torch.bmm(q.permute(0, 2, 1), k)
+        score = score / (c ** 0.5)
+        score = F.softmax(score, dim=2)
+        # attend to values
+        v = v.reshape(b, c, h * w)
+        v = torch.bmm(v, score.permute(0, 2, 1))
+        v = v.reshape(b, c, h, w)
+        v = self.proj_out(v)
+        return x + v
+class Emu3VisionVQTemporalUpsample(nn.Module):
+    def __init__(
+        self,
+        in_channel: int,
+        out_channel: int,
+        kernel_size: Tuple[int, ...] = (3, 3, 3),
+        stride: Tuple[int, ...] = (1, 1, 1)
+    ):
+        super().__init__()
+        self.in_channel = in_channel
+        self.out_channel = out_channel
+        self.conv = Emu3VisionVQCausalConv3d(
+            in_channel,
+            out_channel,
+            kernel_size,
+            stride=stride,
+        )
+    def forward(self, x: torch.Tensor):
+        b, c, t, h, w = x.shape
+        x = x.permute(0, 1, 3, 4, 2).contiguous().view(b, -1, t)
+        x = F.interpolate(x, scale_factor=2.0, mode="nearest")
+        x = x.view(b, c, h, w, -1).permute(0, 1, 4, 2, 3).contiguous()
+        x = self.conv(x)
+        return x
+class Emu3VisionVQTemporalDownsample(nn.Module):
+    def __init__(
+        self,
+        in_channel: int,
+        out_channel: int,
+        kernel_size: Tuple[int, ...] = (4, 3, 3),
+        stride: Tuple[int, ...] = (2, 1, 1),
+    ):
+        super().__init__()
+        self.in_channel = in_channel
+        self.out_channel = out_channel
+        self.kernel_size = kernel_size
+        self.conv = Emu3VisionVQCausalConv3d(
+            in_channel,
+            out_channel,
+            kernel_size=kernel_size,
+            stride=stride,
+        )
+    def forward(self, x: torch.Tensor):
+        x = self.conv(x)
+        return x
+class Emu3VisionVQVectorQuantizer(nn.Module):
+    def __init__(self, config: Emu3VisionVQConfig):
+        super().__init__()
+        self.embedding = nn.Embedding(config.codebook_size, config.embed_dim)
+        self.embedding.weight.data.uniform_(-1.0 / config.codebook_size, 1.0 / config.codebook_size)
+    def forward(self, x: torch.Tensor):
+        # b t c h w -> b t h w c
+        b, t, c, h, w = x.shape
+        x = x.permute(0, 1, 3, 4, 2).contiguous()
+        x_flattened = x.view(-1, c)
+        codebook = self.embedding.weight
+        d = torch.sum(x_flattened ** 2, dim=1, keepdim=True) + \
+            torch.sum(codebook ** 2, dim=1) - 2 * \
+            torch.einsum('bd,dn->bn', x_flattened, codebook.permute(1, 0))
+        indices = torch.argmin(d, dim=1)
+        indices = indices.view(b, t, h, w)
+        return indices
+class Emu3VisionVQEncoder(nn.Module):
+    def __init__(self, config: Emu3VisionVQConfig):
+        super().__init__()
+        self.ch = config.ch
+        self.num_resolutions = len(config.ch_mult)
+        self.num_res_blocks = config.num_res_blocks
+        self.in_channels = config.in_channels
+        # downsampling
+        self.conv_in = nn.Conv2d(
+            self.in_channels,
+            self.ch,
+            kernel_size=3,
+            stride=1,
+            padding=1
+        )
+        in_ch_mult = (1,) + tuple(config.ch_mult)
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = config.ch * in_ch_mult[i_level]
+            block_out = config.ch * config.ch_mult[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(
+                    Emu3VisionVQResnetBlock(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        dropout=config.dropout,
+                    )
+                )
+                block_in = block_out
+                if i_level in config.attn_resolutions:
+                    attn.append(Emu3VisionVQAttnBlock(block_in))
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:
+                down.downsample = Emu3VisionVQDownsample(block_in)
+            self.down.append(down)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = Emu3VisionVQResnetBlock(
+            in_channels=block_in,
+            out_channels=block_in,
+            dropout=config.dropout,
+        )
+        self.mid.attn_1 = Emu3VisionVQAttnBlock(block_in)
+        self.mid.block_2 = Emu3VisionVQResnetBlock(
+            in_channels=block_in,
+            out_channels=block_in,
+            dropout=config.dropout,
+        )
+        # end
+        self.norm_out = nn.GroupNorm(num_channels=block_in, num_groups=32, eps=1e-6, affine=True)
+        out_z_channels = 2 * config.z_channels if config.double_z else config.z_channels
+        self.conv_out = nn.Conv2d(
+            block_in,
+            out_z_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+        temporal_down_blocks = int(math.log2(config.temporal_downsample_factor))
+        self.time_conv = nn.ModuleList()
+        for i in range(temporal_down_blocks):
+            conv = Emu3VisionVQTemporalDownsample(out_z_channels, out_z_channels)
+            self.time_conv.append(conv)
+        self.time_res_stack = nn.Sequential(*[
+            Emu3VisionVQResnetTemporalBlock(
+                in_channels=out_z_channels,
+                out_channels=out_z_channels,
+                dropout=config.dropout,
+            ) for _ in range(self.num_res_blocks)
+        ])
+        self.act = Emu3VisionVQActivation()
+    def forward(self, x: torch.Tensor):
+        t = x.shape[1]
+        x = x.reshape(-1, *x.shape[2:])
+        # downsampling
+        h = self.conv_in(x)
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](h)
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+            if i_level != self.num_resolutions - 1:
+                h = self.down[i_level].downsample(h)
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # end
+        h = self.norm_out(h)
+        h = self.act(h)
+        h = self.conv_out(h)
+        h = h.reshape(-1, t, *h.shape[1:])
+        h = h.permute(0, 2, 1, 3, 4)
+        for conv in self.time_conv:
+            h = self.act(conv(h))
+        h = self.time_res_stack(h)
+        h = h.permute(0, 2, 1, 3, 4)
+        return h
+class Emu3VisionVQDecoder(nn.Module):
+    def __init__(self, config: Emu3VisionVQConfig):
+        super().__init__()
+        self.ch = config.ch
+        self.num_resolutions = len(config.ch_mult)
+        self.num_res_blocks = config.num_res_blocks
+        in_ch_mult = (1,) + tuple(config.ch_mult)
+        zq_ch = config.embed_dim
+        block_in = config.ch * config.ch_mult[-1]
+        self.time_res_stack = nn.Sequential(*[
+            Emu3VisionVQResnetTemporalBlock(
+                in_channels=config.z_channels,
+                out_channels=config.z_channels,
+                dropout=config.dropout,
+            ) for _ in range(config.num_res_blocks)
+        ])
+        tempo_upsample_block_num = int(math.log2(config.temporal_downsample_factor))
+        self.time_conv = nn.ModuleList()
+        for i in range(tempo_upsample_block_num):
+            conv = Emu3VisionVQTemporalUpsample(config.z_channels, config.z_channels)
+            self.time_conv.append(conv)
+        self.conv_in = nn.Conv2d(
+            config.z_channels,
+            block_in,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = Emu3VisionVQResnetBlock(
+            in_channels=block_in,
+            out_channels=block_in,
+            dropout=config.dropout,
+            zq_ch=zq_ch,
+        )
+        self.mid.attn_1 = Emu3VisionVQAttnBlock(block_in, zq_ch)
+        self.mid.block_2 = Emu3VisionVQResnetBlock(
+            in_channels=block_in,
+            out_channels=block_in,
+            dropout=config.dropout,
+            zq_ch=zq_ch,
+        )
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = config.ch * config.ch_mult[i_level]
+            for i_block in range(self.num_res_blocks + 1):
+                block.append(
+                    Emu3VisionVQResnetBlock(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        dropout=config.dropout,
+                        zq_ch=zq_ch,
+                    )
+                )
+                block_in = block_out
+                if i_level in config.attn_resolutions:
+                    attn.append(Emu3VisionVQAttnBlock(block_in, zq_ch))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                up.upsample = Emu3VisionVQUpsample(block_in)
+            self.up.insert(0, up)
+        self.act = Emu3VisionVQActivation()
+        self.norm_out = Emu3VisionVQSpatialNorm(block_in, zq_ch)
+        self.conv_out = nn.Conv2d(
+            block_in,
+            config.out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+    def forward(self, z: torch.Tensor, zq: torch.Tensor):
+        z_zq = torch.cat((z, zq), dim=0)
+        z_zq = z_zq.permute(0, 2, 1, 3, 4)
+        z_zq = self.time_res_stack(z_zq)
+        for conv in self.time_conv:
+            z_zq = self.act(conv(z_zq))
+        z_zq = z_zq.permute(0, 2, 1, 3, 4)
+        h, zq = torch.chunk(z_zq, 2, dim=0)
+        h = h.reshape(-1, *h.shape[2:])
+        zq = zq.reshape(-1, *zq.shape[2:])
+        h = self.conv_in(h)
+        # middle
+        h = self.mid.block_1(h, zq)
+        h = self.mid.attn_1(h, zq)
+        h = self.mid.block_2(h, zq)
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks+1):
+                h = self.up[i_level].block[i_block](h, zq)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h, zq)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+        h = self.norm_out(h, zq)
+        h = self.act(h)
+        h = self.conv_out(h)
+        return h
+class Emu3VisionVQPretrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+    config_class = Emu3VisionVQConfig
+    base_model_prefix = "emuvideovq"
+    main_input_name = "pixel_values"
+    _no_split_modules = ["Emu3VisionVQResnetBlock", "Emu3VisionVQAttnBlock", "Emu3VisionVQResnetTemporalBlock"]
+    def _init_weights(self, module):
+        if isinstance(module, (nn.Conv2d, nn.Conv3d)):
+            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+        # copied from the `reset_parameters` method of `class Linear(Module)` in `torch`.
+        elif isinstance(module, nn.Linear):
+            nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
+            if module.bias is not None:
+                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
+                bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+                nn.init.uniform_(module.bias, -bound, bound)
+        elif isinstance(module, (nn.BatchNorm2d, nn.BatchNorm3d, nn.GroupNorm)):
+            nn.init.constant_(module.weight, 1)
+            nn.init.constant_(module.bias, 0)
+class Emu3VisionVQModel(Emu3VisionVQPretrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.encoder = Emu3VisionVQEncoder(config)
+        self.decoder = Emu3VisionVQDecoder(config)
+        self.quantize = Emu3VisionVQVectorQuantizer(config)
+        self.quant_conv = Emu3VisionVQCausalConv3d(config.z_channels, config.embed_dim)
+        self.post_quant_conv = Emu3VisionVQCausalConv3d(config.embed_dim, config.z_channels)
+        self.spatial_scale_factor = 2 ** (len(config.ch_mult) - 1)
+        self.post_init()
+    def encode(self, x: torch.Tensor):
+        ndim = x.ndim
+        if ndim == 4:
+            t = self.config.temporal_downsample_factor
+            b, c, h, w = x.shape
+            x = x.unsqueeze(1).repeat(1, t, 1, 1, 1)
+        elif ndim == 5:
+            b, t, c, h, w = x.shape
+        h = self.encoder(x)
+        # b t c h w -> b c t h w
+        h = h.permute(0, 2, 1, 3, 4)
+        h = self.quant_conv(h)
+        # b c t h w -> b t c h w
+        h = h.permute(0, 2, 1, 3, 4)
+        codes = self.quantize(h)
+        if ndim == 4:
+            codes = codes.squeeze(1)
+        return codes
+    def decode(self, x: torch.Tensor):
+        ndim = x.ndim
+        if ndim == 3:
+            x = x.unsqueeze(1)
+        b, t, h, w = x.shape
+        quant = self.quantize.embedding(x.flatten())
+        c = quant.shape[-1]
+        quant = quant.view(b, t, h, w, c).permute(0, 4, 1, 2, 3).contiguous()
+        quant2 = self.post_quant_conv(quant)
+        quant = quant.permute(0, 2, 1, 3, 4)
+        quant2 = quant2.permute(0, 2, 1, 3, 4)
+        video = self.decoder(quant2, quant)
+        video = video.reshape(
+            b,
+            t * self.config.temporal_downsample_factor,
+            self.config.out_channels,
+            h * self.spatial_scale_factor,
+            w * self.spatial_scale_factor,
+        )
+        if ndim == 3:
+            return video[:, 0]
+        return video
+    @property
+    def device(self):
+        return next(self.parameters()).device
+    @property
+    def dtype(self):
+        return next(self.parameters()).dtype

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "auto_map": {
+      "AutoImageProcessor": "image_processing_emu3visionvq.Emu3VisionVQImageProcessor"
+  },
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "Emu3VisionVQImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "max_pixels": 1048576,
+  "min_pixels": 262144,
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "max_pixels": 1048576,
+    "min_pixels": 262144
+  },
+  "spatial_factor": 8
+}