BryanW committed
Commit 2710119 · verified · 1 Parent(s): 63f0c29

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.
Files changed (50)
  1. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/__init__.cpython-312.pyc +0 -0
  2. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/beam_constraints.cpython-312.pyc +0 -0
  3. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/beam_search.cpython-312.pyc +0 -0
  4. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/candidate_generator.cpython-312.pyc +0 -0
  5. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/configuration_utils.cpython-312.pyc +0 -0
  6. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/flax_logits_process.cpython-312.pyc +0 -0
  7. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/flax_utils.cpython-312.pyc +0 -0
  8. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/stopping_criteria.cpython-312.pyc +0 -0
  9. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/streamers.cpython-312.pyc +0 -0
  10. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/tf_logits_process.cpython-312.pyc +0 -0
  11. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/watermarking.cpython-312.pyc +0 -0
  12. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/__init__.py +26 -0
  13. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/__pycache__/__init__.cpython-312.pyc +0 -0
  14. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/__pycache__/cache.cpython-312.pyc +0 -0
  15. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/__pycache__/cache_manager.cpython-312.pyc +0 -0
  16. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/__pycache__/continuous_api.cpython-312.pyc +0 -0
  17. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/__pycache__/requests.cpython-312.pyc +0 -0
  18. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/__pycache__/scheduler.cpython-312.pyc +0 -0
  19. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/cache.py +606 -0
  20. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/cache_manager.py +226 -0
  21. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/continuous_api.py +1047 -0
  22. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/requests.py +204 -0
  23. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/scheduler.py +300 -0
  24. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/albert/__init__.py +31 -0
  25. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/albert/configuration_albert.py +170 -0
  26. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/albert/modeling_albert.py +1349 -0
  27. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/albert/modeling_flax_albert.py +1132 -0
  28. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/albert/modeling_tf_albert.py +1572 -0
  29. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/albert/tokenization_albert.py +320 -0
  30. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/albert/tokenization_albert_fast.py +178 -0
  31. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/__init__.py +35 -0
  32. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py +882 -0
  33. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/configuration_auto.py +1404 -0
  34. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/feature_extraction_auto.py +422 -0
  35. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/image_processing_auto.py +688 -0
  36. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/modeling_auto.py +0 -0
  37. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/modeling_flax_auto.py +413 -0
  38. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/modeling_tf_auto.py +776 -0
  39. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/processing_auto.py +443 -0
  40. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/tokenization_auto.py +1235 -0
  41. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/video_processing_auto.py +393 -0
  42. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bark/__init__.py +28 -0
  43. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bark/configuration_bark.py +303 -0
  44. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bark/generation_configuration_bark.py +330 -0
  45. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bark/modeling_bark.py +1628 -0
  46. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bark/processing_bark.py +340 -0
  47. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bert/__init__.py +32 -0
  48. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bert/configuration_bert.py +154 -0
  49. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py +1801 -0
  50. URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bert/modeling_flax_bert.py +1727 -0
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (7.34 kB).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/beam_constraints.cpython-312.pyc ADDED
Binary file (23.1 kB).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/beam_search.cpython-312.pyc ADDED
Binary file (44.8 kB).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/candidate_generator.cpython-312.pyc ADDED
Binary file (61 kB).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/configuration_utils.cpython-312.pyc ADDED
Binary file (76.5 kB).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/flax_logits_process.cpython-312.pyc ADDED
Binary file (32.4 kB).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/flax_utils.cpython-312.pyc ADDED
Binary file (46.5 kB).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/stopping_criteria.cpython-312.pyc ADDED
Binary file (31.2 kB).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/streamers.cpython-312.pyc ADDED
Binary file (14.7 kB).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/tf_logits_process.cpython-312.pyc ADDED
Binary file (40.4 kB).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/__pycache__/watermarking.cpython-312.pyc ADDED
Binary file (28 kB).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/__init__.py ADDED
@@ -0,0 +1,26 @@
+ # coding=utf-8
+ # Copyright 2025 The HuggingFace Inc. team
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from .cache import PagedAttentionCache
+ from .continuous_api import ContinuousBatchingManager, ContinuousMixin
+ from .requests import RequestState, RequestStatus
+
+
+ __all__ = [
+     "ContinuousBatchingManager",
+     "ContinuousMixin",
+     "PagedAttentionCache",
+     "RequestState",
+     "RequestStatus",
+ ]
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (501 Bytes).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/__pycache__/cache.cpython-312.pyc ADDED
Binary file (31.7 kB).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/__pycache__/cache_manager.cpython-312.pyc ADDED
Binary file (12.4 kB).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/__pycache__/continuous_api.cpython-312.pyc ADDED
Binary file (55.7 kB).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/__pycache__/requests.cpython-312.pyc ADDED
Binary file (10.5 kB).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/__pycache__/scheduler.cpython-312.pyc ADDED
Binary file (16 kB).
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/cache.py ADDED
@@ -0,0 +1,606 @@
+ # coding=utf-8
+ # Copyright 2025 The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from collections import deque
+ from math import floor, gcd, sqrt
+ from typing import Optional, Union
+
+ import torch
+
+ from ...configuration_utils import PretrainedConfig
+ from ...generation.configuration_utils import GenerationConfig
+ from ...utils.metrics import attach_tracer, traced
+ from .cache_manager import CacheAllocator, FullAttentionCacheAllocator, SlidingAttentionCacheAllocator
+ from .requests import get_device_and_memory_breakdown, logger
+
+
+ def group_layers_by_attn_type(config: PretrainedConfig) -> tuple[list[list[int]], list[str]]:
+     """
+     Group layers depending on the attention mix, according to vLLM's hybrid allocator rules:
+     - Layers in each group need to have the same type of attention
+     - All groups have the same number of layers
+
+     For a model with the following layer types: ["sliding", "full", "full", "sliding", "full", "full", "full", "full"]
+     we would get four groups of two layers each: [0, 3] (sliding), and [1, 2], [4, 5], [6, 7] (full).
+     """
+     # If the config has no layer_types attribute, it means all layers share the same attention type
+     layer_types = getattr(config, "layer_types", None)
+     if layer_types is None:
+         attn_type = "sliding_attention" if getattr(config, "sliding_window", None) is not None else "full_attention"
+         layer_types = [attn_type for _ in range(config.num_hidden_layers)]
+
+     # We then count the number of layers of each type
+     layer_counts = {}
+     for i, layer_type in enumerate(layer_types):
+         layer_counts[layer_type] = layer_counts.get(layer_type, []) + [i]
+
+     # The size of all groups is the greatest common divisor of the number of layers of each type
+     group_size = gcd(*[len(indices) for indices in layer_counts.values()])
+
+     # We then group the layers by type
+     layer_groups = []
+     for layer_type, indices in layer_counts.items():
+         for i in range(0, len(indices), group_size):
+             layer_groups.append(indices[i : i + group_size])
+     # And note the layer types
+     group_types = [layer_types[lg[0]] for lg in layer_groups]
+     return layer_groups, group_types
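
To make the grouping rule above concrete, here is a minimal standalone sketch of the same gcd-based grouping; the toy `config` below is invented for illustration and stands in for a real `PretrainedConfig`:

```python
from math import gcd
from types import SimpleNamespace

# Hypothetical stand-in for a PretrainedConfig with a hybrid attention mix
config = SimpleNamespace(
    layer_types=["sliding", "full", "full", "sliding", "full", "full", "full", "full"]
)

# Count layer indices per attention type: {"sliding": [0, 3], "full": [1, 2, 4, 5, 6, 7]}
layer_counts = {}
for i, layer_type in enumerate(config.layer_types):
    layer_counts.setdefault(layer_type, []).append(i)

# Group size is the gcd of the per-type counts: gcd(2, 6) = 2
group_size = gcd(*[len(indices) for indices in layer_counts.values()])

# Slice each type's index list into chunks of group_size
groups = [
    indices[i : i + group_size]
    for indices in layer_counts.values()
    for i in range(0, len(indices), group_size)
]
print(groups)  # [[0, 3], [1, 2], [4, 5], [6, 7]]
```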
+
+
+ @attach_tracer()
+ class PagedAttentionCache:
+     """
+     Manages the cache for a paged attention mechanism, inspired by vLLM's hybrid allocator. The cache relies on making
+     groups of layers to reduce the complexity of cache management and fragmentation.
+
+     The cache uses a three-level hierarchy:
+     - Pages: The smallest unit of cache. A page has a size of [num_heads, head_size], which is the space needed to
+       store the key or value states for one token and one layer. For a model with only full-attention layers, to store
+       the KV cache of one token, we need `2 * num_layers` pages: keys and values each take `num_layers` pages.
+       Pages are grouped into blocks:
+     - Blocks: A block is a collection of `block_size` pages, serving as the allocation unit to reduce management
+       complexity and fragmentation. Cache is allocated and freed block by block, not page by page. One block is
+       allocated to one layer group, which only has one attention type, like full-attention or sliding-attention.
+       If all layers in the model have the same attention type, then all layers will be in the same group. There is
+       more than one group if and only if the model has mixed attention types, like layers with full-attention and
+       layers with sliding-attention.
+     - Cache tensors: The physical backing for the cache. There are as many cache tensors as there are layers in a
+       layer group, and the shape of each cache tensor is `[num_blocks * block_size, num_heads, head_size]`.
+
+     Grouping layers this way is useful because when we allocate one block to a group N, the block allocated is the
+     same for all layers in group N; equivalently, it is allocated across all cache tensors. This allows us to
+     efficiently allocate and free blocks, and to efficiently read and write key and value states.
+
+     For instance, imagine we have 8 blocks of cache and a model with two layer groups: a full-attention group with 3
+     layers and a sliding-attention group with 3 layers. At creation time, the physical cache tensors look like this:
+
+     cache_tensor_0: □ □ □ □ □ □ □ □
+     cache_tensor_1: □ □ □ □ □ □ □ □
+     cache_tensor_2: □ □ □ □ □ □ □ □
+
+     where □ means the block is not allocated to any layer group yet. We have 3 cache tensors because there are
+     3 layers per group.
+     We allocate 1 block to each group; after allocation, the cache tensors look like this:
+
+     cache_tensor_0: ✖ ◉ □ □ □ □ □ □
+     cache_tensor_1: ✖ ◉ □ □ □ □ □ □
+     cache_tensor_2: ✖ ◉ □ □ □ □ □ □
+
+     where ✖ means the block is allocated to the full-attention group, and ◉ means the block is allocated to the
+     sliding-attention group.
+     Now, if we continue to generate and the sliding window has been reached, we only need to allocate a new block
+     for the full-attention group, and the cache tensors look like this:
+
+     cache_tensor_0: ✖ ◉ ✖ □ □ □ □ □
+     cache_tensor_1: ✖ ◉ ✖ □ □ □ □ □
+     cache_tensor_2: ✖ ◉ ✖ □ □ □ □ □
+
+     And after further generation, when we need a new block allocated:
+
+     cache_tensor_0: ✖ ◉ ✖ ✖ □ □ □ □
+     cache_tensor_1: ✖ ◉ ✖ ✖ □ □ □ □
+     cache_tensor_2: ✖ ◉ ✖ ✖ □ □ □ □
+
+     This would not have been possible if all layers were in the same group: we would have had to allocate a new block
+     for the sliding-attention group as well, although it is not needed.
+     """
+
+     # TODO: this init is quite long, maybe a refactor is in order
+     def __init__(
+         self,
+         config: PretrainedConfig,
+         generation_config: GenerationConfig,
+         device: torch.device,
+         dtype: torch.dtype = torch.float16,
+         layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
+         tp_size: Optional[int] = None,
+     ) -> None:
+         """Initialize a paged attention cache for efficient memory usage.
+
+         Args:
+             config: Model configuration
+             generation_config: Generation configuration containing cache parameters
+             device: Device for the cache tensors
+             dtype: Data type of the cache
+             layer_device_map: Optional mapping of layer indices to devices
+             tp_size: Tensor parallelism size
+         """
+         self.config = config
+         self.dtype = dtype
+         self.device = device
+
+         # Extract model dimensions
+         kv_heads = getattr(config, "num_key_value_heads", None)
+         self.num_key_value_heads: int = kv_heads if kv_heads is not None else config.num_attention_heads
+         head_dim = getattr(config, "head_dim", None)
+         self.head_dim: int = head_dim if head_dim is not None else config.hidden_size // config.num_attention_heads
+
+         # Extract cache dimensions
+         self.block_size = getattr(generation_config, "block_size", 32)
+
+         # Group layers depending on the attention mix
+         layer_groups, group_types = group_layers_by_attn_type(config)
+         group_size = len(layer_groups[0])
+         self.num_groups = len(layer_groups)
+
+         self.sliding_windows = {}
+         self.layer_index_to_group_indices = {}
+         for i, group in enumerate(layer_groups):
+             sliding_window = config.sliding_window if group_types[i] == "sliding_attention" else 1
+             for j, layer in enumerate(group):
+                 self.layer_index_to_group_indices[layer] = (i, j)
+                 self.sliding_windows[layer] = sliding_window
+
+         # Handle TP (or don't)
+         if tp_size is not None and tp_size > 1:
+             if self.num_key_value_heads % tp_size != 0:
+                 raise ValueError(
+                     f"Number of key value heads {self.num_key_value_heads} must be divisible by tensor parallel size {tp_size}."
+                 )
+             # If the model is using tensor parallelism, we need to adjust the number of heads accordingly.
+             # self.num_key_value_heads //= tp_size  # TODO: why is this commented out?
+
+         # Infer number of blocks and max batch tokens
+         page_size = self.head_dim * self.num_key_value_heads
+
+         if getattr(config, "attn_implementation", None) == "paged_attention":
+             num_attention_masks = 0
+         else:
+             # TODO: when we generalize to allow for block-attn, we can use `num_attention_masks=sum(set(group_types))`
+             num_attention_masks = 2 if "sliding_attention" in group_types else 1
+
+         memory_handler = PagedAttentionMemoryHandler(
+             block_size=self.block_size,
+             page_size=page_size,
+             num_groups=self.num_groups,
+             group_size=group_size,
+             peak_activation_per_token=(config.hidden_size + config.vocab_size),
+             num_attention_masks=num_attention_masks,
+         )
+         num_blocks, max_batch_tokens = memory_handler.infer_num_blocks_and_max_batch_tokens(
+             num_blocks=getattr(generation_config, "num_blocks", None),
+             max_batch_tokens=getattr(generation_config, "max_batch_tokens", None),
+             max_memory_percent=getattr(generation_config, "max_memory", 0.9),
+             cache_dtype=self.dtype,
+         )
+
+         # Add the inferred attributes to the class
+         self.num_blocks = num_blocks
+         self.max_batch_tokens = max_batch_tokens
+         logger.info(
+             f"PagedAttentionCache initialized with {self.num_blocks = }, {self.block_size = }, {page_size = }, "
+             f"{self.max_batch_tokens = } {num_attention_masks = }"
+         )
+
+         # Initialize the cache
+         self.key_cache: list[torch.Tensor] = []
+         self.value_cache: list[torch.Tensor] = []
+         # We add one extra token to the cache to handle padding and generally discard unwanted tokens
+         self.cache_shape = (num_blocks * self.block_size + 1, self.num_key_value_heads, self.head_dim)
+         for _ in range(group_size):
+             new_layer_key_cache = torch.empty(self.cache_shape, dtype=self.dtype, device=self.device)
+             new_layer_value_cache = torch.empty(self.cache_shape, dtype=self.dtype, device=self.device)
+             torch._dynamo.mark_static_address(new_layer_key_cache)
+             torch._dynamo.mark_static_address(new_layer_value_cache)
+             self.key_cache.append(new_layer_key_cache)
+             self.value_cache.append(new_layer_value_cache)
+         logger.info(f"{self.cache_shape = } {self.key_cache[0].shape = } {self.key_cache[0].numel() = }")
+
+         # Block management data structures
+         self._free_blocks = deque(range(num_blocks))
+         self.group_cache_managers: list[CacheAllocator] = []
+         for i, group_type in enumerate(group_types):
+             if group_type == "full_attention":
+                 cm = FullAttentionCacheAllocator(i, self.block_size)
+             elif group_type == "sliding_attention":
+                 cm = SlidingAttentionCacheAllocator(i, self.block_size, config.sliding_window)
+             else:
+                 raise ValueError(f"Invalid group type: {group_type}")
+             self.group_cache_managers.append(cm)
+
+     @traced
+     def allocate_blocks(self, n_blocks: int, request_id: str) -> Optional[int]:
+         """Allocate cache blocks across all layer groups for a given request. Actual allocation is done by the cache
+         managers; this method only returns the maximum number of blocks actually allocated across all managers."""
+         max_allocated = 0
+         for cm in self.group_cache_managers:
+             allocated = cm.allocate_blocks(n_blocks, request_id, self._free_blocks)
+             if allocated is None:
+                 return None
+             max_allocated = max(max_allocated, allocated)
+         return max_allocated
+
+     @traced
+     def free_blocks(self, request_id: str) -> None:
+         """Free all allocated cache blocks for a given request across all layer groups. Actual deallocation is done
+         by the cache managers."""
+         for cm in self.group_cache_managers:
+             cm.free_blocks(request_id, self._free_blocks)
+
+     def get_num_free_blocks(self) -> int:
+         """Get the current number of unallocated blocks available for new requests."""
+         return len(self._free_blocks)
+
+     @traced
+     def extend_read_indices(
+         self, request_id: str, past_length: int, query_length: int, read_index: list[list[int]]
+     ) -> None:
+         """Retrieve physical cache indices for reading KV states in the cache across all layer groups. This method
+         coordinates with all cache managers to build the complete set of read indices needed for attention computation.
+         """
+         for cm, read_indices in zip(self.group_cache_managers, read_index):
+             indices = cm.get_read_indices(request_id, past_length, query_length)
+             read_indices.extend(indices)
+
+     @traced
+     def extend_write_indices(
+         self, request_id: str, past_length: int, query_length: int, write_index: list[list[int]]
+     ) -> None:
+         """Retrieve physical cache indices for writing new KV states to the cache across all layer groups. This method
+         coordinates with all cache managers to build the complete set of write indices needed to store the computed KV
+         states."""
+         for cm, write_indices in zip(self.group_cache_managers, write_index):
+             indices = cm.get_write_indices(request_id, past_length, query_length)
+             write_indices.extend(indices)
+
+     @traced
+     def get_seqlens_k(self, request_id: str, past_length: int, query_length: int) -> dict[str, int]:
+         """Retrieve the key sequence length for the given request_id across all layer types. Returns a dictionary of
+         layer types to their corresponding key sequence lengths."""
+         seqlens_k = {}
+         for cm in self.group_cache_managers:
+             attn_type, seqlen_k = cm.get_seqlens_k(request_id, past_length, query_length)
+             seqlens_k[attn_type] = seqlen_k
+         return seqlens_k
+
+     @traced
+     def update(
+         self,
+         key_states: torch.Tensor,  # shape [1, num_kv_heads, seqlen_kv, head_dim]
+         value_states: torch.Tensor,  # shape [1, num_kv_heads, seqlen_kv, head_dim]
+         layer_idx: int,
+         read_index: list[torch.Tensor],  # shape [num_layer_groups, seqlen_kv + past_length]
+         write_index: list[torch.Tensor],  # shape [num_layer_groups, seqlen_q]
+         **kwargs,
+     ) -> tuple[torch.Tensor, torch.Tensor]:  # shape [seqlen_kv + past_length, num_kv_heads, head_dim]
+         """Update the cache with new key-value states for a specific layer. This method writes new KV states to the
+         appropriate cache locations. The behavior differs based on the layer's attention type:
+
+         - Full attention: new KV states are written to the cache, then the complete sequence is read from the cache
+         - Sliding window: old KV states are read from the cache along with extra spaces for the new KV, then the new
+           KV states are written to the cache. This is because the new KV might overwrite the old KV, so we need to
+           read the old KV first.
+
+         Returns the complete KV states (cached + new) for attention computation.
+         """
+         # Retrieve the layer read and write indices, and whether there is a sliding window
+         group_idx, layer_idx_in_group = self.layer_index_to_group_indices[layer_idx]
+         layer_read_index = read_index[group_idx]
+         layer_write_index = write_index[group_idx]
+         # Select the correct cache
+         k_cache = self.key_cache[layer_idx_in_group]
+         v_cache = self.value_cache[layer_idx_in_group]
+         # Transpose the key and value states to match the cache shape, after which the shape is [seqlen_kv, num_kv_heads, head_dim]
+         key_states = key_states.transpose(1, 2).squeeze(0)
+         value_states = value_states.transpose(1, 2).squeeze(0)
+
+         # Case: full attention
+         sliding_window = self.sliding_windows[layer_idx]
+         if sliding_window == 1:
+             k_cache[layer_write_index, :, :] = key_states
+             v_cache[layer_write_index, :, :] = value_states
+             key_states_with_cache = k_cache[layer_read_index, :, :]
+             value_states_with_cache = v_cache[layer_read_index, :, :]
+
+         # Case: sliding window -- we need to be careful about read/write order because of chunked prefill: it is
+         # the only case where we may write over cache we still need to use
+         else:
+             # Add the cache to the key and value states
+             mask = layer_read_index == -1  # TODO: can this be efficiently precomputed?
+             key_states_with_cache = k_cache[layer_read_index, :, :]
+             key_states_with_cache[mask] = key_states
+             value_states_with_cache = v_cache[layer_read_index, :, :]
+             value_states_with_cache[mask] = value_states
+             # Write new KV values to the cache
+             k_cache[layer_write_index, :, :] = key_states
+             v_cache[layer_write_index, :, :] = value_states
+
+         # Return the new KV values
+         return key_states_with_cache, value_states_with_cache
+
+
+ # TODO: rework computation with the groups and their sizes
+ class PagedAttentionMemoryHandler:
+     """A helper class to determine the best number of pages and maximum number of tokens per batch for the paged
+     attention cache, providing automatic sizing based on available GPU memory.
+     The helper works using the number of pages, which is tied to the number of blocks by:
+         num_blocks = num_pages // block_size
+
+     The memory footprint consists of three main components:
+     - Cache memory: the space needed to store the cache tensors:
+         2 * layer_group_size * [num_pages, page_size] * cache_dtype
+     - Activation memory: the space temporarily taken by the largest activation during the model forward pass:
+         peak_activation_per_token * max_tokens_per_batch * activation_dtype_size
+     - Static tensors: the space taken by the input/output buffers and metadata tensors for batch processing, sum of:
+         - inputs_ids + outputs_ids + position_ids + logits_indices: 4 * max_tokens_per_batch * int32_size
+         - attention_mask: num_attention_masks * num_pages * max_tokens_per_batch * activation_dtype_size
+         - cumulative_seqlens_q + cumulative_seqlens_k: (1 + 2) * max_tokens_per_batch * int32_size
+         - write_index_tensor: num_groups * max_tokens_per_batch * int32_size
+         - read_index_tensor: num_groups * (num_pages + max_tokens_per_batch) * int32_size
+
+     The handler can operate in three modes:
+     1. Auto-sizing: determines both the number of pages and the maximum number of tokens per batch using quadratic
+        optimization
+     2. Fixed cache: calculates the max batch tokens given a fixed number of pages
+     3. Fixed batch: calculates the number of pages given a fixed maximum batch size
+     """
+
+     _activation_dtype = torch.bfloat16
+     _input_dtype = torch.int32
+     _upper_bound_max_batch_tokens = 256
+     _upper_bound_num_blocks = 4096
+
+     def __init__(
+         self,
+         block_size: int,
+         page_size: int,
+         num_groups: int,
+         group_size: int,
+         peak_activation_per_token: int,
+         num_attention_masks: int,
+     ) -> None:
+         """Initialize the memory handler with the parameters that cannot be automatically inferred.
+
+         Args:
+             block_size: Size of the cache blocks
+             page_size: Size of the cache pages
+             num_groups: Number of layer groups
+             group_size: Number of layers per layer group
+             peak_activation_per_token: Maximum size of the activation tensor per token, = hidden_size + vocab_size
+             num_attention_masks: Number of attention masks: 0 if no attention mask is used, 2 if hybrid model, else 1
+         """
+         self.block_size = block_size
+         self.page_size = page_size
+         self.num_groups = num_groups
+         self.group_size = group_size
+         self.peak_activation_per_token = peak_activation_per_token
+         self.num_attention_masks = num_attention_masks
+
+     @staticmethod
+     def get_available_memory(max_memory_percent: float = 1.0) -> int:
+         """Calculate available GPU memory for cache allocation, accounting for already allocated tensors.
+         This method queries the current memory state and applies the specified percentage limit to determine
+         how much memory can be safely used for the paged attention cache.
+
+         Args:
+             max_memory_percent: Fraction of available memory to use (0.0-1.0). 1.0 means use all available memory.
+
+         Returns:
+             int: Available memory in bytes for cache allocation
+         """
+         _, total, reserved, allocated = get_device_and_memory_breakdown()
+         available_memory = total - max(allocated, reserved)
+         available_memory = int(available_memory * max_memory_percent)
+         return available_memory
+
+     def infer_num_blocks_and_max_batch_tokens(
+         self,
+         num_blocks: Optional[int] = None,
+         max_batch_tokens: Optional[int] = None,
+         max_memory_percent: float = 0.9,
+         cache_dtype: torch.dtype = torch.float16,
+     ) -> tuple[int, int]:
+         """Determine the optimal number of blocks and maximum number of tokens per batch based on available memory
+         and constraints. Check the class docstring for more details. Naming the number of pages N and the maximum
+         number of tokens per batch M, the equation solved is:
+
+             available_memory = sum([
+                 MN * num_attention_masks * activation_dtype_size,
+                 2N * (layer_group_size * page_size * cache_dtype + 2 * num_group),
+                 M * (peak_activation_per_token * activation_dtype + 28 + 4 * num_group),
+             ])
+
+         where we already simplified int32_size = 4.
+         """
+         # If neither num_blocks nor max_batch_tokens is provided, we solve a second-order polynomial
+         if num_blocks is None and max_batch_tokens is None:
+             num_blocks, max_batch_tokens = self.compute_num_blocks_and_max_batch_tokens(
+                 max_memory_percent, cache_dtype
+             )
+         # If only num_blocks is provided, we infer the max_batch_tokens
+         elif num_blocks is not None and max_batch_tokens is None:
+             max_batch_tokens = self.compute_max_batch_tokens(num_blocks, max_memory_percent, cache_dtype)
+         # If only max_batch_tokens is provided, we infer the num_blocks
+         elif max_batch_tokens is not None and num_blocks is None:
+             num_blocks = self.compute_num_blocks(max_batch_tokens, max_memory_percent, cache_dtype)
+
+         # We check if the memory footprint is too large in all cases
+         available_memory = self.get_available_memory(max_memory_percent)
+         memory_footprint = self.compute_memory_footprint(
+             max_batch_tokens=max_batch_tokens,
+             num_blocks=num_blocks,
+             cache_dtype=cache_dtype,
+         )
+         if memory_footprint > available_memory:
+             raise MemoryError(f"Memory footprint {memory_footprint} is more than available memory {available_memory}")
+         return num_blocks, max_batch_tokens
+
+     def compute_num_blocks_and_max_batch_tokens(
+         self,
+         max_memory_percent: float = 0.9,
+         cache_dtype: torch.dtype = torch.float16,
+         m: float = 0.01,
+     ) -> tuple[int, int]:
+         """Calculate the optimal number of blocks and maximum number of tokens per batch using quadratic optimization
+         when neither is fixed. This method assumes a relationship M = m * N, where m is a small ratio below 1, and
+         solves the resulting quadratic equation to find the optimal N that maximizes utilization within memory
+         constraints. m is the amount of cache we can fill with one batch: m = 0.01 means a batch fills at most 1% of
+         the cache. The equation to solve is:
+
+             available_memory = sum([
+                 m * N^2 * num_attention_masks * activation_dtype_size,
+                 2N * (layer_group_size * page_size * cache_dtype + 2 * num_group),
+                 m * N * (peak_activation_per_token * activation_dtype + 28 + 4 * num_group),
+             ])
+         """
+         cache_memory = self.get_available_memory(max_memory_percent)
+         logger.info(f"Cache memory: {cache_memory}")
+
+         # Compute second-degree polynomial coefficients
+         a = m * self.num_attention_masks * self._activation_dtype.itemsize
+         b = 2 * (self.group_size * self.page_size * cache_dtype.itemsize + 2 * self.num_groups)
+         b += m * (self.peak_activation_per_token * self._activation_dtype.itemsize + 28 + 4 * self.num_groups)
+         c = -cache_memory
+         logger.debug(f"Coefficients of 2nd degree polynomial: {a = }, {b = }, {c = }")
+
+         # Compute discriminant and greatest solution
+         discriminant = b**2 - 4 * a * c
+         if discriminant < 0:
+             raise ValueError(f"Discriminant is negative: {discriminant = }")
+         greatest_solution = (-b + sqrt(discriminant)) / (2 * a)
+         if greatest_solution < 0:
+             raise ValueError(f"Greatest solution is negative: {greatest_solution = }")
+
+         # Infer number of blocks and max batch tokens
+         num_pages = floor(greatest_solution)
+         num_blocks = num_pages // self.block_size
+         if num_blocks > self._upper_bound_num_blocks:
+             logger.info(f"{num_blocks = } is too large, setting to {self._upper_bound_num_blocks = }")
+             num_blocks = self._upper_bound_num_blocks
+         max_batch_tokens = int(greatest_solution * m)
+         if max_batch_tokens > self._upper_bound_max_batch_tokens:
+             logger.info(f"{max_batch_tokens = } is too large, setting to {self._upper_bound_max_batch_tokens = }")
+             max_batch_tokens = self._upper_bound_max_batch_tokens
+         return num_blocks, max_batch_tokens
+
+     def compute_max_batch_tokens(
+         self,
+         num_blocks: int,
+         max_memory_percent: float = 0.9,
+         cache_dtype: torch.dtype = torch.float16,
+     ) -> int:
+         """Calculate the maximum batch tokens M given a fixed number of cache blocks. The formula for M is given by:
+
+             M = (available_memory - 2N * (layer_group_size * page_size * cache_dtype + 2 * num_group))
+                 / (activation_dtype_size * (N * num_attention_masks + peak_activation_per_token) + 28 + 4 * num_group)
+         """
+         cache_memory = self.get_available_memory(max_memory_percent)
+         num_pages = num_blocks * self.block_size
+         # Compute numerator
+         num = cache_memory
+         num -= 2 * num_pages * (self.group_size * self.page_size * cache_dtype.itemsize + 2 * self.num_groups)
+         # Compute denominator
+         denum = self._activation_dtype.itemsize * (
+             num_pages * self.num_attention_masks + self.peak_activation_per_token
+         )
+         denum += 28 + 4 * self.num_groups
+         # Compute max batch tokens and return
+         max_batch_tokens = floor(num / denum)
+         if max_batch_tokens > self._upper_bound_max_batch_tokens:
+             logger.info(f"{max_batch_tokens = } is too large, setting to {self._upper_bound_max_batch_tokens = }")
+             max_batch_tokens = self._upper_bound_max_batch_tokens
+         return max_batch_tokens
+
+     def compute_num_blocks(
+         self,
+         max_batch_tokens: int,
+         max_memory_percent: float = 0.9,
+         cache_dtype: torch.dtype = torch.float16,
+     ) -> int:
+         """Calculate the number of cache blocks N given a fixed maximum number of tokens per batch M. The formula for
+         N is given by:
+
+             N = (available_memory - M * (peak_activation_per_token * activation_dtype + 28 + 4 * num_group))
+                 / (2 * (layer_group_size * page_size * cache_dtype + 2 * num_group) + M * num_attention_masks * activation_dtype_size)
+         """
+         cache_memory = self.get_available_memory(max_memory_percent)
+         # Compute numerator
+         num = cache_memory
+         num -= max_batch_tokens * self.peak_activation_per_token * self._activation_dtype.itemsize
+         num -= max_batch_tokens * (28 + 4 * self.num_groups)
+         # Compute denominator
+         denum = 2 * (self.group_size * self.page_size * cache_dtype.itemsize + 2 * self.num_groups)
+         denum += max_batch_tokens * (self.num_attention_masks * self._activation_dtype.itemsize)
+         denum += max_batch_tokens * self._activation_dtype.itemsize
+         # Compute cache size and return number of blocks
+         num_pages = floor(num / denum)
+         num_blocks = num_pages // self.block_size
+         if num_blocks > self._upper_bound_num_blocks:
+             logger.info(f"{num_blocks = } is too large, setting to {self._upper_bound_num_blocks = }")
+             num_blocks = self._upper_bound_num_blocks
+         return num_blocks
+
+     def compute_memory_footprint(
+         self,
+         num_blocks: Optional[int] = None,
+         max_batch_tokens: Optional[int] = None,
+         cache_dtype: torch.dtype = torch.float16,
+     ) -> int:
+         """Calculate the memory footprint breakdown for a given number of blocks and maximum batch tokens. The memory
+         footprint is given by:
+
+             available_memory = sum([
+                 MN * num_attention_masks * activation_dtype_size,
+                 2N * (layer_group_size * page_size * cache_dtype + 2 * num_group),
+                 M * (peak_activation_per_token * activation_dtype + 28 + 4 * num_group),
+             ])
+
+         but is broken down below.
+         """
+         num_pages = num_blocks * self.block_size
+
+         cache_memory_footprint = 2 * self.group_size * num_pages * self.page_size * cache_dtype.itemsize
+
+         activation_memory_footprint = self.peak_activation_per_token * self._activation_dtype.itemsize
+         activation_memory_footprint *= max_batch_tokens
+
+         inputs_outputs_positions_and_logits_memory_footprint = 4 * max_batch_tokens * 4  # second 4 is the int32 size
+
+         attention_memory_footprint = self.num_attention_masks * self._activation_dtype.itemsize
+         attention_memory_footprint *= num_pages * max_batch_tokens
+
+         cumulative_seqlens_memory_footprint = 3 * max_batch_tokens * 4  # 4 is the int32 size
+
+         write_index_memory_footprint = self.num_groups * max_batch_tokens * 4  # 4 is the int32 size
+         read_index_memory_footprint = self.num_groups * (num_pages + max_batch_tokens) * 4  # 4 is the int32 size
+
+         total_memory_footprint = sum(
+             [
+                 cache_memory_footprint,
+                 activation_memory_footprint,
+                 inputs_outputs_positions_and_logits_memory_footprint,
+                 attention_memory_footprint,
+                 cumulative_seqlens_memory_footprint,
+                 write_index_memory_footprint,
+                 read_index_memory_footprint,
+             ]
+         )
+         return total_memory_footprint
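
As a rough illustration of the auto-sizing mode, the sketch below redoes the quadratic solve from `compute_num_blocks_and_max_batch_tokens` with invented dimensions; the constants 28 and 4 stand for the int32 bookkeeping tensors listed in the class docstring, and none of the numbers come from a real model:

```python
from math import floor, sqrt

# Assumed toy dimensions, invented for illustration only
available_memory = 8 * 1024**3             # pretend 8 GiB is free for the cache
num_attention_masks = 1                    # full-attention-only model
activation_itemsize = 2                    # bfloat16
cache_itemsize = 2                         # float16 cache
group_size = 32                            # layers per group
page_size = 8 * 128                        # num_kv_heads * head_dim
num_groups = 1
peak_activation_per_token = 4096 + 32000   # hidden_size + vocab_size
m = 0.01                                   # one batch fills at most 1% of the cache
block_size = 32

# available_memory = a*N^2 + b*N, i.e. a*N^2 + b*N + c = 0 with c = -available_memory
a = m * num_attention_masks * activation_itemsize
b = 2 * (group_size * page_size * cache_itemsize + 2 * num_groups)
b += m * (peak_activation_per_token * activation_itemsize + 28 + 4 * num_groups)
c = -available_memory

# Greatest root of the quadratic gives the number of pages N
num_pages = floor((-b + sqrt(b**2 - 4 * a * c)) / (2 * a))
num_blocks = num_pages // block_size
max_batch_tokens = int(num_pages * m)      # M = m * N
print(num_blocks, max_batch_tokens)        # both are capped by upper bounds in the real handler
```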
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/cache_manager.py ADDED
@@ -0,0 +1,226 @@
+ # coding=utf-8
+ # Copyright 2025 The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from abc import ABC, abstractmethod
+ from collections import deque
+ from math import ceil
+ from typing import Optional
+
+ from .requests import logger
+
+
+ class CacheAllocator(ABC):
+     """Abstract base class for cache managers. Cache managers keep track of per-request cache allocations, determine
+     when a new physical block needs to be allocated and compute physical indices for reading or writing to the cache."""
+
+     _index: int
+     _block_table: dict[str, list[int]]  # request_id -> list of block_ids allocated to the request
+
+     @abstractmethod
+     def allocate_blocks(self, n_blocks: int, request_id: str, free_blocks: deque[int]) -> Optional[int]:
+         """Allocates n_blocks for a given request_id. Returns the number of blocks allocated if successful and None
+         otherwise."""
+         pass
+
+     def free_blocks(self, request_id: str, free_blocks: deque[int]) -> None:
+         """Frees all blocks associated with a request_id."""
+         if request_id in self._block_table:
+             blocks_to_free = self._block_table.pop(request_id)
+             free_blocks.extend(blocks_to_free)
+         else:
+             logger.warning(
+                 f"CacheAllocator {self._index} attempted to free blocks for non-existent request_id: {request_id}"
+             )
+
+     @abstractmethod
+     def get_read_indices(self, request_id: str, past_length: int, query_length: int) -> list[int]:
+         """Returns the physical indices of where to read request_id's cache in the cache tensor."""
+         pass
+
+     @abstractmethod
+     def get_write_indices(self, request_id: str, past_length: int, query_length: int) -> list[int]:
+         """Returns the physical indices of where to write request_id's cache in the cache tensor."""
+         pass
+
+     @abstractmethod
+     def get_seqlens_k(self, request_id: str, past_length: int, query_length: int) -> tuple[str, int]:
+         """Returns the attention type of the cache allocator and the key sequence length for the given request_id."""
+         pass
+
+
+ class FullAttentionCacheAllocator(CacheAllocator):
+     """Cache manager for a group of full attention layers."""
+
+     def __init__(self, index: int, block_size: int) -> None:
+         """Initializes the cache manager for a group of full attention layers.
+         Args:
+             - index: the index of the associated layer group
+             - block_size: the size of the blocks in the cache
+         """
+         self._index = index
+         self.block_size = block_size
+         self._block_table = {}
+
+     def allocate_blocks(self, n_blocks: int, request_id: str, free_blocks: deque[int]) -> Optional[int]:
+         """Allocates blocks for a given request_id. Returns the number of blocks allocated if successful and None
+         otherwise. For a group of full attention layers, we always allocate the number of requested blocks."""
+         if len(free_blocks) < n_blocks:
+             return None
+         if request_id not in self._block_table:
+             self._block_table[request_id] = []
+         self._block_table[request_id].extend(free_blocks.popleft() for _ in range(n_blocks))
+         return n_blocks
+
+     def get_read_indices(self, request_id: str, past_length: int, query_length: int) -> list[int]:
+         """Returns the physical indices of where to read request_id's cache. For a group of full attention layers, we
+         first write the new cache to the cache tensor and then read the entire cache from beginning to end."""
+         # Retrieve the block table for the request and raise an error if it doesn't exist
+         block_table = self._block_table.get(request_id)
+         if block_table is None:
+             raise ValueError(f"No block table found for request {request_id}")
+         # Compute the physical indices
+         physical_indices = []
+         for i in range(past_length + query_length):
+             block_idx = i // self.block_size
+             block_offset = i % self.block_size
+             physical_index = block_table[block_idx] * self.block_size + block_offset
+             physical_indices.append(physical_index)
+         return physical_indices
+
+     def get_write_indices(self, request_id: str, past_length: int, query_length: int) -> list[int]:
+         """Returns the physical indices for writing to the cache. For a group of full attention layers, we write the
+         new cache as a continuation of the existing cache for the same request."""
+         block_table = self._block_table.get(request_id)
+         if block_table is None:
+             raise ValueError(f"No block table found for request {request_id}")
+         # Compute the physical indices
+         physical_indices = []
+         for i in range(past_length, past_length + query_length):
+             block_idx = i // self.block_size
+             block_offset = i % self.block_size
+             physical_index = block_table[block_idx] * self.block_size + block_offset
+             physical_indices.append(physical_index)
+         return physical_indices
+
+     def get_seqlens_k(self, request_id: str, past_length: int, query_length: int) -> tuple[str, int]:
+         """Returns the attention type of the cache allocator and the key sequence length for the given request_id."""
+         seqlens_k = past_length + query_length
+         return "full_attention", seqlens_k
+
+
+ class SlidingAttentionCacheAllocator(CacheAllocator):
+     """Cache manager for a group of sliding window attention layers."""
+
+     def __init__(self, index: int, block_size: int, sliding_window: int) -> None:
+         """Initializes the cache manager for a group of sliding window attention layers.
+         Args:
+             - index: the index of the associated layer group
+             - block_size: the size of the blocks in the cache
+             - sliding_window: the size of the sliding window
+         """
+         self._index = index
+         self.block_size = block_size
+         self.sliding_window = sliding_window
+         self._max_blocks_per_request = ceil(self.sliding_window / self.block_size)
+         self._block_table = {}
+
+     def allocate_blocks(self, n_blocks: int, request_id: str, free_blocks: deque[int]) -> Optional[int]:
+         """Allocates blocks for a given request_id. Returns the number of blocks allocated if successful and None
+         otherwise. For a group of sliding window attention layers, we only allocate up to the point where we can fit
+         an entire sliding window in the cache tensor."""
+         if request_id not in self._block_table:
+             self._block_table[request_id] = []
+         # Early return if we are already at the max number of blocks per request
+         already_allocated = len(self._block_table[request_id])
+         if already_allocated == self._max_blocks_per_request:
+             return 0
+         # Compute the actual number of blocks to allocate
+         after_allocation = min(already_allocated + n_blocks, self._max_blocks_per_request)
+         actual_n_blocks = after_allocation - already_allocated
+         # Classic allocation
+         if len(free_blocks) < actual_n_blocks:
+             return None
+         self._block_table[request_id].extend(free_blocks.popleft() for _ in range(actual_n_blocks))
+         return actual_n_blocks
+
+     def get_read_indices(self, request_id: str, past_length: int, query_length: int) -> list[int]:
+         """Returns the physical indices of where to read request_id's cache in the cache tensor.
+         For a group of sliding window attention layers, we read from the cache tensor before writing to it, because
+         the new cache can overwrite the old one. To form the cache + new key/value states, we read at most
+         sliding_window - 1 cache pages and then manually add the new key/value states after them. Hence the -1
+         indices, which indicate where to place the new key or value states."""
+         # Retrieve the block table for the request and raise an error if it doesn't exist
+         block_table = self._block_table.get(request_id)
+         if block_table is None:
+             raise ValueError(f"No block table found for request {request_id}")
+         # Apply sliding window
+         start_index = 0 if past_length < self.sliding_window else past_length % self.sliding_window
+         cache_length = min(past_length, self.sliding_window - 1)
+         # Compute the physical indices
+         physical_indices = []
+         for i in range(start_index, start_index + cache_length):
+             i %= self.sliding_window
+             block_idx = i // self.block_size
+             block_offset = i % self.block_size
+             physical_index = block_table[block_idx] * self.block_size + block_offset
+             physical_indices.append(physical_index)
+         return physical_indices + [-1] * query_length
+
+     def get_write_indices(self, request_id: str, past_length: int, query_length: int) -> list[int]:
+         """Returns the physical indices of where to write request_id's cache in the cache tensor. For a group of
+         sliding window attention layers, we write the new cache in a rolling-buffer fashion: if we reach the end of
+         the allocated physical cache, we start writing from the beginning of the physical cache again."""
+         # Retrieve the block table for the request and raise an error if it doesn't exist
+         block_table = self._block_table.get(request_id)
+         if block_table is None:
+             raise ValueError(f"No block table found for request {request_id}")
+         # Apply sliding window
+         start_index = past_length % self.sliding_window
+         cache_length = min(query_length, self.sliding_window)
+         padding_length = query_length - cache_length
+         # Compute the physical indices
+         physical_indices = []
+         for i in range(start_index, start_index + cache_length):
+             i %= self.sliding_window
+             block_idx = i // self.block_size
+             block_offset = i % self.block_size
+             physical_index = block_table[block_idx] * self.block_size + block_offset
+             physical_indices.append(physical_index)
+         if padding_length > 0:
+             physical_indices = [-1] * padding_length + physical_indices
+         return physical_indices
+
+     def get_seqlens_k(self, request_id: str, past_length: int, query_length: int) -> tuple[str, int]:
+         """Returns the attention type of the cache allocator and the key sequence length for the given request_id."""
+         seqlens_k = query_length + min(past_length, self.sliding_window - 1)
+         return "sliding_attention", seqlens_k
+
+
+ # TODO: test the impact of this
+ # def get_read_indices(self, request_id: str, past_length: int) -> list[int]:
+ #     # Retrieve the block table for the request and raise an error if it doesn't exist
+ #     block_table = self._block_table.get(request_id)
+ #     if block_table is None:
+ #         raise ValueError(f"No block table found for request {request_id}")
+ #     # Compute the physical indices
+ #     physical_indices = []
+ #     n_left = past_length
+ #     for block_idx in block_table:
+ #         block_physical_index = block_idx * self.block_size
+ #         pages_used = min(self.block_size, n_left)
+ #         physical_indices.extend(block_physical_index + i for i in range(pages_used))
+ #         n_left -= pages_used
+ #         if n_left == 0:
+ #             return physical_indices
+ #     raise ValueError(f"Request {request_id} required too many indices: {past_length = } and {len(block_table) = }")
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/continuous_api.py ADDED
@@ -0,0 +1,1047 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team.
3
+ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ import queue
17
+ import threading
18
+ from dataclasses import dataclass
19
+ from functools import partial
20
+ from itertools import count
21
+ from time import perf_counter
22
+ from typing import Optional, Union
23
+
24
+ import torch
25
+ from torch import nn
26
+ from tqdm import tqdm
27
+
28
+ from ...configuration_utils import PretrainedConfig
29
+ from ...generation.configuration_utils import GenerationConfig
30
+ from ...utils.logging import logging
31
+ from ...utils.metrics import ContinuousBatchProcessorMetrics, attach_tracer, traced
32
+ from .cache import PagedAttentionCache
33
+ from .requests import GenerationOutput, RequestState, RequestStatus, get_device_and_memory_breakdown, logger
34
+ from .scheduler import SCHEDULER_MAPPING, FIFOScheduler, Scheduler
35
+
36
+
37
+ def build_attention_mask(
38
+ attention_mask: torch.Tensor,
39
+ cumulative_seqlens_q: torch.Tensor,
40
+ cumulative_seqlens_k: torch.Tensor,
41
+ sliding_window: int = 1,
42
+ ) -> None:
43
+ """Builds an attention mask inplace using the cumulative seqlens of the query and key. If given a sliding window, it
44
+ will also apply a sliding window mask on top. The attention mask is not boolean, it uses zeroes and -inf (or its
45
+ equivalent) so it's more of an attention score bias tensor.
46
+ The attention mask is a block-diagonal matrix, with each block an attention mask for a single query-key pair.
47
+ Each of those block is built from a causal mask and, if there is a sliding window, a sliding window mask.
48
+
49
+ An example is represented below, with seqlen_k = 8, seqlen_q = 4 and sliding_window = 6:
50
+
51
+ CAUSAL MASK:
52
+
53
+ █ █ █ █ █ ░ ░ ░
54
+ █ █ █ █ █ █ ░ ░
55
+ █ █ █ █ █ █ █ ░
56
+ █ █ █ █ █ █ █ █
57
+
58
+ SLIDING WINDOW MASK:
59
+ ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 8 - 4 - 6 = -2 offset to the right
60
+ <─┴─>
61
+ ░ █ | █ █ █ █ █ █ █ █
62
+ ░ ░ | █ █ █ █ █ █ █ █
63
+ ░ ░ | ░ █ █ █ █ █ █ █
64
+ ░ ░ | ░ ░ █ █ █ █ █ █
65
+
66
+ ATTENTION MASK (sum of causal and sliding window masks):
67
+
68
+ █ █ █ █ █ ░ ░ ░
69
+ █ █ █ █ █ █ ░ ░
70
+ ░ █ █ █ █ █ █ ░
71
+ ░ ░ █ █ █ █ █ █
72
+
73
+ Another example with seqlen_k = 5, seqlen_q = 3 and sliding_window = 2:
74
+
75
+ CAUSAL MASK:
76
+
77
+ █ █ █ ░ ░
78
+ █ █ █ █ ░
79
+ █ █ █ █ █
80
+
81
+ SLIDING WINDOW MASK:
82
+ ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 5 - 3 - 2 = 0 offset to the right
83
+ <┴>
84
+ | ░ █ █ █ █
85
+ | ░ ░ █ █ █
86
+ | ░ ░ ░ █ █
87
+
88
+ ATTENTION MASK (sum of causal and sliding window masks):
89
+
90
+ ░ █ █ ░ ░
91
+ ░ ░ █ █ ░
92
+ ░ ░ ░ █ █
93
+
94
+ """
95
+ min_value = torch.finfo(attention_mask.dtype).min
96
+ for i in range(len(cumulative_seqlens_q) - 1):
97
+ seqlen_q = cumulative_seqlens_q[i + 1] - cumulative_seqlens_q[i]
98
+ seqlen_k = cumulative_seqlens_k[i + 1] - cumulative_seqlens_k[i]
99
+ if seqlen_q < seqlen_k and seqlen_q >= 1:
100
+ causal_diagonal = seqlen_k - seqlen_q + 1
101
+ else:
102
+ causal_diagonal = 1
103
+ query_range = slice(cumulative_seqlens_q[i], cumulative_seqlens_q[i + 1])
104
+ key_range = slice(cumulative_seqlens_k[i], cumulative_seqlens_k[i + 1])
105
+ # Apply causal mask
106
+ minus_inf = torch.full(
107
+ attention_mask[..., query_range, key_range].shape,
108
+ min_value,
109
+ dtype=attention_mask.dtype,
110
+ device=attention_mask.device,
111
+ )
112
+ masked = torch.triu(minus_inf, diagonal=causal_diagonal)
113
+ # Apply sliding window mask if needed
114
+ if sliding_window > 1:
115
+ sliding_diagonal = seqlen_k - seqlen_q - sliding_window
116
+ masked += torch.tril(minus_inf, diagonal=sliding_diagonal)
117
+ # Replace in attention mask
118
+ attention_mask[..., query_range, key_range] = masked
119
+
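+ # Illustrative usage sketch (added for clarity; not part of the original file).
+ # It reproduces the first docstring example above (seqlen_k=8, seqlen_q=4,
+ # sliding_window=6) for a single packed sequence; note that the call sites in
+ # this file pass plain Python lists for the cumulative seqlens:
+ #
+ #     mask = torch.zeros(1, 1, 4, 8, dtype=torch.float32)
+ #     build_attention_mask(
+ #         mask,
+ #         cumulative_seqlens_q=[0, 4],
+ #         cumulative_seqlens_k=[0, 8],
+ #         sliding_window=6,
+ #     )
+ #     # mask[0, 0] is now 0.0 where attention is allowed and the dtype minimum
+ #     # elsewhere, matching the "ATTENTION MASK" diagram in the docstring.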
120
+
121
+ @dataclass
122
+ class PagedAttentionArgs:
123
+ input_ids: torch.Tensor
124
+ attention_mask: Optional[torch.Tensor]
125
+ position_ids: torch.Tensor
126
+ cumulative_seqlens_q: torch.Tensor
127
+ cumulative_seqlens_k: torch.Tensor
128
+ max_seqlen_q: int
129
+ max_seqlen_k: int
130
+ write_index: list[torch.Tensor]
131
+ read_index: list[torch.Tensor]
132
+ logits_indices: torch.Tensor
133
+ cache: PagedAttentionCache
134
+ use_cache: bool = False
135
+
136
+
137
+ # Continuous Batch Processor (Internal Logic)
138
+ @attach_tracer()
139
+ class ContinuousBatchProcessor:
140
+ def __init__(
141
+ self,
142
+ cache: PagedAttentionCache,
143
+ config: PretrainedConfig,
144
+ generation_config: GenerationConfig,
145
+ input_queue: queue.Queue,
146
+ output_queue: queue.Queue,
147
+ stop_event: threading.Event,
148
+ model_device: torch.device,
149
+ model_dtype: torch.dtype,
150
+ scheduler: Scheduler,
151
+ streaming: bool = False,
152
+ manual_eviction: bool = False,
153
+ slice_inputs: bool = True, # TODO: There should be a heuristic to decide on slicing, compile, CUDA graphs...
154
+ ) -> None:
155
+ """Initialize the continuous batch processor.
156
+
157
+ Args:
158
+ cache: A [`PagedAttentionCache`] object
159
+ config: The model configuration
160
+ generation_config: The generation configuration
161
+ input_queue: Queue for incoming requests
162
+ output_queue: Queue for outgoing results
163
+ stop_event: Event to signal processing should stop
164
+ model_device: Device for model inputs/outputs
165
+ model_dtype: Data type for model inputs/outputs
166
+ scheduler: The [`Scheduler`] to use
167
+ streaming: Whether to stream tokens as they're generated
168
+ manual_eviction: Whether to manually evict blocks from the cache
169
+ slice_inputs: Whether to slice the inputs to the model
170
+ """
171
+ self.cache = cache
172
+ self.config = config
173
+ self.generation_config = generation_config
174
+ self.input_queue = input_queue
175
+ self.output_queue = output_queue
176
+ self.stop_event = stop_event
177
+ self.model_device = model_device
178
+ self.model_dtype = model_dtype
179
+ self.scheduler = scheduler
180
+ self.streaming = streaming
181
+ self.manual_eviction = manual_eviction
182
+ self.slice_inputs = slice_inputs
183
+
184
+ # Retrieve the size of the sliding window if there is one
185
+ self.sliding_window = 1 if getattr(config, "sliding_window", None) is None else config.sliding_window
186
+
187
+ self.requests_in_batch: list[RequestState] = []
188
+
189
+ # Set up metrics collector
190
+ self.max_batch_tokens = cache.max_batch_tokens
191
+ self.metrics = ContinuousBatchProcessorMetrics(cache.max_batch_tokens)
192
+
193
+ # Setup static tensors
194
+ self.total_query_length = 0
195
+ self.total_key_length = 0
196
+ self.total_batch_size = 0
197
+ self.setup_static_tensors(cache.num_groups)
198
+
199
+ @traced(standalone=True)
200
+ def setup_static_tensors(self, num_groups: int) -> None:
201
+ T = self.max_batch_tokens
202
+ num_pages = self.cache.num_blocks * self.cache.block_size
203
+ self.tensor_metadata = {"dtype": torch.int32, "device": self.model_device}
204
+
205
+ # Some tensors always have the same shape regardless of the model
206
+ self.input_ids = torch.empty((1, T), **self.tensor_metadata)
207
+ self.position_ids = torch.empty((1, T), **self.tensor_metadata)
208
+ self.cumulative_seqlens_q = torch.empty((T + 1,), **self.tensor_metadata)
209
+ self.max_seqlen_q = 0
210
+ self.logits_indices = torch.empty((T,), **self.tensor_metadata)
211
+ self.output_ids = torch.empty((1, T), **self.tensor_metadata)
212
+
213
+ # For some kwargs, we have a dict of tensors with as many items as there are attention types
214
+ layer_types = getattr(self.config, "layer_types", None)
215
+ if layer_types is None:
216
+ sliding_window = getattr(self.config, "sliding_window", 1)
217
+ layer_types = ["full_attention"] if sliding_window in [1, None] else ["sliding_attention"]
218
+ layer_types = list(set(layer_types))
219
+
220
+ self.cumulative_seqlens_k = {
221
+ layer_type: torch.empty((T + 1), **self.tensor_metadata) for layer_type in layer_types
222
+ }
223
+ self.max_seqlen_k = dict.fromkeys(layer_types, 0)
224
+
225
+ if self.return_attention_mask():
226
+ attn_mask_kwargs = {
227
+ "size": (1, 1, T, num_pages + T),
228
+ "dtype": self.model_dtype,
229
+ "device": self.model_device,
230
+ }
231
+ self.attention_mask = {layer_type: torch.empty(**attn_mask_kwargs) for layer_type in layer_types}
232
+ else:
233
+ self.attention_mask = None
234
+
235
+ # For other kwargs, we need a list of tensors with as many tensors as there are groups
236
+ self.write_index_storage = [torch.empty((T,), **self.tensor_metadata) for _ in range(num_groups)]
237
+ self.read_index_storage = [torch.empty((num_pages + T), **self.tensor_metadata) for _ in range(num_groups)]
238
+ # For the read index, the +T is because there are -1 entries for seqlen_q when the model uses a sliding window
239
+
240
+ # After allocating empty tensors, we reset them to the right value
241
+ self.reset_static_tensors(full_reset=True)
242
+
243
+ def return_attention_mask(self) -> bool:
244
+ return self.config._attn_implementation != "paged_attention" # we set `is_causal` to True in paged call
245
+
246
+ @traced
247
+ @torch.no_grad()
248
+ def reset_static_tensors(self, full_reset: bool = False):
249
+ """Reset static tensors for the next batch. In between batches, reset only the parts that were used in the last
250
+ batch, but for initialization, everything can be reset using the `full_reset` flag."""
251
+ # Compute the slice to reset
252
+ if full_reset or not self.slice_inputs:
253
+ q_len = self.write_index_storage[0].size(-1)
254
+ k_len = self.read_index_storage[0].size(-1)
255
+ b_size = self.write_index_storage[0].size(0)
256
+ else:
257
+ q_len = self.total_query_length
258
+ k_len = self.total_key_length
259
+ b_size = self.total_batch_size
260
+
261
+ # Reset the attributes that always have the same shape
262
+ self.input_ids[:, :q_len].zero_()
263
+ self.position_ids[:, :q_len].zero_()
264
+ self.cumulative_seqlens_q[: b_size + 1].zero_()
265
+ self.max_seqlen_q = 0
266
+ self.logits_indices[:q_len].fill_(-1)
267
+ self.output_ids[:, :q_len].fill_(-1)
268
+
269
+ # Reset the attributes that are either tensors or dict of tensors
270
+ for layer_type in self.cumulative_seqlens_k:
271
+ self.cumulative_seqlens_k[layer_type][: b_size + 1].zero_()
272
+ self.max_seqlen_k[layer_type] = 0
273
+ if self.attention_mask is not None:
274
+ self.attention_mask[layer_type][:, :, :q_len, :k_len].fill_(torch.finfo(self.model_dtype).min)
275
+
276
+ # Reset the attributes that are lists of tensors
277
+ for i in range(self.cache.num_groups):
278
+ self.write_index_storage[i][:q_len].fill_(-1)
279
+ self.read_index_storage[i][: q_len + k_len].fill_(-1)
280
+
281
+ def get_model_kwargs(self) -> PagedAttentionArgs:
282
+ """Get model keyword arguments for the current batch."""
283
+ # Compute the slice to return
284
+ q_len = self.total_query_length if self.slice_inputs else self.write_index_storage[0].size(-1)
285
+ b_size = self.total_batch_size if self.slice_inputs else self.cumulative_seqlens_q.size(-1) - 1
286
+
287
+ # Prepare the kwargs, the attributes that are either tensors or dict of tensors are initialized to empty dicts
288
+ kwargs = {
289
+ "input_ids": self.input_ids[:, :q_len],
290
+ "position_ids": self.position_ids[:, :q_len],
291
+ "cu_seq_lens_q": self.cumulative_seqlens_q[: b_size + 1],
292
+ "max_seqlen_q": self.max_seqlen_q,
293
+ "logits_indices": self.logits_indices[:q_len],
294
+ "cu_seq_lens_k": {},
295
+ "max_seqlen_k": {},
296
+ "attention_mask": {},
297
+ "read_index": self.read_index, # slicing is done during building
298
+ "write_index": self.write_index, # slicing is done during building
299
+ "cache": self.cache,
300
+ "use_cache": False,
301
+ }
302
+
303
+ # For the attributes that are dict of tensors, we replace the dict with a tensor if there is only one entry
304
+ layer_types = list(self.cumulative_seqlens_k.keys())
305
+ if len(layer_types) > 1:
306
+ for layer_type, seqlens_k in self.cumulative_seqlens_k.items():
307
+ kwargs["cu_seq_lens_k"][layer_type] = seqlens_k[: b_size + 1]
308
+ kwargs["max_seqlen_k"][layer_type] = self.max_seqlen_k[layer_type]
309
+ if self.attention_mask is not None:
310
+ k_len = seqlens_k[b_size] if self.slice_inputs else self.attention_mask[layer_type].size(-1)
311
+ kwargs["attention_mask"][layer_type] = self.attention_mask[layer_type][..., :q_len, :k_len]
312
+ else:
313
+ layer_type = layer_types[0]
314
+ kwargs["cu_seq_lens_k"] = self.cumulative_seqlens_k[layer_type][: b_size + 1]
315
+ kwargs["max_seqlen_k"] = self.max_seqlen_k[layer_type]
316
+ if self.attention_mask is not None:
317
+ k_len = self.cumulative_seqlens_k[layer_type][b_size]
318
+ k_len = k_len if self.slice_inputs else self.attention_mask[layer_type].size(-1)
319
+ kwargs["attention_mask"] = self.attention_mask[layer_type][..., :q_len, :k_len]
320
+
321
+ if self.attention_mask is None:
322
+ kwargs["attention_mask"] = None
323
+ return kwargs
324
+
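+ # Illustrative note (added for clarity; not part of the original file): for a
+ # model with a single attention type, e.g. layer_types == ["full_attention"],
+ # the returned kwargs hold plain tensors:
+ #
+ #     kwargs["cu_seq_lens_q"]  # tensor of shape (batch_size + 1,)
+ #     kwargs["cu_seq_lens_k"]  # tensor of shape (batch_size + 1,)
+ #     kwargs["max_seqlen_k"]   # int
+ #
+ # whereas a model mixing full and sliding attention gets dicts keyed by layer
+ # type, e.g. kwargs["cu_seq_lens_k"]["sliding_attention"].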
325
+ def __repr__(self):
326
+ return (
327
+ f"ContinuousBatchProcessor(input_queue={self.input_queue}, output_queue={self.output_queue}, "
328
+ f"active_requests={self.scheduler.active_requests}, waiting_requests={self.scheduler.waiting_requests})"
329
+ + self.get_model_kwargs().__repr__()
330
+ )
331
+
332
+ @traced
333
+ def _get_new_requests(self):
334
+ """Pull new requests from the input queue and add to waiting list."""
335
+ while not self.input_queue.empty():
336
+ try:
337
+ state = self.input_queue.get_nowait()
338
+ if state is None: # Sentinel value
339
+ continue
340
+ self.scheduler.add_waiting_request(state)
341
+
342
+ except queue.Empty:
343
+ break
344
+ except Exception as e:
345
+ logger.error(f"Error processing new request: {e}", exc_info=True)
346
+ state: RequestState = locals().get("state")
347
+ if state is not None:
348
+ self._handle_request_error(e, state)
349
+
350
+ @traced
351
+ def _handle_request_error(self, error, state: RequestState):
352
+ """Handle general request processing error."""
353
+ state.status = RequestStatus.FAILED
354
+ state.error = str(error)
355
+
356
+ # Include any generated tokens if this is an active request
357
+ if isinstance(state.request_id, str):
358
+ state.static_outputs = self.scheduler.get_active_request_static_outputs(state.request_id)
359
+ else:
360
+ state.static_outputs = []
361
+
362
+ self.metrics.record_request_completion(state.created_time, state.request_id)
363
+ self.output_queue.put(state.to_generation_output())
364
+
365
+ @traced
366
+ def prepare_next_batch(self) -> bool:
367
+ """Prepare tensors and metadata for the next model forward pass. Returns True if there are requests to process,
368
+ False otherwise."""
369
+
370
+ # Get new requests from the queue, stop if there are no pending requests
371
+ self._get_new_requests()
372
+ self.scheduler.clear_cancelled_requests()
373
+ if not self.scheduler.has_pending_requests():
374
+ return False
375
+ self.metrics.record_queue_metrics(len(self.scheduler.active_requests), len(self.scheduler.waiting_requests))
376
+
377
+ # Schedule the next batch of requests, stop if there are no requests in the batch
378
+ self.requests_in_batch = self.scheduler.schedule_batch(self.max_batch_tokens)
379
+ if not self.requests_in_batch:
380
+ return False
381
+ self.metrics.record_batch_metrics(self.requests_in_batch)
382
+
383
+ # Reset the static tensors used for storage
384
+ self.reset_static_tensors() # TODO: with slice_inputs, this might be unnecessary
385
+
386
+ # Prepare accumulators
387
+ self.total_query_length = 0
388
+ self.total_key_length = 0
389
+ self.total_batch_size = 0
390
+
391
+ input_ids = []
392
+ position_ids = []
393
+ cumulative_seqlens_q = [0]
394
+ logits_indices = []
395
+
396
+ if isinstance(self.cumulative_seqlens_k, dict):
397
+ cumulative_seqlens_k = {layer_type: [0] for layer_type in self.cumulative_seqlens_k}
398
+ else:
399
+ cumulative_seqlens_k = [0]
400
+
401
+ read_index = [[] for _ in range(self.cache.num_groups)]
402
+ write_index = [[] for _ in range(self.cache.num_groups)]
403
+
404
+ # Go through all the requests in the batch
405
+ for state in self.requests_in_batch:
406
+ # First we retrieve the lengths related to the request
407
+ past_length = state.position_offset
408
+ query_length = len(state.prompt_ids)
409
+ seqlens_k = self.cache.get_seqlens_k(state.request_id, past_length, query_length)
410
+
411
+ # Then we update the total lengths that are used for slicing
412
+ self.total_query_length += query_length
413
+ # total_key_length is used to slice the keys so we need to take the max of all the key lengths
414
+ self.total_key_length += max(seqlens_k.values())
415
+ self.total_batch_size += 1
416
+ # And the attribute tracking the position in the request object
417
+ state.position_offset += query_length
418
+
419
+ # Then we accumulate for the object used in the kwargs
420
+ input_ids.extend(state.prompt_ids)
421
+ position_ids.extend(range(past_length, past_length + query_length))
422
+ cumulative_seqlens_q.append(cumulative_seqlens_q[-1] + query_length)
423
+ self.max_seqlen_q = max(self.max_seqlen_q, query_length)
424
+
425
+ if not state.remaining_prompt_ids:
426
+ logits_indices.append(cumulative_seqlens_q[-1] - 1)
427
+
428
+ for layer_type, layer_type_seqlen_k in seqlens_k.items():
429
+ cumulative_seqlens_k[layer_type].append(cumulative_seqlens_k[layer_type][-1] + layer_type_seqlen_k)
430
+ self.max_seqlen_k[layer_type] = max(self.max_seqlen_k[layer_type], layer_type_seqlen_k)
431
+
432
+ self.cache.extend_read_indices(state.request_id, past_length, query_length, read_index)
433
+ self.cache.extend_write_indices(state.request_id, past_length, query_length, write_index)
434
+
435
+ # Once the loop over requests is done, we can build the actual tensors
436
+ self._build_tensors(
437
+ input_ids,
438
+ position_ids,
439
+ read_index,
440
+ write_index,
441
+ cumulative_seqlens_q,
442
+ cumulative_seqlens_k,
443
+ logits_indices,
444
+ )
445
+ self.metrics.record_kv_cache_memory_metrics(self.cache)
446
+
447
+ if logger.isEnabledFor(logging.DEBUG):
448
+ if isinstance(self.cumulative_seqlens_k, dict):
449
+ ck = max(cumulative_seqlens_k[layer_type][-1] for layer_type in self.cumulative_seqlens_k)
450
+ else:
451
+ ck = cumulative_seqlens_k[-1]
452
+ logger.debug(
453
+ f"Scheduled: {len(self.requests_in_batch)}, Waiting: {len(self.scheduler.waiting_requests)}, "
454
+ f"Active: {len(self.scheduler.active_requests)}. cum Q: {cumulative_seqlens_q[-1]}. "
455
+ f"cum KV: {ck}, free blocks: {self.cache.get_num_free_blocks()}"
456
+ )
457
+ return True
458
+
459
+ @traced
460
+ def _build_tensors(
461
+ self,
462
+ input_ids: list[int],
463
+ position_ids: list[int],
464
+ read_index: list[list[int]],
465
+ write_index: list[list[int]],
466
+ cumulative_seqlens_q: list[int],
467
+ cumulative_seqlens_k: Union[list[int], dict[str, list[int]]],
468
+ logits_indices: list[int],
469
+ ) -> None:
470
+ """Builds the actual tensors for the current batch, by modifying the already allocated tensors in place."""
471
+ to_tensor = partial(torch.tensor, **self.tensor_metadata)
472
+
473
+ # Those kwargs always have the same type regardless of the model
474
+ self.input_ids[:, : len(input_ids)] = to_tensor(input_ids)
475
+ self.position_ids[:, : len(position_ids)] = to_tensor(position_ids)
476
+ self.cumulative_seqlens_q[: len(cumulative_seqlens_q)] = to_tensor(cumulative_seqlens_q)
477
+ self.logits_indices[: len(logits_indices)] = to_tensor(logits_indices)
478
+
479
+ # Those kwargs are either dict of tensors or tensors, so we need to handle both cases
480
+ for layer_type, layer_type_seqlens_k in cumulative_seqlens_k.items():
481
+ self.cumulative_seqlens_k[layer_type][: len(layer_type_seqlens_k)] = to_tensor(layer_type_seqlens_k)
482
+ if self.attention_mask is not None:
483
+ build_attention_mask(
484
+ attention_mask=self.attention_mask[layer_type],
485
+ cumulative_seqlens_q=cumulative_seqlens_q,
486
+ cumulative_seqlens_k=layer_type_seqlens_k,
487
+ sliding_window=self.sliding_window if layer_type == "sliding_attention" else 1,
488
+ )
489
+
490
+ # The indices only contain references to the storage tensors, so we update the storage and their references
491
+ self.read_index = []
492
+ self.write_index = []
493
+ for i, group_read_indices, group_write_indices in zip(count(), read_index, write_index):
494
+ # Write in the actual tensors
495
+ self.read_index_storage[i][: len(group_read_indices)] = to_tensor(group_read_indices)
496
+ self.write_index_storage[i][: len(group_write_indices)] = to_tensor(group_write_indices)
497
+ # Slice to the right size
498
+ r = len(group_read_indices) if self.slice_inputs else self.read_index_storage[i].size(-1)
499
+ w = len(group_write_indices) if self.slice_inputs else self.write_index_storage[i].size(-1)
500
+ # Add to the index
501
+ self.read_index.append(self.read_index_storage[i][:r])
502
+ self.write_index.append(self.write_index_storage[i][:w])
503
+
504
+ @traced
505
+ def _sync(self):
506
+ if self.output_ids is not None:
507
+ try:
508
+ out = self.output_ids.tolist()[0] # should be the only sync we do
509
+ except Exception:
510
+ out = [0, 1]
511
+ else:
512
+ out = [0, 0]
513
+ return out
514
+
515
+ @traced
516
+ def _maybe_send_output(self, state: RequestState, token: int):
517
+ """Send output to the queue based on streaming mode and request state."""
518
+ if self.streaming:
519
+ self.output_queue.put(state.to_generation_output())
520
+ elif state.status == RequestStatus.FINISHED:
521
+ self.output_queue.put(state.to_generation_output())
522
+
523
+ @traced
524
+ def update_batch(self):
525
+ """Update request states based on generated tokens."""
526
+ out_tokens = self._sync()
527
+ finished_request_ids = []
528
+ for i, state in enumerate(self.requests_in_batch):
529
+ req_id = state.request_id
530
+ if len(state.remaining_prompt_ids) == 0:
531
+ self.metrics.record_ttft_metric(state.created_time, state.request_id)
532
+ state.status = RequestStatus.DECODING
533
+ token = out_tokens[self.logits_indices[i]]
534
+ state.prompt_ids = [token]
535
+ if state.update_with_token(token):
536
+ self.metrics.record_request_completion(state.created_time, state.request_id)
537
+ self.scheduler.finish_request(state.request_id, evict_from_cache=(not self.manual_eviction))
538
+ finished_request_ids.append(req_id)
539
+ self._maybe_send_output(state, token)
540
+ elif state.status == RequestStatus.PREFILLING_SPLIT:
541
+ state.status = RequestStatus.SPLIT_PENDING_REMAINDER
542
+ if self.cache.get_num_free_blocks() == 0:
543
+ raise ValueError("No more free blocks")
544
+
545
+ @traced
546
+ def has_pending_requests(self) -> bool:
547
+ """Check if there are any active or waiting requests."""
548
+ return self.scheduler.has_pending_requests()
549
+
550
+ @traced
551
+ def handle_batch_error(self, error):
552
+ """Handle errors during batch processing."""
553
+ failed_reqs = self.requests_in_batch
554
+ for req in failed_reqs:
555
+ self._handle_request_error(error, req)
556
+ self.scheduler.finish_request(req.request_id)
557
+
558
+ @traced
559
+ def fail_all_requests(self, error):
560
+ """Fail all active requests with the given error.
561
+
562
+ Args:
563
+ error: The error to report in the failure message
564
+ """
565
+
566
+ requests = list(self.scheduler.active_requests.values())
567
+ for state in requests:
568
+ self._handle_request_error(error, state)
569
+ self.scheduler.finish_request(state.request_id)
570
+
571
+ # Also fail any requests in the waiting queue
572
+ for req_id in list(self.scheduler.waiting_requests.keys()):
573
+ state = self.scheduler.waiting_requests.pop(req_id)
574
+ self._handle_request_error(error, state)
575
+
576
+ # Clear the ordering queue
577
+ self.scheduler.waiting_requests_order.clear()
578
+
579
+
580
+ # Manager Class (User Interface)
581
+ @attach_tracer()
582
+ class ContinuousBatchingManager:
583
+ """Manager for handling continuous batching of generation requests.
584
+
585
+ This class provides the user interface for submitting generation requests,
586
+ retrieving results, and managing the background generation thread.
587
+ """
588
+
589
+ def __init__(
590
+ self,
591
+ model,
592
+ generation_config: GenerationConfig,
593
+ manual_eviction: bool = False,
594
+ max_queue_size=0,
595
+ streaming: bool = True,
596
+ slice_inputs: bool = True,
597
+ ):
598
+ """Initialize the continuous batching manager.
599
+
600
+ Args:
601
+ model: The language model for generation
602
+ generation_config: Configuration for generation parameters
603
+ max_queue_size: Maximum size of the request queue (0 = unlimited)
604
+ streaming: Whether to stream tokens as they are generated
+ manual_eviction: Whether cache blocks for finished requests must be evicted manually
+ slice_inputs: Whether to slice the static input tensors to the size of the current batch
605
+ """
606
+ self.model = model.eval()
607
+ generation_config = model.generation_config if generation_config is None else generation_config
608
+ self.generation_config = generation_config
609
+ self.input_queue = queue.Queue(maxsize=max_queue_size)
610
+ self.output_queue = queue.Queue()
611
+ self.stop_event = threading.Event()
612
+ self.streaming = streaming
613
+ self.log_prob_generation = getattr(generation_config, "log_prob_generation", False)
614
+ self._generation_thread = None
615
+ self._request_counter = 0
616
+ self._request_lock = threading.Lock()
617
+ self.model.generation_config.top_p = None
618
+ self.do_sample = getattr(generation_config, "do_sample", True)
619
+ self.logit_processor = self.model._get_logits_processor(generation_config)
620
+ self.use_cuda_graph = getattr(generation_config, "use_cuda_graph", False) # TODO: same as do_sample
621
+ self.profile = getattr(generation_config, "profile", False)
622
+ self.manual_eviction = manual_eviction
623
+ self.batch_processor: Optional[ContinuousBatchProcessor] = None
624
+ self.slice_inputs = slice_inputs
625
+
626
+ if self.use_cuda_graph:
627
+ raise NotImplementedError("Cuda graphs are not supported yet")
628
+
629
+ @traced
630
+ def start(self):
631
+ """Start the background generation thread."""
632
+ if self._generation_thread is not None and self._generation_thread.is_alive():
633
+ logger.warning("Manager thread is already running.")
634
+ return
635
+
636
+ self._result_queue = queue.Queue()
637
+ self._generation_thread = threading.Thread(target=self._run_generation_loop)
638
+ self._generation_thread.start()
639
+
640
+ def is_running(self):
641
+ """Check if the background generation thread is running."""
642
+ return self._generation_thread is not None and self._generation_thread.is_alive()
643
+
644
+ def stop(self, block: bool = False, timeout: Optional[float] = None):
645
+ """Signal the background thread to stop.
646
+
647
+ Args:
648
+ block: Whether to wait for the thread to stop
649
+ timeout: Maximum time to wait for the thread to stop
650
+ """
651
+ if self._generation_thread is None:
652
+ logger.warning("Manager not started.")
653
+ return
654
+
655
+ if not self.stop_event.is_set():
656
+ self.stop_event.set()
657
+ logger.info("Stopping continuous batching manager...")
658
+
659
+ if block:
660
+ self.join(timeout)
661
+
662
+ def join(self, timeout: Optional[float] = None):
663
+ """Wait for the background thread to finish.
664
+
665
+ Args:
666
+ timeout: Maximum time to wait for the thread to stop
667
+ """
668
+ if self._generation_thread is not None:
669
+ self._generation_thread.join(timeout=timeout)
670
+ if self._generation_thread.is_alive():
671
+ logger.warning("Generation thread did not exit after join timeout.")
672
+ else:
673
+ logger.info("Continuous Batching Manager stopped.")
674
+ self._generation_thread = None
675
+
676
+ def add_request(
677
+ self, input_ids: list[int], request_id: Optional[str] = None, max_new_tokens: Optional[int] = None
678
+ ) -> str:
679
+ """Add a new generation request to the queue.
680
+
681
+ Args:
682
+ input_ids: Input token IDs to use as prompt
683
+ request_id: Optional custom request ID (auto-generated if None)
684
+ max_new_tokens: Optional maximum number of new tokens to generate (defaults to the generation config value)
685
+
686
+ Returns:
687
+ str: The request ID
688
+ """
689
+ if request_id is None:
690
+ with self._request_lock:
691
+ request_id = f"req_{self._request_counter}"
692
+ self._request_counter += 1
693
+
694
+ max_new_tokens = self.generation_config.max_new_tokens if max_new_tokens is None else max_new_tokens
695
+
696
+ # NOTE: do we want to handle a case when the user wants token ids returned instead of decoded text?
697
+ state = RequestState(
698
+ request_id=request_id,
699
+ prompt_ids=list(input_ids),
700
+ full_prompt_ids=list(input_ids),
701
+ max_new_tokens=max_new_tokens,
702
+ eos_token_id=self.generation_config.eos_token_id,
703
+ )
704
+
705
+ # Use block=True with timeout to handle backpressure if queue is full
706
+ self.input_queue.put(state, block=True, timeout=10) # XXX: pass timeout as fn arg?
707
+ logger.debug(f"Added request {request_id} to queue.")
708
+ return request_id
709
+
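+ # Illustrative usage sketch (added for clarity; not part of the original file,
+ # and `model`/`tokenizer` are assumed to be a loaded causal LM and its tokenizer):
+ #
+ #     manager = model.init_continuous_batching(streaming=True)
+ #     manager.start()
+ #     req_id = manager.add_request(tokenizer("Hello")["input_ids"], max_new_tokens=32)
+ #     for output in manager.request_id_iter(req_id):
+ #         if output.status == RequestStatus.FINISHED:
+ #             print(tokenizer.decode(output.generated_tokens))
+ #             break
+ #     manager.stop(block=True)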
710
+ def add_requests(self, inputs: list[list[int]], **kwargs):
711
+ for input_ids in inputs:
712
+ self.add_request(input_ids, **kwargs)
713
+
714
+ def cancel_request(self, request_id: str):
715
+ """Cancel a request by its ID.
716
+
717
+ Args:
718
+ request_id: The ID of the request to cancel
719
+ """
720
+ if self.batch_processor is not None:
721
+ self.batch_processor.scheduler.set_request_cancellation(request_id)
722
+
723
+ def get_result(self, request_id=None, timeout=None) -> Optional[GenerationOutput]:
724
+ """Retrieve one result from the output queue.
725
+
726
+ Args:
727
+ request_id: If given, only a result for this request ID is returned; other results are put back in the queue
+ timeout: Maximum time to wait for a result
728
+
729
+ Returns:
730
+ Optional[GenerationOutput]: The result data or None if timeout
731
+ """
732
+ if self._generation_thread is None and self.output_queue.empty():
733
+ return None
734
+ try:
735
+ result = self.output_queue.get(block=True, timeout=timeout)
736
+ if request_id is not None and result.request_id != request_id:
737
+ self.output_queue.put(result)
738
+ return None
739
+ logger.debug(f"Retrieved result for request {result.request_id}")
740
+ return result
741
+ except queue.Empty:
742
+ return None
743
+
744
+ def __iter__(self):
745
+ """Iterate over results as they become available."""
746
+ while self._generation_thread is not None and self._generation_thread.is_alive():
747
+ result = self.get_result(timeout=0.1)
748
+ if result is not None:
749
+ yield result
750
+
751
+ def request_id_iter(self, request_id):
752
+ """Iterate over results matching a specific request id as they become available."""
753
+ request_cancelled = False
754
+ while self._generation_thread is not None and self._generation_thread.is_alive() and not request_cancelled:
755
+ result = self.get_result(request_id=request_id, timeout=0.1)
756
+ if result is not None:
757
+ yield result
758
+ if self.batch_processor is not None:
759
+ request_cancelled = self.batch_processor.scheduler.request_is_cancelled(request_id)
760
+
761
+ @staticmethod
762
+ def supported_attention_implementations() -> set[str]:
763
+ return {"eager_paged", "sdpa_paged", "flash_attention_2"}
764
+
765
+ @staticmethod
766
+ def default_attention_implementation() -> str:
767
+ return "sdpa_paged"
768
+
769
+ @traced
770
+ def warmup(self, batch_processor):
771
+ stream = torch.cuda.Stream(device=self.model.device)
772
+ stream.wait_stream(torch.cuda.current_stream())
773
+ with torch.cuda.stream(stream):
774
+ # Warmup the model with a dummy forward pass
775
+ self._generation_step(batch_processor)
776
+ torch.cuda.current_stream().wait_stream(stream)
777
+
778
+ self.graph = torch.cuda.CUDAGraph()
779
+ with torch.cuda.graph(self.graph, stream=stream):
780
+ self._generation_step(batch_processor)
781
+
782
+ @traced
783
+ # @torch.compile
784
+ def _generation_step(self, batch_processor: ContinuousBatchProcessor):
785
+ """Perform a single generation step. This is cuda graphed"""
786
+ batch_data = batch_processor.get_model_kwargs()
787
+ with torch.no_grad():
788
+ logits = self._model_forward(batch_data)
789
+ if self.log_prob_generation:
790
+ batch_processor.output_probs.copy_(logits) # TODO
791
+ probs = self._process_logit(batch_data, logits)
792
+ self._sample(batch_processor, probs)
793
+
794
+ @traced(span_name="model_forward")
795
+ def _model_forward(self, batch_data):
796
+ return self.model(**batch_data).logits
797
+
798
+ @traced(span_name="logit_processing")
799
+ def _process_logit(self, batch_data, logits):
800
+ # Pass continuous batching context to logits processor if it supports it. TODO we should find a way to make this a little bit cleaner!
801
+ if hasattr(self.logit_processor, "set_continuous_batching_context"):
802
+ self.logit_processor.set_continuous_batching_context(
803
+ batch_data["logits_indices"], batch_data["cu_seq_lens_q"]
804
+ )
805
+
806
+ # Handle shape compatibility: logit processors expect 2D tensors [batch_size, vocab_size]
807
+ # but continuous batching always produces 3D tensors [batch_size, seq_len, vocab_size]
808
+ batch_size, seq_len, vocab_size = logits.shape
809
+ logits_2d = logits.view(batch_size * seq_len, vocab_size)
810
+ input_ids_2d = batch_data["input_ids"].view(batch_size * seq_len)
811
+
812
+ # Process with 2D tensors
813
+ processed_logits_2d = self.logit_processor(input_ids_2d, logits_2d)
814
+
815
+ # Reshape back to 3D
816
+ return processed_logits_2d.view(batch_size, seq_len, vocab_size)
817
+
818
+ @traced(span_name="sampling")
819
+ def _sample(self, batch_processor: ContinuousBatchProcessor, probs):
820
+ if self.do_sample: # sample
821
+ probs = nn.functional.softmax(probs, dim=-1)
822
+ # probs[0] has shape [seq_len, vocab_size], multinomial returns [seq_len, 1]
823
+ next_tokens = torch.multinomial(probs[0], num_samples=1).squeeze(-1) # Now [seq_len]
824
+ # Add batch dimension back to match argmax output
825
+ next_tokens = next_tokens.unsqueeze(0) # Now [1, seq_len]
826
+ else:
827
+ next_tokens = torch.argmax(probs, dim=-1) # Already [1, seq_len]
828
+
829
+ tokens = next_tokens.size(1) # Get seq_len dimension
830
+ batch_processor.output_ids[:, :tokens].copy_(next_tokens)
831
+
832
+ def _run_generation_loop(self):
833
+ """Main processing loop running in the background thread."""
834
+ batch_processor = None
835
+ try:
836
+ ref_time = perf_counter()
837
+ paged_attention_cache = PagedAttentionCache(
838
+ self.model.config,
839
+ self.generation_config,
840
+ self.model.device,
841
+ self.model.dtype,
842
+ tp_size=getattr(self.model, "_tp_size", None), # Use model's actual TP setting
843
+ )
844
+ logger.debug(f"PagedAttentionCache created in {perf_counter() - ref_time} seconds")
845
+
846
+ scheduler = None
847
+ if hasattr(self.generation_config, "scheduler"):
848
+ scheduler = SCHEDULER_MAPPING.get(self.generation_config.scheduler, None)
849
+ if scheduler is None:
850
+ logger.warning(f"Scheduler '{scheduler}' not found. Defaulting to FIFO.")
851
+ scheduler = FIFOScheduler
852
+ else:
853
+ # Default to fifo
854
+ scheduler = FIFOScheduler
855
+
856
+ ref_time = perf_counter()
857
+ batch_processor = ContinuousBatchProcessor(
858
+ paged_attention_cache,
859
+ self.model.config,
860
+ self.generation_config,
861
+ self.input_queue,
862
+ self.output_queue,
863
+ self.stop_event,
864
+ self.model.device,
865
+ self.model.dtype,
866
+ scheduler(paged_attention_cache, self.manual_eviction),
867
+ self.streaming,
868
+ self.manual_eviction,
869
+ slice_inputs=self.slice_inputs,
870
+ )
871
+ self.batch_processor = batch_processor
872
+ self.current_batch = 0
873
+ logger.debug(f"batch_processor created in {perf_counter() - ref_time} seconds")
874
+ while (not self.stop_event.is_set()) or batch_processor.has_pending_requests():
875
+ self._inner_generation_loop(batch_processor)
876
+ self.current_batch += 1
877
+
878
+ except Exception as e:
879
+ logger.error(f"Error in generation loop: {e}", exc_info=True)
880
+ self._handle_critical_error(e, batch_processor)
881
+ finally:
882
+ logger.info("Generation loop finished.")
883
+
884
+ @traced(span_name="generation_loop")
885
+ def _inner_generation_loop(self, batch_processor: ContinuousBatchProcessor):
886
+ if torch.cuda.is_available():
887
+ torch.cuda.synchronize()
888
+ if not batch_processor.prepare_next_batch():
889
+ return
890
+ if logger.getEffectiveLevel() <= logging.DEBUG:
891
+ device, total, reserved, allocated = get_device_and_memory_breakdown()
892
+ logger.debug(f"[Memory] Device: {device}, Total: {total}, Reserved: {reserved}, Allocated: {allocated}")
893
+ if torch.cuda.is_available() and self.use_cuda_graph:
894
+ if self.current_batch == 0:
895
+ self.warmup(batch_processor)
896
+ elif hasattr(self, "graph"):
897
+ try:
898
+ self._graph_replay()
899
+ except Exception as e:
900
+ logger.error(f"Model forward pass failed: {e}", exc_info=True)
901
+ batch_processor.handle_batch_error(e)
902
+ return
903
+ else:
904
+ self._generation_step(batch_processor)
905
+ else:
906
+ self._generation_step(batch_processor)
907
+ if torch.cuda.is_available():
908
+ torch.cuda.synchronize()
909
+ batch_processor.update_batch()
910
+
911
+ @traced(span_name="graph_replay")
912
+ def _graph_replay(self):
913
+ self.graph.replay()
914
+
915
+ @traced
916
+ def _handle_critical_error(self, error, batch_processor: Optional[ContinuousBatchProcessor]):
917
+ """Handle critical errors that terminate the generation loop."""
918
+ # Signal stop
919
+ self.stop_event.set()
920
+
921
+ # Fail pending requests in input queue
922
+ try:
923
+ while True:
924
+ req_data = self.input_queue.get_nowait()
925
+ if batch_processor is not None:
926
+ batch_processor._handle_request_error(error, req_data)
927
+ except queue.Empty:
928
+ pass
929
+
930
+ # Fail active requests
931
+ if batch_processor is not None:
932
+ batch_processor.fail_all_requests(error)
933
+
934
+ @traced
935
+ def evict_request_from_cache(self, request_id: str):
936
+ """Evict a request from the cache. It is assumed that the request is already finished."""
937
+ if not self.manual_eviction:
938
+ raise RuntimeError("Manual eviction is not enabled for this manager.")
939
+ if self.batch_processor is not None:
940
+ self.batch_processor.scheduler.finish_request(request_id)
941
+
942
+
943
+ class ContinuousMixin:
944
+ """Mixin class for models to add continuous batching capabilities."""
945
+
946
+ def init_continuous_batching(
947
+ self,
948
+ generation_config: Optional[GenerationConfig] = None,
949
+ manual_eviction: bool = False,
950
+ max_queue_size: int = 0,
951
+ streaming: bool = False,
952
+ slice_inputs: bool = True,
953
+ ) -> ContinuousBatchingManager:
954
+ """Initialize a manager for continuous batching inference.
955
+
956
+ Args:
957
+ generation_config: Custom generation configuration
958
+ max_queue_size: Maximum size of the input request queue
959
+ streaming: Whether to stream tokens as they are generated
960
+
961
+ Returns:
962
+ `ContinuousBatchingManager`: The manager instance to add requests and retrieve results.
963
+ """
964
+ if not hasattr(self, "config") or not hasattr(self, "device") or not hasattr(self, "dtype"):
965
+ raise AttributeError("Model must have 'config', 'device', and 'dtype' attributes.")
966
+
967
+ gen_config = generation_config if generation_config is not None else self.generation_config
968
+ if gen_config is None:
969
+ raise ValueError("A GenerationConfig must be provided or set in the model.")
970
+
971
+ if gen_config.eos_token_id is None:
972
+ logger.warning("`eos_token_id` not set in GenerationConfig. Setting to -1 (disabled).")
973
+ gen_config.eos_token_id = -1
974
+
975
+ # Create and return the manager
976
+ return ContinuousBatchingManager(
977
+ model=self,
978
+ generation_config=gen_config,
979
+ manual_eviction=manual_eviction,
980
+ max_queue_size=max_queue_size,
981
+ streaming=streaming,
982
+ slice_inputs=slice_inputs,
983
+ )
984
+
985
+ @traced
986
+ @torch.inference_mode()
987
+ def generate_batch(
988
+ self,
989
+ inputs: list[list[int]],
990
+ generation_config: Optional[GenerationConfig] = None,
991
+ progress_bar: bool = True,
992
+ slice_inputs: bool = True,
993
+ **kwargs,
994
+ ) -> dict[str, GenerationOutput]:
995
+ """Generate sequences for a batch of prompts using continuous batching.
996
+
997
+ Args:
998
+ inputs: List of input token sequences (prompts)
999
+ generation_config: Optional generation configuration
1000
+ **kwargs: Additional generation parameters
1001
+
1002
+ Returns:
1003
+ `dict[str, GenerationOutput]`: A mapping from request ID to the finished generation
1004
+ output for each input prompt. Requests that fail before finishing are not
1005
+ included in the mapping.
1006
+ """
1007
+ if not inputs:
1008
+ return {}
1009
+ if logger.getEffectiveLevel() <= logging.DEBUG:
1010
+ logger.warning("Progress bar is disabled when logger level is less than DEBUG")
1011
+ progress_bar = False
1012
+
1013
+ # Initialize manager with the batch inputs
1014
+ manager = self.init_continuous_batching(generation_config=generation_config, slice_inputs=slice_inputs)
1015
+ manager.start()
1016
+ results = {}
1017
+ num_requests = len(inputs)
1018
+ try:
1019
+ from tqdm.contrib.logging import logging_redirect_tqdm
1020
+
1021
+ with logging_redirect_tqdm([logger]):
1022
+ with tqdm(
1023
+ total=num_requests,
1024
+ disable=(not progress_bar),
1025
+ desc=f"Solving {num_requests} requests",
1026
+ unit="request",
1027
+ ) as pbar:
1028
+ manager.add_requests(inputs, **kwargs)
1029
+ finished_count = 0
1030
+ while finished_count < num_requests:
1031
+ result = manager.get_result(timeout=1)
1032
+ if result:
1033
+ req_id = result.request_id
1034
+ if result.status == RequestStatus.FINISHED:
1035
+ results[req_id] = result
1036
+ finished_count += 1
1037
+ pbar.update(1)
1038
+ else:
1039
+ if not manager.is_running():
1040
+ logger.error("Generation thread terminated unexpectedly.")
1041
+ break
1042
+
1043
+ except Exception as e:
1044
+ logger.error(f"Error during batch generation: {e}", exc_info=True)
1045
+ finally:
1046
+ manager.stop(block=True, timeout=5.0)
1047
+ return results
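+ # Illustrative usage sketch (added for clarity; not part of the original file,
+ # and `model`/`tokenizer` are assumed to be a loaded causal LM inheriting this
+ # mixin and its tokenizer):
+ #
+ #     prompts = [tokenizer(p)["input_ids"] for p in ("Hello", "Hi there")]
+ #     results = model.generate_batch(prompts, max_new_tokens=16)
+ #     for request_id, output in results.items():
+ #         print(request_id, tokenizer.decode(output.generated_tokens))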
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/requests.py ADDED
@@ -0,0 +1,204 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The HuggingFace Inc. team
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import time
16
+ from dataclasses import dataclass, field
17
+ from enum import Enum
18
+ from typing import Optional
19
+
20
+ import torch
21
+
22
+ from ...utils.logging import logging
23
+ from ...utils.metrics import traced
24
+
25
+
26
+ # We centralize the logger here to coordinate between logging and progress bar
27
+ logger = logging.getLogger("ContinuousBatchingLogger")
28
+ # logger.setLevel(logging.INFO)
29
+
30
+
31
+ def get_device_and_memory_breakdown() -> tuple[torch.device, int, int, int]:
32
+ if torch.cuda.is_available():
33
+ device = torch.device("cuda")
34
+ torch.cuda.empty_cache()
35
+ torch.cuda.synchronize()
36
+ total_memory = torch.cuda.get_device_properties(device).total_memory
37
+ reserved_memory = torch.cuda.memory_reserved(device)
38
+ allocated_memory = torch.cuda.memory_allocated(device)
39
+ elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
40
+ device = torch.device("mps")
41
+ # MPS memory reporting (PyTorch 2.0+)
42
+ total_memory = torch.mps.driver_allocated_memory()
43
+ allocated_memory = total_memory - torch.mps.recommended_max_memory()
44
+ reserved_memory = 0 # MPS does not track reserved separately
45
+ else:
46
+ device = torch.device("cpu")
47
+ total_memory = None
48
+ reserved_memory = 0
49
+ allocated_memory = 0
50
+ return device, total_memory, reserved_memory, allocated_memory
51
+
52
+
53
+ class RequestStatus(Enum):
54
+ """Status of a generation request through its lifecycle."""
55
+
56
+ PENDING = "pending"
57
+ PREFILLING = "prefilling"
58
+ PREFILLING_SPLIT = "prefilling_split"
59
+ SPLIT_PENDING_REMAINDER = "split_pending_remainder"
60
+ DECODING = "decoding"
61
+ FINISHED = "finished"
62
+ FAILED = "failed"
63
+
64
+
65
+ @dataclass
66
+ class GenerationOutput:
67
+ """Tracks the output of a generation request.
68
+
69
+ Attributes:
70
+ request_id (str): The ID of the generation request.
71
+ prompt_ids (list[int]): The IDs of the prompt tokens.
72
+ generated_tokens (list[int]): The generated tokens.
73
+ logprobs (list[float]): The log probabilities of the generated tokens.
74
+ error (Optional[str]): Any error message associated with the request. When None, the request was successful.
75
+ status (RequestStatus): The status of the request.
76
+ created_time (float): The time the request was created.
77
+ """
78
+
79
+ request_id: str
80
+ prompt_ids: list[int] = field(default_factory=list)
81
+ generated_tokens: list[int] = field(default_factory=list)
82
+ logprobs: list[float] = field(default_factory=list)
83
+ error: Optional[str] = None
84
+ status: RequestStatus = RequestStatus.PENDING
85
+ created_time: float = field(default_factory=time.time)
86
+
87
+
88
+ @dataclass
89
+ class RequestState:
90
+ """Tracks the state of a generation request through its lifecycle.
91
+
92
+ Attributes:
93
+ request_id (str): The ID of the generation request.
94
+ full_prompt_ids (list[int] | None): The tokens IDs of the full prompt.
95
+ prompt_ids (list[int] | None): The tokens IDs currently being processed.
96
+ remaining_prompt_ids (list[int]): The tokens IDs remaining to be processed (for split requests).
97
+ static_outputs (list[int]): The generated tokens.
98
+ allocated_blocks (int): The number of blocks allocated to the request.
99
+ position_offset (int): The current position in the sequence for position_ids.
100
+ status (RequestStatus): The status of the request: can be one of PENDING, PREFILLING, PREFILLING_SPLIT,
101
+ SPLIT_PENDING_REMAINDER, DECODING, FINISHED, FAILED
102
+ max_new_tokens (int): The maximum number of new tokens to generate.
103
+ eos_token_id (int): The ID of the end-of-sequence token.
104
+ created_time (float): The time the request was created.
105
+ error (Optional[str]): Any error message associated with the request. When None, has had no error yet.
106
+ """
107
+
108
+ # Required fields
109
+ request_id: str
110
+ full_prompt_ids: Optional[list[int]] = None # Full initial prompt
111
+ prompt_ids: Optional[list[int]] = None # Tokens IDs currently being processed (initial + generated)
112
+ remaining_prompt_ids: list[int] = field(default_factory=list) # For split requests, prefill left to process
113
+ static_outputs: list[int] = field(default_factory=list) # Generated tokens
114
+ allocated_blocks: int = 0 # Number of blocks allocated to the request
115
+ position_offset: int = 0 # Current position in the sequence for position_ids
116
+ _status: RequestStatus = RequestStatus.PENDING # Status of the request, hidden behind a property
117
+ max_new_tokens: int = 20 # Maximum number of new tokens to generate
118
+ eos_token_id: int = -1 # ID of the end-of-sequence token
119
+ created_time: float = field(default_factory=time.time) # Time the request was created
120
+ error: Optional[str] = None # Error message if the request failed
121
+ lifespan: tuple[float, float] = (-1, -1) # (time request was no longer pending, time request finished)
122
+
123
+ @property
124
+ def status(self) -> RequestStatus:
125
+ return self._status
126
+
127
+ @status.setter
128
+ def status(self, value: RequestStatus):
129
+ if self._status == RequestStatus.PENDING:
130
+ self.lifespan = (time.time(), -1)
131
+ elif value == RequestStatus.FINISHED:
132
+ self.lifespan = (self.lifespan[0], time.time())
133
+ self.log_end_of_request()
134
+ self._status = value
135
+
136
+ def log_end_of_request(self):
137
+ prefill_len = len(self.full_prompt_ids)
138
+ decode_len = self.generated_len()
139
+ start_time = self.lifespan[0] - self.created_time
140
+ end_time = self.lifespan[1] - self.created_time
141
+ logger.info(
142
+ f"Request {self.request_id} finished: {prefill_len = } {decode_len = } {start_time = } {end_time = }"
143
+ )
144
+
145
+ def current_len(self) -> int:
146
+ """Get the current length of the sequence (prompt + generated tokens)."""
147
+ return self.position_offset
148
+
149
+ def generated_len(self) -> int:
150
+ """Get the number of tokens generated so far."""
151
+ return len(self.static_outputs)
152
+
153
+ # TODO: this logic seems one token off, check it out
154
+ @traced
155
+ def update_with_token(self, token_id: int) -> bool:
156
+ """Update the request with a newly generated token and check for completion.
157
+
158
+ Args:
159
+ token_id: The token ID to add to the output sequence
160
+
161
+ Returns:
162
+ bool: True if the request is now complete, False otherwise
163
+ """
164
+ # Only update if we're in decoding state
165
+ if self.status != RequestStatus.DECODING:
166
+ return False
167
+
168
+ is_eos = token_id == self.eos_token_id and self.eos_token_id != -1
169
+ is_max_len = self.generated_len() >= self.max_new_tokens
170
+
171
+ # Only add the token if we're not finishing due to max length
172
+ # (EOS tokens should still be added to the output)
173
+ if not (is_max_len and not is_eos):
174
+ self.static_outputs.extend([token_id])
175
+
176
+ if is_eos or is_max_len:
177
+ self.status = RequestStatus.FINISHED
178
+ return True
179
+ return False
180
+
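+ # Illustrative walk-through (added for clarity; not part of the original file):
+ # for a DECODING request with max_new_tokens=2 and eos_token_id=2:
+ #
+ #     state.update_with_token(5)   # False, static_outputs == [5]
+ #     state.update_with_token(7)   # False, static_outputs == [5, 7]
+ #     state.update_with_token(9)   # True (max length reached), 9 is not appended
+ #
+ # An EOS token that arrives before the limit is appended: on a fresh state,
+ # update_with_token(2) returns True with static_outputs == [2].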
181
+ def __repr__(self):
182
+ msg = [
183
+ f"request_id={self.request_id}",
184
+ f"status={self._status}",
185
+ f"out_tokens={self.generated_len()}",
186
+ f"query_length={len(self.prompt_ids)}",
187
+ f"remaining_tokens={len(self.remaining_prompt_ids)}",
188
+ f"kv_length={self.position_offset}",
189
+ f"full_prompt_length={len(self.full_prompt_ids)}",
190
+ f"allocated_blocks={self.allocated_blocks}",
191
+ f"generated_tokens={self.static_outputs}",
192
+ ]
193
+ return "RequestState(\n\t" + ",\n\t".join(msg) + "\n)"
194
+
195
+ def to_generation_output(self):
196
+ """Convert the request state to a GenerationOutput object."""
197
+ return GenerationOutput(
198
+ request_id=self.request_id,
199
+ prompt_ids=self.full_prompt_ids,
200
+ status=self.status,
201
+ generated_tokens=self.static_outputs,
202
+ logprobs=[],
203
+ error=self.error,
204
+ )
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/generation/continuous_batching/scheduler.py ADDED
@@ -0,0 +1,300 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The HuggingFace Inc. team
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import threading
16
+ from abc import ABC, abstractmethod
17
+ from collections import deque
18
+
19
+ from ...utils.metrics import attach_tracer, traced
20
+ from .cache import PagedAttentionCache
21
+ from .requests import RequestState, RequestStatus
22
+
23
+
24
+ class Scheduler(ABC):
25
+ """
26
+ Abstract base class for scheduling requests in the continuous batch processor. Schedulers manage the lifecycle of
27
+ requests from when they are added to the waiting queue to when they are scheduled for processing. Different
28
+ schedulers implement different strategies for prioritizing and batching requests.
29
+ """
30
+
31
+ def __init__(self, cache: PagedAttentionCache, retain_cache_on_finish: bool = False):
32
+ self.active_requests: dict[str, RequestState] = {}
33
+ self.waiting_requests: dict[str, RequestState] = {}
34
+ self.waiting_requests_order: deque[str] = deque()
35
+ self.cache = cache
36
+ self.retain_cache_on_finish = retain_cache_on_finish
37
+ self._cancellation_lock = threading.Lock()
38
+ self._requests_to_cancel: set[str] = set()
39
+
40
+ @traced
41
+ def add_waiting_request(self, state: RequestState):
42
+ """Adds a request to the waiting list."""
43
+ if self.retain_cache_on_finish and state.request_id in self.active_requests:
44
+ old_state = self.active_requests.pop(state.request_id)
45
+ state.prompt_ids = state.prompt_ids[len(old_state.full_prompt_ids) :] # XXX: check for indexing error?
46
+ state.allocated_blocks = old_state.allocated_blocks
47
+ state.position_offset = old_state.position_offset
48
+ self.waiting_requests[state.request_id] = state
49
+ self.waiting_requests_order.append(state.request_id)
50
+
51
+ @abstractmethod
52
+ def schedule_batch(self, token_budget: int) -> list[RequestState]:
53
+ """Schedules requests for the next batch based on available token budget. This method selects which requests
54
+ should be processed in the current batch, considering the token budget and the scheduler's prioritization rules.
55
+ The token_budget is the maximum number of tokens that can be processed in this batch."""
56
+ pass
57
+
58
+ @traced
59
+ def has_pending_requests(self) -> bool:
60
+ """Checks if there are requests ready to be processed."""
61
+ return len(self.active_requests) > 0 or len(self.waiting_requests) > 0
62
+
63
+ @traced
64
+ def finish_request(self, request_id: str, evict_from_cache: bool = True):
65
+ """Completes processing of a request and optionally frees its allocated cache blocks. This method is called
66
+ when a request has finished generation or encountered an error.
67
+ """
68
+ if evict_from_cache:
69
+ self.cache.free_blocks(request_id)
70
+ if request_id in self.active_requests:
71
+ del self.active_requests[request_id]
72
+
73
+ @traced
74
+ def get_active_request_static_outputs(self, request_id: str) -> list[int]:
75
+ """Gets generated tokens for an active request."""
76
+ if request_id in self.active_requests:
77
+ return self.active_requests[request_id].static_outputs
78
+ return []
79
+
80
+ @traced
81
+ def set_request_cancellation(self, request_id: str):
82
+ """Marks a request for cancellation."""
83
+ with self._cancellation_lock:
84
+ self._requests_to_cancel.add(request_id)
85
+
86
+ @traced
87
+ def clear_cancelled_requests(self):
88
+ """Remove all cancelled requests from active and waiting queues."""
89
+ with self._cancellation_lock:
90
+ for request_id in self._requests_to_cancel:
91
+ if request_id in self.active_requests:
92
+ del self.active_requests[request_id]
93
+ if request_id in self.waiting_requests:
94
+ del self.waiting_requests[request_id]
95
+ if request_id in self.waiting_requests_order:
96
+ self.waiting_requests_order.remove(request_id)
97
+ self.cache.free_blocks(request_id)
98
+ self._requests_to_cancel = set()
99
+
100
+ @traced
101
+ def request_is_cancelled(self, request_id: str) -> bool:
102
+ """Checks if a request has been cancelled or removed."""
103
+ return request_id in self._requests_to_cancel or (
104
+ request_id not in self.active_requests and request_id not in self.waiting_requests
105
+ )
106
+
107
+ @traced
108
+ def _allocate_blocks_if_needed(self, state: RequestState, len_next_tokens: int) -> bool:
109
+ """Allocate additional cache blocks for a request if the currently allocated blocks are insufficient to
110
+ accommodate the next tokens. It calculates how many blocks are needed based on the request's current
111
+ cache occupancy and the number of tokens to be processed. The allocation itself is done by the CacheAllocator
112
+ objects. Returns a boolean indicating if the allocation was successful or not.
113
+ """
114
+ # 1. we check that the occupancy is less than the requested length
115
+ # 2. we allocate enough blocks to cover the requested length
116
+ current_len = state.current_len()
117
+ occupancy = state.allocated_blocks * self.cache.block_size - current_len
118
+ if occupancy < len_next_tokens or state.allocated_blocks == 0:
119
+ blocks_needed = ((len_next_tokens - occupancy + 1) // self.cache.block_size) + 1
120
+ allocated = self.cache.allocate_blocks(blocks_needed, state.request_id)
121
+ if allocated is None:
122
+ return False
123
+ state.allocated_blocks += allocated
124
+ return True
125
+
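To make the allocation arithmetic above concrete, here is a worked example with a hypothetical block size of 16 (the real value comes from the cache configuration):

block_size = 16                       # assumed; self.cache.block_size in practice
allocated_blocks, current_len = 2, 30
len_next_tokens = 10

occupancy = allocated_blocks * block_size - current_len    # 32 - 30 = 2 free slots
assert occupancy < len_next_tokens                         # 2 < 10 -> must allocate
blocks_needed = ((len_next_tokens - occupancy + 1) // block_size) + 1
assert blocks_needed == 1                                  # (9 // 16) + 1
# After allocation: 3 blocks * 16 = 48 slots >= 30 + 10 tokens.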
126
+ @traced(span_name="prepare_request")
127
+ def _prepare_request_for_processing(
128
+ self, state: RequestState, token_budget: int, request_ids_to_remove_from_waiting: set[str]
129
+ ):
130
+ """Prepares a request for processing in the current batch."""
131
+ request_tokens = (
132
+ state.remaining_prompt_ids if state.status == RequestStatus.SPLIT_PENDING_REMAINDER else state.prompt_ids
133
+ )
134
+ if len(request_tokens) < token_budget:
135
+ # Can process the entire prompt/remainder
136
+ if state.status == RequestStatus.PENDING:
137
+ self.active_requests[state.request_id] = state
138
+ state.status = RequestStatus.PREFILLING
139
+ request_ids_to_remove_from_waiting.add(state.request_id)
140
+ elif state.status == RequestStatus.SPLIT_PENDING_REMAINDER:
141
+ state.status = RequestStatus.PREFILLING
142
+ state.prompt_ids = state.remaining_prompt_ids
143
+ state.remaining_prompt_ids = []
144
+ else:
145
+ # Need to split the request
146
+ if state.status == RequestStatus.PENDING:
147
+ self.active_requests[state.request_id] = state
148
+ state.status = RequestStatus.PREFILLING_SPLIT
149
+ request_ids_to_remove_from_waiting.add(state.request_id)
150
+ elif state.status == RequestStatus.SPLIT_PENDING_REMAINDER:
151
+ state.status = RequestStatus.PREFILLING_SPLIT
152
+ state.remaining_prompt_ids = request_tokens[token_budget:]
153
+ state.prompt_ids = request_tokens[:token_budget]
154
+
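A concrete illustration of the splitting branch above, with a hypothetical 10-token prompt and a token budget of 4:

prompt_ids, token_budget = list(range(10)), 4   # assumed sizes for illustration

# First pass: the request becomes PREFILLING_SPLIT and only a chunk is prefilled.
chunk, remainder = prompt_ids[:token_budget], prompt_ids[token_budget:]
assert chunk == [0, 1, 2, 3] and len(remainder) == 6
# Later passes consume `remainder` as SPLIT_PENDING_REMAINDER until it fits the budget.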
155
+
156
+ @attach_tracer()
157
+ class FIFOScheduler(Scheduler):
158
+ """This scheduler processes requests in the order they arrive, meaning decoding requests has priority over
159
+ prefilling requests. Additionally, it includes a safety margin mechanism to prevent cache exhaustion. By default,
160
+ when 80% of the cache is full, new requests will not be scheduled to prioritize decoding active requests."""
161
+
162
+ def __init__(self, cache: PagedAttentionCache, retain_cache_on_finish: bool = False, safety_margin: float = 0.2):
163
+ """Initializes the FIFO scheduler. The safety margin is the percentage of free blocks under which we stop
164
+ scheduling new prefill requests, so safety_margin = 0.1 means that when there is less than 10% of free blocks,
165
+ or equivalently when more than 90% of blocks are already allocated, we stop scheduling new prefill requests.
166
+ """
167
+ super().__init__(cache, retain_cache_on_finish)
168
+ self.safety_margin = safety_margin
169
+
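For instance, with the default safety_margin=0.2 and a hypothetical cache of 1000 blocks, new prefills stop being scheduled once fewer than 200 blocks remain free:

num_blocks, safety_margin = 1000, 0.2    # assumed cache size for illustration
threshold = safety_margin * num_blocks   # 200 blocks
free_blocks = 150
outside_safety_margin = free_blocks < threshold   # True -> only decoding proceeds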
170
+ @traced
171
+ def schedule_batch(self, token_budget: int) -> list[RequestState]:
172
+ priority_states: list[RequestState] = []
173
+ second_priority_states: list[RequestState] = []
174
+ scheduled_requests = []
175
+
176
+ for state in self.active_requests.values():
177
+ if state.status == RequestStatus.DECODING:
178
+ priority_states.append(state)
179
+ if state.status in [RequestStatus.SPLIT_PENDING_REMAINDER, RequestStatus.PREFILLING_SPLIT]:
180
+ second_priority_states.append(state)
181
+
182
+ # Add waiting requests to second priority
183
+ for req_id in self.waiting_requests_order:
184
+ second_priority_states.append(self.waiting_requests[req_id])
185
+
186
+ candidates = priority_states + second_priority_states
187
+ request_ids_to_remove_from_waiting = set()
188
+ safety_margins = self.safety_margin * self.cache.num_blocks
189
+
190
+ for state in candidates:
191
+ # If we are outside the safety margin, we only accept decoding requests or the first prefill request
192
+ num_free_blocks = self.cache.get_num_free_blocks()
193
+ outside_safety_margin = num_free_blocks < safety_margins
194
+ if outside_safety_margin and scheduled_requests and state.status != RequestStatus.DECODING:
195
+ break
196
+
197
+ self._prepare_request_for_processing(state, token_budget, request_ids_to_remove_from_waiting)
198
+ request_len = len(state.prompt_ids)
199
+ if not self._allocate_blocks_if_needed(
200
+ state, len(state.prompt_ids)
201
+ ): # don't schedule if we can't allocate blocks
202
+ if len(self.cache._free_blocks) == 0:
203
+ break
204
+ continue
205
+
206
+ @traced
207
+ def _add_to_scheduled_requests(state: RequestState):
208
+ scheduled_requests.append(state)
209
+
210
+ _add_to_scheduled_requests(state)
211
+
212
+ token_budget -= request_len
213
+
214
+ @traced
215
+ def _remove_from_waiting_requests(state: RequestState):
216
+ req_id = state.request_id
217
+ if req_id in self.waiting_requests:
218
+ del self.waiting_requests[req_id]
219
+ request_ids_to_remove_from_waiting.add(req_id)
220
+
221
+ _remove_from_waiting_requests(state)
222
+
223
+ if token_budget == 0:
224
+ break
225
+
226
+ self.waiting_requests_order = deque(
227
+ [req_id for req_id in self.waiting_requests_order if req_id not in request_ids_to_remove_from_waiting]
228
+ )
229
+
230
+ return scheduled_requests
231
+
232
+
233
+ # FIXME: prioritize adding from waiting reqs before scheduling `RequestStatus.DECODING` when cache space allows it
234
+ @attach_tracer()
235
+ class PrefillFirstScheduler(Scheduler):
236
+ """Scheduler that prioritizes split prefill requests over decoding requests. This scheduler ensures that split
237
+ prefill requests (which are continuations of partially processed prompts) are completed before processing new
238
+ decoding requests."""
239
+
240
+ @traced
241
+ def schedule_batch(self, token_budget: int) -> list[RequestState]:
242
+ priority_states: list[RequestState] = []
243
+ second_priority_states: list[RequestState] = []
244
+ scheduled_requests = []
245
+
246
+ for state in self.active_requests.values():
247
+ # XXX: when cache is full, state can stay on `PREFILLING_SPLIT` so we need to take those into account
248
+ if state.status in [RequestStatus.PREFILLING_SPLIT, RequestStatus.SPLIT_PENDING_REMAINDER]:
249
+ priority_states.append(state)
250
+ elif state.status == RequestStatus.DECODING:
251
+ second_priority_states.append(state)
252
+
253
+ for req_id in self.waiting_requests_order:
254
+ second_priority_states.append(self.waiting_requests[req_id])
255
+
256
+ candidates = priority_states + second_priority_states
257
+
258
+ request_ids_to_remove_from_waiting = set()
259
+
260
+ for state in candidates:
261
+ self._prepare_request_for_processing(state, token_budget, request_ids_to_remove_from_waiting)
262
+ request_len = len(state.prompt_ids)
263
+ if not self._allocate_blocks_if_needed(
264
+ state, len(state.prompt_ids)
265
+ ): # don't schedule if we can't allocate blocks
266
+ if len(self.cache._free_blocks) == 0:
267
+ break
268
+ continue
269
+
270
+ @traced
271
+ def _add_to_scheduled_requests(state: RequestState):
272
+ scheduled_requests.append(state)
273
+
274
+ _add_to_scheduled_requests(state)
275
+
276
+ token_budget -= request_len
277
+
278
+ @traced
279
+ def _remove_from_waiting_requests(state: RequestState):
280
+ req_id = state.request_id
281
+ if req_id in self.waiting_requests:
282
+ del self.waiting_requests[req_id]
283
+ request_ids_to_remove_from_waiting.add(req_id)
284
+
285
+ _remove_from_waiting_requests(state)
286
+
287
+ if token_budget == 0:
288
+ break
289
+
290
+ self.waiting_requests_order = deque(
291
+ [req_id for req_id in self.waiting_requests_order if req_id not in request_ids_to_remove_from_waiting]
292
+ )
293
+
294
+ return scheduled_requests
295
+
296
+
297
+ SCHEDULER_MAPPING = {
298
+ "fifo": FIFOScheduler,
299
+ "prefill_first": PrefillFirstScheduler,
300
+ }
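A hedged sketch of how this mapping might be consumed by calling code; `scheduler_name` and `cache` are assumed to come from the caller's configuration (the `PagedAttentionCache` construction is elided):

# Look up and instantiate a scheduler by name, defaulting to FIFO.
scheduler_cls = SCHEDULER_MAPPING.get(scheduler_name, FIFOScheduler)
scheduler = scheduler_cls(cache)

# One step of a serving loop: schedule work under a token budget, run it,
# then retire requests that produced their final token.
for state in scheduler.schedule_batch(token_budget=256):
    ...  # run prefill/decode for `state`, then scheduler.finish_request(...) when done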
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/albert/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+
16
+ from ...utils import _LazyModule
17
+ from ...utils.import_utils import define_import_structure
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from .configuration_albert import *
22
+ from .modeling_albert import *
23
+ from .modeling_flax_albert import *
24
+ from .modeling_tf_albert import *
25
+ from .tokenization_albert import *
26
+ from .tokenization_albert_fast import *
27
+ else:
28
+ import sys
29
+
30
+ _file = globals()["__file__"]
31
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/albert/configuration_albert.py ADDED
@@ -0,0 +1,170 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ALBERT model configuration"""
17
+
18
+ from collections import OrderedDict
19
+ from collections.abc import Mapping
20
+
21
+ from ...configuration_utils import PretrainedConfig
22
+ from ...onnx import OnnxConfig
23
+
24
+
25
+ class AlbertConfig(PretrainedConfig):
26
+ r"""
27
+ This is the configuration class to store the configuration of a [`AlbertModel`] or a [`TFAlbertModel`]. It is used
28
+ to instantiate an ALBERT model according to the specified arguments, defining the model architecture. Instantiating
29
+ a configuration with the defaults will yield a similar configuration to that of the ALBERT
30
+ [albert/albert-xxlarge-v2](https://huggingface.co/albert/albert-xxlarge-v2) architecture.
31
+
32
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
33
+ documentation from [`PretrainedConfig`] for more information.
34
+
35
+ Args:
36
+ vocab_size (`int`, *optional*, defaults to 30000):
37
+ Vocabulary size of the ALBERT model. Defines the number of different tokens that can be represented by the
38
+ `inputs_ids` passed when calling [`AlbertModel`] or [`TFAlbertModel`].
39
+ embedding_size (`int`, *optional*, defaults to 128):
40
+ Dimensionality of vocabulary embeddings.
41
+ hidden_size (`int`, *optional*, defaults to 4096):
42
+ Dimensionality of the encoder layers and the pooler layer.
43
+ num_hidden_layers (`int`, *optional*, defaults to 12):
44
+ Number of hidden layers in the Transformer encoder.
45
+ num_hidden_groups (`int`, *optional*, defaults to 1):
46
+ Number of groups for the hidden layers, parameters in the same group are shared.
47
+ num_attention_heads (`int`, *optional*, defaults to 64):
48
+ Number of attention heads for each attention layer in the Transformer encoder.
49
+ intermediate_size (`int`, *optional*, defaults to 16384):
50
+ The dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
51
+ inner_group_num (`int`, *optional*, defaults to 1):
52
+ The number of inner repetition of attention and ffn.
53
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu_new"`):
54
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
55
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
56
+ hidden_dropout_prob (`float`, *optional*, defaults to 0):
57
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
58
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0):
59
+ The dropout ratio for the attention probabilities.
60
+ max_position_embeddings (`int`, *optional*, defaults to 512):
61
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
62
+ (e.g., 512 or 1024 or 2048).
63
+ type_vocab_size (`int`, *optional*, defaults to 2):
64
+ The vocabulary size of the `token_type_ids` passed when calling [`AlbertModel`] or [`TFAlbertModel`].
65
+ initializer_range (`float`, *optional*, defaults to 0.02):
66
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
67
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
68
+ The epsilon used by the layer normalization layers.
69
+ classifier_dropout_prob (`float`, *optional*, defaults to 0.1):
70
+ The dropout ratio for attached classifiers.
71
+ position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
72
+ Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
73
+ positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
74
+ [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
75
+ For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
76
+ with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
77
+ pad_token_id (`int`, *optional*, defaults to 0):
78
+ Padding token id.
79
+ bos_token_id (`int`, *optional*, defaults to 2):
80
+ Beginning of stream token id.
81
+ eos_token_id (`int`, *optional*, defaults to 3):
82
+ End of stream token id.
83
+
84
+ Examples:
85
+
86
+ ```python
87
+ >>> from transformers import AlbertConfig, AlbertModel
88
+
89
+ >>> # Initializing an ALBERT-xxlarge style configuration
90
+ >>> albert_xxlarge_configuration = AlbertConfig()
91
+
92
+ >>> # Initializing an ALBERT-base style configuration
93
+ >>> albert_base_configuration = AlbertConfig(
94
+ ... hidden_size=768,
95
+ ... num_attention_heads=12,
96
+ ... intermediate_size=3072,
97
+ ... )
98
+
99
+ >>> # Initializing a model (with random weights) from the ALBERT-base style configuration
100
+ >>> model = AlbertModel(albert_base_configuration)
101
+
102
+ >>> # Accessing the model configuration
103
+ >>> configuration = model.config
104
+ ```"""
105
+
106
+ model_type = "albert"
107
+
108
+ def __init__(
109
+ self,
110
+ vocab_size=30000,
111
+ embedding_size=128,
112
+ hidden_size=4096,
113
+ num_hidden_layers=12,
114
+ num_hidden_groups=1,
115
+ num_attention_heads=64,
116
+ intermediate_size=16384,
117
+ inner_group_num=1,
118
+ hidden_act="gelu_new",
119
+ hidden_dropout_prob=0,
120
+ attention_probs_dropout_prob=0,
121
+ max_position_embeddings=512,
122
+ type_vocab_size=2,
123
+ initializer_range=0.02,
124
+ layer_norm_eps=1e-12,
125
+ classifier_dropout_prob=0.1,
126
+ position_embedding_type="absolute",
127
+ pad_token_id=0,
128
+ bos_token_id=2,
129
+ eos_token_id=3,
130
+ **kwargs,
131
+ ):
132
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
133
+
134
+ self.vocab_size = vocab_size
135
+ self.embedding_size = embedding_size
136
+ self.hidden_size = hidden_size
137
+ self.num_hidden_layers = num_hidden_layers
138
+ self.num_hidden_groups = num_hidden_groups
139
+ self.num_attention_heads = num_attention_heads
140
+ self.inner_group_num = inner_group_num
141
+ self.hidden_act = hidden_act
142
+ self.intermediate_size = intermediate_size
143
+ self.hidden_dropout_prob = hidden_dropout_prob
144
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
145
+ self.max_position_embeddings = max_position_embeddings
146
+ self.type_vocab_size = type_vocab_size
147
+ self.initializer_range = initializer_range
148
+ self.layer_norm_eps = layer_norm_eps
149
+ self.classifier_dropout_prob = classifier_dropout_prob
150
+ self.position_embedding_type = position_embedding_type
151
+
152
+
153
+ # Copied from transformers.models.bert.configuration_bert.BertOnnxConfig with Bert->Albert
154
+ class AlbertOnnxConfig(OnnxConfig):
155
+ @property
156
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
157
+ if self.task == "multiple-choice":
158
+ dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
159
+ else:
160
+ dynamic_axis = {0: "batch", 1: "sequence"}
161
+ return OrderedDict(
162
+ [
163
+ ("input_ids", dynamic_axis),
164
+ ("attention_mask", dynamic_axis),
165
+ ("token_type_ids", dynamic_axis),
166
+ ]
167
+ )
168
+
169
+
170
+ __all__ = ["AlbertConfig", "AlbertOnnxConfig"]
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/albert/modeling_albert.py ADDED
@@ -0,0 +1,1349 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch ALBERT model."""
16
+
17
+ import math
18
+ import os
19
+ from dataclasses import dataclass
20
+ from typing import Optional, Union
21
+
22
+ import torch
23
+ from torch import nn
24
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
25
+
26
+ from ...activations import ACT2FN
27
+ from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa
28
+ from ...modeling_outputs import (
29
+ BaseModelOutput,
30
+ BaseModelOutputWithPooling,
31
+ MaskedLMOutput,
32
+ MultipleChoiceModelOutput,
33
+ QuestionAnsweringModelOutput,
34
+ SequenceClassifierOutput,
35
+ TokenClassifierOutput,
36
+ )
37
+ from ...modeling_utils import PreTrainedModel
38
+ from ...pytorch_utils import (
39
+ apply_chunking_to_forward,
40
+ find_pruneable_heads_and_indices,
41
+ prune_linear_layer,
42
+ )
43
+ from ...utils import ModelOutput, auto_docstring, logging
44
+ from .configuration_albert import AlbertConfig
45
+
46
+
47
+ logger = logging.get_logger(__name__)
48
+
49
+
50
+ def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
51
+ """Load tf checkpoints in a pytorch model."""
52
+ try:
53
+ import re
54
+
55
+ import numpy as np
56
+ import tensorflow as tf
57
+ except ImportError:
58
+ logger.error(
59
+ "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
60
+ "https://www.tensorflow.org/install/ for installation instructions."
61
+ )
62
+ raise
63
+ tf_path = os.path.abspath(tf_checkpoint_path)
64
+ logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
65
+ # Load weights from TF model
66
+ init_vars = tf.train.list_variables(tf_path)
67
+ names = []
68
+ arrays = []
69
+ for name, shape in init_vars:
70
+ logger.info(f"Loading TF weight {name} with shape {shape}")
71
+ array = tf.train.load_variable(tf_path, name)
72
+ names.append(name)
73
+ arrays.append(array)
74
+
75
+ for name, array in zip(names, arrays):
76
+ print(name)
77
+
78
+ for name, array in zip(names, arrays):
79
+ original_name = name
80
+
81
+ # If saved from the TF HUB module
82
+ name = name.replace("module/", "")
83
+
84
+ # Renaming and simplifying
85
+ name = name.replace("ffn_1", "ffn")
86
+ name = name.replace("bert/", "albert/")
87
+ name = name.replace("attention_1", "attention")
88
+ name = name.replace("transform/", "")
89
+ name = name.replace("LayerNorm_1", "full_layer_layer_norm")
90
+ name = name.replace("LayerNorm", "attention/LayerNorm")
91
+ name = name.replace("transformer/", "")
92
+
93
+ # The feed forward layer had an 'intermediate' step which has been abstracted away
94
+ name = name.replace("intermediate/dense/", "")
95
+ name = name.replace("ffn/intermediate/output/dense/", "ffn_output/")
96
+
97
+ # ALBERT attention was split between self and output which have been abstracted away
98
+ name = name.replace("/output/", "/")
99
+ name = name.replace("/self/", "/")
100
+
101
+ # The pooler is a linear layer
102
+ name = name.replace("pooler/dense", "pooler")
103
+
104
+ # The classifier was simplified to predictions from cls/predictions
105
+ name = name.replace("cls/predictions", "predictions")
106
+ name = name.replace("predictions/attention", "predictions")
107
+
108
+ # Naming was changed to be more explicit
109
+ name = name.replace("embeddings/attention", "embeddings")
110
+ name = name.replace("inner_group_", "albert_layers/")
111
+ name = name.replace("group_", "albert_layer_groups/")
112
+
113
+ # Classifier
114
+ if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name):
115
+ name = "classifier/" + name
116
+
117
+ # No ALBERT model currently handles the next sentence prediction task
118
+ if "seq_relationship" in name:
119
+ name = name.replace("seq_relationship/output_", "sop_classifier/classifier/")
120
+ name = name.replace("weights", "weight")
121
+
122
+ name = name.split("/")
123
+
124
+ # Ignore the gradients applied by the LAMB/ADAM optimizers.
125
+ if (
126
+ "adam_m" in name
127
+ or "adam_v" in name
128
+ or "AdamWeightDecayOptimizer" in name
129
+ or "AdamWeightDecayOptimizer_1" in name
130
+ or "global_step" in name
131
+ ):
132
+ logger.info(f"Skipping {'/'.join(name)}")
133
+ continue
134
+
135
+ pointer = model
136
+ for m_name in name:
137
+ if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
138
+ scope_names = re.split(r"_(\d+)", m_name)
139
+ else:
140
+ scope_names = [m_name]
141
+
142
+ if scope_names[0] == "kernel" or scope_names[0] == "gamma":
143
+ pointer = getattr(pointer, "weight")
144
+ elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
145
+ pointer = getattr(pointer, "bias")
146
+ elif scope_names[0] == "output_weights":
147
+ pointer = getattr(pointer, "weight")
148
+ elif scope_names[0] == "squad":
149
+ pointer = getattr(pointer, "classifier")
150
+ else:
151
+ try:
152
+ pointer = getattr(pointer, scope_names[0])
153
+ except AttributeError:
154
+ logger.info(f"Skipping {'/'.join(name)}")
155
+ continue
156
+ if len(scope_names) >= 2:
157
+ num = int(scope_names[1])
158
+ pointer = pointer[num]
159
+
160
+ if m_name[-11:] == "_embeddings":
161
+ pointer = getattr(pointer, "weight")
162
+ elif m_name == "kernel":
163
+ array = np.transpose(array)
164
+ try:
165
+ if pointer.shape != array.shape:
166
+ raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
167
+ except ValueError as e:
168
+ e.args += (pointer.shape, array.shape)
169
+ raise
170
+ print(f"Initialize PyTorch weight {name} from {original_name}")
171
+ pointer.data = torch.from_numpy(array)
172
+
173
+ return model
174
+
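A minimal usage sketch for the loader above, assuming a TensorFlow ALBERT checkpoint exists at the given (hypothetical) paths:

from transformers import AlbertConfig
from transformers.models.albert.modeling_albert import (
    AlbertForPreTraining,
    load_tf_weights_in_albert,
)

config = AlbertConfig.from_json_file("albert_config.json")   # hypothetical path
model = AlbertForPreTraining(config)
model = load_tf_weights_in_albert(model, config, "model.ckpt-best")  # hypothetical checkpoint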
175
+
176
+ class AlbertEmbeddings(nn.Module):
177
+ """
178
+ Construct the embeddings from word, position and token_type embeddings.
179
+ """
180
+
181
+ def __init__(self, config: AlbertConfig):
182
+ super().__init__()
183
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
184
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
185
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)
186
+
187
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
188
+ # any TensorFlow checkpoint file
189
+ self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
190
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
191
+
192
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
193
+ self.register_buffer(
194
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
195
+ )
196
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
197
+ self.register_buffer(
198
+ "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
199
+ )
200
+
201
+ # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.forward
202
+ def forward(
203
+ self,
204
+ input_ids: Optional[torch.LongTensor] = None,
205
+ token_type_ids: Optional[torch.LongTensor] = None,
206
+ position_ids: Optional[torch.LongTensor] = None,
207
+ inputs_embeds: Optional[torch.FloatTensor] = None,
208
+ past_key_values_length: int = 0,
209
+ ) -> torch.Tensor:
210
+ if input_ids is not None:
211
+ input_shape = input_ids.size()
212
+ else:
213
+ input_shape = inputs_embeds.size()[:-1]
214
+
215
+ seq_length = input_shape[1]
216
+
217
+ if position_ids is None:
218
+ position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
219
+
220
+ # Set token_type_ids to the registered buffer defined in the constructor, where it is all zeros. This usually
221
+ # happens when token_type_ids is auto-generated; the registered buffer lets users trace the model without
222
+ # passing token_type_ids, and resolves issue #5664.
223
+ if token_type_ids is None:
224
+ if hasattr(self, "token_type_ids"):
225
+ buffered_token_type_ids = self.token_type_ids[:, :seq_length]
226
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
227
+ token_type_ids = buffered_token_type_ids_expanded
228
+ else:
229
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
230
+
231
+ if inputs_embeds is None:
232
+ inputs_embeds = self.word_embeddings(input_ids)
233
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
234
+
235
+ embeddings = inputs_embeds + token_type_embeddings
236
+ if self.position_embedding_type == "absolute":
237
+ position_embeddings = self.position_embeddings(position_ids)
238
+ embeddings += position_embeddings
239
+ embeddings = self.LayerNorm(embeddings)
240
+ embeddings = self.dropout(embeddings)
241
+ return embeddings
242
+
243
+
244
+ class AlbertAttention(nn.Module):
245
+ def __init__(self, config: AlbertConfig):
246
+ super().__init__()
247
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
248
+ raise ValueError(
249
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
250
+ f"heads ({config.num_attention_heads}"
251
+ )
252
+
253
+ self.num_attention_heads = config.num_attention_heads
254
+ self.hidden_size = config.hidden_size
255
+ self.attention_head_size = config.hidden_size // config.num_attention_heads
256
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
257
+
258
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
259
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
260
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
261
+
262
+ self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob)
263
+ self.output_dropout = nn.Dropout(config.hidden_dropout_prob)
264
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
265
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
266
+ self.pruned_heads = set()
267
+
268
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
269
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
270
+ self.max_position_embeddings = config.max_position_embeddings
271
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
272
+
273
+ def prune_heads(self, heads: list[int]) -> None:
274
+ if len(heads) == 0:
275
+ return
276
+ heads, index = find_pruneable_heads_and_indices(
277
+ heads, self.num_attention_heads, self.attention_head_size, self.pruned_heads
278
+ )
279
+
280
+ # Prune linear layers
281
+ self.query = prune_linear_layer(self.query, index)
282
+ self.key = prune_linear_layer(self.key, index)
283
+ self.value = prune_linear_layer(self.value, index)
284
+ self.dense = prune_linear_layer(self.dense, index, dim=1)
285
+
286
+ # Update hyper params and store pruned heads
287
+ self.num_attention_heads = self.num_attention_heads - len(heads)
288
+ self.all_head_size = self.attention_head_size * self.num_attention_heads
289
+ self.pruned_heads = self.pruned_heads.union(heads)
290
+
291
+ def forward(
292
+ self,
293
+ hidden_states: torch.Tensor,
294
+ attention_mask: Optional[torch.FloatTensor] = None,
295
+ head_mask: Optional[torch.FloatTensor] = None,
296
+ output_attentions: bool = False,
297
+ ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
298
+ batch_size, seq_length, _ = hidden_states.shape
299
+ query_layer = self.query(hidden_states)
300
+ key_layer = self.key(hidden_states)
301
+ value_layer = self.value(hidden_states)
302
+ query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
303
+ 1, 2
304
+ )
305
+ key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
306
+ value_layer = value_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
307
+ 1, 2
308
+ )
309
+
310
+ # Take the dot product between "query" and "key" to get the raw attention scores.
311
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
312
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
313
+
314
+ if attention_mask is not None:
315
+ # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
316
+ attention_scores = attention_scores + attention_mask
317
+
318
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
319
+ seq_length = hidden_states.size()[1]
320
+ position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
321
+ position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
322
+ distance = position_ids_l - position_ids_r
323
+ positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
324
+ positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
325
+
326
+ if self.position_embedding_type == "relative_key":
327
+ relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
328
+ attention_scores = attention_scores + relative_position_scores
329
+ elif self.position_embedding_type == "relative_key_query":
330
+ relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
331
+ relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
332
+ attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
333
+
334
+ # Normalize the attention scores to probabilities.
335
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
336
+
337
+ # This is actually dropping out entire tokens to attend to, which might
338
+ # seem a bit unusual, but is taken from the original Transformer paper.
339
+ attention_probs = self.attention_dropout(attention_probs)
340
+
341
+ # Mask heads if we want to
342
+ if head_mask is not None:
343
+ attention_probs = attention_probs * head_mask
344
+
345
+ context_layer = torch.matmul(attention_probs, value_layer)
346
+ context_layer = context_layer.transpose(2, 1).flatten(2)
347
+
348
+ projected_context_layer = self.dense(context_layer)
349
+ projected_context_layer_dropout = self.output_dropout(projected_context_layer)
350
+ layernormed_context_layer = self.LayerNorm(hidden_states + projected_context_layer_dropout)
351
+ return (layernormed_context_layer, attention_probs) if output_attentions else (layernormed_context_layer,)
352
+
353
+
354
+ class AlbertSdpaAttention(AlbertAttention):
355
+ def __init__(self, config):
356
+ super().__init__(config)
357
+ self.dropout_prob = config.attention_probs_dropout_prob
358
+
359
+ def forward(
360
+ self,
361
+ hidden_states: torch.Tensor,
362
+ attention_mask: Optional[torch.FloatTensor] = None,
363
+ head_mask: Optional[torch.FloatTensor] = None,
364
+ output_attentions: bool = False,
365
+ ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
366
+ if self.position_embedding_type != "absolute" or output_attentions:
367
+ logger.warning(
368
+ "AlbertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support "
369
+ "non-absolute `position_embedding_type` or `output_attentions=True` . Falling back to "
370
+ "the eager attention implementation, but specifying the eager implementation will be required from "
371
+ "Transformers version v5.0.0 onwards. This warning can be removed using the argument "
372
+ '`attn_implementation="eager"` when loading the model.'
373
+ )
374
+ return super().forward(hidden_states, attention_mask, output_attentions=output_attentions)
375
+
376
+ batch_size, seq_len, _ = hidden_states.size()
377
+ query_layer = (
378
+ self.query(hidden_states)
379
+ .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
380
+ .transpose(1, 2)
381
+ )
382
+ key_layer = (
383
+ self.key(hidden_states)
384
+ .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
385
+ .transpose(1, 2)
386
+ )
387
+ value_layer = (
388
+ self.value(hidden_states)
389
+ .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
390
+ .transpose(1, 2)
391
+ )
392
+
393
+ attention_output = torch.nn.functional.scaled_dot_product_attention(
394
+ query=query_layer,
395
+ key=key_layer,
396
+ value=value_layer,
397
+ attn_mask=attention_mask,
398
+ dropout_p=self.dropout_prob if self.training else 0.0,
399
+ is_causal=False,
400
+ )
401
+
402
+ attention_output = attention_output.transpose(1, 2)
403
+ attention_output = attention_output.reshape(batch_size, seq_len, self.all_head_size)
404
+
405
+ projected_context_layer = self.dense(attention_output)
406
+ projected_context_layer_dropout = self.output_dropout(projected_context_layer)
407
+ layernormed_context_layer = self.LayerNorm(hidden_states + projected_context_layer_dropout)
408
+ return (layernormed_context_layer,)
409
+
410
+
411
+ ALBERT_ATTENTION_CLASSES = {
412
+ "eager": AlbertAttention,
413
+ "sdpa": AlbertSdpaAttention,
414
+ }
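A short sketch of how the SDPA fallback above can be avoided up front by requesting the eager implementation when loading (the `attn_implementation` argument to `from_pretrained` selects the entry from this mapping):

from transformers import AlbertModel

# Forces ALBERT_ATTENTION_CLASSES["eager"], so output_attentions=True and
# relative position embeddings work without the SDPA fallback warning.
model = AlbertModel.from_pretrained("albert/albert-base-v2", attn_implementation="eager")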
415
+
416
+
417
+ class AlbertLayer(nn.Module):
418
+ def __init__(self, config: AlbertConfig):
419
+ super().__init__()
420
+
421
+ self.config = config
422
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
423
+ self.seq_len_dim = 1
424
+ self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
425
+ self.attention = ALBERT_ATTENTION_CLASSES[config._attn_implementation](config)
426
+ self.ffn = nn.Linear(config.hidden_size, config.intermediate_size)
427
+ self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size)
428
+ self.activation = ACT2FN[config.hidden_act]
429
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
430
+
431
+ def forward(
432
+ self,
433
+ hidden_states: torch.Tensor,
434
+ attention_mask: Optional[torch.FloatTensor] = None,
435
+ head_mask: Optional[torch.FloatTensor] = None,
436
+ output_attentions: bool = False,
437
+ output_hidden_states: bool = False,
438
+ ) -> tuple[torch.Tensor, torch.Tensor]:
439
+ attention_output = self.attention(hidden_states, attention_mask, head_mask, output_attentions)
440
+
441
+ ffn_output = apply_chunking_to_forward(
442
+ self.ff_chunk,
443
+ self.chunk_size_feed_forward,
444
+ self.seq_len_dim,
445
+ attention_output[0],
446
+ )
447
+ hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0])
448
+
449
+ return (hidden_states,) + attention_output[1:] # add attentions if we output them
450
+
451
+ def ff_chunk(self, attention_output: torch.Tensor) -> torch.Tensor:
452
+ ffn_output = self.ffn(attention_output)
453
+ ffn_output = self.activation(ffn_output)
454
+ ffn_output = self.ffn_output(ffn_output)
455
+ return ffn_output
456
+
457
+
458
+ class AlbertLayerGroup(nn.Module):
459
+ def __init__(self, config: AlbertConfig):
460
+ super().__init__()
461
+
462
+ self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)])
463
+
464
+ def forward(
465
+ self,
466
+ hidden_states: torch.Tensor,
467
+ attention_mask: Optional[torch.FloatTensor] = None,
468
+ head_mask: Optional[torch.FloatTensor] = None,
469
+ output_attentions: bool = False,
470
+ output_hidden_states: bool = False,
471
+ ) -> tuple[Union[torch.Tensor, tuple[torch.Tensor]], ...]:
472
+ layer_hidden_states = ()
473
+ layer_attentions = ()
474
+
475
+ for layer_index, albert_layer in enumerate(self.albert_layers):
476
+ layer_output = albert_layer(hidden_states, attention_mask, head_mask[layer_index], output_attentions)
477
+ hidden_states = layer_output[0]
478
+
479
+ if output_attentions:
480
+ layer_attentions = layer_attentions + (layer_output[1],)
481
+
482
+ if output_hidden_states:
483
+ layer_hidden_states = layer_hidden_states + (hidden_states,)
484
+
485
+ outputs = (hidden_states,)
486
+ if output_hidden_states:
487
+ outputs = outputs + (layer_hidden_states,)
488
+ if output_attentions:
489
+ outputs = outputs + (layer_attentions,)
490
+ return outputs # last-layer hidden state, (layer hidden states), (layer attentions)
491
+
492
+
493
+ class AlbertTransformer(nn.Module):
494
+ def __init__(self, config: AlbertConfig):
495
+ super().__init__()
496
+
497
+ self.config = config
498
+ self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
499
+ self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)])
500
+
501
+ def forward(
502
+ self,
503
+ hidden_states: torch.Tensor,
504
+ attention_mask: Optional[torch.FloatTensor] = None,
505
+ head_mask: Optional[torch.FloatTensor] = None,
506
+ output_attentions: bool = False,
507
+ output_hidden_states: bool = False,
508
+ return_dict: bool = True,
509
+ ) -> Union[BaseModelOutput, tuple]:
510
+ hidden_states = self.embedding_hidden_mapping_in(hidden_states)
511
+
512
+ all_hidden_states = (hidden_states,) if output_hidden_states else None
513
+ all_attentions = () if output_attentions else None
514
+
515
+ head_mask = [None] * self.config.num_hidden_layers if head_mask is None else head_mask
516
+
517
+ for i in range(self.config.num_hidden_layers):
518
+ # Number of layers in a hidden group
519
+ layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)
520
+
521
+ # Index of the hidden group
522
+ group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
523
+
524
+ layer_group_output = self.albert_layer_groups[group_idx](
525
+ hidden_states,
526
+ attention_mask,
527
+ head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
528
+ output_attentions,
529
+ output_hidden_states,
530
+ )
531
+ hidden_states = layer_group_output[0]
532
+
533
+ if output_attentions:
534
+ all_attentions = all_attentions + layer_group_output[-1]
535
+
536
+ if output_hidden_states:
537
+ all_hidden_states = all_hidden_states + (hidden_states,)
538
+
539
+ if not return_dict:
540
+ return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
541
+ return BaseModelOutput(
542
+ last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
543
+ )
544
+
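To make the parameter-sharing arithmetic in the loop above concrete: with a hypothetical 12 hidden layers and 3 hidden groups, consecutive layer indices map onto groups as follows:

num_hidden_layers, num_hidden_groups = 12, 3   # assumed values for illustration
layers_per_group = num_hidden_layers // num_hidden_groups   # 4

group_indices = [int(i / layers_per_group) for i in range(num_hidden_layers)]
assert group_indices == [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
# Each group's weights are reused for every layer index mapped to it.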
545
+
546
+ @auto_docstring
547
+ class AlbertPreTrainedModel(PreTrainedModel):
548
+ config: AlbertConfig
549
+ load_tf_weights = load_tf_weights_in_albert
550
+ base_model_prefix = "albert"
551
+ _supports_sdpa = True
552
+
553
+ def _init_weights(self, module):
554
+ """Initialize the weights."""
555
+ if isinstance(module, nn.Linear):
556
+ # Slightly different from the TF version which uses truncated_normal for initialization
557
+ # cf https://github.com/pytorch/pytorch/pull/5617
558
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
559
+ if module.bias is not None:
560
+ module.bias.data.zero_()
561
+ elif isinstance(module, nn.Embedding):
562
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
563
+ if module.padding_idx is not None:
564
+ module.weight.data[module.padding_idx].zero_()
565
+ elif isinstance(module, nn.LayerNorm):
566
+ module.bias.data.zero_()
567
+ module.weight.data.fill_(1.0)
568
+ elif isinstance(module, AlbertMLMHead):
569
+ module.bias.data.zero_()
570
+
571
+
572
+ @dataclass
573
+ @auto_docstring(
574
+ custom_intro="""
575
+ Output type of [`AlbertForPreTraining`].
576
+ """
577
+ )
578
+ class AlbertForPreTrainingOutput(ModelOutput):
579
+ r"""
580
+ loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
581
+ Total loss as the sum of the masked language modeling loss and the next sequence prediction
582
+ (classification) loss.
583
+ prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
584
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
585
+ sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
586
+ Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
587
+ before SoftMax).
588
+ """
589
+
590
+ loss: Optional[torch.FloatTensor] = None
591
+ prediction_logits: Optional[torch.FloatTensor] = None
592
+ sop_logits: Optional[torch.FloatTensor] = None
593
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
594
+ attentions: Optional[tuple[torch.FloatTensor]] = None
595
+
596
+
597
+ @auto_docstring
598
+ class AlbertModel(AlbertPreTrainedModel):
599
+ config: AlbertConfig
600
+ base_model_prefix = "albert"
601
+
602
+ def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True):
603
+ r"""
604
+ add_pooling_layer (bool, *optional*, defaults to `True`):
605
+ Whether to add a pooling layer
606
+ """
607
+ super().__init__(config)
608
+
609
+ self.config = config
610
+ self.embeddings = AlbertEmbeddings(config)
611
+ self.encoder = AlbertTransformer(config)
612
+ if add_pooling_layer:
613
+ self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
614
+ self.pooler_activation = nn.Tanh()
615
+ else:
616
+ self.pooler = None
617
+ self.pooler_activation = None
618
+
619
+ self.attn_implementation = config._attn_implementation
620
+ self.position_embedding_type = config.position_embedding_type
621
+
622
+ # Initialize weights and apply final processing
623
+ self.post_init()
624
+
625
+ def get_input_embeddings(self) -> nn.Embedding:
626
+ return self.embeddings.word_embeddings
627
+
628
+ def set_input_embeddings(self, value: nn.Embedding) -> None:
629
+ self.embeddings.word_embeddings = value
630
+
631
+ def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
632
+ """
633
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. ALBERT has
634
+ a different architecture in that its layers are shared across groups, which in turn have inner groups. If an ALBERT
635
+ model has 12 hidden layers and 2 hidden groups, each with two inner groups, there is a total of 4 different layers.
636
+
637
+ These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden group,
638
+ while [2,3] correspond to the two inner groups of the second hidden group.
639
+
640
+ Any layer with an index other than [0,1,2,3] will result in an error. See the base class PreTrainedModel for more
641
+ information about head pruning.
642
+ """
643
+ for layer, heads in heads_to_prune.items():
644
+ group_idx = int(layer / self.config.inner_group_num)
645
+ inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
646
+ self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)
647
+
648
+ @auto_docstring
649
+ def forward(
650
+ self,
651
+ input_ids: Optional[torch.LongTensor] = None,
652
+ attention_mask: Optional[torch.FloatTensor] = None,
653
+ token_type_ids: Optional[torch.LongTensor] = None,
654
+ position_ids: Optional[torch.LongTensor] = None,
655
+ head_mask: Optional[torch.FloatTensor] = None,
656
+ inputs_embeds: Optional[torch.FloatTensor] = None,
657
+ output_attentions: Optional[bool] = None,
658
+ output_hidden_states: Optional[bool] = None,
659
+ return_dict: Optional[bool] = None,
660
+ ) -> Union[BaseModelOutputWithPooling, tuple]:
661
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
662
+ output_hidden_states = (
663
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
664
+ )
665
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
666
+
667
+ if input_ids is not None and inputs_embeds is not None:
668
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
669
+ elif input_ids is not None:
670
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
671
+ input_shape = input_ids.size()
672
+ elif inputs_embeds is not None:
673
+ input_shape = inputs_embeds.size()[:-1]
674
+ else:
675
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
676
+
677
+ batch_size, seq_length = input_shape
678
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
679
+
680
+ if attention_mask is None:
681
+ attention_mask = torch.ones(input_shape, device=device)
682
+ if token_type_ids is None:
683
+ if hasattr(self.embeddings, "token_type_ids"):
684
+ buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
685
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
686
+ token_type_ids = buffered_token_type_ids_expanded
687
+ else:
688
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
689
+
690
+ embedding_output = self.embeddings(
691
+ input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
692
+ )
693
+
694
+ use_sdpa_attention_mask = (
695
+ self.attn_implementation == "sdpa"
696
+ and self.position_embedding_type == "absolute"
697
+ and head_mask is None
698
+ and not output_attentions
699
+ )
700
+
701
+ if use_sdpa_attention_mask:
702
+ extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
703
+ attention_mask, embedding_output.dtype, tgt_len=seq_length
704
+ )
705
+ else:
706
+ extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
707
+ extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
708
+ extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min
709
+
710
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
711
+
712
+ encoder_outputs = self.encoder(
713
+ embedding_output,
714
+ extended_attention_mask,
715
+ head_mask=head_mask,
716
+ output_attentions=output_attentions,
717
+ output_hidden_states=output_hidden_states,
718
+ return_dict=return_dict,
719
+ )
720
+
721
+ sequence_output = encoder_outputs[0]
722
+
723
+ pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None
724
+
725
+ if not return_dict:
726
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
727
+
728
+ return BaseModelOutputWithPooling(
729
+ last_hidden_state=sequence_output,
730
+ pooler_output=pooled_output,
731
+ hidden_states=encoder_outputs.hidden_states,
732
+ attentions=encoder_outputs.attentions,
733
+ )
734
+
735
+
736
+ @auto_docstring(
737
+ custom_intro="""
738
+ Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
739
+ `sentence order prediction (classification)` head.
740
+ """
741
+ )
742
+ class AlbertForPreTraining(AlbertPreTrainedModel):
743
+ _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]
744
+
745
+ def __init__(self, config: AlbertConfig):
746
+ super().__init__(config)
747
+
748
+ self.albert = AlbertModel(config)
749
+ self.predictions = AlbertMLMHead(config)
750
+ self.sop_classifier = AlbertSOPHead(config)
751
+
752
+ # Initialize weights and apply final processing
753
+ self.post_init()
754
+
755
+ def get_output_embeddings(self) -> nn.Linear:
756
+ return self.predictions.decoder
757
+
758
+ def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
759
+ self.predictions.decoder = new_embeddings
760
+
761
+ def get_input_embeddings(self) -> nn.Embedding:
762
+ return self.albert.embeddings.word_embeddings
763
+
764
+ @auto_docstring
765
+ def forward(
766
+ self,
767
+ input_ids: Optional[torch.LongTensor] = None,
768
+ attention_mask: Optional[torch.FloatTensor] = None,
769
+ token_type_ids: Optional[torch.LongTensor] = None,
770
+ position_ids: Optional[torch.LongTensor] = None,
771
+ head_mask: Optional[torch.FloatTensor] = None,
772
+ inputs_embeds: Optional[torch.FloatTensor] = None,
773
+ labels: Optional[torch.LongTensor] = None,
774
+ sentence_order_label: Optional[torch.LongTensor] = None,
775
+ output_attentions: Optional[bool] = None,
776
+ output_hidden_states: Optional[bool] = None,
777
+ return_dict: Optional[bool] = None,
778
+ ) -> Union[AlbertForPreTrainingOutput, tuple]:
779
+ r"""
780
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
781
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
782
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
783
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
784
+ sentence_order_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
785
+ Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
786
+ (see `input_ids` docstring) Indices should be in `[0, 1]`. `0` indicates original order (sequence A, then
787
+ sequence B), `1` indicates switched order (sequence B, then sequence A).
788
+
789
+ Example:
790
+
791
+ ```python
792
+ >>> from transformers import AutoTokenizer, AlbertForPreTraining
793
+ >>> import torch
794
+
795
+ >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
796
+ >>> model = AlbertForPreTraining.from_pretrained("albert/albert-base-v2")
797
+
798
+ >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
799
+ >>> # Batch size 1
800
+ >>> outputs = model(input_ids)
801
+
802
+ >>> prediction_logits = outputs.prediction_logits
803
+ >>> sop_logits = outputs.sop_logits
804
+ ```"""
805
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
806
+
807
+ outputs = self.albert(
808
+ input_ids,
809
+ attention_mask=attention_mask,
810
+ token_type_ids=token_type_ids,
811
+ position_ids=position_ids,
812
+ head_mask=head_mask,
813
+ inputs_embeds=inputs_embeds,
814
+ output_attentions=output_attentions,
815
+ output_hidden_states=output_hidden_states,
816
+ return_dict=return_dict,
817
+ )
818
+
819
+ sequence_output, pooled_output = outputs[:2]
820
+
821
+ prediction_scores = self.predictions(sequence_output)
822
+ sop_scores = self.sop_classifier(pooled_output)
823
+
824
+ total_loss = None
825
+ if labels is not None and sentence_order_label is not None:
826
+ loss_fct = CrossEntropyLoss()
827
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
828
+ sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1))
829
+ total_loss = masked_lm_loss + sentence_order_loss
830
+
831
+ if not return_dict:
832
+ output = (prediction_scores, sop_scores) + outputs[2:]
833
+ return ((total_loss,) + output) if total_loss is not None else output
834
+
835
+ return AlbertForPreTrainingOutput(
836
+ loss=total_loss,
837
+ prediction_logits=prediction_scores,
838
+ sop_logits=sop_scores,
839
+ hidden_states=outputs.hidden_states,
840
+ attentions=outputs.attentions,
841
+ )
842
+
843
+
844
+ class AlbertMLMHead(nn.Module):
845
+ def __init__(self, config: AlbertConfig):
846
+ super().__init__()
847
+
848
+ self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
849
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
850
+ self.dense = nn.Linear(config.hidden_size, config.embedding_size)
851
+ self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
852
+ self.activation = ACT2FN[config.hidden_act]
853
+ self.decoder.bias = self.bias
854
+
855
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
856
+ hidden_states = self.dense(hidden_states)
857
+ hidden_states = self.activation(hidden_states)
858
+ hidden_states = self.LayerNorm(hidden_states)
859
+ hidden_states = self.decoder(hidden_states)
860
+
861
+ prediction_scores = hidden_states
862
+
863
+ return prediction_scores
864
+
865
+ def _tie_weights(self) -> None:
866
+ # For accelerate compatibility and to not break backward compatibility
867
+ if self.decoder.bias.device.type == "meta":
868
+ self.decoder.bias = self.bias
869
+ else:
870
+ # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
871
+ self.bias = self.decoder.bias
872
+
873
+
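The head above maps hidden states back down to ALBERT's smaller factorized embedding size before decoding into the vocabulary. A shape-only sketch with the albert-base-v2 sizes, and `tanh` standing in for `config.hidden_act`:

```python
import torch
import torch.nn as nn

hidden_size, embedding_size, vocab_size = 768, 128, 30000  # albert-base-v2 defaults
dense = nn.Linear(hidden_size, embedding_size)   # project down to the embedding size
decoder = nn.Linear(embedding_size, vocab_size)  # decode into the vocabulary

hidden = torch.randn(1, 5, hidden_size)
scores = decoder(torch.tanh(dense(hidden)))      # tanh stands in for config.hidden_act
print(scores.shape)                              # torch.Size([1, 5, 30000])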
874
+ class AlbertSOPHead(nn.Module):
875
+ def __init__(self, config: AlbertConfig):
876
+ super().__init__()
877
+
878
+ self.dropout = nn.Dropout(config.classifier_dropout_prob)
879
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
880
+
881
+ def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
882
+ dropout_pooled_output = self.dropout(pooled_output)
883
+ logits = self.classifier(dropout_pooled_output)
884
+ return logits
885
+
886
+
887
+ @auto_docstring
888
+ class AlbertForMaskedLM(AlbertPreTrainedModel):
889
+ _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]
890
+
891
+ def __init__(self, config):
892
+ super().__init__(config)
893
+
894
+ self.albert = AlbertModel(config, add_pooling_layer=False)
895
+ self.predictions = AlbertMLMHead(config)
896
+
897
+ # Initialize weights and apply final processing
898
+ self.post_init()
899
+
900
+ def get_output_embeddings(self) -> nn.Linear:
901
+ return self.predictions.decoder
902
+
903
+ def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
904
+ self.predictions.decoder = new_embeddings
905
+ self.predictions.bias = new_embeddings.bias
906
+
907
+ def get_input_embeddings(self) -> nn.Embedding:
908
+ return self.albert.embeddings.word_embeddings
909
+
910
+ @auto_docstring
911
+ def forward(
912
+ self,
913
+ input_ids: Optional[torch.LongTensor] = None,
914
+ attention_mask: Optional[torch.FloatTensor] = None,
915
+ token_type_ids: Optional[torch.LongTensor] = None,
916
+ position_ids: Optional[torch.LongTensor] = None,
917
+ head_mask: Optional[torch.FloatTensor] = None,
918
+ inputs_embeds: Optional[torch.FloatTensor] = None,
919
+ labels: Optional[torch.LongTensor] = None,
920
+ output_attentions: Optional[bool] = None,
921
+ output_hidden_states: Optional[bool] = None,
922
+ return_dict: Optional[bool] = None,
923
+ ) -> Union[MaskedLMOutput, tuple]:
924
+ r"""
925
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
926
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
927
+            config.vocab_size]` (see the `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
927
+            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
929
+
930
+ Example:
931
+
932
+ ```python
933
+ >>> import torch
934
+ >>> from transformers import AutoTokenizer, AlbertForMaskedLM
935
+
936
+ >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
937
+ >>> model = AlbertForMaskedLM.from_pretrained("albert/albert-base-v2")
938
+
939
+ >>> # add mask_token
940
+ >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")
941
+ >>> with torch.no_grad():
942
+ ... logits = model(**inputs).logits
943
+
944
+ >>> # retrieve index of [MASK]
945
+ >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
946
+ >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
947
+ >>> tokenizer.decode(predicted_token_id)
948
+ 'france'
949
+ ```
950
+
951
+ ```python
952
+ >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
953
+ >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
954
+ >>> outputs = model(**inputs, labels=labels)
955
+ >>> round(outputs.loss.item(), 2)
956
+ 0.81
957
+ ```
958
+ """
959
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
960
+
961
+ outputs = self.albert(
962
+ input_ids=input_ids,
963
+ attention_mask=attention_mask,
964
+ token_type_ids=token_type_ids,
965
+ position_ids=position_ids,
966
+ head_mask=head_mask,
967
+ inputs_embeds=inputs_embeds,
968
+ output_attentions=output_attentions,
969
+ output_hidden_states=output_hidden_states,
970
+ return_dict=return_dict,
971
+ )
972
+ sequence_outputs = outputs[0]
973
+
974
+ prediction_scores = self.predictions(sequence_outputs)
975
+
976
+ masked_lm_loss = None
977
+ if labels is not None:
978
+ loss_fct = CrossEntropyLoss()
979
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
980
+
981
+ if not return_dict:
982
+ output = (prediction_scores,) + outputs[2:]
983
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
984
+
985
+ return MaskedLMOutput(
986
+ loss=masked_lm_loss,
987
+ logits=prediction_scores,
988
+ hidden_states=outputs.hidden_states,
989
+ attentions=outputs.attentions,
990
+ )
991
+
992
+
993
+ @auto_docstring(
994
+ custom_intro="""
995
+ Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
996
+ output) e.g. for GLUE tasks.
997
+ """
998
+ )
999
+ class AlbertForSequenceClassification(AlbertPreTrainedModel):
1000
+ def __init__(self, config: AlbertConfig):
1001
+ super().__init__(config)
1002
+ self.num_labels = config.num_labels
1003
+ self.config = config
1004
+
1005
+ self.albert = AlbertModel(config)
1006
+ self.dropout = nn.Dropout(config.classifier_dropout_prob)
1007
+ self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
1008
+
1009
+ # Initialize weights and apply final processing
1010
+ self.post_init()
1011
+
1012
+ @auto_docstring
1013
+ def forward(
1014
+ self,
1015
+ input_ids: Optional[torch.LongTensor] = None,
1016
+ attention_mask: Optional[torch.FloatTensor] = None,
1017
+ token_type_ids: Optional[torch.LongTensor] = None,
1018
+ position_ids: Optional[torch.LongTensor] = None,
1019
+ head_mask: Optional[torch.FloatTensor] = None,
1020
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1021
+ labels: Optional[torch.LongTensor] = None,
1022
+ output_attentions: Optional[bool] = None,
1023
+ output_hidden_states: Optional[bool] = None,
1024
+ return_dict: Optional[bool] = None,
1025
+ ) -> Union[SequenceClassifierOutput, tuple]:
1026
+ r"""
1027
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1028
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1029
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1030
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1031
+ """
1032
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1033
+
1034
+ outputs = self.albert(
1035
+ input_ids=input_ids,
1036
+ attention_mask=attention_mask,
1037
+ token_type_ids=token_type_ids,
1038
+ position_ids=position_ids,
1039
+ head_mask=head_mask,
1040
+ inputs_embeds=inputs_embeds,
1041
+ output_attentions=output_attentions,
1042
+ output_hidden_states=output_hidden_states,
1043
+ return_dict=return_dict,
1044
+ )
1045
+
1046
+ pooled_output = outputs[1]
1047
+
1048
+ pooled_output = self.dropout(pooled_output)
1049
+ logits = self.classifier(pooled_output)
1050
+
1051
+ loss = None
1052
+ if labels is not None:
1053
+ if self.config.problem_type is None:
1054
+ if self.num_labels == 1:
1055
+ self.config.problem_type = "regression"
1056
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1057
+ self.config.problem_type = "single_label_classification"
1058
+ else:
1059
+ self.config.problem_type = "multi_label_classification"
1060
+
1061
+ if self.config.problem_type == "regression":
1062
+ loss_fct = MSELoss()
1063
+ if self.num_labels == 1:
1064
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
1065
+ else:
1066
+ loss = loss_fct(logits, labels)
1067
+ elif self.config.problem_type == "single_label_classification":
1068
+ loss_fct = CrossEntropyLoss()
1069
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1070
+ elif self.config.problem_type == "multi_label_classification":
1071
+ loss_fct = BCEWithLogitsLoss()
1072
+ loss = loss_fct(logits, labels)
1073
+
1074
+ if not return_dict:
1075
+ output = (logits,) + outputs[2:]
1076
+ return ((loss,) + output) if loss is not None else output
1077
+
1078
+ return SequenceClassifierOutput(
1079
+ loss=loss,
1080
+ logits=logits,
1081
+ hidden_states=outputs.hidden_states,
1082
+ attentions=outputs.attentions,
1083
+ )
1084
+
1085
+
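The `problem_type` dispatch in the `forward` above can be summarized in a small standalone sketch; `infer_problem_type` is a hypothetical helper written for illustration, not library API:

```python
import torch

def infer_problem_type(num_labels: int, labels: torch.Tensor) -> str:
    # Mirrors the branching in `forward` above.
    if num_labels == 1:
        return "regression"                       # MSELoss
    if labels.dtype in (torch.long, torch.int):
        return "single_label_classification"      # CrossEntropyLoss
    return "multi_label_classification"           # BCEWithLogitsLoss

print(infer_problem_type(1, torch.tensor([0.7])))              # regression
print(infer_problem_type(3, torch.tensor([2])))                # single_label_classification
print(infer_problem_type(3, torch.tensor([[1.0, 0.0, 1.0]])))  # multi_label_classification
```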
1086
+ @auto_docstring
1087
+ class AlbertForTokenClassification(AlbertPreTrainedModel):
1088
+ def __init__(self, config: AlbertConfig):
1089
+ super().__init__(config)
1090
+ self.num_labels = config.num_labels
1091
+
1092
+ self.albert = AlbertModel(config, add_pooling_layer=False)
1093
+ classifier_dropout_prob = (
1094
+ config.classifier_dropout_prob
1095
+ if config.classifier_dropout_prob is not None
1096
+ else config.hidden_dropout_prob
1097
+ )
1098
+ self.dropout = nn.Dropout(classifier_dropout_prob)
1099
+ self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
1100
+
1101
+ # Initialize weights and apply final processing
1102
+ self.post_init()
1103
+
1104
+ @auto_docstring
1105
+ def forward(
1106
+ self,
1107
+ input_ids: Optional[torch.LongTensor] = None,
1108
+ attention_mask: Optional[torch.FloatTensor] = None,
1109
+ token_type_ids: Optional[torch.LongTensor] = None,
1110
+ position_ids: Optional[torch.LongTensor] = None,
1111
+ head_mask: Optional[torch.FloatTensor] = None,
1112
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1113
+ labels: Optional[torch.LongTensor] = None,
1114
+ output_attentions: Optional[bool] = None,
1115
+ output_hidden_states: Optional[bool] = None,
1116
+ return_dict: Optional[bool] = None,
1117
+ ) -> Union[TokenClassifierOutput, tuple]:
1118
+ r"""
1119
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1120
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
1121
+ """
1122
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1123
+
1124
+ outputs = self.albert(
1125
+ input_ids,
1126
+ attention_mask=attention_mask,
1127
+ token_type_ids=token_type_ids,
1128
+ position_ids=position_ids,
1129
+ head_mask=head_mask,
1130
+ inputs_embeds=inputs_embeds,
1131
+ output_attentions=output_attentions,
1132
+ output_hidden_states=output_hidden_states,
1133
+ return_dict=return_dict,
1134
+ )
1135
+
1136
+ sequence_output = outputs[0]
1137
+
1138
+ sequence_output = self.dropout(sequence_output)
1139
+ logits = self.classifier(sequence_output)
1140
+
1141
+ loss = None
1142
+ if labels is not None:
1143
+ loss_fct = CrossEntropyLoss()
1144
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1145
+
1146
+ if not return_dict:
1147
+ output = (logits,) + outputs[2:]
1148
+ return ((loss,) + output) if loss is not None else output
1149
+
1150
+ return TokenClassifierOutput(
1151
+ loss=loss,
1152
+ logits=logits,
1153
+ hidden_states=outputs.hidden_states,
1154
+ attentions=outputs.attentions,
1155
+ )
1156
+
1157
+
1158
+ @auto_docstring
1159
+ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
1160
+ def __init__(self, config: AlbertConfig):
1161
+ super().__init__(config)
1162
+ self.num_labels = config.num_labels
1163
+
1164
+ self.albert = AlbertModel(config, add_pooling_layer=False)
1165
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
1166
+
1167
+ # Initialize weights and apply final processing
1168
+ self.post_init()
1169
+
1170
+ @auto_docstring
1171
+ def forward(
1172
+ self,
1173
+ input_ids: Optional[torch.LongTensor] = None,
1174
+ attention_mask: Optional[torch.FloatTensor] = None,
1175
+ token_type_ids: Optional[torch.LongTensor] = None,
1176
+ position_ids: Optional[torch.LongTensor] = None,
1177
+ head_mask: Optional[torch.FloatTensor] = None,
1178
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1179
+ start_positions: Optional[torch.LongTensor] = None,
1180
+ end_positions: Optional[torch.LongTensor] = None,
1181
+ output_attentions: Optional[bool] = None,
1182
+ output_hidden_states: Optional[bool] = None,
1183
+ return_dict: Optional[bool] = None,
1184
+    ) -> Union[QuestionAnsweringModelOutput, tuple]:
1185
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1186
+
1187
+ outputs = self.albert(
1188
+ input_ids=input_ids,
1189
+ attention_mask=attention_mask,
1190
+ token_type_ids=token_type_ids,
1191
+ position_ids=position_ids,
1192
+ head_mask=head_mask,
1193
+ inputs_embeds=inputs_embeds,
1194
+ output_attentions=output_attentions,
1195
+ output_hidden_states=output_hidden_states,
1196
+ return_dict=return_dict,
1197
+ )
1198
+
1199
+ sequence_output = outputs[0]
1200
+
1201
+ logits: torch.Tensor = self.qa_outputs(sequence_output)
1202
+ start_logits, end_logits = logits.split(1, dim=-1)
1203
+ start_logits = start_logits.squeeze(-1).contiguous()
1204
+ end_logits = end_logits.squeeze(-1).contiguous()
1205
+
1206
+ total_loss = None
1207
+ if start_positions is not None and end_positions is not None:
1208
+            # If we are on multi-GPU, splitting adds an extra dimension
1209
+ if len(start_positions.size()) > 1:
1210
+ start_positions = start_positions.squeeze(-1)
1211
+ if len(end_positions.size()) > 1:
1212
+ end_positions = end_positions.squeeze(-1)
1213
+            # sometimes the start/end positions are outside our model inputs; we ignore these terms
1214
+ ignored_index = start_logits.size(1)
1215
+ start_positions = start_positions.clamp(0, ignored_index)
1216
+ end_positions = end_positions.clamp(0, ignored_index)
1217
+
1218
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1219
+ start_loss = loss_fct(start_logits, start_positions)
1220
+ end_loss = loss_fct(end_logits, end_positions)
1221
+ total_loss = (start_loss + end_loss) / 2
1222
+
1223
+ if not return_dict:
1224
+ output = (start_logits, end_logits) + outputs[2:]
1225
+ return ((total_loss,) + output) if total_loss is not None else output
1226
+
1227
+ return QuestionAnsweringModelOutput(
1228
+ loss=total_loss,
1229
+ start_logits=start_logits,
1230
+ end_logits=end_logits,
1231
+ hidden_states=outputs.hidden_states,
1232
+ attentions=outputs.attentions,
1233
+ )
1234
+
1235
+
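A hedged usage sketch for the span head above: take the argmax of the start/end logits and decode the tokens in between. Note the QA head of the base checkpoint is randomly initialized, so the span is arbitrary until the model is fine-tuned on SQuAD-style data:

```python
import torch
from transformers import AutoTokenizer, AlbertForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
model = AlbertForQuestionAnswering.from_pretrained("albert/albert-base-v2")

question, context = "What is the capital of France?", "Paris is the capital of France."
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

start = outputs.start_logits.argmax(dim=-1).item()
end = outputs.end_logits.argmax(dim=-1).item()
print(tokenizer.decode(inputs.input_ids[0, start : end + 1]))
```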
1236
+ @auto_docstring
1237
+ class AlbertForMultipleChoice(AlbertPreTrainedModel):
1238
+ def __init__(self, config: AlbertConfig):
1239
+ super().__init__(config)
1240
+
1241
+ self.albert = AlbertModel(config)
1242
+ self.dropout = nn.Dropout(config.classifier_dropout_prob)
1243
+ self.classifier = nn.Linear(config.hidden_size, 1)
1244
+
1245
+ # Initialize weights and apply final processing
1246
+ self.post_init()
1247
+
1248
+ @auto_docstring
1249
+ def forward(
1250
+ self,
1251
+ input_ids: Optional[torch.LongTensor] = None,
1252
+ attention_mask: Optional[torch.FloatTensor] = None,
1253
+ token_type_ids: Optional[torch.LongTensor] = None,
1254
+ position_ids: Optional[torch.LongTensor] = None,
1255
+ head_mask: Optional[torch.FloatTensor] = None,
1256
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1257
+ labels: Optional[torch.LongTensor] = None,
1258
+ output_attentions: Optional[bool] = None,
1259
+ output_hidden_states: Optional[bool] = None,
1260
+ return_dict: Optional[bool] = None,
1261
+    ) -> Union[MultipleChoiceModelOutput, tuple]:
1262
+ r"""
1263
+ input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
1264
+ Indices of input sequence tokens in the vocabulary.
1265
+
1266
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
1267
+ [`PreTrainedTokenizer.encode`] for details.
1268
+
1269
+ [What are input IDs?](../glossary#input-ids)
1270
+ token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
1271
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1272
+ 1]`:
1273
+
1274
+ - 0 corresponds to a *sentence A* token,
1275
+ - 1 corresponds to a *sentence B* token.
1276
+
1277
+ [What are token type IDs?](../glossary#token-type-ids)
1278
+ position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
1279
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1280
+ config.max_position_embeddings - 1]`.
1281
+
1282
+ [What are position IDs?](../glossary#position-ids)
1283
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
1284
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1285
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1286
+ model's internal embedding lookup matrix.
1287
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1288
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
1289
+ num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
1290
+ *input_ids* above)
1291
+ """
1292
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1293
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
1294
+
1295
+ input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
1296
+ attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
1297
+ token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
1298
+ position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
1299
+ inputs_embeds = (
1300
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
1301
+ if inputs_embeds is not None
1302
+ else None
1303
+ )
1304
+ outputs = self.albert(
1305
+ input_ids,
1306
+ attention_mask=attention_mask,
1307
+ token_type_ids=token_type_ids,
1308
+ position_ids=position_ids,
1309
+ head_mask=head_mask,
1310
+ inputs_embeds=inputs_embeds,
1311
+ output_attentions=output_attentions,
1312
+ output_hidden_states=output_hidden_states,
1313
+ return_dict=return_dict,
1314
+ )
1315
+
1316
+ pooled_output = outputs[1]
1317
+
1318
+ pooled_output = self.dropout(pooled_output)
1319
+ logits: torch.Tensor = self.classifier(pooled_output)
1320
+ reshaped_logits = logits.view(-1, num_choices)
1321
+
1322
+ loss = None
1323
+ if labels is not None:
1324
+ loss_fct = CrossEntropyLoss()
1325
+ loss = loss_fct(reshaped_logits, labels)
1326
+
1327
+ if not return_dict:
1328
+ output = (reshaped_logits,) + outputs[2:]
1329
+ return ((loss,) + output) if loss is not None else output
1330
+
1331
+ return MultipleChoiceModelOutput(
1332
+ loss=loss,
1333
+ logits=reshaped_logits,
1334
+ hidden_states=outputs.hidden_states,
1335
+ attentions=outputs.attentions,
1336
+ )
1337
+
1338
+
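The head above expects inputs of shape `(batch_size, num_choices, sequence_length)` and flattens the choice dimension before the encoder. A sketch of that layout with two invented choices:

```python
import torch
from transformers import AutoTokenizer, AlbertForMultipleChoice

tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
model = AlbertForMultipleChoice.from_pretrained("albert/albert-base-v2")

prompt = "The man poured the coffee"
choices = ["into the cup.", "into the cloud."]
encoded = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)

# Add the num_choices dimension: (2, seq_len) -> (1, 2, seq_len).
inputs = {k: v.unsqueeze(0) for k, v in encoded.items()}
outputs = model(**inputs, labels=torch.tensor([0]))
print(outputs.logits.shape)  # torch.Size([1, 2])
```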
1339
+ __all__ = [
1340
+ "load_tf_weights_in_albert",
1341
+ "AlbertPreTrainedModel",
1342
+ "AlbertModel",
1343
+ "AlbertForPreTraining",
1344
+ "AlbertForMaskedLM",
1345
+ "AlbertForSequenceClassification",
1346
+ "AlbertForTokenClassification",
1347
+ "AlbertForQuestionAnswering",
1348
+ "AlbertForMultipleChoice",
1349
+ ]
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/albert/modeling_flax_albert.py ADDED
@@ -0,0 +1,1132 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 Google AI, Google Brain and the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Callable, Optional
17
+
18
+ import flax
19
+ import flax.linen as nn
20
+ import jax
21
+ import jax.numpy as jnp
22
+ import numpy as np
23
+ from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
24
+ from flax.linen.attention import dot_product_attention_weights
25
+ from flax.traverse_util import flatten_dict, unflatten_dict
26
+ from jax import lax
27
+
28
+ from ...modeling_flax_outputs import (
29
+ FlaxBaseModelOutput,
30
+ FlaxBaseModelOutputWithPooling,
31
+ FlaxMaskedLMOutput,
32
+ FlaxMultipleChoiceModelOutput,
33
+ FlaxQuestionAnsweringModelOutput,
34
+ FlaxSequenceClassifierOutput,
35
+ FlaxTokenClassifierOutput,
36
+ )
37
+ from ...modeling_flax_utils import (
38
+ ACT2FN,
39
+ FlaxPreTrainedModel,
40
+ append_call_sample_docstring,
41
+ append_replace_return_docstrings,
42
+ overwrite_call_docstring,
43
+ )
44
+ from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging
45
+ from .configuration_albert import AlbertConfig
46
+
47
+
48
+ logger = logging.get_logger(__name__)
49
+
50
+ _CHECKPOINT_FOR_DOC = "albert/albert-base-v2"
51
+ _CONFIG_FOR_DOC = "AlbertConfig"
52
+
53
+
54
+ @flax.struct.dataclass
55
+ class FlaxAlbertForPreTrainingOutput(ModelOutput):
56
+ """
57
+ Output type of [`FlaxAlbertForPreTraining`].
58
+
59
+ Args:
60
+ prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
61
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
62
+ sop_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
63
+ Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
64
+ before SoftMax).
65
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
66
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
67
+ `(batch_size, sequence_length, hidden_size)`.
68
+
69
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
70
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
71
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
72
+ sequence_length)`.
73
+
74
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
75
+ heads.
76
+ """
77
+
78
+ prediction_logits: jnp.ndarray = None
79
+ sop_logits: jnp.ndarray = None
80
+ hidden_states: Optional[tuple[jnp.ndarray]] = None
81
+ attentions: Optional[tuple[jnp.ndarray]] = None
82
+
83
+
84
+ ALBERT_START_DOCSTRING = r"""
85
+
86
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
87
+    library implements for all its models (such as downloading, saving and converting weights from PyTorch models).
88
+
89
+ This model is also a
90
+ [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
91
+    a regular Flax linen Module and refer to the Flax documentation for all matters related to general usage and
92
+ behavior.
93
+
94
+    Finally, this model supports built-in JAX features such as:
95
+
96
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
97
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
98
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
99
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
100
+
101
+ Parameters:
102
+ config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
103
+ Initializing with a config file does not load the weights associated with the model, only the
104
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
105
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
106
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
107
+ `jax.numpy.bfloat16` (on TPUs).
108
+
109
+ This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
110
+ specified all the computation will be performed with the given `dtype`.
111
+
112
+ **Note that this only specifies the dtype of the computation and does not influence the dtype of model
113
+ parameters.**
114
+
115
+ If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
116
+ [`~FlaxPreTrainedModel.to_bf16`].
117
+ """
118
+
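A minimal sketch of the JIT support listed above, wrapping a forward pass in `jax.jit`; the model object is closed over, so only the input arrays are traced:

```python
import jax
from transformers import AutoTokenizer, FlaxAlbertModel

tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
model = FlaxAlbertModel.from_pretrained("albert/albert-base-v2")
inputs = tokenizer("Hello, my dog is cute", return_tensors="np")

@jax.jit
def encode(input_ids, attention_mask):
    return model(input_ids, attention_mask=attention_mask).last_hidden_state

hidden = encode(inputs["input_ids"], inputs["attention_mask"])
print(hidden.shape)  # (1, sequence_length, 768)
```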
119
+ ALBERT_INPUTS_DOCSTRING = r"""
120
+ Args:
121
+ input_ids (`numpy.ndarray` of shape `({0})`):
122
+ Indices of input sequence tokens in the vocabulary.
123
+
124
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
125
+ [`PreTrainedTokenizer.__call__`] for details.
126
+
127
+ [What are input IDs?](../glossary#input-ids)
128
+ attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
129
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
130
+
131
+ - 1 for tokens that are **not masked**,
132
+ - 0 for tokens that are **masked**.
133
+
134
+ [What are attention masks?](../glossary#attention-mask)
135
+ token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
136
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
137
+ 1]`:
138
+
139
+ - 0 corresponds to a *sentence A* token,
140
+ - 1 corresponds to a *sentence B* token.
141
+
142
+ [What are token type IDs?](../glossary#token-type-ids)
143
+ position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
144
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
145
+ config.max_position_embeddings - 1]`.
146
+ return_dict (`bool`, *optional*):
147
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
148
+
149
+ """
150
+
151
+
152
+ class FlaxAlbertEmbeddings(nn.Module):
153
+ """Construct the embeddings from word, position and token_type embeddings."""
154
+
155
+ config: AlbertConfig
156
+ dtype: jnp.dtype = jnp.float32 # the dtype of the computation
157
+
158
+ def setup(self):
159
+ self.word_embeddings = nn.Embed(
160
+ self.config.vocab_size,
161
+ self.config.embedding_size,
162
+ embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
163
+ )
164
+ self.position_embeddings = nn.Embed(
165
+ self.config.max_position_embeddings,
166
+ self.config.embedding_size,
167
+ embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
168
+ )
169
+ self.token_type_embeddings = nn.Embed(
170
+ self.config.type_vocab_size,
171
+ self.config.embedding_size,
172
+ embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
173
+ )
174
+ self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
175
+ self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
176
+
177
+ def __call__(self, input_ids, token_type_ids, position_ids, deterministic: bool = True):
178
+ # Embed
179
+ inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
180
+ position_embeds = self.position_embeddings(position_ids.astype("i4"))
181
+ token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))
182
+
183
+ # Sum all embeddings
184
+ hidden_states = inputs_embeds + token_type_embeddings + position_embeds
185
+
186
+ # Layer Norm
187
+ hidden_states = self.LayerNorm(hidden_states)
188
+ hidden_states = self.dropout(hidden_states, deterministic=deterministic)
189
+ return hidden_states
190
+
191
+
192
+ class FlaxAlbertSelfAttention(nn.Module):
193
+ config: AlbertConfig
194
+ dtype: jnp.dtype = jnp.float32 # the dtype of the computation
195
+
196
+ def setup(self):
197
+ if self.config.hidden_size % self.config.num_attention_heads != 0:
198
+ raise ValueError(
199
+                 f"`config.hidden_size`: {self.config.hidden_size} has to be a multiple of "
200
+                 f"`config.num_attention_heads`: {self.config.num_attention_heads}"
201
+ )
202
+
203
+ self.query = nn.Dense(
204
+ self.config.hidden_size,
205
+ dtype=self.dtype,
206
+ kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
207
+ )
208
+ self.key = nn.Dense(
209
+ self.config.hidden_size,
210
+ dtype=self.dtype,
211
+ kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
212
+ )
213
+ self.value = nn.Dense(
214
+ self.config.hidden_size,
215
+ dtype=self.dtype,
216
+ kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
217
+ )
218
+ self.dense = nn.Dense(
219
+ self.config.hidden_size,
220
+ kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
221
+ dtype=self.dtype,
222
+ )
223
+ self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
224
+ self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
225
+
226
+ def __call__(self, hidden_states, attention_mask, deterministic=True, output_attentions: bool = False):
227
+ head_dim = self.config.hidden_size // self.config.num_attention_heads
228
+
229
+ query_states = self.query(hidden_states).reshape(
230
+ hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
231
+ )
232
+ value_states = self.value(hidden_states).reshape(
233
+ hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
234
+ )
235
+ key_states = self.key(hidden_states).reshape(
236
+ hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
237
+ )
238
+
239
+ # Convert the boolean attention mask to an attention bias.
240
+ if attention_mask is not None:
241
+ # attention mask in the form of attention bias
242
+ attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
243
+ attention_bias = lax.select(
244
+ attention_mask > 0,
245
+ jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
246
+ jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
247
+ )
248
+ else:
249
+ attention_bias = None
250
+
251
+ dropout_rng = None
252
+ if not deterministic and self.config.attention_probs_dropout_prob > 0.0:
253
+ dropout_rng = self.make_rng("dropout")
254
+
255
+ attn_weights = dot_product_attention_weights(
256
+ query_states,
257
+ key_states,
258
+ bias=attention_bias,
259
+ dropout_rng=dropout_rng,
260
+ dropout_rate=self.config.attention_probs_dropout_prob,
261
+ broadcast_dropout=True,
262
+ deterministic=deterministic,
263
+ dtype=self.dtype,
264
+ precision=None,
265
+ )
266
+
267
+ attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
268
+ attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,))
269
+
270
+ projected_attn_output = self.dense(attn_output)
271
+ projected_attn_output = self.dropout(projected_attn_output, deterministic=deterministic)
272
+ layernormed_attn_output = self.LayerNorm(projected_attn_output + hidden_states)
273
+ outputs = (layernormed_attn_output, attn_weights) if output_attentions else (layernormed_attn_output,)
274
+ return outputs
275
+
276
+
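The mask-to-bias conversion above replaces masked positions with a large negative number so they vanish under the attention softmax. A standalone sketch of the same `lax.select` pattern:

```python
import jax.numpy as jnp
from jax import lax

mask = jnp.array([[1, 1, 0]])                # 1 = attend, 0 = padding
mask = jnp.expand_dims(mask, axis=(-3, -2))  # broadcast over heads and query positions
bias = lax.select(
    mask > 0,
    jnp.full(mask.shape, 0.0),
    jnp.full(mask.shape, jnp.finfo(jnp.float32).min),
)
print(bias)  # 0.0 where attended, ~-3.4e38 where padded
```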
277
+ class FlaxAlbertLayer(nn.Module):
278
+ config: AlbertConfig
279
+ dtype: jnp.dtype = jnp.float32 # the dtype of the computation
280
+
281
+ def setup(self):
282
+ self.attention = FlaxAlbertSelfAttention(self.config, dtype=self.dtype)
283
+ self.ffn = nn.Dense(
284
+ self.config.intermediate_size,
285
+ kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
286
+ dtype=self.dtype,
287
+ )
288
+ self.activation = ACT2FN[self.config.hidden_act]
289
+ self.ffn_output = nn.Dense(
290
+ self.config.hidden_size,
291
+ kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
292
+ dtype=self.dtype,
293
+ )
294
+ self.full_layer_layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
295
+ self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
296
+
297
+ def __call__(
298
+ self,
299
+ hidden_states,
300
+ attention_mask,
301
+ deterministic: bool = True,
302
+ output_attentions: bool = False,
303
+ ):
304
+ attention_outputs = self.attention(
305
+ hidden_states, attention_mask, deterministic=deterministic, output_attentions=output_attentions
306
+ )
307
+ attention_output = attention_outputs[0]
308
+ ffn_output = self.ffn(attention_output)
309
+ ffn_output = self.activation(ffn_output)
310
+ ffn_output = self.ffn_output(ffn_output)
311
+ ffn_output = self.dropout(ffn_output, deterministic=deterministic)
312
+ hidden_states = self.full_layer_layer_norm(ffn_output + attention_output)
313
+
314
+ outputs = (hidden_states,)
315
+
316
+ if output_attentions:
317
+ outputs += (attention_outputs[1],)
318
+ return outputs
319
+
320
+
321
+ class FlaxAlbertLayerCollection(nn.Module):
322
+ config: AlbertConfig
323
+ dtype: jnp.dtype = jnp.float32 # the dtype of the computation
324
+
325
+ def setup(self):
326
+ self.layers = [
327
+ FlaxAlbertLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.inner_group_num)
328
+ ]
329
+
330
+ def __call__(
331
+ self,
332
+ hidden_states,
333
+ attention_mask,
334
+ deterministic: bool = True,
335
+ output_attentions: bool = False,
336
+ output_hidden_states: bool = False,
337
+ ):
338
+ layer_hidden_states = ()
339
+ layer_attentions = ()
340
+
341
+ for layer_index, albert_layer in enumerate(self.layers):
342
+ layer_output = albert_layer(
343
+ hidden_states,
344
+ attention_mask,
345
+ deterministic=deterministic,
346
+ output_attentions=output_attentions,
347
+ )
348
+ hidden_states = layer_output[0]
349
+
350
+ if output_attentions:
351
+ layer_attentions = layer_attentions + (layer_output[1],)
352
+
353
+ if output_hidden_states:
354
+ layer_hidden_states = layer_hidden_states + (hidden_states,)
355
+
356
+ outputs = (hidden_states,)
357
+ if output_hidden_states:
358
+ outputs = outputs + (layer_hidden_states,)
359
+ if output_attentions:
360
+ outputs = outputs + (layer_attentions,)
361
+ return outputs # last-layer hidden state, (layer hidden states), (layer attentions)
362
+
363
+
364
+ class FlaxAlbertLayerCollections(nn.Module):
365
+ config: AlbertConfig
366
+ dtype: jnp.dtype = jnp.float32 # the dtype of the computation
367
+ layer_index: Optional[str] = None
368
+
369
+ def setup(self):
370
+ self.albert_layers = FlaxAlbertLayerCollection(self.config, dtype=self.dtype)
371
+
372
+ def __call__(
373
+ self,
374
+ hidden_states,
375
+ attention_mask,
376
+ deterministic: bool = True,
377
+ output_attentions: bool = False,
378
+ output_hidden_states: bool = False,
379
+ ):
380
+ outputs = self.albert_layers(
381
+ hidden_states,
382
+ attention_mask,
383
+ deterministic=deterministic,
384
+ output_attentions=output_attentions,
385
+ output_hidden_states=output_hidden_states,
386
+ )
387
+ return outputs
388
+
389
+
390
+ class FlaxAlbertLayerGroups(nn.Module):
391
+ config: AlbertConfig
392
+ dtype: jnp.dtype = jnp.float32 # the dtype of the computation
393
+
394
+ def setup(self):
395
+ self.layers = [
396
+ FlaxAlbertLayerCollections(self.config, name=str(i), layer_index=str(i), dtype=self.dtype)
397
+ for i in range(self.config.num_hidden_groups)
398
+ ]
399
+
400
+ def __call__(
401
+ self,
402
+ hidden_states,
403
+ attention_mask,
404
+ deterministic: bool = True,
405
+ output_attentions: bool = False,
406
+ output_hidden_states: bool = False,
407
+ return_dict: bool = True,
408
+ ):
409
+ all_attentions = () if output_attentions else None
410
+ all_hidden_states = (hidden_states,) if output_hidden_states else None
411
+
412
+ for i in range(self.config.num_hidden_layers):
413
+ # Index of the hidden group
414
+ group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
415
+ layer_group_output = self.layers[group_idx](
416
+ hidden_states,
417
+ attention_mask,
418
+ deterministic=deterministic,
419
+ output_attentions=output_attentions,
420
+ output_hidden_states=output_hidden_states,
421
+ )
422
+ hidden_states = layer_group_output[0]
423
+
424
+ if output_attentions:
425
+ all_attentions = all_attentions + layer_group_output[-1]
426
+
427
+ if output_hidden_states:
428
+ all_hidden_states = all_hidden_states + (hidden_states,)
429
+
430
+ if not return_dict:
431
+ return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
432
+ return FlaxBaseModelOutput(
433
+ last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
434
+ )
435
+
436
+
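The `group_idx` computation above implements ALBERT's cross-layer parameter sharing: each of the `num_hidden_layers` forward steps is routed to one of `num_hidden_groups` parameter groups. A sketch with illustrative configurations:

```python
def layer_to_group(num_hidden_layers: int, num_hidden_groups: int) -> list[int]:
    # Same formula as above: evenly spread the layers over the groups.
    return [int(i / (num_hidden_layers / num_hidden_groups)) for i in range(num_hidden_layers)]

print(layer_to_group(12, 1))  # [0]*12 -> ALBERT default: one fully shared group
print(layer_to_group(12, 4))  # [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
```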
437
+ class FlaxAlbertEncoder(nn.Module):
438
+ config: AlbertConfig
439
+ dtype: jnp.dtype = jnp.float32 # the dtype of the computation
440
+
441
+ def setup(self):
442
+ self.embedding_hidden_mapping_in = nn.Dense(
443
+ self.config.hidden_size,
444
+ kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
445
+ dtype=self.dtype,
446
+ )
447
+ self.albert_layer_groups = FlaxAlbertLayerGroups(self.config, dtype=self.dtype)
448
+
449
+ def __call__(
450
+ self,
451
+ hidden_states,
452
+ attention_mask,
453
+ deterministic: bool = True,
454
+ output_attentions: bool = False,
455
+ output_hidden_states: bool = False,
456
+ return_dict: bool = True,
457
+ ):
458
+ hidden_states = self.embedding_hidden_mapping_in(hidden_states)
459
+ return self.albert_layer_groups(
460
+ hidden_states,
461
+ attention_mask,
462
+ deterministic=deterministic,
463
+ output_attentions=output_attentions,
464
+             output_hidden_states=output_hidden_states,
465
+             return_dict=return_dict,
+         )
466
+
467
+
468
+ class FlaxAlbertOnlyMLMHead(nn.Module):
469
+ config: AlbertConfig
470
+ dtype: jnp.dtype = jnp.float32
471
+ bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros
472
+
473
+ def setup(self):
474
+ self.dense = nn.Dense(self.config.embedding_size, dtype=self.dtype)
475
+ self.activation = ACT2FN[self.config.hidden_act]
476
+ self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
477
+ self.decoder = nn.Dense(self.config.vocab_size, dtype=self.dtype, use_bias=False)
478
+ self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,))
479
+
480
+ def __call__(self, hidden_states, shared_embedding=None):
481
+ hidden_states = self.dense(hidden_states)
482
+ hidden_states = self.activation(hidden_states)
483
+ hidden_states = self.LayerNorm(hidden_states)
484
+
485
+ if shared_embedding is not None:
486
+ hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
487
+ else:
488
+ hidden_states = self.decoder(hidden_states)
489
+
490
+ hidden_states += self.bias
491
+ return hidden_states
492
+
493
+
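When `tie_word_embeddings` is set, the decoder above is applied with the transposed word-embedding matrix as its kernel rather than its own weights. A shape-only sketch of the equivalent matmul, with illustrative sizes:

```python
import jax.numpy as jnp

embedding = jnp.ones((30000, 128))  # (vocab_size, embedding_size), the tied table
hidden = jnp.ones((1, 5, 128))      # output of the dense/activation/LayerNorm stack
logits = hidden @ embedding.T       # what decoder.apply(..., kernel=embedding.T) computes
print(logits.shape)                 # (1, 5, 30000)
```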
494
+ class FlaxAlbertSOPHead(nn.Module):
495
+ config: AlbertConfig
496
+ dtype: jnp.dtype = jnp.float32
497
+
498
+ def setup(self):
499
+ self.dropout = nn.Dropout(self.config.classifier_dropout_prob)
500
+ self.classifier = nn.Dense(2, dtype=self.dtype)
501
+
502
+ def __call__(self, pooled_output, deterministic=True):
503
+ pooled_output = self.dropout(pooled_output, deterministic=deterministic)
504
+ logits = self.classifier(pooled_output)
505
+ return logits
506
+
507
+
508
+ class FlaxAlbertPreTrainedModel(FlaxPreTrainedModel):
509
+ """
510
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
511
+ models.
512
+ """
513
+
514
+ config_class = AlbertConfig
515
+ base_model_prefix = "albert"
516
+ module_class: nn.Module = None
517
+
518
+ def __init__(
519
+ self,
520
+ config: AlbertConfig,
521
+ input_shape: tuple = (1, 1),
522
+ seed: int = 0,
523
+ dtype: jnp.dtype = jnp.float32,
524
+ _do_init: bool = True,
525
+ **kwargs,
526
+ ):
527
+ module = self.module_class(config=config, dtype=dtype, **kwargs)
528
+ super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
529
+
530
+ def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict:
531
+ # init input tensors
532
+ input_ids = jnp.zeros(input_shape, dtype="i4")
533
+ token_type_ids = jnp.zeros_like(input_ids)
534
+ position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
535
+ attention_mask = jnp.ones_like(input_ids)
536
+
537
+ params_rng, dropout_rng = jax.random.split(rng)
538
+ rngs = {"params": params_rng, "dropout": dropout_rng}
539
+
540
+ random_params = self.module.init(
541
+ rngs, input_ids, attention_mask, token_type_ids, position_ids, return_dict=False
542
+ )["params"]
543
+
544
+ if params is not None:
545
+ random_params = flatten_dict(unfreeze(random_params))
546
+ params = flatten_dict(unfreeze(params))
547
+ for missing_key in self._missing_keys:
548
+ params[missing_key] = random_params[missing_key]
549
+ self._missing_keys = set()
550
+ return freeze(unflatten_dict(params))
551
+ else:
552
+ return random_params
553
+
554
+ @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
555
+ def __call__(
556
+ self,
557
+ input_ids,
558
+ attention_mask=None,
559
+ token_type_ids=None,
560
+ position_ids=None,
561
+ params: Optional[dict] = None,
562
+ dropout_rng: jax.random.PRNGKey = None,
563
+ train: bool = False,
564
+ output_attentions: Optional[bool] = None,
565
+ output_hidden_states: Optional[bool] = None,
566
+ return_dict: Optional[bool] = None,
567
+ ):
568
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
569
+ output_hidden_states = (
570
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
571
+ )
572
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
573
+
574
+ # init input tensors if not passed
575
+ if token_type_ids is None:
576
+ token_type_ids = jnp.zeros_like(input_ids)
577
+
578
+ if position_ids is None:
579
+ position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
580
+
581
+ if attention_mask is None:
582
+ attention_mask = jnp.ones_like(input_ids)
583
+
584
+ # Handle any PRNG if needed
585
+ rngs = {}
586
+ if dropout_rng is not None:
587
+ rngs["dropout"] = dropout_rng
588
+
589
+ return self.module.apply(
590
+ {"params": params or self.params},
591
+ jnp.array(input_ids, dtype="i4"),
592
+ jnp.array(attention_mask, dtype="i4"),
593
+ jnp.array(token_type_ids, dtype="i4"),
594
+ jnp.array(position_ids, dtype="i4"),
595
+ not train,
596
+ output_attentions,
597
+ output_hidden_states,
598
+ return_dict,
599
+ rngs=rngs,
600
+ )
601
+
602
+
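A small sketch of the PRNG plumbing used in `init_weights` and `__call__` above: one root key is split into independent streams for parameter initialization and dropout:

```python
import jax

root = jax.random.PRNGKey(0)
params_rng, dropout_rng = jax.random.split(root)
rngs = {"params": params_rng, "dropout": dropout_rng}  # passed to module.init / module.apply
print(sorted(rngs))  # ['dropout', 'params']
```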
603
+ class FlaxAlbertModule(nn.Module):
604
+ config: AlbertConfig
605
+ dtype: jnp.dtype = jnp.float32 # the dtype of the computation
606
+ add_pooling_layer: bool = True
607
+
608
+ def setup(self):
609
+ self.embeddings = FlaxAlbertEmbeddings(self.config, dtype=self.dtype)
610
+ self.encoder = FlaxAlbertEncoder(self.config, dtype=self.dtype)
611
+ if self.add_pooling_layer:
612
+ self.pooler = nn.Dense(
613
+ self.config.hidden_size,
614
+ kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
615
+ dtype=self.dtype,
616
+ name="pooler",
617
+ )
618
+ self.pooler_activation = nn.tanh
619
+ else:
620
+ self.pooler = None
621
+ self.pooler_activation = None
622
+
623
+ def __call__(
624
+ self,
625
+ input_ids,
626
+ attention_mask,
627
+ token_type_ids: Optional[np.ndarray] = None,
628
+ position_ids: Optional[np.ndarray] = None,
629
+ deterministic: bool = True,
630
+ output_attentions: bool = False,
631
+ output_hidden_states: bool = False,
632
+ return_dict: bool = True,
633
+ ):
634
+ # make sure `token_type_ids` is correctly initialized when not passed
635
+ if token_type_ids is None:
636
+ token_type_ids = jnp.zeros_like(input_ids)
637
+
638
+ # make sure `position_ids` is correctly initialized when not passed
639
+ if position_ids is None:
640
+ position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
641
+
642
+ hidden_states = self.embeddings(input_ids, token_type_ids, position_ids, deterministic=deterministic)
643
+
644
+ outputs = self.encoder(
645
+ hidden_states,
646
+ attention_mask,
647
+ deterministic=deterministic,
648
+ output_attentions=output_attentions,
649
+ output_hidden_states=output_hidden_states,
650
+ return_dict=return_dict,
651
+ )
652
+ hidden_states = outputs[0]
653
+ if self.add_pooling_layer:
654
+ pooled = self.pooler(hidden_states[:, 0])
655
+ pooled = self.pooler_activation(pooled)
656
+ else:
657
+ pooled = None
658
+
659
+ if not return_dict:
660
+ # if pooled is None, don't return it
661
+ if pooled is None:
662
+ return (hidden_states,) + outputs[1:]
663
+ return (hidden_states, pooled) + outputs[1:]
664
+
665
+ return FlaxBaseModelOutputWithPooling(
666
+ last_hidden_state=hidden_states,
667
+ pooler_output=pooled,
668
+ hidden_states=outputs.hidden_states,
669
+ attentions=outputs.attentions,
670
+ )
671
+
672
+
673
+ @add_start_docstrings(
674
+ "The bare Albert Model transformer outputting raw hidden-states without any specific head on top.",
675
+ ALBERT_START_DOCSTRING,
676
+ )
677
+ class FlaxAlbertModel(FlaxAlbertPreTrainedModel):
678
+ module_class = FlaxAlbertModule
679
+
680
+
681
+ append_call_sample_docstring(FlaxAlbertModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPooling, _CONFIG_FOR_DOC)
682
+
683
+
684
+ class FlaxAlbertForPreTrainingModule(nn.Module):
685
+ config: AlbertConfig
686
+ dtype: jnp.dtype = jnp.float32
687
+
688
+ def setup(self):
689
+ self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype)
690
+ self.predictions = FlaxAlbertOnlyMLMHead(config=self.config, dtype=self.dtype)
691
+ self.sop_classifier = FlaxAlbertSOPHead(config=self.config, dtype=self.dtype)
692
+
693
+ def __call__(
694
+ self,
695
+ input_ids,
696
+ attention_mask,
697
+ token_type_ids,
698
+ position_ids,
699
+ deterministic: bool = True,
700
+ output_attentions: bool = False,
701
+ output_hidden_states: bool = False,
702
+ return_dict: bool = True,
703
+ ):
704
+ # Model
705
+ outputs = self.albert(
706
+ input_ids,
707
+ attention_mask,
708
+ token_type_ids,
709
+ position_ids,
710
+ deterministic=deterministic,
711
+ output_attentions=output_attentions,
712
+ output_hidden_states=output_hidden_states,
713
+ return_dict=return_dict,
714
+ )
715
+
716
+ if self.config.tie_word_embeddings:
717
+ shared_embedding = self.albert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
718
+ else:
719
+ shared_embedding = None
720
+
721
+ hidden_states = outputs[0]
722
+ pooled_output = outputs[1]
723
+
724
+ prediction_scores = self.predictions(hidden_states, shared_embedding=shared_embedding)
725
+ sop_scores = self.sop_classifier(pooled_output, deterministic=deterministic)
726
+
727
+ if not return_dict:
728
+ return (prediction_scores, sop_scores) + outputs[2:]
729
+
730
+ return FlaxAlbertForPreTrainingOutput(
731
+ prediction_logits=prediction_scores,
732
+ sop_logits=sop_scores,
733
+ hidden_states=outputs.hidden_states,
734
+ attentions=outputs.attentions,
735
+ )
736
+
737
+
738
+ @add_start_docstrings(
739
+ """
740
+ Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
741
+ `sentence order prediction (classification)` head.
742
+ """,
743
+ ALBERT_START_DOCSTRING,
744
+ )
745
+ class FlaxAlbertForPreTraining(FlaxAlbertPreTrainedModel):
746
+ module_class = FlaxAlbertForPreTrainingModule
747
+
748
+
749
+ FLAX_ALBERT_FOR_PRETRAINING_DOCSTRING = """
750
+ Returns:
751
+
752
+ Example:
753
+
754
+ ```python
755
+ >>> from transformers import AutoTokenizer, FlaxAlbertForPreTraining
756
+
757
+ >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
758
+ >>> model = FlaxAlbertForPreTraining.from_pretrained("albert/albert-base-v2")
759
+
760
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
761
+ >>> outputs = model(**inputs)
762
+
763
+ >>> prediction_logits = outputs.prediction_logits
764
+ >>> seq_relationship_logits = outputs.sop_logits
765
+ ```
766
+ """
767
+
768
+ overwrite_call_docstring(
769
+ FlaxAlbertForPreTraining,
770
+ ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length") + FLAX_ALBERT_FOR_PRETRAINING_DOCSTRING,
771
+ )
772
+ append_replace_return_docstrings(
773
+ FlaxAlbertForPreTraining, output_type=FlaxAlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC
774
+ )
775
+
776
+
777
+ class FlaxAlbertForMaskedLMModule(nn.Module):
778
+ config: AlbertConfig
779
+ dtype: jnp.dtype = jnp.float32
780
+
781
+ def setup(self):
782
+ self.albert = FlaxAlbertModule(config=self.config, add_pooling_layer=False, dtype=self.dtype)
783
+ self.predictions = FlaxAlbertOnlyMLMHead(config=self.config, dtype=self.dtype)
784
+
785
+ def __call__(
786
+ self,
787
+ input_ids,
788
+ attention_mask,
789
+ token_type_ids,
790
+ position_ids,
791
+ deterministic: bool = True,
792
+ output_attentions: bool = False,
793
+ output_hidden_states: bool = False,
794
+ return_dict: bool = True,
795
+ ):
796
+ # Model
797
+ outputs = self.albert(
798
+ input_ids,
799
+ attention_mask,
800
+ token_type_ids,
801
+ position_ids,
802
+ deterministic=deterministic,
803
+ output_attentions=output_attentions,
804
+ output_hidden_states=output_hidden_states,
805
+ return_dict=return_dict,
806
+ )
807
+
808
+ hidden_states = outputs[0]
809
+ if self.config.tie_word_embeddings:
810
+ shared_embedding = self.albert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
811
+ else:
812
+ shared_embedding = None
813
+
814
+ # Compute the prediction scores
815
+ logits = self.predictions(hidden_states, shared_embedding=shared_embedding)
816
+
817
+ if not return_dict:
818
+ return (logits,) + outputs[1:]
819
+
820
+ return FlaxMaskedLMOutput(
821
+ logits=logits,
822
+ hidden_states=outputs.hidden_states,
823
+ attentions=outputs.attentions,
824
+ )
825
+
826
+
827
+ @add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING)
828
+ class FlaxAlbertForMaskedLM(FlaxAlbertPreTrainedModel):
829
+ module_class = FlaxAlbertForMaskedLMModule
830
+
831
+
832
+ append_call_sample_docstring(
833
+ FlaxAlbertForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC, revision="refs/pr/11"
834
+ )
835
+
836
+
837
+ class FlaxAlbertForSequenceClassificationModule(nn.Module):
838
+ config: AlbertConfig
839
+ dtype: jnp.dtype = jnp.float32
840
+
841
+ def setup(self):
842
+ self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype)
843
+ classifier_dropout = (
844
+ self.config.classifier_dropout_prob
845
+ if self.config.classifier_dropout_prob is not None
846
+ else self.config.hidden_dropout_prob
847
+ )
848
+ self.dropout = nn.Dropout(rate=classifier_dropout)
849
+ self.classifier = nn.Dense(
850
+ self.config.num_labels,
851
+ dtype=self.dtype,
852
+ )
853
+
854
+ def __call__(
855
+ self,
856
+ input_ids,
857
+ attention_mask,
858
+ token_type_ids,
859
+ position_ids,
860
+ deterministic: bool = True,
861
+ output_attentions: bool = False,
862
+ output_hidden_states: bool = False,
863
+ return_dict: bool = True,
864
+ ):
865
+ # Model
866
+ outputs = self.albert(
867
+ input_ids,
868
+ attention_mask,
869
+ token_type_ids,
870
+ position_ids,
871
+ deterministic=deterministic,
872
+ output_attentions=output_attentions,
873
+ output_hidden_states=output_hidden_states,
874
+ return_dict=return_dict,
875
+ )
876
+
877
+ pooled_output = outputs[1]
878
+ pooled_output = self.dropout(pooled_output, deterministic=deterministic)
879
+ logits = self.classifier(pooled_output)
880
+
881
+ if not return_dict:
882
+ return (logits,) + outputs[2:]
883
+
884
+ return FlaxSequenceClassifierOutput(
885
+ logits=logits,
886
+ hidden_states=outputs.hidden_states,
887
+ attentions=outputs.attentions,
888
+ )
889
+
890
+
891
+ @add_start_docstrings(
892
+ """
893
+ Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
894
+ output) e.g. for GLUE tasks.
895
+ """,
896
+ ALBERT_START_DOCSTRING,
897
+ )
898
+ class FlaxAlbertForSequenceClassification(FlaxAlbertPreTrainedModel):
899
+ module_class = FlaxAlbertForSequenceClassificationModule
900
+
901
+
902
+ append_call_sample_docstring(
903
+ FlaxAlbertForSequenceClassification,
904
+ _CHECKPOINT_FOR_DOC,
905
+ FlaxSequenceClassifierOutput,
906
+ _CONFIG_FOR_DOC,
907
+ )
908
+
909
+
910
+ class FlaxAlbertForMultipleChoiceModule(nn.Module):
911
+ config: AlbertConfig
912
+ dtype: jnp.dtype = jnp.float32
913
+
914
+ def setup(self):
915
+ self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype)
916
+ self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
917
+ self.classifier = nn.Dense(1, dtype=self.dtype)
918
+
919
+ def __call__(
920
+ self,
921
+ input_ids,
922
+ attention_mask,
923
+ token_type_ids,
924
+ position_ids,
925
+ deterministic: bool = True,
926
+ output_attentions: bool = False,
927
+ output_hidden_states: bool = False,
928
+ return_dict: bool = True,
929
+ ):
930
+ num_choices = input_ids.shape[1]
931
+ input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
932
+ attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
933
+ token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
934
+ position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None
935
+
936
+ # Model
937
+ outputs = self.albert(
938
+ input_ids,
939
+ attention_mask,
940
+ token_type_ids,
941
+ position_ids,
942
+ deterministic=deterministic,
943
+ output_attentions=output_attentions,
944
+ output_hidden_states=output_hidden_states,
945
+ return_dict=return_dict,
946
+ )
947
+
948
+ pooled_output = outputs[1]
949
+ pooled_output = self.dropout(pooled_output, deterministic=deterministic)
950
+ logits = self.classifier(pooled_output)
951
+
952
+ reshaped_logits = logits.reshape(-1, num_choices)
953
+
954
+ if not return_dict:
955
+ return (reshaped_logits,) + outputs[2:]
956
+
957
+ return FlaxMultipleChoiceModelOutput(
958
+ logits=reshaped_logits,
959
+ hidden_states=outputs.hidden_states,
960
+ attentions=outputs.attentions,
961
+ )
962
+
963
+
964
+ @add_start_docstrings(
965
+ """
966
+ Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
967
+ softmax) e.g. for RocStories/SWAG tasks.
968
+ """,
969
+ ALBERT_START_DOCSTRING,
970
+ )
971
+ class FlaxAlbertForMultipleChoice(FlaxAlbertPreTrainedModel):
972
+ module_class = FlaxAlbertForMultipleChoiceModule
973
+
974
+
975
+ overwrite_call_docstring(
976
+ FlaxAlbertForMultipleChoice, ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
977
+ )
978
+ append_call_sample_docstring(
979
+ FlaxAlbertForMultipleChoice,
980
+ _CHECKPOINT_FOR_DOC,
981
+ FlaxMultipleChoiceModelOutput,
982
+ _CONFIG_FOR_DOC,
983
+ )
984
+
985
+
986
+ class FlaxAlbertForTokenClassificationModule(nn.Module):
987
+ config: AlbertConfig
988
+ dtype: jnp.dtype = jnp.float32
989
+
990
+ def setup(self):
991
+ self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype, add_pooling_layer=False)
992
+ classifier_dropout = (
993
+ self.config.classifier_dropout_prob
994
+ if self.config.classifier_dropout_prob is not None
995
+ else self.config.hidden_dropout_prob
996
+ )
997
+ self.dropout = nn.Dropout(rate=classifier_dropout)
998
+ self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)
999
+
1000
+ def __call__(
1001
+ self,
1002
+ input_ids,
1003
+ attention_mask,
1004
+ token_type_ids,
1005
+ position_ids,
1006
+ deterministic: bool = True,
1007
+ output_attentions: bool = False,
1008
+ output_hidden_states: bool = False,
1009
+ return_dict: bool = True,
1010
+ ):
1011
+ # Model
1012
+ outputs = self.albert(
1013
+ input_ids,
1014
+ attention_mask,
1015
+ token_type_ids,
1016
+ position_ids,
1017
+ deterministic=deterministic,
1018
+ output_attentions=output_attentions,
1019
+ output_hidden_states=output_hidden_states,
1020
+ return_dict=return_dict,
1021
+ )
1022
+
1023
+ hidden_states = outputs[0]
1024
+ hidden_states = self.dropout(hidden_states, deterministic=deterministic)
1025
+ logits = self.classifier(hidden_states)
1026
+
1027
+ if not return_dict:
1028
+ return (logits,) + outputs[1:]
1029
+
1030
+ return FlaxTokenClassifierOutput(
1031
+ logits=logits,
1032
+ hidden_states=outputs.hidden_states,
1033
+ attentions=outputs.attentions,
1034
+ )
1035
+
1036
+
1037
+ @add_start_docstrings(
1038
+ """
1039
+ Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
1040
+ Named-Entity-Recognition (NER) tasks.
1041
+ """,
1042
+ ALBERT_START_DOCSTRING,
1043
+ )
1044
+ class FlaxAlbertForTokenClassification(FlaxAlbertPreTrainedModel):
1045
+ module_class = FlaxAlbertForTokenClassificationModule
1046
+
1047
+
1048
+ append_call_sample_docstring(
1049
+ FlaxAlbertForTokenClassification,
1050
+ _CHECKPOINT_FOR_DOC,
1051
+ FlaxTokenClassifierOutput,
1052
+ _CONFIG_FOR_DOC,
1053
+ )
1054
+
1055
+
1056
+ class FlaxAlbertForQuestionAnsweringModule(nn.Module):
1057
+ config: AlbertConfig
1058
+ dtype: jnp.dtype = jnp.float32
1059
+
1060
+ def setup(self):
1061
+ self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype, add_pooling_layer=False)
1062
+ self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)
1063
+
1064
+ def __call__(
1065
+ self,
1066
+ input_ids,
1067
+ attention_mask,
1068
+ token_type_ids,
1069
+ position_ids,
1070
+ deterministic: bool = True,
1071
+ output_attentions: bool = False,
1072
+ output_hidden_states: bool = False,
1073
+ return_dict: bool = True,
1074
+ ):
1075
+ # Model
1076
+ outputs = self.albert(
1077
+ input_ids,
1078
+ attention_mask,
1079
+ token_type_ids,
1080
+ position_ids,
1081
+ deterministic=deterministic,
1082
+ output_attentions=output_attentions,
1083
+ output_hidden_states=output_hidden_states,
1084
+ return_dict=return_dict,
1085
+ )
1086
+
1087
+ hidden_states = outputs[0]
1088
+
1089
+ logits = self.qa_outputs(hidden_states)
1090
+ start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1)
1091
+ start_logits = start_logits.squeeze(-1)
1092
+ end_logits = end_logits.squeeze(-1)
1093
+
1094
+ if not return_dict:
1095
+ return (start_logits, end_logits) + outputs[1:]
1096
+
1097
+ return FlaxQuestionAnsweringModelOutput(
1098
+ start_logits=start_logits,
1099
+ end_logits=end_logits,
1100
+ hidden_states=outputs.hidden_states,
1101
+ attentions=outputs.attentions,
1102
+ )
1103
+
1104
+
1105
+ @add_start_docstrings(
1106
+ """
1107
+ Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
1108
+ layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
1109
+ """,
1110
+ ALBERT_START_DOCSTRING,
1111
+ )
1112
+ class FlaxAlbertForQuestionAnswering(FlaxAlbertPreTrainedModel):
1113
+ module_class = FlaxAlbertForQuestionAnsweringModule
1114
+
1115
+
1116
+ append_call_sample_docstring(
1117
+ FlaxAlbertForQuestionAnswering,
1118
+ _CHECKPOINT_FOR_DOC,
1119
+ FlaxQuestionAnsweringModelOutput,
1120
+ _CONFIG_FOR_DOC,
1121
+ )
1122
+
1123
+ __all__ = [
1124
+ "FlaxAlbertPreTrainedModel",
1125
+ "FlaxAlbertModel",
1126
+ "FlaxAlbertForPreTraining",
1127
+ "FlaxAlbertForMaskedLM",
1128
+ "FlaxAlbertForSequenceClassification",
1129
+ "FlaxAlbertForMultipleChoice",
1130
+ "FlaxAlbertForTokenClassification",
1131
+ "FlaxAlbertForQuestionAnswering",
1132
+ ]
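A minimal usage sketch for the exported Flax classes (the checkpoint name mirrors the one used in the documented examples elsewhere in the library; fetching it requires network access):

```python
from transformers import AutoTokenizer, FlaxAlbertForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
model = FlaxAlbertForSequenceClassification.from_pretrained("albert/albert-base-v2")

inputs = tokenizer("A short test sentence.", return_tensors="jax")
outputs = model(**inputs)
print(outputs.logits.shape)  # (1, num_labels)
```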
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/albert/modeling_tf_albert.py ADDED
@@ -0,0 +1,1572 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """TF 2.0 ALBERT model."""
17
+
18
+ from __future__ import annotations
19
+
20
+ import math
21
+ from dataclasses import dataclass
22
+
23
+ import numpy as np
24
+ import tensorflow as tf
25
+
26
+ from ...activations_tf import get_tf_activation
27
+ from ...modeling_tf_outputs import (
28
+ TFBaseModelOutput,
29
+ TFBaseModelOutputWithPooling,
30
+ TFMaskedLMOutput,
31
+ TFMultipleChoiceModelOutput,
32
+ TFQuestionAnsweringModelOutput,
33
+ TFSequenceClassifierOutput,
34
+ TFTokenClassifierOutput,
35
+ )
36
+ from ...modeling_tf_utils import (
37
+ TFMaskedLanguageModelingLoss,
38
+ TFModelInputType,
39
+ TFMultipleChoiceLoss,
40
+ TFPreTrainedModel,
41
+ TFQuestionAnsweringLoss,
42
+ TFSequenceClassificationLoss,
43
+ TFTokenClassificationLoss,
44
+ get_initializer,
45
+ keras,
46
+ keras_serializable,
47
+ unpack_inputs,
48
+ )
49
+ from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
50
+ from ...utils import (
51
+ ModelOutput,
52
+ add_code_sample_docstrings,
53
+ add_start_docstrings,
54
+ add_start_docstrings_to_model_forward,
55
+ logging,
56
+ replace_return_docstrings,
57
+ )
58
+ from .configuration_albert import AlbertConfig
59
+
60
+
61
+ logger = logging.get_logger(__name__)
62
+
63
+ _CHECKPOINT_FOR_DOC = "albert/albert-base-v2"
64
+ _CONFIG_FOR_DOC = "AlbertConfig"
65
+
66
+
67
+ class TFAlbertPreTrainingLoss:
68
+ """
69
+ Loss function suitable for ALBERT pretraining, that is, the task of pretraining a language model by combining SOP +
70
+ MLM.
+
+ .. note:: Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
71
+ """
72
+
73
+ def hf_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
74
+ loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
75
+ if self.config.tf_legacy_loss:
76
+ # make sure only labels that are not equal to -100
77
+ # are taken into account for the loss computation
78
+ masked_lm_active_loss = tf.not_equal(tf.reshape(tensor=labels["labels"], shape=(-1,)), -100)
79
+ masked_lm_reduced_logits = tf.boolean_mask(
80
+ tensor=tf.reshape(tensor=logits[0], shape=(-1, shape_list(logits[0])[2])),
81
+ mask=masked_lm_active_loss,
82
+ )
83
+ masked_lm_labels = tf.boolean_mask(
84
+ tensor=tf.reshape(tensor=labels["labels"], shape=(-1,)), mask=masked_lm_active_loss
85
+ )
86
+ sentence_order_active_loss = tf.not_equal(
87
+ tf.reshape(tensor=labels["sentence_order_label"], shape=(-1,)), -100
88
+ )
89
+ sentence_order_reduced_logits = tf.boolean_mask(
90
+ tensor=tf.reshape(tensor=logits[1], shape=(-1, 2)), mask=sentence_order_active_loss
91
+ )
92
+ sentence_order_label = tf.boolean_mask(
93
+ tensor=tf.reshape(tensor=labels["sentence_order_label"], shape=(-1,)), mask=sentence_order_active_loss
94
+ )
95
+ masked_lm_loss = loss_fn(y_true=masked_lm_labels, y_pred=masked_lm_reduced_logits)
96
+ sentence_order_loss = loss_fn(y_true=sentence_order_label, y_pred=sentence_order_reduced_logits)
97
+ masked_lm_loss = tf.reshape(tensor=masked_lm_loss, shape=(-1, shape_list(sentence_order_loss)[0]))
98
+ masked_lm_loss = tf.reduce_mean(input_tensor=masked_lm_loss, axis=0)
99
+
100
+ return masked_lm_loss + sentence_order_loss
101
+
102
+ # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway
103
+ unmasked_lm_losses = loss_fn(y_true=tf.nn.relu(labels["labels"]), y_pred=logits[0])
104
+ # make sure only labels that are not equal to -100
105
+ # are taken into account for the loss computation
106
+ lm_loss_mask = tf.cast(labels["labels"] != -100, dtype=unmasked_lm_losses.dtype)
107
+ masked_lm_losses = unmasked_lm_losses * lm_loss_mask
108
+ reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses) / tf.reduce_sum(lm_loss_mask)
109
+
110
+ sop_logits = tf.reshape(logits[1], (-1, 2))
111
+ # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway
112
+ unmasked_sop_loss = loss_fn(y_true=tf.nn.relu(labels["sentence_order_label"]), y_pred=sop_logits)
113
+ sop_loss_mask = tf.cast(labels["sentence_order_label"] != -100, dtype=unmasked_sop_loss.dtype)
114
+
115
+ masked_sop_loss = unmasked_sop_loss * sop_loss_mask
116
+ reduced_masked_sop_loss = tf.reduce_sum(masked_sop_loss) / tf.reduce_sum(sop_loss_mask)
117
+
118
+ return tf.reshape(reduced_masked_lm_loss + reduced_masked_sop_loss, (1,))
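The non-legacy branch above applies the same `-100`-masking recipe to both the MLM and SOP terms; a standalone sketch of that recipe (toy shapes, not values from the diff):

```python
import tensorflow as tf

labels = tf.constant([5, -100, 3])          # -100 marks positions to ignore
logits = tf.random.normal((3, 10))          # 3 tokens, toy vocabulary of 10
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)

# Clip -100 to 0 so the loss op never sees an invalid class index...
per_token_loss = loss_fn(tf.nn.relu(labels), logits)
# ...then zero out the ignored positions and average over the rest.
mask = tf.cast(labels != -100, per_token_loss.dtype)
mean_loss = tf.reduce_sum(per_token_loss * mask) / tf.reduce_sum(mask)
```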
119
+
120
+
121
+ class TFAlbertEmbeddings(keras.layers.Layer):
122
+ """Construct the embeddings from word, position and token_type embeddings."""
123
+
124
+ def __init__(self, config: AlbertConfig, **kwargs):
125
+ super().__init__(**kwargs)
126
+
127
+ self.config = config
128
+ self.embedding_size = config.embedding_size
129
+ self.max_position_embeddings = config.max_position_embeddings
130
+ self.initializer_range = config.initializer_range
131
+ self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
132
+ self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
133
+
134
+ def build(self, input_shape=None):
135
+ with tf.name_scope("word_embeddings"):
136
+ self.weight = self.add_weight(
137
+ name="weight",
138
+ shape=[self.config.vocab_size, self.embedding_size],
139
+ initializer=get_initializer(self.initializer_range),
140
+ )
141
+
142
+ with tf.name_scope("token_type_embeddings"):
143
+ self.token_type_embeddings = self.add_weight(
144
+ name="embeddings",
145
+ shape=[self.config.type_vocab_size, self.embedding_size],
146
+ initializer=get_initializer(self.initializer_range),
147
+ )
148
+
149
+ with tf.name_scope("position_embeddings"):
150
+ self.position_embeddings = self.add_weight(
151
+ name="embeddings",
152
+ shape=[self.max_position_embeddings, self.embedding_size],
153
+ initializer=get_initializer(self.initializer_range),
154
+ )
155
+
156
+ if self.built:
157
+ return
158
+ self.built = True
159
+ if getattr(self, "LayerNorm", None) is not None:
160
+ with tf.name_scope(self.LayerNorm.name):
161
+ self.LayerNorm.build([None, None, self.config.embedding_size])
162
+
163
+ # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
164
+ def call(
165
+ self,
166
+ input_ids: tf.Tensor | None = None,
167
+ position_ids: tf.Tensor | None = None,
168
+ token_type_ids: tf.Tensor | None = None,
169
+ inputs_embeds: tf.Tensor | None = None,
170
+ past_key_values_length=0,
171
+ training: bool = False,
172
+ ) -> tf.Tensor:
173
+ """
174
+ Applies embedding based on inputs tensor.
175
+
176
+ Returns:
177
+ final_embeddings (`tf.Tensor`): output embedding tensor.
178
+ """
179
+ if input_ids is None and inputs_embeds is None:
180
+ raise ValueError("Need to provide either `input_ids` or `inputs_embeds`.")
181
+
182
+ if input_ids is not None:
183
+ check_embeddings_within_bounds(input_ids, self.config.vocab_size)
184
+ inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
185
+
186
+ input_shape = shape_list(inputs_embeds)[:-1]
187
+
188
+ if token_type_ids is None:
189
+ token_type_ids = tf.fill(dims=input_shape, value=0)
190
+
191
+ if position_ids is None:
192
+ position_ids = tf.expand_dims(
193
+ tf.range(start=past_key_values_length, limit=input_shape[1] + past_key_values_length), axis=0
194
+ )
195
+
196
+ position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
197
+ token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
198
+ final_embeddings = inputs_embeds + position_embeds + token_type_embeds
199
+ final_embeddings = self.LayerNorm(inputs=final_embeddings)
200
+ final_embeddings = self.dropout(inputs=final_embeddings, training=training)
201
+
202
+ return final_embeddings
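`TFAlbertEmbeddings.call` above reduces to three `tf.gather` lookups that are summed before LayerNorm and dropout; a shape-only sketch (ALBERT's default `embedding_size` of 128 is assumed):

```python
import tensorflow as tf

vocab_size, type_vocab_size, max_positions, embedding_size = 30000, 2, 512, 128
word_emb = tf.random.normal((vocab_size, embedding_size))
type_emb = tf.random.normal((type_vocab_size, embedding_size))
pos_emb = tf.random.normal((max_positions, embedding_size))

input_ids = tf.constant([[7, 42, 9]])
token_type_ids = tf.zeros_like(input_ids)
position_ids = tf.range(3)[None, :]

# Sum of the three lookups -> (1, 3, embedding_size); LayerNorm and dropout
# are then applied on top, as in `call` above.
embeddings = (
    tf.gather(word_emb, input_ids)
    + tf.gather(type_emb, token_type_ids)
    + tf.gather(pos_emb, position_ids)
)
```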
203
+
204
+
205
+ class TFAlbertAttention(keras.layers.Layer):
206
+ """Contains the complete attention sublayer, including both dropouts and layer norm."""
207
+
208
+ def __init__(self, config: AlbertConfig, **kwargs):
209
+ super().__init__(**kwargs)
210
+
211
+ if config.hidden_size % config.num_attention_heads != 0:
212
+ raise ValueError(
213
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number "
214
+ f"of attention heads ({config.num_attention_heads})"
215
+ )
216
+
217
+ self.num_attention_heads = config.num_attention_heads
218
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
219
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
220
+ self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
221
+ self.output_attentions = config.output_attentions
222
+
223
+ self.query = keras.layers.Dense(
224
+ units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
225
+ )
226
+ self.key = keras.layers.Dense(
227
+ units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
228
+ )
229
+ self.value = keras.layers.Dense(
230
+ units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
231
+ )
232
+ self.dense = keras.layers.Dense(
233
+ units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
234
+ )
235
+ self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
236
+ # Two different dropout probabilities; see https://github.com/google-research/albert/blob/master/modeling.py#L971-L993
237
+ self.attention_dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
238
+ self.output_dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
239
+ self.config = config
240
+
241
+ def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
242
+ # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
243
+ tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
244
+
245
+ # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
246
+ return tf.transpose(tensor, perm=[0, 2, 1, 3])
247
+
248
+ def call(
249
+ self,
250
+ input_tensor: tf.Tensor,
251
+ attention_mask: tf.Tensor,
252
+ head_mask: tf.Tensor,
253
+ output_attentions: bool,
254
+ training: bool = False,
255
+ ) -> tuple[tf.Tensor]:
256
+ batch_size = shape_list(input_tensor)[0]
257
+ mixed_query_layer = self.query(inputs=input_tensor)
258
+ mixed_key_layer = self.key(inputs=input_tensor)
259
+ mixed_value_layer = self.value(inputs=input_tensor)
260
+ query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
261
+ key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
262
+ value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
263
+
264
+ # Take the dot product between "query" and "key" to get the raw attention scores.
265
+ # (batch size, num_heads, seq_len_q, seq_len_k)
266
+ attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
267
+ dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
268
+ attention_scores = tf.divide(attention_scores, dk)
269
+
270
+ if attention_mask is not None:
271
+ # Apply the attention mask (precomputed for all layers in the TFAlbertModel call() function)
272
+ attention_scores = tf.add(attention_scores, attention_mask)
273
+
274
+ # Normalize the attention scores to probabilities.
275
+ attention_probs = stable_softmax(logits=attention_scores, axis=-1)
276
+
277
+ # This is actually dropping out entire tokens to attend to, which might
278
+ # seem a bit unusual, but is taken from the original Transformer paper.
279
+ attention_probs = self.attention_dropout(inputs=attention_probs, training=training)
280
+
281
+ # Mask heads if we want to
282
+ if head_mask is not None:
283
+ attention_probs = tf.multiply(attention_probs, head_mask)
284
+
285
+ context_layer = tf.matmul(attention_probs, value_layer)
286
+ context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
287
+
288
+ # (batch_size, seq_len_q, all_head_size)
289
+ context_layer = tf.reshape(tensor=context_layer, shape=(batch_size, -1, self.all_head_size))
290
+ self_outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
291
+ hidden_states = self_outputs[0]
292
+ hidden_states = self.dense(inputs=hidden_states)
293
+ hidden_states = self.output_dropout(inputs=hidden_states, training=training)
294
+ attention_output = self.LayerNorm(inputs=hidden_states + input_tensor)
295
+
296
+ # add attentions if we output them
297
+ outputs = (attention_output,) + self_outputs[1:]
298
+
299
+ return outputs
300
+
301
+ def build(self, input_shape=None):
302
+ if self.built:
303
+ return
304
+ self.built = True
305
+ if getattr(self, "query", None) is not None:
306
+ with tf.name_scope(self.query.name):
307
+ self.query.build([None, None, self.config.hidden_size])
308
+ if getattr(self, "key", None) is not None:
309
+ with tf.name_scope(self.key.name):
310
+ self.key.build([None, None, self.config.hidden_size])
311
+ if getattr(self, "value", None) is not None:
312
+ with tf.name_scope(self.value.name):
313
+ self.value.build([None, None, self.config.hidden_size])
314
+ if getattr(self, "dense", None) is not None:
315
+ with tf.name_scope(self.dense.name):
316
+ self.dense.build([None, None, self.config.hidden_size])
317
+ if getattr(self, "LayerNorm", None) is not None:
318
+ with tf.name_scope(self.LayerNorm.name):
319
+ self.LayerNorm.build([None, None, self.config.hidden_size])
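`transpose_for_scores` and the closing reshape in `TFAlbertAttention.call` are pure shape bookkeeping; this sketch walks the same shapes end to end (head counts are illustrative):

```python
import tensorflow as tf

batch, seq, num_heads, head_size = 2, 5, 12, 64
all_head_size = num_heads * head_size
x = tf.random.normal((batch, seq, all_head_size))

# Split: (batch, seq, all_head_size) -> (batch, num_heads, seq, head_size)
heads = tf.transpose(tf.reshape(x, (batch, -1, num_heads, head_size)), perm=[0, 2, 1, 3])

# Scaled dot-product attention per head: (batch, num_heads, seq, seq)
scores = tf.matmul(heads, heads, transpose_b=True) / tf.sqrt(float(head_size))
context = tf.matmul(tf.nn.softmax(scores, axis=-1), heads)

# Merge back: (batch, num_heads, seq, head_size) -> (batch, seq, all_head_size)
merged = tf.reshape(tf.transpose(context, perm=[0, 2, 1, 3]), (batch, -1, all_head_size))
assert merged.shape == (batch, seq, all_head_size)
```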
320
+
321
+
322
+ class TFAlbertLayer(keras.layers.Layer):
323
+ def __init__(self, config: AlbertConfig, **kwargs):
324
+ super().__init__(**kwargs)
325
+
326
+ self.attention = TFAlbertAttention(config, name="attention")
327
+ self.ffn = keras.layers.Dense(
328
+ units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn"
329
+ )
330
+
331
+ if isinstance(config.hidden_act, str):
332
+ self.activation = get_tf_activation(config.hidden_act)
333
+ else:
334
+ self.activation = config.hidden_act
335
+
336
+ self.ffn_output = keras.layers.Dense(
337
+ units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn_output"
338
+ )
339
+ self.full_layer_layer_norm = keras.layers.LayerNormalization(
340
+ epsilon=config.layer_norm_eps, name="full_layer_layer_norm"
341
+ )
342
+ self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
343
+ self.config = config
344
+
345
+ def call(
346
+ self,
347
+ hidden_states: tf.Tensor,
348
+ attention_mask: tf.Tensor,
349
+ head_mask: tf.Tensor,
350
+ output_attentions: bool,
351
+ training: bool = False,
352
+ ) -> tuple[tf.Tensor]:
353
+ attention_outputs = self.attention(
354
+ input_tensor=hidden_states,
355
+ attention_mask=attention_mask,
356
+ head_mask=head_mask,
357
+ output_attentions=output_attentions,
358
+ training=training,
359
+ )
360
+ ffn_output = self.ffn(inputs=attention_outputs[0])
361
+ ffn_output = self.activation(ffn_output)
362
+ ffn_output = self.ffn_output(inputs=ffn_output)
363
+ ffn_output = self.dropout(inputs=ffn_output, training=training)
364
+ hidden_states = self.full_layer_layer_norm(inputs=ffn_output + attention_outputs[0])
365
+
366
+ # add attentions if we output them
367
+ outputs = (hidden_states,) + attention_outputs[1:]
368
+
369
+ return outputs
370
+
371
+ def build(self, input_shape=None):
372
+ if self.built:
373
+ return
374
+ self.built = True
375
+ if getattr(self, "attention", None) is not None:
376
+ with tf.name_scope(self.attention.name):
377
+ self.attention.build(None)
378
+ if getattr(self, "ffn", None) is not None:
379
+ with tf.name_scope(self.ffn.name):
380
+ self.ffn.build([None, None, self.config.hidden_size])
381
+ if getattr(self, "ffn_output", None) is not None:
382
+ with tf.name_scope(self.ffn_output.name):
383
+ self.ffn_output.build([None, None, self.config.intermediate_size])
384
+ if getattr(self, "full_layer_layer_norm", None) is not None:
385
+ with tf.name_scope(self.full_layer_layer_norm.name):
386
+ self.full_layer_layer_norm.build([None, None, self.config.hidden_size])
387
+
388
+
389
+ class TFAlbertLayerGroup(keras.layers.Layer):
390
+ def __init__(self, config: AlbertConfig, **kwargs):
391
+ super().__init__(**kwargs)
392
+
393
+ self.albert_layers = [
394
+ TFAlbertLayer(config, name=f"albert_layers_._{i}") for i in range(config.inner_group_num)
395
+ ]
396
+
397
+ def call(
398
+ self,
399
+ hidden_states: tf.Tensor,
400
+ attention_mask: tf.Tensor,
401
+ head_mask: tf.Tensor,
402
+ output_attentions: bool,
403
+ output_hidden_states: bool,
404
+ training: bool = False,
405
+ ) -> TFBaseModelOutput | tuple[tf.Tensor]:
406
+ layer_hidden_states = () if output_hidden_states else None
407
+ layer_attentions = () if output_attentions else None
408
+
409
+ for layer_index, albert_layer in enumerate(self.albert_layers):
410
+ if output_hidden_states:
411
+ layer_hidden_states = layer_hidden_states + (hidden_states,)
412
+
413
+ layer_output = albert_layer(
414
+ hidden_states=hidden_states,
415
+ attention_mask=attention_mask,
416
+ head_mask=head_mask[layer_index],
417
+ output_attentions=output_attentions,
418
+ training=training,
419
+ )
420
+ hidden_states = layer_output[0]
421
+
422
+ if output_attentions:
423
+ layer_attentions = layer_attentions + (layer_output[1],)
424
+
425
+ # Add last layer
426
+ if output_hidden_states:
427
+ layer_hidden_states = layer_hidden_states + (hidden_states,)
428
+
429
+ return tuple(v for v in [hidden_states, layer_hidden_states, layer_attentions] if v is not None)
430
+
431
+ def build(self, input_shape=None):
432
+ if self.built:
433
+ return
434
+ self.built = True
435
+ if getattr(self, "albert_layers", None) is not None:
436
+ for layer in self.albert_layers:
437
+ with tf.name_scope(layer.name):
438
+ layer.build(None)
439
+
440
+
441
+ class TFAlbertTransformer(keras.layers.Layer):
442
+ def __init__(self, config: AlbertConfig, **kwargs):
443
+ super().__init__(**kwargs)
444
+
445
+ self.num_hidden_layers = config.num_hidden_layers
446
+ self.num_hidden_groups = config.num_hidden_groups
447
+ # Number of layers in a hidden group
448
+ self.layers_per_group = int(config.num_hidden_layers / config.num_hidden_groups)
449
+ self.embedding_hidden_mapping_in = keras.layers.Dense(
450
+ units=config.hidden_size,
451
+ kernel_initializer=get_initializer(config.initializer_range),
452
+ name="embedding_hidden_mapping_in",
453
+ )
454
+ self.albert_layer_groups = [
455
+ TFAlbertLayerGroup(config, name=f"albert_layer_groups_._{i}") for i in range(config.num_hidden_groups)
456
+ ]
457
+ self.config = config
458
+
459
+ def call(
460
+ self,
461
+ hidden_states: tf.Tensor,
462
+ attention_mask: tf.Tensor,
463
+ head_mask: tf.Tensor,
464
+ output_attentions: bool,
465
+ output_hidden_states: bool,
466
+ return_dict: bool,
467
+ training: bool = False,
468
+ ) -> TFBaseModelOutput | tuple[tf.Tensor]:
469
+ hidden_states = self.embedding_hidden_mapping_in(inputs=hidden_states)
470
+ all_attentions = () if output_attentions else None
471
+ all_hidden_states = (hidden_states,) if output_hidden_states else None
472
+
473
+ for i in range(self.num_hidden_layers):
474
+ # Index of the hidden group
475
+ group_idx = int(i / (self.num_hidden_layers / self.num_hidden_groups))
476
+ layer_group_output = self.albert_layer_groups[group_idx](
477
+ hidden_states=hidden_states,
478
+ attention_mask=attention_mask,
479
+ head_mask=head_mask[group_idx * self.layers_per_group : (group_idx + 1) * self.layers_per_group],
480
+ output_attentions=output_attentions,
481
+ output_hidden_states=output_hidden_states,
482
+ training=training,
483
+ )
484
+ hidden_states = layer_group_output[0]
485
+
486
+ if output_attentions:
487
+ all_attentions = all_attentions + layer_group_output[-1]
488
+
489
+ if output_hidden_states:
490
+ all_hidden_states = all_hidden_states + (hidden_states,)
491
+
492
+ if not return_dict:
493
+ return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
494
+
495
+ return TFBaseModelOutput(
496
+ last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
497
+ )
498
+
499
+ def build(self, input_shape=None):
500
+ if self.built:
501
+ return
502
+ self.built = True
503
+ if getattr(self, "embedding_hidden_mapping_in", None) is not None:
504
+ with tf.name_scope(self.embedding_hidden_mapping_in.name):
505
+ self.embedding_hidden_mapping_in.build([None, None, self.config.embedding_size])
506
+ if getattr(self, "albert_layer_groups", None) is not None:
507
+ for layer in self.albert_layer_groups:
508
+ with tf.name_scope(layer.name):
509
+ layer.build(None)
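The `group_idx` arithmetic in `TFAlbertTransformer.call` is what implements ALBERT's cross-layer parameter sharing; a tiny sketch with an assumed non-default group count to make the mapping visible:

```python
num_hidden_layers, num_hidden_groups = 12, 3   # the default config uses 1 group
layers_per_group = num_hidden_layers // num_hidden_groups  # 4

for i in range(num_hidden_layers):
    group_idx = int(i / (num_hidden_layers / num_hidden_groups))
    # layers 0-3 -> group 0, layers 4-7 -> group 1, layers 8-11 -> group 2
    assert group_idx == i // layers_per_group
```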
510
+
511
+
512
+ class TFAlbertPreTrainedModel(TFPreTrainedModel):
513
+ """
514
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
515
+ models.
516
+ """
517
+
518
+ config_class = AlbertConfig
519
+ base_model_prefix = "albert"
520
+
521
+
522
+ class TFAlbertMLMHead(keras.layers.Layer):
523
+ def __init__(self, config: AlbertConfig, input_embeddings: keras.layers.Layer, **kwargs):
524
+ super().__init__(**kwargs)
525
+
526
+ self.config = config
527
+ self.embedding_size = config.embedding_size
528
+ self.dense = keras.layers.Dense(
529
+ config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
530
+ )
531
+ if isinstance(config.hidden_act, str):
532
+ self.activation = get_tf_activation(config.hidden_act)
533
+ else:
534
+ self.activation = config.hidden_act
535
+
536
+ self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
537
+
538
+ # The output weights are the same as the input embeddings, but there is
539
+ # an output-only bias for each token.
540
+ self.decoder = input_embeddings
541
+
542
+ def build(self, input_shape=None):
543
+ self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
544
+ self.decoder_bias = self.add_weight(
545
+ shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
546
+ )
547
+
548
+ if self.built:
549
+ return
550
+ self.built = True
551
+ if getattr(self, "dense", None) is not None:
552
+ with tf.name_scope(self.dense.name):
553
+ self.dense.build([None, None, self.config.hidden_size])
554
+ if getattr(self, "LayerNorm", None) is not None:
555
+ with tf.name_scope(self.LayerNorm.name):
556
+ self.LayerNorm.build([None, None, self.config.embedding_size])
557
+
558
+ def get_output_embeddings(self) -> keras.layers.Layer:
559
+ return self.decoder
560
+
561
+ def set_output_embeddings(self, value: tf.Variable):
562
+ self.decoder.weight = value
563
+ self.decoder.vocab_size = shape_list(value)[0]
564
+
565
+ def get_bias(self) -> dict[str, tf.Variable]:
566
+ return {"bias": self.bias, "decoder_bias": self.decoder_bias}
567
+
568
+ def set_bias(self, value: tf.Variable):
569
+ self.bias = value["bias"]
570
+ self.decoder_bias = value["decoder_bias"]
571
+ self.config.vocab_size = shape_list(value["bias"])[0]
572
+
573
+ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
574
+ hidden_states = self.dense(inputs=hidden_states)
575
+ hidden_states = self.activation(hidden_states)
576
+ hidden_states = self.LayerNorm(inputs=hidden_states)
577
+ seq_length = shape_list(tensor=hidden_states)[1]
578
+ hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
579
+ hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
580
+ hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
581
+ hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.decoder_bias)
582
+
583
+ return hidden_states
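The matmul in `TFAlbertMLMHead.call` reuses the input embedding matrix (transposed) as the decoder, which is why `hidden_states` is first projected down to `embedding_size`; a shape-only sketch under assumed ALBERT-base dimensions:

```python
import tensorflow as tf

batch, seq, embedding_size, vocab_size = 2, 5, 128, 30000
hidden_states = tf.random.normal((batch, seq, embedding_size))     # after dense + LayerNorm
embedding_matrix = tf.random.normal((vocab_size, embedding_size))  # shared with the inputs

flat = tf.reshape(hidden_states, (-1, embedding_size))
scores = tf.matmul(flat, embedding_matrix, transpose_b=True)
scores = tf.reshape(scores, (-1, seq, vocab_size))  # (batch, seq, vocab_size)
```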
584
+
585
+
586
+ @keras_serializable
587
+ class TFAlbertMainLayer(keras.layers.Layer):
588
+ config_class = AlbertConfig
589
+
590
+ def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True, **kwargs):
591
+ super().__init__(**kwargs)
592
+
593
+ self.config = config
594
+
595
+ self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
596
+ self.encoder = TFAlbertTransformer(config, name="encoder")
597
+ self.pooler = (
598
+ keras.layers.Dense(
599
+ units=config.hidden_size,
600
+ kernel_initializer=get_initializer(config.initializer_range),
601
+ activation="tanh",
602
+ name="pooler",
603
+ )
604
+ if add_pooling_layer
605
+ else None
606
+ )
607
+
608
+ def get_input_embeddings(self) -> keras.layers.Layer:
609
+ return self.embeddings
610
+
611
+ def set_input_embeddings(self, value: tf.Variable):
612
+ self.embeddings.weight = value
613
+ self.embeddings.vocab_size = shape_list(value)[0]
614
+
615
+ def _prune_heads(self, heads_to_prune):
616
+ """
617
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
618
+ class PreTrainedModel
619
+ """
620
+ raise NotImplementedError
621
+
622
+ @unpack_inputs
623
+ def call(
624
+ self,
625
+ input_ids: TFModelInputType | None = None,
626
+ attention_mask: np.ndarray | tf.Tensor | None = None,
627
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
628
+ position_ids: np.ndarray | tf.Tensor | None = None,
629
+ head_mask: np.ndarray | tf.Tensor | None = None,
630
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
631
+ output_attentions: bool | None = None,
632
+ output_hidden_states: bool | None = None,
633
+ return_dict: bool | None = None,
634
+ training: bool = False,
635
+ ) -> TFBaseModelOutputWithPooling | tuple[tf.Tensor]:
636
+ if input_ids is not None and inputs_embeds is not None:
637
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
638
+ elif input_ids is not None:
639
+ input_shape = shape_list(input_ids)
640
+ elif inputs_embeds is not None:
641
+ input_shape = shape_list(inputs_embeds)[:-1]
642
+ else:
643
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
644
+
645
+ if attention_mask is None:
646
+ attention_mask = tf.fill(dims=input_shape, value=1)
647
+
648
+ if token_type_ids is None:
649
+ token_type_ids = tf.fill(dims=input_shape, value=0)
650
+
651
+ embedding_output = self.embeddings(
652
+ input_ids=input_ids,
653
+ position_ids=position_ids,
654
+ token_type_ids=token_type_ids,
655
+ inputs_embeds=inputs_embeds,
656
+ training=training,
657
+ )
658
+
659
+ # We create a 3D attention mask from a 2D tensor mask.
660
+ # Sizes are [batch_size, 1, 1, to_seq_length]
661
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
662
+ # this attention mask is simpler than the triangular masking of causal attention
663
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
664
+ extended_attention_mask = tf.reshape(attention_mask, (input_shape[0], 1, 1, input_shape[1]))
665
+
666
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
667
+ # masked positions, this operation will create a tensor which is 0.0 for
668
+ # positions we want to attend and -10000.0 for masked positions.
669
+ # Since we are adding it to the raw scores before the softmax, this is
670
+ # effectively the same as removing these entirely.
671
+ extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype)
672
+ one_cst = tf.constant(1.0, dtype=embedding_output.dtype)
673
+ ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype)
674
+ extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst)
675
+
676
+ # Prepare head mask if needed
677
+ # 1.0 in head_mask indicate we keep the head
678
+ # attention_probs has shape bsz x n_heads x N x N
679
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
680
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
681
+ if head_mask is not None:
682
+ raise NotImplementedError
683
+ else:
684
+ head_mask = [None] * self.config.num_hidden_layers
685
+
686
+ encoder_outputs = self.encoder(
687
+ hidden_states=embedding_output,
688
+ attention_mask=extended_attention_mask,
689
+ head_mask=head_mask,
690
+ output_attentions=output_attentions,
691
+ output_hidden_states=output_hidden_states,
692
+ return_dict=return_dict,
693
+ training=training,
694
+ )
695
+
696
+ sequence_output = encoder_outputs[0]
697
+ pooled_output = self.pooler(inputs=sequence_output[:, 0]) if self.pooler is not None else None
698
+
699
+ if not return_dict:
700
+ return (
701
+ sequence_output,
702
+ pooled_output,
703
+ ) + encoder_outputs[1:]
704
+
705
+ return TFBaseModelOutputWithPooling(
706
+ last_hidden_state=sequence_output,
707
+ pooler_output=pooled_output,
708
+ hidden_states=encoder_outputs.hidden_states,
709
+ attentions=encoder_outputs.attentions,
710
+ )
711
+
712
+ def build(self, input_shape=None):
713
+ if self.built:
714
+ return
715
+ self.built = True
716
+ if getattr(self, "embeddings", None) is not None:
717
+ with tf.name_scope(self.embeddings.name):
718
+ self.embeddings.build(None)
719
+ if getattr(self, "encoder", None) is not None:
720
+ with tf.name_scope(self.encoder.name):
721
+ self.encoder.build(None)
722
+ if getattr(self, "pooler", None) is not None:
723
+ with tf.name_scope(self.pooler.name):
724
+ self.pooler.build([None, None, self.config.hidden_size])
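The additive mask construction in `TFAlbertMainLayer.call` is compact but worth seeing with concrete numbers; a sketch with a single padded sequence (values illustrative):

```python
import tensorflow as tf

attention_mask = tf.constant([[1, 1, 1, 0]], dtype=tf.float32)  # last token is padding
extended = tf.reshape(attention_mask, (1, 1, 1, 4))             # broadcastable over heads
extended = (1.0 - extended) * -10000.0
print(extended)  # [[[[-0. -0. -0. -10000.]]]] -- added to the raw attention scores
```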
725
+
726
+
727
+ @dataclass
728
+ class TFAlbertForPreTrainingOutput(ModelOutput):
729
+ """
730
+ Output type of [`TFAlbertForPreTraining`].
731
+
732
+ Args:
733
+ prediction_logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
734
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
735
+ sop_logits (`tf.Tensor` of shape `(batch_size, 2)`):
736
+ Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
737
+ before SoftMax).
738
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
739
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
740
+ `(batch_size, sequence_length, hidden_size)`.
741
+
742
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
743
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
744
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
745
+ sequence_length)`.
746
+
747
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
748
+ heads.
749
+ """
750
+
751
+ loss: tf.Tensor | None = None
752
+ prediction_logits: tf.Tensor | None = None
753
+ sop_logits: tf.Tensor | None = None
754
+ hidden_states: tuple[tf.Tensor] | None = None
755
+ attentions: tuple[tf.Tensor] | None = None
756
+
757
+
758
+ ALBERT_START_DOCSTRING = r"""
759
+
760
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
761
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
762
+ etc.)
763
+
764
+ This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
765
+ as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
766
+ behavior.
767
+
768
+ <Tip>
769
+
770
+ TensorFlow models and layers in `transformers` accept two formats as input:
771
+
772
+ - having all inputs as keyword arguments (like PyTorch models), or
773
+ - having all inputs as a list, tuple or dict in the first positional argument.
774
+
775
+ The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
776
+ and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
777
+ pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
778
+ format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
779
+ the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
780
+ positional argument:
781
+
782
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
783
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
784
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
785
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
786
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
787
+
788
+ Note that when creating models and layers with
789
+ [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
790
+ about any of this, as you can just pass inputs like you would to any other Python function!
791
+
792
+ </Tip>
793
+
794
+ Args:
795
+ config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
796
+ Initializing with a config file does not load the weights associated with the model, only the
797
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
798
+ """
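A sketch of the three Keras-friendly input formats described in the tip above (model and checkpoint choice are assumptions for illustration):

```python
from transformers import AutoTokenizer, TFAlbertModel

tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
model = TFAlbertModel.from_pretrained("albert/albert-base-v2")
enc = tokenizer("Hello", return_tensors="tf")

out_tensor = model(enc["input_ids"])                                 # single tensor
out_list = model([enc["input_ids"], enc["attention_mask"]])          # positional list
out_dict = model({"input_ids": enc["input_ids"],
                  "attention_mask": enc["attention_mask"]})          # named dict
```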
799
+
800
+ ALBERT_INPUTS_DOCSTRING = r"""
801
+ Args:
802
+ input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
803
+ Indices of input sequence tokens in the vocabulary.
804
+
805
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
806
+ [`PreTrainedTokenizer.encode`] for details.
807
+
808
+ [What are input IDs?](../glossary#input-ids)
809
+ attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
810
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
811
+
812
+ - 1 for tokens that are **not masked**,
813
+ - 0 for tokens that are **masked**.
814
+
815
+ [What are attention masks?](../glossary#attention-mask)
816
+ token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
817
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
818
+ 1]`:
819
+
820
+ - 0 corresponds to a *sentence A* token,
821
+ - 1 corresponds to a *sentence B* token.
822
+
823
+ [What are token type IDs?](../glossary#token-type-ids)
824
+ position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
825
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
826
+ config.max_position_embeddings - 1]`.
827
+
828
+ [What are position IDs?](../glossary#position-ids)
829
+ head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
830
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
831
+
832
+ - 1 indicates the head is **not masked**,
833
+ - 0 indicates the head is **masked**.
834
+
835
+ inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
836
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
837
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
838
+ model's internal embedding lookup matrix.
839
+ output_attentions (`bool`, *optional*):
840
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
841
+ tensors for more detail. This argument can be used only in eager mode; in graph mode, the value in the
842
+ config will be used instead.
843
+ output_hidden_states (`bool`, *optional*):
844
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
845
+ more detail. This argument can be used only in eager mode; in graph mode, the value in the config will be
846
+ used instead.
847
+ return_dict (`bool`, *optional*):
848
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
849
+ eager mode; in graph mode, the value will always be set to True.
850
+ training (`bool`, *optional*, defaults to `False`):
851
+ Whether or not to use the model in training mode (some modules like dropout modules have different
852
+ behaviors between training and evaluation).
853
+ """
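The tokenizer produces `attention_mask` and `token_type_ids` exactly as the docstring above defines them; a short sketch with a padded sentence pair (lengths are illustrative):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
enc = tokenizer("first segment", "second segment",
                padding="max_length", max_length=16, return_tensors="np")
print(enc["attention_mask"])  # 1 = real token, 0 = padding
print(enc["token_type_ids"])  # 0 = sentence A tokens, 1 = sentence B tokens
```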
854
+
855
+
856
+ @add_start_docstrings(
857
+ "The bare Albert Model transformer outputting raw hidden-states without any specific head on top.",
858
+ ALBERT_START_DOCSTRING,
859
+ )
860
+ class TFAlbertModel(TFAlbertPreTrainedModel):
861
+ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
862
+ super().__init__(config, *inputs, **kwargs)
863
+
864
+ self.albert = TFAlbertMainLayer(config, name="albert")
865
+
866
+ @unpack_inputs
867
+ @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
868
+ @add_code_sample_docstrings(
869
+ checkpoint=_CHECKPOINT_FOR_DOC,
870
+ output_type=TFBaseModelOutputWithPooling,
871
+ config_class=_CONFIG_FOR_DOC,
872
+ )
873
+ def call(
874
+ self,
875
+ input_ids: TFModelInputType | None = None,
876
+ attention_mask: np.ndarray | tf.Tensor | None = None,
877
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
878
+ position_ids: np.ndarray | tf.Tensor | None = None,
879
+ head_mask: np.ndarray | tf.Tensor | None = None,
880
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
881
+ output_attentions: bool | None = None,
882
+ output_hidden_states: bool | None = None,
883
+ return_dict: bool | None = None,
884
+ training: bool | None = False,
885
+ ) -> TFBaseModelOutputWithPooling | tuple[tf.Tensor]:
886
+ outputs = self.albert(
887
+ input_ids=input_ids,
888
+ attention_mask=attention_mask,
889
+ token_type_ids=token_type_ids,
890
+ position_ids=position_ids,
891
+ head_mask=head_mask,
892
+ inputs_embeds=inputs_embeds,
893
+ output_attentions=output_attentions,
894
+ output_hidden_states=output_hidden_states,
895
+ return_dict=return_dict,
896
+ training=training,
897
+ )
898
+
899
+ return outputs
900
+
901
+ def build(self, input_shape=None):
902
+ if self.built:
903
+ return
904
+ self.built = True
905
+ if getattr(self, "albert", None) is not None:
906
+ with tf.name_scope(self.albert.name):
907
+ self.albert.build(None)
908
+
909
+
910
+ @add_start_docstrings(
911
+ """
912
+ Albert Model with two heads on top for pretraining: a `masked language modeling` head and a `sentence order
913
+ prediction` (classification) head.
914
+ """,
915
+ ALBERT_START_DOCSTRING,
916
+ )
917
+ class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss):
918
+ # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
919
+ _keys_to_ignore_on_load_unexpected = [r"predictions.decoder.weight"]
920
+
921
+ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
922
+ super().__init__(config, *inputs, **kwargs)
923
+
924
+ self.num_labels = config.num_labels
925
+
926
+ self.albert = TFAlbertMainLayer(config, name="albert")
927
+ self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions")
928
+ self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier")
929
+
930
+ def get_lm_head(self) -> keras.layers.Layer:
931
+ return self.predictions
932
+
933
+ @unpack_inputs
934
+ @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
935
+ @replace_return_docstrings(output_type=TFAlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
936
+ def call(
937
+ self,
938
+ input_ids: TFModelInputType | None = None,
939
+ attention_mask: np.ndarray | tf.Tensor | None = None,
940
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
941
+ position_ids: np.ndarray | tf.Tensor | None = None,
942
+ head_mask: np.ndarray | tf.Tensor | None = None,
943
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
944
+ output_attentions: bool | None = None,
945
+ output_hidden_states: bool | None = None,
946
+ return_dict: bool | None = None,
947
+ labels: np.ndarray | tf.Tensor | None = None,
948
+ sentence_order_label: np.ndarray | tf.Tensor | None = None,
949
+ training: bool | None = False,
950
+ ) -> TFAlbertForPreTrainingOutput | tuple[tf.Tensor]:
951
+ r"""
952
+ Returns:
953
+
954
+ Example:
955
+
956
+ ```python
957
+ >>> import tensorflow as tf
958
+ >>> from transformers import AutoTokenizer, TFAlbertForPreTraining
959
+
960
+ >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
961
+ >>> model = TFAlbertForPreTraining.from_pretrained("albert/albert-base-v2")
962
+
963
+ >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]
964
+ >>> # Batch size 1
965
+ >>> outputs = model(input_ids)
966
+
967
+ >>> prediction_logits = outputs.prediction_logits
968
+ >>> sop_logits = outputs.sop_logits
969
+ ```"""
970
+
971
+ outputs = self.albert(
972
+ input_ids=input_ids,
973
+ attention_mask=attention_mask,
974
+ token_type_ids=token_type_ids,
975
+ position_ids=position_ids,
976
+ head_mask=head_mask,
977
+ inputs_embeds=inputs_embeds,
978
+ output_attentions=output_attentions,
979
+ output_hidden_states=output_hidden_states,
980
+ return_dict=return_dict,
981
+ training=training,
982
+ )
983
+ sequence_output, pooled_output = outputs[:2]
984
+ prediction_scores = self.predictions(hidden_states=sequence_output)
985
+ sop_scores = self.sop_classifier(pooled_output=pooled_output, training=training)
986
+ total_loss = None
987
+
988
+ if labels is not None and sentence_order_label is not None:
989
+ d_labels = {"labels": labels}
990
+ d_labels["sentence_order_label"] = sentence_order_label
991
+ total_loss = self.hf_compute_loss(labels=d_labels, logits=(prediction_scores, sop_scores))
992
+
993
+ if not return_dict:
994
+ output = (prediction_scores, sop_scores) + outputs[2:]
995
+ return ((total_loss,) + output) if total_loss is not None else output
996
+
997
+ return TFAlbertForPreTrainingOutput(
998
+ loss=total_loss,
999
+ prediction_logits=prediction_scores,
1000
+ sop_logits=sop_scores,
1001
+ hidden_states=outputs.hidden_states,
1002
+ attentions=outputs.attentions,
1003
+ )
1004
+
1005
+ def build(self, input_shape=None):
1006
+ if self.built:
1007
+ return
1008
+ self.built = True
1009
+ if getattr(self, "albert", None) is not None:
1010
+ with tf.name_scope(self.albert.name):
1011
+ self.albert.build(None)
1012
+ if getattr(self, "predictions", None) is not None:
1013
+ with tf.name_scope(self.predictions.name):
1014
+ self.predictions.build(None)
1015
+ if getattr(self, "sop_classifier", None) is not None:
1016
+ with tf.name_scope(self.sop_classifier.name):
1017
+ self.sop_classifier.build(None)
1018
+
1019
+
1020
+ class TFAlbertSOPHead(keras.layers.Layer):
1021
+ def __init__(self, config: AlbertConfig, **kwargs):
1022
+ super().__init__(**kwargs)
1023
+
1024
+ self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob)
1025
+ self.classifier = keras.layers.Dense(
1026
+ units=config.num_labels,
1027
+ kernel_initializer=get_initializer(config.initializer_range),
1028
+ name="classifier",
1029
+ )
1030
+ self.config = config
1031
+
1032
+ def call(self, pooled_output: tf.Tensor, training: bool) -> tf.Tensor:
1033
+ dropout_pooled_output = self.dropout(inputs=pooled_output, training=training)
1034
+ logits = self.classifier(inputs=dropout_pooled_output)
1035
+
1036
+ return logits
1037
+
1038
+ def build(self, input_shape=None):
1039
+ if self.built:
1040
+ return
1041
+ self.built = True
1042
+ if getattr(self, "classifier", None) is not None:
1043
+ with tf.name_scope(self.classifier.name):
1044
+ self.classifier.build([None, None, self.config.hidden_size])
1045
+
1046
+
1047
+ @add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING)
1048
+ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):
1049
+ # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
1050
+ _keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions.decoder.weight"]
1051
+
1052
+ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
1053
+ super().__init__(config, *inputs, **kwargs)
1054
+
1055
+ self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
1056
+ self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions")
1057
+
1058
+ def get_lm_head(self) -> keras.layers.Layer:
1059
+ return self.predictions
1060
+
1061
+ @unpack_inputs
1062
+ @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1063
+ @replace_return_docstrings(output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
1064
+ def call(
1065
+ self,
1066
+ input_ids: TFModelInputType | None = None,
1067
+ attention_mask: np.ndarray | tf.Tensor | None = None,
1068
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
1069
+ position_ids: np.ndarray | tf.Tensor | None = None,
1070
+ head_mask: np.ndarray | tf.Tensor | None = None,
1071
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
1072
+ output_attentions: bool | None = None,
1073
+ output_hidden_states: bool | None = None,
1074
+ return_dict: bool | None = None,
1075
+ labels: np.ndarray | tf.Tensor | None = None,
1076
+ training: bool | None = False,
1077
+ ) -> TFMaskedLMOutput | tuple[tf.Tensor]:
1078
+ r"""
1079
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1080
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
1081
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
1082
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
1083
+
1084
+ Returns:
1085
+
1086
+ Example:
1087
+
1088
+ ```python
1089
+ >>> import tensorflow as tf
1090
+ >>> from transformers import AutoTokenizer, TFAlbertForMaskedLM
1091
+
1092
+ >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
1093
+ >>> model = TFAlbertForMaskedLM.from_pretrained("albert/albert-base-v2")
1094
+
1095
+ >>> # add mask_token
1096
+ >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="tf")
1097
+ >>> logits = model(**inputs).logits
1098
+
1099
+ >>> # retrieve index of [MASK]
1100
+ >>> mask_token_index = tf.where(inputs.input_ids == tokenizer.mask_token_id)[0][1]
1101
+ >>> predicted_token_id = tf.math.argmax(logits[0, mask_token_index], axis=-1)
1102
+ >>> tokenizer.decode(predicted_token_id)
1103
+ 'france'
1104
+ ```
1105
+
1106
+ ```python
1107
+ >>> labels = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]
1108
+ >>> labels = tf.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
1109
+ >>> outputs = model(**inputs, labels=labels)
1110
+ >>> round(float(outputs.loss), 2)
1111
+ 0.81
1112
+ ```
1113
+ """
1114
+ outputs = self.albert(
1115
+ input_ids=input_ids,
1116
+ attention_mask=attention_mask,
1117
+ token_type_ids=token_type_ids,
1118
+ position_ids=position_ids,
1119
+ head_mask=head_mask,
1120
+ inputs_embeds=inputs_embeds,
1121
+ output_attentions=output_attentions,
1122
+ output_hidden_states=output_hidden_states,
1123
+ return_dict=return_dict,
1124
+ training=training,
1125
+ )
1126
+ sequence_output = outputs[0]
1127
+ prediction_scores = self.predictions(hidden_states=sequence_output, training=training)
1128
+ loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)
1129
+
1130
+ if not return_dict:
1131
+ output = (prediction_scores,) + outputs[2:]
1132
+
1133
+ return ((loss,) + output) if loss is not None else output
1134
+
1135
+ return TFMaskedLMOutput(
1136
+ loss=loss,
1137
+ logits=prediction_scores,
1138
+ hidden_states=outputs.hidden_states,
1139
+ attentions=outputs.attentions,
1140
+ )
1141
+
1142
+ def build(self, input_shape=None):
1143
+ if self.built:
1144
+ return
1145
+ self.built = True
1146
+ if getattr(self, "albert", None) is not None:
1147
+ with tf.name_scope(self.albert.name):
1148
+ self.albert.build(None)
1149
+ if getattr(self, "predictions", None) is not None:
1150
+ with tf.name_scope(self.predictions.name):
1151
+ self.predictions.build(None)
1152
+
1153
+
1154
+ @add_start_docstrings(
1155
+ """
1156
+ Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
1157
+ output) e.g. for GLUE tasks.
1158
+ """,
1159
+ ALBERT_START_DOCSTRING,
1160
+ )
1161
+ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClassificationLoss):
1162
+ # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
1163
+ _keys_to_ignore_on_load_unexpected = [r"predictions"]
1164
+ _keys_to_ignore_on_load_missing = [r"dropout"]
1165
+
1166
+ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
1167
+ super().__init__(config, *inputs, **kwargs)
1168
+
1169
+ self.num_labels = config.num_labels
1170
+
1171
+ self.albert = TFAlbertMainLayer(config, name="albert")
1172
+ self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob)
1173
+ self.classifier = keras.layers.Dense(
1174
+ units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
1175
+ )
1176
+ self.config = config
1177
+
1178
+ @unpack_inputs
1179
+ @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1180
+ @add_code_sample_docstrings(
1181
+ checkpoint="vumichien/albert-base-v2-imdb",
1182
+ output_type=TFSequenceClassifierOutput,
1183
+ config_class=_CONFIG_FOR_DOC,
1184
+ expected_output="'LABEL_1'",
1185
+ expected_loss=0.12,
1186
+ )
1187
+ def call(
1188
+ self,
1189
+ input_ids: TFModelInputType | None = None,
1190
+ attention_mask: np.ndarray | tf.Tensor | None = None,
1191
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
1192
+ position_ids: np.ndarray | tf.Tensor | None = None,
1193
+ head_mask: np.ndarray | tf.Tensor | None = None,
1194
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
1195
+ output_attentions: bool | None = None,
1196
+ output_hidden_states: bool | None = None,
1197
+ return_dict: bool | None = None,
1198
+ labels: np.ndarray | tf.Tensor | None = None,
1199
+ training: bool | None = False,
1200
+ ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]:
1201
+ r"""
1202
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
1203
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1204
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
1205
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1206
+ """
1207
+ outputs = self.albert(
1208
+ input_ids=input_ids,
1209
+ attention_mask=attention_mask,
1210
+ token_type_ids=token_type_ids,
1211
+ position_ids=position_ids,
1212
+ head_mask=head_mask,
1213
+ inputs_embeds=inputs_embeds,
1214
+ output_attentions=output_attentions,
1215
+ output_hidden_states=output_hidden_states,
1216
+ return_dict=return_dict,
1217
+ training=training,
1218
+ )
1219
+ pooled_output = outputs[1]
1220
+ pooled_output = self.dropout(inputs=pooled_output, training=training)
1221
+ logits = self.classifier(inputs=pooled_output)
1222
+ loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
1223
+
1224
+ if not return_dict:
1225
+ output = (logits,) + outputs[2:]
1226
+
1227
+ return ((loss,) + output) if loss is not None else output
1228
+
1229
+ return TFSequenceClassifierOutput(
1230
+ loss=loss,
1231
+ logits=logits,
1232
+ hidden_states=outputs.hidden_states,
1233
+ attentions=outputs.attentions,
1234
+ )
1235
+
1236
+ def build(self, input_shape=None):
1237
+ if self.built:
1238
+ return
1239
+ self.built = True
1240
+ if getattr(self, "albert", None) is not None:
1241
+ with tf.name_scope(self.albert.name):
1242
+ self.albert.build(None)
1243
+ if getattr(self, "classifier", None) is not None:
1244
+ with tf.name_scope(self.classifier.name):
1245
+ self.classifier.build([None, None, self.config.hidden_size])
1246
+
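+ # Usage sketch for the sequence-classification head above, assuming the public
+ # "albert/albert-base-v2" checkpoint; `num_labels=2` is an arbitrary choice for the example.
+ #
+ #   import tensorflow as tf
+ #   from transformers import AutoTokenizer, TFAlbertForSequenceClassification
+ #
+ #   tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
+ #   model = TFAlbertForSequenceClassification.from_pretrained("albert/albert-base-v2", num_labels=2)
+ #   inputs = tokenizer("A surprisingly delightful film.", return_tensors="tf")
+ #   logits = model(**inputs).logits                             # shape: (1, 2)
+ #   predicted_class = int(tf.math.argmax(logits, axis=-1)[0])   # index into model.config.id2label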
1247
+
1248
+ @add_start_docstrings(
1249
+ """
1250
+ Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
1251
+ Named-Entity-Recognition (NER) tasks.
1252
+ """,
1253
+ ALBERT_START_DOCSTRING,
1254
+ )
1255
+ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificationLoss):
1256
+ # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
1257
+ _keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions"]
1258
+ _keys_to_ignore_on_load_missing = [r"dropout"]
1259
+
1260
+ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
1261
+ super().__init__(config, *inputs, **kwargs)
1262
+
1263
+ self.num_labels = config.num_labels
1264
+
1265
+ self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
1266
+ classifier_dropout_prob = (
1267
+ config.classifier_dropout_prob
1268
+ if config.classifier_dropout_prob is not None
1269
+ else config.hidden_dropout_prob
1270
+ )
1271
+ self.dropout = keras.layers.Dropout(rate=classifier_dropout_prob)
1272
+ self.classifier = keras.layers.Dense(
1273
+ units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
1274
+ )
1275
+ self.config = config
1276
+
1277
+ @unpack_inputs
1278
+ @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1279
+ @add_code_sample_docstrings(
1280
+ checkpoint=_CHECKPOINT_FOR_DOC,
1281
+ output_type=TFTokenClassifierOutput,
1282
+ config_class=_CONFIG_FOR_DOC,
1283
+ )
1284
+ def call(
1285
+ self,
1286
+ input_ids: TFModelInputType | None = None,
1287
+ attention_mask: np.ndarray | tf.Tensor | None = None,
1288
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
1289
+ position_ids: np.ndarray | tf.Tensor | None = None,
1290
+ head_mask: np.ndarray | tf.Tensor | None = None,
1291
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
1292
+ output_attentions: bool | None = None,
1293
+ output_hidden_states: bool | None = None,
1294
+ return_dict: bool | None = None,
1295
+ labels: np.ndarray | tf.Tensor | None = None,
1296
+ training: bool | None = False,
1297
+ ) -> TFTokenClassifierOutput | tuple[tf.Tensor]:
1298
+ r"""
1299
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1300
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
1301
+ """
1302
+ outputs = self.albert(
1303
+ input_ids=input_ids,
1304
+ attention_mask=attention_mask,
1305
+ token_type_ids=token_type_ids,
1306
+ position_ids=position_ids,
1307
+ head_mask=head_mask,
1308
+ inputs_embeds=inputs_embeds,
1309
+ output_attentions=output_attentions,
1310
+ output_hidden_states=output_hidden_states,
1311
+ return_dict=return_dict,
1312
+ training=training,
1313
+ )
1314
+ sequence_output = outputs[0]
1315
+ sequence_output = self.dropout(inputs=sequence_output, training=training)
1316
+ logits = self.classifier(inputs=sequence_output)
1317
+ loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
1318
+
1319
+ if not return_dict:
1320
+ output = (logits,) + outputs[2:]
1321
+
1322
+ return ((loss,) + output) if loss is not None else output
1323
+
1324
+ return TFTokenClassifierOutput(
1325
+ loss=loss,
1326
+ logits=logits,
1327
+ hidden_states=outputs.hidden_states,
1328
+ attentions=outputs.attentions,
1329
+ )
1330
+
1331
+ def build(self, input_shape=None):
1332
+ if self.built:
1333
+ return
1334
+ self.built = True
1335
+ if getattr(self, "albert", None) is not None:
1336
+ with tf.name_scope(self.albert.name):
1337
+ self.albert.build(None)
1338
+ if getattr(self, "classifier", None) is not None:
1339
+ with tf.name_scope(self.classifier.name):
1340
+ self.classifier.build([None, None, self.config.hidden_size])
1341
+
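+ # Usage sketch: the token-classification head above emits one logit vector per token, so
+ # `logits` has shape (batch_size, sequence_length, num_labels). The checkpoint name and
+ # `num_labels=9` (a CoNLL-style NER label set) are assumptions for the example.
+ #
+ #   import tensorflow as tf
+ #   from transformers import AutoTokenizer, TFAlbertForTokenClassification
+ #
+ #   tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
+ #   model = TFAlbertForTokenClassification.from_pretrained("albert/albert-base-v2", num_labels=9)
+ #   inputs = tokenizer("Jim lives in Paris", return_tensors="tf")
+ #   per_token_ids = tf.math.argmax(model(**inputs).logits, axis=-1)  # (1, sequence_length)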
1342
+
1343
+ @add_start_docstrings(
1344
+ """
1345
+ Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
1346
+ layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
1347
+ """,
1348
+ ALBERT_START_DOCSTRING,
1349
+ )
1350
+ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringLoss):
1351
+ # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
1352
+ _keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions"]
1353
+
1354
+ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
1355
+ super().__init__(config, *inputs, **kwargs)
1356
+
1357
+ self.num_labels = config.num_labels
1358
+
1359
+ self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
1360
+ self.qa_outputs = keras.layers.Dense(
1361
+ units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
1362
+ )
1363
+ self.config = config
1364
+
1365
+ @unpack_inputs
1366
+ @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1367
+ @add_code_sample_docstrings(
1368
+ checkpoint="vumichien/albert-base-v2-squad2",
1369
+ output_type=TFQuestionAnsweringModelOutput,
1370
+ config_class=_CONFIG_FOR_DOC,
1371
+ qa_target_start_index=12,
1372
+ qa_target_end_index=13,
1373
+ expected_output="'a nice puppet'",
1374
+ expected_loss=7.36,
1375
+ )
1376
+ def call(
1377
+ self,
1378
+ input_ids: TFModelInputType | None = None,
1379
+ attention_mask: np.ndarray | tf.Tensor | None = None,
1380
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
1381
+ position_ids: np.ndarray | tf.Tensor | None = None,
1382
+ head_mask: np.ndarray | tf.Tensor | None = None,
1383
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
1384
+ output_attentions: bool | None = None,
1385
+ output_hidden_states: bool | None = None,
1386
+ return_dict: bool | None = None,
1387
+ start_positions: np.ndarray | tf.Tensor | None = None,
1388
+ end_positions: np.ndarray | tf.Tensor | None = None,
1389
+ training: bool | None = False,
1390
+ ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]:
1391
+ r"""
1392
+ start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
1393
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1394
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1395
+ are not taken into account for computing the loss.
1396
+ end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
1397
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1398
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1399
+ are not taken into account for computing the loss.
1400
+ """
1401
+ outputs = self.albert(
1402
+ input_ids=input_ids,
1403
+ attention_mask=attention_mask,
1404
+ token_type_ids=token_type_ids,
1405
+ position_ids=position_ids,
1406
+ head_mask=head_mask,
1407
+ inputs_embeds=inputs_embeds,
1408
+ output_attentions=output_attentions,
1409
+ output_hidden_states=output_hidden_states,
1410
+ return_dict=return_dict,
1411
+ training=training,
1412
+ )
1413
+ sequence_output = outputs[0]
1414
+ logits = self.qa_outputs(inputs=sequence_output)
1415
+ start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
1416
+ start_logits = tf.squeeze(input=start_logits, axis=-1)
1417
+ end_logits = tf.squeeze(input=end_logits, axis=-1)
1418
+ loss = None
1419
+
1420
+ if start_positions is not None and end_positions is not None:
1421
+ labels = {"start_position": start_positions}
1422
+ labels["end_position"] = end_positions
1423
+ loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))
1424
+
1425
+ if not return_dict:
1426
+ output = (start_logits, end_logits) + outputs[2:]
1427
+
1428
+ return ((loss,) + output) if loss is not None else output
1429
+
1430
+ return TFQuestionAnsweringModelOutput(
1431
+ loss=loss,
1432
+ start_logits=start_logits,
1433
+ end_logits=end_logits,
1434
+ hidden_states=outputs.hidden_states,
1435
+ attentions=outputs.attentions,
1436
+ )
1437
+
1438
+ def build(self, input_shape=None):
1439
+ if self.built:
1440
+ return
1441
+ self.built = True
1442
+ if getattr(self, "albert", None) is not None:
1443
+ with tf.name_scope(self.albert.name):
1444
+ self.albert.build(None)
1445
+ if getattr(self, "qa_outputs", None) is not None:
1446
+ with tf.name_scope(self.qa_outputs.name):
1447
+ self.qa_outputs.build([None, None, self.config.hidden_size])
1448
+
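+ # Usage sketch: the answer span is recovered from the argmax of the start and end logits
+ # produced by the class above. The checkpoint is the one referenced in its docstring; the
+ # question/context pair is an illustrative assumption.
+ #
+ #   import tensorflow as tf
+ #   from transformers import AutoTokenizer, TFAlbertForQuestionAnswering
+ #
+ #   tokenizer = AutoTokenizer.from_pretrained("vumichien/albert-base-v2-squad2")
+ #   model = TFAlbertForQuestionAnswering.from_pretrained("vumichien/albert-base-v2-squad2")
+ #   inputs = tokenizer("Who wrote the play?", "The play was written by Shakespeare.", return_tensors="tf")
+ #   outputs = model(**inputs)
+ #   start = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
+ #   end = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])
+ #   answer = tokenizer.decode(inputs["input_ids"][0, start : end + 1])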
1449
+
1450
+ @add_start_docstrings(
1451
+ """
1452
+ Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
1453
+ softmax) e.g. for RocStories/SWAG tasks.
1454
+ """,
1455
+ ALBERT_START_DOCSTRING,
1456
+ )
1457
+ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
1458
+ # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
1459
+ _keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions"]
1460
+ _keys_to_ignore_on_load_missing = [r"dropout"]
1461
+
1462
+ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
1463
+ super().__init__(config, *inputs, **kwargs)
1464
+
1465
+ self.albert = TFAlbertMainLayer(config, name="albert")
1466
+ self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
1467
+ self.classifier = keras.layers.Dense(
1468
+ units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
1469
+ )
1470
+ self.config = config
1471
+
1472
+ @unpack_inputs
1473
+ @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
1474
+ @add_code_sample_docstrings(
1475
+ checkpoint=_CHECKPOINT_FOR_DOC,
1476
+ output_type=TFMultipleChoiceModelOutput,
1477
+ config_class=_CONFIG_FOR_DOC,
1478
+ )
1479
+ def call(
1480
+ self,
1481
+ input_ids: TFModelInputType | None = None,
1482
+ attention_mask: np.ndarray | tf.Tensor | None = None,
1483
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
1484
+ position_ids: np.ndarray | tf.Tensor | None = None,
1485
+ head_mask: np.ndarray | tf.Tensor | None = None,
1486
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
1487
+ output_attentions: bool | None = None,
1488
+ output_hidden_states: bool | None = None,
1489
+ return_dict: bool | None = None,
1490
+ labels: np.ndarray | tf.Tensor | None = None,
1491
+ training: bool | None = False,
1492
+ ) -> TFMultipleChoiceModelOutput | tuple[tf.Tensor]:
1493
+ r"""
1494
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
1495
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices - 1]`
1496
+ where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
1497
+ """
1498
+
1499
+ if input_ids is not None:
1500
+ num_choices = shape_list(input_ids)[1]
1501
+ seq_length = shape_list(input_ids)[2]
1502
+ else:
1503
+ num_choices = shape_list(inputs_embeds)[1]
1504
+ seq_length = shape_list(inputs_embeds)[2]
1505
+
1506
+ flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
1507
+ flat_attention_mask = (
1508
+ tf.reshape(tensor=attention_mask, shape=(-1, seq_length)) if attention_mask is not None else None
1509
+ )
1510
+ flat_token_type_ids = (
1511
+ tf.reshape(tensor=token_type_ids, shape=(-1, seq_length)) if token_type_ids is not None else None
1512
+ )
1513
+ flat_position_ids = (
1514
+ tf.reshape(tensor=position_ids, shape=(-1, seq_length)) if position_ids is not None else None
1515
+ )
1516
+ flat_inputs_embeds = (
1517
+ tf.reshape(tensor=inputs_embeds, shape=(-1, seq_length, shape_list(inputs_embeds)[3]))
1518
+ if inputs_embeds is not None
1519
+ else None
1520
+ )
1521
+ outputs = self.albert(
1522
+ input_ids=flat_input_ids,
1523
+ attention_mask=flat_attention_mask,
1524
+ token_type_ids=flat_token_type_ids,
1525
+ position_ids=flat_position_ids,
1526
+ head_mask=head_mask,
1527
+ inputs_embeds=flat_inputs_embeds,
1528
+ output_attentions=output_attentions,
1529
+ output_hidden_states=output_hidden_states,
1530
+ return_dict=return_dict,
1531
+ training=training,
1532
+ )
1533
+ pooled_output = outputs[1]
1534
+ pooled_output = self.dropout(inputs=pooled_output, training=training)
1535
+ logits = self.classifier(inputs=pooled_output)
1536
+ reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices))
1537
+ loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=reshaped_logits)
1538
+
1539
+ if not return_dict:
1540
+ output = (reshaped_logits,) + outputs[2:]
1541
+ return ((loss,) + output) if loss is not None else output
1542
+
1543
+ return TFMultipleChoiceModelOutput(
1544
+ loss=loss,
1545
+ logits=reshaped_logits,
1546
+ hidden_states=outputs.hidden_states,
1547
+ attentions=outputs.attentions,
1548
+ )
1549
+
1550
+ def build(self, input_shape=None):
1551
+ if self.built:
1552
+ return
1553
+ self.built = True
1554
+ if getattr(self, "albert", None) is not None:
1555
+ with tf.name_scope(self.albert.name):
1556
+ self.albert.build(None)
1557
+ if getattr(self, "classifier", None) is not None:
1558
+ with tf.name_scope(self.classifier.name):
1559
+ self.classifier.build([None, None, self.config.hidden_size])
1560
+
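+ # Usage sketch: inputs to the class above are shaped (batch_size, num_choices, sequence_length);
+ # the call flattens them to (batch_size * num_choices, sequence_length) and reshapes the
+ # single-unit classifier output back to (batch_size, num_choices). The checkpoint and prompt
+ # text are assumptions for the example.
+ #
+ #   import tensorflow as tf
+ #   from transformers import AutoTokenizer, TFAlbertForMultipleChoice
+ #
+ #   tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
+ #   model = TFAlbertForMultipleChoice.from_pretrained("albert/albert-base-v2")
+ #   prompt = "The cat sat on"
+ #   choices = ["the mat.", "the moon."]
+ #   enc = tokenizer([prompt, prompt], choices, return_tensors="tf", padding=True)
+ #   inputs = {k: tf.expand_dims(v, 0) for k, v in enc.items()}  # add the num_choices axis
+ #   best_choice = int(tf.math.argmax(model(**inputs).logits, axis=-1)[0])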
1561
+
1562
+ __all__ = [
1563
+ "TFAlbertPreTrainedModel",
1564
+ "TFAlbertModel",
1565
+ "TFAlbertForPreTraining",
1566
+ "TFAlbertForMaskedLM",
1567
+ "TFAlbertForSequenceClassification",
1568
+ "TFAlbertForTokenClassification",
1569
+ "TFAlbertForQuestionAnswering",
1570
+ "TFAlbertForMultipleChoice",
1571
+ "TFAlbertMainLayer",
1572
+ ]
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/albert/tokenization_albert.py ADDED
@@ -0,0 +1,320 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for ALBERT model."""
16
+
17
+ import os
18
+ import unicodedata
19
+ from shutil import copyfile
20
+ from typing import Any, Optional
21
+
22
+ import sentencepiece as spm
23
+
24
+ from ...tokenization_utils import AddedToken, PreTrainedTokenizer
25
+ from ...utils import logging
26
+ from ...utils.import_utils import requires
27
+
28
+
29
+ logger = logging.get_logger(__name__)
30
+ VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
31
+
32
+
33
+ SPIECE_UNDERLINE = "▁"
34
+
35
+
36
+ @requires(backends=("sentencepiece",))
37
+ class AlbertTokenizer(PreTrainedTokenizer):
38
+ """
39
+ Construct an ALBERT tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
40
+
41
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
42
+ this superclass for more information regarding those methods.
43
+
44
+ Args:
45
+ vocab_file (`str`):
46
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
47
+ contains the vocabulary necessary to instantiate a tokenizer.
48
+ do_lower_case (`bool`, *optional*, defaults to `True`):
49
+ Whether or not to lowercase the input when tokenizing.
50
+ remove_space (`bool`, *optional*, defaults to `True`):
51
+ Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
52
+ keep_accents (`bool`, *optional*, defaults to `False`):
53
+ Whether or not to keep accents when tokenizing.
54
+ bos_token (`str`, *optional*, defaults to `"[CLS]"`):
55
+ The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
56
+
57
+ <Tip>
58
+
59
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
60
+ sequence. The token used is the `cls_token`.
61
+
62
+ </Tip>
63
+
64
+ eos_token (`str`, *optional*, defaults to `"[SEP]"`):
65
+ The end of sequence token.
66
+
67
+ <Tip>
68
+
69
+ When building a sequence using special tokens, this is not the token that is used for the end of sequence.
70
+ The token used is the `sep_token`.
71
+
72
+ </Tip>
73
+
74
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
75
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
76
+ token instead.
77
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
78
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
79
+ sequence classification or for a text and a question for question answering. It is also used as the last
80
+ token of a sequence built with special tokens.
81
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
82
+ The token used for padding, for example when batching sequences of different lengths.
83
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
84
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
85
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
86
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
87
+ The token used for masking values. This is the token used when training this model with masked language
88
+ modeling. This is the token which the model will try to predict.
89
+ sp_model_kwargs (`dict`, *optional*):
90
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
91
+ SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
92
+ to set:
93
+
94
+ - `enable_sampling`: Enable subword regularization.
95
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
96
+
97
+ - `nbest_size = {0,1}`: No sampling is performed.
98
+ - `nbest_size > 1`: samples from the nbest_size results.
99
+ - `nbest_size < 0`: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
100
+ using the forward-filtering-and-backward-sampling algorithm.
101
+
102
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
103
+ BPE-dropout.
104
+
105
+ Attributes:
106
+ sp_model (`SentencePieceProcessor`):
107
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
108
+ """
109
+
110
+ vocab_files_names = VOCAB_FILES_NAMES
111
+
112
+ def __init__(
113
+ self,
114
+ vocab_file,
115
+ do_lower_case=True,
116
+ remove_space=True,
117
+ keep_accents=False,
118
+ bos_token="[CLS]",
119
+ eos_token="[SEP]",
120
+ unk_token="<unk>",
121
+ sep_token="[SEP]",
122
+ pad_token="<pad>",
123
+ cls_token="[CLS]",
124
+ mask_token="[MASK]",
125
+ sp_model_kwargs: Optional[dict[str, Any]] = None,
126
+ **kwargs,
127
+ ) -> None:
128
+ # The mask token behaves like a normal word, i.e. it includes the space before it and
129
+ # is included in the raw text; there should be a match in a non-normalized sentence.
130
+ mask_token = (
131
+ AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
132
+ if isinstance(mask_token, str)
133
+ else mask_token
134
+ )
135
+
136
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
137
+
138
+ self.do_lower_case = do_lower_case
139
+ self.remove_space = remove_space
140
+ self.keep_accents = keep_accents
141
+ self.vocab_file = vocab_file
142
+
143
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
144
+ self.sp_model.Load(vocab_file)
145
+
146
+ super().__init__(
147
+ do_lower_case=do_lower_case,
148
+ remove_space=remove_space,
149
+ keep_accents=keep_accents,
150
+ bos_token=bos_token,
151
+ eos_token=eos_token,
152
+ unk_token=unk_token,
153
+ sep_token=sep_token,
154
+ pad_token=pad_token,
155
+ cls_token=cls_token,
156
+ mask_token=mask_token,
157
+ sp_model_kwargs=self.sp_model_kwargs,
158
+ **kwargs,
159
+ )
160
+
161
+ @property
162
+ def vocab_size(self) -> int:
163
+ return len(self.sp_model)
164
+
165
+ def get_vocab(self) -> dict[str, int]:
166
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
167
+ vocab.update(self.added_tokens_encoder)
168
+ return vocab
169
+
170
+ def __getstate__(self):
171
+ state = self.__dict__.copy()
172
+ state["sp_model"] = None
173
+ return state
174
+
175
+ def __setstate__(self, d):
176
+ self.__dict__ = d
177
+
178
+ # for backward compatibility
179
+ if not hasattr(self, "sp_model_kwargs"):
180
+ self.sp_model_kwargs = {}
181
+
182
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
183
+ self.sp_model.Load(self.vocab_file)
184
+
185
+ def preprocess_text(self, inputs):
186
+ if self.remove_space:
187
+ outputs = " ".join(inputs.strip().split())
188
+ else:
189
+ outputs = inputs
190
+ outputs = outputs.replace("``", '"').replace("''", '"')
191
+
192
+ if not self.keep_accents:
193
+ outputs = unicodedata.normalize("NFKD", outputs)
194
+ outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
195
+ if self.do_lower_case:
196
+ outputs = outputs.lower()
197
+
198
+ return outputs
199
+
200
+ def _tokenize(self, text: str) -> list[str]:
201
+ """Tokenize a string."""
202
+ text = self.preprocess_text(text)
203
+ pieces = self.sp_model.encode(text, out_type=str)
204
+ new_pieces = []
205
+ for piece in pieces:
206
+ if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
207
+ # Logic to handle special cases; see https://github.com/google-research/bert/blob/master/README.md#tokenization
208
+ # `9,9` -> ['▁9', ',', '9'] instead of [`_9,`, '9']
209
+ cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
210
+ if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
211
+ if len(cur_pieces[0]) == 1:
212
+ cur_pieces = cur_pieces[1:]
213
+ else:
214
+ cur_pieces[0] = cur_pieces[0][1:]
215
+ cur_pieces.append(piece[-1])
216
+ new_pieces.extend(cur_pieces)
217
+ else:
218
+ new_pieces.append(piece)
219
+
220
+ return new_pieces
221
+
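+ # Worked example of the digit/comma special case handled above (the exact pieces depend on
+ # the SentencePiece vocabulary, so this is a sketch rather than a guaranteed output):
+ #
+ #   tok = AlbertTokenizer.from_pretrained("albert/albert-base-v2")
+ #   tok.tokenize("9,9")  # -> ['▁9', ',', '9'] rather than ['▁9,', '9']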
222
+ def _convert_token_to_id(self, token):
223
+ """Converts a token (str) in an id using the vocab."""
224
+ return self.sp_model.PieceToId(token)
225
+
226
+ def _convert_id_to_token(self, index):
227
+ """Converts an index (integer) in a token (str) using the vocab."""
228
+ return self.sp_model.IdToPiece(index)
229
+
230
+ def convert_tokens_to_string(self, tokens):
231
+ """Converts a sequence of tokens (string) in a single string."""
232
+ current_sub_tokens = []
233
+ out_string = ""
234
+ prev_is_special = False
235
+ for token in tokens:
236
+ # make sure that special tokens are not decoded using sentencepiece model
237
+ if token in self.all_special_tokens:
238
+ if not prev_is_special:
239
+ out_string += " "
240
+ out_string += self.sp_model.decode(current_sub_tokens) + token
241
+ prev_is_special = True
242
+ current_sub_tokens = []
243
+ else:
244
+ current_sub_tokens.append(token)
245
+ prev_is_special = False
246
+ out_string += self.sp_model.decode(current_sub_tokens)
247
+ return out_string.strip()
248
+
249
+ def build_inputs_with_special_tokens(
250
+ self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
251
+ ) -> list[int]:
252
+ """
253
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
254
+ adding special tokens. An ALBERT sequence has the following format:
255
+
256
+ - single sequence: `[CLS] X [SEP]`
257
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
258
+
259
+ Args:
260
+ token_ids_0 (`List[int]`):
261
+ List of IDs to which the special tokens will be added.
262
+ token_ids_1 (`List[int]`, *optional*):
263
+ Optional second list of IDs for sequence pairs.
264
+
265
+ Returns:
266
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
267
+ """
268
+ sep = [self.sep_token_id]
269
+ cls = [self.cls_token_id]
270
+ if token_ids_1 is None:
271
+ return cls + token_ids_0 + sep
272
+ return cls + token_ids_0 + sep + token_ids_1 + sep
273
+
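+ # Sketch of the layout this produces (the concrete cls/sep ids depend on the vocabulary):
+ #
+ #   tok = AlbertTokenizer.from_pretrained("albert/albert-base-v2")
+ #   tok.build_inputs_with_special_tokens([10, 11], [20, 21])
+ #   # -> [cls_id, 10, 11, sep_id, 20, 21, sep_id], i.e. [CLS] A [SEP] B [SEP]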
274
+ def get_special_tokens_mask(
275
+ self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
276
+ ) -> list[int]:
277
+ """
278
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
279
+ special tokens using the tokenizer `prepare_for_model` method.
280
+
281
+ Args:
282
+ token_ids_0 (`List[int]`):
283
+ List of IDs.
284
+ token_ids_1 (`List[int]`, *optional*):
285
+ Optional second list of IDs for sequence pairs.
286
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
287
+ Whether or not the token list is already formatted with special tokens for the model.
288
+
289
+ Returns:
290
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
291
+ """
292
+
293
+ if already_has_special_tokens:
294
+ return super().get_special_tokens_mask(
295
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
296
+ )
297
+
298
+ if token_ids_1 is not None:
299
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
300
+ return [1] + ([0] * len(token_ids_0)) + [1]
301
+
302
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
303
+ if not os.path.isdir(save_directory):
304
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
305
+ return
306
+ out_vocab_file = os.path.join(
307
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
308
+ )
309
+
310
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
311
+ copyfile(self.vocab_file, out_vocab_file)
312
+ elif not os.path.isfile(self.vocab_file):
313
+ with open(out_vocab_file, "wb") as fi:
314
+ content_spiece_model = self.sp_model.serialized_model_proto()
315
+ fi.write(content_spiece_model)
316
+
317
+ return (out_vocab_file,)
318
+
319
+
320
+ __all__ = ["AlbertTokenizer"]
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/albert/tokenization_albert_fast.py ADDED
@@ -0,0 +1,178 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for ALBERT model."""
16
+
17
+ import os
18
+ from shutil import copyfile
19
+ from typing import Optional
20
+
21
+ from ...tokenization_utils import AddedToken
22
+ from ...tokenization_utils_fast import PreTrainedTokenizerFast
23
+ from ...utils import is_sentencepiece_available, logging
24
+
25
+
26
+ if is_sentencepiece_available():
27
+ from .tokenization_albert import AlbertTokenizer
28
+ else:
29
+ AlbertTokenizer = None
30
+
31
+ logger = logging.get_logger(__name__)
32
+ VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
33
+
34
+
35
+ SPIECE_UNDERLINE = "▁"
36
+
37
+
38
+ class AlbertTokenizerFast(PreTrainedTokenizerFast):
39
+ """
40
+ Construct a "fast" ALBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on
41
+ [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
42
+ tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
43
+ this superclass for more information regarding those methods.
44
+
45
+ Args:
46
+ vocab_file (`str`):
47
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
48
+ contains the vocabulary necessary to instantiate a tokenizer.
49
+ do_lower_case (`bool`, *optional*, defaults to `True`):
50
+ Whether or not to lowercase the input when tokenizing.
51
+ remove_space (`bool`, *optional*, defaults to `True`):
52
+ Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
53
+ keep_accents (`bool`, *optional*, defaults to `False`):
54
+ Whether or not to keep accents when tokenizing.
55
+ bos_token (`str`, *optional*, defaults to `"[CLS]"`):
56
+ The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
57
+
58
+ <Tip>
59
+
60
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
61
+ sequence. The token used is the `cls_token`.
62
+
63
+ </Tip>
64
+
65
+ eos_token (`str`, *optional*, defaults to `"[SEP]"`):
66
+ The end of sequence token. When building a sequence using special tokens, this is not the token
67
+ that is used for the end of sequence. The token used is the `sep_token`.
68
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
69
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
70
+ token instead.
71
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
72
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
73
+ sequence classification or for a text and a question for question answering. It is also used as the last
74
+ token of a sequence built with special tokens.
75
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
76
+ The token used for padding, for example when batching sequences of different lengths.
77
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
78
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
79
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
80
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
81
+ The token used for masking values. This is the token used when training this model with masked language
82
+ modeling. This is the token which the model will try to predict.
83
+ """
84
+
85
+ vocab_files_names = VOCAB_FILES_NAMES
86
+ slow_tokenizer_class = AlbertTokenizer
87
+
88
+ def __init__(
89
+ self,
90
+ vocab_file=None,
91
+ tokenizer_file=None,
92
+ do_lower_case=True,
93
+ remove_space=True,
94
+ keep_accents=False,
95
+ bos_token="[CLS]",
96
+ eos_token="[SEP]",
97
+ unk_token="<unk>",
98
+ sep_token="[SEP]",
99
+ pad_token="<pad>",
100
+ cls_token="[CLS]",
101
+ mask_token="[MASK]",
102
+ **kwargs,
103
+ ):
104
+ # The mask token behaves like a normal word, i.e. it includes the space before it and
105
+ # is included in the raw text; there should be a match in a non-normalized sentence.
106
+ mask_token = (
107
+ AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
108
+ if isinstance(mask_token, str)
109
+ else mask_token
110
+ )
111
+
112
+ super().__init__(
113
+ vocab_file,
114
+ tokenizer_file=tokenizer_file,
115
+ do_lower_case=do_lower_case,
116
+ remove_space=remove_space,
117
+ keep_accents=keep_accents,
118
+ bos_token=bos_token,
119
+ eos_token=eos_token,
120
+ unk_token=unk_token,
121
+ sep_token=sep_token,
122
+ pad_token=pad_token,
123
+ cls_token=cls_token,
124
+ mask_token=mask_token,
125
+ **kwargs,
126
+ )
127
+
128
+ self.do_lower_case = do_lower_case
129
+ self.remove_space = remove_space
130
+ self.keep_accents = keep_accents
131
+ self.vocab_file = vocab_file
132
+
133
+ def build_inputs_with_special_tokens(
134
+ self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
135
+ ) -> list[int]:
136
+ """
137
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
138
+ adding special tokens. An ALBERT sequence has the following format:
139
+
140
+ - single sequence: `[CLS] X [SEP]`
141
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
142
+
143
+ Args:
144
+ token_ids_0 (`List[int]`):
145
+ List of IDs to which the special tokens will be added
146
+ token_ids_1 (`List[int]`, *optional*):
147
+ Optional second list of IDs for sequence pairs.
148
+
149
+ Returns:
150
+ `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
151
+ """
152
+ sep = [self.sep_token_id]
153
+ cls = [self.cls_token_id]
154
+ if token_ids_1 is None:
155
+ return cls + token_ids_0 + sep
156
+ return cls + token_ids_0 + sep + token_ids_1 + sep
157
+
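+ # The fast tokenizer yields the same layout as the slow one; a minimal sketch:
+ #
+ #   tok = AlbertTokenizerFast.from_pretrained("albert/albert-base-v2")
+ #   tok.build_inputs_with_special_tokens([10, 11])  # -> [cls_id, 10, 11, sep_id]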
158
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
159
+ if not self.can_save_slow_tokenizer:
160
+ raise ValueError(
161
+ "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
162
+ "tokenizer."
163
+ )
164
+
165
+ if not os.path.isdir(save_directory):
166
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
167
+ return
168
+ out_vocab_file = os.path.join(
169
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
170
+ )
171
+
172
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
173
+ copyfile(self.vocab_file, out_vocab_file)
174
+
175
+ return (out_vocab_file,)
176
+
177
+
178
+ __all__ = ["AlbertTokenizerFast"]
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/__init__.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+
16
+ from ...utils import _LazyModule
17
+ from ...utils.import_utils import define_import_structure
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from .auto_factory import *
22
+ from .configuration_auto import *
23
+ from .feature_extraction_auto import *
24
+ from .image_processing_auto import *
25
+ from .modeling_auto import *
26
+ from .modeling_flax_auto import *
27
+ from .modeling_tf_auto import *
28
+ from .processing_auto import *
29
+ from .tokenization_auto import *
30
+ from .video_processing_auto import *
31
+ else:
32
+ import sys
33
+
34
+ _file = globals()["__file__"]
35
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
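+ # Behavioural sketch (under the standard public API): replacing the module object with
+ # `_LazyModule` defers the heavy submodule imports until an attribute is first accessed, e.g.:
+ #
+ #   from transformers.models import auto
+ #   auto.AutoConfig  # only now is configuration_auto actually imported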
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py ADDED
@@ -0,0 +1,882 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Factory function to build auto-model classes."""
16
+
17
+ import copy
18
+ import importlib
19
+ import json
20
+ import os
21
+ import warnings
22
+ from collections import OrderedDict
23
+ from collections.abc import Iterator
24
+ from typing import Any, TypeVar, Union
25
+
26
+ from ...configuration_utils import PretrainedConfig
27
+ from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
28
+ from ...utils import (
29
+ CONFIG_NAME,
30
+ cached_file,
31
+ copy_func,
32
+ extract_commit_hash,
33
+ find_adapter_config_file,
34
+ is_peft_available,
35
+ is_torch_available,
36
+ logging,
37
+ requires_backends,
38
+ )
39
+ from .configuration_auto import AutoConfig, model_type_to_module_name, replace_list_option_in_docstrings
40
+
41
+
42
+ if is_torch_available():
43
+ from ...generation import GenerationMixin
44
+
45
+
46
+ logger = logging.get_logger(__name__)
47
+
48
+ _T = TypeVar("_T")
49
+ # Tokenizers will depend on packages installed, too much variance and there are no common base or Protocol
50
+ _LazyAutoMappingValue = tuple[Union[type[Any], None], Union[type[Any], None]]
51
+
52
+ CLASS_DOCSTRING = """
53
+ This is a generic model class that will be instantiated as one of the model classes of the library when created
54
+ with the [`~BaseAutoModelClass.from_pretrained`] class method or the [`~BaseAutoModelClass.from_config`] class
55
+ method.
56
+
57
+ This class cannot be instantiated directly using `__init__()` (throws an error).
58
+ """
59
+
60
+ FROM_CONFIG_DOCSTRING = """
61
+ Instantiates one of the model classes of the library from a configuration.
62
+
63
+ Note:
64
+ Loading a model from its configuration file does **not** load the model weights. It only affects the
65
+ model's configuration. Use [`~BaseAutoModelClass.from_pretrained`] to load the model weights.
66
+
67
+ Args:
68
+ config ([`PretrainedConfig`]):
69
+ The model class to instantiate is selected based on the configuration class:
70
+
71
+ List options
72
+ attn_implementation (`str`, *optional*):
73
+ The attention implementation to use in the model (if relevant). Can be any of `"eager"` (manual implementation of the attention), `"sdpa"` (using [`F.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html)), or `"flash_attention_2"` (using [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention)). By default, if available, SDPA will be used for torch>=2.1.1. The default is otherwise the manual `"eager"` implementation.
74
+
75
+ Examples:
76
+
77
+ ```python
78
+ >>> from transformers import AutoConfig, BaseAutoModelClass
79
+
80
+ >>> # Download configuration from huggingface.co and cache.
81
+ >>> config = AutoConfig.from_pretrained("checkpoint_placeholder")
82
+ >>> model = BaseAutoModelClass.from_config(config)
83
+ ```
84
+ """
85
+
86
+ FROM_PRETRAINED_TORCH_DOCSTRING = """
87
+ Instantiate one of the model classes of the library from a pretrained model.
88
+
89
+ The model class to instantiate is selected based on the `model_type` property of the config object (either
90
+ passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
91
+ falling back to using pattern matching on `pretrained_model_name_or_path`:
92
+
93
+ List options
94
+
95
+ The model is set in evaluation mode by default using `model.eval()` (so for instance, dropout modules are
96
+ deactivated). To train the model, you should first set it back in training mode with `model.train()`.
97
+
98
+ Args:
99
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
100
+ Can be either:
101
+
102
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
103
+ - A path to a *directory* containing model weights saved using
104
+ [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
105
+ - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
106
+ this case, `from_tf` should be set to `True` and a configuration object should be provided as
107
+ `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
108
+ PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
109
+ model_args (additional positional arguments, *optional*):
110
+ Will be passed along to the underlying model `__init__()` method.
111
+ config ([`PretrainedConfig`], *optional*):
112
+ Configuration for the model to use instead of an automatically loaded configuration. Configuration can
113
+ be automatically loaded when:
114
+
115
+ - The model is a model provided by the library (loaded with the *model id* string of a pretrained
116
+ model).
117
+ - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the
118
+ save directory.
119
+ - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
120
+ configuration JSON file named *config.json* is found in the directory.
121
+ state_dict (*dict[str, torch.Tensor]*, *optional*):
122
+ A state dictionary to use instead of a state dictionary loaded from saved weights file.
123
+
124
+ This option can be used if you want to create a model from a pretrained configuration but load your own
125
+ weights. In this case though, you should check if using [`~PreTrainedModel.save_pretrained`] and
126
+ [`~PreTrainedModel.from_pretrained`] is not a simpler option.
127
+ cache_dir (`str` or `os.PathLike`, *optional*):
128
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the
129
+ standard cache should not be used.
130
+ from_tf (`bool`, *optional*, defaults to `False`):
131
+ Load the model weights from a TensorFlow checkpoint save file (see docstring of
132
+ `pretrained_model_name_or_path` argument).
133
+ force_download (`bool`, *optional*, defaults to `False`):
134
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
135
+ cached versions if they exist.
136
+ resume_download:
137
+ Deprecated and ignored. All downloads are now resumed by default when possible.
138
+ Will be removed in v5 of Transformers.
139
+ proxies (`dict[str, str]`, *optional*):
140
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
141
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
142
+ output_loading_info(`bool`, *optional*, defaults to `False`):
143
+ Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
144
+ local_files_only(`bool`, *optional*, defaults to `False`):
145
+ Whether or not to only look at local files (e.g., not try downloading the model).
146
+ revision (`str`, *optional*, defaults to `"main"`):
147
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
148
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
149
+ identifier allowed by git.
150
+ trust_remote_code (`bool`, *optional*, defaults to `False`):
151
+ Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
152
+ should only be set to `True` for repositories you trust and in which you have read the code, as it will
153
+ execute code present on the Hub on your local machine.
154
+ code_revision (`str`, *optional*, defaults to `"main"`):
155
+ The specific revision to use for the code on the Hub, if the code lives in a different repository than
156
+ the rest of the model. It can be a branch name, a tag name, or a commit id, since we use a git-based
157
+ system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier
158
+ allowed by git.
159
+ kwargs (additional keyword arguments, *optional*):
160
+ Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g.,
161
+ `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
162
+ automatically loaded:
163
+
164
+ - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
165
+ underlying model's `__init__` method (we assume all relevant updates to the configuration have
166
+ already been done)
167
+ - If a configuration is not provided, `kwargs` will be first passed to the configuration class
168
+ initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that
169
+ corresponds to a configuration attribute will be used to override said attribute with the
170
+ supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
171
+ will be passed to the underlying model's `__init__` function.
172
+
173
+ Examples:
174
+
175
+ ```python
176
+ >>> from transformers import AutoConfig, BaseAutoModelClass
177
+
178
+ >>> # Download model and configuration from huggingface.co and cache.
179
+ >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder")
180
+
181
+ >>> # Update configuration during loading
182
+ >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True)
183
+ >>> model.config.output_attentions
184
+ True
185
+
186
+ >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower)
187
+ >>> config = AutoConfig.from_pretrained("./tf_model/shortcut_placeholder_tf_model_config.json")
188
+ >>> model = BaseAutoModelClass.from_pretrained(
189
+ ... "./tf_model/shortcut_placeholder_tf_checkpoint.ckpt.index", from_tf=True, config=config
190
+ ... )
191
+ ```
192
+ """
193
+
194
+ FROM_PRETRAINED_TF_DOCSTRING = """
195
+ Instantiate one of the model classes of the library from a pretrained model.
196
+
197
+ The model class to instantiate is selected based on the `model_type` property of the config object (either
198
+ passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
199
+ falling back to using pattern matching on `pretrained_model_name_or_path`:
200
+
201
+ List options
202
+
203
+ Args:
204
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
205
+ Can be either:
206
+
207
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
208
+ - A path to a *directory* containing model weights saved using
209
+ [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
210
+ - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this
211
+ case, `from_pt` should be set to `True` and a configuration object should be provided as `config`
212
+ argument. This loading path is slower than converting the PyTorch model in a TensorFlow model
213
+ using the provided conversion scripts and loading the TensorFlow model afterwards.
214
+ model_args (additional positional arguments, *optional*):
215
+ Will be passed along to the underlying model `__init__()` method.
216
+ config ([`PretrainedConfig`], *optional*):
217
+ Configuration for the model to use instead of an automatically loaded configuration. Configuration can
218
+ be automatically loaded when:
219
+
220
+ - The model is a model provided by the library (loaded with the *model id* string of a pretrained
221
+ model).
222
+ - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the
223
+ save directory.
224
+ - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
225
+ configuration JSON file named *config.json* is found in the directory.
226
+ cache_dir (`str` or `os.PathLike`, *optional*):
227
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the
228
+ standard cache should not be used.
229
+ from_pt (`bool`, *optional*, defaults to `False`):
230
+ Load the model weights from a PyTorch checkpoint save file (see docstring of
231
+ `pretrained_model_name_or_path` argument).
232
+ force_download (`bool`, *optional*, defaults to `False`):
233
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
234
+ cached versions if they exist.
235
+ resume_download:
236
+ Deprecated and ignored. All downloads are now resumed by default when possible.
237
+ Will be removed in v5 of Transformers.
238
+ proxies (`dict[str, str]`, *optional*):
239
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
240
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
241
+         output_loading_info (`bool`, *optional*, defaults to `False`):
242
+             Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
243
+         local_files_only (`bool`, *optional*, defaults to `False`):
244
+             Whether or not to only look at local files (e.g., not try downloading the model).
245
+ revision (`str`, *optional*, defaults to `"main"`):
246
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
247
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
248
+ identifier allowed by git.
249
+ trust_remote_code (`bool`, *optional*, defaults to `False`):
250
+ Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
251
+ should only be set to `True` for repositories you trust and in which you have read the code, as it will
252
+ execute code present on the Hub on your local machine.
253
+ code_revision (`str`, *optional*, defaults to `"main"`):
254
+             The specific revision to use for the code on the Hub, if the code lives in a different repository than
255
+ the rest of the model. It can be a branch name, a tag name, or a commit id, since we use a git-based
256
+ system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier
257
+ allowed by git.
258
+ kwargs (additional keyword arguments, *optional*):
259
+             Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g.,
260
+ `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
261
+ automatically loaded:
262
+
263
+ - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
264
+ underlying model's `__init__` method (we assume all relevant updates to the configuration have
265
+                   already been done).
266
+                 - If a configuration is not provided, `kwargs` will first be passed to the configuration class
267
+ initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that
268
+ corresponds to a configuration attribute will be used to override said attribute with the
269
+ supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
270
+ will be passed to the underlying model's `__init__` function.
271
+
272
+ Examples:
273
+
274
+ ```python
275
+ >>> from transformers import AutoConfig, BaseAutoModelClass
276
+
277
+ >>> # Download model and configuration from huggingface.co and cache.
278
+ >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder")
279
+
280
+ >>> # Update configuration during loading
281
+ >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True)
282
+ >>> model.config.output_attentions
283
+ True
284
+
285
+ >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
286
+ >>> config = AutoConfig.from_pretrained("./pt_model/shortcut_placeholder_pt_model_config.json")
287
+ >>> model = BaseAutoModelClass.from_pretrained(
288
+ ... "./pt_model/shortcut_placeholder_pytorch_model.bin", from_pt=True, config=config
289
+ ... )
290
+ ```
291
+ """
292
+
293
+ FROM_PRETRAINED_FLAX_DOCSTRING = """
294
+ Instantiate one of the model classes of the library from a pretrained model.
295
+
296
+ The model class to instantiate is selected based on the `model_type` property of the config object (either
297
+ passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
298
+ falling back to using pattern matching on `pretrained_model_name_or_path`:
299
+
300
+ List options
301
+
302
+ Args:
303
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
304
+ Can be either:
305
+
306
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
307
+ - A path to a *directory* containing model weights saved using
308
+ [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
309
+             - A path or url to a *PyTorch state_dict save file* (e.g., `./pt_model/pytorch_model.bin`). In this
310
+               case, `from_pt` should be set to `True` and a configuration object should be provided as the `config`
311
+               argument. This loading path is slower than converting the PyTorch model to a Flax model
312
+               using the provided conversion scripts and loading the Flax model afterwards.
313
+ model_args (additional positional arguments, *optional*):
314
+ Will be passed along to the underlying model `__init__()` method.
315
+ config ([`PretrainedConfig`], *optional*):
316
+ Configuration for the model to use instead of an automatically loaded configuration. Configuration can
317
+ be automatically loaded when:
318
+
319
+ - The model is a model provided by the library (loaded with the *model id* string of a pretrained
320
+ model).
321
+ - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the
322
+ save directory.
323
+ - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
324
+ configuration JSON file named *config.json* is found in the directory.
325
+ cache_dir (`str` or `os.PathLike`, *optional*):
326
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the
327
+ standard cache should not be used.
328
+ from_pt (`bool`, *optional*, defaults to `False`):
329
+ Load the model weights from a PyTorch checkpoint save file (see docstring of
330
+ `pretrained_model_name_or_path` argument).
331
+ force_download (`bool`, *optional*, defaults to `False`):
332
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
333
+ cached versions if they exist.
334
+ resume_download:
335
+ Deprecated and ignored. All downloads are now resumed by default when possible.
336
+ Will be removed in v5 of Transformers.
337
+ proxies (`dict[str, str]`, *optional*):
338
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
339
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
340
+         output_loading_info (`bool`, *optional*, defaults to `False`):
341
+             Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
342
+         local_files_only (`bool`, *optional*, defaults to `False`):
343
+             Whether or not to only look at local files (e.g., not try downloading the model).
344
+ revision (`str`, *optional*, defaults to `"main"`):
345
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
346
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
347
+ identifier allowed by git.
348
+ trust_remote_code (`bool`, *optional*, defaults to `False`):
349
+ Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
350
+ should only be set to `True` for repositories you trust and in which you have read the code, as it will
351
+ execute code present on the Hub on your local machine.
352
+ code_revision (`str`, *optional*, defaults to `"main"`):
353
+             The specific revision to use for the code on the Hub, if the code lives in a different repository than
354
+ the rest of the model. It can be a branch name, a tag name, or a commit id, since we use a git-based
355
+ system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier
356
+ allowed by git.
357
+ kwargs (additional keyword arguments, *optional*):
358
+             Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g.,
359
+ `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
360
+ automatically loaded:
361
+
362
+ - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
363
+ underlying model's `__init__` method (we assume all relevant updates to the configuration have
364
+                 already been done).
365
+                 - If a configuration is not provided, `kwargs` will first be passed to the configuration class
366
+ initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that
367
+ corresponds to a configuration attribute will be used to override said attribute with the
368
+ supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
369
+ will be passed to the underlying model's `__init__` function.
370
+
371
+ Examples:
372
+
373
+ ```python
374
+ >>> from transformers import AutoConfig, BaseAutoModelClass
375
+
376
+ >>> # Download model and configuration from huggingface.co and cache.
377
+ >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder")
378
+
379
+ >>> # Update configuration during loading
380
+ >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True)
381
+ >>> model.config.output_attentions
382
+ True
383
+
384
+ >>> # Loading from a PyTorch checkpoint file instead of a Flax model (slower)
385
+ >>> config = AutoConfig.from_pretrained("./pt_model/shortcut_placeholder_pt_model_config.json")
386
+ >>> model = BaseAutoModelClass.from_pretrained(
387
+ ... "./pt_model/shortcut_placeholder_pytorch_model.bin", from_pt=True, config=config
388
+ ... )
389
+ ```
390
+ """
391
+
392
+
393
+ def _get_model_class(config, model_mapping):
394
+ supported_models = model_mapping[type(config)]
395
+ if not isinstance(supported_models, (list, tuple)):
396
+ return supported_models
397
+
398
+ name_to_model = {model.__name__: model for model in supported_models}
399
+ architectures = getattr(config, "architectures", [])
400
+ for arch in architectures:
401
+ if arch in name_to_model:
402
+ return name_to_model[arch]
403
+ elif f"TF{arch}" in name_to_model:
404
+ return name_to_model[f"TF{arch}"]
405
+ elif f"Flax{arch}" in name_to_model:
406
+ return name_to_model[f"Flax{arch}"]
407
+
408
+     # If no architecture is set in the config, or none of them matches the supported models, the first element of
409
+     # the tuple is the default.
410
+ return supported_models[0]
411
+
412
+
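A minimal sketch (not the library API) of the resolution order implemented above, using hypothetical stand-in classes: an entry from `config.architectures` is matched against the supported class names as-is first, then with the `TF`/`Flax` prefixes that the TensorFlow and Flax auto classes use.

```python
# Hypothetical stand-ins; not the real transformers classes.
class BertForMaskedLM: ...
class TFBertForMaskedLM: ...

name_to_model = {m.__name__: m for m in (BertForMaskedLM, TFBertForMaskedLM)}
architectures = ["BertForMaskedLM"]  # as stored in config.architectures

for arch in architectures:
    # Try the plain name first, then the framework-prefixed variants.
    for candidate in (arch, f"TF{arch}", f"Flax{arch}"):
        if candidate in name_to_model:
            print(name_to_model[candidate])  # <class '...BertForMaskedLM'>
            break
```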
413
+ class _BaseAutoModelClass:
414
+ # Base class for auto models.
415
+ _model_mapping = None
416
+
417
+ def __init__(self, *args, **kwargs) -> None:
418
+ raise OSError(
419
+ f"{self.__class__.__name__} is designed to be instantiated "
420
+ f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or "
421
+ f"`{self.__class__.__name__}.from_config(config)` methods."
422
+ )
423
+
424
+ @classmethod
425
+ def from_config(cls, config, **kwargs):
426
+ trust_remote_code = kwargs.pop("trust_remote_code", None)
427
+ has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map
428
+ has_local_code = type(config) in cls._model_mapping
429
+ if has_remote_code:
430
+ class_ref = config.auto_map[cls.__name__]
431
+ if "--" in class_ref:
432
+ upstream_repo = class_ref.split("--")[0]
433
+ else:
434
+ upstream_repo = None
435
+ trust_remote_code = resolve_trust_remote_code(
436
+ trust_remote_code, config._name_or_path, has_local_code, has_remote_code, upstream_repo=upstream_repo
437
+ )
438
+
439
+ if has_remote_code and trust_remote_code:
440
+ if "--" in class_ref:
441
+ repo_id, class_ref = class_ref.split("--")
442
+ else:
443
+ repo_id = config.name_or_path
444
+ model_class = get_class_from_dynamic_module(class_ref, repo_id, **kwargs)
445
+ # This block handles the case where the user is loading a model with `trust_remote_code=True`
446
+ # but a library model exists with the same name. We don't want to override the autoclass
447
+ # mappings in this case, or all future loads of that model will be the remote code model.
448
+ if not has_local_code:
449
+ cls.register(config.__class__, model_class, exist_ok=True)
450
+ model_class.register_for_auto_class(auto_class=cls)
451
+ _ = kwargs.pop("code_revision", None)
452
+ model_class = add_generation_mixin_to_remote_model(model_class)
453
+ return model_class._from_config(config, **kwargs)
454
+ elif type(config) in cls._model_mapping:
455
+ model_class = _get_model_class(config, cls._model_mapping)
456
+ return model_class._from_config(config, **kwargs)
457
+
458
+ raise ValueError(
459
+ f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
460
+ f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping)}."
461
+ )
462
+
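As a hedged usage sketch (assuming PyTorch and the standard `bert` mapping are installed): `from_config` instantiates a freshly initialized model from a config alone, with no weights downloaded, in contrast to `from_pretrained`.

```python
from transformers import AutoConfig, AutoModel

# Build a small config by model type; no weights are fetched at any point.
config = AutoConfig.for_model(
    "bert", hidden_size=64, num_hidden_layers=2,
    num_attention_heads=2, intermediate_size=128,
)
model = AutoModel.from_config(config)  # randomly initialized BertModel
print(type(model).__name__)  # BertModel
```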
463
+ @classmethod
464
+ def _prepare_config_for_auto_class(cls, config: PretrainedConfig) -> PretrainedConfig:
465
+ """Additional autoclass-specific config post-loading manipulation. May be overridden in subclasses."""
466
+ return config
467
+
468
+ @classmethod
469
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike[str]], *model_args, **kwargs):
470
+ config = kwargs.pop("config", None)
471
+ trust_remote_code = kwargs.get("trust_remote_code")
472
+ kwargs["_from_auto"] = True
473
+ hub_kwargs_names = [
474
+ "cache_dir",
475
+ "force_download",
476
+ "local_files_only",
477
+ "proxies",
478
+ "resume_download",
479
+ "revision",
480
+ "subfolder",
481
+ "use_auth_token",
482
+ "token",
483
+ ]
484
+ hub_kwargs = {name: kwargs.pop(name) for name in hub_kwargs_names if name in kwargs}
485
+ code_revision = kwargs.pop("code_revision", None)
486
+ commit_hash = kwargs.pop("_commit_hash", None)
487
+ adapter_kwargs = kwargs.pop("adapter_kwargs", None)
488
+
489
+ token = hub_kwargs.pop("token", None)
490
+ use_auth_token = hub_kwargs.pop("use_auth_token", None)
491
+ if use_auth_token is not None:
492
+ warnings.warn(
493
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
494
+ FutureWarning,
495
+ )
496
+ if token is not None:
497
+ raise ValueError(
498
+ "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
499
+ )
500
+ token = use_auth_token
501
+
502
+ if token is not None:
503
+ hub_kwargs["token"] = token
504
+
505
+ if commit_hash is None:
506
+ if not isinstance(config, PretrainedConfig):
507
+ # We make a call to the config file first (which may be absent) to get the commit hash as soon as possible
508
+ resolved_config_file = cached_file(
509
+ pretrained_model_name_or_path,
510
+ CONFIG_NAME,
511
+ _raise_exceptions_for_gated_repo=False,
512
+ _raise_exceptions_for_missing_entries=False,
513
+ _raise_exceptions_for_connection_errors=False,
514
+ **hub_kwargs,
515
+ )
516
+ commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
517
+ else:
518
+ commit_hash = getattr(config, "_commit_hash", None)
519
+
520
+ if is_peft_available():
521
+ if adapter_kwargs is None:
522
+ adapter_kwargs = {}
523
+ if token is not None:
524
+ adapter_kwargs["token"] = token
525
+
526
+ maybe_adapter_path = find_adapter_config_file(
527
+ pretrained_model_name_or_path, _commit_hash=commit_hash, **adapter_kwargs
528
+ )
529
+
530
+ if maybe_adapter_path is not None:
531
+ with open(maybe_adapter_path, "r", encoding="utf-8") as f:
532
+ adapter_config = json.load(f)
533
+
534
+ adapter_kwargs["_adapter_model_path"] = pretrained_model_name_or_path
535
+ pretrained_model_name_or_path = adapter_config["base_model_name_or_path"]
536
+
537
+ if not isinstance(config, PretrainedConfig):
538
+ kwargs_orig = copy.deepcopy(kwargs)
539
+ # ensure not to pollute the config object with dtype="auto" - since it's
540
+ # meaningless in the context of the config object - torch.dtype values are acceptable
541
+ if kwargs.get("torch_dtype") == "auto":
542
+ _ = kwargs.pop("torch_dtype")
543
+ if kwargs.get("dtype") == "auto":
544
+ _ = kwargs.pop("dtype")
545
+ # to not overwrite the quantization_config if config has a quantization_config
546
+ if kwargs.get("quantization_config") is not None:
547
+ _ = kwargs.pop("quantization_config")
548
+
549
+ config, kwargs = AutoConfig.from_pretrained(
550
+ pretrained_model_name_or_path,
551
+ return_unused_kwargs=True,
552
+ code_revision=code_revision,
553
+ _commit_hash=commit_hash,
554
+ **hub_kwargs,
555
+ **kwargs,
556
+ )
557
+
558
+             # if torch_dtype=auto was passed here, ensure it is passed on
559
+ if kwargs_orig.get("torch_dtype", None) == "auto":
560
+ kwargs["torch_dtype"] = "auto"
561
+ if kwargs_orig.get("dtype", None) == "auto":
562
+ kwargs["dtype"] = "auto"
563
+ if kwargs_orig.get("quantization_config", None) is not None:
564
+ kwargs["quantization_config"] = kwargs_orig["quantization_config"]
565
+
566
+ has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map
567
+ has_local_code = type(config) in cls._model_mapping
568
+ upstream_repo = None
569
+ if has_remote_code:
570
+ class_ref = config.auto_map[cls.__name__]
571
+ if "--" in class_ref:
572
+ upstream_repo = class_ref.split("--")[0]
573
+ trust_remote_code = resolve_trust_remote_code(
574
+ trust_remote_code,
575
+ pretrained_model_name_or_path,
576
+ has_local_code,
577
+ has_remote_code,
578
+ upstream_repo=upstream_repo,
579
+ )
580
+ kwargs["trust_remote_code"] = trust_remote_code
581
+
582
+ # Set the adapter kwargs
583
+ kwargs["adapter_kwargs"] = adapter_kwargs
584
+
585
+ if has_remote_code and trust_remote_code:
586
+ model_class = get_class_from_dynamic_module(
587
+ class_ref, pretrained_model_name_or_path, code_revision=code_revision, **hub_kwargs, **kwargs
588
+ )
589
+ _ = hub_kwargs.pop("code_revision", None)
590
+ # This block handles the case where the user is loading a model with `trust_remote_code=True`
591
+ # but a library model exists with the same name. We don't want to override the autoclass
592
+ # mappings in this case, or all future loads of that model will be the remote code model.
593
+ if not has_local_code:
594
+ cls.register(config.__class__, model_class, exist_ok=True)
595
+ model_class.register_for_auto_class(auto_class=cls)
596
+ model_class = add_generation_mixin_to_remote_model(model_class)
597
+ return model_class.from_pretrained(
598
+ pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
599
+ )
600
+ elif type(config) in cls._model_mapping:
601
+ model_class = _get_model_class(config, cls._model_mapping)
602
+ if model_class.config_class == config.sub_configs.get("text_config", None):
603
+ config = config.get_text_config()
604
+ return model_class.from_pretrained(
605
+ pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
606
+ )
607
+ raise ValueError(
608
+ f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
609
+ f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping)}."
610
+ )
611
+
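The remote-code branch above can be exercised as below; `"some-org/custom-model"` is a placeholder repo id, not a real checkpoint. Because the class is pulled from the repo's own modeling file and executed locally, this should only be enabled for repositories whose code you have read.

```python
from transformers import AutoModel

# Placeholder repo id: such a repo's config.json would carry an `auto_map`
# entry pointing at a custom modeling file, which is downloaded and executed.
model = AutoModel.from_pretrained("some-org/custom-model", trust_remote_code=True)
```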
612
+ @classmethod
613
+ def register(cls, config_class, model_class, exist_ok=False) -> None:
614
+ """
615
+ Register a new model for this class.
616
+
617
+ Args:
618
+ config_class ([`PretrainedConfig`]):
619
+ The configuration corresponding to the model to register.
620
+ model_class ([`PreTrainedModel`]):
621
+ The model to register.
622
+ """
623
+ if hasattr(model_class, "config_class") and model_class.config_class.__name__ != config_class.__name__:
624
+ raise ValueError(
625
+ "The model class you are passing has a `config_class` attribute that is not consistent with the "
626
+                 f"config class you passed (model has {model_class.config_class} and you passed {config_class}). Fix "
627
+ "one of those so they match!"
628
+ )
629
+ cls._model_mapping.register(config_class, model_class, exist_ok=exist_ok)
630
+
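A minimal sketch of wiring a custom architecture into the auto classes; `MyConfig` and `MyModel` are hypothetical. Note the consistency check above: `model_class.config_class` must name the same config class being registered.

```python
from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel

class MyConfig(PretrainedConfig):
    model_type = "my-model"

class MyModel(PreTrainedModel):
    config_class = MyConfig  # must match, or register() raises ValueError

AutoConfig.register("my-model", MyConfig)
AutoModel.register(MyConfig, MyModel)
```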
631
+
632
+ class _BaseAutoBackboneClass(_BaseAutoModelClass):
633
+ # Base class for auto backbone models.
634
+ _model_mapping = None
635
+
636
+ @classmethod
637
+ def _load_timm_backbone_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
638
+ requires_backends(cls, ["vision", "timm"])
639
+ from ...models.timm_backbone import TimmBackboneConfig
640
+
641
+ config = kwargs.pop("config", TimmBackboneConfig())
642
+
643
+ if kwargs.get("out_features") is not None:
644
+ raise ValueError("Cannot specify `out_features` for timm backbones")
645
+
646
+ if kwargs.get("output_loading_info", False):
647
+ raise ValueError("Cannot specify `output_loading_info=True` when loading from timm")
648
+
649
+ num_channels = kwargs.pop("num_channels", config.num_channels)
650
+ features_only = kwargs.pop("features_only", config.features_only)
651
+ use_pretrained_backbone = kwargs.pop("use_pretrained_backbone", config.use_pretrained_backbone)
652
+ out_indices = kwargs.pop("out_indices", config.out_indices)
653
+ config = TimmBackboneConfig(
654
+ backbone=pretrained_model_name_or_path,
655
+ num_channels=num_channels,
656
+ features_only=features_only,
657
+ use_pretrained_backbone=use_pretrained_backbone,
658
+ out_indices=out_indices,
659
+ )
660
+ return super().from_config(config, **kwargs)
661
+
662
+ @classmethod
663
+ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
664
+ use_timm_backbone = kwargs.pop("use_timm_backbone", False)
665
+ if use_timm_backbone:
666
+ return cls._load_timm_backbone_from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
667
+
668
+ return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
669
+
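A hedged usage sketch of the timm path (assuming the `timm` and `vision` extras are installed; `"resnet18"` is a timm model id): with `use_timm_backbone=True` the checkpoint name is handed to `_load_timm_backbone_from_pretrained` and wrapped in a `TimmBackboneConfig` rather than resolved through the config mapping.

```python
from transformers import AutoBackbone

backbone = AutoBackbone.from_pretrained(
    "resnet18",
    use_timm_backbone=True,
    use_pretrained_backbone=False,  # skip downloading timm weights
)
```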
670
+
671
+ def insert_head_doc(docstring, head_doc: str = ""):
672
+ if len(head_doc) > 0:
673
+ return docstring.replace(
674
+ "one of the model classes of the library ",
675
+ f"one of the model classes of the library (with a {head_doc} head) ",
676
+ )
677
+ return docstring.replace(
678
+ "one of the model classes of the library ", "one of the base model classes of the library "
679
+ )
680
+
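The substitution above is a plain string replace; for instance, with `head_doc="language modeling"`:

```python
docstring = "Instantiate one of the model classes of the library from a pretrained model."
print(docstring.replace(
    "one of the model classes of the library ",
    "one of the model classes of the library (with a language modeling head) ",
))
# Instantiate one of the model classes of the library (with a language modeling head) from a pretrained model.
```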
681
+
682
+ def auto_class_update(cls, checkpoint_for_example: str = "google-bert/bert-base-cased", head_doc: str = ""):
683
+ # Create a new class with the right name from the base class
684
+ model_mapping = cls._model_mapping
685
+ name = cls.__name__
686
+ class_docstring = insert_head_doc(CLASS_DOCSTRING, head_doc=head_doc)
687
+ cls.__doc__ = class_docstring.replace("BaseAutoModelClass", name)
688
+
689
+ # Now we need to copy and re-register `from_config` and `from_pretrained` as class methods otherwise we can't
690
+ # have a specific docstrings for them.
691
+ from_config = copy_func(_BaseAutoModelClass.from_config)
692
+ from_config_docstring = insert_head_doc(FROM_CONFIG_DOCSTRING, head_doc=head_doc)
693
+ from_config_docstring = from_config_docstring.replace("BaseAutoModelClass", name)
694
+ from_config_docstring = from_config_docstring.replace("checkpoint_placeholder", checkpoint_for_example)
695
+ from_config.__doc__ = from_config_docstring
696
+ from_config = replace_list_option_in_docstrings(model_mapping._model_mapping, use_model_types=False)(from_config)
697
+ cls.from_config = classmethod(from_config)
698
+
699
+ if name.startswith("TF"):
700
+ from_pretrained_docstring = FROM_PRETRAINED_TF_DOCSTRING
701
+ elif name.startswith("Flax"):
702
+ from_pretrained_docstring = FROM_PRETRAINED_FLAX_DOCSTRING
703
+ else:
704
+ from_pretrained_docstring = FROM_PRETRAINED_TORCH_DOCSTRING
705
+ from_pretrained = copy_func(_BaseAutoModelClass.from_pretrained)
706
+ from_pretrained_docstring = insert_head_doc(from_pretrained_docstring, head_doc=head_doc)
707
+ from_pretrained_docstring = from_pretrained_docstring.replace("BaseAutoModelClass", name)
708
+ from_pretrained_docstring = from_pretrained_docstring.replace("checkpoint_placeholder", checkpoint_for_example)
709
+ shortcut = checkpoint_for_example.split("/")[-1].split("-")[0]
710
+ from_pretrained_docstring = from_pretrained_docstring.replace("shortcut_placeholder", shortcut)
711
+ from_pretrained.__doc__ = from_pretrained_docstring
712
+ from_pretrained = replace_list_option_in_docstrings(model_mapping._model_mapping)(from_pretrained)
713
+ cls.from_pretrained = classmethod(from_pretrained)
714
+ return cls
715
+
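The `shortcut` used for the example file paths is derived from the example checkpoint name; for the default checkpoint it comes out as follows:

```python
checkpoint_for_example = "google-bert/bert-base-cased"
shortcut = checkpoint_for_example.split("/")[-1].split("-")[0]
print(shortcut)  # "bert" -> fills the "shortcut_placeholder" slots in the docstrings
```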
716
+
717
+ def get_values(model_mapping):
718
+ result = []
719
+ for model in model_mapping.values():
720
+ if isinstance(model, (list, tuple)):
721
+ result += list(model)
722
+ else:
723
+ result.append(model)
724
+
725
+ return result
726
+
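A tiny illustration of the flattening behavior, with integers standing in for model classes: tuple entries (several classes mapped to one config) are spliced into the result, scalar entries are appended.

```python
mapping = {"a": (1, 2), "b": 3}  # stand-ins for config -> model class entries
result = []
for model in mapping.values():
    result += list(model) if isinstance(model, (list, tuple)) else [model]
print(result)  # [1, 2, 3]
```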
727
+
728
+ def getattribute_from_module(module, attr):
729
+ if attr is None:
730
+ return None
731
+ if isinstance(attr, tuple):
732
+ return tuple(getattribute_from_module(module, a) for a in attr)
733
+ if hasattr(module, attr):
734
+ return getattr(module, attr)
735
+ # Some of the mappings have entries model_type -> object of another model type. In that case we try to grab the
736
+ # object at the top level.
737
+ transformers_module = importlib.import_module("transformers")
738
+
739
+ if module != transformers_module:
740
+ try:
741
+ return getattribute_from_module(transformers_module, attr)
742
+ except ValueError:
743
+ raise ValueError(f"Could not find {attr} neither in {module} nor in {transformers_module}!")
744
+ else:
745
+ raise ValueError(f"Could not find {attr} in {transformers_module}!")
746
+
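A self-contained sketch of the same lookup-with-fallback pattern, using `math` in place of a model submodule and `builtins` in place of the top-level `transformers` module:

```python
import builtins
import math

def lookup(module, attr):
    if hasattr(module, attr):
        return getattr(module, attr)
    if module is not builtins:  # fall back to the parent namespace once
        return lookup(builtins, attr)
    raise ValueError(f"Could not find {attr}!")

print(lookup(math, "pi"))   # resolved on math itself
print(lookup(math, "len"))  # resolved via the builtins fallback
```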
747
+
748
+ def add_generation_mixin_to_remote_model(model_class):
749
+ """
750
+ Adds `GenerationMixin` to the inheritance of `model_class`, if `model_class` is a PyTorch model.
751
+
752
+ This function is used for backwards compatibility purposes: in v4.45, we've started a deprecation cycle to make
753
+ `PreTrainedModel` stop inheriting from `GenerationMixin`. Without this function, older models dynamically loaded
754
+ from the Hub may not have the `generate` method after we remove the inheritance.
755
+ """
756
+ # 1. If it is not a PT model (i.e. doesn't inherit Module), do nothing
757
+ if "torch.nn.modules.module.Module" not in str(model_class.__mro__):
758
+ return model_class
759
+
760
+ # 2. If it already **directly** inherits from GenerationMixin, do nothing
761
+ if "GenerationMixin" in str(model_class.__bases__):
762
+ return model_class
763
+
764
+ # 3. Prior to v4.45, we could detect whether a model was `generate`-compatible if it had its own `generate` and/or
765
+ # `prepare_inputs_for_generation` method.
766
+ has_custom_generate_in_class = hasattr(model_class, "generate") and "GenerationMixin" not in str(
767
+ getattr(model_class, "generate")
768
+ )
769
+ has_custom_prepare_inputs = hasattr(model_class, "prepare_inputs_for_generation") and "GenerationMixin" not in str(
770
+ getattr(model_class, "prepare_inputs_for_generation")
771
+ )
772
+ if has_custom_generate_in_class or has_custom_prepare_inputs:
773
+ model_class_with_generation_mixin = type(
774
+ model_class.__name__, (model_class, GenerationMixin), {**model_class.__dict__}
775
+ )
776
+ return model_class_with_generation_mixin
777
+ return model_class
778
+
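A minimal sketch of the re-basing trick used above (`Legacy` and `Mixin` are illustrative stand-ins): `type(name, bases, namespace)` builds a new class whose MRO includes the mixin while preserving the original class's own attributes. `Legacy` inherits from an intermediate base here so that copying its `__dict__` does not drag along the `__dict__`/`__weakref__` descriptors.

```python
class Mixin:
    def generate(self):
        return "from mixin"

class Base:  # intermediate base keeps __dict__/__weakref__ out of Legacy.__dict__
    pass

class Legacy(Base):
    def prepare_inputs_for_generation(self):  # the "generate-compatible" marker
        return {}

Patched = type(Legacy.__name__, (Legacy, Mixin), {**Legacy.__dict__})
print(Patched().generate())  # "from mixin"
```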
779
+
780
+ class _LazyAutoMapping(OrderedDict[type[PretrainedConfig], _LazyAutoMappingValue]):
781
+ """
782
+     A mapping from config classes to objects (model or tokenizer classes, for instance) that lazily loads its keys and values when accessed.
783
+
784
+     Args:
785
+         - config_mapping: The mapping from model type to config class
786
+         - model_mapping: The mapping from model type to model (or tokenizer) class
787
+ """
788
+
789
+ def __init__(self, config_mapping, model_mapping) -> None:
790
+ self._config_mapping = config_mapping
791
+ self._reverse_config_mapping = {v: k for k, v in config_mapping.items()}
792
+ self._model_mapping = model_mapping
793
+ self._model_mapping._model_mapping = self
794
+ self._extra_content = {}
795
+ self._modules = {}
796
+
797
+ def __len__(self) -> int:
798
+ common_keys = set(self._config_mapping.keys()).intersection(self._model_mapping.keys())
799
+ return len(common_keys) + len(self._extra_content)
800
+
801
+ def __getitem__(self, key: type[PretrainedConfig]) -> _LazyAutoMappingValue:
802
+ if key in self._extra_content:
803
+ return self._extra_content[key]
804
+ model_type = self._reverse_config_mapping[key.__name__]
805
+ if model_type in self._model_mapping:
806
+ model_name = self._model_mapping[model_type]
807
+ return self._load_attr_from_module(model_type, model_name)
808
+
809
+         # Maybe there were several model types associated with this config.
810
+ model_types = [k for k, v in self._config_mapping.items() if v == key.__name__]
811
+ for mtype in model_types:
812
+ if mtype in self._model_mapping:
813
+ model_name = self._model_mapping[mtype]
814
+ return self._load_attr_from_module(mtype, model_name)
815
+ raise KeyError(key)
816
+
817
+ def _load_attr_from_module(self, model_type, attr):
818
+ module_name = model_type_to_module_name(model_type)
819
+ if module_name not in self._modules:
820
+ self._modules[module_name] = importlib.import_module(f".{module_name}", "transformers.models")
821
+ return getattribute_from_module(self._modules[module_name], attr)
822
+
823
+ def keys(self) -> list[type[PretrainedConfig]]:
824
+ mapping_keys = [
825
+ self._load_attr_from_module(key, name)
826
+ for key, name in self._config_mapping.items()
827
+ if key in self._model_mapping
828
+ ]
829
+ return mapping_keys + list(self._extra_content.keys())
830
+
831
+ def get(self, key: type[PretrainedConfig], default: _T) -> Union[_LazyAutoMappingValue, _T]:
832
+ try:
833
+ return self.__getitem__(key)
834
+ except KeyError:
835
+ return default
836
+
837
+ def __bool__(self) -> bool:
838
+ return bool(self.keys())
839
+
840
+ def values(self) -> list[_LazyAutoMappingValue]:
841
+ mapping_values = [
842
+ self._load_attr_from_module(key, name)
843
+ for key, name in self._model_mapping.items()
844
+ if key in self._config_mapping
845
+ ]
846
+ return mapping_values + list(self._extra_content.values())
847
+
848
+ def items(self) -> list[tuple[type[PretrainedConfig], _LazyAutoMappingValue]]:
849
+ mapping_items = [
850
+ (
851
+ self._load_attr_from_module(key, self._config_mapping[key]),
852
+ self._load_attr_from_module(key, self._model_mapping[key]),
853
+ )
854
+ for key in self._model_mapping
855
+ if key in self._config_mapping
856
+ ]
857
+ return mapping_items + list(self._extra_content.items())
858
+
859
+ def __iter__(self) -> Iterator[type[PretrainedConfig]]:
860
+ return iter(self.keys())
861
+
862
+ def __contains__(self, item: type) -> bool:
863
+ if item in self._extra_content:
864
+ return True
865
+ if not hasattr(item, "__name__") or item.__name__ not in self._reverse_config_mapping:
866
+ return False
867
+ model_type = self._reverse_config_mapping[item.__name__]
868
+ return model_type in self._model_mapping
869
+
870
+ def register(self, key: type[PretrainedConfig], value: _LazyAutoMappingValue, exist_ok=False) -> None:
871
+ """
872
+ Register a new model in this mapping.
873
+ """
874
+ if hasattr(key, "__name__") and key.__name__ in self._reverse_config_mapping:
875
+ model_type = self._reverse_config_mapping[key.__name__]
876
+ if model_type in self._model_mapping and not exist_ok:
877
+ raise ValueError(f"'{key}' is already used by a Transformers model.")
878
+
879
+ self._extra_content[key] = value
880
+
881
+
882
+ __all__ = ["get_values"]
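To close out this file, a toy sketch of the laziness `_LazyAutoMapping` implements, using only the standard library: values are stored as (module name, attribute name) pairs and imported on first access, which keeps import time cheap even with hundreds of model types.

```python
import importlib

class LazyMapping:
    def __init__(self, spec):
        self._spec = spec   # key -> (module_name, attr_name)
        self._cache = {}    # resolved values, filled on demand

    def __getitem__(self, key):
        if key not in self._cache:
            module_name, attr_name = self._spec[key]
            module = importlib.import_module(module_name)
            self._cache[key] = getattr(module, attr_name)
        return self._cache[key]

mapping = LazyMapping({"od": ("collections", "OrderedDict")})
print(mapping["od"])  # <class 'collections.OrderedDict'>, imported just now
```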
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/configuration_auto.py ADDED
@@ -0,0 +1,1404 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Auto Config class."""
16
+
17
+ import importlib
18
+ import os
19
+ import re
20
+ import warnings
21
+ from collections import OrderedDict
22
+ from collections.abc import Callable, Iterator, KeysView, ValuesView
23
+ from typing import Any, TypeVar, Union
24
+
25
+ from ...configuration_utils import PretrainedConfig
26
+ from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
27
+ from ...utils import CONFIG_NAME, logging
28
+
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ _CallableT = TypeVar("_CallableT", bound=Callable[..., Any])
34
+
35
+
36
+ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
37
+ [
38
+ # Add configs here
39
+ ("aimv2", "Aimv2Config"),
40
+ ("aimv2_vision_model", "Aimv2VisionConfig"),
41
+ ("albert", "AlbertConfig"),
42
+ ("align", "AlignConfig"),
43
+ ("altclip", "AltCLIPConfig"),
44
+ ("apertus", "ApertusConfig"),
45
+ ("arcee", "ArceeConfig"),
46
+ ("aria", "AriaConfig"),
47
+ ("aria_text", "AriaTextConfig"),
48
+ ("audio-spectrogram-transformer", "ASTConfig"),
49
+ ("autoformer", "AutoformerConfig"),
50
+ ("aya_vision", "AyaVisionConfig"),
51
+ ("bamba", "BambaConfig"),
52
+ ("bark", "BarkConfig"),
53
+ ("bart", "BartConfig"),
54
+ ("beit", "BeitConfig"),
55
+ ("bert", "BertConfig"),
56
+ ("bert-generation", "BertGenerationConfig"),
57
+ ("big_bird", "BigBirdConfig"),
58
+ ("bigbird_pegasus", "BigBirdPegasusConfig"),
59
+ ("biogpt", "BioGptConfig"),
60
+ ("bit", "BitConfig"),
61
+ ("bitnet", "BitNetConfig"),
62
+ ("blenderbot", "BlenderbotConfig"),
63
+ ("blenderbot-small", "BlenderbotSmallConfig"),
64
+ ("blip", "BlipConfig"),
65
+ ("blip-2", "Blip2Config"),
66
+ ("blip_2_qformer", "Blip2QFormerConfig"),
67
+ ("bloom", "BloomConfig"),
68
+ ("blt", "BltConfig"),
69
+ ("bridgetower", "BridgeTowerConfig"),
70
+ ("bros", "BrosConfig"),
71
+ ("camembert", "CamembertConfig"),
72
+ ("canine", "CanineConfig"),
73
+ ("chameleon", "ChameleonConfig"),
74
+ ("chinese_clip", "ChineseCLIPConfig"),
75
+ ("chinese_clip_vision_model", "ChineseCLIPVisionConfig"),
76
+ ("clap", "ClapConfig"),
77
+ ("clip", "CLIPConfig"),
78
+ ("clip_text_model", "CLIPTextConfig"),
79
+ ("clip_vision_model", "CLIPVisionConfig"),
80
+ ("clipseg", "CLIPSegConfig"),
81
+ ("clvp", "ClvpConfig"),
82
+ ("code_llama", "LlamaConfig"),
83
+ ("codegen", "CodeGenConfig"),
84
+ ("cohere", "CohereConfig"),
85
+ ("cohere2", "Cohere2Config"),
86
+ ("cohere2_vision", "Cohere2VisionConfig"),
87
+ ("colpali", "ColPaliConfig"),
88
+ ("colqwen2", "ColQwen2Config"),
89
+ ("conditional_detr", "ConditionalDetrConfig"),
90
+ ("convbert", "ConvBertConfig"),
91
+ ("convnext", "ConvNextConfig"),
92
+ ("convnextv2", "ConvNextV2Config"),
93
+ ("cpmant", "CpmAntConfig"),
94
+ ("csm", "CsmConfig"),
95
+ ("ctrl", "CTRLConfig"),
96
+ ("cvt", "CvtConfig"),
97
+ ("d_fine", "DFineConfig"),
98
+ ("dab-detr", "DabDetrConfig"),
99
+ ("dac", "DacConfig"),
100
+ ("data2vec-audio", "Data2VecAudioConfig"),
101
+ ("data2vec-text", "Data2VecTextConfig"),
102
+ ("data2vec-vision", "Data2VecVisionConfig"),
103
+ ("dbrx", "DbrxConfig"),
104
+ ("deberta", "DebertaConfig"),
105
+ ("deberta-v2", "DebertaV2Config"),
106
+ ("decision_transformer", "DecisionTransformerConfig"),
107
+ ("deepseek_v2", "DeepseekV2Config"),
108
+ ("deepseek_v3", "DeepseekV3Config"),
109
+ ("deepseek_vl", "DeepseekVLConfig"),
110
+ ("deepseek_vl_hybrid", "DeepseekVLHybridConfig"),
111
+ ("deformable_detr", "DeformableDetrConfig"),
112
+ ("deit", "DeiTConfig"),
113
+ ("depth_anything", "DepthAnythingConfig"),
114
+ ("depth_pro", "DepthProConfig"),
115
+ ("deta", "DetaConfig"),
116
+ ("detr", "DetrConfig"),
117
+ ("dia", "DiaConfig"),
118
+ ("diffllama", "DiffLlamaConfig"),
119
+ ("dinat", "DinatConfig"),
120
+ ("dinov2", "Dinov2Config"),
121
+ ("dinov2_with_registers", "Dinov2WithRegistersConfig"),
122
+ ("dinov3_convnext", "DINOv3ConvNextConfig"),
123
+ ("dinov3_vit", "DINOv3ViTConfig"),
124
+ ("distilbert", "DistilBertConfig"),
125
+ ("doge", "DogeConfig"),
126
+ ("donut-swin", "DonutSwinConfig"),
127
+ ("dots1", "Dots1Config"),
128
+ ("dpr", "DPRConfig"),
129
+ ("dpt", "DPTConfig"),
130
+ ("edgetam", "EdgeTamConfig"),
131
+ ("edgetam_video", "EdgeTamVideoConfig"),
132
+ ("edgetam_vision_model", "EdgeTamVisionConfig"),
133
+ ("efficientformer", "EfficientFormerConfig"),
134
+ ("efficientloftr", "EfficientLoFTRConfig"),
135
+ ("efficientnet", "EfficientNetConfig"),
136
+ ("electra", "ElectraConfig"),
137
+ ("emu3", "Emu3Config"),
138
+ ("encodec", "EncodecConfig"),
139
+ ("encoder-decoder", "EncoderDecoderConfig"),
140
+ ("eomt", "EomtConfig"),
141
+ ("ernie", "ErnieConfig"),
142
+ ("ernie4_5", "Ernie4_5Config"),
143
+ ("ernie4_5_moe", "Ernie4_5_MoeConfig"),
144
+ ("ernie_m", "ErnieMConfig"),
145
+ ("esm", "EsmConfig"),
146
+ ("evolla", "EvollaConfig"),
147
+ ("exaone4", "Exaone4Config"),
148
+ ("falcon", "FalconConfig"),
149
+ ("falcon_h1", "FalconH1Config"),
150
+ ("falcon_mamba", "FalconMambaConfig"),
151
+ ("fastspeech2_conformer", "FastSpeech2ConformerConfig"),
152
+ ("fastspeech2_conformer_with_hifigan", "FastSpeech2ConformerWithHifiGanConfig"),
153
+ ("flaubert", "FlaubertConfig"),
154
+ ("flava", "FlavaConfig"),
155
+ ("flex_olmo", "FlexOlmoConfig"),
156
+ ("florence2", "Florence2Config"),
157
+ ("fnet", "FNetConfig"),
158
+ ("focalnet", "FocalNetConfig"),
159
+ ("fsmt", "FSMTConfig"),
160
+ ("funnel", "FunnelConfig"),
161
+ ("fuyu", "FuyuConfig"),
162
+ ("gemma", "GemmaConfig"),
163
+ ("gemma2", "Gemma2Config"),
164
+ ("gemma3", "Gemma3Config"),
165
+ ("gemma3_text", "Gemma3TextConfig"),
166
+ ("gemma3n", "Gemma3nConfig"),
167
+ ("gemma3n_audio", "Gemma3nAudioConfig"),
168
+ ("gemma3n_text", "Gemma3nTextConfig"),
169
+ ("gemma3n_vision", "Gemma3nVisionConfig"),
170
+ ("git", "GitConfig"),
171
+ ("glm", "GlmConfig"),
172
+ ("glm4", "Glm4Config"),
173
+ ("glm4_moe", "Glm4MoeConfig"),
174
+ ("glm4v", "Glm4vConfig"),
175
+ ("glm4v_moe", "Glm4vMoeConfig"),
176
+ ("glm4v_moe_text", "Glm4vMoeTextConfig"),
177
+ ("glm4v_text", "Glm4vTextConfig"),
178
+ ("glpn", "GLPNConfig"),
179
+ ("got_ocr2", "GotOcr2Config"),
180
+ ("gpt-sw3", "GPT2Config"),
181
+ ("gpt2", "GPT2Config"),
182
+ ("gpt_bigcode", "GPTBigCodeConfig"),
183
+ ("gpt_neo", "GPTNeoConfig"),
184
+ ("gpt_neox", "GPTNeoXConfig"),
185
+ ("gpt_neox_japanese", "GPTNeoXJapaneseConfig"),
186
+ ("gpt_oss", "GptOssConfig"),
187
+ ("gptj", "GPTJConfig"),
188
+ ("gptsan-japanese", "GPTSanJapaneseConfig"),
189
+ ("granite", "GraniteConfig"),
190
+ ("granite_speech", "GraniteSpeechConfig"),
191
+ ("granitemoe", "GraniteMoeConfig"),
192
+ ("granitemoehybrid", "GraniteMoeHybridConfig"),
193
+ ("granitemoeshared", "GraniteMoeSharedConfig"),
194
+ ("granitevision", "LlavaNextConfig"),
195
+ ("graphormer", "GraphormerConfig"),
196
+ ("grounding-dino", "GroundingDinoConfig"),
197
+ ("groupvit", "GroupViTConfig"),
198
+ ("helium", "HeliumConfig"),
199
+ ("hgnet_v2", "HGNetV2Config"),
200
+ ("hiera", "HieraConfig"),
201
+ ("hubert", "HubertConfig"),
202
+ ("hunyuan_v1_dense", "HunYuanDenseV1Config"),
203
+ ("hunyuan_v1_moe", "HunYuanMoEV1Config"),
204
+ ("ibert", "IBertConfig"),
205
+ ("idefics", "IdeficsConfig"),
206
+ ("idefics2", "Idefics2Config"),
207
+ ("idefics3", "Idefics3Config"),
208
+ ("idefics3_vision", "Idefics3VisionConfig"),
209
+ ("ijepa", "IJepaConfig"),
210
+ ("imagegpt", "ImageGPTConfig"),
211
+ ("informer", "InformerConfig"),
212
+ ("instructblip", "InstructBlipConfig"),
213
+ ("instructblipvideo", "InstructBlipVideoConfig"),
214
+ ("internvl", "InternVLConfig"),
215
+ ("internvl_vision", "InternVLVisionConfig"),
216
+ ("jamba", "JambaConfig"),
217
+ ("janus", "JanusConfig"),
218
+ ("jetmoe", "JetMoeConfig"),
219
+ ("jukebox", "JukeboxConfig"),
220
+ ("kosmos-2", "Kosmos2Config"),
221
+ ("kosmos-2.5", "Kosmos2_5Config"),
222
+ ("kyutai_speech_to_text", "KyutaiSpeechToTextConfig"),
223
+ ("layoutlm", "LayoutLMConfig"),
224
+ ("layoutlmv2", "LayoutLMv2Config"),
225
+ ("layoutlmv3", "LayoutLMv3Config"),
226
+ ("led", "LEDConfig"),
227
+ ("levit", "LevitConfig"),
228
+ ("lfm2", "Lfm2Config"),
229
+ ("lfm2_vl", "Lfm2VlConfig"),
230
+ ("lightglue", "LightGlueConfig"),
231
+ ("lilt", "LiltConfig"),
232
+ ("llama", "LlamaConfig"),
233
+ ("llama4", "Llama4Config"),
234
+ ("llama4_text", "Llama4TextConfig"),
235
+ ("llava", "LlavaConfig"),
236
+ ("llava_next", "LlavaNextConfig"),
237
+ ("llava_next_video", "LlavaNextVideoConfig"),
238
+ ("llava_onevision", "LlavaOnevisionConfig"),
239
+ ("longcat_flash", "LongcatFlashConfig"),
240
+ ("longformer", "LongformerConfig"),
241
+ ("longt5", "LongT5Config"),
242
+ ("luke", "LukeConfig"),
243
+ ("lxmert", "LxmertConfig"),
244
+ ("m2m_100", "M2M100Config"),
245
+ ("mamba", "MambaConfig"),
246
+ ("mamba2", "Mamba2Config"),
247
+ ("marian", "MarianConfig"),
248
+ ("markuplm", "MarkupLMConfig"),
249
+ ("mask2former", "Mask2FormerConfig"),
250
+ ("maskformer", "MaskFormerConfig"),
251
+ ("maskformer-swin", "MaskFormerSwinConfig"),
252
+ ("mbart", "MBartConfig"),
253
+ ("mctct", "MCTCTConfig"),
254
+ ("mega", "MegaConfig"),
255
+ ("megatron-bert", "MegatronBertConfig"),
256
+ ("metaclip_2", "MetaClip2Config"),
257
+ ("mgp-str", "MgpstrConfig"),
258
+ ("mimi", "MimiConfig"),
259
+ ("minimax", "MiniMaxConfig"),
260
+ ("ministral", "MinistralConfig"),
261
+ ("mistral", "MistralConfig"),
262
+ ("mistral3", "Mistral3Config"),
263
+ ("mixtral", "MixtralConfig"),
264
+ ("mlcd", "MLCDVisionConfig"),
265
+ ("mllama", "MllamaConfig"),
266
+ ("mm-grounding-dino", "MMGroundingDinoConfig"),
267
+ ("mobilebert", "MobileBertConfig"),
268
+ ("mobilenet_v1", "MobileNetV1Config"),
269
+ ("mobilenet_v2", "MobileNetV2Config"),
270
+ ("mobilevit", "MobileViTConfig"),
271
+ ("mobilevitv2", "MobileViTV2Config"),
272
+ ("modernbert", "ModernBertConfig"),
273
+ ("modernbert-decoder", "ModernBertDecoderConfig"),
274
+ ("moonshine", "MoonshineConfig"),
275
+ ("moshi", "MoshiConfig"),
276
+ ("mpnet", "MPNetConfig"),
277
+ ("mpt", "MptConfig"),
278
+ ("mra", "MraConfig"),
279
+ ("mt5", "MT5Config"),
280
+ ("musicgen", "MusicgenConfig"),
281
+ ("musicgen_melody", "MusicgenMelodyConfig"),
282
+ ("mvp", "MvpConfig"),
283
+ ("nat", "NatConfig"),
284
+ ("nemotron", "NemotronConfig"),
285
+ ("nezha", "NezhaConfig"),
286
+ ("nllb-moe", "NllbMoeConfig"),
287
+ ("nougat", "VisionEncoderDecoderConfig"),
288
+ ("nystromformer", "NystromformerConfig"),
289
+ ("olmo", "OlmoConfig"),
290
+ ("olmo2", "Olmo2Config"),
291
+ ("olmo3", "Olmo3Config"),
292
+ ("olmoe", "OlmoeConfig"),
293
+ ("omdet-turbo", "OmDetTurboConfig"),
294
+ ("oneformer", "OneFormerConfig"),
295
+ ("open-llama", "OpenLlamaConfig"),
296
+ ("openai-gpt", "OpenAIGPTConfig"),
297
+ ("opt", "OPTConfig"),
298
+ ("ovis2", "Ovis2Config"),
299
+ ("owlv2", "Owlv2Config"),
300
+ ("owlvit", "OwlViTConfig"),
301
+ ("paligemma", "PaliGemmaConfig"),
302
+ ("parakeet_ctc", "ParakeetCTCConfig"),
303
+ ("parakeet_encoder", "ParakeetEncoderConfig"),
304
+ ("patchtsmixer", "PatchTSMixerConfig"),
305
+ ("patchtst", "PatchTSTConfig"),
306
+ ("pegasus", "PegasusConfig"),
307
+ ("pegasus_x", "PegasusXConfig"),
308
+ ("perceiver", "PerceiverConfig"),
309
+ ("perception_encoder", "TimmWrapperConfig"),
310
+ ("perception_lm", "PerceptionLMConfig"),
311
+ ("persimmon", "PersimmonConfig"),
312
+ ("phi", "PhiConfig"),
313
+ ("phi3", "Phi3Config"),
314
+ ("phi4_multimodal", "Phi4MultimodalConfig"),
315
+ ("phimoe", "PhimoeConfig"),
316
+ ("pix2struct", "Pix2StructConfig"),
317
+ ("pixtral", "PixtralVisionConfig"),
318
+ ("plbart", "PLBartConfig"),
319
+ ("poolformer", "PoolFormerConfig"),
320
+ ("pop2piano", "Pop2PianoConfig"),
321
+ ("prompt_depth_anything", "PromptDepthAnythingConfig"),
322
+ ("prophetnet", "ProphetNetConfig"),
323
+ ("pvt", "PvtConfig"),
324
+ ("pvt_v2", "PvtV2Config"),
325
+ ("qdqbert", "QDQBertConfig"),
326
+ ("qwen2", "Qwen2Config"),
327
+ ("qwen2_5_omni", "Qwen2_5OmniConfig"),
328
+ ("qwen2_5_vl", "Qwen2_5_VLConfig"),
329
+ ("qwen2_5_vl_text", "Qwen2_5_VLTextConfig"),
330
+ ("qwen2_audio", "Qwen2AudioConfig"),
331
+ ("qwen2_audio_encoder", "Qwen2AudioEncoderConfig"),
332
+ ("qwen2_moe", "Qwen2MoeConfig"),
333
+ ("qwen2_vl", "Qwen2VLConfig"),
334
+ ("qwen2_vl_text", "Qwen2VLTextConfig"),
335
+ ("qwen3", "Qwen3Config"),
336
+ ("qwen3_moe", "Qwen3MoeConfig"),
337
+ ("qwen3_next", "Qwen3NextConfig"),
338
+ ("qwen3_omni_moe", "Qwen3OmniMoeConfig"),
339
+ ("qwen3_vl", "Qwen3VLConfig"),
340
+ ("qwen3_vl_moe", "Qwen3VLMoeConfig"),
341
+ ("qwen3_vl_moe_text", "Qwen3VLMoeTextConfig"),
342
+ ("qwen3_vl_text", "Qwen3VLTextConfig"),
343
+ ("rag", "RagConfig"),
344
+ ("realm", "RealmConfig"),
345
+ ("recurrent_gemma", "RecurrentGemmaConfig"),
346
+ ("reformer", "ReformerConfig"),
347
+ ("regnet", "RegNetConfig"),
348
+ ("rembert", "RemBertConfig"),
349
+ ("resnet", "ResNetConfig"),
350
+ ("retribert", "RetriBertConfig"),
351
+ ("roberta", "RobertaConfig"),
352
+ ("roberta-prelayernorm", "RobertaPreLayerNormConfig"),
353
+ ("roc_bert", "RoCBertConfig"),
354
+ ("roformer", "RoFormerConfig"),
355
+ ("rt_detr", "RTDetrConfig"),
356
+ ("rt_detr_resnet", "RTDetrResNetConfig"),
357
+ ("rt_detr_v2", "RTDetrV2Config"),
358
+ ("rwkv", "RwkvConfig"),
359
+ ("sam", "SamConfig"),
360
+ ("sam2", "Sam2Config"),
361
+ ("sam2_hiera_det_model", "Sam2HieraDetConfig"),
362
+ ("sam2_video", "Sam2VideoConfig"),
363
+ ("sam2_vision_model", "Sam2VisionConfig"),
364
+ ("sam_hq", "SamHQConfig"),
365
+ ("sam_hq_vision_model", "SamHQVisionConfig"),
366
+ ("sam_vision_model", "SamVisionConfig"),
367
+ ("seamless_m4t", "SeamlessM4TConfig"),
368
+ ("seamless_m4t_v2", "SeamlessM4Tv2Config"),
369
+ ("seed_oss", "SeedOssConfig"),
370
+ ("segformer", "SegformerConfig"),
371
+ ("seggpt", "SegGptConfig"),
372
+ ("sew", "SEWConfig"),
373
+ ("sew-d", "SEWDConfig"),
374
+ ("shieldgemma2", "ShieldGemma2Config"),
375
+ ("siglip", "SiglipConfig"),
376
+ ("siglip2", "Siglip2Config"),
377
+ ("siglip2_vision_model", "Siglip2VisionConfig"),
378
+ ("siglip_vision_model", "SiglipVisionConfig"),
379
+ ("smollm3", "SmolLM3Config"),
380
+ ("smolvlm", "SmolVLMConfig"),
381
+ ("smolvlm_vision", "SmolVLMVisionConfig"),
382
+ ("speech-encoder-decoder", "SpeechEncoderDecoderConfig"),
383
+ ("speech_to_text", "Speech2TextConfig"),
384
+ ("speech_to_text_2", "Speech2Text2Config"),
385
+ ("speecht5", "SpeechT5Config"),
386
+ ("splinter", "SplinterConfig"),
387
+ ("squeezebert", "SqueezeBertConfig"),
388
+ ("stablelm", "StableLmConfig"),
389
+ ("starcoder2", "Starcoder2Config"),
390
+ ("superglue", "SuperGlueConfig"),
391
+ ("superpoint", "SuperPointConfig"),
392
+ ("swiftformer", "SwiftFormerConfig"),
393
+ ("swin", "SwinConfig"),
394
+ ("swin2sr", "Swin2SRConfig"),
395
+ ("swinv2", "Swinv2Config"),
396
+ ("switch_transformers", "SwitchTransformersConfig"),
397
+ ("t5", "T5Config"),
398
+ ("t5gemma", "T5GemmaConfig"),
399
+ ("table-transformer", "TableTransformerConfig"),
400
+ ("tapas", "TapasConfig"),
401
+ ("textnet", "TextNetConfig"),
402
+ ("time_series_transformer", "TimeSeriesTransformerConfig"),
403
+ ("timesfm", "TimesFmConfig"),
404
+ ("timesformer", "TimesformerConfig"),
405
+ ("timm_backbone", "TimmBackboneConfig"),
406
+ ("timm_wrapper", "TimmWrapperConfig"),
407
+ ("trajectory_transformer", "TrajectoryTransformerConfig"),
408
+ ("transfo-xl", "TransfoXLConfig"),
409
+ ("trocr", "TrOCRConfig"),
410
+ ("tvlt", "TvltConfig"),
411
+ ("tvp", "TvpConfig"),
412
+ ("udop", "UdopConfig"),
413
+ ("umt5", "UMT5Config"),
414
+ ("unispeech", "UniSpeechConfig"),
415
+ ("unispeech-sat", "UniSpeechSatConfig"),
416
+ ("univnet", "UnivNetConfig"),
417
+ ("upernet", "UperNetConfig"),
418
+ ("van", "VanConfig"),
419
+ ("vaultgemma", "VaultGemmaConfig"),
420
+ ("video_llava", "VideoLlavaConfig"),
421
+ ("videomae", "VideoMAEConfig"),
422
+ ("vilt", "ViltConfig"),
423
+ ("vipllava", "VipLlavaConfig"),
424
+ ("vision-encoder-decoder", "VisionEncoderDecoderConfig"),
425
+ ("vision-text-dual-encoder", "VisionTextDualEncoderConfig"),
426
+ ("visual_bert", "VisualBertConfig"),
427
+ ("vit", "ViTConfig"),
428
+ ("vit_hybrid", "ViTHybridConfig"),
429
+ ("vit_mae", "ViTMAEConfig"),
430
+ ("vit_msn", "ViTMSNConfig"),
431
+ ("vitdet", "VitDetConfig"),
432
+ ("vitmatte", "VitMatteConfig"),
433
+ ("vitpose", "VitPoseConfig"),
434
+ ("vitpose_backbone", "VitPoseBackboneConfig"),
435
+ ("vits", "VitsConfig"),
436
+ ("vivit", "VivitConfig"),
437
+ ("vjepa2", "VJEPA2Config"),
438
+ ("voxtral", "VoxtralConfig"),
439
+ ("voxtral_encoder", "VoxtralEncoderConfig"),
440
+ ("wav2vec2", "Wav2Vec2Config"),
441
+ ("wav2vec2-bert", "Wav2Vec2BertConfig"),
442
+ ("wav2vec2-conformer", "Wav2Vec2ConformerConfig"),
443
+ ("wavlm", "WavLMConfig"),
444
+ ("whisper", "WhisperConfig"),
445
+ ("xclip", "XCLIPConfig"),
446
+ ("xcodec", "XcodecConfig"),
447
+ ("xglm", "XGLMConfig"),
448
+ ("xlm", "XLMConfig"),
449
+ ("xlm-prophetnet", "XLMProphetNetConfig"),
450
+ ("xlm-roberta", "XLMRobertaConfig"),
451
+ ("xlm-roberta-xl", "XLMRobertaXLConfig"),
452
+ ("xlnet", "XLNetConfig"),
453
+ ("xlstm", "xLSTMConfig"),
454
+ ("xmod", "XmodConfig"),
455
+ ("yolos", "YolosConfig"),
456
+ ("yoso", "YosoConfig"),
457
+ ("zamba", "ZambaConfig"),
458
+ ("zamba2", "Zamba2Config"),
459
+ ("zoedepth", "ZoeDepthConfig"),
460
+ ]
461
+ )
462
+
463
+
464
+ MODEL_NAMES_MAPPING = OrderedDict[str, str](
465
+ [
466
+ # Add full (and cased) model names here
467
+ ("aimv2", "AIMv2"),
468
+ ("aimv2_vision_model", "Aimv2VisionModel"),
469
+ ("albert", "ALBERT"),
470
+ ("align", "ALIGN"),
471
+ ("altclip", "AltCLIP"),
472
+ ("apertus", "Apertus"),
473
+ ("arcee", "Arcee"),
474
+ ("aria", "Aria"),
475
+ ("aria_text", "AriaText"),
476
+ ("audio-spectrogram-transformer", "Audio Spectrogram Transformer"),
477
+ ("autoformer", "Autoformer"),
478
+ ("aya_vision", "AyaVision"),
479
+ ("bamba", "Bamba"),
480
+ ("bark", "Bark"),
481
+ ("bart", "BART"),
482
+ ("barthez", "BARThez"),
483
+ ("bartpho", "BARTpho"),
484
+ ("beit", "BEiT"),
485
+ ("bert", "BERT"),
486
+ ("bert-generation", "Bert Generation"),
487
+ ("bert-japanese", "BertJapanese"),
488
+ ("bertweet", "BERTweet"),
489
+ ("big_bird", "BigBird"),
490
+ ("bigbird_pegasus", "BigBird-Pegasus"),
491
+ ("biogpt", "BioGpt"),
492
+ ("bit", "BiT"),
493
+ ("bitnet", "BitNet"),
494
+ ("blenderbot", "Blenderbot"),
495
+ ("blenderbot-small", "BlenderbotSmall"),
496
+ ("blip", "BLIP"),
497
+ ("blip-2", "BLIP-2"),
498
+ ("blip_2_qformer", "BLIP-2 QFormer"),
499
+ ("bloom", "BLOOM"),
500
+ ("blt", "Blt"),
501
+ ("bort", "BORT"),
502
+ ("bridgetower", "BridgeTower"),
503
+ ("bros", "BROS"),
504
+ ("byt5", "ByT5"),
505
+ ("camembert", "CamemBERT"),
506
+ ("canine", "CANINE"),
507
+ ("chameleon", "Chameleon"),
508
+ ("chinese_clip", "Chinese-CLIP"),
509
+ ("chinese_clip_vision_model", "ChineseCLIPVisionModel"),
510
+ ("clap", "CLAP"),
511
+ ("clip", "CLIP"),
512
+ ("clip_text_model", "CLIPTextModel"),
513
+ ("clip_vision_model", "CLIPVisionModel"),
514
+ ("clipseg", "CLIPSeg"),
515
+ ("clvp", "CLVP"),
516
+ ("code_llama", "CodeLlama"),
517
+ ("codegen", "CodeGen"),
518
+ ("cohere", "Cohere"),
519
+ ("cohere2", "Cohere2"),
520
+ ("cohere2_vision", "Cohere2Vision"),
521
+ ("colpali", "ColPali"),
522
+ ("colqwen2", "ColQwen2"),
523
+ ("conditional_detr", "Conditional DETR"),
524
+ ("convbert", "ConvBERT"),
525
+ ("convnext", "ConvNeXT"),
526
+ ("convnextv2", "ConvNeXTV2"),
527
+ ("cpm", "CPM"),
528
+ ("cpmant", "CPM-Ant"),
529
+ ("csm", "CSM"),
530
+ ("ctrl", "CTRL"),
531
+ ("cvt", "CvT"),
532
+ ("d_fine", "D-FINE"),
533
+ ("dab-detr", "DAB-DETR"),
534
+ ("dac", "DAC"),
535
+ ("data2vec-audio", "Data2VecAudio"),
536
+ ("data2vec-text", "Data2VecText"),
537
+ ("data2vec-vision", "Data2VecVision"),
538
+ ("dbrx", "DBRX"),
539
+ ("deberta", "DeBERTa"),
540
+ ("deberta-v2", "DeBERTa-v2"),
541
+ ("decision_transformer", "Decision Transformer"),
542
+ ("deepseek_v2", "DeepSeek-V2"),
543
+ ("deepseek_v3", "DeepSeek-V3"),
544
+ ("deepseek_vl", "DeepseekVL"),
545
+ ("deepseek_vl_hybrid", "DeepseekVLHybrid"),
546
+ ("deformable_detr", "Deformable DETR"),
547
+ ("deit", "DeiT"),
548
+ ("deplot", "DePlot"),
549
+ ("depth_anything", "Depth Anything"),
550
+ ("depth_anything_v2", "Depth Anything V2"),
551
+ ("depth_pro", "DepthPro"),
552
+ ("deta", "DETA"),
553
+ ("detr", "DETR"),
554
+ ("dia", "Dia"),
555
+ ("dialogpt", "DialoGPT"),
556
+ ("diffllama", "DiffLlama"),
557
+ ("dinat", "DiNAT"),
558
+ ("dinov2", "DINOv2"),
559
+ ("dinov2_with_registers", "DINOv2 with Registers"),
560
+ ("dinov3_convnext", "DINOv3 ConvNext"),
561
+ ("dinov3_vit", "DINOv3 ViT"),
562
+ ("distilbert", "DistilBERT"),
563
+ ("dit", "DiT"),
564
+ ("doge", "Doge"),
565
+ ("donut-swin", "DonutSwin"),
566
+ ("dots1", "dots1"),
567
+ ("dpr", "DPR"),
568
+ ("dpt", "DPT"),
569
+ ("edgetam", "EdgeTAM"),
570
+ ("edgetam_video", "EdgeTamVideo"),
571
+ ("edgetam_vision_model", "EdgeTamVisionModel"),
572
+ ("efficientformer", "EfficientFormer"),
573
+ ("efficientloftr", "EfficientLoFTR"),
574
+ ("efficientnet", "EfficientNet"),
575
+ ("electra", "ELECTRA"),
576
+ ("emu3", "Emu3"),
577
+ ("encodec", "EnCodec"),
578
+ ("encoder-decoder", "Encoder decoder"),
579
+ ("eomt", "EoMT"),
580
+ ("ernie", "ERNIE"),
581
+ ("ernie4_5", "Ernie4_5"),
582
+ ("ernie4_5_moe", "Ernie4_5_MoE"),
583
+ ("ernie_m", "ErnieM"),
584
+ ("esm", "ESM"),
585
+ ("evolla", "Evolla"),
586
+ ("exaone4", "EXAONE-4.0"),
587
+ ("falcon", "Falcon"),
588
+ ("falcon3", "Falcon3"),
589
+ ("falcon_h1", "FalconH1"),
590
+ ("falcon_mamba", "FalconMamba"),
591
+ ("fastspeech2_conformer", "FastSpeech2Conformer"),
592
+ ("fastspeech2_conformer_with_hifigan", "FastSpeech2ConformerWithHifiGan"),
593
+ ("flan-t5", "FLAN-T5"),
594
+ ("flan-ul2", "FLAN-UL2"),
595
+ ("flaubert", "FlauBERT"),
596
+ ("flava", "FLAVA"),
597
+ ("flex_olmo", "FlexOlmo"),
598
+ ("florence2", "Florence2"),
599
+ ("fnet", "FNet"),
600
+ ("focalnet", "FocalNet"),
601
+ ("fsmt", "FairSeq Machine-Translation"),
602
+ ("funnel", "Funnel Transformer"),
603
+ ("fuyu", "Fuyu"),
604
+ ("gemma", "Gemma"),
605
+ ("gemma2", "Gemma2"),
606
+ ("gemma3", "Gemma3ForConditionalGeneration"),
607
+ ("gemma3_text", "Gemma3ForCausalLM"),
608
+ ("gemma3n", "Gemma3nForConditionalGeneration"),
609
+ ("gemma3n_audio", "Gemma3nAudioEncoder"),
610
+ ("gemma3n_text", "Gemma3nForCausalLM"),
611
+ ("gemma3n_vision", "TimmWrapperModel"),
612
+ ("git", "GIT"),
613
+ ("glm", "GLM"),
614
+ ("glm4", "GLM4"),
615
+ ("glm4_moe", "Glm4MoE"),
616
+ ("glm4v", "GLM4V"),
617
+ ("glm4v_moe", "GLM4VMOE"),
618
+ ("glm4v_moe_text", "GLM4VMOE"),
619
+ ("glm4v_text", "GLM4V"),
620
+ ("glpn", "GLPN"),
621
+ ("got_ocr2", "GOT-OCR2"),
622
+ ("gpt-sw3", "GPT-Sw3"),
623
+ ("gpt2", "OpenAI GPT-2"),
624
+ ("gpt_bigcode", "GPTBigCode"),
625
+ ("gpt_neo", "GPT Neo"),
626
+ ("gpt_neox", "GPT NeoX"),
627
+ ("gpt_neox_japanese", "GPT NeoX Japanese"),
628
+ ("gpt_oss", "GptOss"),
629
+ ("gptj", "GPT-J"),
630
+ ("gptsan-japanese", "GPTSAN-japanese"),
631
+ ("granite", "Granite"),
632
+ ("granite_speech", "GraniteSpeech"),
633
+ ("granitemoe", "GraniteMoeMoe"),
634
+ ("granitemoehybrid", "GraniteMoeHybrid"),
635
+ ("granitemoeshared", "GraniteMoeSharedMoe"),
636
+ ("granitevision", "LLaVA-NeXT"),
637
+ ("graphormer", "Graphormer"),
638
+ ("grounding-dino", "Grounding DINO"),
639
+ ("groupvit", "GroupViT"),
640
+ ("helium", "Helium"),
641
+ ("herbert", "HerBERT"),
642
+ ("hgnet_v2", "HGNet-V2"),
643
+ ("hiera", "Hiera"),
644
+ ("hubert", "Hubert"),
645
+ ("hunyuan_v1_dense", "HunYuanDenseV1"),
646
+ ("hunyuan_v1_moe", "HunYuanMoeV1"),
647
+ ("ibert", "I-BERT"),
648
+ ("idefics", "IDEFICS"),
649
+ ("idefics2", "Idefics2"),
650
+ ("idefics3", "Idefics3"),
651
+ ("idefics3_vision", "Idefics3VisionTransformer"),
652
+ ("ijepa", "I-JEPA"),
653
+ ("imagegpt", "ImageGPT"),
654
+ ("informer", "Informer"),
655
+ ("instructblip", "InstructBLIP"),
656
+ ("instructblipvideo", "InstructBlipVideo"),
657
+ ("internvl", "InternVL"),
658
+ ("internvl_vision", "InternVLVision"),
659
+ ("jamba", "Jamba"),
660
+ ("janus", "Janus"),
661
+ ("jetmoe", "JetMoe"),
662
+ ("jukebox", "Jukebox"),
663
+ ("kosmos-2", "KOSMOS-2"),
664
+ ("kosmos-2.5", "KOSMOS-2.5"),
665
+ ("kyutai_speech_to_text", "KyutaiSpeechToText"),
666
+ ("layoutlm", "LayoutLM"),
667
+ ("layoutlmv2", "LayoutLMv2"),
668
+ ("layoutlmv3", "LayoutLMv3"),
669
+ ("layoutxlm", "LayoutXLM"),
670
+ ("led", "LED"),
671
+ ("levit", "LeViT"),
672
+ ("lfm2", "Lfm2"),
673
+ ("lfm2_vl", "Lfm2Vl"),
674
+ ("lightglue", "LightGlue"),
675
+ ("lilt", "LiLT"),
676
+ ("llama", "LLaMA"),
677
+ ("llama2", "Llama2"),
678
+ ("llama3", "Llama3"),
679
+ ("llama4", "Llama4"),
680
+ ("llama4_text", "Llama4ForCausalLM"),
681
+ ("llava", "LLaVa"),
682
+ ("llava_next", "LLaVA-NeXT"),
683
+ ("llava_next_video", "LLaVa-NeXT-Video"),
684
+ ("llava_onevision", "LLaVA-Onevision"),
685
+ ("longcat_flash", "LongCatFlash"),
686
+ ("longformer", "Longformer"),
687
+ ("longt5", "LongT5"),
688
+ ("luke", "LUKE"),
689
+ ("lxmert", "LXMERT"),
690
+ ("m2m_100", "M2M100"),
691
+ ("madlad-400", "MADLAD-400"),
692
+ ("mamba", "Mamba"),
693
+ ("mamba2", "mamba2"),
694
+ ("marian", "Marian"),
695
+ ("markuplm", "MarkupLM"),
696
+ ("mask2former", "Mask2Former"),
697
+ ("maskformer", "MaskFormer"),
698
+ ("maskformer-swin", "MaskFormerSwin"),
699
+ ("matcha", "MatCha"),
700
+ ("mbart", "mBART"),
701
+ ("mbart50", "mBART-50"),
702
+ ("mctct", "M-CTC-T"),
703
+ ("mega", "MEGA"),
704
+ ("megatron-bert", "Megatron-BERT"),
705
+ ("megatron_gpt2", "Megatron-GPT2"),
706
+ ("metaclip_2", "MetaCLIP 2"),
707
+ ("mgp-str", "MGP-STR"),
708
+ ("mimi", "Mimi"),
709
+ ("minimax", "MiniMax"),
710
+ ("ministral", "Ministral"),
711
+ ("mistral", "Mistral"),
712
+ ("mistral3", "Mistral3"),
713
+ ("mixtral", "Mixtral"),
714
+ ("mlcd", "MLCD"),
715
+ ("mllama", "Mllama"),
716
+ ("mluke", "mLUKE"),
717
+ ("mm-grounding-dino", "MM Grounding DINO"),
718
+ ("mms", "MMS"),
719
+ ("mobilebert", "MobileBERT"),
720
+ ("mobilenet_v1", "MobileNetV1"),
721
+ ("mobilenet_v2", "MobileNetV2"),
722
+ ("mobilevit", "MobileViT"),
723
+ ("mobilevitv2", "MobileViTV2"),
724
+ ("modernbert", "ModernBERT"),
725
+ ("modernbert-decoder", "ModernBertDecoder"),
726
+ ("moonshine", "Moonshine"),
727
+ ("moshi", "Moshi"),
728
+ ("mpnet", "MPNet"),
729
+ ("mpt", "MPT"),
730
+ ("mra", "MRA"),
731
+ ("mt5", "MT5"),
732
+ ("musicgen", "MusicGen"),
733
+ ("musicgen_melody", "MusicGen Melody"),
734
+ ("mvp", "MVP"),
735
+ ("myt5", "myt5"),
736
+ ("nat", "NAT"),
737
+ ("nemotron", "Nemotron"),
738
+ ("nezha", "Nezha"),
739
+ ("nllb", "NLLB"),
740
+ ("nllb-moe", "NLLB-MOE"),
741
+ ("nougat", "Nougat"),
742
+ ("nystromformer", "Nyströmformer"),
743
+ ("olmo", "OLMo"),
744
+ ("olmo2", "OLMo2"),
745
+ ("olmo3", "Olmo3"),
746
+ ("olmoe", "OLMoE"),
747
+ ("omdet-turbo", "OmDet-Turbo"),
748
+ ("oneformer", "OneFormer"),
749
+ ("open-llama", "OpenLlama"),
750
+ ("openai-gpt", "OpenAI GPT"),
751
+ ("opt", "OPT"),
752
+ ("ovis2", "Ovis2"),
753
+ ("owlv2", "OWLv2"),
754
+ ("owlvit", "OWL-ViT"),
755
+ ("paligemma", "PaliGemma"),
756
+ ("parakeet", "Parakeet"),
757
+ ("parakeet_ctc", "Parakeet"),
758
+ ("parakeet_encoder", "ParakeetEncoder"),
759
+ ("patchtsmixer", "PatchTSMixer"),
760
+ ("patchtst", "PatchTST"),
761
+ ("pegasus", "Pegasus"),
762
+ ("pegasus_x", "PEGASUS-X"),
763
+ ("perceiver", "Perceiver"),
764
+ ("perception_encoder", "PerceptionEncoder"),
765
+ ("perception_lm", "PerceptionLM"),
766
+ ("persimmon", "Persimmon"),
767
+ ("phi", "Phi"),
768
+ ("phi3", "Phi3"),
769
+ ("phi4_multimodal", "Phi4Multimodal"),
770
+ ("phimoe", "Phimoe"),
771
+ ("phobert", "PhoBERT"),
772
+ ("pix2struct", "Pix2Struct"),
773
+ ("pixtral", "Pixtral"),
774
+ ("plbart", "PLBart"),
775
+ ("poolformer", "PoolFormer"),
776
+ ("pop2piano", "Pop2Piano"),
777
+ ("prompt_depth_anything", "PromptDepthAnything"),
778
+ ("prophetnet", "ProphetNet"),
779
+ ("pvt", "PVT"),
780
+ ("pvt_v2", "PVTv2"),
781
+ ("qdqbert", "QDQBert"),
782
+ ("qwen2", "Qwen2"),
783
+ ("qwen2_5_omni", "Qwen2_5Omni"),
784
+ ("qwen2_5_vl", "Qwen2_5_VL"),
785
+ ("qwen2_5_vl_text", "Qwen2_5_VL"),
786
+ ("qwen2_audio", "Qwen2Audio"),
787
+ ("qwen2_audio_encoder", "Qwen2AudioEncoder"),
788
+ ("qwen2_moe", "Qwen2MoE"),
789
+ ("qwen2_vl", "Qwen2VL"),
790
+ ("qwen2_vl_text", "Qwen2VL"),
791
+ ("qwen3", "Qwen3"),
792
+ ("qwen3_moe", "Qwen3MoE"),
793
+ ("qwen3_next", "Qwen3Next"),
794
+ ("qwen3_omni_moe", "Qwen3OmniMoE"),
795
+ ("qwen3_vl", "Qwen3VL"),
796
+ ("qwen3_vl_moe", "Qwen3VLMoe"),
797
+ ("qwen3_vl_moe_text", "Qwen3VLMoe"),
798
+ ("qwen3_vl_text", "Qwen3VL"),
799
+ ("rag", "RAG"),
800
+ ("realm", "REALM"),
801
+ ("recurrent_gemma", "RecurrentGemma"),
802
+ ("reformer", "Reformer"),
803
+ ("regnet", "RegNet"),
804
+ ("rembert", "RemBERT"),
805
+ ("resnet", "ResNet"),
806
+ ("retribert", "RetriBERT"),
807
+ ("roberta", "RoBERTa"),
808
+ ("roberta-prelayernorm", "RoBERTa-PreLayerNorm"),
809
+ ("roc_bert", "RoCBert"),
810
+ ("roformer", "RoFormer"),
811
+ ("rt_detr", "RT-DETR"),
812
+ ("rt_detr_resnet", "RT-DETR-ResNet"),
813
+ ("rt_detr_v2", "RT-DETRv2"),
814
+ ("rwkv", "RWKV"),
815
+ ("sam", "SAM"),
816
+ ("sam2", "SAM2"),
817
+ ("sam2_hiera_det_model", "Sam2HieraDetModel"),
818
+ ("sam2_video", "Sam2VideoModel"),
819
+ ("sam2_vision_model", "Sam2VisionModel"),
820
+ ("sam_hq", "SAM-HQ"),
821
+ ("sam_hq_vision_model", "SamHQVisionModel"),
822
+ ("sam_vision_model", "SamVisionModel"),
823
+ ("seamless_m4t", "SeamlessM4T"),
824
+ ("seamless_m4t_v2", "SeamlessM4Tv2"),
825
+ ("seed_oss", "SeedOss"),
826
+ ("segformer", "SegFormer"),
827
+ ("seggpt", "SegGPT"),
828
+ ("sew", "SEW"),
829
+ ("sew-d", "SEW-D"),
830
+ ("shieldgemma2", "Shieldgemma2"),
831
+ ("siglip", "SigLIP"),
832
+ ("siglip2", "SigLIP2"),
833
+ ("siglip2_vision_model", "Siglip2VisionModel"),
834
+ ("siglip_vision_model", "SiglipVisionModel"),
835
+ ("smollm3", "SmolLM3"),
836
+ ("smolvlm", "SmolVLM"),
837
+ ("smolvlm_vision", "SmolVLMVisionTransformer"),
838
+ ("speech-encoder-decoder", "Speech Encoder decoder"),
839
+ ("speech_to_text", "Speech2Text"),
840
+ ("speech_to_text_2", "Speech2Text2"),
841
+ ("speecht5", "SpeechT5"),
842
+ ("splinter", "Splinter"),
843
+ ("squeezebert", "SqueezeBERT"),
844
+ ("stablelm", "StableLm"),
845
+ ("starcoder2", "Starcoder2"),
846
+ ("superglue", "SuperGlue"),
847
+ ("superpoint", "SuperPoint"),
848
+ ("swiftformer", "SwiftFormer"),
849
+ ("swin", "Swin Transformer"),
850
+ ("swin2sr", "Swin2SR"),
851
+ ("swinv2", "Swin Transformer V2"),
852
+ ("switch_transformers", "SwitchTransformers"),
853
+ ("t5", "T5"),
854
+ ("t5gemma", "T5Gemma"),
855
+ ("t5v1.1", "T5v1.1"),
856
+ ("table-transformer", "Table Transformer"),
857
+ ("tapas", "TAPAS"),
858
+ ("tapex", "TAPEX"),
859
+ ("textnet", "TextNet"),
860
+ ("time_series_transformer", "Time Series Transformer"),
861
+ ("timesfm", "TimesFm"),
862
+ ("timesformer", "TimeSformer"),
863
+ ("timm_backbone", "TimmBackbone"),
864
+ ("timm_wrapper", "TimmWrapperModel"),
865
+ ("trajectory_transformer", "Trajectory Transformer"),
866
+ ("transfo-xl", "Transformer-XL"),
867
+ ("trocr", "TrOCR"),
868
+ ("tvlt", "TVLT"),
869
+ ("tvp", "TVP"),
870
+ ("udop", "UDOP"),
871
+ ("ul2", "UL2"),
872
+ ("umt5", "UMT5"),
873
+ ("unispeech", "UniSpeech"),
874
+ ("unispeech-sat", "UniSpeechSat"),
875
+ ("univnet", "UnivNet"),
876
+ ("upernet", "UPerNet"),
877
+ ("van", "VAN"),
878
+ ("vaultgemma", "VaultGemma"),
879
+ ("video_llava", "VideoLlava"),
880
+ ("videomae", "VideoMAE"),
881
+ ("vilt", "ViLT"),
882
+ ("vipllava", "VipLlava"),
883
+ ("vision-encoder-decoder", "Vision Encoder decoder"),
884
+ ("vision-text-dual-encoder", "VisionTextDualEncoder"),
885
+ ("visual_bert", "VisualBERT"),
886
+ ("vit", "ViT"),
887
+ ("vit_hybrid", "ViT Hybrid"),
888
+ ("vit_mae", "ViTMAE"),
889
+ ("vit_msn", "ViTMSN"),
890
+ ("vitdet", "VitDet"),
891
+ ("vitmatte", "ViTMatte"),
892
+ ("vitpose", "ViTPose"),
893
+ ("vitpose_backbone", "ViTPoseBackbone"),
894
+ ("vits", "VITS"),
895
+ ("vivit", "ViViT"),
896
+ ("vjepa2", "VJEPA2Model"),
897
+ ("voxtral", "Voxtral"),
898
+ ("voxtral_encoder", "Voxtral Encoder"),
899
+ ("wav2vec2", "Wav2Vec2"),
900
+ ("wav2vec2-bert", "Wav2Vec2-BERT"),
901
+ ("wav2vec2-conformer", "Wav2Vec2-Conformer"),
902
+ ("wav2vec2_phoneme", "Wav2Vec2Phoneme"),
903
+ ("wavlm", "WavLM"),
904
+ ("whisper", "Whisper"),
905
+ ("xclip", "X-CLIP"),
906
+ ("xcodec", "X-CODEC"),
907
+ ("xglm", "XGLM"),
908
+ ("xlm", "XLM"),
909
+ ("xlm-prophetnet", "XLM-ProphetNet"),
910
+ ("xlm-roberta", "XLM-RoBERTa"),
911
+ ("xlm-roberta-xl", "XLM-RoBERTa-XL"),
912
+ ("xlm-v", "XLM-V"),
913
+ ("xlnet", "XLNet"),
914
+ ("xls_r", "XLS-R"),
915
+ ("xlsr_wav2vec2", "XLSR-Wav2Vec2"),
916
+ ("xlstm", "xLSTM"),
917
+ ("xmod", "X-MOD"),
918
+ ("yolos", "YOLOS"),
919
+ ("yoso", "YOSO"),
920
+ ("zamba", "Zamba"),
921
+ ("zamba2", "Zamba2"),
922
+ ("zoedepth", "ZoeDepth"),
923
+ ]
924
+ )
925
+
926
+ # This is tied to the processing `-` -> `_` in `model_type_to_module_name`. For example, instead of putting
927
+ # `transfo-xl` (as in `CONFIG_MAPPING_NAMES`), we should use `transfo_xl`.
928
+ DEPRECATED_MODELS = [
929
+ "bort",
930
+ "deta",
931
+ "efficientformer",
932
+ "ernie_m",
933
+ "gptsan_japanese",
934
+ "graphormer",
935
+ "jukebox",
936
+ "mctct",
937
+ "mega",
938
+ "mmbt",
939
+ "nat",
940
+ "nezha",
941
+ "open_llama",
942
+ "qdqbert",
943
+ "realm",
944
+ "retribert",
945
+ "speech_to_text_2",
946
+ "tapex",
947
+ "trajectory_transformer",
948
+ "transfo_xl",
949
+ "tvlt",
950
+ "van",
951
+ "vit_hybrid",
952
+ "xlm_prophetnet",
953
+ ]
954
+
955
+ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict[str, str](
956
+ [
957
+ ("openai-gpt", "openai"),
958
+ ("data2vec-audio", "data2vec"),
959
+ ("data2vec-text", "data2vec"),
960
+ ("data2vec-vision", "data2vec"),
961
+ ("donut-swin", "donut"),
962
+ ("kosmos-2", "kosmos2"),
963
+ ("kosmos-2.5", "kosmos2_5"),
964
+ ("maskformer-swin", "maskformer"),
965
+ ("xclip", "x_clip"),
966
+ ("clip_vision_model", "clip"),
967
+ ("qwen2_audio_encoder", "qwen2_audio"),
968
+ ("voxtral_encoder", "voxtral"),
969
+ ("clip_text_model", "clip"),
970
+ ("aria_text", "aria"),
971
+ ("gemma3_text", "gemma3"),
972
+ ("gemma3n_audio", "gemma3n"),
973
+ ("gemma3n_text", "gemma3n"),
974
+ ("gemma3n_vision", "gemma3n"),
975
+ ("glm4v_text", "glm4v"),
976
+ ("glm4v_moe_text", "glm4v_moe"),
977
+ ("idefics3_vision", "idefics3"),
978
+ ("siglip_vision_model", "siglip"),
979
+ ("siglip2_vision_model", "siglip2"),
980
+ ("aimv2_vision_model", "aimv2"),
981
+ ("smolvlm_vision", "smolvlm"),
982
+ ("chinese_clip_vision_model", "chinese_clip"),
983
+ ("rt_detr_resnet", "rt_detr"),
984
+ ("granitevision", "llava_next"),
985
+ ("internvl_vision", "internvl"),
986
+ ("qwen2_5_vl_text", "qwen2_5_vl"),
987
+ ("qwen2_vl_text", "qwen2_vl"),
988
+ ("qwen3_vl_text", "qwen3_vl"),
989
+ ("qwen3_vl_moe_text", "qwen3_vl_moe"),
990
+ ("sam_vision_model", "sam"),
991
+ ("sam2_vision_model", "sam2"),
992
+ ("edgetam_vision_model", "edgetam"),
993
+ ("sam2_hiera_det_model", "sam2"),
994
+ ("sam_hq_vision_model", "sam_hq"),
995
+ ("llama4_text", "llama4"),
996
+ ("blip_2_qformer", "blip_2"),
997
+ ("fastspeech2_conformer_with_hifigan", "fastspeech2_conformer"),
998
+ ("perception_encoder", "perception_lm"),
999
+ ("parakeet_encoder", "parakeet"),
1000
+ ("parakeet_ctc", "parakeet"),
1001
+ ]
1002
+ )
1003
+
1004
+
1005
+ def model_type_to_module_name(key) -> str:
1006
+ """Converts a config key to the corresponding module."""
1007
+ # Special treatment
1008
+ if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME:
1009
+ key = SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key]
1010
+
1011
+ if key in DEPRECATED_MODELS:
1012
+ key = f"deprecated.{key}"
1013
+ return key
1014
+
1015
+ key = key.replace("-", "_")
1016
+ if key in DEPRECATED_MODELS:
1017
+ key = f"deprecated.{key}"
1018
+
1019
+ return key
1020
+
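A quick sanity check of the resolution order above (special-case table first, then dash-to-underscore, then the `deprecated.` prefix); a minimal sketch assuming a standard transformers install:

```python
# Sketch: how model_type_to_module_name resolves module names. Inputs are
# entries from the mappings defined above.
from transformers.models.auto.configuration_auto import model_type_to_module_name

print(model_type_to_module_name("data2vec-text"))  # "data2vec" (special-case table)
print(model_type_to_module_name("transfo-xl"))     # "deprecated.transfo_xl" (dash -> underscore, then deprecated prefix)
print(model_type_to_module_name("llama"))          # "llama" (pass-through)
```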
1021
+
1022
+ def config_class_to_model_type(config) -> Union[str, None]:
1023
+ """Converts a config class name to the corresponding model type"""
1024
+ for key, cls in CONFIG_MAPPING_NAMES.items():
1025
+ if cls == config:
1026
+ return key
1027
+ # if key not found check in extra content
1028
+ for key, cls in CONFIG_MAPPING._extra_content.items():
1029
+ if cls.__name__ == config:
1030
+ return key
1031
+ return None
1032
+
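The reverse lookup above matches on class *names* (strings), so it can be exercised without instantiating anything; the sketch below assumes the standard `"bert" -> "BertConfig"` entry:

```python
# Sketch of the reverse lookup; assumes the standard "bert" -> "BertConfig" entry.
from transformers.models.auto.configuration_auto import config_class_to_model_type

print(config_class_to_model_type("BertConfig"))    # "bert"
print(config_class_to_model_type("NoSuchConfig"))  # None
```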
1033
+
1034
+ class _LazyConfigMapping(OrderedDict[str, type[PretrainedConfig]]):
1035
+ """
1036
+ A dictionary that lazily loads its values when they are requested.
1037
+ """
1038
+
1039
+ def __init__(self, mapping) -> None:
1040
+ self._mapping = mapping
1041
+ self._extra_content = {}
1042
+ self._modules = {}
1043
+
1044
+ def __getitem__(self, key: str) -> type[PretrainedConfig]:
1045
+ if key in self._extra_content:
1046
+ return self._extra_content[key]
1047
+ if key not in self._mapping:
1048
+ raise KeyError(key)
1049
+ value = self._mapping[key]
1050
+ module_name = model_type_to_module_name(key)
1051
+ if module_name not in self._modules:
1052
+ self._modules[module_name] = importlib.import_module(f".{module_name}", "transformers.models")
1053
+ if hasattr(self._modules[module_name], value):
1054
+ return getattr(self._modules[module_name], value)
1055
+
1056
+ # Some of the mappings have entries model_type -> config of another model type. In that case we try to grab the
1057
+ # object at the top level.
1058
+ transformers_module = importlib.import_module("transformers")
1059
+ return getattr(transformers_module, value)
1060
+
1061
+ def keys(self) -> list[str]:
1062
+ return list(self._mapping.keys()) + list(self._extra_content.keys())
1063
+
1064
+ def values(self) -> list[type[PretrainedConfig]]:
1065
+ return [self[k] for k in self._mapping] + list(self._extra_content.values())
1066
+
1067
+ def items(self) -> list[tuple[str, type[PretrainedConfig]]]:
1068
+ return [(k, self[k]) for k in self._mapping] + list(self._extra_content.items())
1069
+
1070
+ def __iter__(self) -> Iterator[str]:
1071
+ return iter(list(self._mapping.keys()) + list(self._extra_content.keys()))
1072
+
1073
+ def __contains__(self, item: object) -> bool:
1074
+ return item in self._mapping or item in self._extra_content
1075
+
1076
+ def register(self, key: str, value: type[PretrainedConfig], exist_ok=False) -> None:
1077
+ """
1078
+ Register a new configuration in this mapping.
1079
+ """
1080
+ if key in self._mapping and not exist_ok:
1081
+ raise ValueError(f"'{key}' is already used by a Transformers config, pick another name.")
1082
+ self._extra_content[key] = value
1083
+
1084
+
1085
+ CONFIG_MAPPING = _LazyConfigMapping(CONFIG_MAPPING_NAMES)
1086
+
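`CONFIG_MAPPING` resolves entries on demand: the model module is imported only when its key is first accessed. A minimal sketch, assuming a standard install:

```python
# First access to a key triggers the lazy import of transformers.models.bert.
from transformers.models.auto.configuration_auto import CONFIG_MAPPING

config_cls = CONFIG_MAPPING["bert"]
print(config_cls.__name__)  # "BertConfig"
```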
1087
+
1088
+ class _LazyLoadAllMappings(OrderedDict[str, str]):
1089
+ """
1090
+ A mapping that will load all key/value pairs at the first access (either by indexing, requesting keys, values,
1091
+ etc.)
1092
+
1093
+ Args:
1094
+ mapping: The mapping to load.
1095
+ """
1096
+
1097
+ def __init__(self, mapping):
1098
+ self._mapping = mapping
1099
+ self._initialized = False
1100
+ self._data = {}
1101
+
1102
+ def _initialize(self):
1103
+ if self._initialized:
1104
+ return
1105
+
1106
+ for model_type, map_name in self._mapping.items():
1107
+ module_name = model_type_to_module_name(model_type)
1108
+ module = importlib.import_module(f".{module_name}", "transformers.models")
1109
+ mapping = getattr(module, map_name)
1110
+ self._data.update(mapping)
1111
+
1112
+ self._initialized = True
1113
+
1114
+ def __getitem__(self, key):
1115
+ self._initialize()
1116
+ return self._data[key]
1117
+
1118
+ def keys(self) -> KeysView[str]:
1119
+ self._initialize()
1120
+ return self._data.keys()
1121
+
1122
+ def values(self) -> ValuesView[str]:
1123
+ self._initialize()
1124
+ return self._data.values()
1125
+
1126
+ def items(self):
1127
+ self._initialize()
1128
+ return self._data.items()
1129
+
1130
+ def __iter__(self) -> Iterator[str]:
1131
+ self._initialize()
1132
+ return iter(self._data)
1133
+
1134
+ def __contains__(self, item: object) -> bool:
1135
+ self._initialize()
1136
+ return item in self._data
1137
+
1138
+
1139
+ def _get_class_name(model_class: Union[str, list[str]]):
1140
+ if isinstance(model_class, (list, tuple)):
1141
+ return " or ".join([f"[`{c}`]" for c in model_class if c is not None])
1142
+ return f"[`{model_class}`]"
1143
+
1144
+
1145
+ def _list_model_options(indent, config_to_class=None, use_model_types=True):
1146
+ if config_to_class is None and not use_model_types:
1147
+ raise ValueError("Using `use_model_types=False` requires a `config_to_class` dictionary.")
1148
+ if use_model_types:
1149
+ if config_to_class is None:
1150
+ model_type_to_name = {model_type: f"[`{config}`]" for model_type, config in CONFIG_MAPPING_NAMES.items()}
1151
+ else:
1152
+ model_type_to_name = {
1153
+ model_type: _get_class_name(model_class)
1154
+ for model_type, model_class in config_to_class.items()
1155
+ if model_type in MODEL_NAMES_MAPPING
1156
+ }
1157
+ lines = [
1158
+ f"{indent}- **{model_type}** -- {model_type_to_name[model_type]} ({MODEL_NAMES_MAPPING[model_type]} model)"
1159
+ for model_type in sorted(model_type_to_name.keys())
1160
+ ]
1161
+ else:
1162
+ config_to_name = {
1163
+ CONFIG_MAPPING_NAMES[config]: _get_class_name(clas)
1164
+ for config, clas in config_to_class.items()
1165
+ if config in CONFIG_MAPPING_NAMES
1166
+ }
1167
+ config_to_model_name = {
1168
+ config: MODEL_NAMES_MAPPING[model_type] for model_type, config in CONFIG_MAPPING_NAMES.items()
1169
+ }
1170
+ lines = [
1171
+ f"{indent}- [`{config_name}`] configuration class:"
1172
+ f" {config_to_name[config_name]} ({config_to_model_name[config_name]} model)"
1173
+ for config_name in sorted(config_to_name.keys())
1174
+ ]
1175
+ return "\n".join(lines)
1176
+
1177
+
1178
+ def replace_list_option_in_docstrings(
1179
+ config_to_class=None, use_model_types: bool = True
1180
+ ) -> Callable[[_CallableT], _CallableT]:
1181
+ def docstring_decorator(fn):
1182
+ docstrings = fn.__doc__
1183
+ if docstrings is None:
1184
+ # Happens e.g. when Python runs with -OO, which strips docstrings.
1185
+ return fn
1186
+ lines = docstrings.split("\n")
1187
+ i = 0
1188
+ while i < len(lines) and re.search(r"^(\s*)List options\s*$", lines[i]) is None:
1189
+ i += 1
1190
+ if i < len(lines):
1191
+ indent = re.search(r"^(\s*)List options\s*$", lines[i]).groups()[0]
1192
+ if use_model_types:
1193
+ indent = f"{indent} "
1194
+ lines[i] = _list_model_options(indent, config_to_class=config_to_class, use_model_types=use_model_types)
1195
+ docstrings = "\n".join(lines)
1196
+ else:
1197
+ raise ValueError(
1198
+ f"The function {fn} should have an empty 'List options' in its docstring as placeholder, current"
1199
+ f" docstring is:\n{docstrings}"
1200
+ )
1201
+ fn.__doc__ = docstrings
1202
+ return fn
1203
+
1204
+ return docstring_decorator
1205
+
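To make the decorator's contract concrete, here is a hypothetical function it could wrap; the bare `List options` line is the placeholder that gets expanded (a function without it raises `ValueError`):

```python
# Hypothetical example; `describe` is illustrative, not part of the library.
from transformers.models.auto.configuration_auto import replace_list_option_in_docstrings

@replace_list_option_in_docstrings()
def describe(model_type):
    """Describe a model.

    List options
    """

# The "List options" line is now one bullet per model type, e.g.
# "    - **albert** -- [`AlbertConfig`] (ALBERT model)".
print("**albert**" in describe.__doc__)  # True
```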
1206
+
1207
+ class AutoConfig:
1208
+ r"""
1209
+ This is a generic configuration class that will be instantiated as one of the configuration classes of the library
1210
+ when created with the [`~AutoConfig.from_pretrained`] class method.
1211
+
1212
+ This class cannot be instantiated directly using `__init__()` (throws an error).
1213
+ """
1214
+
1215
+ def __init__(self) -> None:
1216
+ raise OSError(
1217
+ "AutoConfig is designed to be instantiated "
1218
+ "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method."
1219
+ )
1220
+
1221
+ @classmethod
1222
+ def for_model(cls, model_type: str, *args, **kwargs) -> PretrainedConfig:
1223
+ if model_type in CONFIG_MAPPING:
1224
+ config_class = CONFIG_MAPPING[model_type]
1225
+ return config_class(*args, **kwargs)
1226
+ raise ValueError(
1227
+ f"Unrecognized model identifier: {model_type}. Should contain one of {', '.join(CONFIG_MAPPING.keys())}"
1228
+ )
1229
+
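`for_model` is the direct path when the model type is already known; positional and keyword arguments are forwarded to the config constructor:

```python
# Standard usage of AutoConfig.for_model (no download involved).
from transformers import AutoConfig

config = AutoConfig.for_model("bert", hidden_size=256)
print(type(config).__name__, config.hidden_size)  # BertConfig 256
```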
1230
+ @classmethod
1231
+ @replace_list_option_in_docstrings()
1232
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike[str]], **kwargs):
1233
+ r"""
1234
+ Instantiate one of the configuration classes of the library from a pretrained model configuration.
1235
+
1236
+ The configuration class to instantiate is selected based on the `model_type` property of the config object that
1237
+ is loaded, or when it's missing, by falling back to using pattern matching on `pretrained_model_name_or_path`:
1238
+
1239
+ List options
1240
+
1241
+ Args:
1242
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
1243
+ Can be either:
1244
+
1245
+ - A string, the *model id* of a pretrained model configuration hosted inside a model repo on
1246
+ huggingface.co.
1247
+ - A path to a *directory* containing a configuration file saved using the
1248
+ [`~PretrainedConfig.save_pretrained`] method, or the [`~PreTrainedModel.save_pretrained`] method,
1249
+ e.g., `./my_model_directory/`.
1250
+ - A path or url to a saved configuration JSON *file*, e.g.,
1251
+ `./my_model_directory/configuration.json`.
1252
+ cache_dir (`str` or `os.PathLike`, *optional*):
1253
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the
1254
+ standard cache should not be used.
1255
+ force_download (`bool`, *optional*, defaults to `False`):
1256
+ Whether or not to force the (re-)download of the model weights and configuration files and override the
1257
+ cached versions if they exist.
1258
+ resume_download:
1259
+ Deprecated and ignored. All downloads are now resumed by default when possible.
1260
+ Will be removed in v5 of Transformers.
1261
+ proxies (`dict[str, str]`, *optional*):
1262
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
1263
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
1264
+ revision (`str`, *optional*, defaults to `"main"`):
1265
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
1266
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
1267
+ identifier allowed by git.
1268
+ return_unused_kwargs (`bool`, *optional*, defaults to `False`):
1269
+ If `False`, then this function returns just the final configuration object.
1270
+
1271
+ If `True`, then this function returns a `Tuple(config, unused_kwargs)` where *unused_kwargs* is a
1272
+ dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., the
1273
+ part of `kwargs` which has not been used to update `config` and is otherwise ignored.
1274
+ trust_remote_code (`bool`, *optional*, defaults to `False`):
1275
+ Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
1276
+ should only be set to `True` for repositories you trust and in which you have read the code, as it will
1277
+ execute code present on the Hub on your local machine.
1278
+ kwargs (additional keyword arguments, *optional*):
1279
+ The values in kwargs of any keys which are configuration attributes will be used to override the loaded
1280
+ values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
1281
+ by the `return_unused_kwargs` keyword parameter.
1282
+
1283
+ Examples:
1284
+
1285
+ ```python
1286
+ >>> from transformers import AutoConfig
1287
+
1288
+ >>> # Download configuration from huggingface.co and cache.
1289
+ >>> config = AutoConfig.from_pretrained("google-bert/bert-base-uncased")
1290
+
1291
+ >>> # Download configuration from huggingface.co (user-uploaded) and cache.
1292
+ >>> config = AutoConfig.from_pretrained("dbmdz/bert-base-german-cased")
1293
+
1294
+ >>> # If configuration file is in a directory (e.g., was saved using *save_pretrained('./test/saved_model/')*).
1295
+ >>> config = AutoConfig.from_pretrained("./test/bert_saved_model/")
1296
+
1297
+ >>> # Load a specific configuration file.
1298
+ >>> config = AutoConfig.from_pretrained("./test/bert_saved_model/my_configuration.json")
1299
+
1300
+ >>> # Change some config attributes when loading a pretrained config.
1301
+ >>> config = AutoConfig.from_pretrained("google-bert/bert-base-uncased", output_attentions=True, foo=False)
1302
+ >>> config.output_attentions
1303
+ True
1304
+
1305
+ >>> config, unused_kwargs = AutoConfig.from_pretrained(
1306
+ ... "google-bert/bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
1307
+ ... )
1308
+ >>> config.output_attentions
1309
+ True
1310
+
1311
+ >>> unused_kwargs
1312
+ {'foo': False}
1313
+ ```
1314
+ """
1315
+ use_auth_token = kwargs.pop("use_auth_token", None)
1316
+ if use_auth_token is not None:
1317
+ warnings.warn(
1318
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
1319
+ FutureWarning,
1320
+ )
1321
+ if kwargs.get("token") is not None:
1322
+ raise ValueError(
1323
+ "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
1324
+ )
1325
+ kwargs["token"] = use_auth_token
1326
+
1327
+ kwargs["_from_auto"] = True
1328
+ kwargs["name_or_path"] = pretrained_model_name_or_path
1329
+ trust_remote_code = kwargs.pop("trust_remote_code", None)
1330
+ code_revision = kwargs.pop("code_revision", None)
1331
+
1332
+ config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
1333
+ has_remote_code = "auto_map" in config_dict and "AutoConfig" in config_dict["auto_map"]
1334
+ has_local_code = "model_type" in config_dict and config_dict["model_type"] in CONFIG_MAPPING
1335
+ if has_remote_code:
1336
+ class_ref = config_dict["auto_map"]["AutoConfig"]
1337
+ if "--" in class_ref:
1338
+ upstream_repo = class_ref.split("--")[0]
1339
+ else:
1340
+ upstream_repo = None
1341
+ trust_remote_code = resolve_trust_remote_code(
1342
+ trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code, upstream_repo
1343
+ )
1344
+
1345
+ if has_remote_code and trust_remote_code:
1346
+ config_class = get_class_from_dynamic_module(
1347
+ class_ref, pretrained_model_name_or_path, code_revision=code_revision, **kwargs
1348
+ )
1349
+ config_class.register_for_auto_class()
1350
+ return config_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
1351
+ elif "model_type" in config_dict:
1352
+ # Apply heuristic: if model_type is mistral but layer_types is present, treat as ministral
1353
+ if config_dict["model_type"] == "mistral" and "layer_types" in config_dict:
1354
+ logger.info(
1355
+ "Detected mistral model with layer_types, treating as ministral for alternating attention compatibility. "
1356
+ )
1357
+ config_dict["model_type"] = "ministral"
1358
+
1359
+ try:
1360
+ config_class = CONFIG_MAPPING[config_dict["model_type"]]
1361
+ except KeyError:
1362
+ raise ValueError(
1363
+ f"The checkpoint you are trying to load has model type `{config_dict['model_type']}` "
1364
+ "but Transformers does not recognize this architecture. This could be because of an "
1365
+ "issue with the checkpoint, or because your version of Transformers is out of date.\n\n"
1366
+ "You can update Transformers with the command `pip install --upgrade transformers`. If this "
1367
+ "does not work, and the checkpoint is very new, then there may not be a release version "
1368
+ "that supports this model yet. In this case, you can get the most up-to-date code by installing "
1369
+ "Transformers from source with the command "
1370
+ "`pip install git+https://github.com/huggingface/transformers.git`"
1371
+ )
1372
+ return config_class.from_dict(config_dict, **unused_kwargs)
1373
+ else:
1374
+ # Fallback: use pattern matching on the string.
1375
+ # We go from longer names to shorter names to catch roberta before bert (for instance)
1376
+ for pattern in sorted(CONFIG_MAPPING.keys(), key=len, reverse=True):
1377
+ if pattern in str(pretrained_model_name_or_path):
1378
+ return CONFIG_MAPPING[pattern].from_dict(config_dict, **unused_kwargs)
1379
+
1380
+ raise ValueError(
1381
+ f"Unrecognized model in {pretrained_model_name_or_path}. "
1382
+ f"Should have a `model_type` key in its {CONFIG_NAME}, or contain one of the following strings "
1383
+ f"in its name: {', '.join(CONFIG_MAPPING.keys())}"
1384
+ )
1385
+
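One branch of the method above worth calling out: when a repo ships its own `AutoConfig` entry under `auto_map`, the remote class is fetched and executed only once `trust_remote_code` resolves to `True`. A sketch with an illustrative (not real) repo id:

```python
# The repo id below is hypothetical. trust_remote_code executes the repo's own
# Python, so only enable it for repositories you have reviewed.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("some-org/custom-model", trust_remote_code=True)
```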
1386
+ @staticmethod
1387
+ def register(model_type, config, exist_ok=False) -> None:
1388
+ """
1389
+ Register a new configuration for this class.
1390
+
1391
+ Args:
1392
+ model_type (`str`): The model type like "bert" or "gpt".
1393
+ config ([`PretrainedConfig`]): The config to register.
1394
+ """
1395
+ if issubclass(config, PretrainedConfig) and config.model_type != model_type:
1396
+ raise ValueError(
1397
+ "The config you are passing has a `model_type` attribute that is not consistent with the model type "
1398
+ f"you passed (config has {config.model_type} and you passed {model_type}. Fix one of those so they "
1399
+ "match!"
1400
+ )
1401
+ CONFIG_MAPPING.register(model_type, config, exist_ok=exist_ok)
1402
+
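Registration wires a new model type into `CONFIG_MAPPING` so the auto classes can resolve it; a minimal sketch with hypothetical names:

```python
# MyConfig and "my-model" are hypothetical names for illustration.
from transformers import AutoConfig, PretrainedConfig

class MyConfig(PretrainedConfig):
    model_type = "my-model"

AutoConfig.register("my-model", MyConfig)
print(AutoConfig.for_model("my-model").model_type)  # "my-model"
```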
1403
+
1404
+ __all__ = ["CONFIG_MAPPING", "MODEL_NAMES_MAPPING", "AutoConfig"]
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/feature_extraction_auto.py ADDED
@@ -0,0 +1,422 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """AutoFeatureExtractor class."""
16
+
17
+ import importlib
18
+ import json
19
+ import os
20
+ import warnings
21
+ from collections import OrderedDict
22
+ from typing import Optional, Union
23
+
24
+ # Build the list of all feature extractors
25
+ from ...configuration_utils import PretrainedConfig
26
+ from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
27
+ from ...feature_extraction_utils import FeatureExtractionMixin
28
+ from ...utils import CONFIG_NAME, FEATURE_EXTRACTOR_NAME, cached_file, logging
29
+ from .auto_factory import _LazyAutoMapping
30
+ from .configuration_auto import (
31
+ CONFIG_MAPPING_NAMES,
32
+ AutoConfig,
33
+ model_type_to_module_name,
34
+ replace_list_option_in_docstrings,
35
+ )
36
+
37
+
38
+ logger = logging.get_logger(__name__)
39
+
40
+ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
41
+ [
42
+ ("audio-spectrogram-transformer", "ASTFeatureExtractor"),
43
+ ("beit", "BeitFeatureExtractor"),
44
+ ("chinese_clip", "ChineseCLIPFeatureExtractor"),
45
+ ("clap", "ClapFeatureExtractor"),
46
+ ("clip", "CLIPFeatureExtractor"),
47
+ ("clipseg", "ViTFeatureExtractor"),
48
+ ("clvp", "ClvpFeatureExtractor"),
49
+ ("conditional_detr", "ConditionalDetrFeatureExtractor"),
50
+ ("convnext", "ConvNextFeatureExtractor"),
51
+ ("cvt", "ConvNextFeatureExtractor"),
52
+ ("dac", "DacFeatureExtractor"),
53
+ ("data2vec-audio", "Wav2Vec2FeatureExtractor"),
54
+ ("data2vec-vision", "BeitFeatureExtractor"),
55
+ ("deformable_detr", "DeformableDetrFeatureExtractor"),
56
+ ("deit", "DeiTFeatureExtractor"),
57
+ ("detr", "DetrFeatureExtractor"),
58
+ ("dia", "DiaFeatureExtractor"),
59
+ ("dinat", "ViTFeatureExtractor"),
60
+ ("donut-swin", "DonutFeatureExtractor"),
61
+ ("dpt", "DPTFeatureExtractor"),
62
+ ("encodec", "EncodecFeatureExtractor"),
63
+ ("flava", "FlavaFeatureExtractor"),
64
+ ("gemma3n", "Gemma3nAudioFeatureExtractor"),
65
+ ("glpn", "GLPNFeatureExtractor"),
66
+ ("granite_speech", "GraniteSpeechFeatureExtractor"),
67
+ ("groupvit", "CLIPFeatureExtractor"),
68
+ ("hubert", "Wav2Vec2FeatureExtractor"),
69
+ ("imagegpt", "ImageGPTFeatureExtractor"),
70
+ ("kyutai_speech_to_text", "KyutaiSpeechToTextFeatureExtractor"),
71
+ ("layoutlmv2", "LayoutLMv2FeatureExtractor"),
72
+ ("layoutlmv3", "LayoutLMv3FeatureExtractor"),
73
+ ("levit", "LevitFeatureExtractor"),
74
+ ("maskformer", "MaskFormerFeatureExtractor"),
75
+ ("mctct", "MCTCTFeatureExtractor"),
76
+ ("mimi", "EncodecFeatureExtractor"),
77
+ ("mobilenet_v1", "MobileNetV1FeatureExtractor"),
78
+ ("mobilenet_v2", "MobileNetV2FeatureExtractor"),
79
+ ("mobilevit", "MobileViTFeatureExtractor"),
80
+ ("moonshine", "Wav2Vec2FeatureExtractor"),
81
+ ("moshi", "EncodecFeatureExtractor"),
82
+ ("nat", "ViTFeatureExtractor"),
83
+ ("owlvit", "OwlViTFeatureExtractor"),
84
+ ("parakeet_ctc", "ParakeetFeatureExtractor"),
85
+ ("parakeet_encoder", "ParakeetFeatureExtractor"),
86
+ ("perceiver", "PerceiverFeatureExtractor"),
87
+ ("phi4_multimodal", "Phi4MultimodalFeatureExtractor"),
88
+ ("poolformer", "PoolFormerFeatureExtractor"),
89
+ ("pop2piano", "Pop2PianoFeatureExtractor"),
90
+ ("regnet", "ConvNextFeatureExtractor"),
91
+ ("resnet", "ConvNextFeatureExtractor"),
92
+ ("seamless_m4t", "SeamlessM4TFeatureExtractor"),
93
+ ("seamless_m4t_v2", "SeamlessM4TFeatureExtractor"),
94
+ ("segformer", "SegformerFeatureExtractor"),
95
+ ("sew", "Wav2Vec2FeatureExtractor"),
96
+ ("sew-d", "Wav2Vec2FeatureExtractor"),
97
+ ("speech_to_text", "Speech2TextFeatureExtractor"),
98
+ ("speecht5", "SpeechT5FeatureExtractor"),
99
+ ("swiftformer", "ViTFeatureExtractor"),
100
+ ("swin", "ViTFeatureExtractor"),
101
+ ("swinv2", "ViTFeatureExtractor"),
102
+ ("table-transformer", "DetrFeatureExtractor"),
103
+ ("timesformer", "VideoMAEFeatureExtractor"),
104
+ ("tvlt", "TvltFeatureExtractor"),
105
+ ("unispeech", "Wav2Vec2FeatureExtractor"),
106
+ ("unispeech-sat", "Wav2Vec2FeatureExtractor"),
107
+ ("univnet", "UnivNetFeatureExtractor"),
108
+ ("van", "ConvNextFeatureExtractor"),
109
+ ("videomae", "VideoMAEFeatureExtractor"),
110
+ ("vilt", "ViltFeatureExtractor"),
111
+ ("vit", "ViTFeatureExtractor"),
112
+ ("vit_mae", "ViTFeatureExtractor"),
113
+ ("vit_msn", "ViTFeatureExtractor"),
114
+ ("wav2vec2", "Wav2Vec2FeatureExtractor"),
115
+ ("wav2vec2-bert", "Wav2Vec2FeatureExtractor"),
116
+ ("wav2vec2-conformer", "Wav2Vec2FeatureExtractor"),
117
+ ("wavlm", "Wav2Vec2FeatureExtractor"),
118
+ ("whisper", "WhisperFeatureExtractor"),
119
+ ("xclip", "CLIPFeatureExtractor"),
120
+ ("xcodec", "DacFeatureExtractor"),
121
+ ("yolos", "YolosFeatureExtractor"),
122
+ ]
123
+ )
124
+
125
+ FEATURE_EXTRACTOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FEATURE_EXTRACTOR_MAPPING_NAMES)
126
+
127
+
128
+ def feature_extractor_class_from_name(class_name: str):
129
+ for module_name, extractors in FEATURE_EXTRACTOR_MAPPING_NAMES.items():
130
+ if class_name in extractors:
131
+ module_name = model_type_to_module_name(module_name)
132
+
133
+ module = importlib.import_module(f".{module_name}", "transformers.models")
134
+ try:
135
+ return getattr(module, class_name)
136
+ except AttributeError:
137
+ continue
138
+
139
+ for extractor in FEATURE_EXTRACTOR_MAPPING._extra_content.values():
140
+ if getattr(extractor, "__name__", None) == class_name:
141
+ return extractor
142
+
143
+ # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
144
+ # init and we return the proper dummy to get an appropriate error message.
145
+ main_module = importlib.import_module("transformers")
146
+ if hasattr(main_module, class_name):
147
+ return getattr(main_module, class_name)
148
+
149
+ return None
150
+
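The resolver above matches by class name against the lazy mapping, then falls back to the top-level package (which exposes dummy objects when a backend dependency is missing):

```python
# Sketch of the name-based lookup, assuming a standard install.
from transformers.models.auto.feature_extraction_auto import feature_extractor_class_from_name

print(feature_extractor_class_from_name("Wav2Vec2FeatureExtractor"))  # the class (or a dummy if deps are missing)
print(feature_extractor_class_from_name("NoSuchExtractor"))           # None
```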
151
+
152
+ def get_feature_extractor_config(
153
+ pretrained_model_name_or_path: Union[str, os.PathLike],
154
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
155
+ force_download: bool = False,
156
+ resume_download: Optional[bool] = None,
157
+ proxies: Optional[dict[str, str]] = None,
158
+ token: Optional[Union[bool, str]] = None,
159
+ revision: Optional[str] = None,
160
+ local_files_only: bool = False,
161
+ **kwargs,
162
+ ):
163
+ """
164
+ Loads the feature extractor configuration of a pretrained model.
165
+
166
+ Args:
167
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
168
+ This can be either:
169
+
170
+ - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
171
+ huggingface.co.
172
+ - a path to a *directory* containing a configuration file saved using the
173
+ [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] method, e.g., `./my_model_directory/`.
174
+
175
+ cache_dir (`str` or `os.PathLike`, *optional*):
176
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
177
+ cache should not be used.
178
+ force_download (`bool`, *optional*, defaults to `False`):
179
+ Whether or not to force the (re-)download of the configuration files and override the cached versions if they
180
+ exist.
181
+ resume_download:
182
+ Deprecated and ignored. All downloads are now resumed by default when possible.
183
+ Will be removed in v5 of Transformers.
184
+ proxies (`dict[str, str]`, *optional*):
185
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
186
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
187
+ token (`str` or *bool*, *optional*):
188
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
189
+ when running `hf auth login` (stored in `~/.huggingface`).
190
+ revision (`str`, *optional*, defaults to `"main"`):
191
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
192
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
193
+ identifier allowed by git.
194
+ local_files_only (`bool`, *optional*, defaults to `False`):
195
+ If `True`, will only try to load the feature extractor configuration from local files.
196
+
197
+ <Tip>
198
+
199
+ Passing `token=True` is required when you want to use a private model.
200
+
201
+ </Tip>
202
+
203
+ Returns:
204
+ `Dict`: The configuration of the feature extractor.
205
+
206
+ Examples:
207
+
208
+ ```python
209
+ # Download configuration from huggingface.co and cache.
210
+ feature_extractor_config = get_feature_extractor_config("facebook/wav2vec2-base-960h")
211
+ # This model does not have a feature extractor config so the result will be an empty dict.
212
+ feature_extractor_config = get_feature_extractor_config("google-bert/bert-base-uncased")
213
+
214
+ # Save a pretrained feature extractor locally and you can reload its config
215
+ from transformers import AutoFeatureExtractor
216
+
217
+ feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
218
+ feature_extractor.save_pretrained("feature-extractor-test")
219
+ feature_extractor_config = get_feature_extractor_config("feature-extractor-test")
220
+ ```"""
221
+ use_auth_token = kwargs.pop("use_auth_token", None)
222
+ if use_auth_token is not None:
223
+ warnings.warn(
224
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
225
+ FutureWarning,
226
+ )
227
+ if token is not None:
228
+ raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
229
+ token = use_auth_token
230
+
231
+ resolved_config_file = cached_file(
232
+ pretrained_model_name_or_path,
233
+ FEATURE_EXTRACTOR_NAME,
234
+ cache_dir=cache_dir,
235
+ force_download=force_download,
236
+ resume_download=resume_download,
237
+ proxies=proxies,
238
+ token=token,
239
+ revision=revision,
240
+ local_files_only=local_files_only,
241
+ _raise_exceptions_for_gated_repo=False,
242
+ _raise_exceptions_for_missing_entries=False,
243
+ _raise_exceptions_for_connection_errors=False,
244
+ )
245
+ if resolved_config_file is None:
246
+ logger.info(
247
+ "Could not locate the feature extractor configuration file, will try to use the model config instead."
248
+ )
249
+ return {}
250
+
251
+ with open(resolved_config_file, encoding="utf-8") as reader:
252
+ return json.load(reader)
253
+
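`get_feature_extractor_config` is the lightweight alternative when only the raw JSON is needed, without instantiating a feature extractor:

```python
# Fetch preprocessor_config.json as a plain dict (network access assumed).
from transformers.models.auto.feature_extraction_auto import get_feature_extractor_config

fe_config = get_feature_extractor_config("facebook/wav2vec2-base-960h")
print(fe_config.get("feature_extractor_type"))  # "Wav2Vec2FeatureExtractor"
```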
254
+
255
+ class AutoFeatureExtractor:
256
+ r"""
257
+ This is a generic feature extractor class that will be instantiated as one of the feature extractor classes of the
258
+ library when created with the [`AutoFeatureExtractor.from_pretrained`] class method.
259
+
260
+ This class cannot be instantiated directly using `__init__()` (throws an error).
261
+ """
262
+
263
+ def __init__(self):
264
+ raise OSError(
265
+ "AutoFeatureExtractor is designed to be instantiated "
266
+ "using the `AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path)` method."
267
+ )
268
+
269
+ @classmethod
270
+ @replace_list_option_in_docstrings(FEATURE_EXTRACTOR_MAPPING_NAMES)
271
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
272
+ r"""
273
+ Instantiate one of the feature extractor classes of the library from a pretrained model vocabulary.
274
+
275
+ The feature extractor class to instantiate is selected based on the `model_type` property of the config object
276
+ (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's
277
+ missing, by falling back to using pattern matching on `pretrained_model_name_or_path`:
278
+
279
+ List options
280
+
281
+ Params:
282
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
283
+ This can be either:
284
+
285
+ - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
286
+ huggingface.co.
287
+ - a path to a *directory* containing a feature extractor file saved using the
288
+ [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] method, e.g.,
289
+ `./my_model_directory/`.
290
+ - a path or url to a saved feature extractor JSON *file*, e.g.,
291
+ `./my_model_directory/preprocessor_config.json`.
292
+ cache_dir (`str` or `os.PathLike`, *optional*):
293
+ Path to a directory in which a downloaded pretrained model feature extractor should be cached if the
294
+ standard cache should not be used.
295
+ force_download (`bool`, *optional*, defaults to `False`):
296
+ Whether or not to force the (re-)download of the feature extractor files and override the cached versions
297
+ if they exist.
298
+ resume_download:
299
+ Deprecated and ignored. All downloads are now resumed by default when possible.
300
+ Will be removed in v5 of Transformers.
301
+ proxies (`dict[str, str]`, *optional*):
302
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
303
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
304
+ token (`str` or *bool*, *optional*):
305
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
306
+ when running `hf auth login` (stored in `~/.huggingface`).
307
+ revision (`str`, *optional*, defaults to `"main"`):
308
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
309
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
310
+ identifier allowed by git.
311
+ return_unused_kwargs (`bool`, *optional*, defaults to `False`):
312
+ If `False`, then this function returns just the final feature extractor object. If `True`, then this
313
+ function returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a dictionary
314
+ consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the part of
315
+ `kwargs` which has not been used to update `feature_extractor` and is otherwise ignored.
316
+ trust_remote_code (`bool`, *optional*, defaults to `False`):
317
+ Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
318
+ should only be set to `True` for repositories you trust and in which you have read the code, as it will
319
+ execute code present on the Hub on your local machine.
320
+ kwargs (`dict[str, Any]`, *optional*):
321
+ The values in kwargs of any keys which are feature extractor attributes will be used to override the
322
+ loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
323
+ controlled by the `return_unused_kwargs` keyword parameter.
324
+
325
+ <Tip>
326
+
327
+ Passing `token=True` is required when you want to use a private model.
328
+
329
+ </Tip>
330
+
331
+ Examples:
332
+
333
+ ```python
334
+ >>> from transformers import AutoFeatureExtractor
335
+
336
+ >>> # Download feature extractor from huggingface.co and cache.
337
+ >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
338
+
339
+ >>> # If feature extractor files are in a directory (e.g. feature extractor was saved using *save_pretrained('./test/saved_model/')*)
340
+ >>> # feature_extractor = AutoFeatureExtractor.from_pretrained("./test/saved_model/")
341
+ ```"""
342
+ use_auth_token = kwargs.pop("use_auth_token", None)
343
+ if use_auth_token is not None:
344
+ warnings.warn(
345
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
346
+ FutureWarning,
347
+ )
348
+ if kwargs.get("token") is not None:
349
+ raise ValueError(
350
+ "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
351
+ )
352
+ kwargs["token"] = use_auth_token
353
+
354
+ config = kwargs.pop("config", None)
355
+ trust_remote_code = kwargs.pop("trust_remote_code", None)
356
+ kwargs["_from_auto"] = True
357
+
358
+ config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs)
359
+ feature_extractor_class = config_dict.get("feature_extractor_type", None)
360
+ feature_extractor_auto_map = None
361
+ if "AutoFeatureExtractor" in config_dict.get("auto_map", {}):
362
+ feature_extractor_auto_map = config_dict["auto_map"]["AutoFeatureExtractor"]
363
+
364
+ # If we don't find the feature extractor class in the feature extractor config, let's try the model config.
365
+ if feature_extractor_class is None and feature_extractor_auto_map is None:
366
+ if not isinstance(config, PretrainedConfig):
367
+ config = AutoConfig.from_pretrained(
368
+ pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
369
+ )
370
+ # It could be in `config.feature_extractor_type``
371
+ feature_extractor_class = getattr(config, "feature_extractor_type", None)
372
+ if hasattr(config, "auto_map") and "AutoFeatureExtractor" in config.auto_map:
373
+ feature_extractor_auto_map = config.auto_map["AutoFeatureExtractor"]
374
+
375
+ if feature_extractor_class is not None:
376
+ feature_extractor_class = feature_extractor_class_from_name(feature_extractor_class)
377
+
378
+ has_remote_code = feature_extractor_auto_map is not None
379
+ has_local_code = feature_extractor_class is not None or type(config) in FEATURE_EXTRACTOR_MAPPING
380
+ if has_remote_code:
381
+ if "--" in feature_extractor_auto_map:
382
+ upstream_repo = feature_extractor_auto_map.split("--")[0]
383
+ else:
384
+ upstream_repo = None
385
+ trust_remote_code = resolve_trust_remote_code(
386
+ trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code, upstream_repo
387
+ )
388
+
389
+ if has_remote_code and trust_remote_code:
390
+ feature_extractor_class = get_class_from_dynamic_module(
391
+ feature_extractor_auto_map, pretrained_model_name_or_path, **kwargs
392
+ )
393
+ _ = kwargs.pop("code_revision", None)
394
+ feature_extractor_class.register_for_auto_class()
395
+ return feature_extractor_class.from_dict(config_dict, **kwargs)
396
+ elif feature_extractor_class is not None:
397
+ return feature_extractor_class.from_dict(config_dict, **kwargs)
398
+ # Last try: we use the FEATURE_EXTRACTOR_MAPPING.
399
+ elif type(config) in FEATURE_EXTRACTOR_MAPPING:
400
+ feature_extractor_class = FEATURE_EXTRACTOR_MAPPING[type(config)]
401
+ return feature_extractor_class.from_dict(config_dict, **kwargs)
402
+
403
+ raise ValueError(
404
+ f"Unrecognized feature extractor in {pretrained_model_name_or_path}. Should have a "
405
+ f"`feature_extractor_type` key in its {FEATURE_EXTRACTOR_NAME} of {CONFIG_NAME}, or one of the following "
406
+ f"`model_type` keys in its {CONFIG_NAME}: {', '.join(c for c in FEATURE_EXTRACTOR_MAPPING_NAMES)}"
407
+ )
408
+
409
+ @staticmethod
410
+ def register(config_class, feature_extractor_class, exist_ok=False):
411
+ """
412
+ Register a new feature extractor for this class.
413
+
414
+ Args:
415
+ config_class ([`PretrainedConfig`]):
416
+ The configuration corresponding to the model to register.
417
+ feature_extractor_class ([`FeatureExtractorMixin`]): The feature extractor to register.
418
+ """
419
+ FEATURE_EXTRACTOR_MAPPING.register(config_class, feature_extractor_class, exist_ok=exist_ok)
420
+
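Mirroring `AutoConfig.register`, the hook above lets third-party code plug a custom extractor into the auto machinery; every name below is hypothetical:

```python
# Minimal sketch; MyConfig / MyFeatureExtractor / "my-audio-model" are hypothetical.
from transformers import AutoConfig, AutoFeatureExtractor, PretrainedConfig, SequenceFeatureExtractor

class MyConfig(PretrainedConfig):
    model_type = "my-audio-model"

class MyFeatureExtractor(SequenceFeatureExtractor):
    def __init__(self, feature_size=1, sampling_rate=16000, padding_value=0.0, **kwargs):
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)

AutoConfig.register("my-audio-model", MyConfig)
AutoFeatureExtractor.register(MyConfig, MyFeatureExtractor)
# AutoFeatureExtractor can now resolve checkpoints whose config is MyConfig.
```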
421
+
422
+ __all__ = ["FEATURE_EXTRACTOR_MAPPING", "AutoFeatureExtractor"]
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/image_processing_auto.py ADDED
@@ -0,0 +1,688 @@
+ # coding=utf-8
+ # Copyright 2022 The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """AutoImageProcessor class."""
+
+ import importlib
+ import json
+ import os
+ import warnings
+ from collections import OrderedDict
+ from typing import TYPE_CHECKING, Optional, Union
+
+ # Build the list of all image processors
+ from ...configuration_utils import PretrainedConfig
+ from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
+ from ...image_processing_utils import ImageProcessingMixin
+ from ...image_processing_utils_fast import BaseImageProcessorFast
+ from ...utils import (
+     CONFIG_NAME,
+     IMAGE_PROCESSOR_NAME,
+     cached_file,
+     is_timm_config_dict,
+     is_timm_local_checkpoint,
+     is_torchvision_available,
+     is_vision_available,
+     logging,
+ )
+ from ...utils.import_utils import requires
+ from .auto_factory import _LazyAutoMapping
+ from .configuration_auto import (
+     CONFIG_MAPPING_NAMES,
+     AutoConfig,
+     model_type_to_module_name,
+     replace_list_option_in_docstrings,
+ )
+
+
+ logger = logging.get_logger(__name__)
+
+
+ FORCE_FAST_IMAGE_PROCESSOR = ["Qwen2VLImageProcessor"]
+
+
+ if TYPE_CHECKING:
+     # This significantly improves completion suggestion performance when
+     # the transformers package is used with Microsoft's Pylance language server.
+     IMAGE_PROCESSOR_MAPPING_NAMES: OrderedDict[str, tuple[Optional[str], Optional[str]]] = OrderedDict()
+ else:
+     IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
+         [
+             ("aimv2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
+             ("aimv2_vision_model", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
+             ("align", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")),
+             ("aria", ("AriaImageProcessor", None)),
+             ("beit", ("BeitImageProcessor", "BeitImageProcessorFast")),
+             ("bit", ("BitImageProcessor", "BitImageProcessorFast")),
+             ("blip", ("BlipImageProcessor", "BlipImageProcessorFast")),
+             ("blip-2", ("BlipImageProcessor", "BlipImageProcessorFast")),
+             ("bridgetower", ("BridgeTowerImageProcessor", "BridgeTowerImageProcessorFast")),
+             ("chameleon", ("ChameleonImageProcessor", "ChameleonImageProcessorFast")),
+             ("chinese_clip", ("ChineseCLIPImageProcessor", "ChineseCLIPImageProcessorFast")),
+             ("clip", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
+             ("clipseg", ("ViTImageProcessor", "ViTImageProcessorFast")),
+             ("cohere2_vision", (None, "Cohere2VisionImageProcessorFast")),
+             ("conditional_detr", ("ConditionalDetrImageProcessor", "ConditionalDetrImageProcessorFast")),
+             ("convnext", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
+             ("convnextv2", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
+             ("cvt", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
+             ("data2vec-vision", ("BeitImageProcessor", "BeitImageProcessorFast")),
+             ("deepseek_vl", ("DeepseekVLImageProcessor", "DeepseekVLImageProcessorFast")),
+             ("deepseek_vl_hybrid", ("DeepseekVLHybridImageProcessor", "DeepseekVLHybridImageProcessorFast")),
+             ("deformable_detr", ("DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast")),
+             ("deit", ("DeiTImageProcessor", "DeiTImageProcessorFast")),
+             ("depth_anything", ("DPTImageProcessor", "DPTImageProcessorFast")),
+             ("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")),
+             ("deta", ("DetaImageProcessor", None)),
+             ("detr", ("DetrImageProcessor", "DetrImageProcessorFast")),
+             ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")),
+             ("dinov2", ("BitImageProcessor", "BitImageProcessorFast")),
+             ("dinov3_vit", (None, "DINOv3ViTImageProcessorFast")),
+             ("donut-swin", ("DonutImageProcessor", "DonutImageProcessorFast")),
+             ("dpt", ("DPTImageProcessor", "DPTImageProcessorFast")),
+             ("edgetam", (None, "Sam2ImageProcessorFast")),
+             ("efficientformer", ("EfficientFormerImageProcessor", None)),
+             ("efficientloftr", ("EfficientLoFTRImageProcessor", "EfficientLoFTRImageProcessorFast")),
+             ("efficientnet", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")),
+             ("eomt", ("EomtImageProcessor", "EomtImageProcessorFast")),
+             ("flava", ("FlavaImageProcessor", "FlavaImageProcessorFast")),
+             ("focalnet", ("BitImageProcessor", "BitImageProcessorFast")),
+             ("fuyu", ("FuyuImageProcessor", None)),
+             ("gemma3", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")),
+             ("gemma3n", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
+             ("git", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
+             ("glm4v", ("Glm4vImageProcessor", "Glm4vImageProcessorFast")),
+             ("glpn", ("GLPNImageProcessor", None)),
+             ("got_ocr2", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),
+             ("grounding-dino", ("GroundingDinoImageProcessor", "GroundingDinoImageProcessorFast")),
+             ("groupvit", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
+             ("hiera", ("BitImageProcessor", "BitImageProcessorFast")),
+             ("idefics", ("IdeficsImageProcessor", None)),
+             ("idefics2", ("Idefics2ImageProcessor", "Idefics2ImageProcessorFast")),
+             ("idefics3", ("Idefics3ImageProcessor", "Idefics3ImageProcessorFast")),
+             ("ijepa", ("ViTImageProcessor", "ViTImageProcessorFast")),
+             ("imagegpt", ("ImageGPTImageProcessor", "ImageGPTImageProcessorFast")),
+             ("instructblip", ("BlipImageProcessor", "BlipImageProcessorFast")),
+             ("instructblipvideo", ("InstructBlipVideoImageProcessor", None)),
+             ("janus", ("JanusImageProcessor", "JanusImageProcessorFast")),
+             ("kosmos-2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
+             ("kosmos-2.5", ("Kosmos2_5ImageProcessor", "Kosmos2_5ImageProcessorFast")),
+             ("layoutlmv2", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")),
+             ("layoutlmv3", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")),
+             ("levit", ("LevitImageProcessor", "LevitImageProcessorFast")),
+             ("lfm2_vl", (None, "Lfm2VlImageProcessorFast")),
+             ("lightglue", ("LightGlueImageProcessor", None)),
+             ("llama4", ("Llama4ImageProcessor", "Llama4ImageProcessorFast")),
+             ("llava", ("LlavaImageProcessor", "LlavaImageProcessorFast")),
+             ("llava_next", ("LlavaNextImageProcessor", "LlavaNextImageProcessorFast")),
+             ("llava_next_video", ("LlavaNextVideoImageProcessor", None)),
+             ("llava_onevision", ("LlavaOnevisionImageProcessor", "LlavaOnevisionImageProcessorFast")),
+             ("mask2former", ("Mask2FormerImageProcessor", "Mask2FormerImageProcessorFast")),
+             ("maskformer", ("MaskFormerImageProcessor", "MaskFormerImageProcessorFast")),
+             ("metaclip_2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
+             ("mgp-str", ("ViTImageProcessor", "ViTImageProcessorFast")),
+             ("mistral3", ("PixtralImageProcessor", "PixtralImageProcessorFast")),
+             ("mlcd", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
+             ("mllama", ("MllamaImageProcessor", None)),
+             ("mm-grounding-dino", ("GroundingDinoImageProcessor", "GroundingDinoImageProcessorFast")),
+             ("mobilenet_v1", ("MobileNetV1ImageProcessor", "MobileNetV1ImageProcessorFast")),
+             ("mobilenet_v2", ("MobileNetV2ImageProcessor", "MobileNetV2ImageProcessorFast")),
+             ("mobilevit", ("MobileViTImageProcessor", "MobileViTImageProcessorFast")),
+             ("mobilevitv2", ("MobileViTImageProcessor", "MobileViTImageProcessorFast")),
+             ("nat", ("ViTImageProcessor", "ViTImageProcessorFast")),
+             ("nougat", ("NougatImageProcessor", "NougatImageProcessorFast")),
+             ("oneformer", ("OneFormerImageProcessor", "OneFormerImageProcessorFast")),
+             ("ovis2", ("Ovis2ImageProcessor", "Ovis2ImageProcessorFast")),
+             ("owlv2", ("Owlv2ImageProcessor", "Owlv2ImageProcessorFast")),
+             ("owlvit", ("OwlViTImageProcessor", "OwlViTImageProcessorFast")),
+             ("paligemma", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
+             ("perceiver", ("PerceiverImageProcessor", "PerceiverImageProcessorFast")),
+             ("perception_lm", (None, "PerceptionLMImageProcessorFast")),
+             ("phi4_multimodal", (None, "Phi4MultimodalImageProcessorFast")),
+             ("pix2struct", ("Pix2StructImageProcessor", None)),
+             ("pixtral", ("PixtralImageProcessor", "PixtralImageProcessorFast")),
+             ("poolformer", ("PoolFormerImageProcessor", "PoolFormerImageProcessorFast")),
+             ("prompt_depth_anything", ("PromptDepthAnythingImageProcessor", "PromptDepthAnythingImageProcessorFast")),
+             ("pvt", ("PvtImageProcessor", "PvtImageProcessorFast")),
+             ("pvt_v2", ("PvtImageProcessor", "PvtImageProcessorFast")),
+             ("qwen2_5_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
+             ("qwen2_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
+             ("qwen3_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
+             ("regnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
+             ("resnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
+             ("rt_detr", ("RTDetrImageProcessor", "RTDetrImageProcessorFast")),
+             ("sam", ("SamImageProcessor", "SamImageProcessorFast")),
+             ("sam2", (None, "Sam2ImageProcessorFast")),
+             ("sam_hq", ("SamImageProcessor", "SamImageProcessorFast")),
+             ("segformer", ("SegformerImageProcessor", "SegformerImageProcessorFast")),
+             ("seggpt", ("SegGptImageProcessor", None)),
+             ("shieldgemma2", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")),
+             ("siglip", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
+             ("siglip2", ("Siglip2ImageProcessor", "Siglip2ImageProcessorFast")),
+             ("smolvlm", ("SmolVLMImageProcessor", "SmolVLMImageProcessorFast")),
+             ("superglue", ("SuperGlueImageProcessor", None)),
+             ("superpoint", ("SuperPointImageProcessor", "SuperPointImageProcessorFast")),
+             ("swiftformer", ("ViTImageProcessor", "ViTImageProcessorFast")),
+             ("swin", ("ViTImageProcessor", "ViTImageProcessorFast")),
+             ("swin2sr", ("Swin2SRImageProcessor", "Swin2SRImageProcessorFast")),
+             ("swinv2", ("ViTImageProcessor", "ViTImageProcessorFast")),
+             ("table-transformer", ("DetrImageProcessor", "DetrImageProcessorFast")),
+             ("textnet", ("TextNetImageProcessor", "TextNetImageProcessorFast")),
+             ("timesformer", ("VideoMAEImageProcessor", None)),
+             ("timm_wrapper", ("TimmWrapperImageProcessor", None)),
+             ("tvlt", ("TvltImageProcessor", None)),
+             ("tvp", ("TvpImageProcessor", "TvpImageProcessorFast")),
+             ("udop", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")),
+             ("upernet", ("SegformerImageProcessor", "SegformerImageProcessorFast")),
+             ("van", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
+             ("videomae", ("VideoMAEImageProcessor", None)),
+             ("vilt", ("ViltImageProcessor", "ViltImageProcessorFast")),
+             ("vipllava", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
+             ("vit", ("ViTImageProcessor", "ViTImageProcessorFast")),
+             ("vit_hybrid", ("ViTHybridImageProcessor", None)),
+             ("vit_mae", ("ViTImageProcessor", "ViTImageProcessorFast")),
+             ("vit_msn", ("ViTImageProcessor", "ViTImageProcessorFast")),
+             ("vitmatte", ("VitMatteImageProcessor", "VitMatteImageProcessorFast")),
+             ("xclip", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
+             ("yolos", ("YolosImageProcessor", "YolosImageProcessorFast")),
+             ("zoedepth", ("ZoeDepthImageProcessor", "ZoeDepthImageProcessorFast")),
+         ]
+     )
+
+ # Override to None if the packages are not available
+ for model_type, (slow_class, fast_class) in IMAGE_PROCESSOR_MAPPING_NAMES.items():
+     if not is_vision_available():
+         slow_class = None
+     if not is_torchvision_available():
+         fast_class = None
+
+     IMAGE_PROCESSOR_MAPPING_NAMES[model_type] = (slow_class, fast_class)
+
+ IMAGE_PROCESSOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, IMAGE_PROCESSOR_MAPPING_NAMES)
+
+
+ def get_image_processor_class_from_name(class_name: str):
+     if class_name == "BaseImageProcessorFast":
+         return BaseImageProcessorFast
+
+     for module_name, extractors in IMAGE_PROCESSOR_MAPPING_NAMES.items():
+         if class_name in extractors:
+             module_name = model_type_to_module_name(module_name)
+
+             module = importlib.import_module(f".{module_name}", "transformers.models")
+             try:
+                 return getattr(module, class_name)
+             except AttributeError:
+                 continue
+
+     for extractors in IMAGE_PROCESSOR_MAPPING._extra_content.values():
+         for extractor in extractors:
+             if getattr(extractor, "__name__", None) == class_name:
+                 return extractor
+
+     # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
+     # init and we return the proper dummy to get an appropriate error message.
+     main_module = importlib.import_module("transformers")
+     if hasattr(main_module, class_name):
+         return getattr(main_module, class_name)
+
+     return None
+
+
+ def get_image_processor_config(
+     pretrained_model_name_or_path: Union[str, os.PathLike],
+     cache_dir: Optional[Union[str, os.PathLike]] = None,
+     force_download: bool = False,
+     resume_download: Optional[bool] = None,
+     proxies: Optional[dict[str, str]] = None,
+     token: Optional[Union[bool, str]] = None,
+     revision: Optional[str] = None,
+     local_files_only: bool = False,
+     **kwargs,
+ ):
+     """
+     Loads the image processor configuration from a pretrained model image processor configuration.
+
+     Args:
+         pretrained_model_name_or_path (`str` or `os.PathLike`):
+             This can be either:
+
+             - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
+               huggingface.co.
+             - a path to a *directory* containing a configuration file saved using the
+               [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
+
+         cache_dir (`str` or `os.PathLike`, *optional*):
+             Path to a directory in which a downloaded pretrained model configuration should be cached if the
+             standard cache should not be used.
+         force_download (`bool`, *optional*, defaults to `False`):
+             Whether or not to force to (re-)download the configuration files and override the cached versions if
+             they exist.
+         resume_download:
+             Deprecated and ignored. All downloads are now resumed by default when possible.
+             Will be removed in v5 of Transformers.
+         proxies (`dict[str, str]`, *optional*):
+             A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+             'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
+         token (`str` or *bool*, *optional*):
+             The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+             when running `hf auth login` (stored in `~/.huggingface`).
+         revision (`str`, *optional*, defaults to `"main"`):
+             The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+             git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+             identifier allowed by git.
+         local_files_only (`bool`, *optional*, defaults to `False`):
+             If `True`, will only try to load the image processor configuration from local files.
+
+     <Tip>
+
+     Passing `token=True` is required when you want to use a private model.
+
+     </Tip>
+
+     Returns:
+         `Dict`: The configuration of the image processor.
+
+     Examples:
+
+     ```python
+     # Download configuration from huggingface.co and cache.
+     image_processor_config = get_image_processor_config("google-bert/bert-base-uncased")
+     # This model does not have an image processor config so the result will be an empty dict.
+     image_processor_config = get_image_processor_config("FacebookAI/xlm-roberta-base")
+
+     # Save a pretrained image processor locally and you can reload its config
+     from transformers import AutoImageProcessor
+
+     image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
+     image_processor.save_pretrained("image-processor-test")
+     image_processor_config = get_image_processor_config("image-processor-test")
+     ```"""
+     use_auth_token = kwargs.pop("use_auth_token", None)
+     if use_auth_token is not None:
+         warnings.warn(
+             "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
+             FutureWarning,
+         )
+         if token is not None:
+             raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
+         token = use_auth_token
+
+     resolved_config_file = cached_file(
+         pretrained_model_name_or_path,
+         IMAGE_PROCESSOR_NAME,
+         cache_dir=cache_dir,
+         force_download=force_download,
+         resume_download=resume_download,
+         proxies=proxies,
+         token=token,
+         revision=revision,
+         local_files_only=local_files_only,
+         _raise_exceptions_for_gated_repo=False,
+         _raise_exceptions_for_missing_entries=False,
+         _raise_exceptions_for_connection_errors=False,
+     )
+     if resolved_config_file is None:
+         logger.info(
+             "Could not locate the image processor configuration file, will try to use the model config instead."
+         )
+         return {}
+
+     with open(resolved_config_file, encoding="utf-8") as reader:
+         return json.load(reader)
+
+
+ def _warning_fast_image_processor_available(fast_class):
+     logger.warning(
+         f"Fast image processor class {fast_class} is available for this model. "
+         "Using slow image processor class. To use the fast image processor class set `use_fast=True`."
+     )
+
+
+ @requires(backends=("vision",))
+ class AutoImageProcessor:
+     r"""
+     This is a generic image processor class that will be instantiated as one of the image processor classes of the
+     library when created with the [`AutoImageProcessor.from_pretrained`] class method.
+
+     This class cannot be instantiated directly using `__init__()` (throws an error).
+     """
+
+     def __init__(self):
+         raise OSError(
+             "AutoImageProcessor is designed to be instantiated "
+             "using the `AutoImageProcessor.from_pretrained(pretrained_model_name_or_path)` method."
+         )
+
+     @classmethod
+     @replace_list_option_in_docstrings(IMAGE_PROCESSOR_MAPPING_NAMES)
+     def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
+         r"""
+         Instantiate one of the image processor classes of the library from a pretrained model.
+
+         The image processor class to instantiate is selected based on the `model_type` property of the config object
+         (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's
+         missing, by falling back to using pattern matching on `pretrained_model_name_or_path`:
+
+         List options
+
+         Params:
+             pretrained_model_name_or_path (`str` or `os.PathLike`):
+                 This can be either:
+
+                 - a string, the *model id* of a pretrained image_processor hosted inside a model repo on
+                   huggingface.co.
+                 - a path to a *directory* containing an image processor file saved using the
+                   [`~image_processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g.,
+                   `./my_model_directory/`.
+                 - a path or url to a saved image processor JSON *file*, e.g.,
+                   `./my_model_directory/preprocessor_config.json`.
+             cache_dir (`str` or `os.PathLike`, *optional*):
+                 Path to a directory in which a downloaded pretrained model image processor should be cached if the
+                 standard cache should not be used.
+             force_download (`bool`, *optional*, defaults to `False`):
+                 Whether or not to force to (re-)download the image processor files and override the cached versions
+                 if they exist.
+             resume_download:
+                 Deprecated and ignored. All downloads are now resumed by default when possible.
+                 Will be removed in v5 of Transformers.
+             proxies (`dict[str, str]`, *optional*):
+                 A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+                 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
+             token (`str` or *bool*, *optional*):
+                 The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+                 generated when running `hf auth login` (stored in `~/.huggingface`).
+             revision (`str`, *optional*, defaults to `"main"`):
+                 The specific model version to use. It can be a branch name, a tag name, or a commit id, since we
+                 use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can
+                 be any identifier allowed by git.
+             use_fast (`bool`, *optional*, defaults to `False`):
+                 Use a fast torchvision-based image processor if it is supported for a given model.
+                 If a fast image processor is not available for a given model, a normal numpy-based image processor
+                 is returned instead.
+             return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                 If `False`, then this function returns just the final image processor object. If `True`, then this
+                 function returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
+                 consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of
+                 `kwargs` which has not been used to update `image_processor` and is otherwise ignored.
+             trust_remote_code (`bool`, *optional*, defaults to `False`):
+                 Whether or not to allow for custom models defined on the Hub in their own modeling files. This
+                 option should only be set to `True` for repositories you trust and in which you have read the code,
+                 as it will execute code present on the Hub on your local machine.
+             image_processor_filename (`str`, *optional*, defaults to `"config.json"`):
+                 The name of the file in the model directory to use for the image processor config.
+             kwargs (`dict[str, Any]`, *optional*):
+                 The values in kwargs of any keys which are image processor attributes will be used to override the
+                 loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes
+                 is controlled by the `return_unused_kwargs` keyword parameter.
+
+         <Tip>
+
+         Passing `token=True` is required when you want to use a private model.
+
+         </Tip>
+
+         Examples:
+
+         ```python
+         >>> from transformers import AutoImageProcessor
+
+         >>> # Download image processor from huggingface.co and cache.
+         >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
+
+         >>> # If image processor files are in a directory (e.g. image processor was saved using *save_pretrained('./test/saved_model/')*)
+         >>> # image_processor = AutoImageProcessor.from_pretrained("./test/saved_model/")
+         ```"""
+         use_auth_token = kwargs.pop("use_auth_token", None)
+         if use_auth_token is not None:
+             warnings.warn(
+                 "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
+                 FutureWarning,
+             )
+             if kwargs.get("token") is not None:
+                 raise ValueError(
+                     "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
+                 )
+             kwargs["token"] = use_auth_token
+
+         config = kwargs.pop("config", None)
+         # TODO: @yoni, change in v4.48 (use_fast set to True by default)
+         use_fast = kwargs.pop("use_fast", None)
+         trust_remote_code = kwargs.pop("trust_remote_code", None)
+         kwargs["_from_auto"] = True
+
+         # Resolve the image processor config filename
+         if "image_processor_filename" in kwargs:
+             image_processor_filename = kwargs.pop("image_processor_filename")
+         elif is_timm_local_checkpoint(pretrained_model_name_or_path):
+             image_processor_filename = CONFIG_NAME
+         else:
+             image_processor_filename = IMAGE_PROCESSOR_NAME
+
+         # Load the image processor config
+         try:
+             # Main path for all transformers models and local TimmWrapper checkpoints
+             config_dict, _ = ImageProcessingMixin.get_image_processor_dict(
+                 pretrained_model_name_or_path, image_processor_filename=image_processor_filename, **kwargs
+             )
+         except Exception as initial_exception:
+             # Fallback path for Hub TimmWrapper checkpoints. Timm models' image processing is saved in `config.json`
+             # instead of `preprocessor_config.json`. Because this is an Auto class and we don't have any information
+             # except the model name, the only way to check if a remote checkpoint is a timm model is to try to
+             # load `config.json` and if it fails with some error, we raise the initial exception.
+             try:
+                 config_dict, _ = ImageProcessingMixin.get_image_processor_dict(
+                     pretrained_model_name_or_path, image_processor_filename=CONFIG_NAME, **kwargs
+                 )
+             except Exception:
+                 raise initial_exception
+
+             # In case we have a config_dict, but it's not a timm config dict, we raise the initial exception,
+             # because only timm models have image processing in `config.json`.
+             if not is_timm_config_dict(config_dict):
+                 raise initial_exception
+
+         image_processor_type = config_dict.get("image_processor_type", None)
+         image_processor_auto_map = None
+         if "AutoImageProcessor" in config_dict.get("auto_map", {}):
+             image_processor_auto_map = config_dict["auto_map"]["AutoImageProcessor"]
+
+         # If we still don't have the image processor class, check if we're loading from a previous feature extractor config
+         # and if so, infer the image processor class from there.
+         if image_processor_type is None and image_processor_auto_map is None:
+             feature_extractor_class = config_dict.pop("feature_extractor_type", None)
+             if feature_extractor_class is not None:
+                 image_processor_type = feature_extractor_class.replace("FeatureExtractor", "ImageProcessor")
+             if "AutoFeatureExtractor" in config_dict.get("auto_map", {}):
+                 feature_extractor_auto_map = config_dict["auto_map"]["AutoFeatureExtractor"]
+                 image_processor_auto_map = feature_extractor_auto_map.replace("FeatureExtractor", "ImageProcessor")
+
+         # If we don't find the image processor class in the image processor config, let's try the model config.
+         if image_processor_type is None and image_processor_auto_map is None:
+             if not isinstance(config, PretrainedConfig):
+                 config = AutoConfig.from_pretrained(
+                     pretrained_model_name_or_path,
+                     trust_remote_code=trust_remote_code,
+                     **kwargs,
+                 )
+             # It could be in `config.image_processor_type`
+             image_processor_type = getattr(config, "image_processor_type", None)
+             if hasattr(config, "auto_map") and "AutoImageProcessor" in config.auto_map:
+                 image_processor_auto_map = config.auto_map["AutoImageProcessor"]
+
+         image_processor_class = None
+         # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default)
+         if image_processor_type is not None:
+             # If use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor.
+             if use_fast is None:
+                 use_fast = image_processor_type.endswith("Fast")
+                 if not use_fast and image_processor_type in FORCE_FAST_IMAGE_PROCESSOR and is_torchvision_available():
+                     use_fast = True
+                     logger.warning_once(
+                         f"The image processor of type `{image_processor_type}` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. "
+                         "This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. "
+                         "Note that this behavior will be extended to all models in a future release."
+                     )
+                 if not use_fast:
+                     logger.warning_once(
+                         "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
+                         "`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. "
+                         "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`."
+                     )
+             if use_fast and not image_processor_type.endswith("Fast"):
+                 image_processor_type += "Fast"
+             if use_fast and not is_torchvision_available():
+                 # check if there is a slow image processor class to fall back to
+                 image_processor_class = get_image_processor_class_from_name(image_processor_type[:-4])
+                 if image_processor_class is None:
+                     raise ValueError(
+                         f"`{image_processor_type}` requires `torchvision` to be installed. Please install `torchvision` and try again."
+                     )
+                 logger.warning_once(
+                     "Using `use_fast=True` but `torchvision` is not available. Falling back to the slow image processor."
+                 )
+                 use_fast = False
+             if use_fast:
+                 for image_processors in IMAGE_PROCESSOR_MAPPING_NAMES.values():
+                     if image_processor_type in image_processors:
+                         break
+                 else:
+                     image_processor_type = image_processor_type[:-4]
+                     use_fast = False
+                     logger.warning_once(
+                         "`use_fast` is set to `True` but the image processor class does not have a fast version. "
+                         "Falling back to the slow version."
+                     )
+                 image_processor_class = get_image_processor_class_from_name(image_processor_type)
+             else:
+                 image_processor_type_slow = image_processor_type.removesuffix("Fast")
+                 image_processor_class = get_image_processor_class_from_name(image_processor_type_slow)
+                 if image_processor_class is None and image_processor_type.endswith("Fast"):
+                     raise ValueError(
+                         f"`{image_processor_type}` does not have a slow version. Please set `use_fast=True` when instantiating the processor."
+                     )
+
+         has_remote_code = image_processor_auto_map is not None
+         has_local_code = image_processor_class is not None or type(config) in IMAGE_PROCESSOR_MAPPING
+         if has_remote_code:
+             if image_processor_auto_map is not None and not isinstance(image_processor_auto_map, tuple):
+                 # In some configs, only the slow image processor class is stored
+                 image_processor_auto_map = (image_processor_auto_map, None)
+             if use_fast and image_processor_auto_map[1] is not None:
+                 class_ref = image_processor_auto_map[1]
+             else:
+                 class_ref = image_processor_auto_map[0]
+             if "--" in class_ref:
+                 upstream_repo = class_ref.split("--")[0]
+             else:
+                 upstream_repo = None
+             trust_remote_code = resolve_trust_remote_code(
+                 trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code, upstream_repo
+             )
+
+         if has_remote_code and trust_remote_code:
+             if not use_fast and image_processor_auto_map[1] is not None:
+                 _warning_fast_image_processor_available(image_processor_auto_map[1])
+
+             image_processor_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
+             _ = kwargs.pop("code_revision", None)
+             image_processor_class.register_for_auto_class()
+             return image_processor_class.from_dict(config_dict, **kwargs)
+         elif image_processor_class is not None:
+             return image_processor_class.from_dict(config_dict, **kwargs)
+         # Last try: we use the IMAGE_PROCESSOR_MAPPING.
+         elif type(config) in IMAGE_PROCESSOR_MAPPING:
+             image_processor_tuple = IMAGE_PROCESSOR_MAPPING[type(config)]
+
+             image_processor_class_py, image_processor_class_fast = image_processor_tuple
+
+             if not use_fast and image_processor_class_fast is not None:
+                 _warning_fast_image_processor_available(image_processor_class_fast)
+
+             if image_processor_class_fast and (use_fast or image_processor_class_py is None):
+                 return image_processor_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+             else:
+                 if image_processor_class_py is not None:
+                     return image_processor_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+                 else:
+                     raise ValueError(
+                         "This image processor cannot be instantiated. Please make sure you have `Pillow` installed."
+                     )
+         raise ValueError(
+             f"Unrecognized image processor in {pretrained_model_name_or_path}. Should have a "
+             f"`image_processor_type` key in its {IMAGE_PROCESSOR_NAME} or {CONFIG_NAME}, or one of the following "
+             f"`model_type` keys in its {CONFIG_NAME}: {', '.join(c for c in IMAGE_PROCESSOR_MAPPING_NAMES)}"
+         )
+
+     @staticmethod
+     def register(
+         config_class,
+         image_processor_class=None,
+         slow_image_processor_class=None,
+         fast_image_processor_class=None,
+         exist_ok=False,
+     ):
+         """
+         Register a new image processor for this class.
+
+         Args:
+             config_class ([`PretrainedConfig`]):
+                 The configuration corresponding to the model to register.
+             image_processor_class ([`ImageProcessingMixin`]): The image processor to register.
+         """
+         if image_processor_class is not None:
+             if slow_image_processor_class is not None:
+                 raise ValueError("Cannot specify both image_processor_class and slow_image_processor_class")
+             warnings.warn(
+                 "The image_processor_class argument is deprecated and will be removed in v4.42. Please use `slow_image_processor_class`, or `fast_image_processor_class` instead",
+                 FutureWarning,
+             )
+             slow_image_processor_class = image_processor_class
+
+         if slow_image_processor_class is None and fast_image_processor_class is None:
+             raise ValueError("You need to specify either slow_image_processor_class or fast_image_processor_class")
+         if slow_image_processor_class is not None and issubclass(slow_image_processor_class, BaseImageProcessorFast):
+             raise ValueError("You passed a fast image processor in as the `slow_image_processor_class`.")
+         if fast_image_processor_class is not None and not issubclass(
+             fast_image_processor_class, BaseImageProcessorFast
+         ):
+             raise ValueError("The `fast_image_processor_class` should inherit from `BaseImageProcessorFast`.")
+
+         if (
+             slow_image_processor_class is not None
+             and fast_image_processor_class is not None
+             and issubclass(fast_image_processor_class, BaseImageProcessorFast)
+             and fast_image_processor_class.slow_image_processor_class != slow_image_processor_class
+         ):
+             raise ValueError(
+                 "The fast processor class you are passing has a `slow_image_processor_class` attribute that is not "
+                 "consistent with the slow processor class you passed (the fast image processor has "
+                 f"{fast_image_processor_class.slow_image_processor_class} and you passed {slow_image_processor_class}). "
+                 "Fix one of those so they match!"
+             )
+
+         # Avoid resetting a set slow/fast image processor if we are passing just the other ones.
+         if config_class in IMAGE_PROCESSOR_MAPPING._extra_content:
+             existing_slow, existing_fast = IMAGE_PROCESSOR_MAPPING[config_class]
+             if slow_image_processor_class is None:
+                 slow_image_processor_class = existing_slow
+             if fast_image_processor_class is None:
+                 fast_image_processor_class = existing_fast
+
+         IMAGE_PROCESSOR_MAPPING.register(
+             config_class, (slow_image_processor_class, fast_image_processor_class), exist_ok=exist_ok
+         )
+
+
+ __all__ = ["IMAGE_PROCESSOR_MAPPING", "AutoImageProcessor"]
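
As a minimal usage sketch of the `AutoImageProcessor` dispatch defined above (illustrative only, not part of the upstream file; it assumes network access, Pillow, and torch/torchvision for the fast path, and the checkpoint name is just an example):

```python
# Minimal sketch: dispatching to a concrete image processor via AutoImageProcessor.
# Assumes Pillow and torch/torchvision are installed and the Hub is reachable.
from PIL import Image
from transformers import AutoImageProcessor

# The checkpoint's model_type ("vit") selects ViTImageProcessorFast through
# the IMAGE_PROCESSOR_MAPPING_NAMES table defined in the file above.
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k", use_fast=True)

image = Image.new("RGB", (224, 224))  # placeholder image instead of a real photo
inputs = processor(images=image, return_tensors="pt")
print(type(processor).__name__, inputs["pixel_values"].shape)
```

If torchvision were missing, the `use_fast` fallback logic above would log a warning and return the slow numpy-based processor instead.
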
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/modeling_auto.py ADDED
The diff for this file is too large to render.
 
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/modeling_flax_auto.py ADDED
@@ -0,0 +1,413 @@
+ # coding=utf-8
+ # Copyright 2018 The Google Flax Team Authors and The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Auto Model class."""
+
+ from collections import OrderedDict
+
+ from ...utils import logging
+ from .auto_factory import _BaseAutoModelClass, _LazyAutoMapping, auto_class_update
+ from .configuration_auto import CONFIG_MAPPING_NAMES
+
+
+ logger = logging.get_logger(__name__)
+
+
+ FLAX_MODEL_MAPPING_NAMES = OrderedDict(
+     [
+         # Base model mapping
+         ("albert", "FlaxAlbertModel"),
+         ("bart", "FlaxBartModel"),
+         ("beit", "FlaxBeitModel"),
+         ("bert", "FlaxBertModel"),
+         ("big_bird", "FlaxBigBirdModel"),
+         ("blenderbot", "FlaxBlenderbotModel"),
+         ("blenderbot-small", "FlaxBlenderbotSmallModel"),
+         ("bloom", "FlaxBloomModel"),
+         ("clip", "FlaxCLIPModel"),
+         ("dinov2", "FlaxDinov2Model"),
+         ("distilbert", "FlaxDistilBertModel"),
+         ("electra", "FlaxElectraModel"),
+         ("gemma", "FlaxGemmaModel"),
+         ("gpt-sw3", "FlaxGPT2Model"),
+         ("gpt2", "FlaxGPT2Model"),
+         ("gpt_neo", "FlaxGPTNeoModel"),
+         ("gptj", "FlaxGPTJModel"),
+         ("llama", "FlaxLlamaModel"),
+         ("longt5", "FlaxLongT5Model"),
+         ("marian", "FlaxMarianModel"),
+         ("mbart", "FlaxMBartModel"),
+         ("mistral", "FlaxMistralModel"),
+         ("mt5", "FlaxMT5Model"),
+         ("opt", "FlaxOPTModel"),
+         ("pegasus", "FlaxPegasusModel"),
+         ("regnet", "FlaxRegNetModel"),
+         ("resnet", "FlaxResNetModel"),
+         ("roberta", "FlaxRobertaModel"),
+         ("roberta-prelayernorm", "FlaxRobertaPreLayerNormModel"),
+         ("roformer", "FlaxRoFormerModel"),
+         ("t5", "FlaxT5Model"),
+         ("vision-text-dual-encoder", "FlaxVisionTextDualEncoderModel"),
+         ("vit", "FlaxViTModel"),
+         ("wav2vec2", "FlaxWav2Vec2Model"),
+         ("whisper", "FlaxWhisperModel"),
+         ("xglm", "FlaxXGLMModel"),
+         ("xlm-roberta", "FlaxXLMRobertaModel"),
+     ]
+ )
+
+ FLAX_MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for pre-training mapping
+         ("albert", "FlaxAlbertForPreTraining"),
+         ("bart", "FlaxBartForConditionalGeneration"),
+         ("bert", "FlaxBertForPreTraining"),
+         ("big_bird", "FlaxBigBirdForPreTraining"),
+         ("electra", "FlaxElectraForPreTraining"),
+         ("longt5", "FlaxLongT5ForConditionalGeneration"),
+         ("mbart", "FlaxMBartForConditionalGeneration"),
+         ("mt5", "FlaxMT5ForConditionalGeneration"),
+         ("roberta", "FlaxRobertaForMaskedLM"),
+         ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForMaskedLM"),
+         ("roformer", "FlaxRoFormerForMaskedLM"),
+         ("t5", "FlaxT5ForConditionalGeneration"),
+         ("wav2vec2", "FlaxWav2Vec2ForPreTraining"),
+         ("whisper", "FlaxWhisperForConditionalGeneration"),
+         ("xlm-roberta", "FlaxXLMRobertaForMaskedLM"),
+     ]
+ )
+
+ FLAX_MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Masked LM mapping
+         ("albert", "FlaxAlbertForMaskedLM"),
+         ("bart", "FlaxBartForConditionalGeneration"),
+         ("bert", "FlaxBertForMaskedLM"),
+         ("big_bird", "FlaxBigBirdForMaskedLM"),
+         ("distilbert", "FlaxDistilBertForMaskedLM"),
+         ("electra", "FlaxElectraForMaskedLM"),
+         ("mbart", "FlaxMBartForConditionalGeneration"),
+         ("roberta", "FlaxRobertaForMaskedLM"),
+         ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForMaskedLM"),
+         ("roformer", "FlaxRoFormerForMaskedLM"),
+         ("xlm-roberta", "FlaxXLMRobertaForMaskedLM"),
+     ]
+ )
+
+ FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Seq2Seq Causal LM mapping
+         ("bart", "FlaxBartForConditionalGeneration"),
+         ("blenderbot", "FlaxBlenderbotForConditionalGeneration"),
+         ("blenderbot-small", "FlaxBlenderbotSmallForConditionalGeneration"),
+         ("encoder-decoder", "FlaxEncoderDecoderModel"),
+         ("longt5", "FlaxLongT5ForConditionalGeneration"),
+         ("marian", "FlaxMarianMTModel"),
+         ("mbart", "FlaxMBartForConditionalGeneration"),
+         ("mt5", "FlaxMT5ForConditionalGeneration"),
+         ("pegasus", "FlaxPegasusForConditionalGeneration"),
+         ("t5", "FlaxT5ForConditionalGeneration"),
+     ]
+ )
+
+ FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Image-classification
+         ("beit", "FlaxBeitForImageClassification"),
+         ("dinov2", "FlaxDinov2ForImageClassification"),
+         ("regnet", "FlaxRegNetForImageClassification"),
+         ("resnet", "FlaxResNetForImageClassification"),
+         ("vit", "FlaxViTForImageClassification"),
+     ]
+ )
+
+ FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
+     [
+         ("vision-encoder-decoder", "FlaxVisionEncoderDecoderModel"),
+     ]
+ )
+
+ FLAX_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Causal LM mapping
+         ("bart", "FlaxBartForCausalLM"),
+         ("bert", "FlaxBertForCausalLM"),
+         ("big_bird", "FlaxBigBirdForCausalLM"),
+         ("bloom", "FlaxBloomForCausalLM"),
+         ("electra", "FlaxElectraForCausalLM"),
+         ("gemma", "FlaxGemmaForCausalLM"),
+         ("gpt-sw3", "FlaxGPT2LMHeadModel"),
+         ("gpt2", "FlaxGPT2LMHeadModel"),
+         ("gpt_neo", "FlaxGPTNeoForCausalLM"),
+         ("gptj", "FlaxGPTJForCausalLM"),
+         ("llama", "FlaxLlamaForCausalLM"),
+         ("mistral", "FlaxMistralForCausalLM"),
+         ("opt", "FlaxOPTForCausalLM"),
+         ("roberta", "FlaxRobertaForCausalLM"),
+         ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForCausalLM"),
+         ("xglm", "FlaxXGLMForCausalLM"),
+         ("xlm-roberta", "FlaxXLMRobertaForCausalLM"),
+     ]
+ )
+
+ FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Sequence Classification mapping
+         ("albert", "FlaxAlbertForSequenceClassification"),
+         ("bart", "FlaxBartForSequenceClassification"),
+         ("bert", "FlaxBertForSequenceClassification"),
+         ("big_bird", "FlaxBigBirdForSequenceClassification"),
+         ("distilbert", "FlaxDistilBertForSequenceClassification"),
+         ("electra", "FlaxElectraForSequenceClassification"),
+         ("mbart", "FlaxMBartForSequenceClassification"),
+         ("roberta", "FlaxRobertaForSequenceClassification"),
+         ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForSequenceClassification"),
+         ("roformer", "FlaxRoFormerForSequenceClassification"),
+         ("xlm-roberta", "FlaxXLMRobertaForSequenceClassification"),
+     ]
+ )
+
+ FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Question Answering mapping
+         ("albert", "FlaxAlbertForQuestionAnswering"),
+         ("bart", "FlaxBartForQuestionAnswering"),
+         ("bert", "FlaxBertForQuestionAnswering"),
+         ("big_bird", "FlaxBigBirdForQuestionAnswering"),
+         ("distilbert", "FlaxDistilBertForQuestionAnswering"),
+         ("electra", "FlaxElectraForQuestionAnswering"),
+         ("mbart", "FlaxMBartForQuestionAnswering"),
+         ("roberta", "FlaxRobertaForQuestionAnswering"),
+         ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForQuestionAnswering"),
+         ("roformer", "FlaxRoFormerForQuestionAnswering"),
+         ("xlm-roberta", "FlaxXLMRobertaForQuestionAnswering"),
+     ]
+ )
+
+ FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Token Classification mapping
+         ("albert", "FlaxAlbertForTokenClassification"),
+         ("bert", "FlaxBertForTokenClassification"),
+         ("big_bird", "FlaxBigBirdForTokenClassification"),
+         ("distilbert", "FlaxDistilBertForTokenClassification"),
+         ("electra", "FlaxElectraForTokenClassification"),
+         ("roberta", "FlaxRobertaForTokenClassification"),
+         ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForTokenClassification"),
+         ("roformer", "FlaxRoFormerForTokenClassification"),
+         ("xlm-roberta", "FlaxXLMRobertaForTokenClassification"),
+     ]
+ )
+
+ FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Multiple Choice mapping
+         ("albert", "FlaxAlbertForMultipleChoice"),
+         ("bert", "FlaxBertForMultipleChoice"),
+         ("big_bird", "FlaxBigBirdForMultipleChoice"),
+         ("distilbert", "FlaxDistilBertForMultipleChoice"),
+         ("electra", "FlaxElectraForMultipleChoice"),
+         ("roberta", "FlaxRobertaForMultipleChoice"),
+         ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForMultipleChoice"),
+         ("roformer", "FlaxRoFormerForMultipleChoice"),
+         ("xlm-roberta", "FlaxXLMRobertaForMultipleChoice"),
+     ]
+ )
+
+ FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES = OrderedDict(
+     [
+         ("bert", "FlaxBertForNextSentencePrediction"),
+     ]
+ )
+
+ FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict(
+     [
+         ("speech-encoder-decoder", "FlaxSpeechEncoderDecoderModel"),
+         ("whisper", "FlaxWhisperForConditionalGeneration"),
+     ]
+ )
+
+ FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
+     [
+         ("whisper", "FlaxWhisperForAudioClassification"),
+     ]
+ )
+
+ FLAX_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_MAPPING_NAMES)
+ FLAX_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_PRETRAINING_MAPPING_NAMES)
+ FLAX_MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_MASKED_LM_MAPPING_NAMES)
+ FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
+ )
+ FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
+ )
+ FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
+ FLAX_MODEL_FOR_CAUSAL_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
+ FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
+ )
+ FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
+ )
+ FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
+ )
+ FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES
+ )
+ FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES
+ )
+ FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES
+ )
+ FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES
+ )
+
+
+ class FlaxAutoModel(_BaseAutoModelClass):
+     _model_mapping = FLAX_MODEL_MAPPING
+
+
+ FlaxAutoModel = auto_class_update(FlaxAutoModel)
+
+
+ class FlaxAutoModelForPreTraining(_BaseAutoModelClass):
+     _model_mapping = FLAX_MODEL_FOR_PRETRAINING_MAPPING
+
+
+ FlaxAutoModelForPreTraining = auto_class_update(FlaxAutoModelForPreTraining, head_doc="pretraining")
+
+
+ class FlaxAutoModelForCausalLM(_BaseAutoModelClass):
+     _model_mapping = FLAX_MODEL_FOR_CAUSAL_LM_MAPPING
+
+
+ FlaxAutoModelForCausalLM = auto_class_update(FlaxAutoModelForCausalLM, head_doc="causal language modeling")
+
+
+ class FlaxAutoModelForMaskedLM(_BaseAutoModelClass):
+     _model_mapping = FLAX_MODEL_FOR_MASKED_LM_MAPPING
+
+
+ FlaxAutoModelForMaskedLM = auto_class_update(FlaxAutoModelForMaskedLM, head_doc="masked language modeling")
+
+
+ class FlaxAutoModelForSeq2SeqLM(_BaseAutoModelClass):
+     _model_mapping = FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
+
+
+ FlaxAutoModelForSeq2SeqLM = auto_class_update(
+     FlaxAutoModelForSeq2SeqLM,
+     head_doc="sequence-to-sequence language modeling",
+     checkpoint_for_example="google-t5/t5-base",
+ )
+
+
+ class FlaxAutoModelForSequenceClassification(_BaseAutoModelClass):
+     _model_mapping = FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
+
+
+ FlaxAutoModelForSequenceClassification = auto_class_update(
+     FlaxAutoModelForSequenceClassification, head_doc="sequence classification"
+ )
+
+
+ class FlaxAutoModelForQuestionAnswering(_BaseAutoModelClass):
+     _model_mapping = FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING
+
+
+ FlaxAutoModelForQuestionAnswering = auto_class_update(FlaxAutoModelForQuestionAnswering, head_doc="question answering")
+
+
+ class FlaxAutoModelForTokenClassification(_BaseAutoModelClass):
+     _model_mapping = FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
+
+
+ FlaxAutoModelForTokenClassification = auto_class_update(
+     FlaxAutoModelForTokenClassification, head_doc="token classification"
+ )
+
+
+ class FlaxAutoModelForMultipleChoice(_BaseAutoModelClass):
+     _model_mapping = FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING
+
+
+ FlaxAutoModelForMultipleChoice = auto_class_update(FlaxAutoModelForMultipleChoice, head_doc="multiple choice")
+
+
+ class FlaxAutoModelForNextSentencePrediction(_BaseAutoModelClass):
+     _model_mapping = FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING
+
+
+ FlaxAutoModelForNextSentencePrediction = auto_class_update(
+     FlaxAutoModelForNextSentencePrediction, head_doc="next sentence prediction"
+ )
+
+
+ class FlaxAutoModelForImageClassification(_BaseAutoModelClass):
+     _model_mapping = FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
+
+
+ FlaxAutoModelForImageClassification = auto_class_update(
+     FlaxAutoModelForImageClassification, head_doc="image classification"
+ )
+
+
+ class FlaxAutoModelForVision2Seq(_BaseAutoModelClass):
+     _model_mapping = FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING
+
+
+ FlaxAutoModelForVision2Seq = auto_class_update(FlaxAutoModelForVision2Seq, head_doc="vision-to-text modeling")
+
+
+ class FlaxAutoModelForSpeechSeq2Seq(_BaseAutoModelClass):
+     _model_mapping = FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING
+
+
+ FlaxAutoModelForSpeechSeq2Seq = auto_class_update(
+     FlaxAutoModelForSpeechSeq2Seq, head_doc="sequence-to-sequence speech-to-text modeling"
+ )
+
+ __all__ = [
+     "FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
+     "FLAX_MODEL_FOR_CAUSAL_LM_MAPPING",
+     "FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
+     "FLAX_MODEL_FOR_MASKED_LM_MAPPING",
+     "FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
+     "FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
+     "FLAX_MODEL_FOR_PRETRAINING_MAPPING",
+     "FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
+     "FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
+     "FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
+     "FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
+     "FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
+     "FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING",
+     "FLAX_MODEL_MAPPING",
+     "FlaxAutoModel",
+     "FlaxAutoModelForCausalLM",
+     "FlaxAutoModelForImageClassification",
+     "FlaxAutoModelForMaskedLM",
+     "FlaxAutoModelForMultipleChoice",
+     "FlaxAutoModelForNextSentencePrediction",
+     "FlaxAutoModelForPreTraining",
+     "FlaxAutoModelForQuestionAnswering",
+     "FlaxAutoModelForSeq2SeqLM",
+     "FlaxAutoModelForSequenceClassification",
+     "FlaxAutoModelForSpeechSeq2Seq",
+     "FlaxAutoModelForTokenClassification",
+     "FlaxAutoModelForVision2Seq",
+ ]
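
As a minimal sketch of how the Flax auto classes above dispatch (illustrative only, not part of the upstream file; it assumes `jax` and `flax` are installed and the BERT checkpoint is just an example):

```python
# Minimal sketch: FlaxAutoModel resolving a concrete Flax model class.
# Assumes jax/flax are installed and the Hub is reachable.
from transformers import AutoTokenizer, FlaxAutoModel

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
# The checkpoint's model_type ("bert") selects FlaxBertModel via FLAX_MODEL_MAPPING.
model = FlaxAutoModel.from_pretrained("google-bert/bert-base-uncased")

inputs = tokenizer("Hello, world!", return_tensors="np")  # Flax models accept numpy arrays
outputs = model(**inputs)
print(type(model).__name__, outputs.last_hidden_state.shape)  # (1, seq_len, hidden_size)
```

The task-specific classes (`FlaxAutoModelForCausalLM`, `FlaxAutoModelForSequenceClassification`, etc.) work the same way, each consulting its own mapping table defined above.
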
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/modeling_tf_auto.py ADDED
@@ -0,0 +1,776 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Auto Model class."""
16
+
17
+ import warnings
18
+ from collections import OrderedDict
19
+
20
+ from ...utils import logging
21
+ from .auto_factory import _BaseAutoModelClass, _LazyAutoMapping, auto_class_update
22
+ from .configuration_auto import CONFIG_MAPPING_NAMES
23
+
24
+
25
+ logger = logging.get_logger(__name__)
+
+
+ TF_MODEL_MAPPING_NAMES = OrderedDict(
+     [
+         # Base model mapping
+         ("albert", "TFAlbertModel"),
+         ("bart", "TFBartModel"),
+         ("bert", "TFBertModel"),
+         ("blenderbot", "TFBlenderbotModel"),
+         ("blenderbot-small", "TFBlenderbotSmallModel"),
+         ("blip", "TFBlipModel"),
+         ("camembert", "TFCamembertModel"),
+         ("clip", "TFCLIPModel"),
+         ("convbert", "TFConvBertModel"),
+         ("convnext", "TFConvNextModel"),
+         ("convnextv2", "TFConvNextV2Model"),
+         ("ctrl", "TFCTRLModel"),
+         ("cvt", "TFCvtModel"),
+         ("data2vec-vision", "TFData2VecVisionModel"),
+         ("deberta", "TFDebertaModel"),
+         ("deberta-v2", "TFDebertaV2Model"),
+         ("deit", "TFDeiTModel"),
+         ("distilbert", "TFDistilBertModel"),
+         ("dpr", "TFDPRQuestionEncoder"),
+         ("efficientformer", "TFEfficientFormerModel"),
+         ("electra", "TFElectraModel"),
+         ("esm", "TFEsmModel"),
+         ("flaubert", "TFFlaubertModel"),
+         ("funnel", ("TFFunnelModel", "TFFunnelBaseModel")),
+         ("gpt-sw3", "TFGPT2Model"),
+         ("gpt2", "TFGPT2Model"),
+         ("gptj", "TFGPTJModel"),
+         ("groupvit", "TFGroupViTModel"),
+         ("hubert", "TFHubertModel"),
+         ("idefics", "TFIdeficsModel"),
+         ("layoutlm", "TFLayoutLMModel"),
+         ("layoutlmv3", "TFLayoutLMv3Model"),
+         ("led", "TFLEDModel"),
+         ("longformer", "TFLongformerModel"),
+         ("lxmert", "TFLxmertModel"),
+         ("marian", "TFMarianModel"),
+         ("mbart", "TFMBartModel"),
+         ("mistral", "TFMistralModel"),
+         ("mobilebert", "TFMobileBertModel"),
+         ("mobilevit", "TFMobileViTModel"),
+         ("mpnet", "TFMPNetModel"),
+         ("mt5", "TFMT5Model"),
+         ("openai-gpt", "TFOpenAIGPTModel"),
+         ("opt", "TFOPTModel"),
+         ("pegasus", "TFPegasusModel"),
+         ("regnet", "TFRegNetModel"),
+         ("rembert", "TFRemBertModel"),
+         ("resnet", "TFResNetModel"),
+         ("roberta", "TFRobertaModel"),
+         ("roberta-prelayernorm", "TFRobertaPreLayerNormModel"),
+         ("roformer", "TFRoFormerModel"),
+         ("sam", "TFSamModel"),
+         ("sam_vision_model", "TFSamVisionModel"),
+         ("segformer", "TFSegformerModel"),
+         ("speech_to_text", "TFSpeech2TextModel"),
+         ("swiftformer", "TFSwiftFormerModel"),
+         ("swin", "TFSwinModel"),
+         ("t5", "TFT5Model"),
+         ("tapas", "TFTapasModel"),
+         ("transfo-xl", "TFTransfoXLModel"),
+         ("vision-text-dual-encoder", "TFVisionTextDualEncoderModel"),
+         ("vit", "TFViTModel"),
+         ("vit_mae", "TFViTMAEModel"),
+         ("wav2vec2", "TFWav2Vec2Model"),
+         ("whisper", "TFWhisperModel"),
+         ("xglm", "TFXGLMModel"),
+         ("xlm", "TFXLMModel"),
+         ("xlm-roberta", "TFXLMRobertaModel"),
+         ("xlnet", "TFXLNetModel"),
+     ]
+ )
+
+ TF_MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for pre-training mapping
+         ("albert", "TFAlbertForPreTraining"),
+         ("bart", "TFBartForConditionalGeneration"),
+         ("bert", "TFBertForPreTraining"),
+         ("camembert", "TFCamembertForMaskedLM"),
+         ("ctrl", "TFCTRLLMHeadModel"),
+         ("distilbert", "TFDistilBertForMaskedLM"),
+         ("electra", "TFElectraForPreTraining"),
+         ("flaubert", "TFFlaubertWithLMHeadModel"),
+         ("funnel", "TFFunnelForPreTraining"),
+         ("gpt-sw3", "TFGPT2LMHeadModel"),
+         ("gpt2", "TFGPT2LMHeadModel"),
+         ("idefics", "TFIdeficsForVisionText2Text"),
+         ("layoutlm", "TFLayoutLMForMaskedLM"),
+         ("lxmert", "TFLxmertForPreTraining"),
+         ("mobilebert", "TFMobileBertForPreTraining"),
+         ("mpnet", "TFMPNetForMaskedLM"),
+         ("openai-gpt", "TFOpenAIGPTLMHeadModel"),
+         ("roberta", "TFRobertaForMaskedLM"),
+         ("roberta-prelayernorm", "TFRobertaPreLayerNormForMaskedLM"),
+         ("t5", "TFT5ForConditionalGeneration"),
+         ("tapas", "TFTapasForMaskedLM"),
+         ("transfo-xl", "TFTransfoXLLMHeadModel"),
+         ("vit_mae", "TFViTMAEForPreTraining"),
+         ("xlm", "TFXLMWithLMHeadModel"),
+         ("xlm-roberta", "TFXLMRobertaForMaskedLM"),
+         ("xlnet", "TFXLNetLMHeadModel"),
+     ]
+ )
+
+ TF_MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
+     [
+         # Model with LM heads mapping
+         ("albert", "TFAlbertForMaskedLM"),
+         ("bart", "TFBartForConditionalGeneration"),
+         ("bert", "TFBertForMaskedLM"),
+         ("camembert", "TFCamembertForMaskedLM"),
+         ("convbert", "TFConvBertForMaskedLM"),
+         ("ctrl", "TFCTRLLMHeadModel"),
+         ("distilbert", "TFDistilBertForMaskedLM"),
+         ("electra", "TFElectraForMaskedLM"),
+         ("esm", "TFEsmForMaskedLM"),
+         ("flaubert", "TFFlaubertWithLMHeadModel"),
+         ("funnel", "TFFunnelForMaskedLM"),
+         ("gpt-sw3", "TFGPT2LMHeadModel"),
+         ("gpt2", "TFGPT2LMHeadModel"),
+         ("gptj", "TFGPTJForCausalLM"),
+         ("layoutlm", "TFLayoutLMForMaskedLM"),
+         ("led", "TFLEDForConditionalGeneration"),
+         ("longformer", "TFLongformerForMaskedLM"),
+         ("marian", "TFMarianMTModel"),
+         ("mobilebert", "TFMobileBertForMaskedLM"),
+         ("mpnet", "TFMPNetForMaskedLM"),
+         ("openai-gpt", "TFOpenAIGPTLMHeadModel"),
+         ("rembert", "TFRemBertForMaskedLM"),
+         ("roberta", "TFRobertaForMaskedLM"),
+         ("roberta-prelayernorm", "TFRobertaPreLayerNormForMaskedLM"),
+         ("roformer", "TFRoFormerForMaskedLM"),
+         ("speech_to_text", "TFSpeech2TextForConditionalGeneration"),
+         ("t5", "TFT5ForConditionalGeneration"),
+         ("tapas", "TFTapasForMaskedLM"),
+         ("transfo-xl", "TFTransfoXLLMHeadModel"),
+         ("whisper", "TFWhisperForConditionalGeneration"),
+         ("xlm", "TFXLMWithLMHeadModel"),
+         ("xlm-roberta", "TFXLMRobertaForMaskedLM"),
+         ("xlnet", "TFXLNetLMHeadModel"),
+     ]
+ )
+
+ TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Causal LM mapping
+         ("bert", "TFBertLMHeadModel"),
+         ("camembert", "TFCamembertForCausalLM"),
+         ("ctrl", "TFCTRLLMHeadModel"),
+         ("gpt-sw3", "TFGPT2LMHeadModel"),
+         ("gpt2", "TFGPT2LMHeadModel"),
+         ("gptj", "TFGPTJForCausalLM"),
+         ("mistral", "TFMistralForCausalLM"),
+         ("openai-gpt", "TFOpenAIGPTLMHeadModel"),
+         ("opt", "TFOPTForCausalLM"),
+         ("rembert", "TFRemBertForCausalLM"),
+         ("roberta", "TFRobertaForCausalLM"),
+         ("roberta-prelayernorm", "TFRobertaPreLayerNormForCausalLM"),
+         ("roformer", "TFRoFormerForCausalLM"),
+         ("transfo-xl", "TFTransfoXLLMHeadModel"),
+         ("xglm", "TFXGLMForCausalLM"),
+         ("xlm", "TFXLMWithLMHeadModel"),
+         ("xlm-roberta", "TFXLMRobertaForCausalLM"),
+         ("xlnet", "TFXLNetLMHeadModel"),
+     ]
+ )
+
+ TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES = OrderedDict(
+     [
+         ("deit", "TFDeiTForMaskedImageModeling"),
+         ("swin", "TFSwinForMaskedImageModeling"),
+     ]
+ )
+
+ TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Image classification mapping
+         ("convnext", "TFConvNextForImageClassification"),
+         ("convnextv2", "TFConvNextV2ForImageClassification"),
+         ("cvt", "TFCvtForImageClassification"),
+         ("data2vec-vision", "TFData2VecVisionForImageClassification"),
+         ("deit", ("TFDeiTForImageClassification", "TFDeiTForImageClassificationWithTeacher")),
+         (
+             "efficientformer",
+             ("TFEfficientFormerForImageClassification", "TFEfficientFormerForImageClassificationWithTeacher"),
+         ),
+         ("mobilevit", "TFMobileViTForImageClassification"),
+         ("regnet", "TFRegNetForImageClassification"),
+         ("resnet", "TFResNetForImageClassification"),
+         ("segformer", "TFSegformerForImageClassification"),
+         ("swiftformer", "TFSwiftFormerForImageClassification"),
+         ("swin", "TFSwinForImageClassification"),
+         ("vit", "TFViTForImageClassification"),
+     ]
+ )
+
+
+ TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Zero Shot Image Classification mapping
+         ("blip", "TFBlipModel"),
+         ("clip", "TFCLIPModel"),
+     ]
+ )
+
+
+ TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Semantic Segmentation mapping
+         ("data2vec-vision", "TFData2VecVisionForSemanticSegmentation"),
+         ("mobilevit", "TFMobileViTForSemanticSegmentation"),
+         ("segformer", "TFSegformerForSemanticSegmentation"),
+     ]
+ )
+
+ TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
+     [
+         ("blip", "TFBlipForConditionalGeneration"),
+         ("vision-encoder-decoder", "TFVisionEncoderDecoderModel"),
+     ]
+ )
+
+ TF_MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Masked LM mapping
+         ("albert", "TFAlbertForMaskedLM"),
+         ("bert", "TFBertForMaskedLM"),
+         ("camembert", "TFCamembertForMaskedLM"),
+         ("convbert", "TFConvBertForMaskedLM"),
+         ("deberta", "TFDebertaForMaskedLM"),
+         ("deberta-v2", "TFDebertaV2ForMaskedLM"),
+         ("distilbert", "TFDistilBertForMaskedLM"),
+         ("electra", "TFElectraForMaskedLM"),
+         ("esm", "TFEsmForMaskedLM"),
+         ("flaubert", "TFFlaubertWithLMHeadModel"),
+         ("funnel", "TFFunnelForMaskedLM"),
+         ("layoutlm", "TFLayoutLMForMaskedLM"),
+         ("longformer", "TFLongformerForMaskedLM"),
+         ("mobilebert", "TFMobileBertForMaskedLM"),
+         ("mpnet", "TFMPNetForMaskedLM"),
+         ("rembert", "TFRemBertForMaskedLM"),
+         ("roberta", "TFRobertaForMaskedLM"),
+         ("roberta-prelayernorm", "TFRobertaPreLayerNormForMaskedLM"),
+         ("roformer", "TFRoFormerForMaskedLM"),
+         ("tapas", "TFTapasForMaskedLM"),
+         ("xlm", "TFXLMWithLMHeadModel"),
+         ("xlm-roberta", "TFXLMRobertaForMaskedLM"),
+     ]
+ )
+
+ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Seq2Seq Causal LM mapping
+         ("bart", "TFBartForConditionalGeneration"),
+         ("blenderbot", "TFBlenderbotForConditionalGeneration"),
+         ("blenderbot-small", "TFBlenderbotSmallForConditionalGeneration"),
+         ("encoder-decoder", "TFEncoderDecoderModel"),
+         ("led", "TFLEDForConditionalGeneration"),
+         ("marian", "TFMarianMTModel"),
+         ("mbart", "TFMBartForConditionalGeneration"),
+         ("mt5", "TFMT5ForConditionalGeneration"),
+         ("pegasus", "TFPegasusForConditionalGeneration"),
+         ("t5", "TFT5ForConditionalGeneration"),
+     ]
+ )
+
+ TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict(
+     [
+         ("speech_to_text", "TFSpeech2TextForConditionalGeneration"),
+         ("whisper", "TFWhisperForConditionalGeneration"),
+     ]
+ )
+
+ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Sequence Classification mapping
+         ("albert", "TFAlbertForSequenceClassification"),
+         ("bart", "TFBartForSequenceClassification"),
+         ("bert", "TFBertForSequenceClassification"),
+         ("camembert", "TFCamembertForSequenceClassification"),
+         ("convbert", "TFConvBertForSequenceClassification"),
+         ("ctrl", "TFCTRLForSequenceClassification"),
+         ("deberta", "TFDebertaForSequenceClassification"),
+         ("deberta-v2", "TFDebertaV2ForSequenceClassification"),
+         ("distilbert", "TFDistilBertForSequenceClassification"),
+         ("electra", "TFElectraForSequenceClassification"),
+         ("esm", "TFEsmForSequenceClassification"),
+         ("flaubert", "TFFlaubertForSequenceClassification"),
+         ("funnel", "TFFunnelForSequenceClassification"),
+         ("gpt-sw3", "TFGPT2ForSequenceClassification"),
+         ("gpt2", "TFGPT2ForSequenceClassification"),
+         ("gptj", "TFGPTJForSequenceClassification"),
+         ("layoutlm", "TFLayoutLMForSequenceClassification"),
+         ("layoutlmv3", "TFLayoutLMv3ForSequenceClassification"),
+         ("longformer", "TFLongformerForSequenceClassification"),
+         ("mistral", "TFMistralForSequenceClassification"),
+         ("mobilebert", "TFMobileBertForSequenceClassification"),
+         ("mpnet", "TFMPNetForSequenceClassification"),
+         ("openai-gpt", "TFOpenAIGPTForSequenceClassification"),
+         ("rembert", "TFRemBertForSequenceClassification"),
+         ("roberta", "TFRobertaForSequenceClassification"),
+         ("roberta-prelayernorm", "TFRobertaPreLayerNormForSequenceClassification"),
+         ("roformer", "TFRoFormerForSequenceClassification"),
+         ("tapas", "TFTapasForSequenceClassification"),
+         ("transfo-xl", "TFTransfoXLForSequenceClassification"),
+         ("xlm", "TFXLMForSequenceClassification"),
+         ("xlm-roberta", "TFXLMRobertaForSequenceClassification"),
+         ("xlnet", "TFXLNetForSequenceClassification"),
+     ]
+ )
+
+ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Question Answering mapping
+         ("albert", "TFAlbertForQuestionAnswering"),
+         ("bert", "TFBertForQuestionAnswering"),
+         ("camembert", "TFCamembertForQuestionAnswering"),
+         ("convbert", "TFConvBertForQuestionAnswering"),
+         ("deberta", "TFDebertaForQuestionAnswering"),
+         ("deberta-v2", "TFDebertaV2ForQuestionAnswering"),
+         ("distilbert", "TFDistilBertForQuestionAnswering"),
+         ("electra", "TFElectraForQuestionAnswering"),
+         ("flaubert", "TFFlaubertForQuestionAnsweringSimple"),
+         ("funnel", "TFFunnelForQuestionAnswering"),
+         ("gptj", "TFGPTJForQuestionAnswering"),
+         ("layoutlmv3", "TFLayoutLMv3ForQuestionAnswering"),
+         ("longformer", "TFLongformerForQuestionAnswering"),
+         ("mobilebert", "TFMobileBertForQuestionAnswering"),
+         ("mpnet", "TFMPNetForQuestionAnswering"),
+         ("rembert", "TFRemBertForQuestionAnswering"),
+         ("roberta", "TFRobertaForQuestionAnswering"),
+         ("roberta-prelayernorm", "TFRobertaPreLayerNormForQuestionAnswering"),
+         ("roformer", "TFRoFormerForQuestionAnswering"),
+         ("xlm", "TFXLMForQuestionAnsweringSimple"),
+         ("xlm-roberta", "TFXLMRobertaForQuestionAnswering"),
+         ("xlnet", "TFXLNetForQuestionAnsweringSimple"),
+     ]
+ )
+ TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict([("wav2vec2", "TFWav2Vec2ForSequenceClassification")])
+
+ TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
+     [
+         ("layoutlm", "TFLayoutLMForQuestionAnswering"),
+         ("layoutlmv3", "TFLayoutLMv3ForQuestionAnswering"),
+     ]
+ )
+
+
+ TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Table Question Answering mapping
+         ("tapas", "TFTapasForQuestionAnswering"),
+     ]
+ )
+
+ TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Token Classification mapping
+         ("albert", "TFAlbertForTokenClassification"),
+         ("bert", "TFBertForTokenClassification"),
+         ("camembert", "TFCamembertForTokenClassification"),
+         ("convbert", "TFConvBertForTokenClassification"),
+         ("deberta", "TFDebertaForTokenClassification"),
+         ("deberta-v2", "TFDebertaV2ForTokenClassification"),
+         ("distilbert", "TFDistilBertForTokenClassification"),
+         ("electra", "TFElectraForTokenClassification"),
+         ("esm", "TFEsmForTokenClassification"),
+         ("flaubert", "TFFlaubertForTokenClassification"),
+         ("funnel", "TFFunnelForTokenClassification"),
+         ("layoutlm", "TFLayoutLMForTokenClassification"),
+         ("layoutlmv3", "TFLayoutLMv3ForTokenClassification"),
+         ("longformer", "TFLongformerForTokenClassification"),
+         ("mobilebert", "TFMobileBertForTokenClassification"),
+         ("mpnet", "TFMPNetForTokenClassification"),
+         ("rembert", "TFRemBertForTokenClassification"),
+         ("roberta", "TFRobertaForTokenClassification"),
+         ("roberta-prelayernorm", "TFRobertaPreLayerNormForTokenClassification"),
+         ("roformer", "TFRoFormerForTokenClassification"),
+         ("xlm", "TFXLMForTokenClassification"),
+         ("xlm-roberta", "TFXLMRobertaForTokenClassification"),
+         ("xlnet", "TFXLNetForTokenClassification"),
+     ]
+ )
+
+ TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict(
+     [
+         # Model for Multiple Choice mapping
+         ("albert", "TFAlbertForMultipleChoice"),
+         ("bert", "TFBertForMultipleChoice"),
+         ("camembert", "TFCamembertForMultipleChoice"),
+         ("convbert", "TFConvBertForMultipleChoice"),
+         ("deberta-v2", "TFDebertaV2ForMultipleChoice"),
+         ("distilbert", "TFDistilBertForMultipleChoice"),
+         ("electra", "TFElectraForMultipleChoice"),
+         ("flaubert", "TFFlaubertForMultipleChoice"),
+         ("funnel", "TFFunnelForMultipleChoice"),
+         ("longformer", "TFLongformerForMultipleChoice"),
+         ("mobilebert", "TFMobileBertForMultipleChoice"),
+         ("mpnet", "TFMPNetForMultipleChoice"),
+         ("rembert", "TFRemBertForMultipleChoice"),
+         ("roberta", "TFRobertaForMultipleChoice"),
+         ("roberta-prelayernorm", "TFRobertaPreLayerNormForMultipleChoice"),
+         ("roformer", "TFRoFormerForMultipleChoice"),
+         ("xlm", "TFXLMForMultipleChoice"),
+         ("xlm-roberta", "TFXLMRobertaForMultipleChoice"),
+         ("xlnet", "TFXLNetForMultipleChoice"),
+     ]
+ )
+
+ TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES = OrderedDict(
+     [
+         ("bert", "TFBertForNextSentencePrediction"),
+         ("mobilebert", "TFMobileBertForNextSentencePrediction"),
+     ]
+ )
+ TF_MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = OrderedDict(
+     [
+         ("sam", "TFSamModel"),
+     ]
+ )
+ TF_MODEL_FOR_TEXT_ENCODING_MAPPING_NAMES = OrderedDict(
+     [
+         ("albert", "TFAlbertModel"),
+         ("bert", "TFBertModel"),
+         ("convbert", "TFConvBertModel"),
+         ("deberta", "TFDebertaModel"),
+         ("deberta-v2", "TFDebertaV2Model"),
+         ("distilbert", "TFDistilBertModel"),
+         ("electra", "TFElectraModel"),
+         ("flaubert", "TFFlaubertModel"),
+         ("longformer", "TFLongformerModel"),
+         ("mobilebert", "TFMobileBertModel"),
+         ("mt5", "TFMT5EncoderModel"),
+         ("rembert", "TFRemBertModel"),
+         ("roberta", "TFRobertaModel"),
+         ("roberta-prelayernorm", "TFRobertaPreLayerNormModel"),
+         ("roformer", "TFRoFormerModel"),
+         ("t5", "TFT5EncoderModel"),
+         ("xlm", "TFXLMModel"),
+         ("xlm-roberta", "TFXLMRobertaModel"),
+     ]
+ )
+
+ TF_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_MAPPING_NAMES)
+ TF_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_PRETRAINING_MAPPING_NAMES)
+ TF_MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_WITH_LM_HEAD_MAPPING_NAMES)
+ TF_MODEL_FOR_CAUSAL_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
+ TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES
+ )
+ TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
+ )
+ TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES
+ )
+ TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES
+ )
+ TF_MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
+ TF_MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MASKED_LM_MAPPING_NAMES)
+ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
+ )
+ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
+ )
+ TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES
+ )
+ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
+ )
+ TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES
+ )
+ TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES
+ )
+ TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
+ )
+ TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES
+ )
+ TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES
+ )
+ TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES
+ )
+
+ TF_MODEL_FOR_MASK_GENERATION_MAPPING = _LazyAutoMapping(
+     CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MASK_GENERATION_MAPPING_NAMES
+ )
+
+ TF_MODEL_FOR_TEXT_ENCODING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TEXT_ENCODING_MAPPING_NAMES)
+
+
+ class TFAutoModelForMaskGeneration(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_MASK_GENERATION_MAPPING
+
+
+ class TFAutoModelForTextEncoding(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_TEXT_ENCODING_MAPPING
+
+
+ class TFAutoModel(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_MAPPING
+
+
+ TFAutoModel = auto_class_update(TFAutoModel)
+
+
+ class TFAutoModelForAudioClassification(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
+
+
+ TFAutoModelForAudioClassification = auto_class_update(
+     TFAutoModelForAudioClassification, head_doc="audio classification"
+ )
+
+
+ class TFAutoModelForPreTraining(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_PRETRAINING_MAPPING
+
+
+ TFAutoModelForPreTraining = auto_class_update(TFAutoModelForPreTraining, head_doc="pretraining")
+
+
+ # Private on purpose, the public class will add the deprecation warnings.
+ class _TFAutoModelWithLMHead(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_WITH_LM_HEAD_MAPPING
+
+
+ _TFAutoModelWithLMHead = auto_class_update(_TFAutoModelWithLMHead, head_doc="language modeling")
+
+
+ class TFAutoModelForCausalLM(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_CAUSAL_LM_MAPPING
+
+
+ TFAutoModelForCausalLM = auto_class_update(TFAutoModelForCausalLM, head_doc="causal language modeling")
+
+
+ class TFAutoModelForMaskedImageModeling(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING
+
+
+ TFAutoModelForMaskedImageModeling = auto_class_update(
+     TFAutoModelForMaskedImageModeling, head_doc="masked image modeling"
+ )
+
+
+ class TFAutoModelForImageClassification(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
+
+
+ TFAutoModelForImageClassification = auto_class_update(
+     TFAutoModelForImageClassification, head_doc="image classification"
+ )
+
+
+ class TFAutoModelForZeroShotImageClassification(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING
+
+
+ TFAutoModelForZeroShotImageClassification = auto_class_update(
+     TFAutoModelForZeroShotImageClassification, head_doc="zero-shot image classification"
+ )
+
+
+ class TFAutoModelForSemanticSegmentation(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING
+
+
+ TFAutoModelForSemanticSegmentation = auto_class_update(
+     TFAutoModelForSemanticSegmentation, head_doc="semantic segmentation"
+ )
+
+
+ class TFAutoModelForVision2Seq(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_VISION_2_SEQ_MAPPING
+
+
+ TFAutoModelForVision2Seq = auto_class_update(TFAutoModelForVision2Seq, head_doc="vision-to-text modeling")
+
+
+ class TFAutoModelForMaskedLM(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_MASKED_LM_MAPPING
+
+
+ TFAutoModelForMaskedLM = auto_class_update(TFAutoModelForMaskedLM, head_doc="masked language modeling")
+
+
+ class TFAutoModelForSeq2SeqLM(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
+
+
+ TFAutoModelForSeq2SeqLM = auto_class_update(
+     TFAutoModelForSeq2SeqLM,
+     head_doc="sequence-to-sequence language modeling",
+     checkpoint_for_example="google-t5/t5-base",
+ )
+
+
+ class TFAutoModelForSequenceClassification(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
+
+
+ TFAutoModelForSequenceClassification = auto_class_update(
+     TFAutoModelForSequenceClassification, head_doc="sequence classification"
+ )
+
+
+ class TFAutoModelForQuestionAnswering(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING
+
+
+ TFAutoModelForQuestionAnswering = auto_class_update(TFAutoModelForQuestionAnswering, head_doc="question answering")
+
+
+ class TFAutoModelForDocumentQuestionAnswering(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING
+
+
+ TFAutoModelForDocumentQuestionAnswering = auto_class_update(
+     TFAutoModelForDocumentQuestionAnswering,
+     head_doc="document question answering",
+     checkpoint_for_example='impira/layoutlm-document-qa", revision="52e01b3',
+ )
+
+
+ class TFAutoModelForTableQuestionAnswering(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
+
+
+ TFAutoModelForTableQuestionAnswering = auto_class_update(
+     TFAutoModelForTableQuestionAnswering,
+     head_doc="table question answering",
+     checkpoint_for_example="google/tapas-base-finetuned-wtq",
+ )
+
+
+ class TFAutoModelForTokenClassification(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
+
+
+ TFAutoModelForTokenClassification = auto_class_update(
+     TFAutoModelForTokenClassification, head_doc="token classification"
+ )
+
+
+ class TFAutoModelForMultipleChoice(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING
+
+
+ TFAutoModelForMultipleChoice = auto_class_update(TFAutoModelForMultipleChoice, head_doc="multiple choice")
+
+
+ class TFAutoModelForNextSentencePrediction(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING
+
+
+ TFAutoModelForNextSentencePrediction = auto_class_update(
+     TFAutoModelForNextSentencePrediction, head_doc="next sentence prediction"
+ )
+
+
+ class TFAutoModelForSpeechSeq2Seq(_BaseAutoModelClass):
+     _model_mapping = TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING
+
+
+ TFAutoModelForSpeechSeq2Seq = auto_class_update(
+     TFAutoModelForSpeechSeq2Seq, head_doc="sequence-to-sequence speech-to-text modeling"
+ )
+
+
+ class TFAutoModelWithLMHead(_TFAutoModelWithLMHead):
+     @classmethod
+     def from_config(cls, config):
+         warnings.warn(
+             "The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use"
+             " `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models"
+             " and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.",
+             FutureWarning,
+         )
+         return super().from_config(config)
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+         warnings.warn(
+             "The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use"
+             " `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models"
+             " and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.",
+             FutureWarning,
+         )
+         return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+
+
+ __all__ = [
+     "TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
+     "TF_MODEL_FOR_CAUSAL_LM_MAPPING",
+     "TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
+     "TF_MODEL_FOR_MASK_GENERATION_MAPPING",
+     "TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING",
+     "TF_MODEL_FOR_MASKED_LM_MAPPING",
+     "TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
+     "TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
+     "TF_MODEL_FOR_PRETRAINING_MAPPING",
+     "TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
+     "TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
+     "TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
+     "TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
+     "TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
+     "TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
+     "TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
+     "TF_MODEL_FOR_TEXT_ENCODING_MAPPING",
+     "TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
+     "TF_MODEL_FOR_VISION_2_SEQ_MAPPING",
+     "TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING",
+     "TF_MODEL_MAPPING",
+     "TF_MODEL_WITH_LM_HEAD_MAPPING",
+     "TFAutoModel",
+     "TFAutoModelForAudioClassification",
+     "TFAutoModelForCausalLM",
+     "TFAutoModelForImageClassification",
+     "TFAutoModelForMaskedImageModeling",
+     "TFAutoModelForMaskedLM",
+     "TFAutoModelForMaskGeneration",
+     "TFAutoModelForMultipleChoice",
+     "TFAutoModelForNextSentencePrediction",
+     "TFAutoModelForPreTraining",
+     "TFAutoModelForDocumentQuestionAnswering",
+     "TFAutoModelForQuestionAnswering",
+     "TFAutoModelForSemanticSegmentation",
+     "TFAutoModelForSeq2SeqLM",
+     "TFAutoModelForSequenceClassification",
+     "TFAutoModelForSpeechSeq2Seq",
+     "TFAutoModelForTableQuestionAnswering",
+     "TFAutoModelForTextEncoding",
+     "TFAutoModelForTokenClassification",
+     "TFAutoModelForVision2Seq",
+     "TFAutoModelForZeroShotImageClassification",
+     "TFAutoModelWithLMHead",
+ ]
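[Editor's note] The file above only builds lookup tables and thin `_BaseAutoModelClass` subclasses; the actual dispatch happens in `from_pretrained`, which reads the checkpoint config's `model_type` and resolves it through the corresponding `_LazyAutoMapping`. A minimal usage sketch, assuming TensorFlow (and the TF extras of `transformers`) are installed; the checkpoint name "openai-community/gpt2" is illustrative and not taken from this diff:

    from transformers import TFAutoModel, TFAutoModelForCausalLM

    # model_type "gpt2" in the checkpoint config resolves via the tables above
    base = TFAutoModel.from_pretrained("openai-community/gpt2")           # -> TFGPT2Model via TF_MODEL_MAPPING
    lm = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")  # -> TFGPT2LMHeadModel via TF_MODEL_FOR_CAUSAL_LM_MAPPING

The deprecated `TFAutoModelWithLMHead` still resolves through TF_MODEL_WITH_LM_HEAD_MAPPING, but, as its overrides show, it emits a FutureWarning steering callers to the three task-specific classes.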
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/processing_auto.py ADDED
@@ -0,0 +1,443 @@
+ # coding=utf-8
+ # Copyright 2021 The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """AutoProcessor class."""
+
+ import importlib
+ import inspect
+ import json
+ import warnings
+ from collections import OrderedDict
+
+ # Build the list of all feature extractors
+ from ...configuration_utils import PretrainedConfig
+ from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
+ from ...feature_extraction_utils import FeatureExtractionMixin
+ from ...image_processing_utils import ImageProcessingMixin
+ from ...processing_utils import ProcessorMixin
+ from ...tokenization_utils import TOKENIZER_CONFIG_FILE
+ from ...utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, VIDEO_PROCESSOR_NAME, cached_file, logging
+ from ...video_processing_utils import BaseVideoProcessor
+ from .auto_factory import _LazyAutoMapping
+ from .configuration_auto import (
+     CONFIG_MAPPING_NAMES,
+     AutoConfig,
+     model_type_to_module_name,
+     replace_list_option_in_docstrings,
+ )
+ from .feature_extraction_auto import AutoFeatureExtractor
+ from .image_processing_auto import AutoImageProcessor
+ from .tokenization_auto import AutoTokenizer
+
+
+ logger = logging.get_logger(__name__)
+
+ PROCESSOR_MAPPING_NAMES = OrderedDict(
+     [
+         ("aimv2", "CLIPProcessor"),
+         ("align", "AlignProcessor"),
+         ("altclip", "AltCLIPProcessor"),
+         ("aria", "AriaProcessor"),
+         ("aya_vision", "AyaVisionProcessor"),
+         ("bark", "BarkProcessor"),
+         ("blip", "BlipProcessor"),
+         ("blip-2", "Blip2Processor"),
+         ("bridgetower", "BridgeTowerProcessor"),
+         ("chameleon", "ChameleonProcessor"),
+         ("chinese_clip", "ChineseCLIPProcessor"),
+         ("clap", "ClapProcessor"),
+         ("clip", "CLIPProcessor"),
+         ("clipseg", "CLIPSegProcessor"),
+         ("clvp", "ClvpProcessor"),
+         ("cohere2_vision", "Cohere2VisionProcessor"),
+         ("colpali", "ColPaliProcessor"),
+         ("colqwen2", "ColQwen2Processor"),
+         ("deepseek_vl", "DeepseekVLProcessor"),
+         ("deepseek_vl_hybrid", "DeepseekVLHybridProcessor"),
+         ("dia", "DiaProcessor"),
+         ("edgetam", "Sam2Processor"),
+         ("emu3", "Emu3Processor"),
+         ("evolla", "EvollaProcessor"),
+         ("flava", "FlavaProcessor"),
+         ("florence2", "Florence2Processor"),
+         ("fuyu", "FuyuProcessor"),
+         ("gemma3", "Gemma3Processor"),
+         ("gemma3n", "Gemma3nProcessor"),
+         ("git", "GitProcessor"),
+         ("glm4v", "Glm4vProcessor"),
+         ("glm4v_moe", "Glm4vProcessor"),
+         ("got_ocr2", "GotOcr2Processor"),
+         ("granite_speech", "GraniteSpeechProcessor"),
+         ("grounding-dino", "GroundingDinoProcessor"),
+         ("groupvit", "CLIPProcessor"),
+         ("hubert", "Wav2Vec2Processor"),
+         ("idefics", "IdeficsProcessor"),
+         ("idefics2", "Idefics2Processor"),
+         ("idefics3", "Idefics3Processor"),
+         ("instructblip", "InstructBlipProcessor"),
+         ("instructblipvideo", "InstructBlipVideoProcessor"),
+         ("internvl", "InternVLProcessor"),
+         ("janus", "JanusProcessor"),
+         ("kosmos-2", "Kosmos2Processor"),
+         ("kosmos-2.5", "Kosmos2_5Processor"),
+         ("kyutai_speech_to_text", "KyutaiSpeechToTextProcessor"),
+         ("layoutlmv2", "LayoutLMv2Processor"),
+         ("layoutlmv3", "LayoutLMv3Processor"),
+         ("lfm2_vl", "Lfm2VlProcessor"),
+         ("llama4", "Llama4Processor"),
+         ("llava", "LlavaProcessor"),
+         ("llava_next", "LlavaNextProcessor"),
+         ("llava_next_video", "LlavaNextVideoProcessor"),
+         ("llava_onevision", "LlavaOnevisionProcessor"),
+         ("markuplm", "MarkupLMProcessor"),
+         ("mctct", "MCTCTProcessor"),
+         ("metaclip_2", "CLIPProcessor"),
+         ("mgp-str", "MgpstrProcessor"),
+         ("mistral3", "PixtralProcessor"),
+         ("mllama", "MllamaProcessor"),
+         ("mm-grounding-dino", "GroundingDinoProcessor"),
+         ("moonshine", "Wav2Vec2Processor"),
+         ("oneformer", "OneFormerProcessor"),
+         ("ovis2", "Ovis2Processor"),
+         ("owlv2", "Owlv2Processor"),
+         ("owlvit", "OwlViTProcessor"),
+         ("paligemma", "PaliGemmaProcessor"),
+         ("perception_lm", "PerceptionLMProcessor"),
+         ("phi4_multimodal", "Phi4MultimodalProcessor"),
+         ("pix2struct", "Pix2StructProcessor"),
+         ("pixtral", "PixtralProcessor"),
+         ("pop2piano", "Pop2PianoProcessor"),
+         ("qwen2_5_omni", "Qwen2_5OmniProcessor"),
+         ("qwen2_5_vl", "Qwen2_5_VLProcessor"),
+         ("qwen2_audio", "Qwen2AudioProcessor"),
+         ("qwen2_vl", "Qwen2VLProcessor"),
+         ("qwen3_omni_moe", "Qwen3OmniMoeProcessor"),
+         ("qwen3_vl", "Qwen3VLProcessor"),
+         ("qwen3_vl_moe", "Qwen3VLProcessor"),
+         ("sam", "SamProcessor"),
+         ("sam2", "Sam2Processor"),
+         ("sam_hq", "SamHQProcessor"),
+         ("seamless_m4t", "SeamlessM4TProcessor"),
+         ("sew", "Wav2Vec2Processor"),
+         ("sew-d", "Wav2Vec2Processor"),
+         ("shieldgemma2", "ShieldGemma2Processor"),
+         ("siglip", "SiglipProcessor"),
+         ("siglip2", "Siglip2Processor"),
+         ("smolvlm", "SmolVLMProcessor"),
+         ("speech_to_text", "Speech2TextProcessor"),
+         ("speech_to_text_2", "Speech2Text2Processor"),
+         ("speecht5", "SpeechT5Processor"),
+         ("trocr", "TrOCRProcessor"),
+         ("tvlt", "TvltProcessor"),
+         ("tvp", "TvpProcessor"),
+         ("udop", "UdopProcessor"),
+         ("unispeech", "Wav2Vec2Processor"),
+         ("unispeech-sat", "Wav2Vec2Processor"),
+         ("video_llava", "VideoLlavaProcessor"),
+         ("vilt", "ViltProcessor"),
+         ("vipllava", "LlavaProcessor"),
+         ("vision-text-dual-encoder", "VisionTextDualEncoderProcessor"),
+         ("voxtral", "VoxtralProcessor"),
+         ("wav2vec2", "Wav2Vec2Processor"),
+         ("wav2vec2-bert", "Wav2Vec2Processor"),
+         ("wav2vec2-conformer", "Wav2Vec2Processor"),
+         ("wavlm", "Wav2Vec2Processor"),
+         ("whisper", "WhisperProcessor"),
+         ("xclip", "XCLIPProcessor"),
+     ]
+ )
+
+ PROCESSOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, PROCESSOR_MAPPING_NAMES)
+
+
+ def processor_class_from_name(class_name: str):
+     for module_name, processors in PROCESSOR_MAPPING_NAMES.items():
+         if class_name in processors:
+             module_name = model_type_to_module_name(module_name)
+
+             module = importlib.import_module(f".{module_name}", "transformers.models")
+             try:
+                 return getattr(module, class_name)
+             except AttributeError:
+                 continue
+
+     for processor in PROCESSOR_MAPPING._extra_content.values():
+         if getattr(processor, "__name__", None) == class_name:
+             return processor
+
+     # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
+     # init and we return the proper dummy to get an appropriate error message.
+     main_module = importlib.import_module("transformers")
+     if hasattr(main_module, class_name):
+         return getattr(main_module, class_name)
+
+     return None
+
+
+ class AutoProcessor:
+     r"""
+     This is a generic processor class that will be instantiated as one of the processor classes of the library when
+     created with the [`AutoProcessor.from_pretrained`] class method.
+
+     This class cannot be instantiated directly using `__init__()` (throws an error).
+     """
+
+     def __init__(self):
+         raise OSError(
+             "AutoProcessor is designed to be instantiated "
+             "using the `AutoProcessor.from_pretrained(pretrained_model_name_or_path)` method."
+         )
+
+     @classmethod
+     @replace_list_option_in_docstrings(PROCESSOR_MAPPING_NAMES)
+     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+         r"""
+         Instantiate one of the processor classes of the library from a pretrained model vocabulary.
+
+         The processor class to instantiate is selected based on the `model_type` property of the config object (either
+         passed as an argument or loaded from `pretrained_model_name_or_path` if possible):
+
+         List options
+
+         Params:
+             pretrained_model_name_or_path (`str` or `os.PathLike`):
+                 This can be either:
+
+                 - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                   huggingface.co.
+                 - a path to a *directory* containing processor files saved using the `save_pretrained()` method,
+                   e.g., `./my_model_directory/`.
+             cache_dir (`str` or `os.PathLike`, *optional*):
+                 Path to a directory in which a downloaded pretrained model feature extractor should be cached if the
+                 standard cache should not be used.
+             force_download (`bool`, *optional*, defaults to `False`):
+                 Whether or not to force (re-)downloading the feature extractor files and override the cached versions
+                 if they exist.
+             resume_download:
+                 Deprecated and ignored. All downloads are now resumed by default when possible.
+                 Will be removed in v5 of Transformers.
+             proxies (`dict[str, str]`, *optional*):
+                 A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+                 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
+             token (`str` or *bool*, *optional*):
+                 The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+                 when running `hf auth login` (stored in `~/.huggingface`).
+             revision (`str`, *optional*, defaults to `"main"`):
+                 The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+                 git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+                 identifier allowed by git.
+             return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                 If `False`, then this function returns just the final feature extractor object. If `True`, then this
+                 function returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a dictionary
+                 consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the part of
+                 `kwargs` which has not been used to update `feature_extractor` and is otherwise ignored.
+             trust_remote_code (`bool`, *optional*, defaults to `False`):
+                 Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
+                 should only be set to `True` for repositories you trust and in which you have read the code, as it will
+                 execute code present on the Hub on your local machine.
+             kwargs (`dict[str, Any]`, *optional*):
+                 The values in kwargs of any keys which are feature extractor attributes will be used to override the
+                 loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
+                 controlled by the `return_unused_kwargs` keyword parameter.
+
+         <Tip>
+
+         Passing `token=True` is required when you want to use a private model.
+
+         </Tip>
+
+         Examples:
+
+         ```python
+         >>> from transformers import AutoProcessor
+
+         >>> # Download processor from huggingface.co and cache.
+         >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
+
+         >>> # If processor files are in a directory (e.g. processor was saved using *save_pretrained('./test/saved_model/')*)
+         >>> # processor = AutoProcessor.from_pretrained("./test/saved_model/")
+         ```"""
+         use_auth_token = kwargs.pop("use_auth_token", None)
+         if use_auth_token is not None:
+             warnings.warn(
+                 "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
+                 FutureWarning,
+             )
+             if kwargs.get("token") is not None:
+                 raise ValueError(
+                     "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
+                 )
+             kwargs["token"] = use_auth_token
+
+         config = kwargs.pop("config", None)
+         trust_remote_code = kwargs.pop("trust_remote_code", None)
+         kwargs["_from_auto"] = True
+
+         processor_class = None
+         processor_auto_map = None
+
+         # First, let's see if we have a processor or preprocessor config.
+         # Filter the kwargs for `cached_file`.
+         cached_file_kwargs = {key: kwargs[key] for key in inspect.signature(cached_file).parameters if key in kwargs}
+         # We don't want to raise
+         cached_file_kwargs.update(
+             {
+                 "_raise_exceptions_for_gated_repo": False,
+                 "_raise_exceptions_for_missing_entries": False,
+                 "_raise_exceptions_for_connection_errors": False,
+             }
+         )
+
+         # Let's start by checking whether the processor class is saved in a processor config
+         processor_config_file = cached_file(pretrained_model_name_or_path, PROCESSOR_NAME, **cached_file_kwargs)
+         if processor_config_file is not None:
+             config_dict, _ = ProcessorMixin.get_processor_dict(pretrained_model_name_or_path, **kwargs)
+             processor_class = config_dict.get("processor_class", None)
+             if "AutoProcessor" in config_dict.get("auto_map", {}):
+                 processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
+
+         if processor_class is None:
+             # If not found, let's check whether the processor class is saved in an image processor config
+             preprocessor_config_file = cached_file(
+                 pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, **cached_file_kwargs
+             )
+             if preprocessor_config_file is not None:
+                 config_dict, _ = ImageProcessingMixin.get_image_processor_dict(pretrained_model_name_or_path, **kwargs)
+                 processor_class = config_dict.get("processor_class", None)
+                 if "AutoProcessor" in config_dict.get("auto_map", {}):
+                     processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
+
+             # Saved as video processor
+             if preprocessor_config_file is None:
+                 preprocessor_config_file = cached_file(
+                     pretrained_model_name_or_path, VIDEO_PROCESSOR_NAME, **cached_file_kwargs
+                 )
+                 if preprocessor_config_file is not None:
+                     config_dict, _ = BaseVideoProcessor.get_video_processor_dict(
+                         pretrained_model_name_or_path, **kwargs
+                     )
+                     processor_class = config_dict.get("processor_class", None)
+                     if "AutoProcessor" in config_dict.get("auto_map", {}):
+                         processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
+
+             # Saved as feature extractor
+             if preprocessor_config_file is None:
+                 preprocessor_config_file = cached_file(
+                     pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, **cached_file_kwargs
+                 )
+                 if preprocessor_config_file is not None and processor_class is None:
+                     config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(
+                         pretrained_model_name_or_path, **kwargs
+                     )
+                     processor_class = config_dict.get("processor_class", None)
+                     if "AutoProcessor" in config_dict.get("auto_map", {}):
+                         processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
+
+         if processor_class is None:
+             # Next, let's check whether the processor class is saved in a tokenizer
+             tokenizer_config_file = cached_file(
+                 pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE, **cached_file_kwargs
+             )
+             if tokenizer_config_file is not None:
+                 with open(tokenizer_config_file, encoding="utf-8") as reader:
+                     config_dict = json.load(reader)
+
+                 processor_class = config_dict.get("processor_class", None)
+                 if "AutoProcessor" in config_dict.get("auto_map", {}):
+                     processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
+
+         if processor_class is None:
+             # Otherwise, load config, if it can be loaded.
+             if not isinstance(config, PretrainedConfig):
+                 config = AutoConfig.from_pretrained(
+                     pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
+                 )
+
+             # And check if the config contains the processor class.
+             processor_class = getattr(config, "processor_class", None)
+             if hasattr(config, "auto_map") and "AutoProcessor" in config.auto_map:
+                 processor_auto_map = config.auto_map["AutoProcessor"]
+
+         if processor_class is not None:
+             processor_class = processor_class_from_name(processor_class)
+
+         has_remote_code = processor_auto_map is not None
+         has_local_code = processor_class is not None or type(config) in PROCESSOR_MAPPING
+         if has_remote_code:
+             if "--" in processor_auto_map:
+                 upstream_repo = processor_auto_map.split("--")[0]
+             else:
+                 upstream_repo = None
+             trust_remote_code = resolve_trust_remote_code(
+                 trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code, upstream_repo
+             )
+
+         if has_remote_code and trust_remote_code:
+             processor_class = get_class_from_dynamic_module(
+                 processor_auto_map, pretrained_model_name_or_path, **kwargs
+             )
+             _ = kwargs.pop("code_revision", None)
+             processor_class.register_for_auto_class()
+             return processor_class.from_pretrained(
+                 pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
+             )
+         elif processor_class is not None:
+             return processor_class.from_pretrained(
+                 pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
+             )
+         # Last try: we use the PROCESSOR_MAPPING.
+         elif type(config) in PROCESSOR_MAPPING:
+             return PROCESSOR_MAPPING[type(config)].from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+         # At this stage, there doesn't seem to be a `Processor` class available for this model, so let's try a
+         # tokenizer.
+         try:
+             return AutoTokenizer.from_pretrained(
+                 pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
+             )
+         except Exception:
+             try:
+                 return AutoImageProcessor.from_pretrained(
+                     pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
+                 )
+             except Exception:
+                 pass
+
+             try:
+                 return AutoFeatureExtractor.from_pretrained(
+                     pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
+                 )
+             except Exception:
+                 pass
+
+         raise ValueError(
+             f"Unrecognized processing class in {pretrained_model_name_or_path}. Can't instantiate a processor, a "
+             "tokenizer, an image processor or a feature extractor for this model. Make sure the repository contains "
+             "the files of at least one of those processing classes."
+         )
+
+     @staticmethod
+     def register(config_class, processor_class, exist_ok=False):
+         """
+         Register a new processor for this class.
+
+         Args:
+             config_class ([`PretrainedConfig`]):
+                 The configuration corresponding to the model to register.
+             processor_class ([`ProcessorMixin`]): The processor to register.
+         """
+         PROCESSOR_MAPPING.register(config_class, processor_class, exist_ok=exist_ok)
+
+
+ __all__ = ["PROCESSOR_MAPPING", "AutoProcessor"]
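[Editor's note] `AutoProcessor.from_pretrained` above resolves a processor by probing, in order, the processor config, the image/video/feature-extractor configs, the tokenizer config, and finally the model config, before falling back to `AutoTokenizer`, `AutoImageProcessor`, and `AutoFeatureExtractor`. A minimal sketch of the `register` hook it exposes, assuming hypothetical `MyConfig`/`MyProcessor` classes (both names are illustrative and not part of this file):

    from transformers import AutoConfig, AutoProcessor, PretrainedConfig
    from transformers.processing_utils import ProcessorMixin

    class MyConfig(PretrainedConfig):   # hypothetical config class
        model_type = "my-model"

    class MyProcessor(ProcessorMixin):  # hypothetical, schematic processor
        pass

    AutoConfig.register("my-model", MyConfig)
    AutoProcessor.register(MyConfig, MyProcessor)
    # AutoProcessor.from_pretrained can now resolve checkpoints whose config declares
    # model_type "my-model" through PROCESSOR_MAPPING (the "Last try" branch above).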
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/tokenization_auto.py ADDED
@@ -0,0 +1,1235 @@
+ # coding=utf-8
+ # Copyright 2018 The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Auto Tokenizer class."""
+
+ import importlib
+ import json
+ import os
+ import warnings
+ from collections import OrderedDict
+ from typing import Any, Optional, Union
+
+ from transformers.utils.import_utils import is_mistral_common_available
+
+ from ...configuration_utils import PretrainedConfig
+ from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
+ from ...modeling_gguf_pytorch_utils import load_gguf_checkpoint
+ from ...tokenization_utils import PreTrainedTokenizer
+ from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
+ from ...utils import (
+     cached_file,
+     extract_commit_hash,
+     is_g2p_en_available,
+     is_sentencepiece_available,
+     is_tokenizers_available,
+     logging,
+ )
+ from ..encoder_decoder import EncoderDecoderConfig
+ from .auto_factory import _LazyAutoMapping
+ from .configuration_auto import (
+     CONFIG_MAPPING_NAMES,
+     AutoConfig,
+     config_class_to_model_type,
+     model_type_to_module_name,
+     replace_list_option_in_docstrings,
+ )
+
+
+ if is_tokenizers_available():
+     from ...tokenization_utils_fast import PreTrainedTokenizerFast
+ else:
+     PreTrainedTokenizerFast = None
+
+
+ logger = logging.get_logger(__name__)
+
+ # Explicit rather than inferred generics to significantly improve completion suggestion performance for language servers.
+ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
+     [
+         (
+             "aimv2",
+             (
+                 "CLIPTokenizer",
+                 "CLIPTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         (
+             "albert",
+             (
+                 "AlbertTokenizer" if is_sentencepiece_available() else None,
+                 "AlbertTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
+         ("arcee", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
+         ("aria", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
+         ("aya_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
+         ("bark", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
+         ("bart", ("BartTokenizer", "BartTokenizerFast")),
+         (
+             "barthez",
+             (
+                 "BarthezTokenizer" if is_sentencepiece_available() else None,
+                 "BarthezTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         ("bartpho", ("BartphoTokenizer", None)),
+         ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
+         ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
+         ("bert-japanese", ("BertJapaneseTokenizer", None)),
+         ("bertweet", ("BertweetTokenizer", None)),
+         (
+             "big_bird",
+             (
+                 "BigBirdTokenizer" if is_sentencepiece_available() else None,
+                 "BigBirdTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)),
+         ("biogpt", ("BioGptTokenizer", None)),
+         ("bitnet", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
+         ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
+         ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
+         ("blip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
+         ("blip-2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
+         ("bloom", (None, "BloomTokenizerFast" if is_tokenizers_available() else None)),
+         ("blt", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
+         ("bridgetower", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
+         ("bros", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
+         ("byt5", ("ByT5Tokenizer", None)),
+         (
+             "camembert",
+             (
+                 "CamembertTokenizer" if is_sentencepiece_available() else None,
+                 "CamembertTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         ("canine", ("CanineTokenizer", None)),
+         (
+             "chameleon",
+             (
+                 "LlamaTokenizer" if is_sentencepiece_available() else None,
+                 "LlamaTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         ("chinese_clip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
+         (
+             "clap",
+             (
+                 "RobertaTokenizer",
+                 "RobertaTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         (
+             "clip",
+             (
+                 "CLIPTokenizer",
+                 "CLIPTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         (
+             "clipseg",
+             (
+                 "CLIPTokenizer",
+                 "CLIPTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         ("clvp", ("ClvpTokenizer", None)),
+         (
+             "code_llama",
+             (
+                 "CodeLlamaTokenizer" if is_sentencepiece_available() else None,
+                 "CodeLlamaTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
+         ("cohere", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
+         ("cohere2", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
+         ("colpali", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
+         ("colqwen2", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
+         ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
+         (
+             "cpm",
+             (
+                 "CpmTokenizer" if is_sentencepiece_available() else None,
+                 "CpmTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         ("cpmant", ("CpmAntTokenizer", None)),
+         ("csm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
+         ("ctrl", ("CTRLTokenizer", None)),
+         ("data2vec-audio", ("Wav2Vec2CTCTokenizer", None)),
+         ("data2vec-text", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
+         ("dbrx", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
+         ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)),
+         (
+             "deberta-v2",
+             (
+                 "DebertaV2Tokenizer" if is_sentencepiece_available() else None,
+                 "DebertaV2TokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         (
+             "deepseek_v2",
+             (
+                 "LlamaTokenizer" if is_sentencepiece_available() else None,
+                 "LlamaTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         (
+             "deepseek_v3",
+             (
+                 "LlamaTokenizer" if is_sentencepiece_available() else None,
+                 "LlamaTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         (
+             "deepseek_vl",
+             (
+                 "LlamaTokenizer" if is_sentencepiece_available() else None,
+                 "LlamaTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         (
+             "deepseek_vl_hybrid",
+             (
+                 "LlamaTokenizer" if is_sentencepiece_available() else None,
+                 "LlamaTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         ("dia", ("DiaTokenizer", None)),
+         (
+             "diffllama",
+             (
+                 "LlamaTokenizer" if is_sentencepiece_available() else None,
+                 "LlamaTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)),
+         (
+             "dpr",
+             (
+                 "DPRQuestionEncoderTokenizer",
+                 "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)),
+         ("emu3", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
+         ("ernie", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
+         ("ernie4_5", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
+         ("ernie4_5_moe", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
+         ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)),
+         ("esm", ("EsmTokenizer", None)),
+         (
+             "exaone4",
+             (
+                 "GPT2Tokenizer" if is_tokenizers_available() else None,
+                 "GPT2TokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         ("falcon", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
+         ("falcon_mamba", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
+         (
+             "fastspeech2_conformer",
+             ("FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None, None),
+         ),
+         ("flaubert", ("FlaubertTokenizer", None)),
+         ("flex_olmo", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)),
+         ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)),
+         ("fsmt", ("FSMTTokenizer", None)),
+         ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),
+         (
+             "gemma",
+             (
+                 "GemmaTokenizer" if is_sentencepiece_available() else None,
+                 "GemmaTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         (
+             "gemma2",
+             (
+                 "GemmaTokenizer" if is_sentencepiece_available() else None,
+                 "GemmaTokenizerFast" if is_tokenizers_available() else None,
+             ),
+         ),
+         (
+             "gemma3",
+             (
+                 "GemmaTokenizer" if is_sentencepiece_available() else None,
271
+ "GemmaTokenizerFast" if is_tokenizers_available() else None,
272
+ ),
273
+ ),
274
+ (
275
+ "gemma3_text",
276
+ (
277
+ "GemmaTokenizer" if is_sentencepiece_available() else None,
278
+ "GemmaTokenizerFast" if is_tokenizers_available() else None,
279
+ ),
280
+ ),
281
+ (
282
+ "gemma3n",
283
+ (
284
+ "GemmaTokenizer" if is_sentencepiece_available() else None,
285
+ "GemmaTokenizerFast" if is_tokenizers_available() else None,
286
+ ),
287
+ ),
288
+ (
289
+ "gemma3n_text",
290
+ (
291
+ "GemmaTokenizer" if is_sentencepiece_available() else None,
292
+ "GemmaTokenizerFast" if is_tokenizers_available() else None,
293
+ ),
294
+ ),
295
+ ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
296
+ ("glm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
297
+ ("glm4", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
298
+ ("glm4_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
299
+ ("glm4v", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
300
+ ("glm4v_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
301
+ ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
302
+ ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
303
+ ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
304
+ ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
305
+ ("gpt_neox", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
306
+ ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)),
307
+ ("gpt_oss", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
308
+ ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
309
+ ("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)),
310
+ ("granite", ("GPT2Tokenizer", None)),
311
+ ("granitemoe", ("GPT2Tokenizer", None)),
312
+ ("granitemoehybrid", ("GPT2Tokenizer", None)),
313
+ ("granitemoeshared", ("GPT2Tokenizer", None)),
314
+ ("grounding-dino", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
315
+ ("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
316
+ ("helium", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
317
+ ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)),
318
+ ("hubert", ("Wav2Vec2CTCTokenizer", None)),
319
+ ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
320
+ ("idefics", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
321
+ ("idefics2", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
322
+ ("idefics3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
323
+ ("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
324
+ ("instructblipvideo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
325
+ ("internvl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
326
+ (
327
+ "jamba",
328
+ (
329
+ "LlamaTokenizer" if is_sentencepiece_available() else None,
330
+ "LlamaTokenizerFast" if is_tokenizers_available() else None,
331
+ ),
332
+ ),
333
+ ("janus", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
334
+ (
335
+ "jetmoe",
336
+ (
337
+ "LlamaTokenizer" if is_sentencepiece_available() else None,
338
+ "LlamaTokenizerFast" if is_tokenizers_available() else None,
339
+ ),
340
+ ),
341
+ ("jukebox", ("JukeboxTokenizer", None)),
342
+ (
343
+ "kosmos-2",
344
+ (
345
+ "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
346
+ "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
347
+ ),
348
+ ),
349
+ ("kosmos-2.5", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
350
+ ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
351
+ ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
352
+ ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
353
+ ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
354
+ ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
355
+ ("lilt", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
356
+ (
357
+ "llama",
358
+ (
359
+ "LlamaTokenizer" if is_sentencepiece_available() else None,
360
+ "LlamaTokenizerFast" if is_tokenizers_available() else None,
361
+ ),
362
+ ),
363
+ (
364
+ "llama4",
365
+ (
366
+ "LlamaTokenizer" if is_sentencepiece_available() else None,
367
+ "LlamaTokenizerFast" if is_tokenizers_available() else None,
368
+ ),
369
+ ),
370
+ (
371
+ "llama4_text",
372
+ (
373
+ "LlamaTokenizer" if is_sentencepiece_available() else None,
374
+ "LlamaTokenizerFast" if is_tokenizers_available() else None,
375
+ ),
376
+ ),
377
+ ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
378
+ ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
379
+ ("llava_next_video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
380
+ ("llava_onevision", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
381
+ ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
382
+ (
383
+ "longt5",
384
+ (
385
+ "T5Tokenizer" if is_sentencepiece_available() else None,
386
+ "T5TokenizerFast" if is_tokenizers_available() else None,
387
+ ),
388
+ ),
389
+ ("luke", ("LukeTokenizer", None)),
390
+ ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
391
+ ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)),
392
+ ("mamba", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
393
+ ("mamba2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
394
+ ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
395
+ (
396
+ "mbart",
397
+ (
398
+ "MBartTokenizer" if is_sentencepiece_available() else None,
399
+ "MBartTokenizerFast" if is_tokenizers_available() else None,
400
+ ),
401
+ ),
402
+ (
403
+ "mbart50",
404
+ (
405
+ "MBart50Tokenizer" if is_sentencepiece_available() else None,
406
+ "MBart50TokenizerFast" if is_tokenizers_available() else None,
407
+ ),
408
+ ),
409
+ ("mega", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
410
+ ("megatron-bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
411
+ (
412
+ "metaclip_2",
413
+ (
414
+ "XLMRobertaTokenizer",
415
+ "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
416
+ ),
417
+ ),
418
+ ("mgp-str", ("MgpstrTokenizer", None)),
419
+ (
420
+ "minimax",
421
+ (
422
+ "GPT2Tokenizer" if is_sentencepiece_available() else None,
423
+ "GPT2TokenizerFast" if is_tokenizers_available() else None,
424
+ ),
425
+ ),
426
+ (
427
+ "ministral",
428
+ (
429
+ "MistralCommonTokenizer"
430
+ if is_mistral_common_available()
431
+ else ("LlamaTokenizer" if is_sentencepiece_available() else None),
432
+ "LlamaTokenizerFast" if is_tokenizers_available() and not is_mistral_common_available() else None,
433
+ ),
434
+ ),
435
+ (
436
+ "mistral",
437
+ (
438
+ "MistralCommonTokenizer"
439
+ if is_mistral_common_available()
440
+ else ("LlamaTokenizer" if is_sentencepiece_available() else None),
441
+ "LlamaTokenizerFast" if is_tokenizers_available() and not is_mistral_common_available() else None,
442
+ ),
443
+ ),
444
+ (
445
+ "mistral3",
446
+ (
447
+ "MistralCommonTokenizer"
448
+ if is_mistral_common_available()
449
+ else ("LlamaTokenizer" if is_sentencepiece_available() else None),
450
+ "LlamaTokenizerFast" if is_tokenizers_available() and not is_mistral_common_available() else None,
451
+ ),
452
+ ),
453
+ (
454
+ "mixtral",
455
+ (
456
+ "MistralCommonTokenizer"
457
+ if is_mistral_common_available()
458
+ else ("LlamaTokenizer" if is_sentencepiece_available() else None),
459
+ "LlamaTokenizerFast" if is_tokenizers_available() and not is_mistral_common_available() else None,
460
+ ),
461
+ ),
462
+ ("mllama", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
463
+ ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
464
+ ("mm-grounding-dino", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
465
+ ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
466
+ ("modernbert", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
467
+ ("moonshine", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
468
+ ("moshi", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
469
+ ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
470
+ ("mpt", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
471
+ ("mra", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
472
+ (
473
+ "mt5",
474
+ (
475
+ "MT5Tokenizer" if is_sentencepiece_available() else None,
476
+ "MT5TokenizerFast" if is_tokenizers_available() else None,
477
+ ),
478
+ ),
479
+ ("musicgen", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
480
+ ("musicgen_melody", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
481
+ ("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)),
482
+ ("myt5", ("MyT5Tokenizer", None)),
483
+ ("nemotron", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
484
+ ("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
485
+ (
486
+ "nllb",
487
+ (
488
+ "NllbTokenizer" if is_sentencepiece_available() else None,
489
+ "NllbTokenizerFast" if is_tokenizers_available() else None,
490
+ ),
491
+ ),
492
+ (
493
+ "nllb-moe",
494
+ (
495
+ "NllbTokenizer" if is_sentencepiece_available() else None,
496
+ "NllbTokenizerFast" if is_tokenizers_available() else None,
497
+ ),
498
+ ),
499
+ (
500
+ "nystromformer",
501
+ (
502
+ "AlbertTokenizer" if is_sentencepiece_available() else None,
503
+ "AlbertTokenizerFast" if is_tokenizers_available() else None,
504
+ ),
505
+ ),
506
+ ("olmo", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
507
+ ("olmo2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
508
+ ("olmo3", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)),
509
+ ("olmoe", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
510
+ (
511
+ "omdet-turbo",
512
+ ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None),
513
+ ),
514
+ ("oneformer", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
515
+ (
516
+ "openai-gpt",
517
+ ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None),
518
+ ),
519
+ ("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
520
+ ("owlv2", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
521
+ ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
522
+ ("paligemma", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
523
+ ("parakeet", ("ParakeetCTCTokenizer", None)),
524
+ (
525
+ "pegasus",
526
+ (
527
+ "PegasusTokenizer" if is_sentencepiece_available() else None,
528
+ "PegasusTokenizerFast" if is_tokenizers_available() else None,
529
+ ),
530
+ ),
531
+ (
532
+ "pegasus_x",
533
+ (
534
+ "PegasusTokenizer" if is_sentencepiece_available() else None,
535
+ "PegasusTokenizerFast" if is_tokenizers_available() else None,
536
+ ),
537
+ ),
538
+ (
539
+ "perceiver",
540
+ (
541
+ "PerceiverTokenizer",
542
+ None,
543
+ ),
544
+ ),
545
+ (
546
+ "persimmon",
547
+ (
548
+ "LlamaTokenizer" if is_sentencepiece_available() else None,
549
+ "LlamaTokenizerFast" if is_tokenizers_available() else None,
550
+ ),
551
+ ),
552
+ ("phi", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
553
+ ("phi3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
554
+ ("phimoe", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
555
+ ("phobert", ("PhobertTokenizer", None)),
556
+ ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
557
+ (
558
+ "pixtral",
559
+ (
560
+ None,
561
+ "MistralCommonTokenizer"
562
+ if is_mistral_common_available()
563
+ else ("PreTrainedTokenizerFast" if is_tokenizers_available() else None),
564
+ ),
565
+ ),
566
+ ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
567
+ ("prophetnet", ("ProphetNetTokenizer", None)),
568
+ ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
569
+ (
570
+ "qwen2",
571
+ (
572
+ "Qwen2Tokenizer",
573
+ "Qwen2TokenizerFast" if is_tokenizers_available() else None,
574
+ ),
575
+ ),
576
+ ("qwen2_5_omni", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
577
+ ("qwen2_5_vl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
578
+ ("qwen2_audio", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
579
+ (
580
+ "qwen2_moe",
581
+ (
582
+ "Qwen2Tokenizer",
583
+ "Qwen2TokenizerFast" if is_tokenizers_available() else None,
584
+ ),
585
+ ),
586
+ ("qwen2_vl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
587
+ (
588
+ "qwen3",
589
+ (
590
+ "Qwen2Tokenizer",
591
+ "Qwen2TokenizerFast" if is_tokenizers_available() else None,
592
+ ),
593
+ ),
594
+ (
595
+ "qwen3_moe",
596
+ (
597
+ "Qwen2Tokenizer",
598
+ "Qwen2TokenizerFast" if is_tokenizers_available() else None,
599
+ ),
600
+ ),
601
+ (
602
+ "qwen3_next",
603
+ (
604
+ "Qwen2Tokenizer",
605
+ "Qwen2TokenizerFast" if is_tokenizers_available() else None,
606
+ ),
607
+ ),
608
+ ("qwen3_omni_moe", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
609
+ ("qwen3_vl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
610
+ ("qwen3_vl_moe", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
611
+ ("rag", ("RagTokenizer", None)),
612
+ ("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)),
613
+ (
614
+ "recurrent_gemma",
615
+ (
616
+ "GemmaTokenizer" if is_sentencepiece_available() else None,
617
+ "GemmaTokenizerFast" if is_tokenizers_available() else None,
618
+ ),
619
+ ),
620
+ (
621
+ "reformer",
622
+ (
623
+ "ReformerTokenizer" if is_sentencepiece_available() else None,
624
+ "ReformerTokenizerFast" if is_tokenizers_available() else None,
625
+ ),
626
+ ),
627
+ (
628
+ "rembert",
629
+ (
630
+ "RemBertTokenizer" if is_sentencepiece_available() else None,
631
+ "RemBertTokenizerFast" if is_tokenizers_available() else None,
632
+ ),
633
+ ),
634
+ ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)),
635
+ ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
636
+ (
637
+ "roberta-prelayernorm",
638
+ ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None),
639
+ ),
640
+ ("roc_bert", ("RoCBertTokenizer", None)),
641
+ ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)),
642
+ ("rwkv", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
643
+ (
644
+ "seamless_m4t",
645
+ (
646
+ "SeamlessM4TTokenizer" if is_sentencepiece_available() else None,
647
+ "SeamlessM4TTokenizerFast" if is_tokenizers_available() else None,
648
+ ),
649
+ ),
650
+ (
651
+ "seamless_m4t_v2",
652
+ (
653
+ "SeamlessM4TTokenizer" if is_sentencepiece_available() else None,
654
+ "SeamlessM4TTokenizerFast" if is_tokenizers_available() else None,
655
+ ),
656
+ ),
657
+ (
658
+ "shieldgemma2",
659
+ (
660
+ "GemmaTokenizer" if is_sentencepiece_available() else None,
661
+ "GemmaTokenizerFast" if is_tokenizers_available() else None,
662
+ ),
663
+ ),
664
+ ("siglip", ("SiglipTokenizer" if is_sentencepiece_available() else None, None)),
665
+ (
666
+ "siglip2",
667
+ (
668
+ "GemmaTokenizer" if is_sentencepiece_available() else None,
669
+ "GemmaTokenizerFast" if is_tokenizers_available() else None,
670
+ ),
671
+ ),
672
+ ("smollm3", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
673
+ ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
674
+ ("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
675
+ ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)),
676
+ ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")),
677
+ (
678
+ "squeezebert",
679
+ ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None),
680
+ ),
681
+ ("stablelm", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
682
+ ("starcoder2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
683
+ (
684
+ "switch_transformers",
685
+ (
686
+ "T5Tokenizer" if is_sentencepiece_available() else None,
687
+ "T5TokenizerFast" if is_tokenizers_available() else None,
688
+ ),
689
+ ),
690
+ (
691
+ "t5",
692
+ (
693
+ "T5Tokenizer" if is_sentencepiece_available() else None,
694
+ "T5TokenizerFast" if is_tokenizers_available() else None,
695
+ ),
696
+ ),
697
+ (
698
+ "t5gemma",
699
+ (
700
+ "GemmaTokenizer" if is_sentencepiece_available() else None,
701
+ "GemmaTokenizerFast" if is_tokenizers_available() else None,
702
+ ),
703
+ ),
704
+ ("tapas", ("TapasTokenizer", None)),
705
+ ("tapex", ("TapexTokenizer", None)),
706
+ ("transfo-xl", ("TransfoXLTokenizer", None)),
707
+ ("tvp", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
708
+ (
709
+ "udop",
710
+ (
711
+ "UdopTokenizer" if is_sentencepiece_available() else None,
712
+ "UdopTokenizerFast" if is_tokenizers_available() else None,
713
+ ),
714
+ ),
715
+ (
716
+ "umt5",
717
+ (
718
+ "T5Tokenizer" if is_sentencepiece_available() else None,
719
+ "T5TokenizerFast" if is_tokenizers_available() else None,
720
+ ),
721
+ ),
722
+ ("video_llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
723
+ ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
724
+ ("vipllava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
725
+ ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
726
+ ("vits", ("VitsTokenizer", None)),
727
+ (
728
+ "voxtral",
729
+ (
730
+ "MistralCommonTokenizer" if is_mistral_common_available() else None,
731
+ "PreTrainedTokenizerFast" if is_tokenizers_available() and not is_mistral_common_available() else None,
732
+ ),
733
+ ),
734
+ ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
735
+ ("wav2vec2-bert", ("Wav2Vec2CTCTokenizer", None)),
736
+ ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)),
737
+ ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
738
+ ("whisper", ("WhisperTokenizer", "WhisperTokenizerFast" if is_tokenizers_available() else None)),
739
+ ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
740
+ (
741
+ "xglm",
742
+ (
743
+ "XGLMTokenizer" if is_sentencepiece_available() else None,
744
+ "XGLMTokenizerFast" if is_tokenizers_available() else None,
745
+ ),
746
+ ),
747
+ ("xlm", ("XLMTokenizer", None)),
748
+ ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
749
+ (
750
+ "xlm-roberta",
751
+ (
752
+ "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
753
+ "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
754
+ ),
755
+ ),
756
+ (
757
+ "xlm-roberta-xl",
758
+ (
759
+ "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
760
+ "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
761
+ ),
762
+ ),
763
+ (
764
+ "xlnet",
765
+ (
766
+ "XLNetTokenizer" if is_sentencepiece_available() else None,
767
+ "XLNetTokenizerFast" if is_tokenizers_available() else None,
768
+ ),
769
+ ),
770
+ ("xlstm", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
771
+ (
772
+ "xmod",
773
+ (
774
+ "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
775
+ "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
776
+ ),
777
+ ),
778
+ (
779
+ "yoso",
780
+ (
781
+ "AlbertTokenizer" if is_sentencepiece_available() else None,
782
+ "AlbertTokenizerFast" if is_tokenizers_available() else None,
783
+ ),
784
+ ),
785
+ (
786
+ "zamba",
787
+ (
788
+ "LlamaTokenizer" if is_sentencepiece_available() else None,
789
+ "LlamaTokenizerFast" if is_tokenizers_available() else None,
790
+ ),
791
+ ),
792
+ (
793
+ "zamba2",
794
+ (
795
+ "LlamaTokenizer" if is_sentencepiece_available() else None,
796
+ "LlamaTokenizerFast" if is_tokenizers_available() else None,
797
+ ),
798
+ ),
799
+ ]
800
+ )
801
+
802
+ TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)
803
+
804
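+ # Reverse of `CONFIG_MAPPING_NAMES`: maps a config class name back to its model type (e.g. "BertConfig" -> "bert").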
+ CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}
805
+
806
+
807
+ def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]:
808
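+ # Resolution order: the `PreTrainedTokenizerFast` shortcut, the static name mapping, tokenizers registered at
+ # runtime, and finally the main `transformers` namespace (which may hold a dummy class that raises an informative
+ # import error when a dependency is missing).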
+ if class_name == "PreTrainedTokenizerFast":
809
+ return PreTrainedTokenizerFast
810
+
811
+ for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items():
812
+ if class_name in tokenizers:
813
+ module_name = model_type_to_module_name(module_name)
814
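+ # `MistralCommonTokenizer` lives in the top-level `transformers.tokenization_mistral_common` module rather than
+ # under `transformers.models`, so it needs a dedicated import path.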
+ if module_name in ["mistral", "mixtral", "ministral"] and class_name == "MistralCommonTokenizer":
815
+ module = importlib.import_module(".tokenization_mistral_common", "transformers")
816
+ else:
817
+ module = importlib.import_module(f".{module_name}", "transformers.models")
818
+ try:
819
+ return getattr(module, class_name)
820
+ except AttributeError:
821
+ continue
822
+
823
+ for tokenizers in TOKENIZER_MAPPING._extra_content.values():
824
+ for tokenizer in tokenizers:
825
+ if getattr(tokenizer, "__name__", None) == class_name:
826
+ return tokenizer
827
+
828
+ # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
829
+ # init and we return the proper dummy to get an appropriate error message.
830
+ main_module = importlib.import_module("transformers")
831
+ if hasattr(main_module, class_name):
832
+ return getattr(main_module, class_name)
833
+
834
+ return None
835
+
836
+
837
+ def get_tokenizer_config(
838
+ pretrained_model_name_or_path: Union[str, os.PathLike[str]],
839
+ cache_dir: Optional[Union[str, os.PathLike[str]]] = None,
840
+ force_download: bool = False,
841
+ resume_download: Optional[bool] = None,
842
+ proxies: Optional[dict[str, str]] = None,
843
+ token: Optional[Union[bool, str]] = None,
844
+ revision: Optional[str] = None,
845
+ local_files_only: bool = False,
846
+ subfolder: str = "",
847
+ **kwargs,
848
+ ) -> dict[str, Any]:
849
+ """
850
+ Loads the tokenizer configuration from a pretrained model tokenizer configuration.
851
+
852
+ Args:
853
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
854
+ This can be either:
855
+
856
+ - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
857
+ huggingface.co.
858
+ - a path to a *directory* containing a configuration file saved using the
859
+ [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
860
+
861
+ cache_dir (`str` or `os.PathLike`, *optional*):
862
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
863
+ cache should not be used.
864
+ force_download (`bool`, *optional*, defaults to `False`):
865
+ Whether or not to force to (re-)download the configuration files and override the cached versions if they
866
+ exist.
867
+ resume_download:
868
+ Deprecated and ignored. All downloads are now resumed by default when possible.
869
+ Will be removed in v5 of Transformers.
870
+ proxies (`dict[str, str]`, *optional*):
871
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
872
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
873
+ token (`str` or *bool*, *optional*):
874
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
875
+ when running `hf auth login` (stored in `~/.huggingface`).
876
+ revision (`str`, *optional*, defaults to `"main"`):
877
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
878
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
879
+ identifier allowed by git.
880
+ local_files_only (`bool`, *optional*, defaults to `False`):
881
+ If `True`, will only try to load the tokenizer configuration from local files.
882
+ subfolder (`str`, *optional*, defaults to `""`):
883
+ In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can
884
+ specify the folder name here.
885
+
886
+ <Tip>
887
+
888
+ Passing `token=True` is required when you want to use a private model.
889
+
890
+ </Tip>
891
+
892
+ Returns:
893
+ `dict`: The configuration of the tokenizer.
894
+
895
+ Examples:
896
+
897
+ ```python
898
+ # Download configuration from huggingface.co and cache.
899
+ tokenizer_config = get_tokenizer_config("google-bert/bert-base-uncased")
900
+ # This model does not have a tokenizer config so the result will be an empty dict.
901
+ tokenizer_config = get_tokenizer_config("FacebookAI/xlm-roberta-base")
902
+
903
+ # Save a pretrained tokenizer locally and you can reload its config
904
+ from transformers import AutoTokenizer
905
+
906
+ tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
907
+ tokenizer.save_pretrained("tokenizer-test")
908
+ tokenizer_config = get_tokenizer_config("tokenizer-test")
909
+ ```"""
910
+ use_auth_token = kwargs.pop("use_auth_token", None)
911
+ if use_auth_token is not None:
912
+ warnings.warn(
913
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
914
+ FutureWarning,
915
+ )
916
+ if token is not None:
917
+ raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
918
+ token = use_auth_token
919
+
920
+ commit_hash = kwargs.get("_commit_hash")
921
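+ # Fetch failures are deliberately swallowed here (see the `_raise_exceptions_*` flags below): a repo without a
+ # tokenizer config simply yields an empty dict.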
+ resolved_config_file = cached_file(
922
+ pretrained_model_name_or_path,
923
+ TOKENIZER_CONFIG_FILE,
924
+ cache_dir=cache_dir,
925
+ force_download=force_download,
926
+ resume_download=resume_download,
927
+ proxies=proxies,
928
+ token=token,
929
+ revision=revision,
930
+ local_files_only=local_files_only,
931
+ subfolder=subfolder,
932
+ _raise_exceptions_for_gated_repo=False,
933
+ _raise_exceptions_for_missing_entries=False,
934
+ _raise_exceptions_for_connection_errors=False,
935
+ _commit_hash=commit_hash,
936
+ )
937
+ if resolved_config_file is None:
938
+ logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")
939
+ return {}
940
+ commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
941
+
942
+ with open(resolved_config_file, encoding="utf-8") as reader:
943
+ result = json.load(reader)
944
+ result["_commit_hash"] = commit_hash
945
+ return result
946
+
947
+
948
+ class AutoTokenizer:
949
+ r"""
950
+ This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
951
+ created with the [`AutoTokenizer.from_pretrained`] class method.
952
+
953
+ This class cannot be instantiated directly using `__init__()` (throws an error).
954
+ """
955
+
956
+ def __init__(self):
957
+ raise OSError(
958
+ "AutoTokenizer is designed to be instantiated "
959
+ "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
960
+ )
961
+
962
+ @classmethod
963
+ @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES)
964
+ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
965
+ r"""
966
+ Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.
967
+
968
+ The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
969
+ passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
970
+ falling back to using pattern matching on `pretrained_model_name_or_path`:
971
+
972
+ List options
973
+
974
+ Params:
975
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
976
+ Can be either:
977
+
978
+ - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
979
+ - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
980
+ using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
981
+ - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
982
+ single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
983
+ applicable to all derived classes)
984
+ inputs (additional positional arguments, *optional*):
985
+ Will be passed along to the Tokenizer `__init__()` method.
986
+ config ([`PretrainedConfig`], *optional*):
987
+ The configuration object used to determine the tokenizer class to instantiate.
988
+ cache_dir (`str` or `os.PathLike`, *optional*):
989
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the
990
+ standard cache should not be used.
991
+ force_download (`bool`, *optional*, defaults to `False`):
992
+ Whether or not to force the (re-)download of the model weights and configuration files and override the
993
+ cached versions if they exist.
994
+ resume_download:
995
+ Deprecated and ignored. All downloads are now resumed by default when possible.
996
+ Will be removed in v5 of Transformers.
997
+ proxies (`dict[str, str]`, *optional*):
998
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
999
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
1000
+ revision (`str`, *optional*, defaults to `"main"`):
1001
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
1002
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
1003
+ identifier allowed by git.
1004
+ subfolder (`str`, *optional*):
1005
+ In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
1006
+ facebook/rag-token-base), specify it here.
1007
+ use_fast (`bool`, *optional*, defaults to `True`):
1008
+ Use a [fast Rust-based tokenizer](https://huggingface.co/docs/tokenizers/index) if it is supported for
1009
+ a given model. If a fast tokenizer is not available for a given model, a normal Python-based tokenizer
1010
+ is returned instead.
1011
+ tokenizer_type (`str`, *optional*):
1012
+ Tokenizer type to be loaded.
1013
+ trust_remote_code (`bool`, *optional*, defaults to `False`):
1014
+ Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
1015
+ should only be set to `True` for repositories you trust and in which you have read the code, as it will
1016
+ execute code present on the Hub on your local machine.
1017
+ kwargs (additional keyword arguments, *optional*):
1018
+ Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
1019
+ `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
1020
+ `additional_special_tokens`. See parameters in the `__init__()` for more details.
1021
+
1022
+ Examples:
1023
+
1024
+ ```python
1025
+ >>> from transformers import AutoTokenizer
1026
+
1027
+ >>> # Download vocabulary from huggingface.co and cache.
1028
+ >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
1029
+
1030
+ >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
1031
+ >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
1032
+
1033
+ >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
1034
+ >>> # tokenizer = AutoTokenizer.from_pretrained("./test/saved_model/")
1035
+
1036
+ >>> # Download vocabulary from huggingface.co and define model-specific arguments
1037
+ >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True)
1038
+ ```"""
1039
+ use_auth_token = kwargs.pop("use_auth_token", None)
1040
+ if use_auth_token is not None:
1041
+ warnings.warn(
1042
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
1043
+ FutureWarning,
1044
+ )
1045
+ if kwargs.get("token") is not None:
1046
+ raise ValueError(
1047
+ "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
1048
+ )
1049
+ kwargs["token"] = use_auth_token
1050
+
1051
+ config = kwargs.pop("config", None)
1052
+ kwargs["_from_auto"] = True
1053
+
1054
+ use_fast = kwargs.pop("use_fast", True)
1055
+ tokenizer_type = kwargs.pop("tokenizer_type", None)
1056
+ trust_remote_code = kwargs.pop("trust_remote_code", None)
1057
+ gguf_file = kwargs.get("gguf_file")
1058
+
1059
+ # First, let's see whether the tokenizer_type is passed so that we can leverage it
1060
+ if tokenizer_type is not None:
1061
+ tokenizer_class = None
1062
+ tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)
1063
+
1064
+ if tokenizer_class_tuple is None:
1065
+ raise ValueError(
1066
+ f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of "
1067
+ f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES)}."
1068
+ )
1069
+
1070
+ tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple
1071
+
1072
+ if use_fast:
1073
+ if tokenizer_fast_class_name is not None:
1074
+ tokenizer_class = tokenizer_class_from_name(tokenizer_fast_class_name)
1075
+ else:
1076
+ logger.warning(
1077
+ "`use_fast` is set to `True` but the tokenizer class does not have a fast version. "
1078
+ " Falling back to the slow version."
1079
+ )
1080
+ if tokenizer_class is None:
1081
+ tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)
1082
+
1083
+ if tokenizer_class is None:
1084
+ raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")
1085
+
1086
+ return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
1087
+
1088
+ # Next, let's try to use the tokenizer_config file to get the tokenizer class.
1089
+ tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
1090
+ if "_commit_hash" in tokenizer_config:
1091
+ kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]
1092
+ config_tokenizer_class = tokenizer_config.get("tokenizer_class")
1093
+ tokenizer_auto_map = None
1094
+ if "auto_map" in tokenizer_config:
1095
+ if isinstance(tokenizer_config["auto_map"], (tuple, list)):
1096
+ # Legacy format for dynamic tokenizers
1097
+ tokenizer_auto_map = tokenizer_config["auto_map"]
1098
+ else:
1099
+ tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None)
1100
+
1101
+ # If that did not work, let's try to use the config.
1102
+ if config_tokenizer_class is None:
1103
+ if not isinstance(config, PretrainedConfig):
1104
+ if gguf_file:
1105
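+ # GGUF checkpoints embed the model config in the checkpoint file itself, so parse it from there rather
+ # than from a separate config.json.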
+ gguf_path = cached_file(pretrained_model_name_or_path, gguf_file, **kwargs)
1106
+ config_dict = load_gguf_checkpoint(gguf_path, return_tensors=False)["config"]
1107
+ config = AutoConfig.for_model(**config_dict)
1108
+ else:
1109
+ config = AutoConfig.from_pretrained(
1110
+ pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
1111
+ )
1112
+ config_tokenizer_class = config.tokenizer_class
1113
+ if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map:
1114
+ tokenizer_auto_map = config.auto_map["AutoTokenizer"]
1115
+
1116
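+ # A tokenizer that defines an `auto_map` ships its own code on the Hub; `resolve_trust_remote_code` below
+ # decides whether to load that remote code or a class bundled with the library.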
+ has_remote_code = tokenizer_auto_map is not None
1117
+ has_local_code = type(config) in TOKENIZER_MAPPING or (
1118
+ config_tokenizer_class is not None
1119
+ and (
1120
+ tokenizer_class_from_name(config_tokenizer_class) is not None
1121
+ or tokenizer_class_from_name(config_tokenizer_class + "Fast") is not None
1122
+ )
1123
+ )
1124
+ if has_remote_code:
1125
+ if use_fast and tokenizer_auto_map[1] is not None:
1126
+ class_ref = tokenizer_auto_map[1]
1127
+ else:
1128
+ class_ref = tokenizer_auto_map[0]
1129
+ if "--" in class_ref:
1130
+ upstream_repo = class_ref.split("--")[0]
1131
+ else:
1132
+ upstream_repo = None
1133
+ trust_remote_code = resolve_trust_remote_code(
1134
+ trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code, upstream_repo
1135
+ )
1136
+
1137
+ if has_remote_code and trust_remote_code:
1138
+ tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
1139
+ _ = kwargs.pop("code_revision", None)
1140
+ tokenizer_class.register_for_auto_class()
1141
+ return tokenizer_class.from_pretrained(
1142
+ pretrained_model_name_or_path, *inputs, trust_remote_code=trust_remote_code, **kwargs
1143
+ )
1144
+ elif config_tokenizer_class is not None:
1145
+ tokenizer_class = None
1146
+ if use_fast and not config_tokenizer_class.endswith("Fast"):
1147
+ tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
1148
+ tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
1149
+ if tokenizer_class is None:
1150
+ tokenizer_class_candidate = config_tokenizer_class
1151
+ tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
1152
+ if tokenizer_class is None:
1153
+ raise ValueError(
1154
+ f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
1155
+ )
1156
+ return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
1157
+
1158
+ # Otherwise we have to be creative.
1159
+ # if model is an encoder decoder, the encoder tokenizer class is used by default
1160
+ if isinstance(config, EncoderDecoderConfig):
1161
+ if type(config.decoder) is not type(config.encoder):
1162
+ logger.warning(
1163
+ f"The encoder model config class: {config.encoder.__class__} is different from the decoder model "
1164
+ f"config class: {config.decoder.__class__}. It is not recommended to use the "
1165
+ "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder "
1166
+ "specific tokenizer classes."
1167
+ )
1168
+ config = config.encoder
1169
+
1170
+ model_type = config_class_to_model_type(type(config).__name__)
1171
+ if model_type is not None:
1172
+ tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
1173
+
1174
+ if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
1175
+ return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
1176
+ else:
1177
+ if tokenizer_class_py is not None:
1178
+ return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
1179
+ else:
1180
+ raise ValueError(
1181
+ "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
1182
+ "in order to use this tokenizer."
1183
+ )
1184
+
1185
+ raise ValueError(
1186
+ f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"
1187
+ f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING)}."
1188
+ )
1189
+
1190
+ @staticmethod
1191
+ def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None, exist_ok=False):
1192
+ """
1193
+ Register a new tokenizer in this mapping.
1194
+
1195
+
1196
+ Args:
1197
+ config_class ([`PretrainedConfig`]):
1198
+ The configuration corresponding to the model to register.
1199
+ slow_tokenizer_class ([`PretrainedTokenizer`], *optional*):
1200
+ The slow tokenizer to register.
1201
+ fast_tokenizer_class ([`PretrainedTokenizerFast`], *optional*):
1202
+ The fast tokenizer to register.
1203
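+ 
+ Examples:
+ 
+ A minimal sketch of the registration flow; `CustomConfig`, `CustomTokenizer` and `CustomTokenizerFast` are
+ hypothetical user-defined classes, not classes shipped with the library.
+ 
+ ```python
+ >>> from transformers import AutoConfig, AutoTokenizer
+ 
+ >>> AutoConfig.register("custom-model", CustomConfig)
+ >>> AutoTokenizer.register(
+ ...     CustomConfig, slow_tokenizer_class=CustomTokenizer, fast_tokenizer_class=CustomTokenizerFast
+ ... )
+ ```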
+ """
1204
+ if slow_tokenizer_class is None and fast_tokenizer_class is None:
1205
+ raise ValueError("You need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_class")
1206
+ if slow_tokenizer_class is not None and issubclass(slow_tokenizer_class, PreTrainedTokenizerFast):
1207
+ raise ValueError("You passed a fast tokenizer in the `slow_tokenizer_class`.")
1208
+ if fast_tokenizer_class is not None and issubclass(fast_tokenizer_class, PreTrainedTokenizer):
1209
+ raise ValueError("You passed a slow tokenizer in the `fast_tokenizer_class`.")
1210
+
1211
+ if (
1212
+ slow_tokenizer_class is not None
1213
+ and fast_tokenizer_class is not None
1214
+ and issubclass(fast_tokenizer_class, PreTrainedTokenizerFast)
1215
+ and fast_tokenizer_class.slow_tokenizer_class != slow_tokenizer_class
1216
+ ):
1217
+ raise ValueError(
1218
+ "The fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not "
1219
+ "consistent with the slow tokenizer class you passed (fast tokenizer has "
1220
+ f"{fast_tokenizer_class.slow_tokenizer_class} and you passed {slow_tokenizer_class}. Fix one of those "
1221
+ "so they match!"
1222
+ )
1223
+
1224
+ # Avoid resetting a set slow/fast tokenizer if we are passing just the other ones.
1225
+ if config_class in TOKENIZER_MAPPING._extra_content:
1226
+ existing_slow, existing_fast = TOKENIZER_MAPPING[config_class]
1227
+ if slow_tokenizer_class is None:
1228
+ slow_tokenizer_class = existing_slow
1229
+ if fast_tokenizer_class is None:
1230
+ fast_tokenizer_class = existing_fast
1231
+
1232
+ TOKENIZER_MAPPING.register(config_class, (slow_tokenizer_class, fast_tokenizer_class), exist_ok=exist_ok)
1233
+
1234
+
1235
+ __all__ = ["TOKENIZER_MAPPING", "AutoTokenizer"]
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/auto/video_processing_auto.py ADDED
@@ -0,0 +1,393 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """AutoVideoProcessor class."""
16
+
17
+ import importlib
18
+ import json
19
+ import os
20
+ import warnings
21
+ from collections import OrderedDict
22
+ from typing import TYPE_CHECKING, Optional, Union
23
+
24
+ # Build the list of all video processors
25
+ from ...configuration_utils import PretrainedConfig
26
+ from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
27
+ from ...utils import CONFIG_NAME, VIDEO_PROCESSOR_NAME, cached_file, is_torchvision_available, logging
28
+ from ...utils.import_utils import requires
29
+ from ...video_processing_utils import BaseVideoProcessor
30
+ from .auto_factory import _LazyAutoMapping
31
+ from .configuration_auto import (
32
+ CONFIG_MAPPING_NAMES,
33
+ AutoConfig,
34
+ model_type_to_module_name,
35
+ replace_list_option_in_docstrings,
36
+ )
37
+
38
+
39
+ logger = logging.get_logger(__name__)
40
+
41
+
42
+ if TYPE_CHECKING:
43
+ # This significantly improves completion suggestion performance when
44
+ # the transformers package is used with Microsoft's Pylance language server.
45
+ VIDEO_PROCESSOR_MAPPING_NAMES: OrderedDict[str, tuple[Optional[str], Optional[str]]] = OrderedDict()
46
+ else:
47
+ VIDEO_PROCESSOR_MAPPING_NAMES = OrderedDict(
48
+ [
49
+ ("glm4v", "Glm4vVideoProcessor"),
50
+ ("instructblip", "InstructBlipVideoVideoProcessor"),
51
+ ("instructblipvideo", "InstructBlipVideoVideoProcessor"),
52
+ ("internvl", "InternVLVideoProcessor"),
53
+ ("llava_next_video", "LlavaNextVideoVideoProcessor"),
54
+ ("llava_onevision", "LlavaOnevisionVideoProcessor"),
55
+ ("perception_lm", "PerceptionLMVideoProcessor"),
56
+ ("qwen2_5_omni", "Qwen2VLVideoProcessor"),
57
+ ("qwen2_5_vl", "Qwen2VLVideoProcessor"),
58
+ ("qwen2_vl", "Qwen2VLVideoProcessor"),
59
+ ("qwen3_omni_moe", "Qwen2VLVideoProcessor"),
60
+ ("qwen3_vl", "Qwen3VLVideoProcessor"),
61
+ ("qwen3_vl_moe", "Qwen3VLVideoProcessor"),
62
+ ("sam2_video", "Sam2VideoVideoProcessor"),
63
+ ("smolvlm", "SmolVLMVideoProcessor"),
64
+ ("video_llava", "VideoLlavaVideoProcessor"),
65
+ ("vjepa2", "VJEPA2VideoProcessor"),
66
+ ]
67
+ )
68
+
69
+ for model_type, video_processors in VIDEO_PROCESSOR_MAPPING_NAMES.items():
70
+ fast_video_processor_class = video_processors
71
+
72
+ # If torchvision is not available, set the video processor class to None
73
+ if not is_torchvision_available():
74
+ fast_video_processor_class = None
75
+
76
+ VIDEO_PROCESSOR_MAPPING_NAMES[model_type] = fast_video_processor_class
77
+
78
+ VIDEO_PROCESSOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, VIDEO_PROCESSOR_MAPPING_NAMES)
79
+
80
+
81
+ def video_processor_class_from_name(class_name: str):
82
+ for module_name, extractors in VIDEO_PROCESSOR_MAPPING_NAMES.items():
83
+ if class_name in extractors:
84
+ module_name = model_type_to_module_name(module_name)
85
+
86
+ module = importlib.import_module(f".{module_name}", "transformers.models")
87
+ try:
88
+ return getattr(module, class_name)
89
+ except AttributeError:
90
+ continue
91
+
92
+ for extractor in VIDEO_PROCESSOR_MAPPING._extra_content.values():
93
+ if getattr(extractor, "__name__", None) == class_name:
94
+ return extractor
95
+
96
+ # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
97
+ # init and we return the proper dummy to get an appropriate error message.
98
+ main_module = importlib.import_module("transformers")
99
+ if hasattr(main_module, class_name):
100
+ return getattr(main_module, class_name)
101
+
102
+ return None
103
+
104
+
105
+ def get_video_processor_config(
106
+ pretrained_model_name_or_path: Union[str, os.PathLike],
107
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
108
+ force_download: bool = False,
109
+ resume_download: Optional[bool] = None,
110
+ proxies: Optional[dict[str, str]] = None,
111
+ token: Optional[Union[bool, str]] = None,
112
+ revision: Optional[str] = None,
113
+ local_files_only: bool = False,
114
+ **kwargs,
115
+ ):
116
+ """
117
+ Loads the video processor configuration from a pretrained model video processor configuration.
118
+
119
+ Args:
120
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
121
+ This can be either:
122
+
123
+ - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
124
+ huggingface.co.
125
+ - a path to a *directory* containing a configuration file saved using the
126
+ [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
127
+
128
+ cache_dir (`str` or `os.PathLike`, *optional*):
129
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
130
+ cache should not be used.
131
+ force_download (`bool`, *optional*, defaults to `False`):
132
+ Whether or not to force to (re-)download the configuration files and override the cached versions if they
133
+ exist.
134
+ resume_download:
135
+ Deprecated and ignored. All downloads are now resumed by default when possible.
136
+ Will be removed in v5 of Transformers.
137
+ proxies (`dict[str, str]`, *optional*):
138
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
139
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
140
+ token (`str` or *bool*, *optional*):
141
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
142
+ when running `hf auth login` (stored in `~/.huggingface`).
143
+ revision (`str`, *optional*, defaults to `"main"`):
144
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
145
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
146
+ identifier allowed by git.
147
+ local_files_only (`bool`, *optional*, defaults to `False`):
148
+ If `True`, will only try to load the video processor configuration from local files.
149
+
150
+ <Tip>
151
+
152
+ Passing `token=True` is required when you want to use a private model.
153
+
154
+ </Tip>
155
+
156
+ Returns:
157
+ `Dict`: The configuration of the video processor.
158
+
159
+ Examples:
160
+
161
+ ```python
162
+ # Download configuration from huggingface.co and cache.
163
+ video_processor_config = get_video_processor_config("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
164
+ # This model does not have a video processor config so the result will be an empty dict.
165
+ video_processor_config = get_video_processor_config("FacebookAI/xlm-roberta-base")
166
+
167
+ # Save a pretrained video processor locally and you can reload its config
168
+ from transformers import AutoVideoProcessor
169
+
170
+ video_processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
171
+ video_processor.save_pretrained("video-processor-test")
172
+ video_processor_config = get_video_processor_config("video-processor-test")
173
+ ```"""
174
+ use_auth_token = kwargs.pop("use_auth_token", None)
175
+ if use_auth_token is not None:
176
+ warnings.warn(
177
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
178
+ FutureWarning,
179
+ )
180
+ if token is not None:
181
+ raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
182
+ token = use_auth_token
183
+
184
+ resolved_config_file = cached_file(
185
+ pretrained_model_name_or_path,
186
+ VIDEO_PROCESSOR_NAME,
187
+ cache_dir=cache_dir,
188
+ force_download=force_download,
189
+ resume_download=resume_download,
190
+ proxies=proxies,
191
+ token=token,
192
+ revision=revision,
193
+ local_files_only=local_files_only,
194
+ )
195
+ if resolved_config_file is None:
196
+ logger.info(
197
+ "Could not locate the video processor configuration file, will try to use the model config instead."
198
+ )
199
+ return {}
200
+
201
+ with open(resolved_config_file, encoding="utf-8") as reader:
202
+ return json.load(reader)
203
+
204
+
205
+ @requires(backends=("vision", "torchvision"))
206
+ class AutoVideoProcessor:
207
+ r"""
208
+ This is a generic video processor class that will be instantiated as one of the video processor classes of the
209
+ library when created with the [`AutoVideoProcessor.from_pretrained`] class method.
210
+
211
+ This class cannot be instantiated directly using `__init__()` (throws an error).
212
+ """
213
+
214
+ def __init__(self):
215
+ raise OSError(
216
+ "AutoVideoProcessor is designed to be instantiated "
217
+ "using the `AutoVideoProcessor.from_pretrained(pretrained_model_name_or_path)` method."
218
+ )
219
+
220
+ @classmethod
221
+ @replace_list_option_in_docstrings(VIDEO_PROCESSOR_MAPPING_NAMES)
222
+ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
223
+ r"""
224
+ Instantiate one of the video processor classes of the library from a pretrained model vocabulary.
225
+
226
+ The video processor class to instantiate is selected based on the `model_type` property of the config object
227
+ (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's
228
+ missing, by falling back to using pattern matching on `pretrained_model_name_or_path`:
229
+
230
+ List options
231
+
232
+ Params:
233
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
234
+ This can be either:
235
+
236
+ - a string, the *model id* of a pretrained video_processor hosted inside a model repo on
237
+ huggingface.co.
238
+ - a path to a *directory* containing a video processor file saved using the
239
+ [`~video_processing_utils.BaseVideoProcessor.save_pretrained`] method, e.g.,
240
+ `./my_model_directory/`.
241
+ - a path or url to a saved video processor JSON *file*, e.g.,
242
+ `./my_model_directory/preprocessor_config.json`.
243
+ cache_dir (`str` or `os.PathLike`, *optional*):
244
+ Path to a directory in which a downloaded pretrained model video processor should be cached if the
245
+ standard cache should not be used.
246
+ force_download (`bool`, *optional*, defaults to `False`):
247
+ Whether or not to force to (re-)download the video processor files and override the cached versions if
248
+ they exist.
249
+ resume_download:
250
+ Deprecated and ignored. All downloads are now resumed by default when possible.
251
+ Will be removed in v5 of Transformers.
252
+ proxies (`dict[str, str]`, *optional*):
253
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
254
+ 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
255
+ token (`str` or *bool*, *optional*):
256
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
257
+ when running `hf auth login` (stored in `~/.huggingface`).
258
+ revision (`str`, *optional*, defaults to `"main"`):
259
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
260
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
261
+ identifier allowed by git.
262
+ return_unused_kwargs (`bool`, *optional*, defaults to `False`):
263
+ If `False`, then this function returns just the final video processor object. If `True`, then this
264
+ functions returns a `Tuple(video_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
265
+ consisting of the key/value pairs whose keys are not video processor attributes: i.e., the part of
266
+ `kwargs` which has not been used to update `video_processor` and is otherwise ignored.
267
+ trust_remote_code (`bool`, *optional*, defaults to `False`):
268
+ Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
269
+ should only be set to `True` for repositories you trust and in which you have read the code, as it will
270
+ execute code present on the Hub on your local machine.
271
+ kwargs (`dict[str, Any]`, *optional*):
272
+ The values in kwargs of any keys which are video processor attributes will be used to override the
273
+ loaded values. Behavior concerning key/value pairs whose keys are *not* video processor attributes is
274
+ controlled by the `return_unused_kwargs` keyword parameter.
275
+
276
+ <Tip>
277
+
278
+ Passing `token=True` is required when you want to use a private model.
279
+
280
+ </Tip>
281
+
282
+ Examples:
283
+
284
+ ```python
285
+ >>> from transformers import AutoVideoProcessor
286
+
287
+ >>> # Download video processor from huggingface.co and cache.
288
+ >>> video_processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
289
+
290
+ >>> # If video processor files are in a directory (e.g. video processor was saved using *save_pretrained('./test/saved_model/')*)
291
+ >>> # video_processor = AutoVideoProcessor.from_pretrained("./test/saved_model/")
292
+ ```"""
293
+ use_auth_token = kwargs.pop("use_auth_token", None)
294
+ if use_auth_token is not None:
295
+ warnings.warn(
296
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
297
+ FutureWarning,
298
+ )
299
+ if kwargs.get("token") is not None:
300
+ raise ValueError(
301
+ "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
302
+ )
303
+ kwargs["token"] = use_auth_token
304
+
305
+ config = kwargs.pop("config", None)
306
+ trust_remote_code = kwargs.pop("trust_remote_code", None)
307
+ kwargs["_from_auto"] = True
308
+
309
+ config_dict, _ = BaseVideoProcessor.get_video_processor_dict(pretrained_model_name_or_path, **kwargs)
310
+ video_processor_class = config_dict.get("video_processor_type", None)
311
+ video_processor_auto_map = None
312
+ if "AutoVideoProcessor" in config_dict.get("auto_map", {}):
313
+ video_processor_auto_map = config_dict["auto_map"]["AutoVideoProcessor"]
314
+
315
+ # If we still don't have the video processor class, check if we're loading from a previous image processor config
316
+ # and if so, infer the video processor class from there.
317
+ if video_processor_class is None and video_processor_auto_map is None:
318
+ image_processor_class = config_dict.pop("image_processor_type", None)
319
+ if image_processor_class is not None:
320
+ video_processor_class_inferred = image_processor_class.replace("ImageProcessor", "VideoProcessor")
321
+
322
+ # Some models have different image processors, e.g. InternVL uses GotOCRImageProcessor
323
+ # We cannot use GotOCRVideoProcessor when falling back for BC and should try to infer from config later on
324
+ if video_processor_class_inferred in VIDEO_PROCESSOR_MAPPING_NAMES.values():
325
+ video_processor_class = video_processor_class_inferred
326
+ if "AutoImageProcessor" in config_dict.get("auto_map", {}):
327
+ image_processor_auto_map = config_dict["auto_map"]["AutoImageProcessor"]
328
+ video_processor_auto_map = image_processor_auto_map.replace("ImageProcessor", "VideoProcessor")
329
+
330
+ # If we don't find the video processor class in the video processor config, let's try the model config.
331
+ if video_processor_class is None and video_processor_auto_map is None:
332
+ if not isinstance(config, PretrainedConfig):
333
+ config = AutoConfig.from_pretrained(
334
+ pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
335
+ )
336
+ # It could be in `config.video_processor_type``
337
+ video_processor_class = getattr(config, "video_processor_type", None)
338
+ if hasattr(config, "auto_map") and "AutoVideoProcessor" in config.auto_map:
339
+ video_processor_auto_map = config.auto_map["AutoVideoProcessor"]
340
+
341
+ if video_processor_class is not None:
342
+ video_processor_class = video_processor_class_from_name(video_processor_class)
343
+
344
+ has_remote_code = video_processor_auto_map is not None
345
+ has_local_code = video_processor_class is not None or type(config) in VIDEO_PROCESSOR_MAPPING
346
+ trust_remote_code = resolve_trust_remote_code(
347
+ trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code
348
+ )
349
+
350
+ if has_remote_code and trust_remote_code:
351
+ class_ref = video_processor_auto_map
352
+ video_processor_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
353
+ _ = kwargs.pop("code_revision", None)
354
+ video_processor_class.register_for_auto_class()
355
+ return video_processor_class.from_dict(config_dict, **kwargs)
356
+ elif video_processor_class is not None:
357
+ return video_processor_class.from_dict(config_dict, **kwargs)
358
+ # Last try: we use the VIDEO_PROCESSOR_MAPPING.
359
+ elif type(config) in VIDEO_PROCESSOR_MAPPING:
360
+ video_processor_class = VIDEO_PROCESSOR_MAPPING[type(config)]
361
+
362
+ if video_processor_class is not None:
363
+ return video_processor_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
364
+ else:
365
+ raise ValueError(
366
+ "This video processor cannot be instantiated. Please make sure you have `torchvision` installed."
367
+ )
368
+
369
+ raise ValueError(
370
+ f"Unrecognized video processor in {pretrained_model_name_or_path}. Should have a "
371
+ f"`video_processor_type` key in its {VIDEO_PROCESSOR_NAME} of {CONFIG_NAME}, or one of the following "
372
+ f"`model_type` keys in its {CONFIG_NAME}: {', '.join(c for c in VIDEO_PROCESSOR_MAPPING_NAMES)}"
373
+ )
374
+
375
+ @staticmethod
376
+ def register(
377
+ config_class,
378
+ video_processor_class,
379
+ exist_ok=False,
380
+ ):
381
+ """
382
+ Register a new video processor for this class.
383
+
384
+ Args:
385
+ config_class ([`PretrainedConfig`]):
386
+ The configuration corresponding to the model to register.
387
+ video_processor_class ([`BaseVideoProcessor`]):
388
+ The video processor to register.
389
+ """
390
+ VIDEO_PROCESSOR_MAPPING.register(config_class, video_processor_class, exist_ok=exist_ok)
391
+
392
+
393
+ __all__ = ["VIDEO_PROCESSOR_MAPPING", "AutoVideoProcessor"]
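The `register` hook at the end of this file is the extension point for pairing a custom config class with a custom video processor. A minimal sketch of the wiring, assuming hypothetical `MyVideoConfig` and `MyVideoProcessor` classes (they are not part of this commit, and `BaseVideoProcessor` assumes the vision/torchvision backends):

```python
# Hypothetical example: MyVideoConfig / MyVideoProcessor are illustrative stand-ins.
from transformers import AutoConfig, AutoVideoProcessor, PretrainedConfig
from transformers.video_processing_utils import BaseVideoProcessor


class MyVideoConfig(PretrainedConfig):
    model_type = "my-video-model"


class MyVideoProcessor(BaseVideoProcessor):
    pass


# Register the config type first, then map it to the processor so that
# AutoVideoProcessor.from_pretrained can resolve it via VIDEO_PROCESSOR_MAPPING.
AutoConfig.register("my-video-model", MyVideoConfig)
AutoVideoProcessor.register(MyVideoConfig, MyVideoProcessor)
```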
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bark/__init__.py ADDED
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_bark import *
+    from .modeling_bark import *
+    from .processing_bark import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
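The `_LazyModule` indirection keeps importing the package cheap: submodule contents are only resolved on first attribute access. A minimal sketch of the same idea using PEP 562 module-level `__getattr__` (illustrative only; `_LazyModule` itself works differently):

```python
# Illustrative lazy-import pattern (PEP 562), not transformers' actual _LazyModule.
import importlib

_LAZY_ATTRS = {"BarkConfig": ".configuration_bark"}  # attribute name -> submodule


def __getattr__(name):
    # Import the submodule on first attribute access, then cache the result
    # in the package globals so subsequent lookups are ordinary attribute hits.
    if name in _LAZY_ATTRS:
        module = importlib.import_module(_LAZY_ATTRS[name], __package__)
        value = getattr(module, name)
        globals()[name] = value
        return value
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```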
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bark/configuration_bark.py ADDED
@@ -0,0 +1,303 @@
+# coding=utf-8
+# Copyright 2023 The Suno AI Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BARK model configuration"""
+
+from typing import Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import add_start_docstrings, logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+BARK_SUBMODELCONFIG_START_DOCSTRING = """
+    This is the configuration class to store the configuration of a [`{model}`]. It is used to instantiate the model
+    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Bark [suno/bark](https://huggingface.co/suno/bark)
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        block_size (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        input_vocab_size (`int`, *optional*, defaults to 10_048):
+            Vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`{model}`]. Defaults to 10_048 but should be chosen carefully with regard
+            to the selected sub-model.
+        output_vocab_size (`int`, *optional*, defaults to 10_048):
+            Output vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented
+            by the `output_ids` when passing forward a [`{model}`]. Defaults to 10_048 but should be chosen carefully
+            with regard to the selected sub-model.
+        num_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the given sub-model.
+        num_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer architecture.
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the hidden representations in the architecture.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        bias (`bool`, *optional*, defaults to `True`):
+            Whether or not to use bias in the linear layers and layer norm layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+"""
+
+
+class BarkSubModelConfig(PretrainedConfig):
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    attribute_map = {
+        "num_attention_heads": "num_heads",
+        "num_hidden_layers": "num_layers",
+        "vocab_size": "input_vocab_size",
+        "window_size": "block_size",
+    }
+
+    def __init__(
+        self,
+        block_size=1024,
+        input_vocab_size=10_048,
+        output_vocab_size=10_048,
+        num_layers=12,
+        num_heads=12,
+        hidden_size=768,
+        dropout=0.0,
+        bias=True,  # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
+        initializer_range=0.02,
+        use_cache=True,
+        **kwargs,
+    ):
+        self.block_size = block_size
+        self.input_vocab_size = input_vocab_size
+        self.output_vocab_size = output_vocab_size
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.bias = bias
+        self.use_cache = use_cache
+        self.initializer_range = initializer_range
+
+        super().__init__(**kwargs)
+
+
+@add_start_docstrings(
+    BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkSemanticConfig", model="BarkSemanticModel"),
+    """
+    Example:
+
+    ```python
+    >>> from transformers import BarkSemanticConfig, BarkSemanticModel
+
+    >>> # Initializing a Bark sub-module style configuration
+    >>> configuration = BarkSemanticConfig()
+
+    >>> # Initializing a model (with random weights) from the suno/bark style configuration
+    >>> model = BarkSemanticModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```""",
+)
+class BarkSemanticConfig(BarkSubModelConfig):
+    model_type = "semantic"
+    base_config_key = "semantic_config"
+
+
+@add_start_docstrings(
+    BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkCoarseConfig", model="BarkCoarseModel"),
+    """
+    Example:
+
+    ```python
+    >>> from transformers import BarkCoarseConfig, BarkCoarseModel
+
+    >>> # Initializing a Bark sub-module style configuration
+    >>> configuration = BarkCoarseConfig()
+
+    >>> # Initializing a model (with random weights) from the suno/bark style configuration
+    >>> model = BarkCoarseModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```""",
+)
+class BarkCoarseConfig(BarkSubModelConfig):
+    model_type = "coarse_acoustics"
+    base_config_key = "coarse_acoustics_config"
+
+
+@add_start_docstrings(
+    BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkFineConfig", model="BarkFineModel"),
+    """
+    n_codes_total (`int`, *optional*, defaults to 8):
+        The total number of audio codebooks predicted. Used in the fine acoustics sub-model.
+    n_codes_given (`int`, *optional*, defaults to 1):
+        The number of audio codebooks predicted in the coarse acoustics sub-model. Used in the acoustics
+        sub-models.
+    Example:
+
+    ```python
+    >>> from transformers import BarkFineConfig, BarkFineModel
+
+    >>> # Initializing a Bark sub-module style configuration
+    >>> configuration = BarkFineConfig()
+
+    >>> # Initializing a model (with random weights) from the suno/bark style configuration
+    >>> model = BarkFineModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```""",
+)
+class BarkFineConfig(BarkSubModelConfig):
+    model_type = "fine_acoustics"
+    base_config_key = "fine_acoustics_config"
+
+    def __init__(self, tie_word_embeddings=True, n_codes_total=8, n_codes_given=1, **kwargs):
+        self.n_codes_total = n_codes_total
+        self.n_codes_given = n_codes_given
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+class BarkConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`BarkModel`]. It is used to instantiate a Bark
+    model according to the specified sub-model configurations, defining the model architecture.
+
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Bark
+    [suno/bark](https://huggingface.co/suno/bark) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        semantic_config ([`BarkSemanticConfig`], *optional*):
+            Configuration of the underlying semantic sub-model.
+        coarse_acoustics_config ([`BarkCoarseConfig`], *optional*):
+            Configuration of the underlying coarse acoustics sub-model.
+        fine_acoustics_config ([`BarkFineConfig`], *optional*):
+            Configuration of the underlying fine acoustics sub-model.
+        codec_config ([`AutoConfig`], *optional*):
+            Configuration of the underlying codec sub-model.
+
+    Example:
+
+    ```python
+    >>> from transformers import (
+    ...     BarkSemanticConfig,
+    ...     BarkCoarseConfig,
+    ...     BarkFineConfig,
+    ...     BarkModel,
+    ...     BarkConfig,
+    ...     AutoConfig,
+    ... )
+
+    >>> # Initializing Bark sub-modules configurations.
+    >>> semantic_config = BarkSemanticConfig()
+    >>> coarse_acoustics_config = BarkCoarseConfig()
+    >>> fine_acoustics_config = BarkFineConfig()
+    >>> codec_config = AutoConfig.from_pretrained("facebook/encodec_24khz")
+
+
+    >>> # Initializing a Bark module style configuration
+    >>> configuration = BarkConfig.from_sub_model_configs(
+    ...     semantic_config, coarse_acoustics_config, fine_acoustics_config, codec_config
+    ... )
+
+    >>> # Initializing a model (with random weights)
+    >>> model = BarkModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "bark"
+    sub_configs = {
+        "semantic_config": BarkSemanticConfig,
+        "coarse_acoustics_config": BarkCoarseConfig,
+        "fine_acoustics_config": BarkFineConfig,
+        "codec_config": AutoConfig,
+    }
+
+    def __init__(
+        self,
+        semantic_config: Optional[dict] = None,
+        coarse_acoustics_config: Optional[dict] = None,
+        fine_acoustics_config: Optional[dict] = None,
+        codec_config: Optional[dict] = None,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        if semantic_config is None:
+            semantic_config = {}
+            logger.info("semantic_config is None. Initializing the semantic model with default values.")
+
+        if coarse_acoustics_config is None:
+            coarse_acoustics_config = {}
+            logger.info("coarse_acoustics_config is None. Initializing the coarse model with default values.")
+
+        if fine_acoustics_config is None:
+            fine_acoustics_config = {}
+            logger.info("fine_acoustics_config is None. Initializing the fine model with default values.")
+
+        if codec_config is None:
+            codec_config = {}
+            logger.info("codec_config is None. Initializing the codec model with default values.")
+
+        self.semantic_config = BarkSemanticConfig(**semantic_config)
+        self.coarse_acoustics_config = BarkCoarseConfig(**coarse_acoustics_config)
+        self.fine_acoustics_config = BarkFineConfig(**fine_acoustics_config)
+        codec_model_type = codec_config.get("model_type", "encodec")
+        self.codec_config = CONFIG_MAPPING[codec_model_type](**codec_config)
+
+        self.initializer_range = initializer_range
+
+        super().__init__(**kwargs)
+
+    @classmethod
+    def from_sub_model_configs(
+        cls,
+        semantic_config: BarkSemanticConfig,
+        coarse_acoustics_config: BarkCoarseConfig,
+        fine_acoustics_config: BarkFineConfig,
+        codec_config: PretrainedConfig,
+        **kwargs,
+    ):
+        r"""
+        Instantiate a [`BarkConfig`] (or a derived class) from bark sub-model configurations.
+
+        Returns:
+            [`BarkConfig`]: An instance of a configuration object
+        """
+        return cls(
+            semantic_config=semantic_config.to_dict(),
+            coarse_acoustics_config=coarse_acoustics_config.to_dict(),
+            fine_acoustics_config=fine_acoustics_config.to_dict(),
+            codec_config=codec_config.to_dict(),
+            **kwargs,
+        )
+
+
+__all__ = ["BarkCoarseConfig", "BarkConfig", "BarkFineConfig", "BarkSemanticConfig"]
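`BarkConfig.__init__` accepts the sub-configurations as plain dicts and re-instantiates them, which is exactly what `from_sub_model_configs` relies on. A short round-trip sketch (the `num_layers`/`num_heads` overrides are illustrative values):

```python
from transformers import BarkConfig, BarkSemanticConfig

# Nested dicts are turned back into sub-config objects by BarkConfig.__init__.
config = BarkConfig(semantic_config={"num_layers": 6, "num_heads": 6})
assert isinstance(config.semantic_config, BarkSemanticConfig)

# to_dict() serializes the nested configs, so the whole config round-trips.
rebuilt = BarkConfig.from_dict(config.to_dict())
assert rebuilt.semantic_config.num_layers == 6
```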
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bark/generation_configuration_bark.py ADDED
@@ -0,0 +1,330 @@
+# coding=utf-8
+# Copyright 2023 The Suno AI Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BARK model generation configuration"""
+
+import copy
+from typing import Optional
+
+from ...generation.configuration_utils import GenerationConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class BarkSemanticGenerationConfig(GenerationConfig):
+    model_type = "semantic"
+
+    def __init__(
+        self,
+        eos_token_id=10_000,
+        renormalize_logits=True,
+        max_new_tokens=768,
+        output_scores=False,
+        return_dict_in_generate=False,
+        output_hidden_states=False,
+        output_attentions=False,
+        temperature=1.0,
+        do_sample=False,
+        text_encoding_offset=10_048,
+        text_pad_token=129_595,
+        semantic_infer_token=129_599,
+        semantic_vocab_size=10_000,
+        max_input_semantic_length=256,
+        semantic_rate_hz=49.9,
+        min_eos_p=None,
+        **kwargs,
+    ):
+        """Class that holds a generation configuration for [`BarkSemanticModel`].
+
+        This configuration inherits from [`GenerationConfig`] and can be used to control the model generation. Read
+        the documentation from [`GenerationConfig`] for more information.
+
+        Args:
+            eos_token_id (`int`, *optional*, defaults to 10_000):
+                The id of the *end-of-sequence* token.
+            renormalize_logits (`bool`, *optional*, defaults to `True`):
+                Whether to renormalize the logits after applying all the logits processors (including the custom
+                ones). It's highly recommended to set this flag to `True` as the search algorithms assume the score
+                logits are normalized, but some logits processors break the normalization.
+            max_new_tokens (`int`, *optional*, defaults to 768):
+                The maximum number of tokens to generate, ignoring the number of tokens in the prompt.
+            output_scores (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more details.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more details.
+            temperature (`float`, *optional*, defaults to 1.0):
+                The value used to modulate the next token probabilities.
+            do_sample (`bool`, *optional*, defaults to `False`):
+                Whether or not to use sampling; use greedy decoding otherwise.
+            text_encoding_offset (`int`, *optional*, defaults to 10_048):
+                Text encoding offset.
+            text_pad_token (`int`, *optional*, defaults to 129_595):
+                Text pad token.
+            semantic_infer_token (`int`, *optional*, defaults to 129_599):
+                Semantic infer token.
+            semantic_vocab_size (`int`, *optional*, defaults to 10_000):
+                Semantic vocab size.
+            max_input_semantic_length (`int`, *optional*, defaults to 256):
+                Max length of the semantic input vector.
+            semantic_rate_hz (`float`, *optional*, defaults to 49.9):
+                Semantic rate in Hertz.
+            min_eos_p (`float`, *optional*):
+                Minimum threshold of the probability of the EOS token for it to be sampled. This is an early stopping
+                strategy to mitigate potential unwanted generations at the end of a prompt. The original
+                implementation suggests a default value of 0.2.
+        """
+        super().__init__(
+            temperature=temperature,
+            do_sample=do_sample,
+            eos_token_id=eos_token_id,
+            renormalize_logits=renormalize_logits,
+            max_new_tokens=max_new_tokens,
+            output_scores=output_scores,
+            return_dict_in_generate=return_dict_in_generate,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            **kwargs,
+        )
+
+        self.text_encoding_offset = text_encoding_offset
+        self.text_pad_token = text_pad_token
+        self.semantic_pad_token = eos_token_id
+        self.semantic_infer_token = semantic_infer_token
+        self.semantic_vocab_size = semantic_vocab_size
+        self.max_input_semantic_length = max_input_semantic_length
+        self.semantic_rate_hz = semantic_rate_hz
+        self.min_eos_p = min_eos_p
+
+
+class BarkCoarseGenerationConfig(GenerationConfig):
+    model_type = "coarse_acoustics"
+
+    def __init__(
+        self,
+        renormalize_logits=True,
+        output_scores=False,
+        return_dict_in_generate=False,
+        output_hidden_states=False,
+        output_attentions=False,
+        temperature=1.0,
+        do_sample=False,
+        coarse_semantic_pad_token=12_048,
+        coarse_rate_hz=75,
+        n_coarse_codebooks=2,
+        coarse_infer_token=12_050,
+        max_coarse_input_length=256,
+        max_coarse_history: int = 630,
+        sliding_window_len: int = 60,
+        **kwargs,
+    ):
+        """Class that holds a generation configuration for [`BarkCoarseModel`].
+
+        This configuration inherits from [`GenerationConfig`] and can be used to control the model generation. Read
+        the documentation from [`GenerationConfig`] for more information.
+
+        Args:
+            renormalize_logits (`bool`, *optional*, defaults to `True`):
+                Whether to renormalize the logits after applying all the logits processors (including the custom
+                ones). It's highly recommended to set this flag to `True` as the search algorithms assume the score
+                logits are normalized, but some logits processors break the normalization.
+            output_scores (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more details.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more details.
+            temperature (`float`, *optional*, defaults to 1.0):
+                The value used to modulate the next token probabilities.
+            do_sample (`bool`, *optional*, defaults to `False`):
+                Whether or not to use sampling; use greedy decoding otherwise.
+            coarse_semantic_pad_token (`int`, *optional*, defaults to 12_048):
+                Coarse semantic pad token.
+            coarse_rate_hz (`int`, *optional*, defaults to 75):
+                Coarse rate in Hertz.
+            n_coarse_codebooks (`int`, *optional*, defaults to 2):
+                Number of coarse codebooks.
+            coarse_infer_token (`int`, *optional*, defaults to 12_050):
+                Coarse infer token.
+            max_coarse_input_length (`int`, *optional*, defaults to 256):
+                Max length of the input coarse vector.
+            max_coarse_history (`int`, *optional*, defaults to 630):
+                Max length of the output of the coarse acoustics model used in the fine generation step.
+            sliding_window_len (`int`, *optional*, defaults to 60):
+                Length of the sliding window the coarse generation step uses to generate raw audio.
+        """
+        super().__init__(
+            temperature=temperature,
+            do_sample=do_sample,
+            renormalize_logits=renormalize_logits,
+            output_scores=output_scores,
+            return_dict_in_generate=return_dict_in_generate,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            **kwargs,
+        )
+
+        self.coarse_semantic_pad_token = coarse_semantic_pad_token
+        self.coarse_rate_hz = coarse_rate_hz
+        self.n_coarse_codebooks = n_coarse_codebooks
+        self.coarse_infer_token = coarse_infer_token
+        self.max_coarse_input_length = max_coarse_input_length
+        self.max_coarse_history = max_coarse_history
+        self.sliding_window_len = sliding_window_len
+
+
+class BarkFineGenerationConfig(GenerationConfig):
+    model_type = "fine_acoustics"
+
+    def __init__(
+        self,
+        temperature=1.0,
+        max_fine_history_length=512,
+        max_fine_input_length=1024,
+        n_fine_codebooks=8,
+        **kwargs,
+    ):
+        """Class that holds a generation configuration for [`BarkFineModel`].
+
+        [`BarkFineModel`] is an autoencoder model, so it should not usually be used for generation. However, under
+        the hood, it uses `temperature` when used by [`BarkModel`].
+
+        This configuration inherits from [`GenerationConfig`] and can be used to control the model generation. Read
+        the documentation from [`GenerationConfig`] for more information.
+
+        Args:
+            temperature (`float`, *optional*, defaults to 1.0):
+                The value used to modulate the next token probabilities.
+            max_fine_history_length (`int`, *optional*, defaults to 512):
+                Max length of the fine history vector.
+            max_fine_input_length (`int`, *optional*, defaults to 1024):
+                Max length of the fine input vector.
+            n_fine_codebooks (`int`, *optional*, defaults to 8):
+                Number of codebooks used.
+        """
+        super().__init__(temperature=temperature)
+
+        self.max_fine_history_length = max_fine_history_length
+        self.max_fine_input_length = max_fine_input_length
+        self.n_fine_codebooks = n_fine_codebooks
+
+    def validate(self, **kwargs):
+        """
+        Overrides GenerationConfig.validate because BarkFineGenerationConfig doesn't use any parameters other than
+        `temperature`.
+        """
+        pass
+
+
+class BarkGenerationConfig(GenerationConfig):
+    model_type = "bark"
+
+    # TODO (joao): nested from_dict
+
+    def __init__(
+        self,
+        semantic_config: Optional[dict] = None,
+        coarse_acoustics_config: Optional[dict] = None,
+        fine_acoustics_config: Optional[dict] = None,
+        sample_rate=24_000,
+        codebook_size=1024,
+        **kwargs,
+    ):
+        """Class that holds a generation configuration for [`BarkModel`].
+
+        The [`BarkModel`] does not have a `generate` method, but uses this class to generate speech with a nested
+        [`BarkGenerationConfig`] which uses [`BarkSemanticGenerationConfig`], [`BarkCoarseGenerationConfig`] and
+        [`BarkFineGenerationConfig`].
+
+        This configuration inherits from [`GenerationConfig`] and can be used to control the model generation. Read
+        the documentation from [`GenerationConfig`] for more information.
+
+        Args:
+            semantic_config (`Dict`, *optional*):
+                Semantic generation configuration.
+            coarse_acoustics_config (`Dict`, *optional*):
+                Coarse generation configuration.
+            fine_acoustics_config (`Dict`, *optional*):
+                Fine generation configuration.
+            sample_rate (`int`, *optional*, defaults to 24_000):
+                Sample rate.
+            codebook_size (`int`, *optional*, defaults to 1024):
+                Vector length for each codebook.
+        """
+        if semantic_config is None:
+            semantic_config = {}
+            logger.info("semantic_config is None. Initializing the semantic model with default values.")
+
+        if coarse_acoustics_config is None:
+            coarse_acoustics_config = {}
+            logger.info("coarse_acoustics_config is None. Initializing the coarse model with default values.")
+
+        if fine_acoustics_config is None:
+            fine_acoustics_config = {}
+            logger.info("fine_acoustics_config is None. Initializing the fine model with default values.")
+
+        self.semantic_config = BarkSemanticGenerationConfig(**semantic_config)
+        self.coarse_acoustics_config = BarkCoarseGenerationConfig(**coarse_acoustics_config)
+        self.fine_acoustics_config = BarkFineGenerationConfig(**fine_acoustics_config)
+
+        self.sample_rate = sample_rate
+        self.codebook_size = codebook_size
+
+    @classmethod
+    def from_sub_model_configs(
+        cls,
+        semantic_config: BarkSemanticGenerationConfig,
+        coarse_acoustics_config: BarkCoarseGenerationConfig,
+        fine_acoustics_config: BarkFineGenerationConfig,
+        **kwargs,
+    ):
+        r"""
+        Instantiate a [`BarkGenerationConfig`] (or a derived class) from bark sub-model generation configurations.
+
+        Returns:
+            [`BarkGenerationConfig`]: An instance of a configuration object
+        """
+        return cls(
+            semantic_config=semantic_config.to_dict(),
+            coarse_acoustics_config=coarse_acoustics_config.to_dict(),
+            fine_acoustics_config=fine_acoustics_config.to_dict(),
+            **kwargs,
+        )
+
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`].
+
+        Returns:
+            `dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
+        """
+        output = copy.deepcopy(self.__dict__)
+
+        output["semantic_config"] = self.semantic_config.to_dict()
+        output["coarse_acoustics_config"] = self.coarse_acoustics_config.to_dict()
+        output["fine_acoustics_config"] = self.fine_acoustics_config.to_dict()
+
+        output["model_type"] = self.__class__.model_type
+        return output
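The nested generation config mirrors `BarkConfig`: sub-configs go in as dicts via `from_sub_model_configs` and come back out through the overridden `to_dict`. A small sketch, importing from the defining module (the assertion values are illustrative):

```python
from transformers.models.bark.generation_configuration_bark import (
    BarkCoarseGenerationConfig,
    BarkFineGenerationConfig,
    BarkGenerationConfig,
    BarkSemanticGenerationConfig,
)

semantic = BarkSemanticGenerationConfig(min_eos_p=0.2)
gen_config = BarkGenerationConfig.from_sub_model_configs(
    semantic, BarkCoarseGenerationConfig(), BarkFineGenerationConfig()
)

# The to_dict override serializes each sub-config, so nested values survive a round trip.
assert gen_config.to_dict()["semantic_config"]["min_eos_p"] == 0.2
```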
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bark/modeling_bark.py ADDED
@@ -0,0 +1,1628 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The Suno AI Authors and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch BARK model."""
16
+
17
+ import math
18
+ import warnings
19
+ from typing import Optional, Union
20
+
21
+ import numpy as np
22
+ import torch
23
+ from torch import nn
24
+ from torch.nn import functional as F
25
+
26
+ from ...cache_utils import Cache, DynamicCache
27
+ from ...generation import GenerationMixin
28
+ from ...generation.logits_process import (
29
+ AlternatingCodebooksLogitsProcessor,
30
+ BarkEosPrioritizerLogitsProcessor,
31
+ SuppressTokensLogitsProcessor,
32
+ )
33
+ from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
34
+ from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
35
+ from ...modeling_layers import GradientCheckpointingLayer
36
+ from ...modeling_outputs import CausalLMOutputWithPast, MaskedLMOutput
37
+ from ...modeling_utils import PreTrainedModel, get_parameter_device
38
+ from ...utils import (
39
+ auto_docstring,
40
+ is_accelerate_available,
41
+ is_torch_accelerator_available,
42
+ logging,
43
+ )
44
+ from ..auto import AutoModel
45
+ from .configuration_bark import (
46
+ BarkCoarseConfig,
47
+ BarkConfig,
48
+ BarkFineConfig,
49
+ BarkSemanticConfig,
50
+ BarkSubModelConfig,
51
+ )
52
+ from .generation_configuration_bark import (
53
+ BarkCoarseGenerationConfig,
54
+ BarkFineGenerationConfig,
55
+ BarkSemanticGenerationConfig,
56
+ )
57
+
58
+
59
+ if is_flash_attn_available():
60
+ from ...modeling_flash_attention_utils import _flash_attention_forward
61
+
62
+
63
+ logger = logging.get_logger(__name__)
64
+
65
+
66
+ class BarkSelfAttention(nn.Module):
67
+ # adapted from GPTNeoSelfAttention and Bark code
68
+ # BarkSelfAttention can have two attention type, i.e full attention or causal attention
69
+
70
+ def __init__(self, config, is_causal=False, layer_idx=None):
71
+ super().__init__()
72
+
73
+ # regularization
74
+ self.dropout = config.dropout
75
+ self.attn_dropout = nn.Dropout(config.dropout)
76
+ self.resid_dropout = nn.Dropout(config.dropout)
77
+
78
+ self.embed_dim = config.hidden_size
79
+ self.num_heads = config.num_heads
80
+ self.head_dim = self.embed_dim // self.num_heads
81
+
82
+ if config.hidden_size % config.num_heads != 0:
83
+ raise ValueError(
84
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
85
+ f" {self.num_heads})."
86
+ )
87
+
88
+ # key, query, value projections for all heads, but in a batch
89
+ self.att_proj = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=config.bias)
90
+ # output projection
91
+ self.out_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=config.bias)
92
+
93
+ self.is_causal = is_causal
94
+ self.layer_idx = layer_idx
95
+ if is_causal:
96
+ block_size = config.block_size
97
+ bias = torch.tril(torch.ones((block_size, block_size), dtype=bool)).view(1, 1, block_size, block_size)
98
+ self.register_buffer("bias", bias)
99
+
100
+ # Copied from transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoSelfAttention._split_heads
101
+ def _split_heads(self, tensor, num_heads, attn_head_size):
102
+ """
103
+ Splits hidden_size dim into attn_head_size and num_heads
104
+ """
105
+ new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
106
+ tensor = tensor.view(new_shape)
107
+ return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features)
108
+
109
+ def _merge_heads(self, tensor, num_heads, attn_head_size):
110
+ """
111
+ Merges attn_head_size dim and num_attn_heads dim into hidden_size
112
+ """
113
+
114
+ # re-assemble all head outputs side by side
115
+ # (batch, num_heads, seq_len, attn_head_size) -> (batch, seq_len, num_heads*attn_head_size)
116
+ tensor = tensor.transpose(1, 2).contiguous()
117
+ tensor = tensor.view(tensor.size()[:-2] + (num_heads * attn_head_size,))
118
+
119
+ return tensor
120
+
121
+ def _attn(self, query, key, value, attention_mask=None, head_mask=None):
122
+ # unlike GPTNeo's SelfAttention, divide by the square root of the dimension of the query and the key
123
+ attn_weights = torch.matmul(query, key.transpose(-1, -2)) * (1.0 / math.sqrt(self.head_dim))
124
+
125
+ if self.is_causal:
126
+ query_length, key_length = query.size(-2), key.size(-2)
127
+
128
+ # fill the upper left part of the attention weights with inf
129
+ attn_weights = attn_weights.masked_fill(
130
+ self.bias[:, :, key_length - query_length : key_length, :key_length] == 0,
131
+ torch.finfo(attn_weights.dtype).min,
132
+ )
133
+
134
+ if attention_mask is not None:
135
+ # Apply the attention mask
136
+ attn_weights = attn_weights + attention_mask
137
+
138
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
139
+ attn_weights = attn_weights.to(value.dtype)
140
+ attn_weights = self.attn_dropout(attn_weights)
141
+
142
+ # Mask heads if we want to
143
+ if head_mask is not None:
144
+ attn_weights = attn_weights * head_mask
145
+
146
+ # (batch, num_heads, seq_len, seq_len) x (batch, num_heads, seq_len, attn_head_size)
147
+ # -> (batch, num_heads, seq_len, attn_head_size)
148
+ attn_output = torch.matmul(attn_weights, value)
149
+
150
+ return attn_output, attn_weights
151
+
152
+ def forward(
153
+ self,
154
+ hidden_states,
155
+ attention_mask=None,
156
+ past_key_values=None,
157
+ head_mask=None,
158
+ use_cache=False,
159
+ output_attentions=False,
160
+ cache_position=None,
161
+ ):
162
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
163
+ query, key, value = self.att_proj(hidden_states).split(self.embed_dim, dim=2)
164
+
165
+ query = self._split_heads(query, self.num_heads, self.head_dim)
166
+ key = self._split_heads(key, self.num_heads, self.head_dim)
167
+ value = self._split_heads(value, self.num_heads, self.head_dim)
168
+
169
+ if past_key_values is not None:
170
+ key, value = past_key_values.update(key, value, self.layer_idx, {"cache_position": cache_position})
171
+
172
+ attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
173
+
174
+ attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
175
+ attn_output = self.out_proj(attn_output)
176
+ attn_output = self.resid_dropout(attn_output)
177
+
178
+ return attn_output, attn_weights
179
+
180
+
181
+ class BarkSelfFlashAttention2(BarkSelfAttention):
182
+ """
183
+ Bark flash attention module. This module inherits from `BarkSelfAttention` as the weights of the module stays
184
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
185
+ flash attention and deal with padding tokens in case the input contains any of them.
186
+ """
187
+
188
+ def __init__(self, *args, **kwargs):
189
+ super().__init__(*args, **kwargs)
190
+
191
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
192
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
193
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
194
+ self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
195
+
196
+ def _split_heads(self, tensor, num_heads, attn_head_size):
197
+ """
198
+ Splits hidden_size dim into attn_head_size and num_heads
199
+ """
200
+ new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
201
+ tensor = tensor.view(new_shape)
202
+ # Flash attention requires the input to have the shape
203
+ # batch_size x seq_length x head_dim x hidden_dim - (batch, seq_length, head, head_features)
204
+ return tensor
205
+
206
+ def _merge_heads(self, tensor, num_heads, attn_head_size):
207
+ """
208
+ Merges attn_head_size dim and num_attn_heads dim into hidden_size
209
+ """
210
+ # re-assemble all head outputs side by side
211
+ # (batch, seq_len, num_heads, attn_head_size) -> (batch, seq_len, num_heads*attn_head_size)
212
+ tensor = tensor.view(tensor.size()[:-2] + (num_heads * attn_head_size,))
213
+ return tensor
214
+
215
+ def forward(
216
+ self,
217
+ hidden_states,
218
+ attention_mask=None,
219
+ past_key_values=None,
220
+ head_mask=None,
221
+ use_cache=False,
222
+ output_attentions=False,
223
+ cache_position=None,
224
+ ):
225
+ batch_size, query_len, _ = hidden_states.size()
226
+
227
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
228
+ query, key, value = self.att_proj(hidden_states).split(self.embed_dim, dim=2)
229
+
230
+ query = self._split_heads(query, self.num_heads, self.head_dim)
231
+ key = self._split_heads(key, self.num_heads, self.head_dim)
232
+ value = self._split_heads(value, self.num_heads, self.head_dim)
233
+
234
+ if past_key_values is not None:
235
+ key, value = past_key_values.update(key, value, self.layer_idx, {"cache_position": cache_position})
236
+
237
+ attn_output = _flash_attention_forward(
238
+ query,
239
+ key,
240
+ value,
241
+ attention_mask,
242
+ query_len,
243
+ dropout=self.dropout if self.training else 0.0,
244
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
245
+ is_causal=self.is_causal,
246
+ )
247
+
248
+ attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
249
+ attn_output = self.out_proj(attn_output)
250
+ attn_output = self.resid_dropout(attn_output)
251
+
252
+ return attn_output, None
253
+
254
+
255
+ BARK_ATTENTION_CLASSES = {
256
+ "eager": BarkSelfAttention,
257
+ "flash_attention_2": BarkSelfFlashAttention2,
258
+ }
259
+
260
+
261
+ class BarkMLP(nn.Module):
262
+ def __init__(self, config):
263
+ super().__init__()
264
+ self.in_proj = nn.Linear(config.hidden_size, 4 * config.hidden_size, bias=config.bias)
265
+ self.out_proj = nn.Linear(4 * config.hidden_size, config.hidden_size, bias=config.bias)
266
+ self.dropout = nn.Dropout(config.dropout)
267
+ self.gelu = nn.GELU()
268
+
269
+ def forward(self, hidden_states):
270
+ hidden_states = self.in_proj(hidden_states)
271
+ hidden_states = self.gelu(hidden_states)
272
+ hidden_states = self.out_proj(hidden_states)
273
+ hidden_states = self.dropout(hidden_states)
274
+ return hidden_states
275
+
276
+
277
+ class BarkBlock(GradientCheckpointingLayer):
278
+ def __init__(self, config, is_causal=False, layer_idx=None):
279
+ super().__init__()
280
+
281
+ if is_causal:
282
+ # if causal, the layerNorm bias is optional to stick with Bark choice of leaving optional bias
283
+ # in AutoRegressive models (corresponding to the "Text" and the "Coarse" modules)
284
+ self.layernorm_1 = nn.LayerNorm(config.hidden_size, bias=config.bias)
285
+ self.layernorm_2 = nn.LayerNorm(config.hidden_size, bias=config.bias)
286
+ else:
287
+ self.layernorm_1 = nn.LayerNorm(config.hidden_size)
288
+ self.layernorm_2 = nn.LayerNorm(config.hidden_size)
289
+
290
+ self.attn = BARK_ATTENTION_CLASSES[config._attn_implementation](
291
+ config, is_causal=is_causal, layer_idx=layer_idx
292
+ )
293
+
294
+ self.mlp = BarkMLP(config)
295
+
296
+ def forward(
297
+ self,
298
+ hidden_states,
299
+ past_key_values=None,
300
+ attention_mask=None,
301
+ head_mask=None,
302
+ use_cache=False,
303
+ output_attentions=False,
304
+ cache_position=None,
305
+ ):
306
+ intermediary_hidden_states = self.layernorm_1(hidden_states)
307
+
308
+ attn_outputs = self.attn(
309
+ intermediary_hidden_states,
310
+ past_key_values=past_key_values,
311
+ attention_mask=attention_mask,
312
+ head_mask=head_mask,
313
+ use_cache=use_cache,
314
+ output_attentions=output_attentions,
315
+ cache_position=cache_position,
316
+ )
317
+
318
+ attn_output = attn_outputs[0] # output_attn: output, present_key_values, (attn_weights)
319
+ outputs = attn_outputs[1:]
320
+
321
+ intermediary_hidden_states = hidden_states + attn_output
322
+ intermediary_hidden_states = intermediary_hidden_states + self.mlp(
323
+ self.layernorm_2(intermediary_hidden_states)
324
+ )
325
+
326
+ return (intermediary_hidden_states,) + outputs
327
+
328
+
329
+ @auto_docstring
330
+ class BarkPreTrainedModel(PreTrainedModel):
331
+ config: BarkConfig
332
+ supports_gradient_checkpointing = False
333
+ _supports_flash_attn = True
334
+
335
+ def _init_weights(self, module):
336
+ """Initialize the weights."""
337
+ if isinstance(module, (nn.Linear,)):
338
+ # Slightly different from the TF version which uses truncated_normal for initialization
339
+ # cf https://github.com/pytorch/pytorch/pull/5617
340
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
341
+ if module.bias is not None:
342
+ module.bias.data.zero_()
343
+ elif isinstance(module, nn.Embedding):
344
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
345
+ if module.padding_idx is not None:
346
+ module.weight.data[module.padding_idx].zero_()
347
+ elif isinstance(module, nn.LayerNorm):
348
+ module.bias.data.zero_()
349
+ module.weight.data.fill_(1.0)
350
+
351
+ def __init__(self, *inputs, **kwargs):
352
+ super().__init__(*inputs, **kwargs)
353
+
354
+ @property
355
+ def device(self) -> torch.device:
356
+ """
357
+ `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
358
+ device).
359
+ """
360
+
361
+ # if has _hf_hook, has been offloaded so the device has to be found in the hook
362
+ if not hasattr(self, "_hf_hook"):
363
+ return get_parameter_device(self)
364
+ for module in self.modules():
365
+ if (
366
+ hasattr(module, "_hf_hook")
367
+ and hasattr(module._hf_hook, "execution_device")
368
+ and module._hf_hook.execution_device is not None
369
+ ):
370
+ return torch.device(module._hf_hook.execution_device)
371
+
372
+ return get_parameter_device(self)
373
+
374
+
375
+ # GPT2-like autoregressive model
376
+ class BarkCausalModel(BarkPreTrainedModel, GenerationMixin):
377
+ config: BarkSubModelConfig
378
+
379
+ def __init__(self, config):
380
+ super().__init__(config)
381
+ self.config = config
382
+
383
+ # initialize as an autoregressive GPT-like model
384
+ self.input_embeds_layer = nn.Embedding(config.input_vocab_size, config.hidden_size)
385
+ self.position_embeds_layer = nn.Embedding(config.block_size, config.hidden_size)
386
+
387
+ self.drop = nn.Dropout(config.dropout)
388
+
389
+ self.layers = nn.ModuleList([BarkBlock(config, is_causal=True, layer_idx=i) for i in range(config.num_layers)])
390
+
391
+ self.layernorm_final = nn.LayerNorm(config.hidden_size, bias=config.bias)
392
+
393
+ self.lm_head = nn.Linear(config.hidden_size, config.output_vocab_size, bias=False)
394
+ self.gradient_checkpointing = False
395
+
396
+ # Initialize weights and apply final processing
397
+ self.post_init()
398
+
399
+ def get_output_embeddings(self):
400
+ # NOTE: get_output_embeddings() must return None to prevent accidental weight tying.
401
+ # See e.g. https://github.com/huggingface/transformers/pull/39339#discussion_r2219126400
402
+ return None
403
+
404
+ def get_input_embeddings(self):
405
+ return self.input_embeds_layer
406
+
407
+ def set_input_embeddings(self, new_embeddings):
408
+ self.input_embeds_layer = new_embeddings
409
+
410
+ def prepare_inputs_for_generation(
411
+ self,
412
+ input_ids,
413
+ attention_mask=None,
414
+ input_embeds=None,
415
+ past_key_values=None,
416
+ position_ids=None,
417
+ use_cache=None,
418
+ cache_position=None,
419
+ **kwargs,
420
+ ):
421
+ # Overwritten -- bark uses `input_embeds`, not `inputs_embeds`
422
+
423
+ model_inputs = super().prepare_inputs_for_generation(
424
+ input_ids,
425
+ attention_mask=attention_mask,
426
+ inputs_embeds=input_embeds,
427
+ past_key_values=past_key_values,
428
+ position_ids=position_ids,
429
+ use_cache=use_cache,
430
+ cache_position=cache_position,
431
+ **kwargs,
432
+ )
433
+ model_inputs["input_embeds"] = model_inputs.pop("inputs_embeds", None)
434
+ return model_inputs
435
+
436
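+ # A minimal sketch of the renaming performed above (illustrative only): the
+ # standard `inputs_embeds` entry produced by `GenerationMixin` becomes Bark's
+ # `input_embeds`:
+ #
+ #   model_inputs = {"input_ids": input_ids, "inputs_embeds": embeds}
+ #   model_inputs["input_embeds"] = model_inputs.pop("inputs_embeds", None)
+ #   # -> {"input_ids": input_ids, "input_embeds": embeds}
+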
+ @auto_docstring
437
+ def forward(
438
+ self,
439
+ input_ids: Optional[torch.Tensor] = None,
440
+ past_key_values: Optional[Cache] = None,
441
+ attention_mask: Optional[torch.Tensor] = None,
442
+ position_ids: Optional[torch.Tensor] = None,
443
+ head_mask: Optional[torch.Tensor] = None,
444
+ labels: Optional[torch.LongTensor] = None,
445
+ input_embeds: Optional[torch.Tensor] = None,
446
+ use_cache: Optional[bool] = None,
447
+ output_attentions: Optional[bool] = None,
448
+ output_hidden_states: Optional[bool] = None,
449
+ return_dict: Optional[bool] = None,
450
+ cache_position: Optional[torch.Tensor] = None,
451
+ ) -> Union[tuple[torch.Tensor], CausalLMOutputWithPast]:
452
+ r"""
453
+ input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
454
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
455
+ Here, due to `Bark` particularities, if `past_key_values` is used, `input_embeds` will be ignored and you
456
+ have to use `input_ids`. If `past_key_values` is not used and `use_cache` is set to `True`, `input_embeds`
457
+ is used in priority instead of `input_ids`.
458
+ """
459
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
460
+ output_hidden_states = (
461
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
462
+ )
463
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
464
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
465
+
466
+ loss = None
467
+ if labels is not None:
468
+ raise NotImplementedError(
469
+ "Training is not implemented yet for Bark - ensure you do not pass `labels` to the model."
470
+ )
471
+
472
+ # Verify if input_embeds already exists
473
+ # then compute embeddings.
474
+ if input_ids is not None and input_embeds is not None:
475
+ raise ValueError("You cannot specify both input_ids and input_embeds at the same time")
476
+ elif input_embeds is not None and past_key_values is None:
477
+ # we want to return the input_embeds in priority so that it is in line with a weird hack
478
+ # of Bark which concatenates two bits of the input_embeds on the first forward pass of the semantic model
479
+ pass
480
+ elif input_ids is not None:
481
+ input_embeds = self.input_embeds_layer(input_ids) # token embeddings of shape (b, t, n_embd)
482
+ elif input_embeds is not None:
483
+ pass
484
+ else:
485
+ raise ValueError("You have to specify either input_ids or input_embeds")
486
+
487
+ input_shape = input_embeds.size()[:-1]
488
+ batch_size = input_embeds.shape[0]
489
+ seq_length = input_shape[-1]
490
+
491
+ device = input_ids.device if input_ids is not None else input_embeds.device
492
+
493
+ if self.gradient_checkpointing and self.training:
494
+ if use_cache:
495
+ logger.warning_once(
496
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
497
+ )
498
+ use_cache = False
499
+
500
+ if use_cache and past_key_values is None:
501
+ past_key_values = DynamicCache(config=self.config)
502
+ if use_cache and isinstance(past_key_values, tuple):
503
+ logger.warning_once(
504
+ "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
505
+ "You should pass an instance of `DynamicCache` instead, e.g. "
506
+ "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`."
507
+ )
508
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
509
+
510
+ past_length = past_key_values.get_seq_length() if past_key_values is not None else 0
511
+
512
+ if position_ids is None:
513
+ position_ids = torch.arange(past_length, seq_length + past_length, dtype=torch.long, device=device)
514
+ position_ids = position_ids.unsqueeze(0) # shape (1, seq_length)
515
+
516
+ position_embeds = self.position_embeds_layer(position_ids) # position embeddings of shape (1, t, n_embd)
517
+
518
+ # Attention mask.
519
+ if attention_mask is not None:
520
+ if batch_size <= 0:
521
+ raise ValueError("batch_size has to be defined and > 0")
522
+ if self.config._attn_implementation == "flash_attention_2":
523
+ attention_mask = attention_mask if 0 in attention_mask else None
524
+ else:
525
+ attention_mask = attention_mask.view(batch_size, -1)
526
+ # [bsz, to_seq_length] -> [bsz, 1, 1, to_seq_length]
527
+ # from_seq_length is 1 to easily broadcast
528
+ attention_mask = _prepare_4d_attention_mask(attention_mask, input_embeds.dtype, tgt_len=1)
529
+
530
+ # Prepare head mask if needed
531
+ # 1.0 in head_mask indicate we keep the head
532
+ # attention_probs has shape bsz x num_heads x N x N
533
+ # head_mask has shape num_layers x batch x num_heads x N x N
534
+ head_mask = self.get_head_mask(head_mask, self.config.num_layers)
535
+
536
+ hidden_states = self.drop(input_embeds + position_embeds)
537
+ output_shape = input_shape + (hidden_states.size(-1),)
538
+
539
+ all_self_attentions = () if output_attentions else None
540
+ all_hidden_states = () if output_hidden_states else None
541
+
542
+ for i, block in enumerate(self.layers):
543
+ if output_hidden_states:
544
+ all_hidden_states = all_hidden_states + (hidden_states,)
545
+
546
+ outputs = block(
547
+ hidden_states,
548
+ past_key_values=past_key_values,
549
+ attention_mask=attention_mask,
550
+ head_mask=head_mask[i],
551
+ use_cache=use_cache,
552
+ output_attentions=output_attentions,
553
+ cache_position=cache_position,
554
+ )
555
+
556
+ hidden_states = outputs[0]
557
+
558
+ if output_attentions:
559
+ all_self_attentions = all_self_attentions + (outputs[1],)
560
+
561
+ hidden_states = self.layernorm_final(hidden_states)
562
+
563
+ hidden_states = hidden_states.view(output_shape)
564
+
565
+ # Add last hidden state
566
+ if output_hidden_states:
567
+ all_hidden_states = all_hidden_states + (hidden_states,)
568
+
569
+ logits = self.lm_head(hidden_states)
570
+
571
+ if not return_dict:
572
+ return tuple(
573
+ v for v in [None, logits, past_key_values, all_hidden_states, all_self_attentions] if v is not None
574
+ )
575
+
576
+ return CausalLMOutputWithPast(
577
+ loss=loss,
578
+ logits=logits,
579
+ past_key_values=past_key_values,
580
+ hidden_states=all_hidden_states,
581
+ attentions=all_self_attentions,
582
+ )
583
+
584
+
585
+ @auto_docstring(
586
+ custom_intro="""
587
+ Bark semantic (or text) model. It shares the same architecture as the coarse model.
588
+ It is a GPT-2 like autoregressive model with a language modeling head on top.
589
+ """
590
+ )
591
+ class BarkSemanticModel(BarkCausalModel):
592
+ base_model_prefix = "semantic"
593
+ config: BarkSemanticConfig
594
+
595
+ def generate(
596
+ self,
597
+ input_ids: torch.Tensor,
598
+ semantic_generation_config: Optional[BarkSemanticGenerationConfig] = None,
599
+ history_prompt: Optional[dict[str, torch.Tensor]] = None,
600
+ attention_mask: Optional[torch.Tensor] = None,
601
+ **kwargs,
602
+ ) -> torch.LongTensor:
603
+ """
604
+ Generates text semantic tokens from an input prompt and an additional optional `Bark` speaker prompt.
605
+
606
+ Args:
607
+ input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
608
+ Input ids, i.e. tokenized input sentences. Will be truncated up to
609
+ semantic_generation_config.max_input_semantic_length tokens. Note that the output audios will be as
610
+ long as the longest generation among the batch.
611
+ semantic_generation_config (`BarkSemanticGenerationConfig`):
612
+ Generation config indicating how to generate the semantic tokens.
613
+ history_prompt (`Optional[dict[str,torch.Tensor]]`, *optional*):
614
+ Optional `Bark` speaker prompt.
615
+ attention_mask (`Optional[torch.Tensor]`, *optional*):
616
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
617
+
618
+ - 1 for tokens that are **not masked**,
619
+ - 0 for tokens that are **masked**.
620
+
621
+ [What are attention masks?](../glossary#attention-mask)
622
+ Returns:
623
+ torch.LongTensor: Output semantic tokens.
624
+ """
625
+ if semantic_generation_config is None:
626
+ raise ValueError("`semantic_generation_config` has to be provided")
627
+
628
+ batch_size = input_ids.shape[0]
629
+
630
+ max_input_semantic_length = semantic_generation_config.max_input_semantic_length
631
+
632
+ input_ids = input_ids + semantic_generation_config.text_encoding_offset
633
+
634
+ if attention_mask is not None:
635
+ input_ids = input_ids.masked_fill((1 - attention_mask).bool(), semantic_generation_config.text_pad_token)
636
+
637
+ if history_prompt is not None:
638
+ semantic_history = history_prompt["semantic_prompt"][-max_input_semantic_length:]
639
+ semantic_history = nn.functional.pad(
640
+ semantic_history,
641
+ (0, max_input_semantic_length - len(semantic_history)),
642
+ value=semantic_generation_config.semantic_pad_token,
643
+ mode="constant",
644
+ )
645
+ else:
646
+ semantic_history = torch.tensor(
647
+ [semantic_generation_config.semantic_pad_token] * max_input_semantic_length, dtype=torch.int
648
+ ).to(self.device)
649
+
650
+ semantic_history = torch.repeat_interleave(semantic_history[None], batch_size, dim=0)
651
+
652
+ infer_array = torch.tensor(
653
+ [[semantic_generation_config.semantic_infer_token]] * batch_size, dtype=torch.int
654
+ ).to(self.device)
655
+
656
+ input_embeds = torch.cat(
657
+ [
658
+ self.input_embeds_layer(input_ids[:, :max_input_semantic_length])
659
+ + self.input_embeds_layer(semantic_history[:, : max_input_semantic_length + 1]),
660
+ self.input_embeds_layer(infer_array),
661
+ ],
662
+ dim=1,
663
+ )
664
+
665
+ tokens_to_suppress = list(
666
+ range(semantic_generation_config.semantic_vocab_size, semantic_generation_config.semantic_pad_token)
667
+ )
668
+ tokens_to_suppress.extend(
669
+ list(range(semantic_generation_config.semantic_pad_token + 1, self.config.output_vocab_size))
670
+ )
671
+
672
+ suppress_tokens_logits_processor = SuppressTokensLogitsProcessor(tokens_to_suppress, device=input_ids.device)
673
+
674
+ min_eos_p = kwargs.get("min_eos_p", semantic_generation_config.min_eos_p)
675
+ early_stopping_logits_processor = BarkEosPrioritizerLogitsProcessor(
676
+ eos_token_id=semantic_generation_config.eos_token_id, min_eos_p=min_eos_p, device=input_ids.device
677
+ )
678
+
679
+ # pass input_ids in order to stay consistent with the transformers generate method even though it is not used
680
+ # (except to get the input seq_len - that's why we keep the first 257 tokens)
681
+ semantic_output = super().generate(
682
+ torch.ones((batch_size, max_input_semantic_length + 1), dtype=torch.int, device=self.device),
683
+ input_embeds=input_embeds,
684
+ logits_processor=[suppress_tokens_logits_processor, early_stopping_logits_processor],
685
+ generation_config=semantic_generation_config,
686
+ **kwargs,
687
+ ) # size: 10048
688
+
689
+ # take the generated semantic tokens
690
+ semantic_output = semantic_output[:, max_input_semantic_length + 1 :]
691
+
692
+ return semantic_output
693
+
694
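+ # A minimal usage sketch (assumes a loaded `BarkModel` named `model` and processor
+ # inputs; the sub-config below comes from the nested generation config, as in
+ # `BarkModel.generate`):
+ #
+ #   semantic_cfg = BarkSemanticGenerationConfig(**model.generation_config.semantic_config)
+ #   semantic_tokens = model.semantic.generate(
+ #       inputs["input_ids"],
+ #       attention_mask=inputs.get("attention_mask"),
+ #       semantic_generation_config=semantic_cfg,
+ #   )
+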
+
695
+ @auto_docstring(
696
+ custom_intro="""
697
+ Bark coarse acoustics model.
698
+ It shares the same architecture as the semantic (or text) model. It is a GPT-2 like autoregressive model with a
699
+ language modeling head on top.
700
+ """
701
+ )
702
+ class BarkCoarseModel(BarkCausalModel):
703
+ base_model_prefix = "coarse_acoustics"
704
+ config: BarkCoarseConfig
705
+
706
+ def preprocess_histories(
707
+ self,
708
+ max_coarse_history: int,
709
+ semantic_to_coarse_ratio: float,
710
+ batch_size: int,
711
+ semantic_generation_config: BarkSemanticGenerationConfig,
712
+ codebook_size: int,
713
+ history_prompt: Optional[dict[str, torch.Tensor]] = None,
714
+ ):
715
+ """
716
+ Preprocess the optional `Bark` speaker prompts before `self.generate`.
717
+
718
+ Args:
719
+ max_coarse_history (`int`):
720
+ Maximum size of coarse tokens used.
721
+ semantic_to_coarse_ratio (`float`):
722
+ Ratio between the coarse token rate and the semantic token rate, i.e. the number of coarse tokens generated per semantic token.
723
+ batch_size (`int`):
724
+ Batch size, i.e. the number of samples.
725
+ semantic_generation_config (`BarkSemanticGenerationConfig`):
726
+ Generation config indicating how to generate the semantic tokens.
727
+ codebook_size (`int`):
728
+ Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
729
+ history_prompt (`Optional[dict[str,torch.Tensor]]`):
730
+ Optional `Bark` speaker prompt.
731
+ Returns:
732
+ `tuple(torch.FloatTensor)`:
733
+ - **x_semantic_history** (`torch.FloatTensor`) -- Processed semantic speaker prompt.
734
+ - **x_coarse_history** (`torch.FloatTensor`) -- Processed coarse speaker prompt.
735
+ """
736
+ if history_prompt is not None:
737
+ x_semantic_history = torch.repeat_interleave(history_prompt["semantic_prompt"][None], batch_size, dim=0)
738
+ # clone to avoid modifying history_prompt.coarse_prompt
739
+ x_coarse_history = history_prompt["coarse_prompt"].clone()
740
+
741
+ # offset x_coarse_history
742
+ if codebook_size is not None:
743
+ for n in range(1, x_coarse_history.shape[0]):
744
+ # offset
745
+ x_coarse_history[n, :] += codebook_size * n
746
+
747
+ # flatten x_coarse_history
748
+ x_coarse_history = torch.transpose(x_coarse_history, 0, 1).reshape(-1)
749
+
750
+ x_coarse_history = x_coarse_history + semantic_generation_config.semantic_vocab_size
751
+
752
+ x_coarse_history = torch.repeat_interleave(x_coarse_history[None], batch_size, dim=0)
753
+ # e.g: after SEMANTIC_VOCAB_SIZE (10000), 1024 tokens dedicated to first codebook, 1024 next tokens
754
+ # dedicated to second codebook.
755
+
756
+ max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
757
+ # trim histories correctly
758
+ n_semantic_hist_provided = min(
759
+ [
760
+ max_semantic_history,
761
+ x_semantic_history.shape[1] - x_semantic_history.shape[1] % 2,
762
+ int(np.floor(x_coarse_history.shape[1] / semantic_to_coarse_ratio)),
763
+ ]
764
+ )
765
+
766
+ n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio))
767
+
768
+ x_semantic_history = x_semantic_history[:, -n_semantic_hist_provided:].int()
769
+ x_coarse_history = x_coarse_history[:, -n_coarse_hist_provided:].int()
770
+ # bit of a hack for time alignment (sounds better) - from Bark original implementation
771
+ x_coarse_history = x_coarse_history[:, :-2]
772
+
773
+ else:
774
+ # shape: (batch_size, 0)
775
+ x_semantic_history = torch.tensor([[]] * batch_size, dtype=torch.int, device=self.device)
776
+ x_coarse_history = torch.tensor([[]] * batch_size, dtype=torch.int, device=self.device)
777
+
778
+ return x_semantic_history, x_coarse_history
779
+
780
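+ # Worked example of the history trimming above (assuming the typical Bark defaults
+ # coarse_rate_hz=75, semantic_rate_hz=49.9, n_coarse_codebooks=2, max_coarse_history=630):
+ #
+ #   semantic_to_coarse_ratio = 75 / 49.9 * 2            # ~3.006 coarse tokens per semantic token
+ #   max_semantic_history = int(np.floor(630 / 3.006))   # 209 semantic tokens
+ #
+ # so at most ~209 semantic history tokens and ~630 coarse history tokens are kept.
+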
+ def generate(
781
+ self,
782
+ semantic_output: torch.Tensor,
783
+ semantic_generation_config: Optional[BarkSemanticGenerationConfig] = None,
784
+ coarse_generation_config: Optional[BarkCoarseGenerationConfig] = None,
785
+ codebook_size: int = 1024,
786
+ history_prompt: Optional[dict[str, torch.Tensor]] = None,
787
+ return_output_lengths: Optional[bool] = None,
788
+ **kwargs,
789
+ ) -> Union[torch.LongTensor, tuple[torch.LongTensor, torch.LongTensor]]:
790
+ """
791
+ Generates coarse acoustics tokens from input text semantic tokens and an additional optional `Bark` speaker
792
+ prompt.
793
+
794
+ Args:
795
+ semantic_output (`torch.Tensor` of shape (batch_size, seq_len), *optional*):
796
+ Input text semantic ids, i.e. the output of `BarkSemanticModel.generate`.
797
+ semantic_generation_config (`BarkSemanticGenerationConfig`):
798
+ Generation config indicating how to generate the semantic tokens.
799
+ coarse_generation_config (`BarkCoarseGenerationConfig`):
800
+ Generation config indicating how to generate the coarse tokens.
801
+ codebook_size (`int`, *optional*, defaults to 1024):
802
+ Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
803
+ history_prompt (`Optional[dict[str,torch.Tensor]]`, *optional*):
804
+ Optional `Bark` speaker prompt.
805
+ return_output_lengths (`bool`, *optional*):
806
+ Whether or not to return the output lengths. Useful when batching.
807
+ Returns:
808
+ By default:
809
+ torch.LongTensor: Output coarse acoustics tokens.
810
+ If `return_output_lengths=True`:
811
+ `tuple(torch.Tensor, torch.Tensor)`: The output coarse acoustics tokens, and the length of each sample
812
+ of the batch.
813
+ """
814
+
815
+ if semantic_generation_config is None:
816
+ raise ValueError("`semantic_generation_config` has to be provided")
817
+
818
+ if coarse_generation_config is None:
819
+ raise ValueError("`coarse_generation_config` has to be provided")
820
+
821
+ max_coarse_input_length = coarse_generation_config.max_coarse_input_length
822
+ max_coarse_history = coarse_generation_config.max_coarse_history
823
+ sliding_window_len = coarse_generation_config.sliding_window_len
824
+
825
+ # replace semantic_pad_token (eos_tok and pad_tok here) with coarse_semantic_pad_token i.e the pad_token
826
+ # used in the next model
827
+ semantic_output.masked_fill_(
828
+ semantic_output == semantic_generation_config.semantic_pad_token,
829
+ coarse_generation_config.coarse_semantic_pad_token,
830
+ )
831
+
832
+ semantic_to_coarse_ratio = (
833
+ coarse_generation_config.coarse_rate_hz
834
+ / semantic_generation_config.semantic_rate_hz
835
+ * coarse_generation_config.n_coarse_codebooks
836
+ )
837
+ max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
838
+
839
+ output_lengths = (semantic_output != coarse_generation_config.coarse_semantic_pad_token).sum(1)
840
+ output_lengths = torch.floor(
841
+ output_lengths * semantic_to_coarse_ratio / coarse_generation_config.n_coarse_codebooks
842
+ )
843
+ output_lengths = torch.round(output_lengths * coarse_generation_config.n_coarse_codebooks).int()
844
+
845
+ max_generated_len = torch.max(output_lengths).item()
846
+
847
+ batch_size = semantic_output.shape[0]
848
+
849
+ x_semantic_history, x_coarse = self.preprocess_histories(
850
+ history_prompt=history_prompt,
851
+ max_coarse_history=max_coarse_history,
852
+ semantic_to_coarse_ratio=semantic_to_coarse_ratio,
853
+ batch_size=batch_size,
854
+ semantic_generation_config=semantic_generation_config,
855
+ codebook_size=codebook_size,
856
+ )
857
+ base_semantic_idx = x_semantic_history.shape[1]
858
+
859
+ semantic_output = torch.hstack([x_semantic_history, semantic_output])
860
+
861
+ n_window_steps = int(np.ceil(max_generated_len / sliding_window_len))
862
+
863
+ total_generated_len = 0
864
+
865
+ len_coarse_history = x_coarse.shape[1]
866
+
867
+ for _ in range(n_window_steps):
868
+ semantic_idx = base_semantic_idx + int(round(total_generated_len / semantic_to_coarse_ratio))
869
+
870
+ # pad from right side
871
+ input_coarse = semantic_output[:, np.max([0, semantic_idx - max_semantic_history]) :]
872
+ input_coarse = input_coarse[:, :max_coarse_input_length]
873
+ input_coarse = F.pad(
874
+ input_coarse,
875
+ (0, max_coarse_input_length - input_coarse.shape[-1]),
876
+ "constant",
877
+ coarse_generation_config.coarse_semantic_pad_token,
878
+ )
879
+
880
+ input_coarse = torch.hstack(
881
+ [
882
+ input_coarse,
883
+ torch.tensor([[coarse_generation_config.coarse_infer_token]] * batch_size, device=self.device),
884
+ x_coarse[:, -max_coarse_history:],
885
+ ]
886
+ )
887
+
888
+ alternating_logits_processor = AlternatingCodebooksLogitsProcessor(
889
+ input_coarse.shape[1],
890
+ semantic_generation_config.semantic_vocab_size,
891
+ codebook_size,
892
+ )
893
+
894
+ output_coarse = super().generate(
895
+ input_coarse,
896
+ logits_processor=[alternating_logits_processor],
897
+ max_new_tokens=min(sliding_window_len, max_generated_len - total_generated_len),
898
+ generation_config=coarse_generation_config,
899
+ **kwargs,
900
+ )
901
+
902
+ input_coarse_len = input_coarse.shape[1]
903
+
904
+ x_coarse = torch.hstack([x_coarse, output_coarse[:, input_coarse_len:]])
905
+ total_generated_len = x_coarse.shape[1] - len_coarse_history
906
+
907
+ del output_coarse
908
+
909
+ coarse_output = x_coarse[:, len_coarse_history:]
910
+
911
+ if return_output_lengths:
912
+ return coarse_output, output_lengths
913
+
914
+ return coarse_output
915
+
916
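+ # Worked example of the length bookkeeping above (assuming the typical defaults
+ # semantic_to_coarse_ratio ~= 3.006, n_coarse_codebooks=2, sliding_window_len=60):
+ # 513 non-pad semantic tokens give floor(513 * 3.006 / 2) = 771 coarse frames,
+ # i.e. 771 * 2 = 1542 coarse tokens, generated in ceil(1542 / 60) = 26 sliding windows.
+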
+
917
+ @auto_docstring(
918
+ custom_intro="""
919
+ Bark fine acoustics model. It is a non-causal GPT-like model with `config.n_codes_total` embedding layers and
920
+ language modeling heads, one for each codebook.
921
+ """
922
+ )
923
+ class BarkFineModel(BarkPreTrainedModel):
924
+ base_model_prefix = "fine_acoustics"
925
+ config: BarkFineConfig
926
+ main_input_name = "codebook_idx"
927
+
928
+ def __init__(self, config):
929
+ # non-causal gpt-like model with one embedding layer and one lm_head for each codebook of Encodec
930
+ super().__init__(config)
931
+ self.config = config
932
+
933
+ # initialize a modified non causal GPT-like model
934
+ # note that there is one embedding layer and one lm_head for each codebook of Encodec
935
+ self.input_embeds_layers = nn.ModuleList(
936
+ [nn.Embedding(config.input_vocab_size, config.hidden_size) for _ in range(config.n_codes_total)]
937
+ )
938
+ self.position_embeds_layer = nn.Embedding(config.block_size, config.hidden_size)
939
+
940
+ self.drop = nn.Dropout(config.dropout)
941
+
942
+ self.layers = nn.ModuleList(
943
+ [BarkBlock(config, is_causal=False, layer_idx=i) for i in range(config.num_layers)]
944
+ )
945
+
946
+ self.layernorm_final = nn.LayerNorm(config.hidden_size)
947
+
948
+ self.lm_heads = nn.ModuleList(
949
+ [
950
+ nn.Linear(config.hidden_size, config.output_vocab_size, bias=False)
951
+ for _ in range(config.n_codes_given, config.n_codes_total)
952
+ ]
953
+ )
954
+ self.gradient_checkpointing = False
955
+ self.n_codes_total = config.n_codes_total
956
+
957
+ # Initialize weights and apply final processing
958
+ self.post_init()
959
+
960
+ def get_input_embeddings(self):
961
+ # one embedding layer for each codebook
962
+ return self.input_embeds_layers
963
+
964
+ def set_input_embeddings(self, new_embeddings):
965
+ # one embedding layer for each codebook
966
+ self.input_embeds_layers = new_embeddings
967
+
968
+ def get_output_embeddings(self):
969
+ # one lm_head for each codebook
970
+ return self.lm_heads
971
+
972
+ def set_output_embeddings(self, new_output_embeddings):
973
+ # one lm_head for each codebook
974
+ self.lm_heads = new_output_embeddings
975
+
976
+ def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None, mean_resizing=True):
977
+ old_embeddings_list = self.get_input_embeddings()
978
+ new_embeddings_list = nn.ModuleList(
979
+ [
980
+ self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of, mean_resizing)
981
+ for old_embeddings in old_embeddings_list
982
+ ]
983
+ )
984
+ self.set_input_embeddings(new_embeddings_list)
985
+ new_num_tokens = new_embeddings_list[0].weight.shape[0]
986
+
987
+ # if word embeddings are not tied, make sure that lm head is resized as well
988
+ if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings:
989
+ old_lm_head_list = self.get_output_embeddings()
990
+ new_lm_head_list = nn.ModuleList(
991
+ [self._get_resized_lm_head(old_lm_head, new_num_tokens) for old_lm_head in old_lm_head_list]
992
+ )
993
+ self.set_output_embeddings(new_lm_head_list)
994
+
995
+ return self.get_input_embeddings()
996
+
997
+ def resize_token_embeddings(
998
+ self,
999
+ new_num_tokens: Optional[int] = None,
1000
+ pad_to_multiple_of: Optional[int] = None,
1001
+ mean_resizing: bool = True,
1002
+ ) -> nn.Embedding:
1003
+ """
1004
+ Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.
1005
+
1006
+ Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
1007
+
1008
+ Arguments:
1009
+ new_num_tokens (`int`, *optional*):
1010
+ The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
1011
+ vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
1012
+ returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
1013
+ pad_to_multiple_of (`int`, *optional*):
1014
+ If set will pad the embedding matrix to a multiple of the provided value.
1015
+
1016
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
1017
+ `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
1018
+ details about this, or help on choosing the correct value for resizing, refer to this guide:
1019
+ https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
1020
+ mean_resizing (`bool`):
1021
+ Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
1022
+ covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.
1023
+
1024
+ Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
1025
+ where the generated tokens' probabilities won't be affected by the added embeddings because initializing the new embeddings with the
1026
+ old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
1027
+ Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html
1028
+
1029
+ Return:
1030
+ `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
1031
+ """
1032
+ model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
1033
+ if new_num_tokens is None and pad_to_multiple_of is None:
1034
+ return model_embeds
1035
+
1036
+ # Update base model and current model config
1037
+ self.config.output_vocab_size = model_embeds[0].weight.shape[0]
1038
+ self.config.vocab_size = model_embeds[0].weight.shape[0]
1039
+ self.output_vocab_size = model_embeds[0].weight.shape[0]
1040
+ self.vocab_size = model_embeds[0].weight.shape[0]
1041
+
1042
+ # Tie weights again if needed
1043
+ self.tie_weights()
1044
+
1045
+ return model_embeds
1046
+
1047
+ def _tie_weights(self):
1048
+ if getattr(self.config, "tie_word_embeddings", True):
1049
+ self._tied_weights_keys = []
1050
+ output_embeddings = self.get_output_embeddings()
1051
+ input_embeddings = self.get_input_embeddings()
1052
+
1053
+ for i in range(self.config.n_codes_total - self.config.n_codes_given):
1054
+ # self.input_embeds_layers[i + 1].weight = self.lm_heads[i].weight
1055
+ self._tie_or_clone_weights(output_embeddings[i], input_embeddings[i + 1])
1056
+ self._tied_weights_keys.append(f"lm_heads.{i}.weight")
1057
+
1058
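+ # Sketch of the tying pattern above: with the typical fine-model defaults
+ # n_codes_given=1 and n_codes_total=8, each head shares its weight with the *next*
+ # embedding table, i.e.
+ #
+ #   lm_heads[i].weight is input_embeds_layers[i + 1].weight   for i in range(7)
+ #
+ # because codebook 0 has no fine head - it is predicted by the coarse model.
+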
+ def tie_weights(self):
1059
+ """
1060
+ Tie the weights between the input embeddings list and the output embeddings list.
1061
+
1062
+ If the `torchscript` flag is set in the configuration, parameter sharing can't be handled, so we clone the
1063
+ weights instead.
1064
+ """
1065
+ for module in self.modules():
1066
+ if hasattr(module, "_tie_weights"):
1067
+ module._tie_weights()
1068
+
1069
+ @auto_docstring
1070
+ def forward(
1071
+ self,
1072
+ codebook_idx: int, # an additional idx corresponding to the id of the codebook that will be predicted
1073
+ input_ids: Optional[torch.Tensor] = None,
1074
+ attention_mask: Optional[torch.Tensor] = None,
1075
+ position_ids: Optional[torch.Tensor] = None,
1076
+ head_mask: Optional[torch.Tensor] = None,
1077
+ labels: Optional[torch.LongTensor] = None,
1078
+ input_embeds: Optional[torch.Tensor] = None,
1079
+ output_attentions: Optional[bool] = None,
1080
+ output_hidden_states: Optional[bool] = None,
1081
+ return_dict: Optional[bool] = None,
1082
+ ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
1083
+ r"""
1084
+ codebook_idx (`int`):
1085
+ Index of the codebook that will be predicted.
1086
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1087
+ NOT IMPLEMENTED YET.
1088
+ input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
1089
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. If
1090
+ `past_key_values` is used, optionally only the last `input_embeds` have to be input (see
1091
+ `past_key_values`). This is useful if you want more control over how to convert `input_ids` indices into
1092
+ associated vectors than the model's internal embedding lookup matrix.
1093
+ """
1094
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1095
+ output_hidden_states = (
1096
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1097
+ )
1098
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1099
+
1100
+ loss = None
1101
+ if labels is not None:
1102
+ raise NotImplementedError("Training is not implemented yet")
1103
+
1104
+ if codebook_idx == 0:
1105
+ raise ValueError("Cannot predict 0th codebook - 0th codebook should be predicted by the coarse model")
1106
+
1107
+ if input_ids is not None and input_embeds is not None:
1108
+ raise ValueError("You cannot specify both input_ids and input_embeds at the same time")
1109
+
1110
+ if input_ids is None and input_embeds is None:
1111
+ raise ValueError("You have to specify either input_ids or input_embeds")
1112
+
1113
+ if input_ids is not None:
1114
+ # the input_embeddings are the sum of the j previous codebooks embeddings before
1115
+ # the current codebook_idx codebook
1116
+
1117
+ # forward the GPT model itself
1118
+ input_embeds = [
1119
+ input_embeds_layer(input_ids[:, :, i]).unsqueeze(-1)
1120
+ for i, input_embeds_layer in enumerate(self.input_embeds_layers)
1121
+ ] # token embeddings of shape (b, t, n_embd)
1122
+ input_embeds = torch.cat(input_embeds, dim=-1)
1123
+ input_embeds = input_embeds[:, :, :, : codebook_idx + 1].sum(dim=-1)
1124
+
1125
+ input_shape = input_embeds.size()[:-1]
1126
+ batch_size = input_embeds.shape[0]
1127
+ seq_length = input_shape[1]
1128
+
1129
+ device = input_ids.device if input_ids is not None else input_embeds.device
1130
+
1131
+ if position_ids is None:
1132
+ position_ids = torch.arange(0, seq_length, dtype=torch.long, device=device)
1133
+ position_ids = position_ids.unsqueeze(0) # shape (1, seq_length)
1134
+
1135
+ position_embeds = self.position_embeds_layer(position_ids) # position embeddings of shape (1, t, n_embd)
1136
+
1137
+ # Attention mask.
1138
+ if attention_mask is not None:
1139
+ if batch_size <= 0:
1140
+ raise ValueError("batch_size has to be defined and > 0")
1141
+ if self.config._attn_implementation == "flash_attention_2":
1142
+ attention_mask = attention_mask if 0 in attention_mask else None
1143
+ else:
1144
+ # [bsz, to_seq_length] -> [bsz, 1, 1, to_seq_length]
1145
+ # from_seq_length is 1 to easily broadcast
1146
+ attention_mask = _prepare_4d_attention_mask(attention_mask, input_embeds.dtype, tgt_len=1)
1147
+
1148
+ head_mask = self.get_head_mask(head_mask, self.config.num_layers)
1149
+
1150
+ hidden_states = self.drop(input_embeds + position_embeds)
1151
+ output_shape = input_shape + (hidden_states.size(-1),)
1152
+
1153
+ all_self_attentions = () if output_attentions else None
1154
+ all_hidden_states = () if output_hidden_states else None
1155
+
1156
+ for i, block in enumerate(self.layers):
1157
+ if output_hidden_states:
1158
+ all_hidden_states = all_hidden_states + (hidden_states,)
1159
+
1160
+ outputs = block(
1161
+ hidden_states,
1162
+ attention_mask=attention_mask,
1163
+ head_mask=head_mask[i],
1164
+ output_attentions=output_attentions,
1165
+ )
1166
+
1167
+ hidden_states = outputs[0]
1168
+
1169
+ if output_attentions:
1170
+ all_self_attentions = all_self_attentions + (outputs[1],)
1171
+
1172
+ hidden_states = self.layernorm_final(hidden_states)
1173
+ hidden_states = hidden_states.view(output_shape)
1174
+
1175
+ # Add last hidden state
1176
+ if output_hidden_states:
1177
+ all_hidden_states = all_hidden_states + (hidden_states,)
1178
+
1179
+ logits = self.lm_heads[codebook_idx - self.config.n_codes_given](hidden_states)
1180
+
1181
+ if not return_dict:
1182
+ return tuple(v for v in [None, logits, all_hidden_states, all_self_attentions] if v is not None)
1183
+
1184
+ return MaskedLMOutput(
1185
+ loss=loss,
1186
+ logits=logits,
1187
+ hidden_states=all_hidden_states,
1188
+ attentions=all_self_attentions,
1189
+ )
1190
+
1191
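+ # Equivalent sketch of the codebook-sum in `forward` above (illustrative): for
+ # codebook_idx=3, the model input is the sum of the embeddings of codebooks 0..3:
+ #
+ #   embeds = [layer(input_ids[:, :, i]) for i, layer in enumerate(self.input_embeds_layers)]
+ #   input_embeds = torch.stack(embeds, dim=-1)[..., : 3 + 1].sum(dim=-1)
+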
+ @torch.no_grad()
1192
+ def generate(
1193
+ self,
1194
+ coarse_output: torch.Tensor,
1195
+ semantic_generation_config: Optional[BarkSemanticGenerationConfig] = None,
1196
+ coarse_generation_config: Optional[BarkCoarseGenerationConfig] = None,
1197
+ fine_generation_config: Optional[BarkFineGenerationConfig] = None,
1198
+ codebook_size: int = 1024,
1199
+ history_prompt: Optional[dict[str, torch.Tensor]] = None,
1200
+ **kwargs,
1201
+ ) -> torch.LongTensor:
1202
+ """
1203
+ Generates fine acoustics tokens from input coarse acoustics tokens and an additional optional `Bark` speaker
1204
+ prompt.
1205
+
1206
+ Args:
1207
+ coarse_output (`torch.Tensor` of shape (batch_size, seq_len)):
1208
+ Input coarse acoustics ids, i.e the output of `BarkCoarseModel.generate`.
1209
+ semantic_generation_config (`BarkSemanticGenerationConfig`):
1210
+ Generation config indicating how to generate the semantic tokens.
1211
+ coarse_generation_config (`BarkCoarseGenerationConfig`):
1212
+ Generation config indicating how to generate the coarse tokens.
1213
+ fine_generation_config (`BarkFineGenerationConfig`):
1214
+ Generation config indicating how to generate the fine tokens.
1215
+ codebook_size (`int`, *optional*, defaults to 1024):
1216
+ Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
1217
+ history_prompt (`Optional[dict[str,torch.Tensor]]`, *optional*):
1218
+ Optional `Bark` speaker prompt.
1219
+ Returns:
1220
+ torch.LongTensor: Output fine acoustics tokens.
1221
+ """
1222
+ if semantic_generation_config is None:
1223
+ raise ValueError("`semantic_generation_config` has to be provided")
1224
+
1225
+ if coarse_generation_config is None:
1226
+ raise ValueError("`coarse_generation_config` has to be provided")
1227
+
1228
+ if fine_generation_config is None:
1229
+ raise ValueError("`fine_generation_config` has to be provided")
1230
+
1231
+ # since we don't really use GenerationConfig through the fine model (autoencoder)
1232
+ # and since only temperature is used from the classic GenerationConfig parameters
1233
+ # manually impose the kwargs priority over the generation config
1234
+ temperature = kwargs.get("temperature", fine_generation_config.temperature)
1235
+
1236
+ max_fine_history_length = fine_generation_config.max_fine_history_length
1237
+ max_fine_input_length = fine_generation_config.max_fine_input_length
1238
+
1239
+ # shape: (batch, n_coarse_codebooks * seq_len)
1240
+ # new_shape: (batch, seq_len, n_coarse_codebooks)
1241
+ coarse_output = coarse_output.view(coarse_output.shape[0], -1, coarse_generation_config.n_coarse_codebooks)
1242
+
1243
+ # brings ids into the range [0, codebook_size - 1]
1244
+ coarse_output = torch.remainder(coarse_output - semantic_generation_config.semantic_vocab_size, codebook_size)
1245
+ batch_size = coarse_output.shape[0]
1246
+
1247
+ if history_prompt is not None:
1248
+ x_fine_history = torch.repeat_interleave(history_prompt["fine_prompt"].T[None], batch_size, dim=0)
1249
+ # transpose to get to shape (seq_len, n_fine_codebooks)
1250
+ else:
1251
+ x_fine_history = None
1252
+
1253
+ n_coarse = coarse_generation_config.n_coarse_codebooks
1254
+
1255
+ # pad the remaining codebook channels (n_fine_codebooks - n_coarse of them, i.e. 6 by default) with codebook_size
1256
+ fine_input = F.pad(
1257
+ coarse_output,
1258
+ (0, fine_generation_config.n_fine_codebooks - n_coarse),
1259
+ "constant",
1260
+ codebook_size,
1261
+ )
1262
+
1263
+ # prepend history if available (max max_fine_history_length)
1264
+ if x_fine_history is not None:
1265
+ fine_input = torch.cat([x_fine_history[:, -max_fine_history_length:, :], fine_input], dim=1)
1266
+
1267
+ # len of the fine_history that has been added to fine_input
1268
+ n_history = x_fine_history[:, -max_fine_history_length:, :].shape[1]
1269
+ else:
1270
+ n_history = 0
1271
+
1272
+ n_remove_from_end = 0
1273
+ # need to pad if too short (since non-causal model)
1274
+ if fine_input.shape[1] < max_fine_input_length:
1275
+ n_remove_from_end = max_fine_input_length - fine_input.shape[1]
1276
+ fine_input = F.pad(fine_input, (0, 0, 0, n_remove_from_end), mode="constant", value=codebook_size)
1277
+
1278
+ # we can be lazy about the fractional loop and just keep overwriting codebooks:
+ # coarse_output.shape[1] - (max_fine_input_length - n_history) equals -n_remove_from_end,
+ # so if we had to pad because the input was too short, n_loops is always 1 (since n_remove_from_end > 0);
+ # otherwise we loop at least twice.
1282
+
1283
+ n_loops = (coarse_output.shape[1] - (max_fine_input_length - n_history)) / max_fine_history_length
1284
+ n_loops = int(np.ceil(n_loops))
1285
+ n_loops = max(0, n_loops) + 1
1286
+
1287
+ for n_outer in range(n_loops):
1288
+ start_idx = min([n_outer * max_fine_history_length, fine_input.shape[1] - max_fine_input_length])
1289
+
1290
+ start_fill_idx = min(
1291
+ [n_history + n_outer * max_fine_history_length, fine_input.shape[1] - max_fine_history_length]
1292
+ )
1293
+ rel_start_fill_idx = start_fill_idx - start_idx
1294
+ input_buffer = fine_input[:, start_idx : start_idx + max_fine_input_length, :]
1295
+ for n_inner in range(n_coarse, fine_generation_config.n_fine_codebooks):
1296
+ logits = self.forward(n_inner, input_buffer).logits
1297
+ if temperature is None or temperature == 1.0:
1298
+ relevant_logits = logits[:, rel_start_fill_idx:, :codebook_size]
1299
+ codebook_preds = torch.argmax(relevant_logits, -1)
1300
+ else:
1301
+ relevant_logits = logits[:, :, :codebook_size] / temperature
1302
+ # apply softmax
1303
+ probs = F.softmax(relevant_logits, dim=-1)[:, rel_start_fill_idx:max_fine_input_length]
1304
+ # reshape to 2D: (batch_size, seq_len, codebook_size) -> (batch_size*seq_len, codebook_size)
1305
+ probs = probs.reshape((-1, codebook_size))
1306
+ # multinomial then reshape : (batch_size*seq_len)-> (batch_size,seq_len)
1307
+ codebook_preds = torch.multinomial(probs, num_samples=1).view(batch_size, -1)
1308
+ codebook_preds = codebook_preds.to(torch.int32)
1309
+ input_buffer[:, rel_start_fill_idx:, n_inner] = codebook_preds
1310
+ del logits, codebook_preds
1311
+
1312
+ # transfer into fine_input
1313
+ for n_inner in range(n_coarse, fine_generation_config.n_fine_codebooks):
1314
+ fine_input[
1315
+ :, start_fill_idx : start_fill_idx + (max_fine_input_length - rel_start_fill_idx), n_inner
1316
+ ] = input_buffer[:, rel_start_fill_idx:, n_inner]
1317
+ del input_buffer
1318
+
1319
+ fine_input = fine_input.transpose(1, 2)[:, :, n_history:]
1320
+ if n_remove_from_end > 0:
1321
+ fine_input = fine_input[:, :, :-n_remove_from_end]
1322
+
1323
+ if fine_input.shape[-1] != coarse_output.shape[-2]:
1324
+ raise ValueError("input and output should have the same seq_len")
1325
+
1326
+ return fine_input
1327
+
1328
+
1329
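+ # Worked example of the windowing above (assuming the typical defaults
+ # max_fine_history_length=512 and max_fine_input_length=1024): continuing the
+ # 771-frame case with no speaker history (n_history=0), the input is padded by
+ # 1024 - 771 = 253 frames and
+ #   n_loops = max(0, ceil((771 - 1024) / 512)) + 1 = 1,
+ # i.e. a single window, consistent with the comment above about padded inputs.
+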
+ @auto_docstring(
1330
+ custom_intro="""
1331
+ The full Bark model, a text-to-speech model composed of 4 sub-models:
1332
+ - [`BarkSemanticModel`] (also referred to as the 'text' model): a causal autoregressive transformer that takes
+ tokenized text as input and predicts semantic text tokens capturing the meaning of the text.
+ - [`BarkCoarseModel`] (also referred to as the 'coarse acoustics' model): also a causal autoregressive transformer,
+ taking as input the output of the previous model. It regresses the first two audio codebooks required by `encodec`.
+ - [`BarkFineModel`] (the 'fine acoustics' model): this time a non-causal autoencoder transformer, which iteratively
+ predicts the remaining codebooks based on the sum of the previous codebooks' embeddings.
+ - once all the codebook channels are predicted, Bark uses the [`EncodecModel`] to decode the output audio array.
+
+ It should be noted that each of the first three modules can support conditional speaker embeddings to condition the
+ output sound according to a specific predefined voice.
1345
+ """
1346
+ )
1347
+ class BarkModel(BarkPreTrainedModel):
1348
+ config: BarkConfig
1349
+
1350
+ def __init__(self, config):
1351
+ super().__init__(config)
1352
+
1353
+ self.semantic = BarkSemanticModel(config.semantic_config)
1354
+ self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config)
1355
+ self.fine_acoustics = BarkFineModel(config.fine_acoustics_config)
1356
+
1357
+ self.codec_model = AutoModel.from_config(config.codec_config)
1358
+
1359
+ self.config = config
1360
+
1361
+ @classmethod
1362
+ def can_generate(cls) -> bool:
1363
+ # Bark has a unique model structure, where the external class (`BarkModel`) doesn't need to inherit from
1364
+ # `GenerationMixin` (it has a non-standard generation method), but one of the internal models do
1365
+ # (`BarkSemanticModel`). This means that the base `can_generate()` will return `False`, but we need to
1366
+ # override it so as to do `GenerationConfig` handling in multiple parts of the codebase.
1367
+ return True
1368
+
1369
+ @property
1370
+ def device(self) -> torch.device:
1371
+ """
1372
+ `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
1373
+ device).
1374
+ """
1375
+ # for bark_model, device must be verified on its sub-models
1376
+ # if has _hf_hook, has been offloaded so the device has to be found in the hook
1377
+ if not hasattr(self.semantic, "_hf_hook"):
1378
+ return get_parameter_device(self)
1379
+ for module in self.semantic.modules():
1380
+ if (
1381
+ hasattr(module, "_hf_hook")
1382
+ and hasattr(module._hf_hook, "execution_device")
1383
+ and module._hf_hook.execution_device is not None
1384
+ ):
1385
+ return torch.device(module._hf_hook.execution_device)
+
+ # fall back to the parameter device if no hook exposes an execution device
+ # (mirrors the fallback in `BarkCausalModel.device`)
+ return get_parameter_device(self)
1386
+
1387
+ def enable_cpu_offload(
1388
+ self,
1389
+ accelerator_id: Optional[int] = 0,
1390
+ **kwargs,
1391
+ ):
1392
+ r"""
1393
+ Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
1394
+ method moves one whole sub-model at a time to the accelerator when it is used, and the sub-model remains on the accelerator until the next sub-model runs.
1395
+
1396
+ Args:
1397
+ accelerator_id (`int`, *optional*, defaults to 0):
1398
+ accelerator id on which the sub-models will be loaded and offloaded. This argument is deprecated.
1399
+ kwargs (`dict`, *optional*):
1400
+ additional keyword arguments:
1401
+ `gpu_id`: accelerator id on which the sub-models will be loaded and offloaded.
1402
+ """
1403
+ if is_accelerate_available():
1404
+ from accelerate import cpu_offload_with_hook
1405
+ else:
1406
+ raise ImportError("`enable_model_cpu_offload` requires `accelerate`.")
1407
+
1408
+ gpu_id = kwargs.get("gpu_id", 0)
1409
+
1410
+ if gpu_id != 0:
1411
+ warnings.warn(
1412
+ "The argument `gpu_id` is deprecated and will be removed in version 4.54.0 of Transformers. Please use `accelerator_id` instead.",
1413
+ FutureWarning,
1414
+ )
1415
+ accelerator_id = gpu_id
1416
+
1417
+ device_type = "cuda"
1418
+ if is_torch_accelerator_available():
1419
+ device_type = torch.accelerator.current_accelerator().type
1420
+ device = torch.device(f"{device_type}:{accelerator_id}")
1421
+
1422
+ torch_accelerator_module = getattr(torch, device_type)
1423
+ if self.device.type != "cpu":
1424
+ self.to("cpu")
1425
+ torch_accelerator_module.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
1426
+
1427
+ # this layer is used outside the first forward pass of semantic so need to be loaded before semantic
1428
+ self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device)
1429
+
1430
+ hook = None
1431
+ for cpu_offloaded_model in [
1432
+ self.semantic,
1433
+ self.coarse_acoustics,
1434
+ self.fine_acoustics,
1435
+ ]:
1436
+ _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
1437
+
1438
+ self.fine_acoustics_hook = hook
1439
+
1440
+ _, hook = cpu_offload_with_hook(self.codec_model, device, prev_module_hook=hook)
1441
+
1442
+ # We'll offload the last model manually.
1443
+ self.codec_model_hook = hook
1444
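+ # Minimal usage sketch (requires `accelerate` to be installed):
+ #
+ #   model = BarkModel.from_pretrained("suno/bark-small")
+ #   model.enable_cpu_offload()          # sub-models hop to the accelerator on demand
+ #   audio = model.generate(**inputs)    # peak memory ~ one sub-model at a time
+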
+
1445
+ def codec_decode(self, fine_output, output_lengths=None):
1446
+ """Turn quantized audio codes into audio array using encodec."""
1447
+
1448
+ fine_output = fine_output.transpose(0, 1)
1449
+ emb = self.codec_model.quantizer.decode(fine_output)
1450
+
1451
+ if output_lengths is not None:
1452
+ # encodec uses LSTMs which behaves differently with appended padding
1453
+ # decoding with encodec takes around 0.1% of the total generation time
1454
+ # to keep generation quality, we break batching
1455
+ out = [sample[:, :l].unsqueeze(0) for (sample, l) in zip(emb, output_lengths)]
1456
+ audio_arr = [self.codec_model.decoder(sample).squeeze() for sample in out]
1457
+ else:
1458
+ out = self.codec_model.decoder(emb)
1459
+ audio_arr = out.squeeze(1) # squeeze the codebook dimension
1460
+
1461
+ return audio_arr
1462
+
1463
+ @torch.no_grad()
1464
+ def generate(
1465
+ self,
1466
+ input_ids: Optional[torch.Tensor] = None,
1467
+ history_prompt: Optional[dict[str, torch.Tensor]] = None,
1468
+ return_output_lengths: Optional[bool] = None,
1469
+ **kwargs,
1470
+ ) -> torch.LongTensor:
1471
+ """
1472
+ Generates audio from an input prompt and an additional optional `Bark` speaker prompt.
1473
+
1474
+ Args:
1475
+ input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
1476
+ Input ids. Will be truncated up to 256 tokens. Note that the output audios will be as long as the
1477
+ longest generation among the batch.
1478
+ history_prompt (`Optional[dict[str,torch.Tensor]]`, *optional*):
1479
+ Optional `Bark` speaker prompt. Note that for now, this model takes only one speaker prompt per batch.
1480
+ kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types:
1481
+
1482
+ - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model.
1483
+ - With a *semantic_*, *coarse_*, *fine_* prefix, they will be input for the `generate` method of the
1484
+ semantic, coarse and fine sub-models respectively. They take priority over the keywords without a prefix.
1485
+
1486
+ This means you can, for example, specify a generation strategy for all sub-models except one.
1487
+ return_output_lengths (`bool`, *optional*):
1488
+ Whether or not to return the waveform lengths. Useful when batching.
1489
+ Returns:
1490
+ By default:
1491
+ - **audio_waveform** (`torch.Tensor` of shape (batch_size, seq_len)): Generated audio waveform.
1492
+ When `return_output_lengths=True`:
1493
+ Returns a tuple made of:
1494
+ - **audio_waveform** (`torch.Tensor` of shape (batch_size, seq_len)): Generated audio waveform.
1495
+ - **output_lengths** (`torch.Tensor` of shape (batch_size)): The length of each waveform in the batch
1496
+ Example:
1497
+
1498
+ ```python
1499
+ >>> from transformers import AutoProcessor, BarkModel
1500
+
1501
+ >>> processor = AutoProcessor.from_pretrained("suno/bark-small")
1502
+ >>> model = BarkModel.from_pretrained("suno/bark-small")
1503
+
1504
+ >>> # To add a voice preset, you can pass `voice_preset` to `BarkProcessor.__call__(...)`
1505
+ >>> voice_preset = "v2/en_speaker_6"
1506
+
1507
+ >>> inputs = processor("Hello, my dog is cute, I need him in my life", voice_preset=voice_preset)
1508
+
1509
+ >>> audio_array = model.generate(**inputs, semantic_max_new_tokens=100)
1510
+ >>> audio_array = audio_array.cpu().numpy().squeeze()
1511
+ ```
1512
+ """
1513
+ # TODO (joao): workaround until nested generation config is compatible with PreTrainedModel
1514
+ # todo: dict
1515
+ semantic_generation_config = BarkSemanticGenerationConfig(**self.generation_config.semantic_config)
1516
+ coarse_generation_config = BarkCoarseGenerationConfig(**self.generation_config.coarse_acoustics_config)
1517
+ fine_generation_config = BarkFineGenerationConfig(**self.generation_config.fine_acoustics_config)
1518
+
1519
+ kwargs_semantic = {
1520
+ # if "attention_mask" is set, it should not be passed to CoarseModel and FineModel
1521
+ "attention_mask": kwargs.pop("attention_mask", None),
1522
+ "min_eos_p": kwargs.pop("min_eos_p", None),
1523
+ }
1524
+ kwargs_coarse = {}
1525
+ kwargs_fine = {}
1526
+ for key, value in kwargs.items():
1527
+ if key.startswith("semantic_"):
1528
+ key = key[len("semantic_") :]
1529
+ kwargs_semantic[key] = value
1530
+ elif key.startswith("coarse_"):
1531
+ key = key[len("coarse_") :]
1532
+ kwargs_coarse[key] = value
1533
+ elif key.startswith("fine_"):
1534
+ key = key[len("fine_") :]
1535
+ kwargs_fine[key] = value
1536
+ else:
1537
+ # If the key is already in a specific config, then it's been set with a
1538
+ # submodule-specific value and we don't override it
1539
+ if key not in kwargs_semantic:
1540
+ kwargs_semantic[key] = value
1541
+ if key not in kwargs_coarse:
1542
+ kwargs_coarse[key] = value
1543
+ if key not in kwargs_fine:
1544
+ kwargs_fine[key] = value
1545
+
1546
+ # 1. Generate from the semantic model
1547
+ if "generation_config" in kwargs_semantic:
1548
+ kwargs_semantic.pop("generation_config")
1549
+ semantic_output = self.semantic.generate(
1550
+ input_ids,
1551
+ history_prompt=history_prompt,
1552
+ semantic_generation_config=semantic_generation_config,
1553
+ **kwargs_semantic,
1554
+ )
1555
+
1556
+ # 2. Generate from the coarse model
1557
+ if "generation_config" in kwargs_coarse:
1558
+ kwargs_coarse.pop("generation_config")
1559
+ coarse_output = self.coarse_acoustics.generate(
1560
+ semantic_output,
1561
+ history_prompt=history_prompt,
1562
+ semantic_generation_config=semantic_generation_config,
1563
+ coarse_generation_config=coarse_generation_config,
1564
+ codebook_size=self.generation_config.codebook_size,
1565
+ return_output_lengths=return_output_lengths,
1566
+ **kwargs_coarse,
1567
+ )
1568
+
1569
+ output_lengths = None
1570
+ if return_output_lengths:
1571
+ coarse_output, output_lengths = coarse_output
1572
+ # (batch_size, seq_len*coarse_codebooks) -> (batch_size, seq_len)
1573
+ output_lengths = output_lengths // coarse_generation_config.n_coarse_codebooks
1574
+
1575
+ # 3. "generate" from the fine model
1576
+ if "generation_config" in kwargs_fine:
1577
+ kwargs_fine.pop("generation_config")
1578
+ output = self.fine_acoustics.generate(
1579
+ coarse_output,
1580
+ history_prompt=history_prompt,
1581
+ semantic_generation_config=semantic_generation_config,
1582
+ coarse_generation_config=coarse_generation_config,
1583
+ fine_generation_config=fine_generation_config,
1584
+ codebook_size=self.generation_config.codebook_size,
1585
+ **kwargs_fine,
1586
+ )
1587
+
1588
+ if getattr(self, "fine_acoustics_hook", None) is not None:
1589
+ # Manually offload fine_acoustics to CPU
1590
+ # and load codec_model to GPU
1591
+ # since bark doesn't use codec_model forward pass
1592
+ self.fine_acoustics_hook.offload()
1593
+ self.codec_model = self.codec_model.to(self.device)
1594
+
1595
+ # 4. Decode the output and generate audio array
1596
+ audio = self.codec_decode(output, output_lengths)
1597
+
1598
+ if getattr(self, "codec_model_hook", None) is not None:
1599
+ # Offload codec_model to CPU
1600
+ self.codec_model_hook.offload()
1601
+
1602
+ if return_output_lengths:
1603
+ output_lengths = [len(sample) for sample in audio]
1604
+ audio = nn.utils.rnn.pad_sequence(audio, batch_first=True, padding_value=0)
1605
+ return audio, output_lengths
1606
+
1607
+ return audio
1608
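+ # Sketch of the kwarg routing above: un-prefixed kwargs go to every sub-model,
+ # while prefixed ones win for their own sub-model, e.g.
+ #
+ #   model.generate(**inputs, temperature=0.7, coarse_temperature=0.9)
+ #
+ # runs the semantic and fine stages at temperature 0.7 and the coarse stage at 0.9.
+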
+
1609
+ def tie_weights(self):
1610
+ """
1611
+ Tie the weights between the input embeddings list and the output embeddings list.
1612
+
1613
+ If the `torchscript` flag is set in the configuration, parameter sharing can't be handled, so we clone the
1614
+ weights instead.
1615
+ """
1616
+ for module in self.modules():
1617
+ if hasattr(module, "_tie_weights"):
1618
+ module._tie_weights()
1619
+
1620
+
1621
+ __all__ = [
1622
+ "BarkFineModel",
1623
+ "BarkSemanticModel",
1624
+ "BarkCoarseModel",
1625
+ "BarkModel",
1626
+ "BarkPreTrainedModel",
1627
+ "BarkCausalModel",
1628
+ ]
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bark/processing_bark.py ADDED
@@ -0,0 +1,340 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The Suno AI Authors and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for Bark
17
+ """
18
+
19
+ import json
20
+ import os
21
+ from typing import Optional
22
+
23
+ import numpy as np
24
+
25
+ from ...feature_extraction_utils import BatchFeature
26
+ from ...processing_utils import ProcessorMixin
27
+ from ...tokenization_utils_base import BatchEncoding
28
+ from ...utils import logging
29
+ from ...utils.hub import cached_file
30
+ from ..auto import AutoTokenizer
31
+
32
+
33
+ logger = logging.get_logger(__name__)
34
+
35
+
36
+ class BarkProcessor(ProcessorMixin):
37
+ r"""
38
+ Constructs a Bark processor which wraps a text tokenizer and optional Bark voice presets into a single processor.
39
+
40
+ Args:
41
+ tokenizer ([`PreTrainedTokenizer`]):
42
+ An instance of [`PreTrainedTokenizer`].
43
+ speaker_embeddings (`dict[dict[str]]`, *optional*):
44
+ Optional nested speaker embeddings dictionary. The first level contains voice preset names (e.g
45
+ `"en_speaker_4"`). The second level contains `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`
46
+ embeddings. The values correspond to the path of the corresponding `np.ndarray`. See
47
+ [here](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c) for
48
+ a list of `voice_preset_names`.
49
+
50
+ """
51
+
52
+ tokenizer_class = "AutoTokenizer"
53
+ attributes = ["tokenizer"]
54
+
55
+ preset_shape = {
56
+ "semantic_prompt": 1, # 1D array of shape (X,)
57
+ "coarse_prompt": 2, # 2D array of shape (2,X)
58
+ "fine_prompt": 2, # 2D array of shape (8,X)
59
+ }
60
+
61
+ def __init__(self, tokenizer, speaker_embeddings=None):
62
+ super().__init__(tokenizer)
63
+
64
+ self.speaker_embeddings = speaker_embeddings
65
+
66
+ @classmethod
67
+ def from_pretrained(
68
+ cls, pretrained_processor_name_or_path, speaker_embeddings_dict_path="speaker_embeddings_path.json", **kwargs
69
+ ):
70
+ r"""
71
+ Instantiate a Bark processor associated with a pretrained model.
72
+
73
+ Args:
74
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
75
+ This can be either:
76
+
77
+ - a string, the *model id* of a pretrained [`BarkProcessor`] hosted inside a model repo on
78
+ huggingface.co.
79
+ - a path to a *directory* containing a processor saved using the [`~BarkProcessor.save_pretrained`]
80
+ method, e.g., `./my_model_directory/`.
81
+ speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
82
+ The name of the `.json` file containing the speaker_embeddings dictionary located in
83
+ `pretrained_model_name_or_path`. If `None`, no speaker_embeddings is loaded.
84
+ **kwargs
85
+ Additional keyword arguments passed along to both
86
+ [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
87
+ """
88
+
89
+ if speaker_embeddings_dict_path is not None:
90
+ speaker_embeddings_path = cached_file(
91
+ pretrained_processor_name_or_path,
92
+ speaker_embeddings_dict_path,
93
+ subfolder=kwargs.pop("subfolder", None),
94
+ cache_dir=kwargs.pop("cache_dir", None),
95
+ force_download=kwargs.pop("force_download", False),
96
+ proxies=kwargs.pop("proxies", None),
97
+ resume_download=kwargs.pop("resume_download", None),
98
+ local_files_only=kwargs.pop("local_files_only", False),
99
+ token=kwargs.pop("use_auth_token", None),
100
+ revision=kwargs.pop("revision", None),
101
+ _raise_exceptions_for_gated_repo=False,
102
+ _raise_exceptions_for_missing_entries=False,
103
+ _raise_exceptions_for_connection_errors=False,
104
+ )
105
+ if speaker_embeddings_path is None:
106
+ logger.warning(
107
+ f"""`{os.path.join(pretrained_processor_name_or_path, speaker_embeddings_dict_path)}` does not exists
108
+ , no preloaded speaker embeddings will be used - Make sure to provide a correct path to the json
109
+ dictionary if wanted, otherwise set `speaker_embeddings_dict_path=None`."""
110
+ )
111
+ speaker_embeddings = None
112
+ else:
113
+ with open(speaker_embeddings_path) as speaker_embeddings_json:
114
+ speaker_embeddings = json.load(speaker_embeddings_json)
115
+ else:
116
+ speaker_embeddings = None
117
+
118
+ if speaker_embeddings is not None:
119
+ if "repo_or_path" in speaker_embeddings:
120
+ speaker_embeddings["repo_or_path"] = pretrained_processor_name_or_path
121
+ tokenizer = AutoTokenizer.from_pretrained(pretrained_processor_name_or_path, **kwargs)
122
+
123
+ return cls(tokenizer=tokenizer, speaker_embeddings=speaker_embeddings)
124
+
125
+ def save_pretrained(
126
+ self,
127
+ save_directory,
128
+ speaker_embeddings_dict_path="speaker_embeddings_path.json",
129
+ speaker_embeddings_directory="speaker_embeddings",
130
+ push_to_hub: bool = False,
131
+ **kwargs,
132
+ ):
133
+ """
134
+ Saves the attributes of this processor (tokenizer...) in the specified directory so that it can be reloaded
135
+ using the [`~BarkProcessor.from_pretrained`] method.
136
+
137
+ Args:
138
+ save_directory (`str` or `os.PathLike`):
139
+ Directory where the tokenizer files and the speaker embeddings will be saved (directory will be created
140
+ if it does not exist).
141
+ speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
142
+ The name of the `.json` file that will contains the speaker_embeddings nested path dictionary, if it
143
+ exists, and that will be located in `pretrained_model_name_or_path/speaker_embeddings_directory`.
144
+ speaker_embeddings_directory (`str`, *optional*, defaults to `"speaker_embeddings/"`):
145
+ The name of the folder in which the speaker_embeddings arrays will be saved.
146
+ push_to_hub (`bool`, *optional*, defaults to `False`):
147
+ Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
148
+ repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
149
+ namespace).
150
+ kwargs:
151
+ Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
152
+ """
153
+ if self.speaker_embeddings is not None:
154
+ os.makedirs(os.path.join(save_directory, speaker_embeddings_directory, "v2"), exist_ok=True)
155
+
156
+ embeddings_dict = {}
157
+
158
+ embeddings_dict["repo_or_path"] = save_directory
159
+
160
+ for prompt_key in self.available_voice_presets:
161
+ voice_preset = self._load_voice_preset(prompt_key)
162
+
163
+ tmp_dict = {}
164
+ for key in self.speaker_embeddings[prompt_key]:
165
+ np.save(
166
+ os.path.join(
167
+ embeddings_dict["repo_or_path"], speaker_embeddings_directory, f"{prompt_key}_{key}"
168
+ ),
169
+ voice_preset[key],
170
+ allow_pickle=False,
171
+ )
172
+ tmp_dict[key] = os.path.join(speaker_embeddings_directory, f"{prompt_key}_{key}.npy")
173
+
174
+ embeddings_dict[prompt_key] = tmp_dict
175
+
176
+ with open(os.path.join(save_directory, speaker_embeddings_dict_path), "w") as fp:
177
+ json.dump(embeddings_dict, fp)
178
+
179
+ super().save_pretrained(save_directory, push_to_hub, **kwargs)
180
+
181
+ def _load_voice_preset(self, voice_preset: Optional[str] = None, **kwargs):
182
+ voice_preset_paths = self.speaker_embeddings[voice_preset]
183
+
184
+ voice_preset_dict = {}
185
+ for key in ["semantic_prompt", "coarse_prompt", "fine_prompt"]:
186
+ if key not in voice_preset_paths:
187
+ raise ValueError(
188
+ f"Voice preset unrecognized, missing {key} as a key in self.speaker_embeddings[{voice_preset}]."
189
+ )
190
+
191
+ path = cached_file(
192
+ self.speaker_embeddings.get("repo_or_path", "/"),
193
+ voice_preset_paths[key],
194
+ subfolder=kwargs.pop("subfolder", None),
195
+ cache_dir=kwargs.pop("cache_dir", None),
196
+ force_download=kwargs.pop("force_download", False),
197
+ proxies=kwargs.pop("proxies", None),
198
+ resume_download=kwargs.pop("resume_download", None),
199
+ local_files_only=kwargs.pop("local_files_only", False),
200
+ token=kwargs.pop("use_auth_token", None),
201
+ revision=kwargs.pop("revision", None),
202
+ _raise_exceptions_for_gated_repo=False,
203
+ _raise_exceptions_for_missing_entries=False,
204
+ _raise_exceptions_for_connection_errors=False,
205
+ )
206
+ if path is None:
207
+ raise ValueError(
208
+ f"""`{os.path.join(self.speaker_embeddings.get("repo_or_path", "/"), voice_preset_paths[key])}` does not exists
209
+ , no preloaded voice preset will be used - Make sure to provide correct paths to the {voice_preset}
210
+ embeddings."""
211
+ )
212
+
213
+ voice_preset_dict[key] = np.load(path)
214
+
215
+ return voice_preset_dict
216
+
217
+ def _validate_voice_preset_dict(self, voice_preset: Optional[dict] = None):
218
+ for key in ["semantic_prompt", "coarse_prompt", "fine_prompt"]:
219
+ if key not in voice_preset:
220
+ raise ValueError(f"Voice preset unrecognized, missing {key} as a key.")
221
+
222
+ if not isinstance(voice_preset[key], np.ndarray):
223
+ raise TypeError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.")
224
+
225
+ if len(voice_preset[key].shape) != self.preset_shape[key]:
226
+ raise ValueError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.")
227
+
228
+ @property
229
+ def available_voice_presets(self) -> list:
230
+ """
231
+ Returns a list of available voice presets.
232
+
233
+ Returns:
234
+ `list[str]`: A list of voice preset names.
235
+ """
236
+ if self.speaker_embeddings is None:
237
+ return []
238
+
239
+ voice_presets = list(self.speaker_embeddings.keys())
240
+ if "repo_or_path" in voice_presets:
241
+ voice_presets.remove("repo_or_path")
242
+ return voice_presets
243
+
244
+ def _verify_speaker_embeddings(self, remove_unavailable: bool = True):
245
+ # check which actually downloaded properly / are available
246
+ unavailable_keys = []
247
+ if self.speaker_embeddings is not None:
248
+ for voice_preset in self.available_voice_presets:
249
+ try:
250
+ voice_preset_dict = self._load_voice_preset(voice_preset)
251
+ except ValueError:
252
+ # error from `_load_voice_preset` of path not existing
253
+ unavailable_keys.append(voice_preset)
254
+ continue
255
+ self._validate_voice_preset_dict(voice_preset_dict)
256
+
257
+ if unavailable_keys:
258
+ logger.warning(
259
+ f"The following {len(unavailable_keys)} speaker embeddings are not available: {unavailable_keys} "
260
+ "If you would like to use them, please check the paths or try downloading them again."
261
+ )
262
+
263
+ if remove_unavailable:
264
+ for voice_preset in unavailable_keys:
265
+ del self.speaker_embeddings[voice_preset]
266
+
267
+ def __call__(
268
+ self,
269
+ text=None,
270
+ voice_preset=None,
271
+ return_tensors="pt",
272
+ max_length=256,
273
+ add_special_tokens=False,
274
+ return_attention_mask=True,
275
+ return_token_type_ids=False,
276
+ **kwargs,
277
+ ) -> BatchEncoding:
278
+ """
279
+ Main method to prepare for the model one or several sequences(s). This method forwards the `text` and `kwargs`
280
+ arguments to the AutoTokenizer's [`~AutoTokenizer.__call__`] to encode the text. The method also proposes a
281
+ voice preset which is a dictionary of arrays that conditions `Bark`'s output. `kwargs` arguments are forwarded
282
+ to the tokenizer and to `cached_file` method if `voice_preset` is a valid filename.
283
+
284
+ Args:
285
+ text (`str`, `list[str]`, `list[list[str]]`):
286
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
287
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
288
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
289
+ voice_preset (`str`, `dict[np.ndarray]`):
290
+ The voice preset, i.e the speaker embeddings. It can either be a valid voice_preset name, e.g
291
+ `"en_speaker_1"`, or directly a dictionary of `np.ndarray` embeddings for each submodel of `Bark`. Or
292
+ it can be a valid file name of a local `.npz` single voice preset containing the keys
293
+ `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`.
294
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
295
+ If set, will return tensors of a particular framework. Acceptable values are:
296
+
297
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
298
+ - `'np'`: Return NumPy `np.ndarray` objects.
299
+
300
+ Returns:
301
+ [`BatchEncoding`]: A [`BatchEncoding`] object containing the output of the `tokenizer`.
302
+ If a voice preset is provided, the returned object will include a `"history_prompt"` key
303
+ containing a [`BatchFeature`], i.e the voice preset with the right tensors type.
304
+ """
305
+ if voice_preset is not None and not isinstance(voice_preset, dict):
306
+ if (
307
+ isinstance(voice_preset, str)
308
+ and self.speaker_embeddings is not None
309
+ and voice_preset in self.speaker_embeddings
310
+ ):
311
+ voice_preset = self._load_voice_preset(voice_preset)
312
+
313
+ else:
314
+ if isinstance(voice_preset, str) and not voice_preset.endswith(".npz"):
315
+ voice_preset = voice_preset + ".npz"
316
+
317
+ voice_preset = np.load(voice_preset)
318
+
319
+ if voice_preset is not None:
320
+ self._validate_voice_preset_dict(voice_preset, **kwargs)
321
+ voice_preset = BatchFeature(data=voice_preset, tensor_type=return_tensors)
322
+
323
+ encoded_text = self.tokenizer(
324
+ text,
325
+ return_tensors=return_tensors,
326
+ padding="max_length",
327
+ max_length=max_length,
328
+ return_attention_mask=return_attention_mask,
329
+ return_token_type_ids=return_token_type_ids,
330
+ add_special_tokens=add_special_tokens,
331
+ **kwargs,
332
+ )
333
+
334
+ if voice_preset is not None:
335
+ encoded_text["history_prompt"] = voice_preset
336
+
337
+ return encoded_text
338
+
339
+
340
+ __all__ = ["BarkProcessor"]
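A minimal usage sketch of the processor above; the model id and preset name are assumed identifiers for illustration, and any Bark checkpoint bundling speaker embeddings should behave the same way:

```python
from transformers import BarkProcessor

# "suno/bark-small" and "en_speaker_3" are assumed identifiers for illustration.
processor = BarkProcessor.from_pretrained("suno/bark-small")
inputs = processor("Hello, my dog is cute", voice_preset="en_speaker_3")

print(inputs["input_ids"].shape)        # padded to max_length=256 by default
print(inputs["history_prompt"].keys())  # semantic/coarse/fine prompts as tensors
```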
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bert/__init__.py ADDED
@@ -0,0 +1,32 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_bert import *
+    from .modeling_bert import *
+    from .modeling_flax_bert import *
+    from .modeling_tf_bert import *
+    from .tokenization_bert import *
+    from .tokenization_bert_fast import *
+    from .tokenization_bert_tf import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
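The `_LazyModule` pattern above defers heavy imports until an attribute is first accessed, while the `TYPE_CHECKING` branch keeps static type checkers happy. A minimal standalone sketch of the same idea (names here are illustrative, not the transformers implementation):

```python
import importlib
import types


class LazyModule(types.ModuleType):
    """Illustrative stand-in: resolve submodule attributes on first access."""

    def __init__(self, name, attr_to_submodule):
        super().__init__(name)
        self._attr_to_submodule = attr_to_submodule

    def __getattr__(self, attr):
        # The real implementation maps a KeyError here to AttributeError.
        submodule = self._attr_to_submodule[attr]
        module = importlib.import_module(submodule)
        value = getattr(module, attr)
        setattr(self, attr, value)  # cache so later lookups skip __getattr__
        return value
```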
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bert/configuration_bert.py ADDED
@@ -0,0 +1,154 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class BertConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`BertModel`] or a [`TFBertModel`]. It is used to
+    instantiate a BERT model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the BERT
+    [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+
+    Examples:
+
+    ```python
+    >>> from transformers import BertConfig, BertModel
+
+    >>> # Initializing a BERT google-bert/bert-base-uncased style configuration
+    >>> configuration = BertConfig()
+
+    >>> # Initializing a model (with random weights) from the google-bert/bert-base-uncased style configuration
+    >>> model = BertModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "bert"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=0,
+        position_embedding_type="absolute",
+        use_cache=True,
+        classifier_dropout=None,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+
+
+class BertOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+                ("token_type_ids", dynamic_axis),
+            ]
+        )
+
+
+__all__ = ["BertConfig", "BertOnnxConfig"]
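A quick check of what `BertOnnxConfig.inputs` yields for the default task; the `task="default"` argument is an assumed choice here, and the printed mapping follows from the property above:

```python
from transformers import BertConfig
from transformers.models.bert.configuration_bert import BertOnnxConfig

config = BertConfig()
onnx_config = BertOnnxConfig(config, task="default")  # task name is an assumed choice
print(onnx_config.inputs)
# OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}),
#              ('attention_mask', {0: 'batch', 1: 'sequence'}),
#              ('token_type_ids', {0: 'batch', 1: 'sequence'})])
```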
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py ADDED
@@ -0,0 +1,1801 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BERT model."""
+
+import math
+import os
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    NextSentencePredictorOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import ModelOutput, auto_docstring, logging
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_bert import BertConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
+    """Load tf checkpoints in a pytorch model."""
+    try:
+        import re
+
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info(f"Loading TF weight {name} with shape {shape}")
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+
+    for name, array in zip(names, arrays):
+        name = name.split("/")
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
+        # which are not required for using pretrained model
+        if any(
+            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+            for n in name
+        ):
+            logger.info(f"Skipping {'/'.join(name)}")
+            continue
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+                scope_names = re.split(r"_(\d+)", m_name)
+            else:
+                scope_names = [m_name]
+            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+                pointer = getattr(pointer, "bias")
+            elif scope_names[0] == "output_weights":
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "squad":
+                pointer = getattr(pointer, "classifier")
+            else:
+                try:
+                    pointer = getattr(pointer, scope_names[0])
+                except AttributeError:
+                    logger.info(f"Skipping {'/'.join(name)}")
+                    continue
+            if len(scope_names) >= 2:
+                num = int(scope_names[1])
+                pointer = pointer[num]
+        if m_name[-11:] == "_embeddings":
+            pointer = getattr(pointer, "weight")
+        elif m_name == "kernel":
+            array = np.transpose(array)
+        try:
+            if pointer.shape != array.shape:
+                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+        except ValueError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info(f"Initialize PyTorch weight {name}")
+        pointer.data = torch.from_numpy(array)
+    return model
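The checkpoint loader above walks TF variable scopes such as `layer_3/kernel` down to PyTorch attributes. A small, runnable illustration of the scope parsing it relies on:

```python
import re

# Mirrors the loop in load_tf_weights_in_bert: split "layer_3" into ("layer", 3)
# so that `pointer = getattr(model, "layer")[3]` can be resolved.
for m_name in ["layer_3", "kernel", "attention"]:
    if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
        scope_names = re.split(r"_(\d+)", m_name)  # e.g. ['layer', '3', '']
    else:
        scope_names = [m_name]
    print(m_name, "->", scope_names)
```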
+
+
+class BertEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+
+        # Set token_type_ids from the registered all-zeros buffer in the constructor, which usually applies
+        # when it's auto-generated. The registered buffer helps users trace the model without passing
+        # token_type_ids and solves issue #5664
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+        if self.position_embedding_type == "absolute":
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
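A shape-level sketch of the forward pass above: word, token-type, and (for `"absolute"`) position embeddings are summed elementwise before LayerNorm and dropout. The batch and sequence sizes below are illustrative assumptions.

```python
import torch
from transformers import BertConfig
from transformers.models.bert.modeling_bert import BertEmbeddings

config = BertConfig()
embeddings = BertEmbeddings(config)

input_ids = torch.randint(0, config.vocab_size, (2, 16))  # illustrative batch
out = embeddings(input_ids=input_ids)
print(out.shape)  # torch.Size([2, 16, 768])
```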
+
+
+class BertSelfAttention(nn.Module):
+    def __init__(self, config, position_embedding_type=None, layer_idx=None):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = position_embedding_type or getattr(
+            config, "position_embedding_type", "absolute"
+        )
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+        self.is_decoder = config.is_decoder
+        self.layer_idx = layer_idx
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        batch_size, seq_length, _ = hidden_states.shape
+        query_layer = self.query(hidden_states)
+        query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
+            1, 2
+        )
+
+        is_updated = False
+        is_cross_attention = encoder_hidden_states is not None
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_layer from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states
+        if is_cross_attention and past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_layer = curr_past_key_value.layers[self.layer_idx].keys
+            value_layer = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_layer = self.key(current_states)
+            key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
+                1, 2
+            )
+            value_layer = self.value(current_states)
+            value_layer = value_layer.view(
+                batch_size, -1, self.num_attention_heads, self.attention_head_size
+            ).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all key/value_layer to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_layer, value_layer = curr_past_key_value.update(
+                    key_layer, value_layer, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+            if past_key_values is not None:
+                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
+                    -1, 1
+                )
+            else:
+                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in the BertModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        return context_layer, attention_probs
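To make the `relative_key` branch concrete, here is the distance matrix it embeds; adding `max_position_embeddings - 1` shifts the signed distances into valid embedding indices. The sizes are illustrative.

```python
import torch

query_length, key_length, max_position_embeddings = 4, 4, 512

position_ids_l = torch.arange(query_length).view(-1, 1)
position_ids_r = torch.arange(key_length).view(1, -1)
distance = position_ids_l - position_ids_r
# distance[i, j] = i - j, in [-(key_length - 1), query_length - 1]
indices = distance + max_position_embeddings - 1  # non-negative lookup indices
print(distance)
print(indices.min().item(), indices.max().item())
```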
+
+
+class BertSdpaSelfAttention(BertSelfAttention):
+    def __init__(self, config, position_embedding_type=None, layer_idx=None):
+        super().__init__(config, position_embedding_type=position_embedding_type, layer_idx=layer_idx)
+        self.dropout_prob = config.attention_probs_dropout_prob
+
+    # Adapted from BertSelfAttention
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
+            # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.
+            logger.warning_once(
+                "BertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support "
+                "non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to "
+                "the manual attention implementation, but specifying the manual implementation will be required from "
+                "Transformers version v5.0.0 onwards. This warning can be removed using the argument "
+                '`attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states,
+                attention_mask,
+                head_mask,
+                encoder_hidden_states,
+                past_key_values,
+                output_attentions,
+                cache_position,
+            )
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        query_layer = (
+            self.query(hidden_states).view(bsz, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
+        )
+
+        is_updated = False
+        is_cross_attention = encoder_hidden_states is not None
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states
+        if is_cross_attention and past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_layer = curr_past_key_value.layers[self.layer_idx].keys
+            value_layer = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_layer = (
+                self.key(current_states)
+                .view(bsz, -1, self.num_attention_heads, self.attention_head_size)
+                .transpose(1, 2)
+            )
+            value_layer = (
+                self.value(current_states)
+                .view(bsz, -1, self.num_attention_heads, self.attention_head_size)
+                .transpose(1, 2)
+            )
+
+            if past_key_values is not None:
+                # save all key/value_layer to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_layer, value_layer = curr_past_key_value.update(
+                    key_layer, value_layer, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create
+        # a causal mask in case tgt_len == 1.
+        is_causal = self.is_decoder and not is_cross_attention and attention_mask is None and tgt_len > 1
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_layer,
+            key_layer,
+            value_layer,
+            attn_mask=attention_mask,
+            dropout_p=self.dropout_prob if self.training else 0.0,
+            is_causal=is_causal,
+        )
+
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size)
+
+        return attn_output, None
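A minimal check of the SDPA dispatch used above: with `attn_mask=None` and `is_causal=True`, `scaled_dot_product_attention` applies the causal mask internally, matching the `is_causal` condition in the code. Shapes below are illustrative.

```python
import torch
import torch.nn.functional as F

bsz, heads, tgt_len, head_dim = 1, 2, 5, 8
q = torch.randn(bsz, heads, tgt_len, head_dim)
k = torch.randn(bsz, heads, tgt_len, head_dim)
v = torch.randn(bsz, heads, tgt_len, head_dim)

# Mirrors: is_causal = is_decoder and not cross-attn and attention_mask is None and tgt_len > 1
out = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)
print(out.shape)  # torch.Size([1, 2, 5, 8])
```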
+
+
+class BertSelfOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+BERT_SELF_ATTENTION_CLASSES = {
+    "eager": BertSelfAttention,
+    "sdpa": BertSdpaSelfAttention,
+}
+
+
+class BertAttention(nn.Module):
+    def __init__(self, config, position_embedding_type=None, layer_idx=None):
+        super().__init__()
+        self.self = BERT_SELF_ATTENTION_CLASSES[config._attn_implementation](
+            config,
+            position_embedding_type=position_embedding_type,
+            layer_idx=layer_idx,
+        )
+        self.output = BertSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+class BertIntermediate(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class BertOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class BertLayer(GradientCheckpointingLayer):
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = BertAttention(config, layer_idx=layer_idx)
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+            self.crossattention = BertAttention(config, position_embedding_type="absolute", layer_idx=layer_idx)
+        self.intermediate = BertIntermediate(config)
+        self.output = BertOutput(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+        )
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+                    " by setting `config.add_cross_attention=True`"
+                )
+
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask=encoder_attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights
+
+        layer_output = apply_chunking_to_forward(
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+        )
+        outputs = (layer_output,) + outputs
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
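The feed-forward chunking at the end of `BertLayer` trades compute scheduling for memory by splitting the sequence dimension. A sketch with `apply_chunking_to_forward`; the chunk size and the doubling stand-in for `intermediate -> output` are illustrative choices:

```python
import torch
from transformers.pytorch_utils import apply_chunking_to_forward

hidden = torch.randn(2, 16, 768)  # (batch, seq, hidden), illustrative

def feed_forward_chunk(attention_output):
    return attention_output * 2  # stand-in for intermediate -> output

# chunk_size=4 along seq_len_dim=1: processes 4 tokens at a time, then concatenates.
out = apply_chunking_to_forward(feed_forward_chunk, 4, 1, hidden)
print(torch.equal(out, hidden * 2))  # True
```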
599
+
600
+
601
+ class BertEncoder(nn.Module):
602
+ def __init__(self, config, layer_idx=None):
603
+ super().__init__()
604
+ self.config = config
605
+ self.layer = nn.ModuleList([BertLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
606
+ self.gradient_checkpointing = False
607
+
608
+ def forward(
609
+ self,
610
+ hidden_states: torch.Tensor,
611
+ attention_mask: Optional[torch.FloatTensor] = None,
612
+ head_mask: Optional[torch.FloatTensor] = None,
613
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
614
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
615
+ past_key_values: Optional[Cache] = None,
616
+ use_cache: Optional[bool] = None,
617
+ output_attentions: Optional[bool] = False,
618
+ output_hidden_states: Optional[bool] = False,
619
+ return_dict: Optional[bool] = True,
620
+ cache_position: Optional[torch.Tensor] = None,
621
+ ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
622
+ all_hidden_states = () if output_hidden_states else None
623
+ all_self_attentions = () if output_attentions else None
624
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
625
+
626
+ if self.gradient_checkpointing and self.training:
627
+ if use_cache:
628
+ logger.warning_once(
629
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
630
+ )
631
+ use_cache = False
632
+
633
+ if use_cache and self.config.is_decoder and past_key_values is None:
634
+ past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
635
+
636
+ if use_cache and self.config.is_decoder and isinstance(past_key_values, tuple):
637
+ logger.warning_once(
638
+ "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
639
+ "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
640
+ "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
641
+ )
642
+ past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
643
+
644
+ for i, layer_module in enumerate(self.layer):
645
+ if output_hidden_states:
646
+ all_hidden_states = all_hidden_states + (hidden_states,)
647
+
648
+ layer_head_mask = head_mask[i] if head_mask is not None else None
649
+
650
+ layer_outputs = layer_module(
651
+ hidden_states,
652
+ attention_mask,
653
+ layer_head_mask,
654
+ encoder_hidden_states, # as a positional argument for gradient checkpointing
655
+ encoder_attention_mask=encoder_attention_mask,
656
+ past_key_values=past_key_values,
657
+ output_attentions=output_attentions,
658
+ cache_position=cache_position,
659
+ )
660
+
661
+ hidden_states = layer_outputs[0]
662
+ if output_attentions:
663
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
664
+ if self.config.add_cross_attention:
665
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
666
+
667
+ if output_hidden_states:
668
+ all_hidden_states = all_hidden_states + (hidden_states,)
669
+
670
+ if not return_dict:
671
+ return tuple(
672
+ v
673
+ for v in [
674
+ hidden_states,
675
+ past_key_values,
676
+ all_hidden_states,
677
+ all_self_attentions,
678
+ all_cross_attentions,
679
+ ]
680
+ if v is not None
681
+ )
682
+ return BaseModelOutputWithPastAndCrossAttentions(
683
+ last_hidden_state=hidden_states,
684
+ past_key_values=past_key_values,
685
+ hidden_states=all_hidden_states,
686
+ attentions=all_self_attentions,
687
+ cross_attentions=all_cross_attentions,
688
+ )
689
+
690
+
691
+ class BertPooler(nn.Module):
692
+ def __init__(self, config):
693
+ super().__init__()
694
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
695
+ self.activation = nn.Tanh()
696
+
697
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
698
+ # We "pool" the model by simply taking the hidden state corresponding
699
+ # to the first token.
700
+ first_token_tensor = hidden_states[:, 0]
701
+ pooled_output = self.dense(first_token_tensor)
702
+ pooled_output = self.activation(pooled_output)
703
+ return pooled_output
704
+
705
+
706
+ class BertPredictionHeadTransform(nn.Module):
707
+ def __init__(self, config):
708
+ super().__init__()
709
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
710
+ if isinstance(config.hidden_act, str):
711
+ self.transform_act_fn = ACT2FN[config.hidden_act]
712
+ else:
713
+ self.transform_act_fn = config.hidden_act
714
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
715
+
716
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
717
+ hidden_states = self.dense(hidden_states)
718
+ hidden_states = self.transform_act_fn(hidden_states)
719
+ hidden_states = self.LayerNorm(hidden_states)
720
+ return hidden_states
721
+
722
+
723
+ class BertLMPredictionHead(nn.Module):
724
+ def __init__(self, config):
725
+ super().__init__()
726
+ self.transform = BertPredictionHeadTransform(config)
727
+
728
+ # The output weights are the same as the input embeddings, but there is
729
+ # an output-only bias for each token.
730
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
731
+
732
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
733
+
734
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
735
+ self.decoder.bias = self.bias
736
+
737
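+ # Re-invoked when weights are tied (e.g. after `resize_token_embeddings`) so `decoder.bias` keeps pointing at this module's bias parameter.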
+ def _tie_weights(self):
738
+ self.decoder.bias = self.bias
739
+
740
+ def forward(self, hidden_states):
741
+ hidden_states = self.transform(hidden_states)
742
+ hidden_states = self.decoder(hidden_states)
743
+ return hidden_states
744
+
745
+
746
+ class BertOnlyMLMHead(nn.Module):
747
+ def __init__(self, config):
748
+ super().__init__()
749
+ self.predictions = BertLMPredictionHead(config)
750
+
751
+ def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
752
+ prediction_scores = self.predictions(sequence_output)
753
+ return prediction_scores
754
+
755
+
756
+ class BertOnlyNSPHead(nn.Module):
757
+ def __init__(self, config):
758
+ super().__init__()
759
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
760
+
761
+ def forward(self, pooled_output):
762
+ seq_relationship_score = self.seq_relationship(pooled_output)
763
+ return seq_relationship_score
764
+
765
+
766
+ class BertPreTrainingHeads(nn.Module):
767
+ def __init__(self, config):
768
+ super().__init__()
769
+ self.predictions = BertLMPredictionHead(config)
770
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
771
+
772
+ def forward(self, sequence_output, pooled_output):
773
+ prediction_scores = self.predictions(sequence_output)
774
+ seq_relationship_score = self.seq_relationship(pooled_output)
775
+ return prediction_scores, seq_relationship_score
776
+
777
+
778
+ @auto_docstring
779
+ class BertPreTrainedModel(PreTrainedModel):
780
+ config: BertConfig
781
+ load_tf_weights = load_tf_weights_in_bert
782
+ base_model_prefix = "bert"
783
+ supports_gradient_checkpointing = True
784
+ _supports_sdpa = True
785
+
786
+ def _init_weights(self, module):
787
+ """Initialize the weights"""
788
+ if isinstance(module, nn.Linear):
789
+ # Slightly different from the TF version which uses truncated_normal for initialization
790
+ # cf https://github.com/pytorch/pytorch/pull/5617
791
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
792
+ if module.bias is not None:
793
+ module.bias.data.zero_()
794
+ elif isinstance(module, nn.Embedding):
795
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
796
+ if module.padding_idx is not None:
797
+ module.weight.data[module.padding_idx].zero_()
798
+ elif isinstance(module, nn.LayerNorm):
799
+ module.bias.data.zero_()
800
+ module.weight.data.fill_(1.0)
801
+ elif isinstance(module, BertLMPredictionHead):
802
+ module.bias.data.zero_()
803
+
804
+
805
+ @dataclass
806
+ @auto_docstring(
807
+ custom_intro="""
808
+ Output type of [`BertForPreTraining`].
809
+ """
810
+ )
811
+ class BertForPreTrainingOutput(ModelOutput):
812
+ r"""
813
+ loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
814
+ Total loss as the sum of the masked language modeling loss and the next sequence prediction
815
+ (classification) loss.
816
+ prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
817
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
818
+ seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
819
+ Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
820
+ before SoftMax).
821
+ """
822
+
823
+ loss: Optional[torch.FloatTensor] = None
824
+ prediction_logits: Optional[torch.FloatTensor] = None
825
+ seq_relationship_logits: Optional[torch.FloatTensor] = None
826
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
827
+ attentions: Optional[tuple[torch.FloatTensor]] = None
828
+
829
+
830
+ @auto_docstring(
831
+ custom_intro="""
832
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
833
+ cross-attention is added between the self-attention layers, following the architecture described in [Attention is
834
+ all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
835
+ Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
836
+
837
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
838
+ to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder` and
839
+ `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
840
+ """
841
+ )
842
+ class BertModel(BertPreTrainedModel):
843
+ _no_split_modules = ["BertEmbeddings", "BertLayer"]
844
+
845
+ def __init__(self, config, add_pooling_layer=True):
846
+ r"""
847
+ add_pooling_layer (bool, *optional*, defaults to `True`):
848
+ Whether to add a pooling layer
849
+ """
850
+ super().__init__(config)
851
+ self.config = config
852
+
853
+ self.embeddings = BertEmbeddings(config)
854
+ self.encoder = BertEncoder(config)
855
+
856
+ self.pooler = BertPooler(config) if add_pooling_layer else None
857
+
858
+ self.attn_implementation = config._attn_implementation
859
+ self.position_embedding_type = config.position_embedding_type
860
+
861
+ # Initialize weights and apply final processing
862
+ self.post_init()
863
+
864
+ def get_input_embeddings(self):
865
+ return self.embeddings.word_embeddings
866
+
867
+ def set_input_embeddings(self, value):
868
+ self.embeddings.word_embeddings = value
869
+
870
+ def _prune_heads(self, heads_to_prune):
871
+ """
872
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base
873
+ class `PreTrainedModel`.
874
+ """
875
+ for layer, heads in heads_to_prune.items():
876
+ self.encoder.layer[layer].attention.prune_heads(heads)
877
+
878
+ @auto_docstring
879
+ def forward(
880
+ self,
881
+ input_ids: Optional[torch.Tensor] = None,
882
+ attention_mask: Optional[torch.Tensor] = None,
883
+ token_type_ids: Optional[torch.Tensor] = None,
884
+ position_ids: Optional[torch.Tensor] = None,
885
+ head_mask: Optional[torch.Tensor] = None,
886
+ inputs_embeds: Optional[torch.Tensor] = None,
887
+ encoder_hidden_states: Optional[torch.Tensor] = None,
888
+ encoder_attention_mask: Optional[torch.Tensor] = None,
889
+ past_key_values: Optional[Cache] = None,
890
+ use_cache: Optional[bool] = None,
891
+ output_attentions: Optional[bool] = None,
892
+ output_hidden_states: Optional[bool] = None,
893
+ return_dict: Optional[bool] = None,
894
+ cache_position: Optional[torch.Tensor] = None,
895
+ ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
896
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
897
+ output_hidden_states = (
898
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
899
+ )
900
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
901
+
902
+ if self.config.is_decoder:
903
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
904
+ else:
905
+ use_cache = False
906
+
907
+ if input_ids is not None and inputs_embeds is not None:
908
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
909
+ elif input_ids is not None:
910
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
911
+ input_shape = input_ids.size()
912
+ elif inputs_embeds is not None:
913
+ input_shape = inputs_embeds.size()[:-1]
914
+ else:
915
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
916
+
917
+ batch_size, seq_length = input_shape
918
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
919
+
920
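+ # Length of the already-cached prefix, used to offset position ids and masks during incremental decoding; handles both the legacy tuple format and the `Cache` API.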
+ past_key_values_length = 0
921
+ if past_key_values is not None:
922
+ past_key_values_length = (
923
+ past_key_values[0][0].shape[-2]
924
+ if not isinstance(past_key_values, Cache)
925
+ else past_key_values.get_seq_length()
926
+ )
927
+
928
+ if token_type_ids is None:
929
+ if hasattr(self.embeddings, "token_type_ids"):
930
+ buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
931
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
932
+ token_type_ids = buffered_token_type_ids_expanded
933
+ else:
934
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
935
+
936
+ embedding_output = self.embeddings(
937
+ input_ids=input_ids,
938
+ position_ids=position_ids,
939
+ token_type_ids=token_type_ids,
940
+ inputs_embeds=inputs_embeds,
941
+ past_key_values_length=past_key_values_length,
942
+ )
943
+
944
+ if attention_mask is None:
945
+ attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)
946
+
947
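+ # The SDPA fast path builds 4D masks directly; it is only taken with absolute position embeddings, no head mask, and when attention weights are not requested (SDPA does not return attention probabilities).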
+ use_sdpa_attention_masks = (
948
+ self.attn_implementation == "sdpa"
949
+ and self.position_embedding_type == "absolute"
950
+ and head_mask is None
951
+ and not output_attentions
952
+ )
953
+
954
+ # Expand the attention mask
955
+ if use_sdpa_attention_masks and attention_mask.dim() == 2:
956
+ # Expand the attention mask for SDPA.
957
+ # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
958
+ if self.config.is_decoder:
959
+ extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
960
+ attention_mask,
961
+ input_shape,
962
+ embedding_output,
963
+ past_key_values_length,
964
+ )
965
+ else:
966
+ extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
967
+ attention_mask, embedding_output.dtype, tgt_len=seq_length
968
+ )
969
+ else:
970
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
971
+ # ourselves in which case we just need to make it broadcastable to all heads.
972
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
973
+
974
+ # If a 2D or 3D attention mask is provided for the cross-attention
975
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
976
+ if self.config.is_decoder and encoder_hidden_states is not None:
977
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
978
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
979
+ if encoder_attention_mask is None:
980
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
981
+
982
+ if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2:
983
+ # Expand the attention mask for SDPA.
984
+ # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
985
+ encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
986
+ encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length
987
+ )
988
+ else:
989
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
990
+ else:
991
+ encoder_extended_attention_mask = None
992
+
993
+ # Prepare head mask if needed
994
+ # 1.0 in head_mask indicate we keep the head
995
+ # attention_probs has shape bsz x n_heads x N x N
996
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
997
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
998
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
999
+
1000
+ encoder_outputs = self.encoder(
1001
+ embedding_output,
1002
+ attention_mask=extended_attention_mask,
1003
+ head_mask=head_mask,
1004
+ encoder_hidden_states=encoder_hidden_states,
1005
+ encoder_attention_mask=encoder_extended_attention_mask,
1006
+ past_key_values=past_key_values,
1007
+ use_cache=use_cache,
1008
+ output_attentions=output_attentions,
1009
+ output_hidden_states=output_hidden_states,
1010
+ return_dict=return_dict,
1011
+ cache_position=cache_position,
1012
+ )
1013
+ sequence_output = encoder_outputs[0]
1014
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
1015
+
1016
+ if not return_dict:
1017
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
1018
+
1019
+ return BaseModelOutputWithPoolingAndCrossAttentions(
1020
+ last_hidden_state=sequence_output,
1021
+ pooler_output=pooled_output,
1022
+ past_key_values=encoder_outputs.past_key_values,
1023
+ hidden_states=encoder_outputs.hidden_states,
1024
+ attentions=encoder_outputs.attentions,
1025
+ cross_attentions=encoder_outputs.cross_attentions,
1026
+ )
1027
+
1028
+
1029
+ @auto_docstring(
1030
+ custom_intro="""
1031
+ Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
1032
+ sentence prediction (classification)` head.
1033
+ """
1034
+ )
1035
+ class BertForPreTraining(BertPreTrainedModel):
1036
+ _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]
1037
+
1038
+ def __init__(self, config):
1039
+ super().__init__(config)
1040
+
1041
+ self.bert = BertModel(config)
1042
+ self.cls = BertPreTrainingHeads(config)
1043
+
1044
+ # Initialize weights and apply final processing
1045
+ self.post_init()
1046
+
1047
+ def get_output_embeddings(self):
1048
+ return self.cls.predictions.decoder
1049
+
1050
+ def set_output_embeddings(self, new_embeddings):
1051
+ self.cls.predictions.decoder = new_embeddings
1052
+ self.cls.predictions.bias = new_embeddings.bias
1053
+
1054
+ @auto_docstring
1055
+ def forward(
1056
+ self,
1057
+ input_ids: Optional[torch.Tensor] = None,
1058
+ attention_mask: Optional[torch.Tensor] = None,
1059
+ token_type_ids: Optional[torch.Tensor] = None,
1060
+ position_ids: Optional[torch.Tensor] = None,
1061
+ head_mask: Optional[torch.Tensor] = None,
1062
+ inputs_embeds: Optional[torch.Tensor] = None,
1063
+ labels: Optional[torch.Tensor] = None,
1064
+ next_sentence_label: Optional[torch.Tensor] = None,
1065
+ output_attentions: Optional[bool] = None,
1066
+ output_hidden_states: Optional[bool] = None,
1067
+ return_dict: Optional[bool] = None,
1068
+ ) -> Union[tuple[torch.Tensor], BertForPreTrainingOutput]:
1069
+ r"""
1070
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1071
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
1072
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
1073
+ the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
1074
+ next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1075
+ Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
1076
+ pair (see `input_ids` docstring) Indices should be in `[0, 1]`:
1077
+
1078
+ - 0 indicates sequence B is a continuation of sequence A,
1079
+ - 1 indicates sequence B is a random sequence.
1080
+
1081
+ Example:
1082
+
1083
+ ```python
1084
+ >>> from transformers import AutoTokenizer, BertForPreTraining
1085
+ >>> import torch
1086
+
1087
+ >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
1088
+ >>> model = BertForPreTraining.from_pretrained("google-bert/bert-base-uncased")
1089
+
1090
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
1091
+ >>> outputs = model(**inputs)
1092
+
1093
+ >>> prediction_logits = outputs.prediction_logits
1094
+ >>> seq_relationship_logits = outputs.seq_relationship_logits
1095
+ ```
1096
+ """
1097
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1098
+
1099
+ outputs = self.bert(
1100
+ input_ids,
1101
+ attention_mask=attention_mask,
1102
+ token_type_ids=token_type_ids,
1103
+ position_ids=position_ids,
1104
+ head_mask=head_mask,
1105
+ inputs_embeds=inputs_embeds,
1106
+ output_attentions=output_attentions,
1107
+ output_hidden_states=output_hidden_states,
1108
+ return_dict=return_dict,
1109
+ )
1110
+
1111
+ sequence_output, pooled_output = outputs[:2]
1112
+ prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
1113
+
1114
+ total_loss = None
1115
+ if labels is not None and next_sentence_label is not None:
1116
+ loss_fct = CrossEntropyLoss()
1117
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1118
+ next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
1119
+ total_loss = masked_lm_loss + next_sentence_loss
1120
+
1121
+ if not return_dict:
1122
+ output = (prediction_scores, seq_relationship_score) + outputs[2:]
1123
+ return ((total_loss,) + output) if total_loss is not None else output
1124
+
1125
+ return BertForPreTrainingOutput(
1126
+ loss=total_loss,
1127
+ prediction_logits=prediction_scores,
1128
+ seq_relationship_logits=seq_relationship_score,
1129
+ hidden_states=outputs.hidden_states,
1130
+ attentions=outputs.attentions,
1131
+ )
1132
+
1133
+
1134
+ @auto_docstring(
1135
+ custom_intro="""
1136
+ Bert Model with a `language modeling` head on top for CLM fine-tuning.
1137
+ """
1138
+ )
1139
+ class BertLMHeadModel(BertPreTrainedModel, GenerationMixin):
1140
+ _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
1141
+
1142
+ def __init__(self, config):
1143
+ super().__init__(config)
1144
+
1145
+ if not config.is_decoder:
1146
+ logger.warning("If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`")
1147
+
1148
+ self.bert = BertModel(config, add_pooling_layer=False)
1149
+ self.cls = BertOnlyMLMHead(config)
1150
+
1151
+ # Initialize weights and apply final processing
1152
+ self.post_init()
1153
+
1154
+ def get_output_embeddings(self):
1155
+ return self.cls.predictions.decoder
1156
+
1157
+ def set_output_embeddings(self, new_embeddings):
1158
+ self.cls.predictions.decoder = new_embeddings
1159
+ self.cls.predictions.bias = new_embeddings.bias
1160
+
1161
+ @auto_docstring
1162
+ def forward(
1163
+ self,
1164
+ input_ids: Optional[torch.Tensor] = None,
1165
+ attention_mask: Optional[torch.Tensor] = None,
1166
+ token_type_ids: Optional[torch.Tensor] = None,
1167
+ position_ids: Optional[torch.Tensor] = None,
1168
+ head_mask: Optional[torch.Tensor] = None,
1169
+ inputs_embeds: Optional[torch.Tensor] = None,
1170
+ encoder_hidden_states: Optional[torch.Tensor] = None,
1171
+ encoder_attention_mask: Optional[torch.Tensor] = None,
1172
+ labels: Optional[torch.Tensor] = None,
1173
+ past_key_values: Optional[Cache] = None,
1174
+ use_cache: Optional[bool] = None,
1175
+ output_attentions: Optional[bool] = None,
1176
+ output_hidden_states: Optional[bool] = None,
1177
+ return_dict: Optional[bool] = None,
1178
+ cache_position: Optional[torch.Tensor] = None,
1179
+ **loss_kwargs,
1180
+ ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
1181
+ r"""
1182
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1183
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
1184
+ `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
1185
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
1186
+ """
1187
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1188
+ if labels is not None:
1189
+ use_cache = False
1190
+
1191
+ outputs = self.bert(
1192
+ input_ids,
1193
+ attention_mask=attention_mask,
1194
+ token_type_ids=token_type_ids,
1195
+ position_ids=position_ids,
1196
+ head_mask=head_mask,
1197
+ inputs_embeds=inputs_embeds,
1198
+ encoder_hidden_states=encoder_hidden_states,
1199
+ encoder_attention_mask=encoder_attention_mask,
1200
+ past_key_values=past_key_values,
1201
+ use_cache=use_cache,
1202
+ output_attentions=output_attentions,
1203
+ output_hidden_states=output_hidden_states,
1204
+ return_dict=return_dict,
1205
+ cache_position=cache_position,
1206
+ )
1207
+
1208
+ sequence_output = outputs[0]
1209
+ prediction_scores = self.cls(sequence_output)
1210
+
1211
+ lm_loss = None
1212
+ if labels is not None:
1213
+ lm_loss = self.loss_function(prediction_scores, labels, self.config.vocab_size, **loss_kwargs)
1214
+
1215
+ if not return_dict:
1216
+ output = (prediction_scores,) + outputs[2:]
1217
+ return ((lm_loss,) + output) if lm_loss is not None else output
1218
+
1219
+ return CausalLMOutputWithCrossAttentions(
1220
+ loss=lm_loss,
1221
+ logits=prediction_scores,
1222
+ past_key_values=outputs.past_key_values,
1223
+ hidden_states=outputs.hidden_states,
1224
+ attentions=outputs.attentions,
1225
+ cross_attentions=outputs.cross_attentions,
1226
+ )
1227
+
1228
+
1229
+ @auto_docstring
1230
+ class BertForMaskedLM(BertPreTrainedModel):
1231
+ _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]
1232
+
1233
+ def __init__(self, config):
1234
+ super().__init__(config)
1235
+
1236
+ if config.is_decoder:
1237
+ logger.warning(
1238
+ "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
1239
+ "bi-directional self-attention."
1240
+ )
1241
+
1242
+ self.bert = BertModel(config, add_pooling_layer=False)
1243
+ self.cls = BertOnlyMLMHead(config)
1244
+
1245
+ # Initialize weights and apply final processing
1246
+ self.post_init()
1247
+
1248
+ def get_output_embeddings(self):
1249
+ return self.cls.predictions.decoder
1250
+
1251
+ def set_output_embeddings(self, new_embeddings):
1252
+ self.cls.predictions.decoder = new_embeddings
1253
+ self.cls.predictions.bias = new_embeddings.bias
1254
+
1255
+ @auto_docstring
1256
+ def forward(
1257
+ self,
1258
+ input_ids: Optional[torch.Tensor] = None,
1259
+ attention_mask: Optional[torch.Tensor] = None,
1260
+ token_type_ids: Optional[torch.Tensor] = None,
1261
+ position_ids: Optional[torch.Tensor] = None,
1262
+ head_mask: Optional[torch.Tensor] = None,
1263
+ inputs_embeds: Optional[torch.Tensor] = None,
1264
+ encoder_hidden_states: Optional[torch.Tensor] = None,
1265
+ encoder_attention_mask: Optional[torch.Tensor] = None,
1266
+ labels: Optional[torch.Tensor] = None,
1267
+ output_attentions: Optional[bool] = None,
1268
+ output_hidden_states: Optional[bool] = None,
1269
+ return_dict: Optional[bool] = None,
1270
+ ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
1271
+ r"""
1272
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1273
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
1274
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
1275
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
1276
+ """
1277
+
1278
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1279
+
1280
+ outputs = self.bert(
1281
+ input_ids,
1282
+ attention_mask=attention_mask,
1283
+ token_type_ids=token_type_ids,
1284
+ position_ids=position_ids,
1285
+ head_mask=head_mask,
1286
+ inputs_embeds=inputs_embeds,
1287
+ encoder_hidden_states=encoder_hidden_states,
1288
+ encoder_attention_mask=encoder_attention_mask,
1289
+ output_attentions=output_attentions,
1290
+ output_hidden_states=output_hidden_states,
1291
+ return_dict=return_dict,
1292
+ )
1293
+
1294
+ sequence_output = outputs[0]
1295
+ prediction_scores = self.cls(sequence_output)
1296
+
1297
+ masked_lm_loss = None
1298
+ if labels is not None:
1299
+ loss_fct = CrossEntropyLoss() # -100 index = padding token
1300
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1301
+
1302
+ if not return_dict:
1303
+ output = (prediction_scores,) + outputs[2:]
1304
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1305
+
1306
+ return MaskedLMOutput(
1307
+ loss=masked_lm_loss,
1308
+ logits=prediction_scores,
1309
+ hidden_states=outputs.hidden_states,
1310
+ attentions=outputs.attentions,
1311
+ )
1312
+
1313
+ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
1314
+ input_shape = input_ids.shape
1315
+ effective_batch_size = input_shape[0]
1316
+
1317
+ # append a dummy PAD token (with a matching zeroed attention-mask entry) at the end of the sequence
1318
+ if self.config.pad_token_id is None:
1319
+ raise ValueError("The PAD token should be defined for generation")
1320
+
1321
+ attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
1322
+ dummy_token = torch.full(
1323
+ (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
1324
+ )
1325
+ input_ids = torch.cat([input_ids, dummy_token], dim=1)
1326
+
1327
+ return {"input_ids": input_ids, "attention_mask": attention_mask}
1328
+
1329
+ @classmethod
1330
+ def can_generate(cls) -> bool:
1331
+ """
1332
+ Legacy correction: BertForMaskedLM can't call `generate()` from `GenerationMixin`, even though it has a
1333
+ `prepare_inputs_for_generation` method.
1334
+ """
1335
+ return False
1336
+
1337
+
1338
+ @auto_docstring(
1339
+ custom_intro="""
1340
+ Bert Model with a `next sentence prediction (classification)` head on top.
1341
+ """
1342
+ )
1343
+ class BertForNextSentencePrediction(BertPreTrainedModel):
1344
+ def __init__(self, config):
1345
+ super().__init__(config)
1346
+
1347
+ self.bert = BertModel(config)
1348
+ self.cls = BertOnlyNSPHead(config)
1349
+
1350
+ # Initialize weights and apply final processing
1351
+ self.post_init()
1352
+
1353
+ @auto_docstring
1354
+ def forward(
1355
+ self,
1356
+ input_ids: Optional[torch.Tensor] = None,
1357
+ attention_mask: Optional[torch.Tensor] = None,
1358
+ token_type_ids: Optional[torch.Tensor] = None,
1359
+ position_ids: Optional[torch.Tensor] = None,
1360
+ head_mask: Optional[torch.Tensor] = None,
1361
+ inputs_embeds: Optional[torch.Tensor] = None,
1362
+ labels: Optional[torch.Tensor] = None,
1363
+ output_attentions: Optional[bool] = None,
1364
+ output_hidden_states: Optional[bool] = None,
1365
+ return_dict: Optional[bool] = None,
1366
+ **kwargs,
1367
+ ) -> Union[tuple[torch.Tensor], NextSentencePredictorOutput]:
1368
+ r"""
1369
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1370
+ Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
1371
+ (see `input_ids` docstring). Indices should be in `[0, 1]`:
1372
+
1373
+ - 0 indicates sequence B is a continuation of sequence A,
1374
+ - 1 indicates sequence B is a random sequence.
1375
+
1376
+ Example:
1377
+
1378
+ ```python
1379
+ >>> from transformers import AutoTokenizer, BertForNextSentencePrediction
1380
+ >>> import torch
1381
+
1382
+ >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
1383
+ >>> model = BertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")
1384
+
1385
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
1386
+ >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
1387
+ >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
1388
+
1389
+ >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
1390
+ >>> logits = outputs.logits
1391
+ >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
1392
+ ```
1393
+ """
1394
+
1395
+ if "next_sentence_label" in kwargs:
1396
+ warnings.warn(
1397
+ "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
1398
+ " `labels` instead.",
1399
+ FutureWarning,
1400
+ )
1401
+ labels = kwargs.pop("next_sentence_label")
1402
+
1403
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1404
+
1405
+ outputs = self.bert(
1406
+ input_ids,
1407
+ attention_mask=attention_mask,
1408
+ token_type_ids=token_type_ids,
1409
+ position_ids=position_ids,
1410
+ head_mask=head_mask,
1411
+ inputs_embeds=inputs_embeds,
1412
+ output_attentions=output_attentions,
1413
+ output_hidden_states=output_hidden_states,
1414
+ return_dict=return_dict,
1415
+ )
1416
+
1417
+ pooled_output = outputs[1]
1418
+
1419
+ seq_relationship_scores = self.cls(pooled_output)
1420
+
1421
+ next_sentence_loss = None
1422
+ if labels is not None:
1423
+ loss_fct = CrossEntropyLoss()
1424
+ next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))
1425
+
1426
+ if not return_dict:
1427
+ output = (seq_relationship_scores,) + outputs[2:]
1428
+ return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
1429
+
1430
+ return NextSentencePredictorOutput(
1431
+ loss=next_sentence_loss,
1432
+ logits=seq_relationship_scores,
1433
+ hidden_states=outputs.hidden_states,
1434
+ attentions=outputs.attentions,
1435
+ )
1436
+
1437
+
1438
+ @auto_docstring(
1439
+ custom_intro="""
1440
+ Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
1441
+ output) e.g. for GLUE tasks.
1442
+ """
1443
+ )
1444
+ class BertForSequenceClassification(BertPreTrainedModel):
1445
+ def __init__(self, config):
1446
+ super().__init__(config)
1447
+ self.num_labels = config.num_labels
1448
+ self.config = config
1449
+
1450
+ self.bert = BertModel(config)
1451
+ classifier_dropout = (
1452
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
1453
+ )
1454
+ self.dropout = nn.Dropout(classifier_dropout)
1455
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1456
+
1457
+ # Initialize weights and apply final processing
1458
+ self.post_init()
1459
+
1460
+ @auto_docstring
1461
+ def forward(
1462
+ self,
1463
+ input_ids: Optional[torch.Tensor] = None,
1464
+ attention_mask: Optional[torch.Tensor] = None,
1465
+ token_type_ids: Optional[torch.Tensor] = None,
1466
+ position_ids: Optional[torch.Tensor] = None,
1467
+ head_mask: Optional[torch.Tensor] = None,
1468
+ inputs_embeds: Optional[torch.Tensor] = None,
1469
+ labels: Optional[torch.Tensor] = None,
1470
+ output_attentions: Optional[bool] = None,
1471
+ output_hidden_states: Optional[bool] = None,
1472
+ return_dict: Optional[bool] = None,
1473
+ ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
1474
+ r"""
1475
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1476
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1477
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1478
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1479
+ """
1480
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1481
+
1482
+ outputs = self.bert(
1483
+ input_ids,
1484
+ attention_mask=attention_mask,
1485
+ token_type_ids=token_type_ids,
1486
+ position_ids=position_ids,
1487
+ head_mask=head_mask,
1488
+ inputs_embeds=inputs_embeds,
1489
+ output_attentions=output_attentions,
1490
+ output_hidden_states=output_hidden_states,
1491
+ return_dict=return_dict,
1492
+ )
1493
+
1494
+ pooled_output = outputs[1]
1495
+
1496
+ pooled_output = self.dropout(pooled_output)
1497
+ logits = self.classifier(pooled_output)
1498
+
1499
+ loss = None
1500
+ if labels is not None:
1501
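+ # Infer the problem type from `num_labels` and the labels' dtype: regression (MSE), single-label classification (cross-entropy), or multi-label classification (BCE-with-logits).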
+ if self.config.problem_type is None:
1502
+ if self.num_labels == 1:
1503
+ self.config.problem_type = "regression"
1504
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1505
+ self.config.problem_type = "single_label_classification"
1506
+ else:
1507
+ self.config.problem_type = "multi_label_classification"
1508
+
1509
+ if self.config.problem_type == "regression":
1510
+ loss_fct = MSELoss()
1511
+ if self.num_labels == 1:
1512
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
1513
+ else:
1514
+ loss = loss_fct(logits, labels)
1515
+ elif self.config.problem_type == "single_label_classification":
1516
+ loss_fct = CrossEntropyLoss()
1517
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1518
+ elif self.config.problem_type == "multi_label_classification":
1519
+ loss_fct = BCEWithLogitsLoss()
1520
+ loss = loss_fct(logits, labels)
1521
+ if not return_dict:
1522
+ output = (logits,) + outputs[2:]
1523
+ return ((loss,) + output) if loss is not None else output
1524
+
1525
+ return SequenceClassifierOutput(
1526
+ loss=loss,
1527
+ logits=logits,
1528
+ hidden_states=outputs.hidden_states,
1529
+ attentions=outputs.attentions,
1530
+ )
1531
+
1532
+
1533
+ @auto_docstring
1534
+ class BertForMultipleChoice(BertPreTrainedModel):
1535
+ def __init__(self, config):
1536
+ super().__init__(config)
1537
+
1538
+ self.bert = BertModel(config)
1539
+ classifier_dropout = (
1540
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
1541
+ )
1542
+ self.dropout = nn.Dropout(classifier_dropout)
1543
+ self.classifier = nn.Linear(config.hidden_size, 1)
1544
+
1545
+ # Initialize weights and apply final processing
1546
+ self.post_init()
1547
+
1548
+ @auto_docstring
1549
+ def forward(
1550
+ self,
1551
+ input_ids: Optional[torch.Tensor] = None,
1552
+ attention_mask: Optional[torch.Tensor] = None,
1553
+ token_type_ids: Optional[torch.Tensor] = None,
1554
+ position_ids: Optional[torch.Tensor] = None,
1555
+ head_mask: Optional[torch.Tensor] = None,
1556
+ inputs_embeds: Optional[torch.Tensor] = None,
1557
+ labels: Optional[torch.Tensor] = None,
1558
+ output_attentions: Optional[bool] = None,
1559
+ output_hidden_states: Optional[bool] = None,
1560
+ return_dict: Optional[bool] = None,
1561
+ ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
1562
+ r"""
1563
+ input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
1564
+ Indices of input sequence tokens in the vocabulary.
1565
+
1566
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1567
+ [`PreTrainedTokenizer.__call__`] for details.
1568
+
1569
+ [What are input IDs?](../glossary#input-ids)
1570
+ token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
1571
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1572
+ 1]`:
1573
+
1574
+ - 0 corresponds to a *sentence A* token,
1575
+ - 1 corresponds to a *sentence B* token.
1576
+
1577
+ [What are token type IDs?](../glossary#token-type-ids)
1578
+ position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
1579
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1580
+ config.max_position_embeddings - 1]`.
1581
+
1582
+ [What are position IDs?](../glossary#position-ids)
1583
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
1584
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1585
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1586
+ model's internal embedding lookup matrix.
1587
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1588
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
1589
+ num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
1590
+ `input_ids` above)
1591
+ """
1592
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1593
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
1594
+
1595
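+ # Flatten the choices dimension: (batch_size, num_choices, seq_len) -> (batch_size * num_choices, seq_len) so each choice is encoded independently.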
+ input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
1596
+ attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
1597
+ token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
1598
+ position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
1599
+ inputs_embeds = (
1600
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
1601
+ if inputs_embeds is not None
1602
+ else None
1603
+ )
1604
+
1605
+ outputs = self.bert(
1606
+ input_ids,
1607
+ attention_mask=attention_mask,
1608
+ token_type_ids=token_type_ids,
1609
+ position_ids=position_ids,
1610
+ head_mask=head_mask,
1611
+ inputs_embeds=inputs_embeds,
1612
+ output_attentions=output_attentions,
1613
+ output_hidden_states=output_hidden_states,
1614
+ return_dict=return_dict,
1615
+ )
1616
+
1617
+ pooled_output = outputs[1]
1618
+
1619
+ pooled_output = self.dropout(pooled_output)
1620
+ logits = self.classifier(pooled_output)
1621
+ reshaped_logits = logits.view(-1, num_choices)
1622
+
1623
+ loss = None
1624
+ if labels is not None:
1625
+ loss_fct = CrossEntropyLoss()
1626
+ loss = loss_fct(reshaped_logits, labels)
1627
+
1628
+ if not return_dict:
1629
+ output = (reshaped_logits,) + outputs[2:]
1630
+ return ((loss,) + output) if loss is not None else output
1631
+
1632
+ return MultipleChoiceModelOutput(
1633
+ loss=loss,
1634
+ logits=reshaped_logits,
1635
+ hidden_states=outputs.hidden_states,
1636
+ attentions=outputs.attentions,
1637
+ )
1638
+
1639
+
1640
+ @auto_docstring
1641
+ class BertForTokenClassification(BertPreTrainedModel):
1642
+ def __init__(self, config):
1643
+ super().__init__(config)
1644
+ self.num_labels = config.num_labels
1645
+
1646
+ self.bert = BertModel(config, add_pooling_layer=False)
1647
+ classifier_dropout = (
1648
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
1649
+ )
1650
+ self.dropout = nn.Dropout(classifier_dropout)
1651
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1652
+
1653
+ # Initialize weights and apply final processing
1654
+ self.post_init()
1655
+
1656
+ @auto_docstring
1657
+ def forward(
1658
+ self,
1659
+ input_ids: Optional[torch.Tensor] = None,
1660
+ attention_mask: Optional[torch.Tensor] = None,
1661
+ token_type_ids: Optional[torch.Tensor] = None,
1662
+ position_ids: Optional[torch.Tensor] = None,
1663
+ head_mask: Optional[torch.Tensor] = None,
1664
+ inputs_embeds: Optional[torch.Tensor] = None,
1665
+ labels: Optional[torch.Tensor] = None,
1666
+ output_attentions: Optional[bool] = None,
1667
+ output_hidden_states: Optional[bool] = None,
1668
+ return_dict: Optional[bool] = None,
1669
+ ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
1670
+ r"""
1671
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1672
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
1673
+ """
1674
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1675
+
1676
+ outputs = self.bert(
1677
+ input_ids,
1678
+ attention_mask=attention_mask,
1679
+ token_type_ids=token_type_ids,
1680
+ position_ids=position_ids,
1681
+ head_mask=head_mask,
1682
+ inputs_embeds=inputs_embeds,
1683
+ output_attentions=output_attentions,
1684
+ output_hidden_states=output_hidden_states,
1685
+ return_dict=return_dict,
1686
+ )
1687
+
1688
+ sequence_output = outputs[0]
1689
+
1690
+ sequence_output = self.dropout(sequence_output)
1691
+ logits = self.classifier(sequence_output)
1692
+
1693
+ loss = None
1694
+ if labels is not None:
1695
+ loss_fct = CrossEntropyLoss()
1696
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1697
+
1698
+ if not return_dict:
1699
+ output = (logits,) + outputs[2:]
1700
+ return ((loss,) + output) if loss is not None else output
1701
+
1702
+ return TokenClassifierOutput(
1703
+ loss=loss,
1704
+ logits=logits,
1705
+ hidden_states=outputs.hidden_states,
1706
+ attentions=outputs.attentions,
1707
+ )
1708
+
1709
+
1710
+ @auto_docstring
1711
+ class BertForQuestionAnswering(BertPreTrainedModel):
1712
+ def __init__(self, config):
1713
+ super().__init__(config)
1714
+ self.num_labels = config.num_labels
1715
+
1716
+ self.bert = BertModel(config, add_pooling_layer=False)
1717
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
1718
+
1719
+ # Initialize weights and apply final processing
1720
+ self.post_init()
1721
+
1722
+ @auto_docstring
1723
+ def forward(
1724
+ self,
1725
+ input_ids: Optional[torch.Tensor] = None,
1726
+ attention_mask: Optional[torch.Tensor] = None,
1727
+ token_type_ids: Optional[torch.Tensor] = None,
1728
+ position_ids: Optional[torch.Tensor] = None,
1729
+ head_mask: Optional[torch.Tensor] = None,
1730
+ inputs_embeds: Optional[torch.Tensor] = None,
1731
+ start_positions: Optional[torch.Tensor] = None,
1732
+ end_positions: Optional[torch.Tensor] = None,
1733
+ output_attentions: Optional[bool] = None,
1734
+ output_hidden_states: Optional[bool] = None,
1735
+ return_dict: Optional[bool] = None,
1736
+ ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
1737
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1738
+
1739
+ outputs = self.bert(
1740
+ input_ids,
1741
+ attention_mask=attention_mask,
1742
+ token_type_ids=token_type_ids,
1743
+ position_ids=position_ids,
1744
+ head_mask=head_mask,
1745
+ inputs_embeds=inputs_embeds,
1746
+ output_attentions=output_attentions,
1747
+ output_hidden_states=output_hidden_states,
1748
+ return_dict=return_dict,
1749
+ )
1750
+
1751
+ sequence_output = outputs[0]
1752
+
1753
+ logits = self.qa_outputs(sequence_output)
1754
+ start_logits, end_logits = logits.split(1, dim=-1)
1755
+ start_logits = start_logits.squeeze(-1).contiguous()
1756
+ end_logits = end_logits.squeeze(-1).contiguous()
1757
+
1758
+ total_loss = None
1759
+ if start_positions is not None and end_positions is not None:
1760
+ # On multi-GPU, start/end positions may carry an extra dimension; squeeze it away
1761
+ if len(start_positions.size()) > 1:
1762
+ start_positions = start_positions.squeeze(-1)
1763
+ if len(end_positions.size()) > 1:
1764
+ end_positions = end_positions.squeeze(-1)
1765
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
1766
+ ignored_index = start_logits.size(1)
1767
+ start_positions = start_positions.clamp(0, ignored_index)
1768
+ end_positions = end_positions.clamp(0, ignored_index)
1769
+
1770
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1771
+ start_loss = loss_fct(start_logits, start_positions)
1772
+ end_loss = loss_fct(end_logits, end_positions)
1773
+ total_loss = (start_loss + end_loss) / 2
1774
+
1775
+ if not return_dict:
1776
+ output = (start_logits, end_logits) + outputs[2:]
1777
+ return ((total_loss,) + output) if total_loss is not None else output
1778
+
1779
+ return QuestionAnsweringModelOutput(
1780
+ loss=total_loss,
1781
+ start_logits=start_logits,
1782
+ end_logits=end_logits,
1783
+ hidden_states=outputs.hidden_states,
1784
+ attentions=outputs.attentions,
1785
+ )
1786
+
1787
+
1788
+ __all__ = [
1789
+ "BertForMaskedLM",
1790
+ "BertForMultipleChoice",
1791
+ "BertForNextSentencePrediction",
1792
+ "BertForPreTraining",
1793
+ "BertForQuestionAnswering",
1794
+ "BertForSequenceClassification",
1795
+ "BertForTokenClassification",
1796
+ "BertLayer",
1797
+ "BertLMHeadModel",
1798
+ "BertModel",
1799
+ "BertPreTrainedModel",
1800
+ "load_tf_weights_in_bert",
1801
+ ]
URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bert/modeling_flax_bert.py ADDED
@@ -0,0 +1,1727 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The Google Flax Team Authors and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Callable, Optional
17
+
18
+ import flax
19
+ import flax.linen as nn
20
+ import jax
21
+ import jax.numpy as jnp
22
+ import numpy as np
23
+ from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
24
+ from flax.linen import combine_masks, make_causal_mask
25
+ from flax.linen import partitioning as nn_partitioning
26
+ from flax.linen.attention import dot_product_attention_weights
27
+ from flax.traverse_util import flatten_dict, unflatten_dict
28
+ from jax import lax
29
+
30
+ from ...modeling_flax_outputs import (
31
+ FlaxBaseModelOutputWithPastAndCrossAttentions,
32
+ FlaxBaseModelOutputWithPooling,
33
+ FlaxBaseModelOutputWithPoolingAndCrossAttentions,
34
+ FlaxCausalLMOutputWithCrossAttentions,
35
+ FlaxMaskedLMOutput,
36
+ FlaxMultipleChoiceModelOutput,
37
+ FlaxNextSentencePredictorOutput,
38
+ FlaxQuestionAnsweringModelOutput,
39
+ FlaxSequenceClassifierOutput,
40
+ FlaxTokenClassifierOutput,
41
+ )
42
+ from ...modeling_flax_utils import (
43
+ ACT2FN,
44
+ FlaxPreTrainedModel,
45
+ append_call_sample_docstring,
46
+ append_replace_return_docstrings,
47
+ overwrite_call_docstring,
48
+ )
49
+ from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging
50
+ from .configuration_bert import BertConfig
51
+
52
+
53
+ logger = logging.get_logger(__name__)
54
+
55
+ _CHECKPOINT_FOR_DOC = "google-bert/bert-base-uncased"
56
+ _CONFIG_FOR_DOC = "BertConfig"
57
+
58
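+ # `remat` is Flax's rematerialization (gradient checkpointing) transform, used to wrap layers when gradient checkpointing is enabled.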
+ remat = nn_partitioning.remat
59
+
60
+
61
+ @flax.struct.dataclass
62
+ class FlaxBertForPreTrainingOutput(ModelOutput):
63
+ """
64
+ Output type of [`BertForPreTraining`].
65
+
66
+ Args:
67
+ prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
68
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
69
+ seq_relationship_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
70
+ Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
71
+ before SoftMax).
72
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
73
+ Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
74
+ `(batch_size, sequence_length, hidden_size)`.
75
+
76
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
77
+ attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
78
+ Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
79
+ sequence_length)`.
80
+
81
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
82
+ heads.
83
+ """
84
+
85
+ prediction_logits: jnp.ndarray = None
86
+ seq_relationship_logits: jnp.ndarray = None
87
+ hidden_states: Optional[tuple[jnp.ndarray]] = None
88
+ attentions: Optional[tuple[jnp.ndarray]] = None
89
+
90
+
91
+ BERT_START_DOCSTRING = r"""
92
+
93
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
94
+ library implements for all its models (such as downloading, saving and converting weights from PyTorch models)
95
+
96
+ This model is also a
97
+ [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
98
+ a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
99
+ behavior.
100
+
101
+ Finally, this model supports inherent JAX features such as:
102
+
103
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
104
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
105
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
106
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
107
+
108
+ Parameters:
109
+ config ([`BertConfig`]): Model configuration class with all the parameters of the model.
110
+ Initializing with a config file does not load the weights associated with the model, only the
111
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
112
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
113
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
114
+ `jax.numpy.bfloat16` (on TPUs).
115
+
116
+ This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
117
+ specified all the computation will be performed with the given `dtype`.
118
+
119
+ **Note that this only specifies the dtype of the computation and does not influence the dtype of model
120
+ parameters.**
121
+
122
+ If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
123
+ [`~FlaxPreTrainedModel.to_bf16`].
124
136
+
137
+ """
138
+
139
+ BERT_INPUTS_DOCSTRING = r"""
140
+ Args:
141
+ input_ids (`numpy.ndarray` of shape `({0})`):
142
+ Indices of input sequence tokens in the vocabulary.
143
+
144
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
145
+ [`PreTrainedTokenizer.__call__`] for details.
146
+
147
+ [What are input IDs?](../glossary#input-ids)
148
+ attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
149
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
150
+
151
+ - 1 for tokens that are **not masked**,
152
+ - 0 for tokens that are **masked**.
153
+
154
+ [What are attention masks?](../glossary#attention-mask)
155
+ token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
156
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
157
+ 1]`:
158
+
159
+ - 0 corresponds to a *sentence A* token,
160
+ - 1 corresponds to a *sentence B* token.
161
+
162
+ [What are token type IDs?](../glossary#token-type-ids)
163
+ position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
164
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
165
+ config.max_position_embeddings - 1]`.
166
+ head_mask (`numpy.ndarray` of shape `({0})`, *optional*):
167
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
168
+
169
+ - 1 indicates the head is **not masked**,
170
+ - 0 indicates the head is **masked**.
171
+
172
+ return_dict (`bool`, *optional*):
173
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
174
+
175
+ """


class FlaxBertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    config: BertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.word_embeddings = nn.Embed(
            self.config.vocab_size,
            self.config.hidden_size,
            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
            dtype=self.dtype,
        )
        self.position_embeddings = nn.Embed(
            self.config.max_position_embeddings,
            self.config.hidden_size,
            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
            dtype=self.dtype,
        )
        self.token_type_embeddings = nn.Embed(
            self.config.type_vocab_size,
            self.config.hidden_size,
            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
            dtype=self.dtype,
        )
        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)

    def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True):
        # Embed
        inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
        position_embeds = self.position_embeddings(position_ids.astype("i4"))
        token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))

        # Sum all embeddings
        hidden_states = inputs_embeds + token_type_embeddings + position_embeds

        # Layer Norm
        hidden_states = self.LayerNorm(hidden_states)
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        return hidden_states
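

# --- Editorial sketch (not part of the upstream file): the three embedding tables
# above all project into `hidden_size`, so their outputs add elementwise. A minimal
# shape check with toy dimensions; all numbers and the `_demo_*` name are made up.
def _demo_embedding_sum_shapes():
    import jax.numpy as jnp

    batch, seq, hidden = 2, 4, 8
    word = jnp.zeros((batch, seq, hidden))
    position = jnp.zeros((batch, seq, hidden))
    token_type = jnp.zeros((batch, seq, hidden))
    summed = word + token_type + position  # same order as FlaxBertEmbeddings.__call__
    assert summed.shape == (batch, seq, hidden)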


class FlaxBertSelfAttention(nn.Module):
    config: BertConfig
    causal: bool = False
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.head_dim = self.config.hidden_size // self.config.num_attention_heads
        if self.config.hidden_size % self.config.num_attention_heads != 0:
            raise ValueError(
                f"`config.hidden_size`: {self.config.hidden_size} has to be a multiple of "
                f"`config.num_attention_heads`: {self.config.num_attention_heads}"
            )

        self.query = nn.Dense(
            self.config.hidden_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )
        self.key = nn.Dense(
            self.config.hidden_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )
        self.value = nn.Dense(
            self.config.hidden_size,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )

        if self.causal:
            self.causal_mask = make_causal_mask(
                jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
            )

    def _split_heads(self, hidden_states):
        return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim))

    def _merge_heads(self, hidden_states):
        return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,))

    @nn.compact
    # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention._concatenate_to_cache
    def _concatenate_to_cache(self, key, value, query, attention_mask):
        """
        This function takes projected key, value states from a single input token and concatenates the states to
        cached states from previous steps. This function is slightly adapted from the official Flax repository:
        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
        """
        # detect if we're initializing by absence of existing cache data.
        is_initialized = self.has_variable("cache", "cached_key")
        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))

        if is_initialized:
            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
            # update key, value caches with our new 1d spatial slices
            cur_index = cache_index.value
            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
            key = lax.dynamic_update_slice(cached_key.value, key, indices)
            value = lax.dynamic_update_slice(cached_value.value, value, indices)
            cached_key.value = key
            cached_value.value = value
            num_updated_cache_vectors = query.shape[1]
            cache_index.value = cache_index.value + num_updated_cache_vectors
            # causal mask for cached decoder self-attention: our single query position should only attend to
            # those key positions that have already been generated and cached, not the remaining zero elements.
            pad_mask = jnp.broadcast_to(
                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
            )
            attention_mask = combine_masks(pad_mask, attention_mask)
        return key, value, attention_mask
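
    # --- Editorial sketch (not part of the upstream method): how one decode step
    # writes into the static cache with `lax.dynamic_update_slice`. Toy shapes; the
    # real cache is (batch, max_length, num_heads, head_dim). For example:
    #
    #   cache = jnp.zeros((1, 4, 2, 3))   # max_length=4, 2 heads, head_dim=3
    #   new_key = jnp.ones((1, 1, 2, 3))  # one freshly projected position
    #   cache = lax.dynamic_update_slice(cache, new_key, (0, 2, 0, 0))  # write slot 2
    #   pad_mask = jnp.arange(4) < 2 + 1  # [True, True, True, False]
    #
    # Only slots up to the running cache_index hold real data, which is exactly
    # what the pad_mask computed above enforces.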

    def __call__(
        self,
        hidden_states,
        attention_mask,
        layer_head_mask,
        key_value_states: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        deterministic=True,
        output_attentions: bool = False,
    ):
        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None
        batch_size = hidden_states.shape[0]

        # get query proj
        query_states = self.query(hidden_states)
        # get key, value proj
        if is_cross_attention:
            # cross_attentions
            key_states = self.key(key_value_states)
            value_states = self.value(key_value_states)
        else:
            # self_attention
            key_states = self.key(hidden_states)
            value_states = self.value(hidden_states)

        query_states = self._split_heads(query_states)
        key_states = self._split_heads(key_states)
        value_states = self._split_heads(value_states)

        # handle the cache and prepare the causal attention mask
        if self.causal:
            query_length, key_length = query_states.shape[1], key_states.shape[1]
            if self.has_variable("cache", "cached_key"):
                mask_shift = self.variables["cache"]["cache_index"]
                max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
                causal_mask = lax.dynamic_slice(
                    self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
                )
            else:
                causal_mask = self.causal_mask[:, :, :query_length, :key_length]
            causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])

        # combine masks if needed
        if attention_mask is not None and self.causal:
            attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
            attention_mask = combine_masks(attention_mask, causal_mask)
        elif self.causal:
            attention_mask = causal_mask
        elif attention_mask is not None:
            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))

        # During fast autoregressive decoding, we feed one position at a time,
        # and cache the keys and values step by step.
        if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
            key_states, value_states, attention_mask = self._concatenate_to_cache(
                key_states, value_states, query_states, attention_mask
            )

        # Convert the boolean attention mask to an attention bias.
        if attention_mask is not None:
            # attention mask in the form of attention bias
            attention_bias = lax.select(
                attention_mask > 0,
                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
            )
        else:
            attention_bias = None

        dropout_rng = None
        if not deterministic and self.config.attention_probs_dropout_prob > 0.0:
            dropout_rng = self.make_rng("dropout")

        attn_weights = dot_product_attention_weights(
            query_states,
            key_states,
            bias=attention_bias,
            dropout_rng=dropout_rng,
            dropout_rate=self.config.attention_probs_dropout_prob,
            broadcast_dropout=True,
            deterministic=deterministic,
            dtype=self.dtype,
            precision=None,
        )

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = jnp.einsum("...hqk,h->...hqk", attn_weights, layer_head_mask)

        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
        attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,))

        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
        return outputs
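

# --- Editorial sketch (not part of the upstream file): the boolean-mask-to-bias
# conversion above keeps allowed positions at 0 and pushes masked positions to the
# most negative finite value, so softmax drives them to ~0 probability. Toy values
# only; the `_demo_*` name is made up.
def _demo_mask_to_bias():
    import jax
    import jax.numpy as jnp
    from jax import lax

    mask = jnp.array([[1, 1, 0]])  # last key position is padding
    bias = lax.select(
        mask > 0,
        jnp.full(mask.shape, 0.0),
        jnp.full(mask.shape, jnp.finfo(jnp.float32).min),
    )
    probs = jax.nn.softmax(bias)  # masked slot gets (almost) zero probability
    return probs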


class FlaxBertSelfOutput(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.dense = nn.Dense(
            self.config.hidden_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)

    def __call__(self, hidden_states, input_tensor, deterministic: bool = True):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
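

# --- Editorial sketch (not part of the upstream file): BERT uses the "post-LN"
# residual pattern seen above, i.e. LayerNorm(sublayer_output + residual), rather
# than the "pre-LN" variant. A minimal standalone rendering with flax.linen; all
# shapes and the `_demo_*` name are made up.
def _demo_post_layernorm_residual():
    import jax
    import jax.numpy as jnp
    import flax.linen as nn

    ln = nn.LayerNorm(epsilon=1e-12)
    residual = jnp.ones((1, 4, 8))  # stand-in for the sublayer input
    sublayer_out = jnp.zeros((1, 4, 8))  # stand-in for dense(dropout(...)) output
    params = ln.init(jax.random.PRNGKey(0), sublayer_out + residual)
    return ln.apply(params, sublayer_out + residual)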


class FlaxBertAttention(nn.Module):
    config: BertConfig
    causal: bool = False
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.self = FlaxBertSelfAttention(self.config, causal=self.causal, dtype=self.dtype)
        self.output = FlaxBertSelfOutput(self.config, dtype=self.dtype)

    def __call__(
        self,
        hidden_states,
        attention_mask,
        layer_head_mask,
        key_value_states=None,
        init_cache=False,
        deterministic=True,
        output_attentions: bool = False,
    ):
        # Attention mask comes in as attention_mask.shape == (*batch_sizes, kv_length)
        # FLAX expects: attention_mask.shape == (*batch_sizes, 1, 1, kv_length) such that it is broadcastable
        # with attn_weights.shape == (*batch_sizes, num_heads, q_length, kv_length)
        attn_outputs = self.self(
            hidden_states,
            attention_mask,
            layer_head_mask=layer_head_mask,
            key_value_states=key_value_states,
            init_cache=init_cache,
            deterministic=deterministic,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]
        hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_outputs[1],)

        return outputs


class FlaxBertIntermediate(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.dense = nn.Dense(
            self.config.intermediate_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        self.activation = ACT2FN[self.config.hidden_act]

    def __call__(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states


class FlaxBertOutput(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.dense = nn.Dense(
            self.config.hidden_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)

    def __call__(self, hidden_states, attention_output, deterministic: bool = True):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        hidden_states = self.LayerNorm(hidden_states + attention_output)
        return hidden_states


class FlaxBertLayer(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.attention = FlaxBertAttention(self.config, causal=self.config.is_decoder, dtype=self.dtype)
        self.intermediate = FlaxBertIntermediate(self.config, dtype=self.dtype)
        self.output = FlaxBertOutput(self.config, dtype=self.dtype)
        if self.config.add_cross_attention:
            self.crossattention = FlaxBertAttention(self.config, causal=False, dtype=self.dtype)

    def __call__(
        self,
        hidden_states,
        attention_mask,
        layer_head_mask,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        deterministic: bool = True,
        output_attentions: bool = False,
    ):
        # Self Attention
        attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            layer_head_mask=layer_head_mask,
            init_cache=init_cache,
            deterministic=deterministic,
            output_attentions=output_attentions,
        )
        attention_output = attention_outputs[0]

        # Cross-Attention Block
        if encoder_hidden_states is not None:
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask=encoder_attention_mask,
                layer_head_mask=layer_head_mask,
                key_value_states=encoder_hidden_states,
                deterministic=deterministic,
                output_attentions=output_attentions,
            )
            attention_output = cross_attention_outputs[0]

        hidden_states = self.intermediate(attention_output)
        hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attention_outputs[1],)
            if encoder_hidden_states is not None:
                outputs += (cross_attention_outputs[1],)
        return outputs


class FlaxBertLayerCollection(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
    gradient_checkpointing: bool = False

    def setup(self):
        if self.gradient_checkpointing:
            FlaxBertCheckpointLayer = remat(FlaxBertLayer, static_argnums=(5, 6, 7))
            self.layers = [
                FlaxBertCheckpointLayer(self.config, name=str(i), dtype=self.dtype)
                for i in range(self.config.num_hidden_layers)
            ]
        else:
            self.layers = [
                FlaxBertLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_hidden_layers)
            ]

    def __call__(
        self,
        hidden_states,
        attention_mask,
        head_mask,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        all_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None

        # Check that head_mask, if provided, was specified for the correct number of layers
        if head_mask is not None:
            if head_mask.shape[0] != len(self.layers):
                raise ValueError(
                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for "
                    f"{head_mask.shape[0]}."
                )

        for i, layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = layer(
                hidden_states,
                attention_mask,
                head_mask[i] if head_mask is not None else None,
                encoder_hidden_states,
                encoder_attention_mask,
                init_cache,
                deterministic,
                output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions)

        if not return_dict:
            return tuple(v for v in outputs if v is not None)

        return FlaxBaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )
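

# --- Editorial sketch (not part of the upstream file): what `remat` (gradient /
# activation checkpointing), used in the setup above, does in isolation: forward
# activations are recomputed during the backward pass to save memory. Minimal flax
# example on a toy Dense layer; the `_demo_*` name is made up.
def _demo_remat():
    import jax
    import jax.numpy as jnp
    import flax.linen as nn

    CheckpointedDense = nn.remat(nn.Dense)  # same interface, recomputes activations on backward
    layer = CheckpointedDense(features=4)
    x = jnp.ones((1, 4))
    params = layer.init(jax.random.PRNGKey(0), x)
    loss = lambda p: layer.apply(p, x).sum()
    return jax.grad(loss)(params)  # numerically identical grads, lower peak memory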


class FlaxBertEncoder(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
    gradient_checkpointing: bool = False

    def setup(self):
        self.layer = FlaxBertLayerCollection(
            self.config,
            dtype=self.dtype,
            gradient_checkpointing=self.gradient_checkpointing,
        )

    def __call__(
        self,
        hidden_states,
        attention_mask,
        head_mask,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        return self.layer(
            hidden_states,
            attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            init_cache=init_cache,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class FlaxBertPooler(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.dense = nn.Dense(
            self.config.hidden_size,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
            dtype=self.dtype,
        )

    def __call__(self, hidden_states):
        cls_hidden_state = hidden_states[:, 0]
        cls_hidden_state = self.dense(cls_hidden_state)
        return nn.tanh(cls_hidden_state)
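

# --- Editorial sketch (not part of the upstream file): the pooler above takes only
# the first ([CLS]) position of the sequence output before the dense + tanh. Toy
# shapes for illustration; the `_demo_*` name is made up.
def _demo_cls_pooling():
    import jax.numpy as jnp

    batch, seq, hidden = 2, 5, 8
    sequence_output = jnp.arange(batch * seq * hidden, dtype=jnp.float32).reshape(batch, seq, hidden)
    cls_state = sequence_output[:, 0]  # (batch, hidden): one vector per example
    assert cls_state.shape == (batch, hidden)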


class FlaxBertPredictionHeadTransform(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype)
        self.activation = ACT2FN[self.config.hidden_act]
        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)

    def __call__(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
        return self.LayerNorm(hidden_states)


class FlaxBertLMPredictionHead(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32
    bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros

    def setup(self):
        self.transform = FlaxBertPredictionHeadTransform(self.config, dtype=self.dtype)
        self.decoder = nn.Dense(self.config.vocab_size, dtype=self.dtype, use_bias=False)
        self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,))

    def __call__(self, hidden_states, shared_embedding=None):
        hidden_states = self.transform(hidden_states)

        if shared_embedding is not None:
            hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
        else:
            hidden_states = self.decoder(hidden_states)

        bias = jnp.asarray(self.bias, self.dtype)
        hidden_states += bias
        return hidden_states
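

# --- Editorial sketch (not part of the upstream file): weight tying as used above.
# When embeddings are tied, the output projection reuses the (vocab, hidden) word
# embedding table transposed to (hidden, vocab) as the Dense kernel. Toy sizes; the
# `_demo_*` name is made up.
def _demo_weight_tying():
    import jax.numpy as jnp
    import flax.linen as nn

    vocab, hidden = 10, 4
    shared_embedding = jnp.ones((vocab, hidden))  # stand-in for the word embedding table
    decoder = nn.Dense(vocab, use_bias=False)
    hidden_states = jnp.ones((1, 3, hidden))
    # inject the transposed embedding as the kernel instead of learned parameters
    logits = decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
    assert logits.shape == (1, 3, vocab)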


class FlaxBertOnlyMLMHead(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.predictions = FlaxBertLMPredictionHead(self.config, dtype=self.dtype)

    def __call__(self, hidden_states, shared_embedding=None):
        hidden_states = self.predictions(hidden_states, shared_embedding=shared_embedding)
        return hidden_states


class FlaxBertOnlyNSPHead(nn.Module):
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.seq_relationship = nn.Dense(2, dtype=self.dtype)

    def __call__(self, pooled_output):
        return self.seq_relationship(pooled_output)


class FlaxBertPreTrainingHeads(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.predictions = FlaxBertLMPredictionHead(self.config, dtype=self.dtype)
        self.seq_relationship = nn.Dense(2, dtype=self.dtype)

    def __call__(self, hidden_states, pooled_output, shared_embedding=None):
        prediction_scores = self.predictions(hidden_states, shared_embedding=shared_embedding)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score


class FlaxBertPreTrainedModel(FlaxPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BertConfig
    base_model_prefix = "bert"
    module_class: nn.Module = None

    def __init__(
        self,
        config: BertConfig,
        input_shape: tuple = (1, 1),
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        gradient_checkpointing: bool = False,
        **kwargs,
    ):
        module = self.module_class(
            config=config,
            dtype=dtype,
            gradient_checkpointing=gradient_checkpointing,
            **kwargs,
        )
        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)

    def enable_gradient_checkpointing(self):
        self._module = self.module_class(
            config=self.config,
            dtype=self.dtype,
            gradient_checkpointing=True,
        )

    def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict:
        # init input tensors
        input_ids = jnp.zeros(input_shape, dtype="i4")
        token_type_ids = jnp.zeros_like(input_ids)
        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
        attention_mask = jnp.ones_like(input_ids)
        head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))

        params_rng, dropout_rng = jax.random.split(rng)
        rngs = {"params": params_rng, "dropout": dropout_rng}

        if self.config.add_cross_attention:
            encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,))
            encoder_attention_mask = attention_mask
            module_init_outputs = self.module.init(
                rngs,
                input_ids,
                attention_mask,
                token_type_ids,
                position_ids,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                return_dict=False,
            )
        else:
            module_init_outputs = self.module.init(
                rngs, input_ids, attention_mask, token_type_ids, position_ids, head_mask, return_dict=False
            )

        random_params = module_init_outputs["params"]

        if params is not None:
            random_params = flatten_dict(unfreeze(random_params))
            params = flatten_dict(unfreeze(params))
            for missing_key in self._missing_keys:
                params[missing_key] = random_params[missing_key]
            self._missing_keys = set()
            return freeze(unflatten_dict(params))
        else:
            return random_params

    # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderPreTrainedModel.init_cache
    def init_cache(self, batch_size, max_length):
        r"""
        Args:
            batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
                cache.
        """
        # init input variables to retrieve cache
        input_ids = jnp.ones((batch_size, max_length), dtype="i4")
        attention_mask = jnp.ones_like(input_ids, dtype="i4")
        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)

        init_variables = self.module.init(
            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
        )
        return unfreeze(init_variables["cache"])
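
    # --- Editorial sketch (not part of the upstream method): typical init_cache use
    # for fast autoregressive decoding with the causal variant (checkpoint name and
    # sizes are illustrative):
    #
    #   model = FlaxBertForCausalLM.from_pretrained("google-bert/bert-base-uncased", is_decoder=True)
    #   past_key_values = model.init_cache(batch_size=1, max_length=16)
    #
    # `past_key_values` then holds zeroed key/value buffers plus a cache_index of 0,
    # ready to be threaded through successive single-token forward passes.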

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def __call__(
        self,
        input_ids,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        params: Optional[dict] = None,
        dropout_rng: jax.random.PRNGKey = None,
        train: bool = False,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        past_key_values: Optional[dict] = None,
    ):
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        # init input tensors if not passed
        if token_type_ids is None:
            token_type_ids = jnp.zeros_like(input_ids)

        if position_ids is None:
            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)

        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)

        if head_mask is None:
            head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))

        # Handle any PRNG if needed
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        inputs = {"params": params or self.params}

        if self.config.add_cross_attention:
            # If past_key_values are passed, the cache is already initialized, so the private flag init_cache
            # has to be passed down to ensure the cache is used. The cache must also be marked as mutable so
            # that it can be changed by the FlaxBertAttention module.
            if past_key_values:
                inputs["cache"] = past_key_values
                mutable = ["cache"]
            else:
                mutable = False

            outputs = self.module.apply(
                inputs,
                jnp.array(input_ids, dtype="i4"),
                jnp.array(attention_mask, dtype="i4"),
                token_type_ids=jnp.array(token_type_ids, dtype="i4"),
                position_ids=jnp.array(position_ids, dtype="i4"),
                head_mask=jnp.array(head_mask, dtype="i4"),
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                deterministic=not train,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                rngs=rngs,
                mutable=mutable,
            )

            # add updated cache to model output
            if past_key_values is not None and return_dict:
                outputs, past_key_values = outputs
                outputs["past_key_values"] = unfreeze(past_key_values["cache"])
                return outputs
            elif past_key_values is not None and not return_dict:
                outputs, past_key_values = outputs
                outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]

        else:
            outputs = self.module.apply(
                inputs,
                jnp.array(input_ids, dtype="i4"),
                jnp.array(attention_mask, dtype="i4"),
                token_type_ids=jnp.array(token_type_ids, dtype="i4"),
                position_ids=jnp.array(position_ids, dtype="i4"),
                head_mask=jnp.array(head_mask, dtype="i4"),
                deterministic=not train,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                rngs=rngs,
            )

        return outputs


class FlaxBertModule(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
    add_pooling_layer: bool = True
    gradient_checkpointing: bool = False

    def setup(self):
        self.embeddings = FlaxBertEmbeddings(self.config, dtype=self.dtype)
        self.encoder = FlaxBertEncoder(
            self.config,
            dtype=self.dtype,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        self.pooler = FlaxBertPooler(self.config, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids: Optional[jnp.ndarray] = None,
        position_ids: Optional[jnp.ndarray] = None,
        head_mask: Optional[jnp.ndarray] = None,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # make sure `token_type_ids` is correctly initialized when not passed
        if token_type_ids is None:
            token_type_ids = jnp.zeros_like(input_ids)

        # make sure `position_ids` is correctly initialized when not passed
        if position_ids is None:
            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)

        hidden_states = self.embeddings(
            input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic
        )
        outputs = self.encoder(
            hidden_states,
            attention_mask,
            head_mask=head_mask,
            deterministic=deterministic,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            init_cache=init_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        pooled = self.pooler(hidden_states) if self.add_pooling_layer else None

        if not return_dict:
            # if pooled is None, don't return it
            if pooled is None:
                return (hidden_states,) + outputs[1:]
            return (hidden_states, pooled) + outputs[1:]

        return FlaxBaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=hidden_states,
            pooler_output=pooled,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


@add_start_docstrings(
    "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
    BERT_START_DOCSTRING,
)
class FlaxBertModel(FlaxBertPreTrainedModel):
    module_class = FlaxBertModule


append_call_sample_docstring(FlaxBertModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPooling, _CONFIG_FOR_DOC)
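

# --- Editorial sketch (not part of the upstream file): minimal end-to-end forward
# pass through FlaxBertModel. Requires network access to download the checkpoint;
# the `_demo_*` name is made up.
def _demo_flax_bert_model_forward():
    from transformers import AutoTokenizer, FlaxBertModel

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    model = FlaxBertModel.from_pretrained("google-bert/bert-base-uncased")
    inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
    outputs = model(**inputs)
    # (batch, seq, hidden) sequence output and (batch, hidden) pooled [CLS] output
    return outputs.last_hidden_state.shape, outputs.pooler_output.shape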


class FlaxBertForPreTrainingModule(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        self.bert = FlaxBertModule(
            config=self.config,
            dtype=self.dtype,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        self.cls = FlaxBertPreTrainingHeads(config=self.config, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        head_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.bert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.tie_word_embeddings:
            shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
        else:
            shared_embedding = None

        hidden_states = outputs[0]
        pooled_output = outputs[1]

        prediction_scores, seq_relationship_score = self.cls(
            hidden_states, pooled_output, shared_embedding=shared_embedding
        )

        if not return_dict:
            return (prediction_scores, seq_relationship_score) + outputs[2:]

        return FlaxBertForPreTrainingOutput(
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    """,
    BERT_START_DOCSTRING,
)
class FlaxBertForPreTraining(FlaxBertPreTrainedModel):
    module_class = FlaxBertForPreTrainingModule


FLAX_BERT_FOR_PRETRAINING_DOCSTRING = """
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxBertForPreTraining

    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    >>> model = FlaxBertForPreTraining.from_pretrained("google-bert/bert-base-uncased")

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
    >>> outputs = model(**inputs)

    >>> prediction_logits = outputs.prediction_logits
    >>> seq_relationship_logits = outputs.seq_relationship_logits
    ```
"""

overwrite_call_docstring(
    FlaxBertForPreTraining,
    BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length") + FLAX_BERT_FOR_PRETRAINING_DOCSTRING,
)
append_replace_return_docstrings(
    FlaxBertForPreTraining, output_type=FlaxBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC
)


class FlaxBertForMaskedLMModule(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        self.bert = FlaxBertModule(
            config=self.config,
            add_pooling_layer=False,
            dtype=self.dtype,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        self.cls = FlaxBertOnlyMLMHead(config=self.config, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        head_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.bert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        if self.config.tie_word_embeddings:
            shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
        else:
            shared_embedding = None

        # Compute the prediction scores
        logits = self.cls(hidden_states, shared_embedding=shared_embedding)

        if not return_dict:
            return (logits,) + outputs[1:]

        return FlaxMaskedLMOutput(
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING)
class FlaxBertForMaskedLM(FlaxBertPreTrainedModel):
    module_class = FlaxBertForMaskedLMModule


append_call_sample_docstring(FlaxBertForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC)


class FlaxBertForNextSentencePredictionModule(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        self.bert = FlaxBertModule(
            config=self.config,
            dtype=self.dtype,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        self.cls = FlaxBertOnlyNSPHead(dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        head_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        # Model
        outputs = self.bert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        seq_relationship_scores = self.cls(pooled_output)

        if not return_dict:
            return (seq_relationship_scores,) + outputs[2:]

        return FlaxNextSentencePredictorOutput(
            logits=seq_relationship_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """Bert Model with a `next sentence prediction (classification)` head on top.""",
    BERT_START_DOCSTRING,
)
class FlaxBertForNextSentencePrediction(FlaxBertPreTrainedModel):
    module_class = FlaxBertForNextSentencePredictionModule


FLAX_BERT_FOR_NEXT_SENT_PRED_DOCSTRING = """
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxBertForNextSentencePrediction

    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    >>> model = FlaxBertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

    >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
    >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
    >>> encoding = tokenizer(prompt, next_sentence, return_tensors="jax")

    >>> outputs = model(**encoding)
    >>> logits = outputs.logits
    >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
    ```
"""


overwrite_call_docstring(
    FlaxBertForNextSentencePrediction,
    BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length") + FLAX_BERT_FOR_NEXT_SENT_PRED_DOCSTRING,
)
append_replace_return_docstrings(
    FlaxBertForNextSentencePrediction, output_type=FlaxNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC
)


class FlaxBertForSequenceClassificationModule(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        self.bert = FlaxBertModule(
            config=self.config,
            dtype=self.dtype,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        classifier_dropout = (
            self.config.classifier_dropout
            if self.config.classifier_dropout is not None
            else self.config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(rate=classifier_dropout)
        self.classifier = nn.Dense(
            self.config.num_labels,
            dtype=self.dtype,
        )

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        head_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.bert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, deterministic=deterministic)
        logits = self.classifier(pooled_output)

        if not return_dict:
            return (logits,) + outputs[2:]

        return FlaxSequenceClassifierOutput(
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    BERT_START_DOCSTRING,
)
class FlaxBertForSequenceClassification(FlaxBertPreTrainedModel):
    module_class = FlaxBertForSequenceClassificationModule


append_call_sample_docstring(
    FlaxBertForSequenceClassification,
    _CHECKPOINT_FOR_DOC,
    FlaxSequenceClassifierOutput,
    _CONFIG_FOR_DOC,
)


class FlaxBertForMultipleChoiceModule(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        self.bert = FlaxBertModule(
            config=self.config,
            dtype=self.dtype,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
        self.classifier = nn.Dense(1, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        head_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        num_choices = input_ids.shape[1]
        input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
        attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
        token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
        position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None

        # Model
        outputs = self.bert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, deterministic=deterministic)
        logits = self.classifier(pooled_output)

        reshaped_logits = logits.reshape(-1, num_choices)

        if not return_dict:
            return (reshaped_logits,) + outputs[2:]

        return FlaxMultipleChoiceModelOutput(
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
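

# --- Editorial sketch (not part of the upstream file): the multiple-choice head
# above flattens (batch, num_choices, seq) into (batch * num_choices, seq) for the
# encoder, then folds the per-choice scores back into (batch, num_choices). Toy
# shapes; the `_demo_*` name is made up.
def _demo_multiple_choice_reshape():
    import jax.numpy as jnp

    batch, num_choices, seq = 2, 4, 7
    input_ids = jnp.zeros((batch, num_choices, seq), dtype=jnp.int32)
    flat = input_ids.reshape(-1, input_ids.shape[-1])  # (8, 7): one row per choice
    per_choice_logits = jnp.zeros((flat.shape[0], 1))  # classifier output, one score each
    reshaped = per_choice_logits.reshape(-1, num_choices)  # (2, 4): scores per example
    assert reshaped.shape == (batch, num_choices)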


@add_start_docstrings(
    """
    Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    BERT_START_DOCSTRING,
)
class FlaxBertForMultipleChoice(FlaxBertPreTrainedModel):
    module_class = FlaxBertForMultipleChoiceModule


overwrite_call_docstring(
    FlaxBertForMultipleChoice, BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
append_call_sample_docstring(
    FlaxBertForMultipleChoice, _CHECKPOINT_FOR_DOC, FlaxMultipleChoiceModelOutput, _CONFIG_FOR_DOC
)


class FlaxBertForTokenClassificationModule(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        self.bert = FlaxBertModule(
            config=self.config,
            dtype=self.dtype,
            add_pooling_layer=False,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        classifier_dropout = (
            self.config.classifier_dropout
            if self.config.classifier_dropout is not None
            else self.config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(rate=classifier_dropout)
        self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        head_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.bert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        logits = self.classifier(hidden_states)

        if not return_dict:
            return (logits,) + outputs[1:]

        return FlaxTokenClassifierOutput(
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    BERT_START_DOCSTRING,
)
class FlaxBertForTokenClassification(FlaxBertPreTrainedModel):
    module_class = FlaxBertForTokenClassificationModule


append_call_sample_docstring(
    FlaxBertForTokenClassification, _CHECKPOINT_FOR_DOC, FlaxTokenClassifierOutput, _CONFIG_FOR_DOC
)


class FlaxBertForQuestionAnsweringModule(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        self.bert = FlaxBertModule(
            config=self.config,
            dtype=self.dtype,
            add_pooling_layer=False,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        position_ids,
        head_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.bert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]

        logits = self.qa_outputs(hidden_states)
        start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        if not return_dict:
            return (start_logits, end_logits) + outputs[1:]

        return FlaxQuestionAnsweringModelOutput(
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
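

# --- Editorial sketch (not part of the upstream file): the QA head above emits two
# logits per token; `jnp.split` along the last axis separates them into span-start
# and span-end scores. Toy shapes (num_labels == 2 for extractive QA); the `_demo_*`
# name is made up.
def _demo_qa_logits_split():
    import jax.numpy as jnp

    batch, seq, num_labels = 2, 6, 2
    logits = jnp.zeros((batch, seq, num_labels))
    start_logits, end_logits = jnp.split(logits, num_labels, axis=-1)
    start_logits = start_logits.squeeze(-1)  # (batch, seq)
    end_logits = end_logits.squeeze(-1)  # (batch, seq)
    assert start_logits.shape == end_logits.shape == (batch, seq)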


@add_start_docstrings(
    """
    Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    BERT_START_DOCSTRING,
)
class FlaxBertForQuestionAnswering(FlaxBertPreTrainedModel):
    module_class = FlaxBertForQuestionAnsweringModule


append_call_sample_docstring(
    FlaxBertForQuestionAnswering,
    _CHECKPOINT_FOR_DOC,
    FlaxQuestionAnsweringModelOutput,
    _CONFIG_FOR_DOC,
)


class FlaxBertForCausalLMModule(nn.Module):
    config: BertConfig
    dtype: jnp.dtype = jnp.float32
    gradient_checkpointing: bool = False

    def setup(self):
        self.bert = FlaxBertModule(
            config=self.config,
            add_pooling_layer=False,
            dtype=self.dtype,
            gradient_checkpointing=self.gradient_checkpointing,
        )
        self.cls = FlaxBertOnlyMLMHead(config=self.config, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        token_type_ids: Optional[jnp.ndarray] = None,
        head_mask: Optional[jnp.ndarray] = None,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Model
        outputs = self.bert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            init_cache=init_cache,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        if self.config.tie_word_embeddings:
            shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
        else:
            shared_embedding = None

        # Compute the prediction scores
        logits = self.cls(hidden_states, shared_embedding=shared_embedding)

        if not return_dict:
            return (logits,) + outputs[1:]

        return FlaxCausalLMOutputWithCrossAttentions(
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


@add_start_docstrings(
    """
    Bert Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g. for
    autoregressive tasks.
    """,
    BERT_START_DOCSTRING,
)
class FlaxBertForCausalLM(FlaxBertPreTrainedModel):
    module_class = FlaxBertForCausalLMModule

    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
        # initializing the cache
        batch_size, seq_length = input_ids.shape

        past_key_values = self.init_cache(batch_size, max_length)
        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and
        # x < cache_length. But since the decoder uses a causal mask, those positions are masked anyway.
        # Thus, we can create a single static attention_mask here, which is more efficient for compilation.
        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
        if attention_mask is not None:
            position_ids = attention_mask.cumsum(axis=-1) - 1
            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
        else:
            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))

        return {
            "past_key_values": past_key_values,
            "attention_mask": extended_attention_mask,
            "position_ids": position_ids,
        }

    def update_inputs_for_generation(self, model_outputs, model_kwargs):
        model_kwargs["past_key_values"] = model_outputs.past_key_values
        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
        return model_kwargs
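

# --- Editorial sketch (not part of the upstream file): deriving position_ids from
# a left-padded attention mask via cumsum, as done in prepare_inputs_for_generation
# above. Padding slots end up at position -1, which is harmless because they are
# masked out of attention anyway. The `_demo_*` name is made up.
def _demo_position_ids_from_mask():
    import jax.numpy as jnp

    attention_mask = jnp.array([[0, 0, 1, 1, 1]])  # two left-padding tokens
    position_ids = attention_mask.cumsum(axis=-1) - 1
    # -> [[-1, -1, 0, 1, 2]]: real tokens get consecutive positions starting at 0
    assert position_ids.tolist() == [[-1, -1, 0, 1, 2]]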


append_call_sample_docstring(
    FlaxBertForCausalLM,
    _CHECKPOINT_FOR_DOC,
    FlaxCausalLMOutputWithCrossAttentions,
    _CONFIG_FOR_DOC,
)


__all__ = [
    "FlaxBertForCausalLM",
    "FlaxBertForMaskedLM",
    "FlaxBertForMultipleChoice",
    "FlaxBertForNextSentencePrediction",
    "FlaxBertForPreTraining",
    "FlaxBertForQuestionAnswering",
    "FlaxBertForSequenceClassification",
    "FlaxBertForTokenClassification",
    "FlaxBertModel",
    "FlaxBertPreTrainedModel",
]