FLMR / modeling_flmr.py

Upload folder using huggingface_hub

3704bcf verified 3 months ago

No virus

81.9 kB

	# coding=utf-8
	# Copyright 2024 FLMR Authors, The Hugging Face Team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	""" PyTorch FLMR model for Knowledge-intensive Visual Question Answering."""


	import copy
	import os
	import pathlib
	import string
	from dataclasses import dataclass
	from typing import Optional, Tuple, Union

	import torch
	import torch.distributed as dist
	from torch import Tensor, nn
	from torch.utils.cpp_extension import load

	from transformers.modeling_outputs import BaseModelOutputWithPooling
	from transformers.modeling_utils import PreTrainedModel
	from transformers.utils import (
	ModelOutput,
	add_start_docstrings,
	add_start_docstrings_to_model_forward,
	logging,
	replace_return_docstrings,
	)
	from transformers.models.bert.modeling_bert import BertModel
	from transformers.models.clip import CLIPVisionModel
	from .configuration_flmr import FLMRConfig, FLMRTextConfig, FLMRVisionConfig
	from .tokenization_flmr import FLMRQueryEncoderTokenizer, FLMRContextEncoderTokenizer
	from .tokenization_flmr_fast import FLMRQueryEncoderTokenizerFast, FLMRContextEncoderTokenizerFast
	from .flmr_utils import (
	colbert_score,
	colbert_score_reduce,
	get_rank,
	get_world_size,
	)


	logger = logging.get_logger(__name__)

	_CONFIG_FOR_DOC = "FLMRConfig"
	_CHECKPOINT_FOR_DOC = "LinWeizheDragon/PreFLMR_ViT-L"


	FLMR_PRETRAINED_MODEL_ARCHIVE_LIST = [
	"LinWeizheDragon/PreFLMR_ViT-L",
	"LinWeizheDragon/FLMR",
	# See all FLMR models at https://huggingface.co/models?filter=flmr
	]


	##########
	# Outputs
	##########


	@dataclass
	class FLMRContextEncoderOutput(ModelOutput):
	"""
	Class for outputs of the `doc()` function of [`FLMRModelForRetrieval`].

	Args:
	pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
	The FLMR encoder outputs the pooler_output that corresponds to the embedding of the first token of the context representation.
	This output can be used to embed questions for nearest neighbors queries with query embeddings.
	late_interaction_output (`torch.FloatTensor` of shape `(batch_size, context_embedding_length, embeddings_size)`):
	The FLMR encoder outputs the late_interaction_output that corresponds to the question representation. The embeddings of all tokens are included for late interaction retrieval.
	This output is to be used to embed contexts for late-interaction retrieval with query embeddings.
	context_mask (`torch.FloatTensor` of shape `(batch_size, context_embedding_length)`):
	The FLMR encoder outputs the context_mask that corresponds to the mask of the context representation.
	text_encoder_attentions (`Tuple[torch.FloatTensor]`, optional):
	Tuple of elements containing the attention weights of the text encoder's layers. Each element is a
	tensor of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
	text_encoder_hidden_states (`Tuple[torch.FloatTensor]`, optional):
	Tuple of elements containing the hidden states of the text encoder at each layer plus the initial embedding
	outputs. Each tensor has a shape of `(batch_size, sequence_length, hidden_size)`.
	vision_encoder_attentions (`Tuple[torch.FloatTensor]`, optional):
	Tuple of elements containing the attention weights of the vision encoder's layers. Each element is a
	tensor of shape `(batch_size, num_heads, vision_sequence_length, vision_sequence_length)`.
	vision_encoder_hidden_states (`Tuple[torch.FloatTensor]`, optional):
	Tuple of elements containing the hidden states of the vision encoder at each layer plus the initial embedding
	outputs. Each tensor has a shape of `(batch_size, vision_sequence_length, hidden_size)`.
	transformer_mapping_network_attentions (`Tuple[torch.FloatTensor]`, optional):
	Tuple of elements containing the attention weights of the transformer mapping network's layers. Each element
	is a tensor of shape `(batch_size, num_heads, mapping_sequence_length, mapping_sequence_length)`.
	transformer_mapping_network_hidden_states (`Tuple[torch.FloatTensor]`, optional):
	Tuple of elements containing the hidden states of the transformer mapping network at each layer plus the
	initial embedding outputs. Each tensor has a shape of `(batch_size, mapping_sequence_length, hidden_size)`.
	"""

	pooler_output: torch.FloatTensor
	late_interaction_output: torch.FloatTensor = None
	context_mask: torch.FloatTensor = None
	text_encoder_attentions: Optional[Tuple[Tensor]] = None
	text_encoder_hidden_states: Optional[Tuple[Tensor]] = None
	vision_encoder_attentions: Optional[Tuple[Tensor]] = None
	vision_encoder_hidden_states: Optional[Tuple[Tensor]] = None
	transformer_mapping_network_attentions: Optional[Tuple[Tensor]] = None
	transformer_mapping_network_hidden_states: Optional[Tuple[Tensor]] = None


	@dataclass
	class FLMRQueryEncoderOutput(ModelOutput):
	"""
	Class for outputs of the `query()` function of [`FLMRModelForRetrieval.query()`].

	Args:
	pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
	The FLMR encoder outputs the pooler_output that corresponds to the embedding of the first token of the query representation.
	This output can be used to embed questions for nearest neighbors queries with context embeddings.
	late_interaction_output (`torch.FloatTensor` of shape `(batch_size, query_embedding_length, embeddings_size)`):
	The FLMR encoder outputs the late_interaction_output that corresponds to the question representation. The embeddings of all tokens are included for late interaction retrieval.
	This output is to be used to embed questions for late-interaction retrieval with context embeddings.
	text_encoder_attentions (`Tuple[torch.FloatTensor]`, optional):
	Tuple of elements containing the attention weights of the text encoder's layers. Each element is a
	tensor of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
	text_encoder_hidden_states (`Tuple[torch.FloatTensor]`, optional):
	Tuple of elements containing the hidden states of the text encoder at each layer plus the initial embedding
	outputs. Each tensor has a shape of `(batch_size, sequence_length, hidden_size)`.
	vision_encoder_attentions (`Tuple[torch.FloatTensor]`, optional):
	Tuple of elements containing the attention weights of the vision encoder's layers. Each element is a
	tensor of shape `(batch_size, num_heads, vision_sequence_length, vision_sequence_length)`.
	vision_encoder_hidden_states (`Tuple[torch.FloatTensor]`, optional):
	Tuple of elements containing the hidden states of the vision encoder at each layer plus the initial embedding
	outputs. Each tensor has a shape of `(batch_size, vision_sequence_length, hidden_size)`.
	transformer_mapping_network_attentions (`Tuple[torch.FloatTensor]`, optional):
	Tuple of elements containing the attention weights of the transformer mapping network's layers. Each element
	is a tensor of shape `(batch_size, num_heads, mapping_sequence_length, mapping_sequence_length)`.
	transformer_mapping_network_hidden_states (`Tuple[torch.FloatTensor]`, optional):
	Tuple of elements containing the hidden states of the transformer mapping network at each layer plus the
	initial embedding outputs. Each tensor has a shape of `(batch_size, mapping_sequence_length, hidden_size)`.
	"""

	pooler_output: torch.FloatTensor
	late_interaction_output: torch.FloatTensor = None
	text_encoder_attentions: Optional[Tuple[Tensor]] = None
	text_encoder_hidden_states: Optional[Tuple[Tensor]] = None
	vision_encoder_attentions: Optional[Tuple[Tensor]] = None
	vision_encoder_hidden_states: Optional[Tuple[Tensor]] = None
	transformer_mapping_network_attentions: Optional[Tuple[Tensor]] = None
	transformer_mapping_network_hidden_states: Optional[Tuple[Tensor]] = None


	@dataclass
	class FLMRModelForRetrievalOutput(ModelOutput):
	"""
	Class for outputs of [`FLMRModelForRetrieval.query()`].

	Args:
	loss (`torch.FloatTensor`):
	contrastive loss of the input queries and positive and negative examples. This output is to be used in model training.
	scores (`torch.FloatTensor` of shape `(batch_size, num_positive_examples + num_negative_examples)`):
	The FLMR model outputs the scores that corresponds to the late-interaction scores of the input query and context. Each query is associated with `num_positive_examples` positive examples and `num_negative_examples` negative examples, and the scores are the late-interaction scores of the query and these examples.
	in_batch_negative_loss (`torch.FloatTensor` of shape `(batch_size, query_embedding_length, embeddings_size)`):
	The FLMR model outputs the in_batch_negative_loss which computes contrastive loss that includes in-batch negatives. For each positive example, all other examples in the batch except itself are considered negative examples in computing the contrastive loss. This improves ultimate performance in practice. This output is to be used in model training.
	query_late_interaction_output (`torch.FloatTensor` of shape `(batch_size, query_embedding_length, embeddings_size)`):
	The FLMR model outputs the query_late_interaction_output that corresponds to the late-interaction representations of the input query.
	context_late_interaction_output (`torch.FloatTensor` of shape `(batch_size, context_embedding_length, embeddings_size)`):
	The FLMR model outputs the context_late_interaction_output that corresponds to the late-interaction representations of the input context.
	query_attentions (`Tuple[Tuple[Tensor]]`, optional):
	Tuple of elements containing the attention weights of the query's layers. There are three sub-tuples in this tuple, corresponding to the attentions of the text encoder, vision encoder, and transformer mapping network. Each element in the sub-tuple is a tensor of shape `(batch_size, num_heads, sequence_length, sequence_length)`, with `sequence_length` being the sequence length in the corresponding encoder.
	query_hidden_states (`Tuple[Tuple[Tensor]]`, optional):
	Tuple of elements containing the hidden states of the query's layers. There are three sub-tuples in this tuple, corresponding to the hidden states of the text encoder, vision encoder, and transformer mapping network. Each element in the sub-tuple is a tensor of shape `(batch_size, sequence_length, hidden_size)`, with `sequence_length` being the sequence length in the corresponding encoder.
	context_attentions (`Tuple[Tuple[Tensor]]`, optional):
	Tuple of elements containing the attention weights of the context's layers. There are three sub-tuples in this tuple, corresponding to the attentions of the text encoder, vision encoder, and transformer mapping network. Each element in the sub-tuple is a tensor of shape `(batch_size, num_heads, sequence_length, sequence_length)`, with `sequence_length` being the sequence length in the corresponding encoder.
	context_hidden_states (`Tuple[Tuple[Tensor]]`, optional):
	Tuple of elements containing the hidden states of the context's layers. There are three sub-tuples in this tuple, corresponding to the hidden states of the text encoder, vision encoder, and transformer mapping network. Each element in the sub-tuple is a tensor of shape `(batch_size, sequence_length, hidden_size)`, with `sequence_length` being the sequence length in the corresponding encoder.
	"""

	loss: torch.FloatTensor
	scores: torch.FloatTensor = None
	in_batch_negative_loss: torch.FloatTensor = None
	query_late_interaction_output: torch.FloatTensor = None
	context_late_interaction_output: torch.FloatTensor = None
	query_attentions: Optional[Tuple[Tuple[Tensor]]] = None
	query_hidden_states: Optional[Tuple[Tuple[Tensor]]] = None
	context_attentions: Optional[Tuple[Tuple[Tensor]]] = None
	context_hidden_states: Optional[Tuple[Tuple[Tensor]]] = None


	class FLMRPreTrainedModel(PreTrainedModel):
	def _init_weights(self, module):
	"""Initialize the weights"""
	if isinstance(module, nn.Linear):
	# Slightly different from the TF version which uses truncated_normal for initialization
	# cf https://github.com/pytorch/pytorch/pull/5617
	module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
	if module.bias is not None:
	module.bias.data.zero_()
	elif isinstance(module, nn.Embedding):
	module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
	if module.padding_idx is not None:
	module.weight.data[module.padding_idx].zero_()
	elif isinstance(module, nn.LayerNorm):
	module.bias.data.zero_()
	module.weight.data.fill_(1.0)


	##################
	# PreTrainedModel
	##################


	class FLMRPretrainedModelForRetrieval(FLMRPreTrainedModel):
	"""
	An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
	models.
	"""

	config_class = FLMRConfig
	load_tf_weights = None
	base_model_prefix = "flmr"


	###############
	# Actual Models
	###############


	FLMR_START_DOCSTRING = r"""

	This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
	library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
	etc.)

	This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
	Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
	and behavior.

	Parameters:
	config ([`FLMRConfig`]): Model configuration class with all the parameters of the model.
	Initializing with a config file does not load the weights associated with the model, only the
	configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
	query_tokenizer ([`FLMRQueryEncoderTokenizer`], optional): The tokenizer used for tokenizing the query.
	The query tokenizer can be initialized with `FLMRQueryEncoderTokenizer.from_pretrained(pretrained_model_name_or_path)`.
	context_tokenizer ([`FLMRContextEncoderTokenizer`], optional): The tokenizer used for tokenizing the context.
	The context tokenizer can be initialized with `FLMRContextEncoderTokenizer.from_pretrained(pretrained_model_name_or_path)`.
	"""


	FLMR_MODEL_INPUTS_DOCSTRING = r"""
	Args:
	query_input_ids (`torch.LongTensor` of shape `(batch_size, query_length)`):
	Indices of input query tokens in the vocabulary. To match pretraining, FLMR input sequence should be
	formatted with [CLS] and Q marker tokens as follows:
	[CLS] [unused0] using the provided image, obtain documents that address the subsequent question : what is the capital of france? [SEP] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] ...

	FLMR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
	rather than the left.

	Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
	[`PreTrainedTokenizer.__call__`] for details.

	[What are input IDs?](../glossary#input-ids)
	query_attention_mask (`torch.FloatTensor` of shape `(batch_size, query_length)`, optional):
	Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

	- 1 for tokens that are not masked,
	- 0 for tokens that are masked.

	[What are attention masks?](../glossary#attention-mask)
	query_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, optional):
	Pixel values. Pixel values can be obtained using
	[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
	query_image_features (`torch.FloatTensor` of shape `(batch_size, vision_encoder_hidden_size)`, optional):
	Image features are required when `query_pixel_values` is not provided. In this case, vision encoder outputs are pre-extracted to speed up training and inference by skipping the vision encoder forward pass and the extract image features are directly given to the FLMR model. Image features can be obtained
	using [`CLIPVisionModel`]. See [`CLIPVisionModel.__call__`] for details.
	context_input_ids (`torch.LongTensor` of shape `(batch_size * (1 + num_negative_examples), context_length)`):
	Indices of input context tokens in the vocabulary. To match pretraining, FLMR input sequence should be
	formatted with [CLS] and D marker tokens as follows:
	[CLS] [unused1] paris is the capital of france. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] ...

	FLMR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
	rather than the left.

	Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
	[`PreTrainedTokenizer.__call__`] for details.

	[What are input IDs?](../glossary#input-ids)

	The input batch size of this tensor is `batch_size * (1 + num_negative_examples)`. Check the following argument `num_negative_examples` for details.

	context_attention_mask (`torch.FloatTensor` of shape `(batch_size * (1 + num_negative_examples), context_length)`, optional):
	Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

	- 1 for tokens that are not masked,
	- 0 for tokens that are masked.

	[What are attention masks?](../glossary#attention-mask)

	The input batch size of this tensor is `batch_size * (1 + num_negative_examples)`. Check the following argument `num_negative_examples` for details.
	context_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, optional):
	Pixel values. Pixel values can be obtained using
	[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
	context_image_features (`torch.FloatTensor` of shape `(batch_size, vision_encoder_hidden_size)`, optional):
	Image features are required when `context_pixel_values` is not provided. In this case, vision encoder outputs are pre-extracted to speed up training and inference by skipping the vision encoder forward pass and the extract image features are directly given to the FLMR model. Image features can be obtained
	using [`CLIPVisionModel`]. See [`CLIPVisionModel.__call__`] for details.
	use_in_batch_negatives (`bool`, optional):
	Whether or not to use in-batch negatives. If `True`, the contrastive loss includes in-batch negatives. For each positive example, all other examples in the batch except itself are considered negative examples in computing the contrastive loss. This improves ultimate performance in practice. This input is to be used in model training.
	in_batch_negatives_from_all_gpus (`bool`, optional):
	Whether or not to use in-batch negatives from all GPUs. If `True`, the contrastive loss includes in-batch negatives from all GPUs. This input is to be used in model training.
	num_negative_examples (`int`, optional):
	The number of negative examples in the batch. For example, if `num_negative_examples` is 4, the batch size of `context_input_ids` and `context_attention_mask` is `batch_size * 5`.
	query_concat_output_from_vision_encoder (`bool`, optional):
	Whether or not to concatenate the output from the vision encoder to the final query late-interaction representations. If `True`, the output from the vision encoder is concatenated to the query representations. When using a pretrained model, this will be read from the model configuration. It should be set to `True` for FLMR and PreFLMR -style models.
	query_concat_output_from_text_encoder (`bool`, optional):
	Whether or not to concatenate the output from the text encoder to the final query late-interaction representations. If `True`, the output from the text encoder is concatenated to the query representations. When using a pretrained model, this will be read from the model configuration. It should be set to `True` for FLMR and PreFLMR -style models.

	This argument can be set to `False` when performing mapping network pretraining as in FLMR and PreFLMR, in which case the output from the text encoder is not concatenated to the final query representations.
	context_concat_output_from_vision_encoder (`bool`, optional):
	Whether or not to concatenate the output from the vision encoder to the final context late-interaction representations. If `True`, the output from the vision encoder is concatenated to the context representations. When using a pretrained model, this will be read from the model configuration. It should be set to `False` for FLMR and PreFLMR -style models since the context vision encoder is not used.

	This can be set to `True` to additionally encode the context images with the vision encoder when context images are provided.
	context_concat_output_from_text_encoder (`bool`, optional):
	Whether or not to concatenate the output from the text encoder to the final context late-interaction representations. If `True`, the output from the text encoder is concatenated to the context representations. When using a pretrained model, this will be read from the model configuration. It should be set to `True` for FLMR and PreFLMR -style models.
	return_dict (`bool`, optional):
	Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
	output_attentions (`bool`, optional):
	Whether or not to return the attentions tensors of all attention layers. See `*_attentions` under returned
	tensors for more detail.
	output_hidden_states (`bool`, optional):
	Whether or not to return the hidden states of all layers. See `*_hidden_states` under returned tensors for more detail.
	"""


	FLMR_MODEL_QUERY_INPUTS_DOCSTRING = r"""
	Args:
	input_ids (`torch.LongTensor` of shape `(batch_size, query_length)`):
	Indices of input query tokens in the vocabulary. To match pretraining, FLMR input sequence should be
	formatted with [CLS] and Q marker tokens as follows:
	[CLS] [unused0] using the provided image, obtain documents that address the subsequent question : what is the capital of france? [SEP] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] ...

	FLMR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
	rather than the left.

	Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
	[`PreTrainedTokenizer.__call__`] for details.

	[What are input IDs?](../glossary#input-ids)
	attention_mask (`torch.FloatTensor` of shape `(batch_size, query_length)`, optional):
	Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

	- 1 for tokens that are not masked,
	- 0 for tokens that are masked.

	[What are attention masks?](../glossary#attention-mask)
	pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, optional):
	Pixel values. Pixel values can be obtained using
	[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
	image_features (`torch.FloatTensor` of shape `(batch_size, vision_encoder_hidden_size)`, optional):
	Image features are required when `pixel_values` is not provided. In this case, vision encoder outputs are pre-extracted to speed up training and inference by skipping the vision encoder forward pass and the extract image features are directly given to the FLMR model. Image features can be obtained
	using [`CLIPVisionModel`]. See [`CLIPVisionModel.__call__`] for details.
	concat_output_from_vision_encoder (`bool`, optional):
	Whether or not to concatenate the output from the vision encoder to the final query late-interaction representations. If `True`, the output from the vision encoder is concatenated to the query representations. When using a pretrained model, this will be read from the model configuration. It should be set to `True` for FLMR and PreFLMR -style models.
	concat_output_from_text_encoder (`bool`, optional):
	Whether or not to concatenate the output from the text encoder to the final query late-interaction representations. If `True`, the output from the text encoder is concatenated to the query representations. When using a pretrained model, this will be read from the model configuration. It should be set to `True` for FLMR and PreFLMR -style models.

	This argument can be set to `False` when performing mapping network pretraining as in FLMR and PreFLMR, in which case the output from the text encoder is not concatenated to the final query representations.
	"""


	FLMR_MODEL_CONTEXT_INPUTS_DOCSTRING = r"""
	Args:
	input_ids (`torch.LongTensor` of shape `(batch_size * (1 + num_negative_examples), context_length)`):
	Indices of input context tokens in the vocabulary. To match pretraining, FLMR input sequence should be
	formatted with [CLS] and D marker tokens as follows:
	[CLS] [unused1] paris is the capital of france. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] ...

	FLMR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
	rather than the left.

	Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
	[`PreTrainedTokenizer.__call__`] for details.

	[What are input IDs?](../glossary#input-ids)

	The input batch size of this tensor is `batch_size * (1 + num_negative_examples)`. Check the following argument `num_negative_examples` for details.
	attention_mask (`torch.FloatTensor` of shape `(batch_size * (1 + num_negative_examples), context_length)`, optional):
	Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

	- 1 for tokens that are not masked,
	- 0 for tokens that are masked.

	[What are attention masks?](../glossary#attention-mask)

	The input batch size of this tensor is `batch_size * (1 + num_negative_examples)`. Check the following argument `num_negative_examples` for details.
	pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, optional):
	Pixel values. Pixel values can be obtained using
	[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
	image_features (`torch.FloatTensor` of shape `(batch_size, vision_encoder_hidden_size)`, optional):
	Image features are required when `pixel_values` is not provided. In this case, vision encoder outputs are pre-extracted to speed up training and inference by skipping the vision encoder forward pass and the extract image features are directly given to the FLMR model. Image features can be obtained
	using [`CLIPVisionModel`]. See [`CLIPVisionModel
	.__call__`] for details.
	concat_output_from_vision_encoder (`bool`, optional):
	Whether or not to concatenate the output from the vision encoder to the final context late-interaction representations. If `True`, the output from the vision encoder is concatenated to the context representations. When using a pretrained model, this will be read from the model configuration. It should be set to `False` for FLMR and PreFLMR -style models since the context vision encoder is not used.

	This can be set to `True` to additionally encode the context images with the vision encoder when context images are provided.
	concat_output_from_text_encoder (`bool`, optional):
	Whether or not to concatenate the output from the text encoder to the final context late-interaction representations. If `True`, the output from the text encoder is concatenated to the context representations. When using a pretrained model, this will be read from the model configuration. It should be set to `True` for FLMR and PreFLMR -style models.
	keep_dims (`bool`, optional):
	Whether or not to keep the dimensions of the output. If `True`, the output is returned with the same dimensions as the input. If `False`, the output is returned with the batch size of the input and the context length. This input is to be used in model training.
	return_mask (`bool`, optional):
	Whether or not to return the mask of the context representation. If `True`, the mask of the context representation is returned. This input is to be used in model training.
	"""


	FLMR_TEXT_ENCODERS_START_DOCSTRING = r"""

	This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
	library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
	etc.)

	This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
	Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
	and behavior.

	Parameters:
	config ([`FLMRTextConfig`]): Model configuration class with all the parameters of the model.
	Initializing with a config file does not load the weights associated with the model, only the
	configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
	"""


	# Modified from transformers.models.dpr.modeling_dpr with DPR -> FLMR
	FLMR_TEXT_ENCODERS_INPUTS_DOCSTRING = r"""
	Args:
	input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
	Indices of input sequence tokens in the vocabulary. To match pretraining, FLMR input sequence should be
	formatted with [CLS] and [SEP] tokens as follows:

	(a) For sequence pairs (for a pair title+text for example):

	```
	tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
	token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
	```

	(b) For single sequences (for a question for example):

	```
	tokens: [CLS] the dog is hairy . [SEP]
	token_type_ids: 0 0 0 0 0 0 0
	```

	FLMR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
	rather than the left.

	Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
	[`PreTrainedTokenizer.__call__`] for details.

	[What are input IDs?](../glossary#input-ids)
	attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, optional):
	Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

	- 1 for tokens that are not masked,
	- 0 for tokens that are masked.

	[What are attention masks?](../glossary#attention-mask)
	token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, optional):
	Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
	1]`:

	- 0 corresponds to a sentence A token,
	- 1 corresponds to a sentence B token.

	[What are token type IDs?](../glossary#token-type-ids)
	inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, optional):
	Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
	is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
	model's internal embedding lookup matrix.
	output_attentions (`bool`, optional):
	Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
	tensors for more detail.
	output_hidden_states (`bool`, optional):
	Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
	more detail.
	return_dict (`bool`, optional):
	Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
	"""

	FLMR_VISION_ENCODERS_START_DOCSTRING = r"""
	This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
	library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
	etc.)

	This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
	Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
	and behavior.

	Parameters:
	config ([`FLMRVisionConfig`]): Model configuration class with all the parameters of the model.
	Initializing with a config file does not load the weights associated with the model, only the
	configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
	"""

	# Modified from transformers.models.clip.modeling_clip with CLIP -> FLMR
	FLMR_VISION_ENCODERS_INPUTS_DOCSTRING = r"""
	Args:
	pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
	Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
	[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
	output_attentions (`bool`, optional):
	Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
	tensors for more detail.
	output_hidden_states (`bool`, optional):
	Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
	more detail.
	return_dict (`bool`, optional):
	Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
	"""


	class FLMRMultiLayerPerceptron(nn.Module):
	"""
	A simple multi-layer perceptron with an activation function. This can be used as the mapping network in the FLMR model.
	"""

	def forward(self, x: torch.Tensor) -> torch.Tensor:
	return self.model(x)

	def __init__(self, sizes, bias=True, act=nn.Tanh):
	super(FLMRMultiLayerPerceptron, self).__init__()
	layers = []
	for i in range(len(sizes) - 1):
	layers.append(nn.Linear(sizes[i], sizes[i + 1], bias=bias))
	if i < len(sizes) - 2:
	layers.append(act())
	self.model = nn.Sequential(*layers)


	@add_start_docstrings(
	"The bare FLMR model that can be used to generate late-interaction embeddings for both multi-modal queries and documents. ",
	FLMR_START_DOCSTRING,
	)
	class FLMRModelForRetrieval(FLMRPretrainedModelForRetrieval):
	_keys_to_ignore_on_load_unexpected = [r"cls"]
	main_input_name = "query_input_ids"
	_tied_weights_keys = [] # Added dynamically at initialization depending on the architecture

	def __init__(self, config: FLMRConfig, query_tokenizer=None, context_tokenizer=None):
	super().__init__(config)
	self.config = config
	self.vision_model_version = config.vision_model_version

	self.context_text_encoder = FLMRTextModel(config.text_config)
	self.context_text_encoder_linear = nn.Linear(config.text_config.hidden_size, config.dim, bias=False)

	self.query_tokenizer = query_tokenizer
	self.context_tokenizer = context_tokenizer

	if self.query_tokenizer is None:
	logger.warning(
	"query_tokenizer is not provided. A tokenizer is initialized from `bert-base-uncased`. Please pass in an FLMRQueryEncoderTokenizer instance if you need to extend the vocabulary beyond the existing ones in the bert tokenizer."
	)
	from transformers import FLMRQueryEncoderTokenizer

	# initialize a FLMRQueryEncoderTokenizer
	self.query_tokenizer = FLMRQueryEncoderTokenizer.from_pretrained("bert-base-uncased")

	if self.context_tokenizer is None:
	logger.warning(
	"context_tokenizer is not provided. A tokenizer is initialized from `bert-base-uncased`. Please pass in an FLMRContextEncoderTokenizer instance if you need to extend the vocabulary beyond the existing ones in the bert tokenizer."
	)
	from transformers import FLMRContextEncoderTokenizer

	# initialize a FLMRContextEncoderTokenizer
	self.context_tokenizer = FLMRContextEncoderTokenizer.from_pretrained("bert-base-uncased")

	self.mapping_network_prefix_length = self.config.mapping_network_prefix_length
	self.vision_encoder_embedding_size = self.config.vision_config.hidden_size
	self.text_encoder_embedding_size = self.config.text_config.hidden_size
	self.late_interaction_embedding_size = self.config.dim

	self.context_vision_projection = FLMRMultiLayerPerceptron(
	(
	self.vision_encoder_embedding_size,
	(self.late_interaction_embedding_size * self.mapping_network_prefix_length) // 2,
	self.late_interaction_embedding_size * self.mapping_network_prefix_length,
	)
	)

	if self.config.use_vision_encoder:
	self.context_vision_encoder = FLMRVisionModel(config.vision_config)

	if self.config.use_transformer_mapping_network:
	# This is a PreFLMR style model
	transformer_mapping_config_base = self.config.transformer_mapping_config_base
	try:
	from transformers import BertConfig
	from transformers.models.bert.modeling_bert import BertEncoder
	except Exception as e:
	raise ImportError(f"Failed to import BertConfig and BertEncoder from transformers. {e}")

	transformer_mapping_config = BertConfig.from_pretrained(transformer_mapping_config_base)

	assert (
	self.config.text_config.hidden_size == transformer_mapping_config.hidden_size
	), f"hidden_size {self.config.text_config.hidden_size} != transformer_mapping_config.hidden_size {transformer_mapping_config.hidden_size}. To use cross attention, the dimensions must match."
	# shallow transformer
	transformer_mapping_config.num_hidden_layers = self.config.transformer_mapping_num_hidden_layers
	# add cross attention
	transformer_mapping_config.is_decoder = True
	transformer_mapping_config.add_cross_attention = True

	# The linear layer from vision encoder to transformer input
	self.transformer_mapping_input_linear = nn.Linear(
	self.vision_encoder_embedding_size, transformer_mapping_config.hidden_size
	)

	# The transformer encoder
	self.transformer_mapping_network = BertEncoder(transformer_mapping_config)

	# The linear layer from transformer output to FLMR dim
	self.transformer_mapping_output_linear = nn.Linear(
	transformer_mapping_config.hidden_size, self.late_interaction_embedding_size
	)

	if self.config.separate_query_and_context_text_encoder:
	self.query_text_encoder = copy.deepcopy(self.context_text_encoder)
	self.query_text_encoder_linear = copy.deepcopy(self.context_text_encoder_linear)
	else:
	self.query_text_encoder = self.context_text_encoder
	self.query_text_encoder_linear = self.context_text_encoder_linear
	self._tied_weights_keys += ["context_text_encoder", "context_text_encoder_linear"]

	if self.config.separate_query_and_context_vision_encoder:
	self.query_vision_encoder = copy.deepcopy(self.context_vision_encoder)
	self.query_vision_projection = copy.deepcopy(self.context_vision_projection)
	else:
	self.query_vision_encoder = self.context_vision_encoder
	self.query_vision_projection = self.context_vision_projection
	self._tied_weights_keys += ["context_vision_encoder", "context_vision_projection"]

	if self.config.load_cpu_extension:
	try:
	FLMRModelForRetrieval.try_load_torch_extensions()
	except Exception as e:
	raise(f"Unable to load `segmented_maxsim.cpp`. hf-hub does not download this file automatically. Please download it manually from `https://huggingface.co/LinWeizheDragon/PreFLMR_ViT-L/blob/main/segmented_maxsim.cpp` and put it under the same folder as the model file.\n {e}")

	if self.config.mask_punctuation:
	self.skiplist = {
	w: True
	for symbol in string.punctuation
	for w in [symbol, self.context_tokenizer.encode(symbol, add_special_tokens=False)[0]]
	}

	if self.config.mask_instruction_token is not None:
	self.mask_instruction = True
	# obtain the token id of the instruction token
	self.instruction_token_id = self.query_tokenizer.encode(
	self.config.mask_instruction_token, add_special_tokens=False
	)[0]
	else:
	self.mask_instruction = False

	self.loss_fn = torch.nn.CrossEntropyLoss()

	# Initialize weights and apply final processing
	self.post_init()

	@property
	def use_gpu(self):
	return self.device.type == "cuda"

	@classmethod
	def from_pretrained(self, name_or_path, **kwargs):
	obj = super().from_pretrained(name_or_path, **kwargs)
	return obj

	@classmethod
	def try_load_torch_extensions(cls):
	if hasattr(cls, "loaded_extensions"):
	return

	logger.info(
	"Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)..."
	)
	segmented_maxsim_cpp = load(
	name="segmented_maxsim_cpp",
	sources=[
	os.path.join(pathlib.Path(__file__).parent.resolve(), "segmented_maxsim.cpp"),
	],
	extra_cflags=["-O3"],
	verbose=os.getenv("COLBERT_LOAD_TORCH_EXTENSION_VERBOSE", "False") == "True",
	)
	cls.segmented_maxsim = segmented_maxsim_cpp.segmented_maxsim_cpp

	cls.loaded_extensions = True

	def query_mask(self, input_ids, skiplist):
	if not self.mask_instruction:
	return self.mask(input_ids, skiplist)

	# find the position of end of instruction in input_ids
	# mask the tokens before the position
	sep_id = self.instruction_token_id
	sep_positions = torch.argmax((input_ids == sep_id).int(), dim=1).tolist()
	# if any of the positions is lower than 1, set to 1
	for i, x in enumerate(sep_positions):
	if x < 1:
	sep_positions[i] = 1
	logger.error(f"can not find the separator in the input_ids: {input_ids[i].tolist()}")
	mask = [
	[
	(x not in skiplist) and (x != 0) and (index > sep_positions[seq_index] or index < 2)
	for index, x in enumerate(d)
	]
	for seq_index, d in enumerate(input_ids.cpu().tolist())
	]
	return mask

	@add_start_docstrings_to_model_forward(FLMR_MODEL_INPUTS_DOCSTRING)
	@replace_return_docstrings(output_type=FLMRModelForRetrievalOutput, config_class=_CONFIG_FOR_DOC)
	def forward(
	self,
	query_input_ids: Optional[torch.Tensor] = None,
	query_attention_mask: Optional[torch.Tensor] = None,
	query_pixel_values: Optional[torch.Tensor] = None,
	query_image_features: Optional[torch.Tensor] = None,
	context_input_ids: Optional[torch.Tensor] = None,
	context_attention_mask: Optional[torch.Tensor] = None,
	context_pixel_values: Optional[torch.Tensor] = None,
	context_image_features: Optional[torch.Tensor] = None,
	use_in_batch_negatives: bool = True,
	in_batch_negatives_from_all_gpus: bool = False,
	num_negative_examples: int = 1,
	query_concat_output_from_vision_encoder: Optional[bool] = None,
	query_concat_output_from_text_encoder: Optional[bool] = None,
	context_concat_output_from_vision_encoder: Optional[bool] = None,
	context_concat_output_from_text_encoder: Optional[bool] = None,
	return_dict: bool = None,
	output_attentions: bool = None,
	output_hidden_states: bool = None,
	) -> Union[FLMRModelForRetrievalOutput, Tuple[Tensor, ...]]:
	r"""
	Return:

	Examples:

	```python
	>>> import torch
	>>> from transformers import FLMRQueryEncoderTokenizer, FLMRContextEncoderTokenizer, FLMRModelForRetrieval, AutoImageProcessor

	>>> checkpoint_path = "LinWeizheDragon/PreFLMR_ViT-L"
	>>> image_processor_name = "openai/clip-vit-large-patch14"
	>>> query_tokenizer = FLMRQueryEncoderTokenizer.from_pretrained(checkpoint_path, subfolder="query_tokenizer")
	>>> context_tokenizer = FLMRContextEncoderTokenizer.from_pretrained(checkpoint_path, subfolder="context_tokenizer")

	>>> model = FLMRModelForRetrieval.from_pretrained(checkpoint_path,
	query_tokenizer=query_tokenizer,
	context_tokenizer=context_tokenizer,
	)
	>>> image_processor = AutoImageProcessor.from_pretrained(image_processor_name)

	>>> Q_encoding = query_tokenizer(["Using the provided image, obtain documents that address the subsequent question: What is the capital of France?", "Extract documents linked to the question provided in conjunction with the image: What is the capital of China?"])
	>>> D_encoding = context_tokenizer(["Paris is the capital of France.", "Beijing is the capital of China.",
	"Paris is the capital of France.", "Beijing is the capital of China."])
	>>> Q_pixel_values = torch.zeros(2, 3, 224, 224)
	>>> inputs = dict(
	query_input_ids=Q_encoding['input_ids'],
	query_attention_mask=Q_encoding['attention_mask'],
	query_pixel_values=Q_pixel_values,
	context_input_ids=D_encoding['input_ids'],
	context_attention_mask=D_encoding['attention_mask'],
	use_in_batch_negatives=True,
	)

	>>> model.forward(**inputs)
	FLMRModelForRetrievalOutput(loss=tensor(4.5000, device='cuda:0', dtype=torch.float16,
	grad_fn=<NllLossBackward0>), scores=tensor([[44.2188, 40.6562],
	[39.4375, 48.4062]], device='cuda:0', dtype=torch.float16,
	grad_fn=<ViewBackward0>), in_batch_negative_loss=tensor(5.1994, device='cuda:0', grad_fn=<NllLossBackward0>), query_late_interaction_output=tensor(...), context_late_interaction_output=tensor(...)
	```
	"""

	if query_concat_output_from_vision_encoder is None:
	query_concat_output_from_vision_encoder = self.config.query_concat_output_from_vision_encoder

	if query_concat_output_from_text_encoder is None:
	query_concat_output_from_text_encoder = self.config.query_concat_output_from_text_encoder

	if context_concat_output_from_vision_encoder is None:
	context_concat_output_from_vision_encoder = self.config.context_concat_output_from_vision_encoder

	if context_concat_output_from_text_encoder is None:
	context_concat_output_from_text_encoder = self.config.context_concat_output_from_text_encoder

	output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	output_hidden_states = (
	output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	)
	return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	query_outputs = self.query(
	input_ids=query_input_ids,
	attention_mask=query_attention_mask,
	pixel_values=query_pixel_values,
	image_features=query_image_features,
	concat_output_from_vision_encoder=query_concat_output_from_vision_encoder,
	concat_output_from_text_encoder=query_concat_output_from_text_encoder,
	output_attentions=output_attentions,
	output_hidden_states=output_hidden_states,
	)
	Q = query_outputs.late_interaction_output

	context_outputs = self.doc(
	input_ids=context_input_ids,
	attention_mask=context_attention_mask,
	pixel_values=context_pixel_values,
	image_features=context_image_features,
	concat_output_from_vision_encoder=context_concat_output_from_vision_encoder,
	concat_output_from_text_encoder=context_concat_output_from_text_encoder,
	keep_dims=True,
	return_mask=True,
	output_attentions=output_attentions,
	output_hidden_states=output_hidden_states,
	)
	D, D_mask = context_outputs.late_interaction_output, context_outputs.context_mask

	# Gather tensors from other GPUs
	if in_batch_negatives_from_all_gpus:
	Q, D, D_mask = self.gather_tensors_from_other_gpus(Q, D, D_mask)
	# Repeat each query encoding for every corresponding document.
	Q_duplicated = Q.repeat_interleave(num_negative_examples + 1, dim=0).contiguous()

	scores = self.score(Q_duplicated, D, D_mask)

	# Use contrastive learning
	batch_size = query_input_ids.shape[0]
	scores = scores.view(-1, num_negative_examples + 1)
	labels = torch.zeros(batch_size, dtype=torch.long, device=self.device)
	loss = self.loss_fn(scores, labels)

	if use_in_batch_negatives:
	ib_loss = self.compute_ib_loss_new(Q, D, D_mask)
	else:
	ib_loss = None

	if output_attentions:
	query_attentions = (
	query_outputs.text_encoder_attentions if query_outputs.text_encoder_attentions is not None else None,
	query_outputs.vision_encoder_attentions
	if query_outputs.vision_encoder_attentions is not None
	else None,
	query_outputs.transformer_mapping_network_attentions
	if query_outputs.transformer_mapping_network_attentions is not None
	else None,
	)
	context_attentions = (
	context_outputs.text_encoder_attentions
	if context_outputs.text_encoder_attentions is not None
	else None,
	context_outputs.vision_encoder_attentions
	if context_outputs.vision_encoder_attentions is not None
	else None,
	context_outputs.transformer_mapping_network_attentions
	if context_outputs.transformer_mapping_network_attentions is not None
	else None,
	)
	else:
	query_attentions = None
	context_attentions = None

	if output_hidden_states:
	query_hidden_states = (
	query_outputs.text_encoder_hidden_states
	if query_outputs.text_encoder_hidden_states is not None
	else None,
	query_outputs.vision_encoder_hidden_states
	if query_outputs.vision_encoder_hidden_states is not None
	else None,
	query_outputs.transformer_mapping_network_hidden_states
	if query_outputs.transformer_mapping_network_hidden_states is not None
	else None,
	)
	context_hidden_states = (
	context_outputs.text_encoder_hidden_states
	if context_outputs.text_encoder_hidden_states is not None
	else None,
	context_outputs.vision_encoder_hidden_states
	if context_outputs.vision_encoder_hidden_states is not None
	else None,
	context_outputs.transformer_mapping_network_hidden_states
	if context_outputs.transformer_mapping_network_hidden_states is not None
	else None,
	)
	else:
	query_hidden_states = None
	context_hidden_states = None

	if not return_dict:
	if output_attentions and output_hidden_states:
	return (
	loss,
	scores,
	ib_loss,
	query_outputs.late_interaction_output,
	context_outputs.late_interaction_output,
	query_attentions,
	query_hidden_states,
	context_attentions,
	context_hidden_states,
	)
	elif output_attentions:
	return (
	loss,
	scores,
	ib_loss,
	query_outputs.late_interaction_output,
	context_outputs.late_interaction_output,
	query_attentions,
	context_attentions,
	)
	elif output_hidden_states:
	return (
	loss,
	scores,
	ib_loss,
	query_outputs.late_interaction_output,
	context_outputs.late_interaction_output,
	query_hidden_states,
	context_hidden_states,
	)
	else:
	return (
	loss,
	scores,
	ib_loss,
	query_outputs.late_interaction_output,
	context_outputs.late_interaction_output,
	)

	return FLMRModelForRetrievalOutput(
	loss=loss,
	scores=scores,
	in_batch_negative_loss=ib_loss,
	query_late_interaction_output=query_outputs.late_interaction_output,
	context_late_interaction_output=context_outputs.late_interaction_output,
	query_attentions=query_attentions if output_attentions else None,
	query_hidden_states=query_hidden_states if output_hidden_states else None,
	context_attentions=context_attentions if output_attentions else None,
	context_hidden_states=context_hidden_states if output_hidden_states else None,
	)

	def compute_ib_loss_new(self, Q: torch.Tensor, D: torch.Tensor, D_mask: torch.Tensor) -> torch.Tensor:
	# Q: batch_size x q_len x dim
	# D: batch_size*n_docs x i_len x dim
	# D_mask: batch_size*n_docs x i_len x dim
	# 1 x batch_size*n_docs x i_len x dim matmul batch_size x 1 x q_len x dim
	# = batch_size x batch_size*n_docs x i_len x q_len

	scores = (D.float().unsqueeze(0) @ Q.float().permute(0, 2, 1).unsqueeze(1)).flatten(
	0, 1
	) # query-major unsqueeze
	scores = colbert_score_reduce(scores, D_mask.repeat(Q.size(0), 1, 1))

	in_batch_scores = scores.reshape(Q.size(0), -1)

	batch_size = Q.shape[0]
	batch_size_with_pos_and_neg = D.shape[0]
	num_pos_and_neg = batch_size_with_pos_and_neg // batch_size

	# batch_size x dim matmul dim x (num_pos+num_neg)*batch_size
	# --> batch_size x (num_pos+num_neg)*batch_size
	in_batch_labels = torch.zeros(batch_size, batch_size_with_pos_and_neg).to(scores.device)
	step = num_pos_and_neg
	for i in range(batch_size):
	in_batch_labels[i, step * i] = 1
	# print('in_batch_labels', in_batch_labels)
	in_batch_labels = torch.argmax(in_batch_labels, dim=1)
	# print('in_batch_labels', in_batch_labels)

	loss = self.loss_fn(in_batch_scores, in_batch_labels)

	return loss

	def gather_tensors_from_other_gpus(self, query_embeddings, item_embeddings, item_mask):
	# print("get rank", get_rank())
	# print("get world size", get_world_size())
	# Gather embeddings from other GPUs
	n_nodes = get_world_size()
	if n_nodes == 1:
	return query_embeddings, item_embeddings, item_mask
	# Create placeholder to hold embeddings passed from other ranks
	global_query_embeddings_placeholder = [
	torch.zeros(*query_embeddings.shape, dtype=query_embeddings.dtype).to(query_embeddings.device)
	for _ in range(n_nodes)
	]
	global_item_embeddings_placeholder = [
	torch.zeros(*item_embeddings.shape, dtype=item_embeddings.dtype).to(item_embeddings.device)
	for _ in range(n_nodes)
	]
	global_item_mask_placeholder = [
	torch.zeros(*item_mask.shape, dtype=item_mask.dtype).to(item_mask.device) for _ in range(n_nodes)
	]
	dist.all_gather(global_query_embeddings_placeholder, query_embeddings.detach())
	dist.all_gather(global_item_embeddings_placeholder, item_embeddings.detach())
	dist.all_gather(global_item_mask_placeholder, item_mask.detach())

	global_query_embeddings = []
	global_item_embeddings = []
	global_item_mask = []
	# print(f"rank {get_rank()} global_query_embeddings", global_query_embeddings)
	# print(f"rank {get_rank()} global_item_embeddings", global_item_embeddings)
	# input()
	current_rank = get_rank()
	for rank_index, remote_q_embeddings in enumerate(global_query_embeddings_placeholder):
	# We append the embeddings from other GPUs if this embedding does not require gradients
	if rank_index != current_rank:
	global_query_embeddings.append(remote_q_embeddings)
	else:
	global_query_embeddings.append(query_embeddings)

	for rank_index, remote_item_embeddings in enumerate(global_item_embeddings_placeholder):
	# We append the embeddings from other GPUs if this embedding does not require gradients
	if rank_index != current_rank:
	global_item_embeddings.append(remote_item_embeddings)
	else:
	global_item_embeddings.append(item_embeddings)

	for rank_index, remote_item_mask in enumerate(global_item_mask_placeholder):
	# We append the embeddings from other GPUs if this embedding does not require gradients
	if rank_index != current_rank:
	global_item_mask.append(remote_item_mask)
	else:
	global_item_mask.append(item_mask)

	# Replace the previous variables with gathered tensors
	query_embeddings = torch.cat(global_query_embeddings)
	item_embeddings = torch.cat(global_item_embeddings)
	item_mask = torch.cat(global_item_mask)

	return query_embeddings, item_embeddings, item_mask

	@add_start_docstrings_to_model_forward(FLMR_MODEL_QUERY_INPUTS_DOCSTRING)
	@replace_return_docstrings(output_type=FLMRQueryEncoderOutput, config_class=_CONFIG_FOR_DOC)
	def query(
	self,
	input_ids: torch.Tensor,
	attention_mask: torch.Tensor,
	pixel_values: Optional[torch.Tensor] = None,
	image_features: Optional[torch.Tensor] = None,
	concat_output_from_vision_encoder: Optional[bool] = None,
	concat_output_from_text_encoder: Optional[bool] = None,
	output_attentions: Optional[bool] = None,
	output_hidden_states: Optional[bool] = None,
	):
	r"""
	Returns:

	"""

	if concat_output_from_vision_encoder is None:
	concat_output_from_vision_encoder = self.config.query_concat_output_from_vision_encoder

	if concat_output_from_text_encoder is None:
	concat_output_from_text_encoder = self.config.query_concat_output_from_text_encoder

	output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	output_hidden_states = (
	output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	)

	input_modality = []
	if pixel_values is not None or image_features is not None:
	input_modality.append("image")
	if input_ids is not None and attention_mask is not None:
	input_modality.append("text")

	text_encoder_outputs = None
	vision_encoder_outputs = None
	transformer_mapping_outputs = None

	if "image" in input_modality:
	assert (
	pixel_values is not None or image_features is not None
	), "pixel_values or image_features must be provided if image modality is used"
	assert (
	pixel_values is None or image_features is None
	), "pixel_values and image_features cannot be provided at the same time"

	if "text" in input_modality:
	assert (
	input_ids is not None and attention_mask is not None
	), "input_ids and attention_mask must be provided if text modality is used"
	# Forward the text encoder
	input_ids, attention_mask = input_ids.to(self.device), attention_mask.to(self.device)
	text_encoder_outputs = self.query_text_encoder(input_ids, attention_mask=attention_mask)
	text_encoder_hidden_states = text_encoder_outputs[0]
	text_embeddings = self.query_text_encoder_linear(text_encoder_hidden_states)
	mask = torch.tensor(self.query_mask(input_ids, skiplist=[]), device=self.device).unsqueeze(2).float()

	text_embeddings = text_embeddings * mask

	if "image" in input_modality:
	if pixel_values is not None:
	batch_size = pixel_values.shape[0]
	# Forward the vision encoder
	pixel_values = pixel_values.to(self.device)
	if len(pixel_values.shape) == 5:
	# Multiple ROIs are provided
	# merge the first two dimensions
	pixel_values = pixel_values.reshape(
	-1, pixel_values.shape[2], pixel_values.shape[3], pixel_values.shape[4]
	)
	vision_encoder_outputs = self.query_vision_encoder(pixel_values, output_hidden_states=True)
	vision_embeddings = vision_encoder_outputs.last_hidden_state[:, 0]

	if image_features is not None:
	batch_size = image_features.shape[0]
	vision_embeddings = image_features.to(self.device)

	# Forward the vision projection / mapping network
	vision_embeddings = self.query_vision_projection(vision_embeddings)
	vision_embeddings = vision_embeddings.view(batch_size, -1, self.late_interaction_embedding_size)

	if self.config.use_transformer_mapping_network:
	# select the second last layer
	vision_second_last_layer_hidden_states = vision_encoder_outputs.hidden_states[-2][:, 1:]
	# transformer_mapping
	transformer_mapping_input_features = self.transformer_mapping_input_linear(
	vision_second_last_layer_hidden_states
	)

	# Cross attention only attends to the first 32 tokens
	encoder_mask = torch.ones_like(mask).to(mask.device, dtype=mask.dtype)
	cross_attention_length = self.config.transformer_mapping_cross_attention_length
	if text_encoder_hidden_states.shape[1] > cross_attention_length:
	text_encoder_hidden_states = text_encoder_hidden_states[:, :cross_attention_length]
	encoder_mask = encoder_mask[:, :cross_attention_length]

	# Obtain cross attention mask
	encoder_extended_attention_mask = self.invert_attention_mask(encoder_mask.squeeze(-1))
	# Pass through the transformer mapping
	transformer_mapping_outputs = self.transformer_mapping_network(
	transformer_mapping_input_features,
	encoder_hidden_states=text_encoder_hidden_states,
	encoder_attention_mask=encoder_extended_attention_mask,
	)
	transformer_mapping_output_features = transformer_mapping_outputs.last_hidden_state
	# Convert the dimension to FLMR dim
	transformer_mapping_output_features = self.transformer_mapping_output_linear(
	transformer_mapping_output_features
	)
	# Merge with the vision embeddings
	vision_embeddings = torch.cat([vision_embeddings, transformer_mapping_output_features], dim=1)

	if concat_output_from_vision_encoder and concat_output_from_text_encoder:
	Q = torch.cat([text_embeddings, vision_embeddings], dim=1)
	elif concat_output_from_vision_encoder:
	Q = vision_embeddings
	elif concat_output_from_text_encoder:
	Q = text_embeddings

	vision_encoder_attentions = (
	vision_encoder_outputs.attentions
	if vision_encoder_outputs is not None
	and hasattr(vision_encoder_outputs, "attentions")
	and output_attentions
	else None
	)
	vision_encoder_hidden_states = (
	vision_encoder_outputs.hidden_states
	if vision_encoder_outputs is not None
	and hasattr(vision_encoder_outputs, "hidden_states")
	and output_hidden_states
	else None
	)
	text_encoder_attentions = (
	text_encoder_outputs.attentions
	if text_encoder_outputs is not None and hasattr(text_encoder_outputs, "attentions") and output_attentions
	else None
	)
	text_encoder_hidden_states = (
	text_encoder_outputs.hidden_states
	if text_encoder_outputs is not None
	and hasattr(text_encoder_outputs, "hidden_states")
	and output_hidden_states
	else None
	)
	transformer_mapping_network_attentions = (
	transformer_mapping_outputs.attentions
	if transformer_mapping_outputs is not None
	and hasattr(transformer_mapping_outputs, "attentions")
	and output_attentions
	else None
	)
	transformer_mapping_network_hidden_states = (
	transformer_mapping_outputs.hidden_states
	if transformer_mapping_outputs is not None
	and hasattr(transformer_mapping_outputs, "hidden_states")
	and output_hidden_states
	else None
	)

	return FLMRQueryEncoderOutput(
	pooler_output=Q[:, 0, :],
	late_interaction_output=torch.nn.functional.normalize(Q, p=2, dim=2),
	vision_encoder_attentions=vision_encoder_attentions,
	vision_encoder_hidden_states=vision_encoder_hidden_states,
	text_encoder_attentions=text_encoder_attentions,
	text_encoder_hidden_states=text_encoder_hidden_states,
	transformer_mapping_network_attentions=transformer_mapping_network_attentions,
	transformer_mapping_network_hidden_states=transformer_mapping_network_hidden_states,
	)

	@add_start_docstrings_to_model_forward(FLMR_MODEL_CONTEXT_INPUTS_DOCSTRING)
	@replace_return_docstrings(output_type=FLMRContextEncoderOutput, config_class=_CONFIG_FOR_DOC)
	def doc(
	self,
	input_ids: torch.Tensor,
	attention_mask: torch.Tensor,
	pixel_values: Optional[torch.Tensor] = None,
	image_features: Optional[torch.Tensor] = None,
	concat_output_from_vision_encoder: Optional[bool] = None,
	concat_output_from_text_encoder: Optional[bool] = None,
	keep_dims: Optional[bool] = True,
	return_mask: Optional[bool] = True,
	output_attentions: Optional[bool] = None,
	output_hidden_states: Optional[bool] = None,
	):
	r"""
	Returns:

	"""
	assert keep_dims in [True, False]

	if concat_output_from_vision_encoder is None:
	concat_output_from_vision_encoder = self.config.context_concat_output_from_vision_encoder

	if concat_output_from_text_encoder is None:
	concat_output_from_text_encoder = self.config.context_concat_output_from_text_encoder

	output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	output_hidden_states = (
	output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	)

	input_modality = []
	if pixel_values is not None or image_features is not None:
	input_modality.append("image")
	if input_ids is not None and attention_mask is not None:
	input_modality.append("text")

	text_encoder_outputs = None
	vision_encoder_outputs = None

	if "image" in input_modality:
	assert (
	pixel_values is not None or image_features is not None
	), "pixel_values or image_features must be provided if image modality is used"
	assert (
	pixel_values is None or image_features is None
	), "pixel_values and image_features cannot be provided at the same time"

	if "text" in input_modality:
	assert (
	input_ids is not None and attention_mask is not None
	), "input_ids and attention_mask must be provided if text modality is used"
	# Forward the text encoder
	input_ids, attention_mask = input_ids.to(self.device), attention_mask.to(self.device)
	text_encoder_outputs = self.context_text_encoder(input_ids, attention_mask=attention_mask)
	text_embeddings = text_encoder_outputs[0]
	text_embeddings = self.context_text_encoder_linear(text_embeddings)

	mask = torch.tensor(self.mask(input_ids, skiplist=self.skiplist), device=self.device).unsqueeze(2).float()
	text_embeddings = text_embeddings * mask

	if "image" in input_modality:
	if pixel_values is not None:
	# Forward the vision encoder
	pixel_values = pixel_values.to(self.device)
	vision_encoder_outputs = self.context_vision_encoder(pixel_values)
	vision_embeddings = vision_encoder_outputs.last_hidden_state[:, 0]

	if image_features is not None:
	vision_embeddings = image_features.to(self.device)

	batch_size = vision_embeddings.shape[0]

	# Forward the vision projection / mapping network
	vision_embeddings = self.context_vision_projection(vision_embeddings)
	vision_embeddings = vision_embeddings.view(
	-1, self.mapping_network_prefix_length, self.late_interaction_embedding_size
	)

	image_mask = torch.ones(batch_size, vision_embeddings.shape[1], 1).to(self.device)

	if concat_output_from_vision_encoder and concat_output_from_text_encoder:
	# Note: vision embeddings must be in the front since the ColBERT engine only indexes embeddings up to number of 1's in the mask
	# TODO: fix the engine to support masks with discontinuous 0 and 1.
	D = torch.cat([vision_embeddings, text_embeddings], dim=1)
	# concatenate the mask
	mask = torch.cat([mask, image_mask], dim=1)
	elif concat_output_from_vision_encoder:
	D = vision_embeddings
	mask = image_mask
	elif concat_output_from_text_encoder:
	D = text_embeddings
	mask = mask

	D = torch.nn.functional.normalize(D, p=2, dim=2)

	if self.use_gpu:
	D = D.half()

	if keep_dims is False:
	D, mask = D.cpu(), mask.bool().cpu().squeeze(-1)
	D = [d[mask[idx]] for idx, d in enumerate(D)]

	vision_encoder_attentions = (
	vision_encoder_outputs.attentions
	if vision_encoder_outputs is not None
	and hasattr(vision_encoder_outputs, "attentions")
	and output_attentions
	else None
	)
	vision_encoder_hidden_states = (
	vision_encoder_outputs.hidden_states
	if vision_encoder_outputs is not None
	and hasattr(vision_encoder_outputs, "hidden_states")
	and output_hidden_states
	else None
	)
	text_encoder_attentions = (
	text_encoder_outputs.attentions
	if text_encoder_outputs is not None and hasattr(text_encoder_outputs, "attentions") and output_attentions
	else None
	)
	text_encoder_hidden_states = (
	text_encoder_outputs.hidden_states
	if text_encoder_outputs is not None
	and hasattr(text_encoder_outputs, "hidden_states")
	and output_hidden_states
	else None
	)

	return FLMRContextEncoderOutput(
	pooler_output=D[:, 0, :],
	late_interaction_output=D,
	context_mask=mask.bool() if return_mask else None,
	vision_encoder_attentions=vision_encoder_attentions,
	vision_encoder_hidden_states=vision_encoder_hidden_states,
	text_encoder_attentions=text_encoder_attentions,
	text_encoder_hidden_states=text_encoder_hidden_states,
	)

	def score(self, Q, D_padded, D_mask):
	# assert self.colbert_config.similarity == 'cosine'
	# if self.colbert_config.similarity == 'l2':
	# assert self.colbert_config.interaction == 'colbert'
	# return (-1.0 * ((Q.unsqueeze(2) - D_padded.unsqueeze(1))**2).sum(-1)).max(-1).values.sum(-1)
	return colbert_score(Q, D_padded, D_mask, use_gpu=self.use_gpu)

	def mask(self, input_ids, skiplist):
	mask = [[(x not in skiplist) and (x != 0) for x in d] for d in input_ids.cpu().tolist()]
	return mask


	@add_start_docstrings(
	"The bare FLMR text encoder that can be used to generate late-interaction embeddings for texts in queries and contexts. This model is based on a `BertModel`. It can be used like a `BertModel` model for encoding text.",
	FLMR_TEXT_ENCODERS_START_DOCSTRING,
	)
	class FLMRTextModel(FLMRPreTrainedModel):
	base_model_prefix = "bert_model"
	config_class = FLMRTextConfig

	def __init__(self, config: FLMRTextConfig, args, *kwargs):
	super().__init__(config)
	self.bert_model = BertModel(config, add_pooling_layer=True)
	if self.bert_model.config.hidden_size <= 0:
	raise ValueError("Encoder hidden_size can't be zero")
	self.projection_dim = config.projection_dim
	if self.projection_dim > 0:
	self.encode_proj = nn.Linear(self.bert_model.config.hidden_size, config.projection_dim)
	# Initialize weights and apply final processing
	self.post_init()

	@add_start_docstrings_to_model_forward(FLMR_TEXT_ENCODERS_INPUTS_DOCSTRING)
	@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=FLMRTextConfig)
	def forward(
	self,
	input_ids: Optional[Tensor] = None,
	attention_mask: Optional[Tensor] = None,
	token_type_ids: Optional[Tensor] = None,
	inputs_embeds: Optional[Tensor] = None,
	output_attentions: bool = None,
	output_hidden_states: bool = None,
	return_dict: bool = None,
	) -> Union[BaseModelOutputWithPooling, Tuple[Tensor, ...]]:
	r"""
	Returns:

	"""
	output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	output_hidden_states = (
	output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	)
	return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	outputs = self.bert_model(
	input_ids=input_ids,
	attention_mask=attention_mask,
	token_type_ids=token_type_ids,
	inputs_embeds=inputs_embeds,
	output_attentions=output_attentions,
	output_hidden_states=output_hidden_states,
	return_dict=return_dict,
	)
	sequence_output = outputs[0]
	pooled_output = sequence_output[:, 0, :]

	if self.projection_dim > 0:
	pooled_output = self.encode_proj(pooled_output)

	if not return_dict:
	return (sequence_output, pooled_output) + outputs[2:]

	return BaseModelOutputWithPooling(
	last_hidden_state=sequence_output,
	pooler_output=pooled_output,
	hidden_states=outputs.hidden_states,
	attentions=outputs.attentions,
	)

	@property
	def embeddings_size(self) -> int:
	if self.projection_dim > 0:
	return self.encode_proj.out_features
	return self.bert_model.config.hidden_size


	@add_start_docstrings(
	"The bare FLMR vision encoder that can be used to generate late-interaction embeddings for images in queries and contexts. This model is based on a `CLIPVisionModel`. It can be used like a `CLIPVisionModel` model for encoding images.",
	FLMR_VISION_ENCODERS_START_DOCSTRING,
	)
	class FLMRVisionModel(FLMRPreTrainedModel):
	base_model_prefix = "vision_model"
	config_class = FLMRVisionConfig
	main_input_name = "pixel_values"
	_no_split_modules = ["CLIPEncoderLayer"]

	def __init__(self, config: FLMRVisionConfig):
	super().__init__(config)
	self.vision_model = CLIPVisionModel(config)
	self.post_init()

	def get_input_embeddings(self) -> nn.Module:
	return self.vision_model.vision_model.embeddings.patch_embedding

	@add_start_docstrings_to_model_forward(FLMR_VISION_ENCODERS_INPUTS_DOCSTRING)
	@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=FLMRVisionConfig)
	def forward(
	self,
	pixel_values: Optional[torch.FloatTensor] = None,
	output_attentions: Optional[bool] = None,
	output_hidden_states: Optional[bool] = None,
	return_dict: Optional[bool] = None,
	) -> Union[Tuple, BaseModelOutputWithPooling]:
	r"""
	Returns:

	Examples:

	```python
	>>> from PIL import Image
	>>> import requests
	>>> from transformers import AutoProcessor, FLMRVisionModel

	>>> model = FLMRVisionModel.from_pretrained("openai/clip-vit-base-patch32")
	>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

	>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
	>>> image = Image.open(requests.get(url, stream=True).raw)

	>>> inputs = processor(images=image, return_tensors="pt")

	>>> outputs = model(**inputs)
	>>> last_hidden_state = outputs.last_hidden_state
	>>> pooled_output = outputs.pooler_output # pooled CLS states
	```"""
	return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	return self.vision_model(
	pixel_values=pixel_values,
	output_attentions=output_attentions,
	output_hidden_states=output_hidden_states,
	return_dict=return_dict,
	)