Spaces:

Mat17892
/

iris

Runtime error

App Files Files Community

iris / llama.cpp /examples /llava /minicpmv-convert-image-encoder-to-gguf.py

Mat17892

llamacpp

b664585 verified 2 months ago

raw

history blame

31.9 kB

	# coding=utf-8
	# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	""" PyTorch Siglip model. """
	# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes


	import os
	import math
	import warnings

	import numpy as np
	import torch
	import torch.nn.functional as F
	import torch.utils.checkpoint
	from torch import nn
	from torch.nn.init import _calculate_fan_in_and_fan_out

	from transformers.activations import ACT2FN
	from transformers.modeling_utils import PreTrainedModel
	from transformers.configuration_utils import PretrainedConfig
	from transformers.utils import (
	logging,
	)
	from transformers.utils import logging

	logger = logging.get_logger(__name__)

	class SiglipVisionConfig(PretrainedConfig):
	r"""
	This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
	Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
	configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
	[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
	Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
	documentation from [`PretrainedConfig`] for more information.
	Args:
	hidden_size (`int`, optional, defaults to 768):
	Dimensionality of the encoder layers and the pooler layer.
	intermediate_size (`int`, optional, defaults to 3072):
	Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
	num_hidden_layers (`int`, optional, defaults to 12):
	Number of hidden layers in the Transformer encoder.
	num_attention_heads (`int`, optional, defaults to 12):
	Number of attention heads for each attention layer in the Transformer encoder.
	num_channels (`int`, optional, defaults to 3):
	Number of channels in the input images.
	image_size (`int`, optional, defaults to 224):
	The size (resolution) of each image.
	patch_size (`int`, optional, defaults to 16):
	The size (resolution) of each patch.
	hidden_act (`str` or `function`, optional, defaults to `"gelu_pytorch_tanh"`):
	The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
	`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
	layer_norm_eps (`float`, optional, defaults to 1e-06):
	The epsilon used by the layer normalization layers.
	attention_dropout (`float`, optional, defaults to 0.0):
	The dropout ratio for the attention probabilities.
	Example:
	```python
	>>> from transformers import SiglipVisionConfig, SiglipVisionModel
	>>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
	>>> configuration = SiglipVisionConfig()
	>>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
	>>> model = SiglipVisionModel(configuration)
	>>> # Accessing the model configuration
	>>> configuration = model.config
	```"""

	model_type = "siglip_vision_model"

	def __init__(
	self,
	hidden_size=768,
	intermediate_size=3072,
	num_hidden_layers=12,
	num_attention_heads=12,
	num_channels=3,
	image_size=224,
	patch_size=16,
	hidden_act="gelu_pytorch_tanh",
	layer_norm_eps=1e-6,
	attention_dropout=0.0,
	**kwargs,
	):
	super().__init__(**kwargs)

	self.hidden_size = hidden_size
	self.intermediate_size = intermediate_size
	self.num_hidden_layers = num_hidden_layers
	self.num_attention_heads = num_attention_heads
	self.num_channels = num_channels
	self.patch_size = patch_size
	self.image_size = image_size
	self.attention_dropout = attention_dropout
	self.layer_norm_eps = layer_norm_eps
	self.hidden_act = hidden_act

	_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"

	SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
	"google/siglip-base-patch16-224",
	# See all SigLIP models at https://huggingface.co/models?filter=siglip
	]

	# Copied from transformers.models.llama.modeling_llama._get_unpad_data
	def _get_unpad_data(attention_mask):
	seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
	indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
	max_seqlen_in_batch = seqlens_in_batch.max().item()
	cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
	return (
	indices,
	cu_seqlens,
	max_seqlen_in_batch,
	)


	def _trunc_normal_(tensor, mean, std, a, b):
	# Cut & paste from PyTorch official master until it's in a few official releases - RW
	# Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
	def norm_cdf(x):
	# Computes standard normal cumulative distribution function
	return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

	if (mean < a - 2 * std) or (mean > b + 2 * std):
	warnings.warn(
	"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
	"The distribution of values may be incorrect.",
	stacklevel=2,
	)

	# Values are generated by using a truncated uniform distribution and
	# then using the inverse CDF for the normal distribution.
	# Get upper and lower cdf values
	l = norm_cdf((a - mean) / std)
	u = norm_cdf((b - mean) / std)

	# Uniformly fill tensor with values from [l, u], then translate to
	# [2l-1, 2u-1].
	tensor.uniform_(2 * l - 1, 2 * u - 1)

	# Use inverse cdf transform for normal distribution to get truncated
	# standard normal
	if tensor.dtype in [torch.float16, torch.bfloat16]:
	# The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
	og_dtype = tensor.dtype
	tensor = tensor.to(torch.float32)
	tensor.erfinv_()
	tensor = tensor.to(og_dtype)
	else:
	tensor.erfinv_()

	# Transform to proper mean, std
	tensor.mul_(std * math.sqrt(2.0))
	tensor.add_(mean)

	# Clamp to ensure it's in the proper range
	if tensor.dtype == torch.float16:
	# The `clamp_` op is not (yet?) defined in float16+cpu
	tensor = tensor.to(torch.float32)
	tensor.clamp_(min=a, max=b)
	tensor = tensor.to(torch.float16)
	else:
	tensor.clamp_(min=a, max=b)


	def trunc_normal_tf_(
	tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
	):
	"""Fills the input Tensor with values drawn from a truncated
	normal distribution. The values are effectively drawn from the
	normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
	with values outside :math:`[a, b]` redrawn until they are within
	the bounds. The method used for generating the random values works
	best when :math:`a \\leq \text{mean} \\leq b`.
	NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
	bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
	and the result is subsquently scaled and shifted by the mean and std args.
	Args:
	tensor: an n-dimensional `torch.Tensor`
	mean: the mean of the normal distribution
	std: the standard deviation of the normal distribution
	a: the minimum cutoff value
	b: the maximum cutoff value
	"""
	with torch.no_grad():
	_trunc_normal_(tensor, 0, 1.0, a, b)
	tensor.mul_(std).add_(mean)


	def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
	fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
	denom = fan_in
	if mode == "fan_in":
	denom = fan_in
	elif mode == "fan_out":
	denom = fan_out
	elif mode == "fan_avg":
	denom = (fan_in + fan_out) / 2

	variance = scale / denom

	if distribution == "truncated_normal":
	# constant is stddev of standard normal truncated to (-2, 2)
	trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
	elif distribution == "normal":
	with torch.no_grad():
	tensor.normal_(std=math.sqrt(variance))
	elif distribution == "uniform":
	bound = math.sqrt(3 * variance)
	with torch.no_grad():
	tensor.uniform_(-bound, bound)
	else:
	raise ValueError(f"invalid distribution {distribution}")


	def lecun_normal_(tensor):
	variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")


	def default_flax_embed_init(tensor):
	variance_scaling_(tensor, mode="fan_in", distribution="normal")

	class SiglipVisionEmbeddings(nn.Module):
	def __init__(self, config: SiglipVisionConfig):
	super().__init__()
	self.config = config
	self.embed_dim = config.hidden_size
	self.image_size = config.image_size
	self.patch_size = config.patch_size

	self.patch_embedding = nn.Conv2d(
	in_channels=config.num_channels,
	out_channels=self.embed_dim,
	kernel_size=self.patch_size,
	stride=self.patch_size,
	padding="valid",
	)

	self.num_patches_per_side = self.image_size // self.patch_size
	self.num_patches = self.num_patches_per_side**2
	self.num_positions = self.num_patches
	self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)

	class SiglipAttention(nn.Module):
	"""Multi-headed attention from 'Attention Is All You Need' paper"""

	# Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
	def __init__(self, config):
	super().__init__()
	self.config = config
	self.embed_dim = config.hidden_size
	self.num_heads = config.num_attention_heads
	self.head_dim = self.embed_dim // self.num_heads
	if self.head_dim * self.num_heads != self.embed_dim:
	raise ValueError(
	f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
	f" {self.num_heads})."
	)
	self.scale = self.head_dim**-0.5
	self.dropout = config.attention_dropout

	self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
	self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
	self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
	self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

	# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
	class SiglipMLP(nn.Module):
	def __init__(self, config):
	super().__init__()
	self.config = config
	self.activation_fn = ACT2FN[config.hidden_act]
	self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
	self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)


	# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
	class SiglipEncoderLayer(nn.Module):
	def __init__(self, config: SiglipVisionConfig):
	super().__init__()
	self.embed_dim = config.hidden_size
	self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
	self.self_attn = (
	SiglipAttention(config)
	)
	self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
	self.mlp = SiglipMLP(config)
	self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

	class SiglipPreTrainedModel(PreTrainedModel):
	"""
	An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
	models.
	"""

	config_class = SiglipVisionConfig
	base_model_prefix = "siglip"
	supports_gradient_checkpointing = True

	def _init_weights(self, module):
	"""Initialize the weights"""

	if isinstance(module, SiglipVisionEmbeddings):
	width = self.config.hidden_size
	nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
	elif isinstance(module, nn.Embedding):
	default_flax_embed_init(module.weight)
	elif isinstance(module, SiglipAttention):
	nn.init.normal_(module.q_proj.weight)
	nn.init.normal_(module.k_proj.weight)
	nn.init.normal_(module.v_proj.weight)
	nn.init.normal_(module.out_proj.weight)
	nn.init.zeros_(module.q_proj.bias)
	nn.init.zeros_(module.k_proj.bias)
	nn.init.zeros_(module.v_proj.bias)
	nn.init.zeros_(module.out_proj.bias)
	elif isinstance(module, SiglipMLP):
	nn.init.normal_(module.fc1.weight)
	nn.init.normal_(module.fc2.weight)
	nn.init.normal_(module.fc1.bias, std=1e-6)
	nn.init.normal_(module.fc2.bias, std=1e-6)
	elif isinstance(module, (nn.Linear, nn.Conv2d)):
	lecun_normal_(module.weight)
	if module.bias is not None:
	nn.init.zeros_(module.bias)
	elif isinstance(module, nn.LayerNorm):
	module.bias.data.zero_()
	module.weight.data.fill_(1.0)


	SIGLIP_START_DOCSTRING = r"""
	This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
	library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
	etc.)
	This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
	Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
	and behavior.
	Parameters:
	config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
	Initializing with a config file does not load the weights associated with the model, only the
	configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
	"""


	SIGLIP_VISION_INPUTS_DOCSTRING = r"""
	Args:
	pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
	Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
	[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
	output_attentions (`bool`, optional):
	Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
	tensors for more detail.
	output_hidden_states (`bool`, optional):
	Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
	more detail.
	return_dict (`bool`, optional):
	Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
	"""


	# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
	class SiglipEncoder(nn.Module):
	"""
	Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
	[`SiglipEncoderLayer`].
	Args:
	config: SiglipConfig
	"""

	def __init__(self, config: SiglipVisionConfig):
	super().__init__()
	self.config = config
	self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
	self.gradient_checkpointing = False

	class SiglipVisionTransformer(SiglipPreTrainedModel):
	config_class = SiglipVisionConfig
	main_input_name = "pixel_values"
	_supports_flash_attn_2 = True

	def __init__(self, config: SiglipVisionConfig):
	super().__init__(config)
	self.config = config
	embed_dim = config.hidden_size

	self.embeddings = SiglipVisionEmbeddings(config)
	self.encoder = SiglipEncoder(config)
	self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
	self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

	# Initialize weights and apply final processing
	self.post_init()

	def get_input_embeddings(self) -> nn.Module:
	return self.embeddings.patch_embedding

	import argparse
	import json
	import re

	import numpy as np
	from gguf import *
	from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig

	TEXT = "clip.text"
	VISION = "clip.vision"


	def add_key_str(raw_key: str, arch: str) -> str:
	return raw_key.format(arch=arch)


	def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool:
	if name in (
	"logit_scale",
	"text_model.embeddings.position_ids",
	"vision_model.embeddings.position_ids",
	):
	return True

	if has_minicpmv and name in ["visual_projection.weight"]:
	return True

	if name.startswith("v") and not has_vision:
	return True

	if name.startswith("t") and not has_text:
	return True

	return False


	def get_tensor_name(name: str) -> str:
	if "projection" in name:
	return name
	if "mm_projector" in name:
	name = name.replace("model.mm_projector", "mm")
	name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
	name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
	return name

	return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")


	def bytes_to_unicode():
	"""
	Returns list of utf-8 byte and a corresponding list of unicode strings.
	The reversible bpe codes work on unicode strings.
	This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
	When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
	This is a significant percentage of your normal, say, 32K bpe vocab.
	To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
	And avoids mapping to whitespace/control characters the bpe code barfs on.
	"""
	bs = (
	list(range(ord("!"), ord("~") + 1))
	+ list(range(ord("¡"), ord("¬") + 1))
	+ list(range(ord("®"), ord("ÿ") + 1))
	)
	cs = bs[:]
	n = 0
	for b in range(2**8):
	if b not in bs:
	bs.append(b)
	cs.append(2**8 + n)
	n += 1
	cs = [chr(n) for n in cs]
	return dict(zip(bs, cs))


	ap = argparse.ArgumentParser()
	ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
	ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
	ap.add_argument("--text-only", action="store_true", required=False,
	help="Save a text-only model. It can't be used to encode images")
	ap.add_argument("--vision-only", action="store_true", required=False,
	help="Save a vision-only model. It can't be used to encode texts")
	ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
	help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
	ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
	help="The clip model is from openclip (for ViT-SO400M type))")
	ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.")
	ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
	ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
	# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
	# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
	default_image_mean = [0.48145466, 0.4578275, 0.40821073]
	default_image_std = [0.26862954, 0.26130258, 0.27577711]
	ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
	ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
	ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3', default=2)

	# with proper
	args = ap.parse_args()


	if args.text_only and args.vision_only:
	print("--text-only and --image-only arguments cannot be specified at the same time.")
	exit(1)

	if args.use_f32:
	print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")

	# output in the same directory as the model if output_dir is None
	dir_model = args.model_dir

	if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
	vocab = None
	tokens = None
	else:
	with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
	vocab = json.load(f)
	tokens = [key for key in vocab]

	# possible data types
	# ftype == 0 -> float32
	# ftype == 1 -> float16
	#
	# map from ftype to string
	ftype_str = ["f32", "f16"]

	ftype = 1
	if args.use_f32:
	ftype = 0

	# if args.clip_model_is_vision or args.clip_model_is_openclip:
	# model = CLIPVisionModel.from_pretrained(dir_model)
	# processor = None
	# else:
	# model = CLIPModel.from_pretrained(dir_model)
	# processor = CLIPProcessor.from_pretrained(dir_model)

	minicpmv_version = args.minicpmv_version
	emb_dim = 4096
	if minicpmv_version == 1:
	emb_dim = 2304
	elif minicpmv_version == 2:
	emb_dim = 4096
	elif minicpmv_version == 3:
	emb_dim = 3584

	default_vision_config = {
	"hidden_size": 1152,
	"image_size": 980,
	"intermediate_size": 4304,
	"model_type": "idefics2",
	"num_attention_heads": 16,
	"num_hidden_layers": 27,
	"patch_size": 14,
	}

	vision_config = Idefics2VisionConfig(**default_vision_config)
	model = Idefics2VisionTransformer(vision_config)
	if minicpmv_version == 3:
	vision_config = SiglipVisionConfig(**default_vision_config)
	model = SiglipVisionTransformer(vision_config)

	processor = None
	# if model.attn_pool is not None:
	# model.attn_pool = torch.nn.Identity()

	# model.blocks = model.blocks[:-1]
	model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip")))

	fname_middle = None
	has_text_encoder = True
	has_vision_encoder = True
	has_minicpmv_projector = False

	if args.text_only:
	fname_middle = "text-"
	has_vision_encoder = False
	elif args.minicpmv_projector is not None:
	fname_middle = "mmproj-"
	has_text_encoder = False
	has_minicpmv_projector = True
	minicpmv_version = 3
	elif args.vision_only:
	fname_middle = "vision-"
	has_text_encoder = False
	else:
	fname_middle = ""

	output_dir = args.output_dir if args.output_dir is not None else dir_model
	os.makedirs(output_dir, exist_ok=True)
	output_prefix = os.path.basename(output_dir).replace("ggml_", "")
	fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
	fout = GGUFWriter(path=fname_out, arch="clip")

	fout.add_bool("clip.has_text_encoder", has_text_encoder)
	fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
	fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector)
	fout.add_file_type(ftype)
	if args.text_only:
	fout.add_description("text-only CLIP model")
	elif args.vision_only and not has_minicpmv_projector:
	fout.add_description("vision-only CLIP model")
	elif has_minicpmv_projector:
	fout.add_description("image encoder for MiniCPM-V")
	# add projector type
	fout.add_string("clip.projector_type", "resampler")
	fout.add_int32("clip.minicpmv_version", minicpmv_version)
	else:
	fout.add_description("two-tower CLIP model")

	if has_vision_encoder:
	# vision_model hparams
	fout.add_uint32("clip.vision.image_size", 448)
	fout.add_uint32("clip.vision.patch_size", 14)
	fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
	fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
	fout.add_uint32("clip.vision.projection_dim", 0)
	fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
	fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
	block_count = 26
	fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)

	if processor is not None:
	image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
	image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
	else:
	image_mean = args.image_mean if args.image_mean is not None else default_image_mean
	image_std = args.image_std if args.image_std is not None else default_image_std
	fout.add_array("clip.vision.image_mean", image_mean)
	fout.add_array("clip.vision.image_std", image_std)

	use_gelu = True
	fout.add_bool("clip.use_gelu", use_gelu)

	def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
	"""
	embed_dim: output dimension for each position
	pos: a list of positions to be encoded: size (M,)
	out: (M, D)
	"""
	assert embed_dim % 2 == 0
	omega = np.arange(embed_dim // 2, dtype=np.float32)
	omega /= embed_dim / 2.
	omega = 1. / 10000 ** omega # (D/2,)

	pos = pos.reshape(-1) # (M,)
	out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product

	emb_sin = np.sin(out) # (M, D/2)
	emb_cos = np.cos(out) # (M, D/2)

	emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
	return emb

	def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
	assert embed_dim % 2 == 0

	# use half of dimensions to encode grid_h
	emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
	emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)

	emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
	return emb


	# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
	def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
	"""
	grid_size: int of the grid height and width
	return:
	pos_embed: [grid_sizegrid_size, embed_dim] or [1+grid_sizegrid_size, embed_dim] (w/ or w/o cls_token)
	"""
	if isinstance(grid_size, int):
	grid_h_size, grid_w_size = grid_size, grid_size
	else:
	grid_h_size, grid_w_size = grid_size[0], grid_size[1]

	grid_h = np.arange(grid_h_size, dtype=np.float32)
	grid_w = np.arange(grid_w_size, dtype=np.float32)
	grid = np.meshgrid(grid_w, grid_h) # here w goes first
	grid = np.stack(grid, axis=0)

	grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
	pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
	if cls_token:
	pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
	return pos_embed

	def _replace_name_resampler(s, v):
	if re.match("resampler.pos_embed", s):
	return {
	s: v,
	re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
	}
	if re.match("resampler.proj", s):
	return {
	re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
	re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
	}
	if re.match("resampler.attn.in_proj_.*", s):
	return {
	re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0],
	re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1],
	re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2],
	}
	return {s: v}

	if has_minicpmv_projector:
	projector = torch.load(args.minicpmv_projector)
	new_state_dict = {}
	for k, v in projector.items():
	kvs = _replace_name_resampler(k, v)
	for nk, nv in kvs.items():
	new_state_dict[nk] = nv
	projector = new_state_dict
	ftype_cur = 0
	for name, data in projector.items():
	name = get_tensor_name(name)
	data = data.squeeze().numpy()

	n_dims = len(data.shape)
	if ftype == 1:
	if name[-7:] == ".weight" and n_dims == 2:
	print(" Converting to float16")
	data = data.astype(np.float16)
	ftype_cur = 1
	else:
	print(" Converting to float32")
	data = data.astype(np.float32)
	ftype_cur = 0
	else:
	if data.dtype != np.float32:
	print(" Converting to float32")
	data = data.astype(np.float32)
	ftype_cur = 0

	fout.add_tensor(name, data)
	print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")

	print("Projector tensors added\n")

	def _replace_name(s, v):
	s = "vision_model." + s
	if re.match("vision_model.embeddings.position_embedding", s):
	v = v.unsqueeze(0)
	return {s: v}

	return {s: v}

	state_dict = model.state_dict()
	new_state_dict = {}
	for k, v in state_dict.items():
	kvs = _replace_name(k, v)
	for nk, nv in kvs.items():
	new_state_dict[nk] = nv
	state_dict = new_state_dict
	for name, data in state_dict.items():
	if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector):
	# we don't need this
	print(f"skipping parameter: {name}")
	continue

	name = get_tensor_name(name)
	data = data.squeeze().numpy()

	n_dims = len(data.shape)

	# ftype == 0 -> float32, ftype == 1 -> float16
	ftype_cur = 0
	if n_dims == 4:
	print(f"tensor {name} is always saved in f16")
	data = data.astype(np.float16)
	ftype_cur = 1
	elif ftype == 1:
	if name[-7:] == ".weight" and n_dims == 2:
	print(" Converting to float16")
	data = data.astype(np.float16)
	ftype_cur = 1
	else:
	print(" Converting to float32")
	data = data.astype(np.float32)
	ftype_cur = 0
	else:
	if data.dtype != np.float32:
	print(" Converting to float32")
	data = data.astype(np.float32)
	ftype_cur = 0

	print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
	fout.add_tensor(name, data)


	fout.write_header_to_file()
	fout.write_kv_data_to_file()
	fout.write_tensors_to_file()
	fout.close()

	print("Done. Output file: " + fname_out)