"""Extract the vision encoder from a Phi-3.5-vision safetensors checkpoint into a GGUF file."""

import argparse
import json
import os
import pathlib
import typing

import numpy as np
import numpy.typing as npt
import torch

from gguf import *  # provides GGUFWriter and the KEY_* metadata constants
from safetensors import safe_open


def k(raw_key: str, arch: str) -> str:
    """Substitute the architecture name into a gguf KEY_* template string."""
    return raw_key.format(arch=arch)


class SafetensorsIndexFile(typing.TypedDict):
    """Shape of model.safetensors.index.json."""

    weight_map: typing.Dict[str, str]


class SafetensorsIndex:
    """Opens every shard referenced by a safetensors index and serves tensors by name."""

    def __init__(self, index_file_path: str):
        directory = os.path.dirname(index_file_path)
        with open(index_file_path) as f:
            self.index = typing.cast(SafetensorsIndexFile, json.load(f))
        self.weight_map = self.index["weight_map"]
        files = set(self.weight_map.values())
        self.tensors = {file: safe_open(os.path.join(directory, file), framework="pt") for file in files}

    def get_tensor(self, key: str) -> npt.NDArray[np.float32]:
        # Find the shard that stores `key` and return the tensor as a float32 numpy array.
        return typing.cast(npt.NDArray[np.float32], self.tensors[self.weight_map[key]].get_tensor(key).to(torch.float32).numpy())
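

# For reference, the index file maps each tensor name to the shard that stores it.
# A minimal sketch (the tensor and shard names here are hypothetical):
#
#   {
#       "weight_map": {
#           "model.vision_embed_tokens.sub_GN": "model-00001-of-00002.safetensors",
#           "model.embed_tokens.weight": "model-00002-of-00002.safetensors"
#       }
#   }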


def main():
    parser = argparse.ArgumentParser(description="Extract vision model from safetensors to GGUF")
    parser.add_argument("--model", type=str, required=True, help="Path to the model directory (containing config.json and the safetensors shards)")
    parser.add_argument("--output", type=str, required=True, help="Output GGUF file")
    args = parser.parse_args()
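    # Example invocation (the script and directory names here are hypothetical):
    #   python phi35_vision_to_gguf.py --model ./Phi-3.5-vision-instruct --output phi35v-vision-f16.gguf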

    dir_model = pathlib.Path(args.model)
    with open(dir_model / "config.json") as f:
        config = json.load(f)  # currently unused; the vision settings below are hardcoded

    tensors = SafetensorsIndex((dir_model / "model.safetensors.index.json").as_posix())

    ftype = 1  # GGUF file type: 1 = mostly F16, matching the float16 tensors written below

    # Vision tower hyperparameters (CLIP ViT-L/14, as used by Phi-3.5-vision).
    clip_vision_config = {
        "hidden_size": 1024,
        "intermediate_size": 4096,
        "projection_dim": 1024,
        "num_hidden_layers": 24,
        "num_attention_heads": 16,
        "num_channels": 3,
        "image_size": 224,
        "patch_size": 14,
        "hidden_act": "quick_gelu",
        "layer_norm_eps": 1e-5,
        "attention_dropout": 0.0,
        "dropout": 0.0,
        "initializer_range": 0.02,
        "initializer_factor": 1.0,
    }
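    # Derived sizes for this config: (224 / 14)^2 = 256 image patches, plus one
    # class token, gives 257 position embeddings.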

    fout = GGUFWriter(args.output, arch="clip")

    # This file carries only the vision encoder, flagged with a LLaVA-style MLP projector.
    fout.add_bool("clip.has_text_encoder", False)
    fout.add_bool("clip.has_vision_encoder", True)
    fout.add_bool("clip.has_llava_projector", True)
    fout.add_file_type(ftype)

    model_name = "microsoft/Phi-3.5-vision-instruct"
    fout.add_name(model_name)
    fout.add_description("image encoder for " + model_name)
    fout.add_string("clip.projector_type", "mlp")

    VISION = "clip.vision"
    fout.add_uint32("clip.vision.image_size", clip_vision_config["image_size"])
    fout.add_uint32("clip.vision.patch_size", clip_vision_config["patch_size"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), clip_vision_config["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), clip_vision_config["intermediate_size"])
    fout.add_uint32("clip.vision.projection_dim", clip_vision_config["projection_dim"])
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), clip_vision_config["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), clip_vision_config["layer_norm_eps"])
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), clip_vision_config["num_hidden_layers"])

    # Normalization constants from OpenAI CLIP preprocessing.
    fout.add_array("clip.vision.image_mean", [0.48145466, 0.4578275, 0.40821073])
    fout.add_array("clip.vision.image_std", [0.26862954, 0.26130258, 0.27577711])

    # All vision tower weights live under this prefix in the checkpoint.
    prefix = "model.vision_embed_tokens.img_processor.vision_model."

    fout.add_tensor(
        "v.class_embd",
        tensors.get_tensor(f"{prefix}embeddings.class_embedding").astype(np.float16),
    )
    fout.add_tensor(
        "v.patch_embd.weight",
        # Ensure the conv weight shape (hidden_size, channels, patch_size, patch_size).
        tensors.get_tensor(f"{prefix}embeddings.patch_embedding.weight").reshape(1024, 3, 14, 14).astype(np.float16),
    )
    fout.add_tensor(
        "v.position_embd.weight",
        tensors.get_tensor(f"{prefix}embeddings.position_embedding.weight").astype(np.float16),
    )

    # Phi-3.5-vision's learned separator embeddings: sub_GN terminates each row of
    # sub-image tokens, glb_GN separates the global-image tokens.
    fout.add_tensor(
        "v.sub_GN",
        tensors.get_tensor("model.vision_embed_tokens.sub_GN").astype(np.float32),
    )
    fout.add_tensor(
        "v.glb_GN",
        tensors.get_tensor("model.vision_embed_tokens.glb_GN").astype(np.float32),
    )

    for i in range(clip_vision_config["num_hidden_layers"]):
        # Layer norms are kept in float32; projection weights are stored as float16.
        fout.add_tensor(
            f"blk.{i}.attn_norm.weight",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm1.weight").astype(np.float32),
        )
        fout.add_tensor(
            f"blk.{i}.attn_norm.bias",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm1.bias").astype(np.float32),
        )
        fout.add_tensor(
            f"blk.{i}.ffn_norm.weight",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm2.weight").astype(np.float32),
        )
        fout.add_tensor(
            f"blk.{i}.ffn_norm.bias",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm2.bias").astype(np.float32),
        )

        # Note the mapping: the checkpoint's mlp.fc1 becomes ffn_down and mlp.fc2
        # becomes ffn_up, matching the tensor names llama.cpp's CLIP loader expects.
        fout.add_tensor(
            f"blk.{i}.ffn_down.weight",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.mlp.fc1.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.ffn_down.bias",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.mlp.fc1.bias").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.ffn_up.weight",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.mlp.fc2.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.ffn_up.bias",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.mlp.fc2.bias").astype(np.float16),
        )

        fout.add_tensor(
            f"blk.{i}.attn_k.weight",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.k_proj.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.attn_k.bias",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.k_proj.bias").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.attn_output.weight",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.out_proj.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.attn_output.bias",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.out_proj.bias").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.attn_q.weight",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.q_proj.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.attn_q.bias",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.q_proj.bias").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.attn_v.weight",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.v_proj.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.attn_v.bias",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.v_proj.bias").astype(np.float16),
        )

    # Final layer norm of the vision tower.
    fout.add_tensor(
        "output_norm.weight",
        tensors.get_tensor(f"{prefix}post_layernorm.weight").astype(np.float32),
    )

    fout.write_header_to_file()
    fout.write_kv_data_to_file()
    fout.write_tensors_to_file()
    fout.close()


if __name__ == "__main__":
    main()
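
# A quick sanity check of the result (a sketch using gguf's GGUFReader; the output
# filename is hypothetical):
#
#   from gguf import GGUFReader
#   reader = GGUFReader("phi35v-vision-f16.gguf")
#   print(len(reader.tensors))  # 3 embeddings + 2 separators + 24 * 16 layer tensors + 1 norm = 390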