Adocados HelloKKMe committed on
Commit 272f31d · verified · 0 Parent(s):

Duplicate from Salesforce/GTA1-32B


Co-authored-by: Yan Yang <HelloKKMe@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,777 @@
1
+ ---
2
+ language:
3
+ - en
4
+ license: mit
5
+ metrics:
6
+ - accuracy
7
+ pipeline_tag: image-text-to-text
8
+ tags:
9
+ - VLM
10
+ - Computer-Use-Agent
11
+ - OS-Agent
12
+ - GUI
13
+ - Grounding
14
+ library_name: transformers
15
+ ---
16
+
17
+ # Introduction
18
+
19
+ Reinforcement learning (RL) methods such as GRPO help with grounding because their objective is aligned with the task: they reward successful clicks rather than encouraging long textual Chain-of-Thought (CoT) reasoning. Unlike approaches that rely heavily on verbose CoT, GRPO directly incentivizes actionable, grounded responses. Based on the findings in our [blog](https://huggingface.co/blog/HelloKKMe/grounding-r1), we share state-of-the-art GUI grounding models trained with GRPO.
20
+
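For intuition, the grounding reward can be viewed as a simple hit test on the predicted click. The sketch below is illustrative only; the function name, signature, and bounding-box format are our assumptions, not the actual training code.

```python
# Illustrative sketch of a click-based grounding reward (assumed form, not the
# exact training implementation): the policy is rewarded only when its predicted
# click lands inside the ground-truth bounding box of the target element.
def click_reward(pred_x: float, pred_y: float, gt_box: tuple) -> float:
    """gt_box = (left, top, right, bottom) in the same pixel space as the prediction."""
    left, top, right, bottom = gt_box
    return 1.0 if (left <= pred_x <= right and top <= pred_y <= bottom) else 0.0
```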
21
+ # Grounding Performance
22
+
23
+ We follow the standard evaluation protocol and benchmark our model on three challenging datasets. Our method consistently achieves the best results among all open-source model families. Below are the comparative results:
24
+
25
+ | **Model** | **Size** | **Open Source** | **ScreenSpot-V2** | **ScreenSpotPro** | **OSWORLD-G** | **OSWORLD-G-Refined** |
26
+ |-------------------|:--------:|:---------------:|:-----------------:|:-----------------:|:-----------------:|:-----------------:|
27
+ | OpenAI CUA | — | ❌ | 87.9 | 23.4 | — | — |
28
+ | Claude 3.7 | — | ❌ | 87.6 | 27.7 | — | — |
29
+ | JEDI-7B | 7B | ✅ | 91.7 | 39.5 | 54.1 | — |
30
+ | SE-GUI | 7B | ✅ | 90.3 | 47.0 | — | — |
31
+ | UI-TARS | 7B | ✅ | 91.6 | 35.7 | 47.5 | — |
32
+ | UI-TARS-1.5* | 7B | ✅ | 89.7* | 42.0* | 52.8* | 64.2* |
33
+ | UGround-v1-7B | 7B | ✅ | — | 31.1 | — | 36.4 |
34
+ | Qwen2.5-VL-32B-Instruct | 32B | ✅ | 91.9* | 48.0 | 46.5 | 59.6* |
35
+ | UGround-v1-72B | 72B | ✅ | — | 34.5 | — | — |
36
+ | Qwen2.5-VL-72B-Instruct | 72B | ✅ | 94.00* | 53.3 | — | 62.2* |
37
+ | UI-TARS | 72B | ✅ | 90.3 | 38.1 | — | — |
38
+ | OpenCUA | 7B | ✅ | 92.3 | 50.0 | 55.3 | 68.3* |
39
+ | OpenCUA | 32B | ✅ | 93.4 | 55.3 | 59.6 | 70.2* |
40
+ | GTA1-2507 (Ours) | 7B | ✅ | 92.4 <sub>*(∆ +2.7)*</sub> | 50.1<sub>*(∆ +8.1)*</sub> | 55.1 <sub>*(∆ +2.3)*</sub> | 67.7 <sub>*(∆ +3.5)*</sub> |
41
+ | GTA1 (Ours) | 7B | ✅ | 93.4 <sub>*(∆ +0.1)*</sub> | 55.5<sub>*(∆ +5.5)*</sub> | 60.1<sub>*(∆ +4.8)*</sub> | 68.8<sub>*(∆ +0.5)*</sub> |
42
+ | GTA1 (Ours) | 32B | ✅ | 95.2 <sub>*(∆ +1.8)*</sub> | 63.6<sub>*(∆ +8.3)*</sub> | 65.2 <sub>*(∆ +5.6)*</sub> | 72.2<sub>*(∆ +2.0)*</sub> |
43
+
44
+ > **Note:**
45
+ > - Model size is indicated in billions (B) of parameters.
46
+ > - A dash (—) denotes results that are currently unavailable.
47
+ > - A superscript asterisk (﹡) denotes our evaluated result.
48
+ > - UI-TARS-1.5 7B, OpenCUA-7B, and OpenCUA-32B serve as our baseline models.
49
+ > - ∆ indicates the performance improvement of our model over its baseline.
50
+
51
+
52
+ # Agent Performance
53
+
54
+ ## OSWorld and OSWorld-Verified Benchmarks
55
+
56
+ We evaluate our models on the OSWorld and OSWorld-Verified benchmarks following the standard evaluation protocol. The results demonstrate strong performance across both datasets.
57
+
58
+ | **Agent Model** | **Step** | **OSWorld** | **OSWorld-Verified** |
59
+ |-----------------|:--------:|:-----------:|:-------------------:|
60
+ | **Proprietary Models** |
61
+ | Claude 3.7 Sonnet | 100 | 28.0 | — |
62
+ | OpenAI CUA 4o | 200 | 38.1 | — |
63
+ | UI-TARS-1.5 | 100 | 42.5 | 41.8 |
64
+ | OpenAI CUA o3 | 200 | 42.9 | — |
65
+ | **Open-Source Models** |
66
+ | Aria-UI w/ GPT-4o | 15 | 15.2 | — |
67
+ | Aguvis-72B w/ GPT-4o | 15 | 17.0 | — |
68
+ | UI-TARS-72B-SFT | 50 | 18.8 | — |
69
+ | Agent S w/ Claude-3.5-Sonnet | 15 | 20.5 | — |
70
+ | Agent S w/ GPT-4o | 15 | 20.6 | — |
71
+ | UI-TARS-72B-DPO | 15 | 22.7 | — |
72
+ | UI-TARS-72B-DPO | 50 | 24.6 | — |
73
+ | UI-TARS-1.5-7B | 100 | 26.9 | 27.4 |
74
+ | Jedi-7B w/ o3 | 100 | — | 51.0 |
75
+ | Jedi-7B w/ GPT-4o | 100 | 27.0 | — |
76
+ | Agent S2 w/ Claude-3.7-Sonnet | 50 | 34.5 | — |
77
+ | Agent S2 w/ Gemini-2.5-Pro | 50 | 41.4 | 45.8 |
78
+ | Agent S2.5 w/ o3 | 100 | — | 56.0 |
79
+ | Agent S2.5 w/ GPT-5 | 100 | — | 58.4 |
80
+ | CoAct-1 w/ o3 & o4-mini & OpenAI CUA 4o | 150 | — | 60.8 |
81
+ | GTA1-7B-2507 w/ o3 | 100 | 45.2 | 53.1 |
82
+ | GTA1-7B-2507 w/ GPT-5 | 100 | — | 61.0 |
83
+ | GTA1-32B w/ o3 | 100 | — | 55.4 |
84
+ | GTA1-32B w/ GPT-5 | 100 | — | 63.4 |
85
+
86
+ > **Note:** A dash (—) indicates unavailable results.
87
+
88
+ ## WindowsAgentArena Benchmark
89
+
90
+ We also evaluate our models on the WindowsAgentArena benchmark, demonstrating strong performance in Windows-specific GUI automation tasks.
91
+
92
+ | **Agent Model** | **Step** | **Success Rate** |
93
+ |-----------------|:--------:|:---------------:|
94
+ | Kimi-VL | 15 | 10.4 |
95
+ | WAA | — | 19.5 |
96
+ | Jedi w/ GPT-4o | 100 | 33.7 |
97
+ | GTA1-7B-2507 w/ o3 | 100 | 47.9 |
98
+ | GTA1-7B-2507 w/ GPT-5 | 100 | 49.2 |
99
+ | GTA1-32B w/ o3 | 100 | 51.2 |
100
+ | GTA1-32B w/ GPT-5 | 100 | 50.6 |
101
+
102
+ > **Note:** A dash (—) indicates unavailable results.
103
+
104
+ # Inference
105
+ Below is a code snippet demonstrating how to run inference using a trained model.
106
+
107
+ ```python
108
+ from transformers import AutoTokenizer, AutoImageProcessor
109
+ from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
110
+ from PIL import Image
111
+ from io import BytesIO
112
+ import base64
113
+ import re
114
+ from vllm import LLM, SamplingParams
115
+
116
+ instruction="click start"
117
+ image_path="example.png"
118
+
119
+ CLICK_REGEXES = [
120
+ # pyautogui.click(x=123, y=456)
121
+ re.compile(r"click\s*\(\s*x\s*=\s*(\d+)\s*,\s*y\s*=\s*(\d+)\s*\)", re.IGNORECASE),
122
+ # pyautogui.click(123, 456) or click(123,456)
123
+ re.compile(r"click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", re.IGNORECASE),
124
+ ]
125
+
126
+ def format_message(image_path,instruction):
127
+ SYSTEM_PROMPT = (
128
+ "You are a GUI agent. You are given a task and a screenshot of the screen. "
129
+ "You need to perform a series of pyautogui actions to complete the task."
130
+ )
131
+ messages = [
132
+ {"role": "system", "content": SYSTEM_PROMPT},
133
+ {"role": "user", "content": [
134
+ {"type": "image", "image": image_path},
135
+ {"type": "text", "text": instruction},
136
+ ]},
137
+ ]
138
+ text = prompt_tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
139
+
140
+ text2, n = re.subn(
141
+ r"<\|media_begin\|>.*?<\|media_end\|>",
142
+ "<|vision_start|><|image_pad|><|vision_end|>",
143
+ text,
144
+ flags=re.S
145
+ )
146
+ if n == 0:
147
+ raise RuntimeError("Cannot find <|media_begin|>...<|media_end|> token.")
148
+ return text2
149
+
150
+ def parse_xy_from_text(text: str):
151
+ if "click" not in text.lower():
152
+ return [-1, -1]
153
+ for rx in CLICK_REGEXES:
154
+ m = rx.search(text)
155
+ if m:
156
+ try:
157
+ return int(m.group(1)), int(m.group(2))
158
+ except Exception:
159
+ continue
160
+ return [-1,-1]
161
+
162
+ def convert_pil_image_to_base64(image):
163
+ buffered = BytesIO()
164
+ image.save(buffered, format="PNG")
165
+ return base64.b64encode(buffered.getvalue()).decode()
166
+
167
+ llm = LLM(
168
+ model="Salesforce/GTA1-32B",
169
+ tokenizer="Salesforce/GTA1-32B",
170
+ tokenizer_mode="slow",
171
+ trust_remote_code=True,
172
+ dtype="bfloat16",
173
+ limit_mm_per_prompt={"image": 1},
174
+ tensor_parallel_size=1,
175
+ )
176
+ prompt_tok = AutoTokenizer.from_pretrained("Salesforce/GTA1-32B", trust_remote_code=True)
177
+ sp = SamplingParams(max_tokens=512, temperature=0.0)
178
+ tokenizer = llm.get_tokenizer()
179
+ processor=AutoImageProcessor.from_pretrained("Salesforce/GTA1-32B", trust_remote_code=True)
180
+
181
+ image = Image.open(image_path).convert('RGB')
182
+ resized_height, resized_width = smart_resize(
183
+ image.height,
184
+ image.width,
185
+ factor=processor.patch_size * processor.merge_size,
186
+ min_pixels=processor.min_pixels,
187
+ max_pixels=processor.max_pixels,
188
+ )
189
+ resized_image = image.resize((resized_width, resized_height))
190
+ messages = format_message(image_path, instruction)
191
+ response = llm.generate(
192
+ [{"prompt": messages, "multi_modal_data": {"image": [resized_image]}}],
193
+ sampling_params=sp
194
+ )[0].outputs[0].text
195
+
196
+
197
+ coordinates = parse_xy_from_text(response)
198
+ print(coordinates[0]/resized_width*image.width, coordinates[1]/resized_height*image.height)
199
+ ```
200
+
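As a quick sanity check, the click-parsing helper above accepts both keyword and positional `pyautogui.click` calls. The strings below are hypothetical model outputs used purely for illustration, and the snippet assumes the `parse_xy_from_text` and `CLICK_REGEXES` definitions from the code block above.

```python
# Hypothetical model outputs; both call forms are matched by CLICK_REGEXES.
print(parse_xy_from_text("pyautogui.click(x=320, y=148)"))  # -> (320, 148)
print(parse_xy_from_text("pyautogui.click(320, 148)"))      # -> (320, 148)
print(parse_xy_from_text("no click action here"))           # -> [-1, -1]
```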
201
+ # Model Serving
202
+
203
+ Below is an example script for serving the model.
204
+ ```python
205
+ import torch
206
+ import os
207
+ # -------------------------
208
+ # System / Torch defaults
209
+ # -------------------------
210
+ os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") # avoid CPU oversubscription
211
+ os.environ.setdefault("VLLM_USE_V1", "1")
212
+ os.environ.setdefault("VLLM_ENGINE_IN_BACKGROUND_THREAD", "0")
213
+ import base64
214
+ import re
215
+ from typing import Dict, List, Union
216
+ from PIL import Image
217
+ from io import BytesIO
218
+ import traceback
219
+ import argparse
220
+ import asyncio
221
+ import requests
222
+ import ray
223
+ from ray import serve
224
+ from fastapi import FastAPI
225
+ from transformers import AutoTokenizer
226
+ from vllm import LLM, SamplingParams
227
+ import uuid
228
+
229
+
230
+ N_REPLICAS = 2
231
+
232
+ try:
233
+ torch.backends.cuda.matmul.allow_tf32 = True
234
+ torch.backends.cudnn.benchmark = True
235
+ except Exception:
236
+ pass
237
+
238
+
239
+ # -------------------------
240
+ # IO helpers
241
+ # -------------------------
242
+
243
+ def pil_to_base64(img: Image.Image, format: str = "PNG") -> str:
244
+ buffer = BytesIO()
245
+ img.save(buffer, format=format)
246
+ img_bytes = buffer.getvalue()
247
+ img_b64 = base64.b64encode(img_bytes).decode("utf-8")
248
+ return img_b64
249
+
250
+
251
+ def data_uri_to_pil(data_uri: str) -> Image.Image:
252
+ header, b64_str = data_uri.split(",", 1)
253
+ img_data = base64.b64decode(b64_str)
254
+ buffer = BytesIO(img_data)
255
+ img = Image.open(buffer)
256
+ return img
257
+
258
+
259
+ def extract_images(messages: List[Dict]) -> List[Image.Image]:
260
+ images = []
261
+ for msg in messages:
262
+ if msg.get("role") == "user":
263
+ for content in msg.get("content", []):
264
+ if content.get("type") in ["image", "image_url"]:
265
+ if content["type"] == "image":
266
+ images.append(data_uri_to_pil(content["image"]).convert("RGB"))
267
+ else:
268
+ images.append(data_uri_to_pil(content["image_url"]["url"]).convert("RGB"))
269
+ return images
270
+
271
+
272
+ # -------------------------
273
+ # Prompt builder
274
+ # -------------------------
275
+
276
+ def build_prompt_with_template(tokenizer: AutoTokenizer, messages: List[Dict]) -> str:
277
+ text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
278
+ text2, n = re.subn(
279
+ r"<\|media_begin\|>.*?<\|media_end\|>",
280
+ "<|vision_start|><|image_pad|><|vision_end|>",
281
+ text,
282
+ flags=re.S,
283
+ )
284
+ if n == 0:
285
+ raise RuntimeError("Did not find <|media_begin|>...<|media_end|> block in template.")
286
+ return text2
287
+
288
+ # -------------------------
289
+ # Deployment
290
+ # -------------------------
291
+
292
+ def build_app(model_path: str, num_replicas: int, port: int):
293
+ api = FastAPI(title="GTA1-32B Multi-GPU Service (High-throughput)")
294
+
295
+ @serve.deployment(
296
+ num_replicas=num_replicas,
297
+ ray_actor_options={"num_gpus": 1, "num_cpus": 4},
298
+ max_ongoing_requests=16,
299
+ )
300
+ class GTA1Model:
301
+ def __init__(self, model_path: str):
302
+ gpu_ids = ray.get_gpu_ids()
303
+ self.gpu_id = gpu_ids[0] if gpu_ids else 0
304
+ print(f"🔍 Ray assigned GPU IDs: {gpu_ids}")
305
+ # Initialize vLLM within this replica (Ray sets CUDA_VISIBLE_DEVICES)
306
+ print(f"🔄 Initializing vLLM on GPU {self.gpu_id}[ray id] from {model_path}")
307
+ if not torch.cuda.is_available():
308
+ raise RuntimeError("CUDA is not available")
309
+
310
+ self.llm = LLM(
311
+ model=model_path,
312
+ tokenizer=model_path,
313
+ tokenizer_mode="slow",
314
+ trust_remote_code=True,
315
+ dtype="bfloat16",
316
+ limit_mm_per_prompt={"image": 1},
317
+ max_model_len=32768,
318
+ tensor_parallel_size=1,
319
+ )
320
+ self.vllm_tokenizer = self.llm.get_tokenizer()
321
+ self.hf_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
322
+ self.model_path = model_path
323
+ self.dtype = "bfloat16"
324
+ print(f"✅ vLLM initialized successfully (Ray GPU Id: {self.gpu_id})")
325
+
326
+ # ------------ batching core ------------
327
+ @serve.batch(max_batch_size=8, batch_wait_timeout_s=0.1) # increase if GPU allows
328
+ async def _generate_batch(self, payload: Union[Dict, List[Dict]]):
329
+ """Build prompts, enforce single image, and call vLLM.generate."""
330
+ if isinstance(payload, dict):
331
+ list_of_payloads = [payload]
332
+ else:
333
+ list_of_payloads = payload
334
+ request_id = uuid.uuid4().hex[:8]
335
+ # --- Build per-sample prompt/image ---
336
+ prompts: List[str] = []
337
+ images_per_req: List[Image.Image] = []
338
+ error_results = []
339
+ early_exit = False
340
+ for p in list_of_payloads:
341
+ try:
342
+ messages = p["messages"]
343
+ imgs = extract_images(messages)
344
+ if len(imgs) != 1:
345
+ raise RuntimeError(f"Exactly one image is required, got {len(imgs)}")
346
+ prompt_text = build_prompt_with_template(self.hf_tokenizer, messages)
347
+ # Sanity check on tokens: 1 <|image_pad|>, no <|media_placeholder|>
348
+ tok = self.vllm_tokenizer
349
+ id_imgpad = tok.encode("<|image_pad|>", add_special_tokens=False)[0]
350
+ id_media = tok.encode("<|media_placeholder|>", add_special_tokens=False)[0]
351
+ ids = tok.encode(prompt_text, add_special_tokens=False)
352
+ if sum(i == id_imgpad for i in ids) != 1 or any(i == id_media for i in ids):
353
+ raise RuntimeError("Prompt media tokens invalid after conversion")
354
+ prompts.append(prompt_text)
355
+ images_per_req.append(imgs[0])
356
+ except Exception as e:
357
+ early_exit = True
358
+ trace = traceback.format_exc()
359
+ error_results.append(
360
+ {
361
+ "response": "",
362
+ "error": {
363
+ "message": str(e),
364
+ "trace": trace,
365
+ 'type_of_payload': str(type(payload)),
366
+ 'type_of_list_of_payloads': str(type(list_of_payloads)),
367
+ 'type_of_p': str(type(p)),
368
+ 'p_keys': str(p.keys()) if isinstance(p, dict) else str(p),
369
+ },
370
+ "usage": {},
371
+ "gpu_id": self.gpu_id
372
+ }
373
+ )
374
+ if early_exit:
375
+ return error_results
376
+ # --- vLLM generation ---
377
+ args_base = list_of_payloads[0]
378
+ sp = SamplingParams(
379
+ max_tokens=args_base.get("max_new_tokens", 512),
380
+ temperature=args_base.get("temperature", 0.0),
381
+ top_p=args_base.get("top_p", 0.9),
382
+ )
383
+
384
+ requests_list = [
385
+ {"prompt": pr, "multi_modal_data": {"image": [im]}}
386
+ for pr, im in zip(prompts, images_per_req)
387
+ ]
388
+
389
+ outs = self.llm.generate(requests_list, sampling_params=sp)
390
+
391
+ tok = self.vllm_tokenizer
392
+ results: List[Dict] = []
393
+ for pr, o in zip(prompts, outs):
394
+ text = o.outputs[0].text if o.outputs else ""
395
+ gen_tokens = len(o.outputs[0].token_ids) if (o.outputs and hasattr(o.outputs[0], 'token_ids')) else None
396
+ prompt_tokens = len(tok.encode(pr, add_special_tokens=False))
397
+ usage = {
398
+ "prompt_tokens": prompt_tokens,
399
+ "generated_tokens": gen_tokens if gen_tokens is not None else None,
400
+ "total_tokens": (prompt_tokens + gen_tokens) if gen_tokens is not None else None,
401
+ }
402
+ results.append({
403
+ "response": text,
404
+ "error": "",
405
+ "usage": usage,
406
+ "gpu_id": self.gpu_id,
407
+ 'bs_size_in_this_request': f"{request_id}:{len(list_of_payloads)}"
408
+ })
409
+
410
+ return results
411
+
412
+ # Exposed single-call entry that joins the batch
413
+ async def call_llm(self, payload: Dict):
414
+ try:
415
+ res = await self._generate_batch(payload)
416
+ return res
417
+ except Exception as e:
418
+ trace = traceback.format_exc()
419
+ return {"response": "", "error": {"message": str(e), "trace": trace}, "usage": {}, "gpu_id": self.gpu_id}
420
+
421
+ def health(self):
422
+ return {
423
+ "status": "ok",
424
+ "gpu_id": self.gpu_id,
425
+ "dtype": self.dtype,
426
+ "model_path": self.model_path,
427
+ }
428
+
429
+ model = GTA1Model.bind(model_path)
430
+
431
+ @serve.deployment(max_ongoing_requests=96)
432
+ @serve.ingress(api)
433
+ class GTA1App:
434
+ def __init__(self, model_handle):
435
+ self.model_deployment = model_handle
436
+
437
+ @api.get("/health")
438
+ async def health_all(self):
439
+ # Calling the same Serve handle N times does not guarantee each call hits a different replica
440
+ attempts = max(8, N_REPLICAS * 4) # oversample
441
+ calls = [self.model_deployment.health.remote() for i in range(attempts)]
442
+ replies = await asyncio.gather(*calls)
443
+ # dedupe by replica_id (or by tuple(gpu_id))
444
+ seen = {}
445
+ for r in replies:
446
+ seen[r.get("gpu_id", f"unknown-{len(seen)}")] = r
447
+ if len(seen) >= N_REPLICAS:
448
+ break
449
+ return {"replicas": list(seen.values())}
450
+
451
+ @api.post("/call_llm")
452
+ async def call_llm(self, req: Dict):
453
+ return await self.model_deployment.call_llm.remote(req)
454
+
455
+ return GTA1App.bind(model)
456
+
457
+
458
+ # -------------------------
459
+ # Main
460
+ # -------------------------
461
+ if __name__ == "__main__":
462
+ parser = argparse.ArgumentParser()
463
+ parser.add_argument("--model_path", type=str, default="Salesforce/GTA1-32B")
464
+ parser.add_argument("--host", type=str, default="0.0.0.0")
465
+ parser.add_argument("--port", type=int, default=3005)
466
+ parser.add_argument("--num_replicas", type=int, default=2)
467
+ args = parser.parse_args()
468
+ N_REPLICAS = args.num_replicas
469
+ ray.init(ignore_reinit_error=True)
470
+
471
+ print(f"🚀 Starting GTA1-32B service on {args.host}:{args.port}")
472
+ serve.start(detached=True, http_options={"host": args.host, "port": args.port})
473
+
474
+ app = build_app(args.model_path, args.num_replicas, args.port)
475
+ serve.run(app, name="GTA1-32B", route_prefix="/")
476
+
477
+ # Quick health sample
478
+ try:
479
+ r = requests.get(f"http://0.0.0.0:{args.port}/health", timeout=5)
480
+ print(r.json())
481
+ except Exception as e:
482
+ print("Health probe failed:", e)
483
+
484
+ ```
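Assuming the script above is saved as `serve_model.py` (the filename is ours for illustration), it can be launched with, for example, `python serve_model.py --model_path Salesforce/GTA1-32B --num_replicas 2 --port 3005`; the client example below then points at `http://localhost:3005`.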
485
+ Here is an example of client usage:
486
+
487
+ ```python
488
+ import argparse
489
+ import base64
490
+ import concurrent.futures
491
+ import json
492
+ import os
493
+ import re
494
+ from typing import Dict, List, Tuple
495
+ from gui_agent.agent.gta1.format_message import encode_numpy_image_to_base64, encode_image_bytes, smart_resize
496
+
497
+ import requests
498
+ from PIL import Image, ImageDraw
499
+
500
+
501
+ def image_file_to_data_uri(image_path: str) -> str:
502
+ if not os.path.exists(image_path):
503
+ raise FileNotFoundError(f"Image not found: {image_path}")
504
+ with open(image_path, "rb") as f:
505
+ b64 = base64.b64encode(f.read()).decode("utf-8")
506
+ # default to png; serverside only requires a data URI header then comma
507
+ return f"data:image/png;base64,{b64}"
508
+
509
+
510
+ def build_messages(image_path: str, instruction: str, system_prompt: str) -> List[Dict]:
511
+ return [
512
+ {"role": "system", "content": system_prompt},
513
+ {
514
+ "role": "user",
515
+ "content": [
516
+ {"type": "image", "image": image_file_to_data_uri(image_path)},
517
+ {"type": "text", "text": instruction},
518
+ ],
519
+ },
520
+ ]
521
+
522
+
523
+ def call_health(base_url: str, timeout: float = 10.0) -> Dict:
524
+ r = requests.get(f"{base_url}/health", timeout=timeout)
525
+ r.raise_for_status()
526
+ return r.json()
527
+
528
+
529
+ def call_single(
530
+ base_url: str,
531
+ image_path: str,
532
+ instruction: str,
533
+ system_prompt: str,
534
+ max_new_tokens: int = 512,
535
+ temperature: float = 0.0,
536
+ top_p: float = 0.9,
537
+ timeout: float = 120.0,
538
+ ) -> List[Dict]:
539
+ payload = {
540
+ "messages": build_messages(image_path, instruction, system_prompt),
541
+ "max_new_tokens": max_new_tokens,
542
+ "temperature": temperature,
543
+ "top_p": top_p,
544
+ }
545
+ r = requests.post(f"{base_url}/call_llm", json=payload, timeout=timeout)
546
+ r.raise_for_status()
547
+ resp = r.json()
548
+ if isinstance(resp, dict):
549
+ return [resp]
550
+ return resp
551
+
552
+
553
+ def call_many_concurrent(
554
+ base_url: str,
555
+ image_path: str,
556
+ instruction: str,
557
+ system_prompt: str,
558
+ num_requests: int,
559
+ concurrency: int,
560
+ max_new_tokens: int = 512,
561
+ temperature: float = 0.0,
562
+ top_p: float = 0.9,
563
+ timeout: float = 120.0,
564
+ ) -> List[List[Dict]]:
565
+ results: List[List[Dict]] = []
566
+
567
+ def _one(i: int) -> List[Dict]:
568
+ # Vary instruction slightly so you can trace requests
569
+ instr = f"{instruction} [req {i+1}/{num_requests}]"
570
+ return call_single(
571
+ base_url,
572
+ image_path,
573
+ instr,
574
+ system_prompt,
575
+ max_new_tokens,
576
+ temperature,
577
+ top_p,
578
+ timeout,
579
+ )
580
+
581
+ with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as pool:
582
+ futures = [pool.submit(_one, i) for i in range(num_requests)]
583
+ for fut in concurrent.futures.as_completed(futures):
584
+ results.append(fut.result())
585
+ return results
586
+
587
+
588
+ def pretty_print_response(batch_results: List[Dict]) -> None:
589
+ if isinstance(batch_results, dict):
590
+ batch_results = [batch_results]
591
+ for idx, item in enumerate(batch_results):
592
+ if item.get("error"):
593
+ print(f"[#{idx}] ERROR: {json.dumps(item['error'], ensure_ascii=False)}")
594
+ else:
595
+ usage = item.get("usage", {})
596
+ print(f"[#{idx}] gpu={item.get('gpu_id')} tokens={usage} text=\n{item.get('response','').strip()}\n")
597
+
598
+ CLICK_KWARGS_REGEX = re.compile(r"pyautogui\.click\(\s*x\s*=\s*(\d+)\s*,\s*y\s*=\s*(\d+)\s*\)")
599
+ CLICK_POSARGS_REGEX = re.compile(r"pyautogui\.click\(\s*(\d+)\s*,\s*(\d+)\s*\)")
600
+
601
+ def extract_clicks_from_text(text: str) -> List[Tuple[int, int]]:
602
+ clicks: List[Tuple[int, int]] = []
603
+ for x, y in CLICK_KWARGS_REGEX.findall(text or ""):
604
+ clicks.append((int(x), int(y)))
605
+ for x, y in CLICK_POSARGS_REGEX.findall(text or ""):
606
+ clicks.append((int(x), int(y)))
607
+ return clicks
608
+
609
+ def extract_clicks_from_results(result_items: List[Dict]) -> List[Tuple[int, int]]:
610
+ clicks: List[Tuple[int, int]] = []
611
+ if isinstance(result_items, dict):
612
+ result_items = [result_items]
613
+ for item in result_items:
614
+ if item.get("error"):
615
+ continue
616
+ clicks.extend(extract_clicks_from_text(item.get("response", "")))
617
+ return clicks
618
+
619
+ def compute_resized_dims_for_server_mapping(image_path: str) -> Tuple[int, int, int, int]:
620
+ with Image.open(image_path) as im:
621
+ width, height = im.size
622
+ resized_H, resized_W = smart_resize(
623
+ height,
624
+ width,
625
+ factor=28,
626
+ min_pixels=1000,
627
+ max_pixels=1000000000000,
628
+ )
629
+ return width, height, int(resized_W), int(resized_H)
630
+
631
+ def map_clicks_to_original(clicks_resized: List[Tuple[int, int]],
632
+ original_w: int,
633
+ original_h: int,
634
+ resized_w: int,
635
+ resized_h: int) -> List[Tuple[int, int]]:
636
+ if resized_w == 0 or resized_h == 0:
637
+ return []
638
+ scale_x = original_w / float(resized_w)
639
+ scale_y = original_h / float(resized_h)
640
+ mapped: List[Tuple[int, int]] = []
641
+ for x, y in clicks_resized:
642
+ mapped_x = int(round(x * scale_x))
643
+ mapped_y = int(round(y * scale_y))
644
+ mapped.append((mapped_x, mapped_y))
645
+ return mapped
646
+
647
+ def draw_circles_on_image(image_path: str,
648
+ points: List[Tuple[int, int]],
649
+ output_path: str,
650
+ radius: int = 8,
651
+ color: Tuple[int, int, int] = (255, 0, 0),
652
+ width: int = 3) -> None:
653
+ if not points:
654
+ return
655
+ with Image.open(image_path).convert("RGB") as img:
656
+ drawer = ImageDraw.Draw(img)
657
+ for (x, y) in points:
658
+ left = x - radius
659
+ top = y - radius
660
+ right = x + radius
661
+ bottom = y + radius
662
+ drawer.ellipse([(left, top), (right, bottom)], outline=color, fill=(0,255,0), width=width)
663
+ img.save(output_path)
664
+ print(f"Annotated image saved to: {output_path} (points drawn: {len(points)})")
665
+
666
+ SYSTEM_PROMPT = (
667
+ "You are a GUI agent. You are given a task and a screenshot of the screen. "
668
+ "You need to perform a series of pyautogui actions to complete the task."
669
+ )
670
+ def main():
671
+ parser = argparse.ArgumentParser(description="Examples: single and batched inference against GTA1-32B Ray Serve.")
672
+ parser.add_argument("--host", type=str, default="http://localhost", help="Ray Serve host, e.g. http://localhost or http://IP")
673
+ parser.add_argument("--port", type=int, default=3005, help="Ray Serve port")
674
+ parser.add_argument("--image", type=str, required=False, default="example.jpg", help="Path to input image")
675
+ parser.add_argument("--instruction", type=str, default="click the icon in the bottom row, third from the left", help="User instruction")
676
+ parser.add_argument("--system", type=str, default=SYSTEM_PROMPT)
677
+ parser.add_argument("--mode", type=str, choices=["single", "batch", "health"], default="batch")
678
+ parser.add_argument("--num_requests", type=int, default=8, help="Number of requests in batch mode")
679
+ parser.add_argument("--concurrency", type=int, default=8, help="Max concurrent HTTP calls in batch mode")
680
+ parser.add_argument("--max_new_tokens", type=int, default=512)
681
+ parser.add_argument("--temperature", type=float, default=0.0)
682
+ parser.add_argument("--top_p", type=float, default=0.9)
683
+ parser.add_argument("--timeout", type=float, default=180.0)
684
+ args = parser.parse_args()
685
+
686
+ base_url = f"{args.host}:{args.port}"
687
+
688
+ if args.mode == "health":
689
+ info = call_health(base_url, timeout=10.0)
690
+ print(json.dumps(info, indent=2))
691
+ return
692
+
693
+ if args.mode == "single":
694
+ result_list = call_single(
695
+ base_url=base_url,
696
+ image_path=args.image,
697
+ instruction=args.instruction,
698
+ system_prompt=args.system,
699
+ max_new_tokens=args.max_new_tokens,
700
+ temperature=args.temperature,
701
+ top_p=args.top_p,
702
+ timeout=args.timeout,
703
+ )
704
+ print(result_list)
705
+ pretty_print_response(result_list)
706
+ clicks_resized = extract_clicks_from_results(result_list)
707
+ if clicks_resized:
708
+ orig_w, orig_h, resized_w, resized_h = compute_resized_dims_for_server_mapping(args.image)
709
+ mapped_clicks = map_clicks_to_original(clicks_resized, orig_w, orig_h, resized_w, resized_h)
710
+ out_path = f"ray_serve/annotated.png"
711
+ draw_circles_on_image(args.image, mapped_clicks, out_path)
712
+ return
713
+
714
+ if args.mode == "batch":
715
+ print(f"Submitting {args.num_requests} requests with concurrency={args.concurrency}...")
716
+ batch_outs = call_many_concurrent(
717
+ base_url=base_url,
718
+ image_path=args.image,
719
+ instruction=args.instruction,
720
+ system_prompt=args.system,
721
+ num_requests=args.num_requests,
722
+ concurrency=args.concurrency,
723
+ max_new_tokens=args.max_new_tokens,
724
+ temperature=args.temperature,
725
+ top_p=args.top_p,
726
+ timeout=args.timeout,
727
+ )
728
+ for i, one_result in enumerate(batch_outs):
729
+ print(f"===== Result for request {i+1} =====")
730
+ pretty_print_response(one_result)
731
+ all_clicks_resized: List[Tuple[int, int]] = []
732
+ for one_result in batch_outs:
733
+ all_clicks_resized.extend(extract_clicks_from_results(one_result))
734
+ if all_clicks_resized:
735
+ orig_w, orig_h, resized_w, resized_h = compute_resized_dims_for_server_mapping(args.image)
736
+ mapped_clicks = map_clicks_to_original(all_clicks_resized, orig_w, orig_h, resized_w, resized_h)
737
+ out_path = f"ray_serve/annotated.png"
738
+ draw_circles_on_image(args.image, mapped_clicks, out_path)
739
+ return
740
+
741
+
742
+ if __name__ == "__main__":
743
+ main()
744
+ ```
745
+ ## Ethical Considerations
746
+
747
+ This model is released for research and educational purposes. While our model demonstrates strong performance on GUI benchmarks, users should carefully evaluate its suitability for their specific use cases.
748
+
749
+ **Important Considerations:**
750
+ - **Accuracy Limitations:** Like all AI systems, this model may produce incorrect outputs or fail to accurately identify GUI elements in certain scenarios.
751
+ - **Safety and Security:** Exercise caution when deploying GUI automation agents, especially in production environments where incorrect actions could affect system integrity or data security.
752
+ - **Human Oversight:** We recommend maintaining appropriate human supervision when using this model for automated GUI interactions.
753
+ - **Compliance:** Users are responsible for ensuring their use of this model complies with applicable laws, regulations, and organizational policies.
754
+
755
+ **Recommended Best Practices:**
756
+ - Thoroughly test the model in controlled environments before production deployment
757
+ - Implement safeguards and error handling mechanisms
758
+ - Consider the potential impact of automated actions on user systems and data
759
+ - Regularly monitor and validate model performance in your specific domain
760
+
761
+ For further guidance on use cases, refer to our AUP and AI AUP.
762
+
763
+ ## Citation
764
+
765
+ If you use any GTA model or find it helpful in your research, please cite it as follows:
766
+
767
+ ```bibtex
768
+ @article{yang2025gta1guitesttimescaling,
769
+ title={GTA1: GUI Test-time Scaling Agent},
770
+ author={Yan Yang and Dongxu Li and Yutong Dai and Yuhao Yang and Ziyang Luo and Zirui Zhao and Zhiyuan Hu and Junzhe Huang and Amrita Saha and Zeyuan Chen and Ran Xu and Liyuan Pan and Silvio Savarese and Caiming Xiong and Junnan Li},
771
+ year={2025},
772
+ eprint={2507.05791},
773
+ archivePrefix={arXiv},
774
+ primaryClass={cs.AI},
775
+ url={https://arxiv.org/abs/2507.05791},
776
+ }
777
+ ```
config.bak.json ADDED
@@ -0,0 +1,69 @@
1
+ {
2
+ "architectures": [
3
+ "OpenCUAForConditionalGeneration"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_opencua.OpenCUAConfig",
7
+ "AutoModel": "modeling_opencua.OpenCUAForConditionalGeneration",
8
+ "AutoModelForCausalLM": "modeling_opencua.OpenCUAForConditionalGeneration"
9
+ },
10
+ "ignore_index": -100,
11
+ "media_placeholder_token_id": 151664,
12
+ "model_type": "opencua",
13
+ "pad_token_id": 0,
14
+ "text_config": {
15
+ "bos_token_id": 151643,
16
+ "eos_token_id": 151644,
17
+ "head_dim": 128,
18
+ "hidden_act": "silu",
19
+ "hidden_size": 5120,
20
+ "initializer_range": 0.02,
21
+ "intermediate_size": 27648,
22
+ "k_proj_bias": true,
23
+ "max_length": 20,
24
+ "min_length": 0,
25
+ "model_type": "qwen2",
26
+ "num_attention_heads": 40,
27
+ "num_beam_groups": 1,
28
+ "num_beams": 1,
29
+ "num_hidden_layers": 64,
30
+ "num_key_value_heads": 8,
31
+ "pad_token_id": 152063,
32
+ "pretraining_sequence_length": 131072,
33
+ "q_proj_bias": true,
34
+ "rms_norm_eps": 1e-05,
35
+ "rope_theta": 1000000.0,
36
+ "tie_word_embeddings": false,
37
+ "torch_dtype": "bfloat16",
38
+ "use_bfloat16": false,
39
+ "use_cache": true,
40
+ "v_proj_bias": true,
41
+ "vocab_size": 152064
42
+ },
43
+ "tie_word_embeddings": false,
44
+ "torch_dtype": "bfloat16",
45
+ "transformers_version": "4.48.3",
46
+ "vision_config": {
47
+ "depth": 32,
48
+ "fullatt_block_indexes": [
49
+ 7,
50
+ 15,
51
+ 23,
52
+ 31
53
+ ],
54
+ "hidden_act": "silu",
55
+ "hidden_size": 1280,
56
+ "num_heads": 16,
57
+ "in_chans": 3,
58
+ "intermediate_size": 3456,
59
+
60
+ "patch_size": 14,
61
+ "spatial_merge_size": 2,
62
+ "spatial_patch_size": 14,
63
+ "temporal_patch_size": 2,
64
+ "out_hidden_size": 5120,
65
+ "tokens_per_second": 2,
66
+ "window_size": 112
67
+ },
68
+ "vocab_size": 152064
69
+ }
config.json ADDED
@@ -0,0 +1,51 @@
1
+ {
2
+ "architectures": ["Qwen2_5_VLForConditionalGeneration"],
3
+ "model_type": "qwen2_5_vl",
4
+ "transformers_version": "4.49.0",
5
+ "torch_dtype": "bfloat16",
6
+
7
+ "processor_class": "OpenCUAProcessor",
8
+
9
+ "hidden_act": "silu",
10
+ "attention_dropout": 0.0,
11
+ "initializer_range": 0.02,
12
+ "rms_norm_eps": 1e-06,
13
+ "tie_word_embeddings": false,
14
+ "use_cache": true,
15
+
16
+ "vocab_size": 152064,
17
+ "max_position_embeddings": 128000,
18
+ "sliding_window": 32768,
19
+ "use_sliding_window": false,
20
+ "max_window_layers": 64,
21
+
22
+ "rope_scaling": { "type": "default" },
23
+ "rope_theta": 1000000.0,
24
+
25
+ "hidden_size": 5120,
26
+ "intermediate_size": 27648,
27
+ "num_hidden_layers": 64,
28
+ "num_attention_heads": 40,
29
+ "num_key_value_heads": 8,
30
+
31
+ "bos_token_id": 151643,
32
+ "eos_token_id": 151644,
33
+ "pad_token_id": 152063,
34
+
35
+ "vision_start_token_id": 151665,
36
+ "vision_end_token_id": 151666,
37
+ "vision_token_id": 151654,
38
+ "image_token_id": 151667,
39
+ "video_token_id": 151664,
40
+
41
+ "vision_config": {
42
+ "model_type": "qwen2_5_vl",
43
+ "in_chans": 3,
44
+ "hidden_size": 1280,
45
+ "intermediate_size": 3456,
46
+ "out_hidden_size": 5120,
47
+ "spatial_patch_size": 14,
48
+ "tokens_per_second": 2,
49
+ "torch_dtype": "bfloat16"
50
+ }
51
+ }
configuration_opencua.py ADDED
@@ -0,0 +1,37 @@
1
+ from transformers.configuration_utils import PretrainedConfig
2
+ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
3
+ from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
4
+
5
+
6
+ class OpenCUAConfig(PretrainedConfig):
7
+ """OpenCUA-2.5-32B model configuration.
8
+
9
+ Args:
10
+ vision_config: Configuration for the vision model (Qwen2_5_VLVisionConfig).
11
+ text_config: Configuration for the text model. Qwen2Config
12
+ pad_token_id: The token ID to use for padding.
13
+ """
14
+
15
+ model_type = "opencua"
16
+
17
+ def __init__(
18
+ self,
19
+ vision_config: dict | Qwen2_5_VLVisionConfig | None = None,
20
+ text_config: dict | Qwen2Config | None = None,
21
+ ignore_index: int = -100,
22
+ media_placeholder_token_id: int = 151664,
23
+ pad_token_id: int = 0,
24
+ **kwargs
25
+ ):
26
+ if isinstance(vision_config, dict):
27
+ vision_config = Qwen2_5_VLVisionConfig(**vision_config)
28
+ self.vision_config = vision_config
29
+
30
+ if isinstance(text_config, dict):
31
+ text_config = Qwen2Config(**text_config)
32
+ self.text_config = text_config
33
+
34
+ self.ignore_index = ignore_index
35
+ self.media_placeholder_token_id = media_placeholder_token_id
36
+
37
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
generation_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "max_length": 32768,
3
+ "eos_token_id": 151644
4
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
model-00001-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61a1a81d27692172b8c4af0dba584d54f9761065e65a8a99800cc2d46334a78d
3
+ size 4932320880
model-00002-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c166a525e6180f6a33d89c8d1c241b87ab4c14295e77a48d351328abef8babb
3
+ size 4727609720
model-00003-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4265e2f92162197d911af82da2900280989834d2dffef68b302a5818c4c0ee95
3
+ size 4822749744
model-00004-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbc0137d617f58571e5669e20f199ad0fcef7bbe89126aef4de08b4c6a04be5a
3
+ size 4998049568
model-00005-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1f6c2afe9fa652bf72633734170e2f8e520ebe6a0b4a291916543cd87f0149e
3
+ size 4883041912
model-00006-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8258dd0383f72df276a81618b19194bedfb4d9d30eb941b54252a86e203d029
3
+ size 4772902520
model-00007-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8c4849d1e7e8f76954e18183b1ff2b4560413c5a9df5e305f204c1025582e6b
3
+ size 4966230504
model-00008-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32869aec221c400372785a300e2285c45664ac0b584576c3db40a9c9ad842359
3
+ size 4800968432
model-00009-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eadb68817ae3046cc0867c72d8f058e97202c9b2fc79f50f91d8905e1a74b0e4
3
+ size 4931462480
model-00010-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de45132043dbcfd9728b92fc13cc27166ec3fa2ce4bcac50c97e3196b77ed039
3
+ size 4728824872
model-00011-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96e16b21960f0a3e5f1de0f8f120c3c1860fd5b8c97f8dda9e784401965d0ed9
3
+ size 4943615728
model-00012-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebc7d2855167a5e1b4e1d48734da0fc45d2117638a58a68223b9ec6e4e895f54
3
+ size 4744353832
model-00013-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7882dd3323b2ab948e7a2c59d8ec7ad5d314f509463cc20049196b7318a6eb50
3
+ size 4817449768
model-00014-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7d75bc05db0c27fda49bc6e417b96ca8f8b07754e0e32570e443c168ad202f6
3
+ size 3835987888
model.args.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f63a4fc32414b15ac0c96d0d6b4889f0bd29bb7c16bde542c978c0e01dd49beb
3
+ size 25196
model.safetensors.index.json ADDED
@@ -0,0 +1,1168 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 66905436672
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00008-of-00014.safetensors",
7
+ "model.embed_tokens.weight": "model-00008-of-00014.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00002-of-00014.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00014.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00003-of-00014.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00005-of-00014.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00003-of-00014.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00006-of-00014.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
25
+ "model.layers.1.self_attn.k_proj.bias": "model-00006-of-00014.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
28
+ "model.layers.1.self_attn.q_proj.bias": "model-00002-of-00014.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
30
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00014.safetensors",
31
+ "model.layers.1.self_attn.v_proj.weight": "model-00014-of-00014.safetensors",
32
+ "model.layers.10.input_layernorm.weight": "model-00006-of-00014.safetensors",
33
+ "model.layers.10.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
34
+ "model.layers.10.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
35
+ "model.layers.10.mlp.up_proj.weight": "model-00014-of-00014.safetensors",
36
+ "model.layers.10.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
37
+ "model.layers.10.self_attn.k_proj.bias": "model-00006-of-00014.safetensors",
38
+ "model.layers.10.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
39
+ "model.layers.10.self_attn.o_proj.weight": "model-00014-of-00014.safetensors",
40
+ "model.layers.10.self_attn.q_proj.bias": "model-00013-of-00014.safetensors",
41
+ "model.layers.10.self_attn.q_proj.weight": "model-00014-of-00014.safetensors",
42
+ "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00014.safetensors",
43
+ "model.layers.10.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
44
+ "model.layers.11.input_layernorm.weight": "model-00006-of-00014.safetensors",
45
+ "model.layers.11.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
46
+ "model.layers.11.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
47
+ "model.layers.11.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
48
+ "model.layers.11.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
49
+ "model.layers.11.self_attn.k_proj.bias": "model-00009-of-00014.safetensors",
50
+ "model.layers.11.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
51
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
52
+ "model.layers.11.self_attn.q_proj.bias": "model-00007-of-00014.safetensors",
53
+ "model.layers.11.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
54
+ "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00014.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00011-of-00014.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
61
+ "model.layers.12.self_attn.k_proj.bias": "model-00013-of-00014.safetensors",
62
+ "model.layers.12.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
63
+ "model.layers.12.self_attn.o_proj.weight": "model-00014-of-00014.safetensors",
64
+ "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00014.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
66
+ "model.layers.12.self_attn.v_proj.bias": "model-00010-of-00014.safetensors",
67
+ "model.layers.12.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
68
+ "model.layers.13.input_layernorm.weight": "model-00010-of-00014.safetensors",
69
+ "model.layers.13.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
70
+ "model.layers.13.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
71
+ "model.layers.13.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
72
+ "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
73
+ "model.layers.13.self_attn.k_proj.bias": "model-00004-of-00014.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
76
+ "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00014.safetensors",
77
+ "model.layers.13.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
78
+ "model.layers.13.self_attn.v_proj.bias": "model-00012-of-00014.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00014.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
85
+ "model.layers.14.self_attn.k_proj.bias": "model-00011-of-00014.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
88
+ "model.layers.14.self_attn.q_proj.bias": "model-00013-of-00014.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
90
+ "model.layers.14.self_attn.v_proj.bias": "model-00009-of-00014.safetensors",
91
+ "model.layers.14.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
92
+ "model.layers.15.input_layernorm.weight": "model-00011-of-00014.safetensors",
93
+ "model.layers.15.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
94
+ "model.layers.15.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
95
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
96
+ "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
97
+ "model.layers.15.self_attn.k_proj.bias": "model-00010-of-00014.safetensors",
98
+ "model.layers.15.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
99
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
100
+ "model.layers.15.self_attn.q_proj.bias": "model-00006-of-00014.safetensors",
101
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
102
+ "model.layers.15.self_attn.v_proj.bias": "model-00003-of-00014.safetensors",
103
+ "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
104
+ "model.layers.16.input_layernorm.weight": "model-00013-of-00014.safetensors",
105
+ "model.layers.16.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
106
+ "model.layers.16.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
107
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
108
+ "model.layers.16.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
109
+ "model.layers.16.self_attn.k_proj.bias": "model-00009-of-00014.safetensors",
110
+ "model.layers.16.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
111
+ "model.layers.16.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
112
+ "model.layers.16.self_attn.q_proj.bias": "model-00008-of-00014.safetensors",
113
+ "model.layers.16.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
114
+ "model.layers.16.self_attn.v_proj.bias": "model-00004-of-00014.safetensors",
115
+ "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
116
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00014.safetensors",
117
+ "model.layers.17.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
118
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
119
+ "model.layers.17.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
120
+ "model.layers.17.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
121
+ "model.layers.17.self_attn.k_proj.bias": "model-00011-of-00014.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
124
+ "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00014.safetensors",
125
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
126
+ "model.layers.17.self_attn.v_proj.bias": "model-00013-of-00014.safetensors",
127
+ "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
128
+ "model.layers.18.input_layernorm.weight": "model-00010-of-00014.safetensors",
129
+ "model.layers.18.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
130
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
131
+ "model.layers.18.mlp.up_proj.weight": "model-00014-of-00014.safetensors",
132
+ "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
133
+ "model.layers.18.self_attn.k_proj.bias": "model-00004-of-00014.safetensors",
134
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
135
+ "model.layers.18.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
136
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00014.safetensors",
137
+ "model.layers.18.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
138
+ "model.layers.18.self_attn.v_proj.bias": "model-00006-of-00014.safetensors",
139
+ "model.layers.18.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
140
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00014.safetensors",
141
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
142
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
143
+ "model.layers.19.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
144
+ "model.layers.19.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
145
+ "model.layers.19.self_attn.k_proj.bias": "model-00005-of-00014.safetensors",
146
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
147
+ "model.layers.19.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
148
+ "model.layers.19.self_attn.q_proj.bias": "model-00010-of-00014.safetensors",
149
+ "model.layers.19.self_attn.q_proj.weight": "model-00014-of-00014.safetensors",
150
+ "model.layers.19.self_attn.v_proj.bias": "model-00012-of-00014.safetensors",
151
+ "model.layers.19.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
152
+ "model.layers.2.input_layernorm.weight": "model-00011-of-00014.safetensors",
153
+ "model.layers.2.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
154
+ "model.layers.2.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
155
+ "model.layers.2.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
156
+ "model.layers.2.post_attention_layernorm.weight": "model-00014-of-00014.safetensors",
157
+ "model.layers.2.self_attn.k_proj.bias": "model-00010-of-00014.safetensors",
158
+ "model.layers.2.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
159
+ "model.layers.2.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
160
+ "model.layers.2.self_attn.q_proj.bias": "model-00011-of-00014.safetensors",
161
+ "model.layers.2.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
162
+ "model.layers.2.self_attn.v_proj.bias": "model-00003-of-00014.safetensors",
163
+ "model.layers.2.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
164
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00014.safetensors",
165
+ "model.layers.20.mlp.down_proj.weight": "model-00014-of-00014.safetensors",
166
+ "model.layers.20.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
167
+ "model.layers.20.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
168
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
169
+ "model.layers.20.self_attn.k_proj.bias": "model-00005-of-00014.safetensors",
170
+ "model.layers.20.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
171
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
172
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00014.safetensors",
173
+ "model.layers.20.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
174
+ "model.layers.20.self_attn.v_proj.bias": "model-00012-of-00014.safetensors",
175
+ "model.layers.20.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
176
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00014.safetensors",
177
+ "model.layers.21.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
178
+ "model.layers.21.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
179
+ "model.layers.21.mlp.up_proj.weight": "model-00014-of-00014.safetensors",
180
+ "model.layers.21.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
181
+ "model.layers.21.self_attn.k_proj.bias": "model-00011-of-00014.safetensors",
182
+ "model.layers.21.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
183
+ "model.layers.21.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
184
+ "model.layers.21.self_attn.q_proj.bias": "model-00007-of-00014.safetensors",
185
+ "model.layers.21.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
186
+ "model.layers.21.self_attn.v_proj.bias": "model-00004-of-00014.safetensors",
187
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
188
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00014.safetensors",
189
+ "model.layers.22.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
190
+ "model.layers.22.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
191
+ "model.layers.22.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
192
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
193
+ "model.layers.22.self_attn.k_proj.bias": "model-00010-of-00014.safetensors",
194
+ "model.layers.22.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
195
+ "model.layers.22.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
196
+ "model.layers.22.self_attn.q_proj.bias": "model-00005-of-00014.safetensors",
197
+ "model.layers.22.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
198
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00014.safetensors",
199
+ "model.layers.22.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
200
+ "model.layers.23.input_layernorm.weight": "model-00007-of-00014.safetensors",
201
+ "model.layers.23.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
202
+ "model.layers.23.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
203
+ "model.layers.23.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
204
+ "model.layers.23.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
205
+ "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00014.safetensors",
206
+ "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
207
+ "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
208
+ "model.layers.23.self_attn.q_proj.bias": "model-00010-of-00014.safetensors",
209
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
210
+ "model.layers.23.self_attn.v_proj.bias": "model-00006-of-00014.safetensors",
211
+ "model.layers.23.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
212
+ "model.layers.24.input_layernorm.weight": "model-00007-of-00014.safetensors",
213
+ "model.layers.24.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
214
+ "model.layers.24.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
215
+ "model.layers.24.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
216
+ "model.layers.24.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
217
+ "model.layers.24.self_attn.k_proj.bias": "model-00006-of-00014.safetensors",
218
+ "model.layers.24.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
219
+ "model.layers.24.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
220
+ "model.layers.24.self_attn.q_proj.bias": "model-00010-of-00014.safetensors",
221
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
222
+ "model.layers.24.self_attn.v_proj.bias": "model-00012-of-00014.safetensors",
223
+ "model.layers.24.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
224
+ "model.layers.25.input_layernorm.weight": "model-00009-of-00014.safetensors",
225
+ "model.layers.25.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
226
+ "model.layers.25.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
227
+ "model.layers.25.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
228
+ "model.layers.25.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
229
+ "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00014.safetensors",
230
+ "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
231
+ "model.layers.25.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
232
+ "model.layers.25.self_attn.q_proj.bias": "model-00006-of-00014.safetensors",
233
+ "model.layers.25.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
234
+ "model.layers.25.self_attn.v_proj.bias": "model-00006-of-00014.safetensors",
235
+ "model.layers.25.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
236
+ "model.layers.26.input_layernorm.weight": "model-00009-of-00014.safetensors",
237
+ "model.layers.26.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
238
+ "model.layers.26.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
239
+ "model.layers.26.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
240
+ "model.layers.26.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
241
+ "model.layers.26.self_attn.k_proj.bias": "model-00012-of-00014.safetensors",
242
+ "model.layers.26.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
243
+ "model.layers.26.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
244
+ "model.layers.26.self_attn.q_proj.bias": "model-00008-of-00014.safetensors",
245
+ "model.layers.26.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
246
+ "model.layers.26.self_attn.v_proj.bias": "model-00009-of-00014.safetensors",
247
+ "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
248
+ "model.layers.27.input_layernorm.weight": "model-00006-of-00014.safetensors",
249
+ "model.layers.27.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
250
+ "model.layers.27.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
251
+ "model.layers.27.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
252
+ "model.layers.27.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
253
+ "model.layers.27.self_attn.k_proj.bias": "model-00012-of-00014.safetensors",
254
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00014.safetensors",
255
+ "model.layers.27.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
256
+ "model.layers.27.self_attn.q_proj.bias": "model-00005-of-00014.safetensors",
257
+ "model.layers.27.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
258
+ "model.layers.27.self_attn.v_proj.bias": "model-00009-of-00014.safetensors",
259
+ "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
260
+ "model.layers.28.input_layernorm.weight": "model-00001-of-00014.safetensors",
261
+ "model.layers.28.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
262
+ "model.layers.28.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
263
+ "model.layers.28.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
264
+ "model.layers.28.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
265
+ "model.layers.28.self_attn.k_proj.bias": "model-00010-of-00014.safetensors",
266
+ "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
267
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
268
+ "model.layers.28.self_attn.q_proj.bias": "model-00007-of-00014.safetensors",
269
+ "model.layers.28.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
270
+ "model.layers.28.self_attn.v_proj.bias": "model-00005-of-00014.safetensors",
271
+ "model.layers.28.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
272
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00014.safetensors",
273
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
274
+ "model.layers.29.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
275
+ "model.layers.29.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
276
+ "model.layers.29.post_attention_layernorm.weight": "model-00001-of-00014.safetensors",
277
+ "model.layers.29.self_attn.k_proj.bias": "model-00011-of-00014.safetensors",
278
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
279
+ "model.layers.29.self_attn.o_proj.weight": "model-00014-of-00014.safetensors",
280
+ "model.layers.29.self_attn.q_proj.bias": "model-00013-of-00014.safetensors",
281
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
282
+ "model.layers.29.self_attn.v_proj.bias": "model-00014-of-00014.safetensors",
283
+ "model.layers.29.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
284
+ "model.layers.3.input_layernorm.weight": "model-00008-of-00014.safetensors",
285
+ "model.layers.3.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
286
+ "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
287
+ "model.layers.3.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
288
+ "model.layers.3.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
289
+ "model.layers.3.self_attn.k_proj.bias": "model-00012-of-00014.safetensors",
290
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
291
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
292
+ "model.layers.3.self_attn.q_proj.bias": "model-00007-of-00014.safetensors",
293
+ "model.layers.3.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
294
+ "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00014.safetensors",
295
+ "model.layers.3.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
296
+ "model.layers.30.input_layernorm.weight": "model-00006-of-00014.safetensors",
297
+ "model.layers.30.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
298
+ "model.layers.30.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
299
+ "model.layers.30.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
300
+ "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
301
+ "model.layers.30.self_attn.k_proj.bias": "model-00014-of-00014.safetensors",
302
+ "model.layers.30.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
303
+ "model.layers.30.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
304
+ "model.layers.30.self_attn.q_proj.bias": "model-00004-of-00014.safetensors",
305
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
306
+ "model.layers.30.self_attn.v_proj.bias": "model-00005-of-00014.safetensors",
307
+ "model.layers.30.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
308
+ "model.layers.31.input_layernorm.weight": "model-00013-of-00014.safetensors",
309
+ "model.layers.31.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
310
+ "model.layers.31.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
311
+ "model.layers.31.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
312
+ "model.layers.31.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
313
+ "model.layers.31.self_attn.k_proj.bias": "model-00010-of-00014.safetensors",
314
+ "model.layers.31.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
315
+ "model.layers.31.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
316
+ "model.layers.31.self_attn.q_proj.bias": "model-00003-of-00014.safetensors",
317
+ "model.layers.31.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
318
+ "model.layers.31.self_attn.v_proj.bias": "model-00007-of-00014.safetensors",
319
+ "model.layers.31.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
320
+ "model.layers.32.input_layernorm.weight": "model-00011-of-00014.safetensors",
321
+ "model.layers.32.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
322
+ "model.layers.32.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
323
+ "model.layers.32.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
324
+ "model.layers.32.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
325
+ "model.layers.32.self_attn.k_proj.bias": "model-00004-of-00014.safetensors",
326
+ "model.layers.32.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
327
+ "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
328
+ "model.layers.32.self_attn.q_proj.bias": "model-00010-of-00014.safetensors",
329
+ "model.layers.32.self_attn.q_proj.weight": "model-00014-of-00014.safetensors",
330
+ "model.layers.32.self_attn.v_proj.bias": "model-00004-of-00014.safetensors",
331
+ "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
332
+ "model.layers.33.input_layernorm.weight": "model-00010-of-00014.safetensors",
333
+ "model.layers.33.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
334
+ "model.layers.33.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
335
+ "model.layers.33.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
336
+ "model.layers.33.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
337
+ "model.layers.33.self_attn.k_proj.bias": "model-00010-of-00014.safetensors",
338
+ "model.layers.33.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
339
+ "model.layers.33.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
340
+ "model.layers.33.self_attn.q_proj.bias": "model-00005-of-00014.safetensors",
341
+ "model.layers.33.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
342
+ "model.layers.33.self_attn.v_proj.bias": "model-00004-of-00014.safetensors",
343
+ "model.layers.33.self_attn.v_proj.weight": "model-00014-of-00014.safetensors",
344
+ "model.layers.34.input_layernorm.weight": "model-00009-of-00014.safetensors",
345
+ "model.layers.34.mlp.down_proj.weight": "model-00001-of-00014.safetensors",
346
+ "model.layers.34.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
347
+ "model.layers.34.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
348
+ "model.layers.34.post_attention_layernorm.weight": "model-00012-of-00014.safetensors",
349
+ "model.layers.34.self_attn.k_proj.bias": "model-00013-of-00014.safetensors",
350
+ "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
351
+ "model.layers.34.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
352
+ "model.layers.34.self_attn.q_proj.bias": "model-00009-of-00014.safetensors",
353
+ "model.layers.34.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
354
+ "model.layers.34.self_attn.v_proj.bias": "model-00006-of-00014.safetensors",
355
+ "model.layers.34.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
356
+ "model.layers.35.input_layernorm.weight": "model-00005-of-00014.safetensors",
357
+ "model.layers.35.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
358
+ "model.layers.35.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
359
+ "model.layers.35.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
360
+ "model.layers.35.post_attention_layernorm.weight": "model-00001-of-00014.safetensors",
361
+ "model.layers.35.self_attn.k_proj.bias": "model-00011-of-00014.safetensors",
362
+ "model.layers.35.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
363
+ "model.layers.35.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
364
+ "model.layers.35.self_attn.q_proj.bias": "model-00007-of-00014.safetensors",
365
+ "model.layers.35.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
366
+ "model.layers.35.self_attn.v_proj.bias": "model-00009-of-00014.safetensors",
367
+ "model.layers.35.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
368
+ "model.layers.36.input_layernorm.weight": "model-00010-of-00014.safetensors",
369
+ "model.layers.36.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
370
+ "model.layers.36.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
371
+ "model.layers.36.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
372
+ "model.layers.36.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
373
+ "model.layers.36.self_attn.k_proj.bias": "model-00007-of-00014.safetensors",
374
+ "model.layers.36.self_attn.k_proj.weight": "model-00014-of-00014.safetensors",
375
+ "model.layers.36.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
376
+ "model.layers.36.self_attn.q_proj.bias": "model-00009-of-00014.safetensors",
377
+ "model.layers.36.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
378
+ "model.layers.36.self_attn.v_proj.bias": "model-00004-of-00014.safetensors",
379
+ "model.layers.36.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
380
+ "model.layers.37.input_layernorm.weight": "model-00012-of-00014.safetensors",
381
+ "model.layers.37.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
382
+ "model.layers.37.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
383
+ "model.layers.37.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
384
+ "model.layers.37.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
385
+ "model.layers.37.self_attn.k_proj.bias": "model-00008-of-00014.safetensors",
386
+ "model.layers.37.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
387
+ "model.layers.37.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
388
+ "model.layers.37.self_attn.q_proj.bias": "model-00008-of-00014.safetensors",
389
+ "model.layers.37.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
390
+ "model.layers.37.self_attn.v_proj.bias": "model-00010-of-00014.safetensors",
391
+ "model.layers.37.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
392
+ "model.layers.38.input_layernorm.weight": "model-00010-of-00014.safetensors",
393
+ "model.layers.38.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
394
+ "model.layers.38.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
395
+ "model.layers.38.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
396
+ "model.layers.38.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
397
+ "model.layers.38.self_attn.k_proj.bias": "model-00008-of-00014.safetensors",
398
+ "model.layers.38.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
399
+ "model.layers.38.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
400
+ "model.layers.38.self_attn.q_proj.bias": "model-00010-of-00014.safetensors",
401
+ "model.layers.38.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
402
+ "model.layers.38.self_attn.v_proj.bias": "model-00001-of-00014.safetensors",
403
+ "model.layers.38.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
404
+ "model.layers.39.input_layernorm.weight": "model-00004-of-00014.safetensors",
405
+ "model.layers.39.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
406
+ "model.layers.39.mlp.gate_proj.weight": "model-00014-of-00014.safetensors",
407
+ "model.layers.39.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
408
+ "model.layers.39.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
409
+ "model.layers.39.self_attn.k_proj.bias": "model-00008-of-00014.safetensors",
410
+ "model.layers.39.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
411
+ "model.layers.39.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
412
+ "model.layers.39.self_attn.q_proj.bias": "model-00011-of-00014.safetensors",
413
+ "model.layers.39.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
414
+ "model.layers.39.self_attn.v_proj.bias": "model-00004-of-00014.safetensors",
415
+ "model.layers.39.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
416
+ "model.layers.4.input_layernorm.weight": "model-00005-of-00014.safetensors",
417
+ "model.layers.4.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
418
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
419
+ "model.layers.4.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
420
+ "model.layers.4.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
421
+ "model.layers.4.self_attn.k_proj.bias": "model-00003-of-00014.safetensors",
422
+ "model.layers.4.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
423
+ "model.layers.4.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
424
+ "model.layers.4.self_attn.q_proj.bias": "model-00012-of-00014.safetensors",
425
+ "model.layers.4.self_attn.q_proj.weight": "model-00014-of-00014.safetensors",
426
+ "model.layers.4.self_attn.v_proj.bias": "model-00006-of-00014.safetensors",
427
+ "model.layers.4.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
428
+ "model.layers.40.input_layernorm.weight": "model-00001-of-00014.safetensors",
429
+ "model.layers.40.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
430
+ "model.layers.40.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
431
+ "model.layers.40.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
432
+ "model.layers.40.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
433
+ "model.layers.40.self_attn.k_proj.bias": "model-00010-of-00014.safetensors",
434
+ "model.layers.40.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
435
+ "model.layers.40.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
436
+ "model.layers.40.self_attn.q_proj.bias": "model-00012-of-00014.safetensors",
437
+ "model.layers.40.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
438
+ "model.layers.40.self_attn.v_proj.bias": "model-00003-of-00014.safetensors",
439
+ "model.layers.40.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
440
+ "model.layers.41.input_layernorm.weight": "model-00006-of-00014.safetensors",
441
+ "model.layers.41.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
442
+ "model.layers.41.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
443
+ "model.layers.41.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
444
+ "model.layers.41.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
445
+ "model.layers.41.self_attn.k_proj.bias": "model-00001-of-00014.safetensors",
446
+ "model.layers.41.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
447
+ "model.layers.41.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
448
+ "model.layers.41.self_attn.q_proj.bias": "model-00001-of-00014.safetensors",
449
+ "model.layers.41.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
450
+ "model.layers.41.self_attn.v_proj.bias": "model-00012-of-00014.safetensors",
451
+ "model.layers.41.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
452
+ "model.layers.42.input_layernorm.weight": "model-00001-of-00014.safetensors",
453
+ "model.layers.42.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
454
+ "model.layers.42.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
455
+ "model.layers.42.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
456
+ "model.layers.42.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
457
+ "model.layers.42.self_attn.k_proj.bias": "model-00003-of-00014.safetensors",
458
+ "model.layers.42.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
459
+ "model.layers.42.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
460
+ "model.layers.42.self_attn.q_proj.bias": "model-00010-of-00014.safetensors",
461
+ "model.layers.42.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
462
+ "model.layers.42.self_attn.v_proj.bias": "model-00010-of-00014.safetensors",
463
+ "model.layers.42.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
464
+ "model.layers.43.input_layernorm.weight": "model-00006-of-00014.safetensors",
465
+ "model.layers.43.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
466
+ "model.layers.43.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
467
+ "model.layers.43.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
468
+ "model.layers.43.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
469
+ "model.layers.43.self_attn.k_proj.bias": "model-00010-of-00014.safetensors",
470
+ "model.layers.43.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
471
+ "model.layers.43.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
472
+ "model.layers.43.self_attn.q_proj.bias": "model-00007-of-00014.safetensors",
473
+ "model.layers.43.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
474
+ "model.layers.43.self_attn.v_proj.bias": "model-00009-of-00014.safetensors",
475
+ "model.layers.43.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
476
+ "model.layers.44.input_layernorm.weight": "model-00001-of-00014.safetensors",
477
+ "model.layers.44.mlp.down_proj.weight": "model-00001-of-00014.safetensors",
478
+ "model.layers.44.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
479
+ "model.layers.44.mlp.up_proj.weight": "model-00014-of-00014.safetensors",
480
+ "model.layers.44.post_attention_layernorm.weight": "model-00014-of-00014.safetensors",
481
+ "model.layers.44.self_attn.k_proj.bias": "model-00007-of-00014.safetensors",
482
+ "model.layers.44.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
483
+ "model.layers.44.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
484
+ "model.layers.44.self_attn.q_proj.bias": "model-00011-of-00014.safetensors",
485
+ "model.layers.44.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
486
+ "model.layers.44.self_attn.v_proj.bias": "model-00011-of-00014.safetensors",
487
+ "model.layers.44.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
488
+ "model.layers.45.input_layernorm.weight": "model-00001-of-00014.safetensors",
489
+ "model.layers.45.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
490
+ "model.layers.45.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
491
+ "model.layers.45.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
492
+ "model.layers.45.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
493
+ "model.layers.45.self_attn.k_proj.bias": "model-00005-of-00014.safetensors",
494
+ "model.layers.45.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
495
+ "model.layers.45.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
496
+ "model.layers.45.self_attn.q_proj.bias": "model-00003-of-00014.safetensors",
497
+ "model.layers.45.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
498
+ "model.layers.45.self_attn.v_proj.bias": "model-00009-of-00014.safetensors",
499
+ "model.layers.45.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
500
+ "model.layers.46.input_layernorm.weight": "model-00001-of-00014.safetensors",
501
+ "model.layers.46.mlp.down_proj.weight": "model-00001-of-00014.safetensors",
502
+ "model.layers.46.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
503
+ "model.layers.46.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
504
+ "model.layers.46.post_attention_layernorm.weight": "model-00001-of-00014.safetensors",
505
+ "model.layers.46.self_attn.k_proj.bias": "model-00012-of-00014.safetensors",
506
+ "model.layers.46.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
507
+ "model.layers.46.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
508
+ "model.layers.46.self_attn.q_proj.bias": "model-00010-of-00014.safetensors",
509
+ "model.layers.46.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
510
+ "model.layers.46.self_attn.v_proj.bias": "model-00002-of-00014.safetensors",
511
+ "model.layers.46.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
512
+ "model.layers.47.input_layernorm.weight": "model-00002-of-00014.safetensors",
513
+ "model.layers.47.mlp.down_proj.weight": "model-00008-of-00014.safetensors",
514
+ "model.layers.47.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
515
+ "model.layers.47.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
516
+ "model.layers.47.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
517
+ "model.layers.47.self_attn.k_proj.bias": "model-00009-of-00014.safetensors",
518
+ "model.layers.47.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
519
+ "model.layers.47.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
520
+ "model.layers.47.self_attn.q_proj.bias": "model-00003-of-00014.safetensors",
521
+ "model.layers.47.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
522
+ "model.layers.47.self_attn.v_proj.bias": "model-00009-of-00014.safetensors",
523
+ "model.layers.47.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
524
+ "model.layers.48.input_layernorm.weight": "model-00008-of-00014.safetensors",
525
+ "model.layers.48.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
526
+ "model.layers.48.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
527
+ "model.layers.48.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
528
+ "model.layers.48.post_attention_layernorm.weight": "model-00008-of-00014.safetensors",
529
+ "model.layers.48.self_attn.k_proj.bias": "model-00013-of-00014.safetensors",
530
+ "model.layers.48.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
531
+ "model.layers.48.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
532
+ "model.layers.48.self_attn.q_proj.bias": "model-00007-of-00014.safetensors",
533
+ "model.layers.48.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
534
+ "model.layers.48.self_attn.v_proj.bias": "model-00012-of-00014.safetensors",
535
+ "model.layers.48.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
536
+ "model.layers.49.input_layernorm.weight": "model-00014-of-00014.safetensors",
537
+ "model.layers.49.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
538
+ "model.layers.49.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
539
+ "model.layers.49.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
540
+ "model.layers.49.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
541
+ "model.layers.49.self_attn.k_proj.bias": "model-00010-of-00014.safetensors",
542
+ "model.layers.49.self_attn.k_proj.weight": "model-00014-of-00014.safetensors",
543
+ "model.layers.49.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
544
+ "model.layers.49.self_attn.q_proj.bias": "model-00009-of-00014.safetensors",
545
+ "model.layers.49.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
546
+ "model.layers.49.self_attn.v_proj.bias": "model-00002-of-00014.safetensors",
547
+ "model.layers.49.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
548
+ "model.layers.5.input_layernorm.weight": "model-00004-of-00014.safetensors",
549
+ "model.layers.5.mlp.down_proj.weight": "model-00014-of-00014.safetensors",
550
+ "model.layers.5.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
551
+ "model.layers.5.mlp.up_proj.weight": "model-00014-of-00014.safetensors",
552
+ "model.layers.5.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
553
+ "model.layers.5.self_attn.k_proj.bias": "model-00003-of-00014.safetensors",
554
+ "model.layers.5.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
555
+ "model.layers.5.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
556
+ "model.layers.5.self_attn.q_proj.bias": "model-00008-of-00014.safetensors",
557
+ "model.layers.5.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
558
+ "model.layers.5.self_attn.v_proj.bias": "model-00014-of-00014.safetensors",
559
+ "model.layers.5.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
560
+ "model.layers.50.input_layernorm.weight": "model-00002-of-00014.safetensors",
561
+ "model.layers.50.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
562
+ "model.layers.50.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
563
+ "model.layers.50.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
564
+ "model.layers.50.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
565
+ "model.layers.50.self_attn.k_proj.bias": "model-00011-of-00014.safetensors",
566
+ "model.layers.50.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
567
+ "model.layers.50.self_attn.o_proj.weight": "model-00014-of-00014.safetensors",
568
+ "model.layers.50.self_attn.q_proj.bias": "model-00008-of-00014.safetensors",
569
+ "model.layers.50.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
570
+ "model.layers.50.self_attn.v_proj.bias": "model-00005-of-00014.safetensors",
571
+ "model.layers.50.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
572
+ "model.layers.51.input_layernorm.weight": "model-00003-of-00014.safetensors",
573
+ "model.layers.51.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
574
+ "model.layers.51.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
575
+ "model.layers.51.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
576
+ "model.layers.51.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
577
+ "model.layers.51.self_attn.k_proj.bias": "model-00001-of-00014.safetensors",
578
+ "model.layers.51.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
579
+ "model.layers.51.self_attn.o_proj.weight": "model-00014-of-00014.safetensors",
580
+ "model.layers.51.self_attn.q_proj.bias": "model-00006-of-00014.safetensors",
581
+ "model.layers.51.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
582
+ "model.layers.51.self_attn.v_proj.bias": "model-00001-of-00014.safetensors",
583
+ "model.layers.51.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
584
+ "model.layers.52.input_layernorm.weight": "model-00013-of-00014.safetensors",
585
+ "model.layers.52.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
586
+ "model.layers.52.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
587
+ "model.layers.52.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
588
+ "model.layers.52.post_attention_layernorm.weight": "model-00001-of-00014.safetensors",
589
+ "model.layers.52.self_attn.k_proj.bias": "model-00011-of-00014.safetensors",
590
+ "model.layers.52.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
591
+ "model.layers.52.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
592
+ "model.layers.52.self_attn.q_proj.bias": "model-00012-of-00014.safetensors",
593
+ "model.layers.52.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
594
+ "model.layers.52.self_attn.v_proj.bias": "model-00008-of-00014.safetensors",
595
+ "model.layers.52.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
596
+ "model.layers.53.input_layernorm.weight": "model-00011-of-00014.safetensors",
597
+ "model.layers.53.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
598
+ "model.layers.53.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
599
+ "model.layers.53.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
600
+ "model.layers.53.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
601
+ "model.layers.53.self_attn.k_proj.bias": "model-00008-of-00014.safetensors",
602
+ "model.layers.53.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
603
+ "model.layers.53.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
604
+ "model.layers.53.self_attn.q_proj.bias": "model-00014-of-00014.safetensors",
605
+ "model.layers.53.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
606
+ "model.layers.53.self_attn.v_proj.bias": "model-00011-of-00014.safetensors",
607
+ "model.layers.53.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
608
+ "model.layers.54.input_layernorm.weight": "model-00013-of-00014.safetensors",
609
+ "model.layers.54.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
610
+ "model.layers.54.mlp.gate_proj.weight": "model-00014-of-00014.safetensors",
611
+ "model.layers.54.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
612
+ "model.layers.54.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
613
+ "model.layers.54.self_attn.k_proj.bias": "model-00004-of-00014.safetensors",
614
+ "model.layers.54.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
615
+ "model.layers.54.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
616
+ "model.layers.54.self_attn.q_proj.bias": "model-00006-of-00014.safetensors",
617
+ "model.layers.54.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
618
+ "model.layers.54.self_attn.v_proj.bias": "model-00001-of-00014.safetensors",
619
+ "model.layers.54.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
620
+ "model.layers.55.input_layernorm.weight": "model-00014-of-00014.safetensors",
621
+ "model.layers.55.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
622
+ "model.layers.55.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
623
+ "model.layers.55.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
624
+ "model.layers.55.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
625
+ "model.layers.55.self_attn.k_proj.bias": "model-00001-of-00014.safetensors",
626
+ "model.layers.55.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
627
+ "model.layers.55.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
628
+ "model.layers.55.self_attn.q_proj.bias": "model-00012-of-00014.safetensors",
629
+ "model.layers.55.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
630
+ "model.layers.55.self_attn.v_proj.bias": "model-00001-of-00014.safetensors",
631
+ "model.layers.55.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
632
+ "model.layers.56.input_layernorm.weight": "model-00002-of-00014.safetensors",
633
+ "model.layers.56.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
634
+ "model.layers.56.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
635
+ "model.layers.56.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
636
+ "model.layers.56.post_attention_layernorm.weight": "model-00014-of-00014.safetensors",
637
+ "model.layers.56.self_attn.k_proj.bias": "model-00001-of-00014.safetensors",
638
+ "model.layers.56.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
639
+ "model.layers.56.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
640
+ "model.layers.56.self_attn.q_proj.bias": "model-00001-of-00014.safetensors",
641
+ "model.layers.56.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
642
+ "model.layers.56.self_attn.v_proj.bias": "model-00013-of-00014.safetensors",
643
+ "model.layers.56.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
644
+ "model.layers.57.input_layernorm.weight": "model-00010-of-00014.safetensors",
645
+ "model.layers.57.mlp.down_proj.weight": "model-00014-of-00014.safetensors",
646
+ "model.layers.57.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
647
+ "model.layers.57.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
648
+ "model.layers.57.post_attention_layernorm.weight": "model-00001-of-00014.safetensors",
649
+ "model.layers.57.self_attn.k_proj.bias": "model-00005-of-00014.safetensors",
650
+ "model.layers.57.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
651
+ "model.layers.57.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
652
+ "model.layers.57.self_attn.q_proj.bias": "model-00009-of-00014.safetensors",
653
+ "model.layers.57.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
654
+ "model.layers.57.self_attn.v_proj.bias": "model-00005-of-00014.safetensors",
655
+ "model.layers.57.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
656
+ "model.layers.58.input_layernorm.weight": "model-00014-of-00014.safetensors",
657
+ "model.layers.58.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
658
+ "model.layers.58.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
659
+ "model.layers.58.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
660
+ "model.layers.58.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
661
+ "model.layers.58.self_attn.k_proj.bias": "model-00006-of-00014.safetensors",
662
+ "model.layers.58.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
663
+ "model.layers.58.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
664
+ "model.layers.58.self_attn.q_proj.bias": "model-00012-of-00014.safetensors",
665
+ "model.layers.58.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
666
+ "model.layers.58.self_attn.v_proj.bias": "model-00010-of-00014.safetensors",
667
+ "model.layers.58.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
668
+ "model.layers.59.input_layernorm.weight": "model-00001-of-00014.safetensors",
669
+ "model.layers.59.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
670
+ "model.layers.59.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
671
+ "model.layers.59.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
672
+ "model.layers.59.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
673
+ "model.layers.59.self_attn.k_proj.bias": "model-00004-of-00014.safetensors",
674
+ "model.layers.59.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
675
+ "model.layers.59.self_attn.o_proj.weight": "model-00011-of-00014.safetensors",
676
+ "model.layers.59.self_attn.q_proj.bias": "model-00010-of-00014.safetensors",
677
+ "model.layers.59.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
678
+ "model.layers.59.self_attn.v_proj.bias": "model-00010-of-00014.safetensors",
679
+ "model.layers.59.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
680
+ "model.layers.6.input_layernorm.weight": "model-00012-of-00014.safetensors",
681
+ "model.layers.6.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
682
+ "model.layers.6.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
683
+ "model.layers.6.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
684
+ "model.layers.6.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
685
+ "model.layers.6.self_attn.k_proj.bias": "model-00011-of-00014.safetensors",
686
+ "model.layers.6.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
687
+ "model.layers.6.self_attn.o_proj.weight": "model-00014-of-00014.safetensors",
688
+ "model.layers.6.self_attn.q_proj.bias": "model-00004-of-00014.safetensors",
689
+ "model.layers.6.self_attn.q_proj.weight": "model-00014-of-00014.safetensors",
690
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00014.safetensors",
691
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
692
+ "model.layers.60.input_layernorm.weight": "model-00003-of-00014.safetensors",
693
+ "model.layers.60.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
694
+ "model.layers.60.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
695
+ "model.layers.60.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
696
+ "model.layers.60.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
697
+ "model.layers.60.self_attn.k_proj.bias": "model-00005-of-00014.safetensors",
698
+ "model.layers.60.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
699
+ "model.layers.60.self_attn.o_proj.weight": "model-00008-of-00014.safetensors",
700
+ "model.layers.60.self_attn.q_proj.bias": "model-00010-of-00014.safetensors",
701
+ "model.layers.60.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
702
+ "model.layers.60.self_attn.v_proj.bias": "model-00009-of-00014.safetensors",
703
+ "model.layers.60.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
704
+ "model.layers.61.input_layernorm.weight": "model-00002-of-00014.safetensors",
705
+ "model.layers.61.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
706
+ "model.layers.61.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
707
+ "model.layers.61.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
708
+ "model.layers.61.post_attention_layernorm.weight": "model-00014-of-00014.safetensors",
709
+ "model.layers.61.self_attn.k_proj.bias": "model-00012-of-00014.safetensors",
710
+ "model.layers.61.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
711
+ "model.layers.61.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
712
+ "model.layers.61.self_attn.q_proj.bias": "model-00005-of-00014.safetensors",
713
+ "model.layers.61.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
714
+ "model.layers.61.self_attn.v_proj.bias": "model-00004-of-00014.safetensors",
715
+ "model.layers.61.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
716
+ "model.layers.62.input_layernorm.weight": "model-00013-of-00014.safetensors",
717
+ "model.layers.62.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
718
+ "model.layers.62.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
719
+ "model.layers.62.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
720
+ "model.layers.62.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
721
+ "model.layers.62.self_attn.k_proj.bias": "model-00004-of-00014.safetensors",
722
+ "model.layers.62.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
723
+ "model.layers.62.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
724
+ "model.layers.62.self_attn.q_proj.bias": "model-00004-of-00014.safetensors",
725
+ "model.layers.62.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
726
+ "model.layers.62.self_attn.v_proj.bias": "model-00007-of-00014.safetensors",
727
+ "model.layers.62.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
728
+ "model.layers.63.input_layernorm.weight": "model-00008-of-00014.safetensors",
729
+ "model.layers.63.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
730
+ "model.layers.63.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
731
+ "model.layers.63.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
732
+ "model.layers.63.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
733
+ "model.layers.63.self_attn.k_proj.bias": "model-00005-of-00014.safetensors",
734
+ "model.layers.63.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
735
+ "model.layers.63.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
736
+ "model.layers.63.self_attn.q_proj.bias": "model-00007-of-00014.safetensors",
737
+ "model.layers.63.self_attn.q_proj.weight": "model-00007-of-00014.safetensors",
738
+ "model.layers.63.self_attn.v_proj.bias": "model-00010-of-00014.safetensors",
739
+ "model.layers.63.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
740
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00014.safetensors",
741
+ "model.layers.7.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
742
+ "model.layers.7.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
743
+ "model.layers.7.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
744
+ "model.layers.7.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
745
+ "model.layers.7.self_attn.k_proj.bias": "model-00007-of-00014.safetensors",
746
+ "model.layers.7.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
747
+ "model.layers.7.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
748
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00014.safetensors",
749
+ "model.layers.7.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
750
+ "model.layers.7.self_attn.v_proj.bias": "model-00007-of-00014.safetensors",
751
+ "model.layers.7.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
752
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00014.safetensors",
753
+ "model.layers.8.mlp.down_proj.weight": "model-00014-of-00014.safetensors",
754
+ "model.layers.8.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
755
+ "model.layers.8.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
756
+ "model.layers.8.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
757
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00014.safetensors",
758
+ "model.layers.8.self_attn.k_proj.weight": "model-00014-of-00014.safetensors",
759
+ "model.layers.8.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
760
+ "model.layers.8.self_attn.q_proj.bias": "model-00007-of-00014.safetensors",
761
+ "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
762
+ "model.layers.8.self_attn.v_proj.bias": "model-00004-of-00014.safetensors",
763
+ "model.layers.8.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
764
+ "model.layers.9.input_layernorm.weight": "model-00012-of-00014.safetensors",
765
+ "model.layers.9.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
766
+ "model.layers.9.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
767
+ "model.layers.9.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
768
+ "model.layers.9.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
769
+ "model.layers.9.self_attn.k_proj.bias": "model-00011-of-00014.safetensors",
770
+ "model.layers.9.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
771
+ "model.layers.9.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
772
+ "model.layers.9.self_attn.q_proj.bias": "model-00010-of-00014.safetensors",
773
+ "model.layers.9.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
774
+ "model.layers.9.self_attn.v_proj.bias": "model-00011-of-00014.safetensors",
775
+ "model.layers.9.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
776
+ "model.norm.weight": "model-00003-of-00014.safetensors",
777
+ "visual.blocks.0.attn.proj.bias": "model-00004-of-00014.safetensors",
778
+ "visual.blocks.0.attn.proj.weight": "model-00001-of-00014.safetensors",
779
+ "visual.blocks.0.attn.qkv.bias": "model-00004-of-00014.safetensors",
780
+ "visual.blocks.0.attn.qkv.weight": "model-00003-of-00014.safetensors",
781
+ "visual.blocks.0.mlp.down_proj.bias": "model-00013-of-00014.safetensors",
782
+ "visual.blocks.0.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
783
+ "visual.blocks.0.mlp.gate_proj.bias": "model-00005-of-00014.safetensors",
784
+ "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
785
+ "visual.blocks.0.mlp.up_proj.bias": "model-00002-of-00014.safetensors",
786
+ "visual.blocks.0.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
787
+ "visual.blocks.0.norm1.weight": "model-00009-of-00014.safetensors",
788
+ "visual.blocks.0.norm2.weight": "model-00001-of-00014.safetensors",
789
+ "visual.blocks.1.attn.proj.bias": "model-00001-of-00014.safetensors",
790
+ "visual.blocks.1.attn.proj.weight": "model-00001-of-00014.safetensors",
791
+ "visual.blocks.1.attn.qkv.bias": "model-00003-of-00014.safetensors",
792
+ "visual.blocks.1.attn.qkv.weight": "model-00014-of-00014.safetensors",
793
+ "visual.blocks.1.mlp.down_proj.bias": "model-00002-of-00014.safetensors",
794
+ "visual.blocks.1.mlp.down_proj.weight": "model-00014-of-00014.safetensors",
795
+ "visual.blocks.1.mlp.gate_proj.bias": "model-00010-of-00014.safetensors",
796
+ "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
797
+ "visual.blocks.1.mlp.up_proj.bias": "model-00014-of-00014.safetensors",
798
+ "visual.blocks.1.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
799
+ "visual.blocks.1.norm1.weight": "model-00010-of-00014.safetensors",
800
+ "visual.blocks.1.norm2.weight": "model-00004-of-00014.safetensors",
801
+ "visual.blocks.10.attn.proj.bias": "model-00014-of-00014.safetensors",
802
+ "visual.blocks.10.attn.proj.weight": "model-00001-of-00014.safetensors",
803
+ "visual.blocks.10.attn.qkv.bias": "model-00010-of-00014.safetensors",
804
+ "visual.blocks.10.attn.qkv.weight": "model-00013-of-00014.safetensors",
805
+ "visual.blocks.10.mlp.down_proj.bias": "model-00012-of-00014.safetensors",
806
+ "visual.blocks.10.mlp.down_proj.weight": "model-00014-of-00014.safetensors",
807
+ "visual.blocks.10.mlp.gate_proj.bias": "model-00014-of-00014.safetensors",
808
+ "visual.blocks.10.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
809
+ "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00014.safetensors",
810
+ "visual.blocks.10.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
811
+ "visual.blocks.10.norm1.weight": "model-00010-of-00014.safetensors",
812
+ "visual.blocks.10.norm2.weight": "model-00014-of-00014.safetensors",
813
+ "visual.blocks.11.attn.proj.bias": "model-00006-of-00014.safetensors",
814
+ "visual.blocks.11.attn.proj.weight": "model-00011-of-00014.safetensors",
815
+ "visual.blocks.11.attn.qkv.bias": "model-00006-of-00014.safetensors",
816
+ "visual.blocks.11.attn.qkv.weight": "model-00001-of-00014.safetensors",
817
+ "visual.blocks.11.mlp.down_proj.bias": "model-00009-of-00014.safetensors",
818
+ "visual.blocks.11.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
819
+ "visual.blocks.11.mlp.gate_proj.bias": "model-00011-of-00014.safetensors",
820
+ "visual.blocks.11.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
821
+ "visual.blocks.11.mlp.up_proj.bias": "model-00006-of-00014.safetensors",
822
+ "visual.blocks.11.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
823
+ "visual.blocks.11.norm1.weight": "model-00003-of-00014.safetensors",
824
+ "visual.blocks.11.norm2.weight": "model-00010-of-00014.safetensors",
825
+ "visual.blocks.12.attn.proj.bias": "model-00014-of-00014.safetensors",
826
+ "visual.blocks.12.attn.proj.weight": "model-00013-of-00014.safetensors",
827
+ "visual.blocks.12.attn.qkv.bias": "model-00003-of-00014.safetensors",
828
+ "visual.blocks.12.attn.qkv.weight": "model-00010-of-00014.safetensors",
829
+ "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00014.safetensors",
830
+ "visual.blocks.12.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
831
+ "visual.blocks.12.mlp.gate_proj.bias": "model-00005-of-00014.safetensors",
832
+ "visual.blocks.12.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
833
+ "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00014.safetensors",
834
+ "visual.blocks.12.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
835
+ "visual.blocks.12.norm1.weight": "model-00013-of-00014.safetensors",
836
+ "visual.blocks.12.norm2.weight": "model-00003-of-00014.safetensors",
837
+ "visual.blocks.13.attn.proj.bias": "model-00012-of-00014.safetensors",
838
+ "visual.blocks.13.attn.proj.weight": "model-00003-of-00014.safetensors",
839
+ "visual.blocks.13.attn.qkv.bias": "model-00008-of-00014.safetensors",
840
+ "visual.blocks.13.attn.qkv.weight": "model-00003-of-00014.safetensors",
841
+ "visual.blocks.13.mlp.down_proj.bias": "model-00014-of-00014.safetensors",
842
+ "visual.blocks.13.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
843
+ "visual.blocks.13.mlp.gate_proj.bias": "model-00012-of-00014.safetensors",
844
+ "visual.blocks.13.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
845
+ "visual.blocks.13.mlp.up_proj.bias": "model-00013-of-00014.safetensors",
846
+ "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
847
+ "visual.blocks.13.norm1.weight": "model-00003-of-00014.safetensors",
848
+ "visual.blocks.13.norm2.weight": "model-00014-of-00014.safetensors",
849
+ "visual.blocks.14.attn.proj.bias": "model-00004-of-00014.safetensors",
850
+ "visual.blocks.14.attn.proj.weight": "model-00013-of-00014.safetensors",
851
+ "visual.blocks.14.attn.qkv.bias": "model-00009-of-00014.safetensors",
852
+ "visual.blocks.14.attn.qkv.weight": "model-00001-of-00014.safetensors",
853
+ "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00014.safetensors",
854
+ "visual.blocks.14.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
855
+ "visual.blocks.14.mlp.gate_proj.bias": "model-00009-of-00014.safetensors",
856
+ "visual.blocks.14.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
857
+ "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00014.safetensors",
858
+ "visual.blocks.14.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
859
+ "visual.blocks.14.norm1.weight": "model-00003-of-00014.safetensors",
860
+ "visual.blocks.14.norm2.weight": "model-00011-of-00014.safetensors",
861
+ "visual.blocks.15.attn.proj.bias": "model-00010-of-00014.safetensors",
862
+ "visual.blocks.15.attn.proj.weight": "model-00009-of-00014.safetensors",
863
+ "visual.blocks.15.attn.qkv.bias": "model-00006-of-00014.safetensors",
864
+ "visual.blocks.15.attn.qkv.weight": "model-00009-of-00014.safetensors",
865
+ "visual.blocks.15.mlp.down_proj.bias": "model-00003-of-00014.safetensors",
866
+ "visual.blocks.15.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
867
+ "visual.blocks.15.mlp.gate_proj.bias": "model-00005-of-00014.safetensors",
868
+ "visual.blocks.15.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
869
+ "visual.blocks.15.mlp.up_proj.bias": "model-00012-of-00014.safetensors",
870
+ "visual.blocks.15.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
871
+ "visual.blocks.15.norm1.weight": "model-00010-of-00014.safetensors",
872
+ "visual.blocks.15.norm2.weight": "model-00012-of-00014.safetensors",
873
+ "visual.blocks.16.attn.proj.bias": "model-00001-of-00014.safetensors",
874
+ "visual.blocks.16.attn.proj.weight": "model-00012-of-00014.safetensors",
875
+ "visual.blocks.16.attn.qkv.bias": "model-00009-of-00014.safetensors",
876
+ "visual.blocks.16.attn.qkv.weight": "model-00011-of-00014.safetensors",
877
+ "visual.blocks.16.mlp.down_proj.bias": "model-00004-of-00014.safetensors",
878
+ "visual.blocks.16.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
879
+ "visual.blocks.16.mlp.gate_proj.bias": "model-00013-of-00014.safetensors",
880
+ "visual.blocks.16.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
881
+ "visual.blocks.16.mlp.up_proj.bias": "model-00007-of-00014.safetensors",
882
+ "visual.blocks.16.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
883
+ "visual.blocks.16.norm1.weight": "model-00001-of-00014.safetensors",
884
+ "visual.blocks.16.norm2.weight": "model-00014-of-00014.safetensors",
885
+ "visual.blocks.17.attn.proj.bias": "model-00004-of-00014.safetensors",
886
+ "visual.blocks.17.attn.proj.weight": "model-00004-of-00014.safetensors",
887
+ "visual.blocks.17.attn.qkv.bias": "model-00013-of-00014.safetensors",
888
+ "visual.blocks.17.attn.qkv.weight": "model-00012-of-00014.safetensors",
889
+ "visual.blocks.17.mlp.down_proj.bias": "model-00003-of-00014.safetensors",
890
+ "visual.blocks.17.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
891
+ "visual.blocks.17.mlp.gate_proj.bias": "model-00009-of-00014.safetensors",
892
+ "visual.blocks.17.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
893
+ "visual.blocks.17.mlp.up_proj.bias": "model-00010-of-00014.safetensors",
894
+ "visual.blocks.17.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
895
+ "visual.blocks.17.norm1.weight": "model-00001-of-00014.safetensors",
896
+ "visual.blocks.17.norm2.weight": "model-00007-of-00014.safetensors",
897
+ "visual.blocks.18.attn.proj.bias": "model-00009-of-00014.safetensors",
898
+ "visual.blocks.18.attn.proj.weight": "model-00003-of-00014.safetensors",
899
+ "visual.blocks.18.attn.qkv.bias": "model-00011-of-00014.safetensors",
900
+ "visual.blocks.18.attn.qkv.weight": "model-00012-of-00014.safetensors",
901
+ "visual.blocks.18.mlp.down_proj.bias": "model-00006-of-00014.safetensors",
902
+ "visual.blocks.18.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
903
+ "visual.blocks.18.mlp.gate_proj.bias": "model-00004-of-00014.safetensors",
904
+ "visual.blocks.18.mlp.gate_proj.weight": "model-00014-of-00014.safetensors",
905
+ "visual.blocks.18.mlp.up_proj.bias": "model-00008-of-00014.safetensors",
906
+ "visual.blocks.18.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
907
+ "visual.blocks.18.norm1.weight": "model-00014-of-00014.safetensors",
908
+ "visual.blocks.18.norm2.weight": "model-00003-of-00014.safetensors",
909
+ "visual.blocks.19.attn.proj.bias": "model-00003-of-00014.safetensors",
910
+ "visual.blocks.19.attn.proj.weight": "model-00001-of-00014.safetensors",
911
+ "visual.blocks.19.attn.qkv.bias": "model-00014-of-00014.safetensors",
912
+ "visual.blocks.19.attn.qkv.weight": "model-00004-of-00014.safetensors",
913
+ "visual.blocks.19.mlp.down_proj.bias": "model-00010-of-00014.safetensors",
914
+ "visual.blocks.19.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
915
+ "visual.blocks.19.mlp.gate_proj.bias": "model-00010-of-00014.safetensors",
916
+ "visual.blocks.19.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
917
+ "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00014.safetensors",
918
+ "visual.blocks.19.mlp.up_proj.weight": "model-00014-of-00014.safetensors",
919
+ "visual.blocks.19.norm1.weight": "model-00006-of-00014.safetensors",
920
+ "visual.blocks.19.norm2.weight": "model-00005-of-00014.safetensors",
921
+ "visual.blocks.2.attn.proj.bias": "model-00014-of-00014.safetensors",
922
+ "visual.blocks.2.attn.proj.weight": "model-00004-of-00014.safetensors",
923
+ "visual.blocks.2.attn.qkv.bias": "model-00005-of-00014.safetensors",
924
+ "visual.blocks.2.attn.qkv.weight": "model-00007-of-00014.safetensors",
925
+ "visual.blocks.2.mlp.down_proj.bias": "model-00010-of-00014.safetensors",
926
+ "visual.blocks.2.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
927
+ "visual.blocks.2.mlp.gate_proj.bias": "model-00014-of-00014.safetensors",
928
+ "visual.blocks.2.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
929
+ "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00014.safetensors",
930
+ "visual.blocks.2.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
931
+ "visual.blocks.2.norm1.weight": "model-00008-of-00014.safetensors",
932
+ "visual.blocks.2.norm2.weight": "model-00010-of-00014.safetensors",
933
+ "visual.blocks.20.attn.proj.bias": "model-00010-of-00014.safetensors",
934
+ "visual.blocks.20.attn.proj.weight": "model-00001-of-00014.safetensors",
935
+ "visual.blocks.20.attn.qkv.bias": "model-00010-of-00014.safetensors",
936
+ "visual.blocks.20.attn.qkv.weight": "model-00003-of-00014.safetensors",
937
+ "visual.blocks.20.mlp.down_proj.bias": "model-00012-of-00014.safetensors",
938
+ "visual.blocks.20.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
939
+ "visual.blocks.20.mlp.gate_proj.bias": "model-00005-of-00014.safetensors",
940
+ "visual.blocks.20.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
941
+ "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00014.safetensors",
942
+ "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
943
+ "visual.blocks.20.norm1.weight": "model-00005-of-00014.safetensors",
944
+ "visual.blocks.20.norm2.weight": "model-00006-of-00014.safetensors",
945
+ "visual.blocks.21.attn.proj.bias": "model-00009-of-00014.safetensors",
946
+ "visual.blocks.21.attn.proj.weight": "model-00001-of-00014.safetensors",
947
+ "visual.blocks.21.attn.qkv.bias": "model-00007-of-00014.safetensors",
948
+ "visual.blocks.21.attn.qkv.weight": "model-00007-of-00014.safetensors",
949
+ "visual.blocks.21.mlp.down_proj.bias": "model-00013-of-00014.safetensors",
950
+ "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00014.safetensors",
951
+ "visual.blocks.21.mlp.gate_proj.bias": "model-00010-of-00014.safetensors",
952
+ "visual.blocks.21.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
953
+ "visual.blocks.21.mlp.up_proj.bias": "model-00010-of-00014.safetensors",
954
+ "visual.blocks.21.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
955
+ "visual.blocks.21.norm1.weight": "model-00010-of-00014.safetensors",
956
+ "visual.blocks.21.norm2.weight": "model-00001-of-00014.safetensors",
957
+ "visual.blocks.22.attn.proj.bias": "model-00006-of-00014.safetensors",
958
+ "visual.blocks.22.attn.proj.weight": "model-00010-of-00014.safetensors",
959
+ "visual.blocks.22.attn.qkv.bias": "model-00007-of-00014.safetensors",
960
+ "visual.blocks.22.attn.qkv.weight": "model-00010-of-00014.safetensors",
961
+ "visual.blocks.22.mlp.down_proj.bias": "model-00005-of-00014.safetensors",
962
+ "visual.blocks.22.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
963
+ "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00014.safetensors",
964
+ "visual.blocks.22.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
965
+ "visual.blocks.22.mlp.up_proj.bias": "model-00014-of-00014.safetensors",
966
+ "visual.blocks.22.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
967
+ "visual.blocks.22.norm1.weight": "model-00007-of-00014.safetensors",
968
+ "visual.blocks.22.norm2.weight": "model-00001-of-00014.safetensors",
969
+ "visual.blocks.23.attn.proj.bias": "model-00010-of-00014.safetensors",
970
+ "visual.blocks.23.attn.proj.weight": "model-00013-of-00014.safetensors",
971
+ "visual.blocks.23.attn.qkv.bias": "model-00007-of-00014.safetensors",
972
+ "visual.blocks.23.attn.qkv.weight": "model-00010-of-00014.safetensors",
973
+ "visual.blocks.23.mlp.down_proj.bias": "model-00003-of-00014.safetensors",
974
+ "visual.blocks.23.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
975
+ "visual.blocks.23.mlp.gate_proj.bias": "model-00007-of-00014.safetensors",
976
+ "visual.blocks.23.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
977
+ "visual.blocks.23.mlp.up_proj.bias": "model-00004-of-00014.safetensors",
978
+ "visual.blocks.23.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
979
+ "visual.blocks.23.norm1.weight": "model-00011-of-00014.safetensors",
980
+ "visual.blocks.23.norm2.weight": "model-00005-of-00014.safetensors",
981
+ "visual.blocks.24.attn.proj.bias": "model-00008-of-00014.safetensors",
982
+ "visual.blocks.24.attn.proj.weight": "model-00002-of-00014.safetensors",
983
+ "visual.blocks.24.attn.qkv.bias": "model-00012-of-00014.safetensors",
984
+ "visual.blocks.24.attn.qkv.weight": "model-00007-of-00014.safetensors",
985
+ "visual.blocks.24.mlp.down_proj.bias": "model-00003-of-00014.safetensors",
986
+ "visual.blocks.24.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
987
+ "visual.blocks.24.mlp.gate_proj.bias": "model-00005-of-00014.safetensors",
988
+ "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
989
+ "visual.blocks.24.mlp.up_proj.bias": "model-00013-of-00014.safetensors",
990
+ "visual.blocks.24.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
991
+ "visual.blocks.24.norm1.weight": "model-00004-of-00014.safetensors",
992
+ "visual.blocks.24.norm2.weight": "model-00012-of-00014.safetensors",
993
+ "visual.blocks.25.attn.proj.bias": "model-00005-of-00014.safetensors",
994
+ "visual.blocks.25.attn.proj.weight": "model-00010-of-00014.safetensors",
995
+ "visual.blocks.25.attn.qkv.bias": "model-00007-of-00014.safetensors",
996
+ "visual.blocks.25.attn.qkv.weight": "model-00003-of-00014.safetensors",
997
+ "visual.blocks.25.mlp.down_proj.bias": "model-00006-of-00014.safetensors",
998
+ "visual.blocks.25.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
999
+ "visual.blocks.25.mlp.gate_proj.bias": "model-00002-of-00014.safetensors",
1000
+ "visual.blocks.25.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
1001
+ "visual.blocks.25.mlp.up_proj.bias": "model-00004-of-00014.safetensors",
1002
+ "visual.blocks.25.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
1003
+ "visual.blocks.25.norm1.weight": "model-00014-of-00014.safetensors",
1004
+ "visual.blocks.25.norm2.weight": "model-00010-of-00014.safetensors",
1005
+ "visual.blocks.26.attn.proj.bias": "model-00010-of-00014.safetensors",
1006
+ "visual.blocks.26.attn.proj.weight": "model-00009-of-00014.safetensors",
1007
+ "visual.blocks.26.attn.qkv.bias": "model-00003-of-00014.safetensors",
1008
+ "visual.blocks.26.attn.qkv.weight": "model-00011-of-00014.safetensors",
1009
+ "visual.blocks.26.mlp.down_proj.bias": "model-00009-of-00014.safetensors",
1010
+ "visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00014.safetensors",
1011
+ "visual.blocks.26.mlp.gate_proj.bias": "model-00012-of-00014.safetensors",
1012
+ "visual.blocks.26.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
1013
+ "visual.blocks.26.mlp.up_proj.bias": "model-00011-of-00014.safetensors",
1014
+ "visual.blocks.26.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
1015
+ "visual.blocks.26.norm1.weight": "model-00005-of-00014.safetensors",
1016
+ "visual.blocks.26.norm2.weight": "model-00006-of-00014.safetensors",
1017
+ "visual.blocks.27.attn.proj.bias": "model-00007-of-00014.safetensors",
1018
+ "visual.blocks.27.attn.proj.weight": "model-00001-of-00014.safetensors",
1019
+ "visual.blocks.27.attn.qkv.bias": "model-00005-of-00014.safetensors",
1020
+ "visual.blocks.27.attn.qkv.weight": "model-00012-of-00014.safetensors",
1021
+ "visual.blocks.27.mlp.down_proj.bias": "model-00010-of-00014.safetensors",
1022
+ "visual.blocks.27.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
1023
+ "visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00014.safetensors",
1024
+ "visual.blocks.27.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
1025
+ "visual.blocks.27.mlp.up_proj.bias": "model-00003-of-00014.safetensors",
1026
+ "visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
1027
+ "visual.blocks.27.norm1.weight": "model-00004-of-00014.safetensors",
1028
+ "visual.blocks.27.norm2.weight": "model-00002-of-00014.safetensors",
1029
+ "visual.blocks.28.attn.proj.bias": "model-00006-of-00014.safetensors",
1030
+ "visual.blocks.28.attn.proj.weight": "model-00009-of-00014.safetensors",
1031
+ "visual.blocks.28.attn.qkv.bias": "model-00010-of-00014.safetensors",
1032
+ "visual.blocks.28.attn.qkv.weight": "model-00014-of-00014.safetensors",
1033
+ "visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00014.safetensors",
1034
+ "visual.blocks.28.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
1035
+ "visual.blocks.28.mlp.gate_proj.bias": "model-00013-of-00014.safetensors",
1036
+ "visual.blocks.28.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
1037
+ "visual.blocks.28.mlp.up_proj.bias": "model-00002-of-00014.safetensors",
1038
+ "visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
1039
+ "visual.blocks.28.norm1.weight": "model-00003-of-00014.safetensors",
1040
+ "visual.blocks.28.norm2.weight": "model-00013-of-00014.safetensors",
1041
+ "visual.blocks.29.attn.proj.bias": "model-00001-of-00014.safetensors",
1042
+ "visual.blocks.29.attn.proj.weight": "model-00013-of-00014.safetensors",
1043
+ "visual.blocks.29.attn.qkv.bias": "model-00012-of-00014.safetensors",
1044
+ "visual.blocks.29.attn.qkv.weight": "model-00011-of-00014.safetensors",
1045
+ "visual.blocks.29.mlp.down_proj.bias": "model-00008-of-00014.safetensors",
1046
+ "visual.blocks.29.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
1047
+ "visual.blocks.29.mlp.gate_proj.bias": "model-00007-of-00014.safetensors",
1048
+ "visual.blocks.29.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
1049
+ "visual.blocks.29.mlp.up_proj.bias": "model-00006-of-00014.safetensors",
1050
+ "visual.blocks.29.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
1051
+ "visual.blocks.29.norm1.weight": "model-00013-of-00014.safetensors",
1052
+ "visual.blocks.29.norm2.weight": "model-00004-of-00014.safetensors",
1053
+ "visual.blocks.3.attn.proj.bias": "model-00002-of-00014.safetensors",
1054
+ "visual.blocks.3.attn.proj.weight": "model-00001-of-00014.safetensors",
1055
+ "visual.blocks.3.attn.qkv.bias": "model-00001-of-00014.safetensors",
1056
+ "visual.blocks.3.attn.qkv.weight": "model-00007-of-00014.safetensors",
1057
+ "visual.blocks.3.mlp.down_proj.bias": "model-00006-of-00014.safetensors",
1058
+ "visual.blocks.3.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
1059
+ "visual.blocks.3.mlp.gate_proj.bias": "model-00011-of-00014.safetensors",
1060
+ "visual.blocks.3.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
1061
+ "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00014.safetensors",
1062
+ "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
1063
+ "visual.blocks.3.norm1.weight": "model-00003-of-00014.safetensors",
1064
+ "visual.blocks.3.norm2.weight": "model-00004-of-00014.safetensors",
1065
+ "visual.blocks.30.attn.proj.bias": "model-00003-of-00014.safetensors",
1066
+ "visual.blocks.30.attn.proj.weight": "model-00006-of-00014.safetensors",
1067
+ "visual.blocks.30.attn.qkv.bias": "model-00003-of-00014.safetensors",
1068
+ "visual.blocks.30.attn.qkv.weight": "model-00003-of-00014.safetensors",
1069
+ "visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00014.safetensors",
1070
+ "visual.blocks.30.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
1071
+ "visual.blocks.30.mlp.gate_proj.bias": "model-00005-of-00014.safetensors",
1072
+ "visual.blocks.30.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
1073
+ "visual.blocks.30.mlp.up_proj.bias": "model-00010-of-00014.safetensors",
1074
+ "visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
1075
+ "visual.blocks.30.norm1.weight": "model-00002-of-00014.safetensors",
1076
+ "visual.blocks.30.norm2.weight": "model-00001-of-00014.safetensors",
1077
+ "visual.blocks.31.attn.proj.bias": "model-00013-of-00014.safetensors",
1078
+ "visual.blocks.31.attn.proj.weight": "model-00007-of-00014.safetensors",
1079
+ "visual.blocks.31.attn.qkv.bias": "model-00002-of-00014.safetensors",
1080
+ "visual.blocks.31.attn.qkv.weight": "model-00014-of-00014.safetensors",
1081
+ "visual.blocks.31.mlp.down_proj.bias": "model-00013-of-00014.safetensors",
1082
+ "visual.blocks.31.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
1083
+ "visual.blocks.31.mlp.gate_proj.bias": "model-00010-of-00014.safetensors",
1084
+ "visual.blocks.31.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
1085
+ "visual.blocks.31.mlp.up_proj.bias": "model-00002-of-00014.safetensors",
1086
+ "visual.blocks.31.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
1087
+ "visual.blocks.31.norm1.weight": "model-00001-of-00014.safetensors",
1088
+ "visual.blocks.31.norm2.weight": "model-00001-of-00014.safetensors",
1089
+ "visual.blocks.4.attn.proj.bias": "model-00013-of-00014.safetensors",
1090
+ "visual.blocks.4.attn.proj.weight": "model-00011-of-00014.safetensors",
1091
+ "visual.blocks.4.attn.qkv.bias": "model-00013-of-00014.safetensors",
1092
+ "visual.blocks.4.attn.qkv.weight": "model-00009-of-00014.safetensors",
1093
+ "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00014.safetensors",
1094
+ "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00014.safetensors",
1095
+ "visual.blocks.4.mlp.gate_proj.bias": "model-00012-of-00014.safetensors",
1096
+ "visual.blocks.4.mlp.gate_proj.weight": "model-00014-of-00014.safetensors",
1097
+ "visual.blocks.4.mlp.up_proj.bias": "model-00007-of-00014.safetensors",
1098
+ "visual.blocks.4.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
1099
+ "visual.blocks.4.norm1.weight": "model-00005-of-00014.safetensors",
1100
+ "visual.blocks.4.norm2.weight": "model-00001-of-00014.safetensors",
1101
+ "visual.blocks.5.attn.proj.bias": "model-00003-of-00014.safetensors",
1102
+ "visual.blocks.5.attn.proj.weight": "model-00006-of-00014.safetensors",
1103
+ "visual.blocks.5.attn.qkv.bias": "model-00012-of-00014.safetensors",
1104
+ "visual.blocks.5.attn.qkv.weight": "model-00014-of-00014.safetensors",
1105
+ "visual.blocks.5.mlp.down_proj.bias": "model-00010-of-00014.safetensors",
1106
+ "visual.blocks.5.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
1107
+ "visual.blocks.5.mlp.gate_proj.bias": "model-00014-of-00014.safetensors",
1108
+ "visual.blocks.5.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
1109
+ "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00014.safetensors",
1110
+ "visual.blocks.5.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
1111
+ "visual.blocks.5.norm1.weight": "model-00009-of-00014.safetensors",
1112
+ "visual.blocks.5.norm2.weight": "model-00011-of-00014.safetensors",
1113
+ "visual.blocks.6.attn.proj.bias": "model-00001-of-00014.safetensors",
1114
+ "visual.blocks.6.attn.proj.weight": "model-00013-of-00014.safetensors",
1115
+ "visual.blocks.6.attn.qkv.bias": "model-00010-of-00014.safetensors",
1116
+ "visual.blocks.6.attn.qkv.weight": "model-00011-of-00014.safetensors",
1117
+ "visual.blocks.6.mlp.down_proj.bias": "model-00011-of-00014.safetensors",
1118
+ "visual.blocks.6.mlp.down_proj.weight": "model-00014-of-00014.safetensors",
1119
+ "visual.blocks.6.mlp.gate_proj.bias": "model-00011-of-00014.safetensors",
1120
+ "visual.blocks.6.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
1121
+ "visual.blocks.6.mlp.up_proj.bias": "model-00005-of-00014.safetensors",
1122
+ "visual.blocks.6.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
1123
+ "visual.blocks.6.norm1.weight": "model-00003-of-00014.safetensors",
1124
+ "visual.blocks.6.norm2.weight": "model-00004-of-00014.safetensors",
1125
+ "visual.blocks.7.attn.proj.bias": "model-00011-of-00014.safetensors",
1126
+ "visual.blocks.7.attn.proj.weight": "model-00007-of-00014.safetensors",
1127
+ "visual.blocks.7.attn.qkv.bias": "model-00004-of-00014.safetensors",
1128
+ "visual.blocks.7.attn.qkv.weight": "model-00009-of-00014.safetensors",
1129
+ "visual.blocks.7.mlp.down_proj.bias": "model-00005-of-00014.safetensors",
1130
+ "visual.blocks.7.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
1131
+ "visual.blocks.7.mlp.gate_proj.bias": "model-00009-of-00014.safetensors",
1132
+ "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
1133
+ "visual.blocks.7.mlp.up_proj.bias": "model-00013-of-00014.safetensors",
1134
+ "visual.blocks.7.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
1135
+ "visual.blocks.7.norm1.weight": "model-00014-of-00014.safetensors",
1136
+ "visual.blocks.7.norm2.weight": "model-00004-of-00014.safetensors",
1137
+ "visual.blocks.8.attn.proj.bias": "model-00004-of-00014.safetensors",
1138
+ "visual.blocks.8.attn.proj.weight": "model-00006-of-00014.safetensors",
1139
+ "visual.blocks.8.attn.qkv.bias": "model-00004-of-00014.safetensors",
1140
+ "visual.blocks.8.attn.qkv.weight": "model-00012-of-00014.safetensors",
1141
+ "visual.blocks.8.mlp.down_proj.bias": "model-00005-of-00014.safetensors",
1142
+ "visual.blocks.8.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
1143
+ "visual.blocks.8.mlp.gate_proj.bias": "model-00003-of-00014.safetensors",
1144
+ "visual.blocks.8.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
1145
+ "visual.blocks.8.mlp.up_proj.bias": "model-00004-of-00014.safetensors",
1146
+ "visual.blocks.8.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
1147
+ "visual.blocks.8.norm1.weight": "model-00012-of-00014.safetensors",
1148
+ "visual.blocks.8.norm2.weight": "model-00005-of-00014.safetensors",
1149
+ "visual.blocks.9.attn.proj.bias": "model-00010-of-00014.safetensors",
1150
+ "visual.blocks.9.attn.proj.weight": "model-00012-of-00014.safetensors",
1151
+ "visual.blocks.9.attn.qkv.bias": "model-00003-of-00014.safetensors",
1152
+ "visual.blocks.9.attn.qkv.weight": "model-00009-of-00014.safetensors",
1153
+ "visual.blocks.9.mlp.down_proj.bias": "model-00010-of-00014.safetensors",
1154
+ "visual.blocks.9.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
1155
+ "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00014.safetensors",
1156
+ "visual.blocks.9.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
1157
+ "visual.blocks.9.mlp.up_proj.bias": "model-00003-of-00014.safetensors",
1158
+ "visual.blocks.9.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
1159
+ "visual.blocks.9.norm1.weight": "model-00014-of-00014.safetensors",
1160
+ "visual.blocks.9.norm2.weight": "model-00013-of-00014.safetensors",
1161
+ "visual.merger.ln_q.weight": "model-00012-of-00014.safetensors",
1162
+ "visual.merger.mlp.0.bias": "model-00011-of-00014.safetensors",
1163
+ "visual.merger.mlp.0.weight": "model-00013-of-00014.safetensors",
1164
+ "visual.merger.mlp.2.bias": "model-00012-of-00014.safetensors",
1165
+ "visual.merger.mlp.2.weight": "model-00006-of-00014.safetensors",
1166
+ "visual.patch_embed.proj.weight": "model-00005-of-00014.safetensors"
1167
+ }
1168
+ }
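
The `weight_map` above is the standard safetensors sharding index: each parameter name points at the shard file that stores it, so a loader never has to scan all fourteen shards. A minimal lookup sketch (a hypothetical helper, not part of this repository; it only reads the index shown above):

```python
import json

def shard_for(param_name: str, index_path: str = "model.safetensors.index.json") -> str:
    """Return the shard file that stores `param_name`, according to the index above."""
    with open(index_path) as f:
        index = json.load(f)
    return index["weight_map"][param_name]

# Example: shard_for("visual.merger.ln_q.weight") -> "model-00012-of-00014.safetensors"
```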
modeling_opencua.py ADDED
@@ -0,0 +1,449 @@
1
+ # ------------------------------------------------------------------------------
2
+ # OpenCUA‑7B Model
3
+ #
4
+ # This implementation is adapted from the Qwen2.5-VL reference code in
5
+ # Hugging Face Transformers v4.53.0:
6
+ # https://github.com/huggingface/transformers/tree/v4.53.0/src/transformers/models/qwen2_5_vl
7
+ #
8
+ # Checkpoint used for weight initialisation:
9
+ # "Qwen/Qwen2.5-VL-32B-Instruct" – https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct
10
+ #
11
+ # Key modifications
12
+ # -----------------
13
+ # • Replaced Multimodal Rotary Position Embedding (M‑RoPE) with 1‑D RoPE for
14
+ # compatibility with OpenCUA training settings.
15
+ # • Wrapped vision encoder and language model into a single
16
+ # `OpenCUAForConditionalGeneration` class.
17
+ # • Simplified weight initialisation — this file targets inference / fine‑tuning,
18
+ # not training from scratch.
19
+ #
20
+ # Copyright (c) 2025 XLANG Lab, The University of Hong Kong
21
+ #
22
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
23
+ # of this software and associated documentation files (the “Software”), to deal
24
+ # in the Software without restriction, including without limitation the rights
25
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
26
+ # copies of the Software, and to permit persons to whom the Software is
27
+ # furnished to do so, subject to the following conditions:
28
+ #
29
+ # The above copyright notice and this permission notice shall be included in all
30
+ # copies or substantial portions of the Software.
31
+ #
32
+ # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
33
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
34
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
35
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
37
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
38
+ # SOFTWARE.
39
+ #
40
+ # ------------------------------------------------------------------------------
41
+ # Prohibited Uses & Additional Disclaimer
42
+ # ---------------------------------------
43
+ # • The Software may **not** be used for any purpose or activity that violates
44
+ # applicable laws or regulations in any jurisdiction.
45
+ # • The authors, contributors, and copyright holders are **not responsible**
46
+ # for any illegal, unethical, or harmful use of the Software, nor for any
47
+ # direct or indirect damages resulting from such use.
48
+ # • Use of the “OpenCUA” name, logo, or trademarks does **not** imply any
49
+ # endorsement or affiliation unless a separate written permission is obtained.
50
+
51
+ import torch
52
+ import torch.nn as nn
53
+ from transformers.cache_utils import Cache
54
+ from transformers.modeling_utils import PreTrainedModel
55
+ from transformers.models.llava.modeling_llava import LlavaCausalLMOutputWithPast
56
+
57
+ from .configuration_opencua import OpenCUAConfig
58
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VisionTransformerPretrainedModel
59
+ from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM
60
+
61
+
62
+ class OpenCUAPreTrainedModel(PreTrainedModel):
63
+ config_class = OpenCUAConfig
64
+ base_model_prefix = "model"
65
+ _no_split_modules = ["Qwen2_5_VisionTransformerPretrainedModel"]
66
+ _skip_keys_device_placement = "past_key_values"
67
+ _supports_flash_attn_2 = True
68
+
69
+ def _init_weights(self, module):
70
+ # important: this ported version of Llava isn't meant for training from scratch - only
71
+ # inference and fine-tuning - so the proper init weights code has been removed - the original codebase
72
+ # https://github.com/haotian-liu/LLaVA/tree/main/llava should serve for that purpose
73
+ std = (
74
+ self.config.initializer_range
75
+ if hasattr(self.config, "initializer_range")
76
+ else self.config.text_config.initializer_range
77
+ )
78
+
79
+ if hasattr(module, "class_embedding"):
80
+ module.class_embedding.data.normal_(mean=0.0, std=std)
81
+
82
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
83
+ module.weight.data.normal_(mean=0.0, std=std)
84
+ if module.bias is not None:
85
+ module.bias.data.zero_()
86
+ elif isinstance(module, nn.Embedding):
87
+ module.weight.data.normal_(mean=0.0, std=std)
88
+ if module.padding_idx is not None:
89
+ module.weight.data[module.padding_idx].zero_()
90
+
91
+ @property
92
+ def _supports_sdpa(self):
93
+ """
94
+ Retrieve language_model's attribute to check whether the model supports
95
+ SDPA or not.
96
+ """
97
+ return self.language_model._supports_sdpa
98
+
99
+
100
+ class OpenCUAForConditionalGeneration(OpenCUAPreTrainedModel):
101
+
102
+ def __init__(self, config: OpenCUAConfig):
103
+ super().__init__(config)
104
+ self.vision_tower = Qwen2_5_VisionTransformerPretrainedModel(config.vision_config)
105
+ self.language_model = Qwen2ForCausalLM(config.text_config)
106
+ self.post_init()
107
+
108
+ def get_input_embeddings(self):
109
+ return self.language_model.get_input_embeddings()
110
+
111
+ def set_input_embeddings(self, value):
112
+ self.language_model.set_input_embeddings(value)
113
+
114
+ def get_output_embeddings(self):
115
+ return self.language_model.get_output_embeddings()
116
+
117
+ def set_output_embeddings(self, new_embeddings):
118
+ self.language_model.set_output_embeddings(new_embeddings)
119
+
120
+ def set_decoder(self, decoder):
121
+ self.language_model.set_decoder(decoder)
122
+
123
+ def get_decoder(self):
124
+ return self.language_model.get_decoder()
125
+
126
+ def tie_weights(self):
127
+ return self.language_model.tie_weights()
128
+
129
+ def resize_token_embeddings(self, new_num_tokens: int | None = None, pad_to_multiple_of=None) -> nn.Embedding:
130
+ model_embeds = self.language_model.resize_token_embeddings(
131
+ new_num_tokens, pad_to_multiple_of)
132
+ # update vocab size
133
+ self.config.text_config.vocab_size = model_embeds.num_embeddings
134
+ self.vocab_size = model_embeds.num_embeddings
135
+ return model_embeds
136
+
137
+ def _merge_input_ids_with_image_features(
138
+ self,
139
+ image_features: torch.Tensor,
140
+ feature_lengths: list[int],
141
+ inputs_embeds: torch.Tensor,
142
+ input_ids: torch.Tensor,
143
+ attention_mask: torch.Tensor,
144
+ labels: torch.Tensor | None = None):
145
+ """
146
+ Args:
147
+ image_features (:obj:`torch.Tensor` of shape :obj:`(num_image_tokens, embed_dim)`):
148
+ The image features to merge with the input embeddings.
149
+ feature_lengths: the length of image feature.
150
+ inputs_embeds (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length, embed_dim)`):
151
+ The input embeddings.
152
+ input_ids (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`):
153
+ The input ids.
154
+ attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`):
155
+ The attention mask.
156
+ labels (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, *optional*):
157
+ The labels.
158
+ """
159
+
160
+ image_token_index: int = self.config.media_placeholder_token_id
161
+ pad_token_id: int = self.config.pad_token_id
162
+ ignore_index: int = self.config.ignore_index
163
+
164
+ _, embed_dim = image_features.shape
165
+
166
+ batch_size, sequence_length = input_ids.shape
167
+ left_padding = not torch.sum(
168
+ input_ids[:, -1] == torch.tensor(pad_token_id))
169
+
170
+ # 1. Create a mask to know where special image tokens are
171
+ _token_occupation_table = torch.ones_like(input_ids.flatten())
172
+ _token_occupation_table[input_ids.flatten() == image_token_index] = \
173
+ torch.tensor(feature_lengths,
174
+ dtype=torch.long, device=input_ids.device)
175
+ _token_occupation_table = _token_occupation_table.reshape(
176
+ input_ids.shape)
177
+
178
+ max_embed_dim = _token_occupation_table.sum(-1).max().item()
179
+ assert max_embed_dim >= sequence_length, (
180
+ f"The maximum embedding dimension ({max_embed_dim}) is less than the sequence length ({sequence_length})"
181
+ )
182
+ batch_indices, non_image_indices = torch.where(input_ids != image_token_index)
183
+
184
+ # 2. Compute the positions where text should be written
185
+ # Calculate new positions for text tokens in merged image-text sequence.
186
+ new_token_positions = torch.cumsum(_token_occupation_table, -1) - 1
187
+ nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
188
+ if left_padding:
189
+ new_token_positions += nb_image_pad[:, None] # offset for left padding
190
+ text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
191
+
192
+ # 3. Create the full embedding, already padded to the maximum position
193
+ final_embedding = torch.zeros(
194
+ batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
195
+ )
196
+ final_attention_mask = torch.zeros(
197
+ batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
198
+ )
199
+ if labels is not None:
200
+ final_labels = torch.full(
201
+ (batch_size, max_embed_dim), ignore_index, dtype=input_ids.dtype, device=input_ids.device
202
+ )
203
+ # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
204
+ # set the corresponding tensors into their correct target device.
205
+ target_device = inputs_embeds.device
206
+ batch_indices, non_image_indices, text_to_overwrite = (
207
+ batch_indices.to(target_device),
208
+ non_image_indices.to(target_device),
209
+ text_to_overwrite.to(target_device),
210
+ )
211
+ attention_mask = attention_mask.to(target_device)
212
+
213
+ # 4. Fill the embeddings based on the mask.
214
+ final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
215
+ final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
216
+ if labels is not None:
217
+ final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_image_indices]
218
+
219
+ # 5. Fill the embeddings corresponding to the images. Anything that is not `text_positions` needs filling (#29835)
220
+ image_to_overwrite = torch.full(
221
+ (batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device
222
+ )
223
+ image_to_overwrite[batch_indices, text_to_overwrite] = False
224
+ image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
225
+
226
+ if image_to_overwrite.sum() != image_features.shape[:-1].numel():
227
+ raise ValueError(
228
+ f"The inputs provided to the model are wrong. The number of image tokens is {image_to_overwrite.sum()} while"
229
+ f" the number of image features given to the model is {image_features.shape[:-1].numel()}. "
230
+ "This prevents correct indexing and breaks batch generation."
231
+ )
232
+
233
+ final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
234
+ final_attention_mask |= image_to_overwrite
235
+ position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
236
+
237
+ # 6. Mask out the embedding at padding positions, as we later use the past_key_value value to determine the non-attended tokens.
238
+ batch_indices, pad_indices = torch.where(input_ids == pad_token_id)
239
+ indices_to_mask = new_token_positions[batch_indices, pad_indices]
240
+
241
+ final_embedding[batch_indices, indices_to_mask] = 0
242
+
243
+ if labels is None:
244
+ final_labels = None
245
+
246
+ return final_embedding, final_attention_mask, final_labels, position_ids
247
+
248
+ def _extract_image_features(self,
249
+ pixel_values: torch.FloatTensor | list[torch.FloatTensor],
250
+ grid_thws: torch.FloatTensor,
251
+ ):
252
+ """
253
+ Args:
254
+ pixel_values (:obj:`torch.FloatTensor` of shape :obj:`(sum_num_image_tokens, channels)`):
255
+ The pixel values of the images processed by image processor.
256
+ grid_thws: (B,3)
257
+
258
+ Returns:
259
+ selected_image_feature (:obj:`torch.FloatTensor` of shape :obj:`(num_image_tokens, embed_dim)`):
260
+ The selected image features to use as input to the projector head.
261
+
262
+ """
263
+
264
+ assert len(grid_thws.shape)==2 and grid_thws.shape[1]==3, f"grid_thws must be a 2D tensor with shape (batched, 3), but got {grid_thws.shape}"
265
+ if isinstance(pixel_values, list):
266
+ pixel_values = torch.cat(pixel_values, dim=0)
267
+ image_features_ = self.vision_tower(pixel_values, grid_thw=grid_thws)
268
+ image_features_list = []
269
+ start_idx = 0
270
+ for i, grid_thw in enumerate(grid_thws):
271
+ end_idx = start_idx + (grid_thw[0] * grid_thw[1] * grid_thw[2]) // 4
272
+ image_features_list.append(image_features_[start_idx:end_idx, :])
273
+ start_idx = end_idx
274
+
275
+ selected_image_feature = torch.cat(image_features_list, dim=0)
276
+ feature_lengths = [x.size(0) for x in image_features_list]
277
+ return selected_image_feature, feature_lengths
278
+
279
+ def forward(
280
+ self,
281
+ input_ids: torch.LongTensor | None = None,
282
+ pixel_values: torch.FloatTensor | list[torch.FloatTensor] | None = None,
283
+ grid_thws: torch.Tensor = None,
284
+ attention_mask: torch.Tensor | None = None,
285
+ position_ids: torch.LongTensor | None = None,
286
+ past_key_values: list[torch.FloatTensor] | None = None,
287
+ inputs_embeds: torch.FloatTensor | None = None,
288
+ labels: torch.LongTensor | None = None,
289
+ use_cache: bool | None = None,
290
+ output_attentions: bool | None = None,
291
+ output_hidden_states: bool | None = None,
292
+ return_dict: bool | None = None,
293
+ ) -> tuple | LlavaCausalLMOutputWithPast:
294
+ r"""
295
+ Args:
296
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
297
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
298
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
299
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
300
+
301
+ """
302
+
303
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
304
+ output_hidden_states = (
305
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
306
+ )
307
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
308
+ if inputs_embeds is None:
309
+ # 1. Extract the input embeddings
310
+ inputs_embeds = self.get_input_embeddings()(input_ids)
311
+ # 2. Merge text and images
312
+ if pixel_values is not None and len(pixel_values) > 0 and input_ids.shape[1] != 1:
313
+ image_feature, feature_lengths = self._extract_image_features(
314
+ pixel_values, grid_thws)
315
+
316
+ inputs_embeds = inputs_embeds.to(image_feature.dtype) # num_tokens, embed_dim
317
+ inputs_embeds, attention_mask, labels, position_ids = \
318
+ self._merge_input_ids_with_image_features(image_feature, feature_lengths, inputs_embeds, input_ids, attention_mask, labels
319
+ )
320
+ # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
321
+ # generation with cache
322
+ elif past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
323
+ # Retrieve the first layer to inspect the logits and mask out the hidden states
324
+ # that are set to 0
325
+ first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
326
+
327
+ # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
328
+ batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
329
+
330
+ # Get the target length
331
+ target_length = input_ids.shape[1]
332
+ past_length = first_layer_past_key_value.shape[-1]
333
+
334
+ extended_attention_mask = torch.ones(
335
+ (attention_mask.shape[0], past_length),
336
+ dtype=attention_mask.dtype,
337
+ device=attention_mask.device,
338
+ )
339
+
340
+ # Filter out only the tokens that can be un-attended, this can happen
341
+ # if one uses Llava + Fused modules where the cache on the
342
+ # first iteration is already big enough, or if one passes custom cache
343
+ valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
344
+ new_batch_index = batch_index[valid_indices]
345
+ new_non_attended_tokens = non_attended_tokens[valid_indices]
346
+
347
+ # Zero-out the places where we don't need to attend
348
+ extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
349
+
350
+ attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
351
+ position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
352
+
353
+ outputs = self.language_model(
354
+ attention_mask=attention_mask,
355
+ position_ids=position_ids,
356
+ past_key_values=past_key_values,
357
+ inputs_embeds=inputs_embeds,
358
+ use_cache=use_cache,
359
+ output_attentions=output_attentions,
360
+ output_hidden_states=output_hidden_states,
361
+ return_dict=return_dict,
362
+ )
363
+
364
+ logits = outputs[0]
365
+
366
+ loss = None
367
+ if labels is not None:
368
+ # Shift so that tokens < n predict n
369
+ if attention_mask is not None:
370
+ shift_attention_mask = attention_mask[..., 1:]
371
+ shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
372
+ shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
373
+ else:
374
+ shift_logits = logits[..., :-1, :].contiguous()
375
+ shift_labels = labels[..., 1:].contiguous()
376
+ # Flatten the tokens
377
+ loss_fct = nn.CrossEntropyLoss()
378
+ loss = loss_fct(
379
+ shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
380
+ )
381
+
382
+ if not return_dict:
383
+ output = (logits,) + outputs[1:]
384
+ return (loss,) + output if loss is not None else output
385
+
386
+ return LlavaCausalLMOutputWithPast(
387
+ loss=loss,
388
+ logits=logits,
389
+ past_key_values=outputs.past_key_values,
390
+ hidden_states=outputs.hidden_states,
391
+ attentions=outputs.attentions,
392
+ )
393
+
394
+ def prepare_inputs_for_generation(
395
+ self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, grid_thws=None, attention_mask=None, **kwargs
396
+ ):
397
+ if past_key_values is not None:
398
+ if isinstance(past_key_values, Cache):
399
+ cache_length = past_key_values.get_seq_length()
400
+ past_length = past_key_values.seen_tokens
401
+ else:
402
+ cache_length = past_length = past_key_values[0][0].shape[2]
403
+
404
+ # Keep only the unprocessed tokens:
405
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
406
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
407
+ # input)
408
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
409
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
410
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
411
+ # input_ids based on the past_length.
412
+ elif past_length < input_ids.shape[1]:
413
+ input_ids = input_ids[:, past_length:]
414
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
415
+ elif self.config.media_placeholder_token_id in input_ids:
416
+ input_ids = input_ids[:, input_ids.shape[1] - 1 :]
417
+ # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
418
+ # older attention values, as their corresponding values are not part of the input.
419
+ if cache_length < past_length and attention_mask is not None:
420
+ attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]
421
+
422
+ position_ids = kwargs.get("position_ids", None)
423
+ if attention_mask is not None and position_ids is None:
424
+ # create position_ids on the fly for batch generation
425
+ position_ids = attention_mask.long().cumsum(-1) - 1
426
+ position_ids.masked_fill_(attention_mask == 0, 1)
427
+ if past_key_values:
428
+ position_ids = position_ids[:, -input_ids.shape[1] :]
429
+
430
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
431
+ if inputs_embeds is not None and past_key_values is None:
432
+ model_inputs = {"inputs_embeds": inputs_embeds}
433
+ else:
434
+ model_inputs = {"input_ids": input_ids}
435
+
436
+ model_inputs.update(
437
+ {
438
+ "position_ids": position_ids,
439
+ "past_key_values": past_key_values,
440
+ "use_cache": kwargs.get("use_cache"),
441
+ "attention_mask": attention_mask,
442
+ "pixel_values": pixel_values,
443
+ "grid_thws": grid_thws,
444
+ }
445
+ )
446
+ return model_inputs
447
+
448
+ def _reorder_cache(self, *args, **kwargs):
449
+ return self.language_model._reorder_cache(*args, **kwargs)
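
For orientation, a minimal loading sketch for the class defined above. This is an assumption-laden example rather than the repository's documented API: the repository id below is a placeholder, and it presumes that the repo's `config.json` registers `OpenCUAConfig` and `OpenCUAForConditionalGeneration` through `auto_map` so that `trust_remote_code` loading can resolve them.

```python
import torch
from transformers import AutoConfig, AutoModel

repo_id = "path/to/this-repo"  # placeholder, replace with the actual repository id

# Assumption: auto_map in config.json points AutoModel at OpenCUAForConditionalGeneration.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(
    repo_id,
    config=config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
).eval()
```

At generation time, `forward` expects `pixel_values` and `grid_thws` alongside `input_ids`, as defined in the file above; `prepare_inputs_for_generation` carries both through the cache-aware decoding path.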
preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "min_pixels": 3136,
3
+ "max_pixels": 12845056,
4
+ "patch_size": 14,
5
+ "temporal_patch_size": 2,
6
+ "merge_size": 2,
7
+ "image_mean": [
8
+ 0.48145466,
9
+ 0.4578275,
10
+ 0.40821073
11
+ ],
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "image_processor_type": "Qwen2VLImageProcessor"
18
+ }
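
The pixel bounds above translate directly into a visual-token budget: with `patch_size` 14 and a 2×2 spatial merge, each merged visual token covers a 28×28-pixel area. A rough check of the implied limits (assuming the usual Qwen2-VL resize-to-pixel-budget behaviour; exact counts depend on aspect-ratio rounding):

```python
patch_size, merge_size = 14, 2
min_pixels, max_pixels = 3136, 12845056

pixels_per_token = (patch_size * merge_size) ** 2  # 784 pixels per merged visual token
print(min_pixels // pixels_per_token)              # 4 tokens at the minimum resolution
print(max_pixels // pixels_per_token)              # 16384 tokens at the maximum resolution
```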
processing_opencua.py ADDED
@@ -0,0 +1,93 @@
1
+ # processing_opencua.py
2
+ import torch
3
+ from typing import List, Dict, Any, Union
4
+ from PIL import Image
5
+ from transformers.processing_utils import ProcessorMixin, BatchFeature
6
+ from transformers import AutoTokenizer, AutoImageProcessor
7
+
8
+ PLACEHOLDER = "<|media_placeholder|>"
9
+
10
+ class OpenCUAProcessor(ProcessorMixin):
11
+ attributes = ["image_processor", "tokenizer", "image_token_id", "merge_size"]
12
+
13
+ def __init__(self, image_processor, tokenizer, image_token_id: int = 151664, merge_size: int = 2, **kwargs):
14
+ self.image_processor = image_processor
15
+ self.tokenizer = tokenizer
16
+ self.image_token_id = image_token_id
17
+ self.merge_size = getattr(image_processor, "merge_size", merge_size)
18
+
19
+ @classmethod
20
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
21
+ trust = kwargs.get("trust_remote_code", True)
22
+ # Prefer this repo's TikTokenV3; fall back to AutoTokenizer on failure (only used for initialization / as a placeholder)
23
+ try:
24
+ from tokenization_opencua import TikTokenV3
25
+ tok = TikTokenV3.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust)
26
+ except Exception:
27
+ tok = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust)
28
+ imgproc = AutoImageProcessor.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust)
29
+ return cls(imgproc, tok, **kwargs)
30
+
31
+ def apply_chat_template(self, messages: List[Dict[str, Any]], **kwargs) -> Union[str, List[int]]:
32
+ return self.tokenizer.apply_chat_template(messages, **kwargs)
33
+
34
+ # The methods below serve the Hugging Face code path; vLLM initialization only needs this class to instantiate successfully
35
+ def __call__(self, *args, **kwargs) -> BatchFeature:
36
+ # Return a minimal structure so an unexpected call does not crash
37
+ data = {"input_ids": torch.zeros(1, 1, dtype=torch.long)}
38
+ return BatchFeature(data=data)
39
+
40
+ # Optional helper for your own scripts
41
+ def prepare_vllm_inputs(self, messages, images, add_generation_prompt=True):
42
+ text = self.apply_chat_template(messages, tokenize=False, add_generation_prompt=add_generation_prompt)
43
+ proc = self.image_processor(images=images, return_tensors="pt")
44
+ grid = torch.as_tensor(proc["image_grid_thw"])
45
+ merge = getattr(self, "merge_size", 2)
46
+ for thw in grid:
47
+ num = int((thw[0] * thw[1] * thw[2]) // (merge ** 2))
48
+ text = text.replace(PLACEHOLDER, PLACEHOLDER * num, 1)
49
+ return text, images
50
+
51
+
52
+
53
+ # # processing_opencua.py
54
+ # from transformers import Qwen2_5_VLProcessor, AutoTokenizer, AutoImageProcessor
55
+
56
+ # class OpenCUAProcessor(Qwen2_5_VLProcessor):
57
+ # # A string is enough here, but we load it manually in from_pretrained to avoid string-based reflection
58
+ # tokenizer_class = "TikTokenV3"
59
+
60
+ # @classmethod
61
+ # def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
62
+ # # Make sure remote code is allowed
63
+ # trust_remote_code = kwargs.get("trust_remote_code", False)
64
+
65
+ # # 1) Load the tokenizer manually (resolved via tokenizer_config.json in the model directory -> TikTokenV3 + tokenization_opencua.py)
66
+ # tokenizer = AutoTokenizer.from_pretrained(
67
+ # pretrained_model_name_or_path,
68
+ # trust_remote_code=trust_remote_code,
69
+ # )
70
+
71
+ # # 2) Load the image processor manually (keeping Qwen2VLImageProcessor)
72
+ # image_processor = AutoImageProcessor.from_pretrained(
73
+ # pretrained_model_name_or_path,
74
+ # trust_remote_code=trust_remote_code,
75
+ # )
76
+
77
+ # # 3) Fetch the chat_template if the tokenizer provides one
78
+ # chat_template = getattr(tokenizer, 'chat_template', None)
79
+
80
+ # # 4) Build and return a Qwen2.5-VL Processor instance, passing the chat_template through
81
+ # processor = cls(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template)
82
+
83
+ # # 5) Add the attributes vLLM expects
84
+ # # These token IDs must match the definitions in tokenizer_config.json
85
+ # processor.image_token = "<|media_placeholder|>" # use OpenCUA's media placeholder
86
+ # processor.video_token = "<|media_placeholder|>" # videos use the same placeholder
87
+
88
+ # # Add the token IDs (taken from tokenizer_config.json)
89
+ # vocab = tokenizer.get_vocab()
90
+ # processor.image_token_id = vocab.get("<|media_placeholder|>", 151664) # defaults to 151664
91
+ # processor.video_token_id = vocab.get("<|media_placeholder|>", 151664) # videos use the same ID
92
+
93
+ # return processor
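
As a concrete illustration of the placeholder expansion in `prepare_vllm_inputs` above: each image contributes `t * h * w // merge_size**2` copies of `<|media_placeholder|>`, matching the per-image feature lengths computed by `_extract_image_features` in modeling_opencua.py. The grid values below are made-up examples, not outputs for any particular image:

```python
merge_size = 2

def num_placeholders(grid_thw):
    t, h, w = grid_thw
    return (t * h * w) // (merge_size ** 2)

print(num_placeholders((1, 64, 92)))  # 1472 placeholder tokens for this hypothetical grid
print(num_placeholders((1, 16, 16)))  # 64
```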
processor_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "processor_class": "Qwen2VLProcessor"
3
+ }
4
+
qwen.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
tiktoken.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2b1b8dfb5cc5f024bafc373121c6aba3f66f9a5a0269e243470a1de16a33186
3
+ size 2561218
tokenization_opencua.py ADDED
@@ -0,0 +1,379 @@
1
+ import os
2
+ import tiktoken
3
+
4
+ from logging import getLogger
5
+ from pathlib import Path
6
+ from typing import (
7
+ cast,
8
+ Tuple,
9
+ Dict,
10
+ Iterator,
11
+ List,
12
+ Union,
13
+ Optional,
14
+ )
15
+ from shutil import copyfile
16
+ from tiktoken.load import load_tiktoken_bpe
17
+ from tokenizers import AddedToken
18
+ from transformers.tokenization_utils import PreTrainedTokenizer
19
+ from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
20
+
21
+ # Import Qwen2Tokenizer so it can be used as a base class
22
+ try:
23
+ from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
24
+ QWEN2_AVAILABLE = True
25
+ except ImportError:
26
+ QWEN2_AVAILABLE = False
27
+ Qwen2Tokenizer = PreTrainedTokenizer
28
+
29
+
30
+ logger = getLogger(__name__)
31
+ VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
32
+
33
+ class TikTokenTokenizer(PreTrainedTokenizer):
34
+ """
35
+ Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.
36
+
37
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
38
+ this superclass for more information regarding those methods.
39
+
40
+ Args:
41
+ vocab_file (`str`):
42
+ The path to the Tiktoken model file.
43
+ bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[BOS]"`):
44
+ The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
45
+ eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[EOS]"`):
46
+ The end of sequence token.
47
+ unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[UNK]"`):
48
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
49
+ token instead. The second to last item in special_tokens.
50
+ pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[PAD]"`):
51
+ The token used for padding, for example when batching sequences of different lengths.
52
+ additional_special_tokens (list of `str`, *optional*):
53
+ A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
54
+ skipped when decoding if `skip_special_tokens` is set to `True`.
55
+ """
56
+
57
+ vocab_files_names = VOCAB_FILES_NAMES
58
+
59
+ model_input_names = ["input_ids", "attention_mask"]
60
+
61
+ special_tokens: Dict[str, int]
62
+
63
+ num_reserved_special_tokens = 256
64
+
65
+ pat_str = "|".join(
66
+ [
67
+ r"""[\p{Han}]+""",
68
+ r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
69
+ r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
70
+ r"""\p{N}{1,3}""",
71
+ r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
72
+ r"""\s*[\r\n]+""",
73
+ r"""\s+(?!\S)""",
74
+ r"""\s+""",
75
+ ]
76
+ )
77
+
78
+ def __init__(
79
+ self,
80
+ vocab_file,
81
+ bos_token: Union[str, AddedToken] = "[BOS]",
82
+ eos_token: Union[str, AddedToken] = "[EOS]",
83
+ unk_token: Union[str, AddedToken, None] = None,
84
+ pad_token: Union[str, AddedToken, None] = None,
85
+ additional_special_tokens: Optional[List[str]] = None,
86
+ added_tokens_decoder: Optional[dict] = None,
87
+ **kwargs,
88
+ ):
89
+ assert os.path.isfile(vocab_file), vocab_file
90
+
91
+ if additional_special_tokens is None:
92
+ # dumping mode
93
+ used_special_tokens = [
94
+ "<|im_end|>",
95
+ "<|im_user|>",
96
+ "<|im_assistant|>",
97
+ "<|reserved_token_0|>",
98
+ "<|start_header_id|>",
99
+ "<|end_header_id|>",
100
+ "<|reserved_token_1|>",
101
+ "[EOT]",
102
+ "<|im_system|>",
103
+ "<|reserved_token_2|>",
104
+ "<|reserved_token_3|>",
105
+ "<|reserved_token_4|>",
106
+ "<|reserved_token_5|>",
107
+ "<|reserved_token_6|>",
108
+ "<|reserved_token_7|>",
109
+ "<|im_middle|>",
110
+ "<|media_begin|>",
111
+ "<|media_content|>",
112
+ "<|media_end|>",
113
+ "<|media_placeholder|>",
114
+ # Add the tokens required by standard Qwen2.5-VL
115
+ "<|vision_start|>",
116
+ "<|vision_end|>",
117
+ "<|image_pad|>",
118
+ "<|video_pad|>",
119
+ ]
120
+ used_reserved_tokens = 12 # originally 8 reserved tokens, plus 4 new vision-related tokens
121
+ last_reserved_token_id = self.num_reserved_special_tokens - 4 - len(used_special_tokens) + used_reserved_tokens - 1
122
+ additional_special_tokens = used_special_tokens + [
123
+ f"<|reserved_token_{i}|>"
124
+ for i in range(used_reserved_tokens, last_reserved_token_id + 1)
125
+ ]
126
+ # num_reserved_special_tokens = additional_special_tokens + BOS + EOS + unk_token + pad_token
127
+ assert len(additional_special_tokens) + 4 == self.num_reserved_special_tokens, f"additional_special_tokens num: {len(additional_special_tokens)} is not correct"
128
+ # we assume that the instance is under initialization and unk_token and pad_token should be automatically inferred
129
+ if unk_token is not None:
130
+ raise ValueError("unk_token should not be set in dumping mode when additional_special_tokens is None")
131
+ if pad_token is not None:
132
+ raise ValueError("pad_token should not be set in dumping mode when additional_special_tokens is None")
133
+ # last two reserved tokens
134
+ unk_token = "[UNK]"
135
+ pad_token = "[PAD]"
136
+
137
+ logger.info(f"adding unk_token: {unk_token} and pad_token: {pad_token}")
138
+ self.additional_special_tokens = additional_special_tokens
139
+ special_tokens = [str(bos_token), str(eos_token)] + additional_special_tokens + [str(unk_token), str(pad_token)]
140
+
141
+ self.vocab_file = vocab_file
142
+ mergeable_ranks = load_tiktoken_bpe(vocab_file)
143
+ num_base_tokens = len(mergeable_ranks)
144
+ self.special_tokens = {
145
+ token: num_base_tokens + i for i, token in enumerate(special_tokens)
146
+ }
147
+ else:
148
+ self.additional_special_tokens = additional_special_tokens
149
+ special_tokens_mapping = {
150
+ i: added_tokens_decoder[i].content for i in added_tokens_decoder
151
+ }
152
+
153
+ self.vocab_file = vocab_file
154
+ mergeable_ranks = load_tiktoken_bpe(vocab_file)
155
+ num_base_tokens = len(mergeable_ranks)
156
+ self.special_tokens = {
157
+ special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
158
+ for i in range(
159
+ num_base_tokens, num_base_tokens + self.num_reserved_special_tokens + 2
160
+ )
161
+ }
162
+
163
+
164
+
165
+ self.model = tiktoken.Encoding(
166
+ name=Path(vocab_file).name,
167
+ pat_str=self.pat_str,
168
+ mergeable_ranks=mergeable_ranks,
169
+ special_tokens=self.special_tokens,
170
+ )
171
+ logger.info(f"Reloaded tiktoken model from {vocab_file}")
172
+
173
+ self.n_words: int = self.model.n_vocab
174
+ # BOS / EOS token IDs
175
+ self.bos_id: int = self.special_tokens[str(bos_token)]
176
+ self.eos_id: int = self.special_tokens[str(eos_token)]
177
+
178
+ logger.info(
179
+ f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
180
+ )
181
+
182
+ self.pad_id: int = self.special_tokens[str(pad_token)]
183
+ self.unk_id: int = self.special_tokens[str(unk_token)]
184
+ self.byte_encoder = bytes_to_unicode()
185
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
186
+
187
+ self.decoder = {}
188
+ for i in range(self.n_words):
189
+ # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
190
+ decoding = ''.join([
191
+ self.byte_encoder[ord(char)] for char in
192
+ self.model.decode_single_token_bytes(i).decode('latin-1')
193
+ ])
194
+ self.decoder[i] = decoding
195
+
196
+ self.encoder = {}
197
+ for i in range(self.n_words):
198
+ if i in self.decoder:
199
+ self.encoder[self.decoder[i]] = i
200
+
201
+ super().__init__(
202
+ bos_token=bos_token,
203
+ eos_token=eos_token,
204
+ unk_token=unk_token,
205
+ pad_token=pad_token,
206
+ additional_special_tokens=self.additional_special_tokens,
207
+ **kwargs,
208
+ )
209
+ self.all_special_ids_set = set(self.all_special_ids)
210
+
211
+ def encode(
212
+ self,
213
+ text: str,
214
+ allow_special_tokens = True,
215
+ **kwargs
216
+ ) -> List[int]:
217
+ """
218
+ Encodes a string into a list of token IDs.
219
+
220
+ Args:
221
+ text (str): The input string to be encoded.
222
+
223
+ Returns:
224
+ list[int]: A list of token IDs.
225
+ """
226
+ # If there are other kwargs, we should call super().encode because there is a lot of code
227
+ # to handle those args. super().encode will eventually call _tokenize and _convert_token_to_id.
228
+ # NOTE: our encode method is not compatible with the super().encode method,
229
+ # e.g. split_special_tokens' default is True in our encode method.
230
+ if len(kwargs) > 0:
231
+ logger.warning(f"Calling super().encode with {kwargs}")
232
+ return super().encode(text, **kwargs)
233
+
234
+ assert type(text) is str
235
+
236
+ # The tiktoken tokenizer can handle <=400k chars without
237
+ # pyo3_runtime.PanicException.
238
+ TIKTOKEN_MAX_ENCODE_CHARS = 400_000
239
+
240
+ # https://github.com/openai/tiktoken/issues/195
241
+ # Here we iterate over subsequences and split if we exceed the limit
242
+ # of max consecutive non-whitespace or whitespace characters.
243
+ MAX_NO_WHITESPACES_CHARS = 25_000
244
+
245
+ texts = self.pre_tokenizer_process(text)
246
+
247
+ all_substrs = []
248
+ for text in texts:
249
+ substrs = (
250
+ substr
251
+ for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
252
+ for substr in self._split_whitespaces_or_nonwhitespaces(
253
+ text[i: i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
254
+ )
255
+ )
256
+ all_substrs.extend(substrs)
257
+
258
+ t: List[int] = []
259
+ for substr in all_substrs:
260
+ if allow_special_tokens:
261
+ t.extend(
262
+ self.model.encode(
263
+ substr,
264
+ allowed_special="all",
265
+ )
266
+ )
267
+ else:
268
+ t.extend(
269
+ self.model.encode(
270
+ substr,
271
+ disallowed_special=(),
272
+ )
273
+ )
274
+
275
+ return t
276
+
277
+ def decode(
278
+ self,
279
+ token_ids: Union[int, List[int]],
280
+ **kwargs
281
+ ) -> str:
282
+ """
283
+ Decodes a list of token IDs into a string.
284
+
285
+ Args:
286
+ token_ids (List[int]): The list of token IDs to be decoded.
287
+
288
+ Returns:
289
+ str: The decoded string.
290
+ """
291
+ # If there are other kwargs, we should call super().decode because there is a lot of code
292
+ # to handle those args. super().decode will eventually call convert_tokens_to_string and _convert_id_to_token.
293
+ if len(kwargs) > 0:
294
+ return super().decode(token_ids, **kwargs)
295
+
296
+ if type(token_ids) is int:
297
+ token_ids = [token_ids]
298
+
299
+ return self.model.decode(cast(List[int], token_ids))
300
+
301
+ @staticmethod
302
+ def _split_whitespaces_or_nonwhitespaces(
303
+ s: str, max_consecutive_slice_len: int
304
+ ) -> Iterator[str]:
305
+ """
306
+ Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
307
+ consecutive whitespaces or consecutive non-whitespaces.
308
+ """
309
+ current_slice_len = 0
310
+ current_slice_is_space = s[0].isspace() if len(s) > 0 else False
311
+ slice_start = 0
312
+
313
+ for i in range(len(s)):
314
+ is_now_space = s[i].isspace()
315
+
316
+ if current_slice_is_space ^ is_now_space:
317
+ current_slice_len = 1
318
+ current_slice_is_space = is_now_space
319
+ else:
320
+ current_slice_len += 1
321
+ if current_slice_len > max_consecutive_slice_len:
322
+ yield s[slice_start:i]
323
+ slice_start = i
324
+ current_slice_len = 1
325
+ yield s[slice_start:]
326
+
327
+ def pre_tokenizer_process(self, text: str) -> List[str]:
328
+ """
329
+ pre-tokenizes the input text into a list of tokens.
330
+ This method is used to split the input text into smaller chunks for internal processing.
331
+ """
332
+ return [text]
333
+
334
+
335
+ """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """
336
+ @property
337
+ def vocab_size(self) -> int:
338
+ return self.n_words
339
+
340
+ def get_vocab(self) -> Dict[str, int]:
341
+ return self.encoder
342
+
343
+ def _tokenize(self, text: str, **kwargs) -> List[str]:
344
+ return [
345
+ self.decoder[t]
346
+ for t in self.encode(text)
347
+ ]
348
+
349
+ def _convert_token_to_id(self, token: str) -> int:
350
+ return self.encoder.get(token, self.unk_id)
351
+
352
+ def _convert_id_to_token(self, index: int) -> str:
353
+ return self.decoder.get(index)
354
+
355
+ @staticmethod
356
+ def clean_up_tokenization(out_string: str) -> str:
357
+ return out_string
358
+
359
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
360
+ text = ''.join(tokens)
361
+ text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', 'replace')
362
+ return text
363
+
364
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
365
+ if not os.path.isdir(save_directory):
366
+ raise ValueError(f"vocabulary path ({save_directory}) should be a directory")
367
+ out_vocab_file = os.path.join(
368
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
369
+ )
370
+
371
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
372
+ copyfile(self.vocab_file, out_vocab_file)
373
+
374
+ return (out_vocab_file,)
375
+
376
+
377
+ class TikTokenV3(TikTokenTokenizer):
378
+ num_reserved_special_tokens = 293 + 128
379
+ pat_str = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
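A minimal usage sketch for the tokenizer module above: the repository path is a placeholder, and `trust_remote_code=True` is needed because the `TikTokenV3` class is resolved through the `auto_map` entry in `tokenizer_config.json` below rather than a built-in `transformers` class.

```python
from transformers import AutoTokenizer

# Placeholder: point this at a local checkout of this repo or its Hub id.
REPO = "path/to/this/repo"

# trust_remote_code=True lets AutoTokenizer import tokenization_opencua.TikTokenV3
# via the auto_map entry in tokenizer_config.json.
tokenizer = AutoTokenizer.from_pretrained(REPO, trust_remote_code=True)

# encode() goes straight through the tiktoken Encoding (special tokens allowed),
# and decode() round-trips the ids back through tiktoken.
ids = tokenizer.encode("Hello, world!")
print(ids)
print(tokenizer.decode(ids))
```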
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "151643": {"content": "[BOS]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
4
+ "151644": {"content": "[EOS]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
5
+ "151645": {"content": "<|im_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
6
+ "151646": {"content": "<|im_user|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
7
+ "151647": {"content": "<|im_assistant|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
8
+ "151648": {"content": "<|reserved_token_0|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
9
+ "151649": {"content": "<|start_header_id|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
10
+ "151650": {"content": "<|end_header_id|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
11
+ "151651": {"content": "<|reserved_token_1|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
12
+ "151652": {"content": "[EOT]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
13
+ "151653": {"content": "<|im_system|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
14
+ "151654": {"content": "<|reserved_token_2|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
15
+ "151655": {"content": "<|reserved_token_3|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
16
+ "151656": {"content": "<|reserved_token_4|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
17
+ "151657": {"content": "<|reserved_token_5|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
18
+ "151658": {"content": "<|reserved_token_6|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
19
+ "151659": {"content": "<|reserved_token_7|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
20
+ "151660": {"content": "<|im_middle|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
21
+ "151661": {"content": "<|media_begin|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
22
+ "151662": {"content": "<|media_content|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
23
+ "151663": {"content": "<|media_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
24
+ "151664": {"content": "<|media_placeholder|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
25
+
26
+ "151665": {"content": "<|vision_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
27
+ "151666": {"content": "<|vision_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
28
+ "151667": {"content": "<|image_pad|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
29
+ "151668": {"content": "<|video_pad|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
30
+
31
+ "152062": {"content": "[UNK]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
32
+ "152063": {"content": "[PAD]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}
33
+ },
34
+
35
+ "additional_special_tokens": [
36
+ "<|im_end|>", "<|im_user|>", "<|im_assistant|>",
37
+ "<|reserved_token_0|>", "<|start_header_id|>", "<|end_header_id|>",
38
+ "<|reserved_token_1|>", "[EOT]", "<|im_system|>",
39
+ "<|reserved_token_2|>", "<|reserved_token_3|>", "<|reserved_token_4|>",
40
+ "<|reserved_token_5|>", "<|reserved_token_6|>", "<|reserved_token_7|>",
41
+ "<|im_middle|>",
42
+ "<|media_begin|>", "<|media_content|>", "<|media_end|>", "<|media_placeholder|>",
43
+ "<|vision_start|>", "<|vision_end|>", "<|image_pad|>", "<|video_pad|>"
44
+ ],
45
+
46
+ "bos_token": "[BOS]",
47
+ "clean_up_tokenization_spaces": false,
48
+ "eos_token": "[EOS]",
49
+ "extra_special_tokens": {},
50
+ "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_begin|>image<|media_content|><|media_placeholder|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}",
51
+ "model_max_length": 1000000000000000019884624838656,
52
+ "pad_token": "[PAD]",
53
+ "tokenizer_class": "TikTokenV3",
54
+ "unk_token": "[UNK]",
55
+ "auto_map": {
56
+ "AutoTokenizer": ["tokenization_opencua.TikTokenV3", null]
57
+ }
58
+ }
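To see how the `chat_template` above stitches the special tokens together, here is a hedged sketch of a single image-plus-text user turn; the repository path and image filename are placeholders, and the expected output is reconstructed from the template string itself.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this/repo", trust_remote_code=True)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "screenshot.png"},       # rendered as the <|media_*|> block
            {"type": "text", "text": "Click the Save button."},
        ],
    },
]

# tokenize=False returns the raw prompt string built by the Jinja template.
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
print(prompt)
# One continuous string (wrapped here for readability):
#   <|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>
#   <|im_user|>user<|im_middle|><|media_begin|>image<|media_content|><|media_placeholder|><|media_end|>
#   Click the Save button.<|im_end|><|im_assistant|>assistant<|im_middle|>
```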
vocab.json ADDED
The diff for this file is too large to render. See raw diff