# processing_opencua.py
import torch
from typing import List, Dict, Any, Union
from PIL import Image
from transformers.processing_utils import ProcessorMixin, BatchFeature
from transformers import AutoTokenizer, AutoImageProcessor

PLACEHOLDER = "<|media_placeholder|>"

class OpenCUAProcessor(ProcessorMixin):
    # ProcessorMixin treats every entry in `attributes` as a sub-processor
    # with its own save_pretrained; plain settings like image_token_id and
    # merge_size belong in valid_kwargs instead.
    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["image_token_id", "merge_size"]

    def __init__(self, image_processor, tokenizer, image_token_id: int = 151664, merge_size: int = 2, **kwargs):
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.image_token_id = image_token_id
        # Prefer a merge_size baked into the image processor, if it has one.
        self.merge_size = getattr(image_processor, "merge_size", merge_size)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        trust = kwargs.get("trust_remote_code", True)
        # Prefer the repo's TikTokenV3; fall back to AutoTokenizer on failure
        # (the fallback is only meant for initialization/placeholder use).
        try:
            from tokenization_opencua import TikTokenV3
            tok = TikTokenV3.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust)
        except Exception:
            tok = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust)
        imgproc = AutoImageProcessor.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust)
        return cls(imgproc, tok, **kwargs)

    def apply_chat_template(self, messages: List[Dict[str, Any]], **kwargs) -> Union[str, List[int]]:
        return self.tokenizer.apply_chat_template(messages, **kwargs)
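
    # Example of the message shape this expects -- an assumption mirroring the
    # Qwen2-VL chat convention, where the template emits one PLACEHOLDER per
    # image entry:
    #
    #   messages = [{"role": "user", "content": [
    #       {"type": "image"},
    #       {"type": "text", "text": "Click the search box."},
    #   ]}]
    #   prompt = processor.apply_chat_template(
    #       messages, tokenize=False, add_generation_prompt=True)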

    # The methods below serve the HF code path; vLLM initialization only
    # needs this object to construct successfully.
    def __call__(self, *args, **kwargs) -> BatchFeature:
        # Return a minimal structure so an unexpected call does not crash.
        data = {"input_ids": torch.zeros(1, 1, dtype=torch.long)}
        return BatchFeature(data=data)

    # Optional helper for driving vLLM from your own scripts.
    def prepare_vllm_inputs(self, messages, images, add_generation_prompt=True):
        text = self.apply_chat_template(messages, tokenize=False, add_generation_prompt=add_generation_prompt)
        proc = self.image_processor(images=images, return_tensors="pt")
        grid = torch.as_tensor(proc["image_grid_thw"])
        merge = getattr(self, "merge_size", 2)
        # Each image expands to (t * h * w) // merge_size**2 placeholder tokens.
        # Expand through a temporary marker: replacing PLACEHOLDER with copies
        # of itself would make the next replace(..., 1) match the copies just
        # inserted for the previous image instead of the next image's slot.
        tmp = "<|tmp_media_placeholder|>"
        for thw in grid:
            num = int((thw[0] * thw[1] * thw[2]) // (merge ** 2))
            text = text.replace(PLACEHOLDER, tmp * num, 1)
        text = text.replace(tmp, PLACEHOLDER)
        return text, images
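
    # Worked example (illustrative grid values): for image_grid_thw == (1, 28, 28)
    # and merge_size == 2, the image contributes (1 * 28 * 28) // 4 == 196
    # placeholder tokens in the expanded prompt.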



# # processing_opencua.py
# from transformers import Qwen2_5_VLProcessor, AutoTokenizer, AutoImageProcessor

# class OpenCUAProcessor(Qwen2_5_VLProcessor):
#     # A string is enough here, but we load manually in from_pretrained to
#     # avoid string-based reflection.
#     tokenizer_class = "TikTokenV3"

#     @classmethod
#     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
#         # Make sure remote code is allowed
#         trust_remote_code = kwargs.get("trust_remote_code", False)

#         # 1) Load the tokenizer manually (resolved through the model dir's
#         #    tokenizer_config.json -> TikTokenV3 + tokenization_opencua.py)
#         tokenizer = AutoTokenizer.from_pretrained(
#             pretrained_model_name_or_path,
#             trust_remote_code=trust_remote_code,
#         )

#         # 2) Load the image processor manually (keeps Qwen2VLImageProcessor)
#         image_processor = AutoImageProcessor.from_pretrained(
#             pretrained_model_name_or_path,
#             trust_remote_code=trust_remote_code,
#         )

#         # 3) Fetch the chat_template from the tokenizer, if it has one
#         chat_template = getattr(tokenizer, 'chat_template', None)

#         # 4) Build and return a Qwen2.5-VL Processor instance, passing the chat_template
#         processor = cls(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template)

#         # 5) Add the attributes vLLM needs.
#         #    These token IDs must match the definitions in tokenizer_config.json.
#         processor.image_token = "<|media_placeholder|>"  # OpenCUA's media placeholder
#         processor.video_token = "<|media_placeholder|>"  # videos reuse the same placeholder

#         # Add the token IDs (taken from tokenizer_config.json)
#         vocab = tokenizer.get_vocab()
#         processor.image_token_id = vocab.get("<|media_placeholder|>", 151664)  # default 151664
#         processor.video_token_id = vocab.get("<|media_placeholder|>", 151664)  # videos share the same ID

#         return processor
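

# ---------------------------------------------------------------------------
# Minimal end-to-end sketch, not part of the processor itself. Assumptions:
# a local checkpoint path and an input image on disk (both hypothetical
# names), plus vLLM's dict-prompt generate API with multi_modal_data.
# Adjust to your setup.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from vllm import LLM, SamplingParams

    ckpt = "/path/to/opencua-checkpoint"   # hypothetical path
    processor = OpenCUAProcessor.from_pretrained(ckpt, trust_remote_code=True)

    image = Image.open("screenshot.png")   # hypothetical input image
    messages = [{"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe what is on screen."},
    ]}]
    text, images = processor.prepare_vllm_inputs(messages, [image])

    llm = LLM(model=ckpt, trust_remote_code=True)
    outputs = llm.generate(
        {"prompt": text, "multi_modal_data": {"image": images}},
        SamplingParams(max_tokens=256),
    )
    print(outputs[0].outputs[0].text)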