visheratin committed
Commit 9c984a5
1 parent: 8022f13

Delete processing_llava.py

Files changed (1)
  1. processing_llava.py +0 -101
processing_llava.py DELETED
@@ -1,101 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Processor class for Llava.
-"""
-
-
-from typing import List, Optional, Union
-
-from transformers.feature_extraction_utils import BatchFeature
-from transformers.image_utils import ImageInput
-from transformers.tokenization_utils_base import (
-    PaddingStrategy,
-    PreTokenizedInput,
-    TextInput,
-    TruncationStrategy,
-)
-from transformers.utils import TensorType
-import torch
-from open_clip.transform import PreprocessCfg, image_transform_v2
-
-
-class OpenCLIPImageProcessor:
-    def __init__(self, config):
-        cfg = PreprocessCfg(**config)
-        transform = image_transform_v2(cfg=cfg, is_train=False)
-        self.transform = transform
-
-    def __call__(self, image, return_tensors):
-        if isinstance(image, list):
-            outputs = []
-            for item in image:
-                outputs.append(self.transform(item))
-            return {
-                "pixel_values": torch.tensor(outputs),
-            }
-        output = self.transform(image)
-        return {
-            "pixel_values": output.unsqueeze(0),
-        }
-
-    @property
-    def model_input_names(self):
-        return ["pixel_values"]
-
-
-class LlavaProcessor:
-    def __init__(self, image_processor: OpenCLIPImageProcessor, tokenizer):
-        self.image_processor = image_processor
-        self.tokenizer = tokenizer
-
-    def __call__(
-        self,
-        text: Union[
-            TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
-        ] = None,
-        images: ImageInput = None,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length=None,
-        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
-    ) -> BatchFeature:
-        if images is not None:
-            pixel_values = self.image_processor(images, return_tensors=return_tensors)[
-                "pixel_values"
-            ]
-        else:
-            pixel_values = None
-        text_inputs = self.tokenizer(
-            text,
-            return_tensors=return_tensors,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-        )
-
-        return BatchFeature(data={**text_inputs, "pixel_values": pixel_values})
-
-    def batch_decode(self, *args, **kwargs):
-        return self.tokenizer.batch_decode(*args, **kwargs)
-
-    def decode(self, *args, **kwargs):
-        return self.tokenizer.decode(*args, **kwargs)
-
-    @property
-    def model_input_names(self):
-        tokenizer_input_names = self.tokenizer.model_input_names
-        image_processor_input_names = self.image_processor.model_input_names
-        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
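One note for readers of the deleted code: in `OpenCLIPImageProcessor.__call__`, the list branch calls `torch.tensor(outputs)` on a list of multi-element image tensors, which raises `ValueError: only one element tensors can be converted to Python scalars` in PyTorch. `torch.stack` is the usual way to batch per-image CHW tensors into one BCHW tensor. A minimal corrected sketch of that branch (not part of this commit; the helper name is illustrative):

    import torch

    def batch_pixel_values(transform, images):
        # Apply the open_clip transform per image, then stack the resulting
        # CHW tensors into one BCHW batch. torch.stack is required here;
        # torch.tensor cannot convert a list of multi-element tensors.
        return {"pixel_values": torch.stack([transform(img) for img in images])}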
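For context, this is roughly how the two classes were meant to be wired together. A sketch assuming the classes from the diff above are in scope, with default `PreprocessCfg` values standing in for the model's real preprocessing config and a placeholder tokenizer checkpoint:

    from dataclasses import asdict

    from PIL import Image
    from transformers import AutoTokenizer
    from open_clip.transform import PreprocessCfg

    # Default PreprocessCfg values are a stand-in; the repo's config would
    # normally supply this dict of preprocessing settings.
    image_processor = OpenCLIPImageProcessor(asdict(PreprocessCfg()))

    # Placeholder checkpoint; the deleted file did not pin a tokenizer.
    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

    processor = LlavaProcessor(image_processor, tokenizer)

    image = Image.new("RGB", (336, 336))  # synthetic stand-in image
    batch = processor(text="<image>\nDescribe the picture.", images=image)
    print(batch["input_ids"].shape)     # (1, sequence_length)
    print(batch["pixel_values"].shape)  # (1, 3, H, W) via unsqueeze(0)

A single image goes through the `unsqueeze(0)` path, so `pixel_values` always carries a leading batch dimension, and `return_tensors` defaults to PyTorch tensors throughout.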