visheratin
/

MC-LLaVA-3b

+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Llava.
+"""
+from typing import List, Optional, Union
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import ImageInput
+from transformers.tokenization_utils_base import (
+    PaddingStrategy,
+    PreTokenizedInput,
+    TextInput,
+    TruncationStrategy,
+)
+from transformers.utils import TensorType
+import torch
+from open_clip.transform import PreprocessCfg, image_transform_v2
class OpenCLIPImageProcessor:
    """Image preprocessor backed by an OpenCLIP inference transform.

    The transform is built once from a preprocessing config and applied to
    each image; results are returned as ``{"pixel_values": tensor}``.
    """

    def __init__(self, config):
        """Build the evaluation-mode (``is_train=False``) transform.

        Args:
            config: Mapping of keyword arguments used to construct
                ``PreprocessCfg``.
        """
        cfg = PreprocessCfg(**config)
        self.transform = image_transform_v2(cfg=cfg, is_train=False)

    def __call__(self, image, return_tensors=None):
        """Transform one image or a batch of images.

        Args:
            image: A single image, or a list/tuple of images. Each image is
                passed to ``self.transform`` unchanged.
            return_tensors: Accepted for call-site compatibility; it is not
                used, and the result is always a torch tensor.

        Returns:
            A dict with one key, ``"pixel_values"``. For a batch the
            per-image results are stacked along a new leading dimension; for
            a single image a leading dimension of size 1 is added.
        """
        if isinstance(image, (list, tuple)):
            transformed = [self.transform(item) for item in image]
            if not transformed:
                # torch.stack rejects an empty sequence; keep returning an
                # empty tensor for an empty batch.
                return {"pixel_values": torch.tensor([])}
            # torch.tensor() cannot build a tensor from a list of
            # multi-element tensors, so the batch is assembled with stack.
            # NOTE(review): assumes every transformed image has the same
            # shape, which stack requires.
            return {"pixel_values": torch.stack(transformed)}

        return {"pixel_values": self.transform(image).unsqueeze(0)}

    @property
    def model_input_names(self):
        """Names of the model inputs produced by this processor."""
        return ["pixel_values"]
class LlavaProcessor:
    """Combines an image processor and a tokenizer behind a single callable."""

    def __init__(self, image_processor: OpenCLIPImageProcessor, tokenizer):
        """Store the two components used by ``__call__`` and the decode helpers."""
        self.image_processor = image_processor
        self.tokenizer = tokenizer

    def __call__(
        self,
        text: Union[
            TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
        ] = None,
        images: ImageInput = None,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length=None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
    ) -> BatchFeature:
        """Tokenize ``text`` and, when given, preprocess ``images``.

        Args:
            text: Forwarded to the tokenizer as its first positional argument.
            images: Forwarded to the image processor; skipped when ``None``.
            padding: Forwarded to the tokenizer.
            truncation: Forwarded to the tokenizer.
            max_length: Forwarded to the tokenizer.
            return_tensors: Forwarded to both the image processor and the
                tokenizer.

        Returns:
            A ``BatchFeature`` holding every tokenizer output plus a
            ``"pixel_values"`` entry, which is ``None`` when no images were
            supplied.
        """
        pixel_values = None
        if images is not None:
            image_features = self.image_processor(
                images, return_tensors=return_tensors
            )
            pixel_values = image_features["pixel_values"]

        tokenizer_kwargs = {
            "return_tensors": return_tensors,
            "padding": padding,
            "truncation": truncation,
            "max_length": max_length,
        }
        encoded = self.tokenizer(text, **tokenizer_kwargs)

        # Tokenizer outputs first, then the image entry.
        features = {**encoded}
        features["pixel_values"] = pixel_values
        return BatchFeature(data=features)

    def batch_decode(self, *args, **kwargs):
        """Delegate to the tokenizer's ``batch_decode`` with all arguments."""
        decode_many = self.tokenizer.batch_decode
        return decode_many(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Delegate to the tokenizer's ``decode`` with all arguments."""
        decode_one = self.tokenizer.decode
        return decode_one(*args, **kwargs)

    @property
    def model_input_names(self):
        """Tokenizer input names followed by image-processor input names.

        Duplicates are dropped while keeping first-seen order.
        """
        combined = (
            self.tokenizer.model_input_names
            + self.image_processor.model_input_names
        )
        unique_in_order = dict.fromkeys(combined)
        return list(unique_in_order)