Sundogs
/

image_to_text

@@ -1,10 +1,14 @@
-from typing import Dict, List, Any
-from transformers import pipeline
-import torch, PIL, transformers, triton, sentencepiece, protobuf
-import torchvision, einops
-import xformers, accelerate
-from transformers import AutoModelForCausalLM, LlamaTokenizer
 class EndpointHandler():
     def __init__(self, path=""):
@@ -17,7 +21,7 @@ class EndpointHandler():
         )
         self.tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5')
         # create inference pipeline
-        # self.pipeline = pipeline(model=model, tokenizer=tokenizer)
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         """
@@ -29,9 +33,86 @@ class EndpointHandler():
                  - "label": A string representing what the label/class is. There can be multiple labels.
                  - "score": A score between 0 and 1 describing how confident the model is for this label/class.
          """
         inputs = data.pop("inputs", data)
         gen_kwargs = {"max_length": 2048, "do_sample": False}
         # pass inputs with all kwargs in data
         # prediction = self.pipeline(inputs)

+from typing import Dict, List, Any, Optional, Tuple, Literal
+# from transformers import pipeline
+import torch, PIL, triton, protobuf
+from torchvision import transforms
+# import torchvision, einops
+# import xformers, accelerate
+from transformers import AutoModelForCausalLM, LlamaTokenizer, PretrainedConfig
+LANGUAGE_TOKEN_TYPE = 0
+VISION_TOKEN_TYPE = 1
+config = PretrainedConfig.from_json_file('config.json')
 class EndpointHandler():
     def __init__(self, path=""):
         )
         self.tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5')
         # create inference pipeline
+        # self.pipeline = pipeline("text-generation", model="THUDM/cogvlm-chat-hf", trust_remote_code=True)
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         """
                  - "label": A string representing what the label/class is. There can be multiple labels.
                  - "score": A score between 0 and 1 describing how confident the model is for this label/class.
          """
+        def _history_to_prompt(signal_type, history, query):
+            if signal_type == 'base':
+                return query
+            elif signal_type == 'vqa':
+                answer_format = 'Short answer:'
+            elif signal_type == 'chat':
+                answer_format = 'Answer:'
+            else:
+                assert False, f"Unknown signal type {signal_type}"
+            prompt = ''
+            for i, (old_query, response) in enumerate(history):
+                prompt += 'Question: ' + old_query + " {} ".format(answer_format) + response + "\n"
+            prompt += 'Question: {} {}'.format(query, answer_format)
+            return prompt
+        def build_conversation_input_ids(
+                tokenizer: "PreTrainedTokenizer",
+                *,
+                query: str,
+                history: Optional[List[Tuple[str, str]]] = None,
+                images: Optional[List["PIL.Image"]] = None,
+                template_version: Optional[Literal["base", "chat", "vqa"]] = None,
+                config=config
+        ):
+            image_size: int = config.vision_config['image_size']
+            patch_size: int = config.vision_config['patch_size']
+            template_version = template_version or config.template_version
+            assert images is None or len(images) <= 1, f"not support multi images by now."
+            history = history or []
+            text = _history_to_prompt(template_version, history, query)
+            input_ids = [tokenizer.bos_token_id]
+            token_type_ids = [LANGUAGE_TOKEN_TYPE]
+            if images is not None and len(images) == 1:
+                # vision
+                transform = transforms.Compose(
+                    [
+                        transforms.Resize(
+                            (image_size, image_size), interpolation=transforms.InterpolationMode.BICUBIC
+                        ),
+                        transforms.ToTensor(),
+                        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+                    ]
+                )
+                images = [transform(images[0])]
+                # language
+                vision_token_num = (image_size // patch_size) * (image_size // patch_size) + 2
+                input_ids += [tokenizer.pad_token_id] * vision_token_num
+                token_type_ids += [VISION_TOKEN_TYPE] * vision_token_num
+            text_ids = tokenizer.encode(text, add_special_tokens=False)
+            input_ids += text_ids
+            token_type_ids += [LANGUAGE_TOKEN_TYPE] * len(text_ids)
+            attention_mask = [1] * len(input_ids)
+            return {
+                'input_ids': torch.tensor(input_ids, dtype=torch.long),
+                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
+                'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
+                'images': images,
+            }
         inputs = data.pop("inputs", data)
+        query = inputs.pop("query", data)
+        image = inputs.pop("image", data)
         gen_kwargs = {"max_length": 2048, "do_sample": False}
+        inputs = build_conversation_input_ids(self.tokenizer, query=query, history=[], images=[image],
+                                              template_version='vqa')
+        inputs = {'inputs': {
+            'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'),
+            'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'),
+            'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'),
+            'images': [[inputs['images'][0].to('cuda').to(torch.bfloat16)]],
+        }}
         # pass inputs with all kwargs in data
         # prediction = self.pipeline(inputs)