pdich2085 committed on
Commit feb5784 · 1 Parent(s): 750f7de

Upload 8 files

README.md CHANGED
@@ -1,3 +1,94 @@
  ---
- license: apache-2.0
+ tags:
+ - image-to-text
+ - image-captioning
+ - endpoints-template
+ license: bsd-3-clause
+ library_name: generic
  ---
+
+ # Fork of [Salesforce/blip-image-captioning-large](https://huggingface.co/Salesforce/blip-image-captioning-large) for an `image-captioning` task on 🤗 Inference Endpoints
+
+ This repository implements a `custom` task for `image-captioning` for 🤗 Inference Endpoints. The code for the customized pipeline is in [pipeline.py](https://huggingface.co/florentgbelidji/blip_captioning/blob/main/pipeline.py).
+ To deploy this model as an Inference Endpoint, you have to select `Custom` as the task so that the `pipeline.py` file is used. -> _double check that it is selected_
+ ### Expected request payload
+ ```json
+ {
+   "image": "/9j/4AAQSkZJRgA.....",
+   "text": "a photography of a"
+ }
+ ```
+ `image` is the base64-encoded input image. Below is an example of how to run a request using Python and `requests`.
+ ## Run Request
+ 1. Use any online image.
+ ```bash
+ !wget https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg
+ ```
+ 2. Run the request:
+
+ ```python
+ import base64
+ import requests
+
+ with open("demo.jpg", "rb") as image_file:
+     encoded_string = base64.b64encode(image_file.read()).decode()
+
+ ENDPOINT_URL = ""
+ HF_TOKEN = ""
+
+ headers = {"Authorization": f"Bearer {HF_TOKEN}"}
+
+ def query(payload):
+     response = requests.post(ENDPOINT_URL, headers=headers, json=payload)
+     return response.json()
+
+
+ output = query({
+     "inputs": {
+         "images": [encoded_string],   # the base64-encoded string
+         "texts": ["a photography of"]  # optional, depending on the handler logic
+     }
+ })
+ print(output)
+ ```
+
+ Example parameters depending on the decoding strategy:
+
+ 1. Beam search
+
+ ```
+ "parameters": {
+   "num_beams": 5,
+   "max_length": 20
+ }
+ ```
+
+ 2. Nucleus sampling
+
+ ```
+ "parameters": {
+   "num_beams": 1,
+   "max_length": 20,
+   "do_sample": true,
+   "top_k": 50,
+   "top_p": 0.95
+ }
+ ```
+
+ 3. Contrastive search
+
+ ```
+ "parameters": {
+   "penalty_alpha": 0.6,
+   "top_k": 4,
+   "max_length": 512
+ }
+ ```
+
+ See the [generate()](https://huggingface.co/docs/transformers/v4.25.1/en/main_classes/text_generation#transformers.GenerationMixin.generate) documentation for additional details.
+
+
+ Expected output:
+ ```python
+ {'captions': ['a photography of a woman and her dog on the beach']}
+ ```
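
For reference, a request that combines the payload format above with one of these decoding-parameter sets could look like the following sketch. It assumes the custom `pipeline.py` forwards the `parameters` dict to `generate()`, as the examples above suggest, and it reuses `query` and `encoded_string` from the earlier Python snippet; the endpoint URL and token remain placeholders.

```python
# Sketch: caption request with explicit decoding parameters (beam search).
# Assumes the custom pipeline passes "parameters" through to generate().
output = query({
    "inputs": {
        "images": [encoded_string],
        "texts": ["a photography of"],
    },
    "parameters": {
        "num_beams": 5,    # beam search width
        "max_length": 20,  # cap on caption length in tokens
    },
})
print(output)  # e.g. {'captions': ['a photography of ...']}
```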
config.json ADDED
@@ -0,0 +1,170 @@
+ {
+   "_commit_hash": null,
+   "architectures": [
+     "BlipForConditionalGeneration"
+   ],
+   "image_text_hidden_size": 256,
+   "initializer_factor": 1.0,
+   "logit_scale_init_value": 2.6592,
+   "model_type": "blip",
+   "projection_dim": 512,
+   "text_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_probs_dropout_prob": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": 30522,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_hidden_size": 1024,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 2,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "gelu",
+     "hidden_dropout_prob": 0.0,
+     "hidden_size": 768,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 3072,
+     "is_decoder": true,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-12,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 512,
+     "min_length": 0,
+     "model_type": "blip_text_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 12,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_hidden_layers": 12,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 0,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 768,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": 102,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.26.0.dev0",
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "use_cache": true,
+     "vocab_size": 30524
+   },
+   "torch_dtype": "float32",
+   "transformers_version": null,
+   "vision_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout": 0.0,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "gelu",
+     "hidden_size": 1024,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 384,
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 4096,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "blip_vision_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 16,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 24,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 16,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 512,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.26.0.dev0",
+     "typical_p": 1.0,
+     "use_bfloat16": false
+   }
+ }
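
As a quick sanity check, the configuration fields that most affect captioning can be inspected with `transformers`. A minimal sketch, assuming the config is loaded from the upstream `Salesforce/blip-image-captioning-large` checkpoint that this file mirrors; the commented values are the ones listed in the config above.

```python
# Sketch: inspect the BLIP config fields that drive captioning behaviour.
# Assumes the upstream checkpoint carries the same config as shown above.
from transformers import BlipConfig

config = BlipConfig.from_pretrained("Salesforce/blip-image-captioning-large")

print(config.architectures)             # ['BlipForConditionalGeneration']
print(config.vision_config.image_size)  # 384 (inputs resized to 384x384)
print(config.vision_config.patch_size)  # 16
print(config.text_config.max_length)    # 20 (default caption length cap)
print(config.text_config.vocab_size)    # 30524
```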
handler.py ADDED
@@ -0,0 +1,108 @@
+ from typing import Dict, Any, List
+ from PIL import Image
+ import torch
+ from io import BytesIO
+ from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+
+
+ # Source: https://www.philschmid.de/custom-inference-handler
+
+
+ class EndpointHandler:
+     def __init__(self, path="nlpconnect/vit-gpt2-image-captioning"):
+         self.model = VisionEncoderDecoderModel.from_pretrained(path)
+
+         # Using ViTImageProcessor instead of ViTFeatureExtractor
+         self.feature_extractor = ViTImageProcessor.from_pretrained(path)
+
+         self.tokenizer = AutoTokenizer.from_pretrained(path)
+
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.model.to(self.device)
+
+         self.max_length = 16
+         self.num_beams = 4
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+         """
+         Args:
+             data (:obj:):
+                 includes the input image data.
+         Return:
+             A :obj:`dict` with the caption.
+         """
+         image_bytes = data.get("inputs", None)
+
+         # Convert image bytes to PIL Image
+         image = Image.open(BytesIO(image_bytes))
+         if image.mode != "RGB":
+             image = image.convert(mode="RGB")
+
+         pixel_values = self.feature_extractor(
+             images=image, return_tensors="pt"
+         ).pixel_values
+         pixel_values = pixel_values.to(self.device)
+
+         gen_kwargs = {"max_length": self.max_length, "num_beams": self.num_beams}
+         output_ids = self.model.generate(pixel_values, **gen_kwargs)
+
+         caption = self.tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
+
+         return {"caption": caption}
+
+
+ # from typing import Dict, Any, List
+ # from PIL import Image
+ # import torch
+ # import os
+ # from io import BytesIO
+ # from transformers import (
+ #     VisionEncoderDecoderModel,
+ #     ViTImageProcessor,
+ #     AutoTokenizer,
+ #     PreTrainedModel,
+ # )
+
+
+ # class EndpointHandler:
+ #     def __init__(self, model_path="nlpconnect/vit-gpt2-image-captioning"):
+ #         # Load model and components
+ #         self.model: PreTrainedModel = VisionEncoderDecoderModel.from_pretrained(
+ #             model_path
+ #         )
+ #         self.processor = ViTImageProcessor.from_pretrained(model_path)
+ #         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+ #         # Ensure model is on the correct device
+ #         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ #         self.model.to(self.device)
+
+ #         # Parameters for generation
+ #         self.gen_kwargs = {"max_length": 16, "num_beams": 4, "attention_mask": True}
+
+ #         # Save model and configuration to the /repository directory
+ #         self.model_directory = "/repository"
+ #         os.makedirs(self.model_directory, exist_ok=True)  # Ensure the directory exists
+ #         self.model.config.save_pretrained(self.model_directory)
+ #         torch.save(
+ #             self.model.state_dict(),
+ #             os.path.join(self.model_directory, "pytorch_model.bin"),
+ #         )
+
+ #     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+ #         image_bytes = data["image"]
+
+ #         # Open image and ensure it's RGB
+ #         image = Image.open(BytesIO(image_bytes))
+ #         if image.mode != "RGB":
+ #             image = image.convert("RGB")
+
+ #         # Process image and prepare input tensor
+ #         pixel_values = self.processor(images=image, return_tensors="pt").pixel_values
+ #         pixel_values = pixel_values.to(self.device)
+
+ #         # Generate captions
+ #         output_ids = self.model.generate(pixel_values, **self.gen_kwargs)
+ #         caption = self.tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
+
+ #         return {"caption": caption}
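
Before deploying, `handler.py` can be exercised locally by instantiating `EndpointHandler` and calling it with raw image bytes, which is what the Inference Endpoints runtime passes under `inputs`. A minimal sketch, assuming `handler.py` is importable from the working directory and `demo.jpg` (from the README example) is present:

```python
# Sketch: local smoke test for the handler defined above.
# Assumes handler.py and demo.jpg are in the current working directory.
from handler import EndpointHandler

handler = EndpointHandler()  # loads nlpconnect/vit-gpt2-image-captioning

with open("demo.jpg", "rb") as f:
    image_bytes = f.read()

# The endpoint runtime delivers the request body under "inputs".
result = handler({"inputs": image_bytes})
print(result)  # {'caption': '...'}
```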
preprocessor_config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "do_normalize": true,
+   "do_pad": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "BlipImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "processor_class": "BlipProcessor",
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "height": 384,
+     "width": 384
+   },
+   "size_divisor": 32
+ }
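
In practice, these settings resize images to 384×384, rescale pixel values by 1/255, and normalize them with the mean/std listed above before they enter the vision encoder. A minimal sketch of applying them via `transformers`, assuming the processor is loaded from the upstream `Salesforce/blip-image-captioning-large` checkpoint whose preprocessing this file mirrors, and that `demo.jpg` is available:

```python
# Sketch: apply the preprocessing defined above to a single image.
from PIL import Image
from transformers import BlipImageProcessor

processor = BlipImageProcessor.from_pretrained("Salesforce/blip-image-captioning-large")

image = Image.open("demo.jpg").convert("RGB")
pixel_values = processor(images=image, return_tensors="pt").pixel_values

# Resized to 384x384, rescaled by 1/255, normalized with the mean/std above.
print(pixel_values.shape)  # torch.Size([1, 3, 384, 384])
```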
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "name_or_path": "Salesforce/blip-image-captioning-large",
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "processor_class": "BlipProcessor",
+   "sep_token": "[SEP]",
+   "special_tokens_map_file": null,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]",
+   "model_input_names": [
+     "input_ids",
+     "attention_mask"
+   ]
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff