finhdev
/

testmobileclip

mobileclip

Model card Files Files and versions

xet

Community

finhdev committed on Jul 29, 2025

Commit

aa10251

verified ·

1 Parent(s): 825b375

Update handler.py

Browse files

Files changed (1) hide show

handler.py +101 -13

handler.py CHANGED Viewed

@@ -1,13 +1,16 @@
 # handler.py  (repo root)
 import io, base64, torch
 from PIL import Image
 import open_clip
 class EndpointHandler:
     """
     Zero‑shot classifier for MobileCLIP‑B (OpenCLIP).
-    Expected client JSON *to the endpoint*:
     {
       "inputs": {
         "image": "<base64 PNG/JPEG>",
@@ -16,43 +19,128 @@ class EndpointHandler:
     }
     """
     def __init__(self, path: str = ""):
         weights = f"{path}/mobileclip_b.pt"
         self.model, _, self.preprocess = open_clip.create_model_and_transforms(
             "MobileCLIP-B", pretrained=weights
         )
-        self.model.eval()
         self.tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model.to(self.device)
     def __call__(self, data):
-        # ── unwrap Hugging Face's `inputs` envelope ───────────
         payload = data.get("inputs", data)
-        img_b64 = payload["image"]
-        labels  = payload.get("candidate_labels", [])
         if not labels:
             return {"error": "candidate_labels list is empty"}
-        # Decode & preprocess image
         image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
         img_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
-        # Tokenise labels
-        text_tokens = self.tokenizer(labels).to(self.device)
-        # Forward pass
         with torch.no_grad(), torch.cuda.amp.autocast():
             img_feat = self.model.encode_image(img_tensor)
-            txt_feat = self.model.encode_text(text_tokens)
             img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
-            txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
-            probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].tolist()
-        # Sorted output
         return [
             {"label": l, "score": float(p)}
             for l, p in sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)
         ]

 # handler.py  (repo root)
 import io, base64, torch
 from PIL import Image
 import open_clip
+from open_clip import fuse_conv_bn_sequential
 class EndpointHandler:
     """
     Zero‑shot classifier for MobileCLIP‑B (OpenCLIP).
+    Client JSON format:
     {
       "inputs": {
         "image": "<base64 PNG/JPEG>",
@@ -16,43 +19,128 @@ class EndpointHandler:
     }
     """
+    # ----------------------------------------------------- #
+    #               INITIALISATION (once)                  #
+    # ----------------------------------------------------- #
     def __init__(self, path: str = ""):
         weights = f"{path}/mobileclip_b.pt"
+        # Load model + transforms
         self.model, _, self.preprocess = open_clip.create_model_and_transforms(
             "MobileCLIP-B", pretrained=weights
         )
+        # Fuse Conv+BN for faster inference
+        self.model = fuse_conv_bn_sequential(self.model).eval()
+        # Tokeniser
         self.tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
+        # Device
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model.to(self.device)
+        # -------- text‑embedding cache --------
+        # key: prompt string  •  value: torch.Tensor [512] on correct device
+        self.label_cache: dict[str, torch.Tensor] = {}
+    # ----------------------------------------------------- #
+    #              INFERENCE  (per request)                #
+    # ----------------------------------------------------- #
     def __call__(self, data):
+        # 1. Unwrap the HF "inputs" envelope
         payload = data.get("inputs", data)
+        img_b64  = payload["image"]
+        labels   = payload.get("candidate_labels", [])
         if not labels:
             return {"error": "candidate_labels list is empty"}
+        # 2. Decode & preprocess image
         image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
         img_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
+        # 3. Text embeddings with cache
+        missing = [l for l in labels if l not in self.label_cache]
+        if missing:
+            tokens = self.tokenizer(missing).to(self.device)
+            with torch.no_grad():
+                emb = self.model.encode_text(tokens)
+                emb = emb / emb.norm(dim=-1, keepdim=True)
+            for lbl, vec in zip(missing, emb):
+                self.label_cache[lbl] = vec  # store on device
+        txt_feat = torch.stack([self.label_cache[l] for l in labels])
+        # 4. Forward pass for image
         with torch.no_grad(), torch.cuda.amp.autocast():
             img_feat = self.model.encode_image(img_tensor)
             img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
+        # 5. Similarity & softmax
+        probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].tolist()
+        # 6. Return sorted list
         return [
             {"label": l, "score": float(p)}
             for l, p in sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)
         ]
+# # handler.py  (repo root)
+# import io, base64, torch
+# from PIL import Image
+# import open_clip
+# class EndpointHandler:
+#     """
+#     Zero‑shot classifier for MobileCLIP‑B (OpenCLIP).
+#     Expected client JSON *to the endpoint*:
+#     {
+#       "inputs": {
+#         "image": "<base64 PNG/JPEG>",
+#         "candidate_labels": ["cat", "dog", ...]
+#       }
+#     }
+#     """
+#     def __init__(self, path: str = ""):
+#         weights = f"{path}/mobileclip_b.pt"
+#         self.model, _, self.preprocess = open_clip.create_model_and_transforms(
+#             "MobileCLIP-B", pretrained=weights
+#         )
+#         self.model.eval()
+#         self.tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
+#         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+#         self.model.to(self.device)
+#     def __call__(self, data):
+#         # ── unwrap Hugging Face's `inputs` envelope ───────────
+#         payload = data.get("inputs", data)
+#         img_b64 = payload["image"]
+#         labels  = payload.get("candidate_labels", [])
+#         if not labels:
+#             return {"error": "candidate_labels list is empty"}
+#         # Decode & preprocess image
+#         image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
+#         img_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
+#         # Tokenise labels
+#         text_tokens = self.tokenizer(labels).to(self.device)
+#         # Forward pass
+#         with torch.no_grad(), torch.cuda.amp.autocast():
+#             img_feat = self.model.encode_image(img_tensor)
+#             txt_feat = self.model.encode_text(text_tokens)
+#             img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
+#             txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
+#             probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].tolist()
+#         # Sorted output
+#         return [
+#             {"label": l, "score": float(p)}
+#             for l, p in sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)
+#         ]