revamped complete API structure
app.py (CHANGED)

Previous version (removed lines marked with -; truncated fragments left as …):

@@ -1,40 +1,46 @@
 import os
-import torch
-import random
-import asyncio
 import io
 import numpy as np
 import matplotlib.pyplot as plt
 from PIL import Image, ImageFilter
 from fastapi import FastAPI, UploadFile, File, Query
 from fastapi.responses import StreamingResponse
 from huggingface_hub import snapshot_download, login
-import torch.nn.functional as F

 from transformers import (
     BlipProcessor, BlipForConditionalGeneration,
-    ViTImageProcessor, AutoProcessor, AutoModelForCausalLM
 )

-app = FastAPI(title="XAI Auditor Ensemble")

-# --- Configuration & …
 REPO_ID = "SaniaE/Image_Captioning_Ensemble"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODELS = {}

-…
     "blip": {
         "subfolder": "blip",
-        "…
-        "…
-        "…
-    },
     "vit": {
         "subfolder": "vit",
-        "…
-        "…
-        "…
     }
 }

@@ -44,66 +50,72 @@ async def startup_event():
     token = os.getenv("HF_Token")
     if token: login(token=token)

-    print(f"…
     local_dir = snapshot_download(repo_id=REPO_ID, token=token, local_dir="weights")

-    …
     if m_name == "vit":
         i_proc, t_proc = m_data["processor"]
         inputs = i_proc(images=image, return_tensors="pt").to(DEVICE)
-        …
-        return t_proc.batch_decode(…
     else:
         proc = m_data["processor"]
         inputs = proc(images=image, return_tensors="pt").to(DEVICE)
-        …
-        return proc.batch_decode(…

-# --- …

 @app.post("/generate")
-async def …
     file: UploadFile = File(...),
     temp: float = Query(0.8),
     top_k: int = Query(50),
     top_p: float = Query(0.9)
 ):
     image = Image.open(file.file).convert("RGB")
-    …
-    model_selection = random.choices(available, k=5)
-    tasks = [asyncio.to_thread(_generate_sync, m, image, temp, top_k, top_p) for m in model_selection]
     captions = await asyncio.gather(*tasks)

-    return {"captions": captions, "…
-
-# --- Endpoint 2: Objective Vision Saliency (Static Image Perception) ---

-@app.post("/saliency…
-async def …
     image_bytes = await file.read()
     orig_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")

@@ -111,93 +123,57 @@ async def get_objective_saliency(file: UploadFile = File(...)):
     inputs = blip["processor"](images=orig_img, return_tensors="pt").to(DEVICE)

     with torch.no_grad():
-        …
-            output_attentions=True
-        )
-
-        # Last layer attention: (batch, heads, patches, patches)
-        attentions = outputs.attentions[-1]
-
-        # Average across heads and focus on CLS token's view of the patches
-        # Patch grid for BLIP-Large is typically 24x24 (576 patches + 1 CLS)
-        nh = attentions.shape[1]
-        attentional_map = attentions[0, :, 0, 1:].reshape(nh, -1)
-        mask_1d = attentional_map.mean(dim=0)
-
     grid_size = int(np.sqrt(mask_1d.shape[-1]))
     mask = mask_1d.view(grid_size, grid_size).cpu().numpy()

-    # Normalization and High-Contrast "Heat"
     mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-8)
-    …
-    heatmap_rgba = plt.get_cmap('magma')(np.array(mask_pill)/255.0)
-    heatmap_img = Image.fromarray((heatmap_rgba[:, :, :3] * 255).astype('uint8')).convert("RGB")

-    …

     buf = io.BytesIO()
-    …
     buf.seek(0)
     return StreamingResponse(buf, media_type="image/png")

-    …
-    image = Image.open(file.file).convert("RGB")

-    # …
     blip_caption = await asyncio.to_thread(_generate_sync, "blip", image, 0.7, 50, 0.9)
-    …
-        return (cosine_sim * 0.3) + (jaccard * 0.7)
-
-    user_vs_blip = get_metrics(user_prompt, blip_caption)
-    user_vs_vit = get_metrics(user_prompt, vit_caption)
-    consensus = get_metrics(blip_caption, vit_caption)
-
-    # XAI Verdict Logic
-    if consensus < 0.5:
-        verdict = "Model Confusion: High Uncertainty"
-    elif user_vs_blip < 0.6:
-        verdict = "Perspective Divergence: Prompt Mismatch"
     else:
-        verdict = "…

     return {
-        "perspectives": {
-            …
-            "blip_view": blip_caption,
-            "vit_git_view": vit_caption
-        },
-        "audit_metrics": {
-            "user_vs_blip": round(user_vs_blip, 4),
-            "user_vs_vit": round(user_vs_vit, 4),
-            "inter_model_consensus": round(consensus, 4)
-        },
         "verdict": verdict
     }

New version of app.py (added lines marked with +):

 import os
 import io
+import asyncio
+import random
 import numpy as np
+import torch
+import torch.nn.functional as F
 import matplotlib.pyplot as plt
 from PIL import Image, ImageFilter
 from fastapi import FastAPI, UploadFile, File, Query
 from fastapi.responses import StreamingResponse
 from huggingface_hub import snapshot_download, login

 from transformers import (
     BlipProcessor, BlipForConditionalGeneration,
+    ViTImageProcessor, AutoProcessor, AutoModelForCausalLM,
+    CLIPModel, CLIPProcessor
 )

+app = FastAPI(title="XAI Auditor Ensemble with CLIP Jury")

+# --- Configuration & Paths ---
 REPO_ID = "SaniaE/Image_Captioning_Ensemble"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODELS = {}

+# Metadata for loading
+MODEL_CONFIGS = {
     "blip": {
         "subfolder": "blip",
+        "proc_class": BlipProcessor,
+        "model_class": BlipForConditionalGeneration,
+        "base_path": "Salesforce/blip-image-captioning-large"
+    },
     "vit": {
         "subfolder": "vit",
+        "proc_classes": [ViTImageProcessor, AutoProcessor],
+        "model_class": AutoModelForCausalLM,
+        "base_paths": ["nlpconnect/vit-gpt2-image-captioning", "microsoft/git-large"]
+    },
+    "clip": {
+        "model_subfolder": "clip/clip_model",
+        "proc_subfolder": "clip/clip_processor"
     }
 }

@@ -44,66 +50,72 @@ async def startup_event():
     token = os.getenv("HF_Token")
     if token: login(token=token)

+    print(f"Syncing weights from {REPO_ID}...")
     local_dir = snapshot_download(repo_id=REPO_ID, token=token, local_dir="weights")

+    # 1. Load BLIP
+    cfg_b = MODEL_CONFIGS["blip"]
+    MODELS["blip"] = {
+        "model": cfg_b["model_class"].from_pretrained(os.path.join(local_dir, cfg_b["subfolder"])).to(DEVICE),
+        "processor": cfg_b["proc_class"].from_pretrained(cfg_b["base_path"])
+    }
+
+    # 2. Load ViT/GIT Ensemble
+    cfg_v = MODEL_CONFIGS["vit"]
+    MODELS["vit"] = {
+        "model": cfg_v["model_class"].from_pretrained(os.path.join(local_dir, cfg_v["subfolder"])).to(DEVICE),
+        "processor": (
+            cfg_v["proc_classes"][0].from_pretrained(cfg_v["base_paths"][0]),
+            cfg_v["proc_classes"][1].from_pretrained(cfg_v["base_paths"][1])
+        )
+    }
+
+    # 3. Load Fine-Tuned CLIP (Your Jury)
+    cfg_c = MODEL_CONFIGS["clip"]
+    MODELS["clip"] = {
+        "model": CLIPModel.from_pretrained(os.path.join(local_dir, cfg_c["model_subfolder"])).to(DEVICE),
+        "processor": CLIPProcessor.from_pretrained(os.path.join(local_dir, cfg_c["proc_subfolder"]))
+    }

+    print("All models synchronized. Auditor is active.")
+
+# --- Utilities ---
+
+def _generate_sync(m_name, image, temp, top_k, top_p):
+    m_data = MODELS[m_name]
     if m_name == "vit":
         i_proc, t_proc = m_data["processor"]
         inputs = i_proc(images=image, return_tensors="pt").to(DEVICE)
+        ids = m_data["model"].generate(**inputs, max_length=80, do_sample=True, temperature=temp, top_k=top_k, top_p=top_p)
+        return t_proc.batch_decode(ids, skip_special_tokens=True)[0].strip()
     else:
         proc = m_data["processor"]
         inputs = proc(images=image, return_tensors="pt").to(DEVICE)
+        ids = m_data["model"].generate(**inputs, max_length=80, do_sample=True, temperature=temp, top_k=top_k, top_p=top_p)
+        return proc.batch_decode(ids, skip_special_tokens=True)[0].strip()

+# --- Endpoints ---

 @app.post("/generate")
+async def generate_captions(
     file: UploadFile = File(...),
     temp: float = Query(0.8),
     top_k: int = Query(50),
     top_p: float = Query(0.9)
 ):
+    """Generates 5 diverse captions using the model ensemble."""
     image = Image.open(file.file).convert("RGB")
+    architectures = ["blip", "vit"]
+    selection = random.choices(architectures, k=5)

+    tasks = [asyncio.to_thread(_generate_sync, m, image, temp, top_k, top_p) for m in selection]
     captions = await asyncio.gather(*tasks)

+    return {"captions": captions, "metadata": {"models_used": selection, "temp": temp}}

+@app.post("/saliency")
+async def get_vision_saliency(file: UploadFile = File(...)):
+    """Objective Saliency: Shows what the Vision Encoder focuses on (Self-Attention)."""
     image_bytes = await file.read()
     orig_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")

@@ -111,93 +123,57 @@ async def get_objective_saliency(file: UploadFile = File(...)):
     inputs = blip["processor"](images=orig_img, return_tensors="pt").to(DEVICE)

     with torch.no_grad():
+        outputs = blip["model"].vision_model(inputs.pixel_values, output_attentions=True)
+        attentions = outputs.attentions[-1]  # Last layer
+        # Average heads, look at CLS token attention to patches
+        mask_1d = attentions[0, :, 0, 1:].mean(dim=0)

     grid_size = int(np.sqrt(mask_1d.shape[-1]))
     mask = mask_1d.view(grid_size, grid_size).cpu().numpy()

     mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-8)
+    mask_img = Image.fromarray((mask * 255).astype('uint8')).resize(orig_img.size, resample=Image.BICUBIC)
+    mask_img = mask_img.filter(ImageFilter.GaussianBlur(radius=10))

+    heatmap = plt.get_cmap('magma')(np.array(mask_img)/255.0)
+    heatmap_img = Image.fromarray((heatmap[:, :, :3] * 255).astype('uint8')).convert("RGB")
+    blended = Image.blend(orig_img, heatmap_img, alpha=0.6)

     buf = io.BytesIO()
+    blended.save(buf, format="PNG")
     buf.seek(0)
     return StreamingResponse(buf, media_type="image/png")

+@app.post("/audit")
+async def internal_debate_audit(file: UploadFile = File(...), user_prompt: str = Query(...)):
+    """The CLIP-Powered Jury: Compares User Intent vs. Model Perception."""
+    image_bytes = await file.read()
+    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

+    # 1. Model Perception
     blip_caption = await asyncio.to_thread(_generate_sync, "blip", image, 0.7, 50, 0.9)
+
+    # 2. CLIP Scoring (Multimodal Alignment)
+    clip_m = MODELS["clip"]["model"]
+    clip_p = MODELS["clip"]["processor"]
+
+    inputs = clip_p(text=[user_prompt, blip_caption], images=image, return_tensors="pt", padding=True).to(DEVICE)
+
+    with torch.no_grad():
+        outputs = clip_m(**inputs)
+        probs = outputs.logits_per_image.softmax(dim=-1).cpu().numpy()[0]
+
+    u_score, m_score = float(probs[0]), float(probs[1])
+
+    # 3. Decision Logic
+    if u_score < 0.35:
+        verdict = "Perspective Divergence: Intent not grounded in image."
+    elif abs(u_score - m_score) < 0.15:
+        verdict = "Consensus: High Alignment."
     else:
+        verdict = "Model Bias Detected."

     return {
+        "perspectives": {"user": user_prompt, "ai": blip_caption},
+        "audit_scores": {"intent_grounding": round(u_score, 4), "ai_grounding": round(m_score, 4)},
         "verdict": verdict
     }