Leacb4 committed
Commit 2dd8fc6 · verified
1 Parent(s): 942267d

Upload evaluation/sec52_category_model_eval.py with huggingface_hub

Files changed (1)
  1. evaluation/sec52_category_model_eval.py +158 -503
evaluation/sec52_category_model_eval.py CHANGED
@@ -28,18 +28,13 @@ import torch
  import pandas as pd
  import numpy as np
  import matplotlib.pyplot as plt
- import seaborn as sns
  import difflib
  from collections import defaultdict
- import hashlib
- from pathlib import Path
- import requests
 
  from sklearn.metrics.pairwise import cosine_similarity
- from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
+ from sklearn.metrics import classification_report, accuracy_score
  from sklearn.preprocessing import normalize
 
- from tqdm import tqdm
  from torch.utils.data import Dataset, DataLoader
  from torchvision import transforms
  from PIL import Image
@@ -48,178 +43,29 @@ from io import BytesIO
  import warnings
  warnings.filterwarnings('ignore')
 
- from transformers import CLIPProcessor, CLIPModel as CLIPModel_transformers
-
  from config import (
+     ROOT_DIR,
      main_model_path,
+     main_emb_dim,
      hierarchy_model_path,
      color_emb_dim,
      hierarchy_emb_dim,
      local_dataset_path,
      column_local_image_path,
-     images_dir,
  )
 
+ from utils.datasets import (
+     load_fashion_mnist_dataset,
+ )
+ from utils.embeddings import extract_clip_embeddings
+ from utils.metrics import (
+     compute_similarity_metrics,
+     compute_embedding_accuracy,
+     compute_centroid_accuracy,
+     predict_labels_from_embeddings,
+     create_confusion_matrix,
+ )
+ from utils.model_loader import load_gap_clip, load_baseline_fashion_clip
- # ============================================================================
- # 1. Fashion-MNIST utilities
- # ============================================================================
-
- def get_fashion_mnist_labels():
-     return {
-         0: "T-shirt/top",
-         1: "Trouser",
-         2: "Pullover",
-         3: "Dress",
-         4: "Coat",
-         5: "Sandal",
-         6: "Shirt",
-         7: "Sneaker",
-         8: "Bag",
-         9: "Ankle boot",
-     }
-
-
- def create_fashion_mnist_to_hierarchy_mapping(hierarchy_classes):
-     fashion_mnist_labels = get_fashion_mnist_labels()
-     hierarchy_classes_lower = [h.lower() for h in hierarchy_classes]
-     mapping = {}
-
-     for fm_label_id, fm_label in fashion_mnist_labels.items():
-         fm_label_lower = fm_label.lower()
-         matched_hierarchy = None
-
-         if fm_label_lower in hierarchy_classes_lower:
-             matched_hierarchy = hierarchy_classes[hierarchy_classes_lower.index(fm_label_lower)]
-         elif any(h in fm_label_lower or fm_label_lower in h for h in hierarchy_classes_lower):
-             for h_class in hierarchy_classes:
-                 h_lower = h_class.lower()
-                 if h_lower in fm_label_lower or fm_label_lower in h_lower:
-                     matched_hierarchy = h_class
-                     break
-         else:
-             if fm_label_lower in ['t-shirt/top', 'top']:
-                 if 'top' in hierarchy_classes_lower:
-                     matched_hierarchy = hierarchy_classes[hierarchy_classes_lower.index('top')]
-             elif 'trouser' in fm_label_lower:
-                 for possible in ['bottom', 'pants', 'trousers', 'trouser', 'pant']:
-                     if possible in hierarchy_classes_lower:
-                         matched_hierarchy = hierarchy_classes[hierarchy_classes_lower.index(possible)]
-                         break
-             elif 'pullover' in fm_label_lower:
-                 for possible in ['sweater', 'pullover']:
-                     if possible in hierarchy_classes_lower:
-                         matched_hierarchy = hierarchy_classes[hierarchy_classes_lower.index(possible)]
-                         break
-             elif 'dress' in fm_label_lower:
-                 if 'dress' in hierarchy_classes_lower:
-                     matched_hierarchy = hierarchy_classes[hierarchy_classes_lower.index('dress')]
-             elif 'coat' in fm_label_lower:
-                 for possible in ['jacket', 'outerwear', 'coat']:
-                     if possible in hierarchy_classes_lower:
-                         matched_hierarchy = hierarchy_classes[hierarchy_classes_lower.index(possible)]
-                         break
-             elif fm_label_lower in ['sandal', 'sneaker', 'ankle boot']:
-                 for possible in ['shoes', 'shoe', 'sandal', 'sneaker', 'boot']:
-                     if possible in hierarchy_classes_lower:
-                         matched_hierarchy = hierarchy_classes[hierarchy_classes_lower.index(possible)]
-                         break
-             elif 'bag' in fm_label_lower:
-                 if 'bag' in hierarchy_classes_lower:
-                     matched_hierarchy = hierarchy_classes[hierarchy_classes_lower.index('bag')]
-
-         if matched_hierarchy is None:
-             close_matches = difflib.get_close_matches(
-                 fm_label_lower, hierarchy_classes_lower, n=1, cutoff=0.6
-             )
-             if close_matches:
-                 matched_hierarchy = hierarchy_classes[hierarchy_classes_lower.index(close_matches[0])]
-
-         mapping[fm_label_id] = matched_hierarchy
-         if matched_hierarchy:
-             print(f" {fm_label} ({fm_label_id}) -> {matched_hierarchy}")
-         else:
-             print(f" {fm_label} ({fm_label_id}) -> NO MATCH (will be filtered out)")
-
-     return mapping
-
-
- def convert_fashion_mnist_to_image(pixel_values):
-     image_array = np.array(pixel_values).reshape(28, 28).astype(np.uint8)
-     image_array = np.stack([image_array] * 3, axis=-1)
-     return Image.fromarray(image_array)
-
-
- class FashionMNISTDataset(Dataset):
-     def __init__(self, dataframe, image_size=224, label_mapping=None):
-         self.dataframe = dataframe
-         self.image_size = image_size
-         self.labels_map = get_fashion_mnist_labels()
-         self.label_mapping = label_mapping
-
-         self.transform = transforms.Compose([
-             transforms.Resize((image_size, image_size)),
-             transforms.ToTensor(),
-             transforms.Normalize(
-                 mean=[0.485, 0.456, 0.406],
-                 std=[0.229, 0.224, 0.225],
-             ),
-         ])
-
-     def __len__(self):
-         return len(self.dataframe)
-
-     def __getitem__(self, idx):
-         row = self.dataframe.iloc[idx]
-
-         pixel_cols = [f"pixel{i}" for i in range(1, 785)]
-         pixel_values = row[pixel_cols].values
-
-         image = convert_fashion_mnist_to_image(pixel_values)
-         image = self.transform(image)
-
-         label_id = int(row['label'])
-         description = self.labels_map[label_id]
-         color = "unknown"
-
-         if self.label_mapping and label_id in self.label_mapping:
-             hierarchy = self.label_mapping[label_id]
-         else:
-             hierarchy = self.labels_map[label_id]
-
-         return image, description, color, hierarchy
-
-
- def load_fashion_mnist_dataset(
-     max_samples=10000,
-     hierarchy_classes=None,
-     csv_path=None,
- ):
-     if csv_path is None:
-         csv_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "data", "fashion-mnist_test.csv")
-     print("Loading Fashion-MNIST test dataset...")
-     df = pd.read_csv(csv_path)
-     print(f"Fashion-MNIST dataset loaded: {len(df)} samples")
-
-     label_mapping = None
-     if hierarchy_classes is not None:
-         print("\nCreating mapping from Fashion-MNIST labels to hierarchy classes:")
-         label_mapping = create_fashion_mnist_to_hierarchy_mapping(hierarchy_classes)
-
-         valid_label_ids = [lid for lid, h in label_mapping.items() if h is not None]
-         df_filtered = df[df['label'].isin(valid_label_ids)]
-         print(f"\nAfter filtering to mappable labels: {len(df_filtered)} samples (from {len(df)})")
-         df_sample = df_filtered.head(max_samples)
-     else:
-         df_sample = df.head(max_samples)
-
-     print(f"Using {len(df_sample)} samples for evaluation")
-     return FashionMNISTDataset(df_sample, label_mapping=label_mapping)
 
 
  # ============================================================================
@@ -256,21 +102,24 @@ class KaggleHierarchyDataset(Dataset):
          return image, description, color, hierarchy
 
 
- def load_kaggle_marqo_with_hierarchy(max_samples=10000, hierarchy_classes=None):
-     """Load KAGL Marqo dataset with hierarchy labels derived from articleType."""
-     from datasets import load_dataset
-
-     print("Loading KAGL Marqo dataset for hierarchy evaluation...")
-     dataset = load_dataset("Marqo/KAGL")
-     df = dataset["data"].to_pandas()
+ def load_kaggle_marqo_with_hierarchy(max_samples=10000, hierarchy_classes=None, raw_df=None):
+     """Load KAGL Marqo dataset with hierarchy labels derived from articleType.
+
+     Args:
+         raw_df: Pre-downloaded DataFrame to skip the HuggingFace download.
+     """
+     if raw_df is not None:
+         df = raw_df.copy()
+         print(f"Using cached KAGL DataFrame for hierarchy evaluation: {len(df)} samples")
+     else:
+         from datasets import load_dataset
+         print("Loading KAGL Marqo dataset for hierarchy evaluation...")
+         dataset = load_dataset("Marqo/KAGL")
+         df = dataset["data"].to_pandas()
      print(f"Dataset loaded: {len(df)} samples, columns: {list(df.columns)}")
 
      # Use the most specific category column as hierarchy source
-     hierarchy_col = None
-     for col in ["articleType", "category3", "category2", "subCategory", "masterCategory", "category1"]:
-         if col in df.columns:
-             hierarchy_col = col
-             break
+     hierarchy_col = 'category2'
 
      if hierarchy_col is None:
          print("WARNING: No hierarchy column found in KAGL dataset")
@@ -335,29 +184,10 @@ class LocalHierarchyDataset(Dataset):
      def __getitem__(self, idx):
          row = self.dataframe.iloc[idx]
          try:
-             image_path = row.get(column_local_image_path) if hasattr(row, "get") else None
-             if isinstance(image_path, str) and image_path and os.path.exists(image_path):
-                 image = Image.open(image_path).convert("RGB")
-             else:
-                 # Fallback: download image from URL (and cache).
-                 image_url = row.get("image_url") if hasattr(row, "get") else None
-                 if isinstance(image_url, dict) and "bytes" in image_url:
-                     image = Image.open(BytesIO(image_url["bytes"])).convert("RGB")
-                 elif isinstance(image_url, str) and image_url:
-                     cache_dir = Path(images_dir)
-                     cache_dir.mkdir(parents=True, exist_ok=True)
-                     url_hash = hashlib.md5(image_url.encode("utf-8")).hexdigest()
-                     cache_path = cache_dir / f"{url_hash}.jpg"
-                     if cache_path.exists():
-                         image = Image.open(cache_path).convert("RGB")
-                     else:
-                         resp = requests.get(image_url, timeout=10)
-                         resp.raise_for_status()
-                         image = Image.open(BytesIO(resp.content)).convert("RGB")
-                         # Cache so repeated runs are faster.
-                         image.save(cache_path, "JPEG", quality=85, optimize=True)
-                 else:
-                     raise ValueError("Missing image_path and image_url")
+             img_path = row[column_local_image_path]
+             if not os.path.isabs(img_path):
+                 img_path = os.path.join(ROOT_DIR, img_path)
+             image = Image.open(img_path).convert("RGB")
          except Exception:
              image = Image.new("RGB", (224, 224), color="gray")
          image = self.transform(image)
@@ -367,18 +197,21 @@ class LocalHierarchyDataset(Dataset):
          return image, description, color, hierarchy
 
 
- def load_local_validation_with_hierarchy(max_samples=10000, hierarchy_classes=None):
-     """Load internal validation dataset with hierarchy labels."""
-     print("Loading local validation dataset for hierarchy evaluation...")
-     df = pd.read_csv(local_dataset_path)
-     print(f"Dataset loaded: {len(df)} samples")
-
-     # Some internal CSVs only contain `image_url` (no `local_image_path`).
-     # If so, we fall back to downloading images on-demand.
-     if column_local_image_path in df.columns:
-         df = df.dropna(subset=[column_local_image_path, "hierarchy"])
+ def load_local_validation_with_hierarchy(max_samples=10000, hierarchy_classes=None, raw_df=None):
+     """Load internal validation dataset with hierarchy labels.
+
+     Args:
+         raw_df: Pre-loaded DataFrame to skip CSV read.
+     """
+     if raw_df is not None:
+         df = raw_df.copy()
+         print(f"Using cached local DataFrame for hierarchy evaluation: {len(df)} samples")
      else:
-         df = df.dropna(subset=["hierarchy"])
+         print("Loading local validation dataset for hierarchy evaluation...")
+         df = pd.read_csv(local_dataset_path)
+         print(f"Dataset loaded: {len(df)} samples")
+
+     df = df.dropna(subset=[column_local_image_path, "hierarchy"])
      df["hierarchy"] = df["hierarchy"].astype(str).str.strip()
      df = df[df["hierarchy"].str.len() > 0]
@@ -410,25 +243,32 @@ class CategoryModelEvaluator:
      baseline Fashion-CLIP on Fashion-MNIST, KAGL Marqo, and internal datasets.
      """
 
-     def __init__(self, device='mps', directory='figures/confusion_matrices/cm_hierarchy'):
-         self.device = torch.device(device)
+     def __init__(self, device='mps', directory='gap_clip_confusion_matrices',
+                  gap_clip_model=None, gap_clip_processor=None,
+                  baseline_model=None, baseline_processor=None,
+                  hierarchy_classes=None,
+                  kaggle_raw_df=None, local_raw_df=None):
+         self.device = torch.device(device) if isinstance(device, str) else device
          self.directory = directory
+         self.kaggle_raw_df = kaggle_raw_df
+         self.local_raw_df = local_raw_df
          self.color_emb_dim = color_emb_dim
          self.hierarchy_emb_dim = hierarchy_emb_dim
+         self.main_emb_dim = main_emb_dim
+         self.hierarchy_end_dim = self.color_emb_dim + self.hierarchy_emb_dim
          os.makedirs(self.directory, exist_ok=True)
 
-         # --- load GAP-CLIP ---
-         print(f"Loading GAP-CLIP model from {main_model_path}")
-         if not os.path.exists(main_model_path):
-             raise FileNotFoundError(f"GAP-CLIP model file {main_model_path} not found")
-
-         print("Loading hierarchy classes from hierarchy model...")
-         if not os.path.exists(hierarchy_model_path):
-             raise FileNotFoundError(f"Hierarchy model file {hierarchy_model_path} not found")
-
-         hierarchy_checkpoint = torch.load(hierarchy_model_path, map_location=self.device)
-         self.hierarchy_classes = hierarchy_checkpoint.get('hierarchy_classes', [])
-         print(f"Found {len(self.hierarchy_classes)} hierarchy classes: {sorted(self.hierarchy_classes)}")
+         # --- hierarchy classes ---
+         if hierarchy_classes is not None:
+             self.hierarchy_classes = hierarchy_classes
+             print(f"Using provided hierarchy classes: {len(self.hierarchy_classes)} classes")
+         else:
+             print("Loading hierarchy classes from hierarchy model...")
+             if not os.path.exists(hierarchy_model_path):
+                 raise FileNotFoundError(f"Hierarchy model file {hierarchy_model_path} not found")
+             hierarchy_checkpoint = torch.load(hierarchy_model_path, map_location=self.device)
+             self.hierarchy_classes = hierarchy_checkpoint.get('hierarchy_classes', [])
+             print(f"Found {len(self.hierarchy_classes)} hierarchy classes: {sorted(self.hierarchy_classes)}")
 
          self.validation_hierarchy_classes = self._load_validation_hierarchy_classes()
          if self.validation_hierarchy_classes:
@@ -438,21 +278,23 @@ class CategoryModelEvaluator:
              print("Unable to load validation hierarchy classes, falling back to hierarchy model classes.")
              self.validation_hierarchy_classes = self.hierarchy_classes
 
-         checkpoint = torch.load(main_model_path, map_location=self.device)
-         self.processor = CLIPProcessor.from_pretrained('laion/CLIP-ViT-B-32-laion2B-s34B-b79K')
-         self.model = CLIPModel_transformers.from_pretrained('laion/CLIP-ViT-B-32-laion2B-s34B-b79K')
-         self.model.load_state_dict(checkpoint['model_state_dict'])
-         self.model.to(self.device)
-         self.model.eval()
-         print("GAP-CLIP model loaded successfully")
-
-         # --- baseline Fashion-CLIP ---
-         print("Loading baseline Fashion-CLIP model...")
-         patrick_model_name = "patrickjohncyh/fashion-clip"
-         self.baseline_processor = CLIPProcessor.from_pretrained(patrick_model_name)
-         self.baseline_model = CLIPModel_transformers.from_pretrained(patrick_model_name).to(self.device)
-         self.baseline_model.eval()
-         print("Baseline Fashion-CLIP model loaded successfully")
+         # --- load GAP-CLIP (accept pre-loaded or load from scratch) ---
+         if gap_clip_model is not None and gap_clip_processor is not None:
+             self.model = gap_clip_model
+             self.processor = gap_clip_processor
+             print("Using pre-loaded GAP-CLIP model")
+         else:
+             self.model, self.processor = load_gap_clip(main_model_path, self.device)
+             print("GAP-CLIP model loaded successfully")
+
+         # --- baseline Fashion-CLIP (accept pre-loaded or load from scratch) ---
+         if baseline_model is not None and baseline_processor is not None:
+             self.baseline_model = baseline_model
+             self.baseline_processor = baseline_processor
+             print("Using pre-loaded baseline Fashion-CLIP model")
+         else:
+             self.baseline_model, self.baseline_processor = load_baseline_fashion_clip(self.device)
+             print("Baseline Fashion-CLIP model loaded successfully")
 
      # ------------------------------------------------------------------
      # helpers
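Note: `load_gap_clip` and `load_baseline_fashion_clip` now live in `utils/model_loader`, which is not part of this upload. A minimal sketch of what that module presumably provides, reconstructed from the loader code deleted above (the backbone name, the `model_state_dict` checkpoint key, and the `patrickjohncyh/fashion-clip` baseline all come from the removed lines; the real module may differ):

# utils/model_loader.py -- sketch only, reconstructed from code removed in this commit.
import os
import torch
from transformers import CLIPProcessor, CLIPModel

def load_gap_clip(model_path, device):
    """Load the fine-tuned GAP-CLIP checkpoint on top of the LAION ViT-B/32 backbone."""
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"GAP-CLIP model file {model_path} not found")
    checkpoint = torch.load(model_path, map_location=device)
    processor = CLIPProcessor.from_pretrained('laion/CLIP-ViT-B-32-laion2B-s34B-b79K')
    model = CLIPModel.from_pretrained('laion/CLIP-ViT-B-32-laion2B-s34B-b79K')
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()
    return model, processor

def load_baseline_fashion_clip(device):
    """Load the off-the-shelf Fashion-CLIP baseline."""
    name = "patrickjohncyh/fashion-clip"
    processor = CLIPProcessor.from_pretrained(name)
    model = CLIPModel.from_pretrained(name).to(device)
    model.eval()
    return model, processor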
@@ -506,196 +348,23 @@ class CategoryModelEvaluator:
          )
 
      # ------------------------------------------------------------------
-     # embedding extraction GAP-CLIP
+     # embedding extraction (delegates to shared utils)
      # ------------------------------------------------------------------
      def extract_full_embeddings(self, dataloader, embedding_type='text', max_samples=10000):
          """Full 512D embeddings from GAP-CLIP (text or image)."""
-         all_embeddings, all_colors, all_hierarchies = [], [], []
-         sample_count = 0
-
-         with torch.no_grad():
-             for batch in tqdm(dataloader, desc=f"GAP-CLIP {embedding_type} embeddings"):
-                 if sample_count >= max_samples:
-                     break
-                 images, texts, colors, hierarchies = batch
-                 images = images.to(self.device).expand(-1, 3, -1, -1)
-
-                 text_inputs = self.processor(text=list(texts), padding=True, return_tensors="pt")
-                 text_inputs = {k: v.to(self.device) for k, v in text_inputs.items()}
-                 outputs = self.model(**text_inputs, pixel_values=images)
-
-                 if embedding_type == 'image':
-                     emb = outputs.image_embeds
-                 else:
-                     emb = outputs.text_embeds
-
-                 all_embeddings.append(emb.cpu().numpy())
-                 all_colors.extend(colors)
-                 all_hierarchies.extend(hierarchies)
-                 sample_count += len(images)
-
-                 del images, text_inputs, outputs, emb
-                 if torch.cuda.is_available():
-                     torch.cuda.empty_cache()
-
-         return np.vstack(all_embeddings), all_colors, all_hierarchies
+         return extract_clip_embeddings(
+             self.model, self.processor, dataloader, self.device,
+             embedding_type=embedding_type, max_samples=max_samples,
+             desc=f"GAP-CLIP {embedding_type} embeddings",
+         )
 
-     # ------------------------------------------------------------------
-     # embedding extraction — baseline Fashion-CLIP
-     # ------------------------------------------------------------------
      def extract_baseline_embeddings_batch(self, dataloader, embedding_type='text', max_samples=10000):
          """L2-normalised embeddings from baseline Fashion-CLIP."""
-         all_embeddings, all_colors, all_hierarchies = [], [], []
-         sample_count = 0
-
-         with torch.no_grad():
-             for batch in tqdm(dataloader, desc=f"Baseline {embedding_type} embeddings"):
-                 if sample_count >= max_samples:
-                     break
-                 images, texts, colors, hierarchies = batch
-
-                 if embedding_type == 'text':
-                     inp = self.baseline_processor(
-                         text=list(texts), return_tensors="pt",
-                         padding=True, truncation=True, max_length=77,
-                     )
-                     inp = {k: v.to(self.device) for k, v in inp.items()}
-                     feats = self.baseline_model.get_text_features(**inp)
-                     feats = feats / feats.norm(dim=-1, keepdim=True)
-                     emb = feats
-
-                 elif embedding_type == 'image':
-                     pil_images = []
-                     for i in range(images.shape[0]):
-                         t = images[i]
-                         if t.min() < 0 or t.max() > 1:
-                             mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
-                             std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
-                             t = torch.clamp(t * std + mean, 0, 1)
-                         pil_images.append(transforms.ToPILImage()(t))
-
-                     inp = self.baseline_processor(images=pil_images, return_tensors="pt")
-                     inp = {k: v.to(self.device) for k, v in inp.items()}
-                     feats = self.baseline_model.get_image_features(**inp)
-                     feats = feats / feats.norm(dim=-1, keepdim=True)
-                     emb = feats
-                 else:
-                     inp = self.baseline_processor(
-                         text=list(texts), return_tensors="pt",
-                         padding=True, truncation=True, max_length=77,
-                     )
-                     inp = {k: v.to(self.device) for k, v in inp.items()}
-                     feats = self.baseline_model.get_text_features(**inp)
-                     feats = feats / feats.norm(dim=-1, keepdim=True)
-                     emb = feats
-
-                 all_embeddings.append(emb.cpu().numpy())
-                 all_colors.extend(colors)
-                 all_hierarchies.extend(hierarchies)
-                 sample_count += len(images)
-
-                 del emb
-                 if torch.cuda.is_available():
-                     torch.cuda.empty_cache()
-
-         return np.vstack(all_embeddings), all_colors, all_hierarchies
+         return extract_clip_embeddings(
+             self.baseline_model, self.baseline_processor, dataloader, self.device,
+             embedding_type=embedding_type, max_samples=max_samples,
+             desc=f"Baseline {embedding_type} embeddings",
+         )
-
-     # ------------------------------------------------------------------
-     # metrics
-     # ------------------------------------------------------------------
-     def compute_embedding_accuracy(self, embeddings, labels, similarities=None):
-         n = len(embeddings)
-         if n == 0:
-             return 0.0
-         if similarities is None:
-             similarities = cosine_similarity(embeddings)
-
-         correct = 0
-         for i in range(n):
-             sims = similarities[i].copy()
-             sims[i] = -1.0
-             nearest_neighbor_idx = int(np.argmax(sims))
-             predicted = labels[nearest_neighbor_idx]
-             if predicted == labels[i]:
-                 correct += 1
-         return correct / n
-
-     def compute_similarity_metrics(self, embeddings, labels):
-         max_samples = min(5000, len(embeddings))
-         if len(embeddings) > max_samples:
-             indices = np.random.choice(len(embeddings), max_samples, replace=False)
-             embeddings = embeddings[indices]
-             labels = [labels[i] for i in indices]
-
-         similarities = cosine_similarity(embeddings)
-
-         label_groups = defaultdict(list)
-         for i, label in enumerate(labels):
-             label_groups[label].append(i)
-
-         intra = []
-         for _, idxs in label_groups.items():
-             if len(idxs) > 1:
-                 for i in range(len(idxs)):
-                     for j in range(i + 1, len(idxs)):
-                         intra.append(similarities[idxs[i], idxs[j]])
-
-         inter = []
-         keys = list(label_groups.keys())
-         for i in range(len(keys)):
-             for j in range(i + 1, len(keys)):
-                 for idx1 in label_groups[keys[i]]:
-                     for idx2 in label_groups[keys[j]]:
-                         inter.append(similarities[idx1, idx2])
-
-         nn_acc = self.compute_embedding_accuracy(embeddings, labels, similarities)
-
-         return {
-             'intra_class_mean': float(np.mean(intra)) if intra else 0.0,
-             'inter_class_mean': float(np.mean(inter)) if inter else 0.0,
-             'separation_score': (float(np.mean(intra) - np.mean(inter))
-                                  if intra and inter else 0.0),
-             'nn_accuracy': nn_acc,
-         }
-
-     def compute_centroid_accuracy(self, embeddings, labels):
-         if len(embeddings) == 0:
-             return 0.0
-         emb_norm = normalize(embeddings, norm='l2')
-         unique_labels = sorted(set(labels))
-         centroids = {}
-         for label in unique_labels:
-             idx = [i for i, l in enumerate(labels) if l == label]
-             centroids[label] = normalize([emb_norm[idx].mean(axis=0)], norm='l2')[0]
-
-         correct = 0
-         for i, emb in enumerate(emb_norm):
-             best_sim, pred = -1, None
-             for label, c in centroids.items():
-                 sim = cosine_similarity([emb], [c])[0][0]
-                 if sim > best_sim:
-                     best_sim, pred = sim, label
-             if pred == labels[i]:
-                 correct += 1
-         return correct / len(labels)
-
-     def predict_labels_from_embeddings(self, embeddings, labels):
-         emb_norm = normalize(embeddings, norm='l2')
-         unique_labels = sorted(set(labels))
-         centroids = {}
-         for label in unique_labels:
-             idx = [i for i, l in enumerate(labels) if l == label]
-             centroids[label] = normalize([emb_norm[idx].mean(axis=0)], norm='l2')[0]
-
-         preds = []
-         for emb in emb_norm:
-             best_sim, pred = -1, None
-             for label, c in centroids.items():
-                 sim = cosine_similarity([emb], [c])[0][0]
-                 if sim > best_sim:
-                     best_sim, pred = sim, label
-             preds.append(pred)
-         return preds
 
      def predict_labels_nearest_neighbor(self, embeddings, labels):
          """
@@ -741,23 +410,6 @@ class CategoryModelEvaluator:
      # ------------------------------------------------------------------
      # confusion matrix & classification report
      # ------------------------------------------------------------------
-     def create_confusion_matrix(self, true_labels, predicted_labels,
-                                 title="Confusion Matrix", label_type="Label"):
-         unique_labels = sorted(set(true_labels + predicted_labels))
-         cm = confusion_matrix(true_labels, predicted_labels, labels=unique_labels)
-         acc = accuracy_score(true_labels, predicted_labels)
-
-         plt.figure(figsize=(10, 8))
-         sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
-                     xticklabels=unique_labels, yticklabels=unique_labels)
-         plt.title(f'{title}\nAccuracy: {acc:.3f} ({acc * 100:.1f}%)')
-         plt.ylabel(f'True {label_type}')
-         plt.xlabel(f'Predicted {label_type}')
-         plt.xticks(rotation=45)
-         plt.yticks(rotation=0)
-         plt.tight_layout()
-         return plt.gcf(), acc, cm
-
      def evaluate_classification_performance(self, embeddings, labels,
                                              embedding_type="Embeddings",
                                              label_type="Label",
@@ -765,14 +417,14 @@ class CategoryModelEvaluator:
          if method == "nn":
              preds = self.predict_labels_nearest_neighbor(embeddings, labels)
          elif method == "centroid":
-             preds = self.predict_labels_from_embeddings(embeddings, labels)
+             preds = predict_labels_from_embeddings(embeddings, labels)
          else:
              raise ValueError(f"Unknown classification method: {method}")
          acc = accuracy_score(labels, preds)
          unique_labels = sorted(set(labels))
-         fig, _, cm = self.create_confusion_matrix(
+         fig, _, cm = create_confusion_matrix(
              labels, preds,
-             embedding_type,
+             f"{embedding_type} - {label_type} Classification ({method.upper()})",
              label_type,
          )
          report = classification_report(labels, preds, labels=unique_labels,
@@ -786,6 +438,15 @@ class CategoryModelEvaluator:
              'figure': fig,
          }
 
+     def save_confusion_matrix_table(self, cm, labels, output_csv_path):
+         """
+         Save confusion matrix values with per-row totals to CSV for auditing.
+         """
+         cm_df = pd.DataFrame(cm, index=labels, columns=labels)
+         cm_df["row_total"] = cm_df.sum(axis=1)
+         cm_df.loc["column_total"] = list(cm_df[labels].sum(axis=0)) + [cm_df["row_total"].sum()]
+         cm_df.to_csv(output_csv_path)
+
      # ==================================================================
      # 3. GAP-CLIP evaluation on Fashion-MNIST
      # ==================================================================
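The new save_confusion_matrix_table helper writes the raw counts next to each PNG. A toy illustration of the resulting CSV layout ("dress" and "top" are made-up labels for this example):

import numpy as np
evaluator.save_confusion_matrix_table(
    np.array([[5, 1],
              [2, 7]]),
    ["dress", "top"],
    "cm_demo.csv",
)
# cm_demo.csv:
# ,dress,top,row_total
# dress,5,1,6
# top,2,7,9
# column_total,7,8,15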
@@ -824,10 +485,10 @@ class CategoryModelEvaluator:
          text_hier_spec = text_full[:, self.color_emb_dim:self.color_emb_dim + self.hierarchy_emb_dim]
          print(f" Specialized text hierarchy shape: {text_hier_spec.shape}")
 
-         text_metrics = self.compute_similarity_metrics(text_hier_spec, text_hier)
+         text_metrics = compute_similarity_metrics(text_hier_spec, text_hier)
          text_class = self.evaluate_classification_performance(
              text_hier_spec, text_hier,
-             "Fashion-MNIST, text, hierarchy confusion matrix", "Hierarchy",
+             "GAP-CLIP Text Hierarchy (64D)", "Hierarchy",
              method="nn",
          )
          text_metrics.update(text_class)
@@ -839,18 +500,18 @@ class CategoryModelEvaluator:
          print(f" Specialized image hierarchy shape: {img_hier_spec.shape}")
 
          print(" Testing specialized 64D...")
-         spec_metrics = self.compute_similarity_metrics(img_hier_spec, img_hier)
+         spec_metrics = compute_similarity_metrics(img_hier_spec, img_hier)
          spec_class = self.evaluate_classification_performance(
              img_hier_spec, img_hier,
-             "Fashion-MNIST, image, hierarchy confusion matrix", "Hierarchy",
+             "GAP-CLIP Image Hierarchy (64D)", "Hierarchy",
              method="nn",
          )
 
          print(" Testing full 512D...")
-         full_metrics = self.compute_similarity_metrics(img_full, img_hier)
+         full_metrics = compute_similarity_metrics(img_full, img_hier)
          full_class = self.evaluate_classification_performance(
              img_full, img_hier,
-             "Fashion-MNIST, image, hierarchy confusion matrix", "Hierarchy",
+             "GAP-CLIP Image Hierarchy (512D full)", "Hierarchy",
              method="nn",
          )
 
@@ -889,6 +550,11 @@ class CategoryModelEvaluator:
                  os.path.join(self.directory, f"gap_clip_{key}_confusion_matrix.png"),
                  dpi=300, bbox_inches='tight',
              )
+             self.save_confusion_matrix_table(
+                 results[key]['confusion_matrix'],
+                 results[key]['labels'],
+                 os.path.join(self.directory, f"gap_clip_{key}_confusion_matrix.csv"),
+             )
              plt.close(fig)
 
          del text_full, img_full, text_hier_spec, img_hier_spec
@@ -920,10 +586,10 @@ class CategoryModelEvaluator:
          self._validate_label_distribution(text_hier, expected_counts, "baseline text")
          print(f" Baseline text shape: {text_emb.shape}")
 
-         text_metrics = self.compute_similarity_metrics(text_emb, text_hier)
+         text_metrics = compute_similarity_metrics(text_emb, text_hier)
          text_class = self.evaluate_classification_performance(
              text_emb, text_hier,
-             "Fashion-MNIST, text, hierarchy confusion matrix", "Hierarchy",
+             "Baseline Fashion-CLIP Text - Hierarchy", "Hierarchy",
              method="nn",
          )
          text_metrics.update(text_class)
@@ -939,10 +605,10 @@ class CategoryModelEvaluator:
          self._validate_label_distribution(img_hier, expected_counts, "baseline image")
          print(f" Baseline image shape: {img_emb.shape}")
 
-         img_metrics = self.compute_similarity_metrics(img_emb, img_hier)
+         img_metrics = compute_similarity_metrics(img_emb, img_hier)
          img_class = self.evaluate_classification_performance(
              img_emb, img_hier,
-             "Fashion-MNIST, image, hierarchy confusion matrix", "Hierarchy",
+             "Baseline Fashion-CLIP Image - Hierarchy", "Hierarchy",
              method="nn",
          )
          img_metrics.update(img_class)
@@ -958,6 +624,11 @@ class CategoryModelEvaluator:
                  os.path.join(self.directory, f"baseline_{key}_hierarchy_confusion_matrix.png"),
                  dpi=300, bbox_inches='tight',
              )
+             self.save_confusion_matrix_table(
+                 results[key]['hierarchy']['confusion_matrix'],
+                 results[key]['hierarchy']['labels'],
+                 os.path.join(self.directory, f"baseline_{key}_hierarchy_confusion_matrix.csv"),
+             )
              plt.close(fig)
 
          return results
@@ -980,10 +651,10 @@ class CategoryModelEvaluator:
          text_hier_spec = text_full[:, self.color_emb_dim:self.color_emb_dim + self.hierarchy_emb_dim]
          print(f" Text shape: {text_full.shape}, hierarchy subspace: {text_hier_spec.shape}")
 
-         text_metrics = self.compute_similarity_metrics(text_hier_spec, text_hier)
+         text_metrics = compute_similarity_metrics(text_hier_spec, text_hier)
          text_class = self.evaluate_classification_performance(
              text_hier_spec, text_hier,
-             f"{dataset_name}, text, hierarchy confusion matrix", "Hierarchy", method="nn",
+             f"GAP-CLIP Text Hierarchy {dataset_name}", "Hierarchy", method="nn",
          )
          text_metrics.update(text_class)
          results['text_hierarchy'] = text_metrics
@@ -993,16 +664,16 @@ class CategoryModelEvaluator:
          img_full, _, img_hier = self.extract_full_embeddings(dataloader, 'image', max_samples)
          img_hier_spec = img_full[:, self.color_emb_dim:self.color_emb_dim + self.hierarchy_emb_dim]
 
-         spec_metrics = self.compute_similarity_metrics(img_hier_spec, img_hier)
+         spec_metrics = compute_similarity_metrics(img_hier_spec, img_hier)
          spec_class = self.evaluate_classification_performance(
              img_hier_spec, img_hier,
-             f"{dataset_name}, image, hierarchy confusion matrix", "Hierarchy", method="nn",
+             f"GAP-CLIP Image Hierarchy (64D) – {dataset_name}", "Hierarchy", method="nn",
          )
 
-         full_metrics = self.compute_similarity_metrics(img_full, img_hier)
+         full_metrics = compute_similarity_metrics(img_full, img_hier)
          full_class = self.evaluate_classification_performance(
              img_full, img_hier,
-             f"{dataset_name}, image, hierarchy confusion matrix", "Hierarchy", method="nn",
+             f"GAP-CLIP Image Hierarchy (512D) – {dataset_name}", "Hierarchy", method="nn",
          )
 
          if full_class['accuracy'] >= spec_class['accuracy']:
@@ -1023,6 +694,10 @@ class CategoryModelEvaluator:
                  os.path.join(self.directory, f"gap_clip_{prefix}_{key}_confusion_matrix.png"),
                  dpi=300, bbox_inches='tight',
              )
+             self.save_confusion_matrix_table(
+                 results[key]['confusion_matrix'], results[key]['labels'],
+                 os.path.join(self.directory, f"gap_clip_{prefix}_{key}_confusion_matrix.csv"),
+             )
              plt.close(fig)
 
          del text_full, img_full, text_hier_spec, img_hier_spec
@@ -1044,10 +719,10 @@ class CategoryModelEvaluator:
          text_emb, _, text_hier = self.extract_baseline_embeddings_batch(dataloader, 'text', max_samples)
          print(f" Baseline text shape: {text_emb.shape}")
 
-         text_metrics = self.compute_similarity_metrics(text_emb, text_hier)
+         text_metrics = compute_similarity_metrics(text_emb, text_hier)
          text_class = self.evaluate_classification_performance(
              text_emb, text_hier,
-             f"{dataset_name}, text, hierarchy confusion matrix", "Hierarchy", method="nn",
+             f"Baseline Text Hierarchy {dataset_name}", "Hierarchy", method="nn",
          )
          text_metrics.update(text_class)
          results['text'] = {'hierarchy': text_metrics}
@@ -1061,10 +736,10 @@ class CategoryModelEvaluator:
          img_emb, _, img_hier = self.extract_baseline_embeddings_batch(dataloader, 'image', max_samples)
          print(f" Baseline image shape: {img_emb.shape}")
 
-         img_metrics = self.compute_similarity_metrics(img_emb, img_hier)
+         img_metrics = compute_similarity_metrics(img_emb, img_hier)
          img_class = self.evaluate_classification_performance(
              img_emb, img_hier,
-             f"{dataset_name}, image, hierarchy confusion matrix", "Hierarchy", method="nn",
+             f"Baseline Image Hierarchy {dataset_name}", "Hierarchy", method="nn",
          )
          img_metrics.update(img_class)
          results['image'] = {'hierarchy': img_metrics}
@@ -1080,6 +755,11 @@ class CategoryModelEvaluator:
                  os.path.join(self.directory, f"baseline_{prefix}_{key}_hierarchy_confusion_matrix.png"),
                  dpi=300, bbox_inches='tight',
              )
+             self.save_confusion_matrix_table(
+                 results[key]['hierarchy']['confusion_matrix'],
+                 results[key]['hierarchy']['labels'],
+                 os.path.join(self.directory, f"baseline_{prefix}_{key}_hierarchy_confusion_matrix.csv"),
+             )
              plt.close(fig)
 
          return results
@@ -1087,10 +767,8 @@ class CategoryModelEvaluator:
      # ==================================================================
      # 6. Full evaluation across all datasets
      # ==================================================================
-     def run_full_evaluation(self, max_samples=10000, local_max_samples=None, batch_size=8):
+     def run_full_evaluation(self, max_samples=10000, batch_size=8):
          """Run hierarchy evaluation on all 3 datasets for both models."""
-         if local_max_samples is None:
-             local_max_samples = max_samples
          all_results = {}
 
          # --- Fashion-MNIST ---
@@ -1109,6 +787,7 @@ class CategoryModelEvaluator:
              kaggle_dataset = load_kaggle_marqo_with_hierarchy(
                  max_samples=max_samples,
                  hierarchy_classes=self.validation_hierarchy_classes or self.hierarchy_classes,
+                 raw_df=self.kaggle_raw_df,
              )
              if kaggle_dataset is not None and len(kaggle_dataset) > 0:
                  kaggle_dataloader = DataLoader(kaggle_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
@@ -1126,16 +805,17 @@ class CategoryModelEvaluator:
          # --- Internal (local validation) ---
          try:
              local_dataset = load_local_validation_with_hierarchy(
-                 max_samples=local_max_samples,
+                 max_samples=max_samples,
                  hierarchy_classes=self.validation_hierarchy_classes or self.hierarchy_classes,
+                 raw_df=self.local_raw_df,
              )
              if local_dataset is not None and len(local_dataset) > 0:
                  local_dataloader = DataLoader(local_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
                  all_results['local_gap'] = self.evaluate_gap_clip_generic(
-                     local_dataloader, "Internal", local_max_samples,
+                     local_dataloader, "Internal", max_samples,
                  )
                  all_results['local_baseline'] = self.evaluate_baseline_generic(
-                     local_dataloader, "Internal", local_max_samples,
+                     local_dataloader, "Internal", max_samples,
                  )
              else:
                  print("WARNING: Local validation dataset empty after hierarchy filtering, skipping.")
@@ -1161,13 +841,13 @@ class CategoryModelEvaluator:
              if 'text_hierarchy' in res:
                  t = res['text_hierarchy']
                  i = res['image_hierarchy']
-                 print(f" Text NN Acc: {t['nn_accuracy']*100:.1f}% | Separation: {t['separation_score']:.4f}")
-                 print(f" Image NN Acc: {i['nn_accuracy']*100:.1f}% | Separation: {i['separation_score']:.4f}")
+                 print(f" Text NN Acc: {t['accuracy']*100:.1f}% | Separation: {t['separation_score']:.4f}")
+                 print(f" Image NN Acc: {i['accuracy']*100:.1f}% | Separation: {i['separation_score']:.4f}")
              elif 'text' in res:
                  t = res['text']['hierarchy']
                  i = res['image']['hierarchy']
-                 print(f" Text NN Acc: {t['nn_accuracy']*100:.1f}% | Separation: {t['separation_score']:.4f}")
-                 print(f" Image NN Acc: {i['nn_accuracy']*100:.1f}% | Separation: {i['separation_score']:.4f}")
+                 print(f" Text NN Acc: {t['accuracy']*100:.1f}% | Separation: {t['separation_score']:.4f}")
+                 print(f" Image NN Acc: {i['accuracy']*100:.1f}% | Separation: {i['separation_score']:.4f}")
 
          return all_results
@@ -1180,33 +860,8 @@ if __name__ == "__main__":
      device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
      print(f"Using device: {device}")
 
-     directory = 'figures/confusion_matrices/cm_hierarchy'
+     directory = 'gap_clip_confusion_matrices'
      max_samples = 10000
-     local_max_samples = 1000
 
      evaluator = CategoryModelEvaluator(device=device, directory=directory)
-
-     # # Full evaluation including Fashion-MNIST and KAGL Marqo (skipped — CMs already generated)
-     # evaluator.run_full_evaluation(max_samples=max_samples, local_max_samples=local_max_samples, batch_size=8)
-
-     # Evaluate only the local/internal dataset
-     local_dataset = load_local_validation_with_hierarchy(
-         max_samples=local_max_samples,
-         hierarchy_classes=evaluator.validation_hierarchy_classes or evaluator.hierarchy_classes,
-     )
-     if local_dataset is not None and len(local_dataset) > 0:
-         local_dl = DataLoader(local_dataset, batch_size=8, shuffle=False, num_workers=0)
-         results_gap = evaluator.evaluate_gap_clip_generic(local_dl, "Internal", local_max_samples)
-         results_base = evaluator.evaluate_baseline_generic(local_dl, "Internal", local_max_samples)
-
-         print(f"\n{'=' * 60}")
-         print("INTERNAL DATASET — HIERARCHY EVALUATION SUMMARY")
-         print(f"{'=' * 60}")
-         print(f"\nGAP-CLIP:")
-         print(f" Text NN Acc: {results_gap['text_hierarchy']['nn_accuracy']*100:.1f}% | Separation: {results_gap['text_hierarchy']['separation_score']:.4f}")
-         print(f" Image NN Acc: {results_gap['image_hierarchy']['nn_accuracy']*100:.1f}% | Separation: {results_gap['image_hierarchy']['separation_score']:.4f}")
-         print(f"\nBaseline:")
-         print(f" Text NN Acc: {results_base['text']['hierarchy']['nn_accuracy']*100:.1f}% | Separation: {results_base['text']['hierarchy']['separation_score']:.4f}")
-         print(f" Image NN Acc: {results_base['image']['hierarchy']['nn_accuracy']*100:.1f}% | Separation: {results_base['image']['hierarchy']['separation_score']:.4f}")
-     else:
-         print("WARNING: Local validation dataset empty after hierarchy filtering.")
+     evaluator.run_full_evaluation(max_samples=max_samples, batch_size=8)
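Taken together, the constructor changes make repeated runs cheaper: models and source DataFrames can be loaded once and shared between evaluator instances. An illustrative driver under those assumptions (not part of the commit; `local_dataset_path` comes from config, and the directory names are arbitrary):

# Illustration only: reuse pre-loaded models and cached DataFrames across runs.
import pandas as pd
import torch
from datasets import load_dataset

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
kaggle_df = load_dataset("Marqo/KAGL")["data"].to_pandas()  # download once
local_df = pd.read_csv(local_dataset_path)                  # read once

evaluator = CategoryModelEvaluator(
    device=device,
    directory='gap_clip_confusion_matrices',
    kaggle_raw_df=kaggle_df,   # skips the HuggingFace download in run_full_evaluation
    local_raw_df=local_df,     # skips the CSV re-read
)
evaluator.run_full_evaluation(max_samples=10000, batch_size=8)

# A second pass (e.g., a different output directory) can reuse the loaded models:
rerun = CategoryModelEvaluator(
    device=device, directory='cm_rerun',
    gap_clip_model=evaluator.model, gap_clip_processor=evaluator.processor,
    baseline_model=evaluator.baseline_model, baseline_processor=evaluator.baseline_processor,
    kaggle_raw_df=kaggle_df, local_raw_df=local_df,
)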
 