Ali Mohsin committed
Commit fac18b7 · 1 Parent(s): acc496a
Files changed (5)
  1. Dockerfile +2 -2
  2. app.py +96 -28
  3. data/polyvore.py +16 -7
  4. inference.py +42 -29
  5. utils/data_fetch.py +36 -129
Dockerfile CHANGED
@@ -15,10 +15,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 
 WORKDIR /app
 
-COPY recommendation/requirements.txt /app/requirements.txt
+COPY requirements.txt /app/requirements.txt
 RUN pip install --upgrade pip && pip install -r /app/requirements.txt
 
-COPY recommendation /app/
+COPY . /app/
 
 EXPOSE 8000
 EXPOSE 7860
app.py CHANGED
@@ -47,13 +47,16 @@ service = InferenceService()
 
 # Non-blocking bootstrap: fetch data, prepare splits, and train if needed in background
 BOOT_STATUS = "idle"
+DATASET_ROOT: Optional[str] = None
 
 
 def _background_bootstrap():
     global BOOT_STATUS
+    global DATASET_ROOT
     try:
         BOOT_STATUS = "preparing-dataset"
         ds_root = ensure_dataset_ready()
+        DATASET_ROOT = ds_root
         if not ds_root:
             BOOT_STATUS = "dataset-not-prepared"
             return
@@ -182,40 +185,104 @@ def gradio_embed(files: List[str]):
     return str([e.tolist() for e in embs])
 
 
-def gradio_compose(files: List[str], occasion: str, weather: str, num_outfits: int):
+def _stitch_strip(imgs: List[Image.Image], height: int = 256, pad: int = 6, bg=(245, 245, 245)) -> Image.Image:
+    if not imgs:
+        return Image.new("RGB", (1, height), color=bg)
+    resized = []
+    for im in imgs:
+        if im.mode != "RGB":
+            im = im.convert("RGB")
+        w, h = im.size
+        scale = height / float(h)
+        nw = max(1, int(w * scale))
+        resized.append(im.resize((nw, height)))
+    total_w = sum(im.size[0] for im in resized) + pad * (len(resized) + 1)
+    out = Image.new("RGB", (total_w, height + 2 * pad), color=bg)
+    x = pad
+    for im in resized:
+        out.paste(im, (x, pad))
+        x += im.size[0] + pad
+    return out
+
+
+def gradio_recommend(files: List[str], occasion: str, weather: str, num_outfits: int):
+    # Return stitched outfit images and a JSON with details
     if not files:
-        return []
+        return [], {"error": "No files uploaded"}
     images = _load_images_from_files(files)
     if not images:
-        return []
-    embs = service.embed_images(images)
+        return [], {"error": "Could not load images"}
+    # Build items that allow on-the-fly embedding in service
     items = [
-        {"id": f"item_{i}", "embedding": embs[i], "category": None, "image_url": None}
-        for i in range(len(embs))
+        {"id": f"item_{i}", "image": images[i], "category": None}
+        for i in range(len(images))
     ]
-    results = service.compose_outfits(items, context={"occasion": occasion, "weather": weather, "num_outfits": int(num_outfits)})
-    # Render as a simple markdown summary
-    lines = []
-    for r in results:
-        lines.append(f"score={r['score']:.3f}, items={r['item_ids']}")
-    return "\n".join(lines)
-
-
-with gr.Blocks() as demo:
-    gr.Markdown("# Dressify Recommendations – HF Test UI")
-    with gr.Tab("Embed"):
+    res = service.compose_outfits(items, context={"occasion": occasion, "weather": weather, "num_outfits": int(num_outfits)})
+    # Prepare stitched previews
+    strips: List[Image.Image] = []
+    for r in res:
+        idxs = []
+        for iid in r.get("item_ids", []):
+            try:
+                idxs.append(int(str(iid).split("_")[-1]))
+            except Exception:
+                continue
+        imgs = [images[i] for i in idxs if 0 <= i < len(images)]
+        strips.append(_stitch_strip(imgs))
+    return strips, {"outfits": res}
+
+
+with gr.Blocks(fill_height=True) as demo:
+    gr.Markdown("## Dressify – Outfit Recommendations\nUpload multiple item images and generate complete looks.")
+    with gr.Tab("Recommend"):
+        inp2 = gr.Files(label="Upload wardrobe images", file_types=["image"], file_count="multiple")
+        with gr.Row():
+            occasion = gr.Dropdown(choices=["casual", "business", "formal", "sport"], value="casual", label="Occasion")
+            weather = gr.Dropdown(choices=["any", "hot", "mild", "cold", "rain"], value="any", label="Weather")
+            num_outfits = gr.Slider(minimum=1, maximum=8, step=1, value=3, label="Num outfits")
+        out_gallery = gr.Gallery(label="Recommended Outfits", columns=1, height=320)
+        out_json = gr.JSON(label="Details")
+        btn2 = gr.Button("Generate Outfits", variant="primary")
+        btn2.click(fn=gradio_recommend, inputs=[inp2, occasion, weather, num_outfits], outputs=[out_gallery, out_json])
+    with gr.Tab("Embed (debug)"):
         inp = gr.Files(label="Upload Items (multiple images)")
         out = gr.Textbox(label="Embeddings (JSON)")
         btn = gr.Button("Compute Embeddings")
         btn.click(fn=gradio_embed, inputs=inp, outputs=out)
-    with gr.Tab("Compose"):
-        inp2 = gr.Files(label="Upload Wardrobe (multiple images)")
-        occasion = gr.Dropdown(choices=["casual", "business", "formal", "sport"], value="casual", label="Occasion")
-        weather = gr.Dropdown(choices=["any", "hot", "mild", "cold", "rain"], value="any", label="Weather")
-        num_outfits = gr.Slider(minimum=1, maximum=10, step=1, value=3, label="Num outfits")
-        out2 = gr.Textbox(label="Recommendations")
-        btn2 = gr.Button("Generate")
-        btn2.click(fn=gradio_compose, inputs=[inp2, occasion, weather, num_outfits], outputs=out2)
+    with gr.Tab("Train"):
+        gr.Markdown("Train models on Stylique/Polyvore (70/10/10 split). This runs on the Space hardware.")
+        epochs_res = gr.Slider(1, 50, value=10, step=1, label="ResNet epochs")
+        epochs_vit = gr.Slider(1, 100, value=20, step=1, label="ViT epochs")
+        train_log = gr.Textbox(label="Training Log", lines=10)
+        start_btn = gr.Button("Start Training")
+
+        def start_training(res_epochs: int, vit_epochs: int):
+            def _runner():
+                try:
+                    import subprocess
+                    if not DATASET_ROOT:
+                        train_log.value = "Dataset not ready."
+                        return
+                    export_dir = os.getenv("EXPORT_DIR", "models/exports")
+                    os.makedirs(export_dir, exist_ok=True)
+                    train_log.value = "Training ResNet…\n"
+                    subprocess.run([
+                        "python", "train_resnet.py", "--data_root", DATASET_ROOT, "--epochs", str(res_epochs),
+                        "--out", os.path.join(export_dir, "resnet_item_embedder.pth")
+                    ], check=False)
+                    train_log.value += "\nTraining ViT (triplet)…\n"
+                    subprocess.run([
+                        "python", "train_vit_triplet.py", "--data_root", DATASET_ROOT, "--epochs", str(vit_epochs),
+                        "--export", os.path.join(export_dir, "vit_outfit_model.pth")
+                    ], check=False)
+                    service.reload_models()
+                    train_log.value += "\nDone. Artifacts in models/exports."
+                except Exception as e:
+                    train_log.value += f"\nError: {e}"
+            threading.Thread(target=_runner, daemon=True).start()
+            return "Started"
+
+        start_btn.click(fn=start_training, inputs=[epochs_res, epochs_vit], outputs=train_log)
     with gr.Tab("Downloads"):
        gr.Markdown("Download trained artifacts from models/exports")
        file_list = gr.JSON(label="Artifacts JSON")
@@ -241,7 +308,8 @@ with gr.Blocks() as demo:
 
 
 try:
-    # Mount Gradio onto FastAPI root path
+    # Mount Gradio onto FastAPI root path (disable SSR to avoid stray port fetches)
+    demo.queue()
     app = gr.mount_gradio_app(app, demo, path="/")
 except Exception:
     # In case mounting fails in certain runners, we still want FastAPI to be available
@@ -257,7 +325,7 @@ except Exception:
 
 
 if __name__ == "__main__":
-    # Local testing
-    demo.launch()
+    # Local/Space run
+    demo.queue().launch(ssr_mode=False)
 
 
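The Recommend tab's callback now returns two values: a list of PIL images for the `gr.Gallery` and a dict for the `gr.JSON` component. Below is a minimal, self-contained sketch of that two-output contract; `fake_recommend` and its solid-color strips are hypothetical stand-ins for `gradio_recommend` and the real stitched previews.

```python
# Sketch of the gallery + JSON callback contract used by the Recommend tab.
# Assumes gradio and Pillow are installed; fake_recommend is a stand-in.
import gradio as gr
from PIL import Image

def fake_recommend(n: int):
    # One solid-color "outfit strip" per requested outfit, plus a details dict,
    # mirroring gradio_recommend's (strips, {"outfits": res}) return shape.
    colors = [(200, 80, 80), (80, 200, 80), (80, 80, 200)]
    strips = [Image.new("RGB", (512, 128), colors[i % 3]) for i in range(int(n))]
    details = {"outfits": [{"score": 0.5, "item_ids": []} for _ in strips]}
    return strips, details

with gr.Blocks() as demo:
    n = gr.Slider(1, 3, value=2, step=1, label="Num outfits")
    gallery = gr.Gallery(label="Outfits")
    info = gr.JSON(label="Details")
    gr.Button("Go").click(fn=fake_recommend, inputs=n, outputs=[gallery, info])

if __name__ == "__main__":
    demo.launch()
```

The handler must return a tuple whose order matches the `outputs=[...]` list; Gradio routes the image list to the gallery and the dict to the JSON viewer.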
data/polyvore.py CHANGED
@@ -7,6 +7,7 @@ from torch.utils.data import Dataset
 from PIL import Image
 
 from utils.transforms import build_train_transforms
+from pathlib import Path
 
 
 class PolyvoreTripletDataset(Dataset):
@@ -31,11 +32,21 @@ class PolyvoreTripletDataset(Dataset):
         with open(triplet_path, "r") as f:
             self.samples: List[Dict[str, Any]] = json.load(f)
 
+    def _find_image_path(self, item_id: str) -> str:
+        base = os.path.join(self.root, "images")
+        # direct common extensions
+        for ext in (".jpg", ".jpeg", ".png", ".webp"):
+            p = os.path.join(base, f"{item_id}{ext}")
+            if os.path.isfile(p):
+                return p
+        # recursive fuzzy search
+        for p in Path(base).rglob(f"*{item_id}*"):
+            if p.suffix.lower() in (".jpg", ".jpeg", ".png", ".webp"):
+                return str(p)
+        raise FileNotFoundError(f"Image for item {item_id} not found under {base}")
+
     def _load_image(self, item_id: str) -> Image.Image:
-        # Customize if images are arranged differently
-        img_path = os.path.join(self.root, "images", f"{item_id}.jpg")
-        if not os.path.exists(img_path):
-            raise FileNotFoundError(img_path)
+        img_path = self._find_image_path(item_id)
         return Image.open(img_path).convert("RGB")
 
     def __len__(self) -> int:
@@ -74,9 +85,7 @@ class PolyvoreOutfitDataset(Dataset):
     # If metadata isn't available, we will rely on count >= 3 and let model learn; here, keep as-is.
 
     def _load_image(self, item_id: str) -> Image.Image:
-        img_path = os.path.join(self.root, "images", f"{item_id}.jpg")
-        if not os.path.exists(img_path):
-            raise FileNotFoundError(img_path)
+        img_path = PolyvoreTripletDataset._find_image_path(self, item_id)  # reuse logic
         return Image.open(img_path).convert("RGB")
 
     def __len__(self) -> int:
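The new `_find_image_path` resolves an item id in two passes: a direct probe for `images/<id>.<ext>` over common extensions, then a recursive fuzzy `rglob` fallback for archives that unpack into nested folders. A minimal standalone sketch of that lookup order, with a hypothetical `find_image` helper and `images_root` path:

```python
# Sketch of the two-pass image lookup added to the Polyvore datasets.
import os
from pathlib import Path

EXTS = (".jpg", ".jpeg", ".png", ".webp")

def find_image(images_root: str, item_id: str) -> str:
    # 1) fast path: images/<id>.<ext> for each common extension
    for ext in EXTS:
        p = os.path.join(images_root, f"{item_id}{ext}")
        if os.path.isfile(p):
            return p
    # 2) slow path: recursive fuzzy match anywhere under images/
    for p in Path(images_root).rglob(f"*{item_id}*"):
        if p.suffix.lower() in EXTS:
            return str(p)
    raise FileNotFoundError(f"Image for item {item_id} not found under {images_root}")

# e.g. find_image("./data/Polyvore/images", "123456")
```

The fallback keeps loading alive when image layouts vary between dataset mirrors, at the cost of a full directory walk on every cache miss.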
inference.py CHANGED
@@ -89,54 +89,67 @@ class InferenceService:
 
     @torch.inference_mode()
     def compose_outfits(self, items: List[Dict[str, Any]], context: Dict[str, Any]) -> List[Dict[str, Any]]:
-        # Ensure embeddings
+        # 1) Ensure embeddings for each input item
         proc_items: List[Dict[str, Any]] = []
         for it in items:
-            e = it.get("embedding")
-            if e is None and it.get("image") is not None:
-                # Not used in Gradio path, but kept for completeness
+            emb = it.get("embedding")
+            if emb is None and it.get("image") is not None:
+                # Compute on-the-fly if image provided
                 emb = self.embed_images([it["image"]])[0]
-            elif e is None:
-                # If missing embedding and image, skip
+            if emb is None:
+                # Skip if we cannot get an embedding
                 continue
-            else:
-                emb = np.asarray(e, dtype=np.float32)
-            proc_items.append({"id": it.get("id"), "embedding": emb, "category": it.get("category")})
+            emb_np = np.asarray(emb, dtype=np.float32)
+            proc_items.append({
+                "id": it.get("id"),
+                "embedding": emb_np,
+                "category": it.get("category")
+            })
 
         if len(proc_items) < 2:
             return []
 
-        # Candidate generation: enforce minimum slots (upper, bottom, shoes, accessory) if categories provided
-        rng = np.random.default_rng(42)
+        # 2) Candidate generation
+        rng = np.random.default_rng(int(context.get("seed", 42)))
         num_outfits = int(context.get("num_outfits", 3))
-        min_size, max_size = 3, 5
-        candidates: List[List[int]] = []
+        min_size, max_size = 4, 6
         ids = list(range(len(proc_items)))
-        # slot-aware sampling if categories exist
-        def has_cat(i: int, cat_prefix: str) -> bool:
-            c = (proc_items[i].get("category") or "").lower()
-            return cat_prefix in c
 
-        uppers = [i for i in ids if any(k in (proc_items[i].get("category") or "").lower() for k in ["top", "shirt", "tshirt", "blouse", "jacket", "hoodie"])]
-        bottoms = [i for i in ids if any(k in (proc_items[i].get("category") or "").lower() for k in ["pant", "trouser", "jean", "skirt", "short"])]
-        shoes = [i for i in ids if "shoe" in (proc_items[i].get("category") or "").lower()]
-        accs = [i for i in ids if any(k in (proc_items[i].get("category") or "").lower() for k in ["watch", "belt", "ring", "bracelet", "accessor"])]
+        # Slot-aware pools from categories (best-effort)
+        def cat_str(i: int) -> str:
+            return (proc_items[i].get("category") or "").lower()
 
-        for _ in range(num_outfits * 10):
+        uppers = [i for i in ids if any(k in cat_str(i) for k in ["top", "shirt", "tshirt", "blouse", "jacket", "hoodie"])]
+        bottoms = [i for i in ids if any(k in cat_str(i) for k in ["pant", "trouser", "jean", "skirt", "short"])]
+        shoes = [i for i in ids if any(k in cat_str(i) for k in ["shoe", "sneaker", "boot", "heel"])]
+        accs = [i for i in ids if any(k in cat_str(i) for k in ["watch", "belt", "ring", "bracelet", "accessor", "bag", "hat"])]
+
+        candidates: List[List[int]] = []
+        num_samples = max(num_outfits * 12, 24)
+        for _ in range(num_samples):
             if uppers and bottoms and shoes and accs:
-                subset = [rng.choice(uppers).item(), rng.choice(bottoms).item(), rng.choice(shoes).item(), rng.choice(accs).item()]
-                # optional: add one more random
+                subset = [
+                    int(rng.choice(uppers)),
+                    int(rng.choice(bottoms)),
+                    int(rng.choice(shoes)),
+                    int(rng.choice(accs)),
+                ]
+                # Optional: add one more random distinct item
                 remain = list(set(ids) - set(subset))
                 if remain and rng.random() < 0.5:
-                    subset.append(rng.choice(remain).item())
+                    subset.append(int(rng.choice(remain)))
             else:
-                k = rng.integers(min_size, max_size + 1)
-                subset = rng.choice(ids, size=int(k), replace=False).tolist()
+                k = int(rng.integers(min_size, max_size + 1))
+                subset = list(map(int, rng.choice(ids, size=k, replace=False).tolist()))
             candidates.append(subset)
 
-        # Score using ViT
+        # 3) Score using ViT
         def score_subset(idx_subset: List[int]) -> float:
-            embs = torch.tensor(np.stack([proc_items[i]["embedding"] for i in idx_subset]), dtype=torch.float32, device=self.device)
+            embs = torch.tensor(
+                np.stack([proc_items[i]["embedding"] for i in idx_subset], axis=0),
+                dtype=torch.float32,
+                device=self.device,
+            )  # (N, D)
             embs = embs.unsqueeze(0)  # (1, N, D)
             s = self.vit.score_compatibility(embs).item()
             return float(s)
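`compose_outfits` is a sample-then-score search: draw random candidate subsets (slot-aware when categories are known, otherwise 4–6 items uniformly), score each with the ViT compatibility head, and keep the best. A self-contained sketch of that loop, using mean pairwise cosine similarity as a stand-in for `self.vit.score_compatibility`:

```python
# Sketch of the sample-then-score loop in compose_outfits.
# The cosine scorer is a hypothetical stand-in for the ViT head.
import numpy as np

rng = np.random.default_rng(42)
embs = rng.normal(size=(8, 16)).astype(np.float32)  # 8 items, D=16
embs /= np.linalg.norm(embs, axis=1, keepdims=True)

def score(subset):
    # stand-in compatibility: mean pairwise cosine similarity within the subset
    sub = embs[subset]
    sims = sub @ sub.T
    n = len(subset)
    return float((sims.sum() - np.trace(sims)) / (n * (n - 1)))

ids = list(range(len(embs)))
# uniform fallback path: subsets of 4..6 distinct items, as in min_size/max_size
candidates = [list(map(int, rng.choice(ids, size=int(rng.integers(4, 7)), replace=False)))
              for _ in range(24)]
ranked = sorted(candidates, key=score, reverse=True)
for subset in ranked[:3]:
    print(subset, round(score(subset), 3))
```

Casting `rng.choice(...)` results through `int(...)`, as the commit does, keeps the item ids JSON-serializable instead of leaking `numpy` scalar types into the API response.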
utils/data_fetch.py CHANGED
@@ -1,148 +1,55 @@
 import os
-import shutil
 import zipfile
 from pathlib import Path
-from typing import Optional, List
+from typing import Optional
 
-import requests
-
-try:
-    from huggingface_hub import snapshot_download  # type: ignore
-except Exception:  # pragma: no cover
-    snapshot_download = None
+from huggingface_hub import snapshot_download  # type: ignore
 
-try:
-    import kagglehub  # type: ignore
-    from kagglehub import KaggleDatasetAdapter  # type: ignore
-except Exception:  # pragma: no cover
-    kagglehub = None
-    KaggleDatasetAdapter = None
-
-
-def _download_zip(url: str, dest_dir: str) -> str:
-    os.makedirs(dest_dir, exist_ok=True)
-    local_zip = os.path.join(dest_dir, "dataset.zip")
-    with requests.get(url, stream=True, timeout=60) as r:
-        r.raise_for_status()
-        with open(local_zip, "wb") as f:
-            for chunk in r.iter_content(chunk_size=1024 * 1024):
-                if chunk:
-                    f.write(chunk)
-    with zipfile.ZipFile(local_zip, "r") as zf:
-        zf.extractall(dest_dir)
-    os.remove(local_zip)
-    return dest_dir
-
-
-def _unzip_inner_archives(root: str) -> None:
-    """Find and extract any zip files inside root (e.g., images.zip)."""
-    for dirpath, _dirnames, filenames in os.walk(root):
-        for fn in filenames:
-            if fn.lower().endswith(".zip"):
-                zpath = os.path.join(dirpath, fn)
-                try:
-                    with zipfile.ZipFile(zpath, "r") as zf:
-                        zf.extractall(dirpath)
-                    # keep original zip to avoid repeated work? remove to save disk
-                    try:
-                        os.remove(zpath)
-                    except Exception:
-                        pass
-                except Exception as e:  # pragma: no cover
-                    print(f"Failed to unzip inner archive {zpath}: {e}")
-
-
-def _ensure_images_dir(root: str) -> None:
-    """Ensure a stable images/ path exists under root. Create a symlink if needed."""
-    images_root = os.path.join(root, "images")
-    if os.path.isdir(images_root):
+def _unzip_images_if_needed(root: str) -> None:
+    """
+    If an archive like images.zip exists in the dataset root, extract it to root/images.
+    """
+    images_dir = os.path.join(root, "images")
+    if os.path.isdir(images_dir) and any(Path(images_dir).glob("*")):
         return
-    # Try to find a folder with many jpg/png files
-    candidate_dirs: List[str] = []
-    for dirpath, dirnames, filenames in os.walk(root):
-        if dirpath == root:
-            # skip root level files, look deeper
-            continue
-        img_files = [f for f in filenames if f.lower().endswith((".jpg", ".jpeg", ".png"))]
-        if len(img_files) > 1000:  # heuristic: big image folder
-            candidate_dirs.append(dirpath)
-    # Prefer the shallowest candidate
-    candidate_dirs.sort(key=lambda p: len(Path(p).parts))
-    if candidate_dirs:
-        src = candidate_dirs[0]
-        try:
-            os.symlink(src, images_root)
-            print(f"Created images symlink: {images_root} -> {src}")
-        except Exception:
-            # fallback: create folder and leave it empty (training will fail fast if missing)
-            os.makedirs(images_root, exist_ok=True)
-    else:
-        os.makedirs(images_root, exist_ok=True)
+    # Common zip names at root or subfolders
+    candidates = [os.path.join(root, name) for name in ("images.zip", "polyvore-images.zip", "imgs.zip")]
+    # Also search recursively for any *images*.zip
+    for p in Path(root).rglob("*images*.zip"):
+        candidates.append(str(p))
+    for zpath in candidates:
+        if os.path.isfile(zpath):
+            os.makedirs(images_dir, exist_ok=True)
+            with zipfile.ZipFile(zpath, "r") as zf:
+                zf.extractall(images_dir)
+            return
 
 
 def ensure_dataset_ready() -> Optional[str]:
     """
-    Ensure Polyvore dataset is present locally.
-    Priority:
-    1) If POLYVORE_ROOT exists and has splits, return it
-    2) Try Hugging Face dataset repo (defaults to Stylique/Polyvore if not set)
-    3) If DATA_ZIP_URL is set, download and unzip
-    4) Try KaggleHub (best-effort)
-    Returns resolved root path or None if nothing done.
+    Self-contained dataset fetcher for the Polyvore dataset from Hugging Face.
+    - Downloads the dataset repo Stylique/Polyvore into ./data/Polyvore
+    - Unzips images.zip into ./data/Polyvore/images
+    - Returns the dataset root path
     """
-    root = os.getenv("POLYVORE_ROOT", "./data/Polyvore")
-    auto_fetch = os.getenv("AUTO_FETCH_DATA", "true").lower() == "true"
+    root = os.path.abspath(os.path.join(os.getcwd(), "data", "Polyvore"))
     Path(root).mkdir(parents=True, exist_ok=True)
 
-    # Already prepared?
-    if os.path.isdir(os.path.join(root, "splits")):
-        _unzip_inner_archives(root)
-        _ensure_images_dir(root)
+    # If already present, ensure images are unzipped and return
+    _unzip_images_if_needed(root)
+    if os.path.isdir(os.path.join(root, "images")):
         return root
-    if not auto_fetch:
-        return None
 
-    # Try HF dataset repo
-    repo = os.getenv("HF_DATASET_REPO", "Stylique/Polyvore")
-    if repo and snapshot_download is not None:
-        try:
-            snapshot_download(repo, repo_type="dataset", local_dir=root)
-            _unzip_inner_archives(root)
-            _ensure_images_dir(root)
-            # If splits not provided, they'll be prepared by the caller
-            return root
-        except Exception as e:  # pragma: no cover
-            print(f"HF dataset download failed: {e}")
-
-    # Try ZIP URL
-    zip_url = os.getenv("DATA_ZIP_URL")
-    if zip_url:
-        try:
-            _download_zip(zip_url, root)
-            _unzip_inner_archives(root)
-            _ensure_images_dir(root)
-        except Exception as e:  # pragma: no cover
-            print(f"ZIP download failed: {e}")
-            return None
-
-    # Try KaggleHub (no Kaggle keys required for public datasets)
-    if kagglehub is not None and KaggleDatasetAdapter is not None:
-        try:
-            # Attempt to load core file to trigger dataset download locally
-            # User can override POLYVORE_FILE_PATH to select a specific CSV/JSON
-            file_path = os.getenv("POLYVORE_FILE_PATH", "")
-            kagglehub.load_dataset(
-                KaggleDatasetAdapter.PANDAS,
-                "dnepozitek/polyvore-outfits",
-                file_path,
-            )
-            # KaggleHub stores under ~/.cache/kagglehub/datasets/<slug>/...; copy to root if needed
-            # For simplicity, assume user will run prepare script using POLYVORE_ROOT pointing to extracted images
-            _unzip_inner_archives(root)
-            _ensure_images_dir(root)
-        except Exception as e:  # pragma: no cover
-            print(f"KaggleHub download failed: {e}")
+    # Download the HF dataset snapshot into root
+    try:
+        snapshot_download("Stylique/Polyvore", repo_type="dataset", local_dir=root, local_dir_use_symlinks=False)
+    except Exception as e:  # pragma: no cover
+        print(f"Failed to download Stylique/Polyvore dataset: {e}")
+        return None
 
-    return root
+    # Unzip images if needed
+    _unzip_images_if_needed(root)
+    return root if os.path.isdir(os.path.join(root, "images")) else None
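The rewritten `ensure_dataset_ready` is now a fixed pipeline: snapshot the `Stylique/Polyvore` dataset repo into `./data/Polyvore`, then extract any `*images*.zip` archive into `images/`. A minimal sketch of the same flow, assuming `huggingface_hub` is installed and the dataset repo is reachable (the `local_dir_use_symlinks` flag is omitted here for brevity):

```python
# Sketch of the fetch-then-unzip flow; paths mirror the hard-coded
# ./data/Polyvore layout that ensure_dataset_ready() now assumes.
import os
import zipfile
from pathlib import Path

from huggingface_hub import snapshot_download

root = os.path.abspath("data/Polyvore")
Path(root).mkdir(parents=True, exist_ok=True)

# Download the dataset repo snapshot into the local root
snapshot_download("Stylique/Polyvore", repo_type="dataset", local_dir=root)

# Extract an images archive, if the repo ships one and images/ is not yet populated
zips = list(Path(root).rglob("*images*.zip"))
if zips and not os.path.isdir(os.path.join(root, "images")):
    with zipfile.ZipFile(zips[0]) as zf:
        zf.extractall(os.path.join(root, "images"))

print("dataset root:", root)
```

Compared to the old multi-backend fallback (env-configured root, ZIP URL, KaggleHub), the single hard-coded source trades flexibility for a predictable layout on the Space.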