jesseab commited on
Commit
0178b19
·
1 Parent(s): 546a8f8

Code updates

Browse files
inference_brain2vec_PCA.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ inference_brain2vec_PCA.py
5
+
6
+ Loads a pre-trained PCA-based Brain2Vec model (saved with joblib) and performs
7
+ inference on one or more input images. Produces embeddings (and optional
8
+ reconstructions) for each image.
9
+
10
+ Example usage:
11
+
12
+ python inference_brain2vec_PCA.py \
13
+ --pca_model /path/to/pca_model.joblib \
14
+ --input_images /path/to/img1.nii.gz /path/to/img2.nii.gz \
15
+ --output_dir /path/to/out
16
+
17
+ Or, if you have a CSV with image paths:
18
+
19
+ python inference_brain2vec_PCA.py \
20
+ --pca_model /path/to/pca_model.joblib \
21
+ --csv_input /path/to/images.csv \
22
+ --output_dir /path/to/out
23
+ """
24
+
25
+ import os
26
+ import argparse
27
+ import numpy as np
28
+ import torch
29
+ import torch.nn as nn
30
+ from joblib import load
31
+ import pandas as pd
32
+
33
+ from monai.transforms import (
34
+ Compose,
35
+ CopyItemsD,
36
+ LoadImageD,
37
+ EnsureChannelFirstD,
38
+ SpacingD,
39
+ ResizeWithPadOrCropD,
40
+ ScaleIntensityD,
41
+ )
42
+
43
# Voxel resolution (mm) used when resampling input volumes.
RESOLUTION = 2

# Target spatial shape (D, H, W) after padding/cropping.
INPUT_SHAPE_AE = (80, 96, 80)

# Number of voxels in one flattened volume (80 * 96 * 80 = 614400).
FLATTENED_DIM = INPUT_SHAPE_AE[0] * INPUT_SHAPE_AE[1] * INPUT_SHAPE_AE[2]

# Reusable MONAI preprocessing pipeline: copy 'image_path' to 'image', load
# the volume, add a channel axis, resample to RESOLUTION, pad/crop to the
# fixed shape, and scale intensities into [0, 1].
_PREPROCESS_STEPS = [
    CopyItemsD(keys={'image_path'}, names=['image']),
    LoadImageD(image_only=True, keys=['image']),
    EnsureChannelFirstD(keys=['image']),
    SpacingD(pixdim=RESOLUTION, keys=['image']),
    ResizeWithPadOrCropD(spatial_size=INPUT_SHAPE_AE, mode='minimum', keys=['image']),
    ScaleIntensityD(minv=0, maxv=1, keys=['image']),
]
transforms_fn = Compose(_PREPROCESS_STEPS)
57
+
58
+
59
def preprocess_mri(image_path: str) -> torch.Tensor:
    """
    Preprocess an MRI with the shared MONAI pipeline into a 5D tensor.

    Args:
        image_path (str): Path to the MRI file (e.g., .nii.gz).

    Returns:
        torch.Tensor: Float tensor of shape (batch=1, channel=1, D, H, W).
    """
    transformed = transforms_fn({"image_path": image_path})
    # The pipeline yields a (1, D, H, W) volume; prepend a batch axis
    # and cast to float32 for downstream torch code.
    return transformed["image"].unsqueeze(0).float()
75
+
76
+
77
class PCABrain2vec(nn.Module):
    """
    A PCA-based 'autoencoder' that mimics a typical VAE interface:
      - from_pretrained(...) loads a fitted PCA model from disk
      - forward(...) returns (reconstruction, embedding, None)

    Pipeline:
      1. Flatten the input volume (N, 1, D, H, W) => (N, 614400).
      2. PCA transform => embeddings of shape (N, n_components).
      3. PCA inverse transform => reconstructions of shape (N, 614400).
      4. Reshape back => (N, 1, D, H, W).
    """

    def __init__(self, pca_model=None):
        super().__init__()
        # Fitted scikit-learn PCA/IncrementalPCA instance (or None).
        self.pca_model = pca_model

    def forward(self, x: torch.Tensor):
        """
        Run the PCA-based "autoencoder" on a batch of volumes.

        Args:
            x (torch.Tensor): Input of shape (N, 1, D, H, W).

        Returns:
            tuple(torch.Tensor, torch.Tensor, None):
                - reconstruction: (N, 1, D, H, W)
                - embedding: (N, n_components)
                - None (placeholder to match the usual VAE interface).
        """
        batch = x.shape[0]
        # scikit-learn works on CPU numpy arrays: flatten each volume.
        flat = x.detach().cpu().numpy().reshape(batch, -1)

        # Project into the PCA latent space, then reconstruct.
        codes = self.pca_model.transform(flat)
        recon = self.pca_model.inverse_transform(codes)
        recon = recon.reshape(batch, 1, *INPUT_SHAPE_AE)

        # Hand back torch tensors so callers can stay in torch-land.
        return (
            torch.from_numpy(recon).float(),
            torch.from_numpy(codes).float(),
            None,
        )

    @staticmethod
    def from_pretrained(pca_path: str) -> "PCABrain2vec":
        """
        Load a pre-trained PCA model (pickled or joblib) from disk.

        Args:
            pca_path (str): File path to the PCA model.

        Returns:
            PCABrain2vec: An instance wrapping the loaded PCA model.

        Raises:
            FileNotFoundError: If no file exists at ``pca_path``.
        """
        if not os.path.exists(pca_path):
            raise FileNotFoundError(f"Could not find PCA model at {pca_path}")
        return PCABrain2vec(pca_model=load(pca_path))
139
+
140
+
141
def main() -> None:
    """
    Parse command-line arguments and run inference with a pre-trained
    PCA Brain2Vec model, saving per-image reconstructions and a stacked
    embeddings array to ``--output_dir``.

    Raises:
        ValueError: If neither --csv_input nor --input_images is given,
            if the CSV lacks an 'image_path' column, or if no image paths
            are found (e.g., an empty CSV).
        FileNotFoundError: If any referenced image file does not exist.
    """
    parser = argparse.ArgumentParser(
        description="PCA-based Brain2Vec Inference Script"
    )
    parser.add_argument(
        "--pca_model", type=str, required=True,
        help="Path to the saved PCA model (.joblib)."
    )
    parser.add_argument(
        "--output_dir", type=str, default="./pca_inference_outputs",
        help="Directory to save embeddings/reconstructions."
    )
    # Two ways to supply images: multiple files or a CSV
    parser.add_argument(
        "--input_images", type=str, nargs="*",
        help="One or more image paths for inference."
    )
    parser.add_argument(
        "--csv_input", type=str, default=None,
        help="Path to a CSV containing column 'image_path'."
    )
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    # Build the PCA model wrapper from the serialized scikit-learn model.
    pca_brain2vec = PCABrain2vec.from_pretrained(args.pca_model)
    pca_brain2vec.eval()

    # Gather image paths from either the CSV or the command line.
    if args.csv_input:
        df = pd.read_csv(args.csv_input)
        if "image_path" not in df.columns:
            raise ValueError("CSV must contain a column named 'image_path'.")
        image_paths = df["image_path"].tolist()
    else:
        if not args.input_images:
            raise ValueError(
                "Must provide either --csv_input or --input_images."
            )
        image_paths = args.input_images

    # Fail fast with a clear message: an empty CSV would otherwise skip the
    # loop and crash in np.vstack with an obscure concatenation error.
    if not image_paths:
        raise ValueError("No input images found; nothing to run inference on.")

    # Inference loop: one forward pass per image.
    all_embeddings = []
    for i, img_path in enumerate(image_paths):
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"Image not found: {img_path}")

        # Preprocess into a (1, 1, D, H, W) float tensor.
        img_tensor = preprocess_mri(img_path)

        # No gradients needed for PCA-based inference.
        with torch.no_grad():
            recon, embedding, _ = pca_brain2vec(img_tensor)

        # Collect one embedding row per image for the stacked output.
        all_embeddings.append(embedding.detach().cpu().numpy())

        # Save each reconstruction as its own .npy file.
        out_recon_path = os.path.join(
            args.output_dir, f"reconstruction_{i}.npy"
        )
        np.save(out_recon_path, recon.detach().cpu().numpy())
        print(f"[INFO] Saved reconstruction to: {out_recon_path}")

    # Save all embeddings stacked into one (N, n_components) array.
    stacked_embeddings = np.vstack(all_embeddings)
    out_embed_path = os.path.join(args.output_dir, "all_pca_embeddings.npy")
    np.save(out_embed_path, stacked_embeddings)
    print(f"[INFO] Saved embeddings of shape {stacked_embeddings.shape} to: {out_embed_path}")


if __name__ == "__main__":
    main()
model.py DELETED
@@ -1,115 +0,0 @@
1
- # model.py
2
- import os
3
- import numpy as np
4
- import torch
5
- import torch.nn as nn
6
-
7
- from monai.transforms import (
8
- Compose,
9
- CopyItemsD,
10
- LoadImageD,
11
- EnsureChannelFirstD,
12
- SpacingD,
13
- ResizeWithPadOrCropD,
14
- ScaleIntensityD,
15
- )
16
-
17
- # If you used joblib or pickle to save your PCA model:
18
- from joblib import load # or "import pickle"
19
-
20
- #################################################
21
- # Constants
22
- #################################################
23
- RESOLUTION = 2
24
- INPUT_SHAPE_AE = (80, 96, 80) # The typical shape from your pipelines
25
- FLATTENED_DIM = INPUT_SHAPE_AE[0] * INPUT_SHAPE_AE[1] * INPUT_SHAPE_AE[2]
26
-
27
-
28
- #################################################
29
- # Define MONAI Transforms for Preprocessing
30
- #################################################
31
- transforms_fn = Compose([
32
- CopyItemsD(keys={'image_path'}, names=['image']),
33
- LoadImageD(image_only=True, keys=['image']),
34
- EnsureChannelFirstD(keys=['image']),
35
- SpacingD(pixdim=RESOLUTION, keys=['image']),
36
- ResizeWithPadOrCropD(spatial_size=INPUT_SHAPE_AE, mode='minimum', keys=['image']),
37
- ScaleIntensityD(minv=0, maxv=1, keys=['image']),
38
- ])
39
-
40
-
41
- def preprocess_mri(image_path: str) -> torch.Tensor:
42
- """
43
- Preprocess an MRI using MONAI transforms to produce
44
- a 5D Torch tensor: (batch=1, channel=1, D, H, W).
45
- """
46
- data_dict = {"image_path": image_path}
47
- output_dict = transforms_fn(data_dict)
48
- # shape => (1, D, H, W)
49
- image_tensor = output_dict["image"].unsqueeze(0) # => (batch=1, channel=1, D, H, W)
50
- return image_tensor.float() # typically float32
51
-
52
-
53
- #################################################
54
- # PCA "Autoencoder" Wrapper
55
- #################################################
56
- class PCABrain2vec(nn.Module):
57
- """
58
- A PCA-based 'autoencoder' that mimics the old interface:
59
- - from_pretrained(...) to load a PCA model from disk
60
- - forward(...) returns (reconstruction, embedding, None)
61
-
62
- Under the hood, it:
63
- - takes in a torch tensor shape (N, 1, D, H, W)
64
- - flattens it (N, 614400)
65
- - uses PCA's transform(...) to get embeddings => shape (N, n_components)
66
- - uses inverse_transform(...) to get reconstructions => shape (N, 614400)
67
- - reshapes back to (N, 1, D, H, W)
68
- """
69
-
70
- def __init__(self, pca_model=None):
71
- super().__init__()
72
- # We'll store the fitted PCA model (from scikit-learn)
73
- self.pca_model = pca_model # e.g., an instance of IncrementalPCA or PCA
74
-
75
- def forward(self, x: torch.Tensor):
76
- """
77
- Returns (reconstruction, embedding, None).
78
-
79
- 1) Convert x => numpy array => flatten => (N, 614400)
80
- 2) embedding = pca_model.transform(flat_x)
81
- 3) reconstruction_np = pca_model.inverse_transform(embedding)
82
- 4) reshape => (N, 1, 80, 96, 80)
83
- 5) convert to torch => return (recon, embed, None)
84
- """
85
- # Expect x shape => (N, 1, D, H, W) => flatten to (N, D*H*W)
86
- n_samples = x.shape[0]
87
- # Convert to CPU np
88
- x_cpu = x.detach().cpu().numpy() # shape: (N, 1, D, H, W)
89
- x_flat = x_cpu.reshape(n_samples, -1) # shape: (N, 614400)
90
-
91
- # PCA transform => embeddings shape (N, n_components)
92
- embedding_np = self.pca_model.transform(x_flat)
93
-
94
- # PCA inverse_transform => recon shape (N, 614400)
95
- recon_np = self.pca_model.inverse_transform(embedding_np)
96
- # Reshape back => (N, 1, 80, 96, 80)
97
- recon_np = recon_np.reshape(n_samples, 1, *INPUT_SHAPE_AE)
98
-
99
- # Convert back to torch
100
- reconstruction_torch = torch.from_numpy(recon_np).float()
101
- embedding_torch = torch.from_numpy(embedding_np).float()
102
- return reconstruction_torch, embedding_torch, None
103
-
104
- @staticmethod
105
- def from_pretrained(pca_path: str):
106
- """
107
- Load a pre-trained PCA model (pickled or joblib).
108
- Returns an instance of PCABrain2vec with that model.
109
- """
110
- if not os.path.exists(pca_path):
111
- raise FileNotFoundError(f"Could not find PCA model at {pca_path}")
112
- # Example: pca_model = pickle.load(open(pca_path, 'rb'))
113
- # or use joblib:
114
- pca_model = load(pca_path)
115
- return PCABrain2vec(pca_model=pca_model)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,12 +1,15 @@
1
  # requirements.txt
2
 
3
- # PyTorch (CUDA or CPU version). For GPU install, see PyTorch docs for the correct wheel.
4
  torch>=1.12
5
 
6
- # MONAI v1.2+ has the 'generative' subpackage with AutoencoderKL, PatchDiscriminator, etc.
7
- monai-weekly
8
  monai-generative
9
 
 
 
 
 
10
  # For perceptual losses in MONAI's generative module.
11
  lpips
12
 
 
1
  # requirements.txt
2
 
3
+ # PyTorch (CUDA or CPU version).
4
  torch>=1.12
5
 
6
+ # Install MONAI Generative first
 
7
  monai-generative
8
 
9
+ # Now force reinstall MONAI Weekly so its (newer) MONAI version takes precedence
10
+ --force-reinstall
11
+ monai-weekly
12
+
13
  # For perceptual losses in MONAI's generative module.
14
  lpips
15
 
brain2vec_PCA.py → train_brain2vec_PCA.py RENAMED
@@ -1,101 +1,115 @@
1
  #!/usr/bin/env python3
2
 
3
  """
4
- pca_autoencoder.py
5
 
6
- Adjustments requested:
7
- 1. Only fit on scans with a 'train' label in the inputs.csv 'split' column.
8
- 2. An option to either run incremental PCA or standard PCA.
9
 
10
  Example usage:
11
- python pca_autoencoder.py \
12
  --inputs_csv /path/to/inputs.csv \
13
  --output_dir ./pca_outputs \
14
  --pca_type standard \
15
- --n_components 100
16
  """
17
 
18
  import os
19
  import argparse
20
  import numpy as np
21
  import pandas as pd
22
-
23
  import torch
24
  from torch.utils.data import DataLoader
25
-
26
  from monai import transforms
27
  from monai.data import Dataset, PersistentDataset
28
-
29
- # We'll import both PCA classes, and decide which to use based on CLI arg.
30
  from sklearn.decomposition import PCA, IncrementalPCA
 
31
 
32
-
33
- ###################################################################
34
- # Constants for your typical config
35
- ###################################################################
36
  RESOLUTION = 2
 
 
37
  INPUT_SHAPE_AE = (80, 96, 80)
 
38
  DEFAULT_N_COMPONENTS = 1200
39
 
40
 
41
- ###################################################################
42
- # Helper: get_dataset_from_pd (same as in brain2vec_linearAE.py)
43
- ###################################################################
44
- def get_dataset_from_pd(df: pd.DataFrame, transforms_fn, cache_dir: str):
 
45
  """
46
- Returns a monai.data.Dataset or monai.data.PersistentDataset
47
- if `cache_dir` is defined, to speed up loading.
 
 
 
 
 
 
 
48
  """
 
49
  if cache_dir and cache_dir.strip():
50
  os.makedirs(cache_dir, exist_ok=True)
51
- dataset = PersistentDataset(data=df.to_dict(orient='records'),
52
- transform=transforms_fn,
53
- cache_dir=cache_dir)
 
 
54
  else:
55
- dataset = Dataset(data=df.to_dict(orient='records'),
56
- transform=transforms_fn)
57
  return dataset
58
 
59
 
60
- ###################################################################
61
- # PCAAutoencoder
62
- ###################################################################
63
  class PCAAutoencoder:
64
  """
65
  A PCA 'autoencoder' that can use either standard PCA or IncrementalPCA:
66
  - fit(X): trains the model
67
  - transform(X): get embeddings
68
  - inverse_transform(Z): reconstruct data from embeddings
69
- - forward(X): returns (X_recon, Z)
70
-
71
- If using standard PCA, we do a single call to .fit(X).
72
- If using incremental PCA, we do .partial_fit on data in batches.
73
  """
74
- def __init__(self, n_components=DEFAULT_N_COMPONENTS, batch_size=128, pca_type='incremental'):
 
 
 
 
 
 
75
  """
 
 
76
  Args:
77
- n_components (int): number of principal components to keep
78
- batch_size (int): chunk size for either partial_fit or chunked .transform
79
- pca_type (str): 'incremental' or 'standard'
80
  """
81
  self.n_components = n_components
82
  self.batch_size = batch_size
83
  self.pca_type = pca_type.lower()
84
 
85
- if self.pca_type == 'standard':
86
- self.ipca = PCA(n_components=self.n_components, svd_solver='randomized')
87
- else:
88
- # default to incremental
89
  self.ipca = IncrementalPCA(n_components=self.n_components)
 
 
 
90
 
91
- def fit(self, X: np.ndarray):
92
  """
93
- Fit the PCA model. If incremental, calls partial_fit in batches.
94
- If standard, calls .fit once on the entire data matrix.
95
- X: shape (n_samples, n_features)
 
 
96
  """
97
  if self.pca_type == 'standard':
98
- # Potentially large memory usage, so be sure your system can handle it.
99
  self.ipca.fit(X)
100
  else:
101
  # IncrementalPCA
@@ -107,7 +121,12 @@ class PCAAutoencoder:
107
  def transform(self, X: np.ndarray) -> np.ndarray:
108
  """
109
  Project data into the PCA latent space in batches for memory efficiency.
110
- Returns Z with shape (n_samples, n_components)
 
 
 
 
 
111
  """
112
  results = []
113
  n_samples = X.shape[0]
@@ -120,7 +139,12 @@ class PCAAutoencoder:
120
  def inverse_transform(self, Z: np.ndarray) -> np.ndarray:
121
  """
122
  Reconstruct data from PCA latent space in batches.
123
- Returns X_recon with shape (n_samples, n_features).
 
 
 
 
 
124
  """
125
  results = []
126
  n_samples = Z.shape[0]
@@ -130,80 +154,113 @@ class PCAAutoencoder:
130
  results.append(X_chunk)
131
  return np.vstack(results)
132
 
133
- def forward(self, X: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
134
  """
135
- Mimics a linear AE's forward() returning (X_recon, Z).
 
 
 
 
 
 
136
  """
137
  Z = self.transform(X)
138
  X_recon = self.inverse_transform(Z)
139
  return X_recon, Z
140
 
141
 
142
- ###################################################################
143
- # Load and Flatten Data
144
- ###################################################################
145
- def load_and_flatten_dataset(csv_path: str, cache_dir: str, transforms_fn) -> np.ndarray:
 
146
  """
 
 
147
  1) Reads CSV.
148
- 2) Filters rows if 'split' in columns => only keep 'split' == 'train'.
149
- 3) Applies transforms to each image, flattening them into a 1D vector (614,400).
150
- 4) Returns a NumPy array X: shape (n_samples, 614400).
 
 
 
 
 
 
 
 
 
151
  """
152
  df = pd.read_csv(csv_path)
153
 
154
- # Filter only 'train' if the split column exists
155
  if 'split' in df.columns:
156
  df = df[df['split'] == 'train']
157
- # If there is no 'split' column, we assume the entire CSV is for training.
158
 
159
  dataset = get_dataset_from_pd(df, transforms_fn, cache_dir)
160
  loader = DataLoader(dataset, batch_size=1, num_workers=0)
161
 
162
- # We'll store each flattened volume in a list, then stack
163
  X_list = []
164
  for batch in loader:
165
- # batch["image"] shape => (1, 1, 80, 96, 80)
166
- img = batch["image"].squeeze(0) # => (1, 80, 96, 80)
167
- img_np = img.numpy()
168
- flattened = img_np.flatten() # => (614400,)
169
  X_list.append(flattened)
170
 
171
- if len(X_list) == 0:
172
- raise ValueError("No training samples found (split='train'). Check your CSV or 'split' values.")
 
 
173
 
174
  X = np.vstack(X_list)
175
  return X
176
 
177
 
178
- ###################################################################
179
- # Main
180
- ###################################################################
181
- def main():
182
- parser = argparse.ArgumentParser(description="PCA Autoencoder with MONAI transforms and 'split' filtering.")
183
- parser.add_argument("--inputs_csv", type=str, required=True,
184
- help="Path to CSV with at least 'image_path' column, optional 'split' column.")
185
- parser.add_argument("--cache_dir", type=str, default="",
186
- help="Cache directory for MONAI PersistentDataset (optional).")
187
- parser.add_argument("--output_dir", type=str, default="./pca_outputs",
188
- help="Where to save PCA model and embeddings.")
189
- parser.add_argument("--batch_size_ipca", type=int, default=128,
190
- help="Batch size for partial_fit or chunked transform.")
191
- parser.add_argument("--n_components", type=int, default=1200,
192
- help="Number of PCA components to keep.")
193
- parser.add_argument("--pca_type", type=str, default="incremental",
194
- choices=["incremental", "standard"],
195
- help="Which PCA algorithm to use: 'incremental' or 'standard'.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  args = parser.parse_args()
197
 
198
  os.makedirs(args.output_dir, exist_ok=True)
199
 
200
- # define transforms as in brain2vec_linearAE.py
201
  transforms_fn = transforms.Compose([
202
  transforms.CopyItemsD(keys={'image_path'}, names=['image']),
203
  transforms.LoadImageD(image_only=True, keys=['image']),
204
  transforms.EnsureChannelFirstD(keys=['image']),
205
  transforms.SpacingD(pixdim=RESOLUTION, keys=['image']),
206
- transforms.ResizeWithPadOrCropD(spatial_size=INPUT_SHAPE_AE, mode='minimum', keys=['image']),
 
 
207
  transforms.ScaleIntensityD(minv=0, maxv=1, keys=['image']),
208
  ])
209
 
@@ -225,10 +282,10 @@ def main():
225
 
226
  # Get embeddings & reconstruction
227
  X_recon, Z = model.forward(X)
228
- print("Embeddings shape:", Z.shape) # (n_samples, n_components)
229
- print("Reconstruction shape:", X_recon.shape) # (n_samples, 614400)
230
 
231
- # Save
232
  embeddings_path = os.path.join(args.output_dir, "pca_embeddings.npy")
233
  recons_path = os.path.join(args.output_dir, "pca_reconstructions.npy")
234
  np.save(embeddings_path, Z)
 
1
  #!/usr/bin/env python3
2
 
3
  """
4
+ train_brain2vec_PCA.py
5
 
6
+ A PCA-based "autoencoder" script for brain MRI data, with support for both
7
+ incremental PCA and standard PCA. Only scans labeled 'train' in the CSV
8
+ (split == 'train') will be used for fitting.
9
 
10
  Example usage:
11
+ python train_brain2vec_PCA.py \
12
  --inputs_csv /path/to/inputs.csv \
13
  --output_dir ./pca_outputs \
14
  --pca_type standard \
15
+ --n_components 1200
16
  """
17
 
18
  import os
19
  import argparse
20
  import numpy as np
21
  import pandas as pd
 
22
  import torch
23
  from torch.utils.data import DataLoader
 
24
  from monai import transforms
25
  from monai.data import Dataset, PersistentDataset
26
+ from monai.transforms.transform import Transform
 
27
  from sklearn.decomposition import PCA, IncrementalPCA
28
+ from typing import Optional, Union, Tuple
29
 
30
+ # voxel resolution
 
 
 
31
  RESOLUTION = 2
32
+
33
+ # cropped image dimensions after transform
34
  INPUT_SHAPE_AE = (80, 96, 80)
35
+
36
  DEFAULT_N_COMPONENTS = 1200
37
 
38
 
39
+ def get_dataset_from_pd(
40
+ df: pd.DataFrame,
41
+ transforms_fn: Transform,
42
+ cache_dir: Optional[str]
43
+ ) -> Union[Dataset, PersistentDataset]:
44
  """
45
+ Create a MONAI Dataset or PersistentDataset from the given DataFrame.
46
+
47
+ Args:
48
+ df (pd.DataFrame): DataFrame with at least 'image_path' column.
49
+ transforms_fn (Transform): MONAI transform pipeline.
50
+ cache_dir (Optional[str]): If provided, use PersistentDataset caching.
51
+
52
+ Returns:
53
+ Dataset|PersistentDataset: A dataset for training or inference.
54
  """
55
+ data_dicts = df.to_dict(orient='records')
56
  if cache_dir and cache_dir.strip():
57
  os.makedirs(cache_dir, exist_ok=True)
58
+ dataset = PersistentDataset(
59
+ data=data_dicts,
60
+ transform=transforms_fn,
61
+ cache_dir=cache_dir
62
+ )
63
  else:
64
+ dataset = Dataset(data=data_dicts, transform=transforms_fn)
 
65
  return dataset
66
 
67
 
 
 
 
68
  class PCAAutoencoder:
69
  """
70
  A PCA 'autoencoder' that can use either standard PCA or IncrementalPCA:
71
  - fit(X): trains the model
72
  - transform(X): get embeddings
73
  - inverse_transform(Z): reconstruct data from embeddings
74
+ - forward(X): returns (X_recon, Z).
75
+
76
+ If using standard PCA, a single call to .fit(X) is made.
77
+ If using incremental PCA, .partial_fit is called in batches.
78
  """
79
+
80
+ def __init__(
81
+ self,
82
+ n_components: int = DEFAULT_N_COMPONENTS,
83
+ batch_size: int = 128,
84
+ pca_type: str = 'standard'
85
+ ) -> None:
86
  """
87
+ Initialize the PCAAutoencoder.
88
+
89
  Args:
90
+ n_components (int): Number of principal components to keep.
91
+ batch_size (int): Chunk size for partial_fit or chunked transform.
92
+ pca_type (str): Either 'incremental' or 'standard'.
93
  """
94
  self.n_components = n_components
95
  self.batch_size = batch_size
96
  self.pca_type = pca_type.lower()
97
 
98
+ if self.pca_type == 'incremental':
 
 
 
99
  self.ipca = IncrementalPCA(n_components=self.n_components)
100
+ else:
101
+ # Default to standard PCA
102
+ self.ipca = PCA(n_components=self.n_components, svd_solver='randomized')
103
 
104
+ def fit(self, X: np.ndarray) -> None:
105
  """
106
+ Fit the PCA model. If incremental PCA, calls partial_fit in batches;
107
+ otherwise calls .fit once on the entire data array.
108
+
109
+ Args:
110
+ X (np.ndarray): Shape (n_samples, n_features).
111
  """
112
  if self.pca_type == 'standard':
 
113
  self.ipca.fit(X)
114
  else:
115
  # IncrementalPCA
 
121
  def transform(self, X: np.ndarray) -> np.ndarray:
122
  """
123
  Project data into the PCA latent space in batches for memory efficiency.
124
+
125
+ Args:
126
+ X (np.ndarray): Shape (n_samples, n_features).
127
+
128
+ Returns:
129
+ np.ndarray: Latent embeddings of shape (n_samples, n_components).
130
  """
131
  results = []
132
  n_samples = X.shape[0]
 
139
  def inverse_transform(self, Z: np.ndarray) -> np.ndarray:
140
  """
141
  Reconstruct data from PCA latent space in batches.
142
+
143
+ Args:
144
+ Z (np.ndarray): Latent embeddings of shape (n_samples, n_components).
145
+
146
+ Returns:
147
+ np.ndarray: Reconstructed data of shape (n_samples, n_features).
148
  """
149
  results = []
150
  n_samples = Z.shape[0]
 
154
  results.append(X_chunk)
155
  return np.vstack(results)
156
 
157
+ def forward(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
158
  """
159
+ Mimic a linear AE's forward() returning (X_recon, Z).
160
+
161
+ Args:
162
+ X (np.ndarray): Original data of shape (n_samples, n_features).
163
+
164
+ Returns:
165
+ tuple[np.ndarray, np.ndarray]: (X_recon, Z).
166
  """
167
  Z = self.transform(X)
168
  X_recon = self.inverse_transform(Z)
169
  return X_recon, Z
170
 
171
 
172
+ def load_and_flatten_dataset(
173
+ csv_path: str,
174
+ cache_dir: str,
175
+ transforms_fn: Transform
176
+ ) -> np.ndarray:
177
  """
178
+ Load and flatten MRI volumes from the provided CSV.
179
+
180
  1) Reads CSV.
181
+ 2) Filters rows if 'split' in columns => only keep rows with split == 'train'.
182
+ 3) Applies transforms to each image, flattening them into a 1D vector.
183
+ 4) Returns a NumPy array X of shape (n_samples, 614400) after flattening.
184
+
185
+ Args:
186
+ csv_path (str): Path to a CSV containing at least 'image_path' column.
187
+ Optionally has a 'split' column.
188
+ cache_dir (str): Path to cache directory for MONAI PersistentDataset.
189
+ transforms_fn (Transform): MONAI transform pipeline.
190
+
191
+ Returns:
192
+ np.ndarray: Flattened image data of shape (n_samples, 614400).
193
  """
194
  df = pd.read_csv(csv_path)
195
 
196
+ # Keep only 'train' samples if split column exists
197
  if 'split' in df.columns:
198
  df = df[df['split'] == 'train']
 
199
 
200
  dataset = get_dataset_from_pd(df, transforms_fn, cache_dir)
201
  loader = DataLoader(dataset, batch_size=1, num_workers=0)
202
 
 
203
  X_list = []
204
  for batch in loader:
205
+ # batch["image"] => shape (1, 1, 80, 96, 80)
206
+ img = batch["image"].squeeze(0) # => shape (1, 80, 96, 80)
207
+ flattened = img.numpy().flatten() # => (614400,)
 
208
  X_list.append(flattened)
209
 
210
+ if not X_list:
211
+ raise ValueError(
212
+ "No training samples found (split='train'). Check your CSV or 'split' values."
213
+ )
214
 
215
  X = np.vstack(X_list)
216
  return X
217
 
218
 
219
+ def main() -> None:
220
+ """
221
+ Main function to parse command-line arguments and fit a PCA or IncrementalPCA model,
222
+ then save embeddings and reconstructions.
223
+ """
224
+ parser = argparse.ArgumentParser(
225
+ description="PCA Autoencoder with MONAI transforms and 'split' filtering."
226
+ )
227
+ parser.add_argument(
228
+ "--inputs_csv", type=str, required=True,
229
+ help="Path to CSV with at least 'image_path' column and optional 'split' column."
230
+ )
231
+ parser.add_argument(
232
+ "--cache_dir", type=str, default="",
233
+ help="Cache directory for MONAI PersistentDataset (optional)."
234
+ )
235
+ parser.add_argument(
236
+ "--output_dir", type=str, default="./pca_outputs",
237
+ help="Where to save PCA model and embeddings."
238
+ )
239
+ parser.add_argument(
240
+ "--batch_size_ipca", type=int, default=128,
241
+ help="Batch size for partial_fit or chunked transform."
242
+ )
243
+ parser.add_argument(
244
+ "--n_components", type=int, default=1200,
245
+ help="Number of PCA components to keep."
246
+ )
247
+ parser.add_argument(
248
+ "--pca_type", type=str, default="incremental",
249
+ choices=["incremental", "standard"],
250
+ help="Which PCA algorithm to use: 'incremental' or 'standard'."
251
+ )
252
  args = parser.parse_args()
253
 
254
  os.makedirs(args.output_dir, exist_ok=True)
255
 
 
256
  transforms_fn = transforms.Compose([
257
  transforms.CopyItemsD(keys={'image_path'}, names=['image']),
258
  transforms.LoadImageD(image_only=True, keys=['image']),
259
  transforms.EnsureChannelFirstD(keys=['image']),
260
  transforms.SpacingD(pixdim=RESOLUTION, keys=['image']),
261
+ transforms.ResizeWithPadOrCropD(
262
+ spatial_size=INPUT_SHAPE_AE, mode='minimum', keys=['image']
263
+ ),
264
  transforms.ScaleIntensityD(minv=0, maxv=1, keys=['image']),
265
  ])
266
 
 
282
 
283
  # Get embeddings & reconstruction
284
  X_recon, Z = model.forward(X)
285
+ print("Embeddings shape:", Z.shape)
286
+ print("Reconstruction shape:", X_recon.shape)
287
 
288
+ # Save embeddings and reconstructions
289
  embeddings_path = os.path.join(args.output_dir, "pca_embeddings.npy")
290
  recons_path = os.path.join(args.output_dir, "pca_reconstructions.npy")
291
  np.save(embeddings_path, Z)