Morelli001 committed · Commit aee1a39 · verified · 1 Parent(s): feee4b6

Upload folder using huggingface_hub

45.png ADDED
README.md ADDED
@@ -0,0 +1,109 @@
# FILMUnet2D (Transformers-compatible)

This model is a 2D U-Net with FiLM conditioning for multi-organ segmentation.

## Installation

Make sure you have `transformers`, `torch`, and `pillow` installed (the usage example below loads images with PIL).

```bash
pip install transformers torch pillow
```

## Usage

You can load the model and processor using the `Auto` classes from `transformers`. Since this repository contains custom code, make sure to pass `trust_remote_code=True`.

```python
import torch
from transformers import AutoModel, AutoImageProcessor
from PIL import Image

# 1. Load model and processor
repo_id = "Morelli001/US_UNet2DFiLM"

processor = AutoImageProcessor.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
model.eval()

# 2. Load and preprocess your image
# The processor handles resizing, letterboxing, and normalization.
image = Image.open("path/to/your/image.png").convert("RGB")
inputs = processor(images=image, return_tensors="pt")

# 3. Prepare conditioning input
# This should be an integer tensor representing the organ ID.
# Replace `4` with the appropriate ID for your use case (see the Organ IDs section below).
organ_id = torch.tensor([4])

# 4. Run inference
with torch.no_grad():
    outputs = model(**inputs, organ_id=organ_id)

# 5. Post-process the output to get the final segmentation mask
# The processor can convert the logits to a binary mask, automatically handling
# the removal of letterbox padding and resizing to the original image dimensions.
mask = processor.post_process_semantic_segmentation(
    outputs,
    inputs,
    threshold=0.7,
    return_as_pil=True
)[0]

# 6. Save the result
mask.save("output_mask.png")

print("Segmentation mask saved to output_mask.png")
```
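Inference can optionally run on a GPU. A minimal sketch, assuming the same `model`, `processor`, `inputs`, and `organ_id` as above:

```python
# Optional: move the model and tensors to a GPU if one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

with torch.no_grad():
    outputs = model(
        pixel_values=inputs["pixel_values"].to(device),
        organ_id=organ_id.to(device),
    )

# Post-processing reuses the letterbox metadata kept in `inputs`.
mask = processor.post_process_semantic_segmentation(
    outputs, inputs, threshold=0.7, return_as_pil=True
)[0]
```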

### Model Details

- **Architecture:** U-Net with FiLM layers for conditional segmentation.
- **Conditioning:** The model's output is conditioned on an `organ_id` input.
- **Input:** RGB images.
- **Output:** A single-channel segmentation mask.

### Configuration

The model configuration can be accessed via `model.config`. Key parameters include:
- `in_channels`: Number of input channels (default: 3).
- `num_classes`: Number of output classes (default: 1).
- `n_organs`: The number of different organs the model was trained to condition on.
- `depth`: The depth of the U-Net (number of encoder stages; see the sketch below).
- `size`: The base number of filters in the first layer.

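As a rough guide to what `size` and `depth` mean in practice, the encoder widens by a factor of two per stage. A small sketch, assuming the stage formula used in `modeling_film_unet2d.py`:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Morelli001/US_UNet2DFiLM", trust_remote_code=True)

# Each encoder stage i ends with size * 2**(i + 1) output channels.
widths = [config.size * 2 ** (i + 1) for i in range(config.depth)]
print(widths)  # [64, 128, 256, 512, 1024] for size=32, depth=5
```
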
### Organ IDs

The `organ_id` passed to the model corresponds to the following mapping:

```python
organ_to_class_dict = {
    "appendix": 0,
    "breast": 1,
    "breast_luminal": 1,
    "cardiac": 2,
    "thyroid": 3,
    "fetal": 4,
    "kidney": 5,
    "liver": 6,
    "testicle": 7,
}
```

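For readability, the conditioning tensor can be built from this mapping rather than from a hard-coded integer:

```python
import torch

# Build the conditioning tensor from the mapping above.
organ_id = torch.tensor([organ_to_class_dict["kidney"]])  # tensor([5])
```
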
### Alternative Versions

This repository contains multiple versions of the model located in subfolders. You can load a specific version by using the `subfolder` parameter.

#### 4-Stage U-Net

This version has a U-Net depth of 4.

```python
from transformers import AutoModel

model_4_stages = AutoModel.from_pretrained(
    "Morelli001/US_UNet2DFiLM",
    subfolder="unet_4_stages",
    trust_remote_code=True
)
```
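The matching image processor can be loaded from the same subfolder, as the bundled `test_4_stages.py` does:

```python
from transformers import AutoImageProcessor

processor_4_stages = AutoImageProcessor.from_pretrained(
    "Morelli001/US_UNet2DFiLM",
    subfolder="unet_4_stages",
    trust_remote_code=True
)
```

The bundled `test_5_stages_testicles.py` follows the same pattern with a `unet_5_stages_testicles` subfolder.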
config.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "model_type": "film_unet2d",
3
+ "architectures": ["FilmUnet2DModel"],
4
+ "auto_map": {
5
+ "AutoConfig": "configuration_film_unet2d.FilmUnet2DConfig",
6
+ "AutoModel": "modeling_film_unet2d.FilmUnet2DModel",
7
+ "AutoImageProcessor": "image_processing_film_unet2d.FilmUnet2DImageProcessor"
8
+ },
9
+ "in_channels": 3,
10
+ "num_classes": 1,
11
+ "n_organs": 9,
12
+ "size": 32,
13
+ "depth": 5,
14
+ "film_start": 0,
15
+ "use_film": 1
16
+ }
configuration_film_unet2d.py ADDED
@@ -0,0 +1,23 @@
1
+
2
+ from transformers.configuration_utils import PretrainedConfig
3
+
4
+ class FilmUnet2DConfig(PretrainedConfig):
5
+ model_type = "film_unet2d"
6
+
7
+ def __init__(self,
8
+ in_channels=3,
9
+ num_classes=1,
10
+ n_organs=9,
11
+ size=32,
12
+ depth=5,
13
+ film_start=0,
14
+ use_film=True,
15
+ **kwargs):
16
+ super().__init__(**kwargs)
17
+ self.in_channels = in_channels
18
+ self.num_classes = num_classes
19
+ self.n_organs = n_organs
20
+ self.size = size
21
+ self.depth = depth
22
+ self.film_start = film_start
23
+ self.use_film = use_film
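One way to build a randomly initialized variant from this configuration is to override fields when loading it and construct the model from the config. A sketch, assuming `AutoConfig.from_pretrained` forwards keyword overrides such as `depth` onto the config:

```python
from transformers import AutoConfig, AutoModel

# Sketch: override `depth` and build an untrained model from the resulting config.
config = AutoConfig.from_pretrained(
    "Morelli001/US_UNet2DFiLM", trust_remote_code=True, depth=4
)
model = AutoModel.from_config(config, trust_remote_code=True)
```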
image_processing_film_unet2d.py ADDED
@@ -0,0 +1,218 @@
1
+ # image_processing_film_unet2d.py
2
+ from typing import List, Union, Tuple, Optional
3
+ import numpy as np
4
+ from PIL import Image
5
+ import torch
6
+ from transformers.image_processing_utils import ImageProcessingMixin
7
+
8
+ ArrayLike = Union[np.ndarray, torch.Tensor, Image.Image]
9
+
10
+ def _to_rgb_numpy(im: ArrayLike) -> np.ndarray:
11
+ # -> float32 HWC in [0,255], 3 channels
12
+ if isinstance(im, Image.Image):
13
+ if im.mode != "RGB":
14
+ im = im.convert("RGB")
15
+ arr = np.array(im, dtype=np.uint8).astype(np.float32)
16
+ elif isinstance(im, torch.Tensor):
17
+ t = im.detach().cpu()
18
+ if t.ndim != 3:
19
+ raise ValueError("Tensor must be 3D (CHW or HWC).")
20
+ if t.shape[0] in (1, 3): # CHW
21
+ if t.shape[0] == 1:
22
+ t = t.repeat(3, 1, 1)
23
+ t = t.permute(1, 2, 0) # HWC
24
+ elif t.shape[-1] == 1: # HWC gray
25
+ t = t.repeat(1, 1, 3)
26
+ arr = t.numpy()
27
+ if arr.dtype in (np.float32, np.float64) and arr.max() <= 1.5:
28
+ arr = (arr * 255.0).astype(np.float32)
29
+ else:
30
+ arr = arr.astype(np.float32)
31
+ else:
32
+ arr = np.array(im)
33
+ if arr.ndim == 2:
34
+ arr = np.repeat(arr[..., None], 3, axis=-1)
35
+ arr = arr.astype(np.float32)
36
+ if arr.max() <= 1.5:
37
+ arr = (arr * 255.0).astype(np.float32)
38
+ if arr.ndim != 3 or arr.shape[-1] != 3:
39
+ raise ValueError("Expected RGB image with shape HxWx3.")
40
+ return arr
41
+
42
+ def _letterbox_keep_ratio(arr: np.ndarray, target_hw: Tuple[int, int]):
43
+ """Resize with aspect ratio preserved and pad with 0 (black) to target (H,W).
44
+ Returns: out(H,W,3), (top, left, new_h, new_w)
45
+ """
46
+ th, tw = target_hw
47
+ h, w = arr.shape[:2]
48
+ scale = min(th / h, tw / w)
49
+ nh, nw = int(round(h * scale)), int(round(w * scale))
50
+ if nh <= 0 or nw <= 0:
51
+ raise ValueError("Invalid resize result.")
52
+ pil = Image.fromarray(np.clip(arr, 0, 255).astype(np.uint8))
53
+ pil = pil.resize((nw, nh), resample=Image.BILINEAR)
54
+ rs = np.array(pil, dtype=np.float32)
55
+ out = np.zeros((th, tw, 3), dtype=np.float32)
56
+ top = (th - nh) // 2
57
+ left = (tw - nw) // 2
58
+ out[top:top+nh, left:left+nw] = rs
59
+ return out, (top, left, nh, nw)
60
+
61
+ def _zscore_ignore_black(chw: np.ndarray, eps: float = 1e-8) -> np.ndarray:
62
+ mask = (chw.sum(axis=0) > 0) # HxW
63
+ if not mask.any():
64
+ return chw.copy()
65
+ valid = chw[:, mask]
66
+ mean = valid.mean()
67
+ std = valid.std()
68
+ return (chw - mean) / std if std > eps else (chw - mean)
69
+
70
+ class FilmUnet2DImageProcessor(ImageProcessingMixin):
71
+ """
72
+ Processor for FILMUnet2D:
73
+ - Convert to RGB
74
+ - Keep-aspect-ratio resize+pad (letterbox) to 512x512 (configurable)
75
+ - Normalize with mean/std in 0–255 space (like your training)
76
+ - Optional z-score 'self_norm' ignoring black pixels
77
+ Returns dict with:
78
+ - pixel_values: torch.FloatTensor [B,3,H,W]
79
+ - original_sizes: torch.LongTensor [B,2] (H,W)
80
+ - letterbox_params: torch.LongTensor [B,4] (top, left, nh, nw) # NEW
81
+ """
82
+
83
+ model_input_names = ["pixel_values"]
84
+
85
+ def __init__(
86
+ self,
87
+ do_resize: bool = True,
88
+ size: Tuple[int, int] = (512, 512),
89
+ keep_ratio: bool = True,
90
+ image_mean: Tuple[float, float, float] = (123.675, 116.28, 103.53),
91
+ image_std: Tuple[float, float, float] = (58.395, 57.12, 57.375),
92
+ self_norm: bool = False,
93
+ **kwargs,
94
+ ):
95
+ super().__init__(**kwargs)
96
+ self.do_resize = bool(do_resize)
97
+ self.size = tuple(size)
98
+ self.keep_ratio = bool(keep_ratio)
99
+ self.image_mean = tuple(float(x) for x in image_mean)
100
+ self.image_std = tuple(float(x) for x in image_std)
101
+ self.self_norm = bool(self_norm)
102
+
103
+ def __call__(
104
+ self,
105
+ images: Union[ArrayLike, List[ArrayLike]],
106
+ return_tensors: Optional[str] = "pt",
107
+ **kwargs,
108
+ ):
109
+ imgs = images if isinstance(images, (list, tuple)) else [images]
110
+ batch = []
111
+ orig_sizes = []
112
+ lb_params = []
113
+
114
+ for im in imgs:
115
+ arr = _to_rgb_numpy(im) # HWC float32 in 0–255
116
+ oh, ow = arr.shape[:2]
117
+ orig_sizes.append((oh, ow))
118
+
119
+ if self.do_resize:
120
+ if self.keep_ratio:
121
+ arr, meta = _letterbox_keep_ratio(arr, self.size) # meta=(top,left,nh,nw)
122
+ else:
123
+ h, w = self.size
124
+ pil = Image.fromarray(np.clip(arr, 0, 255).astype(np.uint8))
125
+ arr = np.array(pil.resize((w, h), resample=Image.BILINEAR), dtype=np.float32)
126
+ meta = (0, 0, h, w)
127
+ else:
128
+ # no resize: still expose meta so postprocess can handle consistently
129
+ h, w = arr.shape[:2]
130
+ pad_h = self.size[0] - h
131
+ pad_w = self.size[1] - w
132
+ top = max(pad_h // 2, 0)
133
+ left = max(pad_w // 2, 0)
134
+ out = np.zeros((*self.size, 3), dtype=np.float32)
135
+ out[top:top+h, left:left+w] = arr[:self.size[0]-top, :self.size[1]-left]
136
+ arr = out
137
+ meta = (top, left, h, w)
138
+
139
+ lb_params.append(meta)
140
+
141
+ mean = np.array(self.image_mean, dtype=np.float32).reshape(1, 1, 3)
142
+ std = np.array(self.image_std, dtype=np.float32).reshape(1, 1, 3)
143
+ arr = (arr - mean) / std # HWC
144
+
145
+ chw = np.transpose(arr, (2, 0, 1)) # C,H,W
146
+ if self.self_norm:
147
+ chw = _zscore_ignore_black(chw)
148
+ batch.append(chw)
149
+
150
+ pixel_values = np.stack(batch, axis=0) # B,C,H,W
151
+ if return_tensors == "pt":
152
+ pixel_values = torch.from_numpy(pixel_values).to(torch.float32)
153
+ original_sizes = torch.tensor(orig_sizes, dtype=torch.long)
154
+ letterbox_params = torch.tensor(lb_params, dtype=torch.long)
155
+ else:
156
+ original_sizes = orig_sizes
157
+ letterbox_params = lb_params
158
+
159
+ return {
160
+ "pixel_values": pixel_values,
161
+ "original_sizes": original_sizes, # (B,2) H,W
162
+ "letterbox_params": letterbox_params # (B,4) top,left,nh,nw in 512x512
163
+ }
164
+
165
+ # ---------- POST-PROCESSING ----------
166
+ def post_process_semantic_segmentation(
167
+ self,
168
+ outputs: dict,
169
+ processor_inputs: Optional[dict] = None,
170
+ threshold: float = 0.5,
171
+ return_as_pil: bool = True,
172
+ ):
173
+ """
174
+ Turn model outputs into masks resized back to the ORIGINAL image sizes,
175
+ with letterbox padding removed.
176
+
177
+ Args:
178
+ outputs: dict from model forward (expects 'logits': [B,1,512,512])
179
+ processor_inputs: the dict returned by __call__ (must contain
180
+ 'original_sizes' [B,2] and 'letterbox_params' [B,4])
181
+ threshold: probability threshold for binarization
182
+ return_as_pil: return a list of PIL Images (uint8 0/255) if True,
183
+ else a list of torch tensors [H,W] uint8
184
+
185
+ Returns:
186
+ List of masks back in original sizes (H,W).
187
+ """
188
+ logits = outputs["logits"] # [B,1,H,W]
189
+ probs = torch.sigmoid(logits)
190
+ masks = (probs > threshold).to(torch.uint8) * 255 # [B,1,H,W] uint8
191
+
192
+ if processor_inputs is None:
193
+ raise ValueError("processor_inputs must be provided to undo letterboxing.")
194
+
195
+ orig_sizes = processor_inputs["original_sizes"] # [B,2]
196
+ lb_params = processor_inputs["letterbox_params"] # [B,4] top,left,nh,nw
197
+
198
+ results = []
199
+ B = masks.shape[0]
200
+ for i in range(B):
201
+ m = masks[i, 0] # [512,512]
202
+ top, left, nh, nw = [int(x) for x in lb_params[i].tolist()]
203
+ # crop letterbox
204
+ m_cropped = m[top:top+nh, left:left+nw] # [nh,nw]
205
+ # resize back to original
206
+ oh, ow = [int(x) for x in orig_sizes[i].tolist()]
207
+ m_resized = torch.nn.functional.interpolate(
208
+ m_cropped.unsqueeze(0).unsqueeze(0).float(),
209
+ size=(oh, ow),
210
+ mode="nearest"
211
+ )[0,0].to(torch.uint8) # [oh,ow]
212
+
213
+ if return_as_pil:
214
+ results.append(Image.fromarray(m_resized.cpu().numpy(), mode="L"))
215
+ else:
216
+ results.append(m_resized)
217
+
218
+ return results
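To make the letterbox bookkeeping concrete, here is the arithmetic from `_letterbox_keep_ratio` worked through for a hypothetical 600×800 image going into the default 512×512 target:

```python
# Hypothetical 600 (H) x 800 (W) input letterboxed into 512 x 512.
th, tw = 512, 512
h, w = 600, 800
scale = min(th / h, tw / w)                   # min(0.853..., 0.64) = 0.64
nh, nw = round(h * scale), round(w * scale)   # 384, 512
top, left = (th - nh) // 2, (tw - nw) // 2    # 64, 0 -> black bars above and below
print(scale, (nh, nw), (top, left))           # 0.64 (384, 512) (64, 0)
```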
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66b39afecca324389126efc9d7995b707cfe2cc0330e2bfa12728829bd79b2a6
3
+ size 603375348
modeling_film_unet2d.py ADDED
@@ -0,0 +1,141 @@
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ from transformers.modeling_utils import PreTrainedModel
5
+ from .configuration_film_unet2d import FilmUnet2DConfig
6
+
7
+ class ConvBlock(nn.Module):
8
+ def __init__(self, in_ch, out_ch, k=3, s=1, p=1):
9
+ super().__init__()
10
+ self.block = nn.Sequential(
11
+ nn.Conv2d(in_ch, out_ch, kernel_size=k, stride=s, padding=p),
12
+ nn.InstanceNorm2d(out_ch),
13
+ nn.LeakyReLU(inplace=True),
14
+ )
15
+ def forward(self, x): return self.block(x)
16
+
17
+ class FiLM2d(nn.Module):
18
+ def __init__(self, n_organs, in_channels, emb_dim=64, hidden=None):
19
+ super().__init__()
20
+ hidden = hidden or 2 * in_channels
21
+ self.embed = nn.Embedding(n_organs, emb_dim)
22
+ self.mlp = nn.Sequential(
23
+ nn.Linear(emb_dim, hidden), nn.ReLU(inplace=True), nn.Linear(hidden, 2*in_channels)
24
+ )
25
+ nn.init.zeros_(self.mlp[-1].weight)
26
+ nn.init.constant_(self.mlp[-1].bias[:in_channels], 0)
27
+ nn.init.constant_(self.mlp[-1].bias[in_channels:], 1)
28
+ def forward(self, x, organ_id):
29
+ beta_gamma = self.mlp(self.embed(organ_id))
30
+ beta, gamma = beta_gamma.chunk(2, dim=-1)
31
+ beta = beta.unsqueeze(-1).unsqueeze(-1)
32
+ gamma = gamma.unsqueeze(-1).unsqueeze(-1)
33
+ return gamma * x + beta
34
+
35
+ class DownFiLM(nn.Module):
36
+ def __init__(self, in_chs, out_chs, n_organs):
37
+ super().__init__()
38
+ self.conv_blocks = nn.ModuleList([ConvBlock(i, o) for i,o in zip(in_chs,out_chs)])
39
+ self.film_blocks = nn.ModuleList([FiLM2d(n_organs, o) for o in out_chs])
40
+ self.pool = nn.MaxPool2d(2,2)
41
+ def forward(self, x, organ_id):
42
+ for c,f in zip(self.conv_blocks, self.film_blocks):
43
+ x = f(c(x), organ_id)
44
+ return self.pool(x), x
45
+
46
+ class Down(nn.Module):
47
+ def __init__(self, in_chs, out_chs):
48
+ super().__init__()
49
+ self.conv_blocks = nn.ModuleList([ConvBlock(i, o) for i,o in zip(in_chs,out_chs)])
50
+ self.pool = nn.MaxPool2d(2,2)
51
+ def forward(self, x):
52
+ for c in self.conv_blocks: x = c(x)
53
+ return self.pool(x), x
54
+
55
+ class UpFiLM(nn.Module):
56
+ def __init__(self, in_chs, out_chs, n_organs, up=True):
57
+ super().__init__()
58
+ self.conv_blocks = nn.ModuleList([ConvBlock(i, o) for i,o in zip(in_chs,out_chs)])
59
+ self.film_blocks = nn.ModuleList([FiLM2d(n_organs, o) for o in out_chs])
60
+ self.up_conv_op = nn.ConvTranspose2d(out_chs[-1], out_chs[-1], kernel_size=2, stride=2) if up else None
61
+ def forward(self, x, organ_id):
62
+ for c,f in zip(self.conv_blocks, self.film_blocks):
63
+ x = f(c(x), organ_id)
64
+ return self.up_conv_op(x) if self.up_conv_op is not None else x
65
+
66
+ class Up(nn.Module):
67
+ def __init__(self, in_chs, out_chs, up=True):
68
+ super().__init__()
69
+ self.conv_blocks = nn.ModuleList([ConvBlock(i, o) for i,o in zip(in_chs,out_chs)])
70
+ self.up_conv_op = nn.ConvTranspose2d(out_chs[-1], out_chs[-1], kernel_size=2, stride=2) if up else None
71
+ def forward(self, x):
72
+ for c in self.conv_blocks: x = c(x)
73
+ return self.up_conv_op(x) if self.up_conv_op is not None else x
74
+
75
+ class UNet2DFiLMCore(nn.Module):
76
+ def __init__(self, cfg: FilmUnet2DConfig):
77
+ super().__init__()
78
+ size, depth, n_organs = cfg.size, cfg.depth, cfg.n_organs
79
+ use_film, film_start = cfg.use_film, cfg.film_start
80
+ self.encoder = nn.ModuleDict()
81
+ if use_film and 0 >= film_start:
82
+ self.encoder["0"] = DownFiLM([cfg.in_channels, size], [size, size*2], n_organs)
83
+ else:
84
+ self.encoder["0"] = Down([cfg.in_channels, size], [size, size*2])
85
+ for i in range(1, depth):
86
+ in_ch = [size*(2**i), size*(2**i)]
87
+ out_ch = [size*(2**i), size*(2**(i+1))]
88
+ if use_film and i >= film_start:
89
+ self.encoder[str(i)] = DownFiLM(in_ch, out_ch, n_organs)
90
+ else:
91
+ self.encoder[str(i)] = Down(in_ch, out_ch)
92
+ if use_film:
93
+ self.bottleneck = UpFiLM([size*(2**depth), size*(2**depth)], [size*(2**depth), size*(2**(depth+1))], n_organs)
94
+ else:
95
+ self.bottleneck = Up([size*(2**depth), size*(2**depth)], [size*(2**depth), size*(2**(depth+1))])
96
+ self.decoder = nn.ModuleDict()
97
+ for i in range(depth, 1, -1):
98
+ use_film_here = use_film and (i-1) >= film_start
99
+ if use_film_here:
100
+ self.decoder[str(i-1)] = UpFiLM([size*(2**(i+1))+size*(2**i), size*(2**i)], [size*(2**i), size*(2**i)], n_organs)
101
+ else:
102
+ self.decoder[str(i-1)] = Up([size*(2**(i+1))+size*(2**i), size*(2**i)], [size*(2**i), size*(2**i)])
103
+ if use_film and 0 >= film_start:
104
+ self.decoder["0"] = UpFiLM([size*4+size*2, size*2], [size*2, size*2], n_organs, up=False)
105
+ else:
106
+ self.decoder["0"] = Up([size*4+size*2, size*2], [size*2, size*2], up=False)
107
+ self.out_layer = ConvBlock(
108
+ size * 2,
109
+ cfg.num_classes,
110
+ k= 1,s= 1,p=0
111
+ )
112
+ def forward(self, pixel_values, organ_id):
113
+ feats = []
114
+ out, feat = (self.encoder["0"](pixel_values, organ_id) if isinstance(self.encoder["0"], DownFiLM) else self.encoder["0"](pixel_values))
115
+ feats.append(feat)
116
+ for k in list(self.encoder.keys())[1:]:
117
+ blk = self.encoder[k]
118
+ out, feat = (blk(out, organ_id) if isinstance(blk, DownFiLM) else blk(out))
119
+ feats.append(feat)
120
+ out = self.bottleneck(out, organ_id) if isinstance(self.bottleneck, UpFiLM) else self.bottleneck(out)
121
+ for k in self.decoder:
122
+ cat = torch.cat([out, feats[int(k)]], dim=1)
123
+ blk = self.decoder[k]
124
+ out = blk(cat, organ_id) if isinstance(blk, UpFiLM) else blk(cat)
125
+ return self.out_layer(out)
126
+
127
+ class FilmUnet2DModel(PreTrainedModel):
128
+ config_class = FilmUnet2DConfig
129
+ base_model_prefix = "model"
130
+
131
+ def __init__(self, config: FilmUnet2DConfig):
132
+ super().__init__(config)
133
+ self.model = UNet2DFiLMCore(config)
134
+ self.post_init()
135
+
136
+ def forward(self, pixel_values, organ_id, labels=None, **kwargs):
137
+ logits = self.model(pixel_values, organ_id)
138
+ if labels is None:
139
+ return {"logits": logits}
140
+ loss = nn.functional.binary_cross_entropy_with_logits(logits, labels)
141
+ return {"loss": loss, "logits": logits}
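The FiLM blocks above apply a per-channel scale and shift predicted from the organ embedding; because the final MLP layer is zero-initialized with the gamma half of the bias set to 1, the modulation starts out as an identity. A standalone sketch of the core operation:

```python
import torch

# Standalone sketch of FiLM: per-channel affine modulation of a feature map.
x = torch.randn(2, 64, 32, 32)         # features [B, C, H, W]
gamma = torch.ones(2, 64, 1, 1)        # scale, 1.0 at initialization
beta = torch.zeros(2, 64, 1, 1)        # shift, 0.0 at initialization
y = gamma * x + beta                   # identity until training updates the FiLM MLP
print(torch.allclose(x, y))            # True
```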
preprocessor_config.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "do_resize": true,
3
+ "size": [512, 512],
4
+ "keep_ratio": true,
5
+ "image_mean": [123.675, 116.28, 103.53],
6
+ "image_std": [58.395, 57.12, 57.375],
7
+ "self_norm": false
8
+ }
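These mean/std values are the usual ImageNet statistics expressed in 0–255 space; normalization is a plain per-channel affine transform, e.g.:

```python
import numpy as np

pixel = np.array([128.0, 128.0, 128.0])   # a mid-grey RGB pixel in 0-255 space
mean = np.array([123.675, 116.28, 103.53])
std = np.array([58.395, 57.12, 57.375])
print((pixel - mean) / std)               # approx. [0.074, 0.205, 0.426]
```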
test.py ADDED
@@ -0,0 +1,32 @@
1
+ # test_load_film_unet2d.py
2
+ import torch, os
3
+ from transformers import AutoModel, AutoConfig, AutoImageProcessor
4
+
5
+ # ✅ point to your local folder (or your HF repo id after pushing)
6
+ repo_or_path = os.path.abspath("/home/nicola/Downloads/FILMUnet2D_transformers_repo/FILMUnet2D_transformers")
7
+
8
+ print("Loading config...")
9
+ cfg = AutoConfig.from_pretrained(repo_or_path, trust_remote_code=True)
10
+ print(cfg)
11
+
12
+ print("Loading model and weights...")
13
+ proc = AutoImageProcessor.from_pretrained(repo_or_path, trust_remote_code=True)
14
+
15
+ model = AutoModel.from_pretrained(repo_or_path, trust_remote_code=True)
16
+ model.eval()
17
+
18
+ # --- quick synthetic forward ---
19
+ # x = torch.randn(1, cfg.in_channels, 512, 512)
20
+ from PIL import Image
21
+ from torchvision.transforms.v2.functional import pil_to_tensor, to_pil_image
22
+ x = Image.open("/home/nicola/Downloads/45.png").convert("RGB")
23
+ inputs = proc(images=x, return_tensors="pt") # {'pixel_values': B,C,H,W}
24
+ organ_id = torch.tensor([4]) # any valid organ id < cfg.n_organs
25
+ with torch.no_grad():
26
+ out = model(**inputs, organ_id=organ_id)
27
+
28
+ # Post-process: undo letterbox & resize back to original, with threshold 0.7
29
+ masks = proc.post_process_semantic_segmentation(out, inputs, threshold=0.7, return_as_pil=True)
30
+
31
+ # Save the first (since you used a single image, that'll be masks[0])
32
+ masks[0].save("/home/nicola/Downloads/FILMUnet2D_transformers_repo/FILMUnet2D_transformers/tmp.png")
test_4_stages.py ADDED
@@ -0,0 +1,53 @@
1
+ # test_load_film_unet_4_stages.py
2
+ import torch, os
3
+ from transformers import AutoModel, AutoConfig, AutoImageProcessor
4
+ from PIL import Image
5
+
6
+ # This script tests the 4-stage U-Net model.
7
+
8
+ # ✅ Point to the root folder of your repository
9
+ repo_or_path = os.path.abspath("/home/nicola/Downloads/FILMUnet2D_transformers_repo/FILMUnet2D_transformers")
10
+ subfolder_4_stages = "unet_4_stages"
11
+
12
+ # --- IMPORTANT ---
13
+ # You need to place the correct model weights for the 4-stage U-Net in the
14
+ # 'unet_4_stages' directory. The file should be named 'model.safetensors'.
15
+ # The path is: /home/nicola/Downloads/FILMUnet2D_transformers_repo/FILMUnet2D_transformers/unet_4_stages/model.safetensors
16
+ # -----------------
17
+
18
+ print("Loading 4-stage model and processor...")
19
+ try:
20
+ proc = AutoImageProcessor.from_pretrained(repo_or_path, subfolder=subfolder_4_stages, trust_remote_code=True)
21
+ model = AutoModel.from_pretrained(repo_or_path, subfolder=subfolder_4_stages, trust_remote_code=True)
22
+ model.eval()
23
+ print("Model loaded successfully.")
24
+ except Exception as e:
25
+ print(f"Error loading the 4-stage model: {e}")
26
+ print("Please ensure the 'model.safetensors' file in the 'unet_4_stages' directory is compatible with the 4-stage architecture.")
27
+ exit()
28
+
29
+ # --- Inference ---
30
+ image_path = "/home/nicola/Downloads/45.png"
31
+ if not os.path.exists(image_path):
32
+ print(f"Error: Image file not found at {image_path}")
33
+ exit()
34
+
35
+ print(f"Loading image from {image_path}...")
36
+ image = Image.open(image_path).convert("RGB")
37
+ inputs = proc(images=image, return_tensors="pt")
38
+
39
+ # Use an appropriate organ ID for your test case
40
+ organ_id = torch.tensor([4])
41
+
42
+ print("Running inference...")
43
+ with torch.no_grad():
44
+ out = model(**inputs, organ_id=organ_id)
45
+
46
+ # Post-process to get the segmentation mask
47
+ masks = proc.post_process_semantic_segmentation(out, inputs, threshold=0.7, return_as_pil=True)
48
+
49
+ # Save the output mask
50
+ output_path = "/home/nicola/Downloads/FILMUnet2D_transformers_repo/FILMUnet2D_transformers/tmp_4_stages.png"
51
+ masks[0].save(output_path)
52
+
53
+ print(f"✅ Test complete. Segmentation mask saved to {output_path}")
test_5_stages_testicles.py ADDED
@@ -0,0 +1,46 @@
1
+ # test_load_film_unet_5_stages_testicles.py
2
+ import torch, os
3
+ from transformers import AutoModel, AutoImageProcessor
4
+ from PIL import Image
5
+
6
+ # This script tests the 5-stage U-Net model fine-tuned on testicle ultrasounds.
7
+
8
+ # ✅ Point to the root folder of your repository
9
+ repo_or_path = os.path.abspath("/home/nicola/Downloads/FILMUnet2D_transformers_repo/FILMUnet2D_transformers")
10
+ subfolder_5_stages = "unet_5_stages_testicles"
11
+
12
+ print("Loading 5-stage testicle-finetuned model and processor...")
13
+ try:
14
+ proc = AutoImageProcessor.from_pretrained(repo_or_path, subfolder=subfolder_5_stages, trust_remote_code=True)
15
+ model = AutoModel.from_pretrained(repo_or_path, subfolder=subfolder_5_stages, trust_remote_code=True)
16
+ model.eval()
17
+ print("Model loaded successfully.")
18
+ except Exception as e:
19
+ print(f"Error loading the model: {e}")
20
+ exit()
21
+
22
+ # --- Inference ---
23
+ image_path = "/home/nicola/Downloads/45.png"
24
+ if not os.path.exists(image_path):
25
+ print(f"Error: Image file not found at {image_path}")
26
+ exit()
27
+
28
+ print(f"Loading image from {image_path}...")
29
+ image = Image.open(image_path).convert("RGB")
30
+ inputs = proc(images=image, return_tensors="pt")
31
+
32
+ # From the dictionary you provided, 'testicle' corresponds to ID 7
33
+ organ_id = torch.tensor([7])
34
+
35
+ print("Running inference with organ_id=7 (testicle)...")
36
+ with torch.no_grad():
37
+ out = model(**inputs, organ_id=organ_id)
38
+
39
+ # Post-process to get the segmentation mask
40
+ masks = proc.post_process_semantic_segmentation(out, inputs, threshold=0.7, return_as_pil=True)
41
+
42
+ # Save the output mask
43
+ output_path = "/home/nicola/Downloads/FILMUnet2D_transformers_repo/FILMUnet2D_transformers/tmp_5_stages_testicles.png"
44
+ masks[0].save(output_path)
45
+
46
+ print(f"✅ Test complete. Segmentation mask saved to {output_path}")
tmp.png ADDED
tmp_4_stages.png ADDED
unet_4_stages/config.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "model_type": "film_unet2d",
3
+ "architectures": ["FilmUnet2DModel"],
4
+ "auto_map": {
5
+ "AutoConfig": "configuration_film_unet2d.FilmUnet2DConfig",
6
+ "AutoModel": "modeling_film_unet2d.FilmUnet2DModel",
7
+ "AutoImageProcessor": "image_processing_film_unet2d.FilmUnet2DImageProcessor"
8
+ },
9
+ "in_channels": 3,
10
+ "num_classes": 1,
11
+ "n_organs": 9,
12
+ "size": 32,
13
+ "depth": 4,
14
+ "film_start": 0,
15
+ "use_film": 1
16
+ }
unet_4_stages/configuration_film_unet2d.py ADDED
@@ -0,0 +1,23 @@
1
+
2
+ from transformers.configuration_utils import PretrainedConfig
3
+
4
+ class FilmUnet2DConfig(PretrainedConfig):
5
+ model_type = "film_unet2d"
6
+
7
+ def __init__(self,
8
+ in_channels=3,
9
+ num_classes=1,
10
+ n_organs=9,
11
+ size=32,
12
+ depth=5,
13
+ film_start=0,
14
+ use_film=True,
15
+ **kwargs):
16
+ super().__init__(**kwargs)
17
+ self.in_channels = in_channels
18
+ self.num_classes = num_classes
19
+ self.n_organs = n_organs
20
+ self.size = size
21
+ self.depth = depth
22
+ self.film_start = film_start
23
+ self.use_film = use_film
unet_4_stages/image_processing_film_unet2d.py ADDED
@@ -0,0 +1,218 @@
1
+ # image_processing_film_unet2d.py
2
+ from typing import List, Union, Tuple, Optional
3
+ import numpy as np
4
+ from PIL import Image
5
+ import torch
6
+ from transformers.image_processing_utils import ImageProcessingMixin
7
+
8
+ ArrayLike = Union[np.ndarray, torch.Tensor, Image.Image]
9
+
10
+ def _to_rgb_numpy(im: ArrayLike) -> np.ndarray:
11
+ # -> float32 HWC in [0,255], 3 channels
12
+ if isinstance(im, Image.Image):
13
+ if im.mode != "RGB":
14
+ im = im.convert("RGB")
15
+ arr = np.array(im, dtype=np.uint8).astype(np.float32)
16
+ elif isinstance(im, torch.Tensor):
17
+ t = im.detach().cpu()
18
+ if t.ndim != 3:
19
+ raise ValueError("Tensor must be 3D (CHW or HWC).")
20
+ if t.shape[0] in (1, 3): # CHW
21
+ if t.shape[0] == 1:
22
+ t = t.repeat(3, 1, 1)
23
+ t = t.permute(1, 2, 0) # HWC
24
+ elif t.shape[-1] == 1: # HWC gray
25
+ t = t.repeat(1, 1, 3)
26
+ arr = t.numpy()
27
+ if arr.dtype in (np.float32, np.float64) and arr.max() <= 1.5:
28
+ arr = (arr * 255.0).astype(np.float32)
29
+ else:
30
+ arr = arr.astype(np.float32)
31
+ else:
32
+ arr = np.array(im)
33
+ if arr.ndim == 2:
34
+ arr = np.repeat(arr[..., None], 3, axis=-1)
35
+ arr = arr.astype(np.float32)
36
+ if arr.max() <= 1.5:
37
+ arr = (arr * 255.0).astype(np.float32)
38
+ if arr.ndim != 3 or arr.shape[-1] != 3:
39
+ raise ValueError("Expected RGB image with shape HxWx3.")
40
+ return arr
41
+
42
+ def _letterbox_keep_ratio(arr: np.ndarray, target_hw: Tuple[int, int]):
43
+ """Resize with aspect ratio preserved and pad with 0 (black) to target (H,W).
44
+ Returns: out(H,W,3), (top, left, new_h, new_w)
45
+ """
46
+ th, tw = target_hw
47
+ h, w = arr.shape[:2]
48
+ scale = min(th / h, tw / w)
49
+ nh, nw = int(round(h * scale)), int(round(w * scale))
50
+ if nh <= 0 or nw <= 0:
51
+ raise ValueError("Invalid resize result.")
52
+ pil = Image.fromarray(np.clip(arr, 0, 255).astype(np.uint8))
53
+ pil = pil.resize((nw, nh), resample=Image.BILINEAR)
54
+ rs = np.array(pil, dtype=np.float32)
55
+ out = np.zeros((th, tw, 3), dtype=np.float32)
56
+ top = (th - nh) // 2
57
+ left = (tw - nw) // 2
58
+ out[top:top+nh, left:left+nw] = rs
59
+ return out, (top, left, nh, nw)
60
+
61
+ def _zscore_ignore_black(chw: np.ndarray, eps: float = 1e-8) -> np.ndarray:
62
+ mask = (chw.sum(axis=0) > 0) # HxW
63
+ if not mask.any():
64
+ return chw.copy()
65
+ valid = chw[:, mask]
66
+ mean = valid.mean()
67
+ std = valid.std()
68
+ return (chw - mean) / std if std > eps else (chw - mean)
69
+
70
+ class FilmUnet2DImageProcessor(ImageProcessingMixin):
71
+ """
72
+ Processor for FILMUnet2D:
73
+ - Convert to RGB
74
+ - Keep-aspect-ratio resize+pad (letterbox) to 512x512 (configurable)
75
+ - Normalize with mean/std in 0–255 space (like your training)
76
+ - Optional z-score 'self_norm' ignoring black pixels
77
+ Returns dict with:
78
+ - pixel_values: torch.FloatTensor [B,3,H,W]
79
+ - original_sizes: torch.LongTensor [B,2] (H,W)
80
+ - letterbox_params: torch.LongTensor [B,4] (top, left, nh, nw) # NEW
81
+ """
82
+
83
+ model_input_names = ["pixel_values"]
84
+
85
+ def __init__(
86
+ self,
87
+ do_resize: bool = True,
88
+ size: Tuple[int, int] = (512, 512),
89
+ keep_ratio: bool = True,
90
+ image_mean: Tuple[float, float, float] = (123.675, 116.28, 103.53),
91
+ image_std: Tuple[float, float, float] = (58.395, 57.12, 57.375),
92
+ self_norm: bool = False,
93
+ **kwargs,
94
+ ):
95
+ super().__init__(**kwargs)
96
+ self.do_resize = bool(do_resize)
97
+ self.size = tuple(size)
98
+ self.keep_ratio = bool(keep_ratio)
99
+ self.image_mean = tuple(float(x) for x in image_mean)
100
+ self.image_std = tuple(float(x) for x in image_std)
101
+ self.self_norm = bool(self_norm)
102
+
103
+ def __call__(
104
+ self,
105
+ images: Union[ArrayLike, List[ArrayLike]],
106
+ return_tensors: Optional[str] = "pt",
107
+ **kwargs,
108
+ ):
109
+ imgs = images if isinstance(images, (list, tuple)) else [images]
110
+ batch = []
111
+ orig_sizes = []
112
+ lb_params = []
113
+
114
+ for im in imgs:
115
+ arr = _to_rgb_numpy(im) # HWC float32 in 0–255
116
+ oh, ow = arr.shape[:2]
117
+ orig_sizes.append((oh, ow))
118
+
119
+ if self.do_resize:
120
+ if self.keep_ratio:
121
+ arr, meta = _letterbox_keep_ratio(arr, self.size) # meta=(top,left,nh,nw)
122
+ else:
123
+ h, w = self.size
124
+ pil = Image.fromarray(np.clip(arr, 0, 255).astype(np.uint8))
125
+ arr = np.array(pil.resize((w, h), resample=Image.BILINEAR), dtype=np.float32)
126
+ meta = (0, 0, h, w)
127
+ else:
128
+ # no resize: still expose meta so postprocess can handle consistently
129
+ h, w = arr.shape[:2]
130
+ pad_h = self.size[0] - h
131
+ pad_w = self.size[1] - w
132
+ top = max(pad_h // 2, 0)
133
+ left = max(pad_w // 2, 0)
134
+ out = np.zeros((*self.size, 3), dtype=np.float32)
135
+ out[top:top+h, left:left+w] = arr[:self.size[0]-top, :self.size[1]-left]
136
+ arr = out
137
+ meta = (top, left, h, w)
138
+
139
+ lb_params.append(meta)
140
+
141
+ mean = np.array(self.image_mean, dtype=np.float32).reshape(1, 1, 3)
142
+ std = np.array(self.image_std, dtype=np.float32).reshape(1, 1, 3)
143
+ arr = (arr - mean) / std # HWC
144
+
145
+ chw = np.transpose(arr, (2, 0, 1)) # C,H,W
146
+ if self.self_norm:
147
+ chw = _zscore_ignore_black(chw)
148
+ batch.append(chw)
149
+
150
+ pixel_values = np.stack(batch, axis=0) # B,C,H,W
151
+ if return_tensors == "pt":
152
+ pixel_values = torch.from_numpy(pixel_values).to(torch.float32)
153
+ original_sizes = torch.tensor(orig_sizes, dtype=torch.long)
154
+ letterbox_params = torch.tensor(lb_params, dtype=torch.long)
155
+ else:
156
+ original_sizes = orig_sizes
157
+ letterbox_params = lb_params
158
+
159
+ return {
160
+ "pixel_values": pixel_values,
161
+ "original_sizes": original_sizes, # (B,2) H,W
162
+ "letterbox_params": letterbox_params # (B,4) top,left,nh,nw in 512x512
163
+ }
164
+
165
+ # ---------- POST-PROCESSING ----------
166
+ def post_process_semantic_segmentation(
167
+ self,
168
+ outputs: dict,
169
+ processor_inputs: Optional[dict] = None,
170
+ threshold: float = 0.5,
171
+ return_as_pil: bool = True,
172
+ ):
173
+ """
174
+ Turn model outputs into masks resized back to the ORIGINAL image sizes,
175
+ with letterbox padding removed.
176
+
177
+ Args:
178
+ outputs: dict from model forward (expects 'logits': [B,1,512,512])
179
+ processor_inputs: the dict returned by __call__ (must contain
180
+ 'original_sizes' [B,2] and 'letterbox_params' [B,4])
181
+ threshold: probability threshold for binarization
182
+ return_as_pil: return a list of PIL Images (uint8 0/255) if True,
183
+ else a list of torch tensors [H,W] uint8
184
+
185
+ Returns:
186
+ List of masks back in original sizes (H,W).
187
+ """
188
+ logits = outputs["logits"] # [B,1,H,W]
189
+ probs = torch.sigmoid(logits)
190
+ masks = (probs > threshold).to(torch.uint8) * 255 # [B,1,H,W] uint8
191
+
192
+ if processor_inputs is None:
193
+ raise ValueError("processor_inputs must be provided to undo letterboxing.")
194
+
195
+ orig_sizes = processor_inputs["original_sizes"] # [B,2]
196
+ lb_params = processor_inputs["letterbox_params"] # [B,4] top,left,nh,nw
197
+
198
+ results = []
199
+ B = masks.shape[0]
200
+ for i in range(B):
201
+ m = masks[i, 0] # [512,512]
202
+ top, left, nh, nw = [int(x) for x in lb_params[i].tolist()]
203
+ # crop letterbox
204
+ m_cropped = m[top:top+nh, left:left+nw] # [nh,nw]
205
+ # resize back to original
206
+ oh, ow = [int(x) for x in orig_sizes[i].tolist()]
207
+ m_resized = torch.nn.functional.interpolate(
208
+ m_cropped.unsqueeze(0).unsqueeze(0).float(),
209
+ size=(oh, ow),
210
+ mode="nearest"
211
+ )[0,0].to(torch.uint8) # [oh,ow]
212
+
213
+ if return_as_pil:
214
+ results.append(Image.fromarray(m_resized.cpu().numpy(), mode="L"))
215
+ else:
216
+ results.append(m_resized)
217
+
218
+ return results
unet_4_stages/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bf559438d470c899a73302e24c3f150cd673ffb67a9e7b844623f8156bbabf6
3
+ size 151840188
unet_4_stages/modeling_film_unet2d.py ADDED
@@ -0,0 +1,141 @@
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ from transformers.modeling_utils import PreTrainedModel
5
+ from .configuration_film_unet2d import FilmUnet2DConfig
6
+
7
+ class ConvBlock(nn.Module):
8
+ def __init__(self, in_ch, out_ch, k=3, s=1, p=1):
9
+ super().__init__()
10
+ self.block = nn.Sequential(
11
+ nn.Conv2d(in_ch, out_ch, kernel_size=k, stride=s, padding=p),
12
+ nn.InstanceNorm2d(out_ch),
13
+ nn.LeakyReLU(inplace=True),
14
+ )
15
+ def forward(self, x): return self.block(x)
16
+
17
+ class FiLM2d(nn.Module):
18
+ def __init__(self, n_organs, in_channels, emb_dim=64, hidden=None):
19
+ super().__init__()
20
+ hidden = hidden or 2 * in_channels
21
+ self.embed = nn.Embedding(n_organs, emb_dim)
22
+ self.mlp = nn.Sequential(
23
+ nn.Linear(emb_dim, hidden), nn.ReLU(inplace=True), nn.Linear(hidden, 2*in_channels)
24
+ )
25
+ nn.init.zeros_(self.mlp[-1].weight)
26
+ nn.init.constant_(self.mlp[-1].bias[:in_channels], 0)
27
+ nn.init.constant_(self.mlp[-1].bias[in_channels:], 1)
28
+ def forward(self, x, organ_id):
29
+ beta_gamma = self.mlp(self.embed(organ_id))
30
+ beta, gamma = beta_gamma.chunk(2, dim=-1)
31
+ beta = beta.unsqueeze(-1).unsqueeze(-1)
32
+ gamma = gamma.unsqueeze(-1).unsqueeze(-1)
33
+ return gamma * x + beta
34
+
35
+ class DownFiLM(nn.Module):
36
+ def __init__(self, in_chs, out_chs, n_organs):
37
+ super().__init__()
38
+ self.conv_blocks = nn.ModuleList([ConvBlock(i, o) for i,o in zip(in_chs,out_chs)])
39
+ self.film_blocks = nn.ModuleList([FiLM2d(n_organs, o) for o in out_chs])
40
+ self.pool = nn.MaxPool2d(2,2)
41
+ def forward(self, x, organ_id):
42
+ for c,f in zip(self.conv_blocks, self.film_blocks):
43
+ x = f(c(x), organ_id)
44
+ return self.pool(x), x
45
+
46
+ class Down(nn.Module):
47
+ def __init__(self, in_chs, out_chs):
48
+ super().__init__()
49
+ self.conv_blocks = nn.ModuleList([ConvBlock(i, o) for i,o in zip(in_chs,out_chs)])
50
+ self.pool = nn.MaxPool2d(2,2)
51
+ def forward(self, x):
52
+ for c in self.conv_blocks: x = c(x)
53
+ return self.pool(x), x
54
+
55
+ class UpFiLM(nn.Module):
56
+ def __init__(self, in_chs, out_chs, n_organs, up=True):
57
+ super().__init__()
58
+ self.conv_blocks = nn.ModuleList([ConvBlock(i, o) for i,o in zip(in_chs,out_chs)])
59
+ self.film_blocks = nn.ModuleList([FiLM2d(n_organs, o) for o in out_chs])
60
+ self.up_conv_op = nn.ConvTranspose2d(out_chs[-1], out_chs[-1], kernel_size=2, stride=2) if up else None
61
+ def forward(self, x, organ_id):
62
+ for c,f in zip(self.conv_blocks, self.film_blocks):
63
+ x = f(c(x), organ_id)
64
+ return self.up_conv_op(x) if self.up_conv_op is not None else x
65
+
66
+ class Up(nn.Module):
67
+ def __init__(self, in_chs, out_chs, up=True):
68
+ super().__init__()
69
+ self.conv_blocks = nn.ModuleList([ConvBlock(i, o) for i,o in zip(in_chs,out_chs)])
70
+ self.up_conv_op = nn.ConvTranspose2d(out_chs[-1], out_chs[-1], kernel_size=2, stride=2) if up else None
71
+ def forward(self, x):
72
+ for c in self.conv_blocks: x = c(x)
73
+ return self.up_conv_op(x) if self.up_conv_op is not None else x
74
+
75
+ class UNet2DFiLMCore(nn.Module):
76
+ def __init__(self, cfg: FilmUnet2DConfig):
77
+ super().__init__()
78
+ size, depth, n_organs = cfg.size, cfg.depth, cfg.n_organs
79
+ use_film, film_start = cfg.use_film, cfg.film_start
80
+ self.encoder = nn.ModuleDict()
81
+ if use_film and 0 >= film_start:
82
+ self.encoder["0"] = DownFiLM([cfg.in_channels, size], [size, size*2], n_organs)
83
+ else:
84
+ self.encoder["0"] = Down([cfg.in_channels, size], [size, size*2])
85
+ for i in range(1, depth):
86
+ in_ch = [size*(2**i), size*(2**i)]
87
+ out_ch = [size*(2**i), size*(2**(i+1))]
88
+ if use_film and i >= film_start:
89
+ self.encoder[str(i)] = DownFiLM(in_ch, out_ch, n_organs)
90
+ else:
91
+ self.encoder[str(i)] = Down(in_ch, out_ch)
92
+ if use_film:
93
+ self.bottleneck = UpFiLM([size*(2**depth), size*(2**depth)], [size*(2**depth), size*(2**(depth+1))], n_organs)
94
+ else:
95
+ self.bottleneck = Up([size*(2**depth), size*(2**depth)], [size*(2**depth), size*(2**(depth+1))])
96
+ self.decoder = nn.ModuleDict()
97
+ for i in range(depth, 1, -1):
98
+ use_film_here = use_film and (i-1) >= film_start
99
+ if use_film_here:
100
+ self.decoder[str(i-1)] = UpFiLM([size*(2**(i+1))+size*(2**i), size*(2**i)], [size*(2**i), size*(2**i)], n_organs)
101
+ else:
102
+ self.decoder[str(i-1)] = Up([size*(2**(i+1))+size*(2**i), size*(2**i)], [size*(2**i), size*(2**i)])
103
+ if use_film and 0 >= film_start:
104
+ self.decoder["0"] = UpFiLM([size*4+size*2, size*2], [size*2, size*2], n_organs, up=False)
105
+ else:
106
+ self.decoder["0"] = Up([size*4+size*2, size*2], [size*2, size*2], up=False)
107
+ self.out_layer = ConvBlock(
108
+ size * 2,
109
+ cfg.num_classes,
110
+ k= 1,s= 1,p=0
111
+ )
112
+ def forward(self, pixel_values, organ_id):
113
+ feats = []
114
+ out, feat = (self.encoder["0"](pixel_values, organ_id) if isinstance(self.encoder["0"], DownFiLM) else self.encoder["0"](pixel_values))
115
+ feats.append(feat)
116
+ for k in list(self.encoder.keys())[1:]:
117
+ blk = self.encoder[k]
118
+ out, feat = (blk(out, organ_id) if isinstance(blk, DownFiLM) else blk(out))
119
+ feats.append(feat)
120
+ out = self.bottleneck(out, organ_id) if isinstance(self.bottleneck, UpFiLM) else self.bottleneck(out)
121
+ for k in self.decoder:
122
+ cat = torch.cat([out, feats[int(k)]], dim=1)
123
+ blk = self.decoder[k]
124
+ out = blk(cat, organ_id) if isinstance(blk, UpFiLM) else blk(cat)
125
+ return self.out_layer(out)
126
+
127
+ class FilmUnet2DModel(PreTrainedModel):
128
+ config_class = FilmUnet2DConfig
129
+ base_model_prefix = "model"
130
+
131
+ def __init__(self, config: FilmUnet2DConfig):
132
+ super().__init__(config)
133
+ self.model = UNet2DFiLMCore(config)
134
+ self.post_init()
135
+
136
+ def forward(self, pixel_values, organ_id, labels=None, **kwargs):
137
+ logits = self.model(pixel_values, organ_id)
138
+ if labels is None:
139
+ return {"logits": logits}
140
+ loss = nn.functional.binary_cross_entropy_with_logits(logits, labels)
141
+ return {"loss": loss, "logits": logits}
unet_4_stages/preprocessor_config.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "do_resize": true,
3
+ "size": [512, 512],
4
+ "keep_ratio": true,
5
+ "image_mean": [123.675, 116.28, 103.53],
6
+ "image_std": [58.395, 57.12, 57.375],
7
+ "self_norm": false
8
+ }