Upload 3 files
Browse files- __init__.py +0 -0
- configuration_bert_ordinal.py +107 -0
- modeling_bert_ordinal.py +282 -0
__init__.py
ADDED
|
File without changes
|
configuration_bert_ordinal.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
configuration_bert_ordinal.py
|
| 3 |
+
---------------
|
| 4 |
+
BERT-based ordinal regression model, fully integrated with the HuggingFace
|
| 5 |
+
Transformers API:
|
| 6 |
+
|
| 7 |
+
model.save_pretrained("my-checkpoint/")
|
| 8 |
+
model = BertOrdinal.from_pretrained("my-checkpoint/")
|
| 9 |
+
|
| 10 |
+
Architecture
|
| 11 |
+
------------
|
| 12 |
+
1. A (optionally frozen) BERT backbone.
|
| 13 |
+
2. A projection head on the [CLS] token:
|
| 14 |
+
Linear(hidden_size β hidden_dim) β ReLU β Dropout(p) β Linear(hidden_dim β 1)
|
| 15 |
+
producing a single latent score s β β.
|
| 16 |
+
3. K-1 learnable raw_threshold parameters enforcing monotonicity via
|
| 17 |
+
cumsum(softplus(Β·)).
|
| 18 |
+
4. Cumulative-link probabilities:
|
| 19 |
+
P(Y β€ j | x) = Ο(ΞΈ_j β s)
|
| 20 |
+
|
| 21 |
+
Usage
|
| 22 |
+
-----
|
| 23 |
+
from bert_ordinal import BertOrdinalConfig, BertOrdinal
|
| 24 |
+
|
| 25 |
+
# ββ Create from scratch ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 26 |
+
cfg = BertOrdinalConfig(
|
| 27 |
+
bert_model_name="bert-base-uncased",
|
| 28 |
+
num_classes=3,
|
| 29 |
+
hidden_dim=128,
|
| 30 |
+
dropout=0.1,
|
| 31 |
+
freeze_bert=True,
|
| 32 |
+
)
|
| 33 |
+
model = BertOrdinal(cfg)
|
| 34 |
+
|
| 35 |
+
# ββ Save ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 36 |
+
model.save_pretrained("my-checkpoint/")
|
| 37 |
+
tokenizer.save_pretrained("my-checkpoint/") # keep tokenizer alongside
|
| 38 |
+
|
| 39 |
+
# ββ Reload ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 40 |
+
model = BertOrdinal.from_pretrained("my-checkpoint/")
|
| 41 |
+
tokenizer = AutoTokenizer.from_pretrained("my-checkpoint/")
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
from __future__ import annotations
|
| 45 |
+
from typing import Optional
|
| 46 |
+
from transformers import PretrainedConfig
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ---------------------------------------------------------------------------
|
| 50 |
+
# 1. Config β subclass PretrainedConfig for full HF serialisation
|
| 51 |
+
# ---------------------------------------------------------------------------
|
| 52 |
+
|
| 53 |
+
class BertOrdinalConfig(PretrainedConfig):
    """
    Serialisable configuration for :class:`BertOrdinal`.

    Because this subclasses :class:`~transformers.PretrainedConfig`,
    ``save_pretrained`` emits a ``config.json`` that ``from_pretrained``
    can read back with no extra bookkeeping.

    Parameters
    ----------
    bert_model_name : str
        HuggingFace hub name or local path of the BERT backbone.
    num_classes : int
        Number of ordinal classes K; the model allocates K-1 thresholds.
    hidden_dim : int
        Width of the intermediate layer in the projection head.
    dropout : float
        Dropout probability applied inside the projection head.
    freeze_bert : bool
        If True, backbone parameters are frozen at construction time.
    loss_reduction : str
        ``'mean'`` or ``'sum'``.
    hidden_size : int, optional
        Backbone hidden size. Left ``None`` at first construction; the
        model fills it in after loading BERT so ``from_pretrained`` can
        rebuild the head without re-inspecting the backbone.
    """

    # Written into config.json so AutoConfig knows which class owns it.
    model_type = "bert_ordinal"
    problem_type = "single_label_classification"

    def __init__(
        self,
        bert_model_name: str = "allenai/scibert_scivocab_uncased",
        num_classes: int = 3,
        hidden_dim: int = 256,
        dropout: float = 0.1,
        freeze_bert: bool = True,
        loss_reduction: str = "mean",
        hidden_size: Optional[int] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.bert_model_name = bert_model_name
        self.num_classes = num_classes
        self.hidden_dim = hidden_dim
        self.dropout = dropout
        self.freeze_bert = freeze_bert
        self.loss_reduction = loss_reduction
        # Filled in by BertOrdinal.__init__ once the backbone is loaded.
        self.hidden_size = hidden_size

        # Lets the Auto* factories resolve the custom classes when a saved
        # checkpoint is loaded from a repo containing these module files.
        self.auto_map = {
            "AutoConfig": "configuration_bert_ordinal.BertOrdinalConfig",
            "AutoModel": "modeling_bert_ordinal.BertOrdinal",
            "AutoModelForSequenceClassification": "modeling_bert_ordinal.BertOrdinal",
        }
|
modeling_bert_ordinal.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
modeling_bert_ordinal.py
|
| 3 |
+
---------------
|
| 4 |
+
BERT-based ordinal regression model, fully integrated with the HuggingFace
|
| 5 |
+
Transformers API:
|
| 6 |
+
|
| 7 |
+
model.save_pretrained("my-checkpoint/")
|
| 8 |
+
model = BertOrdinal.from_pretrained("my-checkpoint/")
|
| 9 |
+
|
| 10 |
+
Architecture
|
| 11 |
+
------------
|
| 12 |
+
1. A (optionally frozen) BERT backbone.
|
| 13 |
+
2. A projection head on the [CLS] token:
|
| 14 |
+
Linear(hidden_size β hidden_dim) β ReLU β Dropout(p) β Linear(hidden_dim β 1)
|
| 15 |
+
producing a single latent score s β β.
|
| 16 |
+
3. K-1 learnable raw_threshold parameters enforcing monotonicity via
|
| 17 |
+
cumsum(softplus(Β·)).
|
| 18 |
+
4. Cumulative-link probabilities:
|
| 19 |
+
P(Y β€ j | x) = Ο(ΞΈ_j β s)
|
| 20 |
+
|
| 21 |
+
Usage
|
| 22 |
+
-----
|
| 23 |
+
from bert_ordinal import BertOrdinalConfig, BertOrdinal
|
| 24 |
+
|
| 25 |
+
# ββ Create from scratch ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 26 |
+
cfg = BertOrdinalConfig(
|
| 27 |
+
bert_model_name="bert-base-uncased",
|
| 28 |
+
num_classes=3,
|
| 29 |
+
hidden_dim=128,
|
| 30 |
+
dropout=0.1,
|
| 31 |
+
freeze_bert=True,
|
| 32 |
+
)
|
| 33 |
+
model = BertOrdinal(cfg)
|
| 34 |
+
|
| 35 |
+
# ββ Save ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 36 |
+
model.save_pretrained("my-checkpoint/")
|
| 37 |
+
tokenizer.save_pretrained("my-checkpoint/") # keep tokenizer alongside
|
| 38 |
+
|
| 39 |
+
# ββ Reload ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 40 |
+
model = BertOrdinal.from_pretrained("my-checkpoint/")
|
| 41 |
+
tokenizer = AutoTokenizer.from_pretrained("my-checkpoint/")
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
from __future__ import annotations
|
| 45 |
+
|
| 46 |
+
from dataclasses import dataclass
|
| 47 |
+
from typing import Optional
|
| 48 |
+
|
| 49 |
+
import torch
|
| 50 |
+
import torch.nn as nn
|
| 51 |
+
import torch.nn.functional as F
|
| 52 |
+
from transformers import AutoModel, PreTrainedModel
|
| 53 |
+
from transformers.modeling_outputs import ModelOutput
|
| 54 |
+
|
| 55 |
+
from configuration_bert_ordinal import BertOrdinalConfig
|
| 56 |
+
|
| 57 |
+
# ---------------------------------------------------------------------------
|
| 58 |
+
# 1. Output dataclass
|
| 59 |
+
# ---------------------------------------------------------------------------
|
| 60 |
+
|
| 61 |
+
@dataclass
class BertOrdinalOutput(ModelOutput):
    """
    Output container returned by :class:`BertOrdinal`.

    Attributes
    ----------
    loss : torch.Tensor or None
        Scalar ordinal cross-entropy; present only when ``labels`` were
        supplied to ``forward``.
    logits : torch.Tensor or None
        Latent score from the projection head, shape (B,).
    predictions : torch.Tensor or None
        Predicted class index per example (argmax of ``class_probs``),
        shape (B,).
    cum_probs : torch.Tensor or None
        Cumulative probabilities P(Y <= j | x), shape (B, K-1).
    class_probs : torch.Tensor or None
        Per-class probabilities P(Y = j | x), shape (B, K).
    """

    loss: Optional[torch.Tensor] = None
    logits: Optional[torch.Tensor] = None
    predictions: Optional[torch.Tensor] = None
    cum_probs: Optional[torch.Tensor] = None
    class_probs: Optional[torch.Tensor] = None
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
# ---------------------------------------------------------------------------
|
| 89 |
+
# 3. Model β subclass PreTrainedModel for save / from_pretrained
|
| 90 |
+
# ---------------------------------------------------------------------------
|
| 91 |
+
|
| 92 |
+
class BertOrdinal(PreTrainedModel):
    """
    BERT encoder with an ordinal-regression (cumulative-link) head.

    Fully compatible with the HuggingFace checkpoint API::

        model.save_pretrained("my-checkpoint/")
        model = BertOrdinal.from_pretrained("my-checkpoint/")

    What gets saved
    ~~~~~~~~~~~~~~~
    ``save_pretrained`` writes two files:

    * ``config.json`` -- the full :class:`BertOrdinalConfig` (including
      ``bert_model_name`` and the cached backbone ``hidden_size``).
    * ``model.safetensors`` (or ``pytorch_model.bin``) -- a **single flat
      state_dict** containing both the BERT backbone weights and the
      head/threshold parameters.

    ``from_pretrained`` reconstructs the model from the config, loads the
    state_dict, and re-applies the ``freeze_bert`` setting. Because
    ``hidden_size`` is cached in the config, the backbone is instantiated
    from its architecture config alone on reload -- its pretrained weights
    are NOT downloaded a second time (the saved state_dict supplies them).
    """

    config_class = BertOrdinalConfig

    def __init__(self, config: BertOrdinalConfig) -> None:
        super().__init__(config)
        K = config.num_classes

        # -- 1. BERT backbone ------------------------------------------------
        # config.hidden_size is only ever set after a successful first build,
        # so its presence signals that we are being reconstructed by
        # from_pretrained: build the backbone from its architecture config
        # only (no pretrained-weight download) -- from_pretrained overwrites
        # every parameter with the saved state_dict right after __init__.
        if config.hidden_size is not None:
            from transformers import AutoConfig  # local import; module-level one sits below

            backbone_cfg = AutoConfig.from_pretrained(config.bert_model_name)
            self.bert = AutoModel.from_config(backbone_cfg)
        else:
            self.bert = AutoModel.from_pretrained(config.bert_model_name)
        hidden_size: int = self.bert.config.hidden_size

        # Cache so the head can be rebuilt offline after save_pretrained.
        config.hidden_size = hidden_size

        if config.freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

        # -- 2. Projection head: [CLS] representation -> scalar latent score -
        self.head = nn.Sequential(
            nn.Linear(hidden_size, config.hidden_dim),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(config.hidden_dim, 1),
        )
        self._init_head()

        # -- 3. Ordinal thresholds -------------------------------------------
        # K-1 raw values; monotonicity enforced via cumsum(softplus(.)).
        # NOTE(review): cumsum(softplus) keeps every threshold strictly
        # positive, so the negative half of the linspace(-1, 1) targets is
        # clamped to ~1e-3 below; the realised initial thresholds therefore
        # start near 0, not -1 -- confirm this initialisation is intended.
        self.raw_thresholds = nn.Parameter(torch.zeros(K - 1))
        with torch.no_grad():
            targets = torch.linspace(-1.0, 1.0, K - 1)
            diffs = torch.cat([targets[:1], targets[1:] - targets[:-1]])
            # log(expm1(x)) is the inverse of softplus, so
            # softplus(raw_thresholds) reproduces the (clamped) diffs.
            self.raw_thresholds.copy_(
                torch.log(torch.expm1(diffs.clamp(min=1e-3)))
            )

        # Finalises weight-init bookkeeping required by PreTrainedModel.
        self.post_init()

    # -----------------------------------------------------------------------
    # Helpers
    # -----------------------------------------------------------------------

    def _init_head(self) -> None:
        """Kaiming-initialise the head's linear layers; zero all biases."""
        for m in self.head.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
                nn.init.zeros_(m.bias)

    @property
    def thresholds(self) -> torch.Tensor:
        """Monotone thresholds θ₁ ≤ … ≤ θ_{K-1} (shape: K-1)."""
        return torch.cumsum(F.softplus(self.raw_thresholds), dim=0)

    # -----------------------------------------------------------------------
    # Forward
    # -----------------------------------------------------------------------

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        token_type_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> BertOrdinalOutput:
        """
        Parameters
        ----------
        input_ids : (B, L)
        attention_mask : (B, L)
        token_type_ids : (B, L) optional
        labels : (B,) long -- class indices in {0, ..., K-1}

        Returns
        -------
        BertOrdinalOutput
        """
        # -- Encode: take the [CLS] token of the last hidden layer -----------
        bert_kwargs = dict(input_ids=input_ids, attention_mask=attention_mask)
        if token_type_ids is not None:
            bert_kwargs["token_type_ids"] = token_type_ids

        cls_repr = self.bert(**bert_kwargs).last_hidden_state[:, 0, :]  # (B, H)

        # -- Latent score -----------------------------------------------------
        score = self.head(cls_repr).squeeze(-1)  # (B,)

        # -- Cumulative probs P(Y <= j) = sigmoid(theta_j - score) ------------
        cum_logits = self.thresholds.unsqueeze(0) - score.unsqueeze(1)  # (B, K-1)
        cum_probs = torch.sigmoid(cum_logits)  # (B, K-1)

        # -- Class probs P(Y = j) = P(Y <= j) - P(Y <= j-1) -------------------
        # Pad the CDF with 0 on the left and 1 on the right so adjacent
        # differences yield all K class probabilities in one subtraction.
        B, dev = cum_probs.size(0), cum_probs.device
        F_ = torch.cat(
            [torch.zeros(B, 1, device=dev), cum_probs, torch.ones(B, 1, device=dev)],
            dim=1,
        )  # (B, K+1)
        # Clamp away exact zeros so downstream log() stays finite.
        class_probs = (F_[:, 1:] - F_[:, :-1]).clamp(min=1e-9)  # (B, K)

        # -- Predictions ------------------------------------------------------
        predictions = class_probs.argmax(dim=-1)  # (B,)

        # -- Loss (only when labels supplied) ---------------------------------
        loss: Optional[torch.Tensor] = None
        if labels is not None:
            loss = ordinal_cross_entropy(
                class_probs, labels, reduction=self.config.loss_reduction
            )

        return BertOrdinalOutput(
            loss=loss,
            logits=score,
            predictions=predictions,
            cum_probs=cum_probs,
            class_probs=class_probs,
        )

    # -----------------------------------------------------------------------
    # Convenience
    # -----------------------------------------------------------------------

    @torch.no_grad()
    def predict(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        token_type_ids: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Return predicted class indices (no loss computed)."""
        return self.forward(input_ids, attention_mask, token_type_ids).predictions
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
# ---------------------------------------------------------------------------
|
| 255 |
+
# Loss function
|
| 256 |
+
# ---------------------------------------------------------------------------
|
| 257 |
+
|
| 258 |
+
def ordinal_cross_entropy(
    class_probs: torch.Tensor,
    labels: torch.Tensor,
    reduction: str = "mean",
) -> torch.Tensor:
    """
    Ordinal cross-entropy: negative log-likelihood of the true class
    under the cumulative-link class probabilities.

    Parameters
    ----------
    class_probs : (B, K) -- P(Y=j|x); values should be > 0
    labels : (B,) -- ground-truth indices in {0, ..., K-1}
    reduction : 'mean' | 'sum' | 'none'

    Returns
    -------
    torch.Tensor
        Scalar loss for 'mean'/'sum', or per-example losses (B,) for 'none'.
    """
    # Clamp defensively before log(): callers are expected to clamp, but a
    # stray exact zero would otherwise propagate -inf/NaN into the loss.
    log_probs = torch.log(class_probs.clamp(min=1e-12))
    return F.nll_loss(log_probs, labels, reduction=reduction)
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
# ---------------------------------------------------------------------------
# Register the model with the Transformers library
# ---------------------------------------------------------------------------
# Maps the custom model_type "bert_ordinal" to these classes so that, once
# this module has been imported, AutoConfig/AutoModel/
# AutoModelForSequenceClassification can resolve checkpoints saved with
# BertOrdinalConfig directly.
from transformers import AutoConfig, AutoModel, AutoModelForSequenceClassification

AutoConfig.register("bert_ordinal", BertOrdinalConfig)
AutoModel.register(BertOrdinalConfig, BertOrdinal)
AutoModelForSequenceClassification.register(BertOrdinalConfig, BertOrdinal)
|